diff --git "a/video_mllm_swift/s2_siglip2_qwen3_1.7b_10pct/checkpoint-900/trainer_state.json" "b/video_mllm_swift/s2_siglip2_qwen3_1.7b_10pct/checkpoint-900/trainer_state.json" new file mode 100644--- /dev/null +++ "b/video_mllm_swift/s2_siglip2_qwen3_1.7b_10pct/checkpoint-900/trainer_state.json" @@ -0,0 +1,7234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.357533471619838, + "eval_steps": 100.0, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015085800490288515, + "grad_norm": 23.124605178833008, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.189295768737793, + "step": 1, + "token_acc": 0.548826954362531 + }, + { + "epoch": 0.003017160098057703, + "grad_norm": 30.720653533935547, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.3531155586242676, + "step": 2, + "token_acc": 0.5386499554650719 + }, + { + "epoch": 0.004525740147086555, + "grad_norm": 29.18061065673828, + "learning_rate": 6.000000000000001e-07, + "loss": 2.3362576961517334, + "step": 3, + "token_acc": 0.533344527467607 + }, + { + "epoch": 0.006034320196115406, + "grad_norm": 25.411008834838867, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1949615478515625, + "step": 4, + "token_acc": 0.5521145186042417 + }, + { + "epoch": 0.007542900245144258, + "grad_norm": 26.599546432495117, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.2117741107940674, + "step": 5, + "token_acc": 0.5545308443565335 + }, + { + "epoch": 0.00905148029417311, + "grad_norm": 24.21645736694336, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.1966896057128906, + "step": 6, + "token_acc": 0.5543595012375659 + }, + { + "epoch": 0.010560060343201961, + "grad_norm": 26.403793334960938, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.2035701274871826, + "step": 7, + "token_acc": 0.5639327959803203 + }, + { + "epoch": 0.012068640392230812, + "grad_norm": 22.211029052734375, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.16054105758667, + "step": 8, + "token_acc": 0.5544489611562782 + }, + { + "epoch": 0.013577220441259665, + "grad_norm": 22.49414825439453, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.1442711353302, + "step": 9, + "token_acc": 0.5531443346569397 + }, + { + "epoch": 0.015085800490288516, + "grad_norm": 17.29258918762207, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.9600356817245483, + "step": 10, + "token_acc": 0.5812984627941776 + }, + { + "epoch": 0.016594380539317367, + "grad_norm": 15.91141128540039, + "learning_rate": 2.2e-06, + "loss": 2.1071956157684326, + "step": 11, + "token_acc": 0.5534001534624976 + }, + { + "epoch": 0.01810296058834622, + "grad_norm": 16.09598159790039, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.039396286010742, + "step": 12, + "token_acc": 0.5636199561686219 + }, + { + "epoch": 0.01961154063737507, + "grad_norm": 11.826876640319824, + "learning_rate": 2.6e-06, + "loss": 1.8663848638534546, + "step": 13, + "token_acc": 0.5900159519564605 + }, + { + "epoch": 0.021120120686403922, + "grad_norm": 9.096514701843262, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.7871583700180054, + "step": 14, + "token_acc": 0.6052875082617316 + }, + { + "epoch": 0.022628700735432775, + "grad_norm": 9.173731803894043, + "learning_rate": 3e-06, + "loss": 1.730183720588684, + "step": 15, + "token_acc": 0.6137154922617309 + }, + { + "epoch": 0.024137280784461625, + "grad_norm": 10.401883125305176, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.7819602489471436, + "step": 16, + "token_acc": 0.6036012464023215 + }, + { + "epoch": 0.025645860833490478, + "grad_norm": 7.653559684753418, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.7723816633224487, + "step": 17, + "token_acc": 0.6097611160264192 + }, + { + "epoch": 0.02715444088251933, + "grad_norm": 7.4187397956848145, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.6256237030029297, + "step": 18, + "token_acc": 0.630830379202922 + }, + { + "epoch": 0.02866302093154818, + "grad_norm": 5.428247451782227, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.6749640703201294, + "step": 19, + "token_acc": 0.6113693884511138 + }, + { + "epoch": 0.030171600980577033, + "grad_norm": 4.826409339904785, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6341395378112793, + "step": 20, + "token_acc": 0.6291252379944997 + }, + { + "epoch": 0.03168018102960588, + "grad_norm": 4.36765718460083, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.5215749740600586, + "step": 21, + "token_acc": 0.651028217652366 + }, + { + "epoch": 0.033188761078634735, + "grad_norm": 4.657986164093018, + "learning_rate": 4.4e-06, + "loss": 1.5829765796661377, + "step": 22, + "token_acc": 0.6384052796178187 + }, + { + "epoch": 0.03469734112766359, + "grad_norm": 3.0909276008605957, + "learning_rate": 4.600000000000001e-06, + "loss": 1.405585765838623, + "step": 23, + "token_acc": 0.6739241501277264 + }, + { + "epoch": 0.03620592117669244, + "grad_norm": 3.1741816997528076, + "learning_rate": 4.800000000000001e-06, + "loss": 1.5444121360778809, + "step": 24, + "token_acc": 0.6441629931303006 + }, + { + "epoch": 0.037714501225721286, + "grad_norm": 3.106473922729492, + "learning_rate": 5e-06, + "loss": 1.5049681663513184, + "step": 25, + "token_acc": 0.6443499360763952 + }, + { + "epoch": 0.03922308127475014, + "grad_norm": 3.003570556640625, + "learning_rate": 5.2e-06, + "loss": 1.536961555480957, + "step": 26, + "token_acc": 0.6387864886267443 + }, + { + "epoch": 0.04073166132377899, + "grad_norm": 2.4693381786346436, + "learning_rate": 5.400000000000001e-06, + "loss": 1.4762600660324097, + "step": 27, + "token_acc": 0.6508818512722221 + }, + { + "epoch": 0.042240241372807845, + "grad_norm": 3.5997209548950195, + "learning_rate": 5.600000000000001e-06, + "loss": 1.5022133588790894, + "step": 28, + "token_acc": 0.6411592393392239 + }, + { + "epoch": 0.0437488214218367, + "grad_norm": 3.0522239208221436, + "learning_rate": 5.8e-06, + "loss": 1.3743257522583008, + "step": 29, + "token_acc": 0.6629581010996578 + }, + { + "epoch": 0.04525740147086555, + "grad_norm": 3.111283302307129, + "learning_rate": 6e-06, + "loss": 1.5005674362182617, + "step": 30, + "token_acc": 0.6428525316864281 + }, + { + "epoch": 0.046765981519894397, + "grad_norm": 2.9558024406433105, + "learning_rate": 6.200000000000001e-06, + "loss": 1.4562207460403442, + "step": 31, + "token_acc": 0.6533377495408148 + }, + { + "epoch": 0.04827456156892325, + "grad_norm": 2.362173557281494, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.4116486310958862, + "step": 32, + "token_acc": 0.6584267393719075 + }, + { + "epoch": 0.0497831416179521, + "grad_norm": 2.822561025619507, + "learning_rate": 6.600000000000001e-06, + "loss": 1.377655267715454, + "step": 33, + "token_acc": 0.6677375049468073 + }, + { + "epoch": 0.051291721666980955, + "grad_norm": 2.2099828720092773, + "learning_rate": 6.800000000000001e-06, + "loss": 1.391998291015625, + "step": 34, + "token_acc": 0.663858647850986 + }, + { + "epoch": 0.05280030171600981, + "grad_norm": 2.7615244388580322, + "learning_rate": 7e-06, + "loss": 1.323710560798645, + "step": 35, + "token_acc": 0.6733675158026098 + }, + { + "epoch": 0.05430888176503866, + "grad_norm": 2.2214949131011963, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.3986542224884033, + "step": 36, + "token_acc": 0.6596454200084424 + }, + { + "epoch": 0.05581746181406751, + "grad_norm": 2.194584369659424, + "learning_rate": 7.4e-06, + "loss": 1.4356110095977783, + "step": 37, + "token_acc": 0.6591843518729115 + }, + { + "epoch": 0.05732604186309636, + "grad_norm": 4.760872840881348, + "learning_rate": 7.600000000000001e-06, + "loss": 1.3106210231781006, + "step": 38, + "token_acc": 0.6806390863252536 + }, + { + "epoch": 0.05883462191212521, + "grad_norm": 2.0381641387939453, + "learning_rate": 7.800000000000002e-06, + "loss": 1.3523569107055664, + "step": 39, + "token_acc": 0.6706152365440142 + }, + { + "epoch": 0.060343201961154065, + "grad_norm": 1.861093282699585, + "learning_rate": 8.000000000000001e-06, + "loss": 1.309014081954956, + "step": 40, + "token_acc": 0.6748093891084843 + }, + { + "epoch": 0.06185178201018292, + "grad_norm": 1.7846293449401855, + "learning_rate": 8.2e-06, + "loss": 1.3355516195297241, + "step": 41, + "token_acc": 0.6685706915295462 + }, + { + "epoch": 0.06336036205921176, + "grad_norm": 2.722421169281006, + "learning_rate": 8.400000000000001e-06, + "loss": 1.3051695823669434, + "step": 42, + "token_acc": 0.6840095949187472 + }, + { + "epoch": 0.06486894210824062, + "grad_norm": 1.8847869634628296, + "learning_rate": 8.6e-06, + "loss": 1.3639600276947021, + "step": 43, + "token_acc": 0.66565752503067 + }, + { + "epoch": 0.06637752215726947, + "grad_norm": 2.6605076789855957, + "learning_rate": 8.8e-06, + "loss": 1.392951488494873, + "step": 44, + "token_acc": 0.6626246600181324 + }, + { + "epoch": 0.06788610220629832, + "grad_norm": 2.6811139583587646, + "learning_rate": 9e-06, + "loss": 1.398695945739746, + "step": 45, + "token_acc": 0.6615109710640588 + }, + { + "epoch": 0.06939468225532718, + "grad_norm": 2.140552043914795, + "learning_rate": 9.200000000000002e-06, + "loss": 1.3160855770111084, + "step": 46, + "token_acc": 0.6745101974261847 + }, + { + "epoch": 0.07090326230435602, + "grad_norm": 2.3291702270507812, + "learning_rate": 9.4e-06, + "loss": 1.3565082550048828, + "step": 47, + "token_acc": 0.6638154976873851 + }, + { + "epoch": 0.07241184235338488, + "grad_norm": 1.5316709280014038, + "learning_rate": 9.600000000000001e-06, + "loss": 1.1960928440093994, + "step": 48, + "token_acc": 0.6998262960983431 + }, + { + "epoch": 0.07392042240241373, + "grad_norm": 1.7314256429672241, + "learning_rate": 9.800000000000001e-06, + "loss": 1.287950873374939, + "step": 49, + "token_acc": 0.6765742693582404 + }, + { + "epoch": 0.07542900245144257, + "grad_norm": 2.265610456466675, + "learning_rate": 1e-05, + "loss": 1.2918251752853394, + "step": 50, + "token_acc": 0.6790073832744505 + }, + { + "epoch": 0.07693758250047143, + "grad_norm": 1.9585295915603638, + "learning_rate": 9.999972660400536e-06, + "loss": 1.245097041130066, + "step": 51, + "token_acc": 0.6841476915062189 + }, + { + "epoch": 0.07844616254950028, + "grad_norm": 2.2394933700561523, + "learning_rate": 9.999890641901124e-06, + "loss": 1.2696806192398071, + "step": 52, + "token_acc": 0.6851838298079973 + }, + { + "epoch": 0.07995474259852914, + "grad_norm": 2.1018149852752686, + "learning_rate": 9.999753945398704e-06, + "loss": 1.3278920650482178, + "step": 53, + "token_acc": 0.6696586854565447 + }, + { + "epoch": 0.08146332264755798, + "grad_norm": 2.7504725456237793, + "learning_rate": 9.99956257238817e-06, + "loss": 1.1971238851547241, + "step": 54, + "token_acc": 0.700527568014896 + }, + { + "epoch": 0.08297190269658684, + "grad_norm": 2.3626255989074707, + "learning_rate": 9.999316524962347e-06, + "loss": 1.314717173576355, + "step": 55, + "token_acc": 0.6721572102478564 + }, + { + "epoch": 0.08448048274561569, + "grad_norm": 1.8942910432815552, + "learning_rate": 9.999015805811965e-06, + "loss": 1.2371171712875366, + "step": 56, + "token_acc": 0.6883267404066618 + }, + { + "epoch": 0.08598906279464454, + "grad_norm": 1.847533941268921, + "learning_rate": 9.998660418225645e-06, + "loss": 1.263476848602295, + "step": 57, + "token_acc": 0.6805764917827861 + }, + { + "epoch": 0.0874976428436734, + "grad_norm": 2.302502155303955, + "learning_rate": 9.998250366089848e-06, + "loss": 1.2871187925338745, + "step": 58, + "token_acc": 0.670246869605721 + }, + { + "epoch": 0.08900622289270224, + "grad_norm": 2.251429796218872, + "learning_rate": 9.997785653888835e-06, + "loss": 1.2965680360794067, + "step": 59, + "token_acc": 0.6782139830126414 + }, + { + "epoch": 0.0905148029417311, + "grad_norm": 2.190948486328125, + "learning_rate": 9.99726628670463e-06, + "loss": 1.2930569648742676, + "step": 60, + "token_acc": 0.6805769230769231 + }, + { + "epoch": 0.09202338299075995, + "grad_norm": 2.8639395236968994, + "learning_rate": 9.996692270216946e-06, + "loss": 1.1929700374603271, + "step": 61, + "token_acc": 0.6977534453464225 + }, + { + "epoch": 0.09353196303978879, + "grad_norm": 2.1096410751342773, + "learning_rate": 9.996063610703138e-06, + "loss": 1.2735100984573364, + "step": 62, + "token_acc": 0.6825310734463277 + }, + { + "epoch": 0.09504054308881765, + "grad_norm": 2.234870672225952, + "learning_rate": 9.995380315038119e-06, + "loss": 1.2855546474456787, + "step": 63, + "token_acc": 0.6771805687848687 + }, + { + "epoch": 0.0965491231378465, + "grad_norm": 2.0339884757995605, + "learning_rate": 9.994642390694308e-06, + "loss": 1.2287287712097168, + "step": 64, + "token_acc": 0.691285472257518 + }, + { + "epoch": 0.09805770318687536, + "grad_norm": 3.0364999771118164, + "learning_rate": 9.993849845741525e-06, + "loss": 1.1940693855285645, + "step": 65, + "token_acc": 0.6951159429810155 + }, + { + "epoch": 0.0995662832359042, + "grad_norm": 2.355409860610962, + "learning_rate": 9.993002688846913e-06, + "loss": 1.2781426906585693, + "step": 66, + "token_acc": 0.6813532450263386 + }, + { + "epoch": 0.10107486328493305, + "grad_norm": 1.7342333793640137, + "learning_rate": 9.992100929274848e-06, + "loss": 1.236812710762024, + "step": 67, + "token_acc": 0.6908614643626685 + }, + { + "epoch": 0.10258344333396191, + "grad_norm": 2.261669158935547, + "learning_rate": 9.991144576886824e-06, + "loss": 1.2098701000213623, + "step": 68, + "token_acc": 0.6951142731222061 + }, + { + "epoch": 0.10409202338299076, + "grad_norm": 2.0735859870910645, + "learning_rate": 9.990133642141359e-06, + "loss": 1.2505481243133545, + "step": 69, + "token_acc": 0.6808613902320648 + }, + { + "epoch": 0.10560060343201962, + "grad_norm": 1.9308006763458252, + "learning_rate": 9.989068136093873e-06, + "loss": 1.2238191366195679, + "step": 70, + "token_acc": 0.6905540853112172 + }, + { + "epoch": 0.10710918348104846, + "grad_norm": 2.02999210357666, + "learning_rate": 9.987948070396572e-06, + "loss": 1.1689461469650269, + "step": 71, + "token_acc": 0.700115866051247 + }, + { + "epoch": 0.10861776353007732, + "grad_norm": 2.0273749828338623, + "learning_rate": 9.986773457298311e-06, + "loss": 1.2780811786651611, + "step": 72, + "token_acc": 0.6727383120825744 + }, + { + "epoch": 0.11012634357910617, + "grad_norm": 2.358456611633301, + "learning_rate": 9.985544309644474e-06, + "loss": 1.1954686641693115, + "step": 73, + "token_acc": 0.6952962191128312 + }, + { + "epoch": 0.11163492362813501, + "grad_norm": 2.2116260528564453, + "learning_rate": 9.984260640876821e-06, + "loss": 1.1607370376586914, + "step": 74, + "token_acc": 0.7091361666820319 + }, + { + "epoch": 0.11314350367716387, + "grad_norm": 2.304090738296509, + "learning_rate": 9.98292246503335e-06, + "loss": 1.1716623306274414, + "step": 75, + "token_acc": 0.705377358490566 + }, + { + "epoch": 0.11465208372619272, + "grad_norm": 1.7727265357971191, + "learning_rate": 9.981529796748135e-06, + "loss": 1.2567411661148071, + "step": 76, + "token_acc": 0.6769889895222874 + }, + { + "epoch": 0.11616066377522158, + "grad_norm": 5.008306503295898, + "learning_rate": 9.980082651251175e-06, + "loss": 1.0880424976348877, + "step": 77, + "token_acc": 0.723162828820983 + }, + { + "epoch": 0.11766924382425042, + "grad_norm": 4.082327842712402, + "learning_rate": 9.97858104436822e-06, + "loss": 1.1559984683990479, + "step": 78, + "token_acc": 0.7037714531661914 + }, + { + "epoch": 0.11917782387327927, + "grad_norm": 2.174713611602783, + "learning_rate": 9.977024992520604e-06, + "loss": 1.1979345083236694, + "step": 79, + "token_acc": 0.6934356273833303 + }, + { + "epoch": 0.12068640392230813, + "grad_norm": 2.2870583534240723, + "learning_rate": 9.975414512725058e-06, + "loss": 1.1505500078201294, + "step": 80, + "token_acc": 0.7055914069797062 + }, + { + "epoch": 0.12219498397133698, + "grad_norm": 1.4413859844207764, + "learning_rate": 9.973749622593534e-06, + "loss": 1.167055368423462, + "step": 81, + "token_acc": 0.7048204414550234 + }, + { + "epoch": 0.12370356402036584, + "grad_norm": 1.8660067319869995, + "learning_rate": 9.972030340333e-06, + "loss": 1.195404291152954, + "step": 82, + "token_acc": 0.6965120618126742 + }, + { + "epoch": 0.12521214406939468, + "grad_norm": 2.1937623023986816, + "learning_rate": 9.970256684745258e-06, + "loss": 1.2255498170852661, + "step": 83, + "token_acc": 0.6914352188454268 + }, + { + "epoch": 0.12672072411842353, + "grad_norm": 2.1932120323181152, + "learning_rate": 9.968428675226714e-06, + "loss": 1.162563443183899, + "step": 84, + "token_acc": 0.7053398610952978 + }, + { + "epoch": 0.12822930416745237, + "grad_norm": 1.5851740837097168, + "learning_rate": 9.966546331768192e-06, + "loss": 1.1537158489227295, + "step": 85, + "token_acc": 0.7030873234584912 + }, + { + "epoch": 0.12973788421648125, + "grad_norm": 1.8369523286819458, + "learning_rate": 9.964609674954696e-06, + "loss": 1.2103166580200195, + "step": 86, + "token_acc": 0.6901546078346217 + }, + { + "epoch": 0.1312464642655101, + "grad_norm": 1.922118067741394, + "learning_rate": 9.962618725965196e-06, + "loss": 1.2470711469650269, + "step": 87, + "token_acc": 0.6889579349904398 + }, + { + "epoch": 0.13275504431453894, + "grad_norm": 2.258385419845581, + "learning_rate": 9.960573506572391e-06, + "loss": 1.1735998392105103, + "step": 88, + "token_acc": 0.6990891271762942 + }, + { + "epoch": 0.13426362436356779, + "grad_norm": 2.177549123764038, + "learning_rate": 9.95847403914247e-06, + "loss": 1.221960425376892, + "step": 89, + "token_acc": 0.6877445748843828 + }, + { + "epoch": 0.13577220441259663, + "grad_norm": 2.615755319595337, + "learning_rate": 9.956320346634877e-06, + "loss": 1.2302175760269165, + "step": 90, + "token_acc": 0.6942848020434227 + }, + { + "epoch": 0.1372807844616255, + "grad_norm": 2.0189881324768066, + "learning_rate": 9.954112452602045e-06, + "loss": 1.1147913932800293, + "step": 91, + "token_acc": 0.708954660899741 + }, + { + "epoch": 0.13878936451065435, + "grad_norm": 1.9413909912109375, + "learning_rate": 9.951850381189152e-06, + "loss": 1.1957597732543945, + "step": 92, + "token_acc": 0.6956550218340611 + }, + { + "epoch": 0.1402979445596832, + "grad_norm": 2.3060622215270996, + "learning_rate": 9.949534157133844e-06, + "loss": 1.1521681547164917, + "step": 93, + "token_acc": 0.7056484971045133 + }, + { + "epoch": 0.14180652460871204, + "grad_norm": 1.7739169597625732, + "learning_rate": 9.94716380576598e-06, + "loss": 1.164400577545166, + "step": 94, + "token_acc": 0.7053741396983453 + }, + { + "epoch": 0.1433151046577409, + "grad_norm": 1.8751460313796997, + "learning_rate": 9.944739353007344e-06, + "loss": 1.1337116956710815, + "step": 95, + "token_acc": 0.7091129838594487 + }, + { + "epoch": 0.14482368470676976, + "grad_norm": 1.7847442626953125, + "learning_rate": 9.942260825371359e-06, + "loss": 1.221888780593872, + "step": 96, + "token_acc": 0.683116988315344 + }, + { + "epoch": 0.1463322647557986, + "grad_norm": 1.9020072221755981, + "learning_rate": 9.939728249962808e-06, + "loss": 1.1728981733322144, + "step": 97, + "token_acc": 0.6902698985731477 + }, + { + "epoch": 0.14784084480482745, + "grad_norm": 2.0021426677703857, + "learning_rate": 9.937141654477529e-06, + "loss": 1.2124426364898682, + "step": 98, + "token_acc": 0.6880733944954128 + }, + { + "epoch": 0.1493494248538563, + "grad_norm": 1.5634818077087402, + "learning_rate": 9.934501067202117e-06, + "loss": 1.2374353408813477, + "step": 99, + "token_acc": 0.684928003273741 + }, + { + "epoch": 0.15085800490288515, + "grad_norm": 2.02734112739563, + "learning_rate": 9.931806517013612e-06, + "loss": 1.2164281606674194, + "step": 100, + "token_acc": 0.6908819336003802 + }, + { + "epoch": 0.15236658495191402, + "grad_norm": 2.5195748805999756, + "learning_rate": 9.929058033379181e-06, + "loss": 1.2114039659500122, + "step": 101, + "token_acc": 0.6911258213439507 + }, + { + "epoch": 0.15387516500094287, + "grad_norm": 2.111295700073242, + "learning_rate": 9.926255646355804e-06, + "loss": 1.2071397304534912, + "step": 102, + "token_acc": 0.6893698281349459 + }, + { + "epoch": 0.1553837450499717, + "grad_norm": 3.3383662700653076, + "learning_rate": 9.923399386589933e-06, + "loss": 1.1924270391464233, + "step": 103, + "token_acc": 0.6931244447333637 + }, + { + "epoch": 0.15689232509900056, + "grad_norm": 2.5398898124694824, + "learning_rate": 9.920489285317169e-06, + "loss": 1.1555728912353516, + "step": 104, + "token_acc": 0.7050274435024609 + }, + { + "epoch": 0.15840090514802943, + "grad_norm": 2.314823627471924, + "learning_rate": 9.917525374361913e-06, + "loss": 1.1892518997192383, + "step": 105, + "token_acc": 0.6910610561787789 + }, + { + "epoch": 0.15990948519705828, + "grad_norm": 2.043548107147217, + "learning_rate": 9.91450768613702e-06, + "loss": 1.1156737804412842, + "step": 106, + "token_acc": 0.7092646544691761 + }, + { + "epoch": 0.16141806524608712, + "grad_norm": 2.1194136142730713, + "learning_rate": 9.911436253643445e-06, + "loss": 1.1857516765594482, + "step": 107, + "token_acc": 0.6943764769851727 + }, + { + "epoch": 0.16292664529511597, + "grad_norm": 1.8385933637619019, + "learning_rate": 9.908311110469881e-06, + "loss": 1.2180548906326294, + "step": 108, + "token_acc": 0.6857133253785523 + }, + { + "epoch": 0.16443522534414481, + "grad_norm": 2.030042886734009, + "learning_rate": 9.905132290792395e-06, + "loss": 1.1628551483154297, + "step": 109, + "token_acc": 0.6994099519839468 + }, + { + "epoch": 0.1659438053931737, + "grad_norm": 2.098581075668335, + "learning_rate": 9.901899829374048e-06, + "loss": 1.1854050159454346, + "step": 110, + "token_acc": 0.7011512451089046 + }, + { + "epoch": 0.16745238544220253, + "grad_norm": 1.702873945236206, + "learning_rate": 9.89861376156452e-06, + "loss": 1.0707236528396606, + "step": 111, + "token_acc": 0.7186409318363399 + }, + { + "epoch": 0.16896096549123138, + "grad_norm": 1.9038037061691284, + "learning_rate": 9.895274123299724e-06, + "loss": 1.1752533912658691, + "step": 112, + "token_acc": 0.6964655350859036 + }, + { + "epoch": 0.17046954554026023, + "grad_norm": 1.8525303602218628, + "learning_rate": 9.891880951101407e-06, + "loss": 1.1609431505203247, + "step": 113, + "token_acc": 0.6961265435621191 + }, + { + "epoch": 0.17197812558928907, + "grad_norm": 2.27610182762146, + "learning_rate": 9.888434282076759e-06, + "loss": 1.1465742588043213, + "step": 114, + "token_acc": 0.7004584819154356 + }, + { + "epoch": 0.17348670563831795, + "grad_norm": 2.0383033752441406, + "learning_rate": 9.884934153917998e-06, + "loss": 1.1776819229125977, + "step": 115, + "token_acc": 0.6986651394334552 + }, + { + "epoch": 0.1749952856873468, + "grad_norm": 2.3039402961730957, + "learning_rate": 9.881380604901964e-06, + "loss": 1.148888111114502, + "step": 116, + "token_acc": 0.7045989640558782 + }, + { + "epoch": 0.17650386573637564, + "grad_norm": 1.7093273401260376, + "learning_rate": 9.877773673889702e-06, + "loss": 1.1658339500427246, + "step": 117, + "token_acc": 0.7026754056086817 + }, + { + "epoch": 0.17801244578540448, + "grad_norm": 2.0707805156707764, + "learning_rate": 9.874113400326031e-06, + "loss": 1.1091554164886475, + "step": 118, + "token_acc": 0.7148843140827109 + }, + { + "epoch": 0.17952102583443333, + "grad_norm": 1.8449821472167969, + "learning_rate": 9.870399824239116e-06, + "loss": 1.2200647592544556, + "step": 119, + "token_acc": 0.688803231296004 + }, + { + "epoch": 0.1810296058834622, + "grad_norm": 1.7078940868377686, + "learning_rate": 9.86663298624003e-06, + "loss": 1.1771886348724365, + "step": 120, + "token_acc": 0.700543170521989 + }, + { + "epoch": 0.18253818593249105, + "grad_norm": 1.7552517652511597, + "learning_rate": 9.86281292752231e-06, + "loss": 1.2181274890899658, + "step": 121, + "token_acc": 0.6903644366564449 + }, + { + "epoch": 0.1840467659815199, + "grad_norm": 1.6700843572616577, + "learning_rate": 9.858939689861506e-06, + "loss": 1.1699402332305908, + "step": 122, + "token_acc": 0.6956770957726296 + }, + { + "epoch": 0.18555534603054874, + "grad_norm": 1.816776990890503, + "learning_rate": 9.855013315614725e-06, + "loss": 1.1845039129257202, + "step": 123, + "token_acc": 0.6938677371469192 + }, + { + "epoch": 0.18706392607957759, + "grad_norm": 1.8946900367736816, + "learning_rate": 9.851033847720167e-06, + "loss": 1.1642723083496094, + "step": 124, + "token_acc": 0.6976266310490086 + }, + { + "epoch": 0.18857250612860646, + "grad_norm": 1.687634825706482, + "learning_rate": 9.847001329696653e-06, + "loss": 1.122023105621338, + "step": 125, + "token_acc": 0.713326941514861 + }, + { + "epoch": 0.1900810861776353, + "grad_norm": 1.7281925678253174, + "learning_rate": 9.842915805643156e-06, + "loss": 1.1540591716766357, + "step": 126, + "token_acc": 0.7050361604207758 + }, + { + "epoch": 0.19158966622666415, + "grad_norm": 1.4000903367996216, + "learning_rate": 9.838777320238312e-06, + "loss": 1.1336901187896729, + "step": 127, + "token_acc": 0.7066336086188751 + }, + { + "epoch": 0.193098246275693, + "grad_norm": 1.6249912977218628, + "learning_rate": 9.834585918739936e-06, + "loss": 1.1623244285583496, + "step": 128, + "token_acc": 0.6933034173282706 + }, + { + "epoch": 0.19460682632472184, + "grad_norm": 1.6293485164642334, + "learning_rate": 9.830341646984521e-06, + "loss": 1.179638147354126, + "step": 129, + "token_acc": 0.6921218083058894 + }, + { + "epoch": 0.19611540637375072, + "grad_norm": 1.985175609588623, + "learning_rate": 9.826044551386743e-06, + "loss": 1.1424317359924316, + "step": 130, + "token_acc": 0.7041070145376154 + }, + { + "epoch": 0.19762398642277956, + "grad_norm": 1.820330023765564, + "learning_rate": 9.821694678938954e-06, + "loss": 1.1555452346801758, + "step": 131, + "token_acc": 0.7000307098359217 + }, + { + "epoch": 0.1991325664718084, + "grad_norm": 2.679926633834839, + "learning_rate": 9.817292077210658e-06, + "loss": 1.2416629791259766, + "step": 132, + "token_acc": 0.6857180026452159 + }, + { + "epoch": 0.20064114652083725, + "grad_norm": 1.961089015007019, + "learning_rate": 9.812836794348005e-06, + "loss": 1.1599154472351074, + "step": 133, + "token_acc": 0.707706013363029 + }, + { + "epoch": 0.2021497265698661, + "grad_norm": 1.8208398818969727, + "learning_rate": 9.808328879073251e-06, + "loss": 1.1624665260314941, + "step": 134, + "token_acc": 0.7017644231869464 + }, + { + "epoch": 0.20365830661889497, + "grad_norm": 2.0188512802124023, + "learning_rate": 9.803768380684242e-06, + "loss": 1.1222360134124756, + "step": 135, + "token_acc": 0.7085112722216704 + }, + { + "epoch": 0.20516688666792382, + "grad_norm": 1.6327846050262451, + "learning_rate": 9.79915534905385e-06, + "loss": 1.1569013595581055, + "step": 136, + "token_acc": 0.6957277542589525 + }, + { + "epoch": 0.20667546671695267, + "grad_norm": 1.764774203300476, + "learning_rate": 9.794489834629457e-06, + "loss": 1.0867125988006592, + "step": 137, + "token_acc": 0.7212045067682581 + }, + { + "epoch": 0.2081840467659815, + "grad_norm": 2.229799270629883, + "learning_rate": 9.789771888432375e-06, + "loss": 1.1493878364562988, + "step": 138, + "token_acc": 0.7056579783852511 + }, + { + "epoch": 0.20969262681501036, + "grad_norm": 1.8823713064193726, + "learning_rate": 9.785001562057311e-06, + "loss": 1.2018505334854126, + "step": 139, + "token_acc": 0.6922897154259622 + }, + { + "epoch": 0.21120120686403923, + "grad_norm": 2.318209409713745, + "learning_rate": 9.780178907671788e-06, + "loss": 1.1732218265533447, + "step": 140, + "token_acc": 0.7006454896698517 + }, + { + "epoch": 0.21270978691306808, + "grad_norm": 2.489802837371826, + "learning_rate": 9.775303978015585e-06, + "loss": 1.0911521911621094, + "step": 141, + "token_acc": 0.7155516900944948 + }, + { + "epoch": 0.21421836696209692, + "grad_norm": 2.0332915782928467, + "learning_rate": 9.77037682640015e-06, + "loss": 1.1484969854354858, + "step": 142, + "token_acc": 0.7026216526485954 + }, + { + "epoch": 0.21572694701112577, + "grad_norm": 2.414355993270874, + "learning_rate": 9.765397506708023e-06, + "loss": 1.1555111408233643, + "step": 143, + "token_acc": 0.6970965142574926 + }, + { + "epoch": 0.21723552706015464, + "grad_norm": 1.9892491102218628, + "learning_rate": 9.760366073392246e-06, + "loss": 1.1795839071273804, + "step": 144, + "token_acc": 0.6944251724278659 + }, + { + "epoch": 0.2187441071091835, + "grad_norm": 2.204913377761841, + "learning_rate": 9.755282581475769e-06, + "loss": 1.1478748321533203, + "step": 145, + "token_acc": 0.7039714841873752 + }, + { + "epoch": 0.22025268715821233, + "grad_norm": 2.0797486305236816, + "learning_rate": 9.750147086550843e-06, + "loss": 1.1760129928588867, + "step": 146, + "token_acc": 0.6997278277713753 + }, + { + "epoch": 0.22176126720724118, + "grad_norm": 2.267526388168335, + "learning_rate": 9.744959644778422e-06, + "loss": 1.1616630554199219, + "step": 147, + "token_acc": 0.6984533333333334 + }, + { + "epoch": 0.22326984725627003, + "grad_norm": 2.139894485473633, + "learning_rate": 9.739720312887536e-06, + "loss": 1.102414846420288, + "step": 148, + "token_acc": 0.7106428862854811 + }, + { + "epoch": 0.2247784273052989, + "grad_norm": 2.238325357437134, + "learning_rate": 9.734429148174676e-06, + "loss": 1.1668992042541504, + "step": 149, + "token_acc": 0.7008337690212157 + }, + { + "epoch": 0.22628700735432775, + "grad_norm": 2.0391504764556885, + "learning_rate": 9.729086208503174e-06, + "loss": 1.1308966875076294, + "step": 150, + "token_acc": 0.7064176873246085 + }, + { + "epoch": 0.2277955874033566, + "grad_norm": 2.1557424068450928, + "learning_rate": 9.723691552302563e-06, + "loss": 1.1744414567947388, + "step": 151, + "token_acc": 0.6940479678540036 + }, + { + "epoch": 0.22930416745238544, + "grad_norm": 1.962188720703125, + "learning_rate": 9.718245238567939e-06, + "loss": 1.0274202823638916, + "step": 152, + "token_acc": 0.7308855790324443 + }, + { + "epoch": 0.23081274750141428, + "grad_norm": 2.174717903137207, + "learning_rate": 9.712747326859316e-06, + "loss": 1.0731227397918701, + "step": 153, + "token_acc": 0.7222308851811424 + }, + { + "epoch": 0.23232132755044316, + "grad_norm": 2.0987601280212402, + "learning_rate": 9.707197877300974e-06, + "loss": 1.1705842018127441, + "step": 154, + "token_acc": 0.7008969828757815 + }, + { + "epoch": 0.233829907599472, + "grad_norm": 2.0567665100097656, + "learning_rate": 9.701596950580807e-06, + "loss": 1.0934505462646484, + "step": 155, + "token_acc": 0.7137202525497814 + }, + { + "epoch": 0.23533848764850085, + "grad_norm": 2.062399387359619, + "learning_rate": 9.69594460794965e-06, + "loss": 1.079542875289917, + "step": 156, + "token_acc": 0.7154171487850617 + }, + { + "epoch": 0.2368470676975297, + "grad_norm": 2.9084770679473877, + "learning_rate": 9.690240911220618e-06, + "loss": 1.1270215511322021, + "step": 157, + "token_acc": 0.7089350934568285 + }, + { + "epoch": 0.23835564774655854, + "grad_norm": 1.9727081060409546, + "learning_rate": 9.684485922768422e-06, + "loss": 1.149571418762207, + "step": 158, + "token_acc": 0.7035308198683423 + }, + { + "epoch": 0.23986422779558741, + "grad_norm": 1.8495759963989258, + "learning_rate": 9.678679705528699e-06, + "loss": 1.1205863952636719, + "step": 159, + "token_acc": 0.707068105728935 + }, + { + "epoch": 0.24137280784461626, + "grad_norm": 2.0891835689544678, + "learning_rate": 9.672822322997305e-06, + "loss": 1.0921928882598877, + "step": 160, + "token_acc": 0.7187694978704935 + }, + { + "epoch": 0.2428813878936451, + "grad_norm": 1.8296209573745728, + "learning_rate": 9.666913839229639e-06, + "loss": 1.0496504306793213, + "step": 161, + "token_acc": 0.7255546883710559 + }, + { + "epoch": 0.24438996794267395, + "grad_norm": 2.522357940673828, + "learning_rate": 9.660954318839934e-06, + "loss": 1.1296055316925049, + "step": 162, + "token_acc": 0.7070172971452678 + }, + { + "epoch": 0.2458985479917028, + "grad_norm": 2.0354416370391846, + "learning_rate": 9.654943827000548e-06, + "loss": 1.1254849433898926, + "step": 163, + "token_acc": 0.7150837988826816 + }, + { + "epoch": 0.24740712804073167, + "grad_norm": 1.871104121208191, + "learning_rate": 9.648882429441258e-06, + "loss": 1.1197452545166016, + "step": 164, + "token_acc": 0.7105013292821876 + }, + { + "epoch": 0.24891570808976052, + "grad_norm": 2.1282925605773926, + "learning_rate": 9.642770192448537e-06, + "loss": 1.1131396293640137, + "step": 165, + "token_acc": 0.71103066439523 + }, + { + "epoch": 0.25042428813878936, + "grad_norm": 1.5473743677139282, + "learning_rate": 9.636607182864828e-06, + "loss": 1.02193021774292, + "step": 166, + "token_acc": 0.7259595405508735 + }, + { + "epoch": 0.25193286818781824, + "grad_norm": 1.9163477420806885, + "learning_rate": 9.630393468087818e-06, + "loss": 1.1910886764526367, + "step": 167, + "token_acc": 0.6917693900943112 + }, + { + "epoch": 0.25344144823684706, + "grad_norm": 1.8917031288146973, + "learning_rate": 9.624129116069695e-06, + "loss": 1.1297574043273926, + "step": 168, + "token_acc": 0.7016270337922403 + }, + { + "epoch": 0.25495002828587593, + "grad_norm": 2.0999059677124023, + "learning_rate": 9.61781419531641e-06, + "loss": 1.1372027397155762, + "step": 169, + "token_acc": 0.7038554621043299 + }, + { + "epoch": 0.25645860833490475, + "grad_norm": 1.9726512432098389, + "learning_rate": 9.611448774886925e-06, + "loss": 1.162552833557129, + "step": 170, + "token_acc": 0.6968314606741574 + }, + { + "epoch": 0.2579671883839336, + "grad_norm": 1.959041714668274, + "learning_rate": 9.605032924392457e-06, + "loss": 1.1787631511688232, + "step": 171, + "token_acc": 0.7001003456349649 + }, + { + "epoch": 0.2594757684329625, + "grad_norm": 1.9699612855911255, + "learning_rate": 9.598566713995718e-06, + "loss": 1.1443536281585693, + "step": 172, + "token_acc": 0.7023014275256223 + }, + { + "epoch": 0.2609843484819913, + "grad_norm": 2.174241065979004, + "learning_rate": 9.592050214410152e-06, + "loss": 1.132582426071167, + "step": 173, + "token_acc": 0.7084237165582068 + }, + { + "epoch": 0.2624929285310202, + "grad_norm": 2.2024037837982178, + "learning_rate": 9.585483496899151e-06, + "loss": 1.056187629699707, + "step": 174, + "token_acc": 0.7221542592868865 + }, + { + "epoch": 0.264001508580049, + "grad_norm": 1.7501752376556396, + "learning_rate": 9.578866633275289e-06, + "loss": 1.0824569463729858, + "step": 175, + "token_acc": 0.7156855931614428 + }, + { + "epoch": 0.2655100886290779, + "grad_norm": 1.742267370223999, + "learning_rate": 9.572199695899522e-06, + "loss": 1.1377878189086914, + "step": 176, + "token_acc": 0.7083164847014423 + }, + { + "epoch": 0.26701866867810675, + "grad_norm": 2.0966010093688965, + "learning_rate": 9.565482757680415e-06, + "loss": 1.1372158527374268, + "step": 177, + "token_acc": 0.7038177681593696 + }, + { + "epoch": 0.26852724872713557, + "grad_norm": 2.0057849884033203, + "learning_rate": 9.558715892073324e-06, + "loss": 1.1649508476257324, + "step": 178, + "token_acc": 0.6971963453959155 + }, + { + "epoch": 0.27003582877616444, + "grad_norm": 2.1614139080047607, + "learning_rate": 9.551899173079607e-06, + "loss": 1.1121848821640015, + "step": 179, + "token_acc": 0.7184943207360566 + }, + { + "epoch": 0.27154440882519326, + "grad_norm": 2.0753509998321533, + "learning_rate": 9.545032675245814e-06, + "loss": 1.109001636505127, + "step": 180, + "token_acc": 0.7115236084650636 + }, + { + "epoch": 0.27305298887422214, + "grad_norm": 2.0139389038085938, + "learning_rate": 9.538116473662862e-06, + "loss": 1.0812448263168335, + "step": 181, + "token_acc": 0.7192062112173626 + }, + { + "epoch": 0.274561568923251, + "grad_norm": 1.9699065685272217, + "learning_rate": 9.531150643965224e-06, + "loss": 1.1494128704071045, + "step": 182, + "token_acc": 0.7048619661276643 + }, + { + "epoch": 0.2760701489722798, + "grad_norm": 2.0108418464660645, + "learning_rate": 9.524135262330098e-06, + "loss": 1.1145198345184326, + "step": 183, + "token_acc": 0.70841774567662 + }, + { + "epoch": 0.2775787290213087, + "grad_norm": 1.8934191465377808, + "learning_rate": 9.517070405476575e-06, + "loss": 1.1146862506866455, + "step": 184, + "token_acc": 0.7129017447199265 + }, + { + "epoch": 0.2790873090703375, + "grad_norm": 2.1472039222717285, + "learning_rate": 9.509956150664796e-06, + "loss": 1.1092115640640259, + "step": 185, + "token_acc": 0.7077206305373375 + }, + { + "epoch": 0.2805958891193664, + "grad_norm": 1.921444296836853, + "learning_rate": 9.502792575695112e-06, + "loss": 1.113539695739746, + "step": 186, + "token_acc": 0.7090873656662398 + }, + { + "epoch": 0.28210446916839527, + "grad_norm": 1.6960666179656982, + "learning_rate": 9.495579758907231e-06, + "loss": 1.1016550064086914, + "step": 187, + "token_acc": 0.7150675898450379 + }, + { + "epoch": 0.2836130492174241, + "grad_norm": 1.7241804599761963, + "learning_rate": 9.48831777917936e-06, + "loss": 1.1296112537384033, + "step": 188, + "token_acc": 0.7053865973175372 + }, + { + "epoch": 0.28512162926645296, + "grad_norm": 1.6816834211349487, + "learning_rate": 9.481006715927352e-06, + "loss": 1.1421611309051514, + "step": 189, + "token_acc": 0.7015271338969185 + }, + { + "epoch": 0.2866302093154818, + "grad_norm": 1.7255656719207764, + "learning_rate": 9.473646649103819e-06, + "loss": 1.149240255355835, + "step": 190, + "token_acc": 0.6989262603996302 + }, + { + "epoch": 0.28813878936451065, + "grad_norm": 1.882737398147583, + "learning_rate": 9.466237659197271e-06, + "loss": 1.0752456188201904, + "step": 191, + "token_acc": 0.7176824483390007 + }, + { + "epoch": 0.2896473694135395, + "grad_norm": 1.7682944536209106, + "learning_rate": 9.458779827231237e-06, + "loss": 1.1241464614868164, + "step": 192, + "token_acc": 0.7075347921840589 + }, + { + "epoch": 0.29115594946256834, + "grad_norm": 1.596488356590271, + "learning_rate": 9.451273234763372e-06, + "loss": 1.0523662567138672, + "step": 193, + "token_acc": 0.7233033524121014 + }, + { + "epoch": 0.2926645295115972, + "grad_norm": 2.074023962020874, + "learning_rate": 9.443717963884568e-06, + "loss": 1.1342031955718994, + "step": 194, + "token_acc": 0.7073977598332899 + }, + { + "epoch": 0.29417310956062603, + "grad_norm": 1.837063193321228, + "learning_rate": 9.43611409721806e-06, + "loss": 1.1586201190948486, + "step": 195, + "token_acc": 0.698721446023199 + }, + { + "epoch": 0.2956816896096549, + "grad_norm": 2.3650221824645996, + "learning_rate": 9.428461717918512e-06, + "loss": 1.0968009233474731, + "step": 196, + "token_acc": 0.7157787012342616 + }, + { + "epoch": 0.2971902696586838, + "grad_norm": 1.7242475748062134, + "learning_rate": 9.420760909671119e-06, + "loss": 1.1063261032104492, + "step": 197, + "token_acc": 0.7132504642046626 + }, + { + "epoch": 0.2986988497077126, + "grad_norm": 2.012909173965454, + "learning_rate": 9.413011756690686e-06, + "loss": 1.1428766250610352, + "step": 198, + "token_acc": 0.6996804377313378 + }, + { + "epoch": 0.3002074297567415, + "grad_norm": 2.1190404891967773, + "learning_rate": 9.405214343720708e-06, + "loss": 1.0923025608062744, + "step": 199, + "token_acc": 0.7120200826934436 + }, + { + "epoch": 0.3017160098057703, + "grad_norm": 1.751379132270813, + "learning_rate": 9.397368756032445e-06, + "loss": 1.0642454624176025, + "step": 200, + "token_acc": 0.7237617590388756 + }, + { + "epoch": 0.30322458985479916, + "grad_norm": 3.0516157150268555, + "learning_rate": 9.389475079423988e-06, + "loss": 1.0653831958770752, + "step": 201, + "token_acc": 0.7211538461538461 + }, + { + "epoch": 0.30473316990382804, + "grad_norm": 1.9552617073059082, + "learning_rate": 9.381533400219319e-06, + "loss": 1.08341646194458, + "step": 202, + "token_acc": 0.7225309696047688 + }, + { + "epoch": 0.30624174995285686, + "grad_norm": 2.0092577934265137, + "learning_rate": 9.373543805267367e-06, + "loss": 1.056673526763916, + "step": 203, + "token_acc": 0.7214854111405835 + }, + { + "epoch": 0.30775033000188573, + "grad_norm": 2.015540599822998, + "learning_rate": 9.365506381941066e-06, + "loss": 1.1072001457214355, + "step": 204, + "token_acc": 0.7112011334836663 + }, + { + "epoch": 0.3092589100509146, + "grad_norm": 1.9784111976623535, + "learning_rate": 9.357421218136387e-06, + "loss": 1.0533015727996826, + "step": 205, + "token_acc": 0.7225071073677498 + }, + { + "epoch": 0.3107674900999434, + "grad_norm": 1.9075231552124023, + "learning_rate": 9.349288402271387e-06, + "loss": 1.1120229959487915, + "step": 206, + "token_acc": 0.7109068602803644 + }, + { + "epoch": 0.3122760701489723, + "grad_norm": 1.850492000579834, + "learning_rate": 9.341108023285239e-06, + "loss": 1.109849452972412, + "step": 207, + "token_acc": 0.7107822786311606 + }, + { + "epoch": 0.3137846501980011, + "grad_norm": 1.926287293434143, + "learning_rate": 9.332880170637252e-06, + "loss": 1.0478284358978271, + "step": 208, + "token_acc": 0.7269679189209785 + }, + { + "epoch": 0.31529323024703, + "grad_norm": 1.6131747961044312, + "learning_rate": 9.324604934305911e-06, + "loss": 1.1022543907165527, + "step": 209, + "token_acc": 0.7082330899103664 + }, + { + "epoch": 0.31680181029605886, + "grad_norm": 2.119939088821411, + "learning_rate": 9.31628240478787e-06, + "loss": 1.1021747589111328, + "step": 210, + "token_acc": 0.7140350317455253 + }, + { + "epoch": 0.3183103903450877, + "grad_norm": 1.8385939598083496, + "learning_rate": 9.30791267309698e-06, + "loss": 1.1266725063323975, + "step": 211, + "token_acc": 0.7078918194240338 + }, + { + "epoch": 0.31981897039411655, + "grad_norm": 1.9170836210250854, + "learning_rate": 9.299495830763285e-06, + "loss": 1.0435452461242676, + "step": 212, + "token_acc": 0.7232079874111454 + }, + { + "epoch": 0.32132755044314537, + "grad_norm": 1.7575520277023315, + "learning_rate": 9.291031969832026e-06, + "loss": 1.1187375783920288, + "step": 213, + "token_acc": 0.7062836458139966 + }, + { + "epoch": 0.32283613049217424, + "grad_norm": 1.5742466449737549, + "learning_rate": 9.28252118286263e-06, + "loss": 1.1634769439697266, + "step": 214, + "token_acc": 0.6947277505191393 + }, + { + "epoch": 0.3243447105412031, + "grad_norm": 1.980936884880066, + "learning_rate": 9.273963562927695e-06, + "loss": 1.1613801717758179, + "step": 215, + "token_acc": 0.6972912445344993 + }, + { + "epoch": 0.32585329059023194, + "grad_norm": 1.5372153520584106, + "learning_rate": 9.265359203611988e-06, + "loss": 1.124567985534668, + "step": 216, + "token_acc": 0.706635970616147 + }, + { + "epoch": 0.3273618706392608, + "grad_norm": 1.8141430616378784, + "learning_rate": 9.256708199011402e-06, + "loss": 1.1056387424468994, + "step": 217, + "token_acc": 0.7131346947885281 + }, + { + "epoch": 0.32887045068828963, + "grad_norm": 2.080238103866577, + "learning_rate": 9.248010643731936e-06, + "loss": 1.111203670501709, + "step": 218, + "token_acc": 0.706952212067476 + }, + { + "epoch": 0.3303790307373185, + "grad_norm": 2.1560044288635254, + "learning_rate": 9.23926663288866e-06, + "loss": 1.0826705694198608, + "step": 219, + "token_acc": 0.7126981576692374 + }, + { + "epoch": 0.3318876107863474, + "grad_norm": 1.7542237043380737, + "learning_rate": 9.230476262104678e-06, + "loss": 1.0765860080718994, + "step": 220, + "token_acc": 0.7191515683614016 + }, + { + "epoch": 0.3333961908353762, + "grad_norm": 1.6123614311218262, + "learning_rate": 9.221639627510076e-06, + "loss": 1.0405281782150269, + "step": 221, + "token_acc": 0.7257684210526316 + }, + { + "epoch": 0.33490477088440507, + "grad_norm": 1.97556734085083, + "learning_rate": 9.212756825740874e-06, + "loss": 1.1250073909759521, + "step": 222, + "token_acc": 0.7071779010238908 + }, + { + "epoch": 0.3364133509334339, + "grad_norm": 2.338576316833496, + "learning_rate": 9.203827953937969e-06, + "loss": 1.0830904245376587, + "step": 223, + "token_acc": 0.710997780361484 + }, + { + "epoch": 0.33792193098246276, + "grad_norm": 1.844110131263733, + "learning_rate": 9.194853109746073e-06, + "loss": 1.1183950901031494, + "step": 224, + "token_acc": 0.7052707581227436 + }, + { + "epoch": 0.33943051103149163, + "grad_norm": 1.9182608127593994, + "learning_rate": 9.185832391312644e-06, + "loss": 1.1435399055480957, + "step": 225, + "token_acc": 0.6981693425178911 + }, + { + "epoch": 0.34093909108052045, + "grad_norm": 2.943948984146118, + "learning_rate": 9.176765897286812e-06, + "loss": 1.083754301071167, + "step": 226, + "token_acc": 0.7200755077440925 + }, + { + "epoch": 0.3424476711295493, + "grad_norm": 2.2212650775909424, + "learning_rate": 9.167653726818305e-06, + "loss": 1.0911099910736084, + "step": 227, + "token_acc": 0.7122508484298073 + }, + { + "epoch": 0.34395625117857814, + "grad_norm": 2.2965805530548096, + "learning_rate": 9.15849597955636e-06, + "loss": 1.0312912464141846, + "step": 228, + "token_acc": 0.730110178220294 + }, + { + "epoch": 0.345464831227607, + "grad_norm": 2.0392351150512695, + "learning_rate": 9.149292755648631e-06, + "loss": 1.0396887063980103, + "step": 229, + "token_acc": 0.7272702203347765 + }, + { + "epoch": 0.3469734112766359, + "grad_norm": 2.050694465637207, + "learning_rate": 9.140044155740102e-06, + "loss": 1.182438850402832, + "step": 230, + "token_acc": 0.6979478460764829 + }, + { + "epoch": 0.3484819913256647, + "grad_norm": 2.1184041500091553, + "learning_rate": 9.130750280971978e-06, + "loss": 1.0819886922836304, + "step": 231, + "token_acc": 0.7168344773156674 + }, + { + "epoch": 0.3499905713746936, + "grad_norm": 1.7165330648422241, + "learning_rate": 9.121411232980589e-06, + "loss": 1.1045360565185547, + "step": 232, + "token_acc": 0.7141434047252961 + }, + { + "epoch": 0.3514991514237224, + "grad_norm": 1.8305047750473022, + "learning_rate": 9.112027113896262e-06, + "loss": 1.0134657621383667, + "step": 233, + "token_acc": 0.7282068279748094 + }, + { + "epoch": 0.3530077314727513, + "grad_norm": 2.090362548828125, + "learning_rate": 9.102598026342223e-06, + "loss": 1.1045575141906738, + "step": 234, + "token_acc": 0.7151906334434104 + }, + { + "epoch": 0.35451631152178015, + "grad_norm": 2.2762529850006104, + "learning_rate": 9.093124073433464e-06, + "loss": 1.1139613389968872, + "step": 235, + "token_acc": 0.7018661915530278 + }, + { + "epoch": 0.35602489157080897, + "grad_norm": 2.042255401611328, + "learning_rate": 9.083605358775612e-06, + "loss": 1.0923964977264404, + "step": 236, + "token_acc": 0.7140312139308476 + }, + { + "epoch": 0.35753347161983784, + "grad_norm": 1.8897353410720825, + "learning_rate": 9.074041986463808e-06, + "loss": 1.1121604442596436, + "step": 237, + "token_acc": 0.7068383874154288 + }, + { + "epoch": 0.35904205166886666, + "grad_norm": 2.287414073944092, + "learning_rate": 9.064434061081562e-06, + "loss": 1.0770710706710815, + "step": 238, + "token_acc": 0.7219777217956111 + }, + { + "epoch": 0.36055063171789553, + "grad_norm": 1.7816894054412842, + "learning_rate": 9.0547816876996e-06, + "loss": 1.1002464294433594, + "step": 239, + "token_acc": 0.7130671462246956 + }, + { + "epoch": 0.3620592117669244, + "grad_norm": 1.9119371175765991, + "learning_rate": 9.045084971874738e-06, + "loss": 1.1262714862823486, + "step": 240, + "token_acc": 0.7046194369207067 + }, + { + "epoch": 0.3635677918159532, + "grad_norm": 1.7933969497680664, + "learning_rate": 9.035344019648701e-06, + "loss": 1.1534756422042847, + "step": 241, + "token_acc": 0.6988591568891156 + }, + { + "epoch": 0.3650763718649821, + "grad_norm": 1.6037219762802124, + "learning_rate": 9.025558937546987e-06, + "loss": 1.02713942527771, + "step": 242, + "token_acc": 0.7251662609962244 + }, + { + "epoch": 0.3665849519140109, + "grad_norm": 1.6463598012924194, + "learning_rate": 9.015729832577681e-06, + "loss": 1.0751371383666992, + "step": 243, + "token_acc": 0.7189655914286943 + }, + { + "epoch": 0.3680935319630398, + "grad_norm": 1.8134129047393799, + "learning_rate": 9.005856812230304e-06, + "loss": 1.080014705657959, + "step": 244, + "token_acc": 0.7103486655859081 + }, + { + "epoch": 0.36960211201206866, + "grad_norm": 1.7472238540649414, + "learning_rate": 8.995939984474624e-06, + "loss": 1.0555657148361206, + "step": 245, + "token_acc": 0.7222912903755483 + }, + { + "epoch": 0.3711106920610975, + "grad_norm": 1.948832631111145, + "learning_rate": 8.98597945775948e-06, + "loss": 1.1867132186889648, + "step": 246, + "token_acc": 0.691625056740808 + }, + { + "epoch": 0.37261927211012635, + "grad_norm": 1.63556969165802, + "learning_rate": 8.975975341011595e-06, + "loss": 1.1363940238952637, + "step": 247, + "token_acc": 0.6995856657997972 + }, + { + "epoch": 0.37412785215915517, + "grad_norm": 1.9527708292007446, + "learning_rate": 8.96592774363439e-06, + "loss": 1.1012463569641113, + "step": 248, + "token_acc": 0.7145785358200268 + }, + { + "epoch": 0.37563643220818405, + "grad_norm": 1.54280686378479, + "learning_rate": 8.955836775506776e-06, + "loss": 1.0727810859680176, + "step": 249, + "token_acc": 0.7130031019401576 + }, + { + "epoch": 0.3771450122572129, + "grad_norm": 1.6649891138076782, + "learning_rate": 8.94570254698197e-06, + "loss": 1.0866367816925049, + "step": 250, + "token_acc": 0.7192876327824425 + }, + { + "epoch": 0.37865359230624174, + "grad_norm": 1.6812975406646729, + "learning_rate": 8.935525168886263e-06, + "loss": 1.1309764385223389, + "step": 251, + "token_acc": 0.7100875893877205 + }, + { + "epoch": 0.3801621723552706, + "grad_norm": 1.8071800470352173, + "learning_rate": 8.92530475251784e-06, + "loss": 1.0937108993530273, + "step": 252, + "token_acc": 0.7171619442524764 + }, + { + "epoch": 0.38167075240429943, + "grad_norm": 1.5008201599121094, + "learning_rate": 8.91504140964553e-06, + "loss": 0.9853099584579468, + "step": 253, + "token_acc": 0.7420177269771739 + }, + { + "epoch": 0.3831793324533283, + "grad_norm": 2.0749621391296387, + "learning_rate": 8.90473525250761e-06, + "loss": 1.1279444694519043, + "step": 254, + "token_acc": 0.7038175762234179 + }, + { + "epoch": 0.3846879125023572, + "grad_norm": 1.776676058769226, + "learning_rate": 8.894386393810563e-06, + "loss": 1.1233603954315186, + "step": 255, + "token_acc": 0.7035439580188341 + }, + { + "epoch": 0.386196492551386, + "grad_norm": 1.928527593612671, + "learning_rate": 8.883994946727848e-06, + "loss": 1.0377331972122192, + "step": 256, + "token_acc": 0.7301100538805457 + }, + { + "epoch": 0.38770507260041487, + "grad_norm": 1.7477319240570068, + "learning_rate": 8.873561024898668e-06, + "loss": 1.0835624933242798, + "step": 257, + "token_acc": 0.7145872156986012 + }, + { + "epoch": 0.3892136526494437, + "grad_norm": 2.0472376346588135, + "learning_rate": 8.863084742426719e-06, + "loss": 1.0790083408355713, + "step": 258, + "token_acc": 0.7136501299884752 + }, + { + "epoch": 0.39072223269847256, + "grad_norm": 1.4249579906463623, + "learning_rate": 8.852566213878947e-06, + "loss": 1.0344488620758057, + "step": 259, + "token_acc": 0.7237232704402515 + }, + { + "epoch": 0.39223081274750143, + "grad_norm": 2.2533130645751953, + "learning_rate": 8.842005554284296e-06, + "loss": 1.0688493251800537, + "step": 260, + "token_acc": 0.7219815288374157 + }, + { + "epoch": 0.39373939279653025, + "grad_norm": 1.5969963073730469, + "learning_rate": 8.831402879132447e-06, + "loss": 1.1118829250335693, + "step": 261, + "token_acc": 0.7106441125194365 + }, + { + "epoch": 0.3952479728455591, + "grad_norm": 1.8639003038406372, + "learning_rate": 8.820758304372557e-06, + "loss": 1.0827083587646484, + "step": 262, + "token_acc": 0.714449322389769 + }, + { + "epoch": 0.39675655289458794, + "grad_norm": 1.7328518629074097, + "learning_rate": 8.810071946411989e-06, + "loss": 1.1757081747055054, + "step": 263, + "token_acc": 0.6985168976416242 + }, + { + "epoch": 0.3982651329436168, + "grad_norm": 1.6629317998886108, + "learning_rate": 8.799343922115045e-06, + "loss": 1.1307930946350098, + "step": 264, + "token_acc": 0.7089913748966169 + }, + { + "epoch": 0.3997737129926457, + "grad_norm": 1.8871866464614868, + "learning_rate": 8.788574348801676e-06, + "loss": 1.0787943601608276, + "step": 265, + "token_acc": 0.7168478409682101 + }, + { + "epoch": 0.4012822930416745, + "grad_norm": 3.168402910232544, + "learning_rate": 8.777763344246209e-06, + "loss": 1.0637046098709106, + "step": 266, + "token_acc": 0.7181803549152952 + }, + { + "epoch": 0.4027908730907034, + "grad_norm": 2.578517198562622, + "learning_rate": 8.766911026676063e-06, + "loss": 1.1035090684890747, + "step": 267, + "token_acc": 0.7123029875323453 + }, + { + "epoch": 0.4042994531397322, + "grad_norm": 1.7681666612625122, + "learning_rate": 8.756017514770444e-06, + "loss": 1.0298861265182495, + "step": 268, + "token_acc": 0.7254434884959224 + }, + { + "epoch": 0.4058080331887611, + "grad_norm": 1.7394912242889404, + "learning_rate": 8.745082927659048e-06, + "loss": 1.1273338794708252, + "step": 269, + "token_acc": 0.7080005354513409 + }, + { + "epoch": 0.40731661323778995, + "grad_norm": 1.6136854887008667, + "learning_rate": 8.734107384920771e-06, + "loss": 1.114363670349121, + "step": 270, + "token_acc": 0.7102918953312654 + }, + { + "epoch": 0.40882519328681877, + "grad_norm": 2.029174327850342, + "learning_rate": 8.72309100658239e-06, + "loss": 1.1459369659423828, + "step": 271, + "token_acc": 0.697837404885863 + }, + { + "epoch": 0.41033377333584764, + "grad_norm": 1.624672770500183, + "learning_rate": 8.71203391311725e-06, + "loss": 1.0908006429672241, + "step": 272, + "token_acc": 0.7149325820084524 + }, + { + "epoch": 0.41184235338487646, + "grad_norm": 1.9763164520263672, + "learning_rate": 8.700936225443958e-06, + "loss": 1.0963537693023682, + "step": 273, + "token_acc": 0.7116567978636944 + }, + { + "epoch": 0.41335093343390533, + "grad_norm": 1.8740562200546265, + "learning_rate": 8.689798064925049e-06, + "loss": 1.057687759399414, + "step": 274, + "token_acc": 0.7253401162112337 + }, + { + "epoch": 0.4148595134829342, + "grad_norm": 2.1799628734588623, + "learning_rate": 8.67861955336566e-06, + "loss": 1.0733835697174072, + "step": 275, + "token_acc": 0.7190192180251822 + }, + { + "epoch": 0.416368093531963, + "grad_norm": 1.900963306427002, + "learning_rate": 8.6674008130122e-06, + "loss": 1.0639750957489014, + "step": 276, + "token_acc": 0.7232151847749472 + }, + { + "epoch": 0.4178766735809919, + "grad_norm": 2.9893555641174316, + "learning_rate": 8.65614196655102e-06, + "loss": 1.060741901397705, + "step": 277, + "token_acc": 0.7205446446970057 + }, + { + "epoch": 0.4193852536300207, + "grad_norm": 1.5394455194473267, + "learning_rate": 8.644843137107058e-06, + "loss": 1.0864263772964478, + "step": 278, + "token_acc": 0.7105863453815261 + }, + { + "epoch": 0.4208938336790496, + "grad_norm": 1.7442381381988525, + "learning_rate": 8.633504448242504e-06, + "loss": 0.9813418388366699, + "step": 279, + "token_acc": 0.7447164290027405 + }, + { + "epoch": 0.42240241372807846, + "grad_norm": 1.7886149883270264, + "learning_rate": 8.622126023955446e-06, + "loss": 1.0896599292755127, + "step": 280, + "token_acc": 0.7198823652977628 + }, + { + "epoch": 0.4239109937771073, + "grad_norm": 1.6104036569595337, + "learning_rate": 8.610707988678504e-06, + "loss": 1.0969982147216797, + "step": 281, + "token_acc": 0.7091311067227613 + }, + { + "epoch": 0.42541957382613615, + "grad_norm": 2.182807683944702, + "learning_rate": 8.599250467277483e-06, + "loss": 0.9899187088012695, + "step": 282, + "token_acc": 0.7401355013550136 + }, + { + "epoch": 0.42692815387516503, + "grad_norm": 1.9726905822753906, + "learning_rate": 8.587753585050004e-06, + "loss": 1.0537909269332886, + "step": 283, + "token_acc": 0.7198859586600143 + }, + { + "epoch": 0.42843673392419385, + "grad_norm": 1.125136375427246, + "learning_rate": 8.576217467724129e-06, + "loss": 0.9191839098930359, + "step": 284, + "token_acc": 0.7527989442765315 + }, + { + "epoch": 0.4299453139732227, + "grad_norm": 1.652367115020752, + "learning_rate": 8.564642241456986e-06, + "loss": 1.082000732421875, + "step": 285, + "token_acc": 0.7173268765133172 + }, + { + "epoch": 0.43145389402225154, + "grad_norm": 1.799703598022461, + "learning_rate": 8.553028032833397e-06, + "loss": 1.031562089920044, + "step": 286, + "token_acc": 0.7287473083044659 + }, + { + "epoch": 0.4329624740712804, + "grad_norm": 2.1267964839935303, + "learning_rate": 8.541374968864486e-06, + "loss": 1.0316166877746582, + "step": 287, + "token_acc": 0.7268808159402765 + }, + { + "epoch": 0.4344710541203093, + "grad_norm": 2.5477280616760254, + "learning_rate": 8.529683176986295e-06, + "loss": 1.0617053508758545, + "step": 288, + "token_acc": 0.7170723165820346 + }, + { + "epoch": 0.4359796341693381, + "grad_norm": 1.9452764987945557, + "learning_rate": 8.517952785058385e-06, + "loss": 1.0585565567016602, + "step": 289, + "token_acc": 0.7259486742884943 + }, + { + "epoch": 0.437488214218367, + "grad_norm": 1.803718090057373, + "learning_rate": 8.506183921362443e-06, + "loss": 1.0784348249435425, + "step": 290, + "token_acc": 0.7115873888176508 + }, + { + "epoch": 0.4389967942673958, + "grad_norm": 2.3601601123809814, + "learning_rate": 8.494376714600878e-06, + "loss": 1.071596384048462, + "step": 291, + "token_acc": 0.7211230339556078 + }, + { + "epoch": 0.44050537431642467, + "grad_norm": 1.7919965982437134, + "learning_rate": 8.482531293895412e-06, + "loss": 1.0617738962173462, + "step": 292, + "token_acc": 0.7209671179883946 + }, + { + "epoch": 0.44201395436545354, + "grad_norm": 1.8360674381256104, + "learning_rate": 8.470647788785665e-06, + "loss": 1.10646390914917, + "step": 293, + "token_acc": 0.7074055568831056 + }, + { + "epoch": 0.44352253441448236, + "grad_norm": 1.814076542854309, + "learning_rate": 8.458726329227748e-06, + "loss": 1.1367568969726562, + "step": 294, + "token_acc": 0.7044879344475906 + }, + { + "epoch": 0.44503111446351123, + "grad_norm": 1.769569993019104, + "learning_rate": 8.446767045592829e-06, + "loss": 1.077988624572754, + "step": 295, + "token_acc": 0.718860510805501 + }, + { + "epoch": 0.44653969451254005, + "grad_norm": 1.9124197959899902, + "learning_rate": 8.434770068665723e-06, + "loss": 0.9767885804176331, + "step": 296, + "token_acc": 0.7394225753996256 + }, + { + "epoch": 0.4480482745615689, + "grad_norm": 1.9047906398773193, + "learning_rate": 8.422735529643445e-06, + "loss": 1.091620922088623, + "step": 297, + "token_acc": 0.7193328647888847 + }, + { + "epoch": 0.4495568546105978, + "grad_norm": 1.943240761756897, + "learning_rate": 8.410663560133784e-06, + "loss": 1.0385065078735352, + "step": 298, + "token_acc": 0.7346272922124315 + }, + { + "epoch": 0.4510654346596266, + "grad_norm": 1.8141487836837769, + "learning_rate": 8.398554292153866e-06, + "loss": 1.060175895690918, + "step": 299, + "token_acc": 0.719645311001063 + }, + { + "epoch": 0.4525740147086555, + "grad_norm": 2.1070666313171387, + "learning_rate": 8.386407858128707e-06, + "loss": 1.0710911750793457, + "step": 300, + "token_acc": 0.7184140312328338 + }, + { + "epoch": 0.4540825947576843, + "grad_norm": 1.9047869443893433, + "learning_rate": 8.37422439088976e-06, + "loss": 1.0909945964813232, + "step": 301, + "token_acc": 0.7097920165353313 + }, + { + "epoch": 0.4555911748067132, + "grad_norm": 1.860037922859192, + "learning_rate": 8.362004023673473e-06, + "loss": 1.130741834640503, + "step": 302, + "token_acc": 0.7010990492886522 + }, + { + "epoch": 0.45709975485574206, + "grad_norm": 2.0214152336120605, + "learning_rate": 8.349746890119826e-06, + "loss": 1.0799329280853271, + "step": 303, + "token_acc": 0.7161689916754269 + }, + { + "epoch": 0.4586083349047709, + "grad_norm": 1.7764649391174316, + "learning_rate": 8.337453124270864e-06, + "loss": 1.0793256759643555, + "step": 304, + "token_acc": 0.7159331114878299 + }, + { + "epoch": 0.46011691495379975, + "grad_norm": 1.6309493780136108, + "learning_rate": 8.325122860569241e-06, + "loss": 1.1250845193862915, + "step": 305, + "token_acc": 0.708081375744633 + }, + { + "epoch": 0.46162549500282857, + "grad_norm": 2.3263089656829834, + "learning_rate": 8.31275623385675e-06, + "loss": 1.1161826848983765, + "step": 306, + "token_acc": 0.7087722931172832 + }, + { + "epoch": 0.46313407505185744, + "grad_norm": 1.776930570602417, + "learning_rate": 8.300353379372834e-06, + "loss": 1.0625174045562744, + "step": 307, + "token_acc": 0.7188530279368385 + }, + { + "epoch": 0.4646426551008863, + "grad_norm": 1.901285171508789, + "learning_rate": 8.287914432753123e-06, + "loss": 1.0942513942718506, + "step": 308, + "token_acc": 0.710700877861518 + }, + { + "epoch": 0.46615123514991513, + "grad_norm": 1.5522140264511108, + "learning_rate": 8.275439530027948e-06, + "loss": 1.1332457065582275, + "step": 309, + "token_acc": 0.7041547583500405 + }, + { + "epoch": 0.467659815198944, + "grad_norm": 1.5998601913452148, + "learning_rate": 8.262928807620843e-06, + "loss": 1.1073472499847412, + "step": 310, + "token_acc": 0.7106197383101734 + }, + { + "epoch": 0.4691683952479728, + "grad_norm": 1.6061841249465942, + "learning_rate": 8.250382402347066e-06, + "loss": 1.1213736534118652, + "step": 311, + "token_acc": 0.6994938715775352 + }, + { + "epoch": 0.4706769752970017, + "grad_norm": 2.0051493644714355, + "learning_rate": 8.237800451412095e-06, + "loss": 0.9779979586601257, + "step": 312, + "token_acc": 0.7345784558726819 + }, + { + "epoch": 0.47218555534603057, + "grad_norm": 2.350712299346924, + "learning_rate": 8.225183092410128e-06, + "loss": 1.01034677028656, + "step": 313, + "token_acc": 0.736130832884493 + }, + { + "epoch": 0.4736941353950594, + "grad_norm": 1.389146089553833, + "learning_rate": 8.212530463322584e-06, + "loss": 1.1225346326828003, + "step": 314, + "token_acc": 0.6967101421062877 + }, + { + "epoch": 0.47520271544408826, + "grad_norm": 1.7256191968917847, + "learning_rate": 8.199842702516584e-06, + "loss": 1.0473668575286865, + "step": 315, + "token_acc": 0.7253412192902639 + }, + { + "epoch": 0.4767112954931171, + "grad_norm": 1.716261625289917, + "learning_rate": 8.18711994874345e-06, + "loss": 1.123284935951233, + "step": 316, + "token_acc": 0.704513054024725 + }, + { + "epoch": 0.47821987554214596, + "grad_norm": 1.772814393043518, + "learning_rate": 8.174362341137177e-06, + "loss": 1.1330331563949585, + "step": 317, + "token_acc": 0.7030755411070237 + }, + { + "epoch": 0.47972845559117483, + "grad_norm": 1.7024438381195068, + "learning_rate": 8.161570019212921e-06, + "loss": 1.140034556388855, + "step": 318, + "token_acc": 0.7041194961121221 + }, + { + "epoch": 0.48123703564020365, + "grad_norm": 1.7964457273483276, + "learning_rate": 8.148743122865463e-06, + "loss": 1.0832043886184692, + "step": 319, + "token_acc": 0.7051326613370409 + }, + { + "epoch": 0.4827456156892325, + "grad_norm": 2.058018922805786, + "learning_rate": 8.135881792367686e-06, + "loss": 0.9922590255737305, + "step": 320, + "token_acc": 0.7340101789180964 + }, + { + "epoch": 0.48425419573826134, + "grad_norm": 1.8939220905303955, + "learning_rate": 8.12298616836904e-06, + "loss": 1.0277425050735474, + "step": 321, + "token_acc": 0.7293661236898226 + }, + { + "epoch": 0.4857627757872902, + "grad_norm": 1.665968418121338, + "learning_rate": 8.110056391894005e-06, + "loss": 1.074292540550232, + "step": 322, + "token_acc": 0.7202264877945633 + }, + { + "epoch": 0.4872713558363191, + "grad_norm": 1.9305287599563599, + "learning_rate": 8.097092604340543e-06, + "loss": 1.0579659938812256, + "step": 323, + "token_acc": 0.7181328988941548 + }, + { + "epoch": 0.4887799358853479, + "grad_norm": 2.13419508934021, + "learning_rate": 8.084094947478556e-06, + "loss": 1.1063854694366455, + "step": 324, + "token_acc": 0.7122429110380918 + }, + { + "epoch": 0.4902885159343768, + "grad_norm": 2.105365037918091, + "learning_rate": 8.071063563448341e-06, + "loss": 0.9605776071548462, + "step": 325, + "token_acc": 0.7444644856325413 + }, + { + "epoch": 0.4917970959834056, + "grad_norm": 2.0500199794769287, + "learning_rate": 8.057998594759022e-06, + "loss": 1.0895134210586548, + "step": 326, + "token_acc": 0.7141750047359086 + }, + { + "epoch": 0.49330567603243447, + "grad_norm": 2.253749132156372, + "learning_rate": 8.044900184287007e-06, + "loss": 1.04857337474823, + "step": 327, + "token_acc": 0.7224411252634473 + }, + { + "epoch": 0.49481425608146334, + "grad_norm": 1.7167673110961914, + "learning_rate": 8.031768475274412e-06, + "loss": 1.0455167293548584, + "step": 328, + "token_acc": 0.7175676867448628 + }, + { + "epoch": 0.49632283613049216, + "grad_norm": 1.5565273761749268, + "learning_rate": 8.018603611327505e-06, + "loss": 1.0515671968460083, + "step": 329, + "token_acc": 0.7212167221656688 + }, + { + "epoch": 0.49783141617952104, + "grad_norm": 1.6809618473052979, + "learning_rate": 8.005405736415127e-06, + "loss": 1.0210798978805542, + "step": 330, + "token_acc": 0.7290781519337687 + }, + { + "epoch": 0.49933999622854985, + "grad_norm": 2.041860342025757, + "learning_rate": 7.992174994867124e-06, + "loss": 1.0171223878860474, + "step": 331, + "token_acc": 0.7346058677978085 + }, + { + "epoch": 0.5008485762775787, + "grad_norm": 1.981676459312439, + "learning_rate": 7.978911531372764e-06, + "loss": 1.100087285041809, + "step": 332, + "token_acc": 0.7118870204410432 + }, + { + "epoch": 0.5023571563266076, + "grad_norm": 1.6547303199768066, + "learning_rate": 7.965615490979165e-06, + "loss": 1.0905532836914062, + "step": 333, + "token_acc": 0.7101524730383042 + }, + { + "epoch": 0.5038657363756365, + "grad_norm": 1.8345566987991333, + "learning_rate": 7.952287019089686e-06, + "loss": 1.1057326793670654, + "step": 334, + "token_acc": 0.7049101376940085 + }, + { + "epoch": 0.5053743164246652, + "grad_norm": 1.581992506980896, + "learning_rate": 7.938926261462366e-06, + "loss": 1.0629876852035522, + "step": 335, + "token_acc": 0.7200724869416907 + }, + { + "epoch": 0.5068828964736941, + "grad_norm": 1.456152081489563, + "learning_rate": 7.925533364208308e-06, + "loss": 0.9903478026390076, + "step": 336, + "token_acc": 0.7397175322077298 + }, + { + "epoch": 0.508391476522723, + "grad_norm": 2.0703678131103516, + "learning_rate": 7.912108473790092e-06, + "loss": 1.0896110534667969, + "step": 337, + "token_acc": 0.7089397089397089 + }, + { + "epoch": 0.5099000565717519, + "grad_norm": 1.6947776079177856, + "learning_rate": 7.898651737020166e-06, + "loss": 1.0237860679626465, + "step": 338, + "token_acc": 0.7321725650332729 + }, + { + "epoch": 0.5114086366207807, + "grad_norm": 2.0731279850006104, + "learning_rate": 7.885163301059251e-06, + "loss": 1.0439159870147705, + "step": 339, + "token_acc": 0.723366003595662 + }, + { + "epoch": 0.5129172166698095, + "grad_norm": 2.0650253295898438, + "learning_rate": 7.871643313414718e-06, + "loss": 1.1425635814666748, + "step": 340, + "token_acc": 0.7025939559969181 + }, + { + "epoch": 0.5144257967188384, + "grad_norm": 1.8568668365478516, + "learning_rate": 7.858091921938989e-06, + "loss": 1.0421661138534546, + "step": 341, + "token_acc": 0.7256970428485214 + }, + { + "epoch": 0.5159343767678672, + "grad_norm": 1.8513833284378052, + "learning_rate": 7.844509274827907e-06, + "loss": 1.0383763313293457, + "step": 342, + "token_acc": 0.7240441140747219 + }, + { + "epoch": 0.5174429568168961, + "grad_norm": 2.0846774578094482, + "learning_rate": 7.830895520619129e-06, + "loss": 1.0690534114837646, + "step": 343, + "token_acc": 0.7158270985502742 + }, + { + "epoch": 0.518951536865925, + "grad_norm": 1.7119104862213135, + "learning_rate": 7.817250808190483e-06, + "loss": 1.1205004453659058, + "step": 344, + "token_acc": 0.7055546566191258 + }, + { + "epoch": 0.5204601169149538, + "grad_norm": 1.9147571325302124, + "learning_rate": 7.803575286758365e-06, + "loss": 1.1148525476455688, + "step": 345, + "token_acc": 0.7111058129957952 + }, + { + "epoch": 0.5219686969639826, + "grad_norm": 1.5817005634307861, + "learning_rate": 7.789869105876083e-06, + "loss": 1.1351449489593506, + "step": 346, + "token_acc": 0.7065078859327705 + }, + { + "epoch": 0.5234772770130115, + "grad_norm": 1.7524263858795166, + "learning_rate": 7.776132415432234e-06, + "loss": 1.0746632814407349, + "step": 347, + "token_acc": 0.7173098024871982 + }, + { + "epoch": 0.5249858570620404, + "grad_norm": 1.8386244773864746, + "learning_rate": 7.762365365649068e-06, + "loss": 1.0769001245498657, + "step": 348, + "token_acc": 0.7198564772132658 + }, + { + "epoch": 0.5264944371110692, + "grad_norm": 2.105241060256958, + "learning_rate": 7.748568107080831e-06, + "loss": 1.0313432216644287, + "step": 349, + "token_acc": 0.7229176118924978 + }, + { + "epoch": 0.528003017160098, + "grad_norm": 1.6372648477554321, + "learning_rate": 7.734740790612137e-06, + "loss": 1.0383625030517578, + "step": 350, + "token_acc": 0.7225170386556371 + }, + { + "epoch": 0.5295115972091269, + "grad_norm": 1.8087550401687622, + "learning_rate": 7.720883567456299e-06, + "loss": 1.0589823722839355, + "step": 351, + "token_acc": 0.7251922335612409 + }, + { + "epoch": 0.5310201772581558, + "grad_norm": 1.5290846824645996, + "learning_rate": 7.70699658915369e-06, + "loss": 1.0353952646255493, + "step": 352, + "token_acc": 0.7266627012156586 + }, + { + "epoch": 0.5325287573071846, + "grad_norm": 1.92183518409729, + "learning_rate": 7.693080007570084e-06, + "loss": 1.169959306716919, + "step": 353, + "token_acc": 0.6952417157701324 + }, + { + "epoch": 0.5340373373562135, + "grad_norm": 1.94788658618927, + "learning_rate": 7.679133974894984e-06, + "loss": 1.054610013961792, + "step": 354, + "token_acc": 0.7217220151597639 + }, + { + "epoch": 0.5355459174052423, + "grad_norm": 1.947944164276123, + "learning_rate": 7.66515864363997e-06, + "loss": 1.027885913848877, + "step": 355, + "token_acc": 0.730953040587339 + }, + { + "epoch": 0.5370544974542711, + "grad_norm": 1.8873100280761719, + "learning_rate": 7.651154166637025e-06, + "loss": 0.9860036373138428, + "step": 356, + "token_acc": 0.7405523473265312 + }, + { + "epoch": 0.5385630775033, + "grad_norm": 1.9255895614624023, + "learning_rate": 7.637120697036866e-06, + "loss": 1.1041179895401, + "step": 357, + "token_acc": 0.7101939851581826 + }, + { + "epoch": 0.5400716575523289, + "grad_norm": 1.5525470972061157, + "learning_rate": 7.62305838830727e-06, + "loss": 1.0902361869812012, + "step": 358, + "token_acc": 0.7105808722863289 + }, + { + "epoch": 0.5415802376013578, + "grad_norm": 1.708338975906372, + "learning_rate": 7.608967394231387e-06, + "loss": 1.0408700704574585, + "step": 359, + "token_acc": 0.7229779807301504 + }, + { + "epoch": 0.5430888176503865, + "grad_norm": 1.5198639631271362, + "learning_rate": 7.594847868906076e-06, + "loss": 1.0225660800933838, + "step": 360, + "token_acc": 0.7230989325409927 + }, + { + "epoch": 0.5445973976994154, + "grad_norm": 2.232391119003296, + "learning_rate": 7.580699966740201e-06, + "loss": 1.0441985130310059, + "step": 361, + "token_acc": 0.7276262881501175 + }, + { + "epoch": 0.5461059777484443, + "grad_norm": 1.7873284816741943, + "learning_rate": 7.566523842452958e-06, + "loss": 0.987572431564331, + "step": 362, + "token_acc": 0.7402627691829278 + }, + { + "epoch": 0.5476145577974731, + "grad_norm": 1.6346279382705688, + "learning_rate": 7.552319651072164e-06, + "loss": 1.0211169719696045, + "step": 363, + "token_acc": 0.7241921585523481 + }, + { + "epoch": 0.549123137846502, + "grad_norm": 1.6549104452133179, + "learning_rate": 7.5380875479325855e-06, + "loss": 1.0593831539154053, + "step": 364, + "token_acc": 0.7166937195654888 + }, + { + "epoch": 0.5506317178955308, + "grad_norm": 1.6476542949676514, + "learning_rate": 7.52382768867422e-06, + "loss": 1.0829533338546753, + "step": 365, + "token_acc": 0.7157911580487358 + }, + { + "epoch": 0.5521402979445597, + "grad_norm": 1.8034530878067017, + "learning_rate": 7.509540229240601e-06, + "loss": 0.9785500764846802, + "step": 366, + "token_acc": 0.7427162240905686 + }, + { + "epoch": 0.5536488779935885, + "grad_norm": 1.9358595609664917, + "learning_rate": 7.4952253258771036e-06, + "loss": 1.074288010597229, + "step": 367, + "token_acc": 0.7164285714285714 + }, + { + "epoch": 0.5551574580426174, + "grad_norm": 2.0328221321105957, + "learning_rate": 7.480883135129211e-06, + "loss": 1.1360138654708862, + "step": 368, + "token_acc": 0.7064842298318009 + }, + { + "epoch": 0.5566660380916463, + "grad_norm": 1.8718421459197998, + "learning_rate": 7.4665138138408255e-06, + "loss": 1.073230266571045, + "step": 369, + "token_acc": 0.7162921348314607 + }, + { + "epoch": 0.558174618140675, + "grad_norm": 1.7607126235961914, + "learning_rate": 7.452117519152542e-06, + "loss": 1.0579783916473389, + "step": 370, + "token_acc": 0.7214491726964529 + }, + { + "epoch": 0.5596831981897039, + "grad_norm": 2.2622265815734863, + "learning_rate": 7.437694408499932e-06, + "loss": 1.0581026077270508, + "step": 371, + "token_acc": 0.7174761377419184 + }, + { + "epoch": 0.5611917782387328, + "grad_norm": 1.9631376266479492, + "learning_rate": 7.4232446396118265e-06, + "loss": 1.0350099802017212, + "step": 372, + "token_acc": 0.7285010823798083 + }, + { + "epoch": 0.5627003582877617, + "grad_norm": 1.6819504499435425, + "learning_rate": 7.408768370508577e-06, + "loss": 1.1177228689193726, + "step": 373, + "token_acc": 0.7052088999404253 + }, + { + "epoch": 0.5642089383367905, + "grad_norm": 1.8726435899734497, + "learning_rate": 7.394265759500348e-06, + "loss": 1.0807349681854248, + "step": 374, + "token_acc": 0.7120237360829588 + }, + { + "epoch": 0.5657175183858193, + "grad_norm": 1.619909405708313, + "learning_rate": 7.379736965185369e-06, + "loss": 1.039980173110962, + "step": 375, + "token_acc": 0.7230322013655722 + }, + { + "epoch": 0.5672260984348482, + "grad_norm": 1.7848247289657593, + "learning_rate": 7.365182146448205e-06, + "loss": 1.056114673614502, + "step": 376, + "token_acc": 0.7217417941044794 + }, + { + "epoch": 0.568734678483877, + "grad_norm": 1.8457938432693481, + "learning_rate": 7.350601462458025e-06, + "loss": 1.074272632598877, + "step": 377, + "token_acc": 0.710047740646164 + }, + { + "epoch": 0.5702432585329059, + "grad_norm": 1.4955483675003052, + "learning_rate": 7.335995072666848e-06, + "loss": 1.0058588981628418, + "step": 378, + "token_acc": 0.7292362538540597 + }, + { + "epoch": 0.5717518385819348, + "grad_norm": 1.9584203958511353, + "learning_rate": 7.3213631368078196e-06, + "loss": 1.018998384475708, + "step": 379, + "token_acc": 0.7313575265152548 + }, + { + "epoch": 0.5732604186309636, + "grad_norm": 1.4288082122802734, + "learning_rate": 7.30670581489344e-06, + "loss": 0.9927615523338318, + "step": 380, + "token_acc": 0.7370691330198578 + }, + { + "epoch": 0.5747689986799924, + "grad_norm": 2.0152997970581055, + "learning_rate": 7.292023267213836e-06, + "loss": 1.0861599445343018, + "step": 381, + "token_acc": 0.7144196010175392 + }, + { + "epoch": 0.5762775787290213, + "grad_norm": 6.5980143547058105, + "learning_rate": 7.2773156543349965e-06, + "loss": 1.0500900745391846, + "step": 382, + "token_acc": 0.7250676459219173 + }, + { + "epoch": 0.5777861587780502, + "grad_norm": 2.300058126449585, + "learning_rate": 7.262583137097019e-06, + "loss": 1.086035966873169, + "step": 383, + "token_acc": 0.7165979921232954 + }, + { + "epoch": 0.579294738827079, + "grad_norm": 1.7684295177459717, + "learning_rate": 7.247825876612353e-06, + "loss": 1.0611170530319214, + "step": 384, + "token_acc": 0.7227742897149826 + }, + { + "epoch": 0.5808033188761078, + "grad_norm": 1.662962555885315, + "learning_rate": 7.233044034264034e-06, + "loss": 1.0630278587341309, + "step": 385, + "token_acc": 0.7200887488076971 + }, + { + "epoch": 0.5823118989251367, + "grad_norm": 1.626369595527649, + "learning_rate": 7.218237771703921e-06, + "loss": 1.0240843296051025, + "step": 386, + "token_acc": 0.7307370284429348 + }, + { + "epoch": 0.5838204789741656, + "grad_norm": 1.9818180799484253, + "learning_rate": 7.203407250850929e-06, + "loss": 1.068251371383667, + "step": 387, + "token_acc": 0.7217704493654858 + }, + { + "epoch": 0.5853290590231944, + "grad_norm": 1.4807775020599365, + "learning_rate": 7.18855263388926e-06, + "loss": 1.0617135763168335, + "step": 388, + "token_acc": 0.7223062589060616 + }, + { + "epoch": 0.5868376390722233, + "grad_norm": 1.6532444953918457, + "learning_rate": 7.173674083266624e-06, + "loss": 1.0930352210998535, + "step": 389, + "token_acc": 0.7110771701031937 + }, + { + "epoch": 0.5883462191212521, + "grad_norm": 2.3185484409332275, + "learning_rate": 7.158771761692464e-06, + "loss": 1.025978684425354, + "step": 390, + "token_acc": 0.7320617620345141 + }, + { + "epoch": 0.5898547991702809, + "grad_norm": 2.0221846103668213, + "learning_rate": 7.143845832136188e-06, + "loss": 0.9246326088905334, + "step": 391, + "token_acc": 0.7562167824029173 + }, + { + "epoch": 0.5913633792193098, + "grad_norm": 1.7317066192626953, + "learning_rate": 7.128896457825364e-06, + "loss": 1.0132015943527222, + "step": 392, + "token_acc": 0.731554253360521 + }, + { + "epoch": 0.5928719592683387, + "grad_norm": 1.6808310747146606, + "learning_rate": 7.113923802243957e-06, + "loss": 1.1298030614852905, + "step": 393, + "token_acc": 0.7069070532342551 + }, + { + "epoch": 0.5943805393173676, + "grad_norm": 1.922598958015442, + "learning_rate": 7.098928029130529e-06, + "loss": 1.053634762763977, + "step": 394, + "token_acc": 0.720421483489131 + }, + { + "epoch": 0.5958891193663963, + "grad_norm": 1.7541972398757935, + "learning_rate": 7.083909302476453e-06, + "loss": 1.0911366939544678, + "step": 395, + "token_acc": 0.7137909319899244 + }, + { + "epoch": 0.5973976994154252, + "grad_norm": 1.8168507814407349, + "learning_rate": 7.068867786524116e-06, + "loss": 1.06001615524292, + "step": 396, + "token_acc": 0.7205586342287716 + }, + { + "epoch": 0.5989062794644541, + "grad_norm": 1.6193066835403442, + "learning_rate": 7.053803645765128e-06, + "loss": 1.1115810871124268, + "step": 397, + "token_acc": 0.7092469018112488 + }, + { + "epoch": 0.600414859513483, + "grad_norm": 1.7091399431228638, + "learning_rate": 7.038717044938519e-06, + "loss": 1.0904370546340942, + "step": 398, + "token_acc": 0.716276872194661 + }, + { + "epoch": 0.6019234395625118, + "grad_norm": 1.9059756994247437, + "learning_rate": 7.023608149028936e-06, + "loss": 1.011240005493164, + "step": 399, + "token_acc": 0.7324724953003174 + }, + { + "epoch": 0.6034320196115406, + "grad_norm": 1.6240061521530151, + "learning_rate": 7.008477123264849e-06, + "loss": 1.0212568044662476, + "step": 400, + "token_acc": 0.7293236247316807 + }, + { + "epoch": 0.6049405996605695, + "grad_norm": 1.6936088800430298, + "learning_rate": 6.993324133116726e-06, + "loss": 1.1276183128356934, + "step": 401, + "token_acc": 0.699568047221625 + }, + { + "epoch": 0.6064491797095983, + "grad_norm": 1.7851431369781494, + "learning_rate": 6.978149344295242e-06, + "loss": 1.0876280069351196, + "step": 402, + "token_acc": 0.7145382894876984 + }, + { + "epoch": 0.6079577597586272, + "grad_norm": 1.770546555519104, + "learning_rate": 6.9629529227494575e-06, + "loss": 1.1071759462356567, + "step": 403, + "token_acc": 0.7012365871186776 + }, + { + "epoch": 0.6094663398076561, + "grad_norm": 1.8950902223587036, + "learning_rate": 6.9477350346650016e-06, + "loss": 1.0406562089920044, + "step": 404, + "token_acc": 0.7264676438124086 + }, + { + "epoch": 0.610974919856685, + "grad_norm": 1.9422787427902222, + "learning_rate": 6.932495846462262e-06, + "loss": 1.0936779975891113, + "step": 405, + "token_acc": 0.7044013705659378 + }, + { + "epoch": 0.6124834999057137, + "grad_norm": 1.9265121221542358, + "learning_rate": 6.9172355247945586e-06, + "loss": 1.0682458877563477, + "step": 406, + "token_acc": 0.7196719609675075 + }, + { + "epoch": 0.6139920799547426, + "grad_norm": 1.7908693552017212, + "learning_rate": 6.901954236546324e-06, + "loss": 1.0964593887329102, + "step": 407, + "token_acc": 0.7142297945981582 + }, + { + "epoch": 0.6155006600037715, + "grad_norm": 2.142164707183838, + "learning_rate": 6.88665214883128e-06, + "loss": 1.0631380081176758, + "step": 408, + "token_acc": 0.7231717177349652 + }, + { + "epoch": 0.6170092400528003, + "grad_norm": 2.0588438510894775, + "learning_rate": 6.871329428990602e-06, + "loss": 1.029168963432312, + "step": 409, + "token_acc": 0.7297169950231175 + }, + { + "epoch": 0.6185178201018292, + "grad_norm": 1.9685001373291016, + "learning_rate": 6.855986244591104e-06, + "loss": 1.0743292570114136, + "step": 410, + "token_acc": 0.7167094571345385 + }, + { + "epoch": 0.620026400150858, + "grad_norm": 1.6551837921142578, + "learning_rate": 6.840622763423391e-06, + "loss": 1.037000298500061, + "step": 411, + "token_acc": 0.7215511271831961 + }, + { + "epoch": 0.6215349801998868, + "grad_norm": 2.2364120483398438, + "learning_rate": 6.825239153500029e-06, + "loss": 1.0270576477050781, + "step": 412, + "token_acc": 0.7221798333137692 + }, + { + "epoch": 0.6230435602489157, + "grad_norm": 2.151427745819092, + "learning_rate": 6.809835583053716e-06, + "loss": 1.0138657093048096, + "step": 413, + "token_acc": 0.7283263627986106 + }, + { + "epoch": 0.6245521402979446, + "grad_norm": 1.8536280393600464, + "learning_rate": 6.794412220535426e-06, + "loss": 1.0561426877975464, + "step": 414, + "token_acc": 0.7185600547898296 + }, + { + "epoch": 0.6260607203469735, + "grad_norm": 2.162677049636841, + "learning_rate": 6.778969234612583e-06, + "loss": 1.028409481048584, + "step": 415, + "token_acc": 0.7287534664934883 + }, + { + "epoch": 0.6275693003960022, + "grad_norm": 1.7925492525100708, + "learning_rate": 6.763506794167207e-06, + "loss": 1.0484743118286133, + "step": 416, + "token_acc": 0.7263908778945661 + }, + { + "epoch": 0.6290778804450311, + "grad_norm": 2.982769250869751, + "learning_rate": 6.748025068294067e-06, + "loss": 1.0736019611358643, + "step": 417, + "token_acc": 0.7168967994253332 + }, + { + "epoch": 0.63058646049406, + "grad_norm": 1.8204153776168823, + "learning_rate": 6.732524226298841e-06, + "loss": 1.0563300848007202, + "step": 418, + "token_acc": 0.7212504869497468 + }, + { + "epoch": 0.6320950405430888, + "grad_norm": 1.9906964302062988, + "learning_rate": 6.717004437696249e-06, + "loss": 1.0816138982772827, + "step": 419, + "token_acc": 0.7109104623302867 + }, + { + "epoch": 0.6336036205921177, + "grad_norm": 1.6285972595214844, + "learning_rate": 6.701465872208216e-06, + "loss": 1.0677554607391357, + "step": 420, + "token_acc": 0.7177388656261896 + }, + { + "epoch": 0.6351122006411465, + "grad_norm": 2.058015823364258, + "learning_rate": 6.685908699762003e-06, + "loss": 1.0556941032409668, + "step": 421, + "token_acc": 0.7207183297947629 + }, + { + "epoch": 0.6366207806901754, + "grad_norm": 1.7505825757980347, + "learning_rate": 6.670333090488357e-06, + "loss": 1.0424883365631104, + "step": 422, + "token_acc": 0.7218502323039082 + }, + { + "epoch": 0.6381293607392042, + "grad_norm": 1.8318870067596436, + "learning_rate": 6.654739214719642e-06, + "loss": 1.0448384284973145, + "step": 423, + "token_acc": 0.7290303390838786 + }, + { + "epoch": 0.6396379407882331, + "grad_norm": 1.8287194967269897, + "learning_rate": 6.6391272429879886e-06, + "loss": 1.0797233581542969, + "step": 424, + "token_acc": 0.7160534799924142 + }, + { + "epoch": 0.641146520837262, + "grad_norm": 1.9645090103149414, + "learning_rate": 6.6234973460234184e-06, + "loss": 1.1176778078079224, + "step": 425, + "token_acc": 0.7072520656150808 + }, + { + "epoch": 0.6426551008862907, + "grad_norm": 1.8919235467910767, + "learning_rate": 6.607849694751978e-06, + "loss": 1.104222297668457, + "step": 426, + "token_acc": 0.7099882766705744 + }, + { + "epoch": 0.6441636809353196, + "grad_norm": 1.5803821086883545, + "learning_rate": 6.592184460293878e-06, + "loss": 1.0277819633483887, + "step": 427, + "token_acc": 0.7214622586170487 + }, + { + "epoch": 0.6456722609843485, + "grad_norm": 1.8017348051071167, + "learning_rate": 6.576501813961609e-06, + "loss": 1.0658228397369385, + "step": 428, + "token_acc": 0.72180607573866 + }, + { + "epoch": 0.6471808410333774, + "grad_norm": 1.7503068447113037, + "learning_rate": 6.560801927258081e-06, + "loss": 1.033654808998108, + "step": 429, + "token_acc": 0.7259663744790918 + }, + { + "epoch": 0.6486894210824062, + "grad_norm": 1.8259599208831787, + "learning_rate": 6.545084971874738e-06, + "loss": 1.0272496938705444, + "step": 430, + "token_acc": 0.7321432793474406 + }, + { + "epoch": 0.650198001131435, + "grad_norm": 2.4734461307525635, + "learning_rate": 6.529351119689687e-06, + "loss": 1.0510509014129639, + "step": 431, + "token_acc": 0.7230790834835116 + }, + { + "epoch": 0.6517065811804639, + "grad_norm": 1.8868094682693481, + "learning_rate": 6.513600542765816e-06, + "loss": 1.0341954231262207, + "step": 432, + "token_acc": 0.728 + }, + { + "epoch": 0.6532151612294927, + "grad_norm": 2.2582576274871826, + "learning_rate": 6.49783341334891e-06, + "loss": 1.0557807683944702, + "step": 433, + "token_acc": 0.7231695849958067 + }, + { + "epoch": 0.6547237412785216, + "grad_norm": 2.2365760803222656, + "learning_rate": 6.4820499038657695e-06, + "loss": 1.1121265888214111, + "step": 434, + "token_acc": 0.7109618668184405 + }, + { + "epoch": 0.6562323213275505, + "grad_norm": 1.8216484785079956, + "learning_rate": 6.466250186922325e-06, + "loss": 1.08517587184906, + "step": 435, + "token_acc": 0.7136493584203178 + }, + { + "epoch": 0.6577409013765793, + "grad_norm": 1.8552532196044922, + "learning_rate": 6.450434435301751e-06, + "loss": 1.085634708404541, + "step": 436, + "token_acc": 0.7142949144537707 + }, + { + "epoch": 0.6592494814256081, + "grad_norm": 1.720496654510498, + "learning_rate": 6.434602821962571e-06, + "loss": 1.0143402814865112, + "step": 437, + "token_acc": 0.7262405814359333 + }, + { + "epoch": 0.660758061474637, + "grad_norm": 1.95842707157135, + "learning_rate": 6.418755520036775e-06, + "loss": 1.0316746234893799, + "step": 438, + "token_acc": 0.7242511459332693 + }, + { + "epoch": 0.6622666415236659, + "grad_norm": 2.6778366565704346, + "learning_rate": 6.402892702827916e-06, + "loss": 1.0383585691452026, + "step": 439, + "token_acc": 0.7243439799259984 + }, + { + "epoch": 0.6637752215726948, + "grad_norm": 1.9620119333267212, + "learning_rate": 6.387014543809224e-06, + "loss": 1.0201284885406494, + "step": 440, + "token_acc": 0.7293385929672563 + }, + { + "epoch": 0.6652838016217235, + "grad_norm": 1.6087896823883057, + "learning_rate": 6.371121216621698e-06, + "loss": 1.0268536806106567, + "step": 441, + "token_acc": 0.7261447014024334 + }, + { + "epoch": 0.6667923816707524, + "grad_norm": 2.8445332050323486, + "learning_rate": 6.355212895072223e-06, + "loss": 0.9969682097434998, + "step": 442, + "token_acc": 0.7356999293823673 + }, + { + "epoch": 0.6683009617197813, + "grad_norm": 2.1567375659942627, + "learning_rate": 6.339289753131649e-06, + "loss": 1.0502561330795288, + "step": 443, + "token_acc": 0.7287027621557681 + }, + { + "epoch": 0.6698095417688101, + "grad_norm": 2.0766618251800537, + "learning_rate": 6.323351964932909e-06, + "loss": 1.0280673503875732, + "step": 444, + "token_acc": 0.7303461041455663 + }, + { + "epoch": 0.671318121817839, + "grad_norm": 1.8667423725128174, + "learning_rate": 6.3073997047691e-06, + "loss": 1.0537922382354736, + "step": 445, + "token_acc": 0.7255950409312449 + }, + { + "epoch": 0.6728267018668678, + "grad_norm": 1.8974792957305908, + "learning_rate": 6.291433147091583e-06, + "loss": 1.0473456382751465, + "step": 446, + "token_acc": 0.7273272472846005 + }, + { + "epoch": 0.6743352819158966, + "grad_norm": 2.9829742908477783, + "learning_rate": 6.275452466508076e-06, + "loss": 1.0842602252960205, + "step": 447, + "token_acc": 0.7148258233265962 + }, + { + "epoch": 0.6758438619649255, + "grad_norm": 1.8196378946304321, + "learning_rate": 6.259457837780741e-06, + "loss": 1.0181214809417725, + "step": 448, + "token_acc": 0.7321142857142857 + }, + { + "epoch": 0.6773524420139544, + "grad_norm": 2.0543696880340576, + "learning_rate": 6.243449435824276e-06, + "loss": 1.0639190673828125, + "step": 449, + "token_acc": 0.7219584204699225 + }, + { + "epoch": 0.6788610220629833, + "grad_norm": 1.865490436553955, + "learning_rate": 6.227427435703997e-06, + "loss": 1.0338490009307861, + "step": 450, + "token_acc": 0.7225590842688759 + }, + { + "epoch": 0.680369602112012, + "grad_norm": 2.0611252784729004, + "learning_rate": 6.211392012633932e-06, + "loss": 1.0903571844100952, + "step": 451, + "token_acc": 0.712118293035353 + }, + { + "epoch": 0.6818781821610409, + "grad_norm": 2.00286602973938, + "learning_rate": 6.1953433419748995e-06, + "loss": 1.111462116241455, + "step": 452, + "token_acc": 0.7146028809303555 + }, + { + "epoch": 0.6833867622100698, + "grad_norm": 1.6739310026168823, + "learning_rate": 6.179281599232592e-06, + "loss": 1.0183110237121582, + "step": 453, + "token_acc": 0.7253908881815858 + }, + { + "epoch": 0.6848953422590986, + "grad_norm": 1.8656288385391235, + "learning_rate": 6.163206960055652e-06, + "loss": 0.939572274684906, + "step": 454, + "token_acc": 0.7478663997471289 + }, + { + "epoch": 0.6864039223081275, + "grad_norm": 1.819954752922058, + "learning_rate": 6.147119600233758e-06, + "loss": 1.0358134508132935, + "step": 455, + "token_acc": 0.7264064680687232 + }, + { + "epoch": 0.6879125023571563, + "grad_norm": 2.6484479904174805, + "learning_rate": 6.131019695695702e-06, + "loss": 0.9884731769561768, + "step": 456, + "token_acc": 0.7403164822853942 + }, + { + "epoch": 0.6894210824061852, + "grad_norm": 2.083184003829956, + "learning_rate": 6.114907422507459e-06, + "loss": 0.9496172666549683, + "step": 457, + "token_acc": 0.7432501273560876 + }, + { + "epoch": 0.690929662455214, + "grad_norm": 1.6787505149841309, + "learning_rate": 6.098782956870266e-06, + "loss": 1.0099568367004395, + "step": 458, + "token_acc": 0.731698991466253 + }, + { + "epoch": 0.6924382425042429, + "grad_norm": 2.4523606300354004, + "learning_rate": 6.0826464751187e-06, + "loss": 0.9122902154922485, + "step": 459, + "token_acc": 0.7536777441629524 + }, + { + "epoch": 0.6939468225532718, + "grad_norm": 2.0770370960235596, + "learning_rate": 6.066498153718735e-06, + "loss": 1.0828238725662231, + "step": 460, + "token_acc": 0.7126667513657731 + }, + { + "epoch": 0.6954554026023005, + "grad_norm": 1.7312464714050293, + "learning_rate": 6.0503381692658305e-06, + "loss": 1.102535367012024, + "step": 461, + "token_acc": 0.7056192660550459 + }, + { + "epoch": 0.6969639826513294, + "grad_norm": 1.973297357559204, + "learning_rate": 6.034166698482984e-06, + "loss": 1.049569010734558, + "step": 462, + "token_acc": 0.7192105052017865 + }, + { + "epoch": 0.6984725627003583, + "grad_norm": 2.7333481311798096, + "learning_rate": 6.0179839182188125e-06, + "loss": 1.0108845233917236, + "step": 463, + "token_acc": 0.7290674718383463 + }, + { + "epoch": 0.6999811427493872, + "grad_norm": 1.6706620454788208, + "learning_rate": 6.001790005445607e-06, + "loss": 1.0939772129058838, + "step": 464, + "token_acc": 0.711604464920689 + }, + { + "epoch": 0.701489722798416, + "grad_norm": 1.8926599025726318, + "learning_rate": 5.985585137257401e-06, + "loss": 1.0778906345367432, + "step": 465, + "token_acc": 0.7161705641680117 + }, + { + "epoch": 0.7029983028474448, + "grad_norm": 3.985429286956787, + "learning_rate": 5.969369490868042e-06, + "loss": 0.9847612380981445, + "step": 466, + "token_acc": 0.7397007305216848 + }, + { + "epoch": 0.7045068828964737, + "grad_norm": 2.065019130706787, + "learning_rate": 5.953143243609235e-06, + "loss": 1.0943577289581299, + "step": 467, + "token_acc": 0.7063771273461078 + }, + { + "epoch": 0.7060154629455025, + "grad_norm": 1.6235759258270264, + "learning_rate": 5.936906572928625e-06, + "loss": 1.0321025848388672, + "step": 468, + "token_acc": 0.729424838362069 + }, + { + "epoch": 0.7075240429945314, + "grad_norm": 2.009197473526001, + "learning_rate": 5.920659656387836e-06, + "loss": 1.0952529907226562, + "step": 469, + "token_acc": 0.7133090567367556 + }, + { + "epoch": 0.7090326230435603, + "grad_norm": 2.5111048221588135, + "learning_rate": 5.904402671660551e-06, + "loss": 1.0776491165161133, + "step": 470, + "token_acc": 0.7194365087183529 + }, + { + "epoch": 0.7105412030925891, + "grad_norm": 1.723303198814392, + "learning_rate": 5.8881357965305444e-06, + "loss": 1.0253169536590576, + "step": 471, + "token_acc": 0.726647341323065 + }, + { + "epoch": 0.7120497831416179, + "grad_norm": 1.6365454196929932, + "learning_rate": 5.871859208889759e-06, + "loss": 1.0463616847991943, + "step": 472, + "token_acc": 0.7203148239960337 + }, + { + "epoch": 0.7135583631906468, + "grad_norm": 1.8090174198150635, + "learning_rate": 5.855573086736351e-06, + "loss": 1.0304409265518188, + "step": 473, + "token_acc": 0.7265488969655989 + }, + { + "epoch": 0.7150669432396757, + "grad_norm": 1.7898191213607788, + "learning_rate": 5.839277608172739e-06, + "loss": 1.0160740613937378, + "step": 474, + "token_acc": 0.7277277517003544 + }, + { + "epoch": 0.7165755232887046, + "grad_norm": 2.0119030475616455, + "learning_rate": 5.82297295140367e-06, + "loss": 1.049438238143921, + "step": 475, + "token_acc": 0.7220689111408972 + }, + { + "epoch": 0.7180841033377333, + "grad_norm": 2.078810930252075, + "learning_rate": 5.806659294734256e-06, + "loss": 1.052063226699829, + "step": 476, + "token_acc": 0.7209957060682168 + }, + { + "epoch": 0.7195926833867622, + "grad_norm": 1.58182692527771, + "learning_rate": 5.790336816568033e-06, + "loss": 0.9491676092147827, + "step": 477, + "token_acc": 0.7439329089447498 + }, + { + "epoch": 0.7211012634357911, + "grad_norm": 2.3952503204345703, + "learning_rate": 5.774005695405008e-06, + "loss": 1.0512211322784424, + "step": 478, + "token_acc": 0.7175498884668716 + }, + { + "epoch": 0.7226098434848199, + "grad_norm": 1.636983036994934, + "learning_rate": 5.7576661098397024e-06, + "loss": 1.0860540866851807, + "step": 479, + "token_acc": 0.7119085312225154 + }, + { + "epoch": 0.7241184235338488, + "grad_norm": 1.9575828313827515, + "learning_rate": 5.74131823855921e-06, + "loss": 1.0170823335647583, + "step": 480, + "token_acc": 0.7272457549753515 + }, + { + "epoch": 0.7256270035828776, + "grad_norm": 2.7882261276245117, + "learning_rate": 5.72496226034123e-06, + "loss": 1.0176913738250732, + "step": 481, + "token_acc": 0.726940077866573 + }, + { + "epoch": 0.7271355836319064, + "grad_norm": 1.8567211627960205, + "learning_rate": 5.708598354052122e-06, + "loss": 0.9691476821899414, + "step": 482, + "token_acc": 0.741912022196482 + }, + { + "epoch": 0.7286441636809353, + "grad_norm": 1.792314052581787, + "learning_rate": 5.692226698644938e-06, + "loss": 1.0314260721206665, + "step": 483, + "token_acc": 0.7282526893620789 + }, + { + "epoch": 0.7301527437299642, + "grad_norm": 1.7519475221633911, + "learning_rate": 5.675847473157485e-06, + "loss": 1.0447940826416016, + "step": 484, + "token_acc": 0.7197125465686773 + }, + { + "epoch": 0.7316613237789931, + "grad_norm": 2.195399284362793, + "learning_rate": 5.659460856710346e-06, + "loss": 1.093864917755127, + "step": 485, + "token_acc": 0.7139177290051255 + }, + { + "epoch": 0.7331699038280218, + "grad_norm": 1.7832348346710205, + "learning_rate": 5.643067028504931e-06, + "loss": 0.9985391497612, + "step": 486, + "token_acc": 0.7355338767873847 + }, + { + "epoch": 0.7346784838770507, + "grad_norm": 1.692111849784851, + "learning_rate": 5.626666167821522e-06, + "loss": 1.010223150253296, + "step": 487, + "token_acc": 0.7278096594479119 + }, + { + "epoch": 0.7361870639260796, + "grad_norm": 1.5468913316726685, + "learning_rate": 5.610258454017301e-06, + "loss": 1.02261483669281, + "step": 488, + "token_acc": 0.7296336541446755 + }, + { + "epoch": 0.7376956439751085, + "grad_norm": 1.623683214187622, + "learning_rate": 5.593844066524401e-06, + "loss": 1.0238165855407715, + "step": 489, + "token_acc": 0.7242505764796311 + }, + { + "epoch": 0.7392042240241373, + "grad_norm": 1.587205410003662, + "learning_rate": 5.577423184847932e-06, + "loss": 1.0333800315856934, + "step": 490, + "token_acc": 0.7269147252585477 + }, + { + "epoch": 0.7407128040731661, + "grad_norm": 2.26971697807312, + "learning_rate": 5.560995988564023e-06, + "loss": 1.0046448707580566, + "step": 491, + "token_acc": 0.7322918991296585 + }, + { + "epoch": 0.742221384122195, + "grad_norm": 1.9151287078857422, + "learning_rate": 5.544562657317863e-06, + "loss": 1.0457384586334229, + "step": 492, + "token_acc": 0.7234116321400711 + }, + { + "epoch": 0.7437299641712238, + "grad_norm": 1.7963476181030273, + "learning_rate": 5.52812337082173e-06, + "loss": 1.053754210472107, + "step": 493, + "token_acc": 0.7177801343230824 + }, + { + "epoch": 0.7452385442202527, + "grad_norm": 1.9358562231063843, + "learning_rate": 5.5116783088530255e-06, + "loss": 1.1072862148284912, + "step": 494, + "token_acc": 0.7078415046284958 + }, + { + "epoch": 0.7467471242692816, + "grad_norm": 1.6318477392196655, + "learning_rate": 5.495227651252315e-06, + "loss": 1.0753233432769775, + "step": 495, + "token_acc": 0.7121745249824067 + }, + { + "epoch": 0.7482557043183103, + "grad_norm": 1.6419304609298706, + "learning_rate": 5.478771577921351e-06, + "loss": 1.0624401569366455, + "step": 496, + "token_acc": 0.7226446280991735 + }, + { + "epoch": 0.7497642843673392, + "grad_norm": 1.6563493013381958, + "learning_rate": 5.4623102688211186e-06, + "loss": 1.0426563024520874, + "step": 497, + "token_acc": 0.7230899830220713 + }, + { + "epoch": 0.7512728644163681, + "grad_norm": 1.8772647380828857, + "learning_rate": 5.445843903969854e-06, + "loss": 1.081175446510315, + "step": 498, + "token_acc": 0.7123568535280292 + }, + { + "epoch": 0.752781444465397, + "grad_norm": 1.815774917602539, + "learning_rate": 5.429372663441086e-06, + "loss": 1.0234770774841309, + "step": 499, + "token_acc": 0.7283280498095651 + }, + { + "epoch": 0.7542900245144258, + "grad_norm": 1.9452921152114868, + "learning_rate": 5.412896727361663e-06, + "loss": 1.014378547668457, + "step": 500, + "token_acc": 0.7337648680859019 + }, + { + "epoch": 0.7557986045634546, + "grad_norm": 1.517088532447815, + "learning_rate": 5.396416275909779e-06, + "loss": 1.065323829650879, + "step": 501, + "token_acc": 0.7263767238606913 + }, + { + "epoch": 0.7573071846124835, + "grad_norm": 1.8350716829299927, + "learning_rate": 5.379931489313016e-06, + "loss": 1.0177841186523438, + "step": 502, + "token_acc": 0.7265169109467398 + }, + { + "epoch": 0.7588157646615123, + "grad_norm": 1.8073683977127075, + "learning_rate": 5.363442547846356e-06, + "loss": 1.0423173904418945, + "step": 503, + "token_acc": 0.7261943919085602 + }, + { + "epoch": 0.7603243447105412, + "grad_norm": 1.994448184967041, + "learning_rate": 5.346949631830221e-06, + "loss": 1.0371301174163818, + "step": 504, + "token_acc": 0.7211158612479031 + }, + { + "epoch": 0.7618329247595701, + "grad_norm": 1.9085524082183838, + "learning_rate": 5.3304529216284974e-06, + "loss": 0.9996076226234436, + "step": 505, + "token_acc": 0.7319218058589593 + }, + { + "epoch": 0.7633415048085989, + "grad_norm": 1.4438563585281372, + "learning_rate": 5.3139525976465675e-06, + "loss": 0.9833379983901978, + "step": 506, + "token_acc": 0.7333137636223535 + }, + { + "epoch": 0.7648500848576277, + "grad_norm": 1.5760232210159302, + "learning_rate": 5.2974488403293285e-06, + "loss": 1.0267274379730225, + "step": 507, + "token_acc": 0.727858739538027 + }, + { + "epoch": 0.7663586649066566, + "grad_norm": 2.7281668186187744, + "learning_rate": 5.280941830159228e-06, + "loss": 1.0271592140197754, + "step": 508, + "token_acc": 0.7318856514568444 + }, + { + "epoch": 0.7678672449556855, + "grad_norm": 1.8877264261245728, + "learning_rate": 5.264431747654284e-06, + "loss": 1.0254554748535156, + "step": 509, + "token_acc": 0.726330320631365 + }, + { + "epoch": 0.7693758250047144, + "grad_norm": 2.155935287475586, + "learning_rate": 5.247918773366112e-06, + "loss": 1.0466581583023071, + "step": 510, + "token_acc": 0.7232285649009432 + }, + { + "epoch": 0.7708844050537431, + "grad_norm": 2.094679117202759, + "learning_rate": 5.231403087877955e-06, + "loss": 1.0157034397125244, + "step": 511, + "token_acc": 0.7279374169137949 + }, + { + "epoch": 0.772392985102772, + "grad_norm": 1.887584924697876, + "learning_rate": 5.214884871802703e-06, + "loss": 1.0657141208648682, + "step": 512, + "token_acc": 0.7225426247261189 + }, + { + "epoch": 0.7739015651518009, + "grad_norm": 1.650427222251892, + "learning_rate": 5.198364305780922e-06, + "loss": 1.1210763454437256, + "step": 513, + "token_acc": 0.7044955119371299 + }, + { + "epoch": 0.7754101452008297, + "grad_norm": 2.0321574211120605, + "learning_rate": 5.1818415704788725e-06, + "loss": 1.074467420578003, + "step": 514, + "token_acc": 0.7189865087199737 + }, + { + "epoch": 0.7769187252498586, + "grad_norm": 1.8543435335159302, + "learning_rate": 5.165316846586541e-06, + "loss": 1.026384711265564, + "step": 515, + "token_acc": 0.7309894695977095 + }, + { + "epoch": 0.7784273052988874, + "grad_norm": 1.7272353172302246, + "learning_rate": 5.148790314815662e-06, + "loss": 1.0584019422531128, + "step": 516, + "token_acc": 0.7208280790248003 + }, + { + "epoch": 0.7799358853479162, + "grad_norm": 2.133244752883911, + "learning_rate": 5.132262155897739e-06, + "loss": 1.018196702003479, + "step": 517, + "token_acc": 0.7271030602442706 + }, + { + "epoch": 0.7814444653969451, + "grad_norm": 1.940161943435669, + "learning_rate": 5.11573255058207e-06, + "loss": 1.0759893655776978, + "step": 518, + "token_acc": 0.7150993727620101 + }, + { + "epoch": 0.782953045445974, + "grad_norm": 2.513899326324463, + "learning_rate": 5.099201679633769e-06, + "loss": 1.0492745637893677, + "step": 519, + "token_acc": 0.7199188109182401 + }, + { + "epoch": 0.7844616254950029, + "grad_norm": 1.8655506372451782, + "learning_rate": 5.082669723831793e-06, + "loss": 1.0886518955230713, + "step": 520, + "token_acc": 0.715925784863159 + }, + { + "epoch": 0.7859702055440316, + "grad_norm": 2.029088258743286, + "learning_rate": 5.066136863966963e-06, + "loss": 1.050947904586792, + "step": 521, + "token_acc": 0.7213414332987603 + }, + { + "epoch": 0.7874787855930605, + "grad_norm": 2.1910438537597656, + "learning_rate": 5.049603280839982e-06, + "loss": 0.9835829734802246, + "step": 522, + "token_acc": 0.7347484579799537 + }, + { + "epoch": 0.7889873656420894, + "grad_norm": 1.846093773841858, + "learning_rate": 5.033069155259471e-06, + "loss": 1.012216567993164, + "step": 523, + "token_acc": 0.7279435804025968 + }, + { + "epoch": 0.7904959456911183, + "grad_norm": 1.7225626707077026, + "learning_rate": 5.016534668039976e-06, + "loss": 1.0690052509307861, + "step": 524, + "token_acc": 0.7186567018557184 + }, + { + "epoch": 0.7920045257401471, + "grad_norm": 1.7933125495910645, + "learning_rate": 5e-06, + "loss": 0.9993718862533569, + "step": 525, + "token_acc": 0.7347105693630943 + }, + { + "epoch": 0.7935131057891759, + "grad_norm": 1.7874919176101685, + "learning_rate": 4.983465331960025e-06, + "loss": 1.0533263683319092, + "step": 526, + "token_acc": 0.7158531806504985 + }, + { + "epoch": 0.7950216858382048, + "grad_norm": 2.206632137298584, + "learning_rate": 4.96693084474053e-06, + "loss": 1.0019879341125488, + "step": 527, + "token_acc": 0.7344903559970624 + }, + { + "epoch": 0.7965302658872336, + "grad_norm": 1.898016333580017, + "learning_rate": 4.950396719160019e-06, + "loss": 1.0321991443634033, + "step": 528, + "token_acc": 0.7275534841933854 + }, + { + "epoch": 0.7980388459362625, + "grad_norm": 1.6039094924926758, + "learning_rate": 4.93386313603304e-06, + "loss": 1.0233955383300781, + "step": 529, + "token_acc": 0.7300314465408805 + }, + { + "epoch": 0.7995474259852914, + "grad_norm": 1.6252700090408325, + "learning_rate": 4.917330276168208e-06, + "loss": 0.987075686454773, + "step": 530, + "token_acc": 0.7353460217461432 + }, + { + "epoch": 0.8010560060343201, + "grad_norm": 1.844788908958435, + "learning_rate": 4.900798320366233e-06, + "loss": 0.9963685870170593, + "step": 531, + "token_acc": 0.7404075546719682 + }, + { + "epoch": 0.802564586083349, + "grad_norm": 2.0242061614990234, + "learning_rate": 4.884267449417932e-06, + "loss": 0.9329904913902283, + "step": 532, + "token_acc": 0.7527779360647331 + }, + { + "epoch": 0.8040731661323779, + "grad_norm": 1.9341564178466797, + "learning_rate": 4.867737844102261e-06, + "loss": 1.052427887916565, + "step": 533, + "token_acc": 0.7182104752667313 + }, + { + "epoch": 0.8055817461814068, + "grad_norm": 1.6331071853637695, + "learning_rate": 4.851209685184339e-06, + "loss": 1.0778707265853882, + "step": 534, + "token_acc": 0.7113597313339581 + }, + { + "epoch": 0.8070903262304356, + "grad_norm": 1.872962474822998, + "learning_rate": 4.8346831534134595e-06, + "loss": 1.015707015991211, + "step": 535, + "token_acc": 0.7287764169068204 + }, + { + "epoch": 0.8085989062794644, + "grad_norm": 1.601584792137146, + "learning_rate": 4.818158429521129e-06, + "loss": 1.0452444553375244, + "step": 536, + "token_acc": 0.722498751313274 + }, + { + "epoch": 0.8101074863284933, + "grad_norm": 1.8661402463912964, + "learning_rate": 4.801635694219079e-06, + "loss": 1.0527936220169067, + "step": 537, + "token_acc": 0.7248286367098248 + }, + { + "epoch": 0.8116160663775221, + "grad_norm": 2.9478535652160645, + "learning_rate": 4.785115128197298e-06, + "loss": 1.0789413452148438, + "step": 538, + "token_acc": 0.708814349640883 + }, + { + "epoch": 0.813124646426551, + "grad_norm": 2.1882660388946533, + "learning_rate": 4.768596912122046e-06, + "loss": 1.0256974697113037, + "step": 539, + "token_acc": 0.7265846502798254 + }, + { + "epoch": 0.8146332264755799, + "grad_norm": 1.4274197816848755, + "learning_rate": 4.752081226633888e-06, + "loss": 1.0401779413223267, + "step": 540, + "token_acc": 0.719770446689264 + }, + { + "epoch": 0.8161418065246087, + "grad_norm": 1.5515495538711548, + "learning_rate": 4.735568252345718e-06, + "loss": 0.9992239475250244, + "step": 541, + "token_acc": 0.724567720492225 + }, + { + "epoch": 0.8176503865736375, + "grad_norm": 1.996112585067749, + "learning_rate": 4.719058169840773e-06, + "loss": 1.0366744995117188, + "step": 542, + "token_acc": 0.7269926364112518 + }, + { + "epoch": 0.8191589666226664, + "grad_norm": 1.708045482635498, + "learning_rate": 4.702551159670672e-06, + "loss": 1.0585676431655884, + "step": 543, + "token_acc": 0.7148242704779175 + }, + { + "epoch": 0.8206675466716953, + "grad_norm": 1.833902359008789, + "learning_rate": 4.686047402353433e-06, + "loss": 1.0295124053955078, + "step": 544, + "token_acc": 0.7274400430387632 + }, + { + "epoch": 0.8221761267207242, + "grad_norm": 1.590248465538025, + "learning_rate": 4.669547078371503e-06, + "loss": 1.0483739376068115, + "step": 545, + "token_acc": 0.7230941932850291 + }, + { + "epoch": 0.8236847067697529, + "grad_norm": 1.8889268636703491, + "learning_rate": 4.65305036816978e-06, + "loss": 1.046649694442749, + "step": 546, + "token_acc": 0.7200257842715944 + }, + { + "epoch": 0.8251932868187818, + "grad_norm": 1.9480907917022705, + "learning_rate": 4.636557452153645e-06, + "loss": 1.114920973777771, + "step": 547, + "token_acc": 0.7108572868330989 + }, + { + "epoch": 0.8267018668678107, + "grad_norm": 1.826660394668579, + "learning_rate": 4.620068510686985e-06, + "loss": 1.005754828453064, + "step": 548, + "token_acc": 0.7264181604540113 + }, + { + "epoch": 0.8282104469168395, + "grad_norm": 1.766832709312439, + "learning_rate": 4.60358372409022e-06, + "loss": 1.0189857482910156, + "step": 549, + "token_acc": 0.7311113283159026 + }, + { + "epoch": 0.8297190269658684, + "grad_norm": 2.219547748565674, + "learning_rate": 4.587103272638339e-06, + "loss": 0.9962954521179199, + "step": 550, + "token_acc": 0.7340694965049134 + }, + { + "epoch": 0.8312276070148972, + "grad_norm": 1.642620325088501, + "learning_rate": 4.570627336558915e-06, + "loss": 1.0292123556137085, + "step": 551, + "token_acc": 0.7267908141309025 + }, + { + "epoch": 0.832736187063926, + "grad_norm": 1.474797010421753, + "learning_rate": 4.554156096030149e-06, + "loss": 0.9796549081802368, + "step": 552, + "token_acc": 0.7338556413140441 + }, + { + "epoch": 0.8342447671129549, + "grad_norm": 1.7124395370483398, + "learning_rate": 4.537689731178883e-06, + "loss": 1.061699390411377, + "step": 553, + "token_acc": 0.7237870703561236 + }, + { + "epoch": 0.8357533471619838, + "grad_norm": 1.8044353723526, + "learning_rate": 4.5212284220786495e-06, + "loss": 1.0605119466781616, + "step": 554, + "token_acc": 0.7199576852956402 + }, + { + "epoch": 0.8372619272110127, + "grad_norm": 1.618354082107544, + "learning_rate": 4.504772348747687e-06, + "loss": 1.0913270711898804, + "step": 555, + "token_acc": 0.7064050159345988 + }, + { + "epoch": 0.8387705072600414, + "grad_norm": 1.8473085165023804, + "learning_rate": 4.488321691146975e-06, + "loss": 1.0139802694320679, + "step": 556, + "token_acc": 0.7316685711100582 + }, + { + "epoch": 0.8402790873090703, + "grad_norm": 2.097944498062134, + "learning_rate": 4.471876629178273e-06, + "loss": 1.0312366485595703, + "step": 557, + "token_acc": 0.7258432526046678 + }, + { + "epoch": 0.8417876673580992, + "grad_norm": 2.0012364387512207, + "learning_rate": 4.4554373426821375e-06, + "loss": 0.9902812242507935, + "step": 558, + "token_acc": 0.7354179377669311 + }, + { + "epoch": 0.843296247407128, + "grad_norm": 1.7152091264724731, + "learning_rate": 4.439004011435979e-06, + "loss": 1.0574164390563965, + "step": 559, + "token_acc": 0.7194797904191617 + }, + { + "epoch": 0.8448048274561569, + "grad_norm": 1.7510721683502197, + "learning_rate": 4.42257681515207e-06, + "loss": 1.1168850660324097, + "step": 560, + "token_acc": 0.7032745696546855 + }, + { + "epoch": 0.8463134075051857, + "grad_norm": 1.9027713537216187, + "learning_rate": 4.406155933475599e-06, + "loss": 1.0321595668792725, + "step": 561, + "token_acc": 0.7281336017419169 + }, + { + "epoch": 0.8478219875542146, + "grad_norm": 1.7455378770828247, + "learning_rate": 4.3897415459827e-06, + "loss": 1.0598769187927246, + "step": 562, + "token_acc": 0.7193884325739325 + }, + { + "epoch": 0.8493305676032434, + "grad_norm": 2.026305913925171, + "learning_rate": 4.373333832178478e-06, + "loss": 1.0668879747390747, + "step": 563, + "token_acc": 0.717763040882181 + }, + { + "epoch": 0.8508391476522723, + "grad_norm": 1.9603196382522583, + "learning_rate": 4.356932971495071e-06, + "loss": 1.0637216567993164, + "step": 564, + "token_acc": 0.7143371109911855 + }, + { + "epoch": 0.8523477277013012, + "grad_norm": 2.15147066116333, + "learning_rate": 4.340539143289655e-06, + "loss": 0.9707341194152832, + "step": 565, + "token_acc": 0.7389071146899957 + }, + { + "epoch": 0.8538563077503301, + "grad_norm": 1.5453519821166992, + "learning_rate": 4.324152526842517e-06, + "loss": 1.0601102113723755, + "step": 566, + "token_acc": 0.7216300823612716 + }, + { + "epoch": 0.8553648877993588, + "grad_norm": 2.0775973796844482, + "learning_rate": 4.307773301355063e-06, + "loss": 1.0625584125518799, + "step": 567, + "token_acc": 0.7192691460412077 + }, + { + "epoch": 0.8568734678483877, + "grad_norm": 1.9119807481765747, + "learning_rate": 4.291401645947879e-06, + "loss": 1.0614428520202637, + "step": 568, + "token_acc": 0.7195538446352778 + }, + { + "epoch": 0.8583820478974166, + "grad_norm": 2.062467336654663, + "learning_rate": 4.275037739658771e-06, + "loss": 1.0344290733337402, + "step": 569, + "token_acc": 0.7280806203351436 + }, + { + "epoch": 0.8598906279464454, + "grad_norm": 2.1574866771698, + "learning_rate": 4.25868176144079e-06, + "loss": 1.0130680799484253, + "step": 570, + "token_acc": 0.7292886806516878 + }, + { + "epoch": 0.8613992079954743, + "grad_norm": 1.6127513647079468, + "learning_rate": 4.242333890160299e-06, + "loss": 1.040123462677002, + "step": 571, + "token_acc": 0.7231471931634006 + }, + { + "epoch": 0.8629077880445031, + "grad_norm": 1.7476307153701782, + "learning_rate": 4.225994304594994e-06, + "loss": 1.0430809259414673, + "step": 572, + "token_acc": 0.7207976617400137 + }, + { + "epoch": 0.864416368093532, + "grad_norm": 1.7726634740829468, + "learning_rate": 4.209663183431969e-06, + "loss": 1.1194026470184326, + "step": 573, + "token_acc": 0.7026123634627246 + }, + { + "epoch": 0.8659249481425608, + "grad_norm": 2.0140292644500732, + "learning_rate": 4.193340705265746e-06, + "loss": 1.0264077186584473, + "step": 574, + "token_acc": 0.7288190628434409 + }, + { + "epoch": 0.8674335281915897, + "grad_norm": 5.797197341918945, + "learning_rate": 4.17702704859633e-06, + "loss": 0.960410475730896, + "step": 575, + "token_acc": 0.7467691138140747 + }, + { + "epoch": 0.8689421082406186, + "grad_norm": 1.8315378427505493, + "learning_rate": 4.160722391827262e-06, + "loss": 1.0545425415039062, + "step": 576, + "token_acc": 0.7240204049809539 + }, + { + "epoch": 0.8704506882896473, + "grad_norm": 1.4729951620101929, + "learning_rate": 4.14442691326365e-06, + "loss": 1.0044481754302979, + "step": 577, + "token_acc": 0.7270260457472626 + }, + { + "epoch": 0.8719592683386762, + "grad_norm": 1.8943400382995605, + "learning_rate": 4.128140791110243e-06, + "loss": 1.0536444187164307, + "step": 578, + "token_acc": 0.7221897350546534 + }, + { + "epoch": 0.8734678483877051, + "grad_norm": 1.3474115133285522, + "learning_rate": 4.111864203469457e-06, + "loss": 1.0629198551177979, + "step": 579, + "token_acc": 0.7139775593633122 + }, + { + "epoch": 0.874976428436734, + "grad_norm": 2.3920254707336426, + "learning_rate": 4.0955973283394525e-06, + "loss": 1.045224905014038, + "step": 580, + "token_acc": 0.7251079668822028 + }, + { + "epoch": 0.8764850084857628, + "grad_norm": 1.7864826917648315, + "learning_rate": 4.079340343612165e-06, + "loss": 0.9681279063224792, + "step": 581, + "token_acc": 0.7359263942789867 + }, + { + "epoch": 0.8779935885347916, + "grad_norm": 1.6977518796920776, + "learning_rate": 4.063093427071376e-06, + "loss": 0.9895939826965332, + "step": 582, + "token_acc": 0.7381979672521799 + }, + { + "epoch": 0.8795021685838205, + "grad_norm": 1.5250004529953003, + "learning_rate": 4.046856756390767e-06, + "loss": 0.9930386543273926, + "step": 583, + "token_acc": 0.7347820515167603 + }, + { + "epoch": 0.8810107486328493, + "grad_norm": 1.8343875408172607, + "learning_rate": 4.03063050913196e-06, + "loss": 1.0659531354904175, + "step": 584, + "token_acc": 0.7141949392942244 + }, + { + "epoch": 0.8825193286818782, + "grad_norm": 1.6999495029449463, + "learning_rate": 4.0144148627426e-06, + "loss": 1.0370471477508545, + "step": 585, + "token_acc": 0.7263593882752761 + }, + { + "epoch": 0.8840279087309071, + "grad_norm": 1.6950886249542236, + "learning_rate": 3.998209994554395e-06, + "loss": 1.0114948749542236, + "step": 586, + "token_acc": 0.7309895167476349 + }, + { + "epoch": 0.8855364887799358, + "grad_norm": 1.8514199256896973, + "learning_rate": 3.982016081781189e-06, + "loss": 1.0452059507369995, + "step": 587, + "token_acc": 0.719009075300466 + }, + { + "epoch": 0.8870450688289647, + "grad_norm": 1.44926917552948, + "learning_rate": 3.965833301517017e-06, + "loss": 1.0612425804138184, + "step": 588, + "token_acc": 0.7174135777344244 + }, + { + "epoch": 0.8885536488779936, + "grad_norm": 1.6306642293930054, + "learning_rate": 3.949661830734172e-06, + "loss": 0.9965438842773438, + "step": 589, + "token_acc": 0.7329620253164557 + }, + { + "epoch": 0.8900622289270225, + "grad_norm": 1.7434444427490234, + "learning_rate": 3.9335018462812664e-06, + "loss": 0.995252788066864, + "step": 590, + "token_acc": 0.7335218093699515 + }, + { + "epoch": 0.8915708089760513, + "grad_norm": 1.638238549232483, + "learning_rate": 3.9173535248813026e-06, + "loss": 1.0281872749328613, + "step": 591, + "token_acc": 0.7213418530351438 + }, + { + "epoch": 0.8930793890250801, + "grad_norm": 1.847936749458313, + "learning_rate": 3.901217043129735e-06, + "loss": 1.0718886852264404, + "step": 592, + "token_acc": 0.7159040968797304 + }, + { + "epoch": 0.894587969074109, + "grad_norm": 2.310101270675659, + "learning_rate": 3.885092577492543e-06, + "loss": 1.006734013557434, + "step": 593, + "token_acc": 0.7290177945915651 + }, + { + "epoch": 0.8960965491231379, + "grad_norm": 1.7288548946380615, + "learning_rate": 3.8689803043043e-06, + "loss": 1.0565593242645264, + "step": 594, + "token_acc": 0.7146984974692121 + }, + { + "epoch": 0.8976051291721667, + "grad_norm": 2.3463895320892334, + "learning_rate": 3.852880399766243e-06, + "loss": 1.0266249179840088, + "step": 595, + "token_acc": 0.7200602270094973 + }, + { + "epoch": 0.8991137092211956, + "grad_norm": 2.242974042892456, + "learning_rate": 3.8367930399443495e-06, + "loss": 1.013922929763794, + "step": 596, + "token_acc": 0.7334527515123039 + }, + { + "epoch": 0.9006222892702244, + "grad_norm": 1.5093663930892944, + "learning_rate": 3.820718400767409e-06, + "loss": 1.0853023529052734, + "step": 597, + "token_acc": 0.7076195991257729 + }, + { + "epoch": 0.9021308693192532, + "grad_norm": 2.258570671081543, + "learning_rate": 3.8046566580251e-06, + "loss": 0.9982188940048218, + "step": 598, + "token_acc": 0.7368648089034596 + }, + { + "epoch": 0.9036394493682821, + "grad_norm": 1.731276035308838, + "learning_rate": 3.7886079873660693e-06, + "loss": 1.0135633945465088, + "step": 599, + "token_acc": 0.728326199799654 + }, + { + "epoch": 0.905148029417311, + "grad_norm": 1.828013300895691, + "learning_rate": 3.7725725642960047e-06, + "loss": 1.0183073282241821, + "step": 600, + "token_acc": 0.7302929337673093 + }, + { + "epoch": 0.9066566094663399, + "grad_norm": 1.9945558309555054, + "learning_rate": 3.756550564175727e-06, + "loss": 1.0274524688720703, + "step": 601, + "token_acc": 0.7270367700072098 + }, + { + "epoch": 0.9081651895153686, + "grad_norm": 3.1782305240631104, + "learning_rate": 3.7405421622192607e-06, + "loss": 1.0179848670959473, + "step": 602, + "token_acc": 0.7313132746712082 + }, + { + "epoch": 0.9096737695643975, + "grad_norm": 1.4516139030456543, + "learning_rate": 3.7245475334919246e-06, + "loss": 1.0211703777313232, + "step": 603, + "token_acc": 0.7278850690659421 + }, + { + "epoch": 0.9111823496134264, + "grad_norm": 1.5925132036209106, + "learning_rate": 3.7085668529084183e-06, + "loss": 0.954882800579071, + "step": 604, + "token_acc": 0.743773987206823 + }, + { + "epoch": 0.9126909296624552, + "grad_norm": 1.7870968580245972, + "learning_rate": 3.6926002952309015e-06, + "loss": 1.0111029148101807, + "step": 605, + "token_acc": 0.7296811120196238 + }, + { + "epoch": 0.9141995097114841, + "grad_norm": 2.091892719268799, + "learning_rate": 3.676648035067093e-06, + "loss": 1.00455641746521, + "step": 606, + "token_acc": 0.7310988329480747 + }, + { + "epoch": 0.9157080897605129, + "grad_norm": 2.190229892730713, + "learning_rate": 3.6607102468683524e-06, + "loss": 0.9872298240661621, + "step": 607, + "token_acc": 0.7392593664111095 + }, + { + "epoch": 0.9172166698095418, + "grad_norm": 1.7085151672363281, + "learning_rate": 3.64478710492778e-06, + "loss": 1.0182137489318848, + "step": 608, + "token_acc": 0.7329062622565041 + }, + { + "epoch": 0.9187252498585706, + "grad_norm": 1.8669676780700684, + "learning_rate": 3.628878783378302e-06, + "loss": 1.0653152465820312, + "step": 609, + "token_acc": 0.7132568201248737 + }, + { + "epoch": 0.9202338299075995, + "grad_norm": 2.6574013233184814, + "learning_rate": 3.6129854561907786e-06, + "loss": 0.978446900844574, + "step": 610, + "token_acc": 0.7382480485689505 + }, + { + "epoch": 0.9217424099566284, + "grad_norm": 1.7808107137680054, + "learning_rate": 3.5971072971720844e-06, + "loss": 1.0076225996017456, + "step": 611, + "token_acc": 0.7268224733013465 + }, + { + "epoch": 0.9232509900056571, + "grad_norm": 2.233445644378662, + "learning_rate": 3.581244479963225e-06, + "loss": 1.0435742139816284, + "step": 612, + "token_acc": 0.7184824245155476 + }, + { + "epoch": 0.924759570054686, + "grad_norm": 1.793081283569336, + "learning_rate": 3.56539717803743e-06, + "loss": 0.966189444065094, + "step": 613, + "token_acc": 0.7409753008233059 + }, + { + "epoch": 0.9262681501037149, + "grad_norm": 1.7225725650787354, + "learning_rate": 3.5495655646982506e-06, + "loss": 0.9894843101501465, + "step": 614, + "token_acc": 0.736391356224314 + }, + { + "epoch": 0.9277767301527438, + "grad_norm": 1.6624302864074707, + "learning_rate": 3.533749813077677e-06, + "loss": 0.978921115398407, + "step": 615, + "token_acc": 0.7329856584093872 + }, + { + "epoch": 0.9292853102017726, + "grad_norm": 2.492824077606201, + "learning_rate": 3.517950096134232e-06, + "loss": 0.9910463094711304, + "step": 616, + "token_acc": 0.7296101813290566 + }, + { + "epoch": 0.9307938902508014, + "grad_norm": 1.7076109647750854, + "learning_rate": 3.5021665866510924e-06, + "loss": 1.030413031578064, + "step": 617, + "token_acc": 0.7229139482937833 + }, + { + "epoch": 0.9323024702998303, + "grad_norm": 1.8808494806289673, + "learning_rate": 3.4863994572341845e-06, + "loss": 0.9829146862030029, + "step": 618, + "token_acc": 0.7381431445817763 + }, + { + "epoch": 0.9338110503488591, + "grad_norm": 2.130441188812256, + "learning_rate": 3.470648880310313e-06, + "loss": 1.0092934370040894, + "step": 619, + "token_acc": 0.7290599506734965 + }, + { + "epoch": 0.935319630397888, + "grad_norm": 2.03429913520813, + "learning_rate": 3.4549150281252635e-06, + "loss": 1.050522804260254, + "step": 620, + "token_acc": 0.7242481740441349 + }, + { + "epoch": 0.9368282104469169, + "grad_norm": 1.738586187362671, + "learning_rate": 3.4391980727419206e-06, + "loss": 1.0056005716323853, + "step": 621, + "token_acc": 0.7337035954399298 + }, + { + "epoch": 0.9383367904959456, + "grad_norm": 2.1147565841674805, + "learning_rate": 3.423498186038393e-06, + "loss": 0.9933333396911621, + "step": 622, + "token_acc": 0.7350120449070496 + }, + { + "epoch": 0.9398453705449745, + "grad_norm": 1.776733160018921, + "learning_rate": 3.4078155397061243e-06, + "loss": 0.9937822818756104, + "step": 623, + "token_acc": 0.7350757810083514 + }, + { + "epoch": 0.9413539505940034, + "grad_norm": 2.0285677909851074, + "learning_rate": 3.3921503052480243e-06, + "loss": 1.023162841796875, + "step": 624, + "token_acc": 0.7294474189312051 + }, + { + "epoch": 0.9428625306430323, + "grad_norm": 1.937382698059082, + "learning_rate": 3.3765026539765832e-06, + "loss": 1.0625144243240356, + "step": 625, + "token_acc": 0.720278066773858 + }, + { + "epoch": 0.9443711106920611, + "grad_norm": 2.0632500648498535, + "learning_rate": 3.3608727570120114e-06, + "loss": 0.9372000694274902, + "step": 626, + "token_acc": 0.7461263408820024 + }, + { + "epoch": 0.9458796907410899, + "grad_norm": 2.123002052307129, + "learning_rate": 3.3452607852803585e-06, + "loss": 1.033599615097046, + "step": 627, + "token_acc": 0.7248429436840924 + }, + { + "epoch": 0.9473882707901188, + "grad_norm": 1.9222874641418457, + "learning_rate": 3.3296669095116454e-06, + "loss": 1.008436918258667, + "step": 628, + "token_acc": 0.7292303385204022 + }, + { + "epoch": 0.9488968508391477, + "grad_norm": 1.8565330505371094, + "learning_rate": 3.3140913002379993e-06, + "loss": 0.9962430596351624, + "step": 629, + "token_acc": 0.7366631587468631 + }, + { + "epoch": 0.9504054308881765, + "grad_norm": 2.5017733573913574, + "learning_rate": 3.298534127791785e-06, + "loss": 1.017548680305481, + "step": 630, + "token_acc": 0.7280129456488236 + }, + { + "epoch": 0.9519140109372054, + "grad_norm": 2.0651659965515137, + "learning_rate": 3.2829955623037536e-06, + "loss": 1.011000156402588, + "step": 631, + "token_acc": 0.725199214689918 + }, + { + "epoch": 0.9534225909862342, + "grad_norm": 1.6946897506713867, + "learning_rate": 3.267475773701161e-06, + "loss": 1.0209898948669434, + "step": 632, + "token_acc": 0.7242810943099379 + }, + { + "epoch": 0.954931171035263, + "grad_norm": 1.8070124387741089, + "learning_rate": 3.251974931705933e-06, + "loss": 1.010417103767395, + "step": 633, + "token_acc": 0.7332030476944389 + }, + { + "epoch": 0.9564397510842919, + "grad_norm": 1.7403738498687744, + "learning_rate": 3.236493205832795e-06, + "loss": 1.0137724876403809, + "step": 634, + "token_acc": 0.7329824932564658 + }, + { + "epoch": 0.9579483311333208, + "grad_norm": 1.5408200025558472, + "learning_rate": 3.2210307653874175e-06, + "loss": 1.0014162063598633, + "step": 635, + "token_acc": 0.7347369309311843 + }, + { + "epoch": 0.9594569111823497, + "grad_norm": 1.6927746534347534, + "learning_rate": 3.205587779464576e-06, + "loss": 0.9966017007827759, + "step": 636, + "token_acc": 0.7335542168674699 + }, + { + "epoch": 0.9609654912313784, + "grad_norm": 1.663963794708252, + "learning_rate": 3.1901644169462854e-06, + "loss": 1.013539433479309, + "step": 637, + "token_acc": 0.7283959806475624 + }, + { + "epoch": 0.9624740712804073, + "grad_norm": 1.5094809532165527, + "learning_rate": 3.1747608464999723e-06, + "loss": 0.9437252283096313, + "step": 638, + "token_acc": 0.7417498700414075 + }, + { + "epoch": 0.9639826513294362, + "grad_norm": 1.8169877529144287, + "learning_rate": 3.1593772365766107e-06, + "loss": 0.9592390656471252, + "step": 639, + "token_acc": 0.7401287143433657 + }, + { + "epoch": 0.965491231378465, + "grad_norm": 1.9399446249008179, + "learning_rate": 3.1440137554088957e-06, + "loss": 1.0221266746520996, + "step": 640, + "token_acc": 0.7242537313432836 + }, + { + "epoch": 0.9669998114274939, + "grad_norm": 1.8464694023132324, + "learning_rate": 3.128670571009399e-06, + "loss": 1.036946177482605, + "step": 641, + "token_acc": 0.7245742760731382 + }, + { + "epoch": 0.9685083914765227, + "grad_norm": 1.8377056121826172, + "learning_rate": 3.1133478511687217e-06, + "loss": 1.0493295192718506, + "step": 642, + "token_acc": 0.7251433938885224 + }, + { + "epoch": 0.9700169715255516, + "grad_norm": 1.9684299230575562, + "learning_rate": 3.0980457634536775e-06, + "loss": 0.9758336544036865, + "step": 643, + "token_acc": 0.7387948139490637 + }, + { + "epoch": 0.9715255515745804, + "grad_norm": 1.9786468744277954, + "learning_rate": 3.082764475205442e-06, + "loss": 1.016764760017395, + "step": 644, + "token_acc": 0.7336975857687421 + }, + { + "epoch": 0.9730341316236093, + "grad_norm": 1.8363456726074219, + "learning_rate": 3.06750415353774e-06, + "loss": 1.038301706314087, + "step": 645, + "token_acc": 0.7204810711535043 + }, + { + "epoch": 0.9745427116726382, + "grad_norm": 1.8726056814193726, + "learning_rate": 3.052264965335e-06, + "loss": 1.0489846467971802, + "step": 646, + "token_acc": 0.7236939013090684 + }, + { + "epoch": 0.9760512917216669, + "grad_norm": 2.253614664077759, + "learning_rate": 3.0370470772505433e-06, + "loss": 1.009007215499878, + "step": 647, + "token_acc": 0.7297306824591089 + }, + { + "epoch": 0.9775598717706958, + "grad_norm": 4.38145637512207, + "learning_rate": 3.02185065570476e-06, + "loss": 1.0285372734069824, + "step": 648, + "token_acc": 0.727710843373494 + }, + { + "epoch": 0.9790684518197247, + "grad_norm": 1.7132043838500977, + "learning_rate": 3.0066758668832752e-06, + "loss": 1.06623375415802, + "step": 649, + "token_acc": 0.711100608252507 + }, + { + "epoch": 0.9805770318687536, + "grad_norm": 1.7995939254760742, + "learning_rate": 2.991522876735154e-06, + "loss": 1.0441339015960693, + "step": 650, + "token_acc": 0.7211513568217657 + }, + { + "epoch": 0.9820856119177824, + "grad_norm": 2.332531452178955, + "learning_rate": 2.9763918509710647e-06, + "loss": 0.997922420501709, + "step": 651, + "token_acc": 0.7342555143935017 + }, + { + "epoch": 0.9835941919668112, + "grad_norm": 2.4428551197052, + "learning_rate": 2.9612829550614836e-06, + "loss": 0.9762682914733887, + "step": 652, + "token_acc": 0.7428345745881291 + }, + { + "epoch": 0.9851027720158401, + "grad_norm": 4.400890350341797, + "learning_rate": 2.9461963542348737e-06, + "loss": 1.0094895362854004, + "step": 653, + "token_acc": 0.7292152477038024 + }, + { + "epoch": 0.9866113520648689, + "grad_norm": 1.905814290046692, + "learning_rate": 2.931132213475884e-06, + "loss": 1.0046759843826294, + "step": 654, + "token_acc": 0.7375923470354234 + }, + { + "epoch": 0.9881199321138978, + "grad_norm": 1.6962034702301025, + "learning_rate": 2.9160906975235493e-06, + "loss": 1.0463544130325317, + "step": 655, + "token_acc": 0.7250240483715817 + }, + { + "epoch": 0.9896285121629267, + "grad_norm": 1.4924402236938477, + "learning_rate": 2.9010719708694724e-06, + "loss": 1.0403640270233154, + "step": 656, + "token_acc": 0.7240203784019242 + }, + { + "epoch": 0.9911370922119555, + "grad_norm": 2.1192667484283447, + "learning_rate": 2.8860761977560435e-06, + "loss": 1.0421209335327148, + "step": 657, + "token_acc": 0.7272775353695623 + }, + { + "epoch": 0.9926456722609843, + "grad_norm": 1.7598618268966675, + "learning_rate": 2.871103542174637e-06, + "loss": 1.0004884004592896, + "step": 658, + "token_acc": 0.725534729878721 + }, + { + "epoch": 0.9941542523100132, + "grad_norm": 1.7949347496032715, + "learning_rate": 2.8561541678638145e-06, + "loss": 1.0497692823410034, + "step": 659, + "token_acc": 0.7202207505518764 + }, + { + "epoch": 0.9956628323590421, + "grad_norm": 2.0620481967926025, + "learning_rate": 2.8412282383075362e-06, + "loss": 0.9895404577255249, + "step": 660, + "token_acc": 0.7334742734635541 + }, + { + "epoch": 0.997171412408071, + "grad_norm": 2.117434024810791, + "learning_rate": 2.826325916733378e-06, + "loss": 1.0997333526611328, + "step": 661, + "token_acc": 0.7193315191995822 + }, + { + "epoch": 0.9986799924570997, + "grad_norm": 1.5192148685455322, + "learning_rate": 2.811447366110741e-06, + "loss": 0.8911752104759216, + "step": 662, + "token_acc": 0.7557833956397824 + }, + { + "epoch": 1.0, + "grad_norm": 1.4657164812088013, + "learning_rate": 2.796592749149071e-06, + "loss": 0.9475839734077454, + "step": 663, + "token_acc": 0.7403959256200415 + }, + { + "epoch": 1.0015085800490289, + "grad_norm": 1.940261721611023, + "learning_rate": 2.7817622282960816e-06, + "loss": 0.9128296375274658, + "step": 664, + "token_acc": 0.7530687601876185 + }, + { + "epoch": 1.0030171600980577, + "grad_norm": 1.8972185850143433, + "learning_rate": 2.766955965735968e-06, + "loss": 0.9728459715843201, + "step": 665, + "token_acc": 0.7353614144565782 + }, + { + "epoch": 1.0045257401470866, + "grad_norm": 1.836915373802185, + "learning_rate": 2.7521741233876496e-06, + "loss": 0.974313497543335, + "step": 666, + "token_acc": 0.7330869389749145 + }, + { + "epoch": 1.0060343201961155, + "grad_norm": 1.8581467866897583, + "learning_rate": 2.7374168629029814e-06, + "loss": 0.9710570573806763, + "step": 667, + "token_acc": 0.7375868804797165 + }, + { + "epoch": 1.0075429002451444, + "grad_norm": 1.5058178901672363, + "learning_rate": 2.722684345665004e-06, + "loss": 0.9634053111076355, + "step": 668, + "token_acc": 0.7408511874722821 + }, + { + "epoch": 1.009051480294173, + "grad_norm": 1.9686533212661743, + "learning_rate": 2.707976732786166e-06, + "loss": 0.9671412706375122, + "step": 669, + "token_acc": 0.7370878901572603 + }, + { + "epoch": 1.010560060343202, + "grad_norm": 1.6449058055877686, + "learning_rate": 2.693294185106562e-06, + "loss": 0.9256716966629028, + "step": 670, + "token_acc": 0.7481655468764224 + }, + { + "epoch": 1.0120686403922308, + "grad_norm": 1.6851818561553955, + "learning_rate": 2.678636863192184e-06, + "loss": 0.9527168273925781, + "step": 671, + "token_acc": 0.7368122937496921 + }, + { + "epoch": 1.0135772204412596, + "grad_norm": 2.0373694896698, + "learning_rate": 2.6640049273331516e-06, + "loss": 1.0088560581207275, + "step": 672, + "token_acc": 0.7225473552537527 + }, + { + "epoch": 1.0150858004902885, + "grad_norm": 2.0390496253967285, + "learning_rate": 2.649398537541978e-06, + "loss": 0.97276771068573, + "step": 673, + "token_acc": 0.7403462793096719 + }, + { + "epoch": 1.0165943805393174, + "grad_norm": 1.8950082063674927, + "learning_rate": 2.6348178535517967e-06, + "loss": 0.9665853381156921, + "step": 674, + "token_acc": 0.737511817461737 + }, + { + "epoch": 1.0181029605883463, + "grad_norm": 1.7364633083343506, + "learning_rate": 2.6202630348146323e-06, + "loss": 1.0180974006652832, + "step": 675, + "token_acc": 0.7276204308884048 + }, + { + "epoch": 1.0196115406373751, + "grad_norm": 2.166182041168213, + "learning_rate": 2.605734240499652e-06, + "loss": 1.0275691747665405, + "step": 676, + "token_acc": 0.7261073532620554 + }, + { + "epoch": 1.021120120686404, + "grad_norm": 2.00640606880188, + "learning_rate": 2.5912316294914232e-06, + "loss": 0.9593705534934998, + "step": 677, + "token_acc": 0.7423252775963423 + }, + { + "epoch": 1.0226287007354329, + "grad_norm": 1.73147451877594, + "learning_rate": 2.576755360388177e-06, + "loss": 0.9404624700546265, + "step": 678, + "token_acc": 0.741882848645076 + }, + { + "epoch": 1.0241372807844615, + "grad_norm": 1.986806869506836, + "learning_rate": 2.562305591500069e-06, + "loss": 0.9454247355461121, + "step": 679, + "token_acc": 0.7449106875561321 + }, + { + "epoch": 1.0256458608334904, + "grad_norm": 2.4402265548706055, + "learning_rate": 2.5478824808474613e-06, + "loss": 0.8899649977684021, + "step": 680, + "token_acc": 0.7580290224844523 + }, + { + "epoch": 1.0271544408825193, + "grad_norm": 2.0099384784698486, + "learning_rate": 2.5334861861591753e-06, + "loss": 0.9586272835731506, + "step": 681, + "token_acc": 0.7410303587856486 + }, + { + "epoch": 1.0286630209315482, + "grad_norm": 1.485263705253601, + "learning_rate": 2.5191168648707888e-06, + "loss": 0.9728769659996033, + "step": 682, + "token_acc": 0.7352111873771753 + }, + { + "epoch": 1.030171600980577, + "grad_norm": 1.9325636625289917, + "learning_rate": 2.5047746741228977e-06, + "loss": 0.9416171312332153, + "step": 683, + "token_acc": 0.7405371446521124 + }, + { + "epoch": 1.031680181029606, + "grad_norm": 1.7862087488174438, + "learning_rate": 2.490459770759398e-06, + "loss": 0.9893593788146973, + "step": 684, + "token_acc": 0.739930522955591 + }, + { + "epoch": 1.0331887610786348, + "grad_norm": 1.6451327800750732, + "learning_rate": 2.476172311325783e-06, + "loss": 0.994601309299469, + "step": 685, + "token_acc": 0.7254228135202141 + }, + { + "epoch": 1.0346973411276637, + "grad_norm": 1.927067518234253, + "learning_rate": 2.461912452067415e-06, + "loss": 0.9902089834213257, + "step": 686, + "token_acc": 0.7362722527409906 + }, + { + "epoch": 1.0362059211766925, + "grad_norm": 2.502016067504883, + "learning_rate": 2.447680348927837e-06, + "loss": 0.9662191867828369, + "step": 687, + "token_acc": 0.7367522409903043 + }, + { + "epoch": 1.0377145012257212, + "grad_norm": 3.027484178543091, + "learning_rate": 2.433476157547044e-06, + "loss": 0.9326030015945435, + "step": 688, + "token_acc": 0.7491740641788043 + }, + { + "epoch": 1.03922308127475, + "grad_norm": 1.8432146310806274, + "learning_rate": 2.4193000332597984e-06, + "loss": 0.9986776113510132, + "step": 689, + "token_acc": 0.7347249973709118 + }, + { + "epoch": 1.040731661323779, + "grad_norm": 1.7616575956344604, + "learning_rate": 2.4051521310939258e-06, + "loss": 0.9970366358757019, + "step": 690, + "token_acc": 0.737896390623062 + }, + { + "epoch": 1.0422402413728078, + "grad_norm": 2.630901336669922, + "learning_rate": 2.391032605768613e-06, + "loss": 0.9956340789794922, + "step": 691, + "token_acc": 0.7350882723833544 + }, + { + "epoch": 1.0437488214218367, + "grad_norm": 2.1941123008728027, + "learning_rate": 2.3769416116927335e-06, + "loss": 0.930854082107544, + "step": 692, + "token_acc": 0.7454521659468656 + }, + { + "epoch": 1.0452574014708655, + "grad_norm": 2.2033956050872803, + "learning_rate": 2.3628793029631353e-06, + "loss": 1.0024913549423218, + "step": 693, + "token_acc": 0.7294554028408764 + }, + { + "epoch": 1.0467659815198944, + "grad_norm": 1.9845638275146484, + "learning_rate": 2.3488458333629777e-06, + "loss": 0.9307359457015991, + "step": 694, + "token_acc": 0.748718887262079 + }, + { + "epoch": 1.0482745615689233, + "grad_norm": 2.2986338138580322, + "learning_rate": 2.3348413563600324e-06, + "loss": 0.980610728263855, + "step": 695, + "token_acc": 0.7430798036536612 + }, + { + "epoch": 1.0497831416179522, + "grad_norm": 1.9241856336593628, + "learning_rate": 2.320866025105016e-06, + "loss": 0.9825087189674377, + "step": 696, + "token_acc": 0.7351659267368079 + }, + { + "epoch": 1.051291721666981, + "grad_norm": 2.2569739818573, + "learning_rate": 2.3069199924299175e-06, + "loss": 1.0065641403198242, + "step": 697, + "token_acc": 0.7275556338441665 + }, + { + "epoch": 1.05280030171601, + "grad_norm": 1.7244075536727905, + "learning_rate": 2.29300341084631e-06, + "loss": 0.9419455528259277, + "step": 698, + "token_acc": 0.7437867760916692 + }, + { + "epoch": 1.0543088817650386, + "grad_norm": 2.6683413982391357, + "learning_rate": 2.2791164325437047e-06, + "loss": 0.9948554635047913, + "step": 699, + "token_acc": 0.7320281588818663 + }, + { + "epoch": 1.0558174618140674, + "grad_norm": 2.418962240219116, + "learning_rate": 2.265259209387867e-06, + "loss": 0.9835085272789001, + "step": 700, + "token_acc": 0.7327017162382542 + }, + { + "epoch": 1.0573260418630963, + "grad_norm": 1.8841325044631958, + "learning_rate": 2.2514318929191707e-06, + "loss": 1.0224485397338867, + "step": 701, + "token_acc": 0.7226960171459189 + }, + { + "epoch": 1.0588346219121252, + "grad_norm": 1.9105582237243652, + "learning_rate": 2.2376346343509343e-06, + "loss": 0.9762578010559082, + "step": 702, + "token_acc": 0.739794032215474 + }, + { + "epoch": 1.060343201961154, + "grad_norm": 1.6198151111602783, + "learning_rate": 2.2238675845677663e-06, + "loss": 0.9545873999595642, + "step": 703, + "token_acc": 0.7397340115508928 + }, + { + "epoch": 1.061851782010183, + "grad_norm": 1.8821969032287598, + "learning_rate": 2.2101308941239204e-06, + "loss": 0.9463695287704468, + "step": 704, + "token_acc": 0.738154970969823 + }, + { + "epoch": 1.0633603620592118, + "grad_norm": 1.6714779138565063, + "learning_rate": 2.1964247132416373e-06, + "loss": 0.9031643867492676, + "step": 705, + "token_acc": 0.7501501362775442 + }, + { + "epoch": 1.0648689421082407, + "grad_norm": 1.6961950063705444, + "learning_rate": 2.182749191809518e-06, + "loss": 0.9809706211090088, + "step": 706, + "token_acc": 0.7395117558387658 + }, + { + "epoch": 1.0663775221572696, + "grad_norm": 1.777477502822876, + "learning_rate": 2.1691044793808734e-06, + "loss": 0.9781726002693176, + "step": 707, + "token_acc": 0.7366929394484785 + }, + { + "epoch": 1.0678861022062982, + "grad_norm": 1.8834912776947021, + "learning_rate": 2.1554907251720947e-06, + "loss": 0.8747799396514893, + "step": 708, + "token_acc": 0.7640444616110896 + }, + { + "epoch": 1.069394682255327, + "grad_norm": 2.5899436473846436, + "learning_rate": 2.1419080780610123e-06, + "loss": 0.9561811089515686, + "step": 709, + "token_acc": 0.7386106330948283 + }, + { + "epoch": 1.070903262304356, + "grad_norm": 1.7520380020141602, + "learning_rate": 2.1283566865852824e-06, + "loss": 0.9618830680847168, + "step": 710, + "token_acc": 0.7422116428631087 + }, + { + "epoch": 1.0724118423533848, + "grad_norm": 1.7773940563201904, + "learning_rate": 2.11483669894075e-06, + "loss": 0.9755873680114746, + "step": 711, + "token_acc": 0.7365710990278465 + }, + { + "epoch": 1.0739204224024137, + "grad_norm": 5.488055229187012, + "learning_rate": 2.1013482629798334e-06, + "loss": 0.9814971685409546, + "step": 712, + "token_acc": 0.7365990095198994 + }, + { + "epoch": 1.0754290024514426, + "grad_norm": 1.8907276391983032, + "learning_rate": 2.08789152620991e-06, + "loss": 0.9524366855621338, + "step": 713, + "token_acc": 0.7388553835772427 + }, + { + "epoch": 1.0769375825004714, + "grad_norm": 2.13346791267395, + "learning_rate": 2.0744666357916925e-06, + "loss": 0.991515576839447, + "step": 714, + "token_acc": 0.738126575501372 + }, + { + "epoch": 1.0784461625495003, + "grad_norm": 1.8754643201828003, + "learning_rate": 2.061073738537635e-06, + "loss": 0.9899577498435974, + "step": 715, + "token_acc": 0.7356850835182805 + }, + { + "epoch": 1.0799547425985292, + "grad_norm": 1.9401235580444336, + "learning_rate": 2.0477129809103147e-06, + "loss": 0.9677337408065796, + "step": 716, + "token_acc": 0.7377985874136973 + }, + { + "epoch": 1.081463322647558, + "grad_norm": 1.7741327285766602, + "learning_rate": 2.034384509020837e-06, + "loss": 0.9789707064628601, + "step": 717, + "token_acc": 0.7294345923017914 + }, + { + "epoch": 1.082971902696587, + "grad_norm": 1.7175211906433105, + "learning_rate": 2.021088468627237e-06, + "loss": 0.9918591380119324, + "step": 718, + "token_acc": 0.7370982433311646 + }, + { + "epoch": 1.0844804827456156, + "grad_norm": 1.7309610843658447, + "learning_rate": 2.0078250051328783e-06, + "loss": 0.9816150665283203, + "step": 719, + "token_acc": 0.731834611356957 + }, + { + "epoch": 1.0859890627946445, + "grad_norm": 2.6974611282348633, + "learning_rate": 1.9945942635848745e-06, + "loss": 0.9954217672348022, + "step": 720, + "token_acc": 0.7344408048666354 + }, + { + "epoch": 1.0874976428436733, + "grad_norm": 2.218445301055908, + "learning_rate": 1.981396388672496e-06, + "loss": 1.02736496925354, + "step": 721, + "token_acc": 0.7269788946444625 + }, + { + "epoch": 1.0890062228927022, + "grad_norm": 1.4891468286514282, + "learning_rate": 1.9682315247255897e-06, + "loss": 0.8803458213806152, + "step": 722, + "token_acc": 0.7582582034529328 + }, + { + "epoch": 1.090514802941731, + "grad_norm": 1.9883731603622437, + "learning_rate": 1.9550998157129946e-06, + "loss": 0.984562873840332, + "step": 723, + "token_acc": 0.737860186474286 + }, + { + "epoch": 1.09202338299076, + "grad_norm": 2.153334379196167, + "learning_rate": 1.9420014052409793e-06, + "loss": 0.9894038438796997, + "step": 724, + "token_acc": 0.7275278356114783 + }, + { + "epoch": 1.0935319630397888, + "grad_norm": 1.7793869972229004, + "learning_rate": 1.928936436551661e-06, + "loss": 0.9563705325126648, + "step": 725, + "token_acc": 0.73974756006006 + }, + { + "epoch": 1.0950405430888177, + "grad_norm": 1.7301340103149414, + "learning_rate": 1.915905052521445e-06, + "loss": 1.0130729675292969, + "step": 726, + "token_acc": 0.7271245202030624 + }, + { + "epoch": 1.0965491231378466, + "grad_norm": 1.9354634284973145, + "learning_rate": 1.9029073956594607e-06, + "loss": 0.9821668267250061, + "step": 727, + "token_acc": 0.7344351203781744 + }, + { + "epoch": 1.0980577031868755, + "grad_norm": 1.98508882522583, + "learning_rate": 1.8899436081059974e-06, + "loss": 0.9322086572647095, + "step": 728, + "token_acc": 0.7432247042683716 + }, + { + "epoch": 1.099566283235904, + "grad_norm": 1.7080272436141968, + "learning_rate": 1.877013831630961e-06, + "loss": 0.9658929109573364, + "step": 729, + "token_acc": 0.7420047948613562 + }, + { + "epoch": 1.101074863284933, + "grad_norm": 1.6963342428207397, + "learning_rate": 1.864118207632315e-06, + "loss": 0.9788750410079956, + "step": 730, + "token_acc": 0.7406140271900603 + }, + { + "epoch": 1.1025834433339619, + "grad_norm": 1.825722098350525, + "learning_rate": 1.851256877134538e-06, + "loss": 0.9220666885375977, + "step": 731, + "token_acc": 0.7450178400487338 + }, + { + "epoch": 1.1040920233829907, + "grad_norm": 1.808821678161621, + "learning_rate": 1.838429980787081e-06, + "loss": 0.9150567650794983, + "step": 732, + "token_acc": 0.7571684368024725 + }, + { + "epoch": 1.1056006034320196, + "grad_norm": 1.7835012674331665, + "learning_rate": 1.825637658862824e-06, + "loss": 0.9868404865264893, + "step": 733, + "token_acc": 0.7306973174043968 + }, + { + "epoch": 1.1071091834810485, + "grad_norm": 1.9414182901382446, + "learning_rate": 1.8128800512565514e-06, + "loss": 0.9682952761650085, + "step": 734, + "token_acc": 0.7432016962629208 + }, + { + "epoch": 1.1086177635300773, + "grad_norm": 2.069830894470215, + "learning_rate": 1.8001572974834169e-06, + "loss": 0.9780141711235046, + "step": 735, + "token_acc": 0.7398651856071394 + }, + { + "epoch": 1.1101263435791062, + "grad_norm": 2.1255006790161133, + "learning_rate": 1.7874695366774191e-06, + "loss": 0.9581552743911743, + "step": 736, + "token_acc": 0.7459519233300861 + }, + { + "epoch": 1.111634923628135, + "grad_norm": 2.1308693885803223, + "learning_rate": 1.774816907589873e-06, + "loss": 0.9390983581542969, + "step": 737, + "token_acc": 0.7426739083739913 + }, + { + "epoch": 1.113143503677164, + "grad_norm": 2.0100886821746826, + "learning_rate": 1.7621995485879062e-06, + "loss": 0.9809353947639465, + "step": 738, + "token_acc": 0.736348273513759 + }, + { + "epoch": 1.1146520837261926, + "grad_norm": 1.9453312158584595, + "learning_rate": 1.749617597652934e-06, + "loss": 1.0272727012634277, + "step": 739, + "token_acc": 0.7247264304161453 + }, + { + "epoch": 1.1161606637752215, + "grad_norm": 1.75546133518219, + "learning_rate": 1.7370711923791567e-06, + "loss": 1.0196990966796875, + "step": 740, + "token_acc": 0.7230008946449111 + }, + { + "epoch": 1.1176692438242504, + "grad_norm": 2.0240206718444824, + "learning_rate": 1.7245604699720536e-06, + "loss": 1.0177955627441406, + "step": 741, + "token_acc": 0.7261435165757448 + }, + { + "epoch": 1.1191778238732792, + "grad_norm": 1.738242745399475, + "learning_rate": 1.7120855672468779e-06, + "loss": 0.9998416304588318, + "step": 742, + "token_acc": 0.7309708534283917 + }, + { + "epoch": 1.1206864039223081, + "grad_norm": 1.903008222579956, + "learning_rate": 1.6996466206271679e-06, + "loss": 0.9691488742828369, + "step": 743, + "token_acc": 0.7375711905294835 + }, + { + "epoch": 1.122194983971337, + "grad_norm": 2.0862903594970703, + "learning_rate": 1.6872437661432518e-06, + "loss": 0.9496877789497375, + "step": 744, + "token_acc": 0.7436965735220171 + }, + { + "epoch": 1.1237035640203659, + "grad_norm": 2.220475435256958, + "learning_rate": 1.6748771394307584e-06, + "loss": 0.965717077255249, + "step": 745, + "token_acc": 0.7373008145527085 + }, + { + "epoch": 1.1252121440693947, + "grad_norm": 1.9992204904556274, + "learning_rate": 1.6625468757291379e-06, + "loss": 0.994499683380127, + "step": 746, + "token_acc": 0.7391742195367573 + }, + { + "epoch": 1.1267207241184236, + "grad_norm": 1.9144926071166992, + "learning_rate": 1.6502531098801756e-06, + "loss": 0.98415207862854, + "step": 747, + "token_acc": 0.7328290165823356 + }, + { + "epoch": 1.1282293041674523, + "grad_norm": 1.8519779443740845, + "learning_rate": 1.6379959763265268e-06, + "loss": 0.9692360162734985, + "step": 748, + "token_acc": 0.7345141599119384 + }, + { + "epoch": 1.1297378842164814, + "grad_norm": 1.8473130464553833, + "learning_rate": 1.62577560911024e-06, + "loss": 0.9577260613441467, + "step": 749, + "token_acc": 0.7400583410207185 + }, + { + "epoch": 1.13124646426551, + "grad_norm": 1.5577995777130127, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.983310341835022, + "step": 750, + "token_acc": 0.7324867125555459 + }, + { + "epoch": 1.1327550443145389, + "grad_norm": 1.8383173942565918, + "learning_rate": 1.6014457078461354e-06, + "loss": 0.9243171215057373, + "step": 751, + "token_acc": 0.7434546429487329 + }, + { + "epoch": 1.1342636243635678, + "grad_norm": 2.044954538345337, + "learning_rate": 1.5893364398662175e-06, + "loss": 1.0037078857421875, + "step": 752, + "token_acc": 0.7359555246699097 + }, + { + "epoch": 1.1357722044125966, + "grad_norm": 1.684251308441162, + "learning_rate": 1.5772644703565564e-06, + "loss": 0.9518958330154419, + "step": 753, + "token_acc": 0.7407195324223995 + }, + { + "epoch": 1.1372807844616255, + "grad_norm": 1.752968668937683, + "learning_rate": 1.5652299313342772e-06, + "loss": 0.9631948471069336, + "step": 754, + "token_acc": 0.7411079778040571 + }, + { + "epoch": 1.1387893645106544, + "grad_norm": 1.5955580472946167, + "learning_rate": 1.5532329544071712e-06, + "loss": 0.9723101854324341, + "step": 755, + "token_acc": 0.7362145317369198 + }, + { + "epoch": 1.1402979445596833, + "grad_norm": 2.0654706954956055, + "learning_rate": 1.5412736707722537e-06, + "loss": 0.998774528503418, + "step": 756, + "token_acc": 0.7322742270560919 + }, + { + "epoch": 1.1418065246087121, + "grad_norm": 2.7580862045288086, + "learning_rate": 1.5293522112143371e-06, + "loss": 1.006318211555481, + "step": 757, + "token_acc": 0.7308229066410009 + }, + { + "epoch": 1.143315104657741, + "grad_norm": 2.1798880100250244, + "learning_rate": 1.517468706104589e-06, + "loss": 0.9756070971488953, + "step": 758, + "token_acc": 0.7364563416188655 + }, + { + "epoch": 1.1448236847067697, + "grad_norm": 2.069528818130493, + "learning_rate": 1.505623285399121e-06, + "loss": 0.9813940525054932, + "step": 759, + "token_acc": 0.7386527141922825 + }, + { + "epoch": 1.1463322647557985, + "grad_norm": 2.053407907485962, + "learning_rate": 1.4938160786375571e-06, + "loss": 0.9564691781997681, + "step": 760, + "token_acc": 0.7393674391488121 + }, + { + "epoch": 1.1478408448048274, + "grad_norm": 2.0016956329345703, + "learning_rate": 1.4820472149416153e-06, + "loss": 0.9750658869743347, + "step": 761, + "token_acc": 0.736780368789002 + }, + { + "epoch": 1.1493494248538563, + "grad_norm": 2.9020416736602783, + "learning_rate": 1.4703168230137072e-06, + "loss": 0.9960695505142212, + "step": 762, + "token_acc": 0.733732585796806 + }, + { + "epoch": 1.1508580049028851, + "grad_norm": 2.383183479309082, + "learning_rate": 1.4586250311355132e-06, + "loss": 0.9438557028770447, + "step": 763, + "token_acc": 0.7476213130352045 + }, + { + "epoch": 1.152366584951914, + "grad_norm": 1.8042714595794678, + "learning_rate": 1.4469719671666043e-06, + "loss": 1.0173859596252441, + "step": 764, + "token_acc": 0.726316615650263 + }, + { + "epoch": 1.153875165000943, + "grad_norm": 2.033050775527954, + "learning_rate": 1.4353577585430152e-06, + "loss": 0.9739550352096558, + "step": 765, + "token_acc": 0.7330792752498745 + }, + { + "epoch": 1.1553837450499718, + "grad_norm": 1.9862827062606812, + "learning_rate": 1.4237825322758735e-06, + "loss": 0.9689726829528809, + "step": 766, + "token_acc": 0.7437693229856773 + }, + { + "epoch": 1.1568923250990006, + "grad_norm": 2.030550241470337, + "learning_rate": 1.412246414949997e-06, + "loss": 0.9579986929893494, + "step": 767, + "token_acc": 0.7380606196174925 + }, + { + "epoch": 1.1584009051480295, + "grad_norm": 1.7782319784164429, + "learning_rate": 1.4007495327225162e-06, + "loss": 0.9647389650344849, + "step": 768, + "token_acc": 0.7398619834069938 + }, + { + "epoch": 1.1599094851970584, + "grad_norm": 2.022088050842285, + "learning_rate": 1.389292011321498e-06, + "loss": 0.9913321733474731, + "step": 769, + "token_acc": 0.7312104382412146 + }, + { + "epoch": 1.161418065246087, + "grad_norm": 1.823661208152771, + "learning_rate": 1.3778739760445552e-06, + "loss": 1.0006479024887085, + "step": 770, + "token_acc": 0.7263045793397231 + }, + { + "epoch": 1.162926645295116, + "grad_norm": 1.8158284425735474, + "learning_rate": 1.3664955517574967e-06, + "loss": 0.8759629130363464, + "step": 771, + "token_acc": 0.7550529698845375 + }, + { + "epoch": 1.1644352253441448, + "grad_norm": 2.013143301010132, + "learning_rate": 1.3551568628929434e-06, + "loss": 0.968482494354248, + "step": 772, + "token_acc": 0.7335769144731936 + }, + { + "epoch": 1.1659438053931737, + "grad_norm": 1.887747883796692, + "learning_rate": 1.343858033448982e-06, + "loss": 1.0241531133651733, + "step": 773, + "token_acc": 0.7263975586072965 + }, + { + "epoch": 1.1674523854422025, + "grad_norm": 1.7216143608093262, + "learning_rate": 1.3325991869878013e-06, + "loss": 0.958422064781189, + "step": 774, + "token_acc": 0.743002087588947 + }, + { + "epoch": 1.1689609654912314, + "grad_norm": 1.915213942527771, + "learning_rate": 1.321380446634342e-06, + "loss": 0.9400535821914673, + "step": 775, + "token_acc": 0.7427055702917772 + }, + { + "epoch": 1.1704695455402603, + "grad_norm": 1.6879397630691528, + "learning_rate": 1.3102019350749528e-06, + "loss": 0.9499019384384155, + "step": 776, + "token_acc": 0.7454979388153613 + }, + { + "epoch": 1.1719781255892892, + "grad_norm": 1.6247007846832275, + "learning_rate": 1.2990637745560418e-06, + "loss": 0.9707174897193909, + "step": 777, + "token_acc": 0.7391779353562006 + }, + { + "epoch": 1.173486705638318, + "grad_norm": 2.126030683517456, + "learning_rate": 1.2879660868827508e-06, + "loss": 1.031003713607788, + "step": 778, + "token_acc": 0.7233654463712267 + }, + { + "epoch": 1.1749952856873467, + "grad_norm": 2.024141550064087, + "learning_rate": 1.2769089934176126e-06, + "loss": 0.942808210849762, + "step": 779, + "token_acc": 0.7437908496732026 + }, + { + "epoch": 1.1765038657363756, + "grad_norm": 2.28425669670105, + "learning_rate": 1.2658926150792321e-06, + "loss": 0.9922306537628174, + "step": 780, + "token_acc": 0.7369622011309898 + }, + { + "epoch": 1.1780124457854044, + "grad_norm": 2.287721633911133, + "learning_rate": 1.2549170723409548e-06, + "loss": 0.9475542306900024, + "step": 781, + "token_acc": 0.7436015892081678 + }, + { + "epoch": 1.1795210258344333, + "grad_norm": 1.807377576828003, + "learning_rate": 1.243982485229559e-06, + "loss": 0.9856640100479126, + "step": 782, + "token_acc": 0.7324687800192123 + }, + { + "epoch": 1.1810296058834622, + "grad_norm": 1.6336188316345215, + "learning_rate": 1.233088973323937e-06, + "loss": 0.97825026512146, + "step": 783, + "token_acc": 0.7351865262845153 + }, + { + "epoch": 1.182538185932491, + "grad_norm": 1.75718092918396, + "learning_rate": 1.2222366557537911e-06, + "loss": 0.968316376209259, + "step": 784, + "token_acc": 0.733608739557514 + }, + { + "epoch": 1.18404676598152, + "grad_norm": 2.353073835372925, + "learning_rate": 1.2114256511983274e-06, + "loss": 0.9801476001739502, + "step": 785, + "token_acc": 0.735658042744657 + }, + { + "epoch": 1.1855553460305488, + "grad_norm": 1.5095616579055786, + "learning_rate": 1.200656077884958e-06, + "loss": 0.9668078422546387, + "step": 786, + "token_acc": 0.7296519366431519 + }, + { + "epoch": 1.1870639260795777, + "grad_norm": 1.920823574066162, + "learning_rate": 1.189928053588012e-06, + "loss": 0.9017779231071472, + "step": 787, + "token_acc": 0.7538431709276238 + }, + { + "epoch": 1.1885725061286065, + "grad_norm": 1.9143998622894287, + "learning_rate": 1.1792416956274443e-06, + "loss": 0.9851276874542236, + "step": 788, + "token_acc": 0.7344009330283049 + }, + { + "epoch": 1.1900810861776354, + "grad_norm": 1.7909094095230103, + "learning_rate": 1.1685971208675539e-06, + "loss": 0.9635279178619385, + "step": 789, + "token_acc": 0.7413360693114455 + }, + { + "epoch": 1.191589666226664, + "grad_norm": 2.632859945297241, + "learning_rate": 1.157994445715706e-06, + "loss": 0.994879424571991, + "step": 790, + "token_acc": 0.7324759025836993 + }, + { + "epoch": 1.193098246275693, + "grad_norm": 1.7419313192367554, + "learning_rate": 1.1474337861210543e-06, + "loss": 0.9468947052955627, + "step": 791, + "token_acc": 0.7485660611147751 + }, + { + "epoch": 1.1946068263247218, + "grad_norm": 1.8631842136383057, + "learning_rate": 1.1369152575732823e-06, + "loss": 0.9924584031105042, + "step": 792, + "token_acc": 0.7396952054000139 + }, + { + "epoch": 1.1961154063737507, + "grad_norm": 1.7275878190994263, + "learning_rate": 1.1264389751013326e-06, + "loss": 0.9233929514884949, + "step": 793, + "token_acc": 0.7427759940726105 + }, + { + "epoch": 1.1976239864227796, + "grad_norm": 1.771911859512329, + "learning_rate": 1.1160050532721527e-06, + "loss": 0.9944145679473877, + "step": 794, + "token_acc": 0.7297540426092108 + }, + { + "epoch": 1.1991325664718084, + "grad_norm": 2.3955419063568115, + "learning_rate": 1.1056136061894386e-06, + "loss": 0.9966319799423218, + "step": 795, + "token_acc": 0.7375780613630193 + }, + { + "epoch": 1.2006411465208373, + "grad_norm": 2.8586337566375732, + "learning_rate": 1.095264747492391e-06, + "loss": 0.9164185523986816, + "step": 796, + "token_acc": 0.7515670168007098 + }, + { + "epoch": 1.2021497265698662, + "grad_norm": 1.7866860628128052, + "learning_rate": 1.0849585903544707e-06, + "loss": 0.9441959261894226, + "step": 797, + "token_acc": 0.7392558716047272 + }, + { + "epoch": 1.203658306618895, + "grad_norm": 1.8811817169189453, + "learning_rate": 1.0746952474821615e-06, + "loss": 0.9866641163825989, + "step": 798, + "token_acc": 0.7305692484300088 + }, + { + "epoch": 1.2051668866679237, + "grad_norm": 1.96930730342865, + "learning_rate": 1.0644748311137377e-06, + "loss": 0.9931862354278564, + "step": 799, + "token_acc": 0.7355446276476071 + }, + { + "epoch": 1.2066754667169526, + "grad_norm": 1.8745235204696655, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.9447171092033386, + "step": 800, + "token_acc": 0.7444985260848701 + }, + { + "epoch": 1.2081840467659815, + "grad_norm": 3.5514159202575684, + "learning_rate": 1.0441632244932238e-06, + "loss": 0.9942536950111389, + "step": 801, + "token_acc": 0.7281512400318332 + }, + { + "epoch": 1.2096926268150103, + "grad_norm": 2.551311492919922, + "learning_rate": 1.0340722563656109e-06, + "loss": 1.0092523097991943, + "step": 802, + "token_acc": 0.7320104062880549 + }, + { + "epoch": 1.2112012068640392, + "grad_norm": 2.0251779556274414, + "learning_rate": 1.0240246589884046e-06, + "loss": 0.979934573173523, + "step": 803, + "token_acc": 0.7347451944568618 + }, + { + "epoch": 1.212709786913068, + "grad_norm": 2.689196825027466, + "learning_rate": 1.0140205422405213e-06, + "loss": 0.9517975449562073, + "step": 804, + "token_acc": 0.7409844783461296 + }, + { + "epoch": 1.214218366962097, + "grad_norm": 2.20412015914917, + "learning_rate": 1.0040600155253766e-06, + "loss": 1.005642056465149, + "step": 805, + "token_acc": 0.7275903115619501 + }, + { + "epoch": 1.2157269470111258, + "grad_norm": 2.760805368423462, + "learning_rate": 9.941431877696955e-07, + "loss": 0.8914682865142822, + "step": 806, + "token_acc": 0.763964604566643 + }, + { + "epoch": 1.2172355270601547, + "grad_norm": 2.0134992599487305, + "learning_rate": 9.842701674223187e-07, + "loss": 0.9227648973464966, + "step": 807, + "token_acc": 0.7476672067944348 + }, + { + "epoch": 1.2187441071091836, + "grad_norm": 1.7903823852539062, + "learning_rate": 9.744410624530148e-07, + "loss": 1.0278069972991943, + "step": 808, + "token_acc": 0.7226474460536005 + }, + { + "epoch": 1.2202526871582124, + "grad_norm": 2.7869725227355957, + "learning_rate": 9.646559803512995e-07, + "loss": 0.9191211462020874, + "step": 809, + "token_acc": 0.745027382479285 + }, + { + "epoch": 1.221761267207241, + "grad_norm": 1.5524405241012573, + "learning_rate": 9.549150281252633e-07, + "loss": 0.8644911050796509, + "step": 810, + "token_acc": 0.7641425295166907 + }, + { + "epoch": 1.22326984725627, + "grad_norm": 2.006319761276245, + "learning_rate": 9.452183123003999e-07, + "loss": 0.951104462146759, + "step": 811, + "token_acc": 0.7464535196131112 + }, + { + "epoch": 1.2247784273052988, + "grad_norm": 2.244492292404175, + "learning_rate": 9.355659389184396e-07, + "loss": 0.9697079062461853, + "step": 812, + "token_acc": 0.7337425495952318 + }, + { + "epoch": 1.2262870073543277, + "grad_norm": 1.8747849464416504, + "learning_rate": 9.259580135361929e-07, + "loss": 1.0017671585083008, + "step": 813, + "token_acc": 0.7233523002142412 + }, + { + "epoch": 1.2277955874033566, + "grad_norm": 2.170590877532959, + "learning_rate": 9.163946412243896e-07, + "loss": 0.9223774671554565, + "step": 814, + "token_acc": 0.7477591613087131 + }, + { + "epoch": 1.2293041674523855, + "grad_norm": 2.7934672832489014, + "learning_rate": 9.068759265665384e-07, + "loss": 0.9916120767593384, + "step": 815, + "token_acc": 0.7322255311694487 + }, + { + "epoch": 1.2308127475014143, + "grad_norm": 3.6863653659820557, + "learning_rate": 8.974019736577777e-07, + "loss": 0.8878031969070435, + "step": 816, + "token_acc": 0.7574589536868123 + }, + { + "epoch": 1.2323213275504432, + "grad_norm": 2.0693209171295166, + "learning_rate": 8.879728861037385e-07, + "loss": 1.0020530223846436, + "step": 817, + "token_acc": 0.7299389203491679 + }, + { + "epoch": 1.233829907599472, + "grad_norm": 2.223858118057251, + "learning_rate": 8.785887670194137e-07, + "loss": 0.9727979302406311, + "step": 818, + "token_acc": 0.7390627086207664 + }, + { + "epoch": 1.2353384876485007, + "grad_norm": 2.1986441612243652, + "learning_rate": 8.692497190280225e-07, + "loss": 0.9895301461219788, + "step": 819, + "token_acc": 0.7321825615033352 + }, + { + "epoch": 1.2368470676975296, + "grad_norm": 1.595699429512024, + "learning_rate": 8.599558442598998e-07, + "loss": 0.9951426982879639, + "step": 820, + "token_acc": 0.7238150587627534 + }, + { + "epoch": 1.2383556477465585, + "grad_norm": 1.9911519289016724, + "learning_rate": 8.507072443513703e-07, + "loss": 0.9374018907546997, + "step": 821, + "token_acc": 0.7427404163710644 + }, + { + "epoch": 1.2398642277955874, + "grad_norm": 1.9524264335632324, + "learning_rate": 8.415040204436426e-07, + "loss": 0.9438478946685791, + "step": 822, + "token_acc": 0.7425453648712521 + }, + { + "epoch": 1.2413728078446162, + "grad_norm": 1.6910455226898193, + "learning_rate": 8.323462731816962e-07, + "loss": 0.9962350726127625, + "step": 823, + "token_acc": 0.7296304023366661 + }, + { + "epoch": 1.242881387893645, + "grad_norm": 2.013437032699585, + "learning_rate": 8.232341027131885e-07, + "loss": 0.9993690848350525, + "step": 824, + "token_acc": 0.7307738625689444 + }, + { + "epoch": 1.244389967942674, + "grad_norm": 1.974143385887146, + "learning_rate": 8.141676086873574e-07, + "loss": 0.9950785636901855, + "step": 825, + "token_acc": 0.7349562832161838 + }, + { + "epoch": 1.2458985479917029, + "grad_norm": 1.7452044486999512, + "learning_rate": 8.051468902539272e-07, + "loss": 0.9755558967590332, + "step": 826, + "token_acc": 0.7350938535149062 + }, + { + "epoch": 1.2474071280407317, + "grad_norm": 1.9603310823440552, + "learning_rate": 7.961720460620321e-07, + "loss": 0.9722100496292114, + "step": 827, + "token_acc": 0.7371144744369896 + }, + { + "epoch": 1.2489157080897606, + "grad_norm": 1.5346273183822632, + "learning_rate": 7.872431742591268e-07, + "loss": 1.008032202720642, + "step": 828, + "token_acc": 0.7269352177344143 + }, + { + "epoch": 1.2504242881387895, + "grad_norm": 3.3117144107818604, + "learning_rate": 7.783603724899258e-07, + "loss": 1.00252366065979, + "step": 829, + "token_acc": 0.7310746899074928 + }, + { + "epoch": 1.2519328681878181, + "grad_norm": 2.5289394855499268, + "learning_rate": 7.695237378953224e-07, + "loss": 0.9258033037185669, + "step": 830, + "token_acc": 0.7515389678268612 + }, + { + "epoch": 1.253441448236847, + "grad_norm": 1.8152717351913452, + "learning_rate": 7.607333671113409e-07, + "loss": 0.9263573288917542, + "step": 831, + "token_acc": 0.7492149313235575 + }, + { + "epoch": 1.2549500282858759, + "grad_norm": 1.837641954421997, + "learning_rate": 7.519893562680663e-07, + "loss": 0.9581955671310425, + "step": 832, + "token_acc": 0.7416528925619835 + }, + { + "epoch": 1.2564586083349047, + "grad_norm": 1.9911967515945435, + "learning_rate": 7.432918009885997e-07, + "loss": 0.996119499206543, + "step": 833, + "token_acc": 0.7278605874616396 + }, + { + "epoch": 1.2579671883839336, + "grad_norm": 1.987295389175415, + "learning_rate": 7.346407963880137e-07, + "loss": 0.9772776365280151, + "step": 834, + "token_acc": 0.7356116376978621 + }, + { + "epoch": 1.2594757684329625, + "grad_norm": 2.3384385108947754, + "learning_rate": 7.260364370723044e-07, + "loss": 0.9837565422058105, + "step": 835, + "token_acc": 0.7339554875176637 + }, + { + "epoch": 1.2609843484819914, + "grad_norm": 2.050389528274536, + "learning_rate": 7.174788171373731e-07, + "loss": 0.9623764157295227, + "step": 836, + "token_acc": 0.7384060302431401 + }, + { + "epoch": 1.2624929285310202, + "grad_norm": 1.8911354541778564, + "learning_rate": 7.089680301679752e-07, + "loss": 0.9408401846885681, + "step": 837, + "token_acc": 0.74871990412899 + }, + { + "epoch": 1.2640015085800491, + "grad_norm": 2.191833019256592, + "learning_rate": 7.005041692367154e-07, + "loss": 0.909685492515564, + "step": 838, + "token_acc": 0.757968154415939 + }, + { + "epoch": 1.2655100886290778, + "grad_norm": 2.004509687423706, + "learning_rate": 6.92087326903022e-07, + "loss": 0.9448233842849731, + "step": 839, + "token_acc": 0.7407430612117035 + }, + { + "epoch": 1.2670186686781069, + "grad_norm": 1.4623405933380127, + "learning_rate": 6.837175952121305e-07, + "loss": 0.9916695952415466, + "step": 840, + "token_acc": 0.7305493574813922 + }, + { + "epoch": 1.2685272487271355, + "grad_norm": 2.4984498023986816, + "learning_rate": 6.753950656940905e-07, + "loss": 1.0102533102035522, + "step": 841, + "token_acc": 0.7337656076748783 + }, + { + "epoch": 1.2700358287761644, + "grad_norm": 2.5764572620391846, + "learning_rate": 6.671198293627479e-07, + "loss": 0.9360049366950989, + "step": 842, + "token_acc": 0.7483646770237122 + }, + { + "epoch": 1.2715444088251933, + "grad_norm": 1.8887144327163696, + "learning_rate": 6.58891976714764e-07, + "loss": 0.9787989854812622, + "step": 843, + "token_acc": 0.7390813503043719 + }, + { + "epoch": 1.2730529888742221, + "grad_norm": 2.1674139499664307, + "learning_rate": 6.507115977286144e-07, + "loss": 1.0091636180877686, + "step": 844, + "token_acc": 0.7283969974325304 + }, + { + "epoch": 1.274561568923251, + "grad_norm": 2.017744541168213, + "learning_rate": 6.425787818636131e-07, + "loss": 0.9506663084030151, + "step": 845, + "token_acc": 0.7451289168846421 + }, + { + "epoch": 1.2760701489722799, + "grad_norm": 1.8274377584457397, + "learning_rate": 6.34493618058935e-07, + "loss": 1.0213708877563477, + "step": 846, + "token_acc": 0.7187904734494172 + }, + { + "epoch": 1.2775787290213088, + "grad_norm": 1.8065828084945679, + "learning_rate": 6.264561947326331e-07, + "loss": 0.9685531258583069, + "step": 847, + "token_acc": 0.7389944392956441 + }, + { + "epoch": 1.2790873090703374, + "grad_norm": 2.3132944107055664, + "learning_rate": 6.184665997806832e-07, + "loss": 0.9232146739959717, + "step": 848, + "token_acc": 0.7466055916490255 + }, + { + "epoch": 1.2805958891193665, + "grad_norm": 2.3938615322113037, + "learning_rate": 6.105249205760128e-07, + "loss": 0.9624519348144531, + "step": 849, + "token_acc": 0.739568546228852 + }, + { + "epoch": 1.2821044691683952, + "grad_norm": 1.9711883068084717, + "learning_rate": 6.026312439675553e-07, + "loss": 0.9819716811180115, + "step": 850, + "token_acc": 0.7345152653722411 + }, + { + "epoch": 1.283613049217424, + "grad_norm": 1.9473552703857422, + "learning_rate": 5.947856562792926e-07, + "loss": 1.0068871974945068, + "step": 851, + "token_acc": 0.7287461993344346 + }, + { + "epoch": 1.285121629266453, + "grad_norm": 2.020416259765625, + "learning_rate": 5.869882433093154e-07, + "loss": 0.9687259197235107, + "step": 852, + "token_acc": 0.7402947139096493 + }, + { + "epoch": 1.2866302093154818, + "grad_norm": 1.8370989561080933, + "learning_rate": 5.79239090328883e-07, + "loss": 0.950388491153717, + "step": 853, + "token_acc": 0.7422264778511314 + }, + { + "epoch": 1.2881387893645107, + "grad_norm": 1.9507230520248413, + "learning_rate": 5.715382820814885e-07, + "loss": 0.8991225957870483, + "step": 854, + "token_acc": 0.7543374110423741 + }, + { + "epoch": 1.2896473694135395, + "grad_norm": 1.6064006090164185, + "learning_rate": 5.63885902781941e-07, + "loss": 0.9454917311668396, + "step": 855, + "token_acc": 0.741236153221172 + }, + { + "epoch": 1.2911559494625684, + "grad_norm": 1.7535923719406128, + "learning_rate": 5.562820361154315e-07, + "loss": 0.9715473055839539, + "step": 856, + "token_acc": 0.7344603153802473 + }, + { + "epoch": 1.2926645295115973, + "grad_norm": 2.1352646350860596, + "learning_rate": 5.487267652366291e-07, + "loss": 0.9560214281082153, + "step": 857, + "token_acc": 0.7406910156999206 + }, + { + "epoch": 1.2941731095606261, + "grad_norm": 1.6978834867477417, + "learning_rate": 5.412201727687644e-07, + "loss": 0.9208300709724426, + "step": 858, + "token_acc": 0.7441850718746071 + }, + { + "epoch": 1.2956816896096548, + "grad_norm": 1.9034186601638794, + "learning_rate": 5.337623408027293e-07, + "loss": 0.9707709550857544, + "step": 859, + "token_acc": 0.7362092087617345 + }, + { + "epoch": 1.297190269658684, + "grad_norm": 1.8723925352096558, + "learning_rate": 5.263533508961827e-07, + "loss": 0.9726244211196899, + "step": 860, + "token_acc": 0.7358395463614408 + }, + { + "epoch": 1.2986988497077125, + "grad_norm": 2.2860147953033447, + "learning_rate": 5.189932840726486e-07, + "loss": 0.9839601516723633, + "step": 861, + "token_acc": 0.735234677885012 + }, + { + "epoch": 1.3002074297567414, + "grad_norm": 2.1331417560577393, + "learning_rate": 5.116822208206396e-07, + "loss": 0.9471396803855896, + "step": 862, + "token_acc": 0.7416779361281598 + }, + { + "epoch": 1.3017160098057703, + "grad_norm": 1.6943060159683228, + "learning_rate": 5.044202410927707e-07, + "loss": 0.9858335256576538, + "step": 863, + "token_acc": 0.7301160048593411 + }, + { + "epoch": 1.3032245898547992, + "grad_norm": 2.3178012371063232, + "learning_rate": 4.972074243048896e-07, + "loss": 1.020275592803955, + "step": 864, + "token_acc": 0.7283236994219653 + }, + { + "epoch": 1.304733169903828, + "grad_norm": 1.5733225345611572, + "learning_rate": 4.900438493352056e-07, + "loss": 0.9658796191215515, + "step": 865, + "token_acc": 0.72979924137293 + }, + { + "epoch": 1.306241749952857, + "grad_norm": 1.5555676221847534, + "learning_rate": 4.829295945234258e-07, + "loss": 0.9544912576675415, + "step": 866, + "token_acc": 0.7378570290227188 + }, + { + "epoch": 1.3077503300018858, + "grad_norm": 2.555567502975464, + "learning_rate": 4.758647376699033e-07, + "loss": 1.0028256177902222, + "step": 867, + "token_acc": 0.7282685201478093 + }, + { + "epoch": 1.3092589100509147, + "grad_norm": 1.9084442853927612, + "learning_rate": 4.6884935603477733e-07, + "loss": 1.0015586614608765, + "step": 868, + "token_acc": 0.7285907751135187 + }, + { + "epoch": 1.3107674900999435, + "grad_norm": 1.8713854551315308, + "learning_rate": 4.6188352633713964e-07, + "loss": 0.9118603467941284, + "step": 869, + "token_acc": 0.7471701933785563 + }, + { + "epoch": 1.3122760701489722, + "grad_norm": 1.9853101968765259, + "learning_rate": 4.549673247541875e-07, + "loss": 0.899925708770752, + "step": 870, + "token_acc": 0.7539948865452221 + }, + { + "epoch": 1.313784650198001, + "grad_norm": 1.8795055150985718, + "learning_rate": 4.48100826920394e-07, + "loss": 0.9778237342834473, + "step": 871, + "token_acc": 0.7323549156230468 + }, + { + "epoch": 1.31529323024703, + "grad_norm": 1.8937017917633057, + "learning_rate": 4.412841079266778e-07, + "loss": 0.9401206374168396, + "step": 872, + "token_acc": 0.7479521533248054 + }, + { + "epoch": 1.3168018102960588, + "grad_norm": 1.5193790197372437, + "learning_rate": 4.345172423195865e-07, + "loss": 0.9711022973060608, + "step": 873, + "token_acc": 0.7359068059136693 + }, + { + "epoch": 1.3183103903450877, + "grad_norm": 1.658700942993164, + "learning_rate": 4.27800304100478e-07, + "loss": 1.0044441223144531, + "step": 874, + "token_acc": 0.7282785422532418 + }, + { + "epoch": 1.3198189703941166, + "grad_norm": 2.381953239440918, + "learning_rate": 4.211333667247125e-07, + "loss": 0.9123528003692627, + "step": 875, + "token_acc": 0.7477600126642393 + }, + { + "epoch": 1.3213275504431454, + "grad_norm": 1.7543773651123047, + "learning_rate": 4.1451650310085076e-07, + "loss": 1.0061793327331543, + "step": 876, + "token_acc": 0.7254200754421202 + }, + { + "epoch": 1.3228361304921743, + "grad_norm": 1.7452945709228516, + "learning_rate": 4.079497855898501e-07, + "loss": 1.0251508951187134, + "step": 877, + "token_acc": 0.7252198281989318 + }, + { + "epoch": 1.3243447105412032, + "grad_norm": 1.8602429628372192, + "learning_rate": 4.01433286004283e-07, + "loss": 1.0073888301849365, + "step": 878, + "token_acc": 0.7317527279583323 + }, + { + "epoch": 1.3258532905902318, + "grad_norm": 1.6515202522277832, + "learning_rate": 3.949670756075447e-07, + "loss": 0.9759517312049866, + "step": 879, + "token_acc": 0.7363058258654498 + }, + { + "epoch": 1.327361870639261, + "grad_norm": 2.259262800216675, + "learning_rate": 3.885512251130763e-07, + "loss": 0.9415392875671387, + "step": 880, + "token_acc": 0.7367594486805943 + }, + { + "epoch": 1.3288704506882896, + "grad_norm": 2.341904640197754, + "learning_rate": 3.8218580468359136e-07, + "loss": 0.9629039168357849, + "step": 881, + "token_acc": 0.7386207193254173 + }, + { + "epoch": 1.3303790307373184, + "grad_norm": 1.849068284034729, + "learning_rate": 3.7587088393030604e-07, + "loss": 0.9572469592094421, + "step": 882, + "token_acc": 0.7419253190483693 + }, + { + "epoch": 1.3318876107863473, + "grad_norm": 2.30039644241333, + "learning_rate": 3.6960653191218333e-07, + "loss": 0.9797049164772034, + "step": 883, + "token_acc": 0.7375852862652033 + }, + { + "epoch": 1.3333961908353762, + "grad_norm": 1.9996850490570068, + "learning_rate": 3.6339281713517304e-07, + "loss": 0.9314875602722168, + "step": 884, + "token_acc": 0.7470868295762422 + }, + { + "epoch": 1.334904770884405, + "grad_norm": 2.2913050651550293, + "learning_rate": 3.572298075514652e-07, + "loss": 0.8879678249359131, + "step": 885, + "token_acc": 0.7577568941611462 + }, + { + "epoch": 1.336413350933434, + "grad_norm": 2.2234644889831543, + "learning_rate": 3.511175705587433e-07, + "loss": 0.9131568670272827, + "step": 886, + "token_acc": 0.7519109238639783 + }, + { + "epoch": 1.3379219309824628, + "grad_norm": 1.7255728244781494, + "learning_rate": 3.450561729994534e-07, + "loss": 0.9348055124282837, + "step": 887, + "token_acc": 0.7436455320347096 + }, + { + "epoch": 1.3394305110314917, + "grad_norm": 2.313727855682373, + "learning_rate": 3.390456811600673e-07, + "loss": 0.9853873252868652, + "step": 888, + "token_acc": 0.7370166685565257 + }, + { + "epoch": 1.3409390910805206, + "grad_norm": 1.8186455965042114, + "learning_rate": 3.3308616077036113e-07, + "loss": 0.9497115015983582, + "step": 889, + "token_acc": 0.7430063241284631 + }, + { + "epoch": 1.3424476711295492, + "grad_norm": 2.061354398727417, + "learning_rate": 3.271776770026963e-07, + "loss": 0.9446015357971191, + "step": 890, + "token_acc": 0.7481424191365357 + }, + { + "epoch": 1.343956251178578, + "grad_norm": 2.014603853225708, + "learning_rate": 3.213202944713023e-07, + "loss": 0.9980448484420776, + "step": 891, + "token_acc": 0.7331215569584624 + }, + { + "epoch": 1.345464831227607, + "grad_norm": 2.049685478210449, + "learning_rate": 3.1551407723157734e-07, + "loss": 0.9866378307342529, + "step": 892, + "token_acc": 0.733074530530156 + }, + { + "epoch": 1.3469734112766358, + "grad_norm": 2.0940017700195312, + "learning_rate": 3.0975908877938277e-07, + "loss": 0.971149206161499, + "step": 893, + "token_acc": 0.7445070568164194 + }, + { + "epoch": 1.3484819913256647, + "grad_norm": 1.763009786605835, + "learning_rate": 3.040553920503503e-07, + "loss": 0.882137656211853, + "step": 894, + "token_acc": 0.7579319764447728 + }, + { + "epoch": 1.3499905713746936, + "grad_norm": 1.577987551689148, + "learning_rate": 2.984030494191942e-07, + "loss": 1.002955436706543, + "step": 895, + "token_acc": 0.7305987515236583 + }, + { + "epoch": 1.3514991514237225, + "grad_norm": 4.48616361618042, + "learning_rate": 2.928021226990263e-07, + "loss": 0.926548957824707, + "step": 896, + "token_acc": 0.7500458575359218 + }, + { + "epoch": 1.3530077314727513, + "grad_norm": 1.9470696449279785, + "learning_rate": 2.8725267314068496e-07, + "loss": 0.9650593996047974, + "step": 897, + "token_acc": 0.744224458783888 + }, + { + "epoch": 1.3545163115217802, + "grad_norm": 1.6612807512283325, + "learning_rate": 2.817547614320615e-07, + "loss": 0.9435169696807861, + "step": 898, + "token_acc": 0.7368239188156985 + }, + { + "epoch": 1.3560248915708089, + "grad_norm": 1.843275785446167, + "learning_rate": 2.763084476974376e-07, + "loss": 0.9824391603469849, + "step": 899, + "token_acc": 0.7366006696369723 + }, + { + "epoch": 1.357533471619838, + "grad_norm": 2.1927034854888916, + "learning_rate": 2.7091379149682683e-07, + "loss": 1.000361680984497, + "step": 900, + "token_acc": 0.7267433252726645 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.077624227797087e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}