diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,6321 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8648734680884926, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.0, + "learning_rate": 0, + "loss": 7.7169, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 10.55540657043457, + "learning_rate": 9.997877083112197e-05, + "loss": 9.0438, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 6.060225009918213, + "learning_rate": 9.987262498673178e-05, + "loss": 3.211, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.5255496501922607, + "learning_rate": 9.976647914234159e-05, + "loss": 0.6387, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 2.976543664932251, + "learning_rate": 9.966033329795139e-05, + "loss": 0.5633, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 2.2680673599243164, + "learning_rate": 9.95541874535612e-05, + "loss": 0.474, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 3.136930465698242, + "learning_rate": 9.944804160917101e-05, + "loss": 0.3379, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 4.159604072570801, + "learning_rate": 9.935251034921983e-05, + "loss": 0.4444, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 1.704042911529541, + "learning_rate": 9.924636450482963e-05, + "loss": 0.4925, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 3.9414522647857666, + "learning_rate": 9.914021866043945e-05, + "loss": 0.4583, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 2.938662052154541, + "learning_rate": 9.903407281604927e-05, + "loss": 0.3838, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 1.8753790855407715, + "learning_rate": 9.892792697165907e-05, + "loss": 0.3247, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 1.75948965549469, + "learning_rate": 9.882178112726887e-05, + "loss": 0.3609, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 1.9066141843795776, + "learning_rate": 9.871563528287868e-05, + "loss": 0.3453, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.7767695188522339, + "learning_rate": 9.86094894384885e-05, + "loss": 0.5076, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 2.5219664573669434, + "learning_rate": 9.85033435940983e-05, + "loss": 0.4999, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 2.0505383014678955, + "learning_rate": 9.83971977497081e-05, + "loss": 0.5429, + "step": 170 + }, + { + "epoch": 0.06, + "grad_norm": 6.132015705108643, + "learning_rate": 9.82910519053179e-05, + "loss": 0.5099, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.057868480682373, + "learning_rate": 9.818490606092772e-05, + "loss": 0.4416, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 2.6155290603637695, + "learning_rate": 9.807876021653753e-05, + "loss": 0.3986, + "step": 200 + }, + { + "epoch": 0.07, + "grad_norm": 2.1468820571899414, + "learning_rate": 9.797261437214733e-05, + "loss": 0.3216, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 0.6600925326347351, + "learning_rate": 9.786646852775713e-05, + "loss": 0.3552, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 5.129382133483887, + "learning_rate": 9.776032268336695e-05, + "loss": 0.3221, + "step": 230 + }, + { + "epoch": 0.08, + "grad_norm": 0.3891478180885315, + "learning_rate": 9.765417683897677e-05, + "loss": 0.4073, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 3.254958391189575, + "learning_rate": 9.754803099458657e-05, + "loss": 0.4212, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 3.34332013130188, + "learning_rate": 9.744188515019638e-05, + "loss": 0.2167, + "step": 260 + }, + { + "epoch": 0.09, + "grad_norm": 3.801086902618408, + "learning_rate": 9.733573930580618e-05, + "loss": 0.5605, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 5.026745796203613, + "learning_rate": 9.7229593461416e-05, + "loss": 0.3527, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 3.8389620780944824, + "learning_rate": 9.71234476170258e-05, + "loss": 0.295, + "step": 290 + }, + { + "epoch": 0.1, + "grad_norm": 2.0584566593170166, + "learning_rate": 9.70173017726356e-05, + "loss": 0.2759, + "step": 300 + }, + { + "epoch": 0.1, + "grad_norm": 3.132164239883423, + "learning_rate": 9.691115592824541e-05, + "loss": 0.3888, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.5387492179870605, + "learning_rate": 9.680501008385522e-05, + "loss": 0.2285, + "step": 320 + }, + { + "epoch": 0.11, + "grad_norm": 3.0382373332977295, + "learning_rate": 9.669886423946503e-05, + "loss": 0.2549, + "step": 330 + }, + { + "epoch": 0.11, + "grad_norm": 6.465576648712158, + "learning_rate": 9.659271839507483e-05, + "loss": 0.7377, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 4.1156134605407715, + "learning_rate": 9.648657255068465e-05, + "loss": 0.3387, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 4.147655963897705, + "learning_rate": 9.638042670629445e-05, + "loss": 0.2605, + "step": 360 + }, + { + "epoch": 0.12, + "grad_norm": 1.4572869539260864, + "learning_rate": 9.627428086190427e-05, + "loss": 0.3024, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.906175971031189, + "learning_rate": 9.616813501751407e-05, + "loss": 0.3728, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.169878363609314, + "learning_rate": 9.606198917312388e-05, + "loss": 0.3961, + "step": 390 + }, + { + "epoch": 0.13, + "grad_norm": 1.2084730863571167, + "learning_rate": 9.595584332873368e-05, + "loss": 0.4887, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 0.7927988171577454, + "learning_rate": 9.58496974843435e-05, + "loss": 0.4519, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 6.37067985534668, + "learning_rate": 9.57435516399533e-05, + "loss": 0.237, + "step": 420 + }, + { + "epoch": 0.14, + "grad_norm": 2.9806203842163086, + "learning_rate": 9.56374057955631e-05, + "loss": 0.2917, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 5.05634880065918, + "learning_rate": 9.553125995117291e-05, + "loss": 0.2794, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 3.0483241081237793, + "learning_rate": 9.542511410678273e-05, + "loss": 0.3182, + "step": 450 + }, + { + "epoch": 0.15, + "grad_norm": 3.2123796939849854, + "learning_rate": 9.531896826239253e-05, + "loss": 0.2872, + "step": 460 + }, + { + "epoch": 0.15, + "grad_norm": 1.532020092010498, + "learning_rate": 9.521282241800233e-05, + "loss": 0.3258, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.1242539882659912, + "learning_rate": 9.510667657361215e-05, + "loss": 0.3356, + "step": 480 + }, + { + "epoch": 0.16, + "grad_norm": 4.846567153930664, + "learning_rate": 9.500053072922196e-05, + "loss": 0.3551, + "step": 490 + }, + { + "epoch": 0.16, + "grad_norm": 3.233238458633423, + "learning_rate": 9.489438488483177e-05, + "loss": 0.3971, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.7334824800491333, + "learning_rate": 9.478823904044158e-05, + "loss": 0.1896, + "step": 510 + }, + { + "epoch": 0.17, + "grad_norm": 7.36009407043457, + "learning_rate": 9.468209319605138e-05, + "loss": 0.3338, + "step": 520 + }, + { + "epoch": 0.17, + "grad_norm": 2.7838549613952637, + "learning_rate": 9.457594735166118e-05, + "loss": 0.3331, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 2.643627405166626, + "learning_rate": 9.4469801507271e-05, + "loss": 0.4575, + "step": 540 + }, + { + "epoch": 0.18, + "grad_norm": 5.420917510986328, + "learning_rate": 9.43636556628808e-05, + "loss": 0.37, + "step": 550 + }, + { + "epoch": 0.18, + "grad_norm": 2.1689090728759766, + "learning_rate": 9.425750981849061e-05, + "loss": 0.3551, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 0.7210526466369629, + "learning_rate": 9.415136397410041e-05, + "loss": 0.4028, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 0.3214457929134369, + "learning_rate": 9.404521812971022e-05, + "loss": 0.2391, + "step": 580 + }, + { + "epoch": 0.19, + "grad_norm": 3.8258142471313477, + "learning_rate": 9.393907228532003e-05, + "loss": 0.3399, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.5249234437942505, + "learning_rate": 9.383292644092985e-05, + "loss": 0.4449, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 0.22292350232601166, + "learning_rate": 9.372678059653965e-05, + "loss": 0.2156, + "step": 610 + }, + { + "epoch": 0.2, + "grad_norm": 1.3040258884429932, + "learning_rate": 9.362063475214946e-05, + "loss": 0.4175, + "step": 620 + }, + { + "epoch": 0.2, + "grad_norm": 1.3762481212615967, + "learning_rate": 9.351448890775926e-05, + "loss": 0.3191, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 2.706467866897583, + "learning_rate": 9.340834306336908e-05, + "loss": 0.5163, + "step": 640 + }, + { + "epoch": 0.21, + "grad_norm": 1.8577134609222412, + "learning_rate": 9.330219721897888e-05, + "loss": 0.1832, + "step": 650 + }, + { + "epoch": 0.21, + "grad_norm": 5.450695037841797, + "learning_rate": 9.319605137458869e-05, + "loss": 0.269, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 3.1967124938964844, + "learning_rate": 9.308990553019849e-05, + "loss": 0.3387, + "step": 670 + }, + { + "epoch": 0.22, + "grad_norm": 2.2148098945617676, + "learning_rate": 9.29837596858083e-05, + "loss": 0.3407, + "step": 680 + }, + { + "epoch": 0.22, + "grad_norm": 2.2693583965301514, + "learning_rate": 9.287761384141811e-05, + "loss": 0.2758, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 4.460744857788086, + "learning_rate": 9.277146799702791e-05, + "loss": 0.2493, + "step": 700 + }, + { + "epoch": 0.23, + "grad_norm": 8.331945419311523, + "learning_rate": 9.266532215263772e-05, + "loss": 0.2264, + "step": 710 + }, + { + "epoch": 0.23, + "grad_norm": 2.7469747066497803, + "learning_rate": 9.255917630824753e-05, + "loss": 0.3038, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 3.013535737991333, + "learning_rate": 9.245303046385735e-05, + "loss": 0.3136, + "step": 730 + }, + { + "epoch": 0.24, + "grad_norm": 3.508979558944702, + "learning_rate": 9.234688461946716e-05, + "loss": 0.3502, + "step": 740 + }, + { + "epoch": 0.24, + "grad_norm": 5.0464301109313965, + "learning_rate": 9.224073877507696e-05, + "loss": 0.1776, + "step": 750 + }, + { + "epoch": 0.24, + "grad_norm": 1.6929841041564941, + "learning_rate": 9.213459293068676e-05, + "loss": 0.2984, + "step": 760 + }, + { + "epoch": 0.25, + "grad_norm": 1.1452223062515259, + "learning_rate": 9.202844708629658e-05, + "loss": 0.2503, + "step": 770 + }, + { + "epoch": 0.25, + "grad_norm": 1.3975647687911987, + "learning_rate": 9.192230124190638e-05, + "loss": 0.2423, + "step": 780 + }, + { + "epoch": 0.25, + "grad_norm": 1.8630661964416504, + "learning_rate": 9.181615539751619e-05, + "loss": 0.327, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 5.333163261413574, + "learning_rate": 9.171000955312599e-05, + "loss": 0.4495, + "step": 800 + }, + { + "epoch": 0.26, + "grad_norm": 1.6478999853134155, + "learning_rate": 9.160386370873581e-05, + "loss": 0.2546, + "step": 810 + }, + { + "epoch": 0.26, + "grad_norm": 1.2132633924484253, + "learning_rate": 9.149771786434561e-05, + "loss": 0.2439, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 2.2123448848724365, + "learning_rate": 9.139157201995542e-05, + "loss": 0.3715, + "step": 830 + }, + { + "epoch": 0.27, + "grad_norm": 2.148674726486206, + "learning_rate": 9.128542617556523e-05, + "loss": 0.252, + "step": 840 + }, + { + "epoch": 0.27, + "grad_norm": 3.6980788707733154, + "learning_rate": 9.117928033117504e-05, + "loss": 0.4487, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 6.548594951629639, + "learning_rate": 9.107313448678485e-05, + "loss": 0.2199, + "step": 860 + }, + { + "epoch": 0.28, + "grad_norm": 3.5746383666992188, + "learning_rate": 9.096698864239466e-05, + "loss": 0.2728, + "step": 870 + }, + { + "epoch": 0.28, + "grad_norm": 0.9120383858680725, + "learning_rate": 9.086084279800446e-05, + "loss": 0.2737, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 4.220329761505127, + "learning_rate": 9.075469695361427e-05, + "loss": 0.4124, + "step": 890 + }, + { + "epoch": 0.29, + "grad_norm": 2.5000956058502197, + "learning_rate": 9.064855110922408e-05, + "loss": 0.302, + "step": 900 + }, + { + "epoch": 0.29, + "grad_norm": 5.3845906257629395, + "learning_rate": 9.054240526483389e-05, + "loss": 0.4177, + "step": 910 + }, + { + "epoch": 0.29, + "grad_norm": 1.0533277988433838, + "learning_rate": 9.043625942044369e-05, + "loss": 0.3834, + "step": 920 + }, + { + "epoch": 0.3, + "grad_norm": 2.482363224029541, + "learning_rate": 9.03301135760535e-05, + "loss": 0.3497, + "step": 930 + }, + { + "epoch": 0.3, + "grad_norm": 2.785825729370117, + "learning_rate": 9.022396773166331e-05, + "loss": 0.2696, + "step": 940 + }, + { + "epoch": 0.3, + "grad_norm": 0.9899762868881226, + "learning_rate": 9.011782188727311e-05, + "loss": 0.3139, + "step": 950 + }, + { + "epoch": 0.31, + "grad_norm": 3.0521786212921143, + "learning_rate": 9.001167604288293e-05, + "loss": 0.4116, + "step": 960 + }, + { + "epoch": 0.31, + "grad_norm": 1.1553211212158203, + "learning_rate": 8.990553019849274e-05, + "loss": 0.3239, + "step": 970 + }, + { + "epoch": 0.31, + "grad_norm": 2.973958730697632, + "learning_rate": 8.979938435410254e-05, + "loss": 0.297, + "step": 980 + }, + { + "epoch": 0.32, + "grad_norm": 1.3011306524276733, + "learning_rate": 8.969323850971236e-05, + "loss": 0.3136, + "step": 990 + }, + { + "epoch": 0.32, + "grad_norm": 2.6845755577087402, + "learning_rate": 8.958709266532216e-05, + "loss": 0.3207, + "step": 1000 + }, + { + "epoch": 0.32, + "grad_norm": 0.33025118708610535, + "learning_rate": 8.948094682093196e-05, + "loss": 0.1847, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.9631307125091553, + "learning_rate": 8.937480097654177e-05, + "loss": 0.2798, + "step": 1020 + }, + { + "epoch": 0.33, + "grad_norm": 1.952580451965332, + "learning_rate": 8.926865513215158e-05, + "loss": 0.2184, + "step": 1030 + }, + { + "epoch": 0.33, + "grad_norm": 5.541811466217041, + "learning_rate": 8.916250928776139e-05, + "loss": 0.2649, + "step": 1040 + }, + { + "epoch": 0.33, + "grad_norm": 1.0800001621246338, + "learning_rate": 8.905636344337119e-05, + "loss": 0.3064, + "step": 1050 + }, + { + "epoch": 0.34, + "grad_norm": 4.908554553985596, + "learning_rate": 8.8950217598981e-05, + "loss": 0.2, + "step": 1060 + }, + { + "epoch": 0.34, + "grad_norm": 0.08677980303764343, + "learning_rate": 8.884407175459081e-05, + "loss": 0.1262, + "step": 1070 + }, + { + "epoch": 0.34, + "grad_norm": 1.9461978673934937, + "learning_rate": 8.873792591020062e-05, + "loss": 0.3098, + "step": 1080 + }, + { + "epoch": 0.35, + "grad_norm": 0.11714805662631989, + "learning_rate": 8.863178006581043e-05, + "loss": 0.3596, + "step": 1090 + }, + { + "epoch": 0.35, + "grad_norm": 2.0041699409484863, + "learning_rate": 8.852563422142024e-05, + "loss": 0.2518, + "step": 1100 + }, + { + "epoch": 0.35, + "grad_norm": 5.036510467529297, + "learning_rate": 8.841948837703004e-05, + "loss": 0.3654, + "step": 1110 + }, + { + "epoch": 0.36, + "grad_norm": 2.267143726348877, + "learning_rate": 8.831334253263986e-05, + "loss": 0.2812, + "step": 1120 + }, + { + "epoch": 0.36, + "grad_norm": 3.063321113586426, + "learning_rate": 8.820719668824966e-05, + "loss": 0.3135, + "step": 1130 + }, + { + "epoch": 0.36, + "grad_norm": 4.012215614318848, + "learning_rate": 8.810105084385947e-05, + "loss": 0.2423, + "step": 1140 + }, + { + "epoch": 0.37, + "grad_norm": 1.7306702136993408, + "learning_rate": 8.799490499946927e-05, + "loss": 0.187, + "step": 1150 + }, + { + "epoch": 0.37, + "grad_norm": 1.7319563627243042, + "learning_rate": 8.788875915507909e-05, + "loss": 0.3792, + "step": 1160 + }, + { + "epoch": 0.37, + "grad_norm": 4.382763862609863, + "learning_rate": 8.778261331068889e-05, + "loss": 0.483, + "step": 1170 + }, + { + "epoch": 0.38, + "grad_norm": 1.3643946647644043, + "learning_rate": 8.76764674662987e-05, + "loss": 0.1497, + "step": 1180 + }, + { + "epoch": 0.38, + "grad_norm": 5.549211025238037, + "learning_rate": 8.75703216219085e-05, + "loss": 0.2628, + "step": 1190 + }, + { + "epoch": 0.38, + "grad_norm": 2.2046520709991455, + "learning_rate": 8.747479036195734e-05, + "loss": 0.3474, + "step": 1200 + }, + { + "epoch": 0.39, + "grad_norm": 3.313180446624756, + "learning_rate": 8.736864451756715e-05, + "loss": 0.3096, + "step": 1210 + }, + { + "epoch": 0.39, + "grad_norm": 2.811859130859375, + "learning_rate": 8.726249867317695e-05, + "loss": 0.1371, + "step": 1220 + }, + { + "epoch": 0.39, + "grad_norm": 0.43377700448036194, + "learning_rate": 8.715635282878675e-05, + "loss": 0.2461, + "step": 1230 + }, + { + "epoch": 0.39, + "grad_norm": 2.7710583209991455, + "learning_rate": 8.705020698439657e-05, + "loss": 0.3332, + "step": 1240 + }, + { + "epoch": 0.4, + "grad_norm": 0.4188406467437744, + "learning_rate": 8.694406114000637e-05, + "loss": 0.3196, + "step": 1250 + }, + { + "epoch": 0.4, + "grad_norm": 0.7705641388893127, + "learning_rate": 8.683791529561618e-05, + "loss": 0.1709, + "step": 1260 + }, + { + "epoch": 0.4, + "grad_norm": 2.6247994899749756, + "learning_rate": 8.673176945122598e-05, + "loss": 0.3033, + "step": 1270 + }, + { + "epoch": 0.41, + "grad_norm": 1.033170461654663, + "learning_rate": 8.66256236068358e-05, + "loss": 0.2506, + "step": 1280 + }, + { + "epoch": 0.41, + "grad_norm": 4.289760112762451, + "learning_rate": 8.65194777624456e-05, + "loss": 0.2839, + "step": 1290 + }, + { + "epoch": 0.41, + "grad_norm": 1.3554538488388062, + "learning_rate": 8.64133319180554e-05, + "loss": 0.2703, + "step": 1300 + }, + { + "epoch": 0.42, + "grad_norm": 1.9523005485534668, + "learning_rate": 8.630718607366522e-05, + "loss": 0.1133, + "step": 1310 + }, + { + "epoch": 0.42, + "grad_norm": 5.332389831542969, + "learning_rate": 8.620104022927503e-05, + "loss": 0.3579, + "step": 1320 + }, + { + "epoch": 0.42, + "grad_norm": 5.874100208282471, + "learning_rate": 8.609489438488484e-05, + "loss": 0.4038, + "step": 1330 + }, + { + "epoch": 0.43, + "grad_norm": 1.4143377542495728, + "learning_rate": 8.598874854049465e-05, + "loss": 0.2451, + "step": 1340 + }, + { + "epoch": 0.43, + "grad_norm": 0.5176362991333008, + "learning_rate": 8.588260269610445e-05, + "loss": 0.2561, + "step": 1350 + }, + { + "epoch": 0.43, + "grad_norm": 1.5968561172485352, + "learning_rate": 8.577645685171426e-05, + "loss": 0.3456, + "step": 1360 + }, + { + "epoch": 0.44, + "grad_norm": 1.039812445640564, + "learning_rate": 8.567031100732407e-05, + "loss": 0.2792, + "step": 1370 + }, + { + "epoch": 0.44, + "grad_norm": 5.390068531036377, + "learning_rate": 8.556416516293388e-05, + "loss": 0.398, + "step": 1380 + }, + { + "epoch": 0.44, + "grad_norm": 1.3645654916763306, + "learning_rate": 8.545801931854368e-05, + "loss": 0.4537, + "step": 1390 + }, + { + "epoch": 0.45, + "grad_norm": 2.444027900695801, + "learning_rate": 8.535187347415348e-05, + "loss": 0.218, + "step": 1400 + }, + { + "epoch": 0.45, + "grad_norm": 4.201082229614258, + "learning_rate": 8.52457276297633e-05, + "loss": 0.3146, + "step": 1410 + }, + { + "epoch": 0.45, + "grad_norm": 4.080310344696045, + "learning_rate": 8.51395817853731e-05, + "loss": 0.2769, + "step": 1420 + }, + { + "epoch": 0.46, + "grad_norm": 2.712216377258301, + "learning_rate": 8.503343594098292e-05, + "loss": 0.2795, + "step": 1430 + }, + { + "epoch": 0.46, + "grad_norm": 3.2429492473602295, + "learning_rate": 8.492729009659273e-05, + "loss": 0.2956, + "step": 1440 + }, + { + "epoch": 0.46, + "grad_norm": 6.107478618621826, + "learning_rate": 8.482114425220253e-05, + "loss": 0.3381, + "step": 1450 + }, + { + "epoch": 0.46, + "grad_norm": 0.9037106037139893, + "learning_rate": 8.471499840781235e-05, + "loss": 0.4196, + "step": 1460 + }, + { + "epoch": 0.47, + "grad_norm": 1.2487717866897583, + "learning_rate": 8.460885256342215e-05, + "loss": 0.2471, + "step": 1470 + }, + { + "epoch": 0.47, + "grad_norm": 2.8922715187072754, + "learning_rate": 8.450270671903195e-05, + "loss": 0.2664, + "step": 1480 + }, + { + "epoch": 0.47, + "grad_norm": 0.6493813991546631, + "learning_rate": 8.439656087464176e-05, + "loss": 0.206, + "step": 1490 + }, + { + "epoch": 0.48, + "grad_norm": 0.11327870935201645, + "learning_rate": 8.429041503025157e-05, + "loss": 0.2593, + "step": 1500 + }, + { + "epoch": 0.48, + "grad_norm": 4.4462690353393555, + "learning_rate": 8.418426918586138e-05, + "loss": 0.4474, + "step": 1510 + }, + { + "epoch": 0.48, + "grad_norm": 2.0405867099761963, + "learning_rate": 8.407812334147118e-05, + "loss": 0.1657, + "step": 1520 + }, + { + "epoch": 0.49, + "grad_norm": 0.3047516942024231, + "learning_rate": 8.397197749708099e-05, + "loss": 0.1691, + "step": 1530 + }, + { + "epoch": 0.49, + "grad_norm": 6.330657958984375, + "learning_rate": 8.386583165269079e-05, + "loss": 0.2041, + "step": 1540 + }, + { + "epoch": 0.49, + "grad_norm": 2.403702974319458, + "learning_rate": 8.375968580830062e-05, + "loss": 0.3408, + "step": 1550 + }, + { + "epoch": 0.5, + "grad_norm": 3.2958528995513916, + "learning_rate": 8.365353996391042e-05, + "loss": 0.3271, + "step": 1560 + }, + { + "epoch": 0.5, + "grad_norm": 3.2511487007141113, + "learning_rate": 8.354739411952023e-05, + "loss": 0.1719, + "step": 1570 + }, + { + "epoch": 0.5, + "grad_norm": 2.447939872741699, + "learning_rate": 8.344124827513003e-05, + "loss": 0.2823, + "step": 1580 + }, + { + "epoch": 0.51, + "grad_norm": 1.9992095232009888, + "learning_rate": 8.333510243073985e-05, + "loss": 0.2479, + "step": 1590 + }, + { + "epoch": 0.51, + "grad_norm": 3.8574376106262207, + "learning_rate": 8.322895658634965e-05, + "loss": 0.2539, + "step": 1600 + }, + { + "epoch": 0.51, + "grad_norm": 3.184896230697632, + "learning_rate": 8.312281074195946e-05, + "loss": 0.2826, + "step": 1610 + }, + { + "epoch": 0.52, + "grad_norm": 0.6027563810348511, + "learning_rate": 8.301666489756926e-05, + "loss": 0.1404, + "step": 1620 + }, + { + "epoch": 0.52, + "grad_norm": 1.0776386260986328, + "learning_rate": 8.291051905317906e-05, + "loss": 0.3887, + "step": 1630 + }, + { + "epoch": 0.52, + "grad_norm": 2.386305093765259, + "learning_rate": 8.280437320878888e-05, + "loss": 0.4232, + "step": 1640 + }, + { + "epoch": 0.53, + "grad_norm": 1.299332618713379, + "learning_rate": 8.269822736439868e-05, + "loss": 0.2855, + "step": 1650 + }, + { + "epoch": 0.53, + "grad_norm": 1.3506910800933838, + "learning_rate": 8.259208152000849e-05, + "loss": 0.2412, + "step": 1660 + }, + { + "epoch": 0.53, + "grad_norm": 2.2037456035614014, + "learning_rate": 8.24859356756183e-05, + "loss": 0.2399, + "step": 1670 + }, + { + "epoch": 0.53, + "grad_norm": 2.2852354049682617, + "learning_rate": 8.237978983122812e-05, + "loss": 0.202, + "step": 1680 + }, + { + "epoch": 0.54, + "grad_norm": 0.2693609297275543, + "learning_rate": 8.227364398683793e-05, + "loss": 0.3235, + "step": 1690 + }, + { + "epoch": 0.54, + "grad_norm": 3.526648998260498, + "learning_rate": 8.216749814244773e-05, + "loss": 0.3102, + "step": 1700 + }, + { + "epoch": 0.54, + "grad_norm": 1.9742597341537476, + "learning_rate": 8.206135229805753e-05, + "loss": 0.3293, + "step": 1710 + }, + { + "epoch": 0.55, + "grad_norm": 2.933436155319214, + "learning_rate": 8.195520645366734e-05, + "loss": 0.207, + "step": 1720 + }, + { + "epoch": 0.55, + "grad_norm": 0.5870353579521179, + "learning_rate": 8.184906060927715e-05, + "loss": 0.3731, + "step": 1730 + }, + { + "epoch": 0.55, + "grad_norm": 1.7825034856796265, + "learning_rate": 8.174291476488696e-05, + "loss": 0.1747, + "step": 1740 + }, + { + "epoch": 0.56, + "grad_norm": 4.706550598144531, + "learning_rate": 8.163676892049676e-05, + "loss": 0.2143, + "step": 1750 + }, + { + "epoch": 0.56, + "grad_norm": 3.326359748840332, + "learning_rate": 8.153062307610657e-05, + "loss": 0.363, + "step": 1760 + }, + { + "epoch": 0.56, + "grad_norm": 1.3437646627426147, + "learning_rate": 8.142447723171638e-05, + "loss": 0.2806, + "step": 1770 + }, + { + "epoch": 0.57, + "grad_norm": 4.6950249671936035, + "learning_rate": 8.131833138732619e-05, + "loss": 0.2547, + "step": 1780 + }, + { + "epoch": 0.57, + "grad_norm": 1.557305097579956, + "learning_rate": 8.1212185542936e-05, + "loss": 0.277, + "step": 1790 + }, + { + "epoch": 0.57, + "grad_norm": 1.5373164415359497, + "learning_rate": 8.110603969854581e-05, + "loss": 0.2878, + "step": 1800 + }, + { + "epoch": 0.58, + "grad_norm": 1.3761144876480103, + "learning_rate": 8.099989385415561e-05, + "loss": 0.4071, + "step": 1810 + }, + { + "epoch": 0.58, + "grad_norm": 0.7141520977020264, + "learning_rate": 8.089374800976543e-05, + "loss": 0.2002, + "step": 1820 + }, + { + "epoch": 0.58, + "grad_norm": 0.6471810340881348, + "learning_rate": 8.078760216537523e-05, + "loss": 0.1962, + "step": 1830 + }, + { + "epoch": 0.59, + "grad_norm": 1.8333234786987305, + "learning_rate": 8.068145632098504e-05, + "loss": 0.23, + "step": 1840 + }, + { + "epoch": 0.59, + "grad_norm": 0.7382714152336121, + "learning_rate": 8.057531047659484e-05, + "loss": 0.1602, + "step": 1850 + }, + { + "epoch": 0.59, + "grad_norm": 2.2624874114990234, + "learning_rate": 8.046916463220466e-05, + "loss": 0.3355, + "step": 1860 + }, + { + "epoch": 0.6, + "grad_norm": 1.3432509899139404, + "learning_rate": 8.036301878781446e-05, + "loss": 0.1226, + "step": 1870 + }, + { + "epoch": 0.6, + "grad_norm": 1.3153080940246582, + "learning_rate": 8.025687294342426e-05, + "loss": 0.2797, + "step": 1880 + }, + { + "epoch": 0.6, + "grad_norm": 0.13998636603355408, + "learning_rate": 8.015072709903407e-05, + "loss": 0.3126, + "step": 1890 + }, + { + "epoch": 0.6, + "grad_norm": 7.6837382316589355, + "learning_rate": 8.004458125464388e-05, + "loss": 0.348, + "step": 1900 + }, + { + "epoch": 0.61, + "grad_norm": 2.536726236343384, + "learning_rate": 7.993843541025369e-05, + "loss": 0.2518, + "step": 1910 + }, + { + "epoch": 0.61, + "grad_norm": 2.798586130142212, + "learning_rate": 7.98322895658635e-05, + "loss": 0.187, + "step": 1920 + }, + { + "epoch": 0.61, + "grad_norm": 2.047030210494995, + "learning_rate": 7.972614372147331e-05, + "loss": 0.1801, + "step": 1930 + }, + { + "epoch": 0.62, + "grad_norm": 2.5127789974212646, + "learning_rate": 7.961999787708311e-05, + "loss": 0.2613, + "step": 1940 + }, + { + "epoch": 0.62, + "grad_norm": 5.015801429748535, + "learning_rate": 7.951385203269293e-05, + "loss": 0.4155, + "step": 1950 + }, + { + "epoch": 0.62, + "grad_norm": 4.095780849456787, + "learning_rate": 7.940770618830273e-05, + "loss": 0.2413, + "step": 1960 + }, + { + "epoch": 0.63, + "grad_norm": 0.575307309627533, + "learning_rate": 7.930156034391254e-05, + "loss": 0.2799, + "step": 1970 + }, + { + "epoch": 0.63, + "grad_norm": 0.26382434368133545, + "learning_rate": 7.919541449952234e-05, + "loss": 0.1894, + "step": 1980 + }, + { + "epoch": 0.63, + "grad_norm": 1.7955100536346436, + "learning_rate": 7.908926865513216e-05, + "loss": 0.199, + "step": 1990 + }, + { + "epoch": 0.64, + "grad_norm": 0.4029354453086853, + "learning_rate": 7.898312281074196e-05, + "loss": 0.2465, + "step": 2000 + }, + { + "epoch": 0.64, + "grad_norm": 1.4386157989501953, + "learning_rate": 7.887697696635177e-05, + "loss": 0.2603, + "step": 2010 + }, + { + "epoch": 0.64, + "grad_norm": 4.048315525054932, + "learning_rate": 7.877083112196157e-05, + "loss": 0.3663, + "step": 2020 + }, + { + "epoch": 0.65, + "grad_norm": 4.0357255935668945, + "learning_rate": 7.866468527757139e-05, + "loss": 0.2365, + "step": 2030 + }, + { + "epoch": 0.65, + "grad_norm": 0.6603661775588989, + "learning_rate": 7.85585394331812e-05, + "loss": 0.2848, + "step": 2040 + }, + { + "epoch": 0.65, + "grad_norm": 2.005911111831665, + "learning_rate": 7.845239358879101e-05, + "loss": 0.316, + "step": 2050 + }, + { + "epoch": 0.66, + "grad_norm": 1.5447591543197632, + "learning_rate": 7.834624774440081e-05, + "loss": 0.2741, + "step": 2060 + }, + { + "epoch": 0.66, + "grad_norm": 3.2413675785064697, + "learning_rate": 7.824010190001062e-05, + "loss": 0.4234, + "step": 2070 + }, + { + "epoch": 0.66, + "grad_norm": 2.6230356693267822, + "learning_rate": 7.813395605562043e-05, + "loss": 0.1797, + "step": 2080 + }, + { + "epoch": 0.67, + "grad_norm": 1.5376132726669312, + "learning_rate": 7.802781021123024e-05, + "loss": 0.3815, + "step": 2090 + }, + { + "epoch": 0.67, + "grad_norm": 1.4491734504699707, + "learning_rate": 7.792166436684004e-05, + "loss": 0.3153, + "step": 2100 + }, + { + "epoch": 0.67, + "grad_norm": 1.949112057685852, + "learning_rate": 7.781551852244984e-05, + "loss": 0.2751, + "step": 2110 + }, + { + "epoch": 0.67, + "grad_norm": 0.3488381803035736, + "learning_rate": 7.770937267805966e-05, + "loss": 0.3558, + "step": 2120 + }, + { + "epoch": 0.68, + "grad_norm": 1.4437161684036255, + "learning_rate": 7.760322683366946e-05, + "loss": 0.2827, + "step": 2130 + }, + { + "epoch": 0.68, + "grad_norm": 1.1105573177337646, + "learning_rate": 7.749708098927927e-05, + "loss": 0.1867, + "step": 2140 + }, + { + "epoch": 0.68, + "grad_norm": 2.1235313415527344, + "learning_rate": 7.739093514488907e-05, + "loss": 0.1689, + "step": 2150 + }, + { + "epoch": 0.69, + "grad_norm": 1.60935378074646, + "learning_rate": 7.728478930049889e-05, + "loss": 0.3198, + "step": 2160 + }, + { + "epoch": 0.69, + "grad_norm": 1.3222334384918213, + "learning_rate": 7.71786434561087e-05, + "loss": 0.1978, + "step": 2170 + }, + { + "epoch": 0.69, + "grad_norm": 1.4521784782409668, + "learning_rate": 7.707249761171851e-05, + "loss": 0.3276, + "step": 2180 + }, + { + "epoch": 0.7, + "grad_norm": 0.4480780363082886, + "learning_rate": 7.696635176732831e-05, + "loss": 0.2151, + "step": 2190 + }, + { + "epoch": 0.7, + "grad_norm": 1.5750231742858887, + "learning_rate": 7.686020592293812e-05, + "loss": 0.1659, + "step": 2200 + }, + { + "epoch": 0.7, + "grad_norm": 2.5736334323883057, + "learning_rate": 7.675406007854793e-05, + "loss": 0.3704, + "step": 2210 + }, + { + "epoch": 0.71, + "grad_norm": 3.719284772872925, + "learning_rate": 7.664791423415774e-05, + "loss": 0.1645, + "step": 2220 + }, + { + "epoch": 0.71, + "grad_norm": 3.429244041442871, + "learning_rate": 7.654176838976754e-05, + "loss": 0.3323, + "step": 2230 + }, + { + "epoch": 0.71, + "grad_norm": 2.801398277282715, + "learning_rate": 7.643562254537735e-05, + "loss": 0.2805, + "step": 2240 + }, + { + "epoch": 0.72, + "grad_norm": 2.050607204437256, + "learning_rate": 7.632947670098716e-05, + "loss": 0.2308, + "step": 2250 + }, + { + "epoch": 0.72, + "grad_norm": 3.164123773574829, + "learning_rate": 7.622333085659697e-05, + "loss": 0.2401, + "step": 2260 + }, + { + "epoch": 0.72, + "grad_norm": 3.276832342147827, + "learning_rate": 7.611718501220677e-05, + "loss": 0.2399, + "step": 2270 + }, + { + "epoch": 0.73, + "grad_norm": 2.8366944789886475, + "learning_rate": 7.601103916781659e-05, + "loss": 0.4004, + "step": 2280 + }, + { + "epoch": 0.73, + "grad_norm": 2.4258265495300293, + "learning_rate": 7.590489332342639e-05, + "loss": 0.3202, + "step": 2290 + }, + { + "epoch": 0.73, + "grad_norm": 1.4008164405822754, + "learning_rate": 7.579874747903621e-05, + "loss": 0.1952, + "step": 2300 + }, + { + "epoch": 0.74, + "grad_norm": 1.1098754405975342, + "learning_rate": 7.569260163464601e-05, + "loss": 0.1867, + "step": 2310 + }, + { + "epoch": 0.74, + "grad_norm": 0.15033583343029022, + "learning_rate": 7.558645579025582e-05, + "loss": 0.1995, + "step": 2320 + }, + { + "epoch": 0.74, + "grad_norm": 0.9557719230651855, + "learning_rate": 7.548030994586562e-05, + "loss": 0.2475, + "step": 2330 + }, + { + "epoch": 0.74, + "grad_norm": 8.91406536102295, + "learning_rate": 7.537416410147544e-05, + "loss": 0.2756, + "step": 2340 + }, + { + "epoch": 0.75, + "grad_norm": 1.9521056413650513, + "learning_rate": 7.526801825708524e-05, + "loss": 0.2595, + "step": 2350 + }, + { + "epoch": 0.75, + "grad_norm": 3.3855483531951904, + "learning_rate": 7.516187241269504e-05, + "loss": 0.2948, + "step": 2360 + }, + { + "epoch": 0.75, + "grad_norm": 1.6990065574645996, + "learning_rate": 7.506634115274387e-05, + "loss": 0.2755, + "step": 2370 + }, + { + "epoch": 0.76, + "grad_norm": 2.098942518234253, + "learning_rate": 7.496019530835369e-05, + "loss": 0.175, + "step": 2380 + }, + { + "epoch": 0.76, + "grad_norm": 0.9781967997550964, + "learning_rate": 7.48540494639635e-05, + "loss": 0.4592, + "step": 2390 + }, + { + "epoch": 0.76, + "grad_norm": 0.4728473722934723, + "learning_rate": 7.47479036195733e-05, + "loss": 0.3847, + "step": 2400 + }, + { + "epoch": 0.77, + "grad_norm": 3.3047373294830322, + "learning_rate": 7.46417577751831e-05, + "loss": 0.1848, + "step": 2410 + }, + { + "epoch": 0.77, + "grad_norm": 2.424025535583496, + "learning_rate": 7.453561193079292e-05, + "loss": 0.2197, + "step": 2420 + }, + { + "epoch": 0.77, + "grad_norm": 2.697960376739502, + "learning_rate": 7.442946608640272e-05, + "loss": 0.2314, + "step": 2430 + }, + { + "epoch": 0.78, + "grad_norm": 0.496898353099823, + "learning_rate": 7.432332024201253e-05, + "loss": 0.3299, + "step": 2440 + }, + { + "epoch": 0.78, + "grad_norm": 1.4845099449157715, + "learning_rate": 7.421717439762233e-05, + "loss": 0.2832, + "step": 2450 + }, + { + "epoch": 0.78, + "grad_norm": 3.8896942138671875, + "learning_rate": 7.411102855323215e-05, + "loss": 0.2837, + "step": 2460 + }, + { + "epoch": 0.79, + "grad_norm": 4.288979530334473, + "learning_rate": 7.400488270884195e-05, + "loss": 0.1653, + "step": 2470 + }, + { + "epoch": 0.79, + "grad_norm": 3.0013909339904785, + "learning_rate": 7.389873686445176e-05, + "loss": 0.3207, + "step": 2480 + }, + { + "epoch": 0.79, + "grad_norm": 0.38008421659469604, + "learning_rate": 7.379259102006156e-05, + "loss": 0.2916, + "step": 2490 + }, + { + "epoch": 0.8, + "grad_norm": 3.843106985092163, + "learning_rate": 7.368644517567138e-05, + "loss": 0.4216, + "step": 2500 + }, + { + "epoch": 0.8, + "grad_norm": 0.46844518184661865, + "learning_rate": 7.35802993312812e-05, + "loss": 0.3038, + "step": 2510 + }, + { + "epoch": 0.8, + "grad_norm": 0.5063233375549316, + "learning_rate": 7.3474153486891e-05, + "loss": 0.2392, + "step": 2520 + }, + { + "epoch": 0.81, + "grad_norm": 6.260082721710205, + "learning_rate": 7.33680076425008e-05, + "loss": 0.317, + "step": 2530 + }, + { + "epoch": 0.81, + "grad_norm": 1.771292805671692, + "learning_rate": 7.32618617981106e-05, + "loss": 0.2229, + "step": 2540 + }, + { + "epoch": 0.81, + "grad_norm": 5.619741439819336, + "learning_rate": 7.315571595372042e-05, + "loss": 0.1364, + "step": 2550 + }, + { + "epoch": 0.81, + "grad_norm": 2.196967363357544, + "learning_rate": 7.304957010933023e-05, + "loss": 0.2732, + "step": 2560 + }, + { + "epoch": 0.82, + "grad_norm": 0.6409101486206055, + "learning_rate": 7.294342426494003e-05, + "loss": 0.2754, + "step": 2570 + }, + { + "epoch": 0.82, + "grad_norm": 1.4790414571762085, + "learning_rate": 7.283727842054983e-05, + "loss": 0.2017, + "step": 2580 + }, + { + "epoch": 0.82, + "grad_norm": 2.013932943344116, + "learning_rate": 7.273113257615965e-05, + "loss": 0.24, + "step": 2590 + }, + { + "epoch": 0.83, + "grad_norm": 3.7832634449005127, + "learning_rate": 7.262498673176945e-05, + "loss": 0.3675, + "step": 2600 + }, + { + "epoch": 0.83, + "grad_norm": 0.3102867007255554, + "learning_rate": 7.251884088737926e-05, + "loss": 0.379, + "step": 2610 + }, + { + "epoch": 0.83, + "grad_norm": 2.4098093509674072, + "learning_rate": 7.241269504298906e-05, + "loss": 0.381, + "step": 2620 + }, + { + "epoch": 0.84, + "grad_norm": 2.3519186973571777, + "learning_rate": 7.230654919859888e-05, + "loss": 0.2574, + "step": 2630 + }, + { + "epoch": 0.84, + "grad_norm": 1.1589571237564087, + "learning_rate": 7.22004033542087e-05, + "loss": 0.1603, + "step": 2640 + }, + { + "epoch": 0.84, + "grad_norm": 3.823918342590332, + "learning_rate": 7.20942575098185e-05, + "loss": 0.2485, + "step": 2650 + }, + { + "epoch": 0.85, + "grad_norm": 1.778441071510315, + "learning_rate": 7.19881116654283e-05, + "loss": 0.234, + "step": 2660 + }, + { + "epoch": 0.85, + "grad_norm": 2.2710683345794678, + "learning_rate": 7.188196582103811e-05, + "loss": 0.1746, + "step": 2670 + }, + { + "epoch": 0.85, + "grad_norm": 6.078259468078613, + "learning_rate": 7.177581997664792e-05, + "loss": 0.3255, + "step": 2680 + }, + { + "epoch": 0.86, + "grad_norm": 0.585472583770752, + "learning_rate": 7.166967413225773e-05, + "loss": 0.3718, + "step": 2690 + }, + { + "epoch": 0.86, + "grad_norm": 1.9394687414169312, + "learning_rate": 7.156352828786753e-05, + "loss": 0.3181, + "step": 2700 + }, + { + "epoch": 0.86, + "grad_norm": 1.6753870248794556, + "learning_rate": 7.145738244347734e-05, + "loss": 0.2424, + "step": 2710 + }, + { + "epoch": 0.87, + "grad_norm": 0.37682977318763733, + "learning_rate": 7.135123659908714e-05, + "loss": 0.2963, + "step": 2720 + }, + { + "epoch": 0.87, + "grad_norm": 3.564805507659912, + "learning_rate": 7.124509075469696e-05, + "loss": 0.2822, + "step": 2730 + }, + { + "epoch": 0.87, + "grad_norm": 0.22953364253044128, + "learning_rate": 7.113894491030676e-05, + "loss": 0.3489, + "step": 2740 + }, + { + "epoch": 0.88, + "grad_norm": 4.16074275970459, + "learning_rate": 7.103279906591658e-05, + "loss": 0.405, + "step": 2750 + }, + { + "epoch": 0.88, + "grad_norm": 1.4540446996688843, + "learning_rate": 7.092665322152638e-05, + "loss": 0.2634, + "step": 2760 + }, + { + "epoch": 0.88, + "grad_norm": 1.9992202520370483, + "learning_rate": 7.082050737713618e-05, + "loss": 0.2762, + "step": 2770 + }, + { + "epoch": 0.88, + "grad_norm": 1.3939869403839111, + "learning_rate": 7.0714361532746e-05, + "loss": 0.3462, + "step": 2780 + }, + { + "epoch": 0.89, + "grad_norm": 0.6099751591682434, + "learning_rate": 7.06082156883558e-05, + "loss": 0.367, + "step": 2790 + }, + { + "epoch": 0.89, + "grad_norm": 6.303842067718506, + "learning_rate": 7.050206984396561e-05, + "loss": 0.2596, + "step": 2800 + }, + { + "epoch": 0.89, + "grad_norm": 1.5723298788070679, + "learning_rate": 7.039592399957541e-05, + "loss": 0.3136, + "step": 2810 + }, + { + "epoch": 0.9, + "grad_norm": 1.3614245653152466, + "learning_rate": 7.028977815518523e-05, + "loss": 0.2983, + "step": 2820 + }, + { + "epoch": 0.9, + "grad_norm": 2.220656633377075, + "learning_rate": 7.018363231079503e-05, + "loss": 0.3549, + "step": 2830 + }, + { + "epoch": 0.9, + "grad_norm": 2.8158984184265137, + "learning_rate": 7.007748646640484e-05, + "loss": 0.2431, + "step": 2840 + }, + { + "epoch": 0.91, + "grad_norm": 0.46454083919525146, + "learning_rate": 6.997134062201464e-05, + "loss": 0.204, + "step": 2850 + }, + { + "epoch": 0.91, + "grad_norm": 2.5426604747772217, + "learning_rate": 6.986519477762446e-05, + "loss": 0.1241, + "step": 2860 + }, + { + "epoch": 0.91, + "grad_norm": 2.6442790031433105, + "learning_rate": 6.975904893323428e-05, + "loss": 0.2026, + "step": 2870 + }, + { + "epoch": 0.92, + "grad_norm": 0.07216634601354599, + "learning_rate": 6.965290308884408e-05, + "loss": 0.1619, + "step": 2880 + }, + { + "epoch": 0.92, + "grad_norm": 1.6410995721817017, + "learning_rate": 6.954675724445388e-05, + "loss": 0.309, + "step": 2890 + }, + { + "epoch": 0.92, + "grad_norm": 1.0634126663208008, + "learning_rate": 6.944061140006369e-05, + "loss": 0.2269, + "step": 2900 + }, + { + "epoch": 0.93, + "grad_norm": 1.272518277168274, + "learning_rate": 6.93344655556735e-05, + "loss": 0.2748, + "step": 2910 + }, + { + "epoch": 0.93, + "grad_norm": 8.030739784240723, + "learning_rate": 6.922831971128331e-05, + "loss": 0.2386, + "step": 2920 + }, + { + "epoch": 0.93, + "grad_norm": 1.0459538698196411, + "learning_rate": 6.912217386689311e-05, + "loss": 0.2162, + "step": 2930 + }, + { + "epoch": 0.94, + "grad_norm": 2.7766873836517334, + "learning_rate": 6.901602802250292e-05, + "loss": 0.18, + "step": 2940 + }, + { + "epoch": 0.94, + "grad_norm": 1.345751166343689, + "learning_rate": 6.890988217811273e-05, + "loss": 0.1927, + "step": 2950 + }, + { + "epoch": 0.94, + "grad_norm": 3.475550889968872, + "learning_rate": 6.880373633372254e-05, + "loss": 0.1593, + "step": 2960 + }, + { + "epoch": 0.95, + "grad_norm": 4.3208088874816895, + "learning_rate": 6.869759048933234e-05, + "loss": 0.3782, + "step": 2970 + }, + { + "epoch": 0.95, + "grad_norm": 0.5283639430999756, + "learning_rate": 6.859144464494214e-05, + "loss": 0.2065, + "step": 2980 + }, + { + "epoch": 0.95, + "grad_norm": 0.3912002444267273, + "learning_rate": 6.848529880055196e-05, + "loss": 0.2094, + "step": 2990 + }, + { + "epoch": 0.95, + "grad_norm": 5.560369968414307, + "learning_rate": 6.837915295616178e-05, + "loss": 0.2598, + "step": 3000 + }, + { + "epoch": 0.96, + "grad_norm": 2.0859804153442383, + "learning_rate": 6.827300711177158e-05, + "loss": 0.2396, + "step": 3010 + }, + { + "epoch": 0.96, + "grad_norm": 1.9198240041732788, + "learning_rate": 6.816686126738139e-05, + "loss": 0.326, + "step": 3020 + }, + { + "epoch": 0.96, + "grad_norm": 2.559525728225708, + "learning_rate": 6.806071542299119e-05, + "loss": 0.2846, + "step": 3030 + }, + { + "epoch": 0.97, + "grad_norm": 8.122730255126953, + "learning_rate": 6.7954569578601e-05, + "loss": 0.3404, + "step": 3040 + }, + { + "epoch": 0.97, + "grad_norm": 1.4377597570419312, + "learning_rate": 6.784842373421081e-05, + "loss": 0.3534, + "step": 3050 + }, + { + "epoch": 0.97, + "grad_norm": 1.3202710151672363, + "learning_rate": 6.774227788982061e-05, + "loss": 0.3151, + "step": 3060 + }, + { + "epoch": 0.98, + "grad_norm": 1.2933627367019653, + "learning_rate": 6.763613204543042e-05, + "loss": 0.1983, + "step": 3070 + }, + { + "epoch": 0.98, + "grad_norm": 0.8253432512283325, + "learning_rate": 6.752998620104023e-05, + "loss": 0.1989, + "step": 3080 + }, + { + "epoch": 0.98, + "grad_norm": 1.008435606956482, + "learning_rate": 6.742384035665004e-05, + "loss": 0.2045, + "step": 3090 + }, + { + "epoch": 0.99, + "grad_norm": 4.022599220275879, + "learning_rate": 6.731769451225984e-05, + "loss": 0.2166, + "step": 3100 + }, + { + "epoch": 0.99, + "grad_norm": 0.5018757581710815, + "learning_rate": 6.721154866786966e-05, + "loss": 0.1841, + "step": 3110 + }, + { + "epoch": 0.99, + "grad_norm": 1.1110012531280518, + "learning_rate": 6.710540282347946e-05, + "loss": 0.208, + "step": 3120 + }, + { + "epoch": 1.0, + "grad_norm": 4.160871505737305, + "learning_rate": 6.699925697908928e-05, + "loss": 0.2853, + "step": 3130 + }, + { + "epoch": 1.0, + "grad_norm": 3.1839327812194824, + "learning_rate": 6.689311113469908e-05, + "loss": 0.239, + "step": 3140 + }, + { + "epoch": 1.0, + "grad_norm": 1.2867355346679688, + "learning_rate": 6.678696529030889e-05, + "loss": 0.1678, + "step": 3150 + }, + { + "epoch": 1.01, + "grad_norm": 0.3853776454925537, + "learning_rate": 6.668081944591869e-05, + "loss": 0.1119, + "step": 3160 + }, + { + "epoch": 1.01, + "grad_norm": 0.9403756856918335, + "learning_rate": 6.657467360152851e-05, + "loss": 0.1772, + "step": 3170 + }, + { + "epoch": 1.01, + "grad_norm": 2.8056976795196533, + "learning_rate": 6.646852775713831e-05, + "loss": 0.1438, + "step": 3180 + }, + { + "epoch": 1.02, + "grad_norm": 0.9233602285385132, + "learning_rate": 6.636238191274812e-05, + "loss": 0.2491, + "step": 3190 + }, + { + "epoch": 1.02, + "grad_norm": 2.179743766784668, + "learning_rate": 6.625623606835792e-05, + "loss": 0.1493, + "step": 3200 + }, + { + "epoch": 1.02, + "grad_norm": 1.8002713918685913, + "learning_rate": 6.615009022396774e-05, + "loss": 0.1557, + "step": 3210 + }, + { + "epoch": 1.02, + "grad_norm": 1.0567578077316284, + "learning_rate": 6.604394437957754e-05, + "loss": 0.1573, + "step": 3220 + }, + { + "epoch": 1.03, + "grad_norm": 1.7498853206634521, + "learning_rate": 6.593779853518734e-05, + "loss": 0.2639, + "step": 3230 + }, + { + "epoch": 1.03, + "grad_norm": 0.14960238337516785, + "learning_rate": 6.583165269079716e-05, + "loss": 0.2314, + "step": 3240 + }, + { + "epoch": 1.03, + "grad_norm": 0.858378529548645, + "learning_rate": 6.572550684640697e-05, + "loss": 0.1898, + "step": 3250 + }, + { + "epoch": 1.04, + "grad_norm": 4.104907989501953, + "learning_rate": 6.561936100201678e-05, + "loss": 0.2381, + "step": 3260 + }, + { + "epoch": 1.04, + "grad_norm": 0.1154847964644432, + "learning_rate": 6.551321515762659e-05, + "loss": 0.0987, + "step": 3270 + }, + { + "epoch": 1.04, + "grad_norm": 1.8907705545425415, + "learning_rate": 6.540706931323639e-05, + "loss": 0.125, + "step": 3280 + }, + { + "epoch": 1.05, + "grad_norm": 1.2750372886657715, + "learning_rate": 6.53009234688462e-05, + "loss": 0.234, + "step": 3290 + }, + { + "epoch": 1.05, + "grad_norm": 1.584429144859314, + "learning_rate": 6.519477762445601e-05, + "loss": 0.1328, + "step": 3300 + }, + { + "epoch": 1.05, + "grad_norm": 2.3900089263916016, + "learning_rate": 6.508863178006581e-05, + "loss": 0.2681, + "step": 3310 + }, + { + "epoch": 1.06, + "grad_norm": 1.9859068393707275, + "learning_rate": 6.498248593567562e-05, + "loss": 0.4136, + "step": 3320 + }, + { + "epoch": 1.06, + "grad_norm": 3.4652695655822754, + "learning_rate": 6.487634009128542e-05, + "loss": 0.2059, + "step": 3330 + }, + { + "epoch": 1.06, + "grad_norm": 4.06072473526001, + "learning_rate": 6.477019424689524e-05, + "loss": 0.2378, + "step": 3340 + }, + { + "epoch": 1.07, + "grad_norm": 1.2823538780212402, + "learning_rate": 6.466404840250504e-05, + "loss": 0.1772, + "step": 3350 + }, + { + "epoch": 1.07, + "grad_norm": 0.545313835144043, + "learning_rate": 6.455790255811486e-05, + "loss": 0.1587, + "step": 3360 + }, + { + "epoch": 1.07, + "grad_norm": 5.666371822357178, + "learning_rate": 6.445175671372466e-05, + "loss": 0.1486, + "step": 3370 + }, + { + "epoch": 1.08, + "grad_norm": 0.3175773620605469, + "learning_rate": 6.434561086933447e-05, + "loss": 0.2295, + "step": 3380 + }, + { + "epoch": 1.08, + "grad_norm": 3.88968563079834, + "learning_rate": 6.423946502494428e-05, + "loss": 0.16, + "step": 3390 + }, + { + "epoch": 1.08, + "grad_norm": 2.4445409774780273, + "learning_rate": 6.413331918055409e-05, + "loss": 0.1766, + "step": 3400 + }, + { + "epoch": 1.09, + "grad_norm": 0.5478050708770752, + "learning_rate": 6.402717333616389e-05, + "loss": 0.1299, + "step": 3410 + }, + { + "epoch": 1.09, + "grad_norm": 4.029285907745361, + "learning_rate": 6.393164207621272e-05, + "loss": 0.3463, + "step": 3420 + }, + { + "epoch": 1.09, + "grad_norm": 0.3899819552898407, + "learning_rate": 6.382549623182253e-05, + "loss": 0.1214, + "step": 3430 + }, + { + "epoch": 1.1, + "grad_norm": 0.7180734276771545, + "learning_rate": 6.371935038743233e-05, + "loss": 0.2756, + "step": 3440 + }, + { + "epoch": 1.1, + "grad_norm": 3.6423099040985107, + "learning_rate": 6.361320454304213e-05, + "loss": 0.2059, + "step": 3450 + }, + { + "epoch": 1.1, + "grad_norm": 3.006516933441162, + "learning_rate": 6.350705869865195e-05, + "loss": 0.2151, + "step": 3460 + }, + { + "epoch": 1.1, + "grad_norm": 2.1426503658294678, + "learning_rate": 6.340091285426177e-05, + "loss": 0.2644, + "step": 3470 + }, + { + "epoch": 1.11, + "grad_norm": 1.4418883323669434, + "learning_rate": 6.329476700987157e-05, + "loss": 0.1675, + "step": 3480 + }, + { + "epoch": 1.11, + "grad_norm": 1.2576738595962524, + "learning_rate": 6.318862116548138e-05, + "loss": 0.1612, + "step": 3490 + }, + { + "epoch": 1.11, + "grad_norm": 3.26369309425354, + "learning_rate": 6.308247532109118e-05, + "loss": 0.2346, + "step": 3500 + }, + { + "epoch": 1.12, + "grad_norm": 0.9214788675308228, + "learning_rate": 6.2976329476701e-05, + "loss": 0.1714, + "step": 3510 + }, + { + "epoch": 1.12, + "grad_norm": 1.696925163269043, + "learning_rate": 6.28701836323108e-05, + "loss": 0.1306, + "step": 3520 + }, + { + "epoch": 1.12, + "grad_norm": 1.1808693408966064, + "learning_rate": 6.27640377879206e-05, + "loss": 0.1135, + "step": 3530 + }, + { + "epoch": 1.13, + "grad_norm": 4.710297107696533, + "learning_rate": 6.265789194353041e-05, + "loss": 0.158, + "step": 3540 + }, + { + "epoch": 1.13, + "grad_norm": 0.5521005988121033, + "learning_rate": 6.255174609914022e-05, + "loss": 0.3224, + "step": 3550 + }, + { + "epoch": 1.13, + "grad_norm": 2.172825336456299, + "learning_rate": 6.244560025475003e-05, + "loss": 0.0946, + "step": 3560 + }, + { + "epoch": 1.14, + "grad_norm": 1.8690552711486816, + "learning_rate": 6.233945441035983e-05, + "loss": 0.1972, + "step": 3570 + }, + { + "epoch": 1.14, + "grad_norm": 0.059970393776893616, + "learning_rate": 6.223330856596965e-05, + "loss": 0.0601, + "step": 3580 + }, + { + "epoch": 1.14, + "grad_norm": 0.0773802176117897, + "learning_rate": 6.212716272157945e-05, + "loss": 0.2881, + "step": 3590 + }, + { + "epoch": 1.15, + "grad_norm": 1.320061206817627, + "learning_rate": 6.202101687718927e-05, + "loss": 0.1966, + "step": 3600 + }, + { + "epoch": 1.15, + "grad_norm": 2.4339261054992676, + "learning_rate": 6.191487103279907e-05, + "loss": 0.1808, + "step": 3610 + }, + { + "epoch": 1.15, + "grad_norm": 5.3104729652404785, + "learning_rate": 6.180872518840888e-05, + "loss": 0.1737, + "step": 3620 + }, + { + "epoch": 1.16, + "grad_norm": 3.9139719009399414, + "learning_rate": 6.170257934401868e-05, + "loss": 0.239, + "step": 3630 + }, + { + "epoch": 1.16, + "grad_norm": 0.9480198621749878, + "learning_rate": 6.15964334996285e-05, + "loss": 0.1556, + "step": 3640 + }, + { + "epoch": 1.16, + "grad_norm": 0.807107150554657, + "learning_rate": 6.14902876552383e-05, + "loss": 0.131, + "step": 3650 + }, + { + "epoch": 1.17, + "grad_norm": 0.059983473271131516, + "learning_rate": 6.13841418108481e-05, + "loss": 0.1479, + "step": 3660 + }, + { + "epoch": 1.17, + "grad_norm": 0.7000637650489807, + "learning_rate": 6.127799596645791e-05, + "loss": 0.0861, + "step": 3670 + }, + { + "epoch": 1.17, + "grad_norm": 0.43273600935935974, + "learning_rate": 6.117185012206771e-05, + "loss": 0.1848, + "step": 3680 + }, + { + "epoch": 1.17, + "grad_norm": 0.056298673152923584, + "learning_rate": 6.106570427767753e-05, + "loss": 0.1313, + "step": 3690 + }, + { + "epoch": 1.18, + "grad_norm": 0.6714267134666443, + "learning_rate": 6.095955843328735e-05, + "loss": 0.2817, + "step": 3700 + }, + { + "epoch": 1.18, + "grad_norm": 2.8052423000335693, + "learning_rate": 6.085341258889715e-05, + "loss": 0.2095, + "step": 3710 + }, + { + "epoch": 1.18, + "grad_norm": 3.0490353107452393, + "learning_rate": 6.074726674450696e-05, + "loss": 0.2707, + "step": 3720 + }, + { + "epoch": 1.19, + "grad_norm": 2.3823633193969727, + "learning_rate": 6.0641120900116766e-05, + "loss": 0.1918, + "step": 3730 + }, + { + "epoch": 1.19, + "grad_norm": 5.9893293380737305, + "learning_rate": 6.0534975055726576e-05, + "loss": 0.1855, + "step": 3740 + }, + { + "epoch": 1.19, + "grad_norm": 5.253934383392334, + "learning_rate": 6.042882921133638e-05, + "loss": 0.1286, + "step": 3750 + }, + { + "epoch": 1.2, + "grad_norm": 3.3353893756866455, + "learning_rate": 6.0322683366946183e-05, + "loss": 0.1656, + "step": 3760 + }, + { + "epoch": 1.2, + "grad_norm": 1.5391966104507446, + "learning_rate": 6.0216537522555994e-05, + "loss": 0.1783, + "step": 3770 + }, + { + "epoch": 1.2, + "grad_norm": 3.3716678619384766, + "learning_rate": 6.01103916781658e-05, + "loss": 0.1025, + "step": 3780 + }, + { + "epoch": 1.21, + "grad_norm": 0.8058392405509949, + "learning_rate": 6.000424583377561e-05, + "loss": 0.1224, + "step": 3790 + }, + { + "epoch": 1.21, + "grad_norm": 1.5231162309646606, + "learning_rate": 5.989809998938541e-05, + "loss": 0.0579, + "step": 3800 + }, + { + "epoch": 1.21, + "grad_norm": 3.7527573108673096, + "learning_rate": 5.979195414499522e-05, + "loss": 0.3109, + "step": 3810 + }, + { + "epoch": 1.22, + "grad_norm": 1.884722113609314, + "learning_rate": 5.968580830060504e-05, + "loss": 0.2569, + "step": 3820 + }, + { + "epoch": 1.22, + "grad_norm": 1.2949138879776, + "learning_rate": 5.957966245621484e-05, + "loss": 0.2067, + "step": 3830 + }, + { + "epoch": 1.22, + "grad_norm": 1.9406439065933228, + "learning_rate": 5.9473516611824654e-05, + "loss": 0.1397, + "step": 3840 + }, + { + "epoch": 1.23, + "grad_norm": 3.048089027404785, + "learning_rate": 5.936737076743446e-05, + "loss": 0.1903, + "step": 3850 + }, + { + "epoch": 1.23, + "grad_norm": 2.7827141284942627, + "learning_rate": 5.926122492304427e-05, + "loss": 0.2375, + "step": 3860 + }, + { + "epoch": 1.23, + "grad_norm": 0.30664700269699097, + "learning_rate": 5.915507907865407e-05, + "loss": 0.2605, + "step": 3870 + }, + { + "epoch": 1.24, + "grad_norm": 5.038077354431152, + "learning_rate": 5.904893323426388e-05, + "loss": 0.2249, + "step": 3880 + }, + { + "epoch": 1.24, + "grad_norm": 0.5563170313835144, + "learning_rate": 5.8942787389873686e-05, + "loss": 0.1407, + "step": 3890 + }, + { + "epoch": 1.24, + "grad_norm": 3.5176491737365723, + "learning_rate": 5.8836641545483496e-05, + "loss": 0.1955, + "step": 3900 + }, + { + "epoch": 1.24, + "grad_norm": 0.16444259881973267, + "learning_rate": 5.87304957010933e-05, + "loss": 0.2973, + "step": 3910 + }, + { + "epoch": 1.25, + "grad_norm": 2.3163607120513916, + "learning_rate": 5.862434985670311e-05, + "loss": 0.1388, + "step": 3920 + }, + { + "epoch": 1.25, + "grad_norm": 2.4921140670776367, + "learning_rate": 5.8518204012312914e-05, + "loss": 0.2844, + "step": 3930 + }, + { + "epoch": 1.25, + "grad_norm": 6.664550304412842, + "learning_rate": 5.841205816792273e-05, + "loss": 0.5434, + "step": 3940 + }, + { + "epoch": 1.26, + "grad_norm": 0.27615758776664734, + "learning_rate": 5.830591232353254e-05, + "loss": 0.2716, + "step": 3950 + }, + { + "epoch": 1.26, + "grad_norm": 7.205143451690674, + "learning_rate": 5.8199766479142345e-05, + "loss": 0.1927, + "step": 3960 + }, + { + "epoch": 1.26, + "grad_norm": 2.423842191696167, + "learning_rate": 5.8093620634752156e-05, + "loss": 0.2013, + "step": 3970 + }, + { + "epoch": 1.27, + "grad_norm": 0.6563037037849426, + "learning_rate": 5.798747479036196e-05, + "loss": 0.2597, + "step": 3980 + }, + { + "epoch": 1.27, + "grad_norm": 2.216214418411255, + "learning_rate": 5.788132894597177e-05, + "loss": 0.1484, + "step": 3990 + }, + { + "epoch": 1.27, + "grad_norm": 0.21049724519252777, + "learning_rate": 5.7775183101581574e-05, + "loss": 0.1205, + "step": 4000 + }, + { + "epoch": 1.28, + "grad_norm": 1.838711142539978, + "learning_rate": 5.7669037257191384e-05, + "loss": 0.1806, + "step": 4010 + }, + { + "epoch": 1.28, + "grad_norm": 4.584275245666504, + "learning_rate": 5.756289141280119e-05, + "loss": 0.1459, + "step": 4020 + }, + { + "epoch": 1.28, + "grad_norm": 3.7076704502105713, + "learning_rate": 5.7456745568411e-05, + "loss": 0.2119, + "step": 4030 + }, + { + "epoch": 1.29, + "grad_norm": 4.600487232208252, + "learning_rate": 5.73505997240208e-05, + "loss": 0.1846, + "step": 4040 + }, + { + "epoch": 1.29, + "grad_norm": 2.9479613304138184, + "learning_rate": 5.724445387963061e-05, + "loss": 0.1373, + "step": 4050 + }, + { + "epoch": 1.29, + "grad_norm": 2.7824301719665527, + "learning_rate": 5.7138308035240416e-05, + "loss": 0.1573, + "step": 4060 + }, + { + "epoch": 1.3, + "grad_norm": 1.3697668313980103, + "learning_rate": 5.703216219085023e-05, + "loss": 0.1067, + "step": 4070 + }, + { + "epoch": 1.3, + "grad_norm": 4.134962558746338, + "learning_rate": 5.6926016346460044e-05, + "loss": 0.3154, + "step": 4080 + }, + { + "epoch": 1.3, + "grad_norm": 1.986623764038086, + "learning_rate": 5.681987050206985e-05, + "loss": 0.162, + "step": 4090 + }, + { + "epoch": 1.31, + "grad_norm": 1.7553232908248901, + "learning_rate": 5.671372465767966e-05, + "loss": 0.2197, + "step": 4100 + }, + { + "epoch": 1.31, + "grad_norm": 1.666942834854126, + "learning_rate": 5.660757881328946e-05, + "loss": 0.2144, + "step": 4110 + }, + { + "epoch": 1.31, + "grad_norm": 1.3620635271072388, + "learning_rate": 5.650143296889927e-05, + "loss": 0.2823, + "step": 4120 + }, + { + "epoch": 1.31, + "grad_norm": 3.4056193828582764, + "learning_rate": 5.6395287124509076e-05, + "loss": 0.3223, + "step": 4130 + }, + { + "epoch": 1.32, + "grad_norm": 0.8397992253303528, + "learning_rate": 5.6289141280118886e-05, + "loss": 0.1297, + "step": 4140 + }, + { + "epoch": 1.32, + "grad_norm": 0.09627294540405273, + "learning_rate": 5.618299543572869e-05, + "loss": 0.1154, + "step": 4150 + }, + { + "epoch": 1.32, + "grad_norm": 2.1529462337493896, + "learning_rate": 5.60768495913385e-05, + "loss": 0.1903, + "step": 4160 + }, + { + "epoch": 1.33, + "grad_norm": 0.42282378673553467, + "learning_rate": 5.5970703746948304e-05, + "loss": 0.0992, + "step": 4170 + }, + { + "epoch": 1.33, + "grad_norm": 0.34097906947135925, + "learning_rate": 5.5864557902558115e-05, + "loss": 0.2193, + "step": 4180 + }, + { + "epoch": 1.33, + "grad_norm": 0.11647669225931168, + "learning_rate": 5.575841205816793e-05, + "loss": 0.1511, + "step": 4190 + }, + { + "epoch": 1.34, + "grad_norm": 7.489476680755615, + "learning_rate": 5.5652266213777736e-05, + "loss": 0.182, + "step": 4200 + }, + { + "epoch": 1.34, + "grad_norm": 0.0627538189291954, + "learning_rate": 5.5546120369387546e-05, + "loss": 0.2056, + "step": 4210 + }, + { + "epoch": 1.34, + "grad_norm": 1.6038990020751953, + "learning_rate": 5.543997452499735e-05, + "loss": 0.317, + "step": 4220 + }, + { + "epoch": 1.35, + "grad_norm": 2.0296130180358887, + "learning_rate": 5.533382868060716e-05, + "loss": 0.221, + "step": 4230 + }, + { + "epoch": 1.35, + "grad_norm": 3.08427357673645, + "learning_rate": 5.5227682836216964e-05, + "loss": 0.309, + "step": 4240 + }, + { + "epoch": 1.35, + "grad_norm": 6.700926303863525, + "learning_rate": 5.5121536991826774e-05, + "loss": 0.3862, + "step": 4250 + }, + { + "epoch": 1.36, + "grad_norm": 3.3283987045288086, + "learning_rate": 5.501539114743658e-05, + "loss": 0.1449, + "step": 4260 + }, + { + "epoch": 1.36, + "grad_norm": 2.7718186378479004, + "learning_rate": 5.490924530304639e-05, + "loss": 0.1237, + "step": 4270 + }, + { + "epoch": 1.36, + "grad_norm": 1.7264149188995361, + "learning_rate": 5.480309945865619e-05, + "loss": 0.0537, + "step": 4280 + }, + { + "epoch": 1.37, + "grad_norm": 2.8292267322540283, + "learning_rate": 5.4696953614266e-05, + "loss": 0.1139, + "step": 4290 + }, + { + "epoch": 1.37, + "grad_norm": 2.6377663612365723, + "learning_rate": 5.4590807769875806e-05, + "loss": 0.1632, + "step": 4300 + }, + { + "epoch": 1.37, + "grad_norm": 0.1827862560749054, + "learning_rate": 5.4484661925485624e-05, + "loss": 0.1809, + "step": 4310 + }, + { + "epoch": 1.38, + "grad_norm": 5.187005996704102, + "learning_rate": 5.4378516081095434e-05, + "loss": 0.1735, + "step": 4320 + }, + { + "epoch": 1.38, + "grad_norm": 2.064953327178955, + "learning_rate": 5.427237023670524e-05, + "loss": 0.3226, + "step": 4330 + }, + { + "epoch": 1.38, + "grad_norm": 0.03769757226109505, + "learning_rate": 5.416622439231505e-05, + "loss": 0.1563, + "step": 4340 + }, + { + "epoch": 1.38, + "grad_norm": 5.220246315002441, + "learning_rate": 5.406007854792485e-05, + "loss": 0.2403, + "step": 4350 + }, + { + "epoch": 1.39, + "grad_norm": 0.1891440451145172, + "learning_rate": 5.395393270353466e-05, + "loss": 0.1741, + "step": 4360 + }, + { + "epoch": 1.39, + "grad_norm": 5.661322116851807, + "learning_rate": 5.3847786859144466e-05, + "loss": 0.1514, + "step": 4370 + }, + { + "epoch": 1.39, + "grad_norm": 8.325531005859375, + "learning_rate": 5.3741641014754277e-05, + "loss": 0.1954, + "step": 4380 + }, + { + "epoch": 1.4, + "grad_norm": 3.1849327087402344, + "learning_rate": 5.363549517036408e-05, + "loss": 0.2667, + "step": 4390 + }, + { + "epoch": 1.4, + "grad_norm": 4.426061153411865, + "learning_rate": 5.352934932597389e-05, + "loss": 0.1621, + "step": 4400 + }, + { + "epoch": 1.4, + "grad_norm": 0.08511369675397873, + "learning_rate": 5.3423203481583694e-05, + "loss": 0.2384, + "step": 4410 + }, + { + "epoch": 1.41, + "grad_norm": 2.6035985946655273, + "learning_rate": 5.3317057637193505e-05, + "loss": 0.2029, + "step": 4420 + }, + { + "epoch": 1.41, + "grad_norm": 3.637746810913086, + "learning_rate": 5.321091179280332e-05, + "loss": 0.2054, + "step": 4430 + }, + { + "epoch": 1.41, + "grad_norm": 2.6887290477752686, + "learning_rate": 5.3104765948413126e-05, + "loss": 0.194, + "step": 4440 + }, + { + "epoch": 1.42, + "grad_norm": 0.5362237691879272, + "learning_rate": 5.2998620104022936e-05, + "loss": 0.1243, + "step": 4450 + }, + { + "epoch": 1.42, + "grad_norm": 6.602662086486816, + "learning_rate": 5.289247425963274e-05, + "loss": 0.1005, + "step": 4460 + }, + { + "epoch": 1.42, + "grad_norm": 0.16585449874401093, + "learning_rate": 5.278632841524255e-05, + "loss": 0.116, + "step": 4470 + }, + { + "epoch": 1.43, + "grad_norm": 3.062458038330078, + "learning_rate": 5.2690797155291374e-05, + "loss": 0.2236, + "step": 4480 + }, + { + "epoch": 1.43, + "grad_norm": 3.1578338146209717, + "learning_rate": 5.258465131090118e-05, + "loss": 0.1248, + "step": 4490 + }, + { + "epoch": 1.43, + "grad_norm": 6.487752914428711, + "learning_rate": 5.247850546651099e-05, + "loss": 0.2268, + "step": 4500 + }, + { + "epoch": 1.44, + "grad_norm": 4.561209678649902, + "learning_rate": 5.237235962212079e-05, + "loss": 0.3183, + "step": 4510 + }, + { + "epoch": 1.44, + "grad_norm": 1.6614716053009033, + "learning_rate": 5.22662137777306e-05, + "loss": 0.2555, + "step": 4520 + }, + { + "epoch": 1.44, + "grad_norm": 2.4814791679382324, + "learning_rate": 5.216006793334042e-05, + "loss": 0.1524, + "step": 4530 + }, + { + "epoch": 1.45, + "grad_norm": 0.17691956460475922, + "learning_rate": 5.205392208895022e-05, + "loss": 0.1934, + "step": 4540 + }, + { + "epoch": 1.45, + "grad_norm": 5.082562446594238, + "learning_rate": 5.1947776244560033e-05, + "loss": 0.4279, + "step": 4550 + }, + { + "epoch": 1.45, + "grad_norm": 3.106387138366699, + "learning_rate": 5.184163040016984e-05, + "loss": 0.1194, + "step": 4560 + }, + { + "epoch": 1.45, + "grad_norm": 7.02073335647583, + "learning_rate": 5.173548455577965e-05, + "loss": 0.1109, + "step": 4570 + }, + { + "epoch": 1.46, + "grad_norm": 0.2526942193508148, + "learning_rate": 5.162933871138945e-05, + "loss": 0.1913, + "step": 4580 + }, + { + "epoch": 1.46, + "grad_norm": 4.575504302978516, + "learning_rate": 5.152319286699926e-05, + "loss": 0.2151, + "step": 4590 + }, + { + "epoch": 1.46, + "grad_norm": 2.3890509605407715, + "learning_rate": 5.1417047022609066e-05, + "loss": 0.2336, + "step": 4600 + }, + { + "epoch": 1.47, + "grad_norm": 0.8267619013786316, + "learning_rate": 5.1310901178218876e-05, + "loss": 0.0856, + "step": 4610 + }, + { + "epoch": 1.47, + "grad_norm": 4.056538105010986, + "learning_rate": 5.120475533382868e-05, + "loss": 0.1947, + "step": 4620 + }, + { + "epoch": 1.47, + "grad_norm": 6.964923858642578, + "learning_rate": 5.109860948943849e-05, + "loss": 0.1195, + "step": 4630 + }, + { + "epoch": 1.48, + "grad_norm": 2.813004970550537, + "learning_rate": 5.100307822948732e-05, + "loss": 0.1225, + "step": 4640 + }, + { + "epoch": 1.48, + "grad_norm": 2.654339075088501, + "learning_rate": 5.089693238509713e-05, + "loss": 0.1006, + "step": 4650 + }, + { + "epoch": 1.48, + "grad_norm": 6.5991644859313965, + "learning_rate": 5.0790786540706934e-05, + "loss": 0.2646, + "step": 4660 + }, + { + "epoch": 1.49, + "grad_norm": 5.099368572235107, + "learning_rate": 5.0684640696316745e-05, + "loss": 0.2748, + "step": 4670 + }, + { + "epoch": 1.49, + "grad_norm": 5.0444655418396, + "learning_rate": 5.057849485192655e-05, + "loss": 0.2295, + "step": 4680 + }, + { + "epoch": 1.49, + "grad_norm": 0.07431354373693466, + "learning_rate": 5.047234900753636e-05, + "loss": 0.1348, + "step": 4690 + }, + { + "epoch": 1.5, + "grad_norm": 0.1366661787033081, + "learning_rate": 5.036620316314616e-05, + "loss": 0.1164, + "step": 4700 + }, + { + "epoch": 1.5, + "grad_norm": 4.550073146820068, + "learning_rate": 5.026005731875597e-05, + "loss": 0.2377, + "step": 4710 + }, + { + "epoch": 1.5, + "grad_norm": 0.12663549184799194, + "learning_rate": 5.015391147436578e-05, + "loss": 0.0871, + "step": 4720 + }, + { + "epoch": 1.51, + "grad_norm": 5.191462993621826, + "learning_rate": 5.004776562997559e-05, + "loss": 0.2778, + "step": 4730 + }, + { + "epoch": 1.51, + "grad_norm": 2.7582337856292725, + "learning_rate": 4.99416197855854e-05, + "loss": 0.203, + "step": 4740 + }, + { + "epoch": 1.51, + "grad_norm": 7.114481449127197, + "learning_rate": 4.98354739411952e-05, + "loss": 0.1426, + "step": 4750 + }, + { + "epoch": 1.52, + "grad_norm": 0.41717416048049927, + "learning_rate": 4.972932809680501e-05, + "loss": 0.2009, + "step": 4760 + }, + { + "epoch": 1.52, + "grad_norm": 1.8175145387649536, + "learning_rate": 4.9623182252414816e-05, + "loss": 0.1152, + "step": 4770 + }, + { + "epoch": 1.52, + "grad_norm": 3.585702419281006, + "learning_rate": 4.951703640802463e-05, + "loss": 0.1615, + "step": 4780 + }, + { + "epoch": 1.52, + "grad_norm": 0.385105699300766, + "learning_rate": 4.9410890563634437e-05, + "loss": 0.1569, + "step": 4790 + }, + { + "epoch": 1.53, + "grad_norm": 2.8163392543792725, + "learning_rate": 4.930474471924425e-05, + "loss": 0.0942, + "step": 4800 + }, + { + "epoch": 1.53, + "grad_norm": 5.181662082672119, + "learning_rate": 4.919859887485405e-05, + "loss": 0.2076, + "step": 4810 + }, + { + "epoch": 1.53, + "grad_norm": 0.15229104459285736, + "learning_rate": 4.909245303046386e-05, + "loss": 0.2249, + "step": 4820 + }, + { + "epoch": 1.54, + "grad_norm": 3.2373440265655518, + "learning_rate": 4.8986307186073665e-05, + "loss": 0.5439, + "step": 4830 + }, + { + "epoch": 1.54, + "grad_norm": 1.7857202291488647, + "learning_rate": 4.8880161341683475e-05, + "loss": 0.1806, + "step": 4840 + }, + { + "epoch": 1.54, + "grad_norm": 1.1035951375961304, + "learning_rate": 4.8774015497293286e-05, + "loss": 0.1309, + "step": 4850 + }, + { + "epoch": 1.55, + "grad_norm": 7.660123825073242, + "learning_rate": 4.866786965290309e-05, + "loss": 0.1587, + "step": 4860 + }, + { + "epoch": 1.55, + "grad_norm": 0.20227286219596863, + "learning_rate": 4.85617238085129e-05, + "loss": 0.3051, + "step": 4870 + }, + { + "epoch": 1.55, + "grad_norm": 6.558931827545166, + "learning_rate": 4.8455577964122704e-05, + "loss": 0.2137, + "step": 4880 + }, + { + "epoch": 1.56, + "grad_norm": 2.683018922805786, + "learning_rate": 4.8349432119732514e-05, + "loss": 0.1528, + "step": 4890 + }, + { + "epoch": 1.56, + "grad_norm": 1.2843786478042603, + "learning_rate": 4.8243286275342325e-05, + "loss": 0.1525, + "step": 4900 + }, + { + "epoch": 1.56, + "grad_norm": 0.9824750423431396, + "learning_rate": 4.8137140430952135e-05, + "loss": 0.1682, + "step": 4910 + }, + { + "epoch": 1.57, + "grad_norm": 1.0165822505950928, + "learning_rate": 4.803099458656194e-05, + "loss": 0.2397, + "step": 4920 + }, + { + "epoch": 1.57, + "grad_norm": 2.0921578407287598, + "learning_rate": 4.792484874217175e-05, + "loss": 0.2342, + "step": 4930 + }, + { + "epoch": 1.57, + "grad_norm": 2.5232343673706055, + "learning_rate": 4.781870289778155e-05, + "loss": 0.2216, + "step": 4940 + }, + { + "epoch": 1.58, + "grad_norm": 5.7156782150268555, + "learning_rate": 4.7712557053391363e-05, + "loss": 0.2342, + "step": 4950 + }, + { + "epoch": 1.58, + "grad_norm": 3.128016233444214, + "learning_rate": 4.760641120900117e-05, + "loss": 0.1759, + "step": 4960 + }, + { + "epoch": 1.58, + "grad_norm": 2.2040598392486572, + "learning_rate": 4.750026536461098e-05, + "loss": 0.1414, + "step": 4970 + }, + { + "epoch": 1.59, + "grad_norm": 2.1795644760131836, + "learning_rate": 4.739411952022079e-05, + "loss": 0.1648, + "step": 4980 + }, + { + "epoch": 1.59, + "grad_norm": 5.399777412414551, + "learning_rate": 4.728797367583059e-05, + "loss": 0.1344, + "step": 4990 + }, + { + "epoch": 1.59, + "grad_norm": 0.06098851189017296, + "learning_rate": 4.71818278314404e-05, + "loss": 0.1188, + "step": 5000 + }, + { + "epoch": 1.59, + "grad_norm": 3.174159049987793, + "learning_rate": 4.7075681987050206e-05, + "loss": 0.3419, + "step": 5010 + }, + { + "epoch": 1.6, + "grad_norm": 4.566168308258057, + "learning_rate": 4.6969536142660016e-05, + "loss": 0.2582, + "step": 5020 + }, + { + "epoch": 1.6, + "grad_norm": 0.5227226614952087, + "learning_rate": 4.686339029826983e-05, + "loss": 0.1691, + "step": 5030 + }, + { + "epoch": 1.6, + "grad_norm": 5.8460869789123535, + "learning_rate": 4.675724445387963e-05, + "loss": 0.1399, + "step": 5040 + }, + { + "epoch": 1.61, + "grad_norm": 2.2399487495422363, + "learning_rate": 4.665109860948944e-05, + "loss": 0.1549, + "step": 5050 + }, + { + "epoch": 1.61, + "grad_norm": 2.9508166313171387, + "learning_rate": 4.6544952765099245e-05, + "loss": 0.1665, + "step": 5060 + }, + { + "epoch": 1.61, + "grad_norm": 2.5230746269226074, + "learning_rate": 4.6438806920709055e-05, + "loss": 0.2108, + "step": 5070 + }, + { + "epoch": 1.62, + "grad_norm": 0.5516650080680847, + "learning_rate": 4.633266107631886e-05, + "loss": 0.2275, + "step": 5080 + }, + { + "epoch": 1.62, + "grad_norm": 8.398303985595703, + "learning_rate": 4.6226515231928676e-05, + "loss": 0.1801, + "step": 5090 + }, + { + "epoch": 1.62, + "grad_norm": 0.2512928247451782, + "learning_rate": 4.612036938753848e-05, + "loss": 0.2654, + "step": 5100 + }, + { + "epoch": 1.63, + "grad_norm": 5.312344551086426, + "learning_rate": 4.601422354314829e-05, + "loss": 0.2992, + "step": 5110 + }, + { + "epoch": 1.63, + "grad_norm": 1.728023648262024, + "learning_rate": 4.5908077698758094e-05, + "loss": 0.1638, + "step": 5120 + }, + { + "epoch": 1.63, + "grad_norm": 1.6222649812698364, + "learning_rate": 4.5801931854367904e-05, + "loss": 0.2216, + "step": 5130 + }, + { + "epoch": 1.64, + "grad_norm": 0.5581383109092712, + "learning_rate": 4.569578600997771e-05, + "loss": 0.2467, + "step": 5140 + }, + { + "epoch": 1.64, + "grad_norm": 3.051811456680298, + "learning_rate": 4.558964016558752e-05, + "loss": 0.1486, + "step": 5150 + }, + { + "epoch": 1.64, + "grad_norm": 0.6013765931129456, + "learning_rate": 4.548349432119733e-05, + "loss": 0.123, + "step": 5160 + }, + { + "epoch": 1.65, + "grad_norm": 3.8984789848327637, + "learning_rate": 4.537734847680713e-05, + "loss": 0.3698, + "step": 5170 + }, + { + "epoch": 1.65, + "grad_norm": 1.3346749544143677, + "learning_rate": 4.527120263241694e-05, + "loss": 0.1814, + "step": 5180 + }, + { + "epoch": 1.65, + "grad_norm": 11.491423606872559, + "learning_rate": 4.516505678802675e-05, + "loss": 0.1745, + "step": 5190 + }, + { + "epoch": 1.66, + "grad_norm": 2.358656883239746, + "learning_rate": 4.505891094363656e-05, + "loss": 0.2734, + "step": 5200 + }, + { + "epoch": 1.66, + "grad_norm": 3.3352041244506836, + "learning_rate": 4.495276509924637e-05, + "loss": 0.2054, + "step": 5210 + }, + { + "epoch": 1.66, + "grad_norm": 0.052441373467445374, + "learning_rate": 4.484661925485618e-05, + "loss": 0.1389, + "step": 5220 + }, + { + "epoch": 1.66, + "grad_norm": 0.20047003030776978, + "learning_rate": 4.474047341046598e-05, + "loss": 0.1197, + "step": 5230 + }, + { + "epoch": 1.67, + "grad_norm": 1.4837030172348022, + "learning_rate": 4.463432756607579e-05, + "loss": 0.2446, + "step": 5240 + }, + { + "epoch": 1.67, + "grad_norm": 0.3104861378669739, + "learning_rate": 4.4528181721685596e-05, + "loss": 0.1842, + "step": 5250 + }, + { + "epoch": 1.67, + "grad_norm": 7.954286098480225, + "learning_rate": 4.442203587729541e-05, + "loss": 0.1221, + "step": 5260 + }, + { + "epoch": 1.68, + "grad_norm": 0.03400198742747307, + "learning_rate": 4.431589003290522e-05, + "loss": 0.1513, + "step": 5270 + }, + { + "epoch": 1.68, + "grad_norm": 0.08371475338935852, + "learning_rate": 4.420974418851502e-05, + "loss": 0.2098, + "step": 5280 + }, + { + "epoch": 1.68, + "grad_norm": 1.2470760345458984, + "learning_rate": 4.410359834412483e-05, + "loss": 0.117, + "step": 5290 + }, + { + "epoch": 1.69, + "grad_norm": 1.5426656007766724, + "learning_rate": 4.3997452499734635e-05, + "loss": 0.1826, + "step": 5300 + }, + { + "epoch": 1.69, + "grad_norm": 3.978109121322632, + "learning_rate": 4.3891306655344445e-05, + "loss": 0.1103, + "step": 5310 + }, + { + "epoch": 1.69, + "grad_norm": 1.6321693658828735, + "learning_rate": 4.378516081095425e-05, + "loss": 0.151, + "step": 5320 + }, + { + "epoch": 1.7, + "grad_norm": 2.555723190307617, + "learning_rate": 4.3679014966564066e-05, + "loss": 0.1786, + "step": 5330 + }, + { + "epoch": 1.7, + "grad_norm": 0.2461155354976654, + "learning_rate": 4.357286912217387e-05, + "loss": 0.1914, + "step": 5340 + }, + { + "epoch": 1.7, + "grad_norm": 0.41670894622802734, + "learning_rate": 4.346672327778368e-05, + "loss": 0.2582, + "step": 5350 + }, + { + "epoch": 1.71, + "grad_norm": 4.785902976989746, + "learning_rate": 4.3360577433393484e-05, + "loss": 0.0911, + "step": 5360 + }, + { + "epoch": 1.71, + "grad_norm": 4.179080963134766, + "learning_rate": 4.3254431589003295e-05, + "loss": 0.2264, + "step": 5370 + }, + { + "epoch": 1.71, + "grad_norm": 0.9344226717948914, + "learning_rate": 4.31482857446131e-05, + "loss": 0.2003, + "step": 5380 + }, + { + "epoch": 1.72, + "grad_norm": 0.3643859624862671, + "learning_rate": 4.304213990022291e-05, + "loss": 0.1, + "step": 5390 + }, + { + "epoch": 1.72, + "grad_norm": 2.3688154220581055, + "learning_rate": 4.293599405583272e-05, + "loss": 0.2461, + "step": 5400 + }, + { + "epoch": 1.72, + "grad_norm": 4.223112106323242, + "learning_rate": 4.282984821144252e-05, + "loss": 0.1316, + "step": 5410 + }, + { + "epoch": 1.73, + "grad_norm": 1.52751886844635, + "learning_rate": 4.2723702367052333e-05, + "loss": 0.162, + "step": 5420 + }, + { + "epoch": 1.73, + "grad_norm": 0.06534834951162338, + "learning_rate": 4.261755652266214e-05, + "loss": 0.1787, + "step": 5430 + }, + { + "epoch": 1.73, + "grad_norm": 0.0435919463634491, + "learning_rate": 4.251141067827195e-05, + "loss": 0.2196, + "step": 5440 + }, + { + "epoch": 1.73, + "grad_norm": 1.0877362489700317, + "learning_rate": 4.240526483388176e-05, + "loss": 0.2829, + "step": 5450 + }, + { + "epoch": 1.74, + "grad_norm": 1.7220368385314941, + "learning_rate": 4.229911898949156e-05, + "loss": 0.211, + "step": 5460 + }, + { + "epoch": 1.74, + "grad_norm": 1.6200969219207764, + "learning_rate": 4.219297314510137e-05, + "loss": 0.2046, + "step": 5470 + }, + { + "epoch": 1.74, + "grad_norm": 2.376384735107422, + "learning_rate": 4.2086827300711176e-05, + "loss": 0.2518, + "step": 5480 + }, + { + "epoch": 1.75, + "grad_norm": 1.6646453142166138, + "learning_rate": 4.1980681456320986e-05, + "loss": 0.1542, + "step": 5490 + }, + { + "epoch": 1.75, + "grad_norm": 0.580792248249054, + "learning_rate": 4.187453561193079e-05, + "loss": 0.1503, + "step": 5500 + }, + { + "epoch": 1.75, + "grad_norm": 2.325477123260498, + "learning_rate": 4.176838976754061e-05, + "loss": 0.1867, + "step": 5510 + }, + { + "epoch": 1.76, + "grad_norm": 3.004499673843384, + "learning_rate": 4.166224392315041e-05, + "loss": 0.1816, + "step": 5520 + }, + { + "epoch": 1.76, + "grad_norm": 1.7592769861221313, + "learning_rate": 4.155609807876022e-05, + "loss": 0.2155, + "step": 5530 + }, + { + "epoch": 1.76, + "grad_norm": 0.4255143105983734, + "learning_rate": 4.1449952234370025e-05, + "loss": 0.2298, + "step": 5540 + }, + { + "epoch": 1.77, + "grad_norm": 4.217332363128662, + "learning_rate": 4.1343806389979836e-05, + "loss": 0.1263, + "step": 5550 + }, + { + "epoch": 1.77, + "grad_norm": 1.6670517921447754, + "learning_rate": 4.123766054558964e-05, + "loss": 0.1993, + "step": 5560 + }, + { + "epoch": 1.77, + "grad_norm": 0.2432798445224762, + "learning_rate": 4.113151470119945e-05, + "loss": 0.1992, + "step": 5570 + }, + { + "epoch": 1.78, + "grad_norm": 5.0905070304870605, + "learning_rate": 4.102536885680926e-05, + "loss": 0.1381, + "step": 5580 + }, + { + "epoch": 1.78, + "grad_norm": 12.299093246459961, + "learning_rate": 4.0919223012419064e-05, + "loss": 0.2233, + "step": 5590 + }, + { + "epoch": 1.78, + "grad_norm": 0.27092546224594116, + "learning_rate": 4.0813077168028874e-05, + "loss": 0.1675, + "step": 5600 + }, + { + "epoch": 1.79, + "grad_norm": 3.4481306076049805, + "learning_rate": 4.070693132363868e-05, + "loss": 0.3113, + "step": 5610 + }, + { + "epoch": 1.79, + "grad_norm": 12.642804145812988, + "learning_rate": 4.060078547924849e-05, + "loss": 0.1557, + "step": 5620 + }, + { + "epoch": 1.79, + "grad_norm": 4.341307163238525, + "learning_rate": 4.049463963485829e-05, + "loss": 0.0825, + "step": 5630 + }, + { + "epoch": 1.8, + "grad_norm": 0.728386402130127, + "learning_rate": 4.038849379046811e-05, + "loss": 0.1589, + "step": 5640 + }, + { + "epoch": 1.8, + "grad_norm": 4.2692084312438965, + "learning_rate": 4.028234794607791e-05, + "loss": 0.0908, + "step": 5650 + }, + { + "epoch": 1.8, + "grad_norm": 3.5218265056610107, + "learning_rate": 4.0176202101687724e-05, + "loss": 0.2008, + "step": 5660 + }, + { + "epoch": 1.8, + "grad_norm": 0.6934779286384583, + "learning_rate": 4.007005625729753e-05, + "loss": 0.1652, + "step": 5670 + }, + { + "epoch": 1.81, + "grad_norm": 7.079185485839844, + "learning_rate": 3.996391041290734e-05, + "loss": 0.1854, + "step": 5680 + }, + { + "epoch": 1.81, + "grad_norm": 2.6828112602233887, + "learning_rate": 3.985776456851714e-05, + "loss": 0.0911, + "step": 5690 + }, + { + "epoch": 1.81, + "grad_norm": 5.049779891967773, + "learning_rate": 3.975161872412695e-05, + "loss": 0.1191, + "step": 5700 + }, + { + "epoch": 1.82, + "grad_norm": 2.4732673168182373, + "learning_rate": 3.9656087464175775e-05, + "loss": 0.2192, + "step": 5710 + }, + { + "epoch": 1.82, + "grad_norm": 0.11808130145072937, + "learning_rate": 3.9549941619785586e-05, + "loss": 0.1782, + "step": 5720 + }, + { + "epoch": 1.82, + "grad_norm": 3.8879833221435547, + "learning_rate": 3.944379577539539e-05, + "loss": 0.1692, + "step": 5730 + }, + { + "epoch": 1.83, + "grad_norm": 3.667048931121826, + "learning_rate": 3.933764993100521e-05, + "loss": 0.1236, + "step": 5740 + }, + { + "epoch": 1.83, + "grad_norm": 4.494665622711182, + "learning_rate": 3.923150408661501e-05, + "loss": 0.2373, + "step": 5750 + }, + { + "epoch": 1.83, + "grad_norm": 0.3976966440677643, + "learning_rate": 3.912535824222482e-05, + "loss": 0.2805, + "step": 5760 + }, + { + "epoch": 1.84, + "grad_norm": 2.046142578125, + "learning_rate": 3.9019212397834625e-05, + "loss": 0.1198, + "step": 5770 + }, + { + "epoch": 1.84, + "grad_norm": 0.27937573194503784, + "learning_rate": 3.8913066553444435e-05, + "loss": 0.1443, + "step": 5780 + }, + { + "epoch": 1.84, + "grad_norm": 6.109045028686523, + "learning_rate": 3.880692070905424e-05, + "loss": 0.3341, + "step": 5790 + }, + { + "epoch": 1.85, + "grad_norm": 0.7306396961212158, + "learning_rate": 3.870077486466405e-05, + "loss": 0.1208, + "step": 5800 + }, + { + "epoch": 1.85, + "grad_norm": 1.7087950706481934, + "learning_rate": 3.859462902027386e-05, + "loss": 0.1464, + "step": 5810 + }, + { + "epoch": 1.85, + "grad_norm": 0.5200537443161011, + "learning_rate": 3.8488483175883663e-05, + "loss": 0.1639, + "step": 5820 + }, + { + "epoch": 1.86, + "grad_norm": 6.455096244812012, + "learning_rate": 3.8382337331493474e-05, + "loss": 0.1885, + "step": 5830 + }, + { + "epoch": 1.86, + "grad_norm": 7.437272548675537, + "learning_rate": 3.827619148710328e-05, + "loss": 0.1916, + "step": 5840 + }, + { + "epoch": 1.86, + "grad_norm": 6.395534515380859, + "learning_rate": 3.817004564271309e-05, + "loss": 0.2988, + "step": 5850 + }, + { + "epoch": 1.87, + "grad_norm": 20.61446762084961, + "learning_rate": 3.80638997983229e-05, + "loss": 0.0853, + "step": 5860 + }, + { + "epoch": 1.87, + "grad_norm": 1.0395785570144653, + "learning_rate": 3.795775395393271e-05, + "loss": 0.2113, + "step": 5870 + }, + { + "epoch": 1.87, + "grad_norm": 8.83860969543457, + "learning_rate": 3.785160810954251e-05, + "loss": 0.1904, + "step": 5880 + }, + { + "epoch": 1.87, + "grad_norm": 5.42601203918457, + "learning_rate": 3.774546226515232e-05, + "loss": 0.3887, + "step": 5890 + }, + { + "epoch": 1.88, + "grad_norm": 3.3505442142486572, + "learning_rate": 3.763931642076213e-05, + "loss": 0.1397, + "step": 5900 + }, + { + "epoch": 1.88, + "grad_norm": 4.929141521453857, + "learning_rate": 3.753317057637194e-05, + "loss": 0.2773, + "step": 5910 + }, + { + "epoch": 1.88, + "grad_norm": 2.1540703773498535, + "learning_rate": 3.742702473198175e-05, + "loss": 0.1679, + "step": 5920 + }, + { + "epoch": 1.89, + "grad_norm": 10.82689094543457, + "learning_rate": 3.732087888759155e-05, + "loss": 0.1776, + "step": 5930 + }, + { + "epoch": 1.89, + "grad_norm": 3.0525174140930176, + "learning_rate": 3.721473304320136e-05, + "loss": 0.1619, + "step": 5940 + }, + { + "epoch": 1.89, + "grad_norm": 5.296212196350098, + "learning_rate": 3.7108587198811166e-05, + "loss": 0.3294, + "step": 5950 + }, + { + "epoch": 1.9, + "grad_norm": 3.4226958751678467, + "learning_rate": 3.7002441354420976e-05, + "loss": 0.3229, + "step": 5960 + }, + { + "epoch": 1.9, + "grad_norm": 0.4734908938407898, + "learning_rate": 3.689629551003078e-05, + "loss": 0.1179, + "step": 5970 + }, + { + "epoch": 1.9, + "grad_norm": 5.436024188995361, + "learning_rate": 3.67901496656406e-05, + "loss": 0.1892, + "step": 5980 + }, + { + "epoch": 1.91, + "grad_norm": 5.233070373535156, + "learning_rate": 3.66840038212504e-05, + "loss": 0.2054, + "step": 5990 + }, + { + "epoch": 1.91, + "grad_norm": 0.5661432147026062, + "learning_rate": 3.657785797686021e-05, + "loss": 0.2202, + "step": 6000 + }, + { + "epoch": 1.91, + "grad_norm": 0.23524077236652374, + "learning_rate": 3.6471712132470015e-05, + "loss": 0.2318, + "step": 6010 + }, + { + "epoch": 1.92, + "grad_norm": 0.05953243002295494, + "learning_rate": 3.6365566288079825e-05, + "loss": 0.2486, + "step": 6020 + }, + { + "epoch": 1.92, + "grad_norm": 1.3823449611663818, + "learning_rate": 3.625942044368963e-05, + "loss": 0.1171, + "step": 6030 + }, + { + "epoch": 1.92, + "grad_norm": 7.733388423919678, + "learning_rate": 3.615327459929944e-05, + "loss": 0.2469, + "step": 6040 + }, + { + "epoch": 1.93, + "grad_norm": 1.4917621612548828, + "learning_rate": 3.604712875490925e-05, + "loss": 0.2045, + "step": 6050 + }, + { + "epoch": 1.93, + "grad_norm": 7.689728736877441, + "learning_rate": 3.5940982910519054e-05, + "loss": 0.1648, + "step": 6060 + }, + { + "epoch": 1.93, + "grad_norm": 2.2216577529907227, + "learning_rate": 3.5834837066128864e-05, + "loss": 0.2779, + "step": 6070 + }, + { + "epoch": 1.94, + "grad_norm": 1.7362425327301025, + "learning_rate": 3.572869122173867e-05, + "loss": 0.1664, + "step": 6080 + }, + { + "epoch": 1.94, + "grad_norm": 4.933811187744141, + "learning_rate": 3.562254537734848e-05, + "loss": 0.293, + "step": 6090 + }, + { + "epoch": 1.94, + "grad_norm": 4.054910182952881, + "learning_rate": 3.551639953295829e-05, + "loss": 0.1539, + "step": 6100 + }, + { + "epoch": 1.94, + "grad_norm": 0.9219651222229004, + "learning_rate": 3.541025368856809e-05, + "loss": 0.1111, + "step": 6110 + }, + { + "epoch": 1.95, + "grad_norm": 4.558506488800049, + "learning_rate": 3.53041078441779e-05, + "loss": 0.1783, + "step": 6120 + }, + { + "epoch": 1.95, + "grad_norm": 2.6951773166656494, + "learning_rate": 3.5197961999787707e-05, + "loss": 0.2916, + "step": 6130 + }, + { + "epoch": 1.95, + "grad_norm": 0.9989050626754761, + "learning_rate": 3.509181615539752e-05, + "loss": 0.2099, + "step": 6140 + }, + { + "epoch": 1.96, + "grad_norm": 0.08494656533002853, + "learning_rate": 3.498567031100732e-05, + "loss": 0.1255, + "step": 6150 + }, + { + "epoch": 1.96, + "grad_norm": 0.20273062586784363, + "learning_rate": 3.487952446661714e-05, + "loss": 0.1523, + "step": 6160 + }, + { + "epoch": 1.96, + "grad_norm": 0.2878829538822174, + "learning_rate": 3.477337862222694e-05, + "loss": 0.1732, + "step": 6170 + }, + { + "epoch": 1.97, + "grad_norm": 2.026616096496582, + "learning_rate": 3.466723277783675e-05, + "loss": 0.1037, + "step": 6180 + }, + { + "epoch": 1.97, + "grad_norm": 0.8350101709365845, + "learning_rate": 3.4561086933446556e-05, + "loss": 0.1169, + "step": 6190 + }, + { + "epoch": 1.97, + "grad_norm": 0.6492775082588196, + "learning_rate": 3.4454941089056366e-05, + "loss": 0.1758, + "step": 6200 + }, + { + "epoch": 1.98, + "grad_norm": 4.830353736877441, + "learning_rate": 3.434879524466617e-05, + "loss": 0.3367, + "step": 6210 + }, + { + "epoch": 1.98, + "grad_norm": 5.267330169677734, + "learning_rate": 3.424264940027598e-05, + "loss": 0.1753, + "step": 6220 + }, + { + "epoch": 1.98, + "grad_norm": 0.11368358880281448, + "learning_rate": 3.413650355588579e-05, + "loss": 0.2409, + "step": 6230 + }, + { + "epoch": 1.99, + "grad_norm": 0.10408168286085129, + "learning_rate": 3.4030357711495595e-05, + "loss": 0.1407, + "step": 6240 + }, + { + "epoch": 1.99, + "grad_norm": 4.495917320251465, + "learning_rate": 3.3924211867105405e-05, + "loss": 0.1504, + "step": 6250 + }, + { + "epoch": 1.99, + "grad_norm": 0.16925585269927979, + "learning_rate": 3.381806602271521e-05, + "loss": 0.1323, + "step": 6260 + }, + { + "epoch": 2.0, + "grad_norm": 2.5475289821624756, + "learning_rate": 3.371192017832502e-05, + "loss": 0.1902, + "step": 6270 + }, + { + "epoch": 2.0, + "grad_norm": 2.21207332611084, + "learning_rate": 3.360577433393483e-05, + "loss": 0.1019, + "step": 6280 + }, + { + "epoch": 2.0, + "grad_norm": 2.7308425903320312, + "learning_rate": 3.349962848954464e-05, + "loss": 0.1368, + "step": 6290 + }, + { + "epoch": 2.01, + "grad_norm": 0.8695929646492004, + "learning_rate": 3.3393482645154444e-05, + "loss": 0.1979, + "step": 6300 + }, + { + "epoch": 2.01, + "grad_norm": 5.150228977203369, + "learning_rate": 3.3287336800764254e-05, + "loss": 0.1237, + "step": 6310 + }, + { + "epoch": 2.01, + "grad_norm": 0.1432078331708908, + "learning_rate": 3.318119095637406e-05, + "loss": 0.1547, + "step": 6320 + }, + { + "epoch": 2.01, + "grad_norm": 3.952962875366211, + "learning_rate": 3.307504511198387e-05, + "loss": 0.1682, + "step": 6330 + }, + { + "epoch": 2.02, + "grad_norm": 0.044416822493076324, + "learning_rate": 3.296889926759367e-05, + "loss": 0.0388, + "step": 6340 + }, + { + "epoch": 2.02, + "grad_norm": 6.307524681091309, + "learning_rate": 3.286275342320348e-05, + "loss": 0.1418, + "step": 6350 + }, + { + "epoch": 2.02, + "grad_norm": 0.1354295015335083, + "learning_rate": 3.275660757881329e-05, + "loss": 0.2588, + "step": 6360 + }, + { + "epoch": 2.03, + "grad_norm": 3.275066614151001, + "learning_rate": 3.26504617344231e-05, + "loss": 0.1091, + "step": 6370 + }, + { + "epoch": 2.03, + "grad_norm": 0.0923081785440445, + "learning_rate": 3.254431589003291e-05, + "loss": 0.1384, + "step": 6380 + }, + { + "epoch": 2.03, + "grad_norm": 3.508528232574463, + "learning_rate": 3.243817004564271e-05, + "loss": 0.217, + "step": 6390 + }, + { + "epoch": 2.04, + "grad_norm": 2.36240291595459, + "learning_rate": 3.233202420125252e-05, + "loss": 0.0337, + "step": 6400 + }, + { + "epoch": 2.04, + "grad_norm": 0.20124652981758118, + "learning_rate": 3.222587835686233e-05, + "loss": 0.0982, + "step": 6410 + }, + { + "epoch": 2.04, + "grad_norm": 0.8248081803321838, + "learning_rate": 3.211973251247214e-05, + "loss": 0.2217, + "step": 6420 + }, + { + "epoch": 2.05, + "grad_norm": 1.1201878786087036, + "learning_rate": 3.2013586668081946e-05, + "loss": 0.0651, + "step": 6430 + }, + { + "epoch": 2.05, + "grad_norm": 1.6418076753616333, + "learning_rate": 3.1907440823691757e-05, + "loss": 0.0738, + "step": 6440 + }, + { + "epoch": 2.05, + "grad_norm": 2.1913180351257324, + "learning_rate": 3.180129497930156e-05, + "loss": 0.0863, + "step": 6450 + }, + { + "epoch": 2.06, + "grad_norm": 1.3282325267791748, + "learning_rate": 3.1695149134911364e-05, + "loss": 0.0582, + "step": 6460 + }, + { + "epoch": 2.06, + "grad_norm": 2.451772451400757, + "learning_rate": 3.158900329052118e-05, + "loss": 0.1187, + "step": 6470 + }, + { + "epoch": 2.06, + "grad_norm": 0.1372409611940384, + "learning_rate": 3.1482857446130985e-05, + "loss": 0.0618, + "step": 6480 + }, + { + "epoch": 2.07, + "grad_norm": 0.08469751477241516, + "learning_rate": 3.1376711601740795e-05, + "loss": 0.0316, + "step": 6490 + }, + { + "epoch": 2.07, + "grad_norm": 0.1473696529865265, + "learning_rate": 3.12705657573506e-05, + "loss": 0.0954, + "step": 6500 + }, + { + "epoch": 2.07, + "grad_norm": 0.06819278746843338, + "learning_rate": 3.116441991296041e-05, + "loss": 0.1365, + "step": 6510 + }, + { + "epoch": 2.08, + "grad_norm": 8.832886695861816, + "learning_rate": 3.105827406857021e-05, + "loss": 0.1828, + "step": 6520 + }, + { + "epoch": 2.08, + "grad_norm": 0.043228354305028915, + "learning_rate": 3.0952128224180024e-05, + "loss": 0.1541, + "step": 6530 + }, + { + "epoch": 2.08, + "grad_norm": 0.1457592248916626, + "learning_rate": 3.0845982379789834e-05, + "loss": 0.0291, + "step": 6540 + }, + { + "epoch": 2.08, + "grad_norm": 1.5548399686813354, + "learning_rate": 3.073983653539964e-05, + "loss": 0.123, + "step": 6550 + }, + { + "epoch": 2.09, + "grad_norm": 5.61803674697876, + "learning_rate": 3.063369069100945e-05, + "loss": 0.1871, + "step": 6560 + }, + { + "epoch": 2.09, + "grad_norm": 0.020372767001390457, + "learning_rate": 3.052754484661925e-05, + "loss": 0.0865, + "step": 6570 + }, + { + "epoch": 2.09, + "grad_norm": 5.178860664367676, + "learning_rate": 3.0421399002229062e-05, + "loss": 0.1568, + "step": 6580 + }, + { + "epoch": 2.1, + "grad_norm": 4.118620872497559, + "learning_rate": 3.0315253157838873e-05, + "loss": 0.0729, + "step": 6590 + }, + { + "epoch": 2.1, + "grad_norm": 3.9899566173553467, + "learning_rate": 3.020910731344868e-05, + "loss": 0.2327, + "step": 6600 + }, + { + "epoch": 2.1, + "grad_norm": 1.3902517557144165, + "learning_rate": 3.0102961469058487e-05, + "loss": 0.1305, + "step": 6610 + }, + { + "epoch": 2.11, + "grad_norm": 5.5835957527160645, + "learning_rate": 2.9996815624668294e-05, + "loss": 0.1032, + "step": 6620 + }, + { + "epoch": 2.11, + "grad_norm": 1.521474003791809, + "learning_rate": 2.98906697802781e-05, + "loss": 0.1188, + "step": 6630 + }, + { + "epoch": 2.11, + "grad_norm": 0.19501766562461853, + "learning_rate": 2.978452393588791e-05, + "loss": 0.0989, + "step": 6640 + }, + { + "epoch": 2.12, + "grad_norm": 0.03989823907613754, + "learning_rate": 2.9678378091497722e-05, + "loss": 0.0736, + "step": 6650 + }, + { + "epoch": 2.12, + "grad_norm": 3.9346630573272705, + "learning_rate": 2.957223224710753e-05, + "loss": 0.0347, + "step": 6660 + }, + { + "epoch": 2.12, + "grad_norm": 0.05866791680455208, + "learning_rate": 2.9466086402717336e-05, + "loss": 0.1317, + "step": 6670 + }, + { + "epoch": 2.13, + "grad_norm": 0.660900890827179, + "learning_rate": 2.9359940558327143e-05, + "loss": 0.1365, + "step": 6680 + }, + { + "epoch": 2.13, + "grad_norm": 0.20864763855934143, + "learning_rate": 2.925379471393695e-05, + "loss": 0.2221, + "step": 6690 + }, + { + "epoch": 2.13, + "grad_norm": 2.8652963638305664, + "learning_rate": 2.9147648869546758e-05, + "loss": 0.0355, + "step": 6700 + }, + { + "epoch": 2.14, + "grad_norm": 3.0343375205993652, + "learning_rate": 2.9041503025156568e-05, + "loss": 0.2081, + "step": 6710 + }, + { + "epoch": 2.14, + "grad_norm": 2.393002510070801, + "learning_rate": 2.8935357180766375e-05, + "loss": 0.1076, + "step": 6720 + }, + { + "epoch": 2.14, + "grad_norm": 0.08225111663341522, + "learning_rate": 2.8829211336376182e-05, + "loss": 0.1367, + "step": 6730 + }, + { + "epoch": 2.15, + "grad_norm": 4.09624719619751, + "learning_rate": 2.872306549198599e-05, + "loss": 0.2712, + "step": 6740 + }, + { + "epoch": 2.15, + "grad_norm": 0.667273998260498, + "learning_rate": 2.8616919647595796e-05, + "loss": 0.152, + "step": 6750 + }, + { + "epoch": 2.15, + "grad_norm": 1.4781357049942017, + "learning_rate": 2.8510773803205603e-05, + "loss": 0.0949, + "step": 6760 + }, + { + "epoch": 2.16, + "grad_norm": 4.563651084899902, + "learning_rate": 2.8404627958815417e-05, + "loss": 0.0873, + "step": 6770 + }, + { + "epoch": 2.16, + "grad_norm": 3.7740418910980225, + "learning_rate": 2.830909669886424e-05, + "loss": 0.1207, + "step": 6780 + }, + { + "epoch": 2.16, + "grad_norm": 10.370115280151367, + "learning_rate": 2.8202950854474048e-05, + "loss": 0.1369, + "step": 6790 + }, + { + "epoch": 2.16, + "grad_norm": 0.13098500669002533, + "learning_rate": 2.8096805010083855e-05, + "loss": 0.1016, + "step": 6800 + }, + { + "epoch": 2.17, + "grad_norm": 9.170578956604004, + "learning_rate": 2.7990659165693665e-05, + "loss": 0.0463, + "step": 6810 + }, + { + "epoch": 2.17, + "grad_norm": 10.379976272583008, + "learning_rate": 2.7884513321303472e-05, + "loss": 0.0798, + "step": 6820 + }, + { + "epoch": 2.17, + "grad_norm": 0.10993140935897827, + "learning_rate": 2.777836747691328e-05, + "loss": 0.084, + "step": 6830 + }, + { + "epoch": 2.18, + "grad_norm": 0.4707590937614441, + "learning_rate": 2.7672221632523087e-05, + "loss": 0.1232, + "step": 6840 + }, + { + "epoch": 2.18, + "grad_norm": 4.587014198303223, + "learning_rate": 2.7566075788132894e-05, + "loss": 0.1474, + "step": 6850 + }, + { + "epoch": 2.18, + "grad_norm": 10.61086654663086, + "learning_rate": 2.74599299437427e-05, + "loss": 0.1282, + "step": 6860 + }, + { + "epoch": 2.19, + "grad_norm": 0.2299477756023407, + "learning_rate": 2.7353784099352515e-05, + "loss": 0.0667, + "step": 6870 + }, + { + "epoch": 2.19, + "grad_norm": 5.911661624908447, + "learning_rate": 2.724763825496232e-05, + "loss": 0.1222, + "step": 6880 + }, + { + "epoch": 2.19, + "grad_norm": 0.1657014936208725, + "learning_rate": 2.714149241057213e-05, + "loss": 0.072, + "step": 6890 + }, + { + "epoch": 2.2, + "grad_norm": 0.04870441555976868, + "learning_rate": 2.7035346566181936e-05, + "loss": 0.1132, + "step": 6900 + }, + { + "epoch": 2.2, + "grad_norm": 0.7382871508598328, + "learning_rate": 2.6929200721791743e-05, + "loss": 0.0272, + "step": 6910 + }, + { + "epoch": 2.2, + "grad_norm": 1.1875141859054565, + "learning_rate": 2.682305487740155e-05, + "loss": 0.0833, + "step": 6920 + }, + { + "epoch": 2.21, + "grad_norm": 0.070220448076725, + "learning_rate": 2.671690903301136e-05, + "loss": 0.1321, + "step": 6930 + }, + { + "epoch": 2.21, + "grad_norm": 3.7514150142669678, + "learning_rate": 2.6610763188621167e-05, + "loss": 0.0971, + "step": 6940 + }, + { + "epoch": 2.21, + "grad_norm": 0.04383459314703941, + "learning_rate": 2.6504617344230975e-05, + "loss": 0.0878, + "step": 6950 + }, + { + "epoch": 2.22, + "grad_norm": 0.11518880724906921, + "learning_rate": 2.639847149984078e-05, + "loss": 0.0679, + "step": 6960 + }, + { + "epoch": 2.22, + "grad_norm": 5.474330902099609, + "learning_rate": 2.629232565545059e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 2.22, + "grad_norm": 0.03785128891468048, + "learning_rate": 2.6186179811060396e-05, + "loss": 0.1183, + "step": 6980 + }, + { + "epoch": 2.23, + "grad_norm": 0.050687942653894424, + "learning_rate": 2.608003396667021e-05, + "loss": 0.1141, + "step": 6990 + }, + { + "epoch": 2.23, + "grad_norm": 5.501091003417969, + "learning_rate": 2.5973888122280017e-05, + "loss": 0.1175, + "step": 7000 + }, + { + "epoch": 2.23, + "grad_norm": 1.3896145820617676, + "learning_rate": 2.5867742277889824e-05, + "loss": 0.1665, + "step": 7010 + }, + { + "epoch": 2.23, + "grad_norm": 5.888062000274658, + "learning_rate": 2.576159643349963e-05, + "loss": 0.1868, + "step": 7020 + }, + { + "epoch": 2.24, + "grad_norm": 0.3350411653518677, + "learning_rate": 2.5655450589109438e-05, + "loss": 0.0262, + "step": 7030 + }, + { + "epoch": 2.24, + "grad_norm": 0.12134930491447449, + "learning_rate": 2.5549304744719245e-05, + "loss": 0.1391, + "step": 7040 + }, + { + "epoch": 2.24, + "grad_norm": 2.653724193572998, + "learning_rate": 2.5443158900329056e-05, + "loss": 0.044, + "step": 7050 + }, + { + "epoch": 2.25, + "grad_norm": 1.480675458908081, + "learning_rate": 2.5337013055938863e-05, + "loss": 0.125, + "step": 7060 + }, + { + "epoch": 2.25, + "grad_norm": 2.112579584121704, + "learning_rate": 2.523086721154867e-05, + "loss": 0.0774, + "step": 7070 + }, + { + "epoch": 2.25, + "grad_norm": 0.03731192275881767, + "learning_rate": 2.5124721367158477e-05, + "loss": 0.0703, + "step": 7080 + }, + { + "epoch": 2.26, + "grad_norm": 0.06327365338802338, + "learning_rate": 2.5018575522768284e-05, + "loss": 0.1557, + "step": 7090 + }, + { + "epoch": 2.26, + "grad_norm": 0.10991324484348297, + "learning_rate": 2.4912429678378094e-05, + "loss": 0.0682, + "step": 7100 + }, + { + "epoch": 2.26, + "grad_norm": 0.03156714513897896, + "learning_rate": 2.48062838339879e-05, + "loss": 0.1716, + "step": 7110 + }, + { + "epoch": 2.27, + "grad_norm": 9.979147911071777, + "learning_rate": 2.470013798959771e-05, + "loss": 0.1797, + "step": 7120 + }, + { + "epoch": 2.27, + "grad_norm": 3.263706684112549, + "learning_rate": 2.459399214520752e-05, + "loss": 0.0659, + "step": 7130 + }, + { + "epoch": 2.27, + "grad_norm": 6.261413097381592, + "learning_rate": 2.4487846300817323e-05, + "loss": 0.0968, + "step": 7140 + }, + { + "epoch": 2.28, + "grad_norm": 1.550948143005371, + "learning_rate": 2.438170045642713e-05, + "loss": 0.1336, + "step": 7150 + }, + { + "epoch": 2.28, + "grad_norm": 1.1487703323364258, + "learning_rate": 2.427555461203694e-05, + "loss": 0.0647, + "step": 7160 + }, + { + "epoch": 2.28, + "grad_norm": 6.673706531524658, + "learning_rate": 2.4169408767646747e-05, + "loss": 0.1567, + "step": 7170 + }, + { + "epoch": 2.29, + "grad_norm": 0.17169363796710968, + "learning_rate": 2.4063262923256554e-05, + "loss": 0.1096, + "step": 7180 + }, + { + "epoch": 2.29, + "grad_norm": 8.660694122314453, + "learning_rate": 2.3957117078866365e-05, + "loss": 0.1589, + "step": 7190 + }, + { + "epoch": 2.29, + "grad_norm": 1.5906010866165161, + "learning_rate": 2.3850971234476172e-05, + "loss": 0.1224, + "step": 7200 + }, + { + "epoch": 2.3, + "grad_norm": 0.8341835141181946, + "learning_rate": 2.374482539008598e-05, + "loss": 0.02, + "step": 7210 + }, + { + "epoch": 2.3, + "grad_norm": 10.785898208618164, + "learning_rate": 2.363867954569579e-05, + "loss": 0.1153, + "step": 7220 + }, + { + "epoch": 2.3, + "grad_norm": 5.174521446228027, + "learning_rate": 2.3532533701305597e-05, + "loss": 0.0843, + "step": 7230 + }, + { + "epoch": 2.3, + "grad_norm": 0.7447335720062256, + "learning_rate": 2.3426387856915404e-05, + "loss": 0.1465, + "step": 7240 + }, + { + "epoch": 2.31, + "grad_norm": 3.2618470191955566, + "learning_rate": 2.332024201252521e-05, + "loss": 0.0874, + "step": 7250 + }, + { + "epoch": 2.31, + "grad_norm": 1.483594298362732, + "learning_rate": 2.3214096168135018e-05, + "loss": 0.1061, + "step": 7260 + }, + { + "epoch": 2.31, + "grad_norm": 0.303654283285141, + "learning_rate": 2.3107950323744825e-05, + "loss": 0.117, + "step": 7270 + }, + { + "epoch": 2.32, + "grad_norm": 10.942138671875, + "learning_rate": 2.3001804479354635e-05, + "loss": 0.2048, + "step": 7280 + }, + { + "epoch": 2.32, + "grad_norm": 7.95550012588501, + "learning_rate": 2.2895658634964442e-05, + "loss": 0.1158, + "step": 7290 + }, + { + "epoch": 2.32, + "grad_norm": 0.05263487249612808, + "learning_rate": 2.278951279057425e-05, + "loss": 0.0142, + "step": 7300 + }, + { + "epoch": 2.33, + "grad_norm": 0.04684547707438469, + "learning_rate": 2.268336694618406e-05, + "loss": 0.15, + "step": 7310 + }, + { + "epoch": 2.33, + "grad_norm": 6.8654890060424805, + "learning_rate": 2.2577221101793867e-05, + "loss": 0.1816, + "step": 7320 + }, + { + "epoch": 2.33, + "grad_norm": 11.469459533691406, + "learning_rate": 2.2471075257403674e-05, + "loss": 0.1426, + "step": 7330 + }, + { + "epoch": 2.34, + "grad_norm": 5.302177906036377, + "learning_rate": 2.236492941301348e-05, + "loss": 0.2248, + "step": 7340 + }, + { + "epoch": 2.34, + "grad_norm": 2.6794090270996094, + "learning_rate": 2.2258783568623288e-05, + "loss": 0.157, + "step": 7350 + }, + { + "epoch": 2.34, + "grad_norm": 1.3895156383514404, + "learning_rate": 2.2152637724233095e-05, + "loss": 0.159, + "step": 7360 + }, + { + "epoch": 2.35, + "grad_norm": 0.17077626287937164, + "learning_rate": 2.2046491879842906e-05, + "loss": 0.1298, + "step": 7370 + }, + { + "epoch": 2.35, + "grad_norm": 0.14379891753196716, + "learning_rate": 2.1940346035452713e-05, + "loss": 0.072, + "step": 7380 + }, + { + "epoch": 2.35, + "grad_norm": 0.946506142616272, + "learning_rate": 2.183420019106252e-05, + "loss": 0.0414, + "step": 7390 + }, + { + "epoch": 2.36, + "grad_norm": 0.10742925852537155, + "learning_rate": 2.172805434667233e-05, + "loss": 0.1991, + "step": 7400 + }, + { + "epoch": 2.36, + "grad_norm": 4.503111362457275, + "learning_rate": 2.1621908502282138e-05, + "loss": 0.1018, + "step": 7410 + }, + { + "epoch": 2.36, + "grad_norm": 0.025181856006383896, + "learning_rate": 2.1515762657891945e-05, + "loss": 0.2192, + "step": 7420 + }, + { + "epoch": 2.37, + "grad_norm": 0.2496863454580307, + "learning_rate": 2.140961681350175e-05, + "loss": 0.1513, + "step": 7430 + }, + { + "epoch": 2.37, + "grad_norm": 0.18356376886367798, + "learning_rate": 2.1303470969111562e-05, + "loss": 0.0928, + "step": 7440 + }, + { + "epoch": 2.37, + "grad_norm": 4.700144290924072, + "learning_rate": 2.119732512472137e-05, + "loss": 0.1076, + "step": 7450 + }, + { + "epoch": 2.37, + "grad_norm": 1.5925829410552979, + "learning_rate": 2.1091179280331176e-05, + "loss": 0.0673, + "step": 7460 + }, + { + "epoch": 2.38, + "grad_norm": 0.5920007228851318, + "learning_rate": 2.0985033435940983e-05, + "loss": 0.1291, + "step": 7470 + }, + { + "epoch": 2.38, + "grad_norm": 5.156589508056641, + "learning_rate": 2.087888759155079e-05, + "loss": 0.3071, + "step": 7480 + }, + { + "epoch": 2.38, + "grad_norm": 0.03765925392508507, + "learning_rate": 2.0772741747160598e-05, + "loss": 0.1093, + "step": 7490 + }, + { + "epoch": 2.39, + "grad_norm": 0.4249335825443268, + "learning_rate": 2.0666595902770408e-05, + "loss": 0.1279, + "step": 7500 + }, + { + "epoch": 2.39, + "grad_norm": 0.016695374622941017, + "learning_rate": 2.0560450058380215e-05, + "loss": 0.2595, + "step": 7510 + }, + { + "epoch": 2.39, + "grad_norm": 0.8157448768615723, + "learning_rate": 2.0454304213990022e-05, + "loss": 0.2879, + "step": 7520 + }, + { + "epoch": 2.4, + "grad_norm": 0.43193209171295166, + "learning_rate": 2.0348158369599833e-05, + "loss": 0.1147, + "step": 7530 + }, + { + "epoch": 2.4, + "grad_norm": 1.1754236221313477, + "learning_rate": 2.024201252520964e-05, + "loss": 0.1228, + "step": 7540 + }, + { + "epoch": 2.4, + "grad_norm": 0.073044553399086, + "learning_rate": 2.0135866680819447e-05, + "loss": 0.0953, + "step": 7550 + }, + { + "epoch": 2.41, + "grad_norm": 6.481806755065918, + "learning_rate": 2.0029720836429254e-05, + "loss": 0.0925, + "step": 7560 + }, + { + "epoch": 2.41, + "grad_norm": 3.421597719192505, + "learning_rate": 1.992357499203906e-05, + "loss": 0.1844, + "step": 7570 + }, + { + "epoch": 2.41, + "grad_norm": 0.15194571018218994, + "learning_rate": 1.9817429147648868e-05, + "loss": 0.3675, + "step": 7580 + }, + { + "epoch": 2.42, + "grad_norm": 0.44171637296676636, + "learning_rate": 1.971128330325868e-05, + "loss": 0.043, + "step": 7590 + }, + { + "epoch": 2.42, + "grad_norm": 0.06510256975889206, + "learning_rate": 1.9605137458868486e-05, + "loss": 0.1354, + "step": 7600 + }, + { + "epoch": 2.42, + "grad_norm": 2.7437000274658203, + "learning_rate": 1.9498991614478293e-05, + "loss": 0.0389, + "step": 7610 + }, + { + "epoch": 2.43, + "grad_norm": 1.2895437479019165, + "learning_rate": 1.9392845770088103e-05, + "loss": 0.1704, + "step": 7620 + }, + { + "epoch": 2.43, + "grad_norm": 0.03322044759988785, + "learning_rate": 1.928669992569791e-05, + "loss": 0.1065, + "step": 7630 + }, + { + "epoch": 2.43, + "grad_norm": 2.8655242919921875, + "learning_rate": 1.9180554081307717e-05, + "loss": 0.1504, + "step": 7640 + }, + { + "epoch": 2.44, + "grad_norm": 0.2032870352268219, + "learning_rate": 1.9074408236917528e-05, + "loss": 0.0229, + "step": 7650 + }, + { + "epoch": 2.44, + "grad_norm": 1.7102253437042236, + "learning_rate": 1.8968262392527335e-05, + "loss": 0.1019, + "step": 7660 + }, + { + "epoch": 2.44, + "grad_norm": 2.740474224090576, + "learning_rate": 1.8862116548137142e-05, + "loss": 0.1235, + "step": 7670 + }, + { + "epoch": 2.44, + "grad_norm": 2.9120683670043945, + "learning_rate": 1.875597070374695e-05, + "loss": 0.0516, + "step": 7680 + }, + { + "epoch": 2.45, + "grad_norm": 0.11502601206302643, + "learning_rate": 1.8649824859356756e-05, + "loss": 0.2791, + "step": 7690 + }, + { + "epoch": 2.45, + "grad_norm": 0.7027528882026672, + "learning_rate": 1.8543679014966563e-05, + "loss": 0.0385, + "step": 7700 + }, + { + "epoch": 2.45, + "grad_norm": 2.4370245933532715, + "learning_rate": 1.8437533170576374e-05, + "loss": 0.0936, + "step": 7710 + }, + { + "epoch": 2.46, + "grad_norm": 6.21151876449585, + "learning_rate": 1.833138732618618e-05, + "loss": 0.0806, + "step": 7720 + }, + { + "epoch": 2.46, + "grad_norm": 0.052706655114889145, + "learning_rate": 1.8225241481795988e-05, + "loss": 0.1684, + "step": 7730 + }, + { + "epoch": 2.46, + "grad_norm": 0.24665802717208862, + "learning_rate": 1.8119095637405798e-05, + "loss": 0.1383, + "step": 7740 + }, + { + "epoch": 2.47, + "grad_norm": 3.6017708778381348, + "learning_rate": 1.8012949793015605e-05, + "loss": 0.1204, + "step": 7750 + }, + { + "epoch": 2.47, + "grad_norm": 3.1942765712738037, + "learning_rate": 1.7906803948625412e-05, + "loss": 0.0627, + "step": 7760 + }, + { + "epoch": 2.47, + "grad_norm": 3.020968437194824, + "learning_rate": 1.780065810423522e-05, + "loss": 0.1656, + "step": 7770 + }, + { + "epoch": 2.48, + "grad_norm": 0.13594868779182434, + "learning_rate": 1.7694512259845027e-05, + "loss": 0.0529, + "step": 7780 + }, + { + "epoch": 2.48, + "grad_norm": 0.0280010886490345, + "learning_rate": 1.7588366415454834e-05, + "loss": 0.1539, + "step": 7790 + }, + { + "epoch": 2.48, + "grad_norm": 8.52804946899414, + "learning_rate": 1.7482220571064644e-05, + "loss": 0.0498, + "step": 7800 + }, + { + "epoch": 2.49, + "grad_norm": 0.20770138502120972, + "learning_rate": 1.737607472667445e-05, + "loss": 0.0903, + "step": 7810 + }, + { + "epoch": 2.49, + "grad_norm": 0.06971104443073273, + "learning_rate": 1.7269928882284258e-05, + "loss": 0.2458, + "step": 7820 + }, + { + "epoch": 2.49, + "grad_norm": 0.022506361827254295, + "learning_rate": 1.716378303789407e-05, + "loss": 0.0741, + "step": 7830 + }, + { + "epoch": 2.5, + "grad_norm": 4.818386077880859, + "learning_rate": 1.7057637193503876e-05, + "loss": 0.0586, + "step": 7840 + }, + { + "epoch": 2.5, + "grad_norm": 0.05160210281610489, + "learning_rate": 1.6951491349113683e-05, + "loss": 0.0817, + "step": 7850 + }, + { + "epoch": 2.5, + "grad_norm": 0.15953780710697174, + "learning_rate": 1.6845345504723493e-05, + "loss": 0.0905, + "step": 7860 + }, + { + "epoch": 2.51, + "grad_norm": 0.015429453924298286, + "learning_rate": 1.67391996603333e-05, + "loss": 0.0719, + "step": 7870 + }, + { + "epoch": 2.51, + "grad_norm": 3.159700632095337, + "learning_rate": 1.6633053815943108e-05, + "loss": 0.048, + "step": 7880 + }, + { + "epoch": 2.51, + "grad_norm": 1.702974796295166, + "learning_rate": 1.6526907971552915e-05, + "loss": 0.1025, + "step": 7890 + }, + { + "epoch": 2.51, + "grad_norm": 0.7218146324157715, + "learning_rate": 1.6420762127162722e-05, + "loss": 0.0534, + "step": 7900 + }, + { + "epoch": 2.52, + "grad_norm": 4.001716136932373, + "learning_rate": 1.632523086721155e-05, + "loss": 0.1611, + "step": 7910 + }, + { + "epoch": 2.52, + "grad_norm": 0.6529110074043274, + "learning_rate": 1.6219085022821356e-05, + "loss": 0.1739, + "step": 7920 + }, + { + "epoch": 2.52, + "grad_norm": 3.3086657524108887, + "learning_rate": 1.6112939178431166e-05, + "loss": 0.1373, + "step": 7930 + }, + { + "epoch": 2.53, + "grad_norm": 2.368133068084717, + "learning_rate": 1.6006793334040973e-05, + "loss": 0.0841, + "step": 7940 + }, + { + "epoch": 2.53, + "grad_norm": 5.263741970062256, + "learning_rate": 1.590064748965078e-05, + "loss": 0.1226, + "step": 7950 + }, + { + "epoch": 2.53, + "grad_norm": 10.581872940063477, + "learning_rate": 1.579450164526059e-05, + "loss": 0.1063, + "step": 7960 + }, + { + "epoch": 2.54, + "grad_norm": 0.07476484030485153, + "learning_rate": 1.5688355800870398e-05, + "loss": 0.3208, + "step": 7970 + }, + { + "epoch": 2.54, + "grad_norm": 0.976747453212738, + "learning_rate": 1.5582209956480205e-05, + "loss": 0.2336, + "step": 7980 + }, + { + "epoch": 2.54, + "grad_norm": 0.2981054186820984, + "learning_rate": 1.5476064112090012e-05, + "loss": 0.0603, + "step": 7990 + }, + { + "epoch": 2.55, + "grad_norm": 0.032338302582502365, + "learning_rate": 1.536991826769982e-05, + "loss": 0.0123, + "step": 8000 + }, + { + "epoch": 2.55, + "grad_norm": 7.625821590423584, + "learning_rate": 1.5263772423309626e-05, + "loss": 0.1735, + "step": 8010 + }, + { + "epoch": 2.55, + "grad_norm": 4.120946407318115, + "learning_rate": 1.5157626578919436e-05, + "loss": 0.1199, + "step": 8020 + }, + { + "epoch": 2.56, + "grad_norm": 0.04417848959565163, + "learning_rate": 1.5051480734529244e-05, + "loss": 0.0629, + "step": 8030 + }, + { + "epoch": 2.56, + "grad_norm": 3.9831886291503906, + "learning_rate": 1.494533489013905e-05, + "loss": 0.1507, + "step": 8040 + }, + { + "epoch": 2.56, + "grad_norm": 0.2706742286682129, + "learning_rate": 1.4839189045748861e-05, + "loss": 0.1195, + "step": 8050 + }, + { + "epoch": 2.57, + "grad_norm": 0.045659586787223816, + "learning_rate": 1.4733043201358668e-05, + "loss": 0.0875, + "step": 8060 + }, + { + "epoch": 2.57, + "grad_norm": 2.9574756622314453, + "learning_rate": 1.4626897356968475e-05, + "loss": 0.0828, + "step": 8070 + }, + { + "epoch": 2.57, + "grad_norm": 11.923121452331543, + "learning_rate": 1.4520751512578284e-05, + "loss": 0.1937, + "step": 8080 + }, + { + "epoch": 2.58, + "grad_norm": 0.8571139574050903, + "learning_rate": 1.4414605668188091e-05, + "loss": 0.1385, + "step": 8090 + }, + { + "epoch": 2.58, + "grad_norm": 11.532151222229004, + "learning_rate": 1.4308459823797898e-05, + "loss": 0.1644, + "step": 8100 + }, + { + "epoch": 2.58, + "grad_norm": 0.02608746476471424, + "learning_rate": 1.4202313979407709e-05, + "loss": 0.0845, + "step": 8110 + }, + { + "epoch": 2.58, + "grad_norm": 0.3875482976436615, + "learning_rate": 1.4096168135017516e-05, + "loss": 0.1526, + "step": 8120 + }, + { + "epoch": 2.59, + "grad_norm": 0.46190938353538513, + "learning_rate": 1.3990022290627323e-05, + "loss": 0.0656, + "step": 8130 + }, + { + "epoch": 2.59, + "grad_norm": 0.06178577244281769, + "learning_rate": 1.3883876446237132e-05, + "loss": 0.0245, + "step": 8140 + }, + { + "epoch": 2.59, + "grad_norm": 0.41626548767089844, + "learning_rate": 1.3777730601846939e-05, + "loss": 0.1448, + "step": 8150 + }, + { + "epoch": 2.6, + "grad_norm": 0.8394218683242798, + "learning_rate": 1.3671584757456746e-05, + "loss": 0.1566, + "step": 8160 + }, + { + "epoch": 2.6, + "grad_norm": 0.030064724385738373, + "learning_rate": 1.3565438913066556e-05, + "loss": 0.1614, + "step": 8170 + }, + { + "epoch": 2.6, + "grad_norm": 0.7408326864242554, + "learning_rate": 1.3459293068676362e-05, + "loss": 0.0493, + "step": 8180 + }, + { + "epoch": 2.61, + "grad_norm": 6.210927486419678, + "learning_rate": 1.3353147224286169e-05, + "loss": 0.173, + "step": 8190 + }, + { + "epoch": 2.61, + "grad_norm": 0.3989274501800537, + "learning_rate": 1.3247001379895976e-05, + "loss": 0.2242, + "step": 8200 + }, + { + "epoch": 2.61, + "grad_norm": 0.21221469342708588, + "learning_rate": 1.3140855535505786e-05, + "loss": 0.1021, + "step": 8210 + }, + { + "epoch": 2.62, + "grad_norm": 0.018684396520256996, + "learning_rate": 1.3034709691115593e-05, + "loss": 0.1168, + "step": 8220 + }, + { + "epoch": 2.62, + "grad_norm": 4.258501052856445, + "learning_rate": 1.29285638467254e-05, + "loss": 0.101, + "step": 8230 + }, + { + "epoch": 2.62, + "grad_norm": 0.18293698132038116, + "learning_rate": 1.282241800233521e-05, + "loss": 0.0888, + "step": 8240 + }, + { + "epoch": 2.63, + "grad_norm": 5.2559685707092285, + "learning_rate": 1.2716272157945016e-05, + "loss": 0.1593, + "step": 8250 + }, + { + "epoch": 2.63, + "grad_norm": 0.714055597782135, + "learning_rate": 1.2610126313554823e-05, + "loss": 0.0548, + "step": 8260 + }, + { + "epoch": 2.63, + "grad_norm": 5.772704124450684, + "learning_rate": 1.2503980469164634e-05, + "loss": 0.1758, + "step": 8270 + }, + { + "epoch": 2.64, + "grad_norm": 0.15256932377815247, + "learning_rate": 1.2397834624774441e-05, + "loss": 0.1546, + "step": 8280 + }, + { + "epoch": 2.64, + "grad_norm": 0.17343765497207642, + "learning_rate": 1.2291688780384248e-05, + "loss": 0.0422, + "step": 8290 + }, + { + "epoch": 2.64, + "grad_norm": 5.067286491394043, + "learning_rate": 1.2185542935994057e-05, + "loss": 0.0384, + "step": 8300 + }, + { + "epoch": 2.65, + "grad_norm": 2.087721109390259, + "learning_rate": 1.2079397091603864e-05, + "loss": 0.1132, + "step": 8310 + }, + { + "epoch": 2.65, + "grad_norm": 6.7488017082214355, + "learning_rate": 1.1973251247213673e-05, + "loss": 0.0729, + "step": 8320 + }, + { + "epoch": 2.65, + "grad_norm": 10.669734954833984, + "learning_rate": 1.1867105402823481e-05, + "loss": 0.0677, + "step": 8330 + }, + { + "epoch": 2.65, + "grad_norm": 4.734282970428467, + "learning_rate": 1.1760959558433288e-05, + "loss": 0.1741, + "step": 8340 + }, + { + "epoch": 2.66, + "grad_norm": 1.1807498931884766, + "learning_rate": 1.1665428298482115e-05, + "loss": 0.1219, + "step": 8350 + }, + { + "epoch": 2.66, + "grad_norm": 0.1118198037147522, + "learning_rate": 1.1559282454091922e-05, + "loss": 0.0724, + "step": 8360 + }, + { + "epoch": 2.66, + "grad_norm": 8.563444137573242, + "learning_rate": 1.1453136609701731e-05, + "loss": 0.1453, + "step": 8370 + }, + { + "epoch": 2.67, + "grad_norm": 1.4987778663635254, + "learning_rate": 1.1346990765311538e-05, + "loss": 0.1087, + "step": 8380 + }, + { + "epoch": 2.67, + "grad_norm": 6.070169448852539, + "learning_rate": 1.1240844920921347e-05, + "loss": 0.1372, + "step": 8390 + }, + { + "epoch": 2.67, + "grad_norm": 3.5378408432006836, + "learning_rate": 1.1134699076531156e-05, + "loss": 0.1421, + "step": 8400 + }, + { + "epoch": 2.68, + "grad_norm": 0.18879607319831848, + "learning_rate": 1.1028553232140961e-05, + "loss": 0.0617, + "step": 8410 + }, + { + "epoch": 2.68, + "grad_norm": 3.873791217803955, + "learning_rate": 1.092240738775077e-05, + "loss": 0.1256, + "step": 8420 + }, + { + "epoch": 2.68, + "grad_norm": 3.0632710456848145, + "learning_rate": 1.0816261543360579e-05, + "loss": 0.1084, + "step": 8430 + }, + { + "epoch": 2.69, + "grad_norm": 0.044198133051395416, + "learning_rate": 1.0710115698970386e-05, + "loss": 0.0972, + "step": 8440 + }, + { + "epoch": 2.69, + "grad_norm": 0.06533059477806091, + "learning_rate": 1.0603969854580194e-05, + "loss": 0.0659, + "step": 8450 + }, + { + "epoch": 2.69, + "grad_norm": 0.024154966697096825, + "learning_rate": 1.0497824010190002e-05, + "loss": 0.2245, + "step": 8460 + }, + { + "epoch": 2.7, + "grad_norm": 0.06551453471183777, + "learning_rate": 1.0391678165799809e-05, + "loss": 0.0679, + "step": 8470 + }, + { + "epoch": 2.7, + "grad_norm": 2.244358777999878, + "learning_rate": 1.0285532321409617e-05, + "loss": 0.1138, + "step": 8480 + }, + { + "epoch": 2.7, + "grad_norm": 1.3429971933364868, + "learning_rate": 1.0179386477019426e-05, + "loss": 0.1286, + "step": 8490 + }, + { + "epoch": 2.71, + "grad_norm": 13.364596366882324, + "learning_rate": 1.0073240632629233e-05, + "loss": 0.1304, + "step": 8500 + }, + { + "epoch": 2.71, + "grad_norm": 1.5777560472488403, + "learning_rate": 9.96709478823904e-06, + "loss": 0.0324, + "step": 8510 + }, + { + "epoch": 2.71, + "grad_norm": 3.5468719005584717, + "learning_rate": 9.860948943848847e-06, + "loss": 0.1142, + "step": 8520 + }, + { + "epoch": 2.72, + "grad_norm": 9.198564529418945, + "learning_rate": 9.754803099458656e-06, + "loss": 0.1208, + "step": 8530 + }, + { + "epoch": 2.72, + "grad_norm": 0.10464298725128174, + "learning_rate": 9.648657255068465e-06, + "loss": 0.062, + "step": 8540 + }, + { + "epoch": 2.72, + "grad_norm": 7.4889702796936035, + "learning_rate": 9.542511410678272e-06, + "loss": 0.1081, + "step": 8550 + }, + { + "epoch": 2.72, + "grad_norm": 4.211546897888184, + "learning_rate": 9.43636556628808e-06, + "loss": 0.122, + "step": 8560 + }, + { + "epoch": 2.73, + "grad_norm": 5.125463008880615, + "learning_rate": 9.330219721897888e-06, + "loss": 0.2547, + "step": 8570 + }, + { + "epoch": 2.73, + "grad_norm": 0.17111606895923615, + "learning_rate": 9.224073877507695e-06, + "loss": 0.0792, + "step": 8580 + }, + { + "epoch": 2.73, + "grad_norm": 0.17677658796310425, + "learning_rate": 9.117928033117504e-06, + "loss": 0.2517, + "step": 8590 + }, + { + "epoch": 2.74, + "grad_norm": 0.88303542137146, + "learning_rate": 9.011782188727312e-06, + "loss": 0.1207, + "step": 8600 + }, + { + "epoch": 2.74, + "grad_norm": 0.934140682220459, + "learning_rate": 8.90563634433712e-06, + "loss": 0.0874, + "step": 8610 + }, + { + "epoch": 2.74, + "grad_norm": 0.1124495416879654, + "learning_rate": 8.799490499946927e-06, + "loss": 0.2207, + "step": 8620 + }, + { + "epoch": 2.75, + "grad_norm": 1.9301073551177979, + "learning_rate": 8.693344655556735e-06, + "loss": 0.1351, + "step": 8630 + }, + { + "epoch": 2.75, + "grad_norm": 0.42326900362968445, + "learning_rate": 8.587198811166543e-06, + "loss": 0.1563, + "step": 8640 + }, + { + "epoch": 2.75, + "grad_norm": 0.01322962436825037, + "learning_rate": 8.481052966776351e-06, + "loss": 0.0387, + "step": 8650 + }, + { + "epoch": 2.76, + "grad_norm": 3.7665517330169678, + "learning_rate": 8.37490712238616e-06, + "loss": 0.2157, + "step": 8660 + }, + { + "epoch": 2.76, + "grad_norm": 0.2205476611852646, + "learning_rate": 8.268761277995967e-06, + "loss": 0.0491, + "step": 8670 + }, + { + "epoch": 2.76, + "grad_norm": 0.10910103470087051, + "learning_rate": 8.162615433605774e-06, + "loss": 0.0924, + "step": 8680 + }, + { + "epoch": 2.77, + "grad_norm": 0.030913598835468292, + "learning_rate": 8.056469589215583e-06, + "loss": 0.0553, + "step": 8690 + }, + { + "epoch": 2.77, + "grad_norm": 0.08986567705869675, + "learning_rate": 7.95032374482539e-06, + "loss": 0.0613, + "step": 8700 + }, + { + "epoch": 2.77, + "grad_norm": 0.21952463686466217, + "learning_rate": 7.844177900435199e-06, + "loss": 0.0898, + "step": 8710 + }, + { + "epoch": 2.78, + "grad_norm": 0.6068935990333557, + "learning_rate": 7.738032056045006e-06, + "loss": 0.043, + "step": 8720 + }, + { + "epoch": 2.78, + "grad_norm": 0.03201749920845032, + "learning_rate": 7.631886211654813e-06, + "loss": 0.1957, + "step": 8730 + }, + { + "epoch": 2.78, + "grad_norm": 3.205738067626953, + "learning_rate": 7.525740367264622e-06, + "loss": 0.1425, + "step": 8740 + }, + { + "epoch": 2.79, + "grad_norm": 3.265514612197876, + "learning_rate": 7.4195945228744306e-06, + "loss": 0.1652, + "step": 8750 + }, + { + "epoch": 2.79, + "grad_norm": 0.11868763715028763, + "learning_rate": 7.313448678484238e-06, + "loss": 0.0193, + "step": 8760 + }, + { + "epoch": 2.79, + "grad_norm": 0.03614291548728943, + "learning_rate": 7.2073028340940456e-06, + "loss": 0.1368, + "step": 8770 + }, + { + "epoch": 2.79, + "grad_norm": 2.512045383453369, + "learning_rate": 7.101156989703854e-06, + "loss": 0.0949, + "step": 8780 + }, + { + "epoch": 2.8, + "grad_norm": 5.77540922164917, + "learning_rate": 6.995011145313661e-06, + "loss": 0.1639, + "step": 8790 + }, + { + "epoch": 2.8, + "grad_norm": 7.473822116851807, + "learning_rate": 6.888865300923469e-06, + "loss": 0.1023, + "step": 8800 + }, + { + "epoch": 2.8, + "grad_norm": 0.0789722427725792, + "learning_rate": 6.782719456533278e-06, + "loss": 0.0627, + "step": 8810 + }, + { + "epoch": 2.81, + "grad_norm": 2.9245636463165283, + "learning_rate": 6.676573612143084e-06, + "loss": 0.1771, + "step": 8820 + }, + { + "epoch": 2.81, + "grad_norm": 2.1707448959350586, + "learning_rate": 6.570427767752893e-06, + "loss": 0.0423, + "step": 8830 + }, + { + "epoch": 2.81, + "grad_norm": 4.990893363952637, + "learning_rate": 6.4642819233627e-06, + "loss": 0.073, + "step": 8840 + }, + { + "epoch": 2.82, + "grad_norm": 6.3620452880859375, + "learning_rate": 6.358136078972508e-06, + "loss": 0.0627, + "step": 8850 + }, + { + "epoch": 2.82, + "grad_norm": 0.09669307619333267, + "learning_rate": 6.251990234582317e-06, + "loss": 0.0879, + "step": 8860 + }, + { + "epoch": 2.82, + "grad_norm": 5.8794779777526855, + "learning_rate": 6.145844390192124e-06, + "loss": 0.1667, + "step": 8870 + }, + { + "epoch": 2.83, + "grad_norm": 0.0750487744808197, + "learning_rate": 6.039698545801932e-06, + "loss": 0.1538, + "step": 8880 + }, + { + "epoch": 2.83, + "grad_norm": 4.174580097198486, + "learning_rate": 5.933552701411741e-06, + "loss": 0.1782, + "step": 8890 + }, + { + "epoch": 2.83, + "grad_norm": 2.7931034564971924, + "learning_rate": 5.827406857021548e-06, + "loss": 0.1047, + "step": 8900 + }, + { + "epoch": 2.84, + "grad_norm": 0.11179756373167038, + "learning_rate": 5.721261012631356e-06, + "loss": 0.0648, + "step": 8910 + }, + { + "epoch": 2.84, + "grad_norm": 0.25602421164512634, + "learning_rate": 5.615115168241164e-06, + "loss": 0.1657, + "step": 8920 + }, + { + "epoch": 2.84, + "grad_norm": 0.030272111296653748, + "learning_rate": 5.5089693238509715e-06, + "loss": 0.1344, + "step": 8930 + }, + { + "epoch": 2.85, + "grad_norm": 1.8802919387817383, + "learning_rate": 5.4028234794607795e-06, + "loss": 0.1284, + "step": 8940 + }, + { + "epoch": 2.85, + "grad_norm": 0.9859854578971863, + "learning_rate": 5.296677635070587e-06, + "loss": 0.0504, + "step": 8950 + }, + { + "epoch": 2.85, + "grad_norm": 0.5083135962486267, + "learning_rate": 5.190531790680395e-06, + "loss": 0.0193, + "step": 8960 + }, + { + "epoch": 2.86, + "grad_norm": 3.466031789779663, + "learning_rate": 5.084385946290203e-06, + "loss": 0.0459, + "step": 8970 + }, + { + "epoch": 2.86, + "grad_norm": 8.049098014831543, + "learning_rate": 4.97824010190001e-06, + "loss": 0.1025, + "step": 8980 + }, + { + "epoch": 2.86, + "grad_norm": 5.528136730194092, + "learning_rate": 4.872094257509819e-06, + "loss": 0.109, + "step": 8990 + }, + { + "epoch": 2.86, + "grad_norm": 0.02654377557337284, + "learning_rate": 4.765948413119627e-06, + "loss": 0.1655, + "step": 9000 + } + ], + "logging_steps": 10, + "max_steps": 9423, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 1.0453875280157082e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}