{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 21.660649819494584, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.036101083032490974, "grad_norm": 0.8792149424552917, "learning_rate": 6.000000000000001e-07, "loss": 1.1882, "step": 10 }, { "epoch": 0.07220216606498195, "grad_norm": 0.7292852997779846, "learning_rate": 1.2666666666666667e-06, "loss": 1.183, "step": 20 }, { "epoch": 0.10830324909747292, "grad_norm": 0.6755103468894958, "learning_rate": 1.9333333333333336e-06, "loss": 1.172, "step": 30 }, { "epoch": 0.1444043321299639, "grad_norm": 0.5594439506530762, "learning_rate": 2.6e-06, "loss": 1.1539, "step": 40 }, { "epoch": 0.18050541516245489, "grad_norm": 0.5405519008636475, "learning_rate": 3.2666666666666666e-06, "loss": 1.1356, "step": 50 }, { "epoch": 0.21660649819494585, "grad_norm": 0.5031427145004272, "learning_rate": 3.9333333333333335e-06, "loss": 1.1221, "step": 60 }, { "epoch": 0.2527075812274368, "grad_norm": 0.502995491027832, "learning_rate": 4.6e-06, "loss": 1.1053, "step": 70 }, { "epoch": 0.2888086642599278, "grad_norm": 0.46390628814697266, "learning_rate": 5.266666666666667e-06, "loss": 1.0908, "step": 80 }, { "epoch": 0.3249097472924188, "grad_norm": 0.39707711338996887, "learning_rate": 5.933333333333334e-06, "loss": 1.0768, "step": 90 }, { "epoch": 0.36101083032490977, "grad_norm": 0.2680375277996063, "learning_rate": 6.6e-06, "loss": 1.0651, "step": 100 }, { "epoch": 0.3971119133574007, "grad_norm": 0.20656010508537292, "learning_rate": 7.266666666666668e-06, "loss": 1.0499, "step": 110 }, { "epoch": 0.4332129963898917, "grad_norm": 0.2176329642534256, "learning_rate": 7.933333333333334e-06, "loss": 1.04, "step": 120 }, { "epoch": 0.4693140794223827, "grad_norm": 0.1952984780073166, "learning_rate": 8.599999999999999e-06, "loss": 1.0314, "step": 130 }, { "epoch": 0.5054151624548736, "grad_norm": 0.17900057137012482, "learning_rate": 9.266666666666667e-06, "loss": 1.023, "step": 140 }, { "epoch": 0.5415162454873647, "grad_norm": 0.15559467673301697, "learning_rate": 9.933333333333334e-06, "loss": 1.0192, "step": 150 }, { "epoch": 0.5776173285198556, "grad_norm": 0.1702832281589508, "learning_rate": 1.06e-05, "loss": 1.0109, "step": 160 }, { "epoch": 0.6137184115523465, "grad_norm": 0.19487395882606506, "learning_rate": 1.1266666666666667e-05, "loss": 1.008, "step": 170 }, { "epoch": 0.6498194945848376, "grad_norm": 0.16858609020709991, "learning_rate": 1.1933333333333333e-05, "loss": 1.0011, "step": 180 }, { "epoch": 0.6859205776173285, "grad_norm": 0.2135946899652481, "learning_rate": 1.2600000000000001e-05, "loss": 0.9937, "step": 190 }, { "epoch": 0.7220216606498195, "grad_norm": 0.23672480881214142, "learning_rate": 1.3266666666666666e-05, "loss": 0.9848, "step": 200 }, { "epoch": 0.7581227436823105, "grad_norm": 0.36581552028656006, "learning_rate": 1.3933333333333334e-05, "loss": 0.9661, "step": 210 }, { "epoch": 0.7942238267148014, "grad_norm": 0.5021904110908508, "learning_rate": 1.4599999999999999e-05, "loss": 0.9308, "step": 220 }, { "epoch": 0.8303249097472925, "grad_norm": 0.9705759882926941, "learning_rate": 1.5266666666666667e-05, "loss": 0.8545, "step": 230 }, { "epoch": 0.8664259927797834, "grad_norm": 1.263683557510376, "learning_rate": 1.5933333333333332e-05, "loss": 0.7534, "step": 240 }, { "epoch": 0.9025270758122743, "grad_norm": 1.0056917667388916, "learning_rate": 1.66e-05, "loss": 0.669, "step": 250 }, { "epoch": 0.9386281588447654, "grad_norm": 1.5650357007980347, "learning_rate": 1.726666666666667e-05, "loss": 0.595, "step": 260 }, { "epoch": 0.9747292418772563, "grad_norm": 1.4451838731765747, "learning_rate": 1.7933333333333337e-05, "loss": 0.5332, "step": 270 }, { "epoch": 1.0108303249097472, "grad_norm": 1.5739387273788452, "learning_rate": 1.86e-05, "loss": 0.4718, "step": 280 }, { "epoch": 1.0469314079422383, "grad_norm": 1.5814650058746338, "learning_rate": 1.926666666666667e-05, "loss": 0.4245, "step": 290 }, { "epoch": 1.0830324909747293, "grad_norm": 1.677675485610962, "learning_rate": 1.9933333333333334e-05, "loss": 0.3778, "step": 300 }, { "epoch": 1.1191335740072201, "grad_norm": 1.9410228729248047, "learning_rate": 2.06e-05, "loss": 0.3371, "step": 310 }, { "epoch": 1.1552346570397112, "grad_norm": 1.7728365659713745, "learning_rate": 2.1266666666666667e-05, "loss": 0.3061, "step": 320 }, { "epoch": 1.1913357400722022, "grad_norm": 1.5503789186477661, "learning_rate": 2.1933333333333332e-05, "loss": 0.2825, "step": 330 }, { "epoch": 1.2274368231046933, "grad_norm": 1.77287757396698, "learning_rate": 2.26e-05, "loss": 0.2613, "step": 340 }, { "epoch": 1.263537906137184, "grad_norm": 1.6768401861190796, "learning_rate": 2.326666666666667e-05, "loss": 0.2476, "step": 350 }, { "epoch": 1.2996389891696751, "grad_norm": 1.5678222179412842, "learning_rate": 2.3933333333333337e-05, "loss": 0.2395, "step": 360 }, { "epoch": 1.335740072202166, "grad_norm": 1.7035478353500366, "learning_rate": 2.46e-05, "loss": 0.2336, "step": 370 }, { "epoch": 1.371841155234657, "grad_norm": 1.7723883390426636, "learning_rate": 2.5266666666666666e-05, "loss": 0.2269, "step": 380 }, { "epoch": 1.407942238267148, "grad_norm": 1.5948989391326904, "learning_rate": 2.5933333333333338e-05, "loss": 0.224, "step": 390 }, { "epoch": 1.444043321299639, "grad_norm": 2.1878089904785156, "learning_rate": 2.6600000000000003e-05, "loss": 0.2185, "step": 400 }, { "epoch": 1.48014440433213, "grad_norm": 1.9472143650054932, "learning_rate": 2.7266666666666668e-05, "loss": 0.2148, "step": 410 }, { "epoch": 1.516245487364621, "grad_norm": 2.4479265213012695, "learning_rate": 2.7933333333333332e-05, "loss": 0.215, "step": 420 }, { "epoch": 1.5523465703971118, "grad_norm": 1.6668602228164673, "learning_rate": 2.86e-05, "loss": 0.2156, "step": 430 }, { "epoch": 1.5884476534296028, "grad_norm": 2.297792434692383, "learning_rate": 2.926666666666667e-05, "loss": 0.2131, "step": 440 }, { "epoch": 1.6245487364620939, "grad_norm": 1.9421613216400146, "learning_rate": 2.9933333333333337e-05, "loss": 0.2136, "step": 450 }, { "epoch": 1.660649819494585, "grad_norm": 2.2717418670654297, "learning_rate": 3.06e-05, "loss": 0.2093, "step": 460 }, { "epoch": 1.696750902527076, "grad_norm": 2.1423869132995605, "learning_rate": 3.126666666666666e-05, "loss": 0.2094, "step": 470 }, { "epoch": 1.7328519855595668, "grad_norm": 2.2144198417663574, "learning_rate": 3.1933333333333335e-05, "loss": 0.2092, "step": 480 }, { "epoch": 1.7689530685920578, "grad_norm": 1.9322779178619385, "learning_rate": 3.26e-05, "loss": 0.2064, "step": 490 }, { "epoch": 1.8050541516245486, "grad_norm": 1.9350993633270264, "learning_rate": 3.326666666666667e-05, "loss": 0.2077, "step": 500 }, { "epoch": 1.8411552346570397, "grad_norm": 1.7844723463058472, "learning_rate": 3.3933333333333336e-05, "loss": 0.2067, "step": 510 }, { "epoch": 1.8772563176895307, "grad_norm": 2.405451774597168, "learning_rate": 3.46e-05, "loss": 0.2026, "step": 520 }, { "epoch": 1.9133574007220218, "grad_norm": 2.382427453994751, "learning_rate": 3.526666666666667e-05, "loss": 0.2004, "step": 530 }, { "epoch": 1.9494584837545126, "grad_norm": 2.5038397312164307, "learning_rate": 3.593333333333334e-05, "loss": 0.2034, "step": 540 }, { "epoch": 1.9855595667870036, "grad_norm": 2.2127082347869873, "learning_rate": 3.66e-05, "loss": 0.2005, "step": 550 }, { "epoch": 2.0216606498194944, "grad_norm": 2.9056589603424072, "learning_rate": 3.726666666666667e-05, "loss": 0.1995, "step": 560 }, { "epoch": 2.0577617328519855, "grad_norm": 2.2383415699005127, "learning_rate": 3.793333333333334e-05, "loss": 0.2031, "step": 570 }, { "epoch": 2.0938628158844765, "grad_norm": 1.9492878913879395, "learning_rate": 3.86e-05, "loss": 0.1991, "step": 580 }, { "epoch": 2.1299638989169676, "grad_norm": 1.8478870391845703, "learning_rate": 3.926666666666667e-05, "loss": 0.1993, "step": 590 }, { "epoch": 2.1660649819494586, "grad_norm": 2.0714190006256104, "learning_rate": 3.993333333333333e-05, "loss": 0.1977, "step": 600 }, { "epoch": 2.2021660649819497, "grad_norm": 2.0085227489471436, "learning_rate": 4.0600000000000004e-05, "loss": 0.1949, "step": 610 }, { "epoch": 2.2382671480144403, "grad_norm": 2.1462786197662354, "learning_rate": 4.126666666666667e-05, "loss": 0.1955, "step": 620 }, { "epoch": 2.2743682310469313, "grad_norm": 1.9504297971725464, "learning_rate": 4.1933333333333334e-05, "loss": 0.1932, "step": 630 }, { "epoch": 2.3104693140794224, "grad_norm": 2.304089069366455, "learning_rate": 4.26e-05, "loss": 0.1935, "step": 640 }, { "epoch": 2.3465703971119134, "grad_norm": 2.4123048782348633, "learning_rate": 4.3266666666666664e-05, "loss": 0.1911, "step": 650 }, { "epoch": 2.3826714801444044, "grad_norm": 2.5345194339752197, "learning_rate": 4.3933333333333335e-05, "loss": 0.1861, "step": 660 }, { "epoch": 2.4187725631768955, "grad_norm": 2.2014682292938232, "learning_rate": 4.46e-05, "loss": 0.1887, "step": 670 }, { "epoch": 2.4548736462093865, "grad_norm": 2.7363312244415283, "learning_rate": 4.526666666666667e-05, "loss": 0.1826, "step": 680 }, { "epoch": 2.490974729241877, "grad_norm": 2.445457935333252, "learning_rate": 4.5933333333333336e-05, "loss": 0.1782, "step": 690 }, { "epoch": 2.527075812274368, "grad_norm": 3.096940279006958, "learning_rate": 4.660000000000001e-05, "loss": 0.1719, "step": 700 }, { "epoch": 2.563176895306859, "grad_norm": 3.281512498855591, "learning_rate": 4.726666666666667e-05, "loss": 0.168, "step": 710 }, { "epoch": 2.5992779783393503, "grad_norm": 3.32570743560791, "learning_rate": 4.793333333333334e-05, "loss": 0.1634, "step": 720 }, { "epoch": 2.6353790613718413, "grad_norm": 2.5920348167419434, "learning_rate": 4.86e-05, "loss": 0.1641, "step": 730 }, { "epoch": 2.671480144404332, "grad_norm": 3.0771641731262207, "learning_rate": 4.926666666666667e-05, "loss": 0.1589, "step": 740 }, { "epoch": 2.707581227436823, "grad_norm": 3.3847527503967285, "learning_rate": 4.993333333333334e-05, "loss": 0.1588, "step": 750 }, { "epoch": 2.743682310469314, "grad_norm": 3.3967106342315674, "learning_rate": 5.0600000000000003e-05, "loss": 0.1537, "step": 760 }, { "epoch": 2.779783393501805, "grad_norm": 3.562208652496338, "learning_rate": 5.1266666666666675e-05, "loss": 0.1518, "step": 770 }, { "epoch": 2.815884476534296, "grad_norm": 4.249292850494385, "learning_rate": 5.193333333333333e-05, "loss": 0.1517, "step": 780 }, { "epoch": 2.851985559566787, "grad_norm": 3.7535207271575928, "learning_rate": 5.2600000000000005e-05, "loss": 0.1489, "step": 790 }, { "epoch": 2.888086642599278, "grad_norm": 4.374902248382568, "learning_rate": 5.326666666666666e-05, "loss": 0.1449, "step": 800 }, { "epoch": 2.9241877256317688, "grad_norm": 3.9763412475585938, "learning_rate": 5.3933333333333334e-05, "loss": 0.1359, "step": 810 }, { "epoch": 2.96028880866426, "grad_norm": 3.976372241973877, "learning_rate": 5.4600000000000006e-05, "loss": 0.1327, "step": 820 }, { "epoch": 2.996389891696751, "grad_norm": 2.532400131225586, "learning_rate": 5.5266666666666664e-05, "loss": 0.1238, "step": 830 }, { "epoch": 3.032490974729242, "grad_norm": 7.225079536437988, "learning_rate": 5.5933333333333335e-05, "loss": 0.1338, "step": 840 }, { "epoch": 3.068592057761733, "grad_norm": 3.196591854095459, "learning_rate": 5.66e-05, "loss": 0.1232, "step": 850 }, { "epoch": 3.104693140794224, "grad_norm": 4.404566764831543, "learning_rate": 5.726666666666667e-05, "loss": 0.1137, "step": 860 }, { "epoch": 3.140794223826715, "grad_norm": 5.581392765045166, "learning_rate": 5.7933333333333337e-05, "loss": 0.1121, "step": 870 }, { "epoch": 3.1768953068592056, "grad_norm": 4.208507061004639, "learning_rate": 5.86e-05, "loss": 0.1147, "step": 880 }, { "epoch": 3.2129963898916967, "grad_norm": 4.072608947753906, "learning_rate": 5.926666666666667e-05, "loss": 0.1048, "step": 890 }, { "epoch": 3.2490974729241877, "grad_norm": 6.415537357330322, "learning_rate": 5.9933333333333345e-05, "loss": 0.1023, "step": 900 }, { "epoch": 3.2851985559566788, "grad_norm": 5.0296854972839355, "learning_rate": 6.06e-05, "loss": 0.0996, "step": 910 }, { "epoch": 3.32129963898917, "grad_norm": 3.894113779067993, "learning_rate": 6.126666666666667e-05, "loss": 0.0899, "step": 920 }, { "epoch": 3.357400722021661, "grad_norm": 4.2843017578125, "learning_rate": 6.193333333333333e-05, "loss": 0.0829, "step": 930 }, { "epoch": 3.3935018050541514, "grad_norm": 7.592728614807129, "learning_rate": 6.26e-05, "loss": 0.0781, "step": 940 }, { "epoch": 3.4296028880866425, "grad_norm": 5.444018840789795, "learning_rate": 6.326666666666667e-05, "loss": 0.0922, "step": 950 }, { "epoch": 3.4657039711191335, "grad_norm": 4.786616802215576, "learning_rate": 6.393333333333333e-05, "loss": 0.0789, "step": 960 }, { "epoch": 3.5018050541516246, "grad_norm": 3.325745105743408, "learning_rate": 6.460000000000001e-05, "loss": 0.0673, "step": 970 }, { "epoch": 3.5379061371841156, "grad_norm": 3.444308280944824, "learning_rate": 6.526666666666666e-05, "loss": 0.063, "step": 980 }, { "epoch": 3.5740072202166067, "grad_norm": 4.334812641143799, "learning_rate": 6.593333333333334e-05, "loss": 0.0629, "step": 990 }, { "epoch": 3.6101083032490973, "grad_norm": 3.642155647277832, "learning_rate": 6.66e-05, "loss": 0.0584, "step": 1000 }, { "epoch": 3.6462093862815883, "grad_norm": 3.5180516242980957, "learning_rate": 6.726666666666667e-05, "loss": 0.052, "step": 1010 }, { "epoch": 3.6823104693140793, "grad_norm": 2.7395567893981934, "learning_rate": 6.793333333333334e-05, "loss": 0.049, "step": 1020 }, { "epoch": 3.7184115523465704, "grad_norm": 6.758563041687012, "learning_rate": 6.860000000000001e-05, "loss": 0.0553, "step": 1030 }, { "epoch": 3.7545126353790614, "grad_norm": 3.3643603324890137, "learning_rate": 6.926666666666667e-05, "loss": 0.066, "step": 1040 }, { "epoch": 3.7906137184115525, "grad_norm": 5.385725021362305, "learning_rate": 6.993333333333334e-05, "loss": 0.0769, "step": 1050 }, { "epoch": 3.8267148014440435, "grad_norm": 3.6493208408355713, "learning_rate": 7.06e-05, "loss": 0.0569, "step": 1060 }, { "epoch": 3.862815884476534, "grad_norm": 3.325450897216797, "learning_rate": 7.126666666666667e-05, "loss": 0.0483, "step": 1070 }, { "epoch": 3.898916967509025, "grad_norm": 2.4501149654388428, "learning_rate": 7.193333333333334e-05, "loss": 0.0408, "step": 1080 }, { "epoch": 3.935018050541516, "grad_norm": 1.90947425365448, "learning_rate": 7.26e-05, "loss": 0.0362, "step": 1090 }, { "epoch": 3.9711191335740073, "grad_norm": 1.8476409912109375, "learning_rate": 7.326666666666667e-05, "loss": 0.0392, "step": 1100 }, { "epoch": 4.007220216606498, "grad_norm": 2.7553763389587402, "learning_rate": 7.393333333333333e-05, "loss": 0.0381, "step": 1110 }, { "epoch": 4.043321299638989, "grad_norm": 2.647038698196411, "learning_rate": 7.46e-05, "loss": 0.0338, "step": 1120 }, { "epoch": 4.07942238267148, "grad_norm": 2.1170711517333984, "learning_rate": 7.526666666666668e-05, "loss": 0.0353, "step": 1130 }, { "epoch": 4.115523465703971, "grad_norm": 2.6540422439575195, "learning_rate": 7.593333333333334e-05, "loss": 0.0326, "step": 1140 }, { "epoch": 4.1516245487364625, "grad_norm": 2.1124935150146484, "learning_rate": 7.66e-05, "loss": 0.0309, "step": 1150 }, { "epoch": 4.187725631768953, "grad_norm": 2.653675079345703, "learning_rate": 7.726666666666667e-05, "loss": 0.0319, "step": 1160 }, { "epoch": 4.223826714801444, "grad_norm": 2.696803569793701, "learning_rate": 7.793333333333333e-05, "loss": 0.0328, "step": 1170 }, { "epoch": 4.259927797833935, "grad_norm": 2.675212860107422, "learning_rate": 7.860000000000001e-05, "loss": 0.0317, "step": 1180 }, { "epoch": 4.296028880866426, "grad_norm": 2.1661317348480225, "learning_rate": 7.926666666666666e-05, "loss": 0.0293, "step": 1190 }, { "epoch": 4.332129963898917, "grad_norm": 2.6710309982299805, "learning_rate": 7.993333333333334e-05, "loss": 0.0309, "step": 1200 }, { "epoch": 4.368231046931408, "grad_norm": 2.7322568893432617, "learning_rate": 8.060000000000001e-05, "loss": 0.0311, "step": 1210 }, { "epoch": 4.404332129963899, "grad_norm": 2.741199016571045, "learning_rate": 8.126666666666667e-05, "loss": 0.03, "step": 1220 }, { "epoch": 4.44043321299639, "grad_norm": 2.3981032371520996, "learning_rate": 8.193333333333334e-05, "loss": 0.0282, "step": 1230 }, { "epoch": 4.4765342960288805, "grad_norm": 2.7882485389709473, "learning_rate": 8.26e-05, "loss": 0.028, "step": 1240 }, { "epoch": 4.512635379061372, "grad_norm": 2.624581813812256, "learning_rate": 8.326666666666667e-05, "loss": 0.0307, "step": 1250 }, { "epoch": 4.548736462093863, "grad_norm": 2.1602065563201904, "learning_rate": 8.393333333333335e-05, "loss": 0.0275, "step": 1260 }, { "epoch": 4.584837545126354, "grad_norm": 1.852003812789917, "learning_rate": 8.46e-05, "loss": 0.025, "step": 1270 }, { "epoch": 4.620938628158845, "grad_norm": 2.615730047225952, "learning_rate": 8.526666666666667e-05, "loss": 0.03, "step": 1280 }, { "epoch": 4.657039711191336, "grad_norm": 2.2223312854766846, "learning_rate": 8.593333333333333e-05, "loss": 0.0267, "step": 1290 }, { "epoch": 4.693140794223827, "grad_norm": 2.612130880355835, "learning_rate": 8.66e-05, "loss": 0.027, "step": 1300 }, { "epoch": 4.729241877256317, "grad_norm": 2.06195068359375, "learning_rate": 8.726666666666667e-05, "loss": 0.0256, "step": 1310 }, { "epoch": 4.765342960288809, "grad_norm": 2.914454936981201, "learning_rate": 8.793333333333333e-05, "loss": 0.0263, "step": 1320 }, { "epoch": 4.8014440433212995, "grad_norm": 2.4399166107177734, "learning_rate": 8.86e-05, "loss": 0.0269, "step": 1330 }, { "epoch": 4.837545126353791, "grad_norm": 2.0535504817962646, "learning_rate": 8.926666666666668e-05, "loss": 0.026, "step": 1340 }, { "epoch": 4.873646209386282, "grad_norm": 2.612743377685547, "learning_rate": 8.993333333333334e-05, "loss": 0.0265, "step": 1350 }, { "epoch": 4.909747292418773, "grad_norm": 2.352599859237671, "learning_rate": 9.06e-05, "loss": 0.026, "step": 1360 }, { "epoch": 4.945848375451264, "grad_norm": 2.1630280017852783, "learning_rate": 9.126666666666667e-05, "loss": 0.0253, "step": 1370 }, { "epoch": 4.981949458483754, "grad_norm": 2.0943126678466797, "learning_rate": 9.193333333333334e-05, "loss": 0.0225, "step": 1380 }, { "epoch": 5.018050541516246, "grad_norm": 1.8519400358200073, "learning_rate": 9.260000000000001e-05, "loss": 0.0245, "step": 1390 }, { "epoch": 5.054151624548736, "grad_norm": 2.359534740447998, "learning_rate": 9.326666666666667e-05, "loss": 0.0241, "step": 1400 }, { "epoch": 5.090252707581228, "grad_norm": 1.862101435661316, "learning_rate": 9.393333333333334e-05, "loss": 0.0268, "step": 1410 }, { "epoch": 5.126353790613718, "grad_norm": 2.0919692516326904, "learning_rate": 9.46e-05, "loss": 0.0237, "step": 1420 }, { "epoch": 5.162454873646209, "grad_norm": 1.861625075340271, "learning_rate": 9.526666666666667e-05, "loss": 0.0258, "step": 1430 }, { "epoch": 5.1985559566787005, "grad_norm": 2.132181167602539, "learning_rate": 9.593333333333334e-05, "loss": 0.0252, "step": 1440 }, { "epoch": 5.234657039711191, "grad_norm": 1.9146952629089355, "learning_rate": 9.66e-05, "loss": 0.0233, "step": 1450 }, { "epoch": 5.270758122743683, "grad_norm": 1.6309505701065063, "learning_rate": 9.726666666666667e-05, "loss": 0.0226, "step": 1460 }, { "epoch": 5.306859205776173, "grad_norm": 1.7791682481765747, "learning_rate": 9.793333333333333e-05, "loss": 0.0257, "step": 1470 }, { "epoch": 5.342960288808664, "grad_norm": 1.6199488639831543, "learning_rate": 9.86e-05, "loss": 0.0221, "step": 1480 }, { "epoch": 5.379061371841155, "grad_norm": 2.206078052520752, "learning_rate": 9.926666666666668e-05, "loss": 0.0206, "step": 1490 }, { "epoch": 5.415162454873646, "grad_norm": 1.5716779232025146, "learning_rate": 9.993333333333334e-05, "loss": 0.021, "step": 1500 }, { "epoch": 5.451263537906137, "grad_norm": 1.9805904626846313, "learning_rate": 9.999997539434007e-05, "loss": 0.021, "step": 1510 }, { "epoch": 5.487364620938628, "grad_norm": 2.1759257316589355, "learning_rate": 9.999989033776898e-05, "loss": 0.0202, "step": 1520 }, { "epoch": 5.5234657039711195, "grad_norm": 2.1213581562042236, "learning_rate": 9.999974452661641e-05, "loss": 0.0204, "step": 1530 }, { "epoch": 5.55956678700361, "grad_norm": 1.5911200046539307, "learning_rate": 9.999953796105959e-05, "loss": 0.0214, "step": 1540 }, { "epoch": 5.595667870036101, "grad_norm": 1.9394819736480713, "learning_rate": 9.999927064134949e-05, "loss": 0.0243, "step": 1550 }, { "epoch": 5.631768953068592, "grad_norm": 1.736773133277893, "learning_rate": 9.999894256781095e-05, "loss": 0.0215, "step": 1560 }, { "epoch": 5.667870036101083, "grad_norm": 1.6920562982559204, "learning_rate": 9.99985537408426e-05, "loss": 0.0222, "step": 1570 }, { "epoch": 5.703971119133574, "grad_norm": 1.6096794605255127, "learning_rate": 9.999810416091688e-05, "loss": 0.0194, "step": 1580 }, { "epoch": 5.740072202166065, "grad_norm": 1.619868278503418, "learning_rate": 9.99975938285801e-05, "loss": 0.0184, "step": 1590 }, { "epoch": 5.776173285198556, "grad_norm": 1.3996607065200806, "learning_rate": 9.999702274445236e-05, "loss": 0.0198, "step": 1600 }, { "epoch": 5.812274368231047, "grad_norm": 2.005847454071045, "learning_rate": 9.999639090922756e-05, "loss": 0.0185, "step": 1610 }, { "epoch": 5.8483754512635375, "grad_norm": 1.3449925184249878, "learning_rate": 9.999569832367346e-05, "loss": 0.0188, "step": 1620 }, { "epoch": 5.884476534296029, "grad_norm": 1.3387703895568848, "learning_rate": 9.999494498863162e-05, "loss": 0.0188, "step": 1630 }, { "epoch": 5.92057761732852, "grad_norm": 1.3687690496444702, "learning_rate": 9.99941309050174e-05, "loss": 0.019, "step": 1640 }, { "epoch": 5.956678700361011, "grad_norm": 1.5104888677597046, "learning_rate": 9.999325607381999e-05, "loss": 0.0219, "step": 1650 }, { "epoch": 5.992779783393502, "grad_norm": 1.6341077089309692, "learning_rate": 9.999232049610238e-05, "loss": 0.0193, "step": 1660 }, { "epoch": 6.028880866425993, "grad_norm": 1.4791560173034668, "learning_rate": 9.999132417300142e-05, "loss": 0.0182, "step": 1670 }, { "epoch": 6.064981949458484, "grad_norm": 2.0534698963165283, "learning_rate": 9.99902671057277e-05, "loss": 0.0185, "step": 1680 }, { "epoch": 6.101083032490974, "grad_norm": 1.9715570211410522, "learning_rate": 9.998914929556569e-05, "loss": 0.0197, "step": 1690 }, { "epoch": 6.137184115523466, "grad_norm": 1.714123010635376, "learning_rate": 9.998797074387361e-05, "loss": 0.0186, "step": 1700 }, { "epoch": 6.1732851985559565, "grad_norm": 1.6185153722763062, "learning_rate": 9.99867314520835e-05, "loss": 0.016, "step": 1710 }, { "epoch": 6.209386281588448, "grad_norm": 1.3633183240890503, "learning_rate": 9.998543142170126e-05, "loss": 0.0193, "step": 1720 }, { "epoch": 6.245487364620939, "grad_norm": 1.5912808179855347, "learning_rate": 9.99840706543065e-05, "loss": 0.0194, "step": 1730 }, { "epoch": 6.28158844765343, "grad_norm": 1.9936370849609375, "learning_rate": 9.998264915155274e-05, "loss": 0.0172, "step": 1740 }, { "epoch": 6.317689530685921, "grad_norm": 1.7528094053268433, "learning_rate": 9.998116691516718e-05, "loss": 0.0185, "step": 1750 }, { "epoch": 6.353790613718411, "grad_norm": 1.7351869344711304, "learning_rate": 9.997962394695091e-05, "loss": 0.0174, "step": 1760 }, { "epoch": 6.389891696750903, "grad_norm": 1.1427206993103027, "learning_rate": 9.997802024877875e-05, "loss": 0.0176, "step": 1770 }, { "epoch": 6.425992779783393, "grad_norm": 1.2416205406188965, "learning_rate": 9.99763558225994e-05, "loss": 0.0169, "step": 1780 }, { "epoch": 6.462093862815885, "grad_norm": 1.6932954788208008, "learning_rate": 9.997463067043526e-05, "loss": 0.0178, "step": 1790 }, { "epoch": 6.498194945848375, "grad_norm": 1.6806602478027344, "learning_rate": 9.997284479438253e-05, "loss": 0.0186, "step": 1800 }, { "epoch": 6.534296028880867, "grad_norm": 1.7252382040023804, "learning_rate": 9.997099819661127e-05, "loss": 0.0169, "step": 1810 }, { "epoch": 6.5703971119133575, "grad_norm": 1.7361781597137451, "learning_rate": 9.996909087936524e-05, "loss": 0.0172, "step": 1820 }, { "epoch": 6.606498194945848, "grad_norm": 1.5690287351608276, "learning_rate": 9.996712284496201e-05, "loss": 0.0186, "step": 1830 }, { "epoch": 6.64259927797834, "grad_norm": 1.3086954355239868, "learning_rate": 9.996509409579293e-05, "loss": 0.016, "step": 1840 }, { "epoch": 6.67870036101083, "grad_norm": 1.392793893814087, "learning_rate": 9.996300463432312e-05, "loss": 0.0167, "step": 1850 }, { "epoch": 6.714801444043322, "grad_norm": 1.2733964920043945, "learning_rate": 9.996085446309148e-05, "loss": 0.0166, "step": 1860 }, { "epoch": 6.750902527075812, "grad_norm": 1.3202944993972778, "learning_rate": 9.995864358471066e-05, "loss": 0.0161, "step": 1870 }, { "epoch": 6.787003610108303, "grad_norm": 1.843495488166809, "learning_rate": 9.99563720018671e-05, "loss": 0.0168, "step": 1880 }, { "epoch": 6.823104693140794, "grad_norm": 1.3712539672851562, "learning_rate": 9.995403971732098e-05, "loss": 0.0158, "step": 1890 }, { "epoch": 6.859205776173285, "grad_norm": 1.2827091217041016, "learning_rate": 9.995164673390625e-05, "loss": 0.0174, "step": 1900 }, { "epoch": 6.8953068592057765, "grad_norm": 1.1793349981307983, "learning_rate": 9.994919305453059e-05, "loss": 0.0143, "step": 1910 }, { "epoch": 6.931407942238267, "grad_norm": 1.3544561862945557, "learning_rate": 9.994667868217548e-05, "loss": 0.0162, "step": 1920 }, { "epoch": 6.967509025270758, "grad_norm": 1.3066773414611816, "learning_rate": 9.99441036198961e-05, "loss": 0.0151, "step": 1930 }, { "epoch": 7.003610108303249, "grad_norm": 1.5574922561645508, "learning_rate": 9.99414678708214e-05, "loss": 0.0152, "step": 1940 }, { "epoch": 7.03971119133574, "grad_norm": 1.518237829208374, "learning_rate": 9.993877143815407e-05, "loss": 0.015, "step": 1950 }, { "epoch": 7.075812274368231, "grad_norm": 1.5742323398590088, "learning_rate": 9.993601432517053e-05, "loss": 0.0188, "step": 1960 }, { "epoch": 7.111913357400722, "grad_norm": 1.7677223682403564, "learning_rate": 9.993319653522091e-05, "loss": 0.0163, "step": 1970 }, { "epoch": 7.148014440433213, "grad_norm": 1.4965007305145264, "learning_rate": 9.993031807172911e-05, "loss": 0.0164, "step": 1980 }, { "epoch": 7.184115523465704, "grad_norm": 1.4197651147842407, "learning_rate": 9.992737893819273e-05, "loss": 0.0187, "step": 1990 }, { "epoch": 7.2202166064981945, "grad_norm": 1.5714497566223145, "learning_rate": 9.992437913818312e-05, "loss": 0.0176, "step": 2000 }, { "epoch": 7.256317689530686, "grad_norm": 1.3549633026123047, "learning_rate": 9.992131867534526e-05, "loss": 0.0154, "step": 2010 }, { "epoch": 7.292418772563177, "grad_norm": 1.270080327987671, "learning_rate": 9.991819755339796e-05, "loss": 0.0141, "step": 2020 }, { "epoch": 7.328519855595668, "grad_norm": 1.090819001197815, "learning_rate": 9.991501577613365e-05, "loss": 0.0135, "step": 2030 }, { "epoch": 7.364620938628159, "grad_norm": 1.3705732822418213, "learning_rate": 9.99117733474185e-05, "loss": 0.0149, "step": 2040 }, { "epoch": 7.40072202166065, "grad_norm": 1.3088817596435547, "learning_rate": 9.990847027119234e-05, "loss": 0.0148, "step": 2050 }, { "epoch": 7.436823104693141, "grad_norm": 1.511338233947754, "learning_rate": 9.990510655146877e-05, "loss": 0.0149, "step": 2060 }, { "epoch": 7.472924187725631, "grad_norm": 1.5276422500610352, "learning_rate": 9.990168219233496e-05, "loss": 0.016, "step": 2070 }, { "epoch": 7.509025270758123, "grad_norm": 1.3451029062271118, "learning_rate": 9.989819719795188e-05, "loss": 0.015, "step": 2080 }, { "epoch": 7.5451263537906135, "grad_norm": 1.1688835620880127, "learning_rate": 9.989465157255412e-05, "loss": 0.0141, "step": 2090 }, { "epoch": 7.581227436823105, "grad_norm": 0.800317108631134, "learning_rate": 9.989104532044994e-05, "loss": 0.0128, "step": 2100 }, { "epoch": 7.617328519855596, "grad_norm": 0.9989607334136963, "learning_rate": 9.988737844602128e-05, "loss": 0.0127, "step": 2110 }, { "epoch": 7.653429602888087, "grad_norm": 1.1656674146652222, "learning_rate": 9.988365095372372e-05, "loss": 0.0139, "step": 2120 }, { "epoch": 7.689530685920578, "grad_norm": 1.295882225036621, "learning_rate": 9.987986284808654e-05, "loss": 0.014, "step": 2130 }, { "epoch": 7.725631768953068, "grad_norm": 1.0984694957733154, "learning_rate": 9.987601413371264e-05, "loss": 0.0129, "step": 2140 }, { "epoch": 7.76173285198556, "grad_norm": 1.3702571392059326, "learning_rate": 9.987210481527855e-05, "loss": 0.0149, "step": 2150 }, { "epoch": 7.79783393501805, "grad_norm": 1.617008090019226, "learning_rate": 9.98681348975345e-05, "loss": 0.0141, "step": 2160 }, { "epoch": 7.833935018050542, "grad_norm": 1.3128085136413574, "learning_rate": 9.986410438530427e-05, "loss": 0.0154, "step": 2170 }, { "epoch": 7.870036101083032, "grad_norm": 1.1459583044052124, "learning_rate": 9.986001328348534e-05, "loss": 0.0127, "step": 2180 }, { "epoch": 7.906137184115524, "grad_norm": 1.0433539152145386, "learning_rate": 9.985586159704878e-05, "loss": 0.0143, "step": 2190 }, { "epoch": 7.9422382671480145, "grad_norm": 0.9214907288551331, "learning_rate": 9.985164933103929e-05, "loss": 0.0139, "step": 2200 }, { "epoch": 7.978339350180505, "grad_norm": 1.1234197616577148, "learning_rate": 9.984737649057513e-05, "loss": 0.0132, "step": 2210 }, { "epoch": 8.014440433212997, "grad_norm": 1.1866891384124756, "learning_rate": 9.984304308084827e-05, "loss": 0.0135, "step": 2220 }, { "epoch": 8.050541516245488, "grad_norm": 1.1017005443572998, "learning_rate": 9.983864910712416e-05, "loss": 0.0128, "step": 2230 }, { "epoch": 8.086642599277978, "grad_norm": 0.9575669169425964, "learning_rate": 9.98341945747419e-05, "loss": 0.0128, "step": 2240 }, { "epoch": 8.12274368231047, "grad_norm": 0.9450060129165649, "learning_rate": 9.98296794891142e-05, "loss": 0.0138, "step": 2250 }, { "epoch": 8.15884476534296, "grad_norm": 0.7817940711975098, "learning_rate": 9.982510385572725e-05, "loss": 0.0111, "step": 2260 }, { "epoch": 8.19494584837545, "grad_norm": 1.0894697904586792, "learning_rate": 9.982046768014094e-05, "loss": 0.0123, "step": 2270 }, { "epoch": 8.231046931407942, "grad_norm": 1.186640977859497, "learning_rate": 9.981577096798863e-05, "loss": 0.0133, "step": 2280 }, { "epoch": 8.267148014440433, "grad_norm": 1.110902190208435, "learning_rate": 9.981101372497727e-05, "loss": 0.0128, "step": 2290 }, { "epoch": 8.303249097472925, "grad_norm": 1.0896000862121582, "learning_rate": 9.980619595688737e-05, "loss": 0.0136, "step": 2300 }, { "epoch": 8.339350180505415, "grad_norm": 0.944868803024292, "learning_rate": 9.980131766957295e-05, "loss": 0.013, "step": 2310 }, { "epoch": 8.375451263537906, "grad_norm": 0.8644301891326904, "learning_rate": 9.979637886896163e-05, "loss": 0.0122, "step": 2320 }, { "epoch": 8.411552346570398, "grad_norm": 0.7042171359062195, "learning_rate": 9.979137956105447e-05, "loss": 0.0121, "step": 2330 }, { "epoch": 8.447653429602887, "grad_norm": 0.7054566740989685, "learning_rate": 9.978631975192613e-05, "loss": 0.0122, "step": 2340 }, { "epoch": 8.483754512635379, "grad_norm": 0.9158279895782471, "learning_rate": 9.978119944772475e-05, "loss": 0.0133, "step": 2350 }, { "epoch": 8.51985559566787, "grad_norm": 0.7022776007652283, "learning_rate": 9.977601865467197e-05, "loss": 0.0126, "step": 2360 }, { "epoch": 8.555956678700362, "grad_norm": 0.7667548060417175, "learning_rate": 9.977077737906297e-05, "loss": 0.0118, "step": 2370 }, { "epoch": 8.592057761732852, "grad_norm": 0.766633152961731, "learning_rate": 9.976547562726636e-05, "loss": 0.0135, "step": 2380 }, { "epoch": 8.628158844765343, "grad_norm": 0.8130113482475281, "learning_rate": 9.976011340572429e-05, "loss": 0.0135, "step": 2390 }, { "epoch": 8.664259927797834, "grad_norm": 0.9371001124382019, "learning_rate": 9.975469072095237e-05, "loss": 0.0116, "step": 2400 }, { "epoch": 8.700361010830324, "grad_norm": 0.9312263131141663, "learning_rate": 9.974920757953965e-05, "loss": 0.0131, "step": 2410 }, { "epoch": 8.736462093862816, "grad_norm": 0.9781073331832886, "learning_rate": 9.97436639881487e-05, "loss": 0.0127, "step": 2420 }, { "epoch": 8.772563176895307, "grad_norm": 0.923537015914917, "learning_rate": 9.973805995351545e-05, "loss": 0.0119, "step": 2430 }, { "epoch": 8.808664259927799, "grad_norm": 0.9015010595321655, "learning_rate": 9.973239548244939e-05, "loss": 0.0119, "step": 2440 }, { "epoch": 8.844765342960288, "grad_norm": 0.9531866312026978, "learning_rate": 9.972667058183333e-05, "loss": 0.0115, "step": 2450 }, { "epoch": 8.88086642599278, "grad_norm": 0.8554368019104004, "learning_rate": 9.972088525862362e-05, "loss": 0.0127, "step": 2460 }, { "epoch": 8.916967509025271, "grad_norm": 0.7815523147583008, "learning_rate": 9.971503951984995e-05, "loss": 0.0129, "step": 2470 }, { "epoch": 8.953068592057761, "grad_norm": 0.6722756624221802, "learning_rate": 9.970913337261543e-05, "loss": 0.0134, "step": 2480 }, { "epoch": 8.989169675090253, "grad_norm": 0.7026623487472534, "learning_rate": 9.97031668240966e-05, "loss": 0.0118, "step": 2490 }, { "epoch": 9.025270758122744, "grad_norm": 0.8489606976509094, "learning_rate": 9.969713988154339e-05, "loss": 0.0125, "step": 2500 }, { "epoch": 9.061371841155236, "grad_norm": 0.8544819951057434, "learning_rate": 9.969105255227906e-05, "loss": 0.0125, "step": 2510 }, { "epoch": 9.097472924187725, "grad_norm": 0.865927517414093, "learning_rate": 9.968490484370035e-05, "loss": 0.0117, "step": 2520 }, { "epoch": 9.133574007220217, "grad_norm": 0.9897856712341309, "learning_rate": 9.967869676327726e-05, "loss": 0.0109, "step": 2530 }, { "epoch": 9.169675090252708, "grad_norm": 1.0381364822387695, "learning_rate": 9.96724283185532e-05, "loss": 0.0102, "step": 2540 }, { "epoch": 9.205776173285198, "grad_norm": 1.1119805574417114, "learning_rate": 9.966609951714494e-05, "loss": 0.0117, "step": 2550 }, { "epoch": 9.24187725631769, "grad_norm": 1.0177388191223145, "learning_rate": 9.965971036674255e-05, "loss": 0.012, "step": 2560 }, { "epoch": 9.277978339350181, "grad_norm": 0.804807722568512, "learning_rate": 9.965326087510947e-05, "loss": 0.0125, "step": 2570 }, { "epoch": 9.314079422382672, "grad_norm": 0.9009324312210083, "learning_rate": 9.964675105008244e-05, "loss": 0.0118, "step": 2580 }, { "epoch": 9.350180505415162, "grad_norm": 0.7471716403961182, "learning_rate": 9.964018089957147e-05, "loss": 0.0093, "step": 2590 }, { "epoch": 9.386281588447654, "grad_norm": 0.7393360137939453, "learning_rate": 9.963355043155996e-05, "loss": 0.0111, "step": 2600 }, { "epoch": 9.422382671480145, "grad_norm": 0.7434808611869812, "learning_rate": 9.962685965410455e-05, "loss": 0.011, "step": 2610 }, { "epoch": 9.458483754512635, "grad_norm": 0.7294809818267822, "learning_rate": 9.962010857533514e-05, "loss": 0.0107, "step": 2620 }, { "epoch": 9.494584837545126, "grad_norm": 0.849262535572052, "learning_rate": 9.961329720345493e-05, "loss": 0.0117, "step": 2630 }, { "epoch": 9.530685920577618, "grad_norm": 0.7124184370040894, "learning_rate": 9.96064255467404e-05, "loss": 0.01, "step": 2640 }, { "epoch": 9.566787003610107, "grad_norm": 0.6838423013687134, "learning_rate": 9.959949361354126e-05, "loss": 0.01, "step": 2650 }, { "epoch": 9.602888086642599, "grad_norm": 0.5993446111679077, "learning_rate": 9.959250141228045e-05, "loss": 0.0099, "step": 2660 }, { "epoch": 9.63898916967509, "grad_norm": 0.6491556763648987, "learning_rate": 9.958544895145414e-05, "loss": 0.0102, "step": 2670 }, { "epoch": 9.675090252707582, "grad_norm": 0.7681136131286621, "learning_rate": 9.957833623963177e-05, "loss": 0.0099, "step": 2680 }, { "epoch": 9.711191335740072, "grad_norm": 0.7443376183509827, "learning_rate": 9.957116328545593e-05, "loss": 0.0104, "step": 2690 }, { "epoch": 9.747292418772563, "grad_norm": 0.862918496131897, "learning_rate": 9.956393009764244e-05, "loss": 0.0104, "step": 2700 }, { "epoch": 9.783393501805055, "grad_norm": 0.6566956043243408, "learning_rate": 9.955663668498032e-05, "loss": 0.0106, "step": 2710 }, { "epoch": 9.819494584837544, "grad_norm": 0.5753354430198669, "learning_rate": 9.954928305633173e-05, "loss": 0.0092, "step": 2720 }, { "epoch": 9.855595667870036, "grad_norm": 0.6308586597442627, "learning_rate": 9.954186922063204e-05, "loss": 0.0088, "step": 2730 }, { "epoch": 9.891696750902527, "grad_norm": 0.6705576181411743, "learning_rate": 9.953439518688974e-05, "loss": 0.0097, "step": 2740 }, { "epoch": 9.927797833935019, "grad_norm": 0.7357208132743835, "learning_rate": 9.952686096418652e-05, "loss": 0.0112, "step": 2750 }, { "epoch": 9.963898916967509, "grad_norm": 0.6457874774932861, "learning_rate": 9.951926656167714e-05, "loss": 0.0096, "step": 2760 }, { "epoch": 10.0, "grad_norm": 0.759169340133667, "learning_rate": 9.951161198858953e-05, "loss": 0.01, "step": 2770 }, { "epoch": 10.036101083032491, "grad_norm": 1.0353105068206787, "learning_rate": 9.950389725422471e-05, "loss": 0.0112, "step": 2780 }, { "epoch": 10.072202166064981, "grad_norm": 0.7813242673873901, "learning_rate": 9.949612236795682e-05, "loss": 0.0129, "step": 2790 }, { "epoch": 10.108303249097473, "grad_norm": 0.8926829695701599, "learning_rate": 9.948828733923305e-05, "loss": 0.012, "step": 2800 }, { "epoch": 10.144404332129964, "grad_norm": 0.8344612717628479, "learning_rate": 9.948039217757374e-05, "loss": 0.0114, "step": 2810 }, { "epoch": 10.180505415162456, "grad_norm": 0.9931730031967163, "learning_rate": 9.947243689257225e-05, "loss": 0.0103, "step": 2820 }, { "epoch": 10.216606498194945, "grad_norm": 0.8401676416397095, "learning_rate": 9.946442149389497e-05, "loss": 0.0104, "step": 2830 }, { "epoch": 10.252707581227437, "grad_norm": 0.8963848352432251, "learning_rate": 9.945634599128139e-05, "loss": 0.0109, "step": 2840 }, { "epoch": 10.288808664259928, "grad_norm": 0.7665980458259583, "learning_rate": 9.944821039454402e-05, "loss": 0.0098, "step": 2850 }, { "epoch": 10.324909747292418, "grad_norm": 0.7776787281036377, "learning_rate": 9.944001471356835e-05, "loss": 0.0103, "step": 2860 }, { "epoch": 10.36101083032491, "grad_norm": 0.7740691900253296, "learning_rate": 9.94317589583129e-05, "loss": 0.0115, "step": 2870 }, { "epoch": 10.397111913357401, "grad_norm": 0.7140895128250122, "learning_rate": 9.942344313880922e-05, "loss": 0.0097, "step": 2880 }, { "epoch": 10.433212996389893, "grad_norm": 0.6975634098052979, "learning_rate": 9.941506726516179e-05, "loss": 0.0102, "step": 2890 }, { "epoch": 10.469314079422382, "grad_norm": 0.5943614840507507, "learning_rate": 9.94066313475481e-05, "loss": 0.0093, "step": 2900 }, { "epoch": 10.505415162454874, "grad_norm": 0.7553809285163879, "learning_rate": 9.939813539621857e-05, "loss": 0.0098, "step": 2910 }, { "epoch": 10.541516245487365, "grad_norm": 0.6999721527099609, "learning_rate": 9.93895794214966e-05, "loss": 0.009, "step": 2920 }, { "epoch": 10.577617328519855, "grad_norm": 0.681658148765564, "learning_rate": 9.938096343377852e-05, "loss": 0.0102, "step": 2930 }, { "epoch": 10.613718411552346, "grad_norm": 0.7771198153495789, "learning_rate": 9.937228744353353e-05, "loss": 0.0096, "step": 2940 }, { "epoch": 10.649819494584838, "grad_norm": 0.6081448793411255, "learning_rate": 9.936355146130379e-05, "loss": 0.0085, "step": 2950 }, { "epoch": 10.685920577617328, "grad_norm": 0.8011517524719238, "learning_rate": 9.935475549770435e-05, "loss": 0.0096, "step": 2960 }, { "epoch": 10.722021660649819, "grad_norm": 0.7534987926483154, "learning_rate": 9.934589956342315e-05, "loss": 0.0112, "step": 2970 }, { "epoch": 10.75812274368231, "grad_norm": 0.8054004907608032, "learning_rate": 9.933698366922093e-05, "loss": 0.0103, "step": 2980 }, { "epoch": 10.794223826714802, "grad_norm": 0.8305302858352661, "learning_rate": 9.93280078259314e-05, "loss": 0.0094, "step": 2990 }, { "epoch": 10.830324909747292, "grad_norm": 0.8965534567832947, "learning_rate": 9.931897204446104e-05, "loss": 0.0094, "step": 3000 }, { "epoch": 10.866425992779783, "grad_norm": 0.7903763651847839, "learning_rate": 9.930987633578915e-05, "loss": 0.0106, "step": 3010 }, { "epoch": 10.902527075812275, "grad_norm": 0.7398828268051147, "learning_rate": 9.93007207109679e-05, "loss": 0.0105, "step": 3020 }, { "epoch": 10.938628158844764, "grad_norm": 0.7578723430633545, "learning_rate": 9.929150518112224e-05, "loss": 0.0096, "step": 3030 }, { "epoch": 10.974729241877256, "grad_norm": 0.8357623815536499, "learning_rate": 9.928222975744991e-05, "loss": 0.0106, "step": 3040 }, { "epoch": 11.010830324909747, "grad_norm": 0.689484179019928, "learning_rate": 9.92728944512214e-05, "loss": 0.0098, "step": 3050 }, { "epoch": 11.046931407942239, "grad_norm": 0.8133997321128845, "learning_rate": 9.926349927378001e-05, "loss": 0.0107, "step": 3060 }, { "epoch": 11.083032490974729, "grad_norm": 0.6470313668251038, "learning_rate": 9.925404423654174e-05, "loss": 0.0106, "step": 3070 }, { "epoch": 11.11913357400722, "grad_norm": 0.6673492789268494, "learning_rate": 9.924452935099537e-05, "loss": 0.0084, "step": 3080 }, { "epoch": 11.155234657039712, "grad_norm": 0.6224140524864197, "learning_rate": 9.92349546287024e-05, "loss": 0.0081, "step": 3090 }, { "epoch": 11.191335740072201, "grad_norm": 0.6376920938491821, "learning_rate": 9.9225320081297e-05, "loss": 0.0094, "step": 3100 }, { "epoch": 11.227436823104693, "grad_norm": 0.7415140271186829, "learning_rate": 9.921562572048606e-05, "loss": 0.0096, "step": 3110 }, { "epoch": 11.263537906137184, "grad_norm": 0.5629510283470154, "learning_rate": 9.920587155804913e-05, "loss": 0.0089, "step": 3120 }, { "epoch": 11.299638989169676, "grad_norm": 0.6213931441307068, "learning_rate": 9.919605760583845e-05, "loss": 0.0084, "step": 3130 }, { "epoch": 11.335740072202166, "grad_norm": 0.6785014867782593, "learning_rate": 9.91861838757789e-05, "loss": 0.0085, "step": 3140 }, { "epoch": 11.371841155234657, "grad_norm": 0.6595868468284607, "learning_rate": 9.917625037986798e-05, "loss": 0.0093, "step": 3150 }, { "epoch": 11.407942238267148, "grad_norm": 0.719587504863739, "learning_rate": 9.916625713017583e-05, "loss": 0.0087, "step": 3160 }, { "epoch": 11.444043321299638, "grad_norm": 0.8673713207244873, "learning_rate": 9.915620413884519e-05, "loss": 0.0091, "step": 3170 }, { "epoch": 11.48014440433213, "grad_norm": 0.8191733956336975, "learning_rate": 9.914609141809139e-05, "loss": 0.0105, "step": 3180 }, { "epoch": 11.516245487364621, "grad_norm": 0.723475456237793, "learning_rate": 9.913591898020235e-05, "loss": 0.0092, "step": 3190 }, { "epoch": 11.552346570397113, "grad_norm": 0.6822453737258911, "learning_rate": 9.912568683753853e-05, "loss": 0.0106, "step": 3200 }, { "epoch": 11.588447653429602, "grad_norm": 0.606898844242096, "learning_rate": 9.911539500253295e-05, "loss": 0.0102, "step": 3210 }, { "epoch": 11.624548736462094, "grad_norm": 0.5995931029319763, "learning_rate": 9.910504348769118e-05, "loss": 0.0091, "step": 3220 }, { "epoch": 11.660649819494585, "grad_norm": 0.6429259777069092, "learning_rate": 9.909463230559127e-05, "loss": 0.0088, "step": 3230 }, { "epoch": 11.696750902527075, "grad_norm": 0.5485654473304749, "learning_rate": 9.908416146888376e-05, "loss": 0.0081, "step": 3240 }, { "epoch": 11.732851985559567, "grad_norm": 0.6281844973564148, "learning_rate": 9.907363099029175e-05, "loss": 0.01, "step": 3250 }, { "epoch": 11.768953068592058, "grad_norm": 0.5267898440361023, "learning_rate": 9.906304088261073e-05, "loss": 0.0095, "step": 3260 }, { "epoch": 11.80505415162455, "grad_norm": 0.7603709697723389, "learning_rate": 9.905239115870872e-05, "loss": 0.0079, "step": 3270 }, { "epoch": 11.84115523465704, "grad_norm": 0.7378809452056885, "learning_rate": 9.90416818315261e-05, "loss": 0.0094, "step": 3280 }, { "epoch": 11.87725631768953, "grad_norm": 0.7573925852775574, "learning_rate": 9.903091291407573e-05, "loss": 0.0086, "step": 3290 }, { "epoch": 11.913357400722022, "grad_norm": 0.6064549088478088, "learning_rate": 9.902008441944286e-05, "loss": 0.0096, "step": 3300 }, { "epoch": 11.949458483754512, "grad_norm": 0.6390209794044495, "learning_rate": 9.900919636078512e-05, "loss": 0.0104, "step": 3310 }, { "epoch": 11.985559566787003, "grad_norm": 0.8857858180999756, "learning_rate": 9.899824875133255e-05, "loss": 0.0108, "step": 3320 }, { "epoch": 12.021660649819495, "grad_norm": 0.6796932220458984, "learning_rate": 9.898724160438749e-05, "loss": 0.0099, "step": 3330 }, { "epoch": 12.057761732851986, "grad_norm": 0.7772940397262573, "learning_rate": 9.89761749333247e-05, "loss": 0.0092, "step": 3340 }, { "epoch": 12.093862815884476, "grad_norm": 0.6416860818862915, "learning_rate": 9.896504875159122e-05, "loss": 0.0088, "step": 3350 }, { "epoch": 12.129963898916968, "grad_norm": 0.6209238767623901, "learning_rate": 9.89538630727064e-05, "loss": 0.01, "step": 3360 }, { "epoch": 12.166064981949459, "grad_norm": 0.7372642159461975, "learning_rate": 9.894261791026189e-05, "loss": 0.0089, "step": 3370 }, { "epoch": 12.202166064981949, "grad_norm": 0.5251403450965881, "learning_rate": 9.893131327792165e-05, "loss": 0.0105, "step": 3380 }, { "epoch": 12.23826714801444, "grad_norm": 0.5557438135147095, "learning_rate": 9.891994918942182e-05, "loss": 0.0101, "step": 3390 }, { "epoch": 12.274368231046932, "grad_norm": 0.6687502861022949, "learning_rate": 9.890852565857092e-05, "loss": 0.0091, "step": 3400 }, { "epoch": 12.310469314079423, "grad_norm": 0.6372770667076111, "learning_rate": 9.889704269924954e-05, "loss": 0.0099, "step": 3410 }, { "epoch": 12.346570397111913, "grad_norm": 0.6314915418624878, "learning_rate": 9.888550032541059e-05, "loss": 0.0092, "step": 3420 }, { "epoch": 12.382671480144404, "grad_norm": 0.7343876957893372, "learning_rate": 9.887389855107916e-05, "loss": 0.0088, "step": 3430 }, { "epoch": 12.418772563176896, "grad_norm": 0.8022611141204834, "learning_rate": 9.886223739035248e-05, "loss": 0.0098, "step": 3440 }, { "epoch": 12.454873646209386, "grad_norm": 0.6204238533973694, "learning_rate": 9.885051685739997e-05, "loss": 0.0089, "step": 3450 }, { "epoch": 12.490974729241877, "grad_norm": 0.5853930115699768, "learning_rate": 9.883873696646316e-05, "loss": 0.0099, "step": 3460 }, { "epoch": 12.527075812274369, "grad_norm": 0.6876018047332764, "learning_rate": 9.882689773185575e-05, "loss": 0.0087, "step": 3470 }, { "epoch": 12.56317689530686, "grad_norm": 0.6854273080825806, "learning_rate": 9.881499916796353e-05, "loss": 0.0093, "step": 3480 }, { "epoch": 12.59927797833935, "grad_norm": 0.6009523272514343, "learning_rate": 9.880304128924434e-05, "loss": 0.0091, "step": 3490 }, { "epoch": 12.635379061371841, "grad_norm": 0.5833231806755066, "learning_rate": 9.879102411022817e-05, "loss": 0.0076, "step": 3500 }, { "epoch": 12.671480144404333, "grad_norm": 0.7677366733551025, "learning_rate": 9.877894764551703e-05, "loss": 0.0086, "step": 3510 }, { "epoch": 12.707581227436823, "grad_norm": 0.6318122148513794, "learning_rate": 9.876681190978494e-05, "loss": 0.0081, "step": 3520 }, { "epoch": 12.743682310469314, "grad_norm": 0.6317865252494812, "learning_rate": 9.875461691777797e-05, "loss": 0.0074, "step": 3530 }, { "epoch": 12.779783393501805, "grad_norm": 0.6839991807937622, "learning_rate": 9.874236268431417e-05, "loss": 0.01, "step": 3540 }, { "epoch": 12.815884476534297, "grad_norm": 0.7005046606063843, "learning_rate": 9.873004922428361e-05, "loss": 0.0084, "step": 3550 }, { "epoch": 12.851985559566787, "grad_norm": 0.6627283096313477, "learning_rate": 9.871767655264829e-05, "loss": 0.0086, "step": 3560 }, { "epoch": 12.888086642599278, "grad_norm": 0.7797370553016663, "learning_rate": 9.87052446844422e-05, "loss": 0.0085, "step": 3570 }, { "epoch": 12.92418772563177, "grad_norm": 0.5306588411331177, "learning_rate": 9.869275363477122e-05, "loss": 0.0084, "step": 3580 }, { "epoch": 12.96028880866426, "grad_norm": 0.7744714021682739, "learning_rate": 9.868020341881312e-05, "loss": 0.0097, "step": 3590 }, { "epoch": 12.99638989169675, "grad_norm": 0.6790797710418701, "learning_rate": 9.866759405181765e-05, "loss": 0.0089, "step": 3600 }, { "epoch": 13.032490974729242, "grad_norm": 0.7086886763572693, "learning_rate": 9.865492554910633e-05, "loss": 0.008, "step": 3610 }, { "epoch": 13.068592057761732, "grad_norm": 0.6534098386764526, "learning_rate": 9.864219792607262e-05, "loss": 0.0104, "step": 3620 }, { "epoch": 13.104693140794224, "grad_norm": 0.6140812635421753, "learning_rate": 9.862941119818177e-05, "loss": 0.0073, "step": 3630 }, { "epoch": 13.140794223826715, "grad_norm": 0.6613591909408569, "learning_rate": 9.861656538097086e-05, "loss": 0.009, "step": 3640 }, { "epoch": 13.176895306859207, "grad_norm": 0.7520333528518677, "learning_rate": 9.860366049004877e-05, "loss": 0.0082, "step": 3650 }, { "epoch": 13.212996389891696, "grad_norm": 0.7076119184494019, "learning_rate": 9.859069654109615e-05, "loss": 0.0093, "step": 3660 }, { "epoch": 13.249097472924188, "grad_norm": 0.691775918006897, "learning_rate": 9.857767354986545e-05, "loss": 0.0075, "step": 3670 }, { "epoch": 13.28519855595668, "grad_norm": 0.9084518551826477, "learning_rate": 9.856459153218078e-05, "loss": 0.0108, "step": 3680 }, { "epoch": 13.321299638989169, "grad_norm": 0.7507837414741516, "learning_rate": 9.855145050393808e-05, "loss": 0.0107, "step": 3690 }, { "epoch": 13.35740072202166, "grad_norm": 0.7127672433853149, "learning_rate": 9.85382504811049e-05, "loss": 0.0088, "step": 3700 }, { "epoch": 13.393501805054152, "grad_norm": 0.7873700857162476, "learning_rate": 9.852499147972054e-05, "loss": 0.0081, "step": 3710 }, { "epoch": 13.429602888086643, "grad_norm": 0.8309053778648376, "learning_rate": 9.851167351589592e-05, "loss": 0.0081, "step": 3720 }, { "epoch": 13.465703971119133, "grad_norm": 0.8734296560287476, "learning_rate": 9.849829660581363e-05, "loss": 0.009, "step": 3730 }, { "epoch": 13.501805054151625, "grad_norm": 0.7617871165275574, "learning_rate": 9.848486076572787e-05, "loss": 0.0085, "step": 3740 }, { "epoch": 13.537906137184116, "grad_norm": 0.4951501786708832, "learning_rate": 9.847136601196446e-05, "loss": 0.0076, "step": 3750 }, { "epoch": 13.574007220216606, "grad_norm": 0.7322642803192139, "learning_rate": 9.845781236092078e-05, "loss": 0.0082, "step": 3760 }, { "epoch": 13.610108303249097, "grad_norm": 0.616602897644043, "learning_rate": 9.844419982906583e-05, "loss": 0.008, "step": 3770 }, { "epoch": 13.646209386281589, "grad_norm": 0.6850390434265137, "learning_rate": 9.843052843294008e-05, "loss": 0.0094, "step": 3780 }, { "epoch": 13.68231046931408, "grad_norm": 0.6563327312469482, "learning_rate": 9.841679818915559e-05, "loss": 0.0077, "step": 3790 }, { "epoch": 13.71841155234657, "grad_norm": 0.6361055970191956, "learning_rate": 9.840300911439591e-05, "loss": 0.0084, "step": 3800 }, { "epoch": 13.754512635379061, "grad_norm": 0.6403736472129822, "learning_rate": 9.838916122541603e-05, "loss": 0.0077, "step": 3810 }, { "epoch": 13.790613718411553, "grad_norm": 0.5514874458312988, "learning_rate": 9.837525453904246e-05, "loss": 0.0089, "step": 3820 }, { "epoch": 13.826714801444043, "grad_norm": 0.531114935874939, "learning_rate": 9.836128907217314e-05, "loss": 0.0098, "step": 3830 }, { "epoch": 13.862815884476534, "grad_norm": 0.72089684009552, "learning_rate": 9.834726484177743e-05, "loss": 0.0084, "step": 3840 }, { "epoch": 13.898916967509026, "grad_norm": 0.7317929267883301, "learning_rate": 9.833318186489609e-05, "loss": 0.0095, "step": 3850 }, { "epoch": 13.935018050541515, "grad_norm": 0.6702656745910645, "learning_rate": 9.831904015864126e-05, "loss": 0.0085, "step": 3860 }, { "epoch": 13.971119133574007, "grad_norm": 0.5811823010444641, "learning_rate": 9.830483974019645e-05, "loss": 0.0081, "step": 3870 }, { "epoch": 14.007220216606498, "grad_norm": 0.5628147125244141, "learning_rate": 9.82905806268165e-05, "loss": 0.0082, "step": 3880 }, { "epoch": 14.04332129963899, "grad_norm": 0.6289241909980774, "learning_rate": 9.82762628358276e-05, "loss": 0.0098, "step": 3890 }, { "epoch": 14.07942238267148, "grad_norm": 0.567337691783905, "learning_rate": 9.826188638462718e-05, "loss": 0.0074, "step": 3900 }, { "epoch": 14.115523465703971, "grad_norm": 0.632425844669342, "learning_rate": 9.824745129068402e-05, "loss": 0.007, "step": 3910 }, { "epoch": 14.151624548736462, "grad_norm": 0.6318320035934448, "learning_rate": 9.82329575715381e-05, "loss": 0.0086, "step": 3920 }, { "epoch": 14.187725631768952, "grad_norm": 0.6592440605163574, "learning_rate": 9.821840524480066e-05, "loss": 0.0082, "step": 3930 }, { "epoch": 14.223826714801444, "grad_norm": 0.7359952926635742, "learning_rate": 9.820379432815414e-05, "loss": 0.0079, "step": 3940 }, { "epoch": 14.259927797833935, "grad_norm": 0.49768030643463135, "learning_rate": 9.81891248393522e-05, "loss": 0.0088, "step": 3950 }, { "epoch": 14.296028880866427, "grad_norm": 0.7085164785385132, "learning_rate": 9.817439679621963e-05, "loss": 0.0082, "step": 3960 }, { "epoch": 14.332129963898916, "grad_norm": 0.5585772395133972, "learning_rate": 9.815961021665243e-05, "loss": 0.0087, "step": 3970 }, { "epoch": 14.368231046931408, "grad_norm": 0.6714861392974854, "learning_rate": 9.814476511861763e-05, "loss": 0.0088, "step": 3980 }, { "epoch": 14.4043321299639, "grad_norm": 0.6995107531547546, "learning_rate": 9.812986152015348e-05, "loss": 0.01, "step": 3990 }, { "epoch": 14.440433212996389, "grad_norm": 0.7201774716377258, "learning_rate": 9.811489943936922e-05, "loss": 0.0078, "step": 4000 }, { "epoch": 14.47653429602888, "grad_norm": 0.7069374322891235, "learning_rate": 9.809987889444522e-05, "loss": 0.007, "step": 4010 }, { "epoch": 14.512635379061372, "grad_norm": 0.45528268814086914, "learning_rate": 9.808479990363282e-05, "loss": 0.0081, "step": 4020 }, { "epoch": 14.548736462093864, "grad_norm": 0.6015822887420654, "learning_rate": 9.806966248525445e-05, "loss": 0.0081, "step": 4030 }, { "epoch": 14.584837545126353, "grad_norm": 0.7270722389221191, "learning_rate": 9.805446665770348e-05, "loss": 0.0084, "step": 4040 }, { "epoch": 14.620938628158845, "grad_norm": 0.726533055305481, "learning_rate": 9.803921243944429e-05, "loss": 0.009, "step": 4050 }, { "epoch": 14.657039711191336, "grad_norm": 0.5868228673934937, "learning_rate": 9.802389984901218e-05, "loss": 0.0076, "step": 4060 }, { "epoch": 14.693140794223826, "grad_norm": 0.5780026316642761, "learning_rate": 9.80085289050134e-05, "loss": 0.0074, "step": 4070 }, { "epoch": 14.729241877256317, "grad_norm": 0.5615423917770386, "learning_rate": 9.799309962612508e-05, "loss": 0.0084, "step": 4080 }, { "epoch": 14.765342960288809, "grad_norm": 0.5624262690544128, "learning_rate": 9.797761203109527e-05, "loss": 0.0082, "step": 4090 }, { "epoch": 14.8014440433213, "grad_norm": 0.5993496179580688, "learning_rate": 9.796206613874283e-05, "loss": 0.0082, "step": 4100 }, { "epoch": 14.83754512635379, "grad_norm": 0.49344608187675476, "learning_rate": 9.794646196795754e-05, "loss": 0.007, "step": 4110 }, { "epoch": 14.873646209386282, "grad_norm": 0.5589749813079834, "learning_rate": 9.793079953769987e-05, "loss": 0.0074, "step": 4120 }, { "epoch": 14.909747292418773, "grad_norm": 0.5978914499282837, "learning_rate": 9.79150788670012e-05, "loss": 0.0077, "step": 4130 }, { "epoch": 14.945848375451263, "grad_norm": 0.7583843469619751, "learning_rate": 9.78992999749636e-05, "loss": 0.0076, "step": 4140 }, { "epoch": 14.981949458483754, "grad_norm": 0.704357922077179, "learning_rate": 9.788346288075994e-05, "loss": 0.0079, "step": 4150 }, { "epoch": 15.018050541516246, "grad_norm": 0.664689838886261, "learning_rate": 9.786756760363373e-05, "loss": 0.0079, "step": 4160 }, { "epoch": 15.054151624548737, "grad_norm": 0.5933771133422852, "learning_rate": 9.78516141628993e-05, "loss": 0.0078, "step": 4170 }, { "epoch": 15.090252707581227, "grad_norm": 0.6095234155654907, "learning_rate": 9.783560257794154e-05, "loss": 0.0089, "step": 4180 }, { "epoch": 15.126353790613718, "grad_norm": 0.6241867542266846, "learning_rate": 9.781953286821603e-05, "loss": 0.0068, "step": 4190 }, { "epoch": 15.16245487364621, "grad_norm": 0.6755119562149048, "learning_rate": 9.780340505324901e-05, "loss": 0.0082, "step": 4200 }, { "epoch": 15.1985559566787, "grad_norm": 0.43262916803359985, "learning_rate": 9.778721915263727e-05, "loss": 0.0078, "step": 4210 }, { "epoch": 15.234657039711191, "grad_norm": 0.49696260690689087, "learning_rate": 9.777097518604824e-05, "loss": 0.0072, "step": 4220 }, { "epoch": 15.270758122743683, "grad_norm": 0.5388179421424866, "learning_rate": 9.775467317321984e-05, "loss": 0.0091, "step": 4230 }, { "epoch": 15.306859205776174, "grad_norm": 0.5314399003982544, "learning_rate": 9.773831313396055e-05, "loss": 0.0073, "step": 4240 }, { "epoch": 15.342960288808664, "grad_norm": 0.49604231119155884, "learning_rate": 9.77218950881494e-05, "loss": 0.0089, "step": 4250 }, { "epoch": 15.379061371841155, "grad_norm": 0.5194151401519775, "learning_rate": 9.770541905573583e-05, "loss": 0.007, "step": 4260 }, { "epoch": 15.415162454873647, "grad_norm": 0.6234323382377625, "learning_rate": 9.768888505673976e-05, "loss": 0.0077, "step": 4270 }, { "epoch": 15.451263537906136, "grad_norm": 0.7056440711021423, "learning_rate": 9.767229311125162e-05, "loss": 0.0077, "step": 4280 }, { "epoch": 15.487364620938628, "grad_norm": 0.6590222716331482, "learning_rate": 9.765564323943211e-05, "loss": 0.0093, "step": 4290 }, { "epoch": 15.52346570397112, "grad_norm": 0.5882553458213806, "learning_rate": 9.763893546151244e-05, "loss": 0.0075, "step": 4300 }, { "epoch": 15.559566787003611, "grad_norm": 0.6169595122337341, "learning_rate": 9.762216979779412e-05, "loss": 0.0079, "step": 4310 }, { "epoch": 15.5956678700361, "grad_norm": 0.4988597631454468, "learning_rate": 9.760534626864902e-05, "loss": 0.0087, "step": 4320 }, { "epoch": 15.631768953068592, "grad_norm": 0.46784234046936035, "learning_rate": 9.758846489451931e-05, "loss": 0.0082, "step": 4330 }, { "epoch": 15.667870036101084, "grad_norm": 0.46982425451278687, "learning_rate": 9.757152569591748e-05, "loss": 0.0085, "step": 4340 }, { "epoch": 15.703971119133573, "grad_norm": 0.4954351782798767, "learning_rate": 9.75545286934262e-05, "loss": 0.0063, "step": 4350 }, { "epoch": 15.740072202166065, "grad_norm": 0.581403374671936, "learning_rate": 9.753747390769847e-05, "loss": 0.0082, "step": 4360 }, { "epoch": 15.776173285198556, "grad_norm": 0.5989372730255127, "learning_rate": 9.752036135945744e-05, "loss": 0.0076, "step": 4370 }, { "epoch": 15.812274368231048, "grad_norm": 0.6102561354637146, "learning_rate": 9.750319106949649e-05, "loss": 0.0064, "step": 4380 }, { "epoch": 15.848375451263538, "grad_norm": 0.592056930065155, "learning_rate": 9.748596305867913e-05, "loss": 0.009, "step": 4390 }, { "epoch": 15.884476534296029, "grad_norm": 0.5222209692001343, "learning_rate": 9.746867734793903e-05, "loss": 0.0068, "step": 4400 }, { "epoch": 15.92057761732852, "grad_norm": 0.4383890628814697, "learning_rate": 9.745133395827993e-05, "loss": 0.0074, "step": 4410 }, { "epoch": 15.95667870036101, "grad_norm": 0.6366679072380066, "learning_rate": 9.743393291077572e-05, "loss": 0.0081, "step": 4420 }, { "epoch": 15.992779783393502, "grad_norm": 0.5929124355316162, "learning_rate": 9.741647422657028e-05, "loss": 0.0082, "step": 4430 }, { "epoch": 16.028880866425993, "grad_norm": 0.5620942711830139, "learning_rate": 9.739895792687758e-05, "loss": 0.0078, "step": 4440 }, { "epoch": 16.064981949458485, "grad_norm": 0.5677127242088318, "learning_rate": 9.738138403298157e-05, "loss": 0.0087, "step": 4450 }, { "epoch": 16.101083032490976, "grad_norm": 0.5328949093818665, "learning_rate": 9.736375256623619e-05, "loss": 0.0075, "step": 4460 }, { "epoch": 16.137184115523464, "grad_norm": 0.5947726368904114, "learning_rate": 9.734606354806533e-05, "loss": 0.0088, "step": 4470 }, { "epoch": 16.173285198555956, "grad_norm": 0.6008957624435425, "learning_rate": 9.73283169999628e-05, "loss": 0.0079, "step": 4480 }, { "epoch": 16.209386281588447, "grad_norm": 0.6824911832809448, "learning_rate": 9.731051294349238e-05, "loss": 0.0083, "step": 4490 }, { "epoch": 16.24548736462094, "grad_norm": 0.6403281688690186, "learning_rate": 9.729265140028762e-05, "loss": 0.009, "step": 4500 }, { "epoch": 16.28158844765343, "grad_norm": 0.5412993431091309, "learning_rate": 9.727473239205201e-05, "loss": 0.0095, "step": 4510 }, { "epoch": 16.31768953068592, "grad_norm": 0.6415403485298157, "learning_rate": 9.725675594055883e-05, "loss": 0.0082, "step": 4520 }, { "epoch": 16.353790613718413, "grad_norm": 0.6148087978363037, "learning_rate": 9.723872206765116e-05, "loss": 0.0075, "step": 4530 }, { "epoch": 16.3898916967509, "grad_norm": 0.6932973265647888, "learning_rate": 9.722063079524185e-05, "loss": 0.0078, "step": 4540 }, { "epoch": 16.425992779783392, "grad_norm": 0.530030369758606, "learning_rate": 9.720248214531351e-05, "loss": 0.0074, "step": 4550 }, { "epoch": 16.462093862815884, "grad_norm": 0.6194853186607361, "learning_rate": 9.718427613991848e-05, "loss": 0.0073, "step": 4560 }, { "epoch": 16.498194945848375, "grad_norm": 0.5014938116073608, "learning_rate": 9.716601280117873e-05, "loss": 0.0068, "step": 4570 }, { "epoch": 16.534296028880867, "grad_norm": 0.5091082453727722, "learning_rate": 9.714769215128596e-05, "loss": 0.0074, "step": 4580 }, { "epoch": 16.57039711191336, "grad_norm": 0.5059235692024231, "learning_rate": 9.712931421250152e-05, "loss": 0.0073, "step": 4590 }, { "epoch": 16.60649819494585, "grad_norm": 0.6493478417396545, "learning_rate": 9.711087900715627e-05, "loss": 0.0083, "step": 4600 }, { "epoch": 16.642599277978338, "grad_norm": 0.6570061445236206, "learning_rate": 9.709238655765078e-05, "loss": 0.0084, "step": 4610 }, { "epoch": 16.67870036101083, "grad_norm": 0.4971005320549011, "learning_rate": 9.707383688645511e-05, "loss": 0.0072, "step": 4620 }, { "epoch": 16.71480144404332, "grad_norm": 0.6760922074317932, "learning_rate": 9.705523001610883e-05, "loss": 0.0085, "step": 4630 }, { "epoch": 16.750902527075812, "grad_norm": 0.5637115836143494, "learning_rate": 9.703656596922107e-05, "loss": 0.0096, "step": 4640 }, { "epoch": 16.787003610108304, "grad_norm": 0.5399641990661621, "learning_rate": 9.70178447684704e-05, "loss": 0.0078, "step": 4650 }, { "epoch": 16.823104693140795, "grad_norm": 0.4492916166782379, "learning_rate": 9.699906643660483e-05, "loss": 0.0067, "step": 4660 }, { "epoch": 16.859205776173287, "grad_norm": 0.5601198077201843, "learning_rate": 9.698023099644185e-05, "loss": 0.009, "step": 4670 }, { "epoch": 16.895306859205775, "grad_norm": 0.5658851861953735, "learning_rate": 9.696133847086823e-05, "loss": 0.008, "step": 4680 }, { "epoch": 16.931407942238266, "grad_norm": 0.4090757966041565, "learning_rate": 9.694238888284022e-05, "loss": 0.0065, "step": 4690 }, { "epoch": 16.967509025270758, "grad_norm": 0.43657976388931274, "learning_rate": 9.692338225538333e-05, "loss": 0.0075, "step": 4700 }, { "epoch": 17.00361010830325, "grad_norm": 0.6505222916603088, "learning_rate": 9.690431861159241e-05, "loss": 0.007, "step": 4710 }, { "epoch": 17.03971119133574, "grad_norm": 0.6438521146774292, "learning_rate": 9.688519797463161e-05, "loss": 0.0069, "step": 4720 }, { "epoch": 17.075812274368232, "grad_norm": 0.5986621975898743, "learning_rate": 9.686602036773426e-05, "loss": 0.0077, "step": 4730 }, { "epoch": 17.111913357400724, "grad_norm": 0.6103493571281433, "learning_rate": 9.684678581420302e-05, "loss": 0.0069, "step": 4740 }, { "epoch": 17.14801444043321, "grad_norm": 0.4867769777774811, "learning_rate": 9.682749433740962e-05, "loss": 0.0059, "step": 4750 }, { "epoch": 17.184115523465703, "grad_norm": 0.5553053617477417, "learning_rate": 9.680814596079507e-05, "loss": 0.0074, "step": 4760 }, { "epoch": 17.220216606498195, "grad_norm": 0.4939308762550354, "learning_rate": 9.678874070786945e-05, "loss": 0.0067, "step": 4770 }, { "epoch": 17.256317689530686, "grad_norm": 0.5850286483764648, "learning_rate": 9.676927860221199e-05, "loss": 0.0063, "step": 4780 }, { "epoch": 17.292418772563177, "grad_norm": 0.5425153374671936, "learning_rate": 9.674975966747097e-05, "loss": 0.0075, "step": 4790 }, { "epoch": 17.32851985559567, "grad_norm": 0.5524667501449585, "learning_rate": 9.673018392736374e-05, "loss": 0.0071, "step": 4800 }, { "epoch": 17.36462093862816, "grad_norm": 0.4978862702846527, "learning_rate": 9.671055140567667e-05, "loss": 0.0068, "step": 4810 }, { "epoch": 17.40072202166065, "grad_norm": 0.5207685232162476, "learning_rate": 9.669086212626511e-05, "loss": 0.0072, "step": 4820 }, { "epoch": 17.43682310469314, "grad_norm": 0.5226748585700989, "learning_rate": 9.667111611305341e-05, "loss": 0.0064, "step": 4830 }, { "epoch": 17.47292418772563, "grad_norm": 0.4616841673851013, "learning_rate": 9.665131339003486e-05, "loss": 0.0072, "step": 4840 }, { "epoch": 17.509025270758123, "grad_norm": 0.6443553566932678, "learning_rate": 9.663145398127158e-05, "loss": 0.0066, "step": 4850 }, { "epoch": 17.545126353790614, "grad_norm": 0.5076514482498169, "learning_rate": 9.661153791089467e-05, "loss": 0.0073, "step": 4860 }, { "epoch": 17.581227436823106, "grad_norm": 0.5068714618682861, "learning_rate": 9.659156520310402e-05, "loss": 0.0074, "step": 4870 }, { "epoch": 17.617328519855597, "grad_norm": 0.6643645167350769, "learning_rate": 9.657153588216835e-05, "loss": 0.0075, "step": 4880 }, { "epoch": 17.653429602888085, "grad_norm": 0.6366770267486572, "learning_rate": 9.655144997242516e-05, "loss": 0.0087, "step": 4890 }, { "epoch": 17.689530685920577, "grad_norm": 0.44640955328941345, "learning_rate": 9.653130749828075e-05, "loss": 0.0075, "step": 4900 }, { "epoch": 17.72563176895307, "grad_norm": 0.45529672503471375, "learning_rate": 9.65111084842101e-05, "loss": 0.0057, "step": 4910 }, { "epoch": 17.76173285198556, "grad_norm": 0.4766358435153961, "learning_rate": 9.649085295475695e-05, "loss": 0.0066, "step": 4920 }, { "epoch": 17.79783393501805, "grad_norm": 0.5507829189300537, "learning_rate": 9.647054093453365e-05, "loss": 0.008, "step": 4930 }, { "epoch": 17.833935018050543, "grad_norm": 0.5131349563598633, "learning_rate": 9.645017244822123e-05, "loss": 0.0078, "step": 4940 }, { "epoch": 17.870036101083034, "grad_norm": 0.45431241393089294, "learning_rate": 9.642974752056931e-05, "loss": 0.008, "step": 4950 }, { "epoch": 17.906137184115522, "grad_norm": 0.5684696435928345, "learning_rate": 9.640926617639613e-05, "loss": 0.0082, "step": 4960 }, { "epoch": 17.942238267148014, "grad_norm": 0.5622549057006836, "learning_rate": 9.638872844058843e-05, "loss": 0.0082, "step": 4970 }, { "epoch": 17.978339350180505, "grad_norm": 0.5874543786048889, "learning_rate": 9.63681343381015e-05, "loss": 0.0069, "step": 4980 }, { "epoch": 18.014440433212997, "grad_norm": 0.4115467071533203, "learning_rate": 9.634748389395914e-05, "loss": 0.0065, "step": 4990 }, { "epoch": 18.050541516245488, "grad_norm": 0.4604302942752838, "learning_rate": 9.632677713325353e-05, "loss": 0.0067, "step": 5000 }, { "epoch": 18.08664259927798, "grad_norm": 0.4758187532424927, "learning_rate": 9.63060140811454e-05, "loss": 0.0065, "step": 5010 }, { "epoch": 18.12274368231047, "grad_norm": 0.47490420937538147, "learning_rate": 9.628519476286379e-05, "loss": 0.0066, "step": 5020 }, { "epoch": 18.15884476534296, "grad_norm": 0.5891820192337036, "learning_rate": 9.626431920370612e-05, "loss": 0.0062, "step": 5030 }, { "epoch": 18.19494584837545, "grad_norm": 0.5240359306335449, "learning_rate": 9.624338742903819e-05, "loss": 0.0076, "step": 5040 }, { "epoch": 18.231046931407942, "grad_norm": 0.516766369342804, "learning_rate": 9.622239946429406e-05, "loss": 0.0082, "step": 5050 }, { "epoch": 18.267148014440433, "grad_norm": 0.5181319117546082, "learning_rate": 9.620135533497609e-05, "loss": 0.0069, "step": 5060 }, { "epoch": 18.303249097472925, "grad_norm": 0.5097251534461975, "learning_rate": 9.61802550666549e-05, "loss": 0.006, "step": 5070 }, { "epoch": 18.339350180505416, "grad_norm": 0.49024176597595215, "learning_rate": 9.615909868496928e-05, "loss": 0.006, "step": 5080 }, { "epoch": 18.375451263537904, "grad_norm": 0.5576670169830322, "learning_rate": 9.613788621562622e-05, "loss": 0.008, "step": 5090 }, { "epoch": 18.411552346570396, "grad_norm": 0.5302563905715942, "learning_rate": 9.611661768440093e-05, "loss": 0.0059, "step": 5100 }, { "epoch": 18.447653429602887, "grad_norm": 0.6087308526039124, "learning_rate": 9.609529311713661e-05, "loss": 0.0075, "step": 5110 }, { "epoch": 18.48375451263538, "grad_norm": 0.4414571523666382, "learning_rate": 9.607391253974466e-05, "loss": 0.0066, "step": 5120 }, { "epoch": 18.51985559566787, "grad_norm": 0.46488794684410095, "learning_rate": 9.605247597820448e-05, "loss": 0.0069, "step": 5130 }, { "epoch": 18.555956678700362, "grad_norm": 0.5927522778511047, "learning_rate": 9.603098345856354e-05, "loss": 0.0071, "step": 5140 }, { "epoch": 18.592057761732853, "grad_norm": 0.5715414881706238, "learning_rate": 9.600943500693725e-05, "loss": 0.0076, "step": 5150 }, { "epoch": 18.628158844765345, "grad_norm": 0.4302375912666321, "learning_rate": 9.598783064950902e-05, "loss": 0.0061, "step": 5160 }, { "epoch": 18.664259927797833, "grad_norm": 0.5064087510108948, "learning_rate": 9.596617041253018e-05, "loss": 0.0071, "step": 5170 }, { "epoch": 18.700361010830324, "grad_norm": 0.5059252977371216, "learning_rate": 9.594445432231996e-05, "loss": 0.0065, "step": 5180 }, { "epoch": 18.736462093862816, "grad_norm": 0.575046718120575, "learning_rate": 9.592268240526547e-05, "loss": 0.0061, "step": 5190 }, { "epoch": 18.772563176895307, "grad_norm": 0.555399477481842, "learning_rate": 9.590085468782162e-05, "loss": 0.0078, "step": 5200 }, { "epoch": 18.8086642599278, "grad_norm": 0.4235028624534607, "learning_rate": 9.587897119651116e-05, "loss": 0.007, "step": 5210 }, { "epoch": 18.84476534296029, "grad_norm": 0.531582236289978, "learning_rate": 9.585703195792459e-05, "loss": 0.0061, "step": 5220 }, { "epoch": 18.880866425992778, "grad_norm": 0.3979736268520355, "learning_rate": 9.583503699872016e-05, "loss": 0.0069, "step": 5230 }, { "epoch": 18.91696750902527, "grad_norm": 0.4949239194393158, "learning_rate": 9.581298634562381e-05, "loss": 0.0067, "step": 5240 }, { "epoch": 18.95306859205776, "grad_norm": 0.3901923894882202, "learning_rate": 9.579088002542917e-05, "loss": 0.0084, "step": 5250 }, { "epoch": 18.989169675090253, "grad_norm": 0.5102773904800415, "learning_rate": 9.57687180649975e-05, "loss": 0.0057, "step": 5260 }, { "epoch": 19.025270758122744, "grad_norm": 0.48473867774009705, "learning_rate": 9.574650049125768e-05, "loss": 0.0079, "step": 5270 }, { "epoch": 19.061371841155236, "grad_norm": 0.516612708568573, "learning_rate": 9.572422733120614e-05, "loss": 0.0066, "step": 5280 }, { "epoch": 19.097472924187727, "grad_norm": 0.46928471326828003, "learning_rate": 9.570189861190689e-05, "loss": 0.0067, "step": 5290 }, { "epoch": 19.133574007220215, "grad_norm": 0.4073295295238495, "learning_rate": 9.56795143604914e-05, "loss": 0.0064, "step": 5300 }, { "epoch": 19.169675090252706, "grad_norm": 0.42316439747810364, "learning_rate": 9.565707460415869e-05, "loss": 0.0063, "step": 5310 }, { "epoch": 19.205776173285198, "grad_norm": 0.39684924483299255, "learning_rate": 9.563457937017515e-05, "loss": 0.0077, "step": 5320 }, { "epoch": 19.24187725631769, "grad_norm": 0.5032602548599243, "learning_rate": 9.56120286858746e-05, "loss": 0.0066, "step": 5330 }, { "epoch": 19.27797833935018, "grad_norm": 0.488912433385849, "learning_rate": 9.558942257865829e-05, "loss": 0.007, "step": 5340 }, { "epoch": 19.314079422382672, "grad_norm": 0.5092653632164001, "learning_rate": 9.556676107599472e-05, "loss": 0.0069, "step": 5350 }, { "epoch": 19.350180505415164, "grad_norm": 0.6036354303359985, "learning_rate": 9.554404420541978e-05, "loss": 0.0075, "step": 5360 }, { "epoch": 19.386281588447652, "grad_norm": 0.4569160044193268, "learning_rate": 9.55212719945366e-05, "loss": 0.0075, "step": 5370 }, { "epoch": 19.422382671480143, "grad_norm": 0.46270015835762024, "learning_rate": 9.549844447101559e-05, "loss": 0.0069, "step": 5380 }, { "epoch": 19.458483754512635, "grad_norm": 0.5937749147415161, "learning_rate": 9.547556166259433e-05, "loss": 0.0057, "step": 5390 }, { "epoch": 19.494584837545126, "grad_norm": 0.5011341571807861, "learning_rate": 9.545262359707756e-05, "loss": 0.0069, "step": 5400 }, { "epoch": 19.530685920577618, "grad_norm": 0.5448009967803955, "learning_rate": 9.542963030233724e-05, "loss": 0.0071, "step": 5410 }, { "epoch": 19.56678700361011, "grad_norm": 0.5897495746612549, "learning_rate": 9.540658180631237e-05, "loss": 0.0059, "step": 5420 }, { "epoch": 19.6028880866426, "grad_norm": 0.543596088886261, "learning_rate": 9.538347813700904e-05, "loss": 0.0069, "step": 5430 }, { "epoch": 19.63898916967509, "grad_norm": 0.4065196216106415, "learning_rate": 9.536031932250036e-05, "loss": 0.0069, "step": 5440 }, { "epoch": 19.67509025270758, "grad_norm": 0.42265546321868896, "learning_rate": 9.533710539092653e-05, "loss": 0.0063, "step": 5450 }, { "epoch": 19.71119133574007, "grad_norm": 0.42158156633377075, "learning_rate": 9.531383637049464e-05, "loss": 0.0068, "step": 5460 }, { "epoch": 19.747292418772563, "grad_norm": 0.40992334485054016, "learning_rate": 9.529051228947875e-05, "loss": 0.0065, "step": 5470 }, { "epoch": 19.783393501805055, "grad_norm": 0.4669246971607208, "learning_rate": 9.52671331762198e-05, "loss": 0.0059, "step": 5480 }, { "epoch": 19.819494584837546, "grad_norm": 0.4964161813259125, "learning_rate": 9.524369905912565e-05, "loss": 0.0073, "step": 5490 }, { "epoch": 19.855595667870038, "grad_norm": 0.5335344076156616, "learning_rate": 9.522020996667092e-05, "loss": 0.0072, "step": 5500 }, { "epoch": 19.891696750902526, "grad_norm": 0.4653686285018921, "learning_rate": 9.519666592739709e-05, "loss": 0.007, "step": 5510 }, { "epoch": 19.927797833935017, "grad_norm": 0.6025059223175049, "learning_rate": 9.517306696991241e-05, "loss": 0.0064, "step": 5520 }, { "epoch": 19.96389891696751, "grad_norm": 0.325710654258728, "learning_rate": 9.51494131228918e-05, "loss": 0.0076, "step": 5530 }, { "epoch": 20.0, "grad_norm": 0.5524658560752869, "learning_rate": 9.512570441507695e-05, "loss": 0.0065, "step": 5540 }, { "epoch": 20.03610108303249, "grad_norm": 0.3591141700744629, "learning_rate": 9.510194087527615e-05, "loss": 0.0062, "step": 5550 }, { "epoch": 20.072202166064983, "grad_norm": 0.6223198175430298, "learning_rate": 9.507812253236435e-05, "loss": 0.0071, "step": 5560 }, { "epoch": 20.108303249097474, "grad_norm": 0.49687519669532776, "learning_rate": 9.505424941528309e-05, "loss": 0.0063, "step": 5570 }, { "epoch": 20.144404332129962, "grad_norm": 0.5842772722244263, "learning_rate": 9.503032155304046e-05, "loss": 0.0061, "step": 5580 }, { "epoch": 20.180505415162454, "grad_norm": 0.5326083302497864, "learning_rate": 9.500633897471106e-05, "loss": 0.0063, "step": 5590 }, { "epoch": 20.216606498194945, "grad_norm": 0.5466530919075012, "learning_rate": 9.498230170943596e-05, "loss": 0.0079, "step": 5600 }, { "epoch": 20.252707581227437, "grad_norm": 0.41405656933784485, "learning_rate": 9.495820978642275e-05, "loss": 0.0064, "step": 5610 }, { "epoch": 20.28880866425993, "grad_norm": 0.4599393308162689, "learning_rate": 9.493406323494535e-05, "loss": 0.008, "step": 5620 }, { "epoch": 20.32490974729242, "grad_norm": 0.4277969002723694, "learning_rate": 9.490986208434413e-05, "loss": 0.0064, "step": 5630 }, { "epoch": 20.36101083032491, "grad_norm": 0.33089083433151245, "learning_rate": 9.488560636402577e-05, "loss": 0.0055, "step": 5640 }, { "epoch": 20.3971119133574, "grad_norm": 0.40624913573265076, "learning_rate": 9.486129610346321e-05, "loss": 0.0054, "step": 5650 }, { "epoch": 20.43321299638989, "grad_norm": 0.43951189517974854, "learning_rate": 9.483693133219576e-05, "loss": 0.0059, "step": 5660 }, { "epoch": 20.469314079422382, "grad_norm": 0.4479890465736389, "learning_rate": 9.481251207982888e-05, "loss": 0.0065, "step": 5670 }, { "epoch": 20.505415162454874, "grad_norm": 0.4974622428417206, "learning_rate": 9.47880383760343e-05, "loss": 0.007, "step": 5680 }, { "epoch": 20.541516245487365, "grad_norm": 0.4705882966518402, "learning_rate": 9.476351025054983e-05, "loss": 0.0069, "step": 5690 }, { "epoch": 20.577617328519857, "grad_norm": 0.39997318387031555, "learning_rate": 9.473892773317952e-05, "loss": 0.0066, "step": 5700 }, { "epoch": 20.613718411552348, "grad_norm": 0.3788575530052185, "learning_rate": 9.471429085379338e-05, "loss": 0.0063, "step": 5710 }, { "epoch": 20.649819494584836, "grad_norm": 0.3653368353843689, "learning_rate": 9.468959964232757e-05, "loss": 0.0059, "step": 5720 }, { "epoch": 20.685920577617328, "grad_norm": 0.546480119228363, "learning_rate": 9.466485412878425e-05, "loss": 0.0069, "step": 5730 }, { "epoch": 20.72202166064982, "grad_norm": 0.4471173882484436, "learning_rate": 9.464005434323154e-05, "loss": 0.0058, "step": 5740 }, { "epoch": 20.75812274368231, "grad_norm": 0.5625684857368469, "learning_rate": 9.461520031580352e-05, "loss": 0.0061, "step": 5750 }, { "epoch": 20.794223826714802, "grad_norm": 0.5397229790687561, "learning_rate": 9.459029207670019e-05, "loss": 0.0068, "step": 5760 }, { "epoch": 20.830324909747294, "grad_norm": 0.41750290989875793, "learning_rate": 9.456532965618737e-05, "loss": 0.0084, "step": 5770 }, { "epoch": 20.866425992779785, "grad_norm": 0.5338872671127319, "learning_rate": 9.454031308459681e-05, "loss": 0.0073, "step": 5780 }, { "epoch": 20.902527075812273, "grad_norm": 0.5177128911018372, "learning_rate": 9.451524239232595e-05, "loss": 0.0072, "step": 5790 }, { "epoch": 20.938628158844764, "grad_norm": 0.5074653029441833, "learning_rate": 9.449011760983809e-05, "loss": 0.007, "step": 5800 }, { "epoch": 20.974729241877256, "grad_norm": 0.4906035363674164, "learning_rate": 9.446493876766218e-05, "loss": 0.0087, "step": 5810 }, { "epoch": 21.010830324909747, "grad_norm": 0.48160770535469055, "learning_rate": 9.44397058963929e-05, "loss": 0.0083, "step": 5820 }, { "epoch": 21.04693140794224, "grad_norm": 0.501342236995697, "learning_rate": 9.441441902669056e-05, "loss": 0.0066, "step": 5830 }, { "epoch": 21.08303249097473, "grad_norm": 0.5209288001060486, "learning_rate": 9.43890781892811e-05, "loss": 0.0064, "step": 5840 }, { "epoch": 21.119133574007222, "grad_norm": 0.5229851007461548, "learning_rate": 9.436368341495603e-05, "loss": 0.0071, "step": 5850 }, { "epoch": 21.15523465703971, "grad_norm": 0.35847046971321106, "learning_rate": 9.43382347345724e-05, "loss": 0.0064, "step": 5860 }, { "epoch": 21.1913357400722, "grad_norm": 0.4250674843788147, "learning_rate": 9.431273217905273e-05, "loss": 0.0069, "step": 5870 }, { "epoch": 21.227436823104693, "grad_norm": 0.40796077251434326, "learning_rate": 9.428717577938504e-05, "loss": 0.0066, "step": 5880 }, { "epoch": 21.263537906137184, "grad_norm": 0.4614047706127167, "learning_rate": 9.426156556662276e-05, "loss": 0.0055, "step": 5890 }, { "epoch": 21.299638989169676, "grad_norm": 0.4617994725704193, "learning_rate": 9.423590157188474e-05, "loss": 0.0068, "step": 5900 }, { "epoch": 21.335740072202167, "grad_norm": 0.5680766701698303, "learning_rate": 9.421018382635513e-05, "loss": 0.0071, "step": 5910 }, { "epoch": 21.37184115523466, "grad_norm": 0.4481453001499176, "learning_rate": 9.418441236128343e-05, "loss": 0.0072, "step": 5920 }, { "epoch": 21.407942238267147, "grad_norm": 0.4868505895137787, "learning_rate": 9.41585872079844e-05, "loss": 0.0062, "step": 5930 }, { "epoch": 21.444043321299638, "grad_norm": 0.43570268154144287, "learning_rate": 9.413270839783802e-05, "loss": 0.0078, "step": 5940 }, { "epoch": 21.48014440433213, "grad_norm": 0.3691798448562622, "learning_rate": 9.41067759622895e-05, "loss": 0.0065, "step": 5950 }, { "epoch": 21.51624548736462, "grad_norm": 0.3901306390762329, "learning_rate": 9.408078993284917e-05, "loss": 0.0069, "step": 5960 }, { "epoch": 21.552346570397113, "grad_norm": 0.3907696008682251, "learning_rate": 9.405475034109254e-05, "loss": 0.0083, "step": 5970 }, { "epoch": 21.588447653429604, "grad_norm": 0.3574260175228119, "learning_rate": 9.402865721866015e-05, "loss": 0.0072, "step": 5980 }, { "epoch": 21.624548736462096, "grad_norm": 0.4298070967197418, "learning_rate": 9.400251059725762e-05, "loss": 0.0064, "step": 5990 }, { "epoch": 21.660649819494584, "grad_norm": 0.48715445399284363, "learning_rate": 9.397631050865554e-05, "loss": 0.0069, "step": 6000 } ], "logging_steps": 10, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 109, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }