groot-twist2_full_6000 / trainer_state.json
ArnieRamesh's picture
Upload folder using huggingface_hub
dfff520 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 21.660649819494584,
"eval_steps": 500,
"global_step": 6000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.036101083032490974,
"grad_norm": 0.8792149424552917,
"learning_rate": 6.000000000000001e-07,
"loss": 1.1882,
"step": 10
},
{
"epoch": 0.07220216606498195,
"grad_norm": 0.7292852997779846,
"learning_rate": 1.2666666666666667e-06,
"loss": 1.183,
"step": 20
},
{
"epoch": 0.10830324909747292,
"grad_norm": 0.6755103468894958,
"learning_rate": 1.9333333333333336e-06,
"loss": 1.172,
"step": 30
},
{
"epoch": 0.1444043321299639,
"grad_norm": 0.5594439506530762,
"learning_rate": 2.6e-06,
"loss": 1.1539,
"step": 40
},
{
"epoch": 0.18050541516245489,
"grad_norm": 0.5405519008636475,
"learning_rate": 3.2666666666666666e-06,
"loss": 1.1356,
"step": 50
},
{
"epoch": 0.21660649819494585,
"grad_norm": 0.5031427145004272,
"learning_rate": 3.9333333333333335e-06,
"loss": 1.1221,
"step": 60
},
{
"epoch": 0.2527075812274368,
"grad_norm": 0.502995491027832,
"learning_rate": 4.6e-06,
"loss": 1.1053,
"step": 70
},
{
"epoch": 0.2888086642599278,
"grad_norm": 0.46390628814697266,
"learning_rate": 5.266666666666667e-06,
"loss": 1.0908,
"step": 80
},
{
"epoch": 0.3249097472924188,
"grad_norm": 0.39707711338996887,
"learning_rate": 5.933333333333334e-06,
"loss": 1.0768,
"step": 90
},
{
"epoch": 0.36101083032490977,
"grad_norm": 0.2680375277996063,
"learning_rate": 6.6e-06,
"loss": 1.0651,
"step": 100
},
{
"epoch": 0.3971119133574007,
"grad_norm": 0.20656010508537292,
"learning_rate": 7.266666666666668e-06,
"loss": 1.0499,
"step": 110
},
{
"epoch": 0.4332129963898917,
"grad_norm": 0.2176329642534256,
"learning_rate": 7.933333333333334e-06,
"loss": 1.04,
"step": 120
},
{
"epoch": 0.4693140794223827,
"grad_norm": 0.1952984780073166,
"learning_rate": 8.599999999999999e-06,
"loss": 1.0314,
"step": 130
},
{
"epoch": 0.5054151624548736,
"grad_norm": 0.17900057137012482,
"learning_rate": 9.266666666666667e-06,
"loss": 1.023,
"step": 140
},
{
"epoch": 0.5415162454873647,
"grad_norm": 0.15559467673301697,
"learning_rate": 9.933333333333334e-06,
"loss": 1.0192,
"step": 150
},
{
"epoch": 0.5776173285198556,
"grad_norm": 0.1702832281589508,
"learning_rate": 1.06e-05,
"loss": 1.0109,
"step": 160
},
{
"epoch": 0.6137184115523465,
"grad_norm": 0.19487395882606506,
"learning_rate": 1.1266666666666667e-05,
"loss": 1.008,
"step": 170
},
{
"epoch": 0.6498194945848376,
"grad_norm": 0.16858609020709991,
"learning_rate": 1.1933333333333333e-05,
"loss": 1.0011,
"step": 180
},
{
"epoch": 0.6859205776173285,
"grad_norm": 0.2135946899652481,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.9937,
"step": 190
},
{
"epoch": 0.7220216606498195,
"grad_norm": 0.23672480881214142,
"learning_rate": 1.3266666666666666e-05,
"loss": 0.9848,
"step": 200
},
{
"epoch": 0.7581227436823105,
"grad_norm": 0.36581552028656006,
"learning_rate": 1.3933333333333334e-05,
"loss": 0.9661,
"step": 210
},
{
"epoch": 0.7942238267148014,
"grad_norm": 0.5021904110908508,
"learning_rate": 1.4599999999999999e-05,
"loss": 0.9308,
"step": 220
},
{
"epoch": 0.8303249097472925,
"grad_norm": 0.9705759882926941,
"learning_rate": 1.5266666666666667e-05,
"loss": 0.8545,
"step": 230
},
{
"epoch": 0.8664259927797834,
"grad_norm": 1.263683557510376,
"learning_rate": 1.5933333333333332e-05,
"loss": 0.7534,
"step": 240
},
{
"epoch": 0.9025270758122743,
"grad_norm": 1.0056917667388916,
"learning_rate": 1.66e-05,
"loss": 0.669,
"step": 250
},
{
"epoch": 0.9386281588447654,
"grad_norm": 1.5650357007980347,
"learning_rate": 1.726666666666667e-05,
"loss": 0.595,
"step": 260
},
{
"epoch": 0.9747292418772563,
"grad_norm": 1.4451838731765747,
"learning_rate": 1.7933333333333337e-05,
"loss": 0.5332,
"step": 270
},
{
"epoch": 1.0108303249097472,
"grad_norm": 1.5739387273788452,
"learning_rate": 1.86e-05,
"loss": 0.4718,
"step": 280
},
{
"epoch": 1.0469314079422383,
"grad_norm": 1.5814650058746338,
"learning_rate": 1.926666666666667e-05,
"loss": 0.4245,
"step": 290
},
{
"epoch": 1.0830324909747293,
"grad_norm": 1.677675485610962,
"learning_rate": 1.9933333333333334e-05,
"loss": 0.3778,
"step": 300
},
{
"epoch": 1.1191335740072201,
"grad_norm": 1.9410228729248047,
"learning_rate": 2.06e-05,
"loss": 0.3371,
"step": 310
},
{
"epoch": 1.1552346570397112,
"grad_norm": 1.7728365659713745,
"learning_rate": 2.1266666666666667e-05,
"loss": 0.3061,
"step": 320
},
{
"epoch": 1.1913357400722022,
"grad_norm": 1.5503789186477661,
"learning_rate": 2.1933333333333332e-05,
"loss": 0.2825,
"step": 330
},
{
"epoch": 1.2274368231046933,
"grad_norm": 1.77287757396698,
"learning_rate": 2.26e-05,
"loss": 0.2613,
"step": 340
},
{
"epoch": 1.263537906137184,
"grad_norm": 1.6768401861190796,
"learning_rate": 2.326666666666667e-05,
"loss": 0.2476,
"step": 350
},
{
"epoch": 1.2996389891696751,
"grad_norm": 1.5678222179412842,
"learning_rate": 2.3933333333333337e-05,
"loss": 0.2395,
"step": 360
},
{
"epoch": 1.335740072202166,
"grad_norm": 1.7035478353500366,
"learning_rate": 2.46e-05,
"loss": 0.2336,
"step": 370
},
{
"epoch": 1.371841155234657,
"grad_norm": 1.7723883390426636,
"learning_rate": 2.5266666666666666e-05,
"loss": 0.2269,
"step": 380
},
{
"epoch": 1.407942238267148,
"grad_norm": 1.5948989391326904,
"learning_rate": 2.5933333333333338e-05,
"loss": 0.224,
"step": 390
},
{
"epoch": 1.444043321299639,
"grad_norm": 2.1878089904785156,
"learning_rate": 2.6600000000000003e-05,
"loss": 0.2185,
"step": 400
},
{
"epoch": 1.48014440433213,
"grad_norm": 1.9472143650054932,
"learning_rate": 2.7266666666666668e-05,
"loss": 0.2148,
"step": 410
},
{
"epoch": 1.516245487364621,
"grad_norm": 2.4479265213012695,
"learning_rate": 2.7933333333333332e-05,
"loss": 0.215,
"step": 420
},
{
"epoch": 1.5523465703971118,
"grad_norm": 1.6668602228164673,
"learning_rate": 2.86e-05,
"loss": 0.2156,
"step": 430
},
{
"epoch": 1.5884476534296028,
"grad_norm": 2.297792434692383,
"learning_rate": 2.926666666666667e-05,
"loss": 0.2131,
"step": 440
},
{
"epoch": 1.6245487364620939,
"grad_norm": 1.9421613216400146,
"learning_rate": 2.9933333333333337e-05,
"loss": 0.2136,
"step": 450
},
{
"epoch": 1.660649819494585,
"grad_norm": 2.2717418670654297,
"learning_rate": 3.06e-05,
"loss": 0.2093,
"step": 460
},
{
"epoch": 1.696750902527076,
"grad_norm": 2.1423869132995605,
"learning_rate": 3.126666666666666e-05,
"loss": 0.2094,
"step": 470
},
{
"epoch": 1.7328519855595668,
"grad_norm": 2.2144198417663574,
"learning_rate": 3.1933333333333335e-05,
"loss": 0.2092,
"step": 480
},
{
"epoch": 1.7689530685920578,
"grad_norm": 1.9322779178619385,
"learning_rate": 3.26e-05,
"loss": 0.2064,
"step": 490
},
{
"epoch": 1.8050541516245486,
"grad_norm": 1.9350993633270264,
"learning_rate": 3.326666666666667e-05,
"loss": 0.2077,
"step": 500
},
{
"epoch": 1.8411552346570397,
"grad_norm": 1.7844723463058472,
"learning_rate": 3.3933333333333336e-05,
"loss": 0.2067,
"step": 510
},
{
"epoch": 1.8772563176895307,
"grad_norm": 2.405451774597168,
"learning_rate": 3.46e-05,
"loss": 0.2026,
"step": 520
},
{
"epoch": 1.9133574007220218,
"grad_norm": 2.382427453994751,
"learning_rate": 3.526666666666667e-05,
"loss": 0.2004,
"step": 530
},
{
"epoch": 1.9494584837545126,
"grad_norm": 2.5038397312164307,
"learning_rate": 3.593333333333334e-05,
"loss": 0.2034,
"step": 540
},
{
"epoch": 1.9855595667870036,
"grad_norm": 2.2127082347869873,
"learning_rate": 3.66e-05,
"loss": 0.2005,
"step": 550
},
{
"epoch": 2.0216606498194944,
"grad_norm": 2.9056589603424072,
"learning_rate": 3.726666666666667e-05,
"loss": 0.1995,
"step": 560
},
{
"epoch": 2.0577617328519855,
"grad_norm": 2.2383415699005127,
"learning_rate": 3.793333333333334e-05,
"loss": 0.2031,
"step": 570
},
{
"epoch": 2.0938628158844765,
"grad_norm": 1.9492878913879395,
"learning_rate": 3.86e-05,
"loss": 0.1991,
"step": 580
},
{
"epoch": 2.1299638989169676,
"grad_norm": 1.8478870391845703,
"learning_rate": 3.926666666666667e-05,
"loss": 0.1993,
"step": 590
},
{
"epoch": 2.1660649819494586,
"grad_norm": 2.0714190006256104,
"learning_rate": 3.993333333333333e-05,
"loss": 0.1977,
"step": 600
},
{
"epoch": 2.2021660649819497,
"grad_norm": 2.0085227489471436,
"learning_rate": 4.0600000000000004e-05,
"loss": 0.1949,
"step": 610
},
{
"epoch": 2.2382671480144403,
"grad_norm": 2.1462786197662354,
"learning_rate": 4.126666666666667e-05,
"loss": 0.1955,
"step": 620
},
{
"epoch": 2.2743682310469313,
"grad_norm": 1.9504297971725464,
"learning_rate": 4.1933333333333334e-05,
"loss": 0.1932,
"step": 630
},
{
"epoch": 2.3104693140794224,
"grad_norm": 2.304089069366455,
"learning_rate": 4.26e-05,
"loss": 0.1935,
"step": 640
},
{
"epoch": 2.3465703971119134,
"grad_norm": 2.4123048782348633,
"learning_rate": 4.3266666666666664e-05,
"loss": 0.1911,
"step": 650
},
{
"epoch": 2.3826714801444044,
"grad_norm": 2.5345194339752197,
"learning_rate": 4.3933333333333335e-05,
"loss": 0.1861,
"step": 660
},
{
"epoch": 2.4187725631768955,
"grad_norm": 2.2014682292938232,
"learning_rate": 4.46e-05,
"loss": 0.1887,
"step": 670
},
{
"epoch": 2.4548736462093865,
"grad_norm": 2.7363312244415283,
"learning_rate": 4.526666666666667e-05,
"loss": 0.1826,
"step": 680
},
{
"epoch": 2.490974729241877,
"grad_norm": 2.445457935333252,
"learning_rate": 4.5933333333333336e-05,
"loss": 0.1782,
"step": 690
},
{
"epoch": 2.527075812274368,
"grad_norm": 3.096940279006958,
"learning_rate": 4.660000000000001e-05,
"loss": 0.1719,
"step": 700
},
{
"epoch": 2.563176895306859,
"grad_norm": 3.281512498855591,
"learning_rate": 4.726666666666667e-05,
"loss": 0.168,
"step": 710
},
{
"epoch": 2.5992779783393503,
"grad_norm": 3.32570743560791,
"learning_rate": 4.793333333333334e-05,
"loss": 0.1634,
"step": 720
},
{
"epoch": 2.6353790613718413,
"grad_norm": 2.5920348167419434,
"learning_rate": 4.86e-05,
"loss": 0.1641,
"step": 730
},
{
"epoch": 2.671480144404332,
"grad_norm": 3.0771641731262207,
"learning_rate": 4.926666666666667e-05,
"loss": 0.1589,
"step": 740
},
{
"epoch": 2.707581227436823,
"grad_norm": 3.3847527503967285,
"learning_rate": 4.993333333333334e-05,
"loss": 0.1588,
"step": 750
},
{
"epoch": 2.743682310469314,
"grad_norm": 3.3967106342315674,
"learning_rate": 5.0600000000000003e-05,
"loss": 0.1537,
"step": 760
},
{
"epoch": 2.779783393501805,
"grad_norm": 3.562208652496338,
"learning_rate": 5.1266666666666675e-05,
"loss": 0.1518,
"step": 770
},
{
"epoch": 2.815884476534296,
"grad_norm": 4.249292850494385,
"learning_rate": 5.193333333333333e-05,
"loss": 0.1517,
"step": 780
},
{
"epoch": 2.851985559566787,
"grad_norm": 3.7535207271575928,
"learning_rate": 5.2600000000000005e-05,
"loss": 0.1489,
"step": 790
},
{
"epoch": 2.888086642599278,
"grad_norm": 4.374902248382568,
"learning_rate": 5.326666666666666e-05,
"loss": 0.1449,
"step": 800
},
{
"epoch": 2.9241877256317688,
"grad_norm": 3.9763412475585938,
"learning_rate": 5.3933333333333334e-05,
"loss": 0.1359,
"step": 810
},
{
"epoch": 2.96028880866426,
"grad_norm": 3.976372241973877,
"learning_rate": 5.4600000000000006e-05,
"loss": 0.1327,
"step": 820
},
{
"epoch": 2.996389891696751,
"grad_norm": 2.532400131225586,
"learning_rate": 5.5266666666666664e-05,
"loss": 0.1238,
"step": 830
},
{
"epoch": 3.032490974729242,
"grad_norm": 7.225079536437988,
"learning_rate": 5.5933333333333335e-05,
"loss": 0.1338,
"step": 840
},
{
"epoch": 3.068592057761733,
"grad_norm": 3.196591854095459,
"learning_rate": 5.66e-05,
"loss": 0.1232,
"step": 850
},
{
"epoch": 3.104693140794224,
"grad_norm": 4.404566764831543,
"learning_rate": 5.726666666666667e-05,
"loss": 0.1137,
"step": 860
},
{
"epoch": 3.140794223826715,
"grad_norm": 5.581392765045166,
"learning_rate": 5.7933333333333337e-05,
"loss": 0.1121,
"step": 870
},
{
"epoch": 3.1768953068592056,
"grad_norm": 4.208507061004639,
"learning_rate": 5.86e-05,
"loss": 0.1147,
"step": 880
},
{
"epoch": 3.2129963898916967,
"grad_norm": 4.072608947753906,
"learning_rate": 5.926666666666667e-05,
"loss": 0.1048,
"step": 890
},
{
"epoch": 3.2490974729241877,
"grad_norm": 6.415537357330322,
"learning_rate": 5.9933333333333345e-05,
"loss": 0.1023,
"step": 900
},
{
"epoch": 3.2851985559566788,
"grad_norm": 5.0296854972839355,
"learning_rate": 6.06e-05,
"loss": 0.0996,
"step": 910
},
{
"epoch": 3.32129963898917,
"grad_norm": 3.894113779067993,
"learning_rate": 6.126666666666667e-05,
"loss": 0.0899,
"step": 920
},
{
"epoch": 3.357400722021661,
"grad_norm": 4.2843017578125,
"learning_rate": 6.193333333333333e-05,
"loss": 0.0829,
"step": 930
},
{
"epoch": 3.3935018050541514,
"grad_norm": 7.592728614807129,
"learning_rate": 6.26e-05,
"loss": 0.0781,
"step": 940
},
{
"epoch": 3.4296028880866425,
"grad_norm": 5.444018840789795,
"learning_rate": 6.326666666666667e-05,
"loss": 0.0922,
"step": 950
},
{
"epoch": 3.4657039711191335,
"grad_norm": 4.786616802215576,
"learning_rate": 6.393333333333333e-05,
"loss": 0.0789,
"step": 960
},
{
"epoch": 3.5018050541516246,
"grad_norm": 3.325745105743408,
"learning_rate": 6.460000000000001e-05,
"loss": 0.0673,
"step": 970
},
{
"epoch": 3.5379061371841156,
"grad_norm": 3.444308280944824,
"learning_rate": 6.526666666666666e-05,
"loss": 0.063,
"step": 980
},
{
"epoch": 3.5740072202166067,
"grad_norm": 4.334812641143799,
"learning_rate": 6.593333333333334e-05,
"loss": 0.0629,
"step": 990
},
{
"epoch": 3.6101083032490973,
"grad_norm": 3.642155647277832,
"learning_rate": 6.66e-05,
"loss": 0.0584,
"step": 1000
},
{
"epoch": 3.6462093862815883,
"grad_norm": 3.5180516242980957,
"learning_rate": 6.726666666666667e-05,
"loss": 0.052,
"step": 1010
},
{
"epoch": 3.6823104693140793,
"grad_norm": 2.7395567893981934,
"learning_rate": 6.793333333333334e-05,
"loss": 0.049,
"step": 1020
},
{
"epoch": 3.7184115523465704,
"grad_norm": 6.758563041687012,
"learning_rate": 6.860000000000001e-05,
"loss": 0.0553,
"step": 1030
},
{
"epoch": 3.7545126353790614,
"grad_norm": 3.3643603324890137,
"learning_rate": 6.926666666666667e-05,
"loss": 0.066,
"step": 1040
},
{
"epoch": 3.7906137184115525,
"grad_norm": 5.385725021362305,
"learning_rate": 6.993333333333334e-05,
"loss": 0.0769,
"step": 1050
},
{
"epoch": 3.8267148014440435,
"grad_norm": 3.6493208408355713,
"learning_rate": 7.06e-05,
"loss": 0.0569,
"step": 1060
},
{
"epoch": 3.862815884476534,
"grad_norm": 3.325450897216797,
"learning_rate": 7.126666666666667e-05,
"loss": 0.0483,
"step": 1070
},
{
"epoch": 3.898916967509025,
"grad_norm": 2.4501149654388428,
"learning_rate": 7.193333333333334e-05,
"loss": 0.0408,
"step": 1080
},
{
"epoch": 3.935018050541516,
"grad_norm": 1.90947425365448,
"learning_rate": 7.26e-05,
"loss": 0.0362,
"step": 1090
},
{
"epoch": 3.9711191335740073,
"grad_norm": 1.8476409912109375,
"learning_rate": 7.326666666666667e-05,
"loss": 0.0392,
"step": 1100
},
{
"epoch": 4.007220216606498,
"grad_norm": 2.7553763389587402,
"learning_rate": 7.393333333333333e-05,
"loss": 0.0381,
"step": 1110
},
{
"epoch": 4.043321299638989,
"grad_norm": 2.647038698196411,
"learning_rate": 7.46e-05,
"loss": 0.0338,
"step": 1120
},
{
"epoch": 4.07942238267148,
"grad_norm": 2.1170711517333984,
"learning_rate": 7.526666666666668e-05,
"loss": 0.0353,
"step": 1130
},
{
"epoch": 4.115523465703971,
"grad_norm": 2.6540422439575195,
"learning_rate": 7.593333333333334e-05,
"loss": 0.0326,
"step": 1140
},
{
"epoch": 4.1516245487364625,
"grad_norm": 2.1124935150146484,
"learning_rate": 7.66e-05,
"loss": 0.0309,
"step": 1150
},
{
"epoch": 4.187725631768953,
"grad_norm": 2.653675079345703,
"learning_rate": 7.726666666666667e-05,
"loss": 0.0319,
"step": 1160
},
{
"epoch": 4.223826714801444,
"grad_norm": 2.696803569793701,
"learning_rate": 7.793333333333333e-05,
"loss": 0.0328,
"step": 1170
},
{
"epoch": 4.259927797833935,
"grad_norm": 2.675212860107422,
"learning_rate": 7.860000000000001e-05,
"loss": 0.0317,
"step": 1180
},
{
"epoch": 4.296028880866426,
"grad_norm": 2.1661317348480225,
"learning_rate": 7.926666666666666e-05,
"loss": 0.0293,
"step": 1190
},
{
"epoch": 4.332129963898917,
"grad_norm": 2.6710309982299805,
"learning_rate": 7.993333333333334e-05,
"loss": 0.0309,
"step": 1200
},
{
"epoch": 4.368231046931408,
"grad_norm": 2.7322568893432617,
"learning_rate": 8.060000000000001e-05,
"loss": 0.0311,
"step": 1210
},
{
"epoch": 4.404332129963899,
"grad_norm": 2.741199016571045,
"learning_rate": 8.126666666666667e-05,
"loss": 0.03,
"step": 1220
},
{
"epoch": 4.44043321299639,
"grad_norm": 2.3981032371520996,
"learning_rate": 8.193333333333334e-05,
"loss": 0.0282,
"step": 1230
},
{
"epoch": 4.4765342960288805,
"grad_norm": 2.7882485389709473,
"learning_rate": 8.26e-05,
"loss": 0.028,
"step": 1240
},
{
"epoch": 4.512635379061372,
"grad_norm": 2.624581813812256,
"learning_rate": 8.326666666666667e-05,
"loss": 0.0307,
"step": 1250
},
{
"epoch": 4.548736462093863,
"grad_norm": 2.1602065563201904,
"learning_rate": 8.393333333333335e-05,
"loss": 0.0275,
"step": 1260
},
{
"epoch": 4.584837545126354,
"grad_norm": 1.852003812789917,
"learning_rate": 8.46e-05,
"loss": 0.025,
"step": 1270
},
{
"epoch": 4.620938628158845,
"grad_norm": 2.615730047225952,
"learning_rate": 8.526666666666667e-05,
"loss": 0.03,
"step": 1280
},
{
"epoch": 4.657039711191336,
"grad_norm": 2.2223312854766846,
"learning_rate": 8.593333333333333e-05,
"loss": 0.0267,
"step": 1290
},
{
"epoch": 4.693140794223827,
"grad_norm": 2.612130880355835,
"learning_rate": 8.66e-05,
"loss": 0.027,
"step": 1300
},
{
"epoch": 4.729241877256317,
"grad_norm": 2.06195068359375,
"learning_rate": 8.726666666666667e-05,
"loss": 0.0256,
"step": 1310
},
{
"epoch": 4.765342960288809,
"grad_norm": 2.914454936981201,
"learning_rate": 8.793333333333333e-05,
"loss": 0.0263,
"step": 1320
},
{
"epoch": 4.8014440433212995,
"grad_norm": 2.4399166107177734,
"learning_rate": 8.86e-05,
"loss": 0.0269,
"step": 1330
},
{
"epoch": 4.837545126353791,
"grad_norm": 2.0535504817962646,
"learning_rate": 8.926666666666668e-05,
"loss": 0.026,
"step": 1340
},
{
"epoch": 4.873646209386282,
"grad_norm": 2.612743377685547,
"learning_rate": 8.993333333333334e-05,
"loss": 0.0265,
"step": 1350
},
{
"epoch": 4.909747292418773,
"grad_norm": 2.352599859237671,
"learning_rate": 9.06e-05,
"loss": 0.026,
"step": 1360
},
{
"epoch": 4.945848375451264,
"grad_norm": 2.1630280017852783,
"learning_rate": 9.126666666666667e-05,
"loss": 0.0253,
"step": 1370
},
{
"epoch": 4.981949458483754,
"grad_norm": 2.0943126678466797,
"learning_rate": 9.193333333333334e-05,
"loss": 0.0225,
"step": 1380
},
{
"epoch": 5.018050541516246,
"grad_norm": 1.8519400358200073,
"learning_rate": 9.260000000000001e-05,
"loss": 0.0245,
"step": 1390
},
{
"epoch": 5.054151624548736,
"grad_norm": 2.359534740447998,
"learning_rate": 9.326666666666667e-05,
"loss": 0.0241,
"step": 1400
},
{
"epoch": 5.090252707581228,
"grad_norm": 1.862101435661316,
"learning_rate": 9.393333333333334e-05,
"loss": 0.0268,
"step": 1410
},
{
"epoch": 5.126353790613718,
"grad_norm": 2.0919692516326904,
"learning_rate": 9.46e-05,
"loss": 0.0237,
"step": 1420
},
{
"epoch": 5.162454873646209,
"grad_norm": 1.861625075340271,
"learning_rate": 9.526666666666667e-05,
"loss": 0.0258,
"step": 1430
},
{
"epoch": 5.1985559566787005,
"grad_norm": 2.132181167602539,
"learning_rate": 9.593333333333334e-05,
"loss": 0.0252,
"step": 1440
},
{
"epoch": 5.234657039711191,
"grad_norm": 1.9146952629089355,
"learning_rate": 9.66e-05,
"loss": 0.0233,
"step": 1450
},
{
"epoch": 5.270758122743683,
"grad_norm": 1.6309505701065063,
"learning_rate": 9.726666666666667e-05,
"loss": 0.0226,
"step": 1460
},
{
"epoch": 5.306859205776173,
"grad_norm": 1.7791682481765747,
"learning_rate": 9.793333333333333e-05,
"loss": 0.0257,
"step": 1470
},
{
"epoch": 5.342960288808664,
"grad_norm": 1.6199488639831543,
"learning_rate": 9.86e-05,
"loss": 0.0221,
"step": 1480
},
{
"epoch": 5.379061371841155,
"grad_norm": 2.206078052520752,
"learning_rate": 9.926666666666668e-05,
"loss": 0.0206,
"step": 1490
},
{
"epoch": 5.415162454873646,
"grad_norm": 1.5716779232025146,
"learning_rate": 9.993333333333334e-05,
"loss": 0.021,
"step": 1500
},
{
"epoch": 5.451263537906137,
"grad_norm": 1.9805904626846313,
"learning_rate": 9.999997539434007e-05,
"loss": 0.021,
"step": 1510
},
{
"epoch": 5.487364620938628,
"grad_norm": 2.1759257316589355,
"learning_rate": 9.999989033776898e-05,
"loss": 0.0202,
"step": 1520
},
{
"epoch": 5.5234657039711195,
"grad_norm": 2.1213581562042236,
"learning_rate": 9.999974452661641e-05,
"loss": 0.0204,
"step": 1530
},
{
"epoch": 5.55956678700361,
"grad_norm": 1.5911200046539307,
"learning_rate": 9.999953796105959e-05,
"loss": 0.0214,
"step": 1540
},
{
"epoch": 5.595667870036101,
"grad_norm": 1.9394819736480713,
"learning_rate": 9.999927064134949e-05,
"loss": 0.0243,
"step": 1550
},
{
"epoch": 5.631768953068592,
"grad_norm": 1.736773133277893,
"learning_rate": 9.999894256781095e-05,
"loss": 0.0215,
"step": 1560
},
{
"epoch": 5.667870036101083,
"grad_norm": 1.6920562982559204,
"learning_rate": 9.99985537408426e-05,
"loss": 0.0222,
"step": 1570
},
{
"epoch": 5.703971119133574,
"grad_norm": 1.6096794605255127,
"learning_rate": 9.999810416091688e-05,
"loss": 0.0194,
"step": 1580
},
{
"epoch": 5.740072202166065,
"grad_norm": 1.619868278503418,
"learning_rate": 9.99975938285801e-05,
"loss": 0.0184,
"step": 1590
},
{
"epoch": 5.776173285198556,
"grad_norm": 1.3996607065200806,
"learning_rate": 9.999702274445236e-05,
"loss": 0.0198,
"step": 1600
},
{
"epoch": 5.812274368231047,
"grad_norm": 2.005847454071045,
"learning_rate": 9.999639090922756e-05,
"loss": 0.0185,
"step": 1610
},
{
"epoch": 5.8483754512635375,
"grad_norm": 1.3449925184249878,
"learning_rate": 9.999569832367346e-05,
"loss": 0.0188,
"step": 1620
},
{
"epoch": 5.884476534296029,
"grad_norm": 1.3387703895568848,
"learning_rate": 9.999494498863162e-05,
"loss": 0.0188,
"step": 1630
},
{
"epoch": 5.92057761732852,
"grad_norm": 1.3687690496444702,
"learning_rate": 9.99941309050174e-05,
"loss": 0.019,
"step": 1640
},
{
"epoch": 5.956678700361011,
"grad_norm": 1.5104888677597046,
"learning_rate": 9.999325607381999e-05,
"loss": 0.0219,
"step": 1650
},
{
"epoch": 5.992779783393502,
"grad_norm": 1.6341077089309692,
"learning_rate": 9.999232049610238e-05,
"loss": 0.0193,
"step": 1660
},
{
"epoch": 6.028880866425993,
"grad_norm": 1.4791560173034668,
"learning_rate": 9.999132417300142e-05,
"loss": 0.0182,
"step": 1670
},
{
"epoch": 6.064981949458484,
"grad_norm": 2.0534698963165283,
"learning_rate": 9.99902671057277e-05,
"loss": 0.0185,
"step": 1680
},
{
"epoch": 6.101083032490974,
"grad_norm": 1.9715570211410522,
"learning_rate": 9.998914929556569e-05,
"loss": 0.0197,
"step": 1690
},
{
"epoch": 6.137184115523466,
"grad_norm": 1.714123010635376,
"learning_rate": 9.998797074387361e-05,
"loss": 0.0186,
"step": 1700
},
{
"epoch": 6.1732851985559565,
"grad_norm": 1.6185153722763062,
"learning_rate": 9.99867314520835e-05,
"loss": 0.016,
"step": 1710
},
{
"epoch": 6.209386281588448,
"grad_norm": 1.3633183240890503,
"learning_rate": 9.998543142170126e-05,
"loss": 0.0193,
"step": 1720
},
{
"epoch": 6.245487364620939,
"grad_norm": 1.5912808179855347,
"learning_rate": 9.99840706543065e-05,
"loss": 0.0194,
"step": 1730
},
{
"epoch": 6.28158844765343,
"grad_norm": 1.9936370849609375,
"learning_rate": 9.998264915155274e-05,
"loss": 0.0172,
"step": 1740
},
{
"epoch": 6.317689530685921,
"grad_norm": 1.7528094053268433,
"learning_rate": 9.998116691516718e-05,
"loss": 0.0185,
"step": 1750
},
{
"epoch": 6.353790613718411,
"grad_norm": 1.7351869344711304,
"learning_rate": 9.997962394695091e-05,
"loss": 0.0174,
"step": 1760
},
{
"epoch": 6.389891696750903,
"grad_norm": 1.1427206993103027,
"learning_rate": 9.997802024877875e-05,
"loss": 0.0176,
"step": 1770
},
{
"epoch": 6.425992779783393,
"grad_norm": 1.2416205406188965,
"learning_rate": 9.99763558225994e-05,
"loss": 0.0169,
"step": 1780
},
{
"epoch": 6.462093862815885,
"grad_norm": 1.6932954788208008,
"learning_rate": 9.997463067043526e-05,
"loss": 0.0178,
"step": 1790
},
{
"epoch": 6.498194945848375,
"grad_norm": 1.6806602478027344,
"learning_rate": 9.997284479438253e-05,
"loss": 0.0186,
"step": 1800
},
{
"epoch": 6.534296028880867,
"grad_norm": 1.7252382040023804,
"learning_rate": 9.997099819661127e-05,
"loss": 0.0169,
"step": 1810
},
{
"epoch": 6.5703971119133575,
"grad_norm": 1.7361781597137451,
"learning_rate": 9.996909087936524e-05,
"loss": 0.0172,
"step": 1820
},
{
"epoch": 6.606498194945848,
"grad_norm": 1.5690287351608276,
"learning_rate": 9.996712284496201e-05,
"loss": 0.0186,
"step": 1830
},
{
"epoch": 6.64259927797834,
"grad_norm": 1.3086954355239868,
"learning_rate": 9.996509409579293e-05,
"loss": 0.016,
"step": 1840
},
{
"epoch": 6.67870036101083,
"grad_norm": 1.392793893814087,
"learning_rate": 9.996300463432312e-05,
"loss": 0.0167,
"step": 1850
},
{
"epoch": 6.714801444043322,
"grad_norm": 1.2733964920043945,
"learning_rate": 9.996085446309148e-05,
"loss": 0.0166,
"step": 1860
},
{
"epoch": 6.750902527075812,
"grad_norm": 1.3202944993972778,
"learning_rate": 9.995864358471066e-05,
"loss": 0.0161,
"step": 1870
},
{
"epoch": 6.787003610108303,
"grad_norm": 1.843495488166809,
"learning_rate": 9.99563720018671e-05,
"loss": 0.0168,
"step": 1880
},
{
"epoch": 6.823104693140794,
"grad_norm": 1.3712539672851562,
"learning_rate": 9.995403971732098e-05,
"loss": 0.0158,
"step": 1890
},
{
"epoch": 6.859205776173285,
"grad_norm": 1.2827091217041016,
"learning_rate": 9.995164673390625e-05,
"loss": 0.0174,
"step": 1900
},
{
"epoch": 6.8953068592057765,
"grad_norm": 1.1793349981307983,
"learning_rate": 9.994919305453059e-05,
"loss": 0.0143,
"step": 1910
},
{
"epoch": 6.931407942238267,
"grad_norm": 1.3544561862945557,
"learning_rate": 9.994667868217548e-05,
"loss": 0.0162,
"step": 1920
},
{
"epoch": 6.967509025270758,
"grad_norm": 1.3066773414611816,
"learning_rate": 9.99441036198961e-05,
"loss": 0.0151,
"step": 1930
},
{
"epoch": 7.003610108303249,
"grad_norm": 1.5574922561645508,
"learning_rate": 9.99414678708214e-05,
"loss": 0.0152,
"step": 1940
},
{
"epoch": 7.03971119133574,
"grad_norm": 1.518237829208374,
"learning_rate": 9.993877143815407e-05,
"loss": 0.015,
"step": 1950
},
{
"epoch": 7.075812274368231,
"grad_norm": 1.5742323398590088,
"learning_rate": 9.993601432517053e-05,
"loss": 0.0188,
"step": 1960
},
{
"epoch": 7.111913357400722,
"grad_norm": 1.7677223682403564,
"learning_rate": 9.993319653522091e-05,
"loss": 0.0163,
"step": 1970
},
{
"epoch": 7.148014440433213,
"grad_norm": 1.4965007305145264,
"learning_rate": 9.993031807172911e-05,
"loss": 0.0164,
"step": 1980
},
{
"epoch": 7.184115523465704,
"grad_norm": 1.4197651147842407,
"learning_rate": 9.992737893819273e-05,
"loss": 0.0187,
"step": 1990
},
{
"epoch": 7.2202166064981945,
"grad_norm": 1.5714497566223145,
"learning_rate": 9.992437913818312e-05,
"loss": 0.0176,
"step": 2000
},
{
"epoch": 7.256317689530686,
"grad_norm": 1.3549633026123047,
"learning_rate": 9.992131867534526e-05,
"loss": 0.0154,
"step": 2010
},
{
"epoch": 7.292418772563177,
"grad_norm": 1.270080327987671,
"learning_rate": 9.991819755339796e-05,
"loss": 0.0141,
"step": 2020
},
{
"epoch": 7.328519855595668,
"grad_norm": 1.090819001197815,
"learning_rate": 9.991501577613365e-05,
"loss": 0.0135,
"step": 2030
},
{
"epoch": 7.364620938628159,
"grad_norm": 1.3705732822418213,
"learning_rate": 9.99117733474185e-05,
"loss": 0.0149,
"step": 2040
},
{
"epoch": 7.40072202166065,
"grad_norm": 1.3088817596435547,
"learning_rate": 9.990847027119234e-05,
"loss": 0.0148,
"step": 2050
},
{
"epoch": 7.436823104693141,
"grad_norm": 1.511338233947754,
"learning_rate": 9.990510655146877e-05,
"loss": 0.0149,
"step": 2060
},
{
"epoch": 7.472924187725631,
"grad_norm": 1.5276422500610352,
"learning_rate": 9.990168219233496e-05,
"loss": 0.016,
"step": 2070
},
{
"epoch": 7.509025270758123,
"grad_norm": 1.3451029062271118,
"learning_rate": 9.989819719795188e-05,
"loss": 0.015,
"step": 2080
},
{
"epoch": 7.5451263537906135,
"grad_norm": 1.1688835620880127,
"learning_rate": 9.989465157255412e-05,
"loss": 0.0141,
"step": 2090
},
{
"epoch": 7.581227436823105,
"grad_norm": 0.800317108631134,
"learning_rate": 9.989104532044994e-05,
"loss": 0.0128,
"step": 2100
},
{
"epoch": 7.617328519855596,
"grad_norm": 0.9989607334136963,
"learning_rate": 9.988737844602128e-05,
"loss": 0.0127,
"step": 2110
},
{
"epoch": 7.653429602888087,
"grad_norm": 1.1656674146652222,
"learning_rate": 9.988365095372372e-05,
"loss": 0.0139,
"step": 2120
},
{
"epoch": 7.689530685920578,
"grad_norm": 1.295882225036621,
"learning_rate": 9.987986284808654e-05,
"loss": 0.014,
"step": 2130
},
{
"epoch": 7.725631768953068,
"grad_norm": 1.0984694957733154,
"learning_rate": 9.987601413371264e-05,
"loss": 0.0129,
"step": 2140
},
{
"epoch": 7.76173285198556,
"grad_norm": 1.3702571392059326,
"learning_rate": 9.987210481527855e-05,
"loss": 0.0149,
"step": 2150
},
{
"epoch": 7.79783393501805,
"grad_norm": 1.617008090019226,
"learning_rate": 9.98681348975345e-05,
"loss": 0.0141,
"step": 2160
},
{
"epoch": 7.833935018050542,
"grad_norm": 1.3128085136413574,
"learning_rate": 9.986410438530427e-05,
"loss": 0.0154,
"step": 2170
},
{
"epoch": 7.870036101083032,
"grad_norm": 1.1459583044052124,
"learning_rate": 9.986001328348534e-05,
"loss": 0.0127,
"step": 2180
},
{
"epoch": 7.906137184115524,
"grad_norm": 1.0433539152145386,
"learning_rate": 9.985586159704878e-05,
"loss": 0.0143,
"step": 2190
},
{
"epoch": 7.9422382671480145,
"grad_norm": 0.9214907288551331,
"learning_rate": 9.985164933103929e-05,
"loss": 0.0139,
"step": 2200
},
{
"epoch": 7.978339350180505,
"grad_norm": 1.1234197616577148,
"learning_rate": 9.984737649057513e-05,
"loss": 0.0132,
"step": 2210
},
{
"epoch": 8.014440433212997,
"grad_norm": 1.1866891384124756,
"learning_rate": 9.984304308084827e-05,
"loss": 0.0135,
"step": 2220
},
{
"epoch": 8.050541516245488,
"grad_norm": 1.1017005443572998,
"learning_rate": 9.983864910712416e-05,
"loss": 0.0128,
"step": 2230
},
{
"epoch": 8.086642599277978,
"grad_norm": 0.9575669169425964,
"learning_rate": 9.98341945747419e-05,
"loss": 0.0128,
"step": 2240
},
{
"epoch": 8.12274368231047,
"grad_norm": 0.9450060129165649,
"learning_rate": 9.98296794891142e-05,
"loss": 0.0138,
"step": 2250
},
{
"epoch": 8.15884476534296,
"grad_norm": 0.7817940711975098,
"learning_rate": 9.982510385572725e-05,
"loss": 0.0111,
"step": 2260
},
{
"epoch": 8.19494584837545,
"grad_norm": 1.0894697904586792,
"learning_rate": 9.982046768014094e-05,
"loss": 0.0123,
"step": 2270
},
{
"epoch": 8.231046931407942,
"grad_norm": 1.186640977859497,
"learning_rate": 9.981577096798863e-05,
"loss": 0.0133,
"step": 2280
},
{
"epoch": 8.267148014440433,
"grad_norm": 1.110902190208435,
"learning_rate": 9.981101372497727e-05,
"loss": 0.0128,
"step": 2290
},
{
"epoch": 8.303249097472925,
"grad_norm": 1.0896000862121582,
"learning_rate": 9.980619595688737e-05,
"loss": 0.0136,
"step": 2300
},
{
"epoch": 8.339350180505415,
"grad_norm": 0.944868803024292,
"learning_rate": 9.980131766957295e-05,
"loss": 0.013,
"step": 2310
},
{
"epoch": 8.375451263537906,
"grad_norm": 0.8644301891326904,
"learning_rate": 9.979637886896163e-05,
"loss": 0.0122,
"step": 2320
},
{
"epoch": 8.411552346570398,
"grad_norm": 0.7042171359062195,
"learning_rate": 9.979137956105447e-05,
"loss": 0.0121,
"step": 2330
},
{
"epoch": 8.447653429602887,
"grad_norm": 0.7054566740989685,
"learning_rate": 9.978631975192613e-05,
"loss": 0.0122,
"step": 2340
},
{
"epoch": 8.483754512635379,
"grad_norm": 0.9158279895782471,
"learning_rate": 9.978119944772475e-05,
"loss": 0.0133,
"step": 2350
},
{
"epoch": 8.51985559566787,
"grad_norm": 0.7022776007652283,
"learning_rate": 9.977601865467197e-05,
"loss": 0.0126,
"step": 2360
},
{
"epoch": 8.555956678700362,
"grad_norm": 0.7667548060417175,
"learning_rate": 9.977077737906297e-05,
"loss": 0.0118,
"step": 2370
},
{
"epoch": 8.592057761732852,
"grad_norm": 0.766633152961731,
"learning_rate": 9.976547562726636e-05,
"loss": 0.0135,
"step": 2380
},
{
"epoch": 8.628158844765343,
"grad_norm": 0.8130113482475281,
"learning_rate": 9.976011340572429e-05,
"loss": 0.0135,
"step": 2390
},
{
"epoch": 8.664259927797834,
"grad_norm": 0.9371001124382019,
"learning_rate": 9.975469072095237e-05,
"loss": 0.0116,
"step": 2400
},
{
"epoch": 8.700361010830324,
"grad_norm": 0.9312263131141663,
"learning_rate": 9.974920757953965e-05,
"loss": 0.0131,
"step": 2410
},
{
"epoch": 8.736462093862816,
"grad_norm": 0.9781073331832886,
"learning_rate": 9.97436639881487e-05,
"loss": 0.0127,
"step": 2420
},
{
"epoch": 8.772563176895307,
"grad_norm": 0.923537015914917,
"learning_rate": 9.973805995351545e-05,
"loss": 0.0119,
"step": 2430
},
{
"epoch": 8.808664259927799,
"grad_norm": 0.9015010595321655,
"learning_rate": 9.973239548244939e-05,
"loss": 0.0119,
"step": 2440
},
{
"epoch": 8.844765342960288,
"grad_norm": 0.9531866312026978,
"learning_rate": 9.972667058183333e-05,
"loss": 0.0115,
"step": 2450
},
{
"epoch": 8.88086642599278,
"grad_norm": 0.8554368019104004,
"learning_rate": 9.972088525862362e-05,
"loss": 0.0127,
"step": 2460
},
{
"epoch": 8.916967509025271,
"grad_norm": 0.7815523147583008,
"learning_rate": 9.971503951984995e-05,
"loss": 0.0129,
"step": 2470
},
{
"epoch": 8.953068592057761,
"grad_norm": 0.6722756624221802,
"learning_rate": 9.970913337261543e-05,
"loss": 0.0134,
"step": 2480
},
{
"epoch": 8.989169675090253,
"grad_norm": 0.7026623487472534,
"learning_rate": 9.97031668240966e-05,
"loss": 0.0118,
"step": 2490
},
{
"epoch": 9.025270758122744,
"grad_norm": 0.8489606976509094,
"learning_rate": 9.969713988154339e-05,
"loss": 0.0125,
"step": 2500
},
{
"epoch": 9.061371841155236,
"grad_norm": 0.8544819951057434,
"learning_rate": 9.969105255227906e-05,
"loss": 0.0125,
"step": 2510
},
{
"epoch": 9.097472924187725,
"grad_norm": 0.865927517414093,
"learning_rate": 9.968490484370035e-05,
"loss": 0.0117,
"step": 2520
},
{
"epoch": 9.133574007220217,
"grad_norm": 0.9897856712341309,
"learning_rate": 9.967869676327726e-05,
"loss": 0.0109,
"step": 2530
},
{
"epoch": 9.169675090252708,
"grad_norm": 1.0381364822387695,
"learning_rate": 9.96724283185532e-05,
"loss": 0.0102,
"step": 2540
},
{
"epoch": 9.205776173285198,
"grad_norm": 1.1119805574417114,
"learning_rate": 9.966609951714494e-05,
"loss": 0.0117,
"step": 2550
},
{
"epoch": 9.24187725631769,
"grad_norm": 1.0177388191223145,
"learning_rate": 9.965971036674255e-05,
"loss": 0.012,
"step": 2560
},
{
"epoch": 9.277978339350181,
"grad_norm": 0.804807722568512,
"learning_rate": 9.965326087510947e-05,
"loss": 0.0125,
"step": 2570
},
{
"epoch": 9.314079422382672,
"grad_norm": 0.9009324312210083,
"learning_rate": 9.964675105008244e-05,
"loss": 0.0118,
"step": 2580
},
{
"epoch": 9.350180505415162,
"grad_norm": 0.7471716403961182,
"learning_rate": 9.964018089957147e-05,
"loss": 0.0093,
"step": 2590
},
{
"epoch": 9.386281588447654,
"grad_norm": 0.7393360137939453,
"learning_rate": 9.963355043155996e-05,
"loss": 0.0111,
"step": 2600
},
{
"epoch": 9.422382671480145,
"grad_norm": 0.7434808611869812,
"learning_rate": 9.962685965410455e-05,
"loss": 0.011,
"step": 2610
},
{
"epoch": 9.458483754512635,
"grad_norm": 0.7294809818267822,
"learning_rate": 9.962010857533514e-05,
"loss": 0.0107,
"step": 2620
},
{
"epoch": 9.494584837545126,
"grad_norm": 0.849262535572052,
"learning_rate": 9.961329720345493e-05,
"loss": 0.0117,
"step": 2630
},
{
"epoch": 9.530685920577618,
"grad_norm": 0.7124184370040894,
"learning_rate": 9.96064255467404e-05,
"loss": 0.01,
"step": 2640
},
{
"epoch": 9.566787003610107,
"grad_norm": 0.6838423013687134,
"learning_rate": 9.959949361354126e-05,
"loss": 0.01,
"step": 2650
},
{
"epoch": 9.602888086642599,
"grad_norm": 0.5993446111679077,
"learning_rate": 9.959250141228045e-05,
"loss": 0.0099,
"step": 2660
},
{
"epoch": 9.63898916967509,
"grad_norm": 0.6491556763648987,
"learning_rate": 9.958544895145414e-05,
"loss": 0.0102,
"step": 2670
},
{
"epoch": 9.675090252707582,
"grad_norm": 0.7681136131286621,
"learning_rate": 9.957833623963177e-05,
"loss": 0.0099,
"step": 2680
},
{
"epoch": 9.711191335740072,
"grad_norm": 0.7443376183509827,
"learning_rate": 9.957116328545593e-05,
"loss": 0.0104,
"step": 2690
},
{
"epoch": 9.747292418772563,
"grad_norm": 0.862918496131897,
"learning_rate": 9.956393009764244e-05,
"loss": 0.0104,
"step": 2700
},
{
"epoch": 9.783393501805055,
"grad_norm": 0.6566956043243408,
"learning_rate": 9.955663668498032e-05,
"loss": 0.0106,
"step": 2710
},
{
"epoch": 9.819494584837544,
"grad_norm": 0.5753354430198669,
"learning_rate": 9.954928305633173e-05,
"loss": 0.0092,
"step": 2720
},
{
"epoch": 9.855595667870036,
"grad_norm": 0.6308586597442627,
"learning_rate": 9.954186922063204e-05,
"loss": 0.0088,
"step": 2730
},
{
"epoch": 9.891696750902527,
"grad_norm": 0.6705576181411743,
"learning_rate": 9.953439518688974e-05,
"loss": 0.0097,
"step": 2740
},
{
"epoch": 9.927797833935019,
"grad_norm": 0.7357208132743835,
"learning_rate": 9.952686096418652e-05,
"loss": 0.0112,
"step": 2750
},
{
"epoch": 9.963898916967509,
"grad_norm": 0.6457874774932861,
"learning_rate": 9.951926656167714e-05,
"loss": 0.0096,
"step": 2760
},
{
"epoch": 10.0,
"grad_norm": 0.759169340133667,
"learning_rate": 9.951161198858953e-05,
"loss": 0.01,
"step": 2770
},
{
"epoch": 10.036101083032491,
"grad_norm": 1.0353105068206787,
"learning_rate": 9.950389725422471e-05,
"loss": 0.0112,
"step": 2780
},
{
"epoch": 10.072202166064981,
"grad_norm": 0.7813242673873901,
"learning_rate": 9.949612236795682e-05,
"loss": 0.0129,
"step": 2790
},
{
"epoch": 10.108303249097473,
"grad_norm": 0.8926829695701599,
"learning_rate": 9.948828733923305e-05,
"loss": 0.012,
"step": 2800
},
{
"epoch": 10.144404332129964,
"grad_norm": 0.8344612717628479,
"learning_rate": 9.948039217757374e-05,
"loss": 0.0114,
"step": 2810
},
{
"epoch": 10.180505415162456,
"grad_norm": 0.9931730031967163,
"learning_rate": 9.947243689257225e-05,
"loss": 0.0103,
"step": 2820
},
{
"epoch": 10.216606498194945,
"grad_norm": 0.8401676416397095,
"learning_rate": 9.946442149389497e-05,
"loss": 0.0104,
"step": 2830
},
{
"epoch": 10.252707581227437,
"grad_norm": 0.8963848352432251,
"learning_rate": 9.945634599128139e-05,
"loss": 0.0109,
"step": 2840
},
{
"epoch": 10.288808664259928,
"grad_norm": 0.7665980458259583,
"learning_rate": 9.944821039454402e-05,
"loss": 0.0098,
"step": 2850
},
{
"epoch": 10.324909747292418,
"grad_norm": 0.7776787281036377,
"learning_rate": 9.944001471356835e-05,
"loss": 0.0103,
"step": 2860
},
{
"epoch": 10.36101083032491,
"grad_norm": 0.7740691900253296,
"learning_rate": 9.94317589583129e-05,
"loss": 0.0115,
"step": 2870
},
{
"epoch": 10.397111913357401,
"grad_norm": 0.7140895128250122,
"learning_rate": 9.942344313880922e-05,
"loss": 0.0097,
"step": 2880
},
{
"epoch": 10.433212996389893,
"grad_norm": 0.6975634098052979,
"learning_rate": 9.941506726516179e-05,
"loss": 0.0102,
"step": 2890
},
{
"epoch": 10.469314079422382,
"grad_norm": 0.5943614840507507,
"learning_rate": 9.94066313475481e-05,
"loss": 0.0093,
"step": 2900
},
{
"epoch": 10.505415162454874,
"grad_norm": 0.7553809285163879,
"learning_rate": 9.939813539621857e-05,
"loss": 0.0098,
"step": 2910
},
{
"epoch": 10.541516245487365,
"grad_norm": 0.6999721527099609,
"learning_rate": 9.93895794214966e-05,
"loss": 0.009,
"step": 2920
},
{
"epoch": 10.577617328519855,
"grad_norm": 0.681658148765564,
"learning_rate": 9.938096343377852e-05,
"loss": 0.0102,
"step": 2930
},
{
"epoch": 10.613718411552346,
"grad_norm": 0.7771198153495789,
"learning_rate": 9.937228744353353e-05,
"loss": 0.0096,
"step": 2940
},
{
"epoch": 10.649819494584838,
"grad_norm": 0.6081448793411255,
"learning_rate": 9.936355146130379e-05,
"loss": 0.0085,
"step": 2950
},
{
"epoch": 10.685920577617328,
"grad_norm": 0.8011517524719238,
"learning_rate": 9.935475549770435e-05,
"loss": 0.0096,
"step": 2960
},
{
"epoch": 10.722021660649819,
"grad_norm": 0.7534987926483154,
"learning_rate": 9.934589956342315e-05,
"loss": 0.0112,
"step": 2970
},
{
"epoch": 10.75812274368231,
"grad_norm": 0.8054004907608032,
"learning_rate": 9.933698366922093e-05,
"loss": 0.0103,
"step": 2980
},
{
"epoch": 10.794223826714802,
"grad_norm": 0.8305302858352661,
"learning_rate": 9.93280078259314e-05,
"loss": 0.0094,
"step": 2990
},
{
"epoch": 10.830324909747292,
"grad_norm": 0.8965534567832947,
"learning_rate": 9.931897204446104e-05,
"loss": 0.0094,
"step": 3000
},
{
"epoch": 10.866425992779783,
"grad_norm": 0.7903763651847839,
"learning_rate": 9.930987633578915e-05,
"loss": 0.0106,
"step": 3010
},
{
"epoch": 10.902527075812275,
"grad_norm": 0.7398828268051147,
"learning_rate": 9.93007207109679e-05,
"loss": 0.0105,
"step": 3020
},
{
"epoch": 10.938628158844764,
"grad_norm": 0.7578723430633545,
"learning_rate": 9.929150518112224e-05,
"loss": 0.0096,
"step": 3030
},
{
"epoch": 10.974729241877256,
"grad_norm": 0.8357623815536499,
"learning_rate": 9.928222975744991e-05,
"loss": 0.0106,
"step": 3040
},
{
"epoch": 11.010830324909747,
"grad_norm": 0.689484179019928,
"learning_rate": 9.92728944512214e-05,
"loss": 0.0098,
"step": 3050
},
{
"epoch": 11.046931407942239,
"grad_norm": 0.8133997321128845,
"learning_rate": 9.926349927378001e-05,
"loss": 0.0107,
"step": 3060
},
{
"epoch": 11.083032490974729,
"grad_norm": 0.6470313668251038,
"learning_rate": 9.925404423654174e-05,
"loss": 0.0106,
"step": 3070
},
{
"epoch": 11.11913357400722,
"grad_norm": 0.6673492789268494,
"learning_rate": 9.924452935099537e-05,
"loss": 0.0084,
"step": 3080
},
{
"epoch": 11.155234657039712,
"grad_norm": 0.6224140524864197,
"learning_rate": 9.92349546287024e-05,
"loss": 0.0081,
"step": 3090
},
{
"epoch": 11.191335740072201,
"grad_norm": 0.6376920938491821,
"learning_rate": 9.9225320081297e-05,
"loss": 0.0094,
"step": 3100
},
{
"epoch": 11.227436823104693,
"grad_norm": 0.7415140271186829,
"learning_rate": 9.921562572048606e-05,
"loss": 0.0096,
"step": 3110
},
{
"epoch": 11.263537906137184,
"grad_norm": 0.5629510283470154,
"learning_rate": 9.920587155804913e-05,
"loss": 0.0089,
"step": 3120
},
{
"epoch": 11.299638989169676,
"grad_norm": 0.6213931441307068,
"learning_rate": 9.919605760583845e-05,
"loss": 0.0084,
"step": 3130
},
{
"epoch": 11.335740072202166,
"grad_norm": 0.6785014867782593,
"learning_rate": 9.91861838757789e-05,
"loss": 0.0085,
"step": 3140
},
{
"epoch": 11.371841155234657,
"grad_norm": 0.6595868468284607,
"learning_rate": 9.917625037986798e-05,
"loss": 0.0093,
"step": 3150
},
{
"epoch": 11.407942238267148,
"grad_norm": 0.719587504863739,
"learning_rate": 9.916625713017583e-05,
"loss": 0.0087,
"step": 3160
},
{
"epoch": 11.444043321299638,
"grad_norm": 0.8673713207244873,
"learning_rate": 9.915620413884519e-05,
"loss": 0.0091,
"step": 3170
},
{
"epoch": 11.48014440433213,
"grad_norm": 0.8191733956336975,
"learning_rate": 9.914609141809139e-05,
"loss": 0.0105,
"step": 3180
},
{
"epoch": 11.516245487364621,
"grad_norm": 0.723475456237793,
"learning_rate": 9.913591898020235e-05,
"loss": 0.0092,
"step": 3190
},
{
"epoch": 11.552346570397113,
"grad_norm": 0.6822453737258911,
"learning_rate": 9.912568683753853e-05,
"loss": 0.0106,
"step": 3200
},
{
"epoch": 11.588447653429602,
"grad_norm": 0.606898844242096,
"learning_rate": 9.911539500253295e-05,
"loss": 0.0102,
"step": 3210
},
{
"epoch": 11.624548736462094,
"grad_norm": 0.5995931029319763,
"learning_rate": 9.910504348769118e-05,
"loss": 0.0091,
"step": 3220
},
{
"epoch": 11.660649819494585,
"grad_norm": 0.6429259777069092,
"learning_rate": 9.909463230559127e-05,
"loss": 0.0088,
"step": 3230
},
{
"epoch": 11.696750902527075,
"grad_norm": 0.5485654473304749,
"learning_rate": 9.908416146888376e-05,
"loss": 0.0081,
"step": 3240
},
{
"epoch": 11.732851985559567,
"grad_norm": 0.6281844973564148,
"learning_rate": 9.907363099029175e-05,
"loss": 0.01,
"step": 3250
},
{
"epoch": 11.768953068592058,
"grad_norm": 0.5267898440361023,
"learning_rate": 9.906304088261073e-05,
"loss": 0.0095,
"step": 3260
},
{
"epoch": 11.80505415162455,
"grad_norm": 0.7603709697723389,
"learning_rate": 9.905239115870872e-05,
"loss": 0.0079,
"step": 3270
},
{
"epoch": 11.84115523465704,
"grad_norm": 0.7378809452056885,
"learning_rate": 9.90416818315261e-05,
"loss": 0.0094,
"step": 3280
},
{
"epoch": 11.87725631768953,
"grad_norm": 0.7573925852775574,
"learning_rate": 9.903091291407573e-05,
"loss": 0.0086,
"step": 3290
},
{
"epoch": 11.913357400722022,
"grad_norm": 0.6064549088478088,
"learning_rate": 9.902008441944286e-05,
"loss": 0.0096,
"step": 3300
},
{
"epoch": 11.949458483754512,
"grad_norm": 0.6390209794044495,
"learning_rate": 9.900919636078512e-05,
"loss": 0.0104,
"step": 3310
},
{
"epoch": 11.985559566787003,
"grad_norm": 0.8857858180999756,
"learning_rate": 9.899824875133255e-05,
"loss": 0.0108,
"step": 3320
},
{
"epoch": 12.021660649819495,
"grad_norm": 0.6796932220458984,
"learning_rate": 9.898724160438749e-05,
"loss": 0.0099,
"step": 3330
},
{
"epoch": 12.057761732851986,
"grad_norm": 0.7772940397262573,
"learning_rate": 9.89761749333247e-05,
"loss": 0.0092,
"step": 3340
},
{
"epoch": 12.093862815884476,
"grad_norm": 0.6416860818862915,
"learning_rate": 9.896504875159122e-05,
"loss": 0.0088,
"step": 3350
},
{
"epoch": 12.129963898916968,
"grad_norm": 0.6209238767623901,
"learning_rate": 9.89538630727064e-05,
"loss": 0.01,
"step": 3360
},
{
"epoch": 12.166064981949459,
"grad_norm": 0.7372642159461975,
"learning_rate": 9.894261791026189e-05,
"loss": 0.0089,
"step": 3370
},
{
"epoch": 12.202166064981949,
"grad_norm": 0.5251403450965881,
"learning_rate": 9.893131327792165e-05,
"loss": 0.0105,
"step": 3380
},
{
"epoch": 12.23826714801444,
"grad_norm": 0.5557438135147095,
"learning_rate": 9.891994918942182e-05,
"loss": 0.0101,
"step": 3390
},
{
"epoch": 12.274368231046932,
"grad_norm": 0.6687502861022949,
"learning_rate": 9.890852565857092e-05,
"loss": 0.0091,
"step": 3400
},
{
"epoch": 12.310469314079423,
"grad_norm": 0.6372770667076111,
"learning_rate": 9.889704269924954e-05,
"loss": 0.0099,
"step": 3410
},
{
"epoch": 12.346570397111913,
"grad_norm": 0.6314915418624878,
"learning_rate": 9.888550032541059e-05,
"loss": 0.0092,
"step": 3420
},
{
"epoch": 12.382671480144404,
"grad_norm": 0.7343876957893372,
"learning_rate": 9.887389855107916e-05,
"loss": 0.0088,
"step": 3430
},
{
"epoch": 12.418772563176896,
"grad_norm": 0.8022611141204834,
"learning_rate": 9.886223739035248e-05,
"loss": 0.0098,
"step": 3440
},
{
"epoch": 12.454873646209386,
"grad_norm": 0.6204238533973694,
"learning_rate": 9.885051685739997e-05,
"loss": 0.0089,
"step": 3450
},
{
"epoch": 12.490974729241877,
"grad_norm": 0.5853930115699768,
"learning_rate": 9.883873696646316e-05,
"loss": 0.0099,
"step": 3460
},
{
"epoch": 12.527075812274369,
"grad_norm": 0.6876018047332764,
"learning_rate": 9.882689773185575e-05,
"loss": 0.0087,
"step": 3470
},
{
"epoch": 12.56317689530686,
"grad_norm": 0.6854273080825806,
"learning_rate": 9.881499916796353e-05,
"loss": 0.0093,
"step": 3480
},
{
"epoch": 12.59927797833935,
"grad_norm": 0.6009523272514343,
"learning_rate": 9.880304128924434e-05,
"loss": 0.0091,
"step": 3490
},
{
"epoch": 12.635379061371841,
"grad_norm": 0.5833231806755066,
"learning_rate": 9.879102411022817e-05,
"loss": 0.0076,
"step": 3500
},
{
"epoch": 12.671480144404333,
"grad_norm": 0.7677366733551025,
"learning_rate": 9.877894764551703e-05,
"loss": 0.0086,
"step": 3510
},
{
"epoch": 12.707581227436823,
"grad_norm": 0.6318122148513794,
"learning_rate": 9.876681190978494e-05,
"loss": 0.0081,
"step": 3520
},
{
"epoch": 12.743682310469314,
"grad_norm": 0.6317865252494812,
"learning_rate": 9.875461691777797e-05,
"loss": 0.0074,
"step": 3530
},
{
"epoch": 12.779783393501805,
"grad_norm": 0.6839991807937622,
"learning_rate": 9.874236268431417e-05,
"loss": 0.01,
"step": 3540
},
{
"epoch": 12.815884476534297,
"grad_norm": 0.7005046606063843,
"learning_rate": 9.873004922428361e-05,
"loss": 0.0084,
"step": 3550
},
{
"epoch": 12.851985559566787,
"grad_norm": 0.6627283096313477,
"learning_rate": 9.871767655264829e-05,
"loss": 0.0086,
"step": 3560
},
{
"epoch": 12.888086642599278,
"grad_norm": 0.7797370553016663,
"learning_rate": 9.87052446844422e-05,
"loss": 0.0085,
"step": 3570
},
{
"epoch": 12.92418772563177,
"grad_norm": 0.5306588411331177,
"learning_rate": 9.869275363477122e-05,
"loss": 0.0084,
"step": 3580
},
{
"epoch": 12.96028880866426,
"grad_norm": 0.7744714021682739,
"learning_rate": 9.868020341881312e-05,
"loss": 0.0097,
"step": 3590
},
{
"epoch": 12.99638989169675,
"grad_norm": 0.6790797710418701,
"learning_rate": 9.866759405181765e-05,
"loss": 0.0089,
"step": 3600
},
{
"epoch": 13.032490974729242,
"grad_norm": 0.7086886763572693,
"learning_rate": 9.865492554910633e-05,
"loss": 0.008,
"step": 3610
},
{
"epoch": 13.068592057761732,
"grad_norm": 0.6534098386764526,
"learning_rate": 9.864219792607262e-05,
"loss": 0.0104,
"step": 3620
},
{
"epoch": 13.104693140794224,
"grad_norm": 0.6140812635421753,
"learning_rate": 9.862941119818177e-05,
"loss": 0.0073,
"step": 3630
},
{
"epoch": 13.140794223826715,
"grad_norm": 0.6613591909408569,
"learning_rate": 9.861656538097086e-05,
"loss": 0.009,
"step": 3640
},
{
"epoch": 13.176895306859207,
"grad_norm": 0.7520333528518677,
"learning_rate": 9.860366049004877e-05,
"loss": 0.0082,
"step": 3650
},
{
"epoch": 13.212996389891696,
"grad_norm": 0.7076119184494019,
"learning_rate": 9.859069654109615e-05,
"loss": 0.0093,
"step": 3660
},
{
"epoch": 13.249097472924188,
"grad_norm": 0.691775918006897,
"learning_rate": 9.857767354986545e-05,
"loss": 0.0075,
"step": 3670
},
{
"epoch": 13.28519855595668,
"grad_norm": 0.9084518551826477,
"learning_rate": 9.856459153218078e-05,
"loss": 0.0108,
"step": 3680
},
{
"epoch": 13.321299638989169,
"grad_norm": 0.7507837414741516,
"learning_rate": 9.855145050393808e-05,
"loss": 0.0107,
"step": 3690
},
{
"epoch": 13.35740072202166,
"grad_norm": 0.7127672433853149,
"learning_rate": 9.85382504811049e-05,
"loss": 0.0088,
"step": 3700
},
{
"epoch": 13.393501805054152,
"grad_norm": 0.7873700857162476,
"learning_rate": 9.852499147972054e-05,
"loss": 0.0081,
"step": 3710
},
{
"epoch": 13.429602888086643,
"grad_norm": 0.8309053778648376,
"learning_rate": 9.851167351589592e-05,
"loss": 0.0081,
"step": 3720
},
{
"epoch": 13.465703971119133,
"grad_norm": 0.8734296560287476,
"learning_rate": 9.849829660581363e-05,
"loss": 0.009,
"step": 3730
},
{
"epoch": 13.501805054151625,
"grad_norm": 0.7617871165275574,
"learning_rate": 9.848486076572787e-05,
"loss": 0.0085,
"step": 3740
},
{
"epoch": 13.537906137184116,
"grad_norm": 0.4951501786708832,
"learning_rate": 9.847136601196446e-05,
"loss": 0.0076,
"step": 3750
},
{
"epoch": 13.574007220216606,
"grad_norm": 0.7322642803192139,
"learning_rate": 9.845781236092078e-05,
"loss": 0.0082,
"step": 3760
},
{
"epoch": 13.610108303249097,
"grad_norm": 0.616602897644043,
"learning_rate": 9.844419982906583e-05,
"loss": 0.008,
"step": 3770
},
{
"epoch": 13.646209386281589,
"grad_norm": 0.6850390434265137,
"learning_rate": 9.843052843294008e-05,
"loss": 0.0094,
"step": 3780
},
{
"epoch": 13.68231046931408,
"grad_norm": 0.6563327312469482,
"learning_rate": 9.841679818915559e-05,
"loss": 0.0077,
"step": 3790
},
{
"epoch": 13.71841155234657,
"grad_norm": 0.6361055970191956,
"learning_rate": 9.840300911439591e-05,
"loss": 0.0084,
"step": 3800
},
{
"epoch": 13.754512635379061,
"grad_norm": 0.6403736472129822,
"learning_rate": 9.838916122541603e-05,
"loss": 0.0077,
"step": 3810
},
{
"epoch": 13.790613718411553,
"grad_norm": 0.5514874458312988,
"learning_rate": 9.837525453904246e-05,
"loss": 0.0089,
"step": 3820
},
{
"epoch": 13.826714801444043,
"grad_norm": 0.531114935874939,
"learning_rate": 9.836128907217314e-05,
"loss": 0.0098,
"step": 3830
},
{
"epoch": 13.862815884476534,
"grad_norm": 0.72089684009552,
"learning_rate": 9.834726484177743e-05,
"loss": 0.0084,
"step": 3840
},
{
"epoch": 13.898916967509026,
"grad_norm": 0.7317929267883301,
"learning_rate": 9.833318186489609e-05,
"loss": 0.0095,
"step": 3850
},
{
"epoch": 13.935018050541515,
"grad_norm": 0.6702656745910645,
"learning_rate": 9.831904015864126e-05,
"loss": 0.0085,
"step": 3860
},
{
"epoch": 13.971119133574007,
"grad_norm": 0.5811823010444641,
"learning_rate": 9.830483974019645e-05,
"loss": 0.0081,
"step": 3870
},
{
"epoch": 14.007220216606498,
"grad_norm": 0.5628147125244141,
"learning_rate": 9.82905806268165e-05,
"loss": 0.0082,
"step": 3880
},
{
"epoch": 14.04332129963899,
"grad_norm": 0.6289241909980774,
"learning_rate": 9.82762628358276e-05,
"loss": 0.0098,
"step": 3890
},
{
"epoch": 14.07942238267148,
"grad_norm": 0.567337691783905,
"learning_rate": 9.826188638462718e-05,
"loss": 0.0074,
"step": 3900
},
{
"epoch": 14.115523465703971,
"grad_norm": 0.632425844669342,
"learning_rate": 9.824745129068402e-05,
"loss": 0.007,
"step": 3910
},
{
"epoch": 14.151624548736462,
"grad_norm": 0.6318320035934448,
"learning_rate": 9.82329575715381e-05,
"loss": 0.0086,
"step": 3920
},
{
"epoch": 14.187725631768952,
"grad_norm": 0.6592440605163574,
"learning_rate": 9.821840524480066e-05,
"loss": 0.0082,
"step": 3930
},
{
"epoch": 14.223826714801444,
"grad_norm": 0.7359952926635742,
"learning_rate": 9.820379432815414e-05,
"loss": 0.0079,
"step": 3940
},
{
"epoch": 14.259927797833935,
"grad_norm": 0.49768030643463135,
"learning_rate": 9.81891248393522e-05,
"loss": 0.0088,
"step": 3950
},
{
"epoch": 14.296028880866427,
"grad_norm": 0.7085164785385132,
"learning_rate": 9.817439679621963e-05,
"loss": 0.0082,
"step": 3960
},
{
"epoch": 14.332129963898916,
"grad_norm": 0.5585772395133972,
"learning_rate": 9.815961021665243e-05,
"loss": 0.0087,
"step": 3970
},
{
"epoch": 14.368231046931408,
"grad_norm": 0.6714861392974854,
"learning_rate": 9.814476511861763e-05,
"loss": 0.0088,
"step": 3980
},
{
"epoch": 14.4043321299639,
"grad_norm": 0.6995107531547546,
"learning_rate": 9.812986152015348e-05,
"loss": 0.01,
"step": 3990
},
{
"epoch": 14.440433212996389,
"grad_norm": 0.7201774716377258,
"learning_rate": 9.811489943936922e-05,
"loss": 0.0078,
"step": 4000
},
{
"epoch": 14.47653429602888,
"grad_norm": 0.7069374322891235,
"learning_rate": 9.809987889444522e-05,
"loss": 0.007,
"step": 4010
},
{
"epoch": 14.512635379061372,
"grad_norm": 0.45528268814086914,
"learning_rate": 9.808479990363282e-05,
"loss": 0.0081,
"step": 4020
},
{
"epoch": 14.548736462093864,
"grad_norm": 0.6015822887420654,
"learning_rate": 9.806966248525445e-05,
"loss": 0.0081,
"step": 4030
},
{
"epoch": 14.584837545126353,
"grad_norm": 0.7270722389221191,
"learning_rate": 9.805446665770348e-05,
"loss": 0.0084,
"step": 4040
},
{
"epoch": 14.620938628158845,
"grad_norm": 0.726533055305481,
"learning_rate": 9.803921243944429e-05,
"loss": 0.009,
"step": 4050
},
{
"epoch": 14.657039711191336,
"grad_norm": 0.5868228673934937,
"learning_rate": 9.802389984901218e-05,
"loss": 0.0076,
"step": 4060
},
{
"epoch": 14.693140794223826,
"grad_norm": 0.5780026316642761,
"learning_rate": 9.80085289050134e-05,
"loss": 0.0074,
"step": 4070
},
{
"epoch": 14.729241877256317,
"grad_norm": 0.5615423917770386,
"learning_rate": 9.799309962612508e-05,
"loss": 0.0084,
"step": 4080
},
{
"epoch": 14.765342960288809,
"grad_norm": 0.5624262690544128,
"learning_rate": 9.797761203109527e-05,
"loss": 0.0082,
"step": 4090
},
{
"epoch": 14.8014440433213,
"grad_norm": 0.5993496179580688,
"learning_rate": 9.796206613874283e-05,
"loss": 0.0082,
"step": 4100
},
{
"epoch": 14.83754512635379,
"grad_norm": 0.49344608187675476,
"learning_rate": 9.794646196795754e-05,
"loss": 0.007,
"step": 4110
},
{
"epoch": 14.873646209386282,
"grad_norm": 0.5589749813079834,
"learning_rate": 9.793079953769987e-05,
"loss": 0.0074,
"step": 4120
},
{
"epoch": 14.909747292418773,
"grad_norm": 0.5978914499282837,
"learning_rate": 9.79150788670012e-05,
"loss": 0.0077,
"step": 4130
},
{
"epoch": 14.945848375451263,
"grad_norm": 0.7583843469619751,
"learning_rate": 9.78992999749636e-05,
"loss": 0.0076,
"step": 4140
},
{
"epoch": 14.981949458483754,
"grad_norm": 0.704357922077179,
"learning_rate": 9.788346288075994e-05,
"loss": 0.0079,
"step": 4150
},
{
"epoch": 15.018050541516246,
"grad_norm": 0.664689838886261,
"learning_rate": 9.786756760363373e-05,
"loss": 0.0079,
"step": 4160
},
{
"epoch": 15.054151624548737,
"grad_norm": 0.5933771133422852,
"learning_rate": 9.78516141628993e-05,
"loss": 0.0078,
"step": 4170
},
{
"epoch": 15.090252707581227,
"grad_norm": 0.6095234155654907,
"learning_rate": 9.783560257794154e-05,
"loss": 0.0089,
"step": 4180
},
{
"epoch": 15.126353790613718,
"grad_norm": 0.6241867542266846,
"learning_rate": 9.781953286821603e-05,
"loss": 0.0068,
"step": 4190
},
{
"epoch": 15.16245487364621,
"grad_norm": 0.6755119562149048,
"learning_rate": 9.780340505324901e-05,
"loss": 0.0082,
"step": 4200
},
{
"epoch": 15.1985559566787,
"grad_norm": 0.43262916803359985,
"learning_rate": 9.778721915263727e-05,
"loss": 0.0078,
"step": 4210
},
{
"epoch": 15.234657039711191,
"grad_norm": 0.49696260690689087,
"learning_rate": 9.777097518604824e-05,
"loss": 0.0072,
"step": 4220
},
{
"epoch": 15.270758122743683,
"grad_norm": 0.5388179421424866,
"learning_rate": 9.775467317321984e-05,
"loss": 0.0091,
"step": 4230
},
{
"epoch": 15.306859205776174,
"grad_norm": 0.5314399003982544,
"learning_rate": 9.773831313396055e-05,
"loss": 0.0073,
"step": 4240
},
{
"epoch": 15.342960288808664,
"grad_norm": 0.49604231119155884,
"learning_rate": 9.77218950881494e-05,
"loss": 0.0089,
"step": 4250
},
{
"epoch": 15.379061371841155,
"grad_norm": 0.5194151401519775,
"learning_rate": 9.770541905573583e-05,
"loss": 0.007,
"step": 4260
},
{
"epoch": 15.415162454873647,
"grad_norm": 0.6234323382377625,
"learning_rate": 9.768888505673976e-05,
"loss": 0.0077,
"step": 4270
},
{
"epoch": 15.451263537906136,
"grad_norm": 0.7056440711021423,
"learning_rate": 9.767229311125162e-05,
"loss": 0.0077,
"step": 4280
},
{
"epoch": 15.487364620938628,
"grad_norm": 0.6590222716331482,
"learning_rate": 9.765564323943211e-05,
"loss": 0.0093,
"step": 4290
},
{
"epoch": 15.52346570397112,
"grad_norm": 0.5882553458213806,
"learning_rate": 9.763893546151244e-05,
"loss": 0.0075,
"step": 4300
},
{
"epoch": 15.559566787003611,
"grad_norm": 0.6169595122337341,
"learning_rate": 9.762216979779412e-05,
"loss": 0.0079,
"step": 4310
},
{
"epoch": 15.5956678700361,
"grad_norm": 0.4988597631454468,
"learning_rate": 9.760534626864902e-05,
"loss": 0.0087,
"step": 4320
},
{
"epoch": 15.631768953068592,
"grad_norm": 0.46784234046936035,
"learning_rate": 9.758846489451931e-05,
"loss": 0.0082,
"step": 4330
},
{
"epoch": 15.667870036101084,
"grad_norm": 0.46982425451278687,
"learning_rate": 9.757152569591748e-05,
"loss": 0.0085,
"step": 4340
},
{
"epoch": 15.703971119133573,
"grad_norm": 0.4954351782798767,
"learning_rate": 9.75545286934262e-05,
"loss": 0.0063,
"step": 4350
},
{
"epoch": 15.740072202166065,
"grad_norm": 0.581403374671936,
"learning_rate": 9.753747390769847e-05,
"loss": 0.0082,
"step": 4360
},
{
"epoch": 15.776173285198556,
"grad_norm": 0.5989372730255127,
"learning_rate": 9.752036135945744e-05,
"loss": 0.0076,
"step": 4370
},
{
"epoch": 15.812274368231048,
"grad_norm": 0.6102561354637146,
"learning_rate": 9.750319106949649e-05,
"loss": 0.0064,
"step": 4380
},
{
"epoch": 15.848375451263538,
"grad_norm": 0.592056930065155,
"learning_rate": 9.748596305867913e-05,
"loss": 0.009,
"step": 4390
},
{
"epoch": 15.884476534296029,
"grad_norm": 0.5222209692001343,
"learning_rate": 9.746867734793903e-05,
"loss": 0.0068,
"step": 4400
},
{
"epoch": 15.92057761732852,
"grad_norm": 0.4383890628814697,
"learning_rate": 9.745133395827993e-05,
"loss": 0.0074,
"step": 4410
},
{
"epoch": 15.95667870036101,
"grad_norm": 0.6366679072380066,
"learning_rate": 9.743393291077572e-05,
"loss": 0.0081,
"step": 4420
},
{
"epoch": 15.992779783393502,
"grad_norm": 0.5929124355316162,
"learning_rate": 9.741647422657028e-05,
"loss": 0.0082,
"step": 4430
},
{
"epoch": 16.028880866425993,
"grad_norm": 0.5620942711830139,
"learning_rate": 9.739895792687758e-05,
"loss": 0.0078,
"step": 4440
},
{
"epoch": 16.064981949458485,
"grad_norm": 0.5677127242088318,
"learning_rate": 9.738138403298157e-05,
"loss": 0.0087,
"step": 4450
},
{
"epoch": 16.101083032490976,
"grad_norm": 0.5328949093818665,
"learning_rate": 9.736375256623619e-05,
"loss": 0.0075,
"step": 4460
},
{
"epoch": 16.137184115523464,
"grad_norm": 0.5947726368904114,
"learning_rate": 9.734606354806533e-05,
"loss": 0.0088,
"step": 4470
},
{
"epoch": 16.173285198555956,
"grad_norm": 0.6008957624435425,
"learning_rate": 9.73283169999628e-05,
"loss": 0.0079,
"step": 4480
},
{
"epoch": 16.209386281588447,
"grad_norm": 0.6824911832809448,
"learning_rate": 9.731051294349238e-05,
"loss": 0.0083,
"step": 4490
},
{
"epoch": 16.24548736462094,
"grad_norm": 0.6403281688690186,
"learning_rate": 9.729265140028762e-05,
"loss": 0.009,
"step": 4500
},
{
"epoch": 16.28158844765343,
"grad_norm": 0.5412993431091309,
"learning_rate": 9.727473239205201e-05,
"loss": 0.0095,
"step": 4510
},
{
"epoch": 16.31768953068592,
"grad_norm": 0.6415403485298157,
"learning_rate": 9.725675594055883e-05,
"loss": 0.0082,
"step": 4520
},
{
"epoch": 16.353790613718413,
"grad_norm": 0.6148087978363037,
"learning_rate": 9.723872206765116e-05,
"loss": 0.0075,
"step": 4530
},
{
"epoch": 16.3898916967509,
"grad_norm": 0.6932973265647888,
"learning_rate": 9.722063079524185e-05,
"loss": 0.0078,
"step": 4540
},
{
"epoch": 16.425992779783392,
"grad_norm": 0.530030369758606,
"learning_rate": 9.720248214531351e-05,
"loss": 0.0074,
"step": 4550
},
{
"epoch": 16.462093862815884,
"grad_norm": 0.6194853186607361,
"learning_rate": 9.718427613991848e-05,
"loss": 0.0073,
"step": 4560
},
{
"epoch": 16.498194945848375,
"grad_norm": 0.5014938116073608,
"learning_rate": 9.716601280117873e-05,
"loss": 0.0068,
"step": 4570
},
{
"epoch": 16.534296028880867,
"grad_norm": 0.5091082453727722,
"learning_rate": 9.714769215128596e-05,
"loss": 0.0074,
"step": 4580
},
{
"epoch": 16.57039711191336,
"grad_norm": 0.5059235692024231,
"learning_rate": 9.712931421250152e-05,
"loss": 0.0073,
"step": 4590
},
{
"epoch": 16.60649819494585,
"grad_norm": 0.6493478417396545,
"learning_rate": 9.711087900715627e-05,
"loss": 0.0083,
"step": 4600
},
{
"epoch": 16.642599277978338,
"grad_norm": 0.6570061445236206,
"learning_rate": 9.709238655765078e-05,
"loss": 0.0084,
"step": 4610
},
{
"epoch": 16.67870036101083,
"grad_norm": 0.4971005320549011,
"learning_rate": 9.707383688645511e-05,
"loss": 0.0072,
"step": 4620
},
{
"epoch": 16.71480144404332,
"grad_norm": 0.6760922074317932,
"learning_rate": 9.705523001610883e-05,
"loss": 0.0085,
"step": 4630
},
{
"epoch": 16.750902527075812,
"grad_norm": 0.5637115836143494,
"learning_rate": 9.703656596922107e-05,
"loss": 0.0096,
"step": 4640
},
{
"epoch": 16.787003610108304,
"grad_norm": 0.5399641990661621,
"learning_rate": 9.70178447684704e-05,
"loss": 0.0078,
"step": 4650
},
{
"epoch": 16.823104693140795,
"grad_norm": 0.4492916166782379,
"learning_rate": 9.699906643660483e-05,
"loss": 0.0067,
"step": 4660
},
{
"epoch": 16.859205776173287,
"grad_norm": 0.5601198077201843,
"learning_rate": 9.698023099644185e-05,
"loss": 0.009,
"step": 4670
},
{
"epoch": 16.895306859205775,
"grad_norm": 0.5658851861953735,
"learning_rate": 9.696133847086823e-05,
"loss": 0.008,
"step": 4680
},
{
"epoch": 16.931407942238266,
"grad_norm": 0.4090757966041565,
"learning_rate": 9.694238888284022e-05,
"loss": 0.0065,
"step": 4690
},
{
"epoch": 16.967509025270758,
"grad_norm": 0.43657976388931274,
"learning_rate": 9.692338225538333e-05,
"loss": 0.0075,
"step": 4700
},
{
"epoch": 17.00361010830325,
"grad_norm": 0.6505222916603088,
"learning_rate": 9.690431861159241e-05,
"loss": 0.007,
"step": 4710
},
{
"epoch": 17.03971119133574,
"grad_norm": 0.6438521146774292,
"learning_rate": 9.688519797463161e-05,
"loss": 0.0069,
"step": 4720
},
{
"epoch": 17.075812274368232,
"grad_norm": 0.5986621975898743,
"learning_rate": 9.686602036773426e-05,
"loss": 0.0077,
"step": 4730
},
{
"epoch": 17.111913357400724,
"grad_norm": 0.6103493571281433,
"learning_rate": 9.684678581420302e-05,
"loss": 0.0069,
"step": 4740
},
{
"epoch": 17.14801444043321,
"grad_norm": 0.4867769777774811,
"learning_rate": 9.682749433740962e-05,
"loss": 0.0059,
"step": 4750
},
{
"epoch": 17.184115523465703,
"grad_norm": 0.5553053617477417,
"learning_rate": 9.680814596079507e-05,
"loss": 0.0074,
"step": 4760
},
{
"epoch": 17.220216606498195,
"grad_norm": 0.4939308762550354,
"learning_rate": 9.678874070786945e-05,
"loss": 0.0067,
"step": 4770
},
{
"epoch": 17.256317689530686,
"grad_norm": 0.5850286483764648,
"learning_rate": 9.676927860221199e-05,
"loss": 0.0063,
"step": 4780
},
{
"epoch": 17.292418772563177,
"grad_norm": 0.5425153374671936,
"learning_rate": 9.674975966747097e-05,
"loss": 0.0075,
"step": 4790
},
{
"epoch": 17.32851985559567,
"grad_norm": 0.5524667501449585,
"learning_rate": 9.673018392736374e-05,
"loss": 0.0071,
"step": 4800
},
{
"epoch": 17.36462093862816,
"grad_norm": 0.4978862702846527,
"learning_rate": 9.671055140567667e-05,
"loss": 0.0068,
"step": 4810
},
{
"epoch": 17.40072202166065,
"grad_norm": 0.5207685232162476,
"learning_rate": 9.669086212626511e-05,
"loss": 0.0072,
"step": 4820
},
{
"epoch": 17.43682310469314,
"grad_norm": 0.5226748585700989,
"learning_rate": 9.667111611305341e-05,
"loss": 0.0064,
"step": 4830
},
{
"epoch": 17.47292418772563,
"grad_norm": 0.4616841673851013,
"learning_rate": 9.665131339003486e-05,
"loss": 0.0072,
"step": 4840
},
{
"epoch": 17.509025270758123,
"grad_norm": 0.6443553566932678,
"learning_rate": 9.663145398127158e-05,
"loss": 0.0066,
"step": 4850
},
{
"epoch": 17.545126353790614,
"grad_norm": 0.5076514482498169,
"learning_rate": 9.661153791089467e-05,
"loss": 0.0073,
"step": 4860
},
{
"epoch": 17.581227436823106,
"grad_norm": 0.5068714618682861,
"learning_rate": 9.659156520310402e-05,
"loss": 0.0074,
"step": 4870
},
{
"epoch": 17.617328519855597,
"grad_norm": 0.6643645167350769,
"learning_rate": 9.657153588216835e-05,
"loss": 0.0075,
"step": 4880
},
{
"epoch": 17.653429602888085,
"grad_norm": 0.6366770267486572,
"learning_rate": 9.655144997242516e-05,
"loss": 0.0087,
"step": 4890
},
{
"epoch": 17.689530685920577,
"grad_norm": 0.44640955328941345,
"learning_rate": 9.653130749828075e-05,
"loss": 0.0075,
"step": 4900
},
{
"epoch": 17.72563176895307,
"grad_norm": 0.45529672503471375,
"learning_rate": 9.65111084842101e-05,
"loss": 0.0057,
"step": 4910
},
{
"epoch": 17.76173285198556,
"grad_norm": 0.4766358435153961,
"learning_rate": 9.649085295475695e-05,
"loss": 0.0066,
"step": 4920
},
{
"epoch": 17.79783393501805,
"grad_norm": 0.5507829189300537,
"learning_rate": 9.647054093453365e-05,
"loss": 0.008,
"step": 4930
},
{
"epoch": 17.833935018050543,
"grad_norm": 0.5131349563598633,
"learning_rate": 9.645017244822123e-05,
"loss": 0.0078,
"step": 4940
},
{
"epoch": 17.870036101083034,
"grad_norm": 0.45431241393089294,
"learning_rate": 9.642974752056931e-05,
"loss": 0.008,
"step": 4950
},
{
"epoch": 17.906137184115522,
"grad_norm": 0.5684696435928345,
"learning_rate": 9.640926617639613e-05,
"loss": 0.0082,
"step": 4960
},
{
"epoch": 17.942238267148014,
"grad_norm": 0.5622549057006836,
"learning_rate": 9.638872844058843e-05,
"loss": 0.0082,
"step": 4970
},
{
"epoch": 17.978339350180505,
"grad_norm": 0.5874543786048889,
"learning_rate": 9.63681343381015e-05,
"loss": 0.0069,
"step": 4980
},
{
"epoch": 18.014440433212997,
"grad_norm": 0.4115467071533203,
"learning_rate": 9.634748389395914e-05,
"loss": 0.0065,
"step": 4990
},
{
"epoch": 18.050541516245488,
"grad_norm": 0.4604302942752838,
"learning_rate": 9.632677713325353e-05,
"loss": 0.0067,
"step": 5000
},
{
"epoch": 18.08664259927798,
"grad_norm": 0.4758187532424927,
"learning_rate": 9.63060140811454e-05,
"loss": 0.0065,
"step": 5010
},
{
"epoch": 18.12274368231047,
"grad_norm": 0.47490420937538147,
"learning_rate": 9.628519476286379e-05,
"loss": 0.0066,
"step": 5020
},
{
"epoch": 18.15884476534296,
"grad_norm": 0.5891820192337036,
"learning_rate": 9.626431920370612e-05,
"loss": 0.0062,
"step": 5030
},
{
"epoch": 18.19494584837545,
"grad_norm": 0.5240359306335449,
"learning_rate": 9.624338742903819e-05,
"loss": 0.0076,
"step": 5040
},
{
"epoch": 18.231046931407942,
"grad_norm": 0.516766369342804,
"learning_rate": 9.622239946429406e-05,
"loss": 0.0082,
"step": 5050
},
{
"epoch": 18.267148014440433,
"grad_norm": 0.5181319117546082,
"learning_rate": 9.620135533497609e-05,
"loss": 0.0069,
"step": 5060
},
{
"epoch": 18.303249097472925,
"grad_norm": 0.5097251534461975,
"learning_rate": 9.61802550666549e-05,
"loss": 0.006,
"step": 5070
},
{
"epoch": 18.339350180505416,
"grad_norm": 0.49024176597595215,
"learning_rate": 9.615909868496928e-05,
"loss": 0.006,
"step": 5080
},
{
"epoch": 18.375451263537904,
"grad_norm": 0.5576670169830322,
"learning_rate": 9.613788621562622e-05,
"loss": 0.008,
"step": 5090
},
{
"epoch": 18.411552346570396,
"grad_norm": 0.5302563905715942,
"learning_rate": 9.611661768440093e-05,
"loss": 0.0059,
"step": 5100
},
{
"epoch": 18.447653429602887,
"grad_norm": 0.6087308526039124,
"learning_rate": 9.609529311713661e-05,
"loss": 0.0075,
"step": 5110
},
{
"epoch": 18.48375451263538,
"grad_norm": 0.4414571523666382,
"learning_rate": 9.607391253974466e-05,
"loss": 0.0066,
"step": 5120
},
{
"epoch": 18.51985559566787,
"grad_norm": 0.46488794684410095,
"learning_rate": 9.605247597820448e-05,
"loss": 0.0069,
"step": 5130
},
{
"epoch": 18.555956678700362,
"grad_norm": 0.5927522778511047,
"learning_rate": 9.603098345856354e-05,
"loss": 0.0071,
"step": 5140
},
{
"epoch": 18.592057761732853,
"grad_norm": 0.5715414881706238,
"learning_rate": 9.600943500693725e-05,
"loss": 0.0076,
"step": 5150
},
{
"epoch": 18.628158844765345,
"grad_norm": 0.4302375912666321,
"learning_rate": 9.598783064950902e-05,
"loss": 0.0061,
"step": 5160
},
{
"epoch": 18.664259927797833,
"grad_norm": 0.5064087510108948,
"learning_rate": 9.596617041253018e-05,
"loss": 0.0071,
"step": 5170
},
{
"epoch": 18.700361010830324,
"grad_norm": 0.5059252977371216,
"learning_rate": 9.594445432231996e-05,
"loss": 0.0065,
"step": 5180
},
{
"epoch": 18.736462093862816,
"grad_norm": 0.575046718120575,
"learning_rate": 9.592268240526547e-05,
"loss": 0.0061,
"step": 5190
},
{
"epoch": 18.772563176895307,
"grad_norm": 0.555399477481842,
"learning_rate": 9.590085468782162e-05,
"loss": 0.0078,
"step": 5200
},
{
"epoch": 18.8086642599278,
"grad_norm": 0.4235028624534607,
"learning_rate": 9.587897119651116e-05,
"loss": 0.007,
"step": 5210
},
{
"epoch": 18.84476534296029,
"grad_norm": 0.531582236289978,
"learning_rate": 9.585703195792459e-05,
"loss": 0.0061,
"step": 5220
},
{
"epoch": 18.880866425992778,
"grad_norm": 0.3979736268520355,
"learning_rate": 9.583503699872016e-05,
"loss": 0.0069,
"step": 5230
},
{
"epoch": 18.91696750902527,
"grad_norm": 0.4949239194393158,
"learning_rate": 9.581298634562381e-05,
"loss": 0.0067,
"step": 5240
},
{
"epoch": 18.95306859205776,
"grad_norm": 0.3901923894882202,
"learning_rate": 9.579088002542917e-05,
"loss": 0.0084,
"step": 5250
},
{
"epoch": 18.989169675090253,
"grad_norm": 0.5102773904800415,
"learning_rate": 9.57687180649975e-05,
"loss": 0.0057,
"step": 5260
},
{
"epoch": 19.025270758122744,
"grad_norm": 0.48473867774009705,
"learning_rate": 9.574650049125768e-05,
"loss": 0.0079,
"step": 5270
},
{
"epoch": 19.061371841155236,
"grad_norm": 0.516612708568573,
"learning_rate": 9.572422733120614e-05,
"loss": 0.0066,
"step": 5280
},
{
"epoch": 19.097472924187727,
"grad_norm": 0.46928471326828003,
"learning_rate": 9.570189861190689e-05,
"loss": 0.0067,
"step": 5290
},
{
"epoch": 19.133574007220215,
"grad_norm": 0.4073295295238495,
"learning_rate": 9.56795143604914e-05,
"loss": 0.0064,
"step": 5300
},
{
"epoch": 19.169675090252706,
"grad_norm": 0.42316439747810364,
"learning_rate": 9.565707460415869e-05,
"loss": 0.0063,
"step": 5310
},
{
"epoch": 19.205776173285198,
"grad_norm": 0.39684924483299255,
"learning_rate": 9.563457937017515e-05,
"loss": 0.0077,
"step": 5320
},
{
"epoch": 19.24187725631769,
"grad_norm": 0.5032602548599243,
"learning_rate": 9.56120286858746e-05,
"loss": 0.0066,
"step": 5330
},
{
"epoch": 19.27797833935018,
"grad_norm": 0.488912433385849,
"learning_rate": 9.558942257865829e-05,
"loss": 0.007,
"step": 5340
},
{
"epoch": 19.314079422382672,
"grad_norm": 0.5092653632164001,
"learning_rate": 9.556676107599472e-05,
"loss": 0.0069,
"step": 5350
},
{
"epoch": 19.350180505415164,
"grad_norm": 0.6036354303359985,
"learning_rate": 9.554404420541978e-05,
"loss": 0.0075,
"step": 5360
},
{
"epoch": 19.386281588447652,
"grad_norm": 0.4569160044193268,
"learning_rate": 9.55212719945366e-05,
"loss": 0.0075,
"step": 5370
},
{
"epoch": 19.422382671480143,
"grad_norm": 0.46270015835762024,
"learning_rate": 9.549844447101559e-05,
"loss": 0.0069,
"step": 5380
},
{
"epoch": 19.458483754512635,
"grad_norm": 0.5937749147415161,
"learning_rate": 9.547556166259433e-05,
"loss": 0.0057,
"step": 5390
},
{
"epoch": 19.494584837545126,
"grad_norm": 0.5011341571807861,
"learning_rate": 9.545262359707756e-05,
"loss": 0.0069,
"step": 5400
},
{
"epoch": 19.530685920577618,
"grad_norm": 0.5448009967803955,
"learning_rate": 9.542963030233724e-05,
"loss": 0.0071,
"step": 5410
},
{
"epoch": 19.56678700361011,
"grad_norm": 0.5897495746612549,
"learning_rate": 9.540658180631237e-05,
"loss": 0.0059,
"step": 5420
},
{
"epoch": 19.6028880866426,
"grad_norm": 0.543596088886261,
"learning_rate": 9.538347813700904e-05,
"loss": 0.0069,
"step": 5430
},
{
"epoch": 19.63898916967509,
"grad_norm": 0.4065196216106415,
"learning_rate": 9.536031932250036e-05,
"loss": 0.0069,
"step": 5440
},
{
"epoch": 19.67509025270758,
"grad_norm": 0.42265546321868896,
"learning_rate": 9.533710539092653e-05,
"loss": 0.0063,
"step": 5450
},
{
"epoch": 19.71119133574007,
"grad_norm": 0.42158156633377075,
"learning_rate": 9.531383637049464e-05,
"loss": 0.0068,
"step": 5460
},
{
"epoch": 19.747292418772563,
"grad_norm": 0.40992334485054016,
"learning_rate": 9.529051228947875e-05,
"loss": 0.0065,
"step": 5470
},
{
"epoch": 19.783393501805055,
"grad_norm": 0.4669246971607208,
"learning_rate": 9.52671331762198e-05,
"loss": 0.0059,
"step": 5480
},
{
"epoch": 19.819494584837546,
"grad_norm": 0.4964161813259125,
"learning_rate": 9.524369905912565e-05,
"loss": 0.0073,
"step": 5490
},
{
"epoch": 19.855595667870038,
"grad_norm": 0.5335344076156616,
"learning_rate": 9.522020996667092e-05,
"loss": 0.0072,
"step": 5500
},
{
"epoch": 19.891696750902526,
"grad_norm": 0.4653686285018921,
"learning_rate": 9.519666592739709e-05,
"loss": 0.007,
"step": 5510
},
{
"epoch": 19.927797833935017,
"grad_norm": 0.6025059223175049,
"learning_rate": 9.517306696991241e-05,
"loss": 0.0064,
"step": 5520
},
{
"epoch": 19.96389891696751,
"grad_norm": 0.325710654258728,
"learning_rate": 9.51494131228918e-05,
"loss": 0.0076,
"step": 5530
},
{
"epoch": 20.0,
"grad_norm": 0.5524658560752869,
"learning_rate": 9.512570441507695e-05,
"loss": 0.0065,
"step": 5540
},
{
"epoch": 20.03610108303249,
"grad_norm": 0.3591141700744629,
"learning_rate": 9.510194087527615e-05,
"loss": 0.0062,
"step": 5550
},
{
"epoch": 20.072202166064983,
"grad_norm": 0.6223198175430298,
"learning_rate": 9.507812253236435e-05,
"loss": 0.0071,
"step": 5560
},
{
"epoch": 20.108303249097474,
"grad_norm": 0.49687519669532776,
"learning_rate": 9.505424941528309e-05,
"loss": 0.0063,
"step": 5570
},
{
"epoch": 20.144404332129962,
"grad_norm": 0.5842772722244263,
"learning_rate": 9.503032155304046e-05,
"loss": 0.0061,
"step": 5580
},
{
"epoch": 20.180505415162454,
"grad_norm": 0.5326083302497864,
"learning_rate": 9.500633897471106e-05,
"loss": 0.0063,
"step": 5590
},
{
"epoch": 20.216606498194945,
"grad_norm": 0.5466530919075012,
"learning_rate": 9.498230170943596e-05,
"loss": 0.0079,
"step": 5600
},
{
"epoch": 20.252707581227437,
"grad_norm": 0.41405656933784485,
"learning_rate": 9.495820978642275e-05,
"loss": 0.0064,
"step": 5610
},
{
"epoch": 20.28880866425993,
"grad_norm": 0.4599393308162689,
"learning_rate": 9.493406323494535e-05,
"loss": 0.008,
"step": 5620
},
{
"epoch": 20.32490974729242,
"grad_norm": 0.4277969002723694,
"learning_rate": 9.490986208434413e-05,
"loss": 0.0064,
"step": 5630
},
{
"epoch": 20.36101083032491,
"grad_norm": 0.33089083433151245,
"learning_rate": 9.488560636402577e-05,
"loss": 0.0055,
"step": 5640
},
{
"epoch": 20.3971119133574,
"grad_norm": 0.40624913573265076,
"learning_rate": 9.486129610346321e-05,
"loss": 0.0054,
"step": 5650
},
{
"epoch": 20.43321299638989,
"grad_norm": 0.43951189517974854,
"learning_rate": 9.483693133219576e-05,
"loss": 0.0059,
"step": 5660
},
{
"epoch": 20.469314079422382,
"grad_norm": 0.4479890465736389,
"learning_rate": 9.481251207982888e-05,
"loss": 0.0065,
"step": 5670
},
{
"epoch": 20.505415162454874,
"grad_norm": 0.4974622428417206,
"learning_rate": 9.47880383760343e-05,
"loss": 0.007,
"step": 5680
},
{
"epoch": 20.541516245487365,
"grad_norm": 0.4705882966518402,
"learning_rate": 9.476351025054983e-05,
"loss": 0.0069,
"step": 5690
},
{
"epoch": 20.577617328519857,
"grad_norm": 0.39997318387031555,
"learning_rate": 9.473892773317952e-05,
"loss": 0.0066,
"step": 5700
},
{
"epoch": 20.613718411552348,
"grad_norm": 0.3788575530052185,
"learning_rate": 9.471429085379338e-05,
"loss": 0.0063,
"step": 5710
},
{
"epoch": 20.649819494584836,
"grad_norm": 0.3653368353843689,
"learning_rate": 9.468959964232757e-05,
"loss": 0.0059,
"step": 5720
},
{
"epoch": 20.685920577617328,
"grad_norm": 0.546480119228363,
"learning_rate": 9.466485412878425e-05,
"loss": 0.0069,
"step": 5730
},
{
"epoch": 20.72202166064982,
"grad_norm": 0.4471173882484436,
"learning_rate": 9.464005434323154e-05,
"loss": 0.0058,
"step": 5740
},
{
"epoch": 20.75812274368231,
"grad_norm": 0.5625684857368469,
"learning_rate": 9.461520031580352e-05,
"loss": 0.0061,
"step": 5750
},
{
"epoch": 20.794223826714802,
"grad_norm": 0.5397229790687561,
"learning_rate": 9.459029207670019e-05,
"loss": 0.0068,
"step": 5760
},
{
"epoch": 20.830324909747294,
"grad_norm": 0.41750290989875793,
"learning_rate": 9.456532965618737e-05,
"loss": 0.0084,
"step": 5770
},
{
"epoch": 20.866425992779785,
"grad_norm": 0.5338872671127319,
"learning_rate": 9.454031308459681e-05,
"loss": 0.0073,
"step": 5780
},
{
"epoch": 20.902527075812273,
"grad_norm": 0.5177128911018372,
"learning_rate": 9.451524239232595e-05,
"loss": 0.0072,
"step": 5790
},
{
"epoch": 20.938628158844764,
"grad_norm": 0.5074653029441833,
"learning_rate": 9.449011760983809e-05,
"loss": 0.007,
"step": 5800
},
{
"epoch": 20.974729241877256,
"grad_norm": 0.4906035363674164,
"learning_rate": 9.446493876766218e-05,
"loss": 0.0087,
"step": 5810
},
{
"epoch": 21.010830324909747,
"grad_norm": 0.48160770535469055,
"learning_rate": 9.44397058963929e-05,
"loss": 0.0083,
"step": 5820
},
{
"epoch": 21.04693140794224,
"grad_norm": 0.501342236995697,
"learning_rate": 9.441441902669056e-05,
"loss": 0.0066,
"step": 5830
},
{
"epoch": 21.08303249097473,
"grad_norm": 0.5209288001060486,
"learning_rate": 9.43890781892811e-05,
"loss": 0.0064,
"step": 5840
},
{
"epoch": 21.119133574007222,
"grad_norm": 0.5229851007461548,
"learning_rate": 9.436368341495603e-05,
"loss": 0.0071,
"step": 5850
},
{
"epoch": 21.15523465703971,
"grad_norm": 0.35847046971321106,
"learning_rate": 9.43382347345724e-05,
"loss": 0.0064,
"step": 5860
},
{
"epoch": 21.1913357400722,
"grad_norm": 0.4250674843788147,
"learning_rate": 9.431273217905273e-05,
"loss": 0.0069,
"step": 5870
},
{
"epoch": 21.227436823104693,
"grad_norm": 0.40796077251434326,
"learning_rate": 9.428717577938504e-05,
"loss": 0.0066,
"step": 5880
},
{
"epoch": 21.263537906137184,
"grad_norm": 0.4614047706127167,
"learning_rate": 9.426156556662276e-05,
"loss": 0.0055,
"step": 5890
},
{
"epoch": 21.299638989169676,
"grad_norm": 0.4617994725704193,
"learning_rate": 9.423590157188474e-05,
"loss": 0.0068,
"step": 5900
},
{
"epoch": 21.335740072202167,
"grad_norm": 0.5680766701698303,
"learning_rate": 9.421018382635513e-05,
"loss": 0.0071,
"step": 5910
},
{
"epoch": 21.37184115523466,
"grad_norm": 0.4481453001499176,
"learning_rate": 9.418441236128343e-05,
"loss": 0.0072,
"step": 5920
},
{
"epoch": 21.407942238267147,
"grad_norm": 0.4868505895137787,
"learning_rate": 9.41585872079844e-05,
"loss": 0.0062,
"step": 5930
},
{
"epoch": 21.444043321299638,
"grad_norm": 0.43570268154144287,
"learning_rate": 9.413270839783802e-05,
"loss": 0.0078,
"step": 5940
},
{
"epoch": 21.48014440433213,
"grad_norm": 0.3691798448562622,
"learning_rate": 9.41067759622895e-05,
"loss": 0.0065,
"step": 5950
},
{
"epoch": 21.51624548736462,
"grad_norm": 0.3901306390762329,
"learning_rate": 9.408078993284917e-05,
"loss": 0.0069,
"step": 5960
},
{
"epoch": 21.552346570397113,
"grad_norm": 0.3907696008682251,
"learning_rate": 9.405475034109254e-05,
"loss": 0.0083,
"step": 5970
},
{
"epoch": 21.588447653429604,
"grad_norm": 0.3574260175228119,
"learning_rate": 9.402865721866015e-05,
"loss": 0.0072,
"step": 5980
},
{
"epoch": 21.624548736462096,
"grad_norm": 0.4298070967197418,
"learning_rate": 9.400251059725762e-05,
"loss": 0.0064,
"step": 5990
},
{
"epoch": 21.660649819494584,
"grad_norm": 0.48715445399284363,
"learning_rate": 9.397631050865554e-05,
"loss": 0.0069,
"step": 6000
}
],
"logging_steps": 10,
"max_steps": 30000,
"num_input_tokens_seen": 0,
"num_train_epochs": 109,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}