Gencode-MxDNA / checkpoint-2250 /trainer_state.json
andyjzhao's picture
Upload folder using huggingface_hub
5cb57e8 verified
{
"best_global_step": 2250,
"best_metric": 1.0298579931259155,
"best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-2250",
"epoch": 0.638252606198142,
"eval_steps": 125,
"global_step": 2250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005673356499539039,
"grad_norm": 8450.4345703125,
"loss": 876.9911,
"lr": 2e-06,
"step": 2,
"tokens_trained": 0.000985992
},
{
"epoch": 0.0011346712999078079,
"grad_norm": 8980.888671875,
"loss": 779.4711,
"lr": 6e-06,
"step": 4,
"tokens_trained": 0.001968088
},
{
"epoch": 0.001702006949861712,
"grad_norm": 7489.92529296875,
"loss": 488.6157,
"lr": 1e-05,
"step": 6,
"tokens_trained": 0.002953808
},
{
"epoch": 0.0022693425998156157,
"grad_norm": 1952.1917724609375,
"loss": 237.0602,
"lr": 1.4e-05,
"step": 8,
"tokens_trained": 0.003935728
},
{
"epoch": 0.0028366782497695198,
"grad_norm": 1418.443603515625,
"loss": 159.0854,
"lr": 1.8e-05,
"step": 10,
"tokens_trained": 0.004916488
},
{
"epoch": 0.003404013899723424,
"grad_norm": 874.7195434570312,
"loss": 91.9563,
"lr": 2.2e-05,
"step": 12,
"tokens_trained": 0.005902792
},
{
"epoch": 0.003971349549677328,
"grad_norm": 1339.8248291015625,
"loss": 40.3366,
"lr": 2.6e-05,
"step": 14,
"tokens_trained": 0.0068856
},
{
"epoch": 0.0045386851996312315,
"grad_norm": 2936.7607421875,
"loss": 22.7436,
"lr": 3e-05,
"step": 16,
"tokens_trained": 0.007868248
},
{
"epoch": 0.005106020849585136,
"grad_norm": 1531.3807373046875,
"loss": 23.4797,
"lr": 3.4000000000000007e-05,
"step": 18,
"tokens_trained": 0.008849296
},
{
"epoch": 0.0056733564995390395,
"grad_norm": 3027.4189453125,
"loss": 38.7379,
"lr": 3.8e-05,
"step": 20,
"tokens_trained": 0.009830984
},
{
"epoch": 0.006240692149492944,
"grad_norm": 2435.890625,
"loss": 26.2427,
"lr": 4.2000000000000004e-05,
"step": 22,
"tokens_trained": 0.01081364
},
{
"epoch": 0.006808027799446848,
"grad_norm": 3217.990478515625,
"loss": 31.0263,
"lr": 4.6e-05,
"step": 24,
"tokens_trained": 0.01179036
},
{
"epoch": 0.007375363449400752,
"grad_norm": 3854.00634765625,
"loss": 33.8781,
"lr": 5e-05,
"step": 26,
"tokens_trained": 0.012774504
},
{
"epoch": 0.007942699099354656,
"grad_norm": 3197.489990234375,
"loss": 27.7927,
"lr": 5.4e-05,
"step": 28,
"tokens_trained": 0.013759992
},
{
"epoch": 0.00851003474930856,
"grad_norm": 3034.156494140625,
"loss": 37.9083,
"lr": 5.800000000000001e-05,
"step": 30,
"tokens_trained": 0.014740536
},
{
"epoch": 0.009077370399262463,
"grad_norm": 3040.314453125,
"loss": 34.0659,
"lr": 6.2e-05,
"step": 32,
"tokens_trained": 0.015725984
},
{
"epoch": 0.009644706049216368,
"grad_norm": 3065.5791015625,
"loss": 27.7768,
"lr": 6.6e-05,
"step": 34,
"tokens_trained": 0.016706864
},
{
"epoch": 0.010212041699170272,
"grad_norm": 2454.293701171875,
"loss": 35.1143,
"lr": 7.000000000000001e-05,
"step": 36,
"tokens_trained": 0.017688816
},
{
"epoch": 0.010779377349124175,
"grad_norm": 3100.7802734375,
"loss": 42.2603,
"lr": 7.4e-05,
"step": 38,
"tokens_trained": 0.018669072
},
{
"epoch": 0.011346712999078079,
"grad_norm": 2749.84423828125,
"loss": 39.3879,
"lr": 7.8e-05,
"step": 40,
"tokens_trained": 0.019652072
},
{
"epoch": 0.011914048649031984,
"grad_norm": 1519.9908447265625,
"loss": 35.0735,
"lr": 8.2e-05,
"step": 42,
"tokens_trained": 0.020633112
},
{
"epoch": 0.012481384298985888,
"grad_norm": 1474.4244384765625,
"loss": 25.8965,
"lr": 8.599999999999999e-05,
"step": 44,
"tokens_trained": 0.021616192
},
{
"epoch": 0.013048719948939792,
"grad_norm": 2962.500244140625,
"loss": 51.0784,
"lr": 8.999999999999999e-05,
"step": 46,
"tokens_trained": 0.022597288
},
{
"epoch": 0.013616055598893695,
"grad_norm": 2419.41455078125,
"loss": 43.0334,
"lr": 9.400000000000001e-05,
"step": 48,
"tokens_trained": 0.02357572
},
{
"epoch": 0.014183391248847599,
"grad_norm": 1267.87451171875,
"loss": 21.8063,
"lr": 9.800000000000001e-05,
"step": 50,
"tokens_trained": 0.024553376
},
{
"epoch": 0.014750726898801504,
"grad_norm": 1573.944091796875,
"loss": 52.9693,
"lr": 0.000102,
"step": 52,
"tokens_trained": 0.025536728
},
{
"epoch": 0.015318062548755408,
"grad_norm": 1509.650146484375,
"loss": 50.0825,
"lr": 0.000106,
"step": 54,
"tokens_trained": 0.026517
},
{
"epoch": 0.01588539819870931,
"grad_norm": 2334.765380859375,
"loss": 42.1982,
"lr": 0.00011,
"step": 56,
"tokens_trained": 0.027504728
},
{
"epoch": 0.016452733848663217,
"grad_norm": 1594.16259765625,
"loss": 39.0562,
"lr": 0.000114,
"step": 58,
"tokens_trained": 0.028485416
},
{
"epoch": 0.01702006949861712,
"grad_norm": 1628.082275390625,
"loss": 35.0488,
"lr": 0.000118,
"step": 60,
"tokens_trained": 0.029468696
},
{
"epoch": 0.017587405148571024,
"grad_norm": 2496.6455078125,
"loss": 49.4241,
"lr": 0.000122,
"step": 62,
"tokens_trained": 0.030453584
},
{
"epoch": 0.018154740798524926,
"grad_norm": 2521.721435546875,
"loss": 69.0275,
"lr": 0.000126,
"step": 64,
"tokens_trained": 0.031432864
},
{
"epoch": 0.01872207644847883,
"grad_norm": 2179.571533203125,
"loss": 63.1409,
"lr": 0.00013000000000000002,
"step": 66,
"tokens_trained": 0.032418416
},
{
"epoch": 0.019289412098432736,
"grad_norm": 899.7137451171875,
"loss": 38.4131,
"lr": 0.000134,
"step": 68,
"tokens_trained": 0.033402136
},
{
"epoch": 0.01985674774838664,
"grad_norm": 2109.377685546875,
"loss": 51.0044,
"lr": 0.00013800000000000002,
"step": 70,
"tokens_trained": 0.03438832
},
{
"epoch": 0.020424083398340544,
"grad_norm": 1649.1873779296875,
"loss": 32.1408,
"lr": 0.00014199999999999998,
"step": 72,
"tokens_trained": 0.035374464
},
{
"epoch": 0.020991419048294446,
"grad_norm": 1807.994140625,
"loss": 28.8357,
"lr": 0.000146,
"step": 74,
"tokens_trained": 0.03635784
},
{
"epoch": 0.02155875469824835,
"grad_norm": 998.9485473632812,
"loss": 23.0343,
"lr": 0.00015,
"step": 76,
"tokens_trained": 0.037340248
},
{
"epoch": 0.022126090348202256,
"grad_norm": 2240.17578125,
"loss": 32.0397,
"lr": 0.000154,
"step": 78,
"tokens_trained": 0.038321968
},
{
"epoch": 0.022693425998156158,
"grad_norm": 1606.0067138671875,
"loss": 32.1776,
"lr": 0.000158,
"step": 80,
"tokens_trained": 0.039304992
},
{
"epoch": 0.023260761648110063,
"grad_norm": 1685.1015625,
"loss": 24.3428,
"lr": 0.000162,
"step": 82,
"tokens_trained": 0.040286808
},
{
"epoch": 0.02382809729806397,
"grad_norm": 1761.7890625,
"loss": 23.9261,
"lr": 0.00016600000000000002,
"step": 84,
"tokens_trained": 0.041271776
},
{
"epoch": 0.02439543294801787,
"grad_norm": 2036.0982666015625,
"loss": 27.7196,
"lr": 0.00017,
"step": 86,
"tokens_trained": 0.042252784
},
{
"epoch": 0.024962768597971776,
"grad_norm": 1564.3870849609375,
"loss": 25.3722,
"lr": 0.000174,
"step": 88,
"tokens_trained": 0.04323596
},
{
"epoch": 0.025530104247925678,
"grad_norm": 1508.349853515625,
"loss": 18.4107,
"lr": 0.000178,
"step": 90,
"tokens_trained": 0.044218984
},
{
"epoch": 0.026097439897879583,
"grad_norm": 1955.011474609375,
"loss": 28.8456,
"lr": 0.000182,
"step": 92,
"tokens_trained": 0.045202144
},
{
"epoch": 0.02666477554783349,
"grad_norm": 1679.9423828125,
"loss": 23.6139,
"lr": 0.000186,
"step": 94,
"tokens_trained": 0.046192336
},
{
"epoch": 0.02723211119778739,
"grad_norm": 1517.5731201171875,
"loss": 42.145,
"lr": 0.00019,
"step": 96,
"tokens_trained": 0.047174312
},
{
"epoch": 0.027799446847741296,
"grad_norm": 1535.3076171875,
"loss": 31.9711,
"lr": 0.000194,
"step": 98,
"tokens_trained": 0.048158944
},
{
"epoch": 0.028366782497695198,
"grad_norm": 1475.2569580078125,
"loss": 37.645,
"lr": 0.00019800000000000002,
"step": 100,
"tokens_trained": 0.04914364
},
{
"epoch": 0.028934118147649103,
"grad_norm": 1918.4088134765625,
"loss": 69.4053,
"lr": 0.000202,
"step": 102,
"tokens_trained": 0.050123488
},
{
"epoch": 0.02950145379760301,
"grad_norm": 1631.6231689453125,
"loss": 50.9725,
"lr": 0.000206,
"step": 104,
"tokens_trained": 0.051105512
},
{
"epoch": 0.03006878944755691,
"grad_norm": 1291.6376953125,
"loss": 22.6527,
"lr": 0.00021,
"step": 106,
"tokens_trained": 0.052091704
},
{
"epoch": 0.030636125097510816,
"grad_norm": 1224.9625244140625,
"loss": 60.2725,
"lr": 0.000214,
"step": 108,
"tokens_trained": 0.053074824
},
{
"epoch": 0.031203460747464717,
"grad_norm": 1218.2022705078125,
"loss": 75.8728,
"lr": 0.000218,
"step": 110,
"tokens_trained": 0.054057104
},
{
"epoch": 0.03177079639741862,
"grad_norm": 1761.8861083984375,
"loss": 61.6427,
"lr": 0.000222,
"step": 112,
"tokens_trained": 0.055039128
},
{
"epoch": 0.03233813204737253,
"grad_norm": 1482.4256591796875,
"loss": 35.3351,
"lr": 0.00022600000000000002,
"step": 114,
"tokens_trained": 0.05602388
},
{
"epoch": 0.03290546769732643,
"grad_norm": 563.6399536132812,
"loss": 40.1461,
"lr": 0.00023,
"step": 116,
"tokens_trained": 0.057005376
},
{
"epoch": 0.03347280334728033,
"grad_norm": 1266.058837890625,
"loss": 24.0657,
"lr": 0.00023400000000000002,
"step": 118,
"tokens_trained": 0.057985136
},
{
"epoch": 0.03404013899723424,
"grad_norm": 918.206298828125,
"loss": 23.9626,
"lr": 0.00023799999999999998,
"step": 120,
"tokens_trained": 0.058968288
},
{
"epoch": 0.03460747464718814,
"grad_norm": 1495.7191162109375,
"loss": 19.798,
"lr": 0.000242,
"step": 122,
"tokens_trained": 0.05995348
},
{
"epoch": 0.03517481029714205,
"grad_norm": 1264.302734375,
"loss": 31.5342,
"lr": 0.000246,
"step": 124,
"tokens_trained": 0.060935832
},
{
"epoch": 0.035458478122119,
"eval_loss": 5.312118053436279,
"eval_runtime": 21.3065,
"step": 125,
"tokens_trained": 0.061426608
},
{
"epoch": 0.03574214594709595,
"grad_norm": 907.4861450195312,
"loss": 25.1262,
"lr": 0.00025,
"step": 126,
"tokens_trained": 0.061918184
},
{
"epoch": 0.03630948159704985,
"grad_norm": 1287.6158447265625,
"loss": 26.963,
"lr": 0.000254,
"step": 128,
"tokens_trained": 0.062902328
},
{
"epoch": 0.03687681724700376,
"grad_norm": 1260.570556640625,
"loss": 24.9633,
"lr": 0.00025800000000000004,
"step": 130,
"tokens_trained": 0.063883456
},
{
"epoch": 0.03744415289695766,
"grad_norm": 1436.82373046875,
"loss": 23.1028,
"lr": 0.000262,
"step": 132,
"tokens_trained": 0.06486748
},
{
"epoch": 0.03801148854691157,
"grad_norm": 812.9523315429688,
"loss": 20.5496,
"lr": 0.000266,
"step": 134,
"tokens_trained": 0.065847104
},
{
"epoch": 0.03857882419686547,
"grad_norm": 1336.5322265625,
"loss": 23.673,
"lr": 0.00027,
"step": 136,
"tokens_trained": 0.066829928
},
{
"epoch": 0.03914615984681937,
"grad_norm": 1381.282470703125,
"loss": 32.0373,
"lr": 0.00027400000000000005,
"step": 138,
"tokens_trained": 0.067814024
},
{
"epoch": 0.03971349549677328,
"grad_norm": 972.7861938476562,
"loss": 26.9454,
"lr": 0.00027800000000000004,
"step": 140,
"tokens_trained": 0.068797744
},
{
"epoch": 0.04028083114672718,
"grad_norm": 1347.2249755859375,
"loss": 22.3578,
"lr": 0.00028199999999999997,
"step": 142,
"tokens_trained": 0.069780072
},
{
"epoch": 0.04084816679668109,
"grad_norm": 829.525390625,
"loss": 37.9879,
"lr": 0.00028599999999999996,
"step": 144,
"tokens_trained": 0.070759896
},
{
"epoch": 0.04141550244663499,
"grad_norm": 1094.1033935546875,
"loss": 21.1972,
"lr": 0.00029,
"step": 146,
"tokens_trained": 0.0717452
},
{
"epoch": 0.04198283809658889,
"grad_norm": 717.107421875,
"loss": 21.7774,
"lr": 0.000294,
"step": 148,
"tokens_trained": 0.072727432
},
{
"epoch": 0.042550173746542796,
"grad_norm": 744.4456787109375,
"loss": 20.3235,
"lr": 0.000298,
"step": 150,
"tokens_trained": 0.073712128
},
{
"epoch": 0.0431175093964967,
"grad_norm": 904.1460571289062,
"loss": 22.7878,
"lr": 0.000302,
"step": 152,
"tokens_trained": 0.074695296
},
{
"epoch": 0.04368484504645061,
"grad_norm": 1352.303955078125,
"loss": 20.9757,
"lr": 0.000306,
"step": 154,
"tokens_trained": 0.0756798
},
{
"epoch": 0.04425218069640451,
"grad_norm": 997.0473022460938,
"loss": 17.4647,
"lr": 0.00031,
"step": 156,
"tokens_trained": 0.076666504
},
{
"epoch": 0.04481951634635841,
"grad_norm": 1206.387939453125,
"loss": 21.1846,
"lr": 0.000314,
"step": 158,
"tokens_trained": 0.07764868
},
{
"epoch": 0.045386851996312316,
"grad_norm": 1029.6807861328125,
"loss": 17.8853,
"lr": 0.00031800000000000003,
"step": 160,
"tokens_trained": 0.07863548
},
{
"epoch": 0.04595418764626622,
"grad_norm": 1136.4635009765625,
"loss": 30.057,
"lr": 0.000322,
"step": 162,
"tokens_trained": 0.079618928
},
{
"epoch": 0.04652152329622013,
"grad_norm": 834.3464965820312,
"loss": 28.1782,
"lr": 0.000326,
"step": 164,
"tokens_trained": 0.0806032
},
{
"epoch": 0.04708885894617403,
"grad_norm": 1177.8365478515625,
"loss": 16.4267,
"lr": 0.00033,
"step": 166,
"tokens_trained": 0.081583752
},
{
"epoch": 0.04765619459612794,
"grad_norm": 572.501708984375,
"loss": 16.5752,
"lr": 0.00033400000000000004,
"step": 168,
"tokens_trained": 0.082568184
},
{
"epoch": 0.048223530246081836,
"grad_norm": 437.6822814941406,
"loss": 11.5509,
"lr": 0.00033800000000000003,
"step": 170,
"tokens_trained": 0.083553352
},
{
"epoch": 0.04879086589603574,
"grad_norm": 1119.0416259765625,
"loss": 16.2689,
"lr": 0.000342,
"step": 172,
"tokens_trained": 0.084536352
},
{
"epoch": 0.04935820154598965,
"grad_norm": 895.4021606445312,
"loss": 12.6663,
"lr": 0.000346,
"step": 174,
"tokens_trained": 0.085517312
},
{
"epoch": 0.04992553719594355,
"grad_norm": 995.6289672851562,
"loss": 26.0663,
"lr": 0.00035,
"step": 176,
"tokens_trained": 0.086496088
},
{
"epoch": 0.05049287284589746,
"grad_norm": 839.6610717773438,
"loss": 21.5115,
"lr": 0.000354,
"step": 178,
"tokens_trained": 0.087480632
},
{
"epoch": 0.051060208495851356,
"grad_norm": 734.1155395507812,
"loss": 29.3287,
"lr": 0.000358,
"step": 180,
"tokens_trained": 0.088460408
},
{
"epoch": 0.05162754414580526,
"grad_norm": 721.4505615234375,
"loss": 26.0801,
"lr": 0.000362,
"step": 182,
"tokens_trained": 0.08944248
},
{
"epoch": 0.052194879795759166,
"grad_norm": 845.9672241210938,
"loss": 19.0639,
"lr": 0.000366,
"step": 184,
"tokens_trained": 0.090427832
},
{
"epoch": 0.05276221544571307,
"grad_norm": 1210.9969482421875,
"loss": 23.9036,
"lr": 0.00037,
"step": 186,
"tokens_trained": 0.091411504
},
{
"epoch": 0.05332955109566698,
"grad_norm": 1079.1690673828125,
"loss": 23.5588,
"lr": 0.000374,
"step": 188,
"tokens_trained": 0.092392672
},
{
"epoch": 0.053896886745620876,
"grad_norm": 596.111328125,
"loss": 20.8275,
"lr": 0.000378,
"step": 190,
"tokens_trained": 0.093374696
},
{
"epoch": 0.05446422239557478,
"grad_norm": 761.8096923828125,
"loss": 22.512,
"lr": 0.000382,
"step": 192,
"tokens_trained": 0.094361912
},
{
"epoch": 0.055031558045528686,
"grad_norm": 1081.9832763671875,
"loss": 32.335,
"lr": 0.000386,
"step": 194,
"tokens_trained": 0.095342992
},
{
"epoch": 0.05559889369548259,
"grad_norm": 304.3534240722656,
"loss": 11.5275,
"lr": 0.00039000000000000005,
"step": 196,
"tokens_trained": 0.096323512
},
{
"epoch": 0.0561662293454365,
"grad_norm": 586.6314086914062,
"loss": 16.2663,
"lr": 0.00039400000000000004,
"step": 198,
"tokens_trained": 0.097308864
},
{
"epoch": 0.056733564995390395,
"grad_norm": 624.9953002929688,
"loss": 16.627,
"lr": 0.000398,
"step": 200,
"tokens_trained": 0.098289064
},
{
"epoch": 0.0573009006453443,
"grad_norm": 585.9645385742188,
"loss": 15.8359,
"lr": 0.000402,
"step": 202,
"tokens_trained": 0.099269696
},
{
"epoch": 0.057868236295298206,
"grad_norm": 537.9913330078125,
"loss": 20.0779,
"lr": 0.00040600000000000006,
"step": 204,
"tokens_trained": 0.100248448
},
{
"epoch": 0.05843557194525211,
"grad_norm": 805.04931640625,
"loss": 21.4524,
"lr": 0.00041,
"step": 206,
"tokens_trained": 0.101231248
},
{
"epoch": 0.05900290759520602,
"grad_norm": 439.1418151855469,
"loss": 23.9852,
"lr": 0.000414,
"step": 208,
"tokens_trained": 0.102210688
},
{
"epoch": 0.059570243245159915,
"grad_norm": 502.684814453125,
"loss": 17.6273,
"lr": 0.00041799999999999997,
"step": 210,
"tokens_trained": 0.103192176
},
{
"epoch": 0.06013757889511382,
"grad_norm": 849.9979858398438,
"loss": 33.7517,
"lr": 0.000422,
"step": 212,
"tokens_trained": 0.104172824
},
{
"epoch": 0.060704914545067726,
"grad_norm": 939.583740234375,
"loss": 26.2559,
"lr": 0.000426,
"step": 214,
"tokens_trained": 0.105156672
},
{
"epoch": 0.06127225019502163,
"grad_norm": 525.0505981445312,
"loss": 20.0923,
"lr": 0.00043,
"step": 216,
"tokens_trained": 0.106141368
},
{
"epoch": 0.061839585844975536,
"grad_norm": 420.296630859375,
"loss": 17.9608,
"lr": 0.00043400000000000003,
"step": 218,
"tokens_trained": 0.107124088
},
{
"epoch": 0.062406921494929435,
"grad_norm": 711.3380737304688,
"loss": 19.387,
"lr": 0.000438,
"step": 220,
"tokens_trained": 0.108112632
},
{
"epoch": 0.06297425714488335,
"grad_norm": 759.183349609375,
"loss": 17.8061,
"lr": 0.000442,
"step": 222,
"tokens_trained": 0.1090934
},
{
"epoch": 0.06354159279483725,
"grad_norm": 790.025146484375,
"loss": 13.8539,
"lr": 0.000446,
"step": 224,
"tokens_trained": 0.110079512
},
{
"epoch": 0.06410892844479114,
"grad_norm": 769.8306274414062,
"loss": 22.1258,
"lr": 0.00045000000000000004,
"step": 226,
"tokens_trained": 0.111060152
},
{
"epoch": 0.06467626409474506,
"grad_norm": 656.8352661132812,
"loss": 14.8646,
"lr": 0.00045400000000000003,
"step": 228,
"tokens_trained": 0.112044144
},
{
"epoch": 0.06524359974469895,
"grad_norm": 498.92010498046875,
"loss": 23.1558,
"lr": 0.000458,
"step": 230,
"tokens_trained": 0.113022928
},
{
"epoch": 0.06581093539465287,
"grad_norm": 764.0186157226562,
"loss": 16.7089,
"lr": 0.000462,
"step": 232,
"tokens_trained": 0.114003832
},
{
"epoch": 0.06637827104460677,
"grad_norm": 491.5793762207031,
"loss": 12.3979,
"lr": 0.00046600000000000005,
"step": 234,
"tokens_trained": 0.114991008
},
{
"epoch": 0.06694560669456066,
"grad_norm": 679.9217529296875,
"loss": 14.9037,
"lr": 0.00047,
"step": 236,
"tokens_trained": 0.115971888
},
{
"epoch": 0.06751294234451458,
"grad_norm": 491.0369567871094,
"loss": 7.7603,
"lr": 0.000474,
"step": 238,
"tokens_trained": 0.116952616
},
{
"epoch": 0.06808027799446847,
"grad_norm": 369.2186279296875,
"loss": 8.2256,
"lr": 0.00047799999999999996,
"step": 240,
"tokens_trained": 0.117935816
},
{
"epoch": 0.06864761364442239,
"grad_norm": 312.72137451171875,
"loss": 7.5486,
"lr": 0.000482,
"step": 242,
"tokens_trained": 0.118919392
},
{
"epoch": 0.06921494929437629,
"grad_norm": 596.1439208984375,
"loss": 11.7351,
"lr": 0.000486,
"step": 244,
"tokens_trained": 0.119901856
},
{
"epoch": 0.06978228494433018,
"grad_norm": 467.5667419433594,
"loss": 11.8403,
"lr": 0.00049,
"step": 246,
"tokens_trained": 0.120884624
},
{
"epoch": 0.0703496205942841,
"grad_norm": 430.50048828125,
"loss": 13.8081,
"lr": 0.000494,
"step": 248,
"tokens_trained": 0.121869224
},
{
"epoch": 0.070916956244238,
"grad_norm": 522.242919921875,
"loss": 14.1892,
"lr": 0.000498,
"step": 250,
"tokens_trained": 0.122853584
},
{
"epoch": 0.070916956244238,
"eval_loss": 1.9294606447219849,
"eval_runtime": 20.4162,
"step": 250,
"tokens_trained": 0.122853584
},
{
"epoch": 0.0714842918941919,
"grad_norm": 835.2765502929688,
"loss": 13.2462,
"lr": 0.0005020000000000001,
"step": 252,
"tokens_trained": 0.123835544
},
{
"epoch": 0.0720516275441458,
"grad_norm": 714.8098754882812,
"loss": 20.0498,
"lr": 0.000506,
"step": 254,
"tokens_trained": 0.124821616
},
{
"epoch": 0.0726189631940997,
"grad_norm": 701.512939453125,
"loss": 18.3664,
"lr": 0.00051,
"step": 256,
"tokens_trained": 0.125807608
},
{
"epoch": 0.07318629884405362,
"grad_norm": 773.987060546875,
"loss": 21.3807,
"lr": 0.000514,
"step": 258,
"tokens_trained": 0.126791464
},
{
"epoch": 0.07375363449400751,
"grad_norm": 826.422119140625,
"loss": 22.6403,
"lr": 0.000518,
"step": 260,
"tokens_trained": 0.127771752
},
{
"epoch": 0.07432097014396143,
"grad_norm": 742.8673095703125,
"loss": 20.1504,
"lr": 0.000522,
"step": 262,
"tokens_trained": 0.128755448
},
{
"epoch": 0.07488830579391532,
"grad_norm": 797.79296875,
"loss": 26.7343,
"lr": 0.000526,
"step": 264,
"tokens_trained": 0.129741088
},
{
"epoch": 0.07545564144386922,
"grad_norm": 673.9141235351562,
"loss": 12.505,
"lr": 0.0005300000000000001,
"step": 266,
"tokens_trained": 0.130727504
},
{
"epoch": 0.07602297709382314,
"grad_norm": 310.6510925292969,
"loss": 12.6344,
"lr": 0.0005340000000000001,
"step": 268,
"tokens_trained": 0.131710296
},
{
"epoch": 0.07659031274377703,
"grad_norm": 312.40966796875,
"loss": 14.254,
"lr": 0.0005380000000000001,
"step": 270,
"tokens_trained": 0.132695352
},
{
"epoch": 0.07715764839373095,
"grad_norm": 492.2834777832031,
"loss": 19.0979,
"lr": 0.0005420000000000001,
"step": 272,
"tokens_trained": 0.133677928
},
{
"epoch": 0.07772498404368484,
"grad_norm": 628.457763671875,
"loss": 21.7735,
"lr": 0.000546,
"step": 274,
"tokens_trained": 0.134655504
},
{
"epoch": 0.07829231969363874,
"grad_norm": 382.8389892578125,
"loss": 12.5128,
"lr": 0.00055,
"step": 276,
"tokens_trained": 0.135640208
},
{
"epoch": 0.07885965534359266,
"grad_norm": 483.12335205078125,
"loss": 15.2589,
"lr": 0.000554,
"step": 278,
"tokens_trained": 0.136624232
},
{
"epoch": 0.07942699099354655,
"grad_norm": 640.658447265625,
"loss": 12.1341,
"lr": 0.000558,
"step": 280,
"tokens_trained": 0.13760628
},
{
"epoch": 0.07999432664350047,
"grad_norm": 410.0824279785156,
"loss": 12.5723,
"lr": 0.0005620000000000001,
"step": 282,
"tokens_trained": 0.13858832
},
{
"epoch": 0.08056166229345436,
"grad_norm": 513.2861328125,
"loss": 14.8461,
"lr": 0.000566,
"step": 284,
"tokens_trained": 0.139568424
},
{
"epoch": 0.08112899794340826,
"grad_norm": 564.547607421875,
"loss": 12.5792,
"lr": 0.00057,
"step": 286,
"tokens_trained": 0.140557016
},
{
"epoch": 0.08169633359336217,
"grad_norm": 451.3592834472656,
"loss": 16.5433,
"lr": 0.000574,
"step": 288,
"tokens_trained": 0.141540248
},
{
"epoch": 0.08226366924331607,
"grad_norm": 404.2495422363281,
"loss": 16.4138,
"lr": 0.000578,
"step": 290,
"tokens_trained": 0.142528272
},
{
"epoch": 0.08283100489326999,
"grad_norm": 566.5219116210938,
"loss": 16.4743,
"lr": 0.0005819999999999999,
"step": 292,
"tokens_trained": 0.143513096
},
{
"epoch": 0.08339834054322388,
"grad_norm": 559.6517333984375,
"loss": 16.421,
"lr": 0.0005859999999999999,
"step": 294,
"tokens_trained": 0.144494472
},
{
"epoch": 0.08396567619317778,
"grad_norm": 260.874755859375,
"loss": 11.2214,
"lr": 0.00059,
"step": 296,
"tokens_trained": 0.14547876
},
{
"epoch": 0.0845330118431317,
"grad_norm": 272.02899169921875,
"loss": 10.3491,
"lr": 0.000594,
"step": 298,
"tokens_trained": 0.146465864
},
{
"epoch": 0.08510034749308559,
"grad_norm": 556.9845581054688,
"loss": 10.4348,
"lr": 0.000598,
"step": 300,
"tokens_trained": 0.147446344
},
{
"epoch": 0.0856676831430395,
"grad_norm": 273.35772705078125,
"loss": 8.3292,
"lr": 0.000602,
"step": 302,
"tokens_trained": 0.14843244
},
{
"epoch": 0.0862350187929934,
"grad_norm": 246.6316680908203,
"loss": 9.9362,
"lr": 0.000606,
"step": 304,
"tokens_trained": 0.149415976
},
{
"epoch": 0.0868023544429473,
"grad_norm": 564.4365844726562,
"loss": 9.2621,
"lr": 0.00061,
"step": 306,
"tokens_trained": 0.150398728
},
{
"epoch": 0.08736969009290121,
"grad_norm": 396.0948791503906,
"loss": 11.8526,
"lr": 0.000614,
"step": 308,
"tokens_trained": 0.151385104
},
{
"epoch": 0.08793702574285511,
"grad_norm": 488.6072692871094,
"loss": 11.8473,
"lr": 0.0006180000000000001,
"step": 310,
"tokens_trained": 0.152373672
},
{
"epoch": 0.08850436139280903,
"grad_norm": 346.70660400390625,
"loss": 12.0897,
"lr": 0.000622,
"step": 312,
"tokens_trained": 0.153356256
},
{
"epoch": 0.08907169704276292,
"grad_norm": 382.40679931640625,
"loss": 9.271,
"lr": 0.000626,
"step": 314,
"tokens_trained": 0.154342632
},
{
"epoch": 0.08963903269271682,
"grad_norm": 288.7908935546875,
"loss": 9.185,
"lr": 0.00063,
"step": 316,
"tokens_trained": 0.1553238
},
{
"epoch": 0.09020636834267073,
"grad_norm": 337.5335388183594,
"loss": 12.0555,
"lr": 0.000634,
"step": 318,
"tokens_trained": 0.156313168
},
{
"epoch": 0.09077370399262463,
"grad_norm": 349.25531005859375,
"loss": 8.51,
"lr": 0.000638,
"step": 320,
"tokens_trained": 0.157299448
},
{
"epoch": 0.09134103964257854,
"grad_norm": 471.7824401855469,
"loss": 14.1888,
"lr": 0.000642,
"step": 322,
"tokens_trained": 0.158285264
},
{
"epoch": 0.09190837529253244,
"grad_norm": 284.94036865234375,
"loss": 10.1593,
"lr": 0.000646,
"step": 324,
"tokens_trained": 0.159267512
},
{
"epoch": 0.09247571094248634,
"grad_norm": 510.90478515625,
"loss": 13.5744,
"lr": 0.0006500000000000001,
"step": 326,
"tokens_trained": 0.160250856
},
{
"epoch": 0.09304304659244025,
"grad_norm": 373.82965087890625,
"loss": 8.4999,
"lr": 0.0006540000000000001,
"step": 328,
"tokens_trained": 0.161231832
},
{
"epoch": 0.09361038224239415,
"grad_norm": 219.3827362060547,
"loss": 8.4436,
"lr": 0.0006580000000000001,
"step": 330,
"tokens_trained": 0.162217656
},
{
"epoch": 0.09417771789234806,
"grad_norm": 433.0914001464844,
"loss": 11.2019,
"lr": 0.000662,
"step": 332,
"tokens_trained": 0.163199096
},
{
"epoch": 0.09474505354230196,
"grad_norm": 242.65907287597656,
"loss": 9.0666,
"lr": 0.000666,
"step": 334,
"tokens_trained": 0.164178512
},
{
"epoch": 0.09531238919225588,
"grad_norm": 446.07916259765625,
"loss": 8.6546,
"lr": 0.00067,
"step": 336,
"tokens_trained": 0.165162464
},
{
"epoch": 0.09587972484220977,
"grad_norm": 231.8892364501953,
"loss": 7.5819,
"lr": 0.000674,
"step": 338,
"tokens_trained": 0.166141536
},
{
"epoch": 0.09644706049216367,
"grad_norm": 100.7306137084961,
"loss": 6.7047,
"lr": 0.0006780000000000001,
"step": 340,
"tokens_trained": 0.167123944
},
{
"epoch": 0.09701439614211758,
"grad_norm": 78.11279296875,
"loss": 5.9308,
"lr": 0.0006820000000000001,
"step": 342,
"tokens_trained": 0.168105264
},
{
"epoch": 0.09758173179207148,
"grad_norm": 271.466064453125,
"loss": 6.9141,
"lr": 0.0006860000000000001,
"step": 344,
"tokens_trained": 0.169088912
},
{
"epoch": 0.0981490674420254,
"grad_norm": 252.54478454589844,
"loss": 6.3281,
"lr": 0.00069,
"step": 346,
"tokens_trained": 0.170077368
},
{
"epoch": 0.0987164030919793,
"grad_norm": 305.8559875488281,
"loss": 6.443,
"lr": 0.000694,
"step": 348,
"tokens_trained": 0.171057232
},
{
"epoch": 0.09928373874193319,
"grad_norm": 227.74374389648438,
"loss": 6.552,
"lr": 0.0006979999999999999,
"step": 350,
"tokens_trained": 0.172041376
},
{
"epoch": 0.0998510743918871,
"grad_norm": 446.7601623535156,
"loss": 10.8184,
"lr": 0.0007019999999999999,
"step": 352,
"tokens_trained": 0.173023624
},
{
"epoch": 0.100418410041841,
"grad_norm": 353.0849609375,
"loss": 8.6327,
"lr": 0.0007059999999999999,
"step": 354,
"tokens_trained": 0.174005992
},
{
"epoch": 0.10098574569179491,
"grad_norm": 367.9427185058594,
"loss": 9.3898,
"lr": 0.00071,
"step": 356,
"tokens_trained": 0.174988304
},
{
"epoch": 0.10155308134174881,
"grad_norm": 224.4961700439453,
"loss": 8.284,
"lr": 0.000714,
"step": 358,
"tokens_trained": 0.175969816
},
{
"epoch": 0.10212041699170271,
"grad_norm": 221.86537170410156,
"loss": 7.0578,
"lr": 0.000718,
"step": 360,
"tokens_trained": 0.176952688
},
{
"epoch": 0.10268775264165662,
"grad_norm": 331.0989685058594,
"loss": 6.9561,
"lr": 0.000722,
"step": 362,
"tokens_trained": 0.177935144
},
{
"epoch": 0.10325508829161052,
"grad_norm": 171.6498260498047,
"loss": 7.203,
"lr": 0.000726,
"step": 364,
"tokens_trained": 0.178916776
},
{
"epoch": 0.10382242394156443,
"grad_norm": 284.2208557128906,
"loss": 10.3517,
"lr": 0.00073,
"step": 366,
"tokens_trained": 0.179903432
},
{
"epoch": 0.10438975959151833,
"grad_norm": 354.8574523925781,
"loss": 9.3888,
"lr": 0.000734,
"step": 368,
"tokens_trained": 0.180883224
},
{
"epoch": 0.10495709524147223,
"grad_norm": 344.82574462890625,
"loss": 10.5933,
"lr": 0.000738,
"step": 370,
"tokens_trained": 0.181863808
},
{
"epoch": 0.10552443089142614,
"grad_norm": 302.6838073730469,
"loss": 10.2832,
"lr": 0.000742,
"step": 372,
"tokens_trained": 0.182843712
},
{
"epoch": 0.10609176654138004,
"grad_norm": 323.0387878417969,
"loss": 6.4864,
"lr": 0.000746,
"step": 374,
"tokens_trained": 0.183825832
},
{
"epoch": 0.10637543436635699,
"eval_loss": 1.4430732727050781,
"eval_runtime": 20.5468,
"step": 375,
"tokens_trained": 0.184317744
},
{
"epoch": 0.10665910219133395,
"grad_norm": 133.74822998046875,
"loss": 5.4176,
"lr": 0.00075,
"step": 376,
"tokens_trained": 0.184811352
},
{
"epoch": 0.10722643784128785,
"grad_norm": 180.3372344970703,
"loss": 5.5641,
"lr": 0.000754,
"step": 378,
"tokens_trained": 0.185792528
},
{
"epoch": 0.10779377349124175,
"grad_norm": 250.83999633789062,
"loss": 5.8612,
"lr": 0.000758,
"step": 380,
"tokens_trained": 0.186777112
},
{
"epoch": 0.10836110914119566,
"grad_norm": 293.51959228515625,
"loss": 6.0418,
"lr": 0.000762,
"step": 382,
"tokens_trained": 0.18775724
},
{
"epoch": 0.10892844479114956,
"grad_norm": 292.56207275390625,
"loss": 6.1812,
"lr": 0.0007660000000000001,
"step": 384,
"tokens_trained": 0.188733568
},
{
"epoch": 0.10949578044110347,
"grad_norm": 121.82467651367188,
"loss": 6.0855,
"lr": 0.0007700000000000001,
"step": 386,
"tokens_trained": 0.189718512
},
{
"epoch": 0.11006311609105737,
"grad_norm": 124.30497741699219,
"loss": 5.7734,
"lr": 0.0007740000000000001,
"step": 388,
"tokens_trained": 0.190703776
},
{
"epoch": 0.11063045174101127,
"grad_norm": 143.64004516601562,
"loss": 5.7641,
"lr": 0.000778,
"step": 390,
"tokens_trained": 0.191689888
},
{
"epoch": 0.11119778739096518,
"grad_norm": 160.06784057617188,
"loss": 5.6025,
"lr": 0.000782,
"step": 392,
"tokens_trained": 0.192673992
},
{
"epoch": 0.11176512304091908,
"grad_norm": 226.97988891601562,
"loss": 6.0049,
"lr": 0.000786,
"step": 394,
"tokens_trained": 0.193656272
},
{
"epoch": 0.112332458690873,
"grad_norm": 223.26898193359375,
"loss": 5.6972,
"lr": 0.00079,
"step": 396,
"tokens_trained": 0.194639144
},
{
"epoch": 0.11289979434082689,
"grad_norm": 249.34912109375,
"loss": 5.7348,
"lr": 0.0007940000000000001,
"step": 398,
"tokens_trained": 0.195621256
},
{
"epoch": 0.11346712999078079,
"grad_norm": 161.34271240234375,
"loss": 5.6689,
"lr": 0.0007980000000000001,
"step": 400,
"tokens_trained": 0.196604136
},
{
"epoch": 0.1140344656407347,
"grad_norm": 148.53176879882812,
"loss": 5.702,
"lr": 0.0008020000000000001,
"step": 402,
"tokens_trained": 0.197586784
},
{
"epoch": 0.1146018012906886,
"grad_norm": 144.40835571289062,
"loss": 6.2402,
"lr": 0.0008060000000000001,
"step": 404,
"tokens_trained": 0.198570824
},
{
"epoch": 0.11516913694064251,
"grad_norm": 306.57562255859375,
"loss": 7.1739,
"lr": 0.0008100000000000001,
"step": 406,
"tokens_trained": 0.199548328
},
{
"epoch": 0.11573647259059641,
"grad_norm": 308.79180908203125,
"loss": 6.0972,
"lr": 0.0008139999999999999,
"step": 408,
"tokens_trained": 0.200532496
},
{
"epoch": 0.11630380824055031,
"grad_norm": 197.76791381835938,
"loss": 6.3533,
"lr": 0.0008179999999999999,
"step": 410,
"tokens_trained": 0.201514648
},
{
"epoch": 0.11687114389050422,
"grad_norm": 129.5694580078125,
"loss": 6.9628,
"lr": 0.0008219999999999999,
"step": 412,
"tokens_trained": 0.2024994
},
{
"epoch": 0.11743847954045812,
"grad_norm": 446.0195617675781,
"loss": 11.7562,
"lr": 0.000826,
"step": 414,
"tokens_trained": 0.20348012
},
{
"epoch": 0.11800581519041203,
"grad_norm": 355.5342712402344,
"loss": 8.8055,
"lr": 0.00083,
"step": 416,
"tokens_trained": 0.20446356
},
{
"epoch": 0.11857315084036593,
"grad_norm": 456.2491149902344,
"loss": 9.606,
"lr": 0.000834,
"step": 418,
"tokens_trained": 0.205445288
},
{
"epoch": 0.11914048649031983,
"grad_norm": 369.8676452636719,
"loss": 8.385,
"lr": 0.000838,
"step": 420,
"tokens_trained": 0.206427832
},
{
"epoch": 0.11970782214027374,
"grad_norm": 262.19073486328125,
"loss": 9.0956,
"lr": 0.000842,
"step": 422,
"tokens_trained": 0.207409848
},
{
"epoch": 0.12027515779022764,
"grad_norm": 120.3193130493164,
"loss": 5.4937,
"lr": 0.000846,
"step": 424,
"tokens_trained": 0.208391752
},
{
"epoch": 0.12084249344018155,
"grad_norm": 222.1111297607422,
"loss": 8.9367,
"lr": 0.00085,
"step": 426,
"tokens_trained": 0.20937384
},
{
"epoch": 0.12140982909013545,
"grad_norm": 137.16819763183594,
"loss": 7.5876,
"lr": 0.000854,
"step": 428,
"tokens_trained": 0.210358576
},
{
"epoch": 0.12197716474008935,
"grad_norm": 267.61846923828125,
"loss": 8.817,
"lr": 0.000858,
"step": 430,
"tokens_trained": 0.211340064
},
{
"epoch": 0.12254450039004326,
"grad_norm": 472.72906494140625,
"loss": 8.203,
"lr": 0.000862,
"step": 432,
"tokens_trained": 0.212321144
},
{
"epoch": 0.12311183603999716,
"grad_norm": 297.1420593261719,
"loss": 10.987,
"lr": 0.000866,
"step": 434,
"tokens_trained": 0.213300312
},
{
"epoch": 0.12367917168995107,
"grad_norm": 281.7297668457031,
"loss": 7.6117,
"lr": 0.00087,
"step": 436,
"tokens_trained": 0.214287624
},
{
"epoch": 0.12424650733990497,
"grad_norm": 203.09678649902344,
"loss": 6.5638,
"lr": 0.000874,
"step": 438,
"tokens_trained": 0.215272136
},
{
"epoch": 0.12481384298985887,
"grad_norm": 155.7823944091797,
"loss": 6.1131,
"lr": 0.000878,
"step": 440,
"tokens_trained": 0.216256392
},
{
"epoch": 0.12538117863981277,
"grad_norm": 189.86196899414062,
"loss": 8.2565,
"lr": 0.000882,
"step": 442,
"tokens_trained": 0.217242504
},
{
"epoch": 0.1259485142897667,
"grad_norm": 247.4568634033203,
"loss": 7.1005,
"lr": 0.0008860000000000001,
"step": 444,
"tokens_trained": 0.218226008
},
{
"epoch": 0.1265158499397206,
"grad_norm": 179.72825622558594,
"loss": 6.3379,
"lr": 0.0008900000000000001,
"step": 446,
"tokens_trained": 0.219210584
},
{
"epoch": 0.1270831855896745,
"grad_norm": 212.96356201171875,
"loss": 7.2514,
"lr": 0.000894,
"step": 448,
"tokens_trained": 0.220193952
},
{
"epoch": 0.1276505212396284,
"grad_norm": 105.67095947265625,
"loss": 5.456,
"lr": 0.000898,
"step": 450,
"tokens_trained": 0.221176936
},
{
"epoch": 0.1282178568895823,
"grad_norm": 302.9122619628906,
"loss": 6.4018,
"lr": 0.000902,
"step": 452,
"tokens_trained": 0.222161952
},
{
"epoch": 0.12878519253953621,
"grad_norm": 215.66561889648438,
"loss": 6.2853,
"lr": 0.000906,
"step": 454,
"tokens_trained": 0.223144912
},
{
"epoch": 0.1293525281894901,
"grad_norm": 272.9984130859375,
"loss": 7.3902,
"lr": 0.00091,
"step": 456,
"tokens_trained": 0.224127392
},
{
"epoch": 0.129919863839444,
"grad_norm": 200.7503662109375,
"loss": 6.1637,
"lr": 0.0009140000000000001,
"step": 458,
"tokens_trained": 0.22511648
},
{
"epoch": 0.1304871994893979,
"grad_norm": 93.23990631103516,
"loss": 6.4867,
"lr": 0.0009180000000000001,
"step": 460,
"tokens_trained": 0.226098144
},
{
"epoch": 0.1310545351393518,
"grad_norm": 274.37164306640625,
"loss": 8.99,
"lr": 0.0009220000000000001,
"step": 462,
"tokens_trained": 0.227081848
},
{
"epoch": 0.13162187078930573,
"grad_norm": 186.66322326660156,
"loss": 8.7122,
"lr": 0.0009260000000000001,
"step": 464,
"tokens_trained": 0.22806636
},
{
"epoch": 0.13218920643925963,
"grad_norm": 586.1035766601562,
"loss": 9.1045,
"lr": 0.00093,
"step": 466,
"tokens_trained": 0.229047872
},
{
"epoch": 0.13275654208921353,
"grad_norm": 227.55996704101562,
"loss": 9.7276,
"lr": 0.000934,
"step": 468,
"tokens_trained": 0.230031144
},
{
"epoch": 0.13332387773916743,
"grad_norm": 229.26609802246094,
"loss": 6.6244,
"lr": 0.0009379999999999999,
"step": 470,
"tokens_trained": 0.2310158
},
{
"epoch": 0.13389121338912133,
"grad_norm": 145.16331481933594,
"loss": 5.759,
"lr": 0.000942,
"step": 472,
"tokens_trained": 0.2319996
},
{
"epoch": 0.13445854903907525,
"grad_norm": 109.9937744140625,
"loss": 5.4838,
"lr": 0.000946,
"step": 474,
"tokens_trained": 0.232983808
},
{
"epoch": 0.13502588468902915,
"grad_norm": 135.74899291992188,
"loss": 6.2738,
"lr": 0.00095,
"step": 476,
"tokens_trained": 0.233963016
},
{
"epoch": 0.13559322033898305,
"grad_norm": 142.99449157714844,
"loss": 5.8459,
"lr": 0.000954,
"step": 478,
"tokens_trained": 0.234948864
},
{
"epoch": 0.13616055598893695,
"grad_norm": 198.66883850097656,
"loss": 6.6626,
"lr": 0.000958,
"step": 480,
"tokens_trained": 0.235932392
},
{
"epoch": 0.13672789163889085,
"grad_norm": 260.76507568359375,
"loss": 6.9299,
"lr": 0.000962,
"step": 482,
"tokens_trained": 0.236915664
},
{
"epoch": 0.13729522728884477,
"grad_norm": 267.97589111328125,
"loss": 6.4343,
"lr": 0.000966,
"step": 484,
"tokens_trained": 0.237896904
},
{
"epoch": 0.13786256293879867,
"grad_norm": 89.8781967163086,
"loss": 6.3203,
"lr": 0.0009699999999999999,
"step": 486,
"tokens_trained": 0.238874528
},
{
"epoch": 0.13842989858875257,
"grad_norm": 225.62985229492188,
"loss": 6.2778,
"lr": 0.000974,
"step": 488,
"tokens_trained": 0.2398588
},
{
"epoch": 0.13899723423870647,
"grad_norm": 85.84110260009766,
"loss": 5.2786,
"lr": 0.000978,
"step": 490,
"tokens_trained": 0.240839968
},
{
"epoch": 0.13956456988866037,
"grad_norm": 141.4368438720703,
"loss": 5.5525,
"lr": 0.000982,
"step": 492,
"tokens_trained": 0.241823544
},
{
"epoch": 0.1401319055386143,
"grad_norm": 94.9535140991211,
"loss": 5.4386,
"lr": 0.0009860000000000001,
"step": 494,
"tokens_trained": 0.242805456
},
{
"epoch": 0.1406992411885682,
"grad_norm": 157.4557647705078,
"loss": 5.9786,
"lr": 0.00099,
"step": 496,
"tokens_trained": 0.243792496
},
{
"epoch": 0.1412665768385221,
"grad_norm": 319.5025634765625,
"loss": 7.04,
"lr": 0.000994,
"step": 498,
"tokens_trained": 0.244772472
},
{
"epoch": 0.141833912488476,
"grad_norm": 282.26824951171875,
"loss": 9.4037,
"lr": 0.000998,
"step": 500,
"tokens_trained": 0.245758968
},
{
"epoch": 0.141833912488476,
"eval_loss": 2.152184247970581,
"eval_runtime": 21.2772,
"step": 500,
"tokens_trained": 0.245758968
},
{
"epoch": 0.1424012481384299,
"grad_norm": 306.0666809082031,
"loss": 7.8845,
"lr": 0.00099986013986014,
"step": 502,
"tokens_trained": 0.246739024
},
{
"epoch": 0.1429685837883838,
"grad_norm": 188.89024353027344,
"loss": 6.8118,
"lr": 0.0009995804195804196,
"step": 504,
"tokens_trained": 0.247726552
},
{
"epoch": 0.1435359194383377,
"grad_norm": 228.97474670410156,
"loss": 6.8475,
"lr": 0.0009993006993006994,
"step": 506,
"tokens_trained": 0.24870688
},
{
"epoch": 0.1441032550882916,
"grad_norm": 229.80029296875,
"loss": 6.2171,
"lr": 0.000999020979020979,
"step": 508,
"tokens_trained": 0.249689096
},
{
"epoch": 0.1446705907382455,
"grad_norm": 157.30340576171875,
"loss": 6.2281,
"lr": 0.0009987412587412587,
"step": 510,
"tokens_trained": 0.250671768
},
{
"epoch": 0.1452379263881994,
"grad_norm": 176.64683532714844,
"loss": 6.5993,
"lr": 0.0009984615384615386,
"step": 512,
"tokens_trained": 0.25165608
},
{
"epoch": 0.14580526203815333,
"grad_norm": 197.20526123046875,
"loss": 5.7267,
"lr": 0.0009981818181818182,
"step": 514,
"tokens_trained": 0.252639712
},
{
"epoch": 0.14637259768810723,
"grad_norm": 54.713260650634766,
"loss": 5.7911,
"lr": 0.000997902097902098,
"step": 516,
"tokens_trained": 0.253622816
},
{
"epoch": 0.14693993333806113,
"grad_norm": 185.74923706054688,
"loss": 7.0055,
"lr": 0.0009976223776223777,
"step": 518,
"tokens_trained": 0.254602792
},
{
"epoch": 0.14750726898801503,
"grad_norm": 240.31021118164062,
"loss": 6.452,
"lr": 0.0009973426573426573,
"step": 520,
"tokens_trained": 0.255584736
},
{
"epoch": 0.14807460463796893,
"grad_norm": 160.2477264404297,
"loss": 7.6556,
"lr": 0.000997062937062937,
"step": 522,
"tokens_trained": 0.256563792
},
{
"epoch": 0.14864194028792285,
"grad_norm": 283.0034484863281,
"loss": 6.5345,
"lr": 0.0009967832167832168,
"step": 524,
"tokens_trained": 0.257546656
},
{
"epoch": 0.14920927593787675,
"grad_norm": 245.537109375,
"loss": 6.3281,
"lr": 0.0009965034965034964,
"step": 526,
"tokens_trained": 0.258530832
},
{
"epoch": 0.14977661158783065,
"grad_norm": 162.1538848876953,
"loss": 7.4072,
"lr": 0.0009962237762237763,
"step": 528,
"tokens_trained": 0.259514528
},
{
"epoch": 0.15034394723778455,
"grad_norm": 107.25792694091797,
"loss": 5.356,
"lr": 0.000995944055944056,
"step": 530,
"tokens_trained": 0.260500912
},
{
"epoch": 0.15091128288773845,
"grad_norm": 173.73353576660156,
"loss": 6.8625,
"lr": 0.0009956643356643356,
"step": 532,
"tokens_trained": 0.26148632
},
{
"epoch": 0.15147861853769237,
"grad_norm": 178.33541870117188,
"loss": 5.8794,
"lr": 0.0009953846153846154,
"step": 534,
"tokens_trained": 0.262468816
},
{
"epoch": 0.15204595418764627,
"grad_norm": 181.2533416748047,
"loss": 7.0243,
"lr": 0.000995104895104895,
"step": 536,
"tokens_trained": 0.263446696
},
{
"epoch": 0.15261328983760017,
"grad_norm": 208.79293823242188,
"loss": 5.8908,
"lr": 0.000994825174825175,
"step": 538,
"tokens_trained": 0.26443108
},
{
"epoch": 0.15318062548755407,
"grad_norm": 148.66285705566406,
"loss": 6.0831,
"lr": 0.0009945454545454546,
"step": 540,
"tokens_trained": 0.265414496
},
{
"epoch": 0.15374796113750797,
"grad_norm": 165.044189453125,
"loss": 5.5594,
"lr": 0.0009942657342657344,
"step": 542,
"tokens_trained": 0.266394128
},
{
"epoch": 0.1543152967874619,
"grad_norm": 124.5405502319336,
"loss": 5.2442,
"lr": 0.000993986013986014,
"step": 544,
"tokens_trained": 0.267378768
},
{
"epoch": 0.1548826324374158,
"grad_norm": 68.66510772705078,
"loss": 5.1173,
"lr": 0.0009937062937062937,
"step": 546,
"tokens_trained": 0.268360184
},
{
"epoch": 0.1554499680873697,
"grad_norm": 57.052860260009766,
"loss": 5.2348,
"lr": 0.0009934265734265735,
"step": 548,
"tokens_trained": 0.269345672
},
{
"epoch": 0.1560173037373236,
"grad_norm": 184.9175567626953,
"loss": 6.7748,
"lr": 0.0009931468531468532,
"step": 550,
"tokens_trained": 0.2703288
},
{
"epoch": 0.15658463938727749,
"grad_norm": 72.9861831665039,
"loss": 5.7387,
"lr": 0.000992867132867133,
"step": 552,
"tokens_trained": 0.271309176
},
{
"epoch": 0.1571519750372314,
"grad_norm": 135.864501953125,
"loss": 6.3035,
"lr": 0.0009925874125874127,
"step": 554,
"tokens_trained": 0.27229644
},
{
"epoch": 0.1577193106871853,
"grad_norm": 130.579833984375,
"loss": 5.4434,
"lr": 0.0009923076923076923,
"step": 556,
"tokens_trained": 0.273277904
},
{
"epoch": 0.1582866463371392,
"grad_norm": 206.77345275878906,
"loss": 5.8649,
"lr": 0.000992027972027972,
"step": 558,
"tokens_trained": 0.274261712
},
{
"epoch": 0.1588539819870931,
"grad_norm": 144.0505828857422,
"loss": 5.3459,
"lr": 0.0009917482517482518,
"step": 560,
"tokens_trained": 0.2752468
},
{
"epoch": 0.159421317637047,
"grad_norm": 87.56634521484375,
"loss": 5.6321,
"lr": 0.0009914685314685314,
"step": 562,
"tokens_trained": 0.276232384
},
{
"epoch": 0.15998865328700093,
"grad_norm": 275.2727355957031,
"loss": 6.7515,
"lr": 0.0009911888111888113,
"step": 564,
"tokens_trained": 0.277211608
},
{
"epoch": 0.16055598893695483,
"grad_norm": 97.00019836425781,
"loss": 5.4374,
"lr": 0.000990909090909091,
"step": 566,
"tokens_trained": 0.278196336
},
{
"epoch": 0.16112332458690873,
"grad_norm": 102.91439056396484,
"loss": 5.729,
"lr": 0.0009906293706293705,
"step": 568,
"tokens_trained": 0.279175672
},
{
"epoch": 0.16169066023686263,
"grad_norm": 151.12432861328125,
"loss": 5.4189,
"lr": 0.0009903496503496504,
"step": 570,
"tokens_trained": 0.280161088
},
{
"epoch": 0.16225799588681653,
"grad_norm": 86.6823959350586,
"loss": 5.1704,
"lr": 0.00099006993006993,
"step": 572,
"tokens_trained": 0.28114256
},
{
"epoch": 0.16282533153677045,
"grad_norm": 90.7052230834961,
"loss": 5.3673,
"lr": 0.0009897902097902099,
"step": 574,
"tokens_trained": 0.282128904
},
{
"epoch": 0.16339266718672435,
"grad_norm": 146.92874145507812,
"loss": 5.5971,
"lr": 0.0009895104895104895,
"step": 576,
"tokens_trained": 0.28311528
},
{
"epoch": 0.16396000283667825,
"grad_norm": 189.76296997070312,
"loss": 5.3109,
"lr": 0.0009892307692307694,
"step": 578,
"tokens_trained": 0.284098528
},
{
"epoch": 0.16452733848663215,
"grad_norm": 174.48092651367188,
"loss": 5.68,
"lr": 0.000988951048951049,
"step": 580,
"tokens_trained": 0.285081064
},
{
"epoch": 0.16509467413658604,
"grad_norm": 154.10816955566406,
"loss": 5.3307,
"lr": 0.0009886713286713286,
"step": 582,
"tokens_trained": 0.286067952
},
{
"epoch": 0.16566200978653997,
"grad_norm": 64.28263092041016,
"loss": 5.1676,
"lr": 0.0009883916083916085,
"step": 584,
"tokens_trained": 0.287051384
},
{
"epoch": 0.16622934543649387,
"grad_norm": 103.81795501708984,
"loss": 5.3436,
"lr": 0.0009881118881118881,
"step": 586,
"tokens_trained": 0.28803284
},
{
"epoch": 0.16679668108644777,
"grad_norm": 144.0076904296875,
"loss": 5.3033,
"lr": 0.000987832167832168,
"step": 588,
"tokens_trained": 0.289014824
},
{
"epoch": 0.16736401673640167,
"grad_norm": 88.31237030029297,
"loss": 5.0609,
"lr": 0.0009875524475524476,
"step": 590,
"tokens_trained": 0.289999864
},
{
"epoch": 0.16793135238635556,
"grad_norm": 68.4583740234375,
"loss": 5.0702,
"lr": 0.0009872727272727273,
"step": 592,
"tokens_trained": 0.290983888
},
{
"epoch": 0.1684986880363095,
"grad_norm": 135.28665161132812,
"loss": 5.3962,
"lr": 0.000986993006993007,
"step": 594,
"tokens_trained": 0.291965752
},
{
"epoch": 0.1690660236862634,
"grad_norm": 80.0412368774414,
"loss": 5.0246,
"lr": 0.0009867132867132867,
"step": 596,
"tokens_trained": 0.292946952
},
{
"epoch": 0.1696333593362173,
"grad_norm": 43.29194641113281,
"loss": 5.0051,
"lr": 0.0009864335664335664,
"step": 598,
"tokens_trained": 0.293928976
},
{
"epoch": 0.17020069498617119,
"grad_norm": 220.88687133789062,
"loss": 6.0798,
"lr": 0.0009861538461538462,
"step": 600,
"tokens_trained": 0.294912408
},
{
"epoch": 0.17076803063612508,
"grad_norm": 102.58654022216797,
"loss": 5.1271,
"lr": 0.0009858741258741259,
"step": 602,
"tokens_trained": 0.29589416
},
{
"epoch": 0.171335366286079,
"grad_norm": 119.0067138671875,
"loss": 5.7402,
"lr": 0.0009855944055944055,
"step": 604,
"tokens_trained": 0.296878584
},
{
"epoch": 0.1719027019360329,
"grad_norm": 138.8656005859375,
"loss": 5.1951,
"lr": 0.0009853146853146854,
"step": 606,
"tokens_trained": 0.297864552
},
{
"epoch": 0.1724700375859868,
"grad_norm": 73.5890884399414,
"loss": 5.2522,
"lr": 0.000985034965034965,
"step": 608,
"tokens_trained": 0.298854088
},
{
"epoch": 0.1730373732359407,
"grad_norm": 113.78330993652344,
"loss": 5.6683,
"lr": 0.0009847552447552449,
"step": 610,
"tokens_trained": 0.299835024
},
{
"epoch": 0.1736047088858946,
"grad_norm": 125.20297241210938,
"loss": 5.1812,
"lr": 0.0009844755244755245,
"step": 612,
"tokens_trained": 0.30082032
},
{
"epoch": 0.17417204453584853,
"grad_norm": 67.46041870117188,
"loss": 5.0417,
"lr": 0.0009841958041958043,
"step": 614,
"tokens_trained": 0.301808456
},
{
"epoch": 0.17473938018580243,
"grad_norm": 117.30754852294922,
"loss": 5.3064,
"lr": 0.000983916083916084,
"step": 616,
"tokens_trained": 0.302794456
},
{
"epoch": 0.17530671583575633,
"grad_norm": 124.30754089355469,
"loss": 5.1614,
"lr": 0.0009836363636363636,
"step": 618,
"tokens_trained": 0.303777376
},
{
"epoch": 0.17587405148571023,
"grad_norm": 102.72042083740234,
"loss": 5.1265,
"lr": 0.0009833566433566435,
"step": 620,
"tokens_trained": 0.304758864
},
{
"epoch": 0.17644138713566412,
"grad_norm": 39.332252502441406,
"loss": 5.1078,
"lr": 0.000983076923076923,
"step": 622,
"tokens_trained": 0.30574392
},
{
"epoch": 0.17700872278561805,
"grad_norm": 153.84811401367188,
"loss": 5.7696,
"lr": 0.000982797202797203,
"step": 624,
"tokens_trained": 0.306727584
},
{
"epoch": 0.17729239061059499,
"eval_loss": 1.3463915586471558,
"eval_runtime": 20.8357,
"step": 625,
"tokens_trained": 0.307220496
},
{
"epoch": 0.17757605843557195,
"grad_norm": 160.2552490234375,
"loss": 5.2283,
"lr": 0.0009825174825174826,
"step": 626,
"tokens_trained": 0.307713024
},
{
"epoch": 0.17814339408552585,
"grad_norm": 186.77407836914062,
"loss": 5.2866,
"lr": 0.0009822377622377622,
"step": 628,
"tokens_trained": 0.308700128
},
{
"epoch": 0.17871072973547975,
"grad_norm": 84.55519104003906,
"loss": 5.1106,
"lr": 0.0009819580419580419,
"step": 630,
"tokens_trained": 0.309681208
},
{
"epoch": 0.17927806538543364,
"grad_norm": 20.617040634155273,
"loss": 4.8327,
"lr": 0.0009816783216783217,
"step": 632,
"tokens_trained": 0.310662224
},
{
"epoch": 0.17984540103538757,
"grad_norm": 168.06039428710938,
"loss": 6.0704,
"lr": 0.0009813986013986014,
"step": 634,
"tokens_trained": 0.31164064
},
{
"epoch": 0.18041273668534147,
"grad_norm": 238.23736572265625,
"loss": 5.6188,
"lr": 0.0009811188811188812,
"step": 636,
"tokens_trained": 0.312622568
},
{
"epoch": 0.18098007233529537,
"grad_norm": 140.0707550048828,
"loss": 6.4034,
"lr": 0.0009808391608391608,
"step": 638,
"tokens_trained": 0.313604944
},
{
"epoch": 0.18154740798524927,
"grad_norm": 161.19302368164062,
"loss": 5.4906,
"lr": 0.0009805594405594405,
"step": 640,
"tokens_trained": 0.314592072
},
{
"epoch": 0.18211474363520316,
"grad_norm": 121.9577407836914,
"loss": 5.2097,
"lr": 0.0009802797202797203,
"step": 642,
"tokens_trained": 0.315574392
},
{
"epoch": 0.1826820792851571,
"grad_norm": 121.25574493408203,
"loss": 5.0317,
"lr": 0.00098,
"step": 644,
"tokens_trained": 0.316559008
},
{
"epoch": 0.183249414935111,
"grad_norm": 28.328269958496094,
"loss": 4.932,
"lr": 0.0009797202797202798,
"step": 646,
"tokens_trained": 0.317538776
},
{
"epoch": 0.1838167505850649,
"grad_norm": 127.77408599853516,
"loss": 5.8335,
"lr": 0.0009794405594405595,
"step": 648,
"tokens_trained": 0.31851792
},
{
"epoch": 0.18438408623501878,
"grad_norm": 94.9522933959961,
"loss": 5.1948,
"lr": 0.000979160839160839,
"step": 650,
"tokens_trained": 0.319501576
},
{
"epoch": 0.18495142188497268,
"grad_norm": 110.33658599853516,
"loss": 5.098,
"lr": 0.000978881118881119,
"step": 652,
"tokens_trained": 0.320482392
},
{
"epoch": 0.1855187575349266,
"grad_norm": 67.23124694824219,
"loss": 4.7723,
"lr": 0.0009786013986013986,
"step": 654,
"tokens_trained": 0.32146712
},
{
"epoch": 0.1860860931848805,
"grad_norm": 61.519866943359375,
"loss": 4.7245,
"lr": 0.0009783216783216782,
"step": 656,
"tokens_trained": 0.322449576
},
{
"epoch": 0.1866534288348344,
"grad_norm": 99.51078033447266,
"loss": 4.783,
"lr": 0.000978041958041958,
"step": 658,
"tokens_trained": 0.323432688
},
{
"epoch": 0.1872207644847883,
"grad_norm": 44.619197845458984,
"loss": 4.7495,
"lr": 0.000977762237762238,
"step": 660,
"tokens_trained": 0.324413952
},
{
"epoch": 0.18778810013474223,
"grad_norm": 114.5891342163086,
"loss": 5.1261,
"lr": 0.0009774825174825176,
"step": 662,
"tokens_trained": 0.325394536
},
{
"epoch": 0.18835543578469613,
"grad_norm": 100.3728256225586,
"loss": 4.7883,
"lr": 0.0009772027972027972,
"step": 664,
"tokens_trained": 0.326374672
},
{
"epoch": 0.18892277143465003,
"grad_norm": 51.883033752441406,
"loss": 4.7249,
"lr": 0.0009769230769230768,
"step": 666,
"tokens_trained": 0.327357152
},
{
"epoch": 0.18949010708460393,
"grad_norm": 82.27507019042969,
"loss": 4.8277,
"lr": 0.0009766433566433567,
"step": 668,
"tokens_trained": 0.328342088
},
{
"epoch": 0.19005744273455782,
"grad_norm": 83.53064727783203,
"loss": 4.8338,
"lr": 0.0009763636363636363,
"step": 670,
"tokens_trained": 0.329319248
},
{
"epoch": 0.19062477838451175,
"grad_norm": 76.18387603759766,
"loss": 4.6958,
"lr": 0.0009760839160839161,
"step": 672,
"tokens_trained": 0.330305968
},
{
"epoch": 0.19119211403446565,
"grad_norm": 27.401426315307617,
"loss": 4.6929,
"lr": 0.0009758041958041958,
"step": 674,
"tokens_trained": 0.3312912
},
{
"epoch": 0.19175944968441955,
"grad_norm": 186.770263671875,
"loss": 5.5089,
"lr": 0.0009755244755244756,
"step": 676,
"tokens_trained": 0.332275224
},
{
"epoch": 0.19232678533437345,
"grad_norm": 105.02385711669922,
"loss": 4.8876,
"lr": 0.0009752447552447553,
"step": 678,
"tokens_trained": 0.33325588
},
{
"epoch": 0.19289412098432734,
"grad_norm": 94.96269989013672,
"loss": 5.1235,
"lr": 0.0009749650349650349,
"step": 680,
"tokens_trained": 0.334238408
},
{
"epoch": 0.19346145663428127,
"grad_norm": 92.29356384277344,
"loss": 4.8194,
"lr": 0.0009746853146853148,
"step": 682,
"tokens_trained": 0.335219368
},
{
"epoch": 0.19402879228423517,
"grad_norm": 59.1584358215332,
"loss": 4.7511,
"lr": 0.0009744055944055944,
"step": 684,
"tokens_trained": 0.336207136
},
{
"epoch": 0.19459612793418907,
"grad_norm": 54.759002685546875,
"loss": 4.777,
"lr": 0.0009741258741258742,
"step": 686,
"tokens_trained": 0.337193536
},
{
"epoch": 0.19516346358414297,
"grad_norm": 92.20452880859375,
"loss": 4.8225,
"lr": 0.0009738461538461538,
"step": 688,
"tokens_trained": 0.338179224
},
{
"epoch": 0.19573079923409686,
"grad_norm": 75.97005462646484,
"loss": 4.655,
"lr": 0.0009735664335664336,
"step": 690,
"tokens_trained": 0.339162168
},
{
"epoch": 0.1962981348840508,
"grad_norm": 58.19076919555664,
"loss": 4.6446,
"lr": 0.0009732867132867133,
"step": 692,
"tokens_trained": 0.340138904
},
{
"epoch": 0.1968654705340047,
"grad_norm": 50.81512451171875,
"loss": 4.5866,
"lr": 0.000973006993006993,
"step": 694,
"tokens_trained": 0.34112288
},
{
"epoch": 0.1974328061839586,
"grad_norm": 61.683372497558594,
"loss": 4.6018,
"lr": 0.0009727272727272728,
"step": 696,
"tokens_trained": 0.342111992
},
{
"epoch": 0.19800014183391249,
"grad_norm": 61.01798629760742,
"loss": 4.6007,
"lr": 0.0009724475524475524,
"step": 698,
"tokens_trained": 0.343095912
},
{
"epoch": 0.19856747748386638,
"grad_norm": 96.49671936035156,
"loss": 4.7035,
"lr": 0.0009721678321678323,
"step": 700,
"tokens_trained": 0.344078632
},
{
"epoch": 0.1991348131338203,
"grad_norm": 64.7771224975586,
"loss": 4.8341,
"lr": 0.0009718881118881119,
"step": 702,
"tokens_trained": 0.345060576
},
{
"epoch": 0.1997021487837742,
"grad_norm": 90.1478042602539,
"loss": 4.7739,
"lr": 0.0009716083916083917,
"step": 704,
"tokens_trained": 0.34604112
},
{
"epoch": 0.2002694844337281,
"grad_norm": 67.6308822631836,
"loss": 4.6218,
"lr": 0.0009713286713286713,
"step": 706,
"tokens_trained": 0.347023496
},
{
"epoch": 0.200836820083682,
"grad_norm": 40.50175094604492,
"loss": 4.6008,
"lr": 0.000971048951048951,
"step": 708,
"tokens_trained": 0.348005416
},
{
"epoch": 0.2014041557336359,
"grad_norm": 33.6448860168457,
"loss": 4.5307,
"lr": 0.0009707692307692308,
"step": 710,
"tokens_trained": 0.3489886
},
{
"epoch": 0.20197149138358983,
"grad_norm": 15.484851837158203,
"loss": 4.5065,
"lr": 0.0009704895104895105,
"step": 712,
"tokens_trained": 0.34997024
},
{
"epoch": 0.20253882703354373,
"grad_norm": 109.26301574707031,
"loss": 4.9613,
"lr": 0.0009702097902097903,
"step": 714,
"tokens_trained": 0.350958496
},
{
"epoch": 0.20310616268349763,
"grad_norm": 150.07492065429688,
"loss": 4.8507,
"lr": 0.0009699300699300699,
"step": 716,
"tokens_trained": 0.35193892
},
{
"epoch": 0.20367349833345152,
"grad_norm": 113.43978881835938,
"loss": 5.4494,
"lr": 0.0009696503496503498,
"step": 718,
"tokens_trained": 0.35291908
},
{
"epoch": 0.20424083398340542,
"grad_norm": 123.0071792602539,
"loss": 4.9475,
"lr": 0.0009693706293706294,
"step": 720,
"tokens_trained": 0.353896072
},
{
"epoch": 0.20480816963335935,
"grad_norm": 65.55500793457031,
"loss": 4.7585,
"lr": 0.0009690909090909091,
"step": 722,
"tokens_trained": 0.354878992
},
{
"epoch": 0.20537550528331325,
"grad_norm": 36.11159896850586,
"loss": 4.6323,
"lr": 0.0009688111888111888,
"step": 724,
"tokens_trained": 0.355863728
},
{
"epoch": 0.20594284093326715,
"grad_norm": 30.566436767578125,
"loss": 4.53,
"lr": 0.0009685314685314685,
"step": 726,
"tokens_trained": 0.356845272
},
{
"epoch": 0.20651017658322104,
"grad_norm": 59.01853561401367,
"loss": 4.5283,
"lr": 0.0009682517482517483,
"step": 728,
"tokens_trained": 0.357826656
},
{
"epoch": 0.20707751223317494,
"grad_norm": 91.78115844726562,
"loss": 4.6149,
"lr": 0.000967972027972028,
"step": 730,
"tokens_trained": 0.358809896
},
{
"epoch": 0.20764484788312887,
"grad_norm": 67.97398376464844,
"loss": 4.617,
"lr": 0.0009676923076923078,
"step": 732,
"tokens_trained": 0.359788736
},
{
"epoch": 0.20821218353308277,
"grad_norm": 42.82001876831055,
"loss": 4.6134,
"lr": 0.0009674125874125874,
"step": 734,
"tokens_trained": 0.360771744
},
{
"epoch": 0.20877951918303667,
"grad_norm": 63.52122116088867,
"loss": 4.6995,
"lr": 0.0009671328671328672,
"step": 736,
"tokens_trained": 0.361757656
},
{
"epoch": 0.20934685483299056,
"grad_norm": 116.39544677734375,
"loss": 4.7153,
"lr": 0.0009668531468531469,
"step": 738,
"tokens_trained": 0.362744008
},
{
"epoch": 0.20991419048294446,
"grad_norm": 40.74269485473633,
"loss": 4.7978,
"lr": 0.0009665734265734266,
"step": 740,
"tokens_trained": 0.36372872
},
{
"epoch": 0.2104815261328984,
"grad_norm": 114.29917907714844,
"loss": 5.1683,
"lr": 0.0009662937062937063,
"step": 742,
"tokens_trained": 0.364710536
},
{
"epoch": 0.2110488617828523,
"grad_norm": 115.83326721191406,
"loss": 4.7642,
"lr": 0.000966013986013986,
"step": 744,
"tokens_trained": 0.3656912
},
{
"epoch": 0.21161619743280619,
"grad_norm": 21.708093643188477,
"loss": 4.8244,
"lr": 0.0009657342657342657,
"step": 746,
"tokens_trained": 0.36667388
},
{
"epoch": 0.21218353308276008,
"grad_norm": 182.01918029785156,
"loss": 5.6045,
"lr": 0.0009654545454545455,
"step": 748,
"tokens_trained": 0.3676634
},
{
"epoch": 0.21275086873271398,
"grad_norm": 47.119319915771484,
"loss": 4.7929,
"lr": 0.0009651748251748252,
"step": 750,
"tokens_trained": 0.368647288
},
{
"epoch": 0.21275086873271398,
"eval_loss": 1.2186306715011597,
"eval_runtime": 20.9362,
"step": 750,
"tokens_trained": 0.368647288
},
{
"epoch": 0.2133182043826679,
"grad_norm": 51.43566131591797,
"loss": 4.7298,
"lr": 0.0009648951048951049,
"step": 752,
"tokens_trained": 0.36962992
},
{
"epoch": 0.2138855400326218,
"grad_norm": 79.49323272705078,
"loss": 5.0749,
"lr": 0.0009646153846153846,
"step": 754,
"tokens_trained": 0.370616064
},
{
"epoch": 0.2144528756825757,
"grad_norm": 119.80200958251953,
"loss": 4.8198,
"lr": 0.0009643356643356644,
"step": 756,
"tokens_trained": 0.371596208
},
{
"epoch": 0.2150202113325296,
"grad_norm": 95.88092041015625,
"loss": 4.7437,
"lr": 0.0009640559440559441,
"step": 758,
"tokens_trained": 0.372579584
},
{
"epoch": 0.2155875469824835,
"grad_norm": 79.64202117919922,
"loss": 4.9181,
"lr": 0.0009637762237762237,
"step": 760,
"tokens_trained": 0.373563056
},
{
"epoch": 0.21615488263243743,
"grad_norm": 79.93920135498047,
"loss": 4.6393,
"lr": 0.0009634965034965035,
"step": 762,
"tokens_trained": 0.374547648
},
{
"epoch": 0.21672221828239133,
"grad_norm": 78.67620849609375,
"loss": 4.6178,
"lr": 0.0009632167832167832,
"step": 764,
"tokens_trained": 0.375531456
},
{
"epoch": 0.21728955393234523,
"grad_norm": 56.32818603515625,
"loss": 4.6498,
"lr": 0.000962937062937063,
"step": 766,
"tokens_trained": 0.376516896
},
{
"epoch": 0.21785688958229912,
"grad_norm": 45.35737228393555,
"loss": 4.5812,
"lr": 0.0009626573426573427,
"step": 768,
"tokens_trained": 0.377499752
},
{
"epoch": 0.21842422523225302,
"grad_norm": 58.13076400756836,
"loss": 4.5793,
"lr": 0.0009623776223776224,
"step": 770,
"tokens_trained": 0.37848276
},
{
"epoch": 0.21899156088220695,
"grad_norm": 55.620628356933594,
"loss": 4.4865,
"lr": 0.0009620979020979021,
"step": 772,
"tokens_trained": 0.379466296
},
{
"epoch": 0.21955889653216085,
"grad_norm": 77.26813507080078,
"loss": 4.5671,
"lr": 0.0009618181818181818,
"step": 774,
"tokens_trained": 0.380449888
},
{
"epoch": 0.22012623218211474,
"grad_norm": 45.00653839111328,
"loss": 4.5923,
"lr": 0.0009615384615384616,
"step": 776,
"tokens_trained": 0.381430352
},
{
"epoch": 0.22069356783206864,
"grad_norm": 52.77407455444336,
"loss": 4.5094,
"lr": 0.0009612587412587412,
"step": 778,
"tokens_trained": 0.382416152
},
{
"epoch": 0.22126090348202254,
"grad_norm": 36.721073150634766,
"loss": 4.4536,
"lr": 0.000960979020979021,
"step": 780,
"tokens_trained": 0.383396672
},
{
"epoch": 0.22182823913197647,
"grad_norm": 51.21247100830078,
"loss": 4.4599,
"lr": 0.0009606993006993007,
"step": 782,
"tokens_trained": 0.384380584
},
{
"epoch": 0.22239557478193037,
"grad_norm": 65.23794555664062,
"loss": 4.5397,
"lr": 0.0009604195804195805,
"step": 784,
"tokens_trained": 0.385361368
},
{
"epoch": 0.22296291043188426,
"grad_norm": 23.255144119262695,
"loss": 4.5007,
"lr": 0.0009601398601398602,
"step": 786,
"tokens_trained": 0.386341416
},
{
"epoch": 0.22353024608183816,
"grad_norm": 30.812740325927734,
"loss": 4.5239,
"lr": 0.0009598601398601398,
"step": 788,
"tokens_trained": 0.387324624
},
{
"epoch": 0.22409758173179206,
"grad_norm": 50.781219482421875,
"loss": 4.5131,
"lr": 0.0009595804195804196,
"step": 790,
"tokens_trained": 0.388312744
},
{
"epoch": 0.224664917381746,
"grad_norm": 47.88816452026367,
"loss": 4.4622,
"lr": 0.0009593006993006993,
"step": 792,
"tokens_trained": 0.38929852
},
{
"epoch": 0.22523225303169989,
"grad_norm": 49.32049560546875,
"loss": 4.5053,
"lr": 0.0009590209790209791,
"step": 794,
"tokens_trained": 0.390279792
},
{
"epoch": 0.22579958868165378,
"grad_norm": 36.98805618286133,
"loss": 4.5144,
"lr": 0.0009587412587412587,
"step": 796,
"tokens_trained": 0.391258904
},
{
"epoch": 0.22636692433160768,
"grad_norm": 24.88475799560547,
"loss": 4.4992,
"lr": 0.0009584615384615385,
"step": 798,
"tokens_trained": 0.392238976
},
{
"epoch": 0.22693425998156158,
"grad_norm": 38.89309310913086,
"loss": 4.4853,
"lr": 0.0009581818181818182,
"step": 800,
"tokens_trained": 0.393226312
},
{
"epoch": 0.2275015956315155,
"grad_norm": 34.86774444580078,
"loss": 4.4519,
"lr": 0.000957902097902098,
"step": 802,
"tokens_trained": 0.394206688
},
{
"epoch": 0.2280689312814694,
"grad_norm": 24.966291427612305,
"loss": 4.456,
"lr": 0.0009576223776223777,
"step": 804,
"tokens_trained": 0.395191608
},
{
"epoch": 0.2286362669314233,
"grad_norm": 12.218213081359863,
"loss": 4.4266,
"lr": 0.0009573426573426573,
"step": 806,
"tokens_trained": 0.396174512
},
{
"epoch": 0.2292036025813772,
"grad_norm": 50.817054748535156,
"loss": 4.586,
"lr": 0.0009570629370629371,
"step": 808,
"tokens_trained": 0.397156912
},
{
"epoch": 0.2297709382313311,
"grad_norm": 37.60087203979492,
"loss": 4.4616,
"lr": 0.0009567832167832168,
"step": 810,
"tokens_trained": 0.398140016
},
{
"epoch": 0.23033827388128503,
"grad_norm": 37.55678176879883,
"loss": 4.4755,
"lr": 0.0009565034965034966,
"step": 812,
"tokens_trained": 0.39912384
},
{
"epoch": 0.23090560953123893,
"grad_norm": 56.427215576171875,
"loss": 4.5078,
"lr": 0.0009562237762237762,
"step": 814,
"tokens_trained": 0.400111224
},
{
"epoch": 0.23147294518119282,
"grad_norm": 31.869827270507812,
"loss": 4.5013,
"lr": 0.0009559440559440559,
"step": 816,
"tokens_trained": 0.401094936
},
{
"epoch": 0.23204028083114672,
"grad_norm": 77.57958984375,
"loss": 4.6977,
"lr": 0.0009556643356643357,
"step": 818,
"tokens_trained": 0.402078888
},
{
"epoch": 0.23260761648110062,
"grad_norm": 52.50204849243164,
"loss": 4.5142,
"lr": 0.0009553846153846154,
"step": 820,
"tokens_trained": 0.403059904
},
{
"epoch": 0.23317495213105455,
"grad_norm": 32.34305191040039,
"loss": 4.4828,
"lr": 0.0009551048951048952,
"step": 822,
"tokens_trained": 0.404049848
},
{
"epoch": 0.23374228778100845,
"grad_norm": 52.08961486816406,
"loss": 4.4869,
"lr": 0.0009548251748251748,
"step": 824,
"tokens_trained": 0.405033872
},
{
"epoch": 0.23430962343096234,
"grad_norm": 44.32194900512695,
"loss": 4.4802,
"lr": 0.0009545454545454546,
"step": 826,
"tokens_trained": 0.406017872
},
{
"epoch": 0.23487695908091624,
"grad_norm": 30.941524505615234,
"loss": 4.4323,
"lr": 0.0009542657342657343,
"step": 828,
"tokens_trained": 0.40700704
},
{
"epoch": 0.23544429473087014,
"grad_norm": 20.52709197998047,
"loss": 4.4919,
"lr": 0.000953986013986014,
"step": 830,
"tokens_trained": 0.407991512
},
{
"epoch": 0.23601163038082407,
"grad_norm": 86.80307006835938,
"loss": 4.8228,
"lr": 0.0009537062937062937,
"step": 832,
"tokens_trained": 0.408979272
},
{
"epoch": 0.23657896603077797,
"grad_norm": 73.71435546875,
"loss": 4.5954,
"lr": 0.0009534265734265734,
"step": 834,
"tokens_trained": 0.409962984
},
{
"epoch": 0.23714630168073186,
"grad_norm": 66.3813247680664,
"loss": 4.5969,
"lr": 0.0009531468531468532,
"step": 836,
"tokens_trained": 0.410945248
},
{
"epoch": 0.23771363733068576,
"grad_norm": 86.94453430175781,
"loss": 4.5894,
"lr": 0.0009528671328671329,
"step": 838,
"tokens_trained": 0.411930872
},
{
"epoch": 0.23828097298063966,
"grad_norm": 61.28915786743164,
"loss": 4.5613,
"lr": 0.0009525874125874127,
"step": 840,
"tokens_trained": 0.412912608
},
{
"epoch": 0.2388483086305936,
"grad_norm": 65.02153778076172,
"loss": 4.5398,
"lr": 0.0009523076923076923,
"step": 842,
"tokens_trained": 0.413897488
},
{
"epoch": 0.23941564428054748,
"grad_norm": 54.01200485229492,
"loss": 4.4922,
"lr": 0.000952027972027972,
"step": 844,
"tokens_trained": 0.414872888
},
{
"epoch": 0.23998297993050138,
"grad_norm": 66.7095718383789,
"loss": 4.5317,
"lr": 0.0009517482517482518,
"step": 846,
"tokens_trained": 0.415856296
},
{
"epoch": 0.24055031558045528,
"grad_norm": 64.23979949951172,
"loss": 4.4686,
"lr": 0.0009514685314685315,
"step": 848,
"tokens_trained": 0.416843344
},
{
"epoch": 0.24111765123040918,
"grad_norm": 51.012840270996094,
"loss": 4.4544,
"lr": 0.0009511888111888112,
"step": 850,
"tokens_trained": 0.41782032
},
{
"epoch": 0.2416849868803631,
"grad_norm": 40.83076095581055,
"loss": 4.4665,
"lr": 0.0009509090909090909,
"step": 852,
"tokens_trained": 0.418805672
},
{
"epoch": 0.242252322530317,
"grad_norm": 48.31489944458008,
"loss": 4.4748,
"lr": 0.0009506293706293707,
"step": 854,
"tokens_trained": 0.419786344
},
{
"epoch": 0.2428196581802709,
"grad_norm": 50.08705520629883,
"loss": 4.4973,
"lr": 0.0009503496503496504,
"step": 856,
"tokens_trained": 0.420768872
},
{
"epoch": 0.2433869938302248,
"grad_norm": 26.840139389038086,
"loss": 4.461,
"lr": 0.0009500699300699301,
"step": 858,
"tokens_trained": 0.421750296
},
{
"epoch": 0.2439543294801787,
"grad_norm": 24.721454620361328,
"loss": 4.4246,
"lr": 0.0009497902097902098,
"step": 860,
"tokens_trained": 0.422730976
},
{
"epoch": 0.24452166513013263,
"grad_norm": 63.147926330566406,
"loss": 4.623,
"lr": 0.0009495104895104895,
"step": 862,
"tokens_trained": 0.423715768
},
{
"epoch": 0.24508900078008652,
"grad_norm": 50.99778747558594,
"loss": 4.4663,
"lr": 0.0009492307692307693,
"step": 864,
"tokens_trained": 0.424697072
},
{
"epoch": 0.24565633643004042,
"grad_norm": 38.0300407409668,
"loss": 4.4649,
"lr": 0.000948951048951049,
"step": 866,
"tokens_trained": 0.425681392
},
{
"epoch": 0.24622367207999432,
"grad_norm": 19.017776489257812,
"loss": 4.4296,
"lr": 0.0009486713286713286,
"step": 868,
"tokens_trained": 0.426665088
},
{
"epoch": 0.24679100772994822,
"grad_norm": 24.02813148498535,
"loss": 4.4958,
"lr": 0.0009483916083916084,
"step": 870,
"tokens_trained": 0.427646016
},
{
"epoch": 0.24735834337990215,
"grad_norm": 59.40018081665039,
"loss": 4.5919,
"lr": 0.0009481118881118881,
"step": 872,
"tokens_trained": 0.428628048
},
{
"epoch": 0.24792567902985604,
"grad_norm": 61.13710403442383,
"loss": 4.4642,
"lr": 0.0009478321678321679,
"step": 874,
"tokens_trained": 0.4296112
},
{
"epoch": 0.24820934685483298,
"eval_loss": 1.1135390996932983,
"eval_runtime": 20.4738,
"step": 875,
"tokens_trained": 0.430109024
},
{
"epoch": 0.24849301467980994,
"grad_norm": 47.920021057128906,
"loss": 4.4832,
"lr": 0.0009475524475524476,
"step": 876,
"tokens_trained": 0.430599208
},
{
"epoch": 0.24906035032976384,
"grad_norm": 25.661701202392578,
"loss": 4.4176,
"lr": 0.0009472727272727273,
"step": 878,
"tokens_trained": 0.43158356
},
{
"epoch": 0.24962768597971774,
"grad_norm": 32.86565399169922,
"loss": 4.405,
"lr": 0.000946993006993007,
"step": 880,
"tokens_trained": 0.432570584
},
{
"epoch": 0.25019502162967167,
"grad_norm": 23.443584442138672,
"loss": 4.4218,
"lr": 0.0009467132867132868,
"step": 882,
"tokens_trained": 0.433557672
},
{
"epoch": 0.25076235727962554,
"grad_norm": 28.315975189208984,
"loss": 4.4019,
"lr": 0.0009464335664335665,
"step": 884,
"tokens_trained": 0.434542736
},
{
"epoch": 0.25132969292957946,
"grad_norm": 31.056642532348633,
"loss": 4.4027,
"lr": 0.0009461538461538461,
"step": 886,
"tokens_trained": 0.43553112
},
{
"epoch": 0.2518970285795334,
"grad_norm": 13.661805152893066,
"loss": 4.3745,
"lr": 0.0009458741258741259,
"step": 888,
"tokens_trained": 0.436511584
},
{
"epoch": 0.25246436422948726,
"grad_norm": 47.04901885986328,
"loss": 4.4875,
"lr": 0.0009455944055944056,
"step": 890,
"tokens_trained": 0.43749464
},
{
"epoch": 0.2530316998794412,
"grad_norm": 84.91446685791016,
"loss": 4.5185,
"lr": 0.0009453146853146854,
"step": 892,
"tokens_trained": 0.43847764
},
{
"epoch": 0.25359903552939506,
"grad_norm": 40.9110107421875,
"loss": 4.5735,
"lr": 0.000945034965034965,
"step": 894,
"tokens_trained": 0.439461496
},
{
"epoch": 0.254166371179349,
"grad_norm": 58.98877716064453,
"loss": 4.5146,
"lr": 0.0009447552447552447,
"step": 896,
"tokens_trained": 0.440443656
},
{
"epoch": 0.2547337068293029,
"grad_norm": 34.037315368652344,
"loss": 4.4714,
"lr": 0.0009444755244755245,
"step": 898,
"tokens_trained": 0.441423496
},
{
"epoch": 0.2553010424792568,
"grad_norm": 24.91920280456543,
"loss": 4.4334,
"lr": 0.0009441958041958042,
"step": 900,
"tokens_trained": 0.442407408
},
{
"epoch": 0.2558683781292107,
"grad_norm": 30.612323760986328,
"loss": 4.4459,
"lr": 0.000943916083916084,
"step": 902,
"tokens_trained": 0.443383464
},
{
"epoch": 0.2564357137791646,
"grad_norm": 50.595577239990234,
"loss": 4.4848,
"lr": 0.0009436363636363636,
"step": 904,
"tokens_trained": 0.4443674
},
{
"epoch": 0.2570030494291185,
"grad_norm": 41.3300895690918,
"loss": 4.4445,
"lr": 0.0009433566433566434,
"step": 906,
"tokens_trained": 0.445346072
},
{
"epoch": 0.25757038507907243,
"grad_norm": 48.33689880371094,
"loss": 4.4058,
"lr": 0.0009430769230769231,
"step": 908,
"tokens_trained": 0.446329872
},
{
"epoch": 0.2581377207290263,
"grad_norm": 39.081382751464844,
"loss": 4.4321,
"lr": 0.0009427972027972029,
"step": 910,
"tokens_trained": 0.447309544
},
{
"epoch": 0.2587050563789802,
"grad_norm": 62.18062210083008,
"loss": 4.4672,
"lr": 0.0009425174825174825,
"step": 912,
"tokens_trained": 0.448295056
},
{
"epoch": 0.2592723920289341,
"grad_norm": 28.725404739379883,
"loss": 4.4786,
"lr": 0.0009422377622377622,
"step": 914,
"tokens_trained": 0.449274208
},
{
"epoch": 0.259839727678888,
"grad_norm": 47.55582809448242,
"loss": 4.4227,
"lr": 0.000941958041958042,
"step": 916,
"tokens_trained": 0.450256408
},
{
"epoch": 0.26040706332884195,
"grad_norm": 35.743125915527344,
"loss": 4.379,
"lr": 0.0009416783216783217,
"step": 918,
"tokens_trained": 0.45123684
},
{
"epoch": 0.2609743989787958,
"grad_norm": 31.489402770996094,
"loss": 4.3888,
"lr": 0.0009413986013986015,
"step": 920,
"tokens_trained": 0.45221748
},
{
"epoch": 0.26154173462874974,
"grad_norm": 36.46233367919922,
"loss": 4.3982,
"lr": 0.0009411188811188811,
"step": 922,
"tokens_trained": 0.453202064
},
{
"epoch": 0.2621090702787036,
"grad_norm": 41.6457633972168,
"loss": 4.385,
"lr": 0.0009408391608391608,
"step": 924,
"tokens_trained": 0.454183456
},
{
"epoch": 0.26267640592865754,
"grad_norm": 26.52242088317871,
"loss": 4.4091,
"lr": 0.0009405594405594406,
"step": 926,
"tokens_trained": 0.455165496
},
{
"epoch": 0.26324374157861147,
"grad_norm": 14.401509284973145,
"loss": 4.3549,
"lr": 0.0009402797202797203,
"step": 928,
"tokens_trained": 0.456150248
},
{
"epoch": 0.26381107722856534,
"grad_norm": 30.626131057739258,
"loss": 4.3325,
"lr": 0.00094,
"step": 930,
"tokens_trained": 0.457134184
},
{
"epoch": 0.26437841287851926,
"grad_norm": 63.74067687988281,
"loss": 4.442,
"lr": 0.0009397202797202797,
"step": 932,
"tokens_trained": 0.458118808
},
{
"epoch": 0.26494574852847314,
"grad_norm": 12.15156364440918,
"loss": 4.4658,
"lr": 0.0009394405594405595,
"step": 934,
"tokens_trained": 0.459103872
},
{
"epoch": 0.26551308417842706,
"grad_norm": 76.2789306640625,
"loss": 4.8153,
"lr": 0.0009391608391608392,
"step": 936,
"tokens_trained": 0.460087216
},
{
"epoch": 0.266080419828381,
"grad_norm": 63.919334411621094,
"loss": 4.5707,
"lr": 0.000938881118881119,
"step": 938,
"tokens_trained": 0.461070568
},
{
"epoch": 0.26664775547833486,
"grad_norm": 75.1481704711914,
"loss": 4.5931,
"lr": 0.0009386013986013986,
"step": 940,
"tokens_trained": 0.462055184
},
{
"epoch": 0.2672150911282888,
"grad_norm": 33.118961334228516,
"loss": 4.4723,
"lr": 0.0009383216783216783,
"step": 942,
"tokens_trained": 0.463034592
},
{
"epoch": 0.26778242677824265,
"grad_norm": 30.8759765625,
"loss": 4.4275,
"lr": 0.0009380419580419581,
"step": 944,
"tokens_trained": 0.464016816
},
{
"epoch": 0.2683497624281966,
"grad_norm": 41.05061340332031,
"loss": 4.4566,
"lr": 0.0009377622377622378,
"step": 946,
"tokens_trained": 0.465000872
},
{
"epoch": 0.2689170980781505,
"grad_norm": 30.93424415588379,
"loss": 4.3985,
"lr": 0.0009374825174825175,
"step": 948,
"tokens_trained": 0.465984096
},
{
"epoch": 0.2694844337281044,
"grad_norm": 29.477052688598633,
"loss": 4.3718,
"lr": 0.0009372027972027972,
"step": 950,
"tokens_trained": 0.466961752
},
{
"epoch": 0.2700517693780583,
"grad_norm": 21.568912506103516,
"loss": 4.3697,
"lr": 0.0009369230769230769,
"step": 952,
"tokens_trained": 0.467950088
},
{
"epoch": 0.2706191050280122,
"grad_norm": 41.66835021972656,
"loss": 4.4241,
"lr": 0.0009366433566433567,
"step": 954,
"tokens_trained": 0.468928736
},
{
"epoch": 0.2711864406779661,
"grad_norm": 68.04551696777344,
"loss": 4.3978,
"lr": 0.0009363636363636364,
"step": 956,
"tokens_trained": 0.469907496
},
{
"epoch": 0.27175377632792,
"grad_norm": 37.655181884765625,
"loss": 4.4497,
"lr": 0.0009360839160839161,
"step": 958,
"tokens_trained": 0.470889168
},
{
"epoch": 0.2723211119778739,
"grad_norm": 22.074953079223633,
"loss": 4.3918,
"lr": 0.0009358041958041958,
"step": 960,
"tokens_trained": 0.471871816
},
{
"epoch": 0.2728884476278278,
"grad_norm": 49.925777435302734,
"loss": 4.4745,
"lr": 0.0009355244755244755,
"step": 962,
"tokens_trained": 0.472856728
},
{
"epoch": 0.2734557832777817,
"grad_norm": 46.520851135253906,
"loss": 4.403,
"lr": 0.0009352447552447553,
"step": 964,
"tokens_trained": 0.473838544
},
{
"epoch": 0.2740231189277356,
"grad_norm": 25.053146362304688,
"loss": 4.4247,
"lr": 0.0009349650349650349,
"step": 966,
"tokens_trained": 0.474819976
},
{
"epoch": 0.27459045457768955,
"grad_norm": 30.127140045166016,
"loss": 4.3834,
"lr": 0.0009346853146853147,
"step": 968,
"tokens_trained": 0.475800696
},
{
"epoch": 0.2751577902276434,
"grad_norm": 41.478328704833984,
"loss": 4.3978,
"lr": 0.0009344055944055944,
"step": 970,
"tokens_trained": 0.4767834
},
{
"epoch": 0.27572512587759734,
"grad_norm": 23.739456176757812,
"loss": 4.3698,
"lr": 0.0009341258741258742,
"step": 972,
"tokens_trained": 0.47776944
},
{
"epoch": 0.2762924615275512,
"grad_norm": 21.813220977783203,
"loss": 4.3902,
"lr": 0.0009338461538461539,
"step": 974,
"tokens_trained": 0.478757048
},
{
"epoch": 0.27685979717750514,
"grad_norm": 64.79598999023438,
"loss": 4.5237,
"lr": 0.0009335664335664336,
"step": 976,
"tokens_trained": 0.47973872
},
{
"epoch": 0.27742713282745907,
"grad_norm": 68.32705688476562,
"loss": 4.4461,
"lr": 0.0009332867132867133,
"step": 978,
"tokens_trained": 0.480721912
},
{
"epoch": 0.27799446847741294,
"grad_norm": 41.857582092285156,
"loss": 4.4663,
"lr": 0.0009330069930069929,
"step": 980,
"tokens_trained": 0.481704248
},
{
"epoch": 0.27856180412736686,
"grad_norm": 28.30609893798828,
"loss": 4.3461,
"lr": 0.0009327272727272728,
"step": 982,
"tokens_trained": 0.482689768
},
{
"epoch": 0.27912913977732073,
"grad_norm": 33.207950592041016,
"loss": 4.4185,
"lr": 0.0009324475524475524,
"step": 984,
"tokens_trained": 0.483670008
},
{
"epoch": 0.27969647542727466,
"grad_norm": 29.541227340698242,
"loss": 4.388,
"lr": 0.0009321678321678322,
"step": 986,
"tokens_trained": 0.48465836
},
{
"epoch": 0.2802638110772286,
"grad_norm": 16.23346710205078,
"loss": 4.3219,
"lr": 0.0009318881118881119,
"step": 988,
"tokens_trained": 0.4856402
},
{
"epoch": 0.28083114672718246,
"grad_norm": 20.036178588867188,
"loss": 4.3273,
"lr": 0.0009316083916083917,
"step": 990,
"tokens_trained": 0.486621648
},
{
"epoch": 0.2813984823771364,
"grad_norm": 49.25468063354492,
"loss": 4.4649,
"lr": 0.0009313286713286714,
"step": 992,
"tokens_trained": 0.48760744
},
{
"epoch": 0.28196581802709025,
"grad_norm": 48.59744644165039,
"loss": 4.3979,
"lr": 0.000931048951048951,
"step": 994,
"tokens_trained": 0.488590472
},
{
"epoch": 0.2825331536770442,
"grad_norm": 16.33649253845215,
"loss": 4.3945,
"lr": 0.0009307692307692308,
"step": 996,
"tokens_trained": 0.489570976
},
{
"epoch": 0.2831004893269981,
"grad_norm": 60.632591247558594,
"loss": 4.5581,
"lr": 0.0009304895104895104,
"step": 998,
"tokens_trained": 0.490552296
},
{
"epoch": 0.283667824976952,
"grad_norm": 52.75735092163086,
"loss": 4.424,
"lr": 0.0009302097902097903,
"step": 1000,
"tokens_trained": 0.49153744
},
{
"epoch": 0.283667824976952,
"eval_loss": 1.1363450288772583,
"eval_runtime": 20.7491,
"step": 1000,
"tokens_trained": 0.49153744
},
{
"epoch": 0.2842351606269059,
"grad_norm": 20.506614685058594,
"loss": 4.4241,
"lr": 0.0009299300699300699,
"step": 1002,
"tokens_trained": 0.492522608
},
{
"epoch": 0.2848024962768598,
"grad_norm": 23.148601531982422,
"loss": 4.3975,
"lr": 0.0009296503496503497,
"step": 1004,
"tokens_trained": 0.493501384
},
{
"epoch": 0.2853698319268137,
"grad_norm": 9.550869941711426,
"loss": 4.3952,
"lr": 0.0009293706293706294,
"step": 1006,
"tokens_trained": 0.494482544
},
{
"epoch": 0.2859371675767676,
"grad_norm": 80.31155395507812,
"loss": 4.7614,
"lr": 0.0009290909090909091,
"step": 1008,
"tokens_trained": 0.495459416
},
{
"epoch": 0.2865045032267215,
"grad_norm": 61.021026611328125,
"loss": 4.4396,
"lr": 0.0009288111888111889,
"step": 1010,
"tokens_trained": 0.4964418
},
{
"epoch": 0.2870718388766754,
"grad_norm": 35.23258972167969,
"loss": 4.5548,
"lr": 0.0009285314685314685,
"step": 1012,
"tokens_trained": 0.497428288
},
{
"epoch": 0.2876391745266293,
"grad_norm": 36.45478057861328,
"loss": 4.46,
"lr": 0.0009282517482517483,
"step": 1014,
"tokens_trained": 0.498416832
},
{
"epoch": 0.2882065101765832,
"grad_norm": 46.622982025146484,
"loss": 4.3554,
"lr": 0.0009279720279720279,
"step": 1016,
"tokens_trained": 0.499399792
},
{
"epoch": 0.28877384582653715,
"grad_norm": 87.00289154052734,
"loss": 4.5276,
"lr": 0.0009276923076923078,
"step": 1018,
"tokens_trained": 0.500383776
},
{
"epoch": 0.289341181476491,
"grad_norm": 11.444964408874512,
"loss": 4.5483,
"lr": 0.0009274125874125874,
"step": 1020,
"tokens_trained": 0.50136468
},
{
"epoch": 0.28990851712644494,
"grad_norm": 89.05914306640625,
"loss": 4.8957,
"lr": 0.0009271328671328671,
"step": 1022,
"tokens_trained": 0.50235172
},
{
"epoch": 0.2904758527763988,
"grad_norm": 26.915477752685547,
"loss": 4.6184,
"lr": 0.0009268531468531469,
"step": 1024,
"tokens_trained": 0.50333208
},
{
"epoch": 0.29104318842635274,
"grad_norm": 44.32100296020508,
"loss": 4.5263,
"lr": 0.0009265734265734266,
"step": 1026,
"tokens_trained": 0.504314656
},
{
"epoch": 0.29161052407630667,
"grad_norm": 26.699670791625977,
"loss": 4.3871,
"lr": 0.0009262937062937064,
"step": 1028,
"tokens_trained": 0.505296568
},
{
"epoch": 0.29217785972626054,
"grad_norm": 27.469482421875,
"loss": 4.3558,
"lr": 0.000926013986013986,
"step": 1030,
"tokens_trained": 0.506280416
},
{
"epoch": 0.29274519537621446,
"grad_norm": 26.149612426757812,
"loss": 4.3368,
"lr": 0.0009257342657342658,
"step": 1032,
"tokens_trained": 0.507261224
},
{
"epoch": 0.29331253102616833,
"grad_norm": 8.754459381103516,
"loss": 4.3447,
"lr": 0.0009254545454545454,
"step": 1034,
"tokens_trained": 0.508243288
},
{
"epoch": 0.29387986667612226,
"grad_norm": 32.17164611816406,
"loss": 4.4174,
"lr": 0.0009251748251748252,
"step": 1036,
"tokens_trained": 0.509224176
},
{
"epoch": 0.2944472023260762,
"grad_norm": 41.17238235473633,
"loss": 4.4221,
"lr": 0.0009248951048951049,
"step": 1038,
"tokens_trained": 0.510203568
},
{
"epoch": 0.29501453797603006,
"grad_norm": 44.97213363647461,
"loss": 4.3594,
"lr": 0.0009246153846153846,
"step": 1040,
"tokens_trained": 0.511186464
},
{
"epoch": 0.295581873625984,
"grad_norm": 42.23421859741211,
"loss": 4.4159,
"lr": 0.0009243356643356644,
"step": 1042,
"tokens_trained": 0.51216944
},
{
"epoch": 0.29614920927593785,
"grad_norm": 36.13594436645508,
"loss": 4.4105,
"lr": 0.0009240559440559441,
"step": 1044,
"tokens_trained": 0.513153144
},
{
"epoch": 0.2967165449258918,
"grad_norm": 36.89309310913086,
"loss": 4.3947,
"lr": 0.0009237762237762239,
"step": 1046,
"tokens_trained": 0.51413388
},
{
"epoch": 0.2972838805758457,
"grad_norm": 58.599700927734375,
"loss": 4.3988,
"lr": 0.0009234965034965035,
"step": 1048,
"tokens_trained": 0.515119288
},
{
"epoch": 0.2978512162257996,
"grad_norm": 13.725994110107422,
"loss": 4.412,
"lr": 0.0009232167832167832,
"step": 1050,
"tokens_trained": 0.51610284
},
{
"epoch": 0.2984185518757535,
"grad_norm": 105.28518676757812,
"loss": 4.7305,
"lr": 0.0009229370629370629,
"step": 1052,
"tokens_trained": 0.517085576
},
{
"epoch": 0.2989858875257074,
"grad_norm": 29.499713897705078,
"loss": 4.5106,
"lr": 0.0009226573426573427,
"step": 1054,
"tokens_trained": 0.518064224
},
{
"epoch": 0.2995532231756613,
"grad_norm": 60.907203674316406,
"loss": 4.5249,
"lr": 0.0009223776223776224,
"step": 1056,
"tokens_trained": 0.51905084
},
{
"epoch": 0.3001205588256152,
"grad_norm": 39.825069427490234,
"loss": 4.3695,
"lr": 0.0009220979020979021,
"step": 1058,
"tokens_trained": 0.5200318
},
{
"epoch": 0.3006878944755691,
"grad_norm": 42.77061462402344,
"loss": 4.4094,
"lr": 0.0009218181818181819,
"step": 1060,
"tokens_trained": 0.521013568
},
{
"epoch": 0.301255230125523,
"grad_norm": 37.05888748168945,
"loss": 4.3684,
"lr": 0.0009215384615384616,
"step": 1062,
"tokens_trained": 0.521997624
},
{
"epoch": 0.3018225657754769,
"grad_norm": 42.28252029418945,
"loss": 4.3489,
"lr": 0.0009212587412587413,
"step": 1064,
"tokens_trained": 0.522986184
},
{
"epoch": 0.3023899014254308,
"grad_norm": 40.95197677612305,
"loss": 4.3564,
"lr": 0.000920979020979021,
"step": 1066,
"tokens_trained": 0.523970984
},
{
"epoch": 0.30295723707538474,
"grad_norm": 25.469568252563477,
"loss": 4.3833,
"lr": 0.0009206993006993007,
"step": 1068,
"tokens_trained": 0.524952808
},
{
"epoch": 0.3035245727253386,
"grad_norm": 29.921735763549805,
"loss": 4.3579,
"lr": 0.0009204195804195804,
"step": 1070,
"tokens_trained": 0.525935696
},
{
"epoch": 0.30409190837529254,
"grad_norm": 26.038026809692383,
"loss": 4.2898,
"lr": 0.0009201398601398602,
"step": 1072,
"tokens_trained": 0.526916904
},
{
"epoch": 0.3046592440252464,
"grad_norm": 32.59503936767578,
"loss": 4.3335,
"lr": 0.0009198601398601398,
"step": 1074,
"tokens_trained": 0.527899864
},
{
"epoch": 0.30522657967520034,
"grad_norm": 14.04964828491211,
"loss": 4.3171,
"lr": 0.0009195804195804196,
"step": 1076,
"tokens_trained": 0.528878176
},
{
"epoch": 0.30579391532515426,
"grad_norm": 15.936906814575195,
"loss": 4.3005,
"lr": 0.0009193006993006993,
"step": 1078,
"tokens_trained": 0.529859952
},
{
"epoch": 0.30636125097510813,
"grad_norm": 9.73235034942627,
"loss": 4.3287,
"lr": 0.0009190209790209791,
"step": 1080,
"tokens_trained": 0.530838192
},
{
"epoch": 0.30692858662506206,
"grad_norm": 45.44027328491211,
"loss": 4.4384,
"lr": 0.0009187412587412588,
"step": 1082,
"tokens_trained": 0.531818376
},
{
"epoch": 0.30749592227501593,
"grad_norm": 55.65925598144531,
"loss": 4.3772,
"lr": 0.0009184615384615385,
"step": 1084,
"tokens_trained": 0.532802048
},
{
"epoch": 0.30806325792496986,
"grad_norm": 33.47093200683594,
"loss": 4.4257,
"lr": 0.0009181818181818182,
"step": 1086,
"tokens_trained": 0.533785376
},
{
"epoch": 0.3086305935749238,
"grad_norm": 39.709224700927734,
"loss": 4.4177,
"lr": 0.0009179020979020978,
"step": 1088,
"tokens_trained": 0.5347698
},
{
"epoch": 0.30919792922487765,
"grad_norm": 34.25212097167969,
"loss": 4.3518,
"lr": 0.0009176223776223777,
"step": 1090,
"tokens_trained": 0.53575108
},
{
"epoch": 0.3097652648748316,
"grad_norm": 29.156312942504883,
"loss": 4.3596,
"lr": 0.0009173426573426573,
"step": 1092,
"tokens_trained": 0.536735544
},
{
"epoch": 0.31033260052478545,
"grad_norm": 31.714128494262695,
"loss": 4.3736,
"lr": 0.0009170629370629371,
"step": 1094,
"tokens_trained": 0.537718008
},
{
"epoch": 0.3108999361747394,
"grad_norm": 12.244729042053223,
"loss": 4.3472,
"lr": 0.0009167832167832168,
"step": 1096,
"tokens_trained": 0.538693512
},
{
"epoch": 0.3114672718246933,
"grad_norm": 10.271063804626465,
"loss": 4.301,
"lr": 0.0009165034965034966,
"step": 1098,
"tokens_trained": 0.539681376
},
{
"epoch": 0.3120346074746472,
"grad_norm": 35.79754638671875,
"loss": 4.3912,
"lr": 0.0009162237762237763,
"step": 1100,
"tokens_trained": 0.540661392
},
{
"epoch": 0.3126019431246011,
"grad_norm": 24.1260986328125,
"loss": 4.3303,
"lr": 0.0009159440559440559,
"step": 1102,
"tokens_trained": 0.541646968
},
{
"epoch": 0.31316927877455497,
"grad_norm": 24.501169204711914,
"loss": 4.3205,
"lr": 0.0009156643356643357,
"step": 1104,
"tokens_trained": 0.542629392
},
{
"epoch": 0.3137366144245089,
"grad_norm": 17.031600952148438,
"loss": 4.2521,
"lr": 0.0009153846153846153,
"step": 1106,
"tokens_trained": 0.54361348
},
{
"epoch": 0.3143039500744628,
"grad_norm": 19.506216049194336,
"loss": 4.3225,
"lr": 0.0009151048951048952,
"step": 1108,
"tokens_trained": 0.544595336
},
{
"epoch": 0.3148712857244167,
"grad_norm": 20.822546005249023,
"loss": 4.2711,
"lr": 0.0009148251748251748,
"step": 1110,
"tokens_trained": 0.545578256
},
{
"epoch": 0.3154386213743706,
"grad_norm": 29.967998504638672,
"loss": 4.2868,
"lr": 0.0009145454545454546,
"step": 1112,
"tokens_trained": 0.546561024
},
{
"epoch": 0.3160059570243245,
"grad_norm": 24.06121063232422,
"loss": 4.2701,
"lr": 0.0009142657342657343,
"step": 1114,
"tokens_trained": 0.547544616
},
{
"epoch": 0.3165732926742784,
"grad_norm": 15.868765830993652,
"loss": 4.3233,
"lr": 0.000913986013986014,
"step": 1116,
"tokens_trained": 0.548526216
},
{
"epoch": 0.31714062832423234,
"grad_norm": 27.47897720336914,
"loss": 4.2813,
"lr": 0.0009137062937062938,
"step": 1118,
"tokens_trained": 0.549506544
},
{
"epoch": 0.3177079639741862,
"grad_norm": 15.343204498291016,
"loss": 4.3002,
"lr": 0.0009134265734265734,
"step": 1120,
"tokens_trained": 0.550488496
},
{
"epoch": 0.31827529962414014,
"grad_norm": 4.320124626159668,
"loss": 4.2622,
"lr": 0.0009131468531468532,
"step": 1122,
"tokens_trained": 0.551471792
},
{
"epoch": 0.318842635274094,
"grad_norm": 34.520050048828125,
"loss": 4.366,
"lr": 0.0009128671328671328,
"step": 1124,
"tokens_trained": 0.552457008
},
{
"epoch": 0.319126303099071,
"eval_loss": 1.096465826034546,
"eval_runtime": 20.7643,
"step": 1125,
"tokens_trained": 0.552948064
},
{
"epoch": 0.31940997092404794,
"grad_norm": 39.718719482421875,
"loss": 4.3317,
"lr": 0.0009125874125874127,
"step": 1126,
"tokens_trained": 0.5534394
},
{
"epoch": 0.31997730657400186,
"grad_norm": 20.843252182006836,
"loss": 4.3883,
"lr": 0.0009123076923076923,
"step": 1128,
"tokens_trained": 0.554419184
},
{
"epoch": 0.32054464222395573,
"grad_norm": 12.916360855102539,
"loss": 4.3119,
"lr": 0.000912027972027972,
"step": 1130,
"tokens_trained": 0.555401952
},
{
"epoch": 0.32111197787390966,
"grad_norm": 48.54426956176758,
"loss": 4.4155,
"lr": 0.0009117482517482518,
"step": 1132,
"tokens_trained": 0.556385024
},
{
"epoch": 0.32167931352386353,
"grad_norm": 41.00883483886719,
"loss": 4.362,
"lr": 0.0009114685314685315,
"step": 1134,
"tokens_trained": 0.557368472
},
{
"epoch": 0.32224664917381746,
"grad_norm": 28.0487060546875,
"loss": 4.3504,
"lr": 0.0009111888111888113,
"step": 1136,
"tokens_trained": 0.55835288
},
{
"epoch": 0.3228139848237714,
"grad_norm": 22.05229377746582,
"loss": 4.331,
"lr": 0.0009109090909090909,
"step": 1138,
"tokens_trained": 0.559337064
},
{
"epoch": 0.32338132047372525,
"grad_norm": 16.770631790161133,
"loss": 4.3008,
"lr": 0.0009106293706293707,
"step": 1140,
"tokens_trained": 0.560317984
},
{
"epoch": 0.3239486561236792,
"grad_norm": 35.300262451171875,
"loss": 4.4083,
"lr": 0.0009103496503496503,
"step": 1142,
"tokens_trained": 0.561299688
},
{
"epoch": 0.32451599177363305,
"grad_norm": 23.788284301757812,
"loss": 4.2772,
"lr": 0.0009100699300699301,
"step": 1144,
"tokens_trained": 0.562285664
},
{
"epoch": 0.325083327423587,
"grad_norm": 23.085710525512695,
"loss": 4.3185,
"lr": 0.0009097902097902098,
"step": 1146,
"tokens_trained": 0.563267832
},
{
"epoch": 0.3256506630735409,
"grad_norm": 13.11314582824707,
"loss": 4.2711,
"lr": 0.0009095104895104895,
"step": 1148,
"tokens_trained": 0.564248928
},
{
"epoch": 0.3262179987234948,
"grad_norm": 31.297805786132812,
"loss": 4.3096,
"lr": 0.0009092307692307692,
"step": 1150,
"tokens_trained": 0.56522952
},
{
"epoch": 0.3267853343734487,
"grad_norm": 11.668539047241211,
"loss": 4.2667,
"lr": 0.000908951048951049,
"step": 1152,
"tokens_trained": 0.566212392
},
{
"epoch": 0.32735267002340257,
"grad_norm": 23.359189987182617,
"loss": 4.3156,
"lr": 0.0009086713286713288,
"step": 1154,
"tokens_trained": 0.567192216
},
{
"epoch": 0.3279200056733565,
"grad_norm": 31.09916114807129,
"loss": 4.3367,
"lr": 0.0009083916083916084,
"step": 1156,
"tokens_trained": 0.568177088
},
{
"epoch": 0.3284873413233104,
"grad_norm": 24.03261947631836,
"loss": 4.3504,
"lr": 0.0009081118881118881,
"step": 1158,
"tokens_trained": 0.56915868
},
{
"epoch": 0.3290546769732643,
"grad_norm": 16.029443740844727,
"loss": 4.3192,
"lr": 0.0009078321678321678,
"step": 1160,
"tokens_trained": 0.570142976
},
{
"epoch": 0.3296220126232182,
"grad_norm": 53.486724853515625,
"loss": 4.3921,
"lr": 0.0009075524475524476,
"step": 1162,
"tokens_trained": 0.57112748
},
{
"epoch": 0.3301893482731721,
"grad_norm": 37.42267608642578,
"loss": 4.2821,
"lr": 0.0009072727272727273,
"step": 1164,
"tokens_trained": 0.57211356
},
{
"epoch": 0.330756683923126,
"grad_norm": 28.862472534179688,
"loss": 4.3002,
"lr": 0.000906993006993007,
"step": 1166,
"tokens_trained": 0.57309492
},
{
"epoch": 0.33132401957307994,
"grad_norm": 22.26299476623535,
"loss": 4.2729,
"lr": 0.0009067132867132866,
"step": 1168,
"tokens_trained": 0.5740806
},
{
"epoch": 0.3318913552230338,
"grad_norm": 21.635013580322266,
"loss": 4.2866,
"lr": 0.0009064335664335665,
"step": 1170,
"tokens_trained": 0.575061664
},
{
"epoch": 0.33245869087298774,
"grad_norm": 18.995012283325195,
"loss": 4.2814,
"lr": 0.0009061538461538462,
"step": 1172,
"tokens_trained": 0.576046304
},
{
"epoch": 0.3330260265229416,
"grad_norm": 22.621299743652344,
"loss": 4.2739,
"lr": 0.0009058741258741259,
"step": 1174,
"tokens_trained": 0.577032376
},
{
"epoch": 0.33359336217289554,
"grad_norm": 21.758216857910156,
"loss": 4.263,
"lr": 0.0009055944055944056,
"step": 1176,
"tokens_trained": 0.578013896
},
{
"epoch": 0.33416069782284946,
"grad_norm": 32.38374710083008,
"loss": 4.2713,
"lr": 0.0009053146853146853,
"step": 1178,
"tokens_trained": 0.57900508
},
{
"epoch": 0.33472803347280333,
"grad_norm": 35.57462692260742,
"loss": 4.2986,
"lr": 0.0009050349650349651,
"step": 1180,
"tokens_trained": 0.57999512
},
{
"epoch": 0.33529536912275726,
"grad_norm": 11.77812385559082,
"loss": 4.3085,
"lr": 0.0009047552447552448,
"step": 1182,
"tokens_trained": 0.580982752
},
{
"epoch": 0.33586270477271113,
"grad_norm": 51.48725509643555,
"loss": 4.4003,
"lr": 0.0009044755244755245,
"step": 1184,
"tokens_trained": 0.581964936
},
{
"epoch": 0.33643004042266506,
"grad_norm": 47.01481628417969,
"loss": 4.3182,
"lr": 0.0009041958041958041,
"step": 1186,
"tokens_trained": 0.582949944
},
{
"epoch": 0.336997376072619,
"grad_norm": 22.935691833496094,
"loss": 4.3432,
"lr": 0.000903916083916084,
"step": 1188,
"tokens_trained": 0.583934776
},
{
"epoch": 0.33756471172257285,
"grad_norm": 45.21054458618164,
"loss": 4.4674,
"lr": 0.0009036363636363637,
"step": 1190,
"tokens_trained": 0.584918344
},
{
"epoch": 0.3381320473725268,
"grad_norm": 27.012706756591797,
"loss": 4.2889,
"lr": 0.0009033566433566434,
"step": 1192,
"tokens_trained": 0.585897632
},
{
"epoch": 0.33869938302248065,
"grad_norm": 16.68247413635254,
"loss": 4.2896,
"lr": 0.0009030769230769231,
"step": 1194,
"tokens_trained": 0.586879408
},
{
"epoch": 0.3392667186724346,
"grad_norm": 20.664148330688477,
"loss": 4.304,
"lr": 0.0009027972027972027,
"step": 1196,
"tokens_trained": 0.587859392
},
{
"epoch": 0.3398340543223885,
"grad_norm": 22.954742431640625,
"loss": 4.2853,
"lr": 0.0009025174825174826,
"step": 1198,
"tokens_trained": 0.588845408
},
{
"epoch": 0.34040138997234237,
"grad_norm": 23.226943969726562,
"loss": 4.2597,
"lr": 0.0009022377622377622,
"step": 1200,
"tokens_trained": 0.589832736
},
{
"epoch": 0.3409687256222963,
"grad_norm": 7.963059902191162,
"loss": 4.261,
"lr": 0.000901958041958042,
"step": 1202,
"tokens_trained": 0.590816568
},
{
"epoch": 0.34153606127225017,
"grad_norm": 25.160730361938477,
"loss": 4.3288,
"lr": 0.0009016783216783216,
"step": 1204,
"tokens_trained": 0.59179692
},
{
"epoch": 0.3421033969222041,
"grad_norm": 38.45030212402344,
"loss": 4.3371,
"lr": 0.0009013986013986014,
"step": 1206,
"tokens_trained": 0.592780968
},
{
"epoch": 0.342670732572158,
"grad_norm": 52.66873550415039,
"loss": 4.2805,
"lr": 0.0009011188811188812,
"step": 1208,
"tokens_trained": 0.593760896
},
{
"epoch": 0.3432380682221119,
"grad_norm": 28.104921340942383,
"loss": 4.3885,
"lr": 0.0009008391608391609,
"step": 1210,
"tokens_trained": 0.59474304
},
{
"epoch": 0.3438054038720658,
"grad_norm": 49.20989990234375,
"loss": 4.346,
"lr": 0.0009005594405594406,
"step": 1212,
"tokens_trained": 0.59572768
},
{
"epoch": 0.3443727395220197,
"grad_norm": 20.652427673339844,
"loss": 4.2368,
"lr": 0.0009002797202797202,
"step": 1214,
"tokens_trained": 0.59671092
},
{
"epoch": 0.3449400751719736,
"grad_norm": 17.821596145629883,
"loss": 4.3041,
"lr": 0.0009000000000000001,
"step": 1216,
"tokens_trained": 0.597697344
},
{
"epoch": 0.34550741082192754,
"grad_norm": 48.594932556152344,
"loss": 4.3668,
"lr": 0.0008997202797202797,
"step": 1218,
"tokens_trained": 0.598677288
},
{
"epoch": 0.3460747464718814,
"grad_norm": 27.70078468322754,
"loss": 4.2939,
"lr": 0.0008994405594405595,
"step": 1220,
"tokens_trained": 0.599662488
},
{
"epoch": 0.34664208212183534,
"grad_norm": 25.498798370361328,
"loss": 4.2891,
"lr": 0.0008991608391608391,
"step": 1222,
"tokens_trained": 0.600646904
},
{
"epoch": 0.3472094177717892,
"grad_norm": 13.455835342407227,
"loss": 4.2881,
"lr": 0.0008988811188811188,
"step": 1224,
"tokens_trained": 0.601628112
},
{
"epoch": 0.34777675342174313,
"grad_norm": 17.518342971801758,
"loss": 4.2977,
"lr": 0.0008986013986013987,
"step": 1226,
"tokens_trained": 0.602612336
},
{
"epoch": 0.34834408907169706,
"grad_norm": 20.642597198486328,
"loss": 4.2921,
"lr": 0.0008983216783216783,
"step": 1228,
"tokens_trained": 0.603595
},
{
"epoch": 0.34891142472165093,
"grad_norm": 14.464616775512695,
"loss": 4.233,
"lr": 0.0008980419580419581,
"step": 1230,
"tokens_trained": 0.604576592
},
{
"epoch": 0.34947876037160486,
"grad_norm": 13.204504013061523,
"loss": 4.2707,
"lr": 0.0008977622377622377,
"step": 1232,
"tokens_trained": 0.60555656
},
{
"epoch": 0.35004609602155873,
"grad_norm": 12.241665840148926,
"loss": 4.2506,
"lr": 0.0008974825174825176,
"step": 1234,
"tokens_trained": 0.606536024
},
{
"epoch": 0.35061343167151265,
"grad_norm": 18.187660217285156,
"loss": 4.2659,
"lr": 0.0008972027972027972,
"step": 1236,
"tokens_trained": 0.607522576
},
{
"epoch": 0.3511807673214666,
"grad_norm": 8.911888122558594,
"loss": 4.2505,
"lr": 0.000896923076923077,
"step": 1238,
"tokens_trained": 0.608507736
},
{
"epoch": 0.35174810297142045,
"grad_norm": 21.351713180541992,
"loss": 4.2291,
"lr": 0.0008966433566433566,
"step": 1240,
"tokens_trained": 0.609486688
},
{
"epoch": 0.3523154386213744,
"grad_norm": 47.81566619873047,
"loss": 4.2725,
"lr": 0.0008963636363636363,
"step": 1242,
"tokens_trained": 0.610470272
},
{
"epoch": 0.35288277427132825,
"grad_norm": 33.53351974487305,
"loss": 4.3237,
"lr": 0.0008960839160839162,
"step": 1244,
"tokens_trained": 0.611455176
},
{
"epoch": 0.3534501099212822,
"grad_norm": 15.252607345581055,
"loss": 4.2868,
"lr": 0.0008958041958041958,
"step": 1246,
"tokens_trained": 0.612437888
},
{
"epoch": 0.3540174455712361,
"grad_norm": 24.129865646362305,
"loss": 4.2626,
"lr": 0.0008955244755244756,
"step": 1248,
"tokens_trained": 0.613420728
},
{
"epoch": 0.35458478122118997,
"grad_norm": 34.814605712890625,
"loss": 4.2627,
"lr": 0.0008952447552447552,
"step": 1250,
"tokens_trained": 0.614405904
},
{
"epoch": 0.35458478122118997,
"eval_loss": 1.078355312347412,
"eval_runtime": 20.4723,
"step": 1250,
"tokens_trained": 0.614405904
},
{
"epoch": 0.3551521168711439,
"grad_norm": 18.26809310913086,
"loss": 4.2986,
"lr": 0.000894965034965035,
"step": 1252,
"tokens_trained": 0.615386288
},
{
"epoch": 0.35571945252109777,
"grad_norm": 24.68335723876953,
"loss": 4.3146,
"lr": 0.0008946853146853147,
"step": 1254,
"tokens_trained": 0.616370576
},
{
"epoch": 0.3562867881710517,
"grad_norm": 35.34586715698242,
"loss": 4.2905,
"lr": 0.0008944055944055944,
"step": 1256,
"tokens_trained": 0.617351944
},
{
"epoch": 0.3568541238210056,
"grad_norm": 22.668407440185547,
"loss": 4.2607,
"lr": 0.0008941258741258741,
"step": 1258,
"tokens_trained": 0.618334816
},
{
"epoch": 0.3574214594709595,
"grad_norm": 14.068164825439453,
"loss": 4.2459,
"lr": 0.0008938461538461538,
"step": 1260,
"tokens_trained": 0.619319736
},
{
"epoch": 0.3579887951209134,
"grad_norm": 8.274995803833008,
"loss": 4.2713,
"lr": 0.0008935664335664337,
"step": 1262,
"tokens_trained": 0.620299344
},
{
"epoch": 0.3585561307708673,
"grad_norm": 22.12897491455078,
"loss": 4.2841,
"lr": 0.0008932867132867133,
"step": 1264,
"tokens_trained": 0.621282592
},
{
"epoch": 0.3591234664208212,
"grad_norm": 26.171052932739258,
"loss": 4.2505,
"lr": 0.000893006993006993,
"step": 1266,
"tokens_trained": 0.622266136
},
{
"epoch": 0.35969080207077514,
"grad_norm": 14.768603324890137,
"loss": 4.271,
"lr": 0.0008927272727272727,
"step": 1268,
"tokens_trained": 0.623247816
},
{
"epoch": 0.360258137720729,
"grad_norm": 13.065408706665039,
"loss": 4.2387,
"lr": 0.0008924475524475525,
"step": 1270,
"tokens_trained": 0.624234848
},
{
"epoch": 0.36082547337068294,
"grad_norm": 14.043888092041016,
"loss": 4.2601,
"lr": 0.0008921678321678322,
"step": 1272,
"tokens_trained": 0.625214176
},
{
"epoch": 0.3613928090206368,
"grad_norm": 13.734328269958496,
"loss": 4.2426,
"lr": 0.0008918881118881119,
"step": 1274,
"tokens_trained": 0.626197608
},
{
"epoch": 0.36196014467059073,
"grad_norm": 10.075374603271484,
"loss": 4.2259,
"lr": 0.0008916083916083916,
"step": 1276,
"tokens_trained": 0.62717884
},
{
"epoch": 0.36252748032054466,
"grad_norm": 33.92001724243164,
"loss": 4.3054,
"lr": 0.0008913286713286713,
"step": 1278,
"tokens_trained": 0.628166888
},
{
"epoch": 0.36309481597049853,
"grad_norm": 31.1391544342041,
"loss": 4.3066,
"lr": 0.0008910489510489512,
"step": 1280,
"tokens_trained": 0.629152528
},
{
"epoch": 0.36366215162045246,
"grad_norm": 10.888711929321289,
"loss": 4.2348,
"lr": 0.0008907692307692308,
"step": 1282,
"tokens_trained": 0.630132584
},
{
"epoch": 0.3642294872704063,
"grad_norm": 27.298410415649414,
"loss": 4.3225,
"lr": 0.0008904895104895105,
"step": 1284,
"tokens_trained": 0.63111212
},
{
"epoch": 0.36479682292036025,
"grad_norm": 23.396818161010742,
"loss": 4.3177,
"lr": 0.0008902097902097902,
"step": 1286,
"tokens_trained": 0.632094984
},
{
"epoch": 0.3653641585703142,
"grad_norm": 18.824432373046875,
"loss": 4.2235,
"lr": 0.00088993006993007,
"step": 1288,
"tokens_trained": 0.633076832
},
{
"epoch": 0.36593149422026805,
"grad_norm": 8.04826545715332,
"loss": 4.2268,
"lr": 0.0008896503496503497,
"step": 1290,
"tokens_trained": 0.63405868
},
{
"epoch": 0.366498829870222,
"grad_norm": 32.26673889160156,
"loss": 4.3113,
"lr": 0.0008893706293706294,
"step": 1292,
"tokens_trained": 0.635045096
},
{
"epoch": 0.36706616552017585,
"grad_norm": 29.91358184814453,
"loss": 4.2971,
"lr": 0.000889090909090909,
"step": 1294,
"tokens_trained": 0.63603008
},
{
"epoch": 0.3676335011701298,
"grad_norm": 12.093538284301758,
"loss": 4.2502,
"lr": 0.0008888111888111888,
"step": 1296,
"tokens_trained": 0.637014016
},
{
"epoch": 0.3682008368200837,
"grad_norm": 8.252509117126465,
"loss": 4.2905,
"lr": 0.0008885314685314686,
"step": 1298,
"tokens_trained": 0.637997752
},
{
"epoch": 0.36876817247003757,
"grad_norm": 61.22240447998047,
"loss": 4.4753,
"lr": 0.0008882517482517483,
"step": 1300,
"tokens_trained": 0.638981552
},
{
"epoch": 0.3693355081199915,
"grad_norm": 47.58195877075195,
"loss": 4.2769,
"lr": 0.000887972027972028,
"step": 1302,
"tokens_trained": 0.639963512
},
{
"epoch": 0.36990284376994537,
"grad_norm": 28.806411743164062,
"loss": 4.3728,
"lr": 0.0008876923076923077,
"step": 1304,
"tokens_trained": 0.640948392
},
{
"epoch": 0.3704701794198993,
"grad_norm": 38.960853576660156,
"loss": 4.338,
"lr": 0.0008874125874125875,
"step": 1306,
"tokens_trained": 0.641935304
},
{
"epoch": 0.3710375150698532,
"grad_norm": 25.05726432800293,
"loss": 4.3002,
"lr": 0.0008871328671328671,
"step": 1308,
"tokens_trained": 0.642924168
},
{
"epoch": 0.3716048507198071,
"grad_norm": 39.84127426147461,
"loss": 4.3593,
"lr": 0.0008868531468531469,
"step": 1310,
"tokens_trained": 0.64390412
},
{
"epoch": 0.372172186369761,
"grad_norm": 15.03055191040039,
"loss": 4.223,
"lr": 0.0008865734265734265,
"step": 1312,
"tokens_trained": 0.644882104
},
{
"epoch": 0.3727395220197149,
"grad_norm": 41.85628890991211,
"loss": 4.3819,
"lr": 0.0008862937062937063,
"step": 1314,
"tokens_trained": 0.645866912
},
{
"epoch": 0.3733068576696688,
"grad_norm": 29.014118194580078,
"loss": 4.2843,
"lr": 0.0008860139860139861,
"step": 1316,
"tokens_trained": 0.646850376
},
{
"epoch": 0.37387419331962274,
"grad_norm": 24.407743453979492,
"loss": 4.2598,
"lr": 0.0008857342657342658,
"step": 1318,
"tokens_trained": 0.647832272
},
{
"epoch": 0.3744415289695766,
"grad_norm": 23.28154182434082,
"loss": 4.2162,
"lr": 0.0008854545454545455,
"step": 1320,
"tokens_trained": 0.64881652
},
{
"epoch": 0.37500886461953054,
"grad_norm": 17.70418930053711,
"loss": 4.2386,
"lr": 0.0008851748251748251,
"step": 1322,
"tokens_trained": 0.649794936
},
{
"epoch": 0.37557620026948446,
"grad_norm": 22.582124710083008,
"loss": 4.2358,
"lr": 0.000884895104895105,
"step": 1324,
"tokens_trained": 0.650777784
},
{
"epoch": 0.37614353591943833,
"grad_norm": 16.77848243713379,
"loss": 4.2536,
"lr": 0.0008846153846153846,
"step": 1326,
"tokens_trained": 0.651762472
},
{
"epoch": 0.37671087156939226,
"grad_norm": 14.382417678833008,
"loss": 4.2403,
"lr": 0.0008843356643356644,
"step": 1328,
"tokens_trained": 0.652741832
},
{
"epoch": 0.37727820721934613,
"grad_norm": 22.420886993408203,
"loss": 4.1977,
"lr": 0.000884055944055944,
"step": 1330,
"tokens_trained": 0.653725792
},
{
"epoch": 0.37784554286930006,
"grad_norm": 9.768660545349121,
"loss": 4.2148,
"lr": 0.0008837762237762238,
"step": 1332,
"tokens_trained": 0.654704648
},
{
"epoch": 0.378412878519254,
"grad_norm": 5.091487407684326,
"loss": 4.2062,
"lr": 0.0008834965034965036,
"step": 1334,
"tokens_trained": 0.65569176
},
{
"epoch": 0.37898021416920785,
"grad_norm": 53.520957946777344,
"loss": 4.4082,
"lr": 0.0008832167832167832,
"step": 1336,
"tokens_trained": 0.656679344
},
{
"epoch": 0.3795475498191618,
"grad_norm": 32.17420959472656,
"loss": 4.2911,
"lr": 0.000882937062937063,
"step": 1338,
"tokens_trained": 0.657665136
},
{
"epoch": 0.38011488546911565,
"grad_norm": 14.12790584564209,
"loss": 4.2899,
"lr": 0.0008826573426573426,
"step": 1340,
"tokens_trained": 0.658651576
},
{
"epoch": 0.3806822211190696,
"grad_norm": 51.74199676513672,
"loss": 4.3901,
"lr": 0.0008823776223776225,
"step": 1342,
"tokens_trained": 0.659631792
},
{
"epoch": 0.3812495567690235,
"grad_norm": 48.99909973144531,
"loss": 4.298,
"lr": 0.0008820979020979021,
"step": 1344,
"tokens_trained": 0.660616912
},
{
"epoch": 0.38181689241897737,
"grad_norm": 28.356245040893555,
"loss": 4.3171,
"lr": 0.0008818181818181819,
"step": 1346,
"tokens_trained": 0.66159872
},
{
"epoch": 0.3823842280689313,
"grad_norm": 45.081703186035156,
"loss": 4.3067,
"lr": 0.0008815384615384615,
"step": 1348,
"tokens_trained": 0.662582152
},
{
"epoch": 0.38295156371888517,
"grad_norm": 37.175052642822266,
"loss": 4.241,
"lr": 0.0008812587412587412,
"step": 1350,
"tokens_trained": 0.663561176
},
{
"epoch": 0.3835188993688391,
"grad_norm": 49.46076965332031,
"loss": 4.2896,
"lr": 0.0008809790209790211,
"step": 1352,
"tokens_trained": 0.664545144
},
{
"epoch": 0.384086235018793,
"grad_norm": 22.20182991027832,
"loss": 4.323,
"lr": 0.0008806993006993007,
"step": 1354,
"tokens_trained": 0.66553092
},
{
"epoch": 0.3846535706687469,
"grad_norm": 34.111549377441406,
"loss": 4.3138,
"lr": 0.0008804195804195805,
"step": 1356,
"tokens_trained": 0.666517568
},
{
"epoch": 0.3852209063187008,
"grad_norm": 47.01582336425781,
"loss": 4.3009,
"lr": 0.0008801398601398601,
"step": 1358,
"tokens_trained": 0.667498192
},
{
"epoch": 0.3857882419686547,
"grad_norm": 18.845388412475586,
"loss": 4.3176,
"lr": 0.00087986013986014,
"step": 1360,
"tokens_trained": 0.668479008
},
{
"epoch": 0.3863555776186086,
"grad_norm": 53.68927764892578,
"loss": 4.4024,
"lr": 0.0008795804195804196,
"step": 1362,
"tokens_trained": 0.669462472
},
{
"epoch": 0.38692291326856254,
"grad_norm": 29.88358497619629,
"loss": 4.286,
"lr": 0.0008793006993006993,
"step": 1364,
"tokens_trained": 0.67044392
},
{
"epoch": 0.3874902489185164,
"grad_norm": 11.12879753112793,
"loss": 4.3024,
"lr": 0.000879020979020979,
"step": 1366,
"tokens_trained": 0.671424552
},
{
"epoch": 0.38805758456847034,
"grad_norm": 23.573301315307617,
"loss": 4.2662,
"lr": 0.0008787412587412587,
"step": 1368,
"tokens_trained": 0.672409992
},
{
"epoch": 0.3886249202184242,
"grad_norm": 24.749160766601562,
"loss": 4.274,
"lr": 0.0008784615384615386,
"step": 1370,
"tokens_trained": 0.67339824
},
{
"epoch": 0.38919225586837813,
"grad_norm": 33.26881408691406,
"loss": 4.2588,
"lr": 0.0008781818181818182,
"step": 1372,
"tokens_trained": 0.67438204
},
{
"epoch": 0.38975959151833206,
"grad_norm": 24.466472625732422,
"loss": 4.2837,
"lr": 0.000877902097902098,
"step": 1374,
"tokens_trained": 0.67536356
},
{
"epoch": 0.39004325934330897,
"eval_loss": 1.0616238117218018,
"eval_runtime": 20.3698,
"step": 1375,
"tokens_trained": 0.675855672
},
{
"epoch": 0.39032692716828593,
"grad_norm": 24.48844337463379,
"loss": 4.259,
"lr": 0.0008776223776223776,
"step": 1376,
"tokens_trained": 0.676346368
},
{
"epoch": 0.39089426281823986,
"grad_norm": 30.594989776611328,
"loss": 4.1894,
"lr": 0.0008773426573426574,
"step": 1378,
"tokens_trained": 0.677329312
},
{
"epoch": 0.3914615984681937,
"grad_norm": 19.835350036621094,
"loss": 4.2718,
"lr": 0.0008770629370629371,
"step": 1380,
"tokens_trained": 0.678312272
},
{
"epoch": 0.39202893411814765,
"grad_norm": 14.570358276367188,
"loss": 4.2419,
"lr": 0.0008767832167832168,
"step": 1382,
"tokens_trained": 0.679291216
},
{
"epoch": 0.3925962697681016,
"grad_norm": 11.608271598815918,
"loss": 4.1917,
"lr": 0.0008765034965034965,
"step": 1384,
"tokens_trained": 0.680273296
},
{
"epoch": 0.39316360541805545,
"grad_norm": 26.094860076904297,
"loss": 4.2762,
"lr": 0.0008762237762237762,
"step": 1386,
"tokens_trained": 0.681249464
},
{
"epoch": 0.3937309410680094,
"grad_norm": 12.754049301147461,
"loss": 4.2032,
"lr": 0.0008759440559440561,
"step": 1388,
"tokens_trained": 0.682234168
},
{
"epoch": 0.39429827671796325,
"grad_norm": 5.951663970947266,
"loss": 4.1921,
"lr": 0.0008756643356643357,
"step": 1390,
"tokens_trained": 0.683217176
},
{
"epoch": 0.3948656123679172,
"grad_norm": 26.907669067382812,
"loss": 4.24,
"lr": 0.0008753846153846154,
"step": 1392,
"tokens_trained": 0.68419888
},
{
"epoch": 0.3954329480178711,
"grad_norm": 25.04796600341797,
"loss": 4.2656,
"lr": 0.0008751048951048951,
"step": 1394,
"tokens_trained": 0.685178784
},
{
"epoch": 0.39600028366782497,
"grad_norm": 19.600811004638672,
"loss": 4.2683,
"lr": 0.0008748251748251749,
"step": 1396,
"tokens_trained": 0.686161632
},
{
"epoch": 0.3965676193177789,
"grad_norm": 14.087088584899902,
"loss": 4.2658,
"lr": 0.0008745454545454546,
"step": 1398,
"tokens_trained": 0.687139992
},
{
"epoch": 0.39713495496773277,
"grad_norm": 9.257765769958496,
"loss": 4.2021,
"lr": 0.0008742657342657343,
"step": 1400,
"tokens_trained": 0.688117912
},
{
"epoch": 0.3977022906176867,
"grad_norm": 18.830154418945312,
"loss": 4.2249,
"lr": 0.0008739860139860139,
"step": 1402,
"tokens_trained": 0.689098776
},
{
"epoch": 0.3982696262676406,
"grad_norm": 24.81566619873047,
"loss": 4.246,
"lr": 0.0008737062937062937,
"step": 1404,
"tokens_trained": 0.690085432
},
{
"epoch": 0.3988369619175945,
"grad_norm": 14.071616172790527,
"loss": 4.2531,
"lr": 0.0008734265734265734,
"step": 1406,
"tokens_trained": 0.691069232
},
{
"epoch": 0.3994042975675484,
"grad_norm": 21.414424896240234,
"loss": 4.2192,
"lr": 0.0008731468531468532,
"step": 1408,
"tokens_trained": 0.692051224
},
{
"epoch": 0.3999716332175023,
"grad_norm": 38.74683380126953,
"loss": 4.2421,
"lr": 0.0008728671328671329,
"step": 1410,
"tokens_trained": 0.693029976
},
{
"epoch": 0.4005389688674562,
"grad_norm": 12.595442771911621,
"loss": 4.2569,
"lr": 0.0008725874125874126,
"step": 1412,
"tokens_trained": 0.694013304
},
{
"epoch": 0.40110630451741014,
"grad_norm": 55.233673095703125,
"loss": 4.3422,
"lr": 0.0008723076923076924,
"step": 1414,
"tokens_trained": 0.694997536
},
{
"epoch": 0.401673640167364,
"grad_norm": 24.717113494873047,
"loss": 4.2567,
"lr": 0.000872027972027972,
"step": 1416,
"tokens_trained": 0.695982632
},
{
"epoch": 0.40224097581731794,
"grad_norm": 20.552875518798828,
"loss": 4.2464,
"lr": 0.0008717482517482518,
"step": 1418,
"tokens_trained": 0.696966408
},
{
"epoch": 0.4028083114672718,
"grad_norm": 25.569900512695312,
"loss": 4.21,
"lr": 0.0008714685314685314,
"step": 1420,
"tokens_trained": 0.697948224
},
{
"epoch": 0.40337564711722573,
"grad_norm": 24.538320541381836,
"loss": 4.2605,
"lr": 0.0008711888111888112,
"step": 1422,
"tokens_trained": 0.698934688
},
{
"epoch": 0.40394298276717966,
"grad_norm": 9.585651397705078,
"loss": 4.2524,
"lr": 0.0008709090909090909,
"step": 1424,
"tokens_trained": 0.699921976
},
{
"epoch": 0.40451031841713353,
"grad_norm": 11.886672973632812,
"loss": 4.1934,
"lr": 0.0008706293706293707,
"step": 1426,
"tokens_trained": 0.70090396
},
{
"epoch": 0.40507765406708746,
"grad_norm": 26.162124633789062,
"loss": 4.2412,
"lr": 0.0008703496503496504,
"step": 1428,
"tokens_trained": 0.701888448
},
{
"epoch": 0.4056449897170413,
"grad_norm": 5.03931188583374,
"loss": 4.202,
"lr": 0.00087006993006993,
"step": 1430,
"tokens_trained": 0.702864336
},
{
"epoch": 0.40621232536699525,
"grad_norm": 33.67579650878906,
"loss": 4.3087,
"lr": 0.0008697902097902099,
"step": 1432,
"tokens_trained": 0.703847784
},
{
"epoch": 0.4067796610169492,
"grad_norm": 34.38542556762695,
"loss": 4.2807,
"lr": 0.0008695104895104895,
"step": 1434,
"tokens_trained": 0.704827288
},
{
"epoch": 0.40734699666690305,
"grad_norm": 13.319886207580566,
"loss": 4.3332,
"lr": 0.0008692307692307693,
"step": 1436,
"tokens_trained": 0.705815392
},
{
"epoch": 0.407914332316857,
"grad_norm": 36.58311080932617,
"loss": 4.3318,
"lr": 0.0008689510489510489,
"step": 1438,
"tokens_trained": 0.7067914
},
{
"epoch": 0.40848166796681085,
"grad_norm": 29.63648223876953,
"loss": 4.2962,
"lr": 0.0008686713286713287,
"step": 1440,
"tokens_trained": 0.70777396
},
{
"epoch": 0.4090490036167648,
"grad_norm": 9.55128002166748,
"loss": 4.2773,
"lr": 0.0008683916083916084,
"step": 1442,
"tokens_trained": 0.708750496
},
{
"epoch": 0.4096163392667187,
"grad_norm": 53.83981704711914,
"loss": 4.3875,
"lr": 0.0008681118881118881,
"step": 1444,
"tokens_trained": 0.709730168
},
{
"epoch": 0.41018367491667257,
"grad_norm": 54.59236526489258,
"loss": 4.3582,
"lr": 0.0008678321678321679,
"step": 1446,
"tokens_trained": 0.710709704
},
{
"epoch": 0.4107510105666265,
"grad_norm": 13.964411735534668,
"loss": 4.3065,
"lr": 0.0008675524475524475,
"step": 1448,
"tokens_trained": 0.711690136
},
{
"epoch": 0.41131834621658037,
"grad_norm": 25.506649017333984,
"loss": 4.2686,
"lr": 0.0008672727272727273,
"step": 1450,
"tokens_trained": 0.712668056
},
{
"epoch": 0.4118856818665343,
"grad_norm": 21.1628360748291,
"loss": 4.2485,
"lr": 0.000866993006993007,
"step": 1452,
"tokens_trained": 0.71365004
},
{
"epoch": 0.4124530175164882,
"grad_norm": 15.751238822937012,
"loss": 4.2078,
"lr": 0.0008667132867132868,
"step": 1454,
"tokens_trained": 0.714632032
},
{
"epoch": 0.4130203531664421,
"grad_norm": 15.838552474975586,
"loss": 4.1944,
"lr": 0.0008664335664335664,
"step": 1456,
"tokens_trained": 0.715611376
},
{
"epoch": 0.413587688816396,
"grad_norm": 15.968609809875488,
"loss": 4.1768,
"lr": 0.0008661538461538461,
"step": 1458,
"tokens_trained": 0.716591112
},
{
"epoch": 0.4141550244663499,
"grad_norm": 15.419891357421875,
"loss": 4.1978,
"lr": 0.0008658741258741259,
"step": 1460,
"tokens_trained": 0.717575952
},
{
"epoch": 0.4147223601163038,
"grad_norm": 15.088132858276367,
"loss": 4.2361,
"lr": 0.0008655944055944056,
"step": 1462,
"tokens_trained": 0.718563696
},
{
"epoch": 0.41528969576625774,
"grad_norm": 4.839190483093262,
"loss": 4.2089,
"lr": 0.0008653146853146854,
"step": 1464,
"tokens_trained": 0.71954848
},
{
"epoch": 0.4158570314162116,
"grad_norm": 22.192466735839844,
"loss": 4.2109,
"lr": 0.000865034965034965,
"step": 1466,
"tokens_trained": 0.720533304
},
{
"epoch": 0.41642436706616553,
"grad_norm": 28.983531951904297,
"loss": 4.2402,
"lr": 0.0008647552447552448,
"step": 1468,
"tokens_trained": 0.721518176
},
{
"epoch": 0.4169917027161194,
"grad_norm": 21.010780334472656,
"loss": 4.1732,
"lr": 0.0008644755244755245,
"step": 1470,
"tokens_trained": 0.72250176
},
{
"epoch": 0.41755903836607333,
"grad_norm": 14.59277057647705,
"loss": 4.1847,
"lr": 0.0008641958041958042,
"step": 1472,
"tokens_trained": 0.723486664
},
{
"epoch": 0.41812637401602726,
"grad_norm": 13.688531875610352,
"loss": 4.1577,
"lr": 0.0008639160839160839,
"step": 1474,
"tokens_trained": 0.724469328
},
{
"epoch": 0.41869370966598113,
"grad_norm": 15.879347801208496,
"loss": 4.1721,
"lr": 0.0008636363636363636,
"step": 1476,
"tokens_trained": 0.725454968
},
{
"epoch": 0.41926104531593505,
"grad_norm": 10.225201606750488,
"loss": 4.1999,
"lr": 0.0008633566433566434,
"step": 1478,
"tokens_trained": 0.7264426
},
{
"epoch": 0.4198283809658889,
"grad_norm": 17.007728576660156,
"loss": 4.2229,
"lr": 0.0008630769230769231,
"step": 1480,
"tokens_trained": 0.727422056
},
{
"epoch": 0.42039571661584285,
"grad_norm": 13.517934799194336,
"loss": 4.2241,
"lr": 0.0008627972027972029,
"step": 1482,
"tokens_trained": 0.728403688
},
{
"epoch": 0.4209630522657968,
"grad_norm": 17.132064819335938,
"loss": 4.1679,
"lr": 0.0008625174825174825,
"step": 1484,
"tokens_trained": 0.729386248
},
{
"epoch": 0.42153038791575065,
"grad_norm": 19.782320022583008,
"loss": 4.1817,
"lr": 0.0008622377622377622,
"step": 1486,
"tokens_trained": 0.730368752
},
{
"epoch": 0.4220977235657046,
"grad_norm": 3.388552188873291,
"loss": 4.1726,
"lr": 0.000861958041958042,
"step": 1488,
"tokens_trained": 0.731354304
},
{
"epoch": 0.42266505921565845,
"grad_norm": 28.33499526977539,
"loss": 4.2623,
"lr": 0.0008616783216783217,
"step": 1490,
"tokens_trained": 0.732337296
},
{
"epoch": 0.42323239486561237,
"grad_norm": 24.927406311035156,
"loss": 4.2422,
"lr": 0.0008613986013986014,
"step": 1492,
"tokens_trained": 0.733319824
},
{
"epoch": 0.4237997305155663,
"grad_norm": 25.996028900146484,
"loss": 4.2227,
"lr": 0.0008611188811188811,
"step": 1494,
"tokens_trained": 0.73430636
},
{
"epoch": 0.42436706616552017,
"grad_norm": 14.625783920288086,
"loss": 4.2268,
"lr": 0.0008608391608391609,
"step": 1496,
"tokens_trained": 0.735285848
},
{
"epoch": 0.4249344018154741,
"grad_norm": 12.556640625,
"loss": 4.2352,
"lr": 0.0008605594405594406,
"step": 1498,
"tokens_trained": 0.736270632
},
{
"epoch": 0.42550173746542796,
"grad_norm": 18.579416275024414,
"loss": 4.2377,
"lr": 0.0008602797202797203,
"step": 1500,
"tokens_trained": 0.737255104
},
{
"epoch": 0.42550173746542796,
"eval_loss": 1.052606463432312,
"eval_runtime": 20.5089,
"step": 1500,
"tokens_trained": 0.737255104
},
{
"epoch": 0.4260690731153819,
"grad_norm": 16.550657272338867,
"loss": 4.182,
"lr": 0.00086,
"step": 1502,
"tokens_trained": 0.738240848
},
{
"epoch": 0.4266364087653358,
"grad_norm": 24.4381046295166,
"loss": 4.2093,
"lr": 0.0008597202797202797,
"step": 1504,
"tokens_trained": 0.73922592
},
{
"epoch": 0.4272037444152897,
"grad_norm": 13.155163764953613,
"loss": 4.239,
"lr": 0.0008594405594405595,
"step": 1506,
"tokens_trained": 0.740208896
},
{
"epoch": 0.4277710800652436,
"grad_norm": 27.667949676513672,
"loss": 4.2607,
"lr": 0.0008591608391608392,
"step": 1508,
"tokens_trained": 0.741189312
},
{
"epoch": 0.4283384157151975,
"grad_norm": 35.897743225097656,
"loss": 4.2153,
"lr": 0.0008588811188811188,
"step": 1510,
"tokens_trained": 0.742170456
},
{
"epoch": 0.4289057513651514,
"grad_norm": 18.16407012939453,
"loss": 4.2753,
"lr": 0.0008586013986013986,
"step": 1512,
"tokens_trained": 0.743152504
},
{
"epoch": 0.42947308701510534,
"grad_norm": 27.447364807128906,
"loss": 4.2321,
"lr": 0.0008583216783216783,
"step": 1514,
"tokens_trained": 0.744139768
},
{
"epoch": 0.4300404226650592,
"grad_norm": 21.115859985351562,
"loss": 4.2048,
"lr": 0.0008580419580419581,
"step": 1516,
"tokens_trained": 0.745122368
},
{
"epoch": 0.43060775831501313,
"grad_norm": 5.949585914611816,
"loss": 4.1787,
"lr": 0.0008577622377622378,
"step": 1518,
"tokens_trained": 0.746104936
},
{
"epoch": 0.431175093964967,
"grad_norm": 6.631585121154785,
"loss": 4.2035,
"lr": 0.0008574825174825175,
"step": 1520,
"tokens_trained": 0.747086264
},
{
"epoch": 0.43174242961492093,
"grad_norm": 38.91585159301758,
"loss": 4.354,
"lr": 0.0008572027972027972,
"step": 1522,
"tokens_trained": 0.74806844
},
{
"epoch": 0.43230976526487486,
"grad_norm": 37.53727722167969,
"loss": 4.228,
"lr": 0.000856923076923077,
"step": 1524,
"tokens_trained": 0.749052432
},
{
"epoch": 0.4328771009148287,
"grad_norm": 19.87713623046875,
"loss": 4.2696,
"lr": 0.0008566433566433567,
"step": 1526,
"tokens_trained": 0.750037072
},
{
"epoch": 0.43344443656478265,
"grad_norm": 25.615995407104492,
"loss": 4.2676,
"lr": 0.0008563636363636363,
"step": 1528,
"tokens_trained": 0.751020584
},
{
"epoch": 0.4340117722147365,
"grad_norm": 16.643299102783203,
"loss": 4.201,
"lr": 0.0008560839160839161,
"step": 1530,
"tokens_trained": 0.75200224
},
{
"epoch": 0.43457910786469045,
"grad_norm": 16.207853317260742,
"loss": 4.1944,
"lr": 0.0008558041958041958,
"step": 1532,
"tokens_trained": 0.752981624
},
{
"epoch": 0.4351464435146444,
"grad_norm": 27.054973602294922,
"loss": 4.2188,
"lr": 0.0008555244755244756,
"step": 1534,
"tokens_trained": 0.753968464
},
{
"epoch": 0.43571377916459825,
"grad_norm": 33.468238830566406,
"loss": 4.2052,
"lr": 0.0008552447552447553,
"step": 1536,
"tokens_trained": 0.754950976
},
{
"epoch": 0.4362811148145522,
"grad_norm": 21.083576202392578,
"loss": 4.2514,
"lr": 0.000854965034965035,
"step": 1538,
"tokens_trained": 0.755938272
},
{
"epoch": 0.43684845046450604,
"grad_norm": 19.927122116088867,
"loss": 4.2493,
"lr": 0.0008546853146853147,
"step": 1540,
"tokens_trained": 0.756916784
},
{
"epoch": 0.43741578611445997,
"grad_norm": 22.105287551879883,
"loss": 4.2264,
"lr": 0.0008544055944055944,
"step": 1542,
"tokens_trained": 0.757901152
},
{
"epoch": 0.4379831217644139,
"grad_norm": 22.448705673217773,
"loss": 4.1987,
"lr": 0.0008541258741258742,
"step": 1544,
"tokens_trained": 0.758886048
},
{
"epoch": 0.43855045741436777,
"grad_norm": 17.740005493164062,
"loss": 4.1918,
"lr": 0.0008538461538461538,
"step": 1546,
"tokens_trained": 0.759864304
},
{
"epoch": 0.4391177930643217,
"grad_norm": 20.58041763305664,
"loss": 4.2144,
"lr": 0.0008535664335664336,
"step": 1548,
"tokens_trained": 0.760844312
},
{
"epoch": 0.43968512871427556,
"grad_norm": 21.937252044677734,
"loss": 4.2129,
"lr": 0.0008532867132867133,
"step": 1550,
"tokens_trained": 0.761827256
},
{
"epoch": 0.4402524643642295,
"grad_norm": 26.883426666259766,
"loss": 4.2244,
"lr": 0.000853006993006993,
"step": 1552,
"tokens_trained": 0.7628098
},
{
"epoch": 0.4408198000141834,
"grad_norm": 10.297266960144043,
"loss": 4.1724,
"lr": 0.0008527272727272728,
"step": 1554,
"tokens_trained": 0.763792488
},
{
"epoch": 0.4413871356641373,
"grad_norm": 12.119601249694824,
"loss": 4.1828,
"lr": 0.0008524475524475524,
"step": 1556,
"tokens_trained": 0.764769936
},
{
"epoch": 0.4419544713140912,
"grad_norm": 16.565885543823242,
"loss": 4.2113,
"lr": 0.0008521678321678322,
"step": 1558,
"tokens_trained": 0.765752376
},
{
"epoch": 0.4425218069640451,
"grad_norm": 18.860309600830078,
"loss": 4.1864,
"lr": 0.0008518881118881119,
"step": 1560,
"tokens_trained": 0.766736256
},
{
"epoch": 0.443089142613999,
"grad_norm": 4.049737453460693,
"loss": 4.2108,
"lr": 0.0008516083916083917,
"step": 1562,
"tokens_trained": 0.767720568
},
{
"epoch": 0.44365647826395294,
"grad_norm": 15.730945587158203,
"loss": 4.2339,
"lr": 0.0008513286713286713,
"step": 1564,
"tokens_trained": 0.768701288
},
{
"epoch": 0.4442238139139068,
"grad_norm": 18.64398956298828,
"loss": 4.2132,
"lr": 0.000851048951048951,
"step": 1566,
"tokens_trained": 0.769681336
},
{
"epoch": 0.44479114956386073,
"grad_norm": 22.01759147644043,
"loss": 4.2211,
"lr": 0.0008507692307692308,
"step": 1568,
"tokens_trained": 0.770661168
},
{
"epoch": 0.4453584852138146,
"grad_norm": 3.097306489944458,
"loss": 4.2114,
"lr": 0.0008504895104895105,
"step": 1570,
"tokens_trained": 0.7716424
},
{
"epoch": 0.44592582086376853,
"grad_norm": 35.901546478271484,
"loss": 4.3,
"lr": 0.0008502097902097903,
"step": 1572,
"tokens_trained": 0.772627536
},
{
"epoch": 0.44649315651372246,
"grad_norm": 20.762710571289062,
"loss": 4.2465,
"lr": 0.0008499300699300699,
"step": 1574,
"tokens_trained": 0.77361008
},
{
"epoch": 0.4470604921636763,
"grad_norm": 13.54304027557373,
"loss": 4.221,
"lr": 0.0008496503496503497,
"step": 1576,
"tokens_trained": 0.774591184
},
{
"epoch": 0.44762782781363025,
"grad_norm": 18.83641242980957,
"loss": 4.2228,
"lr": 0.0008493706293706294,
"step": 1578,
"tokens_trained": 0.775574136
},
{
"epoch": 0.4481951634635841,
"grad_norm": 12.294941902160645,
"loss": 4.1768,
"lr": 0.0008490909090909091,
"step": 1580,
"tokens_trained": 0.776554752
},
{
"epoch": 0.44876249911353805,
"grad_norm": 5.768923759460449,
"loss": 4.2255,
"lr": 0.0008488111888111888,
"step": 1582,
"tokens_trained": 0.777539368
},
{
"epoch": 0.449329834763492,
"grad_norm": 7.9961137771606445,
"loss": 4.2218,
"lr": 0.0008485314685314685,
"step": 1584,
"tokens_trained": 0.778522344
},
{
"epoch": 0.44989717041344585,
"grad_norm": 22.005645751953125,
"loss": 4.2452,
"lr": 0.0008482517482517483,
"step": 1586,
"tokens_trained": 0.77950768
},
{
"epoch": 0.45046450606339977,
"grad_norm": 27.313426971435547,
"loss": 4.1875,
"lr": 0.000847972027972028,
"step": 1588,
"tokens_trained": 0.780490984
},
{
"epoch": 0.45103184171335364,
"grad_norm": 10.344687461853027,
"loss": 4.2356,
"lr": 0.0008476923076923078,
"step": 1590,
"tokens_trained": 0.781469
},
{
"epoch": 0.45159917736330757,
"grad_norm": 27.348726272583008,
"loss": 4.2962,
"lr": 0.0008474125874125874,
"step": 1592,
"tokens_trained": 0.782450304
},
{
"epoch": 0.4521665130132615,
"grad_norm": 32.965911865234375,
"loss": 4.2736,
"lr": 0.0008471328671328671,
"step": 1594,
"tokens_trained": 0.783431416
},
{
"epoch": 0.45273384866321537,
"grad_norm": 7.752636909484863,
"loss": 4.2074,
"lr": 0.0008468531468531469,
"step": 1596,
"tokens_trained": 0.784409568
},
{
"epoch": 0.4533011843131693,
"grad_norm": 38.85223388671875,
"loss": 4.3261,
"lr": 0.0008465734265734266,
"step": 1598,
"tokens_trained": 0.785399368
},
{
"epoch": 0.45386851996312316,
"grad_norm": 38.017967224121094,
"loss": 4.2646,
"lr": 0.0008462937062937063,
"step": 1600,
"tokens_trained": 0.786376072
},
{
"epoch": 0.4544358556130771,
"grad_norm": 7.856576442718506,
"loss": 4.191,
"lr": 0.000846013986013986,
"step": 1602,
"tokens_trained": 0.787362072
},
{
"epoch": 0.455003191263031,
"grad_norm": 37.902870178222656,
"loss": 4.2651,
"lr": 0.0008457342657342658,
"step": 1604,
"tokens_trained": 0.788345104
},
{
"epoch": 0.4555705269129849,
"grad_norm": 7.724793434143066,
"loss": 4.1994,
"lr": 0.0008454545454545455,
"step": 1606,
"tokens_trained": 0.7893314
},
{
"epoch": 0.4561378625629388,
"grad_norm": 26.484699249267578,
"loss": 4.2276,
"lr": 0.0008451748251748252,
"step": 1608,
"tokens_trained": 0.790309344
},
{
"epoch": 0.4567051982128927,
"grad_norm": 23.137874603271484,
"loss": 4.2082,
"lr": 0.0008448951048951049,
"step": 1610,
"tokens_trained": 0.791295784
},
{
"epoch": 0.4572725338628466,
"grad_norm": 13.902606964111328,
"loss": 4.2035,
"lr": 0.0008446153846153846,
"step": 1612,
"tokens_trained": 0.79228076
},
{
"epoch": 0.45783986951280053,
"grad_norm": 8.438498497009277,
"loss": 4.1713,
"lr": 0.0008443356643356644,
"step": 1614,
"tokens_trained": 0.793265456
},
{
"epoch": 0.4584072051627544,
"grad_norm": 11.60899829864502,
"loss": 4.1971,
"lr": 0.0008440559440559441,
"step": 1616,
"tokens_trained": 0.794245896
},
{
"epoch": 0.45897454081270833,
"grad_norm": 19.33312225341797,
"loss": 4.2328,
"lr": 0.0008437762237762238,
"step": 1618,
"tokens_trained": 0.795229016
},
{
"epoch": 0.4595418764626622,
"grad_norm": 16.45014190673828,
"loss": 4.2277,
"lr": 0.0008434965034965035,
"step": 1620,
"tokens_trained": 0.79620792
},
{
"epoch": 0.46010921211261613,
"grad_norm": 9.818867683410645,
"loss": 4.1494,
"lr": 0.0008432167832167832,
"step": 1622,
"tokens_trained": 0.797192352
},
{
"epoch": 0.46067654776257005,
"grad_norm": 7.920058250427246,
"loss": 4.2027,
"lr": 0.000842937062937063,
"step": 1624,
"tokens_trained": 0.798174104
},
{
"epoch": 0.46096021558754696,
"eval_loss": 1.044265627861023,
"eval_runtime": 20.5617,
"step": 1625,
"tokens_trained": 0.798668072
},
{
"epoch": 0.4612438834125239,
"grad_norm": 10.734235763549805,
"loss": 4.1505,
"lr": 0.0008426573426573427,
"step": 1626,
"tokens_trained": 0.799160304
},
{
"epoch": 0.46181121906247785,
"grad_norm": 23.376392364501953,
"loss": 4.195,
"lr": 0.0008423776223776224,
"step": 1628,
"tokens_trained": 0.800144144
},
{
"epoch": 0.4623785547124317,
"grad_norm": 23.567371368408203,
"loss": 4.2367,
"lr": 0.0008420979020979021,
"step": 1630,
"tokens_trained": 0.801131184
},
{
"epoch": 0.46294589036238565,
"grad_norm": 19.271820068359375,
"loss": 4.1899,
"lr": 0.0008418181818181819,
"step": 1632,
"tokens_trained": 0.802111296
},
{
"epoch": 0.4635132260123396,
"grad_norm": 17.468698501586914,
"loss": 4.1941,
"lr": 0.0008415384615384616,
"step": 1634,
"tokens_trained": 0.803095112
},
{
"epoch": 0.46408056166229344,
"grad_norm": 22.298749923706055,
"loss": 4.2083,
"lr": 0.0008412587412587412,
"step": 1636,
"tokens_trained": 0.804080456
},
{
"epoch": 0.46464789731224737,
"grad_norm": 12.506179809570312,
"loss": 4.1953,
"lr": 0.000840979020979021,
"step": 1638,
"tokens_trained": 0.805062464
},
{
"epoch": 0.46521523296220124,
"grad_norm": 11.819656372070312,
"loss": 4.2047,
"lr": 0.0008406993006993006,
"step": 1640,
"tokens_trained": 0.806045504
},
{
"epoch": 0.46578256861215517,
"grad_norm": 15.925740242004395,
"loss": 4.1565,
"lr": 0.0008404195804195805,
"step": 1642,
"tokens_trained": 0.80702736
},
{
"epoch": 0.4663499042621091,
"grad_norm": 15.869892120361328,
"loss": 4.2134,
"lr": 0.0008401398601398602,
"step": 1644,
"tokens_trained": 0.808009192
},
{
"epoch": 0.46691723991206296,
"grad_norm": 10.851021766662598,
"loss": 4.2041,
"lr": 0.0008398601398601399,
"step": 1646,
"tokens_trained": 0.808994728
},
{
"epoch": 0.4674845755620169,
"grad_norm": 8.271230697631836,
"loss": 4.1739,
"lr": 0.0008395804195804196,
"step": 1648,
"tokens_trained": 0.809976448
},
{
"epoch": 0.46805191121197076,
"grad_norm": 13.768092155456543,
"loss": 4.1761,
"lr": 0.0008393006993006993,
"step": 1650,
"tokens_trained": 0.810958392
},
{
"epoch": 0.4686192468619247,
"grad_norm": 7.760485649108887,
"loss": 4.1826,
"lr": 0.0008390209790209791,
"step": 1652,
"tokens_trained": 0.81194136
},
{
"epoch": 0.4691865825118786,
"grad_norm": 13.28488540649414,
"loss": 4.1659,
"lr": 0.0008387412587412587,
"step": 1654,
"tokens_trained": 0.812924984
},
{
"epoch": 0.4697539181618325,
"grad_norm": 10.466367721557617,
"loss": 4.1432,
"lr": 0.0008384615384615385,
"step": 1656,
"tokens_trained": 0.813907424
},
{
"epoch": 0.4703212538117864,
"grad_norm": 15.40854549407959,
"loss": 4.1625,
"lr": 0.0008381818181818181,
"step": 1658,
"tokens_trained": 0.814888712
},
{
"epoch": 0.4708885894617403,
"grad_norm": 20.580612182617188,
"loss": 4.1636,
"lr": 0.000837902097902098,
"step": 1660,
"tokens_trained": 0.815869152
},
{
"epoch": 0.4714559251116942,
"grad_norm": 14.908403396606445,
"loss": 4.1763,
"lr": 0.0008376223776223776,
"step": 1662,
"tokens_trained": 0.816852664
},
{
"epoch": 0.47202326076164813,
"grad_norm": 10.217529296875,
"loss": 4.1934,
"lr": 0.0008373426573426573,
"step": 1664,
"tokens_trained": 0.817832792
},
{
"epoch": 0.472590596411602,
"grad_norm": 15.74150276184082,
"loss": 4.1714,
"lr": 0.0008370629370629371,
"step": 1666,
"tokens_trained": 0.81881728
},
{
"epoch": 0.47315793206155593,
"grad_norm": 15.39499282836914,
"loss": 4.2005,
"lr": 0.0008367832167832168,
"step": 1668,
"tokens_trained": 0.819800824
},
{
"epoch": 0.4737252677115098,
"grad_norm": 11.585809707641602,
"loss": 4.136,
"lr": 0.0008365034965034966,
"step": 1670,
"tokens_trained": 0.8207856
},
{
"epoch": 0.4742926033614637,
"grad_norm": 16.053237915039062,
"loss": 4.1827,
"lr": 0.0008362237762237762,
"step": 1672,
"tokens_trained": 0.821766576
},
{
"epoch": 0.47485993901141765,
"grad_norm": 9.23779582977295,
"loss": 4.1159,
"lr": 0.000835944055944056,
"step": 1674,
"tokens_trained": 0.822749696
},
{
"epoch": 0.4754272746613715,
"grad_norm": 11.395891189575195,
"loss": 4.17,
"lr": 0.0008356643356643356,
"step": 1676,
"tokens_trained": 0.82373032
},
{
"epoch": 0.47599461031132545,
"grad_norm": 17.745365142822266,
"loss": 4.1696,
"lr": 0.0008353846153846154,
"step": 1678,
"tokens_trained": 0.824712192
},
{
"epoch": 0.4765619459612793,
"grad_norm": 6.7816572189331055,
"loss": 4.1933,
"lr": 0.0008351048951048951,
"step": 1680,
"tokens_trained": 0.825691208
},
{
"epoch": 0.47712928161123325,
"grad_norm": 20.552772521972656,
"loss": 4.1625,
"lr": 0.0008348251748251748,
"step": 1682,
"tokens_trained": 0.826672584
},
{
"epoch": 0.4776966172611872,
"grad_norm": 21.632352828979492,
"loss": 4.2061,
"lr": 0.0008345454545454546,
"step": 1684,
"tokens_trained": 0.827654368
},
{
"epoch": 0.47826395291114104,
"grad_norm": 17.754596710205078,
"loss": 4.222,
"lr": 0.0008342657342657343,
"step": 1686,
"tokens_trained": 0.828639392
},
{
"epoch": 0.47883128856109497,
"grad_norm": 20.73906707763672,
"loss": 4.1679,
"lr": 0.0008339860139860141,
"step": 1688,
"tokens_trained": 0.829627232
},
{
"epoch": 0.47939862421104884,
"grad_norm": 28.157238006591797,
"loss": 4.1658,
"lr": 0.0008337062937062937,
"step": 1690,
"tokens_trained": 0.830610904
},
{
"epoch": 0.47996595986100277,
"grad_norm": 12.728020668029785,
"loss": 4.1892,
"lr": 0.0008334265734265734,
"step": 1692,
"tokens_trained": 0.831602544
},
{
"epoch": 0.4805332955109567,
"grad_norm": 20.21622657775879,
"loss": 4.1453,
"lr": 0.0008331468531468531,
"step": 1694,
"tokens_trained": 0.832584656
},
{
"epoch": 0.48110063116091056,
"grad_norm": 18.5329647064209,
"loss": 4.2145,
"lr": 0.0008328671328671329,
"step": 1696,
"tokens_trained": 0.833570472
},
{
"epoch": 0.4816679668108645,
"grad_norm": 12.47617244720459,
"loss": 4.1944,
"lr": 0.0008325874125874126,
"step": 1698,
"tokens_trained": 0.834556104
},
{
"epoch": 0.48223530246081836,
"grad_norm": 21.34851837158203,
"loss": 4.1754,
"lr": 0.0008323076923076923,
"step": 1700,
"tokens_trained": 0.835540592
},
{
"epoch": 0.4828026381107723,
"grad_norm": 13.20995807647705,
"loss": 4.1657,
"lr": 0.000832027972027972,
"step": 1702,
"tokens_trained": 0.836525136
},
{
"epoch": 0.4833699737607262,
"grad_norm": 16.77725601196289,
"loss": 4.1905,
"lr": 0.0008317482517482518,
"step": 1704,
"tokens_trained": 0.837509224
},
{
"epoch": 0.4839373094106801,
"grad_norm": 15.17611312866211,
"loss": 4.1823,
"lr": 0.0008314685314685315,
"step": 1706,
"tokens_trained": 0.838492472
},
{
"epoch": 0.484504645060634,
"grad_norm": 13.06942081451416,
"loss": 4.1732,
"lr": 0.0008311888111888112,
"step": 1708,
"tokens_trained": 0.839471696
},
{
"epoch": 0.4850719807105879,
"grad_norm": 10.456578254699707,
"loss": 4.1862,
"lr": 0.0008309090909090909,
"step": 1710,
"tokens_trained": 0.840452808
},
{
"epoch": 0.4856393163605418,
"grad_norm": 13.80197525024414,
"loss": 4.1663,
"lr": 0.0008306293706293706,
"step": 1712,
"tokens_trained": 0.841434224
},
{
"epoch": 0.48620665201049573,
"grad_norm": 20.076507568359375,
"loss": 4.1436,
"lr": 0.0008303496503496504,
"step": 1714,
"tokens_trained": 0.842415304
},
{
"epoch": 0.4867739876604496,
"grad_norm": 5.629086971282959,
"loss": 4.149,
"lr": 0.00083006993006993,
"step": 1716,
"tokens_trained": 0.84339416
},
{
"epoch": 0.48734132331040353,
"grad_norm": 13.932148933410645,
"loss": 4.1785,
"lr": 0.0008297902097902098,
"step": 1718,
"tokens_trained": 0.844380472
},
{
"epoch": 0.4879086589603574,
"grad_norm": 18.951047897338867,
"loss": 4.216,
"lr": 0.0008295104895104895,
"step": 1720,
"tokens_trained": 0.845366896
},
{
"epoch": 0.4884759946103113,
"grad_norm": 21.042476654052734,
"loss": 4.1634,
"lr": 0.0008292307692307693,
"step": 1722,
"tokens_trained": 0.846344792
},
{
"epoch": 0.48904333026026525,
"grad_norm": 23.94416618347168,
"loss": 4.1613,
"lr": 0.000828951048951049,
"step": 1724,
"tokens_trained": 0.847323608
},
{
"epoch": 0.4896106659102191,
"grad_norm": 5.057071208953857,
"loss": 4.1729,
"lr": 0.0008286713286713287,
"step": 1726,
"tokens_trained": 0.848304856
},
{
"epoch": 0.49017800156017305,
"grad_norm": 18.068674087524414,
"loss": 4.2194,
"lr": 0.0008283916083916084,
"step": 1728,
"tokens_trained": 0.849287712
},
{
"epoch": 0.4907453372101269,
"grad_norm": 11.621233940124512,
"loss": 4.2232,
"lr": 0.000828111888111888,
"step": 1730,
"tokens_trained": 0.850268968
},
{
"epoch": 0.49131267286008085,
"grad_norm": 12.939676284790039,
"loss": 4.2003,
"lr": 0.0008278321678321679,
"step": 1732,
"tokens_trained": 0.851256528
},
{
"epoch": 0.49188000851003477,
"grad_norm": 10.638157844543457,
"loss": 4.1975,
"lr": 0.0008275524475524475,
"step": 1734,
"tokens_trained": 0.852240824
},
{
"epoch": 0.49244734415998864,
"grad_norm": 6.2671003341674805,
"loss": 4.1617,
"lr": 0.0008272727272727273,
"step": 1736,
"tokens_trained": 0.853224768
},
{
"epoch": 0.49301467980994257,
"grad_norm": 12.318375587463379,
"loss": 4.1939,
"lr": 0.000826993006993007,
"step": 1738,
"tokens_trained": 0.8542062
},
{
"epoch": 0.49358201545989644,
"grad_norm": 17.275348663330078,
"loss": 4.1911,
"lr": 0.0008267132867132868,
"step": 1740,
"tokens_trained": 0.855192024
},
{
"epoch": 0.49414935110985037,
"grad_norm": 11.122747421264648,
"loss": 4.17,
"lr": 0.0008264335664335665,
"step": 1742,
"tokens_trained": 0.856172136
},
{
"epoch": 0.4947166867598043,
"grad_norm": 6.223485469818115,
"loss": 4.1774,
"lr": 0.0008261538461538461,
"step": 1744,
"tokens_trained": 0.857156312
},
{
"epoch": 0.49528402240975816,
"grad_norm": 14.62152099609375,
"loss": 4.1607,
"lr": 0.0008258741258741259,
"step": 1746,
"tokens_trained": 0.858140152
},
{
"epoch": 0.4958513580597121,
"grad_norm": 15.991989135742188,
"loss": 4.1825,
"lr": 0.0008255944055944055,
"step": 1748,
"tokens_trained": 0.85912524
},
{
"epoch": 0.49641869370966596,
"grad_norm": 28.88335418701172,
"loss": 4.2244,
"lr": 0.0008253146853146854,
"step": 1750,
"tokens_trained": 0.860105784
},
{
"epoch": 0.49641869370966596,
"eval_loss": 1.061833143234253,
"eval_runtime": 20.4841,
"step": 1750,
"tokens_trained": 0.860105784
},
{
"epoch": 0.4969860293596199,
"grad_norm": 14.708030700683594,
"loss": 4.2036,
"lr": 0.000825034965034965,
"step": 1752,
"tokens_trained": 0.861089272
},
{
"epoch": 0.4975533650095738,
"grad_norm": 24.67535400390625,
"loss": 4.2405,
"lr": 0.0008247552447552448,
"step": 1754,
"tokens_trained": 0.862066656
},
{
"epoch": 0.4981207006595277,
"grad_norm": 10.923722267150879,
"loss": 4.1713,
"lr": 0.0008244755244755245,
"step": 1756,
"tokens_trained": 0.863049256
},
{
"epoch": 0.4986880363094816,
"grad_norm": 8.88796615600586,
"loss": 4.1834,
"lr": 0.0008241958041958042,
"step": 1758,
"tokens_trained": 0.864029352
},
{
"epoch": 0.4992553719594355,
"grad_norm": 34.90485382080078,
"loss": 4.2338,
"lr": 0.000823916083916084,
"step": 1760,
"tokens_trained": 0.865013008
},
{
"epoch": 0.4998227076093894,
"grad_norm": 36.34440612792969,
"loss": 4.2012,
"lr": 0.0008236363636363636,
"step": 1762,
"tokens_trained": 0.86599204
},
{
"epoch": 0.5003900432593433,
"grad_norm": 27.913984298706055,
"loss": 4.269,
"lr": 0.0008233566433566434,
"step": 1764,
"tokens_trained": 0.866975456
},
{
"epoch": 0.5009573789092973,
"grad_norm": 28.236122131347656,
"loss": 4.2413,
"lr": 0.000823076923076923,
"step": 1766,
"tokens_trained": 0.867963912
},
{
"epoch": 0.5015247145592511,
"grad_norm": 18.181337356567383,
"loss": 4.2088,
"lr": 0.0008227972027972029,
"step": 1768,
"tokens_trained": 0.86894656
},
{
"epoch": 0.502092050209205,
"grad_norm": 17.403850555419922,
"loss": 4.1854,
"lr": 0.0008225174825174825,
"step": 1770,
"tokens_trained": 0.869932592
},
{
"epoch": 0.5026593858591589,
"grad_norm": 15.002805709838867,
"loss": 4.1897,
"lr": 0.0008222377622377622,
"step": 1772,
"tokens_trained": 0.87091592
},
{
"epoch": 0.5032267215091129,
"grad_norm": 6.787586688995361,
"loss": 4.1625,
"lr": 0.000821958041958042,
"step": 1774,
"tokens_trained": 0.871899144
},
{
"epoch": 0.5037940571590668,
"grad_norm": 6.255197525024414,
"loss": 4.1682,
"lr": 0.0008216783216783217,
"step": 1776,
"tokens_trained": 0.872874824
},
{
"epoch": 0.5043613928090206,
"grad_norm": 25.828433990478516,
"loss": 4.2354,
"lr": 0.0008213986013986015,
"step": 1778,
"tokens_trained": 0.873858424
},
{
"epoch": 0.5049287284589745,
"grad_norm": 20.261323928833008,
"loss": 4.2373,
"lr": 0.0008211188811188811,
"step": 1780,
"tokens_trained": 0.87483884
},
{
"epoch": 0.5054960641089284,
"grad_norm": 9.670608520507812,
"loss": 4.191,
"lr": 0.0008208391608391609,
"step": 1782,
"tokens_trained": 0.875820792
},
{
"epoch": 0.5060633997588824,
"grad_norm": 23.33945655822754,
"loss": 4.2319,
"lr": 0.0008205594405594405,
"step": 1784,
"tokens_trained": 0.876804368
},
{
"epoch": 0.5066307354088363,
"grad_norm": 32.22544479370117,
"loss": 4.1799,
"lr": 0.0008202797202797203,
"step": 1786,
"tokens_trained": 0.877784816
},
{
"epoch": 0.5071980710587901,
"grad_norm": 21.048891067504883,
"loss": 4.2635,
"lr": 0.00082,
"step": 1788,
"tokens_trained": 0.878768256
},
{
"epoch": 0.507765406708744,
"grad_norm": 28.73198699951172,
"loss": 4.2436,
"lr": 0.0008197202797202797,
"step": 1790,
"tokens_trained": 0.879751288
},
{
"epoch": 0.508332742358698,
"grad_norm": 27.627851486206055,
"loss": 4.2118,
"lr": 0.0008194405594405595,
"step": 1792,
"tokens_trained": 0.880732072
},
{
"epoch": 0.5089000780086519,
"grad_norm": 21.16539192199707,
"loss": 4.2123,
"lr": 0.0008191608391608392,
"step": 1794,
"tokens_trained": 0.88171332
},
{
"epoch": 0.5094674136586058,
"grad_norm": 11.402868270874023,
"loss": 4.1524,
"lr": 0.000818881118881119,
"step": 1796,
"tokens_trained": 0.882695464
},
{
"epoch": 0.5100347493085596,
"grad_norm": 11.958270072937012,
"loss": 4.2091,
"lr": 0.0008186013986013986,
"step": 1798,
"tokens_trained": 0.883678736
},
{
"epoch": 0.5106020849585136,
"grad_norm": 15.902670860290527,
"loss": 4.1687,
"lr": 0.0008183216783216783,
"step": 1800,
"tokens_trained": 0.8846604
},
{
"epoch": 0.5111694206084675,
"grad_norm": 19.732566833496094,
"loss": 4.1302,
"lr": 0.000818041958041958,
"step": 1802,
"tokens_trained": 0.885641384
},
{
"epoch": 0.5117367562584214,
"grad_norm": 15.119332313537598,
"loss": 4.1546,
"lr": 0.0008177622377622378,
"step": 1804,
"tokens_trained": 0.8866262
},
{
"epoch": 0.5123040919083753,
"grad_norm": 9.641027450561523,
"loss": 4.1748,
"lr": 0.0008174825174825175,
"step": 1806,
"tokens_trained": 0.887604504
},
{
"epoch": 0.5128714275583292,
"grad_norm": 11.642073631286621,
"loss": 4.1879,
"lr": 0.0008172027972027972,
"step": 1808,
"tokens_trained": 0.888584152
},
{
"epoch": 0.5134387632082831,
"grad_norm": 12.05164909362793,
"loss": 4.1332,
"lr": 0.000816923076923077,
"step": 1810,
"tokens_trained": 0.889568448
},
{
"epoch": 0.514006098858237,
"grad_norm": 13.54423999786377,
"loss": 4.1398,
"lr": 0.0008166433566433567,
"step": 1812,
"tokens_trained": 0.890550896
},
{
"epoch": 0.5145734345081909,
"grad_norm": 21.94988441467285,
"loss": 4.1523,
"lr": 0.0008163636363636364,
"step": 1814,
"tokens_trained": 0.89153436
},
{
"epoch": 0.5151407701581449,
"grad_norm": 8.613338470458984,
"loss": 4.1428,
"lr": 0.0008160839160839161,
"step": 1816,
"tokens_trained": 0.89251064
},
{
"epoch": 0.5157081058080987,
"grad_norm": 27.448917388916016,
"loss": 4.2014,
"lr": 0.0008158041958041958,
"step": 1818,
"tokens_trained": 0.893493904
},
{
"epoch": 0.5162754414580526,
"grad_norm": 16.226577758789062,
"loss": 4.1787,
"lr": 0.0008155244755244755,
"step": 1820,
"tokens_trained": 0.894476344
},
{
"epoch": 0.5168427771080065,
"grad_norm": 16.967891693115234,
"loss": 4.1898,
"lr": 0.0008152447552447553,
"step": 1822,
"tokens_trained": 0.895460064
},
{
"epoch": 0.5174101127579604,
"grad_norm": 13.723483085632324,
"loss": 4.2058,
"lr": 0.000814965034965035,
"step": 1824,
"tokens_trained": 0.896443272
},
{
"epoch": 0.5179774484079144,
"grad_norm": 16.789636611938477,
"loss": 4.1669,
"lr": 0.0008146853146853147,
"step": 1826,
"tokens_trained": 0.897426712
},
{
"epoch": 0.5185447840578682,
"grad_norm": 11.26768684387207,
"loss": 4.1401,
"lr": 0.0008144055944055944,
"step": 1828,
"tokens_trained": 0.89840672
},
{
"epoch": 0.5191121197078221,
"grad_norm": 9.25829029083252,
"loss": 4.1581,
"lr": 0.0008141258741258742,
"step": 1830,
"tokens_trained": 0.89939132
},
{
"epoch": 0.519679455357776,
"grad_norm": 12.006930351257324,
"loss": 4.1768,
"lr": 0.0008138461538461539,
"step": 1832,
"tokens_trained": 0.900373704
},
{
"epoch": 0.52024679100773,
"grad_norm": 18.766008377075195,
"loss": 4.1419,
"lr": 0.0008135664335664336,
"step": 1834,
"tokens_trained": 0.901356176
},
{
"epoch": 0.5208141266576839,
"grad_norm": 17.483421325683594,
"loss": 4.1382,
"lr": 0.0008132867132867133,
"step": 1836,
"tokens_trained": 0.902344088
},
{
"epoch": 0.5213814623076377,
"grad_norm": 10.484652519226074,
"loss": 4.1571,
"lr": 0.000813006993006993,
"step": 1838,
"tokens_trained": 0.903328896
},
{
"epoch": 0.5219487979575916,
"grad_norm": 13.653974533081055,
"loss": 4.1638,
"lr": 0.0008127272727272728,
"step": 1840,
"tokens_trained": 0.904309368
},
{
"epoch": 0.5225161336075456,
"grad_norm": 12.48718547821045,
"loss": 4.1226,
"lr": 0.0008124475524475524,
"step": 1842,
"tokens_trained": 0.905293112
},
{
"epoch": 0.5230834692574995,
"grad_norm": 8.086355209350586,
"loss": 4.1303,
"lr": 0.0008121678321678322,
"step": 1844,
"tokens_trained": 0.906275632
},
{
"epoch": 0.5236508049074534,
"grad_norm": 10.940073013305664,
"loss": 4.1634,
"lr": 0.0008118881118881119,
"step": 1846,
"tokens_trained": 0.907255808
},
{
"epoch": 0.5242181405574072,
"grad_norm": 13.844099044799805,
"loss": 4.1505,
"lr": 0.0008116083916083917,
"step": 1848,
"tokens_trained": 0.908238664
},
{
"epoch": 0.5247854762073612,
"grad_norm": 6.305738925933838,
"loss": 4.1463,
"lr": 0.0008113286713286714,
"step": 1850,
"tokens_trained": 0.909221424
},
{
"epoch": 0.5253528118573151,
"grad_norm": 8.957951545715332,
"loss": 4.1785,
"lr": 0.000811048951048951,
"step": 1852,
"tokens_trained": 0.910204472
},
{
"epoch": 0.525920147507269,
"grad_norm": 12.665373802185059,
"loss": 4.1776,
"lr": 0.0008107692307692308,
"step": 1854,
"tokens_trained": 0.911186456
},
{
"epoch": 0.5264874831572229,
"grad_norm": 13.7921781539917,
"loss": 4.2058,
"lr": 0.0008104895104895104,
"step": 1856,
"tokens_trained": 0.912163912
},
{
"epoch": 0.5270548188071768,
"grad_norm": 18.400495529174805,
"loss": 4.1378,
"lr": 0.0008102097902097903,
"step": 1858,
"tokens_trained": 0.913143416
},
{
"epoch": 0.5276221544571307,
"grad_norm": 10.095234870910645,
"loss": 4.1673,
"lr": 0.0008099300699300699,
"step": 1860,
"tokens_trained": 0.914125056
},
{
"epoch": 0.5281894901070846,
"grad_norm": 9.396644592285156,
"loss": 4.1226,
"lr": 0.0008096503496503497,
"step": 1862,
"tokens_trained": 0.915109128
},
{
"epoch": 0.5287568257570385,
"grad_norm": 12.686080932617188,
"loss": 4.1356,
"lr": 0.0008093706293706294,
"step": 1864,
"tokens_trained": 0.916092096
},
{
"epoch": 0.5293241614069925,
"grad_norm": 15.91020679473877,
"loss": 4.1276,
"lr": 0.0008090909090909092,
"step": 1866,
"tokens_trained": 0.917077264
},
{
"epoch": 0.5298914970569463,
"grad_norm": 21.305110931396484,
"loss": 4.1492,
"lr": 0.0008088111888111889,
"step": 1868,
"tokens_trained": 0.918060288
},
{
"epoch": 0.5304588327069002,
"grad_norm": 9.242319107055664,
"loss": 4.1457,
"lr": 0.0008085314685314685,
"step": 1870,
"tokens_trained": 0.91904616
},
{
"epoch": 0.5310261683568541,
"grad_norm": 17.556922912597656,
"loss": 4.1698,
"lr": 0.0008082517482517483,
"step": 1872,
"tokens_trained": 0.920028192
},
{
"epoch": 0.531593504006808,
"grad_norm": 24.155885696411133,
"loss": 4.193,
"lr": 0.0008079720279720279,
"step": 1874,
"tokens_trained": 0.921010456
},
{
"epoch": 0.531877171831785,
"eval_loss": 1.0404243469238281,
"eval_runtime": 21.451,
"step": 1875,
"tokens_trained": 0.921502192
},
{
"epoch": 0.532160839656762,
"grad_norm": 4.985994338989258,
"loss": 4.1649,
"lr": 0.0008076923076923078,
"step": 1876,
"tokens_trained": 0.921994216
},
{
"epoch": 0.5327281753067158,
"grad_norm": 19.2642765045166,
"loss": 4.1883,
"lr": 0.0008074125874125874,
"step": 1878,
"tokens_trained": 0.922978112
},
{
"epoch": 0.5332955109566697,
"grad_norm": 15.012572288513184,
"loss": 4.1944,
"lr": 0.0008071328671328671,
"step": 1880,
"tokens_trained": 0.923962952
},
{
"epoch": 0.5338628466066236,
"grad_norm": 21.37204360961914,
"loss": 4.1708,
"lr": 0.0008068531468531469,
"step": 1882,
"tokens_trained": 0.92494744
},
{
"epoch": 0.5344301822565776,
"grad_norm": 6.402398586273193,
"loss": 4.1921,
"lr": 0.0008065734265734265,
"step": 1884,
"tokens_trained": 0.925927984
},
{
"epoch": 0.5349975179065315,
"grad_norm": 27.606822967529297,
"loss": 4.2033,
"lr": 0.0008062937062937064,
"step": 1886,
"tokens_trained": 0.926911352
},
{
"epoch": 0.5355648535564853,
"grad_norm": 16.434572219848633,
"loss": 4.1504,
"lr": 0.000806013986013986,
"step": 1888,
"tokens_trained": 0.927894056
},
{
"epoch": 0.5361321892064392,
"grad_norm": 8.066178321838379,
"loss": 4.1674,
"lr": 0.0008057342657342658,
"step": 1890,
"tokens_trained": 0.928879504
},
{
"epoch": 0.5366995248563932,
"grad_norm": 6.167456150054932,
"loss": 4.1207,
"lr": 0.0008054545454545454,
"step": 1892,
"tokens_trained": 0.92986424
},
{
"epoch": 0.5372668605063471,
"grad_norm": 3.584982395172119,
"loss": 4.1051,
"lr": 0.0008051748251748253,
"step": 1894,
"tokens_trained": 0.930846696
},
{
"epoch": 0.537834196156301,
"grad_norm": 14.988295555114746,
"loss": 4.1199,
"lr": 0.0008048951048951049,
"step": 1896,
"tokens_trained": 0.931831112
},
{
"epoch": 0.5384015318062548,
"grad_norm": 12.735363960266113,
"loss": 4.1368,
"lr": 0.0008046153846153846,
"step": 1898,
"tokens_trained": 0.932816952
},
{
"epoch": 0.5389688674562088,
"grad_norm": 7.701294422149658,
"loss": 4.1205,
"lr": 0.0008043356643356644,
"step": 1900,
"tokens_trained": 0.93380264
},
{
"epoch": 0.5395362031061627,
"grad_norm": 9.15809440612793,
"loss": 4.1567,
"lr": 0.000804055944055944,
"step": 1902,
"tokens_trained": 0.934785848
},
{
"epoch": 0.5401035387561166,
"grad_norm": 10.8292875289917,
"loss": 4.1645,
"lr": 0.0008037762237762239,
"step": 1904,
"tokens_trained": 0.935766912
},
{
"epoch": 0.5406708744060705,
"grad_norm": 10.906803131103516,
"loss": 4.1398,
"lr": 0.0008034965034965035,
"step": 1906,
"tokens_trained": 0.936749352
},
{
"epoch": 0.5412382100560243,
"grad_norm": 10.140864372253418,
"loss": 4.1754,
"lr": 0.0008032167832167832,
"step": 1908,
"tokens_trained": 0.9377304
},
{
"epoch": 0.5418055457059783,
"grad_norm": 10.061383247375488,
"loss": 4.1485,
"lr": 0.0008029370629370629,
"step": 1910,
"tokens_trained": 0.938712336
},
{
"epoch": 0.5423728813559322,
"grad_norm": 8.252259254455566,
"loss": 4.1502,
"lr": 0.0008026573426573427,
"step": 1912,
"tokens_trained": 0.939693304
},
{
"epoch": 0.5429402170058861,
"grad_norm": 15.104400634765625,
"loss": 4.182,
"lr": 0.0008023776223776224,
"step": 1914,
"tokens_trained": 0.940679832
},
{
"epoch": 0.54350755265584,
"grad_norm": 21.167285919189453,
"loss": 4.1241,
"lr": 0.0008020979020979021,
"step": 1916,
"tokens_trained": 0.941665088
},
{
"epoch": 0.5440748883057939,
"grad_norm": 17.936481475830078,
"loss": 4.1846,
"lr": 0.0008018181818181818,
"step": 1918,
"tokens_trained": 0.942651632
},
{
"epoch": 0.5446422239557478,
"grad_norm": 9.773019790649414,
"loss": 4.1164,
"lr": 0.0008015384615384615,
"step": 1920,
"tokens_trained": 0.943635928
},
{
"epoch": 0.5452095596057017,
"grad_norm": 14.120475769042969,
"loss": 4.1556,
"lr": 0.0008012587412587414,
"step": 1922,
"tokens_trained": 0.944618336
},
{
"epoch": 0.5457768952556556,
"grad_norm": 10.898097038269043,
"loss": 4.1521,
"lr": 0.000800979020979021,
"step": 1924,
"tokens_trained": 0.945608216
},
{
"epoch": 0.5463442309056096,
"grad_norm": 8.271462440490723,
"loss": 4.0785,
"lr": 0.0008006993006993007,
"step": 1926,
"tokens_trained": 0.946593504
},
{
"epoch": 0.5469115665555634,
"grad_norm": 17.28820037841797,
"loss": 4.0998,
"lr": 0.0008004195804195804,
"step": 1928,
"tokens_trained": 0.947575288
},
{
"epoch": 0.5474789022055173,
"grad_norm": 17.754959106445312,
"loss": 4.1652,
"lr": 0.0008001398601398602,
"step": 1930,
"tokens_trained": 0.948562968
},
{
"epoch": 0.5480462378554712,
"grad_norm": 10.576292037963867,
"loss": 4.1754,
"lr": 0.0007998601398601399,
"step": 1932,
"tokens_trained": 0.949545728
},
{
"epoch": 0.5486135735054252,
"grad_norm": 14.297791481018066,
"loss": 4.1597,
"lr": 0.0007995804195804196,
"step": 1934,
"tokens_trained": 0.950528952
},
{
"epoch": 0.5491809091553791,
"grad_norm": 23.882539749145508,
"loss": 4.1366,
"lr": 0.0007993006993006992,
"step": 1936,
"tokens_trained": 0.951513448
},
{
"epoch": 0.5497482448053329,
"grad_norm": 5.12502908706665,
"loss": 4.1441,
"lr": 0.000799020979020979,
"step": 1938,
"tokens_trained": 0.952497048
},
{
"epoch": 0.5503155804552868,
"grad_norm": 26.879070281982422,
"loss": 4.2595,
"lr": 0.0007987412587412588,
"step": 1940,
"tokens_trained": 0.953475816
},
{
"epoch": 0.5508829161052408,
"grad_norm": 23.032690048217773,
"loss": 4.1841,
"lr": 0.0007984615384615385,
"step": 1942,
"tokens_trained": 0.954459984
},
{
"epoch": 0.5514502517551947,
"grad_norm": 8.810720443725586,
"loss": 4.1329,
"lr": 0.0007981818181818182,
"step": 1944,
"tokens_trained": 0.95544252
},
{
"epoch": 0.5520175874051486,
"grad_norm": 31.051185607910156,
"loss": 4.2278,
"lr": 0.0007979020979020979,
"step": 1946,
"tokens_trained": 0.956428016
},
{
"epoch": 0.5525849230551024,
"grad_norm": 22.537412643432617,
"loss": 4.1729,
"lr": 0.0007976223776223777,
"step": 1948,
"tokens_trained": 0.957406024
},
{
"epoch": 0.5531522587050564,
"grad_norm": 10.596793174743652,
"loss": 4.1636,
"lr": 0.0007973426573426573,
"step": 1950,
"tokens_trained": 0.958391232
},
{
"epoch": 0.5537195943550103,
"grad_norm": 16.45500373840332,
"loss": 4.1591,
"lr": 0.0007970629370629371,
"step": 1952,
"tokens_trained": 0.959378448
},
{
"epoch": 0.5542869300049642,
"grad_norm": 15.090359687805176,
"loss": 4.1516,
"lr": 0.0007967832167832167,
"step": 1954,
"tokens_trained": 0.960363384
},
{
"epoch": 0.5548542656549181,
"grad_norm": 28.482192993164062,
"loss": 4.1211,
"lr": 0.0007965034965034965,
"step": 1956,
"tokens_trained": 0.961348752
},
{
"epoch": 0.555421601304872,
"grad_norm": 9.402368545532227,
"loss": 4.178,
"lr": 0.0007962237762237763,
"step": 1958,
"tokens_trained": 0.962332976
},
{
"epoch": 0.5559889369548259,
"grad_norm": 33.001346588134766,
"loss": 4.218,
"lr": 0.000795944055944056,
"step": 1960,
"tokens_trained": 0.963316928
},
{
"epoch": 0.5565562726047798,
"grad_norm": 29.695520401000977,
"loss": 4.2071,
"lr": 0.0007956643356643357,
"step": 1962,
"tokens_trained": 0.964301728
},
{
"epoch": 0.5571236082547337,
"grad_norm": 22.22412109375,
"loss": 4.2158,
"lr": 0.0007953846153846153,
"step": 1964,
"tokens_trained": 0.96528524
},
{
"epoch": 0.5576909439046877,
"grad_norm": 15.590829849243164,
"loss": 4.1681,
"lr": 0.0007951048951048952,
"step": 1966,
"tokens_trained": 0.966268264
},
{
"epoch": 0.5582582795546415,
"grad_norm": 16.011110305786133,
"loss": 4.1591,
"lr": 0.0007948251748251748,
"step": 1968,
"tokens_trained": 0.967252016
},
{
"epoch": 0.5588256152045954,
"grad_norm": 15.24573040008545,
"loss": 4.1446,
"lr": 0.0007945454545454546,
"step": 1970,
"tokens_trained": 0.96823396
},
{
"epoch": 0.5593929508545493,
"grad_norm": 15.718021392822266,
"loss": 4.1846,
"lr": 0.0007942657342657342,
"step": 1972,
"tokens_trained": 0.969217792
},
{
"epoch": 0.5599602865045032,
"grad_norm": 8.648459434509277,
"loss": 4.1655,
"lr": 0.000793986013986014,
"step": 1974,
"tokens_trained": 0.970200776
},
{
"epoch": 0.5605276221544572,
"grad_norm": 7.273077487945557,
"loss": 4.1397,
"lr": 0.0007937062937062938,
"step": 1976,
"tokens_trained": 0.971181376
},
{
"epoch": 0.561094957804411,
"grad_norm": 25.027616500854492,
"loss": 4.1918,
"lr": 0.0007934265734265734,
"step": 1978,
"tokens_trained": 0.972165496
},
{
"epoch": 0.5616622934543649,
"grad_norm": 25.485851287841797,
"loss": 4.1896,
"lr": 0.0007931468531468532,
"step": 1980,
"tokens_trained": 0.973145616
},
{
"epoch": 0.5622296291043188,
"grad_norm": 18.065462112426758,
"loss": 4.1876,
"lr": 0.0007928671328671328,
"step": 1982,
"tokens_trained": 0.974131104
},
{
"epoch": 0.5627969647542728,
"grad_norm": 20.412248611450195,
"loss": 4.1556,
"lr": 0.0007925874125874127,
"step": 1984,
"tokens_trained": 0.975111232
},
{
"epoch": 0.5633643004042267,
"grad_norm": 15.51710319519043,
"loss": 4.1391,
"lr": 0.0007923076923076923,
"step": 1986,
"tokens_trained": 0.976098968
},
{
"epoch": 0.5639316360541805,
"grad_norm": 8.650726318359375,
"loss": 4.1421,
"lr": 0.000792027972027972,
"step": 1988,
"tokens_trained": 0.977082992
},
{
"epoch": 0.5644989717041344,
"grad_norm": 19.833505630493164,
"loss": 4.1505,
"lr": 0.0007917482517482517,
"step": 1990,
"tokens_trained": 0.978068896
},
{
"epoch": 0.5650663073540884,
"grad_norm": 26.585390090942383,
"loss": 4.1661,
"lr": 0.0007914685314685314,
"step": 1992,
"tokens_trained": 0.979048504
},
{
"epoch": 0.5656336430040423,
"grad_norm": 20.827394485473633,
"loss": 4.1987,
"lr": 0.0007911888111888113,
"step": 1994,
"tokens_trained": 0.98003104
},
{
"epoch": 0.5662009786539962,
"grad_norm": 23.700273513793945,
"loss": 4.1773,
"lr": 0.0007909090909090909,
"step": 1996,
"tokens_trained": 0.981013384
},
{
"epoch": 0.56676831430395,
"grad_norm": 15.673397064208984,
"loss": 4.12,
"lr": 0.0007906293706293707,
"step": 1998,
"tokens_trained": 0.981999776
},
{
"epoch": 0.567335649953904,
"grad_norm": 11.268630981445312,
"loss": 4.1373,
"lr": 0.0007903496503496503,
"step": 2000,
"tokens_trained": 0.982980936
},
{
"epoch": 0.567335649953904,
"eval_loss": 1.0422048568725586,
"eval_runtime": 20.3928,
"step": 2000,
"tokens_trained": 0.982980936
},
{
"epoch": 0.5679029856038579,
"grad_norm": 18.37994384765625,
"loss": 4.1536,
"lr": 0.0007900699300699302,
"step": 2002,
"tokens_trained": 0.983969536
},
{
"epoch": 0.5684703212538118,
"grad_norm": 23.911537170410156,
"loss": 4.1652,
"lr": 0.0007897902097902098,
"step": 2004,
"tokens_trained": 0.98495052
},
{
"epoch": 0.5690376569037657,
"grad_norm": 7.355772018432617,
"loss": 4.1846,
"lr": 0.0007895104895104895,
"step": 2006,
"tokens_trained": 0.98593252
},
{
"epoch": 0.5696049925537195,
"grad_norm": 35.29991149902344,
"loss": 4.2145,
"lr": 0.0007892307692307692,
"step": 2008,
"tokens_trained": 0.986922392
},
{
"epoch": 0.5701723282036735,
"grad_norm": 14.28709602355957,
"loss": 4.1629,
"lr": 0.0007889510489510489,
"step": 2010,
"tokens_trained": 0.987905712
},
{
"epoch": 0.5707396638536274,
"grad_norm": 22.50174331665039,
"loss": 4.1907,
"lr": 0.0007886713286713288,
"step": 2012,
"tokens_trained": 0.988887536
},
{
"epoch": 0.5713069995035813,
"grad_norm": 14.588640213012695,
"loss": 4.1523,
"lr": 0.0007883916083916084,
"step": 2014,
"tokens_trained": 0.989872712
},
{
"epoch": 0.5718743351535353,
"grad_norm": 2.776369094848633,
"loss": 4.1548,
"lr": 0.0007881118881118882,
"step": 2016,
"tokens_trained": 0.990854072
},
{
"epoch": 0.5724416708034891,
"grad_norm": 16.00047492980957,
"loss": 4.1319,
"lr": 0.0007878321678321678,
"step": 2018,
"tokens_trained": 0.991834552
},
{
"epoch": 0.573009006453443,
"grad_norm": 21.678735733032227,
"loss": 4.1986,
"lr": 0.0007875524475524476,
"step": 2020,
"tokens_trained": 0.992818256
},
{
"epoch": 0.5735763421033969,
"grad_norm": 4.835119724273682,
"loss": 4.1625,
"lr": 0.0007872727272727273,
"step": 2022,
"tokens_trained": 0.993801376
},
{
"epoch": 0.5741436777533508,
"grad_norm": 19.427467346191406,
"loss": 4.1594,
"lr": 0.000786993006993007,
"step": 2024,
"tokens_trained": 0.994788568
},
{
"epoch": 0.5747110134033048,
"grad_norm": 15.458346366882324,
"loss": 4.1829,
"lr": 0.0007867132867132867,
"step": 2026,
"tokens_trained": 0.995769976
},
{
"epoch": 0.5752783490532586,
"grad_norm": 11.073614120483398,
"loss": 4.1303,
"lr": 0.0007864335664335664,
"step": 2028,
"tokens_trained": 0.996751464
},
{
"epoch": 0.5758456847032125,
"grad_norm": 4.685436248779297,
"loss": 4.1368,
"lr": 0.0007861538461538463,
"step": 2030,
"tokens_trained": 0.997733952
},
{
"epoch": 0.5764130203531664,
"grad_norm": 15.977241516113281,
"loss": 4.1584,
"lr": 0.0007858741258741259,
"step": 2032,
"tokens_trained": 0.998716976
},
{
"epoch": 0.5769803560031204,
"grad_norm": 11.305732727050781,
"loss": 4.102,
"lr": 0.0007855944055944056,
"step": 2034,
"tokens_trained": 0.999703632
},
{
"epoch": 0.5775476916530743,
"grad_norm": 7.794003963470459,
"loss": 4.161,
"lr": 0.0007853146853146853,
"step": 2036,
"tokens_trained": 1.000687488
},
{
"epoch": 0.5781150273030281,
"grad_norm": 7.609982013702393,
"loss": 4.1546,
"lr": 0.0007850349650349651,
"step": 2038,
"tokens_trained": 1.0016692
},
{
"epoch": 0.578682362952982,
"grad_norm": 7.622653961181641,
"loss": 4.1246,
"lr": 0.0007847552447552448,
"step": 2040,
"tokens_trained": 1.002653352
},
{
"epoch": 0.579249698602936,
"grad_norm": 9.98919677734375,
"loss": 4.1319,
"lr": 0.0007844755244755245,
"step": 2042,
"tokens_trained": 1.003639528
},
{
"epoch": 0.5798170342528899,
"grad_norm": 9.557628631591797,
"loss": 4.1105,
"lr": 0.0007841958041958041,
"step": 2044,
"tokens_trained": 1.004623776
},
{
"epoch": 0.5803843699028438,
"grad_norm": 14.172621726989746,
"loss": 4.1339,
"lr": 0.0007839160839160839,
"step": 2046,
"tokens_trained": 1.005604008
},
{
"epoch": 0.5809517055527976,
"grad_norm": 8.185248374938965,
"loss": 4.1142,
"lr": 0.0007836363636363637,
"step": 2048,
"tokens_trained": 1.006585704
},
{
"epoch": 0.5815190412027516,
"grad_norm": 10.642661094665527,
"loss": 4.131,
"lr": 0.0007833566433566434,
"step": 2050,
"tokens_trained": 1.00757132
},
{
"epoch": 0.5820863768527055,
"grad_norm": 7.868969917297363,
"loss": 4.1477,
"lr": 0.0007830769230769231,
"step": 2052,
"tokens_trained": 1.008556824
},
{
"epoch": 0.5826537125026594,
"grad_norm": 2.8441150188446045,
"loss": 4.1156,
"lr": 0.0007827972027972028,
"step": 2054,
"tokens_trained": 1.00954056
},
{
"epoch": 0.5832210481526133,
"grad_norm": 5.2797932624816895,
"loss": 4.1058,
"lr": 0.0007825174825174826,
"step": 2056,
"tokens_trained": 1.010526488
},
{
"epoch": 0.5837883838025671,
"grad_norm": 11.850811004638672,
"loss": 4.165,
"lr": 0.0007822377622377622,
"step": 2058,
"tokens_trained": 1.011507584
},
{
"epoch": 0.5843557194525211,
"grad_norm": 11.073920249938965,
"loss": 4.1509,
"lr": 0.000781958041958042,
"step": 2060,
"tokens_trained": 1.012491648
},
{
"epoch": 0.584923055102475,
"grad_norm": 8.282343864440918,
"loss": 4.0656,
"lr": 0.0007816783216783216,
"step": 2062,
"tokens_trained": 1.013475224
},
{
"epoch": 0.5854903907524289,
"grad_norm": 10.414461135864258,
"loss": 4.1285,
"lr": 0.0007813986013986014,
"step": 2064,
"tokens_trained": 1.014458144
},
{
"epoch": 0.5860577264023829,
"grad_norm": 9.988463401794434,
"loss": 4.1234,
"lr": 0.0007811188811188812,
"step": 2066,
"tokens_trained": 1.015444112
},
{
"epoch": 0.5866250620523367,
"grad_norm": 8.713189125061035,
"loss": 4.129,
"lr": 0.0007808391608391609,
"step": 2068,
"tokens_trained": 1.016427568
},
{
"epoch": 0.5871923977022906,
"grad_norm": 3.4149773120880127,
"loss": 4.155,
"lr": 0.0007805594405594406,
"step": 2070,
"tokens_trained": 1.017412264
},
{
"epoch": 0.5877597333522445,
"grad_norm": 12.33522891998291,
"loss": 4.1856,
"lr": 0.0007802797202797202,
"step": 2072,
"tokens_trained": 1.018402216
},
{
"epoch": 0.5883270690021984,
"grad_norm": 12.155695915222168,
"loss": 4.1468,
"lr": 0.0007800000000000001,
"step": 2074,
"tokens_trained": 1.019387096
},
{
"epoch": 0.5888944046521524,
"grad_norm": 7.73326301574707,
"loss": 4.1239,
"lr": 0.0007797202797202797,
"step": 2076,
"tokens_trained": 1.020370008
},
{
"epoch": 0.5894617403021062,
"grad_norm": 6.425852298736572,
"loss": 4.1101,
"lr": 0.0007794405594405595,
"step": 2078,
"tokens_trained": 1.02135716
},
{
"epoch": 0.5900290759520601,
"grad_norm": 18.360816955566406,
"loss": 4.1726,
"lr": 0.0007791608391608391,
"step": 2080,
"tokens_trained": 1.022338024
},
{
"epoch": 0.590596411602014,
"grad_norm": 28.31681251525879,
"loss": 4.1341,
"lr": 0.0007788811188811189,
"step": 2082,
"tokens_trained": 1.023318008
},
{
"epoch": 0.591163747251968,
"grad_norm": 10.673089027404785,
"loss": 4.1268,
"lr": 0.0007786013986013987,
"step": 2084,
"tokens_trained": 1.02430432
},
{
"epoch": 0.5917310829019219,
"grad_norm": 26.656522750854492,
"loss": 4.1703,
"lr": 0.0007783216783216783,
"step": 2086,
"tokens_trained": 1.025288272
},
{
"epoch": 0.5922984185518757,
"grad_norm": 20.022029876708984,
"loss": 4.1532,
"lr": 0.0007780419580419581,
"step": 2088,
"tokens_trained": 1.026272984
},
{
"epoch": 0.5928657542018296,
"grad_norm": 7.2955121994018555,
"loss": 4.1992,
"lr": 0.0007777622377622377,
"step": 2090,
"tokens_trained": 1.02725572
},
{
"epoch": 0.5934330898517836,
"grad_norm": 28.561243057250977,
"loss": 4.2098,
"lr": 0.0007774825174825176,
"step": 2092,
"tokens_trained": 1.028238456
},
{
"epoch": 0.5940004255017375,
"grad_norm": 16.715425491333008,
"loss": 4.1509,
"lr": 0.0007772027972027972,
"step": 2094,
"tokens_trained": 1.029226048
},
{
"epoch": 0.5945677611516914,
"grad_norm": 6.325936317443848,
"loss": 4.1221,
"lr": 0.000776923076923077,
"step": 2096,
"tokens_trained": 1.030210528
},
{
"epoch": 0.5951350968016452,
"grad_norm": 12.83181381225586,
"loss": 4.1808,
"lr": 0.0007766433566433566,
"step": 2098,
"tokens_trained": 1.031193456
},
{
"epoch": 0.5957024324515992,
"grad_norm": 12.183184623718262,
"loss": 4.1292,
"lr": 0.0007763636363636363,
"step": 2100,
"tokens_trained": 1.032173528
},
{
"epoch": 0.5962697681015531,
"grad_norm": 8.247485160827637,
"loss": 4.1425,
"lr": 0.0007760839160839162,
"step": 2102,
"tokens_trained": 1.033158144
},
{
"epoch": 0.596837103751507,
"grad_norm": 10.814559936523438,
"loss": 4.1167,
"lr": 0.0007758041958041958,
"step": 2104,
"tokens_trained": 1.034141216
},
{
"epoch": 0.5974044394014609,
"grad_norm": 12.589309692382812,
"loss": 4.0916,
"lr": 0.0007755244755244756,
"step": 2106,
"tokens_trained": 1.035121888
},
{
"epoch": 0.5979717750514147,
"grad_norm": 11.65658187866211,
"loss": 4.0776,
"lr": 0.0007752447552447552,
"step": 2108,
"tokens_trained": 1.036103688
},
{
"epoch": 0.5985391107013687,
"grad_norm": 18.0120792388916,
"loss": 4.1588,
"lr": 0.0007749650349650351,
"step": 2110,
"tokens_trained": 1.03708248
},
{
"epoch": 0.5991064463513226,
"grad_norm": 5.742938995361328,
"loss": 4.151,
"lr": 0.0007746853146853147,
"step": 2112,
"tokens_trained": 1.038068792
},
{
"epoch": 0.5996737820012765,
"grad_norm": 36.54581832885742,
"loss": 4.2239,
"lr": 0.0007744055944055944,
"step": 2114,
"tokens_trained": 1.03904728
},
{
"epoch": 0.6002411176512304,
"grad_norm": 13.304069519042969,
"loss": 4.152,
"lr": 0.0007741258741258741,
"step": 2116,
"tokens_trained": 1.040031312
},
{
"epoch": 0.6008084533011843,
"grad_norm": 18.68927001953125,
"loss": 4.1413,
"lr": 0.0007738461538461538,
"step": 2118,
"tokens_trained": 1.041018376
},
{
"epoch": 0.6013757889511382,
"grad_norm": 16.946630477905273,
"loss": 4.1122,
"lr": 0.0007735664335664337,
"step": 2120,
"tokens_trained": 1.0420056
},
{
"epoch": 0.6019431246010921,
"grad_norm": 4.236926078796387,
"loss": 4.1146,
"lr": 0.0007732867132867133,
"step": 2122,
"tokens_trained": 1.042990376
},
{
"epoch": 0.602510460251046,
"grad_norm": 12.148641586303711,
"loss": 4.1472,
"lr": 0.0007730069930069931,
"step": 2124,
"tokens_trained": 1.0439754
},
{
"epoch": 0.602794128076023,
"eval_loss": 1.039306640625,
"eval_runtime": 20.6138,
"step": 2125,
"tokens_trained": 1.044467008
},
{
"epoch": 0.603077795901,
"grad_norm": 17.051687240600586,
"loss": 4.1572,
"lr": 0.0007727272727272727,
"step": 2126,
"tokens_trained": 1.044957456
},
{
"epoch": 0.6036451315509538,
"grad_norm": 14.019828796386719,
"loss": 4.1464,
"lr": 0.0007724475524475525,
"step": 2128,
"tokens_trained": 1.04593944
},
{
"epoch": 0.6042124672009077,
"grad_norm": 11.22962760925293,
"loss": 4.1345,
"lr": 0.0007721678321678322,
"step": 2130,
"tokens_trained": 1.046919592
},
{
"epoch": 0.6047798028508616,
"grad_norm": 11.524348258972168,
"loss": 4.1233,
"lr": 0.0007718881118881119,
"step": 2132,
"tokens_trained": 1.047904744
},
{
"epoch": 0.6053471385008156,
"grad_norm": 7.174457550048828,
"loss": 4.1201,
"lr": 0.0007716083916083916,
"step": 2134,
"tokens_trained": 1.048885328
},
{
"epoch": 0.6059144741507695,
"grad_norm": 6.847499847412109,
"loss": 4.1313,
"lr": 0.0007713286713286713,
"step": 2136,
"tokens_trained": 1.049868776
},
{
"epoch": 0.6064818098007233,
"grad_norm": 8.44458293914795,
"loss": 4.1236,
"lr": 0.0007710489510489512,
"step": 2138,
"tokens_trained": 1.050852704
},
{
"epoch": 0.6070491454506772,
"grad_norm": 15.415260314941406,
"loss": 4.1424,
"lr": 0.0007707692307692308,
"step": 2140,
"tokens_trained": 1.051837736
},
{
"epoch": 0.6076164811006312,
"grad_norm": 16.845874786376953,
"loss": 4.1037,
"lr": 0.0007704895104895105,
"step": 2142,
"tokens_trained": 1.05282172
},
{
"epoch": 0.6081838167505851,
"grad_norm": 1.3947086334228516,
"loss": 4.1389,
"lr": 0.0007702097902097902,
"step": 2144,
"tokens_trained": 1.053802928
},
{
"epoch": 0.608751152400539,
"grad_norm": 3.4119038581848145,
"loss": 4.16,
"lr": 0.0007699300699300699,
"step": 2146,
"tokens_trained": 1.054784368
},
{
"epoch": 0.6093184880504928,
"grad_norm": 9.26860523223877,
"loss": 4.1841,
"lr": 0.0007696503496503497,
"step": 2148,
"tokens_trained": 1.05576888
},
{
"epoch": 0.6098858237004467,
"grad_norm": 8.744836807250977,
"loss": 4.1043,
"lr": 0.0007693706293706294,
"step": 2150,
"tokens_trained": 1.056751336
},
{
"epoch": 0.6104531593504007,
"grad_norm": 8.805045127868652,
"loss": 4.1032,
"lr": 0.000769090909090909,
"step": 2152,
"tokens_trained": 1.057734
},
{
"epoch": 0.6110204950003546,
"grad_norm": 4.785625457763672,
"loss": 4.1817,
"lr": 0.0007688111888111888,
"step": 2154,
"tokens_trained": 1.058716328
},
{
"epoch": 0.6115878306503085,
"grad_norm": 2.2137513160705566,
"loss": 4.1514,
"lr": 0.0007685314685314686,
"step": 2156,
"tokens_trained": 1.059696248
},
{
"epoch": 0.6121551663002623,
"grad_norm": 7.164271354675293,
"loss": 4.1433,
"lr": 0.0007682517482517483,
"step": 2158,
"tokens_trained": 1.060676648
},
{
"epoch": 0.6127225019502163,
"grad_norm": 9.481597900390625,
"loss": 4.0971,
"lr": 0.000767972027972028,
"step": 2160,
"tokens_trained": 1.061656688
},
{
"epoch": 0.6132898376001702,
"grad_norm": 11.28831672668457,
"loss": 4.149,
"lr": 0.0007676923076923077,
"step": 2162,
"tokens_trained": 1.062640576
},
{
"epoch": 0.6138571732501241,
"grad_norm": 17.21572494506836,
"loss": 4.098,
"lr": 0.0007674125874125874,
"step": 2164,
"tokens_trained": 1.063617688
},
{
"epoch": 0.614424508900078,
"grad_norm": 14.486310005187988,
"loss": 4.123,
"lr": 0.0007671328671328672,
"step": 2166,
"tokens_trained": 1.06460584
},
{
"epoch": 0.6149918445500319,
"grad_norm": 10.582398414611816,
"loss": 4.1243,
"lr": 0.0007668531468531469,
"step": 2168,
"tokens_trained": 1.065589064
},
{
"epoch": 0.6155591801999858,
"grad_norm": 12.923002243041992,
"loss": 4.0928,
"lr": 0.0007665734265734265,
"step": 2170,
"tokens_trained": 1.06657224
},
{
"epoch": 0.6161265158499397,
"grad_norm": 12.445414543151855,
"loss": 4.1697,
"lr": 0.0007662937062937063,
"step": 2172,
"tokens_trained": 1.067556952
},
{
"epoch": 0.6166938514998936,
"grad_norm": 3.562396287918091,
"loss": 4.0763,
"lr": 0.000766013986013986,
"step": 2174,
"tokens_trained": 1.068538248
},
{
"epoch": 0.6172611871498476,
"grad_norm": 12.62887954711914,
"loss": 4.1203,
"lr": 0.0007657342657342658,
"step": 2176,
"tokens_trained": 1.06952032
},
{
"epoch": 0.6178285227998014,
"grad_norm": 9.387356758117676,
"loss": 4.1318,
"lr": 0.0007654545454545455,
"step": 2178,
"tokens_trained": 1.070503872
},
{
"epoch": 0.6183958584497553,
"grad_norm": 8.885710716247559,
"loss": 4.1609,
"lr": 0.0007651748251748251,
"step": 2180,
"tokens_trained": 1.071486328
},
{
"epoch": 0.6189631940997092,
"grad_norm": 7.174533843994141,
"loss": 4.0824,
"lr": 0.0007648951048951049,
"step": 2182,
"tokens_trained": 1.07246928
},
{
"epoch": 0.6195305297496632,
"grad_norm": 15.866931915283203,
"loss": 4.1461,
"lr": 0.0007646153846153846,
"step": 2184,
"tokens_trained": 1.07345252
},
{
"epoch": 0.6200978653996171,
"grad_norm": 4.892337799072266,
"loss": 4.1418,
"lr": 0.0007643356643356644,
"step": 2186,
"tokens_trained": 1.07443796
},
{
"epoch": 0.6206652010495709,
"grad_norm": 4.796551704406738,
"loss": 4.1394,
"lr": 0.000764055944055944,
"step": 2188,
"tokens_trained": 1.075421392
},
{
"epoch": 0.6212325366995248,
"grad_norm": 10.585665702819824,
"loss": 4.1046,
"lr": 0.0007637762237762238,
"step": 2190,
"tokens_trained": 1.076404848
},
{
"epoch": 0.6217998723494788,
"grad_norm": 8.71747875213623,
"loss": 4.1819,
"lr": 0.0007634965034965035,
"step": 2192,
"tokens_trained": 1.077386672
},
{
"epoch": 0.6223672079994327,
"grad_norm": 10.74347972869873,
"loss": 4.1231,
"lr": 0.0007632167832167833,
"step": 2194,
"tokens_trained": 1.078365112
},
{
"epoch": 0.6229345436493866,
"grad_norm": 12.079446792602539,
"loss": 4.1132,
"lr": 0.000762937062937063,
"step": 2196,
"tokens_trained": 1.07935376
},
{
"epoch": 0.6235018792993404,
"grad_norm": 7.8133649826049805,
"loss": 4.0915,
"lr": 0.0007626573426573426,
"step": 2198,
"tokens_trained": 1.080332872
},
{
"epoch": 0.6240692149492943,
"grad_norm": 4.51243782043457,
"loss": 4.1108,
"lr": 0.0007623776223776224,
"step": 2200,
"tokens_trained": 1.081316664
},
{
"epoch": 0.6246365505992483,
"grad_norm": 12.625933647155762,
"loss": 4.1552,
"lr": 0.0007620979020979021,
"step": 2202,
"tokens_trained": 1.08230448
},
{
"epoch": 0.6252038862492022,
"grad_norm": 9.984200477600098,
"loss": 4.1199,
"lr": 0.0007618181818181819,
"step": 2204,
"tokens_trained": 1.083288992
},
{
"epoch": 0.6257712218991561,
"grad_norm": 11.338666915893555,
"loss": 4.0821,
"lr": 0.0007615384615384615,
"step": 2206,
"tokens_trained": 1.084273864
},
{
"epoch": 0.6263385575491099,
"grad_norm": 6.808894634246826,
"loss": 4.1202,
"lr": 0.0007612587412587412,
"step": 2208,
"tokens_trained": 1.085254584
},
{
"epoch": 0.6269058931990639,
"grad_norm": 4.182394027709961,
"loss": 4.1072,
"lr": 0.000760979020979021,
"step": 2210,
"tokens_trained": 1.086237312
},
{
"epoch": 0.6274732288490178,
"grad_norm": 13.04654312133789,
"loss": 4.1611,
"lr": 0.0007606993006993007,
"step": 2212,
"tokens_trained": 1.087220136
},
{
"epoch": 0.6280405644989717,
"grad_norm": 8.223962783813477,
"loss": 4.1094,
"lr": 0.0007604195804195805,
"step": 2214,
"tokens_trained": 1.088203464
},
{
"epoch": 0.6286079001489256,
"grad_norm": 7.974697589874268,
"loss": 4.1061,
"lr": 0.0007601398601398601,
"step": 2216,
"tokens_trained": 1.089188056
},
{
"epoch": 0.6291752357988795,
"grad_norm": 9.93747329711914,
"loss": 4.1625,
"lr": 0.0007598601398601399,
"step": 2218,
"tokens_trained": 1.090168464
},
{
"epoch": 0.6297425714488334,
"grad_norm": 14.117332458496094,
"loss": 4.1386,
"lr": 0.0007595804195804196,
"step": 2220,
"tokens_trained": 1.09115228
},
{
"epoch": 0.6303099070987873,
"grad_norm": 8.045380592346191,
"loss": 4.0962,
"lr": 0.0007593006993006993,
"step": 2222,
"tokens_trained": 1.0921348
},
{
"epoch": 0.6308772427487412,
"grad_norm": 7.286352634429932,
"loss": 4.1456,
"lr": 0.000759020979020979,
"step": 2224,
"tokens_trained": 1.0931198
},
{
"epoch": 0.6314445783986952,
"grad_norm": 7.278292179107666,
"loss": 4.1155,
"lr": 0.0007587412587412587,
"step": 2226,
"tokens_trained": 1.094107536
},
{
"epoch": 0.632011914048649,
"grad_norm": 5.973489761352539,
"loss": 4.1403,
"lr": 0.0007584615384615385,
"step": 2228,
"tokens_trained": 1.095090384
},
{
"epoch": 0.6325792496986029,
"grad_norm": 11.78962230682373,
"loss": 4.1322,
"lr": 0.0007581818181818182,
"step": 2230,
"tokens_trained": 1.096072192
},
{
"epoch": 0.6331465853485568,
"grad_norm": 9.853010177612305,
"loss": 4.0905,
"lr": 0.000757902097902098,
"step": 2232,
"tokens_trained": 1.097057368
},
{
"epoch": 0.6337139209985108,
"grad_norm": 12.578025817871094,
"loss": 4.0871,
"lr": 0.0007576223776223776,
"step": 2234,
"tokens_trained": 1.0980418
},
{
"epoch": 0.6342812566484647,
"grad_norm": 8.467657089233398,
"loss": 4.0972,
"lr": 0.0007573426573426573,
"step": 2236,
"tokens_trained": 1.099023032
},
{
"epoch": 0.6348485922984185,
"grad_norm": 10.768691062927246,
"loss": 4.0683,
"lr": 0.0007570629370629371,
"step": 2238,
"tokens_trained": 1.1000078
},
{
"epoch": 0.6354159279483724,
"grad_norm": 8.509350776672363,
"loss": 4.1319,
"lr": 0.0007567832167832168,
"step": 2240,
"tokens_trained": 1.100990904
},
{
"epoch": 0.6359832635983264,
"grad_norm": 9.473450660705566,
"loss": 4.0971,
"lr": 0.0007565034965034965,
"step": 2242,
"tokens_trained": 1.101971112
},
{
"epoch": 0.6365505992482803,
"grad_norm": 5.248406887054443,
"loss": 4.1212,
"lr": 0.0007562237762237762,
"step": 2244,
"tokens_trained": 1.10295244
},
{
"epoch": 0.6371179348982342,
"grad_norm": 2.8849964141845703,
"loss": 4.0914,
"lr": 0.000755944055944056,
"step": 2246,
"tokens_trained": 1.103935728
},
{
"epoch": 0.637685270548188,
"grad_norm": 10.757996559143066,
"loss": 4.0711,
"lr": 0.0007556643356643357,
"step": 2248,
"tokens_trained": 1.104917112
},
{
"epoch": 0.638252606198142,
"grad_norm": 14.822528839111328,
"loss": 4.1311,
"lr": 0.0007553846153846154,
"step": 2250,
"tokens_trained": 1.105899872
},
{
"epoch": 0.638252606198142,
"eval_loss": 1.0298579931259155,
"eval_runtime": 20.7482,
"step": 2250,
"tokens_trained": 1.105899872
}
],
"logging_steps": 2,
"max_steps": 7650,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 750,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}