Gencode-MxDNA / checkpoint-4500 /trainer_state.json
andyjzhao's picture
Upload folder using huggingface_hub
fd8f11e verified
{
"best_global_step": 4500,
"best_metric": 0.9826880097389221,
"best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-4500",
"epoch": 1.2762924615275513,
"eval_steps": 125,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005673356499539039,
"grad_norm": 8450.4345703125,
"loss": 876.9911,
"lr": 2e-06,
"step": 2,
"tokens_trained": 0.000985992
},
{
"epoch": 0.0011346712999078079,
"grad_norm": 8980.888671875,
"loss": 779.4711,
"lr": 6e-06,
"step": 4,
"tokens_trained": 0.001968088
},
{
"epoch": 0.001702006949861712,
"grad_norm": 7489.92529296875,
"loss": 488.6157,
"lr": 1e-05,
"step": 6,
"tokens_trained": 0.002953808
},
{
"epoch": 0.0022693425998156157,
"grad_norm": 1952.1917724609375,
"loss": 237.0602,
"lr": 1.4e-05,
"step": 8,
"tokens_trained": 0.003935728
},
{
"epoch": 0.0028366782497695198,
"grad_norm": 1418.443603515625,
"loss": 159.0854,
"lr": 1.8e-05,
"step": 10,
"tokens_trained": 0.004916488
},
{
"epoch": 0.003404013899723424,
"grad_norm": 874.7195434570312,
"loss": 91.9563,
"lr": 2.2e-05,
"step": 12,
"tokens_trained": 0.005902792
},
{
"epoch": 0.003971349549677328,
"grad_norm": 1339.8248291015625,
"loss": 40.3366,
"lr": 2.6e-05,
"step": 14,
"tokens_trained": 0.0068856
},
{
"epoch": 0.0045386851996312315,
"grad_norm": 2936.7607421875,
"loss": 22.7436,
"lr": 3e-05,
"step": 16,
"tokens_trained": 0.007868248
},
{
"epoch": 0.005106020849585136,
"grad_norm": 1531.3807373046875,
"loss": 23.4797,
"lr": 3.4000000000000007e-05,
"step": 18,
"tokens_trained": 0.008849296
},
{
"epoch": 0.0056733564995390395,
"grad_norm": 3027.4189453125,
"loss": 38.7379,
"lr": 3.8e-05,
"step": 20,
"tokens_trained": 0.009830984
},
{
"epoch": 0.006240692149492944,
"grad_norm": 2435.890625,
"loss": 26.2427,
"lr": 4.2000000000000004e-05,
"step": 22,
"tokens_trained": 0.01081364
},
{
"epoch": 0.006808027799446848,
"grad_norm": 3217.990478515625,
"loss": 31.0263,
"lr": 4.6e-05,
"step": 24,
"tokens_trained": 0.01179036
},
{
"epoch": 0.007375363449400752,
"grad_norm": 3854.00634765625,
"loss": 33.8781,
"lr": 5e-05,
"step": 26,
"tokens_trained": 0.012774504
},
{
"epoch": 0.007942699099354656,
"grad_norm": 3197.489990234375,
"loss": 27.7927,
"lr": 5.4e-05,
"step": 28,
"tokens_trained": 0.013759992
},
{
"epoch": 0.00851003474930856,
"grad_norm": 3034.156494140625,
"loss": 37.9083,
"lr": 5.800000000000001e-05,
"step": 30,
"tokens_trained": 0.014740536
},
{
"epoch": 0.009077370399262463,
"grad_norm": 3040.314453125,
"loss": 34.0659,
"lr": 6.2e-05,
"step": 32,
"tokens_trained": 0.015725984
},
{
"epoch": 0.009644706049216368,
"grad_norm": 3065.5791015625,
"loss": 27.7768,
"lr": 6.6e-05,
"step": 34,
"tokens_trained": 0.016706864
},
{
"epoch": 0.010212041699170272,
"grad_norm": 2454.293701171875,
"loss": 35.1143,
"lr": 7.000000000000001e-05,
"step": 36,
"tokens_trained": 0.017688816
},
{
"epoch": 0.010779377349124175,
"grad_norm": 3100.7802734375,
"loss": 42.2603,
"lr": 7.4e-05,
"step": 38,
"tokens_trained": 0.018669072
},
{
"epoch": 0.011346712999078079,
"grad_norm": 2749.84423828125,
"loss": 39.3879,
"lr": 7.8e-05,
"step": 40,
"tokens_trained": 0.019652072
},
{
"epoch": 0.011914048649031984,
"grad_norm": 1519.9908447265625,
"loss": 35.0735,
"lr": 8.2e-05,
"step": 42,
"tokens_trained": 0.020633112
},
{
"epoch": 0.012481384298985888,
"grad_norm": 1474.4244384765625,
"loss": 25.8965,
"lr": 8.599999999999999e-05,
"step": 44,
"tokens_trained": 0.021616192
},
{
"epoch": 0.013048719948939792,
"grad_norm": 2962.500244140625,
"loss": 51.0784,
"lr": 8.999999999999999e-05,
"step": 46,
"tokens_trained": 0.022597288
},
{
"epoch": 0.013616055598893695,
"grad_norm": 2419.41455078125,
"loss": 43.0334,
"lr": 9.400000000000001e-05,
"step": 48,
"tokens_trained": 0.02357572
},
{
"epoch": 0.014183391248847599,
"grad_norm": 1267.87451171875,
"loss": 21.8063,
"lr": 9.800000000000001e-05,
"step": 50,
"tokens_trained": 0.024553376
},
{
"epoch": 0.014750726898801504,
"grad_norm": 1573.944091796875,
"loss": 52.9693,
"lr": 0.000102,
"step": 52,
"tokens_trained": 0.025536728
},
{
"epoch": 0.015318062548755408,
"grad_norm": 1509.650146484375,
"loss": 50.0825,
"lr": 0.000106,
"step": 54,
"tokens_trained": 0.026517
},
{
"epoch": 0.01588539819870931,
"grad_norm": 2334.765380859375,
"loss": 42.1982,
"lr": 0.00011,
"step": 56,
"tokens_trained": 0.027504728
},
{
"epoch": 0.016452733848663217,
"grad_norm": 1594.16259765625,
"loss": 39.0562,
"lr": 0.000114,
"step": 58,
"tokens_trained": 0.028485416
},
{
"epoch": 0.01702006949861712,
"grad_norm": 1628.082275390625,
"loss": 35.0488,
"lr": 0.000118,
"step": 60,
"tokens_trained": 0.029468696
},
{
"epoch": 0.017587405148571024,
"grad_norm": 2496.6455078125,
"loss": 49.4241,
"lr": 0.000122,
"step": 62,
"tokens_trained": 0.030453584
},
{
"epoch": 0.018154740798524926,
"grad_norm": 2521.721435546875,
"loss": 69.0275,
"lr": 0.000126,
"step": 64,
"tokens_trained": 0.031432864
},
{
"epoch": 0.01872207644847883,
"grad_norm": 2179.571533203125,
"loss": 63.1409,
"lr": 0.00013000000000000002,
"step": 66,
"tokens_trained": 0.032418416
},
{
"epoch": 0.019289412098432736,
"grad_norm": 899.7137451171875,
"loss": 38.4131,
"lr": 0.000134,
"step": 68,
"tokens_trained": 0.033402136
},
{
"epoch": 0.01985674774838664,
"grad_norm": 2109.377685546875,
"loss": 51.0044,
"lr": 0.00013800000000000002,
"step": 70,
"tokens_trained": 0.03438832
},
{
"epoch": 0.020424083398340544,
"grad_norm": 1649.1873779296875,
"loss": 32.1408,
"lr": 0.00014199999999999998,
"step": 72,
"tokens_trained": 0.035374464
},
{
"epoch": 0.020991419048294446,
"grad_norm": 1807.994140625,
"loss": 28.8357,
"lr": 0.000146,
"step": 74,
"tokens_trained": 0.03635784
},
{
"epoch": 0.02155875469824835,
"grad_norm": 998.9485473632812,
"loss": 23.0343,
"lr": 0.00015,
"step": 76,
"tokens_trained": 0.037340248
},
{
"epoch": 0.022126090348202256,
"grad_norm": 2240.17578125,
"loss": 32.0397,
"lr": 0.000154,
"step": 78,
"tokens_trained": 0.038321968
},
{
"epoch": 0.022693425998156158,
"grad_norm": 1606.0067138671875,
"loss": 32.1776,
"lr": 0.000158,
"step": 80,
"tokens_trained": 0.039304992
},
{
"epoch": 0.023260761648110063,
"grad_norm": 1685.1015625,
"loss": 24.3428,
"lr": 0.000162,
"step": 82,
"tokens_trained": 0.040286808
},
{
"epoch": 0.02382809729806397,
"grad_norm": 1761.7890625,
"loss": 23.9261,
"lr": 0.00016600000000000002,
"step": 84,
"tokens_trained": 0.041271776
},
{
"epoch": 0.02439543294801787,
"grad_norm": 2036.0982666015625,
"loss": 27.7196,
"lr": 0.00017,
"step": 86,
"tokens_trained": 0.042252784
},
{
"epoch": 0.024962768597971776,
"grad_norm": 1564.3870849609375,
"loss": 25.3722,
"lr": 0.000174,
"step": 88,
"tokens_trained": 0.04323596
},
{
"epoch": 0.025530104247925678,
"grad_norm": 1508.349853515625,
"loss": 18.4107,
"lr": 0.000178,
"step": 90,
"tokens_trained": 0.044218984
},
{
"epoch": 0.026097439897879583,
"grad_norm": 1955.011474609375,
"loss": 28.8456,
"lr": 0.000182,
"step": 92,
"tokens_trained": 0.045202144
},
{
"epoch": 0.02666477554783349,
"grad_norm": 1679.9423828125,
"loss": 23.6139,
"lr": 0.000186,
"step": 94,
"tokens_trained": 0.046192336
},
{
"epoch": 0.02723211119778739,
"grad_norm": 1517.5731201171875,
"loss": 42.145,
"lr": 0.00019,
"step": 96,
"tokens_trained": 0.047174312
},
{
"epoch": 0.027799446847741296,
"grad_norm": 1535.3076171875,
"loss": 31.9711,
"lr": 0.000194,
"step": 98,
"tokens_trained": 0.048158944
},
{
"epoch": 0.028366782497695198,
"grad_norm": 1475.2569580078125,
"loss": 37.645,
"lr": 0.00019800000000000002,
"step": 100,
"tokens_trained": 0.04914364
},
{
"epoch": 0.028934118147649103,
"grad_norm": 1918.4088134765625,
"loss": 69.4053,
"lr": 0.000202,
"step": 102,
"tokens_trained": 0.050123488
},
{
"epoch": 0.02950145379760301,
"grad_norm": 1631.6231689453125,
"loss": 50.9725,
"lr": 0.000206,
"step": 104,
"tokens_trained": 0.051105512
},
{
"epoch": 0.03006878944755691,
"grad_norm": 1291.6376953125,
"loss": 22.6527,
"lr": 0.00021,
"step": 106,
"tokens_trained": 0.052091704
},
{
"epoch": 0.030636125097510816,
"grad_norm": 1224.9625244140625,
"loss": 60.2725,
"lr": 0.000214,
"step": 108,
"tokens_trained": 0.053074824
},
{
"epoch": 0.031203460747464717,
"grad_norm": 1218.2022705078125,
"loss": 75.8728,
"lr": 0.000218,
"step": 110,
"tokens_trained": 0.054057104
},
{
"epoch": 0.03177079639741862,
"grad_norm": 1761.8861083984375,
"loss": 61.6427,
"lr": 0.000222,
"step": 112,
"tokens_trained": 0.055039128
},
{
"epoch": 0.03233813204737253,
"grad_norm": 1482.4256591796875,
"loss": 35.3351,
"lr": 0.00022600000000000002,
"step": 114,
"tokens_trained": 0.05602388
},
{
"epoch": 0.03290546769732643,
"grad_norm": 563.6399536132812,
"loss": 40.1461,
"lr": 0.00023,
"step": 116,
"tokens_trained": 0.057005376
},
{
"epoch": 0.03347280334728033,
"grad_norm": 1266.058837890625,
"loss": 24.0657,
"lr": 0.00023400000000000002,
"step": 118,
"tokens_trained": 0.057985136
},
{
"epoch": 0.03404013899723424,
"grad_norm": 918.206298828125,
"loss": 23.9626,
"lr": 0.00023799999999999998,
"step": 120,
"tokens_trained": 0.058968288
},
{
"epoch": 0.03460747464718814,
"grad_norm": 1495.7191162109375,
"loss": 19.798,
"lr": 0.000242,
"step": 122,
"tokens_trained": 0.05995348
},
{
"epoch": 0.03517481029714205,
"grad_norm": 1264.302734375,
"loss": 31.5342,
"lr": 0.000246,
"step": 124,
"tokens_trained": 0.060935832
},
{
"epoch": 0.035458478122119,
"eval_loss": 5.312118053436279,
"eval_runtime": 21.3065,
"step": 125,
"tokens_trained": 0.061426608
},
{
"epoch": 0.03574214594709595,
"grad_norm": 907.4861450195312,
"loss": 25.1262,
"lr": 0.00025,
"step": 126,
"tokens_trained": 0.061918184
},
{
"epoch": 0.03630948159704985,
"grad_norm": 1287.6158447265625,
"loss": 26.963,
"lr": 0.000254,
"step": 128,
"tokens_trained": 0.062902328
},
{
"epoch": 0.03687681724700376,
"grad_norm": 1260.570556640625,
"loss": 24.9633,
"lr": 0.00025800000000000004,
"step": 130,
"tokens_trained": 0.063883456
},
{
"epoch": 0.03744415289695766,
"grad_norm": 1436.82373046875,
"loss": 23.1028,
"lr": 0.000262,
"step": 132,
"tokens_trained": 0.06486748
},
{
"epoch": 0.03801148854691157,
"grad_norm": 812.9523315429688,
"loss": 20.5496,
"lr": 0.000266,
"step": 134,
"tokens_trained": 0.065847104
},
{
"epoch": 0.03857882419686547,
"grad_norm": 1336.5322265625,
"loss": 23.673,
"lr": 0.00027,
"step": 136,
"tokens_trained": 0.066829928
},
{
"epoch": 0.03914615984681937,
"grad_norm": 1381.282470703125,
"loss": 32.0373,
"lr": 0.00027400000000000005,
"step": 138,
"tokens_trained": 0.067814024
},
{
"epoch": 0.03971349549677328,
"grad_norm": 972.7861938476562,
"loss": 26.9454,
"lr": 0.00027800000000000004,
"step": 140,
"tokens_trained": 0.068797744
},
{
"epoch": 0.04028083114672718,
"grad_norm": 1347.2249755859375,
"loss": 22.3578,
"lr": 0.00028199999999999997,
"step": 142,
"tokens_trained": 0.069780072
},
{
"epoch": 0.04084816679668109,
"grad_norm": 829.525390625,
"loss": 37.9879,
"lr": 0.00028599999999999996,
"step": 144,
"tokens_trained": 0.070759896
},
{
"epoch": 0.04141550244663499,
"grad_norm": 1094.1033935546875,
"loss": 21.1972,
"lr": 0.00029,
"step": 146,
"tokens_trained": 0.0717452
},
{
"epoch": 0.04198283809658889,
"grad_norm": 717.107421875,
"loss": 21.7774,
"lr": 0.000294,
"step": 148,
"tokens_trained": 0.072727432
},
{
"epoch": 0.042550173746542796,
"grad_norm": 744.4456787109375,
"loss": 20.3235,
"lr": 0.000298,
"step": 150,
"tokens_trained": 0.073712128
},
{
"epoch": 0.0431175093964967,
"grad_norm": 904.1460571289062,
"loss": 22.7878,
"lr": 0.000302,
"step": 152,
"tokens_trained": 0.074695296
},
{
"epoch": 0.04368484504645061,
"grad_norm": 1352.303955078125,
"loss": 20.9757,
"lr": 0.000306,
"step": 154,
"tokens_trained": 0.0756798
},
{
"epoch": 0.04425218069640451,
"grad_norm": 997.0473022460938,
"loss": 17.4647,
"lr": 0.00031,
"step": 156,
"tokens_trained": 0.076666504
},
{
"epoch": 0.04481951634635841,
"grad_norm": 1206.387939453125,
"loss": 21.1846,
"lr": 0.000314,
"step": 158,
"tokens_trained": 0.07764868
},
{
"epoch": 0.045386851996312316,
"grad_norm": 1029.6807861328125,
"loss": 17.8853,
"lr": 0.00031800000000000003,
"step": 160,
"tokens_trained": 0.07863548
},
{
"epoch": 0.04595418764626622,
"grad_norm": 1136.4635009765625,
"loss": 30.057,
"lr": 0.000322,
"step": 162,
"tokens_trained": 0.079618928
},
{
"epoch": 0.04652152329622013,
"grad_norm": 834.3464965820312,
"loss": 28.1782,
"lr": 0.000326,
"step": 164,
"tokens_trained": 0.0806032
},
{
"epoch": 0.04708885894617403,
"grad_norm": 1177.8365478515625,
"loss": 16.4267,
"lr": 0.00033,
"step": 166,
"tokens_trained": 0.081583752
},
{
"epoch": 0.04765619459612794,
"grad_norm": 572.501708984375,
"loss": 16.5752,
"lr": 0.00033400000000000004,
"step": 168,
"tokens_trained": 0.082568184
},
{
"epoch": 0.048223530246081836,
"grad_norm": 437.6822814941406,
"loss": 11.5509,
"lr": 0.00033800000000000003,
"step": 170,
"tokens_trained": 0.083553352
},
{
"epoch": 0.04879086589603574,
"grad_norm": 1119.0416259765625,
"loss": 16.2689,
"lr": 0.000342,
"step": 172,
"tokens_trained": 0.084536352
},
{
"epoch": 0.04935820154598965,
"grad_norm": 895.4021606445312,
"loss": 12.6663,
"lr": 0.000346,
"step": 174,
"tokens_trained": 0.085517312
},
{
"epoch": 0.04992553719594355,
"grad_norm": 995.6289672851562,
"loss": 26.0663,
"lr": 0.00035,
"step": 176,
"tokens_trained": 0.086496088
},
{
"epoch": 0.05049287284589746,
"grad_norm": 839.6610717773438,
"loss": 21.5115,
"lr": 0.000354,
"step": 178,
"tokens_trained": 0.087480632
},
{
"epoch": 0.051060208495851356,
"grad_norm": 734.1155395507812,
"loss": 29.3287,
"lr": 0.000358,
"step": 180,
"tokens_trained": 0.088460408
},
{
"epoch": 0.05162754414580526,
"grad_norm": 721.4505615234375,
"loss": 26.0801,
"lr": 0.000362,
"step": 182,
"tokens_trained": 0.08944248
},
{
"epoch": 0.052194879795759166,
"grad_norm": 845.9672241210938,
"loss": 19.0639,
"lr": 0.000366,
"step": 184,
"tokens_trained": 0.090427832
},
{
"epoch": 0.05276221544571307,
"grad_norm": 1210.9969482421875,
"loss": 23.9036,
"lr": 0.00037,
"step": 186,
"tokens_trained": 0.091411504
},
{
"epoch": 0.05332955109566698,
"grad_norm": 1079.1690673828125,
"loss": 23.5588,
"lr": 0.000374,
"step": 188,
"tokens_trained": 0.092392672
},
{
"epoch": 0.053896886745620876,
"grad_norm": 596.111328125,
"loss": 20.8275,
"lr": 0.000378,
"step": 190,
"tokens_trained": 0.093374696
},
{
"epoch": 0.05446422239557478,
"grad_norm": 761.8096923828125,
"loss": 22.512,
"lr": 0.000382,
"step": 192,
"tokens_trained": 0.094361912
},
{
"epoch": 0.055031558045528686,
"grad_norm": 1081.9832763671875,
"loss": 32.335,
"lr": 0.000386,
"step": 194,
"tokens_trained": 0.095342992
},
{
"epoch": 0.05559889369548259,
"grad_norm": 304.3534240722656,
"loss": 11.5275,
"lr": 0.00039000000000000005,
"step": 196,
"tokens_trained": 0.096323512
},
{
"epoch": 0.0561662293454365,
"grad_norm": 586.6314086914062,
"loss": 16.2663,
"lr": 0.00039400000000000004,
"step": 198,
"tokens_trained": 0.097308864
},
{
"epoch": 0.056733564995390395,
"grad_norm": 624.9953002929688,
"loss": 16.627,
"lr": 0.000398,
"step": 200,
"tokens_trained": 0.098289064
},
{
"epoch": 0.0573009006453443,
"grad_norm": 585.9645385742188,
"loss": 15.8359,
"lr": 0.000402,
"step": 202,
"tokens_trained": 0.099269696
},
{
"epoch": 0.057868236295298206,
"grad_norm": 537.9913330078125,
"loss": 20.0779,
"lr": 0.00040600000000000006,
"step": 204,
"tokens_trained": 0.100248448
},
{
"epoch": 0.05843557194525211,
"grad_norm": 805.04931640625,
"loss": 21.4524,
"lr": 0.00041,
"step": 206,
"tokens_trained": 0.101231248
},
{
"epoch": 0.05900290759520602,
"grad_norm": 439.1418151855469,
"loss": 23.9852,
"lr": 0.000414,
"step": 208,
"tokens_trained": 0.102210688
},
{
"epoch": 0.059570243245159915,
"grad_norm": 502.684814453125,
"loss": 17.6273,
"lr": 0.00041799999999999997,
"step": 210,
"tokens_trained": 0.103192176
},
{
"epoch": 0.06013757889511382,
"grad_norm": 849.9979858398438,
"loss": 33.7517,
"lr": 0.000422,
"step": 212,
"tokens_trained": 0.104172824
},
{
"epoch": 0.060704914545067726,
"grad_norm": 939.583740234375,
"loss": 26.2559,
"lr": 0.000426,
"step": 214,
"tokens_trained": 0.105156672
},
{
"epoch": 0.06127225019502163,
"grad_norm": 525.0505981445312,
"loss": 20.0923,
"lr": 0.00043,
"step": 216,
"tokens_trained": 0.106141368
},
{
"epoch": 0.061839585844975536,
"grad_norm": 420.296630859375,
"loss": 17.9608,
"lr": 0.00043400000000000003,
"step": 218,
"tokens_trained": 0.107124088
},
{
"epoch": 0.062406921494929435,
"grad_norm": 711.3380737304688,
"loss": 19.387,
"lr": 0.000438,
"step": 220,
"tokens_trained": 0.108112632
},
{
"epoch": 0.06297425714488335,
"grad_norm": 759.183349609375,
"loss": 17.8061,
"lr": 0.000442,
"step": 222,
"tokens_trained": 0.1090934
},
{
"epoch": 0.06354159279483725,
"grad_norm": 790.025146484375,
"loss": 13.8539,
"lr": 0.000446,
"step": 224,
"tokens_trained": 0.110079512
},
{
"epoch": 0.06410892844479114,
"grad_norm": 769.8306274414062,
"loss": 22.1258,
"lr": 0.00045000000000000004,
"step": 226,
"tokens_trained": 0.111060152
},
{
"epoch": 0.06467626409474506,
"grad_norm": 656.8352661132812,
"loss": 14.8646,
"lr": 0.00045400000000000003,
"step": 228,
"tokens_trained": 0.112044144
},
{
"epoch": 0.06524359974469895,
"grad_norm": 498.92010498046875,
"loss": 23.1558,
"lr": 0.000458,
"step": 230,
"tokens_trained": 0.113022928
},
{
"epoch": 0.06581093539465287,
"grad_norm": 764.0186157226562,
"loss": 16.7089,
"lr": 0.000462,
"step": 232,
"tokens_trained": 0.114003832
},
{
"epoch": 0.06637827104460677,
"grad_norm": 491.5793762207031,
"loss": 12.3979,
"lr": 0.00046600000000000005,
"step": 234,
"tokens_trained": 0.114991008
},
{
"epoch": 0.06694560669456066,
"grad_norm": 679.9217529296875,
"loss": 14.9037,
"lr": 0.00047,
"step": 236,
"tokens_trained": 0.115971888
},
{
"epoch": 0.06751294234451458,
"grad_norm": 491.0369567871094,
"loss": 7.7603,
"lr": 0.000474,
"step": 238,
"tokens_trained": 0.116952616
},
{
"epoch": 0.06808027799446847,
"grad_norm": 369.2186279296875,
"loss": 8.2256,
"lr": 0.00047799999999999996,
"step": 240,
"tokens_trained": 0.117935816
},
{
"epoch": 0.06864761364442239,
"grad_norm": 312.72137451171875,
"loss": 7.5486,
"lr": 0.000482,
"step": 242,
"tokens_trained": 0.118919392
},
{
"epoch": 0.06921494929437629,
"grad_norm": 596.1439208984375,
"loss": 11.7351,
"lr": 0.000486,
"step": 244,
"tokens_trained": 0.119901856
},
{
"epoch": 0.06978228494433018,
"grad_norm": 467.5667419433594,
"loss": 11.8403,
"lr": 0.00049,
"step": 246,
"tokens_trained": 0.120884624
},
{
"epoch": 0.0703496205942841,
"grad_norm": 430.50048828125,
"loss": 13.8081,
"lr": 0.000494,
"step": 248,
"tokens_trained": 0.121869224
},
{
"epoch": 0.070916956244238,
"grad_norm": 522.242919921875,
"loss": 14.1892,
"lr": 0.000498,
"step": 250,
"tokens_trained": 0.122853584
},
{
"epoch": 0.070916956244238,
"eval_loss": 1.9294606447219849,
"eval_runtime": 20.4162,
"step": 250,
"tokens_trained": 0.122853584
},
{
"epoch": 0.0714842918941919,
"grad_norm": 835.2765502929688,
"loss": 13.2462,
"lr": 0.0005020000000000001,
"step": 252,
"tokens_trained": 0.123835544
},
{
"epoch": 0.0720516275441458,
"grad_norm": 714.8098754882812,
"loss": 20.0498,
"lr": 0.000506,
"step": 254,
"tokens_trained": 0.124821616
},
{
"epoch": 0.0726189631940997,
"grad_norm": 701.512939453125,
"loss": 18.3664,
"lr": 0.00051,
"step": 256,
"tokens_trained": 0.125807608
},
{
"epoch": 0.07318629884405362,
"grad_norm": 773.987060546875,
"loss": 21.3807,
"lr": 0.000514,
"step": 258,
"tokens_trained": 0.126791464
},
{
"epoch": 0.07375363449400751,
"grad_norm": 826.422119140625,
"loss": 22.6403,
"lr": 0.000518,
"step": 260,
"tokens_trained": 0.127771752
},
{
"epoch": 0.07432097014396143,
"grad_norm": 742.8673095703125,
"loss": 20.1504,
"lr": 0.000522,
"step": 262,
"tokens_trained": 0.128755448
},
{
"epoch": 0.07488830579391532,
"grad_norm": 797.79296875,
"loss": 26.7343,
"lr": 0.000526,
"step": 264,
"tokens_trained": 0.129741088
},
{
"epoch": 0.07545564144386922,
"grad_norm": 673.9141235351562,
"loss": 12.505,
"lr": 0.0005300000000000001,
"step": 266,
"tokens_trained": 0.130727504
},
{
"epoch": 0.07602297709382314,
"grad_norm": 310.6510925292969,
"loss": 12.6344,
"lr": 0.0005340000000000001,
"step": 268,
"tokens_trained": 0.131710296
},
{
"epoch": 0.07659031274377703,
"grad_norm": 312.40966796875,
"loss": 14.254,
"lr": 0.0005380000000000001,
"step": 270,
"tokens_trained": 0.132695352
},
{
"epoch": 0.07715764839373095,
"grad_norm": 492.2834777832031,
"loss": 19.0979,
"lr": 0.0005420000000000001,
"step": 272,
"tokens_trained": 0.133677928
},
{
"epoch": 0.07772498404368484,
"grad_norm": 628.457763671875,
"loss": 21.7735,
"lr": 0.000546,
"step": 274,
"tokens_trained": 0.134655504
},
{
"epoch": 0.07829231969363874,
"grad_norm": 382.8389892578125,
"loss": 12.5128,
"lr": 0.00055,
"step": 276,
"tokens_trained": 0.135640208
},
{
"epoch": 0.07885965534359266,
"grad_norm": 483.12335205078125,
"loss": 15.2589,
"lr": 0.000554,
"step": 278,
"tokens_trained": 0.136624232
},
{
"epoch": 0.07942699099354655,
"grad_norm": 640.658447265625,
"loss": 12.1341,
"lr": 0.000558,
"step": 280,
"tokens_trained": 0.13760628
},
{
"epoch": 0.07999432664350047,
"grad_norm": 410.0824279785156,
"loss": 12.5723,
"lr": 0.0005620000000000001,
"step": 282,
"tokens_trained": 0.13858832
},
{
"epoch": 0.08056166229345436,
"grad_norm": 513.2861328125,
"loss": 14.8461,
"lr": 0.000566,
"step": 284,
"tokens_trained": 0.139568424
},
{
"epoch": 0.08112899794340826,
"grad_norm": 564.547607421875,
"loss": 12.5792,
"lr": 0.00057,
"step": 286,
"tokens_trained": 0.140557016
},
{
"epoch": 0.08169633359336217,
"grad_norm": 451.3592834472656,
"loss": 16.5433,
"lr": 0.000574,
"step": 288,
"tokens_trained": 0.141540248
},
{
"epoch": 0.08226366924331607,
"grad_norm": 404.2495422363281,
"loss": 16.4138,
"lr": 0.000578,
"step": 290,
"tokens_trained": 0.142528272
},
{
"epoch": 0.08283100489326999,
"grad_norm": 566.5219116210938,
"loss": 16.4743,
"lr": 0.0005819999999999999,
"step": 292,
"tokens_trained": 0.143513096
},
{
"epoch": 0.08339834054322388,
"grad_norm": 559.6517333984375,
"loss": 16.421,
"lr": 0.0005859999999999999,
"step": 294,
"tokens_trained": 0.144494472
},
{
"epoch": 0.08396567619317778,
"grad_norm": 260.874755859375,
"loss": 11.2214,
"lr": 0.00059,
"step": 296,
"tokens_trained": 0.14547876
},
{
"epoch": 0.0845330118431317,
"grad_norm": 272.02899169921875,
"loss": 10.3491,
"lr": 0.000594,
"step": 298,
"tokens_trained": 0.146465864
},
{
"epoch": 0.08510034749308559,
"grad_norm": 556.9845581054688,
"loss": 10.4348,
"lr": 0.000598,
"step": 300,
"tokens_trained": 0.147446344
},
{
"epoch": 0.0856676831430395,
"grad_norm": 273.35772705078125,
"loss": 8.3292,
"lr": 0.000602,
"step": 302,
"tokens_trained": 0.14843244
},
{
"epoch": 0.0862350187929934,
"grad_norm": 246.6316680908203,
"loss": 9.9362,
"lr": 0.000606,
"step": 304,
"tokens_trained": 0.149415976
},
{
"epoch": 0.0868023544429473,
"grad_norm": 564.4365844726562,
"loss": 9.2621,
"lr": 0.00061,
"step": 306,
"tokens_trained": 0.150398728
},
{
"epoch": 0.08736969009290121,
"grad_norm": 396.0948791503906,
"loss": 11.8526,
"lr": 0.000614,
"step": 308,
"tokens_trained": 0.151385104
},
{
"epoch": 0.08793702574285511,
"grad_norm": 488.6072692871094,
"loss": 11.8473,
"lr": 0.0006180000000000001,
"step": 310,
"tokens_trained": 0.152373672
},
{
"epoch": 0.08850436139280903,
"grad_norm": 346.70660400390625,
"loss": 12.0897,
"lr": 0.000622,
"step": 312,
"tokens_trained": 0.153356256
},
{
"epoch": 0.08907169704276292,
"grad_norm": 382.40679931640625,
"loss": 9.271,
"lr": 0.000626,
"step": 314,
"tokens_trained": 0.154342632
},
{
"epoch": 0.08963903269271682,
"grad_norm": 288.7908935546875,
"loss": 9.185,
"lr": 0.00063,
"step": 316,
"tokens_trained": 0.1553238
},
{
"epoch": 0.09020636834267073,
"grad_norm": 337.5335388183594,
"loss": 12.0555,
"lr": 0.000634,
"step": 318,
"tokens_trained": 0.156313168
},
{
"epoch": 0.09077370399262463,
"grad_norm": 349.25531005859375,
"loss": 8.51,
"lr": 0.000638,
"step": 320,
"tokens_trained": 0.157299448
},
{
"epoch": 0.09134103964257854,
"grad_norm": 471.7824401855469,
"loss": 14.1888,
"lr": 0.000642,
"step": 322,
"tokens_trained": 0.158285264
},
{
"epoch": 0.09190837529253244,
"grad_norm": 284.94036865234375,
"loss": 10.1593,
"lr": 0.000646,
"step": 324,
"tokens_trained": 0.159267512
},
{
"epoch": 0.09247571094248634,
"grad_norm": 510.90478515625,
"loss": 13.5744,
"lr": 0.0006500000000000001,
"step": 326,
"tokens_trained": 0.160250856
},
{
"epoch": 0.09304304659244025,
"grad_norm": 373.82965087890625,
"loss": 8.4999,
"lr": 0.0006540000000000001,
"step": 328,
"tokens_trained": 0.161231832
},
{
"epoch": 0.09361038224239415,
"grad_norm": 219.3827362060547,
"loss": 8.4436,
"lr": 0.0006580000000000001,
"step": 330,
"tokens_trained": 0.162217656
},
{
"epoch": 0.09417771789234806,
"grad_norm": 433.0914001464844,
"loss": 11.2019,
"lr": 0.000662,
"step": 332,
"tokens_trained": 0.163199096
},
{
"epoch": 0.09474505354230196,
"grad_norm": 242.65907287597656,
"loss": 9.0666,
"lr": 0.000666,
"step": 334,
"tokens_trained": 0.164178512
},
{
"epoch": 0.09531238919225588,
"grad_norm": 446.07916259765625,
"loss": 8.6546,
"lr": 0.00067,
"step": 336,
"tokens_trained": 0.165162464
},
{
"epoch": 0.09587972484220977,
"grad_norm": 231.8892364501953,
"loss": 7.5819,
"lr": 0.000674,
"step": 338,
"tokens_trained": 0.166141536
},
{
"epoch": 0.09644706049216367,
"grad_norm": 100.7306137084961,
"loss": 6.7047,
"lr": 0.0006780000000000001,
"step": 340,
"tokens_trained": 0.167123944
},
{
"epoch": 0.09701439614211758,
"grad_norm": 78.11279296875,
"loss": 5.9308,
"lr": 0.0006820000000000001,
"step": 342,
"tokens_trained": 0.168105264
},
{
"epoch": 0.09758173179207148,
"grad_norm": 271.466064453125,
"loss": 6.9141,
"lr": 0.0006860000000000001,
"step": 344,
"tokens_trained": 0.169088912
},
{
"epoch": 0.0981490674420254,
"grad_norm": 252.54478454589844,
"loss": 6.3281,
"lr": 0.00069,
"step": 346,
"tokens_trained": 0.170077368
},
{
"epoch": 0.0987164030919793,
"grad_norm": 305.8559875488281,
"loss": 6.443,
"lr": 0.000694,
"step": 348,
"tokens_trained": 0.171057232
},
{
"epoch": 0.09928373874193319,
"grad_norm": 227.74374389648438,
"loss": 6.552,
"lr": 0.0006979999999999999,
"step": 350,
"tokens_trained": 0.172041376
},
{
"epoch": 0.0998510743918871,
"grad_norm": 446.7601623535156,
"loss": 10.8184,
"lr": 0.0007019999999999999,
"step": 352,
"tokens_trained": 0.173023624
},
{
"epoch": 0.100418410041841,
"grad_norm": 353.0849609375,
"loss": 8.6327,
"lr": 0.0007059999999999999,
"step": 354,
"tokens_trained": 0.174005992
},
{
"epoch": 0.10098574569179491,
"grad_norm": 367.9427185058594,
"loss": 9.3898,
"lr": 0.00071,
"step": 356,
"tokens_trained": 0.174988304
},
{
"epoch": 0.10155308134174881,
"grad_norm": 224.4961700439453,
"loss": 8.284,
"lr": 0.000714,
"step": 358,
"tokens_trained": 0.175969816
},
{
"epoch": 0.10212041699170271,
"grad_norm": 221.86537170410156,
"loss": 7.0578,
"lr": 0.000718,
"step": 360,
"tokens_trained": 0.176952688
},
{
"epoch": 0.10268775264165662,
"grad_norm": 331.0989685058594,
"loss": 6.9561,
"lr": 0.000722,
"step": 362,
"tokens_trained": 0.177935144
},
{
"epoch": 0.10325508829161052,
"grad_norm": 171.6498260498047,
"loss": 7.203,
"lr": 0.000726,
"step": 364,
"tokens_trained": 0.178916776
},
{
"epoch": 0.10382242394156443,
"grad_norm": 284.2208557128906,
"loss": 10.3517,
"lr": 0.00073,
"step": 366,
"tokens_trained": 0.179903432
},
{
"epoch": 0.10438975959151833,
"grad_norm": 354.8574523925781,
"loss": 9.3888,
"lr": 0.000734,
"step": 368,
"tokens_trained": 0.180883224
},
{
"epoch": 0.10495709524147223,
"grad_norm": 344.82574462890625,
"loss": 10.5933,
"lr": 0.000738,
"step": 370,
"tokens_trained": 0.181863808
},
{
"epoch": 0.10552443089142614,
"grad_norm": 302.6838073730469,
"loss": 10.2832,
"lr": 0.000742,
"step": 372,
"tokens_trained": 0.182843712
},
{
"epoch": 0.10609176654138004,
"grad_norm": 323.0387878417969,
"loss": 6.4864,
"lr": 0.000746,
"step": 374,
"tokens_trained": 0.183825832
},
{
"epoch": 0.10637543436635699,
"eval_loss": 1.4430732727050781,
"eval_runtime": 20.5468,
"step": 375,
"tokens_trained": 0.184317744
},
{
"epoch": 0.10665910219133395,
"grad_norm": 133.74822998046875,
"loss": 5.4176,
"lr": 0.00075,
"step": 376,
"tokens_trained": 0.184811352
},
{
"epoch": 0.10722643784128785,
"grad_norm": 180.3372344970703,
"loss": 5.5641,
"lr": 0.000754,
"step": 378,
"tokens_trained": 0.185792528
},
{
"epoch": 0.10779377349124175,
"grad_norm": 250.83999633789062,
"loss": 5.8612,
"lr": 0.000758,
"step": 380,
"tokens_trained": 0.186777112
},
{
"epoch": 0.10836110914119566,
"grad_norm": 293.51959228515625,
"loss": 6.0418,
"lr": 0.000762,
"step": 382,
"tokens_trained": 0.18775724
},
{
"epoch": 0.10892844479114956,
"grad_norm": 292.56207275390625,
"loss": 6.1812,
"lr": 0.0007660000000000001,
"step": 384,
"tokens_trained": 0.188733568
},
{
"epoch": 0.10949578044110347,
"grad_norm": 121.82467651367188,
"loss": 6.0855,
"lr": 0.0007700000000000001,
"step": 386,
"tokens_trained": 0.189718512
},
{
"epoch": 0.11006311609105737,
"grad_norm": 124.30497741699219,
"loss": 5.7734,
"lr": 0.0007740000000000001,
"step": 388,
"tokens_trained": 0.190703776
},
{
"epoch": 0.11063045174101127,
"grad_norm": 143.64004516601562,
"loss": 5.7641,
"lr": 0.000778,
"step": 390,
"tokens_trained": 0.191689888
},
{
"epoch": 0.11119778739096518,
"grad_norm": 160.06784057617188,
"loss": 5.6025,
"lr": 0.000782,
"step": 392,
"tokens_trained": 0.192673992
},
{
"epoch": 0.11176512304091908,
"grad_norm": 226.97988891601562,
"loss": 6.0049,
"lr": 0.000786,
"step": 394,
"tokens_trained": 0.193656272
},
{
"epoch": 0.112332458690873,
"grad_norm": 223.26898193359375,
"loss": 5.6972,
"lr": 0.00079,
"step": 396,
"tokens_trained": 0.194639144
},
{
"epoch": 0.11289979434082689,
"grad_norm": 249.34912109375,
"loss": 5.7348,
"lr": 0.0007940000000000001,
"step": 398,
"tokens_trained": 0.195621256
},
{
"epoch": 0.11346712999078079,
"grad_norm": 161.34271240234375,
"loss": 5.6689,
"lr": 0.0007980000000000001,
"step": 400,
"tokens_trained": 0.196604136
},
{
"epoch": 0.1140344656407347,
"grad_norm": 148.53176879882812,
"loss": 5.702,
"lr": 0.0008020000000000001,
"step": 402,
"tokens_trained": 0.197586784
},
{
"epoch": 0.1146018012906886,
"grad_norm": 144.40835571289062,
"loss": 6.2402,
"lr": 0.0008060000000000001,
"step": 404,
"tokens_trained": 0.198570824
},
{
"epoch": 0.11516913694064251,
"grad_norm": 306.57562255859375,
"loss": 7.1739,
"lr": 0.0008100000000000001,
"step": 406,
"tokens_trained": 0.199548328
},
{
"epoch": 0.11573647259059641,
"grad_norm": 308.79180908203125,
"loss": 6.0972,
"lr": 0.0008139999999999999,
"step": 408,
"tokens_trained": 0.200532496
},
{
"epoch": 0.11630380824055031,
"grad_norm": 197.76791381835938,
"loss": 6.3533,
"lr": 0.0008179999999999999,
"step": 410,
"tokens_trained": 0.201514648
},
{
"epoch": 0.11687114389050422,
"grad_norm": 129.5694580078125,
"loss": 6.9628,
"lr": 0.0008219999999999999,
"step": 412,
"tokens_trained": 0.2024994
},
{
"epoch": 0.11743847954045812,
"grad_norm": 446.0195617675781,
"loss": 11.7562,
"lr": 0.000826,
"step": 414,
"tokens_trained": 0.20348012
},
{
"epoch": 0.11800581519041203,
"grad_norm": 355.5342712402344,
"loss": 8.8055,
"lr": 0.00083,
"step": 416,
"tokens_trained": 0.20446356
},
{
"epoch": 0.11857315084036593,
"grad_norm": 456.2491149902344,
"loss": 9.606,
"lr": 0.000834,
"step": 418,
"tokens_trained": 0.205445288
},
{
"epoch": 0.11914048649031983,
"grad_norm": 369.8676452636719,
"loss": 8.385,
"lr": 0.000838,
"step": 420,
"tokens_trained": 0.206427832
},
{
"epoch": 0.11970782214027374,
"grad_norm": 262.19073486328125,
"loss": 9.0956,
"lr": 0.000842,
"step": 422,
"tokens_trained": 0.207409848
},
{
"epoch": 0.12027515779022764,
"grad_norm": 120.3193130493164,
"loss": 5.4937,
"lr": 0.000846,
"step": 424,
"tokens_trained": 0.208391752
},
{
"epoch": 0.12084249344018155,
"grad_norm": 222.1111297607422,
"loss": 8.9367,
"lr": 0.00085,
"step": 426,
"tokens_trained": 0.20937384
},
{
"epoch": 0.12140982909013545,
"grad_norm": 137.16819763183594,
"loss": 7.5876,
"lr": 0.000854,
"step": 428,
"tokens_trained": 0.210358576
},
{
"epoch": 0.12197716474008935,
"grad_norm": 267.61846923828125,
"loss": 8.817,
"lr": 0.000858,
"step": 430,
"tokens_trained": 0.211340064
},
{
"epoch": 0.12254450039004326,
"grad_norm": 472.72906494140625,
"loss": 8.203,
"lr": 0.000862,
"step": 432,
"tokens_trained": 0.212321144
},
{
"epoch": 0.12311183603999716,
"grad_norm": 297.1420593261719,
"loss": 10.987,
"lr": 0.000866,
"step": 434,
"tokens_trained": 0.213300312
},
{
"epoch": 0.12367917168995107,
"grad_norm": 281.7297668457031,
"loss": 7.6117,
"lr": 0.00087,
"step": 436,
"tokens_trained": 0.214287624
},
{
"epoch": 0.12424650733990497,
"grad_norm": 203.09678649902344,
"loss": 6.5638,
"lr": 0.000874,
"step": 438,
"tokens_trained": 0.215272136
},
{
"epoch": 0.12481384298985887,
"grad_norm": 155.7823944091797,
"loss": 6.1131,
"lr": 0.000878,
"step": 440,
"tokens_trained": 0.216256392
},
{
"epoch": 0.12538117863981277,
"grad_norm": 189.86196899414062,
"loss": 8.2565,
"lr": 0.000882,
"step": 442,
"tokens_trained": 0.217242504
},
{
"epoch": 0.1259485142897667,
"grad_norm": 247.4568634033203,
"loss": 7.1005,
"lr": 0.0008860000000000001,
"step": 444,
"tokens_trained": 0.218226008
},
{
"epoch": 0.1265158499397206,
"grad_norm": 179.72825622558594,
"loss": 6.3379,
"lr": 0.0008900000000000001,
"step": 446,
"tokens_trained": 0.219210584
},
{
"epoch": 0.1270831855896745,
"grad_norm": 212.96356201171875,
"loss": 7.2514,
"lr": 0.000894,
"step": 448,
"tokens_trained": 0.220193952
},
{
"epoch": 0.1276505212396284,
"grad_norm": 105.67095947265625,
"loss": 5.456,
"lr": 0.000898,
"step": 450,
"tokens_trained": 0.221176936
},
{
"epoch": 0.1282178568895823,
"grad_norm": 302.9122619628906,
"loss": 6.4018,
"lr": 0.000902,
"step": 452,
"tokens_trained": 0.222161952
},
{
"epoch": 0.12878519253953621,
"grad_norm": 215.66561889648438,
"loss": 6.2853,
"lr": 0.000906,
"step": 454,
"tokens_trained": 0.223144912
},
{
"epoch": 0.1293525281894901,
"grad_norm": 272.9984130859375,
"loss": 7.3902,
"lr": 0.00091,
"step": 456,
"tokens_trained": 0.224127392
},
{
"epoch": 0.129919863839444,
"grad_norm": 200.7503662109375,
"loss": 6.1637,
"lr": 0.0009140000000000001,
"step": 458,
"tokens_trained": 0.22511648
},
{
"epoch": 0.1304871994893979,
"grad_norm": 93.23990631103516,
"loss": 6.4867,
"lr": 0.0009180000000000001,
"step": 460,
"tokens_trained": 0.226098144
},
{
"epoch": 0.1310545351393518,
"grad_norm": 274.37164306640625,
"loss": 8.99,
"lr": 0.0009220000000000001,
"step": 462,
"tokens_trained": 0.227081848
},
{
"epoch": 0.13162187078930573,
"grad_norm": 186.66322326660156,
"loss": 8.7122,
"lr": 0.0009260000000000001,
"step": 464,
"tokens_trained": 0.22806636
},
{
"epoch": 0.13218920643925963,
"grad_norm": 586.1035766601562,
"loss": 9.1045,
"lr": 0.00093,
"step": 466,
"tokens_trained": 0.229047872
},
{
"epoch": 0.13275654208921353,
"grad_norm": 227.55996704101562,
"loss": 9.7276,
"lr": 0.000934,
"step": 468,
"tokens_trained": 0.230031144
},
{
"epoch": 0.13332387773916743,
"grad_norm": 229.26609802246094,
"loss": 6.6244,
"lr": 0.0009379999999999999,
"step": 470,
"tokens_trained": 0.2310158
},
{
"epoch": 0.13389121338912133,
"grad_norm": 145.16331481933594,
"loss": 5.759,
"lr": 0.000942,
"step": 472,
"tokens_trained": 0.2319996
},
{
"epoch": 0.13445854903907525,
"grad_norm": 109.9937744140625,
"loss": 5.4838,
"lr": 0.000946,
"step": 474,
"tokens_trained": 0.232983808
},
{
"epoch": 0.13502588468902915,
"grad_norm": 135.74899291992188,
"loss": 6.2738,
"lr": 0.00095,
"step": 476,
"tokens_trained": 0.233963016
},
{
"epoch": 0.13559322033898305,
"grad_norm": 142.99449157714844,
"loss": 5.8459,
"lr": 0.000954,
"step": 478,
"tokens_trained": 0.234948864
},
{
"epoch": 0.13616055598893695,
"grad_norm": 198.66883850097656,
"loss": 6.6626,
"lr": 0.000958,
"step": 480,
"tokens_trained": 0.235932392
},
{
"epoch": 0.13672789163889085,
"grad_norm": 260.76507568359375,
"loss": 6.9299,
"lr": 0.000962,
"step": 482,
"tokens_trained": 0.236915664
},
{
"epoch": 0.13729522728884477,
"grad_norm": 267.97589111328125,
"loss": 6.4343,
"lr": 0.000966,
"step": 484,
"tokens_trained": 0.237896904
},
{
"epoch": 0.13786256293879867,
"grad_norm": 89.8781967163086,
"loss": 6.3203,
"lr": 0.0009699999999999999,
"step": 486,
"tokens_trained": 0.238874528
},
{
"epoch": 0.13842989858875257,
"grad_norm": 225.62985229492188,
"loss": 6.2778,
"lr": 0.000974,
"step": 488,
"tokens_trained": 0.2398588
},
{
"epoch": 0.13899723423870647,
"grad_norm": 85.84110260009766,
"loss": 5.2786,
"lr": 0.000978,
"step": 490,
"tokens_trained": 0.240839968
},
{
"epoch": 0.13956456988866037,
"grad_norm": 141.4368438720703,
"loss": 5.5525,
"lr": 0.000982,
"step": 492,
"tokens_trained": 0.241823544
},
{
"epoch": 0.1401319055386143,
"grad_norm": 94.9535140991211,
"loss": 5.4386,
"lr": 0.0009860000000000001,
"step": 494,
"tokens_trained": 0.242805456
},
{
"epoch": 0.1406992411885682,
"grad_norm": 157.4557647705078,
"loss": 5.9786,
"lr": 0.00099,
"step": 496,
"tokens_trained": 0.243792496
},
{
"epoch": 0.1412665768385221,
"grad_norm": 319.5025634765625,
"loss": 7.04,
"lr": 0.000994,
"step": 498,
"tokens_trained": 0.244772472
},
{
"epoch": 0.141833912488476,
"grad_norm": 282.26824951171875,
"loss": 9.4037,
"lr": 0.000998,
"step": 500,
"tokens_trained": 0.245758968
},
{
"epoch": 0.141833912488476,
"eval_loss": 2.152184247970581,
"eval_runtime": 21.2772,
"step": 500,
"tokens_trained": 0.245758968
},
{
"epoch": 0.1424012481384299,
"grad_norm": 306.0666809082031,
"loss": 7.8845,
"lr": 0.00099986013986014,
"step": 502,
"tokens_trained": 0.246739024
},
{
"epoch": 0.1429685837883838,
"grad_norm": 188.89024353027344,
"loss": 6.8118,
"lr": 0.0009995804195804196,
"step": 504,
"tokens_trained": 0.247726552
},
{
"epoch": 0.1435359194383377,
"grad_norm": 228.97474670410156,
"loss": 6.8475,
"lr": 0.0009993006993006994,
"step": 506,
"tokens_trained": 0.24870688
},
{
"epoch": 0.1441032550882916,
"grad_norm": 229.80029296875,
"loss": 6.2171,
"lr": 0.000999020979020979,
"step": 508,
"tokens_trained": 0.249689096
},
{
"epoch": 0.1446705907382455,
"grad_norm": 157.30340576171875,
"loss": 6.2281,
"lr": 0.0009987412587412587,
"step": 510,
"tokens_trained": 0.250671768
},
{
"epoch": 0.1452379263881994,
"grad_norm": 176.64683532714844,
"loss": 6.5993,
"lr": 0.0009984615384615386,
"step": 512,
"tokens_trained": 0.25165608
},
{
"epoch": 0.14580526203815333,
"grad_norm": 197.20526123046875,
"loss": 5.7267,
"lr": 0.0009981818181818182,
"step": 514,
"tokens_trained": 0.252639712
},
{
"epoch": 0.14637259768810723,
"grad_norm": 54.713260650634766,
"loss": 5.7911,
"lr": 0.000997902097902098,
"step": 516,
"tokens_trained": 0.253622816
},
{
"epoch": 0.14693993333806113,
"grad_norm": 185.74923706054688,
"loss": 7.0055,
"lr": 0.0009976223776223777,
"step": 518,
"tokens_trained": 0.254602792
},
{
"epoch": 0.14750726898801503,
"grad_norm": 240.31021118164062,
"loss": 6.452,
"lr": 0.0009973426573426573,
"step": 520,
"tokens_trained": 0.255584736
},
{
"epoch": 0.14807460463796893,
"grad_norm": 160.2477264404297,
"loss": 7.6556,
"lr": 0.000997062937062937,
"step": 522,
"tokens_trained": 0.256563792
},
{
"epoch": 0.14864194028792285,
"grad_norm": 283.0034484863281,
"loss": 6.5345,
"lr": 0.0009967832167832168,
"step": 524,
"tokens_trained": 0.257546656
},
{
"epoch": 0.14920927593787675,
"grad_norm": 245.537109375,
"loss": 6.3281,
"lr": 0.0009965034965034964,
"step": 526,
"tokens_trained": 0.258530832
},
{
"epoch": 0.14977661158783065,
"grad_norm": 162.1538848876953,
"loss": 7.4072,
"lr": 0.0009962237762237763,
"step": 528,
"tokens_trained": 0.259514528
},
{
"epoch": 0.15034394723778455,
"grad_norm": 107.25792694091797,
"loss": 5.356,
"lr": 0.000995944055944056,
"step": 530,
"tokens_trained": 0.260500912
},
{
"epoch": 0.15091128288773845,
"grad_norm": 173.73353576660156,
"loss": 6.8625,
"lr": 0.0009956643356643356,
"step": 532,
"tokens_trained": 0.26148632
},
{
"epoch": 0.15147861853769237,
"grad_norm": 178.33541870117188,
"loss": 5.8794,
"lr": 0.0009953846153846154,
"step": 534,
"tokens_trained": 0.262468816
},
{
"epoch": 0.15204595418764627,
"grad_norm": 181.2533416748047,
"loss": 7.0243,
"lr": 0.000995104895104895,
"step": 536,
"tokens_trained": 0.263446696
},
{
"epoch": 0.15261328983760017,
"grad_norm": 208.79293823242188,
"loss": 5.8908,
"lr": 0.000994825174825175,
"step": 538,
"tokens_trained": 0.26443108
},
{
"epoch": 0.15318062548755407,
"grad_norm": 148.66285705566406,
"loss": 6.0831,
"lr": 0.0009945454545454546,
"step": 540,
"tokens_trained": 0.265414496
},
{
"epoch": 0.15374796113750797,
"grad_norm": 165.044189453125,
"loss": 5.5594,
"lr": 0.0009942657342657344,
"step": 542,
"tokens_trained": 0.266394128
},
{
"epoch": 0.1543152967874619,
"grad_norm": 124.5405502319336,
"loss": 5.2442,
"lr": 0.000993986013986014,
"step": 544,
"tokens_trained": 0.267378768
},
{
"epoch": 0.1548826324374158,
"grad_norm": 68.66510772705078,
"loss": 5.1173,
"lr": 0.0009937062937062937,
"step": 546,
"tokens_trained": 0.268360184
},
{
"epoch": 0.1554499680873697,
"grad_norm": 57.052860260009766,
"loss": 5.2348,
"lr": 0.0009934265734265735,
"step": 548,
"tokens_trained": 0.269345672
},
{
"epoch": 0.1560173037373236,
"grad_norm": 184.9175567626953,
"loss": 6.7748,
"lr": 0.0009931468531468532,
"step": 550,
"tokens_trained": 0.2703288
},
{
"epoch": 0.15658463938727749,
"grad_norm": 72.9861831665039,
"loss": 5.7387,
"lr": 0.000992867132867133,
"step": 552,
"tokens_trained": 0.271309176
},
{
"epoch": 0.1571519750372314,
"grad_norm": 135.864501953125,
"loss": 6.3035,
"lr": 0.0009925874125874127,
"step": 554,
"tokens_trained": 0.27229644
},
{
"epoch": 0.1577193106871853,
"grad_norm": 130.579833984375,
"loss": 5.4434,
"lr": 0.0009923076923076923,
"step": 556,
"tokens_trained": 0.273277904
},
{
"epoch": 0.1582866463371392,
"grad_norm": 206.77345275878906,
"loss": 5.8649,
"lr": 0.000992027972027972,
"step": 558,
"tokens_trained": 0.274261712
},
{
"epoch": 0.1588539819870931,
"grad_norm": 144.0505828857422,
"loss": 5.3459,
"lr": 0.0009917482517482518,
"step": 560,
"tokens_trained": 0.2752468
},
{
"epoch": 0.159421317637047,
"grad_norm": 87.56634521484375,
"loss": 5.6321,
"lr": 0.0009914685314685314,
"step": 562,
"tokens_trained": 0.276232384
},
{
"epoch": 0.15998865328700093,
"grad_norm": 275.2727355957031,
"loss": 6.7515,
"lr": 0.0009911888111888113,
"step": 564,
"tokens_trained": 0.277211608
},
{
"epoch": 0.16055598893695483,
"grad_norm": 97.00019836425781,
"loss": 5.4374,
"lr": 0.000990909090909091,
"step": 566,
"tokens_trained": 0.278196336
},
{
"epoch": 0.16112332458690873,
"grad_norm": 102.91439056396484,
"loss": 5.729,
"lr": 0.0009906293706293705,
"step": 568,
"tokens_trained": 0.279175672
},
{
"epoch": 0.16169066023686263,
"grad_norm": 151.12432861328125,
"loss": 5.4189,
"lr": 0.0009903496503496504,
"step": 570,
"tokens_trained": 0.280161088
},
{
"epoch": 0.16225799588681653,
"grad_norm": 86.6823959350586,
"loss": 5.1704,
"lr": 0.00099006993006993,
"step": 572,
"tokens_trained": 0.28114256
},
{
"epoch": 0.16282533153677045,
"grad_norm": 90.7052230834961,
"loss": 5.3673,
"lr": 0.0009897902097902099,
"step": 574,
"tokens_trained": 0.282128904
},
{
"epoch": 0.16339266718672435,
"grad_norm": 146.92874145507812,
"loss": 5.5971,
"lr": 0.0009895104895104895,
"step": 576,
"tokens_trained": 0.28311528
},
{
"epoch": 0.16396000283667825,
"grad_norm": 189.76296997070312,
"loss": 5.3109,
"lr": 0.0009892307692307694,
"step": 578,
"tokens_trained": 0.284098528
},
{
"epoch": 0.16452733848663215,
"grad_norm": 174.48092651367188,
"loss": 5.68,
"lr": 0.000988951048951049,
"step": 580,
"tokens_trained": 0.285081064
},
{
"epoch": 0.16509467413658604,
"grad_norm": 154.10816955566406,
"loss": 5.3307,
"lr": 0.0009886713286713286,
"step": 582,
"tokens_trained": 0.286067952
},
{
"epoch": 0.16566200978653997,
"grad_norm": 64.28263092041016,
"loss": 5.1676,
"lr": 0.0009883916083916085,
"step": 584,
"tokens_trained": 0.287051384
},
{
"epoch": 0.16622934543649387,
"grad_norm": 103.81795501708984,
"loss": 5.3436,
"lr": 0.0009881118881118881,
"step": 586,
"tokens_trained": 0.28803284
},
{
"epoch": 0.16679668108644777,
"grad_norm": 144.0076904296875,
"loss": 5.3033,
"lr": 0.000987832167832168,
"step": 588,
"tokens_trained": 0.289014824
},
{
"epoch": 0.16736401673640167,
"grad_norm": 88.31237030029297,
"loss": 5.0609,
"lr": 0.0009875524475524476,
"step": 590,
"tokens_trained": 0.289999864
},
{
"epoch": 0.16793135238635556,
"grad_norm": 68.4583740234375,
"loss": 5.0702,
"lr": 0.0009872727272727273,
"step": 592,
"tokens_trained": 0.290983888
},
{
"epoch": 0.1684986880363095,
"grad_norm": 135.28665161132812,
"loss": 5.3962,
"lr": 0.000986993006993007,
"step": 594,
"tokens_trained": 0.291965752
},
{
"epoch": 0.1690660236862634,
"grad_norm": 80.0412368774414,
"loss": 5.0246,
"lr": 0.0009867132867132867,
"step": 596,
"tokens_trained": 0.292946952
},
{
"epoch": 0.1696333593362173,
"grad_norm": 43.29194641113281,
"loss": 5.0051,
"lr": 0.0009864335664335664,
"step": 598,
"tokens_trained": 0.293928976
},
{
"epoch": 0.17020069498617119,
"grad_norm": 220.88687133789062,
"loss": 6.0798,
"lr": 0.0009861538461538462,
"step": 600,
"tokens_trained": 0.294912408
},
{
"epoch": 0.17076803063612508,
"grad_norm": 102.58654022216797,
"loss": 5.1271,
"lr": 0.0009858741258741259,
"step": 602,
"tokens_trained": 0.29589416
},
{
"epoch": 0.171335366286079,
"grad_norm": 119.0067138671875,
"loss": 5.7402,
"lr": 0.0009855944055944055,
"step": 604,
"tokens_trained": 0.296878584
},
{
"epoch": 0.1719027019360329,
"grad_norm": 138.8656005859375,
"loss": 5.1951,
"lr": 0.0009853146853146854,
"step": 606,
"tokens_trained": 0.297864552
},
{
"epoch": 0.1724700375859868,
"grad_norm": 73.5890884399414,
"loss": 5.2522,
"lr": 0.000985034965034965,
"step": 608,
"tokens_trained": 0.298854088
},
{
"epoch": 0.1730373732359407,
"grad_norm": 113.78330993652344,
"loss": 5.6683,
"lr": 0.0009847552447552449,
"step": 610,
"tokens_trained": 0.299835024
},
{
"epoch": 0.1736047088858946,
"grad_norm": 125.20297241210938,
"loss": 5.1812,
"lr": 0.0009844755244755245,
"step": 612,
"tokens_trained": 0.30082032
},
{
"epoch": 0.17417204453584853,
"grad_norm": 67.46041870117188,
"loss": 5.0417,
"lr": 0.0009841958041958043,
"step": 614,
"tokens_trained": 0.301808456
},
{
"epoch": 0.17473938018580243,
"grad_norm": 117.30754852294922,
"loss": 5.3064,
"lr": 0.000983916083916084,
"step": 616,
"tokens_trained": 0.302794456
},
{
"epoch": 0.17530671583575633,
"grad_norm": 124.30754089355469,
"loss": 5.1614,
"lr": 0.0009836363636363636,
"step": 618,
"tokens_trained": 0.303777376
},
{
"epoch": 0.17587405148571023,
"grad_norm": 102.72042083740234,
"loss": 5.1265,
"lr": 0.0009833566433566435,
"step": 620,
"tokens_trained": 0.304758864
},
{
"epoch": 0.17644138713566412,
"grad_norm": 39.332252502441406,
"loss": 5.1078,
"lr": 0.000983076923076923,
"step": 622,
"tokens_trained": 0.30574392
},
{
"epoch": 0.17700872278561805,
"grad_norm": 153.84811401367188,
"loss": 5.7696,
"lr": 0.000982797202797203,
"step": 624,
"tokens_trained": 0.306727584
},
{
"epoch": 0.17729239061059499,
"eval_loss": 1.3463915586471558,
"eval_runtime": 20.8357,
"step": 625,
"tokens_trained": 0.307220496
},
{
"epoch": 0.17757605843557195,
"grad_norm": 160.2552490234375,
"loss": 5.2283,
"lr": 0.0009825174825174826,
"step": 626,
"tokens_trained": 0.307713024
},
{
"epoch": 0.17814339408552585,
"grad_norm": 186.77407836914062,
"loss": 5.2866,
"lr": 0.0009822377622377622,
"step": 628,
"tokens_trained": 0.308700128
},
{
"epoch": 0.17871072973547975,
"grad_norm": 84.55519104003906,
"loss": 5.1106,
"lr": 0.0009819580419580419,
"step": 630,
"tokens_trained": 0.309681208
},
{
"epoch": 0.17927806538543364,
"grad_norm": 20.617040634155273,
"loss": 4.8327,
"lr": 0.0009816783216783217,
"step": 632,
"tokens_trained": 0.310662224
},
{
"epoch": 0.17984540103538757,
"grad_norm": 168.06039428710938,
"loss": 6.0704,
"lr": 0.0009813986013986014,
"step": 634,
"tokens_trained": 0.31164064
},
{
"epoch": 0.18041273668534147,
"grad_norm": 238.23736572265625,
"loss": 5.6188,
"lr": 0.0009811188811188812,
"step": 636,
"tokens_trained": 0.312622568
},
{
"epoch": 0.18098007233529537,
"grad_norm": 140.0707550048828,
"loss": 6.4034,
"lr": 0.0009808391608391608,
"step": 638,
"tokens_trained": 0.313604944
},
{
"epoch": 0.18154740798524927,
"grad_norm": 161.19302368164062,
"loss": 5.4906,
"lr": 0.0009805594405594405,
"step": 640,
"tokens_trained": 0.314592072
},
{
"epoch": 0.18211474363520316,
"grad_norm": 121.9577407836914,
"loss": 5.2097,
"lr": 0.0009802797202797203,
"step": 642,
"tokens_trained": 0.315574392
},
{
"epoch": 0.1826820792851571,
"grad_norm": 121.25574493408203,
"loss": 5.0317,
"lr": 0.00098,
"step": 644,
"tokens_trained": 0.316559008
},
{
"epoch": 0.183249414935111,
"grad_norm": 28.328269958496094,
"loss": 4.932,
"lr": 0.0009797202797202798,
"step": 646,
"tokens_trained": 0.317538776
},
{
"epoch": 0.1838167505850649,
"grad_norm": 127.77408599853516,
"loss": 5.8335,
"lr": 0.0009794405594405595,
"step": 648,
"tokens_trained": 0.31851792
},
{
"epoch": 0.18438408623501878,
"grad_norm": 94.9522933959961,
"loss": 5.1948,
"lr": 0.000979160839160839,
"step": 650,
"tokens_trained": 0.319501576
},
{
"epoch": 0.18495142188497268,
"grad_norm": 110.33658599853516,
"loss": 5.098,
"lr": 0.000978881118881119,
"step": 652,
"tokens_trained": 0.320482392
},
{
"epoch": 0.1855187575349266,
"grad_norm": 67.23124694824219,
"loss": 4.7723,
"lr": 0.0009786013986013986,
"step": 654,
"tokens_trained": 0.32146712
},
{
"epoch": 0.1860860931848805,
"grad_norm": 61.519866943359375,
"loss": 4.7245,
"lr": 0.0009783216783216782,
"step": 656,
"tokens_trained": 0.322449576
},
{
"epoch": 0.1866534288348344,
"grad_norm": 99.51078033447266,
"loss": 4.783,
"lr": 0.000978041958041958,
"step": 658,
"tokens_trained": 0.323432688
},
{
"epoch": 0.1872207644847883,
"grad_norm": 44.619197845458984,
"loss": 4.7495,
"lr": 0.000977762237762238,
"step": 660,
"tokens_trained": 0.324413952
},
{
"epoch": 0.18778810013474223,
"grad_norm": 114.5891342163086,
"loss": 5.1261,
"lr": 0.0009774825174825176,
"step": 662,
"tokens_trained": 0.325394536
},
{
"epoch": 0.18835543578469613,
"grad_norm": 100.3728256225586,
"loss": 4.7883,
"lr": 0.0009772027972027972,
"step": 664,
"tokens_trained": 0.326374672
},
{
"epoch": 0.18892277143465003,
"grad_norm": 51.883033752441406,
"loss": 4.7249,
"lr": 0.0009769230769230768,
"step": 666,
"tokens_trained": 0.327357152
},
{
"epoch": 0.18949010708460393,
"grad_norm": 82.27507019042969,
"loss": 4.8277,
"lr": 0.0009766433566433567,
"step": 668,
"tokens_trained": 0.328342088
},
{
"epoch": 0.19005744273455782,
"grad_norm": 83.53064727783203,
"loss": 4.8338,
"lr": 0.0009763636363636363,
"step": 670,
"tokens_trained": 0.329319248
},
{
"epoch": 0.19062477838451175,
"grad_norm": 76.18387603759766,
"loss": 4.6958,
"lr": 0.0009760839160839161,
"step": 672,
"tokens_trained": 0.330305968
},
{
"epoch": 0.19119211403446565,
"grad_norm": 27.401426315307617,
"loss": 4.6929,
"lr": 0.0009758041958041958,
"step": 674,
"tokens_trained": 0.3312912
},
{
"epoch": 0.19175944968441955,
"grad_norm": 186.770263671875,
"loss": 5.5089,
"lr": 0.0009755244755244756,
"step": 676,
"tokens_trained": 0.332275224
},
{
"epoch": 0.19232678533437345,
"grad_norm": 105.02385711669922,
"loss": 4.8876,
"lr": 0.0009752447552447553,
"step": 678,
"tokens_trained": 0.33325588
},
{
"epoch": 0.19289412098432734,
"grad_norm": 94.96269989013672,
"loss": 5.1235,
"lr": 0.0009749650349650349,
"step": 680,
"tokens_trained": 0.334238408
},
{
"epoch": 0.19346145663428127,
"grad_norm": 92.29356384277344,
"loss": 4.8194,
"lr": 0.0009746853146853148,
"step": 682,
"tokens_trained": 0.335219368
},
{
"epoch": 0.19402879228423517,
"grad_norm": 59.1584358215332,
"loss": 4.7511,
"lr": 0.0009744055944055944,
"step": 684,
"tokens_trained": 0.336207136
},
{
"epoch": 0.19459612793418907,
"grad_norm": 54.759002685546875,
"loss": 4.777,
"lr": 0.0009741258741258742,
"step": 686,
"tokens_trained": 0.337193536
},
{
"epoch": 0.19516346358414297,
"grad_norm": 92.20452880859375,
"loss": 4.8225,
"lr": 0.0009738461538461538,
"step": 688,
"tokens_trained": 0.338179224
},
{
"epoch": 0.19573079923409686,
"grad_norm": 75.97005462646484,
"loss": 4.655,
"lr": 0.0009735664335664336,
"step": 690,
"tokens_trained": 0.339162168
},
{
"epoch": 0.1962981348840508,
"grad_norm": 58.19076919555664,
"loss": 4.6446,
"lr": 0.0009732867132867133,
"step": 692,
"tokens_trained": 0.340138904
},
{
"epoch": 0.1968654705340047,
"grad_norm": 50.81512451171875,
"loss": 4.5866,
"lr": 0.000973006993006993,
"step": 694,
"tokens_trained": 0.34112288
},
{
"epoch": 0.1974328061839586,
"grad_norm": 61.683372497558594,
"loss": 4.6018,
"lr": 0.0009727272727272728,
"step": 696,
"tokens_trained": 0.342111992
},
{
"epoch": 0.19800014183391249,
"grad_norm": 61.01798629760742,
"loss": 4.6007,
"lr": 0.0009724475524475524,
"step": 698,
"tokens_trained": 0.343095912
},
{
"epoch": 0.19856747748386638,
"grad_norm": 96.49671936035156,
"loss": 4.7035,
"lr": 0.0009721678321678323,
"step": 700,
"tokens_trained": 0.344078632
},
{
"epoch": 0.1991348131338203,
"grad_norm": 64.7771224975586,
"loss": 4.8341,
"lr": 0.0009718881118881119,
"step": 702,
"tokens_trained": 0.345060576
},
{
"epoch": 0.1997021487837742,
"grad_norm": 90.1478042602539,
"loss": 4.7739,
"lr": 0.0009716083916083917,
"step": 704,
"tokens_trained": 0.34604112
},
{
"epoch": 0.2002694844337281,
"grad_norm": 67.6308822631836,
"loss": 4.6218,
"lr": 0.0009713286713286713,
"step": 706,
"tokens_trained": 0.347023496
},
{
"epoch": 0.200836820083682,
"grad_norm": 40.50175094604492,
"loss": 4.6008,
"lr": 0.000971048951048951,
"step": 708,
"tokens_trained": 0.348005416
},
{
"epoch": 0.2014041557336359,
"grad_norm": 33.6448860168457,
"loss": 4.5307,
"lr": 0.0009707692307692308,
"step": 710,
"tokens_trained": 0.3489886
},
{
"epoch": 0.20197149138358983,
"grad_norm": 15.484851837158203,
"loss": 4.5065,
"lr": 0.0009704895104895105,
"step": 712,
"tokens_trained": 0.34997024
},
{
"epoch": 0.20253882703354373,
"grad_norm": 109.26301574707031,
"loss": 4.9613,
"lr": 0.0009702097902097903,
"step": 714,
"tokens_trained": 0.350958496
},
{
"epoch": 0.20310616268349763,
"grad_norm": 150.07492065429688,
"loss": 4.8507,
"lr": 0.0009699300699300699,
"step": 716,
"tokens_trained": 0.35193892
},
{
"epoch": 0.20367349833345152,
"grad_norm": 113.43978881835938,
"loss": 5.4494,
"lr": 0.0009696503496503498,
"step": 718,
"tokens_trained": 0.35291908
},
{
"epoch": 0.20424083398340542,
"grad_norm": 123.0071792602539,
"loss": 4.9475,
"lr": 0.0009693706293706294,
"step": 720,
"tokens_trained": 0.353896072
},
{
"epoch": 0.20480816963335935,
"grad_norm": 65.55500793457031,
"loss": 4.7585,
"lr": 0.0009690909090909091,
"step": 722,
"tokens_trained": 0.354878992
},
{
"epoch": 0.20537550528331325,
"grad_norm": 36.11159896850586,
"loss": 4.6323,
"lr": 0.0009688111888111888,
"step": 724,
"tokens_trained": 0.355863728
},
{
"epoch": 0.20594284093326715,
"grad_norm": 30.566436767578125,
"loss": 4.53,
"lr": 0.0009685314685314685,
"step": 726,
"tokens_trained": 0.356845272
},
{
"epoch": 0.20651017658322104,
"grad_norm": 59.01853561401367,
"loss": 4.5283,
"lr": 0.0009682517482517483,
"step": 728,
"tokens_trained": 0.357826656
},
{
"epoch": 0.20707751223317494,
"grad_norm": 91.78115844726562,
"loss": 4.6149,
"lr": 0.000967972027972028,
"step": 730,
"tokens_trained": 0.358809896
},
{
"epoch": 0.20764484788312887,
"grad_norm": 67.97398376464844,
"loss": 4.617,
"lr": 0.0009676923076923078,
"step": 732,
"tokens_trained": 0.359788736
},
{
"epoch": 0.20821218353308277,
"grad_norm": 42.82001876831055,
"loss": 4.6134,
"lr": 0.0009674125874125874,
"step": 734,
"tokens_trained": 0.360771744
},
{
"epoch": 0.20877951918303667,
"grad_norm": 63.52122116088867,
"loss": 4.6995,
"lr": 0.0009671328671328672,
"step": 736,
"tokens_trained": 0.361757656
},
{
"epoch": 0.20934685483299056,
"grad_norm": 116.39544677734375,
"loss": 4.7153,
"lr": 0.0009668531468531469,
"step": 738,
"tokens_trained": 0.362744008
},
{
"epoch": 0.20991419048294446,
"grad_norm": 40.74269485473633,
"loss": 4.7978,
"lr": 0.0009665734265734266,
"step": 740,
"tokens_trained": 0.36372872
},
{
"epoch": 0.2104815261328984,
"grad_norm": 114.29917907714844,
"loss": 5.1683,
"lr": 0.0009662937062937063,
"step": 742,
"tokens_trained": 0.364710536
},
{
"epoch": 0.2110488617828523,
"grad_norm": 115.83326721191406,
"loss": 4.7642,
"lr": 0.000966013986013986,
"step": 744,
"tokens_trained": 0.3656912
},
{
"epoch": 0.21161619743280619,
"grad_norm": 21.708093643188477,
"loss": 4.8244,
"lr": 0.0009657342657342657,
"step": 746,
"tokens_trained": 0.36667388
},
{
"epoch": 0.21218353308276008,
"grad_norm": 182.01918029785156,
"loss": 5.6045,
"lr": 0.0009654545454545455,
"step": 748,
"tokens_trained": 0.3676634
},
{
"epoch": 0.21275086873271398,
"grad_norm": 47.119319915771484,
"loss": 4.7929,
"lr": 0.0009651748251748252,
"step": 750,
"tokens_trained": 0.368647288
},
{
"epoch": 0.21275086873271398,
"eval_loss": 1.2186306715011597,
"eval_runtime": 20.9362,
"step": 750,
"tokens_trained": 0.368647288
},
{
"epoch": 0.2133182043826679,
"grad_norm": 51.43566131591797,
"loss": 4.7298,
"lr": 0.0009648951048951049,
"step": 752,
"tokens_trained": 0.36962992
},
{
"epoch": 0.2138855400326218,
"grad_norm": 79.49323272705078,
"loss": 5.0749,
"lr": 0.0009646153846153846,
"step": 754,
"tokens_trained": 0.370616064
},
{
"epoch": 0.2144528756825757,
"grad_norm": 119.80200958251953,
"loss": 4.8198,
"lr": 0.0009643356643356644,
"step": 756,
"tokens_trained": 0.371596208
},
{
"epoch": 0.2150202113325296,
"grad_norm": 95.88092041015625,
"loss": 4.7437,
"lr": 0.0009640559440559441,
"step": 758,
"tokens_trained": 0.372579584
},
{
"epoch": 0.2155875469824835,
"grad_norm": 79.64202117919922,
"loss": 4.9181,
"lr": 0.0009637762237762237,
"step": 760,
"tokens_trained": 0.373563056
},
{
"epoch": 0.21615488263243743,
"grad_norm": 79.93920135498047,
"loss": 4.6393,
"lr": 0.0009634965034965035,
"step": 762,
"tokens_trained": 0.374547648
},
{
"epoch": 0.21672221828239133,
"grad_norm": 78.67620849609375,
"loss": 4.6178,
"lr": 0.0009632167832167832,
"step": 764,
"tokens_trained": 0.375531456
},
{
"epoch": 0.21728955393234523,
"grad_norm": 56.32818603515625,
"loss": 4.6498,
"lr": 0.000962937062937063,
"step": 766,
"tokens_trained": 0.376516896
},
{
"epoch": 0.21785688958229912,
"grad_norm": 45.35737228393555,
"loss": 4.5812,
"lr": 0.0009626573426573427,
"step": 768,
"tokens_trained": 0.377499752
},
{
"epoch": 0.21842422523225302,
"grad_norm": 58.13076400756836,
"loss": 4.5793,
"lr": 0.0009623776223776224,
"step": 770,
"tokens_trained": 0.37848276
},
{
"epoch": 0.21899156088220695,
"grad_norm": 55.620628356933594,
"loss": 4.4865,
"lr": 0.0009620979020979021,
"step": 772,
"tokens_trained": 0.379466296
},
{
"epoch": 0.21955889653216085,
"grad_norm": 77.26813507080078,
"loss": 4.5671,
"lr": 0.0009618181818181818,
"step": 774,
"tokens_trained": 0.380449888
},
{
"epoch": 0.22012623218211474,
"grad_norm": 45.00653839111328,
"loss": 4.5923,
"lr": 0.0009615384615384616,
"step": 776,
"tokens_trained": 0.381430352
},
{
"epoch": 0.22069356783206864,
"grad_norm": 52.77407455444336,
"loss": 4.5094,
"lr": 0.0009612587412587412,
"step": 778,
"tokens_trained": 0.382416152
},
{
"epoch": 0.22126090348202254,
"grad_norm": 36.721073150634766,
"loss": 4.4536,
"lr": 0.000960979020979021,
"step": 780,
"tokens_trained": 0.383396672
},
{
"epoch": 0.22182823913197647,
"grad_norm": 51.21247100830078,
"loss": 4.4599,
"lr": 0.0009606993006993007,
"step": 782,
"tokens_trained": 0.384380584
},
{
"epoch": 0.22239557478193037,
"grad_norm": 65.23794555664062,
"loss": 4.5397,
"lr": 0.0009604195804195805,
"step": 784,
"tokens_trained": 0.385361368
},
{
"epoch": 0.22296291043188426,
"grad_norm": 23.255144119262695,
"loss": 4.5007,
"lr": 0.0009601398601398602,
"step": 786,
"tokens_trained": 0.386341416
},
{
"epoch": 0.22353024608183816,
"grad_norm": 30.812740325927734,
"loss": 4.5239,
"lr": 0.0009598601398601398,
"step": 788,
"tokens_trained": 0.387324624
},
{
"epoch": 0.22409758173179206,
"grad_norm": 50.781219482421875,
"loss": 4.5131,
"lr": 0.0009595804195804196,
"step": 790,
"tokens_trained": 0.388312744
},
{
"epoch": 0.224664917381746,
"grad_norm": 47.88816452026367,
"loss": 4.4622,
"lr": 0.0009593006993006993,
"step": 792,
"tokens_trained": 0.38929852
},
{
"epoch": 0.22523225303169989,
"grad_norm": 49.32049560546875,
"loss": 4.5053,
"lr": 0.0009590209790209791,
"step": 794,
"tokens_trained": 0.390279792
},
{
"epoch": 0.22579958868165378,
"grad_norm": 36.98805618286133,
"loss": 4.5144,
"lr": 0.0009587412587412587,
"step": 796,
"tokens_trained": 0.391258904
},
{
"epoch": 0.22636692433160768,
"grad_norm": 24.88475799560547,
"loss": 4.4992,
"lr": 0.0009584615384615385,
"step": 798,
"tokens_trained": 0.392238976
},
{
"epoch": 0.22693425998156158,
"grad_norm": 38.89309310913086,
"loss": 4.4853,
"lr": 0.0009581818181818182,
"step": 800,
"tokens_trained": 0.393226312
},
{
"epoch": 0.2275015956315155,
"grad_norm": 34.86774444580078,
"loss": 4.4519,
"lr": 0.000957902097902098,
"step": 802,
"tokens_trained": 0.394206688
},
{
"epoch": 0.2280689312814694,
"grad_norm": 24.966291427612305,
"loss": 4.456,
"lr": 0.0009576223776223777,
"step": 804,
"tokens_trained": 0.395191608
},
{
"epoch": 0.2286362669314233,
"grad_norm": 12.218213081359863,
"loss": 4.4266,
"lr": 0.0009573426573426573,
"step": 806,
"tokens_trained": 0.396174512
},
{
"epoch": 0.2292036025813772,
"grad_norm": 50.817054748535156,
"loss": 4.586,
"lr": 0.0009570629370629371,
"step": 808,
"tokens_trained": 0.397156912
},
{
"epoch": 0.2297709382313311,
"grad_norm": 37.60087203979492,
"loss": 4.4616,
"lr": 0.0009567832167832168,
"step": 810,
"tokens_trained": 0.398140016
},
{
"epoch": 0.23033827388128503,
"grad_norm": 37.55678176879883,
"loss": 4.4755,
"lr": 0.0009565034965034966,
"step": 812,
"tokens_trained": 0.39912384
},
{
"epoch": 0.23090560953123893,
"grad_norm": 56.427215576171875,
"loss": 4.5078,
"lr": 0.0009562237762237762,
"step": 814,
"tokens_trained": 0.400111224
},
{
"epoch": 0.23147294518119282,
"grad_norm": 31.869827270507812,
"loss": 4.5013,
"lr": 0.0009559440559440559,
"step": 816,
"tokens_trained": 0.401094936
},
{
"epoch": 0.23204028083114672,
"grad_norm": 77.57958984375,
"loss": 4.6977,
"lr": 0.0009556643356643357,
"step": 818,
"tokens_trained": 0.402078888
},
{
"epoch": 0.23260761648110062,
"grad_norm": 52.50204849243164,
"loss": 4.5142,
"lr": 0.0009553846153846154,
"step": 820,
"tokens_trained": 0.403059904
},
{
"epoch": 0.23317495213105455,
"grad_norm": 32.34305191040039,
"loss": 4.4828,
"lr": 0.0009551048951048952,
"step": 822,
"tokens_trained": 0.404049848
},
{
"epoch": 0.23374228778100845,
"grad_norm": 52.08961486816406,
"loss": 4.4869,
"lr": 0.0009548251748251748,
"step": 824,
"tokens_trained": 0.405033872
},
{
"epoch": 0.23430962343096234,
"grad_norm": 44.32194900512695,
"loss": 4.4802,
"lr": 0.0009545454545454546,
"step": 826,
"tokens_trained": 0.406017872
},
{
"epoch": 0.23487695908091624,
"grad_norm": 30.941524505615234,
"loss": 4.4323,
"lr": 0.0009542657342657343,
"step": 828,
"tokens_trained": 0.40700704
},
{
"epoch": 0.23544429473087014,
"grad_norm": 20.52709197998047,
"loss": 4.4919,
"lr": 0.000953986013986014,
"step": 830,
"tokens_trained": 0.407991512
},
{
"epoch": 0.23601163038082407,
"grad_norm": 86.80307006835938,
"loss": 4.8228,
"lr": 0.0009537062937062937,
"step": 832,
"tokens_trained": 0.408979272
},
{
"epoch": 0.23657896603077797,
"grad_norm": 73.71435546875,
"loss": 4.5954,
"lr": 0.0009534265734265734,
"step": 834,
"tokens_trained": 0.409962984
},
{
"epoch": 0.23714630168073186,
"grad_norm": 66.3813247680664,
"loss": 4.5969,
"lr": 0.0009531468531468532,
"step": 836,
"tokens_trained": 0.410945248
},
{
"epoch": 0.23771363733068576,
"grad_norm": 86.94453430175781,
"loss": 4.5894,
"lr": 0.0009528671328671329,
"step": 838,
"tokens_trained": 0.411930872
},
{
"epoch": 0.23828097298063966,
"grad_norm": 61.28915786743164,
"loss": 4.5613,
"lr": 0.0009525874125874127,
"step": 840,
"tokens_trained": 0.412912608
},
{
"epoch": 0.2388483086305936,
"grad_norm": 65.02153778076172,
"loss": 4.5398,
"lr": 0.0009523076923076923,
"step": 842,
"tokens_trained": 0.413897488
},
{
"epoch": 0.23941564428054748,
"grad_norm": 54.01200485229492,
"loss": 4.4922,
"lr": 0.000952027972027972,
"step": 844,
"tokens_trained": 0.414872888
},
{
"epoch": 0.23998297993050138,
"grad_norm": 66.7095718383789,
"loss": 4.5317,
"lr": 0.0009517482517482518,
"step": 846,
"tokens_trained": 0.415856296
},
{
"epoch": 0.24055031558045528,
"grad_norm": 64.23979949951172,
"loss": 4.4686,
"lr": 0.0009514685314685315,
"step": 848,
"tokens_trained": 0.416843344
},
{
"epoch": 0.24111765123040918,
"grad_norm": 51.012840270996094,
"loss": 4.4544,
"lr": 0.0009511888111888112,
"step": 850,
"tokens_trained": 0.41782032
},
{
"epoch": 0.2416849868803631,
"grad_norm": 40.83076095581055,
"loss": 4.4665,
"lr": 0.0009509090909090909,
"step": 852,
"tokens_trained": 0.418805672
},
{
"epoch": 0.242252322530317,
"grad_norm": 48.31489944458008,
"loss": 4.4748,
"lr": 0.0009506293706293707,
"step": 854,
"tokens_trained": 0.419786344
},
{
"epoch": 0.2428196581802709,
"grad_norm": 50.08705520629883,
"loss": 4.4973,
"lr": 0.0009503496503496504,
"step": 856,
"tokens_trained": 0.420768872
},
{
"epoch": 0.2433869938302248,
"grad_norm": 26.840139389038086,
"loss": 4.461,
"lr": 0.0009500699300699301,
"step": 858,
"tokens_trained": 0.421750296
},
{
"epoch": 0.2439543294801787,
"grad_norm": 24.721454620361328,
"loss": 4.4246,
"lr": 0.0009497902097902098,
"step": 860,
"tokens_trained": 0.422730976
},
{
"epoch": 0.24452166513013263,
"grad_norm": 63.147926330566406,
"loss": 4.623,
"lr": 0.0009495104895104895,
"step": 862,
"tokens_trained": 0.423715768
},
{
"epoch": 0.24508900078008652,
"grad_norm": 50.99778747558594,
"loss": 4.4663,
"lr": 0.0009492307692307693,
"step": 864,
"tokens_trained": 0.424697072
},
{
"epoch": 0.24565633643004042,
"grad_norm": 38.0300407409668,
"loss": 4.4649,
"lr": 0.000948951048951049,
"step": 866,
"tokens_trained": 0.425681392
},
{
"epoch": 0.24622367207999432,
"grad_norm": 19.017776489257812,
"loss": 4.4296,
"lr": 0.0009486713286713286,
"step": 868,
"tokens_trained": 0.426665088
},
{
"epoch": 0.24679100772994822,
"grad_norm": 24.02813148498535,
"loss": 4.4958,
"lr": 0.0009483916083916084,
"step": 870,
"tokens_trained": 0.427646016
},
{
"epoch": 0.24735834337990215,
"grad_norm": 59.40018081665039,
"loss": 4.5919,
"lr": 0.0009481118881118881,
"step": 872,
"tokens_trained": 0.428628048
},
{
"epoch": 0.24792567902985604,
"grad_norm": 61.13710403442383,
"loss": 4.4642,
"lr": 0.0009478321678321679,
"step": 874,
"tokens_trained": 0.4296112
},
{
"epoch": 0.24820934685483298,
"eval_loss": 1.1135390996932983,
"eval_runtime": 20.4738,
"step": 875,
"tokens_trained": 0.430109024
},
{
"epoch": 0.24849301467980994,
"grad_norm": 47.920021057128906,
"loss": 4.4832,
"lr": 0.0009475524475524476,
"step": 876,
"tokens_trained": 0.430599208
},
{
"epoch": 0.24906035032976384,
"grad_norm": 25.661701202392578,
"loss": 4.4176,
"lr": 0.0009472727272727273,
"step": 878,
"tokens_trained": 0.43158356
},
{
"epoch": 0.24962768597971774,
"grad_norm": 32.86565399169922,
"loss": 4.405,
"lr": 0.000946993006993007,
"step": 880,
"tokens_trained": 0.432570584
},
{
"epoch": 0.25019502162967167,
"grad_norm": 23.443584442138672,
"loss": 4.4218,
"lr": 0.0009467132867132868,
"step": 882,
"tokens_trained": 0.433557672
},
{
"epoch": 0.25076235727962554,
"grad_norm": 28.315975189208984,
"loss": 4.4019,
"lr": 0.0009464335664335665,
"step": 884,
"tokens_trained": 0.434542736
},
{
"epoch": 0.25132969292957946,
"grad_norm": 31.056642532348633,
"loss": 4.4027,
"lr": 0.0009461538461538461,
"step": 886,
"tokens_trained": 0.43553112
},
{
"epoch": 0.2518970285795334,
"grad_norm": 13.661805152893066,
"loss": 4.3745,
"lr": 0.0009458741258741259,
"step": 888,
"tokens_trained": 0.436511584
},
{
"epoch": 0.25246436422948726,
"grad_norm": 47.04901885986328,
"loss": 4.4875,
"lr": 0.0009455944055944056,
"step": 890,
"tokens_trained": 0.43749464
},
{
"epoch": 0.2530316998794412,
"grad_norm": 84.91446685791016,
"loss": 4.5185,
"lr": 0.0009453146853146854,
"step": 892,
"tokens_trained": 0.43847764
},
{
"epoch": 0.25359903552939506,
"grad_norm": 40.9110107421875,
"loss": 4.5735,
"lr": 0.000945034965034965,
"step": 894,
"tokens_trained": 0.439461496
},
{
"epoch": 0.254166371179349,
"grad_norm": 58.98877716064453,
"loss": 4.5146,
"lr": 0.0009447552447552447,
"step": 896,
"tokens_trained": 0.440443656
},
{
"epoch": 0.2547337068293029,
"grad_norm": 34.037315368652344,
"loss": 4.4714,
"lr": 0.0009444755244755245,
"step": 898,
"tokens_trained": 0.441423496
},
{
"epoch": 0.2553010424792568,
"grad_norm": 24.91920280456543,
"loss": 4.4334,
"lr": 0.0009441958041958042,
"step": 900,
"tokens_trained": 0.442407408
},
{
"epoch": 0.2558683781292107,
"grad_norm": 30.612323760986328,
"loss": 4.4459,
"lr": 0.000943916083916084,
"step": 902,
"tokens_trained": 0.443383464
},
{
"epoch": 0.2564357137791646,
"grad_norm": 50.595577239990234,
"loss": 4.4848,
"lr": 0.0009436363636363636,
"step": 904,
"tokens_trained": 0.4443674
},
{
"epoch": 0.2570030494291185,
"grad_norm": 41.3300895690918,
"loss": 4.4445,
"lr": 0.0009433566433566434,
"step": 906,
"tokens_trained": 0.445346072
},
{
"epoch": 0.25757038507907243,
"grad_norm": 48.33689880371094,
"loss": 4.4058,
"lr": 0.0009430769230769231,
"step": 908,
"tokens_trained": 0.446329872
},
{
"epoch": 0.2581377207290263,
"grad_norm": 39.081382751464844,
"loss": 4.4321,
"lr": 0.0009427972027972029,
"step": 910,
"tokens_trained": 0.447309544
},
{
"epoch": 0.2587050563789802,
"grad_norm": 62.18062210083008,
"loss": 4.4672,
"lr": 0.0009425174825174825,
"step": 912,
"tokens_trained": 0.448295056
},
{
"epoch": 0.2592723920289341,
"grad_norm": 28.725404739379883,
"loss": 4.4786,
"lr": 0.0009422377622377622,
"step": 914,
"tokens_trained": 0.449274208
},
{
"epoch": 0.259839727678888,
"grad_norm": 47.55582809448242,
"loss": 4.4227,
"lr": 0.000941958041958042,
"step": 916,
"tokens_trained": 0.450256408
},
{
"epoch": 0.26040706332884195,
"grad_norm": 35.743125915527344,
"loss": 4.379,
"lr": 0.0009416783216783217,
"step": 918,
"tokens_trained": 0.45123684
},
{
"epoch": 0.2609743989787958,
"grad_norm": 31.489402770996094,
"loss": 4.3888,
"lr": 0.0009413986013986015,
"step": 920,
"tokens_trained": 0.45221748
},
{
"epoch": 0.26154173462874974,
"grad_norm": 36.46233367919922,
"loss": 4.3982,
"lr": 0.0009411188811188811,
"step": 922,
"tokens_trained": 0.453202064
},
{
"epoch": 0.2621090702787036,
"grad_norm": 41.6457633972168,
"loss": 4.385,
"lr": 0.0009408391608391608,
"step": 924,
"tokens_trained": 0.454183456
},
{
"epoch": 0.26267640592865754,
"grad_norm": 26.52242088317871,
"loss": 4.4091,
"lr": 0.0009405594405594406,
"step": 926,
"tokens_trained": 0.455165496
},
{
"epoch": 0.26324374157861147,
"grad_norm": 14.401509284973145,
"loss": 4.3549,
"lr": 0.0009402797202797203,
"step": 928,
"tokens_trained": 0.456150248
},
{
"epoch": 0.26381107722856534,
"grad_norm": 30.626131057739258,
"loss": 4.3325,
"lr": 0.00094,
"step": 930,
"tokens_trained": 0.457134184
},
{
"epoch": 0.26437841287851926,
"grad_norm": 63.74067687988281,
"loss": 4.442,
"lr": 0.0009397202797202797,
"step": 932,
"tokens_trained": 0.458118808
},
{
"epoch": 0.26494574852847314,
"grad_norm": 12.15156364440918,
"loss": 4.4658,
"lr": 0.0009394405594405595,
"step": 934,
"tokens_trained": 0.459103872
},
{
"epoch": 0.26551308417842706,
"grad_norm": 76.2789306640625,
"loss": 4.8153,
"lr": 0.0009391608391608392,
"step": 936,
"tokens_trained": 0.460087216
},
{
"epoch": 0.266080419828381,
"grad_norm": 63.919334411621094,
"loss": 4.5707,
"lr": 0.000938881118881119,
"step": 938,
"tokens_trained": 0.461070568
},
{
"epoch": 0.26664775547833486,
"grad_norm": 75.1481704711914,
"loss": 4.5931,
"lr": 0.0009386013986013986,
"step": 940,
"tokens_trained": 0.462055184
},
{
"epoch": 0.2672150911282888,
"grad_norm": 33.118961334228516,
"loss": 4.4723,
"lr": 0.0009383216783216783,
"step": 942,
"tokens_trained": 0.463034592
},
{
"epoch": 0.26778242677824265,
"grad_norm": 30.8759765625,
"loss": 4.4275,
"lr": 0.0009380419580419581,
"step": 944,
"tokens_trained": 0.464016816
},
{
"epoch": 0.2683497624281966,
"grad_norm": 41.05061340332031,
"loss": 4.4566,
"lr": 0.0009377622377622378,
"step": 946,
"tokens_trained": 0.465000872
},
{
"epoch": 0.2689170980781505,
"grad_norm": 30.93424415588379,
"loss": 4.3985,
"lr": 0.0009374825174825175,
"step": 948,
"tokens_trained": 0.465984096
},
{
"epoch": 0.2694844337281044,
"grad_norm": 29.477052688598633,
"loss": 4.3718,
"lr": 0.0009372027972027972,
"step": 950,
"tokens_trained": 0.466961752
},
{
"epoch": 0.2700517693780583,
"grad_norm": 21.568912506103516,
"loss": 4.3697,
"lr": 0.0009369230769230769,
"step": 952,
"tokens_trained": 0.467950088
},
{
"epoch": 0.2706191050280122,
"grad_norm": 41.66835021972656,
"loss": 4.4241,
"lr": 0.0009366433566433567,
"step": 954,
"tokens_trained": 0.468928736
},
{
"epoch": 0.2711864406779661,
"grad_norm": 68.04551696777344,
"loss": 4.3978,
"lr": 0.0009363636363636364,
"step": 956,
"tokens_trained": 0.469907496
},
{
"epoch": 0.27175377632792,
"grad_norm": 37.655181884765625,
"loss": 4.4497,
"lr": 0.0009360839160839161,
"step": 958,
"tokens_trained": 0.470889168
},
{
"epoch": 0.2723211119778739,
"grad_norm": 22.074953079223633,
"loss": 4.3918,
"lr": 0.0009358041958041958,
"step": 960,
"tokens_trained": 0.471871816
},
{
"epoch": 0.2728884476278278,
"grad_norm": 49.925777435302734,
"loss": 4.4745,
"lr": 0.0009355244755244755,
"step": 962,
"tokens_trained": 0.472856728
},
{
"epoch": 0.2734557832777817,
"grad_norm": 46.520851135253906,
"loss": 4.403,
"lr": 0.0009352447552447553,
"step": 964,
"tokens_trained": 0.473838544
},
{
"epoch": 0.2740231189277356,
"grad_norm": 25.053146362304688,
"loss": 4.4247,
"lr": 0.0009349650349650349,
"step": 966,
"tokens_trained": 0.474819976
},
{
"epoch": 0.27459045457768955,
"grad_norm": 30.127140045166016,
"loss": 4.3834,
"lr": 0.0009346853146853147,
"step": 968,
"tokens_trained": 0.475800696
},
{
"epoch": 0.2751577902276434,
"grad_norm": 41.478328704833984,
"loss": 4.3978,
"lr": 0.0009344055944055944,
"step": 970,
"tokens_trained": 0.4767834
},
{
"epoch": 0.27572512587759734,
"grad_norm": 23.739456176757812,
"loss": 4.3698,
"lr": 0.0009341258741258742,
"step": 972,
"tokens_trained": 0.47776944
},
{
"epoch": 0.2762924615275512,
"grad_norm": 21.813220977783203,
"loss": 4.3902,
"lr": 0.0009338461538461539,
"step": 974,
"tokens_trained": 0.478757048
},
{
"epoch": 0.27685979717750514,
"grad_norm": 64.79598999023438,
"loss": 4.5237,
"lr": 0.0009335664335664336,
"step": 976,
"tokens_trained": 0.47973872
},
{
"epoch": 0.27742713282745907,
"grad_norm": 68.32705688476562,
"loss": 4.4461,
"lr": 0.0009332867132867133,
"step": 978,
"tokens_trained": 0.480721912
},
{
"epoch": 0.27799446847741294,
"grad_norm": 41.857582092285156,
"loss": 4.4663,
"lr": 0.0009330069930069929,
"step": 980,
"tokens_trained": 0.481704248
},
{
"epoch": 0.27856180412736686,
"grad_norm": 28.30609893798828,
"loss": 4.3461,
"lr": 0.0009327272727272728,
"step": 982,
"tokens_trained": 0.482689768
},
{
"epoch": 0.27912913977732073,
"grad_norm": 33.207950592041016,
"loss": 4.4185,
"lr": 0.0009324475524475524,
"step": 984,
"tokens_trained": 0.483670008
},
{
"epoch": 0.27969647542727466,
"grad_norm": 29.541227340698242,
"loss": 4.388,
"lr": 0.0009321678321678322,
"step": 986,
"tokens_trained": 0.48465836
},
{
"epoch": 0.2802638110772286,
"grad_norm": 16.23346710205078,
"loss": 4.3219,
"lr": 0.0009318881118881119,
"step": 988,
"tokens_trained": 0.4856402
},
{
"epoch": 0.28083114672718246,
"grad_norm": 20.036178588867188,
"loss": 4.3273,
"lr": 0.0009316083916083917,
"step": 990,
"tokens_trained": 0.486621648
},
{
"epoch": 0.2813984823771364,
"grad_norm": 49.25468063354492,
"loss": 4.4649,
"lr": 0.0009313286713286714,
"step": 992,
"tokens_trained": 0.48760744
},
{
"epoch": 0.28196581802709025,
"grad_norm": 48.59744644165039,
"loss": 4.3979,
"lr": 0.000931048951048951,
"step": 994,
"tokens_trained": 0.488590472
},
{
"epoch": 0.2825331536770442,
"grad_norm": 16.33649253845215,
"loss": 4.3945,
"lr": 0.0009307692307692308,
"step": 996,
"tokens_trained": 0.489570976
},
{
"epoch": 0.2831004893269981,
"grad_norm": 60.632591247558594,
"loss": 4.5581,
"lr": 0.0009304895104895104,
"step": 998,
"tokens_trained": 0.490552296
},
{
"epoch": 0.283667824976952,
"grad_norm": 52.75735092163086,
"loss": 4.424,
"lr": 0.0009302097902097903,
"step": 1000,
"tokens_trained": 0.49153744
},
{
"epoch": 0.283667824976952,
"eval_loss": 1.1363450288772583,
"eval_runtime": 20.7491,
"step": 1000,
"tokens_trained": 0.49153744
},
{
"epoch": 0.2842351606269059,
"grad_norm": 20.506614685058594,
"loss": 4.4241,
"lr": 0.0009299300699300699,
"step": 1002,
"tokens_trained": 0.492522608
},
{
"epoch": 0.2848024962768598,
"grad_norm": 23.148601531982422,
"loss": 4.3975,
"lr": 0.0009296503496503497,
"step": 1004,
"tokens_trained": 0.493501384
},
{
"epoch": 0.2853698319268137,
"grad_norm": 9.550869941711426,
"loss": 4.3952,
"lr": 0.0009293706293706294,
"step": 1006,
"tokens_trained": 0.494482544
},
{
"epoch": 0.2859371675767676,
"grad_norm": 80.31155395507812,
"loss": 4.7614,
"lr": 0.0009290909090909091,
"step": 1008,
"tokens_trained": 0.495459416
},
{
"epoch": 0.2865045032267215,
"grad_norm": 61.021026611328125,
"loss": 4.4396,
"lr": 0.0009288111888111889,
"step": 1010,
"tokens_trained": 0.4964418
},
{
"epoch": 0.2870718388766754,
"grad_norm": 35.23258972167969,
"loss": 4.5548,
"lr": 0.0009285314685314685,
"step": 1012,
"tokens_trained": 0.497428288
},
{
"epoch": 0.2876391745266293,
"grad_norm": 36.45478057861328,
"loss": 4.46,
"lr": 0.0009282517482517483,
"step": 1014,
"tokens_trained": 0.498416832
},
{
"epoch": 0.2882065101765832,
"grad_norm": 46.622982025146484,
"loss": 4.3554,
"lr": 0.0009279720279720279,
"step": 1016,
"tokens_trained": 0.499399792
},
{
"epoch": 0.28877384582653715,
"grad_norm": 87.00289154052734,
"loss": 4.5276,
"lr": 0.0009276923076923078,
"step": 1018,
"tokens_trained": 0.500383776
},
{
"epoch": 0.289341181476491,
"grad_norm": 11.444964408874512,
"loss": 4.5483,
"lr": 0.0009274125874125874,
"step": 1020,
"tokens_trained": 0.50136468
},
{
"epoch": 0.28990851712644494,
"grad_norm": 89.05914306640625,
"loss": 4.8957,
"lr": 0.0009271328671328671,
"step": 1022,
"tokens_trained": 0.50235172
},
{
"epoch": 0.2904758527763988,
"grad_norm": 26.915477752685547,
"loss": 4.6184,
"lr": 0.0009268531468531469,
"step": 1024,
"tokens_trained": 0.50333208
},
{
"epoch": 0.29104318842635274,
"grad_norm": 44.32100296020508,
"loss": 4.5263,
"lr": 0.0009265734265734266,
"step": 1026,
"tokens_trained": 0.504314656
},
{
"epoch": 0.29161052407630667,
"grad_norm": 26.699670791625977,
"loss": 4.3871,
"lr": 0.0009262937062937064,
"step": 1028,
"tokens_trained": 0.505296568
},
{
"epoch": 0.29217785972626054,
"grad_norm": 27.469482421875,
"loss": 4.3558,
"lr": 0.000926013986013986,
"step": 1030,
"tokens_trained": 0.506280416
},
{
"epoch": 0.29274519537621446,
"grad_norm": 26.149612426757812,
"loss": 4.3368,
"lr": 0.0009257342657342658,
"step": 1032,
"tokens_trained": 0.507261224
},
{
"epoch": 0.29331253102616833,
"grad_norm": 8.754459381103516,
"loss": 4.3447,
"lr": 0.0009254545454545454,
"step": 1034,
"tokens_trained": 0.508243288
},
{
"epoch": 0.29387986667612226,
"grad_norm": 32.17164611816406,
"loss": 4.4174,
"lr": 0.0009251748251748252,
"step": 1036,
"tokens_trained": 0.509224176
},
{
"epoch": 0.2944472023260762,
"grad_norm": 41.17238235473633,
"loss": 4.4221,
"lr": 0.0009248951048951049,
"step": 1038,
"tokens_trained": 0.510203568
},
{
"epoch": 0.29501453797603006,
"grad_norm": 44.97213363647461,
"loss": 4.3594,
"lr": 0.0009246153846153846,
"step": 1040,
"tokens_trained": 0.511186464
},
{
"epoch": 0.295581873625984,
"grad_norm": 42.23421859741211,
"loss": 4.4159,
"lr": 0.0009243356643356644,
"step": 1042,
"tokens_trained": 0.51216944
},
{
"epoch": 0.29614920927593785,
"grad_norm": 36.13594436645508,
"loss": 4.4105,
"lr": 0.0009240559440559441,
"step": 1044,
"tokens_trained": 0.513153144
},
{
"epoch": 0.2967165449258918,
"grad_norm": 36.89309310913086,
"loss": 4.3947,
"lr": 0.0009237762237762239,
"step": 1046,
"tokens_trained": 0.51413388
},
{
"epoch": 0.2972838805758457,
"grad_norm": 58.599700927734375,
"loss": 4.3988,
"lr": 0.0009234965034965035,
"step": 1048,
"tokens_trained": 0.515119288
},
{
"epoch": 0.2978512162257996,
"grad_norm": 13.725994110107422,
"loss": 4.412,
"lr": 0.0009232167832167832,
"step": 1050,
"tokens_trained": 0.51610284
},
{
"epoch": 0.2984185518757535,
"grad_norm": 105.28518676757812,
"loss": 4.7305,
"lr": 0.0009229370629370629,
"step": 1052,
"tokens_trained": 0.517085576
},
{
"epoch": 0.2989858875257074,
"grad_norm": 29.499713897705078,
"loss": 4.5106,
"lr": 0.0009226573426573427,
"step": 1054,
"tokens_trained": 0.518064224
},
{
"epoch": 0.2995532231756613,
"grad_norm": 60.907203674316406,
"loss": 4.5249,
"lr": 0.0009223776223776224,
"step": 1056,
"tokens_trained": 0.51905084
},
{
"epoch": 0.3001205588256152,
"grad_norm": 39.825069427490234,
"loss": 4.3695,
"lr": 0.0009220979020979021,
"step": 1058,
"tokens_trained": 0.5200318
},
{
"epoch": 0.3006878944755691,
"grad_norm": 42.77061462402344,
"loss": 4.4094,
"lr": 0.0009218181818181819,
"step": 1060,
"tokens_trained": 0.521013568
},
{
"epoch": 0.301255230125523,
"grad_norm": 37.05888748168945,
"loss": 4.3684,
"lr": 0.0009215384615384616,
"step": 1062,
"tokens_trained": 0.521997624
},
{
"epoch": 0.3018225657754769,
"grad_norm": 42.28252029418945,
"loss": 4.3489,
"lr": 0.0009212587412587413,
"step": 1064,
"tokens_trained": 0.522986184
},
{
"epoch": 0.3023899014254308,
"grad_norm": 40.95197677612305,
"loss": 4.3564,
"lr": 0.000920979020979021,
"step": 1066,
"tokens_trained": 0.523970984
},
{
"epoch": 0.30295723707538474,
"grad_norm": 25.469568252563477,
"loss": 4.3833,
"lr": 0.0009206993006993007,
"step": 1068,
"tokens_trained": 0.524952808
},
{
"epoch": 0.3035245727253386,
"grad_norm": 29.921735763549805,
"loss": 4.3579,
"lr": 0.0009204195804195804,
"step": 1070,
"tokens_trained": 0.525935696
},
{
"epoch": 0.30409190837529254,
"grad_norm": 26.038026809692383,
"loss": 4.2898,
"lr": 0.0009201398601398602,
"step": 1072,
"tokens_trained": 0.526916904
},
{
"epoch": 0.3046592440252464,
"grad_norm": 32.59503936767578,
"loss": 4.3335,
"lr": 0.0009198601398601398,
"step": 1074,
"tokens_trained": 0.527899864
},
{
"epoch": 0.30522657967520034,
"grad_norm": 14.04964828491211,
"loss": 4.3171,
"lr": 0.0009195804195804196,
"step": 1076,
"tokens_trained": 0.528878176
},
{
"epoch": 0.30579391532515426,
"grad_norm": 15.936906814575195,
"loss": 4.3005,
"lr": 0.0009193006993006993,
"step": 1078,
"tokens_trained": 0.529859952
},
{
"epoch": 0.30636125097510813,
"grad_norm": 9.73235034942627,
"loss": 4.3287,
"lr": 0.0009190209790209791,
"step": 1080,
"tokens_trained": 0.530838192
},
{
"epoch": 0.30692858662506206,
"grad_norm": 45.44027328491211,
"loss": 4.4384,
"lr": 0.0009187412587412588,
"step": 1082,
"tokens_trained": 0.531818376
},
{
"epoch": 0.30749592227501593,
"grad_norm": 55.65925598144531,
"loss": 4.3772,
"lr": 0.0009184615384615385,
"step": 1084,
"tokens_trained": 0.532802048
},
{
"epoch": 0.30806325792496986,
"grad_norm": 33.47093200683594,
"loss": 4.4257,
"lr": 0.0009181818181818182,
"step": 1086,
"tokens_trained": 0.533785376
},
{
"epoch": 0.3086305935749238,
"grad_norm": 39.709224700927734,
"loss": 4.4177,
"lr": 0.0009179020979020978,
"step": 1088,
"tokens_trained": 0.5347698
},
{
"epoch": 0.30919792922487765,
"grad_norm": 34.25212097167969,
"loss": 4.3518,
"lr": 0.0009176223776223777,
"step": 1090,
"tokens_trained": 0.53575108
},
{
"epoch": 0.3097652648748316,
"grad_norm": 29.156312942504883,
"loss": 4.3596,
"lr": 0.0009173426573426573,
"step": 1092,
"tokens_trained": 0.536735544
},
{
"epoch": 0.31033260052478545,
"grad_norm": 31.714128494262695,
"loss": 4.3736,
"lr": 0.0009170629370629371,
"step": 1094,
"tokens_trained": 0.537718008
},
{
"epoch": 0.3108999361747394,
"grad_norm": 12.244729042053223,
"loss": 4.3472,
"lr": 0.0009167832167832168,
"step": 1096,
"tokens_trained": 0.538693512
},
{
"epoch": 0.3114672718246933,
"grad_norm": 10.271063804626465,
"loss": 4.301,
"lr": 0.0009165034965034966,
"step": 1098,
"tokens_trained": 0.539681376
},
{
"epoch": 0.3120346074746472,
"grad_norm": 35.79754638671875,
"loss": 4.3912,
"lr": 0.0009162237762237763,
"step": 1100,
"tokens_trained": 0.540661392
},
{
"epoch": 0.3126019431246011,
"grad_norm": 24.1260986328125,
"loss": 4.3303,
"lr": 0.0009159440559440559,
"step": 1102,
"tokens_trained": 0.541646968
},
{
"epoch": 0.31316927877455497,
"grad_norm": 24.501169204711914,
"loss": 4.3205,
"lr": 0.0009156643356643357,
"step": 1104,
"tokens_trained": 0.542629392
},
{
"epoch": 0.3137366144245089,
"grad_norm": 17.031600952148438,
"loss": 4.2521,
"lr": 0.0009153846153846153,
"step": 1106,
"tokens_trained": 0.54361348
},
{
"epoch": 0.3143039500744628,
"grad_norm": 19.506216049194336,
"loss": 4.3225,
"lr": 0.0009151048951048952,
"step": 1108,
"tokens_trained": 0.544595336
},
{
"epoch": 0.3148712857244167,
"grad_norm": 20.822546005249023,
"loss": 4.2711,
"lr": 0.0009148251748251748,
"step": 1110,
"tokens_trained": 0.545578256
},
{
"epoch": 0.3154386213743706,
"grad_norm": 29.967998504638672,
"loss": 4.2868,
"lr": 0.0009145454545454546,
"step": 1112,
"tokens_trained": 0.546561024
},
{
"epoch": 0.3160059570243245,
"grad_norm": 24.06121063232422,
"loss": 4.2701,
"lr": 0.0009142657342657343,
"step": 1114,
"tokens_trained": 0.547544616
},
{
"epoch": 0.3165732926742784,
"grad_norm": 15.868765830993652,
"loss": 4.3233,
"lr": 0.000913986013986014,
"step": 1116,
"tokens_trained": 0.548526216
},
{
"epoch": 0.31714062832423234,
"grad_norm": 27.47897720336914,
"loss": 4.2813,
"lr": 0.0009137062937062938,
"step": 1118,
"tokens_trained": 0.549506544
},
{
"epoch": 0.3177079639741862,
"grad_norm": 15.343204498291016,
"loss": 4.3002,
"lr": 0.0009134265734265734,
"step": 1120,
"tokens_trained": 0.550488496
},
{
"epoch": 0.31827529962414014,
"grad_norm": 4.320124626159668,
"loss": 4.2622,
"lr": 0.0009131468531468532,
"step": 1122,
"tokens_trained": 0.551471792
},
{
"epoch": 0.318842635274094,
"grad_norm": 34.520050048828125,
"loss": 4.366,
"lr": 0.0009128671328671328,
"step": 1124,
"tokens_trained": 0.552457008
},
{
"epoch": 0.319126303099071,
"eval_loss": 1.096465826034546,
"eval_runtime": 20.7643,
"step": 1125,
"tokens_trained": 0.552948064
},
{
"epoch": 0.31940997092404794,
"grad_norm": 39.718719482421875,
"loss": 4.3317,
"lr": 0.0009125874125874127,
"step": 1126,
"tokens_trained": 0.5534394
},
{
"epoch": 0.31997730657400186,
"grad_norm": 20.843252182006836,
"loss": 4.3883,
"lr": 0.0009123076923076923,
"step": 1128,
"tokens_trained": 0.554419184
},
{
"epoch": 0.32054464222395573,
"grad_norm": 12.916360855102539,
"loss": 4.3119,
"lr": 0.000912027972027972,
"step": 1130,
"tokens_trained": 0.555401952
},
{
"epoch": 0.32111197787390966,
"grad_norm": 48.54426956176758,
"loss": 4.4155,
"lr": 0.0009117482517482518,
"step": 1132,
"tokens_trained": 0.556385024
},
{
"epoch": 0.32167931352386353,
"grad_norm": 41.00883483886719,
"loss": 4.362,
"lr": 0.0009114685314685315,
"step": 1134,
"tokens_trained": 0.557368472
},
{
"epoch": 0.32224664917381746,
"grad_norm": 28.0487060546875,
"loss": 4.3504,
"lr": 0.0009111888111888113,
"step": 1136,
"tokens_trained": 0.55835288
},
{
"epoch": 0.3228139848237714,
"grad_norm": 22.05229377746582,
"loss": 4.331,
"lr": 0.0009109090909090909,
"step": 1138,
"tokens_trained": 0.559337064
},
{
"epoch": 0.32338132047372525,
"grad_norm": 16.770631790161133,
"loss": 4.3008,
"lr": 0.0009106293706293707,
"step": 1140,
"tokens_trained": 0.560317984
},
{
"epoch": 0.3239486561236792,
"grad_norm": 35.300262451171875,
"loss": 4.4083,
"lr": 0.0009103496503496503,
"step": 1142,
"tokens_trained": 0.561299688
},
{
"epoch": 0.32451599177363305,
"grad_norm": 23.788284301757812,
"loss": 4.2772,
"lr": 0.0009100699300699301,
"step": 1144,
"tokens_trained": 0.562285664
},
{
"epoch": 0.325083327423587,
"grad_norm": 23.085710525512695,
"loss": 4.3185,
"lr": 0.0009097902097902098,
"step": 1146,
"tokens_trained": 0.563267832
},
{
"epoch": 0.3256506630735409,
"grad_norm": 13.11314582824707,
"loss": 4.2711,
"lr": 0.0009095104895104895,
"step": 1148,
"tokens_trained": 0.564248928
},
{
"epoch": 0.3262179987234948,
"grad_norm": 31.297805786132812,
"loss": 4.3096,
"lr": 0.0009092307692307692,
"step": 1150,
"tokens_trained": 0.56522952
},
{
"epoch": 0.3267853343734487,
"grad_norm": 11.668539047241211,
"loss": 4.2667,
"lr": 0.000908951048951049,
"step": 1152,
"tokens_trained": 0.566212392
},
{
"epoch": 0.32735267002340257,
"grad_norm": 23.359189987182617,
"loss": 4.3156,
"lr": 0.0009086713286713288,
"step": 1154,
"tokens_trained": 0.567192216
},
{
"epoch": 0.3279200056733565,
"grad_norm": 31.09916114807129,
"loss": 4.3367,
"lr": 0.0009083916083916084,
"step": 1156,
"tokens_trained": 0.568177088
},
{
"epoch": 0.3284873413233104,
"grad_norm": 24.03261947631836,
"loss": 4.3504,
"lr": 0.0009081118881118881,
"step": 1158,
"tokens_trained": 0.56915868
},
{
"epoch": 0.3290546769732643,
"grad_norm": 16.029443740844727,
"loss": 4.3192,
"lr": 0.0009078321678321678,
"step": 1160,
"tokens_trained": 0.570142976
},
{
"epoch": 0.3296220126232182,
"grad_norm": 53.486724853515625,
"loss": 4.3921,
"lr": 0.0009075524475524476,
"step": 1162,
"tokens_trained": 0.57112748
},
{
"epoch": 0.3301893482731721,
"grad_norm": 37.42267608642578,
"loss": 4.2821,
"lr": 0.0009072727272727273,
"step": 1164,
"tokens_trained": 0.57211356
},
{
"epoch": 0.330756683923126,
"grad_norm": 28.862472534179688,
"loss": 4.3002,
"lr": 0.000906993006993007,
"step": 1166,
"tokens_trained": 0.57309492
},
{
"epoch": 0.33132401957307994,
"grad_norm": 22.26299476623535,
"loss": 4.2729,
"lr": 0.0009067132867132866,
"step": 1168,
"tokens_trained": 0.5740806
},
{
"epoch": 0.3318913552230338,
"grad_norm": 21.635013580322266,
"loss": 4.2866,
"lr": 0.0009064335664335665,
"step": 1170,
"tokens_trained": 0.575061664
},
{
"epoch": 0.33245869087298774,
"grad_norm": 18.995012283325195,
"loss": 4.2814,
"lr": 0.0009061538461538462,
"step": 1172,
"tokens_trained": 0.576046304
},
{
"epoch": 0.3330260265229416,
"grad_norm": 22.621299743652344,
"loss": 4.2739,
"lr": 0.0009058741258741259,
"step": 1174,
"tokens_trained": 0.577032376
},
{
"epoch": 0.33359336217289554,
"grad_norm": 21.758216857910156,
"loss": 4.263,
"lr": 0.0009055944055944056,
"step": 1176,
"tokens_trained": 0.578013896
},
{
"epoch": 0.33416069782284946,
"grad_norm": 32.38374710083008,
"loss": 4.2713,
"lr": 0.0009053146853146853,
"step": 1178,
"tokens_trained": 0.57900508
},
{
"epoch": 0.33472803347280333,
"grad_norm": 35.57462692260742,
"loss": 4.2986,
"lr": 0.0009050349650349651,
"step": 1180,
"tokens_trained": 0.57999512
},
{
"epoch": 0.33529536912275726,
"grad_norm": 11.77812385559082,
"loss": 4.3085,
"lr": 0.0009047552447552448,
"step": 1182,
"tokens_trained": 0.580982752
},
{
"epoch": 0.33586270477271113,
"grad_norm": 51.48725509643555,
"loss": 4.4003,
"lr": 0.0009044755244755245,
"step": 1184,
"tokens_trained": 0.581964936
},
{
"epoch": 0.33643004042266506,
"grad_norm": 47.01481628417969,
"loss": 4.3182,
"lr": 0.0009041958041958041,
"step": 1186,
"tokens_trained": 0.582949944
},
{
"epoch": 0.336997376072619,
"grad_norm": 22.935691833496094,
"loss": 4.3432,
"lr": 0.000903916083916084,
"step": 1188,
"tokens_trained": 0.583934776
},
{
"epoch": 0.33756471172257285,
"grad_norm": 45.21054458618164,
"loss": 4.4674,
"lr": 0.0009036363636363637,
"step": 1190,
"tokens_trained": 0.584918344
},
{
"epoch": 0.3381320473725268,
"grad_norm": 27.012706756591797,
"loss": 4.2889,
"lr": 0.0009033566433566434,
"step": 1192,
"tokens_trained": 0.585897632
},
{
"epoch": 0.33869938302248065,
"grad_norm": 16.68247413635254,
"loss": 4.2896,
"lr": 0.0009030769230769231,
"step": 1194,
"tokens_trained": 0.586879408
},
{
"epoch": 0.3392667186724346,
"grad_norm": 20.664148330688477,
"loss": 4.304,
"lr": 0.0009027972027972027,
"step": 1196,
"tokens_trained": 0.587859392
},
{
"epoch": 0.3398340543223885,
"grad_norm": 22.954742431640625,
"loss": 4.2853,
"lr": 0.0009025174825174826,
"step": 1198,
"tokens_trained": 0.588845408
},
{
"epoch": 0.34040138997234237,
"grad_norm": 23.226943969726562,
"loss": 4.2597,
"lr": 0.0009022377622377622,
"step": 1200,
"tokens_trained": 0.589832736
},
{
"epoch": 0.3409687256222963,
"grad_norm": 7.963059902191162,
"loss": 4.261,
"lr": 0.000901958041958042,
"step": 1202,
"tokens_trained": 0.590816568
},
{
"epoch": 0.34153606127225017,
"grad_norm": 25.160730361938477,
"loss": 4.3288,
"lr": 0.0009016783216783216,
"step": 1204,
"tokens_trained": 0.59179692
},
{
"epoch": 0.3421033969222041,
"grad_norm": 38.45030212402344,
"loss": 4.3371,
"lr": 0.0009013986013986014,
"step": 1206,
"tokens_trained": 0.592780968
},
{
"epoch": 0.342670732572158,
"grad_norm": 52.66873550415039,
"loss": 4.2805,
"lr": 0.0009011188811188812,
"step": 1208,
"tokens_trained": 0.593760896
},
{
"epoch": 0.3432380682221119,
"grad_norm": 28.104921340942383,
"loss": 4.3885,
"lr": 0.0009008391608391609,
"step": 1210,
"tokens_trained": 0.59474304
},
{
"epoch": 0.3438054038720658,
"grad_norm": 49.20989990234375,
"loss": 4.346,
"lr": 0.0009005594405594406,
"step": 1212,
"tokens_trained": 0.59572768
},
{
"epoch": 0.3443727395220197,
"grad_norm": 20.652427673339844,
"loss": 4.2368,
"lr": 0.0009002797202797202,
"step": 1214,
"tokens_trained": 0.59671092
},
{
"epoch": 0.3449400751719736,
"grad_norm": 17.821596145629883,
"loss": 4.3041,
"lr": 0.0009000000000000001,
"step": 1216,
"tokens_trained": 0.597697344
},
{
"epoch": 0.34550741082192754,
"grad_norm": 48.594932556152344,
"loss": 4.3668,
"lr": 0.0008997202797202797,
"step": 1218,
"tokens_trained": 0.598677288
},
{
"epoch": 0.3460747464718814,
"grad_norm": 27.70078468322754,
"loss": 4.2939,
"lr": 0.0008994405594405595,
"step": 1220,
"tokens_trained": 0.599662488
},
{
"epoch": 0.34664208212183534,
"grad_norm": 25.498798370361328,
"loss": 4.2891,
"lr": 0.0008991608391608391,
"step": 1222,
"tokens_trained": 0.600646904
},
{
"epoch": 0.3472094177717892,
"grad_norm": 13.455835342407227,
"loss": 4.2881,
"lr": 0.0008988811188811188,
"step": 1224,
"tokens_trained": 0.601628112
},
{
"epoch": 0.34777675342174313,
"grad_norm": 17.518342971801758,
"loss": 4.2977,
"lr": 0.0008986013986013987,
"step": 1226,
"tokens_trained": 0.602612336
},
{
"epoch": 0.34834408907169706,
"grad_norm": 20.642597198486328,
"loss": 4.2921,
"lr": 0.0008983216783216783,
"step": 1228,
"tokens_trained": 0.603595
},
{
"epoch": 0.34891142472165093,
"grad_norm": 14.464616775512695,
"loss": 4.233,
"lr": 0.0008980419580419581,
"step": 1230,
"tokens_trained": 0.604576592
},
{
"epoch": 0.34947876037160486,
"grad_norm": 13.204504013061523,
"loss": 4.2707,
"lr": 0.0008977622377622377,
"step": 1232,
"tokens_trained": 0.60555656
},
{
"epoch": 0.35004609602155873,
"grad_norm": 12.241665840148926,
"loss": 4.2506,
"lr": 0.0008974825174825176,
"step": 1234,
"tokens_trained": 0.606536024
},
{
"epoch": 0.35061343167151265,
"grad_norm": 18.187660217285156,
"loss": 4.2659,
"lr": 0.0008972027972027972,
"step": 1236,
"tokens_trained": 0.607522576
},
{
"epoch": 0.3511807673214666,
"grad_norm": 8.911888122558594,
"loss": 4.2505,
"lr": 0.000896923076923077,
"step": 1238,
"tokens_trained": 0.608507736
},
{
"epoch": 0.35174810297142045,
"grad_norm": 21.351713180541992,
"loss": 4.2291,
"lr": 0.0008966433566433566,
"step": 1240,
"tokens_trained": 0.609486688
},
{
"epoch": 0.3523154386213744,
"grad_norm": 47.81566619873047,
"loss": 4.2725,
"lr": 0.0008963636363636363,
"step": 1242,
"tokens_trained": 0.610470272
},
{
"epoch": 0.35288277427132825,
"grad_norm": 33.53351974487305,
"loss": 4.3237,
"lr": 0.0008960839160839162,
"step": 1244,
"tokens_trained": 0.611455176
},
{
"epoch": 0.3534501099212822,
"grad_norm": 15.252607345581055,
"loss": 4.2868,
"lr": 0.0008958041958041958,
"step": 1246,
"tokens_trained": 0.612437888
},
{
"epoch": 0.3540174455712361,
"grad_norm": 24.129865646362305,
"loss": 4.2626,
"lr": 0.0008955244755244756,
"step": 1248,
"tokens_trained": 0.613420728
},
{
"epoch": 0.35458478122118997,
"grad_norm": 34.814605712890625,
"loss": 4.2627,
"lr": 0.0008952447552447552,
"step": 1250,
"tokens_trained": 0.614405904
},
{
"epoch": 0.35458478122118997,
"eval_loss": 1.078355312347412,
"eval_runtime": 20.4723,
"step": 1250,
"tokens_trained": 0.614405904
},
{
"epoch": 0.3551521168711439,
"grad_norm": 18.26809310913086,
"loss": 4.2986,
"lr": 0.000894965034965035,
"step": 1252,
"tokens_trained": 0.615386288
},
{
"epoch": 0.35571945252109777,
"grad_norm": 24.68335723876953,
"loss": 4.3146,
"lr": 0.0008946853146853147,
"step": 1254,
"tokens_trained": 0.616370576
},
{
"epoch": 0.3562867881710517,
"grad_norm": 35.34586715698242,
"loss": 4.2905,
"lr": 0.0008944055944055944,
"step": 1256,
"tokens_trained": 0.617351944
},
{
"epoch": 0.3568541238210056,
"grad_norm": 22.668407440185547,
"loss": 4.2607,
"lr": 0.0008941258741258741,
"step": 1258,
"tokens_trained": 0.618334816
},
{
"epoch": 0.3574214594709595,
"grad_norm": 14.068164825439453,
"loss": 4.2459,
"lr": 0.0008938461538461538,
"step": 1260,
"tokens_trained": 0.619319736
},
{
"epoch": 0.3579887951209134,
"grad_norm": 8.274995803833008,
"loss": 4.2713,
"lr": 0.0008935664335664337,
"step": 1262,
"tokens_trained": 0.620299344
},
{
"epoch": 0.3585561307708673,
"grad_norm": 22.12897491455078,
"loss": 4.2841,
"lr": 0.0008932867132867133,
"step": 1264,
"tokens_trained": 0.621282592
},
{
"epoch": 0.3591234664208212,
"grad_norm": 26.171052932739258,
"loss": 4.2505,
"lr": 0.000893006993006993,
"step": 1266,
"tokens_trained": 0.622266136
},
{
"epoch": 0.35969080207077514,
"grad_norm": 14.768603324890137,
"loss": 4.271,
"lr": 0.0008927272727272727,
"step": 1268,
"tokens_trained": 0.623247816
},
{
"epoch": 0.360258137720729,
"grad_norm": 13.065408706665039,
"loss": 4.2387,
"lr": 0.0008924475524475525,
"step": 1270,
"tokens_trained": 0.624234848
},
{
"epoch": 0.36082547337068294,
"grad_norm": 14.043888092041016,
"loss": 4.2601,
"lr": 0.0008921678321678322,
"step": 1272,
"tokens_trained": 0.625214176
},
{
"epoch": 0.3613928090206368,
"grad_norm": 13.734328269958496,
"loss": 4.2426,
"lr": 0.0008918881118881119,
"step": 1274,
"tokens_trained": 0.626197608
},
{
"epoch": 0.36196014467059073,
"grad_norm": 10.075374603271484,
"loss": 4.2259,
"lr": 0.0008916083916083916,
"step": 1276,
"tokens_trained": 0.62717884
},
{
"epoch": 0.36252748032054466,
"grad_norm": 33.92001724243164,
"loss": 4.3054,
"lr": 0.0008913286713286713,
"step": 1278,
"tokens_trained": 0.628166888
},
{
"epoch": 0.36309481597049853,
"grad_norm": 31.1391544342041,
"loss": 4.3066,
"lr": 0.0008910489510489512,
"step": 1280,
"tokens_trained": 0.629152528
},
{
"epoch": 0.36366215162045246,
"grad_norm": 10.888711929321289,
"loss": 4.2348,
"lr": 0.0008907692307692308,
"step": 1282,
"tokens_trained": 0.630132584
},
{
"epoch": 0.3642294872704063,
"grad_norm": 27.298410415649414,
"loss": 4.3225,
"lr": 0.0008904895104895105,
"step": 1284,
"tokens_trained": 0.63111212
},
{
"epoch": 0.36479682292036025,
"grad_norm": 23.396818161010742,
"loss": 4.3177,
"lr": 0.0008902097902097902,
"step": 1286,
"tokens_trained": 0.632094984
},
{
"epoch": 0.3653641585703142,
"grad_norm": 18.824432373046875,
"loss": 4.2235,
"lr": 0.00088993006993007,
"step": 1288,
"tokens_trained": 0.633076832
},
{
"epoch": 0.36593149422026805,
"grad_norm": 8.04826545715332,
"loss": 4.2268,
"lr": 0.0008896503496503497,
"step": 1290,
"tokens_trained": 0.63405868
},
{
"epoch": 0.366498829870222,
"grad_norm": 32.26673889160156,
"loss": 4.3113,
"lr": 0.0008893706293706294,
"step": 1292,
"tokens_trained": 0.635045096
},
{
"epoch": 0.36706616552017585,
"grad_norm": 29.91358184814453,
"loss": 4.2971,
"lr": 0.000889090909090909,
"step": 1294,
"tokens_trained": 0.63603008
},
{
"epoch": 0.3676335011701298,
"grad_norm": 12.093538284301758,
"loss": 4.2502,
"lr": 0.0008888111888111888,
"step": 1296,
"tokens_trained": 0.637014016
},
{
"epoch": 0.3682008368200837,
"grad_norm": 8.252509117126465,
"loss": 4.2905,
"lr": 0.0008885314685314686,
"step": 1298,
"tokens_trained": 0.637997752
},
{
"epoch": 0.36876817247003757,
"grad_norm": 61.22240447998047,
"loss": 4.4753,
"lr": 0.0008882517482517483,
"step": 1300,
"tokens_trained": 0.638981552
},
{
"epoch": 0.3693355081199915,
"grad_norm": 47.58195877075195,
"loss": 4.2769,
"lr": 0.000887972027972028,
"step": 1302,
"tokens_trained": 0.639963512
},
{
"epoch": 0.36990284376994537,
"grad_norm": 28.806411743164062,
"loss": 4.3728,
"lr": 0.0008876923076923077,
"step": 1304,
"tokens_trained": 0.640948392
},
{
"epoch": 0.3704701794198993,
"grad_norm": 38.960853576660156,
"loss": 4.338,
"lr": 0.0008874125874125875,
"step": 1306,
"tokens_trained": 0.641935304
},
{
"epoch": 0.3710375150698532,
"grad_norm": 25.05726432800293,
"loss": 4.3002,
"lr": 0.0008871328671328671,
"step": 1308,
"tokens_trained": 0.642924168
},
{
"epoch": 0.3716048507198071,
"grad_norm": 39.84127426147461,
"loss": 4.3593,
"lr": 0.0008868531468531469,
"step": 1310,
"tokens_trained": 0.64390412
},
{
"epoch": 0.372172186369761,
"grad_norm": 15.03055191040039,
"loss": 4.223,
"lr": 0.0008865734265734265,
"step": 1312,
"tokens_trained": 0.644882104
},
{
"epoch": 0.3727395220197149,
"grad_norm": 41.85628890991211,
"loss": 4.3819,
"lr": 0.0008862937062937063,
"step": 1314,
"tokens_trained": 0.645866912
},
{
"epoch": 0.3733068576696688,
"grad_norm": 29.014118194580078,
"loss": 4.2843,
"lr": 0.0008860139860139861,
"step": 1316,
"tokens_trained": 0.646850376
},
{
"epoch": 0.37387419331962274,
"grad_norm": 24.407743453979492,
"loss": 4.2598,
"lr": 0.0008857342657342658,
"step": 1318,
"tokens_trained": 0.647832272
},
{
"epoch": 0.3744415289695766,
"grad_norm": 23.28154182434082,
"loss": 4.2162,
"lr": 0.0008854545454545455,
"step": 1320,
"tokens_trained": 0.64881652
},
{
"epoch": 0.37500886461953054,
"grad_norm": 17.70418930053711,
"loss": 4.2386,
"lr": 0.0008851748251748251,
"step": 1322,
"tokens_trained": 0.649794936
},
{
"epoch": 0.37557620026948446,
"grad_norm": 22.582124710083008,
"loss": 4.2358,
"lr": 0.000884895104895105,
"step": 1324,
"tokens_trained": 0.650777784
},
{
"epoch": 0.37614353591943833,
"grad_norm": 16.77848243713379,
"loss": 4.2536,
"lr": 0.0008846153846153846,
"step": 1326,
"tokens_trained": 0.651762472
},
{
"epoch": 0.37671087156939226,
"grad_norm": 14.382417678833008,
"loss": 4.2403,
"lr": 0.0008843356643356644,
"step": 1328,
"tokens_trained": 0.652741832
},
{
"epoch": 0.37727820721934613,
"grad_norm": 22.420886993408203,
"loss": 4.1977,
"lr": 0.000884055944055944,
"step": 1330,
"tokens_trained": 0.653725792
},
{
"epoch": 0.37784554286930006,
"grad_norm": 9.768660545349121,
"loss": 4.2148,
"lr": 0.0008837762237762238,
"step": 1332,
"tokens_trained": 0.654704648
},
{
"epoch": 0.378412878519254,
"grad_norm": 5.091487407684326,
"loss": 4.2062,
"lr": 0.0008834965034965036,
"step": 1334,
"tokens_trained": 0.65569176
},
{
"epoch": 0.37898021416920785,
"grad_norm": 53.520957946777344,
"loss": 4.4082,
"lr": 0.0008832167832167832,
"step": 1336,
"tokens_trained": 0.656679344
},
{
"epoch": 0.3795475498191618,
"grad_norm": 32.17420959472656,
"loss": 4.2911,
"lr": 0.000882937062937063,
"step": 1338,
"tokens_trained": 0.657665136
},
{
"epoch": 0.38011488546911565,
"grad_norm": 14.12790584564209,
"loss": 4.2899,
"lr": 0.0008826573426573426,
"step": 1340,
"tokens_trained": 0.658651576
},
{
"epoch": 0.3806822211190696,
"grad_norm": 51.74199676513672,
"loss": 4.3901,
"lr": 0.0008823776223776225,
"step": 1342,
"tokens_trained": 0.659631792
},
{
"epoch": 0.3812495567690235,
"grad_norm": 48.99909973144531,
"loss": 4.298,
"lr": 0.0008820979020979021,
"step": 1344,
"tokens_trained": 0.660616912
},
{
"epoch": 0.38181689241897737,
"grad_norm": 28.356245040893555,
"loss": 4.3171,
"lr": 0.0008818181818181819,
"step": 1346,
"tokens_trained": 0.66159872
},
{
"epoch": 0.3823842280689313,
"grad_norm": 45.081703186035156,
"loss": 4.3067,
"lr": 0.0008815384615384615,
"step": 1348,
"tokens_trained": 0.662582152
},
{
"epoch": 0.38295156371888517,
"grad_norm": 37.175052642822266,
"loss": 4.241,
"lr": 0.0008812587412587412,
"step": 1350,
"tokens_trained": 0.663561176
},
{
"epoch": 0.3835188993688391,
"grad_norm": 49.46076965332031,
"loss": 4.2896,
"lr": 0.0008809790209790211,
"step": 1352,
"tokens_trained": 0.664545144
},
{
"epoch": 0.384086235018793,
"grad_norm": 22.20182991027832,
"loss": 4.323,
"lr": 0.0008806993006993007,
"step": 1354,
"tokens_trained": 0.66553092
},
{
"epoch": 0.3846535706687469,
"grad_norm": 34.111549377441406,
"loss": 4.3138,
"lr": 0.0008804195804195805,
"step": 1356,
"tokens_trained": 0.666517568
},
{
"epoch": 0.3852209063187008,
"grad_norm": 47.01582336425781,
"loss": 4.3009,
"lr": 0.0008801398601398601,
"step": 1358,
"tokens_trained": 0.667498192
},
{
"epoch": 0.3857882419686547,
"grad_norm": 18.845388412475586,
"loss": 4.3176,
"lr": 0.00087986013986014,
"step": 1360,
"tokens_trained": 0.668479008
},
{
"epoch": 0.3863555776186086,
"grad_norm": 53.68927764892578,
"loss": 4.4024,
"lr": 0.0008795804195804196,
"step": 1362,
"tokens_trained": 0.669462472
},
{
"epoch": 0.38692291326856254,
"grad_norm": 29.88358497619629,
"loss": 4.286,
"lr": 0.0008793006993006993,
"step": 1364,
"tokens_trained": 0.67044392
},
{
"epoch": 0.3874902489185164,
"grad_norm": 11.12879753112793,
"loss": 4.3024,
"lr": 0.000879020979020979,
"step": 1366,
"tokens_trained": 0.671424552
},
{
"epoch": 0.38805758456847034,
"grad_norm": 23.573301315307617,
"loss": 4.2662,
"lr": 0.0008787412587412587,
"step": 1368,
"tokens_trained": 0.672409992
},
{
"epoch": 0.3886249202184242,
"grad_norm": 24.749160766601562,
"loss": 4.274,
"lr": 0.0008784615384615386,
"step": 1370,
"tokens_trained": 0.67339824
},
{
"epoch": 0.38919225586837813,
"grad_norm": 33.26881408691406,
"loss": 4.2588,
"lr": 0.0008781818181818182,
"step": 1372,
"tokens_trained": 0.67438204
},
{
"epoch": 0.38975959151833206,
"grad_norm": 24.466472625732422,
"loss": 4.2837,
"lr": 0.000877902097902098,
"step": 1374,
"tokens_trained": 0.67536356
},
{
"epoch": 0.39004325934330897,
"eval_loss": 1.0616238117218018,
"eval_runtime": 20.3698,
"step": 1375,
"tokens_trained": 0.675855672
},
{
"epoch": 0.39032692716828593,
"grad_norm": 24.48844337463379,
"loss": 4.259,
"lr": 0.0008776223776223776,
"step": 1376,
"tokens_trained": 0.676346368
},
{
"epoch": 0.39089426281823986,
"grad_norm": 30.594989776611328,
"loss": 4.1894,
"lr": 0.0008773426573426574,
"step": 1378,
"tokens_trained": 0.677329312
},
{
"epoch": 0.3914615984681937,
"grad_norm": 19.835350036621094,
"loss": 4.2718,
"lr": 0.0008770629370629371,
"step": 1380,
"tokens_trained": 0.678312272
},
{
"epoch": 0.39202893411814765,
"grad_norm": 14.570358276367188,
"loss": 4.2419,
"lr": 0.0008767832167832168,
"step": 1382,
"tokens_trained": 0.679291216
},
{
"epoch": 0.3925962697681016,
"grad_norm": 11.608271598815918,
"loss": 4.1917,
"lr": 0.0008765034965034965,
"step": 1384,
"tokens_trained": 0.680273296
},
{
"epoch": 0.39316360541805545,
"grad_norm": 26.094860076904297,
"loss": 4.2762,
"lr": 0.0008762237762237762,
"step": 1386,
"tokens_trained": 0.681249464
},
{
"epoch": 0.3937309410680094,
"grad_norm": 12.754049301147461,
"loss": 4.2032,
"lr": 0.0008759440559440561,
"step": 1388,
"tokens_trained": 0.682234168
},
{
"epoch": 0.39429827671796325,
"grad_norm": 5.951663970947266,
"loss": 4.1921,
"lr": 0.0008756643356643357,
"step": 1390,
"tokens_trained": 0.683217176
},
{
"epoch": 0.3948656123679172,
"grad_norm": 26.907669067382812,
"loss": 4.24,
"lr": 0.0008753846153846154,
"step": 1392,
"tokens_trained": 0.68419888
},
{
"epoch": 0.3954329480178711,
"grad_norm": 25.04796600341797,
"loss": 4.2656,
"lr": 0.0008751048951048951,
"step": 1394,
"tokens_trained": 0.685178784
},
{
"epoch": 0.39600028366782497,
"grad_norm": 19.600811004638672,
"loss": 4.2683,
"lr": 0.0008748251748251749,
"step": 1396,
"tokens_trained": 0.686161632
},
{
"epoch": 0.3965676193177789,
"grad_norm": 14.087088584899902,
"loss": 4.2658,
"lr": 0.0008745454545454546,
"step": 1398,
"tokens_trained": 0.687139992
},
{
"epoch": 0.39713495496773277,
"grad_norm": 9.257765769958496,
"loss": 4.2021,
"lr": 0.0008742657342657343,
"step": 1400,
"tokens_trained": 0.688117912
},
{
"epoch": 0.3977022906176867,
"grad_norm": 18.830154418945312,
"loss": 4.2249,
"lr": 0.0008739860139860139,
"step": 1402,
"tokens_trained": 0.689098776
},
{
"epoch": 0.3982696262676406,
"grad_norm": 24.81566619873047,
"loss": 4.246,
"lr": 0.0008737062937062937,
"step": 1404,
"tokens_trained": 0.690085432
},
{
"epoch": 0.3988369619175945,
"grad_norm": 14.071616172790527,
"loss": 4.2531,
"lr": 0.0008734265734265734,
"step": 1406,
"tokens_trained": 0.691069232
},
{
"epoch": 0.3994042975675484,
"grad_norm": 21.414424896240234,
"loss": 4.2192,
"lr": 0.0008731468531468532,
"step": 1408,
"tokens_trained": 0.692051224
},
{
"epoch": 0.3999716332175023,
"grad_norm": 38.74683380126953,
"loss": 4.2421,
"lr": 0.0008728671328671329,
"step": 1410,
"tokens_trained": 0.693029976
},
{
"epoch": 0.4005389688674562,
"grad_norm": 12.595442771911621,
"loss": 4.2569,
"lr": 0.0008725874125874126,
"step": 1412,
"tokens_trained": 0.694013304
},
{
"epoch": 0.40110630451741014,
"grad_norm": 55.233673095703125,
"loss": 4.3422,
"lr": 0.0008723076923076924,
"step": 1414,
"tokens_trained": 0.694997536
},
{
"epoch": 0.401673640167364,
"grad_norm": 24.717113494873047,
"loss": 4.2567,
"lr": 0.000872027972027972,
"step": 1416,
"tokens_trained": 0.695982632
},
{
"epoch": 0.40224097581731794,
"grad_norm": 20.552875518798828,
"loss": 4.2464,
"lr": 0.0008717482517482518,
"step": 1418,
"tokens_trained": 0.696966408
},
{
"epoch": 0.4028083114672718,
"grad_norm": 25.569900512695312,
"loss": 4.21,
"lr": 0.0008714685314685314,
"step": 1420,
"tokens_trained": 0.697948224
},
{
"epoch": 0.40337564711722573,
"grad_norm": 24.538320541381836,
"loss": 4.2605,
"lr": 0.0008711888111888112,
"step": 1422,
"tokens_trained": 0.698934688
},
{
"epoch": 0.40394298276717966,
"grad_norm": 9.585651397705078,
"loss": 4.2524,
"lr": 0.0008709090909090909,
"step": 1424,
"tokens_trained": 0.699921976
},
{
"epoch": 0.40451031841713353,
"grad_norm": 11.886672973632812,
"loss": 4.1934,
"lr": 0.0008706293706293707,
"step": 1426,
"tokens_trained": 0.70090396
},
{
"epoch": 0.40507765406708746,
"grad_norm": 26.162124633789062,
"loss": 4.2412,
"lr": 0.0008703496503496504,
"step": 1428,
"tokens_trained": 0.701888448
},
{
"epoch": 0.4056449897170413,
"grad_norm": 5.03931188583374,
"loss": 4.202,
"lr": 0.00087006993006993,
"step": 1430,
"tokens_trained": 0.702864336
},
{
"epoch": 0.40621232536699525,
"grad_norm": 33.67579650878906,
"loss": 4.3087,
"lr": 0.0008697902097902099,
"step": 1432,
"tokens_trained": 0.703847784
},
{
"epoch": 0.4067796610169492,
"grad_norm": 34.38542556762695,
"loss": 4.2807,
"lr": 0.0008695104895104895,
"step": 1434,
"tokens_trained": 0.704827288
},
{
"epoch": 0.40734699666690305,
"grad_norm": 13.319886207580566,
"loss": 4.3332,
"lr": 0.0008692307692307693,
"step": 1436,
"tokens_trained": 0.705815392
},
{
"epoch": 0.407914332316857,
"grad_norm": 36.58311080932617,
"loss": 4.3318,
"lr": 0.0008689510489510489,
"step": 1438,
"tokens_trained": 0.7067914
},
{
"epoch": 0.40848166796681085,
"grad_norm": 29.63648223876953,
"loss": 4.2962,
"lr": 0.0008686713286713287,
"step": 1440,
"tokens_trained": 0.70777396
},
{
"epoch": 0.4090490036167648,
"grad_norm": 9.55128002166748,
"loss": 4.2773,
"lr": 0.0008683916083916084,
"step": 1442,
"tokens_trained": 0.708750496
},
{
"epoch": 0.4096163392667187,
"grad_norm": 53.83981704711914,
"loss": 4.3875,
"lr": 0.0008681118881118881,
"step": 1444,
"tokens_trained": 0.709730168
},
{
"epoch": 0.41018367491667257,
"grad_norm": 54.59236526489258,
"loss": 4.3582,
"lr": 0.0008678321678321679,
"step": 1446,
"tokens_trained": 0.710709704
},
{
"epoch": 0.4107510105666265,
"grad_norm": 13.964411735534668,
"loss": 4.3065,
"lr": 0.0008675524475524475,
"step": 1448,
"tokens_trained": 0.711690136
},
{
"epoch": 0.41131834621658037,
"grad_norm": 25.506649017333984,
"loss": 4.2686,
"lr": 0.0008672727272727273,
"step": 1450,
"tokens_trained": 0.712668056
},
{
"epoch": 0.4118856818665343,
"grad_norm": 21.1628360748291,
"loss": 4.2485,
"lr": 0.000866993006993007,
"step": 1452,
"tokens_trained": 0.71365004
},
{
"epoch": 0.4124530175164882,
"grad_norm": 15.751238822937012,
"loss": 4.2078,
"lr": 0.0008667132867132868,
"step": 1454,
"tokens_trained": 0.714632032
},
{
"epoch": 0.4130203531664421,
"grad_norm": 15.838552474975586,
"loss": 4.1944,
"lr": 0.0008664335664335664,
"step": 1456,
"tokens_trained": 0.715611376
},
{
"epoch": 0.413587688816396,
"grad_norm": 15.968609809875488,
"loss": 4.1768,
"lr": 0.0008661538461538461,
"step": 1458,
"tokens_trained": 0.716591112
},
{
"epoch": 0.4141550244663499,
"grad_norm": 15.419891357421875,
"loss": 4.1978,
"lr": 0.0008658741258741259,
"step": 1460,
"tokens_trained": 0.717575952
},
{
"epoch": 0.4147223601163038,
"grad_norm": 15.088132858276367,
"loss": 4.2361,
"lr": 0.0008655944055944056,
"step": 1462,
"tokens_trained": 0.718563696
},
{
"epoch": 0.41528969576625774,
"grad_norm": 4.839190483093262,
"loss": 4.2089,
"lr": 0.0008653146853146854,
"step": 1464,
"tokens_trained": 0.71954848
},
{
"epoch": 0.4158570314162116,
"grad_norm": 22.192466735839844,
"loss": 4.2109,
"lr": 0.000865034965034965,
"step": 1466,
"tokens_trained": 0.720533304
},
{
"epoch": 0.41642436706616553,
"grad_norm": 28.983531951904297,
"loss": 4.2402,
"lr": 0.0008647552447552448,
"step": 1468,
"tokens_trained": 0.721518176
},
{
"epoch": 0.4169917027161194,
"grad_norm": 21.010780334472656,
"loss": 4.1732,
"lr": 0.0008644755244755245,
"step": 1470,
"tokens_trained": 0.72250176
},
{
"epoch": 0.41755903836607333,
"grad_norm": 14.59277057647705,
"loss": 4.1847,
"lr": 0.0008641958041958042,
"step": 1472,
"tokens_trained": 0.723486664
},
{
"epoch": 0.41812637401602726,
"grad_norm": 13.688531875610352,
"loss": 4.1577,
"lr": 0.0008639160839160839,
"step": 1474,
"tokens_trained": 0.724469328
},
{
"epoch": 0.41869370966598113,
"grad_norm": 15.879347801208496,
"loss": 4.1721,
"lr": 0.0008636363636363636,
"step": 1476,
"tokens_trained": 0.725454968
},
{
"epoch": 0.41926104531593505,
"grad_norm": 10.225201606750488,
"loss": 4.1999,
"lr": 0.0008633566433566434,
"step": 1478,
"tokens_trained": 0.7264426
},
{
"epoch": 0.4198283809658889,
"grad_norm": 17.007728576660156,
"loss": 4.2229,
"lr": 0.0008630769230769231,
"step": 1480,
"tokens_trained": 0.727422056
},
{
"epoch": 0.42039571661584285,
"grad_norm": 13.517934799194336,
"loss": 4.2241,
"lr": 0.0008627972027972029,
"step": 1482,
"tokens_trained": 0.728403688
},
{
"epoch": 0.4209630522657968,
"grad_norm": 17.132064819335938,
"loss": 4.1679,
"lr": 0.0008625174825174825,
"step": 1484,
"tokens_trained": 0.729386248
},
{
"epoch": 0.42153038791575065,
"grad_norm": 19.782320022583008,
"loss": 4.1817,
"lr": 0.0008622377622377622,
"step": 1486,
"tokens_trained": 0.730368752
},
{
"epoch": 0.4220977235657046,
"grad_norm": 3.388552188873291,
"loss": 4.1726,
"lr": 0.000861958041958042,
"step": 1488,
"tokens_trained": 0.731354304
},
{
"epoch": 0.42266505921565845,
"grad_norm": 28.33499526977539,
"loss": 4.2623,
"lr": 0.0008616783216783217,
"step": 1490,
"tokens_trained": 0.732337296
},
{
"epoch": 0.42323239486561237,
"grad_norm": 24.927406311035156,
"loss": 4.2422,
"lr": 0.0008613986013986014,
"step": 1492,
"tokens_trained": 0.733319824
},
{
"epoch": 0.4237997305155663,
"grad_norm": 25.996028900146484,
"loss": 4.2227,
"lr": 0.0008611188811188811,
"step": 1494,
"tokens_trained": 0.73430636
},
{
"epoch": 0.42436706616552017,
"grad_norm": 14.625783920288086,
"loss": 4.2268,
"lr": 0.0008608391608391609,
"step": 1496,
"tokens_trained": 0.735285848
},
{
"epoch": 0.4249344018154741,
"grad_norm": 12.556640625,
"loss": 4.2352,
"lr": 0.0008605594405594406,
"step": 1498,
"tokens_trained": 0.736270632
},
{
"epoch": 0.42550173746542796,
"grad_norm": 18.579416275024414,
"loss": 4.2377,
"lr": 0.0008602797202797203,
"step": 1500,
"tokens_trained": 0.737255104
},
{
"epoch": 0.42550173746542796,
"eval_loss": 1.052606463432312,
"eval_runtime": 20.5089,
"step": 1500,
"tokens_trained": 0.737255104
},
{
"epoch": 0.4260690731153819,
"grad_norm": 16.550657272338867,
"loss": 4.182,
"lr": 0.00086,
"step": 1502,
"tokens_trained": 0.738240848
},
{
"epoch": 0.4266364087653358,
"grad_norm": 24.4381046295166,
"loss": 4.2093,
"lr": 0.0008597202797202797,
"step": 1504,
"tokens_trained": 0.73922592
},
{
"epoch": 0.4272037444152897,
"grad_norm": 13.155163764953613,
"loss": 4.239,
"lr": 0.0008594405594405595,
"step": 1506,
"tokens_trained": 0.740208896
},
{
"epoch": 0.4277710800652436,
"grad_norm": 27.667949676513672,
"loss": 4.2607,
"lr": 0.0008591608391608392,
"step": 1508,
"tokens_trained": 0.741189312
},
{
"epoch": 0.4283384157151975,
"grad_norm": 35.897743225097656,
"loss": 4.2153,
"lr": 0.0008588811188811188,
"step": 1510,
"tokens_trained": 0.742170456
},
{
"epoch": 0.4289057513651514,
"grad_norm": 18.16407012939453,
"loss": 4.2753,
"lr": 0.0008586013986013986,
"step": 1512,
"tokens_trained": 0.743152504
},
{
"epoch": 0.42947308701510534,
"grad_norm": 27.447364807128906,
"loss": 4.2321,
"lr": 0.0008583216783216783,
"step": 1514,
"tokens_trained": 0.744139768
},
{
"epoch": 0.4300404226650592,
"grad_norm": 21.115859985351562,
"loss": 4.2048,
"lr": 0.0008580419580419581,
"step": 1516,
"tokens_trained": 0.745122368
},
{
"epoch": 0.43060775831501313,
"grad_norm": 5.949585914611816,
"loss": 4.1787,
"lr": 0.0008577622377622378,
"step": 1518,
"tokens_trained": 0.746104936
},
{
"epoch": 0.431175093964967,
"grad_norm": 6.631585121154785,
"loss": 4.2035,
"lr": 0.0008574825174825175,
"step": 1520,
"tokens_trained": 0.747086264
},
{
"epoch": 0.43174242961492093,
"grad_norm": 38.91585159301758,
"loss": 4.354,
"lr": 0.0008572027972027972,
"step": 1522,
"tokens_trained": 0.74806844
},
{
"epoch": 0.43230976526487486,
"grad_norm": 37.53727722167969,
"loss": 4.228,
"lr": 0.000856923076923077,
"step": 1524,
"tokens_trained": 0.749052432
},
{
"epoch": 0.4328771009148287,
"grad_norm": 19.87713623046875,
"loss": 4.2696,
"lr": 0.0008566433566433567,
"step": 1526,
"tokens_trained": 0.750037072
},
{
"epoch": 0.43344443656478265,
"grad_norm": 25.615995407104492,
"loss": 4.2676,
"lr": 0.0008563636363636363,
"step": 1528,
"tokens_trained": 0.751020584
},
{
"epoch": 0.4340117722147365,
"grad_norm": 16.643299102783203,
"loss": 4.201,
"lr": 0.0008560839160839161,
"step": 1530,
"tokens_trained": 0.75200224
},
{
"epoch": 0.43457910786469045,
"grad_norm": 16.207853317260742,
"loss": 4.1944,
"lr": 0.0008558041958041958,
"step": 1532,
"tokens_trained": 0.752981624
},
{
"epoch": 0.4351464435146444,
"grad_norm": 27.054973602294922,
"loss": 4.2188,
"lr": 0.0008555244755244756,
"step": 1534,
"tokens_trained": 0.753968464
},
{
"epoch": 0.43571377916459825,
"grad_norm": 33.468238830566406,
"loss": 4.2052,
"lr": 0.0008552447552447553,
"step": 1536,
"tokens_trained": 0.754950976
},
{
"epoch": 0.4362811148145522,
"grad_norm": 21.083576202392578,
"loss": 4.2514,
"lr": 0.000854965034965035,
"step": 1538,
"tokens_trained": 0.755938272
},
{
"epoch": 0.43684845046450604,
"grad_norm": 19.927122116088867,
"loss": 4.2493,
"lr": 0.0008546853146853147,
"step": 1540,
"tokens_trained": 0.756916784
},
{
"epoch": 0.43741578611445997,
"grad_norm": 22.105287551879883,
"loss": 4.2264,
"lr": 0.0008544055944055944,
"step": 1542,
"tokens_trained": 0.757901152
},
{
"epoch": 0.4379831217644139,
"grad_norm": 22.448705673217773,
"loss": 4.1987,
"lr": 0.0008541258741258742,
"step": 1544,
"tokens_trained": 0.758886048
},
{
"epoch": 0.43855045741436777,
"grad_norm": 17.740005493164062,
"loss": 4.1918,
"lr": 0.0008538461538461538,
"step": 1546,
"tokens_trained": 0.759864304
},
{
"epoch": 0.4391177930643217,
"grad_norm": 20.58041763305664,
"loss": 4.2144,
"lr": 0.0008535664335664336,
"step": 1548,
"tokens_trained": 0.760844312
},
{
"epoch": 0.43968512871427556,
"grad_norm": 21.937252044677734,
"loss": 4.2129,
"lr": 0.0008532867132867133,
"step": 1550,
"tokens_trained": 0.761827256
},
{
"epoch": 0.4402524643642295,
"grad_norm": 26.883426666259766,
"loss": 4.2244,
"lr": 0.000853006993006993,
"step": 1552,
"tokens_trained": 0.7628098
},
{
"epoch": 0.4408198000141834,
"grad_norm": 10.297266960144043,
"loss": 4.1724,
"lr": 0.0008527272727272728,
"step": 1554,
"tokens_trained": 0.763792488
},
{
"epoch": 0.4413871356641373,
"grad_norm": 12.119601249694824,
"loss": 4.1828,
"lr": 0.0008524475524475524,
"step": 1556,
"tokens_trained": 0.764769936
},
{
"epoch": 0.4419544713140912,
"grad_norm": 16.565885543823242,
"loss": 4.2113,
"lr": 0.0008521678321678322,
"step": 1558,
"tokens_trained": 0.765752376
},
{
"epoch": 0.4425218069640451,
"grad_norm": 18.860309600830078,
"loss": 4.1864,
"lr": 0.0008518881118881119,
"step": 1560,
"tokens_trained": 0.766736256
},
{
"epoch": 0.443089142613999,
"grad_norm": 4.049737453460693,
"loss": 4.2108,
"lr": 0.0008516083916083917,
"step": 1562,
"tokens_trained": 0.767720568
},
{
"epoch": 0.44365647826395294,
"grad_norm": 15.730945587158203,
"loss": 4.2339,
"lr": 0.0008513286713286713,
"step": 1564,
"tokens_trained": 0.768701288
},
{
"epoch": 0.4442238139139068,
"grad_norm": 18.64398956298828,
"loss": 4.2132,
"lr": 0.000851048951048951,
"step": 1566,
"tokens_trained": 0.769681336
},
{
"epoch": 0.44479114956386073,
"grad_norm": 22.01759147644043,
"loss": 4.2211,
"lr": 0.0008507692307692308,
"step": 1568,
"tokens_trained": 0.770661168
},
{
"epoch": 0.4453584852138146,
"grad_norm": 3.097306489944458,
"loss": 4.2114,
"lr": 0.0008504895104895105,
"step": 1570,
"tokens_trained": 0.7716424
},
{
"epoch": 0.44592582086376853,
"grad_norm": 35.901546478271484,
"loss": 4.3,
"lr": 0.0008502097902097903,
"step": 1572,
"tokens_trained": 0.772627536
},
{
"epoch": 0.44649315651372246,
"grad_norm": 20.762710571289062,
"loss": 4.2465,
"lr": 0.0008499300699300699,
"step": 1574,
"tokens_trained": 0.77361008
},
{
"epoch": 0.4470604921636763,
"grad_norm": 13.54304027557373,
"loss": 4.221,
"lr": 0.0008496503496503497,
"step": 1576,
"tokens_trained": 0.774591184
},
{
"epoch": 0.44762782781363025,
"grad_norm": 18.83641242980957,
"loss": 4.2228,
"lr": 0.0008493706293706294,
"step": 1578,
"tokens_trained": 0.775574136
},
{
"epoch": 0.4481951634635841,
"grad_norm": 12.294941902160645,
"loss": 4.1768,
"lr": 0.0008490909090909091,
"step": 1580,
"tokens_trained": 0.776554752
},
{
"epoch": 0.44876249911353805,
"grad_norm": 5.768923759460449,
"loss": 4.2255,
"lr": 0.0008488111888111888,
"step": 1582,
"tokens_trained": 0.777539368
},
{
"epoch": 0.449329834763492,
"grad_norm": 7.9961137771606445,
"loss": 4.2218,
"lr": 0.0008485314685314685,
"step": 1584,
"tokens_trained": 0.778522344
},
{
"epoch": 0.44989717041344585,
"grad_norm": 22.005645751953125,
"loss": 4.2452,
"lr": 0.0008482517482517483,
"step": 1586,
"tokens_trained": 0.77950768
},
{
"epoch": 0.45046450606339977,
"grad_norm": 27.313426971435547,
"loss": 4.1875,
"lr": 0.000847972027972028,
"step": 1588,
"tokens_trained": 0.780490984
},
{
"epoch": 0.45103184171335364,
"grad_norm": 10.344687461853027,
"loss": 4.2356,
"lr": 0.0008476923076923078,
"step": 1590,
"tokens_trained": 0.781469
},
{
"epoch": 0.45159917736330757,
"grad_norm": 27.348726272583008,
"loss": 4.2962,
"lr": 0.0008474125874125874,
"step": 1592,
"tokens_trained": 0.782450304
},
{
"epoch": 0.4521665130132615,
"grad_norm": 32.965911865234375,
"loss": 4.2736,
"lr": 0.0008471328671328671,
"step": 1594,
"tokens_trained": 0.783431416
},
{
"epoch": 0.45273384866321537,
"grad_norm": 7.752636909484863,
"loss": 4.2074,
"lr": 0.0008468531468531469,
"step": 1596,
"tokens_trained": 0.784409568
},
{
"epoch": 0.4533011843131693,
"grad_norm": 38.85223388671875,
"loss": 4.3261,
"lr": 0.0008465734265734266,
"step": 1598,
"tokens_trained": 0.785399368
},
{
"epoch": 0.45386851996312316,
"grad_norm": 38.017967224121094,
"loss": 4.2646,
"lr": 0.0008462937062937063,
"step": 1600,
"tokens_trained": 0.786376072
},
{
"epoch": 0.4544358556130771,
"grad_norm": 7.856576442718506,
"loss": 4.191,
"lr": 0.000846013986013986,
"step": 1602,
"tokens_trained": 0.787362072
},
{
"epoch": 0.455003191263031,
"grad_norm": 37.902870178222656,
"loss": 4.2651,
"lr": 0.0008457342657342658,
"step": 1604,
"tokens_trained": 0.788345104
},
{
"epoch": 0.4555705269129849,
"grad_norm": 7.724793434143066,
"loss": 4.1994,
"lr": 0.0008454545454545455,
"step": 1606,
"tokens_trained": 0.7893314
},
{
"epoch": 0.4561378625629388,
"grad_norm": 26.484699249267578,
"loss": 4.2276,
"lr": 0.0008451748251748252,
"step": 1608,
"tokens_trained": 0.790309344
},
{
"epoch": 0.4567051982128927,
"grad_norm": 23.137874603271484,
"loss": 4.2082,
"lr": 0.0008448951048951049,
"step": 1610,
"tokens_trained": 0.791295784
},
{
"epoch": 0.4572725338628466,
"grad_norm": 13.902606964111328,
"loss": 4.2035,
"lr": 0.0008446153846153846,
"step": 1612,
"tokens_trained": 0.79228076
},
{
"epoch": 0.45783986951280053,
"grad_norm": 8.438498497009277,
"loss": 4.1713,
"lr": 0.0008443356643356644,
"step": 1614,
"tokens_trained": 0.793265456
},
{
"epoch": 0.4584072051627544,
"grad_norm": 11.60899829864502,
"loss": 4.1971,
"lr": 0.0008440559440559441,
"step": 1616,
"tokens_trained": 0.794245896
},
{
"epoch": 0.45897454081270833,
"grad_norm": 19.33312225341797,
"loss": 4.2328,
"lr": 0.0008437762237762238,
"step": 1618,
"tokens_trained": 0.795229016
},
{
"epoch": 0.4595418764626622,
"grad_norm": 16.45014190673828,
"loss": 4.2277,
"lr": 0.0008434965034965035,
"step": 1620,
"tokens_trained": 0.79620792
},
{
"epoch": 0.46010921211261613,
"grad_norm": 9.818867683410645,
"loss": 4.1494,
"lr": 0.0008432167832167832,
"step": 1622,
"tokens_trained": 0.797192352
},
{
"epoch": 0.46067654776257005,
"grad_norm": 7.920058250427246,
"loss": 4.2027,
"lr": 0.000842937062937063,
"step": 1624,
"tokens_trained": 0.798174104
},
{
"epoch": 0.46096021558754696,
"eval_loss": 1.044265627861023,
"eval_runtime": 20.5617,
"step": 1625,
"tokens_trained": 0.798668072
},
{
"epoch": 0.4612438834125239,
"grad_norm": 10.734235763549805,
"loss": 4.1505,
"lr": 0.0008426573426573427,
"step": 1626,
"tokens_trained": 0.799160304
},
{
"epoch": 0.46181121906247785,
"grad_norm": 23.376392364501953,
"loss": 4.195,
"lr": 0.0008423776223776224,
"step": 1628,
"tokens_trained": 0.800144144
},
{
"epoch": 0.4623785547124317,
"grad_norm": 23.567371368408203,
"loss": 4.2367,
"lr": 0.0008420979020979021,
"step": 1630,
"tokens_trained": 0.801131184
},
{
"epoch": 0.46294589036238565,
"grad_norm": 19.271820068359375,
"loss": 4.1899,
"lr": 0.0008418181818181819,
"step": 1632,
"tokens_trained": 0.802111296
},
{
"epoch": 0.4635132260123396,
"grad_norm": 17.468698501586914,
"loss": 4.1941,
"lr": 0.0008415384615384616,
"step": 1634,
"tokens_trained": 0.803095112
},
{
"epoch": 0.46408056166229344,
"grad_norm": 22.298749923706055,
"loss": 4.2083,
"lr": 0.0008412587412587412,
"step": 1636,
"tokens_trained": 0.804080456
},
{
"epoch": 0.46464789731224737,
"grad_norm": 12.506179809570312,
"loss": 4.1953,
"lr": 0.000840979020979021,
"step": 1638,
"tokens_trained": 0.805062464
},
{
"epoch": 0.46521523296220124,
"grad_norm": 11.819656372070312,
"loss": 4.2047,
"lr": 0.0008406993006993006,
"step": 1640,
"tokens_trained": 0.806045504
},
{
"epoch": 0.46578256861215517,
"grad_norm": 15.925740242004395,
"loss": 4.1565,
"lr": 0.0008404195804195805,
"step": 1642,
"tokens_trained": 0.80702736
},
{
"epoch": 0.4663499042621091,
"grad_norm": 15.869892120361328,
"loss": 4.2134,
"lr": 0.0008401398601398602,
"step": 1644,
"tokens_trained": 0.808009192
},
{
"epoch": 0.46691723991206296,
"grad_norm": 10.851021766662598,
"loss": 4.2041,
"lr": 0.0008398601398601399,
"step": 1646,
"tokens_trained": 0.808994728
},
{
"epoch": 0.4674845755620169,
"grad_norm": 8.271230697631836,
"loss": 4.1739,
"lr": 0.0008395804195804196,
"step": 1648,
"tokens_trained": 0.809976448
},
{
"epoch": 0.46805191121197076,
"grad_norm": 13.768092155456543,
"loss": 4.1761,
"lr": 0.0008393006993006993,
"step": 1650,
"tokens_trained": 0.810958392
},
{
"epoch": 0.4686192468619247,
"grad_norm": 7.760485649108887,
"loss": 4.1826,
"lr": 0.0008390209790209791,
"step": 1652,
"tokens_trained": 0.81194136
},
{
"epoch": 0.4691865825118786,
"grad_norm": 13.28488540649414,
"loss": 4.1659,
"lr": 0.0008387412587412587,
"step": 1654,
"tokens_trained": 0.812924984
},
{
"epoch": 0.4697539181618325,
"grad_norm": 10.466367721557617,
"loss": 4.1432,
"lr": 0.0008384615384615385,
"step": 1656,
"tokens_trained": 0.813907424
},
{
"epoch": 0.4703212538117864,
"grad_norm": 15.40854549407959,
"loss": 4.1625,
"lr": 0.0008381818181818181,
"step": 1658,
"tokens_trained": 0.814888712
},
{
"epoch": 0.4708885894617403,
"grad_norm": 20.580612182617188,
"loss": 4.1636,
"lr": 0.000837902097902098,
"step": 1660,
"tokens_trained": 0.815869152
},
{
"epoch": 0.4714559251116942,
"grad_norm": 14.908403396606445,
"loss": 4.1763,
"lr": 0.0008376223776223776,
"step": 1662,
"tokens_trained": 0.816852664
},
{
"epoch": 0.47202326076164813,
"grad_norm": 10.217529296875,
"loss": 4.1934,
"lr": 0.0008373426573426573,
"step": 1664,
"tokens_trained": 0.817832792
},
{
"epoch": 0.472590596411602,
"grad_norm": 15.74150276184082,
"loss": 4.1714,
"lr": 0.0008370629370629371,
"step": 1666,
"tokens_trained": 0.81881728
},
{
"epoch": 0.47315793206155593,
"grad_norm": 15.39499282836914,
"loss": 4.2005,
"lr": 0.0008367832167832168,
"step": 1668,
"tokens_trained": 0.819800824
},
{
"epoch": 0.4737252677115098,
"grad_norm": 11.585809707641602,
"loss": 4.136,
"lr": 0.0008365034965034966,
"step": 1670,
"tokens_trained": 0.8207856
},
{
"epoch": 0.4742926033614637,
"grad_norm": 16.053237915039062,
"loss": 4.1827,
"lr": 0.0008362237762237762,
"step": 1672,
"tokens_trained": 0.821766576
},
{
"epoch": 0.47485993901141765,
"grad_norm": 9.23779582977295,
"loss": 4.1159,
"lr": 0.000835944055944056,
"step": 1674,
"tokens_trained": 0.822749696
},
{
"epoch": 0.4754272746613715,
"grad_norm": 11.395891189575195,
"loss": 4.17,
"lr": 0.0008356643356643356,
"step": 1676,
"tokens_trained": 0.82373032
},
{
"epoch": 0.47599461031132545,
"grad_norm": 17.745365142822266,
"loss": 4.1696,
"lr": 0.0008353846153846154,
"step": 1678,
"tokens_trained": 0.824712192
},
{
"epoch": 0.4765619459612793,
"grad_norm": 6.7816572189331055,
"loss": 4.1933,
"lr": 0.0008351048951048951,
"step": 1680,
"tokens_trained": 0.825691208
},
{
"epoch": 0.47712928161123325,
"grad_norm": 20.552772521972656,
"loss": 4.1625,
"lr": 0.0008348251748251748,
"step": 1682,
"tokens_trained": 0.826672584
},
{
"epoch": 0.4776966172611872,
"grad_norm": 21.632352828979492,
"loss": 4.2061,
"lr": 0.0008345454545454546,
"step": 1684,
"tokens_trained": 0.827654368
},
{
"epoch": 0.47826395291114104,
"grad_norm": 17.754596710205078,
"loss": 4.222,
"lr": 0.0008342657342657343,
"step": 1686,
"tokens_trained": 0.828639392
},
{
"epoch": 0.47883128856109497,
"grad_norm": 20.73906707763672,
"loss": 4.1679,
"lr": 0.0008339860139860141,
"step": 1688,
"tokens_trained": 0.829627232
},
{
"epoch": 0.47939862421104884,
"grad_norm": 28.157238006591797,
"loss": 4.1658,
"lr": 0.0008337062937062937,
"step": 1690,
"tokens_trained": 0.830610904
},
{
"epoch": 0.47996595986100277,
"grad_norm": 12.728020668029785,
"loss": 4.1892,
"lr": 0.0008334265734265734,
"step": 1692,
"tokens_trained": 0.831602544
},
{
"epoch": 0.4805332955109567,
"grad_norm": 20.21622657775879,
"loss": 4.1453,
"lr": 0.0008331468531468531,
"step": 1694,
"tokens_trained": 0.832584656
},
{
"epoch": 0.48110063116091056,
"grad_norm": 18.5329647064209,
"loss": 4.2145,
"lr": 0.0008328671328671329,
"step": 1696,
"tokens_trained": 0.833570472
},
{
"epoch": 0.4816679668108645,
"grad_norm": 12.47617244720459,
"loss": 4.1944,
"lr": 0.0008325874125874126,
"step": 1698,
"tokens_trained": 0.834556104
},
{
"epoch": 0.48223530246081836,
"grad_norm": 21.34851837158203,
"loss": 4.1754,
"lr": 0.0008323076923076923,
"step": 1700,
"tokens_trained": 0.835540592
},
{
"epoch": 0.4828026381107723,
"grad_norm": 13.20995807647705,
"loss": 4.1657,
"lr": 0.000832027972027972,
"step": 1702,
"tokens_trained": 0.836525136
},
{
"epoch": 0.4833699737607262,
"grad_norm": 16.77725601196289,
"loss": 4.1905,
"lr": 0.0008317482517482518,
"step": 1704,
"tokens_trained": 0.837509224
},
{
"epoch": 0.4839373094106801,
"grad_norm": 15.17611312866211,
"loss": 4.1823,
"lr": 0.0008314685314685315,
"step": 1706,
"tokens_trained": 0.838492472
},
{
"epoch": 0.484504645060634,
"grad_norm": 13.06942081451416,
"loss": 4.1732,
"lr": 0.0008311888111888112,
"step": 1708,
"tokens_trained": 0.839471696
},
{
"epoch": 0.4850719807105879,
"grad_norm": 10.456578254699707,
"loss": 4.1862,
"lr": 0.0008309090909090909,
"step": 1710,
"tokens_trained": 0.840452808
},
{
"epoch": 0.4856393163605418,
"grad_norm": 13.80197525024414,
"loss": 4.1663,
"lr": 0.0008306293706293706,
"step": 1712,
"tokens_trained": 0.841434224
},
{
"epoch": 0.48620665201049573,
"grad_norm": 20.076507568359375,
"loss": 4.1436,
"lr": 0.0008303496503496504,
"step": 1714,
"tokens_trained": 0.842415304
},
{
"epoch": 0.4867739876604496,
"grad_norm": 5.629086971282959,
"loss": 4.149,
"lr": 0.00083006993006993,
"step": 1716,
"tokens_trained": 0.84339416
},
{
"epoch": 0.48734132331040353,
"grad_norm": 13.932148933410645,
"loss": 4.1785,
"lr": 0.0008297902097902098,
"step": 1718,
"tokens_trained": 0.844380472
},
{
"epoch": 0.4879086589603574,
"grad_norm": 18.951047897338867,
"loss": 4.216,
"lr": 0.0008295104895104895,
"step": 1720,
"tokens_trained": 0.845366896
},
{
"epoch": 0.4884759946103113,
"grad_norm": 21.042476654052734,
"loss": 4.1634,
"lr": 0.0008292307692307693,
"step": 1722,
"tokens_trained": 0.846344792
},
{
"epoch": 0.48904333026026525,
"grad_norm": 23.94416618347168,
"loss": 4.1613,
"lr": 0.000828951048951049,
"step": 1724,
"tokens_trained": 0.847323608
},
{
"epoch": 0.4896106659102191,
"grad_norm": 5.057071208953857,
"loss": 4.1729,
"lr": 0.0008286713286713287,
"step": 1726,
"tokens_trained": 0.848304856
},
{
"epoch": 0.49017800156017305,
"grad_norm": 18.068674087524414,
"loss": 4.2194,
"lr": 0.0008283916083916084,
"step": 1728,
"tokens_trained": 0.849287712
},
{
"epoch": 0.4907453372101269,
"grad_norm": 11.621233940124512,
"loss": 4.2232,
"lr": 0.000828111888111888,
"step": 1730,
"tokens_trained": 0.850268968
},
{
"epoch": 0.49131267286008085,
"grad_norm": 12.939676284790039,
"loss": 4.2003,
"lr": 0.0008278321678321679,
"step": 1732,
"tokens_trained": 0.851256528
},
{
"epoch": 0.49188000851003477,
"grad_norm": 10.638157844543457,
"loss": 4.1975,
"lr": 0.0008275524475524475,
"step": 1734,
"tokens_trained": 0.852240824
},
{
"epoch": 0.49244734415998864,
"grad_norm": 6.2671003341674805,
"loss": 4.1617,
"lr": 0.0008272727272727273,
"step": 1736,
"tokens_trained": 0.853224768
},
{
"epoch": 0.49301467980994257,
"grad_norm": 12.318375587463379,
"loss": 4.1939,
"lr": 0.000826993006993007,
"step": 1738,
"tokens_trained": 0.8542062
},
{
"epoch": 0.49358201545989644,
"grad_norm": 17.275348663330078,
"loss": 4.1911,
"lr": 0.0008267132867132868,
"step": 1740,
"tokens_trained": 0.855192024
},
{
"epoch": 0.49414935110985037,
"grad_norm": 11.122747421264648,
"loss": 4.17,
"lr": 0.0008264335664335665,
"step": 1742,
"tokens_trained": 0.856172136
},
{
"epoch": 0.4947166867598043,
"grad_norm": 6.223485469818115,
"loss": 4.1774,
"lr": 0.0008261538461538461,
"step": 1744,
"tokens_trained": 0.857156312
},
{
"epoch": 0.49528402240975816,
"grad_norm": 14.62152099609375,
"loss": 4.1607,
"lr": 0.0008258741258741259,
"step": 1746,
"tokens_trained": 0.858140152
},
{
"epoch": 0.4958513580597121,
"grad_norm": 15.991989135742188,
"loss": 4.1825,
"lr": 0.0008255944055944055,
"step": 1748,
"tokens_trained": 0.85912524
},
{
"epoch": 0.49641869370966596,
"grad_norm": 28.88335418701172,
"loss": 4.2244,
"lr": 0.0008253146853146854,
"step": 1750,
"tokens_trained": 0.860105784
},
{
"epoch": 0.49641869370966596,
"eval_loss": 1.061833143234253,
"eval_runtime": 20.4841,
"step": 1750,
"tokens_trained": 0.860105784
},
{
"epoch": 0.4969860293596199,
"grad_norm": 14.708030700683594,
"loss": 4.2036,
"lr": 0.000825034965034965,
"step": 1752,
"tokens_trained": 0.861089272
},
{
"epoch": 0.4975533650095738,
"grad_norm": 24.67535400390625,
"loss": 4.2405,
"lr": 0.0008247552447552448,
"step": 1754,
"tokens_trained": 0.862066656
},
{
"epoch": 0.4981207006595277,
"grad_norm": 10.923722267150879,
"loss": 4.1713,
"lr": 0.0008244755244755245,
"step": 1756,
"tokens_trained": 0.863049256
},
{
"epoch": 0.4986880363094816,
"grad_norm": 8.88796615600586,
"loss": 4.1834,
"lr": 0.0008241958041958042,
"step": 1758,
"tokens_trained": 0.864029352
},
{
"epoch": 0.4992553719594355,
"grad_norm": 34.90485382080078,
"loss": 4.2338,
"lr": 0.000823916083916084,
"step": 1760,
"tokens_trained": 0.865013008
},
{
"epoch": 0.4998227076093894,
"grad_norm": 36.34440612792969,
"loss": 4.2012,
"lr": 0.0008236363636363636,
"step": 1762,
"tokens_trained": 0.86599204
},
{
"epoch": 0.5003900432593433,
"grad_norm": 27.913984298706055,
"loss": 4.269,
"lr": 0.0008233566433566434,
"step": 1764,
"tokens_trained": 0.866975456
},
{
"epoch": 0.5009573789092973,
"grad_norm": 28.236122131347656,
"loss": 4.2413,
"lr": 0.000823076923076923,
"step": 1766,
"tokens_trained": 0.867963912
},
{
"epoch": 0.5015247145592511,
"grad_norm": 18.181337356567383,
"loss": 4.2088,
"lr": 0.0008227972027972029,
"step": 1768,
"tokens_trained": 0.86894656
},
{
"epoch": 0.502092050209205,
"grad_norm": 17.403850555419922,
"loss": 4.1854,
"lr": 0.0008225174825174825,
"step": 1770,
"tokens_trained": 0.869932592
},
{
"epoch": 0.5026593858591589,
"grad_norm": 15.002805709838867,
"loss": 4.1897,
"lr": 0.0008222377622377622,
"step": 1772,
"tokens_trained": 0.87091592
},
{
"epoch": 0.5032267215091129,
"grad_norm": 6.787586688995361,
"loss": 4.1625,
"lr": 0.000821958041958042,
"step": 1774,
"tokens_trained": 0.871899144
},
{
"epoch": 0.5037940571590668,
"grad_norm": 6.255197525024414,
"loss": 4.1682,
"lr": 0.0008216783216783217,
"step": 1776,
"tokens_trained": 0.872874824
},
{
"epoch": 0.5043613928090206,
"grad_norm": 25.828433990478516,
"loss": 4.2354,
"lr": 0.0008213986013986015,
"step": 1778,
"tokens_trained": 0.873858424
},
{
"epoch": 0.5049287284589745,
"grad_norm": 20.261323928833008,
"loss": 4.2373,
"lr": 0.0008211188811188811,
"step": 1780,
"tokens_trained": 0.87483884
},
{
"epoch": 0.5054960641089284,
"grad_norm": 9.670608520507812,
"loss": 4.191,
"lr": 0.0008208391608391609,
"step": 1782,
"tokens_trained": 0.875820792
},
{
"epoch": 0.5060633997588824,
"grad_norm": 23.33945655822754,
"loss": 4.2319,
"lr": 0.0008205594405594405,
"step": 1784,
"tokens_trained": 0.876804368
},
{
"epoch": 0.5066307354088363,
"grad_norm": 32.22544479370117,
"loss": 4.1799,
"lr": 0.0008202797202797203,
"step": 1786,
"tokens_trained": 0.877784816
},
{
"epoch": 0.5071980710587901,
"grad_norm": 21.048891067504883,
"loss": 4.2635,
"lr": 0.00082,
"step": 1788,
"tokens_trained": 0.878768256
},
{
"epoch": 0.507765406708744,
"grad_norm": 28.73198699951172,
"loss": 4.2436,
"lr": 0.0008197202797202797,
"step": 1790,
"tokens_trained": 0.879751288
},
{
"epoch": 0.508332742358698,
"grad_norm": 27.627851486206055,
"loss": 4.2118,
"lr": 0.0008194405594405595,
"step": 1792,
"tokens_trained": 0.880732072
},
{
"epoch": 0.5089000780086519,
"grad_norm": 21.16539192199707,
"loss": 4.2123,
"lr": 0.0008191608391608392,
"step": 1794,
"tokens_trained": 0.88171332
},
{
"epoch": 0.5094674136586058,
"grad_norm": 11.402868270874023,
"loss": 4.1524,
"lr": 0.000818881118881119,
"step": 1796,
"tokens_trained": 0.882695464
},
{
"epoch": 0.5100347493085596,
"grad_norm": 11.958270072937012,
"loss": 4.2091,
"lr": 0.0008186013986013986,
"step": 1798,
"tokens_trained": 0.883678736
},
{
"epoch": 0.5106020849585136,
"grad_norm": 15.902670860290527,
"loss": 4.1687,
"lr": 0.0008183216783216783,
"step": 1800,
"tokens_trained": 0.8846604
},
{
"epoch": 0.5111694206084675,
"grad_norm": 19.732566833496094,
"loss": 4.1302,
"lr": 0.000818041958041958,
"step": 1802,
"tokens_trained": 0.885641384
},
{
"epoch": 0.5117367562584214,
"grad_norm": 15.119332313537598,
"loss": 4.1546,
"lr": 0.0008177622377622378,
"step": 1804,
"tokens_trained": 0.8866262
},
{
"epoch": 0.5123040919083753,
"grad_norm": 9.641027450561523,
"loss": 4.1748,
"lr": 0.0008174825174825175,
"step": 1806,
"tokens_trained": 0.887604504
},
{
"epoch": 0.5128714275583292,
"grad_norm": 11.642073631286621,
"loss": 4.1879,
"lr": 0.0008172027972027972,
"step": 1808,
"tokens_trained": 0.888584152
},
{
"epoch": 0.5134387632082831,
"grad_norm": 12.05164909362793,
"loss": 4.1332,
"lr": 0.000816923076923077,
"step": 1810,
"tokens_trained": 0.889568448
},
{
"epoch": 0.514006098858237,
"grad_norm": 13.54423999786377,
"loss": 4.1398,
"lr": 0.0008166433566433567,
"step": 1812,
"tokens_trained": 0.890550896
},
{
"epoch": 0.5145734345081909,
"grad_norm": 21.94988441467285,
"loss": 4.1523,
"lr": 0.0008163636363636364,
"step": 1814,
"tokens_trained": 0.89153436
},
{
"epoch": 0.5151407701581449,
"grad_norm": 8.613338470458984,
"loss": 4.1428,
"lr": 0.0008160839160839161,
"step": 1816,
"tokens_trained": 0.89251064
},
{
"epoch": 0.5157081058080987,
"grad_norm": 27.448917388916016,
"loss": 4.2014,
"lr": 0.0008158041958041958,
"step": 1818,
"tokens_trained": 0.893493904
},
{
"epoch": 0.5162754414580526,
"grad_norm": 16.226577758789062,
"loss": 4.1787,
"lr": 0.0008155244755244755,
"step": 1820,
"tokens_trained": 0.894476344
},
{
"epoch": 0.5168427771080065,
"grad_norm": 16.967891693115234,
"loss": 4.1898,
"lr": 0.0008152447552447553,
"step": 1822,
"tokens_trained": 0.895460064
},
{
"epoch": 0.5174101127579604,
"grad_norm": 13.723483085632324,
"loss": 4.2058,
"lr": 0.000814965034965035,
"step": 1824,
"tokens_trained": 0.896443272
},
{
"epoch": 0.5179774484079144,
"grad_norm": 16.789636611938477,
"loss": 4.1669,
"lr": 0.0008146853146853147,
"step": 1826,
"tokens_trained": 0.897426712
},
{
"epoch": 0.5185447840578682,
"grad_norm": 11.26768684387207,
"loss": 4.1401,
"lr": 0.0008144055944055944,
"step": 1828,
"tokens_trained": 0.89840672
},
{
"epoch": 0.5191121197078221,
"grad_norm": 9.25829029083252,
"loss": 4.1581,
"lr": 0.0008141258741258742,
"step": 1830,
"tokens_trained": 0.89939132
},
{
"epoch": 0.519679455357776,
"grad_norm": 12.006930351257324,
"loss": 4.1768,
"lr": 0.0008138461538461539,
"step": 1832,
"tokens_trained": 0.900373704
},
{
"epoch": 0.52024679100773,
"grad_norm": 18.766008377075195,
"loss": 4.1419,
"lr": 0.0008135664335664336,
"step": 1834,
"tokens_trained": 0.901356176
},
{
"epoch": 0.5208141266576839,
"grad_norm": 17.483421325683594,
"loss": 4.1382,
"lr": 0.0008132867132867133,
"step": 1836,
"tokens_trained": 0.902344088
},
{
"epoch": 0.5213814623076377,
"grad_norm": 10.484652519226074,
"loss": 4.1571,
"lr": 0.000813006993006993,
"step": 1838,
"tokens_trained": 0.903328896
},
{
"epoch": 0.5219487979575916,
"grad_norm": 13.653974533081055,
"loss": 4.1638,
"lr": 0.0008127272727272728,
"step": 1840,
"tokens_trained": 0.904309368
},
{
"epoch": 0.5225161336075456,
"grad_norm": 12.48718547821045,
"loss": 4.1226,
"lr": 0.0008124475524475524,
"step": 1842,
"tokens_trained": 0.905293112
},
{
"epoch": 0.5230834692574995,
"grad_norm": 8.086355209350586,
"loss": 4.1303,
"lr": 0.0008121678321678322,
"step": 1844,
"tokens_trained": 0.906275632
},
{
"epoch": 0.5236508049074534,
"grad_norm": 10.940073013305664,
"loss": 4.1634,
"lr": 0.0008118881118881119,
"step": 1846,
"tokens_trained": 0.907255808
},
{
"epoch": 0.5242181405574072,
"grad_norm": 13.844099044799805,
"loss": 4.1505,
"lr": 0.0008116083916083917,
"step": 1848,
"tokens_trained": 0.908238664
},
{
"epoch": 0.5247854762073612,
"grad_norm": 6.305738925933838,
"loss": 4.1463,
"lr": 0.0008113286713286714,
"step": 1850,
"tokens_trained": 0.909221424
},
{
"epoch": 0.5253528118573151,
"grad_norm": 8.957951545715332,
"loss": 4.1785,
"lr": 0.000811048951048951,
"step": 1852,
"tokens_trained": 0.910204472
},
{
"epoch": 0.525920147507269,
"grad_norm": 12.665373802185059,
"loss": 4.1776,
"lr": 0.0008107692307692308,
"step": 1854,
"tokens_trained": 0.911186456
},
{
"epoch": 0.5264874831572229,
"grad_norm": 13.7921781539917,
"loss": 4.2058,
"lr": 0.0008104895104895104,
"step": 1856,
"tokens_trained": 0.912163912
},
{
"epoch": 0.5270548188071768,
"grad_norm": 18.400495529174805,
"loss": 4.1378,
"lr": 0.0008102097902097903,
"step": 1858,
"tokens_trained": 0.913143416
},
{
"epoch": 0.5276221544571307,
"grad_norm": 10.095234870910645,
"loss": 4.1673,
"lr": 0.0008099300699300699,
"step": 1860,
"tokens_trained": 0.914125056
},
{
"epoch": 0.5281894901070846,
"grad_norm": 9.396644592285156,
"loss": 4.1226,
"lr": 0.0008096503496503497,
"step": 1862,
"tokens_trained": 0.915109128
},
{
"epoch": 0.5287568257570385,
"grad_norm": 12.686080932617188,
"loss": 4.1356,
"lr": 0.0008093706293706294,
"step": 1864,
"tokens_trained": 0.916092096
},
{
"epoch": 0.5293241614069925,
"grad_norm": 15.91020679473877,
"loss": 4.1276,
"lr": 0.0008090909090909092,
"step": 1866,
"tokens_trained": 0.917077264
},
{
"epoch": 0.5298914970569463,
"grad_norm": 21.305110931396484,
"loss": 4.1492,
"lr": 0.0008088111888111889,
"step": 1868,
"tokens_trained": 0.918060288
},
{
"epoch": 0.5304588327069002,
"grad_norm": 9.242319107055664,
"loss": 4.1457,
"lr": 0.0008085314685314685,
"step": 1870,
"tokens_trained": 0.91904616
},
{
"epoch": 0.5310261683568541,
"grad_norm": 17.556922912597656,
"loss": 4.1698,
"lr": 0.0008082517482517483,
"step": 1872,
"tokens_trained": 0.920028192
},
{
"epoch": 0.531593504006808,
"grad_norm": 24.155885696411133,
"loss": 4.193,
"lr": 0.0008079720279720279,
"step": 1874,
"tokens_trained": 0.921010456
},
{
"epoch": 0.531877171831785,
"eval_loss": 1.0404243469238281,
"eval_runtime": 21.451,
"step": 1875,
"tokens_trained": 0.921502192
},
{
"epoch": 0.532160839656762,
"grad_norm": 4.985994338989258,
"loss": 4.1649,
"lr": 0.0008076923076923078,
"step": 1876,
"tokens_trained": 0.921994216
},
{
"epoch": 0.5327281753067158,
"grad_norm": 19.2642765045166,
"loss": 4.1883,
"lr": 0.0008074125874125874,
"step": 1878,
"tokens_trained": 0.922978112
},
{
"epoch": 0.5332955109566697,
"grad_norm": 15.012572288513184,
"loss": 4.1944,
"lr": 0.0008071328671328671,
"step": 1880,
"tokens_trained": 0.923962952
},
{
"epoch": 0.5338628466066236,
"grad_norm": 21.37204360961914,
"loss": 4.1708,
"lr": 0.0008068531468531469,
"step": 1882,
"tokens_trained": 0.92494744
},
{
"epoch": 0.5344301822565776,
"grad_norm": 6.402398586273193,
"loss": 4.1921,
"lr": 0.0008065734265734265,
"step": 1884,
"tokens_trained": 0.925927984
},
{
"epoch": 0.5349975179065315,
"grad_norm": 27.606822967529297,
"loss": 4.2033,
"lr": 0.0008062937062937064,
"step": 1886,
"tokens_trained": 0.926911352
},
{
"epoch": 0.5355648535564853,
"grad_norm": 16.434572219848633,
"loss": 4.1504,
"lr": 0.000806013986013986,
"step": 1888,
"tokens_trained": 0.927894056
},
{
"epoch": 0.5361321892064392,
"grad_norm": 8.066178321838379,
"loss": 4.1674,
"lr": 0.0008057342657342658,
"step": 1890,
"tokens_trained": 0.928879504
},
{
"epoch": 0.5366995248563932,
"grad_norm": 6.167456150054932,
"loss": 4.1207,
"lr": 0.0008054545454545454,
"step": 1892,
"tokens_trained": 0.92986424
},
{
"epoch": 0.5372668605063471,
"grad_norm": 3.584982395172119,
"loss": 4.1051,
"lr": 0.0008051748251748253,
"step": 1894,
"tokens_trained": 0.930846696
},
{
"epoch": 0.537834196156301,
"grad_norm": 14.988295555114746,
"loss": 4.1199,
"lr": 0.0008048951048951049,
"step": 1896,
"tokens_trained": 0.931831112
},
{
"epoch": 0.5384015318062548,
"grad_norm": 12.735363960266113,
"loss": 4.1368,
"lr": 0.0008046153846153846,
"step": 1898,
"tokens_trained": 0.932816952
},
{
"epoch": 0.5389688674562088,
"grad_norm": 7.701294422149658,
"loss": 4.1205,
"lr": 0.0008043356643356644,
"step": 1900,
"tokens_trained": 0.93380264
},
{
"epoch": 0.5395362031061627,
"grad_norm": 9.15809440612793,
"loss": 4.1567,
"lr": 0.000804055944055944,
"step": 1902,
"tokens_trained": 0.934785848
},
{
"epoch": 0.5401035387561166,
"grad_norm": 10.8292875289917,
"loss": 4.1645,
"lr": 0.0008037762237762239,
"step": 1904,
"tokens_trained": 0.935766912
},
{
"epoch": 0.5406708744060705,
"grad_norm": 10.906803131103516,
"loss": 4.1398,
"lr": 0.0008034965034965035,
"step": 1906,
"tokens_trained": 0.936749352
},
{
"epoch": 0.5412382100560243,
"grad_norm": 10.140864372253418,
"loss": 4.1754,
"lr": 0.0008032167832167832,
"step": 1908,
"tokens_trained": 0.9377304
},
{
"epoch": 0.5418055457059783,
"grad_norm": 10.061383247375488,
"loss": 4.1485,
"lr": 0.0008029370629370629,
"step": 1910,
"tokens_trained": 0.938712336
},
{
"epoch": 0.5423728813559322,
"grad_norm": 8.252259254455566,
"loss": 4.1502,
"lr": 0.0008026573426573427,
"step": 1912,
"tokens_trained": 0.939693304
},
{
"epoch": 0.5429402170058861,
"grad_norm": 15.104400634765625,
"loss": 4.182,
"lr": 0.0008023776223776224,
"step": 1914,
"tokens_trained": 0.940679832
},
{
"epoch": 0.54350755265584,
"grad_norm": 21.167285919189453,
"loss": 4.1241,
"lr": 0.0008020979020979021,
"step": 1916,
"tokens_trained": 0.941665088
},
{
"epoch": 0.5440748883057939,
"grad_norm": 17.936481475830078,
"loss": 4.1846,
"lr": 0.0008018181818181818,
"step": 1918,
"tokens_trained": 0.942651632
},
{
"epoch": 0.5446422239557478,
"grad_norm": 9.773019790649414,
"loss": 4.1164,
"lr": 0.0008015384615384615,
"step": 1920,
"tokens_trained": 0.943635928
},
{
"epoch": 0.5452095596057017,
"grad_norm": 14.120475769042969,
"loss": 4.1556,
"lr": 0.0008012587412587414,
"step": 1922,
"tokens_trained": 0.944618336
},
{
"epoch": 0.5457768952556556,
"grad_norm": 10.898097038269043,
"loss": 4.1521,
"lr": 0.000800979020979021,
"step": 1924,
"tokens_trained": 0.945608216
},
{
"epoch": 0.5463442309056096,
"grad_norm": 8.271462440490723,
"loss": 4.0785,
"lr": 0.0008006993006993007,
"step": 1926,
"tokens_trained": 0.946593504
},
{
"epoch": 0.5469115665555634,
"grad_norm": 17.28820037841797,
"loss": 4.0998,
"lr": 0.0008004195804195804,
"step": 1928,
"tokens_trained": 0.947575288
},
{
"epoch": 0.5474789022055173,
"grad_norm": 17.754959106445312,
"loss": 4.1652,
"lr": 0.0008001398601398602,
"step": 1930,
"tokens_trained": 0.948562968
},
{
"epoch": 0.5480462378554712,
"grad_norm": 10.576292037963867,
"loss": 4.1754,
"lr": 0.0007998601398601399,
"step": 1932,
"tokens_trained": 0.949545728
},
{
"epoch": 0.5486135735054252,
"grad_norm": 14.297791481018066,
"loss": 4.1597,
"lr": 0.0007995804195804196,
"step": 1934,
"tokens_trained": 0.950528952
},
{
"epoch": 0.5491809091553791,
"grad_norm": 23.882539749145508,
"loss": 4.1366,
"lr": 0.0007993006993006992,
"step": 1936,
"tokens_trained": 0.951513448
},
{
"epoch": 0.5497482448053329,
"grad_norm": 5.12502908706665,
"loss": 4.1441,
"lr": 0.000799020979020979,
"step": 1938,
"tokens_trained": 0.952497048
},
{
"epoch": 0.5503155804552868,
"grad_norm": 26.879070281982422,
"loss": 4.2595,
"lr": 0.0007987412587412588,
"step": 1940,
"tokens_trained": 0.953475816
},
{
"epoch": 0.5508829161052408,
"grad_norm": 23.032690048217773,
"loss": 4.1841,
"lr": 0.0007984615384615385,
"step": 1942,
"tokens_trained": 0.954459984
},
{
"epoch": 0.5514502517551947,
"grad_norm": 8.810720443725586,
"loss": 4.1329,
"lr": 0.0007981818181818182,
"step": 1944,
"tokens_trained": 0.95544252
},
{
"epoch": 0.5520175874051486,
"grad_norm": 31.051185607910156,
"loss": 4.2278,
"lr": 0.0007979020979020979,
"step": 1946,
"tokens_trained": 0.956428016
},
{
"epoch": 0.5525849230551024,
"grad_norm": 22.537412643432617,
"loss": 4.1729,
"lr": 0.0007976223776223777,
"step": 1948,
"tokens_trained": 0.957406024
},
{
"epoch": 0.5531522587050564,
"grad_norm": 10.596793174743652,
"loss": 4.1636,
"lr": 0.0007973426573426573,
"step": 1950,
"tokens_trained": 0.958391232
},
{
"epoch": 0.5537195943550103,
"grad_norm": 16.45500373840332,
"loss": 4.1591,
"lr": 0.0007970629370629371,
"step": 1952,
"tokens_trained": 0.959378448
},
{
"epoch": 0.5542869300049642,
"grad_norm": 15.090359687805176,
"loss": 4.1516,
"lr": 0.0007967832167832167,
"step": 1954,
"tokens_trained": 0.960363384
},
{
"epoch": 0.5548542656549181,
"grad_norm": 28.482192993164062,
"loss": 4.1211,
"lr": 0.0007965034965034965,
"step": 1956,
"tokens_trained": 0.961348752
},
{
"epoch": 0.555421601304872,
"grad_norm": 9.402368545532227,
"loss": 4.178,
"lr": 0.0007962237762237763,
"step": 1958,
"tokens_trained": 0.962332976
},
{
"epoch": 0.5559889369548259,
"grad_norm": 33.001346588134766,
"loss": 4.218,
"lr": 0.000795944055944056,
"step": 1960,
"tokens_trained": 0.963316928
},
{
"epoch": 0.5565562726047798,
"grad_norm": 29.695520401000977,
"loss": 4.2071,
"lr": 0.0007956643356643357,
"step": 1962,
"tokens_trained": 0.964301728
},
{
"epoch": 0.5571236082547337,
"grad_norm": 22.22412109375,
"loss": 4.2158,
"lr": 0.0007953846153846153,
"step": 1964,
"tokens_trained": 0.96528524
},
{
"epoch": 0.5576909439046877,
"grad_norm": 15.590829849243164,
"loss": 4.1681,
"lr": 0.0007951048951048952,
"step": 1966,
"tokens_trained": 0.966268264
},
{
"epoch": 0.5582582795546415,
"grad_norm": 16.011110305786133,
"loss": 4.1591,
"lr": 0.0007948251748251748,
"step": 1968,
"tokens_trained": 0.967252016
},
{
"epoch": 0.5588256152045954,
"grad_norm": 15.24573040008545,
"loss": 4.1446,
"lr": 0.0007945454545454546,
"step": 1970,
"tokens_trained": 0.96823396
},
{
"epoch": 0.5593929508545493,
"grad_norm": 15.718021392822266,
"loss": 4.1846,
"lr": 0.0007942657342657342,
"step": 1972,
"tokens_trained": 0.969217792
},
{
"epoch": 0.5599602865045032,
"grad_norm": 8.648459434509277,
"loss": 4.1655,
"lr": 0.000793986013986014,
"step": 1974,
"tokens_trained": 0.970200776
},
{
"epoch": 0.5605276221544572,
"grad_norm": 7.273077487945557,
"loss": 4.1397,
"lr": 0.0007937062937062938,
"step": 1976,
"tokens_trained": 0.971181376
},
{
"epoch": 0.561094957804411,
"grad_norm": 25.027616500854492,
"loss": 4.1918,
"lr": 0.0007934265734265734,
"step": 1978,
"tokens_trained": 0.972165496
},
{
"epoch": 0.5616622934543649,
"grad_norm": 25.485851287841797,
"loss": 4.1896,
"lr": 0.0007931468531468532,
"step": 1980,
"tokens_trained": 0.973145616
},
{
"epoch": 0.5622296291043188,
"grad_norm": 18.065462112426758,
"loss": 4.1876,
"lr": 0.0007928671328671328,
"step": 1982,
"tokens_trained": 0.974131104
},
{
"epoch": 0.5627969647542728,
"grad_norm": 20.412248611450195,
"loss": 4.1556,
"lr": 0.0007925874125874127,
"step": 1984,
"tokens_trained": 0.975111232
},
{
"epoch": 0.5633643004042267,
"grad_norm": 15.51710319519043,
"loss": 4.1391,
"lr": 0.0007923076923076923,
"step": 1986,
"tokens_trained": 0.976098968
},
{
"epoch": 0.5639316360541805,
"grad_norm": 8.650726318359375,
"loss": 4.1421,
"lr": 0.000792027972027972,
"step": 1988,
"tokens_trained": 0.977082992
},
{
"epoch": 0.5644989717041344,
"grad_norm": 19.833505630493164,
"loss": 4.1505,
"lr": 0.0007917482517482517,
"step": 1990,
"tokens_trained": 0.978068896
},
{
"epoch": 0.5650663073540884,
"grad_norm": 26.585390090942383,
"loss": 4.1661,
"lr": 0.0007914685314685314,
"step": 1992,
"tokens_trained": 0.979048504
},
{
"epoch": 0.5656336430040423,
"grad_norm": 20.827394485473633,
"loss": 4.1987,
"lr": 0.0007911888111888113,
"step": 1994,
"tokens_trained": 0.98003104
},
{
"epoch": 0.5662009786539962,
"grad_norm": 23.700273513793945,
"loss": 4.1773,
"lr": 0.0007909090909090909,
"step": 1996,
"tokens_trained": 0.981013384
},
{
"epoch": 0.56676831430395,
"grad_norm": 15.673397064208984,
"loss": 4.12,
"lr": 0.0007906293706293707,
"step": 1998,
"tokens_trained": 0.981999776
},
{
"epoch": 0.567335649953904,
"grad_norm": 11.268630981445312,
"loss": 4.1373,
"lr": 0.0007903496503496503,
"step": 2000,
"tokens_trained": 0.982980936
},
{
"epoch": 0.567335649953904,
"eval_loss": 1.0422048568725586,
"eval_runtime": 20.3928,
"step": 2000,
"tokens_trained": 0.982980936
},
{
"epoch": 0.5679029856038579,
"grad_norm": 18.37994384765625,
"loss": 4.1536,
"lr": 0.0007900699300699302,
"step": 2002,
"tokens_trained": 0.983969536
},
{
"epoch": 0.5684703212538118,
"grad_norm": 23.911537170410156,
"loss": 4.1652,
"lr": 0.0007897902097902098,
"step": 2004,
"tokens_trained": 0.98495052
},
{
"epoch": 0.5690376569037657,
"grad_norm": 7.355772018432617,
"loss": 4.1846,
"lr": 0.0007895104895104895,
"step": 2006,
"tokens_trained": 0.98593252
},
{
"epoch": 0.5696049925537195,
"grad_norm": 35.29991149902344,
"loss": 4.2145,
"lr": 0.0007892307692307692,
"step": 2008,
"tokens_trained": 0.986922392
},
{
"epoch": 0.5701723282036735,
"grad_norm": 14.28709602355957,
"loss": 4.1629,
"lr": 0.0007889510489510489,
"step": 2010,
"tokens_trained": 0.987905712
},
{
"epoch": 0.5707396638536274,
"grad_norm": 22.50174331665039,
"loss": 4.1907,
"lr": 0.0007886713286713288,
"step": 2012,
"tokens_trained": 0.988887536
},
{
"epoch": 0.5713069995035813,
"grad_norm": 14.588640213012695,
"loss": 4.1523,
"lr": 0.0007883916083916084,
"step": 2014,
"tokens_trained": 0.989872712
},
{
"epoch": 0.5718743351535353,
"grad_norm": 2.776369094848633,
"loss": 4.1548,
"lr": 0.0007881118881118882,
"step": 2016,
"tokens_trained": 0.990854072
},
{
"epoch": 0.5724416708034891,
"grad_norm": 16.00047492980957,
"loss": 4.1319,
"lr": 0.0007878321678321678,
"step": 2018,
"tokens_trained": 0.991834552
},
{
"epoch": 0.573009006453443,
"grad_norm": 21.678735733032227,
"loss": 4.1986,
"lr": 0.0007875524475524476,
"step": 2020,
"tokens_trained": 0.992818256
},
{
"epoch": 0.5735763421033969,
"grad_norm": 4.835119724273682,
"loss": 4.1625,
"lr": 0.0007872727272727273,
"step": 2022,
"tokens_trained": 0.993801376
},
{
"epoch": 0.5741436777533508,
"grad_norm": 19.427467346191406,
"loss": 4.1594,
"lr": 0.000786993006993007,
"step": 2024,
"tokens_trained": 0.994788568
},
{
"epoch": 0.5747110134033048,
"grad_norm": 15.458346366882324,
"loss": 4.1829,
"lr": 0.0007867132867132867,
"step": 2026,
"tokens_trained": 0.995769976
},
{
"epoch": 0.5752783490532586,
"grad_norm": 11.073614120483398,
"loss": 4.1303,
"lr": 0.0007864335664335664,
"step": 2028,
"tokens_trained": 0.996751464
},
{
"epoch": 0.5758456847032125,
"grad_norm": 4.685436248779297,
"loss": 4.1368,
"lr": 0.0007861538461538463,
"step": 2030,
"tokens_trained": 0.997733952
},
{
"epoch": 0.5764130203531664,
"grad_norm": 15.977241516113281,
"loss": 4.1584,
"lr": 0.0007858741258741259,
"step": 2032,
"tokens_trained": 0.998716976
},
{
"epoch": 0.5769803560031204,
"grad_norm": 11.305732727050781,
"loss": 4.102,
"lr": 0.0007855944055944056,
"step": 2034,
"tokens_trained": 0.999703632
},
{
"epoch": 0.5775476916530743,
"grad_norm": 7.794003963470459,
"loss": 4.161,
"lr": 0.0007853146853146853,
"step": 2036,
"tokens_trained": 1.000687488
},
{
"epoch": 0.5781150273030281,
"grad_norm": 7.609982013702393,
"loss": 4.1546,
"lr": 0.0007850349650349651,
"step": 2038,
"tokens_trained": 1.0016692
},
{
"epoch": 0.578682362952982,
"grad_norm": 7.622653961181641,
"loss": 4.1246,
"lr": 0.0007847552447552448,
"step": 2040,
"tokens_trained": 1.002653352
},
{
"epoch": 0.579249698602936,
"grad_norm": 9.98919677734375,
"loss": 4.1319,
"lr": 0.0007844755244755245,
"step": 2042,
"tokens_trained": 1.003639528
},
{
"epoch": 0.5798170342528899,
"grad_norm": 9.557628631591797,
"loss": 4.1105,
"lr": 0.0007841958041958041,
"step": 2044,
"tokens_trained": 1.004623776
},
{
"epoch": 0.5803843699028438,
"grad_norm": 14.172621726989746,
"loss": 4.1339,
"lr": 0.0007839160839160839,
"step": 2046,
"tokens_trained": 1.005604008
},
{
"epoch": 0.5809517055527976,
"grad_norm": 8.185248374938965,
"loss": 4.1142,
"lr": 0.0007836363636363637,
"step": 2048,
"tokens_trained": 1.006585704
},
{
"epoch": 0.5815190412027516,
"grad_norm": 10.642661094665527,
"loss": 4.131,
"lr": 0.0007833566433566434,
"step": 2050,
"tokens_trained": 1.00757132
},
{
"epoch": 0.5820863768527055,
"grad_norm": 7.868969917297363,
"loss": 4.1477,
"lr": 0.0007830769230769231,
"step": 2052,
"tokens_trained": 1.008556824
},
{
"epoch": 0.5826537125026594,
"grad_norm": 2.8441150188446045,
"loss": 4.1156,
"lr": 0.0007827972027972028,
"step": 2054,
"tokens_trained": 1.00954056
},
{
"epoch": 0.5832210481526133,
"grad_norm": 5.2797932624816895,
"loss": 4.1058,
"lr": 0.0007825174825174826,
"step": 2056,
"tokens_trained": 1.010526488
},
{
"epoch": 0.5837883838025671,
"grad_norm": 11.850811004638672,
"loss": 4.165,
"lr": 0.0007822377622377622,
"step": 2058,
"tokens_trained": 1.011507584
},
{
"epoch": 0.5843557194525211,
"grad_norm": 11.073920249938965,
"loss": 4.1509,
"lr": 0.000781958041958042,
"step": 2060,
"tokens_trained": 1.012491648
},
{
"epoch": 0.584923055102475,
"grad_norm": 8.282343864440918,
"loss": 4.0656,
"lr": 0.0007816783216783216,
"step": 2062,
"tokens_trained": 1.013475224
},
{
"epoch": 0.5854903907524289,
"grad_norm": 10.414461135864258,
"loss": 4.1285,
"lr": 0.0007813986013986014,
"step": 2064,
"tokens_trained": 1.014458144
},
{
"epoch": 0.5860577264023829,
"grad_norm": 9.988463401794434,
"loss": 4.1234,
"lr": 0.0007811188811188812,
"step": 2066,
"tokens_trained": 1.015444112
},
{
"epoch": 0.5866250620523367,
"grad_norm": 8.713189125061035,
"loss": 4.129,
"lr": 0.0007808391608391609,
"step": 2068,
"tokens_trained": 1.016427568
},
{
"epoch": 0.5871923977022906,
"grad_norm": 3.4149773120880127,
"loss": 4.155,
"lr": 0.0007805594405594406,
"step": 2070,
"tokens_trained": 1.017412264
},
{
"epoch": 0.5877597333522445,
"grad_norm": 12.33522891998291,
"loss": 4.1856,
"lr": 0.0007802797202797202,
"step": 2072,
"tokens_trained": 1.018402216
},
{
"epoch": 0.5883270690021984,
"grad_norm": 12.155695915222168,
"loss": 4.1468,
"lr": 0.0007800000000000001,
"step": 2074,
"tokens_trained": 1.019387096
},
{
"epoch": 0.5888944046521524,
"grad_norm": 7.73326301574707,
"loss": 4.1239,
"lr": 0.0007797202797202797,
"step": 2076,
"tokens_trained": 1.020370008
},
{
"epoch": 0.5894617403021062,
"grad_norm": 6.425852298736572,
"loss": 4.1101,
"lr": 0.0007794405594405595,
"step": 2078,
"tokens_trained": 1.02135716
},
{
"epoch": 0.5900290759520601,
"grad_norm": 18.360816955566406,
"loss": 4.1726,
"lr": 0.0007791608391608391,
"step": 2080,
"tokens_trained": 1.022338024
},
{
"epoch": 0.590596411602014,
"grad_norm": 28.31681251525879,
"loss": 4.1341,
"lr": 0.0007788811188811189,
"step": 2082,
"tokens_trained": 1.023318008
},
{
"epoch": 0.591163747251968,
"grad_norm": 10.673089027404785,
"loss": 4.1268,
"lr": 0.0007786013986013987,
"step": 2084,
"tokens_trained": 1.02430432
},
{
"epoch": 0.5917310829019219,
"grad_norm": 26.656522750854492,
"loss": 4.1703,
"lr": 0.0007783216783216783,
"step": 2086,
"tokens_trained": 1.025288272
},
{
"epoch": 0.5922984185518757,
"grad_norm": 20.022029876708984,
"loss": 4.1532,
"lr": 0.0007780419580419581,
"step": 2088,
"tokens_trained": 1.026272984
},
{
"epoch": 0.5928657542018296,
"grad_norm": 7.2955121994018555,
"loss": 4.1992,
"lr": 0.0007777622377622377,
"step": 2090,
"tokens_trained": 1.02725572
},
{
"epoch": 0.5934330898517836,
"grad_norm": 28.561243057250977,
"loss": 4.2098,
"lr": 0.0007774825174825176,
"step": 2092,
"tokens_trained": 1.028238456
},
{
"epoch": 0.5940004255017375,
"grad_norm": 16.715425491333008,
"loss": 4.1509,
"lr": 0.0007772027972027972,
"step": 2094,
"tokens_trained": 1.029226048
},
{
"epoch": 0.5945677611516914,
"grad_norm": 6.325936317443848,
"loss": 4.1221,
"lr": 0.000776923076923077,
"step": 2096,
"tokens_trained": 1.030210528
},
{
"epoch": 0.5951350968016452,
"grad_norm": 12.83181381225586,
"loss": 4.1808,
"lr": 0.0007766433566433566,
"step": 2098,
"tokens_trained": 1.031193456
},
{
"epoch": 0.5957024324515992,
"grad_norm": 12.183184623718262,
"loss": 4.1292,
"lr": 0.0007763636363636363,
"step": 2100,
"tokens_trained": 1.032173528
},
{
"epoch": 0.5962697681015531,
"grad_norm": 8.247485160827637,
"loss": 4.1425,
"lr": 0.0007760839160839162,
"step": 2102,
"tokens_trained": 1.033158144
},
{
"epoch": 0.596837103751507,
"grad_norm": 10.814559936523438,
"loss": 4.1167,
"lr": 0.0007758041958041958,
"step": 2104,
"tokens_trained": 1.034141216
},
{
"epoch": 0.5974044394014609,
"grad_norm": 12.589309692382812,
"loss": 4.0916,
"lr": 0.0007755244755244756,
"step": 2106,
"tokens_trained": 1.035121888
},
{
"epoch": 0.5979717750514147,
"grad_norm": 11.65658187866211,
"loss": 4.0776,
"lr": 0.0007752447552447552,
"step": 2108,
"tokens_trained": 1.036103688
},
{
"epoch": 0.5985391107013687,
"grad_norm": 18.0120792388916,
"loss": 4.1588,
"lr": 0.0007749650349650351,
"step": 2110,
"tokens_trained": 1.03708248
},
{
"epoch": 0.5991064463513226,
"grad_norm": 5.742938995361328,
"loss": 4.151,
"lr": 0.0007746853146853147,
"step": 2112,
"tokens_trained": 1.038068792
},
{
"epoch": 0.5996737820012765,
"grad_norm": 36.54581832885742,
"loss": 4.2239,
"lr": 0.0007744055944055944,
"step": 2114,
"tokens_trained": 1.03904728
},
{
"epoch": 0.6002411176512304,
"grad_norm": 13.304069519042969,
"loss": 4.152,
"lr": 0.0007741258741258741,
"step": 2116,
"tokens_trained": 1.040031312
},
{
"epoch": 0.6008084533011843,
"grad_norm": 18.68927001953125,
"loss": 4.1413,
"lr": 0.0007738461538461538,
"step": 2118,
"tokens_trained": 1.041018376
},
{
"epoch": 0.6013757889511382,
"grad_norm": 16.946630477905273,
"loss": 4.1122,
"lr": 0.0007735664335664337,
"step": 2120,
"tokens_trained": 1.0420056
},
{
"epoch": 0.6019431246010921,
"grad_norm": 4.236926078796387,
"loss": 4.1146,
"lr": 0.0007732867132867133,
"step": 2122,
"tokens_trained": 1.042990376
},
{
"epoch": 0.602510460251046,
"grad_norm": 12.148641586303711,
"loss": 4.1472,
"lr": 0.0007730069930069931,
"step": 2124,
"tokens_trained": 1.0439754
},
{
"epoch": 0.602794128076023,
"eval_loss": 1.039306640625,
"eval_runtime": 20.6138,
"step": 2125,
"tokens_trained": 1.044467008
},
{
"epoch": 0.603077795901,
"grad_norm": 17.051687240600586,
"loss": 4.1572,
"lr": 0.0007727272727272727,
"step": 2126,
"tokens_trained": 1.044957456
},
{
"epoch": 0.6036451315509538,
"grad_norm": 14.019828796386719,
"loss": 4.1464,
"lr": 0.0007724475524475525,
"step": 2128,
"tokens_trained": 1.04593944
},
{
"epoch": 0.6042124672009077,
"grad_norm": 11.22962760925293,
"loss": 4.1345,
"lr": 0.0007721678321678322,
"step": 2130,
"tokens_trained": 1.046919592
},
{
"epoch": 0.6047798028508616,
"grad_norm": 11.524348258972168,
"loss": 4.1233,
"lr": 0.0007718881118881119,
"step": 2132,
"tokens_trained": 1.047904744
},
{
"epoch": 0.6053471385008156,
"grad_norm": 7.174457550048828,
"loss": 4.1201,
"lr": 0.0007716083916083916,
"step": 2134,
"tokens_trained": 1.048885328
},
{
"epoch": 0.6059144741507695,
"grad_norm": 6.847499847412109,
"loss": 4.1313,
"lr": 0.0007713286713286713,
"step": 2136,
"tokens_trained": 1.049868776
},
{
"epoch": 0.6064818098007233,
"grad_norm": 8.44458293914795,
"loss": 4.1236,
"lr": 0.0007710489510489512,
"step": 2138,
"tokens_trained": 1.050852704
},
{
"epoch": 0.6070491454506772,
"grad_norm": 15.415260314941406,
"loss": 4.1424,
"lr": 0.0007707692307692308,
"step": 2140,
"tokens_trained": 1.051837736
},
{
"epoch": 0.6076164811006312,
"grad_norm": 16.845874786376953,
"loss": 4.1037,
"lr": 0.0007704895104895105,
"step": 2142,
"tokens_trained": 1.05282172
},
{
"epoch": 0.6081838167505851,
"grad_norm": 1.3947086334228516,
"loss": 4.1389,
"lr": 0.0007702097902097902,
"step": 2144,
"tokens_trained": 1.053802928
},
{
"epoch": 0.608751152400539,
"grad_norm": 3.4119038581848145,
"loss": 4.16,
"lr": 0.0007699300699300699,
"step": 2146,
"tokens_trained": 1.054784368
},
{
"epoch": 0.6093184880504928,
"grad_norm": 9.26860523223877,
"loss": 4.1841,
"lr": 0.0007696503496503497,
"step": 2148,
"tokens_trained": 1.05576888
},
{
"epoch": 0.6098858237004467,
"grad_norm": 8.744836807250977,
"loss": 4.1043,
"lr": 0.0007693706293706294,
"step": 2150,
"tokens_trained": 1.056751336
},
{
"epoch": 0.6104531593504007,
"grad_norm": 8.805045127868652,
"loss": 4.1032,
"lr": 0.000769090909090909,
"step": 2152,
"tokens_trained": 1.057734
},
{
"epoch": 0.6110204950003546,
"grad_norm": 4.785625457763672,
"loss": 4.1817,
"lr": 0.0007688111888111888,
"step": 2154,
"tokens_trained": 1.058716328
},
{
"epoch": 0.6115878306503085,
"grad_norm": 2.2137513160705566,
"loss": 4.1514,
"lr": 0.0007685314685314686,
"step": 2156,
"tokens_trained": 1.059696248
},
{
"epoch": 0.6121551663002623,
"grad_norm": 7.164271354675293,
"loss": 4.1433,
"lr": 0.0007682517482517483,
"step": 2158,
"tokens_trained": 1.060676648
},
{
"epoch": 0.6127225019502163,
"grad_norm": 9.481597900390625,
"loss": 4.0971,
"lr": 0.000767972027972028,
"step": 2160,
"tokens_trained": 1.061656688
},
{
"epoch": 0.6132898376001702,
"grad_norm": 11.28831672668457,
"loss": 4.149,
"lr": 0.0007676923076923077,
"step": 2162,
"tokens_trained": 1.062640576
},
{
"epoch": 0.6138571732501241,
"grad_norm": 17.21572494506836,
"loss": 4.098,
"lr": 0.0007674125874125874,
"step": 2164,
"tokens_trained": 1.063617688
},
{
"epoch": 0.614424508900078,
"grad_norm": 14.486310005187988,
"loss": 4.123,
"lr": 0.0007671328671328672,
"step": 2166,
"tokens_trained": 1.06460584
},
{
"epoch": 0.6149918445500319,
"grad_norm": 10.582398414611816,
"loss": 4.1243,
"lr": 0.0007668531468531469,
"step": 2168,
"tokens_trained": 1.065589064
},
{
"epoch": 0.6155591801999858,
"grad_norm": 12.923002243041992,
"loss": 4.0928,
"lr": 0.0007665734265734265,
"step": 2170,
"tokens_trained": 1.06657224
},
{
"epoch": 0.6161265158499397,
"grad_norm": 12.445414543151855,
"loss": 4.1697,
"lr": 0.0007662937062937063,
"step": 2172,
"tokens_trained": 1.067556952
},
{
"epoch": 0.6166938514998936,
"grad_norm": 3.562396287918091,
"loss": 4.0763,
"lr": 0.000766013986013986,
"step": 2174,
"tokens_trained": 1.068538248
},
{
"epoch": 0.6172611871498476,
"grad_norm": 12.62887954711914,
"loss": 4.1203,
"lr": 0.0007657342657342658,
"step": 2176,
"tokens_trained": 1.06952032
},
{
"epoch": 0.6178285227998014,
"grad_norm": 9.387356758117676,
"loss": 4.1318,
"lr": 0.0007654545454545455,
"step": 2178,
"tokens_trained": 1.070503872
},
{
"epoch": 0.6183958584497553,
"grad_norm": 8.885710716247559,
"loss": 4.1609,
"lr": 0.0007651748251748251,
"step": 2180,
"tokens_trained": 1.071486328
},
{
"epoch": 0.6189631940997092,
"grad_norm": 7.174533843994141,
"loss": 4.0824,
"lr": 0.0007648951048951049,
"step": 2182,
"tokens_trained": 1.07246928
},
{
"epoch": 0.6195305297496632,
"grad_norm": 15.866931915283203,
"loss": 4.1461,
"lr": 0.0007646153846153846,
"step": 2184,
"tokens_trained": 1.07345252
},
{
"epoch": 0.6200978653996171,
"grad_norm": 4.892337799072266,
"loss": 4.1418,
"lr": 0.0007643356643356644,
"step": 2186,
"tokens_trained": 1.07443796
},
{
"epoch": 0.6206652010495709,
"grad_norm": 4.796551704406738,
"loss": 4.1394,
"lr": 0.000764055944055944,
"step": 2188,
"tokens_trained": 1.075421392
},
{
"epoch": 0.6212325366995248,
"grad_norm": 10.585665702819824,
"loss": 4.1046,
"lr": 0.0007637762237762238,
"step": 2190,
"tokens_trained": 1.076404848
},
{
"epoch": 0.6217998723494788,
"grad_norm": 8.71747875213623,
"loss": 4.1819,
"lr": 0.0007634965034965035,
"step": 2192,
"tokens_trained": 1.077386672
},
{
"epoch": 0.6223672079994327,
"grad_norm": 10.74347972869873,
"loss": 4.1231,
"lr": 0.0007632167832167833,
"step": 2194,
"tokens_trained": 1.078365112
},
{
"epoch": 0.6229345436493866,
"grad_norm": 12.079446792602539,
"loss": 4.1132,
"lr": 0.000762937062937063,
"step": 2196,
"tokens_trained": 1.07935376
},
{
"epoch": 0.6235018792993404,
"grad_norm": 7.8133649826049805,
"loss": 4.0915,
"lr": 0.0007626573426573426,
"step": 2198,
"tokens_trained": 1.080332872
},
{
"epoch": 0.6240692149492943,
"grad_norm": 4.51243782043457,
"loss": 4.1108,
"lr": 0.0007623776223776224,
"step": 2200,
"tokens_trained": 1.081316664
},
{
"epoch": 0.6246365505992483,
"grad_norm": 12.625933647155762,
"loss": 4.1552,
"lr": 0.0007620979020979021,
"step": 2202,
"tokens_trained": 1.08230448
},
{
"epoch": 0.6252038862492022,
"grad_norm": 9.984200477600098,
"loss": 4.1199,
"lr": 0.0007618181818181819,
"step": 2204,
"tokens_trained": 1.083288992
},
{
"epoch": 0.6257712218991561,
"grad_norm": 11.338666915893555,
"loss": 4.0821,
"lr": 0.0007615384615384615,
"step": 2206,
"tokens_trained": 1.084273864
},
{
"epoch": 0.6263385575491099,
"grad_norm": 6.808894634246826,
"loss": 4.1202,
"lr": 0.0007612587412587412,
"step": 2208,
"tokens_trained": 1.085254584
},
{
"epoch": 0.6269058931990639,
"grad_norm": 4.182394027709961,
"loss": 4.1072,
"lr": 0.000760979020979021,
"step": 2210,
"tokens_trained": 1.086237312
},
{
"epoch": 0.6274732288490178,
"grad_norm": 13.04654312133789,
"loss": 4.1611,
"lr": 0.0007606993006993007,
"step": 2212,
"tokens_trained": 1.087220136
},
{
"epoch": 0.6280405644989717,
"grad_norm": 8.223962783813477,
"loss": 4.1094,
"lr": 0.0007604195804195805,
"step": 2214,
"tokens_trained": 1.088203464
},
{
"epoch": 0.6286079001489256,
"grad_norm": 7.974697589874268,
"loss": 4.1061,
"lr": 0.0007601398601398601,
"step": 2216,
"tokens_trained": 1.089188056
},
{
"epoch": 0.6291752357988795,
"grad_norm": 9.93747329711914,
"loss": 4.1625,
"lr": 0.0007598601398601399,
"step": 2218,
"tokens_trained": 1.090168464
},
{
"epoch": 0.6297425714488334,
"grad_norm": 14.117332458496094,
"loss": 4.1386,
"lr": 0.0007595804195804196,
"step": 2220,
"tokens_trained": 1.09115228
},
{
"epoch": 0.6303099070987873,
"grad_norm": 8.045380592346191,
"loss": 4.0962,
"lr": 0.0007593006993006993,
"step": 2222,
"tokens_trained": 1.0921348
},
{
"epoch": 0.6308772427487412,
"grad_norm": 7.286352634429932,
"loss": 4.1456,
"lr": 0.000759020979020979,
"step": 2224,
"tokens_trained": 1.0931198
},
{
"epoch": 0.6314445783986952,
"grad_norm": 7.278292179107666,
"loss": 4.1155,
"lr": 0.0007587412587412587,
"step": 2226,
"tokens_trained": 1.094107536
},
{
"epoch": 0.632011914048649,
"grad_norm": 5.973489761352539,
"loss": 4.1403,
"lr": 0.0007584615384615385,
"step": 2228,
"tokens_trained": 1.095090384
},
{
"epoch": 0.6325792496986029,
"grad_norm": 11.78962230682373,
"loss": 4.1322,
"lr": 0.0007581818181818182,
"step": 2230,
"tokens_trained": 1.096072192
},
{
"epoch": 0.6331465853485568,
"grad_norm": 9.853010177612305,
"loss": 4.0905,
"lr": 0.000757902097902098,
"step": 2232,
"tokens_trained": 1.097057368
},
{
"epoch": 0.6337139209985108,
"grad_norm": 12.578025817871094,
"loss": 4.0871,
"lr": 0.0007576223776223776,
"step": 2234,
"tokens_trained": 1.0980418
},
{
"epoch": 0.6342812566484647,
"grad_norm": 8.467657089233398,
"loss": 4.0972,
"lr": 0.0007573426573426573,
"step": 2236,
"tokens_trained": 1.099023032
},
{
"epoch": 0.6348485922984185,
"grad_norm": 10.768691062927246,
"loss": 4.0683,
"lr": 0.0007570629370629371,
"step": 2238,
"tokens_trained": 1.1000078
},
{
"epoch": 0.6354159279483724,
"grad_norm": 8.509350776672363,
"loss": 4.1319,
"lr": 0.0007567832167832168,
"step": 2240,
"tokens_trained": 1.100990904
},
{
"epoch": 0.6359832635983264,
"grad_norm": 9.473450660705566,
"loss": 4.0971,
"lr": 0.0007565034965034965,
"step": 2242,
"tokens_trained": 1.101971112
},
{
"epoch": 0.6365505992482803,
"grad_norm": 5.248406887054443,
"loss": 4.1212,
"lr": 0.0007562237762237762,
"step": 2244,
"tokens_trained": 1.10295244
},
{
"epoch": 0.6371179348982342,
"grad_norm": 2.8849964141845703,
"loss": 4.0914,
"lr": 0.000755944055944056,
"step": 2246,
"tokens_trained": 1.103935728
},
{
"epoch": 0.637685270548188,
"grad_norm": 10.757996559143066,
"loss": 4.0711,
"lr": 0.0007556643356643357,
"step": 2248,
"tokens_trained": 1.104917112
},
{
"epoch": 0.638252606198142,
"grad_norm": 14.822528839111328,
"loss": 4.1311,
"lr": 0.0007553846153846154,
"step": 2250,
"tokens_trained": 1.105899872
},
{
"epoch": 0.638252606198142,
"eval_loss": 1.0298579931259155,
"eval_runtime": 20.7482,
"step": 2250,
"tokens_trained": 1.105899872
},
{
"epoch": 0.6388199418480959,
"grad_norm": 12.402534484863281,
"loss": 4.0729,
"lr": 0.0007551048951048951,
"step": 2252,
"tokens_trained": 1.106885776
},
{
"epoch": 0.6393872774980498,
"grad_norm": 8.585915565490723,
"loss": 4.1026,
"lr": 0.0007548251748251748,
"step": 2254,
"tokens_trained": 1.107867784
},
{
"epoch": 0.6399546131480037,
"grad_norm": 9.298388481140137,
"loss": 4.1033,
"lr": 0.0007545454545454546,
"step": 2256,
"tokens_trained": 1.108846136
},
{
"epoch": 0.6405219487979575,
"grad_norm": 10.894235610961914,
"loss": 4.1212,
"lr": 0.0007542657342657343,
"step": 2258,
"tokens_trained": 1.10982972
},
{
"epoch": 0.6410892844479115,
"grad_norm": 7.488401889801025,
"loss": 4.1268,
"lr": 0.000753986013986014,
"step": 2260,
"tokens_trained": 1.110815128
},
{
"epoch": 0.6416566200978654,
"grad_norm": 10.087981224060059,
"loss": 4.0819,
"lr": 0.0007537062937062937,
"step": 2262,
"tokens_trained": 1.111796896
},
{
"epoch": 0.6422239557478193,
"grad_norm": 8.851993560791016,
"loss": 4.0903,
"lr": 0.0007534265734265734,
"step": 2264,
"tokens_trained": 1.112779032
},
{
"epoch": 0.6427912913977732,
"grad_norm": 7.973280429840088,
"loss": 4.1251,
"lr": 0.0007531468531468532,
"step": 2266,
"tokens_trained": 1.11376248
},
{
"epoch": 0.6433586270477271,
"grad_norm": 10.600922584533691,
"loss": 4.1062,
"lr": 0.0007528671328671329,
"step": 2268,
"tokens_trained": 1.11474752
},
{
"epoch": 0.643925962697681,
"grad_norm": 6.029149532318115,
"loss": 4.1174,
"lr": 0.0007525874125874126,
"step": 2270,
"tokens_trained": 1.115730304
},
{
"epoch": 0.6444932983476349,
"grad_norm": 5.804802417755127,
"loss": 4.0634,
"lr": 0.0007523076923076923,
"step": 2272,
"tokens_trained": 1.116712712
},
{
"epoch": 0.6450606339975888,
"grad_norm": 12.601567268371582,
"loss": 4.111,
"lr": 0.0007520279720279721,
"step": 2274,
"tokens_trained": 1.117692824
},
{
"epoch": 0.6456279696475428,
"grad_norm": 6.2783203125,
"loss": 4.1375,
"lr": 0.0007517482517482518,
"step": 2276,
"tokens_trained": 1.118681616
},
{
"epoch": 0.6461953052974966,
"grad_norm": 3.368333339691162,
"loss": 4.096,
"lr": 0.0007514685314685314,
"step": 2278,
"tokens_trained": 1.119662896
},
{
"epoch": 0.6467626409474505,
"grad_norm": 28.135610580444336,
"loss": 4.1362,
"lr": 0.0007511888111888112,
"step": 2280,
"tokens_trained": 1.120644592
},
{
"epoch": 0.6473299765974044,
"grad_norm": 31.932798385620117,
"loss": 4.177,
"lr": 0.0007509090909090909,
"step": 2282,
"tokens_trained": 1.1216274
},
{
"epoch": 0.6478973122473584,
"grad_norm": 18.303653717041016,
"loss": 4.2105,
"lr": 0.0007506293706293707,
"step": 2284,
"tokens_trained": 1.122610568
},
{
"epoch": 0.6484646478973123,
"grad_norm": 24.33900260925293,
"loss": 4.1685,
"lr": 0.0007503496503496504,
"step": 2286,
"tokens_trained": 1.1235948
},
{
"epoch": 0.6490319835472661,
"grad_norm": 14.718119621276855,
"loss": 4.1309,
"lr": 0.00075006993006993,
"step": 2288,
"tokens_trained": 1.124576952
},
{
"epoch": 0.64959931919722,
"grad_norm": 10.44218921661377,
"loss": 4.1178,
"lr": 0.0007497902097902098,
"step": 2290,
"tokens_trained": 1.12555812
},
{
"epoch": 0.650166654847174,
"grad_norm": 12.619060516357422,
"loss": 4.088,
"lr": 0.0007495104895104895,
"step": 2292,
"tokens_trained": 1.126542504
},
{
"epoch": 0.6507339904971279,
"grad_norm": 12.677931785583496,
"loss": 4.1146,
"lr": 0.0007492307692307693,
"step": 2294,
"tokens_trained": 1.127527144
},
{
"epoch": 0.6513013261470818,
"grad_norm": 9.913066864013672,
"loss": 4.1376,
"lr": 0.0007489510489510489,
"step": 2296,
"tokens_trained": 1.128511472
},
{
"epoch": 0.6518686617970356,
"grad_norm": 10.902573585510254,
"loss": 4.1184,
"lr": 0.0007486713286713287,
"step": 2298,
"tokens_trained": 1.129493144
},
{
"epoch": 0.6524359974469895,
"grad_norm": 11.475235939025879,
"loss": 4.098,
"lr": 0.0007483916083916084,
"step": 2300,
"tokens_trained": 1.13047816
},
{
"epoch": 0.6530033330969435,
"grad_norm": 11.541910171508789,
"loss": 4.106,
"lr": 0.0007481118881118882,
"step": 2302,
"tokens_trained": 1.131461952
},
{
"epoch": 0.6535706687468974,
"grad_norm": 8.055131912231445,
"loss": 4.0913,
"lr": 0.0007478321678321679,
"step": 2304,
"tokens_trained": 1.132445928
},
{
"epoch": 0.6541380043968513,
"grad_norm": 11.786042213439941,
"loss": 4.14,
"lr": 0.0007475524475524475,
"step": 2306,
"tokens_trained": 1.133430104
},
{
"epoch": 0.6547053400468051,
"grad_norm": 7.311541557312012,
"loss": 4.0989,
"lr": 0.0007472727272727273,
"step": 2308,
"tokens_trained": 1.1344128
},
{
"epoch": 0.6552726756967591,
"grad_norm": 5.909560680389404,
"loss": 4.1226,
"lr": 0.000746993006993007,
"step": 2310,
"tokens_trained": 1.135395456
},
{
"epoch": 0.655840011346713,
"grad_norm": 15.199941635131836,
"loss": 4.1003,
"lr": 0.0007467132867132868,
"step": 2312,
"tokens_trained": 1.136377952
},
{
"epoch": 0.6564073469966669,
"grad_norm": 11.078165054321289,
"loss": 4.1273,
"lr": 0.0007464335664335664,
"step": 2314,
"tokens_trained": 1.137364488
},
{
"epoch": 0.6569746826466208,
"grad_norm": 14.202346801757812,
"loss": 4.074,
"lr": 0.0007461538461538462,
"step": 2316,
"tokens_trained": 1.138348624
},
{
"epoch": 0.6575420182965747,
"grad_norm": 12.573927879333496,
"loss": 4.0749,
"lr": 0.0007458741258741259,
"step": 2318,
"tokens_trained": 1.139332304
},
{
"epoch": 0.6581093539465286,
"grad_norm": 4.582006454467773,
"loss": 4.1204,
"lr": 0.0007455944055944056,
"step": 2320,
"tokens_trained": 1.140317248
},
{
"epoch": 0.6586766895964825,
"grad_norm": 12.172183990478516,
"loss": 4.1045,
"lr": 0.0007453146853146854,
"step": 2322,
"tokens_trained": 1.141300976
},
{
"epoch": 0.6592440252464364,
"grad_norm": 8.110429763793945,
"loss": 4.1081,
"lr": 0.000745034965034965,
"step": 2324,
"tokens_trained": 1.142283576
},
{
"epoch": 0.6598113608963904,
"grad_norm": 7.653029918670654,
"loss": 4.1272,
"lr": 0.0007447552447552448,
"step": 2326,
"tokens_trained": 1.143264144
},
{
"epoch": 0.6603786965463442,
"grad_norm": 8.91545295715332,
"loss": 4.0604,
"lr": 0.0007444755244755245,
"step": 2328,
"tokens_trained": 1.144248336
},
{
"epoch": 0.6609460321962981,
"grad_norm": 8.173501014709473,
"loss": 4.1033,
"lr": 0.0007441958041958043,
"step": 2330,
"tokens_trained": 1.145231936
},
{
"epoch": 0.661513367846252,
"grad_norm": 6.748053550720215,
"loss": 4.1,
"lr": 0.0007439160839160839,
"step": 2332,
"tokens_trained": 1.146214208
},
{
"epoch": 0.662080703496206,
"grad_norm": 8.997527122497559,
"loss": 4.0642,
"lr": 0.0007436363636363636,
"step": 2334,
"tokens_trained": 1.147203592
},
{
"epoch": 0.6626480391461599,
"grad_norm": 5.39633321762085,
"loss": 4.0531,
"lr": 0.0007433566433566433,
"step": 2336,
"tokens_trained": 1.148189176
},
{
"epoch": 0.6632153747961137,
"grad_norm": 11.717559814453125,
"loss": 4.1069,
"lr": 0.0007430769230769231,
"step": 2338,
"tokens_trained": 1.14917232
},
{
"epoch": 0.6637827104460676,
"grad_norm": 4.895142078399658,
"loss": 4.1119,
"lr": 0.0007427972027972029,
"step": 2340,
"tokens_trained": 1.150150104
},
{
"epoch": 0.6643500460960216,
"grad_norm": 7.677682399749756,
"loss": 4.0787,
"lr": 0.0007425174825174825,
"step": 2342,
"tokens_trained": 1.15113228
},
{
"epoch": 0.6649173817459755,
"grad_norm": 9.910654067993164,
"loss": 4.114,
"lr": 0.0007422377622377622,
"step": 2344,
"tokens_trained": 1.152119112
},
{
"epoch": 0.6654847173959294,
"grad_norm": 7.880978107452393,
"loss": 4.1188,
"lr": 0.000741958041958042,
"step": 2346,
"tokens_trained": 1.153100688
},
{
"epoch": 0.6660520530458832,
"grad_norm": 3.284940242767334,
"loss": 4.0736,
"lr": 0.0007416783216783217,
"step": 2348,
"tokens_trained": 1.1540818
},
{
"epoch": 0.6666193886958371,
"grad_norm": 13.524490356445312,
"loss": 4.0621,
"lr": 0.0007413986013986014,
"step": 2350,
"tokens_trained": 1.155065608
},
{
"epoch": 0.6671867243457911,
"grad_norm": 5.8569135665893555,
"loss": 4.0904,
"lr": 0.0007411188811188811,
"step": 2352,
"tokens_trained": 1.156048544
},
{
"epoch": 0.667754059995745,
"grad_norm": 7.1157450675964355,
"loss": 4.0774,
"lr": 0.0007408391608391608,
"step": 2354,
"tokens_trained": 1.157030432
},
{
"epoch": 0.6683213956456989,
"grad_norm": 7.612982273101807,
"loss": 4.0829,
"lr": 0.0007405594405594406,
"step": 2356,
"tokens_trained": 1.158012728
},
{
"epoch": 0.6688887312956527,
"grad_norm": 8.317691802978516,
"loss": 4.1176,
"lr": 0.0007402797202797204,
"step": 2358,
"tokens_trained": 1.158993632
},
{
"epoch": 0.6694560669456067,
"grad_norm": 5.272528648376465,
"loss": 4.0977,
"lr": 0.00074,
"step": 2360,
"tokens_trained": 1.159976328
},
{
"epoch": 0.6700234025955606,
"grad_norm": 11.313931465148926,
"loss": 4.0792,
"lr": 0.0007397202797202797,
"step": 2362,
"tokens_trained": 1.160962072
},
{
"epoch": 0.6705907382455145,
"grad_norm": 12.588369369506836,
"loss": 4.0491,
"lr": 0.0007394405594405595,
"step": 2364,
"tokens_trained": 1.161947664
},
{
"epoch": 0.6711580738954684,
"grad_norm": 23.921968460083008,
"loss": 4.1085,
"lr": 0.0007391608391608392,
"step": 2366,
"tokens_trained": 1.16292872
},
{
"epoch": 0.6717254095454223,
"grad_norm": 9.100578308105469,
"loss": 4.1305,
"lr": 0.0007388811188811189,
"step": 2368,
"tokens_trained": 1.163913888
},
{
"epoch": 0.6722927451953762,
"grad_norm": 35.22720718383789,
"loss": 4.1538,
"lr": 0.0007386013986013986,
"step": 2370,
"tokens_trained": 1.164894912
},
{
"epoch": 0.6728600808453301,
"grad_norm": 16.7394962310791,
"loss": 4.1449,
"lr": 0.0007383216783216782,
"step": 2372,
"tokens_trained": 1.165879832
},
{
"epoch": 0.673427416495284,
"grad_norm": 11.066312789916992,
"loss": 4.1172,
"lr": 0.0007380419580419581,
"step": 2374,
"tokens_trained": 1.166864736
},
{
"epoch": 0.6737110843202609,
"eval_loss": 1.0303717851638794,
"eval_runtime": 20.7454,
"step": 2375,
"tokens_trained": 1.167358632
},
{
"epoch": 0.673994752145238,
"grad_norm": 12.827569007873535,
"loss": 4.1377,
"lr": 0.0007377622377622378,
"step": 2376,
"tokens_trained": 1.16784964
},
{
"epoch": 0.6745620877951918,
"grad_norm": 13.321866035461426,
"loss": 4.0747,
"lr": 0.0007374825174825175,
"step": 2378,
"tokens_trained": 1.168834992
},
{
"epoch": 0.6751294234451457,
"grad_norm": 15.812009811401367,
"loss": 4.1107,
"lr": 0.0007372027972027972,
"step": 2380,
"tokens_trained": 1.169817608
},
{
"epoch": 0.6756967590950996,
"grad_norm": 16.37995719909668,
"loss": 4.1556,
"lr": 0.000736923076923077,
"step": 2382,
"tokens_trained": 1.170800952
},
{
"epoch": 0.6762640947450536,
"grad_norm": 3.3421339988708496,
"loss": 4.1199,
"lr": 0.0007366433566433567,
"step": 2384,
"tokens_trained": 1.1717818
},
{
"epoch": 0.6768314303950075,
"grad_norm": 9.120339393615723,
"loss": 4.0834,
"lr": 0.0007363636363636363,
"step": 2386,
"tokens_trained": 1.172767384
},
{
"epoch": 0.6773987660449613,
"grad_norm": 12.614449501037598,
"loss": 4.0852,
"lr": 0.0007360839160839161,
"step": 2388,
"tokens_trained": 1.173755008
},
{
"epoch": 0.6779661016949152,
"grad_norm": 4.983767986297607,
"loss": 4.0881,
"lr": 0.0007358041958041957,
"step": 2390,
"tokens_trained": 1.174738528
},
{
"epoch": 0.6785334373448692,
"grad_norm": 4.194960117340088,
"loss": 4.1279,
"lr": 0.0007355244755244756,
"step": 2392,
"tokens_trained": 1.175724848
},
{
"epoch": 0.6791007729948231,
"grad_norm": 5.257171154022217,
"loss": 4.1044,
"lr": 0.0007352447552447553,
"step": 2394,
"tokens_trained": 1.176708808
},
{
"epoch": 0.679668108644777,
"grad_norm": 10.38420295715332,
"loss": 4.124,
"lr": 0.000734965034965035,
"step": 2396,
"tokens_trained": 1.177695552
},
{
"epoch": 0.6802354442947308,
"grad_norm": 8.629493713378906,
"loss": 4.0992,
"lr": 0.0007346853146853147,
"step": 2398,
"tokens_trained": 1.17868064
},
{
"epoch": 0.6808027799446847,
"grad_norm": 9.099041938781738,
"loss": 4.1047,
"lr": 0.0007344055944055944,
"step": 2400,
"tokens_trained": 1.179664536
},
{
"epoch": 0.6813701155946387,
"grad_norm": 11.343080520629883,
"loss": 4.1027,
"lr": 0.0007341258741258742,
"step": 2402,
"tokens_trained": 1.180644264
},
{
"epoch": 0.6819374512445926,
"grad_norm": 5.834907054901123,
"loss": 4.098,
"lr": 0.0007338461538461538,
"step": 2404,
"tokens_trained": 1.181629672
},
{
"epoch": 0.6825047868945465,
"grad_norm": 4.648270606994629,
"loss": 4.0775,
"lr": 0.0007335664335664336,
"step": 2406,
"tokens_trained": 1.182614064
},
{
"epoch": 0.6830721225445003,
"grad_norm": 6.934843063354492,
"loss": 4.1206,
"lr": 0.0007332867132867132,
"step": 2408,
"tokens_trained": 1.183597056
},
{
"epoch": 0.6836394581944543,
"grad_norm": 9.745563507080078,
"loss": 4.0921,
"lr": 0.0007330069930069931,
"step": 2410,
"tokens_trained": 1.184579832
},
{
"epoch": 0.6842067938444082,
"grad_norm": 7.189306259155273,
"loss": 4.095,
"lr": 0.0007327272727272728,
"step": 2412,
"tokens_trained": 1.185567912
},
{
"epoch": 0.6847741294943621,
"grad_norm": 6.303226947784424,
"loss": 4.0462,
"lr": 0.0007324475524475524,
"step": 2414,
"tokens_trained": 1.186550184
},
{
"epoch": 0.685341465144316,
"grad_norm": 6.373469352722168,
"loss": 4.1126,
"lr": 0.0007321678321678322,
"step": 2416,
"tokens_trained": 1.1875374
},
{
"epoch": 0.6859088007942699,
"grad_norm": 7.8680853843688965,
"loss": 4.0954,
"lr": 0.0007318881118881119,
"step": 2418,
"tokens_trained": 1.188519808
},
{
"epoch": 0.6864761364442238,
"grad_norm": 6.305267810821533,
"loss": 4.0951,
"lr": 0.0007316083916083917,
"step": 2420,
"tokens_trained": 1.18950228
},
{
"epoch": 0.6870434720941777,
"grad_norm": 9.990362167358398,
"loss": 4.0902,
"lr": 0.0007313286713286713,
"step": 2422,
"tokens_trained": 1.190483872
},
{
"epoch": 0.6876108077441316,
"grad_norm": 7.421126365661621,
"loss": 4.082,
"lr": 0.0007310489510489511,
"step": 2424,
"tokens_trained": 1.191465424
},
{
"epoch": 0.6881781433940856,
"grad_norm": 7.08989953994751,
"loss": 4.057,
"lr": 0.0007307692307692307,
"step": 2426,
"tokens_trained": 1.192446
},
{
"epoch": 0.6887454790440394,
"grad_norm": 16.008317947387695,
"loss": 4.0857,
"lr": 0.0007304895104895105,
"step": 2428,
"tokens_trained": 1.193428632
},
{
"epoch": 0.6893128146939933,
"grad_norm": 14.471416473388672,
"loss": 4.127,
"lr": 0.0007302097902097902,
"step": 2430,
"tokens_trained": 1.194413624
},
{
"epoch": 0.6898801503439472,
"grad_norm": 8.250576972961426,
"loss": 4.1244,
"lr": 0.0007299300699300699,
"step": 2432,
"tokens_trained": 1.195396768
},
{
"epoch": 0.6904474859939012,
"grad_norm": 17.120845794677734,
"loss": 4.107,
"lr": 0.0007296503496503497,
"step": 2434,
"tokens_trained": 1.196377144
},
{
"epoch": 0.6910148216438551,
"grad_norm": 24.250490188598633,
"loss": 4.1443,
"lr": 0.0007293706293706294,
"step": 2436,
"tokens_trained": 1.197361496
},
{
"epoch": 0.6915821572938089,
"grad_norm": 9.916406631469727,
"loss": 4.1308,
"lr": 0.0007290909090909092,
"step": 2438,
"tokens_trained": 1.198343376
},
{
"epoch": 0.6921494929437628,
"grad_norm": 29.035507202148438,
"loss": 4.1809,
"lr": 0.0007288111888111888,
"step": 2440,
"tokens_trained": 1.19932396
},
{
"epoch": 0.6927168285937167,
"grad_norm": 26.963102340698242,
"loss": 4.1343,
"lr": 0.0007285314685314685,
"step": 2442,
"tokens_trained": 1.200310088
},
{
"epoch": 0.6932841642436707,
"grad_norm": 9.7550048828125,
"loss": 4.0746,
"lr": 0.0007282517482517482,
"step": 2444,
"tokens_trained": 1.201291576
},
{
"epoch": 0.6938514998936246,
"grad_norm": 18.56088638305664,
"loss": 4.1634,
"lr": 0.000727972027972028,
"step": 2446,
"tokens_trained": 1.202271312
},
{
"epoch": 0.6944188355435784,
"grad_norm": 20.842105865478516,
"loss": 4.128,
"lr": 0.0007276923076923077,
"step": 2448,
"tokens_trained": 1.203252912
},
{
"epoch": 0.6949861711935323,
"grad_norm": 21.38428497314453,
"loss": 4.1263,
"lr": 0.0007274125874125874,
"step": 2450,
"tokens_trained": 1.204231328
},
{
"epoch": 0.6955535068434863,
"grad_norm": 9.129469871520996,
"loss": 4.0964,
"lr": 0.0007271328671328672,
"step": 2452,
"tokens_trained": 1.205215552
},
{
"epoch": 0.6961208424934402,
"grad_norm": 25.37588882446289,
"loss": 4.1568,
"lr": 0.0007268531468531469,
"step": 2454,
"tokens_trained": 1.206202536
},
{
"epoch": 0.6966881781433941,
"grad_norm": 17.409656524658203,
"loss": 4.1214,
"lr": 0.0007265734265734266,
"step": 2456,
"tokens_trained": 1.207182664
},
{
"epoch": 0.6972555137933479,
"grad_norm": 12.378538131713867,
"loss": 4.1235,
"lr": 0.0007262937062937063,
"step": 2458,
"tokens_trained": 1.208164408
},
{
"epoch": 0.6978228494433019,
"grad_norm": 15.208183288574219,
"loss": 4.0724,
"lr": 0.000726013986013986,
"step": 2460,
"tokens_trained": 1.209151056
},
{
"epoch": 0.6983901850932558,
"grad_norm": 15.311476707458496,
"loss": 4.1146,
"lr": 0.0007257342657342657,
"step": 2462,
"tokens_trained": 1.210135672
},
{
"epoch": 0.6989575207432097,
"grad_norm": 8.551816940307617,
"loss": 4.0944,
"lr": 0.0007254545454545455,
"step": 2464,
"tokens_trained": 1.211118992
},
{
"epoch": 0.6995248563931636,
"grad_norm": 5.893448829650879,
"loss": 4.0777,
"lr": 0.0007251748251748252,
"step": 2466,
"tokens_trained": 1.212102
},
{
"epoch": 0.7000921920431175,
"grad_norm": 12.23680591583252,
"loss": 4.0998,
"lr": 0.0007248951048951049,
"step": 2468,
"tokens_trained": 1.213078936
},
{
"epoch": 0.7006595276930714,
"grad_norm": 6.285398006439209,
"loss": 4.0691,
"lr": 0.0007246153846153846,
"step": 2470,
"tokens_trained": 1.214058832
},
{
"epoch": 0.7012268633430253,
"grad_norm": 5.049949645996094,
"loss": 4.0849,
"lr": 0.0007243356643356644,
"step": 2472,
"tokens_trained": 1.215045384
},
{
"epoch": 0.7017941989929792,
"grad_norm": 8.333894729614258,
"loss": 4.1072,
"lr": 0.0007240559440559441,
"step": 2474,
"tokens_trained": 1.216029416
},
{
"epoch": 0.7023615346429332,
"grad_norm": 10.236394882202148,
"loss": 4.1144,
"lr": 0.0007237762237762238,
"step": 2476,
"tokens_trained": 1.217012872
},
{
"epoch": 0.702928870292887,
"grad_norm": 7.674532413482666,
"loss": 4.0948,
"lr": 0.0007234965034965035,
"step": 2478,
"tokens_trained": 1.2179988
},
{
"epoch": 0.7034962059428409,
"grad_norm": 8.445834159851074,
"loss": 4.0937,
"lr": 0.0007232167832167831,
"step": 2480,
"tokens_trained": 1.218980608
},
{
"epoch": 0.7040635415927948,
"grad_norm": 6.923468112945557,
"loss": 4.0756,
"lr": 0.000722937062937063,
"step": 2482,
"tokens_trained": 1.219966912
},
{
"epoch": 0.7046308772427488,
"grad_norm": 5.95997428894043,
"loss": 4.0618,
"lr": 0.0007226573426573426,
"step": 2484,
"tokens_trained": 1.220952696
},
{
"epoch": 0.7051982128927027,
"grad_norm": 3.7207870483398438,
"loss": 4.0869,
"lr": 0.0007223776223776224,
"step": 2486,
"tokens_trained": 1.22193476
},
{
"epoch": 0.7057655485426565,
"grad_norm": 8.434130668640137,
"loss": 4.0965,
"lr": 0.0007220979020979021,
"step": 2488,
"tokens_trained": 1.222914616
},
{
"epoch": 0.7063328841926104,
"grad_norm": 10.180377006530762,
"loss": 4.0871,
"lr": 0.0007218181818181819,
"step": 2490,
"tokens_trained": 1.22389764
},
{
"epoch": 0.7069002198425643,
"grad_norm": 8.211799621582031,
"loss": 4.0811,
"lr": 0.0007215384615384616,
"step": 2492,
"tokens_trained": 1.224875448
},
{
"epoch": 0.7074675554925183,
"grad_norm": 5.268981456756592,
"loss": 4.0926,
"lr": 0.0007212587412587412,
"step": 2494,
"tokens_trained": 1.225858112
},
{
"epoch": 0.7080348911424722,
"grad_norm": 7.387131690979004,
"loss": 4.1097,
"lr": 0.000720979020979021,
"step": 2496,
"tokens_trained": 1.226838472
},
{
"epoch": 0.708602226792426,
"grad_norm": 7.289080619812012,
"loss": 4.0566,
"lr": 0.0007206993006993006,
"step": 2498,
"tokens_trained": 1.227821848
},
{
"epoch": 0.7091695624423799,
"grad_norm": 6.981493949890137,
"loss": 4.062,
"lr": 0.0007204195804195805,
"step": 2500,
"tokens_trained": 1.228806208
},
{
"epoch": 0.7091695624423799,
"eval_loss": 1.0222537517547607,
"eval_runtime": 20.7945,
"step": 2500,
"tokens_trained": 1.228806208
},
{
"epoch": 0.7097368980923339,
"grad_norm": 6.244803428649902,
"loss": 4.1417,
"lr": 0.0007201398601398601,
"step": 2502,
"tokens_trained": 1.229787872
},
{
"epoch": 0.7103042337422878,
"grad_norm": 4.354197978973389,
"loss": 4.0663,
"lr": 0.0007198601398601399,
"step": 2504,
"tokens_trained": 1.23077076
},
{
"epoch": 0.7108715693922417,
"grad_norm": 4.971379280090332,
"loss": 4.0495,
"lr": 0.0007195804195804196,
"step": 2506,
"tokens_trained": 1.231752344
},
{
"epoch": 0.7114389050421955,
"grad_norm": 5.990703582763672,
"loss": 4.0837,
"lr": 0.0007193006993006994,
"step": 2508,
"tokens_trained": 1.232733864
},
{
"epoch": 0.7120062406921495,
"grad_norm": 8.498222351074219,
"loss": 4.0379,
"lr": 0.0007190209790209791,
"step": 2510,
"tokens_trained": 1.233716744
},
{
"epoch": 0.7125735763421034,
"grad_norm": 13.36562442779541,
"loss": 4.0187,
"lr": 0.0007187412587412587,
"step": 2512,
"tokens_trained": 1.234699872
},
{
"epoch": 0.7131409119920573,
"grad_norm": 8.733027458190918,
"loss": 4.092,
"lr": 0.0007184615384615385,
"step": 2514,
"tokens_trained": 1.235684584
},
{
"epoch": 0.7137082476420112,
"grad_norm": 4.150378227233887,
"loss": 4.1277,
"lr": 0.0007181818181818181,
"step": 2516,
"tokens_trained": 1.236669584
},
{
"epoch": 0.714275583291965,
"grad_norm": 5.051011085510254,
"loss": 4.0942,
"lr": 0.000717902097902098,
"step": 2518,
"tokens_trained": 1.237654456
},
{
"epoch": 0.714842918941919,
"grad_norm": 19.51820945739746,
"loss": 4.0784,
"lr": 0.0007176223776223776,
"step": 2520,
"tokens_trained": 1.238634888
},
{
"epoch": 0.7154102545918729,
"grad_norm": 12.287970542907715,
"loss": 4.1096,
"lr": 0.0007173426573426573,
"step": 2522,
"tokens_trained": 1.239617096
},
{
"epoch": 0.7159775902418268,
"grad_norm": 7.280889511108398,
"loss": 4.1173,
"lr": 0.0007170629370629371,
"step": 2524,
"tokens_trained": 1.240599456
},
{
"epoch": 0.7165449258917808,
"grad_norm": 7.321331024169922,
"loss": 4.1011,
"lr": 0.0007167832167832168,
"step": 2526,
"tokens_trained": 1.2415852
},
{
"epoch": 0.7171122615417346,
"grad_norm": 12.695849418640137,
"loss": 4.0652,
"lr": 0.0007165034965034966,
"step": 2528,
"tokens_trained": 1.242566296
},
{
"epoch": 0.7176795971916885,
"grad_norm": 10.30766487121582,
"loss": 4.0683,
"lr": 0.0007162237762237762,
"step": 2530,
"tokens_trained": 1.24354928
},
{
"epoch": 0.7182469328416424,
"grad_norm": 6.451354503631592,
"loss": 4.0712,
"lr": 0.000715944055944056,
"step": 2532,
"tokens_trained": 1.244534464
},
{
"epoch": 0.7188142684915964,
"grad_norm": 13.049304962158203,
"loss": 4.0662,
"lr": 0.0007156643356643356,
"step": 2534,
"tokens_trained": 1.245514976
},
{
"epoch": 0.7193816041415503,
"grad_norm": 6.242895603179932,
"loss": 4.089,
"lr": 0.0007153846153846155,
"step": 2536,
"tokens_trained": 1.246499648
},
{
"epoch": 0.7199489397915041,
"grad_norm": 9.09418773651123,
"loss": 4.0727,
"lr": 0.0007151048951048951,
"step": 2538,
"tokens_trained": 1.247482424
},
{
"epoch": 0.720516275441458,
"grad_norm": 5.704024791717529,
"loss": 4.0973,
"lr": 0.0007148251748251748,
"step": 2540,
"tokens_trained": 1.248465776
},
{
"epoch": 0.721083611091412,
"grad_norm": 1.818793535232544,
"loss": 4.0928,
"lr": 0.0007145454545454546,
"step": 2542,
"tokens_trained": 1.249446792
},
{
"epoch": 0.7216509467413659,
"grad_norm": 8.157804489135742,
"loss": 4.1082,
"lr": 0.0007142657342657343,
"step": 2544,
"tokens_trained": 1.25042832
},
{
"epoch": 0.7222182823913198,
"grad_norm": 12.176240921020508,
"loss": 4.0472,
"lr": 0.0007139860139860141,
"step": 2546,
"tokens_trained": 1.251411112
},
{
"epoch": 0.7227856180412736,
"grad_norm": 9.750322341918945,
"loss": 4.0892,
"lr": 0.0007137062937062937,
"step": 2548,
"tokens_trained": 1.25239148
},
{
"epoch": 0.7233529536912275,
"grad_norm": 7.636045455932617,
"loss": 4.0939,
"lr": 0.0007134265734265734,
"step": 2550,
"tokens_trained": 1.253374936
},
{
"epoch": 0.7239202893411815,
"grad_norm": 9.795125007629395,
"loss": 4.0542,
"lr": 0.0007131468531468531,
"step": 2552,
"tokens_trained": 1.254359048
},
{
"epoch": 0.7244876249911354,
"grad_norm": 7.851208686828613,
"loss": 4.0546,
"lr": 0.0007128671328671329,
"step": 2554,
"tokens_trained": 1.255343552
},
{
"epoch": 0.7250549606410893,
"grad_norm": 7.749396800994873,
"loss": 4.0834,
"lr": 0.0007125874125874126,
"step": 2556,
"tokens_trained": 1.256332976
},
{
"epoch": 0.7256222962910431,
"grad_norm": 7.826572418212891,
"loss": 4.0914,
"lr": 0.0007123076923076923,
"step": 2558,
"tokens_trained": 1.257315376
},
{
"epoch": 0.7261896319409971,
"grad_norm": 7.173867225646973,
"loss": 4.0721,
"lr": 0.0007120279720279721,
"step": 2560,
"tokens_trained": 1.258296944
},
{
"epoch": 0.726756967590951,
"grad_norm": 7.722167015075684,
"loss": 4.092,
"lr": 0.0007117482517482518,
"step": 2562,
"tokens_trained": 1.259278984
},
{
"epoch": 0.7273243032409049,
"grad_norm": 5.8100690841674805,
"loss": 4.0592,
"lr": 0.0007114685314685315,
"step": 2564,
"tokens_trained": 1.260261648
},
{
"epoch": 0.7278916388908588,
"grad_norm": 6.633793830871582,
"loss": 4.0871,
"lr": 0.0007111888111888112,
"step": 2566,
"tokens_trained": 1.261235168
},
{
"epoch": 0.7284589745408127,
"grad_norm": 9.645057678222656,
"loss": 4.0707,
"lr": 0.0007109090909090909,
"step": 2568,
"tokens_trained": 1.26221864
},
{
"epoch": 0.7290263101907666,
"grad_norm": 8.770727157592773,
"loss": 4.0757,
"lr": 0.0007106293706293706,
"step": 2570,
"tokens_trained": 1.263199256
},
{
"epoch": 0.7295936458407205,
"grad_norm": 6.190083980560303,
"loss": 4.0911,
"lr": 0.0007103496503496504,
"step": 2572,
"tokens_trained": 1.264180424
},
{
"epoch": 0.7301609814906744,
"grad_norm": 11.070337295532227,
"loss": 4.0566,
"lr": 0.0007100699300699301,
"step": 2574,
"tokens_trained": 1.265164384
},
{
"epoch": 0.7307283171406284,
"grad_norm": 8.301725387573242,
"loss": 4.0636,
"lr": 0.0007097902097902098,
"step": 2576,
"tokens_trained": 1.266148592
},
{
"epoch": 0.7312956527905822,
"grad_norm": 5.524992942810059,
"loss": 4.0974,
"lr": 0.0007095104895104895,
"step": 2578,
"tokens_trained": 1.26712948
},
{
"epoch": 0.7318629884405361,
"grad_norm": 11.42268180847168,
"loss": 4.0858,
"lr": 0.0007092307692307692,
"step": 2580,
"tokens_trained": 1.268107968
},
{
"epoch": 0.73243032409049,
"grad_norm": 6.110471725463867,
"loss": 4.0563,
"lr": 0.000708951048951049,
"step": 2582,
"tokens_trained": 1.26909272
},
{
"epoch": 0.732997659740444,
"grad_norm": 4.583469867706299,
"loss": 4.0907,
"lr": 0.0007086713286713287,
"step": 2584,
"tokens_trained": 1.270074432
},
{
"epoch": 0.7335649953903979,
"grad_norm": 4.348790645599365,
"loss": 4.0768,
"lr": 0.0007083916083916084,
"step": 2586,
"tokens_trained": 1.271059184
},
{
"epoch": 0.7341323310403517,
"grad_norm": 9.383113861083984,
"loss": 4.0829,
"lr": 0.000708111888111888,
"step": 2588,
"tokens_trained": 1.272044288
},
{
"epoch": 0.7346996666903056,
"grad_norm": 8.594022750854492,
"loss": 4.097,
"lr": 0.0007078321678321679,
"step": 2590,
"tokens_trained": 1.273026808
},
{
"epoch": 0.7352670023402595,
"grad_norm": 8.971443176269531,
"loss": 4.0689,
"lr": 0.0007075524475524475,
"step": 2592,
"tokens_trained": 1.274011272
},
{
"epoch": 0.7358343379902135,
"grad_norm": 14.21872615814209,
"loss": 4.0892,
"lr": 0.0007072727272727273,
"step": 2594,
"tokens_trained": 1.274995728
},
{
"epoch": 0.7364016736401674,
"grad_norm": 5.579262733459473,
"loss": 4.1151,
"lr": 0.000706993006993007,
"step": 2596,
"tokens_trained": 1.27598244
},
{
"epoch": 0.7369690092901212,
"grad_norm": 7.760303974151611,
"loss": 4.0923,
"lr": 0.0007067132867132867,
"step": 2598,
"tokens_trained": 1.276966176
},
{
"epoch": 0.7375363449400751,
"grad_norm": 8.493928909301758,
"loss": 4.1002,
"lr": 0.0007064335664335665,
"step": 2600,
"tokens_trained": 1.277946064
},
{
"epoch": 0.7381036805900291,
"grad_norm": 7.7460126876831055,
"loss": 4.0464,
"lr": 0.0007061538461538462,
"step": 2602,
"tokens_trained": 1.278928016
},
{
"epoch": 0.738671016239983,
"grad_norm": 14.752384185791016,
"loss": 4.0694,
"lr": 0.0007058741258741259,
"step": 2604,
"tokens_trained": 1.27991464
},
{
"epoch": 0.7392383518899369,
"grad_norm": 4.13566255569458,
"loss": 4.0852,
"lr": 0.0007055944055944055,
"step": 2606,
"tokens_trained": 1.280898424
},
{
"epoch": 0.7398056875398907,
"grad_norm": 9.910110473632812,
"loss": 4.0819,
"lr": 0.0007053146853146854,
"step": 2608,
"tokens_trained": 1.281880448
},
{
"epoch": 0.7403730231898447,
"grad_norm": 8.776302337646484,
"loss": 4.0908,
"lr": 0.000705034965034965,
"step": 2610,
"tokens_trained": 1.282866224
},
{
"epoch": 0.7409403588397986,
"grad_norm": 7.437447547912598,
"loss": 4.0914,
"lr": 0.0007047552447552448,
"step": 2612,
"tokens_trained": 1.283846848
},
{
"epoch": 0.7415076944897525,
"grad_norm": 5.371145248413086,
"loss": 4.0601,
"lr": 0.0007044755244755245,
"step": 2614,
"tokens_trained": 1.284828288
},
{
"epoch": 0.7420750301397064,
"grad_norm": 5.754990100860596,
"loss": 4.034,
"lr": 0.0007041958041958041,
"step": 2616,
"tokens_trained": 1.285813632
},
{
"epoch": 0.7426423657896603,
"grad_norm": 12.21330738067627,
"loss": 4.0893,
"lr": 0.000703916083916084,
"step": 2618,
"tokens_trained": 1.286796048
},
{
"epoch": 0.7432097014396142,
"grad_norm": 6.313106060028076,
"loss": 4.1348,
"lr": 0.0007036363636363636,
"step": 2620,
"tokens_trained": 1.287779984
},
{
"epoch": 0.7437770370895681,
"grad_norm": 3.671832323074341,
"loss": 4.0892,
"lr": 0.0007033566433566434,
"step": 2622,
"tokens_trained": 1.288763704
},
{
"epoch": 0.744344372739522,
"grad_norm": 7.610039710998535,
"loss": 4.0544,
"lr": 0.000703076923076923,
"step": 2624,
"tokens_trained": 1.289748608
},
{
"epoch": 0.7446280405644989,
"eval_loss": 1.0216281414031982,
"eval_runtime": 21.3239,
"step": 2625,
"tokens_trained": 1.290237248
},
{
"epoch": 0.744911708389476,
"grad_norm": 10.805936813354492,
"loss": 4.0702,
"lr": 0.0007027972027972029,
"step": 2626,
"tokens_trained": 1.290726104
},
{
"epoch": 0.7454790440394298,
"grad_norm": 8.497400283813477,
"loss": 4.056,
"lr": 0.0007025174825174825,
"step": 2628,
"tokens_trained": 1.291710888
},
{
"epoch": 0.7460463796893837,
"grad_norm": 7.71652364730835,
"loss": 4.0428,
"lr": 0.0007022377622377623,
"step": 2630,
"tokens_trained": 1.2926998
},
{
"epoch": 0.7466137153393376,
"grad_norm": 11.314064979553223,
"loss": 4.0442,
"lr": 0.000701958041958042,
"step": 2632,
"tokens_trained": 1.293681648
},
{
"epoch": 0.7471810509892916,
"grad_norm": 8.498956680297852,
"loss": 4.0806,
"lr": 0.0007016783216783216,
"step": 2634,
"tokens_trained": 1.29466332
},
{
"epoch": 0.7477483866392455,
"grad_norm": 8.315062522888184,
"loss": 4.0496,
"lr": 0.0007013986013986015,
"step": 2636,
"tokens_trained": 1.29565108
},
{
"epoch": 0.7483157222891993,
"grad_norm": 7.541136264801025,
"loss": 4.0901,
"lr": 0.0007011188811188811,
"step": 2638,
"tokens_trained": 1.296633192
},
{
"epoch": 0.7488830579391532,
"grad_norm": 5.977221965789795,
"loss": 4.0612,
"lr": 0.0007008391608391609,
"step": 2640,
"tokens_trained": 1.297621272
},
{
"epoch": 0.7494503935891071,
"grad_norm": 5.02126932144165,
"loss": 4.0944,
"lr": 0.0007005594405594405,
"step": 2642,
"tokens_trained": 1.298601744
},
{
"epoch": 0.7500177292390611,
"grad_norm": 6.345284938812256,
"loss": 4.0578,
"lr": 0.0007002797202797204,
"step": 2644,
"tokens_trained": 1.299583072
},
{
"epoch": 0.750585064889015,
"grad_norm": 7.036267280578613,
"loss": 4.0472,
"lr": 0.0007,
"step": 2646,
"tokens_trained": 1.300567448
},
{
"epoch": 0.7511524005389689,
"grad_norm": 2.7125253677368164,
"loss": 4.0534,
"lr": 0.0006997202797202797,
"step": 2648,
"tokens_trained": 1.301554096
},
{
"epoch": 0.7517197361889227,
"grad_norm": 3.862492322921753,
"loss": 4.0696,
"lr": 0.0006994405594405595,
"step": 2650,
"tokens_trained": 1.302540112
},
{
"epoch": 0.7522870718388767,
"grad_norm": 2.0384063720703125,
"loss": 4.0662,
"lr": 0.0006991608391608391,
"step": 2652,
"tokens_trained": 1.30352596
},
{
"epoch": 0.7528544074888306,
"grad_norm": 5.195199966430664,
"loss": 4.0819,
"lr": 0.000698881118881119,
"step": 2654,
"tokens_trained": 1.30450616
},
{
"epoch": 0.7534217431387845,
"grad_norm": 14.55208969116211,
"loss": 4.0757,
"lr": 0.0006986013986013986,
"step": 2656,
"tokens_trained": 1.305488752
},
{
"epoch": 0.7539890787887384,
"grad_norm": 10.982531547546387,
"loss": 4.0474,
"lr": 0.0006983216783216784,
"step": 2658,
"tokens_trained": 1.306474856
},
{
"epoch": 0.7545564144386923,
"grad_norm": 7.926928997039795,
"loss": 4.0497,
"lr": 0.000698041958041958,
"step": 2660,
"tokens_trained": 1.307456136
},
{
"epoch": 0.7551237500886462,
"grad_norm": 5.156681537628174,
"loss": 4.098,
"lr": 0.0006977622377622378,
"step": 2662,
"tokens_trained": 1.308442664
},
{
"epoch": 0.7556910857386001,
"grad_norm": 8.156705856323242,
"loss": 4.0828,
"lr": 0.0006974825174825175,
"step": 2664,
"tokens_trained": 1.309422976
},
{
"epoch": 0.756258421388554,
"grad_norm": 8.489871978759766,
"loss": 4.0668,
"lr": 0.0006972027972027972,
"step": 2666,
"tokens_trained": 1.310406152
},
{
"epoch": 0.756825757038508,
"grad_norm": 13.065528869628906,
"loss": 4.0915,
"lr": 0.000696923076923077,
"step": 2668,
"tokens_trained": 1.311392576
},
{
"epoch": 0.7573930926884618,
"grad_norm": 7.475847244262695,
"loss": 4.0308,
"lr": 0.0006966433566433566,
"step": 2670,
"tokens_trained": 1.312378776
},
{
"epoch": 0.7579604283384157,
"grad_norm": 7.049544334411621,
"loss": 4.0662,
"lr": 0.0006963636363636365,
"step": 2672,
"tokens_trained": 1.313358848
},
{
"epoch": 0.7585277639883696,
"grad_norm": 5.037269115447998,
"loss": 4.1016,
"lr": 0.0006960839160839161,
"step": 2674,
"tokens_trained": 1.3143412
},
{
"epoch": 0.7590950996383236,
"grad_norm": 10.421965599060059,
"loss": 4.0655,
"lr": 0.0006958041958041958,
"step": 2676,
"tokens_trained": 1.315322968
},
{
"epoch": 0.7596624352882775,
"grad_norm": 8.08486557006836,
"loss": 4.0933,
"lr": 0.0006955244755244755,
"step": 2678,
"tokens_trained": 1.316306592
},
{
"epoch": 0.7602297709382313,
"grad_norm": 10.121665954589844,
"loss": 4.0673,
"lr": 0.0006952447552447553,
"step": 2680,
"tokens_trained": 1.317292536
},
{
"epoch": 0.7607971065881852,
"grad_norm": 4.840561389923096,
"loss": 4.089,
"lr": 0.000694965034965035,
"step": 2682,
"tokens_trained": 1.318278512
},
{
"epoch": 0.7613644422381391,
"grad_norm": 5.03504753112793,
"loss": 4.0696,
"lr": 0.0006946853146853147,
"step": 2684,
"tokens_trained": 1.319263032
},
{
"epoch": 0.7619317778880931,
"grad_norm": 12.180596351623535,
"loss": 4.1166,
"lr": 0.0006944055944055943,
"step": 2686,
"tokens_trained": 1.320252752
},
{
"epoch": 0.762499113538047,
"grad_norm": 8.842597007751465,
"loss": 4.0946,
"lr": 0.0006941258741258741,
"step": 2688,
"tokens_trained": 1.321239648
},
{
"epoch": 0.7630664491880008,
"grad_norm": 4.742710113525391,
"loss": 4.0894,
"lr": 0.0006938461538461539,
"step": 2690,
"tokens_trained": 1.322224872
},
{
"epoch": 0.7636337848379547,
"grad_norm": 2.7827649116516113,
"loss": 4.0453,
"lr": 0.0006935664335664336,
"step": 2692,
"tokens_trained": 1.323211432
},
{
"epoch": 0.7642011204879087,
"grad_norm": 8.263550758361816,
"loss": 4.0034,
"lr": 0.0006932867132867133,
"step": 2694,
"tokens_trained": 1.324190272
},
{
"epoch": 0.7647684561378626,
"grad_norm": 14.927130699157715,
"loss": 4.0243,
"lr": 0.000693006993006993,
"step": 2696,
"tokens_trained": 1.325175184
},
{
"epoch": 0.7653357917878165,
"grad_norm": 9.046390533447266,
"loss": 4.0646,
"lr": 0.0006927272727272728,
"step": 2698,
"tokens_trained": 1.326156856
},
{
"epoch": 0.7659031274377703,
"grad_norm": 7.640266418457031,
"loss": 4.0581,
"lr": 0.0006924475524475524,
"step": 2700,
"tokens_trained": 1.327134224
},
{
"epoch": 0.7664704630877243,
"grad_norm": 11.179667472839355,
"loss": 4.0286,
"lr": 0.0006921678321678322,
"step": 2702,
"tokens_trained": 1.328119376
},
{
"epoch": 0.7670377987376782,
"grad_norm": 13.961971282958984,
"loss": 4.072,
"lr": 0.0006918881118881118,
"step": 2704,
"tokens_trained": 1.329097248
},
{
"epoch": 0.7676051343876321,
"grad_norm": 5.873361110687256,
"loss": 4.1069,
"lr": 0.0006916083916083916,
"step": 2706,
"tokens_trained": 1.330079272
},
{
"epoch": 0.768172470037586,
"grad_norm": 5.7134623527526855,
"loss": 4.0483,
"lr": 0.0006913286713286714,
"step": 2708,
"tokens_trained": 1.331062968
},
{
"epoch": 0.7687398056875399,
"grad_norm": 8.088322639465332,
"loss": 4.0806,
"lr": 0.0006910489510489511,
"step": 2710,
"tokens_trained": 1.3320508
},
{
"epoch": 0.7693071413374938,
"grad_norm": 12.358318328857422,
"loss": 4.0281,
"lr": 0.0006907692307692308,
"step": 2712,
"tokens_trained": 1.333034392
},
{
"epoch": 0.7698744769874477,
"grad_norm": 6.448056221008301,
"loss": 4.0449,
"lr": 0.0006904895104895104,
"step": 2714,
"tokens_trained": 1.334018424
},
{
"epoch": 0.7704418126374016,
"grad_norm": 10.305964469909668,
"loss": 4.0611,
"lr": 0.0006902097902097903,
"step": 2716,
"tokens_trained": 1.33500044
},
{
"epoch": 0.7710091482873556,
"grad_norm": 8.82204532623291,
"loss": 4.0697,
"lr": 0.0006899300699300699,
"step": 2718,
"tokens_trained": 1.335985304
},
{
"epoch": 0.7715764839373094,
"grad_norm": 11.34217643737793,
"loss": 4.0471,
"lr": 0.0006896503496503497,
"step": 2720,
"tokens_trained": 1.336971752
},
{
"epoch": 0.7721438195872633,
"grad_norm": 9.843841552734375,
"loss": 4.1015,
"lr": 0.0006893706293706293,
"step": 2722,
"tokens_trained": 1.337955296
},
{
"epoch": 0.7727111552372172,
"grad_norm": 8.029809951782227,
"loss": 4.0432,
"lr": 0.0006890909090909091,
"step": 2724,
"tokens_trained": 1.338936912
},
{
"epoch": 0.7732784908871712,
"grad_norm": 8.858033180236816,
"loss": 4.0841,
"lr": 0.0006888111888111889,
"step": 2726,
"tokens_trained": 1.339920296
},
{
"epoch": 0.7738458265371251,
"grad_norm": 6.917725086212158,
"loss": 4.0701,
"lr": 0.0006885314685314685,
"step": 2728,
"tokens_trained": 1.340910088
},
{
"epoch": 0.7744131621870789,
"grad_norm": 9.695552825927734,
"loss": 4.0818,
"lr": 0.0006882517482517483,
"step": 2730,
"tokens_trained": 1.341895264
},
{
"epoch": 0.7749804978370328,
"grad_norm": 8.998181343078613,
"loss": 4.0734,
"lr": 0.0006879720279720279,
"step": 2732,
"tokens_trained": 1.342875544
},
{
"epoch": 0.7755478334869867,
"grad_norm": 7.250143527984619,
"loss": 4.0511,
"lr": 0.0006876923076923078,
"step": 2734,
"tokens_trained": 1.34386044
},
{
"epoch": 0.7761151691369407,
"grad_norm": 8.95149040222168,
"loss": 4.0671,
"lr": 0.0006874125874125874,
"step": 2736,
"tokens_trained": 1.344844568
},
{
"epoch": 0.7766825047868946,
"grad_norm": 9.469155311584473,
"loss": 4.0549,
"lr": 0.0006871328671328672,
"step": 2738,
"tokens_trained": 1.3458226
},
{
"epoch": 0.7772498404368484,
"grad_norm": 6.303086757659912,
"loss": 4.0808,
"lr": 0.0006868531468531468,
"step": 2740,
"tokens_trained": 1.346809256
},
{
"epoch": 0.7778171760868023,
"grad_norm": 6.282865524291992,
"loss": 4.0425,
"lr": 0.0006865734265734265,
"step": 2742,
"tokens_trained": 1.347790504
},
{
"epoch": 0.7783845117367563,
"grad_norm": 6.448110103607178,
"loss": 4.0512,
"lr": 0.0006862937062937064,
"step": 2744,
"tokens_trained": 1.348770416
},
{
"epoch": 0.7789518473867102,
"grad_norm": 3.967651128768921,
"loss": 4.0189,
"lr": 0.000686013986013986,
"step": 2746,
"tokens_trained": 1.34975288
},
{
"epoch": 0.7795191830366641,
"grad_norm": 4.253781318664551,
"loss": 4.0774,
"lr": 0.0006857342657342658,
"step": 2748,
"tokens_trained": 1.350729672
},
{
"epoch": 0.7800865186866179,
"grad_norm": 15.237231254577637,
"loss": 4.0929,
"lr": 0.0006854545454545454,
"step": 2750,
"tokens_trained": 1.351711184
},
{
"epoch": 0.7800865186866179,
"eval_loss": 1.0141865015029907,
"eval_runtime": 20.7754,
"step": 2750,
"tokens_trained": 1.351711184
},
{
"epoch": 0.7806538543365719,
"grad_norm": 14.367753028869629,
"loss": 4.0422,
"lr": 0.0006851748251748253,
"step": 2752,
"tokens_trained": 1.352694296
},
{
"epoch": 0.7812211899865258,
"grad_norm": 4.344571590423584,
"loss": 4.018,
"lr": 0.0006848951048951049,
"step": 2754,
"tokens_trained": 1.353678976
},
{
"epoch": 0.7817885256364797,
"grad_norm": 4.031637191772461,
"loss": 4.0568,
"lr": 0.0006846153846153846,
"step": 2756,
"tokens_trained": 1.354661624
},
{
"epoch": 0.7823558612864336,
"grad_norm": 11.08716106414795,
"loss": 4.0717,
"lr": 0.0006843356643356643,
"step": 2758,
"tokens_trained": 1.355644416
},
{
"epoch": 0.7829231969363875,
"grad_norm": 10.119296073913574,
"loss": 4.0726,
"lr": 0.000684055944055944,
"step": 2760,
"tokens_trained": 1.356625632
},
{
"epoch": 0.7834905325863414,
"grad_norm": 14.678930282592773,
"loss": 4.065,
"lr": 0.0006837762237762239,
"step": 2762,
"tokens_trained": 1.357605968
},
{
"epoch": 0.7840578682362953,
"grad_norm": 2.6932129859924316,
"loss": 4.0831,
"lr": 0.0006834965034965035,
"step": 2764,
"tokens_trained": 1.358590808
},
{
"epoch": 0.7846252038862492,
"grad_norm": 22.138845443725586,
"loss": 4.1011,
"lr": 0.0006832167832167833,
"step": 2766,
"tokens_trained": 1.359570928
},
{
"epoch": 0.7851925395362032,
"grad_norm": 17.627702713012695,
"loss": 4.1441,
"lr": 0.0006829370629370629,
"step": 2768,
"tokens_trained": 1.36055716
},
{
"epoch": 0.785759875186157,
"grad_norm": 9.9471435546875,
"loss": 4.122,
"lr": 0.0006826573426573427,
"step": 2770,
"tokens_trained": 1.361539352
},
{
"epoch": 0.7863272108361109,
"grad_norm": 11.452835083007812,
"loss": 4.0928,
"lr": 0.0006823776223776224,
"step": 2772,
"tokens_trained": 1.362519
},
{
"epoch": 0.7868945464860648,
"grad_norm": 15.566934585571289,
"loss": 4.0816,
"lr": 0.0006820979020979021,
"step": 2774,
"tokens_trained": 1.363505808
},
{
"epoch": 0.7874618821360188,
"grad_norm": 8.46238899230957,
"loss": 4.0924,
"lr": 0.0006818181818181818,
"step": 2776,
"tokens_trained": 1.364484496
},
{
"epoch": 0.7880292177859727,
"grad_norm": 4.6673688888549805,
"loss": 4.0732,
"lr": 0.0006815384615384615,
"step": 2778,
"tokens_trained": 1.365468696
},
{
"epoch": 0.7885965534359265,
"grad_norm": 10.422809600830078,
"loss": 4.0285,
"lr": 0.0006812587412587414,
"step": 2780,
"tokens_trained": 1.36645104
},
{
"epoch": 0.7891638890858804,
"grad_norm": 11.707451820373535,
"loss": 4.0645,
"lr": 0.000680979020979021,
"step": 2782,
"tokens_trained": 1.367433136
},
{
"epoch": 0.7897312247358343,
"grad_norm": 6.887526988983154,
"loss": 4.0591,
"lr": 0.0006806993006993007,
"step": 2784,
"tokens_trained": 1.368420024
},
{
"epoch": 0.7902985603857883,
"grad_norm": 7.914979457855225,
"loss": 4.0641,
"lr": 0.0006804195804195804,
"step": 2786,
"tokens_trained": 1.369401936
},
{
"epoch": 0.7908658960357422,
"grad_norm": 7.964488506317139,
"loss": 4.0462,
"lr": 0.0006801398601398602,
"step": 2788,
"tokens_trained": 1.370384896
},
{
"epoch": 0.791433231685696,
"grad_norm": 7.16652774810791,
"loss": 4.026,
"lr": 0.0006798601398601399,
"step": 2790,
"tokens_trained": 1.371365304
},
{
"epoch": 0.7920005673356499,
"grad_norm": 8.604512214660645,
"loss": 4.0407,
"lr": 0.0006795804195804196,
"step": 2792,
"tokens_trained": 1.372349584
},
{
"epoch": 0.7925679029856039,
"grad_norm": 6.616272449493408,
"loss": 4.0417,
"lr": 0.0006793006993006992,
"step": 2794,
"tokens_trained": 1.373330584
},
{
"epoch": 0.7931352386355578,
"grad_norm": 3.8474340438842773,
"loss": 4.0322,
"lr": 0.000679020979020979,
"step": 2796,
"tokens_trained": 1.374312888
},
{
"epoch": 0.7937025742855117,
"grad_norm": 11.628402709960938,
"loss": 4.0378,
"lr": 0.0006787412587412588,
"step": 2798,
"tokens_trained": 1.375294704
},
{
"epoch": 0.7942699099354655,
"grad_norm": 7.480481147766113,
"loss": 4.1031,
"lr": 0.0006784615384615385,
"step": 2800,
"tokens_trained": 1.376279072
},
{
"epoch": 0.7948372455854195,
"grad_norm": 6.449431896209717,
"loss": 4.0397,
"lr": 0.0006781818181818182,
"step": 2802,
"tokens_trained": 1.377265568
},
{
"epoch": 0.7954045812353734,
"grad_norm": 5.179644584655762,
"loss": 4.0826,
"lr": 0.0006779020979020979,
"step": 2804,
"tokens_trained": 1.378250776
},
{
"epoch": 0.7959719168853273,
"grad_norm": 8.918203353881836,
"loss": 4.0358,
"lr": 0.0006776223776223777,
"step": 2806,
"tokens_trained": 1.379235464
},
{
"epoch": 0.7965392525352812,
"grad_norm": 6.065394878387451,
"loss": 4.0754,
"lr": 0.0006773426573426574,
"step": 2808,
"tokens_trained": 1.380215248
},
{
"epoch": 0.797106588185235,
"grad_norm": 3.9142706394195557,
"loss": 4.0274,
"lr": 0.0006770629370629371,
"step": 2810,
"tokens_trained": 1.381197872
},
{
"epoch": 0.797673923835189,
"grad_norm": 12.86207103729248,
"loss": 4.0471,
"lr": 0.0006767832167832167,
"step": 2812,
"tokens_trained": 1.38218364
},
{
"epoch": 0.7982412594851429,
"grad_norm": 10.052533149719238,
"loss": 4.0628,
"lr": 0.0006765034965034965,
"step": 2814,
"tokens_trained": 1.383170176
},
{
"epoch": 0.7988085951350968,
"grad_norm": 5.910792827606201,
"loss": 4.0358,
"lr": 0.0006762237762237763,
"step": 2816,
"tokens_trained": 1.384154592
},
{
"epoch": 0.7993759307850508,
"grad_norm": 13.312492370605469,
"loss": 4.0694,
"lr": 0.000675944055944056,
"step": 2818,
"tokens_trained": 1.385138352
},
{
"epoch": 0.7999432664350046,
"grad_norm": 12.467507362365723,
"loss": 4.0705,
"lr": 0.0006756643356643357,
"step": 2820,
"tokens_trained": 1.386123232
},
{
"epoch": 0.8005106020849585,
"grad_norm": 4.8490824699401855,
"loss": 4.0387,
"lr": 0.0006753846153846153,
"step": 2822,
"tokens_trained": 1.387107008
},
{
"epoch": 0.8010779377349124,
"grad_norm": 13.596024513244629,
"loss": 4.0505,
"lr": 0.0006751048951048951,
"step": 2824,
"tokens_trained": 1.388091632
},
{
"epoch": 0.8016452733848664,
"grad_norm": 13.633816719055176,
"loss": 4.0894,
"lr": 0.0006748251748251748,
"step": 2826,
"tokens_trained": 1.389077456
},
{
"epoch": 0.8022126090348203,
"grad_norm": 4.448362827301025,
"loss": 4.0623,
"lr": 0.0006745454545454546,
"step": 2828,
"tokens_trained": 1.39006124
},
{
"epoch": 0.8027799446847741,
"grad_norm": 21.12818717956543,
"loss": 4.1275,
"lr": 0.0006742657342657342,
"step": 2830,
"tokens_trained": 1.391043016
},
{
"epoch": 0.803347280334728,
"grad_norm": 10.096168518066406,
"loss": 4.0858,
"lr": 0.000673986013986014,
"step": 2832,
"tokens_trained": 1.392026656
},
{
"epoch": 0.803914615984682,
"grad_norm": 4.614907264709473,
"loss": 4.0075,
"lr": 0.0006737062937062938,
"step": 2834,
"tokens_trained": 1.393006784
},
{
"epoch": 0.8044819516346359,
"grad_norm": 13.106852531433105,
"loss": 4.1113,
"lr": 0.0006734265734265734,
"step": 2836,
"tokens_trained": 1.393990424
},
{
"epoch": 0.8050492872845898,
"grad_norm": 4.287477493286133,
"loss": 4.0818,
"lr": 0.0006731468531468532,
"step": 2838,
"tokens_trained": 1.39497072
},
{
"epoch": 0.8056166229345436,
"grad_norm": 9.295431137084961,
"loss": 4.0652,
"lr": 0.0006728671328671328,
"step": 2840,
"tokens_trained": 1.395951488
},
{
"epoch": 0.8061839585844975,
"grad_norm": 12.001997947692871,
"loss": 4.1061,
"lr": 0.0006725874125874126,
"step": 2842,
"tokens_trained": 1.396933744
},
{
"epoch": 0.8067512942344515,
"grad_norm": 15.18830680847168,
"loss": 4.0483,
"lr": 0.0006723076923076923,
"step": 2844,
"tokens_trained": 1.397915696
},
{
"epoch": 0.8073186298844054,
"grad_norm": 9.936029434204102,
"loss": 4.0559,
"lr": 0.0006720279720279721,
"step": 2846,
"tokens_trained": 1.398900048
},
{
"epoch": 0.8078859655343593,
"grad_norm": 4.903693199157715,
"loss": 4.0474,
"lr": 0.0006717482517482517,
"step": 2848,
"tokens_trained": 1.399885336
},
{
"epoch": 0.8084533011843131,
"grad_norm": 6.753813743591309,
"loss": 4.0365,
"lr": 0.0006714685314685314,
"step": 2850,
"tokens_trained": 1.400867432
},
{
"epoch": 0.8090206368342671,
"grad_norm": 10.53545093536377,
"loss": 4.0697,
"lr": 0.0006711888111888113,
"step": 2852,
"tokens_trained": 1.401849552
},
{
"epoch": 0.809587972484221,
"grad_norm": 7.666012763977051,
"loss": 3.9955,
"lr": 0.0006709090909090909,
"step": 2854,
"tokens_trained": 1.402832496
},
{
"epoch": 0.8101553081341749,
"grad_norm": 11.65257740020752,
"loss": 4.0377,
"lr": 0.0006706293706293707,
"step": 2856,
"tokens_trained": 1.403816768
},
{
"epoch": 0.8107226437841288,
"grad_norm": 10.997775077819824,
"loss": 4.0145,
"lr": 0.0006703496503496503,
"step": 2858,
"tokens_trained": 1.404804968
},
{
"epoch": 0.8112899794340827,
"grad_norm": 3.699673652648926,
"loss": 4.1053,
"lr": 0.0006700699300699301,
"step": 2860,
"tokens_trained": 1.40578656
},
{
"epoch": 0.8118573150840366,
"grad_norm": 17.54732894897461,
"loss": 4.121,
"lr": 0.0006697902097902098,
"step": 2862,
"tokens_trained": 1.406773056
},
{
"epoch": 0.8124246507339905,
"grad_norm": 10.354470252990723,
"loss": 4.0353,
"lr": 0.0006695104895104895,
"step": 2864,
"tokens_trained": 1.407756592
},
{
"epoch": 0.8129919863839444,
"grad_norm": 7.760607719421387,
"loss": 4.0529,
"lr": 0.0006692307692307692,
"step": 2866,
"tokens_trained": 1.408742176
},
{
"epoch": 0.8135593220338984,
"grad_norm": 11.074470520019531,
"loss": 4.0223,
"lr": 0.0006689510489510489,
"step": 2868,
"tokens_trained": 1.409727856
},
{
"epoch": 0.8141266576838522,
"grad_norm": 12.221083641052246,
"loss": 4.0228,
"lr": 0.0006686713286713288,
"step": 2870,
"tokens_trained": 1.410712016
},
{
"epoch": 0.8146939933338061,
"grad_norm": 8.933589935302734,
"loss": 4.1234,
"lr": 0.0006683916083916084,
"step": 2872,
"tokens_trained": 1.411694496
},
{
"epoch": 0.81526132898376,
"grad_norm": 12.326020240783691,
"loss": 4.0772,
"lr": 0.0006681118881118882,
"step": 2874,
"tokens_trained": 1.412676992
},
{
"epoch": 0.8155449968087369,
"eval_loss": 1.015201449394226,
"eval_runtime": 20.3991,
"step": 2875,
"tokens_trained": 1.413169416
},
{
"epoch": 0.815828664633714,
"grad_norm": 8.320648193359375,
"loss": 4.0045,
"lr": 0.0006678321678321678,
"step": 2876,
"tokens_trained": 1.413657912
},
{
"epoch": 0.8163960002836679,
"grad_norm": 4.708253383636475,
"loss": 4.022,
"lr": 0.0006675524475524475,
"step": 2878,
"tokens_trained": 1.414641576
},
{
"epoch": 0.8169633359336217,
"grad_norm": 13.005586624145508,
"loss": 4.0305,
"lr": 0.0006672727272727273,
"step": 2880,
"tokens_trained": 1.415624992
},
{
"epoch": 0.8175306715835756,
"grad_norm": 8.445854187011719,
"loss": 4.0723,
"lr": 0.000666993006993007,
"step": 2882,
"tokens_trained": 1.416605936
},
{
"epoch": 0.8180980072335295,
"grad_norm": 5.153830528259277,
"loss": 4.0766,
"lr": 0.0006667132867132867,
"step": 2884,
"tokens_trained": 1.417593408
},
{
"epoch": 0.8186653428834835,
"grad_norm": 13.989762306213379,
"loss": 4.043,
"lr": 0.0006664335664335664,
"step": 2886,
"tokens_trained": 1.418577984
},
{
"epoch": 0.8192326785334374,
"grad_norm": 6.2893805503845215,
"loss": 4.0576,
"lr": 0.0006661538461538463,
"step": 2888,
"tokens_trained": 1.419557304
},
{
"epoch": 0.8198000141833912,
"grad_norm": 3.1825716495513916,
"loss": 4.0216,
"lr": 0.0006658741258741259,
"step": 2890,
"tokens_trained": 1.420538736
},
{
"epoch": 0.8203673498333451,
"grad_norm": 13.280265808105469,
"loss": 4.0665,
"lr": 0.0006655944055944056,
"step": 2892,
"tokens_trained": 1.421523048
},
{
"epoch": 0.8209346854832991,
"grad_norm": 8.963871955871582,
"loss": 4.0996,
"lr": 0.0006653146853146853,
"step": 2894,
"tokens_trained": 1.422504352
},
{
"epoch": 0.821502021133253,
"grad_norm": 9.463395118713379,
"loss": 4.0638,
"lr": 0.000665034965034965,
"step": 2896,
"tokens_trained": 1.423490256
},
{
"epoch": 0.8220693567832069,
"grad_norm": 10.848092079162598,
"loss": 4.0767,
"lr": 0.0006647552447552448,
"step": 2898,
"tokens_trained": 1.424473728
},
{
"epoch": 0.8226366924331607,
"grad_norm": 9.271900177001953,
"loss": 4.0675,
"lr": 0.0006644755244755245,
"step": 2900,
"tokens_trained": 1.425456216
},
{
"epoch": 0.8232040280831147,
"grad_norm": 8.910347938537598,
"loss": 4.031,
"lr": 0.0006641958041958042,
"step": 2902,
"tokens_trained": 1.426442408
},
{
"epoch": 0.8237713637330686,
"grad_norm": 6.92717981338501,
"loss": 4.1025,
"lr": 0.0006639160839160839,
"step": 2904,
"tokens_trained": 1.42742624
},
{
"epoch": 0.8243386993830225,
"grad_norm": 6.383159637451172,
"loss": 4.0057,
"lr": 0.0006636363636363638,
"step": 2906,
"tokens_trained": 1.428414912
},
{
"epoch": 0.8249060350329764,
"grad_norm": 5.782074451446533,
"loss": 4.0169,
"lr": 0.0006633566433566434,
"step": 2908,
"tokens_trained": 1.42939668
},
{
"epoch": 0.8254733706829303,
"grad_norm": 10.663660049438477,
"loss": 4.0504,
"lr": 0.0006630769230769231,
"step": 2910,
"tokens_trained": 1.430382648
},
{
"epoch": 0.8260407063328842,
"grad_norm": 11.806394577026367,
"loss": 4.065,
"lr": 0.0006627972027972028,
"step": 2912,
"tokens_trained": 1.43136304
},
{
"epoch": 0.8266080419828381,
"grad_norm": 5.7375617027282715,
"loss": 4.0133,
"lr": 0.0006625174825174825,
"step": 2914,
"tokens_trained": 1.432347472
},
{
"epoch": 0.827175377632792,
"grad_norm": 6.814542293548584,
"loss": 4.0656,
"lr": 0.0006622377622377623,
"step": 2916,
"tokens_trained": 1.433329632
},
{
"epoch": 0.827742713282746,
"grad_norm": 8.265726089477539,
"loss": 4.0206,
"lr": 0.000661958041958042,
"step": 2918,
"tokens_trained": 1.434312216
},
{
"epoch": 0.8283100489326998,
"grad_norm": 6.937063694000244,
"loss": 4.0372,
"lr": 0.0006616783216783216,
"step": 2920,
"tokens_trained": 1.435294504
},
{
"epoch": 0.8288773845826537,
"grad_norm": 6.773707866668701,
"loss": 4.0496,
"lr": 0.0006613986013986014,
"step": 2922,
"tokens_trained": 1.436276344
},
{
"epoch": 0.8294447202326076,
"grad_norm": 8.471631050109863,
"loss": 4.0834,
"lr": 0.0006611188811188812,
"step": 2924,
"tokens_trained": 1.43725852
},
{
"epoch": 0.8300120558825616,
"grad_norm": 10.602453231811523,
"loss": 4.0445,
"lr": 0.0006608391608391609,
"step": 2926,
"tokens_trained": 1.438239768
},
{
"epoch": 0.8305793915325155,
"grad_norm": 8.173192977905273,
"loss": 4.0423,
"lr": 0.0006605594405594406,
"step": 2928,
"tokens_trained": 1.43921892
},
{
"epoch": 0.8311467271824693,
"grad_norm": 9.510146141052246,
"loss": 4.0012,
"lr": 0.0006602797202797203,
"step": 2930,
"tokens_trained": 1.440203128
},
{
"epoch": 0.8317140628324232,
"grad_norm": 4.894539833068848,
"loss": 4.0574,
"lr": 0.00066,
"step": 2932,
"tokens_trained": 1.441187856
},
{
"epoch": 0.8322813984823771,
"grad_norm": 4.4945149421691895,
"loss": 4.0107,
"lr": 0.0006597202797202797,
"step": 2934,
"tokens_trained": 1.442164056
},
{
"epoch": 0.8328487341323311,
"grad_norm": 7.323387145996094,
"loss": 4.0779,
"lr": 0.0006594405594405595,
"step": 2936,
"tokens_trained": 1.44314688
},
{
"epoch": 0.833416069782285,
"grad_norm": 9.858680725097656,
"loss": 4.03,
"lr": 0.0006591608391608391,
"step": 2938,
"tokens_trained": 1.444127552
},
{
"epoch": 0.8339834054322388,
"grad_norm": 8.214831352233887,
"loss": 4.0591,
"lr": 0.0006588811188811189,
"step": 2940,
"tokens_trained": 1.445109336
},
{
"epoch": 0.8345507410821927,
"grad_norm": 6.628262996673584,
"loss": 4.0834,
"lr": 0.0006586013986013986,
"step": 2942,
"tokens_trained": 1.4460904
},
{
"epoch": 0.8351180767321467,
"grad_norm": 11.043391227722168,
"loss": 4.0516,
"lr": 0.0006583216783216784,
"step": 2944,
"tokens_trained": 1.447068776
},
{
"epoch": 0.8356854123821006,
"grad_norm": 8.013843536376953,
"loss": 4.0309,
"lr": 0.0006580419580419581,
"step": 2946,
"tokens_trained": 1.448046952
},
{
"epoch": 0.8362527480320545,
"grad_norm": 4.856717586517334,
"loss": 4.0547,
"lr": 0.0006577622377622377,
"step": 2948,
"tokens_trained": 1.449033752
},
{
"epoch": 0.8368200836820083,
"grad_norm": 4.799930572509766,
"loss": 4.0044,
"lr": 0.0006574825174825175,
"step": 2950,
"tokens_trained": 1.450019912
},
{
"epoch": 0.8373874193319623,
"grad_norm": 8.492339134216309,
"loss": 4.0368,
"lr": 0.0006572027972027972,
"step": 2952,
"tokens_trained": 1.451002976
},
{
"epoch": 0.8379547549819162,
"grad_norm": 7.098823547363281,
"loss": 3.9807,
"lr": 0.000656923076923077,
"step": 2954,
"tokens_trained": 1.45198412
},
{
"epoch": 0.8385220906318701,
"grad_norm": 8.705301284790039,
"loss": 4.0749,
"lr": 0.0006566433566433566,
"step": 2956,
"tokens_trained": 1.452963832
},
{
"epoch": 0.839089426281824,
"grad_norm": 2.8292014598846436,
"loss": 4.0241,
"lr": 0.0006563636363636364,
"step": 2958,
"tokens_trained": 1.453947688
},
{
"epoch": 0.8396567619317779,
"grad_norm": 3.7414586544036865,
"loss": 4.0554,
"lr": 0.0006560839160839161,
"step": 2960,
"tokens_trained": 1.45492676
},
{
"epoch": 0.8402240975817318,
"grad_norm": 11.956228256225586,
"loss": 4.0343,
"lr": 0.0006558041958041958,
"step": 2962,
"tokens_trained": 1.455907464
},
{
"epoch": 0.8407914332316857,
"grad_norm": 11.086222648620605,
"loss": 4.0324,
"lr": 0.0006555244755244756,
"step": 2964,
"tokens_trained": 1.456891688
},
{
"epoch": 0.8413587688816396,
"grad_norm": 8.380780220031738,
"loss": 4.0335,
"lr": 0.0006552447552447552,
"step": 2966,
"tokens_trained": 1.457880016
},
{
"epoch": 0.8419261045315936,
"grad_norm": 8.568910598754883,
"loss": 4.0431,
"lr": 0.000654965034965035,
"step": 2968,
"tokens_trained": 1.458866944
},
{
"epoch": 0.8424934401815474,
"grad_norm": 10.840734481811523,
"loss": 4.0275,
"lr": 0.0006546853146853147,
"step": 2970,
"tokens_trained": 1.459849096
},
{
"epoch": 0.8430607758315013,
"grad_norm": 5.364732265472412,
"loss": 4.0464,
"lr": 0.0006544055944055945,
"step": 2972,
"tokens_trained": 1.460833976
},
{
"epoch": 0.8436281114814552,
"grad_norm": 8.918869018554688,
"loss": 4.0501,
"lr": 0.0006541258741258741,
"step": 2974,
"tokens_trained": 1.461811472
},
{
"epoch": 0.8441954471314091,
"grad_norm": 10.94211483001709,
"loss": 4.0284,
"lr": 0.0006538461538461538,
"step": 2976,
"tokens_trained": 1.462798528
},
{
"epoch": 0.8447627827813631,
"grad_norm": 14.475136756896973,
"loss": 4.0597,
"lr": 0.0006535664335664336,
"step": 2978,
"tokens_trained": 1.46378116
},
{
"epoch": 0.8453301184313169,
"grad_norm": 8.219613075256348,
"loss": 4.0499,
"lr": 0.0006532867132867133,
"step": 2980,
"tokens_trained": 1.464758752
},
{
"epoch": 0.8458974540812708,
"grad_norm": 8.898524284362793,
"loss": 4.0472,
"lr": 0.0006530069930069931,
"step": 2982,
"tokens_trained": 1.465737992
},
{
"epoch": 0.8464647897312247,
"grad_norm": 6.673952579498291,
"loss": 3.9971,
"lr": 0.0006527272727272727,
"step": 2984,
"tokens_trained": 1.466724672
},
{
"epoch": 0.8470321253811787,
"grad_norm": 6.514251708984375,
"loss": 4.0245,
"lr": 0.0006524475524475524,
"step": 2986,
"tokens_trained": 1.46770572
},
{
"epoch": 0.8475994610311326,
"grad_norm": 8.130202293395996,
"loss": 4.0332,
"lr": 0.0006521678321678322,
"step": 2988,
"tokens_trained": 1.468690624
},
{
"epoch": 0.8481667966810864,
"grad_norm": 4.283686637878418,
"loss": 4.0551,
"lr": 0.0006518881118881119,
"step": 2990,
"tokens_trained": 1.469674696
},
{
"epoch": 0.8487341323310403,
"grad_norm": 4.8144426345825195,
"loss": 4.0408,
"lr": 0.0006516083916083916,
"step": 2992,
"tokens_trained": 1.470659816
},
{
"epoch": 0.8493014679809943,
"grad_norm": 11.117393493652344,
"loss": 4.0423,
"lr": 0.0006513286713286713,
"step": 2994,
"tokens_trained": 1.47164192
},
{
"epoch": 0.8498688036309482,
"grad_norm": 8.022162437438965,
"loss": 4.064,
"lr": 0.0006510489510489511,
"step": 2996,
"tokens_trained": 1.472624344
},
{
"epoch": 0.8504361392809021,
"grad_norm": 5.267605304718018,
"loss": 3.9804,
"lr": 0.0006507692307692308,
"step": 2998,
"tokens_trained": 1.473606552
},
{
"epoch": 0.8510034749308559,
"grad_norm": 9.365017890930176,
"loss": 4.0223,
"lr": 0.0006504895104895106,
"step": 3000,
"tokens_trained": 1.474586552
},
{
"epoch": 0.8510034749308559,
"eval_loss": 1.0078805685043335,
"eval_runtime": 20.7752,
"step": 3000,
"tokens_trained": 1.474586552
},
{
"epoch": 0.8515708105808099,
"grad_norm": 10.311480522155762,
"loss": 3.969,
"lr": 0.0006502097902097902,
"step": 3002,
"tokens_trained": 1.475564304
},
{
"epoch": 0.8521381462307638,
"grad_norm": 5.622078895568848,
"loss": 3.9803,
"lr": 0.0006499300699300699,
"step": 3004,
"tokens_trained": 1.476547088
},
{
"epoch": 0.8527054818807177,
"grad_norm": 6.005502223968506,
"loss": 4.0584,
"lr": 0.0006496503496503497,
"step": 3006,
"tokens_trained": 1.477531352
},
{
"epoch": 0.8532728175306716,
"grad_norm": 5.769370079040527,
"loss": 4.0332,
"lr": 0.0006493706293706294,
"step": 3008,
"tokens_trained": 1.478512136
},
{
"epoch": 0.8538401531806254,
"grad_norm": 4.246579647064209,
"loss": 3.9848,
"lr": 0.0006490909090909091,
"step": 3010,
"tokens_trained": 1.47949464
},
{
"epoch": 0.8544074888305794,
"grad_norm": 3.3972086906433105,
"loss": 3.9969,
"lr": 0.0006488111888111888,
"step": 3012,
"tokens_trained": 1.4804812
},
{
"epoch": 0.8549748244805333,
"grad_norm": 4.793631553649902,
"loss": 3.9748,
"lr": 0.0006485314685314685,
"step": 3014,
"tokens_trained": 1.481469176
},
{
"epoch": 0.8555421601304872,
"grad_norm": 7.709076881408691,
"loss": 4.0399,
"lr": 0.0006482517482517483,
"step": 3016,
"tokens_trained": 1.482450232
},
{
"epoch": 0.8561094957804412,
"grad_norm": 9.06294059753418,
"loss": 4.0279,
"lr": 0.000647972027972028,
"step": 3018,
"tokens_trained": 1.48343416
},
{
"epoch": 0.856676831430395,
"grad_norm": 7.496627330780029,
"loss": 4.047,
"lr": 0.0006476923076923077,
"step": 3020,
"tokens_trained": 1.484423072
},
{
"epoch": 0.8572441670803489,
"grad_norm": 6.635293006896973,
"loss": 4.0583,
"lr": 0.0006474125874125874,
"step": 3022,
"tokens_trained": 1.485406296
},
{
"epoch": 0.8578115027303028,
"grad_norm": 6.3066864013671875,
"loss": 3.9902,
"lr": 0.0006471328671328672,
"step": 3024,
"tokens_trained": 1.486391472
},
{
"epoch": 0.8583788383802567,
"grad_norm": 1.1249172687530518,
"loss": 4.0032,
"lr": 0.0006468531468531469,
"step": 3026,
"tokens_trained": 1.487377128
},
{
"epoch": 0.8589461740302107,
"grad_norm": 2.966470241546631,
"loss": 3.9859,
"lr": 0.0006465734265734265,
"step": 3028,
"tokens_trained": 1.488359656
},
{
"epoch": 0.8595135096801645,
"grad_norm": 6.611581325531006,
"loss": 4.0259,
"lr": 0.0006462937062937063,
"step": 3030,
"tokens_trained": 1.489340552
},
{
"epoch": 0.8600808453301184,
"grad_norm": 7.76756477355957,
"loss": 4.0223,
"lr": 0.0006460139860139859,
"step": 3032,
"tokens_trained": 1.49032648
},
{
"epoch": 0.8606481809800723,
"grad_norm": 10.86517333984375,
"loss": 4.0457,
"lr": 0.0006457342657342658,
"step": 3034,
"tokens_trained": 1.491312608
},
{
"epoch": 0.8612155166300263,
"grad_norm": 4.524630546569824,
"loss": 4.0882,
"lr": 0.0006454545454545455,
"step": 3036,
"tokens_trained": 1.49229724
},
{
"epoch": 0.8617828522799802,
"grad_norm": 10.601529121398926,
"loss": 4.0466,
"lr": 0.0006451748251748252,
"step": 3038,
"tokens_trained": 1.49327952
},
{
"epoch": 0.862350187929934,
"grad_norm": 10.691457748413086,
"loss": 4.0239,
"lr": 0.0006448951048951049,
"step": 3040,
"tokens_trained": 1.494263528
},
{
"epoch": 0.8629175235798879,
"grad_norm": 5.371310710906982,
"loss": 4.0864,
"lr": 0.0006446153846153846,
"step": 3042,
"tokens_trained": 1.49524708
},
{
"epoch": 0.8634848592298419,
"grad_norm": 5.7418999671936035,
"loss": 4.0618,
"lr": 0.0006443356643356644,
"step": 3044,
"tokens_trained": 1.496229136
},
{
"epoch": 0.8640521948797958,
"grad_norm": 7.521689414978027,
"loss": 4.0235,
"lr": 0.000644055944055944,
"step": 3046,
"tokens_trained": 1.497212944
},
{
"epoch": 0.8646195305297497,
"grad_norm": 6.966773509979248,
"loss": 4.0187,
"lr": 0.0006437762237762238,
"step": 3048,
"tokens_trained": 1.498198992
},
{
"epoch": 0.8651868661797035,
"grad_norm": 12.514280319213867,
"loss": 4.0306,
"lr": 0.0006434965034965034,
"step": 3050,
"tokens_trained": 1.499181312
},
{
"epoch": 0.8657542018296575,
"grad_norm": 4.849910736083984,
"loss": 4.033,
"lr": 0.0006432167832167833,
"step": 3052,
"tokens_trained": 1.500163288
},
{
"epoch": 0.8663215374796114,
"grad_norm": 9.553950309753418,
"loss": 4.0465,
"lr": 0.000642937062937063,
"step": 3054,
"tokens_trained": 1.501147464
},
{
"epoch": 0.8668888731295653,
"grad_norm": 8.58786678314209,
"loss": 4.0584,
"lr": 0.0006426573426573426,
"step": 3056,
"tokens_trained": 1.50212956
},
{
"epoch": 0.8674562087795192,
"grad_norm": 11.174147605895996,
"loss": 4.0152,
"lr": 0.0006423776223776224,
"step": 3058,
"tokens_trained": 1.503112168
},
{
"epoch": 0.868023544429473,
"grad_norm": 1.879528522491455,
"loss": 3.999,
"lr": 0.0006420979020979021,
"step": 3060,
"tokens_trained": 1.504099584
},
{
"epoch": 0.868590880079427,
"grad_norm": 19.370494842529297,
"loss": 4.1039,
"lr": 0.0006418181818181819,
"step": 3062,
"tokens_trained": 1.50508356
},
{
"epoch": 0.8691582157293809,
"grad_norm": 10.598268508911133,
"loss": 4.0542,
"lr": 0.0006415384615384615,
"step": 3064,
"tokens_trained": 1.506063304
},
{
"epoch": 0.8697255513793348,
"grad_norm": 8.537477493286133,
"loss": 4.0529,
"lr": 0.0006412587412587413,
"step": 3066,
"tokens_trained": 1.507046368
},
{
"epoch": 0.8702928870292888,
"grad_norm": 8.395747184753418,
"loss": 3.9941,
"lr": 0.0006409790209790209,
"step": 3068,
"tokens_trained": 1.508029128
},
{
"epoch": 0.8708602226792426,
"grad_norm": 5.918806552886963,
"loss": 4.0078,
"lr": 0.0006406993006993007,
"step": 3070,
"tokens_trained": 1.5090132
},
{
"epoch": 0.8714275583291965,
"grad_norm": 3.845099925994873,
"loss": 4.0564,
"lr": 0.0006404195804195805,
"step": 3072,
"tokens_trained": 1.509994832
},
{
"epoch": 0.8719948939791504,
"grad_norm": 3.3807923793792725,
"loss": 4.0438,
"lr": 0.0006401398601398601,
"step": 3074,
"tokens_trained": 1.510975552
},
{
"epoch": 0.8725622296291043,
"grad_norm": 4.468081951141357,
"loss": 4.066,
"lr": 0.0006398601398601399,
"step": 3076,
"tokens_trained": 1.511959576
},
{
"epoch": 0.8731295652790583,
"grad_norm": 1.8455613851547241,
"loss": 4.0247,
"lr": 0.0006395804195804196,
"step": 3078,
"tokens_trained": 1.512939112
},
{
"epoch": 0.8736969009290121,
"grad_norm": 7.184399127960205,
"loss": 4.081,
"lr": 0.0006393006993006994,
"step": 3080,
"tokens_trained": 1.513924792
},
{
"epoch": 0.874264236578966,
"grad_norm": 8.416154861450195,
"loss": 4.0372,
"lr": 0.000639020979020979,
"step": 3082,
"tokens_trained": 1.514905096
},
{
"epoch": 0.8748315722289199,
"grad_norm": 6.620309829711914,
"loss": 4.0822,
"lr": 0.0006387412587412587,
"step": 3084,
"tokens_trained": 1.51588724
},
{
"epoch": 0.8753989078788739,
"grad_norm": 7.424724102020264,
"loss": 4.053,
"lr": 0.0006384615384615384,
"step": 3086,
"tokens_trained": 1.516871792
},
{
"epoch": 0.8759662435288278,
"grad_norm": 7.8764448165893555,
"loss": 4.059,
"lr": 0.0006381818181818182,
"step": 3088,
"tokens_trained": 1.517857872
},
{
"epoch": 0.8765335791787816,
"grad_norm": 7.330927848815918,
"loss": 4.0182,
"lr": 0.000637902097902098,
"step": 3090,
"tokens_trained": 1.518840616
},
{
"epoch": 0.8771009148287355,
"grad_norm": 8.612639427185059,
"loss": 4.0181,
"lr": 0.0006376223776223776,
"step": 3092,
"tokens_trained": 1.519826616
},
{
"epoch": 0.8776682504786895,
"grad_norm": 9.889811515808105,
"loss": 4.0434,
"lr": 0.0006373426573426574,
"step": 3094,
"tokens_trained": 1.520805784
},
{
"epoch": 0.8782355861286434,
"grad_norm": 5.421345233917236,
"loss": 4.0237,
"lr": 0.0006370629370629371,
"step": 3096,
"tokens_trained": 1.521789344
},
{
"epoch": 0.8788029217785973,
"grad_norm": 4.9160990715026855,
"loss": 4.0497,
"lr": 0.0006367832167832168,
"step": 3098,
"tokens_trained": 1.522772664
},
{
"epoch": 0.8793702574285511,
"grad_norm": 8.828028678894043,
"loss": 4.0381,
"lr": 0.0006365034965034965,
"step": 3100,
"tokens_trained": 1.523755712
},
{
"epoch": 0.879937593078505,
"grad_norm": 5.6704182624816895,
"loss": 4.0017,
"lr": 0.0006362237762237762,
"step": 3102,
"tokens_trained": 1.52473876
},
{
"epoch": 0.880504928728459,
"grad_norm": 4.982235908508301,
"loss": 3.9826,
"lr": 0.0006359440559440559,
"step": 3104,
"tokens_trained": 1.52571756
},
{
"epoch": 0.8810722643784129,
"grad_norm": 8.639644622802734,
"loss": 4.0177,
"lr": 0.0006356643356643357,
"step": 3106,
"tokens_trained": 1.526695632
},
{
"epoch": 0.8816396000283668,
"grad_norm": 6.1896820068359375,
"loss": 4.0248,
"lr": 0.0006353846153846155,
"step": 3108,
"tokens_trained": 1.527678296
},
{
"epoch": 0.8822069356783206,
"grad_norm": 3.787477731704712,
"loss": 4.0489,
"lr": 0.0006351048951048951,
"step": 3110,
"tokens_trained": 1.528665456
},
{
"epoch": 0.8827742713282746,
"grad_norm": 4.418561935424805,
"loss": 4.0422,
"lr": 0.0006348251748251748,
"step": 3112,
"tokens_trained": 1.529648584
},
{
"epoch": 0.8833416069782285,
"grad_norm": 8.951369285583496,
"loss": 4.028,
"lr": 0.0006345454545454546,
"step": 3114,
"tokens_trained": 1.530628808
},
{
"epoch": 0.8839089426281824,
"grad_norm": 4.903277397155762,
"loss": 4.0772,
"lr": 0.0006342657342657343,
"step": 3116,
"tokens_trained": 1.531612144
},
{
"epoch": 0.8844762782781364,
"grad_norm": 4.366726875305176,
"loss": 3.9975,
"lr": 0.000633986013986014,
"step": 3118,
"tokens_trained": 1.532595304
},
{
"epoch": 0.8850436139280902,
"grad_norm": 6.9316911697387695,
"loss": 4.0019,
"lr": 0.0006337062937062937,
"step": 3120,
"tokens_trained": 1.533578888
},
{
"epoch": 0.8856109495780441,
"grad_norm": 8.896012306213379,
"loss": 4.04,
"lr": 0.0006334265734265733,
"step": 3122,
"tokens_trained": 1.534557552
},
{
"epoch": 0.886178285227998,
"grad_norm": 5.350147724151611,
"loss": 4.0229,
"lr": 0.0006331468531468532,
"step": 3124,
"tokens_trained": 1.535539672
},
{
"epoch": 0.8864619530529749,
"eval_loss": 1.007444143295288,
"eval_runtime": 20.5976,
"step": 3125,
"tokens_trained": 1.53603052
},
{
"epoch": 0.886745620877952,
"grad_norm": 5.331796646118164,
"loss": 4.0331,
"lr": 0.0006328671328671329,
"step": 3126,
"tokens_trained": 1.536525432
},
{
"epoch": 0.8873129565279059,
"grad_norm": 11.335051536560059,
"loss": 4.041,
"lr": 0.0006325874125874126,
"step": 3128,
"tokens_trained": 1.537508928
},
{
"epoch": 0.8878802921778597,
"grad_norm": 8.185080528259277,
"loss": 4.0299,
"lr": 0.0006323076923076923,
"step": 3130,
"tokens_trained": 1.53848672
},
{
"epoch": 0.8884476278278136,
"grad_norm": 4.136550426483154,
"loss": 4.0268,
"lr": 0.0006320279720279721,
"step": 3132,
"tokens_trained": 1.5394682
},
{
"epoch": 0.8890149634777675,
"grad_norm": 4.993428707122803,
"loss": 3.9808,
"lr": 0.0006317482517482518,
"step": 3134,
"tokens_trained": 1.540449416
},
{
"epoch": 0.8895822991277215,
"grad_norm": 5.485887050628662,
"loss": 4.0201,
"lr": 0.0006314685314685314,
"step": 3136,
"tokens_trained": 1.541436136
},
{
"epoch": 0.8901496347776754,
"grad_norm": 4.517815589904785,
"loss": 3.9985,
"lr": 0.0006311888111888112,
"step": 3138,
"tokens_trained": 1.542421992
},
{
"epoch": 0.8907169704276292,
"grad_norm": 3.8219170570373535,
"loss": 4.0299,
"lr": 0.0006309090909090908,
"step": 3140,
"tokens_trained": 1.543399648
},
{
"epoch": 0.8912843060775831,
"grad_norm": 7.318249702453613,
"loss": 4.0377,
"lr": 0.0006306293706293707,
"step": 3142,
"tokens_trained": 1.54438384
},
{
"epoch": 0.8918516417275371,
"grad_norm": 9.09650707244873,
"loss": 4.0572,
"lr": 0.0006303496503496504,
"step": 3144,
"tokens_trained": 1.545367632
},
{
"epoch": 0.892418977377491,
"grad_norm": 6.241589069366455,
"loss": 4.025,
"lr": 0.0006300699300699301,
"step": 3146,
"tokens_trained": 1.546355136
},
{
"epoch": 0.8929863130274449,
"grad_norm": 6.9915385246276855,
"loss": 4.0177,
"lr": 0.0006297902097902098,
"step": 3148,
"tokens_trained": 1.547340304
},
{
"epoch": 0.8935536486773987,
"grad_norm": 5.599451541900635,
"loss": 3.9892,
"lr": 0.0006295104895104896,
"step": 3150,
"tokens_trained": 1.54832164
},
{
"epoch": 0.8941209843273527,
"grad_norm": 7.765986442565918,
"loss": 4.0232,
"lr": 0.0006292307692307693,
"step": 3152,
"tokens_trained": 1.54930228
},
{
"epoch": 0.8946883199773066,
"grad_norm": 10.365357398986816,
"loss": 4.0254,
"lr": 0.0006289510489510489,
"step": 3154,
"tokens_trained": 1.550282888
},
{
"epoch": 0.8952556556272605,
"grad_norm": 7.8539276123046875,
"loss": 4.008,
"lr": 0.0006286713286713287,
"step": 3156,
"tokens_trained": 1.551265008
},
{
"epoch": 0.8958229912772144,
"grad_norm": 8.106318473815918,
"loss": 4.0351,
"lr": 0.0006283916083916083,
"step": 3158,
"tokens_trained": 1.552245928
},
{
"epoch": 0.8963903269271682,
"grad_norm": 10.22494125366211,
"loss": 3.9873,
"lr": 0.0006281118881118882,
"step": 3160,
"tokens_trained": 1.553227848
},
{
"epoch": 0.8969576625771222,
"grad_norm": 2.8810367584228516,
"loss": 4.0399,
"lr": 0.0006278321678321679,
"step": 3162,
"tokens_trained": 1.554208112
},
{
"epoch": 0.8975249982270761,
"grad_norm": 10.036259651184082,
"loss": 4.0072,
"lr": 0.0006275524475524475,
"step": 3164,
"tokens_trained": 1.555186496
},
{
"epoch": 0.89809233387703,
"grad_norm": 6.596704006195068,
"loss": 4.0306,
"lr": 0.0006272727272727273,
"step": 3166,
"tokens_trained": 1.556170896
},
{
"epoch": 0.898659669526984,
"grad_norm": 4.411632537841797,
"loss": 4.035,
"lr": 0.000626993006993007,
"step": 3168,
"tokens_trained": 1.55715312
},
{
"epoch": 0.8992270051769378,
"grad_norm": 4.391601085662842,
"loss": 3.9973,
"lr": 0.0006267132867132868,
"step": 3170,
"tokens_trained": 1.558133552
},
{
"epoch": 0.8997943408268917,
"grad_norm": 9.456700325012207,
"loss": 4.0255,
"lr": 0.0006264335664335664,
"step": 3172,
"tokens_trained": 1.559115752
},
{
"epoch": 0.9003616764768456,
"grad_norm": 8.490089416503906,
"loss": 4.0368,
"lr": 0.0006261538461538462,
"step": 3174,
"tokens_trained": 1.560095384
},
{
"epoch": 0.9009290121267995,
"grad_norm": 7.3357744216918945,
"loss": 4.0528,
"lr": 0.0006258741258741258,
"step": 3176,
"tokens_trained": 1.561078856
},
{
"epoch": 0.9014963477767535,
"grad_norm": 6.7389092445373535,
"loss": 4.0457,
"lr": 0.0006255944055944057,
"step": 3178,
"tokens_trained": 1.562063936
},
{
"epoch": 0.9020636834267073,
"grad_norm": 7.586348056793213,
"loss": 4.0516,
"lr": 0.0006253146853146854,
"step": 3180,
"tokens_trained": 1.5630424
},
{
"epoch": 0.9026310190766612,
"grad_norm": 5.646294116973877,
"loss": 4.0048,
"lr": 0.000625034965034965,
"step": 3182,
"tokens_trained": 1.564028064
},
{
"epoch": 0.9031983547266151,
"grad_norm": 7.30889368057251,
"loss": 3.9952,
"lr": 0.0006247552447552448,
"step": 3184,
"tokens_trained": 1.565010296
},
{
"epoch": 0.9037656903765691,
"grad_norm": 6.234517574310303,
"loss": 4.0267,
"lr": 0.0006244755244755245,
"step": 3186,
"tokens_trained": 1.565993536
},
{
"epoch": 0.904333026026523,
"grad_norm": 4.630068302154541,
"loss": 4.0638,
"lr": 0.0006241958041958043,
"step": 3188,
"tokens_trained": 1.566973648
},
{
"epoch": 0.9049003616764768,
"grad_norm": 10.530085563659668,
"loss": 4.056,
"lr": 0.0006239160839160839,
"step": 3190,
"tokens_trained": 1.567954192
},
{
"epoch": 0.9054676973264307,
"grad_norm": 6.909562110900879,
"loss": 4.0297,
"lr": 0.0006236363636363636,
"step": 3192,
"tokens_trained": 1.568941888
},
{
"epoch": 0.9060350329763847,
"grad_norm": 3.382798910140991,
"loss": 3.9554,
"lr": 0.0006233566433566433,
"step": 3194,
"tokens_trained": 1.569926344
},
{
"epoch": 0.9066023686263386,
"grad_norm": 6.318317890167236,
"loss": 4.0313,
"lr": 0.0006230769230769231,
"step": 3196,
"tokens_trained": 1.570909072
},
{
"epoch": 0.9071697042762925,
"grad_norm": 8.904982566833496,
"loss": 4.0422,
"lr": 0.0006227972027972028,
"step": 3198,
"tokens_trained": 1.571891864
},
{
"epoch": 0.9077370399262463,
"grad_norm": 4.008038520812988,
"loss": 4.0254,
"lr": 0.0006225174825174825,
"step": 3200,
"tokens_trained": 1.572877488
},
{
"epoch": 0.9083043755762003,
"grad_norm": 4.28498649597168,
"loss": 3.9916,
"lr": 0.0006222377622377623,
"step": 3202,
"tokens_trained": 1.57385788
},
{
"epoch": 0.9088717112261542,
"grad_norm": 7.385266304016113,
"loss": 3.9841,
"lr": 0.000621958041958042,
"step": 3204,
"tokens_trained": 1.574841232
},
{
"epoch": 0.9094390468761081,
"grad_norm": 6.1430134773254395,
"loss": 3.9886,
"lr": 0.0006216783216783217,
"step": 3206,
"tokens_trained": 1.5758212
},
{
"epoch": 0.910006382526062,
"grad_norm": 4.640578746795654,
"loss": 4.036,
"lr": 0.0006213986013986014,
"step": 3208,
"tokens_trained": 1.576803856
},
{
"epoch": 0.9105737181760158,
"grad_norm": 2.6749765872955322,
"loss": 3.9934,
"lr": 0.0006211188811188811,
"step": 3210,
"tokens_trained": 1.577788136
},
{
"epoch": 0.9111410538259698,
"grad_norm": 2.5117337703704834,
"loss": 3.9924,
"lr": 0.0006208391608391608,
"step": 3212,
"tokens_trained": 1.5787728
},
{
"epoch": 0.9117083894759237,
"grad_norm": 9.552038192749023,
"loss": 4.0141,
"lr": 0.0006205594405594406,
"step": 3214,
"tokens_trained": 1.579757576
},
{
"epoch": 0.9122757251258776,
"grad_norm": 4.317904949188232,
"loss": 4.0242,
"lr": 0.0006202797202797203,
"step": 3216,
"tokens_trained": 1.580737776
},
{
"epoch": 0.9128430607758315,
"grad_norm": 4.847869873046875,
"loss": 4.0037,
"lr": 0.00062,
"step": 3218,
"tokens_trained": 1.58172144
},
{
"epoch": 0.9134103964257854,
"grad_norm": 8.135149002075195,
"loss": 4.056,
"lr": 0.0006197202797202797,
"step": 3220,
"tokens_trained": 1.58270064
},
{
"epoch": 0.9139777320757393,
"grad_norm": 4.46032190322876,
"loss": 4.0037,
"lr": 0.0006194405594405595,
"step": 3222,
"tokens_trained": 1.58368244
},
{
"epoch": 0.9145450677256932,
"grad_norm": 4.710826873779297,
"loss": 4.0083,
"lr": 0.0006191608391608392,
"step": 3224,
"tokens_trained": 1.584669984
},
{
"epoch": 0.9151124033756471,
"grad_norm": 6.524029731750488,
"loss": 4.0394,
"lr": 0.0006188811188811189,
"step": 3226,
"tokens_trained": 1.585651952
},
{
"epoch": 0.9156797390256011,
"grad_norm": 8.807348251342773,
"loss": 4.0215,
"lr": 0.0006186013986013986,
"step": 3228,
"tokens_trained": 1.586634416
},
{
"epoch": 0.9162470746755549,
"grad_norm": 8.313971519470215,
"loss": 4.048,
"lr": 0.0006183216783216783,
"step": 3230,
"tokens_trained": 1.587616352
},
{
"epoch": 0.9168144103255088,
"grad_norm": 7.2862868309021,
"loss": 4.0326,
"lr": 0.0006180419580419581,
"step": 3232,
"tokens_trained": 1.588597696
},
{
"epoch": 0.9173817459754627,
"grad_norm": 6.1933746337890625,
"loss": 4.0232,
"lr": 0.0006177622377622377,
"step": 3234,
"tokens_trained": 1.589579384
},
{
"epoch": 0.9179490816254167,
"grad_norm": 6.848970890045166,
"loss": 4.0134,
"lr": 0.0006174825174825175,
"step": 3236,
"tokens_trained": 1.590563936
},
{
"epoch": 0.9185164172753706,
"grad_norm": 6.213261604309082,
"loss": 3.9622,
"lr": 0.0006172027972027972,
"step": 3238,
"tokens_trained": 1.591546488
},
{
"epoch": 0.9190837529253244,
"grad_norm": 11.642724990844727,
"loss": 4.0487,
"lr": 0.000616923076923077,
"step": 3240,
"tokens_trained": 1.592528992
},
{
"epoch": 0.9196510885752783,
"grad_norm": 2.465311288833618,
"loss": 3.9996,
"lr": 0.0006166433566433567,
"step": 3242,
"tokens_trained": 1.593514088
},
{
"epoch": 0.9202184242252323,
"grad_norm": 14.788623809814453,
"loss": 4.1041,
"lr": 0.0006163636363636364,
"step": 3244,
"tokens_trained": 1.594498768
},
{
"epoch": 0.9207857598751862,
"grad_norm": 11.614027976989746,
"loss": 3.99,
"lr": 0.0006160839160839161,
"step": 3246,
"tokens_trained": 1.595477496
},
{
"epoch": 0.9213530955251401,
"grad_norm": 8.917405128479004,
"loss": 4.0626,
"lr": 0.0006158041958041957,
"step": 3248,
"tokens_trained": 1.596459208
},
{
"epoch": 0.9219204311750939,
"grad_norm": 9.843046188354492,
"loss": 4.0256,
"lr": 0.0006155244755244756,
"step": 3250,
"tokens_trained": 1.59744676
},
{
"epoch": 0.9219204311750939,
"eval_loss": 1.0055779218673706,
"eval_runtime": 20.5405,
"step": 3250,
"tokens_trained": 1.59744676
},
{
"epoch": 0.9224877668250479,
"grad_norm": 5.153568267822266,
"loss": 3.9596,
"lr": 0.0006152447552447552,
"step": 3252,
"tokens_trained": 1.598428968
},
{
"epoch": 0.9230551024750018,
"grad_norm": 3.321300745010376,
"loss": 3.969,
"lr": 0.000614965034965035,
"step": 3254,
"tokens_trained": 1.599406304
},
{
"epoch": 0.9236224381249557,
"grad_norm": 5.910068511962891,
"loss": 3.9806,
"lr": 0.0006146853146853147,
"step": 3256,
"tokens_trained": 1.60038644
},
{
"epoch": 0.9241897737749096,
"grad_norm": 9.364005088806152,
"loss": 3.9919,
"lr": 0.0006144055944055945,
"step": 3258,
"tokens_trained": 1.601371288
},
{
"epoch": 0.9247571094248634,
"grad_norm": 9.865127563476562,
"loss": 3.9827,
"lr": 0.0006141258741258742,
"step": 3260,
"tokens_trained": 1.602351528
},
{
"epoch": 0.9253244450748174,
"grad_norm": 6.053020000457764,
"loss": 3.9769,
"lr": 0.0006138461538461538,
"step": 3262,
"tokens_trained": 1.603337336
},
{
"epoch": 0.9258917807247713,
"grad_norm": 5.632033348083496,
"loss": 4.061,
"lr": 0.0006135664335664336,
"step": 3264,
"tokens_trained": 1.6043186
},
{
"epoch": 0.9264591163747252,
"grad_norm": 6.253534317016602,
"loss": 3.9414,
"lr": 0.0006132867132867132,
"step": 3266,
"tokens_trained": 1.605300448
},
{
"epoch": 0.9270264520246791,
"grad_norm": 7.757418632507324,
"loss": 4.0119,
"lr": 0.0006130069930069931,
"step": 3268,
"tokens_trained": 1.60628376
},
{
"epoch": 0.927593787674633,
"grad_norm": 5.378245830535889,
"loss": 3.9746,
"lr": 0.0006127272727272727,
"step": 3270,
"tokens_trained": 1.607265384
},
{
"epoch": 0.9281611233245869,
"grad_norm": 5.998968124389648,
"loss": 4.0218,
"lr": 0.0006124475524475525,
"step": 3272,
"tokens_trained": 1.60824544
},
{
"epoch": 0.9287284589745408,
"grad_norm": 6.340670585632324,
"loss": 4.0204,
"lr": 0.0006121678321678322,
"step": 3274,
"tokens_trained": 1.609232632
},
{
"epoch": 0.9292957946244947,
"grad_norm": 6.357148170471191,
"loss": 3.9686,
"lr": 0.0006118881118881118,
"step": 3276,
"tokens_trained": 1.610216024
},
{
"epoch": 0.9298631302744487,
"grad_norm": 4.993794918060303,
"loss": 3.9812,
"lr": 0.0006116083916083917,
"step": 3278,
"tokens_trained": 1.611196872
},
{
"epoch": 0.9304304659244025,
"grad_norm": 7.559938430786133,
"loss": 4.0018,
"lr": 0.0006113286713286713,
"step": 3280,
"tokens_trained": 1.612184944
},
{
"epoch": 0.9309978015743564,
"grad_norm": 3.7233004570007324,
"loss": 3.9835,
"lr": 0.0006110489510489511,
"step": 3282,
"tokens_trained": 1.613170464
},
{
"epoch": 0.9315651372243103,
"grad_norm": 7.3292717933654785,
"loss": 3.977,
"lr": 0.0006107692307692307,
"step": 3284,
"tokens_trained": 1.614153168
},
{
"epoch": 0.9321324728742643,
"grad_norm": 8.804302215576172,
"loss": 3.962,
"lr": 0.0006104895104895106,
"step": 3286,
"tokens_trained": 1.615134208
},
{
"epoch": 0.9326998085242182,
"grad_norm": 5.557953834533691,
"loss": 3.9729,
"lr": 0.0006102097902097902,
"step": 3288,
"tokens_trained": 1.616116248
},
{
"epoch": 0.933267144174172,
"grad_norm": 5.135542869567871,
"loss": 3.9855,
"lr": 0.0006099300699300699,
"step": 3290,
"tokens_trained": 1.617100064
},
{
"epoch": 0.9338344798241259,
"grad_norm": 10.206086158752441,
"loss": 4.0058,
"lr": 0.0006096503496503497,
"step": 3292,
"tokens_trained": 1.61808084
},
{
"epoch": 0.9344018154740799,
"grad_norm": 6.490070819854736,
"loss": 4.0328,
"lr": 0.0006093706293706293,
"step": 3294,
"tokens_trained": 1.619061608
},
{
"epoch": 0.9349691511240338,
"grad_norm": 6.246134281158447,
"loss": 3.9858,
"lr": 0.0006090909090909092,
"step": 3296,
"tokens_trained": 1.620046896
},
{
"epoch": 0.9355364867739877,
"grad_norm": 6.82793664932251,
"loss": 3.9416,
"lr": 0.0006088111888111888,
"step": 3298,
"tokens_trained": 1.621030544
},
{
"epoch": 0.9361038224239415,
"grad_norm": 5.400341510772705,
"loss": 4.0048,
"lr": 0.0006085314685314686,
"step": 3300,
"tokens_trained": 1.622010024
},
{
"epoch": 0.9366711580738954,
"grad_norm": 2.7493224143981934,
"loss": 3.9987,
"lr": 0.0006082517482517482,
"step": 3302,
"tokens_trained": 1.622992736
},
{
"epoch": 0.9372384937238494,
"grad_norm": 8.426931381225586,
"loss": 4.0074,
"lr": 0.000607972027972028,
"step": 3304,
"tokens_trained": 1.623977336
},
{
"epoch": 0.9378058293738033,
"grad_norm": 6.779547691345215,
"loss": 4.0041,
"lr": 0.0006076923076923077,
"step": 3306,
"tokens_trained": 1.624958504
},
{
"epoch": 0.9383731650237572,
"grad_norm": 5.38230562210083,
"loss": 4.0297,
"lr": 0.0006074125874125874,
"step": 3308,
"tokens_trained": 1.625948568
},
{
"epoch": 0.938940500673711,
"grad_norm": 5.785275936126709,
"loss": 4.0112,
"lr": 0.0006071328671328672,
"step": 3310,
"tokens_trained": 1.626932696
},
{
"epoch": 0.939507836323665,
"grad_norm": 14.610711097717285,
"loss": 3.9558,
"lr": 0.0006068531468531468,
"step": 3312,
"tokens_trained": 1.62791704
},
{
"epoch": 0.9400751719736189,
"grad_norm": 2.3301351070404053,
"loss": 4.0155,
"lr": 0.0006065734265734267,
"step": 3314,
"tokens_trained": 1.628900096
},
{
"epoch": 0.9406425076235728,
"grad_norm": 17.020362854003906,
"loss": 4.0244,
"lr": 0.0006062937062937063,
"step": 3316,
"tokens_trained": 1.629885888
},
{
"epoch": 0.9412098432735267,
"grad_norm": 8.809579849243164,
"loss": 4.0622,
"lr": 0.000606013986013986,
"step": 3318,
"tokens_trained": 1.630868992
},
{
"epoch": 0.9417771789234806,
"grad_norm": 4.908751964569092,
"loss": 4.0464,
"lr": 0.0006057342657342657,
"step": 3320,
"tokens_trained": 1.631855664
},
{
"epoch": 0.9423445145734345,
"grad_norm": 9.65546989440918,
"loss": 4.013,
"lr": 0.0006054545454545455,
"step": 3322,
"tokens_trained": 1.632839496
},
{
"epoch": 0.9429118502233884,
"grad_norm": 5.595473766326904,
"loss": 4.0371,
"lr": 0.0006051748251748252,
"step": 3324,
"tokens_trained": 1.633827536
},
{
"epoch": 0.9434791858733423,
"grad_norm": 10.249938011169434,
"loss": 4.0702,
"lr": 0.0006048951048951049,
"step": 3326,
"tokens_trained": 1.634811888
},
{
"epoch": 0.9440465215232963,
"grad_norm": 12.086007118225098,
"loss": 4.0042,
"lr": 0.0006046153846153846,
"step": 3328,
"tokens_trained": 1.635792824
},
{
"epoch": 0.9446138571732501,
"grad_norm": 3.0745136737823486,
"loss": 4.0355,
"lr": 0.0006043356643356643,
"step": 3330,
"tokens_trained": 1.636776176
},
{
"epoch": 0.945181192823204,
"grad_norm": 4.060697078704834,
"loss": 4.0016,
"lr": 0.0006040559440559441,
"step": 3332,
"tokens_trained": 1.637758008
},
{
"epoch": 0.9457485284731579,
"grad_norm": 7.648933410644531,
"loss": 3.9939,
"lr": 0.0006037762237762238,
"step": 3334,
"tokens_trained": 1.638744408
},
{
"epoch": 0.9463158641231119,
"grad_norm": 5.033253192901611,
"loss": 4.0245,
"lr": 0.0006034965034965035,
"step": 3336,
"tokens_trained": 1.639724776
},
{
"epoch": 0.9468831997730658,
"grad_norm": 4.653557300567627,
"loss": 4.0169,
"lr": 0.0006032167832167832,
"step": 3338,
"tokens_trained": 1.640708864
},
{
"epoch": 0.9474505354230196,
"grad_norm": 6.682651042938232,
"loss": 4.0062,
"lr": 0.000602937062937063,
"step": 3340,
"tokens_trained": 1.641689864
},
{
"epoch": 0.9480178710729735,
"grad_norm": 5.059361934661865,
"loss": 3.9681,
"lr": 0.0006026573426573426,
"step": 3342,
"tokens_trained": 1.64267264
},
{
"epoch": 0.9485852067229275,
"grad_norm": 4.165974140167236,
"loss": 3.9941,
"lr": 0.0006023776223776224,
"step": 3344,
"tokens_trained": 1.643655624
},
{
"epoch": 0.9491525423728814,
"grad_norm": 6.669079780578613,
"loss": 4.0258,
"lr": 0.0006020979020979021,
"step": 3346,
"tokens_trained": 1.644635752
},
{
"epoch": 0.9497198780228353,
"grad_norm": 5.924664497375488,
"loss": 4.0589,
"lr": 0.0006018181818181818,
"step": 3348,
"tokens_trained": 1.64561992
},
{
"epoch": 0.9502872136727891,
"grad_norm": 1.662906527519226,
"loss": 3.9894,
"lr": 0.0006015384615384616,
"step": 3350,
"tokens_trained": 1.646605552
},
{
"epoch": 0.950854549322743,
"grad_norm": 3.1677517890930176,
"loss": 4.0062,
"lr": 0.0006012587412587413,
"step": 3352,
"tokens_trained": 1.647587824
},
{
"epoch": 0.951421884972697,
"grad_norm": 5.4521918296813965,
"loss": 4.0244,
"lr": 0.000600979020979021,
"step": 3354,
"tokens_trained": 1.648566792
},
{
"epoch": 0.9519892206226509,
"grad_norm": 7.839843273162842,
"loss": 3.9954,
"lr": 0.0006006993006993006,
"step": 3356,
"tokens_trained": 1.6495504
},
{
"epoch": 0.9525565562726048,
"grad_norm": 5.340535640716553,
"loss": 3.9915,
"lr": 0.0006004195804195805,
"step": 3358,
"tokens_trained": 1.65053064
},
{
"epoch": 0.9531238919225586,
"grad_norm": 3.9342992305755615,
"loss": 3.9507,
"lr": 0.0006001398601398601,
"step": 3360,
"tokens_trained": 1.651516704
},
{
"epoch": 0.9536912275725126,
"grad_norm": 3.879631519317627,
"loss": 4.0369,
"lr": 0.0005998601398601399,
"step": 3362,
"tokens_trained": 1.652501248
},
{
"epoch": 0.9542585632224665,
"grad_norm": 4.699181079864502,
"loss": 4.0151,
"lr": 0.0005995804195804196,
"step": 3364,
"tokens_trained": 1.653486632
},
{
"epoch": 0.9548258988724204,
"grad_norm": 7.259454250335693,
"loss": 3.9855,
"lr": 0.0005993006993006993,
"step": 3366,
"tokens_trained": 1.654473488
},
{
"epoch": 0.9553932345223743,
"grad_norm": 6.6725029945373535,
"loss": 3.9972,
"lr": 0.0005990209790209791,
"step": 3368,
"tokens_trained": 1.655456328
},
{
"epoch": 0.9559605701723282,
"grad_norm": 5.077842712402344,
"loss": 3.9706,
"lr": 0.0005987412587412587,
"step": 3370,
"tokens_trained": 1.656442256
},
{
"epoch": 0.9565279058222821,
"grad_norm": 7.882787704467773,
"loss": 4.0581,
"lr": 0.0005984615384615385,
"step": 3372,
"tokens_trained": 1.657425912
},
{
"epoch": 0.957095241472236,
"grad_norm": 7.118039608001709,
"loss": 3.9939,
"lr": 0.0005981818181818181,
"step": 3374,
"tokens_trained": 1.658406184
},
{
"epoch": 0.9573789092972129,
"eval_loss": 1.0043113231658936,
"eval_runtime": 20.471,
"step": 3375,
"tokens_trained": 1.658898224
},
{
"epoch": 0.9576625771221899,
"grad_norm": 11.206400871276855,
"loss": 4.0073,
"lr": 0.000597902097902098,
"step": 3376,
"tokens_trained": 1.65938968
},
{
"epoch": 0.9582299127721439,
"grad_norm": 3.2221481800079346,
"loss": 3.9924,
"lr": 0.0005976223776223776,
"step": 3378,
"tokens_trained": 1.660372856
},
{
"epoch": 0.9587972484220977,
"grad_norm": 15.000614166259766,
"loss": 4.0361,
"lr": 0.0005973426573426574,
"step": 3380,
"tokens_trained": 1.66135512
},
{
"epoch": 0.9593645840720516,
"grad_norm": 13.365633964538574,
"loss": 4.0258,
"lr": 0.0005970629370629371,
"step": 3382,
"tokens_trained": 1.662332728
},
{
"epoch": 0.9599319197220055,
"grad_norm": 6.362198829650879,
"loss": 3.9868,
"lr": 0.0005967832167832167,
"step": 3384,
"tokens_trained": 1.663311392
},
{
"epoch": 0.9604992553719595,
"grad_norm": 16.104549407958984,
"loss": 3.9893,
"lr": 0.0005965034965034966,
"step": 3386,
"tokens_trained": 1.664296088
},
{
"epoch": 0.9610665910219134,
"grad_norm": 32.109375,
"loss": 4.0635,
"lr": 0.0005962237762237762,
"step": 3388,
"tokens_trained": 1.665278232
},
{
"epoch": 0.9616339266718672,
"grad_norm": 14.814417839050293,
"loss": 4.0545,
"lr": 0.000595944055944056,
"step": 3390,
"tokens_trained": 1.666262952
},
{
"epoch": 0.9622012623218211,
"grad_norm": 8.69149398803711,
"loss": 4.0214,
"lr": 0.0005956643356643356,
"step": 3392,
"tokens_trained": 1.66724224
},
{
"epoch": 0.962768597971775,
"grad_norm": 6.150435447692871,
"loss": 4.0675,
"lr": 0.0005953846153846155,
"step": 3394,
"tokens_trained": 1.668222488
},
{
"epoch": 0.963335933621729,
"grad_norm": 14.53095817565918,
"loss": 4.0293,
"lr": 0.0005951048951048951,
"step": 3396,
"tokens_trained": 1.66920572
},
{
"epoch": 0.9639032692716829,
"grad_norm": 14.750361442565918,
"loss": 4.0345,
"lr": 0.0005948251748251748,
"step": 3398,
"tokens_trained": 1.670191456
},
{
"epoch": 0.9644706049216367,
"grad_norm": 10.563243865966797,
"loss": 4.0796,
"lr": 0.0005945454545454546,
"step": 3400,
"tokens_trained": 1.671174992
},
{
"epoch": 0.9650379405715906,
"grad_norm": 14.203415870666504,
"loss": 4.0078,
"lr": 0.0005942657342657342,
"step": 3402,
"tokens_trained": 1.672159048
},
{
"epoch": 0.9656052762215446,
"grad_norm": 7.918346405029297,
"loss": 4.0015,
"lr": 0.0005939860139860141,
"step": 3404,
"tokens_trained": 1.6731408
},
{
"epoch": 0.9661726118714985,
"grad_norm": 3.3628811836242676,
"loss": 4.0656,
"lr": 0.0005937062937062937,
"step": 3406,
"tokens_trained": 1.674120472
},
{
"epoch": 0.9667399475214524,
"grad_norm": 13.740876197814941,
"loss": 4.0296,
"lr": 0.0005934265734265735,
"step": 3408,
"tokens_trained": 1.67510176
},
{
"epoch": 0.9673072831714062,
"grad_norm": 8.178666114807129,
"loss": 3.9804,
"lr": 0.0005931468531468531,
"step": 3410,
"tokens_trained": 1.676087336
},
{
"epoch": 0.9678746188213602,
"grad_norm": 6.31284761428833,
"loss": 3.9905,
"lr": 0.000592867132867133,
"step": 3412,
"tokens_trained": 1.677069328
},
{
"epoch": 0.9684419544713141,
"grad_norm": 10.166040420532227,
"loss": 3.9962,
"lr": 0.0005925874125874126,
"step": 3414,
"tokens_trained": 1.678049672
},
{
"epoch": 0.969009290121268,
"grad_norm": 6.166718006134033,
"loss": 3.9966,
"lr": 0.0005923076923076923,
"step": 3416,
"tokens_trained": 1.679035104
},
{
"epoch": 0.969576625771222,
"grad_norm": 3.7397615909576416,
"loss": 4.0323,
"lr": 0.0005920279720279721,
"step": 3418,
"tokens_trained": 1.680018424
},
{
"epoch": 0.9701439614211758,
"grad_norm": 12.122432708740234,
"loss": 4.0143,
"lr": 0.0005917482517482517,
"step": 3420,
"tokens_trained": 1.681001112
},
{
"epoch": 0.9707112970711297,
"grad_norm": 5.118746280670166,
"loss": 3.9909,
"lr": 0.0005914685314685316,
"step": 3422,
"tokens_trained": 1.681987648
},
{
"epoch": 0.9712786327210836,
"grad_norm": 5.810860633850098,
"loss": 3.9675,
"lr": 0.0005911888111888112,
"step": 3424,
"tokens_trained": 1.68296972
},
{
"epoch": 0.9718459683710375,
"grad_norm": 7.637686252593994,
"loss": 3.9976,
"lr": 0.0005909090909090909,
"step": 3426,
"tokens_trained": 1.683952
},
{
"epoch": 0.9724133040209915,
"grad_norm": 5.637698173522949,
"loss": 3.9829,
"lr": 0.0005906293706293706,
"step": 3428,
"tokens_trained": 1.684933912
},
{
"epoch": 0.9729806396709453,
"grad_norm": 2.2650809288024902,
"loss": 3.9656,
"lr": 0.0005903496503496504,
"step": 3430,
"tokens_trained": 1.685915176
},
{
"epoch": 0.9735479753208992,
"grad_norm": 6.0117058753967285,
"loss": 4.0575,
"lr": 0.0005900699300699301,
"step": 3432,
"tokens_trained": 1.686901184
},
{
"epoch": 0.9741153109708531,
"grad_norm": 8.301697731018066,
"loss": 3.9869,
"lr": 0.0005897902097902098,
"step": 3434,
"tokens_trained": 1.687886888
},
{
"epoch": 0.9746826466208071,
"grad_norm": 6.436981678009033,
"loss": 4.01,
"lr": 0.0005895104895104896,
"step": 3436,
"tokens_trained": 1.68886904
},
{
"epoch": 0.975249982270761,
"grad_norm": 4.290571212768555,
"loss": 3.9953,
"lr": 0.0005892307692307692,
"step": 3438,
"tokens_trained": 1.689850264
},
{
"epoch": 0.9758173179207148,
"grad_norm": 4.618532657623291,
"loss": 3.9995,
"lr": 0.000588951048951049,
"step": 3440,
"tokens_trained": 1.69083728
},
{
"epoch": 0.9763846535706687,
"grad_norm": 8.481820106506348,
"loss": 4.0019,
"lr": 0.0005886713286713287,
"step": 3442,
"tokens_trained": 1.691819976
},
{
"epoch": 0.9769519892206227,
"grad_norm": 4.643980503082275,
"loss": 3.9974,
"lr": 0.0005883916083916084,
"step": 3444,
"tokens_trained": 1.692803784
},
{
"epoch": 0.9775193248705766,
"grad_norm": 6.828413009643555,
"loss": 3.9886,
"lr": 0.0005881118881118881,
"step": 3446,
"tokens_trained": 1.69378512
},
{
"epoch": 0.9780866605205305,
"grad_norm": 7.530898094177246,
"loss": 4.0318,
"lr": 0.0005878321678321679,
"step": 3448,
"tokens_trained": 1.694768152
},
{
"epoch": 0.9786539961704843,
"grad_norm": 6.020658493041992,
"loss": 4.0057,
"lr": 0.0005875524475524476,
"step": 3450,
"tokens_trained": 1.695752832
},
{
"epoch": 0.9792213318204382,
"grad_norm": 5.292300224304199,
"loss": 3.9915,
"lr": 0.0005872727272727273,
"step": 3452,
"tokens_trained": 1.696735104
},
{
"epoch": 0.9797886674703922,
"grad_norm": 4.932474613189697,
"loss": 4.0163,
"lr": 0.0005869930069930069,
"step": 3454,
"tokens_trained": 1.697718208
},
{
"epoch": 0.9803560031203461,
"grad_norm": 4.504141807556152,
"loss": 3.9875,
"lr": 0.0005867132867132867,
"step": 3456,
"tokens_trained": 1.698697752
},
{
"epoch": 0.9809233387703,
"grad_norm": 4.826939582824707,
"loss": 3.9326,
"lr": 0.0005864335664335665,
"step": 3458,
"tokens_trained": 1.699672392
},
{
"epoch": 0.9814906744202538,
"grad_norm": 7.805232524871826,
"loss": 3.9695,
"lr": 0.0005861538461538462,
"step": 3460,
"tokens_trained": 1.700656392
},
{
"epoch": 0.9820580100702078,
"grad_norm": 6.857801914215088,
"loss": 3.995,
"lr": 0.0005858741258741259,
"step": 3462,
"tokens_trained": 1.701644848
},
{
"epoch": 0.9826253457201617,
"grad_norm": 4.32315731048584,
"loss": 3.9701,
"lr": 0.0005855944055944055,
"step": 3464,
"tokens_trained": 1.702624688
},
{
"epoch": 0.9831926813701156,
"grad_norm": 6.007495880126953,
"loss": 3.9887,
"lr": 0.0005853146853146854,
"step": 3466,
"tokens_trained": 1.703607376
},
{
"epoch": 0.9837600170200695,
"grad_norm": 4.779850006103516,
"loss": 3.9852,
"lr": 0.000585034965034965,
"step": 3468,
"tokens_trained": 1.704589808
},
{
"epoch": 0.9843273526700234,
"grad_norm": 4.593331336975098,
"loss": 4.0136,
"lr": 0.0005847552447552448,
"step": 3470,
"tokens_trained": 1.705573184
},
{
"epoch": 0.9848946883199773,
"grad_norm": 5.466218948364258,
"loss": 3.9426,
"lr": 0.0005844755244755244,
"step": 3472,
"tokens_trained": 1.706555864
},
{
"epoch": 0.9854620239699312,
"grad_norm": 8.283979415893555,
"loss": 3.9788,
"lr": 0.0005841958041958042,
"step": 3474,
"tokens_trained": 1.70754036
},
{
"epoch": 0.9860293596198851,
"grad_norm": 2.4386069774627686,
"loss": 3.9413,
"lr": 0.000583916083916084,
"step": 3476,
"tokens_trained": 1.708525528
},
{
"epoch": 0.9865966952698391,
"grad_norm": 4.485580921173096,
"loss": 3.9695,
"lr": 0.0005836363636363636,
"step": 3478,
"tokens_trained": 1.709508232
},
{
"epoch": 0.9871640309197929,
"grad_norm": 6.725922584533691,
"loss": 4.0084,
"lr": 0.0005833566433566434,
"step": 3480,
"tokens_trained": 1.710493288
},
{
"epoch": 0.9877313665697468,
"grad_norm": 5.532742023468018,
"loss": 3.9571,
"lr": 0.000583076923076923,
"step": 3482,
"tokens_trained": 1.711478792
},
{
"epoch": 0.9882987022197007,
"grad_norm": 5.568683624267578,
"loss": 4.0178,
"lr": 0.0005827972027972029,
"step": 3484,
"tokens_trained": 1.712464864
},
{
"epoch": 0.9888660378696547,
"grad_norm": 5.192487716674805,
"loss": 4.0294,
"lr": 0.0005825174825174825,
"step": 3486,
"tokens_trained": 1.713448256
},
{
"epoch": 0.9894333735196086,
"grad_norm": 5.584596633911133,
"loss": 3.9992,
"lr": 0.0005822377622377623,
"step": 3488,
"tokens_trained": 1.714435472
},
{
"epoch": 0.9900007091695624,
"grad_norm": 5.044432163238525,
"loss": 4.0119,
"lr": 0.0005819580419580419,
"step": 3490,
"tokens_trained": 1.715418784
},
{
"epoch": 0.9905680448195163,
"grad_norm": 3.4799540042877197,
"loss": 4.0099,
"lr": 0.0005816783216783216,
"step": 3492,
"tokens_trained": 1.716402544
},
{
"epoch": 0.9911353804694703,
"grad_norm": 4.949790000915527,
"loss": 3.9372,
"lr": 0.0005813986013986015,
"step": 3494,
"tokens_trained": 1.71738848
},
{
"epoch": 0.9917027161194242,
"grad_norm": 6.527776718139648,
"loss": 3.9938,
"lr": 0.0005811188811188811,
"step": 3496,
"tokens_trained": 1.718371984
},
{
"epoch": 0.9922700517693781,
"grad_norm": 5.616584300994873,
"loss": 3.9352,
"lr": 0.0005808391608391609,
"step": 3498,
"tokens_trained": 1.719358256
},
{
"epoch": 0.9928373874193319,
"grad_norm": 7.028440952301025,
"loss": 3.9494,
"lr": 0.0005805594405594405,
"step": 3500,
"tokens_trained": 1.720339264
},
{
"epoch": 0.9928373874193319,
"eval_loss": 0.999991238117218,
"eval_runtime": 20.318,
"step": 3500,
"tokens_trained": 1.720339264
},
{
"epoch": 0.9934047230692858,
"grad_norm": 5.338140487670898,
"loss": 3.9748,
"lr": 0.0005802797202797204,
"step": 3502,
"tokens_trained": 1.72132272
},
{
"epoch": 0.9939720587192398,
"grad_norm": 3.3448476791381836,
"loss": 3.96,
"lr": 0.00058,
"step": 3504,
"tokens_trained": 1.722307576
},
{
"epoch": 0.9945393943691937,
"grad_norm": 10.660968780517578,
"loss": 4.0199,
"lr": 0.0005797202797202797,
"step": 3506,
"tokens_trained": 1.723288472
},
{
"epoch": 0.9951067300191476,
"grad_norm": 7.261615753173828,
"loss": 3.9889,
"lr": 0.0005794405594405594,
"step": 3508,
"tokens_trained": 1.724272744
},
{
"epoch": 0.9956740656691014,
"grad_norm": 5.103553295135498,
"loss": 4.0047,
"lr": 0.0005791608391608391,
"step": 3510,
"tokens_trained": 1.725255576
},
{
"epoch": 0.9962414013190554,
"grad_norm": 1.5151104927062988,
"loss": 4.0228,
"lr": 0.000578881118881119,
"step": 3512,
"tokens_trained": 1.72624092
},
{
"epoch": 0.9968087369690093,
"grad_norm": 6.042428493499756,
"loss": 3.9699,
"lr": 0.0005786013986013986,
"step": 3514,
"tokens_trained": 1.727227176
},
{
"epoch": 0.9973760726189632,
"grad_norm": 10.020720481872559,
"loss": 3.9961,
"lr": 0.0005783216783216784,
"step": 3516,
"tokens_trained": 1.728205072
},
{
"epoch": 0.9979434082689171,
"grad_norm": 9.385619163513184,
"loss": 3.9962,
"lr": 0.000578041958041958,
"step": 3518,
"tokens_trained": 1.729187536
},
{
"epoch": 0.998510743918871,
"grad_norm": 1.413792371749878,
"loss": 4.0256,
"lr": 0.0005777622377622377,
"step": 3520,
"tokens_trained": 1.730168968
},
{
"epoch": 0.9990780795688249,
"grad_norm": 2.8461780548095703,
"loss": 3.9616,
"lr": 0.0005774825174825175,
"step": 3522,
"tokens_trained": 1.731150472
},
{
"epoch": 0.9996454152187788,
"grad_norm": 4.164590835571289,
"loss": 3.9786,
"lr": 0.0005772027972027972,
"step": 3524,
"tokens_trained": 1.732130536
},
{
"epoch": 1.0,
"grad_norm": 1.0116016864776611,
"loss": 2.5007,
"lr": 0.0005769230769230769,
"step": 3526,
"tokens_trained": 1.732744968
},
{
"epoch": 1.0005673356499538,
"grad_norm": 5.954165458679199,
"loss": 3.9598,
"lr": 0.0005766433566433566,
"step": 3528,
"tokens_trained": 1.733727424
},
{
"epoch": 1.0011346712999079,
"grad_norm": 8.648826599121094,
"loss": 3.9773,
"lr": 0.0005763636363636365,
"step": 3530,
"tokens_trained": 1.734708184
},
{
"epoch": 1.0017020069498617,
"grad_norm": 2.920509099960327,
"loss": 3.9745,
"lr": 0.0005760839160839161,
"step": 3532,
"tokens_trained": 1.735688616
},
{
"epoch": 1.0022693425998157,
"grad_norm": 9.963903427124023,
"loss": 3.9742,
"lr": 0.0005758041958041958,
"step": 3534,
"tokens_trained": 1.73667084
},
{
"epoch": 1.0028366782497695,
"grad_norm": 9.745009422302246,
"loss": 4.028,
"lr": 0.0005755244755244755,
"step": 3536,
"tokens_trained": 1.737656328
},
{
"epoch": 1.0034040138997233,
"grad_norm": 5.159154891967773,
"loss": 3.9812,
"lr": 0.0005752447552447552,
"step": 3538,
"tokens_trained": 1.738637688
},
{
"epoch": 1.0039713495496774,
"grad_norm": 10.829404830932617,
"loss": 3.9795,
"lr": 0.000574965034965035,
"step": 3540,
"tokens_trained": 1.739621688
},
{
"epoch": 1.0045386851996312,
"grad_norm": 8.493478775024414,
"loss": 3.9918,
"lr": 0.0005746853146853147,
"step": 3542,
"tokens_trained": 1.740604488
},
{
"epoch": 1.0051060208495852,
"grad_norm": 4.013627529144287,
"loss": 3.9928,
"lr": 0.0005744055944055944,
"step": 3544,
"tokens_trained": 1.74158764
},
{
"epoch": 1.005673356499539,
"grad_norm": 12.669920921325684,
"loss": 4.0114,
"lr": 0.0005741258741258741,
"step": 3546,
"tokens_trained": 1.742573592
},
{
"epoch": 1.0062406921494929,
"grad_norm": 6.349422931671143,
"loss": 4.0294,
"lr": 0.000573846153846154,
"step": 3548,
"tokens_trained": 1.743555672
},
{
"epoch": 1.006808027799447,
"grad_norm": 4.14855432510376,
"loss": 3.9963,
"lr": 0.0005735664335664336,
"step": 3550,
"tokens_trained": 1.744538384
},
{
"epoch": 1.0073753634494007,
"grad_norm": 9.063926696777344,
"loss": 3.9557,
"lr": 0.0005732867132867133,
"step": 3552,
"tokens_trained": 1.745523552
},
{
"epoch": 1.0079426990993547,
"grad_norm": 11.227505683898926,
"loss": 4.0087,
"lr": 0.000573006993006993,
"step": 3554,
"tokens_trained": 1.746510024
},
{
"epoch": 1.0085100347493086,
"grad_norm": 2.418097972869873,
"loss": 3.9942,
"lr": 0.0005727272727272727,
"step": 3556,
"tokens_trained": 1.747493048
},
{
"epoch": 1.0090773703992624,
"grad_norm": 14.376424789428711,
"loss": 3.999,
"lr": 0.0005724475524475525,
"step": 3558,
"tokens_trained": 1.748476808
},
{
"epoch": 1.0096447060492164,
"grad_norm": 9.035455703735352,
"loss": 4.063,
"lr": 0.0005721678321678322,
"step": 3560,
"tokens_trained": 1.749460504
},
{
"epoch": 1.0102120416991702,
"grad_norm": 3.8785758018493652,
"loss": 4.0269,
"lr": 0.0005718881118881118,
"step": 3562,
"tokens_trained": 1.750438936
},
{
"epoch": 1.0107793773491243,
"grad_norm": 15.488290786743164,
"loss": 4.0294,
"lr": 0.0005716083916083916,
"step": 3564,
"tokens_trained": 1.751420168
},
{
"epoch": 1.011346712999078,
"grad_norm": 10.785538673400879,
"loss": 4.0102,
"lr": 0.0005713286713286714,
"step": 3566,
"tokens_trained": 1.752405288
},
{
"epoch": 1.011914048649032,
"grad_norm": 5.724320888519287,
"loss": 4.0148,
"lr": 0.0005710489510489511,
"step": 3568,
"tokens_trained": 1.75338604
},
{
"epoch": 1.012481384298986,
"grad_norm": 11.051252365112305,
"loss": 4.022,
"lr": 0.0005707692307692308,
"step": 3570,
"tokens_trained": 1.75436632
},
{
"epoch": 1.0130487199489397,
"grad_norm": 10.290446281433105,
"loss": 3.9781,
"lr": 0.0005704895104895105,
"step": 3572,
"tokens_trained": 1.755349944
},
{
"epoch": 1.0136160555988938,
"grad_norm": 4.81416130065918,
"loss": 4.0393,
"lr": 0.0005702097902097902,
"step": 3574,
"tokens_trained": 1.756337976
},
{
"epoch": 1.0141833912488476,
"grad_norm": 14.237113952636719,
"loss": 4.087,
"lr": 0.0005699300699300699,
"step": 3576,
"tokens_trained": 1.75732372
},
{
"epoch": 1.0147507268988014,
"grad_norm": 3.973662853240967,
"loss": 3.9692,
"lr": 0.0005696503496503497,
"step": 3578,
"tokens_trained": 1.7583098
},
{
"epoch": 1.0153180625487555,
"grad_norm": 5.629733562469482,
"loss": 4.0003,
"lr": 0.0005693706293706293,
"step": 3580,
"tokens_trained": 1.759300416
},
{
"epoch": 1.0158853981987093,
"grad_norm": 7.505983352661133,
"loss": 4.011,
"lr": 0.0005690909090909091,
"step": 3582,
"tokens_trained": 1.760288632
},
{
"epoch": 1.0164527338486633,
"grad_norm": 5.501095294952393,
"loss": 3.994,
"lr": 0.0005688111888111889,
"step": 3584,
"tokens_trained": 1.761270328
},
{
"epoch": 1.0170200694986171,
"grad_norm": 4.74052619934082,
"loss": 4.0241,
"lr": 0.0005685314685314686,
"step": 3586,
"tokens_trained": 1.762252432
},
{
"epoch": 1.017587405148571,
"grad_norm": 8.409584045410156,
"loss": 4.0137,
"lr": 0.0005682517482517483,
"step": 3588,
"tokens_trained": 1.76323772
},
{
"epoch": 1.018154740798525,
"grad_norm": 5.391080379486084,
"loss": 3.9424,
"lr": 0.0005679720279720279,
"step": 3590,
"tokens_trained": 1.764220272
},
{
"epoch": 1.0187220764484788,
"grad_norm": 4.679509162902832,
"loss": 3.9893,
"lr": 0.0005676923076923077,
"step": 3592,
"tokens_trained": 1.765203832
},
{
"epoch": 1.0192894120984328,
"grad_norm": 5.354970932006836,
"loss": 4.023,
"lr": 0.0005674125874125874,
"step": 3594,
"tokens_trained": 1.76618936
},
{
"epoch": 1.0198567477483866,
"grad_norm": 5.1085357666015625,
"loss": 3.9995,
"lr": 0.0005671328671328672,
"step": 3596,
"tokens_trained": 1.767171216
},
{
"epoch": 1.0204240833983405,
"grad_norm": 3.0856151580810547,
"loss": 4.0084,
"lr": 0.0005668531468531468,
"step": 3598,
"tokens_trained": 1.76815464
},
{
"epoch": 1.0209914190482945,
"grad_norm": 2.330599308013916,
"loss": 3.9838,
"lr": 0.0005665734265734265,
"step": 3600,
"tokens_trained": 1.76913612
},
{
"epoch": 1.0215587546982483,
"grad_norm": 5.641542434692383,
"loss": 3.951,
"lr": 0.0005662937062937064,
"step": 3602,
"tokens_trained": 1.770119592
},
{
"epoch": 1.0221260903482023,
"grad_norm": 8.442550659179688,
"loss": 4.0088,
"lr": 0.000566013986013986,
"step": 3604,
"tokens_trained": 1.771103624
},
{
"epoch": 1.0226934259981562,
"grad_norm": 6.0125732421875,
"loss": 4.0243,
"lr": 0.0005657342657342658,
"step": 3606,
"tokens_trained": 1.772091496
},
{
"epoch": 1.02326076164811,
"grad_norm": 4.9415388107299805,
"loss": 3.9874,
"lr": 0.0005654545454545454,
"step": 3608,
"tokens_trained": 1.77307708
},
{
"epoch": 1.023828097298064,
"grad_norm": 5.762909889221191,
"loss": 4.0242,
"lr": 0.0005651748251748252,
"step": 3610,
"tokens_trained": 1.774058032
},
{
"epoch": 1.0243954329480178,
"grad_norm": 6.652433395385742,
"loss": 3.9908,
"lr": 0.0005648951048951049,
"step": 3612,
"tokens_trained": 1.775036512
},
{
"epoch": 1.0249627685979719,
"grad_norm": 3.539031505584717,
"loss": 3.9406,
"lr": 0.0005646153846153847,
"step": 3614,
"tokens_trained": 1.776021656
},
{
"epoch": 1.0255301042479257,
"grad_norm": 6.829031467437744,
"loss": 3.9839,
"lr": 0.0005643356643356643,
"step": 3616,
"tokens_trained": 1.777000824
},
{
"epoch": 1.0260974398978795,
"grad_norm": 3.46431040763855,
"loss": 4.0013,
"lr": 0.000564055944055944,
"step": 3618,
"tokens_trained": 1.777983504
},
{
"epoch": 1.0266647755478335,
"grad_norm": 5.163998126983643,
"loss": 3.9898,
"lr": 0.0005637762237762239,
"step": 3620,
"tokens_trained": 1.778966368
},
{
"epoch": 1.0272321111977873,
"grad_norm": 4.270689010620117,
"loss": 3.9868,
"lr": 0.0005634965034965035,
"step": 3622,
"tokens_trained": 1.77994468
},
{
"epoch": 1.0277994468477414,
"grad_norm": 5.297236442565918,
"loss": 3.9903,
"lr": 0.0005632167832167833,
"step": 3624,
"tokens_trained": 1.7809246
},
{
"epoch": 1.0280831146727183,
"eval_loss": 0.9977753162384033,
"eval_runtime": 20.5557,
"step": 3625,
"tokens_trained": 1.781418056
},
{
"epoch": 1.0283667824976952,
"grad_norm": 4.560519218444824,
"loss": 3.9339,
"lr": 0.0005629370629370629,
"step": 3626,
"tokens_trained": 1.781910808
},
{
"epoch": 1.028934118147649,
"grad_norm": 3.7894208431243896,
"loss": 3.9739,
"lr": 0.0005626573426573426,
"step": 3628,
"tokens_trained": 1.782891912
},
{
"epoch": 1.029501453797603,
"grad_norm": 3.9937522411346436,
"loss": 3.9734,
"lr": 0.0005623776223776224,
"step": 3630,
"tokens_trained": 1.783871032
},
{
"epoch": 1.0300687894475569,
"grad_norm": 5.798377990722656,
"loss": 3.9526,
"lr": 0.0005620979020979021,
"step": 3632,
"tokens_trained": 1.784855792
},
{
"epoch": 1.030636125097511,
"grad_norm": 3.2532927989959717,
"loss": 3.9237,
"lr": 0.0005618181818181818,
"step": 3634,
"tokens_trained": 1.785835216
},
{
"epoch": 1.0312034607474647,
"grad_norm": 3.2262985706329346,
"loss": 3.9676,
"lr": 0.0005615384615384615,
"step": 3636,
"tokens_trained": 1.78682184
},
{
"epoch": 1.0317707963974185,
"grad_norm": 2.4307727813720703,
"loss": 3.9376,
"lr": 0.0005612587412587414,
"step": 3638,
"tokens_trained": 1.787804536
},
{
"epoch": 1.0323381320473726,
"grad_norm": 11.10562515258789,
"loss": 4.0096,
"lr": 0.000560979020979021,
"step": 3640,
"tokens_trained": 1.788785152
},
{
"epoch": 1.0329054676973264,
"grad_norm": 8.139045715332031,
"loss": 3.992,
"lr": 0.0005606993006993008,
"step": 3642,
"tokens_trained": 1.789766736
},
{
"epoch": 1.0334728033472804,
"grad_norm": 5.561949729919434,
"loss": 3.9368,
"lr": 0.0005604195804195804,
"step": 3644,
"tokens_trained": 1.790746488
},
{
"epoch": 1.0340401389972342,
"grad_norm": 6.812232494354248,
"loss": 4.0185,
"lr": 0.0005601398601398601,
"step": 3646,
"tokens_trained": 1.79172608
},
{
"epoch": 1.034607474647188,
"grad_norm": 6.200248718261719,
"loss": 3.9072,
"lr": 0.0005598601398601399,
"step": 3648,
"tokens_trained": 1.792710784
},
{
"epoch": 1.035174810297142,
"grad_norm": 5.059606075286865,
"loss": 3.9334,
"lr": 0.0005595804195804196,
"step": 3650,
"tokens_trained": 1.793692736
},
{
"epoch": 1.035742145947096,
"grad_norm": 2.722522020339966,
"loss": 3.9438,
"lr": 0.0005593006993006993,
"step": 3652,
"tokens_trained": 1.79467536
},
{
"epoch": 1.03630948159705,
"grad_norm": 5.643895626068115,
"loss": 4.0213,
"lr": 0.000559020979020979,
"step": 3654,
"tokens_trained": 1.795662048
},
{
"epoch": 1.0368768172470038,
"grad_norm": 3.948822021484375,
"loss": 4.0022,
"lr": 0.0005587412587412589,
"step": 3656,
"tokens_trained": 1.79664468
},
{
"epoch": 1.0374441528969576,
"grad_norm": 2.5267179012298584,
"loss": 3.9655,
"lr": 0.0005584615384615385,
"step": 3658,
"tokens_trained": 1.7976262
},
{
"epoch": 1.0380114885469116,
"grad_norm": 2.7988510131835938,
"loss": 4.0161,
"lr": 0.0005581818181818182,
"step": 3660,
"tokens_trained": 1.79861132
},
{
"epoch": 1.0385788241968654,
"grad_norm": 8.685417175292969,
"loss": 4.0038,
"lr": 0.0005579020979020979,
"step": 3662,
"tokens_trained": 1.799592384
},
{
"epoch": 1.0391461598468195,
"grad_norm": 8.391874313354492,
"loss": 3.9519,
"lr": 0.0005576223776223776,
"step": 3664,
"tokens_trained": 1.800577208
},
{
"epoch": 1.0397134954967733,
"grad_norm": 7.6766815185546875,
"loss": 4.0119,
"lr": 0.0005573426573426574,
"step": 3666,
"tokens_trained": 1.801559128
},
{
"epoch": 1.040280831146727,
"grad_norm": 6.230587959289551,
"loss": 3.9528,
"lr": 0.0005570629370629371,
"step": 3668,
"tokens_trained": 1.802540608
},
{
"epoch": 1.0408481667966811,
"grad_norm": 7.4818010330200195,
"loss": 3.9532,
"lr": 0.0005567832167832167,
"step": 3670,
"tokens_trained": 1.80352688
},
{
"epoch": 1.041415502446635,
"grad_norm": 7.714044094085693,
"loss": 4.0154,
"lr": 0.0005565034965034965,
"step": 3672,
"tokens_trained": 1.804515736
},
{
"epoch": 1.041982838096589,
"grad_norm": 5.260356426239014,
"loss": 3.9931,
"lr": 0.0005562237762237763,
"step": 3674,
"tokens_trained": 1.805497152
},
{
"epoch": 1.0425501737465428,
"grad_norm": 4.576403617858887,
"loss": 4.0345,
"lr": 0.000555944055944056,
"step": 3676,
"tokens_trained": 1.806479328
},
{
"epoch": 1.0431175093964966,
"grad_norm": 3.378896713256836,
"loss": 3.9827,
"lr": 0.0005556643356643357,
"step": 3678,
"tokens_trained": 1.807459232
},
{
"epoch": 1.0436848450464506,
"grad_norm": 6.739299774169922,
"loss": 3.9811,
"lr": 0.0005553846153846154,
"step": 3680,
"tokens_trained": 1.808441944
},
{
"epoch": 1.0442521806964045,
"grad_norm": 4.965353012084961,
"loss": 3.9292,
"lr": 0.0005551048951048951,
"step": 3682,
"tokens_trained": 1.809423488
},
{
"epoch": 1.0448195163463585,
"grad_norm": 7.479167461395264,
"loss": 3.9386,
"lr": 0.0005548251748251748,
"step": 3684,
"tokens_trained": 1.810409008
},
{
"epoch": 1.0453868519963123,
"grad_norm": 3.754814863204956,
"loss": 3.9936,
"lr": 0.0005545454545454546,
"step": 3686,
"tokens_trained": 1.811387856
},
{
"epoch": 1.0459541876462661,
"grad_norm": 5.744228839874268,
"loss": 3.9761,
"lr": 0.0005542657342657342,
"step": 3688,
"tokens_trained": 1.812371104
},
{
"epoch": 1.0465215232962202,
"grad_norm": 5.926168918609619,
"loss": 3.904,
"lr": 0.000553986013986014,
"step": 3690,
"tokens_trained": 1.813356456
},
{
"epoch": 1.047088858946174,
"grad_norm": 5.209751605987549,
"loss": 3.9706,
"lr": 0.0005537062937062938,
"step": 3692,
"tokens_trained": 1.81434056
},
{
"epoch": 1.047656194596128,
"grad_norm": 4.979823112487793,
"loss": 3.972,
"lr": 0.0005534265734265735,
"step": 3694,
"tokens_trained": 1.815319936
},
{
"epoch": 1.0482235302460818,
"grad_norm": 5.393070220947266,
"loss": 3.9694,
"lr": 0.0005531468531468532,
"step": 3696,
"tokens_trained": 1.816299016
},
{
"epoch": 1.0487908658960357,
"grad_norm": 3.27998423576355,
"loss": 3.9706,
"lr": 0.0005528671328671328,
"step": 3698,
"tokens_trained": 1.817284696
},
{
"epoch": 1.0493582015459897,
"grad_norm": 6.364100456237793,
"loss": 3.9803,
"lr": 0.0005525874125874126,
"step": 3700,
"tokens_trained": 1.818268736
},
{
"epoch": 1.0499255371959435,
"grad_norm": 6.063296794891357,
"loss": 3.9761,
"lr": 0.0005523076923076923,
"step": 3702,
"tokens_trained": 1.819255432
},
{
"epoch": 1.0504928728458975,
"grad_norm": 6.279892444610596,
"loss": 3.9792,
"lr": 0.0005520279720279721,
"step": 3704,
"tokens_trained": 1.820241704
},
{
"epoch": 1.0510602084958514,
"grad_norm": 3.804609537124634,
"loss": 3.9763,
"lr": 0.0005517482517482517,
"step": 3706,
"tokens_trained": 1.821226584
},
{
"epoch": 1.0516275441458052,
"grad_norm": 5.056581497192383,
"loss": 3.9886,
"lr": 0.0005514685314685315,
"step": 3708,
"tokens_trained": 1.822208432
},
{
"epoch": 1.0521948797957592,
"grad_norm": 2.052483081817627,
"loss": 3.9485,
"lr": 0.0005511888111888111,
"step": 3710,
"tokens_trained": 1.823195928
},
{
"epoch": 1.052762215445713,
"grad_norm": 6.076491832733154,
"loss": 4.0132,
"lr": 0.0005509090909090909,
"step": 3712,
"tokens_trained": 1.824178568
},
{
"epoch": 1.053329551095667,
"grad_norm": 7.526022434234619,
"loss": 3.9478,
"lr": 0.0005506293706293707,
"step": 3714,
"tokens_trained": 1.82516128
},
{
"epoch": 1.0538968867456209,
"grad_norm": 2.7086679935455322,
"loss": 3.9913,
"lr": 0.0005503496503496503,
"step": 3716,
"tokens_trained": 1.826142864
},
{
"epoch": 1.0544642223955747,
"grad_norm": 1.7643057107925415,
"loss": 3.9813,
"lr": 0.0005500699300699301,
"step": 3718,
"tokens_trained": 1.82712608
},
{
"epoch": 1.0550315580455287,
"grad_norm": 6.2813029289245605,
"loss": 3.9772,
"lr": 0.0005497902097902098,
"step": 3720,
"tokens_trained": 1.828107616
},
{
"epoch": 1.0555988936954825,
"grad_norm": 7.591973781585693,
"loss": 3.938,
"lr": 0.0005495104895104896,
"step": 3722,
"tokens_trained": 1.82909308
},
{
"epoch": 1.0561662293454366,
"grad_norm": 4.976797580718994,
"loss": 3.9889,
"lr": 0.0005492307692307692,
"step": 3724,
"tokens_trained": 1.830079168
},
{
"epoch": 1.0567335649953904,
"grad_norm": 5.417744159698486,
"loss": 4.0039,
"lr": 0.0005489510489510489,
"step": 3726,
"tokens_trained": 1.831062488
},
{
"epoch": 1.0573009006453442,
"grad_norm": 4.516066074371338,
"loss": 3.9845,
"lr": 0.0005486713286713286,
"step": 3728,
"tokens_trained": 1.832046528
},
{
"epoch": 1.0578682362952982,
"grad_norm": 3.677839756011963,
"loss": 3.9446,
"lr": 0.0005483916083916084,
"step": 3730,
"tokens_trained": 1.83303104
},
{
"epoch": 1.058435571945252,
"grad_norm": 5.22024393081665,
"loss": 3.9746,
"lr": 0.0005481118881118882,
"step": 3732,
"tokens_trained": 1.834017736
},
{
"epoch": 1.059002907595206,
"grad_norm": 7.4156060218811035,
"loss": 3.9898,
"lr": 0.0005478321678321678,
"step": 3734,
"tokens_trained": 1.8349996
},
{
"epoch": 1.05957024324516,
"grad_norm": 3.472533702850342,
"loss": 3.9558,
"lr": 0.0005475524475524476,
"step": 3736,
"tokens_trained": 1.835979152
},
{
"epoch": 1.0601375788951137,
"grad_norm": 2.4360055923461914,
"loss": 3.9627,
"lr": 0.0005472727272727273,
"step": 3738,
"tokens_trained": 1.836963416
},
{
"epoch": 1.0607049145450678,
"grad_norm": 4.8988728523254395,
"loss": 3.9492,
"lr": 0.000546993006993007,
"step": 3740,
"tokens_trained": 1.83794088
},
{
"epoch": 1.0612722501950216,
"grad_norm": 5.711161136627197,
"loss": 4.002,
"lr": 0.0005467132867132867,
"step": 3742,
"tokens_trained": 1.838924456
},
{
"epoch": 1.0618395858449756,
"grad_norm": 4.373830318450928,
"loss": 3.9811,
"lr": 0.0005464335664335664,
"step": 3744,
"tokens_trained": 1.839902072
},
{
"epoch": 1.0624069214949294,
"grad_norm": 3.2446751594543457,
"loss": 3.9551,
"lr": 0.0005461538461538461,
"step": 3746,
"tokens_trained": 1.840882688
},
{
"epoch": 1.0629742571448832,
"grad_norm": 3.3250389099121094,
"loss": 3.9556,
"lr": 0.0005458741258741259,
"step": 3748,
"tokens_trained": 1.841863816
},
{
"epoch": 1.0635415927948373,
"grad_norm": 7.377841949462891,
"loss": 4.0118,
"lr": 0.0005455944055944057,
"step": 3750,
"tokens_trained": 1.842844072
},
{
"epoch": 1.0635415927948373,
"eval_loss": 0.994845449924469,
"eval_runtime": 20.2191,
"step": 3750,
"tokens_trained": 1.842844072
},
{
"epoch": 1.064108928444791,
"grad_norm": 3.671860694885254,
"loss": 3.9439,
"lr": 0.0005453146853146853,
"step": 3752,
"tokens_trained": 1.843832472
},
{
"epoch": 1.0646762640947451,
"grad_norm": 3.7120800018310547,
"loss": 3.9992,
"lr": 0.000545034965034965,
"step": 3754,
"tokens_trained": 1.84481192
},
{
"epoch": 1.065243599744699,
"grad_norm": 6.560836315155029,
"loss": 3.9594,
"lr": 0.0005447552447552448,
"step": 3756,
"tokens_trained": 1.84579436
},
{
"epoch": 1.0658109353946528,
"grad_norm": 1.7166560888290405,
"loss": 3.9656,
"lr": 0.0005444755244755245,
"step": 3758,
"tokens_trained": 1.84678316
},
{
"epoch": 1.0663782710446068,
"grad_norm": 5.579006671905518,
"loss": 4.0034,
"lr": 0.0005441958041958042,
"step": 3760,
"tokens_trained": 1.847770488
},
{
"epoch": 1.0669456066945606,
"grad_norm": 3.6601710319519043,
"loss": 3.9346,
"lr": 0.0005439160839160839,
"step": 3762,
"tokens_trained": 1.848747488
},
{
"epoch": 1.0675129423445147,
"grad_norm": 1.2449930906295776,
"loss": 3.9493,
"lr": 0.0005436363636363635,
"step": 3764,
"tokens_trained": 1.849727168
},
{
"epoch": 1.0680802779944685,
"grad_norm": 5.6108479499816895,
"loss": 3.9527,
"lr": 0.0005433566433566434,
"step": 3766,
"tokens_trained": 1.85070748
},
{
"epoch": 1.0686476136444223,
"grad_norm": 7.556972980499268,
"loss": 3.9465,
"lr": 0.0005430769230769231,
"step": 3768,
"tokens_trained": 1.851693328
},
{
"epoch": 1.0692149492943763,
"grad_norm": 3.7439489364624023,
"loss": 3.964,
"lr": 0.0005427972027972028,
"step": 3770,
"tokens_trained": 1.852674992
},
{
"epoch": 1.0697822849443301,
"grad_norm": 4.162338733673096,
"loss": 3.969,
"lr": 0.0005425174825174825,
"step": 3772,
"tokens_trained": 1.853659048
},
{
"epoch": 1.0703496205942842,
"grad_norm": 3.8950648307800293,
"loss": 3.9691,
"lr": 0.0005422377622377623,
"step": 3774,
"tokens_trained": 1.854644728
},
{
"epoch": 1.070916956244238,
"grad_norm": 4.361495018005371,
"loss": 3.9437,
"lr": 0.000541958041958042,
"step": 3776,
"tokens_trained": 1.855626632
},
{
"epoch": 1.0714842918941918,
"grad_norm": 3.5286366939544678,
"loss": 3.9831,
"lr": 0.0005416783216783216,
"step": 3778,
"tokens_trained": 1.856606216
},
{
"epoch": 1.0720516275441458,
"grad_norm": 4.972531795501709,
"loss": 4.0222,
"lr": 0.0005413986013986014,
"step": 3780,
"tokens_trained": 1.857590816
},
{
"epoch": 1.0726189631940997,
"grad_norm": 9.155055046081543,
"loss": 3.9442,
"lr": 0.000541118881118881,
"step": 3782,
"tokens_trained": 1.85857288
},
{
"epoch": 1.0731862988440537,
"grad_norm": 1.4077136516571045,
"loss": 3.9806,
"lr": 0.0005408391608391609,
"step": 3784,
"tokens_trained": 1.859555224
},
{
"epoch": 1.0737536344940075,
"grad_norm": 3.204779863357544,
"loss": 3.9506,
"lr": 0.0005405594405594406,
"step": 3786,
"tokens_trained": 1.860538984
},
{
"epoch": 1.0743209701439613,
"grad_norm": 3.988658905029297,
"loss": 4.0025,
"lr": 0.0005402797202797203,
"step": 3788,
"tokens_trained": 1.861522976
},
{
"epoch": 1.0748883057939154,
"grad_norm": 3.0060372352600098,
"loss": 3.9308,
"lr": 0.00054,
"step": 3790,
"tokens_trained": 1.86250564
},
{
"epoch": 1.0754556414438692,
"grad_norm": 2.494147777557373,
"loss": 4.0116,
"lr": 0.0005397202797202798,
"step": 3792,
"tokens_trained": 1.863491248
},
{
"epoch": 1.0760229770938232,
"grad_norm": 5.260354518890381,
"loss": 3.9917,
"lr": 0.0005394405594405595,
"step": 3794,
"tokens_trained": 1.864474808
},
{
"epoch": 1.076590312743777,
"grad_norm": 4.43446159362793,
"loss": 3.9698,
"lr": 0.0005391608391608391,
"step": 3796,
"tokens_trained": 1.865457608
},
{
"epoch": 1.0771576483937308,
"grad_norm": 5.485021114349365,
"loss": 3.9494,
"lr": 0.0005388811188811189,
"step": 3798,
"tokens_trained": 1.866439336
},
{
"epoch": 1.0777249840436849,
"grad_norm": 5.432106971740723,
"loss": 3.9749,
"lr": 0.0005386013986013985,
"step": 3800,
"tokens_trained": 1.867422432
},
{
"epoch": 1.0782923196936387,
"grad_norm": 5.726179122924805,
"loss": 3.9524,
"lr": 0.0005383216783216784,
"step": 3802,
"tokens_trained": 1.868404976
},
{
"epoch": 1.0788596553435927,
"grad_norm": 7.2211594581604,
"loss": 3.954,
"lr": 0.0005380419580419581,
"step": 3804,
"tokens_trained": 1.869387272
},
{
"epoch": 1.0794269909935466,
"grad_norm": 3.6406068801879883,
"loss": 4.0125,
"lr": 0.0005377622377622377,
"step": 3806,
"tokens_trained": 1.870371664
},
{
"epoch": 1.0799943266435004,
"grad_norm": 7.254781723022461,
"loss": 3.9535,
"lr": 0.0005374825174825175,
"step": 3808,
"tokens_trained": 1.87135524
},
{
"epoch": 1.0805616622934544,
"grad_norm": 7.8573079109191895,
"loss": 4.0054,
"lr": 0.0005372027972027972,
"step": 3810,
"tokens_trained": 1.872337216
},
{
"epoch": 1.0811289979434082,
"grad_norm": 1.049710988998413,
"loss": 3.9541,
"lr": 0.000536923076923077,
"step": 3812,
"tokens_trained": 1.873317672
},
{
"epoch": 1.0816963335933623,
"grad_norm": 7.515570163726807,
"loss": 3.9466,
"lr": 0.0005366433566433566,
"step": 3814,
"tokens_trained": 1.874299184
},
{
"epoch": 1.082263669243316,
"grad_norm": 6.041797637939453,
"loss": 3.9508,
"lr": 0.0005363636363636364,
"step": 3816,
"tokens_trained": 1.875282768
},
{
"epoch": 1.0828310048932699,
"grad_norm": 2.9910285472869873,
"loss": 3.9368,
"lr": 0.000536083916083916,
"step": 3818,
"tokens_trained": 1.876264312
},
{
"epoch": 1.083398340543224,
"grad_norm": 3.5802299976348877,
"loss": 3.9661,
"lr": 0.0005358041958041959,
"step": 3820,
"tokens_trained": 1.877245472
},
{
"epoch": 1.0839656761931777,
"grad_norm": 6.078779697418213,
"loss": 3.9758,
"lr": 0.0005355244755244756,
"step": 3822,
"tokens_trained": 1.87822768
},
{
"epoch": 1.0845330118431318,
"grad_norm": 6.143925189971924,
"loss": 3.947,
"lr": 0.0005352447552447552,
"step": 3824,
"tokens_trained": 1.879209824
},
{
"epoch": 1.0851003474930856,
"grad_norm": 4.272439002990723,
"loss": 4.0284,
"lr": 0.000534965034965035,
"step": 3826,
"tokens_trained": 1.88019528
},
{
"epoch": 1.0856676831430394,
"grad_norm": 7.169465065002441,
"loss": 3.9651,
"lr": 0.0005346853146853147,
"step": 3828,
"tokens_trained": 1.88117776
},
{
"epoch": 1.0862350187929934,
"grad_norm": 6.489839553833008,
"loss": 3.9505,
"lr": 0.0005344055944055945,
"step": 3830,
"tokens_trained": 1.88216468
},
{
"epoch": 1.0868023544429473,
"grad_norm": 2.966554880142212,
"loss": 4.0406,
"lr": 0.0005341258741258741,
"step": 3832,
"tokens_trained": 1.883147968
},
{
"epoch": 1.0873696900929013,
"grad_norm": 4.948841094970703,
"loss": 3.9704,
"lr": 0.0005338461538461538,
"step": 3834,
"tokens_trained": 1.884132176
},
{
"epoch": 1.0879370257428551,
"grad_norm": 7.666274547576904,
"loss": 4.0082,
"lr": 0.0005335664335664335,
"step": 3836,
"tokens_trained": 1.885119008
},
{
"epoch": 1.088504361392809,
"grad_norm": 12.454533576965332,
"loss": 3.9702,
"lr": 0.0005332867132867133,
"step": 3838,
"tokens_trained": 1.88610144
},
{
"epoch": 1.089071697042763,
"grad_norm": 4.42985725402832,
"loss": 3.9601,
"lr": 0.0005330069930069931,
"step": 3840,
"tokens_trained": 1.88708772
},
{
"epoch": 1.0896390326927168,
"grad_norm": 14.10716438293457,
"loss": 3.9942,
"lr": 0.0005327272727272727,
"step": 3842,
"tokens_trained": 1.888068192
},
{
"epoch": 1.0902063683426708,
"grad_norm": 6.3290910720825195,
"loss": 3.9218,
"lr": 0.0005324475524475525,
"step": 3844,
"tokens_trained": 1.88905572
},
{
"epoch": 1.0907737039926246,
"grad_norm": 6.61427640914917,
"loss": 4.0173,
"lr": 0.0005321678321678322,
"step": 3846,
"tokens_trained": 1.890040152
},
{
"epoch": 1.0913410396425784,
"grad_norm": 6.868432998657227,
"loss": 3.9553,
"lr": 0.000531888111888112,
"step": 3848,
"tokens_trained": 1.891031024
},
{
"epoch": 1.0919083752925325,
"grad_norm": 4.057258129119873,
"loss": 3.9839,
"lr": 0.0005316083916083916,
"step": 3850,
"tokens_trained": 1.892009904
},
{
"epoch": 1.0924757109424863,
"grad_norm": 3.5418479442596436,
"loss": 3.9839,
"lr": 0.0005313286713286713,
"step": 3852,
"tokens_trained": 1.892993976
},
{
"epoch": 1.0930430465924403,
"grad_norm": 1.231491208076477,
"loss": 3.9549,
"lr": 0.000531048951048951,
"step": 3854,
"tokens_trained": 1.893972744
},
{
"epoch": 1.0936103822423942,
"grad_norm": 4.056438446044922,
"loss": 3.9512,
"lr": 0.0005307692307692308,
"step": 3856,
"tokens_trained": 1.894954248
},
{
"epoch": 1.094177717892348,
"grad_norm": 2.9252607822418213,
"loss": 3.9201,
"lr": 0.0005304895104895106,
"step": 3858,
"tokens_trained": 1.895938816
},
{
"epoch": 1.094745053542302,
"grad_norm": 3.035308599472046,
"loss": 3.9367,
"lr": 0.0005302097902097902,
"step": 3860,
"tokens_trained": 1.896920832
},
{
"epoch": 1.0953123891922558,
"grad_norm": 2.2526092529296875,
"loss": 3.9554,
"lr": 0.0005299300699300699,
"step": 3862,
"tokens_trained": 1.897903216
},
{
"epoch": 1.0958797248422099,
"grad_norm": 2.882819175720215,
"loss": 3.926,
"lr": 0.0005296503496503497,
"step": 3864,
"tokens_trained": 1.898886632
},
{
"epoch": 1.0964470604921637,
"grad_norm": 7.817485809326172,
"loss": 3.9583,
"lr": 0.0005293706293706294,
"step": 3866,
"tokens_trained": 1.899872128
},
{
"epoch": 1.0970143961421175,
"grad_norm": 8.241719245910645,
"loss": 3.9391,
"lr": 0.0005290909090909091,
"step": 3868,
"tokens_trained": 1.900856544
},
{
"epoch": 1.0975817317920715,
"grad_norm": 4.160614013671875,
"loss": 3.9285,
"lr": 0.0005288111888111888,
"step": 3870,
"tokens_trained": 1.901838952
},
{
"epoch": 1.0981490674420253,
"grad_norm": 3.527678966522217,
"loss": 3.9593,
"lr": 0.0005285314685314684,
"step": 3872,
"tokens_trained": 1.902823024
},
{
"epoch": 1.0987164030919794,
"grad_norm": 5.290194511413574,
"loss": 3.9357,
"lr": 0.0005282517482517483,
"step": 3874,
"tokens_trained": 1.903803456
},
{
"epoch": 1.0990000709169563,
"eval_loss": 0.9935861229896545,
"eval_runtime": 20.2396,
"step": 3875,
"tokens_trained": 1.904295504
},
{
"epoch": 1.0992837387419332,
"grad_norm": 5.472379207611084,
"loss": 4.0255,
"lr": 0.000527972027972028,
"step": 3876,
"tokens_trained": 1.904786344
},
{
"epoch": 1.099851074391887,
"grad_norm": 6.999550819396973,
"loss": 3.9523,
"lr": 0.0005276923076923077,
"step": 3878,
"tokens_trained": 1.90576952
},
{
"epoch": 1.100418410041841,
"grad_norm": 3.3077871799468994,
"loss": 3.9452,
"lr": 0.0005274125874125874,
"step": 3880,
"tokens_trained": 1.906745784
},
{
"epoch": 1.1009857456917949,
"grad_norm": 4.513088226318359,
"loss": 3.9687,
"lr": 0.0005271328671328672,
"step": 3882,
"tokens_trained": 1.907734576
},
{
"epoch": 1.101553081341749,
"grad_norm": 8.249629020690918,
"loss": 3.9445,
"lr": 0.0005268531468531469,
"step": 3884,
"tokens_trained": 1.908716328
},
{
"epoch": 1.1021204169917027,
"grad_norm": 8.281685829162598,
"loss": 3.9906,
"lr": 0.0005265734265734266,
"step": 3886,
"tokens_trained": 1.909702984
},
{
"epoch": 1.1026877526416565,
"grad_norm": 6.521668910980225,
"loss": 3.9971,
"lr": 0.0005262937062937063,
"step": 3888,
"tokens_trained": 1.91068828
},
{
"epoch": 1.1032550882916106,
"grad_norm": 6.442141056060791,
"loss": 3.9769,
"lr": 0.0005260139860139859,
"step": 3890,
"tokens_trained": 1.911668976
},
{
"epoch": 1.1038224239415644,
"grad_norm": 11.120711326599121,
"loss": 3.9455,
"lr": 0.0005257342657342658,
"step": 3892,
"tokens_trained": 1.912650176
},
{
"epoch": 1.1043897595915184,
"grad_norm": 2.695085048675537,
"loss": 3.984,
"lr": 0.0005254545454545455,
"step": 3894,
"tokens_trained": 1.913624832
},
{
"epoch": 1.1049570952414722,
"grad_norm": 16.994462966918945,
"loss": 3.968,
"lr": 0.0005251748251748252,
"step": 3896,
"tokens_trained": 1.914609128
},
{
"epoch": 1.105524430891426,
"grad_norm": 5.866199016571045,
"loss": 3.9157,
"lr": 0.0005248951048951049,
"step": 3898,
"tokens_trained": 1.91559088
},
{
"epoch": 1.10609176654138,
"grad_norm": 8.222938537597656,
"loss": 3.9516,
"lr": 0.0005246153846153847,
"step": 3900,
"tokens_trained": 1.916575752
},
{
"epoch": 1.106659102191334,
"grad_norm": 6.4162774085998535,
"loss": 3.9761,
"lr": 0.0005243356643356644,
"step": 3902,
"tokens_trained": 1.9175578
},
{
"epoch": 1.107226437841288,
"grad_norm": 5.338213920593262,
"loss": 3.9804,
"lr": 0.000524055944055944,
"step": 3904,
"tokens_trained": 1.918538192
},
{
"epoch": 1.1077937734912418,
"grad_norm": 6.3608927726745605,
"loss": 3.9675,
"lr": 0.0005237762237762238,
"step": 3906,
"tokens_trained": 1.9195184
},
{
"epoch": 1.1083611091411956,
"grad_norm": 6.1585845947265625,
"loss": 3.9385,
"lr": 0.0005234965034965034,
"step": 3908,
"tokens_trained": 1.920498704
},
{
"epoch": 1.1089284447911496,
"grad_norm": 5.266563415527344,
"loss": 4.0169,
"lr": 0.0005232167832167833,
"step": 3910,
"tokens_trained": 1.921477824
},
{
"epoch": 1.1094957804411034,
"grad_norm": 3.5322930812835693,
"loss": 3.9734,
"lr": 0.000522937062937063,
"step": 3912,
"tokens_trained": 1.922456704
},
{
"epoch": 1.1100631160910575,
"grad_norm": 3.8564069271087646,
"loss": 3.9873,
"lr": 0.0005226573426573427,
"step": 3914,
"tokens_trained": 1.92343992
},
{
"epoch": 1.1106304517410113,
"grad_norm": 3.9069607257843018,
"loss": 3.9892,
"lr": 0.0005223776223776224,
"step": 3916,
"tokens_trained": 1.924424576
},
{
"epoch": 1.111197787390965,
"grad_norm": 6.195169925689697,
"loss": 3.9489,
"lr": 0.0005220979020979021,
"step": 3918,
"tokens_trained": 1.92540764
},
{
"epoch": 1.1117651230409191,
"grad_norm": 4.950653076171875,
"loss": 3.9561,
"lr": 0.0005218181818181819,
"step": 3920,
"tokens_trained": 1.926386144
},
{
"epoch": 1.112332458690873,
"grad_norm": 4.923401832580566,
"loss": 3.991,
"lr": 0.0005215384615384615,
"step": 3922,
"tokens_trained": 1.92736516
},
{
"epoch": 1.112899794340827,
"grad_norm": 4.2394561767578125,
"loss": 3.9445,
"lr": 0.0005212587412587413,
"step": 3924,
"tokens_trained": 1.928350608
},
{
"epoch": 1.1134671299907808,
"grad_norm": 3.4303910732269287,
"loss": 3.9871,
"lr": 0.0005209790209790209,
"step": 3926,
"tokens_trained": 1.929333008
},
{
"epoch": 1.1140344656407346,
"grad_norm": 6.241591453552246,
"loss": 3.9799,
"lr": 0.0005206993006993008,
"step": 3928,
"tokens_trained": 1.930315616
},
{
"epoch": 1.1146018012906886,
"grad_norm": 5.21243143081665,
"loss": 3.9624,
"lr": 0.0005204195804195805,
"step": 3930,
"tokens_trained": 1.931298192
},
{
"epoch": 1.1151691369406425,
"grad_norm": 7.095268249511719,
"loss": 3.9263,
"lr": 0.0005201398601398601,
"step": 3932,
"tokens_trained": 1.93228248
},
{
"epoch": 1.1157364725905965,
"grad_norm": 9.025245666503906,
"loss": 4.0058,
"lr": 0.0005198601398601399,
"step": 3934,
"tokens_trained": 1.93326592
},
{
"epoch": 1.1163038082405503,
"grad_norm": 3.9758048057556152,
"loss": 3.9299,
"lr": 0.0005195804195804196,
"step": 3936,
"tokens_trained": 1.93424888
},
{
"epoch": 1.1168711438905041,
"grad_norm": 9.68726634979248,
"loss": 3.9433,
"lr": 0.0005193006993006994,
"step": 3938,
"tokens_trained": 1.935231688
},
{
"epoch": 1.1174384795404582,
"grad_norm": 7.5478901863098145,
"loss": 4.0053,
"lr": 0.000519020979020979,
"step": 3940,
"tokens_trained": 1.936216832
},
{
"epoch": 1.118005815190412,
"grad_norm": 6.016645431518555,
"loss": 3.9481,
"lr": 0.0005187412587412588,
"step": 3942,
"tokens_trained": 1.937196632
},
{
"epoch": 1.118573150840366,
"grad_norm": 7.313266277313232,
"loss": 3.9539,
"lr": 0.0005184615384615384,
"step": 3944,
"tokens_trained": 1.938180424
},
{
"epoch": 1.1191404864903198,
"grad_norm": 4.228805065155029,
"loss": 3.9528,
"lr": 0.0005181818181818182,
"step": 3946,
"tokens_trained": 1.939165376
},
{
"epoch": 1.1197078221402736,
"grad_norm": 1.2050669193267822,
"loss": 3.9699,
"lr": 0.000517902097902098,
"step": 3948,
"tokens_trained": 1.940146184
},
{
"epoch": 1.1202751577902277,
"grad_norm": 4.581719875335693,
"loss": 3.9346,
"lr": 0.0005176223776223776,
"step": 3950,
"tokens_trained": 1.941130648
},
{
"epoch": 1.1208424934401815,
"grad_norm": 9.381650924682617,
"loss": 3.9294,
"lr": 0.0005173426573426574,
"step": 3952,
"tokens_trained": 1.94210952
},
{
"epoch": 1.1214098290901355,
"grad_norm": 5.3781585693359375,
"loss": 3.9208,
"lr": 0.000517062937062937,
"step": 3954,
"tokens_trained": 1.943096344
},
{
"epoch": 1.1219771647400893,
"grad_norm": 4.263558387756348,
"loss": 3.9492,
"lr": 0.0005167832167832169,
"step": 3956,
"tokens_trained": 1.94407804
},
{
"epoch": 1.1225445003900432,
"grad_norm": 5.920651435852051,
"loss": 3.8951,
"lr": 0.0005165034965034965,
"step": 3958,
"tokens_trained": 1.94506156
},
{
"epoch": 1.1231118360399972,
"grad_norm": 7.0110344886779785,
"loss": 3.9329,
"lr": 0.0005162237762237762,
"step": 3960,
"tokens_trained": 1.946040072
},
{
"epoch": 1.123679171689951,
"grad_norm": 4.611392021179199,
"loss": 3.9094,
"lr": 0.0005159440559440559,
"step": 3962,
"tokens_trained": 1.947023256
},
{
"epoch": 1.124246507339905,
"grad_norm": 5.340510845184326,
"loss": 3.9552,
"lr": 0.0005156643356643357,
"step": 3964,
"tokens_trained": 1.948006848
},
{
"epoch": 1.1248138429898589,
"grad_norm": 5.190691947937012,
"loss": 3.956,
"lr": 0.0005153846153846154,
"step": 3966,
"tokens_trained": 1.948991632
},
{
"epoch": 1.1253811786398127,
"grad_norm": 5.612351894378662,
"loss": 3.9861,
"lr": 0.0005151048951048951,
"step": 3968,
"tokens_trained": 1.949975704
},
{
"epoch": 1.1259485142897667,
"grad_norm": 6.097261428833008,
"loss": 3.9867,
"lr": 0.0005148251748251748,
"step": 3970,
"tokens_trained": 1.950957944
},
{
"epoch": 1.1265158499397205,
"grad_norm": 4.194180965423584,
"loss": 3.9242,
"lr": 0.0005145454545454545,
"step": 3972,
"tokens_trained": 1.9519416
},
{
"epoch": 1.1270831855896746,
"grad_norm": 4.118505477905273,
"loss": 3.9553,
"lr": 0.0005142657342657343,
"step": 3974,
"tokens_trained": 1.95292252
},
{
"epoch": 1.1276505212396284,
"grad_norm": 5.10177755355835,
"loss": 3.9653,
"lr": 0.000513986013986014,
"step": 3976,
"tokens_trained": 1.953902792
},
{
"epoch": 1.1282178568895822,
"grad_norm": 5.665530204772949,
"loss": 3.916,
"lr": 0.0005137062937062937,
"step": 3978,
"tokens_trained": 1.954888184
},
{
"epoch": 1.1287851925395362,
"grad_norm": 4.1443963050842285,
"loss": 3.9254,
"lr": 0.0005134265734265734,
"step": 3980,
"tokens_trained": 1.955868688
},
{
"epoch": 1.12935252818949,
"grad_norm": 2.4941980838775635,
"loss": 3.9502,
"lr": 0.0005131468531468532,
"step": 3982,
"tokens_trained": 1.956852472
},
{
"epoch": 1.129919863839444,
"grad_norm": 3.85143780708313,
"loss": 3.8926,
"lr": 0.0005128671328671328,
"step": 3984,
"tokens_trained": 1.957835808
},
{
"epoch": 1.130487199489398,
"grad_norm": 5.975537300109863,
"loss": 3.9926,
"lr": 0.0005125874125874126,
"step": 3986,
"tokens_trained": 1.958816736
},
{
"epoch": 1.1310545351393517,
"grad_norm": 6.722855567932129,
"loss": 3.986,
"lr": 0.0005123076923076923,
"step": 3988,
"tokens_trained": 1.9598008
},
{
"epoch": 1.1316218707893058,
"grad_norm": 3.1752729415893555,
"loss": 3.9343,
"lr": 0.000512027972027972,
"step": 3990,
"tokens_trained": 1.960783816
},
{
"epoch": 1.1321892064392596,
"grad_norm": 3.669602394104004,
"loss": 3.9746,
"lr": 0.0005117482517482518,
"step": 3992,
"tokens_trained": 1.96176816
},
{
"epoch": 1.1327565420892136,
"grad_norm": 7.3116326332092285,
"loss": 3.9829,
"lr": 0.0005114685314685315,
"step": 3994,
"tokens_trained": 1.962752696
},
{
"epoch": 1.1333238777391674,
"grad_norm": 5.816486358642578,
"loss": 3.9617,
"lr": 0.0005111888111888112,
"step": 3996,
"tokens_trained": 1.96373432
},
{
"epoch": 1.1338912133891212,
"grad_norm": 2.3524768352508545,
"loss": 3.929,
"lr": 0.0005109090909090908,
"step": 3998,
"tokens_trained": 1.964713416
},
{
"epoch": 1.1344585490390753,
"grad_norm": 4.908108711242676,
"loss": 3.9741,
"lr": 0.0005106293706293707,
"step": 4000,
"tokens_trained": 1.965692096
},
{
"epoch": 1.1344585490390753,
"eval_loss": 0.9912415146827698,
"eval_runtime": 20.338,
"step": 4000,
"tokens_trained": 1.965692096
},
{
"epoch": 1.135025884689029,
"grad_norm": 4.395096778869629,
"loss": 3.955,
"lr": 0.0005103496503496503,
"step": 4002,
"tokens_trained": 1.966677008
},
{
"epoch": 1.1355932203389831,
"grad_norm": 3.2460927963256836,
"loss": 3.9522,
"lr": 0.0005100699300699301,
"step": 4004,
"tokens_trained": 1.967662208
},
{
"epoch": 1.136160555988937,
"grad_norm": 3.2880218029022217,
"loss": 3.9111,
"lr": 0.0005097902097902098,
"step": 4006,
"tokens_trained": 1.968642816
},
{
"epoch": 1.1367278916388908,
"grad_norm": 3.694084644317627,
"loss": 3.9045,
"lr": 0.0005095104895104895,
"step": 4008,
"tokens_trained": 1.969623616
},
{
"epoch": 1.1372952272888448,
"grad_norm": 2.690668821334839,
"loss": 3.9534,
"lr": 0.0005092307692307693,
"step": 4010,
"tokens_trained": 1.970607456
},
{
"epoch": 1.1378625629387986,
"grad_norm": 3.6751973628997803,
"loss": 3.9979,
"lr": 0.0005089510489510489,
"step": 4012,
"tokens_trained": 1.971587136
},
{
"epoch": 1.1384298985887527,
"grad_norm": 3.0805108547210693,
"loss": 3.888,
"lr": 0.0005086713286713287,
"step": 4014,
"tokens_trained": 1.972575152
},
{
"epoch": 1.1389972342387065,
"grad_norm": 5.386228084564209,
"loss": 3.9586,
"lr": 0.0005083916083916083,
"step": 4016,
"tokens_trained": 1.973563872
},
{
"epoch": 1.1395645698886603,
"grad_norm": 5.567631721496582,
"loss": 3.9337,
"lr": 0.0005081118881118882,
"step": 4018,
"tokens_trained": 1.97454444
},
{
"epoch": 1.1401319055386143,
"grad_norm": 5.159145355224609,
"loss": 3.9311,
"lr": 0.0005078321678321678,
"step": 4020,
"tokens_trained": 1.975528128
},
{
"epoch": 1.1406992411885681,
"grad_norm": 3.8111817836761475,
"loss": 3.9542,
"lr": 0.0005075524475524476,
"step": 4022,
"tokens_trained": 1.97651136
},
{
"epoch": 1.1412665768385222,
"grad_norm": 5.618584156036377,
"loss": 3.9841,
"lr": 0.0005072727272727273,
"step": 4024,
"tokens_trained": 1.97749408
},
{
"epoch": 1.141833912488476,
"grad_norm": 5.414000511169434,
"loss": 3.9435,
"lr": 0.0005069930069930069,
"step": 4026,
"tokens_trained": 1.978478936
},
{
"epoch": 1.1424012481384298,
"grad_norm": 7.3321661949157715,
"loss": 3.962,
"lr": 0.0005067132867132868,
"step": 4028,
"tokens_trained": 1.979462272
},
{
"epoch": 1.1429685837883838,
"grad_norm": 3.5029044151306152,
"loss": 3.9399,
"lr": 0.0005064335664335664,
"step": 4030,
"tokens_trained": 1.98044648
},
{
"epoch": 1.1435359194383377,
"grad_norm": 6.343649387359619,
"loss": 3.9788,
"lr": 0.0005061538461538462,
"step": 4032,
"tokens_trained": 1.981432816
},
{
"epoch": 1.1441032550882917,
"grad_norm": 8.250723838806152,
"loss": 3.9025,
"lr": 0.0005058741258741258,
"step": 4034,
"tokens_trained": 1.982413272
},
{
"epoch": 1.1446705907382455,
"grad_norm": 3.6089327335357666,
"loss": 3.9855,
"lr": 0.0005055944055944057,
"step": 4036,
"tokens_trained": 1.983396296
},
{
"epoch": 1.1452379263881993,
"grad_norm": 5.802486896514893,
"loss": 3.9569,
"lr": 0.0005053146853146853,
"step": 4038,
"tokens_trained": 1.984378296
},
{
"epoch": 1.1458052620381534,
"grad_norm": 6.48319673538208,
"loss": 3.9423,
"lr": 0.000505034965034965,
"step": 4040,
"tokens_trained": 1.985356768
},
{
"epoch": 1.1463725976881072,
"grad_norm": 2.9942495822906494,
"loss": 3.9667,
"lr": 0.0005047552447552448,
"step": 4042,
"tokens_trained": 1.98633836
},
{
"epoch": 1.1469399333380612,
"grad_norm": 1.4219609498977661,
"loss": 3.9238,
"lr": 0.0005044755244755244,
"step": 4044,
"tokens_trained": 1.98732128
},
{
"epoch": 1.147507268988015,
"grad_norm": 2.6950814723968506,
"loss": 3.9829,
"lr": 0.0005041958041958043,
"step": 4046,
"tokens_trained": 1.988304968
},
{
"epoch": 1.1480746046379688,
"grad_norm": 4.490326404571533,
"loss": 3.9506,
"lr": 0.0005039160839160839,
"step": 4048,
"tokens_trained": 1.989288848
},
{
"epoch": 1.1486419402879229,
"grad_norm": 7.026235580444336,
"loss": 3.9374,
"lr": 0.0005036363636363637,
"step": 4050,
"tokens_trained": 1.990270344
},
{
"epoch": 1.1492092759378767,
"grad_norm": 6.214878082275391,
"loss": 3.9627,
"lr": 0.0005033566433566433,
"step": 4052,
"tokens_trained": 1.991250424
},
{
"epoch": 1.1497766115878307,
"grad_norm": 4.663200855255127,
"loss": 3.9631,
"lr": 0.0005030769230769231,
"step": 4054,
"tokens_trained": 1.9922354
},
{
"epoch": 1.1503439472377845,
"grad_norm": 4.318966865539551,
"loss": 4.0147,
"lr": 0.0005027972027972028,
"step": 4056,
"tokens_trained": 1.993221056
},
{
"epoch": 1.1509112828877384,
"grad_norm": 5.912793159484863,
"loss": 3.9639,
"lr": 0.0005025174825174825,
"step": 4058,
"tokens_trained": 1.994207552
},
{
"epoch": 1.1514786185376924,
"grad_norm": 3.6957592964172363,
"loss": 3.9253,
"lr": 0.0005022377622377623,
"step": 4060,
"tokens_trained": 1.99519044
},
{
"epoch": 1.1520459541876462,
"grad_norm": 2.9899842739105225,
"loss": 3.9874,
"lr": 0.0005019580419580419,
"step": 4062,
"tokens_trained": 1.996177368
},
{
"epoch": 1.1526132898376003,
"grad_norm": 6.149812698364258,
"loss": 3.9278,
"lr": 0.0005016783216783218,
"step": 4064,
"tokens_trained": 1.997162248
},
{
"epoch": 1.153180625487554,
"grad_norm": 3.7720232009887695,
"loss": 3.9526,
"lr": 0.0005013986013986014,
"step": 4066,
"tokens_trained": 1.99815024
},
{
"epoch": 1.1537479611375079,
"grad_norm": 3.3968939781188965,
"loss": 3.9522,
"lr": 0.0005011188811188811,
"step": 4068,
"tokens_trained": 1.999129208
},
{
"epoch": 1.154315296787462,
"grad_norm": 7.051310062408447,
"loss": 3.9545,
"lr": 0.0005008391608391608,
"step": 4070,
"tokens_trained": 2.000111232
},
{
"epoch": 1.1548826324374157,
"grad_norm": 4.798380374908447,
"loss": 3.9114,
"lr": 0.0005005594405594406,
"step": 4072,
"tokens_trained": 2.001098352
},
{
"epoch": 1.1554499680873698,
"grad_norm": 7.5074992179870605,
"loss": 3.9795,
"lr": 0.0005002797202797203,
"step": 4074,
"tokens_trained": 2.002077616
},
{
"epoch": 1.1560173037373236,
"grad_norm": 3.944998025894165,
"loss": 3.9208,
"lr": 0.0005,
"step": 4076,
"tokens_trained": 2.003065976
},
{
"epoch": 1.1565846393872774,
"grad_norm": 9.103386878967285,
"loss": 3.9577,
"lr": 0.0004997202797202798,
"step": 4078,
"tokens_trained": 2.004046568
},
{
"epoch": 1.1571519750372314,
"grad_norm": 8.950857162475586,
"loss": 3.9474,
"lr": 0.0004994405594405594,
"step": 4080,
"tokens_trained": 2.005031288
},
{
"epoch": 1.1577193106871853,
"grad_norm": 6.812939643859863,
"loss": 3.9995,
"lr": 0.0004991608391608391,
"step": 4082,
"tokens_trained": 2.00601472
},
{
"epoch": 1.1582866463371393,
"grad_norm": 8.14719009399414,
"loss": 3.9496,
"lr": 0.0004988811188811189,
"step": 4084,
"tokens_trained": 2.006996416
},
{
"epoch": 1.158853981987093,
"grad_norm": 7.125198841094971,
"loss": 3.9074,
"lr": 0.0004986013986013986,
"step": 4086,
"tokens_trained": 2.007980248
},
{
"epoch": 1.159421317637047,
"grad_norm": 2.4099230766296387,
"loss": 3.9675,
"lr": 0.0004983216783216784,
"step": 4088,
"tokens_trained": 2.008964792
},
{
"epoch": 1.159988653287001,
"grad_norm": 3.9759979248046875,
"loss": 3.9655,
"lr": 0.0004980419580419581,
"step": 4090,
"tokens_trained": 2.009945552
},
{
"epoch": 1.1605559889369548,
"grad_norm": 5.3169264793396,
"loss": 3.9856,
"lr": 0.0004977622377622378,
"step": 4092,
"tokens_trained": 2.010931072
},
{
"epoch": 1.1611233245869088,
"grad_norm": 9.010540008544922,
"loss": 3.9293,
"lr": 0.0004974825174825175,
"step": 4094,
"tokens_trained": 2.011911712
},
{
"epoch": 1.1616906602368626,
"grad_norm": 5.83132266998291,
"loss": 3.9725,
"lr": 0.0004972027972027972,
"step": 4096,
"tokens_trained": 2.012895208
},
{
"epoch": 1.1622579958868164,
"grad_norm": 8.76009750366211,
"loss": 3.9875,
"lr": 0.0004969230769230769,
"step": 4098,
"tokens_trained": 2.013881768
},
{
"epoch": 1.1628253315367705,
"grad_norm": 4.634799480438232,
"loss": 3.9478,
"lr": 0.0004966433566433566,
"step": 4100,
"tokens_trained": 2.014862288
},
{
"epoch": 1.1633926671867243,
"grad_norm": 3.717115879058838,
"loss": 3.9029,
"lr": 0.0004963636363636364,
"step": 4102,
"tokens_trained": 2.015846344
},
{
"epoch": 1.1639600028366783,
"grad_norm": 5.467166423797607,
"loss": 3.9561,
"lr": 0.0004960839160839161,
"step": 4104,
"tokens_trained": 2.01682528
},
{
"epoch": 1.1645273384866321,
"grad_norm": 5.645481109619141,
"loss": 3.9889,
"lr": 0.0004958041958041959,
"step": 4106,
"tokens_trained": 2.017809272
},
{
"epoch": 1.165094674136586,
"grad_norm": 4.796457767486572,
"loss": 3.9554,
"lr": 0.0004955244755244756,
"step": 4108,
"tokens_trained": 2.018791344
},
{
"epoch": 1.16566200978654,
"grad_norm": 6.111627578735352,
"loss": 3.9495,
"lr": 0.0004952447552447552,
"step": 4110,
"tokens_trained": 2.019777776
},
{
"epoch": 1.1662293454364938,
"grad_norm": 4.132344722747803,
"loss": 3.878,
"lr": 0.000494965034965035,
"step": 4112,
"tokens_trained": 2.020760032
},
{
"epoch": 1.1667966810864479,
"grad_norm": 4.833931922912598,
"loss": 3.9537,
"lr": 0.0004946853146853147,
"step": 4114,
"tokens_trained": 2.021745984
},
{
"epoch": 1.1673640167364017,
"grad_norm": 5.027078628540039,
"loss": 3.9359,
"lr": 0.0004944055944055944,
"step": 4116,
"tokens_trained": 2.022724968
},
{
"epoch": 1.1679313523863555,
"grad_norm": 5.339116096496582,
"loss": 3.9104,
"lr": 0.0004941258741258741,
"step": 4118,
"tokens_trained": 2.023705248
},
{
"epoch": 1.1684986880363095,
"grad_norm": 5.1652607917785645,
"loss": 3.9671,
"lr": 0.0004938461538461538,
"step": 4120,
"tokens_trained": 2.024688648
},
{
"epoch": 1.1690660236862633,
"grad_norm": 4.289709568023682,
"loss": 3.9315,
"lr": 0.0004935664335664336,
"step": 4122,
"tokens_trained": 2.025667424
},
{
"epoch": 1.1696333593362174,
"grad_norm": 5.6946492195129395,
"loss": 3.9498,
"lr": 0.0004932867132867133,
"step": 4124,
"tokens_trained": 2.026647168
},
{
"epoch": 1.1699170271611943,
"eval_loss": 0.9880662560462952,
"eval_runtime": 21.3984,
"step": 4125,
"tokens_trained": 2.027139168
},
{
"epoch": 1.1702006949861712,
"grad_norm": 3.798551082611084,
"loss": 3.9244,
"lr": 0.0004930069930069931,
"step": 4126,
"tokens_trained": 2.027631096
},
{
"epoch": 1.170768030636125,
"grad_norm": 3.644767999649048,
"loss": 3.939,
"lr": 0.0004927272727272727,
"step": 4128,
"tokens_trained": 2.028613776
},
{
"epoch": 1.171335366286079,
"grad_norm": 5.300503253936768,
"loss": 3.9352,
"lr": 0.0004924475524475525,
"step": 4130,
"tokens_trained": 2.0295936
},
{
"epoch": 1.1719027019360329,
"grad_norm": 4.033862590789795,
"loss": 3.9805,
"lr": 0.0004921678321678322,
"step": 4132,
"tokens_trained": 2.030575632
},
{
"epoch": 1.172470037585987,
"grad_norm": 3.5188965797424316,
"loss": 3.979,
"lr": 0.0004918881118881118,
"step": 4134,
"tokens_trained": 2.031559704
},
{
"epoch": 1.1730373732359407,
"grad_norm": 2.1571266651153564,
"loss": 3.9798,
"lr": 0.0004916083916083916,
"step": 4136,
"tokens_trained": 2.032544624
},
{
"epoch": 1.1736047088858945,
"grad_norm": 1.2364273071289062,
"loss": 3.971,
"lr": 0.0004913286713286713,
"step": 4138,
"tokens_trained": 2.033524816
},
{
"epoch": 1.1741720445358486,
"grad_norm": 2.3588576316833496,
"loss": 3.9631,
"lr": 0.0004910489510489511,
"step": 4140,
"tokens_trained": 2.034509784
},
{
"epoch": 1.1747393801858024,
"grad_norm": 1.2670316696166992,
"loss": 3.9317,
"lr": 0.0004907692307692308,
"step": 4142,
"tokens_trained": 2.035493456
},
{
"epoch": 1.1753067158357564,
"grad_norm": 3.2413010597229004,
"loss": 3.9778,
"lr": 0.0004904895104895106,
"step": 4144,
"tokens_trained": 2.03647368
},
{
"epoch": 1.1758740514857102,
"grad_norm": 4.079458713531494,
"loss": 3.9715,
"lr": 0.0004902097902097902,
"step": 4146,
"tokens_trained": 2.037452696
},
{
"epoch": 1.176441387135664,
"grad_norm": 2.3634743690490723,
"loss": 3.9857,
"lr": 0.00048993006993007,
"step": 4148,
"tokens_trained": 2.038437256
},
{
"epoch": 1.177008722785618,
"grad_norm": 1.7258849143981934,
"loss": 3.9044,
"lr": 0.0004896503496503497,
"step": 4150,
"tokens_trained": 2.039421224
},
{
"epoch": 1.177576058435572,
"grad_norm": 4.426620960235596,
"loss": 3.9366,
"lr": 0.0004893706293706293,
"step": 4152,
"tokens_trained": 2.040399768
},
{
"epoch": 1.178143394085526,
"grad_norm": 4.946300506591797,
"loss": 3.8394,
"lr": 0.0004890909090909091,
"step": 4154,
"tokens_trained": 2.041382744
},
{
"epoch": 1.1787107297354797,
"grad_norm": 7.814687252044678,
"loss": 3.9504,
"lr": 0.0004888111888111888,
"step": 4156,
"tokens_trained": 2.042364152
},
{
"epoch": 1.1792780653854336,
"grad_norm": 1.7227815389633179,
"loss": 3.8821,
"lr": 0.0004885314685314686,
"step": 4158,
"tokens_trained": 2.043344264
},
{
"epoch": 1.1798454010353876,
"grad_norm": 11.620087623596191,
"loss": 3.9375,
"lr": 0.0004882517482517483,
"step": 4160,
"tokens_trained": 2.04432976
},
{
"epoch": 1.1804127366853414,
"grad_norm": 11.146257400512695,
"loss": 3.9933,
"lr": 0.000487972027972028,
"step": 4162,
"tokens_trained": 2.0453136
},
{
"epoch": 1.1809800723352954,
"grad_norm": 9.995295524597168,
"loss": 3.9977,
"lr": 0.0004876923076923077,
"step": 4164,
"tokens_trained": 2.046294384
},
{
"epoch": 1.1815474079852493,
"grad_norm": 9.448521614074707,
"loss": 3.8709,
"lr": 0.00048741258741258743,
"step": 4166,
"tokens_trained": 2.047279192
},
{
"epoch": 1.182114743635203,
"grad_norm": 2.3229587078094482,
"loss": 3.9194,
"lr": 0.0004871328671328671,
"step": 4168,
"tokens_trained": 2.048260136
},
{
"epoch": 1.1826820792851571,
"grad_norm": 3.8930304050445557,
"loss": 3.9447,
"lr": 0.00048685314685314687,
"step": 4170,
"tokens_trained": 2.049238496
},
{
"epoch": 1.183249414935111,
"grad_norm": 6.03069543838501,
"loss": 3.9134,
"lr": 0.00048657342657342656,
"step": 4172,
"tokens_trained": 2.050226352
},
{
"epoch": 1.183816750585065,
"grad_norm": 6.509665489196777,
"loss": 3.9005,
"lr": 0.0004862937062937063,
"step": 4174,
"tokens_trained": 2.05121248
},
{
"epoch": 1.1843840862350188,
"grad_norm": 2.0728557109832764,
"loss": 3.9646,
"lr": 0.000486013986013986,
"step": 4176,
"tokens_trained": 2.052196784
},
{
"epoch": 1.1849514218849726,
"grad_norm": 1.972641944885254,
"loss": 3.9529,
"lr": 0.0004857342657342658,
"step": 4178,
"tokens_trained": 2.053177512
},
{
"epoch": 1.1855187575349266,
"grad_norm": 6.664553165435791,
"loss": 3.9424,
"lr": 0.0004854545454545455,
"step": 4180,
"tokens_trained": 2.054159928
},
{
"epoch": 1.1860860931848805,
"grad_norm": 7.182534217834473,
"loss": 3.9572,
"lr": 0.00048517482517482517,
"step": 4182,
"tokens_trained": 2.05514288
},
{
"epoch": 1.1866534288348345,
"grad_norm": 3.3657350540161133,
"loss": 3.9027,
"lr": 0.0004848951048951049,
"step": 4184,
"tokens_trained": 2.056127256
},
{
"epoch": 1.1872207644847883,
"grad_norm": 3.8826489448547363,
"loss": 3.9045,
"lr": 0.0004846153846153846,
"step": 4186,
"tokens_trained": 2.057110184
},
{
"epoch": 1.1877881001347421,
"grad_norm": 3.4556474685668945,
"loss": 3.9407,
"lr": 0.00048433566433566435,
"step": 4188,
"tokens_trained": 2.058090016
},
{
"epoch": 1.1883554357846962,
"grad_norm": 5.431522846221924,
"loss": 3.93,
"lr": 0.00048405594405594404,
"step": 4190,
"tokens_trained": 2.059071208
},
{
"epoch": 1.18892277143465,
"grad_norm": 3.987600803375244,
"loss": 3.9276,
"lr": 0.0004837762237762238,
"step": 4192,
"tokens_trained": 2.060047448
},
{
"epoch": 1.189490107084604,
"grad_norm": 5.114170074462891,
"loss": 3.9685,
"lr": 0.0004834965034965035,
"step": 4194,
"tokens_trained": 2.0610266
},
{
"epoch": 1.1900574427345578,
"grad_norm": 3.948340654373169,
"loss": 3.9357,
"lr": 0.0004832167832167833,
"step": 4196,
"tokens_trained": 2.062014792
},
{
"epoch": 1.1906247783845116,
"grad_norm": 4.607158660888672,
"loss": 3.9441,
"lr": 0.00048293706293706297,
"step": 4198,
"tokens_trained": 2.062993768
},
{
"epoch": 1.1911921140344657,
"grad_norm": 2.860197067260742,
"loss": 3.9469,
"lr": 0.00048265734265734266,
"step": 4200,
"tokens_trained": 2.063974352
},
{
"epoch": 1.1917594496844195,
"grad_norm": 4.8133544921875,
"loss": 3.9549,
"lr": 0.0004823776223776224,
"step": 4202,
"tokens_trained": 2.064955
},
{
"epoch": 1.1923267853343735,
"grad_norm": 3.1824069023132324,
"loss": 3.9589,
"lr": 0.0004820979020979021,
"step": 4204,
"tokens_trained": 2.065938728
},
{
"epoch": 1.1928941209843273,
"grad_norm": 4.413929462432861,
"loss": 3.9259,
"lr": 0.00048181818181818184,
"step": 4206,
"tokens_trained": 2.066920408
},
{
"epoch": 1.1934614566342812,
"grad_norm": 4.193307876586914,
"loss": 3.8911,
"lr": 0.0004815384615384615,
"step": 4208,
"tokens_trained": 2.067904384
},
{
"epoch": 1.1940287922842352,
"grad_norm": 3.4476332664489746,
"loss": 3.9646,
"lr": 0.00048125874125874127,
"step": 4210,
"tokens_trained": 2.068888184
},
{
"epoch": 1.194596127934189,
"grad_norm": 1.2195734977722168,
"loss": 3.9053,
"lr": 0.00048097902097902096,
"step": 4212,
"tokens_trained": 2.069866408
},
{
"epoch": 1.195163463584143,
"grad_norm": 2.1013519763946533,
"loss": 3.9806,
"lr": 0.00048069930069930076,
"step": 4214,
"tokens_trained": 2.070848272
},
{
"epoch": 1.1957307992340969,
"grad_norm": 6.16254186630249,
"loss": 3.99,
"lr": 0.00048041958041958045,
"step": 4216,
"tokens_trained": 2.071833968
},
{
"epoch": 1.1962981348840507,
"grad_norm": 4.7692179679870605,
"loss": 3.9775,
"lr": 0.00048013986013986014,
"step": 4218,
"tokens_trained": 2.07281356
},
{
"epoch": 1.1968654705340047,
"grad_norm": 3.336514949798584,
"loss": 4.0087,
"lr": 0.0004798601398601399,
"step": 4220,
"tokens_trained": 2.07380172
},
{
"epoch": 1.1974328061839585,
"grad_norm": 3.2661092281341553,
"loss": 3.9471,
"lr": 0.0004795804195804196,
"step": 4222,
"tokens_trained": 2.074785216
},
{
"epoch": 1.1980001418339126,
"grad_norm": 3.0861871242523193,
"loss": 3.9829,
"lr": 0.0004793006993006993,
"step": 4224,
"tokens_trained": 2.075770912
},
{
"epoch": 1.1985674774838664,
"grad_norm": 4.010982036590576,
"loss": 3.9013,
"lr": 0.000479020979020979,
"step": 4226,
"tokens_trained": 2.076755104
},
{
"epoch": 1.1991348131338202,
"grad_norm": 3.736706495285034,
"loss": 3.9455,
"lr": 0.00047874125874125875,
"step": 4228,
"tokens_trained": 2.077737472
},
{
"epoch": 1.1997021487837742,
"grad_norm": 2.741546392440796,
"loss": 3.929,
"lr": 0.00047846153846153844,
"step": 4230,
"tokens_trained": 2.078721008
},
{
"epoch": 1.200269484433728,
"grad_norm": 5.045975685119629,
"loss": 3.938,
"lr": 0.00047818181818181824,
"step": 4232,
"tokens_trained": 2.079705624
},
{
"epoch": 1.200836820083682,
"grad_norm": 6.466317653656006,
"loss": 3.9189,
"lr": 0.00047790209790209793,
"step": 4234,
"tokens_trained": 2.080689632
},
{
"epoch": 1.201404155733636,
"grad_norm": 10.680752754211426,
"loss": 3.924,
"lr": 0.0004776223776223776,
"step": 4236,
"tokens_trained": 2.0816728
},
{
"epoch": 1.2019714913835897,
"grad_norm": 4.394003868103027,
"loss": 3.9587,
"lr": 0.00047734265734265737,
"step": 4238,
"tokens_trained": 2.082649352
},
{
"epoch": 1.2025388270335438,
"grad_norm": 14.375049591064453,
"loss": 3.8901,
"lr": 0.00047706293706293706,
"step": 4240,
"tokens_trained": 2.083629016
},
{
"epoch": 1.2031061626834976,
"grad_norm": 6.259925365447998,
"loss": 3.9736,
"lr": 0.0004767832167832168,
"step": 4242,
"tokens_trained": 2.084612464
},
{
"epoch": 1.2036734983334516,
"grad_norm": 7.176869869232178,
"loss": 3.9335,
"lr": 0.0004765034965034965,
"step": 4244,
"tokens_trained": 2.085598128
},
{
"epoch": 1.2042408339834054,
"grad_norm": 7.3431291580200195,
"loss": 3.9129,
"lr": 0.00047622377622377624,
"step": 4246,
"tokens_trained": 2.086582144
},
{
"epoch": 1.2048081696333592,
"grad_norm": 3.1388702392578125,
"loss": 3.9645,
"lr": 0.00047594405594405593,
"step": 4248,
"tokens_trained": 2.087566256
},
{
"epoch": 1.2053755052833133,
"grad_norm": 4.360974311828613,
"loss": 3.8965,
"lr": 0.00047566433566433573,
"step": 4250,
"tokens_trained": 2.088546896
},
{
"epoch": 1.2053755052833133,
"eval_loss": 0.9876537919044495,
"eval_runtime": 20.2375,
"step": 4250,
"tokens_trained": 2.088546896
},
{
"epoch": 1.205942840933267,
"grad_norm": 6.790876388549805,
"loss": 3.8925,
"lr": 0.0004753846153846154,
"step": 4252,
"tokens_trained": 2.089529312
},
{
"epoch": 1.2065101765832211,
"grad_norm": 5.942895412445068,
"loss": 3.9429,
"lr": 0.0004751048951048951,
"step": 4254,
"tokens_trained": 2.090517856
},
{
"epoch": 1.207077512233175,
"grad_norm": 7.182357311248779,
"loss": 3.975,
"lr": 0.00047482517482517485,
"step": 4256,
"tokens_trained": 2.091501152
},
{
"epoch": 1.2076448478831288,
"grad_norm": 3.092268228530884,
"loss": 3.9078,
"lr": 0.00047454545454545454,
"step": 4258,
"tokens_trained": 2.0924852
},
{
"epoch": 1.2082121835330828,
"grad_norm": 7.483865737915039,
"loss": 3.9469,
"lr": 0.0004742657342657343,
"step": 4260,
"tokens_trained": 2.093467328
},
{
"epoch": 1.2087795191830366,
"grad_norm": 6.828039169311523,
"loss": 3.9683,
"lr": 0.000473986013986014,
"step": 4262,
"tokens_trained": 2.094447
},
{
"epoch": 1.2093468548329906,
"grad_norm": 2.1174066066741943,
"loss": 3.9575,
"lr": 0.0004737062937062937,
"step": 4264,
"tokens_trained": 2.095428552
},
{
"epoch": 1.2099141904829445,
"grad_norm": 1.7029787302017212,
"loss": 3.9174,
"lr": 0.0004734265734265734,
"step": 4266,
"tokens_trained": 2.096413944
},
{
"epoch": 1.2104815261328983,
"grad_norm": 8.107586860656738,
"loss": 3.9526,
"lr": 0.0004731468531468531,
"step": 4268,
"tokens_trained": 2.097395416
},
{
"epoch": 1.2110488617828523,
"grad_norm": 6.090738773345947,
"loss": 3.8711,
"lr": 0.0004728671328671329,
"step": 4270,
"tokens_trained": 2.098379488
},
{
"epoch": 1.2116161974328061,
"grad_norm": 3.09671950340271,
"loss": 3.9489,
"lr": 0.0004725874125874126,
"step": 4272,
"tokens_trained": 2.099365672
},
{
"epoch": 1.2121835330827602,
"grad_norm": 1.3280375003814697,
"loss": 3.8766,
"lr": 0.00047230769230769234,
"step": 4274,
"tokens_trained": 2.100345872
},
{
"epoch": 1.212750868732714,
"grad_norm": 2.2725517749786377,
"loss": 3.9298,
"lr": 0.00047202797202797203,
"step": 4276,
"tokens_trained": 2.101330144
},
{
"epoch": 1.2133182043826678,
"grad_norm": 7.571750164031982,
"loss": 3.9129,
"lr": 0.00047174825174825177,
"step": 4278,
"tokens_trained": 2.102310504
},
{
"epoch": 1.2138855400326218,
"grad_norm": 5.49086856842041,
"loss": 3.9257,
"lr": 0.00047146853146853146,
"step": 4280,
"tokens_trained": 2.10329544
},
{
"epoch": 1.2144528756825756,
"grad_norm": 3.936779737472534,
"loss": 3.9055,
"lr": 0.0004711888111888112,
"step": 4282,
"tokens_trained": 2.104280736
},
{
"epoch": 1.2150202113325297,
"grad_norm": 3.1779263019561768,
"loss": 3.9624,
"lr": 0.0004709090909090909,
"step": 4284,
"tokens_trained": 2.10526688
},
{
"epoch": 1.2155875469824835,
"grad_norm": 2.7246220111846924,
"loss": 3.9584,
"lr": 0.0004706293706293706,
"step": 4286,
"tokens_trained": 2.106249208
},
{
"epoch": 1.2161548826324373,
"grad_norm": 6.718515396118164,
"loss": 3.9084,
"lr": 0.0004703496503496504,
"step": 4288,
"tokens_trained": 2.107231312
},
{
"epoch": 1.2167222182823914,
"grad_norm": 5.000235080718994,
"loss": 3.9648,
"lr": 0.0004700699300699301,
"step": 4290,
"tokens_trained": 2.108215624
},
{
"epoch": 1.2172895539323452,
"grad_norm": 4.756376266479492,
"loss": 3.9848,
"lr": 0.0004697902097902098,
"step": 4292,
"tokens_trained": 2.10920156
},
{
"epoch": 1.2178568895822992,
"grad_norm": 1.9365978240966797,
"loss": 3.9517,
"lr": 0.0004695104895104895,
"step": 4294,
"tokens_trained": 2.110182936
},
{
"epoch": 1.218424225232253,
"grad_norm": 5.350283622741699,
"loss": 3.9737,
"lr": 0.00046923076923076926,
"step": 4296,
"tokens_trained": 2.111164808
},
{
"epoch": 1.2189915608822068,
"grad_norm": 4.543917655944824,
"loss": 3.9111,
"lr": 0.00046895104895104895,
"step": 4298,
"tokens_trained": 2.112146848
},
{
"epoch": 1.2195588965321609,
"grad_norm": 5.1316938400268555,
"loss": 3.9194,
"lr": 0.0004686713286713287,
"step": 4300,
"tokens_trained": 2.113134184
},
{
"epoch": 1.2201262321821147,
"grad_norm": 3.0844085216522217,
"loss": 3.8872,
"lr": 0.0004683916083916084,
"step": 4302,
"tokens_trained": 2.114120832
},
{
"epoch": 1.2206935678320687,
"grad_norm": 2.2305877208709717,
"loss": 3.9497,
"lr": 0.00046811188811188807,
"step": 4304,
"tokens_trained": 2.115103856
},
{
"epoch": 1.2212609034820225,
"grad_norm": 1.7684617042541504,
"loss": 3.9218,
"lr": 0.00046783216783216787,
"step": 4306,
"tokens_trained": 2.116086968
},
{
"epoch": 1.2218282391319764,
"grad_norm": 6.3064680099487305,
"loss": 3.9657,
"lr": 0.00046755244755244756,
"step": 4308,
"tokens_trained": 2.11707108
},
{
"epoch": 1.2223955747819304,
"grad_norm": 2.4910192489624023,
"loss": 3.8588,
"lr": 0.0004672727272727273,
"step": 4310,
"tokens_trained": 2.118053928
},
{
"epoch": 1.2229629104318842,
"grad_norm": 3.482459306716919,
"loss": 3.9213,
"lr": 0.000466993006993007,
"step": 4312,
"tokens_trained": 2.119037056
},
{
"epoch": 1.2235302460818382,
"grad_norm": 6.552737712860107,
"loss": 3.8804,
"lr": 0.00046671328671328674,
"step": 4314,
"tokens_trained": 2.120019576
},
{
"epoch": 1.224097581731792,
"grad_norm": 5.225849628448486,
"loss": 3.9562,
"lr": 0.00046643356643356643,
"step": 4316,
"tokens_trained": 2.121000112
},
{
"epoch": 1.2246649173817459,
"grad_norm": 2.1894407272338867,
"loss": 3.8752,
"lr": 0.0004661538461538462,
"step": 4318,
"tokens_trained": 2.121988376
},
{
"epoch": 1.2252322530317,
"grad_norm": 1.5741831064224243,
"loss": 3.953,
"lr": 0.00046587412587412587,
"step": 4320,
"tokens_trained": 2.122965864
},
{
"epoch": 1.2257995886816537,
"grad_norm": 4.103208065032959,
"loss": 3.9216,
"lr": 0.00046559440559440556,
"step": 4322,
"tokens_trained": 2.123950848
},
{
"epoch": 1.2263669243316078,
"grad_norm": 7.347278118133545,
"loss": 3.9547,
"lr": 0.00046531468531468536,
"step": 4324,
"tokens_trained": 2.124933448
},
{
"epoch": 1.2269342599815616,
"grad_norm": 4.8083930015563965,
"loss": 3.9711,
"lr": 0.00046503496503496505,
"step": 4326,
"tokens_trained": 2.125921528
},
{
"epoch": 1.2275015956315154,
"grad_norm": 5.4488654136657715,
"loss": 3.8941,
"lr": 0.0004647552447552448,
"step": 4328,
"tokens_trained": 2.126897152
},
{
"epoch": 1.2280689312814694,
"grad_norm": 6.24332332611084,
"loss": 3.9178,
"lr": 0.0004644755244755245,
"step": 4330,
"tokens_trained": 2.127881384
},
{
"epoch": 1.2286362669314232,
"grad_norm": 5.97770881652832,
"loss": 3.8804,
"lr": 0.0004641958041958042,
"step": 4332,
"tokens_trained": 2.128864008
},
{
"epoch": 1.2292036025813773,
"grad_norm": 3.901036500930786,
"loss": 3.8968,
"lr": 0.0004639160839160839,
"step": 4334,
"tokens_trained": 2.129847632
},
{
"epoch": 1.229770938231331,
"grad_norm": 5.377021789550781,
"loss": 3.9565,
"lr": 0.00046363636363636366,
"step": 4336,
"tokens_trained": 2.130832296
},
{
"epoch": 1.230338273881285,
"grad_norm": 4.565158367156982,
"loss": 3.9672,
"lr": 0.00046335664335664335,
"step": 4338,
"tokens_trained": 2.131814648
},
{
"epoch": 1.230905609531239,
"grad_norm": 1.2882499694824219,
"loss": 3.9515,
"lr": 0.00046307692307692304,
"step": 4340,
"tokens_trained": 2.132797872
},
{
"epoch": 1.2314729451811928,
"grad_norm": 0.9845411777496338,
"loss": 3.9057,
"lr": 0.00046279720279720284,
"step": 4342,
"tokens_trained": 2.133780992
},
{
"epoch": 1.2320402808311468,
"grad_norm": 3.7839152812957764,
"loss": 3.8909,
"lr": 0.00046251748251748253,
"step": 4344,
"tokens_trained": 2.134762864
},
{
"epoch": 1.2326076164811006,
"grad_norm": 3.8872299194335938,
"loss": 3.9262,
"lr": 0.0004622377622377623,
"step": 4346,
"tokens_trained": 2.135743504
},
{
"epoch": 1.2331749521310544,
"grad_norm": 4.538093566894531,
"loss": 3.9098,
"lr": 0.00046195804195804196,
"step": 4348,
"tokens_trained": 2.136727288
},
{
"epoch": 1.2337422877810085,
"grad_norm": 6.453696250915527,
"loss": 3.9103,
"lr": 0.0004616783216783217,
"step": 4350,
"tokens_trained": 2.137710256
},
{
"epoch": 1.2343096234309623,
"grad_norm": 4.033708572387695,
"loss": 3.9144,
"lr": 0.0004613986013986014,
"step": 4352,
"tokens_trained": 2.138691568
},
{
"epoch": 1.2348769590809163,
"grad_norm": 4.32963752746582,
"loss": 3.9154,
"lr": 0.00046111888111888114,
"step": 4354,
"tokens_trained": 2.13967628
},
{
"epoch": 1.2354442947308701,
"grad_norm": 3.0617220401763916,
"loss": 3.8984,
"lr": 0.00046083916083916083,
"step": 4356,
"tokens_trained": 2.140659368
},
{
"epoch": 1.236011630380824,
"grad_norm": 2.51361346244812,
"loss": 3.8971,
"lr": 0.0004605594405594405,
"step": 4358,
"tokens_trained": 2.141644648
},
{
"epoch": 1.236578966030778,
"grad_norm": 3.6975977420806885,
"loss": 3.9208,
"lr": 0.0004602797202797203,
"step": 4360,
"tokens_trained": 2.142628176
},
{
"epoch": 1.2371463016807318,
"grad_norm": 5.2992844581604,
"loss": 3.8855,
"lr": 0.00046,
"step": 4362,
"tokens_trained": 2.143610328
},
{
"epoch": 1.2377136373306858,
"grad_norm": 4.426636695861816,
"loss": 3.893,
"lr": 0.00045972027972027976,
"step": 4364,
"tokens_trained": 2.144591512
},
{
"epoch": 1.2382809729806397,
"grad_norm": 4.131166458129883,
"loss": 3.9098,
"lr": 0.00045944055944055945,
"step": 4366,
"tokens_trained": 2.14557312
},
{
"epoch": 1.2388483086305935,
"grad_norm": 2.9156816005706787,
"loss": 3.9771,
"lr": 0.0004591608391608392,
"step": 4368,
"tokens_trained": 2.146551592
},
{
"epoch": 1.2394156442805475,
"grad_norm": 3.8412554264068604,
"loss": 3.9584,
"lr": 0.0004588811188811189,
"step": 4370,
"tokens_trained": 2.147533032
},
{
"epoch": 1.2399829799305013,
"grad_norm": 3.1897640228271484,
"loss": 3.8253,
"lr": 0.0004586013986013986,
"step": 4372,
"tokens_trained": 2.148517592
},
{
"epoch": 1.2405503155804554,
"grad_norm": 4.066483020782471,
"loss": 3.8905,
"lr": 0.0004583216783216783,
"step": 4374,
"tokens_trained": 2.149502368
},
{
"epoch": 1.2408339834054323,
"eval_loss": 0.9844964146614075,
"eval_runtime": 21.0593,
"step": 4375,
"tokens_trained": 2.14999612
},
{
"epoch": 1.2411176512304092,
"grad_norm": 2.0596890449523926,
"loss": 3.9142,
"lr": 0.000458041958041958,
"step": 4376,
"tokens_trained": 2.15048712
},
{
"epoch": 1.241684986880363,
"grad_norm": 4.4018988609313965,
"loss": 3.9487,
"lr": 0.0004577622377622378,
"step": 4378,
"tokens_trained": 2.151468832
},
{
"epoch": 1.242252322530317,
"grad_norm": 3.294774055480957,
"loss": 3.979,
"lr": 0.0004574825174825175,
"step": 4380,
"tokens_trained": 2.152451456
},
{
"epoch": 1.2428196581802708,
"grad_norm": 2.5546209812164307,
"loss": 3.9135,
"lr": 0.00045720279720279724,
"step": 4382,
"tokens_trained": 2.1534348
},
{
"epoch": 1.2433869938302249,
"grad_norm": 2.1771605014801025,
"loss": 3.9207,
"lr": 0.00045692307692307693,
"step": 4384,
"tokens_trained": 2.154414104
},
{
"epoch": 1.2439543294801787,
"grad_norm": 3.5681049823760986,
"loss": 3.8632,
"lr": 0.0004566433566433567,
"step": 4386,
"tokens_trained": 2.155399088
},
{
"epoch": 1.2445216651301325,
"grad_norm": 5.588647365570068,
"loss": 3.9769,
"lr": 0.00045636363636363637,
"step": 4388,
"tokens_trained": 2.15638104
},
{
"epoch": 1.2450890007800866,
"grad_norm": 5.798253059387207,
"loss": 3.9167,
"lr": 0.00045608391608391606,
"step": 4390,
"tokens_trained": 2.157366296
},
{
"epoch": 1.2456563364300404,
"grad_norm": 2.425339698791504,
"loss": 3.9152,
"lr": 0.0004558041958041958,
"step": 4392,
"tokens_trained": 2.158347208
},
{
"epoch": 1.2462236720799944,
"grad_norm": 4.4874444007873535,
"loss": 3.9171,
"lr": 0.0004555244755244755,
"step": 4394,
"tokens_trained": 2.159329056
},
{
"epoch": 1.2467910077299482,
"grad_norm": 4.653798580169678,
"loss": 3.9308,
"lr": 0.00045524475524475524,
"step": 4396,
"tokens_trained": 2.160312792
},
{
"epoch": 1.247358343379902,
"grad_norm": 5.013849258422852,
"loss": 3.9224,
"lr": 0.000454965034965035,
"step": 4398,
"tokens_trained": 2.161298728
},
{
"epoch": 1.247925679029856,
"grad_norm": 3.3346633911132812,
"loss": 3.9482,
"lr": 0.0004546853146853147,
"step": 4400,
"tokens_trained": 2.162280664
},
{
"epoch": 1.2484930146798099,
"grad_norm": 2.408282518386841,
"loss": 3.9468,
"lr": 0.0004544055944055944,
"step": 4402,
"tokens_trained": 2.163262608
},
{
"epoch": 1.249060350329764,
"grad_norm": 2.3152034282684326,
"loss": 3.9346,
"lr": 0.00045412587412587416,
"step": 4404,
"tokens_trained": 2.16424488
},
{
"epoch": 1.2496276859797177,
"grad_norm": 4.722060680389404,
"loss": 3.93,
"lr": 0.00045384615384615385,
"step": 4406,
"tokens_trained": 2.165227184
},
{
"epoch": 1.2501950216296716,
"grad_norm": 2.3931281566619873,
"loss": 3.9412,
"lr": 0.00045356643356643354,
"step": 4408,
"tokens_trained": 2.166208312
},
{
"epoch": 1.2507623572796256,
"grad_norm": 3.703711986541748,
"loss": 3.9661,
"lr": 0.0004532867132867133,
"step": 4410,
"tokens_trained": 2.167191896
},
{
"epoch": 1.2513296929295794,
"grad_norm": 3.168426036834717,
"loss": 3.9108,
"lr": 0.000453006993006993,
"step": 4412,
"tokens_trained": 2.1681734
},
{
"epoch": 1.2518970285795334,
"grad_norm": 4.465419769287109,
"loss": 3.9224,
"lr": 0.0004527272727272727,
"step": 4414,
"tokens_trained": 2.16915824
},
{
"epoch": 1.2524643642294873,
"grad_norm": 3.145385265350342,
"loss": 3.9317,
"lr": 0.00045244755244755247,
"step": 4416,
"tokens_trained": 2.170140944
},
{
"epoch": 1.253031699879441,
"grad_norm": 3.0174384117126465,
"loss": 3.9592,
"lr": 0.0004521678321678322,
"step": 4418,
"tokens_trained": 2.171127312
},
{
"epoch": 1.2535990355293951,
"grad_norm": 2.9682352542877197,
"loss": 3.9248,
"lr": 0.0004518881118881119,
"step": 4420,
"tokens_trained": 2.17211552
},
{
"epoch": 1.254166371179349,
"grad_norm": 4.654287338256836,
"loss": 3.9592,
"lr": 0.00045160839160839165,
"step": 4422,
"tokens_trained": 2.173101456
},
{
"epoch": 1.254733706829303,
"grad_norm": 5.210162162780762,
"loss": 3.9463,
"lr": 0.00045132867132867134,
"step": 4424,
"tokens_trained": 2.174081192
},
{
"epoch": 1.2553010424792568,
"grad_norm": 1.6227176189422607,
"loss": 3.8894,
"lr": 0.000451048951048951,
"step": 4426,
"tokens_trained": 2.175063888
},
{
"epoch": 1.2558683781292106,
"grad_norm": 1.6847152709960938,
"loss": 3.9207,
"lr": 0.00045076923076923077,
"step": 4428,
"tokens_trained": 2.176047656
},
{
"epoch": 1.2564357137791646,
"grad_norm": 7.743977069854736,
"loss": 3.9202,
"lr": 0.00045048951048951046,
"step": 4430,
"tokens_trained": 2.177030728
},
{
"epoch": 1.2570030494291184,
"grad_norm": 5.493525981903076,
"loss": 3.8951,
"lr": 0.0004502097902097902,
"step": 4432,
"tokens_trained": 2.178010048
},
{
"epoch": 1.2575703850790725,
"grad_norm": 4.744298934936523,
"loss": 3.9641,
"lr": 0.00044993006993006995,
"step": 4434,
"tokens_trained": 2.178992816
},
{
"epoch": 1.2581377207290263,
"grad_norm": 5.230485916137695,
"loss": 3.9552,
"lr": 0.0004496503496503497,
"step": 4436,
"tokens_trained": 2.179977048
},
{
"epoch": 1.2587050563789801,
"grad_norm": 2.7955129146575928,
"loss": 3.9462,
"lr": 0.0004493706293706294,
"step": 4438,
"tokens_trained": 2.18096108
},
{
"epoch": 1.2592723920289342,
"grad_norm": 4.869340419769287,
"loss": 3.8819,
"lr": 0.00044909090909090913,
"step": 4440,
"tokens_trained": 2.181941176
},
{
"epoch": 1.259839727678888,
"grad_norm": 4.538938045501709,
"loss": 3.8967,
"lr": 0.0004488111888111888,
"step": 4442,
"tokens_trained": 2.182923032
},
{
"epoch": 1.260407063328842,
"grad_norm": 4.085853576660156,
"loss": 3.9155,
"lr": 0.0004485314685314685,
"step": 4444,
"tokens_trained": 2.183902584
},
{
"epoch": 1.2609743989787958,
"grad_norm": 6.15781831741333,
"loss": 3.9379,
"lr": 0.00044825174825174826,
"step": 4446,
"tokens_trained": 2.184884968
},
{
"epoch": 1.2615417346287496,
"grad_norm": 2.5738606452941895,
"loss": 3.9642,
"lr": 0.00044797202797202795,
"step": 4448,
"tokens_trained": 2.185870952
},
{
"epoch": 1.2621090702787037,
"grad_norm": 4.356530666351318,
"loss": 3.8908,
"lr": 0.0004476923076923077,
"step": 4450,
"tokens_trained": 2.186854928
},
{
"epoch": 1.2626764059286575,
"grad_norm": 5.518537998199463,
"loss": 3.8954,
"lr": 0.00044741258741258744,
"step": 4452,
"tokens_trained": 2.187847
},
{
"epoch": 1.2632437415786115,
"grad_norm": 7.3632354736328125,
"loss": 3.9363,
"lr": 0.0004471328671328672,
"step": 4454,
"tokens_trained": 2.188829592
},
{
"epoch": 1.2638110772285653,
"grad_norm": 0.9625980854034424,
"loss": 3.9416,
"lr": 0.00044685314685314687,
"step": 4456,
"tokens_trained": 2.189811456
},
{
"epoch": 1.2643784128785192,
"grad_norm": 4.0898003578186035,
"loss": 3.9133,
"lr": 0.0004465734265734266,
"step": 4458,
"tokens_trained": 2.19079428
},
{
"epoch": 1.2649457485284732,
"grad_norm": 6.740445137023926,
"loss": 3.9282,
"lr": 0.0004462937062937063,
"step": 4460,
"tokens_trained": 2.1917786
},
{
"epoch": 1.265513084178427,
"grad_norm": 6.742666244506836,
"loss": 3.9077,
"lr": 0.000446013986013986,
"step": 4462,
"tokens_trained": 2.192758016
},
{
"epoch": 1.266080419828381,
"grad_norm": 4.592698097229004,
"loss": 3.9123,
"lr": 0.00044573426573426574,
"step": 4464,
"tokens_trained": 2.193741496
},
{
"epoch": 1.2666477554783349,
"grad_norm": 8.934327125549316,
"loss": 3.9647,
"lr": 0.00044545454545454543,
"step": 4466,
"tokens_trained": 2.194723584
},
{
"epoch": 1.2672150911282887,
"grad_norm": 4.280580997467041,
"loss": 3.9189,
"lr": 0.0004451748251748252,
"step": 4468,
"tokens_trained": 2.195708432
},
{
"epoch": 1.2677824267782427,
"grad_norm": 3.257995843887329,
"loss": 3.9698,
"lr": 0.0004448951048951049,
"step": 4470,
"tokens_trained": 2.196691336
},
{
"epoch": 1.2683497624281965,
"grad_norm": 6.521494388580322,
"loss": 3.9676,
"lr": 0.00044461538461538466,
"step": 4472,
"tokens_trained": 2.197674528
},
{
"epoch": 1.2689170980781506,
"grad_norm": 6.169503211975098,
"loss": 3.9404,
"lr": 0.00044433566433566435,
"step": 4474,
"tokens_trained": 2.198658448
},
{
"epoch": 1.2694844337281044,
"grad_norm": 3.5009562969207764,
"loss": 3.9229,
"lr": 0.0004440559440559441,
"step": 4476,
"tokens_trained": 2.199646232
},
{
"epoch": 1.2700517693780582,
"grad_norm": 3.2101058959960938,
"loss": 3.9536,
"lr": 0.0004437762237762238,
"step": 4478,
"tokens_trained": 2.200630024
},
{
"epoch": 1.2706191050280122,
"grad_norm": 5.417990684509277,
"loss": 3.9591,
"lr": 0.0004434965034965035,
"step": 4480,
"tokens_trained": 2.2016182
},
{
"epoch": 1.271186440677966,
"grad_norm": 3.1346352100372314,
"loss": 3.9408,
"lr": 0.0004432167832167832,
"step": 4482,
"tokens_trained": 2.2025994
},
{
"epoch": 1.27175377632792,
"grad_norm": 3.2468717098236084,
"loss": 3.922,
"lr": 0.0004429370629370629,
"step": 4484,
"tokens_trained": 2.203581424
},
{
"epoch": 1.272321111977874,
"grad_norm": 5.069144248962402,
"loss": 3.9616,
"lr": 0.00044265734265734266,
"step": 4486,
"tokens_trained": 2.204562264
},
{
"epoch": 1.2728884476278277,
"grad_norm": 4.097993850708008,
"loss": 3.931,
"lr": 0.0004423776223776224,
"step": 4488,
"tokens_trained": 2.205548376
},
{
"epoch": 1.2734557832777817,
"grad_norm": 2.3711421489715576,
"loss": 3.9201,
"lr": 0.00044209790209790215,
"step": 4490,
"tokens_trained": 2.206535208
},
{
"epoch": 1.2740231189277356,
"grad_norm": 7.32819938659668,
"loss": 3.8766,
"lr": 0.00044181818181818184,
"step": 4492,
"tokens_trained": 2.207522192
},
{
"epoch": 1.2745904545776896,
"grad_norm": 3.9666519165039062,
"loss": 3.894,
"lr": 0.00044153846153846153,
"step": 4494,
"tokens_trained": 2.208506616
},
{
"epoch": 1.2751577902276434,
"grad_norm": 2.1190407276153564,
"loss": 3.9141,
"lr": 0.0004412587412587413,
"step": 4496,
"tokens_trained": 2.209489192
},
{
"epoch": 1.2757251258775972,
"grad_norm": 1.3682332038879395,
"loss": 3.8666,
"lr": 0.00044097902097902096,
"step": 4498,
"tokens_trained": 2.210472392
},
{
"epoch": 1.2762924615275513,
"grad_norm": 2.5941426753997803,
"loss": 3.8921,
"lr": 0.0004406993006993007,
"step": 4500,
"tokens_trained": 2.211451384
},
{
"epoch": 1.2762924615275513,
"eval_loss": 0.9826880097389221,
"eval_runtime": 20.931,
"step": 4500,
"tokens_trained": 2.211451384
}
],
"logging_steps": 2,
"max_steps": 7650,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 750,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}