{ "best_global_step": 6000, "best_metric": 0.9661399722099304, "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-6000", "epoch": 1.7017941989929792, "eval_steps": 125, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005673356499539039, "grad_norm": 8450.4345703125, "loss": 876.9911, "lr": 2e-06, "step": 2, "tokens_trained": 0.000985992 }, { "epoch": 0.0011346712999078079, "grad_norm": 8980.888671875, "loss": 779.4711, "lr": 6e-06, "step": 4, "tokens_trained": 0.001968088 }, { "epoch": 0.001702006949861712, "grad_norm": 7489.92529296875, "loss": 488.6157, "lr": 1e-05, "step": 6, "tokens_trained": 0.002953808 }, { "epoch": 0.0022693425998156157, "grad_norm": 1952.1917724609375, "loss": 237.0602, "lr": 1.4e-05, "step": 8, "tokens_trained": 0.003935728 }, { "epoch": 0.0028366782497695198, "grad_norm": 1418.443603515625, "loss": 159.0854, "lr": 1.8e-05, "step": 10, "tokens_trained": 0.004916488 }, { "epoch": 0.003404013899723424, "grad_norm": 874.7195434570312, "loss": 91.9563, "lr": 2.2e-05, "step": 12, "tokens_trained": 0.005902792 }, { "epoch": 0.003971349549677328, "grad_norm": 1339.8248291015625, "loss": 40.3366, "lr": 2.6e-05, "step": 14, "tokens_trained": 0.0068856 }, { "epoch": 0.0045386851996312315, "grad_norm": 2936.7607421875, "loss": 22.7436, "lr": 3e-05, "step": 16, "tokens_trained": 0.007868248 }, { "epoch": 0.005106020849585136, "grad_norm": 1531.3807373046875, "loss": 23.4797, "lr": 3.4000000000000007e-05, "step": 18, "tokens_trained": 0.008849296 }, { "epoch": 0.0056733564995390395, "grad_norm": 3027.4189453125, "loss": 38.7379, "lr": 3.8e-05, "step": 20, "tokens_trained": 0.009830984 }, { "epoch": 0.006240692149492944, "grad_norm": 2435.890625, "loss": 26.2427, "lr": 4.2000000000000004e-05, "step": 22, "tokens_trained": 0.01081364 }, { "epoch": 0.006808027799446848, "grad_norm": 3217.990478515625, "loss": 31.0263, "lr": 4.6e-05, "step": 24, "tokens_trained": 0.01179036 }, { "epoch": 0.007375363449400752, "grad_norm": 3854.00634765625, "loss": 33.8781, "lr": 5e-05, "step": 26, "tokens_trained": 0.012774504 }, { "epoch": 0.007942699099354656, "grad_norm": 3197.489990234375, "loss": 27.7927, "lr": 5.4e-05, "step": 28, "tokens_trained": 0.013759992 }, { "epoch": 0.00851003474930856, "grad_norm": 3034.156494140625, "loss": 37.9083, "lr": 5.800000000000001e-05, "step": 30, "tokens_trained": 0.014740536 }, { "epoch": 0.009077370399262463, "grad_norm": 3040.314453125, "loss": 34.0659, "lr": 6.2e-05, "step": 32, "tokens_trained": 0.015725984 }, { "epoch": 0.009644706049216368, "grad_norm": 3065.5791015625, "loss": 27.7768, "lr": 6.6e-05, "step": 34, "tokens_trained": 0.016706864 }, { "epoch": 0.010212041699170272, "grad_norm": 2454.293701171875, "loss": 35.1143, "lr": 7.000000000000001e-05, "step": 36, "tokens_trained": 0.017688816 }, { "epoch": 0.010779377349124175, "grad_norm": 3100.7802734375, "loss": 42.2603, "lr": 7.4e-05, "step": 38, "tokens_trained": 0.018669072 }, { "epoch": 0.011346712999078079, "grad_norm": 2749.84423828125, "loss": 39.3879, "lr": 7.8e-05, "step": 40, "tokens_trained": 0.019652072 }, { "epoch": 0.011914048649031984, "grad_norm": 1519.9908447265625, "loss": 35.0735, "lr": 8.2e-05, "step": 42, "tokens_trained": 0.020633112 }, { "epoch": 0.012481384298985888, "grad_norm": 1474.4244384765625, "loss": 25.8965, "lr": 8.599999999999999e-05, "step": 44, "tokens_trained": 0.021616192 }, { "epoch": 0.013048719948939792, "grad_norm": 2962.500244140625, "loss": 51.0784, "lr": 8.999999999999999e-05, "step": 46, "tokens_trained": 0.022597288 }, { "epoch": 0.013616055598893695, "grad_norm": 2419.41455078125, "loss": 43.0334, "lr": 9.400000000000001e-05, "step": 48, "tokens_trained": 0.02357572 }, { "epoch": 0.014183391248847599, "grad_norm": 1267.87451171875, "loss": 21.8063, "lr": 9.800000000000001e-05, "step": 50, "tokens_trained": 0.024553376 }, { "epoch": 0.014750726898801504, "grad_norm": 1573.944091796875, "loss": 52.9693, "lr": 0.000102, "step": 52, "tokens_trained": 0.025536728 }, { "epoch": 0.015318062548755408, "grad_norm": 1509.650146484375, "loss": 50.0825, "lr": 0.000106, "step": 54, "tokens_trained": 0.026517 }, { "epoch": 0.01588539819870931, "grad_norm": 2334.765380859375, "loss": 42.1982, "lr": 0.00011, "step": 56, "tokens_trained": 0.027504728 }, { "epoch": 0.016452733848663217, "grad_norm": 1594.16259765625, "loss": 39.0562, "lr": 0.000114, "step": 58, "tokens_trained": 0.028485416 }, { "epoch": 0.01702006949861712, "grad_norm": 1628.082275390625, "loss": 35.0488, "lr": 0.000118, "step": 60, "tokens_trained": 0.029468696 }, { "epoch": 0.017587405148571024, "grad_norm": 2496.6455078125, "loss": 49.4241, "lr": 0.000122, "step": 62, "tokens_trained": 0.030453584 }, { "epoch": 0.018154740798524926, "grad_norm": 2521.721435546875, "loss": 69.0275, "lr": 0.000126, "step": 64, "tokens_trained": 0.031432864 }, { "epoch": 0.01872207644847883, "grad_norm": 2179.571533203125, "loss": 63.1409, "lr": 0.00013000000000000002, "step": 66, "tokens_trained": 0.032418416 }, { "epoch": 0.019289412098432736, "grad_norm": 899.7137451171875, "loss": 38.4131, "lr": 0.000134, "step": 68, "tokens_trained": 0.033402136 }, { "epoch": 0.01985674774838664, "grad_norm": 2109.377685546875, "loss": 51.0044, "lr": 0.00013800000000000002, "step": 70, "tokens_trained": 0.03438832 }, { "epoch": 0.020424083398340544, "grad_norm": 1649.1873779296875, "loss": 32.1408, "lr": 0.00014199999999999998, "step": 72, "tokens_trained": 0.035374464 }, { "epoch": 0.020991419048294446, "grad_norm": 1807.994140625, "loss": 28.8357, "lr": 0.000146, "step": 74, "tokens_trained": 0.03635784 }, { "epoch": 0.02155875469824835, "grad_norm": 998.9485473632812, "loss": 23.0343, "lr": 0.00015, "step": 76, "tokens_trained": 0.037340248 }, { "epoch": 0.022126090348202256, "grad_norm": 2240.17578125, "loss": 32.0397, "lr": 0.000154, "step": 78, "tokens_trained": 0.038321968 }, { "epoch": 0.022693425998156158, "grad_norm": 1606.0067138671875, "loss": 32.1776, "lr": 0.000158, "step": 80, "tokens_trained": 0.039304992 }, { "epoch": 0.023260761648110063, "grad_norm": 1685.1015625, "loss": 24.3428, "lr": 0.000162, "step": 82, "tokens_trained": 0.040286808 }, { "epoch": 0.02382809729806397, "grad_norm": 1761.7890625, "loss": 23.9261, "lr": 0.00016600000000000002, "step": 84, "tokens_trained": 0.041271776 }, { "epoch": 0.02439543294801787, "grad_norm": 2036.0982666015625, "loss": 27.7196, "lr": 0.00017, "step": 86, "tokens_trained": 0.042252784 }, { "epoch": 0.024962768597971776, "grad_norm": 1564.3870849609375, "loss": 25.3722, "lr": 0.000174, "step": 88, "tokens_trained": 0.04323596 }, { "epoch": 0.025530104247925678, "grad_norm": 1508.349853515625, "loss": 18.4107, "lr": 0.000178, "step": 90, "tokens_trained": 0.044218984 }, { "epoch": 0.026097439897879583, "grad_norm": 1955.011474609375, "loss": 28.8456, "lr": 0.000182, "step": 92, "tokens_trained": 0.045202144 }, { "epoch": 0.02666477554783349, "grad_norm": 1679.9423828125, "loss": 23.6139, "lr": 0.000186, "step": 94, "tokens_trained": 0.046192336 }, { "epoch": 0.02723211119778739, "grad_norm": 1517.5731201171875, "loss": 42.145, "lr": 0.00019, "step": 96, "tokens_trained": 0.047174312 }, { "epoch": 0.027799446847741296, "grad_norm": 1535.3076171875, "loss": 31.9711, "lr": 0.000194, "step": 98, "tokens_trained": 0.048158944 }, { "epoch": 0.028366782497695198, "grad_norm": 1475.2569580078125, "loss": 37.645, "lr": 0.00019800000000000002, "step": 100, "tokens_trained": 0.04914364 }, { "epoch": 0.028934118147649103, "grad_norm": 1918.4088134765625, "loss": 69.4053, "lr": 0.000202, "step": 102, "tokens_trained": 0.050123488 }, { "epoch": 0.02950145379760301, "grad_norm": 1631.6231689453125, "loss": 50.9725, "lr": 0.000206, "step": 104, "tokens_trained": 0.051105512 }, { "epoch": 0.03006878944755691, "grad_norm": 1291.6376953125, "loss": 22.6527, "lr": 0.00021, "step": 106, "tokens_trained": 0.052091704 }, { "epoch": 0.030636125097510816, "grad_norm": 1224.9625244140625, "loss": 60.2725, "lr": 0.000214, "step": 108, "tokens_trained": 0.053074824 }, { "epoch": 0.031203460747464717, "grad_norm": 1218.2022705078125, "loss": 75.8728, "lr": 0.000218, "step": 110, "tokens_trained": 0.054057104 }, { "epoch": 0.03177079639741862, "grad_norm": 1761.8861083984375, "loss": 61.6427, "lr": 0.000222, "step": 112, "tokens_trained": 0.055039128 }, { "epoch": 0.03233813204737253, "grad_norm": 1482.4256591796875, "loss": 35.3351, "lr": 0.00022600000000000002, "step": 114, "tokens_trained": 0.05602388 }, { "epoch": 0.03290546769732643, "grad_norm": 563.6399536132812, "loss": 40.1461, "lr": 0.00023, "step": 116, "tokens_trained": 0.057005376 }, { "epoch": 0.03347280334728033, "grad_norm": 1266.058837890625, "loss": 24.0657, "lr": 0.00023400000000000002, "step": 118, "tokens_trained": 0.057985136 }, { "epoch": 0.03404013899723424, "grad_norm": 918.206298828125, "loss": 23.9626, "lr": 0.00023799999999999998, "step": 120, "tokens_trained": 0.058968288 }, { "epoch": 0.03460747464718814, "grad_norm": 1495.7191162109375, "loss": 19.798, "lr": 0.000242, "step": 122, "tokens_trained": 0.05995348 }, { "epoch": 0.03517481029714205, "grad_norm": 1264.302734375, "loss": 31.5342, "lr": 0.000246, "step": 124, "tokens_trained": 0.060935832 }, { "epoch": 0.035458478122119, "eval_loss": 5.312118053436279, "eval_runtime": 21.3065, "step": 125, "tokens_trained": 0.061426608 }, { "epoch": 0.03574214594709595, "grad_norm": 907.4861450195312, "loss": 25.1262, "lr": 0.00025, "step": 126, "tokens_trained": 0.061918184 }, { "epoch": 0.03630948159704985, "grad_norm": 1287.6158447265625, "loss": 26.963, "lr": 0.000254, "step": 128, "tokens_trained": 0.062902328 }, { "epoch": 0.03687681724700376, "grad_norm": 1260.570556640625, "loss": 24.9633, "lr": 0.00025800000000000004, "step": 130, "tokens_trained": 0.063883456 }, { "epoch": 0.03744415289695766, "grad_norm": 1436.82373046875, "loss": 23.1028, "lr": 0.000262, "step": 132, "tokens_trained": 0.06486748 }, { "epoch": 0.03801148854691157, "grad_norm": 812.9523315429688, "loss": 20.5496, "lr": 0.000266, "step": 134, "tokens_trained": 0.065847104 }, { "epoch": 0.03857882419686547, "grad_norm": 1336.5322265625, "loss": 23.673, "lr": 0.00027, "step": 136, "tokens_trained": 0.066829928 }, { "epoch": 0.03914615984681937, "grad_norm": 1381.282470703125, "loss": 32.0373, "lr": 0.00027400000000000005, "step": 138, "tokens_trained": 0.067814024 }, { "epoch": 0.03971349549677328, "grad_norm": 972.7861938476562, "loss": 26.9454, "lr": 0.00027800000000000004, "step": 140, "tokens_trained": 0.068797744 }, { "epoch": 0.04028083114672718, "grad_norm": 1347.2249755859375, "loss": 22.3578, "lr": 0.00028199999999999997, "step": 142, "tokens_trained": 0.069780072 }, { "epoch": 0.04084816679668109, "grad_norm": 829.525390625, "loss": 37.9879, "lr": 0.00028599999999999996, "step": 144, "tokens_trained": 0.070759896 }, { "epoch": 0.04141550244663499, "grad_norm": 1094.1033935546875, "loss": 21.1972, "lr": 0.00029, "step": 146, "tokens_trained": 0.0717452 }, { "epoch": 0.04198283809658889, "grad_norm": 717.107421875, "loss": 21.7774, "lr": 0.000294, "step": 148, "tokens_trained": 0.072727432 }, { "epoch": 0.042550173746542796, "grad_norm": 744.4456787109375, "loss": 20.3235, "lr": 0.000298, "step": 150, "tokens_trained": 0.073712128 }, { "epoch": 0.0431175093964967, "grad_norm": 904.1460571289062, "loss": 22.7878, "lr": 0.000302, "step": 152, "tokens_trained": 0.074695296 }, { "epoch": 0.04368484504645061, "grad_norm": 1352.303955078125, "loss": 20.9757, "lr": 0.000306, "step": 154, "tokens_trained": 0.0756798 }, { "epoch": 0.04425218069640451, "grad_norm": 997.0473022460938, "loss": 17.4647, "lr": 0.00031, "step": 156, "tokens_trained": 0.076666504 }, { "epoch": 0.04481951634635841, "grad_norm": 1206.387939453125, "loss": 21.1846, "lr": 0.000314, "step": 158, "tokens_trained": 0.07764868 }, { "epoch": 0.045386851996312316, "grad_norm": 1029.6807861328125, "loss": 17.8853, "lr": 0.00031800000000000003, "step": 160, "tokens_trained": 0.07863548 }, { "epoch": 0.04595418764626622, "grad_norm": 1136.4635009765625, "loss": 30.057, "lr": 0.000322, "step": 162, "tokens_trained": 0.079618928 }, { "epoch": 0.04652152329622013, "grad_norm": 834.3464965820312, "loss": 28.1782, "lr": 0.000326, "step": 164, "tokens_trained": 0.0806032 }, { "epoch": 0.04708885894617403, "grad_norm": 1177.8365478515625, "loss": 16.4267, "lr": 0.00033, "step": 166, "tokens_trained": 0.081583752 }, { "epoch": 0.04765619459612794, "grad_norm": 572.501708984375, "loss": 16.5752, "lr": 0.00033400000000000004, "step": 168, "tokens_trained": 0.082568184 }, { "epoch": 0.048223530246081836, "grad_norm": 437.6822814941406, "loss": 11.5509, "lr": 0.00033800000000000003, "step": 170, "tokens_trained": 0.083553352 }, { "epoch": 0.04879086589603574, "grad_norm": 1119.0416259765625, "loss": 16.2689, "lr": 0.000342, "step": 172, "tokens_trained": 0.084536352 }, { "epoch": 0.04935820154598965, "grad_norm": 895.4021606445312, "loss": 12.6663, "lr": 0.000346, "step": 174, "tokens_trained": 0.085517312 }, { "epoch": 0.04992553719594355, "grad_norm": 995.6289672851562, "loss": 26.0663, "lr": 0.00035, "step": 176, "tokens_trained": 0.086496088 }, { "epoch": 0.05049287284589746, "grad_norm": 839.6610717773438, "loss": 21.5115, "lr": 0.000354, "step": 178, "tokens_trained": 0.087480632 }, { "epoch": 0.051060208495851356, "grad_norm": 734.1155395507812, "loss": 29.3287, "lr": 0.000358, "step": 180, "tokens_trained": 0.088460408 }, { "epoch": 0.05162754414580526, "grad_norm": 721.4505615234375, "loss": 26.0801, "lr": 0.000362, "step": 182, "tokens_trained": 0.08944248 }, { "epoch": 0.052194879795759166, "grad_norm": 845.9672241210938, "loss": 19.0639, "lr": 0.000366, "step": 184, "tokens_trained": 0.090427832 }, { "epoch": 0.05276221544571307, "grad_norm": 1210.9969482421875, "loss": 23.9036, "lr": 0.00037, "step": 186, "tokens_trained": 0.091411504 }, { "epoch": 0.05332955109566698, "grad_norm": 1079.1690673828125, "loss": 23.5588, "lr": 0.000374, "step": 188, "tokens_trained": 0.092392672 }, { "epoch": 0.053896886745620876, "grad_norm": 596.111328125, "loss": 20.8275, "lr": 0.000378, "step": 190, "tokens_trained": 0.093374696 }, { "epoch": 0.05446422239557478, "grad_norm": 761.8096923828125, "loss": 22.512, "lr": 0.000382, "step": 192, "tokens_trained": 0.094361912 }, { "epoch": 0.055031558045528686, "grad_norm": 1081.9832763671875, "loss": 32.335, "lr": 0.000386, "step": 194, "tokens_trained": 0.095342992 }, { "epoch": 0.05559889369548259, "grad_norm": 304.3534240722656, "loss": 11.5275, "lr": 0.00039000000000000005, "step": 196, "tokens_trained": 0.096323512 }, { "epoch": 0.0561662293454365, "grad_norm": 586.6314086914062, "loss": 16.2663, "lr": 0.00039400000000000004, "step": 198, "tokens_trained": 0.097308864 }, { "epoch": 0.056733564995390395, "grad_norm": 624.9953002929688, "loss": 16.627, "lr": 0.000398, "step": 200, "tokens_trained": 0.098289064 }, { "epoch": 0.0573009006453443, "grad_norm": 585.9645385742188, "loss": 15.8359, "lr": 0.000402, "step": 202, "tokens_trained": 0.099269696 }, { "epoch": 0.057868236295298206, "grad_norm": 537.9913330078125, "loss": 20.0779, "lr": 0.00040600000000000006, "step": 204, "tokens_trained": 0.100248448 }, { "epoch": 0.05843557194525211, "grad_norm": 805.04931640625, "loss": 21.4524, "lr": 0.00041, "step": 206, "tokens_trained": 0.101231248 }, { "epoch": 0.05900290759520602, "grad_norm": 439.1418151855469, "loss": 23.9852, "lr": 0.000414, "step": 208, "tokens_trained": 0.102210688 }, { "epoch": 0.059570243245159915, "grad_norm": 502.684814453125, "loss": 17.6273, "lr": 0.00041799999999999997, "step": 210, "tokens_trained": 0.103192176 }, { "epoch": 0.06013757889511382, "grad_norm": 849.9979858398438, "loss": 33.7517, "lr": 0.000422, "step": 212, "tokens_trained": 0.104172824 }, { "epoch": 0.060704914545067726, "grad_norm": 939.583740234375, "loss": 26.2559, "lr": 0.000426, "step": 214, "tokens_trained": 0.105156672 }, { "epoch": 0.06127225019502163, "grad_norm": 525.0505981445312, "loss": 20.0923, "lr": 0.00043, "step": 216, "tokens_trained": 0.106141368 }, { "epoch": 0.061839585844975536, "grad_norm": 420.296630859375, "loss": 17.9608, "lr": 0.00043400000000000003, "step": 218, "tokens_trained": 0.107124088 }, { "epoch": 0.062406921494929435, "grad_norm": 711.3380737304688, "loss": 19.387, "lr": 0.000438, "step": 220, "tokens_trained": 0.108112632 }, { "epoch": 0.06297425714488335, "grad_norm": 759.183349609375, "loss": 17.8061, "lr": 0.000442, "step": 222, "tokens_trained": 0.1090934 }, { "epoch": 0.06354159279483725, "grad_norm": 790.025146484375, "loss": 13.8539, "lr": 0.000446, "step": 224, "tokens_trained": 0.110079512 }, { "epoch": 0.06410892844479114, "grad_norm": 769.8306274414062, "loss": 22.1258, "lr": 0.00045000000000000004, "step": 226, "tokens_trained": 0.111060152 }, { "epoch": 0.06467626409474506, "grad_norm": 656.8352661132812, "loss": 14.8646, "lr": 0.00045400000000000003, "step": 228, "tokens_trained": 0.112044144 }, { "epoch": 0.06524359974469895, "grad_norm": 498.92010498046875, "loss": 23.1558, "lr": 0.000458, "step": 230, "tokens_trained": 0.113022928 }, { "epoch": 0.06581093539465287, "grad_norm": 764.0186157226562, "loss": 16.7089, "lr": 0.000462, "step": 232, "tokens_trained": 0.114003832 }, { "epoch": 0.06637827104460677, "grad_norm": 491.5793762207031, "loss": 12.3979, "lr": 0.00046600000000000005, "step": 234, "tokens_trained": 0.114991008 }, { "epoch": 0.06694560669456066, "grad_norm": 679.9217529296875, "loss": 14.9037, "lr": 0.00047, "step": 236, "tokens_trained": 0.115971888 }, { "epoch": 0.06751294234451458, "grad_norm": 491.0369567871094, "loss": 7.7603, "lr": 0.000474, "step": 238, "tokens_trained": 0.116952616 }, { "epoch": 0.06808027799446847, "grad_norm": 369.2186279296875, "loss": 8.2256, "lr": 0.00047799999999999996, "step": 240, "tokens_trained": 0.117935816 }, { "epoch": 0.06864761364442239, "grad_norm": 312.72137451171875, "loss": 7.5486, "lr": 0.000482, "step": 242, "tokens_trained": 0.118919392 }, { "epoch": 0.06921494929437629, "grad_norm": 596.1439208984375, "loss": 11.7351, "lr": 0.000486, "step": 244, "tokens_trained": 0.119901856 }, { "epoch": 0.06978228494433018, "grad_norm": 467.5667419433594, "loss": 11.8403, "lr": 0.00049, "step": 246, "tokens_trained": 0.120884624 }, { "epoch": 0.0703496205942841, "grad_norm": 430.50048828125, "loss": 13.8081, "lr": 0.000494, "step": 248, "tokens_trained": 0.121869224 }, { "epoch": 0.070916956244238, "grad_norm": 522.242919921875, "loss": 14.1892, "lr": 0.000498, "step": 250, "tokens_trained": 0.122853584 }, { "epoch": 0.070916956244238, "eval_loss": 1.9294606447219849, "eval_runtime": 20.4162, "step": 250, "tokens_trained": 0.122853584 }, { "epoch": 0.0714842918941919, "grad_norm": 835.2765502929688, "loss": 13.2462, "lr": 0.0005020000000000001, "step": 252, "tokens_trained": 0.123835544 }, { "epoch": 0.0720516275441458, "grad_norm": 714.8098754882812, "loss": 20.0498, "lr": 0.000506, "step": 254, "tokens_trained": 0.124821616 }, { "epoch": 0.0726189631940997, "grad_norm": 701.512939453125, "loss": 18.3664, "lr": 0.00051, "step": 256, "tokens_trained": 0.125807608 }, { "epoch": 0.07318629884405362, "grad_norm": 773.987060546875, "loss": 21.3807, "lr": 0.000514, "step": 258, "tokens_trained": 0.126791464 }, { "epoch": 0.07375363449400751, "grad_norm": 826.422119140625, "loss": 22.6403, "lr": 0.000518, "step": 260, "tokens_trained": 0.127771752 }, { "epoch": 0.07432097014396143, "grad_norm": 742.8673095703125, "loss": 20.1504, "lr": 0.000522, "step": 262, "tokens_trained": 0.128755448 }, { "epoch": 0.07488830579391532, "grad_norm": 797.79296875, "loss": 26.7343, "lr": 0.000526, "step": 264, "tokens_trained": 0.129741088 }, { "epoch": 0.07545564144386922, "grad_norm": 673.9141235351562, "loss": 12.505, "lr": 0.0005300000000000001, "step": 266, "tokens_trained": 0.130727504 }, { "epoch": 0.07602297709382314, "grad_norm": 310.6510925292969, "loss": 12.6344, "lr": 0.0005340000000000001, "step": 268, "tokens_trained": 0.131710296 }, { "epoch": 0.07659031274377703, "grad_norm": 312.40966796875, "loss": 14.254, "lr": 0.0005380000000000001, "step": 270, "tokens_trained": 0.132695352 }, { "epoch": 0.07715764839373095, "grad_norm": 492.2834777832031, "loss": 19.0979, "lr": 0.0005420000000000001, "step": 272, "tokens_trained": 0.133677928 }, { "epoch": 0.07772498404368484, "grad_norm": 628.457763671875, "loss": 21.7735, "lr": 0.000546, "step": 274, "tokens_trained": 0.134655504 }, { "epoch": 0.07829231969363874, "grad_norm": 382.8389892578125, "loss": 12.5128, "lr": 0.00055, "step": 276, "tokens_trained": 0.135640208 }, { "epoch": 0.07885965534359266, "grad_norm": 483.12335205078125, "loss": 15.2589, "lr": 0.000554, "step": 278, "tokens_trained": 0.136624232 }, { "epoch": 0.07942699099354655, "grad_norm": 640.658447265625, "loss": 12.1341, "lr": 0.000558, "step": 280, "tokens_trained": 0.13760628 }, { "epoch": 0.07999432664350047, "grad_norm": 410.0824279785156, "loss": 12.5723, "lr": 0.0005620000000000001, "step": 282, "tokens_trained": 0.13858832 }, { "epoch": 0.08056166229345436, "grad_norm": 513.2861328125, "loss": 14.8461, "lr": 0.000566, "step": 284, "tokens_trained": 0.139568424 }, { "epoch": 0.08112899794340826, "grad_norm": 564.547607421875, "loss": 12.5792, "lr": 0.00057, "step": 286, "tokens_trained": 0.140557016 }, { "epoch": 0.08169633359336217, "grad_norm": 451.3592834472656, "loss": 16.5433, "lr": 0.000574, "step": 288, "tokens_trained": 0.141540248 }, { "epoch": 0.08226366924331607, "grad_norm": 404.2495422363281, "loss": 16.4138, "lr": 0.000578, "step": 290, "tokens_trained": 0.142528272 }, { "epoch": 0.08283100489326999, "grad_norm": 566.5219116210938, "loss": 16.4743, "lr": 0.0005819999999999999, "step": 292, "tokens_trained": 0.143513096 }, { "epoch": 0.08339834054322388, "grad_norm": 559.6517333984375, "loss": 16.421, "lr": 0.0005859999999999999, "step": 294, "tokens_trained": 0.144494472 }, { "epoch": 0.08396567619317778, "grad_norm": 260.874755859375, "loss": 11.2214, "lr": 0.00059, "step": 296, "tokens_trained": 0.14547876 }, { "epoch": 0.0845330118431317, "grad_norm": 272.02899169921875, "loss": 10.3491, "lr": 0.000594, "step": 298, "tokens_trained": 0.146465864 }, { "epoch": 0.08510034749308559, "grad_norm": 556.9845581054688, "loss": 10.4348, "lr": 0.000598, "step": 300, "tokens_trained": 0.147446344 }, { "epoch": 0.0856676831430395, "grad_norm": 273.35772705078125, "loss": 8.3292, "lr": 0.000602, "step": 302, "tokens_trained": 0.14843244 }, { "epoch": 0.0862350187929934, "grad_norm": 246.6316680908203, "loss": 9.9362, "lr": 0.000606, "step": 304, "tokens_trained": 0.149415976 }, { "epoch": 0.0868023544429473, "grad_norm": 564.4365844726562, "loss": 9.2621, "lr": 0.00061, "step": 306, "tokens_trained": 0.150398728 }, { "epoch": 0.08736969009290121, "grad_norm": 396.0948791503906, "loss": 11.8526, "lr": 0.000614, "step": 308, "tokens_trained": 0.151385104 }, { "epoch": 0.08793702574285511, "grad_norm": 488.6072692871094, "loss": 11.8473, "lr": 0.0006180000000000001, "step": 310, "tokens_trained": 0.152373672 }, { "epoch": 0.08850436139280903, "grad_norm": 346.70660400390625, "loss": 12.0897, "lr": 0.000622, "step": 312, "tokens_trained": 0.153356256 }, { "epoch": 0.08907169704276292, "grad_norm": 382.40679931640625, "loss": 9.271, "lr": 0.000626, "step": 314, "tokens_trained": 0.154342632 }, { "epoch": 0.08963903269271682, "grad_norm": 288.7908935546875, "loss": 9.185, "lr": 0.00063, "step": 316, "tokens_trained": 0.1553238 }, { "epoch": 0.09020636834267073, "grad_norm": 337.5335388183594, "loss": 12.0555, "lr": 0.000634, "step": 318, "tokens_trained": 0.156313168 }, { "epoch": 0.09077370399262463, "grad_norm": 349.25531005859375, "loss": 8.51, "lr": 0.000638, "step": 320, "tokens_trained": 0.157299448 }, { "epoch": 0.09134103964257854, "grad_norm": 471.7824401855469, "loss": 14.1888, "lr": 0.000642, "step": 322, "tokens_trained": 0.158285264 }, { "epoch": 0.09190837529253244, "grad_norm": 284.94036865234375, "loss": 10.1593, "lr": 0.000646, "step": 324, "tokens_trained": 0.159267512 }, { "epoch": 0.09247571094248634, "grad_norm": 510.90478515625, "loss": 13.5744, "lr": 0.0006500000000000001, "step": 326, "tokens_trained": 0.160250856 }, { "epoch": 0.09304304659244025, "grad_norm": 373.82965087890625, "loss": 8.4999, "lr": 0.0006540000000000001, "step": 328, "tokens_trained": 0.161231832 }, { "epoch": 0.09361038224239415, "grad_norm": 219.3827362060547, "loss": 8.4436, "lr": 0.0006580000000000001, "step": 330, "tokens_trained": 0.162217656 }, { "epoch": 0.09417771789234806, "grad_norm": 433.0914001464844, "loss": 11.2019, "lr": 0.000662, "step": 332, "tokens_trained": 0.163199096 }, { "epoch": 0.09474505354230196, "grad_norm": 242.65907287597656, "loss": 9.0666, "lr": 0.000666, "step": 334, "tokens_trained": 0.164178512 }, { "epoch": 0.09531238919225588, "grad_norm": 446.07916259765625, "loss": 8.6546, "lr": 0.00067, "step": 336, "tokens_trained": 0.165162464 }, { "epoch": 0.09587972484220977, "grad_norm": 231.8892364501953, "loss": 7.5819, "lr": 0.000674, "step": 338, "tokens_trained": 0.166141536 }, { "epoch": 0.09644706049216367, "grad_norm": 100.7306137084961, "loss": 6.7047, "lr": 0.0006780000000000001, "step": 340, "tokens_trained": 0.167123944 }, { "epoch": 0.09701439614211758, "grad_norm": 78.11279296875, "loss": 5.9308, "lr": 0.0006820000000000001, "step": 342, "tokens_trained": 0.168105264 }, { "epoch": 0.09758173179207148, "grad_norm": 271.466064453125, "loss": 6.9141, "lr": 0.0006860000000000001, "step": 344, "tokens_trained": 0.169088912 }, { "epoch": 0.0981490674420254, "grad_norm": 252.54478454589844, "loss": 6.3281, "lr": 0.00069, "step": 346, "tokens_trained": 0.170077368 }, { "epoch": 0.0987164030919793, "grad_norm": 305.8559875488281, "loss": 6.443, "lr": 0.000694, "step": 348, "tokens_trained": 0.171057232 }, { "epoch": 0.09928373874193319, "grad_norm": 227.74374389648438, "loss": 6.552, "lr": 0.0006979999999999999, "step": 350, "tokens_trained": 0.172041376 }, { "epoch": 0.0998510743918871, "grad_norm": 446.7601623535156, "loss": 10.8184, "lr": 0.0007019999999999999, "step": 352, "tokens_trained": 0.173023624 }, { "epoch": 0.100418410041841, "grad_norm": 353.0849609375, "loss": 8.6327, "lr": 0.0007059999999999999, "step": 354, "tokens_trained": 0.174005992 }, { "epoch": 0.10098574569179491, "grad_norm": 367.9427185058594, "loss": 9.3898, "lr": 0.00071, "step": 356, "tokens_trained": 0.174988304 }, { "epoch": 0.10155308134174881, "grad_norm": 224.4961700439453, "loss": 8.284, "lr": 0.000714, "step": 358, "tokens_trained": 0.175969816 }, { "epoch": 0.10212041699170271, "grad_norm": 221.86537170410156, "loss": 7.0578, "lr": 0.000718, "step": 360, "tokens_trained": 0.176952688 }, { "epoch": 0.10268775264165662, "grad_norm": 331.0989685058594, "loss": 6.9561, "lr": 0.000722, "step": 362, "tokens_trained": 0.177935144 }, { "epoch": 0.10325508829161052, "grad_norm": 171.6498260498047, "loss": 7.203, "lr": 0.000726, "step": 364, "tokens_trained": 0.178916776 }, { "epoch": 0.10382242394156443, "grad_norm": 284.2208557128906, "loss": 10.3517, "lr": 0.00073, "step": 366, "tokens_trained": 0.179903432 }, { "epoch": 0.10438975959151833, "grad_norm": 354.8574523925781, "loss": 9.3888, "lr": 0.000734, "step": 368, "tokens_trained": 0.180883224 }, { "epoch": 0.10495709524147223, "grad_norm": 344.82574462890625, "loss": 10.5933, "lr": 0.000738, "step": 370, "tokens_trained": 0.181863808 }, { "epoch": 0.10552443089142614, "grad_norm": 302.6838073730469, "loss": 10.2832, "lr": 0.000742, "step": 372, "tokens_trained": 0.182843712 }, { "epoch": 0.10609176654138004, "grad_norm": 323.0387878417969, "loss": 6.4864, "lr": 0.000746, "step": 374, "tokens_trained": 0.183825832 }, { "epoch": 0.10637543436635699, "eval_loss": 1.4430732727050781, "eval_runtime": 20.5468, "step": 375, "tokens_trained": 0.184317744 }, { "epoch": 0.10665910219133395, "grad_norm": 133.74822998046875, "loss": 5.4176, "lr": 0.00075, "step": 376, "tokens_trained": 0.184811352 }, { "epoch": 0.10722643784128785, "grad_norm": 180.3372344970703, "loss": 5.5641, "lr": 0.000754, "step": 378, "tokens_trained": 0.185792528 }, { "epoch": 0.10779377349124175, "grad_norm": 250.83999633789062, "loss": 5.8612, "lr": 0.000758, "step": 380, "tokens_trained": 0.186777112 }, { "epoch": 0.10836110914119566, "grad_norm": 293.51959228515625, "loss": 6.0418, "lr": 0.000762, "step": 382, "tokens_trained": 0.18775724 }, { "epoch": 0.10892844479114956, "grad_norm": 292.56207275390625, "loss": 6.1812, "lr": 0.0007660000000000001, "step": 384, "tokens_trained": 0.188733568 }, { "epoch": 0.10949578044110347, "grad_norm": 121.82467651367188, "loss": 6.0855, "lr": 0.0007700000000000001, "step": 386, "tokens_trained": 0.189718512 }, { "epoch": 0.11006311609105737, "grad_norm": 124.30497741699219, "loss": 5.7734, "lr": 0.0007740000000000001, "step": 388, "tokens_trained": 0.190703776 }, { "epoch": 0.11063045174101127, "grad_norm": 143.64004516601562, "loss": 5.7641, "lr": 0.000778, "step": 390, "tokens_trained": 0.191689888 }, { "epoch": 0.11119778739096518, "grad_norm": 160.06784057617188, "loss": 5.6025, "lr": 0.000782, "step": 392, "tokens_trained": 0.192673992 }, { "epoch": 0.11176512304091908, "grad_norm": 226.97988891601562, "loss": 6.0049, "lr": 0.000786, "step": 394, "tokens_trained": 0.193656272 }, { "epoch": 0.112332458690873, "grad_norm": 223.26898193359375, "loss": 5.6972, "lr": 0.00079, "step": 396, "tokens_trained": 0.194639144 }, { "epoch": 0.11289979434082689, "grad_norm": 249.34912109375, "loss": 5.7348, "lr": 0.0007940000000000001, "step": 398, "tokens_trained": 0.195621256 }, { "epoch": 0.11346712999078079, "grad_norm": 161.34271240234375, "loss": 5.6689, "lr": 0.0007980000000000001, "step": 400, "tokens_trained": 0.196604136 }, { "epoch": 0.1140344656407347, "grad_norm": 148.53176879882812, "loss": 5.702, "lr": 0.0008020000000000001, "step": 402, "tokens_trained": 0.197586784 }, { "epoch": 0.1146018012906886, "grad_norm": 144.40835571289062, "loss": 6.2402, "lr": 0.0008060000000000001, "step": 404, "tokens_trained": 0.198570824 }, { "epoch": 0.11516913694064251, "grad_norm": 306.57562255859375, "loss": 7.1739, "lr": 0.0008100000000000001, "step": 406, "tokens_trained": 0.199548328 }, { "epoch": 0.11573647259059641, "grad_norm": 308.79180908203125, "loss": 6.0972, "lr": 0.0008139999999999999, "step": 408, "tokens_trained": 0.200532496 }, { "epoch": 0.11630380824055031, "grad_norm": 197.76791381835938, "loss": 6.3533, "lr": 0.0008179999999999999, "step": 410, "tokens_trained": 0.201514648 }, { "epoch": 0.11687114389050422, "grad_norm": 129.5694580078125, "loss": 6.9628, "lr": 0.0008219999999999999, "step": 412, "tokens_trained": 0.2024994 }, { "epoch": 0.11743847954045812, "grad_norm": 446.0195617675781, "loss": 11.7562, "lr": 0.000826, "step": 414, "tokens_trained": 0.20348012 }, { "epoch": 0.11800581519041203, "grad_norm": 355.5342712402344, "loss": 8.8055, "lr": 0.00083, "step": 416, "tokens_trained": 0.20446356 }, { "epoch": 0.11857315084036593, "grad_norm": 456.2491149902344, "loss": 9.606, "lr": 0.000834, "step": 418, "tokens_trained": 0.205445288 }, { "epoch": 0.11914048649031983, "grad_norm": 369.8676452636719, "loss": 8.385, "lr": 0.000838, "step": 420, "tokens_trained": 0.206427832 }, { "epoch": 0.11970782214027374, "grad_norm": 262.19073486328125, "loss": 9.0956, "lr": 0.000842, "step": 422, "tokens_trained": 0.207409848 }, { "epoch": 0.12027515779022764, "grad_norm": 120.3193130493164, "loss": 5.4937, "lr": 0.000846, "step": 424, "tokens_trained": 0.208391752 }, { "epoch": 0.12084249344018155, "grad_norm": 222.1111297607422, "loss": 8.9367, "lr": 0.00085, "step": 426, "tokens_trained": 0.20937384 }, { "epoch": 0.12140982909013545, "grad_norm": 137.16819763183594, "loss": 7.5876, "lr": 0.000854, "step": 428, "tokens_trained": 0.210358576 }, { "epoch": 0.12197716474008935, "grad_norm": 267.61846923828125, "loss": 8.817, "lr": 0.000858, "step": 430, "tokens_trained": 0.211340064 }, { "epoch": 0.12254450039004326, "grad_norm": 472.72906494140625, "loss": 8.203, "lr": 0.000862, "step": 432, "tokens_trained": 0.212321144 }, { "epoch": 0.12311183603999716, "grad_norm": 297.1420593261719, "loss": 10.987, "lr": 0.000866, "step": 434, "tokens_trained": 0.213300312 }, { "epoch": 0.12367917168995107, "grad_norm": 281.7297668457031, "loss": 7.6117, "lr": 0.00087, "step": 436, "tokens_trained": 0.214287624 }, { "epoch": 0.12424650733990497, "grad_norm": 203.09678649902344, "loss": 6.5638, "lr": 0.000874, "step": 438, "tokens_trained": 0.215272136 }, { "epoch": 0.12481384298985887, "grad_norm": 155.7823944091797, "loss": 6.1131, "lr": 0.000878, "step": 440, "tokens_trained": 0.216256392 }, { "epoch": 0.12538117863981277, "grad_norm": 189.86196899414062, "loss": 8.2565, "lr": 0.000882, "step": 442, "tokens_trained": 0.217242504 }, { "epoch": 0.1259485142897667, "grad_norm": 247.4568634033203, "loss": 7.1005, "lr": 0.0008860000000000001, "step": 444, "tokens_trained": 0.218226008 }, { "epoch": 0.1265158499397206, "grad_norm": 179.72825622558594, "loss": 6.3379, "lr": 0.0008900000000000001, "step": 446, "tokens_trained": 0.219210584 }, { "epoch": 0.1270831855896745, "grad_norm": 212.96356201171875, "loss": 7.2514, "lr": 0.000894, "step": 448, "tokens_trained": 0.220193952 }, { "epoch": 0.1276505212396284, "grad_norm": 105.67095947265625, "loss": 5.456, "lr": 0.000898, "step": 450, "tokens_trained": 0.221176936 }, { "epoch": 0.1282178568895823, "grad_norm": 302.9122619628906, "loss": 6.4018, "lr": 0.000902, "step": 452, "tokens_trained": 0.222161952 }, { "epoch": 0.12878519253953621, "grad_norm": 215.66561889648438, "loss": 6.2853, "lr": 0.000906, "step": 454, "tokens_trained": 0.223144912 }, { "epoch": 0.1293525281894901, "grad_norm": 272.9984130859375, "loss": 7.3902, "lr": 0.00091, "step": 456, "tokens_trained": 0.224127392 }, { "epoch": 0.129919863839444, "grad_norm": 200.7503662109375, "loss": 6.1637, "lr": 0.0009140000000000001, "step": 458, "tokens_trained": 0.22511648 }, { "epoch": 0.1304871994893979, "grad_norm": 93.23990631103516, "loss": 6.4867, "lr": 0.0009180000000000001, "step": 460, "tokens_trained": 0.226098144 }, { "epoch": 0.1310545351393518, "grad_norm": 274.37164306640625, "loss": 8.99, "lr": 0.0009220000000000001, "step": 462, "tokens_trained": 0.227081848 }, { "epoch": 0.13162187078930573, "grad_norm": 186.66322326660156, "loss": 8.7122, "lr": 0.0009260000000000001, "step": 464, "tokens_trained": 0.22806636 }, { "epoch": 0.13218920643925963, "grad_norm": 586.1035766601562, "loss": 9.1045, "lr": 0.00093, "step": 466, "tokens_trained": 0.229047872 }, { "epoch": 0.13275654208921353, "grad_norm": 227.55996704101562, "loss": 9.7276, "lr": 0.000934, "step": 468, "tokens_trained": 0.230031144 }, { "epoch": 0.13332387773916743, "grad_norm": 229.26609802246094, "loss": 6.6244, "lr": 0.0009379999999999999, "step": 470, "tokens_trained": 0.2310158 }, { "epoch": 0.13389121338912133, "grad_norm": 145.16331481933594, "loss": 5.759, "lr": 0.000942, "step": 472, "tokens_trained": 0.2319996 }, { "epoch": 0.13445854903907525, "grad_norm": 109.9937744140625, "loss": 5.4838, "lr": 0.000946, "step": 474, "tokens_trained": 0.232983808 }, { "epoch": 0.13502588468902915, "grad_norm": 135.74899291992188, "loss": 6.2738, "lr": 0.00095, "step": 476, "tokens_trained": 0.233963016 }, { "epoch": 0.13559322033898305, "grad_norm": 142.99449157714844, "loss": 5.8459, "lr": 0.000954, "step": 478, "tokens_trained": 0.234948864 }, { "epoch": 0.13616055598893695, "grad_norm": 198.66883850097656, "loss": 6.6626, "lr": 0.000958, "step": 480, "tokens_trained": 0.235932392 }, { "epoch": 0.13672789163889085, "grad_norm": 260.76507568359375, "loss": 6.9299, "lr": 0.000962, "step": 482, "tokens_trained": 0.236915664 }, { "epoch": 0.13729522728884477, "grad_norm": 267.97589111328125, "loss": 6.4343, "lr": 0.000966, "step": 484, "tokens_trained": 0.237896904 }, { "epoch": 0.13786256293879867, "grad_norm": 89.8781967163086, "loss": 6.3203, "lr": 0.0009699999999999999, "step": 486, "tokens_trained": 0.238874528 }, { "epoch": 0.13842989858875257, "grad_norm": 225.62985229492188, "loss": 6.2778, "lr": 0.000974, "step": 488, "tokens_trained": 0.2398588 }, { "epoch": 0.13899723423870647, "grad_norm": 85.84110260009766, "loss": 5.2786, "lr": 0.000978, "step": 490, "tokens_trained": 0.240839968 }, { "epoch": 0.13956456988866037, "grad_norm": 141.4368438720703, "loss": 5.5525, "lr": 0.000982, "step": 492, "tokens_trained": 0.241823544 }, { "epoch": 0.1401319055386143, "grad_norm": 94.9535140991211, "loss": 5.4386, "lr": 0.0009860000000000001, "step": 494, "tokens_trained": 0.242805456 }, { "epoch": 0.1406992411885682, "grad_norm": 157.4557647705078, "loss": 5.9786, "lr": 0.00099, "step": 496, "tokens_trained": 0.243792496 }, { "epoch": 0.1412665768385221, "grad_norm": 319.5025634765625, "loss": 7.04, "lr": 0.000994, "step": 498, "tokens_trained": 0.244772472 }, { "epoch": 0.141833912488476, "grad_norm": 282.26824951171875, "loss": 9.4037, "lr": 0.000998, "step": 500, "tokens_trained": 0.245758968 }, { "epoch": 0.141833912488476, "eval_loss": 2.152184247970581, "eval_runtime": 21.2772, "step": 500, "tokens_trained": 0.245758968 }, { "epoch": 0.1424012481384299, "grad_norm": 306.0666809082031, "loss": 7.8845, "lr": 0.00099986013986014, "step": 502, "tokens_trained": 0.246739024 }, { "epoch": 0.1429685837883838, "grad_norm": 188.89024353027344, "loss": 6.8118, "lr": 0.0009995804195804196, "step": 504, "tokens_trained": 0.247726552 }, { "epoch": 0.1435359194383377, "grad_norm": 228.97474670410156, "loss": 6.8475, "lr": 0.0009993006993006994, "step": 506, "tokens_trained": 0.24870688 }, { "epoch": 0.1441032550882916, "grad_norm": 229.80029296875, "loss": 6.2171, "lr": 0.000999020979020979, "step": 508, "tokens_trained": 0.249689096 }, { "epoch": 0.1446705907382455, "grad_norm": 157.30340576171875, "loss": 6.2281, "lr": 0.0009987412587412587, "step": 510, "tokens_trained": 0.250671768 }, { "epoch": 0.1452379263881994, "grad_norm": 176.64683532714844, "loss": 6.5993, "lr": 0.0009984615384615386, "step": 512, "tokens_trained": 0.25165608 }, { "epoch": 0.14580526203815333, "grad_norm": 197.20526123046875, "loss": 5.7267, "lr": 0.0009981818181818182, "step": 514, "tokens_trained": 0.252639712 }, { "epoch": 0.14637259768810723, "grad_norm": 54.713260650634766, "loss": 5.7911, "lr": 0.000997902097902098, "step": 516, "tokens_trained": 0.253622816 }, { "epoch": 0.14693993333806113, "grad_norm": 185.74923706054688, "loss": 7.0055, "lr": 0.0009976223776223777, "step": 518, "tokens_trained": 0.254602792 }, { "epoch": 0.14750726898801503, "grad_norm": 240.31021118164062, "loss": 6.452, "lr": 0.0009973426573426573, "step": 520, "tokens_trained": 0.255584736 }, { "epoch": 0.14807460463796893, "grad_norm": 160.2477264404297, "loss": 7.6556, "lr": 0.000997062937062937, "step": 522, "tokens_trained": 0.256563792 }, { "epoch": 0.14864194028792285, "grad_norm": 283.0034484863281, "loss": 6.5345, "lr": 0.0009967832167832168, "step": 524, "tokens_trained": 0.257546656 }, { "epoch": 0.14920927593787675, "grad_norm": 245.537109375, "loss": 6.3281, "lr": 0.0009965034965034964, "step": 526, "tokens_trained": 0.258530832 }, { "epoch": 0.14977661158783065, "grad_norm": 162.1538848876953, "loss": 7.4072, "lr": 0.0009962237762237763, "step": 528, "tokens_trained": 0.259514528 }, { "epoch": 0.15034394723778455, "grad_norm": 107.25792694091797, "loss": 5.356, "lr": 0.000995944055944056, "step": 530, "tokens_trained": 0.260500912 }, { "epoch": 0.15091128288773845, "grad_norm": 173.73353576660156, "loss": 6.8625, "lr": 0.0009956643356643356, "step": 532, "tokens_trained": 0.26148632 }, { "epoch": 0.15147861853769237, "grad_norm": 178.33541870117188, "loss": 5.8794, "lr": 0.0009953846153846154, "step": 534, "tokens_trained": 0.262468816 }, { "epoch": 0.15204595418764627, "grad_norm": 181.2533416748047, "loss": 7.0243, "lr": 0.000995104895104895, "step": 536, "tokens_trained": 0.263446696 }, { "epoch": 0.15261328983760017, "grad_norm": 208.79293823242188, "loss": 5.8908, "lr": 0.000994825174825175, "step": 538, "tokens_trained": 0.26443108 }, { "epoch": 0.15318062548755407, "grad_norm": 148.66285705566406, "loss": 6.0831, "lr": 0.0009945454545454546, "step": 540, "tokens_trained": 0.265414496 }, { "epoch": 0.15374796113750797, "grad_norm": 165.044189453125, "loss": 5.5594, "lr": 0.0009942657342657344, "step": 542, "tokens_trained": 0.266394128 }, { "epoch": 0.1543152967874619, "grad_norm": 124.5405502319336, "loss": 5.2442, "lr": 0.000993986013986014, "step": 544, "tokens_trained": 0.267378768 }, { "epoch": 0.1548826324374158, "grad_norm": 68.66510772705078, "loss": 5.1173, "lr": 0.0009937062937062937, "step": 546, "tokens_trained": 0.268360184 }, { "epoch": 0.1554499680873697, "grad_norm": 57.052860260009766, "loss": 5.2348, "lr": 0.0009934265734265735, "step": 548, "tokens_trained": 0.269345672 }, { "epoch": 0.1560173037373236, "grad_norm": 184.9175567626953, "loss": 6.7748, "lr": 0.0009931468531468532, "step": 550, "tokens_trained": 0.2703288 }, { "epoch": 0.15658463938727749, "grad_norm": 72.9861831665039, "loss": 5.7387, "lr": 0.000992867132867133, "step": 552, "tokens_trained": 0.271309176 }, { "epoch": 0.1571519750372314, "grad_norm": 135.864501953125, "loss": 6.3035, "lr": 0.0009925874125874127, "step": 554, "tokens_trained": 0.27229644 }, { "epoch": 0.1577193106871853, "grad_norm": 130.579833984375, "loss": 5.4434, "lr": 0.0009923076923076923, "step": 556, "tokens_trained": 0.273277904 }, { "epoch": 0.1582866463371392, "grad_norm": 206.77345275878906, "loss": 5.8649, "lr": 0.000992027972027972, "step": 558, "tokens_trained": 0.274261712 }, { "epoch": 0.1588539819870931, "grad_norm": 144.0505828857422, "loss": 5.3459, "lr": 0.0009917482517482518, "step": 560, "tokens_trained": 0.2752468 }, { "epoch": 0.159421317637047, "grad_norm": 87.56634521484375, "loss": 5.6321, "lr": 0.0009914685314685314, "step": 562, "tokens_trained": 0.276232384 }, { "epoch": 0.15998865328700093, "grad_norm": 275.2727355957031, "loss": 6.7515, "lr": 0.0009911888111888113, "step": 564, "tokens_trained": 0.277211608 }, { "epoch": 0.16055598893695483, "grad_norm": 97.00019836425781, "loss": 5.4374, "lr": 0.000990909090909091, "step": 566, "tokens_trained": 0.278196336 }, { "epoch": 0.16112332458690873, "grad_norm": 102.91439056396484, "loss": 5.729, "lr": 0.0009906293706293705, "step": 568, "tokens_trained": 0.279175672 }, { "epoch": 0.16169066023686263, "grad_norm": 151.12432861328125, "loss": 5.4189, "lr": 0.0009903496503496504, "step": 570, "tokens_trained": 0.280161088 }, { "epoch": 0.16225799588681653, "grad_norm": 86.6823959350586, "loss": 5.1704, "lr": 0.00099006993006993, "step": 572, "tokens_trained": 0.28114256 }, { "epoch": 0.16282533153677045, "grad_norm": 90.7052230834961, "loss": 5.3673, "lr": 0.0009897902097902099, "step": 574, "tokens_trained": 0.282128904 }, { "epoch": 0.16339266718672435, "grad_norm": 146.92874145507812, "loss": 5.5971, "lr": 0.0009895104895104895, "step": 576, "tokens_trained": 0.28311528 }, { "epoch": 0.16396000283667825, "grad_norm": 189.76296997070312, "loss": 5.3109, "lr": 0.0009892307692307694, "step": 578, "tokens_trained": 0.284098528 }, { "epoch": 0.16452733848663215, "grad_norm": 174.48092651367188, "loss": 5.68, "lr": 0.000988951048951049, "step": 580, "tokens_trained": 0.285081064 }, { "epoch": 0.16509467413658604, "grad_norm": 154.10816955566406, "loss": 5.3307, "lr": 0.0009886713286713286, "step": 582, "tokens_trained": 0.286067952 }, { "epoch": 0.16566200978653997, "grad_norm": 64.28263092041016, "loss": 5.1676, "lr": 0.0009883916083916085, "step": 584, "tokens_trained": 0.287051384 }, { "epoch": 0.16622934543649387, "grad_norm": 103.81795501708984, "loss": 5.3436, "lr": 0.0009881118881118881, "step": 586, "tokens_trained": 0.28803284 }, { "epoch": 0.16679668108644777, "grad_norm": 144.0076904296875, "loss": 5.3033, "lr": 0.000987832167832168, "step": 588, "tokens_trained": 0.289014824 }, { "epoch": 0.16736401673640167, "grad_norm": 88.31237030029297, "loss": 5.0609, "lr": 0.0009875524475524476, "step": 590, "tokens_trained": 0.289999864 }, { "epoch": 0.16793135238635556, "grad_norm": 68.4583740234375, "loss": 5.0702, "lr": 0.0009872727272727273, "step": 592, "tokens_trained": 0.290983888 }, { "epoch": 0.1684986880363095, "grad_norm": 135.28665161132812, "loss": 5.3962, "lr": 0.000986993006993007, "step": 594, "tokens_trained": 0.291965752 }, { "epoch": 0.1690660236862634, "grad_norm": 80.0412368774414, "loss": 5.0246, "lr": 0.0009867132867132867, "step": 596, "tokens_trained": 0.292946952 }, { "epoch": 0.1696333593362173, "grad_norm": 43.29194641113281, "loss": 5.0051, "lr": 0.0009864335664335664, "step": 598, "tokens_trained": 0.293928976 }, { "epoch": 0.17020069498617119, "grad_norm": 220.88687133789062, "loss": 6.0798, "lr": 0.0009861538461538462, "step": 600, "tokens_trained": 0.294912408 }, { "epoch": 0.17076803063612508, "grad_norm": 102.58654022216797, "loss": 5.1271, "lr": 0.0009858741258741259, "step": 602, "tokens_trained": 0.29589416 }, { "epoch": 0.171335366286079, "grad_norm": 119.0067138671875, "loss": 5.7402, "lr": 0.0009855944055944055, "step": 604, "tokens_trained": 0.296878584 }, { "epoch": 0.1719027019360329, "grad_norm": 138.8656005859375, "loss": 5.1951, "lr": 0.0009853146853146854, "step": 606, "tokens_trained": 0.297864552 }, { "epoch": 0.1724700375859868, "grad_norm": 73.5890884399414, "loss": 5.2522, "lr": 0.000985034965034965, "step": 608, "tokens_trained": 0.298854088 }, { "epoch": 0.1730373732359407, "grad_norm": 113.78330993652344, "loss": 5.6683, "lr": 0.0009847552447552449, "step": 610, "tokens_trained": 0.299835024 }, { "epoch": 0.1736047088858946, "grad_norm": 125.20297241210938, "loss": 5.1812, "lr": 0.0009844755244755245, "step": 612, "tokens_trained": 0.30082032 }, { "epoch": 0.17417204453584853, "grad_norm": 67.46041870117188, "loss": 5.0417, "lr": 0.0009841958041958043, "step": 614, "tokens_trained": 0.301808456 }, { "epoch": 0.17473938018580243, "grad_norm": 117.30754852294922, "loss": 5.3064, "lr": 0.000983916083916084, "step": 616, "tokens_trained": 0.302794456 }, { "epoch": 0.17530671583575633, "grad_norm": 124.30754089355469, "loss": 5.1614, "lr": 0.0009836363636363636, "step": 618, "tokens_trained": 0.303777376 }, { "epoch": 0.17587405148571023, "grad_norm": 102.72042083740234, "loss": 5.1265, "lr": 0.0009833566433566435, "step": 620, "tokens_trained": 0.304758864 }, { "epoch": 0.17644138713566412, "grad_norm": 39.332252502441406, "loss": 5.1078, "lr": 0.000983076923076923, "step": 622, "tokens_trained": 0.30574392 }, { "epoch": 0.17700872278561805, "grad_norm": 153.84811401367188, "loss": 5.7696, "lr": 0.000982797202797203, "step": 624, "tokens_trained": 0.306727584 }, { "epoch": 0.17729239061059499, "eval_loss": 1.3463915586471558, "eval_runtime": 20.8357, "step": 625, "tokens_trained": 0.307220496 }, { "epoch": 0.17757605843557195, "grad_norm": 160.2552490234375, "loss": 5.2283, "lr": 0.0009825174825174826, "step": 626, "tokens_trained": 0.307713024 }, { "epoch": 0.17814339408552585, "grad_norm": 186.77407836914062, "loss": 5.2866, "lr": 0.0009822377622377622, "step": 628, "tokens_trained": 0.308700128 }, { "epoch": 0.17871072973547975, "grad_norm": 84.55519104003906, "loss": 5.1106, "lr": 0.0009819580419580419, "step": 630, "tokens_trained": 0.309681208 }, { "epoch": 0.17927806538543364, "grad_norm": 20.617040634155273, "loss": 4.8327, "lr": 0.0009816783216783217, "step": 632, "tokens_trained": 0.310662224 }, { "epoch": 0.17984540103538757, "grad_norm": 168.06039428710938, "loss": 6.0704, "lr": 0.0009813986013986014, "step": 634, "tokens_trained": 0.31164064 }, { "epoch": 0.18041273668534147, "grad_norm": 238.23736572265625, "loss": 5.6188, "lr": 0.0009811188811188812, "step": 636, "tokens_trained": 0.312622568 }, { "epoch": 0.18098007233529537, "grad_norm": 140.0707550048828, "loss": 6.4034, "lr": 0.0009808391608391608, "step": 638, "tokens_trained": 0.313604944 }, { "epoch": 0.18154740798524927, "grad_norm": 161.19302368164062, "loss": 5.4906, "lr": 0.0009805594405594405, "step": 640, "tokens_trained": 0.314592072 }, { "epoch": 0.18211474363520316, "grad_norm": 121.9577407836914, "loss": 5.2097, "lr": 0.0009802797202797203, "step": 642, "tokens_trained": 0.315574392 }, { "epoch": 0.1826820792851571, "grad_norm": 121.25574493408203, "loss": 5.0317, "lr": 0.00098, "step": 644, "tokens_trained": 0.316559008 }, { "epoch": 0.183249414935111, "grad_norm": 28.328269958496094, "loss": 4.932, "lr": 0.0009797202797202798, "step": 646, "tokens_trained": 0.317538776 }, { "epoch": 0.1838167505850649, "grad_norm": 127.77408599853516, "loss": 5.8335, "lr": 0.0009794405594405595, "step": 648, "tokens_trained": 0.31851792 }, { "epoch": 0.18438408623501878, "grad_norm": 94.9522933959961, "loss": 5.1948, "lr": 0.000979160839160839, "step": 650, "tokens_trained": 0.319501576 }, { "epoch": 0.18495142188497268, "grad_norm": 110.33658599853516, "loss": 5.098, "lr": 0.000978881118881119, "step": 652, "tokens_trained": 0.320482392 }, { "epoch": 0.1855187575349266, "grad_norm": 67.23124694824219, "loss": 4.7723, "lr": 0.0009786013986013986, "step": 654, "tokens_trained": 0.32146712 }, { "epoch": 0.1860860931848805, "grad_norm": 61.519866943359375, "loss": 4.7245, "lr": 0.0009783216783216782, "step": 656, "tokens_trained": 0.322449576 }, { "epoch": 0.1866534288348344, "grad_norm": 99.51078033447266, "loss": 4.783, "lr": 0.000978041958041958, "step": 658, "tokens_trained": 0.323432688 }, { "epoch": 0.1872207644847883, "grad_norm": 44.619197845458984, "loss": 4.7495, "lr": 0.000977762237762238, "step": 660, "tokens_trained": 0.324413952 }, { "epoch": 0.18778810013474223, "grad_norm": 114.5891342163086, "loss": 5.1261, "lr": 0.0009774825174825176, "step": 662, "tokens_trained": 0.325394536 }, { "epoch": 0.18835543578469613, "grad_norm": 100.3728256225586, "loss": 4.7883, "lr": 0.0009772027972027972, "step": 664, "tokens_trained": 0.326374672 }, { "epoch": 0.18892277143465003, "grad_norm": 51.883033752441406, "loss": 4.7249, "lr": 0.0009769230769230768, "step": 666, "tokens_trained": 0.327357152 }, { "epoch": 0.18949010708460393, "grad_norm": 82.27507019042969, "loss": 4.8277, "lr": 0.0009766433566433567, "step": 668, "tokens_trained": 0.328342088 }, { "epoch": 0.19005744273455782, "grad_norm": 83.53064727783203, "loss": 4.8338, "lr": 0.0009763636363636363, "step": 670, "tokens_trained": 0.329319248 }, { "epoch": 0.19062477838451175, "grad_norm": 76.18387603759766, "loss": 4.6958, "lr": 0.0009760839160839161, "step": 672, "tokens_trained": 0.330305968 }, { "epoch": 0.19119211403446565, "grad_norm": 27.401426315307617, "loss": 4.6929, "lr": 0.0009758041958041958, "step": 674, "tokens_trained": 0.3312912 }, { "epoch": 0.19175944968441955, "grad_norm": 186.770263671875, "loss": 5.5089, "lr": 0.0009755244755244756, "step": 676, "tokens_trained": 0.332275224 }, { "epoch": 0.19232678533437345, "grad_norm": 105.02385711669922, "loss": 4.8876, "lr": 0.0009752447552447553, "step": 678, "tokens_trained": 0.33325588 }, { "epoch": 0.19289412098432734, "grad_norm": 94.96269989013672, "loss": 5.1235, "lr": 0.0009749650349650349, "step": 680, "tokens_trained": 0.334238408 }, { "epoch": 0.19346145663428127, "grad_norm": 92.29356384277344, "loss": 4.8194, "lr": 0.0009746853146853148, "step": 682, "tokens_trained": 0.335219368 }, { "epoch": 0.19402879228423517, "grad_norm": 59.1584358215332, "loss": 4.7511, "lr": 0.0009744055944055944, "step": 684, "tokens_trained": 0.336207136 }, { "epoch": 0.19459612793418907, "grad_norm": 54.759002685546875, "loss": 4.777, "lr": 0.0009741258741258742, "step": 686, "tokens_trained": 0.337193536 }, { "epoch": 0.19516346358414297, "grad_norm": 92.20452880859375, "loss": 4.8225, "lr": 0.0009738461538461538, "step": 688, "tokens_trained": 0.338179224 }, { "epoch": 0.19573079923409686, "grad_norm": 75.97005462646484, "loss": 4.655, "lr": 0.0009735664335664336, "step": 690, "tokens_trained": 0.339162168 }, { "epoch": 0.1962981348840508, "grad_norm": 58.19076919555664, "loss": 4.6446, "lr": 0.0009732867132867133, "step": 692, "tokens_trained": 0.340138904 }, { "epoch": 0.1968654705340047, "grad_norm": 50.81512451171875, "loss": 4.5866, "lr": 0.000973006993006993, "step": 694, "tokens_trained": 0.34112288 }, { "epoch": 0.1974328061839586, "grad_norm": 61.683372497558594, "loss": 4.6018, "lr": 0.0009727272727272728, "step": 696, "tokens_trained": 0.342111992 }, { "epoch": 0.19800014183391249, "grad_norm": 61.01798629760742, "loss": 4.6007, "lr": 0.0009724475524475524, "step": 698, "tokens_trained": 0.343095912 }, { "epoch": 0.19856747748386638, "grad_norm": 96.49671936035156, "loss": 4.7035, "lr": 0.0009721678321678323, "step": 700, "tokens_trained": 0.344078632 }, { "epoch": 0.1991348131338203, "grad_norm": 64.7771224975586, "loss": 4.8341, "lr": 0.0009718881118881119, "step": 702, "tokens_trained": 0.345060576 }, { "epoch": 0.1997021487837742, "grad_norm": 90.1478042602539, "loss": 4.7739, "lr": 0.0009716083916083917, "step": 704, "tokens_trained": 0.34604112 }, { "epoch": 0.2002694844337281, "grad_norm": 67.6308822631836, "loss": 4.6218, "lr": 0.0009713286713286713, "step": 706, "tokens_trained": 0.347023496 }, { "epoch": 0.200836820083682, "grad_norm": 40.50175094604492, "loss": 4.6008, "lr": 0.000971048951048951, "step": 708, "tokens_trained": 0.348005416 }, { "epoch": 0.2014041557336359, "grad_norm": 33.6448860168457, "loss": 4.5307, "lr": 0.0009707692307692308, "step": 710, "tokens_trained": 0.3489886 }, { "epoch": 0.20197149138358983, "grad_norm": 15.484851837158203, "loss": 4.5065, "lr": 0.0009704895104895105, "step": 712, "tokens_trained": 0.34997024 }, { "epoch": 0.20253882703354373, "grad_norm": 109.26301574707031, "loss": 4.9613, "lr": 0.0009702097902097903, "step": 714, "tokens_trained": 0.350958496 }, { "epoch": 0.20310616268349763, "grad_norm": 150.07492065429688, "loss": 4.8507, "lr": 0.0009699300699300699, "step": 716, "tokens_trained": 0.35193892 }, { "epoch": 0.20367349833345152, "grad_norm": 113.43978881835938, "loss": 5.4494, "lr": 0.0009696503496503498, "step": 718, "tokens_trained": 0.35291908 }, { "epoch": 0.20424083398340542, "grad_norm": 123.0071792602539, "loss": 4.9475, "lr": 0.0009693706293706294, "step": 720, "tokens_trained": 0.353896072 }, { "epoch": 0.20480816963335935, "grad_norm": 65.55500793457031, "loss": 4.7585, "lr": 0.0009690909090909091, "step": 722, "tokens_trained": 0.354878992 }, { "epoch": 0.20537550528331325, "grad_norm": 36.11159896850586, "loss": 4.6323, "lr": 0.0009688111888111888, "step": 724, "tokens_trained": 0.355863728 }, { "epoch": 0.20594284093326715, "grad_norm": 30.566436767578125, "loss": 4.53, "lr": 0.0009685314685314685, "step": 726, "tokens_trained": 0.356845272 }, { "epoch": 0.20651017658322104, "grad_norm": 59.01853561401367, "loss": 4.5283, "lr": 0.0009682517482517483, "step": 728, "tokens_trained": 0.357826656 }, { "epoch": 0.20707751223317494, "grad_norm": 91.78115844726562, "loss": 4.6149, "lr": 0.000967972027972028, "step": 730, "tokens_trained": 0.358809896 }, { "epoch": 0.20764484788312887, "grad_norm": 67.97398376464844, "loss": 4.617, "lr": 0.0009676923076923078, "step": 732, "tokens_trained": 0.359788736 }, { "epoch": 0.20821218353308277, "grad_norm": 42.82001876831055, "loss": 4.6134, "lr": 0.0009674125874125874, "step": 734, "tokens_trained": 0.360771744 }, { "epoch": 0.20877951918303667, "grad_norm": 63.52122116088867, "loss": 4.6995, "lr": 0.0009671328671328672, "step": 736, "tokens_trained": 0.361757656 }, { "epoch": 0.20934685483299056, "grad_norm": 116.39544677734375, "loss": 4.7153, "lr": 0.0009668531468531469, "step": 738, "tokens_trained": 0.362744008 }, { "epoch": 0.20991419048294446, "grad_norm": 40.74269485473633, "loss": 4.7978, "lr": 0.0009665734265734266, "step": 740, "tokens_trained": 0.36372872 }, { "epoch": 0.2104815261328984, "grad_norm": 114.29917907714844, "loss": 5.1683, "lr": 0.0009662937062937063, "step": 742, "tokens_trained": 0.364710536 }, { "epoch": 0.2110488617828523, "grad_norm": 115.83326721191406, "loss": 4.7642, "lr": 0.000966013986013986, "step": 744, "tokens_trained": 0.3656912 }, { "epoch": 0.21161619743280619, "grad_norm": 21.708093643188477, "loss": 4.8244, "lr": 0.0009657342657342657, "step": 746, "tokens_trained": 0.36667388 }, { "epoch": 0.21218353308276008, "grad_norm": 182.01918029785156, "loss": 5.6045, "lr": 0.0009654545454545455, "step": 748, "tokens_trained": 0.3676634 }, { "epoch": 0.21275086873271398, "grad_norm": 47.119319915771484, "loss": 4.7929, "lr": 0.0009651748251748252, "step": 750, "tokens_trained": 0.368647288 }, { "epoch": 0.21275086873271398, "eval_loss": 1.2186306715011597, "eval_runtime": 20.9362, "step": 750, "tokens_trained": 0.368647288 }, { "epoch": 0.2133182043826679, "grad_norm": 51.43566131591797, "loss": 4.7298, "lr": 0.0009648951048951049, "step": 752, "tokens_trained": 0.36962992 }, { "epoch": 0.2138855400326218, "grad_norm": 79.49323272705078, "loss": 5.0749, "lr": 0.0009646153846153846, "step": 754, "tokens_trained": 0.370616064 }, { "epoch": 0.2144528756825757, "grad_norm": 119.80200958251953, "loss": 4.8198, "lr": 0.0009643356643356644, "step": 756, "tokens_trained": 0.371596208 }, { "epoch": 0.2150202113325296, "grad_norm": 95.88092041015625, "loss": 4.7437, "lr": 0.0009640559440559441, "step": 758, "tokens_trained": 0.372579584 }, { "epoch": 0.2155875469824835, "grad_norm": 79.64202117919922, "loss": 4.9181, "lr": 0.0009637762237762237, "step": 760, "tokens_trained": 0.373563056 }, { "epoch": 0.21615488263243743, "grad_norm": 79.93920135498047, "loss": 4.6393, "lr": 0.0009634965034965035, "step": 762, "tokens_trained": 0.374547648 }, { "epoch": 0.21672221828239133, "grad_norm": 78.67620849609375, "loss": 4.6178, "lr": 0.0009632167832167832, "step": 764, "tokens_trained": 0.375531456 }, { "epoch": 0.21728955393234523, "grad_norm": 56.32818603515625, "loss": 4.6498, "lr": 0.000962937062937063, "step": 766, "tokens_trained": 0.376516896 }, { "epoch": 0.21785688958229912, "grad_norm": 45.35737228393555, "loss": 4.5812, "lr": 0.0009626573426573427, "step": 768, "tokens_trained": 0.377499752 }, { "epoch": 0.21842422523225302, "grad_norm": 58.13076400756836, "loss": 4.5793, "lr": 0.0009623776223776224, "step": 770, "tokens_trained": 0.37848276 }, { "epoch": 0.21899156088220695, "grad_norm": 55.620628356933594, "loss": 4.4865, "lr": 0.0009620979020979021, "step": 772, "tokens_trained": 0.379466296 }, { "epoch": 0.21955889653216085, "grad_norm": 77.26813507080078, "loss": 4.5671, "lr": 0.0009618181818181818, "step": 774, "tokens_trained": 0.380449888 }, { "epoch": 0.22012623218211474, "grad_norm": 45.00653839111328, "loss": 4.5923, "lr": 0.0009615384615384616, "step": 776, "tokens_trained": 0.381430352 }, { "epoch": 0.22069356783206864, "grad_norm": 52.77407455444336, "loss": 4.5094, "lr": 0.0009612587412587412, "step": 778, "tokens_trained": 0.382416152 }, { "epoch": 0.22126090348202254, "grad_norm": 36.721073150634766, "loss": 4.4536, "lr": 0.000960979020979021, "step": 780, "tokens_trained": 0.383396672 }, { "epoch": 0.22182823913197647, "grad_norm": 51.21247100830078, "loss": 4.4599, "lr": 0.0009606993006993007, "step": 782, "tokens_trained": 0.384380584 }, { "epoch": 0.22239557478193037, "grad_norm": 65.23794555664062, "loss": 4.5397, "lr": 0.0009604195804195805, "step": 784, "tokens_trained": 0.385361368 }, { "epoch": 0.22296291043188426, "grad_norm": 23.255144119262695, "loss": 4.5007, "lr": 0.0009601398601398602, "step": 786, "tokens_trained": 0.386341416 }, { "epoch": 0.22353024608183816, "grad_norm": 30.812740325927734, "loss": 4.5239, "lr": 0.0009598601398601398, "step": 788, "tokens_trained": 0.387324624 }, { "epoch": 0.22409758173179206, "grad_norm": 50.781219482421875, "loss": 4.5131, "lr": 0.0009595804195804196, "step": 790, "tokens_trained": 0.388312744 }, { "epoch": 0.224664917381746, "grad_norm": 47.88816452026367, "loss": 4.4622, "lr": 0.0009593006993006993, "step": 792, "tokens_trained": 0.38929852 }, { "epoch": 0.22523225303169989, "grad_norm": 49.32049560546875, "loss": 4.5053, "lr": 0.0009590209790209791, "step": 794, "tokens_trained": 0.390279792 }, { "epoch": 0.22579958868165378, "grad_norm": 36.98805618286133, "loss": 4.5144, "lr": 0.0009587412587412587, "step": 796, "tokens_trained": 0.391258904 }, { "epoch": 0.22636692433160768, "grad_norm": 24.88475799560547, "loss": 4.4992, "lr": 0.0009584615384615385, "step": 798, "tokens_trained": 0.392238976 }, { "epoch": 0.22693425998156158, "grad_norm": 38.89309310913086, "loss": 4.4853, "lr": 0.0009581818181818182, "step": 800, "tokens_trained": 0.393226312 }, { "epoch": 0.2275015956315155, "grad_norm": 34.86774444580078, "loss": 4.4519, "lr": 0.000957902097902098, "step": 802, "tokens_trained": 0.394206688 }, { "epoch": 0.2280689312814694, "grad_norm": 24.966291427612305, "loss": 4.456, "lr": 0.0009576223776223777, "step": 804, "tokens_trained": 0.395191608 }, { "epoch": 0.2286362669314233, "grad_norm": 12.218213081359863, "loss": 4.4266, "lr": 0.0009573426573426573, "step": 806, "tokens_trained": 0.396174512 }, { "epoch": 0.2292036025813772, "grad_norm": 50.817054748535156, "loss": 4.586, "lr": 0.0009570629370629371, "step": 808, "tokens_trained": 0.397156912 }, { "epoch": 0.2297709382313311, "grad_norm": 37.60087203979492, "loss": 4.4616, "lr": 0.0009567832167832168, "step": 810, "tokens_trained": 0.398140016 }, { "epoch": 0.23033827388128503, "grad_norm": 37.55678176879883, "loss": 4.4755, "lr": 0.0009565034965034966, "step": 812, "tokens_trained": 0.39912384 }, { "epoch": 0.23090560953123893, "grad_norm": 56.427215576171875, "loss": 4.5078, "lr": 0.0009562237762237762, "step": 814, "tokens_trained": 0.400111224 }, { "epoch": 0.23147294518119282, "grad_norm": 31.869827270507812, "loss": 4.5013, "lr": 0.0009559440559440559, "step": 816, "tokens_trained": 0.401094936 }, { "epoch": 0.23204028083114672, "grad_norm": 77.57958984375, "loss": 4.6977, "lr": 0.0009556643356643357, "step": 818, "tokens_trained": 0.402078888 }, { "epoch": 0.23260761648110062, "grad_norm": 52.50204849243164, "loss": 4.5142, "lr": 0.0009553846153846154, "step": 820, "tokens_trained": 0.403059904 }, { "epoch": 0.23317495213105455, "grad_norm": 32.34305191040039, "loss": 4.4828, "lr": 0.0009551048951048952, "step": 822, "tokens_trained": 0.404049848 }, { "epoch": 0.23374228778100845, "grad_norm": 52.08961486816406, "loss": 4.4869, "lr": 0.0009548251748251748, "step": 824, "tokens_trained": 0.405033872 }, { "epoch": 0.23430962343096234, "grad_norm": 44.32194900512695, "loss": 4.4802, "lr": 0.0009545454545454546, "step": 826, "tokens_trained": 0.406017872 }, { "epoch": 0.23487695908091624, "grad_norm": 30.941524505615234, "loss": 4.4323, "lr": 0.0009542657342657343, "step": 828, "tokens_trained": 0.40700704 }, { "epoch": 0.23544429473087014, "grad_norm": 20.52709197998047, "loss": 4.4919, "lr": 0.000953986013986014, "step": 830, "tokens_trained": 0.407991512 }, { "epoch": 0.23601163038082407, "grad_norm": 86.80307006835938, "loss": 4.8228, "lr": 0.0009537062937062937, "step": 832, "tokens_trained": 0.408979272 }, { "epoch": 0.23657896603077797, "grad_norm": 73.71435546875, "loss": 4.5954, "lr": 0.0009534265734265734, "step": 834, "tokens_trained": 0.409962984 }, { "epoch": 0.23714630168073186, "grad_norm": 66.3813247680664, "loss": 4.5969, "lr": 0.0009531468531468532, "step": 836, "tokens_trained": 0.410945248 }, { "epoch": 0.23771363733068576, "grad_norm": 86.94453430175781, "loss": 4.5894, "lr": 0.0009528671328671329, "step": 838, "tokens_trained": 0.411930872 }, { "epoch": 0.23828097298063966, "grad_norm": 61.28915786743164, "loss": 4.5613, "lr": 0.0009525874125874127, "step": 840, "tokens_trained": 0.412912608 }, { "epoch": 0.2388483086305936, "grad_norm": 65.02153778076172, "loss": 4.5398, "lr": 0.0009523076923076923, "step": 842, "tokens_trained": 0.413897488 }, { "epoch": 0.23941564428054748, "grad_norm": 54.01200485229492, "loss": 4.4922, "lr": 0.000952027972027972, "step": 844, "tokens_trained": 0.414872888 }, { "epoch": 0.23998297993050138, "grad_norm": 66.7095718383789, "loss": 4.5317, "lr": 0.0009517482517482518, "step": 846, "tokens_trained": 0.415856296 }, { "epoch": 0.24055031558045528, "grad_norm": 64.23979949951172, "loss": 4.4686, "lr": 0.0009514685314685315, "step": 848, "tokens_trained": 0.416843344 }, { "epoch": 0.24111765123040918, "grad_norm": 51.012840270996094, "loss": 4.4544, "lr": 0.0009511888111888112, "step": 850, "tokens_trained": 0.41782032 }, { "epoch": 0.2416849868803631, "grad_norm": 40.83076095581055, "loss": 4.4665, "lr": 0.0009509090909090909, "step": 852, "tokens_trained": 0.418805672 }, { "epoch": 0.242252322530317, "grad_norm": 48.31489944458008, "loss": 4.4748, "lr": 0.0009506293706293707, "step": 854, "tokens_trained": 0.419786344 }, { "epoch": 0.2428196581802709, "grad_norm": 50.08705520629883, "loss": 4.4973, "lr": 0.0009503496503496504, "step": 856, "tokens_trained": 0.420768872 }, { "epoch": 0.2433869938302248, "grad_norm": 26.840139389038086, "loss": 4.461, "lr": 0.0009500699300699301, "step": 858, "tokens_trained": 0.421750296 }, { "epoch": 0.2439543294801787, "grad_norm": 24.721454620361328, "loss": 4.4246, "lr": 0.0009497902097902098, "step": 860, "tokens_trained": 0.422730976 }, { "epoch": 0.24452166513013263, "grad_norm": 63.147926330566406, "loss": 4.623, "lr": 0.0009495104895104895, "step": 862, "tokens_trained": 0.423715768 }, { "epoch": 0.24508900078008652, "grad_norm": 50.99778747558594, "loss": 4.4663, "lr": 0.0009492307692307693, "step": 864, "tokens_trained": 0.424697072 }, { "epoch": 0.24565633643004042, "grad_norm": 38.0300407409668, "loss": 4.4649, "lr": 0.000948951048951049, "step": 866, "tokens_trained": 0.425681392 }, { "epoch": 0.24622367207999432, "grad_norm": 19.017776489257812, "loss": 4.4296, "lr": 0.0009486713286713286, "step": 868, "tokens_trained": 0.426665088 }, { "epoch": 0.24679100772994822, "grad_norm": 24.02813148498535, "loss": 4.4958, "lr": 0.0009483916083916084, "step": 870, "tokens_trained": 0.427646016 }, { "epoch": 0.24735834337990215, "grad_norm": 59.40018081665039, "loss": 4.5919, "lr": 0.0009481118881118881, "step": 872, "tokens_trained": 0.428628048 }, { "epoch": 0.24792567902985604, "grad_norm": 61.13710403442383, "loss": 4.4642, "lr": 0.0009478321678321679, "step": 874, "tokens_trained": 0.4296112 }, { "epoch": 0.24820934685483298, "eval_loss": 1.1135390996932983, "eval_runtime": 20.4738, "step": 875, "tokens_trained": 0.430109024 }, { "epoch": 0.24849301467980994, "grad_norm": 47.920021057128906, "loss": 4.4832, "lr": 0.0009475524475524476, "step": 876, "tokens_trained": 0.430599208 }, { "epoch": 0.24906035032976384, "grad_norm": 25.661701202392578, "loss": 4.4176, "lr": 0.0009472727272727273, "step": 878, "tokens_trained": 0.43158356 }, { "epoch": 0.24962768597971774, "grad_norm": 32.86565399169922, "loss": 4.405, "lr": 0.000946993006993007, "step": 880, "tokens_trained": 0.432570584 }, { "epoch": 0.25019502162967167, "grad_norm": 23.443584442138672, "loss": 4.4218, "lr": 0.0009467132867132868, "step": 882, "tokens_trained": 0.433557672 }, { "epoch": 0.25076235727962554, "grad_norm": 28.315975189208984, "loss": 4.4019, "lr": 0.0009464335664335665, "step": 884, "tokens_trained": 0.434542736 }, { "epoch": 0.25132969292957946, "grad_norm": 31.056642532348633, "loss": 4.4027, "lr": 0.0009461538461538461, "step": 886, "tokens_trained": 0.43553112 }, { "epoch": 0.2518970285795334, "grad_norm": 13.661805152893066, "loss": 4.3745, "lr": 0.0009458741258741259, "step": 888, "tokens_trained": 0.436511584 }, { "epoch": 0.25246436422948726, "grad_norm": 47.04901885986328, "loss": 4.4875, "lr": 0.0009455944055944056, "step": 890, "tokens_trained": 0.43749464 }, { "epoch": 0.2530316998794412, "grad_norm": 84.91446685791016, "loss": 4.5185, "lr": 0.0009453146853146854, "step": 892, "tokens_trained": 0.43847764 }, { "epoch": 0.25359903552939506, "grad_norm": 40.9110107421875, "loss": 4.5735, "lr": 0.000945034965034965, "step": 894, "tokens_trained": 0.439461496 }, { "epoch": 0.254166371179349, "grad_norm": 58.98877716064453, "loss": 4.5146, "lr": 0.0009447552447552447, "step": 896, "tokens_trained": 0.440443656 }, { "epoch": 0.2547337068293029, "grad_norm": 34.037315368652344, "loss": 4.4714, "lr": 0.0009444755244755245, "step": 898, "tokens_trained": 0.441423496 }, { "epoch": 0.2553010424792568, "grad_norm": 24.91920280456543, "loss": 4.4334, "lr": 0.0009441958041958042, "step": 900, "tokens_trained": 0.442407408 }, { "epoch": 0.2558683781292107, "grad_norm": 30.612323760986328, "loss": 4.4459, "lr": 0.000943916083916084, "step": 902, "tokens_trained": 0.443383464 }, { "epoch": 0.2564357137791646, "grad_norm": 50.595577239990234, "loss": 4.4848, "lr": 0.0009436363636363636, "step": 904, "tokens_trained": 0.4443674 }, { "epoch": 0.2570030494291185, "grad_norm": 41.3300895690918, "loss": 4.4445, "lr": 0.0009433566433566434, "step": 906, "tokens_trained": 0.445346072 }, { "epoch": 0.25757038507907243, "grad_norm": 48.33689880371094, "loss": 4.4058, "lr": 0.0009430769230769231, "step": 908, "tokens_trained": 0.446329872 }, { "epoch": 0.2581377207290263, "grad_norm": 39.081382751464844, "loss": 4.4321, "lr": 0.0009427972027972029, "step": 910, "tokens_trained": 0.447309544 }, { "epoch": 0.2587050563789802, "grad_norm": 62.18062210083008, "loss": 4.4672, "lr": 0.0009425174825174825, "step": 912, "tokens_trained": 0.448295056 }, { "epoch": 0.2592723920289341, "grad_norm": 28.725404739379883, "loss": 4.4786, "lr": 0.0009422377622377622, "step": 914, "tokens_trained": 0.449274208 }, { "epoch": 0.259839727678888, "grad_norm": 47.55582809448242, "loss": 4.4227, "lr": 0.000941958041958042, "step": 916, "tokens_trained": 0.450256408 }, { "epoch": 0.26040706332884195, "grad_norm": 35.743125915527344, "loss": 4.379, "lr": 0.0009416783216783217, "step": 918, "tokens_trained": 0.45123684 }, { "epoch": 0.2609743989787958, "grad_norm": 31.489402770996094, "loss": 4.3888, "lr": 0.0009413986013986015, "step": 920, "tokens_trained": 0.45221748 }, { "epoch": 0.26154173462874974, "grad_norm": 36.46233367919922, "loss": 4.3982, "lr": 0.0009411188811188811, "step": 922, "tokens_trained": 0.453202064 }, { "epoch": 0.2621090702787036, "grad_norm": 41.6457633972168, "loss": 4.385, "lr": 0.0009408391608391608, "step": 924, "tokens_trained": 0.454183456 }, { "epoch": 0.26267640592865754, "grad_norm": 26.52242088317871, "loss": 4.4091, "lr": 0.0009405594405594406, "step": 926, "tokens_trained": 0.455165496 }, { "epoch": 0.26324374157861147, "grad_norm": 14.401509284973145, "loss": 4.3549, "lr": 0.0009402797202797203, "step": 928, "tokens_trained": 0.456150248 }, { "epoch": 0.26381107722856534, "grad_norm": 30.626131057739258, "loss": 4.3325, "lr": 0.00094, "step": 930, "tokens_trained": 0.457134184 }, { "epoch": 0.26437841287851926, "grad_norm": 63.74067687988281, "loss": 4.442, "lr": 0.0009397202797202797, "step": 932, "tokens_trained": 0.458118808 }, { "epoch": 0.26494574852847314, "grad_norm": 12.15156364440918, "loss": 4.4658, "lr": 0.0009394405594405595, "step": 934, "tokens_trained": 0.459103872 }, { "epoch": 0.26551308417842706, "grad_norm": 76.2789306640625, "loss": 4.8153, "lr": 0.0009391608391608392, "step": 936, "tokens_trained": 0.460087216 }, { "epoch": 0.266080419828381, "grad_norm": 63.919334411621094, "loss": 4.5707, "lr": 0.000938881118881119, "step": 938, "tokens_trained": 0.461070568 }, { "epoch": 0.26664775547833486, "grad_norm": 75.1481704711914, "loss": 4.5931, "lr": 0.0009386013986013986, "step": 940, "tokens_trained": 0.462055184 }, { "epoch": 0.2672150911282888, "grad_norm": 33.118961334228516, "loss": 4.4723, "lr": 0.0009383216783216783, "step": 942, "tokens_trained": 0.463034592 }, { "epoch": 0.26778242677824265, "grad_norm": 30.8759765625, "loss": 4.4275, "lr": 0.0009380419580419581, "step": 944, "tokens_trained": 0.464016816 }, { "epoch": 0.2683497624281966, "grad_norm": 41.05061340332031, "loss": 4.4566, "lr": 0.0009377622377622378, "step": 946, "tokens_trained": 0.465000872 }, { "epoch": 0.2689170980781505, "grad_norm": 30.93424415588379, "loss": 4.3985, "lr": 0.0009374825174825175, "step": 948, "tokens_trained": 0.465984096 }, { "epoch": 0.2694844337281044, "grad_norm": 29.477052688598633, "loss": 4.3718, "lr": 0.0009372027972027972, "step": 950, "tokens_trained": 0.466961752 }, { "epoch": 0.2700517693780583, "grad_norm": 21.568912506103516, "loss": 4.3697, "lr": 0.0009369230769230769, "step": 952, "tokens_trained": 0.467950088 }, { "epoch": 0.2706191050280122, "grad_norm": 41.66835021972656, "loss": 4.4241, "lr": 0.0009366433566433567, "step": 954, "tokens_trained": 0.468928736 }, { "epoch": 0.2711864406779661, "grad_norm": 68.04551696777344, "loss": 4.3978, "lr": 0.0009363636363636364, "step": 956, "tokens_trained": 0.469907496 }, { "epoch": 0.27175377632792, "grad_norm": 37.655181884765625, "loss": 4.4497, "lr": 0.0009360839160839161, "step": 958, "tokens_trained": 0.470889168 }, { "epoch": 0.2723211119778739, "grad_norm": 22.074953079223633, "loss": 4.3918, "lr": 0.0009358041958041958, "step": 960, "tokens_trained": 0.471871816 }, { "epoch": 0.2728884476278278, "grad_norm": 49.925777435302734, "loss": 4.4745, "lr": 0.0009355244755244755, "step": 962, "tokens_trained": 0.472856728 }, { "epoch": 0.2734557832777817, "grad_norm": 46.520851135253906, "loss": 4.403, "lr": 0.0009352447552447553, "step": 964, "tokens_trained": 0.473838544 }, { "epoch": 0.2740231189277356, "grad_norm": 25.053146362304688, "loss": 4.4247, "lr": 0.0009349650349650349, "step": 966, "tokens_trained": 0.474819976 }, { "epoch": 0.27459045457768955, "grad_norm": 30.127140045166016, "loss": 4.3834, "lr": 0.0009346853146853147, "step": 968, "tokens_trained": 0.475800696 }, { "epoch": 0.2751577902276434, "grad_norm": 41.478328704833984, "loss": 4.3978, "lr": 0.0009344055944055944, "step": 970, "tokens_trained": 0.4767834 }, { "epoch": 0.27572512587759734, "grad_norm": 23.739456176757812, "loss": 4.3698, "lr": 0.0009341258741258742, "step": 972, "tokens_trained": 0.47776944 }, { "epoch": 0.2762924615275512, "grad_norm": 21.813220977783203, "loss": 4.3902, "lr": 0.0009338461538461539, "step": 974, "tokens_trained": 0.478757048 }, { "epoch": 0.27685979717750514, "grad_norm": 64.79598999023438, "loss": 4.5237, "lr": 0.0009335664335664336, "step": 976, "tokens_trained": 0.47973872 }, { "epoch": 0.27742713282745907, "grad_norm": 68.32705688476562, "loss": 4.4461, "lr": 0.0009332867132867133, "step": 978, "tokens_trained": 0.480721912 }, { "epoch": 0.27799446847741294, "grad_norm": 41.857582092285156, "loss": 4.4663, "lr": 0.0009330069930069929, "step": 980, "tokens_trained": 0.481704248 }, { "epoch": 0.27856180412736686, "grad_norm": 28.30609893798828, "loss": 4.3461, "lr": 0.0009327272727272728, "step": 982, "tokens_trained": 0.482689768 }, { "epoch": 0.27912913977732073, "grad_norm": 33.207950592041016, "loss": 4.4185, "lr": 0.0009324475524475524, "step": 984, "tokens_trained": 0.483670008 }, { "epoch": 0.27969647542727466, "grad_norm": 29.541227340698242, "loss": 4.388, "lr": 0.0009321678321678322, "step": 986, "tokens_trained": 0.48465836 }, { "epoch": 0.2802638110772286, "grad_norm": 16.23346710205078, "loss": 4.3219, "lr": 0.0009318881118881119, "step": 988, "tokens_trained": 0.4856402 }, { "epoch": 0.28083114672718246, "grad_norm": 20.036178588867188, "loss": 4.3273, "lr": 0.0009316083916083917, "step": 990, "tokens_trained": 0.486621648 }, { "epoch": 0.2813984823771364, "grad_norm": 49.25468063354492, "loss": 4.4649, "lr": 0.0009313286713286714, "step": 992, "tokens_trained": 0.48760744 }, { "epoch": 0.28196581802709025, "grad_norm": 48.59744644165039, "loss": 4.3979, "lr": 0.000931048951048951, "step": 994, "tokens_trained": 0.488590472 }, { "epoch": 0.2825331536770442, "grad_norm": 16.33649253845215, "loss": 4.3945, "lr": 0.0009307692307692308, "step": 996, "tokens_trained": 0.489570976 }, { "epoch": 0.2831004893269981, "grad_norm": 60.632591247558594, "loss": 4.5581, "lr": 0.0009304895104895104, "step": 998, "tokens_trained": 0.490552296 }, { "epoch": 0.283667824976952, "grad_norm": 52.75735092163086, "loss": 4.424, "lr": 0.0009302097902097903, "step": 1000, "tokens_trained": 0.49153744 }, { "epoch": 0.283667824976952, "eval_loss": 1.1363450288772583, "eval_runtime": 20.7491, "step": 1000, "tokens_trained": 0.49153744 }, { "epoch": 0.2842351606269059, "grad_norm": 20.506614685058594, "loss": 4.4241, "lr": 0.0009299300699300699, "step": 1002, "tokens_trained": 0.492522608 }, { "epoch": 0.2848024962768598, "grad_norm": 23.148601531982422, "loss": 4.3975, "lr": 0.0009296503496503497, "step": 1004, "tokens_trained": 0.493501384 }, { "epoch": 0.2853698319268137, "grad_norm": 9.550869941711426, "loss": 4.3952, "lr": 0.0009293706293706294, "step": 1006, "tokens_trained": 0.494482544 }, { "epoch": 0.2859371675767676, "grad_norm": 80.31155395507812, "loss": 4.7614, "lr": 0.0009290909090909091, "step": 1008, "tokens_trained": 0.495459416 }, { "epoch": 0.2865045032267215, "grad_norm": 61.021026611328125, "loss": 4.4396, "lr": 0.0009288111888111889, "step": 1010, "tokens_trained": 0.4964418 }, { "epoch": 0.2870718388766754, "grad_norm": 35.23258972167969, "loss": 4.5548, "lr": 0.0009285314685314685, "step": 1012, "tokens_trained": 0.497428288 }, { "epoch": 0.2876391745266293, "grad_norm": 36.45478057861328, "loss": 4.46, "lr": 0.0009282517482517483, "step": 1014, "tokens_trained": 0.498416832 }, { "epoch": 0.2882065101765832, "grad_norm": 46.622982025146484, "loss": 4.3554, "lr": 0.0009279720279720279, "step": 1016, "tokens_trained": 0.499399792 }, { "epoch": 0.28877384582653715, "grad_norm": 87.00289154052734, "loss": 4.5276, "lr": 0.0009276923076923078, "step": 1018, "tokens_trained": 0.500383776 }, { "epoch": 0.289341181476491, "grad_norm": 11.444964408874512, "loss": 4.5483, "lr": 0.0009274125874125874, "step": 1020, "tokens_trained": 0.50136468 }, { "epoch": 0.28990851712644494, "grad_norm": 89.05914306640625, "loss": 4.8957, "lr": 0.0009271328671328671, "step": 1022, "tokens_trained": 0.50235172 }, { "epoch": 0.2904758527763988, "grad_norm": 26.915477752685547, "loss": 4.6184, "lr": 0.0009268531468531469, "step": 1024, "tokens_trained": 0.50333208 }, { "epoch": 0.29104318842635274, "grad_norm": 44.32100296020508, "loss": 4.5263, "lr": 0.0009265734265734266, "step": 1026, "tokens_trained": 0.504314656 }, { "epoch": 0.29161052407630667, "grad_norm": 26.699670791625977, "loss": 4.3871, "lr": 0.0009262937062937064, "step": 1028, "tokens_trained": 0.505296568 }, { "epoch": 0.29217785972626054, "grad_norm": 27.469482421875, "loss": 4.3558, "lr": 0.000926013986013986, "step": 1030, "tokens_trained": 0.506280416 }, { "epoch": 0.29274519537621446, "grad_norm": 26.149612426757812, "loss": 4.3368, "lr": 0.0009257342657342658, "step": 1032, "tokens_trained": 0.507261224 }, { "epoch": 0.29331253102616833, "grad_norm": 8.754459381103516, "loss": 4.3447, "lr": 0.0009254545454545454, "step": 1034, "tokens_trained": 0.508243288 }, { "epoch": 0.29387986667612226, "grad_norm": 32.17164611816406, "loss": 4.4174, "lr": 0.0009251748251748252, "step": 1036, "tokens_trained": 0.509224176 }, { "epoch": 0.2944472023260762, "grad_norm": 41.17238235473633, "loss": 4.4221, "lr": 0.0009248951048951049, "step": 1038, "tokens_trained": 0.510203568 }, { "epoch": 0.29501453797603006, "grad_norm": 44.97213363647461, "loss": 4.3594, "lr": 0.0009246153846153846, "step": 1040, "tokens_trained": 0.511186464 }, { "epoch": 0.295581873625984, "grad_norm": 42.23421859741211, "loss": 4.4159, "lr": 0.0009243356643356644, "step": 1042, "tokens_trained": 0.51216944 }, { "epoch": 0.29614920927593785, "grad_norm": 36.13594436645508, "loss": 4.4105, "lr": 0.0009240559440559441, "step": 1044, "tokens_trained": 0.513153144 }, { "epoch": 0.2967165449258918, "grad_norm": 36.89309310913086, "loss": 4.3947, "lr": 0.0009237762237762239, "step": 1046, "tokens_trained": 0.51413388 }, { "epoch": 0.2972838805758457, "grad_norm": 58.599700927734375, "loss": 4.3988, "lr": 0.0009234965034965035, "step": 1048, "tokens_trained": 0.515119288 }, { "epoch": 0.2978512162257996, "grad_norm": 13.725994110107422, "loss": 4.412, "lr": 0.0009232167832167832, "step": 1050, "tokens_trained": 0.51610284 }, { "epoch": 0.2984185518757535, "grad_norm": 105.28518676757812, "loss": 4.7305, "lr": 0.0009229370629370629, "step": 1052, "tokens_trained": 0.517085576 }, { "epoch": 0.2989858875257074, "grad_norm": 29.499713897705078, "loss": 4.5106, "lr": 0.0009226573426573427, "step": 1054, "tokens_trained": 0.518064224 }, { "epoch": 0.2995532231756613, "grad_norm": 60.907203674316406, "loss": 4.5249, "lr": 0.0009223776223776224, "step": 1056, "tokens_trained": 0.51905084 }, { "epoch": 0.3001205588256152, "grad_norm": 39.825069427490234, "loss": 4.3695, "lr": 0.0009220979020979021, "step": 1058, "tokens_trained": 0.5200318 }, { "epoch": 0.3006878944755691, "grad_norm": 42.77061462402344, "loss": 4.4094, "lr": 0.0009218181818181819, "step": 1060, "tokens_trained": 0.521013568 }, { "epoch": 0.301255230125523, "grad_norm": 37.05888748168945, "loss": 4.3684, "lr": 0.0009215384615384616, "step": 1062, "tokens_trained": 0.521997624 }, { "epoch": 0.3018225657754769, "grad_norm": 42.28252029418945, "loss": 4.3489, "lr": 0.0009212587412587413, "step": 1064, "tokens_trained": 0.522986184 }, { "epoch": 0.3023899014254308, "grad_norm": 40.95197677612305, "loss": 4.3564, "lr": 0.000920979020979021, "step": 1066, "tokens_trained": 0.523970984 }, { "epoch": 0.30295723707538474, "grad_norm": 25.469568252563477, "loss": 4.3833, "lr": 0.0009206993006993007, "step": 1068, "tokens_trained": 0.524952808 }, { "epoch": 0.3035245727253386, "grad_norm": 29.921735763549805, "loss": 4.3579, "lr": 0.0009204195804195804, "step": 1070, "tokens_trained": 0.525935696 }, { "epoch": 0.30409190837529254, "grad_norm": 26.038026809692383, "loss": 4.2898, "lr": 0.0009201398601398602, "step": 1072, "tokens_trained": 0.526916904 }, { "epoch": 0.3046592440252464, "grad_norm": 32.59503936767578, "loss": 4.3335, "lr": 0.0009198601398601398, "step": 1074, "tokens_trained": 0.527899864 }, { "epoch": 0.30522657967520034, "grad_norm": 14.04964828491211, "loss": 4.3171, "lr": 0.0009195804195804196, "step": 1076, "tokens_trained": 0.528878176 }, { "epoch": 0.30579391532515426, "grad_norm": 15.936906814575195, "loss": 4.3005, "lr": 0.0009193006993006993, "step": 1078, "tokens_trained": 0.529859952 }, { "epoch": 0.30636125097510813, "grad_norm": 9.73235034942627, "loss": 4.3287, "lr": 0.0009190209790209791, "step": 1080, "tokens_trained": 0.530838192 }, { "epoch": 0.30692858662506206, "grad_norm": 45.44027328491211, "loss": 4.4384, "lr": 0.0009187412587412588, "step": 1082, "tokens_trained": 0.531818376 }, { "epoch": 0.30749592227501593, "grad_norm": 55.65925598144531, "loss": 4.3772, "lr": 0.0009184615384615385, "step": 1084, "tokens_trained": 0.532802048 }, { "epoch": 0.30806325792496986, "grad_norm": 33.47093200683594, "loss": 4.4257, "lr": 0.0009181818181818182, "step": 1086, "tokens_trained": 0.533785376 }, { "epoch": 0.3086305935749238, "grad_norm": 39.709224700927734, "loss": 4.4177, "lr": 0.0009179020979020978, "step": 1088, "tokens_trained": 0.5347698 }, { "epoch": 0.30919792922487765, "grad_norm": 34.25212097167969, "loss": 4.3518, "lr": 0.0009176223776223777, "step": 1090, "tokens_trained": 0.53575108 }, { "epoch": 0.3097652648748316, "grad_norm": 29.156312942504883, "loss": 4.3596, "lr": 0.0009173426573426573, "step": 1092, "tokens_trained": 0.536735544 }, { "epoch": 0.31033260052478545, "grad_norm": 31.714128494262695, "loss": 4.3736, "lr": 0.0009170629370629371, "step": 1094, "tokens_trained": 0.537718008 }, { "epoch": 0.3108999361747394, "grad_norm": 12.244729042053223, "loss": 4.3472, "lr": 0.0009167832167832168, "step": 1096, "tokens_trained": 0.538693512 }, { "epoch": 0.3114672718246933, "grad_norm": 10.271063804626465, "loss": 4.301, "lr": 0.0009165034965034966, "step": 1098, "tokens_trained": 0.539681376 }, { "epoch": 0.3120346074746472, "grad_norm": 35.79754638671875, "loss": 4.3912, "lr": 0.0009162237762237763, "step": 1100, "tokens_trained": 0.540661392 }, { "epoch": 0.3126019431246011, "grad_norm": 24.1260986328125, "loss": 4.3303, "lr": 0.0009159440559440559, "step": 1102, "tokens_trained": 0.541646968 }, { "epoch": 0.31316927877455497, "grad_norm": 24.501169204711914, "loss": 4.3205, "lr": 0.0009156643356643357, "step": 1104, "tokens_trained": 0.542629392 }, { "epoch": 0.3137366144245089, "grad_norm": 17.031600952148438, "loss": 4.2521, "lr": 0.0009153846153846153, "step": 1106, "tokens_trained": 0.54361348 }, { "epoch": 0.3143039500744628, "grad_norm": 19.506216049194336, "loss": 4.3225, "lr": 0.0009151048951048952, "step": 1108, "tokens_trained": 0.544595336 }, { "epoch": 0.3148712857244167, "grad_norm": 20.822546005249023, "loss": 4.2711, "lr": 0.0009148251748251748, "step": 1110, "tokens_trained": 0.545578256 }, { "epoch": 0.3154386213743706, "grad_norm": 29.967998504638672, "loss": 4.2868, "lr": 0.0009145454545454546, "step": 1112, "tokens_trained": 0.546561024 }, { "epoch": 0.3160059570243245, "grad_norm": 24.06121063232422, "loss": 4.2701, "lr": 0.0009142657342657343, "step": 1114, "tokens_trained": 0.547544616 }, { "epoch": 0.3165732926742784, "grad_norm": 15.868765830993652, "loss": 4.3233, "lr": 0.000913986013986014, "step": 1116, "tokens_trained": 0.548526216 }, { "epoch": 0.31714062832423234, "grad_norm": 27.47897720336914, "loss": 4.2813, "lr": 0.0009137062937062938, "step": 1118, "tokens_trained": 0.549506544 }, { "epoch": 0.3177079639741862, "grad_norm": 15.343204498291016, "loss": 4.3002, "lr": 0.0009134265734265734, "step": 1120, "tokens_trained": 0.550488496 }, { "epoch": 0.31827529962414014, "grad_norm": 4.320124626159668, "loss": 4.2622, "lr": 0.0009131468531468532, "step": 1122, "tokens_trained": 0.551471792 }, { "epoch": 0.318842635274094, "grad_norm": 34.520050048828125, "loss": 4.366, "lr": 0.0009128671328671328, "step": 1124, "tokens_trained": 0.552457008 }, { "epoch": 0.319126303099071, "eval_loss": 1.096465826034546, "eval_runtime": 20.7643, "step": 1125, "tokens_trained": 0.552948064 }, { "epoch": 0.31940997092404794, "grad_norm": 39.718719482421875, "loss": 4.3317, "lr": 0.0009125874125874127, "step": 1126, "tokens_trained": 0.5534394 }, { "epoch": 0.31997730657400186, "grad_norm": 20.843252182006836, "loss": 4.3883, "lr": 0.0009123076923076923, "step": 1128, "tokens_trained": 0.554419184 }, { "epoch": 0.32054464222395573, "grad_norm": 12.916360855102539, "loss": 4.3119, "lr": 0.000912027972027972, "step": 1130, "tokens_trained": 0.555401952 }, { "epoch": 0.32111197787390966, "grad_norm": 48.54426956176758, "loss": 4.4155, "lr": 0.0009117482517482518, "step": 1132, "tokens_trained": 0.556385024 }, { "epoch": 0.32167931352386353, "grad_norm": 41.00883483886719, "loss": 4.362, "lr": 0.0009114685314685315, "step": 1134, "tokens_trained": 0.557368472 }, { "epoch": 0.32224664917381746, "grad_norm": 28.0487060546875, "loss": 4.3504, "lr": 0.0009111888111888113, "step": 1136, "tokens_trained": 0.55835288 }, { "epoch": 0.3228139848237714, "grad_norm": 22.05229377746582, "loss": 4.331, "lr": 0.0009109090909090909, "step": 1138, "tokens_trained": 0.559337064 }, { "epoch": 0.32338132047372525, "grad_norm": 16.770631790161133, "loss": 4.3008, "lr": 0.0009106293706293707, "step": 1140, "tokens_trained": 0.560317984 }, { "epoch": 0.3239486561236792, "grad_norm": 35.300262451171875, "loss": 4.4083, "lr": 0.0009103496503496503, "step": 1142, "tokens_trained": 0.561299688 }, { "epoch": 0.32451599177363305, "grad_norm": 23.788284301757812, "loss": 4.2772, "lr": 0.0009100699300699301, "step": 1144, "tokens_trained": 0.562285664 }, { "epoch": 0.325083327423587, "grad_norm": 23.085710525512695, "loss": 4.3185, "lr": 0.0009097902097902098, "step": 1146, "tokens_trained": 0.563267832 }, { "epoch": 0.3256506630735409, "grad_norm": 13.11314582824707, "loss": 4.2711, "lr": 0.0009095104895104895, "step": 1148, "tokens_trained": 0.564248928 }, { "epoch": 0.3262179987234948, "grad_norm": 31.297805786132812, "loss": 4.3096, "lr": 0.0009092307692307692, "step": 1150, "tokens_trained": 0.56522952 }, { "epoch": 0.3267853343734487, "grad_norm": 11.668539047241211, "loss": 4.2667, "lr": 0.000908951048951049, "step": 1152, "tokens_trained": 0.566212392 }, { "epoch": 0.32735267002340257, "grad_norm": 23.359189987182617, "loss": 4.3156, "lr": 0.0009086713286713288, "step": 1154, "tokens_trained": 0.567192216 }, { "epoch": 0.3279200056733565, "grad_norm": 31.09916114807129, "loss": 4.3367, "lr": 0.0009083916083916084, "step": 1156, "tokens_trained": 0.568177088 }, { "epoch": 0.3284873413233104, "grad_norm": 24.03261947631836, "loss": 4.3504, "lr": 0.0009081118881118881, "step": 1158, "tokens_trained": 0.56915868 }, { "epoch": 0.3290546769732643, "grad_norm": 16.029443740844727, "loss": 4.3192, "lr": 0.0009078321678321678, "step": 1160, "tokens_trained": 0.570142976 }, { "epoch": 0.3296220126232182, "grad_norm": 53.486724853515625, "loss": 4.3921, "lr": 0.0009075524475524476, "step": 1162, "tokens_trained": 0.57112748 }, { "epoch": 0.3301893482731721, "grad_norm": 37.42267608642578, "loss": 4.2821, "lr": 0.0009072727272727273, "step": 1164, "tokens_trained": 0.57211356 }, { "epoch": 0.330756683923126, "grad_norm": 28.862472534179688, "loss": 4.3002, "lr": 0.000906993006993007, "step": 1166, "tokens_trained": 0.57309492 }, { "epoch": 0.33132401957307994, "grad_norm": 22.26299476623535, "loss": 4.2729, "lr": 0.0009067132867132866, "step": 1168, "tokens_trained": 0.5740806 }, { "epoch": 0.3318913552230338, "grad_norm": 21.635013580322266, "loss": 4.2866, "lr": 0.0009064335664335665, "step": 1170, "tokens_trained": 0.575061664 }, { "epoch": 0.33245869087298774, "grad_norm": 18.995012283325195, "loss": 4.2814, "lr": 0.0009061538461538462, "step": 1172, "tokens_trained": 0.576046304 }, { "epoch": 0.3330260265229416, "grad_norm": 22.621299743652344, "loss": 4.2739, "lr": 0.0009058741258741259, "step": 1174, "tokens_trained": 0.577032376 }, { "epoch": 0.33359336217289554, "grad_norm": 21.758216857910156, "loss": 4.263, "lr": 0.0009055944055944056, "step": 1176, "tokens_trained": 0.578013896 }, { "epoch": 0.33416069782284946, "grad_norm": 32.38374710083008, "loss": 4.2713, "lr": 0.0009053146853146853, "step": 1178, "tokens_trained": 0.57900508 }, { "epoch": 0.33472803347280333, "grad_norm": 35.57462692260742, "loss": 4.2986, "lr": 0.0009050349650349651, "step": 1180, "tokens_trained": 0.57999512 }, { "epoch": 0.33529536912275726, "grad_norm": 11.77812385559082, "loss": 4.3085, "lr": 0.0009047552447552448, "step": 1182, "tokens_trained": 0.580982752 }, { "epoch": 0.33586270477271113, "grad_norm": 51.48725509643555, "loss": 4.4003, "lr": 0.0009044755244755245, "step": 1184, "tokens_trained": 0.581964936 }, { "epoch": 0.33643004042266506, "grad_norm": 47.01481628417969, "loss": 4.3182, "lr": 0.0009041958041958041, "step": 1186, "tokens_trained": 0.582949944 }, { "epoch": 0.336997376072619, "grad_norm": 22.935691833496094, "loss": 4.3432, "lr": 0.000903916083916084, "step": 1188, "tokens_trained": 0.583934776 }, { "epoch": 0.33756471172257285, "grad_norm": 45.21054458618164, "loss": 4.4674, "lr": 0.0009036363636363637, "step": 1190, "tokens_trained": 0.584918344 }, { "epoch": 0.3381320473725268, "grad_norm": 27.012706756591797, "loss": 4.2889, "lr": 0.0009033566433566434, "step": 1192, "tokens_trained": 0.585897632 }, { "epoch": 0.33869938302248065, "grad_norm": 16.68247413635254, "loss": 4.2896, "lr": 0.0009030769230769231, "step": 1194, "tokens_trained": 0.586879408 }, { "epoch": 0.3392667186724346, "grad_norm": 20.664148330688477, "loss": 4.304, "lr": 0.0009027972027972027, "step": 1196, "tokens_trained": 0.587859392 }, { "epoch": 0.3398340543223885, "grad_norm": 22.954742431640625, "loss": 4.2853, "lr": 0.0009025174825174826, "step": 1198, "tokens_trained": 0.588845408 }, { "epoch": 0.34040138997234237, "grad_norm": 23.226943969726562, "loss": 4.2597, "lr": 0.0009022377622377622, "step": 1200, "tokens_trained": 0.589832736 }, { "epoch": 0.3409687256222963, "grad_norm": 7.963059902191162, "loss": 4.261, "lr": 0.000901958041958042, "step": 1202, "tokens_trained": 0.590816568 }, { "epoch": 0.34153606127225017, "grad_norm": 25.160730361938477, "loss": 4.3288, "lr": 0.0009016783216783216, "step": 1204, "tokens_trained": 0.59179692 }, { "epoch": 0.3421033969222041, "grad_norm": 38.45030212402344, "loss": 4.3371, "lr": 0.0009013986013986014, "step": 1206, "tokens_trained": 0.592780968 }, { "epoch": 0.342670732572158, "grad_norm": 52.66873550415039, "loss": 4.2805, "lr": 0.0009011188811188812, "step": 1208, "tokens_trained": 0.593760896 }, { "epoch": 0.3432380682221119, "grad_norm": 28.104921340942383, "loss": 4.3885, "lr": 0.0009008391608391609, "step": 1210, "tokens_trained": 0.59474304 }, { "epoch": 0.3438054038720658, "grad_norm": 49.20989990234375, "loss": 4.346, "lr": 0.0009005594405594406, "step": 1212, "tokens_trained": 0.59572768 }, { "epoch": 0.3443727395220197, "grad_norm": 20.652427673339844, "loss": 4.2368, "lr": 0.0009002797202797202, "step": 1214, "tokens_trained": 0.59671092 }, { "epoch": 0.3449400751719736, "grad_norm": 17.821596145629883, "loss": 4.3041, "lr": 0.0009000000000000001, "step": 1216, "tokens_trained": 0.597697344 }, { "epoch": 0.34550741082192754, "grad_norm": 48.594932556152344, "loss": 4.3668, "lr": 0.0008997202797202797, "step": 1218, "tokens_trained": 0.598677288 }, { "epoch": 0.3460747464718814, "grad_norm": 27.70078468322754, "loss": 4.2939, "lr": 0.0008994405594405595, "step": 1220, "tokens_trained": 0.599662488 }, { "epoch": 0.34664208212183534, "grad_norm": 25.498798370361328, "loss": 4.2891, "lr": 0.0008991608391608391, "step": 1222, "tokens_trained": 0.600646904 }, { "epoch": 0.3472094177717892, "grad_norm": 13.455835342407227, "loss": 4.2881, "lr": 0.0008988811188811188, "step": 1224, "tokens_trained": 0.601628112 }, { "epoch": 0.34777675342174313, "grad_norm": 17.518342971801758, "loss": 4.2977, "lr": 0.0008986013986013987, "step": 1226, "tokens_trained": 0.602612336 }, { "epoch": 0.34834408907169706, "grad_norm": 20.642597198486328, "loss": 4.2921, "lr": 0.0008983216783216783, "step": 1228, "tokens_trained": 0.603595 }, { "epoch": 0.34891142472165093, "grad_norm": 14.464616775512695, "loss": 4.233, "lr": 0.0008980419580419581, "step": 1230, "tokens_trained": 0.604576592 }, { "epoch": 0.34947876037160486, "grad_norm": 13.204504013061523, "loss": 4.2707, "lr": 0.0008977622377622377, "step": 1232, "tokens_trained": 0.60555656 }, { "epoch": 0.35004609602155873, "grad_norm": 12.241665840148926, "loss": 4.2506, "lr": 0.0008974825174825176, "step": 1234, "tokens_trained": 0.606536024 }, { "epoch": 0.35061343167151265, "grad_norm": 18.187660217285156, "loss": 4.2659, "lr": 0.0008972027972027972, "step": 1236, "tokens_trained": 0.607522576 }, { "epoch": 0.3511807673214666, "grad_norm": 8.911888122558594, "loss": 4.2505, "lr": 0.000896923076923077, "step": 1238, "tokens_trained": 0.608507736 }, { "epoch": 0.35174810297142045, "grad_norm": 21.351713180541992, "loss": 4.2291, "lr": 0.0008966433566433566, "step": 1240, "tokens_trained": 0.609486688 }, { "epoch": 0.3523154386213744, "grad_norm": 47.81566619873047, "loss": 4.2725, "lr": 0.0008963636363636363, "step": 1242, "tokens_trained": 0.610470272 }, { "epoch": 0.35288277427132825, "grad_norm": 33.53351974487305, "loss": 4.3237, "lr": 0.0008960839160839162, "step": 1244, "tokens_trained": 0.611455176 }, { "epoch": 0.3534501099212822, "grad_norm": 15.252607345581055, "loss": 4.2868, "lr": 0.0008958041958041958, "step": 1246, "tokens_trained": 0.612437888 }, { "epoch": 0.3540174455712361, "grad_norm": 24.129865646362305, "loss": 4.2626, "lr": 0.0008955244755244756, "step": 1248, "tokens_trained": 0.613420728 }, { "epoch": 0.35458478122118997, "grad_norm": 34.814605712890625, "loss": 4.2627, "lr": 0.0008952447552447552, "step": 1250, "tokens_trained": 0.614405904 }, { "epoch": 0.35458478122118997, "eval_loss": 1.078355312347412, "eval_runtime": 20.4723, "step": 1250, "tokens_trained": 0.614405904 }, { "epoch": 0.3551521168711439, "grad_norm": 18.26809310913086, "loss": 4.2986, "lr": 0.000894965034965035, "step": 1252, "tokens_trained": 0.615386288 }, { "epoch": 0.35571945252109777, "grad_norm": 24.68335723876953, "loss": 4.3146, "lr": 0.0008946853146853147, "step": 1254, "tokens_trained": 0.616370576 }, { "epoch": 0.3562867881710517, "grad_norm": 35.34586715698242, "loss": 4.2905, "lr": 0.0008944055944055944, "step": 1256, "tokens_trained": 0.617351944 }, { "epoch": 0.3568541238210056, "grad_norm": 22.668407440185547, "loss": 4.2607, "lr": 0.0008941258741258741, "step": 1258, "tokens_trained": 0.618334816 }, { "epoch": 0.3574214594709595, "grad_norm": 14.068164825439453, "loss": 4.2459, "lr": 0.0008938461538461538, "step": 1260, "tokens_trained": 0.619319736 }, { "epoch": 0.3579887951209134, "grad_norm": 8.274995803833008, "loss": 4.2713, "lr": 0.0008935664335664337, "step": 1262, "tokens_trained": 0.620299344 }, { "epoch": 0.3585561307708673, "grad_norm": 22.12897491455078, "loss": 4.2841, "lr": 0.0008932867132867133, "step": 1264, "tokens_trained": 0.621282592 }, { "epoch": 0.3591234664208212, "grad_norm": 26.171052932739258, "loss": 4.2505, "lr": 0.000893006993006993, "step": 1266, "tokens_trained": 0.622266136 }, { "epoch": 0.35969080207077514, "grad_norm": 14.768603324890137, "loss": 4.271, "lr": 0.0008927272727272727, "step": 1268, "tokens_trained": 0.623247816 }, { "epoch": 0.360258137720729, "grad_norm": 13.065408706665039, "loss": 4.2387, "lr": 0.0008924475524475525, "step": 1270, "tokens_trained": 0.624234848 }, { "epoch": 0.36082547337068294, "grad_norm": 14.043888092041016, "loss": 4.2601, "lr": 0.0008921678321678322, "step": 1272, "tokens_trained": 0.625214176 }, { "epoch": 0.3613928090206368, "grad_norm": 13.734328269958496, "loss": 4.2426, "lr": 0.0008918881118881119, "step": 1274, "tokens_trained": 0.626197608 }, { "epoch": 0.36196014467059073, "grad_norm": 10.075374603271484, "loss": 4.2259, "lr": 0.0008916083916083916, "step": 1276, "tokens_trained": 0.62717884 }, { "epoch": 0.36252748032054466, "grad_norm": 33.92001724243164, "loss": 4.3054, "lr": 0.0008913286713286713, "step": 1278, "tokens_trained": 0.628166888 }, { "epoch": 0.36309481597049853, "grad_norm": 31.1391544342041, "loss": 4.3066, "lr": 0.0008910489510489512, "step": 1280, "tokens_trained": 0.629152528 }, { "epoch": 0.36366215162045246, "grad_norm": 10.888711929321289, "loss": 4.2348, "lr": 0.0008907692307692308, "step": 1282, "tokens_trained": 0.630132584 }, { "epoch": 0.3642294872704063, "grad_norm": 27.298410415649414, "loss": 4.3225, "lr": 0.0008904895104895105, "step": 1284, "tokens_trained": 0.63111212 }, { "epoch": 0.36479682292036025, "grad_norm": 23.396818161010742, "loss": 4.3177, "lr": 0.0008902097902097902, "step": 1286, "tokens_trained": 0.632094984 }, { "epoch": 0.3653641585703142, "grad_norm": 18.824432373046875, "loss": 4.2235, "lr": 0.00088993006993007, "step": 1288, "tokens_trained": 0.633076832 }, { "epoch": 0.36593149422026805, "grad_norm": 8.04826545715332, "loss": 4.2268, "lr": 0.0008896503496503497, "step": 1290, "tokens_trained": 0.63405868 }, { "epoch": 0.366498829870222, "grad_norm": 32.26673889160156, "loss": 4.3113, "lr": 0.0008893706293706294, "step": 1292, "tokens_trained": 0.635045096 }, { "epoch": 0.36706616552017585, "grad_norm": 29.91358184814453, "loss": 4.2971, "lr": 0.000889090909090909, "step": 1294, "tokens_trained": 0.63603008 }, { "epoch": 0.3676335011701298, "grad_norm": 12.093538284301758, "loss": 4.2502, "lr": 0.0008888111888111888, "step": 1296, "tokens_trained": 0.637014016 }, { "epoch": 0.3682008368200837, "grad_norm": 8.252509117126465, "loss": 4.2905, "lr": 0.0008885314685314686, "step": 1298, "tokens_trained": 0.637997752 }, { "epoch": 0.36876817247003757, "grad_norm": 61.22240447998047, "loss": 4.4753, "lr": 0.0008882517482517483, "step": 1300, "tokens_trained": 0.638981552 }, { "epoch": 0.3693355081199915, "grad_norm": 47.58195877075195, "loss": 4.2769, "lr": 0.000887972027972028, "step": 1302, "tokens_trained": 0.639963512 }, { "epoch": 0.36990284376994537, "grad_norm": 28.806411743164062, "loss": 4.3728, "lr": 0.0008876923076923077, "step": 1304, "tokens_trained": 0.640948392 }, { "epoch": 0.3704701794198993, "grad_norm": 38.960853576660156, "loss": 4.338, "lr": 0.0008874125874125875, "step": 1306, "tokens_trained": 0.641935304 }, { "epoch": 0.3710375150698532, "grad_norm": 25.05726432800293, "loss": 4.3002, "lr": 0.0008871328671328671, "step": 1308, "tokens_trained": 0.642924168 }, { "epoch": 0.3716048507198071, "grad_norm": 39.84127426147461, "loss": 4.3593, "lr": 0.0008868531468531469, "step": 1310, "tokens_trained": 0.64390412 }, { "epoch": 0.372172186369761, "grad_norm": 15.03055191040039, "loss": 4.223, "lr": 0.0008865734265734265, "step": 1312, "tokens_trained": 0.644882104 }, { "epoch": 0.3727395220197149, "grad_norm": 41.85628890991211, "loss": 4.3819, "lr": 0.0008862937062937063, "step": 1314, "tokens_trained": 0.645866912 }, { "epoch": 0.3733068576696688, "grad_norm": 29.014118194580078, "loss": 4.2843, "lr": 0.0008860139860139861, "step": 1316, "tokens_trained": 0.646850376 }, { "epoch": 0.37387419331962274, "grad_norm": 24.407743453979492, "loss": 4.2598, "lr": 0.0008857342657342658, "step": 1318, "tokens_trained": 0.647832272 }, { "epoch": 0.3744415289695766, "grad_norm": 23.28154182434082, "loss": 4.2162, "lr": 0.0008854545454545455, "step": 1320, "tokens_trained": 0.64881652 }, { "epoch": 0.37500886461953054, "grad_norm": 17.70418930053711, "loss": 4.2386, "lr": 0.0008851748251748251, "step": 1322, "tokens_trained": 0.649794936 }, { "epoch": 0.37557620026948446, "grad_norm": 22.582124710083008, "loss": 4.2358, "lr": 0.000884895104895105, "step": 1324, "tokens_trained": 0.650777784 }, { "epoch": 0.37614353591943833, "grad_norm": 16.77848243713379, "loss": 4.2536, "lr": 0.0008846153846153846, "step": 1326, "tokens_trained": 0.651762472 }, { "epoch": 0.37671087156939226, "grad_norm": 14.382417678833008, "loss": 4.2403, "lr": 0.0008843356643356644, "step": 1328, "tokens_trained": 0.652741832 }, { "epoch": 0.37727820721934613, "grad_norm": 22.420886993408203, "loss": 4.1977, "lr": 0.000884055944055944, "step": 1330, "tokens_trained": 0.653725792 }, { "epoch": 0.37784554286930006, "grad_norm": 9.768660545349121, "loss": 4.2148, "lr": 0.0008837762237762238, "step": 1332, "tokens_trained": 0.654704648 }, { "epoch": 0.378412878519254, "grad_norm": 5.091487407684326, "loss": 4.2062, "lr": 0.0008834965034965036, "step": 1334, "tokens_trained": 0.65569176 }, { "epoch": 0.37898021416920785, "grad_norm": 53.520957946777344, "loss": 4.4082, "lr": 0.0008832167832167832, "step": 1336, "tokens_trained": 0.656679344 }, { "epoch": 0.3795475498191618, "grad_norm": 32.17420959472656, "loss": 4.2911, "lr": 0.000882937062937063, "step": 1338, "tokens_trained": 0.657665136 }, { "epoch": 0.38011488546911565, "grad_norm": 14.12790584564209, "loss": 4.2899, "lr": 0.0008826573426573426, "step": 1340, "tokens_trained": 0.658651576 }, { "epoch": 0.3806822211190696, "grad_norm": 51.74199676513672, "loss": 4.3901, "lr": 0.0008823776223776225, "step": 1342, "tokens_trained": 0.659631792 }, { "epoch": 0.3812495567690235, "grad_norm": 48.99909973144531, "loss": 4.298, "lr": 0.0008820979020979021, "step": 1344, "tokens_trained": 0.660616912 }, { "epoch": 0.38181689241897737, "grad_norm": 28.356245040893555, "loss": 4.3171, "lr": 0.0008818181818181819, "step": 1346, "tokens_trained": 0.66159872 }, { "epoch": 0.3823842280689313, "grad_norm": 45.081703186035156, "loss": 4.3067, "lr": 0.0008815384615384615, "step": 1348, "tokens_trained": 0.662582152 }, { "epoch": 0.38295156371888517, "grad_norm": 37.175052642822266, "loss": 4.241, "lr": 0.0008812587412587412, "step": 1350, "tokens_trained": 0.663561176 }, { "epoch": 0.3835188993688391, "grad_norm": 49.46076965332031, "loss": 4.2896, "lr": 0.0008809790209790211, "step": 1352, "tokens_trained": 0.664545144 }, { "epoch": 0.384086235018793, "grad_norm": 22.20182991027832, "loss": 4.323, "lr": 0.0008806993006993007, "step": 1354, "tokens_trained": 0.66553092 }, { "epoch": 0.3846535706687469, "grad_norm": 34.111549377441406, "loss": 4.3138, "lr": 0.0008804195804195805, "step": 1356, "tokens_trained": 0.666517568 }, { "epoch": 0.3852209063187008, "grad_norm": 47.01582336425781, "loss": 4.3009, "lr": 0.0008801398601398601, "step": 1358, "tokens_trained": 0.667498192 }, { "epoch": 0.3857882419686547, "grad_norm": 18.845388412475586, "loss": 4.3176, "lr": 0.00087986013986014, "step": 1360, "tokens_trained": 0.668479008 }, { "epoch": 0.3863555776186086, "grad_norm": 53.68927764892578, "loss": 4.4024, "lr": 0.0008795804195804196, "step": 1362, "tokens_trained": 0.669462472 }, { "epoch": 0.38692291326856254, "grad_norm": 29.88358497619629, "loss": 4.286, "lr": 0.0008793006993006993, "step": 1364, "tokens_trained": 0.67044392 }, { "epoch": 0.3874902489185164, "grad_norm": 11.12879753112793, "loss": 4.3024, "lr": 0.000879020979020979, "step": 1366, "tokens_trained": 0.671424552 }, { "epoch": 0.38805758456847034, "grad_norm": 23.573301315307617, "loss": 4.2662, "lr": 0.0008787412587412587, "step": 1368, "tokens_trained": 0.672409992 }, { "epoch": 0.3886249202184242, "grad_norm": 24.749160766601562, "loss": 4.274, "lr": 0.0008784615384615386, "step": 1370, "tokens_trained": 0.67339824 }, { "epoch": 0.38919225586837813, "grad_norm": 33.26881408691406, "loss": 4.2588, "lr": 0.0008781818181818182, "step": 1372, "tokens_trained": 0.67438204 }, { "epoch": 0.38975959151833206, "grad_norm": 24.466472625732422, "loss": 4.2837, "lr": 0.000877902097902098, "step": 1374, "tokens_trained": 0.67536356 }, { "epoch": 0.39004325934330897, "eval_loss": 1.0616238117218018, "eval_runtime": 20.3698, "step": 1375, "tokens_trained": 0.675855672 }, { "epoch": 0.39032692716828593, "grad_norm": 24.48844337463379, "loss": 4.259, "lr": 0.0008776223776223776, "step": 1376, "tokens_trained": 0.676346368 }, { "epoch": 0.39089426281823986, "grad_norm": 30.594989776611328, "loss": 4.1894, "lr": 0.0008773426573426574, "step": 1378, "tokens_trained": 0.677329312 }, { "epoch": 0.3914615984681937, "grad_norm": 19.835350036621094, "loss": 4.2718, "lr": 0.0008770629370629371, "step": 1380, "tokens_trained": 0.678312272 }, { "epoch": 0.39202893411814765, "grad_norm": 14.570358276367188, "loss": 4.2419, "lr": 0.0008767832167832168, "step": 1382, "tokens_trained": 0.679291216 }, { "epoch": 0.3925962697681016, "grad_norm": 11.608271598815918, "loss": 4.1917, "lr": 0.0008765034965034965, "step": 1384, "tokens_trained": 0.680273296 }, { "epoch": 0.39316360541805545, "grad_norm": 26.094860076904297, "loss": 4.2762, "lr": 0.0008762237762237762, "step": 1386, "tokens_trained": 0.681249464 }, { "epoch": 0.3937309410680094, "grad_norm": 12.754049301147461, "loss": 4.2032, "lr": 0.0008759440559440561, "step": 1388, "tokens_trained": 0.682234168 }, { "epoch": 0.39429827671796325, "grad_norm": 5.951663970947266, "loss": 4.1921, "lr": 0.0008756643356643357, "step": 1390, "tokens_trained": 0.683217176 }, { "epoch": 0.3948656123679172, "grad_norm": 26.907669067382812, "loss": 4.24, "lr": 0.0008753846153846154, "step": 1392, "tokens_trained": 0.68419888 }, { "epoch": 0.3954329480178711, "grad_norm": 25.04796600341797, "loss": 4.2656, "lr": 0.0008751048951048951, "step": 1394, "tokens_trained": 0.685178784 }, { "epoch": 0.39600028366782497, "grad_norm": 19.600811004638672, "loss": 4.2683, "lr": 0.0008748251748251749, "step": 1396, "tokens_trained": 0.686161632 }, { "epoch": 0.3965676193177789, "grad_norm": 14.087088584899902, "loss": 4.2658, "lr": 0.0008745454545454546, "step": 1398, "tokens_trained": 0.687139992 }, { "epoch": 0.39713495496773277, "grad_norm": 9.257765769958496, "loss": 4.2021, "lr": 0.0008742657342657343, "step": 1400, "tokens_trained": 0.688117912 }, { "epoch": 0.3977022906176867, "grad_norm": 18.830154418945312, "loss": 4.2249, "lr": 0.0008739860139860139, "step": 1402, "tokens_trained": 0.689098776 }, { "epoch": 0.3982696262676406, "grad_norm": 24.81566619873047, "loss": 4.246, "lr": 0.0008737062937062937, "step": 1404, "tokens_trained": 0.690085432 }, { "epoch": 0.3988369619175945, "grad_norm": 14.071616172790527, "loss": 4.2531, "lr": 0.0008734265734265734, "step": 1406, "tokens_trained": 0.691069232 }, { "epoch": 0.3994042975675484, "grad_norm": 21.414424896240234, "loss": 4.2192, "lr": 0.0008731468531468532, "step": 1408, "tokens_trained": 0.692051224 }, { "epoch": 0.3999716332175023, "grad_norm": 38.74683380126953, "loss": 4.2421, "lr": 0.0008728671328671329, "step": 1410, "tokens_trained": 0.693029976 }, { "epoch": 0.4005389688674562, "grad_norm": 12.595442771911621, "loss": 4.2569, "lr": 0.0008725874125874126, "step": 1412, "tokens_trained": 0.694013304 }, { "epoch": 0.40110630451741014, "grad_norm": 55.233673095703125, "loss": 4.3422, "lr": 0.0008723076923076924, "step": 1414, "tokens_trained": 0.694997536 }, { "epoch": 0.401673640167364, "grad_norm": 24.717113494873047, "loss": 4.2567, "lr": 0.000872027972027972, "step": 1416, "tokens_trained": 0.695982632 }, { "epoch": 0.40224097581731794, "grad_norm": 20.552875518798828, "loss": 4.2464, "lr": 0.0008717482517482518, "step": 1418, "tokens_trained": 0.696966408 }, { "epoch": 0.4028083114672718, "grad_norm": 25.569900512695312, "loss": 4.21, "lr": 0.0008714685314685314, "step": 1420, "tokens_trained": 0.697948224 }, { "epoch": 0.40337564711722573, "grad_norm": 24.538320541381836, "loss": 4.2605, "lr": 0.0008711888111888112, "step": 1422, "tokens_trained": 0.698934688 }, { "epoch": 0.40394298276717966, "grad_norm": 9.585651397705078, "loss": 4.2524, "lr": 0.0008709090909090909, "step": 1424, "tokens_trained": 0.699921976 }, { "epoch": 0.40451031841713353, "grad_norm": 11.886672973632812, "loss": 4.1934, "lr": 0.0008706293706293707, "step": 1426, "tokens_trained": 0.70090396 }, { "epoch": 0.40507765406708746, "grad_norm": 26.162124633789062, "loss": 4.2412, "lr": 0.0008703496503496504, "step": 1428, "tokens_trained": 0.701888448 }, { "epoch": 0.4056449897170413, "grad_norm": 5.03931188583374, "loss": 4.202, "lr": 0.00087006993006993, "step": 1430, "tokens_trained": 0.702864336 }, { "epoch": 0.40621232536699525, "grad_norm": 33.67579650878906, "loss": 4.3087, "lr": 0.0008697902097902099, "step": 1432, "tokens_trained": 0.703847784 }, { "epoch": 0.4067796610169492, "grad_norm": 34.38542556762695, "loss": 4.2807, "lr": 0.0008695104895104895, "step": 1434, "tokens_trained": 0.704827288 }, { "epoch": 0.40734699666690305, "grad_norm": 13.319886207580566, "loss": 4.3332, "lr": 0.0008692307692307693, "step": 1436, "tokens_trained": 0.705815392 }, { "epoch": 0.407914332316857, "grad_norm": 36.58311080932617, "loss": 4.3318, "lr": 0.0008689510489510489, "step": 1438, "tokens_trained": 0.7067914 }, { "epoch": 0.40848166796681085, "grad_norm": 29.63648223876953, "loss": 4.2962, "lr": 0.0008686713286713287, "step": 1440, "tokens_trained": 0.70777396 }, { "epoch": 0.4090490036167648, "grad_norm": 9.55128002166748, "loss": 4.2773, "lr": 0.0008683916083916084, "step": 1442, "tokens_trained": 0.708750496 }, { "epoch": 0.4096163392667187, "grad_norm": 53.83981704711914, "loss": 4.3875, "lr": 0.0008681118881118881, "step": 1444, "tokens_trained": 0.709730168 }, { "epoch": 0.41018367491667257, "grad_norm": 54.59236526489258, "loss": 4.3582, "lr": 0.0008678321678321679, "step": 1446, "tokens_trained": 0.710709704 }, { "epoch": 0.4107510105666265, "grad_norm": 13.964411735534668, "loss": 4.3065, "lr": 0.0008675524475524475, "step": 1448, "tokens_trained": 0.711690136 }, { "epoch": 0.41131834621658037, "grad_norm": 25.506649017333984, "loss": 4.2686, "lr": 0.0008672727272727273, "step": 1450, "tokens_trained": 0.712668056 }, { "epoch": 0.4118856818665343, "grad_norm": 21.1628360748291, "loss": 4.2485, "lr": 0.000866993006993007, "step": 1452, "tokens_trained": 0.71365004 }, { "epoch": 0.4124530175164882, "grad_norm": 15.751238822937012, "loss": 4.2078, "lr": 0.0008667132867132868, "step": 1454, "tokens_trained": 0.714632032 }, { "epoch": 0.4130203531664421, "grad_norm": 15.838552474975586, "loss": 4.1944, "lr": 0.0008664335664335664, "step": 1456, "tokens_trained": 0.715611376 }, { "epoch": 0.413587688816396, "grad_norm": 15.968609809875488, "loss": 4.1768, "lr": 0.0008661538461538461, "step": 1458, "tokens_trained": 0.716591112 }, { "epoch": 0.4141550244663499, "grad_norm": 15.419891357421875, "loss": 4.1978, "lr": 0.0008658741258741259, "step": 1460, "tokens_trained": 0.717575952 }, { "epoch": 0.4147223601163038, "grad_norm": 15.088132858276367, "loss": 4.2361, "lr": 0.0008655944055944056, "step": 1462, "tokens_trained": 0.718563696 }, { "epoch": 0.41528969576625774, "grad_norm": 4.839190483093262, "loss": 4.2089, "lr": 0.0008653146853146854, "step": 1464, "tokens_trained": 0.71954848 }, { "epoch": 0.4158570314162116, "grad_norm": 22.192466735839844, "loss": 4.2109, "lr": 0.000865034965034965, "step": 1466, "tokens_trained": 0.720533304 }, { "epoch": 0.41642436706616553, "grad_norm": 28.983531951904297, "loss": 4.2402, "lr": 0.0008647552447552448, "step": 1468, "tokens_trained": 0.721518176 }, { "epoch": 0.4169917027161194, "grad_norm": 21.010780334472656, "loss": 4.1732, "lr": 0.0008644755244755245, "step": 1470, "tokens_trained": 0.72250176 }, { "epoch": 0.41755903836607333, "grad_norm": 14.59277057647705, "loss": 4.1847, "lr": 0.0008641958041958042, "step": 1472, "tokens_trained": 0.723486664 }, { "epoch": 0.41812637401602726, "grad_norm": 13.688531875610352, "loss": 4.1577, "lr": 0.0008639160839160839, "step": 1474, "tokens_trained": 0.724469328 }, { "epoch": 0.41869370966598113, "grad_norm": 15.879347801208496, "loss": 4.1721, "lr": 0.0008636363636363636, "step": 1476, "tokens_trained": 0.725454968 }, { "epoch": 0.41926104531593505, "grad_norm": 10.225201606750488, "loss": 4.1999, "lr": 0.0008633566433566434, "step": 1478, "tokens_trained": 0.7264426 }, { "epoch": 0.4198283809658889, "grad_norm": 17.007728576660156, "loss": 4.2229, "lr": 0.0008630769230769231, "step": 1480, "tokens_trained": 0.727422056 }, { "epoch": 0.42039571661584285, "grad_norm": 13.517934799194336, "loss": 4.2241, "lr": 0.0008627972027972029, "step": 1482, "tokens_trained": 0.728403688 }, { "epoch": 0.4209630522657968, "grad_norm": 17.132064819335938, "loss": 4.1679, "lr": 0.0008625174825174825, "step": 1484, "tokens_trained": 0.729386248 }, { "epoch": 0.42153038791575065, "grad_norm": 19.782320022583008, "loss": 4.1817, "lr": 0.0008622377622377622, "step": 1486, "tokens_trained": 0.730368752 }, { "epoch": 0.4220977235657046, "grad_norm": 3.388552188873291, "loss": 4.1726, "lr": 0.000861958041958042, "step": 1488, "tokens_trained": 0.731354304 }, { "epoch": 0.42266505921565845, "grad_norm": 28.33499526977539, "loss": 4.2623, "lr": 0.0008616783216783217, "step": 1490, "tokens_trained": 0.732337296 }, { "epoch": 0.42323239486561237, "grad_norm": 24.927406311035156, "loss": 4.2422, "lr": 0.0008613986013986014, "step": 1492, "tokens_trained": 0.733319824 }, { "epoch": 0.4237997305155663, "grad_norm": 25.996028900146484, "loss": 4.2227, "lr": 0.0008611188811188811, "step": 1494, "tokens_trained": 0.73430636 }, { "epoch": 0.42436706616552017, "grad_norm": 14.625783920288086, "loss": 4.2268, "lr": 0.0008608391608391609, "step": 1496, "tokens_trained": 0.735285848 }, { "epoch": 0.4249344018154741, "grad_norm": 12.556640625, "loss": 4.2352, "lr": 0.0008605594405594406, "step": 1498, "tokens_trained": 0.736270632 }, { "epoch": 0.42550173746542796, "grad_norm": 18.579416275024414, "loss": 4.2377, "lr": 0.0008602797202797203, "step": 1500, "tokens_trained": 0.737255104 }, { "epoch": 0.42550173746542796, "eval_loss": 1.052606463432312, "eval_runtime": 20.5089, "step": 1500, "tokens_trained": 0.737255104 }, { "epoch": 0.4260690731153819, "grad_norm": 16.550657272338867, "loss": 4.182, "lr": 0.00086, "step": 1502, "tokens_trained": 0.738240848 }, { "epoch": 0.4266364087653358, "grad_norm": 24.4381046295166, "loss": 4.2093, "lr": 0.0008597202797202797, "step": 1504, "tokens_trained": 0.73922592 }, { "epoch": 0.4272037444152897, "grad_norm": 13.155163764953613, "loss": 4.239, "lr": 0.0008594405594405595, "step": 1506, "tokens_trained": 0.740208896 }, { "epoch": 0.4277710800652436, "grad_norm": 27.667949676513672, "loss": 4.2607, "lr": 0.0008591608391608392, "step": 1508, "tokens_trained": 0.741189312 }, { "epoch": 0.4283384157151975, "grad_norm": 35.897743225097656, "loss": 4.2153, "lr": 0.0008588811188811188, "step": 1510, "tokens_trained": 0.742170456 }, { "epoch": 0.4289057513651514, "grad_norm": 18.16407012939453, "loss": 4.2753, "lr": 0.0008586013986013986, "step": 1512, "tokens_trained": 0.743152504 }, { "epoch": 0.42947308701510534, "grad_norm": 27.447364807128906, "loss": 4.2321, "lr": 0.0008583216783216783, "step": 1514, "tokens_trained": 0.744139768 }, { "epoch": 0.4300404226650592, "grad_norm": 21.115859985351562, "loss": 4.2048, "lr": 0.0008580419580419581, "step": 1516, "tokens_trained": 0.745122368 }, { "epoch": 0.43060775831501313, "grad_norm": 5.949585914611816, "loss": 4.1787, "lr": 0.0008577622377622378, "step": 1518, "tokens_trained": 0.746104936 }, { "epoch": 0.431175093964967, "grad_norm": 6.631585121154785, "loss": 4.2035, "lr": 0.0008574825174825175, "step": 1520, "tokens_trained": 0.747086264 }, { "epoch": 0.43174242961492093, "grad_norm": 38.91585159301758, "loss": 4.354, "lr": 0.0008572027972027972, "step": 1522, "tokens_trained": 0.74806844 }, { "epoch": 0.43230976526487486, "grad_norm": 37.53727722167969, "loss": 4.228, "lr": 0.000856923076923077, "step": 1524, "tokens_trained": 0.749052432 }, { "epoch": 0.4328771009148287, "grad_norm": 19.87713623046875, "loss": 4.2696, "lr": 0.0008566433566433567, "step": 1526, "tokens_trained": 0.750037072 }, { "epoch": 0.43344443656478265, "grad_norm": 25.615995407104492, "loss": 4.2676, "lr": 0.0008563636363636363, "step": 1528, "tokens_trained": 0.751020584 }, { "epoch": 0.4340117722147365, "grad_norm": 16.643299102783203, "loss": 4.201, "lr": 0.0008560839160839161, "step": 1530, "tokens_trained": 0.75200224 }, { "epoch": 0.43457910786469045, "grad_norm": 16.207853317260742, "loss": 4.1944, "lr": 0.0008558041958041958, "step": 1532, "tokens_trained": 0.752981624 }, { "epoch": 0.4351464435146444, "grad_norm": 27.054973602294922, "loss": 4.2188, "lr": 0.0008555244755244756, "step": 1534, "tokens_trained": 0.753968464 }, { "epoch": 0.43571377916459825, "grad_norm": 33.468238830566406, "loss": 4.2052, "lr": 0.0008552447552447553, "step": 1536, "tokens_trained": 0.754950976 }, { "epoch": 0.4362811148145522, "grad_norm": 21.083576202392578, "loss": 4.2514, "lr": 0.000854965034965035, "step": 1538, "tokens_trained": 0.755938272 }, { "epoch": 0.43684845046450604, "grad_norm": 19.927122116088867, "loss": 4.2493, "lr": 0.0008546853146853147, "step": 1540, "tokens_trained": 0.756916784 }, { "epoch": 0.43741578611445997, "grad_norm": 22.105287551879883, "loss": 4.2264, "lr": 0.0008544055944055944, "step": 1542, "tokens_trained": 0.757901152 }, { "epoch": 0.4379831217644139, "grad_norm": 22.448705673217773, "loss": 4.1987, "lr": 0.0008541258741258742, "step": 1544, "tokens_trained": 0.758886048 }, { "epoch": 0.43855045741436777, "grad_norm": 17.740005493164062, "loss": 4.1918, "lr": 0.0008538461538461538, "step": 1546, "tokens_trained": 0.759864304 }, { "epoch": 0.4391177930643217, "grad_norm": 20.58041763305664, "loss": 4.2144, "lr": 0.0008535664335664336, "step": 1548, "tokens_trained": 0.760844312 }, { "epoch": 0.43968512871427556, "grad_norm": 21.937252044677734, "loss": 4.2129, "lr": 0.0008532867132867133, "step": 1550, "tokens_trained": 0.761827256 }, { "epoch": 0.4402524643642295, "grad_norm": 26.883426666259766, "loss": 4.2244, "lr": 0.000853006993006993, "step": 1552, "tokens_trained": 0.7628098 }, { "epoch": 0.4408198000141834, "grad_norm": 10.297266960144043, "loss": 4.1724, "lr": 0.0008527272727272728, "step": 1554, "tokens_trained": 0.763792488 }, { "epoch": 0.4413871356641373, "grad_norm": 12.119601249694824, "loss": 4.1828, "lr": 0.0008524475524475524, "step": 1556, "tokens_trained": 0.764769936 }, { "epoch": 0.4419544713140912, "grad_norm": 16.565885543823242, "loss": 4.2113, "lr": 0.0008521678321678322, "step": 1558, "tokens_trained": 0.765752376 }, { "epoch": 0.4425218069640451, "grad_norm": 18.860309600830078, "loss": 4.1864, "lr": 0.0008518881118881119, "step": 1560, "tokens_trained": 0.766736256 }, { "epoch": 0.443089142613999, "grad_norm": 4.049737453460693, "loss": 4.2108, "lr": 0.0008516083916083917, "step": 1562, "tokens_trained": 0.767720568 }, { "epoch": 0.44365647826395294, "grad_norm": 15.730945587158203, "loss": 4.2339, "lr": 0.0008513286713286713, "step": 1564, "tokens_trained": 0.768701288 }, { "epoch": 0.4442238139139068, "grad_norm": 18.64398956298828, "loss": 4.2132, "lr": 0.000851048951048951, "step": 1566, "tokens_trained": 0.769681336 }, { "epoch": 0.44479114956386073, "grad_norm": 22.01759147644043, "loss": 4.2211, "lr": 0.0008507692307692308, "step": 1568, "tokens_trained": 0.770661168 }, { "epoch": 0.4453584852138146, "grad_norm": 3.097306489944458, "loss": 4.2114, "lr": 0.0008504895104895105, "step": 1570, "tokens_trained": 0.7716424 }, { "epoch": 0.44592582086376853, "grad_norm": 35.901546478271484, "loss": 4.3, "lr": 0.0008502097902097903, "step": 1572, "tokens_trained": 0.772627536 }, { "epoch": 0.44649315651372246, "grad_norm": 20.762710571289062, "loss": 4.2465, "lr": 0.0008499300699300699, "step": 1574, "tokens_trained": 0.77361008 }, { "epoch": 0.4470604921636763, "grad_norm": 13.54304027557373, "loss": 4.221, "lr": 0.0008496503496503497, "step": 1576, "tokens_trained": 0.774591184 }, { "epoch": 0.44762782781363025, "grad_norm": 18.83641242980957, "loss": 4.2228, "lr": 0.0008493706293706294, "step": 1578, "tokens_trained": 0.775574136 }, { "epoch": 0.4481951634635841, "grad_norm": 12.294941902160645, "loss": 4.1768, "lr": 0.0008490909090909091, "step": 1580, "tokens_trained": 0.776554752 }, { "epoch": 0.44876249911353805, "grad_norm": 5.768923759460449, "loss": 4.2255, "lr": 0.0008488111888111888, "step": 1582, "tokens_trained": 0.777539368 }, { "epoch": 0.449329834763492, "grad_norm": 7.9961137771606445, "loss": 4.2218, "lr": 0.0008485314685314685, "step": 1584, "tokens_trained": 0.778522344 }, { "epoch": 0.44989717041344585, "grad_norm": 22.005645751953125, "loss": 4.2452, "lr": 0.0008482517482517483, "step": 1586, "tokens_trained": 0.77950768 }, { "epoch": 0.45046450606339977, "grad_norm": 27.313426971435547, "loss": 4.1875, "lr": 0.000847972027972028, "step": 1588, "tokens_trained": 0.780490984 }, { "epoch": 0.45103184171335364, "grad_norm": 10.344687461853027, "loss": 4.2356, "lr": 0.0008476923076923078, "step": 1590, "tokens_trained": 0.781469 }, { "epoch": 0.45159917736330757, "grad_norm": 27.348726272583008, "loss": 4.2962, "lr": 0.0008474125874125874, "step": 1592, "tokens_trained": 0.782450304 }, { "epoch": 0.4521665130132615, "grad_norm": 32.965911865234375, "loss": 4.2736, "lr": 0.0008471328671328671, "step": 1594, "tokens_trained": 0.783431416 }, { "epoch": 0.45273384866321537, "grad_norm": 7.752636909484863, "loss": 4.2074, "lr": 0.0008468531468531469, "step": 1596, "tokens_trained": 0.784409568 }, { "epoch": 0.4533011843131693, "grad_norm": 38.85223388671875, "loss": 4.3261, "lr": 0.0008465734265734266, "step": 1598, "tokens_trained": 0.785399368 }, { "epoch": 0.45386851996312316, "grad_norm": 38.017967224121094, "loss": 4.2646, "lr": 0.0008462937062937063, "step": 1600, "tokens_trained": 0.786376072 }, { "epoch": 0.4544358556130771, "grad_norm": 7.856576442718506, "loss": 4.191, "lr": 0.000846013986013986, "step": 1602, "tokens_trained": 0.787362072 }, { "epoch": 0.455003191263031, "grad_norm": 37.902870178222656, "loss": 4.2651, "lr": 0.0008457342657342658, "step": 1604, "tokens_trained": 0.788345104 }, { "epoch": 0.4555705269129849, "grad_norm": 7.724793434143066, "loss": 4.1994, "lr": 0.0008454545454545455, "step": 1606, "tokens_trained": 0.7893314 }, { "epoch": 0.4561378625629388, "grad_norm": 26.484699249267578, "loss": 4.2276, "lr": 0.0008451748251748252, "step": 1608, "tokens_trained": 0.790309344 }, { "epoch": 0.4567051982128927, "grad_norm": 23.137874603271484, "loss": 4.2082, "lr": 0.0008448951048951049, "step": 1610, "tokens_trained": 0.791295784 }, { "epoch": 0.4572725338628466, "grad_norm": 13.902606964111328, "loss": 4.2035, "lr": 0.0008446153846153846, "step": 1612, "tokens_trained": 0.79228076 }, { "epoch": 0.45783986951280053, "grad_norm": 8.438498497009277, "loss": 4.1713, "lr": 0.0008443356643356644, "step": 1614, "tokens_trained": 0.793265456 }, { "epoch": 0.4584072051627544, "grad_norm": 11.60899829864502, "loss": 4.1971, "lr": 0.0008440559440559441, "step": 1616, "tokens_trained": 0.794245896 }, { "epoch": 0.45897454081270833, "grad_norm": 19.33312225341797, "loss": 4.2328, "lr": 0.0008437762237762238, "step": 1618, "tokens_trained": 0.795229016 }, { "epoch": 0.4595418764626622, "grad_norm": 16.45014190673828, "loss": 4.2277, "lr": 0.0008434965034965035, "step": 1620, "tokens_trained": 0.79620792 }, { "epoch": 0.46010921211261613, "grad_norm": 9.818867683410645, "loss": 4.1494, "lr": 0.0008432167832167832, "step": 1622, "tokens_trained": 0.797192352 }, { "epoch": 0.46067654776257005, "grad_norm": 7.920058250427246, "loss": 4.2027, "lr": 0.000842937062937063, "step": 1624, "tokens_trained": 0.798174104 }, { "epoch": 0.46096021558754696, "eval_loss": 1.044265627861023, "eval_runtime": 20.5617, "step": 1625, "tokens_trained": 0.798668072 }, { "epoch": 0.4612438834125239, "grad_norm": 10.734235763549805, "loss": 4.1505, "lr": 0.0008426573426573427, "step": 1626, "tokens_trained": 0.799160304 }, { "epoch": 0.46181121906247785, "grad_norm": 23.376392364501953, "loss": 4.195, "lr": 0.0008423776223776224, "step": 1628, "tokens_trained": 0.800144144 }, { "epoch": 0.4623785547124317, "grad_norm": 23.567371368408203, "loss": 4.2367, "lr": 0.0008420979020979021, "step": 1630, "tokens_trained": 0.801131184 }, { "epoch": 0.46294589036238565, "grad_norm": 19.271820068359375, "loss": 4.1899, "lr": 0.0008418181818181819, "step": 1632, "tokens_trained": 0.802111296 }, { "epoch": 0.4635132260123396, "grad_norm": 17.468698501586914, "loss": 4.1941, "lr": 0.0008415384615384616, "step": 1634, "tokens_trained": 0.803095112 }, { "epoch": 0.46408056166229344, "grad_norm": 22.298749923706055, "loss": 4.2083, "lr": 0.0008412587412587412, "step": 1636, "tokens_trained": 0.804080456 }, { "epoch": 0.46464789731224737, "grad_norm": 12.506179809570312, "loss": 4.1953, "lr": 0.000840979020979021, "step": 1638, "tokens_trained": 0.805062464 }, { "epoch": 0.46521523296220124, "grad_norm": 11.819656372070312, "loss": 4.2047, "lr": 0.0008406993006993006, "step": 1640, "tokens_trained": 0.806045504 }, { "epoch": 0.46578256861215517, "grad_norm": 15.925740242004395, "loss": 4.1565, "lr": 0.0008404195804195805, "step": 1642, "tokens_trained": 0.80702736 }, { "epoch": 0.4663499042621091, "grad_norm": 15.869892120361328, "loss": 4.2134, "lr": 0.0008401398601398602, "step": 1644, "tokens_trained": 0.808009192 }, { "epoch": 0.46691723991206296, "grad_norm": 10.851021766662598, "loss": 4.2041, "lr": 0.0008398601398601399, "step": 1646, "tokens_trained": 0.808994728 }, { "epoch": 0.4674845755620169, "grad_norm": 8.271230697631836, "loss": 4.1739, "lr": 0.0008395804195804196, "step": 1648, "tokens_trained": 0.809976448 }, { "epoch": 0.46805191121197076, "grad_norm": 13.768092155456543, "loss": 4.1761, "lr": 0.0008393006993006993, "step": 1650, "tokens_trained": 0.810958392 }, { "epoch": 0.4686192468619247, "grad_norm": 7.760485649108887, "loss": 4.1826, "lr": 0.0008390209790209791, "step": 1652, "tokens_trained": 0.81194136 }, { "epoch": 0.4691865825118786, "grad_norm": 13.28488540649414, "loss": 4.1659, "lr": 0.0008387412587412587, "step": 1654, "tokens_trained": 0.812924984 }, { "epoch": 0.4697539181618325, "grad_norm": 10.466367721557617, "loss": 4.1432, "lr": 0.0008384615384615385, "step": 1656, "tokens_trained": 0.813907424 }, { "epoch": 0.4703212538117864, "grad_norm": 15.40854549407959, "loss": 4.1625, "lr": 0.0008381818181818181, "step": 1658, "tokens_trained": 0.814888712 }, { "epoch": 0.4708885894617403, "grad_norm": 20.580612182617188, "loss": 4.1636, "lr": 0.000837902097902098, "step": 1660, "tokens_trained": 0.815869152 }, { "epoch": 0.4714559251116942, "grad_norm": 14.908403396606445, "loss": 4.1763, "lr": 0.0008376223776223776, "step": 1662, "tokens_trained": 0.816852664 }, { "epoch": 0.47202326076164813, "grad_norm": 10.217529296875, "loss": 4.1934, "lr": 0.0008373426573426573, "step": 1664, "tokens_trained": 0.817832792 }, { "epoch": 0.472590596411602, "grad_norm": 15.74150276184082, "loss": 4.1714, "lr": 0.0008370629370629371, "step": 1666, "tokens_trained": 0.81881728 }, { "epoch": 0.47315793206155593, "grad_norm": 15.39499282836914, "loss": 4.2005, "lr": 0.0008367832167832168, "step": 1668, "tokens_trained": 0.819800824 }, { "epoch": 0.4737252677115098, "grad_norm": 11.585809707641602, "loss": 4.136, "lr": 0.0008365034965034966, "step": 1670, "tokens_trained": 0.8207856 }, { "epoch": 0.4742926033614637, "grad_norm": 16.053237915039062, "loss": 4.1827, "lr": 0.0008362237762237762, "step": 1672, "tokens_trained": 0.821766576 }, { "epoch": 0.47485993901141765, "grad_norm": 9.23779582977295, "loss": 4.1159, "lr": 0.000835944055944056, "step": 1674, "tokens_trained": 0.822749696 }, { "epoch": 0.4754272746613715, "grad_norm": 11.395891189575195, "loss": 4.17, "lr": 0.0008356643356643356, "step": 1676, "tokens_trained": 0.82373032 }, { "epoch": 0.47599461031132545, "grad_norm": 17.745365142822266, "loss": 4.1696, "lr": 0.0008353846153846154, "step": 1678, "tokens_trained": 0.824712192 }, { "epoch": 0.4765619459612793, "grad_norm": 6.7816572189331055, "loss": 4.1933, "lr": 0.0008351048951048951, "step": 1680, "tokens_trained": 0.825691208 }, { "epoch": 0.47712928161123325, "grad_norm": 20.552772521972656, "loss": 4.1625, "lr": 0.0008348251748251748, "step": 1682, "tokens_trained": 0.826672584 }, { "epoch": 0.4776966172611872, "grad_norm": 21.632352828979492, "loss": 4.2061, "lr": 0.0008345454545454546, "step": 1684, "tokens_trained": 0.827654368 }, { "epoch": 0.47826395291114104, "grad_norm": 17.754596710205078, "loss": 4.222, "lr": 0.0008342657342657343, "step": 1686, "tokens_trained": 0.828639392 }, { "epoch": 0.47883128856109497, "grad_norm": 20.73906707763672, "loss": 4.1679, "lr": 0.0008339860139860141, "step": 1688, "tokens_trained": 0.829627232 }, { "epoch": 0.47939862421104884, "grad_norm": 28.157238006591797, "loss": 4.1658, "lr": 0.0008337062937062937, "step": 1690, "tokens_trained": 0.830610904 }, { "epoch": 0.47996595986100277, "grad_norm": 12.728020668029785, "loss": 4.1892, "lr": 0.0008334265734265734, "step": 1692, "tokens_trained": 0.831602544 }, { "epoch": 0.4805332955109567, "grad_norm": 20.21622657775879, "loss": 4.1453, "lr": 0.0008331468531468531, "step": 1694, "tokens_trained": 0.832584656 }, { "epoch": 0.48110063116091056, "grad_norm": 18.5329647064209, "loss": 4.2145, "lr": 0.0008328671328671329, "step": 1696, "tokens_trained": 0.833570472 }, { "epoch": 0.4816679668108645, "grad_norm": 12.47617244720459, "loss": 4.1944, "lr": 0.0008325874125874126, "step": 1698, "tokens_trained": 0.834556104 }, { "epoch": 0.48223530246081836, "grad_norm": 21.34851837158203, "loss": 4.1754, "lr": 0.0008323076923076923, "step": 1700, "tokens_trained": 0.835540592 }, { "epoch": 0.4828026381107723, "grad_norm": 13.20995807647705, "loss": 4.1657, "lr": 0.000832027972027972, "step": 1702, "tokens_trained": 0.836525136 }, { "epoch": 0.4833699737607262, "grad_norm": 16.77725601196289, "loss": 4.1905, "lr": 0.0008317482517482518, "step": 1704, "tokens_trained": 0.837509224 }, { "epoch": 0.4839373094106801, "grad_norm": 15.17611312866211, "loss": 4.1823, "lr": 0.0008314685314685315, "step": 1706, "tokens_trained": 0.838492472 }, { "epoch": 0.484504645060634, "grad_norm": 13.06942081451416, "loss": 4.1732, "lr": 0.0008311888111888112, "step": 1708, "tokens_trained": 0.839471696 }, { "epoch": 0.4850719807105879, "grad_norm": 10.456578254699707, "loss": 4.1862, "lr": 0.0008309090909090909, "step": 1710, "tokens_trained": 0.840452808 }, { "epoch": 0.4856393163605418, "grad_norm": 13.80197525024414, "loss": 4.1663, "lr": 0.0008306293706293706, "step": 1712, "tokens_trained": 0.841434224 }, { "epoch": 0.48620665201049573, "grad_norm": 20.076507568359375, "loss": 4.1436, "lr": 0.0008303496503496504, "step": 1714, "tokens_trained": 0.842415304 }, { "epoch": 0.4867739876604496, "grad_norm": 5.629086971282959, "loss": 4.149, "lr": 0.00083006993006993, "step": 1716, "tokens_trained": 0.84339416 }, { "epoch": 0.48734132331040353, "grad_norm": 13.932148933410645, "loss": 4.1785, "lr": 0.0008297902097902098, "step": 1718, "tokens_trained": 0.844380472 }, { "epoch": 0.4879086589603574, "grad_norm": 18.951047897338867, "loss": 4.216, "lr": 0.0008295104895104895, "step": 1720, "tokens_trained": 0.845366896 }, { "epoch": 0.4884759946103113, "grad_norm": 21.042476654052734, "loss": 4.1634, "lr": 0.0008292307692307693, "step": 1722, "tokens_trained": 0.846344792 }, { "epoch": 0.48904333026026525, "grad_norm": 23.94416618347168, "loss": 4.1613, "lr": 0.000828951048951049, "step": 1724, "tokens_trained": 0.847323608 }, { "epoch": 0.4896106659102191, "grad_norm": 5.057071208953857, "loss": 4.1729, "lr": 0.0008286713286713287, "step": 1726, "tokens_trained": 0.848304856 }, { "epoch": 0.49017800156017305, "grad_norm": 18.068674087524414, "loss": 4.2194, "lr": 0.0008283916083916084, "step": 1728, "tokens_trained": 0.849287712 }, { "epoch": 0.4907453372101269, "grad_norm": 11.621233940124512, "loss": 4.2232, "lr": 0.000828111888111888, "step": 1730, "tokens_trained": 0.850268968 }, { "epoch": 0.49131267286008085, "grad_norm": 12.939676284790039, "loss": 4.2003, "lr": 0.0008278321678321679, "step": 1732, "tokens_trained": 0.851256528 }, { "epoch": 0.49188000851003477, "grad_norm": 10.638157844543457, "loss": 4.1975, "lr": 0.0008275524475524475, "step": 1734, "tokens_trained": 0.852240824 }, { "epoch": 0.49244734415998864, "grad_norm": 6.2671003341674805, "loss": 4.1617, "lr": 0.0008272727272727273, "step": 1736, "tokens_trained": 0.853224768 }, { "epoch": 0.49301467980994257, "grad_norm": 12.318375587463379, "loss": 4.1939, "lr": 0.000826993006993007, "step": 1738, "tokens_trained": 0.8542062 }, { "epoch": 0.49358201545989644, "grad_norm": 17.275348663330078, "loss": 4.1911, "lr": 0.0008267132867132868, "step": 1740, "tokens_trained": 0.855192024 }, { "epoch": 0.49414935110985037, "grad_norm": 11.122747421264648, "loss": 4.17, "lr": 0.0008264335664335665, "step": 1742, "tokens_trained": 0.856172136 }, { "epoch": 0.4947166867598043, "grad_norm": 6.223485469818115, "loss": 4.1774, "lr": 0.0008261538461538461, "step": 1744, "tokens_trained": 0.857156312 }, { "epoch": 0.49528402240975816, "grad_norm": 14.62152099609375, "loss": 4.1607, "lr": 0.0008258741258741259, "step": 1746, "tokens_trained": 0.858140152 }, { "epoch": 0.4958513580597121, "grad_norm": 15.991989135742188, "loss": 4.1825, "lr": 0.0008255944055944055, "step": 1748, "tokens_trained": 0.85912524 }, { "epoch": 0.49641869370966596, "grad_norm": 28.88335418701172, "loss": 4.2244, "lr": 0.0008253146853146854, "step": 1750, "tokens_trained": 0.860105784 }, { "epoch": 0.49641869370966596, "eval_loss": 1.061833143234253, "eval_runtime": 20.4841, "step": 1750, "tokens_trained": 0.860105784 }, { "epoch": 0.4969860293596199, "grad_norm": 14.708030700683594, "loss": 4.2036, "lr": 0.000825034965034965, "step": 1752, "tokens_trained": 0.861089272 }, { "epoch": 0.4975533650095738, "grad_norm": 24.67535400390625, "loss": 4.2405, "lr": 0.0008247552447552448, "step": 1754, "tokens_trained": 0.862066656 }, { "epoch": 0.4981207006595277, "grad_norm": 10.923722267150879, "loss": 4.1713, "lr": 0.0008244755244755245, "step": 1756, "tokens_trained": 0.863049256 }, { "epoch": 0.4986880363094816, "grad_norm": 8.88796615600586, "loss": 4.1834, "lr": 0.0008241958041958042, "step": 1758, "tokens_trained": 0.864029352 }, { "epoch": 0.4992553719594355, "grad_norm": 34.90485382080078, "loss": 4.2338, "lr": 0.000823916083916084, "step": 1760, "tokens_trained": 0.865013008 }, { "epoch": 0.4998227076093894, "grad_norm": 36.34440612792969, "loss": 4.2012, "lr": 0.0008236363636363636, "step": 1762, "tokens_trained": 0.86599204 }, { "epoch": 0.5003900432593433, "grad_norm": 27.913984298706055, "loss": 4.269, "lr": 0.0008233566433566434, "step": 1764, "tokens_trained": 0.866975456 }, { "epoch": 0.5009573789092973, "grad_norm": 28.236122131347656, "loss": 4.2413, "lr": 0.000823076923076923, "step": 1766, "tokens_trained": 0.867963912 }, { "epoch": 0.5015247145592511, "grad_norm": 18.181337356567383, "loss": 4.2088, "lr": 0.0008227972027972029, "step": 1768, "tokens_trained": 0.86894656 }, { "epoch": 0.502092050209205, "grad_norm": 17.403850555419922, "loss": 4.1854, "lr": 0.0008225174825174825, "step": 1770, "tokens_trained": 0.869932592 }, { "epoch": 0.5026593858591589, "grad_norm": 15.002805709838867, "loss": 4.1897, "lr": 0.0008222377622377622, "step": 1772, "tokens_trained": 0.87091592 }, { "epoch": 0.5032267215091129, "grad_norm": 6.787586688995361, "loss": 4.1625, "lr": 0.000821958041958042, "step": 1774, "tokens_trained": 0.871899144 }, { "epoch": 0.5037940571590668, "grad_norm": 6.255197525024414, "loss": 4.1682, "lr": 0.0008216783216783217, "step": 1776, "tokens_trained": 0.872874824 }, { "epoch": 0.5043613928090206, "grad_norm": 25.828433990478516, "loss": 4.2354, "lr": 0.0008213986013986015, "step": 1778, "tokens_trained": 0.873858424 }, { "epoch": 0.5049287284589745, "grad_norm": 20.261323928833008, "loss": 4.2373, "lr": 0.0008211188811188811, "step": 1780, "tokens_trained": 0.87483884 }, { "epoch": 0.5054960641089284, "grad_norm": 9.670608520507812, "loss": 4.191, "lr": 0.0008208391608391609, "step": 1782, "tokens_trained": 0.875820792 }, { "epoch": 0.5060633997588824, "grad_norm": 23.33945655822754, "loss": 4.2319, "lr": 0.0008205594405594405, "step": 1784, "tokens_trained": 0.876804368 }, { "epoch": 0.5066307354088363, "grad_norm": 32.22544479370117, "loss": 4.1799, "lr": 0.0008202797202797203, "step": 1786, "tokens_trained": 0.877784816 }, { "epoch": 0.5071980710587901, "grad_norm": 21.048891067504883, "loss": 4.2635, "lr": 0.00082, "step": 1788, "tokens_trained": 0.878768256 }, { "epoch": 0.507765406708744, "grad_norm": 28.73198699951172, "loss": 4.2436, "lr": 0.0008197202797202797, "step": 1790, "tokens_trained": 0.879751288 }, { "epoch": 0.508332742358698, "grad_norm": 27.627851486206055, "loss": 4.2118, "lr": 0.0008194405594405595, "step": 1792, "tokens_trained": 0.880732072 }, { "epoch": 0.5089000780086519, "grad_norm": 21.16539192199707, "loss": 4.2123, "lr": 0.0008191608391608392, "step": 1794, "tokens_trained": 0.88171332 }, { "epoch": 0.5094674136586058, "grad_norm": 11.402868270874023, "loss": 4.1524, "lr": 0.000818881118881119, "step": 1796, "tokens_trained": 0.882695464 }, { "epoch": 0.5100347493085596, "grad_norm": 11.958270072937012, "loss": 4.2091, "lr": 0.0008186013986013986, "step": 1798, "tokens_trained": 0.883678736 }, { "epoch": 0.5106020849585136, "grad_norm": 15.902670860290527, "loss": 4.1687, "lr": 0.0008183216783216783, "step": 1800, "tokens_trained": 0.8846604 }, { "epoch": 0.5111694206084675, "grad_norm": 19.732566833496094, "loss": 4.1302, "lr": 0.000818041958041958, "step": 1802, "tokens_trained": 0.885641384 }, { "epoch": 0.5117367562584214, "grad_norm": 15.119332313537598, "loss": 4.1546, "lr": 0.0008177622377622378, "step": 1804, "tokens_trained": 0.8866262 }, { "epoch": 0.5123040919083753, "grad_norm": 9.641027450561523, "loss": 4.1748, "lr": 0.0008174825174825175, "step": 1806, "tokens_trained": 0.887604504 }, { "epoch": 0.5128714275583292, "grad_norm": 11.642073631286621, "loss": 4.1879, "lr": 0.0008172027972027972, "step": 1808, "tokens_trained": 0.888584152 }, { "epoch": 0.5134387632082831, "grad_norm": 12.05164909362793, "loss": 4.1332, "lr": 0.000816923076923077, "step": 1810, "tokens_trained": 0.889568448 }, { "epoch": 0.514006098858237, "grad_norm": 13.54423999786377, "loss": 4.1398, "lr": 0.0008166433566433567, "step": 1812, "tokens_trained": 0.890550896 }, { "epoch": 0.5145734345081909, "grad_norm": 21.94988441467285, "loss": 4.1523, "lr": 0.0008163636363636364, "step": 1814, "tokens_trained": 0.89153436 }, { "epoch": 0.5151407701581449, "grad_norm": 8.613338470458984, "loss": 4.1428, "lr": 0.0008160839160839161, "step": 1816, "tokens_trained": 0.89251064 }, { "epoch": 0.5157081058080987, "grad_norm": 27.448917388916016, "loss": 4.2014, "lr": 0.0008158041958041958, "step": 1818, "tokens_trained": 0.893493904 }, { "epoch": 0.5162754414580526, "grad_norm": 16.226577758789062, "loss": 4.1787, "lr": 0.0008155244755244755, "step": 1820, "tokens_trained": 0.894476344 }, { "epoch": 0.5168427771080065, "grad_norm": 16.967891693115234, "loss": 4.1898, "lr": 0.0008152447552447553, "step": 1822, "tokens_trained": 0.895460064 }, { "epoch": 0.5174101127579604, "grad_norm": 13.723483085632324, "loss": 4.2058, "lr": 0.000814965034965035, "step": 1824, "tokens_trained": 0.896443272 }, { "epoch": 0.5179774484079144, "grad_norm": 16.789636611938477, "loss": 4.1669, "lr": 0.0008146853146853147, "step": 1826, "tokens_trained": 0.897426712 }, { "epoch": 0.5185447840578682, "grad_norm": 11.26768684387207, "loss": 4.1401, "lr": 0.0008144055944055944, "step": 1828, "tokens_trained": 0.89840672 }, { "epoch": 0.5191121197078221, "grad_norm": 9.25829029083252, "loss": 4.1581, "lr": 0.0008141258741258742, "step": 1830, "tokens_trained": 0.89939132 }, { "epoch": 0.519679455357776, "grad_norm": 12.006930351257324, "loss": 4.1768, "lr": 0.0008138461538461539, "step": 1832, "tokens_trained": 0.900373704 }, { "epoch": 0.52024679100773, "grad_norm": 18.766008377075195, "loss": 4.1419, "lr": 0.0008135664335664336, "step": 1834, "tokens_trained": 0.901356176 }, { "epoch": 0.5208141266576839, "grad_norm": 17.483421325683594, "loss": 4.1382, "lr": 0.0008132867132867133, "step": 1836, "tokens_trained": 0.902344088 }, { "epoch": 0.5213814623076377, "grad_norm": 10.484652519226074, "loss": 4.1571, "lr": 0.000813006993006993, "step": 1838, "tokens_trained": 0.903328896 }, { "epoch": 0.5219487979575916, "grad_norm": 13.653974533081055, "loss": 4.1638, "lr": 0.0008127272727272728, "step": 1840, "tokens_trained": 0.904309368 }, { "epoch": 0.5225161336075456, "grad_norm": 12.48718547821045, "loss": 4.1226, "lr": 0.0008124475524475524, "step": 1842, "tokens_trained": 0.905293112 }, { "epoch": 0.5230834692574995, "grad_norm": 8.086355209350586, "loss": 4.1303, "lr": 0.0008121678321678322, "step": 1844, "tokens_trained": 0.906275632 }, { "epoch": 0.5236508049074534, "grad_norm": 10.940073013305664, "loss": 4.1634, "lr": 0.0008118881118881119, "step": 1846, "tokens_trained": 0.907255808 }, { "epoch": 0.5242181405574072, "grad_norm": 13.844099044799805, "loss": 4.1505, "lr": 0.0008116083916083917, "step": 1848, "tokens_trained": 0.908238664 }, { "epoch": 0.5247854762073612, "grad_norm": 6.305738925933838, "loss": 4.1463, "lr": 0.0008113286713286714, "step": 1850, "tokens_trained": 0.909221424 }, { "epoch": 0.5253528118573151, "grad_norm": 8.957951545715332, "loss": 4.1785, "lr": 0.000811048951048951, "step": 1852, "tokens_trained": 0.910204472 }, { "epoch": 0.525920147507269, "grad_norm": 12.665373802185059, "loss": 4.1776, "lr": 0.0008107692307692308, "step": 1854, "tokens_trained": 0.911186456 }, { "epoch": 0.5264874831572229, "grad_norm": 13.7921781539917, "loss": 4.2058, "lr": 0.0008104895104895104, "step": 1856, "tokens_trained": 0.912163912 }, { "epoch": 0.5270548188071768, "grad_norm": 18.400495529174805, "loss": 4.1378, "lr": 0.0008102097902097903, "step": 1858, "tokens_trained": 0.913143416 }, { "epoch": 0.5276221544571307, "grad_norm": 10.095234870910645, "loss": 4.1673, "lr": 0.0008099300699300699, "step": 1860, "tokens_trained": 0.914125056 }, { "epoch": 0.5281894901070846, "grad_norm": 9.396644592285156, "loss": 4.1226, "lr": 0.0008096503496503497, "step": 1862, "tokens_trained": 0.915109128 }, { "epoch": 0.5287568257570385, "grad_norm": 12.686080932617188, "loss": 4.1356, "lr": 0.0008093706293706294, "step": 1864, "tokens_trained": 0.916092096 }, { "epoch": 0.5293241614069925, "grad_norm": 15.91020679473877, "loss": 4.1276, "lr": 0.0008090909090909092, "step": 1866, "tokens_trained": 0.917077264 }, { "epoch": 0.5298914970569463, "grad_norm": 21.305110931396484, "loss": 4.1492, "lr": 0.0008088111888111889, "step": 1868, "tokens_trained": 0.918060288 }, { "epoch": 0.5304588327069002, "grad_norm": 9.242319107055664, "loss": 4.1457, "lr": 0.0008085314685314685, "step": 1870, "tokens_trained": 0.91904616 }, { "epoch": 0.5310261683568541, "grad_norm": 17.556922912597656, "loss": 4.1698, "lr": 0.0008082517482517483, "step": 1872, "tokens_trained": 0.920028192 }, { "epoch": 0.531593504006808, "grad_norm": 24.155885696411133, "loss": 4.193, "lr": 0.0008079720279720279, "step": 1874, "tokens_trained": 0.921010456 }, { "epoch": 0.531877171831785, "eval_loss": 1.0404243469238281, "eval_runtime": 21.451, "step": 1875, "tokens_trained": 0.921502192 }, { "epoch": 0.532160839656762, "grad_norm": 4.985994338989258, "loss": 4.1649, "lr": 0.0008076923076923078, "step": 1876, "tokens_trained": 0.921994216 }, { "epoch": 0.5327281753067158, "grad_norm": 19.2642765045166, "loss": 4.1883, "lr": 0.0008074125874125874, "step": 1878, "tokens_trained": 0.922978112 }, { "epoch": 0.5332955109566697, "grad_norm": 15.012572288513184, "loss": 4.1944, "lr": 0.0008071328671328671, "step": 1880, "tokens_trained": 0.923962952 }, { "epoch": 0.5338628466066236, "grad_norm": 21.37204360961914, "loss": 4.1708, "lr": 0.0008068531468531469, "step": 1882, "tokens_trained": 0.92494744 }, { "epoch": 0.5344301822565776, "grad_norm": 6.402398586273193, "loss": 4.1921, "lr": 0.0008065734265734265, "step": 1884, "tokens_trained": 0.925927984 }, { "epoch": 0.5349975179065315, "grad_norm": 27.606822967529297, "loss": 4.2033, "lr": 0.0008062937062937064, "step": 1886, "tokens_trained": 0.926911352 }, { "epoch": 0.5355648535564853, "grad_norm": 16.434572219848633, "loss": 4.1504, "lr": 0.000806013986013986, "step": 1888, "tokens_trained": 0.927894056 }, { "epoch": 0.5361321892064392, "grad_norm": 8.066178321838379, "loss": 4.1674, "lr": 0.0008057342657342658, "step": 1890, "tokens_trained": 0.928879504 }, { "epoch": 0.5366995248563932, "grad_norm": 6.167456150054932, "loss": 4.1207, "lr": 0.0008054545454545454, "step": 1892, "tokens_trained": 0.92986424 }, { "epoch": 0.5372668605063471, "grad_norm": 3.584982395172119, "loss": 4.1051, "lr": 0.0008051748251748253, "step": 1894, "tokens_trained": 0.930846696 }, { "epoch": 0.537834196156301, "grad_norm": 14.988295555114746, "loss": 4.1199, "lr": 0.0008048951048951049, "step": 1896, "tokens_trained": 0.931831112 }, { "epoch": 0.5384015318062548, "grad_norm": 12.735363960266113, "loss": 4.1368, "lr": 0.0008046153846153846, "step": 1898, "tokens_trained": 0.932816952 }, { "epoch": 0.5389688674562088, "grad_norm": 7.701294422149658, "loss": 4.1205, "lr": 0.0008043356643356644, "step": 1900, "tokens_trained": 0.93380264 }, { "epoch": 0.5395362031061627, "grad_norm": 9.15809440612793, "loss": 4.1567, "lr": 0.000804055944055944, "step": 1902, "tokens_trained": 0.934785848 }, { "epoch": 0.5401035387561166, "grad_norm": 10.8292875289917, "loss": 4.1645, "lr": 0.0008037762237762239, "step": 1904, "tokens_trained": 0.935766912 }, { "epoch": 0.5406708744060705, "grad_norm": 10.906803131103516, "loss": 4.1398, "lr": 0.0008034965034965035, "step": 1906, "tokens_trained": 0.936749352 }, { "epoch": 0.5412382100560243, "grad_norm": 10.140864372253418, "loss": 4.1754, "lr": 0.0008032167832167832, "step": 1908, "tokens_trained": 0.9377304 }, { "epoch": 0.5418055457059783, "grad_norm": 10.061383247375488, "loss": 4.1485, "lr": 0.0008029370629370629, "step": 1910, "tokens_trained": 0.938712336 }, { "epoch": 0.5423728813559322, "grad_norm": 8.252259254455566, "loss": 4.1502, "lr": 0.0008026573426573427, "step": 1912, "tokens_trained": 0.939693304 }, { "epoch": 0.5429402170058861, "grad_norm": 15.104400634765625, "loss": 4.182, "lr": 0.0008023776223776224, "step": 1914, "tokens_trained": 0.940679832 }, { "epoch": 0.54350755265584, "grad_norm": 21.167285919189453, "loss": 4.1241, "lr": 0.0008020979020979021, "step": 1916, "tokens_trained": 0.941665088 }, { "epoch": 0.5440748883057939, "grad_norm": 17.936481475830078, "loss": 4.1846, "lr": 0.0008018181818181818, "step": 1918, "tokens_trained": 0.942651632 }, { "epoch": 0.5446422239557478, "grad_norm": 9.773019790649414, "loss": 4.1164, "lr": 0.0008015384615384615, "step": 1920, "tokens_trained": 0.943635928 }, { "epoch": 0.5452095596057017, "grad_norm": 14.120475769042969, "loss": 4.1556, "lr": 0.0008012587412587414, "step": 1922, "tokens_trained": 0.944618336 }, { "epoch": 0.5457768952556556, "grad_norm": 10.898097038269043, "loss": 4.1521, "lr": 0.000800979020979021, "step": 1924, "tokens_trained": 0.945608216 }, { "epoch": 0.5463442309056096, "grad_norm": 8.271462440490723, "loss": 4.0785, "lr": 0.0008006993006993007, "step": 1926, "tokens_trained": 0.946593504 }, { "epoch": 0.5469115665555634, "grad_norm": 17.28820037841797, "loss": 4.0998, "lr": 0.0008004195804195804, "step": 1928, "tokens_trained": 0.947575288 }, { "epoch": 0.5474789022055173, "grad_norm": 17.754959106445312, "loss": 4.1652, "lr": 0.0008001398601398602, "step": 1930, "tokens_trained": 0.948562968 }, { "epoch": 0.5480462378554712, "grad_norm": 10.576292037963867, "loss": 4.1754, "lr": 0.0007998601398601399, "step": 1932, "tokens_trained": 0.949545728 }, { "epoch": 0.5486135735054252, "grad_norm": 14.297791481018066, "loss": 4.1597, "lr": 0.0007995804195804196, "step": 1934, "tokens_trained": 0.950528952 }, { "epoch": 0.5491809091553791, "grad_norm": 23.882539749145508, "loss": 4.1366, "lr": 0.0007993006993006992, "step": 1936, "tokens_trained": 0.951513448 }, { "epoch": 0.5497482448053329, "grad_norm": 5.12502908706665, "loss": 4.1441, "lr": 0.000799020979020979, "step": 1938, "tokens_trained": 0.952497048 }, { "epoch": 0.5503155804552868, "grad_norm": 26.879070281982422, "loss": 4.2595, "lr": 0.0007987412587412588, "step": 1940, "tokens_trained": 0.953475816 }, { "epoch": 0.5508829161052408, "grad_norm": 23.032690048217773, "loss": 4.1841, "lr": 0.0007984615384615385, "step": 1942, "tokens_trained": 0.954459984 }, { "epoch": 0.5514502517551947, "grad_norm": 8.810720443725586, "loss": 4.1329, "lr": 0.0007981818181818182, "step": 1944, "tokens_trained": 0.95544252 }, { "epoch": 0.5520175874051486, "grad_norm": 31.051185607910156, "loss": 4.2278, "lr": 0.0007979020979020979, "step": 1946, "tokens_trained": 0.956428016 }, { "epoch": 0.5525849230551024, "grad_norm": 22.537412643432617, "loss": 4.1729, "lr": 0.0007976223776223777, "step": 1948, "tokens_trained": 0.957406024 }, { "epoch": 0.5531522587050564, "grad_norm": 10.596793174743652, "loss": 4.1636, "lr": 0.0007973426573426573, "step": 1950, "tokens_trained": 0.958391232 }, { "epoch": 0.5537195943550103, "grad_norm": 16.45500373840332, "loss": 4.1591, "lr": 0.0007970629370629371, "step": 1952, "tokens_trained": 0.959378448 }, { "epoch": 0.5542869300049642, "grad_norm": 15.090359687805176, "loss": 4.1516, "lr": 0.0007967832167832167, "step": 1954, "tokens_trained": 0.960363384 }, { "epoch": 0.5548542656549181, "grad_norm": 28.482192993164062, "loss": 4.1211, "lr": 0.0007965034965034965, "step": 1956, "tokens_trained": 0.961348752 }, { "epoch": 0.555421601304872, "grad_norm": 9.402368545532227, "loss": 4.178, "lr": 0.0007962237762237763, "step": 1958, "tokens_trained": 0.962332976 }, { "epoch": 0.5559889369548259, "grad_norm": 33.001346588134766, "loss": 4.218, "lr": 0.000795944055944056, "step": 1960, "tokens_trained": 0.963316928 }, { "epoch": 0.5565562726047798, "grad_norm": 29.695520401000977, "loss": 4.2071, "lr": 0.0007956643356643357, "step": 1962, "tokens_trained": 0.964301728 }, { "epoch": 0.5571236082547337, "grad_norm": 22.22412109375, "loss": 4.2158, "lr": 0.0007953846153846153, "step": 1964, "tokens_trained": 0.96528524 }, { "epoch": 0.5576909439046877, "grad_norm": 15.590829849243164, "loss": 4.1681, "lr": 0.0007951048951048952, "step": 1966, "tokens_trained": 0.966268264 }, { "epoch": 0.5582582795546415, "grad_norm": 16.011110305786133, "loss": 4.1591, "lr": 0.0007948251748251748, "step": 1968, "tokens_trained": 0.967252016 }, { "epoch": 0.5588256152045954, "grad_norm": 15.24573040008545, "loss": 4.1446, "lr": 0.0007945454545454546, "step": 1970, "tokens_trained": 0.96823396 }, { "epoch": 0.5593929508545493, "grad_norm": 15.718021392822266, "loss": 4.1846, "lr": 0.0007942657342657342, "step": 1972, "tokens_trained": 0.969217792 }, { "epoch": 0.5599602865045032, "grad_norm": 8.648459434509277, "loss": 4.1655, "lr": 0.000793986013986014, "step": 1974, "tokens_trained": 0.970200776 }, { "epoch": 0.5605276221544572, "grad_norm": 7.273077487945557, "loss": 4.1397, "lr": 0.0007937062937062938, "step": 1976, "tokens_trained": 0.971181376 }, { "epoch": 0.561094957804411, "grad_norm": 25.027616500854492, "loss": 4.1918, "lr": 0.0007934265734265734, "step": 1978, "tokens_trained": 0.972165496 }, { "epoch": 0.5616622934543649, "grad_norm": 25.485851287841797, "loss": 4.1896, "lr": 0.0007931468531468532, "step": 1980, "tokens_trained": 0.973145616 }, { "epoch": 0.5622296291043188, "grad_norm": 18.065462112426758, "loss": 4.1876, "lr": 0.0007928671328671328, "step": 1982, "tokens_trained": 0.974131104 }, { "epoch": 0.5627969647542728, "grad_norm": 20.412248611450195, "loss": 4.1556, "lr": 0.0007925874125874127, "step": 1984, "tokens_trained": 0.975111232 }, { "epoch": 0.5633643004042267, "grad_norm": 15.51710319519043, "loss": 4.1391, "lr": 0.0007923076923076923, "step": 1986, "tokens_trained": 0.976098968 }, { "epoch": 0.5639316360541805, "grad_norm": 8.650726318359375, "loss": 4.1421, "lr": 0.000792027972027972, "step": 1988, "tokens_trained": 0.977082992 }, { "epoch": 0.5644989717041344, "grad_norm": 19.833505630493164, "loss": 4.1505, "lr": 0.0007917482517482517, "step": 1990, "tokens_trained": 0.978068896 }, { "epoch": 0.5650663073540884, "grad_norm": 26.585390090942383, "loss": 4.1661, "lr": 0.0007914685314685314, "step": 1992, "tokens_trained": 0.979048504 }, { "epoch": 0.5656336430040423, "grad_norm": 20.827394485473633, "loss": 4.1987, "lr": 0.0007911888111888113, "step": 1994, "tokens_trained": 0.98003104 }, { "epoch": 0.5662009786539962, "grad_norm": 23.700273513793945, "loss": 4.1773, "lr": 0.0007909090909090909, "step": 1996, "tokens_trained": 0.981013384 }, { "epoch": 0.56676831430395, "grad_norm": 15.673397064208984, "loss": 4.12, "lr": 0.0007906293706293707, "step": 1998, "tokens_trained": 0.981999776 }, { "epoch": 0.567335649953904, "grad_norm": 11.268630981445312, "loss": 4.1373, "lr": 0.0007903496503496503, "step": 2000, "tokens_trained": 0.982980936 }, { "epoch": 0.567335649953904, "eval_loss": 1.0422048568725586, "eval_runtime": 20.3928, "step": 2000, "tokens_trained": 0.982980936 }, { "epoch": 0.5679029856038579, "grad_norm": 18.37994384765625, "loss": 4.1536, "lr": 0.0007900699300699302, "step": 2002, "tokens_trained": 0.983969536 }, { "epoch": 0.5684703212538118, "grad_norm": 23.911537170410156, "loss": 4.1652, "lr": 0.0007897902097902098, "step": 2004, "tokens_trained": 0.98495052 }, { "epoch": 0.5690376569037657, "grad_norm": 7.355772018432617, "loss": 4.1846, "lr": 0.0007895104895104895, "step": 2006, "tokens_trained": 0.98593252 }, { "epoch": 0.5696049925537195, "grad_norm": 35.29991149902344, "loss": 4.2145, "lr": 0.0007892307692307692, "step": 2008, "tokens_trained": 0.986922392 }, { "epoch": 0.5701723282036735, "grad_norm": 14.28709602355957, "loss": 4.1629, "lr": 0.0007889510489510489, "step": 2010, "tokens_trained": 0.987905712 }, { "epoch": 0.5707396638536274, "grad_norm": 22.50174331665039, "loss": 4.1907, "lr": 0.0007886713286713288, "step": 2012, "tokens_trained": 0.988887536 }, { "epoch": 0.5713069995035813, "grad_norm": 14.588640213012695, "loss": 4.1523, "lr": 0.0007883916083916084, "step": 2014, "tokens_trained": 0.989872712 }, { "epoch": 0.5718743351535353, "grad_norm": 2.776369094848633, "loss": 4.1548, "lr": 0.0007881118881118882, "step": 2016, "tokens_trained": 0.990854072 }, { "epoch": 0.5724416708034891, "grad_norm": 16.00047492980957, "loss": 4.1319, "lr": 0.0007878321678321678, "step": 2018, "tokens_trained": 0.991834552 }, { "epoch": 0.573009006453443, "grad_norm": 21.678735733032227, "loss": 4.1986, "lr": 0.0007875524475524476, "step": 2020, "tokens_trained": 0.992818256 }, { "epoch": 0.5735763421033969, "grad_norm": 4.835119724273682, "loss": 4.1625, "lr": 0.0007872727272727273, "step": 2022, "tokens_trained": 0.993801376 }, { "epoch": 0.5741436777533508, "grad_norm": 19.427467346191406, "loss": 4.1594, "lr": 0.000786993006993007, "step": 2024, "tokens_trained": 0.994788568 }, { "epoch": 0.5747110134033048, "grad_norm": 15.458346366882324, "loss": 4.1829, "lr": 0.0007867132867132867, "step": 2026, "tokens_trained": 0.995769976 }, { "epoch": 0.5752783490532586, "grad_norm": 11.073614120483398, "loss": 4.1303, "lr": 0.0007864335664335664, "step": 2028, "tokens_trained": 0.996751464 }, { "epoch": 0.5758456847032125, "grad_norm": 4.685436248779297, "loss": 4.1368, "lr": 0.0007861538461538463, "step": 2030, "tokens_trained": 0.997733952 }, { "epoch": 0.5764130203531664, "grad_norm": 15.977241516113281, "loss": 4.1584, "lr": 0.0007858741258741259, "step": 2032, "tokens_trained": 0.998716976 }, { "epoch": 0.5769803560031204, "grad_norm": 11.305732727050781, "loss": 4.102, "lr": 0.0007855944055944056, "step": 2034, "tokens_trained": 0.999703632 }, { "epoch": 0.5775476916530743, "grad_norm": 7.794003963470459, "loss": 4.161, "lr": 0.0007853146853146853, "step": 2036, "tokens_trained": 1.000687488 }, { "epoch": 0.5781150273030281, "grad_norm": 7.609982013702393, "loss": 4.1546, "lr": 0.0007850349650349651, "step": 2038, "tokens_trained": 1.0016692 }, { "epoch": 0.578682362952982, "grad_norm": 7.622653961181641, "loss": 4.1246, "lr": 0.0007847552447552448, "step": 2040, "tokens_trained": 1.002653352 }, { "epoch": 0.579249698602936, "grad_norm": 9.98919677734375, "loss": 4.1319, "lr": 0.0007844755244755245, "step": 2042, "tokens_trained": 1.003639528 }, { "epoch": 0.5798170342528899, "grad_norm": 9.557628631591797, "loss": 4.1105, "lr": 0.0007841958041958041, "step": 2044, "tokens_trained": 1.004623776 }, { "epoch": 0.5803843699028438, "grad_norm": 14.172621726989746, "loss": 4.1339, "lr": 0.0007839160839160839, "step": 2046, "tokens_trained": 1.005604008 }, { "epoch": 0.5809517055527976, "grad_norm": 8.185248374938965, "loss": 4.1142, "lr": 0.0007836363636363637, "step": 2048, "tokens_trained": 1.006585704 }, { "epoch": 0.5815190412027516, "grad_norm": 10.642661094665527, "loss": 4.131, "lr": 0.0007833566433566434, "step": 2050, "tokens_trained": 1.00757132 }, { "epoch": 0.5820863768527055, "grad_norm": 7.868969917297363, "loss": 4.1477, "lr": 0.0007830769230769231, "step": 2052, "tokens_trained": 1.008556824 }, { "epoch": 0.5826537125026594, "grad_norm": 2.8441150188446045, "loss": 4.1156, "lr": 0.0007827972027972028, "step": 2054, "tokens_trained": 1.00954056 }, { "epoch": 0.5832210481526133, "grad_norm": 5.2797932624816895, "loss": 4.1058, "lr": 0.0007825174825174826, "step": 2056, "tokens_trained": 1.010526488 }, { "epoch": 0.5837883838025671, "grad_norm": 11.850811004638672, "loss": 4.165, "lr": 0.0007822377622377622, "step": 2058, "tokens_trained": 1.011507584 }, { "epoch": 0.5843557194525211, "grad_norm": 11.073920249938965, "loss": 4.1509, "lr": 0.000781958041958042, "step": 2060, "tokens_trained": 1.012491648 }, { "epoch": 0.584923055102475, "grad_norm": 8.282343864440918, "loss": 4.0656, "lr": 0.0007816783216783216, "step": 2062, "tokens_trained": 1.013475224 }, { "epoch": 0.5854903907524289, "grad_norm": 10.414461135864258, "loss": 4.1285, "lr": 0.0007813986013986014, "step": 2064, "tokens_trained": 1.014458144 }, { "epoch": 0.5860577264023829, "grad_norm": 9.988463401794434, "loss": 4.1234, "lr": 0.0007811188811188812, "step": 2066, "tokens_trained": 1.015444112 }, { "epoch": 0.5866250620523367, "grad_norm": 8.713189125061035, "loss": 4.129, "lr": 0.0007808391608391609, "step": 2068, "tokens_trained": 1.016427568 }, { "epoch": 0.5871923977022906, "grad_norm": 3.4149773120880127, "loss": 4.155, "lr": 0.0007805594405594406, "step": 2070, "tokens_trained": 1.017412264 }, { "epoch": 0.5877597333522445, "grad_norm": 12.33522891998291, "loss": 4.1856, "lr": 0.0007802797202797202, "step": 2072, "tokens_trained": 1.018402216 }, { "epoch": 0.5883270690021984, "grad_norm": 12.155695915222168, "loss": 4.1468, "lr": 0.0007800000000000001, "step": 2074, "tokens_trained": 1.019387096 }, { "epoch": 0.5888944046521524, "grad_norm": 7.73326301574707, "loss": 4.1239, "lr": 0.0007797202797202797, "step": 2076, "tokens_trained": 1.020370008 }, { "epoch": 0.5894617403021062, "grad_norm": 6.425852298736572, "loss": 4.1101, "lr": 0.0007794405594405595, "step": 2078, "tokens_trained": 1.02135716 }, { "epoch": 0.5900290759520601, "grad_norm": 18.360816955566406, "loss": 4.1726, "lr": 0.0007791608391608391, "step": 2080, "tokens_trained": 1.022338024 }, { "epoch": 0.590596411602014, "grad_norm": 28.31681251525879, "loss": 4.1341, "lr": 0.0007788811188811189, "step": 2082, "tokens_trained": 1.023318008 }, { "epoch": 0.591163747251968, "grad_norm": 10.673089027404785, "loss": 4.1268, "lr": 0.0007786013986013987, "step": 2084, "tokens_trained": 1.02430432 }, { "epoch": 0.5917310829019219, "grad_norm": 26.656522750854492, "loss": 4.1703, "lr": 0.0007783216783216783, "step": 2086, "tokens_trained": 1.025288272 }, { "epoch": 0.5922984185518757, "grad_norm": 20.022029876708984, "loss": 4.1532, "lr": 0.0007780419580419581, "step": 2088, "tokens_trained": 1.026272984 }, { "epoch": 0.5928657542018296, "grad_norm": 7.2955121994018555, "loss": 4.1992, "lr": 0.0007777622377622377, "step": 2090, "tokens_trained": 1.02725572 }, { "epoch": 0.5934330898517836, "grad_norm": 28.561243057250977, "loss": 4.2098, "lr": 0.0007774825174825176, "step": 2092, "tokens_trained": 1.028238456 }, { "epoch": 0.5940004255017375, "grad_norm": 16.715425491333008, "loss": 4.1509, "lr": 0.0007772027972027972, "step": 2094, "tokens_trained": 1.029226048 }, { "epoch": 0.5945677611516914, "grad_norm": 6.325936317443848, "loss": 4.1221, "lr": 0.000776923076923077, "step": 2096, "tokens_trained": 1.030210528 }, { "epoch": 0.5951350968016452, "grad_norm": 12.83181381225586, "loss": 4.1808, "lr": 0.0007766433566433566, "step": 2098, "tokens_trained": 1.031193456 }, { "epoch": 0.5957024324515992, "grad_norm": 12.183184623718262, "loss": 4.1292, "lr": 0.0007763636363636363, "step": 2100, "tokens_trained": 1.032173528 }, { "epoch": 0.5962697681015531, "grad_norm": 8.247485160827637, "loss": 4.1425, "lr": 0.0007760839160839162, "step": 2102, "tokens_trained": 1.033158144 }, { "epoch": 0.596837103751507, "grad_norm": 10.814559936523438, "loss": 4.1167, "lr": 0.0007758041958041958, "step": 2104, "tokens_trained": 1.034141216 }, { "epoch": 0.5974044394014609, "grad_norm": 12.589309692382812, "loss": 4.0916, "lr": 0.0007755244755244756, "step": 2106, "tokens_trained": 1.035121888 }, { "epoch": 0.5979717750514147, "grad_norm": 11.65658187866211, "loss": 4.0776, "lr": 0.0007752447552447552, "step": 2108, "tokens_trained": 1.036103688 }, { "epoch": 0.5985391107013687, "grad_norm": 18.0120792388916, "loss": 4.1588, "lr": 0.0007749650349650351, "step": 2110, "tokens_trained": 1.03708248 }, { "epoch": 0.5991064463513226, "grad_norm": 5.742938995361328, "loss": 4.151, "lr": 0.0007746853146853147, "step": 2112, "tokens_trained": 1.038068792 }, { "epoch": 0.5996737820012765, "grad_norm": 36.54581832885742, "loss": 4.2239, "lr": 0.0007744055944055944, "step": 2114, "tokens_trained": 1.03904728 }, { "epoch": 0.6002411176512304, "grad_norm": 13.304069519042969, "loss": 4.152, "lr": 0.0007741258741258741, "step": 2116, "tokens_trained": 1.040031312 }, { "epoch": 0.6008084533011843, "grad_norm": 18.68927001953125, "loss": 4.1413, "lr": 0.0007738461538461538, "step": 2118, "tokens_trained": 1.041018376 }, { "epoch": 0.6013757889511382, "grad_norm": 16.946630477905273, "loss": 4.1122, "lr": 0.0007735664335664337, "step": 2120, "tokens_trained": 1.0420056 }, { "epoch": 0.6019431246010921, "grad_norm": 4.236926078796387, "loss": 4.1146, "lr": 0.0007732867132867133, "step": 2122, "tokens_trained": 1.042990376 }, { "epoch": 0.602510460251046, "grad_norm": 12.148641586303711, "loss": 4.1472, "lr": 0.0007730069930069931, "step": 2124, "tokens_trained": 1.0439754 }, { "epoch": 0.602794128076023, "eval_loss": 1.039306640625, "eval_runtime": 20.6138, "step": 2125, "tokens_trained": 1.044467008 }, { "epoch": 0.603077795901, "grad_norm": 17.051687240600586, "loss": 4.1572, "lr": 0.0007727272727272727, "step": 2126, "tokens_trained": 1.044957456 }, { "epoch": 0.6036451315509538, "grad_norm": 14.019828796386719, "loss": 4.1464, "lr": 0.0007724475524475525, "step": 2128, "tokens_trained": 1.04593944 }, { "epoch": 0.6042124672009077, "grad_norm": 11.22962760925293, "loss": 4.1345, "lr": 0.0007721678321678322, "step": 2130, "tokens_trained": 1.046919592 }, { "epoch": 0.6047798028508616, "grad_norm": 11.524348258972168, "loss": 4.1233, "lr": 0.0007718881118881119, "step": 2132, "tokens_trained": 1.047904744 }, { "epoch": 0.6053471385008156, "grad_norm": 7.174457550048828, "loss": 4.1201, "lr": 0.0007716083916083916, "step": 2134, "tokens_trained": 1.048885328 }, { "epoch": 0.6059144741507695, "grad_norm": 6.847499847412109, "loss": 4.1313, "lr": 0.0007713286713286713, "step": 2136, "tokens_trained": 1.049868776 }, { "epoch": 0.6064818098007233, "grad_norm": 8.44458293914795, "loss": 4.1236, "lr": 0.0007710489510489512, "step": 2138, "tokens_trained": 1.050852704 }, { "epoch": 0.6070491454506772, "grad_norm": 15.415260314941406, "loss": 4.1424, "lr": 0.0007707692307692308, "step": 2140, "tokens_trained": 1.051837736 }, { "epoch": 0.6076164811006312, "grad_norm": 16.845874786376953, "loss": 4.1037, "lr": 0.0007704895104895105, "step": 2142, "tokens_trained": 1.05282172 }, { "epoch": 0.6081838167505851, "grad_norm": 1.3947086334228516, "loss": 4.1389, "lr": 0.0007702097902097902, "step": 2144, "tokens_trained": 1.053802928 }, { "epoch": 0.608751152400539, "grad_norm": 3.4119038581848145, "loss": 4.16, "lr": 0.0007699300699300699, "step": 2146, "tokens_trained": 1.054784368 }, { "epoch": 0.6093184880504928, "grad_norm": 9.26860523223877, "loss": 4.1841, "lr": 0.0007696503496503497, "step": 2148, "tokens_trained": 1.05576888 }, { "epoch": 0.6098858237004467, "grad_norm": 8.744836807250977, "loss": 4.1043, "lr": 0.0007693706293706294, "step": 2150, "tokens_trained": 1.056751336 }, { "epoch": 0.6104531593504007, "grad_norm": 8.805045127868652, "loss": 4.1032, "lr": 0.000769090909090909, "step": 2152, "tokens_trained": 1.057734 }, { "epoch": 0.6110204950003546, "grad_norm": 4.785625457763672, "loss": 4.1817, "lr": 0.0007688111888111888, "step": 2154, "tokens_trained": 1.058716328 }, { "epoch": 0.6115878306503085, "grad_norm": 2.2137513160705566, "loss": 4.1514, "lr": 0.0007685314685314686, "step": 2156, "tokens_trained": 1.059696248 }, { "epoch": 0.6121551663002623, "grad_norm": 7.164271354675293, "loss": 4.1433, "lr": 0.0007682517482517483, "step": 2158, "tokens_trained": 1.060676648 }, { "epoch": 0.6127225019502163, "grad_norm": 9.481597900390625, "loss": 4.0971, "lr": 0.000767972027972028, "step": 2160, "tokens_trained": 1.061656688 }, { "epoch": 0.6132898376001702, "grad_norm": 11.28831672668457, "loss": 4.149, "lr": 0.0007676923076923077, "step": 2162, "tokens_trained": 1.062640576 }, { "epoch": 0.6138571732501241, "grad_norm": 17.21572494506836, "loss": 4.098, "lr": 0.0007674125874125874, "step": 2164, "tokens_trained": 1.063617688 }, { "epoch": 0.614424508900078, "grad_norm": 14.486310005187988, "loss": 4.123, "lr": 0.0007671328671328672, "step": 2166, "tokens_trained": 1.06460584 }, { "epoch": 0.6149918445500319, "grad_norm": 10.582398414611816, "loss": 4.1243, "lr": 0.0007668531468531469, "step": 2168, "tokens_trained": 1.065589064 }, { "epoch": 0.6155591801999858, "grad_norm": 12.923002243041992, "loss": 4.0928, "lr": 0.0007665734265734265, "step": 2170, "tokens_trained": 1.06657224 }, { "epoch": 0.6161265158499397, "grad_norm": 12.445414543151855, "loss": 4.1697, "lr": 0.0007662937062937063, "step": 2172, "tokens_trained": 1.067556952 }, { "epoch": 0.6166938514998936, "grad_norm": 3.562396287918091, "loss": 4.0763, "lr": 0.000766013986013986, "step": 2174, "tokens_trained": 1.068538248 }, { "epoch": 0.6172611871498476, "grad_norm": 12.62887954711914, "loss": 4.1203, "lr": 0.0007657342657342658, "step": 2176, "tokens_trained": 1.06952032 }, { "epoch": 0.6178285227998014, "grad_norm": 9.387356758117676, "loss": 4.1318, "lr": 0.0007654545454545455, "step": 2178, "tokens_trained": 1.070503872 }, { "epoch": 0.6183958584497553, "grad_norm": 8.885710716247559, "loss": 4.1609, "lr": 0.0007651748251748251, "step": 2180, "tokens_trained": 1.071486328 }, { "epoch": 0.6189631940997092, "grad_norm": 7.174533843994141, "loss": 4.0824, "lr": 0.0007648951048951049, "step": 2182, "tokens_trained": 1.07246928 }, { "epoch": 0.6195305297496632, "grad_norm": 15.866931915283203, "loss": 4.1461, "lr": 0.0007646153846153846, "step": 2184, "tokens_trained": 1.07345252 }, { "epoch": 0.6200978653996171, "grad_norm": 4.892337799072266, "loss": 4.1418, "lr": 0.0007643356643356644, "step": 2186, "tokens_trained": 1.07443796 }, { "epoch": 0.6206652010495709, "grad_norm": 4.796551704406738, "loss": 4.1394, "lr": 0.000764055944055944, "step": 2188, "tokens_trained": 1.075421392 }, { "epoch": 0.6212325366995248, "grad_norm": 10.585665702819824, "loss": 4.1046, "lr": 0.0007637762237762238, "step": 2190, "tokens_trained": 1.076404848 }, { "epoch": 0.6217998723494788, "grad_norm": 8.71747875213623, "loss": 4.1819, "lr": 0.0007634965034965035, "step": 2192, "tokens_trained": 1.077386672 }, { "epoch": 0.6223672079994327, "grad_norm": 10.74347972869873, "loss": 4.1231, "lr": 0.0007632167832167833, "step": 2194, "tokens_trained": 1.078365112 }, { "epoch": 0.6229345436493866, "grad_norm": 12.079446792602539, "loss": 4.1132, "lr": 0.000762937062937063, "step": 2196, "tokens_trained": 1.07935376 }, { "epoch": 0.6235018792993404, "grad_norm": 7.8133649826049805, "loss": 4.0915, "lr": 0.0007626573426573426, "step": 2198, "tokens_trained": 1.080332872 }, { "epoch": 0.6240692149492943, "grad_norm": 4.51243782043457, "loss": 4.1108, "lr": 0.0007623776223776224, "step": 2200, "tokens_trained": 1.081316664 }, { "epoch": 0.6246365505992483, "grad_norm": 12.625933647155762, "loss": 4.1552, "lr": 0.0007620979020979021, "step": 2202, "tokens_trained": 1.08230448 }, { "epoch": 0.6252038862492022, "grad_norm": 9.984200477600098, "loss": 4.1199, "lr": 0.0007618181818181819, "step": 2204, "tokens_trained": 1.083288992 }, { "epoch": 0.6257712218991561, "grad_norm": 11.338666915893555, "loss": 4.0821, "lr": 0.0007615384615384615, "step": 2206, "tokens_trained": 1.084273864 }, { "epoch": 0.6263385575491099, "grad_norm": 6.808894634246826, "loss": 4.1202, "lr": 0.0007612587412587412, "step": 2208, "tokens_trained": 1.085254584 }, { "epoch": 0.6269058931990639, "grad_norm": 4.182394027709961, "loss": 4.1072, "lr": 0.000760979020979021, "step": 2210, "tokens_trained": 1.086237312 }, { "epoch": 0.6274732288490178, "grad_norm": 13.04654312133789, "loss": 4.1611, "lr": 0.0007606993006993007, "step": 2212, "tokens_trained": 1.087220136 }, { "epoch": 0.6280405644989717, "grad_norm": 8.223962783813477, "loss": 4.1094, "lr": 0.0007604195804195805, "step": 2214, "tokens_trained": 1.088203464 }, { "epoch": 0.6286079001489256, "grad_norm": 7.974697589874268, "loss": 4.1061, "lr": 0.0007601398601398601, "step": 2216, "tokens_trained": 1.089188056 }, { "epoch": 0.6291752357988795, "grad_norm": 9.93747329711914, "loss": 4.1625, "lr": 0.0007598601398601399, "step": 2218, "tokens_trained": 1.090168464 }, { "epoch": 0.6297425714488334, "grad_norm": 14.117332458496094, "loss": 4.1386, "lr": 0.0007595804195804196, "step": 2220, "tokens_trained": 1.09115228 }, { "epoch": 0.6303099070987873, "grad_norm": 8.045380592346191, "loss": 4.0962, "lr": 0.0007593006993006993, "step": 2222, "tokens_trained": 1.0921348 }, { "epoch": 0.6308772427487412, "grad_norm": 7.286352634429932, "loss": 4.1456, "lr": 0.000759020979020979, "step": 2224, "tokens_trained": 1.0931198 }, { "epoch": 0.6314445783986952, "grad_norm": 7.278292179107666, "loss": 4.1155, "lr": 0.0007587412587412587, "step": 2226, "tokens_trained": 1.094107536 }, { "epoch": 0.632011914048649, "grad_norm": 5.973489761352539, "loss": 4.1403, "lr": 0.0007584615384615385, "step": 2228, "tokens_trained": 1.095090384 }, { "epoch": 0.6325792496986029, "grad_norm": 11.78962230682373, "loss": 4.1322, "lr": 0.0007581818181818182, "step": 2230, "tokens_trained": 1.096072192 }, { "epoch": 0.6331465853485568, "grad_norm": 9.853010177612305, "loss": 4.0905, "lr": 0.000757902097902098, "step": 2232, "tokens_trained": 1.097057368 }, { "epoch": 0.6337139209985108, "grad_norm": 12.578025817871094, "loss": 4.0871, "lr": 0.0007576223776223776, "step": 2234, "tokens_trained": 1.0980418 }, { "epoch": 0.6342812566484647, "grad_norm": 8.467657089233398, "loss": 4.0972, "lr": 0.0007573426573426573, "step": 2236, "tokens_trained": 1.099023032 }, { "epoch": 0.6348485922984185, "grad_norm": 10.768691062927246, "loss": 4.0683, "lr": 0.0007570629370629371, "step": 2238, "tokens_trained": 1.1000078 }, { "epoch": 0.6354159279483724, "grad_norm": 8.509350776672363, "loss": 4.1319, "lr": 0.0007567832167832168, "step": 2240, "tokens_trained": 1.100990904 }, { "epoch": 0.6359832635983264, "grad_norm": 9.473450660705566, "loss": 4.0971, "lr": 0.0007565034965034965, "step": 2242, "tokens_trained": 1.101971112 }, { "epoch": 0.6365505992482803, "grad_norm": 5.248406887054443, "loss": 4.1212, "lr": 0.0007562237762237762, "step": 2244, "tokens_trained": 1.10295244 }, { "epoch": 0.6371179348982342, "grad_norm": 2.8849964141845703, "loss": 4.0914, "lr": 0.000755944055944056, "step": 2246, "tokens_trained": 1.103935728 }, { "epoch": 0.637685270548188, "grad_norm": 10.757996559143066, "loss": 4.0711, "lr": 0.0007556643356643357, "step": 2248, "tokens_trained": 1.104917112 }, { "epoch": 0.638252606198142, "grad_norm": 14.822528839111328, "loss": 4.1311, "lr": 0.0007553846153846154, "step": 2250, "tokens_trained": 1.105899872 }, { "epoch": 0.638252606198142, "eval_loss": 1.0298579931259155, "eval_runtime": 20.7482, "step": 2250, "tokens_trained": 1.105899872 }, { "epoch": 0.6388199418480959, "grad_norm": 12.402534484863281, "loss": 4.0729, "lr": 0.0007551048951048951, "step": 2252, "tokens_trained": 1.106885776 }, { "epoch": 0.6393872774980498, "grad_norm": 8.585915565490723, "loss": 4.1026, "lr": 0.0007548251748251748, "step": 2254, "tokens_trained": 1.107867784 }, { "epoch": 0.6399546131480037, "grad_norm": 9.298388481140137, "loss": 4.1033, "lr": 0.0007545454545454546, "step": 2256, "tokens_trained": 1.108846136 }, { "epoch": 0.6405219487979575, "grad_norm": 10.894235610961914, "loss": 4.1212, "lr": 0.0007542657342657343, "step": 2258, "tokens_trained": 1.10982972 }, { "epoch": 0.6410892844479115, "grad_norm": 7.488401889801025, "loss": 4.1268, "lr": 0.000753986013986014, "step": 2260, "tokens_trained": 1.110815128 }, { "epoch": 0.6416566200978654, "grad_norm": 10.087981224060059, "loss": 4.0819, "lr": 0.0007537062937062937, "step": 2262, "tokens_trained": 1.111796896 }, { "epoch": 0.6422239557478193, "grad_norm": 8.851993560791016, "loss": 4.0903, "lr": 0.0007534265734265734, "step": 2264, "tokens_trained": 1.112779032 }, { "epoch": 0.6427912913977732, "grad_norm": 7.973280429840088, "loss": 4.1251, "lr": 0.0007531468531468532, "step": 2266, "tokens_trained": 1.11376248 }, { "epoch": 0.6433586270477271, "grad_norm": 10.600922584533691, "loss": 4.1062, "lr": 0.0007528671328671329, "step": 2268, "tokens_trained": 1.11474752 }, { "epoch": 0.643925962697681, "grad_norm": 6.029149532318115, "loss": 4.1174, "lr": 0.0007525874125874126, "step": 2270, "tokens_trained": 1.115730304 }, { "epoch": 0.6444932983476349, "grad_norm": 5.804802417755127, "loss": 4.0634, "lr": 0.0007523076923076923, "step": 2272, "tokens_trained": 1.116712712 }, { "epoch": 0.6450606339975888, "grad_norm": 12.601567268371582, "loss": 4.111, "lr": 0.0007520279720279721, "step": 2274, "tokens_trained": 1.117692824 }, { "epoch": 0.6456279696475428, "grad_norm": 6.2783203125, "loss": 4.1375, "lr": 0.0007517482517482518, "step": 2276, "tokens_trained": 1.118681616 }, { "epoch": 0.6461953052974966, "grad_norm": 3.368333339691162, "loss": 4.096, "lr": 0.0007514685314685314, "step": 2278, "tokens_trained": 1.119662896 }, { "epoch": 0.6467626409474505, "grad_norm": 28.135610580444336, "loss": 4.1362, "lr": 0.0007511888111888112, "step": 2280, "tokens_trained": 1.120644592 }, { "epoch": 0.6473299765974044, "grad_norm": 31.932798385620117, "loss": 4.177, "lr": 0.0007509090909090909, "step": 2282, "tokens_trained": 1.1216274 }, { "epoch": 0.6478973122473584, "grad_norm": 18.303653717041016, "loss": 4.2105, "lr": 0.0007506293706293707, "step": 2284, "tokens_trained": 1.122610568 }, { "epoch": 0.6484646478973123, "grad_norm": 24.33900260925293, "loss": 4.1685, "lr": 0.0007503496503496504, "step": 2286, "tokens_trained": 1.1235948 }, { "epoch": 0.6490319835472661, "grad_norm": 14.718119621276855, "loss": 4.1309, "lr": 0.00075006993006993, "step": 2288, "tokens_trained": 1.124576952 }, { "epoch": 0.64959931919722, "grad_norm": 10.44218921661377, "loss": 4.1178, "lr": 0.0007497902097902098, "step": 2290, "tokens_trained": 1.12555812 }, { "epoch": 0.650166654847174, "grad_norm": 12.619060516357422, "loss": 4.088, "lr": 0.0007495104895104895, "step": 2292, "tokens_trained": 1.126542504 }, { "epoch": 0.6507339904971279, "grad_norm": 12.677931785583496, "loss": 4.1146, "lr": 0.0007492307692307693, "step": 2294, "tokens_trained": 1.127527144 }, { "epoch": 0.6513013261470818, "grad_norm": 9.913066864013672, "loss": 4.1376, "lr": 0.0007489510489510489, "step": 2296, "tokens_trained": 1.128511472 }, { "epoch": 0.6518686617970356, "grad_norm": 10.902573585510254, "loss": 4.1184, "lr": 0.0007486713286713287, "step": 2298, "tokens_trained": 1.129493144 }, { "epoch": 0.6524359974469895, "grad_norm": 11.475235939025879, "loss": 4.098, "lr": 0.0007483916083916084, "step": 2300, "tokens_trained": 1.13047816 }, { "epoch": 0.6530033330969435, "grad_norm": 11.541910171508789, "loss": 4.106, "lr": 0.0007481118881118882, "step": 2302, "tokens_trained": 1.131461952 }, { "epoch": 0.6535706687468974, "grad_norm": 8.055131912231445, "loss": 4.0913, "lr": 0.0007478321678321679, "step": 2304, "tokens_trained": 1.132445928 }, { "epoch": 0.6541380043968513, "grad_norm": 11.786042213439941, "loss": 4.14, "lr": 0.0007475524475524475, "step": 2306, "tokens_trained": 1.133430104 }, { "epoch": 0.6547053400468051, "grad_norm": 7.311541557312012, "loss": 4.0989, "lr": 0.0007472727272727273, "step": 2308, "tokens_trained": 1.1344128 }, { "epoch": 0.6552726756967591, "grad_norm": 5.909560680389404, "loss": 4.1226, "lr": 0.000746993006993007, "step": 2310, "tokens_trained": 1.135395456 }, { "epoch": 0.655840011346713, "grad_norm": 15.199941635131836, "loss": 4.1003, "lr": 0.0007467132867132868, "step": 2312, "tokens_trained": 1.136377952 }, { "epoch": 0.6564073469966669, "grad_norm": 11.078165054321289, "loss": 4.1273, "lr": 0.0007464335664335664, "step": 2314, "tokens_trained": 1.137364488 }, { "epoch": 0.6569746826466208, "grad_norm": 14.202346801757812, "loss": 4.074, "lr": 0.0007461538461538462, "step": 2316, "tokens_trained": 1.138348624 }, { "epoch": 0.6575420182965747, "grad_norm": 12.573927879333496, "loss": 4.0749, "lr": 0.0007458741258741259, "step": 2318, "tokens_trained": 1.139332304 }, { "epoch": 0.6581093539465286, "grad_norm": 4.582006454467773, "loss": 4.1204, "lr": 0.0007455944055944056, "step": 2320, "tokens_trained": 1.140317248 }, { "epoch": 0.6586766895964825, "grad_norm": 12.172183990478516, "loss": 4.1045, "lr": 0.0007453146853146854, "step": 2322, "tokens_trained": 1.141300976 }, { "epoch": 0.6592440252464364, "grad_norm": 8.110429763793945, "loss": 4.1081, "lr": 0.000745034965034965, "step": 2324, "tokens_trained": 1.142283576 }, { "epoch": 0.6598113608963904, "grad_norm": 7.653029918670654, "loss": 4.1272, "lr": 0.0007447552447552448, "step": 2326, "tokens_trained": 1.143264144 }, { "epoch": 0.6603786965463442, "grad_norm": 8.91545295715332, "loss": 4.0604, "lr": 0.0007444755244755245, "step": 2328, "tokens_trained": 1.144248336 }, { "epoch": 0.6609460321962981, "grad_norm": 8.173501014709473, "loss": 4.1033, "lr": 0.0007441958041958043, "step": 2330, "tokens_trained": 1.145231936 }, { "epoch": 0.661513367846252, "grad_norm": 6.748053550720215, "loss": 4.1, "lr": 0.0007439160839160839, "step": 2332, "tokens_trained": 1.146214208 }, { "epoch": 0.662080703496206, "grad_norm": 8.997527122497559, "loss": 4.0642, "lr": 0.0007436363636363636, "step": 2334, "tokens_trained": 1.147203592 }, { "epoch": 0.6626480391461599, "grad_norm": 5.39633321762085, "loss": 4.0531, "lr": 0.0007433566433566433, "step": 2336, "tokens_trained": 1.148189176 }, { "epoch": 0.6632153747961137, "grad_norm": 11.717559814453125, "loss": 4.1069, "lr": 0.0007430769230769231, "step": 2338, "tokens_trained": 1.14917232 }, { "epoch": 0.6637827104460676, "grad_norm": 4.895142078399658, "loss": 4.1119, "lr": 0.0007427972027972029, "step": 2340, "tokens_trained": 1.150150104 }, { "epoch": 0.6643500460960216, "grad_norm": 7.677682399749756, "loss": 4.0787, "lr": 0.0007425174825174825, "step": 2342, "tokens_trained": 1.15113228 }, { "epoch": 0.6649173817459755, "grad_norm": 9.910654067993164, "loss": 4.114, "lr": 0.0007422377622377622, "step": 2344, "tokens_trained": 1.152119112 }, { "epoch": 0.6654847173959294, "grad_norm": 7.880978107452393, "loss": 4.1188, "lr": 0.000741958041958042, "step": 2346, "tokens_trained": 1.153100688 }, { "epoch": 0.6660520530458832, "grad_norm": 3.284940242767334, "loss": 4.0736, "lr": 0.0007416783216783217, "step": 2348, "tokens_trained": 1.1540818 }, { "epoch": 0.6666193886958371, "grad_norm": 13.524490356445312, "loss": 4.0621, "lr": 0.0007413986013986014, "step": 2350, "tokens_trained": 1.155065608 }, { "epoch": 0.6671867243457911, "grad_norm": 5.8569135665893555, "loss": 4.0904, "lr": 0.0007411188811188811, "step": 2352, "tokens_trained": 1.156048544 }, { "epoch": 0.667754059995745, "grad_norm": 7.1157450675964355, "loss": 4.0774, "lr": 0.0007408391608391608, "step": 2354, "tokens_trained": 1.157030432 }, { "epoch": 0.6683213956456989, "grad_norm": 7.612982273101807, "loss": 4.0829, "lr": 0.0007405594405594406, "step": 2356, "tokens_trained": 1.158012728 }, { "epoch": 0.6688887312956527, "grad_norm": 8.317691802978516, "loss": 4.1176, "lr": 0.0007402797202797204, "step": 2358, "tokens_trained": 1.158993632 }, { "epoch": 0.6694560669456067, "grad_norm": 5.272528648376465, "loss": 4.0977, "lr": 0.00074, "step": 2360, "tokens_trained": 1.159976328 }, { "epoch": 0.6700234025955606, "grad_norm": 11.313931465148926, "loss": 4.0792, "lr": 0.0007397202797202797, "step": 2362, "tokens_trained": 1.160962072 }, { "epoch": 0.6705907382455145, "grad_norm": 12.588369369506836, "loss": 4.0491, "lr": 0.0007394405594405595, "step": 2364, "tokens_trained": 1.161947664 }, { "epoch": 0.6711580738954684, "grad_norm": 23.921968460083008, "loss": 4.1085, "lr": 0.0007391608391608392, "step": 2366, "tokens_trained": 1.16292872 }, { "epoch": 0.6717254095454223, "grad_norm": 9.100578308105469, "loss": 4.1305, "lr": 0.0007388811188811189, "step": 2368, "tokens_trained": 1.163913888 }, { "epoch": 0.6722927451953762, "grad_norm": 35.22720718383789, "loss": 4.1538, "lr": 0.0007386013986013986, "step": 2370, "tokens_trained": 1.164894912 }, { "epoch": 0.6728600808453301, "grad_norm": 16.7394962310791, "loss": 4.1449, "lr": 0.0007383216783216782, "step": 2372, "tokens_trained": 1.165879832 }, { "epoch": 0.673427416495284, "grad_norm": 11.066312789916992, "loss": 4.1172, "lr": 0.0007380419580419581, "step": 2374, "tokens_trained": 1.166864736 }, { "epoch": 0.6737110843202609, "eval_loss": 1.0303717851638794, "eval_runtime": 20.7454, "step": 2375, "tokens_trained": 1.167358632 }, { "epoch": 0.673994752145238, "grad_norm": 12.827569007873535, "loss": 4.1377, "lr": 0.0007377622377622378, "step": 2376, "tokens_trained": 1.16784964 }, { "epoch": 0.6745620877951918, "grad_norm": 13.321866035461426, "loss": 4.0747, "lr": 0.0007374825174825175, "step": 2378, "tokens_trained": 1.168834992 }, { "epoch": 0.6751294234451457, "grad_norm": 15.812009811401367, "loss": 4.1107, "lr": 0.0007372027972027972, "step": 2380, "tokens_trained": 1.169817608 }, { "epoch": 0.6756967590950996, "grad_norm": 16.37995719909668, "loss": 4.1556, "lr": 0.000736923076923077, "step": 2382, "tokens_trained": 1.170800952 }, { "epoch": 0.6762640947450536, "grad_norm": 3.3421339988708496, "loss": 4.1199, "lr": 0.0007366433566433567, "step": 2384, "tokens_trained": 1.1717818 }, { "epoch": 0.6768314303950075, "grad_norm": 9.120339393615723, "loss": 4.0834, "lr": 0.0007363636363636363, "step": 2386, "tokens_trained": 1.172767384 }, { "epoch": 0.6773987660449613, "grad_norm": 12.614449501037598, "loss": 4.0852, "lr": 0.0007360839160839161, "step": 2388, "tokens_trained": 1.173755008 }, { "epoch": 0.6779661016949152, "grad_norm": 4.983767986297607, "loss": 4.0881, "lr": 0.0007358041958041957, "step": 2390, "tokens_trained": 1.174738528 }, { "epoch": 0.6785334373448692, "grad_norm": 4.194960117340088, "loss": 4.1279, "lr": 0.0007355244755244756, "step": 2392, "tokens_trained": 1.175724848 }, { "epoch": 0.6791007729948231, "grad_norm": 5.257171154022217, "loss": 4.1044, "lr": 0.0007352447552447553, "step": 2394, "tokens_trained": 1.176708808 }, { "epoch": 0.679668108644777, "grad_norm": 10.38420295715332, "loss": 4.124, "lr": 0.000734965034965035, "step": 2396, "tokens_trained": 1.177695552 }, { "epoch": 0.6802354442947308, "grad_norm": 8.629493713378906, "loss": 4.0992, "lr": 0.0007346853146853147, "step": 2398, "tokens_trained": 1.17868064 }, { "epoch": 0.6808027799446847, "grad_norm": 9.099041938781738, "loss": 4.1047, "lr": 0.0007344055944055944, "step": 2400, "tokens_trained": 1.179664536 }, { "epoch": 0.6813701155946387, "grad_norm": 11.343080520629883, "loss": 4.1027, "lr": 0.0007341258741258742, "step": 2402, "tokens_trained": 1.180644264 }, { "epoch": 0.6819374512445926, "grad_norm": 5.834907054901123, "loss": 4.098, "lr": 0.0007338461538461538, "step": 2404, "tokens_trained": 1.181629672 }, { "epoch": 0.6825047868945465, "grad_norm": 4.648270606994629, "loss": 4.0775, "lr": 0.0007335664335664336, "step": 2406, "tokens_trained": 1.182614064 }, { "epoch": 0.6830721225445003, "grad_norm": 6.934843063354492, "loss": 4.1206, "lr": 0.0007332867132867132, "step": 2408, "tokens_trained": 1.183597056 }, { "epoch": 0.6836394581944543, "grad_norm": 9.745563507080078, "loss": 4.0921, "lr": 0.0007330069930069931, "step": 2410, "tokens_trained": 1.184579832 }, { "epoch": 0.6842067938444082, "grad_norm": 7.189306259155273, "loss": 4.095, "lr": 0.0007327272727272728, "step": 2412, "tokens_trained": 1.185567912 }, { "epoch": 0.6847741294943621, "grad_norm": 6.303226947784424, "loss": 4.0462, "lr": 0.0007324475524475524, "step": 2414, "tokens_trained": 1.186550184 }, { "epoch": 0.685341465144316, "grad_norm": 6.373469352722168, "loss": 4.1126, "lr": 0.0007321678321678322, "step": 2416, "tokens_trained": 1.1875374 }, { "epoch": 0.6859088007942699, "grad_norm": 7.8680853843688965, "loss": 4.0954, "lr": 0.0007318881118881119, "step": 2418, "tokens_trained": 1.188519808 }, { "epoch": 0.6864761364442238, "grad_norm": 6.305267810821533, "loss": 4.0951, "lr": 0.0007316083916083917, "step": 2420, "tokens_trained": 1.18950228 }, { "epoch": 0.6870434720941777, "grad_norm": 9.990362167358398, "loss": 4.0902, "lr": 0.0007313286713286713, "step": 2422, "tokens_trained": 1.190483872 }, { "epoch": 0.6876108077441316, "grad_norm": 7.421126365661621, "loss": 4.082, "lr": 0.0007310489510489511, "step": 2424, "tokens_trained": 1.191465424 }, { "epoch": 0.6881781433940856, "grad_norm": 7.08989953994751, "loss": 4.057, "lr": 0.0007307692307692307, "step": 2426, "tokens_trained": 1.192446 }, { "epoch": 0.6887454790440394, "grad_norm": 16.008317947387695, "loss": 4.0857, "lr": 0.0007304895104895105, "step": 2428, "tokens_trained": 1.193428632 }, { "epoch": 0.6893128146939933, "grad_norm": 14.471416473388672, "loss": 4.127, "lr": 0.0007302097902097902, "step": 2430, "tokens_trained": 1.194413624 }, { "epoch": 0.6898801503439472, "grad_norm": 8.250576972961426, "loss": 4.1244, "lr": 0.0007299300699300699, "step": 2432, "tokens_trained": 1.195396768 }, { "epoch": 0.6904474859939012, "grad_norm": 17.120845794677734, "loss": 4.107, "lr": 0.0007296503496503497, "step": 2434, "tokens_trained": 1.196377144 }, { "epoch": 0.6910148216438551, "grad_norm": 24.250490188598633, "loss": 4.1443, "lr": 0.0007293706293706294, "step": 2436, "tokens_trained": 1.197361496 }, { "epoch": 0.6915821572938089, "grad_norm": 9.916406631469727, "loss": 4.1308, "lr": 0.0007290909090909092, "step": 2438, "tokens_trained": 1.198343376 }, { "epoch": 0.6921494929437628, "grad_norm": 29.035507202148438, "loss": 4.1809, "lr": 0.0007288111888111888, "step": 2440, "tokens_trained": 1.19932396 }, { "epoch": 0.6927168285937167, "grad_norm": 26.963102340698242, "loss": 4.1343, "lr": 0.0007285314685314685, "step": 2442, "tokens_trained": 1.200310088 }, { "epoch": 0.6932841642436707, "grad_norm": 9.7550048828125, "loss": 4.0746, "lr": 0.0007282517482517482, "step": 2444, "tokens_trained": 1.201291576 }, { "epoch": 0.6938514998936246, "grad_norm": 18.56088638305664, "loss": 4.1634, "lr": 0.000727972027972028, "step": 2446, "tokens_trained": 1.202271312 }, { "epoch": 0.6944188355435784, "grad_norm": 20.842105865478516, "loss": 4.128, "lr": 0.0007276923076923077, "step": 2448, "tokens_trained": 1.203252912 }, { "epoch": 0.6949861711935323, "grad_norm": 21.38428497314453, "loss": 4.1263, "lr": 0.0007274125874125874, "step": 2450, "tokens_trained": 1.204231328 }, { "epoch": 0.6955535068434863, "grad_norm": 9.129469871520996, "loss": 4.0964, "lr": 0.0007271328671328672, "step": 2452, "tokens_trained": 1.205215552 }, { "epoch": 0.6961208424934402, "grad_norm": 25.37588882446289, "loss": 4.1568, "lr": 0.0007268531468531469, "step": 2454, "tokens_trained": 1.206202536 }, { "epoch": 0.6966881781433941, "grad_norm": 17.409656524658203, "loss": 4.1214, "lr": 0.0007265734265734266, "step": 2456, "tokens_trained": 1.207182664 }, { "epoch": 0.6972555137933479, "grad_norm": 12.378538131713867, "loss": 4.1235, "lr": 0.0007262937062937063, "step": 2458, "tokens_trained": 1.208164408 }, { "epoch": 0.6978228494433019, "grad_norm": 15.208183288574219, "loss": 4.0724, "lr": 0.000726013986013986, "step": 2460, "tokens_trained": 1.209151056 }, { "epoch": 0.6983901850932558, "grad_norm": 15.311476707458496, "loss": 4.1146, "lr": 0.0007257342657342657, "step": 2462, "tokens_trained": 1.210135672 }, { "epoch": 0.6989575207432097, "grad_norm": 8.551816940307617, "loss": 4.0944, "lr": 0.0007254545454545455, "step": 2464, "tokens_trained": 1.211118992 }, { "epoch": 0.6995248563931636, "grad_norm": 5.893448829650879, "loss": 4.0777, "lr": 0.0007251748251748252, "step": 2466, "tokens_trained": 1.212102 }, { "epoch": 0.7000921920431175, "grad_norm": 12.23680591583252, "loss": 4.0998, "lr": 0.0007248951048951049, "step": 2468, "tokens_trained": 1.213078936 }, { "epoch": 0.7006595276930714, "grad_norm": 6.285398006439209, "loss": 4.0691, "lr": 0.0007246153846153846, "step": 2470, "tokens_trained": 1.214058832 }, { "epoch": 0.7012268633430253, "grad_norm": 5.049949645996094, "loss": 4.0849, "lr": 0.0007243356643356644, "step": 2472, "tokens_trained": 1.215045384 }, { "epoch": 0.7017941989929792, "grad_norm": 8.333894729614258, "loss": 4.1072, "lr": 0.0007240559440559441, "step": 2474, "tokens_trained": 1.216029416 }, { "epoch": 0.7023615346429332, "grad_norm": 10.236394882202148, "loss": 4.1144, "lr": 0.0007237762237762238, "step": 2476, "tokens_trained": 1.217012872 }, { "epoch": 0.702928870292887, "grad_norm": 7.674532413482666, "loss": 4.0948, "lr": 0.0007234965034965035, "step": 2478, "tokens_trained": 1.2179988 }, { "epoch": 0.7034962059428409, "grad_norm": 8.445834159851074, "loss": 4.0937, "lr": 0.0007232167832167831, "step": 2480, "tokens_trained": 1.218980608 }, { "epoch": 0.7040635415927948, "grad_norm": 6.923468112945557, "loss": 4.0756, "lr": 0.000722937062937063, "step": 2482, "tokens_trained": 1.219966912 }, { "epoch": 0.7046308772427488, "grad_norm": 5.95997428894043, "loss": 4.0618, "lr": 0.0007226573426573426, "step": 2484, "tokens_trained": 1.220952696 }, { "epoch": 0.7051982128927027, "grad_norm": 3.7207870483398438, "loss": 4.0869, "lr": 0.0007223776223776224, "step": 2486, "tokens_trained": 1.22193476 }, { "epoch": 0.7057655485426565, "grad_norm": 8.434130668640137, "loss": 4.0965, "lr": 0.0007220979020979021, "step": 2488, "tokens_trained": 1.222914616 }, { "epoch": 0.7063328841926104, "grad_norm": 10.180377006530762, "loss": 4.0871, "lr": 0.0007218181818181819, "step": 2490, "tokens_trained": 1.22389764 }, { "epoch": 0.7069002198425643, "grad_norm": 8.211799621582031, "loss": 4.0811, "lr": 0.0007215384615384616, "step": 2492, "tokens_trained": 1.224875448 }, { "epoch": 0.7074675554925183, "grad_norm": 5.268981456756592, "loss": 4.0926, "lr": 0.0007212587412587412, "step": 2494, "tokens_trained": 1.225858112 }, { "epoch": 0.7080348911424722, "grad_norm": 7.387131690979004, "loss": 4.1097, "lr": 0.000720979020979021, "step": 2496, "tokens_trained": 1.226838472 }, { "epoch": 0.708602226792426, "grad_norm": 7.289080619812012, "loss": 4.0566, "lr": 0.0007206993006993006, "step": 2498, "tokens_trained": 1.227821848 }, { "epoch": 0.7091695624423799, "grad_norm": 6.981493949890137, "loss": 4.062, "lr": 0.0007204195804195805, "step": 2500, "tokens_trained": 1.228806208 }, { "epoch": 0.7091695624423799, "eval_loss": 1.0222537517547607, "eval_runtime": 20.7945, "step": 2500, "tokens_trained": 1.228806208 }, { "epoch": 0.7097368980923339, "grad_norm": 6.244803428649902, "loss": 4.1417, "lr": 0.0007201398601398601, "step": 2502, "tokens_trained": 1.229787872 }, { "epoch": 0.7103042337422878, "grad_norm": 4.354197978973389, "loss": 4.0663, "lr": 0.0007198601398601399, "step": 2504, "tokens_trained": 1.23077076 }, { "epoch": 0.7108715693922417, "grad_norm": 4.971379280090332, "loss": 4.0495, "lr": 0.0007195804195804196, "step": 2506, "tokens_trained": 1.231752344 }, { "epoch": 0.7114389050421955, "grad_norm": 5.990703582763672, "loss": 4.0837, "lr": 0.0007193006993006994, "step": 2508, "tokens_trained": 1.232733864 }, { "epoch": 0.7120062406921495, "grad_norm": 8.498222351074219, "loss": 4.0379, "lr": 0.0007190209790209791, "step": 2510, "tokens_trained": 1.233716744 }, { "epoch": 0.7125735763421034, "grad_norm": 13.36562442779541, "loss": 4.0187, "lr": 0.0007187412587412587, "step": 2512, "tokens_trained": 1.234699872 }, { "epoch": 0.7131409119920573, "grad_norm": 8.733027458190918, "loss": 4.092, "lr": 0.0007184615384615385, "step": 2514, "tokens_trained": 1.235684584 }, { "epoch": 0.7137082476420112, "grad_norm": 4.150378227233887, "loss": 4.1277, "lr": 0.0007181818181818181, "step": 2516, "tokens_trained": 1.236669584 }, { "epoch": 0.714275583291965, "grad_norm": 5.051011085510254, "loss": 4.0942, "lr": 0.000717902097902098, "step": 2518, "tokens_trained": 1.237654456 }, { "epoch": 0.714842918941919, "grad_norm": 19.51820945739746, "loss": 4.0784, "lr": 0.0007176223776223776, "step": 2520, "tokens_trained": 1.238634888 }, { "epoch": 0.7154102545918729, "grad_norm": 12.287970542907715, "loss": 4.1096, "lr": 0.0007173426573426573, "step": 2522, "tokens_trained": 1.239617096 }, { "epoch": 0.7159775902418268, "grad_norm": 7.280889511108398, "loss": 4.1173, "lr": 0.0007170629370629371, "step": 2524, "tokens_trained": 1.240599456 }, { "epoch": 0.7165449258917808, "grad_norm": 7.321331024169922, "loss": 4.1011, "lr": 0.0007167832167832168, "step": 2526, "tokens_trained": 1.2415852 }, { "epoch": 0.7171122615417346, "grad_norm": 12.695849418640137, "loss": 4.0652, "lr": 0.0007165034965034966, "step": 2528, "tokens_trained": 1.242566296 }, { "epoch": 0.7176795971916885, "grad_norm": 10.30766487121582, "loss": 4.0683, "lr": 0.0007162237762237762, "step": 2530, "tokens_trained": 1.24354928 }, { "epoch": 0.7182469328416424, "grad_norm": 6.451354503631592, "loss": 4.0712, "lr": 0.000715944055944056, "step": 2532, "tokens_trained": 1.244534464 }, { "epoch": 0.7188142684915964, "grad_norm": 13.049304962158203, "loss": 4.0662, "lr": 0.0007156643356643356, "step": 2534, "tokens_trained": 1.245514976 }, { "epoch": 0.7193816041415503, "grad_norm": 6.242895603179932, "loss": 4.089, "lr": 0.0007153846153846155, "step": 2536, "tokens_trained": 1.246499648 }, { "epoch": 0.7199489397915041, "grad_norm": 9.09418773651123, "loss": 4.0727, "lr": 0.0007151048951048951, "step": 2538, "tokens_trained": 1.247482424 }, { "epoch": 0.720516275441458, "grad_norm": 5.704024791717529, "loss": 4.0973, "lr": 0.0007148251748251748, "step": 2540, "tokens_trained": 1.248465776 }, { "epoch": 0.721083611091412, "grad_norm": 1.818793535232544, "loss": 4.0928, "lr": 0.0007145454545454546, "step": 2542, "tokens_trained": 1.249446792 }, { "epoch": 0.7216509467413659, "grad_norm": 8.157804489135742, "loss": 4.1082, "lr": 0.0007142657342657343, "step": 2544, "tokens_trained": 1.25042832 }, { "epoch": 0.7222182823913198, "grad_norm": 12.176240921020508, "loss": 4.0472, "lr": 0.0007139860139860141, "step": 2546, "tokens_trained": 1.251411112 }, { "epoch": 0.7227856180412736, "grad_norm": 9.750322341918945, "loss": 4.0892, "lr": 0.0007137062937062937, "step": 2548, "tokens_trained": 1.25239148 }, { "epoch": 0.7233529536912275, "grad_norm": 7.636045455932617, "loss": 4.0939, "lr": 0.0007134265734265734, "step": 2550, "tokens_trained": 1.253374936 }, { "epoch": 0.7239202893411815, "grad_norm": 9.795125007629395, "loss": 4.0542, "lr": 0.0007131468531468531, "step": 2552, "tokens_trained": 1.254359048 }, { "epoch": 0.7244876249911354, "grad_norm": 7.851208686828613, "loss": 4.0546, "lr": 0.0007128671328671329, "step": 2554, "tokens_trained": 1.255343552 }, { "epoch": 0.7250549606410893, "grad_norm": 7.749396800994873, "loss": 4.0834, "lr": 0.0007125874125874126, "step": 2556, "tokens_trained": 1.256332976 }, { "epoch": 0.7256222962910431, "grad_norm": 7.826572418212891, "loss": 4.0914, "lr": 0.0007123076923076923, "step": 2558, "tokens_trained": 1.257315376 }, { "epoch": 0.7261896319409971, "grad_norm": 7.173867225646973, "loss": 4.0721, "lr": 0.0007120279720279721, "step": 2560, "tokens_trained": 1.258296944 }, { "epoch": 0.726756967590951, "grad_norm": 7.722167015075684, "loss": 4.092, "lr": 0.0007117482517482518, "step": 2562, "tokens_trained": 1.259278984 }, { "epoch": 0.7273243032409049, "grad_norm": 5.8100690841674805, "loss": 4.0592, "lr": 0.0007114685314685315, "step": 2564, "tokens_trained": 1.260261648 }, { "epoch": 0.7278916388908588, "grad_norm": 6.633793830871582, "loss": 4.0871, "lr": 0.0007111888111888112, "step": 2566, "tokens_trained": 1.261235168 }, { "epoch": 0.7284589745408127, "grad_norm": 9.645057678222656, "loss": 4.0707, "lr": 0.0007109090909090909, "step": 2568, "tokens_trained": 1.26221864 }, { "epoch": 0.7290263101907666, "grad_norm": 8.770727157592773, "loss": 4.0757, "lr": 0.0007106293706293706, "step": 2570, "tokens_trained": 1.263199256 }, { "epoch": 0.7295936458407205, "grad_norm": 6.190083980560303, "loss": 4.0911, "lr": 0.0007103496503496504, "step": 2572, "tokens_trained": 1.264180424 }, { "epoch": 0.7301609814906744, "grad_norm": 11.070337295532227, "loss": 4.0566, "lr": 0.0007100699300699301, "step": 2574, "tokens_trained": 1.265164384 }, { "epoch": 0.7307283171406284, "grad_norm": 8.301725387573242, "loss": 4.0636, "lr": 0.0007097902097902098, "step": 2576, "tokens_trained": 1.266148592 }, { "epoch": 0.7312956527905822, "grad_norm": 5.524992942810059, "loss": 4.0974, "lr": 0.0007095104895104895, "step": 2578, "tokens_trained": 1.26712948 }, { "epoch": 0.7318629884405361, "grad_norm": 11.42268180847168, "loss": 4.0858, "lr": 0.0007092307692307692, "step": 2580, "tokens_trained": 1.268107968 }, { "epoch": 0.73243032409049, "grad_norm": 6.110471725463867, "loss": 4.0563, "lr": 0.000708951048951049, "step": 2582, "tokens_trained": 1.26909272 }, { "epoch": 0.732997659740444, "grad_norm": 4.583469867706299, "loss": 4.0907, "lr": 0.0007086713286713287, "step": 2584, "tokens_trained": 1.270074432 }, { "epoch": 0.7335649953903979, "grad_norm": 4.348790645599365, "loss": 4.0768, "lr": 0.0007083916083916084, "step": 2586, "tokens_trained": 1.271059184 }, { "epoch": 0.7341323310403517, "grad_norm": 9.383113861083984, "loss": 4.0829, "lr": 0.000708111888111888, "step": 2588, "tokens_trained": 1.272044288 }, { "epoch": 0.7346996666903056, "grad_norm": 8.594022750854492, "loss": 4.097, "lr": 0.0007078321678321679, "step": 2590, "tokens_trained": 1.273026808 }, { "epoch": 0.7352670023402595, "grad_norm": 8.971443176269531, "loss": 4.0689, "lr": 0.0007075524475524475, "step": 2592, "tokens_trained": 1.274011272 }, { "epoch": 0.7358343379902135, "grad_norm": 14.21872615814209, "loss": 4.0892, "lr": 0.0007072727272727273, "step": 2594, "tokens_trained": 1.274995728 }, { "epoch": 0.7364016736401674, "grad_norm": 5.579262733459473, "loss": 4.1151, "lr": 0.000706993006993007, "step": 2596, "tokens_trained": 1.27598244 }, { "epoch": 0.7369690092901212, "grad_norm": 7.760303974151611, "loss": 4.0923, "lr": 0.0007067132867132867, "step": 2598, "tokens_trained": 1.276966176 }, { "epoch": 0.7375363449400751, "grad_norm": 8.493928909301758, "loss": 4.1002, "lr": 0.0007064335664335665, "step": 2600, "tokens_trained": 1.277946064 }, { "epoch": 0.7381036805900291, "grad_norm": 7.7460126876831055, "loss": 4.0464, "lr": 0.0007061538461538462, "step": 2602, "tokens_trained": 1.278928016 }, { "epoch": 0.738671016239983, "grad_norm": 14.752384185791016, "loss": 4.0694, "lr": 0.0007058741258741259, "step": 2604, "tokens_trained": 1.27991464 }, { "epoch": 0.7392383518899369, "grad_norm": 4.13566255569458, "loss": 4.0852, "lr": 0.0007055944055944055, "step": 2606, "tokens_trained": 1.280898424 }, { "epoch": 0.7398056875398907, "grad_norm": 9.910110473632812, "loss": 4.0819, "lr": 0.0007053146853146854, "step": 2608, "tokens_trained": 1.281880448 }, { "epoch": 0.7403730231898447, "grad_norm": 8.776302337646484, "loss": 4.0908, "lr": 0.000705034965034965, "step": 2610, "tokens_trained": 1.282866224 }, { "epoch": 0.7409403588397986, "grad_norm": 7.437447547912598, "loss": 4.0914, "lr": 0.0007047552447552448, "step": 2612, "tokens_trained": 1.283846848 }, { "epoch": 0.7415076944897525, "grad_norm": 5.371145248413086, "loss": 4.0601, "lr": 0.0007044755244755245, "step": 2614, "tokens_trained": 1.284828288 }, { "epoch": 0.7420750301397064, "grad_norm": 5.754990100860596, "loss": 4.034, "lr": 0.0007041958041958041, "step": 2616, "tokens_trained": 1.285813632 }, { "epoch": 0.7426423657896603, "grad_norm": 12.21330738067627, "loss": 4.0893, "lr": 0.000703916083916084, "step": 2618, "tokens_trained": 1.286796048 }, { "epoch": 0.7432097014396142, "grad_norm": 6.313106060028076, "loss": 4.1348, "lr": 0.0007036363636363636, "step": 2620, "tokens_trained": 1.287779984 }, { "epoch": 0.7437770370895681, "grad_norm": 3.671832323074341, "loss": 4.0892, "lr": 0.0007033566433566434, "step": 2622, "tokens_trained": 1.288763704 }, { "epoch": 0.744344372739522, "grad_norm": 7.610039710998535, "loss": 4.0544, "lr": 0.000703076923076923, "step": 2624, "tokens_trained": 1.289748608 }, { "epoch": 0.7446280405644989, "eval_loss": 1.0216281414031982, "eval_runtime": 21.3239, "step": 2625, "tokens_trained": 1.290237248 }, { "epoch": 0.744911708389476, "grad_norm": 10.805936813354492, "loss": 4.0702, "lr": 0.0007027972027972029, "step": 2626, "tokens_trained": 1.290726104 }, { "epoch": 0.7454790440394298, "grad_norm": 8.497400283813477, "loss": 4.056, "lr": 0.0007025174825174825, "step": 2628, "tokens_trained": 1.291710888 }, { "epoch": 0.7460463796893837, "grad_norm": 7.71652364730835, "loss": 4.0428, "lr": 0.0007022377622377623, "step": 2630, "tokens_trained": 1.2926998 }, { "epoch": 0.7466137153393376, "grad_norm": 11.314064979553223, "loss": 4.0442, "lr": 0.000701958041958042, "step": 2632, "tokens_trained": 1.293681648 }, { "epoch": 0.7471810509892916, "grad_norm": 8.498956680297852, "loss": 4.0806, "lr": 0.0007016783216783216, "step": 2634, "tokens_trained": 1.29466332 }, { "epoch": 0.7477483866392455, "grad_norm": 8.315062522888184, "loss": 4.0496, "lr": 0.0007013986013986015, "step": 2636, "tokens_trained": 1.29565108 }, { "epoch": 0.7483157222891993, "grad_norm": 7.541136264801025, "loss": 4.0901, "lr": 0.0007011188811188811, "step": 2638, "tokens_trained": 1.296633192 }, { "epoch": 0.7488830579391532, "grad_norm": 5.977221965789795, "loss": 4.0612, "lr": 0.0007008391608391609, "step": 2640, "tokens_trained": 1.297621272 }, { "epoch": 0.7494503935891071, "grad_norm": 5.02126932144165, "loss": 4.0944, "lr": 0.0007005594405594405, "step": 2642, "tokens_trained": 1.298601744 }, { "epoch": 0.7500177292390611, "grad_norm": 6.345284938812256, "loss": 4.0578, "lr": 0.0007002797202797204, "step": 2644, "tokens_trained": 1.299583072 }, { "epoch": 0.750585064889015, "grad_norm": 7.036267280578613, "loss": 4.0472, "lr": 0.0007, "step": 2646, "tokens_trained": 1.300567448 }, { "epoch": 0.7511524005389689, "grad_norm": 2.7125253677368164, "loss": 4.0534, "lr": 0.0006997202797202797, "step": 2648, "tokens_trained": 1.301554096 }, { "epoch": 0.7517197361889227, "grad_norm": 3.862492322921753, "loss": 4.0696, "lr": 0.0006994405594405595, "step": 2650, "tokens_trained": 1.302540112 }, { "epoch": 0.7522870718388767, "grad_norm": 2.0384063720703125, "loss": 4.0662, "lr": 0.0006991608391608391, "step": 2652, "tokens_trained": 1.30352596 }, { "epoch": 0.7528544074888306, "grad_norm": 5.195199966430664, "loss": 4.0819, "lr": 0.000698881118881119, "step": 2654, "tokens_trained": 1.30450616 }, { "epoch": 0.7534217431387845, "grad_norm": 14.55208969116211, "loss": 4.0757, "lr": 0.0006986013986013986, "step": 2656, "tokens_trained": 1.305488752 }, { "epoch": 0.7539890787887384, "grad_norm": 10.982531547546387, "loss": 4.0474, "lr": 0.0006983216783216784, "step": 2658, "tokens_trained": 1.306474856 }, { "epoch": 0.7545564144386923, "grad_norm": 7.926928997039795, "loss": 4.0497, "lr": 0.000698041958041958, "step": 2660, "tokens_trained": 1.307456136 }, { "epoch": 0.7551237500886462, "grad_norm": 5.156681537628174, "loss": 4.098, "lr": 0.0006977622377622378, "step": 2662, "tokens_trained": 1.308442664 }, { "epoch": 0.7556910857386001, "grad_norm": 8.156705856323242, "loss": 4.0828, "lr": 0.0006974825174825175, "step": 2664, "tokens_trained": 1.309422976 }, { "epoch": 0.756258421388554, "grad_norm": 8.489871978759766, "loss": 4.0668, "lr": 0.0006972027972027972, "step": 2666, "tokens_trained": 1.310406152 }, { "epoch": 0.756825757038508, "grad_norm": 13.065528869628906, "loss": 4.0915, "lr": 0.000696923076923077, "step": 2668, "tokens_trained": 1.311392576 }, { "epoch": 0.7573930926884618, "grad_norm": 7.475847244262695, "loss": 4.0308, "lr": 0.0006966433566433566, "step": 2670, "tokens_trained": 1.312378776 }, { "epoch": 0.7579604283384157, "grad_norm": 7.049544334411621, "loss": 4.0662, "lr": 0.0006963636363636365, "step": 2672, "tokens_trained": 1.313358848 }, { "epoch": 0.7585277639883696, "grad_norm": 5.037269115447998, "loss": 4.1016, "lr": 0.0006960839160839161, "step": 2674, "tokens_trained": 1.3143412 }, { "epoch": 0.7590950996383236, "grad_norm": 10.421965599060059, "loss": 4.0655, "lr": 0.0006958041958041958, "step": 2676, "tokens_trained": 1.315322968 }, { "epoch": 0.7596624352882775, "grad_norm": 8.08486557006836, "loss": 4.0933, "lr": 0.0006955244755244755, "step": 2678, "tokens_trained": 1.316306592 }, { "epoch": 0.7602297709382313, "grad_norm": 10.121665954589844, "loss": 4.0673, "lr": 0.0006952447552447553, "step": 2680, "tokens_trained": 1.317292536 }, { "epoch": 0.7607971065881852, "grad_norm": 4.840561389923096, "loss": 4.089, "lr": 0.000694965034965035, "step": 2682, "tokens_trained": 1.318278512 }, { "epoch": 0.7613644422381391, "grad_norm": 5.03504753112793, "loss": 4.0696, "lr": 0.0006946853146853147, "step": 2684, "tokens_trained": 1.319263032 }, { "epoch": 0.7619317778880931, "grad_norm": 12.180596351623535, "loss": 4.1166, "lr": 0.0006944055944055943, "step": 2686, "tokens_trained": 1.320252752 }, { "epoch": 0.762499113538047, "grad_norm": 8.842597007751465, "loss": 4.0946, "lr": 0.0006941258741258741, "step": 2688, "tokens_trained": 1.321239648 }, { "epoch": 0.7630664491880008, "grad_norm": 4.742710113525391, "loss": 4.0894, "lr": 0.0006938461538461539, "step": 2690, "tokens_trained": 1.322224872 }, { "epoch": 0.7636337848379547, "grad_norm": 2.7827649116516113, "loss": 4.0453, "lr": 0.0006935664335664336, "step": 2692, "tokens_trained": 1.323211432 }, { "epoch": 0.7642011204879087, "grad_norm": 8.263550758361816, "loss": 4.0034, "lr": 0.0006932867132867133, "step": 2694, "tokens_trained": 1.324190272 }, { "epoch": 0.7647684561378626, "grad_norm": 14.927130699157715, "loss": 4.0243, "lr": 0.000693006993006993, "step": 2696, "tokens_trained": 1.325175184 }, { "epoch": 0.7653357917878165, "grad_norm": 9.046390533447266, "loss": 4.0646, "lr": 0.0006927272727272728, "step": 2698, "tokens_trained": 1.326156856 }, { "epoch": 0.7659031274377703, "grad_norm": 7.640266418457031, "loss": 4.0581, "lr": 0.0006924475524475524, "step": 2700, "tokens_trained": 1.327134224 }, { "epoch": 0.7664704630877243, "grad_norm": 11.179667472839355, "loss": 4.0286, "lr": 0.0006921678321678322, "step": 2702, "tokens_trained": 1.328119376 }, { "epoch": 0.7670377987376782, "grad_norm": 13.961971282958984, "loss": 4.072, "lr": 0.0006918881118881118, "step": 2704, "tokens_trained": 1.329097248 }, { "epoch": 0.7676051343876321, "grad_norm": 5.873361110687256, "loss": 4.1069, "lr": 0.0006916083916083916, "step": 2706, "tokens_trained": 1.330079272 }, { "epoch": 0.768172470037586, "grad_norm": 5.7134623527526855, "loss": 4.0483, "lr": 0.0006913286713286714, "step": 2708, "tokens_trained": 1.331062968 }, { "epoch": 0.7687398056875399, "grad_norm": 8.088322639465332, "loss": 4.0806, "lr": 0.0006910489510489511, "step": 2710, "tokens_trained": 1.3320508 }, { "epoch": 0.7693071413374938, "grad_norm": 12.358318328857422, "loss": 4.0281, "lr": 0.0006907692307692308, "step": 2712, "tokens_trained": 1.333034392 }, { "epoch": 0.7698744769874477, "grad_norm": 6.448056221008301, "loss": 4.0449, "lr": 0.0006904895104895104, "step": 2714, "tokens_trained": 1.334018424 }, { "epoch": 0.7704418126374016, "grad_norm": 10.305964469909668, "loss": 4.0611, "lr": 0.0006902097902097903, "step": 2716, "tokens_trained": 1.33500044 }, { "epoch": 0.7710091482873556, "grad_norm": 8.82204532623291, "loss": 4.0697, "lr": 0.0006899300699300699, "step": 2718, "tokens_trained": 1.335985304 }, { "epoch": 0.7715764839373094, "grad_norm": 11.34217643737793, "loss": 4.0471, "lr": 0.0006896503496503497, "step": 2720, "tokens_trained": 1.336971752 }, { "epoch": 0.7721438195872633, "grad_norm": 9.843841552734375, "loss": 4.1015, "lr": 0.0006893706293706293, "step": 2722, "tokens_trained": 1.337955296 }, { "epoch": 0.7727111552372172, "grad_norm": 8.029809951782227, "loss": 4.0432, "lr": 0.0006890909090909091, "step": 2724, "tokens_trained": 1.338936912 }, { "epoch": 0.7732784908871712, "grad_norm": 8.858033180236816, "loss": 4.0841, "lr": 0.0006888111888111889, "step": 2726, "tokens_trained": 1.339920296 }, { "epoch": 0.7738458265371251, "grad_norm": 6.917725086212158, "loss": 4.0701, "lr": 0.0006885314685314685, "step": 2728, "tokens_trained": 1.340910088 }, { "epoch": 0.7744131621870789, "grad_norm": 9.695552825927734, "loss": 4.0818, "lr": 0.0006882517482517483, "step": 2730, "tokens_trained": 1.341895264 }, { "epoch": 0.7749804978370328, "grad_norm": 8.998181343078613, "loss": 4.0734, "lr": 0.0006879720279720279, "step": 2732, "tokens_trained": 1.342875544 }, { "epoch": 0.7755478334869867, "grad_norm": 7.250143527984619, "loss": 4.0511, "lr": 0.0006876923076923078, "step": 2734, "tokens_trained": 1.34386044 }, { "epoch": 0.7761151691369407, "grad_norm": 8.95149040222168, "loss": 4.0671, "lr": 0.0006874125874125874, "step": 2736, "tokens_trained": 1.344844568 }, { "epoch": 0.7766825047868946, "grad_norm": 9.469155311584473, "loss": 4.0549, "lr": 0.0006871328671328672, "step": 2738, "tokens_trained": 1.3458226 }, { "epoch": 0.7772498404368484, "grad_norm": 6.303086757659912, "loss": 4.0808, "lr": 0.0006868531468531468, "step": 2740, "tokens_trained": 1.346809256 }, { "epoch": 0.7778171760868023, "grad_norm": 6.282865524291992, "loss": 4.0425, "lr": 0.0006865734265734265, "step": 2742, "tokens_trained": 1.347790504 }, { "epoch": 0.7783845117367563, "grad_norm": 6.448110103607178, "loss": 4.0512, "lr": 0.0006862937062937064, "step": 2744, "tokens_trained": 1.348770416 }, { "epoch": 0.7789518473867102, "grad_norm": 3.967651128768921, "loss": 4.0189, "lr": 0.000686013986013986, "step": 2746, "tokens_trained": 1.34975288 }, { "epoch": 0.7795191830366641, "grad_norm": 4.253781318664551, "loss": 4.0774, "lr": 0.0006857342657342658, "step": 2748, "tokens_trained": 1.350729672 }, { "epoch": 0.7800865186866179, "grad_norm": 15.237231254577637, "loss": 4.0929, "lr": 0.0006854545454545454, "step": 2750, "tokens_trained": 1.351711184 }, { "epoch": 0.7800865186866179, "eval_loss": 1.0141865015029907, "eval_runtime": 20.7754, "step": 2750, "tokens_trained": 1.351711184 }, { "epoch": 0.7806538543365719, "grad_norm": 14.367753028869629, "loss": 4.0422, "lr": 0.0006851748251748253, "step": 2752, "tokens_trained": 1.352694296 }, { "epoch": 0.7812211899865258, "grad_norm": 4.344571590423584, "loss": 4.018, "lr": 0.0006848951048951049, "step": 2754, "tokens_trained": 1.353678976 }, { "epoch": 0.7817885256364797, "grad_norm": 4.031637191772461, "loss": 4.0568, "lr": 0.0006846153846153846, "step": 2756, "tokens_trained": 1.354661624 }, { "epoch": 0.7823558612864336, "grad_norm": 11.08716106414795, "loss": 4.0717, "lr": 0.0006843356643356643, "step": 2758, "tokens_trained": 1.355644416 }, { "epoch": 0.7829231969363875, "grad_norm": 10.119296073913574, "loss": 4.0726, "lr": 0.000684055944055944, "step": 2760, "tokens_trained": 1.356625632 }, { "epoch": 0.7834905325863414, "grad_norm": 14.678930282592773, "loss": 4.065, "lr": 0.0006837762237762239, "step": 2762, "tokens_trained": 1.357605968 }, { "epoch": 0.7840578682362953, "grad_norm": 2.6932129859924316, "loss": 4.0831, "lr": 0.0006834965034965035, "step": 2764, "tokens_trained": 1.358590808 }, { "epoch": 0.7846252038862492, "grad_norm": 22.138845443725586, "loss": 4.1011, "lr": 0.0006832167832167833, "step": 2766, "tokens_trained": 1.359570928 }, { "epoch": 0.7851925395362032, "grad_norm": 17.627702713012695, "loss": 4.1441, "lr": 0.0006829370629370629, "step": 2768, "tokens_trained": 1.36055716 }, { "epoch": 0.785759875186157, "grad_norm": 9.9471435546875, "loss": 4.122, "lr": 0.0006826573426573427, "step": 2770, "tokens_trained": 1.361539352 }, { "epoch": 0.7863272108361109, "grad_norm": 11.452835083007812, "loss": 4.0928, "lr": 0.0006823776223776224, "step": 2772, "tokens_trained": 1.362519 }, { "epoch": 0.7868945464860648, "grad_norm": 15.566934585571289, "loss": 4.0816, "lr": 0.0006820979020979021, "step": 2774, "tokens_trained": 1.363505808 }, { "epoch": 0.7874618821360188, "grad_norm": 8.46238899230957, "loss": 4.0924, "lr": 0.0006818181818181818, "step": 2776, "tokens_trained": 1.364484496 }, { "epoch": 0.7880292177859727, "grad_norm": 4.6673688888549805, "loss": 4.0732, "lr": 0.0006815384615384615, "step": 2778, "tokens_trained": 1.365468696 }, { "epoch": 0.7885965534359265, "grad_norm": 10.422809600830078, "loss": 4.0285, "lr": 0.0006812587412587414, "step": 2780, "tokens_trained": 1.36645104 }, { "epoch": 0.7891638890858804, "grad_norm": 11.707451820373535, "loss": 4.0645, "lr": 0.000680979020979021, "step": 2782, "tokens_trained": 1.367433136 }, { "epoch": 0.7897312247358343, "grad_norm": 6.887526988983154, "loss": 4.0591, "lr": 0.0006806993006993007, "step": 2784, "tokens_trained": 1.368420024 }, { "epoch": 0.7902985603857883, "grad_norm": 7.914979457855225, "loss": 4.0641, "lr": 0.0006804195804195804, "step": 2786, "tokens_trained": 1.369401936 }, { "epoch": 0.7908658960357422, "grad_norm": 7.964488506317139, "loss": 4.0462, "lr": 0.0006801398601398602, "step": 2788, "tokens_trained": 1.370384896 }, { "epoch": 0.791433231685696, "grad_norm": 7.16652774810791, "loss": 4.026, "lr": 0.0006798601398601399, "step": 2790, "tokens_trained": 1.371365304 }, { "epoch": 0.7920005673356499, "grad_norm": 8.604512214660645, "loss": 4.0407, "lr": 0.0006795804195804196, "step": 2792, "tokens_trained": 1.372349584 }, { "epoch": 0.7925679029856039, "grad_norm": 6.616272449493408, "loss": 4.0417, "lr": 0.0006793006993006992, "step": 2794, "tokens_trained": 1.373330584 }, { "epoch": 0.7931352386355578, "grad_norm": 3.8474340438842773, "loss": 4.0322, "lr": 0.000679020979020979, "step": 2796, "tokens_trained": 1.374312888 }, { "epoch": 0.7937025742855117, "grad_norm": 11.628402709960938, "loss": 4.0378, "lr": 0.0006787412587412588, "step": 2798, "tokens_trained": 1.375294704 }, { "epoch": 0.7942699099354655, "grad_norm": 7.480481147766113, "loss": 4.1031, "lr": 0.0006784615384615385, "step": 2800, "tokens_trained": 1.376279072 }, { "epoch": 0.7948372455854195, "grad_norm": 6.449431896209717, "loss": 4.0397, "lr": 0.0006781818181818182, "step": 2802, "tokens_trained": 1.377265568 }, { "epoch": 0.7954045812353734, "grad_norm": 5.179644584655762, "loss": 4.0826, "lr": 0.0006779020979020979, "step": 2804, "tokens_trained": 1.378250776 }, { "epoch": 0.7959719168853273, "grad_norm": 8.918203353881836, "loss": 4.0358, "lr": 0.0006776223776223777, "step": 2806, "tokens_trained": 1.379235464 }, { "epoch": 0.7965392525352812, "grad_norm": 6.065394878387451, "loss": 4.0754, "lr": 0.0006773426573426574, "step": 2808, "tokens_trained": 1.380215248 }, { "epoch": 0.797106588185235, "grad_norm": 3.9142706394195557, "loss": 4.0274, "lr": 0.0006770629370629371, "step": 2810, "tokens_trained": 1.381197872 }, { "epoch": 0.797673923835189, "grad_norm": 12.86207103729248, "loss": 4.0471, "lr": 0.0006767832167832167, "step": 2812, "tokens_trained": 1.38218364 }, { "epoch": 0.7982412594851429, "grad_norm": 10.052533149719238, "loss": 4.0628, "lr": 0.0006765034965034965, "step": 2814, "tokens_trained": 1.383170176 }, { "epoch": 0.7988085951350968, "grad_norm": 5.910792827606201, "loss": 4.0358, "lr": 0.0006762237762237763, "step": 2816, "tokens_trained": 1.384154592 }, { "epoch": 0.7993759307850508, "grad_norm": 13.312492370605469, "loss": 4.0694, "lr": 0.000675944055944056, "step": 2818, "tokens_trained": 1.385138352 }, { "epoch": 0.7999432664350046, "grad_norm": 12.467507362365723, "loss": 4.0705, "lr": 0.0006756643356643357, "step": 2820, "tokens_trained": 1.386123232 }, { "epoch": 0.8005106020849585, "grad_norm": 4.8490824699401855, "loss": 4.0387, "lr": 0.0006753846153846153, "step": 2822, "tokens_trained": 1.387107008 }, { "epoch": 0.8010779377349124, "grad_norm": 13.596024513244629, "loss": 4.0505, "lr": 0.0006751048951048951, "step": 2824, "tokens_trained": 1.388091632 }, { "epoch": 0.8016452733848664, "grad_norm": 13.633816719055176, "loss": 4.0894, "lr": 0.0006748251748251748, "step": 2826, "tokens_trained": 1.389077456 }, { "epoch": 0.8022126090348203, "grad_norm": 4.448362827301025, "loss": 4.0623, "lr": 0.0006745454545454546, "step": 2828, "tokens_trained": 1.39006124 }, { "epoch": 0.8027799446847741, "grad_norm": 21.12818717956543, "loss": 4.1275, "lr": 0.0006742657342657342, "step": 2830, "tokens_trained": 1.391043016 }, { "epoch": 0.803347280334728, "grad_norm": 10.096168518066406, "loss": 4.0858, "lr": 0.000673986013986014, "step": 2832, "tokens_trained": 1.392026656 }, { "epoch": 0.803914615984682, "grad_norm": 4.614907264709473, "loss": 4.0075, "lr": 0.0006737062937062938, "step": 2834, "tokens_trained": 1.393006784 }, { "epoch": 0.8044819516346359, "grad_norm": 13.106852531433105, "loss": 4.1113, "lr": 0.0006734265734265734, "step": 2836, "tokens_trained": 1.393990424 }, { "epoch": 0.8050492872845898, "grad_norm": 4.287477493286133, "loss": 4.0818, "lr": 0.0006731468531468532, "step": 2838, "tokens_trained": 1.39497072 }, { "epoch": 0.8056166229345436, "grad_norm": 9.295431137084961, "loss": 4.0652, "lr": 0.0006728671328671328, "step": 2840, "tokens_trained": 1.395951488 }, { "epoch": 0.8061839585844975, "grad_norm": 12.001997947692871, "loss": 4.1061, "lr": 0.0006725874125874126, "step": 2842, "tokens_trained": 1.396933744 }, { "epoch": 0.8067512942344515, "grad_norm": 15.18830680847168, "loss": 4.0483, "lr": 0.0006723076923076923, "step": 2844, "tokens_trained": 1.397915696 }, { "epoch": 0.8073186298844054, "grad_norm": 9.936029434204102, "loss": 4.0559, "lr": 0.0006720279720279721, "step": 2846, "tokens_trained": 1.398900048 }, { "epoch": 0.8078859655343593, "grad_norm": 4.903693199157715, "loss": 4.0474, "lr": 0.0006717482517482517, "step": 2848, "tokens_trained": 1.399885336 }, { "epoch": 0.8084533011843131, "grad_norm": 6.753813743591309, "loss": 4.0365, "lr": 0.0006714685314685314, "step": 2850, "tokens_trained": 1.400867432 }, { "epoch": 0.8090206368342671, "grad_norm": 10.53545093536377, "loss": 4.0697, "lr": 0.0006711888111888113, "step": 2852, "tokens_trained": 1.401849552 }, { "epoch": 0.809587972484221, "grad_norm": 7.666012763977051, "loss": 3.9955, "lr": 0.0006709090909090909, "step": 2854, "tokens_trained": 1.402832496 }, { "epoch": 0.8101553081341749, "grad_norm": 11.65257740020752, "loss": 4.0377, "lr": 0.0006706293706293707, "step": 2856, "tokens_trained": 1.403816768 }, { "epoch": 0.8107226437841288, "grad_norm": 10.997775077819824, "loss": 4.0145, "lr": 0.0006703496503496503, "step": 2858, "tokens_trained": 1.404804968 }, { "epoch": 0.8112899794340827, "grad_norm": 3.699673652648926, "loss": 4.1053, "lr": 0.0006700699300699301, "step": 2860, "tokens_trained": 1.40578656 }, { "epoch": 0.8118573150840366, "grad_norm": 17.54732894897461, "loss": 4.121, "lr": 0.0006697902097902098, "step": 2862, "tokens_trained": 1.406773056 }, { "epoch": 0.8124246507339905, "grad_norm": 10.354470252990723, "loss": 4.0353, "lr": 0.0006695104895104895, "step": 2864, "tokens_trained": 1.407756592 }, { "epoch": 0.8129919863839444, "grad_norm": 7.760607719421387, "loss": 4.0529, "lr": 0.0006692307692307692, "step": 2866, "tokens_trained": 1.408742176 }, { "epoch": 0.8135593220338984, "grad_norm": 11.074470520019531, "loss": 4.0223, "lr": 0.0006689510489510489, "step": 2868, "tokens_trained": 1.409727856 }, { "epoch": 0.8141266576838522, "grad_norm": 12.221083641052246, "loss": 4.0228, "lr": 0.0006686713286713288, "step": 2870, "tokens_trained": 1.410712016 }, { "epoch": 0.8146939933338061, "grad_norm": 8.933589935302734, "loss": 4.1234, "lr": 0.0006683916083916084, "step": 2872, "tokens_trained": 1.411694496 }, { "epoch": 0.81526132898376, "grad_norm": 12.326020240783691, "loss": 4.0772, "lr": 0.0006681118881118882, "step": 2874, "tokens_trained": 1.412676992 }, { "epoch": 0.8155449968087369, "eval_loss": 1.015201449394226, "eval_runtime": 20.3991, "step": 2875, "tokens_trained": 1.413169416 }, { "epoch": 0.815828664633714, "grad_norm": 8.320648193359375, "loss": 4.0045, "lr": 0.0006678321678321678, "step": 2876, "tokens_trained": 1.413657912 }, { "epoch": 0.8163960002836679, "grad_norm": 4.708253383636475, "loss": 4.022, "lr": 0.0006675524475524475, "step": 2878, "tokens_trained": 1.414641576 }, { "epoch": 0.8169633359336217, "grad_norm": 13.005586624145508, "loss": 4.0305, "lr": 0.0006672727272727273, "step": 2880, "tokens_trained": 1.415624992 }, { "epoch": 0.8175306715835756, "grad_norm": 8.445854187011719, "loss": 4.0723, "lr": 0.000666993006993007, "step": 2882, "tokens_trained": 1.416605936 }, { "epoch": 0.8180980072335295, "grad_norm": 5.153830528259277, "loss": 4.0766, "lr": 0.0006667132867132867, "step": 2884, "tokens_trained": 1.417593408 }, { "epoch": 0.8186653428834835, "grad_norm": 13.989762306213379, "loss": 4.043, "lr": 0.0006664335664335664, "step": 2886, "tokens_trained": 1.418577984 }, { "epoch": 0.8192326785334374, "grad_norm": 6.2893805503845215, "loss": 4.0576, "lr": 0.0006661538461538463, "step": 2888, "tokens_trained": 1.419557304 }, { "epoch": 0.8198000141833912, "grad_norm": 3.1825716495513916, "loss": 4.0216, "lr": 0.0006658741258741259, "step": 2890, "tokens_trained": 1.420538736 }, { "epoch": 0.8203673498333451, "grad_norm": 13.280265808105469, "loss": 4.0665, "lr": 0.0006655944055944056, "step": 2892, "tokens_trained": 1.421523048 }, { "epoch": 0.8209346854832991, "grad_norm": 8.963871955871582, "loss": 4.0996, "lr": 0.0006653146853146853, "step": 2894, "tokens_trained": 1.422504352 }, { "epoch": 0.821502021133253, "grad_norm": 9.463395118713379, "loss": 4.0638, "lr": 0.000665034965034965, "step": 2896, "tokens_trained": 1.423490256 }, { "epoch": 0.8220693567832069, "grad_norm": 10.848092079162598, "loss": 4.0767, "lr": 0.0006647552447552448, "step": 2898, "tokens_trained": 1.424473728 }, { "epoch": 0.8226366924331607, "grad_norm": 9.271900177001953, "loss": 4.0675, "lr": 0.0006644755244755245, "step": 2900, "tokens_trained": 1.425456216 }, { "epoch": 0.8232040280831147, "grad_norm": 8.910347938537598, "loss": 4.031, "lr": 0.0006641958041958042, "step": 2902, "tokens_trained": 1.426442408 }, { "epoch": 0.8237713637330686, "grad_norm": 6.92717981338501, "loss": 4.1025, "lr": 0.0006639160839160839, "step": 2904, "tokens_trained": 1.42742624 }, { "epoch": 0.8243386993830225, "grad_norm": 6.383159637451172, "loss": 4.0057, "lr": 0.0006636363636363638, "step": 2906, "tokens_trained": 1.428414912 }, { "epoch": 0.8249060350329764, "grad_norm": 5.782074451446533, "loss": 4.0169, "lr": 0.0006633566433566434, "step": 2908, "tokens_trained": 1.42939668 }, { "epoch": 0.8254733706829303, "grad_norm": 10.663660049438477, "loss": 4.0504, "lr": 0.0006630769230769231, "step": 2910, "tokens_trained": 1.430382648 }, { "epoch": 0.8260407063328842, "grad_norm": 11.806394577026367, "loss": 4.065, "lr": 0.0006627972027972028, "step": 2912, "tokens_trained": 1.43136304 }, { "epoch": 0.8266080419828381, "grad_norm": 5.7375617027282715, "loss": 4.0133, "lr": 0.0006625174825174825, "step": 2914, "tokens_trained": 1.432347472 }, { "epoch": 0.827175377632792, "grad_norm": 6.814542293548584, "loss": 4.0656, "lr": 0.0006622377622377623, "step": 2916, "tokens_trained": 1.433329632 }, { "epoch": 0.827742713282746, "grad_norm": 8.265726089477539, "loss": 4.0206, "lr": 0.000661958041958042, "step": 2918, "tokens_trained": 1.434312216 }, { "epoch": 0.8283100489326998, "grad_norm": 6.937063694000244, "loss": 4.0372, "lr": 0.0006616783216783216, "step": 2920, "tokens_trained": 1.435294504 }, { "epoch": 0.8288773845826537, "grad_norm": 6.773707866668701, "loss": 4.0496, "lr": 0.0006613986013986014, "step": 2922, "tokens_trained": 1.436276344 }, { "epoch": 0.8294447202326076, "grad_norm": 8.471631050109863, "loss": 4.0834, "lr": 0.0006611188811188812, "step": 2924, "tokens_trained": 1.43725852 }, { "epoch": 0.8300120558825616, "grad_norm": 10.602453231811523, "loss": 4.0445, "lr": 0.0006608391608391609, "step": 2926, "tokens_trained": 1.438239768 }, { "epoch": 0.8305793915325155, "grad_norm": 8.173192977905273, "loss": 4.0423, "lr": 0.0006605594405594406, "step": 2928, "tokens_trained": 1.43921892 }, { "epoch": 0.8311467271824693, "grad_norm": 9.510146141052246, "loss": 4.0012, "lr": 0.0006602797202797203, "step": 2930, "tokens_trained": 1.440203128 }, { "epoch": 0.8317140628324232, "grad_norm": 4.894539833068848, "loss": 4.0574, "lr": 0.00066, "step": 2932, "tokens_trained": 1.441187856 }, { "epoch": 0.8322813984823771, "grad_norm": 4.4945149421691895, "loss": 4.0107, "lr": 0.0006597202797202797, "step": 2934, "tokens_trained": 1.442164056 }, { "epoch": 0.8328487341323311, "grad_norm": 7.323387145996094, "loss": 4.0779, "lr": 0.0006594405594405595, "step": 2936, "tokens_trained": 1.44314688 }, { "epoch": 0.833416069782285, "grad_norm": 9.858680725097656, "loss": 4.03, "lr": 0.0006591608391608391, "step": 2938, "tokens_trained": 1.444127552 }, { "epoch": 0.8339834054322388, "grad_norm": 8.214831352233887, "loss": 4.0591, "lr": 0.0006588811188811189, "step": 2940, "tokens_trained": 1.445109336 }, { "epoch": 0.8345507410821927, "grad_norm": 6.628262996673584, "loss": 4.0834, "lr": 0.0006586013986013986, "step": 2942, "tokens_trained": 1.4460904 }, { "epoch": 0.8351180767321467, "grad_norm": 11.043391227722168, "loss": 4.0516, "lr": 0.0006583216783216784, "step": 2944, "tokens_trained": 1.447068776 }, { "epoch": 0.8356854123821006, "grad_norm": 8.013843536376953, "loss": 4.0309, "lr": 0.0006580419580419581, "step": 2946, "tokens_trained": 1.448046952 }, { "epoch": 0.8362527480320545, "grad_norm": 4.856717586517334, "loss": 4.0547, "lr": 0.0006577622377622377, "step": 2948, "tokens_trained": 1.449033752 }, { "epoch": 0.8368200836820083, "grad_norm": 4.799930572509766, "loss": 4.0044, "lr": 0.0006574825174825175, "step": 2950, "tokens_trained": 1.450019912 }, { "epoch": 0.8373874193319623, "grad_norm": 8.492339134216309, "loss": 4.0368, "lr": 0.0006572027972027972, "step": 2952, "tokens_trained": 1.451002976 }, { "epoch": 0.8379547549819162, "grad_norm": 7.098823547363281, "loss": 3.9807, "lr": 0.000656923076923077, "step": 2954, "tokens_trained": 1.45198412 }, { "epoch": 0.8385220906318701, "grad_norm": 8.705301284790039, "loss": 4.0749, "lr": 0.0006566433566433566, "step": 2956, "tokens_trained": 1.452963832 }, { "epoch": 0.839089426281824, "grad_norm": 2.8292014598846436, "loss": 4.0241, "lr": 0.0006563636363636364, "step": 2958, "tokens_trained": 1.453947688 }, { "epoch": 0.8396567619317779, "grad_norm": 3.7414586544036865, "loss": 4.0554, "lr": 0.0006560839160839161, "step": 2960, "tokens_trained": 1.45492676 }, { "epoch": 0.8402240975817318, "grad_norm": 11.956228256225586, "loss": 4.0343, "lr": 0.0006558041958041958, "step": 2962, "tokens_trained": 1.455907464 }, { "epoch": 0.8407914332316857, "grad_norm": 11.086222648620605, "loss": 4.0324, "lr": 0.0006555244755244756, "step": 2964, "tokens_trained": 1.456891688 }, { "epoch": 0.8413587688816396, "grad_norm": 8.380780220031738, "loss": 4.0335, "lr": 0.0006552447552447552, "step": 2966, "tokens_trained": 1.457880016 }, { "epoch": 0.8419261045315936, "grad_norm": 8.568910598754883, "loss": 4.0431, "lr": 0.000654965034965035, "step": 2968, "tokens_trained": 1.458866944 }, { "epoch": 0.8424934401815474, "grad_norm": 10.840734481811523, "loss": 4.0275, "lr": 0.0006546853146853147, "step": 2970, "tokens_trained": 1.459849096 }, { "epoch": 0.8430607758315013, "grad_norm": 5.364732265472412, "loss": 4.0464, "lr": 0.0006544055944055945, "step": 2972, "tokens_trained": 1.460833976 }, { "epoch": 0.8436281114814552, "grad_norm": 8.918869018554688, "loss": 4.0501, "lr": 0.0006541258741258741, "step": 2974, "tokens_trained": 1.461811472 }, { "epoch": 0.8441954471314091, "grad_norm": 10.94211483001709, "loss": 4.0284, "lr": 0.0006538461538461538, "step": 2976, "tokens_trained": 1.462798528 }, { "epoch": 0.8447627827813631, "grad_norm": 14.475136756896973, "loss": 4.0597, "lr": 0.0006535664335664336, "step": 2978, "tokens_trained": 1.46378116 }, { "epoch": 0.8453301184313169, "grad_norm": 8.219613075256348, "loss": 4.0499, "lr": 0.0006532867132867133, "step": 2980, "tokens_trained": 1.464758752 }, { "epoch": 0.8458974540812708, "grad_norm": 8.898524284362793, "loss": 4.0472, "lr": 0.0006530069930069931, "step": 2982, "tokens_trained": 1.465737992 }, { "epoch": 0.8464647897312247, "grad_norm": 6.673952579498291, "loss": 3.9971, "lr": 0.0006527272727272727, "step": 2984, "tokens_trained": 1.466724672 }, { "epoch": 0.8470321253811787, "grad_norm": 6.514251708984375, "loss": 4.0245, "lr": 0.0006524475524475524, "step": 2986, "tokens_trained": 1.46770572 }, { "epoch": 0.8475994610311326, "grad_norm": 8.130202293395996, "loss": 4.0332, "lr": 0.0006521678321678322, "step": 2988, "tokens_trained": 1.468690624 }, { "epoch": 0.8481667966810864, "grad_norm": 4.283686637878418, "loss": 4.0551, "lr": 0.0006518881118881119, "step": 2990, "tokens_trained": 1.469674696 }, { "epoch": 0.8487341323310403, "grad_norm": 4.8144426345825195, "loss": 4.0408, "lr": 0.0006516083916083916, "step": 2992, "tokens_trained": 1.470659816 }, { "epoch": 0.8493014679809943, "grad_norm": 11.117393493652344, "loss": 4.0423, "lr": 0.0006513286713286713, "step": 2994, "tokens_trained": 1.47164192 }, { "epoch": 0.8498688036309482, "grad_norm": 8.022162437438965, "loss": 4.064, "lr": 0.0006510489510489511, "step": 2996, "tokens_trained": 1.472624344 }, { "epoch": 0.8504361392809021, "grad_norm": 5.267605304718018, "loss": 3.9804, "lr": 0.0006507692307692308, "step": 2998, "tokens_trained": 1.473606552 }, { "epoch": 0.8510034749308559, "grad_norm": 9.365017890930176, "loss": 4.0223, "lr": 0.0006504895104895106, "step": 3000, "tokens_trained": 1.474586552 }, { "epoch": 0.8510034749308559, "eval_loss": 1.0078805685043335, "eval_runtime": 20.7752, "step": 3000, "tokens_trained": 1.474586552 }, { "epoch": 0.8515708105808099, "grad_norm": 10.311480522155762, "loss": 3.969, "lr": 0.0006502097902097902, "step": 3002, "tokens_trained": 1.475564304 }, { "epoch": 0.8521381462307638, "grad_norm": 5.622078895568848, "loss": 3.9803, "lr": 0.0006499300699300699, "step": 3004, "tokens_trained": 1.476547088 }, { "epoch": 0.8527054818807177, "grad_norm": 6.005502223968506, "loss": 4.0584, "lr": 0.0006496503496503497, "step": 3006, "tokens_trained": 1.477531352 }, { "epoch": 0.8532728175306716, "grad_norm": 5.769370079040527, "loss": 4.0332, "lr": 0.0006493706293706294, "step": 3008, "tokens_trained": 1.478512136 }, { "epoch": 0.8538401531806254, "grad_norm": 4.246579647064209, "loss": 3.9848, "lr": 0.0006490909090909091, "step": 3010, "tokens_trained": 1.47949464 }, { "epoch": 0.8544074888305794, "grad_norm": 3.3972086906433105, "loss": 3.9969, "lr": 0.0006488111888111888, "step": 3012, "tokens_trained": 1.4804812 }, { "epoch": 0.8549748244805333, "grad_norm": 4.793631553649902, "loss": 3.9748, "lr": 0.0006485314685314685, "step": 3014, "tokens_trained": 1.481469176 }, { "epoch": 0.8555421601304872, "grad_norm": 7.709076881408691, "loss": 4.0399, "lr": 0.0006482517482517483, "step": 3016, "tokens_trained": 1.482450232 }, { "epoch": 0.8561094957804412, "grad_norm": 9.06294059753418, "loss": 4.0279, "lr": 0.000647972027972028, "step": 3018, "tokens_trained": 1.48343416 }, { "epoch": 0.856676831430395, "grad_norm": 7.496627330780029, "loss": 4.047, "lr": 0.0006476923076923077, "step": 3020, "tokens_trained": 1.484423072 }, { "epoch": 0.8572441670803489, "grad_norm": 6.635293006896973, "loss": 4.0583, "lr": 0.0006474125874125874, "step": 3022, "tokens_trained": 1.485406296 }, { "epoch": 0.8578115027303028, "grad_norm": 6.3066864013671875, "loss": 3.9902, "lr": 0.0006471328671328672, "step": 3024, "tokens_trained": 1.486391472 }, { "epoch": 0.8583788383802567, "grad_norm": 1.1249172687530518, "loss": 4.0032, "lr": 0.0006468531468531469, "step": 3026, "tokens_trained": 1.487377128 }, { "epoch": 0.8589461740302107, "grad_norm": 2.966470241546631, "loss": 3.9859, "lr": 0.0006465734265734265, "step": 3028, "tokens_trained": 1.488359656 }, { "epoch": 0.8595135096801645, "grad_norm": 6.611581325531006, "loss": 4.0259, "lr": 0.0006462937062937063, "step": 3030, "tokens_trained": 1.489340552 }, { "epoch": 0.8600808453301184, "grad_norm": 7.76756477355957, "loss": 4.0223, "lr": 0.0006460139860139859, "step": 3032, "tokens_trained": 1.49032648 }, { "epoch": 0.8606481809800723, "grad_norm": 10.86517333984375, "loss": 4.0457, "lr": 0.0006457342657342658, "step": 3034, "tokens_trained": 1.491312608 }, { "epoch": 0.8612155166300263, "grad_norm": 4.524630546569824, "loss": 4.0882, "lr": 0.0006454545454545455, "step": 3036, "tokens_trained": 1.49229724 }, { "epoch": 0.8617828522799802, "grad_norm": 10.601529121398926, "loss": 4.0466, "lr": 0.0006451748251748252, "step": 3038, "tokens_trained": 1.49327952 }, { "epoch": 0.862350187929934, "grad_norm": 10.691457748413086, "loss": 4.0239, "lr": 0.0006448951048951049, "step": 3040, "tokens_trained": 1.494263528 }, { "epoch": 0.8629175235798879, "grad_norm": 5.371310710906982, "loss": 4.0864, "lr": 0.0006446153846153846, "step": 3042, "tokens_trained": 1.49524708 }, { "epoch": 0.8634848592298419, "grad_norm": 5.7418999671936035, "loss": 4.0618, "lr": 0.0006443356643356644, "step": 3044, "tokens_trained": 1.496229136 }, { "epoch": 0.8640521948797958, "grad_norm": 7.521689414978027, "loss": 4.0235, "lr": 0.000644055944055944, "step": 3046, "tokens_trained": 1.497212944 }, { "epoch": 0.8646195305297497, "grad_norm": 6.966773509979248, "loss": 4.0187, "lr": 0.0006437762237762238, "step": 3048, "tokens_trained": 1.498198992 }, { "epoch": 0.8651868661797035, "grad_norm": 12.514280319213867, "loss": 4.0306, "lr": 0.0006434965034965034, "step": 3050, "tokens_trained": 1.499181312 }, { "epoch": 0.8657542018296575, "grad_norm": 4.849910736083984, "loss": 4.033, "lr": 0.0006432167832167833, "step": 3052, "tokens_trained": 1.500163288 }, { "epoch": 0.8663215374796114, "grad_norm": 9.553950309753418, "loss": 4.0465, "lr": 0.000642937062937063, "step": 3054, "tokens_trained": 1.501147464 }, { "epoch": 0.8668888731295653, "grad_norm": 8.58786678314209, "loss": 4.0584, "lr": 0.0006426573426573426, "step": 3056, "tokens_trained": 1.50212956 }, { "epoch": 0.8674562087795192, "grad_norm": 11.174147605895996, "loss": 4.0152, "lr": 0.0006423776223776224, "step": 3058, "tokens_trained": 1.503112168 }, { "epoch": 0.868023544429473, "grad_norm": 1.879528522491455, "loss": 3.999, "lr": 0.0006420979020979021, "step": 3060, "tokens_trained": 1.504099584 }, { "epoch": 0.868590880079427, "grad_norm": 19.370494842529297, "loss": 4.1039, "lr": 0.0006418181818181819, "step": 3062, "tokens_trained": 1.50508356 }, { "epoch": 0.8691582157293809, "grad_norm": 10.598268508911133, "loss": 4.0542, "lr": 0.0006415384615384615, "step": 3064, "tokens_trained": 1.506063304 }, { "epoch": 0.8697255513793348, "grad_norm": 8.537477493286133, "loss": 4.0529, "lr": 0.0006412587412587413, "step": 3066, "tokens_trained": 1.507046368 }, { "epoch": 0.8702928870292888, "grad_norm": 8.395747184753418, "loss": 3.9941, "lr": 0.0006409790209790209, "step": 3068, "tokens_trained": 1.508029128 }, { "epoch": 0.8708602226792426, "grad_norm": 5.918806552886963, "loss": 4.0078, "lr": 0.0006406993006993007, "step": 3070, "tokens_trained": 1.5090132 }, { "epoch": 0.8714275583291965, "grad_norm": 3.845099925994873, "loss": 4.0564, "lr": 0.0006404195804195805, "step": 3072, "tokens_trained": 1.509994832 }, { "epoch": 0.8719948939791504, "grad_norm": 3.3807923793792725, "loss": 4.0438, "lr": 0.0006401398601398601, "step": 3074, "tokens_trained": 1.510975552 }, { "epoch": 0.8725622296291043, "grad_norm": 4.468081951141357, "loss": 4.066, "lr": 0.0006398601398601399, "step": 3076, "tokens_trained": 1.511959576 }, { "epoch": 0.8731295652790583, "grad_norm": 1.8455613851547241, "loss": 4.0247, "lr": 0.0006395804195804196, "step": 3078, "tokens_trained": 1.512939112 }, { "epoch": 0.8736969009290121, "grad_norm": 7.184399127960205, "loss": 4.081, "lr": 0.0006393006993006994, "step": 3080, "tokens_trained": 1.513924792 }, { "epoch": 0.874264236578966, "grad_norm": 8.416154861450195, "loss": 4.0372, "lr": 0.000639020979020979, "step": 3082, "tokens_trained": 1.514905096 }, { "epoch": 0.8748315722289199, "grad_norm": 6.620309829711914, "loss": 4.0822, "lr": 0.0006387412587412587, "step": 3084, "tokens_trained": 1.51588724 }, { "epoch": 0.8753989078788739, "grad_norm": 7.424724102020264, "loss": 4.053, "lr": 0.0006384615384615384, "step": 3086, "tokens_trained": 1.516871792 }, { "epoch": 0.8759662435288278, "grad_norm": 7.8764448165893555, "loss": 4.059, "lr": 0.0006381818181818182, "step": 3088, "tokens_trained": 1.517857872 }, { "epoch": 0.8765335791787816, "grad_norm": 7.330927848815918, "loss": 4.0182, "lr": 0.000637902097902098, "step": 3090, "tokens_trained": 1.518840616 }, { "epoch": 0.8771009148287355, "grad_norm": 8.612639427185059, "loss": 4.0181, "lr": 0.0006376223776223776, "step": 3092, "tokens_trained": 1.519826616 }, { "epoch": 0.8776682504786895, "grad_norm": 9.889811515808105, "loss": 4.0434, "lr": 0.0006373426573426574, "step": 3094, "tokens_trained": 1.520805784 }, { "epoch": 0.8782355861286434, "grad_norm": 5.421345233917236, "loss": 4.0237, "lr": 0.0006370629370629371, "step": 3096, "tokens_trained": 1.521789344 }, { "epoch": 0.8788029217785973, "grad_norm": 4.9160990715026855, "loss": 4.0497, "lr": 0.0006367832167832168, "step": 3098, "tokens_trained": 1.522772664 }, { "epoch": 0.8793702574285511, "grad_norm": 8.828028678894043, "loss": 4.0381, "lr": 0.0006365034965034965, "step": 3100, "tokens_trained": 1.523755712 }, { "epoch": 0.879937593078505, "grad_norm": 5.6704182624816895, "loss": 4.0017, "lr": 0.0006362237762237762, "step": 3102, "tokens_trained": 1.52473876 }, { "epoch": 0.880504928728459, "grad_norm": 4.982235908508301, "loss": 3.9826, "lr": 0.0006359440559440559, "step": 3104, "tokens_trained": 1.52571756 }, { "epoch": 0.8810722643784129, "grad_norm": 8.639644622802734, "loss": 4.0177, "lr": 0.0006356643356643357, "step": 3106, "tokens_trained": 1.526695632 }, { "epoch": 0.8816396000283668, "grad_norm": 6.1896820068359375, "loss": 4.0248, "lr": 0.0006353846153846155, "step": 3108, "tokens_trained": 1.527678296 }, { "epoch": 0.8822069356783206, "grad_norm": 3.787477731704712, "loss": 4.0489, "lr": 0.0006351048951048951, "step": 3110, "tokens_trained": 1.528665456 }, { "epoch": 0.8827742713282746, "grad_norm": 4.418561935424805, "loss": 4.0422, "lr": 0.0006348251748251748, "step": 3112, "tokens_trained": 1.529648584 }, { "epoch": 0.8833416069782285, "grad_norm": 8.951369285583496, "loss": 4.028, "lr": 0.0006345454545454546, "step": 3114, "tokens_trained": 1.530628808 }, { "epoch": 0.8839089426281824, "grad_norm": 4.903277397155762, "loss": 4.0772, "lr": 0.0006342657342657343, "step": 3116, "tokens_trained": 1.531612144 }, { "epoch": 0.8844762782781364, "grad_norm": 4.366726875305176, "loss": 3.9975, "lr": 0.000633986013986014, "step": 3118, "tokens_trained": 1.532595304 }, { "epoch": 0.8850436139280902, "grad_norm": 6.9316911697387695, "loss": 4.0019, "lr": 0.0006337062937062937, "step": 3120, "tokens_trained": 1.533578888 }, { "epoch": 0.8856109495780441, "grad_norm": 8.896012306213379, "loss": 4.04, "lr": 0.0006334265734265733, "step": 3122, "tokens_trained": 1.534557552 }, { "epoch": 0.886178285227998, "grad_norm": 5.350147724151611, "loss": 4.0229, "lr": 0.0006331468531468532, "step": 3124, "tokens_trained": 1.535539672 }, { "epoch": 0.8864619530529749, "eval_loss": 1.007444143295288, "eval_runtime": 20.5976, "step": 3125, "tokens_trained": 1.53603052 }, { "epoch": 0.886745620877952, "grad_norm": 5.331796646118164, "loss": 4.0331, "lr": 0.0006328671328671329, "step": 3126, "tokens_trained": 1.536525432 }, { "epoch": 0.8873129565279059, "grad_norm": 11.335051536560059, "loss": 4.041, "lr": 0.0006325874125874126, "step": 3128, "tokens_trained": 1.537508928 }, { "epoch": 0.8878802921778597, "grad_norm": 8.185080528259277, "loss": 4.0299, "lr": 0.0006323076923076923, "step": 3130, "tokens_trained": 1.53848672 }, { "epoch": 0.8884476278278136, "grad_norm": 4.136550426483154, "loss": 4.0268, "lr": 0.0006320279720279721, "step": 3132, "tokens_trained": 1.5394682 }, { "epoch": 0.8890149634777675, "grad_norm": 4.993428707122803, "loss": 3.9808, "lr": 0.0006317482517482518, "step": 3134, "tokens_trained": 1.540449416 }, { "epoch": 0.8895822991277215, "grad_norm": 5.485887050628662, "loss": 4.0201, "lr": 0.0006314685314685314, "step": 3136, "tokens_trained": 1.541436136 }, { "epoch": 0.8901496347776754, "grad_norm": 4.517815589904785, "loss": 3.9985, "lr": 0.0006311888111888112, "step": 3138, "tokens_trained": 1.542421992 }, { "epoch": 0.8907169704276292, "grad_norm": 3.8219170570373535, "loss": 4.0299, "lr": 0.0006309090909090908, "step": 3140, "tokens_trained": 1.543399648 }, { "epoch": 0.8912843060775831, "grad_norm": 7.318249702453613, "loss": 4.0377, "lr": 0.0006306293706293707, "step": 3142, "tokens_trained": 1.54438384 }, { "epoch": 0.8918516417275371, "grad_norm": 9.09650707244873, "loss": 4.0572, "lr": 0.0006303496503496504, "step": 3144, "tokens_trained": 1.545367632 }, { "epoch": 0.892418977377491, "grad_norm": 6.241589069366455, "loss": 4.025, "lr": 0.0006300699300699301, "step": 3146, "tokens_trained": 1.546355136 }, { "epoch": 0.8929863130274449, "grad_norm": 6.9915385246276855, "loss": 4.0177, "lr": 0.0006297902097902098, "step": 3148, "tokens_trained": 1.547340304 }, { "epoch": 0.8935536486773987, "grad_norm": 5.599451541900635, "loss": 3.9892, "lr": 0.0006295104895104896, "step": 3150, "tokens_trained": 1.54832164 }, { "epoch": 0.8941209843273527, "grad_norm": 7.765986442565918, "loss": 4.0232, "lr": 0.0006292307692307693, "step": 3152, "tokens_trained": 1.54930228 }, { "epoch": 0.8946883199773066, "grad_norm": 10.365357398986816, "loss": 4.0254, "lr": 0.0006289510489510489, "step": 3154, "tokens_trained": 1.550282888 }, { "epoch": 0.8952556556272605, "grad_norm": 7.8539276123046875, "loss": 4.008, "lr": 0.0006286713286713287, "step": 3156, "tokens_trained": 1.551265008 }, { "epoch": 0.8958229912772144, "grad_norm": 8.106318473815918, "loss": 4.0351, "lr": 0.0006283916083916083, "step": 3158, "tokens_trained": 1.552245928 }, { "epoch": 0.8963903269271682, "grad_norm": 10.22494125366211, "loss": 3.9873, "lr": 0.0006281118881118882, "step": 3160, "tokens_trained": 1.553227848 }, { "epoch": 0.8969576625771222, "grad_norm": 2.8810367584228516, "loss": 4.0399, "lr": 0.0006278321678321679, "step": 3162, "tokens_trained": 1.554208112 }, { "epoch": 0.8975249982270761, "grad_norm": 10.036259651184082, "loss": 4.0072, "lr": 0.0006275524475524475, "step": 3164, "tokens_trained": 1.555186496 }, { "epoch": 0.89809233387703, "grad_norm": 6.596704006195068, "loss": 4.0306, "lr": 0.0006272727272727273, "step": 3166, "tokens_trained": 1.556170896 }, { "epoch": 0.898659669526984, "grad_norm": 4.411632537841797, "loss": 4.035, "lr": 0.000626993006993007, "step": 3168, "tokens_trained": 1.55715312 }, { "epoch": 0.8992270051769378, "grad_norm": 4.391601085662842, "loss": 3.9973, "lr": 0.0006267132867132868, "step": 3170, "tokens_trained": 1.558133552 }, { "epoch": 0.8997943408268917, "grad_norm": 9.456700325012207, "loss": 4.0255, "lr": 0.0006264335664335664, "step": 3172, "tokens_trained": 1.559115752 }, { "epoch": 0.9003616764768456, "grad_norm": 8.490089416503906, "loss": 4.0368, "lr": 0.0006261538461538462, "step": 3174, "tokens_trained": 1.560095384 }, { "epoch": 0.9009290121267995, "grad_norm": 7.3357744216918945, "loss": 4.0528, "lr": 0.0006258741258741258, "step": 3176, "tokens_trained": 1.561078856 }, { "epoch": 0.9014963477767535, "grad_norm": 6.7389092445373535, "loss": 4.0457, "lr": 0.0006255944055944057, "step": 3178, "tokens_trained": 1.562063936 }, { "epoch": 0.9020636834267073, "grad_norm": 7.586348056793213, "loss": 4.0516, "lr": 0.0006253146853146854, "step": 3180, "tokens_trained": 1.5630424 }, { "epoch": 0.9026310190766612, "grad_norm": 5.646294116973877, "loss": 4.0048, "lr": 0.000625034965034965, "step": 3182, "tokens_trained": 1.564028064 }, { "epoch": 0.9031983547266151, "grad_norm": 7.30889368057251, "loss": 3.9952, "lr": 0.0006247552447552448, "step": 3184, "tokens_trained": 1.565010296 }, { "epoch": 0.9037656903765691, "grad_norm": 6.234517574310303, "loss": 4.0267, "lr": 0.0006244755244755245, "step": 3186, "tokens_trained": 1.565993536 }, { "epoch": 0.904333026026523, "grad_norm": 4.630068302154541, "loss": 4.0638, "lr": 0.0006241958041958043, "step": 3188, "tokens_trained": 1.566973648 }, { "epoch": 0.9049003616764768, "grad_norm": 10.530085563659668, "loss": 4.056, "lr": 0.0006239160839160839, "step": 3190, "tokens_trained": 1.567954192 }, { "epoch": 0.9054676973264307, "grad_norm": 6.909562110900879, "loss": 4.0297, "lr": 0.0006236363636363636, "step": 3192, "tokens_trained": 1.568941888 }, { "epoch": 0.9060350329763847, "grad_norm": 3.382798910140991, "loss": 3.9554, "lr": 0.0006233566433566433, "step": 3194, "tokens_trained": 1.569926344 }, { "epoch": 0.9066023686263386, "grad_norm": 6.318317890167236, "loss": 4.0313, "lr": 0.0006230769230769231, "step": 3196, "tokens_trained": 1.570909072 }, { "epoch": 0.9071697042762925, "grad_norm": 8.904982566833496, "loss": 4.0422, "lr": 0.0006227972027972028, "step": 3198, "tokens_trained": 1.571891864 }, { "epoch": 0.9077370399262463, "grad_norm": 4.008038520812988, "loss": 4.0254, "lr": 0.0006225174825174825, "step": 3200, "tokens_trained": 1.572877488 }, { "epoch": 0.9083043755762003, "grad_norm": 4.28498649597168, "loss": 3.9916, "lr": 0.0006222377622377623, "step": 3202, "tokens_trained": 1.57385788 }, { "epoch": 0.9088717112261542, "grad_norm": 7.385266304016113, "loss": 3.9841, "lr": 0.000621958041958042, "step": 3204, "tokens_trained": 1.574841232 }, { "epoch": 0.9094390468761081, "grad_norm": 6.1430134773254395, "loss": 3.9886, "lr": 0.0006216783216783217, "step": 3206, "tokens_trained": 1.5758212 }, { "epoch": 0.910006382526062, "grad_norm": 4.640578746795654, "loss": 4.036, "lr": 0.0006213986013986014, "step": 3208, "tokens_trained": 1.576803856 }, { "epoch": 0.9105737181760158, "grad_norm": 2.6749765872955322, "loss": 3.9934, "lr": 0.0006211188811188811, "step": 3210, "tokens_trained": 1.577788136 }, { "epoch": 0.9111410538259698, "grad_norm": 2.5117337703704834, "loss": 3.9924, "lr": 0.0006208391608391608, "step": 3212, "tokens_trained": 1.5787728 }, { "epoch": 0.9117083894759237, "grad_norm": 9.552038192749023, "loss": 4.0141, "lr": 0.0006205594405594406, "step": 3214, "tokens_trained": 1.579757576 }, { "epoch": 0.9122757251258776, "grad_norm": 4.317904949188232, "loss": 4.0242, "lr": 0.0006202797202797203, "step": 3216, "tokens_trained": 1.580737776 }, { "epoch": 0.9128430607758315, "grad_norm": 4.847869873046875, "loss": 4.0037, "lr": 0.00062, "step": 3218, "tokens_trained": 1.58172144 }, { "epoch": 0.9134103964257854, "grad_norm": 8.135149002075195, "loss": 4.056, "lr": 0.0006197202797202797, "step": 3220, "tokens_trained": 1.58270064 }, { "epoch": 0.9139777320757393, "grad_norm": 4.46032190322876, "loss": 4.0037, "lr": 0.0006194405594405595, "step": 3222, "tokens_trained": 1.58368244 }, { "epoch": 0.9145450677256932, "grad_norm": 4.710826873779297, "loss": 4.0083, "lr": 0.0006191608391608392, "step": 3224, "tokens_trained": 1.584669984 }, { "epoch": 0.9151124033756471, "grad_norm": 6.524029731750488, "loss": 4.0394, "lr": 0.0006188811188811189, "step": 3226, "tokens_trained": 1.585651952 }, { "epoch": 0.9156797390256011, "grad_norm": 8.807348251342773, "loss": 4.0215, "lr": 0.0006186013986013986, "step": 3228, "tokens_trained": 1.586634416 }, { "epoch": 0.9162470746755549, "grad_norm": 8.313971519470215, "loss": 4.048, "lr": 0.0006183216783216783, "step": 3230, "tokens_trained": 1.587616352 }, { "epoch": 0.9168144103255088, "grad_norm": 7.2862868309021, "loss": 4.0326, "lr": 0.0006180419580419581, "step": 3232, "tokens_trained": 1.588597696 }, { "epoch": 0.9173817459754627, "grad_norm": 6.1933746337890625, "loss": 4.0232, "lr": 0.0006177622377622377, "step": 3234, "tokens_trained": 1.589579384 }, { "epoch": 0.9179490816254167, "grad_norm": 6.848970890045166, "loss": 4.0134, "lr": 0.0006174825174825175, "step": 3236, "tokens_trained": 1.590563936 }, { "epoch": 0.9185164172753706, "grad_norm": 6.213261604309082, "loss": 3.9622, "lr": 0.0006172027972027972, "step": 3238, "tokens_trained": 1.591546488 }, { "epoch": 0.9190837529253244, "grad_norm": 11.642724990844727, "loss": 4.0487, "lr": 0.000616923076923077, "step": 3240, "tokens_trained": 1.592528992 }, { "epoch": 0.9196510885752783, "grad_norm": 2.465311288833618, "loss": 3.9996, "lr": 0.0006166433566433567, "step": 3242, "tokens_trained": 1.593514088 }, { "epoch": 0.9202184242252323, "grad_norm": 14.788623809814453, "loss": 4.1041, "lr": 0.0006163636363636364, "step": 3244, "tokens_trained": 1.594498768 }, { "epoch": 0.9207857598751862, "grad_norm": 11.614027976989746, "loss": 3.99, "lr": 0.0006160839160839161, "step": 3246, "tokens_trained": 1.595477496 }, { "epoch": 0.9213530955251401, "grad_norm": 8.917405128479004, "loss": 4.0626, "lr": 0.0006158041958041957, "step": 3248, "tokens_trained": 1.596459208 }, { "epoch": 0.9219204311750939, "grad_norm": 9.843046188354492, "loss": 4.0256, "lr": 0.0006155244755244756, "step": 3250, "tokens_trained": 1.59744676 }, { "epoch": 0.9219204311750939, "eval_loss": 1.0055779218673706, "eval_runtime": 20.5405, "step": 3250, "tokens_trained": 1.59744676 }, { "epoch": 0.9224877668250479, "grad_norm": 5.153568267822266, "loss": 3.9596, "lr": 0.0006152447552447552, "step": 3252, "tokens_trained": 1.598428968 }, { "epoch": 0.9230551024750018, "grad_norm": 3.321300745010376, "loss": 3.969, "lr": 0.000614965034965035, "step": 3254, "tokens_trained": 1.599406304 }, { "epoch": 0.9236224381249557, "grad_norm": 5.910068511962891, "loss": 3.9806, "lr": 0.0006146853146853147, "step": 3256, "tokens_trained": 1.60038644 }, { "epoch": 0.9241897737749096, "grad_norm": 9.364005088806152, "loss": 3.9919, "lr": 0.0006144055944055945, "step": 3258, "tokens_trained": 1.601371288 }, { "epoch": 0.9247571094248634, "grad_norm": 9.865127563476562, "loss": 3.9827, "lr": 0.0006141258741258742, "step": 3260, "tokens_trained": 1.602351528 }, { "epoch": 0.9253244450748174, "grad_norm": 6.053020000457764, "loss": 3.9769, "lr": 0.0006138461538461538, "step": 3262, "tokens_trained": 1.603337336 }, { "epoch": 0.9258917807247713, "grad_norm": 5.632033348083496, "loss": 4.061, "lr": 0.0006135664335664336, "step": 3264, "tokens_trained": 1.6043186 }, { "epoch": 0.9264591163747252, "grad_norm": 6.253534317016602, "loss": 3.9414, "lr": 0.0006132867132867132, "step": 3266, "tokens_trained": 1.605300448 }, { "epoch": 0.9270264520246791, "grad_norm": 7.757418632507324, "loss": 4.0119, "lr": 0.0006130069930069931, "step": 3268, "tokens_trained": 1.60628376 }, { "epoch": 0.927593787674633, "grad_norm": 5.378245830535889, "loss": 3.9746, "lr": 0.0006127272727272727, "step": 3270, "tokens_trained": 1.607265384 }, { "epoch": 0.9281611233245869, "grad_norm": 5.998968124389648, "loss": 4.0218, "lr": 0.0006124475524475525, "step": 3272, "tokens_trained": 1.60824544 }, { "epoch": 0.9287284589745408, "grad_norm": 6.340670585632324, "loss": 4.0204, "lr": 0.0006121678321678322, "step": 3274, "tokens_trained": 1.609232632 }, { "epoch": 0.9292957946244947, "grad_norm": 6.357148170471191, "loss": 3.9686, "lr": 0.0006118881118881118, "step": 3276, "tokens_trained": 1.610216024 }, { "epoch": 0.9298631302744487, "grad_norm": 4.993794918060303, "loss": 3.9812, "lr": 0.0006116083916083917, "step": 3278, "tokens_trained": 1.611196872 }, { "epoch": 0.9304304659244025, "grad_norm": 7.559938430786133, "loss": 4.0018, "lr": 0.0006113286713286713, "step": 3280, "tokens_trained": 1.612184944 }, { "epoch": 0.9309978015743564, "grad_norm": 3.7233004570007324, "loss": 3.9835, "lr": 0.0006110489510489511, "step": 3282, "tokens_trained": 1.613170464 }, { "epoch": 0.9315651372243103, "grad_norm": 7.3292717933654785, "loss": 3.977, "lr": 0.0006107692307692307, "step": 3284, "tokens_trained": 1.614153168 }, { "epoch": 0.9321324728742643, "grad_norm": 8.804302215576172, "loss": 3.962, "lr": 0.0006104895104895106, "step": 3286, "tokens_trained": 1.615134208 }, { "epoch": 0.9326998085242182, "grad_norm": 5.557953834533691, "loss": 3.9729, "lr": 0.0006102097902097902, "step": 3288, "tokens_trained": 1.616116248 }, { "epoch": 0.933267144174172, "grad_norm": 5.135542869567871, "loss": 3.9855, "lr": 0.0006099300699300699, "step": 3290, "tokens_trained": 1.617100064 }, { "epoch": 0.9338344798241259, "grad_norm": 10.206086158752441, "loss": 4.0058, "lr": 0.0006096503496503497, "step": 3292, "tokens_trained": 1.61808084 }, { "epoch": 0.9344018154740799, "grad_norm": 6.490070819854736, "loss": 4.0328, "lr": 0.0006093706293706293, "step": 3294, "tokens_trained": 1.619061608 }, { "epoch": 0.9349691511240338, "grad_norm": 6.246134281158447, "loss": 3.9858, "lr": 0.0006090909090909092, "step": 3296, "tokens_trained": 1.620046896 }, { "epoch": 0.9355364867739877, "grad_norm": 6.82793664932251, "loss": 3.9416, "lr": 0.0006088111888111888, "step": 3298, "tokens_trained": 1.621030544 }, { "epoch": 0.9361038224239415, "grad_norm": 5.400341510772705, "loss": 4.0048, "lr": 0.0006085314685314686, "step": 3300, "tokens_trained": 1.622010024 }, { "epoch": 0.9366711580738954, "grad_norm": 2.7493224143981934, "loss": 3.9987, "lr": 0.0006082517482517482, "step": 3302, "tokens_trained": 1.622992736 }, { "epoch": 0.9372384937238494, "grad_norm": 8.426931381225586, "loss": 4.0074, "lr": 0.000607972027972028, "step": 3304, "tokens_trained": 1.623977336 }, { "epoch": 0.9378058293738033, "grad_norm": 6.779547691345215, "loss": 4.0041, "lr": 0.0006076923076923077, "step": 3306, "tokens_trained": 1.624958504 }, { "epoch": 0.9383731650237572, "grad_norm": 5.38230562210083, "loss": 4.0297, "lr": 0.0006074125874125874, "step": 3308, "tokens_trained": 1.625948568 }, { "epoch": 0.938940500673711, "grad_norm": 5.785275936126709, "loss": 4.0112, "lr": 0.0006071328671328672, "step": 3310, "tokens_trained": 1.626932696 }, { "epoch": 0.939507836323665, "grad_norm": 14.610711097717285, "loss": 3.9558, "lr": 0.0006068531468531468, "step": 3312, "tokens_trained": 1.62791704 }, { "epoch": 0.9400751719736189, "grad_norm": 2.3301351070404053, "loss": 4.0155, "lr": 0.0006065734265734267, "step": 3314, "tokens_trained": 1.628900096 }, { "epoch": 0.9406425076235728, "grad_norm": 17.020362854003906, "loss": 4.0244, "lr": 0.0006062937062937063, "step": 3316, "tokens_trained": 1.629885888 }, { "epoch": 0.9412098432735267, "grad_norm": 8.809579849243164, "loss": 4.0622, "lr": 0.000606013986013986, "step": 3318, "tokens_trained": 1.630868992 }, { "epoch": 0.9417771789234806, "grad_norm": 4.908751964569092, "loss": 4.0464, "lr": 0.0006057342657342657, "step": 3320, "tokens_trained": 1.631855664 }, { "epoch": 0.9423445145734345, "grad_norm": 9.65546989440918, "loss": 4.013, "lr": 0.0006054545454545455, "step": 3322, "tokens_trained": 1.632839496 }, { "epoch": 0.9429118502233884, "grad_norm": 5.595473766326904, "loss": 4.0371, "lr": 0.0006051748251748252, "step": 3324, "tokens_trained": 1.633827536 }, { "epoch": 0.9434791858733423, "grad_norm": 10.249938011169434, "loss": 4.0702, "lr": 0.0006048951048951049, "step": 3326, "tokens_trained": 1.634811888 }, { "epoch": 0.9440465215232963, "grad_norm": 12.086007118225098, "loss": 4.0042, "lr": 0.0006046153846153846, "step": 3328, "tokens_trained": 1.635792824 }, { "epoch": 0.9446138571732501, "grad_norm": 3.0745136737823486, "loss": 4.0355, "lr": 0.0006043356643356643, "step": 3330, "tokens_trained": 1.636776176 }, { "epoch": 0.945181192823204, "grad_norm": 4.060697078704834, "loss": 4.0016, "lr": 0.0006040559440559441, "step": 3332, "tokens_trained": 1.637758008 }, { "epoch": 0.9457485284731579, "grad_norm": 7.648933410644531, "loss": 3.9939, "lr": 0.0006037762237762238, "step": 3334, "tokens_trained": 1.638744408 }, { "epoch": 0.9463158641231119, "grad_norm": 5.033253192901611, "loss": 4.0245, "lr": 0.0006034965034965035, "step": 3336, "tokens_trained": 1.639724776 }, { "epoch": 0.9468831997730658, "grad_norm": 4.653557300567627, "loss": 4.0169, "lr": 0.0006032167832167832, "step": 3338, "tokens_trained": 1.640708864 }, { "epoch": 0.9474505354230196, "grad_norm": 6.682651042938232, "loss": 4.0062, "lr": 0.000602937062937063, "step": 3340, "tokens_trained": 1.641689864 }, { "epoch": 0.9480178710729735, "grad_norm": 5.059361934661865, "loss": 3.9681, "lr": 0.0006026573426573426, "step": 3342, "tokens_trained": 1.64267264 }, { "epoch": 0.9485852067229275, "grad_norm": 4.165974140167236, "loss": 3.9941, "lr": 0.0006023776223776224, "step": 3344, "tokens_trained": 1.643655624 }, { "epoch": 0.9491525423728814, "grad_norm": 6.669079780578613, "loss": 4.0258, "lr": 0.0006020979020979021, "step": 3346, "tokens_trained": 1.644635752 }, { "epoch": 0.9497198780228353, "grad_norm": 5.924664497375488, "loss": 4.0589, "lr": 0.0006018181818181818, "step": 3348, "tokens_trained": 1.64561992 }, { "epoch": 0.9502872136727891, "grad_norm": 1.662906527519226, "loss": 3.9894, "lr": 0.0006015384615384616, "step": 3350, "tokens_trained": 1.646605552 }, { "epoch": 0.950854549322743, "grad_norm": 3.1677517890930176, "loss": 4.0062, "lr": 0.0006012587412587413, "step": 3352, "tokens_trained": 1.647587824 }, { "epoch": 0.951421884972697, "grad_norm": 5.4521918296813965, "loss": 4.0244, "lr": 0.000600979020979021, "step": 3354, "tokens_trained": 1.648566792 }, { "epoch": 0.9519892206226509, "grad_norm": 7.839843273162842, "loss": 3.9954, "lr": 0.0006006993006993006, "step": 3356, "tokens_trained": 1.6495504 }, { "epoch": 0.9525565562726048, "grad_norm": 5.340535640716553, "loss": 3.9915, "lr": 0.0006004195804195805, "step": 3358, "tokens_trained": 1.65053064 }, { "epoch": 0.9531238919225586, "grad_norm": 3.9342992305755615, "loss": 3.9507, "lr": 0.0006001398601398601, "step": 3360, "tokens_trained": 1.651516704 }, { "epoch": 0.9536912275725126, "grad_norm": 3.879631519317627, "loss": 4.0369, "lr": 0.0005998601398601399, "step": 3362, "tokens_trained": 1.652501248 }, { "epoch": 0.9542585632224665, "grad_norm": 4.699181079864502, "loss": 4.0151, "lr": 0.0005995804195804196, "step": 3364, "tokens_trained": 1.653486632 }, { "epoch": 0.9548258988724204, "grad_norm": 7.259454250335693, "loss": 3.9855, "lr": 0.0005993006993006993, "step": 3366, "tokens_trained": 1.654473488 }, { "epoch": 0.9553932345223743, "grad_norm": 6.6725029945373535, "loss": 3.9972, "lr": 0.0005990209790209791, "step": 3368, "tokens_trained": 1.655456328 }, { "epoch": 0.9559605701723282, "grad_norm": 5.077842712402344, "loss": 3.9706, "lr": 0.0005987412587412587, "step": 3370, "tokens_trained": 1.656442256 }, { "epoch": 0.9565279058222821, "grad_norm": 7.882787704467773, "loss": 4.0581, "lr": 0.0005984615384615385, "step": 3372, "tokens_trained": 1.657425912 }, { "epoch": 0.957095241472236, "grad_norm": 7.118039608001709, "loss": 3.9939, "lr": 0.0005981818181818181, "step": 3374, "tokens_trained": 1.658406184 }, { "epoch": 0.9573789092972129, "eval_loss": 1.0043113231658936, "eval_runtime": 20.471, "step": 3375, "tokens_trained": 1.658898224 }, { "epoch": 0.9576625771221899, "grad_norm": 11.206400871276855, "loss": 4.0073, "lr": 0.000597902097902098, "step": 3376, "tokens_trained": 1.65938968 }, { "epoch": 0.9582299127721439, "grad_norm": 3.2221481800079346, "loss": 3.9924, "lr": 0.0005976223776223776, "step": 3378, "tokens_trained": 1.660372856 }, { "epoch": 0.9587972484220977, "grad_norm": 15.000614166259766, "loss": 4.0361, "lr": 0.0005973426573426574, "step": 3380, "tokens_trained": 1.66135512 }, { "epoch": 0.9593645840720516, "grad_norm": 13.365633964538574, "loss": 4.0258, "lr": 0.0005970629370629371, "step": 3382, "tokens_trained": 1.662332728 }, { "epoch": 0.9599319197220055, "grad_norm": 6.362198829650879, "loss": 3.9868, "lr": 0.0005967832167832167, "step": 3384, "tokens_trained": 1.663311392 }, { "epoch": 0.9604992553719595, "grad_norm": 16.104549407958984, "loss": 3.9893, "lr": 0.0005965034965034966, "step": 3386, "tokens_trained": 1.664296088 }, { "epoch": 0.9610665910219134, "grad_norm": 32.109375, "loss": 4.0635, "lr": 0.0005962237762237762, "step": 3388, "tokens_trained": 1.665278232 }, { "epoch": 0.9616339266718672, "grad_norm": 14.814417839050293, "loss": 4.0545, "lr": 0.000595944055944056, "step": 3390, "tokens_trained": 1.666262952 }, { "epoch": 0.9622012623218211, "grad_norm": 8.69149398803711, "loss": 4.0214, "lr": 0.0005956643356643356, "step": 3392, "tokens_trained": 1.66724224 }, { "epoch": 0.962768597971775, "grad_norm": 6.150435447692871, "loss": 4.0675, "lr": 0.0005953846153846155, "step": 3394, "tokens_trained": 1.668222488 }, { "epoch": 0.963335933621729, "grad_norm": 14.53095817565918, "loss": 4.0293, "lr": 0.0005951048951048951, "step": 3396, "tokens_trained": 1.66920572 }, { "epoch": 0.9639032692716829, "grad_norm": 14.750361442565918, "loss": 4.0345, "lr": 0.0005948251748251748, "step": 3398, "tokens_trained": 1.670191456 }, { "epoch": 0.9644706049216367, "grad_norm": 10.563243865966797, "loss": 4.0796, "lr": 0.0005945454545454546, "step": 3400, "tokens_trained": 1.671174992 }, { "epoch": 0.9650379405715906, "grad_norm": 14.203415870666504, "loss": 4.0078, "lr": 0.0005942657342657342, "step": 3402, "tokens_trained": 1.672159048 }, { "epoch": 0.9656052762215446, "grad_norm": 7.918346405029297, "loss": 4.0015, "lr": 0.0005939860139860141, "step": 3404, "tokens_trained": 1.6731408 }, { "epoch": 0.9661726118714985, "grad_norm": 3.3628811836242676, "loss": 4.0656, "lr": 0.0005937062937062937, "step": 3406, "tokens_trained": 1.674120472 }, { "epoch": 0.9667399475214524, "grad_norm": 13.740876197814941, "loss": 4.0296, "lr": 0.0005934265734265735, "step": 3408, "tokens_trained": 1.67510176 }, { "epoch": 0.9673072831714062, "grad_norm": 8.178666114807129, "loss": 3.9804, "lr": 0.0005931468531468531, "step": 3410, "tokens_trained": 1.676087336 }, { "epoch": 0.9678746188213602, "grad_norm": 6.31284761428833, "loss": 3.9905, "lr": 0.000592867132867133, "step": 3412, "tokens_trained": 1.677069328 }, { "epoch": 0.9684419544713141, "grad_norm": 10.166040420532227, "loss": 3.9962, "lr": 0.0005925874125874126, "step": 3414, "tokens_trained": 1.678049672 }, { "epoch": 0.969009290121268, "grad_norm": 6.166718006134033, "loss": 3.9966, "lr": 0.0005923076923076923, "step": 3416, "tokens_trained": 1.679035104 }, { "epoch": 0.969576625771222, "grad_norm": 3.7397615909576416, "loss": 4.0323, "lr": 0.0005920279720279721, "step": 3418, "tokens_trained": 1.680018424 }, { "epoch": 0.9701439614211758, "grad_norm": 12.122432708740234, "loss": 4.0143, "lr": 0.0005917482517482517, "step": 3420, "tokens_trained": 1.681001112 }, { "epoch": 0.9707112970711297, "grad_norm": 5.118746280670166, "loss": 3.9909, "lr": 0.0005914685314685316, "step": 3422, "tokens_trained": 1.681987648 }, { "epoch": 0.9712786327210836, "grad_norm": 5.810860633850098, "loss": 3.9675, "lr": 0.0005911888111888112, "step": 3424, "tokens_trained": 1.68296972 }, { "epoch": 0.9718459683710375, "grad_norm": 7.637686252593994, "loss": 3.9976, "lr": 0.0005909090909090909, "step": 3426, "tokens_trained": 1.683952 }, { "epoch": 0.9724133040209915, "grad_norm": 5.637698173522949, "loss": 3.9829, "lr": 0.0005906293706293706, "step": 3428, "tokens_trained": 1.684933912 }, { "epoch": 0.9729806396709453, "grad_norm": 2.2650809288024902, "loss": 3.9656, "lr": 0.0005903496503496504, "step": 3430, "tokens_trained": 1.685915176 }, { "epoch": 0.9735479753208992, "grad_norm": 6.0117058753967285, "loss": 4.0575, "lr": 0.0005900699300699301, "step": 3432, "tokens_trained": 1.686901184 }, { "epoch": 0.9741153109708531, "grad_norm": 8.301697731018066, "loss": 3.9869, "lr": 0.0005897902097902098, "step": 3434, "tokens_trained": 1.687886888 }, { "epoch": 0.9746826466208071, "grad_norm": 6.436981678009033, "loss": 4.01, "lr": 0.0005895104895104896, "step": 3436, "tokens_trained": 1.68886904 }, { "epoch": 0.975249982270761, "grad_norm": 4.290571212768555, "loss": 3.9953, "lr": 0.0005892307692307692, "step": 3438, "tokens_trained": 1.689850264 }, { "epoch": 0.9758173179207148, "grad_norm": 4.618532657623291, "loss": 3.9995, "lr": 0.000588951048951049, "step": 3440, "tokens_trained": 1.69083728 }, { "epoch": 0.9763846535706687, "grad_norm": 8.481820106506348, "loss": 4.0019, "lr": 0.0005886713286713287, "step": 3442, "tokens_trained": 1.691819976 }, { "epoch": 0.9769519892206227, "grad_norm": 4.643980503082275, "loss": 3.9974, "lr": 0.0005883916083916084, "step": 3444, "tokens_trained": 1.692803784 }, { "epoch": 0.9775193248705766, "grad_norm": 6.828413009643555, "loss": 3.9886, "lr": 0.0005881118881118881, "step": 3446, "tokens_trained": 1.69378512 }, { "epoch": 0.9780866605205305, "grad_norm": 7.530898094177246, "loss": 4.0318, "lr": 0.0005878321678321679, "step": 3448, "tokens_trained": 1.694768152 }, { "epoch": 0.9786539961704843, "grad_norm": 6.020658493041992, "loss": 4.0057, "lr": 0.0005875524475524476, "step": 3450, "tokens_trained": 1.695752832 }, { "epoch": 0.9792213318204382, "grad_norm": 5.292300224304199, "loss": 3.9915, "lr": 0.0005872727272727273, "step": 3452, "tokens_trained": 1.696735104 }, { "epoch": 0.9797886674703922, "grad_norm": 4.932474613189697, "loss": 4.0163, "lr": 0.0005869930069930069, "step": 3454, "tokens_trained": 1.697718208 }, { "epoch": 0.9803560031203461, "grad_norm": 4.504141807556152, "loss": 3.9875, "lr": 0.0005867132867132867, "step": 3456, "tokens_trained": 1.698697752 }, { "epoch": 0.9809233387703, "grad_norm": 4.826939582824707, "loss": 3.9326, "lr": 0.0005864335664335665, "step": 3458, "tokens_trained": 1.699672392 }, { "epoch": 0.9814906744202538, "grad_norm": 7.805232524871826, "loss": 3.9695, "lr": 0.0005861538461538462, "step": 3460, "tokens_trained": 1.700656392 }, { "epoch": 0.9820580100702078, "grad_norm": 6.857801914215088, "loss": 3.995, "lr": 0.0005858741258741259, "step": 3462, "tokens_trained": 1.701644848 }, { "epoch": 0.9826253457201617, "grad_norm": 4.32315731048584, "loss": 3.9701, "lr": 0.0005855944055944055, "step": 3464, "tokens_trained": 1.702624688 }, { "epoch": 0.9831926813701156, "grad_norm": 6.007495880126953, "loss": 3.9887, "lr": 0.0005853146853146854, "step": 3466, "tokens_trained": 1.703607376 }, { "epoch": 0.9837600170200695, "grad_norm": 4.779850006103516, "loss": 3.9852, "lr": 0.000585034965034965, "step": 3468, "tokens_trained": 1.704589808 }, { "epoch": 0.9843273526700234, "grad_norm": 4.593331336975098, "loss": 4.0136, "lr": 0.0005847552447552448, "step": 3470, "tokens_trained": 1.705573184 }, { "epoch": 0.9848946883199773, "grad_norm": 5.466218948364258, "loss": 3.9426, "lr": 0.0005844755244755244, "step": 3472, "tokens_trained": 1.706555864 }, { "epoch": 0.9854620239699312, "grad_norm": 8.283979415893555, "loss": 3.9788, "lr": 0.0005841958041958042, "step": 3474, "tokens_trained": 1.70754036 }, { "epoch": 0.9860293596198851, "grad_norm": 2.4386069774627686, "loss": 3.9413, "lr": 0.000583916083916084, "step": 3476, "tokens_trained": 1.708525528 }, { "epoch": 0.9865966952698391, "grad_norm": 4.485580921173096, "loss": 3.9695, "lr": 0.0005836363636363636, "step": 3478, "tokens_trained": 1.709508232 }, { "epoch": 0.9871640309197929, "grad_norm": 6.725922584533691, "loss": 4.0084, "lr": 0.0005833566433566434, "step": 3480, "tokens_trained": 1.710493288 }, { "epoch": 0.9877313665697468, "grad_norm": 5.532742023468018, "loss": 3.9571, "lr": 0.000583076923076923, "step": 3482, "tokens_trained": 1.711478792 }, { "epoch": 0.9882987022197007, "grad_norm": 5.568683624267578, "loss": 4.0178, "lr": 0.0005827972027972029, "step": 3484, "tokens_trained": 1.712464864 }, { "epoch": 0.9888660378696547, "grad_norm": 5.192487716674805, "loss": 4.0294, "lr": 0.0005825174825174825, "step": 3486, "tokens_trained": 1.713448256 }, { "epoch": 0.9894333735196086, "grad_norm": 5.584596633911133, "loss": 3.9992, "lr": 0.0005822377622377623, "step": 3488, "tokens_trained": 1.714435472 }, { "epoch": 0.9900007091695624, "grad_norm": 5.044432163238525, "loss": 4.0119, "lr": 0.0005819580419580419, "step": 3490, "tokens_trained": 1.715418784 }, { "epoch": 0.9905680448195163, "grad_norm": 3.4799540042877197, "loss": 4.0099, "lr": 0.0005816783216783216, "step": 3492, "tokens_trained": 1.716402544 }, { "epoch": 0.9911353804694703, "grad_norm": 4.949790000915527, "loss": 3.9372, "lr": 0.0005813986013986015, "step": 3494, "tokens_trained": 1.71738848 }, { "epoch": 0.9917027161194242, "grad_norm": 6.527776718139648, "loss": 3.9938, "lr": 0.0005811188811188811, "step": 3496, "tokens_trained": 1.718371984 }, { "epoch": 0.9922700517693781, "grad_norm": 5.616584300994873, "loss": 3.9352, "lr": 0.0005808391608391609, "step": 3498, "tokens_trained": 1.719358256 }, { "epoch": 0.9928373874193319, "grad_norm": 7.028440952301025, "loss": 3.9494, "lr": 0.0005805594405594405, "step": 3500, "tokens_trained": 1.720339264 }, { "epoch": 0.9928373874193319, "eval_loss": 0.999991238117218, "eval_runtime": 20.318, "step": 3500, "tokens_trained": 1.720339264 }, { "epoch": 0.9934047230692858, "grad_norm": 5.338140487670898, "loss": 3.9748, "lr": 0.0005802797202797204, "step": 3502, "tokens_trained": 1.72132272 }, { "epoch": 0.9939720587192398, "grad_norm": 3.3448476791381836, "loss": 3.96, "lr": 0.00058, "step": 3504, "tokens_trained": 1.722307576 }, { "epoch": 0.9945393943691937, "grad_norm": 10.660968780517578, "loss": 4.0199, "lr": 0.0005797202797202797, "step": 3506, "tokens_trained": 1.723288472 }, { "epoch": 0.9951067300191476, "grad_norm": 7.261615753173828, "loss": 3.9889, "lr": 0.0005794405594405594, "step": 3508, "tokens_trained": 1.724272744 }, { "epoch": 0.9956740656691014, "grad_norm": 5.103553295135498, "loss": 4.0047, "lr": 0.0005791608391608391, "step": 3510, "tokens_trained": 1.725255576 }, { "epoch": 0.9962414013190554, "grad_norm": 1.5151104927062988, "loss": 4.0228, "lr": 0.000578881118881119, "step": 3512, "tokens_trained": 1.72624092 }, { "epoch": 0.9968087369690093, "grad_norm": 6.042428493499756, "loss": 3.9699, "lr": 0.0005786013986013986, "step": 3514, "tokens_trained": 1.727227176 }, { "epoch": 0.9973760726189632, "grad_norm": 10.020720481872559, "loss": 3.9961, "lr": 0.0005783216783216784, "step": 3516, "tokens_trained": 1.728205072 }, { "epoch": 0.9979434082689171, "grad_norm": 9.385619163513184, "loss": 3.9962, "lr": 0.000578041958041958, "step": 3518, "tokens_trained": 1.729187536 }, { "epoch": 0.998510743918871, "grad_norm": 1.413792371749878, "loss": 4.0256, "lr": 0.0005777622377622377, "step": 3520, "tokens_trained": 1.730168968 }, { "epoch": 0.9990780795688249, "grad_norm": 2.8461780548095703, "loss": 3.9616, "lr": 0.0005774825174825175, "step": 3522, "tokens_trained": 1.731150472 }, { "epoch": 0.9996454152187788, "grad_norm": 4.164590835571289, "loss": 3.9786, "lr": 0.0005772027972027972, "step": 3524, "tokens_trained": 1.732130536 }, { "epoch": 1.0, "grad_norm": 1.0116016864776611, "loss": 2.5007, "lr": 0.0005769230769230769, "step": 3526, "tokens_trained": 1.732744968 }, { "epoch": 1.0005673356499538, "grad_norm": 5.954165458679199, "loss": 3.9598, "lr": 0.0005766433566433566, "step": 3528, "tokens_trained": 1.733727424 }, { "epoch": 1.0011346712999079, "grad_norm": 8.648826599121094, "loss": 3.9773, "lr": 0.0005763636363636365, "step": 3530, "tokens_trained": 1.734708184 }, { "epoch": 1.0017020069498617, "grad_norm": 2.920509099960327, "loss": 3.9745, "lr": 0.0005760839160839161, "step": 3532, "tokens_trained": 1.735688616 }, { "epoch": 1.0022693425998157, "grad_norm": 9.963903427124023, "loss": 3.9742, "lr": 0.0005758041958041958, "step": 3534, "tokens_trained": 1.73667084 }, { "epoch": 1.0028366782497695, "grad_norm": 9.745009422302246, "loss": 4.028, "lr": 0.0005755244755244755, "step": 3536, "tokens_trained": 1.737656328 }, { "epoch": 1.0034040138997233, "grad_norm": 5.159154891967773, "loss": 3.9812, "lr": 0.0005752447552447552, "step": 3538, "tokens_trained": 1.738637688 }, { "epoch": 1.0039713495496774, "grad_norm": 10.829404830932617, "loss": 3.9795, "lr": 0.000574965034965035, "step": 3540, "tokens_trained": 1.739621688 }, { "epoch": 1.0045386851996312, "grad_norm": 8.493478775024414, "loss": 3.9918, "lr": 0.0005746853146853147, "step": 3542, "tokens_trained": 1.740604488 }, { "epoch": 1.0051060208495852, "grad_norm": 4.013627529144287, "loss": 3.9928, "lr": 0.0005744055944055944, "step": 3544, "tokens_trained": 1.74158764 }, { "epoch": 1.005673356499539, "grad_norm": 12.669920921325684, "loss": 4.0114, "lr": 0.0005741258741258741, "step": 3546, "tokens_trained": 1.742573592 }, { "epoch": 1.0062406921494929, "grad_norm": 6.349422931671143, "loss": 4.0294, "lr": 0.000573846153846154, "step": 3548, "tokens_trained": 1.743555672 }, { "epoch": 1.006808027799447, "grad_norm": 4.14855432510376, "loss": 3.9963, "lr": 0.0005735664335664336, "step": 3550, "tokens_trained": 1.744538384 }, { "epoch": 1.0073753634494007, "grad_norm": 9.063926696777344, "loss": 3.9557, "lr": 0.0005732867132867133, "step": 3552, "tokens_trained": 1.745523552 }, { "epoch": 1.0079426990993547, "grad_norm": 11.227505683898926, "loss": 4.0087, "lr": 0.000573006993006993, "step": 3554, "tokens_trained": 1.746510024 }, { "epoch": 1.0085100347493086, "grad_norm": 2.418097972869873, "loss": 3.9942, "lr": 0.0005727272727272727, "step": 3556, "tokens_trained": 1.747493048 }, { "epoch": 1.0090773703992624, "grad_norm": 14.376424789428711, "loss": 3.999, "lr": 0.0005724475524475525, "step": 3558, "tokens_trained": 1.748476808 }, { "epoch": 1.0096447060492164, "grad_norm": 9.035455703735352, "loss": 4.063, "lr": 0.0005721678321678322, "step": 3560, "tokens_trained": 1.749460504 }, { "epoch": 1.0102120416991702, "grad_norm": 3.8785758018493652, "loss": 4.0269, "lr": 0.0005718881118881118, "step": 3562, "tokens_trained": 1.750438936 }, { "epoch": 1.0107793773491243, "grad_norm": 15.488290786743164, "loss": 4.0294, "lr": 0.0005716083916083916, "step": 3564, "tokens_trained": 1.751420168 }, { "epoch": 1.011346712999078, "grad_norm": 10.785538673400879, "loss": 4.0102, "lr": 0.0005713286713286714, "step": 3566, "tokens_trained": 1.752405288 }, { "epoch": 1.011914048649032, "grad_norm": 5.724320888519287, "loss": 4.0148, "lr": 0.0005710489510489511, "step": 3568, "tokens_trained": 1.75338604 }, { "epoch": 1.012481384298986, "grad_norm": 11.051252365112305, "loss": 4.022, "lr": 0.0005707692307692308, "step": 3570, "tokens_trained": 1.75436632 }, { "epoch": 1.0130487199489397, "grad_norm": 10.290446281433105, "loss": 3.9781, "lr": 0.0005704895104895105, "step": 3572, "tokens_trained": 1.755349944 }, { "epoch": 1.0136160555988938, "grad_norm": 4.81416130065918, "loss": 4.0393, "lr": 0.0005702097902097902, "step": 3574, "tokens_trained": 1.756337976 }, { "epoch": 1.0141833912488476, "grad_norm": 14.237113952636719, "loss": 4.087, "lr": 0.0005699300699300699, "step": 3576, "tokens_trained": 1.75732372 }, { "epoch": 1.0147507268988014, "grad_norm": 3.973662853240967, "loss": 3.9692, "lr": 0.0005696503496503497, "step": 3578, "tokens_trained": 1.7583098 }, { "epoch": 1.0153180625487555, "grad_norm": 5.629733562469482, "loss": 4.0003, "lr": 0.0005693706293706293, "step": 3580, "tokens_trained": 1.759300416 }, { "epoch": 1.0158853981987093, "grad_norm": 7.505983352661133, "loss": 4.011, "lr": 0.0005690909090909091, "step": 3582, "tokens_trained": 1.760288632 }, { "epoch": 1.0164527338486633, "grad_norm": 5.501095294952393, "loss": 3.994, "lr": 0.0005688111888111889, "step": 3584, "tokens_trained": 1.761270328 }, { "epoch": 1.0170200694986171, "grad_norm": 4.74052619934082, "loss": 4.0241, "lr": 0.0005685314685314686, "step": 3586, "tokens_trained": 1.762252432 }, { "epoch": 1.017587405148571, "grad_norm": 8.409584045410156, "loss": 4.0137, "lr": 0.0005682517482517483, "step": 3588, "tokens_trained": 1.76323772 }, { "epoch": 1.018154740798525, "grad_norm": 5.391080379486084, "loss": 3.9424, "lr": 0.0005679720279720279, "step": 3590, "tokens_trained": 1.764220272 }, { "epoch": 1.0187220764484788, "grad_norm": 4.679509162902832, "loss": 3.9893, "lr": 0.0005676923076923077, "step": 3592, "tokens_trained": 1.765203832 }, { "epoch": 1.0192894120984328, "grad_norm": 5.354970932006836, "loss": 4.023, "lr": 0.0005674125874125874, "step": 3594, "tokens_trained": 1.76618936 }, { "epoch": 1.0198567477483866, "grad_norm": 5.1085357666015625, "loss": 3.9995, "lr": 0.0005671328671328672, "step": 3596, "tokens_trained": 1.767171216 }, { "epoch": 1.0204240833983405, "grad_norm": 3.0856151580810547, "loss": 4.0084, "lr": 0.0005668531468531468, "step": 3598, "tokens_trained": 1.76815464 }, { "epoch": 1.0209914190482945, "grad_norm": 2.330599308013916, "loss": 3.9838, "lr": 0.0005665734265734265, "step": 3600, "tokens_trained": 1.76913612 }, { "epoch": 1.0215587546982483, "grad_norm": 5.641542434692383, "loss": 3.951, "lr": 0.0005662937062937064, "step": 3602, "tokens_trained": 1.770119592 }, { "epoch": 1.0221260903482023, "grad_norm": 8.442550659179688, "loss": 4.0088, "lr": 0.000566013986013986, "step": 3604, "tokens_trained": 1.771103624 }, { "epoch": 1.0226934259981562, "grad_norm": 6.0125732421875, "loss": 4.0243, "lr": 0.0005657342657342658, "step": 3606, "tokens_trained": 1.772091496 }, { "epoch": 1.02326076164811, "grad_norm": 4.9415388107299805, "loss": 3.9874, "lr": 0.0005654545454545454, "step": 3608, "tokens_trained": 1.77307708 }, { "epoch": 1.023828097298064, "grad_norm": 5.762909889221191, "loss": 4.0242, "lr": 0.0005651748251748252, "step": 3610, "tokens_trained": 1.774058032 }, { "epoch": 1.0243954329480178, "grad_norm": 6.652433395385742, "loss": 3.9908, "lr": 0.0005648951048951049, "step": 3612, "tokens_trained": 1.775036512 }, { "epoch": 1.0249627685979719, "grad_norm": 3.539031505584717, "loss": 3.9406, "lr": 0.0005646153846153847, "step": 3614, "tokens_trained": 1.776021656 }, { "epoch": 1.0255301042479257, "grad_norm": 6.829031467437744, "loss": 3.9839, "lr": 0.0005643356643356643, "step": 3616, "tokens_trained": 1.777000824 }, { "epoch": 1.0260974398978795, "grad_norm": 3.46431040763855, "loss": 4.0013, "lr": 0.000564055944055944, "step": 3618, "tokens_trained": 1.777983504 }, { "epoch": 1.0266647755478335, "grad_norm": 5.163998126983643, "loss": 3.9898, "lr": 0.0005637762237762239, "step": 3620, "tokens_trained": 1.778966368 }, { "epoch": 1.0272321111977873, "grad_norm": 4.270689010620117, "loss": 3.9868, "lr": 0.0005634965034965035, "step": 3622, "tokens_trained": 1.77994468 }, { "epoch": 1.0277994468477414, "grad_norm": 5.297236442565918, "loss": 3.9903, "lr": 0.0005632167832167833, "step": 3624, "tokens_trained": 1.7809246 }, { "epoch": 1.0280831146727183, "eval_loss": 0.9977753162384033, "eval_runtime": 20.5557, "step": 3625, "tokens_trained": 1.781418056 }, { "epoch": 1.0283667824976952, "grad_norm": 4.560519218444824, "loss": 3.9339, "lr": 0.0005629370629370629, "step": 3626, "tokens_trained": 1.781910808 }, { "epoch": 1.028934118147649, "grad_norm": 3.7894208431243896, "loss": 3.9739, "lr": 0.0005626573426573426, "step": 3628, "tokens_trained": 1.782891912 }, { "epoch": 1.029501453797603, "grad_norm": 3.9937522411346436, "loss": 3.9734, "lr": 0.0005623776223776224, "step": 3630, "tokens_trained": 1.783871032 }, { "epoch": 1.0300687894475569, "grad_norm": 5.798377990722656, "loss": 3.9526, "lr": 0.0005620979020979021, "step": 3632, "tokens_trained": 1.784855792 }, { "epoch": 1.030636125097511, "grad_norm": 3.2532927989959717, "loss": 3.9237, "lr": 0.0005618181818181818, "step": 3634, "tokens_trained": 1.785835216 }, { "epoch": 1.0312034607474647, "grad_norm": 3.2262985706329346, "loss": 3.9676, "lr": 0.0005615384615384615, "step": 3636, "tokens_trained": 1.78682184 }, { "epoch": 1.0317707963974185, "grad_norm": 2.4307727813720703, "loss": 3.9376, "lr": 0.0005612587412587414, "step": 3638, "tokens_trained": 1.787804536 }, { "epoch": 1.0323381320473726, "grad_norm": 11.10562515258789, "loss": 4.0096, "lr": 0.000560979020979021, "step": 3640, "tokens_trained": 1.788785152 }, { "epoch": 1.0329054676973264, "grad_norm": 8.139045715332031, "loss": 3.992, "lr": 0.0005606993006993008, "step": 3642, "tokens_trained": 1.789766736 }, { "epoch": 1.0334728033472804, "grad_norm": 5.561949729919434, "loss": 3.9368, "lr": 0.0005604195804195804, "step": 3644, "tokens_trained": 1.790746488 }, { "epoch": 1.0340401389972342, "grad_norm": 6.812232494354248, "loss": 4.0185, "lr": 0.0005601398601398601, "step": 3646, "tokens_trained": 1.79172608 }, { "epoch": 1.034607474647188, "grad_norm": 6.200248718261719, "loss": 3.9072, "lr": 0.0005598601398601399, "step": 3648, "tokens_trained": 1.792710784 }, { "epoch": 1.035174810297142, "grad_norm": 5.059606075286865, "loss": 3.9334, "lr": 0.0005595804195804196, "step": 3650, "tokens_trained": 1.793692736 }, { "epoch": 1.035742145947096, "grad_norm": 2.722522020339966, "loss": 3.9438, "lr": 0.0005593006993006993, "step": 3652, "tokens_trained": 1.79467536 }, { "epoch": 1.03630948159705, "grad_norm": 5.643895626068115, "loss": 4.0213, "lr": 0.000559020979020979, "step": 3654, "tokens_trained": 1.795662048 }, { "epoch": 1.0368768172470038, "grad_norm": 3.948822021484375, "loss": 4.0022, "lr": 0.0005587412587412589, "step": 3656, "tokens_trained": 1.79664468 }, { "epoch": 1.0374441528969576, "grad_norm": 2.5267179012298584, "loss": 3.9655, "lr": 0.0005584615384615385, "step": 3658, "tokens_trained": 1.7976262 }, { "epoch": 1.0380114885469116, "grad_norm": 2.7988510131835938, "loss": 4.0161, "lr": 0.0005581818181818182, "step": 3660, "tokens_trained": 1.79861132 }, { "epoch": 1.0385788241968654, "grad_norm": 8.685417175292969, "loss": 4.0038, "lr": 0.0005579020979020979, "step": 3662, "tokens_trained": 1.799592384 }, { "epoch": 1.0391461598468195, "grad_norm": 8.391874313354492, "loss": 3.9519, "lr": 0.0005576223776223776, "step": 3664, "tokens_trained": 1.800577208 }, { "epoch": 1.0397134954967733, "grad_norm": 7.6766815185546875, "loss": 4.0119, "lr": 0.0005573426573426574, "step": 3666, "tokens_trained": 1.801559128 }, { "epoch": 1.040280831146727, "grad_norm": 6.230587959289551, "loss": 3.9528, "lr": 0.0005570629370629371, "step": 3668, "tokens_trained": 1.802540608 }, { "epoch": 1.0408481667966811, "grad_norm": 7.4818010330200195, "loss": 3.9532, "lr": 0.0005567832167832167, "step": 3670, "tokens_trained": 1.80352688 }, { "epoch": 1.041415502446635, "grad_norm": 7.714044094085693, "loss": 4.0154, "lr": 0.0005565034965034965, "step": 3672, "tokens_trained": 1.804515736 }, { "epoch": 1.041982838096589, "grad_norm": 5.260356426239014, "loss": 3.9931, "lr": 0.0005562237762237763, "step": 3674, "tokens_trained": 1.805497152 }, { "epoch": 1.0425501737465428, "grad_norm": 4.576403617858887, "loss": 4.0345, "lr": 0.000555944055944056, "step": 3676, "tokens_trained": 1.806479328 }, { "epoch": 1.0431175093964966, "grad_norm": 3.378896713256836, "loss": 3.9827, "lr": 0.0005556643356643357, "step": 3678, "tokens_trained": 1.807459232 }, { "epoch": 1.0436848450464506, "grad_norm": 6.739299774169922, "loss": 3.9811, "lr": 0.0005553846153846154, "step": 3680, "tokens_trained": 1.808441944 }, { "epoch": 1.0442521806964045, "grad_norm": 4.965353012084961, "loss": 3.9292, "lr": 0.0005551048951048951, "step": 3682, "tokens_trained": 1.809423488 }, { "epoch": 1.0448195163463585, "grad_norm": 7.479167461395264, "loss": 3.9386, "lr": 0.0005548251748251748, "step": 3684, "tokens_trained": 1.810409008 }, { "epoch": 1.0453868519963123, "grad_norm": 3.754814863204956, "loss": 3.9936, "lr": 0.0005545454545454546, "step": 3686, "tokens_trained": 1.811387856 }, { "epoch": 1.0459541876462661, "grad_norm": 5.744228839874268, "loss": 3.9761, "lr": 0.0005542657342657342, "step": 3688, "tokens_trained": 1.812371104 }, { "epoch": 1.0465215232962202, "grad_norm": 5.926168918609619, "loss": 3.904, "lr": 0.000553986013986014, "step": 3690, "tokens_trained": 1.813356456 }, { "epoch": 1.047088858946174, "grad_norm": 5.209751605987549, "loss": 3.9706, "lr": 0.0005537062937062938, "step": 3692, "tokens_trained": 1.81434056 }, { "epoch": 1.047656194596128, "grad_norm": 4.979823112487793, "loss": 3.972, "lr": 0.0005534265734265735, "step": 3694, "tokens_trained": 1.815319936 }, { "epoch": 1.0482235302460818, "grad_norm": 5.393070220947266, "loss": 3.9694, "lr": 0.0005531468531468532, "step": 3696, "tokens_trained": 1.816299016 }, { "epoch": 1.0487908658960357, "grad_norm": 3.27998423576355, "loss": 3.9706, "lr": 0.0005528671328671328, "step": 3698, "tokens_trained": 1.817284696 }, { "epoch": 1.0493582015459897, "grad_norm": 6.364100456237793, "loss": 3.9803, "lr": 0.0005525874125874126, "step": 3700, "tokens_trained": 1.818268736 }, { "epoch": 1.0499255371959435, "grad_norm": 6.063296794891357, "loss": 3.9761, "lr": 0.0005523076923076923, "step": 3702, "tokens_trained": 1.819255432 }, { "epoch": 1.0504928728458975, "grad_norm": 6.279892444610596, "loss": 3.9792, "lr": 0.0005520279720279721, "step": 3704, "tokens_trained": 1.820241704 }, { "epoch": 1.0510602084958514, "grad_norm": 3.804609537124634, "loss": 3.9763, "lr": 0.0005517482517482517, "step": 3706, "tokens_trained": 1.821226584 }, { "epoch": 1.0516275441458052, "grad_norm": 5.056581497192383, "loss": 3.9886, "lr": 0.0005514685314685315, "step": 3708, "tokens_trained": 1.822208432 }, { "epoch": 1.0521948797957592, "grad_norm": 2.052483081817627, "loss": 3.9485, "lr": 0.0005511888111888111, "step": 3710, "tokens_trained": 1.823195928 }, { "epoch": 1.052762215445713, "grad_norm": 6.076491832733154, "loss": 4.0132, "lr": 0.0005509090909090909, "step": 3712, "tokens_trained": 1.824178568 }, { "epoch": 1.053329551095667, "grad_norm": 7.526022434234619, "loss": 3.9478, "lr": 0.0005506293706293707, "step": 3714, "tokens_trained": 1.82516128 }, { "epoch": 1.0538968867456209, "grad_norm": 2.7086679935455322, "loss": 3.9913, "lr": 0.0005503496503496503, "step": 3716, "tokens_trained": 1.826142864 }, { "epoch": 1.0544642223955747, "grad_norm": 1.7643057107925415, "loss": 3.9813, "lr": 0.0005500699300699301, "step": 3718, "tokens_trained": 1.82712608 }, { "epoch": 1.0550315580455287, "grad_norm": 6.2813029289245605, "loss": 3.9772, "lr": 0.0005497902097902098, "step": 3720, "tokens_trained": 1.828107616 }, { "epoch": 1.0555988936954825, "grad_norm": 7.591973781585693, "loss": 3.938, "lr": 0.0005495104895104896, "step": 3722, "tokens_trained": 1.82909308 }, { "epoch": 1.0561662293454366, "grad_norm": 4.976797580718994, "loss": 3.9889, "lr": 0.0005492307692307692, "step": 3724, "tokens_trained": 1.830079168 }, { "epoch": 1.0567335649953904, "grad_norm": 5.417744159698486, "loss": 4.0039, "lr": 0.0005489510489510489, "step": 3726, "tokens_trained": 1.831062488 }, { "epoch": 1.0573009006453442, "grad_norm": 4.516066074371338, "loss": 3.9845, "lr": 0.0005486713286713286, "step": 3728, "tokens_trained": 1.832046528 }, { "epoch": 1.0578682362952982, "grad_norm": 3.677839756011963, "loss": 3.9446, "lr": 0.0005483916083916084, "step": 3730, "tokens_trained": 1.83303104 }, { "epoch": 1.058435571945252, "grad_norm": 5.22024393081665, "loss": 3.9746, "lr": 0.0005481118881118882, "step": 3732, "tokens_trained": 1.834017736 }, { "epoch": 1.059002907595206, "grad_norm": 7.4156060218811035, "loss": 3.9898, "lr": 0.0005478321678321678, "step": 3734, "tokens_trained": 1.8349996 }, { "epoch": 1.05957024324516, "grad_norm": 3.472533702850342, "loss": 3.9558, "lr": 0.0005475524475524476, "step": 3736, "tokens_trained": 1.835979152 }, { "epoch": 1.0601375788951137, "grad_norm": 2.4360055923461914, "loss": 3.9627, "lr": 0.0005472727272727273, "step": 3738, "tokens_trained": 1.836963416 }, { "epoch": 1.0607049145450678, "grad_norm": 4.8988728523254395, "loss": 3.9492, "lr": 0.000546993006993007, "step": 3740, "tokens_trained": 1.83794088 }, { "epoch": 1.0612722501950216, "grad_norm": 5.711161136627197, "loss": 4.002, "lr": 0.0005467132867132867, "step": 3742, "tokens_trained": 1.838924456 }, { "epoch": 1.0618395858449756, "grad_norm": 4.373830318450928, "loss": 3.9811, "lr": 0.0005464335664335664, "step": 3744, "tokens_trained": 1.839902072 }, { "epoch": 1.0624069214949294, "grad_norm": 3.2446751594543457, "loss": 3.9551, "lr": 0.0005461538461538461, "step": 3746, "tokens_trained": 1.840882688 }, { "epoch": 1.0629742571448832, "grad_norm": 3.3250389099121094, "loss": 3.9556, "lr": 0.0005458741258741259, "step": 3748, "tokens_trained": 1.841863816 }, { "epoch": 1.0635415927948373, "grad_norm": 7.377841949462891, "loss": 4.0118, "lr": 0.0005455944055944057, "step": 3750, "tokens_trained": 1.842844072 }, { "epoch": 1.0635415927948373, "eval_loss": 0.994845449924469, "eval_runtime": 20.2191, "step": 3750, "tokens_trained": 1.842844072 }, { "epoch": 1.064108928444791, "grad_norm": 3.671860694885254, "loss": 3.9439, "lr": 0.0005453146853146853, "step": 3752, "tokens_trained": 1.843832472 }, { "epoch": 1.0646762640947451, "grad_norm": 3.7120800018310547, "loss": 3.9992, "lr": 0.000545034965034965, "step": 3754, "tokens_trained": 1.84481192 }, { "epoch": 1.065243599744699, "grad_norm": 6.560836315155029, "loss": 3.9594, "lr": 0.0005447552447552448, "step": 3756, "tokens_trained": 1.84579436 }, { "epoch": 1.0658109353946528, "grad_norm": 1.7166560888290405, "loss": 3.9656, "lr": 0.0005444755244755245, "step": 3758, "tokens_trained": 1.84678316 }, { "epoch": 1.0663782710446068, "grad_norm": 5.579006671905518, "loss": 4.0034, "lr": 0.0005441958041958042, "step": 3760, "tokens_trained": 1.847770488 }, { "epoch": 1.0669456066945606, "grad_norm": 3.6601710319519043, "loss": 3.9346, "lr": 0.0005439160839160839, "step": 3762, "tokens_trained": 1.848747488 }, { "epoch": 1.0675129423445147, "grad_norm": 1.2449930906295776, "loss": 3.9493, "lr": 0.0005436363636363635, "step": 3764, "tokens_trained": 1.849727168 }, { "epoch": 1.0680802779944685, "grad_norm": 5.6108479499816895, "loss": 3.9527, "lr": 0.0005433566433566434, "step": 3766, "tokens_trained": 1.85070748 }, { "epoch": 1.0686476136444223, "grad_norm": 7.556972980499268, "loss": 3.9465, "lr": 0.0005430769230769231, "step": 3768, "tokens_trained": 1.851693328 }, { "epoch": 1.0692149492943763, "grad_norm": 3.7439489364624023, "loss": 3.964, "lr": 0.0005427972027972028, "step": 3770, "tokens_trained": 1.852674992 }, { "epoch": 1.0697822849443301, "grad_norm": 4.162338733673096, "loss": 3.969, "lr": 0.0005425174825174825, "step": 3772, "tokens_trained": 1.853659048 }, { "epoch": 1.0703496205942842, "grad_norm": 3.8950648307800293, "loss": 3.9691, "lr": 0.0005422377622377623, "step": 3774, "tokens_trained": 1.854644728 }, { "epoch": 1.070916956244238, "grad_norm": 4.361495018005371, "loss": 3.9437, "lr": 0.000541958041958042, "step": 3776, "tokens_trained": 1.855626632 }, { "epoch": 1.0714842918941918, "grad_norm": 3.5286366939544678, "loss": 3.9831, "lr": 0.0005416783216783216, "step": 3778, "tokens_trained": 1.856606216 }, { "epoch": 1.0720516275441458, "grad_norm": 4.972531795501709, "loss": 4.0222, "lr": 0.0005413986013986014, "step": 3780, "tokens_trained": 1.857590816 }, { "epoch": 1.0726189631940997, "grad_norm": 9.155055046081543, "loss": 3.9442, "lr": 0.000541118881118881, "step": 3782, "tokens_trained": 1.85857288 }, { "epoch": 1.0731862988440537, "grad_norm": 1.4077136516571045, "loss": 3.9806, "lr": 0.0005408391608391609, "step": 3784, "tokens_trained": 1.859555224 }, { "epoch": 1.0737536344940075, "grad_norm": 3.204779863357544, "loss": 3.9506, "lr": 0.0005405594405594406, "step": 3786, "tokens_trained": 1.860538984 }, { "epoch": 1.0743209701439613, "grad_norm": 3.988658905029297, "loss": 4.0025, "lr": 0.0005402797202797203, "step": 3788, "tokens_trained": 1.861522976 }, { "epoch": 1.0748883057939154, "grad_norm": 3.0060372352600098, "loss": 3.9308, "lr": 0.00054, "step": 3790, "tokens_trained": 1.86250564 }, { "epoch": 1.0754556414438692, "grad_norm": 2.494147777557373, "loss": 4.0116, "lr": 0.0005397202797202798, "step": 3792, "tokens_trained": 1.863491248 }, { "epoch": 1.0760229770938232, "grad_norm": 5.260354518890381, "loss": 3.9917, "lr": 0.0005394405594405595, "step": 3794, "tokens_trained": 1.864474808 }, { "epoch": 1.076590312743777, "grad_norm": 4.43446159362793, "loss": 3.9698, "lr": 0.0005391608391608391, "step": 3796, "tokens_trained": 1.865457608 }, { "epoch": 1.0771576483937308, "grad_norm": 5.485021114349365, "loss": 3.9494, "lr": 0.0005388811188811189, "step": 3798, "tokens_trained": 1.866439336 }, { "epoch": 1.0777249840436849, "grad_norm": 5.432106971740723, "loss": 3.9749, "lr": 0.0005386013986013985, "step": 3800, "tokens_trained": 1.867422432 }, { "epoch": 1.0782923196936387, "grad_norm": 5.726179122924805, "loss": 3.9524, "lr": 0.0005383216783216784, "step": 3802, "tokens_trained": 1.868404976 }, { "epoch": 1.0788596553435927, "grad_norm": 7.2211594581604, "loss": 3.954, "lr": 0.0005380419580419581, "step": 3804, "tokens_trained": 1.869387272 }, { "epoch": 1.0794269909935466, "grad_norm": 3.6406068801879883, "loss": 4.0125, "lr": 0.0005377622377622377, "step": 3806, "tokens_trained": 1.870371664 }, { "epoch": 1.0799943266435004, "grad_norm": 7.254781723022461, "loss": 3.9535, "lr": 0.0005374825174825175, "step": 3808, "tokens_trained": 1.87135524 }, { "epoch": 1.0805616622934544, "grad_norm": 7.8573079109191895, "loss": 4.0054, "lr": 0.0005372027972027972, "step": 3810, "tokens_trained": 1.872337216 }, { "epoch": 1.0811289979434082, "grad_norm": 1.049710988998413, "loss": 3.9541, "lr": 0.000536923076923077, "step": 3812, "tokens_trained": 1.873317672 }, { "epoch": 1.0816963335933623, "grad_norm": 7.515570163726807, "loss": 3.9466, "lr": 0.0005366433566433566, "step": 3814, "tokens_trained": 1.874299184 }, { "epoch": 1.082263669243316, "grad_norm": 6.041797637939453, "loss": 3.9508, "lr": 0.0005363636363636364, "step": 3816, "tokens_trained": 1.875282768 }, { "epoch": 1.0828310048932699, "grad_norm": 2.9910285472869873, "loss": 3.9368, "lr": 0.000536083916083916, "step": 3818, "tokens_trained": 1.876264312 }, { "epoch": 1.083398340543224, "grad_norm": 3.5802299976348877, "loss": 3.9661, "lr": 0.0005358041958041959, "step": 3820, "tokens_trained": 1.877245472 }, { "epoch": 1.0839656761931777, "grad_norm": 6.078779697418213, "loss": 3.9758, "lr": 0.0005355244755244756, "step": 3822, "tokens_trained": 1.87822768 }, { "epoch": 1.0845330118431318, "grad_norm": 6.143925189971924, "loss": 3.947, "lr": 0.0005352447552447552, "step": 3824, "tokens_trained": 1.879209824 }, { "epoch": 1.0851003474930856, "grad_norm": 4.272439002990723, "loss": 4.0284, "lr": 0.000534965034965035, "step": 3826, "tokens_trained": 1.88019528 }, { "epoch": 1.0856676831430394, "grad_norm": 7.169465065002441, "loss": 3.9651, "lr": 0.0005346853146853147, "step": 3828, "tokens_trained": 1.88117776 }, { "epoch": 1.0862350187929934, "grad_norm": 6.489839553833008, "loss": 3.9505, "lr": 0.0005344055944055945, "step": 3830, "tokens_trained": 1.88216468 }, { "epoch": 1.0868023544429473, "grad_norm": 2.966554880142212, "loss": 4.0406, "lr": 0.0005341258741258741, "step": 3832, "tokens_trained": 1.883147968 }, { "epoch": 1.0873696900929013, "grad_norm": 4.948841094970703, "loss": 3.9704, "lr": 0.0005338461538461538, "step": 3834, "tokens_trained": 1.884132176 }, { "epoch": 1.0879370257428551, "grad_norm": 7.666274547576904, "loss": 4.0082, "lr": 0.0005335664335664335, "step": 3836, "tokens_trained": 1.885119008 }, { "epoch": 1.088504361392809, "grad_norm": 12.454533576965332, "loss": 3.9702, "lr": 0.0005332867132867133, "step": 3838, "tokens_trained": 1.88610144 }, { "epoch": 1.089071697042763, "grad_norm": 4.42985725402832, "loss": 3.9601, "lr": 0.0005330069930069931, "step": 3840, "tokens_trained": 1.88708772 }, { "epoch": 1.0896390326927168, "grad_norm": 14.10716438293457, "loss": 3.9942, "lr": 0.0005327272727272727, "step": 3842, "tokens_trained": 1.888068192 }, { "epoch": 1.0902063683426708, "grad_norm": 6.3290910720825195, "loss": 3.9218, "lr": 0.0005324475524475525, "step": 3844, "tokens_trained": 1.88905572 }, { "epoch": 1.0907737039926246, "grad_norm": 6.61427640914917, "loss": 4.0173, "lr": 0.0005321678321678322, "step": 3846, "tokens_trained": 1.890040152 }, { "epoch": 1.0913410396425784, "grad_norm": 6.868432998657227, "loss": 3.9553, "lr": 0.000531888111888112, "step": 3848, "tokens_trained": 1.891031024 }, { "epoch": 1.0919083752925325, "grad_norm": 4.057258129119873, "loss": 3.9839, "lr": 0.0005316083916083916, "step": 3850, "tokens_trained": 1.892009904 }, { "epoch": 1.0924757109424863, "grad_norm": 3.5418479442596436, "loss": 3.9839, "lr": 0.0005313286713286713, "step": 3852, "tokens_trained": 1.892993976 }, { "epoch": 1.0930430465924403, "grad_norm": 1.231491208076477, "loss": 3.9549, "lr": 0.000531048951048951, "step": 3854, "tokens_trained": 1.893972744 }, { "epoch": 1.0936103822423942, "grad_norm": 4.056438446044922, "loss": 3.9512, "lr": 0.0005307692307692308, "step": 3856, "tokens_trained": 1.894954248 }, { "epoch": 1.094177717892348, "grad_norm": 2.9252607822418213, "loss": 3.9201, "lr": 0.0005304895104895106, "step": 3858, "tokens_trained": 1.895938816 }, { "epoch": 1.094745053542302, "grad_norm": 3.035308599472046, "loss": 3.9367, "lr": 0.0005302097902097902, "step": 3860, "tokens_trained": 1.896920832 }, { "epoch": 1.0953123891922558, "grad_norm": 2.2526092529296875, "loss": 3.9554, "lr": 0.0005299300699300699, "step": 3862, "tokens_trained": 1.897903216 }, { "epoch": 1.0958797248422099, "grad_norm": 2.882819175720215, "loss": 3.926, "lr": 0.0005296503496503497, "step": 3864, "tokens_trained": 1.898886632 }, { "epoch": 1.0964470604921637, "grad_norm": 7.817485809326172, "loss": 3.9583, "lr": 0.0005293706293706294, "step": 3866, "tokens_trained": 1.899872128 }, { "epoch": 1.0970143961421175, "grad_norm": 8.241719245910645, "loss": 3.9391, "lr": 0.0005290909090909091, "step": 3868, "tokens_trained": 1.900856544 }, { "epoch": 1.0975817317920715, "grad_norm": 4.160614013671875, "loss": 3.9285, "lr": 0.0005288111888111888, "step": 3870, "tokens_trained": 1.901838952 }, { "epoch": 1.0981490674420253, "grad_norm": 3.527678966522217, "loss": 3.9593, "lr": 0.0005285314685314684, "step": 3872, "tokens_trained": 1.902823024 }, { "epoch": 1.0987164030919794, "grad_norm": 5.290194511413574, "loss": 3.9357, "lr": 0.0005282517482517483, "step": 3874, "tokens_trained": 1.903803456 }, { "epoch": 1.0990000709169563, "eval_loss": 0.9935861229896545, "eval_runtime": 20.2396, "step": 3875, "tokens_trained": 1.904295504 }, { "epoch": 1.0992837387419332, "grad_norm": 5.472379207611084, "loss": 4.0255, "lr": 0.000527972027972028, "step": 3876, "tokens_trained": 1.904786344 }, { "epoch": 1.099851074391887, "grad_norm": 6.999550819396973, "loss": 3.9523, "lr": 0.0005276923076923077, "step": 3878, "tokens_trained": 1.90576952 }, { "epoch": 1.100418410041841, "grad_norm": 3.3077871799468994, "loss": 3.9452, "lr": 0.0005274125874125874, "step": 3880, "tokens_trained": 1.906745784 }, { "epoch": 1.1009857456917949, "grad_norm": 4.513088226318359, "loss": 3.9687, "lr": 0.0005271328671328672, "step": 3882, "tokens_trained": 1.907734576 }, { "epoch": 1.101553081341749, "grad_norm": 8.249629020690918, "loss": 3.9445, "lr": 0.0005268531468531469, "step": 3884, "tokens_trained": 1.908716328 }, { "epoch": 1.1021204169917027, "grad_norm": 8.281685829162598, "loss": 3.9906, "lr": 0.0005265734265734266, "step": 3886, "tokens_trained": 1.909702984 }, { "epoch": 1.1026877526416565, "grad_norm": 6.521668910980225, "loss": 3.9971, "lr": 0.0005262937062937063, "step": 3888, "tokens_trained": 1.91068828 }, { "epoch": 1.1032550882916106, "grad_norm": 6.442141056060791, "loss": 3.9769, "lr": 0.0005260139860139859, "step": 3890, "tokens_trained": 1.911668976 }, { "epoch": 1.1038224239415644, "grad_norm": 11.120711326599121, "loss": 3.9455, "lr": 0.0005257342657342658, "step": 3892, "tokens_trained": 1.912650176 }, { "epoch": 1.1043897595915184, "grad_norm": 2.695085048675537, "loss": 3.984, "lr": 0.0005254545454545455, "step": 3894, "tokens_trained": 1.913624832 }, { "epoch": 1.1049570952414722, "grad_norm": 16.994462966918945, "loss": 3.968, "lr": 0.0005251748251748252, "step": 3896, "tokens_trained": 1.914609128 }, { "epoch": 1.105524430891426, "grad_norm": 5.866199016571045, "loss": 3.9157, "lr": 0.0005248951048951049, "step": 3898, "tokens_trained": 1.91559088 }, { "epoch": 1.10609176654138, "grad_norm": 8.222938537597656, "loss": 3.9516, "lr": 0.0005246153846153847, "step": 3900, "tokens_trained": 1.916575752 }, { "epoch": 1.106659102191334, "grad_norm": 6.4162774085998535, "loss": 3.9761, "lr": 0.0005243356643356644, "step": 3902, "tokens_trained": 1.9175578 }, { "epoch": 1.107226437841288, "grad_norm": 5.338213920593262, "loss": 3.9804, "lr": 0.000524055944055944, "step": 3904, "tokens_trained": 1.918538192 }, { "epoch": 1.1077937734912418, "grad_norm": 6.3608927726745605, "loss": 3.9675, "lr": 0.0005237762237762238, "step": 3906, "tokens_trained": 1.9195184 }, { "epoch": 1.1083611091411956, "grad_norm": 6.1585845947265625, "loss": 3.9385, "lr": 0.0005234965034965034, "step": 3908, "tokens_trained": 1.920498704 }, { "epoch": 1.1089284447911496, "grad_norm": 5.266563415527344, "loss": 4.0169, "lr": 0.0005232167832167833, "step": 3910, "tokens_trained": 1.921477824 }, { "epoch": 1.1094957804411034, "grad_norm": 3.5322930812835693, "loss": 3.9734, "lr": 0.000522937062937063, "step": 3912, "tokens_trained": 1.922456704 }, { "epoch": 1.1100631160910575, "grad_norm": 3.8564069271087646, "loss": 3.9873, "lr": 0.0005226573426573427, "step": 3914, "tokens_trained": 1.92343992 }, { "epoch": 1.1106304517410113, "grad_norm": 3.9069607257843018, "loss": 3.9892, "lr": 0.0005223776223776224, "step": 3916, "tokens_trained": 1.924424576 }, { "epoch": 1.111197787390965, "grad_norm": 6.195169925689697, "loss": 3.9489, "lr": 0.0005220979020979021, "step": 3918, "tokens_trained": 1.92540764 }, { "epoch": 1.1117651230409191, "grad_norm": 4.950653076171875, "loss": 3.9561, "lr": 0.0005218181818181819, "step": 3920, "tokens_trained": 1.926386144 }, { "epoch": 1.112332458690873, "grad_norm": 4.923401832580566, "loss": 3.991, "lr": 0.0005215384615384615, "step": 3922, "tokens_trained": 1.92736516 }, { "epoch": 1.112899794340827, "grad_norm": 4.2394561767578125, "loss": 3.9445, "lr": 0.0005212587412587413, "step": 3924, "tokens_trained": 1.928350608 }, { "epoch": 1.1134671299907808, "grad_norm": 3.4303910732269287, "loss": 3.9871, "lr": 0.0005209790209790209, "step": 3926, "tokens_trained": 1.929333008 }, { "epoch": 1.1140344656407346, "grad_norm": 6.241591453552246, "loss": 3.9799, "lr": 0.0005206993006993008, "step": 3928, "tokens_trained": 1.930315616 }, { "epoch": 1.1146018012906886, "grad_norm": 5.21243143081665, "loss": 3.9624, "lr": 0.0005204195804195805, "step": 3930, "tokens_trained": 1.931298192 }, { "epoch": 1.1151691369406425, "grad_norm": 7.095268249511719, "loss": 3.9263, "lr": 0.0005201398601398601, "step": 3932, "tokens_trained": 1.93228248 }, { "epoch": 1.1157364725905965, "grad_norm": 9.025245666503906, "loss": 4.0058, "lr": 0.0005198601398601399, "step": 3934, "tokens_trained": 1.93326592 }, { "epoch": 1.1163038082405503, "grad_norm": 3.9758048057556152, "loss": 3.9299, "lr": 0.0005195804195804196, "step": 3936, "tokens_trained": 1.93424888 }, { "epoch": 1.1168711438905041, "grad_norm": 9.68726634979248, "loss": 3.9433, "lr": 0.0005193006993006994, "step": 3938, "tokens_trained": 1.935231688 }, { "epoch": 1.1174384795404582, "grad_norm": 7.5478901863098145, "loss": 4.0053, "lr": 0.000519020979020979, "step": 3940, "tokens_trained": 1.936216832 }, { "epoch": 1.118005815190412, "grad_norm": 6.016645431518555, "loss": 3.9481, "lr": 0.0005187412587412588, "step": 3942, "tokens_trained": 1.937196632 }, { "epoch": 1.118573150840366, "grad_norm": 7.313266277313232, "loss": 3.9539, "lr": 0.0005184615384615384, "step": 3944, "tokens_trained": 1.938180424 }, { "epoch": 1.1191404864903198, "grad_norm": 4.228805065155029, "loss": 3.9528, "lr": 0.0005181818181818182, "step": 3946, "tokens_trained": 1.939165376 }, { "epoch": 1.1197078221402736, "grad_norm": 1.2050669193267822, "loss": 3.9699, "lr": 0.000517902097902098, "step": 3948, "tokens_trained": 1.940146184 }, { "epoch": 1.1202751577902277, "grad_norm": 4.581719875335693, "loss": 3.9346, "lr": 0.0005176223776223776, "step": 3950, "tokens_trained": 1.941130648 }, { "epoch": 1.1208424934401815, "grad_norm": 9.381650924682617, "loss": 3.9294, "lr": 0.0005173426573426574, "step": 3952, "tokens_trained": 1.94210952 }, { "epoch": 1.1214098290901355, "grad_norm": 5.3781585693359375, "loss": 3.9208, "lr": 0.000517062937062937, "step": 3954, "tokens_trained": 1.943096344 }, { "epoch": 1.1219771647400893, "grad_norm": 4.263558387756348, "loss": 3.9492, "lr": 0.0005167832167832169, "step": 3956, "tokens_trained": 1.94407804 }, { "epoch": 1.1225445003900432, "grad_norm": 5.920651435852051, "loss": 3.8951, "lr": 0.0005165034965034965, "step": 3958, "tokens_trained": 1.94506156 }, { "epoch": 1.1231118360399972, "grad_norm": 7.0110344886779785, "loss": 3.9329, "lr": 0.0005162237762237762, "step": 3960, "tokens_trained": 1.946040072 }, { "epoch": 1.123679171689951, "grad_norm": 4.611392021179199, "loss": 3.9094, "lr": 0.0005159440559440559, "step": 3962, "tokens_trained": 1.947023256 }, { "epoch": 1.124246507339905, "grad_norm": 5.340510845184326, "loss": 3.9552, "lr": 0.0005156643356643357, "step": 3964, "tokens_trained": 1.948006848 }, { "epoch": 1.1248138429898589, "grad_norm": 5.190691947937012, "loss": 3.956, "lr": 0.0005153846153846154, "step": 3966, "tokens_trained": 1.948991632 }, { "epoch": 1.1253811786398127, "grad_norm": 5.612351894378662, "loss": 3.9861, "lr": 0.0005151048951048951, "step": 3968, "tokens_trained": 1.949975704 }, { "epoch": 1.1259485142897667, "grad_norm": 6.097261428833008, "loss": 3.9867, "lr": 0.0005148251748251748, "step": 3970, "tokens_trained": 1.950957944 }, { "epoch": 1.1265158499397205, "grad_norm": 4.194180965423584, "loss": 3.9242, "lr": 0.0005145454545454545, "step": 3972, "tokens_trained": 1.9519416 }, { "epoch": 1.1270831855896746, "grad_norm": 4.118505477905273, "loss": 3.9553, "lr": 0.0005142657342657343, "step": 3974, "tokens_trained": 1.95292252 }, { "epoch": 1.1276505212396284, "grad_norm": 5.10177755355835, "loss": 3.9653, "lr": 0.000513986013986014, "step": 3976, "tokens_trained": 1.953902792 }, { "epoch": 1.1282178568895822, "grad_norm": 5.665530204772949, "loss": 3.916, "lr": 0.0005137062937062937, "step": 3978, "tokens_trained": 1.954888184 }, { "epoch": 1.1287851925395362, "grad_norm": 4.1443963050842285, "loss": 3.9254, "lr": 0.0005134265734265734, "step": 3980, "tokens_trained": 1.955868688 }, { "epoch": 1.12935252818949, "grad_norm": 2.4941980838775635, "loss": 3.9502, "lr": 0.0005131468531468532, "step": 3982, "tokens_trained": 1.956852472 }, { "epoch": 1.129919863839444, "grad_norm": 3.85143780708313, "loss": 3.8926, "lr": 0.0005128671328671328, "step": 3984, "tokens_trained": 1.957835808 }, { "epoch": 1.130487199489398, "grad_norm": 5.975537300109863, "loss": 3.9926, "lr": 0.0005125874125874126, "step": 3986, "tokens_trained": 1.958816736 }, { "epoch": 1.1310545351393517, "grad_norm": 6.722855567932129, "loss": 3.986, "lr": 0.0005123076923076923, "step": 3988, "tokens_trained": 1.9598008 }, { "epoch": 1.1316218707893058, "grad_norm": 3.1752729415893555, "loss": 3.9343, "lr": 0.000512027972027972, "step": 3990, "tokens_trained": 1.960783816 }, { "epoch": 1.1321892064392596, "grad_norm": 3.669602394104004, "loss": 3.9746, "lr": 0.0005117482517482518, "step": 3992, "tokens_trained": 1.96176816 }, { "epoch": 1.1327565420892136, "grad_norm": 7.3116326332092285, "loss": 3.9829, "lr": 0.0005114685314685315, "step": 3994, "tokens_trained": 1.962752696 }, { "epoch": 1.1333238777391674, "grad_norm": 5.816486358642578, "loss": 3.9617, "lr": 0.0005111888111888112, "step": 3996, "tokens_trained": 1.96373432 }, { "epoch": 1.1338912133891212, "grad_norm": 2.3524768352508545, "loss": 3.929, "lr": 0.0005109090909090908, "step": 3998, "tokens_trained": 1.964713416 }, { "epoch": 1.1344585490390753, "grad_norm": 4.908108711242676, "loss": 3.9741, "lr": 0.0005106293706293707, "step": 4000, "tokens_trained": 1.965692096 }, { "epoch": 1.1344585490390753, "eval_loss": 0.9912415146827698, "eval_runtime": 20.338, "step": 4000, "tokens_trained": 1.965692096 }, { "epoch": 1.135025884689029, "grad_norm": 4.395096778869629, "loss": 3.955, "lr": 0.0005103496503496503, "step": 4002, "tokens_trained": 1.966677008 }, { "epoch": 1.1355932203389831, "grad_norm": 3.2460927963256836, "loss": 3.9522, "lr": 0.0005100699300699301, "step": 4004, "tokens_trained": 1.967662208 }, { "epoch": 1.136160555988937, "grad_norm": 3.2880218029022217, "loss": 3.9111, "lr": 0.0005097902097902098, "step": 4006, "tokens_trained": 1.968642816 }, { "epoch": 1.1367278916388908, "grad_norm": 3.694084644317627, "loss": 3.9045, "lr": 0.0005095104895104895, "step": 4008, "tokens_trained": 1.969623616 }, { "epoch": 1.1372952272888448, "grad_norm": 2.690668821334839, "loss": 3.9534, "lr": 0.0005092307692307693, "step": 4010, "tokens_trained": 1.970607456 }, { "epoch": 1.1378625629387986, "grad_norm": 3.6751973628997803, "loss": 3.9979, "lr": 0.0005089510489510489, "step": 4012, "tokens_trained": 1.971587136 }, { "epoch": 1.1384298985887527, "grad_norm": 3.0805108547210693, "loss": 3.888, "lr": 0.0005086713286713287, "step": 4014, "tokens_trained": 1.972575152 }, { "epoch": 1.1389972342387065, "grad_norm": 5.386228084564209, "loss": 3.9586, "lr": 0.0005083916083916083, "step": 4016, "tokens_trained": 1.973563872 }, { "epoch": 1.1395645698886603, "grad_norm": 5.567631721496582, "loss": 3.9337, "lr": 0.0005081118881118882, "step": 4018, "tokens_trained": 1.97454444 }, { "epoch": 1.1401319055386143, "grad_norm": 5.159145355224609, "loss": 3.9311, "lr": 0.0005078321678321678, "step": 4020, "tokens_trained": 1.975528128 }, { "epoch": 1.1406992411885681, "grad_norm": 3.8111817836761475, "loss": 3.9542, "lr": 0.0005075524475524476, "step": 4022, "tokens_trained": 1.97651136 }, { "epoch": 1.1412665768385222, "grad_norm": 5.618584156036377, "loss": 3.9841, "lr": 0.0005072727272727273, "step": 4024, "tokens_trained": 1.97749408 }, { "epoch": 1.141833912488476, "grad_norm": 5.414000511169434, "loss": 3.9435, "lr": 0.0005069930069930069, "step": 4026, "tokens_trained": 1.978478936 }, { "epoch": 1.1424012481384298, "grad_norm": 7.3321661949157715, "loss": 3.962, "lr": 0.0005067132867132868, "step": 4028, "tokens_trained": 1.979462272 }, { "epoch": 1.1429685837883838, "grad_norm": 3.5029044151306152, "loss": 3.9399, "lr": 0.0005064335664335664, "step": 4030, "tokens_trained": 1.98044648 }, { "epoch": 1.1435359194383377, "grad_norm": 6.343649387359619, "loss": 3.9788, "lr": 0.0005061538461538462, "step": 4032, "tokens_trained": 1.981432816 }, { "epoch": 1.1441032550882917, "grad_norm": 8.250723838806152, "loss": 3.9025, "lr": 0.0005058741258741258, "step": 4034, "tokens_trained": 1.982413272 }, { "epoch": 1.1446705907382455, "grad_norm": 3.6089327335357666, "loss": 3.9855, "lr": 0.0005055944055944057, "step": 4036, "tokens_trained": 1.983396296 }, { "epoch": 1.1452379263881993, "grad_norm": 5.802486896514893, "loss": 3.9569, "lr": 0.0005053146853146853, "step": 4038, "tokens_trained": 1.984378296 }, { "epoch": 1.1458052620381534, "grad_norm": 6.48319673538208, "loss": 3.9423, "lr": 0.000505034965034965, "step": 4040, "tokens_trained": 1.985356768 }, { "epoch": 1.1463725976881072, "grad_norm": 2.9942495822906494, "loss": 3.9667, "lr": 0.0005047552447552448, "step": 4042, "tokens_trained": 1.98633836 }, { "epoch": 1.1469399333380612, "grad_norm": 1.4219609498977661, "loss": 3.9238, "lr": 0.0005044755244755244, "step": 4044, "tokens_trained": 1.98732128 }, { "epoch": 1.147507268988015, "grad_norm": 2.6950814723968506, "loss": 3.9829, "lr": 0.0005041958041958043, "step": 4046, "tokens_trained": 1.988304968 }, { "epoch": 1.1480746046379688, "grad_norm": 4.490326404571533, "loss": 3.9506, "lr": 0.0005039160839160839, "step": 4048, "tokens_trained": 1.989288848 }, { "epoch": 1.1486419402879229, "grad_norm": 7.026235580444336, "loss": 3.9374, "lr": 0.0005036363636363637, "step": 4050, "tokens_trained": 1.990270344 }, { "epoch": 1.1492092759378767, "grad_norm": 6.214878082275391, "loss": 3.9627, "lr": 0.0005033566433566433, "step": 4052, "tokens_trained": 1.991250424 }, { "epoch": 1.1497766115878307, "grad_norm": 4.663200855255127, "loss": 3.9631, "lr": 0.0005030769230769231, "step": 4054, "tokens_trained": 1.9922354 }, { "epoch": 1.1503439472377845, "grad_norm": 4.318966865539551, "loss": 4.0147, "lr": 0.0005027972027972028, "step": 4056, "tokens_trained": 1.993221056 }, { "epoch": 1.1509112828877384, "grad_norm": 5.912793159484863, "loss": 3.9639, "lr": 0.0005025174825174825, "step": 4058, "tokens_trained": 1.994207552 }, { "epoch": 1.1514786185376924, "grad_norm": 3.6957592964172363, "loss": 3.9253, "lr": 0.0005022377622377623, "step": 4060, "tokens_trained": 1.99519044 }, { "epoch": 1.1520459541876462, "grad_norm": 2.9899842739105225, "loss": 3.9874, "lr": 0.0005019580419580419, "step": 4062, "tokens_trained": 1.996177368 }, { "epoch": 1.1526132898376003, "grad_norm": 6.149812698364258, "loss": 3.9278, "lr": 0.0005016783216783218, "step": 4064, "tokens_trained": 1.997162248 }, { "epoch": 1.153180625487554, "grad_norm": 3.7720232009887695, "loss": 3.9526, "lr": 0.0005013986013986014, "step": 4066, "tokens_trained": 1.99815024 }, { "epoch": 1.1537479611375079, "grad_norm": 3.3968939781188965, "loss": 3.9522, "lr": 0.0005011188811188811, "step": 4068, "tokens_trained": 1.999129208 }, { "epoch": 1.154315296787462, "grad_norm": 7.051310062408447, "loss": 3.9545, "lr": 0.0005008391608391608, "step": 4070, "tokens_trained": 2.000111232 }, { "epoch": 1.1548826324374157, "grad_norm": 4.798380374908447, "loss": 3.9114, "lr": 0.0005005594405594406, "step": 4072, "tokens_trained": 2.001098352 }, { "epoch": 1.1554499680873698, "grad_norm": 7.5074992179870605, "loss": 3.9795, "lr": 0.0005002797202797203, "step": 4074, "tokens_trained": 2.002077616 }, { "epoch": 1.1560173037373236, "grad_norm": 3.944998025894165, "loss": 3.9208, "lr": 0.0005, "step": 4076, "tokens_trained": 2.003065976 }, { "epoch": 1.1565846393872774, "grad_norm": 9.103386878967285, "loss": 3.9577, "lr": 0.0004997202797202798, "step": 4078, "tokens_trained": 2.004046568 }, { "epoch": 1.1571519750372314, "grad_norm": 8.950857162475586, "loss": 3.9474, "lr": 0.0004994405594405594, "step": 4080, "tokens_trained": 2.005031288 }, { "epoch": 1.1577193106871853, "grad_norm": 6.812939643859863, "loss": 3.9995, "lr": 0.0004991608391608391, "step": 4082, "tokens_trained": 2.00601472 }, { "epoch": 1.1582866463371393, "grad_norm": 8.14719009399414, "loss": 3.9496, "lr": 0.0004988811188811189, "step": 4084, "tokens_trained": 2.006996416 }, { "epoch": 1.158853981987093, "grad_norm": 7.125198841094971, "loss": 3.9074, "lr": 0.0004986013986013986, "step": 4086, "tokens_trained": 2.007980248 }, { "epoch": 1.159421317637047, "grad_norm": 2.4099230766296387, "loss": 3.9675, "lr": 0.0004983216783216784, "step": 4088, "tokens_trained": 2.008964792 }, { "epoch": 1.159988653287001, "grad_norm": 3.9759979248046875, "loss": 3.9655, "lr": 0.0004980419580419581, "step": 4090, "tokens_trained": 2.009945552 }, { "epoch": 1.1605559889369548, "grad_norm": 5.3169264793396, "loss": 3.9856, "lr": 0.0004977622377622378, "step": 4092, "tokens_trained": 2.010931072 }, { "epoch": 1.1611233245869088, "grad_norm": 9.010540008544922, "loss": 3.9293, "lr": 0.0004974825174825175, "step": 4094, "tokens_trained": 2.011911712 }, { "epoch": 1.1616906602368626, "grad_norm": 5.83132266998291, "loss": 3.9725, "lr": 0.0004972027972027972, "step": 4096, "tokens_trained": 2.012895208 }, { "epoch": 1.1622579958868164, "grad_norm": 8.76009750366211, "loss": 3.9875, "lr": 0.0004969230769230769, "step": 4098, "tokens_trained": 2.013881768 }, { "epoch": 1.1628253315367705, "grad_norm": 4.634799480438232, "loss": 3.9478, "lr": 0.0004966433566433566, "step": 4100, "tokens_trained": 2.014862288 }, { "epoch": 1.1633926671867243, "grad_norm": 3.717115879058838, "loss": 3.9029, "lr": 0.0004963636363636364, "step": 4102, "tokens_trained": 2.015846344 }, { "epoch": 1.1639600028366783, "grad_norm": 5.467166423797607, "loss": 3.9561, "lr": 0.0004960839160839161, "step": 4104, "tokens_trained": 2.01682528 }, { "epoch": 1.1645273384866321, "grad_norm": 5.645481109619141, "loss": 3.9889, "lr": 0.0004958041958041959, "step": 4106, "tokens_trained": 2.017809272 }, { "epoch": 1.165094674136586, "grad_norm": 4.796457767486572, "loss": 3.9554, "lr": 0.0004955244755244756, "step": 4108, "tokens_trained": 2.018791344 }, { "epoch": 1.16566200978654, "grad_norm": 6.111627578735352, "loss": 3.9495, "lr": 0.0004952447552447552, "step": 4110, "tokens_trained": 2.019777776 }, { "epoch": 1.1662293454364938, "grad_norm": 4.132344722747803, "loss": 3.878, "lr": 0.000494965034965035, "step": 4112, "tokens_trained": 2.020760032 }, { "epoch": 1.1667966810864479, "grad_norm": 4.833931922912598, "loss": 3.9537, "lr": 0.0004946853146853147, "step": 4114, "tokens_trained": 2.021745984 }, { "epoch": 1.1673640167364017, "grad_norm": 5.027078628540039, "loss": 3.9359, "lr": 0.0004944055944055944, "step": 4116, "tokens_trained": 2.022724968 }, { "epoch": 1.1679313523863555, "grad_norm": 5.339116096496582, "loss": 3.9104, "lr": 0.0004941258741258741, "step": 4118, "tokens_trained": 2.023705248 }, { "epoch": 1.1684986880363095, "grad_norm": 5.1652607917785645, "loss": 3.9671, "lr": 0.0004938461538461538, "step": 4120, "tokens_trained": 2.024688648 }, { "epoch": 1.1690660236862633, "grad_norm": 4.289709568023682, "loss": 3.9315, "lr": 0.0004935664335664336, "step": 4122, "tokens_trained": 2.025667424 }, { "epoch": 1.1696333593362174, "grad_norm": 5.6946492195129395, "loss": 3.9498, "lr": 0.0004932867132867133, "step": 4124, "tokens_trained": 2.026647168 }, { "epoch": 1.1699170271611943, "eval_loss": 0.9880662560462952, "eval_runtime": 21.3984, "step": 4125, "tokens_trained": 2.027139168 }, { "epoch": 1.1702006949861712, "grad_norm": 3.798551082611084, "loss": 3.9244, "lr": 0.0004930069930069931, "step": 4126, "tokens_trained": 2.027631096 }, { "epoch": 1.170768030636125, "grad_norm": 3.644767999649048, "loss": 3.939, "lr": 0.0004927272727272727, "step": 4128, "tokens_trained": 2.028613776 }, { "epoch": 1.171335366286079, "grad_norm": 5.300503253936768, "loss": 3.9352, "lr": 0.0004924475524475525, "step": 4130, "tokens_trained": 2.0295936 }, { "epoch": 1.1719027019360329, "grad_norm": 4.033862590789795, "loss": 3.9805, "lr": 0.0004921678321678322, "step": 4132, "tokens_trained": 2.030575632 }, { "epoch": 1.172470037585987, "grad_norm": 3.5188965797424316, "loss": 3.979, "lr": 0.0004918881118881118, "step": 4134, "tokens_trained": 2.031559704 }, { "epoch": 1.1730373732359407, "grad_norm": 2.1571266651153564, "loss": 3.9798, "lr": 0.0004916083916083916, "step": 4136, "tokens_trained": 2.032544624 }, { "epoch": 1.1736047088858945, "grad_norm": 1.2364273071289062, "loss": 3.971, "lr": 0.0004913286713286713, "step": 4138, "tokens_trained": 2.033524816 }, { "epoch": 1.1741720445358486, "grad_norm": 2.3588576316833496, "loss": 3.9631, "lr": 0.0004910489510489511, "step": 4140, "tokens_trained": 2.034509784 }, { "epoch": 1.1747393801858024, "grad_norm": 1.2670316696166992, "loss": 3.9317, "lr": 0.0004907692307692308, "step": 4142, "tokens_trained": 2.035493456 }, { "epoch": 1.1753067158357564, "grad_norm": 3.2413010597229004, "loss": 3.9778, "lr": 0.0004904895104895106, "step": 4144, "tokens_trained": 2.03647368 }, { "epoch": 1.1758740514857102, "grad_norm": 4.079458713531494, "loss": 3.9715, "lr": 0.0004902097902097902, "step": 4146, "tokens_trained": 2.037452696 }, { "epoch": 1.176441387135664, "grad_norm": 2.3634743690490723, "loss": 3.9857, "lr": 0.00048993006993007, "step": 4148, "tokens_trained": 2.038437256 }, { "epoch": 1.177008722785618, "grad_norm": 1.7258849143981934, "loss": 3.9044, "lr": 0.0004896503496503497, "step": 4150, "tokens_trained": 2.039421224 }, { "epoch": 1.177576058435572, "grad_norm": 4.426620960235596, "loss": 3.9366, "lr": 0.0004893706293706293, "step": 4152, "tokens_trained": 2.040399768 }, { "epoch": 1.178143394085526, "grad_norm": 4.946300506591797, "loss": 3.8394, "lr": 0.0004890909090909091, "step": 4154, "tokens_trained": 2.041382744 }, { "epoch": 1.1787107297354797, "grad_norm": 7.814687252044678, "loss": 3.9504, "lr": 0.0004888111888111888, "step": 4156, "tokens_trained": 2.042364152 }, { "epoch": 1.1792780653854336, "grad_norm": 1.7227815389633179, "loss": 3.8821, "lr": 0.0004885314685314686, "step": 4158, "tokens_trained": 2.043344264 }, { "epoch": 1.1798454010353876, "grad_norm": 11.620087623596191, "loss": 3.9375, "lr": 0.0004882517482517483, "step": 4160, "tokens_trained": 2.04432976 }, { "epoch": 1.1804127366853414, "grad_norm": 11.146257400512695, "loss": 3.9933, "lr": 0.000487972027972028, "step": 4162, "tokens_trained": 2.0453136 }, { "epoch": 1.1809800723352954, "grad_norm": 9.995295524597168, "loss": 3.9977, "lr": 0.0004876923076923077, "step": 4164, "tokens_trained": 2.046294384 }, { "epoch": 1.1815474079852493, "grad_norm": 9.448521614074707, "loss": 3.8709, "lr": 0.00048741258741258743, "step": 4166, "tokens_trained": 2.047279192 }, { "epoch": 1.182114743635203, "grad_norm": 2.3229587078094482, "loss": 3.9194, "lr": 0.0004871328671328671, "step": 4168, "tokens_trained": 2.048260136 }, { "epoch": 1.1826820792851571, "grad_norm": 3.8930304050445557, "loss": 3.9447, "lr": 0.00048685314685314687, "step": 4170, "tokens_trained": 2.049238496 }, { "epoch": 1.183249414935111, "grad_norm": 6.03069543838501, "loss": 3.9134, "lr": 0.00048657342657342656, "step": 4172, "tokens_trained": 2.050226352 }, { "epoch": 1.183816750585065, "grad_norm": 6.509665489196777, "loss": 3.9005, "lr": 0.0004862937062937063, "step": 4174, "tokens_trained": 2.05121248 }, { "epoch": 1.1843840862350188, "grad_norm": 2.0728557109832764, "loss": 3.9646, "lr": 0.000486013986013986, "step": 4176, "tokens_trained": 2.052196784 }, { "epoch": 1.1849514218849726, "grad_norm": 1.972641944885254, "loss": 3.9529, "lr": 0.0004857342657342658, "step": 4178, "tokens_trained": 2.053177512 }, { "epoch": 1.1855187575349266, "grad_norm": 6.664553165435791, "loss": 3.9424, "lr": 0.0004854545454545455, "step": 4180, "tokens_trained": 2.054159928 }, { "epoch": 1.1860860931848805, "grad_norm": 7.182534217834473, "loss": 3.9572, "lr": 0.00048517482517482517, "step": 4182, "tokens_trained": 2.05514288 }, { "epoch": 1.1866534288348345, "grad_norm": 3.3657350540161133, "loss": 3.9027, "lr": 0.0004848951048951049, "step": 4184, "tokens_trained": 2.056127256 }, { "epoch": 1.1872207644847883, "grad_norm": 3.8826489448547363, "loss": 3.9045, "lr": 0.0004846153846153846, "step": 4186, "tokens_trained": 2.057110184 }, { "epoch": 1.1877881001347421, "grad_norm": 3.4556474685668945, "loss": 3.9407, "lr": 0.00048433566433566435, "step": 4188, "tokens_trained": 2.058090016 }, { "epoch": 1.1883554357846962, "grad_norm": 5.431522846221924, "loss": 3.93, "lr": 0.00048405594405594404, "step": 4190, "tokens_trained": 2.059071208 }, { "epoch": 1.18892277143465, "grad_norm": 3.987600803375244, "loss": 3.9276, "lr": 0.0004837762237762238, "step": 4192, "tokens_trained": 2.060047448 }, { "epoch": 1.189490107084604, "grad_norm": 5.114170074462891, "loss": 3.9685, "lr": 0.0004834965034965035, "step": 4194, "tokens_trained": 2.0610266 }, { "epoch": 1.1900574427345578, "grad_norm": 3.948340654373169, "loss": 3.9357, "lr": 0.0004832167832167833, "step": 4196, "tokens_trained": 2.062014792 }, { "epoch": 1.1906247783845116, "grad_norm": 4.607158660888672, "loss": 3.9441, "lr": 0.00048293706293706297, "step": 4198, "tokens_trained": 2.062993768 }, { "epoch": 1.1911921140344657, "grad_norm": 2.860197067260742, "loss": 3.9469, "lr": 0.00048265734265734266, "step": 4200, "tokens_trained": 2.063974352 }, { "epoch": 1.1917594496844195, "grad_norm": 4.8133544921875, "loss": 3.9549, "lr": 0.0004823776223776224, "step": 4202, "tokens_trained": 2.064955 }, { "epoch": 1.1923267853343735, "grad_norm": 3.1824069023132324, "loss": 3.9589, "lr": 0.0004820979020979021, "step": 4204, "tokens_trained": 2.065938728 }, { "epoch": 1.1928941209843273, "grad_norm": 4.413929462432861, "loss": 3.9259, "lr": 0.00048181818181818184, "step": 4206, "tokens_trained": 2.066920408 }, { "epoch": 1.1934614566342812, "grad_norm": 4.193307876586914, "loss": 3.8911, "lr": 0.0004815384615384615, "step": 4208, "tokens_trained": 2.067904384 }, { "epoch": 1.1940287922842352, "grad_norm": 3.4476332664489746, "loss": 3.9646, "lr": 0.00048125874125874127, "step": 4210, "tokens_trained": 2.068888184 }, { "epoch": 1.194596127934189, "grad_norm": 1.2195734977722168, "loss": 3.9053, "lr": 0.00048097902097902096, "step": 4212, "tokens_trained": 2.069866408 }, { "epoch": 1.195163463584143, "grad_norm": 2.1013519763946533, "loss": 3.9806, "lr": 0.00048069930069930076, "step": 4214, "tokens_trained": 2.070848272 }, { "epoch": 1.1957307992340969, "grad_norm": 6.16254186630249, "loss": 3.99, "lr": 0.00048041958041958045, "step": 4216, "tokens_trained": 2.071833968 }, { "epoch": 1.1962981348840507, "grad_norm": 4.7692179679870605, "loss": 3.9775, "lr": 0.00048013986013986014, "step": 4218, "tokens_trained": 2.07281356 }, { "epoch": 1.1968654705340047, "grad_norm": 3.336514949798584, "loss": 4.0087, "lr": 0.0004798601398601399, "step": 4220, "tokens_trained": 2.07380172 }, { "epoch": 1.1974328061839585, "grad_norm": 3.2661092281341553, "loss": 3.9471, "lr": 0.0004795804195804196, "step": 4222, "tokens_trained": 2.074785216 }, { "epoch": 1.1980001418339126, "grad_norm": 3.0861871242523193, "loss": 3.9829, "lr": 0.0004793006993006993, "step": 4224, "tokens_trained": 2.075770912 }, { "epoch": 1.1985674774838664, "grad_norm": 4.010982036590576, "loss": 3.9013, "lr": 0.000479020979020979, "step": 4226, "tokens_trained": 2.076755104 }, { "epoch": 1.1991348131338202, "grad_norm": 3.736706495285034, "loss": 3.9455, "lr": 0.00047874125874125875, "step": 4228, "tokens_trained": 2.077737472 }, { "epoch": 1.1997021487837742, "grad_norm": 2.741546392440796, "loss": 3.929, "lr": 0.00047846153846153844, "step": 4230, "tokens_trained": 2.078721008 }, { "epoch": 1.200269484433728, "grad_norm": 5.045975685119629, "loss": 3.938, "lr": 0.00047818181818181824, "step": 4232, "tokens_trained": 2.079705624 }, { "epoch": 1.200836820083682, "grad_norm": 6.466317653656006, "loss": 3.9189, "lr": 0.00047790209790209793, "step": 4234, "tokens_trained": 2.080689632 }, { "epoch": 1.201404155733636, "grad_norm": 10.680752754211426, "loss": 3.924, "lr": 0.0004776223776223776, "step": 4236, "tokens_trained": 2.0816728 }, { "epoch": 1.2019714913835897, "grad_norm": 4.394003868103027, "loss": 3.9587, "lr": 0.00047734265734265737, "step": 4238, "tokens_trained": 2.082649352 }, { "epoch": 1.2025388270335438, "grad_norm": 14.375049591064453, "loss": 3.8901, "lr": 0.00047706293706293706, "step": 4240, "tokens_trained": 2.083629016 }, { "epoch": 1.2031061626834976, "grad_norm": 6.259925365447998, "loss": 3.9736, "lr": 0.0004767832167832168, "step": 4242, "tokens_trained": 2.084612464 }, { "epoch": 1.2036734983334516, "grad_norm": 7.176869869232178, "loss": 3.9335, "lr": 0.0004765034965034965, "step": 4244, "tokens_trained": 2.085598128 }, { "epoch": 1.2042408339834054, "grad_norm": 7.3431291580200195, "loss": 3.9129, "lr": 0.00047622377622377624, "step": 4246, "tokens_trained": 2.086582144 }, { "epoch": 1.2048081696333592, "grad_norm": 3.1388702392578125, "loss": 3.9645, "lr": 0.00047594405594405593, "step": 4248, "tokens_trained": 2.087566256 }, { "epoch": 1.2053755052833133, "grad_norm": 4.360974311828613, "loss": 3.8965, "lr": 0.00047566433566433573, "step": 4250, "tokens_trained": 2.088546896 }, { "epoch": 1.2053755052833133, "eval_loss": 0.9876537919044495, "eval_runtime": 20.2375, "step": 4250, "tokens_trained": 2.088546896 }, { "epoch": 1.205942840933267, "grad_norm": 6.790876388549805, "loss": 3.8925, "lr": 0.0004753846153846154, "step": 4252, "tokens_trained": 2.089529312 }, { "epoch": 1.2065101765832211, "grad_norm": 5.942895412445068, "loss": 3.9429, "lr": 0.0004751048951048951, "step": 4254, "tokens_trained": 2.090517856 }, { "epoch": 1.207077512233175, "grad_norm": 7.182357311248779, "loss": 3.975, "lr": 0.00047482517482517485, "step": 4256, "tokens_trained": 2.091501152 }, { "epoch": 1.2076448478831288, "grad_norm": 3.092268228530884, "loss": 3.9078, "lr": 0.00047454545454545454, "step": 4258, "tokens_trained": 2.0924852 }, { "epoch": 1.2082121835330828, "grad_norm": 7.483865737915039, "loss": 3.9469, "lr": 0.0004742657342657343, "step": 4260, "tokens_trained": 2.093467328 }, { "epoch": 1.2087795191830366, "grad_norm": 6.828039169311523, "loss": 3.9683, "lr": 0.000473986013986014, "step": 4262, "tokens_trained": 2.094447 }, { "epoch": 1.2093468548329906, "grad_norm": 2.1174066066741943, "loss": 3.9575, "lr": 0.0004737062937062937, "step": 4264, "tokens_trained": 2.095428552 }, { "epoch": 1.2099141904829445, "grad_norm": 1.7029787302017212, "loss": 3.9174, "lr": 0.0004734265734265734, "step": 4266, "tokens_trained": 2.096413944 }, { "epoch": 1.2104815261328983, "grad_norm": 8.107586860656738, "loss": 3.9526, "lr": 0.0004731468531468531, "step": 4268, "tokens_trained": 2.097395416 }, { "epoch": 1.2110488617828523, "grad_norm": 6.090738773345947, "loss": 3.8711, "lr": 0.0004728671328671329, "step": 4270, "tokens_trained": 2.098379488 }, { "epoch": 1.2116161974328061, "grad_norm": 3.09671950340271, "loss": 3.9489, "lr": 0.0004725874125874126, "step": 4272, "tokens_trained": 2.099365672 }, { "epoch": 1.2121835330827602, "grad_norm": 1.3280375003814697, "loss": 3.8766, "lr": 0.00047230769230769234, "step": 4274, "tokens_trained": 2.100345872 }, { "epoch": 1.212750868732714, "grad_norm": 2.2725517749786377, "loss": 3.9298, "lr": 0.00047202797202797203, "step": 4276, "tokens_trained": 2.101330144 }, { "epoch": 1.2133182043826678, "grad_norm": 7.571750164031982, "loss": 3.9129, "lr": 0.00047174825174825177, "step": 4278, "tokens_trained": 2.102310504 }, { "epoch": 1.2138855400326218, "grad_norm": 5.49086856842041, "loss": 3.9257, "lr": 0.00047146853146853146, "step": 4280, "tokens_trained": 2.10329544 }, { "epoch": 1.2144528756825756, "grad_norm": 3.936779737472534, "loss": 3.9055, "lr": 0.0004711888111888112, "step": 4282, "tokens_trained": 2.104280736 }, { "epoch": 1.2150202113325297, "grad_norm": 3.1779263019561768, "loss": 3.9624, "lr": 0.0004709090909090909, "step": 4284, "tokens_trained": 2.10526688 }, { "epoch": 1.2155875469824835, "grad_norm": 2.7246220111846924, "loss": 3.9584, "lr": 0.0004706293706293706, "step": 4286, "tokens_trained": 2.106249208 }, { "epoch": 1.2161548826324373, "grad_norm": 6.718515396118164, "loss": 3.9084, "lr": 0.0004703496503496504, "step": 4288, "tokens_trained": 2.107231312 }, { "epoch": 1.2167222182823914, "grad_norm": 5.000235080718994, "loss": 3.9648, "lr": 0.0004700699300699301, "step": 4290, "tokens_trained": 2.108215624 }, { "epoch": 1.2172895539323452, "grad_norm": 4.756376266479492, "loss": 3.9848, "lr": 0.0004697902097902098, "step": 4292, "tokens_trained": 2.10920156 }, { "epoch": 1.2178568895822992, "grad_norm": 1.9365978240966797, "loss": 3.9517, "lr": 0.0004695104895104895, "step": 4294, "tokens_trained": 2.110182936 }, { "epoch": 1.218424225232253, "grad_norm": 5.350283622741699, "loss": 3.9737, "lr": 0.00046923076923076926, "step": 4296, "tokens_trained": 2.111164808 }, { "epoch": 1.2189915608822068, "grad_norm": 4.543917655944824, "loss": 3.9111, "lr": 0.00046895104895104895, "step": 4298, "tokens_trained": 2.112146848 }, { "epoch": 1.2195588965321609, "grad_norm": 5.1316938400268555, "loss": 3.9194, "lr": 0.0004686713286713287, "step": 4300, "tokens_trained": 2.113134184 }, { "epoch": 1.2201262321821147, "grad_norm": 3.0844085216522217, "loss": 3.8872, "lr": 0.0004683916083916084, "step": 4302, "tokens_trained": 2.114120832 }, { "epoch": 1.2206935678320687, "grad_norm": 2.2305877208709717, "loss": 3.9497, "lr": 0.00046811188811188807, "step": 4304, "tokens_trained": 2.115103856 }, { "epoch": 1.2212609034820225, "grad_norm": 1.7684617042541504, "loss": 3.9218, "lr": 0.00046783216783216787, "step": 4306, "tokens_trained": 2.116086968 }, { "epoch": 1.2218282391319764, "grad_norm": 6.3064680099487305, "loss": 3.9657, "lr": 0.00046755244755244756, "step": 4308, "tokens_trained": 2.11707108 }, { "epoch": 1.2223955747819304, "grad_norm": 2.4910192489624023, "loss": 3.8588, "lr": 0.0004672727272727273, "step": 4310, "tokens_trained": 2.118053928 }, { "epoch": 1.2229629104318842, "grad_norm": 3.482459306716919, "loss": 3.9213, "lr": 0.000466993006993007, "step": 4312, "tokens_trained": 2.119037056 }, { "epoch": 1.2235302460818382, "grad_norm": 6.552737712860107, "loss": 3.8804, "lr": 0.00046671328671328674, "step": 4314, "tokens_trained": 2.120019576 }, { "epoch": 1.224097581731792, "grad_norm": 5.225849628448486, "loss": 3.9562, "lr": 0.00046643356643356643, "step": 4316, "tokens_trained": 2.121000112 }, { "epoch": 1.2246649173817459, "grad_norm": 2.1894407272338867, "loss": 3.8752, "lr": 0.0004661538461538462, "step": 4318, "tokens_trained": 2.121988376 }, { "epoch": 1.2252322530317, "grad_norm": 1.5741831064224243, "loss": 3.953, "lr": 0.00046587412587412587, "step": 4320, "tokens_trained": 2.122965864 }, { "epoch": 1.2257995886816537, "grad_norm": 4.103208065032959, "loss": 3.9216, "lr": 0.00046559440559440556, "step": 4322, "tokens_trained": 2.123950848 }, { "epoch": 1.2263669243316078, "grad_norm": 7.347278118133545, "loss": 3.9547, "lr": 0.00046531468531468536, "step": 4324, "tokens_trained": 2.124933448 }, { "epoch": 1.2269342599815616, "grad_norm": 4.8083930015563965, "loss": 3.9711, "lr": 0.00046503496503496505, "step": 4326, "tokens_trained": 2.125921528 }, { "epoch": 1.2275015956315154, "grad_norm": 5.4488654136657715, "loss": 3.8941, "lr": 0.0004647552447552448, "step": 4328, "tokens_trained": 2.126897152 }, { "epoch": 1.2280689312814694, "grad_norm": 6.24332332611084, "loss": 3.9178, "lr": 0.0004644755244755245, "step": 4330, "tokens_trained": 2.127881384 }, { "epoch": 1.2286362669314232, "grad_norm": 5.97770881652832, "loss": 3.8804, "lr": 0.0004641958041958042, "step": 4332, "tokens_trained": 2.128864008 }, { "epoch": 1.2292036025813773, "grad_norm": 3.901036500930786, "loss": 3.8968, "lr": 0.0004639160839160839, "step": 4334, "tokens_trained": 2.129847632 }, { "epoch": 1.229770938231331, "grad_norm": 5.377021789550781, "loss": 3.9565, "lr": 0.00046363636363636366, "step": 4336, "tokens_trained": 2.130832296 }, { "epoch": 1.230338273881285, "grad_norm": 4.565158367156982, "loss": 3.9672, "lr": 0.00046335664335664335, "step": 4338, "tokens_trained": 2.131814648 }, { "epoch": 1.230905609531239, "grad_norm": 1.2882499694824219, "loss": 3.9515, "lr": 0.00046307692307692304, "step": 4340, "tokens_trained": 2.132797872 }, { "epoch": 1.2314729451811928, "grad_norm": 0.9845411777496338, "loss": 3.9057, "lr": 0.00046279720279720284, "step": 4342, "tokens_trained": 2.133780992 }, { "epoch": 1.2320402808311468, "grad_norm": 3.7839152812957764, "loss": 3.8909, "lr": 0.00046251748251748253, "step": 4344, "tokens_trained": 2.134762864 }, { "epoch": 1.2326076164811006, "grad_norm": 3.8872299194335938, "loss": 3.9262, "lr": 0.0004622377622377623, "step": 4346, "tokens_trained": 2.135743504 }, { "epoch": 1.2331749521310544, "grad_norm": 4.538093566894531, "loss": 3.9098, "lr": 0.00046195804195804196, "step": 4348, "tokens_trained": 2.136727288 }, { "epoch": 1.2337422877810085, "grad_norm": 6.453696250915527, "loss": 3.9103, "lr": 0.0004616783216783217, "step": 4350, "tokens_trained": 2.137710256 }, { "epoch": 1.2343096234309623, "grad_norm": 4.033708572387695, "loss": 3.9144, "lr": 0.0004613986013986014, "step": 4352, "tokens_trained": 2.138691568 }, { "epoch": 1.2348769590809163, "grad_norm": 4.32963752746582, "loss": 3.9154, "lr": 0.00046111888111888114, "step": 4354, "tokens_trained": 2.13967628 }, { "epoch": 1.2354442947308701, "grad_norm": 3.0617220401763916, "loss": 3.8984, "lr": 0.00046083916083916083, "step": 4356, "tokens_trained": 2.140659368 }, { "epoch": 1.236011630380824, "grad_norm": 2.51361346244812, "loss": 3.8971, "lr": 0.0004605594405594405, "step": 4358, "tokens_trained": 2.141644648 }, { "epoch": 1.236578966030778, "grad_norm": 3.6975977420806885, "loss": 3.9208, "lr": 0.0004602797202797203, "step": 4360, "tokens_trained": 2.142628176 }, { "epoch": 1.2371463016807318, "grad_norm": 5.2992844581604, "loss": 3.8855, "lr": 0.00046, "step": 4362, "tokens_trained": 2.143610328 }, { "epoch": 1.2377136373306858, "grad_norm": 4.426636695861816, "loss": 3.893, "lr": 0.00045972027972027976, "step": 4364, "tokens_trained": 2.144591512 }, { "epoch": 1.2382809729806397, "grad_norm": 4.131166458129883, "loss": 3.9098, "lr": 0.00045944055944055945, "step": 4366, "tokens_trained": 2.14557312 }, { "epoch": 1.2388483086305935, "grad_norm": 2.9156816005706787, "loss": 3.9771, "lr": 0.0004591608391608392, "step": 4368, "tokens_trained": 2.146551592 }, { "epoch": 1.2394156442805475, "grad_norm": 3.8412554264068604, "loss": 3.9584, "lr": 0.0004588811188811189, "step": 4370, "tokens_trained": 2.147533032 }, { "epoch": 1.2399829799305013, "grad_norm": 3.1897640228271484, "loss": 3.8253, "lr": 0.0004586013986013986, "step": 4372, "tokens_trained": 2.148517592 }, { "epoch": 1.2405503155804554, "grad_norm": 4.066483020782471, "loss": 3.8905, "lr": 0.0004583216783216783, "step": 4374, "tokens_trained": 2.149502368 }, { "epoch": 1.2408339834054323, "eval_loss": 0.9844964146614075, "eval_runtime": 21.0593, "step": 4375, "tokens_trained": 2.14999612 }, { "epoch": 1.2411176512304092, "grad_norm": 2.0596890449523926, "loss": 3.9142, "lr": 0.000458041958041958, "step": 4376, "tokens_trained": 2.15048712 }, { "epoch": 1.241684986880363, "grad_norm": 4.4018988609313965, "loss": 3.9487, "lr": 0.0004577622377622378, "step": 4378, "tokens_trained": 2.151468832 }, { "epoch": 1.242252322530317, "grad_norm": 3.294774055480957, "loss": 3.979, "lr": 0.0004574825174825175, "step": 4380, "tokens_trained": 2.152451456 }, { "epoch": 1.2428196581802708, "grad_norm": 2.5546209812164307, "loss": 3.9135, "lr": 0.00045720279720279724, "step": 4382, "tokens_trained": 2.1534348 }, { "epoch": 1.2433869938302249, "grad_norm": 2.1771605014801025, "loss": 3.9207, "lr": 0.00045692307692307693, "step": 4384, "tokens_trained": 2.154414104 }, { "epoch": 1.2439543294801787, "grad_norm": 3.5681049823760986, "loss": 3.8632, "lr": 0.0004566433566433567, "step": 4386, "tokens_trained": 2.155399088 }, { "epoch": 1.2445216651301325, "grad_norm": 5.588647365570068, "loss": 3.9769, "lr": 0.00045636363636363637, "step": 4388, "tokens_trained": 2.15638104 }, { "epoch": 1.2450890007800866, "grad_norm": 5.798253059387207, "loss": 3.9167, "lr": 0.00045608391608391606, "step": 4390, "tokens_trained": 2.157366296 }, { "epoch": 1.2456563364300404, "grad_norm": 2.425339698791504, "loss": 3.9152, "lr": 0.0004558041958041958, "step": 4392, "tokens_trained": 2.158347208 }, { "epoch": 1.2462236720799944, "grad_norm": 4.4874444007873535, "loss": 3.9171, "lr": 0.0004555244755244755, "step": 4394, "tokens_trained": 2.159329056 }, { "epoch": 1.2467910077299482, "grad_norm": 4.653798580169678, "loss": 3.9308, "lr": 0.00045524475524475524, "step": 4396, "tokens_trained": 2.160312792 }, { "epoch": 1.247358343379902, "grad_norm": 5.013849258422852, "loss": 3.9224, "lr": 0.000454965034965035, "step": 4398, "tokens_trained": 2.161298728 }, { "epoch": 1.247925679029856, "grad_norm": 3.3346633911132812, "loss": 3.9482, "lr": 0.0004546853146853147, "step": 4400, "tokens_trained": 2.162280664 }, { "epoch": 1.2484930146798099, "grad_norm": 2.408282518386841, "loss": 3.9468, "lr": 0.0004544055944055944, "step": 4402, "tokens_trained": 2.163262608 }, { "epoch": 1.249060350329764, "grad_norm": 2.3152034282684326, "loss": 3.9346, "lr": 0.00045412587412587416, "step": 4404, "tokens_trained": 2.16424488 }, { "epoch": 1.2496276859797177, "grad_norm": 4.722060680389404, "loss": 3.93, "lr": 0.00045384615384615385, "step": 4406, "tokens_trained": 2.165227184 }, { "epoch": 1.2501950216296716, "grad_norm": 2.3931281566619873, "loss": 3.9412, "lr": 0.00045356643356643354, "step": 4408, "tokens_trained": 2.166208312 }, { "epoch": 1.2507623572796256, "grad_norm": 3.703711986541748, "loss": 3.9661, "lr": 0.0004532867132867133, "step": 4410, "tokens_trained": 2.167191896 }, { "epoch": 1.2513296929295794, "grad_norm": 3.168426036834717, "loss": 3.9108, "lr": 0.000453006993006993, "step": 4412, "tokens_trained": 2.1681734 }, { "epoch": 1.2518970285795334, "grad_norm": 4.465419769287109, "loss": 3.9224, "lr": 0.0004527272727272727, "step": 4414, "tokens_trained": 2.16915824 }, { "epoch": 1.2524643642294873, "grad_norm": 3.145385265350342, "loss": 3.9317, "lr": 0.00045244755244755247, "step": 4416, "tokens_trained": 2.170140944 }, { "epoch": 1.253031699879441, "grad_norm": 3.0174384117126465, "loss": 3.9592, "lr": 0.0004521678321678322, "step": 4418, "tokens_trained": 2.171127312 }, { "epoch": 1.2535990355293951, "grad_norm": 2.9682352542877197, "loss": 3.9248, "lr": 0.0004518881118881119, "step": 4420, "tokens_trained": 2.17211552 }, { "epoch": 1.254166371179349, "grad_norm": 4.654287338256836, "loss": 3.9592, "lr": 0.00045160839160839165, "step": 4422, "tokens_trained": 2.173101456 }, { "epoch": 1.254733706829303, "grad_norm": 5.210162162780762, "loss": 3.9463, "lr": 0.00045132867132867134, "step": 4424, "tokens_trained": 2.174081192 }, { "epoch": 1.2553010424792568, "grad_norm": 1.6227176189422607, "loss": 3.8894, "lr": 0.000451048951048951, "step": 4426, "tokens_trained": 2.175063888 }, { "epoch": 1.2558683781292106, "grad_norm": 1.6847152709960938, "loss": 3.9207, "lr": 0.00045076923076923077, "step": 4428, "tokens_trained": 2.176047656 }, { "epoch": 1.2564357137791646, "grad_norm": 7.743977069854736, "loss": 3.9202, "lr": 0.00045048951048951046, "step": 4430, "tokens_trained": 2.177030728 }, { "epoch": 1.2570030494291184, "grad_norm": 5.493525981903076, "loss": 3.8951, "lr": 0.0004502097902097902, "step": 4432, "tokens_trained": 2.178010048 }, { "epoch": 1.2575703850790725, "grad_norm": 4.744298934936523, "loss": 3.9641, "lr": 0.00044993006993006995, "step": 4434, "tokens_trained": 2.178992816 }, { "epoch": 1.2581377207290263, "grad_norm": 5.230485916137695, "loss": 3.9552, "lr": 0.0004496503496503497, "step": 4436, "tokens_trained": 2.179977048 }, { "epoch": 1.2587050563789801, "grad_norm": 2.7955129146575928, "loss": 3.9462, "lr": 0.0004493706293706294, "step": 4438, "tokens_trained": 2.18096108 }, { "epoch": 1.2592723920289342, "grad_norm": 4.869340419769287, "loss": 3.8819, "lr": 0.00044909090909090913, "step": 4440, "tokens_trained": 2.181941176 }, { "epoch": 1.259839727678888, "grad_norm": 4.538938045501709, "loss": 3.8967, "lr": 0.0004488111888111888, "step": 4442, "tokens_trained": 2.182923032 }, { "epoch": 1.260407063328842, "grad_norm": 4.085853576660156, "loss": 3.9155, "lr": 0.0004485314685314685, "step": 4444, "tokens_trained": 2.183902584 }, { "epoch": 1.2609743989787958, "grad_norm": 6.15781831741333, "loss": 3.9379, "lr": 0.00044825174825174826, "step": 4446, "tokens_trained": 2.184884968 }, { "epoch": 1.2615417346287496, "grad_norm": 2.5738606452941895, "loss": 3.9642, "lr": 0.00044797202797202795, "step": 4448, "tokens_trained": 2.185870952 }, { "epoch": 1.2621090702787037, "grad_norm": 4.356530666351318, "loss": 3.8908, "lr": 0.0004476923076923077, "step": 4450, "tokens_trained": 2.186854928 }, { "epoch": 1.2626764059286575, "grad_norm": 5.518537998199463, "loss": 3.8954, "lr": 0.00044741258741258744, "step": 4452, "tokens_trained": 2.187847 }, { "epoch": 1.2632437415786115, "grad_norm": 7.3632354736328125, "loss": 3.9363, "lr": 0.0004471328671328672, "step": 4454, "tokens_trained": 2.188829592 }, { "epoch": 1.2638110772285653, "grad_norm": 0.9625980854034424, "loss": 3.9416, "lr": 0.00044685314685314687, "step": 4456, "tokens_trained": 2.189811456 }, { "epoch": 1.2643784128785192, "grad_norm": 4.0898003578186035, "loss": 3.9133, "lr": 0.0004465734265734266, "step": 4458, "tokens_trained": 2.19079428 }, { "epoch": 1.2649457485284732, "grad_norm": 6.740445137023926, "loss": 3.9282, "lr": 0.0004462937062937063, "step": 4460, "tokens_trained": 2.1917786 }, { "epoch": 1.265513084178427, "grad_norm": 6.742666244506836, "loss": 3.9077, "lr": 0.000446013986013986, "step": 4462, "tokens_trained": 2.192758016 }, { "epoch": 1.266080419828381, "grad_norm": 4.592698097229004, "loss": 3.9123, "lr": 0.00044573426573426574, "step": 4464, "tokens_trained": 2.193741496 }, { "epoch": 1.2666477554783349, "grad_norm": 8.934327125549316, "loss": 3.9647, "lr": 0.00044545454545454543, "step": 4466, "tokens_trained": 2.194723584 }, { "epoch": 1.2672150911282887, "grad_norm": 4.280580997467041, "loss": 3.9189, "lr": 0.0004451748251748252, "step": 4468, "tokens_trained": 2.195708432 }, { "epoch": 1.2677824267782427, "grad_norm": 3.257995843887329, "loss": 3.9698, "lr": 0.0004448951048951049, "step": 4470, "tokens_trained": 2.196691336 }, { "epoch": 1.2683497624281965, "grad_norm": 6.521494388580322, "loss": 3.9676, "lr": 0.00044461538461538466, "step": 4472, "tokens_trained": 2.197674528 }, { "epoch": 1.2689170980781506, "grad_norm": 6.169503211975098, "loss": 3.9404, "lr": 0.00044433566433566435, "step": 4474, "tokens_trained": 2.198658448 }, { "epoch": 1.2694844337281044, "grad_norm": 3.5009562969207764, "loss": 3.9229, "lr": 0.0004440559440559441, "step": 4476, "tokens_trained": 2.199646232 }, { "epoch": 1.2700517693780582, "grad_norm": 3.2101058959960938, "loss": 3.9536, "lr": 0.0004437762237762238, "step": 4478, "tokens_trained": 2.200630024 }, { "epoch": 1.2706191050280122, "grad_norm": 5.417990684509277, "loss": 3.9591, "lr": 0.0004434965034965035, "step": 4480, "tokens_trained": 2.2016182 }, { "epoch": 1.271186440677966, "grad_norm": 3.1346352100372314, "loss": 3.9408, "lr": 0.0004432167832167832, "step": 4482, "tokens_trained": 2.2025994 }, { "epoch": 1.27175377632792, "grad_norm": 3.2468717098236084, "loss": 3.922, "lr": 0.0004429370629370629, "step": 4484, "tokens_trained": 2.203581424 }, { "epoch": 1.272321111977874, "grad_norm": 5.069144248962402, "loss": 3.9616, "lr": 0.00044265734265734266, "step": 4486, "tokens_trained": 2.204562264 }, { "epoch": 1.2728884476278277, "grad_norm": 4.097993850708008, "loss": 3.931, "lr": 0.0004423776223776224, "step": 4488, "tokens_trained": 2.205548376 }, { "epoch": 1.2734557832777817, "grad_norm": 2.3711421489715576, "loss": 3.9201, "lr": 0.00044209790209790215, "step": 4490, "tokens_trained": 2.206535208 }, { "epoch": 1.2740231189277356, "grad_norm": 7.32819938659668, "loss": 3.8766, "lr": 0.00044181818181818184, "step": 4492, "tokens_trained": 2.207522192 }, { "epoch": 1.2745904545776896, "grad_norm": 3.9666519165039062, "loss": 3.894, "lr": 0.00044153846153846153, "step": 4494, "tokens_trained": 2.208506616 }, { "epoch": 1.2751577902276434, "grad_norm": 2.1190407276153564, "loss": 3.9141, "lr": 0.0004412587412587413, "step": 4496, "tokens_trained": 2.209489192 }, { "epoch": 1.2757251258775972, "grad_norm": 1.3682332038879395, "loss": 3.8666, "lr": 0.00044097902097902096, "step": 4498, "tokens_trained": 2.210472392 }, { "epoch": 1.2762924615275513, "grad_norm": 2.5941426753997803, "loss": 3.8921, "lr": 0.0004406993006993007, "step": 4500, "tokens_trained": 2.211451384 }, { "epoch": 1.2762924615275513, "eval_loss": 0.9826880097389221, "eval_runtime": 20.931, "step": 4500, "tokens_trained": 2.211451384 }, { "epoch": 1.276859797177505, "grad_norm": 3.0399274826049805, "loss": 3.9331, "lr": 0.0004404195804195804, "step": 4502, "tokens_trained": 2.212433736 }, { "epoch": 1.2774271328274591, "grad_norm": 4.30709981918335, "loss": 3.9199, "lr": 0.00044013986013986014, "step": 4504, "tokens_trained": 2.21341728 }, { "epoch": 1.277994468477413, "grad_norm": 3.0569705963134766, "loss": 3.8956, "lr": 0.0004398601398601399, "step": 4506, "tokens_trained": 2.214397528 }, { "epoch": 1.2785618041273668, "grad_norm": 4.9559197425842285, "loss": 3.9736, "lr": 0.00043958041958041963, "step": 4508, "tokens_trained": 2.215381712 }, { "epoch": 1.2791291397773208, "grad_norm": 2.7426505088806152, "loss": 3.9042, "lr": 0.0004393006993006993, "step": 4510, "tokens_trained": 2.216361048 }, { "epoch": 1.2796964754272746, "grad_norm": 1.8043859004974365, "loss": 3.8892, "lr": 0.000439020979020979, "step": 4512, "tokens_trained": 2.217344128 }, { "epoch": 1.2802638110772286, "grad_norm": 4.298875331878662, "loss": 3.9133, "lr": 0.00043874125874125876, "step": 4514, "tokens_trained": 2.218327112 }, { "epoch": 1.2808311467271825, "grad_norm": 2.752638339996338, "loss": 3.9078, "lr": 0.00043846153846153845, "step": 4516, "tokens_trained": 2.219311704 }, { "epoch": 1.2813984823771363, "grad_norm": 4.202718257904053, "loss": 3.9452, "lr": 0.0004381818181818182, "step": 4518, "tokens_trained": 2.220295888 }, { "epoch": 1.2819658180270903, "grad_norm": 3.5449273586273193, "loss": 3.8367, "lr": 0.0004379020979020979, "step": 4520, "tokens_trained": 2.221281456 }, { "epoch": 1.2825331536770441, "grad_norm": 2.472935199737549, "loss": 3.8939, "lr": 0.00043762237762237763, "step": 4522, "tokens_trained": 2.222262496 }, { "epoch": 1.2831004893269982, "grad_norm": 1.1959093809127808, "loss": 3.9271, "lr": 0.0004373426573426573, "step": 4524, "tokens_trained": 2.223247352 }, { "epoch": 1.283667824976952, "grad_norm": 2.553889036178589, "loss": 3.9083, "lr": 0.0004370629370629371, "step": 4526, "tokens_trained": 2.224231696 }, { "epoch": 1.2842351606269058, "grad_norm": 2.028510570526123, "loss": 3.9004, "lr": 0.0004367832167832168, "step": 4528, "tokens_trained": 2.225211808 }, { "epoch": 1.2848024962768598, "grad_norm": 2.498624086380005, "loss": 3.9138, "lr": 0.0004365034965034965, "step": 4530, "tokens_trained": 2.226197488 }, { "epoch": 1.2853698319268136, "grad_norm": 2.689389228820801, "loss": 3.9439, "lr": 0.00043622377622377624, "step": 4532, "tokens_trained": 2.227181176 }, { "epoch": 1.2859371675767677, "grad_norm": 6.014649868011475, "loss": 3.9232, "lr": 0.00043594405594405593, "step": 4534, "tokens_trained": 2.228163272 }, { "epoch": 1.2865045032267215, "grad_norm": 4.911413192749023, "loss": 3.9831, "lr": 0.0004356643356643357, "step": 4536, "tokens_trained": 2.229142248 }, { "epoch": 1.2870718388766753, "grad_norm": 3.633075714111328, "loss": 3.944, "lr": 0.00043538461538461537, "step": 4538, "tokens_trained": 2.23012372 }, { "epoch": 1.2876391745266293, "grad_norm": 2.4579458236694336, "loss": 3.9051, "lr": 0.0004351048951048951, "step": 4540, "tokens_trained": 2.231109312 }, { "epoch": 1.2882065101765832, "grad_norm": 2.5251097679138184, "loss": 3.9248, "lr": 0.0004348251748251748, "step": 4542, "tokens_trained": 2.232096216 }, { "epoch": 1.2887738458265372, "grad_norm": 4.023996353149414, "loss": 3.9358, "lr": 0.0004345454545454546, "step": 4544, "tokens_trained": 2.233081224 }, { "epoch": 1.289341181476491, "grad_norm": 3.8658416271209717, "loss": 3.9193, "lr": 0.0004342657342657343, "step": 4546, "tokens_trained": 2.234069488 }, { "epoch": 1.2899085171264448, "grad_norm": 6.1119914054870605, "loss": 3.8991, "lr": 0.000433986013986014, "step": 4548, "tokens_trained": 2.23505432 }, { "epoch": 1.2904758527763989, "grad_norm": 3.834200620651245, "loss": 3.9844, "lr": 0.0004337062937062937, "step": 4550, "tokens_trained": 2.236034064 }, { "epoch": 1.2910431884263527, "grad_norm": 3.4992194175720215, "loss": 3.9358, "lr": 0.0004334265734265734, "step": 4552, "tokens_trained": 2.23701784 }, { "epoch": 1.2916105240763067, "grad_norm": 5.517240524291992, "loss": 3.9046, "lr": 0.00043314685314685316, "step": 4554, "tokens_trained": 2.238003144 }, { "epoch": 1.2921778597262605, "grad_norm": 3.596975803375244, "loss": 3.9073, "lr": 0.00043286713286713285, "step": 4556, "tokens_trained": 2.238986056 }, { "epoch": 1.2927451953762144, "grad_norm": 6.674678325653076, "loss": 3.9285, "lr": 0.0004325874125874126, "step": 4558, "tokens_trained": 2.239968824 }, { "epoch": 1.2933125310261684, "grad_norm": 3.589822292327881, "loss": 3.9137, "lr": 0.0004323076923076923, "step": 4560, "tokens_trained": 2.24095252 }, { "epoch": 1.2938798666761222, "grad_norm": 4.785327434539795, "loss": 3.9188, "lr": 0.0004320279720279721, "step": 4562, "tokens_trained": 2.241935408 }, { "epoch": 1.2944472023260762, "grad_norm": 5.784316062927246, "loss": 3.8804, "lr": 0.0004317482517482518, "step": 4564, "tokens_trained": 2.242919696 }, { "epoch": 1.29501453797603, "grad_norm": 4.1364641189575195, "loss": 3.9, "lr": 0.00043146853146853147, "step": 4566, "tokens_trained": 2.243899048 }, { "epoch": 1.2955818736259839, "grad_norm": 4.100215435028076, "loss": 3.875, "lr": 0.0004311888111888112, "step": 4568, "tokens_trained": 2.24487848 }, { "epoch": 1.296149209275938, "grad_norm": 5.456444263458252, "loss": 3.9252, "lr": 0.0004309090909090909, "step": 4570, "tokens_trained": 2.245860712 }, { "epoch": 1.2967165449258917, "grad_norm": 4.084255695343018, "loss": 3.8755, "lr": 0.00043062937062937065, "step": 4572, "tokens_trained": 2.246846216 }, { "epoch": 1.2972838805758458, "grad_norm": 4.147522926330566, "loss": 3.9162, "lr": 0.00043034965034965034, "step": 4574, "tokens_trained": 2.24783164 }, { "epoch": 1.2978512162257996, "grad_norm": 5.48593807220459, "loss": 3.9073, "lr": 0.0004300699300699301, "step": 4576, "tokens_trained": 2.248817664 }, { "epoch": 1.2984185518757534, "grad_norm": 2.8644235134124756, "loss": 3.9117, "lr": 0.00042979020979020977, "step": 4578, "tokens_trained": 2.249802224 }, { "epoch": 1.2989858875257074, "grad_norm": 1.8577483892440796, "loss": 3.8833, "lr": 0.00042951048951048957, "step": 4580, "tokens_trained": 2.250785496 }, { "epoch": 1.2995532231756612, "grad_norm": 1.4357212781906128, "loss": 3.8986, "lr": 0.00042923076923076926, "step": 4582, "tokens_trained": 2.25177068 }, { "epoch": 1.3001205588256153, "grad_norm": 1.9124270677566528, "loss": 3.9149, "lr": 0.00042895104895104895, "step": 4584, "tokens_trained": 2.252752112 }, { "epoch": 1.300687894475569, "grad_norm": 0.6659060120582581, "loss": 3.8666, "lr": 0.0004286713286713287, "step": 4586, "tokens_trained": 2.25373604 }, { "epoch": 1.301255230125523, "grad_norm": 1.1679121255874634, "loss": 3.9469, "lr": 0.0004283916083916084, "step": 4588, "tokens_trained": 2.254720456 }, { "epoch": 1.301822565775477, "grad_norm": 2.010969877243042, "loss": 3.9181, "lr": 0.00042811188811188813, "step": 4590, "tokens_trained": 2.255698824 }, { "epoch": 1.3023899014254308, "grad_norm": 2.0586466789245605, "loss": 3.8682, "lr": 0.0004278321678321678, "step": 4592, "tokens_trained": 2.256682928 }, { "epoch": 1.3029572370753848, "grad_norm": 1.4269180297851562, "loss": 3.935, "lr": 0.00042755244755244756, "step": 4594, "tokens_trained": 2.257665184 }, { "epoch": 1.3035245727253386, "grad_norm": 3.324599504470825, "loss": 3.9849, "lr": 0.00042727272727272726, "step": 4596, "tokens_trained": 2.258650496 }, { "epoch": 1.3040919083752924, "grad_norm": 5.035736560821533, "loss": 3.9088, "lr": 0.00042699300699300705, "step": 4598, "tokens_trained": 2.259632984 }, { "epoch": 1.3046592440252465, "grad_norm": 3.3298044204711914, "loss": 3.9033, "lr": 0.00042671328671328674, "step": 4600, "tokens_trained": 2.260615528 }, { "epoch": 1.3052265796752003, "grad_norm": 1.253243088722229, "loss": 3.9154, "lr": 0.00042643356643356643, "step": 4602, "tokens_trained": 2.261605648 }, { "epoch": 1.3057939153251543, "grad_norm": 1.8505600690841675, "loss": 3.8771, "lr": 0.0004261538461538462, "step": 4604, "tokens_trained": 2.262590648 }, { "epoch": 1.3063612509751081, "grad_norm": 7.305438995361328, "loss": 3.9323, "lr": 0.00042587412587412587, "step": 4606, "tokens_trained": 2.263579136 }, { "epoch": 1.306928586625062, "grad_norm": 4.584920406341553, "loss": 3.9097, "lr": 0.0004255944055944056, "step": 4608, "tokens_trained": 2.264559496 }, { "epoch": 1.307495922275016, "grad_norm": 2.3128468990325928, "loss": 3.8532, "lr": 0.0004253146853146853, "step": 4610, "tokens_trained": 2.265543104 }, { "epoch": 1.3080632579249698, "grad_norm": 3.1513102054595947, "loss": 3.944, "lr": 0.00042503496503496505, "step": 4612, "tokens_trained": 2.266528816 }, { "epoch": 1.3086305935749238, "grad_norm": 3.1904358863830566, "loss": 3.8706, "lr": 0.00042475524475524474, "step": 4614, "tokens_trained": 2.26751496 }, { "epoch": 1.3091979292248777, "grad_norm": 2.383105516433716, "loss": 3.925, "lr": 0.0004244755244755245, "step": 4616, "tokens_trained": 2.268497744 }, { "epoch": 1.3097652648748315, "grad_norm": 2.642970561981201, "loss": 3.918, "lr": 0.00042419580419580423, "step": 4618, "tokens_trained": 2.269478888 }, { "epoch": 1.3103326005247855, "grad_norm": 1.1598117351531982, "loss": 3.8815, "lr": 0.0004239160839160839, "step": 4620, "tokens_trained": 2.270465888 }, { "epoch": 1.3108999361747393, "grad_norm": 0.9736254811286926, "loss": 3.8866, "lr": 0.00042363636363636366, "step": 4622, "tokens_trained": 2.271446656 }, { "epoch": 1.3114672718246934, "grad_norm": 2.0817017555236816, "loss": 3.9753, "lr": 0.00042335664335664335, "step": 4624, "tokens_trained": 2.272427288 }, { "epoch": 1.3117509396496703, "eval_loss": 0.9817197918891907, "eval_runtime": 20.1783, "step": 4625, "tokens_trained": 2.272921208 }, { "epoch": 1.3120346074746472, "grad_norm": 4.969366550445557, "loss": 3.8696, "lr": 0.0004230769230769231, "step": 4626, "tokens_trained": 2.273412256 }, { "epoch": 1.312601943124601, "grad_norm": 3.270707130432129, "loss": 3.9589, "lr": 0.0004227972027972028, "step": 4628, "tokens_trained": 2.274396776 }, { "epoch": 1.313169278774555, "grad_norm": 2.6939852237701416, "loss": 3.8711, "lr": 0.00042251748251748253, "step": 4630, "tokens_trained": 2.27537728 }, { "epoch": 1.3137366144245088, "grad_norm": 3.0615079402923584, "loss": 3.8899, "lr": 0.0004222377622377622, "step": 4632, "tokens_trained": 2.276362448 }, { "epoch": 1.3143039500744629, "grad_norm": 3.1804049015045166, "loss": 3.9158, "lr": 0.00042195804195804197, "step": 4634, "tokens_trained": 2.277342984 }, { "epoch": 1.3148712857244167, "grad_norm": 1.3030014038085938, "loss": 3.958, "lr": 0.0004216783216783217, "step": 4636, "tokens_trained": 2.278326696 }, { "epoch": 1.3154386213743705, "grad_norm": 2.9791958332061768, "loss": 3.9412, "lr": 0.0004213986013986014, "step": 4638, "tokens_trained": 2.27930736 }, { "epoch": 1.3160059570243245, "grad_norm": 4.533553600311279, "loss": 3.9069, "lr": 0.00042111888111888115, "step": 4640, "tokens_trained": 2.280293032 }, { "epoch": 1.3165732926742784, "grad_norm": 4.159526348114014, "loss": 3.9262, "lr": 0.00042083916083916084, "step": 4642, "tokens_trained": 2.281273472 }, { "epoch": 1.3171406283242324, "grad_norm": 2.847492218017578, "loss": 3.9244, "lr": 0.0004205594405594406, "step": 4644, "tokens_trained": 2.282257288 }, { "epoch": 1.3177079639741862, "grad_norm": 3.4552533626556396, "loss": 3.9252, "lr": 0.00042027972027972027, "step": 4646, "tokens_trained": 2.28324236 }, { "epoch": 1.31827529962414, "grad_norm": 1.4335713386535645, "loss": 3.9075, "lr": 0.00042, "step": 4648, "tokens_trained": 2.284224 }, { "epoch": 1.318842635274094, "grad_norm": 3.8727214336395264, "loss": 3.8907, "lr": 0.0004197202797202797, "step": 4650, "tokens_trained": 2.285205656 }, { "epoch": 1.3194099709240479, "grad_norm": 4.415209770202637, "loss": 3.9138, "lr": 0.0004194405594405594, "step": 4652, "tokens_trained": 2.286191744 }, { "epoch": 1.319977306574002, "grad_norm": 3.026095151901245, "loss": 3.889, "lr": 0.0004191608391608392, "step": 4654, "tokens_trained": 2.287174 }, { "epoch": 1.3205446422239557, "grad_norm": 3.9142091274261475, "loss": 3.8506, "lr": 0.0004188811188811189, "step": 4656, "tokens_trained": 2.288156824 }, { "epoch": 1.3211119778739095, "grad_norm": 5.409343719482422, "loss": 3.9258, "lr": 0.00041860139860139863, "step": 4658, "tokens_trained": 2.289136776 }, { "epoch": 1.3216793135238636, "grad_norm": 1.3607697486877441, "loss": 3.9254, "lr": 0.0004183216783216783, "step": 4660, "tokens_trained": 2.290117448 }, { "epoch": 1.3222466491738174, "grad_norm": 4.911555290222168, "loss": 3.9406, "lr": 0.00041804195804195807, "step": 4662, "tokens_trained": 2.291098904 }, { "epoch": 1.3228139848237714, "grad_norm": 5.282960891723633, "loss": 3.9109, "lr": 0.00041776223776223776, "step": 4664, "tokens_trained": 2.29208568 }, { "epoch": 1.3233813204737253, "grad_norm": 4.313295364379883, "loss": 3.9077, "lr": 0.0004174825174825175, "step": 4666, "tokens_trained": 2.293067688 }, { "epoch": 1.323948656123679, "grad_norm": 2.7871968746185303, "loss": 3.9306, "lr": 0.0004172027972027972, "step": 4668, "tokens_trained": 2.294050264 }, { "epoch": 1.324515991773633, "grad_norm": 2.481030225753784, "loss": 3.9429, "lr": 0.0004169230769230769, "step": 4670, "tokens_trained": 2.295031904 }, { "epoch": 1.325083327423587, "grad_norm": 5.044018268585205, "loss": 3.8738, "lr": 0.0004166433566433567, "step": 4672, "tokens_trained": 2.296011688 }, { "epoch": 1.325650663073541, "grad_norm": 6.23581075668335, "loss": 3.9253, "lr": 0.00041636363636363637, "step": 4674, "tokens_trained": 2.296996288 }, { "epoch": 1.3262179987234948, "grad_norm": 2.041799545288086, "loss": 3.905, "lr": 0.0004160839160839161, "step": 4676, "tokens_trained": 2.297978912 }, { "epoch": 1.3267853343734486, "grad_norm": 1.1758520603179932, "loss": 3.8992, "lr": 0.0004158041958041958, "step": 4678, "tokens_trained": 2.298963184 }, { "epoch": 1.3273526700234026, "grad_norm": 2.1230716705322266, "loss": 3.9038, "lr": 0.00041552447552447555, "step": 4680, "tokens_trained": 2.299946672 }, { "epoch": 1.3279200056733564, "grad_norm": 1.821915626525879, "loss": 3.9239, "lr": 0.00041524475524475524, "step": 4682, "tokens_trained": 2.300931632 }, { "epoch": 1.3284873413233105, "grad_norm": 0.7051568627357483, "loss": 3.9281, "lr": 0.000414965034965035, "step": 4684, "tokens_trained": 2.301910304 }, { "epoch": 1.3290546769732643, "grad_norm": 1.8326458930969238, "loss": 3.9498, "lr": 0.0004146853146853147, "step": 4686, "tokens_trained": 2.302891896 }, { "epoch": 1.329622012623218, "grad_norm": 1.4614375829696655, "loss": 3.9342, "lr": 0.00041440559440559437, "step": 4688, "tokens_trained": 2.30387572 }, { "epoch": 1.3301893482731721, "grad_norm": 1.6197412014007568, "loss": 3.8507, "lr": 0.00041412587412587417, "step": 4690, "tokens_trained": 2.304857152 }, { "epoch": 1.330756683923126, "grad_norm": 0.5570790767669678, "loss": 3.9307, "lr": 0.00041384615384615386, "step": 4692, "tokens_trained": 2.305841336 }, { "epoch": 1.33132401957308, "grad_norm": 1.6550084352493286, "loss": 3.9237, "lr": 0.0004135664335664336, "step": 4694, "tokens_trained": 2.30682708 }, { "epoch": 1.3318913552230338, "grad_norm": 1.334955096244812, "loss": 3.917, "lr": 0.0004132867132867133, "step": 4696, "tokens_trained": 2.307809136 }, { "epoch": 1.3324586908729876, "grad_norm": 3.4471423625946045, "loss": 3.9231, "lr": 0.00041300699300699304, "step": 4698, "tokens_trained": 2.308789496 }, { "epoch": 1.3330260265229417, "grad_norm": 4.426776885986328, "loss": 3.8495, "lr": 0.0004127272727272727, "step": 4700, "tokens_trained": 2.3097722 }, { "epoch": 1.3335933621728955, "grad_norm": 4.349783897399902, "loss": 3.918, "lr": 0.00041244755244755247, "step": 4702, "tokens_trained": 2.310748672 }, { "epoch": 1.3341606978228495, "grad_norm": 4.3204426765441895, "loss": 3.8733, "lr": 0.00041216783216783216, "step": 4704, "tokens_trained": 2.31172956 }, { "epoch": 1.3347280334728033, "grad_norm": 4.6586480140686035, "loss": 3.9174, "lr": 0.00041188811188811185, "step": 4706, "tokens_trained": 2.312715208 }, { "epoch": 1.3352953691227571, "grad_norm": 4.72362756729126, "loss": 3.8998, "lr": 0.00041160839160839165, "step": 4708, "tokens_trained": 2.313700896 }, { "epoch": 1.3358627047727112, "grad_norm": 4.0833516120910645, "loss": 3.8726, "lr": 0.00041132867132867134, "step": 4710, "tokens_trained": 2.314683176 }, { "epoch": 1.336430040422665, "grad_norm": 3.979100227355957, "loss": 3.9482, "lr": 0.0004110489510489511, "step": 4712, "tokens_trained": 2.315667504 }, { "epoch": 1.336997376072619, "grad_norm": 2.9478790760040283, "loss": 3.8954, "lr": 0.0004107692307692308, "step": 4714, "tokens_trained": 2.316647488 }, { "epoch": 1.3375647117225729, "grad_norm": 3.2437031269073486, "loss": 3.8961, "lr": 0.0004104895104895105, "step": 4716, "tokens_trained": 2.317629888 }, { "epoch": 1.3381320473725267, "grad_norm": 3.9469761848449707, "loss": 3.916, "lr": 0.0004102097902097902, "step": 4718, "tokens_trained": 2.318613016 }, { "epoch": 1.3386993830224807, "grad_norm": 4.271415710449219, "loss": 3.928, "lr": 0.0004099300699300699, "step": 4720, "tokens_trained": 2.319592112 }, { "epoch": 1.3392667186724345, "grad_norm": 2.656351327896118, "loss": 3.8936, "lr": 0.00040965034965034964, "step": 4722, "tokens_trained": 2.320572056 }, { "epoch": 1.3398340543223886, "grad_norm": 2.6717190742492676, "loss": 3.8753, "lr": 0.00040937062937062934, "step": 4724, "tokens_trained": 2.321557736 }, { "epoch": 1.3404013899723424, "grad_norm": 4.214351654052734, "loss": 3.8905, "lr": 0.00040909090909090913, "step": 4726, "tokens_trained": 2.322539008 }, { "epoch": 1.3409687256222962, "grad_norm": 4.417314052581787, "loss": 3.9017, "lr": 0.0004088111888111888, "step": 4728, "tokens_trained": 2.323524672 }, { "epoch": 1.3415360612722502, "grad_norm": 3.1664652824401855, "loss": 3.9226, "lr": 0.00040853146853146857, "step": 4730, "tokens_trained": 2.32450916 }, { "epoch": 1.342103396922204, "grad_norm": 2.39656662940979, "loss": 3.91, "lr": 0.00040825174825174826, "step": 4732, "tokens_trained": 2.325490472 }, { "epoch": 1.342670732572158, "grad_norm": 2.9324393272399902, "loss": 3.8945, "lr": 0.000407972027972028, "step": 4734, "tokens_trained": 2.326473872 }, { "epoch": 1.343238068222112, "grad_norm": 3.534731388092041, "loss": 3.8557, "lr": 0.0004076923076923077, "step": 4736, "tokens_trained": 2.327458424 }, { "epoch": 1.3438054038720657, "grad_norm": 2.3089616298675537, "loss": 3.8957, "lr": 0.0004074125874125874, "step": 4738, "tokens_trained": 2.328444432 }, { "epoch": 1.3443727395220197, "grad_norm": 3.3014519214630127, "loss": 3.8746, "lr": 0.00040713286713286713, "step": 4740, "tokens_trained": 2.32942976 }, { "epoch": 1.3449400751719736, "grad_norm": 5.408111572265625, "loss": 3.9117, "lr": 0.0004068531468531468, "step": 4742, "tokens_trained": 2.330411736 }, { "epoch": 1.3455074108219276, "grad_norm": 4.326341152191162, "loss": 3.8331, "lr": 0.0004065734265734266, "step": 4744, "tokens_trained": 2.3313928 }, { "epoch": 1.3460747464718814, "grad_norm": 3.9538161754608154, "loss": 3.9216, "lr": 0.0004062937062937063, "step": 4746, "tokens_trained": 2.332380728 }, { "epoch": 1.3466420821218352, "grad_norm": 2.4591166973114014, "loss": 3.8795, "lr": 0.00040601398601398605, "step": 4748, "tokens_trained": 2.333363448 }, { "epoch": 1.3472094177717893, "grad_norm": 3.2325263023376465, "loss": 3.9277, "lr": 0.00040573426573426574, "step": 4750, "tokens_trained": 2.334348496 }, { "epoch": 1.3472094177717893, "eval_loss": 0.9784421920776367, "eval_runtime": 20.3876, "step": 4750, "tokens_trained": 2.334348496 }, { "epoch": 1.347776753421743, "grad_norm": 2.721426486968994, "loss": 3.9264, "lr": 0.0004054545454545455, "step": 4752, "tokens_trained": 2.335333088 }, { "epoch": 1.3483440890716971, "grad_norm": 1.0679550170898438, "loss": 3.9515, "lr": 0.0004051748251748252, "step": 4754, "tokens_trained": 2.33632024 }, { "epoch": 1.348911424721651, "grad_norm": 0.6162118911743164, "loss": 3.8984, "lr": 0.00040489510489510487, "step": 4756, "tokens_trained": 2.337303072 }, { "epoch": 1.3494787603716047, "grad_norm": 1.993177890777588, "loss": 3.8845, "lr": 0.0004046153846153846, "step": 4758, "tokens_trained": 2.338281984 }, { "epoch": 1.3500460960215588, "grad_norm": 2.5877304077148438, "loss": 3.9123, "lr": 0.0004043356643356643, "step": 4760, "tokens_trained": 2.339261224 }, { "epoch": 1.3506134316715126, "grad_norm": 6.708667755126953, "loss": 3.9087, "lr": 0.0004040559440559441, "step": 4762, "tokens_trained": 2.340241416 }, { "epoch": 1.3511807673214666, "grad_norm": 4.514158248901367, "loss": 3.9094, "lr": 0.0004037762237762238, "step": 4764, "tokens_trained": 2.341223048 }, { "epoch": 1.3517481029714205, "grad_norm": 2.3245937824249268, "loss": 3.9466, "lr": 0.00040349650349650354, "step": 4766, "tokens_trained": 2.34220588 }, { "epoch": 1.3523154386213743, "grad_norm": 4.14736795425415, "loss": 3.8754, "lr": 0.00040321678321678323, "step": 4768, "tokens_trained": 2.343188752 }, { "epoch": 1.3528827742713283, "grad_norm": 6.2871880531311035, "loss": 3.8384, "lr": 0.00040293706293706297, "step": 4770, "tokens_trained": 2.344169864 }, { "epoch": 1.3534501099212821, "grad_norm": 1.5958847999572754, "loss": 3.8735, "lr": 0.00040265734265734266, "step": 4772, "tokens_trained": 2.345153104 }, { "epoch": 1.3540174455712362, "grad_norm": 5.585666179656982, "loss": 3.9073, "lr": 0.00040237762237762235, "step": 4774, "tokens_trained": 2.346137 }, { "epoch": 1.35458478122119, "grad_norm": 3.8506343364715576, "loss": 3.9298, "lr": 0.0004020979020979021, "step": 4776, "tokens_trained": 2.347124712 }, { "epoch": 1.3551521168711438, "grad_norm": 4.7482757568359375, "loss": 3.8957, "lr": 0.0004018181818181818, "step": 4778, "tokens_trained": 2.348107992 }, { "epoch": 1.3557194525210978, "grad_norm": 1.6603455543518066, "loss": 3.9345, "lr": 0.00040153846153846153, "step": 4780, "tokens_trained": 2.349091872 }, { "epoch": 1.3562867881710516, "grad_norm": 0.40717223286628723, "loss": 3.8988, "lr": 0.0004012587412587413, "step": 4782, "tokens_trained": 2.350073024 }, { "epoch": 1.3568541238210057, "grad_norm": 1.7904951572418213, "loss": 3.9163, "lr": 0.000400979020979021, "step": 4784, "tokens_trained": 2.351057608 }, { "epoch": 1.3574214594709595, "grad_norm": 1.3750170469284058, "loss": 3.9321, "lr": 0.0004006993006993007, "step": 4786, "tokens_trained": 2.352039248 }, { "epoch": 1.3579887951209133, "grad_norm": 3.6613173484802246, "loss": 3.9463, "lr": 0.00040041958041958046, "step": 4788, "tokens_trained": 2.3530216 }, { "epoch": 1.3585561307708673, "grad_norm": 3.13639497756958, "loss": 3.8653, "lr": 0.00040013986013986015, "step": 4790, "tokens_trained": 2.353999496 }, { "epoch": 1.3591234664208212, "grad_norm": 3.408346176147461, "loss": 3.946, "lr": 0.00039986013986013984, "step": 4792, "tokens_trained": 2.354983984 }, { "epoch": 1.3596908020707752, "grad_norm": 4.422549247741699, "loss": 3.9123, "lr": 0.0003995804195804196, "step": 4794, "tokens_trained": 2.355968032 }, { "epoch": 1.360258137720729, "grad_norm": 2.9923927783966064, "loss": 3.9502, "lr": 0.00039930069930069927, "step": 4796, "tokens_trained": 2.356951192 }, { "epoch": 1.3608254733706828, "grad_norm": 1.1125166416168213, "loss": 3.8638, "lr": 0.000399020979020979, "step": 4798, "tokens_trained": 2.357930688 }, { "epoch": 1.3613928090206369, "grad_norm": 2.9915504455566406, "loss": 3.9227, "lr": 0.00039874125874125876, "step": 4800, "tokens_trained": 2.358909896 }, { "epoch": 1.3619601446705907, "grad_norm": 4.443681716918945, "loss": 3.9206, "lr": 0.0003984615384615385, "step": 4802, "tokens_trained": 2.35989144 }, { "epoch": 1.3625274803205447, "grad_norm": 5.246060848236084, "loss": 3.8391, "lr": 0.0003981818181818182, "step": 4804, "tokens_trained": 2.360878576 }, { "epoch": 1.3630948159704985, "grad_norm": 7.064333915710449, "loss": 3.9045, "lr": 0.00039790209790209794, "step": 4806, "tokens_trained": 2.361860808 }, { "epoch": 1.3636621516204523, "grad_norm": 2.9516990184783936, "loss": 3.9038, "lr": 0.00039762237762237763, "step": 4808, "tokens_trained": 2.362848272 }, { "epoch": 1.3642294872704064, "grad_norm": 7.830825328826904, "loss": 3.9086, "lr": 0.0003973426573426573, "step": 4810, "tokens_trained": 2.363829304 }, { "epoch": 1.3647968229203602, "grad_norm": 3.3761377334594727, "loss": 3.8936, "lr": 0.00039706293706293707, "step": 4812, "tokens_trained": 2.364814928 }, { "epoch": 1.3653641585703142, "grad_norm": 3.8069584369659424, "loss": 3.8805, "lr": 0.00039678321678321676, "step": 4814, "tokens_trained": 2.365793656 }, { "epoch": 1.365931494220268, "grad_norm": 5.233834743499756, "loss": 3.8868, "lr": 0.0003965034965034965, "step": 4816, "tokens_trained": 2.366781064 }, { "epoch": 1.3664988298702219, "grad_norm": 5.134295463562012, "loss": 3.883, "lr": 0.00039622377622377625, "step": 4818, "tokens_trained": 2.367763392 }, { "epoch": 1.367066165520176, "grad_norm": 1.2896602153778076, "loss": 3.8733, "lr": 0.000395944055944056, "step": 4820, "tokens_trained": 2.368744936 }, { "epoch": 1.3676335011701297, "grad_norm": 6.089853763580322, "loss": 3.943, "lr": 0.0003956643356643357, "step": 4822, "tokens_trained": 2.369729952 }, { "epoch": 1.3682008368200838, "grad_norm": 4.928650379180908, "loss": 3.9151, "lr": 0.0003953846153846154, "step": 4824, "tokens_trained": 2.370710944 }, { "epoch": 1.3687681724700376, "grad_norm": 4.412777423858643, "loss": 3.9165, "lr": 0.0003951048951048951, "step": 4826, "tokens_trained": 2.371693224 }, { "epoch": 1.3693355081199914, "grad_norm": 3.940869092941284, "loss": 3.9457, "lr": 0.0003948251748251748, "step": 4828, "tokens_trained": 2.372672808 }, { "epoch": 1.3699028437699454, "grad_norm": 4.23148775100708, "loss": 3.8893, "lr": 0.00039454545454545455, "step": 4830, "tokens_trained": 2.373659816 }, { "epoch": 1.3704701794198992, "grad_norm": 2.781536817550659, "loss": 3.8649, "lr": 0.00039426573426573424, "step": 4832, "tokens_trained": 2.374645648 }, { "epoch": 1.3710375150698533, "grad_norm": 1.7263449430465698, "loss": 3.9142, "lr": 0.000393986013986014, "step": 4834, "tokens_trained": 2.375627952 }, { "epoch": 1.371604850719807, "grad_norm": 7.530355453491211, "loss": 3.9146, "lr": 0.00039370629370629373, "step": 4836, "tokens_trained": 2.376608568 }, { "epoch": 1.372172186369761, "grad_norm": 5.03418493270874, "loss": 3.8995, "lr": 0.0003934265734265735, "step": 4838, "tokens_trained": 2.377594368 }, { "epoch": 1.372739522019715, "grad_norm": 3.9235804080963135, "loss": 3.8575, "lr": 0.00039314685314685316, "step": 4840, "tokens_trained": 2.37857584 }, { "epoch": 1.3733068576696688, "grad_norm": 4.762357234954834, "loss": 3.9044, "lr": 0.00039286713286713286, "step": 4842, "tokens_trained": 2.37956052 }, { "epoch": 1.3738741933196228, "grad_norm": 4.108587741851807, "loss": 3.8829, "lr": 0.0003925874125874126, "step": 4844, "tokens_trained": 2.380537584 }, { "epoch": 1.3744415289695766, "grad_norm": 2.686072826385498, "loss": 3.8575, "lr": 0.0003923076923076923, "step": 4846, "tokens_trained": 2.381523144 }, { "epoch": 1.3750088646195304, "grad_norm": 3.9192161560058594, "loss": 3.8674, "lr": 0.00039202797202797203, "step": 4848, "tokens_trained": 2.382504752 }, { "epoch": 1.3755762002694845, "grad_norm": 3.2957770824432373, "loss": 3.8897, "lr": 0.0003917482517482517, "step": 4850, "tokens_trained": 2.383486968 }, { "epoch": 1.3761435359194383, "grad_norm": 3.0208771228790283, "loss": 3.8923, "lr": 0.00039146853146853147, "step": 4852, "tokens_trained": 2.384467168 }, { "epoch": 1.3767108715693923, "grad_norm": 1.4386385679244995, "loss": 3.8489, "lr": 0.0003911888111888112, "step": 4854, "tokens_trained": 2.385446096 }, { "epoch": 1.3772782072193461, "grad_norm": 2.494499444961548, "loss": 3.8857, "lr": 0.00039090909090909096, "step": 4856, "tokens_trained": 2.386428088 }, { "epoch": 1.3778455428693, "grad_norm": 2.573397397994995, "loss": 3.8716, "lr": 0.00039062937062937065, "step": 4858, "tokens_trained": 2.387410288 }, { "epoch": 1.378412878519254, "grad_norm": 2.8497166633605957, "loss": 3.9172, "lr": 0.00039034965034965034, "step": 4860, "tokens_trained": 2.388389632 }, { "epoch": 1.3789802141692078, "grad_norm": 1.3268458843231201, "loss": 3.8724, "lr": 0.0003900699300699301, "step": 4862, "tokens_trained": 2.389375248 }, { "epoch": 1.3795475498191618, "grad_norm": 2.5455031394958496, "loss": 3.9061, "lr": 0.0003897902097902098, "step": 4864, "tokens_trained": 2.390357104 }, { "epoch": 1.3801148854691156, "grad_norm": 2.6307923793792725, "loss": 3.894, "lr": 0.0003895104895104895, "step": 4866, "tokens_trained": 2.391334728 }, { "epoch": 1.3806822211190695, "grad_norm": 2.4805266857147217, "loss": 3.848, "lr": 0.0003892307692307692, "step": 4868, "tokens_trained": 2.392315672 }, { "epoch": 1.3812495567690235, "grad_norm": 2.6160788536071777, "loss": 3.9057, "lr": 0.00038895104895104895, "step": 4870, "tokens_trained": 2.393302152 }, { "epoch": 1.3818168924189773, "grad_norm": 1.4398711919784546, "loss": 3.8609, "lr": 0.0003886713286713287, "step": 4872, "tokens_trained": 2.394281808 }, { "epoch": 1.3823842280689314, "grad_norm": 2.4663705825805664, "loss": 3.9316, "lr": 0.00038839160839160844, "step": 4874, "tokens_trained": 2.395263216 }, { "epoch": 1.3826678958939083, "eval_loss": 0.9779003858566284, "eval_runtime": 20.7864, "step": 4875, "tokens_trained": 2.395753536 }, { "epoch": 1.3829515637188852, "grad_norm": 2.455738067626953, "loss": 3.8944, "lr": 0.00038811188811188813, "step": 4876, "tokens_trained": 2.396244232 }, { "epoch": 1.383518899368839, "grad_norm": 4.183098793029785, "loss": 3.9096, "lr": 0.0003878321678321678, "step": 4878, "tokens_trained": 2.39722784 }, { "epoch": 1.384086235018793, "grad_norm": 6.7180495262146, "loss": 3.8762, "lr": 0.00038755244755244757, "step": 4880, "tokens_trained": 2.39820884 }, { "epoch": 1.3846535706687468, "grad_norm": 1.702336072921753, "loss": 3.8616, "lr": 0.00038727272727272726, "step": 4882, "tokens_trained": 2.39919008 }, { "epoch": 1.3852209063187009, "grad_norm": 10.165470123291016, "loss": 3.9159, "lr": 0.000386993006993007, "step": 4884, "tokens_trained": 2.400176128 }, { "epoch": 1.3857882419686547, "grad_norm": 4.2575297355651855, "loss": 3.9513, "lr": 0.0003867132867132867, "step": 4886, "tokens_trained": 2.401159616 }, { "epoch": 1.3863555776186085, "grad_norm": 4.321669578552246, "loss": 3.8929, "lr": 0.00038643356643356644, "step": 4888, "tokens_trained": 2.40214392 }, { "epoch": 1.3869229132685625, "grad_norm": 4.289078235626221, "loss": 3.8378, "lr": 0.0003861538461538462, "step": 4890, "tokens_trained": 2.403127904 }, { "epoch": 1.3874902489185164, "grad_norm": 6.578473091125488, "loss": 3.8603, "lr": 0.00038587412587412593, "step": 4892, "tokens_trained": 2.404109496 }, { "epoch": 1.3880575845684704, "grad_norm": 4.092262268066406, "loss": 3.9049, "lr": 0.0003855944055944056, "step": 4894, "tokens_trained": 2.405092064 }, { "epoch": 1.3886249202184242, "grad_norm": 3.304581642150879, "loss": 3.8862, "lr": 0.0003853146853146853, "step": 4896, "tokens_trained": 2.406074136 }, { "epoch": 1.389192255868378, "grad_norm": 3.7834372520446777, "loss": 3.916, "lr": 0.00038503496503496505, "step": 4898, "tokens_trained": 2.407054056 }, { "epoch": 1.389759591518332, "grad_norm": 3.30719256401062, "loss": 3.9162, "lr": 0.00038475524475524474, "step": 4900, "tokens_trained": 2.408035 }, { "epoch": 1.3903269271682859, "grad_norm": 2.2104077339172363, "loss": 3.9094, "lr": 0.0003844755244755245, "step": 4902, "tokens_trained": 2.409019864 }, { "epoch": 1.39089426281824, "grad_norm": 3.2836616039276123, "loss": 3.8586, "lr": 0.0003841958041958042, "step": 4904, "tokens_trained": 2.410002576 }, { "epoch": 1.3914615984681937, "grad_norm": 2.468010187149048, "loss": 3.8655, "lr": 0.0003839160839160839, "step": 4906, "tokens_trained": 2.410983384 }, { "epoch": 1.3920289341181475, "grad_norm": 2.7495617866516113, "loss": 3.8934, "lr": 0.0003836363636363636, "step": 4908, "tokens_trained": 2.411967688 }, { "epoch": 1.3925962697681016, "grad_norm": 2.61542010307312, "loss": 3.8928, "lr": 0.0003833566433566434, "step": 4910, "tokens_trained": 2.412946648 }, { "epoch": 1.3931636054180554, "grad_norm": 3.393087148666382, "loss": 3.9396, "lr": 0.0003830769230769231, "step": 4912, "tokens_trained": 2.413930608 }, { "epoch": 1.3937309410680094, "grad_norm": 2.1915347576141357, "loss": 3.8685, "lr": 0.0003827972027972028, "step": 4914, "tokens_trained": 2.414910456 }, { "epoch": 1.3942982767179632, "grad_norm": 1.2087231874465942, "loss": 3.9201, "lr": 0.00038251748251748254, "step": 4916, "tokens_trained": 2.41588864 }, { "epoch": 1.394865612367917, "grad_norm": 2.1861963272094727, "loss": 3.8936, "lr": 0.0003822377622377622, "step": 4918, "tokens_trained": 2.416869936 }, { "epoch": 1.395432948017871, "grad_norm": 2.2949490547180176, "loss": 3.8818, "lr": 0.00038195804195804197, "step": 4920, "tokens_trained": 2.417855472 }, { "epoch": 1.396000283667825, "grad_norm": 2.027250289916992, "loss": 3.8758, "lr": 0.00038167832167832166, "step": 4922, "tokens_trained": 2.418839744 }, { "epoch": 1.396567619317779, "grad_norm": 4.480210304260254, "loss": 3.8769, "lr": 0.0003813986013986014, "step": 4924, "tokens_trained": 2.4198266 }, { "epoch": 1.3971349549677328, "grad_norm": 2.718602180480957, "loss": 3.9309, "lr": 0.0003811188811188811, "step": 4926, "tokens_trained": 2.420809488 }, { "epoch": 1.3977022906176866, "grad_norm": 3.022064447402954, "loss": 3.8953, "lr": 0.0003808391608391609, "step": 4928, "tokens_trained": 2.421795792 }, { "epoch": 1.3982696262676406, "grad_norm": 3.6465160846710205, "loss": 3.901, "lr": 0.0003805594405594406, "step": 4930, "tokens_trained": 2.422778216 }, { "epoch": 1.3988369619175944, "grad_norm": 2.549898386001587, "loss": 3.8641, "lr": 0.0003802797202797203, "step": 4932, "tokens_trained": 2.423761104 }, { "epoch": 1.3994042975675485, "grad_norm": 2.1666665077209473, "loss": 3.9211, "lr": 0.00038, "step": 4934, "tokens_trained": 2.424746424 }, { "epoch": 1.3999716332175023, "grad_norm": 5.31266450881958, "loss": 3.8729, "lr": 0.0003797202797202797, "step": 4936, "tokens_trained": 2.425730296 }, { "epoch": 1.400538968867456, "grad_norm": 3.2631475925445557, "loss": 3.8741, "lr": 0.00037944055944055946, "step": 4938, "tokens_trained": 2.426711856 }, { "epoch": 1.4011063045174101, "grad_norm": 2.7507376670837402, "loss": 3.8877, "lr": 0.00037916083916083915, "step": 4940, "tokens_trained": 2.427695064 }, { "epoch": 1.401673640167364, "grad_norm": 2.361859083175659, "loss": 3.8937, "lr": 0.0003788811188811189, "step": 4942, "tokens_trained": 2.428680184 }, { "epoch": 1.402240975817318, "grad_norm": 3.007972240447998, "loss": 3.8591, "lr": 0.0003786013986013986, "step": 4944, "tokens_trained": 2.429668312 }, { "epoch": 1.4028083114672718, "grad_norm": 3.033128261566162, "loss": 3.9293, "lr": 0.0003783216783216784, "step": 4946, "tokens_trained": 2.430652248 }, { "epoch": 1.4033756471172256, "grad_norm": 1.0569933652877808, "loss": 3.9047, "lr": 0.00037804195804195807, "step": 4948, "tokens_trained": 2.431634048 }, { "epoch": 1.4039429827671797, "grad_norm": 1.1776299476623535, "loss": 3.8985, "lr": 0.00037776223776223776, "step": 4950, "tokens_trained": 2.432615856 }, { "epoch": 1.4045103184171335, "grad_norm": 2.139624834060669, "loss": 3.8648, "lr": 0.0003774825174825175, "step": 4952, "tokens_trained": 2.433598912 }, { "epoch": 1.4050776540670875, "grad_norm": 3.9667930603027344, "loss": 3.9196, "lr": 0.0003772027972027972, "step": 4954, "tokens_trained": 2.434583464 }, { "epoch": 1.4056449897170413, "grad_norm": 3.4130353927612305, "loss": 3.873, "lr": 0.00037692307692307694, "step": 4956, "tokens_trained": 2.435562696 }, { "epoch": 1.4062123253669951, "grad_norm": 2.91157603263855, "loss": 3.8901, "lr": 0.00037664335664335663, "step": 4958, "tokens_trained": 2.436544192 }, { "epoch": 1.4067796610169492, "grad_norm": 2.038764715194702, "loss": 3.8951, "lr": 0.0003763636363636364, "step": 4960, "tokens_trained": 2.43752728 }, { "epoch": 1.407346996666903, "grad_norm": 2.2672388553619385, "loss": 3.8573, "lr": 0.00037608391608391607, "step": 4962, "tokens_trained": 2.438511552 }, { "epoch": 1.407914332316857, "grad_norm": 2.4656710624694824, "loss": 3.8819, "lr": 0.0003758041958041958, "step": 4964, "tokens_trained": 2.4394974 }, { "epoch": 1.4084816679668108, "grad_norm": 2.4732837677001953, "loss": 3.8761, "lr": 0.00037552447552447555, "step": 4966, "tokens_trained": 2.440477752 }, { "epoch": 1.4090490036167647, "grad_norm": 4.646571636199951, "loss": 3.8883, "lr": 0.00037524475524475524, "step": 4968, "tokens_trained": 2.441464112 }, { "epoch": 1.4096163392667187, "grad_norm": 3.602743625640869, "loss": 3.9305, "lr": 0.000374965034965035, "step": 4970, "tokens_trained": 2.442441072 }, { "epoch": 1.4101836749166725, "grad_norm": 2.1577095985412598, "loss": 3.8883, "lr": 0.0003746853146853147, "step": 4972, "tokens_trained": 2.443425168 }, { "epoch": 1.4107510105666266, "grad_norm": 2.35933256149292, "loss": 3.9124, "lr": 0.0003744055944055944, "step": 4974, "tokens_trained": 2.444408448 }, { "epoch": 1.4113183462165804, "grad_norm": 7.452941417694092, "loss": 3.8741, "lr": 0.0003741258741258741, "step": 4976, "tokens_trained": 2.445390888 }, { "epoch": 1.4118856818665342, "grad_norm": 4.788355827331543, "loss": 3.8915, "lr": 0.00037384615384615386, "step": 4978, "tokens_trained": 2.44637136 }, { "epoch": 1.4124530175164882, "grad_norm": 2.7765729427337646, "loss": 3.8878, "lr": 0.00037356643356643355, "step": 4980, "tokens_trained": 2.447355568 }, { "epoch": 1.413020353166442, "grad_norm": 3.324477195739746, "loss": 3.8546, "lr": 0.0003732867132867133, "step": 4982, "tokens_trained": 2.448338952 }, { "epoch": 1.413587688816396, "grad_norm": 1.5179075002670288, "loss": 3.9019, "lr": 0.00037300699300699304, "step": 4984, "tokens_trained": 2.449324032 }, { "epoch": 1.4141550244663499, "grad_norm": 4.929554462432861, "loss": 3.8773, "lr": 0.00037272727272727273, "step": 4986, "tokens_trained": 2.450307912 }, { "epoch": 1.4147223601163037, "grad_norm": 4.763064384460449, "loss": 3.9035, "lr": 0.0003724475524475525, "step": 4988, "tokens_trained": 2.451293168 }, { "epoch": 1.4152896957662577, "grad_norm": 2.141029119491577, "loss": 3.9224, "lr": 0.00037216783216783216, "step": 4990, "tokens_trained": 2.452276952 }, { "epoch": 1.4158570314162116, "grad_norm": 3.93829607963562, "loss": 3.8889, "lr": 0.0003718881118881119, "step": 4992, "tokens_trained": 2.453264688 }, { "epoch": 1.4164243670661656, "grad_norm": 3.691845178604126, "loss": 3.884, "lr": 0.0003716083916083916, "step": 4994, "tokens_trained": 2.454252408 }, { "epoch": 1.4169917027161194, "grad_norm": 1.6449168920516968, "loss": 3.8893, "lr": 0.00037132867132867134, "step": 4996, "tokens_trained": 2.4552316 }, { "epoch": 1.4175590383660732, "grad_norm": 3.0063729286193848, "loss": 3.8786, "lr": 0.00037104895104895103, "step": 4998, "tokens_trained": 2.456215176 }, { "epoch": 1.4181263740160273, "grad_norm": 4.001911163330078, "loss": 3.8797, "lr": 0.0003707692307692308, "step": 5000, "tokens_trained": 2.4571994 }, { "epoch": 1.4181263740160273, "eval_loss": 0.9744628667831421, "eval_runtime": 20.7577, "step": 5000, "tokens_trained": 2.4571994 }, { "epoch": 1.418693709665981, "grad_norm": 2.209125518798828, "loss": 3.8578, "lr": 0.0003704895104895105, "step": 5002, "tokens_trained": 2.458180112 }, { "epoch": 1.419261045315935, "grad_norm": 3.4210712909698486, "loss": 3.8956, "lr": 0.0003702097902097902, "step": 5004, "tokens_trained": 2.459162136 }, { "epoch": 1.419828380965889, "grad_norm": 3.204285144805908, "loss": 3.8689, "lr": 0.00036993006993006996, "step": 5006, "tokens_trained": 2.460144776 }, { "epoch": 1.4203957166158427, "grad_norm": 2.6957204341888428, "loss": 3.9176, "lr": 0.00036965034965034965, "step": 5008, "tokens_trained": 2.461124056 }, { "epoch": 1.4209630522657968, "grad_norm": 3.2292940616607666, "loss": 3.8843, "lr": 0.0003693706293706294, "step": 5010, "tokens_trained": 2.462105048 }, { "epoch": 1.4215303879157506, "grad_norm": 3.2393546104431152, "loss": 3.9098, "lr": 0.0003690909090909091, "step": 5012, "tokens_trained": 2.46308476 }, { "epoch": 1.4220977235657046, "grad_norm": 4.3664774894714355, "loss": 3.8755, "lr": 0.00036881118881118883, "step": 5014, "tokens_trained": 2.464068376 }, { "epoch": 1.4226650592156584, "grad_norm": 3.5531437397003174, "loss": 3.9183, "lr": 0.0003685314685314685, "step": 5016, "tokens_trained": 2.465050144 }, { "epoch": 1.4232323948656123, "grad_norm": 2.292147636413574, "loss": 3.9113, "lr": 0.00036825174825174826, "step": 5018, "tokens_trained": 2.466032864 }, { "epoch": 1.4237997305155663, "grad_norm": 3.2202541828155518, "loss": 3.9005, "lr": 0.000367972027972028, "step": 5020, "tokens_trained": 2.467016672 }, { "epoch": 1.4243670661655201, "grad_norm": 2.978670835494995, "loss": 3.8717, "lr": 0.0003676923076923077, "step": 5022, "tokens_trained": 2.468001272 }, { "epoch": 1.4249344018154741, "grad_norm": 1.9841945171356201, "loss": 3.8642, "lr": 0.00036741258741258744, "step": 5024, "tokens_trained": 2.468983928 }, { "epoch": 1.425501737465428, "grad_norm": 3.14475417137146, "loss": 3.8952, "lr": 0.00036713286713286713, "step": 5026, "tokens_trained": 2.469965368 }, { "epoch": 1.4260690731153818, "grad_norm": 2.5225462913513184, "loss": 3.8684, "lr": 0.0003668531468531469, "step": 5028, "tokens_trained": 2.470944904 }, { "epoch": 1.4266364087653358, "grad_norm": 2.162013053894043, "loss": 3.8492, "lr": 0.00036657342657342657, "step": 5030, "tokens_trained": 2.471925792 }, { "epoch": 1.4272037444152896, "grad_norm": 3.798084020614624, "loss": 3.8492, "lr": 0.0003662937062937063, "step": 5032, "tokens_trained": 2.4729112 }, { "epoch": 1.4277710800652437, "grad_norm": 3.125767707824707, "loss": 3.8675, "lr": 0.000366013986013986, "step": 5034, "tokens_trained": 2.473893608 }, { "epoch": 1.4283384157151975, "grad_norm": 2.3426859378814697, "loss": 3.829, "lr": 0.0003657342657342657, "step": 5036, "tokens_trained": 2.474873112 }, { "epoch": 1.4289057513651513, "grad_norm": 3.2585058212280273, "loss": 3.8476, "lr": 0.0003654545454545455, "step": 5038, "tokens_trained": 2.475857392 }, { "epoch": 1.4294730870151053, "grad_norm": 2.814438581466675, "loss": 3.8467, "lr": 0.0003651748251748252, "step": 5040, "tokens_trained": 2.476838664 }, { "epoch": 1.4300404226650592, "grad_norm": 1.8864086866378784, "loss": 3.8652, "lr": 0.0003648951048951049, "step": 5042, "tokens_trained": 2.47782168 }, { "epoch": 1.4306077583150132, "grad_norm": 1.7076116800308228, "loss": 3.855, "lr": 0.0003646153846153846, "step": 5044, "tokens_trained": 2.478803744 }, { "epoch": 1.431175093964967, "grad_norm": 2.2379872798919678, "loss": 3.8486, "lr": 0.00036433566433566436, "step": 5046, "tokens_trained": 2.479785216 }, { "epoch": 1.4317424296149208, "grad_norm": 2.4551987648010254, "loss": 3.8613, "lr": 0.00036405594405594405, "step": 5048, "tokens_trained": 2.480762576 }, { "epoch": 1.4323097652648749, "grad_norm": 1.9165434837341309, "loss": 3.8691, "lr": 0.00036377622377622374, "step": 5050, "tokens_trained": 2.48174416 }, { "epoch": 1.4328771009148287, "grad_norm": 3.355273723602295, "loss": 3.9335, "lr": 0.0003634965034965035, "step": 5052, "tokens_trained": 2.482725264 }, { "epoch": 1.4334444365647827, "grad_norm": 2.419801712036133, "loss": 3.8996, "lr": 0.0003632167832167832, "step": 5054, "tokens_trained": 2.483708016 }, { "epoch": 1.4340117722147365, "grad_norm": 0.953630268573761, "loss": 3.8797, "lr": 0.000362937062937063, "step": 5056, "tokens_trained": 2.484690616 }, { "epoch": 1.4345791078646903, "grad_norm": 2.454457998275757, "loss": 3.9272, "lr": 0.00036265734265734267, "step": 5058, "tokens_trained": 2.485672144 }, { "epoch": 1.4351464435146444, "grad_norm": 2.6845757961273193, "loss": 3.8732, "lr": 0.0003623776223776224, "step": 5060, "tokens_trained": 2.486659072 }, { "epoch": 1.4357137791645982, "grad_norm": 1.8361189365386963, "loss": 3.8963, "lr": 0.0003620979020979021, "step": 5062, "tokens_trained": 2.487640824 }, { "epoch": 1.4362811148145522, "grad_norm": 2.003408432006836, "loss": 3.8925, "lr": 0.00036181818181818185, "step": 5064, "tokens_trained": 2.488623064 }, { "epoch": 1.436848450464506, "grad_norm": 2.320922374725342, "loss": 3.8839, "lr": 0.00036153846153846154, "step": 5066, "tokens_trained": 2.489605856 }, { "epoch": 1.4374157861144599, "grad_norm": 3.1108357906341553, "loss": 3.8711, "lr": 0.0003612587412587412, "step": 5068, "tokens_trained": 2.490593592 }, { "epoch": 1.437983121764414, "grad_norm": 4.1830267906188965, "loss": 3.8753, "lr": 0.00036097902097902097, "step": 5070, "tokens_trained": 2.491577552 }, { "epoch": 1.4385504574143677, "grad_norm": 4.149252414703369, "loss": 3.9214, "lr": 0.00036069930069930066, "step": 5072, "tokens_trained": 2.492563048 }, { "epoch": 1.4391177930643217, "grad_norm": 2.50063419342041, "loss": 3.9325, "lr": 0.00036041958041958046, "step": 5074, "tokens_trained": 2.493544432 }, { "epoch": 1.4396851287142756, "grad_norm": 3.926102638244629, "loss": 3.9229, "lr": 0.00036013986013986015, "step": 5076, "tokens_trained": 2.494525176 }, { "epoch": 1.4402524643642294, "grad_norm": 2.9965932369232178, "loss": 3.8654, "lr": 0.0003598601398601399, "step": 5078, "tokens_trained": 2.495506904 }, { "epoch": 1.4408198000141834, "grad_norm": 3.242460250854492, "loss": 3.8657, "lr": 0.0003595804195804196, "step": 5080, "tokens_trained": 2.496486016 }, { "epoch": 1.4413871356641372, "grad_norm": 4.620968341827393, "loss": 3.9037, "lr": 0.00035930069930069933, "step": 5082, "tokens_trained": 2.497472288 }, { "epoch": 1.4419544713140913, "grad_norm": 4.284809112548828, "loss": 3.8864, "lr": 0.000359020979020979, "step": 5084, "tokens_trained": 2.49845476 }, { "epoch": 1.442521806964045, "grad_norm": 3.115851640701294, "loss": 3.9297, "lr": 0.0003587412587412587, "step": 5086, "tokens_trained": 2.499438056 }, { "epoch": 1.443089142613999, "grad_norm": 3.842564105987549, "loss": 3.8401, "lr": 0.00035846153846153846, "step": 5088, "tokens_trained": 2.500420768 }, { "epoch": 1.443656478263953, "grad_norm": 3.615903615951538, "loss": 3.869, "lr": 0.00035818181818181815, "step": 5090, "tokens_trained": 2.50140196 }, { "epoch": 1.4442238139139068, "grad_norm": 3.166294574737549, "loss": 3.858, "lr": 0.00035790209790209794, "step": 5092, "tokens_trained": 2.502388264 }, { "epoch": 1.4447911495638608, "grad_norm": 2.21025013923645, "loss": 3.867, "lr": 0.00035762237762237763, "step": 5094, "tokens_trained": 2.503375728 }, { "epoch": 1.4453584852138146, "grad_norm": 3.1004698276519775, "loss": 3.8808, "lr": 0.0003573426573426574, "step": 5096, "tokens_trained": 2.504358936 }, { "epoch": 1.4459258208637684, "grad_norm": 1.524992823600769, "loss": 3.8603, "lr": 0.00035706293706293707, "step": 5098, "tokens_trained": 2.505342432 }, { "epoch": 1.4464931565137225, "grad_norm": 2.9289309978485107, "loss": 3.8623, "lr": 0.0003567832167832168, "step": 5100, "tokens_trained": 2.50632448 }, { "epoch": 1.4470604921636763, "grad_norm": 1.872747540473938, "loss": 3.873, "lr": 0.0003565034965034965, "step": 5102, "tokens_trained": 2.507303672 }, { "epoch": 1.4476278278136303, "grad_norm": 5.076520919799805, "loss": 3.8882, "lr": 0.0003562237762237762, "step": 5104, "tokens_trained": 2.508294896 }, { "epoch": 1.4481951634635841, "grad_norm": 3.738583564758301, "loss": 3.8517, "lr": 0.00035594405594405594, "step": 5106, "tokens_trained": 2.50927976 }, { "epoch": 1.448762499113538, "grad_norm": 4.042014122009277, "loss": 3.8544, "lr": 0.00035566433566433563, "step": 5108, "tokens_trained": 2.510263368 }, { "epoch": 1.449329834763492, "grad_norm": 4.474701881408691, "loss": 3.9099, "lr": 0.00035538461538461543, "step": 5110, "tokens_trained": 2.511249408 }, { "epoch": 1.4498971704134458, "grad_norm": 2.5567240715026855, "loss": 3.9099, "lr": 0.0003551048951048951, "step": 5112, "tokens_trained": 2.51222996 }, { "epoch": 1.4504645060633998, "grad_norm": 1.9672293663024902, "loss": 3.8462, "lr": 0.00035482517482517486, "step": 5114, "tokens_trained": 2.513214328 }, { "epoch": 1.4510318417133536, "grad_norm": 3.023873805999756, "loss": 3.8937, "lr": 0.00035454545454545455, "step": 5116, "tokens_trained": 2.51419652 }, { "epoch": 1.4515991773633075, "grad_norm": 2.4916296005249023, "loss": 3.8856, "lr": 0.0003542657342657343, "step": 5118, "tokens_trained": 2.515177864 }, { "epoch": 1.4521665130132615, "grad_norm": 2.6898279190063477, "loss": 3.8899, "lr": 0.000353986013986014, "step": 5120, "tokens_trained": 2.516162416 }, { "epoch": 1.4527338486632153, "grad_norm": 2.552603244781494, "loss": 3.9042, "lr": 0.0003537062937062937, "step": 5122, "tokens_trained": 2.517143872 }, { "epoch": 1.4533011843131693, "grad_norm": 2.877371072769165, "loss": 3.9297, "lr": 0.0003534265734265734, "step": 5124, "tokens_trained": 2.518128736 }, { "epoch": 1.4535848521381463, "eval_loss": 0.97332763671875, "eval_runtime": 20.9544, "step": 5125, "tokens_trained": 2.518622672 }, { "epoch": 1.4538685199631232, "grad_norm": 3.5342681407928467, "loss": 3.9252, "lr": 0.0003531468531468531, "step": 5126, "tokens_trained": 2.51911752 }, { "epoch": 1.454435855613077, "grad_norm": 3.628720998764038, "loss": 3.859, "lr": 0.0003528671328671329, "step": 5128, "tokens_trained": 2.520098472 }, { "epoch": 1.455003191263031, "grad_norm": 6.547962188720703, "loss": 3.8856, "lr": 0.0003525874125874126, "step": 5130, "tokens_trained": 2.521079136 }, { "epoch": 1.4555705269129848, "grad_norm": 2.413188934326172, "loss": 3.8697, "lr": 0.00035230769230769235, "step": 5132, "tokens_trained": 2.522058328 }, { "epoch": 1.4561378625629389, "grad_norm": 3.4512171745300293, "loss": 3.8708, "lr": 0.00035202797202797204, "step": 5134, "tokens_trained": 2.523038472 }, { "epoch": 1.4567051982128927, "grad_norm": 3.700793504714966, "loss": 3.8337, "lr": 0.0003517482517482518, "step": 5136, "tokens_trained": 2.524023792 }, { "epoch": 1.4572725338628465, "grad_norm": 3.7885332107543945, "loss": 3.9458, "lr": 0.0003514685314685315, "step": 5138, "tokens_trained": 2.525009728 }, { "epoch": 1.4578398695128005, "grad_norm": 2.7266547679901123, "loss": 3.9023, "lr": 0.00035118881118881116, "step": 5140, "tokens_trained": 2.525989656 }, { "epoch": 1.4584072051627543, "grad_norm": 3.19142746925354, "loss": 3.8541, "lr": 0.0003509090909090909, "step": 5142, "tokens_trained": 2.526971216 }, { "epoch": 1.4589745408127084, "grad_norm": 4.478598117828369, "loss": 3.8717, "lr": 0.0003506293706293706, "step": 5144, "tokens_trained": 2.527954072 }, { "epoch": 1.4595418764626622, "grad_norm": 2.4593617916107178, "loss": 3.8894, "lr": 0.0003503496503496504, "step": 5146, "tokens_trained": 2.528939184 }, { "epoch": 1.460109212112616, "grad_norm": 3.3200669288635254, "loss": 3.8385, "lr": 0.0003500699300699301, "step": 5148, "tokens_trained": 2.529926776 }, { "epoch": 1.46067654776257, "grad_norm": 4.14384651184082, "loss": 3.8771, "lr": 0.00034979020979020983, "step": 5150, "tokens_trained": 2.530912672 }, { "epoch": 1.4612438834125239, "grad_norm": 4.013224124908447, "loss": 3.8845, "lr": 0.0003495104895104895, "step": 5152, "tokens_trained": 2.531895672 }, { "epoch": 1.461811219062478, "grad_norm": 2.421576976776123, "loss": 3.8511, "lr": 0.00034923076923076927, "step": 5154, "tokens_trained": 2.53288024 }, { "epoch": 1.4623785547124317, "grad_norm": 2.5835623741149902, "loss": 3.8596, "lr": 0.00034895104895104896, "step": 5156, "tokens_trained": 2.533866024 }, { "epoch": 1.4629458903623855, "grad_norm": 4.254941940307617, "loss": 3.8578, "lr": 0.00034867132867132865, "step": 5158, "tokens_trained": 2.534849704 }, { "epoch": 1.4635132260123396, "grad_norm": 5.818271160125732, "loss": 3.8577, "lr": 0.0003483916083916084, "step": 5160, "tokens_trained": 2.535833776 }, { "epoch": 1.4640805616622934, "grad_norm": 0.8015483021736145, "loss": 3.8521, "lr": 0.0003481118881118881, "step": 5162, "tokens_trained": 2.536811552 }, { "epoch": 1.4646478973122474, "grad_norm": 2.905026435852051, "loss": 3.9294, "lr": 0.0003478321678321678, "step": 5164, "tokens_trained": 2.537794976 }, { "epoch": 1.4652152329622012, "grad_norm": 4.075428009033203, "loss": 3.8707, "lr": 0.00034755244755244757, "step": 5166, "tokens_trained": 2.538778856 }, { "epoch": 1.465782568612155, "grad_norm": 3.1994779109954834, "loss": 3.8997, "lr": 0.0003472727272727273, "step": 5168, "tokens_trained": 2.539766424 }, { "epoch": 1.466349904262109, "grad_norm": 3.8348865509033203, "loss": 3.8407, "lr": 0.000346993006993007, "step": 5170, "tokens_trained": 2.54074992 }, { "epoch": 1.466917239912063, "grad_norm": 3.057394504547119, "loss": 3.8996, "lr": 0.00034671328671328675, "step": 5172, "tokens_trained": 2.54173296 }, { "epoch": 1.467484575562017, "grad_norm": 2.629530668258667, "loss": 3.8695, "lr": 0.00034643356643356644, "step": 5174, "tokens_trained": 2.542712656 }, { "epoch": 1.4680519112119708, "grad_norm": 6.596874237060547, "loss": 3.8993, "lr": 0.00034615384615384613, "step": 5176, "tokens_trained": 2.543697248 }, { "epoch": 1.4686192468619246, "grad_norm": 6.877425670623779, "loss": 3.8664, "lr": 0.0003458741258741259, "step": 5178, "tokens_trained": 2.544676296 }, { "epoch": 1.4691865825118786, "grad_norm": 4.752718448638916, "loss": 3.8747, "lr": 0.00034559440559440557, "step": 5180, "tokens_trained": 2.54566048 }, { "epoch": 1.4697539181618324, "grad_norm": 6.17790412902832, "loss": 3.8618, "lr": 0.0003453146853146853, "step": 5182, "tokens_trained": 2.5466462 }, { "epoch": 1.4703212538117865, "grad_norm": 4.023257732391357, "loss": 3.9337, "lr": 0.00034503496503496506, "step": 5184, "tokens_trained": 2.547626248 }, { "epoch": 1.4708885894617403, "grad_norm": 5.393856048583984, "loss": 3.9071, "lr": 0.0003447552447552448, "step": 5186, "tokens_trained": 2.548609344 }, { "epoch": 1.471455925111694, "grad_norm": 3.888399124145508, "loss": 3.8781, "lr": 0.0003444755244755245, "step": 5188, "tokens_trained": 2.549590128 }, { "epoch": 1.4720232607616481, "grad_norm": 2.120105504989624, "loss": 3.8423, "lr": 0.0003441958041958042, "step": 5190, "tokens_trained": 2.550575896 }, { "epoch": 1.472590596411602, "grad_norm": 2.569045305252075, "loss": 3.9112, "lr": 0.0003439160839160839, "step": 5192, "tokens_trained": 2.551560416 }, { "epoch": 1.473157932061556, "grad_norm": 3.4651668071746826, "loss": 3.9087, "lr": 0.0003436363636363636, "step": 5194, "tokens_trained": 2.552542536 }, { "epoch": 1.4737252677115098, "grad_norm": 2.7434427738189697, "loss": 3.8748, "lr": 0.00034335664335664336, "step": 5196, "tokens_trained": 2.553529144 }, { "epoch": 1.4742926033614636, "grad_norm": 2.8238751888275146, "loss": 3.883, "lr": 0.00034307692307692305, "step": 5198, "tokens_trained": 2.554512624 }, { "epoch": 1.4748599390114177, "grad_norm": 2.6443698406219482, "loss": 3.8933, "lr": 0.0003427972027972028, "step": 5200, "tokens_trained": 2.555493584 }, { "epoch": 1.4754272746613715, "grad_norm": 3.0539839267730713, "loss": 3.8446, "lr": 0.00034251748251748254, "step": 5202, "tokens_trained": 2.556470992 }, { "epoch": 1.4759946103113255, "grad_norm": 2.7458343505859375, "loss": 3.8937, "lr": 0.0003422377622377623, "step": 5204, "tokens_trained": 2.557456112 }, { "epoch": 1.4765619459612793, "grad_norm": 2.1506590843200684, "loss": 3.8197, "lr": 0.000341958041958042, "step": 5206, "tokens_trained": 2.558440904 }, { "epoch": 1.4771292816112331, "grad_norm": 3.0944714546203613, "loss": 3.8825, "lr": 0.00034167832167832167, "step": 5208, "tokens_trained": 2.559422448 }, { "epoch": 1.4776966172611872, "grad_norm": 4.058701038360596, "loss": 3.8541, "lr": 0.0003413986013986014, "step": 5210, "tokens_trained": 2.560408408 }, { "epoch": 1.478263952911141, "grad_norm": 3.390343189239502, "loss": 3.8573, "lr": 0.0003411188811188811, "step": 5212, "tokens_trained": 2.561387848 }, { "epoch": 1.478831288561095, "grad_norm": 1.3469499349594116, "loss": 3.8657, "lr": 0.00034083916083916084, "step": 5214, "tokens_trained": 2.562369448 }, { "epoch": 1.4793986242110488, "grad_norm": 2.6407840251922607, "loss": 3.8439, "lr": 0.00034055944055944054, "step": 5216, "tokens_trained": 2.563352728 }, { "epoch": 1.4799659598610027, "grad_norm": 4.6244378089904785, "loss": 3.8524, "lr": 0.0003402797202797203, "step": 5218, "tokens_trained": 2.56433732 }, { "epoch": 1.4805332955109567, "grad_norm": 3.53739333152771, "loss": 3.8383, "lr": 0.00034, "step": 5220, "tokens_trained": 2.565318712 }, { "epoch": 1.4811006311609105, "grad_norm": 1.2742515802383423, "loss": 3.8461, "lr": 0.00033972027972027977, "step": 5222, "tokens_trained": 2.566305728 }, { "epoch": 1.4816679668108645, "grad_norm": 2.308912754058838, "loss": 3.9021, "lr": 0.00033944055944055946, "step": 5224, "tokens_trained": 2.567289704 }, { "epoch": 1.4822353024608184, "grad_norm": 4.492687225341797, "loss": 3.942, "lr": 0.00033916083916083915, "step": 5226, "tokens_trained": 2.568271784 }, { "epoch": 1.4828026381107722, "grad_norm": 3.4822142124176025, "loss": 3.8815, "lr": 0.0003388811188811189, "step": 5228, "tokens_trained": 2.569255992 }, { "epoch": 1.4833699737607262, "grad_norm": 2.584545612335205, "loss": 3.8663, "lr": 0.0003386013986013986, "step": 5230, "tokens_trained": 2.570237432 }, { "epoch": 1.48393730941068, "grad_norm": 1.3823322057724, "loss": 3.8608, "lr": 0.00033832167832167833, "step": 5232, "tokens_trained": 2.571214488 }, { "epoch": 1.484504645060634, "grad_norm": 0.8751009702682495, "loss": 3.8788, "lr": 0.000338041958041958, "step": 5234, "tokens_trained": 2.572196464 }, { "epoch": 1.4850719807105879, "grad_norm": 0.723051905632019, "loss": 3.8176, "lr": 0.00033776223776223776, "step": 5236, "tokens_trained": 2.5731788 }, { "epoch": 1.4856393163605417, "grad_norm": 1.073199987411499, "loss": 3.8142, "lr": 0.0003374825174825175, "step": 5238, "tokens_trained": 2.5741574 }, { "epoch": 1.4862066520104957, "grad_norm": 1.4350844621658325, "loss": 3.8865, "lr": 0.00033720279720279725, "step": 5240, "tokens_trained": 2.57514396 }, { "epoch": 1.4867739876604495, "grad_norm": 2.656418561935425, "loss": 3.8975, "lr": 0.00033692307692307694, "step": 5242, "tokens_trained": 2.576123752 }, { "epoch": 1.4873413233104036, "grad_norm": 2.0948193073272705, "loss": 3.8543, "lr": 0.00033664335664335663, "step": 5244, "tokens_trained": 2.577109968 }, { "epoch": 1.4879086589603574, "grad_norm": 2.3233394622802734, "loss": 3.9017, "lr": 0.0003363636363636364, "step": 5246, "tokens_trained": 2.578092128 }, { "epoch": 1.4884759946103112, "grad_norm": 2.3845908641815186, "loss": 3.8993, "lr": 0.00033608391608391607, "step": 5248, "tokens_trained": 2.579077664 }, { "epoch": 1.4890433302602653, "grad_norm": 1.0513813495635986, "loss": 3.8597, "lr": 0.0003358041958041958, "step": 5250, "tokens_trained": 2.580059224 }, { "epoch": 1.4890433302602653, "eval_loss": 0.9717268943786621, "eval_runtime": 20.2853, "step": 5250, "tokens_trained": 2.580059224 }, { "epoch": 1.489610665910219, "grad_norm": 1.56052827835083, "loss": 3.8768, "lr": 0.0003355244755244755, "step": 5252, "tokens_trained": 2.581038592 }, { "epoch": 1.490178001560173, "grad_norm": 3.654672384262085, "loss": 3.8987, "lr": 0.00033524475524475525, "step": 5254, "tokens_trained": 2.58201948 }, { "epoch": 1.490745337210127, "grad_norm": 2.942765474319458, "loss": 3.9019, "lr": 0.000334965034965035, "step": 5256, "tokens_trained": 2.582999032 }, { "epoch": 1.4913126728600807, "grad_norm": 2.78128719329834, "loss": 3.858, "lr": 0.00033468531468531474, "step": 5258, "tokens_trained": 2.583980504 }, { "epoch": 1.4918800085100348, "grad_norm": 2.8371148109436035, "loss": 3.9144, "lr": 0.00033440559440559443, "step": 5260, "tokens_trained": 2.584964024 }, { "epoch": 1.4924473441599886, "grad_norm": 3.362802743911743, "loss": 3.9014, "lr": 0.0003341258741258741, "step": 5262, "tokens_trained": 2.585946728 }, { "epoch": 1.4930146798099426, "grad_norm": 2.9014971256256104, "loss": 3.88, "lr": 0.00033384615384615386, "step": 5264, "tokens_trained": 2.586928936 }, { "epoch": 1.4935820154598964, "grad_norm": 4.144679546356201, "loss": 3.8206, "lr": 0.00033356643356643355, "step": 5266, "tokens_trained": 2.587912456 }, { "epoch": 1.4941493511098503, "grad_norm": 2.4919822216033936, "loss": 3.8968, "lr": 0.0003332867132867133, "step": 5268, "tokens_trained": 2.58889736 }, { "epoch": 1.4947166867598043, "grad_norm": 4.618200778961182, "loss": 3.8869, "lr": 0.000333006993006993, "step": 5270, "tokens_trained": 2.58988292 }, { "epoch": 1.495284022409758, "grad_norm": 2.53562068939209, "loss": 3.8692, "lr": 0.00033272727272727273, "step": 5272, "tokens_trained": 2.590868032 }, { "epoch": 1.4958513580597121, "grad_norm": 2.9674575328826904, "loss": 3.8982, "lr": 0.0003324475524475525, "step": 5274, "tokens_trained": 2.591849336 }, { "epoch": 1.496418693709666, "grad_norm": 3.1153666973114014, "loss": 3.8812, "lr": 0.0003321678321678322, "step": 5276, "tokens_trained": 2.592831392 }, { "epoch": 1.4969860293596198, "grad_norm": 1.1431063413619995, "loss": 3.8505, "lr": 0.0003318881118881119, "step": 5278, "tokens_trained": 2.593812872 }, { "epoch": 1.4975533650095738, "grad_norm": 1.3542804718017578, "loss": 3.7819, "lr": 0.0003316083916083916, "step": 5280, "tokens_trained": 2.594791464 }, { "epoch": 1.4981207006595276, "grad_norm": 2.3613617420196533, "loss": 3.8637, "lr": 0.00033132867132867135, "step": 5282, "tokens_trained": 2.595773248 }, { "epoch": 1.4986880363094817, "grad_norm": 3.757537364959717, "loss": 3.8808, "lr": 0.00033104895104895104, "step": 5284, "tokens_trained": 2.596756584 }, { "epoch": 1.4992553719594355, "grad_norm": 2.953866481781006, "loss": 3.9131, "lr": 0.0003307692307692308, "step": 5286, "tokens_trained": 2.59773796 }, { "epoch": 1.4998227076093893, "grad_norm": 2.655200958251953, "loss": 3.8643, "lr": 0.00033048951048951047, "step": 5288, "tokens_trained": 2.598719536 }, { "epoch": 1.5003900432593433, "grad_norm": 2.889563798904419, "loss": 3.9746, "lr": 0.0003302097902097902, "step": 5290, "tokens_trained": 2.599697288 }, { "epoch": 1.5009573789092974, "grad_norm": 2.8182990550994873, "loss": 3.8618, "lr": 0.0003299300699300699, "step": 5292, "tokens_trained": 2.600680032 }, { "epoch": 1.5015247145592512, "grad_norm": 3.309007406234741, "loss": 3.9308, "lr": 0.0003296503496503497, "step": 5294, "tokens_trained": 2.601664832 }, { "epoch": 1.502092050209205, "grad_norm": 2.542564868927002, "loss": 3.8754, "lr": 0.0003293706293706294, "step": 5296, "tokens_trained": 2.602645048 }, { "epoch": 1.5026593858591588, "grad_norm": 1.6452852487564087, "loss": 3.8644, "lr": 0.0003290909090909091, "step": 5298, "tokens_trained": 2.603625752 }, { "epoch": 1.5032267215091129, "grad_norm": 2.0452191829681396, "loss": 3.8651, "lr": 0.00032881118881118883, "step": 5300, "tokens_trained": 2.604608136 }, { "epoch": 1.5037940571590669, "grad_norm": 3.8787152767181396, "loss": 3.897, "lr": 0.0003285314685314685, "step": 5302, "tokens_trained": 2.605591744 }, { "epoch": 1.5043613928090207, "grad_norm": 2.729074478149414, "loss": 3.8237, "lr": 0.00032825174825174827, "step": 5304, "tokens_trained": 2.606578392 }, { "epoch": 1.5049287284589745, "grad_norm": 4.348790168762207, "loss": 3.8972, "lr": 0.00032797202797202796, "step": 5306, "tokens_trained": 2.607560696 }, { "epoch": 1.5054960641089283, "grad_norm": 3.7172658443450928, "loss": 3.9184, "lr": 0.0003276923076923077, "step": 5308, "tokens_trained": 2.608540552 }, { "epoch": 1.5060633997588824, "grad_norm": 0.9424030780792236, "loss": 3.9053, "lr": 0.0003274125874125874, "step": 5310, "tokens_trained": 2.609521736 }, { "epoch": 1.5066307354088364, "grad_norm": 1.4858821630477905, "loss": 3.9223, "lr": 0.00032713286713286714, "step": 5312, "tokens_trained": 2.610500288 }, { "epoch": 1.5071980710587902, "grad_norm": 1.154492974281311, "loss": 3.8776, "lr": 0.0003268531468531469, "step": 5314, "tokens_trained": 2.611484024 }, { "epoch": 1.507765406708744, "grad_norm": 2.853030204772949, "loss": 3.839, "lr": 0.00032657342657342657, "step": 5316, "tokens_trained": 2.61246684 }, { "epoch": 1.5083327423586979, "grad_norm": 5.903510570526123, "loss": 3.9016, "lr": 0.0003262937062937063, "step": 5318, "tokens_trained": 2.61344732 }, { "epoch": 1.508900078008652, "grad_norm": 4.1008453369140625, "loss": 3.8961, "lr": 0.000326013986013986, "step": 5320, "tokens_trained": 2.614426112 }, { "epoch": 1.509467413658606, "grad_norm": 1.814429759979248, "loss": 3.8708, "lr": 0.00032573426573426575, "step": 5322, "tokens_trained": 2.615404984 }, { "epoch": 1.5100347493085597, "grad_norm": 7.854028224945068, "loss": 3.8936, "lr": 0.00032545454545454544, "step": 5324, "tokens_trained": 2.61638448 }, { "epoch": 1.5106020849585136, "grad_norm": 8.18005084991455, "loss": 3.8511, "lr": 0.0003251748251748252, "step": 5326, "tokens_trained": 2.617367336 }, { "epoch": 1.5111694206084674, "grad_norm": 6.5862135887146, "loss": 3.8394, "lr": 0.0003248951048951049, "step": 5328, "tokens_trained": 2.618350632 }, { "epoch": 1.5117367562584214, "grad_norm": 5.746713638305664, "loss": 3.9074, "lr": 0.0003246153846153846, "step": 5330, "tokens_trained": 2.619333 }, { "epoch": 1.5123040919083754, "grad_norm": 3.554576873779297, "loss": 3.8755, "lr": 0.00032433566433566436, "step": 5332, "tokens_trained": 2.62031072 }, { "epoch": 1.5128714275583293, "grad_norm": 3.7476911544799805, "loss": 3.8923, "lr": 0.00032405594405594406, "step": 5334, "tokens_trained": 2.621290952 }, { "epoch": 1.513438763208283, "grad_norm": 3.5004961490631104, "loss": 3.8579, "lr": 0.0003237762237762238, "step": 5336, "tokens_trained": 2.622267496 }, { "epoch": 1.514006098858237, "grad_norm": 2.527608633041382, "loss": 3.8617, "lr": 0.0003234965034965035, "step": 5338, "tokens_trained": 2.623250648 }, { "epoch": 1.514573434508191, "grad_norm": 1.698697805404663, "loss": 3.8735, "lr": 0.00032321678321678323, "step": 5340, "tokens_trained": 2.624232432 }, { "epoch": 1.515140770158145, "grad_norm": 1.567301630973816, "loss": 3.8696, "lr": 0.0003229370629370629, "step": 5342, "tokens_trained": 2.625214496 }, { "epoch": 1.5157081058080988, "grad_norm": 1.1091945171356201, "loss": 3.9026, "lr": 0.00032265734265734267, "step": 5344, "tokens_trained": 2.626195168 }, { "epoch": 1.5162754414580526, "grad_norm": 2.308842420578003, "loss": 3.9068, "lr": 0.00032237762237762236, "step": 5346, "tokens_trained": 2.627178136 }, { "epoch": 1.5168427771080064, "grad_norm": 1.895664930343628, "loss": 3.8927, "lr": 0.0003220979020979021, "step": 5348, "tokens_trained": 2.628160768 }, { "epoch": 1.5174101127579604, "grad_norm": 3.357377529144287, "loss": 3.9102, "lr": 0.00032181818181818185, "step": 5350, "tokens_trained": 2.629144816 }, { "epoch": 1.5179774484079145, "grad_norm": 3.45583176612854, "loss": 3.8342, "lr": 0.00032153846153846154, "step": 5352, "tokens_trained": 2.630123376 }, { "epoch": 1.5185447840578683, "grad_norm": 2.129251718521118, "loss": 3.8917, "lr": 0.0003212587412587413, "step": 5354, "tokens_trained": 2.631108104 }, { "epoch": 1.5191121197078221, "grad_norm": 3.7762246131896973, "loss": 3.9044, "lr": 0.000320979020979021, "step": 5356, "tokens_trained": 2.632086912 }, { "epoch": 1.519679455357776, "grad_norm": 3.4620509147644043, "loss": 3.8829, "lr": 0.0003206993006993007, "step": 5358, "tokens_trained": 2.633066048 }, { "epoch": 1.52024679100773, "grad_norm": 2.9604990482330322, "loss": 3.9058, "lr": 0.0003204195804195804, "step": 5360, "tokens_trained": 2.634048104 }, { "epoch": 1.520814126657684, "grad_norm": 2.3409082889556885, "loss": 3.8871, "lr": 0.00032013986013986015, "step": 5362, "tokens_trained": 2.635033888 }, { "epoch": 1.5213814623076378, "grad_norm": 2.3598854541778564, "loss": 3.8278, "lr": 0.00031986013986013984, "step": 5364, "tokens_trained": 2.636016304 }, { "epoch": 1.5219487979575916, "grad_norm": 2.3019731044769287, "loss": 3.8662, "lr": 0.0003195804195804196, "step": 5366, "tokens_trained": 2.637003712 }, { "epoch": 1.5225161336075455, "grad_norm": 1.2325515747070312, "loss": 3.871, "lr": 0.00031930069930069933, "step": 5368, "tokens_trained": 2.637982504 }, { "epoch": 1.5230834692574995, "grad_norm": 0.7675896883010864, "loss": 3.8765, "lr": 0.000319020979020979, "step": 5370, "tokens_trained": 2.638965288 }, { "epoch": 1.5236508049074535, "grad_norm": 1.4549137353897095, "loss": 3.8722, "lr": 0.00031874125874125877, "step": 5372, "tokens_trained": 2.63994804 }, { "epoch": 1.5242181405574073, "grad_norm": 3.515141248703003, "loss": 3.8776, "lr": 0.00031846153846153846, "step": 5374, "tokens_trained": 2.640932128 }, { "epoch": 1.5245018083823842, "eval_loss": 0.9721737504005432, "eval_runtime": 20.8279, "step": 5375, "tokens_trained": 2.641423648 }, { "epoch": 1.5247854762073612, "grad_norm": 6.560733318328857, "loss": 3.9073, "lr": 0.0003181818181818182, "step": 5376, "tokens_trained": 2.641915928 }, { "epoch": 1.525352811857315, "grad_norm": 1.6997367143630981, "loss": 3.8291, "lr": 0.0003179020979020979, "step": 5378, "tokens_trained": 2.642900016 }, { "epoch": 1.525920147507269, "grad_norm": 9.629950523376465, "loss": 3.8959, "lr": 0.00031762237762237764, "step": 5380, "tokens_trained": 2.64388104 }, { "epoch": 1.526487483157223, "grad_norm": 3.9039199352264404, "loss": 3.8823, "lr": 0.00031734265734265733, "step": 5382, "tokens_trained": 2.644863936 }, { "epoch": 1.5270548188071769, "grad_norm": 3.925534963607788, "loss": 3.8869, "lr": 0.0003170629370629371, "step": 5384, "tokens_trained": 2.645846304 }, { "epoch": 1.5276221544571307, "grad_norm": 3.528144121170044, "loss": 3.8808, "lr": 0.0003167832167832168, "step": 5386, "tokens_trained": 2.646833528 }, { "epoch": 1.5281894901070845, "grad_norm": 4.170341968536377, "loss": 3.9114, "lr": 0.0003165034965034965, "step": 5388, "tokens_trained": 2.647815024 }, { "epoch": 1.5287568257570385, "grad_norm": 1.840369462966919, "loss": 3.8531, "lr": 0.00031622377622377625, "step": 5390, "tokens_trained": 2.64879892 }, { "epoch": 1.5293241614069926, "grad_norm": 3.2327773571014404, "loss": 3.9133, "lr": 0.00031594405594405594, "step": 5392, "tokens_trained": 2.649780992 }, { "epoch": 1.5298914970569464, "grad_norm": 4.462336540222168, "loss": 3.8588, "lr": 0.0003156643356643357, "step": 5394, "tokens_trained": 2.650768296 }, { "epoch": 1.5304588327069002, "grad_norm": 4.678606033325195, "loss": 3.9386, "lr": 0.0003153846153846154, "step": 5396, "tokens_trained": 2.651752624 }, { "epoch": 1.531026168356854, "grad_norm": 1.7649297714233398, "loss": 3.8813, "lr": 0.00031510489510489507, "step": 5398, "tokens_trained": 2.652734752 }, { "epoch": 1.531593504006808, "grad_norm": 5.314251899719238, "loss": 3.857, "lr": 0.0003148251748251748, "step": 5400, "tokens_trained": 2.653716744 }, { "epoch": 1.532160839656762, "grad_norm": 3.9521164894104004, "loss": 3.8795, "lr": 0.00031454545454545456, "step": 5402, "tokens_trained": 2.654698216 }, { "epoch": 1.532728175306716, "grad_norm": 2.3679206371307373, "loss": 3.8809, "lr": 0.0003142657342657343, "step": 5404, "tokens_trained": 2.655684352 }, { "epoch": 1.5332955109566697, "grad_norm": 5.0761871337890625, "loss": 3.8539, "lr": 0.000313986013986014, "step": 5406, "tokens_trained": 2.656668872 }, { "epoch": 1.5338628466066235, "grad_norm": 3.036986827850342, "loss": 3.8724, "lr": 0.00031370629370629374, "step": 5408, "tokens_trained": 2.657649648 }, { "epoch": 1.5344301822565776, "grad_norm": 1.9492992162704468, "loss": 3.8559, "lr": 0.00031342657342657343, "step": 5410, "tokens_trained": 2.65863172 }, { "epoch": 1.5349975179065316, "grad_norm": 5.674772262573242, "loss": 3.9009, "lr": 0.00031314685314685317, "step": 5412, "tokens_trained": 2.659614912 }, { "epoch": 1.5355648535564854, "grad_norm": 3.045802116394043, "loss": 3.8898, "lr": 0.00031286713286713286, "step": 5414, "tokens_trained": 2.660596088 }, { "epoch": 1.5361321892064392, "grad_norm": 2.8371381759643555, "loss": 3.8409, "lr": 0.00031258741258741255, "step": 5416, "tokens_trained": 2.661580752 }, { "epoch": 1.536699524856393, "grad_norm": 3.7679245471954346, "loss": 3.8803, "lr": 0.0003123076923076923, "step": 5418, "tokens_trained": 2.662563832 }, { "epoch": 1.537266860506347, "grad_norm": 3.2771692276000977, "loss": 3.8699, "lr": 0.000312027972027972, "step": 5420, "tokens_trained": 2.663544624 }, { "epoch": 1.5378341961563011, "grad_norm": 2.7474050521850586, "loss": 3.899, "lr": 0.0003117482517482518, "step": 5422, "tokens_trained": 2.664520368 }, { "epoch": 1.538401531806255, "grad_norm": 3.284118890762329, "loss": 3.8437, "lr": 0.0003114685314685315, "step": 5424, "tokens_trained": 2.665499312 }, { "epoch": 1.5389688674562088, "grad_norm": 2.7903459072113037, "loss": 3.8609, "lr": 0.0003111888111888112, "step": 5426, "tokens_trained": 2.666482416 }, { "epoch": 1.5395362031061626, "grad_norm": 3.876206398010254, "loss": 3.876, "lr": 0.0003109090909090909, "step": 5428, "tokens_trained": 2.667467008 }, { "epoch": 1.5401035387561166, "grad_norm": 1.5711065530776978, "loss": 3.9029, "lr": 0.00031062937062937066, "step": 5430, "tokens_trained": 2.668449184 }, { "epoch": 1.5406708744060706, "grad_norm": 1.2520103454589844, "loss": 3.8487, "lr": 0.00031034965034965035, "step": 5432, "tokens_trained": 2.669431912 }, { "epoch": 1.5412382100560245, "grad_norm": 0.9419916272163391, "loss": 3.8721, "lr": 0.00031006993006993004, "step": 5434, "tokens_trained": 2.670416424 }, { "epoch": 1.5418055457059783, "grad_norm": 1.9234577417373657, "loss": 3.8729, "lr": 0.0003097902097902098, "step": 5436, "tokens_trained": 2.671400888 }, { "epoch": 1.542372881355932, "grad_norm": 3.8806726932525635, "loss": 3.8551, "lr": 0.00030951048951048947, "step": 5438, "tokens_trained": 2.672375848 }, { "epoch": 1.5429402170058861, "grad_norm": 3.5235371589660645, "loss": 3.8662, "lr": 0.00030923076923076927, "step": 5440, "tokens_trained": 2.673355968 }, { "epoch": 1.5435075526558402, "grad_norm": 2.4708411693573, "loss": 3.844, "lr": 0.00030895104895104896, "step": 5442, "tokens_trained": 2.674337336 }, { "epoch": 1.544074888305794, "grad_norm": 2.014948606491089, "loss": 3.8766, "lr": 0.0003086713286713287, "step": 5444, "tokens_trained": 2.67532064 }, { "epoch": 1.5446422239557478, "grad_norm": 2.5892093181610107, "loss": 3.858, "lr": 0.0003083916083916084, "step": 5446, "tokens_trained": 2.676301352 }, { "epoch": 1.5452095596057016, "grad_norm": 3.082036018371582, "loss": 3.8663, "lr": 0.00030811188811188814, "step": 5448, "tokens_trained": 2.67728328 }, { "epoch": 1.5457768952556556, "grad_norm": 3.072131395339966, "loss": 3.8562, "lr": 0.00030783216783216783, "step": 5450, "tokens_trained": 2.678268 }, { "epoch": 1.5463442309056097, "grad_norm": 2.331498384475708, "loss": 3.874, "lr": 0.0003075524475524475, "step": 5452, "tokens_trained": 2.6792526 }, { "epoch": 1.5469115665555635, "grad_norm": 4.706553936004639, "loss": 3.889, "lr": 0.00030727272727272727, "step": 5454, "tokens_trained": 2.680234128 }, { "epoch": 1.5474789022055173, "grad_norm": 4.815377712249756, "loss": 3.8797, "lr": 0.00030699300699300696, "step": 5456, "tokens_trained": 2.68121644 }, { "epoch": 1.5480462378554711, "grad_norm": 4.225409507751465, "loss": 3.8561, "lr": 0.00030671328671328675, "step": 5458, "tokens_trained": 2.682198312 }, { "epoch": 1.5486135735054252, "grad_norm": 2.394444227218628, "loss": 3.9328, "lr": 0.00030643356643356645, "step": 5460, "tokens_trained": 2.683179776 }, { "epoch": 1.5491809091553792, "grad_norm": 3.93528151512146, "loss": 3.8418, "lr": 0.0003061538461538462, "step": 5462, "tokens_trained": 2.684163624 }, { "epoch": 1.549748244805333, "grad_norm": 3.366722822189331, "loss": 3.8553, "lr": 0.0003058741258741259, "step": 5464, "tokens_trained": 2.68514788 }, { "epoch": 1.5503155804552868, "grad_norm": 2.567106246948242, "loss": 3.8859, "lr": 0.0003055944055944056, "step": 5466, "tokens_trained": 2.686129328 }, { "epoch": 1.5508829161052406, "grad_norm": 2.0634472370147705, "loss": 3.8997, "lr": 0.0003053146853146853, "step": 5468, "tokens_trained": 2.687110848 }, { "epoch": 1.5514502517551947, "grad_norm": 0.823783814907074, "loss": 3.905, "lr": 0.000305034965034965, "step": 5470, "tokens_trained": 2.688097216 }, { "epoch": 1.5520175874051487, "grad_norm": 1.0160223245620728, "loss": 3.8902, "lr": 0.00030475524475524475, "step": 5472, "tokens_trained": 2.689077344 }, { "epoch": 1.5525849230551025, "grad_norm": 1.5037281513214111, "loss": 3.823, "lr": 0.00030447552447552444, "step": 5474, "tokens_trained": 2.690056832 }, { "epoch": 1.5531522587050564, "grad_norm": 0.46490955352783203, "loss": 3.8819, "lr": 0.00030419580419580424, "step": 5476, "tokens_trained": 2.691035328 }, { "epoch": 1.5537195943550102, "grad_norm": 1.715409278869629, "loss": 3.8291, "lr": 0.00030391608391608393, "step": 5478, "tokens_trained": 2.6920166 }, { "epoch": 1.5542869300049642, "grad_norm": 2.430316925048828, "loss": 3.8457, "lr": 0.0003036363636363637, "step": 5480, "tokens_trained": 2.6930046 }, { "epoch": 1.5548542656549182, "grad_norm": 3.483908176422119, "loss": 3.8864, "lr": 0.00030335664335664336, "step": 5482, "tokens_trained": 2.693984624 }, { "epoch": 1.555421601304872, "grad_norm": 1.167831301689148, "loss": 3.8714, "lr": 0.0003030769230769231, "step": 5484, "tokens_trained": 2.694967672 }, { "epoch": 1.5559889369548259, "grad_norm": 1.5959419012069702, "loss": 3.8725, "lr": 0.0003027972027972028, "step": 5486, "tokens_trained": 2.69595152 }, { "epoch": 1.5565562726047797, "grad_norm": 2.6633737087249756, "loss": 3.8369, "lr": 0.0003025174825174825, "step": 5488, "tokens_trained": 2.696935328 }, { "epoch": 1.5571236082547337, "grad_norm": 4.084526062011719, "loss": 3.8687, "lr": 0.00030223776223776223, "step": 5490, "tokens_trained": 2.697917976 }, { "epoch": 1.5576909439046878, "grad_norm": 2.062319040298462, "loss": 3.8875, "lr": 0.0003019580419580419, "step": 5492, "tokens_trained": 2.698902704 }, { "epoch": 1.5582582795546416, "grad_norm": 1.9942662715911865, "loss": 3.8643, "lr": 0.0003016783216783217, "step": 5494, "tokens_trained": 2.699887184 }, { "epoch": 1.5588256152045954, "grad_norm": 3.3120384216308594, "loss": 3.8805, "lr": 0.0003013986013986014, "step": 5496, "tokens_trained": 2.700869192 }, { "epoch": 1.5593929508545492, "grad_norm": 4.658695220947266, "loss": 3.8846, "lr": 0.00030111888111888116, "step": 5498, "tokens_trained": 2.701852504 }, { "epoch": 1.5599602865045032, "grad_norm": 2.397148370742798, "loss": 3.9023, "lr": 0.00030083916083916085, "step": 5500, "tokens_trained": 2.702838232 }, { "epoch": 1.5599602865045032, "eval_loss": 0.9699593782424927, "eval_runtime": 20.4232, "step": 5500, "tokens_trained": 2.702838232 }, { "epoch": 1.5605276221544573, "grad_norm": 3.4792380332946777, "loss": 3.9054, "lr": 0.0003005594405594406, "step": 5502, "tokens_trained": 2.703821968 }, { "epoch": 1.561094957804411, "grad_norm": 2.4424889087677, "loss": 3.8883, "lr": 0.0003002797202797203, "step": 5504, "tokens_trained": 2.704803952 }, { "epoch": 1.561662293454365, "grad_norm": 2.9872353076934814, "loss": 3.8708, "lr": 0.0003, "step": 5506, "tokens_trained": 2.705786744 }, { "epoch": 1.5622296291043187, "grad_norm": 2.74369740486145, "loss": 3.8685, "lr": 0.0002997202797202797, "step": 5508, "tokens_trained": 2.70677 }, { "epoch": 1.5627969647542728, "grad_norm": 3.588508367538452, "loss": 3.8362, "lr": 0.0002994405594405594, "step": 5510, "tokens_trained": 2.707756336 }, { "epoch": 1.5633643004042268, "grad_norm": 3.268918037414551, "loss": 3.8514, "lr": 0.0002991608391608392, "step": 5512, "tokens_trained": 2.708738096 }, { "epoch": 1.5639316360541806, "grad_norm": 3.9960944652557373, "loss": 3.8601, "lr": 0.0002988811188811189, "step": 5514, "tokens_trained": 2.709718392 }, { "epoch": 1.5644989717041344, "grad_norm": 1.5690975189208984, "loss": 3.8826, "lr": 0.00029860139860139864, "step": 5516, "tokens_trained": 2.710698272 }, { "epoch": 1.5650663073540882, "grad_norm": 1.7052137851715088, "loss": 3.858, "lr": 0.00029832167832167833, "step": 5518, "tokens_trained": 2.711681816 }, { "epoch": 1.5656336430040423, "grad_norm": 2.0696487426757812, "loss": 3.8539, "lr": 0.000298041958041958, "step": 5520, "tokens_trained": 2.712668992 }, { "epoch": 1.5662009786539963, "grad_norm": 3.0199241638183594, "loss": 3.8253, "lr": 0.00029776223776223777, "step": 5522, "tokens_trained": 2.713648024 }, { "epoch": 1.5667683143039501, "grad_norm": 2.88175106048584, "loss": 3.9067, "lr": 0.00029748251748251746, "step": 5524, "tokens_trained": 2.71462988 }, { "epoch": 1.567335649953904, "grad_norm": 2.287402868270874, "loss": 3.7902, "lr": 0.0002972027972027972, "step": 5526, "tokens_trained": 2.715612736 }, { "epoch": 1.5679029856038578, "grad_norm": 2.2216570377349854, "loss": 3.8992, "lr": 0.0002969230769230769, "step": 5528, "tokens_trained": 2.716597376 }, { "epoch": 1.5684703212538118, "grad_norm": 4.012553691864014, "loss": 3.85, "lr": 0.0002966433566433567, "step": 5530, "tokens_trained": 2.717578944 }, { "epoch": 1.5690376569037658, "grad_norm": 3.187795639038086, "loss": 3.8657, "lr": 0.0002963636363636364, "step": 5532, "tokens_trained": 2.718558344 }, { "epoch": 1.5696049925537197, "grad_norm": 0.5813043713569641, "loss": 3.8923, "lr": 0.0002960839160839161, "step": 5534, "tokens_trained": 2.719547264 }, { "epoch": 1.5701723282036735, "grad_norm": 2.481187105178833, "loss": 3.8645, "lr": 0.0002958041958041958, "step": 5536, "tokens_trained": 2.7205292 }, { "epoch": 1.5707396638536273, "grad_norm": 2.6428370475769043, "loss": 3.809, "lr": 0.0002955244755244755, "step": 5538, "tokens_trained": 2.721513648 }, { "epoch": 1.5713069995035813, "grad_norm": 3.8301851749420166, "loss": 3.844, "lr": 0.00029524475524475525, "step": 5540, "tokens_trained": 2.722494504 }, { "epoch": 1.5718743351535354, "grad_norm": 2.134653091430664, "loss": 3.8471, "lr": 0.00029496503496503494, "step": 5542, "tokens_trained": 2.72347492 }, { "epoch": 1.5724416708034892, "grad_norm": 0.9983079433441162, "loss": 3.8889, "lr": 0.0002946853146853147, "step": 5544, "tokens_trained": 2.724455024 }, { "epoch": 1.573009006453443, "grad_norm": 0.41518381237983704, "loss": 3.8726, "lr": 0.0002944055944055944, "step": 5546, "tokens_trained": 2.72544024 }, { "epoch": 1.5735763421033968, "grad_norm": 0.42304641008377075, "loss": 3.8176, "lr": 0.0002941258741258741, "step": 5548, "tokens_trained": 2.726424232 }, { "epoch": 1.5741436777533508, "grad_norm": 1.4611626863479614, "loss": 3.8959, "lr": 0.00029384615384615387, "step": 5550, "tokens_trained": 2.727410496 }, { "epoch": 1.5747110134033049, "grad_norm": 0.546713650226593, "loss": 3.9122, "lr": 0.0002935664335664336, "step": 5552, "tokens_trained": 2.728396976 }, { "epoch": 1.5752783490532587, "grad_norm": 1.1208237409591675, "loss": 3.8791, "lr": 0.0002932867132867133, "step": 5554, "tokens_trained": 2.729385936 }, { "epoch": 1.5758456847032125, "grad_norm": 2.6620264053344727, "loss": 3.877, "lr": 0.000293006993006993, "step": 5556, "tokens_trained": 2.730368968 }, { "epoch": 1.5764130203531663, "grad_norm": 0.7671589255332947, "loss": 3.8491, "lr": 0.00029272727272727274, "step": 5558, "tokens_trained": 2.731351336 }, { "epoch": 1.5769803560031204, "grad_norm": 0.7316055297851562, "loss": 3.849, "lr": 0.0002924475524475524, "step": 5560, "tokens_trained": 2.732334296 }, { "epoch": 1.5775476916530744, "grad_norm": 3.3884339332580566, "loss": 3.9126, "lr": 0.00029216783216783217, "step": 5562, "tokens_trained": 2.733314536 }, { "epoch": 1.5781150273030282, "grad_norm": 1.2948181629180908, "loss": 3.9129, "lr": 0.00029188811188811186, "step": 5564, "tokens_trained": 2.73429728 }, { "epoch": 1.578682362952982, "grad_norm": 3.948852777481079, "loss": 3.845, "lr": 0.0002916083916083916, "step": 5566, "tokens_trained": 2.735282408 }, { "epoch": 1.5792496986029358, "grad_norm": 4.460155963897705, "loss": 3.8637, "lr": 0.00029132867132867135, "step": 5568, "tokens_trained": 2.736265168 }, { "epoch": 1.5798170342528899, "grad_norm": 2.052924633026123, "loss": 3.9199, "lr": 0.0002910489510489511, "step": 5570, "tokens_trained": 2.737247144 }, { "epoch": 1.580384369902844, "grad_norm": 2.460111379623413, "loss": 3.8224, "lr": 0.0002907692307692308, "step": 5572, "tokens_trained": 2.73823084 }, { "epoch": 1.5809517055527977, "grad_norm": 1.7709126472473145, "loss": 3.8537, "lr": 0.0002904895104895105, "step": 5574, "tokens_trained": 2.739212296 }, { "epoch": 1.5815190412027516, "grad_norm": 2.155181884765625, "loss": 3.9214, "lr": 0.0002902097902097902, "step": 5576, "tokens_trained": 2.740193656 }, { "epoch": 1.5820863768527054, "grad_norm": 2.0963149070739746, "loss": 3.8832, "lr": 0.0002899300699300699, "step": 5578, "tokens_trained": 2.741176512 }, { "epoch": 1.5826537125026594, "grad_norm": 2.6366584300994873, "loss": 3.8542, "lr": 0.00028965034965034966, "step": 5580, "tokens_trained": 2.742157504 }, { "epoch": 1.5832210481526134, "grad_norm": 1.9845340251922607, "loss": 3.8644, "lr": 0.00028937062937062935, "step": 5582, "tokens_trained": 2.743138064 }, { "epoch": 1.5837883838025673, "grad_norm": 0.9953936338424683, "loss": 3.8577, "lr": 0.0002890909090909091, "step": 5584, "tokens_trained": 2.744115648 }, { "epoch": 1.584355719452521, "grad_norm": 1.3023415803909302, "loss": 3.8784, "lr": 0.00028881118881118883, "step": 5586, "tokens_trained": 2.745097536 }, { "epoch": 1.5849230551024749, "grad_norm": 1.2267543077468872, "loss": 3.8312, "lr": 0.0002885314685314686, "step": 5588, "tokens_trained": 2.74608184 }, { "epoch": 1.585490390752429, "grad_norm": 0.7333025932312012, "loss": 3.8134, "lr": 0.00028825174825174827, "step": 5590, "tokens_trained": 2.747067224 }, { "epoch": 1.586057726402383, "grad_norm": 3.838825225830078, "loss": 3.8554, "lr": 0.00028797202797202796, "step": 5592, "tokens_trained": 2.74804864 }, { "epoch": 1.5866250620523368, "grad_norm": 2.8580691814422607, "loss": 3.8197, "lr": 0.0002876923076923077, "step": 5594, "tokens_trained": 2.749030704 }, { "epoch": 1.5871923977022906, "grad_norm": 3.3770620822906494, "loss": 3.845, "lr": 0.0002874125874125874, "step": 5596, "tokens_trained": 2.750011464 }, { "epoch": 1.5877597333522444, "grad_norm": 2.183331251144409, "loss": 3.8625, "lr": 0.00028713286713286714, "step": 5598, "tokens_trained": 2.750993992 }, { "epoch": 1.5883270690021984, "grad_norm": 1.1044546365737915, "loss": 3.8821, "lr": 0.00028685314685314683, "step": 5600, "tokens_trained": 2.75197632 }, { "epoch": 1.5888944046521525, "grad_norm": 1.9587361812591553, "loss": 3.9019, "lr": 0.0002865734265734266, "step": 5602, "tokens_trained": 2.75295512 }, { "epoch": 1.5894617403021063, "grad_norm": 5.257344722747803, "loss": 3.8587, "lr": 0.0002862937062937063, "step": 5604, "tokens_trained": 2.75393968 }, { "epoch": 1.5900290759520601, "grad_norm": 2.98882794380188, "loss": 3.8751, "lr": 0.00028601398601398606, "step": 5606, "tokens_trained": 2.754917056 }, { "epoch": 1.590596411602014, "grad_norm": 3.215801239013672, "loss": 3.8178, "lr": 0.00028573426573426575, "step": 5608, "tokens_trained": 2.755897352 }, { "epoch": 1.591163747251968, "grad_norm": 3.7019567489624023, "loss": 3.8578, "lr": 0.00028545454545454544, "step": 5610, "tokens_trained": 2.756876792 }, { "epoch": 1.591731082901922, "grad_norm": 0.5233857035636902, "loss": 3.84, "lr": 0.0002851748251748252, "step": 5612, "tokens_trained": 2.75786064 }, { "epoch": 1.5922984185518758, "grad_norm": 1.3499095439910889, "loss": 3.846, "lr": 0.0002848951048951049, "step": 5614, "tokens_trained": 2.758843208 }, { "epoch": 1.5928657542018296, "grad_norm": 3.770670175552368, "loss": 3.8715, "lr": 0.0002846153846153846, "step": 5616, "tokens_trained": 2.759831272 }, { "epoch": 1.5934330898517834, "grad_norm": 2.2430431842803955, "loss": 3.8457, "lr": 0.0002843356643356643, "step": 5618, "tokens_trained": 2.760816576 }, { "epoch": 1.5940004255017375, "grad_norm": 2.121674060821533, "loss": 3.8379, "lr": 0.00028405594405594406, "step": 5620, "tokens_trained": 2.761806192 }, { "epoch": 1.5945677611516915, "grad_norm": 2.42568302154541, "loss": 3.8762, "lr": 0.0002837762237762238, "step": 5622, "tokens_trained": 2.762787368 }, { "epoch": 1.5951350968016453, "grad_norm": 2.4501335620880127, "loss": 3.8985, "lr": 0.00028349650349650355, "step": 5624, "tokens_trained": 2.763768648 }, { "epoch": 1.5954187646266222, "eval_loss": 0.9685465693473816, "eval_runtime": 20.383, "step": 5625, "tokens_trained": 2.764258728 }, { "epoch": 1.5957024324515992, "grad_norm": 1.7675210237503052, "loss": 3.8317, "lr": 0.00028321678321678324, "step": 5626, "tokens_trained": 2.764748448 }, { "epoch": 1.596269768101553, "grad_norm": 2.069201707839966, "loss": 3.8316, "lr": 0.00028293706293706293, "step": 5628, "tokens_trained": 2.7657334 }, { "epoch": 1.596837103751507, "grad_norm": 3.7776238918304443, "loss": 3.8705, "lr": 0.0002826573426573427, "step": 5630, "tokens_trained": 2.766718144 }, { "epoch": 1.597404439401461, "grad_norm": 4.658926963806152, "loss": 3.8201, "lr": 0.00028237762237762236, "step": 5632, "tokens_trained": 2.76770032 }, { "epoch": 1.5979717750514149, "grad_norm": 2.883873462677002, "loss": 3.8384, "lr": 0.0002820979020979021, "step": 5634, "tokens_trained": 2.768682736 }, { "epoch": 1.5985391107013687, "grad_norm": 3.313469886779785, "loss": 3.8485, "lr": 0.0002818181818181818, "step": 5636, "tokens_trained": 2.769667704 }, { "epoch": 1.5991064463513225, "grad_norm": 3.279757022857666, "loss": 3.8407, "lr": 0.00028153846153846154, "step": 5638, "tokens_trained": 2.77065176 }, { "epoch": 1.5996737820012765, "grad_norm": 3.4190688133239746, "loss": 3.8733, "lr": 0.0002812587412587413, "step": 5640, "tokens_trained": 2.77163072 }, { "epoch": 1.6002411176512306, "grad_norm": 2.766123056411743, "loss": 3.8826, "lr": 0.000280979020979021, "step": 5642, "tokens_trained": 2.772610632 }, { "epoch": 1.6008084533011844, "grad_norm": 2.292541742324829, "loss": 3.8072, "lr": 0.0002806993006993007, "step": 5644, "tokens_trained": 2.77359028 }, { "epoch": 1.6013757889511382, "grad_norm": 3.0967636108398438, "loss": 3.8529, "lr": 0.0002804195804195804, "step": 5646, "tokens_trained": 2.774572744 }, { "epoch": 1.601943124601092, "grad_norm": 4.144455432891846, "loss": 3.8964, "lr": 0.00028013986013986016, "step": 5648, "tokens_trained": 2.775555344 }, { "epoch": 1.602510460251046, "grad_norm": 1.0935693979263306, "loss": 3.8742, "lr": 0.00027986013986013985, "step": 5650, "tokens_trained": 2.77653816 }, { "epoch": 1.603077795901, "grad_norm": 1.5766457319259644, "loss": 3.854, "lr": 0.0002795804195804196, "step": 5652, "tokens_trained": 2.777518952 }, { "epoch": 1.603645131550954, "grad_norm": 7.910213470458984, "loss": 3.8816, "lr": 0.0002793006993006993, "step": 5654, "tokens_trained": 2.778506272 }, { "epoch": 1.6042124672009077, "grad_norm": 4.65513277053833, "loss": 3.8729, "lr": 0.00027902097902097903, "step": 5656, "tokens_trained": 2.779489664 }, { "epoch": 1.6047798028508615, "grad_norm": 3.681711435317993, "loss": 3.8677, "lr": 0.00027874125874125877, "step": 5658, "tokens_trained": 2.780472776 }, { "epoch": 1.6053471385008156, "grad_norm": 5.058254718780518, "loss": 3.8606, "lr": 0.00027846153846153846, "step": 5660, "tokens_trained": 2.781455744 }, { "epoch": 1.6059144741507696, "grad_norm": 4.267047882080078, "loss": 3.8373, "lr": 0.0002781818181818182, "step": 5662, "tokens_trained": 2.78244188 }, { "epoch": 1.6064818098007234, "grad_norm": 3.1416563987731934, "loss": 3.8404, "lr": 0.0002779020979020979, "step": 5664, "tokens_trained": 2.783428128 }, { "epoch": 1.6070491454506772, "grad_norm": 4.125866413116455, "loss": 3.8763, "lr": 0.00027762237762237764, "step": 5666, "tokens_trained": 2.784414224 }, { "epoch": 1.607616481100631, "grad_norm": 3.6334707736968994, "loss": 3.8564, "lr": 0.00027734265734265733, "step": 5668, "tokens_trained": 2.785394264 }, { "epoch": 1.608183816750585, "grad_norm": 4.244611740112305, "loss": 3.8709, "lr": 0.0002770629370629371, "step": 5670, "tokens_trained": 2.786378176 }, { "epoch": 1.6087511524005391, "grad_norm": 2.5464348793029785, "loss": 3.9095, "lr": 0.00027678321678321677, "step": 5672, "tokens_trained": 2.787364264 }, { "epoch": 1.609318488050493, "grad_norm": 2.5525379180908203, "loss": 3.868, "lr": 0.0002765034965034965, "step": 5674, "tokens_trained": 2.78834892 }, { "epoch": 1.6098858237004467, "grad_norm": 1.4956291913986206, "loss": 3.8024, "lr": 0.0002762237762237762, "step": 5676, "tokens_trained": 2.789330144 }, { "epoch": 1.6104531593504006, "grad_norm": 1.331429362297058, "loss": 3.8308, "lr": 0.00027594405594405595, "step": 5678, "tokens_trained": 2.790313456 }, { "epoch": 1.6110204950003546, "grad_norm": 1.6636086702346802, "loss": 3.8013, "lr": 0.0002756643356643357, "step": 5680, "tokens_trained": 2.791291288 }, { "epoch": 1.6115878306503086, "grad_norm": 1.0856963396072388, "loss": 3.8851, "lr": 0.0002753846153846154, "step": 5682, "tokens_trained": 2.792272912 }, { "epoch": 1.6121551663002625, "grad_norm": 0.8681638240814209, "loss": 3.8744, "lr": 0.0002751048951048951, "step": 5684, "tokens_trained": 2.793256312 }, { "epoch": 1.6127225019502163, "grad_norm": 1.770532488822937, "loss": 3.8362, "lr": 0.0002748251748251748, "step": 5686, "tokens_trained": 2.794240288 }, { "epoch": 1.61328983760017, "grad_norm": 2.9169862270355225, "loss": 3.8114, "lr": 0.00027454545454545456, "step": 5688, "tokens_trained": 2.795223128 }, { "epoch": 1.6138571732501241, "grad_norm": 2.319213628768921, "loss": 3.823, "lr": 0.00027426573426573425, "step": 5690, "tokens_trained": 2.796204352 }, { "epoch": 1.6144245089000782, "grad_norm": 1.7466791868209839, "loss": 3.8292, "lr": 0.000273986013986014, "step": 5692, "tokens_trained": 2.797188408 }, { "epoch": 1.614991844550032, "grad_norm": 2.5481719970703125, "loss": 3.8338, "lr": 0.0002737062937062937, "step": 5694, "tokens_trained": 2.79817152 }, { "epoch": 1.6155591801999858, "grad_norm": 1.9857237339019775, "loss": 3.8354, "lr": 0.00027342657342657343, "step": 5696, "tokens_trained": 2.799152488 }, { "epoch": 1.6161265158499396, "grad_norm": 2.332441568374634, "loss": 3.8649, "lr": 0.0002731468531468532, "step": 5698, "tokens_trained": 2.800136424 }, { "epoch": 1.6166938514998936, "grad_norm": 1.6021710634231567, "loss": 3.892, "lr": 0.00027286713286713287, "step": 5700, "tokens_trained": 2.801121064 }, { "epoch": 1.6172611871498477, "grad_norm": 1.5943433046340942, "loss": 3.8719, "lr": 0.0002725874125874126, "step": 5702, "tokens_trained": 2.802104456 }, { "epoch": 1.6178285227998015, "grad_norm": 1.7614659070968628, "loss": 3.8755, "lr": 0.0002723076923076923, "step": 5704, "tokens_trained": 2.803086472 }, { "epoch": 1.6183958584497553, "grad_norm": 0.709842324256897, "loss": 3.883, "lr": 0.00027202797202797205, "step": 5706, "tokens_trained": 2.804074552 }, { "epoch": 1.6189631940997091, "grad_norm": 2.912022829055786, "loss": 3.827, "lr": 0.00027174825174825174, "step": 5708, "tokens_trained": 2.805050624 }, { "epoch": 1.6195305297496632, "grad_norm": 1.5365500450134277, "loss": 3.8561, "lr": 0.0002714685314685315, "step": 5710, "tokens_trained": 2.806032664 }, { "epoch": 1.6200978653996172, "grad_norm": 1.8530750274658203, "loss": 3.8715, "lr": 0.00027118881118881117, "step": 5712, "tokens_trained": 2.807015072 }, { "epoch": 1.620665201049571, "grad_norm": 6.598786354064941, "loss": 3.9111, "lr": 0.0002709090909090909, "step": 5714, "tokens_trained": 2.80800292 }, { "epoch": 1.6212325366995248, "grad_norm": 3.761838436126709, "loss": 3.8632, "lr": 0.00027062937062937066, "step": 5716, "tokens_trained": 2.80898492 }, { "epoch": 1.6217998723494786, "grad_norm": 1.7242389917373657, "loss": 3.8467, "lr": 0.00027034965034965035, "step": 5718, "tokens_trained": 2.80996564 }, { "epoch": 1.6223672079994327, "grad_norm": 5.131701946258545, "loss": 3.8868, "lr": 0.0002700699300699301, "step": 5720, "tokens_trained": 2.810949456 }, { "epoch": 1.6229345436493867, "grad_norm": 3.7940638065338135, "loss": 3.8526, "lr": 0.0002697902097902098, "step": 5722, "tokens_trained": 2.811933712 }, { "epoch": 1.6235018792993405, "grad_norm": 3.0134806632995605, "loss": 3.9174, "lr": 0.00026951048951048953, "step": 5724, "tokens_trained": 2.812914312 }, { "epoch": 1.6240692149492943, "grad_norm": 4.154657363891602, "loss": 3.86, "lr": 0.0002692307692307692, "step": 5726, "tokens_trained": 2.813900408 }, { "epoch": 1.6246365505992482, "grad_norm": 4.034200668334961, "loss": 3.8983, "lr": 0.00026895104895104896, "step": 5728, "tokens_trained": 2.8148846 }, { "epoch": 1.6252038862492022, "grad_norm": 2.5282604694366455, "loss": 3.8614, "lr": 0.00026867132867132865, "step": 5730, "tokens_trained": 2.81587128 }, { "epoch": 1.6257712218991562, "grad_norm": 3.9052770137786865, "loss": 3.8696, "lr": 0.0002683916083916084, "step": 5732, "tokens_trained": 2.816857792 }, { "epoch": 1.62633855754911, "grad_norm": 4.2138352394104, "loss": 3.8902, "lr": 0.00026811188811188814, "step": 5734, "tokens_trained": 2.817840552 }, { "epoch": 1.6269058931990639, "grad_norm": 1.2808244228363037, "loss": 3.8675, "lr": 0.00026783216783216783, "step": 5736, "tokens_trained": 2.818825232 }, { "epoch": 1.6274732288490177, "grad_norm": 2.491243839263916, "loss": 3.9218, "lr": 0.0002675524475524476, "step": 5738, "tokens_trained": 2.819809352 }, { "epoch": 1.6280405644989717, "grad_norm": 3.1643896102905273, "loss": 3.8813, "lr": 0.00026727272727272727, "step": 5740, "tokens_trained": 2.820793456 }, { "epoch": 1.6286079001489258, "grad_norm": 3.648646593093872, "loss": 3.8445, "lr": 0.000266993006993007, "step": 5742, "tokens_trained": 2.821780088 }, { "epoch": 1.6291752357988796, "grad_norm": 2.1239254474639893, "loss": 3.8781, "lr": 0.0002667132867132867, "step": 5744, "tokens_trained": 2.82276436 }, { "epoch": 1.6297425714488334, "grad_norm": 2.5850162506103516, "loss": 3.8625, "lr": 0.0002664335664335664, "step": 5746, "tokens_trained": 2.82375128 }, { "epoch": 1.6303099070987872, "grad_norm": 2.6930086612701416, "loss": 3.817, "lr": 0.00026615384615384614, "step": 5748, "tokens_trained": 2.824735192 }, { "epoch": 1.6308772427487412, "grad_norm": 0.6374559998512268, "loss": 3.8555, "lr": 0.0002658741258741259, "step": 5750, "tokens_trained": 2.825718392 }, { "epoch": 1.6308772427487412, "eval_loss": 0.9677565097808838, "eval_runtime": 20.3934, "step": 5750, "tokens_trained": 2.825718392 }, { "epoch": 1.6314445783986953, "grad_norm": 2.324770212173462, "loss": 3.8393, "lr": 0.00026559440559440563, "step": 5752, "tokens_trained": 2.826700664 }, { "epoch": 1.632011914048649, "grad_norm": 3.6169118881225586, "loss": 3.8783, "lr": 0.0002653146853146853, "step": 5754, "tokens_trained": 2.82768492 }, { "epoch": 1.632579249698603, "grad_norm": 3.1136844158172607, "loss": 3.8528, "lr": 0.00026503496503496506, "step": 5756, "tokens_trained": 2.828668856 }, { "epoch": 1.6331465853485567, "grad_norm": 1.646531105041504, "loss": 3.8368, "lr": 0.00026475524475524475, "step": 5758, "tokens_trained": 2.82965104 }, { "epoch": 1.6337139209985108, "grad_norm": 1.9851844310760498, "loss": 3.8839, "lr": 0.0002644755244755245, "step": 5760, "tokens_trained": 2.830636984 }, { "epoch": 1.6342812566484648, "grad_norm": 5.908127307891846, "loss": 3.8477, "lr": 0.0002641958041958042, "step": 5762, "tokens_trained": 2.831616488 }, { "epoch": 1.6348485922984186, "grad_norm": 4.9002909660339355, "loss": 3.8279, "lr": 0.0002639160839160839, "step": 5764, "tokens_trained": 2.832599464 }, { "epoch": 1.6354159279483724, "grad_norm": 2.045973539352417, "loss": 3.8317, "lr": 0.0002636363636363636, "step": 5766, "tokens_trained": 2.83358544 }, { "epoch": 1.6359832635983262, "grad_norm": 1.7147414684295654, "loss": 3.8913, "lr": 0.00026335664335664337, "step": 5768, "tokens_trained": 2.83456792 }, { "epoch": 1.6365505992482803, "grad_norm": 2.8540899753570557, "loss": 3.8896, "lr": 0.0002630769230769231, "step": 5770, "tokens_trained": 2.835546528 }, { "epoch": 1.6371179348982343, "grad_norm": 2.798184633255005, "loss": 3.8901, "lr": 0.0002627972027972028, "step": 5772, "tokens_trained": 2.836531536 }, { "epoch": 1.6376852705481881, "grad_norm": 3.74381160736084, "loss": 3.8667, "lr": 0.00026251748251748255, "step": 5774, "tokens_trained": 2.837511976 }, { "epoch": 1.638252606198142, "grad_norm": 1.3036679029464722, "loss": 3.8828, "lr": 0.00026223776223776224, "step": 5776, "tokens_trained": 2.838494216 }, { "epoch": 1.6388199418480958, "grad_norm": 2.3305046558380127, "loss": 3.8687, "lr": 0.000261958041958042, "step": 5778, "tokens_trained": 2.839477616 }, { "epoch": 1.6393872774980498, "grad_norm": 1.8486007452011108, "loss": 3.8277, "lr": 0.00026167832167832167, "step": 5780, "tokens_trained": 2.840460776 }, { "epoch": 1.6399546131480038, "grad_norm": 7.9603681564331055, "loss": 3.8558, "lr": 0.00026139860139860136, "step": 5782, "tokens_trained": 2.841442784 }, { "epoch": 1.6405219487979577, "grad_norm": 6.6514410972595215, "loss": 3.8566, "lr": 0.0002611188811188811, "step": 5784, "tokens_trained": 2.842423376 }, { "epoch": 1.6410892844479115, "grad_norm": 4.3851237297058105, "loss": 3.8145, "lr": 0.00026083916083916085, "step": 5786, "tokens_trained": 2.843410992 }, { "epoch": 1.6416566200978653, "grad_norm": 6.750310897827148, "loss": 3.8696, "lr": 0.0002605594405594406, "step": 5788, "tokens_trained": 2.844393928 }, { "epoch": 1.6422239557478193, "grad_norm": 3.409925937652588, "loss": 3.8069, "lr": 0.0002602797202797203, "step": 5790, "tokens_trained": 2.845371936 }, { "epoch": 1.6427912913977734, "grad_norm": 4.318549633026123, "loss": 3.862, "lr": 0.00026000000000000003, "step": 5792, "tokens_trained": 2.846352808 }, { "epoch": 1.6433586270477272, "grad_norm": 3.3245508670806885, "loss": 3.9211, "lr": 0.0002597202797202797, "step": 5794, "tokens_trained": 2.847335992 }, { "epoch": 1.643925962697681, "grad_norm": 2.312521457672119, "loss": 3.7887, "lr": 0.00025944055944055947, "step": 5796, "tokens_trained": 2.848320616 }, { "epoch": 1.6444932983476348, "grad_norm": 1.4259709119796753, "loss": 3.8607, "lr": 0.00025916083916083916, "step": 5798, "tokens_trained": 2.849300928 }, { "epoch": 1.6450606339975888, "grad_norm": 0.9020340442657471, "loss": 3.869, "lr": 0.00025888111888111885, "step": 5800, "tokens_trained": 2.850282648 }, { "epoch": 1.6456279696475429, "grad_norm": 2.114844799041748, "loss": 3.8049, "lr": 0.0002586013986013986, "step": 5802, "tokens_trained": 2.85126584 }, { "epoch": 1.6461953052974967, "grad_norm": 4.662852764129639, "loss": 3.8474, "lr": 0.0002583216783216783, "step": 5804, "tokens_trained": 2.852253648 }, { "epoch": 1.6467626409474505, "grad_norm": 4.038625240325928, "loss": 3.8813, "lr": 0.0002580419580419581, "step": 5806, "tokens_trained": 2.853237552 }, { "epoch": 1.6473299765974043, "grad_norm": 2.922651767730713, "loss": 3.8331, "lr": 0.00025776223776223777, "step": 5808, "tokens_trained": 2.854218656 }, { "epoch": 1.6478973122473584, "grad_norm": 4.35854434967041, "loss": 3.8623, "lr": 0.0002574825174825175, "step": 5810, "tokens_trained": 2.855199264 }, { "epoch": 1.6484646478973124, "grad_norm": 2.1086177825927734, "loss": 3.8747, "lr": 0.0002572027972027972, "step": 5812, "tokens_trained": 2.856182856 }, { "epoch": 1.6490319835472662, "grad_norm": 1.4423526525497437, "loss": 3.8822, "lr": 0.00025692307692307695, "step": 5814, "tokens_trained": 2.85716116 }, { "epoch": 1.64959931919722, "grad_norm": 1.7866076231002808, "loss": 3.8701, "lr": 0.00025664335664335664, "step": 5816, "tokens_trained": 2.8581406 }, { "epoch": 1.6501666548471738, "grad_norm": 0.9082437753677368, "loss": 3.8207, "lr": 0.00025636363636363633, "step": 5818, "tokens_trained": 2.859123392 }, { "epoch": 1.6507339904971279, "grad_norm": 2.493602991104126, "loss": 3.8473, "lr": 0.0002560839160839161, "step": 5820, "tokens_trained": 2.860107752 }, { "epoch": 1.651301326147082, "grad_norm": 2.814542055130005, "loss": 3.8977, "lr": 0.00025580419580419577, "step": 5822, "tokens_trained": 2.86109204 }, { "epoch": 1.6518686617970357, "grad_norm": 3.3991076946258545, "loss": 3.7998, "lr": 0.00025552447552447557, "step": 5824, "tokens_trained": 2.862077776 }, { "epoch": 1.6524359974469895, "grad_norm": 4.02992057800293, "loss": 3.8594, "lr": 0.00025524475524475526, "step": 5826, "tokens_trained": 2.863062432 }, { "epoch": 1.6530033330969434, "grad_norm": 5.211875915527344, "loss": 3.8718, "lr": 0.000254965034965035, "step": 5828, "tokens_trained": 2.86404512 }, { "epoch": 1.6535706687468974, "grad_norm": 2.361069917678833, "loss": 3.837, "lr": 0.0002546853146853147, "step": 5830, "tokens_trained": 2.865024888 }, { "epoch": 1.6541380043968514, "grad_norm": 6.926619052886963, "loss": 3.8268, "lr": 0.00025440559440559443, "step": 5832, "tokens_trained": 2.866006128 }, { "epoch": 1.6547053400468053, "grad_norm": 3.741729974746704, "loss": 3.8109, "lr": 0.0002541258741258741, "step": 5834, "tokens_trained": 2.866990408 }, { "epoch": 1.655272675696759, "grad_norm": 4.150857448577881, "loss": 3.9021, "lr": 0.0002538461538461538, "step": 5836, "tokens_trained": 2.867972736 }, { "epoch": 1.6558400113467129, "grad_norm": 3.9393651485443115, "loss": 3.8552, "lr": 0.00025356643356643356, "step": 5838, "tokens_trained": 2.86895772 }, { "epoch": 1.656407346996667, "grad_norm": 1.9962868690490723, "loss": 3.8358, "lr": 0.00025328671328671325, "step": 5840, "tokens_trained": 2.869938184 }, { "epoch": 1.656974682646621, "grad_norm": 0.8876021504402161, "loss": 3.8869, "lr": 0.00025300699300699305, "step": 5842, "tokens_trained": 2.87091688 }, { "epoch": 1.6575420182965748, "grad_norm": 1.5319703817367554, "loss": 3.8339, "lr": 0.00025272727272727274, "step": 5844, "tokens_trained": 2.871899592 }, { "epoch": 1.6581093539465286, "grad_norm": 1.3673622608184814, "loss": 3.8461, "lr": 0.0002524475524475525, "step": 5846, "tokens_trained": 2.87288524 }, { "epoch": 1.6586766895964824, "grad_norm": 1.8747504949569702, "loss": 3.8687, "lr": 0.0002521678321678322, "step": 5848, "tokens_trained": 2.873868904 }, { "epoch": 1.6592440252464364, "grad_norm": 1.78745698928833, "loss": 3.8289, "lr": 0.0002518881118881119, "step": 5850, "tokens_trained": 2.874854024 }, { "epoch": 1.6598113608963905, "grad_norm": 1.74812650680542, "loss": 3.8631, "lr": 0.0002516083916083916, "step": 5852, "tokens_trained": 2.875837264 }, { "epoch": 1.6603786965463443, "grad_norm": 4.6655778884887695, "loss": 3.767, "lr": 0.0002513286713286713, "step": 5854, "tokens_trained": 2.876824112 }, { "epoch": 1.660946032196298, "grad_norm": 4.012164115905762, "loss": 3.8522, "lr": 0.00025104895104895104, "step": 5856, "tokens_trained": 2.877811008 }, { "epoch": 1.661513367846252, "grad_norm": 4.21424674987793, "loss": 3.8507, "lr": 0.00025076923076923073, "step": 5858, "tokens_trained": 2.87879564 }, { "epoch": 1.662080703496206, "grad_norm": 4.155895233154297, "loss": 3.8529, "lr": 0.00025048951048951053, "step": 5860, "tokens_trained": 2.879774832 }, { "epoch": 1.66264803914616, "grad_norm": 2.7593812942504883, "loss": 3.8609, "lr": 0.0002502097902097902, "step": 5862, "tokens_trained": 2.880758592 }, { "epoch": 1.6632153747961138, "grad_norm": 1.1735769510269165, "loss": 3.8646, "lr": 0.00024993006993006997, "step": 5864, "tokens_trained": 2.881743336 }, { "epoch": 1.6637827104460676, "grad_norm": 2.4293084144592285, "loss": 3.867, "lr": 0.00024965034965034966, "step": 5866, "tokens_trained": 2.882725984 }, { "epoch": 1.6643500460960214, "grad_norm": 1.6900265216827393, "loss": 3.8467, "lr": 0.00024937062937062935, "step": 5868, "tokens_trained": 2.883709024 }, { "epoch": 1.6649173817459755, "grad_norm": 1.6338657140731812, "loss": 3.8499, "lr": 0.0002490909090909091, "step": 5870, "tokens_trained": 2.884690136 }, { "epoch": 1.6654847173959295, "grad_norm": 1.3867520093917847, "loss": 3.8497, "lr": 0.0002488111888111888, "step": 5872, "tokens_trained": 2.885670192 }, { "epoch": 1.6660520530458833, "grad_norm": 2.3722336292266846, "loss": 3.8404, "lr": 0.00024853146853146853, "step": 5874, "tokens_trained": 2.88665212 }, { "epoch": 1.6663357208708602, "eval_loss": 0.9677584767341614, "eval_runtime": 21.0454, "step": 5875, "tokens_trained": 2.887143888 }, { "epoch": 1.6666193886958371, "grad_norm": 3.559649705886841, "loss": 3.8521, "lr": 0.0002482517482517483, "step": 5876, "tokens_trained": 2.887632456 }, { "epoch": 1.667186724345791, "grad_norm": 3.4279959201812744, "loss": 3.8603, "lr": 0.00024797202797202796, "step": 5878, "tokens_trained": 2.888610976 }, { "epoch": 1.667754059995745, "grad_norm": 2.7501025199890137, "loss": 3.8123, "lr": 0.0002476923076923077, "step": 5880, "tokens_trained": 2.889593048 }, { "epoch": 1.668321395645699, "grad_norm": 4.056321144104004, "loss": 3.8852, "lr": 0.00024741258741258745, "step": 5882, "tokens_trained": 2.890573992 }, { "epoch": 1.6688887312956528, "grad_norm": 2.395308017730713, "loss": 3.8352, "lr": 0.00024713286713286714, "step": 5884, "tokens_trained": 2.89156064 }, { "epoch": 1.6694560669456067, "grad_norm": 1.6177494525909424, "loss": 3.8488, "lr": 0.00024685314685314683, "step": 5886, "tokens_trained": 2.892541712 }, { "epoch": 1.6700234025955605, "grad_norm": 1.648560881614685, "loss": 3.8204, "lr": 0.0002465734265734266, "step": 5888, "tokens_trained": 2.893523312 }, { "epoch": 1.6705907382455145, "grad_norm": 2.471012830734253, "loss": 3.8384, "lr": 0.00024629370629370627, "step": 5890, "tokens_trained": 2.89450544 }, { "epoch": 1.6711580738954686, "grad_norm": 3.052476644515991, "loss": 3.9013, "lr": 0.000246013986013986, "step": 5892, "tokens_trained": 2.895485296 }, { "epoch": 1.6717254095454224, "grad_norm": 2.2633492946624756, "loss": 3.8368, "lr": 0.00024573426573426576, "step": 5894, "tokens_trained": 2.896470736 }, { "epoch": 1.6722927451953762, "grad_norm": 1.6077561378479004, "loss": 3.8656, "lr": 0.00024545454545454545, "step": 5896, "tokens_trained": 2.897452728 }, { "epoch": 1.67286008084533, "grad_norm": 2.796211004257202, "loss": 3.8573, "lr": 0.0002451748251748252, "step": 5898, "tokens_trained": 2.89843608 }, { "epoch": 1.673427416495284, "grad_norm": 2.2211575508117676, "loss": 3.8605, "lr": 0.0002448951048951049, "step": 5900, "tokens_trained": 2.89941884 }, { "epoch": 1.673994752145238, "grad_norm": 3.0687623023986816, "loss": 3.8518, "lr": 0.00024461538461538463, "step": 5902, "tokens_trained": 2.90040236 }, { "epoch": 1.674562087795192, "grad_norm": 3.5390725135803223, "loss": 3.8293, "lr": 0.0002443356643356643, "step": 5904, "tokens_trained": 2.901387496 }, { "epoch": 1.6751294234451457, "grad_norm": 0.4400745630264282, "loss": 3.8764, "lr": 0.00024405594405594406, "step": 5906, "tokens_trained": 2.902366184 }, { "epoch": 1.6756967590950995, "grad_norm": 2.00661301612854, "loss": 3.8548, "lr": 0.00024377622377622378, "step": 5908, "tokens_trained": 2.90334932 }, { "epoch": 1.6762640947450536, "grad_norm": 2.0423686504364014, "loss": 3.8576, "lr": 0.00024349650349650352, "step": 5910, "tokens_trained": 2.904335264 }, { "epoch": 1.6768314303950076, "grad_norm": 4.125240325927734, "loss": 3.8416, "lr": 0.00024321678321678321, "step": 5912, "tokens_trained": 2.905314864 }, { "epoch": 1.6773987660449614, "grad_norm": 3.8097951412200928, "loss": 3.8451, "lr": 0.00024293706293706293, "step": 5914, "tokens_trained": 2.906299024 }, { "epoch": 1.6779661016949152, "grad_norm": 3.335597276687622, "loss": 3.8206, "lr": 0.00024265734265734265, "step": 5916, "tokens_trained": 2.907281608 }, { "epoch": 1.678533437344869, "grad_norm": 1.986657977104187, "loss": 3.8006, "lr": 0.00024237762237762237, "step": 5918, "tokens_trained": 2.908257288 }, { "epoch": 1.679100772994823, "grad_norm": 1.9969795942306519, "loss": 3.909, "lr": 0.0002420979020979021, "step": 5920, "tokens_trained": 2.909239544 }, { "epoch": 1.6796681086447771, "grad_norm": 2.5585694313049316, "loss": 3.8341, "lr": 0.00024181818181818183, "step": 5922, "tokens_trained": 2.910221368 }, { "epoch": 1.680235444294731, "grad_norm": 2.500028371810913, "loss": 3.8697, "lr": 0.00024153846153846155, "step": 5924, "tokens_trained": 2.911205368 }, { "epoch": 1.6808027799446847, "grad_norm": 2.8927834033966064, "loss": 3.8504, "lr": 0.00024125874125874126, "step": 5926, "tokens_trained": 2.91218908 }, { "epoch": 1.6813701155946386, "grad_norm": 3.0361721515655518, "loss": 3.8477, "lr": 0.000240979020979021, "step": 5928, "tokens_trained": 2.913171488 }, { "epoch": 1.6819374512445926, "grad_norm": 2.912531852722168, "loss": 3.9016, "lr": 0.0002406993006993007, "step": 5930, "tokens_trained": 2.914153488 }, { "epoch": 1.6825047868945466, "grad_norm": 2.563627004623413, "loss": 3.8274, "lr": 0.00024041958041958042, "step": 5932, "tokens_trained": 2.915134264 }, { "epoch": 1.6830721225445004, "grad_norm": 1.3338478803634644, "loss": 3.7957, "lr": 0.00024013986013986013, "step": 5934, "tokens_trained": 2.916116168 }, { "epoch": 1.6836394581944543, "grad_norm": 1.8714828491210938, "loss": 3.8932, "lr": 0.00023986013986013985, "step": 5936, "tokens_trained": 2.917097976 }, { "epoch": 1.684206793844408, "grad_norm": 6.701860427856445, "loss": 3.8579, "lr": 0.0002395804195804196, "step": 5938, "tokens_trained": 2.918085424 }, { "epoch": 1.6847741294943621, "grad_norm": 7.627328395843506, "loss": 3.8301, "lr": 0.0002393006993006993, "step": 5940, "tokens_trained": 2.9190662 }, { "epoch": 1.6853414651443162, "grad_norm": 7.1663713455200195, "loss": 3.8541, "lr": 0.00023902097902097903, "step": 5942, "tokens_trained": 2.920054232 }, { "epoch": 1.68590880079427, "grad_norm": 6.0305094718933105, "loss": 3.8826, "lr": 0.00023874125874125875, "step": 5944, "tokens_trained": 2.921033344 }, { "epoch": 1.6864761364442238, "grad_norm": 4.241663932800293, "loss": 3.8372, "lr": 0.0002384615384615385, "step": 5946, "tokens_trained": 2.922014616 }, { "epoch": 1.6870434720941776, "grad_norm": 4.3776984214782715, "loss": 3.8612, "lr": 0.00023818181818181818, "step": 5948, "tokens_trained": 2.922998192 }, { "epoch": 1.6876108077441316, "grad_norm": 1.773468255996704, "loss": 3.8674, "lr": 0.0002379020979020979, "step": 5950, "tokens_trained": 2.92398352 }, { "epoch": 1.6881781433940857, "grad_norm": 1.1746567487716675, "loss": 3.8757, "lr": 0.00023762237762237762, "step": 5952, "tokens_trained": 2.924967368 }, { "epoch": 1.6887454790440395, "grad_norm": 2.353240728378296, "loss": 3.8848, "lr": 0.00023734265734265734, "step": 5954, "tokens_trained": 2.925950768 }, { "epoch": 1.6893128146939933, "grad_norm": 2.495901584625244, "loss": 3.8353, "lr": 0.00023706293706293708, "step": 5956, "tokens_trained": 2.926931672 }, { "epoch": 1.6898801503439471, "grad_norm": 2.9549484252929688, "loss": 3.8289, "lr": 0.0002367832167832168, "step": 5958, "tokens_trained": 2.9279126 }, { "epoch": 1.6904474859939012, "grad_norm": 3.3719921112060547, "loss": 3.8499, "lr": 0.00023650349650349652, "step": 5960, "tokens_trained": 2.928895256 }, { "epoch": 1.6910148216438552, "grad_norm": 2.7297303676605225, "loss": 3.8315, "lr": 0.00023622377622377623, "step": 5962, "tokens_trained": 2.929876128 }, { "epoch": 1.691582157293809, "grad_norm": 2.845301389694214, "loss": 3.8558, "lr": 0.00023594405594405592, "step": 5964, "tokens_trained": 2.93085852 }, { "epoch": 1.6921494929437628, "grad_norm": 1.7312262058258057, "loss": 3.8874, "lr": 0.00023566433566433567, "step": 5966, "tokens_trained": 2.93183948 }, { "epoch": 1.6927168285937166, "grad_norm": 6.511951923370361, "loss": 3.8787, "lr": 0.00023538461538461538, "step": 5968, "tokens_trained": 2.932820528 }, { "epoch": 1.6932841642436707, "grad_norm": 2.518000841140747, "loss": 3.8689, "lr": 0.0002351048951048951, "step": 5970, "tokens_trained": 2.93380352 }, { "epoch": 1.6938514998936247, "grad_norm": 3.1675634384155273, "loss": 3.8182, "lr": 0.00023482517482517482, "step": 5972, "tokens_trained": 2.93478664 }, { "epoch": 1.6944188355435785, "grad_norm": 1.4572842121124268, "loss": 3.8316, "lr": 0.00023454545454545456, "step": 5974, "tokens_trained": 2.935771864 }, { "epoch": 1.6949861711935323, "grad_norm": 3.347806453704834, "loss": 3.8602, "lr": 0.00023426573426573428, "step": 5976, "tokens_trained": 2.936755656 }, { "epoch": 1.6955535068434862, "grad_norm": 5.49018669128418, "loss": 3.8196, "lr": 0.000233986013986014, "step": 5978, "tokens_trained": 2.937737104 }, { "epoch": 1.6961208424934402, "grad_norm": 2.272129535675049, "loss": 3.8414, "lr": 0.00023370629370629372, "step": 5980, "tokens_trained": 2.93872084 }, { "epoch": 1.6966881781433942, "grad_norm": 5.61100435256958, "loss": 3.8377, "lr": 0.0002334265734265734, "step": 5982, "tokens_trained": 2.939702104 }, { "epoch": 1.697255513793348, "grad_norm": 4.814182758331299, "loss": 3.9013, "lr": 0.00023314685314685315, "step": 5984, "tokens_trained": 2.940683392 }, { "epoch": 1.6978228494433019, "grad_norm": 2.8431384563446045, "loss": 3.8322, "lr": 0.00023286713286713287, "step": 5986, "tokens_trained": 2.941665112 }, { "epoch": 1.6983901850932557, "grad_norm": 5.4591450691223145, "loss": 3.8187, "lr": 0.0002325874125874126, "step": 5988, "tokens_trained": 2.942654464 }, { "epoch": 1.6989575207432097, "grad_norm": 2.687572956085205, "loss": 3.8279, "lr": 0.0002323076923076923, "step": 5990, "tokens_trained": 2.943639176 }, { "epoch": 1.6995248563931638, "grad_norm": 2.6767523288726807, "loss": 3.8326, "lr": 0.00023202797202797205, "step": 5992, "tokens_trained": 2.944621208 }, { "epoch": 1.7000921920431176, "grad_norm": 5.612683296203613, "loss": 3.8468, "lr": 0.00023174825174825177, "step": 5994, "tokens_trained": 2.94560208 }, { "epoch": 1.7006595276930714, "grad_norm": 3.099323272705078, "loss": 3.8726, "lr": 0.00023146853146853148, "step": 5996, "tokens_trained": 2.946583688 }, { "epoch": 1.7012268633430252, "grad_norm": 2.9504568576812744, "loss": 3.8566, "lr": 0.0002311888111888112, "step": 5998, "tokens_trained": 2.947561592 }, { "epoch": 1.7017941989929792, "grad_norm": 2.855426073074341, "loss": 3.8686, "lr": 0.0002309090909090909, "step": 6000, "tokens_trained": 2.948546968 }, { "epoch": 1.7017941989929792, "eval_loss": 0.9661399722099304, "eval_runtime": 20.2295, "step": 6000, "tokens_trained": 2.948546968 } ], "logging_steps": 2, "max_steps": 7650, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 750, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }