{ "best_global_step": 1500, "best_metric": 1.052606463432312, "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-1500", "epoch": 0.42550173746542796, "eval_steps": 125, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005673356499539039, "grad_norm": 8450.4345703125, "loss": 876.9911, "lr": 2e-06, "step": 2, "tokens_trained": 0.000985992 }, { "epoch": 0.0011346712999078079, "grad_norm": 8980.888671875, "loss": 779.4711, "lr": 6e-06, "step": 4, "tokens_trained": 0.001968088 }, { "epoch": 0.001702006949861712, "grad_norm": 7489.92529296875, "loss": 488.6157, "lr": 1e-05, "step": 6, "tokens_trained": 0.002953808 }, { "epoch": 0.0022693425998156157, "grad_norm": 1952.1917724609375, "loss": 237.0602, "lr": 1.4e-05, "step": 8, "tokens_trained": 0.003935728 }, { "epoch": 0.0028366782497695198, "grad_norm": 1418.443603515625, "loss": 159.0854, "lr": 1.8e-05, "step": 10, "tokens_trained": 0.004916488 }, { "epoch": 0.003404013899723424, "grad_norm": 874.7195434570312, "loss": 91.9563, "lr": 2.2e-05, "step": 12, "tokens_trained": 0.005902792 }, { "epoch": 0.003971349549677328, "grad_norm": 1339.8248291015625, "loss": 40.3366, "lr": 2.6e-05, "step": 14, "tokens_trained": 0.0068856 }, { "epoch": 0.0045386851996312315, "grad_norm": 2936.7607421875, "loss": 22.7436, "lr": 3e-05, "step": 16, "tokens_trained": 0.007868248 }, { "epoch": 0.005106020849585136, "grad_norm": 1531.3807373046875, "loss": 23.4797, "lr": 3.4000000000000007e-05, "step": 18, "tokens_trained": 0.008849296 }, { "epoch": 0.0056733564995390395, "grad_norm": 3027.4189453125, "loss": 38.7379, "lr": 3.8e-05, "step": 20, "tokens_trained": 0.009830984 }, { "epoch": 0.006240692149492944, "grad_norm": 2435.890625, "loss": 26.2427, "lr": 4.2000000000000004e-05, "step": 22, "tokens_trained": 0.01081364 }, { "epoch": 0.006808027799446848, "grad_norm": 3217.990478515625, "loss": 31.0263, "lr": 4.6e-05, "step": 24, "tokens_trained": 0.01179036 }, { "epoch": 0.007375363449400752, "grad_norm": 3854.00634765625, "loss": 33.8781, "lr": 5e-05, "step": 26, "tokens_trained": 0.012774504 }, { "epoch": 0.007942699099354656, "grad_norm": 3197.489990234375, "loss": 27.7927, "lr": 5.4e-05, "step": 28, "tokens_trained": 0.013759992 }, { "epoch": 0.00851003474930856, "grad_norm": 3034.156494140625, "loss": 37.9083, "lr": 5.800000000000001e-05, "step": 30, "tokens_trained": 0.014740536 }, { "epoch": 0.009077370399262463, "grad_norm": 3040.314453125, "loss": 34.0659, "lr": 6.2e-05, "step": 32, "tokens_trained": 0.015725984 }, { "epoch": 0.009644706049216368, "grad_norm": 3065.5791015625, "loss": 27.7768, "lr": 6.6e-05, "step": 34, "tokens_trained": 0.016706864 }, { "epoch": 0.010212041699170272, "grad_norm": 2454.293701171875, "loss": 35.1143, "lr": 7.000000000000001e-05, "step": 36, "tokens_trained": 0.017688816 }, { "epoch": 0.010779377349124175, "grad_norm": 3100.7802734375, "loss": 42.2603, "lr": 7.4e-05, "step": 38, "tokens_trained": 0.018669072 }, { "epoch": 0.011346712999078079, "grad_norm": 2749.84423828125, "loss": 39.3879, "lr": 7.8e-05, "step": 40, "tokens_trained": 0.019652072 }, { "epoch": 0.011914048649031984, "grad_norm": 1519.9908447265625, "loss": 35.0735, "lr": 8.2e-05, "step": 42, "tokens_trained": 0.020633112 }, { "epoch": 0.012481384298985888, "grad_norm": 1474.4244384765625, "loss": 25.8965, "lr": 8.599999999999999e-05, "step": 44, "tokens_trained": 0.021616192 }, { "epoch": 0.013048719948939792, "grad_norm": 2962.500244140625, "loss": 51.0784, "lr": 8.999999999999999e-05, "step": 46, "tokens_trained": 0.022597288 }, { "epoch": 0.013616055598893695, "grad_norm": 2419.41455078125, "loss": 43.0334, "lr": 9.400000000000001e-05, "step": 48, "tokens_trained": 0.02357572 }, { "epoch": 0.014183391248847599, "grad_norm": 1267.87451171875, "loss": 21.8063, "lr": 9.800000000000001e-05, "step": 50, "tokens_trained": 0.024553376 }, { "epoch": 0.014750726898801504, "grad_norm": 1573.944091796875, "loss": 52.9693, "lr": 0.000102, "step": 52, "tokens_trained": 0.025536728 }, { "epoch": 0.015318062548755408, "grad_norm": 1509.650146484375, "loss": 50.0825, "lr": 0.000106, "step": 54, "tokens_trained": 0.026517 }, { "epoch": 0.01588539819870931, "grad_norm": 2334.765380859375, "loss": 42.1982, "lr": 0.00011, "step": 56, "tokens_trained": 0.027504728 }, { "epoch": 0.016452733848663217, "grad_norm": 1594.16259765625, "loss": 39.0562, "lr": 0.000114, "step": 58, "tokens_trained": 0.028485416 }, { "epoch": 0.01702006949861712, "grad_norm": 1628.082275390625, "loss": 35.0488, "lr": 0.000118, "step": 60, "tokens_trained": 0.029468696 }, { "epoch": 0.017587405148571024, "grad_norm": 2496.6455078125, "loss": 49.4241, "lr": 0.000122, "step": 62, "tokens_trained": 0.030453584 }, { "epoch": 0.018154740798524926, "grad_norm": 2521.721435546875, "loss": 69.0275, "lr": 0.000126, "step": 64, "tokens_trained": 0.031432864 }, { "epoch": 0.01872207644847883, "grad_norm": 2179.571533203125, "loss": 63.1409, "lr": 0.00013000000000000002, "step": 66, "tokens_trained": 0.032418416 }, { "epoch": 0.019289412098432736, "grad_norm": 899.7137451171875, "loss": 38.4131, "lr": 0.000134, "step": 68, "tokens_trained": 0.033402136 }, { "epoch": 0.01985674774838664, "grad_norm": 2109.377685546875, "loss": 51.0044, "lr": 0.00013800000000000002, "step": 70, "tokens_trained": 0.03438832 }, { "epoch": 0.020424083398340544, "grad_norm": 1649.1873779296875, "loss": 32.1408, "lr": 0.00014199999999999998, "step": 72, "tokens_trained": 0.035374464 }, { "epoch": 0.020991419048294446, "grad_norm": 1807.994140625, "loss": 28.8357, "lr": 0.000146, "step": 74, "tokens_trained": 0.03635784 }, { "epoch": 0.02155875469824835, "grad_norm": 998.9485473632812, "loss": 23.0343, "lr": 0.00015, "step": 76, "tokens_trained": 0.037340248 }, { "epoch": 0.022126090348202256, "grad_norm": 2240.17578125, "loss": 32.0397, "lr": 0.000154, "step": 78, "tokens_trained": 0.038321968 }, { "epoch": 0.022693425998156158, "grad_norm": 1606.0067138671875, "loss": 32.1776, "lr": 0.000158, "step": 80, "tokens_trained": 0.039304992 }, { "epoch": 0.023260761648110063, "grad_norm": 1685.1015625, "loss": 24.3428, "lr": 0.000162, "step": 82, "tokens_trained": 0.040286808 }, { "epoch": 0.02382809729806397, "grad_norm": 1761.7890625, "loss": 23.9261, "lr": 0.00016600000000000002, "step": 84, "tokens_trained": 0.041271776 }, { "epoch": 0.02439543294801787, "grad_norm": 2036.0982666015625, "loss": 27.7196, "lr": 0.00017, "step": 86, "tokens_trained": 0.042252784 }, { "epoch": 0.024962768597971776, "grad_norm": 1564.3870849609375, "loss": 25.3722, "lr": 0.000174, "step": 88, "tokens_trained": 0.04323596 }, { "epoch": 0.025530104247925678, "grad_norm": 1508.349853515625, "loss": 18.4107, "lr": 0.000178, "step": 90, "tokens_trained": 0.044218984 }, { "epoch": 0.026097439897879583, "grad_norm": 1955.011474609375, "loss": 28.8456, "lr": 0.000182, "step": 92, "tokens_trained": 0.045202144 }, { "epoch": 0.02666477554783349, "grad_norm": 1679.9423828125, "loss": 23.6139, "lr": 0.000186, "step": 94, "tokens_trained": 0.046192336 }, { "epoch": 0.02723211119778739, "grad_norm": 1517.5731201171875, "loss": 42.145, "lr": 0.00019, "step": 96, "tokens_trained": 0.047174312 }, { "epoch": 0.027799446847741296, "grad_norm": 1535.3076171875, "loss": 31.9711, "lr": 0.000194, "step": 98, "tokens_trained": 0.048158944 }, { "epoch": 0.028366782497695198, "grad_norm": 1475.2569580078125, "loss": 37.645, "lr": 0.00019800000000000002, "step": 100, "tokens_trained": 0.04914364 }, { "epoch": 0.028934118147649103, "grad_norm": 1918.4088134765625, "loss": 69.4053, "lr": 0.000202, "step": 102, "tokens_trained": 0.050123488 }, { "epoch": 0.02950145379760301, "grad_norm": 1631.6231689453125, "loss": 50.9725, "lr": 0.000206, "step": 104, "tokens_trained": 0.051105512 }, { "epoch": 0.03006878944755691, "grad_norm": 1291.6376953125, "loss": 22.6527, "lr": 0.00021, "step": 106, "tokens_trained": 0.052091704 }, { "epoch": 0.030636125097510816, "grad_norm": 1224.9625244140625, "loss": 60.2725, "lr": 0.000214, "step": 108, "tokens_trained": 0.053074824 }, { "epoch": 0.031203460747464717, "grad_norm": 1218.2022705078125, "loss": 75.8728, "lr": 0.000218, "step": 110, "tokens_trained": 0.054057104 }, { "epoch": 0.03177079639741862, "grad_norm": 1761.8861083984375, "loss": 61.6427, "lr": 0.000222, "step": 112, "tokens_trained": 0.055039128 }, { "epoch": 0.03233813204737253, "grad_norm": 1482.4256591796875, "loss": 35.3351, "lr": 0.00022600000000000002, "step": 114, "tokens_trained": 0.05602388 }, { "epoch": 0.03290546769732643, "grad_norm": 563.6399536132812, "loss": 40.1461, "lr": 0.00023, "step": 116, "tokens_trained": 0.057005376 }, { "epoch": 0.03347280334728033, "grad_norm": 1266.058837890625, "loss": 24.0657, "lr": 0.00023400000000000002, "step": 118, "tokens_trained": 0.057985136 }, { "epoch": 0.03404013899723424, "grad_norm": 918.206298828125, "loss": 23.9626, "lr": 0.00023799999999999998, "step": 120, "tokens_trained": 0.058968288 }, { "epoch": 0.03460747464718814, "grad_norm": 1495.7191162109375, "loss": 19.798, "lr": 0.000242, "step": 122, "tokens_trained": 0.05995348 }, { "epoch": 0.03517481029714205, "grad_norm": 1264.302734375, "loss": 31.5342, "lr": 0.000246, "step": 124, "tokens_trained": 0.060935832 }, { "epoch": 0.035458478122119, "eval_loss": 5.312118053436279, "eval_runtime": 21.3065, "step": 125, "tokens_trained": 0.061426608 }, { "epoch": 0.03574214594709595, "grad_norm": 907.4861450195312, "loss": 25.1262, "lr": 0.00025, "step": 126, "tokens_trained": 0.061918184 }, { "epoch": 0.03630948159704985, "grad_norm": 1287.6158447265625, "loss": 26.963, "lr": 0.000254, "step": 128, "tokens_trained": 0.062902328 }, { "epoch": 0.03687681724700376, "grad_norm": 1260.570556640625, "loss": 24.9633, "lr": 0.00025800000000000004, "step": 130, "tokens_trained": 0.063883456 }, { "epoch": 0.03744415289695766, "grad_norm": 1436.82373046875, "loss": 23.1028, "lr": 0.000262, "step": 132, "tokens_trained": 0.06486748 }, { "epoch": 0.03801148854691157, "grad_norm": 812.9523315429688, "loss": 20.5496, "lr": 0.000266, "step": 134, "tokens_trained": 0.065847104 }, { "epoch": 0.03857882419686547, "grad_norm": 1336.5322265625, "loss": 23.673, "lr": 0.00027, "step": 136, "tokens_trained": 0.066829928 }, { "epoch": 0.03914615984681937, "grad_norm": 1381.282470703125, "loss": 32.0373, "lr": 0.00027400000000000005, "step": 138, "tokens_trained": 0.067814024 }, { "epoch": 0.03971349549677328, "grad_norm": 972.7861938476562, "loss": 26.9454, "lr": 0.00027800000000000004, "step": 140, "tokens_trained": 0.068797744 }, { "epoch": 0.04028083114672718, "grad_norm": 1347.2249755859375, "loss": 22.3578, "lr": 0.00028199999999999997, "step": 142, "tokens_trained": 0.069780072 }, { "epoch": 0.04084816679668109, "grad_norm": 829.525390625, "loss": 37.9879, "lr": 0.00028599999999999996, "step": 144, "tokens_trained": 0.070759896 }, { "epoch": 0.04141550244663499, "grad_norm": 1094.1033935546875, "loss": 21.1972, "lr": 0.00029, "step": 146, "tokens_trained": 0.0717452 }, { "epoch": 0.04198283809658889, "grad_norm": 717.107421875, "loss": 21.7774, "lr": 0.000294, "step": 148, "tokens_trained": 0.072727432 }, { "epoch": 0.042550173746542796, "grad_norm": 744.4456787109375, "loss": 20.3235, "lr": 0.000298, "step": 150, "tokens_trained": 0.073712128 }, { "epoch": 0.0431175093964967, "grad_norm": 904.1460571289062, "loss": 22.7878, "lr": 0.000302, "step": 152, "tokens_trained": 0.074695296 }, { "epoch": 0.04368484504645061, "grad_norm": 1352.303955078125, "loss": 20.9757, "lr": 0.000306, "step": 154, "tokens_trained": 0.0756798 }, { "epoch": 0.04425218069640451, "grad_norm": 997.0473022460938, "loss": 17.4647, "lr": 0.00031, "step": 156, "tokens_trained": 0.076666504 }, { "epoch": 0.04481951634635841, "grad_norm": 1206.387939453125, "loss": 21.1846, "lr": 0.000314, "step": 158, "tokens_trained": 0.07764868 }, { "epoch": 0.045386851996312316, "grad_norm": 1029.6807861328125, "loss": 17.8853, "lr": 0.00031800000000000003, "step": 160, "tokens_trained": 0.07863548 }, { "epoch": 0.04595418764626622, "grad_norm": 1136.4635009765625, "loss": 30.057, "lr": 0.000322, "step": 162, "tokens_trained": 0.079618928 }, { "epoch": 0.04652152329622013, "grad_norm": 834.3464965820312, "loss": 28.1782, "lr": 0.000326, "step": 164, "tokens_trained": 0.0806032 }, { "epoch": 0.04708885894617403, "grad_norm": 1177.8365478515625, "loss": 16.4267, "lr": 0.00033, "step": 166, "tokens_trained": 0.081583752 }, { "epoch": 0.04765619459612794, "grad_norm": 572.501708984375, "loss": 16.5752, "lr": 0.00033400000000000004, "step": 168, "tokens_trained": 0.082568184 }, { "epoch": 0.048223530246081836, "grad_norm": 437.6822814941406, "loss": 11.5509, "lr": 0.00033800000000000003, "step": 170, "tokens_trained": 0.083553352 }, { "epoch": 0.04879086589603574, "grad_norm": 1119.0416259765625, "loss": 16.2689, "lr": 0.000342, "step": 172, "tokens_trained": 0.084536352 }, { "epoch": 0.04935820154598965, "grad_norm": 895.4021606445312, "loss": 12.6663, "lr": 0.000346, "step": 174, "tokens_trained": 0.085517312 }, { "epoch": 0.04992553719594355, "grad_norm": 995.6289672851562, "loss": 26.0663, "lr": 0.00035, "step": 176, "tokens_trained": 0.086496088 }, { "epoch": 0.05049287284589746, "grad_norm": 839.6610717773438, "loss": 21.5115, "lr": 0.000354, "step": 178, "tokens_trained": 0.087480632 }, { "epoch": 0.051060208495851356, "grad_norm": 734.1155395507812, "loss": 29.3287, "lr": 0.000358, "step": 180, "tokens_trained": 0.088460408 }, { "epoch": 0.05162754414580526, "grad_norm": 721.4505615234375, "loss": 26.0801, "lr": 0.000362, "step": 182, "tokens_trained": 0.08944248 }, { "epoch": 0.052194879795759166, "grad_norm": 845.9672241210938, "loss": 19.0639, "lr": 0.000366, "step": 184, "tokens_trained": 0.090427832 }, { "epoch": 0.05276221544571307, "grad_norm": 1210.9969482421875, "loss": 23.9036, "lr": 0.00037, "step": 186, "tokens_trained": 0.091411504 }, { "epoch": 0.05332955109566698, "grad_norm": 1079.1690673828125, "loss": 23.5588, "lr": 0.000374, "step": 188, "tokens_trained": 0.092392672 }, { "epoch": 0.053896886745620876, "grad_norm": 596.111328125, "loss": 20.8275, "lr": 0.000378, "step": 190, "tokens_trained": 0.093374696 }, { "epoch": 0.05446422239557478, "grad_norm": 761.8096923828125, "loss": 22.512, "lr": 0.000382, "step": 192, "tokens_trained": 0.094361912 }, { "epoch": 0.055031558045528686, "grad_norm": 1081.9832763671875, "loss": 32.335, "lr": 0.000386, "step": 194, "tokens_trained": 0.095342992 }, { "epoch": 0.05559889369548259, "grad_norm": 304.3534240722656, "loss": 11.5275, "lr": 0.00039000000000000005, "step": 196, "tokens_trained": 0.096323512 }, { "epoch": 0.0561662293454365, "grad_norm": 586.6314086914062, "loss": 16.2663, "lr": 0.00039400000000000004, "step": 198, "tokens_trained": 0.097308864 }, { "epoch": 0.056733564995390395, "grad_norm": 624.9953002929688, "loss": 16.627, "lr": 0.000398, "step": 200, "tokens_trained": 0.098289064 }, { "epoch": 0.0573009006453443, "grad_norm": 585.9645385742188, "loss": 15.8359, "lr": 0.000402, "step": 202, "tokens_trained": 0.099269696 }, { "epoch": 0.057868236295298206, "grad_norm": 537.9913330078125, "loss": 20.0779, "lr": 0.00040600000000000006, "step": 204, "tokens_trained": 0.100248448 }, { "epoch": 0.05843557194525211, "grad_norm": 805.04931640625, "loss": 21.4524, "lr": 0.00041, "step": 206, "tokens_trained": 0.101231248 }, { "epoch": 0.05900290759520602, "grad_norm": 439.1418151855469, "loss": 23.9852, "lr": 0.000414, "step": 208, "tokens_trained": 0.102210688 }, { "epoch": 0.059570243245159915, "grad_norm": 502.684814453125, "loss": 17.6273, "lr": 0.00041799999999999997, "step": 210, "tokens_trained": 0.103192176 }, { "epoch": 0.06013757889511382, "grad_norm": 849.9979858398438, "loss": 33.7517, "lr": 0.000422, "step": 212, "tokens_trained": 0.104172824 }, { "epoch": 0.060704914545067726, "grad_norm": 939.583740234375, "loss": 26.2559, "lr": 0.000426, "step": 214, "tokens_trained": 0.105156672 }, { "epoch": 0.06127225019502163, "grad_norm": 525.0505981445312, "loss": 20.0923, "lr": 0.00043, "step": 216, "tokens_trained": 0.106141368 }, { "epoch": 0.061839585844975536, "grad_norm": 420.296630859375, "loss": 17.9608, "lr": 0.00043400000000000003, "step": 218, "tokens_trained": 0.107124088 }, { "epoch": 0.062406921494929435, "grad_norm": 711.3380737304688, "loss": 19.387, "lr": 0.000438, "step": 220, "tokens_trained": 0.108112632 }, { "epoch": 0.06297425714488335, "grad_norm": 759.183349609375, "loss": 17.8061, "lr": 0.000442, "step": 222, "tokens_trained": 0.1090934 }, { "epoch": 0.06354159279483725, "grad_norm": 790.025146484375, "loss": 13.8539, "lr": 0.000446, "step": 224, "tokens_trained": 0.110079512 }, { "epoch": 0.06410892844479114, "grad_norm": 769.8306274414062, "loss": 22.1258, "lr": 0.00045000000000000004, "step": 226, "tokens_trained": 0.111060152 }, { "epoch": 0.06467626409474506, "grad_norm": 656.8352661132812, "loss": 14.8646, "lr": 0.00045400000000000003, "step": 228, "tokens_trained": 0.112044144 }, { "epoch": 0.06524359974469895, "grad_norm": 498.92010498046875, "loss": 23.1558, "lr": 0.000458, "step": 230, "tokens_trained": 0.113022928 }, { "epoch": 0.06581093539465287, "grad_norm": 764.0186157226562, "loss": 16.7089, "lr": 0.000462, "step": 232, "tokens_trained": 0.114003832 }, { "epoch": 0.06637827104460677, "grad_norm": 491.5793762207031, "loss": 12.3979, "lr": 0.00046600000000000005, "step": 234, "tokens_trained": 0.114991008 }, { "epoch": 0.06694560669456066, "grad_norm": 679.9217529296875, "loss": 14.9037, "lr": 0.00047, "step": 236, "tokens_trained": 0.115971888 }, { "epoch": 0.06751294234451458, "grad_norm": 491.0369567871094, "loss": 7.7603, "lr": 0.000474, "step": 238, "tokens_trained": 0.116952616 }, { "epoch": 0.06808027799446847, "grad_norm": 369.2186279296875, "loss": 8.2256, "lr": 0.00047799999999999996, "step": 240, "tokens_trained": 0.117935816 }, { "epoch": 0.06864761364442239, "grad_norm": 312.72137451171875, "loss": 7.5486, "lr": 0.000482, "step": 242, "tokens_trained": 0.118919392 }, { "epoch": 0.06921494929437629, "grad_norm": 596.1439208984375, "loss": 11.7351, "lr": 0.000486, "step": 244, "tokens_trained": 0.119901856 }, { "epoch": 0.06978228494433018, "grad_norm": 467.5667419433594, "loss": 11.8403, "lr": 0.00049, "step": 246, "tokens_trained": 0.120884624 }, { "epoch": 0.0703496205942841, "grad_norm": 430.50048828125, "loss": 13.8081, "lr": 0.000494, "step": 248, "tokens_trained": 0.121869224 }, { "epoch": 0.070916956244238, "grad_norm": 522.242919921875, "loss": 14.1892, "lr": 0.000498, "step": 250, "tokens_trained": 0.122853584 }, { "epoch": 0.070916956244238, "eval_loss": 1.9294606447219849, "eval_runtime": 20.4162, "step": 250, "tokens_trained": 0.122853584 }, { "epoch": 0.0714842918941919, "grad_norm": 835.2765502929688, "loss": 13.2462, "lr": 0.0005020000000000001, "step": 252, "tokens_trained": 0.123835544 }, { "epoch": 0.0720516275441458, "grad_norm": 714.8098754882812, "loss": 20.0498, "lr": 0.000506, "step": 254, "tokens_trained": 0.124821616 }, { "epoch": 0.0726189631940997, "grad_norm": 701.512939453125, "loss": 18.3664, "lr": 0.00051, "step": 256, "tokens_trained": 0.125807608 }, { "epoch": 0.07318629884405362, "grad_norm": 773.987060546875, "loss": 21.3807, "lr": 0.000514, "step": 258, "tokens_trained": 0.126791464 }, { "epoch": 0.07375363449400751, "grad_norm": 826.422119140625, "loss": 22.6403, "lr": 0.000518, "step": 260, "tokens_trained": 0.127771752 }, { "epoch": 0.07432097014396143, "grad_norm": 742.8673095703125, "loss": 20.1504, "lr": 0.000522, "step": 262, "tokens_trained": 0.128755448 }, { "epoch": 0.07488830579391532, "grad_norm": 797.79296875, "loss": 26.7343, "lr": 0.000526, "step": 264, "tokens_trained": 0.129741088 }, { "epoch": 0.07545564144386922, "grad_norm": 673.9141235351562, "loss": 12.505, "lr": 0.0005300000000000001, "step": 266, "tokens_trained": 0.130727504 }, { "epoch": 0.07602297709382314, "grad_norm": 310.6510925292969, "loss": 12.6344, "lr": 0.0005340000000000001, "step": 268, "tokens_trained": 0.131710296 }, { "epoch": 0.07659031274377703, "grad_norm": 312.40966796875, "loss": 14.254, "lr": 0.0005380000000000001, "step": 270, "tokens_trained": 0.132695352 }, { "epoch": 0.07715764839373095, "grad_norm": 492.2834777832031, "loss": 19.0979, "lr": 0.0005420000000000001, "step": 272, "tokens_trained": 0.133677928 }, { "epoch": 0.07772498404368484, "grad_norm": 628.457763671875, "loss": 21.7735, "lr": 0.000546, "step": 274, "tokens_trained": 0.134655504 }, { "epoch": 0.07829231969363874, "grad_norm": 382.8389892578125, "loss": 12.5128, "lr": 0.00055, "step": 276, "tokens_trained": 0.135640208 }, { "epoch": 0.07885965534359266, "grad_norm": 483.12335205078125, "loss": 15.2589, "lr": 0.000554, "step": 278, "tokens_trained": 0.136624232 }, { "epoch": 0.07942699099354655, "grad_norm": 640.658447265625, "loss": 12.1341, "lr": 0.000558, "step": 280, "tokens_trained": 0.13760628 }, { "epoch": 0.07999432664350047, "grad_norm": 410.0824279785156, "loss": 12.5723, "lr": 0.0005620000000000001, "step": 282, "tokens_trained": 0.13858832 }, { "epoch": 0.08056166229345436, "grad_norm": 513.2861328125, "loss": 14.8461, "lr": 0.000566, "step": 284, "tokens_trained": 0.139568424 }, { "epoch": 0.08112899794340826, "grad_norm": 564.547607421875, "loss": 12.5792, "lr": 0.00057, "step": 286, "tokens_trained": 0.140557016 }, { "epoch": 0.08169633359336217, "grad_norm": 451.3592834472656, "loss": 16.5433, "lr": 0.000574, "step": 288, "tokens_trained": 0.141540248 }, { "epoch": 0.08226366924331607, "grad_norm": 404.2495422363281, "loss": 16.4138, "lr": 0.000578, "step": 290, "tokens_trained": 0.142528272 }, { "epoch": 0.08283100489326999, "grad_norm": 566.5219116210938, "loss": 16.4743, "lr": 0.0005819999999999999, "step": 292, "tokens_trained": 0.143513096 }, { "epoch": 0.08339834054322388, "grad_norm": 559.6517333984375, "loss": 16.421, "lr": 0.0005859999999999999, "step": 294, "tokens_trained": 0.144494472 }, { "epoch": 0.08396567619317778, "grad_norm": 260.874755859375, "loss": 11.2214, "lr": 0.00059, "step": 296, "tokens_trained": 0.14547876 }, { "epoch": 0.0845330118431317, "grad_norm": 272.02899169921875, "loss": 10.3491, "lr": 0.000594, "step": 298, "tokens_trained": 0.146465864 }, { "epoch": 0.08510034749308559, "grad_norm": 556.9845581054688, "loss": 10.4348, "lr": 0.000598, "step": 300, "tokens_trained": 0.147446344 }, { "epoch": 0.0856676831430395, "grad_norm": 273.35772705078125, "loss": 8.3292, "lr": 0.000602, "step": 302, "tokens_trained": 0.14843244 }, { "epoch": 0.0862350187929934, "grad_norm": 246.6316680908203, "loss": 9.9362, "lr": 0.000606, "step": 304, "tokens_trained": 0.149415976 }, { "epoch": 0.0868023544429473, "grad_norm": 564.4365844726562, "loss": 9.2621, "lr": 0.00061, "step": 306, "tokens_trained": 0.150398728 }, { "epoch": 0.08736969009290121, "grad_norm": 396.0948791503906, "loss": 11.8526, "lr": 0.000614, "step": 308, "tokens_trained": 0.151385104 }, { "epoch": 0.08793702574285511, "grad_norm": 488.6072692871094, "loss": 11.8473, "lr": 0.0006180000000000001, "step": 310, "tokens_trained": 0.152373672 }, { "epoch": 0.08850436139280903, "grad_norm": 346.70660400390625, "loss": 12.0897, "lr": 0.000622, "step": 312, "tokens_trained": 0.153356256 }, { "epoch": 0.08907169704276292, "grad_norm": 382.40679931640625, "loss": 9.271, "lr": 0.000626, "step": 314, "tokens_trained": 0.154342632 }, { "epoch": 0.08963903269271682, "grad_norm": 288.7908935546875, "loss": 9.185, "lr": 0.00063, "step": 316, "tokens_trained": 0.1553238 }, { "epoch": 0.09020636834267073, "grad_norm": 337.5335388183594, "loss": 12.0555, "lr": 0.000634, "step": 318, "tokens_trained": 0.156313168 }, { "epoch": 0.09077370399262463, "grad_norm": 349.25531005859375, "loss": 8.51, "lr": 0.000638, "step": 320, "tokens_trained": 0.157299448 }, { "epoch": 0.09134103964257854, "grad_norm": 471.7824401855469, "loss": 14.1888, "lr": 0.000642, "step": 322, "tokens_trained": 0.158285264 }, { "epoch": 0.09190837529253244, "grad_norm": 284.94036865234375, "loss": 10.1593, "lr": 0.000646, "step": 324, "tokens_trained": 0.159267512 }, { "epoch": 0.09247571094248634, "grad_norm": 510.90478515625, "loss": 13.5744, "lr": 0.0006500000000000001, "step": 326, "tokens_trained": 0.160250856 }, { "epoch": 0.09304304659244025, "grad_norm": 373.82965087890625, "loss": 8.4999, "lr": 0.0006540000000000001, "step": 328, "tokens_trained": 0.161231832 }, { "epoch": 0.09361038224239415, "grad_norm": 219.3827362060547, "loss": 8.4436, "lr": 0.0006580000000000001, "step": 330, "tokens_trained": 0.162217656 }, { "epoch": 0.09417771789234806, "grad_norm": 433.0914001464844, "loss": 11.2019, "lr": 0.000662, "step": 332, "tokens_trained": 0.163199096 }, { "epoch": 0.09474505354230196, "grad_norm": 242.65907287597656, "loss": 9.0666, "lr": 0.000666, "step": 334, "tokens_trained": 0.164178512 }, { "epoch": 0.09531238919225588, "grad_norm": 446.07916259765625, "loss": 8.6546, "lr": 0.00067, "step": 336, "tokens_trained": 0.165162464 }, { "epoch": 0.09587972484220977, "grad_norm": 231.8892364501953, "loss": 7.5819, "lr": 0.000674, "step": 338, "tokens_trained": 0.166141536 }, { "epoch": 0.09644706049216367, "grad_norm": 100.7306137084961, "loss": 6.7047, "lr": 0.0006780000000000001, "step": 340, "tokens_trained": 0.167123944 }, { "epoch": 0.09701439614211758, "grad_norm": 78.11279296875, "loss": 5.9308, "lr": 0.0006820000000000001, "step": 342, "tokens_trained": 0.168105264 }, { "epoch": 0.09758173179207148, "grad_norm": 271.466064453125, "loss": 6.9141, "lr": 0.0006860000000000001, "step": 344, "tokens_trained": 0.169088912 }, { "epoch": 0.0981490674420254, "grad_norm": 252.54478454589844, "loss": 6.3281, "lr": 0.00069, "step": 346, "tokens_trained": 0.170077368 }, { "epoch": 0.0987164030919793, "grad_norm": 305.8559875488281, "loss": 6.443, "lr": 0.000694, "step": 348, "tokens_trained": 0.171057232 }, { "epoch": 0.09928373874193319, "grad_norm": 227.74374389648438, "loss": 6.552, "lr": 0.0006979999999999999, "step": 350, "tokens_trained": 0.172041376 }, { "epoch": 0.0998510743918871, "grad_norm": 446.7601623535156, "loss": 10.8184, "lr": 0.0007019999999999999, "step": 352, "tokens_trained": 0.173023624 }, { "epoch": 0.100418410041841, "grad_norm": 353.0849609375, "loss": 8.6327, "lr": 0.0007059999999999999, "step": 354, "tokens_trained": 0.174005992 }, { "epoch": 0.10098574569179491, "grad_norm": 367.9427185058594, "loss": 9.3898, "lr": 0.00071, "step": 356, "tokens_trained": 0.174988304 }, { "epoch": 0.10155308134174881, "grad_norm": 224.4961700439453, "loss": 8.284, "lr": 0.000714, "step": 358, "tokens_trained": 0.175969816 }, { "epoch": 0.10212041699170271, "grad_norm": 221.86537170410156, "loss": 7.0578, "lr": 0.000718, "step": 360, "tokens_trained": 0.176952688 }, { "epoch": 0.10268775264165662, "grad_norm": 331.0989685058594, "loss": 6.9561, "lr": 0.000722, "step": 362, "tokens_trained": 0.177935144 }, { "epoch": 0.10325508829161052, "grad_norm": 171.6498260498047, "loss": 7.203, "lr": 0.000726, "step": 364, "tokens_trained": 0.178916776 }, { "epoch": 0.10382242394156443, "grad_norm": 284.2208557128906, "loss": 10.3517, "lr": 0.00073, "step": 366, "tokens_trained": 0.179903432 }, { "epoch": 0.10438975959151833, "grad_norm": 354.8574523925781, "loss": 9.3888, "lr": 0.000734, "step": 368, "tokens_trained": 0.180883224 }, { "epoch": 0.10495709524147223, "grad_norm": 344.82574462890625, "loss": 10.5933, "lr": 0.000738, "step": 370, "tokens_trained": 0.181863808 }, { "epoch": 0.10552443089142614, "grad_norm": 302.6838073730469, "loss": 10.2832, "lr": 0.000742, "step": 372, "tokens_trained": 0.182843712 }, { "epoch": 0.10609176654138004, "grad_norm": 323.0387878417969, "loss": 6.4864, "lr": 0.000746, "step": 374, "tokens_trained": 0.183825832 }, { "epoch": 0.10637543436635699, "eval_loss": 1.4430732727050781, "eval_runtime": 20.5468, "step": 375, "tokens_trained": 0.184317744 }, { "epoch": 0.10665910219133395, "grad_norm": 133.74822998046875, "loss": 5.4176, "lr": 0.00075, "step": 376, "tokens_trained": 0.184811352 }, { "epoch": 0.10722643784128785, "grad_norm": 180.3372344970703, "loss": 5.5641, "lr": 0.000754, "step": 378, "tokens_trained": 0.185792528 }, { "epoch": 0.10779377349124175, "grad_norm": 250.83999633789062, "loss": 5.8612, "lr": 0.000758, "step": 380, "tokens_trained": 0.186777112 }, { "epoch": 0.10836110914119566, "grad_norm": 293.51959228515625, "loss": 6.0418, "lr": 0.000762, "step": 382, "tokens_trained": 0.18775724 }, { "epoch": 0.10892844479114956, "grad_norm": 292.56207275390625, "loss": 6.1812, "lr": 0.0007660000000000001, "step": 384, "tokens_trained": 0.188733568 }, { "epoch": 0.10949578044110347, "grad_norm": 121.82467651367188, "loss": 6.0855, "lr": 0.0007700000000000001, "step": 386, "tokens_trained": 0.189718512 }, { "epoch": 0.11006311609105737, "grad_norm": 124.30497741699219, "loss": 5.7734, "lr": 0.0007740000000000001, "step": 388, "tokens_trained": 0.190703776 }, { "epoch": 0.11063045174101127, "grad_norm": 143.64004516601562, "loss": 5.7641, "lr": 0.000778, "step": 390, "tokens_trained": 0.191689888 }, { "epoch": 0.11119778739096518, "grad_norm": 160.06784057617188, "loss": 5.6025, "lr": 0.000782, "step": 392, "tokens_trained": 0.192673992 }, { "epoch": 0.11176512304091908, "grad_norm": 226.97988891601562, "loss": 6.0049, "lr": 0.000786, "step": 394, "tokens_trained": 0.193656272 }, { "epoch": 0.112332458690873, "grad_norm": 223.26898193359375, "loss": 5.6972, "lr": 0.00079, "step": 396, "tokens_trained": 0.194639144 }, { "epoch": 0.11289979434082689, "grad_norm": 249.34912109375, "loss": 5.7348, "lr": 0.0007940000000000001, "step": 398, "tokens_trained": 0.195621256 }, { "epoch": 0.11346712999078079, "grad_norm": 161.34271240234375, "loss": 5.6689, "lr": 0.0007980000000000001, "step": 400, "tokens_trained": 0.196604136 }, { "epoch": 0.1140344656407347, "grad_norm": 148.53176879882812, "loss": 5.702, "lr": 0.0008020000000000001, "step": 402, "tokens_trained": 0.197586784 }, { "epoch": 0.1146018012906886, "grad_norm": 144.40835571289062, "loss": 6.2402, "lr": 0.0008060000000000001, "step": 404, "tokens_trained": 0.198570824 }, { "epoch": 0.11516913694064251, "grad_norm": 306.57562255859375, "loss": 7.1739, "lr": 0.0008100000000000001, "step": 406, "tokens_trained": 0.199548328 }, { "epoch": 0.11573647259059641, "grad_norm": 308.79180908203125, "loss": 6.0972, "lr": 0.0008139999999999999, "step": 408, "tokens_trained": 0.200532496 }, { "epoch": 0.11630380824055031, "grad_norm": 197.76791381835938, "loss": 6.3533, "lr": 0.0008179999999999999, "step": 410, "tokens_trained": 0.201514648 }, { "epoch": 0.11687114389050422, "grad_norm": 129.5694580078125, "loss": 6.9628, "lr": 0.0008219999999999999, "step": 412, "tokens_trained": 0.2024994 }, { "epoch": 0.11743847954045812, "grad_norm": 446.0195617675781, "loss": 11.7562, "lr": 0.000826, "step": 414, "tokens_trained": 0.20348012 }, { "epoch": 0.11800581519041203, "grad_norm": 355.5342712402344, "loss": 8.8055, "lr": 0.00083, "step": 416, "tokens_trained": 0.20446356 }, { "epoch": 0.11857315084036593, "grad_norm": 456.2491149902344, "loss": 9.606, "lr": 0.000834, "step": 418, "tokens_trained": 0.205445288 }, { "epoch": 0.11914048649031983, "grad_norm": 369.8676452636719, "loss": 8.385, "lr": 0.000838, "step": 420, "tokens_trained": 0.206427832 }, { "epoch": 0.11970782214027374, "grad_norm": 262.19073486328125, "loss": 9.0956, "lr": 0.000842, "step": 422, "tokens_trained": 0.207409848 }, { "epoch": 0.12027515779022764, "grad_norm": 120.3193130493164, "loss": 5.4937, "lr": 0.000846, "step": 424, "tokens_trained": 0.208391752 }, { "epoch": 0.12084249344018155, "grad_norm": 222.1111297607422, "loss": 8.9367, "lr": 0.00085, "step": 426, "tokens_trained": 0.20937384 }, { "epoch": 0.12140982909013545, "grad_norm": 137.16819763183594, "loss": 7.5876, "lr": 0.000854, "step": 428, "tokens_trained": 0.210358576 }, { "epoch": 0.12197716474008935, "grad_norm": 267.61846923828125, "loss": 8.817, "lr": 0.000858, "step": 430, "tokens_trained": 0.211340064 }, { "epoch": 0.12254450039004326, "grad_norm": 472.72906494140625, "loss": 8.203, "lr": 0.000862, "step": 432, "tokens_trained": 0.212321144 }, { "epoch": 0.12311183603999716, "grad_norm": 297.1420593261719, "loss": 10.987, "lr": 0.000866, "step": 434, "tokens_trained": 0.213300312 }, { "epoch": 0.12367917168995107, "grad_norm": 281.7297668457031, "loss": 7.6117, "lr": 0.00087, "step": 436, "tokens_trained": 0.214287624 }, { "epoch": 0.12424650733990497, "grad_norm": 203.09678649902344, "loss": 6.5638, "lr": 0.000874, "step": 438, "tokens_trained": 0.215272136 }, { "epoch": 0.12481384298985887, "grad_norm": 155.7823944091797, "loss": 6.1131, "lr": 0.000878, "step": 440, "tokens_trained": 0.216256392 }, { "epoch": 0.12538117863981277, "grad_norm": 189.86196899414062, "loss": 8.2565, "lr": 0.000882, "step": 442, "tokens_trained": 0.217242504 }, { "epoch": 0.1259485142897667, "grad_norm": 247.4568634033203, "loss": 7.1005, "lr": 0.0008860000000000001, "step": 444, "tokens_trained": 0.218226008 }, { "epoch": 0.1265158499397206, "grad_norm": 179.72825622558594, "loss": 6.3379, "lr": 0.0008900000000000001, "step": 446, "tokens_trained": 0.219210584 }, { "epoch": 0.1270831855896745, "grad_norm": 212.96356201171875, "loss": 7.2514, "lr": 0.000894, "step": 448, "tokens_trained": 0.220193952 }, { "epoch": 0.1276505212396284, "grad_norm": 105.67095947265625, "loss": 5.456, "lr": 0.000898, "step": 450, "tokens_trained": 0.221176936 }, { "epoch": 0.1282178568895823, "grad_norm": 302.9122619628906, "loss": 6.4018, "lr": 0.000902, "step": 452, "tokens_trained": 0.222161952 }, { "epoch": 0.12878519253953621, "grad_norm": 215.66561889648438, "loss": 6.2853, "lr": 0.000906, "step": 454, "tokens_trained": 0.223144912 }, { "epoch": 0.1293525281894901, "grad_norm": 272.9984130859375, "loss": 7.3902, "lr": 0.00091, "step": 456, "tokens_trained": 0.224127392 }, { "epoch": 0.129919863839444, "grad_norm": 200.7503662109375, "loss": 6.1637, "lr": 0.0009140000000000001, "step": 458, "tokens_trained": 0.22511648 }, { "epoch": 0.1304871994893979, "grad_norm": 93.23990631103516, "loss": 6.4867, "lr": 0.0009180000000000001, "step": 460, "tokens_trained": 0.226098144 }, { "epoch": 0.1310545351393518, "grad_norm": 274.37164306640625, "loss": 8.99, "lr": 0.0009220000000000001, "step": 462, "tokens_trained": 0.227081848 }, { "epoch": 0.13162187078930573, "grad_norm": 186.66322326660156, "loss": 8.7122, "lr": 0.0009260000000000001, "step": 464, "tokens_trained": 0.22806636 }, { "epoch": 0.13218920643925963, "grad_norm": 586.1035766601562, "loss": 9.1045, "lr": 0.00093, "step": 466, "tokens_trained": 0.229047872 }, { "epoch": 0.13275654208921353, "grad_norm": 227.55996704101562, "loss": 9.7276, "lr": 0.000934, "step": 468, "tokens_trained": 0.230031144 }, { "epoch": 0.13332387773916743, "grad_norm": 229.26609802246094, "loss": 6.6244, "lr": 0.0009379999999999999, "step": 470, "tokens_trained": 0.2310158 }, { "epoch": 0.13389121338912133, "grad_norm": 145.16331481933594, "loss": 5.759, "lr": 0.000942, "step": 472, "tokens_trained": 0.2319996 }, { "epoch": 0.13445854903907525, "grad_norm": 109.9937744140625, "loss": 5.4838, "lr": 0.000946, "step": 474, "tokens_trained": 0.232983808 }, { "epoch": 0.13502588468902915, "grad_norm": 135.74899291992188, "loss": 6.2738, "lr": 0.00095, "step": 476, "tokens_trained": 0.233963016 }, { "epoch": 0.13559322033898305, "grad_norm": 142.99449157714844, "loss": 5.8459, "lr": 0.000954, "step": 478, "tokens_trained": 0.234948864 }, { "epoch": 0.13616055598893695, "grad_norm": 198.66883850097656, "loss": 6.6626, "lr": 0.000958, "step": 480, "tokens_trained": 0.235932392 }, { "epoch": 0.13672789163889085, "grad_norm": 260.76507568359375, "loss": 6.9299, "lr": 0.000962, "step": 482, "tokens_trained": 0.236915664 }, { "epoch": 0.13729522728884477, "grad_norm": 267.97589111328125, "loss": 6.4343, "lr": 0.000966, "step": 484, "tokens_trained": 0.237896904 }, { "epoch": 0.13786256293879867, "grad_norm": 89.8781967163086, "loss": 6.3203, "lr": 0.0009699999999999999, "step": 486, "tokens_trained": 0.238874528 }, { "epoch": 0.13842989858875257, "grad_norm": 225.62985229492188, "loss": 6.2778, "lr": 0.000974, "step": 488, "tokens_trained": 0.2398588 }, { "epoch": 0.13899723423870647, "grad_norm": 85.84110260009766, "loss": 5.2786, "lr": 0.000978, "step": 490, "tokens_trained": 0.240839968 }, { "epoch": 0.13956456988866037, "grad_norm": 141.4368438720703, "loss": 5.5525, "lr": 0.000982, "step": 492, "tokens_trained": 0.241823544 }, { "epoch": 0.1401319055386143, "grad_norm": 94.9535140991211, "loss": 5.4386, "lr": 0.0009860000000000001, "step": 494, "tokens_trained": 0.242805456 }, { "epoch": 0.1406992411885682, "grad_norm": 157.4557647705078, "loss": 5.9786, "lr": 0.00099, "step": 496, "tokens_trained": 0.243792496 }, { "epoch": 0.1412665768385221, "grad_norm": 319.5025634765625, "loss": 7.04, "lr": 0.000994, "step": 498, "tokens_trained": 0.244772472 }, { "epoch": 0.141833912488476, "grad_norm": 282.26824951171875, "loss": 9.4037, "lr": 0.000998, "step": 500, "tokens_trained": 0.245758968 }, { "epoch": 0.141833912488476, "eval_loss": 2.152184247970581, "eval_runtime": 21.2772, "step": 500, "tokens_trained": 0.245758968 }, { "epoch": 0.1424012481384299, "grad_norm": 306.0666809082031, "loss": 7.8845, "lr": 0.00099986013986014, "step": 502, "tokens_trained": 0.246739024 }, { "epoch": 0.1429685837883838, "grad_norm": 188.89024353027344, "loss": 6.8118, "lr": 0.0009995804195804196, "step": 504, "tokens_trained": 0.247726552 }, { "epoch": 0.1435359194383377, "grad_norm": 228.97474670410156, "loss": 6.8475, "lr": 0.0009993006993006994, "step": 506, "tokens_trained": 0.24870688 }, { "epoch": 0.1441032550882916, "grad_norm": 229.80029296875, "loss": 6.2171, "lr": 0.000999020979020979, "step": 508, "tokens_trained": 0.249689096 }, { "epoch": 0.1446705907382455, "grad_norm": 157.30340576171875, "loss": 6.2281, "lr": 0.0009987412587412587, "step": 510, "tokens_trained": 0.250671768 }, { "epoch": 0.1452379263881994, "grad_norm": 176.64683532714844, "loss": 6.5993, "lr": 0.0009984615384615386, "step": 512, "tokens_trained": 0.25165608 }, { "epoch": 0.14580526203815333, "grad_norm": 197.20526123046875, "loss": 5.7267, "lr": 0.0009981818181818182, "step": 514, "tokens_trained": 0.252639712 }, { "epoch": 0.14637259768810723, "grad_norm": 54.713260650634766, "loss": 5.7911, "lr": 0.000997902097902098, "step": 516, "tokens_trained": 0.253622816 }, { "epoch": 0.14693993333806113, "grad_norm": 185.74923706054688, "loss": 7.0055, "lr": 0.0009976223776223777, "step": 518, "tokens_trained": 0.254602792 }, { "epoch": 0.14750726898801503, "grad_norm": 240.31021118164062, "loss": 6.452, "lr": 0.0009973426573426573, "step": 520, "tokens_trained": 0.255584736 }, { "epoch": 0.14807460463796893, "grad_norm": 160.2477264404297, "loss": 7.6556, "lr": 0.000997062937062937, "step": 522, "tokens_trained": 0.256563792 }, { "epoch": 0.14864194028792285, "grad_norm": 283.0034484863281, "loss": 6.5345, "lr": 0.0009967832167832168, "step": 524, "tokens_trained": 0.257546656 }, { "epoch": 0.14920927593787675, "grad_norm": 245.537109375, "loss": 6.3281, "lr": 0.0009965034965034964, "step": 526, "tokens_trained": 0.258530832 }, { "epoch": 0.14977661158783065, "grad_norm": 162.1538848876953, "loss": 7.4072, "lr": 0.0009962237762237763, "step": 528, "tokens_trained": 0.259514528 }, { "epoch": 0.15034394723778455, "grad_norm": 107.25792694091797, "loss": 5.356, "lr": 0.000995944055944056, "step": 530, "tokens_trained": 0.260500912 }, { "epoch": 0.15091128288773845, "grad_norm": 173.73353576660156, "loss": 6.8625, "lr": 0.0009956643356643356, "step": 532, "tokens_trained": 0.26148632 }, { "epoch": 0.15147861853769237, "grad_norm": 178.33541870117188, "loss": 5.8794, "lr": 0.0009953846153846154, "step": 534, "tokens_trained": 0.262468816 }, { "epoch": 0.15204595418764627, "grad_norm": 181.2533416748047, "loss": 7.0243, "lr": 0.000995104895104895, "step": 536, "tokens_trained": 0.263446696 }, { "epoch": 0.15261328983760017, "grad_norm": 208.79293823242188, "loss": 5.8908, "lr": 0.000994825174825175, "step": 538, "tokens_trained": 0.26443108 }, { "epoch": 0.15318062548755407, "grad_norm": 148.66285705566406, "loss": 6.0831, "lr": 0.0009945454545454546, "step": 540, "tokens_trained": 0.265414496 }, { "epoch": 0.15374796113750797, "grad_norm": 165.044189453125, "loss": 5.5594, "lr": 0.0009942657342657344, "step": 542, "tokens_trained": 0.266394128 }, { "epoch": 0.1543152967874619, "grad_norm": 124.5405502319336, "loss": 5.2442, "lr": 0.000993986013986014, "step": 544, "tokens_trained": 0.267378768 }, { "epoch": 0.1548826324374158, "grad_norm": 68.66510772705078, "loss": 5.1173, "lr": 0.0009937062937062937, "step": 546, "tokens_trained": 0.268360184 }, { "epoch": 0.1554499680873697, "grad_norm": 57.052860260009766, "loss": 5.2348, "lr": 0.0009934265734265735, "step": 548, "tokens_trained": 0.269345672 }, { "epoch": 0.1560173037373236, "grad_norm": 184.9175567626953, "loss": 6.7748, "lr": 0.0009931468531468532, "step": 550, "tokens_trained": 0.2703288 }, { "epoch": 0.15658463938727749, "grad_norm": 72.9861831665039, "loss": 5.7387, "lr": 0.000992867132867133, "step": 552, "tokens_trained": 0.271309176 }, { "epoch": 0.1571519750372314, "grad_norm": 135.864501953125, "loss": 6.3035, "lr": 0.0009925874125874127, "step": 554, "tokens_trained": 0.27229644 }, { "epoch": 0.1577193106871853, "grad_norm": 130.579833984375, "loss": 5.4434, "lr": 0.0009923076923076923, "step": 556, "tokens_trained": 0.273277904 }, { "epoch": 0.1582866463371392, "grad_norm": 206.77345275878906, "loss": 5.8649, "lr": 0.000992027972027972, "step": 558, "tokens_trained": 0.274261712 }, { "epoch": 0.1588539819870931, "grad_norm": 144.0505828857422, "loss": 5.3459, "lr": 0.0009917482517482518, "step": 560, "tokens_trained": 0.2752468 }, { "epoch": 0.159421317637047, "grad_norm": 87.56634521484375, "loss": 5.6321, "lr": 0.0009914685314685314, "step": 562, "tokens_trained": 0.276232384 }, { "epoch": 0.15998865328700093, "grad_norm": 275.2727355957031, "loss": 6.7515, "lr": 0.0009911888111888113, "step": 564, "tokens_trained": 0.277211608 }, { "epoch": 0.16055598893695483, "grad_norm": 97.00019836425781, "loss": 5.4374, "lr": 0.000990909090909091, "step": 566, "tokens_trained": 0.278196336 }, { "epoch": 0.16112332458690873, "grad_norm": 102.91439056396484, "loss": 5.729, "lr": 0.0009906293706293705, "step": 568, "tokens_trained": 0.279175672 }, { "epoch": 0.16169066023686263, "grad_norm": 151.12432861328125, "loss": 5.4189, "lr": 0.0009903496503496504, "step": 570, "tokens_trained": 0.280161088 }, { "epoch": 0.16225799588681653, "grad_norm": 86.6823959350586, "loss": 5.1704, "lr": 0.00099006993006993, "step": 572, "tokens_trained": 0.28114256 }, { "epoch": 0.16282533153677045, "grad_norm": 90.7052230834961, "loss": 5.3673, "lr": 0.0009897902097902099, "step": 574, "tokens_trained": 0.282128904 }, { "epoch": 0.16339266718672435, "grad_norm": 146.92874145507812, "loss": 5.5971, "lr": 0.0009895104895104895, "step": 576, "tokens_trained": 0.28311528 }, { "epoch": 0.16396000283667825, "grad_norm": 189.76296997070312, "loss": 5.3109, "lr": 0.0009892307692307694, "step": 578, "tokens_trained": 0.284098528 }, { "epoch": 0.16452733848663215, "grad_norm": 174.48092651367188, "loss": 5.68, "lr": 0.000988951048951049, "step": 580, "tokens_trained": 0.285081064 }, { "epoch": 0.16509467413658604, "grad_norm": 154.10816955566406, "loss": 5.3307, "lr": 0.0009886713286713286, "step": 582, "tokens_trained": 0.286067952 }, { "epoch": 0.16566200978653997, "grad_norm": 64.28263092041016, "loss": 5.1676, "lr": 0.0009883916083916085, "step": 584, "tokens_trained": 0.287051384 }, { "epoch": 0.16622934543649387, "grad_norm": 103.81795501708984, "loss": 5.3436, "lr": 0.0009881118881118881, "step": 586, "tokens_trained": 0.28803284 }, { "epoch": 0.16679668108644777, "grad_norm": 144.0076904296875, "loss": 5.3033, "lr": 0.000987832167832168, "step": 588, "tokens_trained": 0.289014824 }, { "epoch": 0.16736401673640167, "grad_norm": 88.31237030029297, "loss": 5.0609, "lr": 0.0009875524475524476, "step": 590, "tokens_trained": 0.289999864 }, { "epoch": 0.16793135238635556, "grad_norm": 68.4583740234375, "loss": 5.0702, "lr": 0.0009872727272727273, "step": 592, "tokens_trained": 0.290983888 }, { "epoch": 0.1684986880363095, "grad_norm": 135.28665161132812, "loss": 5.3962, "lr": 0.000986993006993007, "step": 594, "tokens_trained": 0.291965752 }, { "epoch": 0.1690660236862634, "grad_norm": 80.0412368774414, "loss": 5.0246, "lr": 0.0009867132867132867, "step": 596, "tokens_trained": 0.292946952 }, { "epoch": 0.1696333593362173, "grad_norm": 43.29194641113281, "loss": 5.0051, "lr": 0.0009864335664335664, "step": 598, "tokens_trained": 0.293928976 }, { "epoch": 0.17020069498617119, "grad_norm": 220.88687133789062, "loss": 6.0798, "lr": 0.0009861538461538462, "step": 600, "tokens_trained": 0.294912408 }, { "epoch": 0.17076803063612508, "grad_norm": 102.58654022216797, "loss": 5.1271, "lr": 0.0009858741258741259, "step": 602, "tokens_trained": 0.29589416 }, { "epoch": 0.171335366286079, "grad_norm": 119.0067138671875, "loss": 5.7402, "lr": 0.0009855944055944055, "step": 604, "tokens_trained": 0.296878584 }, { "epoch": 0.1719027019360329, "grad_norm": 138.8656005859375, "loss": 5.1951, "lr": 0.0009853146853146854, "step": 606, "tokens_trained": 0.297864552 }, { "epoch": 0.1724700375859868, "grad_norm": 73.5890884399414, "loss": 5.2522, "lr": 0.000985034965034965, "step": 608, "tokens_trained": 0.298854088 }, { "epoch": 0.1730373732359407, "grad_norm": 113.78330993652344, "loss": 5.6683, "lr": 0.0009847552447552449, "step": 610, "tokens_trained": 0.299835024 }, { "epoch": 0.1736047088858946, "grad_norm": 125.20297241210938, "loss": 5.1812, "lr": 0.0009844755244755245, "step": 612, "tokens_trained": 0.30082032 }, { "epoch": 0.17417204453584853, "grad_norm": 67.46041870117188, "loss": 5.0417, "lr": 0.0009841958041958043, "step": 614, "tokens_trained": 0.301808456 }, { "epoch": 0.17473938018580243, "grad_norm": 117.30754852294922, "loss": 5.3064, "lr": 0.000983916083916084, "step": 616, "tokens_trained": 0.302794456 }, { "epoch": 0.17530671583575633, "grad_norm": 124.30754089355469, "loss": 5.1614, "lr": 0.0009836363636363636, "step": 618, "tokens_trained": 0.303777376 }, { "epoch": 0.17587405148571023, "grad_norm": 102.72042083740234, "loss": 5.1265, "lr": 0.0009833566433566435, "step": 620, "tokens_trained": 0.304758864 }, { "epoch": 0.17644138713566412, "grad_norm": 39.332252502441406, "loss": 5.1078, "lr": 0.000983076923076923, "step": 622, "tokens_trained": 0.30574392 }, { "epoch": 0.17700872278561805, "grad_norm": 153.84811401367188, "loss": 5.7696, "lr": 0.000982797202797203, "step": 624, "tokens_trained": 0.306727584 }, { "epoch": 0.17729239061059499, "eval_loss": 1.3463915586471558, "eval_runtime": 20.8357, "step": 625, "tokens_trained": 0.307220496 }, { "epoch": 0.17757605843557195, "grad_norm": 160.2552490234375, "loss": 5.2283, "lr": 0.0009825174825174826, "step": 626, "tokens_trained": 0.307713024 }, { "epoch": 0.17814339408552585, "grad_norm": 186.77407836914062, "loss": 5.2866, "lr": 0.0009822377622377622, "step": 628, "tokens_trained": 0.308700128 }, { "epoch": 0.17871072973547975, "grad_norm": 84.55519104003906, "loss": 5.1106, "lr": 0.0009819580419580419, "step": 630, "tokens_trained": 0.309681208 }, { "epoch": 0.17927806538543364, "grad_norm": 20.617040634155273, "loss": 4.8327, "lr": 0.0009816783216783217, "step": 632, "tokens_trained": 0.310662224 }, { "epoch": 0.17984540103538757, "grad_norm": 168.06039428710938, "loss": 6.0704, "lr": 0.0009813986013986014, "step": 634, "tokens_trained": 0.31164064 }, { "epoch": 0.18041273668534147, "grad_norm": 238.23736572265625, "loss": 5.6188, "lr": 0.0009811188811188812, "step": 636, "tokens_trained": 0.312622568 }, { "epoch": 0.18098007233529537, "grad_norm": 140.0707550048828, "loss": 6.4034, "lr": 0.0009808391608391608, "step": 638, "tokens_trained": 0.313604944 }, { "epoch": 0.18154740798524927, "grad_norm": 161.19302368164062, "loss": 5.4906, "lr": 0.0009805594405594405, "step": 640, "tokens_trained": 0.314592072 }, { "epoch": 0.18211474363520316, "grad_norm": 121.9577407836914, "loss": 5.2097, "lr": 0.0009802797202797203, "step": 642, "tokens_trained": 0.315574392 }, { "epoch": 0.1826820792851571, "grad_norm": 121.25574493408203, "loss": 5.0317, "lr": 0.00098, "step": 644, "tokens_trained": 0.316559008 }, { "epoch": 0.183249414935111, "grad_norm": 28.328269958496094, "loss": 4.932, "lr": 0.0009797202797202798, "step": 646, "tokens_trained": 0.317538776 }, { "epoch": 0.1838167505850649, "grad_norm": 127.77408599853516, "loss": 5.8335, "lr": 0.0009794405594405595, "step": 648, "tokens_trained": 0.31851792 }, { "epoch": 0.18438408623501878, "grad_norm": 94.9522933959961, "loss": 5.1948, "lr": 0.000979160839160839, "step": 650, "tokens_trained": 0.319501576 }, { "epoch": 0.18495142188497268, "grad_norm": 110.33658599853516, "loss": 5.098, "lr": 0.000978881118881119, "step": 652, "tokens_trained": 0.320482392 }, { "epoch": 0.1855187575349266, "grad_norm": 67.23124694824219, "loss": 4.7723, "lr": 0.0009786013986013986, "step": 654, "tokens_trained": 0.32146712 }, { "epoch": 0.1860860931848805, "grad_norm": 61.519866943359375, "loss": 4.7245, "lr": 0.0009783216783216782, "step": 656, "tokens_trained": 0.322449576 }, { "epoch": 0.1866534288348344, "grad_norm": 99.51078033447266, "loss": 4.783, "lr": 0.000978041958041958, "step": 658, "tokens_trained": 0.323432688 }, { "epoch": 0.1872207644847883, "grad_norm": 44.619197845458984, "loss": 4.7495, "lr": 0.000977762237762238, "step": 660, "tokens_trained": 0.324413952 }, { "epoch": 0.18778810013474223, "grad_norm": 114.5891342163086, "loss": 5.1261, "lr": 0.0009774825174825176, "step": 662, "tokens_trained": 0.325394536 }, { "epoch": 0.18835543578469613, "grad_norm": 100.3728256225586, "loss": 4.7883, "lr": 0.0009772027972027972, "step": 664, "tokens_trained": 0.326374672 }, { "epoch": 0.18892277143465003, "grad_norm": 51.883033752441406, "loss": 4.7249, "lr": 0.0009769230769230768, "step": 666, "tokens_trained": 0.327357152 }, { "epoch": 0.18949010708460393, "grad_norm": 82.27507019042969, "loss": 4.8277, "lr": 0.0009766433566433567, "step": 668, "tokens_trained": 0.328342088 }, { "epoch": 0.19005744273455782, "grad_norm": 83.53064727783203, "loss": 4.8338, "lr": 0.0009763636363636363, "step": 670, "tokens_trained": 0.329319248 }, { "epoch": 0.19062477838451175, "grad_norm": 76.18387603759766, "loss": 4.6958, "lr": 0.0009760839160839161, "step": 672, "tokens_trained": 0.330305968 }, { "epoch": 0.19119211403446565, "grad_norm": 27.401426315307617, "loss": 4.6929, "lr": 0.0009758041958041958, "step": 674, "tokens_trained": 0.3312912 }, { "epoch": 0.19175944968441955, "grad_norm": 186.770263671875, "loss": 5.5089, "lr": 0.0009755244755244756, "step": 676, "tokens_trained": 0.332275224 }, { "epoch": 0.19232678533437345, "grad_norm": 105.02385711669922, "loss": 4.8876, "lr": 0.0009752447552447553, "step": 678, "tokens_trained": 0.33325588 }, { "epoch": 0.19289412098432734, "grad_norm": 94.96269989013672, "loss": 5.1235, "lr": 0.0009749650349650349, "step": 680, "tokens_trained": 0.334238408 }, { "epoch": 0.19346145663428127, "grad_norm": 92.29356384277344, "loss": 4.8194, "lr": 0.0009746853146853148, "step": 682, "tokens_trained": 0.335219368 }, { "epoch": 0.19402879228423517, "grad_norm": 59.1584358215332, "loss": 4.7511, "lr": 0.0009744055944055944, "step": 684, "tokens_trained": 0.336207136 }, { "epoch": 0.19459612793418907, "grad_norm": 54.759002685546875, "loss": 4.777, "lr": 0.0009741258741258742, "step": 686, "tokens_trained": 0.337193536 }, { "epoch": 0.19516346358414297, "grad_norm": 92.20452880859375, "loss": 4.8225, "lr": 0.0009738461538461538, "step": 688, "tokens_trained": 0.338179224 }, { "epoch": 0.19573079923409686, "grad_norm": 75.97005462646484, "loss": 4.655, "lr": 0.0009735664335664336, "step": 690, "tokens_trained": 0.339162168 }, { "epoch": 0.1962981348840508, "grad_norm": 58.19076919555664, "loss": 4.6446, "lr": 0.0009732867132867133, "step": 692, "tokens_trained": 0.340138904 }, { "epoch": 0.1968654705340047, "grad_norm": 50.81512451171875, "loss": 4.5866, "lr": 0.000973006993006993, "step": 694, "tokens_trained": 0.34112288 }, { "epoch": 0.1974328061839586, "grad_norm": 61.683372497558594, "loss": 4.6018, "lr": 0.0009727272727272728, "step": 696, "tokens_trained": 0.342111992 }, { "epoch": 0.19800014183391249, "grad_norm": 61.01798629760742, "loss": 4.6007, "lr": 0.0009724475524475524, "step": 698, "tokens_trained": 0.343095912 }, { "epoch": 0.19856747748386638, "grad_norm": 96.49671936035156, "loss": 4.7035, "lr": 0.0009721678321678323, "step": 700, "tokens_trained": 0.344078632 }, { "epoch": 0.1991348131338203, "grad_norm": 64.7771224975586, "loss": 4.8341, "lr": 0.0009718881118881119, "step": 702, "tokens_trained": 0.345060576 }, { "epoch": 0.1997021487837742, "grad_norm": 90.1478042602539, "loss": 4.7739, "lr": 0.0009716083916083917, "step": 704, "tokens_trained": 0.34604112 }, { "epoch": 0.2002694844337281, "grad_norm": 67.6308822631836, "loss": 4.6218, "lr": 0.0009713286713286713, "step": 706, "tokens_trained": 0.347023496 }, { "epoch": 0.200836820083682, "grad_norm": 40.50175094604492, "loss": 4.6008, "lr": 0.000971048951048951, "step": 708, "tokens_trained": 0.348005416 }, { "epoch": 0.2014041557336359, "grad_norm": 33.6448860168457, "loss": 4.5307, "lr": 0.0009707692307692308, "step": 710, "tokens_trained": 0.3489886 }, { "epoch": 0.20197149138358983, "grad_norm": 15.484851837158203, "loss": 4.5065, "lr": 0.0009704895104895105, "step": 712, "tokens_trained": 0.34997024 }, { "epoch": 0.20253882703354373, "grad_norm": 109.26301574707031, "loss": 4.9613, "lr": 0.0009702097902097903, "step": 714, "tokens_trained": 0.350958496 }, { "epoch": 0.20310616268349763, "grad_norm": 150.07492065429688, "loss": 4.8507, "lr": 0.0009699300699300699, "step": 716, "tokens_trained": 0.35193892 }, { "epoch": 0.20367349833345152, "grad_norm": 113.43978881835938, "loss": 5.4494, "lr": 0.0009696503496503498, "step": 718, "tokens_trained": 0.35291908 }, { "epoch": 0.20424083398340542, "grad_norm": 123.0071792602539, "loss": 4.9475, "lr": 0.0009693706293706294, "step": 720, "tokens_trained": 0.353896072 }, { "epoch": 0.20480816963335935, "grad_norm": 65.55500793457031, "loss": 4.7585, "lr": 0.0009690909090909091, "step": 722, "tokens_trained": 0.354878992 }, { "epoch": 0.20537550528331325, "grad_norm": 36.11159896850586, "loss": 4.6323, "lr": 0.0009688111888111888, "step": 724, "tokens_trained": 0.355863728 }, { "epoch": 0.20594284093326715, "grad_norm": 30.566436767578125, "loss": 4.53, "lr": 0.0009685314685314685, "step": 726, "tokens_trained": 0.356845272 }, { "epoch": 0.20651017658322104, "grad_norm": 59.01853561401367, "loss": 4.5283, "lr": 0.0009682517482517483, "step": 728, "tokens_trained": 0.357826656 }, { "epoch": 0.20707751223317494, "grad_norm": 91.78115844726562, "loss": 4.6149, "lr": 0.000967972027972028, "step": 730, "tokens_trained": 0.358809896 }, { "epoch": 0.20764484788312887, "grad_norm": 67.97398376464844, "loss": 4.617, "lr": 0.0009676923076923078, "step": 732, "tokens_trained": 0.359788736 }, { "epoch": 0.20821218353308277, "grad_norm": 42.82001876831055, "loss": 4.6134, "lr": 0.0009674125874125874, "step": 734, "tokens_trained": 0.360771744 }, { "epoch": 0.20877951918303667, "grad_norm": 63.52122116088867, "loss": 4.6995, "lr": 0.0009671328671328672, "step": 736, "tokens_trained": 0.361757656 }, { "epoch": 0.20934685483299056, "grad_norm": 116.39544677734375, "loss": 4.7153, "lr": 0.0009668531468531469, "step": 738, "tokens_trained": 0.362744008 }, { "epoch": 0.20991419048294446, "grad_norm": 40.74269485473633, "loss": 4.7978, "lr": 0.0009665734265734266, "step": 740, "tokens_trained": 0.36372872 }, { "epoch": 0.2104815261328984, "grad_norm": 114.29917907714844, "loss": 5.1683, "lr": 0.0009662937062937063, "step": 742, "tokens_trained": 0.364710536 }, { "epoch": 0.2110488617828523, "grad_norm": 115.83326721191406, "loss": 4.7642, "lr": 0.000966013986013986, "step": 744, "tokens_trained": 0.3656912 }, { "epoch": 0.21161619743280619, "grad_norm": 21.708093643188477, "loss": 4.8244, "lr": 0.0009657342657342657, "step": 746, "tokens_trained": 0.36667388 }, { "epoch": 0.21218353308276008, "grad_norm": 182.01918029785156, "loss": 5.6045, "lr": 0.0009654545454545455, "step": 748, "tokens_trained": 0.3676634 }, { "epoch": 0.21275086873271398, "grad_norm": 47.119319915771484, "loss": 4.7929, "lr": 0.0009651748251748252, "step": 750, "tokens_trained": 0.368647288 }, { "epoch": 0.21275086873271398, "eval_loss": 1.2186306715011597, "eval_runtime": 20.9362, "step": 750, "tokens_trained": 0.368647288 }, { "epoch": 0.2133182043826679, "grad_norm": 51.43566131591797, "loss": 4.7298, "lr": 0.0009648951048951049, "step": 752, "tokens_trained": 0.36962992 }, { "epoch": 0.2138855400326218, "grad_norm": 79.49323272705078, "loss": 5.0749, "lr": 0.0009646153846153846, "step": 754, "tokens_trained": 0.370616064 }, { "epoch": 0.2144528756825757, "grad_norm": 119.80200958251953, "loss": 4.8198, "lr": 0.0009643356643356644, "step": 756, "tokens_trained": 0.371596208 }, { "epoch": 0.2150202113325296, "grad_norm": 95.88092041015625, "loss": 4.7437, "lr": 0.0009640559440559441, "step": 758, "tokens_trained": 0.372579584 }, { "epoch": 0.2155875469824835, "grad_norm": 79.64202117919922, "loss": 4.9181, "lr": 0.0009637762237762237, "step": 760, "tokens_trained": 0.373563056 }, { "epoch": 0.21615488263243743, "grad_norm": 79.93920135498047, "loss": 4.6393, "lr": 0.0009634965034965035, "step": 762, "tokens_trained": 0.374547648 }, { "epoch": 0.21672221828239133, "grad_norm": 78.67620849609375, "loss": 4.6178, "lr": 0.0009632167832167832, "step": 764, "tokens_trained": 0.375531456 }, { "epoch": 0.21728955393234523, "grad_norm": 56.32818603515625, "loss": 4.6498, "lr": 0.000962937062937063, "step": 766, "tokens_trained": 0.376516896 }, { "epoch": 0.21785688958229912, "grad_norm": 45.35737228393555, "loss": 4.5812, "lr": 0.0009626573426573427, "step": 768, "tokens_trained": 0.377499752 }, { "epoch": 0.21842422523225302, "grad_norm": 58.13076400756836, "loss": 4.5793, "lr": 0.0009623776223776224, "step": 770, "tokens_trained": 0.37848276 }, { "epoch": 0.21899156088220695, "grad_norm": 55.620628356933594, "loss": 4.4865, "lr": 0.0009620979020979021, "step": 772, "tokens_trained": 0.379466296 }, { "epoch": 0.21955889653216085, "grad_norm": 77.26813507080078, "loss": 4.5671, "lr": 0.0009618181818181818, "step": 774, "tokens_trained": 0.380449888 }, { "epoch": 0.22012623218211474, "grad_norm": 45.00653839111328, "loss": 4.5923, "lr": 0.0009615384615384616, "step": 776, "tokens_trained": 0.381430352 }, { "epoch": 0.22069356783206864, "grad_norm": 52.77407455444336, "loss": 4.5094, "lr": 0.0009612587412587412, "step": 778, "tokens_trained": 0.382416152 }, { "epoch": 0.22126090348202254, "grad_norm": 36.721073150634766, "loss": 4.4536, "lr": 0.000960979020979021, "step": 780, "tokens_trained": 0.383396672 }, { "epoch": 0.22182823913197647, "grad_norm": 51.21247100830078, "loss": 4.4599, "lr": 0.0009606993006993007, "step": 782, "tokens_trained": 0.384380584 }, { "epoch": 0.22239557478193037, "grad_norm": 65.23794555664062, "loss": 4.5397, "lr": 0.0009604195804195805, "step": 784, "tokens_trained": 0.385361368 }, { "epoch": 0.22296291043188426, "grad_norm": 23.255144119262695, "loss": 4.5007, "lr": 0.0009601398601398602, "step": 786, "tokens_trained": 0.386341416 }, { "epoch": 0.22353024608183816, "grad_norm": 30.812740325927734, "loss": 4.5239, "lr": 0.0009598601398601398, "step": 788, "tokens_trained": 0.387324624 }, { "epoch": 0.22409758173179206, "grad_norm": 50.781219482421875, "loss": 4.5131, "lr": 0.0009595804195804196, "step": 790, "tokens_trained": 0.388312744 }, { "epoch": 0.224664917381746, "grad_norm": 47.88816452026367, "loss": 4.4622, "lr": 0.0009593006993006993, "step": 792, "tokens_trained": 0.38929852 }, { "epoch": 0.22523225303169989, "grad_norm": 49.32049560546875, "loss": 4.5053, "lr": 0.0009590209790209791, "step": 794, "tokens_trained": 0.390279792 }, { "epoch": 0.22579958868165378, "grad_norm": 36.98805618286133, "loss": 4.5144, "lr": 0.0009587412587412587, "step": 796, "tokens_trained": 0.391258904 }, { "epoch": 0.22636692433160768, "grad_norm": 24.88475799560547, "loss": 4.4992, "lr": 0.0009584615384615385, "step": 798, "tokens_trained": 0.392238976 }, { "epoch": 0.22693425998156158, "grad_norm": 38.89309310913086, "loss": 4.4853, "lr": 0.0009581818181818182, "step": 800, "tokens_trained": 0.393226312 }, { "epoch": 0.2275015956315155, "grad_norm": 34.86774444580078, "loss": 4.4519, "lr": 0.000957902097902098, "step": 802, "tokens_trained": 0.394206688 }, { "epoch": 0.2280689312814694, "grad_norm": 24.966291427612305, "loss": 4.456, "lr": 0.0009576223776223777, "step": 804, "tokens_trained": 0.395191608 }, { "epoch": 0.2286362669314233, "grad_norm": 12.218213081359863, "loss": 4.4266, "lr": 0.0009573426573426573, "step": 806, "tokens_trained": 0.396174512 }, { "epoch": 0.2292036025813772, "grad_norm": 50.817054748535156, "loss": 4.586, "lr": 0.0009570629370629371, "step": 808, "tokens_trained": 0.397156912 }, { "epoch": 0.2297709382313311, "grad_norm": 37.60087203979492, "loss": 4.4616, "lr": 0.0009567832167832168, "step": 810, "tokens_trained": 0.398140016 }, { "epoch": 0.23033827388128503, "grad_norm": 37.55678176879883, "loss": 4.4755, "lr": 0.0009565034965034966, "step": 812, "tokens_trained": 0.39912384 }, { "epoch": 0.23090560953123893, "grad_norm": 56.427215576171875, "loss": 4.5078, "lr": 0.0009562237762237762, "step": 814, "tokens_trained": 0.400111224 }, { "epoch": 0.23147294518119282, "grad_norm": 31.869827270507812, "loss": 4.5013, "lr": 0.0009559440559440559, "step": 816, "tokens_trained": 0.401094936 }, { "epoch": 0.23204028083114672, "grad_norm": 77.57958984375, "loss": 4.6977, "lr": 0.0009556643356643357, "step": 818, "tokens_trained": 0.402078888 }, { "epoch": 0.23260761648110062, "grad_norm": 52.50204849243164, "loss": 4.5142, "lr": 0.0009553846153846154, "step": 820, "tokens_trained": 0.403059904 }, { "epoch": 0.23317495213105455, "grad_norm": 32.34305191040039, "loss": 4.4828, "lr": 0.0009551048951048952, "step": 822, "tokens_trained": 0.404049848 }, { "epoch": 0.23374228778100845, "grad_norm": 52.08961486816406, "loss": 4.4869, "lr": 0.0009548251748251748, "step": 824, "tokens_trained": 0.405033872 }, { "epoch": 0.23430962343096234, "grad_norm": 44.32194900512695, "loss": 4.4802, "lr": 0.0009545454545454546, "step": 826, "tokens_trained": 0.406017872 }, { "epoch": 0.23487695908091624, "grad_norm": 30.941524505615234, "loss": 4.4323, "lr": 0.0009542657342657343, "step": 828, "tokens_trained": 0.40700704 }, { "epoch": 0.23544429473087014, "grad_norm": 20.52709197998047, "loss": 4.4919, "lr": 0.000953986013986014, "step": 830, "tokens_trained": 0.407991512 }, { "epoch": 0.23601163038082407, "grad_norm": 86.80307006835938, "loss": 4.8228, "lr": 0.0009537062937062937, "step": 832, "tokens_trained": 0.408979272 }, { "epoch": 0.23657896603077797, "grad_norm": 73.71435546875, "loss": 4.5954, "lr": 0.0009534265734265734, "step": 834, "tokens_trained": 0.409962984 }, { "epoch": 0.23714630168073186, "grad_norm": 66.3813247680664, "loss": 4.5969, "lr": 0.0009531468531468532, "step": 836, "tokens_trained": 0.410945248 }, { "epoch": 0.23771363733068576, "grad_norm": 86.94453430175781, "loss": 4.5894, "lr": 0.0009528671328671329, "step": 838, "tokens_trained": 0.411930872 }, { "epoch": 0.23828097298063966, "grad_norm": 61.28915786743164, "loss": 4.5613, "lr": 0.0009525874125874127, "step": 840, "tokens_trained": 0.412912608 }, { "epoch": 0.2388483086305936, "grad_norm": 65.02153778076172, "loss": 4.5398, "lr": 0.0009523076923076923, "step": 842, "tokens_trained": 0.413897488 }, { "epoch": 0.23941564428054748, "grad_norm": 54.01200485229492, "loss": 4.4922, "lr": 0.000952027972027972, "step": 844, "tokens_trained": 0.414872888 }, { "epoch": 0.23998297993050138, "grad_norm": 66.7095718383789, "loss": 4.5317, "lr": 0.0009517482517482518, "step": 846, "tokens_trained": 0.415856296 }, { "epoch": 0.24055031558045528, "grad_norm": 64.23979949951172, "loss": 4.4686, "lr": 0.0009514685314685315, "step": 848, "tokens_trained": 0.416843344 }, { "epoch": 0.24111765123040918, "grad_norm": 51.012840270996094, "loss": 4.4544, "lr": 0.0009511888111888112, "step": 850, "tokens_trained": 0.41782032 }, { "epoch": 0.2416849868803631, "grad_norm": 40.83076095581055, "loss": 4.4665, "lr": 0.0009509090909090909, "step": 852, "tokens_trained": 0.418805672 }, { "epoch": 0.242252322530317, "grad_norm": 48.31489944458008, "loss": 4.4748, "lr": 0.0009506293706293707, "step": 854, "tokens_trained": 0.419786344 }, { "epoch": 0.2428196581802709, "grad_norm": 50.08705520629883, "loss": 4.4973, "lr": 0.0009503496503496504, "step": 856, "tokens_trained": 0.420768872 }, { "epoch": 0.2433869938302248, "grad_norm": 26.840139389038086, "loss": 4.461, "lr": 0.0009500699300699301, "step": 858, "tokens_trained": 0.421750296 }, { "epoch": 0.2439543294801787, "grad_norm": 24.721454620361328, "loss": 4.4246, "lr": 0.0009497902097902098, "step": 860, "tokens_trained": 0.422730976 }, { "epoch": 0.24452166513013263, "grad_norm": 63.147926330566406, "loss": 4.623, "lr": 0.0009495104895104895, "step": 862, "tokens_trained": 0.423715768 }, { "epoch": 0.24508900078008652, "grad_norm": 50.99778747558594, "loss": 4.4663, "lr": 0.0009492307692307693, "step": 864, "tokens_trained": 0.424697072 }, { "epoch": 0.24565633643004042, "grad_norm": 38.0300407409668, "loss": 4.4649, "lr": 0.000948951048951049, "step": 866, "tokens_trained": 0.425681392 }, { "epoch": 0.24622367207999432, "grad_norm": 19.017776489257812, "loss": 4.4296, "lr": 0.0009486713286713286, "step": 868, "tokens_trained": 0.426665088 }, { "epoch": 0.24679100772994822, "grad_norm": 24.02813148498535, "loss": 4.4958, "lr": 0.0009483916083916084, "step": 870, "tokens_trained": 0.427646016 }, { "epoch": 0.24735834337990215, "grad_norm": 59.40018081665039, "loss": 4.5919, "lr": 0.0009481118881118881, "step": 872, "tokens_trained": 0.428628048 }, { "epoch": 0.24792567902985604, "grad_norm": 61.13710403442383, "loss": 4.4642, "lr": 0.0009478321678321679, "step": 874, "tokens_trained": 0.4296112 }, { "epoch": 0.24820934685483298, "eval_loss": 1.1135390996932983, "eval_runtime": 20.4738, "step": 875, "tokens_trained": 0.430109024 }, { "epoch": 0.24849301467980994, "grad_norm": 47.920021057128906, "loss": 4.4832, "lr": 0.0009475524475524476, "step": 876, "tokens_trained": 0.430599208 }, { "epoch": 0.24906035032976384, "grad_norm": 25.661701202392578, "loss": 4.4176, "lr": 0.0009472727272727273, "step": 878, "tokens_trained": 0.43158356 }, { "epoch": 0.24962768597971774, "grad_norm": 32.86565399169922, "loss": 4.405, "lr": 0.000946993006993007, "step": 880, "tokens_trained": 0.432570584 }, { "epoch": 0.25019502162967167, "grad_norm": 23.443584442138672, "loss": 4.4218, "lr": 0.0009467132867132868, "step": 882, "tokens_trained": 0.433557672 }, { "epoch": 0.25076235727962554, "grad_norm": 28.315975189208984, "loss": 4.4019, "lr": 0.0009464335664335665, "step": 884, "tokens_trained": 0.434542736 }, { "epoch": 0.25132969292957946, "grad_norm": 31.056642532348633, "loss": 4.4027, "lr": 0.0009461538461538461, "step": 886, "tokens_trained": 0.43553112 }, { "epoch": 0.2518970285795334, "grad_norm": 13.661805152893066, "loss": 4.3745, "lr": 0.0009458741258741259, "step": 888, "tokens_trained": 0.436511584 }, { "epoch": 0.25246436422948726, "grad_norm": 47.04901885986328, "loss": 4.4875, "lr": 0.0009455944055944056, "step": 890, "tokens_trained": 0.43749464 }, { "epoch": 0.2530316998794412, "grad_norm": 84.91446685791016, "loss": 4.5185, "lr": 0.0009453146853146854, "step": 892, "tokens_trained": 0.43847764 }, { "epoch": 0.25359903552939506, "grad_norm": 40.9110107421875, "loss": 4.5735, "lr": 0.000945034965034965, "step": 894, "tokens_trained": 0.439461496 }, { "epoch": 0.254166371179349, "grad_norm": 58.98877716064453, "loss": 4.5146, "lr": 0.0009447552447552447, "step": 896, "tokens_trained": 0.440443656 }, { "epoch": 0.2547337068293029, "grad_norm": 34.037315368652344, "loss": 4.4714, "lr": 0.0009444755244755245, "step": 898, "tokens_trained": 0.441423496 }, { "epoch": 0.2553010424792568, "grad_norm": 24.91920280456543, "loss": 4.4334, "lr": 0.0009441958041958042, "step": 900, "tokens_trained": 0.442407408 }, { "epoch": 0.2558683781292107, "grad_norm": 30.612323760986328, "loss": 4.4459, "lr": 0.000943916083916084, "step": 902, "tokens_trained": 0.443383464 }, { "epoch": 0.2564357137791646, "grad_norm": 50.595577239990234, "loss": 4.4848, "lr": 0.0009436363636363636, "step": 904, "tokens_trained": 0.4443674 }, { "epoch": 0.2570030494291185, "grad_norm": 41.3300895690918, "loss": 4.4445, "lr": 0.0009433566433566434, "step": 906, "tokens_trained": 0.445346072 }, { "epoch": 0.25757038507907243, "grad_norm": 48.33689880371094, "loss": 4.4058, "lr": 0.0009430769230769231, "step": 908, "tokens_trained": 0.446329872 }, { "epoch": 0.2581377207290263, "grad_norm": 39.081382751464844, "loss": 4.4321, "lr": 0.0009427972027972029, "step": 910, "tokens_trained": 0.447309544 }, { "epoch": 0.2587050563789802, "grad_norm": 62.18062210083008, "loss": 4.4672, "lr": 0.0009425174825174825, "step": 912, "tokens_trained": 0.448295056 }, { "epoch": 0.2592723920289341, "grad_norm": 28.725404739379883, "loss": 4.4786, "lr": 0.0009422377622377622, "step": 914, "tokens_trained": 0.449274208 }, { "epoch": 0.259839727678888, "grad_norm": 47.55582809448242, "loss": 4.4227, "lr": 0.000941958041958042, "step": 916, "tokens_trained": 0.450256408 }, { "epoch": 0.26040706332884195, "grad_norm": 35.743125915527344, "loss": 4.379, "lr": 0.0009416783216783217, "step": 918, "tokens_trained": 0.45123684 }, { "epoch": 0.2609743989787958, "grad_norm": 31.489402770996094, "loss": 4.3888, "lr": 0.0009413986013986015, "step": 920, "tokens_trained": 0.45221748 }, { "epoch": 0.26154173462874974, "grad_norm": 36.46233367919922, "loss": 4.3982, "lr": 0.0009411188811188811, "step": 922, "tokens_trained": 0.453202064 }, { "epoch": 0.2621090702787036, "grad_norm": 41.6457633972168, "loss": 4.385, "lr": 0.0009408391608391608, "step": 924, "tokens_trained": 0.454183456 }, { "epoch": 0.26267640592865754, "grad_norm": 26.52242088317871, "loss": 4.4091, "lr": 0.0009405594405594406, "step": 926, "tokens_trained": 0.455165496 }, { "epoch": 0.26324374157861147, "grad_norm": 14.401509284973145, "loss": 4.3549, "lr": 0.0009402797202797203, "step": 928, "tokens_trained": 0.456150248 }, { "epoch": 0.26381107722856534, "grad_norm": 30.626131057739258, "loss": 4.3325, "lr": 0.00094, "step": 930, "tokens_trained": 0.457134184 }, { "epoch": 0.26437841287851926, "grad_norm": 63.74067687988281, "loss": 4.442, "lr": 0.0009397202797202797, "step": 932, "tokens_trained": 0.458118808 }, { "epoch": 0.26494574852847314, "grad_norm": 12.15156364440918, "loss": 4.4658, "lr": 0.0009394405594405595, "step": 934, "tokens_trained": 0.459103872 }, { "epoch": 0.26551308417842706, "grad_norm": 76.2789306640625, "loss": 4.8153, "lr": 0.0009391608391608392, "step": 936, "tokens_trained": 0.460087216 }, { "epoch": 0.266080419828381, "grad_norm": 63.919334411621094, "loss": 4.5707, "lr": 0.000938881118881119, "step": 938, "tokens_trained": 0.461070568 }, { "epoch": 0.26664775547833486, "grad_norm": 75.1481704711914, "loss": 4.5931, "lr": 0.0009386013986013986, "step": 940, "tokens_trained": 0.462055184 }, { "epoch": 0.2672150911282888, "grad_norm": 33.118961334228516, "loss": 4.4723, "lr": 0.0009383216783216783, "step": 942, "tokens_trained": 0.463034592 }, { "epoch": 0.26778242677824265, "grad_norm": 30.8759765625, "loss": 4.4275, "lr": 0.0009380419580419581, "step": 944, "tokens_trained": 0.464016816 }, { "epoch": 0.2683497624281966, "grad_norm": 41.05061340332031, "loss": 4.4566, "lr": 0.0009377622377622378, "step": 946, "tokens_trained": 0.465000872 }, { "epoch": 0.2689170980781505, "grad_norm": 30.93424415588379, "loss": 4.3985, "lr": 0.0009374825174825175, "step": 948, "tokens_trained": 0.465984096 }, { "epoch": 0.2694844337281044, "grad_norm": 29.477052688598633, "loss": 4.3718, "lr": 0.0009372027972027972, "step": 950, "tokens_trained": 0.466961752 }, { "epoch": 0.2700517693780583, "grad_norm": 21.568912506103516, "loss": 4.3697, "lr": 0.0009369230769230769, "step": 952, "tokens_trained": 0.467950088 }, { "epoch": 0.2706191050280122, "grad_norm": 41.66835021972656, "loss": 4.4241, "lr": 0.0009366433566433567, "step": 954, "tokens_trained": 0.468928736 }, { "epoch": 0.2711864406779661, "grad_norm": 68.04551696777344, "loss": 4.3978, "lr": 0.0009363636363636364, "step": 956, "tokens_trained": 0.469907496 }, { "epoch": 0.27175377632792, "grad_norm": 37.655181884765625, "loss": 4.4497, "lr": 0.0009360839160839161, "step": 958, "tokens_trained": 0.470889168 }, { "epoch": 0.2723211119778739, "grad_norm": 22.074953079223633, "loss": 4.3918, "lr": 0.0009358041958041958, "step": 960, "tokens_trained": 0.471871816 }, { "epoch": 0.2728884476278278, "grad_norm": 49.925777435302734, "loss": 4.4745, "lr": 0.0009355244755244755, "step": 962, "tokens_trained": 0.472856728 }, { "epoch": 0.2734557832777817, "grad_norm": 46.520851135253906, "loss": 4.403, "lr": 0.0009352447552447553, "step": 964, "tokens_trained": 0.473838544 }, { "epoch": 0.2740231189277356, "grad_norm": 25.053146362304688, "loss": 4.4247, "lr": 0.0009349650349650349, "step": 966, "tokens_trained": 0.474819976 }, { "epoch": 0.27459045457768955, "grad_norm": 30.127140045166016, "loss": 4.3834, "lr": 0.0009346853146853147, "step": 968, "tokens_trained": 0.475800696 }, { "epoch": 0.2751577902276434, "grad_norm": 41.478328704833984, "loss": 4.3978, "lr": 0.0009344055944055944, "step": 970, "tokens_trained": 0.4767834 }, { "epoch": 0.27572512587759734, "grad_norm": 23.739456176757812, "loss": 4.3698, "lr": 0.0009341258741258742, "step": 972, "tokens_trained": 0.47776944 }, { "epoch": 0.2762924615275512, "grad_norm": 21.813220977783203, "loss": 4.3902, "lr": 0.0009338461538461539, "step": 974, "tokens_trained": 0.478757048 }, { "epoch": 0.27685979717750514, "grad_norm": 64.79598999023438, "loss": 4.5237, "lr": 0.0009335664335664336, "step": 976, "tokens_trained": 0.47973872 }, { "epoch": 0.27742713282745907, "grad_norm": 68.32705688476562, "loss": 4.4461, "lr": 0.0009332867132867133, "step": 978, "tokens_trained": 0.480721912 }, { "epoch": 0.27799446847741294, "grad_norm": 41.857582092285156, "loss": 4.4663, "lr": 0.0009330069930069929, "step": 980, "tokens_trained": 0.481704248 }, { "epoch": 0.27856180412736686, "grad_norm": 28.30609893798828, "loss": 4.3461, "lr": 0.0009327272727272728, "step": 982, "tokens_trained": 0.482689768 }, { "epoch": 0.27912913977732073, "grad_norm": 33.207950592041016, "loss": 4.4185, "lr": 0.0009324475524475524, "step": 984, "tokens_trained": 0.483670008 }, { "epoch": 0.27969647542727466, "grad_norm": 29.541227340698242, "loss": 4.388, "lr": 0.0009321678321678322, "step": 986, "tokens_trained": 0.48465836 }, { "epoch": 0.2802638110772286, "grad_norm": 16.23346710205078, "loss": 4.3219, "lr": 0.0009318881118881119, "step": 988, "tokens_trained": 0.4856402 }, { "epoch": 0.28083114672718246, "grad_norm": 20.036178588867188, "loss": 4.3273, "lr": 0.0009316083916083917, "step": 990, "tokens_trained": 0.486621648 }, { "epoch": 0.2813984823771364, "grad_norm": 49.25468063354492, "loss": 4.4649, "lr": 0.0009313286713286714, "step": 992, "tokens_trained": 0.48760744 }, { "epoch": 0.28196581802709025, "grad_norm": 48.59744644165039, "loss": 4.3979, "lr": 0.000931048951048951, "step": 994, "tokens_trained": 0.488590472 }, { "epoch": 0.2825331536770442, "grad_norm": 16.33649253845215, "loss": 4.3945, "lr": 0.0009307692307692308, "step": 996, "tokens_trained": 0.489570976 }, { "epoch": 0.2831004893269981, "grad_norm": 60.632591247558594, "loss": 4.5581, "lr": 0.0009304895104895104, "step": 998, "tokens_trained": 0.490552296 }, { "epoch": 0.283667824976952, "grad_norm": 52.75735092163086, "loss": 4.424, "lr": 0.0009302097902097903, "step": 1000, "tokens_trained": 0.49153744 }, { "epoch": 0.283667824976952, "eval_loss": 1.1363450288772583, "eval_runtime": 20.7491, "step": 1000, "tokens_trained": 0.49153744 }, { "epoch": 0.2842351606269059, "grad_norm": 20.506614685058594, "loss": 4.4241, "lr": 0.0009299300699300699, "step": 1002, "tokens_trained": 0.492522608 }, { "epoch": 0.2848024962768598, "grad_norm": 23.148601531982422, "loss": 4.3975, "lr": 0.0009296503496503497, "step": 1004, "tokens_trained": 0.493501384 }, { "epoch": 0.2853698319268137, "grad_norm": 9.550869941711426, "loss": 4.3952, "lr": 0.0009293706293706294, "step": 1006, "tokens_trained": 0.494482544 }, { "epoch": 0.2859371675767676, "grad_norm": 80.31155395507812, "loss": 4.7614, "lr": 0.0009290909090909091, "step": 1008, "tokens_trained": 0.495459416 }, { "epoch": 0.2865045032267215, "grad_norm": 61.021026611328125, "loss": 4.4396, "lr": 0.0009288111888111889, "step": 1010, "tokens_trained": 0.4964418 }, { "epoch": 0.2870718388766754, "grad_norm": 35.23258972167969, "loss": 4.5548, "lr": 0.0009285314685314685, "step": 1012, "tokens_trained": 0.497428288 }, { "epoch": 0.2876391745266293, "grad_norm": 36.45478057861328, "loss": 4.46, "lr": 0.0009282517482517483, "step": 1014, "tokens_trained": 0.498416832 }, { "epoch": 0.2882065101765832, "grad_norm": 46.622982025146484, "loss": 4.3554, "lr": 0.0009279720279720279, "step": 1016, "tokens_trained": 0.499399792 }, { "epoch": 0.28877384582653715, "grad_norm": 87.00289154052734, "loss": 4.5276, "lr": 0.0009276923076923078, "step": 1018, "tokens_trained": 0.500383776 }, { "epoch": 0.289341181476491, "grad_norm": 11.444964408874512, "loss": 4.5483, "lr": 0.0009274125874125874, "step": 1020, "tokens_trained": 0.50136468 }, { "epoch": 0.28990851712644494, "grad_norm": 89.05914306640625, "loss": 4.8957, "lr": 0.0009271328671328671, "step": 1022, "tokens_trained": 0.50235172 }, { "epoch": 0.2904758527763988, "grad_norm": 26.915477752685547, "loss": 4.6184, "lr": 0.0009268531468531469, "step": 1024, "tokens_trained": 0.50333208 }, { "epoch": 0.29104318842635274, "grad_norm": 44.32100296020508, "loss": 4.5263, "lr": 0.0009265734265734266, "step": 1026, "tokens_trained": 0.504314656 }, { "epoch": 0.29161052407630667, "grad_norm": 26.699670791625977, "loss": 4.3871, "lr": 0.0009262937062937064, "step": 1028, "tokens_trained": 0.505296568 }, { "epoch": 0.29217785972626054, "grad_norm": 27.469482421875, "loss": 4.3558, "lr": 0.000926013986013986, "step": 1030, "tokens_trained": 0.506280416 }, { "epoch": 0.29274519537621446, "grad_norm": 26.149612426757812, "loss": 4.3368, "lr": 0.0009257342657342658, "step": 1032, "tokens_trained": 0.507261224 }, { "epoch": 0.29331253102616833, "grad_norm": 8.754459381103516, "loss": 4.3447, "lr": 0.0009254545454545454, "step": 1034, "tokens_trained": 0.508243288 }, { "epoch": 0.29387986667612226, "grad_norm": 32.17164611816406, "loss": 4.4174, "lr": 0.0009251748251748252, "step": 1036, "tokens_trained": 0.509224176 }, { "epoch": 0.2944472023260762, "grad_norm": 41.17238235473633, "loss": 4.4221, "lr": 0.0009248951048951049, "step": 1038, "tokens_trained": 0.510203568 }, { "epoch": 0.29501453797603006, "grad_norm": 44.97213363647461, "loss": 4.3594, "lr": 0.0009246153846153846, "step": 1040, "tokens_trained": 0.511186464 }, { "epoch": 0.295581873625984, "grad_norm": 42.23421859741211, "loss": 4.4159, "lr": 0.0009243356643356644, "step": 1042, "tokens_trained": 0.51216944 }, { "epoch": 0.29614920927593785, "grad_norm": 36.13594436645508, "loss": 4.4105, "lr": 0.0009240559440559441, "step": 1044, "tokens_trained": 0.513153144 }, { "epoch": 0.2967165449258918, "grad_norm": 36.89309310913086, "loss": 4.3947, "lr": 0.0009237762237762239, "step": 1046, "tokens_trained": 0.51413388 }, { "epoch": 0.2972838805758457, "grad_norm": 58.599700927734375, "loss": 4.3988, "lr": 0.0009234965034965035, "step": 1048, "tokens_trained": 0.515119288 }, { "epoch": 0.2978512162257996, "grad_norm": 13.725994110107422, "loss": 4.412, "lr": 0.0009232167832167832, "step": 1050, "tokens_trained": 0.51610284 }, { "epoch": 0.2984185518757535, "grad_norm": 105.28518676757812, "loss": 4.7305, "lr": 0.0009229370629370629, "step": 1052, "tokens_trained": 0.517085576 }, { "epoch": 0.2989858875257074, "grad_norm": 29.499713897705078, "loss": 4.5106, "lr": 0.0009226573426573427, "step": 1054, "tokens_trained": 0.518064224 }, { "epoch": 0.2995532231756613, "grad_norm": 60.907203674316406, "loss": 4.5249, "lr": 0.0009223776223776224, "step": 1056, "tokens_trained": 0.51905084 }, { "epoch": 0.3001205588256152, "grad_norm": 39.825069427490234, "loss": 4.3695, "lr": 0.0009220979020979021, "step": 1058, "tokens_trained": 0.5200318 }, { "epoch": 0.3006878944755691, "grad_norm": 42.77061462402344, "loss": 4.4094, "lr": 0.0009218181818181819, "step": 1060, "tokens_trained": 0.521013568 }, { "epoch": 0.301255230125523, "grad_norm": 37.05888748168945, "loss": 4.3684, "lr": 0.0009215384615384616, "step": 1062, "tokens_trained": 0.521997624 }, { "epoch": 0.3018225657754769, "grad_norm": 42.28252029418945, "loss": 4.3489, "lr": 0.0009212587412587413, "step": 1064, "tokens_trained": 0.522986184 }, { "epoch": 0.3023899014254308, "grad_norm": 40.95197677612305, "loss": 4.3564, "lr": 0.000920979020979021, "step": 1066, "tokens_trained": 0.523970984 }, { "epoch": 0.30295723707538474, "grad_norm": 25.469568252563477, "loss": 4.3833, "lr": 0.0009206993006993007, "step": 1068, "tokens_trained": 0.524952808 }, { "epoch": 0.3035245727253386, "grad_norm": 29.921735763549805, "loss": 4.3579, "lr": 0.0009204195804195804, "step": 1070, "tokens_trained": 0.525935696 }, { "epoch": 0.30409190837529254, "grad_norm": 26.038026809692383, "loss": 4.2898, "lr": 0.0009201398601398602, "step": 1072, "tokens_trained": 0.526916904 }, { "epoch": 0.3046592440252464, "grad_norm": 32.59503936767578, "loss": 4.3335, "lr": 0.0009198601398601398, "step": 1074, "tokens_trained": 0.527899864 }, { "epoch": 0.30522657967520034, "grad_norm": 14.04964828491211, "loss": 4.3171, "lr": 0.0009195804195804196, "step": 1076, "tokens_trained": 0.528878176 }, { "epoch": 0.30579391532515426, "grad_norm": 15.936906814575195, "loss": 4.3005, "lr": 0.0009193006993006993, "step": 1078, "tokens_trained": 0.529859952 }, { "epoch": 0.30636125097510813, "grad_norm": 9.73235034942627, "loss": 4.3287, "lr": 0.0009190209790209791, "step": 1080, "tokens_trained": 0.530838192 }, { "epoch": 0.30692858662506206, "grad_norm": 45.44027328491211, "loss": 4.4384, "lr": 0.0009187412587412588, "step": 1082, "tokens_trained": 0.531818376 }, { "epoch": 0.30749592227501593, "grad_norm": 55.65925598144531, "loss": 4.3772, "lr": 0.0009184615384615385, "step": 1084, "tokens_trained": 0.532802048 }, { "epoch": 0.30806325792496986, "grad_norm": 33.47093200683594, "loss": 4.4257, "lr": 0.0009181818181818182, "step": 1086, "tokens_trained": 0.533785376 }, { "epoch": 0.3086305935749238, "grad_norm": 39.709224700927734, "loss": 4.4177, "lr": 0.0009179020979020978, "step": 1088, "tokens_trained": 0.5347698 }, { "epoch": 0.30919792922487765, "grad_norm": 34.25212097167969, "loss": 4.3518, "lr": 0.0009176223776223777, "step": 1090, "tokens_trained": 0.53575108 }, { "epoch": 0.3097652648748316, "grad_norm": 29.156312942504883, "loss": 4.3596, "lr": 0.0009173426573426573, "step": 1092, "tokens_trained": 0.536735544 }, { "epoch": 0.31033260052478545, "grad_norm": 31.714128494262695, "loss": 4.3736, "lr": 0.0009170629370629371, "step": 1094, "tokens_trained": 0.537718008 }, { "epoch": 0.3108999361747394, "grad_norm": 12.244729042053223, "loss": 4.3472, "lr": 0.0009167832167832168, "step": 1096, "tokens_trained": 0.538693512 }, { "epoch": 0.3114672718246933, "grad_norm": 10.271063804626465, "loss": 4.301, "lr": 0.0009165034965034966, "step": 1098, "tokens_trained": 0.539681376 }, { "epoch": 0.3120346074746472, "grad_norm": 35.79754638671875, "loss": 4.3912, "lr": 0.0009162237762237763, "step": 1100, "tokens_trained": 0.540661392 }, { "epoch": 0.3126019431246011, "grad_norm": 24.1260986328125, "loss": 4.3303, "lr": 0.0009159440559440559, "step": 1102, "tokens_trained": 0.541646968 }, { "epoch": 0.31316927877455497, "grad_norm": 24.501169204711914, "loss": 4.3205, "lr": 0.0009156643356643357, "step": 1104, "tokens_trained": 0.542629392 }, { "epoch": 0.3137366144245089, "grad_norm": 17.031600952148438, "loss": 4.2521, "lr": 0.0009153846153846153, "step": 1106, "tokens_trained": 0.54361348 }, { "epoch": 0.3143039500744628, "grad_norm": 19.506216049194336, "loss": 4.3225, "lr": 0.0009151048951048952, "step": 1108, "tokens_trained": 0.544595336 }, { "epoch": 0.3148712857244167, "grad_norm": 20.822546005249023, "loss": 4.2711, "lr": 0.0009148251748251748, "step": 1110, "tokens_trained": 0.545578256 }, { "epoch": 0.3154386213743706, "grad_norm": 29.967998504638672, "loss": 4.2868, "lr": 0.0009145454545454546, "step": 1112, "tokens_trained": 0.546561024 }, { "epoch": 0.3160059570243245, "grad_norm": 24.06121063232422, "loss": 4.2701, "lr": 0.0009142657342657343, "step": 1114, "tokens_trained": 0.547544616 }, { "epoch": 0.3165732926742784, "grad_norm": 15.868765830993652, "loss": 4.3233, "lr": 0.000913986013986014, "step": 1116, "tokens_trained": 0.548526216 }, { "epoch": 0.31714062832423234, "grad_norm": 27.47897720336914, "loss": 4.2813, "lr": 0.0009137062937062938, "step": 1118, "tokens_trained": 0.549506544 }, { "epoch": 0.3177079639741862, "grad_norm": 15.343204498291016, "loss": 4.3002, "lr": 0.0009134265734265734, "step": 1120, "tokens_trained": 0.550488496 }, { "epoch": 0.31827529962414014, "grad_norm": 4.320124626159668, "loss": 4.2622, "lr": 0.0009131468531468532, "step": 1122, "tokens_trained": 0.551471792 }, { "epoch": 0.318842635274094, "grad_norm": 34.520050048828125, "loss": 4.366, "lr": 0.0009128671328671328, "step": 1124, "tokens_trained": 0.552457008 }, { "epoch": 0.319126303099071, "eval_loss": 1.096465826034546, "eval_runtime": 20.7643, "step": 1125, "tokens_trained": 0.552948064 }, { "epoch": 0.31940997092404794, "grad_norm": 39.718719482421875, "loss": 4.3317, "lr": 0.0009125874125874127, "step": 1126, "tokens_trained": 0.5534394 }, { "epoch": 0.31997730657400186, "grad_norm": 20.843252182006836, "loss": 4.3883, "lr": 0.0009123076923076923, "step": 1128, "tokens_trained": 0.554419184 }, { "epoch": 0.32054464222395573, "grad_norm": 12.916360855102539, "loss": 4.3119, "lr": 0.000912027972027972, "step": 1130, "tokens_trained": 0.555401952 }, { "epoch": 0.32111197787390966, "grad_norm": 48.54426956176758, "loss": 4.4155, "lr": 0.0009117482517482518, "step": 1132, "tokens_trained": 0.556385024 }, { "epoch": 0.32167931352386353, "grad_norm": 41.00883483886719, "loss": 4.362, "lr": 0.0009114685314685315, "step": 1134, "tokens_trained": 0.557368472 }, { "epoch": 0.32224664917381746, "grad_norm": 28.0487060546875, "loss": 4.3504, "lr": 0.0009111888111888113, "step": 1136, "tokens_trained": 0.55835288 }, { "epoch": 0.3228139848237714, "grad_norm": 22.05229377746582, "loss": 4.331, "lr": 0.0009109090909090909, "step": 1138, "tokens_trained": 0.559337064 }, { "epoch": 0.32338132047372525, "grad_norm": 16.770631790161133, "loss": 4.3008, "lr": 0.0009106293706293707, "step": 1140, "tokens_trained": 0.560317984 }, { "epoch": 0.3239486561236792, "grad_norm": 35.300262451171875, "loss": 4.4083, "lr": 0.0009103496503496503, "step": 1142, "tokens_trained": 0.561299688 }, { "epoch": 0.32451599177363305, "grad_norm": 23.788284301757812, "loss": 4.2772, "lr": 0.0009100699300699301, "step": 1144, "tokens_trained": 0.562285664 }, { "epoch": 0.325083327423587, "grad_norm": 23.085710525512695, "loss": 4.3185, "lr": 0.0009097902097902098, "step": 1146, "tokens_trained": 0.563267832 }, { "epoch": 0.3256506630735409, "grad_norm": 13.11314582824707, "loss": 4.2711, "lr": 0.0009095104895104895, "step": 1148, "tokens_trained": 0.564248928 }, { "epoch": 0.3262179987234948, "grad_norm": 31.297805786132812, "loss": 4.3096, "lr": 0.0009092307692307692, "step": 1150, "tokens_trained": 0.56522952 }, { "epoch": 0.3267853343734487, "grad_norm": 11.668539047241211, "loss": 4.2667, "lr": 0.000908951048951049, "step": 1152, "tokens_trained": 0.566212392 }, { "epoch": 0.32735267002340257, "grad_norm": 23.359189987182617, "loss": 4.3156, "lr": 0.0009086713286713288, "step": 1154, "tokens_trained": 0.567192216 }, { "epoch": 0.3279200056733565, "grad_norm": 31.09916114807129, "loss": 4.3367, "lr": 0.0009083916083916084, "step": 1156, "tokens_trained": 0.568177088 }, { "epoch": 0.3284873413233104, "grad_norm": 24.03261947631836, "loss": 4.3504, "lr": 0.0009081118881118881, "step": 1158, "tokens_trained": 0.56915868 }, { "epoch": 0.3290546769732643, "grad_norm": 16.029443740844727, "loss": 4.3192, "lr": 0.0009078321678321678, "step": 1160, "tokens_trained": 0.570142976 }, { "epoch": 0.3296220126232182, "grad_norm": 53.486724853515625, "loss": 4.3921, "lr": 0.0009075524475524476, "step": 1162, "tokens_trained": 0.57112748 }, { "epoch": 0.3301893482731721, "grad_norm": 37.42267608642578, "loss": 4.2821, "lr": 0.0009072727272727273, "step": 1164, "tokens_trained": 0.57211356 }, { "epoch": 0.330756683923126, "grad_norm": 28.862472534179688, "loss": 4.3002, "lr": 0.000906993006993007, "step": 1166, "tokens_trained": 0.57309492 }, { "epoch": 0.33132401957307994, "grad_norm": 22.26299476623535, "loss": 4.2729, "lr": 0.0009067132867132866, "step": 1168, "tokens_trained": 0.5740806 }, { "epoch": 0.3318913552230338, "grad_norm": 21.635013580322266, "loss": 4.2866, "lr": 0.0009064335664335665, "step": 1170, "tokens_trained": 0.575061664 }, { "epoch": 0.33245869087298774, "grad_norm": 18.995012283325195, "loss": 4.2814, "lr": 0.0009061538461538462, "step": 1172, "tokens_trained": 0.576046304 }, { "epoch": 0.3330260265229416, "grad_norm": 22.621299743652344, "loss": 4.2739, "lr": 0.0009058741258741259, "step": 1174, "tokens_trained": 0.577032376 }, { "epoch": 0.33359336217289554, "grad_norm": 21.758216857910156, "loss": 4.263, "lr": 0.0009055944055944056, "step": 1176, "tokens_trained": 0.578013896 }, { "epoch": 0.33416069782284946, "grad_norm": 32.38374710083008, "loss": 4.2713, "lr": 0.0009053146853146853, "step": 1178, "tokens_trained": 0.57900508 }, { "epoch": 0.33472803347280333, "grad_norm": 35.57462692260742, "loss": 4.2986, "lr": 0.0009050349650349651, "step": 1180, "tokens_trained": 0.57999512 }, { "epoch": 0.33529536912275726, "grad_norm": 11.77812385559082, "loss": 4.3085, "lr": 0.0009047552447552448, "step": 1182, "tokens_trained": 0.580982752 }, { "epoch": 0.33586270477271113, "grad_norm": 51.48725509643555, "loss": 4.4003, "lr": 0.0009044755244755245, "step": 1184, "tokens_trained": 0.581964936 }, { "epoch": 0.33643004042266506, "grad_norm": 47.01481628417969, "loss": 4.3182, "lr": 0.0009041958041958041, "step": 1186, "tokens_trained": 0.582949944 }, { "epoch": 0.336997376072619, "grad_norm": 22.935691833496094, "loss": 4.3432, "lr": 0.000903916083916084, "step": 1188, "tokens_trained": 0.583934776 }, { "epoch": 0.33756471172257285, "grad_norm": 45.21054458618164, "loss": 4.4674, "lr": 0.0009036363636363637, "step": 1190, "tokens_trained": 0.584918344 }, { "epoch": 0.3381320473725268, "grad_norm": 27.012706756591797, "loss": 4.2889, "lr": 0.0009033566433566434, "step": 1192, "tokens_trained": 0.585897632 }, { "epoch": 0.33869938302248065, "grad_norm": 16.68247413635254, "loss": 4.2896, "lr": 0.0009030769230769231, "step": 1194, "tokens_trained": 0.586879408 }, { "epoch": 0.3392667186724346, "grad_norm": 20.664148330688477, "loss": 4.304, "lr": 0.0009027972027972027, "step": 1196, "tokens_trained": 0.587859392 }, { "epoch": 0.3398340543223885, "grad_norm": 22.954742431640625, "loss": 4.2853, "lr": 0.0009025174825174826, "step": 1198, "tokens_trained": 0.588845408 }, { "epoch": 0.34040138997234237, "grad_norm": 23.226943969726562, "loss": 4.2597, "lr": 0.0009022377622377622, "step": 1200, "tokens_trained": 0.589832736 }, { "epoch": 0.3409687256222963, "grad_norm": 7.963059902191162, "loss": 4.261, "lr": 0.000901958041958042, "step": 1202, "tokens_trained": 0.590816568 }, { "epoch": 0.34153606127225017, "grad_norm": 25.160730361938477, "loss": 4.3288, "lr": 0.0009016783216783216, "step": 1204, "tokens_trained": 0.59179692 }, { "epoch": 0.3421033969222041, "grad_norm": 38.45030212402344, "loss": 4.3371, "lr": 0.0009013986013986014, "step": 1206, "tokens_trained": 0.592780968 }, { "epoch": 0.342670732572158, "grad_norm": 52.66873550415039, "loss": 4.2805, "lr": 0.0009011188811188812, "step": 1208, "tokens_trained": 0.593760896 }, { "epoch": 0.3432380682221119, "grad_norm": 28.104921340942383, "loss": 4.3885, "lr": 0.0009008391608391609, "step": 1210, "tokens_trained": 0.59474304 }, { "epoch": 0.3438054038720658, "grad_norm": 49.20989990234375, "loss": 4.346, "lr": 0.0009005594405594406, "step": 1212, "tokens_trained": 0.59572768 }, { "epoch": 0.3443727395220197, "grad_norm": 20.652427673339844, "loss": 4.2368, "lr": 0.0009002797202797202, "step": 1214, "tokens_trained": 0.59671092 }, { "epoch": 0.3449400751719736, "grad_norm": 17.821596145629883, "loss": 4.3041, "lr": 0.0009000000000000001, "step": 1216, "tokens_trained": 0.597697344 }, { "epoch": 0.34550741082192754, "grad_norm": 48.594932556152344, "loss": 4.3668, "lr": 0.0008997202797202797, "step": 1218, "tokens_trained": 0.598677288 }, { "epoch": 0.3460747464718814, "grad_norm": 27.70078468322754, "loss": 4.2939, "lr": 0.0008994405594405595, "step": 1220, "tokens_trained": 0.599662488 }, { "epoch": 0.34664208212183534, "grad_norm": 25.498798370361328, "loss": 4.2891, "lr": 0.0008991608391608391, "step": 1222, "tokens_trained": 0.600646904 }, { "epoch": 0.3472094177717892, "grad_norm": 13.455835342407227, "loss": 4.2881, "lr": 0.0008988811188811188, "step": 1224, "tokens_trained": 0.601628112 }, { "epoch": 0.34777675342174313, "grad_norm": 17.518342971801758, "loss": 4.2977, "lr": 0.0008986013986013987, "step": 1226, "tokens_trained": 0.602612336 }, { "epoch": 0.34834408907169706, "grad_norm": 20.642597198486328, "loss": 4.2921, "lr": 0.0008983216783216783, "step": 1228, "tokens_trained": 0.603595 }, { "epoch": 0.34891142472165093, "grad_norm": 14.464616775512695, "loss": 4.233, "lr": 0.0008980419580419581, "step": 1230, "tokens_trained": 0.604576592 }, { "epoch": 0.34947876037160486, "grad_norm": 13.204504013061523, "loss": 4.2707, "lr": 0.0008977622377622377, "step": 1232, "tokens_trained": 0.60555656 }, { "epoch": 0.35004609602155873, "grad_norm": 12.241665840148926, "loss": 4.2506, "lr": 0.0008974825174825176, "step": 1234, "tokens_trained": 0.606536024 }, { "epoch": 0.35061343167151265, "grad_norm": 18.187660217285156, "loss": 4.2659, "lr": 0.0008972027972027972, "step": 1236, "tokens_trained": 0.607522576 }, { "epoch": 0.3511807673214666, "grad_norm": 8.911888122558594, "loss": 4.2505, "lr": 0.000896923076923077, "step": 1238, "tokens_trained": 0.608507736 }, { "epoch": 0.35174810297142045, "grad_norm": 21.351713180541992, "loss": 4.2291, "lr": 0.0008966433566433566, "step": 1240, "tokens_trained": 0.609486688 }, { "epoch": 0.3523154386213744, "grad_norm": 47.81566619873047, "loss": 4.2725, "lr": 0.0008963636363636363, "step": 1242, "tokens_trained": 0.610470272 }, { "epoch": 0.35288277427132825, "grad_norm": 33.53351974487305, "loss": 4.3237, "lr": 0.0008960839160839162, "step": 1244, "tokens_trained": 0.611455176 }, { "epoch": 0.3534501099212822, "grad_norm": 15.252607345581055, "loss": 4.2868, "lr": 0.0008958041958041958, "step": 1246, "tokens_trained": 0.612437888 }, { "epoch": 0.3540174455712361, "grad_norm": 24.129865646362305, "loss": 4.2626, "lr": 0.0008955244755244756, "step": 1248, "tokens_trained": 0.613420728 }, { "epoch": 0.35458478122118997, "grad_norm": 34.814605712890625, "loss": 4.2627, "lr": 0.0008952447552447552, "step": 1250, "tokens_trained": 0.614405904 }, { "epoch": 0.35458478122118997, "eval_loss": 1.078355312347412, "eval_runtime": 20.4723, "step": 1250, "tokens_trained": 0.614405904 }, { "epoch": 0.3551521168711439, "grad_norm": 18.26809310913086, "loss": 4.2986, "lr": 0.000894965034965035, "step": 1252, "tokens_trained": 0.615386288 }, { "epoch": 0.35571945252109777, "grad_norm": 24.68335723876953, "loss": 4.3146, "lr": 0.0008946853146853147, "step": 1254, "tokens_trained": 0.616370576 }, { "epoch": 0.3562867881710517, "grad_norm": 35.34586715698242, "loss": 4.2905, "lr": 0.0008944055944055944, "step": 1256, "tokens_trained": 0.617351944 }, { "epoch": 0.3568541238210056, "grad_norm": 22.668407440185547, "loss": 4.2607, "lr": 0.0008941258741258741, "step": 1258, "tokens_trained": 0.618334816 }, { "epoch": 0.3574214594709595, "grad_norm": 14.068164825439453, "loss": 4.2459, "lr": 0.0008938461538461538, "step": 1260, "tokens_trained": 0.619319736 }, { "epoch": 0.3579887951209134, "grad_norm": 8.274995803833008, "loss": 4.2713, "lr": 0.0008935664335664337, "step": 1262, "tokens_trained": 0.620299344 }, { "epoch": 0.3585561307708673, "grad_norm": 22.12897491455078, "loss": 4.2841, "lr": 0.0008932867132867133, "step": 1264, "tokens_trained": 0.621282592 }, { "epoch": 0.3591234664208212, "grad_norm": 26.171052932739258, "loss": 4.2505, "lr": 0.000893006993006993, "step": 1266, "tokens_trained": 0.622266136 }, { "epoch": 0.35969080207077514, "grad_norm": 14.768603324890137, "loss": 4.271, "lr": 0.0008927272727272727, "step": 1268, "tokens_trained": 0.623247816 }, { "epoch": 0.360258137720729, "grad_norm": 13.065408706665039, "loss": 4.2387, "lr": 0.0008924475524475525, "step": 1270, "tokens_trained": 0.624234848 }, { "epoch": 0.36082547337068294, "grad_norm": 14.043888092041016, "loss": 4.2601, "lr": 0.0008921678321678322, "step": 1272, "tokens_trained": 0.625214176 }, { "epoch": 0.3613928090206368, "grad_norm": 13.734328269958496, "loss": 4.2426, "lr": 0.0008918881118881119, "step": 1274, "tokens_trained": 0.626197608 }, { "epoch": 0.36196014467059073, "grad_norm": 10.075374603271484, "loss": 4.2259, "lr": 0.0008916083916083916, "step": 1276, "tokens_trained": 0.62717884 }, { "epoch": 0.36252748032054466, "grad_norm": 33.92001724243164, "loss": 4.3054, "lr": 0.0008913286713286713, "step": 1278, "tokens_trained": 0.628166888 }, { "epoch": 0.36309481597049853, "grad_norm": 31.1391544342041, "loss": 4.3066, "lr": 0.0008910489510489512, "step": 1280, "tokens_trained": 0.629152528 }, { "epoch": 0.36366215162045246, "grad_norm": 10.888711929321289, "loss": 4.2348, "lr": 0.0008907692307692308, "step": 1282, "tokens_trained": 0.630132584 }, { "epoch": 0.3642294872704063, "grad_norm": 27.298410415649414, "loss": 4.3225, "lr": 0.0008904895104895105, "step": 1284, "tokens_trained": 0.63111212 }, { "epoch": 0.36479682292036025, "grad_norm": 23.396818161010742, "loss": 4.3177, "lr": 0.0008902097902097902, "step": 1286, "tokens_trained": 0.632094984 }, { "epoch": 0.3653641585703142, "grad_norm": 18.824432373046875, "loss": 4.2235, "lr": 0.00088993006993007, "step": 1288, "tokens_trained": 0.633076832 }, { "epoch": 0.36593149422026805, "grad_norm": 8.04826545715332, "loss": 4.2268, "lr": 0.0008896503496503497, "step": 1290, "tokens_trained": 0.63405868 }, { "epoch": 0.366498829870222, "grad_norm": 32.26673889160156, "loss": 4.3113, "lr": 0.0008893706293706294, "step": 1292, "tokens_trained": 0.635045096 }, { "epoch": 0.36706616552017585, "grad_norm": 29.91358184814453, "loss": 4.2971, "lr": 0.000889090909090909, "step": 1294, "tokens_trained": 0.63603008 }, { "epoch": 0.3676335011701298, "grad_norm": 12.093538284301758, "loss": 4.2502, "lr": 0.0008888111888111888, "step": 1296, "tokens_trained": 0.637014016 }, { "epoch": 0.3682008368200837, "grad_norm": 8.252509117126465, "loss": 4.2905, "lr": 0.0008885314685314686, "step": 1298, "tokens_trained": 0.637997752 }, { "epoch": 0.36876817247003757, "grad_norm": 61.22240447998047, "loss": 4.4753, "lr": 0.0008882517482517483, "step": 1300, "tokens_trained": 0.638981552 }, { "epoch": 0.3693355081199915, "grad_norm": 47.58195877075195, "loss": 4.2769, "lr": 0.000887972027972028, "step": 1302, "tokens_trained": 0.639963512 }, { "epoch": 0.36990284376994537, "grad_norm": 28.806411743164062, "loss": 4.3728, "lr": 0.0008876923076923077, "step": 1304, "tokens_trained": 0.640948392 }, { "epoch": 0.3704701794198993, "grad_norm": 38.960853576660156, "loss": 4.338, "lr": 0.0008874125874125875, "step": 1306, "tokens_trained": 0.641935304 }, { "epoch": 0.3710375150698532, "grad_norm": 25.05726432800293, "loss": 4.3002, "lr": 0.0008871328671328671, "step": 1308, "tokens_trained": 0.642924168 }, { "epoch": 0.3716048507198071, "grad_norm": 39.84127426147461, "loss": 4.3593, "lr": 0.0008868531468531469, "step": 1310, "tokens_trained": 0.64390412 }, { "epoch": 0.372172186369761, "grad_norm": 15.03055191040039, "loss": 4.223, "lr": 0.0008865734265734265, "step": 1312, "tokens_trained": 0.644882104 }, { "epoch": 0.3727395220197149, "grad_norm": 41.85628890991211, "loss": 4.3819, "lr": 0.0008862937062937063, "step": 1314, "tokens_trained": 0.645866912 }, { "epoch": 0.3733068576696688, "grad_norm": 29.014118194580078, "loss": 4.2843, "lr": 0.0008860139860139861, "step": 1316, "tokens_trained": 0.646850376 }, { "epoch": 0.37387419331962274, "grad_norm": 24.407743453979492, "loss": 4.2598, "lr": 0.0008857342657342658, "step": 1318, "tokens_trained": 0.647832272 }, { "epoch": 0.3744415289695766, "grad_norm": 23.28154182434082, "loss": 4.2162, "lr": 0.0008854545454545455, "step": 1320, "tokens_trained": 0.64881652 }, { "epoch": 0.37500886461953054, "grad_norm": 17.70418930053711, "loss": 4.2386, "lr": 0.0008851748251748251, "step": 1322, "tokens_trained": 0.649794936 }, { "epoch": 0.37557620026948446, "grad_norm": 22.582124710083008, "loss": 4.2358, "lr": 0.000884895104895105, "step": 1324, "tokens_trained": 0.650777784 }, { "epoch": 0.37614353591943833, "grad_norm": 16.77848243713379, "loss": 4.2536, "lr": 0.0008846153846153846, "step": 1326, "tokens_trained": 0.651762472 }, { "epoch": 0.37671087156939226, "grad_norm": 14.382417678833008, "loss": 4.2403, "lr": 0.0008843356643356644, "step": 1328, "tokens_trained": 0.652741832 }, { "epoch": 0.37727820721934613, "grad_norm": 22.420886993408203, "loss": 4.1977, "lr": 0.000884055944055944, "step": 1330, "tokens_trained": 0.653725792 }, { "epoch": 0.37784554286930006, "grad_norm": 9.768660545349121, "loss": 4.2148, "lr": 0.0008837762237762238, "step": 1332, "tokens_trained": 0.654704648 }, { "epoch": 0.378412878519254, "grad_norm": 5.091487407684326, "loss": 4.2062, "lr": 0.0008834965034965036, "step": 1334, "tokens_trained": 0.65569176 }, { "epoch": 0.37898021416920785, "grad_norm": 53.520957946777344, "loss": 4.4082, "lr": 0.0008832167832167832, "step": 1336, "tokens_trained": 0.656679344 }, { "epoch": 0.3795475498191618, "grad_norm": 32.17420959472656, "loss": 4.2911, "lr": 0.000882937062937063, "step": 1338, "tokens_trained": 0.657665136 }, { "epoch": 0.38011488546911565, "grad_norm": 14.12790584564209, "loss": 4.2899, "lr": 0.0008826573426573426, "step": 1340, "tokens_trained": 0.658651576 }, { "epoch": 0.3806822211190696, "grad_norm": 51.74199676513672, "loss": 4.3901, "lr": 0.0008823776223776225, "step": 1342, "tokens_trained": 0.659631792 }, { "epoch": 0.3812495567690235, "grad_norm": 48.99909973144531, "loss": 4.298, "lr": 0.0008820979020979021, "step": 1344, "tokens_trained": 0.660616912 }, { "epoch": 0.38181689241897737, "grad_norm": 28.356245040893555, "loss": 4.3171, "lr": 0.0008818181818181819, "step": 1346, "tokens_trained": 0.66159872 }, { "epoch": 0.3823842280689313, "grad_norm": 45.081703186035156, "loss": 4.3067, "lr": 0.0008815384615384615, "step": 1348, "tokens_trained": 0.662582152 }, { "epoch": 0.38295156371888517, "grad_norm": 37.175052642822266, "loss": 4.241, "lr": 0.0008812587412587412, "step": 1350, "tokens_trained": 0.663561176 }, { "epoch": 0.3835188993688391, "grad_norm": 49.46076965332031, "loss": 4.2896, "lr": 0.0008809790209790211, "step": 1352, "tokens_trained": 0.664545144 }, { "epoch": 0.384086235018793, "grad_norm": 22.20182991027832, "loss": 4.323, "lr": 0.0008806993006993007, "step": 1354, "tokens_trained": 0.66553092 }, { "epoch": 0.3846535706687469, "grad_norm": 34.111549377441406, "loss": 4.3138, "lr": 0.0008804195804195805, "step": 1356, "tokens_trained": 0.666517568 }, { "epoch": 0.3852209063187008, "grad_norm": 47.01582336425781, "loss": 4.3009, "lr": 0.0008801398601398601, "step": 1358, "tokens_trained": 0.667498192 }, { "epoch": 0.3857882419686547, "grad_norm": 18.845388412475586, "loss": 4.3176, "lr": 0.00087986013986014, "step": 1360, "tokens_trained": 0.668479008 }, { "epoch": 0.3863555776186086, "grad_norm": 53.68927764892578, "loss": 4.4024, "lr": 0.0008795804195804196, "step": 1362, "tokens_trained": 0.669462472 }, { "epoch": 0.38692291326856254, "grad_norm": 29.88358497619629, "loss": 4.286, "lr": 0.0008793006993006993, "step": 1364, "tokens_trained": 0.67044392 }, { "epoch": 0.3874902489185164, "grad_norm": 11.12879753112793, "loss": 4.3024, "lr": 0.000879020979020979, "step": 1366, "tokens_trained": 0.671424552 }, { "epoch": 0.38805758456847034, "grad_norm": 23.573301315307617, "loss": 4.2662, "lr": 0.0008787412587412587, "step": 1368, "tokens_trained": 0.672409992 }, { "epoch": 0.3886249202184242, "grad_norm": 24.749160766601562, "loss": 4.274, "lr": 0.0008784615384615386, "step": 1370, "tokens_trained": 0.67339824 }, { "epoch": 0.38919225586837813, "grad_norm": 33.26881408691406, "loss": 4.2588, "lr": 0.0008781818181818182, "step": 1372, "tokens_trained": 0.67438204 }, { "epoch": 0.38975959151833206, "grad_norm": 24.466472625732422, "loss": 4.2837, "lr": 0.000877902097902098, "step": 1374, "tokens_trained": 0.67536356 }, { "epoch": 0.39004325934330897, "eval_loss": 1.0616238117218018, "eval_runtime": 20.3698, "step": 1375, "tokens_trained": 0.675855672 }, { "epoch": 0.39032692716828593, "grad_norm": 24.48844337463379, "loss": 4.259, "lr": 0.0008776223776223776, "step": 1376, "tokens_trained": 0.676346368 }, { "epoch": 0.39089426281823986, "grad_norm": 30.594989776611328, "loss": 4.1894, "lr": 0.0008773426573426574, "step": 1378, "tokens_trained": 0.677329312 }, { "epoch": 0.3914615984681937, "grad_norm": 19.835350036621094, "loss": 4.2718, "lr": 0.0008770629370629371, "step": 1380, "tokens_trained": 0.678312272 }, { "epoch": 0.39202893411814765, "grad_norm": 14.570358276367188, "loss": 4.2419, "lr": 0.0008767832167832168, "step": 1382, "tokens_trained": 0.679291216 }, { "epoch": 0.3925962697681016, "grad_norm": 11.608271598815918, "loss": 4.1917, "lr": 0.0008765034965034965, "step": 1384, "tokens_trained": 0.680273296 }, { "epoch": 0.39316360541805545, "grad_norm": 26.094860076904297, "loss": 4.2762, "lr": 0.0008762237762237762, "step": 1386, "tokens_trained": 0.681249464 }, { "epoch": 0.3937309410680094, "grad_norm": 12.754049301147461, "loss": 4.2032, "lr": 0.0008759440559440561, "step": 1388, "tokens_trained": 0.682234168 }, { "epoch": 0.39429827671796325, "grad_norm": 5.951663970947266, "loss": 4.1921, "lr": 0.0008756643356643357, "step": 1390, "tokens_trained": 0.683217176 }, { "epoch": 0.3948656123679172, "grad_norm": 26.907669067382812, "loss": 4.24, "lr": 0.0008753846153846154, "step": 1392, "tokens_trained": 0.68419888 }, { "epoch": 0.3954329480178711, "grad_norm": 25.04796600341797, "loss": 4.2656, "lr": 0.0008751048951048951, "step": 1394, "tokens_trained": 0.685178784 }, { "epoch": 0.39600028366782497, "grad_norm": 19.600811004638672, "loss": 4.2683, "lr": 0.0008748251748251749, "step": 1396, "tokens_trained": 0.686161632 }, { "epoch": 0.3965676193177789, "grad_norm": 14.087088584899902, "loss": 4.2658, "lr": 0.0008745454545454546, "step": 1398, "tokens_trained": 0.687139992 }, { "epoch": 0.39713495496773277, "grad_norm": 9.257765769958496, "loss": 4.2021, "lr": 0.0008742657342657343, "step": 1400, "tokens_trained": 0.688117912 }, { "epoch": 0.3977022906176867, "grad_norm": 18.830154418945312, "loss": 4.2249, "lr": 0.0008739860139860139, "step": 1402, "tokens_trained": 0.689098776 }, { "epoch": 0.3982696262676406, "grad_norm": 24.81566619873047, "loss": 4.246, "lr": 0.0008737062937062937, "step": 1404, "tokens_trained": 0.690085432 }, { "epoch": 0.3988369619175945, "grad_norm": 14.071616172790527, "loss": 4.2531, "lr": 0.0008734265734265734, "step": 1406, "tokens_trained": 0.691069232 }, { "epoch": 0.3994042975675484, "grad_norm": 21.414424896240234, "loss": 4.2192, "lr": 0.0008731468531468532, "step": 1408, "tokens_trained": 0.692051224 }, { "epoch": 0.3999716332175023, "grad_norm": 38.74683380126953, "loss": 4.2421, "lr": 0.0008728671328671329, "step": 1410, "tokens_trained": 0.693029976 }, { "epoch": 0.4005389688674562, "grad_norm": 12.595442771911621, "loss": 4.2569, "lr": 0.0008725874125874126, "step": 1412, "tokens_trained": 0.694013304 }, { "epoch": 0.40110630451741014, "grad_norm": 55.233673095703125, "loss": 4.3422, "lr": 0.0008723076923076924, "step": 1414, "tokens_trained": 0.694997536 }, { "epoch": 0.401673640167364, "grad_norm": 24.717113494873047, "loss": 4.2567, "lr": 0.000872027972027972, "step": 1416, "tokens_trained": 0.695982632 }, { "epoch": 0.40224097581731794, "grad_norm": 20.552875518798828, "loss": 4.2464, "lr": 0.0008717482517482518, "step": 1418, "tokens_trained": 0.696966408 }, { "epoch": 0.4028083114672718, "grad_norm": 25.569900512695312, "loss": 4.21, "lr": 0.0008714685314685314, "step": 1420, "tokens_trained": 0.697948224 }, { "epoch": 0.40337564711722573, "grad_norm": 24.538320541381836, "loss": 4.2605, "lr": 0.0008711888111888112, "step": 1422, "tokens_trained": 0.698934688 }, { "epoch": 0.40394298276717966, "grad_norm": 9.585651397705078, "loss": 4.2524, "lr": 0.0008709090909090909, "step": 1424, "tokens_trained": 0.699921976 }, { "epoch": 0.40451031841713353, "grad_norm": 11.886672973632812, "loss": 4.1934, "lr": 0.0008706293706293707, "step": 1426, "tokens_trained": 0.70090396 }, { "epoch": 0.40507765406708746, "grad_norm": 26.162124633789062, "loss": 4.2412, "lr": 0.0008703496503496504, "step": 1428, "tokens_trained": 0.701888448 }, { "epoch": 0.4056449897170413, "grad_norm": 5.03931188583374, "loss": 4.202, "lr": 0.00087006993006993, "step": 1430, "tokens_trained": 0.702864336 }, { "epoch": 0.40621232536699525, "grad_norm": 33.67579650878906, "loss": 4.3087, "lr": 0.0008697902097902099, "step": 1432, "tokens_trained": 0.703847784 }, { "epoch": 0.4067796610169492, "grad_norm": 34.38542556762695, "loss": 4.2807, "lr": 0.0008695104895104895, "step": 1434, "tokens_trained": 0.704827288 }, { "epoch": 0.40734699666690305, "grad_norm": 13.319886207580566, "loss": 4.3332, "lr": 0.0008692307692307693, "step": 1436, "tokens_trained": 0.705815392 }, { "epoch": 0.407914332316857, "grad_norm": 36.58311080932617, "loss": 4.3318, "lr": 0.0008689510489510489, "step": 1438, "tokens_trained": 0.7067914 }, { "epoch": 0.40848166796681085, "grad_norm": 29.63648223876953, "loss": 4.2962, "lr": 0.0008686713286713287, "step": 1440, "tokens_trained": 0.70777396 }, { "epoch": 0.4090490036167648, "grad_norm": 9.55128002166748, "loss": 4.2773, "lr": 0.0008683916083916084, "step": 1442, "tokens_trained": 0.708750496 }, { "epoch": 0.4096163392667187, "grad_norm": 53.83981704711914, "loss": 4.3875, "lr": 0.0008681118881118881, "step": 1444, "tokens_trained": 0.709730168 }, { "epoch": 0.41018367491667257, "grad_norm": 54.59236526489258, "loss": 4.3582, "lr": 0.0008678321678321679, "step": 1446, "tokens_trained": 0.710709704 }, { "epoch": 0.4107510105666265, "grad_norm": 13.964411735534668, "loss": 4.3065, "lr": 0.0008675524475524475, "step": 1448, "tokens_trained": 0.711690136 }, { "epoch": 0.41131834621658037, "grad_norm": 25.506649017333984, "loss": 4.2686, "lr": 0.0008672727272727273, "step": 1450, "tokens_trained": 0.712668056 }, { "epoch": 0.4118856818665343, "grad_norm": 21.1628360748291, "loss": 4.2485, "lr": 0.000866993006993007, "step": 1452, "tokens_trained": 0.71365004 }, { "epoch": 0.4124530175164882, "grad_norm": 15.751238822937012, "loss": 4.2078, "lr": 0.0008667132867132868, "step": 1454, "tokens_trained": 0.714632032 }, { "epoch": 0.4130203531664421, "grad_norm": 15.838552474975586, "loss": 4.1944, "lr": 0.0008664335664335664, "step": 1456, "tokens_trained": 0.715611376 }, { "epoch": 0.413587688816396, "grad_norm": 15.968609809875488, "loss": 4.1768, "lr": 0.0008661538461538461, "step": 1458, "tokens_trained": 0.716591112 }, { "epoch": 0.4141550244663499, "grad_norm": 15.419891357421875, "loss": 4.1978, "lr": 0.0008658741258741259, "step": 1460, "tokens_trained": 0.717575952 }, { "epoch": 0.4147223601163038, "grad_norm": 15.088132858276367, "loss": 4.2361, "lr": 0.0008655944055944056, "step": 1462, "tokens_trained": 0.718563696 }, { "epoch": 0.41528969576625774, "grad_norm": 4.839190483093262, "loss": 4.2089, "lr": 0.0008653146853146854, "step": 1464, "tokens_trained": 0.71954848 }, { "epoch": 0.4158570314162116, "grad_norm": 22.192466735839844, "loss": 4.2109, "lr": 0.000865034965034965, "step": 1466, "tokens_trained": 0.720533304 }, { "epoch": 0.41642436706616553, "grad_norm": 28.983531951904297, "loss": 4.2402, "lr": 0.0008647552447552448, "step": 1468, "tokens_trained": 0.721518176 }, { "epoch": 0.4169917027161194, "grad_norm": 21.010780334472656, "loss": 4.1732, "lr": 0.0008644755244755245, "step": 1470, "tokens_trained": 0.72250176 }, { "epoch": 0.41755903836607333, "grad_norm": 14.59277057647705, "loss": 4.1847, "lr": 0.0008641958041958042, "step": 1472, "tokens_trained": 0.723486664 }, { "epoch": 0.41812637401602726, "grad_norm": 13.688531875610352, "loss": 4.1577, "lr": 0.0008639160839160839, "step": 1474, "tokens_trained": 0.724469328 }, { "epoch": 0.41869370966598113, "grad_norm": 15.879347801208496, "loss": 4.1721, "lr": 0.0008636363636363636, "step": 1476, "tokens_trained": 0.725454968 }, { "epoch": 0.41926104531593505, "grad_norm": 10.225201606750488, "loss": 4.1999, "lr": 0.0008633566433566434, "step": 1478, "tokens_trained": 0.7264426 }, { "epoch": 0.4198283809658889, "grad_norm": 17.007728576660156, "loss": 4.2229, "lr": 0.0008630769230769231, "step": 1480, "tokens_trained": 0.727422056 }, { "epoch": 0.42039571661584285, "grad_norm": 13.517934799194336, "loss": 4.2241, "lr": 0.0008627972027972029, "step": 1482, "tokens_trained": 0.728403688 }, { "epoch": 0.4209630522657968, "grad_norm": 17.132064819335938, "loss": 4.1679, "lr": 0.0008625174825174825, "step": 1484, "tokens_trained": 0.729386248 }, { "epoch": 0.42153038791575065, "grad_norm": 19.782320022583008, "loss": 4.1817, "lr": 0.0008622377622377622, "step": 1486, "tokens_trained": 0.730368752 }, { "epoch": 0.4220977235657046, "grad_norm": 3.388552188873291, "loss": 4.1726, "lr": 0.000861958041958042, "step": 1488, "tokens_trained": 0.731354304 }, { "epoch": 0.42266505921565845, "grad_norm": 28.33499526977539, "loss": 4.2623, "lr": 0.0008616783216783217, "step": 1490, "tokens_trained": 0.732337296 }, { "epoch": 0.42323239486561237, "grad_norm": 24.927406311035156, "loss": 4.2422, "lr": 0.0008613986013986014, "step": 1492, "tokens_trained": 0.733319824 }, { "epoch": 0.4237997305155663, "grad_norm": 25.996028900146484, "loss": 4.2227, "lr": 0.0008611188811188811, "step": 1494, "tokens_trained": 0.73430636 }, { "epoch": 0.42436706616552017, "grad_norm": 14.625783920288086, "loss": 4.2268, "lr": 0.0008608391608391609, "step": 1496, "tokens_trained": 0.735285848 }, { "epoch": 0.4249344018154741, "grad_norm": 12.556640625, "loss": 4.2352, "lr": 0.0008605594405594406, "step": 1498, "tokens_trained": 0.736270632 }, { "epoch": 0.42550173746542796, "grad_norm": 18.579416275024414, "loss": 4.2377, "lr": 0.0008602797202797203, "step": 1500, "tokens_trained": 0.737255104 }, { "epoch": 0.42550173746542796, "eval_loss": 1.052606463432312, "eval_runtime": 20.5089, "step": 1500, "tokens_trained": 0.737255104 } ], "logging_steps": 2, "max_steps": 7650, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 750, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }