{ "best_global_step": 750, "best_metric": 1.2186306715011597, "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-750", "epoch": 0.21275086873271398, "eval_steps": 125, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005673356499539039, "grad_norm": 8450.4345703125, "loss": 876.9911, "lr": 2e-06, "step": 2, "tokens_trained": 0.000985992 }, { "epoch": 0.0011346712999078079, "grad_norm": 8980.888671875, "loss": 779.4711, "lr": 6e-06, "step": 4, "tokens_trained": 0.001968088 }, { "epoch": 0.001702006949861712, "grad_norm": 7489.92529296875, "loss": 488.6157, "lr": 1e-05, "step": 6, "tokens_trained": 0.002953808 }, { "epoch": 0.0022693425998156157, "grad_norm": 1952.1917724609375, "loss": 237.0602, "lr": 1.4e-05, "step": 8, "tokens_trained": 0.003935728 }, { "epoch": 0.0028366782497695198, "grad_norm": 1418.443603515625, "loss": 159.0854, "lr": 1.8e-05, "step": 10, "tokens_trained": 0.004916488 }, { "epoch": 0.003404013899723424, "grad_norm": 874.7195434570312, "loss": 91.9563, "lr": 2.2e-05, "step": 12, "tokens_trained": 0.005902792 }, { "epoch": 0.003971349549677328, "grad_norm": 1339.8248291015625, "loss": 40.3366, "lr": 2.6e-05, "step": 14, "tokens_trained": 0.0068856 }, { "epoch": 0.0045386851996312315, "grad_norm": 2936.7607421875, "loss": 22.7436, "lr": 3e-05, "step": 16, "tokens_trained": 0.007868248 }, { "epoch": 0.005106020849585136, "grad_norm": 1531.3807373046875, "loss": 23.4797, "lr": 3.4000000000000007e-05, "step": 18, "tokens_trained": 0.008849296 }, { "epoch": 0.0056733564995390395, "grad_norm": 3027.4189453125, "loss": 38.7379, "lr": 3.8e-05, "step": 20, "tokens_trained": 0.009830984 }, { "epoch": 0.006240692149492944, "grad_norm": 2435.890625, "loss": 26.2427, "lr": 4.2000000000000004e-05, "step": 22, "tokens_trained": 0.01081364 }, { "epoch": 0.006808027799446848, "grad_norm": 3217.990478515625, "loss": 31.0263, "lr": 4.6e-05, "step": 24, "tokens_trained": 0.01179036 }, { "epoch": 0.007375363449400752, "grad_norm": 3854.00634765625, "loss": 33.8781, "lr": 5e-05, "step": 26, "tokens_trained": 0.012774504 }, { "epoch": 0.007942699099354656, "grad_norm": 3197.489990234375, "loss": 27.7927, "lr": 5.4e-05, "step": 28, "tokens_trained": 0.013759992 }, { "epoch": 0.00851003474930856, "grad_norm": 3034.156494140625, "loss": 37.9083, "lr": 5.800000000000001e-05, "step": 30, "tokens_trained": 0.014740536 }, { "epoch": 0.009077370399262463, "grad_norm": 3040.314453125, "loss": 34.0659, "lr": 6.2e-05, "step": 32, "tokens_trained": 0.015725984 }, { "epoch": 0.009644706049216368, "grad_norm": 3065.5791015625, "loss": 27.7768, "lr": 6.6e-05, "step": 34, "tokens_trained": 0.016706864 }, { "epoch": 0.010212041699170272, "grad_norm": 2454.293701171875, "loss": 35.1143, "lr": 7.000000000000001e-05, "step": 36, "tokens_trained": 0.017688816 }, { "epoch": 0.010779377349124175, "grad_norm": 3100.7802734375, "loss": 42.2603, "lr": 7.4e-05, "step": 38, "tokens_trained": 0.018669072 }, { "epoch": 0.011346712999078079, "grad_norm": 2749.84423828125, "loss": 39.3879, "lr": 7.8e-05, "step": 40, "tokens_trained": 0.019652072 }, { "epoch": 0.011914048649031984, "grad_norm": 1519.9908447265625, "loss": 35.0735, "lr": 8.2e-05, "step": 42, "tokens_trained": 0.020633112 }, { "epoch": 0.012481384298985888, "grad_norm": 1474.4244384765625, "loss": 25.8965, "lr": 8.599999999999999e-05, "step": 44, "tokens_trained": 0.021616192 }, { "epoch": 0.013048719948939792, "grad_norm": 2962.500244140625, "loss": 51.0784, "lr": 8.999999999999999e-05, "step": 46, "tokens_trained": 0.022597288 }, { "epoch": 0.013616055598893695, "grad_norm": 2419.41455078125, "loss": 43.0334, "lr": 9.400000000000001e-05, "step": 48, "tokens_trained": 0.02357572 }, { "epoch": 0.014183391248847599, "grad_norm": 1267.87451171875, "loss": 21.8063, "lr": 9.800000000000001e-05, "step": 50, "tokens_trained": 0.024553376 }, { "epoch": 0.014750726898801504, "grad_norm": 1573.944091796875, "loss": 52.9693, "lr": 0.000102, "step": 52, "tokens_trained": 0.025536728 }, { "epoch": 0.015318062548755408, "grad_norm": 1509.650146484375, "loss": 50.0825, "lr": 0.000106, "step": 54, "tokens_trained": 0.026517 }, { "epoch": 0.01588539819870931, "grad_norm": 2334.765380859375, "loss": 42.1982, "lr": 0.00011, "step": 56, "tokens_trained": 0.027504728 }, { "epoch": 0.016452733848663217, "grad_norm": 1594.16259765625, "loss": 39.0562, "lr": 0.000114, "step": 58, "tokens_trained": 0.028485416 }, { "epoch": 0.01702006949861712, "grad_norm": 1628.082275390625, "loss": 35.0488, "lr": 0.000118, "step": 60, "tokens_trained": 0.029468696 }, { "epoch": 0.017587405148571024, "grad_norm": 2496.6455078125, "loss": 49.4241, "lr": 0.000122, "step": 62, "tokens_trained": 0.030453584 }, { "epoch": 0.018154740798524926, "grad_norm": 2521.721435546875, "loss": 69.0275, "lr": 0.000126, "step": 64, "tokens_trained": 0.031432864 }, { "epoch": 0.01872207644847883, "grad_norm": 2179.571533203125, "loss": 63.1409, "lr": 0.00013000000000000002, "step": 66, "tokens_trained": 0.032418416 }, { "epoch": 0.019289412098432736, "grad_norm": 899.7137451171875, "loss": 38.4131, "lr": 0.000134, "step": 68, "tokens_trained": 0.033402136 }, { "epoch": 0.01985674774838664, "grad_norm": 2109.377685546875, "loss": 51.0044, "lr": 0.00013800000000000002, "step": 70, "tokens_trained": 0.03438832 }, { "epoch": 0.020424083398340544, "grad_norm": 1649.1873779296875, "loss": 32.1408, "lr": 0.00014199999999999998, "step": 72, "tokens_trained": 0.035374464 }, { "epoch": 0.020991419048294446, "grad_norm": 1807.994140625, "loss": 28.8357, "lr": 0.000146, "step": 74, "tokens_trained": 0.03635784 }, { "epoch": 0.02155875469824835, "grad_norm": 998.9485473632812, "loss": 23.0343, "lr": 0.00015, "step": 76, "tokens_trained": 0.037340248 }, { "epoch": 0.022126090348202256, "grad_norm": 2240.17578125, "loss": 32.0397, "lr": 0.000154, "step": 78, "tokens_trained": 0.038321968 }, { "epoch": 0.022693425998156158, "grad_norm": 1606.0067138671875, "loss": 32.1776, "lr": 0.000158, "step": 80, "tokens_trained": 0.039304992 }, { "epoch": 0.023260761648110063, "grad_norm": 1685.1015625, "loss": 24.3428, "lr": 0.000162, "step": 82, "tokens_trained": 0.040286808 }, { "epoch": 0.02382809729806397, "grad_norm": 1761.7890625, "loss": 23.9261, "lr": 0.00016600000000000002, "step": 84, "tokens_trained": 0.041271776 }, { "epoch": 0.02439543294801787, "grad_norm": 2036.0982666015625, "loss": 27.7196, "lr": 0.00017, "step": 86, "tokens_trained": 0.042252784 }, { "epoch": 0.024962768597971776, "grad_norm": 1564.3870849609375, "loss": 25.3722, "lr": 0.000174, "step": 88, "tokens_trained": 0.04323596 }, { "epoch": 0.025530104247925678, "grad_norm": 1508.349853515625, "loss": 18.4107, "lr": 0.000178, "step": 90, "tokens_trained": 0.044218984 }, { "epoch": 0.026097439897879583, "grad_norm": 1955.011474609375, "loss": 28.8456, "lr": 0.000182, "step": 92, "tokens_trained": 0.045202144 }, { "epoch": 0.02666477554783349, "grad_norm": 1679.9423828125, "loss": 23.6139, "lr": 0.000186, "step": 94, "tokens_trained": 0.046192336 }, { "epoch": 0.02723211119778739, "grad_norm": 1517.5731201171875, "loss": 42.145, "lr": 0.00019, "step": 96, "tokens_trained": 0.047174312 }, { "epoch": 0.027799446847741296, "grad_norm": 1535.3076171875, "loss": 31.9711, "lr": 0.000194, "step": 98, "tokens_trained": 0.048158944 }, { "epoch": 0.028366782497695198, "grad_norm": 1475.2569580078125, "loss": 37.645, "lr": 0.00019800000000000002, "step": 100, "tokens_trained": 0.04914364 }, { "epoch": 0.028934118147649103, "grad_norm": 1918.4088134765625, "loss": 69.4053, "lr": 0.000202, "step": 102, "tokens_trained": 0.050123488 }, { "epoch": 0.02950145379760301, "grad_norm": 1631.6231689453125, "loss": 50.9725, "lr": 0.000206, "step": 104, "tokens_trained": 0.051105512 }, { "epoch": 0.03006878944755691, "grad_norm": 1291.6376953125, "loss": 22.6527, "lr": 0.00021, "step": 106, "tokens_trained": 0.052091704 }, { "epoch": 0.030636125097510816, "grad_norm": 1224.9625244140625, "loss": 60.2725, "lr": 0.000214, "step": 108, "tokens_trained": 0.053074824 }, { "epoch": 0.031203460747464717, "grad_norm": 1218.2022705078125, "loss": 75.8728, "lr": 0.000218, "step": 110, "tokens_trained": 0.054057104 }, { "epoch": 0.03177079639741862, "grad_norm": 1761.8861083984375, "loss": 61.6427, "lr": 0.000222, "step": 112, "tokens_trained": 0.055039128 }, { "epoch": 0.03233813204737253, "grad_norm": 1482.4256591796875, "loss": 35.3351, "lr": 0.00022600000000000002, "step": 114, "tokens_trained": 0.05602388 }, { "epoch": 0.03290546769732643, "grad_norm": 563.6399536132812, "loss": 40.1461, "lr": 0.00023, "step": 116, "tokens_trained": 0.057005376 }, { "epoch": 0.03347280334728033, "grad_norm": 1266.058837890625, "loss": 24.0657, "lr": 0.00023400000000000002, "step": 118, "tokens_trained": 0.057985136 }, { "epoch": 0.03404013899723424, "grad_norm": 918.206298828125, "loss": 23.9626, "lr": 0.00023799999999999998, "step": 120, "tokens_trained": 0.058968288 }, { "epoch": 0.03460747464718814, "grad_norm": 1495.7191162109375, "loss": 19.798, "lr": 0.000242, "step": 122, "tokens_trained": 0.05995348 }, { "epoch": 0.03517481029714205, "grad_norm": 1264.302734375, "loss": 31.5342, "lr": 0.000246, "step": 124, "tokens_trained": 0.060935832 }, { "epoch": 0.035458478122119, "eval_loss": 5.312118053436279, "eval_runtime": 21.3065, "step": 125, "tokens_trained": 0.061426608 }, { "epoch": 0.03574214594709595, "grad_norm": 907.4861450195312, "loss": 25.1262, "lr": 0.00025, "step": 126, "tokens_trained": 0.061918184 }, { "epoch": 0.03630948159704985, "grad_norm": 1287.6158447265625, "loss": 26.963, "lr": 0.000254, "step": 128, "tokens_trained": 0.062902328 }, { "epoch": 0.03687681724700376, "grad_norm": 1260.570556640625, "loss": 24.9633, "lr": 0.00025800000000000004, "step": 130, "tokens_trained": 0.063883456 }, { "epoch": 0.03744415289695766, "grad_norm": 1436.82373046875, "loss": 23.1028, "lr": 0.000262, "step": 132, "tokens_trained": 0.06486748 }, { "epoch": 0.03801148854691157, "grad_norm": 812.9523315429688, "loss": 20.5496, "lr": 0.000266, "step": 134, "tokens_trained": 0.065847104 }, { "epoch": 0.03857882419686547, "grad_norm": 1336.5322265625, "loss": 23.673, "lr": 0.00027, "step": 136, "tokens_trained": 0.066829928 }, { "epoch": 0.03914615984681937, "grad_norm": 1381.282470703125, "loss": 32.0373, "lr": 0.00027400000000000005, "step": 138, "tokens_trained": 0.067814024 }, { "epoch": 0.03971349549677328, "grad_norm": 972.7861938476562, "loss": 26.9454, "lr": 0.00027800000000000004, "step": 140, "tokens_trained": 0.068797744 }, { "epoch": 0.04028083114672718, "grad_norm": 1347.2249755859375, "loss": 22.3578, "lr": 0.00028199999999999997, "step": 142, "tokens_trained": 0.069780072 }, { "epoch": 0.04084816679668109, "grad_norm": 829.525390625, "loss": 37.9879, "lr": 0.00028599999999999996, "step": 144, "tokens_trained": 0.070759896 }, { "epoch": 0.04141550244663499, "grad_norm": 1094.1033935546875, "loss": 21.1972, "lr": 0.00029, "step": 146, "tokens_trained": 0.0717452 }, { "epoch": 0.04198283809658889, "grad_norm": 717.107421875, "loss": 21.7774, "lr": 0.000294, "step": 148, "tokens_trained": 0.072727432 }, { "epoch": 0.042550173746542796, "grad_norm": 744.4456787109375, "loss": 20.3235, "lr": 0.000298, "step": 150, "tokens_trained": 0.073712128 }, { "epoch": 0.0431175093964967, "grad_norm": 904.1460571289062, "loss": 22.7878, "lr": 0.000302, "step": 152, "tokens_trained": 0.074695296 }, { "epoch": 0.04368484504645061, "grad_norm": 1352.303955078125, "loss": 20.9757, "lr": 0.000306, "step": 154, "tokens_trained": 0.0756798 }, { "epoch": 0.04425218069640451, "grad_norm": 997.0473022460938, "loss": 17.4647, "lr": 0.00031, "step": 156, "tokens_trained": 0.076666504 }, { "epoch": 0.04481951634635841, "grad_norm": 1206.387939453125, "loss": 21.1846, "lr": 0.000314, "step": 158, "tokens_trained": 0.07764868 }, { "epoch": 0.045386851996312316, "grad_norm": 1029.6807861328125, "loss": 17.8853, "lr": 0.00031800000000000003, "step": 160, "tokens_trained": 0.07863548 }, { "epoch": 0.04595418764626622, "grad_norm": 1136.4635009765625, "loss": 30.057, "lr": 0.000322, "step": 162, "tokens_trained": 0.079618928 }, { "epoch": 0.04652152329622013, "grad_norm": 834.3464965820312, "loss": 28.1782, "lr": 0.000326, "step": 164, "tokens_trained": 0.0806032 }, { "epoch": 0.04708885894617403, "grad_norm": 1177.8365478515625, "loss": 16.4267, "lr": 0.00033, "step": 166, "tokens_trained": 0.081583752 }, { "epoch": 0.04765619459612794, "grad_norm": 572.501708984375, "loss": 16.5752, "lr": 0.00033400000000000004, "step": 168, "tokens_trained": 0.082568184 }, { "epoch": 0.048223530246081836, "grad_norm": 437.6822814941406, "loss": 11.5509, "lr": 0.00033800000000000003, "step": 170, "tokens_trained": 0.083553352 }, { "epoch": 0.04879086589603574, "grad_norm": 1119.0416259765625, "loss": 16.2689, "lr": 0.000342, "step": 172, "tokens_trained": 0.084536352 }, { "epoch": 0.04935820154598965, "grad_norm": 895.4021606445312, "loss": 12.6663, "lr": 0.000346, "step": 174, "tokens_trained": 0.085517312 }, { "epoch": 0.04992553719594355, "grad_norm": 995.6289672851562, "loss": 26.0663, "lr": 0.00035, "step": 176, "tokens_trained": 0.086496088 }, { "epoch": 0.05049287284589746, "grad_norm": 839.6610717773438, "loss": 21.5115, "lr": 0.000354, "step": 178, "tokens_trained": 0.087480632 }, { "epoch": 0.051060208495851356, "grad_norm": 734.1155395507812, "loss": 29.3287, "lr": 0.000358, "step": 180, "tokens_trained": 0.088460408 }, { "epoch": 0.05162754414580526, "grad_norm": 721.4505615234375, "loss": 26.0801, "lr": 0.000362, "step": 182, "tokens_trained": 0.08944248 }, { "epoch": 0.052194879795759166, "grad_norm": 845.9672241210938, "loss": 19.0639, "lr": 0.000366, "step": 184, "tokens_trained": 0.090427832 }, { "epoch": 0.05276221544571307, "grad_norm": 1210.9969482421875, "loss": 23.9036, "lr": 0.00037, "step": 186, "tokens_trained": 0.091411504 }, { "epoch": 0.05332955109566698, "grad_norm": 1079.1690673828125, "loss": 23.5588, "lr": 0.000374, "step": 188, "tokens_trained": 0.092392672 }, { "epoch": 0.053896886745620876, "grad_norm": 596.111328125, "loss": 20.8275, "lr": 0.000378, "step": 190, "tokens_trained": 0.093374696 }, { "epoch": 0.05446422239557478, "grad_norm": 761.8096923828125, "loss": 22.512, "lr": 0.000382, "step": 192, "tokens_trained": 0.094361912 }, { "epoch": 0.055031558045528686, "grad_norm": 1081.9832763671875, "loss": 32.335, "lr": 0.000386, "step": 194, "tokens_trained": 0.095342992 }, { "epoch": 0.05559889369548259, "grad_norm": 304.3534240722656, "loss": 11.5275, "lr": 0.00039000000000000005, "step": 196, "tokens_trained": 0.096323512 }, { "epoch": 0.0561662293454365, "grad_norm": 586.6314086914062, "loss": 16.2663, "lr": 0.00039400000000000004, "step": 198, "tokens_trained": 0.097308864 }, { "epoch": 0.056733564995390395, "grad_norm": 624.9953002929688, "loss": 16.627, "lr": 0.000398, "step": 200, "tokens_trained": 0.098289064 }, { "epoch": 0.0573009006453443, "grad_norm": 585.9645385742188, "loss": 15.8359, "lr": 0.000402, "step": 202, "tokens_trained": 0.099269696 }, { "epoch": 0.057868236295298206, "grad_norm": 537.9913330078125, "loss": 20.0779, "lr": 0.00040600000000000006, "step": 204, "tokens_trained": 0.100248448 }, { "epoch": 0.05843557194525211, "grad_norm": 805.04931640625, "loss": 21.4524, "lr": 0.00041, "step": 206, "tokens_trained": 0.101231248 }, { "epoch": 0.05900290759520602, "grad_norm": 439.1418151855469, "loss": 23.9852, "lr": 0.000414, "step": 208, "tokens_trained": 0.102210688 }, { "epoch": 0.059570243245159915, "grad_norm": 502.684814453125, "loss": 17.6273, "lr": 0.00041799999999999997, "step": 210, "tokens_trained": 0.103192176 }, { "epoch": 0.06013757889511382, "grad_norm": 849.9979858398438, "loss": 33.7517, "lr": 0.000422, "step": 212, "tokens_trained": 0.104172824 }, { "epoch": 0.060704914545067726, "grad_norm": 939.583740234375, "loss": 26.2559, "lr": 0.000426, "step": 214, "tokens_trained": 0.105156672 }, { "epoch": 0.06127225019502163, "grad_norm": 525.0505981445312, "loss": 20.0923, "lr": 0.00043, "step": 216, "tokens_trained": 0.106141368 }, { "epoch": 0.061839585844975536, "grad_norm": 420.296630859375, "loss": 17.9608, "lr": 0.00043400000000000003, "step": 218, "tokens_trained": 0.107124088 }, { "epoch": 0.062406921494929435, "grad_norm": 711.3380737304688, "loss": 19.387, "lr": 0.000438, "step": 220, "tokens_trained": 0.108112632 }, { "epoch": 0.06297425714488335, "grad_norm": 759.183349609375, "loss": 17.8061, "lr": 0.000442, "step": 222, "tokens_trained": 0.1090934 }, { "epoch": 0.06354159279483725, "grad_norm": 790.025146484375, "loss": 13.8539, "lr": 0.000446, "step": 224, "tokens_trained": 0.110079512 }, { "epoch": 0.06410892844479114, "grad_norm": 769.8306274414062, "loss": 22.1258, "lr": 0.00045000000000000004, "step": 226, "tokens_trained": 0.111060152 }, { "epoch": 0.06467626409474506, "grad_norm": 656.8352661132812, "loss": 14.8646, "lr": 0.00045400000000000003, "step": 228, "tokens_trained": 0.112044144 }, { "epoch": 0.06524359974469895, "grad_norm": 498.92010498046875, "loss": 23.1558, "lr": 0.000458, "step": 230, "tokens_trained": 0.113022928 }, { "epoch": 0.06581093539465287, "grad_norm": 764.0186157226562, "loss": 16.7089, "lr": 0.000462, "step": 232, "tokens_trained": 0.114003832 }, { "epoch": 0.06637827104460677, "grad_norm": 491.5793762207031, "loss": 12.3979, "lr": 0.00046600000000000005, "step": 234, "tokens_trained": 0.114991008 }, { "epoch": 0.06694560669456066, "grad_norm": 679.9217529296875, "loss": 14.9037, "lr": 0.00047, "step": 236, "tokens_trained": 0.115971888 }, { "epoch": 0.06751294234451458, "grad_norm": 491.0369567871094, "loss": 7.7603, "lr": 0.000474, "step": 238, "tokens_trained": 0.116952616 }, { "epoch": 0.06808027799446847, "grad_norm": 369.2186279296875, "loss": 8.2256, "lr": 0.00047799999999999996, "step": 240, "tokens_trained": 0.117935816 }, { "epoch": 0.06864761364442239, "grad_norm": 312.72137451171875, "loss": 7.5486, "lr": 0.000482, "step": 242, "tokens_trained": 0.118919392 }, { "epoch": 0.06921494929437629, "grad_norm": 596.1439208984375, "loss": 11.7351, "lr": 0.000486, "step": 244, "tokens_trained": 0.119901856 }, { "epoch": 0.06978228494433018, "grad_norm": 467.5667419433594, "loss": 11.8403, "lr": 0.00049, "step": 246, "tokens_trained": 0.120884624 }, { "epoch": 0.0703496205942841, "grad_norm": 430.50048828125, "loss": 13.8081, "lr": 0.000494, "step": 248, "tokens_trained": 0.121869224 }, { "epoch": 0.070916956244238, "grad_norm": 522.242919921875, "loss": 14.1892, "lr": 0.000498, "step": 250, "tokens_trained": 0.122853584 }, { "epoch": 0.070916956244238, "eval_loss": 1.9294606447219849, "eval_runtime": 20.4162, "step": 250, "tokens_trained": 0.122853584 }, { "epoch": 0.0714842918941919, "grad_norm": 835.2765502929688, "loss": 13.2462, "lr": 0.0005020000000000001, "step": 252, "tokens_trained": 0.123835544 }, { "epoch": 0.0720516275441458, "grad_norm": 714.8098754882812, "loss": 20.0498, "lr": 0.000506, "step": 254, "tokens_trained": 0.124821616 }, { "epoch": 0.0726189631940997, "grad_norm": 701.512939453125, "loss": 18.3664, "lr": 0.00051, "step": 256, "tokens_trained": 0.125807608 }, { "epoch": 0.07318629884405362, "grad_norm": 773.987060546875, "loss": 21.3807, "lr": 0.000514, "step": 258, "tokens_trained": 0.126791464 }, { "epoch": 0.07375363449400751, "grad_norm": 826.422119140625, "loss": 22.6403, "lr": 0.000518, "step": 260, "tokens_trained": 0.127771752 }, { "epoch": 0.07432097014396143, "grad_norm": 742.8673095703125, "loss": 20.1504, "lr": 0.000522, "step": 262, "tokens_trained": 0.128755448 }, { "epoch": 0.07488830579391532, "grad_norm": 797.79296875, "loss": 26.7343, "lr": 0.000526, "step": 264, "tokens_trained": 0.129741088 }, { "epoch": 0.07545564144386922, "grad_norm": 673.9141235351562, "loss": 12.505, "lr": 0.0005300000000000001, "step": 266, "tokens_trained": 0.130727504 }, { "epoch": 0.07602297709382314, "grad_norm": 310.6510925292969, "loss": 12.6344, "lr": 0.0005340000000000001, "step": 268, "tokens_trained": 0.131710296 }, { "epoch": 0.07659031274377703, "grad_norm": 312.40966796875, "loss": 14.254, "lr": 0.0005380000000000001, "step": 270, "tokens_trained": 0.132695352 }, { "epoch": 0.07715764839373095, "grad_norm": 492.2834777832031, "loss": 19.0979, "lr": 0.0005420000000000001, "step": 272, "tokens_trained": 0.133677928 }, { "epoch": 0.07772498404368484, "grad_norm": 628.457763671875, "loss": 21.7735, "lr": 0.000546, "step": 274, "tokens_trained": 0.134655504 }, { "epoch": 0.07829231969363874, "grad_norm": 382.8389892578125, "loss": 12.5128, "lr": 0.00055, "step": 276, "tokens_trained": 0.135640208 }, { "epoch": 0.07885965534359266, "grad_norm": 483.12335205078125, "loss": 15.2589, "lr": 0.000554, "step": 278, "tokens_trained": 0.136624232 }, { "epoch": 0.07942699099354655, "grad_norm": 640.658447265625, "loss": 12.1341, "lr": 0.000558, "step": 280, "tokens_trained": 0.13760628 }, { "epoch": 0.07999432664350047, "grad_norm": 410.0824279785156, "loss": 12.5723, "lr": 0.0005620000000000001, "step": 282, "tokens_trained": 0.13858832 }, { "epoch": 0.08056166229345436, "grad_norm": 513.2861328125, "loss": 14.8461, "lr": 0.000566, "step": 284, "tokens_trained": 0.139568424 }, { "epoch": 0.08112899794340826, "grad_norm": 564.547607421875, "loss": 12.5792, "lr": 0.00057, "step": 286, "tokens_trained": 0.140557016 }, { "epoch": 0.08169633359336217, "grad_norm": 451.3592834472656, "loss": 16.5433, "lr": 0.000574, "step": 288, "tokens_trained": 0.141540248 }, { "epoch": 0.08226366924331607, "grad_norm": 404.2495422363281, "loss": 16.4138, "lr": 0.000578, "step": 290, "tokens_trained": 0.142528272 }, { "epoch": 0.08283100489326999, "grad_norm": 566.5219116210938, "loss": 16.4743, "lr": 0.0005819999999999999, "step": 292, "tokens_trained": 0.143513096 }, { "epoch": 0.08339834054322388, "grad_norm": 559.6517333984375, "loss": 16.421, "lr": 0.0005859999999999999, "step": 294, "tokens_trained": 0.144494472 }, { "epoch": 0.08396567619317778, "grad_norm": 260.874755859375, "loss": 11.2214, "lr": 0.00059, "step": 296, "tokens_trained": 0.14547876 }, { "epoch": 0.0845330118431317, "grad_norm": 272.02899169921875, "loss": 10.3491, "lr": 0.000594, "step": 298, "tokens_trained": 0.146465864 }, { "epoch": 0.08510034749308559, "grad_norm": 556.9845581054688, "loss": 10.4348, "lr": 0.000598, "step": 300, "tokens_trained": 0.147446344 }, { "epoch": 0.0856676831430395, "grad_norm": 273.35772705078125, "loss": 8.3292, "lr": 0.000602, "step": 302, "tokens_trained": 0.14843244 }, { "epoch": 0.0862350187929934, "grad_norm": 246.6316680908203, "loss": 9.9362, "lr": 0.000606, "step": 304, "tokens_trained": 0.149415976 }, { "epoch": 0.0868023544429473, "grad_norm": 564.4365844726562, "loss": 9.2621, "lr": 0.00061, "step": 306, "tokens_trained": 0.150398728 }, { "epoch": 0.08736969009290121, "grad_norm": 396.0948791503906, "loss": 11.8526, "lr": 0.000614, "step": 308, "tokens_trained": 0.151385104 }, { "epoch": 0.08793702574285511, "grad_norm": 488.6072692871094, "loss": 11.8473, "lr": 0.0006180000000000001, "step": 310, "tokens_trained": 0.152373672 }, { "epoch": 0.08850436139280903, "grad_norm": 346.70660400390625, "loss": 12.0897, "lr": 0.000622, "step": 312, "tokens_trained": 0.153356256 }, { "epoch": 0.08907169704276292, "grad_norm": 382.40679931640625, "loss": 9.271, "lr": 0.000626, "step": 314, "tokens_trained": 0.154342632 }, { "epoch": 0.08963903269271682, "grad_norm": 288.7908935546875, "loss": 9.185, "lr": 0.00063, "step": 316, "tokens_trained": 0.1553238 }, { "epoch": 0.09020636834267073, "grad_norm": 337.5335388183594, "loss": 12.0555, "lr": 0.000634, "step": 318, "tokens_trained": 0.156313168 }, { "epoch": 0.09077370399262463, "grad_norm": 349.25531005859375, "loss": 8.51, "lr": 0.000638, "step": 320, "tokens_trained": 0.157299448 }, { "epoch": 0.09134103964257854, "grad_norm": 471.7824401855469, "loss": 14.1888, "lr": 0.000642, "step": 322, "tokens_trained": 0.158285264 }, { "epoch": 0.09190837529253244, "grad_norm": 284.94036865234375, "loss": 10.1593, "lr": 0.000646, "step": 324, "tokens_trained": 0.159267512 }, { "epoch": 0.09247571094248634, "grad_norm": 510.90478515625, "loss": 13.5744, "lr": 0.0006500000000000001, "step": 326, "tokens_trained": 0.160250856 }, { "epoch": 0.09304304659244025, "grad_norm": 373.82965087890625, "loss": 8.4999, "lr": 0.0006540000000000001, "step": 328, "tokens_trained": 0.161231832 }, { "epoch": 0.09361038224239415, "grad_norm": 219.3827362060547, "loss": 8.4436, "lr": 0.0006580000000000001, "step": 330, "tokens_trained": 0.162217656 }, { "epoch": 0.09417771789234806, "grad_norm": 433.0914001464844, "loss": 11.2019, "lr": 0.000662, "step": 332, "tokens_trained": 0.163199096 }, { "epoch": 0.09474505354230196, "grad_norm": 242.65907287597656, "loss": 9.0666, "lr": 0.000666, "step": 334, "tokens_trained": 0.164178512 }, { "epoch": 0.09531238919225588, "grad_norm": 446.07916259765625, "loss": 8.6546, "lr": 0.00067, "step": 336, "tokens_trained": 0.165162464 }, { "epoch": 0.09587972484220977, "grad_norm": 231.8892364501953, "loss": 7.5819, "lr": 0.000674, "step": 338, "tokens_trained": 0.166141536 }, { "epoch": 0.09644706049216367, "grad_norm": 100.7306137084961, "loss": 6.7047, "lr": 0.0006780000000000001, "step": 340, "tokens_trained": 0.167123944 }, { "epoch": 0.09701439614211758, "grad_norm": 78.11279296875, "loss": 5.9308, "lr": 0.0006820000000000001, "step": 342, "tokens_trained": 0.168105264 }, { "epoch": 0.09758173179207148, "grad_norm": 271.466064453125, "loss": 6.9141, "lr": 0.0006860000000000001, "step": 344, "tokens_trained": 0.169088912 }, { "epoch": 0.0981490674420254, "grad_norm": 252.54478454589844, "loss": 6.3281, "lr": 0.00069, "step": 346, "tokens_trained": 0.170077368 }, { "epoch": 0.0987164030919793, "grad_norm": 305.8559875488281, "loss": 6.443, "lr": 0.000694, "step": 348, "tokens_trained": 0.171057232 }, { "epoch": 0.09928373874193319, "grad_norm": 227.74374389648438, "loss": 6.552, "lr": 0.0006979999999999999, "step": 350, "tokens_trained": 0.172041376 }, { "epoch": 0.0998510743918871, "grad_norm": 446.7601623535156, "loss": 10.8184, "lr": 0.0007019999999999999, "step": 352, "tokens_trained": 0.173023624 }, { "epoch": 0.100418410041841, "grad_norm": 353.0849609375, "loss": 8.6327, "lr": 0.0007059999999999999, "step": 354, "tokens_trained": 0.174005992 }, { "epoch": 0.10098574569179491, "grad_norm": 367.9427185058594, "loss": 9.3898, "lr": 0.00071, "step": 356, "tokens_trained": 0.174988304 }, { "epoch": 0.10155308134174881, "grad_norm": 224.4961700439453, "loss": 8.284, "lr": 0.000714, "step": 358, "tokens_trained": 0.175969816 }, { "epoch": 0.10212041699170271, "grad_norm": 221.86537170410156, "loss": 7.0578, "lr": 0.000718, "step": 360, "tokens_trained": 0.176952688 }, { "epoch": 0.10268775264165662, "grad_norm": 331.0989685058594, "loss": 6.9561, "lr": 0.000722, "step": 362, "tokens_trained": 0.177935144 }, { "epoch": 0.10325508829161052, "grad_norm": 171.6498260498047, "loss": 7.203, "lr": 0.000726, "step": 364, "tokens_trained": 0.178916776 }, { "epoch": 0.10382242394156443, "grad_norm": 284.2208557128906, "loss": 10.3517, "lr": 0.00073, "step": 366, "tokens_trained": 0.179903432 }, { "epoch": 0.10438975959151833, "grad_norm": 354.8574523925781, "loss": 9.3888, "lr": 0.000734, "step": 368, "tokens_trained": 0.180883224 }, { "epoch": 0.10495709524147223, "grad_norm": 344.82574462890625, "loss": 10.5933, "lr": 0.000738, "step": 370, "tokens_trained": 0.181863808 }, { "epoch": 0.10552443089142614, "grad_norm": 302.6838073730469, "loss": 10.2832, "lr": 0.000742, "step": 372, "tokens_trained": 0.182843712 }, { "epoch": 0.10609176654138004, "grad_norm": 323.0387878417969, "loss": 6.4864, "lr": 0.000746, "step": 374, "tokens_trained": 0.183825832 }, { "epoch": 0.10637543436635699, "eval_loss": 1.4430732727050781, "eval_runtime": 20.5468, "step": 375, "tokens_trained": 0.184317744 }, { "epoch": 0.10665910219133395, "grad_norm": 133.74822998046875, "loss": 5.4176, "lr": 0.00075, "step": 376, "tokens_trained": 0.184811352 }, { "epoch": 0.10722643784128785, "grad_norm": 180.3372344970703, "loss": 5.5641, "lr": 0.000754, "step": 378, "tokens_trained": 0.185792528 }, { "epoch": 0.10779377349124175, "grad_norm": 250.83999633789062, "loss": 5.8612, "lr": 0.000758, "step": 380, "tokens_trained": 0.186777112 }, { "epoch": 0.10836110914119566, "grad_norm": 293.51959228515625, "loss": 6.0418, "lr": 0.000762, "step": 382, "tokens_trained": 0.18775724 }, { "epoch": 0.10892844479114956, "grad_norm": 292.56207275390625, "loss": 6.1812, "lr": 0.0007660000000000001, "step": 384, "tokens_trained": 0.188733568 }, { "epoch": 0.10949578044110347, "grad_norm": 121.82467651367188, "loss": 6.0855, "lr": 0.0007700000000000001, "step": 386, "tokens_trained": 0.189718512 }, { "epoch": 0.11006311609105737, "grad_norm": 124.30497741699219, "loss": 5.7734, "lr": 0.0007740000000000001, "step": 388, "tokens_trained": 0.190703776 }, { "epoch": 0.11063045174101127, "grad_norm": 143.64004516601562, "loss": 5.7641, "lr": 0.000778, "step": 390, "tokens_trained": 0.191689888 }, { "epoch": 0.11119778739096518, "grad_norm": 160.06784057617188, "loss": 5.6025, "lr": 0.000782, "step": 392, "tokens_trained": 0.192673992 }, { "epoch": 0.11176512304091908, "grad_norm": 226.97988891601562, "loss": 6.0049, "lr": 0.000786, "step": 394, "tokens_trained": 0.193656272 }, { "epoch": 0.112332458690873, "grad_norm": 223.26898193359375, "loss": 5.6972, "lr": 0.00079, "step": 396, "tokens_trained": 0.194639144 }, { "epoch": 0.11289979434082689, "grad_norm": 249.34912109375, "loss": 5.7348, "lr": 0.0007940000000000001, "step": 398, "tokens_trained": 0.195621256 }, { "epoch": 0.11346712999078079, "grad_norm": 161.34271240234375, "loss": 5.6689, "lr": 0.0007980000000000001, "step": 400, "tokens_trained": 0.196604136 }, { "epoch": 0.1140344656407347, "grad_norm": 148.53176879882812, "loss": 5.702, "lr": 0.0008020000000000001, "step": 402, "tokens_trained": 0.197586784 }, { "epoch": 0.1146018012906886, "grad_norm": 144.40835571289062, "loss": 6.2402, "lr": 0.0008060000000000001, "step": 404, "tokens_trained": 0.198570824 }, { "epoch": 0.11516913694064251, "grad_norm": 306.57562255859375, "loss": 7.1739, "lr": 0.0008100000000000001, "step": 406, "tokens_trained": 0.199548328 }, { "epoch": 0.11573647259059641, "grad_norm": 308.79180908203125, "loss": 6.0972, "lr": 0.0008139999999999999, "step": 408, "tokens_trained": 0.200532496 }, { "epoch": 0.11630380824055031, "grad_norm": 197.76791381835938, "loss": 6.3533, "lr": 0.0008179999999999999, "step": 410, "tokens_trained": 0.201514648 }, { "epoch": 0.11687114389050422, "grad_norm": 129.5694580078125, "loss": 6.9628, "lr": 0.0008219999999999999, "step": 412, "tokens_trained": 0.2024994 }, { "epoch": 0.11743847954045812, "grad_norm": 446.0195617675781, "loss": 11.7562, "lr": 0.000826, "step": 414, "tokens_trained": 0.20348012 }, { "epoch": 0.11800581519041203, "grad_norm": 355.5342712402344, "loss": 8.8055, "lr": 0.00083, "step": 416, "tokens_trained": 0.20446356 }, { "epoch": 0.11857315084036593, "grad_norm": 456.2491149902344, "loss": 9.606, "lr": 0.000834, "step": 418, "tokens_trained": 0.205445288 }, { "epoch": 0.11914048649031983, "grad_norm": 369.8676452636719, "loss": 8.385, "lr": 0.000838, "step": 420, "tokens_trained": 0.206427832 }, { "epoch": 0.11970782214027374, "grad_norm": 262.19073486328125, "loss": 9.0956, "lr": 0.000842, "step": 422, "tokens_trained": 0.207409848 }, { "epoch": 0.12027515779022764, "grad_norm": 120.3193130493164, "loss": 5.4937, "lr": 0.000846, "step": 424, "tokens_trained": 0.208391752 }, { "epoch": 0.12084249344018155, "grad_norm": 222.1111297607422, "loss": 8.9367, "lr": 0.00085, "step": 426, "tokens_trained": 0.20937384 }, { "epoch": 0.12140982909013545, "grad_norm": 137.16819763183594, "loss": 7.5876, "lr": 0.000854, "step": 428, "tokens_trained": 0.210358576 }, { "epoch": 0.12197716474008935, "grad_norm": 267.61846923828125, "loss": 8.817, "lr": 0.000858, "step": 430, "tokens_trained": 0.211340064 }, { "epoch": 0.12254450039004326, "grad_norm": 472.72906494140625, "loss": 8.203, "lr": 0.000862, "step": 432, "tokens_trained": 0.212321144 }, { "epoch": 0.12311183603999716, "grad_norm": 297.1420593261719, "loss": 10.987, "lr": 0.000866, "step": 434, "tokens_trained": 0.213300312 }, { "epoch": 0.12367917168995107, "grad_norm": 281.7297668457031, "loss": 7.6117, "lr": 0.00087, "step": 436, "tokens_trained": 0.214287624 }, { "epoch": 0.12424650733990497, "grad_norm": 203.09678649902344, "loss": 6.5638, "lr": 0.000874, "step": 438, "tokens_trained": 0.215272136 }, { "epoch": 0.12481384298985887, "grad_norm": 155.7823944091797, "loss": 6.1131, "lr": 0.000878, "step": 440, "tokens_trained": 0.216256392 }, { "epoch": 0.12538117863981277, "grad_norm": 189.86196899414062, "loss": 8.2565, "lr": 0.000882, "step": 442, "tokens_trained": 0.217242504 }, { "epoch": 0.1259485142897667, "grad_norm": 247.4568634033203, "loss": 7.1005, "lr": 0.0008860000000000001, "step": 444, "tokens_trained": 0.218226008 }, { "epoch": 0.1265158499397206, "grad_norm": 179.72825622558594, "loss": 6.3379, "lr": 0.0008900000000000001, "step": 446, "tokens_trained": 0.219210584 }, { "epoch": 0.1270831855896745, "grad_norm": 212.96356201171875, "loss": 7.2514, "lr": 0.000894, "step": 448, "tokens_trained": 0.220193952 }, { "epoch": 0.1276505212396284, "grad_norm": 105.67095947265625, "loss": 5.456, "lr": 0.000898, "step": 450, "tokens_trained": 0.221176936 }, { "epoch": 0.1282178568895823, "grad_norm": 302.9122619628906, "loss": 6.4018, "lr": 0.000902, "step": 452, "tokens_trained": 0.222161952 }, { "epoch": 0.12878519253953621, "grad_norm": 215.66561889648438, "loss": 6.2853, "lr": 0.000906, "step": 454, "tokens_trained": 0.223144912 }, { "epoch": 0.1293525281894901, "grad_norm": 272.9984130859375, "loss": 7.3902, "lr": 0.00091, "step": 456, "tokens_trained": 0.224127392 }, { "epoch": 0.129919863839444, "grad_norm": 200.7503662109375, "loss": 6.1637, "lr": 0.0009140000000000001, "step": 458, "tokens_trained": 0.22511648 }, { "epoch": 0.1304871994893979, "grad_norm": 93.23990631103516, "loss": 6.4867, "lr": 0.0009180000000000001, "step": 460, "tokens_trained": 0.226098144 }, { "epoch": 0.1310545351393518, "grad_norm": 274.37164306640625, "loss": 8.99, "lr": 0.0009220000000000001, "step": 462, "tokens_trained": 0.227081848 }, { "epoch": 0.13162187078930573, "grad_norm": 186.66322326660156, "loss": 8.7122, "lr": 0.0009260000000000001, "step": 464, "tokens_trained": 0.22806636 }, { "epoch": 0.13218920643925963, "grad_norm": 586.1035766601562, "loss": 9.1045, "lr": 0.00093, "step": 466, "tokens_trained": 0.229047872 }, { "epoch": 0.13275654208921353, "grad_norm": 227.55996704101562, "loss": 9.7276, "lr": 0.000934, "step": 468, "tokens_trained": 0.230031144 }, { "epoch": 0.13332387773916743, "grad_norm": 229.26609802246094, "loss": 6.6244, "lr": 0.0009379999999999999, "step": 470, "tokens_trained": 0.2310158 }, { "epoch": 0.13389121338912133, "grad_norm": 145.16331481933594, "loss": 5.759, "lr": 0.000942, "step": 472, "tokens_trained": 0.2319996 }, { "epoch": 0.13445854903907525, "grad_norm": 109.9937744140625, "loss": 5.4838, "lr": 0.000946, "step": 474, "tokens_trained": 0.232983808 }, { "epoch": 0.13502588468902915, "grad_norm": 135.74899291992188, "loss": 6.2738, "lr": 0.00095, "step": 476, "tokens_trained": 0.233963016 }, { "epoch": 0.13559322033898305, "grad_norm": 142.99449157714844, "loss": 5.8459, "lr": 0.000954, "step": 478, "tokens_trained": 0.234948864 }, { "epoch": 0.13616055598893695, "grad_norm": 198.66883850097656, "loss": 6.6626, "lr": 0.000958, "step": 480, "tokens_trained": 0.235932392 }, { "epoch": 0.13672789163889085, "grad_norm": 260.76507568359375, "loss": 6.9299, "lr": 0.000962, "step": 482, "tokens_trained": 0.236915664 }, { "epoch": 0.13729522728884477, "grad_norm": 267.97589111328125, "loss": 6.4343, "lr": 0.000966, "step": 484, "tokens_trained": 0.237896904 }, { "epoch": 0.13786256293879867, "grad_norm": 89.8781967163086, "loss": 6.3203, "lr": 0.0009699999999999999, "step": 486, "tokens_trained": 0.238874528 }, { "epoch": 0.13842989858875257, "grad_norm": 225.62985229492188, "loss": 6.2778, "lr": 0.000974, "step": 488, "tokens_trained": 0.2398588 }, { "epoch": 0.13899723423870647, "grad_norm": 85.84110260009766, "loss": 5.2786, "lr": 0.000978, "step": 490, "tokens_trained": 0.240839968 }, { "epoch": 0.13956456988866037, "grad_norm": 141.4368438720703, "loss": 5.5525, "lr": 0.000982, "step": 492, "tokens_trained": 0.241823544 }, { "epoch": 0.1401319055386143, "grad_norm": 94.9535140991211, "loss": 5.4386, "lr": 0.0009860000000000001, "step": 494, "tokens_trained": 0.242805456 }, { "epoch": 0.1406992411885682, "grad_norm": 157.4557647705078, "loss": 5.9786, "lr": 0.00099, "step": 496, "tokens_trained": 0.243792496 }, { "epoch": 0.1412665768385221, "grad_norm": 319.5025634765625, "loss": 7.04, "lr": 0.000994, "step": 498, "tokens_trained": 0.244772472 }, { "epoch": 0.141833912488476, "grad_norm": 282.26824951171875, "loss": 9.4037, "lr": 0.000998, "step": 500, "tokens_trained": 0.245758968 }, { "epoch": 0.141833912488476, "eval_loss": 2.152184247970581, "eval_runtime": 21.2772, "step": 500, "tokens_trained": 0.245758968 }, { "epoch": 0.1424012481384299, "grad_norm": 306.0666809082031, "loss": 7.8845, "lr": 0.00099986013986014, "step": 502, "tokens_trained": 0.246739024 }, { "epoch": 0.1429685837883838, "grad_norm": 188.89024353027344, "loss": 6.8118, "lr": 0.0009995804195804196, "step": 504, "tokens_trained": 0.247726552 }, { "epoch": 0.1435359194383377, "grad_norm": 228.97474670410156, "loss": 6.8475, "lr": 0.0009993006993006994, "step": 506, "tokens_trained": 0.24870688 }, { "epoch": 0.1441032550882916, "grad_norm": 229.80029296875, "loss": 6.2171, "lr": 0.000999020979020979, "step": 508, "tokens_trained": 0.249689096 }, { "epoch": 0.1446705907382455, "grad_norm": 157.30340576171875, "loss": 6.2281, "lr": 0.0009987412587412587, "step": 510, "tokens_trained": 0.250671768 }, { "epoch": 0.1452379263881994, "grad_norm": 176.64683532714844, "loss": 6.5993, "lr": 0.0009984615384615386, "step": 512, "tokens_trained": 0.25165608 }, { "epoch": 0.14580526203815333, "grad_norm": 197.20526123046875, "loss": 5.7267, "lr": 0.0009981818181818182, "step": 514, "tokens_trained": 0.252639712 }, { "epoch": 0.14637259768810723, "grad_norm": 54.713260650634766, "loss": 5.7911, "lr": 0.000997902097902098, "step": 516, "tokens_trained": 0.253622816 }, { "epoch": 0.14693993333806113, "grad_norm": 185.74923706054688, "loss": 7.0055, "lr": 0.0009976223776223777, "step": 518, "tokens_trained": 0.254602792 }, { "epoch": 0.14750726898801503, "grad_norm": 240.31021118164062, "loss": 6.452, "lr": 0.0009973426573426573, "step": 520, "tokens_trained": 0.255584736 }, { "epoch": 0.14807460463796893, "grad_norm": 160.2477264404297, "loss": 7.6556, "lr": 0.000997062937062937, "step": 522, "tokens_trained": 0.256563792 }, { "epoch": 0.14864194028792285, "grad_norm": 283.0034484863281, "loss": 6.5345, "lr": 0.0009967832167832168, "step": 524, "tokens_trained": 0.257546656 }, { "epoch": 0.14920927593787675, "grad_norm": 245.537109375, "loss": 6.3281, "lr": 0.0009965034965034964, "step": 526, "tokens_trained": 0.258530832 }, { "epoch": 0.14977661158783065, "grad_norm": 162.1538848876953, "loss": 7.4072, "lr": 0.0009962237762237763, "step": 528, "tokens_trained": 0.259514528 }, { "epoch": 0.15034394723778455, "grad_norm": 107.25792694091797, "loss": 5.356, "lr": 0.000995944055944056, "step": 530, "tokens_trained": 0.260500912 }, { "epoch": 0.15091128288773845, "grad_norm": 173.73353576660156, "loss": 6.8625, "lr": 0.0009956643356643356, "step": 532, "tokens_trained": 0.26148632 }, { "epoch": 0.15147861853769237, "grad_norm": 178.33541870117188, "loss": 5.8794, "lr": 0.0009953846153846154, "step": 534, "tokens_trained": 0.262468816 }, { "epoch": 0.15204595418764627, "grad_norm": 181.2533416748047, "loss": 7.0243, "lr": 0.000995104895104895, "step": 536, "tokens_trained": 0.263446696 }, { "epoch": 0.15261328983760017, "grad_norm": 208.79293823242188, "loss": 5.8908, "lr": 0.000994825174825175, "step": 538, "tokens_trained": 0.26443108 }, { "epoch": 0.15318062548755407, "grad_norm": 148.66285705566406, "loss": 6.0831, "lr": 0.0009945454545454546, "step": 540, "tokens_trained": 0.265414496 }, { "epoch": 0.15374796113750797, "grad_norm": 165.044189453125, "loss": 5.5594, "lr": 0.0009942657342657344, "step": 542, "tokens_trained": 0.266394128 }, { "epoch": 0.1543152967874619, "grad_norm": 124.5405502319336, "loss": 5.2442, "lr": 0.000993986013986014, "step": 544, "tokens_trained": 0.267378768 }, { "epoch": 0.1548826324374158, "grad_norm": 68.66510772705078, "loss": 5.1173, "lr": 0.0009937062937062937, "step": 546, "tokens_trained": 0.268360184 }, { "epoch": 0.1554499680873697, "grad_norm": 57.052860260009766, "loss": 5.2348, "lr": 0.0009934265734265735, "step": 548, "tokens_trained": 0.269345672 }, { "epoch": 0.1560173037373236, "grad_norm": 184.9175567626953, "loss": 6.7748, "lr": 0.0009931468531468532, "step": 550, "tokens_trained": 0.2703288 }, { "epoch": 0.15658463938727749, "grad_norm": 72.9861831665039, "loss": 5.7387, "lr": 0.000992867132867133, "step": 552, "tokens_trained": 0.271309176 }, { "epoch": 0.1571519750372314, "grad_norm": 135.864501953125, "loss": 6.3035, "lr": 0.0009925874125874127, "step": 554, "tokens_trained": 0.27229644 }, { "epoch": 0.1577193106871853, "grad_norm": 130.579833984375, "loss": 5.4434, "lr": 0.0009923076923076923, "step": 556, "tokens_trained": 0.273277904 }, { "epoch": 0.1582866463371392, "grad_norm": 206.77345275878906, "loss": 5.8649, "lr": 0.000992027972027972, "step": 558, "tokens_trained": 0.274261712 }, { "epoch": 0.1588539819870931, "grad_norm": 144.0505828857422, "loss": 5.3459, "lr": 0.0009917482517482518, "step": 560, "tokens_trained": 0.2752468 }, { "epoch": 0.159421317637047, "grad_norm": 87.56634521484375, "loss": 5.6321, "lr": 0.0009914685314685314, "step": 562, "tokens_trained": 0.276232384 }, { "epoch": 0.15998865328700093, "grad_norm": 275.2727355957031, "loss": 6.7515, "lr": 0.0009911888111888113, "step": 564, "tokens_trained": 0.277211608 }, { "epoch": 0.16055598893695483, "grad_norm": 97.00019836425781, "loss": 5.4374, "lr": 0.000990909090909091, "step": 566, "tokens_trained": 0.278196336 }, { "epoch": 0.16112332458690873, "grad_norm": 102.91439056396484, "loss": 5.729, "lr": 0.0009906293706293705, "step": 568, "tokens_trained": 0.279175672 }, { "epoch": 0.16169066023686263, "grad_norm": 151.12432861328125, "loss": 5.4189, "lr": 0.0009903496503496504, "step": 570, "tokens_trained": 0.280161088 }, { "epoch": 0.16225799588681653, "grad_norm": 86.6823959350586, "loss": 5.1704, "lr": 0.00099006993006993, "step": 572, "tokens_trained": 0.28114256 }, { "epoch": 0.16282533153677045, "grad_norm": 90.7052230834961, "loss": 5.3673, "lr": 0.0009897902097902099, "step": 574, "tokens_trained": 0.282128904 }, { "epoch": 0.16339266718672435, "grad_norm": 146.92874145507812, "loss": 5.5971, "lr": 0.0009895104895104895, "step": 576, "tokens_trained": 0.28311528 }, { "epoch": 0.16396000283667825, "grad_norm": 189.76296997070312, "loss": 5.3109, "lr": 0.0009892307692307694, "step": 578, "tokens_trained": 0.284098528 }, { "epoch": 0.16452733848663215, "grad_norm": 174.48092651367188, "loss": 5.68, "lr": 0.000988951048951049, "step": 580, "tokens_trained": 0.285081064 }, { "epoch": 0.16509467413658604, "grad_norm": 154.10816955566406, "loss": 5.3307, "lr": 0.0009886713286713286, "step": 582, "tokens_trained": 0.286067952 }, { "epoch": 0.16566200978653997, "grad_norm": 64.28263092041016, "loss": 5.1676, "lr": 0.0009883916083916085, "step": 584, "tokens_trained": 0.287051384 }, { "epoch": 0.16622934543649387, "grad_norm": 103.81795501708984, "loss": 5.3436, "lr": 0.0009881118881118881, "step": 586, "tokens_trained": 0.28803284 }, { "epoch": 0.16679668108644777, "grad_norm": 144.0076904296875, "loss": 5.3033, "lr": 0.000987832167832168, "step": 588, "tokens_trained": 0.289014824 }, { "epoch": 0.16736401673640167, "grad_norm": 88.31237030029297, "loss": 5.0609, "lr": 0.0009875524475524476, "step": 590, "tokens_trained": 0.289999864 }, { "epoch": 0.16793135238635556, "grad_norm": 68.4583740234375, "loss": 5.0702, "lr": 0.0009872727272727273, "step": 592, "tokens_trained": 0.290983888 }, { "epoch": 0.1684986880363095, "grad_norm": 135.28665161132812, "loss": 5.3962, "lr": 0.000986993006993007, "step": 594, "tokens_trained": 0.291965752 }, { "epoch": 0.1690660236862634, "grad_norm": 80.0412368774414, "loss": 5.0246, "lr": 0.0009867132867132867, "step": 596, "tokens_trained": 0.292946952 }, { "epoch": 0.1696333593362173, "grad_norm": 43.29194641113281, "loss": 5.0051, "lr": 0.0009864335664335664, "step": 598, "tokens_trained": 0.293928976 }, { "epoch": 0.17020069498617119, "grad_norm": 220.88687133789062, "loss": 6.0798, "lr": 0.0009861538461538462, "step": 600, "tokens_trained": 0.294912408 }, { "epoch": 0.17076803063612508, "grad_norm": 102.58654022216797, "loss": 5.1271, "lr": 0.0009858741258741259, "step": 602, "tokens_trained": 0.29589416 }, { "epoch": 0.171335366286079, "grad_norm": 119.0067138671875, "loss": 5.7402, "lr": 0.0009855944055944055, "step": 604, "tokens_trained": 0.296878584 }, { "epoch": 0.1719027019360329, "grad_norm": 138.8656005859375, "loss": 5.1951, "lr": 0.0009853146853146854, "step": 606, "tokens_trained": 0.297864552 }, { "epoch": 0.1724700375859868, "grad_norm": 73.5890884399414, "loss": 5.2522, "lr": 0.000985034965034965, "step": 608, "tokens_trained": 0.298854088 }, { "epoch": 0.1730373732359407, "grad_norm": 113.78330993652344, "loss": 5.6683, "lr": 0.0009847552447552449, "step": 610, "tokens_trained": 0.299835024 }, { "epoch": 0.1736047088858946, "grad_norm": 125.20297241210938, "loss": 5.1812, "lr": 0.0009844755244755245, "step": 612, "tokens_trained": 0.30082032 }, { "epoch": 0.17417204453584853, "grad_norm": 67.46041870117188, "loss": 5.0417, "lr": 0.0009841958041958043, "step": 614, "tokens_trained": 0.301808456 }, { "epoch": 0.17473938018580243, "grad_norm": 117.30754852294922, "loss": 5.3064, "lr": 0.000983916083916084, "step": 616, "tokens_trained": 0.302794456 }, { "epoch": 0.17530671583575633, "grad_norm": 124.30754089355469, "loss": 5.1614, "lr": 0.0009836363636363636, "step": 618, "tokens_trained": 0.303777376 }, { "epoch": 0.17587405148571023, "grad_norm": 102.72042083740234, "loss": 5.1265, "lr": 0.0009833566433566435, "step": 620, "tokens_trained": 0.304758864 }, { "epoch": 0.17644138713566412, "grad_norm": 39.332252502441406, "loss": 5.1078, "lr": 0.000983076923076923, "step": 622, "tokens_trained": 0.30574392 }, { "epoch": 0.17700872278561805, "grad_norm": 153.84811401367188, "loss": 5.7696, "lr": 0.000982797202797203, "step": 624, "tokens_trained": 0.306727584 }, { "epoch": 0.17729239061059499, "eval_loss": 1.3463915586471558, "eval_runtime": 20.8357, "step": 625, "tokens_trained": 0.307220496 }, { "epoch": 0.17757605843557195, "grad_norm": 160.2552490234375, "loss": 5.2283, "lr": 0.0009825174825174826, "step": 626, "tokens_trained": 0.307713024 }, { "epoch": 0.17814339408552585, "grad_norm": 186.77407836914062, "loss": 5.2866, "lr": 0.0009822377622377622, "step": 628, "tokens_trained": 0.308700128 }, { "epoch": 0.17871072973547975, "grad_norm": 84.55519104003906, "loss": 5.1106, "lr": 0.0009819580419580419, "step": 630, "tokens_trained": 0.309681208 }, { "epoch": 0.17927806538543364, "grad_norm": 20.617040634155273, "loss": 4.8327, "lr": 0.0009816783216783217, "step": 632, "tokens_trained": 0.310662224 }, { "epoch": 0.17984540103538757, "grad_norm": 168.06039428710938, "loss": 6.0704, "lr": 0.0009813986013986014, "step": 634, "tokens_trained": 0.31164064 }, { "epoch": 0.18041273668534147, "grad_norm": 238.23736572265625, "loss": 5.6188, "lr": 0.0009811188811188812, "step": 636, "tokens_trained": 0.312622568 }, { "epoch": 0.18098007233529537, "grad_norm": 140.0707550048828, "loss": 6.4034, "lr": 0.0009808391608391608, "step": 638, "tokens_trained": 0.313604944 }, { "epoch": 0.18154740798524927, "grad_norm": 161.19302368164062, "loss": 5.4906, "lr": 0.0009805594405594405, "step": 640, "tokens_trained": 0.314592072 }, { "epoch": 0.18211474363520316, "grad_norm": 121.9577407836914, "loss": 5.2097, "lr": 0.0009802797202797203, "step": 642, "tokens_trained": 0.315574392 }, { "epoch": 0.1826820792851571, "grad_norm": 121.25574493408203, "loss": 5.0317, "lr": 0.00098, "step": 644, "tokens_trained": 0.316559008 }, { "epoch": 0.183249414935111, "grad_norm": 28.328269958496094, "loss": 4.932, "lr": 0.0009797202797202798, "step": 646, "tokens_trained": 0.317538776 }, { "epoch": 0.1838167505850649, "grad_norm": 127.77408599853516, "loss": 5.8335, "lr": 0.0009794405594405595, "step": 648, "tokens_trained": 0.31851792 }, { "epoch": 0.18438408623501878, "grad_norm": 94.9522933959961, "loss": 5.1948, "lr": 0.000979160839160839, "step": 650, "tokens_trained": 0.319501576 }, { "epoch": 0.18495142188497268, "grad_norm": 110.33658599853516, "loss": 5.098, "lr": 0.000978881118881119, "step": 652, "tokens_trained": 0.320482392 }, { "epoch": 0.1855187575349266, "grad_norm": 67.23124694824219, "loss": 4.7723, "lr": 0.0009786013986013986, "step": 654, "tokens_trained": 0.32146712 }, { "epoch": 0.1860860931848805, "grad_norm": 61.519866943359375, "loss": 4.7245, "lr": 0.0009783216783216782, "step": 656, "tokens_trained": 0.322449576 }, { "epoch": 0.1866534288348344, "grad_norm": 99.51078033447266, "loss": 4.783, "lr": 0.000978041958041958, "step": 658, "tokens_trained": 0.323432688 }, { "epoch": 0.1872207644847883, "grad_norm": 44.619197845458984, "loss": 4.7495, "lr": 0.000977762237762238, "step": 660, "tokens_trained": 0.324413952 }, { "epoch": 0.18778810013474223, "grad_norm": 114.5891342163086, "loss": 5.1261, "lr": 0.0009774825174825176, "step": 662, "tokens_trained": 0.325394536 }, { "epoch": 0.18835543578469613, "grad_norm": 100.3728256225586, "loss": 4.7883, "lr": 0.0009772027972027972, "step": 664, "tokens_trained": 0.326374672 }, { "epoch": 0.18892277143465003, "grad_norm": 51.883033752441406, "loss": 4.7249, "lr": 0.0009769230769230768, "step": 666, "tokens_trained": 0.327357152 }, { "epoch": 0.18949010708460393, "grad_norm": 82.27507019042969, "loss": 4.8277, "lr": 0.0009766433566433567, "step": 668, "tokens_trained": 0.328342088 }, { "epoch": 0.19005744273455782, "grad_norm": 83.53064727783203, "loss": 4.8338, "lr": 0.0009763636363636363, "step": 670, "tokens_trained": 0.329319248 }, { "epoch": 0.19062477838451175, "grad_norm": 76.18387603759766, "loss": 4.6958, "lr": 0.0009760839160839161, "step": 672, "tokens_trained": 0.330305968 }, { "epoch": 0.19119211403446565, "grad_norm": 27.401426315307617, "loss": 4.6929, "lr": 0.0009758041958041958, "step": 674, "tokens_trained": 0.3312912 }, { "epoch": 0.19175944968441955, "grad_norm": 186.770263671875, "loss": 5.5089, "lr": 0.0009755244755244756, "step": 676, "tokens_trained": 0.332275224 }, { "epoch": 0.19232678533437345, "grad_norm": 105.02385711669922, "loss": 4.8876, "lr": 0.0009752447552447553, "step": 678, "tokens_trained": 0.33325588 }, { "epoch": 0.19289412098432734, "grad_norm": 94.96269989013672, "loss": 5.1235, "lr": 0.0009749650349650349, "step": 680, "tokens_trained": 0.334238408 }, { "epoch": 0.19346145663428127, "grad_norm": 92.29356384277344, "loss": 4.8194, "lr": 0.0009746853146853148, "step": 682, "tokens_trained": 0.335219368 }, { "epoch": 0.19402879228423517, "grad_norm": 59.1584358215332, "loss": 4.7511, "lr": 0.0009744055944055944, "step": 684, "tokens_trained": 0.336207136 }, { "epoch": 0.19459612793418907, "grad_norm": 54.759002685546875, "loss": 4.777, "lr": 0.0009741258741258742, "step": 686, "tokens_trained": 0.337193536 }, { "epoch": 0.19516346358414297, "grad_norm": 92.20452880859375, "loss": 4.8225, "lr": 0.0009738461538461538, "step": 688, "tokens_trained": 0.338179224 }, { "epoch": 0.19573079923409686, "grad_norm": 75.97005462646484, "loss": 4.655, "lr": 0.0009735664335664336, "step": 690, "tokens_trained": 0.339162168 }, { "epoch": 0.1962981348840508, "grad_norm": 58.19076919555664, "loss": 4.6446, "lr": 0.0009732867132867133, "step": 692, "tokens_trained": 0.340138904 }, { "epoch": 0.1968654705340047, "grad_norm": 50.81512451171875, "loss": 4.5866, "lr": 0.000973006993006993, "step": 694, "tokens_trained": 0.34112288 }, { "epoch": 0.1974328061839586, "grad_norm": 61.683372497558594, "loss": 4.6018, "lr": 0.0009727272727272728, "step": 696, "tokens_trained": 0.342111992 }, { "epoch": 0.19800014183391249, "grad_norm": 61.01798629760742, "loss": 4.6007, "lr": 0.0009724475524475524, "step": 698, "tokens_trained": 0.343095912 }, { "epoch": 0.19856747748386638, "grad_norm": 96.49671936035156, "loss": 4.7035, "lr": 0.0009721678321678323, "step": 700, "tokens_trained": 0.344078632 }, { "epoch": 0.1991348131338203, "grad_norm": 64.7771224975586, "loss": 4.8341, "lr": 0.0009718881118881119, "step": 702, "tokens_trained": 0.345060576 }, { "epoch": 0.1997021487837742, "grad_norm": 90.1478042602539, "loss": 4.7739, "lr": 0.0009716083916083917, "step": 704, "tokens_trained": 0.34604112 }, { "epoch": 0.2002694844337281, "grad_norm": 67.6308822631836, "loss": 4.6218, "lr": 0.0009713286713286713, "step": 706, "tokens_trained": 0.347023496 }, { "epoch": 0.200836820083682, "grad_norm": 40.50175094604492, "loss": 4.6008, "lr": 0.000971048951048951, "step": 708, "tokens_trained": 0.348005416 }, { "epoch": 0.2014041557336359, "grad_norm": 33.6448860168457, "loss": 4.5307, "lr": 0.0009707692307692308, "step": 710, "tokens_trained": 0.3489886 }, { "epoch": 0.20197149138358983, "grad_norm": 15.484851837158203, "loss": 4.5065, "lr": 0.0009704895104895105, "step": 712, "tokens_trained": 0.34997024 }, { "epoch": 0.20253882703354373, "grad_norm": 109.26301574707031, "loss": 4.9613, "lr": 0.0009702097902097903, "step": 714, "tokens_trained": 0.350958496 }, { "epoch": 0.20310616268349763, "grad_norm": 150.07492065429688, "loss": 4.8507, "lr": 0.0009699300699300699, "step": 716, "tokens_trained": 0.35193892 }, { "epoch": 0.20367349833345152, "grad_norm": 113.43978881835938, "loss": 5.4494, "lr": 0.0009696503496503498, "step": 718, "tokens_trained": 0.35291908 }, { "epoch": 0.20424083398340542, "grad_norm": 123.0071792602539, "loss": 4.9475, "lr": 0.0009693706293706294, "step": 720, "tokens_trained": 0.353896072 }, { "epoch": 0.20480816963335935, "grad_norm": 65.55500793457031, "loss": 4.7585, "lr": 0.0009690909090909091, "step": 722, "tokens_trained": 0.354878992 }, { "epoch": 0.20537550528331325, "grad_norm": 36.11159896850586, "loss": 4.6323, "lr": 0.0009688111888111888, "step": 724, "tokens_trained": 0.355863728 }, { "epoch": 0.20594284093326715, "grad_norm": 30.566436767578125, "loss": 4.53, "lr": 0.0009685314685314685, "step": 726, "tokens_trained": 0.356845272 }, { "epoch": 0.20651017658322104, "grad_norm": 59.01853561401367, "loss": 4.5283, "lr": 0.0009682517482517483, "step": 728, "tokens_trained": 0.357826656 }, { "epoch": 0.20707751223317494, "grad_norm": 91.78115844726562, "loss": 4.6149, "lr": 0.000967972027972028, "step": 730, "tokens_trained": 0.358809896 }, { "epoch": 0.20764484788312887, "grad_norm": 67.97398376464844, "loss": 4.617, "lr": 0.0009676923076923078, "step": 732, "tokens_trained": 0.359788736 }, { "epoch": 0.20821218353308277, "grad_norm": 42.82001876831055, "loss": 4.6134, "lr": 0.0009674125874125874, "step": 734, "tokens_trained": 0.360771744 }, { "epoch": 0.20877951918303667, "grad_norm": 63.52122116088867, "loss": 4.6995, "lr": 0.0009671328671328672, "step": 736, "tokens_trained": 0.361757656 }, { "epoch": 0.20934685483299056, "grad_norm": 116.39544677734375, "loss": 4.7153, "lr": 0.0009668531468531469, "step": 738, "tokens_trained": 0.362744008 }, { "epoch": 0.20991419048294446, "grad_norm": 40.74269485473633, "loss": 4.7978, "lr": 0.0009665734265734266, "step": 740, "tokens_trained": 0.36372872 }, { "epoch": 0.2104815261328984, "grad_norm": 114.29917907714844, "loss": 5.1683, "lr": 0.0009662937062937063, "step": 742, "tokens_trained": 0.364710536 }, { "epoch": 0.2110488617828523, "grad_norm": 115.83326721191406, "loss": 4.7642, "lr": 0.000966013986013986, "step": 744, "tokens_trained": 0.3656912 }, { "epoch": 0.21161619743280619, "grad_norm": 21.708093643188477, "loss": 4.8244, "lr": 0.0009657342657342657, "step": 746, "tokens_trained": 0.36667388 }, { "epoch": 0.21218353308276008, "grad_norm": 182.01918029785156, "loss": 5.6045, "lr": 0.0009654545454545455, "step": 748, "tokens_trained": 0.3676634 }, { "epoch": 0.21275086873271398, "grad_norm": 47.119319915771484, "loss": 4.7929, "lr": 0.0009651748251748252, "step": 750, "tokens_trained": 0.368647288 }, { "epoch": 0.21275086873271398, "eval_loss": 1.2186306715011597, "eval_runtime": 20.9362, "step": 750, "tokens_trained": 0.368647288 } ], "logging_steps": 2, "max_steps": 7650, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 750, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }