diff --git "a/checkpoint-2250/trainer_state.json" "b/checkpoint-2250/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2250/trainer_state.json" @@ -0,0 +1,9160 @@ +{ + "best_global_step": 2250, + "best_metric": 1.0298579931259155, + "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-2250", + "epoch": 0.638252606198142, + "eval_steps": 125, + "global_step": 2250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005673356499539039, + "grad_norm": 8450.4345703125, + "loss": 876.9911, + "lr": 2e-06, + "step": 2, + "tokens_trained": 0.000985992 + }, + { + "epoch": 0.0011346712999078079, + "grad_norm": 8980.888671875, + "loss": 779.4711, + "lr": 6e-06, + "step": 4, + "tokens_trained": 0.001968088 + }, + { + "epoch": 0.001702006949861712, + "grad_norm": 7489.92529296875, + "loss": 488.6157, + "lr": 1e-05, + "step": 6, + "tokens_trained": 0.002953808 + }, + { + "epoch": 0.0022693425998156157, + "grad_norm": 1952.1917724609375, + "loss": 237.0602, + "lr": 1.4e-05, + "step": 8, + "tokens_trained": 0.003935728 + }, + { + "epoch": 0.0028366782497695198, + "grad_norm": 1418.443603515625, + "loss": 159.0854, + "lr": 1.8e-05, + "step": 10, + "tokens_trained": 0.004916488 + }, + { + "epoch": 0.003404013899723424, + "grad_norm": 874.7195434570312, + "loss": 91.9563, + "lr": 2.2e-05, + "step": 12, + "tokens_trained": 0.005902792 + }, + { + "epoch": 0.003971349549677328, + "grad_norm": 1339.8248291015625, + "loss": 40.3366, + "lr": 2.6e-05, + "step": 14, + "tokens_trained": 0.0068856 + }, + { + "epoch": 0.0045386851996312315, + "grad_norm": 2936.7607421875, + "loss": 22.7436, + "lr": 3e-05, + "step": 16, + "tokens_trained": 0.007868248 + }, + { + "epoch": 0.005106020849585136, + "grad_norm": 1531.3807373046875, + "loss": 23.4797, + "lr": 3.4000000000000007e-05, + "step": 18, + "tokens_trained": 0.008849296 + }, + { + "epoch": 0.0056733564995390395, + "grad_norm": 3027.4189453125, + "loss": 38.7379, + "lr": 3.8e-05, + "step": 20, + "tokens_trained": 0.009830984 + }, + { + "epoch": 0.006240692149492944, + "grad_norm": 2435.890625, + "loss": 26.2427, + "lr": 4.2000000000000004e-05, + "step": 22, + "tokens_trained": 0.01081364 + }, + { + "epoch": 0.006808027799446848, + "grad_norm": 3217.990478515625, + "loss": 31.0263, + "lr": 4.6e-05, + "step": 24, + "tokens_trained": 0.01179036 + }, + { + "epoch": 0.007375363449400752, + "grad_norm": 3854.00634765625, + "loss": 33.8781, + "lr": 5e-05, + "step": 26, + "tokens_trained": 0.012774504 + }, + { + "epoch": 0.007942699099354656, + "grad_norm": 3197.489990234375, + "loss": 27.7927, + "lr": 5.4e-05, + "step": 28, + "tokens_trained": 0.013759992 + }, + { + "epoch": 0.00851003474930856, + "grad_norm": 3034.156494140625, + "loss": 37.9083, + "lr": 5.800000000000001e-05, + "step": 30, + "tokens_trained": 0.014740536 + }, + { + "epoch": 0.009077370399262463, + "grad_norm": 3040.314453125, + "loss": 34.0659, + "lr": 6.2e-05, + "step": 32, + "tokens_trained": 0.015725984 + }, + { + "epoch": 0.009644706049216368, + "grad_norm": 3065.5791015625, + "loss": 27.7768, + "lr": 6.6e-05, + "step": 34, + "tokens_trained": 0.016706864 + }, + { + "epoch": 0.010212041699170272, + "grad_norm": 2454.293701171875, + "loss": 35.1143, + "lr": 7.000000000000001e-05, + "step": 36, + "tokens_trained": 0.017688816 + }, + { + "epoch": 0.010779377349124175, + "grad_norm": 3100.7802734375, + "loss": 42.2603, + "lr": 7.4e-05, + "step": 38, + "tokens_trained": 0.018669072 + }, + { + "epoch": 0.011346712999078079, + "grad_norm": 2749.84423828125, + "loss": 39.3879, + "lr": 7.8e-05, + "step": 40, + "tokens_trained": 0.019652072 + }, + { + "epoch": 0.011914048649031984, + "grad_norm": 1519.9908447265625, + "loss": 35.0735, + "lr": 8.2e-05, + "step": 42, + "tokens_trained": 0.020633112 + }, + { + "epoch": 0.012481384298985888, + "grad_norm": 1474.4244384765625, + "loss": 25.8965, + "lr": 8.599999999999999e-05, + "step": 44, + "tokens_trained": 0.021616192 + }, + { + "epoch": 0.013048719948939792, + "grad_norm": 2962.500244140625, + "loss": 51.0784, + "lr": 8.999999999999999e-05, + "step": 46, + "tokens_trained": 0.022597288 + }, + { + "epoch": 0.013616055598893695, + "grad_norm": 2419.41455078125, + "loss": 43.0334, + "lr": 9.400000000000001e-05, + "step": 48, + "tokens_trained": 0.02357572 + }, + { + "epoch": 0.014183391248847599, + "grad_norm": 1267.87451171875, + "loss": 21.8063, + "lr": 9.800000000000001e-05, + "step": 50, + "tokens_trained": 0.024553376 + }, + { + "epoch": 0.014750726898801504, + "grad_norm": 1573.944091796875, + "loss": 52.9693, + "lr": 0.000102, + "step": 52, + "tokens_trained": 0.025536728 + }, + { + "epoch": 0.015318062548755408, + "grad_norm": 1509.650146484375, + "loss": 50.0825, + "lr": 0.000106, + "step": 54, + "tokens_trained": 0.026517 + }, + { + "epoch": 0.01588539819870931, + "grad_norm": 2334.765380859375, + "loss": 42.1982, + "lr": 0.00011, + "step": 56, + "tokens_trained": 0.027504728 + }, + { + "epoch": 0.016452733848663217, + "grad_norm": 1594.16259765625, + "loss": 39.0562, + "lr": 0.000114, + "step": 58, + "tokens_trained": 0.028485416 + }, + { + "epoch": 0.01702006949861712, + "grad_norm": 1628.082275390625, + "loss": 35.0488, + "lr": 0.000118, + "step": 60, + "tokens_trained": 0.029468696 + }, + { + "epoch": 0.017587405148571024, + "grad_norm": 2496.6455078125, + "loss": 49.4241, + "lr": 0.000122, + "step": 62, + "tokens_trained": 0.030453584 + }, + { + "epoch": 0.018154740798524926, + "grad_norm": 2521.721435546875, + "loss": 69.0275, + "lr": 0.000126, + "step": 64, + "tokens_trained": 0.031432864 + }, + { + "epoch": 0.01872207644847883, + "grad_norm": 2179.571533203125, + "loss": 63.1409, + "lr": 0.00013000000000000002, + "step": 66, + "tokens_trained": 0.032418416 + }, + { + "epoch": 0.019289412098432736, + "grad_norm": 899.7137451171875, + "loss": 38.4131, + "lr": 0.000134, + "step": 68, + "tokens_trained": 0.033402136 + }, + { + "epoch": 0.01985674774838664, + "grad_norm": 2109.377685546875, + "loss": 51.0044, + "lr": 0.00013800000000000002, + "step": 70, + "tokens_trained": 0.03438832 + }, + { + "epoch": 0.020424083398340544, + "grad_norm": 1649.1873779296875, + "loss": 32.1408, + "lr": 0.00014199999999999998, + "step": 72, + "tokens_trained": 0.035374464 + }, + { + "epoch": 0.020991419048294446, + "grad_norm": 1807.994140625, + "loss": 28.8357, + "lr": 0.000146, + "step": 74, + "tokens_trained": 0.03635784 + }, + { + "epoch": 0.02155875469824835, + "grad_norm": 998.9485473632812, + "loss": 23.0343, + "lr": 0.00015, + "step": 76, + "tokens_trained": 0.037340248 + }, + { + "epoch": 0.022126090348202256, + "grad_norm": 2240.17578125, + "loss": 32.0397, + "lr": 0.000154, + "step": 78, + "tokens_trained": 0.038321968 + }, + { + "epoch": 0.022693425998156158, + "grad_norm": 1606.0067138671875, + "loss": 32.1776, + "lr": 0.000158, + "step": 80, + "tokens_trained": 0.039304992 + }, + { + "epoch": 0.023260761648110063, + "grad_norm": 1685.1015625, + "loss": 24.3428, + "lr": 0.000162, + "step": 82, + "tokens_trained": 0.040286808 + }, + { + "epoch": 0.02382809729806397, + "grad_norm": 1761.7890625, + "loss": 23.9261, + "lr": 0.00016600000000000002, + "step": 84, + "tokens_trained": 0.041271776 + }, + { + "epoch": 0.02439543294801787, + "grad_norm": 2036.0982666015625, + "loss": 27.7196, + "lr": 0.00017, + "step": 86, + "tokens_trained": 0.042252784 + }, + { + "epoch": 0.024962768597971776, + "grad_norm": 1564.3870849609375, + "loss": 25.3722, + "lr": 0.000174, + "step": 88, + "tokens_trained": 0.04323596 + }, + { + "epoch": 0.025530104247925678, + "grad_norm": 1508.349853515625, + "loss": 18.4107, + "lr": 0.000178, + "step": 90, + "tokens_trained": 0.044218984 + }, + { + "epoch": 0.026097439897879583, + "grad_norm": 1955.011474609375, + "loss": 28.8456, + "lr": 0.000182, + "step": 92, + "tokens_trained": 0.045202144 + }, + { + "epoch": 0.02666477554783349, + "grad_norm": 1679.9423828125, + "loss": 23.6139, + "lr": 0.000186, + "step": 94, + "tokens_trained": 0.046192336 + }, + { + "epoch": 0.02723211119778739, + "grad_norm": 1517.5731201171875, + "loss": 42.145, + "lr": 0.00019, + "step": 96, + "tokens_trained": 0.047174312 + }, + { + "epoch": 0.027799446847741296, + "grad_norm": 1535.3076171875, + "loss": 31.9711, + "lr": 0.000194, + "step": 98, + "tokens_trained": 0.048158944 + }, + { + "epoch": 0.028366782497695198, + "grad_norm": 1475.2569580078125, + "loss": 37.645, + "lr": 0.00019800000000000002, + "step": 100, + "tokens_trained": 0.04914364 + }, + { + "epoch": 0.028934118147649103, + "grad_norm": 1918.4088134765625, + "loss": 69.4053, + "lr": 0.000202, + "step": 102, + "tokens_trained": 0.050123488 + }, + { + "epoch": 0.02950145379760301, + "grad_norm": 1631.6231689453125, + "loss": 50.9725, + "lr": 0.000206, + "step": 104, + "tokens_trained": 0.051105512 + }, + { + "epoch": 0.03006878944755691, + "grad_norm": 1291.6376953125, + "loss": 22.6527, + "lr": 0.00021, + "step": 106, + "tokens_trained": 0.052091704 + }, + { + "epoch": 0.030636125097510816, + "grad_norm": 1224.9625244140625, + "loss": 60.2725, + "lr": 0.000214, + "step": 108, + "tokens_trained": 0.053074824 + }, + { + "epoch": 0.031203460747464717, + "grad_norm": 1218.2022705078125, + "loss": 75.8728, + "lr": 0.000218, + "step": 110, + "tokens_trained": 0.054057104 + }, + { + "epoch": 0.03177079639741862, + "grad_norm": 1761.8861083984375, + "loss": 61.6427, + "lr": 0.000222, + "step": 112, + "tokens_trained": 0.055039128 + }, + { + "epoch": 0.03233813204737253, + "grad_norm": 1482.4256591796875, + "loss": 35.3351, + "lr": 0.00022600000000000002, + "step": 114, + "tokens_trained": 0.05602388 + }, + { + "epoch": 0.03290546769732643, + "grad_norm": 563.6399536132812, + "loss": 40.1461, + "lr": 0.00023, + "step": 116, + "tokens_trained": 0.057005376 + }, + { + "epoch": 0.03347280334728033, + "grad_norm": 1266.058837890625, + "loss": 24.0657, + "lr": 0.00023400000000000002, + "step": 118, + "tokens_trained": 0.057985136 + }, + { + "epoch": 0.03404013899723424, + "grad_norm": 918.206298828125, + "loss": 23.9626, + "lr": 0.00023799999999999998, + "step": 120, + "tokens_trained": 0.058968288 + }, + { + "epoch": 0.03460747464718814, + "grad_norm": 1495.7191162109375, + "loss": 19.798, + "lr": 0.000242, + "step": 122, + "tokens_trained": 0.05995348 + }, + { + "epoch": 0.03517481029714205, + "grad_norm": 1264.302734375, + "loss": 31.5342, + "lr": 0.000246, + "step": 124, + "tokens_trained": 0.060935832 + }, + { + "epoch": 0.035458478122119, + "eval_loss": 5.312118053436279, + "eval_runtime": 21.3065, + "step": 125, + "tokens_trained": 0.061426608 + }, + { + "epoch": 0.03574214594709595, + "grad_norm": 907.4861450195312, + "loss": 25.1262, + "lr": 0.00025, + "step": 126, + "tokens_trained": 0.061918184 + }, + { + "epoch": 0.03630948159704985, + "grad_norm": 1287.6158447265625, + "loss": 26.963, + "lr": 0.000254, + "step": 128, + "tokens_trained": 0.062902328 + }, + { + "epoch": 0.03687681724700376, + "grad_norm": 1260.570556640625, + "loss": 24.9633, + "lr": 0.00025800000000000004, + "step": 130, + "tokens_trained": 0.063883456 + }, + { + "epoch": 0.03744415289695766, + "grad_norm": 1436.82373046875, + "loss": 23.1028, + "lr": 0.000262, + "step": 132, + "tokens_trained": 0.06486748 + }, + { + "epoch": 0.03801148854691157, + "grad_norm": 812.9523315429688, + "loss": 20.5496, + "lr": 0.000266, + "step": 134, + "tokens_trained": 0.065847104 + }, + { + "epoch": 0.03857882419686547, + "grad_norm": 1336.5322265625, + "loss": 23.673, + "lr": 0.00027, + "step": 136, + "tokens_trained": 0.066829928 + }, + { + "epoch": 0.03914615984681937, + "grad_norm": 1381.282470703125, + "loss": 32.0373, + "lr": 0.00027400000000000005, + "step": 138, + "tokens_trained": 0.067814024 + }, + { + "epoch": 0.03971349549677328, + "grad_norm": 972.7861938476562, + "loss": 26.9454, + "lr": 0.00027800000000000004, + "step": 140, + "tokens_trained": 0.068797744 + }, + { + "epoch": 0.04028083114672718, + "grad_norm": 1347.2249755859375, + "loss": 22.3578, + "lr": 0.00028199999999999997, + "step": 142, + "tokens_trained": 0.069780072 + }, + { + "epoch": 0.04084816679668109, + "grad_norm": 829.525390625, + "loss": 37.9879, + "lr": 0.00028599999999999996, + "step": 144, + "tokens_trained": 0.070759896 + }, + { + "epoch": 0.04141550244663499, + "grad_norm": 1094.1033935546875, + "loss": 21.1972, + "lr": 0.00029, + "step": 146, + "tokens_trained": 0.0717452 + }, + { + "epoch": 0.04198283809658889, + "grad_norm": 717.107421875, + "loss": 21.7774, + "lr": 0.000294, + "step": 148, + "tokens_trained": 0.072727432 + }, + { + "epoch": 0.042550173746542796, + "grad_norm": 744.4456787109375, + "loss": 20.3235, + "lr": 0.000298, + "step": 150, + "tokens_trained": 0.073712128 + }, + { + "epoch": 0.0431175093964967, + "grad_norm": 904.1460571289062, + "loss": 22.7878, + "lr": 0.000302, + "step": 152, + "tokens_trained": 0.074695296 + }, + { + "epoch": 0.04368484504645061, + "grad_norm": 1352.303955078125, + "loss": 20.9757, + "lr": 0.000306, + "step": 154, + "tokens_trained": 0.0756798 + }, + { + "epoch": 0.04425218069640451, + "grad_norm": 997.0473022460938, + "loss": 17.4647, + "lr": 0.00031, + "step": 156, + "tokens_trained": 0.076666504 + }, + { + "epoch": 0.04481951634635841, + "grad_norm": 1206.387939453125, + "loss": 21.1846, + "lr": 0.000314, + "step": 158, + "tokens_trained": 0.07764868 + }, + { + "epoch": 0.045386851996312316, + "grad_norm": 1029.6807861328125, + "loss": 17.8853, + "lr": 0.00031800000000000003, + "step": 160, + "tokens_trained": 0.07863548 + }, + { + "epoch": 0.04595418764626622, + "grad_norm": 1136.4635009765625, + "loss": 30.057, + "lr": 0.000322, + "step": 162, + "tokens_trained": 0.079618928 + }, + { + "epoch": 0.04652152329622013, + "grad_norm": 834.3464965820312, + "loss": 28.1782, + "lr": 0.000326, + "step": 164, + "tokens_trained": 0.0806032 + }, + { + "epoch": 0.04708885894617403, + "grad_norm": 1177.8365478515625, + "loss": 16.4267, + "lr": 0.00033, + "step": 166, + "tokens_trained": 0.081583752 + }, + { + "epoch": 0.04765619459612794, + "grad_norm": 572.501708984375, + "loss": 16.5752, + "lr": 0.00033400000000000004, + "step": 168, + "tokens_trained": 0.082568184 + }, + { + "epoch": 0.048223530246081836, + "grad_norm": 437.6822814941406, + "loss": 11.5509, + "lr": 0.00033800000000000003, + "step": 170, + "tokens_trained": 0.083553352 + }, + { + "epoch": 0.04879086589603574, + "grad_norm": 1119.0416259765625, + "loss": 16.2689, + "lr": 0.000342, + "step": 172, + "tokens_trained": 0.084536352 + }, + { + "epoch": 0.04935820154598965, + "grad_norm": 895.4021606445312, + "loss": 12.6663, + "lr": 0.000346, + "step": 174, + "tokens_trained": 0.085517312 + }, + { + "epoch": 0.04992553719594355, + "grad_norm": 995.6289672851562, + "loss": 26.0663, + "lr": 0.00035, + "step": 176, + "tokens_trained": 0.086496088 + }, + { + "epoch": 0.05049287284589746, + "grad_norm": 839.6610717773438, + "loss": 21.5115, + "lr": 0.000354, + "step": 178, + "tokens_trained": 0.087480632 + }, + { + "epoch": 0.051060208495851356, + "grad_norm": 734.1155395507812, + "loss": 29.3287, + "lr": 0.000358, + "step": 180, + "tokens_trained": 0.088460408 + }, + { + "epoch": 0.05162754414580526, + "grad_norm": 721.4505615234375, + "loss": 26.0801, + "lr": 0.000362, + "step": 182, + "tokens_trained": 0.08944248 + }, + { + "epoch": 0.052194879795759166, + "grad_norm": 845.9672241210938, + "loss": 19.0639, + "lr": 0.000366, + "step": 184, + "tokens_trained": 0.090427832 + }, + { + "epoch": 0.05276221544571307, + "grad_norm": 1210.9969482421875, + "loss": 23.9036, + "lr": 0.00037, + "step": 186, + "tokens_trained": 0.091411504 + }, + { + "epoch": 0.05332955109566698, + "grad_norm": 1079.1690673828125, + "loss": 23.5588, + "lr": 0.000374, + "step": 188, + "tokens_trained": 0.092392672 + }, + { + "epoch": 0.053896886745620876, + "grad_norm": 596.111328125, + "loss": 20.8275, + "lr": 0.000378, + "step": 190, + "tokens_trained": 0.093374696 + }, + { + "epoch": 0.05446422239557478, + "grad_norm": 761.8096923828125, + "loss": 22.512, + "lr": 0.000382, + "step": 192, + "tokens_trained": 0.094361912 + }, + { + "epoch": 0.055031558045528686, + "grad_norm": 1081.9832763671875, + "loss": 32.335, + "lr": 0.000386, + "step": 194, + "tokens_trained": 0.095342992 + }, + { + "epoch": 0.05559889369548259, + "grad_norm": 304.3534240722656, + "loss": 11.5275, + "lr": 0.00039000000000000005, + "step": 196, + "tokens_trained": 0.096323512 + }, + { + "epoch": 0.0561662293454365, + "grad_norm": 586.6314086914062, + "loss": 16.2663, + "lr": 0.00039400000000000004, + "step": 198, + "tokens_trained": 0.097308864 + }, + { + "epoch": 0.056733564995390395, + "grad_norm": 624.9953002929688, + "loss": 16.627, + "lr": 0.000398, + "step": 200, + "tokens_trained": 0.098289064 + }, + { + "epoch": 0.0573009006453443, + "grad_norm": 585.9645385742188, + "loss": 15.8359, + "lr": 0.000402, + "step": 202, + "tokens_trained": 0.099269696 + }, + { + "epoch": 0.057868236295298206, + "grad_norm": 537.9913330078125, + "loss": 20.0779, + "lr": 0.00040600000000000006, + "step": 204, + "tokens_trained": 0.100248448 + }, + { + "epoch": 0.05843557194525211, + "grad_norm": 805.04931640625, + "loss": 21.4524, + "lr": 0.00041, + "step": 206, + "tokens_trained": 0.101231248 + }, + { + "epoch": 0.05900290759520602, + "grad_norm": 439.1418151855469, + "loss": 23.9852, + "lr": 0.000414, + "step": 208, + "tokens_trained": 0.102210688 + }, + { + "epoch": 0.059570243245159915, + "grad_norm": 502.684814453125, + "loss": 17.6273, + "lr": 0.00041799999999999997, + "step": 210, + "tokens_trained": 0.103192176 + }, + { + "epoch": 0.06013757889511382, + "grad_norm": 849.9979858398438, + "loss": 33.7517, + "lr": 0.000422, + "step": 212, + "tokens_trained": 0.104172824 + }, + { + "epoch": 0.060704914545067726, + "grad_norm": 939.583740234375, + "loss": 26.2559, + "lr": 0.000426, + "step": 214, + "tokens_trained": 0.105156672 + }, + { + "epoch": 0.06127225019502163, + "grad_norm": 525.0505981445312, + "loss": 20.0923, + "lr": 0.00043, + "step": 216, + "tokens_trained": 0.106141368 + }, + { + "epoch": 0.061839585844975536, + "grad_norm": 420.296630859375, + "loss": 17.9608, + "lr": 0.00043400000000000003, + "step": 218, + "tokens_trained": 0.107124088 + }, + { + "epoch": 0.062406921494929435, + "grad_norm": 711.3380737304688, + "loss": 19.387, + "lr": 0.000438, + "step": 220, + "tokens_trained": 0.108112632 + }, + { + "epoch": 0.06297425714488335, + "grad_norm": 759.183349609375, + "loss": 17.8061, + "lr": 0.000442, + "step": 222, + "tokens_trained": 0.1090934 + }, + { + "epoch": 0.06354159279483725, + "grad_norm": 790.025146484375, + "loss": 13.8539, + "lr": 0.000446, + "step": 224, + "tokens_trained": 0.110079512 + }, + { + "epoch": 0.06410892844479114, + "grad_norm": 769.8306274414062, + "loss": 22.1258, + "lr": 0.00045000000000000004, + "step": 226, + "tokens_trained": 0.111060152 + }, + { + "epoch": 0.06467626409474506, + "grad_norm": 656.8352661132812, + "loss": 14.8646, + "lr": 0.00045400000000000003, + "step": 228, + "tokens_trained": 0.112044144 + }, + { + "epoch": 0.06524359974469895, + "grad_norm": 498.92010498046875, + "loss": 23.1558, + "lr": 0.000458, + "step": 230, + "tokens_trained": 0.113022928 + }, + { + "epoch": 0.06581093539465287, + "grad_norm": 764.0186157226562, + "loss": 16.7089, + "lr": 0.000462, + "step": 232, + "tokens_trained": 0.114003832 + }, + { + "epoch": 0.06637827104460677, + "grad_norm": 491.5793762207031, + "loss": 12.3979, + "lr": 0.00046600000000000005, + "step": 234, + "tokens_trained": 0.114991008 + }, + { + "epoch": 0.06694560669456066, + "grad_norm": 679.9217529296875, + "loss": 14.9037, + "lr": 0.00047, + "step": 236, + "tokens_trained": 0.115971888 + }, + { + "epoch": 0.06751294234451458, + "grad_norm": 491.0369567871094, + "loss": 7.7603, + "lr": 0.000474, + "step": 238, + "tokens_trained": 0.116952616 + }, + { + "epoch": 0.06808027799446847, + "grad_norm": 369.2186279296875, + "loss": 8.2256, + "lr": 0.00047799999999999996, + "step": 240, + "tokens_trained": 0.117935816 + }, + { + "epoch": 0.06864761364442239, + "grad_norm": 312.72137451171875, + "loss": 7.5486, + "lr": 0.000482, + "step": 242, + "tokens_trained": 0.118919392 + }, + { + "epoch": 0.06921494929437629, + "grad_norm": 596.1439208984375, + "loss": 11.7351, + "lr": 0.000486, + "step": 244, + "tokens_trained": 0.119901856 + }, + { + "epoch": 0.06978228494433018, + "grad_norm": 467.5667419433594, + "loss": 11.8403, + "lr": 0.00049, + "step": 246, + "tokens_trained": 0.120884624 + }, + { + "epoch": 0.0703496205942841, + "grad_norm": 430.50048828125, + "loss": 13.8081, + "lr": 0.000494, + "step": 248, + "tokens_trained": 0.121869224 + }, + { + "epoch": 0.070916956244238, + "grad_norm": 522.242919921875, + "loss": 14.1892, + "lr": 0.000498, + "step": 250, + "tokens_trained": 0.122853584 + }, + { + "epoch": 0.070916956244238, + "eval_loss": 1.9294606447219849, + "eval_runtime": 20.4162, + "step": 250, + "tokens_trained": 0.122853584 + }, + { + "epoch": 0.0714842918941919, + "grad_norm": 835.2765502929688, + "loss": 13.2462, + "lr": 0.0005020000000000001, + "step": 252, + "tokens_trained": 0.123835544 + }, + { + "epoch": 0.0720516275441458, + "grad_norm": 714.8098754882812, + "loss": 20.0498, + "lr": 0.000506, + "step": 254, + "tokens_trained": 0.124821616 + }, + { + "epoch": 0.0726189631940997, + "grad_norm": 701.512939453125, + "loss": 18.3664, + "lr": 0.00051, + "step": 256, + "tokens_trained": 0.125807608 + }, + { + "epoch": 0.07318629884405362, + "grad_norm": 773.987060546875, + "loss": 21.3807, + "lr": 0.000514, + "step": 258, + "tokens_trained": 0.126791464 + }, + { + "epoch": 0.07375363449400751, + "grad_norm": 826.422119140625, + "loss": 22.6403, + "lr": 0.000518, + "step": 260, + "tokens_trained": 0.127771752 + }, + { + "epoch": 0.07432097014396143, + "grad_norm": 742.8673095703125, + "loss": 20.1504, + "lr": 0.000522, + "step": 262, + "tokens_trained": 0.128755448 + }, + { + "epoch": 0.07488830579391532, + "grad_norm": 797.79296875, + "loss": 26.7343, + "lr": 0.000526, + "step": 264, + "tokens_trained": 0.129741088 + }, + { + "epoch": 0.07545564144386922, + "grad_norm": 673.9141235351562, + "loss": 12.505, + "lr": 0.0005300000000000001, + "step": 266, + "tokens_trained": 0.130727504 + }, + { + "epoch": 0.07602297709382314, + "grad_norm": 310.6510925292969, + "loss": 12.6344, + "lr": 0.0005340000000000001, + "step": 268, + "tokens_trained": 0.131710296 + }, + { + "epoch": 0.07659031274377703, + "grad_norm": 312.40966796875, + "loss": 14.254, + "lr": 0.0005380000000000001, + "step": 270, + "tokens_trained": 0.132695352 + }, + { + "epoch": 0.07715764839373095, + "grad_norm": 492.2834777832031, + "loss": 19.0979, + "lr": 0.0005420000000000001, + "step": 272, + "tokens_trained": 0.133677928 + }, + { + "epoch": 0.07772498404368484, + "grad_norm": 628.457763671875, + "loss": 21.7735, + "lr": 0.000546, + "step": 274, + "tokens_trained": 0.134655504 + }, + { + "epoch": 0.07829231969363874, + "grad_norm": 382.8389892578125, + "loss": 12.5128, + "lr": 0.00055, + "step": 276, + "tokens_trained": 0.135640208 + }, + { + "epoch": 0.07885965534359266, + "grad_norm": 483.12335205078125, + "loss": 15.2589, + "lr": 0.000554, + "step": 278, + "tokens_trained": 0.136624232 + }, + { + "epoch": 0.07942699099354655, + "grad_norm": 640.658447265625, + "loss": 12.1341, + "lr": 0.000558, + "step": 280, + "tokens_trained": 0.13760628 + }, + { + "epoch": 0.07999432664350047, + "grad_norm": 410.0824279785156, + "loss": 12.5723, + "lr": 0.0005620000000000001, + "step": 282, + "tokens_trained": 0.13858832 + }, + { + "epoch": 0.08056166229345436, + "grad_norm": 513.2861328125, + "loss": 14.8461, + "lr": 0.000566, + "step": 284, + "tokens_trained": 0.139568424 + }, + { + "epoch": 0.08112899794340826, + "grad_norm": 564.547607421875, + "loss": 12.5792, + "lr": 0.00057, + "step": 286, + "tokens_trained": 0.140557016 + }, + { + "epoch": 0.08169633359336217, + "grad_norm": 451.3592834472656, + "loss": 16.5433, + "lr": 0.000574, + "step": 288, + "tokens_trained": 0.141540248 + }, + { + "epoch": 0.08226366924331607, + "grad_norm": 404.2495422363281, + "loss": 16.4138, + "lr": 0.000578, + "step": 290, + "tokens_trained": 0.142528272 + }, + { + "epoch": 0.08283100489326999, + "grad_norm": 566.5219116210938, + "loss": 16.4743, + "lr": 0.0005819999999999999, + "step": 292, + "tokens_trained": 0.143513096 + }, + { + "epoch": 0.08339834054322388, + "grad_norm": 559.6517333984375, + "loss": 16.421, + "lr": 0.0005859999999999999, + "step": 294, + "tokens_trained": 0.144494472 + }, + { + "epoch": 0.08396567619317778, + "grad_norm": 260.874755859375, + "loss": 11.2214, + "lr": 0.00059, + "step": 296, + "tokens_trained": 0.14547876 + }, + { + "epoch": 0.0845330118431317, + "grad_norm": 272.02899169921875, + "loss": 10.3491, + "lr": 0.000594, + "step": 298, + "tokens_trained": 0.146465864 + }, + { + "epoch": 0.08510034749308559, + "grad_norm": 556.9845581054688, + "loss": 10.4348, + "lr": 0.000598, + "step": 300, + "tokens_trained": 0.147446344 + }, + { + "epoch": 0.0856676831430395, + "grad_norm": 273.35772705078125, + "loss": 8.3292, + "lr": 0.000602, + "step": 302, + "tokens_trained": 0.14843244 + }, + { + "epoch": 0.0862350187929934, + "grad_norm": 246.6316680908203, + "loss": 9.9362, + "lr": 0.000606, + "step": 304, + "tokens_trained": 0.149415976 + }, + { + "epoch": 0.0868023544429473, + "grad_norm": 564.4365844726562, + "loss": 9.2621, + "lr": 0.00061, + "step": 306, + "tokens_trained": 0.150398728 + }, + { + "epoch": 0.08736969009290121, + "grad_norm": 396.0948791503906, + "loss": 11.8526, + "lr": 0.000614, + "step": 308, + "tokens_trained": 0.151385104 + }, + { + "epoch": 0.08793702574285511, + "grad_norm": 488.6072692871094, + "loss": 11.8473, + "lr": 0.0006180000000000001, + "step": 310, + "tokens_trained": 0.152373672 + }, + { + "epoch": 0.08850436139280903, + "grad_norm": 346.70660400390625, + "loss": 12.0897, + "lr": 0.000622, + "step": 312, + "tokens_trained": 0.153356256 + }, + { + "epoch": 0.08907169704276292, + "grad_norm": 382.40679931640625, + "loss": 9.271, + "lr": 0.000626, + "step": 314, + "tokens_trained": 0.154342632 + }, + { + "epoch": 0.08963903269271682, + "grad_norm": 288.7908935546875, + "loss": 9.185, + "lr": 0.00063, + "step": 316, + "tokens_trained": 0.1553238 + }, + { + "epoch": 0.09020636834267073, + "grad_norm": 337.5335388183594, + "loss": 12.0555, + "lr": 0.000634, + "step": 318, + "tokens_trained": 0.156313168 + }, + { + "epoch": 0.09077370399262463, + "grad_norm": 349.25531005859375, + "loss": 8.51, + "lr": 0.000638, + "step": 320, + "tokens_trained": 0.157299448 + }, + { + "epoch": 0.09134103964257854, + "grad_norm": 471.7824401855469, + "loss": 14.1888, + "lr": 0.000642, + "step": 322, + "tokens_trained": 0.158285264 + }, + { + "epoch": 0.09190837529253244, + "grad_norm": 284.94036865234375, + "loss": 10.1593, + "lr": 0.000646, + "step": 324, + "tokens_trained": 0.159267512 + }, + { + "epoch": 0.09247571094248634, + "grad_norm": 510.90478515625, + "loss": 13.5744, + "lr": 0.0006500000000000001, + "step": 326, + "tokens_trained": 0.160250856 + }, + { + "epoch": 0.09304304659244025, + "grad_norm": 373.82965087890625, + "loss": 8.4999, + "lr": 0.0006540000000000001, + "step": 328, + "tokens_trained": 0.161231832 + }, + { + "epoch": 0.09361038224239415, + "grad_norm": 219.3827362060547, + "loss": 8.4436, + "lr": 0.0006580000000000001, + "step": 330, + "tokens_trained": 0.162217656 + }, + { + "epoch": 0.09417771789234806, + "grad_norm": 433.0914001464844, + "loss": 11.2019, + "lr": 0.000662, + "step": 332, + "tokens_trained": 0.163199096 + }, + { + "epoch": 0.09474505354230196, + "grad_norm": 242.65907287597656, + "loss": 9.0666, + "lr": 0.000666, + "step": 334, + "tokens_trained": 0.164178512 + }, + { + "epoch": 0.09531238919225588, + "grad_norm": 446.07916259765625, + "loss": 8.6546, + "lr": 0.00067, + "step": 336, + "tokens_trained": 0.165162464 + }, + { + "epoch": 0.09587972484220977, + "grad_norm": 231.8892364501953, + "loss": 7.5819, + "lr": 0.000674, + "step": 338, + "tokens_trained": 0.166141536 + }, + { + "epoch": 0.09644706049216367, + "grad_norm": 100.7306137084961, + "loss": 6.7047, + "lr": 0.0006780000000000001, + "step": 340, + "tokens_trained": 0.167123944 + }, + { + "epoch": 0.09701439614211758, + "grad_norm": 78.11279296875, + "loss": 5.9308, + "lr": 0.0006820000000000001, + "step": 342, + "tokens_trained": 0.168105264 + }, + { + "epoch": 0.09758173179207148, + "grad_norm": 271.466064453125, + "loss": 6.9141, + "lr": 0.0006860000000000001, + "step": 344, + "tokens_trained": 0.169088912 + }, + { + "epoch": 0.0981490674420254, + "grad_norm": 252.54478454589844, + "loss": 6.3281, + "lr": 0.00069, + "step": 346, + "tokens_trained": 0.170077368 + }, + { + "epoch": 0.0987164030919793, + "grad_norm": 305.8559875488281, + "loss": 6.443, + "lr": 0.000694, + "step": 348, + "tokens_trained": 0.171057232 + }, + { + "epoch": 0.09928373874193319, + "grad_norm": 227.74374389648438, + "loss": 6.552, + "lr": 0.0006979999999999999, + "step": 350, + "tokens_trained": 0.172041376 + }, + { + "epoch": 0.0998510743918871, + "grad_norm": 446.7601623535156, + "loss": 10.8184, + "lr": 0.0007019999999999999, + "step": 352, + "tokens_trained": 0.173023624 + }, + { + "epoch": 0.100418410041841, + "grad_norm": 353.0849609375, + "loss": 8.6327, + "lr": 0.0007059999999999999, + "step": 354, + "tokens_trained": 0.174005992 + }, + { + "epoch": 0.10098574569179491, + "grad_norm": 367.9427185058594, + "loss": 9.3898, + "lr": 0.00071, + "step": 356, + "tokens_trained": 0.174988304 + }, + { + "epoch": 0.10155308134174881, + "grad_norm": 224.4961700439453, + "loss": 8.284, + "lr": 0.000714, + "step": 358, + "tokens_trained": 0.175969816 + }, + { + "epoch": 0.10212041699170271, + "grad_norm": 221.86537170410156, + "loss": 7.0578, + "lr": 0.000718, + "step": 360, + "tokens_trained": 0.176952688 + }, + { + "epoch": 0.10268775264165662, + "grad_norm": 331.0989685058594, + "loss": 6.9561, + "lr": 0.000722, + "step": 362, + "tokens_trained": 0.177935144 + }, + { + "epoch": 0.10325508829161052, + "grad_norm": 171.6498260498047, + "loss": 7.203, + "lr": 0.000726, + "step": 364, + "tokens_trained": 0.178916776 + }, + { + "epoch": 0.10382242394156443, + "grad_norm": 284.2208557128906, + "loss": 10.3517, + "lr": 0.00073, + "step": 366, + "tokens_trained": 0.179903432 + }, + { + "epoch": 0.10438975959151833, + "grad_norm": 354.8574523925781, + "loss": 9.3888, + "lr": 0.000734, + "step": 368, + "tokens_trained": 0.180883224 + }, + { + "epoch": 0.10495709524147223, + "grad_norm": 344.82574462890625, + "loss": 10.5933, + "lr": 0.000738, + "step": 370, + "tokens_trained": 0.181863808 + }, + { + "epoch": 0.10552443089142614, + "grad_norm": 302.6838073730469, + "loss": 10.2832, + "lr": 0.000742, + "step": 372, + "tokens_trained": 0.182843712 + }, + { + "epoch": 0.10609176654138004, + "grad_norm": 323.0387878417969, + "loss": 6.4864, + "lr": 0.000746, + "step": 374, + "tokens_trained": 0.183825832 + }, + { + "epoch": 0.10637543436635699, + "eval_loss": 1.4430732727050781, + "eval_runtime": 20.5468, + "step": 375, + "tokens_trained": 0.184317744 + }, + { + "epoch": 0.10665910219133395, + "grad_norm": 133.74822998046875, + "loss": 5.4176, + "lr": 0.00075, + "step": 376, + "tokens_trained": 0.184811352 + }, + { + "epoch": 0.10722643784128785, + "grad_norm": 180.3372344970703, + "loss": 5.5641, + "lr": 0.000754, + "step": 378, + "tokens_trained": 0.185792528 + }, + { + "epoch": 0.10779377349124175, + "grad_norm": 250.83999633789062, + "loss": 5.8612, + "lr": 0.000758, + "step": 380, + "tokens_trained": 0.186777112 + }, + { + "epoch": 0.10836110914119566, + "grad_norm": 293.51959228515625, + "loss": 6.0418, + "lr": 0.000762, + "step": 382, + "tokens_trained": 0.18775724 + }, + { + "epoch": 0.10892844479114956, + "grad_norm": 292.56207275390625, + "loss": 6.1812, + "lr": 0.0007660000000000001, + "step": 384, + "tokens_trained": 0.188733568 + }, + { + "epoch": 0.10949578044110347, + "grad_norm": 121.82467651367188, + "loss": 6.0855, + "lr": 0.0007700000000000001, + "step": 386, + "tokens_trained": 0.189718512 + }, + { + "epoch": 0.11006311609105737, + "grad_norm": 124.30497741699219, + "loss": 5.7734, + "lr": 0.0007740000000000001, + "step": 388, + "tokens_trained": 0.190703776 + }, + { + "epoch": 0.11063045174101127, + "grad_norm": 143.64004516601562, + "loss": 5.7641, + "lr": 0.000778, + "step": 390, + "tokens_trained": 0.191689888 + }, + { + "epoch": 0.11119778739096518, + "grad_norm": 160.06784057617188, + "loss": 5.6025, + "lr": 0.000782, + "step": 392, + "tokens_trained": 0.192673992 + }, + { + "epoch": 0.11176512304091908, + "grad_norm": 226.97988891601562, + "loss": 6.0049, + "lr": 0.000786, + "step": 394, + "tokens_trained": 0.193656272 + }, + { + "epoch": 0.112332458690873, + "grad_norm": 223.26898193359375, + "loss": 5.6972, + "lr": 0.00079, + "step": 396, + "tokens_trained": 0.194639144 + }, + { + "epoch": 0.11289979434082689, + "grad_norm": 249.34912109375, + "loss": 5.7348, + "lr": 0.0007940000000000001, + "step": 398, + "tokens_trained": 0.195621256 + }, + { + "epoch": 0.11346712999078079, + "grad_norm": 161.34271240234375, + "loss": 5.6689, + "lr": 0.0007980000000000001, + "step": 400, + "tokens_trained": 0.196604136 + }, + { + "epoch": 0.1140344656407347, + "grad_norm": 148.53176879882812, + "loss": 5.702, + "lr": 0.0008020000000000001, + "step": 402, + "tokens_trained": 0.197586784 + }, + { + "epoch": 0.1146018012906886, + "grad_norm": 144.40835571289062, + "loss": 6.2402, + "lr": 0.0008060000000000001, + "step": 404, + "tokens_trained": 0.198570824 + }, + { + "epoch": 0.11516913694064251, + "grad_norm": 306.57562255859375, + "loss": 7.1739, + "lr": 0.0008100000000000001, + "step": 406, + "tokens_trained": 0.199548328 + }, + { + "epoch": 0.11573647259059641, + "grad_norm": 308.79180908203125, + "loss": 6.0972, + "lr": 0.0008139999999999999, + "step": 408, + "tokens_trained": 0.200532496 + }, + { + "epoch": 0.11630380824055031, + "grad_norm": 197.76791381835938, + "loss": 6.3533, + "lr": 0.0008179999999999999, + "step": 410, + "tokens_trained": 0.201514648 + }, + { + "epoch": 0.11687114389050422, + "grad_norm": 129.5694580078125, + "loss": 6.9628, + "lr": 0.0008219999999999999, + "step": 412, + "tokens_trained": 0.2024994 + }, + { + "epoch": 0.11743847954045812, + "grad_norm": 446.0195617675781, + "loss": 11.7562, + "lr": 0.000826, + "step": 414, + "tokens_trained": 0.20348012 + }, + { + "epoch": 0.11800581519041203, + "grad_norm": 355.5342712402344, + "loss": 8.8055, + "lr": 0.00083, + "step": 416, + "tokens_trained": 0.20446356 + }, + { + "epoch": 0.11857315084036593, + "grad_norm": 456.2491149902344, + "loss": 9.606, + "lr": 0.000834, + "step": 418, + "tokens_trained": 0.205445288 + }, + { + "epoch": 0.11914048649031983, + "grad_norm": 369.8676452636719, + "loss": 8.385, + "lr": 0.000838, + "step": 420, + "tokens_trained": 0.206427832 + }, + { + "epoch": 0.11970782214027374, + "grad_norm": 262.19073486328125, + "loss": 9.0956, + "lr": 0.000842, + "step": 422, + "tokens_trained": 0.207409848 + }, + { + "epoch": 0.12027515779022764, + "grad_norm": 120.3193130493164, + "loss": 5.4937, + "lr": 0.000846, + "step": 424, + "tokens_trained": 0.208391752 + }, + { + "epoch": 0.12084249344018155, + "grad_norm": 222.1111297607422, + "loss": 8.9367, + "lr": 0.00085, + "step": 426, + "tokens_trained": 0.20937384 + }, + { + "epoch": 0.12140982909013545, + "grad_norm": 137.16819763183594, + "loss": 7.5876, + "lr": 0.000854, + "step": 428, + "tokens_trained": 0.210358576 + }, + { + "epoch": 0.12197716474008935, + "grad_norm": 267.61846923828125, + "loss": 8.817, + "lr": 0.000858, + "step": 430, + "tokens_trained": 0.211340064 + }, + { + "epoch": 0.12254450039004326, + "grad_norm": 472.72906494140625, + "loss": 8.203, + "lr": 0.000862, + "step": 432, + "tokens_trained": 0.212321144 + }, + { + "epoch": 0.12311183603999716, + "grad_norm": 297.1420593261719, + "loss": 10.987, + "lr": 0.000866, + "step": 434, + "tokens_trained": 0.213300312 + }, + { + "epoch": 0.12367917168995107, + "grad_norm": 281.7297668457031, + "loss": 7.6117, + "lr": 0.00087, + "step": 436, + "tokens_trained": 0.214287624 + }, + { + "epoch": 0.12424650733990497, + "grad_norm": 203.09678649902344, + "loss": 6.5638, + "lr": 0.000874, + "step": 438, + "tokens_trained": 0.215272136 + }, + { + "epoch": 0.12481384298985887, + "grad_norm": 155.7823944091797, + "loss": 6.1131, + "lr": 0.000878, + "step": 440, + "tokens_trained": 0.216256392 + }, + { + "epoch": 0.12538117863981277, + "grad_norm": 189.86196899414062, + "loss": 8.2565, + "lr": 0.000882, + "step": 442, + "tokens_trained": 0.217242504 + }, + { + "epoch": 0.1259485142897667, + "grad_norm": 247.4568634033203, + "loss": 7.1005, + "lr": 0.0008860000000000001, + "step": 444, + "tokens_trained": 0.218226008 + }, + { + "epoch": 0.1265158499397206, + "grad_norm": 179.72825622558594, + "loss": 6.3379, + "lr": 0.0008900000000000001, + "step": 446, + "tokens_trained": 0.219210584 + }, + { + "epoch": 0.1270831855896745, + "grad_norm": 212.96356201171875, + "loss": 7.2514, + "lr": 0.000894, + "step": 448, + "tokens_trained": 0.220193952 + }, + { + "epoch": 0.1276505212396284, + "grad_norm": 105.67095947265625, + "loss": 5.456, + "lr": 0.000898, + "step": 450, + "tokens_trained": 0.221176936 + }, + { + "epoch": 0.1282178568895823, + "grad_norm": 302.9122619628906, + "loss": 6.4018, + "lr": 0.000902, + "step": 452, + "tokens_trained": 0.222161952 + }, + { + "epoch": 0.12878519253953621, + "grad_norm": 215.66561889648438, + "loss": 6.2853, + "lr": 0.000906, + "step": 454, + "tokens_trained": 0.223144912 + }, + { + "epoch": 0.1293525281894901, + "grad_norm": 272.9984130859375, + "loss": 7.3902, + "lr": 0.00091, + "step": 456, + "tokens_trained": 0.224127392 + }, + { + "epoch": 0.129919863839444, + "grad_norm": 200.7503662109375, + "loss": 6.1637, + "lr": 0.0009140000000000001, + "step": 458, + "tokens_trained": 0.22511648 + }, + { + "epoch": 0.1304871994893979, + "grad_norm": 93.23990631103516, + "loss": 6.4867, + "lr": 0.0009180000000000001, + "step": 460, + "tokens_trained": 0.226098144 + }, + { + "epoch": 0.1310545351393518, + "grad_norm": 274.37164306640625, + "loss": 8.99, + "lr": 0.0009220000000000001, + "step": 462, + "tokens_trained": 0.227081848 + }, + { + "epoch": 0.13162187078930573, + "grad_norm": 186.66322326660156, + "loss": 8.7122, + "lr": 0.0009260000000000001, + "step": 464, + "tokens_trained": 0.22806636 + }, + { + "epoch": 0.13218920643925963, + "grad_norm": 586.1035766601562, + "loss": 9.1045, + "lr": 0.00093, + "step": 466, + "tokens_trained": 0.229047872 + }, + { + "epoch": 0.13275654208921353, + "grad_norm": 227.55996704101562, + "loss": 9.7276, + "lr": 0.000934, + "step": 468, + "tokens_trained": 0.230031144 + }, + { + "epoch": 0.13332387773916743, + "grad_norm": 229.26609802246094, + "loss": 6.6244, + "lr": 0.0009379999999999999, + "step": 470, + "tokens_trained": 0.2310158 + }, + { + "epoch": 0.13389121338912133, + "grad_norm": 145.16331481933594, + "loss": 5.759, + "lr": 0.000942, + "step": 472, + "tokens_trained": 0.2319996 + }, + { + "epoch": 0.13445854903907525, + "grad_norm": 109.9937744140625, + "loss": 5.4838, + "lr": 0.000946, + "step": 474, + "tokens_trained": 0.232983808 + }, + { + "epoch": 0.13502588468902915, + "grad_norm": 135.74899291992188, + "loss": 6.2738, + "lr": 0.00095, + "step": 476, + "tokens_trained": 0.233963016 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 142.99449157714844, + "loss": 5.8459, + "lr": 0.000954, + "step": 478, + "tokens_trained": 0.234948864 + }, + { + "epoch": 0.13616055598893695, + "grad_norm": 198.66883850097656, + "loss": 6.6626, + "lr": 0.000958, + "step": 480, + "tokens_trained": 0.235932392 + }, + { + "epoch": 0.13672789163889085, + "grad_norm": 260.76507568359375, + "loss": 6.9299, + "lr": 0.000962, + "step": 482, + "tokens_trained": 0.236915664 + }, + { + "epoch": 0.13729522728884477, + "grad_norm": 267.97589111328125, + "loss": 6.4343, + "lr": 0.000966, + "step": 484, + "tokens_trained": 0.237896904 + }, + { + "epoch": 0.13786256293879867, + "grad_norm": 89.8781967163086, + "loss": 6.3203, + "lr": 0.0009699999999999999, + "step": 486, + "tokens_trained": 0.238874528 + }, + { + "epoch": 0.13842989858875257, + "grad_norm": 225.62985229492188, + "loss": 6.2778, + "lr": 0.000974, + "step": 488, + "tokens_trained": 0.2398588 + }, + { + "epoch": 0.13899723423870647, + "grad_norm": 85.84110260009766, + "loss": 5.2786, + "lr": 0.000978, + "step": 490, + "tokens_trained": 0.240839968 + }, + { + "epoch": 0.13956456988866037, + "grad_norm": 141.4368438720703, + "loss": 5.5525, + "lr": 0.000982, + "step": 492, + "tokens_trained": 0.241823544 + }, + { + "epoch": 0.1401319055386143, + "grad_norm": 94.9535140991211, + "loss": 5.4386, + "lr": 0.0009860000000000001, + "step": 494, + "tokens_trained": 0.242805456 + }, + { + "epoch": 0.1406992411885682, + "grad_norm": 157.4557647705078, + "loss": 5.9786, + "lr": 0.00099, + "step": 496, + "tokens_trained": 0.243792496 + }, + { + "epoch": 0.1412665768385221, + "grad_norm": 319.5025634765625, + "loss": 7.04, + "lr": 0.000994, + "step": 498, + "tokens_trained": 0.244772472 + }, + { + "epoch": 0.141833912488476, + "grad_norm": 282.26824951171875, + "loss": 9.4037, + "lr": 0.000998, + "step": 500, + "tokens_trained": 0.245758968 + }, + { + "epoch": 0.141833912488476, + "eval_loss": 2.152184247970581, + "eval_runtime": 21.2772, + "step": 500, + "tokens_trained": 0.245758968 + }, + { + "epoch": 0.1424012481384299, + "grad_norm": 306.0666809082031, + "loss": 7.8845, + "lr": 0.00099986013986014, + "step": 502, + "tokens_trained": 0.246739024 + }, + { + "epoch": 0.1429685837883838, + "grad_norm": 188.89024353027344, + "loss": 6.8118, + "lr": 0.0009995804195804196, + "step": 504, + "tokens_trained": 0.247726552 + }, + { + "epoch": 0.1435359194383377, + "grad_norm": 228.97474670410156, + "loss": 6.8475, + "lr": 0.0009993006993006994, + "step": 506, + "tokens_trained": 0.24870688 + }, + { + "epoch": 0.1441032550882916, + "grad_norm": 229.80029296875, + "loss": 6.2171, + "lr": 0.000999020979020979, + "step": 508, + "tokens_trained": 0.249689096 + }, + { + "epoch": 0.1446705907382455, + "grad_norm": 157.30340576171875, + "loss": 6.2281, + "lr": 0.0009987412587412587, + "step": 510, + "tokens_trained": 0.250671768 + }, + { + "epoch": 0.1452379263881994, + "grad_norm": 176.64683532714844, + "loss": 6.5993, + "lr": 0.0009984615384615386, + "step": 512, + "tokens_trained": 0.25165608 + }, + { + "epoch": 0.14580526203815333, + "grad_norm": 197.20526123046875, + "loss": 5.7267, + "lr": 0.0009981818181818182, + "step": 514, + "tokens_trained": 0.252639712 + }, + { + "epoch": 0.14637259768810723, + "grad_norm": 54.713260650634766, + "loss": 5.7911, + "lr": 0.000997902097902098, + "step": 516, + "tokens_trained": 0.253622816 + }, + { + "epoch": 0.14693993333806113, + "grad_norm": 185.74923706054688, + "loss": 7.0055, + "lr": 0.0009976223776223777, + "step": 518, + "tokens_trained": 0.254602792 + }, + { + "epoch": 0.14750726898801503, + "grad_norm": 240.31021118164062, + "loss": 6.452, + "lr": 0.0009973426573426573, + "step": 520, + "tokens_trained": 0.255584736 + }, + { + "epoch": 0.14807460463796893, + "grad_norm": 160.2477264404297, + "loss": 7.6556, + "lr": 0.000997062937062937, + "step": 522, + "tokens_trained": 0.256563792 + }, + { + "epoch": 0.14864194028792285, + "grad_norm": 283.0034484863281, + "loss": 6.5345, + "lr": 0.0009967832167832168, + "step": 524, + "tokens_trained": 0.257546656 + }, + { + "epoch": 0.14920927593787675, + "grad_norm": 245.537109375, + "loss": 6.3281, + "lr": 0.0009965034965034964, + "step": 526, + "tokens_trained": 0.258530832 + }, + { + "epoch": 0.14977661158783065, + "grad_norm": 162.1538848876953, + "loss": 7.4072, + "lr": 0.0009962237762237763, + "step": 528, + "tokens_trained": 0.259514528 + }, + { + "epoch": 0.15034394723778455, + "grad_norm": 107.25792694091797, + "loss": 5.356, + "lr": 0.000995944055944056, + "step": 530, + "tokens_trained": 0.260500912 + }, + { + "epoch": 0.15091128288773845, + "grad_norm": 173.73353576660156, + "loss": 6.8625, + "lr": 0.0009956643356643356, + "step": 532, + "tokens_trained": 0.26148632 + }, + { + "epoch": 0.15147861853769237, + "grad_norm": 178.33541870117188, + "loss": 5.8794, + "lr": 0.0009953846153846154, + "step": 534, + "tokens_trained": 0.262468816 + }, + { + "epoch": 0.15204595418764627, + "grad_norm": 181.2533416748047, + "loss": 7.0243, + "lr": 0.000995104895104895, + "step": 536, + "tokens_trained": 0.263446696 + }, + { + "epoch": 0.15261328983760017, + "grad_norm": 208.79293823242188, + "loss": 5.8908, + "lr": 0.000994825174825175, + "step": 538, + "tokens_trained": 0.26443108 + }, + { + "epoch": 0.15318062548755407, + "grad_norm": 148.66285705566406, + "loss": 6.0831, + "lr": 0.0009945454545454546, + "step": 540, + "tokens_trained": 0.265414496 + }, + { + "epoch": 0.15374796113750797, + "grad_norm": 165.044189453125, + "loss": 5.5594, + "lr": 0.0009942657342657344, + "step": 542, + "tokens_trained": 0.266394128 + }, + { + "epoch": 0.1543152967874619, + "grad_norm": 124.5405502319336, + "loss": 5.2442, + "lr": 0.000993986013986014, + "step": 544, + "tokens_trained": 0.267378768 + }, + { + "epoch": 0.1548826324374158, + "grad_norm": 68.66510772705078, + "loss": 5.1173, + "lr": 0.0009937062937062937, + "step": 546, + "tokens_trained": 0.268360184 + }, + { + "epoch": 0.1554499680873697, + "grad_norm": 57.052860260009766, + "loss": 5.2348, + "lr": 0.0009934265734265735, + "step": 548, + "tokens_trained": 0.269345672 + }, + { + "epoch": 0.1560173037373236, + "grad_norm": 184.9175567626953, + "loss": 6.7748, + "lr": 0.0009931468531468532, + "step": 550, + "tokens_trained": 0.2703288 + }, + { + "epoch": 0.15658463938727749, + "grad_norm": 72.9861831665039, + "loss": 5.7387, + "lr": 0.000992867132867133, + "step": 552, + "tokens_trained": 0.271309176 + }, + { + "epoch": 0.1571519750372314, + "grad_norm": 135.864501953125, + "loss": 6.3035, + "lr": 0.0009925874125874127, + "step": 554, + "tokens_trained": 0.27229644 + }, + { + "epoch": 0.1577193106871853, + "grad_norm": 130.579833984375, + "loss": 5.4434, + "lr": 0.0009923076923076923, + "step": 556, + "tokens_trained": 0.273277904 + }, + { + "epoch": 0.1582866463371392, + "grad_norm": 206.77345275878906, + "loss": 5.8649, + "lr": 0.000992027972027972, + "step": 558, + "tokens_trained": 0.274261712 + }, + { + "epoch": 0.1588539819870931, + "grad_norm": 144.0505828857422, + "loss": 5.3459, + "lr": 0.0009917482517482518, + "step": 560, + "tokens_trained": 0.2752468 + }, + { + "epoch": 0.159421317637047, + "grad_norm": 87.56634521484375, + "loss": 5.6321, + "lr": 0.0009914685314685314, + "step": 562, + "tokens_trained": 0.276232384 + }, + { + "epoch": 0.15998865328700093, + "grad_norm": 275.2727355957031, + "loss": 6.7515, + "lr": 0.0009911888111888113, + "step": 564, + "tokens_trained": 0.277211608 + }, + { + "epoch": 0.16055598893695483, + "grad_norm": 97.00019836425781, + "loss": 5.4374, + "lr": 0.000990909090909091, + "step": 566, + "tokens_trained": 0.278196336 + }, + { + "epoch": 0.16112332458690873, + "grad_norm": 102.91439056396484, + "loss": 5.729, + "lr": 0.0009906293706293705, + "step": 568, + "tokens_trained": 0.279175672 + }, + { + "epoch": 0.16169066023686263, + "grad_norm": 151.12432861328125, + "loss": 5.4189, + "lr": 0.0009903496503496504, + "step": 570, + "tokens_trained": 0.280161088 + }, + { + "epoch": 0.16225799588681653, + "grad_norm": 86.6823959350586, + "loss": 5.1704, + "lr": 0.00099006993006993, + "step": 572, + "tokens_trained": 0.28114256 + }, + { + "epoch": 0.16282533153677045, + "grad_norm": 90.7052230834961, + "loss": 5.3673, + "lr": 0.0009897902097902099, + "step": 574, + "tokens_trained": 0.282128904 + }, + { + "epoch": 0.16339266718672435, + "grad_norm": 146.92874145507812, + "loss": 5.5971, + "lr": 0.0009895104895104895, + "step": 576, + "tokens_trained": 0.28311528 + }, + { + "epoch": 0.16396000283667825, + "grad_norm": 189.76296997070312, + "loss": 5.3109, + "lr": 0.0009892307692307694, + "step": 578, + "tokens_trained": 0.284098528 + }, + { + "epoch": 0.16452733848663215, + "grad_norm": 174.48092651367188, + "loss": 5.68, + "lr": 0.000988951048951049, + "step": 580, + "tokens_trained": 0.285081064 + }, + { + "epoch": 0.16509467413658604, + "grad_norm": 154.10816955566406, + "loss": 5.3307, + "lr": 0.0009886713286713286, + "step": 582, + "tokens_trained": 0.286067952 + }, + { + "epoch": 0.16566200978653997, + "grad_norm": 64.28263092041016, + "loss": 5.1676, + "lr": 0.0009883916083916085, + "step": 584, + "tokens_trained": 0.287051384 + }, + { + "epoch": 0.16622934543649387, + "grad_norm": 103.81795501708984, + "loss": 5.3436, + "lr": 0.0009881118881118881, + "step": 586, + "tokens_trained": 0.28803284 + }, + { + "epoch": 0.16679668108644777, + "grad_norm": 144.0076904296875, + "loss": 5.3033, + "lr": 0.000987832167832168, + "step": 588, + "tokens_trained": 0.289014824 + }, + { + "epoch": 0.16736401673640167, + "grad_norm": 88.31237030029297, + "loss": 5.0609, + "lr": 0.0009875524475524476, + "step": 590, + "tokens_trained": 0.289999864 + }, + { + "epoch": 0.16793135238635556, + "grad_norm": 68.4583740234375, + "loss": 5.0702, + "lr": 0.0009872727272727273, + "step": 592, + "tokens_trained": 0.290983888 + }, + { + "epoch": 0.1684986880363095, + "grad_norm": 135.28665161132812, + "loss": 5.3962, + "lr": 0.000986993006993007, + "step": 594, + "tokens_trained": 0.291965752 + }, + { + "epoch": 0.1690660236862634, + "grad_norm": 80.0412368774414, + "loss": 5.0246, + "lr": 0.0009867132867132867, + "step": 596, + "tokens_trained": 0.292946952 + }, + { + "epoch": 0.1696333593362173, + "grad_norm": 43.29194641113281, + "loss": 5.0051, + "lr": 0.0009864335664335664, + "step": 598, + "tokens_trained": 0.293928976 + }, + { + "epoch": 0.17020069498617119, + "grad_norm": 220.88687133789062, + "loss": 6.0798, + "lr": 0.0009861538461538462, + "step": 600, + "tokens_trained": 0.294912408 + }, + { + "epoch": 0.17076803063612508, + "grad_norm": 102.58654022216797, + "loss": 5.1271, + "lr": 0.0009858741258741259, + "step": 602, + "tokens_trained": 0.29589416 + }, + { + "epoch": 0.171335366286079, + "grad_norm": 119.0067138671875, + "loss": 5.7402, + "lr": 0.0009855944055944055, + "step": 604, + "tokens_trained": 0.296878584 + }, + { + "epoch": 0.1719027019360329, + "grad_norm": 138.8656005859375, + "loss": 5.1951, + "lr": 0.0009853146853146854, + "step": 606, + "tokens_trained": 0.297864552 + }, + { + "epoch": 0.1724700375859868, + "grad_norm": 73.5890884399414, + "loss": 5.2522, + "lr": 0.000985034965034965, + "step": 608, + "tokens_trained": 0.298854088 + }, + { + "epoch": 0.1730373732359407, + "grad_norm": 113.78330993652344, + "loss": 5.6683, + "lr": 0.0009847552447552449, + "step": 610, + "tokens_trained": 0.299835024 + }, + { + "epoch": 0.1736047088858946, + "grad_norm": 125.20297241210938, + "loss": 5.1812, + "lr": 0.0009844755244755245, + "step": 612, + "tokens_trained": 0.30082032 + }, + { + "epoch": 0.17417204453584853, + "grad_norm": 67.46041870117188, + "loss": 5.0417, + "lr": 0.0009841958041958043, + "step": 614, + "tokens_trained": 0.301808456 + }, + { + "epoch": 0.17473938018580243, + "grad_norm": 117.30754852294922, + "loss": 5.3064, + "lr": 0.000983916083916084, + "step": 616, + "tokens_trained": 0.302794456 + }, + { + "epoch": 0.17530671583575633, + "grad_norm": 124.30754089355469, + "loss": 5.1614, + "lr": 0.0009836363636363636, + "step": 618, + "tokens_trained": 0.303777376 + }, + { + "epoch": 0.17587405148571023, + "grad_norm": 102.72042083740234, + "loss": 5.1265, + "lr": 0.0009833566433566435, + "step": 620, + "tokens_trained": 0.304758864 + }, + { + "epoch": 0.17644138713566412, + "grad_norm": 39.332252502441406, + "loss": 5.1078, + "lr": 0.000983076923076923, + "step": 622, + "tokens_trained": 0.30574392 + }, + { + "epoch": 0.17700872278561805, + "grad_norm": 153.84811401367188, + "loss": 5.7696, + "lr": 0.000982797202797203, + "step": 624, + "tokens_trained": 0.306727584 + }, + { + "epoch": 0.17729239061059499, + "eval_loss": 1.3463915586471558, + "eval_runtime": 20.8357, + "step": 625, + "tokens_trained": 0.307220496 + }, + { + "epoch": 0.17757605843557195, + "grad_norm": 160.2552490234375, + "loss": 5.2283, + "lr": 0.0009825174825174826, + "step": 626, + "tokens_trained": 0.307713024 + }, + { + "epoch": 0.17814339408552585, + "grad_norm": 186.77407836914062, + "loss": 5.2866, + "lr": 0.0009822377622377622, + "step": 628, + "tokens_trained": 0.308700128 + }, + { + "epoch": 0.17871072973547975, + "grad_norm": 84.55519104003906, + "loss": 5.1106, + "lr": 0.0009819580419580419, + "step": 630, + "tokens_trained": 0.309681208 + }, + { + "epoch": 0.17927806538543364, + "grad_norm": 20.617040634155273, + "loss": 4.8327, + "lr": 0.0009816783216783217, + "step": 632, + "tokens_trained": 0.310662224 + }, + { + "epoch": 0.17984540103538757, + "grad_norm": 168.06039428710938, + "loss": 6.0704, + "lr": 0.0009813986013986014, + "step": 634, + "tokens_trained": 0.31164064 + }, + { + "epoch": 0.18041273668534147, + "grad_norm": 238.23736572265625, + "loss": 5.6188, + "lr": 0.0009811188811188812, + "step": 636, + "tokens_trained": 0.312622568 + }, + { + "epoch": 0.18098007233529537, + "grad_norm": 140.0707550048828, + "loss": 6.4034, + "lr": 0.0009808391608391608, + "step": 638, + "tokens_trained": 0.313604944 + }, + { + "epoch": 0.18154740798524927, + "grad_norm": 161.19302368164062, + "loss": 5.4906, + "lr": 0.0009805594405594405, + "step": 640, + "tokens_trained": 0.314592072 + }, + { + "epoch": 0.18211474363520316, + "grad_norm": 121.9577407836914, + "loss": 5.2097, + "lr": 0.0009802797202797203, + "step": 642, + "tokens_trained": 0.315574392 + }, + { + "epoch": 0.1826820792851571, + "grad_norm": 121.25574493408203, + "loss": 5.0317, + "lr": 0.00098, + "step": 644, + "tokens_trained": 0.316559008 + }, + { + "epoch": 0.183249414935111, + "grad_norm": 28.328269958496094, + "loss": 4.932, + "lr": 0.0009797202797202798, + "step": 646, + "tokens_trained": 0.317538776 + }, + { + "epoch": 0.1838167505850649, + "grad_norm": 127.77408599853516, + "loss": 5.8335, + "lr": 0.0009794405594405595, + "step": 648, + "tokens_trained": 0.31851792 + }, + { + "epoch": 0.18438408623501878, + "grad_norm": 94.9522933959961, + "loss": 5.1948, + "lr": 0.000979160839160839, + "step": 650, + "tokens_trained": 0.319501576 + }, + { + "epoch": 0.18495142188497268, + "grad_norm": 110.33658599853516, + "loss": 5.098, + "lr": 0.000978881118881119, + "step": 652, + "tokens_trained": 0.320482392 + }, + { + "epoch": 0.1855187575349266, + "grad_norm": 67.23124694824219, + "loss": 4.7723, + "lr": 0.0009786013986013986, + "step": 654, + "tokens_trained": 0.32146712 + }, + { + "epoch": 0.1860860931848805, + "grad_norm": 61.519866943359375, + "loss": 4.7245, + "lr": 0.0009783216783216782, + "step": 656, + "tokens_trained": 0.322449576 + }, + { + "epoch": 0.1866534288348344, + "grad_norm": 99.51078033447266, + "loss": 4.783, + "lr": 0.000978041958041958, + "step": 658, + "tokens_trained": 0.323432688 + }, + { + "epoch": 0.1872207644847883, + "grad_norm": 44.619197845458984, + "loss": 4.7495, + "lr": 0.000977762237762238, + "step": 660, + "tokens_trained": 0.324413952 + }, + { + "epoch": 0.18778810013474223, + "grad_norm": 114.5891342163086, + "loss": 5.1261, + "lr": 0.0009774825174825176, + "step": 662, + "tokens_trained": 0.325394536 + }, + { + "epoch": 0.18835543578469613, + "grad_norm": 100.3728256225586, + "loss": 4.7883, + "lr": 0.0009772027972027972, + "step": 664, + "tokens_trained": 0.326374672 + }, + { + "epoch": 0.18892277143465003, + "grad_norm": 51.883033752441406, + "loss": 4.7249, + "lr": 0.0009769230769230768, + "step": 666, + "tokens_trained": 0.327357152 + }, + { + "epoch": 0.18949010708460393, + "grad_norm": 82.27507019042969, + "loss": 4.8277, + "lr": 0.0009766433566433567, + "step": 668, + "tokens_trained": 0.328342088 + }, + { + "epoch": 0.19005744273455782, + "grad_norm": 83.53064727783203, + "loss": 4.8338, + "lr": 0.0009763636363636363, + "step": 670, + "tokens_trained": 0.329319248 + }, + { + "epoch": 0.19062477838451175, + "grad_norm": 76.18387603759766, + "loss": 4.6958, + "lr": 0.0009760839160839161, + "step": 672, + "tokens_trained": 0.330305968 + }, + { + "epoch": 0.19119211403446565, + "grad_norm": 27.401426315307617, + "loss": 4.6929, + "lr": 0.0009758041958041958, + "step": 674, + "tokens_trained": 0.3312912 + }, + { + "epoch": 0.19175944968441955, + "grad_norm": 186.770263671875, + "loss": 5.5089, + "lr": 0.0009755244755244756, + "step": 676, + "tokens_trained": 0.332275224 + }, + { + "epoch": 0.19232678533437345, + "grad_norm": 105.02385711669922, + "loss": 4.8876, + "lr": 0.0009752447552447553, + "step": 678, + "tokens_trained": 0.33325588 + }, + { + "epoch": 0.19289412098432734, + "grad_norm": 94.96269989013672, + "loss": 5.1235, + "lr": 0.0009749650349650349, + "step": 680, + "tokens_trained": 0.334238408 + }, + { + "epoch": 0.19346145663428127, + "grad_norm": 92.29356384277344, + "loss": 4.8194, + "lr": 0.0009746853146853148, + "step": 682, + "tokens_trained": 0.335219368 + }, + { + "epoch": 0.19402879228423517, + "grad_norm": 59.1584358215332, + "loss": 4.7511, + "lr": 0.0009744055944055944, + "step": 684, + "tokens_trained": 0.336207136 + }, + { + "epoch": 0.19459612793418907, + "grad_norm": 54.759002685546875, + "loss": 4.777, + "lr": 0.0009741258741258742, + "step": 686, + "tokens_trained": 0.337193536 + }, + { + "epoch": 0.19516346358414297, + "grad_norm": 92.20452880859375, + "loss": 4.8225, + "lr": 0.0009738461538461538, + "step": 688, + "tokens_trained": 0.338179224 + }, + { + "epoch": 0.19573079923409686, + "grad_norm": 75.97005462646484, + "loss": 4.655, + "lr": 0.0009735664335664336, + "step": 690, + "tokens_trained": 0.339162168 + }, + { + "epoch": 0.1962981348840508, + "grad_norm": 58.19076919555664, + "loss": 4.6446, + "lr": 0.0009732867132867133, + "step": 692, + "tokens_trained": 0.340138904 + }, + { + "epoch": 0.1968654705340047, + "grad_norm": 50.81512451171875, + "loss": 4.5866, + "lr": 0.000973006993006993, + "step": 694, + "tokens_trained": 0.34112288 + }, + { + "epoch": 0.1974328061839586, + "grad_norm": 61.683372497558594, + "loss": 4.6018, + "lr": 0.0009727272727272728, + "step": 696, + "tokens_trained": 0.342111992 + }, + { + "epoch": 0.19800014183391249, + "grad_norm": 61.01798629760742, + "loss": 4.6007, + "lr": 0.0009724475524475524, + "step": 698, + "tokens_trained": 0.343095912 + }, + { + "epoch": 0.19856747748386638, + "grad_norm": 96.49671936035156, + "loss": 4.7035, + "lr": 0.0009721678321678323, + "step": 700, + "tokens_trained": 0.344078632 + }, + { + "epoch": 0.1991348131338203, + "grad_norm": 64.7771224975586, + "loss": 4.8341, + "lr": 0.0009718881118881119, + "step": 702, + "tokens_trained": 0.345060576 + }, + { + "epoch": 0.1997021487837742, + "grad_norm": 90.1478042602539, + "loss": 4.7739, + "lr": 0.0009716083916083917, + "step": 704, + "tokens_trained": 0.34604112 + }, + { + "epoch": 0.2002694844337281, + "grad_norm": 67.6308822631836, + "loss": 4.6218, + "lr": 0.0009713286713286713, + "step": 706, + "tokens_trained": 0.347023496 + }, + { + "epoch": 0.200836820083682, + "grad_norm": 40.50175094604492, + "loss": 4.6008, + "lr": 0.000971048951048951, + "step": 708, + "tokens_trained": 0.348005416 + }, + { + "epoch": 0.2014041557336359, + "grad_norm": 33.6448860168457, + "loss": 4.5307, + "lr": 0.0009707692307692308, + "step": 710, + "tokens_trained": 0.3489886 + }, + { + "epoch": 0.20197149138358983, + "grad_norm": 15.484851837158203, + "loss": 4.5065, + "lr": 0.0009704895104895105, + "step": 712, + "tokens_trained": 0.34997024 + }, + { + "epoch": 0.20253882703354373, + "grad_norm": 109.26301574707031, + "loss": 4.9613, + "lr": 0.0009702097902097903, + "step": 714, + "tokens_trained": 0.350958496 + }, + { + "epoch": 0.20310616268349763, + "grad_norm": 150.07492065429688, + "loss": 4.8507, + "lr": 0.0009699300699300699, + "step": 716, + "tokens_trained": 0.35193892 + }, + { + "epoch": 0.20367349833345152, + "grad_norm": 113.43978881835938, + "loss": 5.4494, + "lr": 0.0009696503496503498, + "step": 718, + "tokens_trained": 0.35291908 + }, + { + "epoch": 0.20424083398340542, + "grad_norm": 123.0071792602539, + "loss": 4.9475, + "lr": 0.0009693706293706294, + "step": 720, + "tokens_trained": 0.353896072 + }, + { + "epoch": 0.20480816963335935, + "grad_norm": 65.55500793457031, + "loss": 4.7585, + "lr": 0.0009690909090909091, + "step": 722, + "tokens_trained": 0.354878992 + }, + { + "epoch": 0.20537550528331325, + "grad_norm": 36.11159896850586, + "loss": 4.6323, + "lr": 0.0009688111888111888, + "step": 724, + "tokens_trained": 0.355863728 + }, + { + "epoch": 0.20594284093326715, + "grad_norm": 30.566436767578125, + "loss": 4.53, + "lr": 0.0009685314685314685, + "step": 726, + "tokens_trained": 0.356845272 + }, + { + "epoch": 0.20651017658322104, + "grad_norm": 59.01853561401367, + "loss": 4.5283, + "lr": 0.0009682517482517483, + "step": 728, + "tokens_trained": 0.357826656 + }, + { + "epoch": 0.20707751223317494, + "grad_norm": 91.78115844726562, + "loss": 4.6149, + "lr": 0.000967972027972028, + "step": 730, + "tokens_trained": 0.358809896 + }, + { + "epoch": 0.20764484788312887, + "grad_norm": 67.97398376464844, + "loss": 4.617, + "lr": 0.0009676923076923078, + "step": 732, + "tokens_trained": 0.359788736 + }, + { + "epoch": 0.20821218353308277, + "grad_norm": 42.82001876831055, + "loss": 4.6134, + "lr": 0.0009674125874125874, + "step": 734, + "tokens_trained": 0.360771744 + }, + { + "epoch": 0.20877951918303667, + "grad_norm": 63.52122116088867, + "loss": 4.6995, + "lr": 0.0009671328671328672, + "step": 736, + "tokens_trained": 0.361757656 + }, + { + "epoch": 0.20934685483299056, + "grad_norm": 116.39544677734375, + "loss": 4.7153, + "lr": 0.0009668531468531469, + "step": 738, + "tokens_trained": 0.362744008 + }, + { + "epoch": 0.20991419048294446, + "grad_norm": 40.74269485473633, + "loss": 4.7978, + "lr": 0.0009665734265734266, + "step": 740, + "tokens_trained": 0.36372872 + }, + { + "epoch": 0.2104815261328984, + "grad_norm": 114.29917907714844, + "loss": 5.1683, + "lr": 0.0009662937062937063, + "step": 742, + "tokens_trained": 0.364710536 + }, + { + "epoch": 0.2110488617828523, + "grad_norm": 115.83326721191406, + "loss": 4.7642, + "lr": 0.000966013986013986, + "step": 744, + "tokens_trained": 0.3656912 + }, + { + "epoch": 0.21161619743280619, + "grad_norm": 21.708093643188477, + "loss": 4.8244, + "lr": 0.0009657342657342657, + "step": 746, + "tokens_trained": 0.36667388 + }, + { + "epoch": 0.21218353308276008, + "grad_norm": 182.01918029785156, + "loss": 5.6045, + "lr": 0.0009654545454545455, + "step": 748, + "tokens_trained": 0.3676634 + }, + { + "epoch": 0.21275086873271398, + "grad_norm": 47.119319915771484, + "loss": 4.7929, + "lr": 0.0009651748251748252, + "step": 750, + "tokens_trained": 0.368647288 + }, + { + "epoch": 0.21275086873271398, + "eval_loss": 1.2186306715011597, + "eval_runtime": 20.9362, + "step": 750, + "tokens_trained": 0.368647288 + }, + { + "epoch": 0.2133182043826679, + "grad_norm": 51.43566131591797, + "loss": 4.7298, + "lr": 0.0009648951048951049, + "step": 752, + "tokens_trained": 0.36962992 + }, + { + "epoch": 0.2138855400326218, + "grad_norm": 79.49323272705078, + "loss": 5.0749, + "lr": 0.0009646153846153846, + "step": 754, + "tokens_trained": 0.370616064 + }, + { + "epoch": 0.2144528756825757, + "grad_norm": 119.80200958251953, + "loss": 4.8198, + "lr": 0.0009643356643356644, + "step": 756, + "tokens_trained": 0.371596208 + }, + { + "epoch": 0.2150202113325296, + "grad_norm": 95.88092041015625, + "loss": 4.7437, + "lr": 0.0009640559440559441, + "step": 758, + "tokens_trained": 0.372579584 + }, + { + "epoch": 0.2155875469824835, + "grad_norm": 79.64202117919922, + "loss": 4.9181, + "lr": 0.0009637762237762237, + "step": 760, + "tokens_trained": 0.373563056 + }, + { + "epoch": 0.21615488263243743, + "grad_norm": 79.93920135498047, + "loss": 4.6393, + "lr": 0.0009634965034965035, + "step": 762, + "tokens_trained": 0.374547648 + }, + { + "epoch": 0.21672221828239133, + "grad_norm": 78.67620849609375, + "loss": 4.6178, + "lr": 0.0009632167832167832, + "step": 764, + "tokens_trained": 0.375531456 + }, + { + "epoch": 0.21728955393234523, + "grad_norm": 56.32818603515625, + "loss": 4.6498, + "lr": 0.000962937062937063, + "step": 766, + "tokens_trained": 0.376516896 + }, + { + "epoch": 0.21785688958229912, + "grad_norm": 45.35737228393555, + "loss": 4.5812, + "lr": 0.0009626573426573427, + "step": 768, + "tokens_trained": 0.377499752 + }, + { + "epoch": 0.21842422523225302, + "grad_norm": 58.13076400756836, + "loss": 4.5793, + "lr": 0.0009623776223776224, + "step": 770, + "tokens_trained": 0.37848276 + }, + { + "epoch": 0.21899156088220695, + "grad_norm": 55.620628356933594, + "loss": 4.4865, + "lr": 0.0009620979020979021, + "step": 772, + "tokens_trained": 0.379466296 + }, + { + "epoch": 0.21955889653216085, + "grad_norm": 77.26813507080078, + "loss": 4.5671, + "lr": 0.0009618181818181818, + "step": 774, + "tokens_trained": 0.380449888 + }, + { + "epoch": 0.22012623218211474, + "grad_norm": 45.00653839111328, + "loss": 4.5923, + "lr": 0.0009615384615384616, + "step": 776, + "tokens_trained": 0.381430352 + }, + { + "epoch": 0.22069356783206864, + "grad_norm": 52.77407455444336, + "loss": 4.5094, + "lr": 0.0009612587412587412, + "step": 778, + "tokens_trained": 0.382416152 + }, + { + "epoch": 0.22126090348202254, + "grad_norm": 36.721073150634766, + "loss": 4.4536, + "lr": 0.000960979020979021, + "step": 780, + "tokens_trained": 0.383396672 + }, + { + "epoch": 0.22182823913197647, + "grad_norm": 51.21247100830078, + "loss": 4.4599, + "lr": 0.0009606993006993007, + "step": 782, + "tokens_trained": 0.384380584 + }, + { + "epoch": 0.22239557478193037, + "grad_norm": 65.23794555664062, + "loss": 4.5397, + "lr": 0.0009604195804195805, + "step": 784, + "tokens_trained": 0.385361368 + }, + { + "epoch": 0.22296291043188426, + "grad_norm": 23.255144119262695, + "loss": 4.5007, + "lr": 0.0009601398601398602, + "step": 786, + "tokens_trained": 0.386341416 + }, + { + "epoch": 0.22353024608183816, + "grad_norm": 30.812740325927734, + "loss": 4.5239, + "lr": 0.0009598601398601398, + "step": 788, + "tokens_trained": 0.387324624 + }, + { + "epoch": 0.22409758173179206, + "grad_norm": 50.781219482421875, + "loss": 4.5131, + "lr": 0.0009595804195804196, + "step": 790, + "tokens_trained": 0.388312744 + }, + { + "epoch": 0.224664917381746, + "grad_norm": 47.88816452026367, + "loss": 4.4622, + "lr": 0.0009593006993006993, + "step": 792, + "tokens_trained": 0.38929852 + }, + { + "epoch": 0.22523225303169989, + "grad_norm": 49.32049560546875, + "loss": 4.5053, + "lr": 0.0009590209790209791, + "step": 794, + "tokens_trained": 0.390279792 + }, + { + "epoch": 0.22579958868165378, + "grad_norm": 36.98805618286133, + "loss": 4.5144, + "lr": 0.0009587412587412587, + "step": 796, + "tokens_trained": 0.391258904 + }, + { + "epoch": 0.22636692433160768, + "grad_norm": 24.88475799560547, + "loss": 4.4992, + "lr": 0.0009584615384615385, + "step": 798, + "tokens_trained": 0.392238976 + }, + { + "epoch": 0.22693425998156158, + "grad_norm": 38.89309310913086, + "loss": 4.4853, + "lr": 0.0009581818181818182, + "step": 800, + "tokens_trained": 0.393226312 + }, + { + "epoch": 0.2275015956315155, + "grad_norm": 34.86774444580078, + "loss": 4.4519, + "lr": 0.000957902097902098, + "step": 802, + "tokens_trained": 0.394206688 + }, + { + "epoch": 0.2280689312814694, + "grad_norm": 24.966291427612305, + "loss": 4.456, + "lr": 0.0009576223776223777, + "step": 804, + "tokens_trained": 0.395191608 + }, + { + "epoch": 0.2286362669314233, + "grad_norm": 12.218213081359863, + "loss": 4.4266, + "lr": 0.0009573426573426573, + "step": 806, + "tokens_trained": 0.396174512 + }, + { + "epoch": 0.2292036025813772, + "grad_norm": 50.817054748535156, + "loss": 4.586, + "lr": 0.0009570629370629371, + "step": 808, + "tokens_trained": 0.397156912 + }, + { + "epoch": 0.2297709382313311, + "grad_norm": 37.60087203979492, + "loss": 4.4616, + "lr": 0.0009567832167832168, + "step": 810, + "tokens_trained": 0.398140016 + }, + { + "epoch": 0.23033827388128503, + "grad_norm": 37.55678176879883, + "loss": 4.4755, + "lr": 0.0009565034965034966, + "step": 812, + "tokens_trained": 0.39912384 + }, + { + "epoch": 0.23090560953123893, + "grad_norm": 56.427215576171875, + "loss": 4.5078, + "lr": 0.0009562237762237762, + "step": 814, + "tokens_trained": 0.400111224 + }, + { + "epoch": 0.23147294518119282, + "grad_norm": 31.869827270507812, + "loss": 4.5013, + "lr": 0.0009559440559440559, + "step": 816, + "tokens_trained": 0.401094936 + }, + { + "epoch": 0.23204028083114672, + "grad_norm": 77.57958984375, + "loss": 4.6977, + "lr": 0.0009556643356643357, + "step": 818, + "tokens_trained": 0.402078888 + }, + { + "epoch": 0.23260761648110062, + "grad_norm": 52.50204849243164, + "loss": 4.5142, + "lr": 0.0009553846153846154, + "step": 820, + "tokens_trained": 0.403059904 + }, + { + "epoch": 0.23317495213105455, + "grad_norm": 32.34305191040039, + "loss": 4.4828, + "lr": 0.0009551048951048952, + "step": 822, + "tokens_trained": 0.404049848 + }, + { + "epoch": 0.23374228778100845, + "grad_norm": 52.08961486816406, + "loss": 4.4869, + "lr": 0.0009548251748251748, + "step": 824, + "tokens_trained": 0.405033872 + }, + { + "epoch": 0.23430962343096234, + "grad_norm": 44.32194900512695, + "loss": 4.4802, + "lr": 0.0009545454545454546, + "step": 826, + "tokens_trained": 0.406017872 + }, + { + "epoch": 0.23487695908091624, + "grad_norm": 30.941524505615234, + "loss": 4.4323, + "lr": 0.0009542657342657343, + "step": 828, + "tokens_trained": 0.40700704 + }, + { + "epoch": 0.23544429473087014, + "grad_norm": 20.52709197998047, + "loss": 4.4919, + "lr": 0.000953986013986014, + "step": 830, + "tokens_trained": 0.407991512 + }, + { + "epoch": 0.23601163038082407, + "grad_norm": 86.80307006835938, + "loss": 4.8228, + "lr": 0.0009537062937062937, + "step": 832, + "tokens_trained": 0.408979272 + }, + { + "epoch": 0.23657896603077797, + "grad_norm": 73.71435546875, + "loss": 4.5954, + "lr": 0.0009534265734265734, + "step": 834, + "tokens_trained": 0.409962984 + }, + { + "epoch": 0.23714630168073186, + "grad_norm": 66.3813247680664, + "loss": 4.5969, + "lr": 0.0009531468531468532, + "step": 836, + "tokens_trained": 0.410945248 + }, + { + "epoch": 0.23771363733068576, + "grad_norm": 86.94453430175781, + "loss": 4.5894, + "lr": 0.0009528671328671329, + "step": 838, + "tokens_trained": 0.411930872 + }, + { + "epoch": 0.23828097298063966, + "grad_norm": 61.28915786743164, + "loss": 4.5613, + "lr": 0.0009525874125874127, + "step": 840, + "tokens_trained": 0.412912608 + }, + { + "epoch": 0.2388483086305936, + "grad_norm": 65.02153778076172, + "loss": 4.5398, + "lr": 0.0009523076923076923, + "step": 842, + "tokens_trained": 0.413897488 + }, + { + "epoch": 0.23941564428054748, + "grad_norm": 54.01200485229492, + "loss": 4.4922, + "lr": 0.000952027972027972, + "step": 844, + "tokens_trained": 0.414872888 + }, + { + "epoch": 0.23998297993050138, + "grad_norm": 66.7095718383789, + "loss": 4.5317, + "lr": 0.0009517482517482518, + "step": 846, + "tokens_trained": 0.415856296 + }, + { + "epoch": 0.24055031558045528, + "grad_norm": 64.23979949951172, + "loss": 4.4686, + "lr": 0.0009514685314685315, + "step": 848, + "tokens_trained": 0.416843344 + }, + { + "epoch": 0.24111765123040918, + "grad_norm": 51.012840270996094, + "loss": 4.4544, + "lr": 0.0009511888111888112, + "step": 850, + "tokens_trained": 0.41782032 + }, + { + "epoch": 0.2416849868803631, + "grad_norm": 40.83076095581055, + "loss": 4.4665, + "lr": 0.0009509090909090909, + "step": 852, + "tokens_trained": 0.418805672 + }, + { + "epoch": 0.242252322530317, + "grad_norm": 48.31489944458008, + "loss": 4.4748, + "lr": 0.0009506293706293707, + "step": 854, + "tokens_trained": 0.419786344 + }, + { + "epoch": 0.2428196581802709, + "grad_norm": 50.08705520629883, + "loss": 4.4973, + "lr": 0.0009503496503496504, + "step": 856, + "tokens_trained": 0.420768872 + }, + { + "epoch": 0.2433869938302248, + "grad_norm": 26.840139389038086, + "loss": 4.461, + "lr": 0.0009500699300699301, + "step": 858, + "tokens_trained": 0.421750296 + }, + { + "epoch": 0.2439543294801787, + "grad_norm": 24.721454620361328, + "loss": 4.4246, + "lr": 0.0009497902097902098, + "step": 860, + "tokens_trained": 0.422730976 + }, + { + "epoch": 0.24452166513013263, + "grad_norm": 63.147926330566406, + "loss": 4.623, + "lr": 0.0009495104895104895, + "step": 862, + "tokens_trained": 0.423715768 + }, + { + "epoch": 0.24508900078008652, + "grad_norm": 50.99778747558594, + "loss": 4.4663, + "lr": 0.0009492307692307693, + "step": 864, + "tokens_trained": 0.424697072 + }, + { + "epoch": 0.24565633643004042, + "grad_norm": 38.0300407409668, + "loss": 4.4649, + "lr": 0.000948951048951049, + "step": 866, + "tokens_trained": 0.425681392 + }, + { + "epoch": 0.24622367207999432, + "grad_norm": 19.017776489257812, + "loss": 4.4296, + "lr": 0.0009486713286713286, + "step": 868, + "tokens_trained": 0.426665088 + }, + { + "epoch": 0.24679100772994822, + "grad_norm": 24.02813148498535, + "loss": 4.4958, + "lr": 0.0009483916083916084, + "step": 870, + "tokens_trained": 0.427646016 + }, + { + "epoch": 0.24735834337990215, + "grad_norm": 59.40018081665039, + "loss": 4.5919, + "lr": 0.0009481118881118881, + "step": 872, + "tokens_trained": 0.428628048 + }, + { + "epoch": 0.24792567902985604, + "grad_norm": 61.13710403442383, + "loss": 4.4642, + "lr": 0.0009478321678321679, + "step": 874, + "tokens_trained": 0.4296112 + }, + { + "epoch": 0.24820934685483298, + "eval_loss": 1.1135390996932983, + "eval_runtime": 20.4738, + "step": 875, + "tokens_trained": 0.430109024 + }, + { + "epoch": 0.24849301467980994, + "grad_norm": 47.920021057128906, + "loss": 4.4832, + "lr": 0.0009475524475524476, + "step": 876, + "tokens_trained": 0.430599208 + }, + { + "epoch": 0.24906035032976384, + "grad_norm": 25.661701202392578, + "loss": 4.4176, + "lr": 0.0009472727272727273, + "step": 878, + "tokens_trained": 0.43158356 + }, + { + "epoch": 0.24962768597971774, + "grad_norm": 32.86565399169922, + "loss": 4.405, + "lr": 0.000946993006993007, + "step": 880, + "tokens_trained": 0.432570584 + }, + { + "epoch": 0.25019502162967167, + "grad_norm": 23.443584442138672, + "loss": 4.4218, + "lr": 0.0009467132867132868, + "step": 882, + "tokens_trained": 0.433557672 + }, + { + "epoch": 0.25076235727962554, + "grad_norm": 28.315975189208984, + "loss": 4.4019, + "lr": 0.0009464335664335665, + "step": 884, + "tokens_trained": 0.434542736 + }, + { + "epoch": 0.25132969292957946, + "grad_norm": 31.056642532348633, + "loss": 4.4027, + "lr": 0.0009461538461538461, + "step": 886, + "tokens_trained": 0.43553112 + }, + { + "epoch": 0.2518970285795334, + "grad_norm": 13.661805152893066, + "loss": 4.3745, + "lr": 0.0009458741258741259, + "step": 888, + "tokens_trained": 0.436511584 + }, + { + "epoch": 0.25246436422948726, + "grad_norm": 47.04901885986328, + "loss": 4.4875, + "lr": 0.0009455944055944056, + "step": 890, + "tokens_trained": 0.43749464 + }, + { + "epoch": 0.2530316998794412, + "grad_norm": 84.91446685791016, + "loss": 4.5185, + "lr": 0.0009453146853146854, + "step": 892, + "tokens_trained": 0.43847764 + }, + { + "epoch": 0.25359903552939506, + "grad_norm": 40.9110107421875, + "loss": 4.5735, + "lr": 0.000945034965034965, + "step": 894, + "tokens_trained": 0.439461496 + }, + { + "epoch": 0.254166371179349, + "grad_norm": 58.98877716064453, + "loss": 4.5146, + "lr": 0.0009447552447552447, + "step": 896, + "tokens_trained": 0.440443656 + }, + { + "epoch": 0.2547337068293029, + "grad_norm": 34.037315368652344, + "loss": 4.4714, + "lr": 0.0009444755244755245, + "step": 898, + "tokens_trained": 0.441423496 + }, + { + "epoch": 0.2553010424792568, + "grad_norm": 24.91920280456543, + "loss": 4.4334, + "lr": 0.0009441958041958042, + "step": 900, + "tokens_trained": 0.442407408 + }, + { + "epoch": 0.2558683781292107, + "grad_norm": 30.612323760986328, + "loss": 4.4459, + "lr": 0.000943916083916084, + "step": 902, + "tokens_trained": 0.443383464 + }, + { + "epoch": 0.2564357137791646, + "grad_norm": 50.595577239990234, + "loss": 4.4848, + "lr": 0.0009436363636363636, + "step": 904, + "tokens_trained": 0.4443674 + }, + { + "epoch": 0.2570030494291185, + "grad_norm": 41.3300895690918, + "loss": 4.4445, + "lr": 0.0009433566433566434, + "step": 906, + "tokens_trained": 0.445346072 + }, + { + "epoch": 0.25757038507907243, + "grad_norm": 48.33689880371094, + "loss": 4.4058, + "lr": 0.0009430769230769231, + "step": 908, + "tokens_trained": 0.446329872 + }, + { + "epoch": 0.2581377207290263, + "grad_norm": 39.081382751464844, + "loss": 4.4321, + "lr": 0.0009427972027972029, + "step": 910, + "tokens_trained": 0.447309544 + }, + { + "epoch": 0.2587050563789802, + "grad_norm": 62.18062210083008, + "loss": 4.4672, + "lr": 0.0009425174825174825, + "step": 912, + "tokens_trained": 0.448295056 + }, + { + "epoch": 0.2592723920289341, + "grad_norm": 28.725404739379883, + "loss": 4.4786, + "lr": 0.0009422377622377622, + "step": 914, + "tokens_trained": 0.449274208 + }, + { + "epoch": 0.259839727678888, + "grad_norm": 47.55582809448242, + "loss": 4.4227, + "lr": 0.000941958041958042, + "step": 916, + "tokens_trained": 0.450256408 + }, + { + "epoch": 0.26040706332884195, + "grad_norm": 35.743125915527344, + "loss": 4.379, + "lr": 0.0009416783216783217, + "step": 918, + "tokens_trained": 0.45123684 + }, + { + "epoch": 0.2609743989787958, + "grad_norm": 31.489402770996094, + "loss": 4.3888, + "lr": 0.0009413986013986015, + "step": 920, + "tokens_trained": 0.45221748 + }, + { + "epoch": 0.26154173462874974, + "grad_norm": 36.46233367919922, + "loss": 4.3982, + "lr": 0.0009411188811188811, + "step": 922, + "tokens_trained": 0.453202064 + }, + { + "epoch": 0.2621090702787036, + "grad_norm": 41.6457633972168, + "loss": 4.385, + "lr": 0.0009408391608391608, + "step": 924, + "tokens_trained": 0.454183456 + }, + { + "epoch": 0.26267640592865754, + "grad_norm": 26.52242088317871, + "loss": 4.4091, + "lr": 0.0009405594405594406, + "step": 926, + "tokens_trained": 0.455165496 + }, + { + "epoch": 0.26324374157861147, + "grad_norm": 14.401509284973145, + "loss": 4.3549, + "lr": 0.0009402797202797203, + "step": 928, + "tokens_trained": 0.456150248 + }, + { + "epoch": 0.26381107722856534, + "grad_norm": 30.626131057739258, + "loss": 4.3325, + "lr": 0.00094, + "step": 930, + "tokens_trained": 0.457134184 + }, + { + "epoch": 0.26437841287851926, + "grad_norm": 63.74067687988281, + "loss": 4.442, + "lr": 0.0009397202797202797, + "step": 932, + "tokens_trained": 0.458118808 + }, + { + "epoch": 0.26494574852847314, + "grad_norm": 12.15156364440918, + "loss": 4.4658, + "lr": 0.0009394405594405595, + "step": 934, + "tokens_trained": 0.459103872 + }, + { + "epoch": 0.26551308417842706, + "grad_norm": 76.2789306640625, + "loss": 4.8153, + "lr": 0.0009391608391608392, + "step": 936, + "tokens_trained": 0.460087216 + }, + { + "epoch": 0.266080419828381, + "grad_norm": 63.919334411621094, + "loss": 4.5707, + "lr": 0.000938881118881119, + "step": 938, + "tokens_trained": 0.461070568 + }, + { + "epoch": 0.26664775547833486, + "grad_norm": 75.1481704711914, + "loss": 4.5931, + "lr": 0.0009386013986013986, + "step": 940, + "tokens_trained": 0.462055184 + }, + { + "epoch": 0.2672150911282888, + "grad_norm": 33.118961334228516, + "loss": 4.4723, + "lr": 0.0009383216783216783, + "step": 942, + "tokens_trained": 0.463034592 + }, + { + "epoch": 0.26778242677824265, + "grad_norm": 30.8759765625, + "loss": 4.4275, + "lr": 0.0009380419580419581, + "step": 944, + "tokens_trained": 0.464016816 + }, + { + "epoch": 0.2683497624281966, + "grad_norm": 41.05061340332031, + "loss": 4.4566, + "lr": 0.0009377622377622378, + "step": 946, + "tokens_trained": 0.465000872 + }, + { + "epoch": 0.2689170980781505, + "grad_norm": 30.93424415588379, + "loss": 4.3985, + "lr": 0.0009374825174825175, + "step": 948, + "tokens_trained": 0.465984096 + }, + { + "epoch": 0.2694844337281044, + "grad_norm": 29.477052688598633, + "loss": 4.3718, + "lr": 0.0009372027972027972, + "step": 950, + "tokens_trained": 0.466961752 + }, + { + "epoch": 0.2700517693780583, + "grad_norm": 21.568912506103516, + "loss": 4.3697, + "lr": 0.0009369230769230769, + "step": 952, + "tokens_trained": 0.467950088 + }, + { + "epoch": 0.2706191050280122, + "grad_norm": 41.66835021972656, + "loss": 4.4241, + "lr": 0.0009366433566433567, + "step": 954, + "tokens_trained": 0.468928736 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 68.04551696777344, + "loss": 4.3978, + "lr": 0.0009363636363636364, + "step": 956, + "tokens_trained": 0.469907496 + }, + { + "epoch": 0.27175377632792, + "grad_norm": 37.655181884765625, + "loss": 4.4497, + "lr": 0.0009360839160839161, + "step": 958, + "tokens_trained": 0.470889168 + }, + { + "epoch": 0.2723211119778739, + "grad_norm": 22.074953079223633, + "loss": 4.3918, + "lr": 0.0009358041958041958, + "step": 960, + "tokens_trained": 0.471871816 + }, + { + "epoch": 0.2728884476278278, + "grad_norm": 49.925777435302734, + "loss": 4.4745, + "lr": 0.0009355244755244755, + "step": 962, + "tokens_trained": 0.472856728 + }, + { + "epoch": 0.2734557832777817, + "grad_norm": 46.520851135253906, + "loss": 4.403, + "lr": 0.0009352447552447553, + "step": 964, + "tokens_trained": 0.473838544 + }, + { + "epoch": 0.2740231189277356, + "grad_norm": 25.053146362304688, + "loss": 4.4247, + "lr": 0.0009349650349650349, + "step": 966, + "tokens_trained": 0.474819976 + }, + { + "epoch": 0.27459045457768955, + "grad_norm": 30.127140045166016, + "loss": 4.3834, + "lr": 0.0009346853146853147, + "step": 968, + "tokens_trained": 0.475800696 + }, + { + "epoch": 0.2751577902276434, + "grad_norm": 41.478328704833984, + "loss": 4.3978, + "lr": 0.0009344055944055944, + "step": 970, + "tokens_trained": 0.4767834 + }, + { + "epoch": 0.27572512587759734, + "grad_norm": 23.739456176757812, + "loss": 4.3698, + "lr": 0.0009341258741258742, + "step": 972, + "tokens_trained": 0.47776944 + }, + { + "epoch": 0.2762924615275512, + "grad_norm": 21.813220977783203, + "loss": 4.3902, + "lr": 0.0009338461538461539, + "step": 974, + "tokens_trained": 0.478757048 + }, + { + "epoch": 0.27685979717750514, + "grad_norm": 64.79598999023438, + "loss": 4.5237, + "lr": 0.0009335664335664336, + "step": 976, + "tokens_trained": 0.47973872 + }, + { + "epoch": 0.27742713282745907, + "grad_norm": 68.32705688476562, + "loss": 4.4461, + "lr": 0.0009332867132867133, + "step": 978, + "tokens_trained": 0.480721912 + }, + { + "epoch": 0.27799446847741294, + "grad_norm": 41.857582092285156, + "loss": 4.4663, + "lr": 0.0009330069930069929, + "step": 980, + "tokens_trained": 0.481704248 + }, + { + "epoch": 0.27856180412736686, + "grad_norm": 28.30609893798828, + "loss": 4.3461, + "lr": 0.0009327272727272728, + "step": 982, + "tokens_trained": 0.482689768 + }, + { + "epoch": 0.27912913977732073, + "grad_norm": 33.207950592041016, + "loss": 4.4185, + "lr": 0.0009324475524475524, + "step": 984, + "tokens_trained": 0.483670008 + }, + { + "epoch": 0.27969647542727466, + "grad_norm": 29.541227340698242, + "loss": 4.388, + "lr": 0.0009321678321678322, + "step": 986, + "tokens_trained": 0.48465836 + }, + { + "epoch": 0.2802638110772286, + "grad_norm": 16.23346710205078, + "loss": 4.3219, + "lr": 0.0009318881118881119, + "step": 988, + "tokens_trained": 0.4856402 + }, + { + "epoch": 0.28083114672718246, + "grad_norm": 20.036178588867188, + "loss": 4.3273, + "lr": 0.0009316083916083917, + "step": 990, + "tokens_trained": 0.486621648 + }, + { + "epoch": 0.2813984823771364, + "grad_norm": 49.25468063354492, + "loss": 4.4649, + "lr": 0.0009313286713286714, + "step": 992, + "tokens_trained": 0.48760744 + }, + { + "epoch": 0.28196581802709025, + "grad_norm": 48.59744644165039, + "loss": 4.3979, + "lr": 0.000931048951048951, + "step": 994, + "tokens_trained": 0.488590472 + }, + { + "epoch": 0.2825331536770442, + "grad_norm": 16.33649253845215, + "loss": 4.3945, + "lr": 0.0009307692307692308, + "step": 996, + "tokens_trained": 0.489570976 + }, + { + "epoch": 0.2831004893269981, + "grad_norm": 60.632591247558594, + "loss": 4.5581, + "lr": 0.0009304895104895104, + "step": 998, + "tokens_trained": 0.490552296 + }, + { + "epoch": 0.283667824976952, + "grad_norm": 52.75735092163086, + "loss": 4.424, + "lr": 0.0009302097902097903, + "step": 1000, + "tokens_trained": 0.49153744 + }, + { + "epoch": 0.283667824976952, + "eval_loss": 1.1363450288772583, + "eval_runtime": 20.7491, + "step": 1000, + "tokens_trained": 0.49153744 + }, + { + "epoch": 0.2842351606269059, + "grad_norm": 20.506614685058594, + "loss": 4.4241, + "lr": 0.0009299300699300699, + "step": 1002, + "tokens_trained": 0.492522608 + }, + { + "epoch": 0.2848024962768598, + "grad_norm": 23.148601531982422, + "loss": 4.3975, + "lr": 0.0009296503496503497, + "step": 1004, + "tokens_trained": 0.493501384 + }, + { + "epoch": 0.2853698319268137, + "grad_norm": 9.550869941711426, + "loss": 4.3952, + "lr": 0.0009293706293706294, + "step": 1006, + "tokens_trained": 0.494482544 + }, + { + "epoch": 0.2859371675767676, + "grad_norm": 80.31155395507812, + "loss": 4.7614, + "lr": 0.0009290909090909091, + "step": 1008, + "tokens_trained": 0.495459416 + }, + { + "epoch": 0.2865045032267215, + "grad_norm": 61.021026611328125, + "loss": 4.4396, + "lr": 0.0009288111888111889, + "step": 1010, + "tokens_trained": 0.4964418 + }, + { + "epoch": 0.2870718388766754, + "grad_norm": 35.23258972167969, + "loss": 4.5548, + "lr": 0.0009285314685314685, + "step": 1012, + "tokens_trained": 0.497428288 + }, + { + "epoch": 0.2876391745266293, + "grad_norm": 36.45478057861328, + "loss": 4.46, + "lr": 0.0009282517482517483, + "step": 1014, + "tokens_trained": 0.498416832 + }, + { + "epoch": 0.2882065101765832, + "grad_norm": 46.622982025146484, + "loss": 4.3554, + "lr": 0.0009279720279720279, + "step": 1016, + "tokens_trained": 0.499399792 + }, + { + "epoch": 0.28877384582653715, + "grad_norm": 87.00289154052734, + "loss": 4.5276, + "lr": 0.0009276923076923078, + "step": 1018, + "tokens_trained": 0.500383776 + }, + { + "epoch": 0.289341181476491, + "grad_norm": 11.444964408874512, + "loss": 4.5483, + "lr": 0.0009274125874125874, + "step": 1020, + "tokens_trained": 0.50136468 + }, + { + "epoch": 0.28990851712644494, + "grad_norm": 89.05914306640625, + "loss": 4.8957, + "lr": 0.0009271328671328671, + "step": 1022, + "tokens_trained": 0.50235172 + }, + { + "epoch": 0.2904758527763988, + "grad_norm": 26.915477752685547, + "loss": 4.6184, + "lr": 0.0009268531468531469, + "step": 1024, + "tokens_trained": 0.50333208 + }, + { + "epoch": 0.29104318842635274, + "grad_norm": 44.32100296020508, + "loss": 4.5263, + "lr": 0.0009265734265734266, + "step": 1026, + "tokens_trained": 0.504314656 + }, + { + "epoch": 0.29161052407630667, + "grad_norm": 26.699670791625977, + "loss": 4.3871, + "lr": 0.0009262937062937064, + "step": 1028, + "tokens_trained": 0.505296568 + }, + { + "epoch": 0.29217785972626054, + "grad_norm": 27.469482421875, + "loss": 4.3558, + "lr": 0.000926013986013986, + "step": 1030, + "tokens_trained": 0.506280416 + }, + { + "epoch": 0.29274519537621446, + "grad_norm": 26.149612426757812, + "loss": 4.3368, + "lr": 0.0009257342657342658, + "step": 1032, + "tokens_trained": 0.507261224 + }, + { + "epoch": 0.29331253102616833, + "grad_norm": 8.754459381103516, + "loss": 4.3447, + "lr": 0.0009254545454545454, + "step": 1034, + "tokens_trained": 0.508243288 + }, + { + "epoch": 0.29387986667612226, + "grad_norm": 32.17164611816406, + "loss": 4.4174, + "lr": 0.0009251748251748252, + "step": 1036, + "tokens_trained": 0.509224176 + }, + { + "epoch": 0.2944472023260762, + "grad_norm": 41.17238235473633, + "loss": 4.4221, + "lr": 0.0009248951048951049, + "step": 1038, + "tokens_trained": 0.510203568 + }, + { + "epoch": 0.29501453797603006, + "grad_norm": 44.97213363647461, + "loss": 4.3594, + "lr": 0.0009246153846153846, + "step": 1040, + "tokens_trained": 0.511186464 + }, + { + "epoch": 0.295581873625984, + "grad_norm": 42.23421859741211, + "loss": 4.4159, + "lr": 0.0009243356643356644, + "step": 1042, + "tokens_trained": 0.51216944 + }, + { + "epoch": 0.29614920927593785, + "grad_norm": 36.13594436645508, + "loss": 4.4105, + "lr": 0.0009240559440559441, + "step": 1044, + "tokens_trained": 0.513153144 + }, + { + "epoch": 0.2967165449258918, + "grad_norm": 36.89309310913086, + "loss": 4.3947, + "lr": 0.0009237762237762239, + "step": 1046, + "tokens_trained": 0.51413388 + }, + { + "epoch": 0.2972838805758457, + "grad_norm": 58.599700927734375, + "loss": 4.3988, + "lr": 0.0009234965034965035, + "step": 1048, + "tokens_trained": 0.515119288 + }, + { + "epoch": 0.2978512162257996, + "grad_norm": 13.725994110107422, + "loss": 4.412, + "lr": 0.0009232167832167832, + "step": 1050, + "tokens_trained": 0.51610284 + }, + { + "epoch": 0.2984185518757535, + "grad_norm": 105.28518676757812, + "loss": 4.7305, + "lr": 0.0009229370629370629, + "step": 1052, + "tokens_trained": 0.517085576 + }, + { + "epoch": 0.2989858875257074, + "grad_norm": 29.499713897705078, + "loss": 4.5106, + "lr": 0.0009226573426573427, + "step": 1054, + "tokens_trained": 0.518064224 + }, + { + "epoch": 0.2995532231756613, + "grad_norm": 60.907203674316406, + "loss": 4.5249, + "lr": 0.0009223776223776224, + "step": 1056, + "tokens_trained": 0.51905084 + }, + { + "epoch": 0.3001205588256152, + "grad_norm": 39.825069427490234, + "loss": 4.3695, + "lr": 0.0009220979020979021, + "step": 1058, + "tokens_trained": 0.5200318 + }, + { + "epoch": 0.3006878944755691, + "grad_norm": 42.77061462402344, + "loss": 4.4094, + "lr": 0.0009218181818181819, + "step": 1060, + "tokens_trained": 0.521013568 + }, + { + "epoch": 0.301255230125523, + "grad_norm": 37.05888748168945, + "loss": 4.3684, + "lr": 0.0009215384615384616, + "step": 1062, + "tokens_trained": 0.521997624 + }, + { + "epoch": 0.3018225657754769, + "grad_norm": 42.28252029418945, + "loss": 4.3489, + "lr": 0.0009212587412587413, + "step": 1064, + "tokens_trained": 0.522986184 + }, + { + "epoch": 0.3023899014254308, + "grad_norm": 40.95197677612305, + "loss": 4.3564, + "lr": 0.000920979020979021, + "step": 1066, + "tokens_trained": 0.523970984 + }, + { + "epoch": 0.30295723707538474, + "grad_norm": 25.469568252563477, + "loss": 4.3833, + "lr": 0.0009206993006993007, + "step": 1068, + "tokens_trained": 0.524952808 + }, + { + "epoch": 0.3035245727253386, + "grad_norm": 29.921735763549805, + "loss": 4.3579, + "lr": 0.0009204195804195804, + "step": 1070, + "tokens_trained": 0.525935696 + }, + { + "epoch": 0.30409190837529254, + "grad_norm": 26.038026809692383, + "loss": 4.2898, + "lr": 0.0009201398601398602, + "step": 1072, + "tokens_trained": 0.526916904 + }, + { + "epoch": 0.3046592440252464, + "grad_norm": 32.59503936767578, + "loss": 4.3335, + "lr": 0.0009198601398601398, + "step": 1074, + "tokens_trained": 0.527899864 + }, + { + "epoch": 0.30522657967520034, + "grad_norm": 14.04964828491211, + "loss": 4.3171, + "lr": 0.0009195804195804196, + "step": 1076, + "tokens_trained": 0.528878176 + }, + { + "epoch": 0.30579391532515426, + "grad_norm": 15.936906814575195, + "loss": 4.3005, + "lr": 0.0009193006993006993, + "step": 1078, + "tokens_trained": 0.529859952 + }, + { + "epoch": 0.30636125097510813, + "grad_norm": 9.73235034942627, + "loss": 4.3287, + "lr": 0.0009190209790209791, + "step": 1080, + "tokens_trained": 0.530838192 + }, + { + "epoch": 0.30692858662506206, + "grad_norm": 45.44027328491211, + "loss": 4.4384, + "lr": 0.0009187412587412588, + "step": 1082, + "tokens_trained": 0.531818376 + }, + { + "epoch": 0.30749592227501593, + "grad_norm": 55.65925598144531, + "loss": 4.3772, + "lr": 0.0009184615384615385, + "step": 1084, + "tokens_trained": 0.532802048 + }, + { + "epoch": 0.30806325792496986, + "grad_norm": 33.47093200683594, + "loss": 4.4257, + "lr": 0.0009181818181818182, + "step": 1086, + "tokens_trained": 0.533785376 + }, + { + "epoch": 0.3086305935749238, + "grad_norm": 39.709224700927734, + "loss": 4.4177, + "lr": 0.0009179020979020978, + "step": 1088, + "tokens_trained": 0.5347698 + }, + { + "epoch": 0.30919792922487765, + "grad_norm": 34.25212097167969, + "loss": 4.3518, + "lr": 0.0009176223776223777, + "step": 1090, + "tokens_trained": 0.53575108 + }, + { + "epoch": 0.3097652648748316, + "grad_norm": 29.156312942504883, + "loss": 4.3596, + "lr": 0.0009173426573426573, + "step": 1092, + "tokens_trained": 0.536735544 + }, + { + "epoch": 0.31033260052478545, + "grad_norm": 31.714128494262695, + "loss": 4.3736, + "lr": 0.0009170629370629371, + "step": 1094, + "tokens_trained": 0.537718008 + }, + { + "epoch": 0.3108999361747394, + "grad_norm": 12.244729042053223, + "loss": 4.3472, + "lr": 0.0009167832167832168, + "step": 1096, + "tokens_trained": 0.538693512 + }, + { + "epoch": 0.3114672718246933, + "grad_norm": 10.271063804626465, + "loss": 4.301, + "lr": 0.0009165034965034966, + "step": 1098, + "tokens_trained": 0.539681376 + }, + { + "epoch": 0.3120346074746472, + "grad_norm": 35.79754638671875, + "loss": 4.3912, + "lr": 0.0009162237762237763, + "step": 1100, + "tokens_trained": 0.540661392 + }, + { + "epoch": 0.3126019431246011, + "grad_norm": 24.1260986328125, + "loss": 4.3303, + "lr": 0.0009159440559440559, + "step": 1102, + "tokens_trained": 0.541646968 + }, + { + "epoch": 0.31316927877455497, + "grad_norm": 24.501169204711914, + "loss": 4.3205, + "lr": 0.0009156643356643357, + "step": 1104, + "tokens_trained": 0.542629392 + }, + { + "epoch": 0.3137366144245089, + "grad_norm": 17.031600952148438, + "loss": 4.2521, + "lr": 0.0009153846153846153, + "step": 1106, + "tokens_trained": 0.54361348 + }, + { + "epoch": 0.3143039500744628, + "grad_norm": 19.506216049194336, + "loss": 4.3225, + "lr": 0.0009151048951048952, + "step": 1108, + "tokens_trained": 0.544595336 + }, + { + "epoch": 0.3148712857244167, + "grad_norm": 20.822546005249023, + "loss": 4.2711, + "lr": 0.0009148251748251748, + "step": 1110, + "tokens_trained": 0.545578256 + }, + { + "epoch": 0.3154386213743706, + "grad_norm": 29.967998504638672, + "loss": 4.2868, + "lr": 0.0009145454545454546, + "step": 1112, + "tokens_trained": 0.546561024 + }, + { + "epoch": 0.3160059570243245, + "grad_norm": 24.06121063232422, + "loss": 4.2701, + "lr": 0.0009142657342657343, + "step": 1114, + "tokens_trained": 0.547544616 + }, + { + "epoch": 0.3165732926742784, + "grad_norm": 15.868765830993652, + "loss": 4.3233, + "lr": 0.000913986013986014, + "step": 1116, + "tokens_trained": 0.548526216 + }, + { + "epoch": 0.31714062832423234, + "grad_norm": 27.47897720336914, + "loss": 4.2813, + "lr": 0.0009137062937062938, + "step": 1118, + "tokens_trained": 0.549506544 + }, + { + "epoch": 0.3177079639741862, + "grad_norm": 15.343204498291016, + "loss": 4.3002, + "lr": 0.0009134265734265734, + "step": 1120, + "tokens_trained": 0.550488496 + }, + { + "epoch": 0.31827529962414014, + "grad_norm": 4.320124626159668, + "loss": 4.2622, + "lr": 0.0009131468531468532, + "step": 1122, + "tokens_trained": 0.551471792 + }, + { + "epoch": 0.318842635274094, + "grad_norm": 34.520050048828125, + "loss": 4.366, + "lr": 0.0009128671328671328, + "step": 1124, + "tokens_trained": 0.552457008 + }, + { + "epoch": 0.319126303099071, + "eval_loss": 1.096465826034546, + "eval_runtime": 20.7643, + "step": 1125, + "tokens_trained": 0.552948064 + }, + { + "epoch": 0.31940997092404794, + "grad_norm": 39.718719482421875, + "loss": 4.3317, + "lr": 0.0009125874125874127, + "step": 1126, + "tokens_trained": 0.5534394 + }, + { + "epoch": 0.31997730657400186, + "grad_norm": 20.843252182006836, + "loss": 4.3883, + "lr": 0.0009123076923076923, + "step": 1128, + "tokens_trained": 0.554419184 + }, + { + "epoch": 0.32054464222395573, + "grad_norm": 12.916360855102539, + "loss": 4.3119, + "lr": 0.000912027972027972, + "step": 1130, + "tokens_trained": 0.555401952 + }, + { + "epoch": 0.32111197787390966, + "grad_norm": 48.54426956176758, + "loss": 4.4155, + "lr": 0.0009117482517482518, + "step": 1132, + "tokens_trained": 0.556385024 + }, + { + "epoch": 0.32167931352386353, + "grad_norm": 41.00883483886719, + "loss": 4.362, + "lr": 0.0009114685314685315, + "step": 1134, + "tokens_trained": 0.557368472 + }, + { + "epoch": 0.32224664917381746, + "grad_norm": 28.0487060546875, + "loss": 4.3504, + "lr": 0.0009111888111888113, + "step": 1136, + "tokens_trained": 0.55835288 + }, + { + "epoch": 0.3228139848237714, + "grad_norm": 22.05229377746582, + "loss": 4.331, + "lr": 0.0009109090909090909, + "step": 1138, + "tokens_trained": 0.559337064 + }, + { + "epoch": 0.32338132047372525, + "grad_norm": 16.770631790161133, + "loss": 4.3008, + "lr": 0.0009106293706293707, + "step": 1140, + "tokens_trained": 0.560317984 + }, + { + "epoch": 0.3239486561236792, + "grad_norm": 35.300262451171875, + "loss": 4.4083, + "lr": 0.0009103496503496503, + "step": 1142, + "tokens_trained": 0.561299688 + }, + { + "epoch": 0.32451599177363305, + "grad_norm": 23.788284301757812, + "loss": 4.2772, + "lr": 0.0009100699300699301, + "step": 1144, + "tokens_trained": 0.562285664 + }, + { + "epoch": 0.325083327423587, + "grad_norm": 23.085710525512695, + "loss": 4.3185, + "lr": 0.0009097902097902098, + "step": 1146, + "tokens_trained": 0.563267832 + }, + { + "epoch": 0.3256506630735409, + "grad_norm": 13.11314582824707, + "loss": 4.2711, + "lr": 0.0009095104895104895, + "step": 1148, + "tokens_trained": 0.564248928 + }, + { + "epoch": 0.3262179987234948, + "grad_norm": 31.297805786132812, + "loss": 4.3096, + "lr": 0.0009092307692307692, + "step": 1150, + "tokens_trained": 0.56522952 + }, + { + "epoch": 0.3267853343734487, + "grad_norm": 11.668539047241211, + "loss": 4.2667, + "lr": 0.000908951048951049, + "step": 1152, + "tokens_trained": 0.566212392 + }, + { + "epoch": 0.32735267002340257, + "grad_norm": 23.359189987182617, + "loss": 4.3156, + "lr": 0.0009086713286713288, + "step": 1154, + "tokens_trained": 0.567192216 + }, + { + "epoch": 0.3279200056733565, + "grad_norm": 31.09916114807129, + "loss": 4.3367, + "lr": 0.0009083916083916084, + "step": 1156, + "tokens_trained": 0.568177088 + }, + { + "epoch": 0.3284873413233104, + "grad_norm": 24.03261947631836, + "loss": 4.3504, + "lr": 0.0009081118881118881, + "step": 1158, + "tokens_trained": 0.56915868 + }, + { + "epoch": 0.3290546769732643, + "grad_norm": 16.029443740844727, + "loss": 4.3192, + "lr": 0.0009078321678321678, + "step": 1160, + "tokens_trained": 0.570142976 + }, + { + "epoch": 0.3296220126232182, + "grad_norm": 53.486724853515625, + "loss": 4.3921, + "lr": 0.0009075524475524476, + "step": 1162, + "tokens_trained": 0.57112748 + }, + { + "epoch": 0.3301893482731721, + "grad_norm": 37.42267608642578, + "loss": 4.2821, + "lr": 0.0009072727272727273, + "step": 1164, + "tokens_trained": 0.57211356 + }, + { + "epoch": 0.330756683923126, + "grad_norm": 28.862472534179688, + "loss": 4.3002, + "lr": 0.000906993006993007, + "step": 1166, + "tokens_trained": 0.57309492 + }, + { + "epoch": 0.33132401957307994, + "grad_norm": 22.26299476623535, + "loss": 4.2729, + "lr": 0.0009067132867132866, + "step": 1168, + "tokens_trained": 0.5740806 + }, + { + "epoch": 0.3318913552230338, + "grad_norm": 21.635013580322266, + "loss": 4.2866, + "lr": 0.0009064335664335665, + "step": 1170, + "tokens_trained": 0.575061664 + }, + { + "epoch": 0.33245869087298774, + "grad_norm": 18.995012283325195, + "loss": 4.2814, + "lr": 0.0009061538461538462, + "step": 1172, + "tokens_trained": 0.576046304 + }, + { + "epoch": 0.3330260265229416, + "grad_norm": 22.621299743652344, + "loss": 4.2739, + "lr": 0.0009058741258741259, + "step": 1174, + "tokens_trained": 0.577032376 + }, + { + "epoch": 0.33359336217289554, + "grad_norm": 21.758216857910156, + "loss": 4.263, + "lr": 0.0009055944055944056, + "step": 1176, + "tokens_trained": 0.578013896 + }, + { + "epoch": 0.33416069782284946, + "grad_norm": 32.38374710083008, + "loss": 4.2713, + "lr": 0.0009053146853146853, + "step": 1178, + "tokens_trained": 0.57900508 + }, + { + "epoch": 0.33472803347280333, + "grad_norm": 35.57462692260742, + "loss": 4.2986, + "lr": 0.0009050349650349651, + "step": 1180, + "tokens_trained": 0.57999512 + }, + { + "epoch": 0.33529536912275726, + "grad_norm": 11.77812385559082, + "loss": 4.3085, + "lr": 0.0009047552447552448, + "step": 1182, + "tokens_trained": 0.580982752 + }, + { + "epoch": 0.33586270477271113, + "grad_norm": 51.48725509643555, + "loss": 4.4003, + "lr": 0.0009044755244755245, + "step": 1184, + "tokens_trained": 0.581964936 + }, + { + "epoch": 0.33643004042266506, + "grad_norm": 47.01481628417969, + "loss": 4.3182, + "lr": 0.0009041958041958041, + "step": 1186, + "tokens_trained": 0.582949944 + }, + { + "epoch": 0.336997376072619, + "grad_norm": 22.935691833496094, + "loss": 4.3432, + "lr": 0.000903916083916084, + "step": 1188, + "tokens_trained": 0.583934776 + }, + { + "epoch": 0.33756471172257285, + "grad_norm": 45.21054458618164, + "loss": 4.4674, + "lr": 0.0009036363636363637, + "step": 1190, + "tokens_trained": 0.584918344 + }, + { + "epoch": 0.3381320473725268, + "grad_norm": 27.012706756591797, + "loss": 4.2889, + "lr": 0.0009033566433566434, + "step": 1192, + "tokens_trained": 0.585897632 + }, + { + "epoch": 0.33869938302248065, + "grad_norm": 16.68247413635254, + "loss": 4.2896, + "lr": 0.0009030769230769231, + "step": 1194, + "tokens_trained": 0.586879408 + }, + { + "epoch": 0.3392667186724346, + "grad_norm": 20.664148330688477, + "loss": 4.304, + "lr": 0.0009027972027972027, + "step": 1196, + "tokens_trained": 0.587859392 + }, + { + "epoch": 0.3398340543223885, + "grad_norm": 22.954742431640625, + "loss": 4.2853, + "lr": 0.0009025174825174826, + "step": 1198, + "tokens_trained": 0.588845408 + }, + { + "epoch": 0.34040138997234237, + "grad_norm": 23.226943969726562, + "loss": 4.2597, + "lr": 0.0009022377622377622, + "step": 1200, + "tokens_trained": 0.589832736 + }, + { + "epoch": 0.3409687256222963, + "grad_norm": 7.963059902191162, + "loss": 4.261, + "lr": 0.000901958041958042, + "step": 1202, + "tokens_trained": 0.590816568 + }, + { + "epoch": 0.34153606127225017, + "grad_norm": 25.160730361938477, + "loss": 4.3288, + "lr": 0.0009016783216783216, + "step": 1204, + "tokens_trained": 0.59179692 + }, + { + "epoch": 0.3421033969222041, + "grad_norm": 38.45030212402344, + "loss": 4.3371, + "lr": 0.0009013986013986014, + "step": 1206, + "tokens_trained": 0.592780968 + }, + { + "epoch": 0.342670732572158, + "grad_norm": 52.66873550415039, + "loss": 4.2805, + "lr": 0.0009011188811188812, + "step": 1208, + "tokens_trained": 0.593760896 + }, + { + "epoch": 0.3432380682221119, + "grad_norm": 28.104921340942383, + "loss": 4.3885, + "lr": 0.0009008391608391609, + "step": 1210, + "tokens_trained": 0.59474304 + }, + { + "epoch": 0.3438054038720658, + "grad_norm": 49.20989990234375, + "loss": 4.346, + "lr": 0.0009005594405594406, + "step": 1212, + "tokens_trained": 0.59572768 + }, + { + "epoch": 0.3443727395220197, + "grad_norm": 20.652427673339844, + "loss": 4.2368, + "lr": 0.0009002797202797202, + "step": 1214, + "tokens_trained": 0.59671092 + }, + { + "epoch": 0.3449400751719736, + "grad_norm": 17.821596145629883, + "loss": 4.3041, + "lr": 0.0009000000000000001, + "step": 1216, + "tokens_trained": 0.597697344 + }, + { + "epoch": 0.34550741082192754, + "grad_norm": 48.594932556152344, + "loss": 4.3668, + "lr": 0.0008997202797202797, + "step": 1218, + "tokens_trained": 0.598677288 + }, + { + "epoch": 0.3460747464718814, + "grad_norm": 27.70078468322754, + "loss": 4.2939, + "lr": 0.0008994405594405595, + "step": 1220, + "tokens_trained": 0.599662488 + }, + { + "epoch": 0.34664208212183534, + "grad_norm": 25.498798370361328, + "loss": 4.2891, + "lr": 0.0008991608391608391, + "step": 1222, + "tokens_trained": 0.600646904 + }, + { + "epoch": 0.3472094177717892, + "grad_norm": 13.455835342407227, + "loss": 4.2881, + "lr": 0.0008988811188811188, + "step": 1224, + "tokens_trained": 0.601628112 + }, + { + "epoch": 0.34777675342174313, + "grad_norm": 17.518342971801758, + "loss": 4.2977, + "lr": 0.0008986013986013987, + "step": 1226, + "tokens_trained": 0.602612336 + }, + { + "epoch": 0.34834408907169706, + "grad_norm": 20.642597198486328, + "loss": 4.2921, + "lr": 0.0008983216783216783, + "step": 1228, + "tokens_trained": 0.603595 + }, + { + "epoch": 0.34891142472165093, + "grad_norm": 14.464616775512695, + "loss": 4.233, + "lr": 0.0008980419580419581, + "step": 1230, + "tokens_trained": 0.604576592 + }, + { + "epoch": 0.34947876037160486, + "grad_norm": 13.204504013061523, + "loss": 4.2707, + "lr": 0.0008977622377622377, + "step": 1232, + "tokens_trained": 0.60555656 + }, + { + "epoch": 0.35004609602155873, + "grad_norm": 12.241665840148926, + "loss": 4.2506, + "lr": 0.0008974825174825176, + "step": 1234, + "tokens_trained": 0.606536024 + }, + { + "epoch": 0.35061343167151265, + "grad_norm": 18.187660217285156, + "loss": 4.2659, + "lr": 0.0008972027972027972, + "step": 1236, + "tokens_trained": 0.607522576 + }, + { + "epoch": 0.3511807673214666, + "grad_norm": 8.911888122558594, + "loss": 4.2505, + "lr": 0.000896923076923077, + "step": 1238, + "tokens_trained": 0.608507736 + }, + { + "epoch": 0.35174810297142045, + "grad_norm": 21.351713180541992, + "loss": 4.2291, + "lr": 0.0008966433566433566, + "step": 1240, + "tokens_trained": 0.609486688 + }, + { + "epoch": 0.3523154386213744, + "grad_norm": 47.81566619873047, + "loss": 4.2725, + "lr": 0.0008963636363636363, + "step": 1242, + "tokens_trained": 0.610470272 + }, + { + "epoch": 0.35288277427132825, + "grad_norm": 33.53351974487305, + "loss": 4.3237, + "lr": 0.0008960839160839162, + "step": 1244, + "tokens_trained": 0.611455176 + }, + { + "epoch": 0.3534501099212822, + "grad_norm": 15.252607345581055, + "loss": 4.2868, + "lr": 0.0008958041958041958, + "step": 1246, + "tokens_trained": 0.612437888 + }, + { + "epoch": 0.3540174455712361, + "grad_norm": 24.129865646362305, + "loss": 4.2626, + "lr": 0.0008955244755244756, + "step": 1248, + "tokens_trained": 0.613420728 + }, + { + "epoch": 0.35458478122118997, + "grad_norm": 34.814605712890625, + "loss": 4.2627, + "lr": 0.0008952447552447552, + "step": 1250, + "tokens_trained": 0.614405904 + }, + { + "epoch": 0.35458478122118997, + "eval_loss": 1.078355312347412, + "eval_runtime": 20.4723, + "step": 1250, + "tokens_trained": 0.614405904 + }, + { + "epoch": 0.3551521168711439, + "grad_norm": 18.26809310913086, + "loss": 4.2986, + "lr": 0.000894965034965035, + "step": 1252, + "tokens_trained": 0.615386288 + }, + { + "epoch": 0.35571945252109777, + "grad_norm": 24.68335723876953, + "loss": 4.3146, + "lr": 0.0008946853146853147, + "step": 1254, + "tokens_trained": 0.616370576 + }, + { + "epoch": 0.3562867881710517, + "grad_norm": 35.34586715698242, + "loss": 4.2905, + "lr": 0.0008944055944055944, + "step": 1256, + "tokens_trained": 0.617351944 + }, + { + "epoch": 0.3568541238210056, + "grad_norm": 22.668407440185547, + "loss": 4.2607, + "lr": 0.0008941258741258741, + "step": 1258, + "tokens_trained": 0.618334816 + }, + { + "epoch": 0.3574214594709595, + "grad_norm": 14.068164825439453, + "loss": 4.2459, + "lr": 0.0008938461538461538, + "step": 1260, + "tokens_trained": 0.619319736 + }, + { + "epoch": 0.3579887951209134, + "grad_norm": 8.274995803833008, + "loss": 4.2713, + "lr": 0.0008935664335664337, + "step": 1262, + "tokens_trained": 0.620299344 + }, + { + "epoch": 0.3585561307708673, + "grad_norm": 22.12897491455078, + "loss": 4.2841, + "lr": 0.0008932867132867133, + "step": 1264, + "tokens_trained": 0.621282592 + }, + { + "epoch": 0.3591234664208212, + "grad_norm": 26.171052932739258, + "loss": 4.2505, + "lr": 0.000893006993006993, + "step": 1266, + "tokens_trained": 0.622266136 + }, + { + "epoch": 0.35969080207077514, + "grad_norm": 14.768603324890137, + "loss": 4.271, + "lr": 0.0008927272727272727, + "step": 1268, + "tokens_trained": 0.623247816 + }, + { + "epoch": 0.360258137720729, + "grad_norm": 13.065408706665039, + "loss": 4.2387, + "lr": 0.0008924475524475525, + "step": 1270, + "tokens_trained": 0.624234848 + }, + { + "epoch": 0.36082547337068294, + "grad_norm": 14.043888092041016, + "loss": 4.2601, + "lr": 0.0008921678321678322, + "step": 1272, + "tokens_trained": 0.625214176 + }, + { + "epoch": 0.3613928090206368, + "grad_norm": 13.734328269958496, + "loss": 4.2426, + "lr": 0.0008918881118881119, + "step": 1274, + "tokens_trained": 0.626197608 + }, + { + "epoch": 0.36196014467059073, + "grad_norm": 10.075374603271484, + "loss": 4.2259, + "lr": 0.0008916083916083916, + "step": 1276, + "tokens_trained": 0.62717884 + }, + { + "epoch": 0.36252748032054466, + "grad_norm": 33.92001724243164, + "loss": 4.3054, + "lr": 0.0008913286713286713, + "step": 1278, + "tokens_trained": 0.628166888 + }, + { + "epoch": 0.36309481597049853, + "grad_norm": 31.1391544342041, + "loss": 4.3066, + "lr": 0.0008910489510489512, + "step": 1280, + "tokens_trained": 0.629152528 + }, + { + "epoch": 0.36366215162045246, + "grad_norm": 10.888711929321289, + "loss": 4.2348, + "lr": 0.0008907692307692308, + "step": 1282, + "tokens_trained": 0.630132584 + }, + { + "epoch": 0.3642294872704063, + "grad_norm": 27.298410415649414, + "loss": 4.3225, + "lr": 0.0008904895104895105, + "step": 1284, + "tokens_trained": 0.63111212 + }, + { + "epoch": 0.36479682292036025, + "grad_norm": 23.396818161010742, + "loss": 4.3177, + "lr": 0.0008902097902097902, + "step": 1286, + "tokens_trained": 0.632094984 + }, + { + "epoch": 0.3653641585703142, + "grad_norm": 18.824432373046875, + "loss": 4.2235, + "lr": 0.00088993006993007, + "step": 1288, + "tokens_trained": 0.633076832 + }, + { + "epoch": 0.36593149422026805, + "grad_norm": 8.04826545715332, + "loss": 4.2268, + "lr": 0.0008896503496503497, + "step": 1290, + "tokens_trained": 0.63405868 + }, + { + "epoch": 0.366498829870222, + "grad_norm": 32.26673889160156, + "loss": 4.3113, + "lr": 0.0008893706293706294, + "step": 1292, + "tokens_trained": 0.635045096 + }, + { + "epoch": 0.36706616552017585, + "grad_norm": 29.91358184814453, + "loss": 4.2971, + "lr": 0.000889090909090909, + "step": 1294, + "tokens_trained": 0.63603008 + }, + { + "epoch": 0.3676335011701298, + "grad_norm": 12.093538284301758, + "loss": 4.2502, + "lr": 0.0008888111888111888, + "step": 1296, + "tokens_trained": 0.637014016 + }, + { + "epoch": 0.3682008368200837, + "grad_norm": 8.252509117126465, + "loss": 4.2905, + "lr": 0.0008885314685314686, + "step": 1298, + "tokens_trained": 0.637997752 + }, + { + "epoch": 0.36876817247003757, + "grad_norm": 61.22240447998047, + "loss": 4.4753, + "lr": 0.0008882517482517483, + "step": 1300, + "tokens_trained": 0.638981552 + }, + { + "epoch": 0.3693355081199915, + "grad_norm": 47.58195877075195, + "loss": 4.2769, + "lr": 0.000887972027972028, + "step": 1302, + "tokens_trained": 0.639963512 + }, + { + "epoch": 0.36990284376994537, + "grad_norm": 28.806411743164062, + "loss": 4.3728, + "lr": 0.0008876923076923077, + "step": 1304, + "tokens_trained": 0.640948392 + }, + { + "epoch": 0.3704701794198993, + "grad_norm": 38.960853576660156, + "loss": 4.338, + "lr": 0.0008874125874125875, + "step": 1306, + "tokens_trained": 0.641935304 + }, + { + "epoch": 0.3710375150698532, + "grad_norm": 25.05726432800293, + "loss": 4.3002, + "lr": 0.0008871328671328671, + "step": 1308, + "tokens_trained": 0.642924168 + }, + { + "epoch": 0.3716048507198071, + "grad_norm": 39.84127426147461, + "loss": 4.3593, + "lr": 0.0008868531468531469, + "step": 1310, + "tokens_trained": 0.64390412 + }, + { + "epoch": 0.372172186369761, + "grad_norm": 15.03055191040039, + "loss": 4.223, + "lr": 0.0008865734265734265, + "step": 1312, + "tokens_trained": 0.644882104 + }, + { + "epoch": 0.3727395220197149, + "grad_norm": 41.85628890991211, + "loss": 4.3819, + "lr": 0.0008862937062937063, + "step": 1314, + "tokens_trained": 0.645866912 + }, + { + "epoch": 0.3733068576696688, + "grad_norm": 29.014118194580078, + "loss": 4.2843, + "lr": 0.0008860139860139861, + "step": 1316, + "tokens_trained": 0.646850376 + }, + { + "epoch": 0.37387419331962274, + "grad_norm": 24.407743453979492, + "loss": 4.2598, + "lr": 0.0008857342657342658, + "step": 1318, + "tokens_trained": 0.647832272 + }, + { + "epoch": 0.3744415289695766, + "grad_norm": 23.28154182434082, + "loss": 4.2162, + "lr": 0.0008854545454545455, + "step": 1320, + "tokens_trained": 0.64881652 + }, + { + "epoch": 0.37500886461953054, + "grad_norm": 17.70418930053711, + "loss": 4.2386, + "lr": 0.0008851748251748251, + "step": 1322, + "tokens_trained": 0.649794936 + }, + { + "epoch": 0.37557620026948446, + "grad_norm": 22.582124710083008, + "loss": 4.2358, + "lr": 0.000884895104895105, + "step": 1324, + "tokens_trained": 0.650777784 + }, + { + "epoch": 0.37614353591943833, + "grad_norm": 16.77848243713379, + "loss": 4.2536, + "lr": 0.0008846153846153846, + "step": 1326, + "tokens_trained": 0.651762472 + }, + { + "epoch": 0.37671087156939226, + "grad_norm": 14.382417678833008, + "loss": 4.2403, + "lr": 0.0008843356643356644, + "step": 1328, + "tokens_trained": 0.652741832 + }, + { + "epoch": 0.37727820721934613, + "grad_norm": 22.420886993408203, + "loss": 4.1977, + "lr": 0.000884055944055944, + "step": 1330, + "tokens_trained": 0.653725792 + }, + { + "epoch": 0.37784554286930006, + "grad_norm": 9.768660545349121, + "loss": 4.2148, + "lr": 0.0008837762237762238, + "step": 1332, + "tokens_trained": 0.654704648 + }, + { + "epoch": 0.378412878519254, + "grad_norm": 5.091487407684326, + "loss": 4.2062, + "lr": 0.0008834965034965036, + "step": 1334, + "tokens_trained": 0.65569176 + }, + { + "epoch": 0.37898021416920785, + "grad_norm": 53.520957946777344, + "loss": 4.4082, + "lr": 0.0008832167832167832, + "step": 1336, + "tokens_trained": 0.656679344 + }, + { + "epoch": 0.3795475498191618, + "grad_norm": 32.17420959472656, + "loss": 4.2911, + "lr": 0.000882937062937063, + "step": 1338, + "tokens_trained": 0.657665136 + }, + { + "epoch": 0.38011488546911565, + "grad_norm": 14.12790584564209, + "loss": 4.2899, + "lr": 0.0008826573426573426, + "step": 1340, + "tokens_trained": 0.658651576 + }, + { + "epoch": 0.3806822211190696, + "grad_norm": 51.74199676513672, + "loss": 4.3901, + "lr": 0.0008823776223776225, + "step": 1342, + "tokens_trained": 0.659631792 + }, + { + "epoch": 0.3812495567690235, + "grad_norm": 48.99909973144531, + "loss": 4.298, + "lr": 0.0008820979020979021, + "step": 1344, + "tokens_trained": 0.660616912 + }, + { + "epoch": 0.38181689241897737, + "grad_norm": 28.356245040893555, + "loss": 4.3171, + "lr": 0.0008818181818181819, + "step": 1346, + "tokens_trained": 0.66159872 + }, + { + "epoch": 0.3823842280689313, + "grad_norm": 45.081703186035156, + "loss": 4.3067, + "lr": 0.0008815384615384615, + "step": 1348, + "tokens_trained": 0.662582152 + }, + { + "epoch": 0.38295156371888517, + "grad_norm": 37.175052642822266, + "loss": 4.241, + "lr": 0.0008812587412587412, + "step": 1350, + "tokens_trained": 0.663561176 + }, + { + "epoch": 0.3835188993688391, + "grad_norm": 49.46076965332031, + "loss": 4.2896, + "lr": 0.0008809790209790211, + "step": 1352, + "tokens_trained": 0.664545144 + }, + { + "epoch": 0.384086235018793, + "grad_norm": 22.20182991027832, + "loss": 4.323, + "lr": 0.0008806993006993007, + "step": 1354, + "tokens_trained": 0.66553092 + }, + { + "epoch": 0.3846535706687469, + "grad_norm": 34.111549377441406, + "loss": 4.3138, + "lr": 0.0008804195804195805, + "step": 1356, + "tokens_trained": 0.666517568 + }, + { + "epoch": 0.3852209063187008, + "grad_norm": 47.01582336425781, + "loss": 4.3009, + "lr": 0.0008801398601398601, + "step": 1358, + "tokens_trained": 0.667498192 + }, + { + "epoch": 0.3857882419686547, + "grad_norm": 18.845388412475586, + "loss": 4.3176, + "lr": 0.00087986013986014, + "step": 1360, + "tokens_trained": 0.668479008 + }, + { + "epoch": 0.3863555776186086, + "grad_norm": 53.68927764892578, + "loss": 4.4024, + "lr": 0.0008795804195804196, + "step": 1362, + "tokens_trained": 0.669462472 + }, + { + "epoch": 0.38692291326856254, + "grad_norm": 29.88358497619629, + "loss": 4.286, + "lr": 0.0008793006993006993, + "step": 1364, + "tokens_trained": 0.67044392 + }, + { + "epoch": 0.3874902489185164, + "grad_norm": 11.12879753112793, + "loss": 4.3024, + "lr": 0.000879020979020979, + "step": 1366, + "tokens_trained": 0.671424552 + }, + { + "epoch": 0.38805758456847034, + "grad_norm": 23.573301315307617, + "loss": 4.2662, + "lr": 0.0008787412587412587, + "step": 1368, + "tokens_trained": 0.672409992 + }, + { + "epoch": 0.3886249202184242, + "grad_norm": 24.749160766601562, + "loss": 4.274, + "lr": 0.0008784615384615386, + "step": 1370, + "tokens_trained": 0.67339824 + }, + { + "epoch": 0.38919225586837813, + "grad_norm": 33.26881408691406, + "loss": 4.2588, + "lr": 0.0008781818181818182, + "step": 1372, + "tokens_trained": 0.67438204 + }, + { + "epoch": 0.38975959151833206, + "grad_norm": 24.466472625732422, + "loss": 4.2837, + "lr": 0.000877902097902098, + "step": 1374, + "tokens_trained": 0.67536356 + }, + { + "epoch": 0.39004325934330897, + "eval_loss": 1.0616238117218018, + "eval_runtime": 20.3698, + "step": 1375, + "tokens_trained": 0.675855672 + }, + { + "epoch": 0.39032692716828593, + "grad_norm": 24.48844337463379, + "loss": 4.259, + "lr": 0.0008776223776223776, + "step": 1376, + "tokens_trained": 0.676346368 + }, + { + "epoch": 0.39089426281823986, + "grad_norm": 30.594989776611328, + "loss": 4.1894, + "lr": 0.0008773426573426574, + "step": 1378, + "tokens_trained": 0.677329312 + }, + { + "epoch": 0.3914615984681937, + "grad_norm": 19.835350036621094, + "loss": 4.2718, + "lr": 0.0008770629370629371, + "step": 1380, + "tokens_trained": 0.678312272 + }, + { + "epoch": 0.39202893411814765, + "grad_norm": 14.570358276367188, + "loss": 4.2419, + "lr": 0.0008767832167832168, + "step": 1382, + "tokens_trained": 0.679291216 + }, + { + "epoch": 0.3925962697681016, + "grad_norm": 11.608271598815918, + "loss": 4.1917, + "lr": 0.0008765034965034965, + "step": 1384, + "tokens_trained": 0.680273296 + }, + { + "epoch": 0.39316360541805545, + "grad_norm": 26.094860076904297, + "loss": 4.2762, + "lr": 0.0008762237762237762, + "step": 1386, + "tokens_trained": 0.681249464 + }, + { + "epoch": 0.3937309410680094, + "grad_norm": 12.754049301147461, + "loss": 4.2032, + "lr": 0.0008759440559440561, + "step": 1388, + "tokens_trained": 0.682234168 + }, + { + "epoch": 0.39429827671796325, + "grad_norm": 5.951663970947266, + "loss": 4.1921, + "lr": 0.0008756643356643357, + "step": 1390, + "tokens_trained": 0.683217176 + }, + { + "epoch": 0.3948656123679172, + "grad_norm": 26.907669067382812, + "loss": 4.24, + "lr": 0.0008753846153846154, + "step": 1392, + "tokens_trained": 0.68419888 + }, + { + "epoch": 0.3954329480178711, + "grad_norm": 25.04796600341797, + "loss": 4.2656, + "lr": 0.0008751048951048951, + "step": 1394, + "tokens_trained": 0.685178784 + }, + { + "epoch": 0.39600028366782497, + "grad_norm": 19.600811004638672, + "loss": 4.2683, + "lr": 0.0008748251748251749, + "step": 1396, + "tokens_trained": 0.686161632 + }, + { + "epoch": 0.3965676193177789, + "grad_norm": 14.087088584899902, + "loss": 4.2658, + "lr": 0.0008745454545454546, + "step": 1398, + "tokens_trained": 0.687139992 + }, + { + "epoch": 0.39713495496773277, + "grad_norm": 9.257765769958496, + "loss": 4.2021, + "lr": 0.0008742657342657343, + "step": 1400, + "tokens_trained": 0.688117912 + }, + { + "epoch": 0.3977022906176867, + "grad_norm": 18.830154418945312, + "loss": 4.2249, + "lr": 0.0008739860139860139, + "step": 1402, + "tokens_trained": 0.689098776 + }, + { + "epoch": 0.3982696262676406, + "grad_norm": 24.81566619873047, + "loss": 4.246, + "lr": 0.0008737062937062937, + "step": 1404, + "tokens_trained": 0.690085432 + }, + { + "epoch": 0.3988369619175945, + "grad_norm": 14.071616172790527, + "loss": 4.2531, + "lr": 0.0008734265734265734, + "step": 1406, + "tokens_trained": 0.691069232 + }, + { + "epoch": 0.3994042975675484, + "grad_norm": 21.414424896240234, + "loss": 4.2192, + "lr": 0.0008731468531468532, + "step": 1408, + "tokens_trained": 0.692051224 + }, + { + "epoch": 0.3999716332175023, + "grad_norm": 38.74683380126953, + "loss": 4.2421, + "lr": 0.0008728671328671329, + "step": 1410, + "tokens_trained": 0.693029976 + }, + { + "epoch": 0.4005389688674562, + "grad_norm": 12.595442771911621, + "loss": 4.2569, + "lr": 0.0008725874125874126, + "step": 1412, + "tokens_trained": 0.694013304 + }, + { + "epoch": 0.40110630451741014, + "grad_norm": 55.233673095703125, + "loss": 4.3422, + "lr": 0.0008723076923076924, + "step": 1414, + "tokens_trained": 0.694997536 + }, + { + "epoch": 0.401673640167364, + "grad_norm": 24.717113494873047, + "loss": 4.2567, + "lr": 0.000872027972027972, + "step": 1416, + "tokens_trained": 0.695982632 + }, + { + "epoch": 0.40224097581731794, + "grad_norm": 20.552875518798828, + "loss": 4.2464, + "lr": 0.0008717482517482518, + "step": 1418, + "tokens_trained": 0.696966408 + }, + { + "epoch": 0.4028083114672718, + "grad_norm": 25.569900512695312, + "loss": 4.21, + "lr": 0.0008714685314685314, + "step": 1420, + "tokens_trained": 0.697948224 + }, + { + "epoch": 0.40337564711722573, + "grad_norm": 24.538320541381836, + "loss": 4.2605, + "lr": 0.0008711888111888112, + "step": 1422, + "tokens_trained": 0.698934688 + }, + { + "epoch": 0.40394298276717966, + "grad_norm": 9.585651397705078, + "loss": 4.2524, + "lr": 0.0008709090909090909, + "step": 1424, + "tokens_trained": 0.699921976 + }, + { + "epoch": 0.40451031841713353, + "grad_norm": 11.886672973632812, + "loss": 4.1934, + "lr": 0.0008706293706293707, + "step": 1426, + "tokens_trained": 0.70090396 + }, + { + "epoch": 0.40507765406708746, + "grad_norm": 26.162124633789062, + "loss": 4.2412, + "lr": 0.0008703496503496504, + "step": 1428, + "tokens_trained": 0.701888448 + }, + { + "epoch": 0.4056449897170413, + "grad_norm": 5.03931188583374, + "loss": 4.202, + "lr": 0.00087006993006993, + "step": 1430, + "tokens_trained": 0.702864336 + }, + { + "epoch": 0.40621232536699525, + "grad_norm": 33.67579650878906, + "loss": 4.3087, + "lr": 0.0008697902097902099, + "step": 1432, + "tokens_trained": 0.703847784 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 34.38542556762695, + "loss": 4.2807, + "lr": 0.0008695104895104895, + "step": 1434, + "tokens_trained": 0.704827288 + }, + { + "epoch": 0.40734699666690305, + "grad_norm": 13.319886207580566, + "loss": 4.3332, + "lr": 0.0008692307692307693, + "step": 1436, + "tokens_trained": 0.705815392 + }, + { + "epoch": 0.407914332316857, + "grad_norm": 36.58311080932617, + "loss": 4.3318, + "lr": 0.0008689510489510489, + "step": 1438, + "tokens_trained": 0.7067914 + }, + { + "epoch": 0.40848166796681085, + "grad_norm": 29.63648223876953, + "loss": 4.2962, + "lr": 0.0008686713286713287, + "step": 1440, + "tokens_trained": 0.70777396 + }, + { + "epoch": 0.4090490036167648, + "grad_norm": 9.55128002166748, + "loss": 4.2773, + "lr": 0.0008683916083916084, + "step": 1442, + "tokens_trained": 0.708750496 + }, + { + "epoch": 0.4096163392667187, + "grad_norm": 53.83981704711914, + "loss": 4.3875, + "lr": 0.0008681118881118881, + "step": 1444, + "tokens_trained": 0.709730168 + }, + { + "epoch": 0.41018367491667257, + "grad_norm": 54.59236526489258, + "loss": 4.3582, + "lr": 0.0008678321678321679, + "step": 1446, + "tokens_trained": 0.710709704 + }, + { + "epoch": 0.4107510105666265, + "grad_norm": 13.964411735534668, + "loss": 4.3065, + "lr": 0.0008675524475524475, + "step": 1448, + "tokens_trained": 0.711690136 + }, + { + "epoch": 0.41131834621658037, + "grad_norm": 25.506649017333984, + "loss": 4.2686, + "lr": 0.0008672727272727273, + "step": 1450, + "tokens_trained": 0.712668056 + }, + { + "epoch": 0.4118856818665343, + "grad_norm": 21.1628360748291, + "loss": 4.2485, + "lr": 0.000866993006993007, + "step": 1452, + "tokens_trained": 0.71365004 + }, + { + "epoch": 0.4124530175164882, + "grad_norm": 15.751238822937012, + "loss": 4.2078, + "lr": 0.0008667132867132868, + "step": 1454, + "tokens_trained": 0.714632032 + }, + { + "epoch": 0.4130203531664421, + "grad_norm": 15.838552474975586, + "loss": 4.1944, + "lr": 0.0008664335664335664, + "step": 1456, + "tokens_trained": 0.715611376 + }, + { + "epoch": 0.413587688816396, + "grad_norm": 15.968609809875488, + "loss": 4.1768, + "lr": 0.0008661538461538461, + "step": 1458, + "tokens_trained": 0.716591112 + }, + { + "epoch": 0.4141550244663499, + "grad_norm": 15.419891357421875, + "loss": 4.1978, + "lr": 0.0008658741258741259, + "step": 1460, + "tokens_trained": 0.717575952 + }, + { + "epoch": 0.4147223601163038, + "grad_norm": 15.088132858276367, + "loss": 4.2361, + "lr": 0.0008655944055944056, + "step": 1462, + "tokens_trained": 0.718563696 + }, + { + "epoch": 0.41528969576625774, + "grad_norm": 4.839190483093262, + "loss": 4.2089, + "lr": 0.0008653146853146854, + "step": 1464, + "tokens_trained": 0.71954848 + }, + { + "epoch": 0.4158570314162116, + "grad_norm": 22.192466735839844, + "loss": 4.2109, + "lr": 0.000865034965034965, + "step": 1466, + "tokens_trained": 0.720533304 + }, + { + "epoch": 0.41642436706616553, + "grad_norm": 28.983531951904297, + "loss": 4.2402, + "lr": 0.0008647552447552448, + "step": 1468, + "tokens_trained": 0.721518176 + }, + { + "epoch": 0.4169917027161194, + "grad_norm": 21.010780334472656, + "loss": 4.1732, + "lr": 0.0008644755244755245, + "step": 1470, + "tokens_trained": 0.72250176 + }, + { + "epoch": 0.41755903836607333, + "grad_norm": 14.59277057647705, + "loss": 4.1847, + "lr": 0.0008641958041958042, + "step": 1472, + "tokens_trained": 0.723486664 + }, + { + "epoch": 0.41812637401602726, + "grad_norm": 13.688531875610352, + "loss": 4.1577, + "lr": 0.0008639160839160839, + "step": 1474, + "tokens_trained": 0.724469328 + }, + { + "epoch": 0.41869370966598113, + "grad_norm": 15.879347801208496, + "loss": 4.1721, + "lr": 0.0008636363636363636, + "step": 1476, + "tokens_trained": 0.725454968 + }, + { + "epoch": 0.41926104531593505, + "grad_norm": 10.225201606750488, + "loss": 4.1999, + "lr": 0.0008633566433566434, + "step": 1478, + "tokens_trained": 0.7264426 + }, + { + "epoch": 0.4198283809658889, + "grad_norm": 17.007728576660156, + "loss": 4.2229, + "lr": 0.0008630769230769231, + "step": 1480, + "tokens_trained": 0.727422056 + }, + { + "epoch": 0.42039571661584285, + "grad_norm": 13.517934799194336, + "loss": 4.2241, + "lr": 0.0008627972027972029, + "step": 1482, + "tokens_trained": 0.728403688 + }, + { + "epoch": 0.4209630522657968, + "grad_norm": 17.132064819335938, + "loss": 4.1679, + "lr": 0.0008625174825174825, + "step": 1484, + "tokens_trained": 0.729386248 + }, + { + "epoch": 0.42153038791575065, + "grad_norm": 19.782320022583008, + "loss": 4.1817, + "lr": 0.0008622377622377622, + "step": 1486, + "tokens_trained": 0.730368752 + }, + { + "epoch": 0.4220977235657046, + "grad_norm": 3.388552188873291, + "loss": 4.1726, + "lr": 0.000861958041958042, + "step": 1488, + "tokens_trained": 0.731354304 + }, + { + "epoch": 0.42266505921565845, + "grad_norm": 28.33499526977539, + "loss": 4.2623, + "lr": 0.0008616783216783217, + "step": 1490, + "tokens_trained": 0.732337296 + }, + { + "epoch": 0.42323239486561237, + "grad_norm": 24.927406311035156, + "loss": 4.2422, + "lr": 0.0008613986013986014, + "step": 1492, + "tokens_trained": 0.733319824 + }, + { + "epoch": 0.4237997305155663, + "grad_norm": 25.996028900146484, + "loss": 4.2227, + "lr": 0.0008611188811188811, + "step": 1494, + "tokens_trained": 0.73430636 + }, + { + "epoch": 0.42436706616552017, + "grad_norm": 14.625783920288086, + "loss": 4.2268, + "lr": 0.0008608391608391609, + "step": 1496, + "tokens_trained": 0.735285848 + }, + { + "epoch": 0.4249344018154741, + "grad_norm": 12.556640625, + "loss": 4.2352, + "lr": 0.0008605594405594406, + "step": 1498, + "tokens_trained": 0.736270632 + }, + { + "epoch": 0.42550173746542796, + "grad_norm": 18.579416275024414, + "loss": 4.2377, + "lr": 0.0008602797202797203, + "step": 1500, + "tokens_trained": 0.737255104 + }, + { + "epoch": 0.42550173746542796, + "eval_loss": 1.052606463432312, + "eval_runtime": 20.5089, + "step": 1500, + "tokens_trained": 0.737255104 + }, + { + "epoch": 0.4260690731153819, + "grad_norm": 16.550657272338867, + "loss": 4.182, + "lr": 0.00086, + "step": 1502, + "tokens_trained": 0.738240848 + }, + { + "epoch": 0.4266364087653358, + "grad_norm": 24.4381046295166, + "loss": 4.2093, + "lr": 0.0008597202797202797, + "step": 1504, + "tokens_trained": 0.73922592 + }, + { + "epoch": 0.4272037444152897, + "grad_norm": 13.155163764953613, + "loss": 4.239, + "lr": 0.0008594405594405595, + "step": 1506, + "tokens_trained": 0.740208896 + }, + { + "epoch": 0.4277710800652436, + "grad_norm": 27.667949676513672, + "loss": 4.2607, + "lr": 0.0008591608391608392, + "step": 1508, + "tokens_trained": 0.741189312 + }, + { + "epoch": 0.4283384157151975, + "grad_norm": 35.897743225097656, + "loss": 4.2153, + "lr": 0.0008588811188811188, + "step": 1510, + "tokens_trained": 0.742170456 + }, + { + "epoch": 0.4289057513651514, + "grad_norm": 18.16407012939453, + "loss": 4.2753, + "lr": 0.0008586013986013986, + "step": 1512, + "tokens_trained": 0.743152504 + }, + { + "epoch": 0.42947308701510534, + "grad_norm": 27.447364807128906, + "loss": 4.2321, + "lr": 0.0008583216783216783, + "step": 1514, + "tokens_trained": 0.744139768 + }, + { + "epoch": 0.4300404226650592, + "grad_norm": 21.115859985351562, + "loss": 4.2048, + "lr": 0.0008580419580419581, + "step": 1516, + "tokens_trained": 0.745122368 + }, + { + "epoch": 0.43060775831501313, + "grad_norm": 5.949585914611816, + "loss": 4.1787, + "lr": 0.0008577622377622378, + "step": 1518, + "tokens_trained": 0.746104936 + }, + { + "epoch": 0.431175093964967, + "grad_norm": 6.631585121154785, + "loss": 4.2035, + "lr": 0.0008574825174825175, + "step": 1520, + "tokens_trained": 0.747086264 + }, + { + "epoch": 0.43174242961492093, + "grad_norm": 38.91585159301758, + "loss": 4.354, + "lr": 0.0008572027972027972, + "step": 1522, + "tokens_trained": 0.74806844 + }, + { + "epoch": 0.43230976526487486, + "grad_norm": 37.53727722167969, + "loss": 4.228, + "lr": 0.000856923076923077, + "step": 1524, + "tokens_trained": 0.749052432 + }, + { + "epoch": 0.4328771009148287, + "grad_norm": 19.87713623046875, + "loss": 4.2696, + "lr": 0.0008566433566433567, + "step": 1526, + "tokens_trained": 0.750037072 + }, + { + "epoch": 0.43344443656478265, + "grad_norm": 25.615995407104492, + "loss": 4.2676, + "lr": 0.0008563636363636363, + "step": 1528, + "tokens_trained": 0.751020584 + }, + { + "epoch": 0.4340117722147365, + "grad_norm": 16.643299102783203, + "loss": 4.201, + "lr": 0.0008560839160839161, + "step": 1530, + "tokens_trained": 0.75200224 + }, + { + "epoch": 0.43457910786469045, + "grad_norm": 16.207853317260742, + "loss": 4.1944, + "lr": 0.0008558041958041958, + "step": 1532, + "tokens_trained": 0.752981624 + }, + { + "epoch": 0.4351464435146444, + "grad_norm": 27.054973602294922, + "loss": 4.2188, + "lr": 0.0008555244755244756, + "step": 1534, + "tokens_trained": 0.753968464 + }, + { + "epoch": 0.43571377916459825, + "grad_norm": 33.468238830566406, + "loss": 4.2052, + "lr": 0.0008552447552447553, + "step": 1536, + "tokens_trained": 0.754950976 + }, + { + "epoch": 0.4362811148145522, + "grad_norm": 21.083576202392578, + "loss": 4.2514, + "lr": 0.000854965034965035, + "step": 1538, + "tokens_trained": 0.755938272 + }, + { + "epoch": 0.43684845046450604, + "grad_norm": 19.927122116088867, + "loss": 4.2493, + "lr": 0.0008546853146853147, + "step": 1540, + "tokens_trained": 0.756916784 + }, + { + "epoch": 0.43741578611445997, + "grad_norm": 22.105287551879883, + "loss": 4.2264, + "lr": 0.0008544055944055944, + "step": 1542, + "tokens_trained": 0.757901152 + }, + { + "epoch": 0.4379831217644139, + "grad_norm": 22.448705673217773, + "loss": 4.1987, + "lr": 0.0008541258741258742, + "step": 1544, + "tokens_trained": 0.758886048 + }, + { + "epoch": 0.43855045741436777, + "grad_norm": 17.740005493164062, + "loss": 4.1918, + "lr": 0.0008538461538461538, + "step": 1546, + "tokens_trained": 0.759864304 + }, + { + "epoch": 0.4391177930643217, + "grad_norm": 20.58041763305664, + "loss": 4.2144, + "lr": 0.0008535664335664336, + "step": 1548, + "tokens_trained": 0.760844312 + }, + { + "epoch": 0.43968512871427556, + "grad_norm": 21.937252044677734, + "loss": 4.2129, + "lr": 0.0008532867132867133, + "step": 1550, + "tokens_trained": 0.761827256 + }, + { + "epoch": 0.4402524643642295, + "grad_norm": 26.883426666259766, + "loss": 4.2244, + "lr": 0.000853006993006993, + "step": 1552, + "tokens_trained": 0.7628098 + }, + { + "epoch": 0.4408198000141834, + "grad_norm": 10.297266960144043, + "loss": 4.1724, + "lr": 0.0008527272727272728, + "step": 1554, + "tokens_trained": 0.763792488 + }, + { + "epoch": 0.4413871356641373, + "grad_norm": 12.119601249694824, + "loss": 4.1828, + "lr": 0.0008524475524475524, + "step": 1556, + "tokens_trained": 0.764769936 + }, + { + "epoch": 0.4419544713140912, + "grad_norm": 16.565885543823242, + "loss": 4.2113, + "lr": 0.0008521678321678322, + "step": 1558, + "tokens_trained": 0.765752376 + }, + { + "epoch": 0.4425218069640451, + "grad_norm": 18.860309600830078, + "loss": 4.1864, + "lr": 0.0008518881118881119, + "step": 1560, + "tokens_trained": 0.766736256 + }, + { + "epoch": 0.443089142613999, + "grad_norm": 4.049737453460693, + "loss": 4.2108, + "lr": 0.0008516083916083917, + "step": 1562, + "tokens_trained": 0.767720568 + }, + { + "epoch": 0.44365647826395294, + "grad_norm": 15.730945587158203, + "loss": 4.2339, + "lr": 0.0008513286713286713, + "step": 1564, + "tokens_trained": 0.768701288 + }, + { + "epoch": 0.4442238139139068, + "grad_norm": 18.64398956298828, + "loss": 4.2132, + "lr": 0.000851048951048951, + "step": 1566, + "tokens_trained": 0.769681336 + }, + { + "epoch": 0.44479114956386073, + "grad_norm": 22.01759147644043, + "loss": 4.2211, + "lr": 0.0008507692307692308, + "step": 1568, + "tokens_trained": 0.770661168 + }, + { + "epoch": 0.4453584852138146, + "grad_norm": 3.097306489944458, + "loss": 4.2114, + "lr": 0.0008504895104895105, + "step": 1570, + "tokens_trained": 0.7716424 + }, + { + "epoch": 0.44592582086376853, + "grad_norm": 35.901546478271484, + "loss": 4.3, + "lr": 0.0008502097902097903, + "step": 1572, + "tokens_trained": 0.772627536 + }, + { + "epoch": 0.44649315651372246, + "grad_norm": 20.762710571289062, + "loss": 4.2465, + "lr": 0.0008499300699300699, + "step": 1574, + "tokens_trained": 0.77361008 + }, + { + "epoch": 0.4470604921636763, + "grad_norm": 13.54304027557373, + "loss": 4.221, + "lr": 0.0008496503496503497, + "step": 1576, + "tokens_trained": 0.774591184 + }, + { + "epoch": 0.44762782781363025, + "grad_norm": 18.83641242980957, + "loss": 4.2228, + "lr": 0.0008493706293706294, + "step": 1578, + "tokens_trained": 0.775574136 + }, + { + "epoch": 0.4481951634635841, + "grad_norm": 12.294941902160645, + "loss": 4.1768, + "lr": 0.0008490909090909091, + "step": 1580, + "tokens_trained": 0.776554752 + }, + { + "epoch": 0.44876249911353805, + "grad_norm": 5.768923759460449, + "loss": 4.2255, + "lr": 0.0008488111888111888, + "step": 1582, + "tokens_trained": 0.777539368 + }, + { + "epoch": 0.449329834763492, + "grad_norm": 7.9961137771606445, + "loss": 4.2218, + "lr": 0.0008485314685314685, + "step": 1584, + "tokens_trained": 0.778522344 + }, + { + "epoch": 0.44989717041344585, + "grad_norm": 22.005645751953125, + "loss": 4.2452, + "lr": 0.0008482517482517483, + "step": 1586, + "tokens_trained": 0.77950768 + }, + { + "epoch": 0.45046450606339977, + "grad_norm": 27.313426971435547, + "loss": 4.1875, + "lr": 0.000847972027972028, + "step": 1588, + "tokens_trained": 0.780490984 + }, + { + "epoch": 0.45103184171335364, + "grad_norm": 10.344687461853027, + "loss": 4.2356, + "lr": 0.0008476923076923078, + "step": 1590, + "tokens_trained": 0.781469 + }, + { + "epoch": 0.45159917736330757, + "grad_norm": 27.348726272583008, + "loss": 4.2962, + "lr": 0.0008474125874125874, + "step": 1592, + "tokens_trained": 0.782450304 + }, + { + "epoch": 0.4521665130132615, + "grad_norm": 32.965911865234375, + "loss": 4.2736, + "lr": 0.0008471328671328671, + "step": 1594, + "tokens_trained": 0.783431416 + }, + { + "epoch": 0.45273384866321537, + "grad_norm": 7.752636909484863, + "loss": 4.2074, + "lr": 0.0008468531468531469, + "step": 1596, + "tokens_trained": 0.784409568 + }, + { + "epoch": 0.4533011843131693, + "grad_norm": 38.85223388671875, + "loss": 4.3261, + "lr": 0.0008465734265734266, + "step": 1598, + "tokens_trained": 0.785399368 + }, + { + "epoch": 0.45386851996312316, + "grad_norm": 38.017967224121094, + "loss": 4.2646, + "lr": 0.0008462937062937063, + "step": 1600, + "tokens_trained": 0.786376072 + }, + { + "epoch": 0.4544358556130771, + "grad_norm": 7.856576442718506, + "loss": 4.191, + "lr": 0.000846013986013986, + "step": 1602, + "tokens_trained": 0.787362072 + }, + { + "epoch": 0.455003191263031, + "grad_norm": 37.902870178222656, + "loss": 4.2651, + "lr": 0.0008457342657342658, + "step": 1604, + "tokens_trained": 0.788345104 + }, + { + "epoch": 0.4555705269129849, + "grad_norm": 7.724793434143066, + "loss": 4.1994, + "lr": 0.0008454545454545455, + "step": 1606, + "tokens_trained": 0.7893314 + }, + { + "epoch": 0.4561378625629388, + "grad_norm": 26.484699249267578, + "loss": 4.2276, + "lr": 0.0008451748251748252, + "step": 1608, + "tokens_trained": 0.790309344 + }, + { + "epoch": 0.4567051982128927, + "grad_norm": 23.137874603271484, + "loss": 4.2082, + "lr": 0.0008448951048951049, + "step": 1610, + "tokens_trained": 0.791295784 + }, + { + "epoch": 0.4572725338628466, + "grad_norm": 13.902606964111328, + "loss": 4.2035, + "lr": 0.0008446153846153846, + "step": 1612, + "tokens_trained": 0.79228076 + }, + { + "epoch": 0.45783986951280053, + "grad_norm": 8.438498497009277, + "loss": 4.1713, + "lr": 0.0008443356643356644, + "step": 1614, + "tokens_trained": 0.793265456 + }, + { + "epoch": 0.4584072051627544, + "grad_norm": 11.60899829864502, + "loss": 4.1971, + "lr": 0.0008440559440559441, + "step": 1616, + "tokens_trained": 0.794245896 + }, + { + "epoch": 0.45897454081270833, + "grad_norm": 19.33312225341797, + "loss": 4.2328, + "lr": 0.0008437762237762238, + "step": 1618, + "tokens_trained": 0.795229016 + }, + { + "epoch": 0.4595418764626622, + "grad_norm": 16.45014190673828, + "loss": 4.2277, + "lr": 0.0008434965034965035, + "step": 1620, + "tokens_trained": 0.79620792 + }, + { + "epoch": 0.46010921211261613, + "grad_norm": 9.818867683410645, + "loss": 4.1494, + "lr": 0.0008432167832167832, + "step": 1622, + "tokens_trained": 0.797192352 + }, + { + "epoch": 0.46067654776257005, + "grad_norm": 7.920058250427246, + "loss": 4.2027, + "lr": 0.000842937062937063, + "step": 1624, + "tokens_trained": 0.798174104 + }, + { + "epoch": 0.46096021558754696, + "eval_loss": 1.044265627861023, + "eval_runtime": 20.5617, + "step": 1625, + "tokens_trained": 0.798668072 + }, + { + "epoch": 0.4612438834125239, + "grad_norm": 10.734235763549805, + "loss": 4.1505, + "lr": 0.0008426573426573427, + "step": 1626, + "tokens_trained": 0.799160304 + }, + { + "epoch": 0.46181121906247785, + "grad_norm": 23.376392364501953, + "loss": 4.195, + "lr": 0.0008423776223776224, + "step": 1628, + "tokens_trained": 0.800144144 + }, + { + "epoch": 0.4623785547124317, + "grad_norm": 23.567371368408203, + "loss": 4.2367, + "lr": 0.0008420979020979021, + "step": 1630, + "tokens_trained": 0.801131184 + }, + { + "epoch": 0.46294589036238565, + "grad_norm": 19.271820068359375, + "loss": 4.1899, + "lr": 0.0008418181818181819, + "step": 1632, + "tokens_trained": 0.802111296 + }, + { + "epoch": 0.4635132260123396, + "grad_norm": 17.468698501586914, + "loss": 4.1941, + "lr": 0.0008415384615384616, + "step": 1634, + "tokens_trained": 0.803095112 + }, + { + "epoch": 0.46408056166229344, + "grad_norm": 22.298749923706055, + "loss": 4.2083, + "lr": 0.0008412587412587412, + "step": 1636, + "tokens_trained": 0.804080456 + }, + { + "epoch": 0.46464789731224737, + "grad_norm": 12.506179809570312, + "loss": 4.1953, + "lr": 0.000840979020979021, + "step": 1638, + "tokens_trained": 0.805062464 + }, + { + "epoch": 0.46521523296220124, + "grad_norm": 11.819656372070312, + "loss": 4.2047, + "lr": 0.0008406993006993006, + "step": 1640, + "tokens_trained": 0.806045504 + }, + { + "epoch": 0.46578256861215517, + "grad_norm": 15.925740242004395, + "loss": 4.1565, + "lr": 0.0008404195804195805, + "step": 1642, + "tokens_trained": 0.80702736 + }, + { + "epoch": 0.4663499042621091, + "grad_norm": 15.869892120361328, + "loss": 4.2134, + "lr": 0.0008401398601398602, + "step": 1644, + "tokens_trained": 0.808009192 + }, + { + "epoch": 0.46691723991206296, + "grad_norm": 10.851021766662598, + "loss": 4.2041, + "lr": 0.0008398601398601399, + "step": 1646, + "tokens_trained": 0.808994728 + }, + { + "epoch": 0.4674845755620169, + "grad_norm": 8.271230697631836, + "loss": 4.1739, + "lr": 0.0008395804195804196, + "step": 1648, + "tokens_trained": 0.809976448 + }, + { + "epoch": 0.46805191121197076, + "grad_norm": 13.768092155456543, + "loss": 4.1761, + "lr": 0.0008393006993006993, + "step": 1650, + "tokens_trained": 0.810958392 + }, + { + "epoch": 0.4686192468619247, + "grad_norm": 7.760485649108887, + "loss": 4.1826, + "lr": 0.0008390209790209791, + "step": 1652, + "tokens_trained": 0.81194136 + }, + { + "epoch": 0.4691865825118786, + "grad_norm": 13.28488540649414, + "loss": 4.1659, + "lr": 0.0008387412587412587, + "step": 1654, + "tokens_trained": 0.812924984 + }, + { + "epoch": 0.4697539181618325, + "grad_norm": 10.466367721557617, + "loss": 4.1432, + "lr": 0.0008384615384615385, + "step": 1656, + "tokens_trained": 0.813907424 + }, + { + "epoch": 0.4703212538117864, + "grad_norm": 15.40854549407959, + "loss": 4.1625, + "lr": 0.0008381818181818181, + "step": 1658, + "tokens_trained": 0.814888712 + }, + { + "epoch": 0.4708885894617403, + "grad_norm": 20.580612182617188, + "loss": 4.1636, + "lr": 0.000837902097902098, + "step": 1660, + "tokens_trained": 0.815869152 + }, + { + "epoch": 0.4714559251116942, + "grad_norm": 14.908403396606445, + "loss": 4.1763, + "lr": 0.0008376223776223776, + "step": 1662, + "tokens_trained": 0.816852664 + }, + { + "epoch": 0.47202326076164813, + "grad_norm": 10.217529296875, + "loss": 4.1934, + "lr": 0.0008373426573426573, + "step": 1664, + "tokens_trained": 0.817832792 + }, + { + "epoch": 0.472590596411602, + "grad_norm": 15.74150276184082, + "loss": 4.1714, + "lr": 0.0008370629370629371, + "step": 1666, + "tokens_trained": 0.81881728 + }, + { + "epoch": 0.47315793206155593, + "grad_norm": 15.39499282836914, + "loss": 4.2005, + "lr": 0.0008367832167832168, + "step": 1668, + "tokens_trained": 0.819800824 + }, + { + "epoch": 0.4737252677115098, + "grad_norm": 11.585809707641602, + "loss": 4.136, + "lr": 0.0008365034965034966, + "step": 1670, + "tokens_trained": 0.8207856 + }, + { + "epoch": 0.4742926033614637, + "grad_norm": 16.053237915039062, + "loss": 4.1827, + "lr": 0.0008362237762237762, + "step": 1672, + "tokens_trained": 0.821766576 + }, + { + "epoch": 0.47485993901141765, + "grad_norm": 9.23779582977295, + "loss": 4.1159, + "lr": 0.000835944055944056, + "step": 1674, + "tokens_trained": 0.822749696 + }, + { + "epoch": 0.4754272746613715, + "grad_norm": 11.395891189575195, + "loss": 4.17, + "lr": 0.0008356643356643356, + "step": 1676, + "tokens_trained": 0.82373032 + }, + { + "epoch": 0.47599461031132545, + "grad_norm": 17.745365142822266, + "loss": 4.1696, + "lr": 0.0008353846153846154, + "step": 1678, + "tokens_trained": 0.824712192 + }, + { + "epoch": 0.4765619459612793, + "grad_norm": 6.7816572189331055, + "loss": 4.1933, + "lr": 0.0008351048951048951, + "step": 1680, + "tokens_trained": 0.825691208 + }, + { + "epoch": 0.47712928161123325, + "grad_norm": 20.552772521972656, + "loss": 4.1625, + "lr": 0.0008348251748251748, + "step": 1682, + "tokens_trained": 0.826672584 + }, + { + "epoch": 0.4776966172611872, + "grad_norm": 21.632352828979492, + "loss": 4.2061, + "lr": 0.0008345454545454546, + "step": 1684, + "tokens_trained": 0.827654368 + }, + { + "epoch": 0.47826395291114104, + "grad_norm": 17.754596710205078, + "loss": 4.222, + "lr": 0.0008342657342657343, + "step": 1686, + "tokens_trained": 0.828639392 + }, + { + "epoch": 0.47883128856109497, + "grad_norm": 20.73906707763672, + "loss": 4.1679, + "lr": 0.0008339860139860141, + "step": 1688, + "tokens_trained": 0.829627232 + }, + { + "epoch": 0.47939862421104884, + "grad_norm": 28.157238006591797, + "loss": 4.1658, + "lr": 0.0008337062937062937, + "step": 1690, + "tokens_trained": 0.830610904 + }, + { + "epoch": 0.47996595986100277, + "grad_norm": 12.728020668029785, + "loss": 4.1892, + "lr": 0.0008334265734265734, + "step": 1692, + "tokens_trained": 0.831602544 + }, + { + "epoch": 0.4805332955109567, + "grad_norm": 20.21622657775879, + "loss": 4.1453, + "lr": 0.0008331468531468531, + "step": 1694, + "tokens_trained": 0.832584656 + }, + { + "epoch": 0.48110063116091056, + "grad_norm": 18.5329647064209, + "loss": 4.2145, + "lr": 0.0008328671328671329, + "step": 1696, + "tokens_trained": 0.833570472 + }, + { + "epoch": 0.4816679668108645, + "grad_norm": 12.47617244720459, + "loss": 4.1944, + "lr": 0.0008325874125874126, + "step": 1698, + "tokens_trained": 0.834556104 + }, + { + "epoch": 0.48223530246081836, + "grad_norm": 21.34851837158203, + "loss": 4.1754, + "lr": 0.0008323076923076923, + "step": 1700, + "tokens_trained": 0.835540592 + }, + { + "epoch": 0.4828026381107723, + "grad_norm": 13.20995807647705, + "loss": 4.1657, + "lr": 0.000832027972027972, + "step": 1702, + "tokens_trained": 0.836525136 + }, + { + "epoch": 0.4833699737607262, + "grad_norm": 16.77725601196289, + "loss": 4.1905, + "lr": 0.0008317482517482518, + "step": 1704, + "tokens_trained": 0.837509224 + }, + { + "epoch": 0.4839373094106801, + "grad_norm": 15.17611312866211, + "loss": 4.1823, + "lr": 0.0008314685314685315, + "step": 1706, + "tokens_trained": 0.838492472 + }, + { + "epoch": 0.484504645060634, + "grad_norm": 13.06942081451416, + "loss": 4.1732, + "lr": 0.0008311888111888112, + "step": 1708, + "tokens_trained": 0.839471696 + }, + { + "epoch": 0.4850719807105879, + "grad_norm": 10.456578254699707, + "loss": 4.1862, + "lr": 0.0008309090909090909, + "step": 1710, + "tokens_trained": 0.840452808 + }, + { + "epoch": 0.4856393163605418, + "grad_norm": 13.80197525024414, + "loss": 4.1663, + "lr": 0.0008306293706293706, + "step": 1712, + "tokens_trained": 0.841434224 + }, + { + "epoch": 0.48620665201049573, + "grad_norm": 20.076507568359375, + "loss": 4.1436, + "lr": 0.0008303496503496504, + "step": 1714, + "tokens_trained": 0.842415304 + }, + { + "epoch": 0.4867739876604496, + "grad_norm": 5.629086971282959, + "loss": 4.149, + "lr": 0.00083006993006993, + "step": 1716, + "tokens_trained": 0.84339416 + }, + { + "epoch": 0.48734132331040353, + "grad_norm": 13.932148933410645, + "loss": 4.1785, + "lr": 0.0008297902097902098, + "step": 1718, + "tokens_trained": 0.844380472 + }, + { + "epoch": 0.4879086589603574, + "grad_norm": 18.951047897338867, + "loss": 4.216, + "lr": 0.0008295104895104895, + "step": 1720, + "tokens_trained": 0.845366896 + }, + { + "epoch": 0.4884759946103113, + "grad_norm": 21.042476654052734, + "loss": 4.1634, + "lr": 0.0008292307692307693, + "step": 1722, + "tokens_trained": 0.846344792 + }, + { + "epoch": 0.48904333026026525, + "grad_norm": 23.94416618347168, + "loss": 4.1613, + "lr": 0.000828951048951049, + "step": 1724, + "tokens_trained": 0.847323608 + }, + { + "epoch": 0.4896106659102191, + "grad_norm": 5.057071208953857, + "loss": 4.1729, + "lr": 0.0008286713286713287, + "step": 1726, + "tokens_trained": 0.848304856 + }, + { + "epoch": 0.49017800156017305, + "grad_norm": 18.068674087524414, + "loss": 4.2194, + "lr": 0.0008283916083916084, + "step": 1728, + "tokens_trained": 0.849287712 + }, + { + "epoch": 0.4907453372101269, + "grad_norm": 11.621233940124512, + "loss": 4.2232, + "lr": 0.000828111888111888, + "step": 1730, + "tokens_trained": 0.850268968 + }, + { + "epoch": 0.49131267286008085, + "grad_norm": 12.939676284790039, + "loss": 4.2003, + "lr": 0.0008278321678321679, + "step": 1732, + "tokens_trained": 0.851256528 + }, + { + "epoch": 0.49188000851003477, + "grad_norm": 10.638157844543457, + "loss": 4.1975, + "lr": 0.0008275524475524475, + "step": 1734, + "tokens_trained": 0.852240824 + }, + { + "epoch": 0.49244734415998864, + "grad_norm": 6.2671003341674805, + "loss": 4.1617, + "lr": 0.0008272727272727273, + "step": 1736, + "tokens_trained": 0.853224768 + }, + { + "epoch": 0.49301467980994257, + "grad_norm": 12.318375587463379, + "loss": 4.1939, + "lr": 0.000826993006993007, + "step": 1738, + "tokens_trained": 0.8542062 + }, + { + "epoch": 0.49358201545989644, + "grad_norm": 17.275348663330078, + "loss": 4.1911, + "lr": 0.0008267132867132868, + "step": 1740, + "tokens_trained": 0.855192024 + }, + { + "epoch": 0.49414935110985037, + "grad_norm": 11.122747421264648, + "loss": 4.17, + "lr": 0.0008264335664335665, + "step": 1742, + "tokens_trained": 0.856172136 + }, + { + "epoch": 0.4947166867598043, + "grad_norm": 6.223485469818115, + "loss": 4.1774, + "lr": 0.0008261538461538461, + "step": 1744, + "tokens_trained": 0.857156312 + }, + { + "epoch": 0.49528402240975816, + "grad_norm": 14.62152099609375, + "loss": 4.1607, + "lr": 0.0008258741258741259, + "step": 1746, + "tokens_trained": 0.858140152 + }, + { + "epoch": 0.4958513580597121, + "grad_norm": 15.991989135742188, + "loss": 4.1825, + "lr": 0.0008255944055944055, + "step": 1748, + "tokens_trained": 0.85912524 + }, + { + "epoch": 0.49641869370966596, + "grad_norm": 28.88335418701172, + "loss": 4.2244, + "lr": 0.0008253146853146854, + "step": 1750, + "tokens_trained": 0.860105784 + }, + { + "epoch": 0.49641869370966596, + "eval_loss": 1.061833143234253, + "eval_runtime": 20.4841, + "step": 1750, + "tokens_trained": 0.860105784 + }, + { + "epoch": 0.4969860293596199, + "grad_norm": 14.708030700683594, + "loss": 4.2036, + "lr": 0.000825034965034965, + "step": 1752, + "tokens_trained": 0.861089272 + }, + { + "epoch": 0.4975533650095738, + "grad_norm": 24.67535400390625, + "loss": 4.2405, + "lr": 0.0008247552447552448, + "step": 1754, + "tokens_trained": 0.862066656 + }, + { + "epoch": 0.4981207006595277, + "grad_norm": 10.923722267150879, + "loss": 4.1713, + "lr": 0.0008244755244755245, + "step": 1756, + "tokens_trained": 0.863049256 + }, + { + "epoch": 0.4986880363094816, + "grad_norm": 8.88796615600586, + "loss": 4.1834, + "lr": 0.0008241958041958042, + "step": 1758, + "tokens_trained": 0.864029352 + }, + { + "epoch": 0.4992553719594355, + "grad_norm": 34.90485382080078, + "loss": 4.2338, + "lr": 0.000823916083916084, + "step": 1760, + "tokens_trained": 0.865013008 + }, + { + "epoch": 0.4998227076093894, + "grad_norm": 36.34440612792969, + "loss": 4.2012, + "lr": 0.0008236363636363636, + "step": 1762, + "tokens_trained": 0.86599204 + }, + { + "epoch": 0.5003900432593433, + "grad_norm": 27.913984298706055, + "loss": 4.269, + "lr": 0.0008233566433566434, + "step": 1764, + "tokens_trained": 0.866975456 + }, + { + "epoch": 0.5009573789092973, + "grad_norm": 28.236122131347656, + "loss": 4.2413, + "lr": 0.000823076923076923, + "step": 1766, + "tokens_trained": 0.867963912 + }, + { + "epoch": 0.5015247145592511, + "grad_norm": 18.181337356567383, + "loss": 4.2088, + "lr": 0.0008227972027972029, + "step": 1768, + "tokens_trained": 0.86894656 + }, + { + "epoch": 0.502092050209205, + "grad_norm": 17.403850555419922, + "loss": 4.1854, + "lr": 0.0008225174825174825, + "step": 1770, + "tokens_trained": 0.869932592 + }, + { + "epoch": 0.5026593858591589, + "grad_norm": 15.002805709838867, + "loss": 4.1897, + "lr": 0.0008222377622377622, + "step": 1772, + "tokens_trained": 0.87091592 + }, + { + "epoch": 0.5032267215091129, + "grad_norm": 6.787586688995361, + "loss": 4.1625, + "lr": 0.000821958041958042, + "step": 1774, + "tokens_trained": 0.871899144 + }, + { + "epoch": 0.5037940571590668, + "grad_norm": 6.255197525024414, + "loss": 4.1682, + "lr": 0.0008216783216783217, + "step": 1776, + "tokens_trained": 0.872874824 + }, + { + "epoch": 0.5043613928090206, + "grad_norm": 25.828433990478516, + "loss": 4.2354, + "lr": 0.0008213986013986015, + "step": 1778, + "tokens_trained": 0.873858424 + }, + { + "epoch": 0.5049287284589745, + "grad_norm": 20.261323928833008, + "loss": 4.2373, + "lr": 0.0008211188811188811, + "step": 1780, + "tokens_trained": 0.87483884 + }, + { + "epoch": 0.5054960641089284, + "grad_norm": 9.670608520507812, + "loss": 4.191, + "lr": 0.0008208391608391609, + "step": 1782, + "tokens_trained": 0.875820792 + }, + { + "epoch": 0.5060633997588824, + "grad_norm": 23.33945655822754, + "loss": 4.2319, + "lr": 0.0008205594405594405, + "step": 1784, + "tokens_trained": 0.876804368 + }, + { + "epoch": 0.5066307354088363, + "grad_norm": 32.22544479370117, + "loss": 4.1799, + "lr": 0.0008202797202797203, + "step": 1786, + "tokens_trained": 0.877784816 + }, + { + "epoch": 0.5071980710587901, + "grad_norm": 21.048891067504883, + "loss": 4.2635, + "lr": 0.00082, + "step": 1788, + "tokens_trained": 0.878768256 + }, + { + "epoch": 0.507765406708744, + "grad_norm": 28.73198699951172, + "loss": 4.2436, + "lr": 0.0008197202797202797, + "step": 1790, + "tokens_trained": 0.879751288 + }, + { + "epoch": 0.508332742358698, + "grad_norm": 27.627851486206055, + "loss": 4.2118, + "lr": 0.0008194405594405595, + "step": 1792, + "tokens_trained": 0.880732072 + }, + { + "epoch": 0.5089000780086519, + "grad_norm": 21.16539192199707, + "loss": 4.2123, + "lr": 0.0008191608391608392, + "step": 1794, + "tokens_trained": 0.88171332 + }, + { + "epoch": 0.5094674136586058, + "grad_norm": 11.402868270874023, + "loss": 4.1524, + "lr": 0.000818881118881119, + "step": 1796, + "tokens_trained": 0.882695464 + }, + { + "epoch": 0.5100347493085596, + "grad_norm": 11.958270072937012, + "loss": 4.2091, + "lr": 0.0008186013986013986, + "step": 1798, + "tokens_trained": 0.883678736 + }, + { + "epoch": 0.5106020849585136, + "grad_norm": 15.902670860290527, + "loss": 4.1687, + "lr": 0.0008183216783216783, + "step": 1800, + "tokens_trained": 0.8846604 + }, + { + "epoch": 0.5111694206084675, + "grad_norm": 19.732566833496094, + "loss": 4.1302, + "lr": 0.000818041958041958, + "step": 1802, + "tokens_trained": 0.885641384 + }, + { + "epoch": 0.5117367562584214, + "grad_norm": 15.119332313537598, + "loss": 4.1546, + "lr": 0.0008177622377622378, + "step": 1804, + "tokens_trained": 0.8866262 + }, + { + "epoch": 0.5123040919083753, + "grad_norm": 9.641027450561523, + "loss": 4.1748, + "lr": 0.0008174825174825175, + "step": 1806, + "tokens_trained": 0.887604504 + }, + { + "epoch": 0.5128714275583292, + "grad_norm": 11.642073631286621, + "loss": 4.1879, + "lr": 0.0008172027972027972, + "step": 1808, + "tokens_trained": 0.888584152 + }, + { + "epoch": 0.5134387632082831, + "grad_norm": 12.05164909362793, + "loss": 4.1332, + "lr": 0.000816923076923077, + "step": 1810, + "tokens_trained": 0.889568448 + }, + { + "epoch": 0.514006098858237, + "grad_norm": 13.54423999786377, + "loss": 4.1398, + "lr": 0.0008166433566433567, + "step": 1812, + "tokens_trained": 0.890550896 + }, + { + "epoch": 0.5145734345081909, + "grad_norm": 21.94988441467285, + "loss": 4.1523, + "lr": 0.0008163636363636364, + "step": 1814, + "tokens_trained": 0.89153436 + }, + { + "epoch": 0.5151407701581449, + "grad_norm": 8.613338470458984, + "loss": 4.1428, + "lr": 0.0008160839160839161, + "step": 1816, + "tokens_trained": 0.89251064 + }, + { + "epoch": 0.5157081058080987, + "grad_norm": 27.448917388916016, + "loss": 4.2014, + "lr": 0.0008158041958041958, + "step": 1818, + "tokens_trained": 0.893493904 + }, + { + "epoch": 0.5162754414580526, + "grad_norm": 16.226577758789062, + "loss": 4.1787, + "lr": 0.0008155244755244755, + "step": 1820, + "tokens_trained": 0.894476344 + }, + { + "epoch": 0.5168427771080065, + "grad_norm": 16.967891693115234, + "loss": 4.1898, + "lr": 0.0008152447552447553, + "step": 1822, + "tokens_trained": 0.895460064 + }, + { + "epoch": 0.5174101127579604, + "grad_norm": 13.723483085632324, + "loss": 4.2058, + "lr": 0.000814965034965035, + "step": 1824, + "tokens_trained": 0.896443272 + }, + { + "epoch": 0.5179774484079144, + "grad_norm": 16.789636611938477, + "loss": 4.1669, + "lr": 0.0008146853146853147, + "step": 1826, + "tokens_trained": 0.897426712 + }, + { + "epoch": 0.5185447840578682, + "grad_norm": 11.26768684387207, + "loss": 4.1401, + "lr": 0.0008144055944055944, + "step": 1828, + "tokens_trained": 0.89840672 + }, + { + "epoch": 0.5191121197078221, + "grad_norm": 9.25829029083252, + "loss": 4.1581, + "lr": 0.0008141258741258742, + "step": 1830, + "tokens_trained": 0.89939132 + }, + { + "epoch": 0.519679455357776, + "grad_norm": 12.006930351257324, + "loss": 4.1768, + "lr": 0.0008138461538461539, + "step": 1832, + "tokens_trained": 0.900373704 + }, + { + "epoch": 0.52024679100773, + "grad_norm": 18.766008377075195, + "loss": 4.1419, + "lr": 0.0008135664335664336, + "step": 1834, + "tokens_trained": 0.901356176 + }, + { + "epoch": 0.5208141266576839, + "grad_norm": 17.483421325683594, + "loss": 4.1382, + "lr": 0.0008132867132867133, + "step": 1836, + "tokens_trained": 0.902344088 + }, + { + "epoch": 0.5213814623076377, + "grad_norm": 10.484652519226074, + "loss": 4.1571, + "lr": 0.000813006993006993, + "step": 1838, + "tokens_trained": 0.903328896 + }, + { + "epoch": 0.5219487979575916, + "grad_norm": 13.653974533081055, + "loss": 4.1638, + "lr": 0.0008127272727272728, + "step": 1840, + "tokens_trained": 0.904309368 + }, + { + "epoch": 0.5225161336075456, + "grad_norm": 12.48718547821045, + "loss": 4.1226, + "lr": 0.0008124475524475524, + "step": 1842, + "tokens_trained": 0.905293112 + }, + { + "epoch": 0.5230834692574995, + "grad_norm": 8.086355209350586, + "loss": 4.1303, + "lr": 0.0008121678321678322, + "step": 1844, + "tokens_trained": 0.906275632 + }, + { + "epoch": 0.5236508049074534, + "grad_norm": 10.940073013305664, + "loss": 4.1634, + "lr": 0.0008118881118881119, + "step": 1846, + "tokens_trained": 0.907255808 + }, + { + "epoch": 0.5242181405574072, + "grad_norm": 13.844099044799805, + "loss": 4.1505, + "lr": 0.0008116083916083917, + "step": 1848, + "tokens_trained": 0.908238664 + }, + { + "epoch": 0.5247854762073612, + "grad_norm": 6.305738925933838, + "loss": 4.1463, + "lr": 0.0008113286713286714, + "step": 1850, + "tokens_trained": 0.909221424 + }, + { + "epoch": 0.5253528118573151, + "grad_norm": 8.957951545715332, + "loss": 4.1785, + "lr": 0.000811048951048951, + "step": 1852, + "tokens_trained": 0.910204472 + }, + { + "epoch": 0.525920147507269, + "grad_norm": 12.665373802185059, + "loss": 4.1776, + "lr": 0.0008107692307692308, + "step": 1854, + "tokens_trained": 0.911186456 + }, + { + "epoch": 0.5264874831572229, + "grad_norm": 13.7921781539917, + "loss": 4.2058, + "lr": 0.0008104895104895104, + "step": 1856, + "tokens_trained": 0.912163912 + }, + { + "epoch": 0.5270548188071768, + "grad_norm": 18.400495529174805, + "loss": 4.1378, + "lr": 0.0008102097902097903, + "step": 1858, + "tokens_trained": 0.913143416 + }, + { + "epoch": 0.5276221544571307, + "grad_norm": 10.095234870910645, + "loss": 4.1673, + "lr": 0.0008099300699300699, + "step": 1860, + "tokens_trained": 0.914125056 + }, + { + "epoch": 0.5281894901070846, + "grad_norm": 9.396644592285156, + "loss": 4.1226, + "lr": 0.0008096503496503497, + "step": 1862, + "tokens_trained": 0.915109128 + }, + { + "epoch": 0.5287568257570385, + "grad_norm": 12.686080932617188, + "loss": 4.1356, + "lr": 0.0008093706293706294, + "step": 1864, + "tokens_trained": 0.916092096 + }, + { + "epoch": 0.5293241614069925, + "grad_norm": 15.91020679473877, + "loss": 4.1276, + "lr": 0.0008090909090909092, + "step": 1866, + "tokens_trained": 0.917077264 + }, + { + "epoch": 0.5298914970569463, + "grad_norm": 21.305110931396484, + "loss": 4.1492, + "lr": 0.0008088111888111889, + "step": 1868, + "tokens_trained": 0.918060288 + }, + { + "epoch": 0.5304588327069002, + "grad_norm": 9.242319107055664, + "loss": 4.1457, + "lr": 0.0008085314685314685, + "step": 1870, + "tokens_trained": 0.91904616 + }, + { + "epoch": 0.5310261683568541, + "grad_norm": 17.556922912597656, + "loss": 4.1698, + "lr": 0.0008082517482517483, + "step": 1872, + "tokens_trained": 0.920028192 + }, + { + "epoch": 0.531593504006808, + "grad_norm": 24.155885696411133, + "loss": 4.193, + "lr": 0.0008079720279720279, + "step": 1874, + "tokens_trained": 0.921010456 + }, + { + "epoch": 0.531877171831785, + "eval_loss": 1.0404243469238281, + "eval_runtime": 21.451, + "step": 1875, + "tokens_trained": 0.921502192 + }, + { + "epoch": 0.532160839656762, + "grad_norm": 4.985994338989258, + "loss": 4.1649, + "lr": 0.0008076923076923078, + "step": 1876, + "tokens_trained": 0.921994216 + }, + { + "epoch": 0.5327281753067158, + "grad_norm": 19.2642765045166, + "loss": 4.1883, + "lr": 0.0008074125874125874, + "step": 1878, + "tokens_trained": 0.922978112 + }, + { + "epoch": 0.5332955109566697, + "grad_norm": 15.012572288513184, + "loss": 4.1944, + "lr": 0.0008071328671328671, + "step": 1880, + "tokens_trained": 0.923962952 + }, + { + "epoch": 0.5338628466066236, + "grad_norm": 21.37204360961914, + "loss": 4.1708, + "lr": 0.0008068531468531469, + "step": 1882, + "tokens_trained": 0.92494744 + }, + { + "epoch": 0.5344301822565776, + "grad_norm": 6.402398586273193, + "loss": 4.1921, + "lr": 0.0008065734265734265, + "step": 1884, + "tokens_trained": 0.925927984 + }, + { + "epoch": 0.5349975179065315, + "grad_norm": 27.606822967529297, + "loss": 4.2033, + "lr": 0.0008062937062937064, + "step": 1886, + "tokens_trained": 0.926911352 + }, + { + "epoch": 0.5355648535564853, + "grad_norm": 16.434572219848633, + "loss": 4.1504, + "lr": 0.000806013986013986, + "step": 1888, + "tokens_trained": 0.927894056 + }, + { + "epoch": 0.5361321892064392, + "grad_norm": 8.066178321838379, + "loss": 4.1674, + "lr": 0.0008057342657342658, + "step": 1890, + "tokens_trained": 0.928879504 + }, + { + "epoch": 0.5366995248563932, + "grad_norm": 6.167456150054932, + "loss": 4.1207, + "lr": 0.0008054545454545454, + "step": 1892, + "tokens_trained": 0.92986424 + }, + { + "epoch": 0.5372668605063471, + "grad_norm": 3.584982395172119, + "loss": 4.1051, + "lr": 0.0008051748251748253, + "step": 1894, + "tokens_trained": 0.930846696 + }, + { + "epoch": 0.537834196156301, + "grad_norm": 14.988295555114746, + "loss": 4.1199, + "lr": 0.0008048951048951049, + "step": 1896, + "tokens_trained": 0.931831112 + }, + { + "epoch": 0.5384015318062548, + "grad_norm": 12.735363960266113, + "loss": 4.1368, + "lr": 0.0008046153846153846, + "step": 1898, + "tokens_trained": 0.932816952 + }, + { + "epoch": 0.5389688674562088, + "grad_norm": 7.701294422149658, + "loss": 4.1205, + "lr": 0.0008043356643356644, + "step": 1900, + "tokens_trained": 0.93380264 + }, + { + "epoch": 0.5395362031061627, + "grad_norm": 9.15809440612793, + "loss": 4.1567, + "lr": 0.000804055944055944, + "step": 1902, + "tokens_trained": 0.934785848 + }, + { + "epoch": 0.5401035387561166, + "grad_norm": 10.8292875289917, + "loss": 4.1645, + "lr": 0.0008037762237762239, + "step": 1904, + "tokens_trained": 0.935766912 + }, + { + "epoch": 0.5406708744060705, + "grad_norm": 10.906803131103516, + "loss": 4.1398, + "lr": 0.0008034965034965035, + "step": 1906, + "tokens_trained": 0.936749352 + }, + { + "epoch": 0.5412382100560243, + "grad_norm": 10.140864372253418, + "loss": 4.1754, + "lr": 0.0008032167832167832, + "step": 1908, + "tokens_trained": 0.9377304 + }, + { + "epoch": 0.5418055457059783, + "grad_norm": 10.061383247375488, + "loss": 4.1485, + "lr": 0.0008029370629370629, + "step": 1910, + "tokens_trained": 0.938712336 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 8.252259254455566, + "loss": 4.1502, + "lr": 0.0008026573426573427, + "step": 1912, + "tokens_trained": 0.939693304 + }, + { + "epoch": 0.5429402170058861, + "grad_norm": 15.104400634765625, + "loss": 4.182, + "lr": 0.0008023776223776224, + "step": 1914, + "tokens_trained": 0.940679832 + }, + { + "epoch": 0.54350755265584, + "grad_norm": 21.167285919189453, + "loss": 4.1241, + "lr": 0.0008020979020979021, + "step": 1916, + "tokens_trained": 0.941665088 + }, + { + "epoch": 0.5440748883057939, + "grad_norm": 17.936481475830078, + "loss": 4.1846, + "lr": 0.0008018181818181818, + "step": 1918, + "tokens_trained": 0.942651632 + }, + { + "epoch": 0.5446422239557478, + "grad_norm": 9.773019790649414, + "loss": 4.1164, + "lr": 0.0008015384615384615, + "step": 1920, + "tokens_trained": 0.943635928 + }, + { + "epoch": 0.5452095596057017, + "grad_norm": 14.120475769042969, + "loss": 4.1556, + "lr": 0.0008012587412587414, + "step": 1922, + "tokens_trained": 0.944618336 + }, + { + "epoch": 0.5457768952556556, + "grad_norm": 10.898097038269043, + "loss": 4.1521, + "lr": 0.000800979020979021, + "step": 1924, + "tokens_trained": 0.945608216 + }, + { + "epoch": 0.5463442309056096, + "grad_norm": 8.271462440490723, + "loss": 4.0785, + "lr": 0.0008006993006993007, + "step": 1926, + "tokens_trained": 0.946593504 + }, + { + "epoch": 0.5469115665555634, + "grad_norm": 17.28820037841797, + "loss": 4.0998, + "lr": 0.0008004195804195804, + "step": 1928, + "tokens_trained": 0.947575288 + }, + { + "epoch": 0.5474789022055173, + "grad_norm": 17.754959106445312, + "loss": 4.1652, + "lr": 0.0008001398601398602, + "step": 1930, + "tokens_trained": 0.948562968 + }, + { + "epoch": 0.5480462378554712, + "grad_norm": 10.576292037963867, + "loss": 4.1754, + "lr": 0.0007998601398601399, + "step": 1932, + "tokens_trained": 0.949545728 + }, + { + "epoch": 0.5486135735054252, + "grad_norm": 14.297791481018066, + "loss": 4.1597, + "lr": 0.0007995804195804196, + "step": 1934, + "tokens_trained": 0.950528952 + }, + { + "epoch": 0.5491809091553791, + "grad_norm": 23.882539749145508, + "loss": 4.1366, + "lr": 0.0007993006993006992, + "step": 1936, + "tokens_trained": 0.951513448 + }, + { + "epoch": 0.5497482448053329, + "grad_norm": 5.12502908706665, + "loss": 4.1441, + "lr": 0.000799020979020979, + "step": 1938, + "tokens_trained": 0.952497048 + }, + { + "epoch": 0.5503155804552868, + "grad_norm": 26.879070281982422, + "loss": 4.2595, + "lr": 0.0007987412587412588, + "step": 1940, + "tokens_trained": 0.953475816 + }, + { + "epoch": 0.5508829161052408, + "grad_norm": 23.032690048217773, + "loss": 4.1841, + "lr": 0.0007984615384615385, + "step": 1942, + "tokens_trained": 0.954459984 + }, + { + "epoch": 0.5514502517551947, + "grad_norm": 8.810720443725586, + "loss": 4.1329, + "lr": 0.0007981818181818182, + "step": 1944, + "tokens_trained": 0.95544252 + }, + { + "epoch": 0.5520175874051486, + "grad_norm": 31.051185607910156, + "loss": 4.2278, + "lr": 0.0007979020979020979, + "step": 1946, + "tokens_trained": 0.956428016 + }, + { + "epoch": 0.5525849230551024, + "grad_norm": 22.537412643432617, + "loss": 4.1729, + "lr": 0.0007976223776223777, + "step": 1948, + "tokens_trained": 0.957406024 + }, + { + "epoch": 0.5531522587050564, + "grad_norm": 10.596793174743652, + "loss": 4.1636, + "lr": 0.0007973426573426573, + "step": 1950, + "tokens_trained": 0.958391232 + }, + { + "epoch": 0.5537195943550103, + "grad_norm": 16.45500373840332, + "loss": 4.1591, + "lr": 0.0007970629370629371, + "step": 1952, + "tokens_trained": 0.959378448 + }, + { + "epoch": 0.5542869300049642, + "grad_norm": 15.090359687805176, + "loss": 4.1516, + "lr": 0.0007967832167832167, + "step": 1954, + "tokens_trained": 0.960363384 + }, + { + "epoch": 0.5548542656549181, + "grad_norm": 28.482192993164062, + "loss": 4.1211, + "lr": 0.0007965034965034965, + "step": 1956, + "tokens_trained": 0.961348752 + }, + { + "epoch": 0.555421601304872, + "grad_norm": 9.402368545532227, + "loss": 4.178, + "lr": 0.0007962237762237763, + "step": 1958, + "tokens_trained": 0.962332976 + }, + { + "epoch": 0.5559889369548259, + "grad_norm": 33.001346588134766, + "loss": 4.218, + "lr": 0.000795944055944056, + "step": 1960, + "tokens_trained": 0.963316928 + }, + { + "epoch": 0.5565562726047798, + "grad_norm": 29.695520401000977, + "loss": 4.2071, + "lr": 0.0007956643356643357, + "step": 1962, + "tokens_trained": 0.964301728 + }, + { + "epoch": 0.5571236082547337, + "grad_norm": 22.22412109375, + "loss": 4.2158, + "lr": 0.0007953846153846153, + "step": 1964, + "tokens_trained": 0.96528524 + }, + { + "epoch": 0.5576909439046877, + "grad_norm": 15.590829849243164, + "loss": 4.1681, + "lr": 0.0007951048951048952, + "step": 1966, + "tokens_trained": 0.966268264 + }, + { + "epoch": 0.5582582795546415, + "grad_norm": 16.011110305786133, + "loss": 4.1591, + "lr": 0.0007948251748251748, + "step": 1968, + "tokens_trained": 0.967252016 + }, + { + "epoch": 0.5588256152045954, + "grad_norm": 15.24573040008545, + "loss": 4.1446, + "lr": 0.0007945454545454546, + "step": 1970, + "tokens_trained": 0.96823396 + }, + { + "epoch": 0.5593929508545493, + "grad_norm": 15.718021392822266, + "loss": 4.1846, + "lr": 0.0007942657342657342, + "step": 1972, + "tokens_trained": 0.969217792 + }, + { + "epoch": 0.5599602865045032, + "grad_norm": 8.648459434509277, + "loss": 4.1655, + "lr": 0.000793986013986014, + "step": 1974, + "tokens_trained": 0.970200776 + }, + { + "epoch": 0.5605276221544572, + "grad_norm": 7.273077487945557, + "loss": 4.1397, + "lr": 0.0007937062937062938, + "step": 1976, + "tokens_trained": 0.971181376 + }, + { + "epoch": 0.561094957804411, + "grad_norm": 25.027616500854492, + "loss": 4.1918, + "lr": 0.0007934265734265734, + "step": 1978, + "tokens_trained": 0.972165496 + }, + { + "epoch": 0.5616622934543649, + "grad_norm": 25.485851287841797, + "loss": 4.1896, + "lr": 0.0007931468531468532, + "step": 1980, + "tokens_trained": 0.973145616 + }, + { + "epoch": 0.5622296291043188, + "grad_norm": 18.065462112426758, + "loss": 4.1876, + "lr": 0.0007928671328671328, + "step": 1982, + "tokens_trained": 0.974131104 + }, + { + "epoch": 0.5627969647542728, + "grad_norm": 20.412248611450195, + "loss": 4.1556, + "lr": 0.0007925874125874127, + "step": 1984, + "tokens_trained": 0.975111232 + }, + { + "epoch": 0.5633643004042267, + "grad_norm": 15.51710319519043, + "loss": 4.1391, + "lr": 0.0007923076923076923, + "step": 1986, + "tokens_trained": 0.976098968 + }, + { + "epoch": 0.5639316360541805, + "grad_norm": 8.650726318359375, + "loss": 4.1421, + "lr": 0.000792027972027972, + "step": 1988, + "tokens_trained": 0.977082992 + }, + { + "epoch": 0.5644989717041344, + "grad_norm": 19.833505630493164, + "loss": 4.1505, + "lr": 0.0007917482517482517, + "step": 1990, + "tokens_trained": 0.978068896 + }, + { + "epoch": 0.5650663073540884, + "grad_norm": 26.585390090942383, + "loss": 4.1661, + "lr": 0.0007914685314685314, + "step": 1992, + "tokens_trained": 0.979048504 + }, + { + "epoch": 0.5656336430040423, + "grad_norm": 20.827394485473633, + "loss": 4.1987, + "lr": 0.0007911888111888113, + "step": 1994, + "tokens_trained": 0.98003104 + }, + { + "epoch": 0.5662009786539962, + "grad_norm": 23.700273513793945, + "loss": 4.1773, + "lr": 0.0007909090909090909, + "step": 1996, + "tokens_trained": 0.981013384 + }, + { + "epoch": 0.56676831430395, + "grad_norm": 15.673397064208984, + "loss": 4.12, + "lr": 0.0007906293706293707, + "step": 1998, + "tokens_trained": 0.981999776 + }, + { + "epoch": 0.567335649953904, + "grad_norm": 11.268630981445312, + "loss": 4.1373, + "lr": 0.0007903496503496503, + "step": 2000, + "tokens_trained": 0.982980936 + }, + { + "epoch": 0.567335649953904, + "eval_loss": 1.0422048568725586, + "eval_runtime": 20.3928, + "step": 2000, + "tokens_trained": 0.982980936 + }, + { + "epoch": 0.5679029856038579, + "grad_norm": 18.37994384765625, + "loss": 4.1536, + "lr": 0.0007900699300699302, + "step": 2002, + "tokens_trained": 0.983969536 + }, + { + "epoch": 0.5684703212538118, + "grad_norm": 23.911537170410156, + "loss": 4.1652, + "lr": 0.0007897902097902098, + "step": 2004, + "tokens_trained": 0.98495052 + }, + { + "epoch": 0.5690376569037657, + "grad_norm": 7.355772018432617, + "loss": 4.1846, + "lr": 0.0007895104895104895, + "step": 2006, + "tokens_trained": 0.98593252 + }, + { + "epoch": 0.5696049925537195, + "grad_norm": 35.29991149902344, + "loss": 4.2145, + "lr": 0.0007892307692307692, + "step": 2008, + "tokens_trained": 0.986922392 + }, + { + "epoch": 0.5701723282036735, + "grad_norm": 14.28709602355957, + "loss": 4.1629, + "lr": 0.0007889510489510489, + "step": 2010, + "tokens_trained": 0.987905712 + }, + { + "epoch": 0.5707396638536274, + "grad_norm": 22.50174331665039, + "loss": 4.1907, + "lr": 0.0007886713286713288, + "step": 2012, + "tokens_trained": 0.988887536 + }, + { + "epoch": 0.5713069995035813, + "grad_norm": 14.588640213012695, + "loss": 4.1523, + "lr": 0.0007883916083916084, + "step": 2014, + "tokens_trained": 0.989872712 + }, + { + "epoch": 0.5718743351535353, + "grad_norm": 2.776369094848633, + "loss": 4.1548, + "lr": 0.0007881118881118882, + "step": 2016, + "tokens_trained": 0.990854072 + }, + { + "epoch": 0.5724416708034891, + "grad_norm": 16.00047492980957, + "loss": 4.1319, + "lr": 0.0007878321678321678, + "step": 2018, + "tokens_trained": 0.991834552 + }, + { + "epoch": 0.573009006453443, + "grad_norm": 21.678735733032227, + "loss": 4.1986, + "lr": 0.0007875524475524476, + "step": 2020, + "tokens_trained": 0.992818256 + }, + { + "epoch": 0.5735763421033969, + "grad_norm": 4.835119724273682, + "loss": 4.1625, + "lr": 0.0007872727272727273, + "step": 2022, + "tokens_trained": 0.993801376 + }, + { + "epoch": 0.5741436777533508, + "grad_norm": 19.427467346191406, + "loss": 4.1594, + "lr": 0.000786993006993007, + "step": 2024, + "tokens_trained": 0.994788568 + }, + { + "epoch": 0.5747110134033048, + "grad_norm": 15.458346366882324, + "loss": 4.1829, + "lr": 0.0007867132867132867, + "step": 2026, + "tokens_trained": 0.995769976 + }, + { + "epoch": 0.5752783490532586, + "grad_norm": 11.073614120483398, + "loss": 4.1303, + "lr": 0.0007864335664335664, + "step": 2028, + "tokens_trained": 0.996751464 + }, + { + "epoch": 0.5758456847032125, + "grad_norm": 4.685436248779297, + "loss": 4.1368, + "lr": 0.0007861538461538463, + "step": 2030, + "tokens_trained": 0.997733952 + }, + { + "epoch": 0.5764130203531664, + "grad_norm": 15.977241516113281, + "loss": 4.1584, + "lr": 0.0007858741258741259, + "step": 2032, + "tokens_trained": 0.998716976 + }, + { + "epoch": 0.5769803560031204, + "grad_norm": 11.305732727050781, + "loss": 4.102, + "lr": 0.0007855944055944056, + "step": 2034, + "tokens_trained": 0.999703632 + }, + { + "epoch": 0.5775476916530743, + "grad_norm": 7.794003963470459, + "loss": 4.161, + "lr": 0.0007853146853146853, + "step": 2036, + "tokens_trained": 1.000687488 + }, + { + "epoch": 0.5781150273030281, + "grad_norm": 7.609982013702393, + "loss": 4.1546, + "lr": 0.0007850349650349651, + "step": 2038, + "tokens_trained": 1.0016692 + }, + { + "epoch": 0.578682362952982, + "grad_norm": 7.622653961181641, + "loss": 4.1246, + "lr": 0.0007847552447552448, + "step": 2040, + "tokens_trained": 1.002653352 + }, + { + "epoch": 0.579249698602936, + "grad_norm": 9.98919677734375, + "loss": 4.1319, + "lr": 0.0007844755244755245, + "step": 2042, + "tokens_trained": 1.003639528 + }, + { + "epoch": 0.5798170342528899, + "grad_norm": 9.557628631591797, + "loss": 4.1105, + "lr": 0.0007841958041958041, + "step": 2044, + "tokens_trained": 1.004623776 + }, + { + "epoch": 0.5803843699028438, + "grad_norm": 14.172621726989746, + "loss": 4.1339, + "lr": 0.0007839160839160839, + "step": 2046, + "tokens_trained": 1.005604008 + }, + { + "epoch": 0.5809517055527976, + "grad_norm": 8.185248374938965, + "loss": 4.1142, + "lr": 0.0007836363636363637, + "step": 2048, + "tokens_trained": 1.006585704 + }, + { + "epoch": 0.5815190412027516, + "grad_norm": 10.642661094665527, + "loss": 4.131, + "lr": 0.0007833566433566434, + "step": 2050, + "tokens_trained": 1.00757132 + }, + { + "epoch": 0.5820863768527055, + "grad_norm": 7.868969917297363, + "loss": 4.1477, + "lr": 0.0007830769230769231, + "step": 2052, + "tokens_trained": 1.008556824 + }, + { + "epoch": 0.5826537125026594, + "grad_norm": 2.8441150188446045, + "loss": 4.1156, + "lr": 0.0007827972027972028, + "step": 2054, + "tokens_trained": 1.00954056 + }, + { + "epoch": 0.5832210481526133, + "grad_norm": 5.2797932624816895, + "loss": 4.1058, + "lr": 0.0007825174825174826, + "step": 2056, + "tokens_trained": 1.010526488 + }, + { + "epoch": 0.5837883838025671, + "grad_norm": 11.850811004638672, + "loss": 4.165, + "lr": 0.0007822377622377622, + "step": 2058, + "tokens_trained": 1.011507584 + }, + { + "epoch": 0.5843557194525211, + "grad_norm": 11.073920249938965, + "loss": 4.1509, + "lr": 0.000781958041958042, + "step": 2060, + "tokens_trained": 1.012491648 + }, + { + "epoch": 0.584923055102475, + "grad_norm": 8.282343864440918, + "loss": 4.0656, + "lr": 0.0007816783216783216, + "step": 2062, + "tokens_trained": 1.013475224 + }, + { + "epoch": 0.5854903907524289, + "grad_norm": 10.414461135864258, + "loss": 4.1285, + "lr": 0.0007813986013986014, + "step": 2064, + "tokens_trained": 1.014458144 + }, + { + "epoch": 0.5860577264023829, + "grad_norm": 9.988463401794434, + "loss": 4.1234, + "lr": 0.0007811188811188812, + "step": 2066, + "tokens_trained": 1.015444112 + }, + { + "epoch": 0.5866250620523367, + "grad_norm": 8.713189125061035, + "loss": 4.129, + "lr": 0.0007808391608391609, + "step": 2068, + "tokens_trained": 1.016427568 + }, + { + "epoch": 0.5871923977022906, + "grad_norm": 3.4149773120880127, + "loss": 4.155, + "lr": 0.0007805594405594406, + "step": 2070, + "tokens_trained": 1.017412264 + }, + { + "epoch": 0.5877597333522445, + "grad_norm": 12.33522891998291, + "loss": 4.1856, + "lr": 0.0007802797202797202, + "step": 2072, + "tokens_trained": 1.018402216 + }, + { + "epoch": 0.5883270690021984, + "grad_norm": 12.155695915222168, + "loss": 4.1468, + "lr": 0.0007800000000000001, + "step": 2074, + "tokens_trained": 1.019387096 + }, + { + "epoch": 0.5888944046521524, + "grad_norm": 7.73326301574707, + "loss": 4.1239, + "lr": 0.0007797202797202797, + "step": 2076, + "tokens_trained": 1.020370008 + }, + { + "epoch": 0.5894617403021062, + "grad_norm": 6.425852298736572, + "loss": 4.1101, + "lr": 0.0007794405594405595, + "step": 2078, + "tokens_trained": 1.02135716 + }, + { + "epoch": 0.5900290759520601, + "grad_norm": 18.360816955566406, + "loss": 4.1726, + "lr": 0.0007791608391608391, + "step": 2080, + "tokens_trained": 1.022338024 + }, + { + "epoch": 0.590596411602014, + "grad_norm": 28.31681251525879, + "loss": 4.1341, + "lr": 0.0007788811188811189, + "step": 2082, + "tokens_trained": 1.023318008 + }, + { + "epoch": 0.591163747251968, + "grad_norm": 10.673089027404785, + "loss": 4.1268, + "lr": 0.0007786013986013987, + "step": 2084, + "tokens_trained": 1.02430432 + }, + { + "epoch": 0.5917310829019219, + "grad_norm": 26.656522750854492, + "loss": 4.1703, + "lr": 0.0007783216783216783, + "step": 2086, + "tokens_trained": 1.025288272 + }, + { + "epoch": 0.5922984185518757, + "grad_norm": 20.022029876708984, + "loss": 4.1532, + "lr": 0.0007780419580419581, + "step": 2088, + "tokens_trained": 1.026272984 + }, + { + "epoch": 0.5928657542018296, + "grad_norm": 7.2955121994018555, + "loss": 4.1992, + "lr": 0.0007777622377622377, + "step": 2090, + "tokens_trained": 1.02725572 + }, + { + "epoch": 0.5934330898517836, + "grad_norm": 28.561243057250977, + "loss": 4.2098, + "lr": 0.0007774825174825176, + "step": 2092, + "tokens_trained": 1.028238456 + }, + { + "epoch": 0.5940004255017375, + "grad_norm": 16.715425491333008, + "loss": 4.1509, + "lr": 0.0007772027972027972, + "step": 2094, + "tokens_trained": 1.029226048 + }, + { + "epoch": 0.5945677611516914, + "grad_norm": 6.325936317443848, + "loss": 4.1221, + "lr": 0.000776923076923077, + "step": 2096, + "tokens_trained": 1.030210528 + }, + { + "epoch": 0.5951350968016452, + "grad_norm": 12.83181381225586, + "loss": 4.1808, + "lr": 0.0007766433566433566, + "step": 2098, + "tokens_trained": 1.031193456 + }, + { + "epoch": 0.5957024324515992, + "grad_norm": 12.183184623718262, + "loss": 4.1292, + "lr": 0.0007763636363636363, + "step": 2100, + "tokens_trained": 1.032173528 + }, + { + "epoch": 0.5962697681015531, + "grad_norm": 8.247485160827637, + "loss": 4.1425, + "lr": 0.0007760839160839162, + "step": 2102, + "tokens_trained": 1.033158144 + }, + { + "epoch": 0.596837103751507, + "grad_norm": 10.814559936523438, + "loss": 4.1167, + "lr": 0.0007758041958041958, + "step": 2104, + "tokens_trained": 1.034141216 + }, + { + "epoch": 0.5974044394014609, + "grad_norm": 12.589309692382812, + "loss": 4.0916, + "lr": 0.0007755244755244756, + "step": 2106, + "tokens_trained": 1.035121888 + }, + { + "epoch": 0.5979717750514147, + "grad_norm": 11.65658187866211, + "loss": 4.0776, + "lr": 0.0007752447552447552, + "step": 2108, + "tokens_trained": 1.036103688 + }, + { + "epoch": 0.5985391107013687, + "grad_norm": 18.0120792388916, + "loss": 4.1588, + "lr": 0.0007749650349650351, + "step": 2110, + "tokens_trained": 1.03708248 + }, + { + "epoch": 0.5991064463513226, + "grad_norm": 5.742938995361328, + "loss": 4.151, + "lr": 0.0007746853146853147, + "step": 2112, + "tokens_trained": 1.038068792 + }, + { + "epoch": 0.5996737820012765, + "grad_norm": 36.54581832885742, + "loss": 4.2239, + "lr": 0.0007744055944055944, + "step": 2114, + "tokens_trained": 1.03904728 + }, + { + "epoch": 0.6002411176512304, + "grad_norm": 13.304069519042969, + "loss": 4.152, + "lr": 0.0007741258741258741, + "step": 2116, + "tokens_trained": 1.040031312 + }, + { + "epoch": 0.6008084533011843, + "grad_norm": 18.68927001953125, + "loss": 4.1413, + "lr": 0.0007738461538461538, + "step": 2118, + "tokens_trained": 1.041018376 + }, + { + "epoch": 0.6013757889511382, + "grad_norm": 16.946630477905273, + "loss": 4.1122, + "lr": 0.0007735664335664337, + "step": 2120, + "tokens_trained": 1.0420056 + }, + { + "epoch": 0.6019431246010921, + "grad_norm": 4.236926078796387, + "loss": 4.1146, + "lr": 0.0007732867132867133, + "step": 2122, + "tokens_trained": 1.042990376 + }, + { + "epoch": 0.602510460251046, + "grad_norm": 12.148641586303711, + "loss": 4.1472, + "lr": 0.0007730069930069931, + "step": 2124, + "tokens_trained": 1.0439754 + }, + { + "epoch": 0.602794128076023, + "eval_loss": 1.039306640625, + "eval_runtime": 20.6138, + "step": 2125, + "tokens_trained": 1.044467008 + }, + { + "epoch": 0.603077795901, + "grad_norm": 17.051687240600586, + "loss": 4.1572, + "lr": 0.0007727272727272727, + "step": 2126, + "tokens_trained": 1.044957456 + }, + { + "epoch": 0.6036451315509538, + "grad_norm": 14.019828796386719, + "loss": 4.1464, + "lr": 0.0007724475524475525, + "step": 2128, + "tokens_trained": 1.04593944 + }, + { + "epoch": 0.6042124672009077, + "grad_norm": 11.22962760925293, + "loss": 4.1345, + "lr": 0.0007721678321678322, + "step": 2130, + "tokens_trained": 1.046919592 + }, + { + "epoch": 0.6047798028508616, + "grad_norm": 11.524348258972168, + "loss": 4.1233, + "lr": 0.0007718881118881119, + "step": 2132, + "tokens_trained": 1.047904744 + }, + { + "epoch": 0.6053471385008156, + "grad_norm": 7.174457550048828, + "loss": 4.1201, + "lr": 0.0007716083916083916, + "step": 2134, + "tokens_trained": 1.048885328 + }, + { + "epoch": 0.6059144741507695, + "grad_norm": 6.847499847412109, + "loss": 4.1313, + "lr": 0.0007713286713286713, + "step": 2136, + "tokens_trained": 1.049868776 + }, + { + "epoch": 0.6064818098007233, + "grad_norm": 8.44458293914795, + "loss": 4.1236, + "lr": 0.0007710489510489512, + "step": 2138, + "tokens_trained": 1.050852704 + }, + { + "epoch": 0.6070491454506772, + "grad_norm": 15.415260314941406, + "loss": 4.1424, + "lr": 0.0007707692307692308, + "step": 2140, + "tokens_trained": 1.051837736 + }, + { + "epoch": 0.6076164811006312, + "grad_norm": 16.845874786376953, + "loss": 4.1037, + "lr": 0.0007704895104895105, + "step": 2142, + "tokens_trained": 1.05282172 + }, + { + "epoch": 0.6081838167505851, + "grad_norm": 1.3947086334228516, + "loss": 4.1389, + "lr": 0.0007702097902097902, + "step": 2144, + "tokens_trained": 1.053802928 + }, + { + "epoch": 0.608751152400539, + "grad_norm": 3.4119038581848145, + "loss": 4.16, + "lr": 0.0007699300699300699, + "step": 2146, + "tokens_trained": 1.054784368 + }, + { + "epoch": 0.6093184880504928, + "grad_norm": 9.26860523223877, + "loss": 4.1841, + "lr": 0.0007696503496503497, + "step": 2148, + "tokens_trained": 1.05576888 + }, + { + "epoch": 0.6098858237004467, + "grad_norm": 8.744836807250977, + "loss": 4.1043, + "lr": 0.0007693706293706294, + "step": 2150, + "tokens_trained": 1.056751336 + }, + { + "epoch": 0.6104531593504007, + "grad_norm": 8.805045127868652, + "loss": 4.1032, + "lr": 0.000769090909090909, + "step": 2152, + "tokens_trained": 1.057734 + }, + { + "epoch": 0.6110204950003546, + "grad_norm": 4.785625457763672, + "loss": 4.1817, + "lr": 0.0007688111888111888, + "step": 2154, + "tokens_trained": 1.058716328 + }, + { + "epoch": 0.6115878306503085, + "grad_norm": 2.2137513160705566, + "loss": 4.1514, + "lr": 0.0007685314685314686, + "step": 2156, + "tokens_trained": 1.059696248 + }, + { + "epoch": 0.6121551663002623, + "grad_norm": 7.164271354675293, + "loss": 4.1433, + "lr": 0.0007682517482517483, + "step": 2158, + "tokens_trained": 1.060676648 + }, + { + "epoch": 0.6127225019502163, + "grad_norm": 9.481597900390625, + "loss": 4.0971, + "lr": 0.000767972027972028, + "step": 2160, + "tokens_trained": 1.061656688 + }, + { + "epoch": 0.6132898376001702, + "grad_norm": 11.28831672668457, + "loss": 4.149, + "lr": 0.0007676923076923077, + "step": 2162, + "tokens_trained": 1.062640576 + }, + { + "epoch": 0.6138571732501241, + "grad_norm": 17.21572494506836, + "loss": 4.098, + "lr": 0.0007674125874125874, + "step": 2164, + "tokens_trained": 1.063617688 + }, + { + "epoch": 0.614424508900078, + "grad_norm": 14.486310005187988, + "loss": 4.123, + "lr": 0.0007671328671328672, + "step": 2166, + "tokens_trained": 1.06460584 + }, + { + "epoch": 0.6149918445500319, + "grad_norm": 10.582398414611816, + "loss": 4.1243, + "lr": 0.0007668531468531469, + "step": 2168, + "tokens_trained": 1.065589064 + }, + { + "epoch": 0.6155591801999858, + "grad_norm": 12.923002243041992, + "loss": 4.0928, + "lr": 0.0007665734265734265, + "step": 2170, + "tokens_trained": 1.06657224 + }, + { + "epoch": 0.6161265158499397, + "grad_norm": 12.445414543151855, + "loss": 4.1697, + "lr": 0.0007662937062937063, + "step": 2172, + "tokens_trained": 1.067556952 + }, + { + "epoch": 0.6166938514998936, + "grad_norm": 3.562396287918091, + "loss": 4.0763, + "lr": 0.000766013986013986, + "step": 2174, + "tokens_trained": 1.068538248 + }, + { + "epoch": 0.6172611871498476, + "grad_norm": 12.62887954711914, + "loss": 4.1203, + "lr": 0.0007657342657342658, + "step": 2176, + "tokens_trained": 1.06952032 + }, + { + "epoch": 0.6178285227998014, + "grad_norm": 9.387356758117676, + "loss": 4.1318, + "lr": 0.0007654545454545455, + "step": 2178, + "tokens_trained": 1.070503872 + }, + { + "epoch": 0.6183958584497553, + "grad_norm": 8.885710716247559, + "loss": 4.1609, + "lr": 0.0007651748251748251, + "step": 2180, + "tokens_trained": 1.071486328 + }, + { + "epoch": 0.6189631940997092, + "grad_norm": 7.174533843994141, + "loss": 4.0824, + "lr": 0.0007648951048951049, + "step": 2182, + "tokens_trained": 1.07246928 + }, + { + "epoch": 0.6195305297496632, + "grad_norm": 15.866931915283203, + "loss": 4.1461, + "lr": 0.0007646153846153846, + "step": 2184, + "tokens_trained": 1.07345252 + }, + { + "epoch": 0.6200978653996171, + "grad_norm": 4.892337799072266, + "loss": 4.1418, + "lr": 0.0007643356643356644, + "step": 2186, + "tokens_trained": 1.07443796 + }, + { + "epoch": 0.6206652010495709, + "grad_norm": 4.796551704406738, + "loss": 4.1394, + "lr": 0.000764055944055944, + "step": 2188, + "tokens_trained": 1.075421392 + }, + { + "epoch": 0.6212325366995248, + "grad_norm": 10.585665702819824, + "loss": 4.1046, + "lr": 0.0007637762237762238, + "step": 2190, + "tokens_trained": 1.076404848 + }, + { + "epoch": 0.6217998723494788, + "grad_norm": 8.71747875213623, + "loss": 4.1819, + "lr": 0.0007634965034965035, + "step": 2192, + "tokens_trained": 1.077386672 + }, + { + "epoch": 0.6223672079994327, + "grad_norm": 10.74347972869873, + "loss": 4.1231, + "lr": 0.0007632167832167833, + "step": 2194, + "tokens_trained": 1.078365112 + }, + { + "epoch": 0.6229345436493866, + "grad_norm": 12.079446792602539, + "loss": 4.1132, + "lr": 0.000762937062937063, + "step": 2196, + "tokens_trained": 1.07935376 + }, + { + "epoch": 0.6235018792993404, + "grad_norm": 7.8133649826049805, + "loss": 4.0915, + "lr": 0.0007626573426573426, + "step": 2198, + "tokens_trained": 1.080332872 + }, + { + "epoch": 0.6240692149492943, + "grad_norm": 4.51243782043457, + "loss": 4.1108, + "lr": 0.0007623776223776224, + "step": 2200, + "tokens_trained": 1.081316664 + }, + { + "epoch": 0.6246365505992483, + "grad_norm": 12.625933647155762, + "loss": 4.1552, + "lr": 0.0007620979020979021, + "step": 2202, + "tokens_trained": 1.08230448 + }, + { + "epoch": 0.6252038862492022, + "grad_norm": 9.984200477600098, + "loss": 4.1199, + "lr": 0.0007618181818181819, + "step": 2204, + "tokens_trained": 1.083288992 + }, + { + "epoch": 0.6257712218991561, + "grad_norm": 11.338666915893555, + "loss": 4.0821, + "lr": 0.0007615384615384615, + "step": 2206, + "tokens_trained": 1.084273864 + }, + { + "epoch": 0.6263385575491099, + "grad_norm": 6.808894634246826, + "loss": 4.1202, + "lr": 0.0007612587412587412, + "step": 2208, + "tokens_trained": 1.085254584 + }, + { + "epoch": 0.6269058931990639, + "grad_norm": 4.182394027709961, + "loss": 4.1072, + "lr": 0.000760979020979021, + "step": 2210, + "tokens_trained": 1.086237312 + }, + { + "epoch": 0.6274732288490178, + "grad_norm": 13.04654312133789, + "loss": 4.1611, + "lr": 0.0007606993006993007, + "step": 2212, + "tokens_trained": 1.087220136 + }, + { + "epoch": 0.6280405644989717, + "grad_norm": 8.223962783813477, + "loss": 4.1094, + "lr": 0.0007604195804195805, + "step": 2214, + "tokens_trained": 1.088203464 + }, + { + "epoch": 0.6286079001489256, + "grad_norm": 7.974697589874268, + "loss": 4.1061, + "lr": 0.0007601398601398601, + "step": 2216, + "tokens_trained": 1.089188056 + }, + { + "epoch": 0.6291752357988795, + "grad_norm": 9.93747329711914, + "loss": 4.1625, + "lr": 0.0007598601398601399, + "step": 2218, + "tokens_trained": 1.090168464 + }, + { + "epoch": 0.6297425714488334, + "grad_norm": 14.117332458496094, + "loss": 4.1386, + "lr": 0.0007595804195804196, + "step": 2220, + "tokens_trained": 1.09115228 + }, + { + "epoch": 0.6303099070987873, + "grad_norm": 8.045380592346191, + "loss": 4.0962, + "lr": 0.0007593006993006993, + "step": 2222, + "tokens_trained": 1.0921348 + }, + { + "epoch": 0.6308772427487412, + "grad_norm": 7.286352634429932, + "loss": 4.1456, + "lr": 0.000759020979020979, + "step": 2224, + "tokens_trained": 1.0931198 + }, + { + "epoch": 0.6314445783986952, + "grad_norm": 7.278292179107666, + "loss": 4.1155, + "lr": 0.0007587412587412587, + "step": 2226, + "tokens_trained": 1.094107536 + }, + { + "epoch": 0.632011914048649, + "grad_norm": 5.973489761352539, + "loss": 4.1403, + "lr": 0.0007584615384615385, + "step": 2228, + "tokens_trained": 1.095090384 + }, + { + "epoch": 0.6325792496986029, + "grad_norm": 11.78962230682373, + "loss": 4.1322, + "lr": 0.0007581818181818182, + "step": 2230, + "tokens_trained": 1.096072192 + }, + { + "epoch": 0.6331465853485568, + "grad_norm": 9.853010177612305, + "loss": 4.0905, + "lr": 0.000757902097902098, + "step": 2232, + "tokens_trained": 1.097057368 + }, + { + "epoch": 0.6337139209985108, + "grad_norm": 12.578025817871094, + "loss": 4.0871, + "lr": 0.0007576223776223776, + "step": 2234, + "tokens_trained": 1.0980418 + }, + { + "epoch": 0.6342812566484647, + "grad_norm": 8.467657089233398, + "loss": 4.0972, + "lr": 0.0007573426573426573, + "step": 2236, + "tokens_trained": 1.099023032 + }, + { + "epoch": 0.6348485922984185, + "grad_norm": 10.768691062927246, + "loss": 4.0683, + "lr": 0.0007570629370629371, + "step": 2238, + "tokens_trained": 1.1000078 + }, + { + "epoch": 0.6354159279483724, + "grad_norm": 8.509350776672363, + "loss": 4.1319, + "lr": 0.0007567832167832168, + "step": 2240, + "tokens_trained": 1.100990904 + }, + { + "epoch": 0.6359832635983264, + "grad_norm": 9.473450660705566, + "loss": 4.0971, + "lr": 0.0007565034965034965, + "step": 2242, + "tokens_trained": 1.101971112 + }, + { + "epoch": 0.6365505992482803, + "grad_norm": 5.248406887054443, + "loss": 4.1212, + "lr": 0.0007562237762237762, + "step": 2244, + "tokens_trained": 1.10295244 + }, + { + "epoch": 0.6371179348982342, + "grad_norm": 2.8849964141845703, + "loss": 4.0914, + "lr": 0.000755944055944056, + "step": 2246, + "tokens_trained": 1.103935728 + }, + { + "epoch": 0.637685270548188, + "grad_norm": 10.757996559143066, + "loss": 4.0711, + "lr": 0.0007556643356643357, + "step": 2248, + "tokens_trained": 1.104917112 + }, + { + "epoch": 0.638252606198142, + "grad_norm": 14.822528839111328, + "loss": 4.1311, + "lr": 0.0007553846153846154, + "step": 2250, + "tokens_trained": 1.105899872 + }, + { + "epoch": 0.638252606198142, + "eval_loss": 1.0298579931259155, + "eval_runtime": 20.7482, + "step": 2250, + "tokens_trained": 1.105899872 + } + ], + "logging_steps": 2, + "max_steps": 7650, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 750, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}