diff --git "a/checkpoint-6000/trainer_state.json" "b/checkpoint-6000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6000/trainer_state.json" @@ -0,0 +1,24370 @@ +{ + "best_global_step": 6000, + "best_metric": 0.9661399722099304, + "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-6000", + "epoch": 1.7017941989929792, + "eval_steps": 125, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005673356499539039, + "grad_norm": 8450.4345703125, + "loss": 876.9911, + "lr": 2e-06, + "step": 2, + "tokens_trained": 0.000985992 + }, + { + "epoch": 0.0011346712999078079, + "grad_norm": 8980.888671875, + "loss": 779.4711, + "lr": 6e-06, + "step": 4, + "tokens_trained": 0.001968088 + }, + { + "epoch": 0.001702006949861712, + "grad_norm": 7489.92529296875, + "loss": 488.6157, + "lr": 1e-05, + "step": 6, + "tokens_trained": 0.002953808 + }, + { + "epoch": 0.0022693425998156157, + "grad_norm": 1952.1917724609375, + "loss": 237.0602, + "lr": 1.4e-05, + "step": 8, + "tokens_trained": 0.003935728 + }, + { + "epoch": 0.0028366782497695198, + "grad_norm": 1418.443603515625, + "loss": 159.0854, + "lr": 1.8e-05, + "step": 10, + "tokens_trained": 0.004916488 + }, + { + "epoch": 0.003404013899723424, + "grad_norm": 874.7195434570312, + "loss": 91.9563, + "lr": 2.2e-05, + "step": 12, + "tokens_trained": 0.005902792 + }, + { + "epoch": 0.003971349549677328, + "grad_norm": 1339.8248291015625, + "loss": 40.3366, + "lr": 2.6e-05, + "step": 14, + "tokens_trained": 0.0068856 + }, + { + "epoch": 0.0045386851996312315, + "grad_norm": 2936.7607421875, + "loss": 22.7436, + "lr": 3e-05, + "step": 16, + "tokens_trained": 0.007868248 + }, + { + "epoch": 0.005106020849585136, + "grad_norm": 1531.3807373046875, + "loss": 23.4797, + "lr": 3.4000000000000007e-05, + "step": 18, + "tokens_trained": 0.008849296 + }, + { + "epoch": 0.0056733564995390395, + "grad_norm": 3027.4189453125, + "loss": 38.7379, + "lr": 3.8e-05, + "step": 20, + "tokens_trained": 0.009830984 + }, + { + "epoch": 0.006240692149492944, + "grad_norm": 2435.890625, + "loss": 26.2427, + "lr": 4.2000000000000004e-05, + "step": 22, + "tokens_trained": 0.01081364 + }, + { + "epoch": 0.006808027799446848, + "grad_norm": 3217.990478515625, + "loss": 31.0263, + "lr": 4.6e-05, + "step": 24, + "tokens_trained": 0.01179036 + }, + { + "epoch": 0.007375363449400752, + "grad_norm": 3854.00634765625, + "loss": 33.8781, + "lr": 5e-05, + "step": 26, + "tokens_trained": 0.012774504 + }, + { + "epoch": 0.007942699099354656, + "grad_norm": 3197.489990234375, + "loss": 27.7927, + "lr": 5.4e-05, + "step": 28, + "tokens_trained": 0.013759992 + }, + { + "epoch": 0.00851003474930856, + "grad_norm": 3034.156494140625, + "loss": 37.9083, + "lr": 5.800000000000001e-05, + "step": 30, + "tokens_trained": 0.014740536 + }, + { + "epoch": 0.009077370399262463, + "grad_norm": 3040.314453125, + "loss": 34.0659, + "lr": 6.2e-05, + "step": 32, + "tokens_trained": 0.015725984 + }, + { + "epoch": 0.009644706049216368, + "grad_norm": 3065.5791015625, + "loss": 27.7768, + "lr": 6.6e-05, + "step": 34, + "tokens_trained": 0.016706864 + }, + { + "epoch": 0.010212041699170272, + "grad_norm": 2454.293701171875, + "loss": 35.1143, + "lr": 7.000000000000001e-05, + "step": 36, + "tokens_trained": 0.017688816 + }, + { + "epoch": 0.010779377349124175, + "grad_norm": 3100.7802734375, + "loss": 42.2603, + "lr": 7.4e-05, + "step": 38, + "tokens_trained": 0.018669072 + }, + { + "epoch": 0.011346712999078079, + "grad_norm": 2749.84423828125, + "loss": 39.3879, + "lr": 7.8e-05, + "step": 40, + "tokens_trained": 0.019652072 + }, + { + "epoch": 0.011914048649031984, + "grad_norm": 1519.9908447265625, + "loss": 35.0735, + "lr": 8.2e-05, + "step": 42, + "tokens_trained": 0.020633112 + }, + { + "epoch": 0.012481384298985888, + "grad_norm": 1474.4244384765625, + "loss": 25.8965, + "lr": 8.599999999999999e-05, + "step": 44, + "tokens_trained": 0.021616192 + }, + { + "epoch": 0.013048719948939792, + "grad_norm": 2962.500244140625, + "loss": 51.0784, + "lr": 8.999999999999999e-05, + "step": 46, + "tokens_trained": 0.022597288 + }, + { + "epoch": 0.013616055598893695, + "grad_norm": 2419.41455078125, + "loss": 43.0334, + "lr": 9.400000000000001e-05, + "step": 48, + "tokens_trained": 0.02357572 + }, + { + "epoch": 0.014183391248847599, + "grad_norm": 1267.87451171875, + "loss": 21.8063, + "lr": 9.800000000000001e-05, + "step": 50, + "tokens_trained": 0.024553376 + }, + { + "epoch": 0.014750726898801504, + "grad_norm": 1573.944091796875, + "loss": 52.9693, + "lr": 0.000102, + "step": 52, + "tokens_trained": 0.025536728 + }, + { + "epoch": 0.015318062548755408, + "grad_norm": 1509.650146484375, + "loss": 50.0825, + "lr": 0.000106, + "step": 54, + "tokens_trained": 0.026517 + }, + { + "epoch": 0.01588539819870931, + "grad_norm": 2334.765380859375, + "loss": 42.1982, + "lr": 0.00011, + "step": 56, + "tokens_trained": 0.027504728 + }, + { + "epoch": 0.016452733848663217, + "grad_norm": 1594.16259765625, + "loss": 39.0562, + "lr": 0.000114, + "step": 58, + "tokens_trained": 0.028485416 + }, + { + "epoch": 0.01702006949861712, + "grad_norm": 1628.082275390625, + "loss": 35.0488, + "lr": 0.000118, + "step": 60, + "tokens_trained": 0.029468696 + }, + { + "epoch": 0.017587405148571024, + "grad_norm": 2496.6455078125, + "loss": 49.4241, + "lr": 0.000122, + "step": 62, + "tokens_trained": 0.030453584 + }, + { + "epoch": 0.018154740798524926, + "grad_norm": 2521.721435546875, + "loss": 69.0275, + "lr": 0.000126, + "step": 64, + "tokens_trained": 0.031432864 + }, + { + "epoch": 0.01872207644847883, + "grad_norm": 2179.571533203125, + "loss": 63.1409, + "lr": 0.00013000000000000002, + "step": 66, + "tokens_trained": 0.032418416 + }, + { + "epoch": 0.019289412098432736, + "grad_norm": 899.7137451171875, + "loss": 38.4131, + "lr": 0.000134, + "step": 68, + "tokens_trained": 0.033402136 + }, + { + "epoch": 0.01985674774838664, + "grad_norm": 2109.377685546875, + "loss": 51.0044, + "lr": 0.00013800000000000002, + "step": 70, + "tokens_trained": 0.03438832 + }, + { + "epoch": 0.020424083398340544, + "grad_norm": 1649.1873779296875, + "loss": 32.1408, + "lr": 0.00014199999999999998, + "step": 72, + "tokens_trained": 0.035374464 + }, + { + "epoch": 0.020991419048294446, + "grad_norm": 1807.994140625, + "loss": 28.8357, + "lr": 0.000146, + "step": 74, + "tokens_trained": 0.03635784 + }, + { + "epoch": 0.02155875469824835, + "grad_norm": 998.9485473632812, + "loss": 23.0343, + "lr": 0.00015, + "step": 76, + "tokens_trained": 0.037340248 + }, + { + "epoch": 0.022126090348202256, + "grad_norm": 2240.17578125, + "loss": 32.0397, + "lr": 0.000154, + "step": 78, + "tokens_trained": 0.038321968 + }, + { + "epoch": 0.022693425998156158, + "grad_norm": 1606.0067138671875, + "loss": 32.1776, + "lr": 0.000158, + "step": 80, + "tokens_trained": 0.039304992 + }, + { + "epoch": 0.023260761648110063, + "grad_norm": 1685.1015625, + "loss": 24.3428, + "lr": 0.000162, + "step": 82, + "tokens_trained": 0.040286808 + }, + { + "epoch": 0.02382809729806397, + "grad_norm": 1761.7890625, + "loss": 23.9261, + "lr": 0.00016600000000000002, + "step": 84, + "tokens_trained": 0.041271776 + }, + { + "epoch": 0.02439543294801787, + "grad_norm": 2036.0982666015625, + "loss": 27.7196, + "lr": 0.00017, + "step": 86, + "tokens_trained": 0.042252784 + }, + { + "epoch": 0.024962768597971776, + "grad_norm": 1564.3870849609375, + "loss": 25.3722, + "lr": 0.000174, + "step": 88, + "tokens_trained": 0.04323596 + }, + { + "epoch": 0.025530104247925678, + "grad_norm": 1508.349853515625, + "loss": 18.4107, + "lr": 0.000178, + "step": 90, + "tokens_trained": 0.044218984 + }, + { + "epoch": 0.026097439897879583, + "grad_norm": 1955.011474609375, + "loss": 28.8456, + "lr": 0.000182, + "step": 92, + "tokens_trained": 0.045202144 + }, + { + "epoch": 0.02666477554783349, + "grad_norm": 1679.9423828125, + "loss": 23.6139, + "lr": 0.000186, + "step": 94, + "tokens_trained": 0.046192336 + }, + { + "epoch": 0.02723211119778739, + "grad_norm": 1517.5731201171875, + "loss": 42.145, + "lr": 0.00019, + "step": 96, + "tokens_trained": 0.047174312 + }, + { + "epoch": 0.027799446847741296, + "grad_norm": 1535.3076171875, + "loss": 31.9711, + "lr": 0.000194, + "step": 98, + "tokens_trained": 0.048158944 + }, + { + "epoch": 0.028366782497695198, + "grad_norm": 1475.2569580078125, + "loss": 37.645, + "lr": 0.00019800000000000002, + "step": 100, + "tokens_trained": 0.04914364 + }, + { + "epoch": 0.028934118147649103, + "grad_norm": 1918.4088134765625, + "loss": 69.4053, + "lr": 0.000202, + "step": 102, + "tokens_trained": 0.050123488 + }, + { + "epoch": 0.02950145379760301, + "grad_norm": 1631.6231689453125, + "loss": 50.9725, + "lr": 0.000206, + "step": 104, + "tokens_trained": 0.051105512 + }, + { + "epoch": 0.03006878944755691, + "grad_norm": 1291.6376953125, + "loss": 22.6527, + "lr": 0.00021, + "step": 106, + "tokens_trained": 0.052091704 + }, + { + "epoch": 0.030636125097510816, + "grad_norm": 1224.9625244140625, + "loss": 60.2725, + "lr": 0.000214, + "step": 108, + "tokens_trained": 0.053074824 + }, + { + "epoch": 0.031203460747464717, + "grad_norm": 1218.2022705078125, + "loss": 75.8728, + "lr": 0.000218, + "step": 110, + "tokens_trained": 0.054057104 + }, + { + "epoch": 0.03177079639741862, + "grad_norm": 1761.8861083984375, + "loss": 61.6427, + "lr": 0.000222, + "step": 112, + "tokens_trained": 0.055039128 + }, + { + "epoch": 0.03233813204737253, + "grad_norm": 1482.4256591796875, + "loss": 35.3351, + "lr": 0.00022600000000000002, + "step": 114, + "tokens_trained": 0.05602388 + }, + { + "epoch": 0.03290546769732643, + "grad_norm": 563.6399536132812, + "loss": 40.1461, + "lr": 0.00023, + "step": 116, + "tokens_trained": 0.057005376 + }, + { + "epoch": 0.03347280334728033, + "grad_norm": 1266.058837890625, + "loss": 24.0657, + "lr": 0.00023400000000000002, + "step": 118, + "tokens_trained": 0.057985136 + }, + { + "epoch": 0.03404013899723424, + "grad_norm": 918.206298828125, + "loss": 23.9626, + "lr": 0.00023799999999999998, + "step": 120, + "tokens_trained": 0.058968288 + }, + { + "epoch": 0.03460747464718814, + "grad_norm": 1495.7191162109375, + "loss": 19.798, + "lr": 0.000242, + "step": 122, + "tokens_trained": 0.05995348 + }, + { + "epoch": 0.03517481029714205, + "grad_norm": 1264.302734375, + "loss": 31.5342, + "lr": 0.000246, + "step": 124, + "tokens_trained": 0.060935832 + }, + { + "epoch": 0.035458478122119, + "eval_loss": 5.312118053436279, + "eval_runtime": 21.3065, + "step": 125, + "tokens_trained": 0.061426608 + }, + { + "epoch": 0.03574214594709595, + "grad_norm": 907.4861450195312, + "loss": 25.1262, + "lr": 0.00025, + "step": 126, + "tokens_trained": 0.061918184 + }, + { + "epoch": 0.03630948159704985, + "grad_norm": 1287.6158447265625, + "loss": 26.963, + "lr": 0.000254, + "step": 128, + "tokens_trained": 0.062902328 + }, + { + "epoch": 0.03687681724700376, + "grad_norm": 1260.570556640625, + "loss": 24.9633, + "lr": 0.00025800000000000004, + "step": 130, + "tokens_trained": 0.063883456 + }, + { + "epoch": 0.03744415289695766, + "grad_norm": 1436.82373046875, + "loss": 23.1028, + "lr": 0.000262, + "step": 132, + "tokens_trained": 0.06486748 + }, + { + "epoch": 0.03801148854691157, + "grad_norm": 812.9523315429688, + "loss": 20.5496, + "lr": 0.000266, + "step": 134, + "tokens_trained": 0.065847104 + }, + { + "epoch": 0.03857882419686547, + "grad_norm": 1336.5322265625, + "loss": 23.673, + "lr": 0.00027, + "step": 136, + "tokens_trained": 0.066829928 + }, + { + "epoch": 0.03914615984681937, + "grad_norm": 1381.282470703125, + "loss": 32.0373, + "lr": 0.00027400000000000005, + "step": 138, + "tokens_trained": 0.067814024 + }, + { + "epoch": 0.03971349549677328, + "grad_norm": 972.7861938476562, + "loss": 26.9454, + "lr": 0.00027800000000000004, + "step": 140, + "tokens_trained": 0.068797744 + }, + { + "epoch": 0.04028083114672718, + "grad_norm": 1347.2249755859375, + "loss": 22.3578, + "lr": 0.00028199999999999997, + "step": 142, + "tokens_trained": 0.069780072 + }, + { + "epoch": 0.04084816679668109, + "grad_norm": 829.525390625, + "loss": 37.9879, + "lr": 0.00028599999999999996, + "step": 144, + "tokens_trained": 0.070759896 + }, + { + "epoch": 0.04141550244663499, + "grad_norm": 1094.1033935546875, + "loss": 21.1972, + "lr": 0.00029, + "step": 146, + "tokens_trained": 0.0717452 + }, + { + "epoch": 0.04198283809658889, + "grad_norm": 717.107421875, + "loss": 21.7774, + "lr": 0.000294, + "step": 148, + "tokens_trained": 0.072727432 + }, + { + "epoch": 0.042550173746542796, + "grad_norm": 744.4456787109375, + "loss": 20.3235, + "lr": 0.000298, + "step": 150, + "tokens_trained": 0.073712128 + }, + { + "epoch": 0.0431175093964967, + "grad_norm": 904.1460571289062, + "loss": 22.7878, + "lr": 0.000302, + "step": 152, + "tokens_trained": 0.074695296 + }, + { + "epoch": 0.04368484504645061, + "grad_norm": 1352.303955078125, + "loss": 20.9757, + "lr": 0.000306, + "step": 154, + "tokens_trained": 0.0756798 + }, + { + "epoch": 0.04425218069640451, + "grad_norm": 997.0473022460938, + "loss": 17.4647, + "lr": 0.00031, + "step": 156, + "tokens_trained": 0.076666504 + }, + { + "epoch": 0.04481951634635841, + "grad_norm": 1206.387939453125, + "loss": 21.1846, + "lr": 0.000314, + "step": 158, + "tokens_trained": 0.07764868 + }, + { + "epoch": 0.045386851996312316, + "grad_norm": 1029.6807861328125, + "loss": 17.8853, + "lr": 0.00031800000000000003, + "step": 160, + "tokens_trained": 0.07863548 + }, + { + "epoch": 0.04595418764626622, + "grad_norm": 1136.4635009765625, + "loss": 30.057, + "lr": 0.000322, + "step": 162, + "tokens_trained": 0.079618928 + }, + { + "epoch": 0.04652152329622013, + "grad_norm": 834.3464965820312, + "loss": 28.1782, + "lr": 0.000326, + "step": 164, + "tokens_trained": 0.0806032 + }, + { + "epoch": 0.04708885894617403, + "grad_norm": 1177.8365478515625, + "loss": 16.4267, + "lr": 0.00033, + "step": 166, + "tokens_trained": 0.081583752 + }, + { + "epoch": 0.04765619459612794, + "grad_norm": 572.501708984375, + "loss": 16.5752, + "lr": 0.00033400000000000004, + "step": 168, + "tokens_trained": 0.082568184 + }, + { + "epoch": 0.048223530246081836, + "grad_norm": 437.6822814941406, + "loss": 11.5509, + "lr": 0.00033800000000000003, + "step": 170, + "tokens_trained": 0.083553352 + }, + { + "epoch": 0.04879086589603574, + "grad_norm": 1119.0416259765625, + "loss": 16.2689, + "lr": 0.000342, + "step": 172, + "tokens_trained": 0.084536352 + }, + { + "epoch": 0.04935820154598965, + "grad_norm": 895.4021606445312, + "loss": 12.6663, + "lr": 0.000346, + "step": 174, + "tokens_trained": 0.085517312 + }, + { + "epoch": 0.04992553719594355, + "grad_norm": 995.6289672851562, + "loss": 26.0663, + "lr": 0.00035, + "step": 176, + "tokens_trained": 0.086496088 + }, + { + "epoch": 0.05049287284589746, + "grad_norm": 839.6610717773438, + "loss": 21.5115, + "lr": 0.000354, + "step": 178, + "tokens_trained": 0.087480632 + }, + { + "epoch": 0.051060208495851356, + "grad_norm": 734.1155395507812, + "loss": 29.3287, + "lr": 0.000358, + "step": 180, + "tokens_trained": 0.088460408 + }, + { + "epoch": 0.05162754414580526, + "grad_norm": 721.4505615234375, + "loss": 26.0801, + "lr": 0.000362, + "step": 182, + "tokens_trained": 0.08944248 + }, + { + "epoch": 0.052194879795759166, + "grad_norm": 845.9672241210938, + "loss": 19.0639, + "lr": 0.000366, + "step": 184, + "tokens_trained": 0.090427832 + }, + { + "epoch": 0.05276221544571307, + "grad_norm": 1210.9969482421875, + "loss": 23.9036, + "lr": 0.00037, + "step": 186, + "tokens_trained": 0.091411504 + }, + { + "epoch": 0.05332955109566698, + "grad_norm": 1079.1690673828125, + "loss": 23.5588, + "lr": 0.000374, + "step": 188, + "tokens_trained": 0.092392672 + }, + { + "epoch": 0.053896886745620876, + "grad_norm": 596.111328125, + "loss": 20.8275, + "lr": 0.000378, + "step": 190, + "tokens_trained": 0.093374696 + }, + { + "epoch": 0.05446422239557478, + "grad_norm": 761.8096923828125, + "loss": 22.512, + "lr": 0.000382, + "step": 192, + "tokens_trained": 0.094361912 + }, + { + "epoch": 0.055031558045528686, + "grad_norm": 1081.9832763671875, + "loss": 32.335, + "lr": 0.000386, + "step": 194, + "tokens_trained": 0.095342992 + }, + { + "epoch": 0.05559889369548259, + "grad_norm": 304.3534240722656, + "loss": 11.5275, + "lr": 0.00039000000000000005, + "step": 196, + "tokens_trained": 0.096323512 + }, + { + "epoch": 0.0561662293454365, + "grad_norm": 586.6314086914062, + "loss": 16.2663, + "lr": 0.00039400000000000004, + "step": 198, + "tokens_trained": 0.097308864 + }, + { + "epoch": 0.056733564995390395, + "grad_norm": 624.9953002929688, + "loss": 16.627, + "lr": 0.000398, + "step": 200, + "tokens_trained": 0.098289064 + }, + { + "epoch": 0.0573009006453443, + "grad_norm": 585.9645385742188, + "loss": 15.8359, + "lr": 0.000402, + "step": 202, + "tokens_trained": 0.099269696 + }, + { + "epoch": 0.057868236295298206, + "grad_norm": 537.9913330078125, + "loss": 20.0779, + "lr": 0.00040600000000000006, + "step": 204, + "tokens_trained": 0.100248448 + }, + { + "epoch": 0.05843557194525211, + "grad_norm": 805.04931640625, + "loss": 21.4524, + "lr": 0.00041, + "step": 206, + "tokens_trained": 0.101231248 + }, + { + "epoch": 0.05900290759520602, + "grad_norm": 439.1418151855469, + "loss": 23.9852, + "lr": 0.000414, + "step": 208, + "tokens_trained": 0.102210688 + }, + { + "epoch": 0.059570243245159915, + "grad_norm": 502.684814453125, + "loss": 17.6273, + "lr": 0.00041799999999999997, + "step": 210, + "tokens_trained": 0.103192176 + }, + { + "epoch": 0.06013757889511382, + "grad_norm": 849.9979858398438, + "loss": 33.7517, + "lr": 0.000422, + "step": 212, + "tokens_trained": 0.104172824 + }, + { + "epoch": 0.060704914545067726, + "grad_norm": 939.583740234375, + "loss": 26.2559, + "lr": 0.000426, + "step": 214, + "tokens_trained": 0.105156672 + }, + { + "epoch": 0.06127225019502163, + "grad_norm": 525.0505981445312, + "loss": 20.0923, + "lr": 0.00043, + "step": 216, + "tokens_trained": 0.106141368 + }, + { + "epoch": 0.061839585844975536, + "grad_norm": 420.296630859375, + "loss": 17.9608, + "lr": 0.00043400000000000003, + "step": 218, + "tokens_trained": 0.107124088 + }, + { + "epoch": 0.062406921494929435, + "grad_norm": 711.3380737304688, + "loss": 19.387, + "lr": 0.000438, + "step": 220, + "tokens_trained": 0.108112632 + }, + { + "epoch": 0.06297425714488335, + "grad_norm": 759.183349609375, + "loss": 17.8061, + "lr": 0.000442, + "step": 222, + "tokens_trained": 0.1090934 + }, + { + "epoch": 0.06354159279483725, + "grad_norm": 790.025146484375, + "loss": 13.8539, + "lr": 0.000446, + "step": 224, + "tokens_trained": 0.110079512 + }, + { + "epoch": 0.06410892844479114, + "grad_norm": 769.8306274414062, + "loss": 22.1258, + "lr": 0.00045000000000000004, + "step": 226, + "tokens_trained": 0.111060152 + }, + { + "epoch": 0.06467626409474506, + "grad_norm": 656.8352661132812, + "loss": 14.8646, + "lr": 0.00045400000000000003, + "step": 228, + "tokens_trained": 0.112044144 + }, + { + "epoch": 0.06524359974469895, + "grad_norm": 498.92010498046875, + "loss": 23.1558, + "lr": 0.000458, + "step": 230, + "tokens_trained": 0.113022928 + }, + { + "epoch": 0.06581093539465287, + "grad_norm": 764.0186157226562, + "loss": 16.7089, + "lr": 0.000462, + "step": 232, + "tokens_trained": 0.114003832 + }, + { + "epoch": 0.06637827104460677, + "grad_norm": 491.5793762207031, + "loss": 12.3979, + "lr": 0.00046600000000000005, + "step": 234, + "tokens_trained": 0.114991008 + }, + { + "epoch": 0.06694560669456066, + "grad_norm": 679.9217529296875, + "loss": 14.9037, + "lr": 0.00047, + "step": 236, + "tokens_trained": 0.115971888 + }, + { + "epoch": 0.06751294234451458, + "grad_norm": 491.0369567871094, + "loss": 7.7603, + "lr": 0.000474, + "step": 238, + "tokens_trained": 0.116952616 + }, + { + "epoch": 0.06808027799446847, + "grad_norm": 369.2186279296875, + "loss": 8.2256, + "lr": 0.00047799999999999996, + "step": 240, + "tokens_trained": 0.117935816 + }, + { + "epoch": 0.06864761364442239, + "grad_norm": 312.72137451171875, + "loss": 7.5486, + "lr": 0.000482, + "step": 242, + "tokens_trained": 0.118919392 + }, + { + "epoch": 0.06921494929437629, + "grad_norm": 596.1439208984375, + "loss": 11.7351, + "lr": 0.000486, + "step": 244, + "tokens_trained": 0.119901856 + }, + { + "epoch": 0.06978228494433018, + "grad_norm": 467.5667419433594, + "loss": 11.8403, + "lr": 0.00049, + "step": 246, + "tokens_trained": 0.120884624 + }, + { + "epoch": 0.0703496205942841, + "grad_norm": 430.50048828125, + "loss": 13.8081, + "lr": 0.000494, + "step": 248, + "tokens_trained": 0.121869224 + }, + { + "epoch": 0.070916956244238, + "grad_norm": 522.242919921875, + "loss": 14.1892, + "lr": 0.000498, + "step": 250, + "tokens_trained": 0.122853584 + }, + { + "epoch": 0.070916956244238, + "eval_loss": 1.9294606447219849, + "eval_runtime": 20.4162, + "step": 250, + "tokens_trained": 0.122853584 + }, + { + "epoch": 0.0714842918941919, + "grad_norm": 835.2765502929688, + "loss": 13.2462, + "lr": 0.0005020000000000001, + "step": 252, + "tokens_trained": 0.123835544 + }, + { + "epoch": 0.0720516275441458, + "grad_norm": 714.8098754882812, + "loss": 20.0498, + "lr": 0.000506, + "step": 254, + "tokens_trained": 0.124821616 + }, + { + "epoch": 0.0726189631940997, + "grad_norm": 701.512939453125, + "loss": 18.3664, + "lr": 0.00051, + "step": 256, + "tokens_trained": 0.125807608 + }, + { + "epoch": 0.07318629884405362, + "grad_norm": 773.987060546875, + "loss": 21.3807, + "lr": 0.000514, + "step": 258, + "tokens_trained": 0.126791464 + }, + { + "epoch": 0.07375363449400751, + "grad_norm": 826.422119140625, + "loss": 22.6403, + "lr": 0.000518, + "step": 260, + "tokens_trained": 0.127771752 + }, + { + "epoch": 0.07432097014396143, + "grad_norm": 742.8673095703125, + "loss": 20.1504, + "lr": 0.000522, + "step": 262, + "tokens_trained": 0.128755448 + }, + { + "epoch": 0.07488830579391532, + "grad_norm": 797.79296875, + "loss": 26.7343, + "lr": 0.000526, + "step": 264, + "tokens_trained": 0.129741088 + }, + { + "epoch": 0.07545564144386922, + "grad_norm": 673.9141235351562, + "loss": 12.505, + "lr": 0.0005300000000000001, + "step": 266, + "tokens_trained": 0.130727504 + }, + { + "epoch": 0.07602297709382314, + "grad_norm": 310.6510925292969, + "loss": 12.6344, + "lr": 0.0005340000000000001, + "step": 268, + "tokens_trained": 0.131710296 + }, + { + "epoch": 0.07659031274377703, + "grad_norm": 312.40966796875, + "loss": 14.254, + "lr": 0.0005380000000000001, + "step": 270, + "tokens_trained": 0.132695352 + }, + { + "epoch": 0.07715764839373095, + "grad_norm": 492.2834777832031, + "loss": 19.0979, + "lr": 0.0005420000000000001, + "step": 272, + "tokens_trained": 0.133677928 + }, + { + "epoch": 0.07772498404368484, + "grad_norm": 628.457763671875, + "loss": 21.7735, + "lr": 0.000546, + "step": 274, + "tokens_trained": 0.134655504 + }, + { + "epoch": 0.07829231969363874, + "grad_norm": 382.8389892578125, + "loss": 12.5128, + "lr": 0.00055, + "step": 276, + "tokens_trained": 0.135640208 + }, + { + "epoch": 0.07885965534359266, + "grad_norm": 483.12335205078125, + "loss": 15.2589, + "lr": 0.000554, + "step": 278, + "tokens_trained": 0.136624232 + }, + { + "epoch": 0.07942699099354655, + "grad_norm": 640.658447265625, + "loss": 12.1341, + "lr": 0.000558, + "step": 280, + "tokens_trained": 0.13760628 + }, + { + "epoch": 0.07999432664350047, + "grad_norm": 410.0824279785156, + "loss": 12.5723, + "lr": 0.0005620000000000001, + "step": 282, + "tokens_trained": 0.13858832 + }, + { + "epoch": 0.08056166229345436, + "grad_norm": 513.2861328125, + "loss": 14.8461, + "lr": 0.000566, + "step": 284, + "tokens_trained": 0.139568424 + }, + { + "epoch": 0.08112899794340826, + "grad_norm": 564.547607421875, + "loss": 12.5792, + "lr": 0.00057, + "step": 286, + "tokens_trained": 0.140557016 + }, + { + "epoch": 0.08169633359336217, + "grad_norm": 451.3592834472656, + "loss": 16.5433, + "lr": 0.000574, + "step": 288, + "tokens_trained": 0.141540248 + }, + { + "epoch": 0.08226366924331607, + "grad_norm": 404.2495422363281, + "loss": 16.4138, + "lr": 0.000578, + "step": 290, + "tokens_trained": 0.142528272 + }, + { + "epoch": 0.08283100489326999, + "grad_norm": 566.5219116210938, + "loss": 16.4743, + "lr": 0.0005819999999999999, + "step": 292, + "tokens_trained": 0.143513096 + }, + { + "epoch": 0.08339834054322388, + "grad_norm": 559.6517333984375, + "loss": 16.421, + "lr": 0.0005859999999999999, + "step": 294, + "tokens_trained": 0.144494472 + }, + { + "epoch": 0.08396567619317778, + "grad_norm": 260.874755859375, + "loss": 11.2214, + "lr": 0.00059, + "step": 296, + "tokens_trained": 0.14547876 + }, + { + "epoch": 0.0845330118431317, + "grad_norm": 272.02899169921875, + "loss": 10.3491, + "lr": 0.000594, + "step": 298, + "tokens_trained": 0.146465864 + }, + { + "epoch": 0.08510034749308559, + "grad_norm": 556.9845581054688, + "loss": 10.4348, + "lr": 0.000598, + "step": 300, + "tokens_trained": 0.147446344 + }, + { + "epoch": 0.0856676831430395, + "grad_norm": 273.35772705078125, + "loss": 8.3292, + "lr": 0.000602, + "step": 302, + "tokens_trained": 0.14843244 + }, + { + "epoch": 0.0862350187929934, + "grad_norm": 246.6316680908203, + "loss": 9.9362, + "lr": 0.000606, + "step": 304, + "tokens_trained": 0.149415976 + }, + { + "epoch": 0.0868023544429473, + "grad_norm": 564.4365844726562, + "loss": 9.2621, + "lr": 0.00061, + "step": 306, + "tokens_trained": 0.150398728 + }, + { + "epoch": 0.08736969009290121, + "grad_norm": 396.0948791503906, + "loss": 11.8526, + "lr": 0.000614, + "step": 308, + "tokens_trained": 0.151385104 + }, + { + "epoch": 0.08793702574285511, + "grad_norm": 488.6072692871094, + "loss": 11.8473, + "lr": 0.0006180000000000001, + "step": 310, + "tokens_trained": 0.152373672 + }, + { + "epoch": 0.08850436139280903, + "grad_norm": 346.70660400390625, + "loss": 12.0897, + "lr": 0.000622, + "step": 312, + "tokens_trained": 0.153356256 + }, + { + "epoch": 0.08907169704276292, + "grad_norm": 382.40679931640625, + "loss": 9.271, + "lr": 0.000626, + "step": 314, + "tokens_trained": 0.154342632 + }, + { + "epoch": 0.08963903269271682, + "grad_norm": 288.7908935546875, + "loss": 9.185, + "lr": 0.00063, + "step": 316, + "tokens_trained": 0.1553238 + }, + { + "epoch": 0.09020636834267073, + "grad_norm": 337.5335388183594, + "loss": 12.0555, + "lr": 0.000634, + "step": 318, + "tokens_trained": 0.156313168 + }, + { + "epoch": 0.09077370399262463, + "grad_norm": 349.25531005859375, + "loss": 8.51, + "lr": 0.000638, + "step": 320, + "tokens_trained": 0.157299448 + }, + { + "epoch": 0.09134103964257854, + "grad_norm": 471.7824401855469, + "loss": 14.1888, + "lr": 0.000642, + "step": 322, + "tokens_trained": 0.158285264 + }, + { + "epoch": 0.09190837529253244, + "grad_norm": 284.94036865234375, + "loss": 10.1593, + "lr": 0.000646, + "step": 324, + "tokens_trained": 0.159267512 + }, + { + "epoch": 0.09247571094248634, + "grad_norm": 510.90478515625, + "loss": 13.5744, + "lr": 0.0006500000000000001, + "step": 326, + "tokens_trained": 0.160250856 + }, + { + "epoch": 0.09304304659244025, + "grad_norm": 373.82965087890625, + "loss": 8.4999, + "lr": 0.0006540000000000001, + "step": 328, + "tokens_trained": 0.161231832 + }, + { + "epoch": 0.09361038224239415, + "grad_norm": 219.3827362060547, + "loss": 8.4436, + "lr": 0.0006580000000000001, + "step": 330, + "tokens_trained": 0.162217656 + }, + { + "epoch": 0.09417771789234806, + "grad_norm": 433.0914001464844, + "loss": 11.2019, + "lr": 0.000662, + "step": 332, + "tokens_trained": 0.163199096 + }, + { + "epoch": 0.09474505354230196, + "grad_norm": 242.65907287597656, + "loss": 9.0666, + "lr": 0.000666, + "step": 334, + "tokens_trained": 0.164178512 + }, + { + "epoch": 0.09531238919225588, + "grad_norm": 446.07916259765625, + "loss": 8.6546, + "lr": 0.00067, + "step": 336, + "tokens_trained": 0.165162464 + }, + { + "epoch": 0.09587972484220977, + "grad_norm": 231.8892364501953, + "loss": 7.5819, + "lr": 0.000674, + "step": 338, + "tokens_trained": 0.166141536 + }, + { + "epoch": 0.09644706049216367, + "grad_norm": 100.7306137084961, + "loss": 6.7047, + "lr": 0.0006780000000000001, + "step": 340, + "tokens_trained": 0.167123944 + }, + { + "epoch": 0.09701439614211758, + "grad_norm": 78.11279296875, + "loss": 5.9308, + "lr": 0.0006820000000000001, + "step": 342, + "tokens_trained": 0.168105264 + }, + { + "epoch": 0.09758173179207148, + "grad_norm": 271.466064453125, + "loss": 6.9141, + "lr": 0.0006860000000000001, + "step": 344, + "tokens_trained": 0.169088912 + }, + { + "epoch": 0.0981490674420254, + "grad_norm": 252.54478454589844, + "loss": 6.3281, + "lr": 0.00069, + "step": 346, + "tokens_trained": 0.170077368 + }, + { + "epoch": 0.0987164030919793, + "grad_norm": 305.8559875488281, + "loss": 6.443, + "lr": 0.000694, + "step": 348, + "tokens_trained": 0.171057232 + }, + { + "epoch": 0.09928373874193319, + "grad_norm": 227.74374389648438, + "loss": 6.552, + "lr": 0.0006979999999999999, + "step": 350, + "tokens_trained": 0.172041376 + }, + { + "epoch": 0.0998510743918871, + "grad_norm": 446.7601623535156, + "loss": 10.8184, + "lr": 0.0007019999999999999, + "step": 352, + "tokens_trained": 0.173023624 + }, + { + "epoch": 0.100418410041841, + "grad_norm": 353.0849609375, + "loss": 8.6327, + "lr": 0.0007059999999999999, + "step": 354, + "tokens_trained": 0.174005992 + }, + { + "epoch": 0.10098574569179491, + "grad_norm": 367.9427185058594, + "loss": 9.3898, + "lr": 0.00071, + "step": 356, + "tokens_trained": 0.174988304 + }, + { + "epoch": 0.10155308134174881, + "grad_norm": 224.4961700439453, + "loss": 8.284, + "lr": 0.000714, + "step": 358, + "tokens_trained": 0.175969816 + }, + { + "epoch": 0.10212041699170271, + "grad_norm": 221.86537170410156, + "loss": 7.0578, + "lr": 0.000718, + "step": 360, + "tokens_trained": 0.176952688 + }, + { + "epoch": 0.10268775264165662, + "grad_norm": 331.0989685058594, + "loss": 6.9561, + "lr": 0.000722, + "step": 362, + "tokens_trained": 0.177935144 + }, + { + "epoch": 0.10325508829161052, + "grad_norm": 171.6498260498047, + "loss": 7.203, + "lr": 0.000726, + "step": 364, + "tokens_trained": 0.178916776 + }, + { + "epoch": 0.10382242394156443, + "grad_norm": 284.2208557128906, + "loss": 10.3517, + "lr": 0.00073, + "step": 366, + "tokens_trained": 0.179903432 + }, + { + "epoch": 0.10438975959151833, + "grad_norm": 354.8574523925781, + "loss": 9.3888, + "lr": 0.000734, + "step": 368, + "tokens_trained": 0.180883224 + }, + { + "epoch": 0.10495709524147223, + "grad_norm": 344.82574462890625, + "loss": 10.5933, + "lr": 0.000738, + "step": 370, + "tokens_trained": 0.181863808 + }, + { + "epoch": 0.10552443089142614, + "grad_norm": 302.6838073730469, + "loss": 10.2832, + "lr": 0.000742, + "step": 372, + "tokens_trained": 0.182843712 + }, + { + "epoch": 0.10609176654138004, + "grad_norm": 323.0387878417969, + "loss": 6.4864, + "lr": 0.000746, + "step": 374, + "tokens_trained": 0.183825832 + }, + { + "epoch": 0.10637543436635699, + "eval_loss": 1.4430732727050781, + "eval_runtime": 20.5468, + "step": 375, + "tokens_trained": 0.184317744 + }, + { + "epoch": 0.10665910219133395, + "grad_norm": 133.74822998046875, + "loss": 5.4176, + "lr": 0.00075, + "step": 376, + "tokens_trained": 0.184811352 + }, + { + "epoch": 0.10722643784128785, + "grad_norm": 180.3372344970703, + "loss": 5.5641, + "lr": 0.000754, + "step": 378, + "tokens_trained": 0.185792528 + }, + { + "epoch": 0.10779377349124175, + "grad_norm": 250.83999633789062, + "loss": 5.8612, + "lr": 0.000758, + "step": 380, + "tokens_trained": 0.186777112 + }, + { + "epoch": 0.10836110914119566, + "grad_norm": 293.51959228515625, + "loss": 6.0418, + "lr": 0.000762, + "step": 382, + "tokens_trained": 0.18775724 + }, + { + "epoch": 0.10892844479114956, + "grad_norm": 292.56207275390625, + "loss": 6.1812, + "lr": 0.0007660000000000001, + "step": 384, + "tokens_trained": 0.188733568 + }, + { + "epoch": 0.10949578044110347, + "grad_norm": 121.82467651367188, + "loss": 6.0855, + "lr": 0.0007700000000000001, + "step": 386, + "tokens_trained": 0.189718512 + }, + { + "epoch": 0.11006311609105737, + "grad_norm": 124.30497741699219, + "loss": 5.7734, + "lr": 0.0007740000000000001, + "step": 388, + "tokens_trained": 0.190703776 + }, + { + "epoch": 0.11063045174101127, + "grad_norm": 143.64004516601562, + "loss": 5.7641, + "lr": 0.000778, + "step": 390, + "tokens_trained": 0.191689888 + }, + { + "epoch": 0.11119778739096518, + "grad_norm": 160.06784057617188, + "loss": 5.6025, + "lr": 0.000782, + "step": 392, + "tokens_trained": 0.192673992 + }, + { + "epoch": 0.11176512304091908, + "grad_norm": 226.97988891601562, + "loss": 6.0049, + "lr": 0.000786, + "step": 394, + "tokens_trained": 0.193656272 + }, + { + "epoch": 0.112332458690873, + "grad_norm": 223.26898193359375, + "loss": 5.6972, + "lr": 0.00079, + "step": 396, + "tokens_trained": 0.194639144 + }, + { + "epoch": 0.11289979434082689, + "grad_norm": 249.34912109375, + "loss": 5.7348, + "lr": 0.0007940000000000001, + "step": 398, + "tokens_trained": 0.195621256 + }, + { + "epoch": 0.11346712999078079, + "grad_norm": 161.34271240234375, + "loss": 5.6689, + "lr": 0.0007980000000000001, + "step": 400, + "tokens_trained": 0.196604136 + }, + { + "epoch": 0.1140344656407347, + "grad_norm": 148.53176879882812, + "loss": 5.702, + "lr": 0.0008020000000000001, + "step": 402, + "tokens_trained": 0.197586784 + }, + { + "epoch": 0.1146018012906886, + "grad_norm": 144.40835571289062, + "loss": 6.2402, + "lr": 0.0008060000000000001, + "step": 404, + "tokens_trained": 0.198570824 + }, + { + "epoch": 0.11516913694064251, + "grad_norm": 306.57562255859375, + "loss": 7.1739, + "lr": 0.0008100000000000001, + "step": 406, + "tokens_trained": 0.199548328 + }, + { + "epoch": 0.11573647259059641, + "grad_norm": 308.79180908203125, + "loss": 6.0972, + "lr": 0.0008139999999999999, + "step": 408, + "tokens_trained": 0.200532496 + }, + { + "epoch": 0.11630380824055031, + "grad_norm": 197.76791381835938, + "loss": 6.3533, + "lr": 0.0008179999999999999, + "step": 410, + "tokens_trained": 0.201514648 + }, + { + "epoch": 0.11687114389050422, + "grad_norm": 129.5694580078125, + "loss": 6.9628, + "lr": 0.0008219999999999999, + "step": 412, + "tokens_trained": 0.2024994 + }, + { + "epoch": 0.11743847954045812, + "grad_norm": 446.0195617675781, + "loss": 11.7562, + "lr": 0.000826, + "step": 414, + "tokens_trained": 0.20348012 + }, + { + "epoch": 0.11800581519041203, + "grad_norm": 355.5342712402344, + "loss": 8.8055, + "lr": 0.00083, + "step": 416, + "tokens_trained": 0.20446356 + }, + { + "epoch": 0.11857315084036593, + "grad_norm": 456.2491149902344, + "loss": 9.606, + "lr": 0.000834, + "step": 418, + "tokens_trained": 0.205445288 + }, + { + "epoch": 0.11914048649031983, + "grad_norm": 369.8676452636719, + "loss": 8.385, + "lr": 0.000838, + "step": 420, + "tokens_trained": 0.206427832 + }, + { + "epoch": 0.11970782214027374, + "grad_norm": 262.19073486328125, + "loss": 9.0956, + "lr": 0.000842, + "step": 422, + "tokens_trained": 0.207409848 + }, + { + "epoch": 0.12027515779022764, + "grad_norm": 120.3193130493164, + "loss": 5.4937, + "lr": 0.000846, + "step": 424, + "tokens_trained": 0.208391752 + }, + { + "epoch": 0.12084249344018155, + "grad_norm": 222.1111297607422, + "loss": 8.9367, + "lr": 0.00085, + "step": 426, + "tokens_trained": 0.20937384 + }, + { + "epoch": 0.12140982909013545, + "grad_norm": 137.16819763183594, + "loss": 7.5876, + "lr": 0.000854, + "step": 428, + "tokens_trained": 0.210358576 + }, + { + "epoch": 0.12197716474008935, + "grad_norm": 267.61846923828125, + "loss": 8.817, + "lr": 0.000858, + "step": 430, + "tokens_trained": 0.211340064 + }, + { + "epoch": 0.12254450039004326, + "grad_norm": 472.72906494140625, + "loss": 8.203, + "lr": 0.000862, + "step": 432, + "tokens_trained": 0.212321144 + }, + { + "epoch": 0.12311183603999716, + "grad_norm": 297.1420593261719, + "loss": 10.987, + "lr": 0.000866, + "step": 434, + "tokens_trained": 0.213300312 + }, + { + "epoch": 0.12367917168995107, + "grad_norm": 281.7297668457031, + "loss": 7.6117, + "lr": 0.00087, + "step": 436, + "tokens_trained": 0.214287624 + }, + { + "epoch": 0.12424650733990497, + "grad_norm": 203.09678649902344, + "loss": 6.5638, + "lr": 0.000874, + "step": 438, + "tokens_trained": 0.215272136 + }, + { + "epoch": 0.12481384298985887, + "grad_norm": 155.7823944091797, + "loss": 6.1131, + "lr": 0.000878, + "step": 440, + "tokens_trained": 0.216256392 + }, + { + "epoch": 0.12538117863981277, + "grad_norm": 189.86196899414062, + "loss": 8.2565, + "lr": 0.000882, + "step": 442, + "tokens_trained": 0.217242504 + }, + { + "epoch": 0.1259485142897667, + "grad_norm": 247.4568634033203, + "loss": 7.1005, + "lr": 0.0008860000000000001, + "step": 444, + "tokens_trained": 0.218226008 + }, + { + "epoch": 0.1265158499397206, + "grad_norm": 179.72825622558594, + "loss": 6.3379, + "lr": 0.0008900000000000001, + "step": 446, + "tokens_trained": 0.219210584 + }, + { + "epoch": 0.1270831855896745, + "grad_norm": 212.96356201171875, + "loss": 7.2514, + "lr": 0.000894, + "step": 448, + "tokens_trained": 0.220193952 + }, + { + "epoch": 0.1276505212396284, + "grad_norm": 105.67095947265625, + "loss": 5.456, + "lr": 0.000898, + "step": 450, + "tokens_trained": 0.221176936 + }, + { + "epoch": 0.1282178568895823, + "grad_norm": 302.9122619628906, + "loss": 6.4018, + "lr": 0.000902, + "step": 452, + "tokens_trained": 0.222161952 + }, + { + "epoch": 0.12878519253953621, + "grad_norm": 215.66561889648438, + "loss": 6.2853, + "lr": 0.000906, + "step": 454, + "tokens_trained": 0.223144912 + }, + { + "epoch": 0.1293525281894901, + "grad_norm": 272.9984130859375, + "loss": 7.3902, + "lr": 0.00091, + "step": 456, + "tokens_trained": 0.224127392 + }, + { + "epoch": 0.129919863839444, + "grad_norm": 200.7503662109375, + "loss": 6.1637, + "lr": 0.0009140000000000001, + "step": 458, + "tokens_trained": 0.22511648 + }, + { + "epoch": 0.1304871994893979, + "grad_norm": 93.23990631103516, + "loss": 6.4867, + "lr": 0.0009180000000000001, + "step": 460, + "tokens_trained": 0.226098144 + }, + { + "epoch": 0.1310545351393518, + "grad_norm": 274.37164306640625, + "loss": 8.99, + "lr": 0.0009220000000000001, + "step": 462, + "tokens_trained": 0.227081848 + }, + { + "epoch": 0.13162187078930573, + "grad_norm": 186.66322326660156, + "loss": 8.7122, + "lr": 0.0009260000000000001, + "step": 464, + "tokens_trained": 0.22806636 + }, + { + "epoch": 0.13218920643925963, + "grad_norm": 586.1035766601562, + "loss": 9.1045, + "lr": 0.00093, + "step": 466, + "tokens_trained": 0.229047872 + }, + { + "epoch": 0.13275654208921353, + "grad_norm": 227.55996704101562, + "loss": 9.7276, + "lr": 0.000934, + "step": 468, + "tokens_trained": 0.230031144 + }, + { + "epoch": 0.13332387773916743, + "grad_norm": 229.26609802246094, + "loss": 6.6244, + "lr": 0.0009379999999999999, + "step": 470, + "tokens_trained": 0.2310158 + }, + { + "epoch": 0.13389121338912133, + "grad_norm": 145.16331481933594, + "loss": 5.759, + "lr": 0.000942, + "step": 472, + "tokens_trained": 0.2319996 + }, + { + "epoch": 0.13445854903907525, + "grad_norm": 109.9937744140625, + "loss": 5.4838, + "lr": 0.000946, + "step": 474, + "tokens_trained": 0.232983808 + }, + { + "epoch": 0.13502588468902915, + "grad_norm": 135.74899291992188, + "loss": 6.2738, + "lr": 0.00095, + "step": 476, + "tokens_trained": 0.233963016 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 142.99449157714844, + "loss": 5.8459, + "lr": 0.000954, + "step": 478, + "tokens_trained": 0.234948864 + }, + { + "epoch": 0.13616055598893695, + "grad_norm": 198.66883850097656, + "loss": 6.6626, + "lr": 0.000958, + "step": 480, + "tokens_trained": 0.235932392 + }, + { + "epoch": 0.13672789163889085, + "grad_norm": 260.76507568359375, + "loss": 6.9299, + "lr": 0.000962, + "step": 482, + "tokens_trained": 0.236915664 + }, + { + "epoch": 0.13729522728884477, + "grad_norm": 267.97589111328125, + "loss": 6.4343, + "lr": 0.000966, + "step": 484, + "tokens_trained": 0.237896904 + }, + { + "epoch": 0.13786256293879867, + "grad_norm": 89.8781967163086, + "loss": 6.3203, + "lr": 0.0009699999999999999, + "step": 486, + "tokens_trained": 0.238874528 + }, + { + "epoch": 0.13842989858875257, + "grad_norm": 225.62985229492188, + "loss": 6.2778, + "lr": 0.000974, + "step": 488, + "tokens_trained": 0.2398588 + }, + { + "epoch": 0.13899723423870647, + "grad_norm": 85.84110260009766, + "loss": 5.2786, + "lr": 0.000978, + "step": 490, + "tokens_trained": 0.240839968 + }, + { + "epoch": 0.13956456988866037, + "grad_norm": 141.4368438720703, + "loss": 5.5525, + "lr": 0.000982, + "step": 492, + "tokens_trained": 0.241823544 + }, + { + "epoch": 0.1401319055386143, + "grad_norm": 94.9535140991211, + "loss": 5.4386, + "lr": 0.0009860000000000001, + "step": 494, + "tokens_trained": 0.242805456 + }, + { + "epoch": 0.1406992411885682, + "grad_norm": 157.4557647705078, + "loss": 5.9786, + "lr": 0.00099, + "step": 496, + "tokens_trained": 0.243792496 + }, + { + "epoch": 0.1412665768385221, + "grad_norm": 319.5025634765625, + "loss": 7.04, + "lr": 0.000994, + "step": 498, + "tokens_trained": 0.244772472 + }, + { + "epoch": 0.141833912488476, + "grad_norm": 282.26824951171875, + "loss": 9.4037, + "lr": 0.000998, + "step": 500, + "tokens_trained": 0.245758968 + }, + { + "epoch": 0.141833912488476, + "eval_loss": 2.152184247970581, + "eval_runtime": 21.2772, + "step": 500, + "tokens_trained": 0.245758968 + }, + { + "epoch": 0.1424012481384299, + "grad_norm": 306.0666809082031, + "loss": 7.8845, + "lr": 0.00099986013986014, + "step": 502, + "tokens_trained": 0.246739024 + }, + { + "epoch": 0.1429685837883838, + "grad_norm": 188.89024353027344, + "loss": 6.8118, + "lr": 0.0009995804195804196, + "step": 504, + "tokens_trained": 0.247726552 + }, + { + "epoch": 0.1435359194383377, + "grad_norm": 228.97474670410156, + "loss": 6.8475, + "lr": 0.0009993006993006994, + "step": 506, + "tokens_trained": 0.24870688 + }, + { + "epoch": 0.1441032550882916, + "grad_norm": 229.80029296875, + "loss": 6.2171, + "lr": 0.000999020979020979, + "step": 508, + "tokens_trained": 0.249689096 + }, + { + "epoch": 0.1446705907382455, + "grad_norm": 157.30340576171875, + "loss": 6.2281, + "lr": 0.0009987412587412587, + "step": 510, + "tokens_trained": 0.250671768 + }, + { + "epoch": 0.1452379263881994, + "grad_norm": 176.64683532714844, + "loss": 6.5993, + "lr": 0.0009984615384615386, + "step": 512, + "tokens_trained": 0.25165608 + }, + { + "epoch": 0.14580526203815333, + "grad_norm": 197.20526123046875, + "loss": 5.7267, + "lr": 0.0009981818181818182, + "step": 514, + "tokens_trained": 0.252639712 + }, + { + "epoch": 0.14637259768810723, + "grad_norm": 54.713260650634766, + "loss": 5.7911, + "lr": 0.000997902097902098, + "step": 516, + "tokens_trained": 0.253622816 + }, + { + "epoch": 0.14693993333806113, + "grad_norm": 185.74923706054688, + "loss": 7.0055, + "lr": 0.0009976223776223777, + "step": 518, + "tokens_trained": 0.254602792 + }, + { + "epoch": 0.14750726898801503, + "grad_norm": 240.31021118164062, + "loss": 6.452, + "lr": 0.0009973426573426573, + "step": 520, + "tokens_trained": 0.255584736 + }, + { + "epoch": 0.14807460463796893, + "grad_norm": 160.2477264404297, + "loss": 7.6556, + "lr": 0.000997062937062937, + "step": 522, + "tokens_trained": 0.256563792 + }, + { + "epoch": 0.14864194028792285, + "grad_norm": 283.0034484863281, + "loss": 6.5345, + "lr": 0.0009967832167832168, + "step": 524, + "tokens_trained": 0.257546656 + }, + { + "epoch": 0.14920927593787675, + "grad_norm": 245.537109375, + "loss": 6.3281, + "lr": 0.0009965034965034964, + "step": 526, + "tokens_trained": 0.258530832 + }, + { + "epoch": 0.14977661158783065, + "grad_norm": 162.1538848876953, + "loss": 7.4072, + "lr": 0.0009962237762237763, + "step": 528, + "tokens_trained": 0.259514528 + }, + { + "epoch": 0.15034394723778455, + "grad_norm": 107.25792694091797, + "loss": 5.356, + "lr": 0.000995944055944056, + "step": 530, + "tokens_trained": 0.260500912 + }, + { + "epoch": 0.15091128288773845, + "grad_norm": 173.73353576660156, + "loss": 6.8625, + "lr": 0.0009956643356643356, + "step": 532, + "tokens_trained": 0.26148632 + }, + { + "epoch": 0.15147861853769237, + "grad_norm": 178.33541870117188, + "loss": 5.8794, + "lr": 0.0009953846153846154, + "step": 534, + "tokens_trained": 0.262468816 + }, + { + "epoch": 0.15204595418764627, + "grad_norm": 181.2533416748047, + "loss": 7.0243, + "lr": 0.000995104895104895, + "step": 536, + "tokens_trained": 0.263446696 + }, + { + "epoch": 0.15261328983760017, + "grad_norm": 208.79293823242188, + "loss": 5.8908, + "lr": 0.000994825174825175, + "step": 538, + "tokens_trained": 0.26443108 + }, + { + "epoch": 0.15318062548755407, + "grad_norm": 148.66285705566406, + "loss": 6.0831, + "lr": 0.0009945454545454546, + "step": 540, + "tokens_trained": 0.265414496 + }, + { + "epoch": 0.15374796113750797, + "grad_norm": 165.044189453125, + "loss": 5.5594, + "lr": 0.0009942657342657344, + "step": 542, + "tokens_trained": 0.266394128 + }, + { + "epoch": 0.1543152967874619, + "grad_norm": 124.5405502319336, + "loss": 5.2442, + "lr": 0.000993986013986014, + "step": 544, + "tokens_trained": 0.267378768 + }, + { + "epoch": 0.1548826324374158, + "grad_norm": 68.66510772705078, + "loss": 5.1173, + "lr": 0.0009937062937062937, + "step": 546, + "tokens_trained": 0.268360184 + }, + { + "epoch": 0.1554499680873697, + "grad_norm": 57.052860260009766, + "loss": 5.2348, + "lr": 0.0009934265734265735, + "step": 548, + "tokens_trained": 0.269345672 + }, + { + "epoch": 0.1560173037373236, + "grad_norm": 184.9175567626953, + "loss": 6.7748, + "lr": 0.0009931468531468532, + "step": 550, + "tokens_trained": 0.2703288 + }, + { + "epoch": 0.15658463938727749, + "grad_norm": 72.9861831665039, + "loss": 5.7387, + "lr": 0.000992867132867133, + "step": 552, + "tokens_trained": 0.271309176 + }, + { + "epoch": 0.1571519750372314, + "grad_norm": 135.864501953125, + "loss": 6.3035, + "lr": 0.0009925874125874127, + "step": 554, + "tokens_trained": 0.27229644 + }, + { + "epoch": 0.1577193106871853, + "grad_norm": 130.579833984375, + "loss": 5.4434, + "lr": 0.0009923076923076923, + "step": 556, + "tokens_trained": 0.273277904 + }, + { + "epoch": 0.1582866463371392, + "grad_norm": 206.77345275878906, + "loss": 5.8649, + "lr": 0.000992027972027972, + "step": 558, + "tokens_trained": 0.274261712 + }, + { + "epoch": 0.1588539819870931, + "grad_norm": 144.0505828857422, + "loss": 5.3459, + "lr": 0.0009917482517482518, + "step": 560, + "tokens_trained": 0.2752468 + }, + { + "epoch": 0.159421317637047, + "grad_norm": 87.56634521484375, + "loss": 5.6321, + "lr": 0.0009914685314685314, + "step": 562, + "tokens_trained": 0.276232384 + }, + { + "epoch": 0.15998865328700093, + "grad_norm": 275.2727355957031, + "loss": 6.7515, + "lr": 0.0009911888111888113, + "step": 564, + "tokens_trained": 0.277211608 + }, + { + "epoch": 0.16055598893695483, + "grad_norm": 97.00019836425781, + "loss": 5.4374, + "lr": 0.000990909090909091, + "step": 566, + "tokens_trained": 0.278196336 + }, + { + "epoch": 0.16112332458690873, + "grad_norm": 102.91439056396484, + "loss": 5.729, + "lr": 0.0009906293706293705, + "step": 568, + "tokens_trained": 0.279175672 + }, + { + "epoch": 0.16169066023686263, + "grad_norm": 151.12432861328125, + "loss": 5.4189, + "lr": 0.0009903496503496504, + "step": 570, + "tokens_trained": 0.280161088 + }, + { + "epoch": 0.16225799588681653, + "grad_norm": 86.6823959350586, + "loss": 5.1704, + "lr": 0.00099006993006993, + "step": 572, + "tokens_trained": 0.28114256 + }, + { + "epoch": 0.16282533153677045, + "grad_norm": 90.7052230834961, + "loss": 5.3673, + "lr": 0.0009897902097902099, + "step": 574, + "tokens_trained": 0.282128904 + }, + { + "epoch": 0.16339266718672435, + "grad_norm": 146.92874145507812, + "loss": 5.5971, + "lr": 0.0009895104895104895, + "step": 576, + "tokens_trained": 0.28311528 + }, + { + "epoch": 0.16396000283667825, + "grad_norm": 189.76296997070312, + "loss": 5.3109, + "lr": 0.0009892307692307694, + "step": 578, + "tokens_trained": 0.284098528 + }, + { + "epoch": 0.16452733848663215, + "grad_norm": 174.48092651367188, + "loss": 5.68, + "lr": 0.000988951048951049, + "step": 580, + "tokens_trained": 0.285081064 + }, + { + "epoch": 0.16509467413658604, + "grad_norm": 154.10816955566406, + "loss": 5.3307, + "lr": 0.0009886713286713286, + "step": 582, + "tokens_trained": 0.286067952 + }, + { + "epoch": 0.16566200978653997, + "grad_norm": 64.28263092041016, + "loss": 5.1676, + "lr": 0.0009883916083916085, + "step": 584, + "tokens_trained": 0.287051384 + }, + { + "epoch": 0.16622934543649387, + "grad_norm": 103.81795501708984, + "loss": 5.3436, + "lr": 0.0009881118881118881, + "step": 586, + "tokens_trained": 0.28803284 + }, + { + "epoch": 0.16679668108644777, + "grad_norm": 144.0076904296875, + "loss": 5.3033, + "lr": 0.000987832167832168, + "step": 588, + "tokens_trained": 0.289014824 + }, + { + "epoch": 0.16736401673640167, + "grad_norm": 88.31237030029297, + "loss": 5.0609, + "lr": 0.0009875524475524476, + "step": 590, + "tokens_trained": 0.289999864 + }, + { + "epoch": 0.16793135238635556, + "grad_norm": 68.4583740234375, + "loss": 5.0702, + "lr": 0.0009872727272727273, + "step": 592, + "tokens_trained": 0.290983888 + }, + { + "epoch": 0.1684986880363095, + "grad_norm": 135.28665161132812, + "loss": 5.3962, + "lr": 0.000986993006993007, + "step": 594, + "tokens_trained": 0.291965752 + }, + { + "epoch": 0.1690660236862634, + "grad_norm": 80.0412368774414, + "loss": 5.0246, + "lr": 0.0009867132867132867, + "step": 596, + "tokens_trained": 0.292946952 + }, + { + "epoch": 0.1696333593362173, + "grad_norm": 43.29194641113281, + "loss": 5.0051, + "lr": 0.0009864335664335664, + "step": 598, + "tokens_trained": 0.293928976 + }, + { + "epoch": 0.17020069498617119, + "grad_norm": 220.88687133789062, + "loss": 6.0798, + "lr": 0.0009861538461538462, + "step": 600, + "tokens_trained": 0.294912408 + }, + { + "epoch": 0.17076803063612508, + "grad_norm": 102.58654022216797, + "loss": 5.1271, + "lr": 0.0009858741258741259, + "step": 602, + "tokens_trained": 0.29589416 + }, + { + "epoch": 0.171335366286079, + "grad_norm": 119.0067138671875, + "loss": 5.7402, + "lr": 0.0009855944055944055, + "step": 604, + "tokens_trained": 0.296878584 + }, + { + "epoch": 0.1719027019360329, + "grad_norm": 138.8656005859375, + "loss": 5.1951, + "lr": 0.0009853146853146854, + "step": 606, + "tokens_trained": 0.297864552 + }, + { + "epoch": 0.1724700375859868, + "grad_norm": 73.5890884399414, + "loss": 5.2522, + "lr": 0.000985034965034965, + "step": 608, + "tokens_trained": 0.298854088 + }, + { + "epoch": 0.1730373732359407, + "grad_norm": 113.78330993652344, + "loss": 5.6683, + "lr": 0.0009847552447552449, + "step": 610, + "tokens_trained": 0.299835024 + }, + { + "epoch": 0.1736047088858946, + "grad_norm": 125.20297241210938, + "loss": 5.1812, + "lr": 0.0009844755244755245, + "step": 612, + "tokens_trained": 0.30082032 + }, + { + "epoch": 0.17417204453584853, + "grad_norm": 67.46041870117188, + "loss": 5.0417, + "lr": 0.0009841958041958043, + "step": 614, + "tokens_trained": 0.301808456 + }, + { + "epoch": 0.17473938018580243, + "grad_norm": 117.30754852294922, + "loss": 5.3064, + "lr": 0.000983916083916084, + "step": 616, + "tokens_trained": 0.302794456 + }, + { + "epoch": 0.17530671583575633, + "grad_norm": 124.30754089355469, + "loss": 5.1614, + "lr": 0.0009836363636363636, + "step": 618, + "tokens_trained": 0.303777376 + }, + { + "epoch": 0.17587405148571023, + "grad_norm": 102.72042083740234, + "loss": 5.1265, + "lr": 0.0009833566433566435, + "step": 620, + "tokens_trained": 0.304758864 + }, + { + "epoch": 0.17644138713566412, + "grad_norm": 39.332252502441406, + "loss": 5.1078, + "lr": 0.000983076923076923, + "step": 622, + "tokens_trained": 0.30574392 + }, + { + "epoch": 0.17700872278561805, + "grad_norm": 153.84811401367188, + "loss": 5.7696, + "lr": 0.000982797202797203, + "step": 624, + "tokens_trained": 0.306727584 + }, + { + "epoch": 0.17729239061059499, + "eval_loss": 1.3463915586471558, + "eval_runtime": 20.8357, + "step": 625, + "tokens_trained": 0.307220496 + }, + { + "epoch": 0.17757605843557195, + "grad_norm": 160.2552490234375, + "loss": 5.2283, + "lr": 0.0009825174825174826, + "step": 626, + "tokens_trained": 0.307713024 + }, + { + "epoch": 0.17814339408552585, + "grad_norm": 186.77407836914062, + "loss": 5.2866, + "lr": 0.0009822377622377622, + "step": 628, + "tokens_trained": 0.308700128 + }, + { + "epoch": 0.17871072973547975, + "grad_norm": 84.55519104003906, + "loss": 5.1106, + "lr": 0.0009819580419580419, + "step": 630, + "tokens_trained": 0.309681208 + }, + { + "epoch": 0.17927806538543364, + "grad_norm": 20.617040634155273, + "loss": 4.8327, + "lr": 0.0009816783216783217, + "step": 632, + "tokens_trained": 0.310662224 + }, + { + "epoch": 0.17984540103538757, + "grad_norm": 168.06039428710938, + "loss": 6.0704, + "lr": 0.0009813986013986014, + "step": 634, + "tokens_trained": 0.31164064 + }, + { + "epoch": 0.18041273668534147, + "grad_norm": 238.23736572265625, + "loss": 5.6188, + "lr": 0.0009811188811188812, + "step": 636, + "tokens_trained": 0.312622568 + }, + { + "epoch": 0.18098007233529537, + "grad_norm": 140.0707550048828, + "loss": 6.4034, + "lr": 0.0009808391608391608, + "step": 638, + "tokens_trained": 0.313604944 + }, + { + "epoch": 0.18154740798524927, + "grad_norm": 161.19302368164062, + "loss": 5.4906, + "lr": 0.0009805594405594405, + "step": 640, + "tokens_trained": 0.314592072 + }, + { + "epoch": 0.18211474363520316, + "grad_norm": 121.9577407836914, + "loss": 5.2097, + "lr": 0.0009802797202797203, + "step": 642, + "tokens_trained": 0.315574392 + }, + { + "epoch": 0.1826820792851571, + "grad_norm": 121.25574493408203, + "loss": 5.0317, + "lr": 0.00098, + "step": 644, + "tokens_trained": 0.316559008 + }, + { + "epoch": 0.183249414935111, + "grad_norm": 28.328269958496094, + "loss": 4.932, + "lr": 0.0009797202797202798, + "step": 646, + "tokens_trained": 0.317538776 + }, + { + "epoch": 0.1838167505850649, + "grad_norm": 127.77408599853516, + "loss": 5.8335, + "lr": 0.0009794405594405595, + "step": 648, + "tokens_trained": 0.31851792 + }, + { + "epoch": 0.18438408623501878, + "grad_norm": 94.9522933959961, + "loss": 5.1948, + "lr": 0.000979160839160839, + "step": 650, + "tokens_trained": 0.319501576 + }, + { + "epoch": 0.18495142188497268, + "grad_norm": 110.33658599853516, + "loss": 5.098, + "lr": 0.000978881118881119, + "step": 652, + "tokens_trained": 0.320482392 + }, + { + "epoch": 0.1855187575349266, + "grad_norm": 67.23124694824219, + "loss": 4.7723, + "lr": 0.0009786013986013986, + "step": 654, + "tokens_trained": 0.32146712 + }, + { + "epoch": 0.1860860931848805, + "grad_norm": 61.519866943359375, + "loss": 4.7245, + "lr": 0.0009783216783216782, + "step": 656, + "tokens_trained": 0.322449576 + }, + { + "epoch": 0.1866534288348344, + "grad_norm": 99.51078033447266, + "loss": 4.783, + "lr": 0.000978041958041958, + "step": 658, + "tokens_trained": 0.323432688 + }, + { + "epoch": 0.1872207644847883, + "grad_norm": 44.619197845458984, + "loss": 4.7495, + "lr": 0.000977762237762238, + "step": 660, + "tokens_trained": 0.324413952 + }, + { + "epoch": 0.18778810013474223, + "grad_norm": 114.5891342163086, + "loss": 5.1261, + "lr": 0.0009774825174825176, + "step": 662, + "tokens_trained": 0.325394536 + }, + { + "epoch": 0.18835543578469613, + "grad_norm": 100.3728256225586, + "loss": 4.7883, + "lr": 0.0009772027972027972, + "step": 664, + "tokens_trained": 0.326374672 + }, + { + "epoch": 0.18892277143465003, + "grad_norm": 51.883033752441406, + "loss": 4.7249, + "lr": 0.0009769230769230768, + "step": 666, + "tokens_trained": 0.327357152 + }, + { + "epoch": 0.18949010708460393, + "grad_norm": 82.27507019042969, + "loss": 4.8277, + "lr": 0.0009766433566433567, + "step": 668, + "tokens_trained": 0.328342088 + }, + { + "epoch": 0.19005744273455782, + "grad_norm": 83.53064727783203, + "loss": 4.8338, + "lr": 0.0009763636363636363, + "step": 670, + "tokens_trained": 0.329319248 + }, + { + "epoch": 0.19062477838451175, + "grad_norm": 76.18387603759766, + "loss": 4.6958, + "lr": 0.0009760839160839161, + "step": 672, + "tokens_trained": 0.330305968 + }, + { + "epoch": 0.19119211403446565, + "grad_norm": 27.401426315307617, + "loss": 4.6929, + "lr": 0.0009758041958041958, + "step": 674, + "tokens_trained": 0.3312912 + }, + { + "epoch": 0.19175944968441955, + "grad_norm": 186.770263671875, + "loss": 5.5089, + "lr": 0.0009755244755244756, + "step": 676, + "tokens_trained": 0.332275224 + }, + { + "epoch": 0.19232678533437345, + "grad_norm": 105.02385711669922, + "loss": 4.8876, + "lr": 0.0009752447552447553, + "step": 678, + "tokens_trained": 0.33325588 + }, + { + "epoch": 0.19289412098432734, + "grad_norm": 94.96269989013672, + "loss": 5.1235, + "lr": 0.0009749650349650349, + "step": 680, + "tokens_trained": 0.334238408 + }, + { + "epoch": 0.19346145663428127, + "grad_norm": 92.29356384277344, + "loss": 4.8194, + "lr": 0.0009746853146853148, + "step": 682, + "tokens_trained": 0.335219368 + }, + { + "epoch": 0.19402879228423517, + "grad_norm": 59.1584358215332, + "loss": 4.7511, + "lr": 0.0009744055944055944, + "step": 684, + "tokens_trained": 0.336207136 + }, + { + "epoch": 0.19459612793418907, + "grad_norm": 54.759002685546875, + "loss": 4.777, + "lr": 0.0009741258741258742, + "step": 686, + "tokens_trained": 0.337193536 + }, + { + "epoch": 0.19516346358414297, + "grad_norm": 92.20452880859375, + "loss": 4.8225, + "lr": 0.0009738461538461538, + "step": 688, + "tokens_trained": 0.338179224 + }, + { + "epoch": 0.19573079923409686, + "grad_norm": 75.97005462646484, + "loss": 4.655, + "lr": 0.0009735664335664336, + "step": 690, + "tokens_trained": 0.339162168 + }, + { + "epoch": 0.1962981348840508, + "grad_norm": 58.19076919555664, + "loss": 4.6446, + "lr": 0.0009732867132867133, + "step": 692, + "tokens_trained": 0.340138904 + }, + { + "epoch": 0.1968654705340047, + "grad_norm": 50.81512451171875, + "loss": 4.5866, + "lr": 0.000973006993006993, + "step": 694, + "tokens_trained": 0.34112288 + }, + { + "epoch": 0.1974328061839586, + "grad_norm": 61.683372497558594, + "loss": 4.6018, + "lr": 0.0009727272727272728, + "step": 696, + "tokens_trained": 0.342111992 + }, + { + "epoch": 0.19800014183391249, + "grad_norm": 61.01798629760742, + "loss": 4.6007, + "lr": 0.0009724475524475524, + "step": 698, + "tokens_trained": 0.343095912 + }, + { + "epoch": 0.19856747748386638, + "grad_norm": 96.49671936035156, + "loss": 4.7035, + "lr": 0.0009721678321678323, + "step": 700, + "tokens_trained": 0.344078632 + }, + { + "epoch": 0.1991348131338203, + "grad_norm": 64.7771224975586, + "loss": 4.8341, + "lr": 0.0009718881118881119, + "step": 702, + "tokens_trained": 0.345060576 + }, + { + "epoch": 0.1997021487837742, + "grad_norm": 90.1478042602539, + "loss": 4.7739, + "lr": 0.0009716083916083917, + "step": 704, + "tokens_trained": 0.34604112 + }, + { + "epoch": 0.2002694844337281, + "grad_norm": 67.6308822631836, + "loss": 4.6218, + "lr": 0.0009713286713286713, + "step": 706, + "tokens_trained": 0.347023496 + }, + { + "epoch": 0.200836820083682, + "grad_norm": 40.50175094604492, + "loss": 4.6008, + "lr": 0.000971048951048951, + "step": 708, + "tokens_trained": 0.348005416 + }, + { + "epoch": 0.2014041557336359, + "grad_norm": 33.6448860168457, + "loss": 4.5307, + "lr": 0.0009707692307692308, + "step": 710, + "tokens_trained": 0.3489886 + }, + { + "epoch": 0.20197149138358983, + "grad_norm": 15.484851837158203, + "loss": 4.5065, + "lr": 0.0009704895104895105, + "step": 712, + "tokens_trained": 0.34997024 + }, + { + "epoch": 0.20253882703354373, + "grad_norm": 109.26301574707031, + "loss": 4.9613, + "lr": 0.0009702097902097903, + "step": 714, + "tokens_trained": 0.350958496 + }, + { + "epoch": 0.20310616268349763, + "grad_norm": 150.07492065429688, + "loss": 4.8507, + "lr": 0.0009699300699300699, + "step": 716, + "tokens_trained": 0.35193892 + }, + { + "epoch": 0.20367349833345152, + "grad_norm": 113.43978881835938, + "loss": 5.4494, + "lr": 0.0009696503496503498, + "step": 718, + "tokens_trained": 0.35291908 + }, + { + "epoch": 0.20424083398340542, + "grad_norm": 123.0071792602539, + "loss": 4.9475, + "lr": 0.0009693706293706294, + "step": 720, + "tokens_trained": 0.353896072 + }, + { + "epoch": 0.20480816963335935, + "grad_norm": 65.55500793457031, + "loss": 4.7585, + "lr": 0.0009690909090909091, + "step": 722, + "tokens_trained": 0.354878992 + }, + { + "epoch": 0.20537550528331325, + "grad_norm": 36.11159896850586, + "loss": 4.6323, + "lr": 0.0009688111888111888, + "step": 724, + "tokens_trained": 0.355863728 + }, + { + "epoch": 0.20594284093326715, + "grad_norm": 30.566436767578125, + "loss": 4.53, + "lr": 0.0009685314685314685, + "step": 726, + "tokens_trained": 0.356845272 + }, + { + "epoch": 0.20651017658322104, + "grad_norm": 59.01853561401367, + "loss": 4.5283, + "lr": 0.0009682517482517483, + "step": 728, + "tokens_trained": 0.357826656 + }, + { + "epoch": 0.20707751223317494, + "grad_norm": 91.78115844726562, + "loss": 4.6149, + "lr": 0.000967972027972028, + "step": 730, + "tokens_trained": 0.358809896 + }, + { + "epoch": 0.20764484788312887, + "grad_norm": 67.97398376464844, + "loss": 4.617, + "lr": 0.0009676923076923078, + "step": 732, + "tokens_trained": 0.359788736 + }, + { + "epoch": 0.20821218353308277, + "grad_norm": 42.82001876831055, + "loss": 4.6134, + "lr": 0.0009674125874125874, + "step": 734, + "tokens_trained": 0.360771744 + }, + { + "epoch": 0.20877951918303667, + "grad_norm": 63.52122116088867, + "loss": 4.6995, + "lr": 0.0009671328671328672, + "step": 736, + "tokens_trained": 0.361757656 + }, + { + "epoch": 0.20934685483299056, + "grad_norm": 116.39544677734375, + "loss": 4.7153, + "lr": 0.0009668531468531469, + "step": 738, + "tokens_trained": 0.362744008 + }, + { + "epoch": 0.20991419048294446, + "grad_norm": 40.74269485473633, + "loss": 4.7978, + "lr": 0.0009665734265734266, + "step": 740, + "tokens_trained": 0.36372872 + }, + { + "epoch": 0.2104815261328984, + "grad_norm": 114.29917907714844, + "loss": 5.1683, + "lr": 0.0009662937062937063, + "step": 742, + "tokens_trained": 0.364710536 + }, + { + "epoch": 0.2110488617828523, + "grad_norm": 115.83326721191406, + "loss": 4.7642, + "lr": 0.000966013986013986, + "step": 744, + "tokens_trained": 0.3656912 + }, + { + "epoch": 0.21161619743280619, + "grad_norm": 21.708093643188477, + "loss": 4.8244, + "lr": 0.0009657342657342657, + "step": 746, + "tokens_trained": 0.36667388 + }, + { + "epoch": 0.21218353308276008, + "grad_norm": 182.01918029785156, + "loss": 5.6045, + "lr": 0.0009654545454545455, + "step": 748, + "tokens_trained": 0.3676634 + }, + { + "epoch": 0.21275086873271398, + "grad_norm": 47.119319915771484, + "loss": 4.7929, + "lr": 0.0009651748251748252, + "step": 750, + "tokens_trained": 0.368647288 + }, + { + "epoch": 0.21275086873271398, + "eval_loss": 1.2186306715011597, + "eval_runtime": 20.9362, + "step": 750, + "tokens_trained": 0.368647288 + }, + { + "epoch": 0.2133182043826679, + "grad_norm": 51.43566131591797, + "loss": 4.7298, + "lr": 0.0009648951048951049, + "step": 752, + "tokens_trained": 0.36962992 + }, + { + "epoch": 0.2138855400326218, + "grad_norm": 79.49323272705078, + "loss": 5.0749, + "lr": 0.0009646153846153846, + "step": 754, + "tokens_trained": 0.370616064 + }, + { + "epoch": 0.2144528756825757, + "grad_norm": 119.80200958251953, + "loss": 4.8198, + "lr": 0.0009643356643356644, + "step": 756, + "tokens_trained": 0.371596208 + }, + { + "epoch": 0.2150202113325296, + "grad_norm": 95.88092041015625, + "loss": 4.7437, + "lr": 0.0009640559440559441, + "step": 758, + "tokens_trained": 0.372579584 + }, + { + "epoch": 0.2155875469824835, + "grad_norm": 79.64202117919922, + "loss": 4.9181, + "lr": 0.0009637762237762237, + "step": 760, + "tokens_trained": 0.373563056 + }, + { + "epoch": 0.21615488263243743, + "grad_norm": 79.93920135498047, + "loss": 4.6393, + "lr": 0.0009634965034965035, + "step": 762, + "tokens_trained": 0.374547648 + }, + { + "epoch": 0.21672221828239133, + "grad_norm": 78.67620849609375, + "loss": 4.6178, + "lr": 0.0009632167832167832, + "step": 764, + "tokens_trained": 0.375531456 + }, + { + "epoch": 0.21728955393234523, + "grad_norm": 56.32818603515625, + "loss": 4.6498, + "lr": 0.000962937062937063, + "step": 766, + "tokens_trained": 0.376516896 + }, + { + "epoch": 0.21785688958229912, + "grad_norm": 45.35737228393555, + "loss": 4.5812, + "lr": 0.0009626573426573427, + "step": 768, + "tokens_trained": 0.377499752 + }, + { + "epoch": 0.21842422523225302, + "grad_norm": 58.13076400756836, + "loss": 4.5793, + "lr": 0.0009623776223776224, + "step": 770, + "tokens_trained": 0.37848276 + }, + { + "epoch": 0.21899156088220695, + "grad_norm": 55.620628356933594, + "loss": 4.4865, + "lr": 0.0009620979020979021, + "step": 772, + "tokens_trained": 0.379466296 + }, + { + "epoch": 0.21955889653216085, + "grad_norm": 77.26813507080078, + "loss": 4.5671, + "lr": 0.0009618181818181818, + "step": 774, + "tokens_trained": 0.380449888 + }, + { + "epoch": 0.22012623218211474, + "grad_norm": 45.00653839111328, + "loss": 4.5923, + "lr": 0.0009615384615384616, + "step": 776, + "tokens_trained": 0.381430352 + }, + { + "epoch": 0.22069356783206864, + "grad_norm": 52.77407455444336, + "loss": 4.5094, + "lr": 0.0009612587412587412, + "step": 778, + "tokens_trained": 0.382416152 + }, + { + "epoch": 0.22126090348202254, + "grad_norm": 36.721073150634766, + "loss": 4.4536, + "lr": 0.000960979020979021, + "step": 780, + "tokens_trained": 0.383396672 + }, + { + "epoch": 0.22182823913197647, + "grad_norm": 51.21247100830078, + "loss": 4.4599, + "lr": 0.0009606993006993007, + "step": 782, + "tokens_trained": 0.384380584 + }, + { + "epoch": 0.22239557478193037, + "grad_norm": 65.23794555664062, + "loss": 4.5397, + "lr": 0.0009604195804195805, + "step": 784, + "tokens_trained": 0.385361368 + }, + { + "epoch": 0.22296291043188426, + "grad_norm": 23.255144119262695, + "loss": 4.5007, + "lr": 0.0009601398601398602, + "step": 786, + "tokens_trained": 0.386341416 + }, + { + "epoch": 0.22353024608183816, + "grad_norm": 30.812740325927734, + "loss": 4.5239, + "lr": 0.0009598601398601398, + "step": 788, + "tokens_trained": 0.387324624 + }, + { + "epoch": 0.22409758173179206, + "grad_norm": 50.781219482421875, + "loss": 4.5131, + "lr": 0.0009595804195804196, + "step": 790, + "tokens_trained": 0.388312744 + }, + { + "epoch": 0.224664917381746, + "grad_norm": 47.88816452026367, + "loss": 4.4622, + "lr": 0.0009593006993006993, + "step": 792, + "tokens_trained": 0.38929852 + }, + { + "epoch": 0.22523225303169989, + "grad_norm": 49.32049560546875, + "loss": 4.5053, + "lr": 0.0009590209790209791, + "step": 794, + "tokens_trained": 0.390279792 + }, + { + "epoch": 0.22579958868165378, + "grad_norm": 36.98805618286133, + "loss": 4.5144, + "lr": 0.0009587412587412587, + "step": 796, + "tokens_trained": 0.391258904 + }, + { + "epoch": 0.22636692433160768, + "grad_norm": 24.88475799560547, + "loss": 4.4992, + "lr": 0.0009584615384615385, + "step": 798, + "tokens_trained": 0.392238976 + }, + { + "epoch": 0.22693425998156158, + "grad_norm": 38.89309310913086, + "loss": 4.4853, + "lr": 0.0009581818181818182, + "step": 800, + "tokens_trained": 0.393226312 + }, + { + "epoch": 0.2275015956315155, + "grad_norm": 34.86774444580078, + "loss": 4.4519, + "lr": 0.000957902097902098, + "step": 802, + "tokens_trained": 0.394206688 + }, + { + "epoch": 0.2280689312814694, + "grad_norm": 24.966291427612305, + "loss": 4.456, + "lr": 0.0009576223776223777, + "step": 804, + "tokens_trained": 0.395191608 + }, + { + "epoch": 0.2286362669314233, + "grad_norm": 12.218213081359863, + "loss": 4.4266, + "lr": 0.0009573426573426573, + "step": 806, + "tokens_trained": 0.396174512 + }, + { + "epoch": 0.2292036025813772, + "grad_norm": 50.817054748535156, + "loss": 4.586, + "lr": 0.0009570629370629371, + "step": 808, + "tokens_trained": 0.397156912 + }, + { + "epoch": 0.2297709382313311, + "grad_norm": 37.60087203979492, + "loss": 4.4616, + "lr": 0.0009567832167832168, + "step": 810, + "tokens_trained": 0.398140016 + }, + { + "epoch": 0.23033827388128503, + "grad_norm": 37.55678176879883, + "loss": 4.4755, + "lr": 0.0009565034965034966, + "step": 812, + "tokens_trained": 0.39912384 + }, + { + "epoch": 0.23090560953123893, + "grad_norm": 56.427215576171875, + "loss": 4.5078, + "lr": 0.0009562237762237762, + "step": 814, + "tokens_trained": 0.400111224 + }, + { + "epoch": 0.23147294518119282, + "grad_norm": 31.869827270507812, + "loss": 4.5013, + "lr": 0.0009559440559440559, + "step": 816, + "tokens_trained": 0.401094936 + }, + { + "epoch": 0.23204028083114672, + "grad_norm": 77.57958984375, + "loss": 4.6977, + "lr": 0.0009556643356643357, + "step": 818, + "tokens_trained": 0.402078888 + }, + { + "epoch": 0.23260761648110062, + "grad_norm": 52.50204849243164, + "loss": 4.5142, + "lr": 0.0009553846153846154, + "step": 820, + "tokens_trained": 0.403059904 + }, + { + "epoch": 0.23317495213105455, + "grad_norm": 32.34305191040039, + "loss": 4.4828, + "lr": 0.0009551048951048952, + "step": 822, + "tokens_trained": 0.404049848 + }, + { + "epoch": 0.23374228778100845, + "grad_norm": 52.08961486816406, + "loss": 4.4869, + "lr": 0.0009548251748251748, + "step": 824, + "tokens_trained": 0.405033872 + }, + { + "epoch": 0.23430962343096234, + "grad_norm": 44.32194900512695, + "loss": 4.4802, + "lr": 0.0009545454545454546, + "step": 826, + "tokens_trained": 0.406017872 + }, + { + "epoch": 0.23487695908091624, + "grad_norm": 30.941524505615234, + "loss": 4.4323, + "lr": 0.0009542657342657343, + "step": 828, + "tokens_trained": 0.40700704 + }, + { + "epoch": 0.23544429473087014, + "grad_norm": 20.52709197998047, + "loss": 4.4919, + "lr": 0.000953986013986014, + "step": 830, + "tokens_trained": 0.407991512 + }, + { + "epoch": 0.23601163038082407, + "grad_norm": 86.80307006835938, + "loss": 4.8228, + "lr": 0.0009537062937062937, + "step": 832, + "tokens_trained": 0.408979272 + }, + { + "epoch": 0.23657896603077797, + "grad_norm": 73.71435546875, + "loss": 4.5954, + "lr": 0.0009534265734265734, + "step": 834, + "tokens_trained": 0.409962984 + }, + { + "epoch": 0.23714630168073186, + "grad_norm": 66.3813247680664, + "loss": 4.5969, + "lr": 0.0009531468531468532, + "step": 836, + "tokens_trained": 0.410945248 + }, + { + "epoch": 0.23771363733068576, + "grad_norm": 86.94453430175781, + "loss": 4.5894, + "lr": 0.0009528671328671329, + "step": 838, + "tokens_trained": 0.411930872 + }, + { + "epoch": 0.23828097298063966, + "grad_norm": 61.28915786743164, + "loss": 4.5613, + "lr": 0.0009525874125874127, + "step": 840, + "tokens_trained": 0.412912608 + }, + { + "epoch": 0.2388483086305936, + "grad_norm": 65.02153778076172, + "loss": 4.5398, + "lr": 0.0009523076923076923, + "step": 842, + "tokens_trained": 0.413897488 + }, + { + "epoch": 0.23941564428054748, + "grad_norm": 54.01200485229492, + "loss": 4.4922, + "lr": 0.000952027972027972, + "step": 844, + "tokens_trained": 0.414872888 + }, + { + "epoch": 0.23998297993050138, + "grad_norm": 66.7095718383789, + "loss": 4.5317, + "lr": 0.0009517482517482518, + "step": 846, + "tokens_trained": 0.415856296 + }, + { + "epoch": 0.24055031558045528, + "grad_norm": 64.23979949951172, + "loss": 4.4686, + "lr": 0.0009514685314685315, + "step": 848, + "tokens_trained": 0.416843344 + }, + { + "epoch": 0.24111765123040918, + "grad_norm": 51.012840270996094, + "loss": 4.4544, + "lr": 0.0009511888111888112, + "step": 850, + "tokens_trained": 0.41782032 + }, + { + "epoch": 0.2416849868803631, + "grad_norm": 40.83076095581055, + "loss": 4.4665, + "lr": 0.0009509090909090909, + "step": 852, + "tokens_trained": 0.418805672 + }, + { + "epoch": 0.242252322530317, + "grad_norm": 48.31489944458008, + "loss": 4.4748, + "lr": 0.0009506293706293707, + "step": 854, + "tokens_trained": 0.419786344 + }, + { + "epoch": 0.2428196581802709, + "grad_norm": 50.08705520629883, + "loss": 4.4973, + "lr": 0.0009503496503496504, + "step": 856, + "tokens_trained": 0.420768872 + }, + { + "epoch": 0.2433869938302248, + "grad_norm": 26.840139389038086, + "loss": 4.461, + "lr": 0.0009500699300699301, + "step": 858, + "tokens_trained": 0.421750296 + }, + { + "epoch": 0.2439543294801787, + "grad_norm": 24.721454620361328, + "loss": 4.4246, + "lr": 0.0009497902097902098, + "step": 860, + "tokens_trained": 0.422730976 + }, + { + "epoch": 0.24452166513013263, + "grad_norm": 63.147926330566406, + "loss": 4.623, + "lr": 0.0009495104895104895, + "step": 862, + "tokens_trained": 0.423715768 + }, + { + "epoch": 0.24508900078008652, + "grad_norm": 50.99778747558594, + "loss": 4.4663, + "lr": 0.0009492307692307693, + "step": 864, + "tokens_trained": 0.424697072 + }, + { + "epoch": 0.24565633643004042, + "grad_norm": 38.0300407409668, + "loss": 4.4649, + "lr": 0.000948951048951049, + "step": 866, + "tokens_trained": 0.425681392 + }, + { + "epoch": 0.24622367207999432, + "grad_norm": 19.017776489257812, + "loss": 4.4296, + "lr": 0.0009486713286713286, + "step": 868, + "tokens_trained": 0.426665088 + }, + { + "epoch": 0.24679100772994822, + "grad_norm": 24.02813148498535, + "loss": 4.4958, + "lr": 0.0009483916083916084, + "step": 870, + "tokens_trained": 0.427646016 + }, + { + "epoch": 0.24735834337990215, + "grad_norm": 59.40018081665039, + "loss": 4.5919, + "lr": 0.0009481118881118881, + "step": 872, + "tokens_trained": 0.428628048 + }, + { + "epoch": 0.24792567902985604, + "grad_norm": 61.13710403442383, + "loss": 4.4642, + "lr": 0.0009478321678321679, + "step": 874, + "tokens_trained": 0.4296112 + }, + { + "epoch": 0.24820934685483298, + "eval_loss": 1.1135390996932983, + "eval_runtime": 20.4738, + "step": 875, + "tokens_trained": 0.430109024 + }, + { + "epoch": 0.24849301467980994, + "grad_norm": 47.920021057128906, + "loss": 4.4832, + "lr": 0.0009475524475524476, + "step": 876, + "tokens_trained": 0.430599208 + }, + { + "epoch": 0.24906035032976384, + "grad_norm": 25.661701202392578, + "loss": 4.4176, + "lr": 0.0009472727272727273, + "step": 878, + "tokens_trained": 0.43158356 + }, + { + "epoch": 0.24962768597971774, + "grad_norm": 32.86565399169922, + "loss": 4.405, + "lr": 0.000946993006993007, + "step": 880, + "tokens_trained": 0.432570584 + }, + { + "epoch": 0.25019502162967167, + "grad_norm": 23.443584442138672, + "loss": 4.4218, + "lr": 0.0009467132867132868, + "step": 882, + "tokens_trained": 0.433557672 + }, + { + "epoch": 0.25076235727962554, + "grad_norm": 28.315975189208984, + "loss": 4.4019, + "lr": 0.0009464335664335665, + "step": 884, + "tokens_trained": 0.434542736 + }, + { + "epoch": 0.25132969292957946, + "grad_norm": 31.056642532348633, + "loss": 4.4027, + "lr": 0.0009461538461538461, + "step": 886, + "tokens_trained": 0.43553112 + }, + { + "epoch": 0.2518970285795334, + "grad_norm": 13.661805152893066, + "loss": 4.3745, + "lr": 0.0009458741258741259, + "step": 888, + "tokens_trained": 0.436511584 + }, + { + "epoch": 0.25246436422948726, + "grad_norm": 47.04901885986328, + "loss": 4.4875, + "lr": 0.0009455944055944056, + "step": 890, + "tokens_trained": 0.43749464 + }, + { + "epoch": 0.2530316998794412, + "grad_norm": 84.91446685791016, + "loss": 4.5185, + "lr": 0.0009453146853146854, + "step": 892, + "tokens_trained": 0.43847764 + }, + { + "epoch": 0.25359903552939506, + "grad_norm": 40.9110107421875, + "loss": 4.5735, + "lr": 0.000945034965034965, + "step": 894, + "tokens_trained": 0.439461496 + }, + { + "epoch": 0.254166371179349, + "grad_norm": 58.98877716064453, + "loss": 4.5146, + "lr": 0.0009447552447552447, + "step": 896, + "tokens_trained": 0.440443656 + }, + { + "epoch": 0.2547337068293029, + "grad_norm": 34.037315368652344, + "loss": 4.4714, + "lr": 0.0009444755244755245, + "step": 898, + "tokens_trained": 0.441423496 + }, + { + "epoch": 0.2553010424792568, + "grad_norm": 24.91920280456543, + "loss": 4.4334, + "lr": 0.0009441958041958042, + "step": 900, + "tokens_trained": 0.442407408 + }, + { + "epoch": 0.2558683781292107, + "grad_norm": 30.612323760986328, + "loss": 4.4459, + "lr": 0.000943916083916084, + "step": 902, + "tokens_trained": 0.443383464 + }, + { + "epoch": 0.2564357137791646, + "grad_norm": 50.595577239990234, + "loss": 4.4848, + "lr": 0.0009436363636363636, + "step": 904, + "tokens_trained": 0.4443674 + }, + { + "epoch": 0.2570030494291185, + "grad_norm": 41.3300895690918, + "loss": 4.4445, + "lr": 0.0009433566433566434, + "step": 906, + "tokens_trained": 0.445346072 + }, + { + "epoch": 0.25757038507907243, + "grad_norm": 48.33689880371094, + "loss": 4.4058, + "lr": 0.0009430769230769231, + "step": 908, + "tokens_trained": 0.446329872 + }, + { + "epoch": 0.2581377207290263, + "grad_norm": 39.081382751464844, + "loss": 4.4321, + "lr": 0.0009427972027972029, + "step": 910, + "tokens_trained": 0.447309544 + }, + { + "epoch": 0.2587050563789802, + "grad_norm": 62.18062210083008, + "loss": 4.4672, + "lr": 0.0009425174825174825, + "step": 912, + "tokens_trained": 0.448295056 + }, + { + "epoch": 0.2592723920289341, + "grad_norm": 28.725404739379883, + "loss": 4.4786, + "lr": 0.0009422377622377622, + "step": 914, + "tokens_trained": 0.449274208 + }, + { + "epoch": 0.259839727678888, + "grad_norm": 47.55582809448242, + "loss": 4.4227, + "lr": 0.000941958041958042, + "step": 916, + "tokens_trained": 0.450256408 + }, + { + "epoch": 0.26040706332884195, + "grad_norm": 35.743125915527344, + "loss": 4.379, + "lr": 0.0009416783216783217, + "step": 918, + "tokens_trained": 0.45123684 + }, + { + "epoch": 0.2609743989787958, + "grad_norm": 31.489402770996094, + "loss": 4.3888, + "lr": 0.0009413986013986015, + "step": 920, + "tokens_trained": 0.45221748 + }, + { + "epoch": 0.26154173462874974, + "grad_norm": 36.46233367919922, + "loss": 4.3982, + "lr": 0.0009411188811188811, + "step": 922, + "tokens_trained": 0.453202064 + }, + { + "epoch": 0.2621090702787036, + "grad_norm": 41.6457633972168, + "loss": 4.385, + "lr": 0.0009408391608391608, + "step": 924, + "tokens_trained": 0.454183456 + }, + { + "epoch": 0.26267640592865754, + "grad_norm": 26.52242088317871, + "loss": 4.4091, + "lr": 0.0009405594405594406, + "step": 926, + "tokens_trained": 0.455165496 + }, + { + "epoch": 0.26324374157861147, + "grad_norm": 14.401509284973145, + "loss": 4.3549, + "lr": 0.0009402797202797203, + "step": 928, + "tokens_trained": 0.456150248 + }, + { + "epoch": 0.26381107722856534, + "grad_norm": 30.626131057739258, + "loss": 4.3325, + "lr": 0.00094, + "step": 930, + "tokens_trained": 0.457134184 + }, + { + "epoch": 0.26437841287851926, + "grad_norm": 63.74067687988281, + "loss": 4.442, + "lr": 0.0009397202797202797, + "step": 932, + "tokens_trained": 0.458118808 + }, + { + "epoch": 0.26494574852847314, + "grad_norm": 12.15156364440918, + "loss": 4.4658, + "lr": 0.0009394405594405595, + "step": 934, + "tokens_trained": 0.459103872 + }, + { + "epoch": 0.26551308417842706, + "grad_norm": 76.2789306640625, + "loss": 4.8153, + "lr": 0.0009391608391608392, + "step": 936, + "tokens_trained": 0.460087216 + }, + { + "epoch": 0.266080419828381, + "grad_norm": 63.919334411621094, + "loss": 4.5707, + "lr": 0.000938881118881119, + "step": 938, + "tokens_trained": 0.461070568 + }, + { + "epoch": 0.26664775547833486, + "grad_norm": 75.1481704711914, + "loss": 4.5931, + "lr": 0.0009386013986013986, + "step": 940, + "tokens_trained": 0.462055184 + }, + { + "epoch": 0.2672150911282888, + "grad_norm": 33.118961334228516, + "loss": 4.4723, + "lr": 0.0009383216783216783, + "step": 942, + "tokens_trained": 0.463034592 + }, + { + "epoch": 0.26778242677824265, + "grad_norm": 30.8759765625, + "loss": 4.4275, + "lr": 0.0009380419580419581, + "step": 944, + "tokens_trained": 0.464016816 + }, + { + "epoch": 0.2683497624281966, + "grad_norm": 41.05061340332031, + "loss": 4.4566, + "lr": 0.0009377622377622378, + "step": 946, + "tokens_trained": 0.465000872 + }, + { + "epoch": 0.2689170980781505, + "grad_norm": 30.93424415588379, + "loss": 4.3985, + "lr": 0.0009374825174825175, + "step": 948, + "tokens_trained": 0.465984096 + }, + { + "epoch": 0.2694844337281044, + "grad_norm": 29.477052688598633, + "loss": 4.3718, + "lr": 0.0009372027972027972, + "step": 950, + "tokens_trained": 0.466961752 + }, + { + "epoch": 0.2700517693780583, + "grad_norm": 21.568912506103516, + "loss": 4.3697, + "lr": 0.0009369230769230769, + "step": 952, + "tokens_trained": 0.467950088 + }, + { + "epoch": 0.2706191050280122, + "grad_norm": 41.66835021972656, + "loss": 4.4241, + "lr": 0.0009366433566433567, + "step": 954, + "tokens_trained": 0.468928736 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 68.04551696777344, + "loss": 4.3978, + "lr": 0.0009363636363636364, + "step": 956, + "tokens_trained": 0.469907496 + }, + { + "epoch": 0.27175377632792, + "grad_norm": 37.655181884765625, + "loss": 4.4497, + "lr": 0.0009360839160839161, + "step": 958, + "tokens_trained": 0.470889168 + }, + { + "epoch": 0.2723211119778739, + "grad_norm": 22.074953079223633, + "loss": 4.3918, + "lr": 0.0009358041958041958, + "step": 960, + "tokens_trained": 0.471871816 + }, + { + "epoch": 0.2728884476278278, + "grad_norm": 49.925777435302734, + "loss": 4.4745, + "lr": 0.0009355244755244755, + "step": 962, + "tokens_trained": 0.472856728 + }, + { + "epoch": 0.2734557832777817, + "grad_norm": 46.520851135253906, + "loss": 4.403, + "lr": 0.0009352447552447553, + "step": 964, + "tokens_trained": 0.473838544 + }, + { + "epoch": 0.2740231189277356, + "grad_norm": 25.053146362304688, + "loss": 4.4247, + "lr": 0.0009349650349650349, + "step": 966, + "tokens_trained": 0.474819976 + }, + { + "epoch": 0.27459045457768955, + "grad_norm": 30.127140045166016, + "loss": 4.3834, + "lr": 0.0009346853146853147, + "step": 968, + "tokens_trained": 0.475800696 + }, + { + "epoch": 0.2751577902276434, + "grad_norm": 41.478328704833984, + "loss": 4.3978, + "lr": 0.0009344055944055944, + "step": 970, + "tokens_trained": 0.4767834 + }, + { + "epoch": 0.27572512587759734, + "grad_norm": 23.739456176757812, + "loss": 4.3698, + "lr": 0.0009341258741258742, + "step": 972, + "tokens_trained": 0.47776944 + }, + { + "epoch": 0.2762924615275512, + "grad_norm": 21.813220977783203, + "loss": 4.3902, + "lr": 0.0009338461538461539, + "step": 974, + "tokens_trained": 0.478757048 + }, + { + "epoch": 0.27685979717750514, + "grad_norm": 64.79598999023438, + "loss": 4.5237, + "lr": 0.0009335664335664336, + "step": 976, + "tokens_trained": 0.47973872 + }, + { + "epoch": 0.27742713282745907, + "grad_norm": 68.32705688476562, + "loss": 4.4461, + "lr": 0.0009332867132867133, + "step": 978, + "tokens_trained": 0.480721912 + }, + { + "epoch": 0.27799446847741294, + "grad_norm": 41.857582092285156, + "loss": 4.4663, + "lr": 0.0009330069930069929, + "step": 980, + "tokens_trained": 0.481704248 + }, + { + "epoch": 0.27856180412736686, + "grad_norm": 28.30609893798828, + "loss": 4.3461, + "lr": 0.0009327272727272728, + "step": 982, + "tokens_trained": 0.482689768 + }, + { + "epoch": 0.27912913977732073, + "grad_norm": 33.207950592041016, + "loss": 4.4185, + "lr": 0.0009324475524475524, + "step": 984, + "tokens_trained": 0.483670008 + }, + { + "epoch": 0.27969647542727466, + "grad_norm": 29.541227340698242, + "loss": 4.388, + "lr": 0.0009321678321678322, + "step": 986, + "tokens_trained": 0.48465836 + }, + { + "epoch": 0.2802638110772286, + "grad_norm": 16.23346710205078, + "loss": 4.3219, + "lr": 0.0009318881118881119, + "step": 988, + "tokens_trained": 0.4856402 + }, + { + "epoch": 0.28083114672718246, + "grad_norm": 20.036178588867188, + "loss": 4.3273, + "lr": 0.0009316083916083917, + "step": 990, + "tokens_trained": 0.486621648 + }, + { + "epoch": 0.2813984823771364, + "grad_norm": 49.25468063354492, + "loss": 4.4649, + "lr": 0.0009313286713286714, + "step": 992, + "tokens_trained": 0.48760744 + }, + { + "epoch": 0.28196581802709025, + "grad_norm": 48.59744644165039, + "loss": 4.3979, + "lr": 0.000931048951048951, + "step": 994, + "tokens_trained": 0.488590472 + }, + { + "epoch": 0.2825331536770442, + "grad_norm": 16.33649253845215, + "loss": 4.3945, + "lr": 0.0009307692307692308, + "step": 996, + "tokens_trained": 0.489570976 + }, + { + "epoch": 0.2831004893269981, + "grad_norm": 60.632591247558594, + "loss": 4.5581, + "lr": 0.0009304895104895104, + "step": 998, + "tokens_trained": 0.490552296 + }, + { + "epoch": 0.283667824976952, + "grad_norm": 52.75735092163086, + "loss": 4.424, + "lr": 0.0009302097902097903, + "step": 1000, + "tokens_trained": 0.49153744 + }, + { + "epoch": 0.283667824976952, + "eval_loss": 1.1363450288772583, + "eval_runtime": 20.7491, + "step": 1000, + "tokens_trained": 0.49153744 + }, + { + "epoch": 0.2842351606269059, + "grad_norm": 20.506614685058594, + "loss": 4.4241, + "lr": 0.0009299300699300699, + "step": 1002, + "tokens_trained": 0.492522608 + }, + { + "epoch": 0.2848024962768598, + "grad_norm": 23.148601531982422, + "loss": 4.3975, + "lr": 0.0009296503496503497, + "step": 1004, + "tokens_trained": 0.493501384 + }, + { + "epoch": 0.2853698319268137, + "grad_norm": 9.550869941711426, + "loss": 4.3952, + "lr": 0.0009293706293706294, + "step": 1006, + "tokens_trained": 0.494482544 + }, + { + "epoch": 0.2859371675767676, + "grad_norm": 80.31155395507812, + "loss": 4.7614, + "lr": 0.0009290909090909091, + "step": 1008, + "tokens_trained": 0.495459416 + }, + { + "epoch": 0.2865045032267215, + "grad_norm": 61.021026611328125, + "loss": 4.4396, + "lr": 0.0009288111888111889, + "step": 1010, + "tokens_trained": 0.4964418 + }, + { + "epoch": 0.2870718388766754, + "grad_norm": 35.23258972167969, + "loss": 4.5548, + "lr": 0.0009285314685314685, + "step": 1012, + "tokens_trained": 0.497428288 + }, + { + "epoch": 0.2876391745266293, + "grad_norm": 36.45478057861328, + "loss": 4.46, + "lr": 0.0009282517482517483, + "step": 1014, + "tokens_trained": 0.498416832 + }, + { + "epoch": 0.2882065101765832, + "grad_norm": 46.622982025146484, + "loss": 4.3554, + "lr": 0.0009279720279720279, + "step": 1016, + "tokens_trained": 0.499399792 + }, + { + "epoch": 0.28877384582653715, + "grad_norm": 87.00289154052734, + "loss": 4.5276, + "lr": 0.0009276923076923078, + "step": 1018, + "tokens_trained": 0.500383776 + }, + { + "epoch": 0.289341181476491, + "grad_norm": 11.444964408874512, + "loss": 4.5483, + "lr": 0.0009274125874125874, + "step": 1020, + "tokens_trained": 0.50136468 + }, + { + "epoch": 0.28990851712644494, + "grad_norm": 89.05914306640625, + "loss": 4.8957, + "lr": 0.0009271328671328671, + "step": 1022, + "tokens_trained": 0.50235172 + }, + { + "epoch": 0.2904758527763988, + "grad_norm": 26.915477752685547, + "loss": 4.6184, + "lr": 0.0009268531468531469, + "step": 1024, + "tokens_trained": 0.50333208 + }, + { + "epoch": 0.29104318842635274, + "grad_norm": 44.32100296020508, + "loss": 4.5263, + "lr": 0.0009265734265734266, + "step": 1026, + "tokens_trained": 0.504314656 + }, + { + "epoch": 0.29161052407630667, + "grad_norm": 26.699670791625977, + "loss": 4.3871, + "lr": 0.0009262937062937064, + "step": 1028, + "tokens_trained": 0.505296568 + }, + { + "epoch": 0.29217785972626054, + "grad_norm": 27.469482421875, + "loss": 4.3558, + "lr": 0.000926013986013986, + "step": 1030, + "tokens_trained": 0.506280416 + }, + { + "epoch": 0.29274519537621446, + "grad_norm": 26.149612426757812, + "loss": 4.3368, + "lr": 0.0009257342657342658, + "step": 1032, + "tokens_trained": 0.507261224 + }, + { + "epoch": 0.29331253102616833, + "grad_norm": 8.754459381103516, + "loss": 4.3447, + "lr": 0.0009254545454545454, + "step": 1034, + "tokens_trained": 0.508243288 + }, + { + "epoch": 0.29387986667612226, + "grad_norm": 32.17164611816406, + "loss": 4.4174, + "lr": 0.0009251748251748252, + "step": 1036, + "tokens_trained": 0.509224176 + }, + { + "epoch": 0.2944472023260762, + "grad_norm": 41.17238235473633, + "loss": 4.4221, + "lr": 0.0009248951048951049, + "step": 1038, + "tokens_trained": 0.510203568 + }, + { + "epoch": 0.29501453797603006, + "grad_norm": 44.97213363647461, + "loss": 4.3594, + "lr": 0.0009246153846153846, + "step": 1040, + "tokens_trained": 0.511186464 + }, + { + "epoch": 0.295581873625984, + "grad_norm": 42.23421859741211, + "loss": 4.4159, + "lr": 0.0009243356643356644, + "step": 1042, + "tokens_trained": 0.51216944 + }, + { + "epoch": 0.29614920927593785, + "grad_norm": 36.13594436645508, + "loss": 4.4105, + "lr": 0.0009240559440559441, + "step": 1044, + "tokens_trained": 0.513153144 + }, + { + "epoch": 0.2967165449258918, + "grad_norm": 36.89309310913086, + "loss": 4.3947, + "lr": 0.0009237762237762239, + "step": 1046, + "tokens_trained": 0.51413388 + }, + { + "epoch": 0.2972838805758457, + "grad_norm": 58.599700927734375, + "loss": 4.3988, + "lr": 0.0009234965034965035, + "step": 1048, + "tokens_trained": 0.515119288 + }, + { + "epoch": 0.2978512162257996, + "grad_norm": 13.725994110107422, + "loss": 4.412, + "lr": 0.0009232167832167832, + "step": 1050, + "tokens_trained": 0.51610284 + }, + { + "epoch": 0.2984185518757535, + "grad_norm": 105.28518676757812, + "loss": 4.7305, + "lr": 0.0009229370629370629, + "step": 1052, + "tokens_trained": 0.517085576 + }, + { + "epoch": 0.2989858875257074, + "grad_norm": 29.499713897705078, + "loss": 4.5106, + "lr": 0.0009226573426573427, + "step": 1054, + "tokens_trained": 0.518064224 + }, + { + "epoch": 0.2995532231756613, + "grad_norm": 60.907203674316406, + "loss": 4.5249, + "lr": 0.0009223776223776224, + "step": 1056, + "tokens_trained": 0.51905084 + }, + { + "epoch": 0.3001205588256152, + "grad_norm": 39.825069427490234, + "loss": 4.3695, + "lr": 0.0009220979020979021, + "step": 1058, + "tokens_trained": 0.5200318 + }, + { + "epoch": 0.3006878944755691, + "grad_norm": 42.77061462402344, + "loss": 4.4094, + "lr": 0.0009218181818181819, + "step": 1060, + "tokens_trained": 0.521013568 + }, + { + "epoch": 0.301255230125523, + "grad_norm": 37.05888748168945, + "loss": 4.3684, + "lr": 0.0009215384615384616, + "step": 1062, + "tokens_trained": 0.521997624 + }, + { + "epoch": 0.3018225657754769, + "grad_norm": 42.28252029418945, + "loss": 4.3489, + "lr": 0.0009212587412587413, + "step": 1064, + "tokens_trained": 0.522986184 + }, + { + "epoch": 0.3023899014254308, + "grad_norm": 40.95197677612305, + "loss": 4.3564, + "lr": 0.000920979020979021, + "step": 1066, + "tokens_trained": 0.523970984 + }, + { + "epoch": 0.30295723707538474, + "grad_norm": 25.469568252563477, + "loss": 4.3833, + "lr": 0.0009206993006993007, + "step": 1068, + "tokens_trained": 0.524952808 + }, + { + "epoch": 0.3035245727253386, + "grad_norm": 29.921735763549805, + "loss": 4.3579, + "lr": 0.0009204195804195804, + "step": 1070, + "tokens_trained": 0.525935696 + }, + { + "epoch": 0.30409190837529254, + "grad_norm": 26.038026809692383, + "loss": 4.2898, + "lr": 0.0009201398601398602, + "step": 1072, + "tokens_trained": 0.526916904 + }, + { + "epoch": 0.3046592440252464, + "grad_norm": 32.59503936767578, + "loss": 4.3335, + "lr": 0.0009198601398601398, + "step": 1074, + "tokens_trained": 0.527899864 + }, + { + "epoch": 0.30522657967520034, + "grad_norm": 14.04964828491211, + "loss": 4.3171, + "lr": 0.0009195804195804196, + "step": 1076, + "tokens_trained": 0.528878176 + }, + { + "epoch": 0.30579391532515426, + "grad_norm": 15.936906814575195, + "loss": 4.3005, + "lr": 0.0009193006993006993, + "step": 1078, + "tokens_trained": 0.529859952 + }, + { + "epoch": 0.30636125097510813, + "grad_norm": 9.73235034942627, + "loss": 4.3287, + "lr": 0.0009190209790209791, + "step": 1080, + "tokens_trained": 0.530838192 + }, + { + "epoch": 0.30692858662506206, + "grad_norm": 45.44027328491211, + "loss": 4.4384, + "lr": 0.0009187412587412588, + "step": 1082, + "tokens_trained": 0.531818376 + }, + { + "epoch": 0.30749592227501593, + "grad_norm": 55.65925598144531, + "loss": 4.3772, + "lr": 0.0009184615384615385, + "step": 1084, + "tokens_trained": 0.532802048 + }, + { + "epoch": 0.30806325792496986, + "grad_norm": 33.47093200683594, + "loss": 4.4257, + "lr": 0.0009181818181818182, + "step": 1086, + "tokens_trained": 0.533785376 + }, + { + "epoch": 0.3086305935749238, + "grad_norm": 39.709224700927734, + "loss": 4.4177, + "lr": 0.0009179020979020978, + "step": 1088, + "tokens_trained": 0.5347698 + }, + { + "epoch": 0.30919792922487765, + "grad_norm": 34.25212097167969, + "loss": 4.3518, + "lr": 0.0009176223776223777, + "step": 1090, + "tokens_trained": 0.53575108 + }, + { + "epoch": 0.3097652648748316, + "grad_norm": 29.156312942504883, + "loss": 4.3596, + "lr": 0.0009173426573426573, + "step": 1092, + "tokens_trained": 0.536735544 + }, + { + "epoch": 0.31033260052478545, + "grad_norm": 31.714128494262695, + "loss": 4.3736, + "lr": 0.0009170629370629371, + "step": 1094, + "tokens_trained": 0.537718008 + }, + { + "epoch": 0.3108999361747394, + "grad_norm": 12.244729042053223, + "loss": 4.3472, + "lr": 0.0009167832167832168, + "step": 1096, + "tokens_trained": 0.538693512 + }, + { + "epoch": 0.3114672718246933, + "grad_norm": 10.271063804626465, + "loss": 4.301, + "lr": 0.0009165034965034966, + "step": 1098, + "tokens_trained": 0.539681376 + }, + { + "epoch": 0.3120346074746472, + "grad_norm": 35.79754638671875, + "loss": 4.3912, + "lr": 0.0009162237762237763, + "step": 1100, + "tokens_trained": 0.540661392 + }, + { + "epoch": 0.3126019431246011, + "grad_norm": 24.1260986328125, + "loss": 4.3303, + "lr": 0.0009159440559440559, + "step": 1102, + "tokens_trained": 0.541646968 + }, + { + "epoch": 0.31316927877455497, + "grad_norm": 24.501169204711914, + "loss": 4.3205, + "lr": 0.0009156643356643357, + "step": 1104, + "tokens_trained": 0.542629392 + }, + { + "epoch": 0.3137366144245089, + "grad_norm": 17.031600952148438, + "loss": 4.2521, + "lr": 0.0009153846153846153, + "step": 1106, + "tokens_trained": 0.54361348 + }, + { + "epoch": 0.3143039500744628, + "grad_norm": 19.506216049194336, + "loss": 4.3225, + "lr": 0.0009151048951048952, + "step": 1108, + "tokens_trained": 0.544595336 + }, + { + "epoch": 0.3148712857244167, + "grad_norm": 20.822546005249023, + "loss": 4.2711, + "lr": 0.0009148251748251748, + "step": 1110, + "tokens_trained": 0.545578256 + }, + { + "epoch": 0.3154386213743706, + "grad_norm": 29.967998504638672, + "loss": 4.2868, + "lr": 0.0009145454545454546, + "step": 1112, + "tokens_trained": 0.546561024 + }, + { + "epoch": 0.3160059570243245, + "grad_norm": 24.06121063232422, + "loss": 4.2701, + "lr": 0.0009142657342657343, + "step": 1114, + "tokens_trained": 0.547544616 + }, + { + "epoch": 0.3165732926742784, + "grad_norm": 15.868765830993652, + "loss": 4.3233, + "lr": 0.000913986013986014, + "step": 1116, + "tokens_trained": 0.548526216 + }, + { + "epoch": 0.31714062832423234, + "grad_norm": 27.47897720336914, + "loss": 4.2813, + "lr": 0.0009137062937062938, + "step": 1118, + "tokens_trained": 0.549506544 + }, + { + "epoch": 0.3177079639741862, + "grad_norm": 15.343204498291016, + "loss": 4.3002, + "lr": 0.0009134265734265734, + "step": 1120, + "tokens_trained": 0.550488496 + }, + { + "epoch": 0.31827529962414014, + "grad_norm": 4.320124626159668, + "loss": 4.2622, + "lr": 0.0009131468531468532, + "step": 1122, + "tokens_trained": 0.551471792 + }, + { + "epoch": 0.318842635274094, + "grad_norm": 34.520050048828125, + "loss": 4.366, + "lr": 0.0009128671328671328, + "step": 1124, + "tokens_trained": 0.552457008 + }, + { + "epoch": 0.319126303099071, + "eval_loss": 1.096465826034546, + "eval_runtime": 20.7643, + "step": 1125, + "tokens_trained": 0.552948064 + }, + { + "epoch": 0.31940997092404794, + "grad_norm": 39.718719482421875, + "loss": 4.3317, + "lr": 0.0009125874125874127, + "step": 1126, + "tokens_trained": 0.5534394 + }, + { + "epoch": 0.31997730657400186, + "grad_norm": 20.843252182006836, + "loss": 4.3883, + "lr": 0.0009123076923076923, + "step": 1128, + "tokens_trained": 0.554419184 + }, + { + "epoch": 0.32054464222395573, + "grad_norm": 12.916360855102539, + "loss": 4.3119, + "lr": 0.000912027972027972, + "step": 1130, + "tokens_trained": 0.555401952 + }, + { + "epoch": 0.32111197787390966, + "grad_norm": 48.54426956176758, + "loss": 4.4155, + "lr": 0.0009117482517482518, + "step": 1132, + "tokens_trained": 0.556385024 + }, + { + "epoch": 0.32167931352386353, + "grad_norm": 41.00883483886719, + "loss": 4.362, + "lr": 0.0009114685314685315, + "step": 1134, + "tokens_trained": 0.557368472 + }, + { + "epoch": 0.32224664917381746, + "grad_norm": 28.0487060546875, + "loss": 4.3504, + "lr": 0.0009111888111888113, + "step": 1136, + "tokens_trained": 0.55835288 + }, + { + "epoch": 0.3228139848237714, + "grad_norm": 22.05229377746582, + "loss": 4.331, + "lr": 0.0009109090909090909, + "step": 1138, + "tokens_trained": 0.559337064 + }, + { + "epoch": 0.32338132047372525, + "grad_norm": 16.770631790161133, + "loss": 4.3008, + "lr": 0.0009106293706293707, + "step": 1140, + "tokens_trained": 0.560317984 + }, + { + "epoch": 0.3239486561236792, + "grad_norm": 35.300262451171875, + "loss": 4.4083, + "lr": 0.0009103496503496503, + "step": 1142, + "tokens_trained": 0.561299688 + }, + { + "epoch": 0.32451599177363305, + "grad_norm": 23.788284301757812, + "loss": 4.2772, + "lr": 0.0009100699300699301, + "step": 1144, + "tokens_trained": 0.562285664 + }, + { + "epoch": 0.325083327423587, + "grad_norm": 23.085710525512695, + "loss": 4.3185, + "lr": 0.0009097902097902098, + "step": 1146, + "tokens_trained": 0.563267832 + }, + { + "epoch": 0.3256506630735409, + "grad_norm": 13.11314582824707, + "loss": 4.2711, + "lr": 0.0009095104895104895, + "step": 1148, + "tokens_trained": 0.564248928 + }, + { + "epoch": 0.3262179987234948, + "grad_norm": 31.297805786132812, + "loss": 4.3096, + "lr": 0.0009092307692307692, + "step": 1150, + "tokens_trained": 0.56522952 + }, + { + "epoch": 0.3267853343734487, + "grad_norm": 11.668539047241211, + "loss": 4.2667, + "lr": 0.000908951048951049, + "step": 1152, + "tokens_trained": 0.566212392 + }, + { + "epoch": 0.32735267002340257, + "grad_norm": 23.359189987182617, + "loss": 4.3156, + "lr": 0.0009086713286713288, + "step": 1154, + "tokens_trained": 0.567192216 + }, + { + "epoch": 0.3279200056733565, + "grad_norm": 31.09916114807129, + "loss": 4.3367, + "lr": 0.0009083916083916084, + "step": 1156, + "tokens_trained": 0.568177088 + }, + { + "epoch": 0.3284873413233104, + "grad_norm": 24.03261947631836, + "loss": 4.3504, + "lr": 0.0009081118881118881, + "step": 1158, + "tokens_trained": 0.56915868 + }, + { + "epoch": 0.3290546769732643, + "grad_norm": 16.029443740844727, + "loss": 4.3192, + "lr": 0.0009078321678321678, + "step": 1160, + "tokens_trained": 0.570142976 + }, + { + "epoch": 0.3296220126232182, + "grad_norm": 53.486724853515625, + "loss": 4.3921, + "lr": 0.0009075524475524476, + "step": 1162, + "tokens_trained": 0.57112748 + }, + { + "epoch": 0.3301893482731721, + "grad_norm": 37.42267608642578, + "loss": 4.2821, + "lr": 0.0009072727272727273, + "step": 1164, + "tokens_trained": 0.57211356 + }, + { + "epoch": 0.330756683923126, + "grad_norm": 28.862472534179688, + "loss": 4.3002, + "lr": 0.000906993006993007, + "step": 1166, + "tokens_trained": 0.57309492 + }, + { + "epoch": 0.33132401957307994, + "grad_norm": 22.26299476623535, + "loss": 4.2729, + "lr": 0.0009067132867132866, + "step": 1168, + "tokens_trained": 0.5740806 + }, + { + "epoch": 0.3318913552230338, + "grad_norm": 21.635013580322266, + "loss": 4.2866, + "lr": 0.0009064335664335665, + "step": 1170, + "tokens_trained": 0.575061664 + }, + { + "epoch": 0.33245869087298774, + "grad_norm": 18.995012283325195, + "loss": 4.2814, + "lr": 0.0009061538461538462, + "step": 1172, + "tokens_trained": 0.576046304 + }, + { + "epoch": 0.3330260265229416, + "grad_norm": 22.621299743652344, + "loss": 4.2739, + "lr": 0.0009058741258741259, + "step": 1174, + "tokens_trained": 0.577032376 + }, + { + "epoch": 0.33359336217289554, + "grad_norm": 21.758216857910156, + "loss": 4.263, + "lr": 0.0009055944055944056, + "step": 1176, + "tokens_trained": 0.578013896 + }, + { + "epoch": 0.33416069782284946, + "grad_norm": 32.38374710083008, + "loss": 4.2713, + "lr": 0.0009053146853146853, + "step": 1178, + "tokens_trained": 0.57900508 + }, + { + "epoch": 0.33472803347280333, + "grad_norm": 35.57462692260742, + "loss": 4.2986, + "lr": 0.0009050349650349651, + "step": 1180, + "tokens_trained": 0.57999512 + }, + { + "epoch": 0.33529536912275726, + "grad_norm": 11.77812385559082, + "loss": 4.3085, + "lr": 0.0009047552447552448, + "step": 1182, + "tokens_trained": 0.580982752 + }, + { + "epoch": 0.33586270477271113, + "grad_norm": 51.48725509643555, + "loss": 4.4003, + "lr": 0.0009044755244755245, + "step": 1184, + "tokens_trained": 0.581964936 + }, + { + "epoch": 0.33643004042266506, + "grad_norm": 47.01481628417969, + "loss": 4.3182, + "lr": 0.0009041958041958041, + "step": 1186, + "tokens_trained": 0.582949944 + }, + { + "epoch": 0.336997376072619, + "grad_norm": 22.935691833496094, + "loss": 4.3432, + "lr": 0.000903916083916084, + "step": 1188, + "tokens_trained": 0.583934776 + }, + { + "epoch": 0.33756471172257285, + "grad_norm": 45.21054458618164, + "loss": 4.4674, + "lr": 0.0009036363636363637, + "step": 1190, + "tokens_trained": 0.584918344 + }, + { + "epoch": 0.3381320473725268, + "grad_norm": 27.012706756591797, + "loss": 4.2889, + "lr": 0.0009033566433566434, + "step": 1192, + "tokens_trained": 0.585897632 + }, + { + "epoch": 0.33869938302248065, + "grad_norm": 16.68247413635254, + "loss": 4.2896, + "lr": 0.0009030769230769231, + "step": 1194, + "tokens_trained": 0.586879408 + }, + { + "epoch": 0.3392667186724346, + "grad_norm": 20.664148330688477, + "loss": 4.304, + "lr": 0.0009027972027972027, + "step": 1196, + "tokens_trained": 0.587859392 + }, + { + "epoch": 0.3398340543223885, + "grad_norm": 22.954742431640625, + "loss": 4.2853, + "lr": 0.0009025174825174826, + "step": 1198, + "tokens_trained": 0.588845408 + }, + { + "epoch": 0.34040138997234237, + "grad_norm": 23.226943969726562, + "loss": 4.2597, + "lr": 0.0009022377622377622, + "step": 1200, + "tokens_trained": 0.589832736 + }, + { + "epoch": 0.3409687256222963, + "grad_norm": 7.963059902191162, + "loss": 4.261, + "lr": 0.000901958041958042, + "step": 1202, + "tokens_trained": 0.590816568 + }, + { + "epoch": 0.34153606127225017, + "grad_norm": 25.160730361938477, + "loss": 4.3288, + "lr": 0.0009016783216783216, + "step": 1204, + "tokens_trained": 0.59179692 + }, + { + "epoch": 0.3421033969222041, + "grad_norm": 38.45030212402344, + "loss": 4.3371, + "lr": 0.0009013986013986014, + "step": 1206, + "tokens_trained": 0.592780968 + }, + { + "epoch": 0.342670732572158, + "grad_norm": 52.66873550415039, + "loss": 4.2805, + "lr": 0.0009011188811188812, + "step": 1208, + "tokens_trained": 0.593760896 + }, + { + "epoch": 0.3432380682221119, + "grad_norm": 28.104921340942383, + "loss": 4.3885, + "lr": 0.0009008391608391609, + "step": 1210, + "tokens_trained": 0.59474304 + }, + { + "epoch": 0.3438054038720658, + "grad_norm": 49.20989990234375, + "loss": 4.346, + "lr": 0.0009005594405594406, + "step": 1212, + "tokens_trained": 0.59572768 + }, + { + "epoch": 0.3443727395220197, + "grad_norm": 20.652427673339844, + "loss": 4.2368, + "lr": 0.0009002797202797202, + "step": 1214, + "tokens_trained": 0.59671092 + }, + { + "epoch": 0.3449400751719736, + "grad_norm": 17.821596145629883, + "loss": 4.3041, + "lr": 0.0009000000000000001, + "step": 1216, + "tokens_trained": 0.597697344 + }, + { + "epoch": 0.34550741082192754, + "grad_norm": 48.594932556152344, + "loss": 4.3668, + "lr": 0.0008997202797202797, + "step": 1218, + "tokens_trained": 0.598677288 + }, + { + "epoch": 0.3460747464718814, + "grad_norm": 27.70078468322754, + "loss": 4.2939, + "lr": 0.0008994405594405595, + "step": 1220, + "tokens_trained": 0.599662488 + }, + { + "epoch": 0.34664208212183534, + "grad_norm": 25.498798370361328, + "loss": 4.2891, + "lr": 0.0008991608391608391, + "step": 1222, + "tokens_trained": 0.600646904 + }, + { + "epoch": 0.3472094177717892, + "grad_norm": 13.455835342407227, + "loss": 4.2881, + "lr": 0.0008988811188811188, + "step": 1224, + "tokens_trained": 0.601628112 + }, + { + "epoch": 0.34777675342174313, + "grad_norm": 17.518342971801758, + "loss": 4.2977, + "lr": 0.0008986013986013987, + "step": 1226, + "tokens_trained": 0.602612336 + }, + { + "epoch": 0.34834408907169706, + "grad_norm": 20.642597198486328, + "loss": 4.2921, + "lr": 0.0008983216783216783, + "step": 1228, + "tokens_trained": 0.603595 + }, + { + "epoch": 0.34891142472165093, + "grad_norm": 14.464616775512695, + "loss": 4.233, + "lr": 0.0008980419580419581, + "step": 1230, + "tokens_trained": 0.604576592 + }, + { + "epoch": 0.34947876037160486, + "grad_norm": 13.204504013061523, + "loss": 4.2707, + "lr": 0.0008977622377622377, + "step": 1232, + "tokens_trained": 0.60555656 + }, + { + "epoch": 0.35004609602155873, + "grad_norm": 12.241665840148926, + "loss": 4.2506, + "lr": 0.0008974825174825176, + "step": 1234, + "tokens_trained": 0.606536024 + }, + { + "epoch": 0.35061343167151265, + "grad_norm": 18.187660217285156, + "loss": 4.2659, + "lr": 0.0008972027972027972, + "step": 1236, + "tokens_trained": 0.607522576 + }, + { + "epoch": 0.3511807673214666, + "grad_norm": 8.911888122558594, + "loss": 4.2505, + "lr": 0.000896923076923077, + "step": 1238, + "tokens_trained": 0.608507736 + }, + { + "epoch": 0.35174810297142045, + "grad_norm": 21.351713180541992, + "loss": 4.2291, + "lr": 0.0008966433566433566, + "step": 1240, + "tokens_trained": 0.609486688 + }, + { + "epoch": 0.3523154386213744, + "grad_norm": 47.81566619873047, + "loss": 4.2725, + "lr": 0.0008963636363636363, + "step": 1242, + "tokens_trained": 0.610470272 + }, + { + "epoch": 0.35288277427132825, + "grad_norm": 33.53351974487305, + "loss": 4.3237, + "lr": 0.0008960839160839162, + "step": 1244, + "tokens_trained": 0.611455176 + }, + { + "epoch": 0.3534501099212822, + "grad_norm": 15.252607345581055, + "loss": 4.2868, + "lr": 0.0008958041958041958, + "step": 1246, + "tokens_trained": 0.612437888 + }, + { + "epoch": 0.3540174455712361, + "grad_norm": 24.129865646362305, + "loss": 4.2626, + "lr": 0.0008955244755244756, + "step": 1248, + "tokens_trained": 0.613420728 + }, + { + "epoch": 0.35458478122118997, + "grad_norm": 34.814605712890625, + "loss": 4.2627, + "lr": 0.0008952447552447552, + "step": 1250, + "tokens_trained": 0.614405904 + }, + { + "epoch": 0.35458478122118997, + "eval_loss": 1.078355312347412, + "eval_runtime": 20.4723, + "step": 1250, + "tokens_trained": 0.614405904 + }, + { + "epoch": 0.3551521168711439, + "grad_norm": 18.26809310913086, + "loss": 4.2986, + "lr": 0.000894965034965035, + "step": 1252, + "tokens_trained": 0.615386288 + }, + { + "epoch": 0.35571945252109777, + "grad_norm": 24.68335723876953, + "loss": 4.3146, + "lr": 0.0008946853146853147, + "step": 1254, + "tokens_trained": 0.616370576 + }, + { + "epoch": 0.3562867881710517, + "grad_norm": 35.34586715698242, + "loss": 4.2905, + "lr": 0.0008944055944055944, + "step": 1256, + "tokens_trained": 0.617351944 + }, + { + "epoch": 0.3568541238210056, + "grad_norm": 22.668407440185547, + "loss": 4.2607, + "lr": 0.0008941258741258741, + "step": 1258, + "tokens_trained": 0.618334816 + }, + { + "epoch": 0.3574214594709595, + "grad_norm": 14.068164825439453, + "loss": 4.2459, + "lr": 0.0008938461538461538, + "step": 1260, + "tokens_trained": 0.619319736 + }, + { + "epoch": 0.3579887951209134, + "grad_norm": 8.274995803833008, + "loss": 4.2713, + "lr": 0.0008935664335664337, + "step": 1262, + "tokens_trained": 0.620299344 + }, + { + "epoch": 0.3585561307708673, + "grad_norm": 22.12897491455078, + "loss": 4.2841, + "lr": 0.0008932867132867133, + "step": 1264, + "tokens_trained": 0.621282592 + }, + { + "epoch": 0.3591234664208212, + "grad_norm": 26.171052932739258, + "loss": 4.2505, + "lr": 0.000893006993006993, + "step": 1266, + "tokens_trained": 0.622266136 + }, + { + "epoch": 0.35969080207077514, + "grad_norm": 14.768603324890137, + "loss": 4.271, + "lr": 0.0008927272727272727, + "step": 1268, + "tokens_trained": 0.623247816 + }, + { + "epoch": 0.360258137720729, + "grad_norm": 13.065408706665039, + "loss": 4.2387, + "lr": 0.0008924475524475525, + "step": 1270, + "tokens_trained": 0.624234848 + }, + { + "epoch": 0.36082547337068294, + "grad_norm": 14.043888092041016, + "loss": 4.2601, + "lr": 0.0008921678321678322, + "step": 1272, + "tokens_trained": 0.625214176 + }, + { + "epoch": 0.3613928090206368, + "grad_norm": 13.734328269958496, + "loss": 4.2426, + "lr": 0.0008918881118881119, + "step": 1274, + "tokens_trained": 0.626197608 + }, + { + "epoch": 0.36196014467059073, + "grad_norm": 10.075374603271484, + "loss": 4.2259, + "lr": 0.0008916083916083916, + "step": 1276, + "tokens_trained": 0.62717884 + }, + { + "epoch": 0.36252748032054466, + "grad_norm": 33.92001724243164, + "loss": 4.3054, + "lr": 0.0008913286713286713, + "step": 1278, + "tokens_trained": 0.628166888 + }, + { + "epoch": 0.36309481597049853, + "grad_norm": 31.1391544342041, + "loss": 4.3066, + "lr": 0.0008910489510489512, + "step": 1280, + "tokens_trained": 0.629152528 + }, + { + "epoch": 0.36366215162045246, + "grad_norm": 10.888711929321289, + "loss": 4.2348, + "lr": 0.0008907692307692308, + "step": 1282, + "tokens_trained": 0.630132584 + }, + { + "epoch": 0.3642294872704063, + "grad_norm": 27.298410415649414, + "loss": 4.3225, + "lr": 0.0008904895104895105, + "step": 1284, + "tokens_trained": 0.63111212 + }, + { + "epoch": 0.36479682292036025, + "grad_norm": 23.396818161010742, + "loss": 4.3177, + "lr": 0.0008902097902097902, + "step": 1286, + "tokens_trained": 0.632094984 + }, + { + "epoch": 0.3653641585703142, + "grad_norm": 18.824432373046875, + "loss": 4.2235, + "lr": 0.00088993006993007, + "step": 1288, + "tokens_trained": 0.633076832 + }, + { + "epoch": 0.36593149422026805, + "grad_norm": 8.04826545715332, + "loss": 4.2268, + "lr": 0.0008896503496503497, + "step": 1290, + "tokens_trained": 0.63405868 + }, + { + "epoch": 0.366498829870222, + "grad_norm": 32.26673889160156, + "loss": 4.3113, + "lr": 0.0008893706293706294, + "step": 1292, + "tokens_trained": 0.635045096 + }, + { + "epoch": 0.36706616552017585, + "grad_norm": 29.91358184814453, + "loss": 4.2971, + "lr": 0.000889090909090909, + "step": 1294, + "tokens_trained": 0.63603008 + }, + { + "epoch": 0.3676335011701298, + "grad_norm": 12.093538284301758, + "loss": 4.2502, + "lr": 0.0008888111888111888, + "step": 1296, + "tokens_trained": 0.637014016 + }, + { + "epoch": 0.3682008368200837, + "grad_norm": 8.252509117126465, + "loss": 4.2905, + "lr": 0.0008885314685314686, + "step": 1298, + "tokens_trained": 0.637997752 + }, + { + "epoch": 0.36876817247003757, + "grad_norm": 61.22240447998047, + "loss": 4.4753, + "lr": 0.0008882517482517483, + "step": 1300, + "tokens_trained": 0.638981552 + }, + { + "epoch": 0.3693355081199915, + "grad_norm": 47.58195877075195, + "loss": 4.2769, + "lr": 0.000887972027972028, + "step": 1302, + "tokens_trained": 0.639963512 + }, + { + "epoch": 0.36990284376994537, + "grad_norm": 28.806411743164062, + "loss": 4.3728, + "lr": 0.0008876923076923077, + "step": 1304, + "tokens_trained": 0.640948392 + }, + { + "epoch": 0.3704701794198993, + "grad_norm": 38.960853576660156, + "loss": 4.338, + "lr": 0.0008874125874125875, + "step": 1306, + "tokens_trained": 0.641935304 + }, + { + "epoch": 0.3710375150698532, + "grad_norm": 25.05726432800293, + "loss": 4.3002, + "lr": 0.0008871328671328671, + "step": 1308, + "tokens_trained": 0.642924168 + }, + { + "epoch": 0.3716048507198071, + "grad_norm": 39.84127426147461, + "loss": 4.3593, + "lr": 0.0008868531468531469, + "step": 1310, + "tokens_trained": 0.64390412 + }, + { + "epoch": 0.372172186369761, + "grad_norm": 15.03055191040039, + "loss": 4.223, + "lr": 0.0008865734265734265, + "step": 1312, + "tokens_trained": 0.644882104 + }, + { + "epoch": 0.3727395220197149, + "grad_norm": 41.85628890991211, + "loss": 4.3819, + "lr": 0.0008862937062937063, + "step": 1314, + "tokens_trained": 0.645866912 + }, + { + "epoch": 0.3733068576696688, + "grad_norm": 29.014118194580078, + "loss": 4.2843, + "lr": 0.0008860139860139861, + "step": 1316, + "tokens_trained": 0.646850376 + }, + { + "epoch": 0.37387419331962274, + "grad_norm": 24.407743453979492, + "loss": 4.2598, + "lr": 0.0008857342657342658, + "step": 1318, + "tokens_trained": 0.647832272 + }, + { + "epoch": 0.3744415289695766, + "grad_norm": 23.28154182434082, + "loss": 4.2162, + "lr": 0.0008854545454545455, + "step": 1320, + "tokens_trained": 0.64881652 + }, + { + "epoch": 0.37500886461953054, + "grad_norm": 17.70418930053711, + "loss": 4.2386, + "lr": 0.0008851748251748251, + "step": 1322, + "tokens_trained": 0.649794936 + }, + { + "epoch": 0.37557620026948446, + "grad_norm": 22.582124710083008, + "loss": 4.2358, + "lr": 0.000884895104895105, + "step": 1324, + "tokens_trained": 0.650777784 + }, + { + "epoch": 0.37614353591943833, + "grad_norm": 16.77848243713379, + "loss": 4.2536, + "lr": 0.0008846153846153846, + "step": 1326, + "tokens_trained": 0.651762472 + }, + { + "epoch": 0.37671087156939226, + "grad_norm": 14.382417678833008, + "loss": 4.2403, + "lr": 0.0008843356643356644, + "step": 1328, + "tokens_trained": 0.652741832 + }, + { + "epoch": 0.37727820721934613, + "grad_norm": 22.420886993408203, + "loss": 4.1977, + "lr": 0.000884055944055944, + "step": 1330, + "tokens_trained": 0.653725792 + }, + { + "epoch": 0.37784554286930006, + "grad_norm": 9.768660545349121, + "loss": 4.2148, + "lr": 0.0008837762237762238, + "step": 1332, + "tokens_trained": 0.654704648 + }, + { + "epoch": 0.378412878519254, + "grad_norm": 5.091487407684326, + "loss": 4.2062, + "lr": 0.0008834965034965036, + "step": 1334, + "tokens_trained": 0.65569176 + }, + { + "epoch": 0.37898021416920785, + "grad_norm": 53.520957946777344, + "loss": 4.4082, + "lr": 0.0008832167832167832, + "step": 1336, + "tokens_trained": 0.656679344 + }, + { + "epoch": 0.3795475498191618, + "grad_norm": 32.17420959472656, + "loss": 4.2911, + "lr": 0.000882937062937063, + "step": 1338, + "tokens_trained": 0.657665136 + }, + { + "epoch": 0.38011488546911565, + "grad_norm": 14.12790584564209, + "loss": 4.2899, + "lr": 0.0008826573426573426, + "step": 1340, + "tokens_trained": 0.658651576 + }, + { + "epoch": 0.3806822211190696, + "grad_norm": 51.74199676513672, + "loss": 4.3901, + "lr": 0.0008823776223776225, + "step": 1342, + "tokens_trained": 0.659631792 + }, + { + "epoch": 0.3812495567690235, + "grad_norm": 48.99909973144531, + "loss": 4.298, + "lr": 0.0008820979020979021, + "step": 1344, + "tokens_trained": 0.660616912 + }, + { + "epoch": 0.38181689241897737, + "grad_norm": 28.356245040893555, + "loss": 4.3171, + "lr": 0.0008818181818181819, + "step": 1346, + "tokens_trained": 0.66159872 + }, + { + "epoch": 0.3823842280689313, + "grad_norm": 45.081703186035156, + "loss": 4.3067, + "lr": 0.0008815384615384615, + "step": 1348, + "tokens_trained": 0.662582152 + }, + { + "epoch": 0.38295156371888517, + "grad_norm": 37.175052642822266, + "loss": 4.241, + "lr": 0.0008812587412587412, + "step": 1350, + "tokens_trained": 0.663561176 + }, + { + "epoch": 0.3835188993688391, + "grad_norm": 49.46076965332031, + "loss": 4.2896, + "lr": 0.0008809790209790211, + "step": 1352, + "tokens_trained": 0.664545144 + }, + { + "epoch": 0.384086235018793, + "grad_norm": 22.20182991027832, + "loss": 4.323, + "lr": 0.0008806993006993007, + "step": 1354, + "tokens_trained": 0.66553092 + }, + { + "epoch": 0.3846535706687469, + "grad_norm": 34.111549377441406, + "loss": 4.3138, + "lr": 0.0008804195804195805, + "step": 1356, + "tokens_trained": 0.666517568 + }, + { + "epoch": 0.3852209063187008, + "grad_norm": 47.01582336425781, + "loss": 4.3009, + "lr": 0.0008801398601398601, + "step": 1358, + "tokens_trained": 0.667498192 + }, + { + "epoch": 0.3857882419686547, + "grad_norm": 18.845388412475586, + "loss": 4.3176, + "lr": 0.00087986013986014, + "step": 1360, + "tokens_trained": 0.668479008 + }, + { + "epoch": 0.3863555776186086, + "grad_norm": 53.68927764892578, + "loss": 4.4024, + "lr": 0.0008795804195804196, + "step": 1362, + "tokens_trained": 0.669462472 + }, + { + "epoch": 0.38692291326856254, + "grad_norm": 29.88358497619629, + "loss": 4.286, + "lr": 0.0008793006993006993, + "step": 1364, + "tokens_trained": 0.67044392 + }, + { + "epoch": 0.3874902489185164, + "grad_norm": 11.12879753112793, + "loss": 4.3024, + "lr": 0.000879020979020979, + "step": 1366, + "tokens_trained": 0.671424552 + }, + { + "epoch": 0.38805758456847034, + "grad_norm": 23.573301315307617, + "loss": 4.2662, + "lr": 0.0008787412587412587, + "step": 1368, + "tokens_trained": 0.672409992 + }, + { + "epoch": 0.3886249202184242, + "grad_norm": 24.749160766601562, + "loss": 4.274, + "lr": 0.0008784615384615386, + "step": 1370, + "tokens_trained": 0.67339824 + }, + { + "epoch": 0.38919225586837813, + "grad_norm": 33.26881408691406, + "loss": 4.2588, + "lr": 0.0008781818181818182, + "step": 1372, + "tokens_trained": 0.67438204 + }, + { + "epoch": 0.38975959151833206, + "grad_norm": 24.466472625732422, + "loss": 4.2837, + "lr": 0.000877902097902098, + "step": 1374, + "tokens_trained": 0.67536356 + }, + { + "epoch": 0.39004325934330897, + "eval_loss": 1.0616238117218018, + "eval_runtime": 20.3698, + "step": 1375, + "tokens_trained": 0.675855672 + }, + { + "epoch": 0.39032692716828593, + "grad_norm": 24.48844337463379, + "loss": 4.259, + "lr": 0.0008776223776223776, + "step": 1376, + "tokens_trained": 0.676346368 + }, + { + "epoch": 0.39089426281823986, + "grad_norm": 30.594989776611328, + "loss": 4.1894, + "lr": 0.0008773426573426574, + "step": 1378, + "tokens_trained": 0.677329312 + }, + { + "epoch": 0.3914615984681937, + "grad_norm": 19.835350036621094, + "loss": 4.2718, + "lr": 0.0008770629370629371, + "step": 1380, + "tokens_trained": 0.678312272 + }, + { + "epoch": 0.39202893411814765, + "grad_norm": 14.570358276367188, + "loss": 4.2419, + "lr": 0.0008767832167832168, + "step": 1382, + "tokens_trained": 0.679291216 + }, + { + "epoch": 0.3925962697681016, + "grad_norm": 11.608271598815918, + "loss": 4.1917, + "lr": 0.0008765034965034965, + "step": 1384, + "tokens_trained": 0.680273296 + }, + { + "epoch": 0.39316360541805545, + "grad_norm": 26.094860076904297, + "loss": 4.2762, + "lr": 0.0008762237762237762, + "step": 1386, + "tokens_trained": 0.681249464 + }, + { + "epoch": 0.3937309410680094, + "grad_norm": 12.754049301147461, + "loss": 4.2032, + "lr": 0.0008759440559440561, + "step": 1388, + "tokens_trained": 0.682234168 + }, + { + "epoch": 0.39429827671796325, + "grad_norm": 5.951663970947266, + "loss": 4.1921, + "lr": 0.0008756643356643357, + "step": 1390, + "tokens_trained": 0.683217176 + }, + { + "epoch": 0.3948656123679172, + "grad_norm": 26.907669067382812, + "loss": 4.24, + "lr": 0.0008753846153846154, + "step": 1392, + "tokens_trained": 0.68419888 + }, + { + "epoch": 0.3954329480178711, + "grad_norm": 25.04796600341797, + "loss": 4.2656, + "lr": 0.0008751048951048951, + "step": 1394, + "tokens_trained": 0.685178784 + }, + { + "epoch": 0.39600028366782497, + "grad_norm": 19.600811004638672, + "loss": 4.2683, + "lr": 0.0008748251748251749, + "step": 1396, + "tokens_trained": 0.686161632 + }, + { + "epoch": 0.3965676193177789, + "grad_norm": 14.087088584899902, + "loss": 4.2658, + "lr": 0.0008745454545454546, + "step": 1398, + "tokens_trained": 0.687139992 + }, + { + "epoch": 0.39713495496773277, + "grad_norm": 9.257765769958496, + "loss": 4.2021, + "lr": 0.0008742657342657343, + "step": 1400, + "tokens_trained": 0.688117912 + }, + { + "epoch": 0.3977022906176867, + "grad_norm": 18.830154418945312, + "loss": 4.2249, + "lr": 0.0008739860139860139, + "step": 1402, + "tokens_trained": 0.689098776 + }, + { + "epoch": 0.3982696262676406, + "grad_norm": 24.81566619873047, + "loss": 4.246, + "lr": 0.0008737062937062937, + "step": 1404, + "tokens_trained": 0.690085432 + }, + { + "epoch": 0.3988369619175945, + "grad_norm": 14.071616172790527, + "loss": 4.2531, + "lr": 0.0008734265734265734, + "step": 1406, + "tokens_trained": 0.691069232 + }, + { + "epoch": 0.3994042975675484, + "grad_norm": 21.414424896240234, + "loss": 4.2192, + "lr": 0.0008731468531468532, + "step": 1408, + "tokens_trained": 0.692051224 + }, + { + "epoch": 0.3999716332175023, + "grad_norm": 38.74683380126953, + "loss": 4.2421, + "lr": 0.0008728671328671329, + "step": 1410, + "tokens_trained": 0.693029976 + }, + { + "epoch": 0.4005389688674562, + "grad_norm": 12.595442771911621, + "loss": 4.2569, + "lr": 0.0008725874125874126, + "step": 1412, + "tokens_trained": 0.694013304 + }, + { + "epoch": 0.40110630451741014, + "grad_norm": 55.233673095703125, + "loss": 4.3422, + "lr": 0.0008723076923076924, + "step": 1414, + "tokens_trained": 0.694997536 + }, + { + "epoch": 0.401673640167364, + "grad_norm": 24.717113494873047, + "loss": 4.2567, + "lr": 0.000872027972027972, + "step": 1416, + "tokens_trained": 0.695982632 + }, + { + "epoch": 0.40224097581731794, + "grad_norm": 20.552875518798828, + "loss": 4.2464, + "lr": 0.0008717482517482518, + "step": 1418, + "tokens_trained": 0.696966408 + }, + { + "epoch": 0.4028083114672718, + "grad_norm": 25.569900512695312, + "loss": 4.21, + "lr": 0.0008714685314685314, + "step": 1420, + "tokens_trained": 0.697948224 + }, + { + "epoch": 0.40337564711722573, + "grad_norm": 24.538320541381836, + "loss": 4.2605, + "lr": 0.0008711888111888112, + "step": 1422, + "tokens_trained": 0.698934688 + }, + { + "epoch": 0.40394298276717966, + "grad_norm": 9.585651397705078, + "loss": 4.2524, + "lr": 0.0008709090909090909, + "step": 1424, + "tokens_trained": 0.699921976 + }, + { + "epoch": 0.40451031841713353, + "grad_norm": 11.886672973632812, + "loss": 4.1934, + "lr": 0.0008706293706293707, + "step": 1426, + "tokens_trained": 0.70090396 + }, + { + "epoch": 0.40507765406708746, + "grad_norm": 26.162124633789062, + "loss": 4.2412, + "lr": 0.0008703496503496504, + "step": 1428, + "tokens_trained": 0.701888448 + }, + { + "epoch": 0.4056449897170413, + "grad_norm": 5.03931188583374, + "loss": 4.202, + "lr": 0.00087006993006993, + "step": 1430, + "tokens_trained": 0.702864336 + }, + { + "epoch": 0.40621232536699525, + "grad_norm": 33.67579650878906, + "loss": 4.3087, + "lr": 0.0008697902097902099, + "step": 1432, + "tokens_trained": 0.703847784 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 34.38542556762695, + "loss": 4.2807, + "lr": 0.0008695104895104895, + "step": 1434, + "tokens_trained": 0.704827288 + }, + { + "epoch": 0.40734699666690305, + "grad_norm": 13.319886207580566, + "loss": 4.3332, + "lr": 0.0008692307692307693, + "step": 1436, + "tokens_trained": 0.705815392 + }, + { + "epoch": 0.407914332316857, + "grad_norm": 36.58311080932617, + "loss": 4.3318, + "lr": 0.0008689510489510489, + "step": 1438, + "tokens_trained": 0.7067914 + }, + { + "epoch": 0.40848166796681085, + "grad_norm": 29.63648223876953, + "loss": 4.2962, + "lr": 0.0008686713286713287, + "step": 1440, + "tokens_trained": 0.70777396 + }, + { + "epoch": 0.4090490036167648, + "grad_norm": 9.55128002166748, + "loss": 4.2773, + "lr": 0.0008683916083916084, + "step": 1442, + "tokens_trained": 0.708750496 + }, + { + "epoch": 0.4096163392667187, + "grad_norm": 53.83981704711914, + "loss": 4.3875, + "lr": 0.0008681118881118881, + "step": 1444, + "tokens_trained": 0.709730168 + }, + { + "epoch": 0.41018367491667257, + "grad_norm": 54.59236526489258, + "loss": 4.3582, + "lr": 0.0008678321678321679, + "step": 1446, + "tokens_trained": 0.710709704 + }, + { + "epoch": 0.4107510105666265, + "grad_norm": 13.964411735534668, + "loss": 4.3065, + "lr": 0.0008675524475524475, + "step": 1448, + "tokens_trained": 0.711690136 + }, + { + "epoch": 0.41131834621658037, + "grad_norm": 25.506649017333984, + "loss": 4.2686, + "lr": 0.0008672727272727273, + "step": 1450, + "tokens_trained": 0.712668056 + }, + { + "epoch": 0.4118856818665343, + "grad_norm": 21.1628360748291, + "loss": 4.2485, + "lr": 0.000866993006993007, + "step": 1452, + "tokens_trained": 0.71365004 + }, + { + "epoch": 0.4124530175164882, + "grad_norm": 15.751238822937012, + "loss": 4.2078, + "lr": 0.0008667132867132868, + "step": 1454, + "tokens_trained": 0.714632032 + }, + { + "epoch": 0.4130203531664421, + "grad_norm": 15.838552474975586, + "loss": 4.1944, + "lr": 0.0008664335664335664, + "step": 1456, + "tokens_trained": 0.715611376 + }, + { + "epoch": 0.413587688816396, + "grad_norm": 15.968609809875488, + "loss": 4.1768, + "lr": 0.0008661538461538461, + "step": 1458, + "tokens_trained": 0.716591112 + }, + { + "epoch": 0.4141550244663499, + "grad_norm": 15.419891357421875, + "loss": 4.1978, + "lr": 0.0008658741258741259, + "step": 1460, + "tokens_trained": 0.717575952 + }, + { + "epoch": 0.4147223601163038, + "grad_norm": 15.088132858276367, + "loss": 4.2361, + "lr": 0.0008655944055944056, + "step": 1462, + "tokens_trained": 0.718563696 + }, + { + "epoch": 0.41528969576625774, + "grad_norm": 4.839190483093262, + "loss": 4.2089, + "lr": 0.0008653146853146854, + "step": 1464, + "tokens_trained": 0.71954848 + }, + { + "epoch": 0.4158570314162116, + "grad_norm": 22.192466735839844, + "loss": 4.2109, + "lr": 0.000865034965034965, + "step": 1466, + "tokens_trained": 0.720533304 + }, + { + "epoch": 0.41642436706616553, + "grad_norm": 28.983531951904297, + "loss": 4.2402, + "lr": 0.0008647552447552448, + "step": 1468, + "tokens_trained": 0.721518176 + }, + { + "epoch": 0.4169917027161194, + "grad_norm": 21.010780334472656, + "loss": 4.1732, + "lr": 0.0008644755244755245, + "step": 1470, + "tokens_trained": 0.72250176 + }, + { + "epoch": 0.41755903836607333, + "grad_norm": 14.59277057647705, + "loss": 4.1847, + "lr": 0.0008641958041958042, + "step": 1472, + "tokens_trained": 0.723486664 + }, + { + "epoch": 0.41812637401602726, + "grad_norm": 13.688531875610352, + "loss": 4.1577, + "lr": 0.0008639160839160839, + "step": 1474, + "tokens_trained": 0.724469328 + }, + { + "epoch": 0.41869370966598113, + "grad_norm": 15.879347801208496, + "loss": 4.1721, + "lr": 0.0008636363636363636, + "step": 1476, + "tokens_trained": 0.725454968 + }, + { + "epoch": 0.41926104531593505, + "grad_norm": 10.225201606750488, + "loss": 4.1999, + "lr": 0.0008633566433566434, + "step": 1478, + "tokens_trained": 0.7264426 + }, + { + "epoch": 0.4198283809658889, + "grad_norm": 17.007728576660156, + "loss": 4.2229, + "lr": 0.0008630769230769231, + "step": 1480, + "tokens_trained": 0.727422056 + }, + { + "epoch": 0.42039571661584285, + "grad_norm": 13.517934799194336, + "loss": 4.2241, + "lr": 0.0008627972027972029, + "step": 1482, + "tokens_trained": 0.728403688 + }, + { + "epoch": 0.4209630522657968, + "grad_norm": 17.132064819335938, + "loss": 4.1679, + "lr": 0.0008625174825174825, + "step": 1484, + "tokens_trained": 0.729386248 + }, + { + "epoch": 0.42153038791575065, + "grad_norm": 19.782320022583008, + "loss": 4.1817, + "lr": 0.0008622377622377622, + "step": 1486, + "tokens_trained": 0.730368752 + }, + { + "epoch": 0.4220977235657046, + "grad_norm": 3.388552188873291, + "loss": 4.1726, + "lr": 0.000861958041958042, + "step": 1488, + "tokens_trained": 0.731354304 + }, + { + "epoch": 0.42266505921565845, + "grad_norm": 28.33499526977539, + "loss": 4.2623, + "lr": 0.0008616783216783217, + "step": 1490, + "tokens_trained": 0.732337296 + }, + { + "epoch": 0.42323239486561237, + "grad_norm": 24.927406311035156, + "loss": 4.2422, + "lr": 0.0008613986013986014, + "step": 1492, + "tokens_trained": 0.733319824 + }, + { + "epoch": 0.4237997305155663, + "grad_norm": 25.996028900146484, + "loss": 4.2227, + "lr": 0.0008611188811188811, + "step": 1494, + "tokens_trained": 0.73430636 + }, + { + "epoch": 0.42436706616552017, + "grad_norm": 14.625783920288086, + "loss": 4.2268, + "lr": 0.0008608391608391609, + "step": 1496, + "tokens_trained": 0.735285848 + }, + { + "epoch": 0.4249344018154741, + "grad_norm": 12.556640625, + "loss": 4.2352, + "lr": 0.0008605594405594406, + "step": 1498, + "tokens_trained": 0.736270632 + }, + { + "epoch": 0.42550173746542796, + "grad_norm": 18.579416275024414, + "loss": 4.2377, + "lr": 0.0008602797202797203, + "step": 1500, + "tokens_trained": 0.737255104 + }, + { + "epoch": 0.42550173746542796, + "eval_loss": 1.052606463432312, + "eval_runtime": 20.5089, + "step": 1500, + "tokens_trained": 0.737255104 + }, + { + "epoch": 0.4260690731153819, + "grad_norm": 16.550657272338867, + "loss": 4.182, + "lr": 0.00086, + "step": 1502, + "tokens_trained": 0.738240848 + }, + { + "epoch": 0.4266364087653358, + "grad_norm": 24.4381046295166, + "loss": 4.2093, + "lr": 0.0008597202797202797, + "step": 1504, + "tokens_trained": 0.73922592 + }, + { + "epoch": 0.4272037444152897, + "grad_norm": 13.155163764953613, + "loss": 4.239, + "lr": 0.0008594405594405595, + "step": 1506, + "tokens_trained": 0.740208896 + }, + { + "epoch": 0.4277710800652436, + "grad_norm": 27.667949676513672, + "loss": 4.2607, + "lr": 0.0008591608391608392, + "step": 1508, + "tokens_trained": 0.741189312 + }, + { + "epoch": 0.4283384157151975, + "grad_norm": 35.897743225097656, + "loss": 4.2153, + "lr": 0.0008588811188811188, + "step": 1510, + "tokens_trained": 0.742170456 + }, + { + "epoch": 0.4289057513651514, + "grad_norm": 18.16407012939453, + "loss": 4.2753, + "lr": 0.0008586013986013986, + "step": 1512, + "tokens_trained": 0.743152504 + }, + { + "epoch": 0.42947308701510534, + "grad_norm": 27.447364807128906, + "loss": 4.2321, + "lr": 0.0008583216783216783, + "step": 1514, + "tokens_trained": 0.744139768 + }, + { + "epoch": 0.4300404226650592, + "grad_norm": 21.115859985351562, + "loss": 4.2048, + "lr": 0.0008580419580419581, + "step": 1516, + "tokens_trained": 0.745122368 + }, + { + "epoch": 0.43060775831501313, + "grad_norm": 5.949585914611816, + "loss": 4.1787, + "lr": 0.0008577622377622378, + "step": 1518, + "tokens_trained": 0.746104936 + }, + { + "epoch": 0.431175093964967, + "grad_norm": 6.631585121154785, + "loss": 4.2035, + "lr": 0.0008574825174825175, + "step": 1520, + "tokens_trained": 0.747086264 + }, + { + "epoch": 0.43174242961492093, + "grad_norm": 38.91585159301758, + "loss": 4.354, + "lr": 0.0008572027972027972, + "step": 1522, + "tokens_trained": 0.74806844 + }, + { + "epoch": 0.43230976526487486, + "grad_norm": 37.53727722167969, + "loss": 4.228, + "lr": 0.000856923076923077, + "step": 1524, + "tokens_trained": 0.749052432 + }, + { + "epoch": 0.4328771009148287, + "grad_norm": 19.87713623046875, + "loss": 4.2696, + "lr": 0.0008566433566433567, + "step": 1526, + "tokens_trained": 0.750037072 + }, + { + "epoch": 0.43344443656478265, + "grad_norm": 25.615995407104492, + "loss": 4.2676, + "lr": 0.0008563636363636363, + "step": 1528, + "tokens_trained": 0.751020584 + }, + { + "epoch": 0.4340117722147365, + "grad_norm": 16.643299102783203, + "loss": 4.201, + "lr": 0.0008560839160839161, + "step": 1530, + "tokens_trained": 0.75200224 + }, + { + "epoch": 0.43457910786469045, + "grad_norm": 16.207853317260742, + "loss": 4.1944, + "lr": 0.0008558041958041958, + "step": 1532, + "tokens_trained": 0.752981624 + }, + { + "epoch": 0.4351464435146444, + "grad_norm": 27.054973602294922, + "loss": 4.2188, + "lr": 0.0008555244755244756, + "step": 1534, + "tokens_trained": 0.753968464 + }, + { + "epoch": 0.43571377916459825, + "grad_norm": 33.468238830566406, + "loss": 4.2052, + "lr": 0.0008552447552447553, + "step": 1536, + "tokens_trained": 0.754950976 + }, + { + "epoch": 0.4362811148145522, + "grad_norm": 21.083576202392578, + "loss": 4.2514, + "lr": 0.000854965034965035, + "step": 1538, + "tokens_trained": 0.755938272 + }, + { + "epoch": 0.43684845046450604, + "grad_norm": 19.927122116088867, + "loss": 4.2493, + "lr": 0.0008546853146853147, + "step": 1540, + "tokens_trained": 0.756916784 + }, + { + "epoch": 0.43741578611445997, + "grad_norm": 22.105287551879883, + "loss": 4.2264, + "lr": 0.0008544055944055944, + "step": 1542, + "tokens_trained": 0.757901152 + }, + { + "epoch": 0.4379831217644139, + "grad_norm": 22.448705673217773, + "loss": 4.1987, + "lr": 0.0008541258741258742, + "step": 1544, + "tokens_trained": 0.758886048 + }, + { + "epoch": 0.43855045741436777, + "grad_norm": 17.740005493164062, + "loss": 4.1918, + "lr": 0.0008538461538461538, + "step": 1546, + "tokens_trained": 0.759864304 + }, + { + "epoch": 0.4391177930643217, + "grad_norm": 20.58041763305664, + "loss": 4.2144, + "lr": 0.0008535664335664336, + "step": 1548, + "tokens_trained": 0.760844312 + }, + { + "epoch": 0.43968512871427556, + "grad_norm": 21.937252044677734, + "loss": 4.2129, + "lr": 0.0008532867132867133, + "step": 1550, + "tokens_trained": 0.761827256 + }, + { + "epoch": 0.4402524643642295, + "grad_norm": 26.883426666259766, + "loss": 4.2244, + "lr": 0.000853006993006993, + "step": 1552, + "tokens_trained": 0.7628098 + }, + { + "epoch": 0.4408198000141834, + "grad_norm": 10.297266960144043, + "loss": 4.1724, + "lr": 0.0008527272727272728, + "step": 1554, + "tokens_trained": 0.763792488 + }, + { + "epoch": 0.4413871356641373, + "grad_norm": 12.119601249694824, + "loss": 4.1828, + "lr": 0.0008524475524475524, + "step": 1556, + "tokens_trained": 0.764769936 + }, + { + "epoch": 0.4419544713140912, + "grad_norm": 16.565885543823242, + "loss": 4.2113, + "lr": 0.0008521678321678322, + "step": 1558, + "tokens_trained": 0.765752376 + }, + { + "epoch": 0.4425218069640451, + "grad_norm": 18.860309600830078, + "loss": 4.1864, + "lr": 0.0008518881118881119, + "step": 1560, + "tokens_trained": 0.766736256 + }, + { + "epoch": 0.443089142613999, + "grad_norm": 4.049737453460693, + "loss": 4.2108, + "lr": 0.0008516083916083917, + "step": 1562, + "tokens_trained": 0.767720568 + }, + { + "epoch": 0.44365647826395294, + "grad_norm": 15.730945587158203, + "loss": 4.2339, + "lr": 0.0008513286713286713, + "step": 1564, + "tokens_trained": 0.768701288 + }, + { + "epoch": 0.4442238139139068, + "grad_norm": 18.64398956298828, + "loss": 4.2132, + "lr": 0.000851048951048951, + "step": 1566, + "tokens_trained": 0.769681336 + }, + { + "epoch": 0.44479114956386073, + "grad_norm": 22.01759147644043, + "loss": 4.2211, + "lr": 0.0008507692307692308, + "step": 1568, + "tokens_trained": 0.770661168 + }, + { + "epoch": 0.4453584852138146, + "grad_norm": 3.097306489944458, + "loss": 4.2114, + "lr": 0.0008504895104895105, + "step": 1570, + "tokens_trained": 0.7716424 + }, + { + "epoch": 0.44592582086376853, + "grad_norm": 35.901546478271484, + "loss": 4.3, + "lr": 0.0008502097902097903, + "step": 1572, + "tokens_trained": 0.772627536 + }, + { + "epoch": 0.44649315651372246, + "grad_norm": 20.762710571289062, + "loss": 4.2465, + "lr": 0.0008499300699300699, + "step": 1574, + "tokens_trained": 0.77361008 + }, + { + "epoch": 0.4470604921636763, + "grad_norm": 13.54304027557373, + "loss": 4.221, + "lr": 0.0008496503496503497, + "step": 1576, + "tokens_trained": 0.774591184 + }, + { + "epoch": 0.44762782781363025, + "grad_norm": 18.83641242980957, + "loss": 4.2228, + "lr": 0.0008493706293706294, + "step": 1578, + "tokens_trained": 0.775574136 + }, + { + "epoch": 0.4481951634635841, + "grad_norm": 12.294941902160645, + "loss": 4.1768, + "lr": 0.0008490909090909091, + "step": 1580, + "tokens_trained": 0.776554752 + }, + { + "epoch": 0.44876249911353805, + "grad_norm": 5.768923759460449, + "loss": 4.2255, + "lr": 0.0008488111888111888, + "step": 1582, + "tokens_trained": 0.777539368 + }, + { + "epoch": 0.449329834763492, + "grad_norm": 7.9961137771606445, + "loss": 4.2218, + "lr": 0.0008485314685314685, + "step": 1584, + "tokens_trained": 0.778522344 + }, + { + "epoch": 0.44989717041344585, + "grad_norm": 22.005645751953125, + "loss": 4.2452, + "lr": 0.0008482517482517483, + "step": 1586, + "tokens_trained": 0.77950768 + }, + { + "epoch": 0.45046450606339977, + "grad_norm": 27.313426971435547, + "loss": 4.1875, + "lr": 0.000847972027972028, + "step": 1588, + "tokens_trained": 0.780490984 + }, + { + "epoch": 0.45103184171335364, + "grad_norm": 10.344687461853027, + "loss": 4.2356, + "lr": 0.0008476923076923078, + "step": 1590, + "tokens_trained": 0.781469 + }, + { + "epoch": 0.45159917736330757, + "grad_norm": 27.348726272583008, + "loss": 4.2962, + "lr": 0.0008474125874125874, + "step": 1592, + "tokens_trained": 0.782450304 + }, + { + "epoch": 0.4521665130132615, + "grad_norm": 32.965911865234375, + "loss": 4.2736, + "lr": 0.0008471328671328671, + "step": 1594, + "tokens_trained": 0.783431416 + }, + { + "epoch": 0.45273384866321537, + "grad_norm": 7.752636909484863, + "loss": 4.2074, + "lr": 0.0008468531468531469, + "step": 1596, + "tokens_trained": 0.784409568 + }, + { + "epoch": 0.4533011843131693, + "grad_norm": 38.85223388671875, + "loss": 4.3261, + "lr": 0.0008465734265734266, + "step": 1598, + "tokens_trained": 0.785399368 + }, + { + "epoch": 0.45386851996312316, + "grad_norm": 38.017967224121094, + "loss": 4.2646, + "lr": 0.0008462937062937063, + "step": 1600, + "tokens_trained": 0.786376072 + }, + { + "epoch": 0.4544358556130771, + "grad_norm": 7.856576442718506, + "loss": 4.191, + "lr": 0.000846013986013986, + "step": 1602, + "tokens_trained": 0.787362072 + }, + { + "epoch": 0.455003191263031, + "grad_norm": 37.902870178222656, + "loss": 4.2651, + "lr": 0.0008457342657342658, + "step": 1604, + "tokens_trained": 0.788345104 + }, + { + "epoch": 0.4555705269129849, + "grad_norm": 7.724793434143066, + "loss": 4.1994, + "lr": 0.0008454545454545455, + "step": 1606, + "tokens_trained": 0.7893314 + }, + { + "epoch": 0.4561378625629388, + "grad_norm": 26.484699249267578, + "loss": 4.2276, + "lr": 0.0008451748251748252, + "step": 1608, + "tokens_trained": 0.790309344 + }, + { + "epoch": 0.4567051982128927, + "grad_norm": 23.137874603271484, + "loss": 4.2082, + "lr": 0.0008448951048951049, + "step": 1610, + "tokens_trained": 0.791295784 + }, + { + "epoch": 0.4572725338628466, + "grad_norm": 13.902606964111328, + "loss": 4.2035, + "lr": 0.0008446153846153846, + "step": 1612, + "tokens_trained": 0.79228076 + }, + { + "epoch": 0.45783986951280053, + "grad_norm": 8.438498497009277, + "loss": 4.1713, + "lr": 0.0008443356643356644, + "step": 1614, + "tokens_trained": 0.793265456 + }, + { + "epoch": 0.4584072051627544, + "grad_norm": 11.60899829864502, + "loss": 4.1971, + "lr": 0.0008440559440559441, + "step": 1616, + "tokens_trained": 0.794245896 + }, + { + "epoch": 0.45897454081270833, + "grad_norm": 19.33312225341797, + "loss": 4.2328, + "lr": 0.0008437762237762238, + "step": 1618, + "tokens_trained": 0.795229016 + }, + { + "epoch": 0.4595418764626622, + "grad_norm": 16.45014190673828, + "loss": 4.2277, + "lr": 0.0008434965034965035, + "step": 1620, + "tokens_trained": 0.79620792 + }, + { + "epoch": 0.46010921211261613, + "grad_norm": 9.818867683410645, + "loss": 4.1494, + "lr": 0.0008432167832167832, + "step": 1622, + "tokens_trained": 0.797192352 + }, + { + "epoch": 0.46067654776257005, + "grad_norm": 7.920058250427246, + "loss": 4.2027, + "lr": 0.000842937062937063, + "step": 1624, + "tokens_trained": 0.798174104 + }, + { + "epoch": 0.46096021558754696, + "eval_loss": 1.044265627861023, + "eval_runtime": 20.5617, + "step": 1625, + "tokens_trained": 0.798668072 + }, + { + "epoch": 0.4612438834125239, + "grad_norm": 10.734235763549805, + "loss": 4.1505, + "lr": 0.0008426573426573427, + "step": 1626, + "tokens_trained": 0.799160304 + }, + { + "epoch": 0.46181121906247785, + "grad_norm": 23.376392364501953, + "loss": 4.195, + "lr": 0.0008423776223776224, + "step": 1628, + "tokens_trained": 0.800144144 + }, + { + "epoch": 0.4623785547124317, + "grad_norm": 23.567371368408203, + "loss": 4.2367, + "lr": 0.0008420979020979021, + "step": 1630, + "tokens_trained": 0.801131184 + }, + { + "epoch": 0.46294589036238565, + "grad_norm": 19.271820068359375, + "loss": 4.1899, + "lr": 0.0008418181818181819, + "step": 1632, + "tokens_trained": 0.802111296 + }, + { + "epoch": 0.4635132260123396, + "grad_norm": 17.468698501586914, + "loss": 4.1941, + "lr": 0.0008415384615384616, + "step": 1634, + "tokens_trained": 0.803095112 + }, + { + "epoch": 0.46408056166229344, + "grad_norm": 22.298749923706055, + "loss": 4.2083, + "lr": 0.0008412587412587412, + "step": 1636, + "tokens_trained": 0.804080456 + }, + { + "epoch": 0.46464789731224737, + "grad_norm": 12.506179809570312, + "loss": 4.1953, + "lr": 0.000840979020979021, + "step": 1638, + "tokens_trained": 0.805062464 + }, + { + "epoch": 0.46521523296220124, + "grad_norm": 11.819656372070312, + "loss": 4.2047, + "lr": 0.0008406993006993006, + "step": 1640, + "tokens_trained": 0.806045504 + }, + { + "epoch": 0.46578256861215517, + "grad_norm": 15.925740242004395, + "loss": 4.1565, + "lr": 0.0008404195804195805, + "step": 1642, + "tokens_trained": 0.80702736 + }, + { + "epoch": 0.4663499042621091, + "grad_norm": 15.869892120361328, + "loss": 4.2134, + "lr": 0.0008401398601398602, + "step": 1644, + "tokens_trained": 0.808009192 + }, + { + "epoch": 0.46691723991206296, + "grad_norm": 10.851021766662598, + "loss": 4.2041, + "lr": 0.0008398601398601399, + "step": 1646, + "tokens_trained": 0.808994728 + }, + { + "epoch": 0.4674845755620169, + "grad_norm": 8.271230697631836, + "loss": 4.1739, + "lr": 0.0008395804195804196, + "step": 1648, + "tokens_trained": 0.809976448 + }, + { + "epoch": 0.46805191121197076, + "grad_norm": 13.768092155456543, + "loss": 4.1761, + "lr": 0.0008393006993006993, + "step": 1650, + "tokens_trained": 0.810958392 + }, + { + "epoch": 0.4686192468619247, + "grad_norm": 7.760485649108887, + "loss": 4.1826, + "lr": 0.0008390209790209791, + "step": 1652, + "tokens_trained": 0.81194136 + }, + { + "epoch": 0.4691865825118786, + "grad_norm": 13.28488540649414, + "loss": 4.1659, + "lr": 0.0008387412587412587, + "step": 1654, + "tokens_trained": 0.812924984 + }, + { + "epoch": 0.4697539181618325, + "grad_norm": 10.466367721557617, + "loss": 4.1432, + "lr": 0.0008384615384615385, + "step": 1656, + "tokens_trained": 0.813907424 + }, + { + "epoch": 0.4703212538117864, + "grad_norm": 15.40854549407959, + "loss": 4.1625, + "lr": 0.0008381818181818181, + "step": 1658, + "tokens_trained": 0.814888712 + }, + { + "epoch": 0.4708885894617403, + "grad_norm": 20.580612182617188, + "loss": 4.1636, + "lr": 0.000837902097902098, + "step": 1660, + "tokens_trained": 0.815869152 + }, + { + "epoch": 0.4714559251116942, + "grad_norm": 14.908403396606445, + "loss": 4.1763, + "lr": 0.0008376223776223776, + "step": 1662, + "tokens_trained": 0.816852664 + }, + { + "epoch": 0.47202326076164813, + "grad_norm": 10.217529296875, + "loss": 4.1934, + "lr": 0.0008373426573426573, + "step": 1664, + "tokens_trained": 0.817832792 + }, + { + "epoch": 0.472590596411602, + "grad_norm": 15.74150276184082, + "loss": 4.1714, + "lr": 0.0008370629370629371, + "step": 1666, + "tokens_trained": 0.81881728 + }, + { + "epoch": 0.47315793206155593, + "grad_norm": 15.39499282836914, + "loss": 4.2005, + "lr": 0.0008367832167832168, + "step": 1668, + "tokens_trained": 0.819800824 + }, + { + "epoch": 0.4737252677115098, + "grad_norm": 11.585809707641602, + "loss": 4.136, + "lr": 0.0008365034965034966, + "step": 1670, + "tokens_trained": 0.8207856 + }, + { + "epoch": 0.4742926033614637, + "grad_norm": 16.053237915039062, + "loss": 4.1827, + "lr": 0.0008362237762237762, + "step": 1672, + "tokens_trained": 0.821766576 + }, + { + "epoch": 0.47485993901141765, + "grad_norm": 9.23779582977295, + "loss": 4.1159, + "lr": 0.000835944055944056, + "step": 1674, + "tokens_trained": 0.822749696 + }, + { + "epoch": 0.4754272746613715, + "grad_norm": 11.395891189575195, + "loss": 4.17, + "lr": 0.0008356643356643356, + "step": 1676, + "tokens_trained": 0.82373032 + }, + { + "epoch": 0.47599461031132545, + "grad_norm": 17.745365142822266, + "loss": 4.1696, + "lr": 0.0008353846153846154, + "step": 1678, + "tokens_trained": 0.824712192 + }, + { + "epoch": 0.4765619459612793, + "grad_norm": 6.7816572189331055, + "loss": 4.1933, + "lr": 0.0008351048951048951, + "step": 1680, + "tokens_trained": 0.825691208 + }, + { + "epoch": 0.47712928161123325, + "grad_norm": 20.552772521972656, + "loss": 4.1625, + "lr": 0.0008348251748251748, + "step": 1682, + "tokens_trained": 0.826672584 + }, + { + "epoch": 0.4776966172611872, + "grad_norm": 21.632352828979492, + "loss": 4.2061, + "lr": 0.0008345454545454546, + "step": 1684, + "tokens_trained": 0.827654368 + }, + { + "epoch": 0.47826395291114104, + "grad_norm": 17.754596710205078, + "loss": 4.222, + "lr": 0.0008342657342657343, + "step": 1686, + "tokens_trained": 0.828639392 + }, + { + "epoch": 0.47883128856109497, + "grad_norm": 20.73906707763672, + "loss": 4.1679, + "lr": 0.0008339860139860141, + "step": 1688, + "tokens_trained": 0.829627232 + }, + { + "epoch": 0.47939862421104884, + "grad_norm": 28.157238006591797, + "loss": 4.1658, + "lr": 0.0008337062937062937, + "step": 1690, + "tokens_trained": 0.830610904 + }, + { + "epoch": 0.47996595986100277, + "grad_norm": 12.728020668029785, + "loss": 4.1892, + "lr": 0.0008334265734265734, + "step": 1692, + "tokens_trained": 0.831602544 + }, + { + "epoch": 0.4805332955109567, + "grad_norm": 20.21622657775879, + "loss": 4.1453, + "lr": 0.0008331468531468531, + "step": 1694, + "tokens_trained": 0.832584656 + }, + { + "epoch": 0.48110063116091056, + "grad_norm": 18.5329647064209, + "loss": 4.2145, + "lr": 0.0008328671328671329, + "step": 1696, + "tokens_trained": 0.833570472 + }, + { + "epoch": 0.4816679668108645, + "grad_norm": 12.47617244720459, + "loss": 4.1944, + "lr": 0.0008325874125874126, + "step": 1698, + "tokens_trained": 0.834556104 + }, + { + "epoch": 0.48223530246081836, + "grad_norm": 21.34851837158203, + "loss": 4.1754, + "lr": 0.0008323076923076923, + "step": 1700, + "tokens_trained": 0.835540592 + }, + { + "epoch": 0.4828026381107723, + "grad_norm": 13.20995807647705, + "loss": 4.1657, + "lr": 0.000832027972027972, + "step": 1702, + "tokens_trained": 0.836525136 + }, + { + "epoch": 0.4833699737607262, + "grad_norm": 16.77725601196289, + "loss": 4.1905, + "lr": 0.0008317482517482518, + "step": 1704, + "tokens_trained": 0.837509224 + }, + { + "epoch": 0.4839373094106801, + "grad_norm": 15.17611312866211, + "loss": 4.1823, + "lr": 0.0008314685314685315, + "step": 1706, + "tokens_trained": 0.838492472 + }, + { + "epoch": 0.484504645060634, + "grad_norm": 13.06942081451416, + "loss": 4.1732, + "lr": 0.0008311888111888112, + "step": 1708, + "tokens_trained": 0.839471696 + }, + { + "epoch": 0.4850719807105879, + "grad_norm": 10.456578254699707, + "loss": 4.1862, + "lr": 0.0008309090909090909, + "step": 1710, + "tokens_trained": 0.840452808 + }, + { + "epoch": 0.4856393163605418, + "grad_norm": 13.80197525024414, + "loss": 4.1663, + "lr": 0.0008306293706293706, + "step": 1712, + "tokens_trained": 0.841434224 + }, + { + "epoch": 0.48620665201049573, + "grad_norm": 20.076507568359375, + "loss": 4.1436, + "lr": 0.0008303496503496504, + "step": 1714, + "tokens_trained": 0.842415304 + }, + { + "epoch": 0.4867739876604496, + "grad_norm": 5.629086971282959, + "loss": 4.149, + "lr": 0.00083006993006993, + "step": 1716, + "tokens_trained": 0.84339416 + }, + { + "epoch": 0.48734132331040353, + "grad_norm": 13.932148933410645, + "loss": 4.1785, + "lr": 0.0008297902097902098, + "step": 1718, + "tokens_trained": 0.844380472 + }, + { + "epoch": 0.4879086589603574, + "grad_norm": 18.951047897338867, + "loss": 4.216, + "lr": 0.0008295104895104895, + "step": 1720, + "tokens_trained": 0.845366896 + }, + { + "epoch": 0.4884759946103113, + "grad_norm": 21.042476654052734, + "loss": 4.1634, + "lr": 0.0008292307692307693, + "step": 1722, + "tokens_trained": 0.846344792 + }, + { + "epoch": 0.48904333026026525, + "grad_norm": 23.94416618347168, + "loss": 4.1613, + "lr": 0.000828951048951049, + "step": 1724, + "tokens_trained": 0.847323608 + }, + { + "epoch": 0.4896106659102191, + "grad_norm": 5.057071208953857, + "loss": 4.1729, + "lr": 0.0008286713286713287, + "step": 1726, + "tokens_trained": 0.848304856 + }, + { + "epoch": 0.49017800156017305, + "grad_norm": 18.068674087524414, + "loss": 4.2194, + "lr": 0.0008283916083916084, + "step": 1728, + "tokens_trained": 0.849287712 + }, + { + "epoch": 0.4907453372101269, + "grad_norm": 11.621233940124512, + "loss": 4.2232, + "lr": 0.000828111888111888, + "step": 1730, + "tokens_trained": 0.850268968 + }, + { + "epoch": 0.49131267286008085, + "grad_norm": 12.939676284790039, + "loss": 4.2003, + "lr": 0.0008278321678321679, + "step": 1732, + "tokens_trained": 0.851256528 + }, + { + "epoch": 0.49188000851003477, + "grad_norm": 10.638157844543457, + "loss": 4.1975, + "lr": 0.0008275524475524475, + "step": 1734, + "tokens_trained": 0.852240824 + }, + { + "epoch": 0.49244734415998864, + "grad_norm": 6.2671003341674805, + "loss": 4.1617, + "lr": 0.0008272727272727273, + "step": 1736, + "tokens_trained": 0.853224768 + }, + { + "epoch": 0.49301467980994257, + "grad_norm": 12.318375587463379, + "loss": 4.1939, + "lr": 0.000826993006993007, + "step": 1738, + "tokens_trained": 0.8542062 + }, + { + "epoch": 0.49358201545989644, + "grad_norm": 17.275348663330078, + "loss": 4.1911, + "lr": 0.0008267132867132868, + "step": 1740, + "tokens_trained": 0.855192024 + }, + { + "epoch": 0.49414935110985037, + "grad_norm": 11.122747421264648, + "loss": 4.17, + "lr": 0.0008264335664335665, + "step": 1742, + "tokens_trained": 0.856172136 + }, + { + "epoch": 0.4947166867598043, + "grad_norm": 6.223485469818115, + "loss": 4.1774, + "lr": 0.0008261538461538461, + "step": 1744, + "tokens_trained": 0.857156312 + }, + { + "epoch": 0.49528402240975816, + "grad_norm": 14.62152099609375, + "loss": 4.1607, + "lr": 0.0008258741258741259, + "step": 1746, + "tokens_trained": 0.858140152 + }, + { + "epoch": 0.4958513580597121, + "grad_norm": 15.991989135742188, + "loss": 4.1825, + "lr": 0.0008255944055944055, + "step": 1748, + "tokens_trained": 0.85912524 + }, + { + "epoch": 0.49641869370966596, + "grad_norm": 28.88335418701172, + "loss": 4.2244, + "lr": 0.0008253146853146854, + "step": 1750, + "tokens_trained": 0.860105784 + }, + { + "epoch": 0.49641869370966596, + "eval_loss": 1.061833143234253, + "eval_runtime": 20.4841, + "step": 1750, + "tokens_trained": 0.860105784 + }, + { + "epoch": 0.4969860293596199, + "grad_norm": 14.708030700683594, + "loss": 4.2036, + "lr": 0.000825034965034965, + "step": 1752, + "tokens_trained": 0.861089272 + }, + { + "epoch": 0.4975533650095738, + "grad_norm": 24.67535400390625, + "loss": 4.2405, + "lr": 0.0008247552447552448, + "step": 1754, + "tokens_trained": 0.862066656 + }, + { + "epoch": 0.4981207006595277, + "grad_norm": 10.923722267150879, + "loss": 4.1713, + "lr": 0.0008244755244755245, + "step": 1756, + "tokens_trained": 0.863049256 + }, + { + "epoch": 0.4986880363094816, + "grad_norm": 8.88796615600586, + "loss": 4.1834, + "lr": 0.0008241958041958042, + "step": 1758, + "tokens_trained": 0.864029352 + }, + { + "epoch": 0.4992553719594355, + "grad_norm": 34.90485382080078, + "loss": 4.2338, + "lr": 0.000823916083916084, + "step": 1760, + "tokens_trained": 0.865013008 + }, + { + "epoch": 0.4998227076093894, + "grad_norm": 36.34440612792969, + "loss": 4.2012, + "lr": 0.0008236363636363636, + "step": 1762, + "tokens_trained": 0.86599204 + }, + { + "epoch": 0.5003900432593433, + "grad_norm": 27.913984298706055, + "loss": 4.269, + "lr": 0.0008233566433566434, + "step": 1764, + "tokens_trained": 0.866975456 + }, + { + "epoch": 0.5009573789092973, + "grad_norm": 28.236122131347656, + "loss": 4.2413, + "lr": 0.000823076923076923, + "step": 1766, + "tokens_trained": 0.867963912 + }, + { + "epoch": 0.5015247145592511, + "grad_norm": 18.181337356567383, + "loss": 4.2088, + "lr": 0.0008227972027972029, + "step": 1768, + "tokens_trained": 0.86894656 + }, + { + "epoch": 0.502092050209205, + "grad_norm": 17.403850555419922, + "loss": 4.1854, + "lr": 0.0008225174825174825, + "step": 1770, + "tokens_trained": 0.869932592 + }, + { + "epoch": 0.5026593858591589, + "grad_norm": 15.002805709838867, + "loss": 4.1897, + "lr": 0.0008222377622377622, + "step": 1772, + "tokens_trained": 0.87091592 + }, + { + "epoch": 0.5032267215091129, + "grad_norm": 6.787586688995361, + "loss": 4.1625, + "lr": 0.000821958041958042, + "step": 1774, + "tokens_trained": 0.871899144 + }, + { + "epoch": 0.5037940571590668, + "grad_norm": 6.255197525024414, + "loss": 4.1682, + "lr": 0.0008216783216783217, + "step": 1776, + "tokens_trained": 0.872874824 + }, + { + "epoch": 0.5043613928090206, + "grad_norm": 25.828433990478516, + "loss": 4.2354, + "lr": 0.0008213986013986015, + "step": 1778, + "tokens_trained": 0.873858424 + }, + { + "epoch": 0.5049287284589745, + "grad_norm": 20.261323928833008, + "loss": 4.2373, + "lr": 0.0008211188811188811, + "step": 1780, + "tokens_trained": 0.87483884 + }, + { + "epoch": 0.5054960641089284, + "grad_norm": 9.670608520507812, + "loss": 4.191, + "lr": 0.0008208391608391609, + "step": 1782, + "tokens_trained": 0.875820792 + }, + { + "epoch": 0.5060633997588824, + "grad_norm": 23.33945655822754, + "loss": 4.2319, + "lr": 0.0008205594405594405, + "step": 1784, + "tokens_trained": 0.876804368 + }, + { + "epoch": 0.5066307354088363, + "grad_norm": 32.22544479370117, + "loss": 4.1799, + "lr": 0.0008202797202797203, + "step": 1786, + "tokens_trained": 0.877784816 + }, + { + "epoch": 0.5071980710587901, + "grad_norm": 21.048891067504883, + "loss": 4.2635, + "lr": 0.00082, + "step": 1788, + "tokens_trained": 0.878768256 + }, + { + "epoch": 0.507765406708744, + "grad_norm": 28.73198699951172, + "loss": 4.2436, + "lr": 0.0008197202797202797, + "step": 1790, + "tokens_trained": 0.879751288 + }, + { + "epoch": 0.508332742358698, + "grad_norm": 27.627851486206055, + "loss": 4.2118, + "lr": 0.0008194405594405595, + "step": 1792, + "tokens_trained": 0.880732072 + }, + { + "epoch": 0.5089000780086519, + "grad_norm": 21.16539192199707, + "loss": 4.2123, + "lr": 0.0008191608391608392, + "step": 1794, + "tokens_trained": 0.88171332 + }, + { + "epoch": 0.5094674136586058, + "grad_norm": 11.402868270874023, + "loss": 4.1524, + "lr": 0.000818881118881119, + "step": 1796, + "tokens_trained": 0.882695464 + }, + { + "epoch": 0.5100347493085596, + "grad_norm": 11.958270072937012, + "loss": 4.2091, + "lr": 0.0008186013986013986, + "step": 1798, + "tokens_trained": 0.883678736 + }, + { + "epoch": 0.5106020849585136, + "grad_norm": 15.902670860290527, + "loss": 4.1687, + "lr": 0.0008183216783216783, + "step": 1800, + "tokens_trained": 0.8846604 + }, + { + "epoch": 0.5111694206084675, + "grad_norm": 19.732566833496094, + "loss": 4.1302, + "lr": 0.000818041958041958, + "step": 1802, + "tokens_trained": 0.885641384 + }, + { + "epoch": 0.5117367562584214, + "grad_norm": 15.119332313537598, + "loss": 4.1546, + "lr": 0.0008177622377622378, + "step": 1804, + "tokens_trained": 0.8866262 + }, + { + "epoch": 0.5123040919083753, + "grad_norm": 9.641027450561523, + "loss": 4.1748, + "lr": 0.0008174825174825175, + "step": 1806, + "tokens_trained": 0.887604504 + }, + { + "epoch": 0.5128714275583292, + "grad_norm": 11.642073631286621, + "loss": 4.1879, + "lr": 0.0008172027972027972, + "step": 1808, + "tokens_trained": 0.888584152 + }, + { + "epoch": 0.5134387632082831, + "grad_norm": 12.05164909362793, + "loss": 4.1332, + "lr": 0.000816923076923077, + "step": 1810, + "tokens_trained": 0.889568448 + }, + { + "epoch": 0.514006098858237, + "grad_norm": 13.54423999786377, + "loss": 4.1398, + "lr": 0.0008166433566433567, + "step": 1812, + "tokens_trained": 0.890550896 + }, + { + "epoch": 0.5145734345081909, + "grad_norm": 21.94988441467285, + "loss": 4.1523, + "lr": 0.0008163636363636364, + "step": 1814, + "tokens_trained": 0.89153436 + }, + { + "epoch": 0.5151407701581449, + "grad_norm": 8.613338470458984, + "loss": 4.1428, + "lr": 0.0008160839160839161, + "step": 1816, + "tokens_trained": 0.89251064 + }, + { + "epoch": 0.5157081058080987, + "grad_norm": 27.448917388916016, + "loss": 4.2014, + "lr": 0.0008158041958041958, + "step": 1818, + "tokens_trained": 0.893493904 + }, + { + "epoch": 0.5162754414580526, + "grad_norm": 16.226577758789062, + "loss": 4.1787, + "lr": 0.0008155244755244755, + "step": 1820, + "tokens_trained": 0.894476344 + }, + { + "epoch": 0.5168427771080065, + "grad_norm": 16.967891693115234, + "loss": 4.1898, + "lr": 0.0008152447552447553, + "step": 1822, + "tokens_trained": 0.895460064 + }, + { + "epoch": 0.5174101127579604, + "grad_norm": 13.723483085632324, + "loss": 4.2058, + "lr": 0.000814965034965035, + "step": 1824, + "tokens_trained": 0.896443272 + }, + { + "epoch": 0.5179774484079144, + "grad_norm": 16.789636611938477, + "loss": 4.1669, + "lr": 0.0008146853146853147, + "step": 1826, + "tokens_trained": 0.897426712 + }, + { + "epoch": 0.5185447840578682, + "grad_norm": 11.26768684387207, + "loss": 4.1401, + "lr": 0.0008144055944055944, + "step": 1828, + "tokens_trained": 0.89840672 + }, + { + "epoch": 0.5191121197078221, + "grad_norm": 9.25829029083252, + "loss": 4.1581, + "lr": 0.0008141258741258742, + "step": 1830, + "tokens_trained": 0.89939132 + }, + { + "epoch": 0.519679455357776, + "grad_norm": 12.006930351257324, + "loss": 4.1768, + "lr": 0.0008138461538461539, + "step": 1832, + "tokens_trained": 0.900373704 + }, + { + "epoch": 0.52024679100773, + "grad_norm": 18.766008377075195, + "loss": 4.1419, + "lr": 0.0008135664335664336, + "step": 1834, + "tokens_trained": 0.901356176 + }, + { + "epoch": 0.5208141266576839, + "grad_norm": 17.483421325683594, + "loss": 4.1382, + "lr": 0.0008132867132867133, + "step": 1836, + "tokens_trained": 0.902344088 + }, + { + "epoch": 0.5213814623076377, + "grad_norm": 10.484652519226074, + "loss": 4.1571, + "lr": 0.000813006993006993, + "step": 1838, + "tokens_trained": 0.903328896 + }, + { + "epoch": 0.5219487979575916, + "grad_norm": 13.653974533081055, + "loss": 4.1638, + "lr": 0.0008127272727272728, + "step": 1840, + "tokens_trained": 0.904309368 + }, + { + "epoch": 0.5225161336075456, + "grad_norm": 12.48718547821045, + "loss": 4.1226, + "lr": 0.0008124475524475524, + "step": 1842, + "tokens_trained": 0.905293112 + }, + { + "epoch": 0.5230834692574995, + "grad_norm": 8.086355209350586, + "loss": 4.1303, + "lr": 0.0008121678321678322, + "step": 1844, + "tokens_trained": 0.906275632 + }, + { + "epoch": 0.5236508049074534, + "grad_norm": 10.940073013305664, + "loss": 4.1634, + "lr": 0.0008118881118881119, + "step": 1846, + "tokens_trained": 0.907255808 + }, + { + "epoch": 0.5242181405574072, + "grad_norm": 13.844099044799805, + "loss": 4.1505, + "lr": 0.0008116083916083917, + "step": 1848, + "tokens_trained": 0.908238664 + }, + { + "epoch": 0.5247854762073612, + "grad_norm": 6.305738925933838, + "loss": 4.1463, + "lr": 0.0008113286713286714, + "step": 1850, + "tokens_trained": 0.909221424 + }, + { + "epoch": 0.5253528118573151, + "grad_norm": 8.957951545715332, + "loss": 4.1785, + "lr": 0.000811048951048951, + "step": 1852, + "tokens_trained": 0.910204472 + }, + { + "epoch": 0.525920147507269, + "grad_norm": 12.665373802185059, + "loss": 4.1776, + "lr": 0.0008107692307692308, + "step": 1854, + "tokens_trained": 0.911186456 + }, + { + "epoch": 0.5264874831572229, + "grad_norm": 13.7921781539917, + "loss": 4.2058, + "lr": 0.0008104895104895104, + "step": 1856, + "tokens_trained": 0.912163912 + }, + { + "epoch": 0.5270548188071768, + "grad_norm": 18.400495529174805, + "loss": 4.1378, + "lr": 0.0008102097902097903, + "step": 1858, + "tokens_trained": 0.913143416 + }, + { + "epoch": 0.5276221544571307, + "grad_norm": 10.095234870910645, + "loss": 4.1673, + "lr": 0.0008099300699300699, + "step": 1860, + "tokens_trained": 0.914125056 + }, + { + "epoch": 0.5281894901070846, + "grad_norm": 9.396644592285156, + "loss": 4.1226, + "lr": 0.0008096503496503497, + "step": 1862, + "tokens_trained": 0.915109128 + }, + { + "epoch": 0.5287568257570385, + "grad_norm": 12.686080932617188, + "loss": 4.1356, + "lr": 0.0008093706293706294, + "step": 1864, + "tokens_trained": 0.916092096 + }, + { + "epoch": 0.5293241614069925, + "grad_norm": 15.91020679473877, + "loss": 4.1276, + "lr": 0.0008090909090909092, + "step": 1866, + "tokens_trained": 0.917077264 + }, + { + "epoch": 0.5298914970569463, + "grad_norm": 21.305110931396484, + "loss": 4.1492, + "lr": 0.0008088111888111889, + "step": 1868, + "tokens_trained": 0.918060288 + }, + { + "epoch": 0.5304588327069002, + "grad_norm": 9.242319107055664, + "loss": 4.1457, + "lr": 0.0008085314685314685, + "step": 1870, + "tokens_trained": 0.91904616 + }, + { + "epoch": 0.5310261683568541, + "grad_norm": 17.556922912597656, + "loss": 4.1698, + "lr": 0.0008082517482517483, + "step": 1872, + "tokens_trained": 0.920028192 + }, + { + "epoch": 0.531593504006808, + "grad_norm": 24.155885696411133, + "loss": 4.193, + "lr": 0.0008079720279720279, + "step": 1874, + "tokens_trained": 0.921010456 + }, + { + "epoch": 0.531877171831785, + "eval_loss": 1.0404243469238281, + "eval_runtime": 21.451, + "step": 1875, + "tokens_trained": 0.921502192 + }, + { + "epoch": 0.532160839656762, + "grad_norm": 4.985994338989258, + "loss": 4.1649, + "lr": 0.0008076923076923078, + "step": 1876, + "tokens_trained": 0.921994216 + }, + { + "epoch": 0.5327281753067158, + "grad_norm": 19.2642765045166, + "loss": 4.1883, + "lr": 0.0008074125874125874, + "step": 1878, + "tokens_trained": 0.922978112 + }, + { + "epoch": 0.5332955109566697, + "grad_norm": 15.012572288513184, + "loss": 4.1944, + "lr": 0.0008071328671328671, + "step": 1880, + "tokens_trained": 0.923962952 + }, + { + "epoch": 0.5338628466066236, + "grad_norm": 21.37204360961914, + "loss": 4.1708, + "lr": 0.0008068531468531469, + "step": 1882, + "tokens_trained": 0.92494744 + }, + { + "epoch": 0.5344301822565776, + "grad_norm": 6.402398586273193, + "loss": 4.1921, + "lr": 0.0008065734265734265, + "step": 1884, + "tokens_trained": 0.925927984 + }, + { + "epoch": 0.5349975179065315, + "grad_norm": 27.606822967529297, + "loss": 4.2033, + "lr": 0.0008062937062937064, + "step": 1886, + "tokens_trained": 0.926911352 + }, + { + "epoch": 0.5355648535564853, + "grad_norm": 16.434572219848633, + "loss": 4.1504, + "lr": 0.000806013986013986, + "step": 1888, + "tokens_trained": 0.927894056 + }, + { + "epoch": 0.5361321892064392, + "grad_norm": 8.066178321838379, + "loss": 4.1674, + "lr": 0.0008057342657342658, + "step": 1890, + "tokens_trained": 0.928879504 + }, + { + "epoch": 0.5366995248563932, + "grad_norm": 6.167456150054932, + "loss": 4.1207, + "lr": 0.0008054545454545454, + "step": 1892, + "tokens_trained": 0.92986424 + }, + { + "epoch": 0.5372668605063471, + "grad_norm": 3.584982395172119, + "loss": 4.1051, + "lr": 0.0008051748251748253, + "step": 1894, + "tokens_trained": 0.930846696 + }, + { + "epoch": 0.537834196156301, + "grad_norm": 14.988295555114746, + "loss": 4.1199, + "lr": 0.0008048951048951049, + "step": 1896, + "tokens_trained": 0.931831112 + }, + { + "epoch": 0.5384015318062548, + "grad_norm": 12.735363960266113, + "loss": 4.1368, + "lr": 0.0008046153846153846, + "step": 1898, + "tokens_trained": 0.932816952 + }, + { + "epoch": 0.5389688674562088, + "grad_norm": 7.701294422149658, + "loss": 4.1205, + "lr": 0.0008043356643356644, + "step": 1900, + "tokens_trained": 0.93380264 + }, + { + "epoch": 0.5395362031061627, + "grad_norm": 9.15809440612793, + "loss": 4.1567, + "lr": 0.000804055944055944, + "step": 1902, + "tokens_trained": 0.934785848 + }, + { + "epoch": 0.5401035387561166, + "grad_norm": 10.8292875289917, + "loss": 4.1645, + "lr": 0.0008037762237762239, + "step": 1904, + "tokens_trained": 0.935766912 + }, + { + "epoch": 0.5406708744060705, + "grad_norm": 10.906803131103516, + "loss": 4.1398, + "lr": 0.0008034965034965035, + "step": 1906, + "tokens_trained": 0.936749352 + }, + { + "epoch": 0.5412382100560243, + "grad_norm": 10.140864372253418, + "loss": 4.1754, + "lr": 0.0008032167832167832, + "step": 1908, + "tokens_trained": 0.9377304 + }, + { + "epoch": 0.5418055457059783, + "grad_norm": 10.061383247375488, + "loss": 4.1485, + "lr": 0.0008029370629370629, + "step": 1910, + "tokens_trained": 0.938712336 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 8.252259254455566, + "loss": 4.1502, + "lr": 0.0008026573426573427, + "step": 1912, + "tokens_trained": 0.939693304 + }, + { + "epoch": 0.5429402170058861, + "grad_norm": 15.104400634765625, + "loss": 4.182, + "lr": 0.0008023776223776224, + "step": 1914, + "tokens_trained": 0.940679832 + }, + { + "epoch": 0.54350755265584, + "grad_norm": 21.167285919189453, + "loss": 4.1241, + "lr": 0.0008020979020979021, + "step": 1916, + "tokens_trained": 0.941665088 + }, + { + "epoch": 0.5440748883057939, + "grad_norm": 17.936481475830078, + "loss": 4.1846, + "lr": 0.0008018181818181818, + "step": 1918, + "tokens_trained": 0.942651632 + }, + { + "epoch": 0.5446422239557478, + "grad_norm": 9.773019790649414, + "loss": 4.1164, + "lr": 0.0008015384615384615, + "step": 1920, + "tokens_trained": 0.943635928 + }, + { + "epoch": 0.5452095596057017, + "grad_norm": 14.120475769042969, + "loss": 4.1556, + "lr": 0.0008012587412587414, + "step": 1922, + "tokens_trained": 0.944618336 + }, + { + "epoch": 0.5457768952556556, + "grad_norm": 10.898097038269043, + "loss": 4.1521, + "lr": 0.000800979020979021, + "step": 1924, + "tokens_trained": 0.945608216 + }, + { + "epoch": 0.5463442309056096, + "grad_norm": 8.271462440490723, + "loss": 4.0785, + "lr": 0.0008006993006993007, + "step": 1926, + "tokens_trained": 0.946593504 + }, + { + "epoch": 0.5469115665555634, + "grad_norm": 17.28820037841797, + "loss": 4.0998, + "lr": 0.0008004195804195804, + "step": 1928, + "tokens_trained": 0.947575288 + }, + { + "epoch": 0.5474789022055173, + "grad_norm": 17.754959106445312, + "loss": 4.1652, + "lr": 0.0008001398601398602, + "step": 1930, + "tokens_trained": 0.948562968 + }, + { + "epoch": 0.5480462378554712, + "grad_norm": 10.576292037963867, + "loss": 4.1754, + "lr": 0.0007998601398601399, + "step": 1932, + "tokens_trained": 0.949545728 + }, + { + "epoch": 0.5486135735054252, + "grad_norm": 14.297791481018066, + "loss": 4.1597, + "lr": 0.0007995804195804196, + "step": 1934, + "tokens_trained": 0.950528952 + }, + { + "epoch": 0.5491809091553791, + "grad_norm": 23.882539749145508, + "loss": 4.1366, + "lr": 0.0007993006993006992, + "step": 1936, + "tokens_trained": 0.951513448 + }, + { + "epoch": 0.5497482448053329, + "grad_norm": 5.12502908706665, + "loss": 4.1441, + "lr": 0.000799020979020979, + "step": 1938, + "tokens_trained": 0.952497048 + }, + { + "epoch": 0.5503155804552868, + "grad_norm": 26.879070281982422, + "loss": 4.2595, + "lr": 0.0007987412587412588, + "step": 1940, + "tokens_trained": 0.953475816 + }, + { + "epoch": 0.5508829161052408, + "grad_norm": 23.032690048217773, + "loss": 4.1841, + "lr": 0.0007984615384615385, + "step": 1942, + "tokens_trained": 0.954459984 + }, + { + "epoch": 0.5514502517551947, + "grad_norm": 8.810720443725586, + "loss": 4.1329, + "lr": 0.0007981818181818182, + "step": 1944, + "tokens_trained": 0.95544252 + }, + { + "epoch": 0.5520175874051486, + "grad_norm": 31.051185607910156, + "loss": 4.2278, + "lr": 0.0007979020979020979, + "step": 1946, + "tokens_trained": 0.956428016 + }, + { + "epoch": 0.5525849230551024, + "grad_norm": 22.537412643432617, + "loss": 4.1729, + "lr": 0.0007976223776223777, + "step": 1948, + "tokens_trained": 0.957406024 + }, + { + "epoch": 0.5531522587050564, + "grad_norm": 10.596793174743652, + "loss": 4.1636, + "lr": 0.0007973426573426573, + "step": 1950, + "tokens_trained": 0.958391232 + }, + { + "epoch": 0.5537195943550103, + "grad_norm": 16.45500373840332, + "loss": 4.1591, + "lr": 0.0007970629370629371, + "step": 1952, + "tokens_trained": 0.959378448 + }, + { + "epoch": 0.5542869300049642, + "grad_norm": 15.090359687805176, + "loss": 4.1516, + "lr": 0.0007967832167832167, + "step": 1954, + "tokens_trained": 0.960363384 + }, + { + "epoch": 0.5548542656549181, + "grad_norm": 28.482192993164062, + "loss": 4.1211, + "lr": 0.0007965034965034965, + "step": 1956, + "tokens_trained": 0.961348752 + }, + { + "epoch": 0.555421601304872, + "grad_norm": 9.402368545532227, + "loss": 4.178, + "lr": 0.0007962237762237763, + "step": 1958, + "tokens_trained": 0.962332976 + }, + { + "epoch": 0.5559889369548259, + "grad_norm": 33.001346588134766, + "loss": 4.218, + "lr": 0.000795944055944056, + "step": 1960, + "tokens_trained": 0.963316928 + }, + { + "epoch": 0.5565562726047798, + "grad_norm": 29.695520401000977, + "loss": 4.2071, + "lr": 0.0007956643356643357, + "step": 1962, + "tokens_trained": 0.964301728 + }, + { + "epoch": 0.5571236082547337, + "grad_norm": 22.22412109375, + "loss": 4.2158, + "lr": 0.0007953846153846153, + "step": 1964, + "tokens_trained": 0.96528524 + }, + { + "epoch": 0.5576909439046877, + "grad_norm": 15.590829849243164, + "loss": 4.1681, + "lr": 0.0007951048951048952, + "step": 1966, + "tokens_trained": 0.966268264 + }, + { + "epoch": 0.5582582795546415, + "grad_norm": 16.011110305786133, + "loss": 4.1591, + "lr": 0.0007948251748251748, + "step": 1968, + "tokens_trained": 0.967252016 + }, + { + "epoch": 0.5588256152045954, + "grad_norm": 15.24573040008545, + "loss": 4.1446, + "lr": 0.0007945454545454546, + "step": 1970, + "tokens_trained": 0.96823396 + }, + { + "epoch": 0.5593929508545493, + "grad_norm": 15.718021392822266, + "loss": 4.1846, + "lr": 0.0007942657342657342, + "step": 1972, + "tokens_trained": 0.969217792 + }, + { + "epoch": 0.5599602865045032, + "grad_norm": 8.648459434509277, + "loss": 4.1655, + "lr": 0.000793986013986014, + "step": 1974, + "tokens_trained": 0.970200776 + }, + { + "epoch": 0.5605276221544572, + "grad_norm": 7.273077487945557, + "loss": 4.1397, + "lr": 0.0007937062937062938, + "step": 1976, + "tokens_trained": 0.971181376 + }, + { + "epoch": 0.561094957804411, + "grad_norm": 25.027616500854492, + "loss": 4.1918, + "lr": 0.0007934265734265734, + "step": 1978, + "tokens_trained": 0.972165496 + }, + { + "epoch": 0.5616622934543649, + "grad_norm": 25.485851287841797, + "loss": 4.1896, + "lr": 0.0007931468531468532, + "step": 1980, + "tokens_trained": 0.973145616 + }, + { + "epoch": 0.5622296291043188, + "grad_norm": 18.065462112426758, + "loss": 4.1876, + "lr": 0.0007928671328671328, + "step": 1982, + "tokens_trained": 0.974131104 + }, + { + "epoch": 0.5627969647542728, + "grad_norm": 20.412248611450195, + "loss": 4.1556, + "lr": 0.0007925874125874127, + "step": 1984, + "tokens_trained": 0.975111232 + }, + { + "epoch": 0.5633643004042267, + "grad_norm": 15.51710319519043, + "loss": 4.1391, + "lr": 0.0007923076923076923, + "step": 1986, + "tokens_trained": 0.976098968 + }, + { + "epoch": 0.5639316360541805, + "grad_norm": 8.650726318359375, + "loss": 4.1421, + "lr": 0.000792027972027972, + "step": 1988, + "tokens_trained": 0.977082992 + }, + { + "epoch": 0.5644989717041344, + "grad_norm": 19.833505630493164, + "loss": 4.1505, + "lr": 0.0007917482517482517, + "step": 1990, + "tokens_trained": 0.978068896 + }, + { + "epoch": 0.5650663073540884, + "grad_norm": 26.585390090942383, + "loss": 4.1661, + "lr": 0.0007914685314685314, + "step": 1992, + "tokens_trained": 0.979048504 + }, + { + "epoch": 0.5656336430040423, + "grad_norm": 20.827394485473633, + "loss": 4.1987, + "lr": 0.0007911888111888113, + "step": 1994, + "tokens_trained": 0.98003104 + }, + { + "epoch": 0.5662009786539962, + "grad_norm": 23.700273513793945, + "loss": 4.1773, + "lr": 0.0007909090909090909, + "step": 1996, + "tokens_trained": 0.981013384 + }, + { + "epoch": 0.56676831430395, + "grad_norm": 15.673397064208984, + "loss": 4.12, + "lr": 0.0007906293706293707, + "step": 1998, + "tokens_trained": 0.981999776 + }, + { + "epoch": 0.567335649953904, + "grad_norm": 11.268630981445312, + "loss": 4.1373, + "lr": 0.0007903496503496503, + "step": 2000, + "tokens_trained": 0.982980936 + }, + { + "epoch": 0.567335649953904, + "eval_loss": 1.0422048568725586, + "eval_runtime": 20.3928, + "step": 2000, + "tokens_trained": 0.982980936 + }, + { + "epoch": 0.5679029856038579, + "grad_norm": 18.37994384765625, + "loss": 4.1536, + "lr": 0.0007900699300699302, + "step": 2002, + "tokens_trained": 0.983969536 + }, + { + "epoch": 0.5684703212538118, + "grad_norm": 23.911537170410156, + "loss": 4.1652, + "lr": 0.0007897902097902098, + "step": 2004, + "tokens_trained": 0.98495052 + }, + { + "epoch": 0.5690376569037657, + "grad_norm": 7.355772018432617, + "loss": 4.1846, + "lr": 0.0007895104895104895, + "step": 2006, + "tokens_trained": 0.98593252 + }, + { + "epoch": 0.5696049925537195, + "grad_norm": 35.29991149902344, + "loss": 4.2145, + "lr": 0.0007892307692307692, + "step": 2008, + "tokens_trained": 0.986922392 + }, + { + "epoch": 0.5701723282036735, + "grad_norm": 14.28709602355957, + "loss": 4.1629, + "lr": 0.0007889510489510489, + "step": 2010, + "tokens_trained": 0.987905712 + }, + { + "epoch": 0.5707396638536274, + "grad_norm": 22.50174331665039, + "loss": 4.1907, + "lr": 0.0007886713286713288, + "step": 2012, + "tokens_trained": 0.988887536 + }, + { + "epoch": 0.5713069995035813, + "grad_norm": 14.588640213012695, + "loss": 4.1523, + "lr": 0.0007883916083916084, + "step": 2014, + "tokens_trained": 0.989872712 + }, + { + "epoch": 0.5718743351535353, + "grad_norm": 2.776369094848633, + "loss": 4.1548, + "lr": 0.0007881118881118882, + "step": 2016, + "tokens_trained": 0.990854072 + }, + { + "epoch": 0.5724416708034891, + "grad_norm": 16.00047492980957, + "loss": 4.1319, + "lr": 0.0007878321678321678, + "step": 2018, + "tokens_trained": 0.991834552 + }, + { + "epoch": 0.573009006453443, + "grad_norm": 21.678735733032227, + "loss": 4.1986, + "lr": 0.0007875524475524476, + "step": 2020, + "tokens_trained": 0.992818256 + }, + { + "epoch": 0.5735763421033969, + "grad_norm": 4.835119724273682, + "loss": 4.1625, + "lr": 0.0007872727272727273, + "step": 2022, + "tokens_trained": 0.993801376 + }, + { + "epoch": 0.5741436777533508, + "grad_norm": 19.427467346191406, + "loss": 4.1594, + "lr": 0.000786993006993007, + "step": 2024, + "tokens_trained": 0.994788568 + }, + { + "epoch": 0.5747110134033048, + "grad_norm": 15.458346366882324, + "loss": 4.1829, + "lr": 0.0007867132867132867, + "step": 2026, + "tokens_trained": 0.995769976 + }, + { + "epoch": 0.5752783490532586, + "grad_norm": 11.073614120483398, + "loss": 4.1303, + "lr": 0.0007864335664335664, + "step": 2028, + "tokens_trained": 0.996751464 + }, + { + "epoch": 0.5758456847032125, + "grad_norm": 4.685436248779297, + "loss": 4.1368, + "lr": 0.0007861538461538463, + "step": 2030, + "tokens_trained": 0.997733952 + }, + { + "epoch": 0.5764130203531664, + "grad_norm": 15.977241516113281, + "loss": 4.1584, + "lr": 0.0007858741258741259, + "step": 2032, + "tokens_trained": 0.998716976 + }, + { + "epoch": 0.5769803560031204, + "grad_norm": 11.305732727050781, + "loss": 4.102, + "lr": 0.0007855944055944056, + "step": 2034, + "tokens_trained": 0.999703632 + }, + { + "epoch": 0.5775476916530743, + "grad_norm": 7.794003963470459, + "loss": 4.161, + "lr": 0.0007853146853146853, + "step": 2036, + "tokens_trained": 1.000687488 + }, + { + "epoch": 0.5781150273030281, + "grad_norm": 7.609982013702393, + "loss": 4.1546, + "lr": 0.0007850349650349651, + "step": 2038, + "tokens_trained": 1.0016692 + }, + { + "epoch": 0.578682362952982, + "grad_norm": 7.622653961181641, + "loss": 4.1246, + "lr": 0.0007847552447552448, + "step": 2040, + "tokens_trained": 1.002653352 + }, + { + "epoch": 0.579249698602936, + "grad_norm": 9.98919677734375, + "loss": 4.1319, + "lr": 0.0007844755244755245, + "step": 2042, + "tokens_trained": 1.003639528 + }, + { + "epoch": 0.5798170342528899, + "grad_norm": 9.557628631591797, + "loss": 4.1105, + "lr": 0.0007841958041958041, + "step": 2044, + "tokens_trained": 1.004623776 + }, + { + "epoch": 0.5803843699028438, + "grad_norm": 14.172621726989746, + "loss": 4.1339, + "lr": 0.0007839160839160839, + "step": 2046, + "tokens_trained": 1.005604008 + }, + { + "epoch": 0.5809517055527976, + "grad_norm": 8.185248374938965, + "loss": 4.1142, + "lr": 0.0007836363636363637, + "step": 2048, + "tokens_trained": 1.006585704 + }, + { + "epoch": 0.5815190412027516, + "grad_norm": 10.642661094665527, + "loss": 4.131, + "lr": 0.0007833566433566434, + "step": 2050, + "tokens_trained": 1.00757132 + }, + { + "epoch": 0.5820863768527055, + "grad_norm": 7.868969917297363, + "loss": 4.1477, + "lr": 0.0007830769230769231, + "step": 2052, + "tokens_trained": 1.008556824 + }, + { + "epoch": 0.5826537125026594, + "grad_norm": 2.8441150188446045, + "loss": 4.1156, + "lr": 0.0007827972027972028, + "step": 2054, + "tokens_trained": 1.00954056 + }, + { + "epoch": 0.5832210481526133, + "grad_norm": 5.2797932624816895, + "loss": 4.1058, + "lr": 0.0007825174825174826, + "step": 2056, + "tokens_trained": 1.010526488 + }, + { + "epoch": 0.5837883838025671, + "grad_norm": 11.850811004638672, + "loss": 4.165, + "lr": 0.0007822377622377622, + "step": 2058, + "tokens_trained": 1.011507584 + }, + { + "epoch": 0.5843557194525211, + "grad_norm": 11.073920249938965, + "loss": 4.1509, + "lr": 0.000781958041958042, + "step": 2060, + "tokens_trained": 1.012491648 + }, + { + "epoch": 0.584923055102475, + "grad_norm": 8.282343864440918, + "loss": 4.0656, + "lr": 0.0007816783216783216, + "step": 2062, + "tokens_trained": 1.013475224 + }, + { + "epoch": 0.5854903907524289, + "grad_norm": 10.414461135864258, + "loss": 4.1285, + "lr": 0.0007813986013986014, + "step": 2064, + "tokens_trained": 1.014458144 + }, + { + "epoch": 0.5860577264023829, + "grad_norm": 9.988463401794434, + "loss": 4.1234, + "lr": 0.0007811188811188812, + "step": 2066, + "tokens_trained": 1.015444112 + }, + { + "epoch": 0.5866250620523367, + "grad_norm": 8.713189125061035, + "loss": 4.129, + "lr": 0.0007808391608391609, + "step": 2068, + "tokens_trained": 1.016427568 + }, + { + "epoch": 0.5871923977022906, + "grad_norm": 3.4149773120880127, + "loss": 4.155, + "lr": 0.0007805594405594406, + "step": 2070, + "tokens_trained": 1.017412264 + }, + { + "epoch": 0.5877597333522445, + "grad_norm": 12.33522891998291, + "loss": 4.1856, + "lr": 0.0007802797202797202, + "step": 2072, + "tokens_trained": 1.018402216 + }, + { + "epoch": 0.5883270690021984, + "grad_norm": 12.155695915222168, + "loss": 4.1468, + "lr": 0.0007800000000000001, + "step": 2074, + "tokens_trained": 1.019387096 + }, + { + "epoch": 0.5888944046521524, + "grad_norm": 7.73326301574707, + "loss": 4.1239, + "lr": 0.0007797202797202797, + "step": 2076, + "tokens_trained": 1.020370008 + }, + { + "epoch": 0.5894617403021062, + "grad_norm": 6.425852298736572, + "loss": 4.1101, + "lr": 0.0007794405594405595, + "step": 2078, + "tokens_trained": 1.02135716 + }, + { + "epoch": 0.5900290759520601, + "grad_norm": 18.360816955566406, + "loss": 4.1726, + "lr": 0.0007791608391608391, + "step": 2080, + "tokens_trained": 1.022338024 + }, + { + "epoch": 0.590596411602014, + "grad_norm": 28.31681251525879, + "loss": 4.1341, + "lr": 0.0007788811188811189, + "step": 2082, + "tokens_trained": 1.023318008 + }, + { + "epoch": 0.591163747251968, + "grad_norm": 10.673089027404785, + "loss": 4.1268, + "lr": 0.0007786013986013987, + "step": 2084, + "tokens_trained": 1.02430432 + }, + { + "epoch": 0.5917310829019219, + "grad_norm": 26.656522750854492, + "loss": 4.1703, + "lr": 0.0007783216783216783, + "step": 2086, + "tokens_trained": 1.025288272 + }, + { + "epoch": 0.5922984185518757, + "grad_norm": 20.022029876708984, + "loss": 4.1532, + "lr": 0.0007780419580419581, + "step": 2088, + "tokens_trained": 1.026272984 + }, + { + "epoch": 0.5928657542018296, + "grad_norm": 7.2955121994018555, + "loss": 4.1992, + "lr": 0.0007777622377622377, + "step": 2090, + "tokens_trained": 1.02725572 + }, + { + "epoch": 0.5934330898517836, + "grad_norm": 28.561243057250977, + "loss": 4.2098, + "lr": 0.0007774825174825176, + "step": 2092, + "tokens_trained": 1.028238456 + }, + { + "epoch": 0.5940004255017375, + "grad_norm": 16.715425491333008, + "loss": 4.1509, + "lr": 0.0007772027972027972, + "step": 2094, + "tokens_trained": 1.029226048 + }, + { + "epoch": 0.5945677611516914, + "grad_norm": 6.325936317443848, + "loss": 4.1221, + "lr": 0.000776923076923077, + "step": 2096, + "tokens_trained": 1.030210528 + }, + { + "epoch": 0.5951350968016452, + "grad_norm": 12.83181381225586, + "loss": 4.1808, + "lr": 0.0007766433566433566, + "step": 2098, + "tokens_trained": 1.031193456 + }, + { + "epoch": 0.5957024324515992, + "grad_norm": 12.183184623718262, + "loss": 4.1292, + "lr": 0.0007763636363636363, + "step": 2100, + "tokens_trained": 1.032173528 + }, + { + "epoch": 0.5962697681015531, + "grad_norm": 8.247485160827637, + "loss": 4.1425, + "lr": 0.0007760839160839162, + "step": 2102, + "tokens_trained": 1.033158144 + }, + { + "epoch": 0.596837103751507, + "grad_norm": 10.814559936523438, + "loss": 4.1167, + "lr": 0.0007758041958041958, + "step": 2104, + "tokens_trained": 1.034141216 + }, + { + "epoch": 0.5974044394014609, + "grad_norm": 12.589309692382812, + "loss": 4.0916, + "lr": 0.0007755244755244756, + "step": 2106, + "tokens_trained": 1.035121888 + }, + { + "epoch": 0.5979717750514147, + "grad_norm": 11.65658187866211, + "loss": 4.0776, + "lr": 0.0007752447552447552, + "step": 2108, + "tokens_trained": 1.036103688 + }, + { + "epoch": 0.5985391107013687, + "grad_norm": 18.0120792388916, + "loss": 4.1588, + "lr": 0.0007749650349650351, + "step": 2110, + "tokens_trained": 1.03708248 + }, + { + "epoch": 0.5991064463513226, + "grad_norm": 5.742938995361328, + "loss": 4.151, + "lr": 0.0007746853146853147, + "step": 2112, + "tokens_trained": 1.038068792 + }, + { + "epoch": 0.5996737820012765, + "grad_norm": 36.54581832885742, + "loss": 4.2239, + "lr": 0.0007744055944055944, + "step": 2114, + "tokens_trained": 1.03904728 + }, + { + "epoch": 0.6002411176512304, + "grad_norm": 13.304069519042969, + "loss": 4.152, + "lr": 0.0007741258741258741, + "step": 2116, + "tokens_trained": 1.040031312 + }, + { + "epoch": 0.6008084533011843, + "grad_norm": 18.68927001953125, + "loss": 4.1413, + "lr": 0.0007738461538461538, + "step": 2118, + "tokens_trained": 1.041018376 + }, + { + "epoch": 0.6013757889511382, + "grad_norm": 16.946630477905273, + "loss": 4.1122, + "lr": 0.0007735664335664337, + "step": 2120, + "tokens_trained": 1.0420056 + }, + { + "epoch": 0.6019431246010921, + "grad_norm": 4.236926078796387, + "loss": 4.1146, + "lr": 0.0007732867132867133, + "step": 2122, + "tokens_trained": 1.042990376 + }, + { + "epoch": 0.602510460251046, + "grad_norm": 12.148641586303711, + "loss": 4.1472, + "lr": 0.0007730069930069931, + "step": 2124, + "tokens_trained": 1.0439754 + }, + { + "epoch": 0.602794128076023, + "eval_loss": 1.039306640625, + "eval_runtime": 20.6138, + "step": 2125, + "tokens_trained": 1.044467008 + }, + { + "epoch": 0.603077795901, + "grad_norm": 17.051687240600586, + "loss": 4.1572, + "lr": 0.0007727272727272727, + "step": 2126, + "tokens_trained": 1.044957456 + }, + { + "epoch": 0.6036451315509538, + "grad_norm": 14.019828796386719, + "loss": 4.1464, + "lr": 0.0007724475524475525, + "step": 2128, + "tokens_trained": 1.04593944 + }, + { + "epoch": 0.6042124672009077, + "grad_norm": 11.22962760925293, + "loss": 4.1345, + "lr": 0.0007721678321678322, + "step": 2130, + "tokens_trained": 1.046919592 + }, + { + "epoch": 0.6047798028508616, + "grad_norm": 11.524348258972168, + "loss": 4.1233, + "lr": 0.0007718881118881119, + "step": 2132, + "tokens_trained": 1.047904744 + }, + { + "epoch": 0.6053471385008156, + "grad_norm": 7.174457550048828, + "loss": 4.1201, + "lr": 0.0007716083916083916, + "step": 2134, + "tokens_trained": 1.048885328 + }, + { + "epoch": 0.6059144741507695, + "grad_norm": 6.847499847412109, + "loss": 4.1313, + "lr": 0.0007713286713286713, + "step": 2136, + "tokens_trained": 1.049868776 + }, + { + "epoch": 0.6064818098007233, + "grad_norm": 8.44458293914795, + "loss": 4.1236, + "lr": 0.0007710489510489512, + "step": 2138, + "tokens_trained": 1.050852704 + }, + { + "epoch": 0.6070491454506772, + "grad_norm": 15.415260314941406, + "loss": 4.1424, + "lr": 0.0007707692307692308, + "step": 2140, + "tokens_trained": 1.051837736 + }, + { + "epoch": 0.6076164811006312, + "grad_norm": 16.845874786376953, + "loss": 4.1037, + "lr": 0.0007704895104895105, + "step": 2142, + "tokens_trained": 1.05282172 + }, + { + "epoch": 0.6081838167505851, + "grad_norm": 1.3947086334228516, + "loss": 4.1389, + "lr": 0.0007702097902097902, + "step": 2144, + "tokens_trained": 1.053802928 + }, + { + "epoch": 0.608751152400539, + "grad_norm": 3.4119038581848145, + "loss": 4.16, + "lr": 0.0007699300699300699, + "step": 2146, + "tokens_trained": 1.054784368 + }, + { + "epoch": 0.6093184880504928, + "grad_norm": 9.26860523223877, + "loss": 4.1841, + "lr": 0.0007696503496503497, + "step": 2148, + "tokens_trained": 1.05576888 + }, + { + "epoch": 0.6098858237004467, + "grad_norm": 8.744836807250977, + "loss": 4.1043, + "lr": 0.0007693706293706294, + "step": 2150, + "tokens_trained": 1.056751336 + }, + { + "epoch": 0.6104531593504007, + "grad_norm": 8.805045127868652, + "loss": 4.1032, + "lr": 0.000769090909090909, + "step": 2152, + "tokens_trained": 1.057734 + }, + { + "epoch": 0.6110204950003546, + "grad_norm": 4.785625457763672, + "loss": 4.1817, + "lr": 0.0007688111888111888, + "step": 2154, + "tokens_trained": 1.058716328 + }, + { + "epoch": 0.6115878306503085, + "grad_norm": 2.2137513160705566, + "loss": 4.1514, + "lr": 0.0007685314685314686, + "step": 2156, + "tokens_trained": 1.059696248 + }, + { + "epoch": 0.6121551663002623, + "grad_norm": 7.164271354675293, + "loss": 4.1433, + "lr": 0.0007682517482517483, + "step": 2158, + "tokens_trained": 1.060676648 + }, + { + "epoch": 0.6127225019502163, + "grad_norm": 9.481597900390625, + "loss": 4.0971, + "lr": 0.000767972027972028, + "step": 2160, + "tokens_trained": 1.061656688 + }, + { + "epoch": 0.6132898376001702, + "grad_norm": 11.28831672668457, + "loss": 4.149, + "lr": 0.0007676923076923077, + "step": 2162, + "tokens_trained": 1.062640576 + }, + { + "epoch": 0.6138571732501241, + "grad_norm": 17.21572494506836, + "loss": 4.098, + "lr": 0.0007674125874125874, + "step": 2164, + "tokens_trained": 1.063617688 + }, + { + "epoch": 0.614424508900078, + "grad_norm": 14.486310005187988, + "loss": 4.123, + "lr": 0.0007671328671328672, + "step": 2166, + "tokens_trained": 1.06460584 + }, + { + "epoch": 0.6149918445500319, + "grad_norm": 10.582398414611816, + "loss": 4.1243, + "lr": 0.0007668531468531469, + "step": 2168, + "tokens_trained": 1.065589064 + }, + { + "epoch": 0.6155591801999858, + "grad_norm": 12.923002243041992, + "loss": 4.0928, + "lr": 0.0007665734265734265, + "step": 2170, + "tokens_trained": 1.06657224 + }, + { + "epoch": 0.6161265158499397, + "grad_norm": 12.445414543151855, + "loss": 4.1697, + "lr": 0.0007662937062937063, + "step": 2172, + "tokens_trained": 1.067556952 + }, + { + "epoch": 0.6166938514998936, + "grad_norm": 3.562396287918091, + "loss": 4.0763, + "lr": 0.000766013986013986, + "step": 2174, + "tokens_trained": 1.068538248 + }, + { + "epoch": 0.6172611871498476, + "grad_norm": 12.62887954711914, + "loss": 4.1203, + "lr": 0.0007657342657342658, + "step": 2176, + "tokens_trained": 1.06952032 + }, + { + "epoch": 0.6178285227998014, + "grad_norm": 9.387356758117676, + "loss": 4.1318, + "lr": 0.0007654545454545455, + "step": 2178, + "tokens_trained": 1.070503872 + }, + { + "epoch": 0.6183958584497553, + "grad_norm": 8.885710716247559, + "loss": 4.1609, + "lr": 0.0007651748251748251, + "step": 2180, + "tokens_trained": 1.071486328 + }, + { + "epoch": 0.6189631940997092, + "grad_norm": 7.174533843994141, + "loss": 4.0824, + "lr": 0.0007648951048951049, + "step": 2182, + "tokens_trained": 1.07246928 + }, + { + "epoch": 0.6195305297496632, + "grad_norm": 15.866931915283203, + "loss": 4.1461, + "lr": 0.0007646153846153846, + "step": 2184, + "tokens_trained": 1.07345252 + }, + { + "epoch": 0.6200978653996171, + "grad_norm": 4.892337799072266, + "loss": 4.1418, + "lr": 0.0007643356643356644, + "step": 2186, + "tokens_trained": 1.07443796 + }, + { + "epoch": 0.6206652010495709, + "grad_norm": 4.796551704406738, + "loss": 4.1394, + "lr": 0.000764055944055944, + "step": 2188, + "tokens_trained": 1.075421392 + }, + { + "epoch": 0.6212325366995248, + "grad_norm": 10.585665702819824, + "loss": 4.1046, + "lr": 0.0007637762237762238, + "step": 2190, + "tokens_trained": 1.076404848 + }, + { + "epoch": 0.6217998723494788, + "grad_norm": 8.71747875213623, + "loss": 4.1819, + "lr": 0.0007634965034965035, + "step": 2192, + "tokens_trained": 1.077386672 + }, + { + "epoch": 0.6223672079994327, + "grad_norm": 10.74347972869873, + "loss": 4.1231, + "lr": 0.0007632167832167833, + "step": 2194, + "tokens_trained": 1.078365112 + }, + { + "epoch": 0.6229345436493866, + "grad_norm": 12.079446792602539, + "loss": 4.1132, + "lr": 0.000762937062937063, + "step": 2196, + "tokens_trained": 1.07935376 + }, + { + "epoch": 0.6235018792993404, + "grad_norm": 7.8133649826049805, + "loss": 4.0915, + "lr": 0.0007626573426573426, + "step": 2198, + "tokens_trained": 1.080332872 + }, + { + "epoch": 0.6240692149492943, + "grad_norm": 4.51243782043457, + "loss": 4.1108, + "lr": 0.0007623776223776224, + "step": 2200, + "tokens_trained": 1.081316664 + }, + { + "epoch": 0.6246365505992483, + "grad_norm": 12.625933647155762, + "loss": 4.1552, + "lr": 0.0007620979020979021, + "step": 2202, + "tokens_trained": 1.08230448 + }, + { + "epoch": 0.6252038862492022, + "grad_norm": 9.984200477600098, + "loss": 4.1199, + "lr": 0.0007618181818181819, + "step": 2204, + "tokens_trained": 1.083288992 + }, + { + "epoch": 0.6257712218991561, + "grad_norm": 11.338666915893555, + "loss": 4.0821, + "lr": 0.0007615384615384615, + "step": 2206, + "tokens_trained": 1.084273864 + }, + { + "epoch": 0.6263385575491099, + "grad_norm": 6.808894634246826, + "loss": 4.1202, + "lr": 0.0007612587412587412, + "step": 2208, + "tokens_trained": 1.085254584 + }, + { + "epoch": 0.6269058931990639, + "grad_norm": 4.182394027709961, + "loss": 4.1072, + "lr": 0.000760979020979021, + "step": 2210, + "tokens_trained": 1.086237312 + }, + { + "epoch": 0.6274732288490178, + "grad_norm": 13.04654312133789, + "loss": 4.1611, + "lr": 0.0007606993006993007, + "step": 2212, + "tokens_trained": 1.087220136 + }, + { + "epoch": 0.6280405644989717, + "grad_norm": 8.223962783813477, + "loss": 4.1094, + "lr": 0.0007604195804195805, + "step": 2214, + "tokens_trained": 1.088203464 + }, + { + "epoch": 0.6286079001489256, + "grad_norm": 7.974697589874268, + "loss": 4.1061, + "lr": 0.0007601398601398601, + "step": 2216, + "tokens_trained": 1.089188056 + }, + { + "epoch": 0.6291752357988795, + "grad_norm": 9.93747329711914, + "loss": 4.1625, + "lr": 0.0007598601398601399, + "step": 2218, + "tokens_trained": 1.090168464 + }, + { + "epoch": 0.6297425714488334, + "grad_norm": 14.117332458496094, + "loss": 4.1386, + "lr": 0.0007595804195804196, + "step": 2220, + "tokens_trained": 1.09115228 + }, + { + "epoch": 0.6303099070987873, + "grad_norm": 8.045380592346191, + "loss": 4.0962, + "lr": 0.0007593006993006993, + "step": 2222, + "tokens_trained": 1.0921348 + }, + { + "epoch": 0.6308772427487412, + "grad_norm": 7.286352634429932, + "loss": 4.1456, + "lr": 0.000759020979020979, + "step": 2224, + "tokens_trained": 1.0931198 + }, + { + "epoch": 0.6314445783986952, + "grad_norm": 7.278292179107666, + "loss": 4.1155, + "lr": 0.0007587412587412587, + "step": 2226, + "tokens_trained": 1.094107536 + }, + { + "epoch": 0.632011914048649, + "grad_norm": 5.973489761352539, + "loss": 4.1403, + "lr": 0.0007584615384615385, + "step": 2228, + "tokens_trained": 1.095090384 + }, + { + "epoch": 0.6325792496986029, + "grad_norm": 11.78962230682373, + "loss": 4.1322, + "lr": 0.0007581818181818182, + "step": 2230, + "tokens_trained": 1.096072192 + }, + { + "epoch": 0.6331465853485568, + "grad_norm": 9.853010177612305, + "loss": 4.0905, + "lr": 0.000757902097902098, + "step": 2232, + "tokens_trained": 1.097057368 + }, + { + "epoch": 0.6337139209985108, + "grad_norm": 12.578025817871094, + "loss": 4.0871, + "lr": 0.0007576223776223776, + "step": 2234, + "tokens_trained": 1.0980418 + }, + { + "epoch": 0.6342812566484647, + "grad_norm": 8.467657089233398, + "loss": 4.0972, + "lr": 0.0007573426573426573, + "step": 2236, + "tokens_trained": 1.099023032 + }, + { + "epoch": 0.6348485922984185, + "grad_norm": 10.768691062927246, + "loss": 4.0683, + "lr": 0.0007570629370629371, + "step": 2238, + "tokens_trained": 1.1000078 + }, + { + "epoch": 0.6354159279483724, + "grad_norm": 8.509350776672363, + "loss": 4.1319, + "lr": 0.0007567832167832168, + "step": 2240, + "tokens_trained": 1.100990904 + }, + { + "epoch": 0.6359832635983264, + "grad_norm": 9.473450660705566, + "loss": 4.0971, + "lr": 0.0007565034965034965, + "step": 2242, + "tokens_trained": 1.101971112 + }, + { + "epoch": 0.6365505992482803, + "grad_norm": 5.248406887054443, + "loss": 4.1212, + "lr": 0.0007562237762237762, + "step": 2244, + "tokens_trained": 1.10295244 + }, + { + "epoch": 0.6371179348982342, + "grad_norm": 2.8849964141845703, + "loss": 4.0914, + "lr": 0.000755944055944056, + "step": 2246, + "tokens_trained": 1.103935728 + }, + { + "epoch": 0.637685270548188, + "grad_norm": 10.757996559143066, + "loss": 4.0711, + "lr": 0.0007556643356643357, + "step": 2248, + "tokens_trained": 1.104917112 + }, + { + "epoch": 0.638252606198142, + "grad_norm": 14.822528839111328, + "loss": 4.1311, + "lr": 0.0007553846153846154, + "step": 2250, + "tokens_trained": 1.105899872 + }, + { + "epoch": 0.638252606198142, + "eval_loss": 1.0298579931259155, + "eval_runtime": 20.7482, + "step": 2250, + "tokens_trained": 1.105899872 + }, + { + "epoch": 0.6388199418480959, + "grad_norm": 12.402534484863281, + "loss": 4.0729, + "lr": 0.0007551048951048951, + "step": 2252, + "tokens_trained": 1.106885776 + }, + { + "epoch": 0.6393872774980498, + "grad_norm": 8.585915565490723, + "loss": 4.1026, + "lr": 0.0007548251748251748, + "step": 2254, + "tokens_trained": 1.107867784 + }, + { + "epoch": 0.6399546131480037, + "grad_norm": 9.298388481140137, + "loss": 4.1033, + "lr": 0.0007545454545454546, + "step": 2256, + "tokens_trained": 1.108846136 + }, + { + "epoch": 0.6405219487979575, + "grad_norm": 10.894235610961914, + "loss": 4.1212, + "lr": 0.0007542657342657343, + "step": 2258, + "tokens_trained": 1.10982972 + }, + { + "epoch": 0.6410892844479115, + "grad_norm": 7.488401889801025, + "loss": 4.1268, + "lr": 0.000753986013986014, + "step": 2260, + "tokens_trained": 1.110815128 + }, + { + "epoch": 0.6416566200978654, + "grad_norm": 10.087981224060059, + "loss": 4.0819, + "lr": 0.0007537062937062937, + "step": 2262, + "tokens_trained": 1.111796896 + }, + { + "epoch": 0.6422239557478193, + "grad_norm": 8.851993560791016, + "loss": 4.0903, + "lr": 0.0007534265734265734, + "step": 2264, + "tokens_trained": 1.112779032 + }, + { + "epoch": 0.6427912913977732, + "grad_norm": 7.973280429840088, + "loss": 4.1251, + "lr": 0.0007531468531468532, + "step": 2266, + "tokens_trained": 1.11376248 + }, + { + "epoch": 0.6433586270477271, + "grad_norm": 10.600922584533691, + "loss": 4.1062, + "lr": 0.0007528671328671329, + "step": 2268, + "tokens_trained": 1.11474752 + }, + { + "epoch": 0.643925962697681, + "grad_norm": 6.029149532318115, + "loss": 4.1174, + "lr": 0.0007525874125874126, + "step": 2270, + "tokens_trained": 1.115730304 + }, + { + "epoch": 0.6444932983476349, + "grad_norm": 5.804802417755127, + "loss": 4.0634, + "lr": 0.0007523076923076923, + "step": 2272, + "tokens_trained": 1.116712712 + }, + { + "epoch": 0.6450606339975888, + "grad_norm": 12.601567268371582, + "loss": 4.111, + "lr": 0.0007520279720279721, + "step": 2274, + "tokens_trained": 1.117692824 + }, + { + "epoch": 0.6456279696475428, + "grad_norm": 6.2783203125, + "loss": 4.1375, + "lr": 0.0007517482517482518, + "step": 2276, + "tokens_trained": 1.118681616 + }, + { + "epoch": 0.6461953052974966, + "grad_norm": 3.368333339691162, + "loss": 4.096, + "lr": 0.0007514685314685314, + "step": 2278, + "tokens_trained": 1.119662896 + }, + { + "epoch": 0.6467626409474505, + "grad_norm": 28.135610580444336, + "loss": 4.1362, + "lr": 0.0007511888111888112, + "step": 2280, + "tokens_trained": 1.120644592 + }, + { + "epoch": 0.6473299765974044, + "grad_norm": 31.932798385620117, + "loss": 4.177, + "lr": 0.0007509090909090909, + "step": 2282, + "tokens_trained": 1.1216274 + }, + { + "epoch": 0.6478973122473584, + "grad_norm": 18.303653717041016, + "loss": 4.2105, + "lr": 0.0007506293706293707, + "step": 2284, + "tokens_trained": 1.122610568 + }, + { + "epoch": 0.6484646478973123, + "grad_norm": 24.33900260925293, + "loss": 4.1685, + "lr": 0.0007503496503496504, + "step": 2286, + "tokens_trained": 1.1235948 + }, + { + "epoch": 0.6490319835472661, + "grad_norm": 14.718119621276855, + "loss": 4.1309, + "lr": 0.00075006993006993, + "step": 2288, + "tokens_trained": 1.124576952 + }, + { + "epoch": 0.64959931919722, + "grad_norm": 10.44218921661377, + "loss": 4.1178, + "lr": 0.0007497902097902098, + "step": 2290, + "tokens_trained": 1.12555812 + }, + { + "epoch": 0.650166654847174, + "grad_norm": 12.619060516357422, + "loss": 4.088, + "lr": 0.0007495104895104895, + "step": 2292, + "tokens_trained": 1.126542504 + }, + { + "epoch": 0.6507339904971279, + "grad_norm": 12.677931785583496, + "loss": 4.1146, + "lr": 0.0007492307692307693, + "step": 2294, + "tokens_trained": 1.127527144 + }, + { + "epoch": 0.6513013261470818, + "grad_norm": 9.913066864013672, + "loss": 4.1376, + "lr": 0.0007489510489510489, + "step": 2296, + "tokens_trained": 1.128511472 + }, + { + "epoch": 0.6518686617970356, + "grad_norm": 10.902573585510254, + "loss": 4.1184, + "lr": 0.0007486713286713287, + "step": 2298, + "tokens_trained": 1.129493144 + }, + { + "epoch": 0.6524359974469895, + "grad_norm": 11.475235939025879, + "loss": 4.098, + "lr": 0.0007483916083916084, + "step": 2300, + "tokens_trained": 1.13047816 + }, + { + "epoch": 0.6530033330969435, + "grad_norm": 11.541910171508789, + "loss": 4.106, + "lr": 0.0007481118881118882, + "step": 2302, + "tokens_trained": 1.131461952 + }, + { + "epoch": 0.6535706687468974, + "grad_norm": 8.055131912231445, + "loss": 4.0913, + "lr": 0.0007478321678321679, + "step": 2304, + "tokens_trained": 1.132445928 + }, + { + "epoch": 0.6541380043968513, + "grad_norm": 11.786042213439941, + "loss": 4.14, + "lr": 0.0007475524475524475, + "step": 2306, + "tokens_trained": 1.133430104 + }, + { + "epoch": 0.6547053400468051, + "grad_norm": 7.311541557312012, + "loss": 4.0989, + "lr": 0.0007472727272727273, + "step": 2308, + "tokens_trained": 1.1344128 + }, + { + "epoch": 0.6552726756967591, + "grad_norm": 5.909560680389404, + "loss": 4.1226, + "lr": 0.000746993006993007, + "step": 2310, + "tokens_trained": 1.135395456 + }, + { + "epoch": 0.655840011346713, + "grad_norm": 15.199941635131836, + "loss": 4.1003, + "lr": 0.0007467132867132868, + "step": 2312, + "tokens_trained": 1.136377952 + }, + { + "epoch": 0.6564073469966669, + "grad_norm": 11.078165054321289, + "loss": 4.1273, + "lr": 0.0007464335664335664, + "step": 2314, + "tokens_trained": 1.137364488 + }, + { + "epoch": 0.6569746826466208, + "grad_norm": 14.202346801757812, + "loss": 4.074, + "lr": 0.0007461538461538462, + "step": 2316, + "tokens_trained": 1.138348624 + }, + { + "epoch": 0.6575420182965747, + "grad_norm": 12.573927879333496, + "loss": 4.0749, + "lr": 0.0007458741258741259, + "step": 2318, + "tokens_trained": 1.139332304 + }, + { + "epoch": 0.6581093539465286, + "grad_norm": 4.582006454467773, + "loss": 4.1204, + "lr": 0.0007455944055944056, + "step": 2320, + "tokens_trained": 1.140317248 + }, + { + "epoch": 0.6586766895964825, + "grad_norm": 12.172183990478516, + "loss": 4.1045, + "lr": 0.0007453146853146854, + "step": 2322, + "tokens_trained": 1.141300976 + }, + { + "epoch": 0.6592440252464364, + "grad_norm": 8.110429763793945, + "loss": 4.1081, + "lr": 0.000745034965034965, + "step": 2324, + "tokens_trained": 1.142283576 + }, + { + "epoch": 0.6598113608963904, + "grad_norm": 7.653029918670654, + "loss": 4.1272, + "lr": 0.0007447552447552448, + "step": 2326, + "tokens_trained": 1.143264144 + }, + { + "epoch": 0.6603786965463442, + "grad_norm": 8.91545295715332, + "loss": 4.0604, + "lr": 0.0007444755244755245, + "step": 2328, + "tokens_trained": 1.144248336 + }, + { + "epoch": 0.6609460321962981, + "grad_norm": 8.173501014709473, + "loss": 4.1033, + "lr": 0.0007441958041958043, + "step": 2330, + "tokens_trained": 1.145231936 + }, + { + "epoch": 0.661513367846252, + "grad_norm": 6.748053550720215, + "loss": 4.1, + "lr": 0.0007439160839160839, + "step": 2332, + "tokens_trained": 1.146214208 + }, + { + "epoch": 0.662080703496206, + "grad_norm": 8.997527122497559, + "loss": 4.0642, + "lr": 0.0007436363636363636, + "step": 2334, + "tokens_trained": 1.147203592 + }, + { + "epoch": 0.6626480391461599, + "grad_norm": 5.39633321762085, + "loss": 4.0531, + "lr": 0.0007433566433566433, + "step": 2336, + "tokens_trained": 1.148189176 + }, + { + "epoch": 0.6632153747961137, + "grad_norm": 11.717559814453125, + "loss": 4.1069, + "lr": 0.0007430769230769231, + "step": 2338, + "tokens_trained": 1.14917232 + }, + { + "epoch": 0.6637827104460676, + "grad_norm": 4.895142078399658, + "loss": 4.1119, + "lr": 0.0007427972027972029, + "step": 2340, + "tokens_trained": 1.150150104 + }, + { + "epoch": 0.6643500460960216, + "grad_norm": 7.677682399749756, + "loss": 4.0787, + "lr": 0.0007425174825174825, + "step": 2342, + "tokens_trained": 1.15113228 + }, + { + "epoch": 0.6649173817459755, + "grad_norm": 9.910654067993164, + "loss": 4.114, + "lr": 0.0007422377622377622, + "step": 2344, + "tokens_trained": 1.152119112 + }, + { + "epoch": 0.6654847173959294, + "grad_norm": 7.880978107452393, + "loss": 4.1188, + "lr": 0.000741958041958042, + "step": 2346, + "tokens_trained": 1.153100688 + }, + { + "epoch": 0.6660520530458832, + "grad_norm": 3.284940242767334, + "loss": 4.0736, + "lr": 0.0007416783216783217, + "step": 2348, + "tokens_trained": 1.1540818 + }, + { + "epoch": 0.6666193886958371, + "grad_norm": 13.524490356445312, + "loss": 4.0621, + "lr": 0.0007413986013986014, + "step": 2350, + "tokens_trained": 1.155065608 + }, + { + "epoch": 0.6671867243457911, + "grad_norm": 5.8569135665893555, + "loss": 4.0904, + "lr": 0.0007411188811188811, + "step": 2352, + "tokens_trained": 1.156048544 + }, + { + "epoch": 0.667754059995745, + "grad_norm": 7.1157450675964355, + "loss": 4.0774, + "lr": 0.0007408391608391608, + "step": 2354, + "tokens_trained": 1.157030432 + }, + { + "epoch": 0.6683213956456989, + "grad_norm": 7.612982273101807, + "loss": 4.0829, + "lr": 0.0007405594405594406, + "step": 2356, + "tokens_trained": 1.158012728 + }, + { + "epoch": 0.6688887312956527, + "grad_norm": 8.317691802978516, + "loss": 4.1176, + "lr": 0.0007402797202797204, + "step": 2358, + "tokens_trained": 1.158993632 + }, + { + "epoch": 0.6694560669456067, + "grad_norm": 5.272528648376465, + "loss": 4.0977, + "lr": 0.00074, + "step": 2360, + "tokens_trained": 1.159976328 + }, + { + "epoch": 0.6700234025955606, + "grad_norm": 11.313931465148926, + "loss": 4.0792, + "lr": 0.0007397202797202797, + "step": 2362, + "tokens_trained": 1.160962072 + }, + { + "epoch": 0.6705907382455145, + "grad_norm": 12.588369369506836, + "loss": 4.0491, + "lr": 0.0007394405594405595, + "step": 2364, + "tokens_trained": 1.161947664 + }, + { + "epoch": 0.6711580738954684, + "grad_norm": 23.921968460083008, + "loss": 4.1085, + "lr": 0.0007391608391608392, + "step": 2366, + "tokens_trained": 1.16292872 + }, + { + "epoch": 0.6717254095454223, + "grad_norm": 9.100578308105469, + "loss": 4.1305, + "lr": 0.0007388811188811189, + "step": 2368, + "tokens_trained": 1.163913888 + }, + { + "epoch": 0.6722927451953762, + "grad_norm": 35.22720718383789, + "loss": 4.1538, + "lr": 0.0007386013986013986, + "step": 2370, + "tokens_trained": 1.164894912 + }, + { + "epoch": 0.6728600808453301, + "grad_norm": 16.7394962310791, + "loss": 4.1449, + "lr": 0.0007383216783216782, + "step": 2372, + "tokens_trained": 1.165879832 + }, + { + "epoch": 0.673427416495284, + "grad_norm": 11.066312789916992, + "loss": 4.1172, + "lr": 0.0007380419580419581, + "step": 2374, + "tokens_trained": 1.166864736 + }, + { + "epoch": 0.6737110843202609, + "eval_loss": 1.0303717851638794, + "eval_runtime": 20.7454, + "step": 2375, + "tokens_trained": 1.167358632 + }, + { + "epoch": 0.673994752145238, + "grad_norm": 12.827569007873535, + "loss": 4.1377, + "lr": 0.0007377622377622378, + "step": 2376, + "tokens_trained": 1.16784964 + }, + { + "epoch": 0.6745620877951918, + "grad_norm": 13.321866035461426, + "loss": 4.0747, + "lr": 0.0007374825174825175, + "step": 2378, + "tokens_trained": 1.168834992 + }, + { + "epoch": 0.6751294234451457, + "grad_norm": 15.812009811401367, + "loss": 4.1107, + "lr": 0.0007372027972027972, + "step": 2380, + "tokens_trained": 1.169817608 + }, + { + "epoch": 0.6756967590950996, + "grad_norm": 16.37995719909668, + "loss": 4.1556, + "lr": 0.000736923076923077, + "step": 2382, + "tokens_trained": 1.170800952 + }, + { + "epoch": 0.6762640947450536, + "grad_norm": 3.3421339988708496, + "loss": 4.1199, + "lr": 0.0007366433566433567, + "step": 2384, + "tokens_trained": 1.1717818 + }, + { + "epoch": 0.6768314303950075, + "grad_norm": 9.120339393615723, + "loss": 4.0834, + "lr": 0.0007363636363636363, + "step": 2386, + "tokens_trained": 1.172767384 + }, + { + "epoch": 0.6773987660449613, + "grad_norm": 12.614449501037598, + "loss": 4.0852, + "lr": 0.0007360839160839161, + "step": 2388, + "tokens_trained": 1.173755008 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 4.983767986297607, + "loss": 4.0881, + "lr": 0.0007358041958041957, + "step": 2390, + "tokens_trained": 1.174738528 + }, + { + "epoch": 0.6785334373448692, + "grad_norm": 4.194960117340088, + "loss": 4.1279, + "lr": 0.0007355244755244756, + "step": 2392, + "tokens_trained": 1.175724848 + }, + { + "epoch": 0.6791007729948231, + "grad_norm": 5.257171154022217, + "loss": 4.1044, + "lr": 0.0007352447552447553, + "step": 2394, + "tokens_trained": 1.176708808 + }, + { + "epoch": 0.679668108644777, + "grad_norm": 10.38420295715332, + "loss": 4.124, + "lr": 0.000734965034965035, + "step": 2396, + "tokens_trained": 1.177695552 + }, + { + "epoch": 0.6802354442947308, + "grad_norm": 8.629493713378906, + "loss": 4.0992, + "lr": 0.0007346853146853147, + "step": 2398, + "tokens_trained": 1.17868064 + }, + { + "epoch": 0.6808027799446847, + "grad_norm": 9.099041938781738, + "loss": 4.1047, + "lr": 0.0007344055944055944, + "step": 2400, + "tokens_trained": 1.179664536 + }, + { + "epoch": 0.6813701155946387, + "grad_norm": 11.343080520629883, + "loss": 4.1027, + "lr": 0.0007341258741258742, + "step": 2402, + "tokens_trained": 1.180644264 + }, + { + "epoch": 0.6819374512445926, + "grad_norm": 5.834907054901123, + "loss": 4.098, + "lr": 0.0007338461538461538, + "step": 2404, + "tokens_trained": 1.181629672 + }, + { + "epoch": 0.6825047868945465, + "grad_norm": 4.648270606994629, + "loss": 4.0775, + "lr": 0.0007335664335664336, + "step": 2406, + "tokens_trained": 1.182614064 + }, + { + "epoch": 0.6830721225445003, + "grad_norm": 6.934843063354492, + "loss": 4.1206, + "lr": 0.0007332867132867132, + "step": 2408, + "tokens_trained": 1.183597056 + }, + { + "epoch": 0.6836394581944543, + "grad_norm": 9.745563507080078, + "loss": 4.0921, + "lr": 0.0007330069930069931, + "step": 2410, + "tokens_trained": 1.184579832 + }, + { + "epoch": 0.6842067938444082, + "grad_norm": 7.189306259155273, + "loss": 4.095, + "lr": 0.0007327272727272728, + "step": 2412, + "tokens_trained": 1.185567912 + }, + { + "epoch": 0.6847741294943621, + "grad_norm": 6.303226947784424, + "loss": 4.0462, + "lr": 0.0007324475524475524, + "step": 2414, + "tokens_trained": 1.186550184 + }, + { + "epoch": 0.685341465144316, + "grad_norm": 6.373469352722168, + "loss": 4.1126, + "lr": 0.0007321678321678322, + "step": 2416, + "tokens_trained": 1.1875374 + }, + { + "epoch": 0.6859088007942699, + "grad_norm": 7.8680853843688965, + "loss": 4.0954, + "lr": 0.0007318881118881119, + "step": 2418, + "tokens_trained": 1.188519808 + }, + { + "epoch": 0.6864761364442238, + "grad_norm": 6.305267810821533, + "loss": 4.0951, + "lr": 0.0007316083916083917, + "step": 2420, + "tokens_trained": 1.18950228 + }, + { + "epoch": 0.6870434720941777, + "grad_norm": 9.990362167358398, + "loss": 4.0902, + "lr": 0.0007313286713286713, + "step": 2422, + "tokens_trained": 1.190483872 + }, + { + "epoch": 0.6876108077441316, + "grad_norm": 7.421126365661621, + "loss": 4.082, + "lr": 0.0007310489510489511, + "step": 2424, + "tokens_trained": 1.191465424 + }, + { + "epoch": 0.6881781433940856, + "grad_norm": 7.08989953994751, + "loss": 4.057, + "lr": 0.0007307692307692307, + "step": 2426, + "tokens_trained": 1.192446 + }, + { + "epoch": 0.6887454790440394, + "grad_norm": 16.008317947387695, + "loss": 4.0857, + "lr": 0.0007304895104895105, + "step": 2428, + "tokens_trained": 1.193428632 + }, + { + "epoch": 0.6893128146939933, + "grad_norm": 14.471416473388672, + "loss": 4.127, + "lr": 0.0007302097902097902, + "step": 2430, + "tokens_trained": 1.194413624 + }, + { + "epoch": 0.6898801503439472, + "grad_norm": 8.250576972961426, + "loss": 4.1244, + "lr": 0.0007299300699300699, + "step": 2432, + "tokens_trained": 1.195396768 + }, + { + "epoch": 0.6904474859939012, + "grad_norm": 17.120845794677734, + "loss": 4.107, + "lr": 0.0007296503496503497, + "step": 2434, + "tokens_trained": 1.196377144 + }, + { + "epoch": 0.6910148216438551, + "grad_norm": 24.250490188598633, + "loss": 4.1443, + "lr": 0.0007293706293706294, + "step": 2436, + "tokens_trained": 1.197361496 + }, + { + "epoch": 0.6915821572938089, + "grad_norm": 9.916406631469727, + "loss": 4.1308, + "lr": 0.0007290909090909092, + "step": 2438, + "tokens_trained": 1.198343376 + }, + { + "epoch": 0.6921494929437628, + "grad_norm": 29.035507202148438, + "loss": 4.1809, + "lr": 0.0007288111888111888, + "step": 2440, + "tokens_trained": 1.19932396 + }, + { + "epoch": 0.6927168285937167, + "grad_norm": 26.963102340698242, + "loss": 4.1343, + "lr": 0.0007285314685314685, + "step": 2442, + "tokens_trained": 1.200310088 + }, + { + "epoch": 0.6932841642436707, + "grad_norm": 9.7550048828125, + "loss": 4.0746, + "lr": 0.0007282517482517482, + "step": 2444, + "tokens_trained": 1.201291576 + }, + { + "epoch": 0.6938514998936246, + "grad_norm": 18.56088638305664, + "loss": 4.1634, + "lr": 0.000727972027972028, + "step": 2446, + "tokens_trained": 1.202271312 + }, + { + "epoch": 0.6944188355435784, + "grad_norm": 20.842105865478516, + "loss": 4.128, + "lr": 0.0007276923076923077, + "step": 2448, + "tokens_trained": 1.203252912 + }, + { + "epoch": 0.6949861711935323, + "grad_norm": 21.38428497314453, + "loss": 4.1263, + "lr": 0.0007274125874125874, + "step": 2450, + "tokens_trained": 1.204231328 + }, + { + "epoch": 0.6955535068434863, + "grad_norm": 9.129469871520996, + "loss": 4.0964, + "lr": 0.0007271328671328672, + "step": 2452, + "tokens_trained": 1.205215552 + }, + { + "epoch": 0.6961208424934402, + "grad_norm": 25.37588882446289, + "loss": 4.1568, + "lr": 0.0007268531468531469, + "step": 2454, + "tokens_trained": 1.206202536 + }, + { + "epoch": 0.6966881781433941, + "grad_norm": 17.409656524658203, + "loss": 4.1214, + "lr": 0.0007265734265734266, + "step": 2456, + "tokens_trained": 1.207182664 + }, + { + "epoch": 0.6972555137933479, + "grad_norm": 12.378538131713867, + "loss": 4.1235, + "lr": 0.0007262937062937063, + "step": 2458, + "tokens_trained": 1.208164408 + }, + { + "epoch": 0.6978228494433019, + "grad_norm": 15.208183288574219, + "loss": 4.0724, + "lr": 0.000726013986013986, + "step": 2460, + "tokens_trained": 1.209151056 + }, + { + "epoch": 0.6983901850932558, + "grad_norm": 15.311476707458496, + "loss": 4.1146, + "lr": 0.0007257342657342657, + "step": 2462, + "tokens_trained": 1.210135672 + }, + { + "epoch": 0.6989575207432097, + "grad_norm": 8.551816940307617, + "loss": 4.0944, + "lr": 0.0007254545454545455, + "step": 2464, + "tokens_trained": 1.211118992 + }, + { + "epoch": 0.6995248563931636, + "grad_norm": 5.893448829650879, + "loss": 4.0777, + "lr": 0.0007251748251748252, + "step": 2466, + "tokens_trained": 1.212102 + }, + { + "epoch": 0.7000921920431175, + "grad_norm": 12.23680591583252, + "loss": 4.0998, + "lr": 0.0007248951048951049, + "step": 2468, + "tokens_trained": 1.213078936 + }, + { + "epoch": 0.7006595276930714, + "grad_norm": 6.285398006439209, + "loss": 4.0691, + "lr": 0.0007246153846153846, + "step": 2470, + "tokens_trained": 1.214058832 + }, + { + "epoch": 0.7012268633430253, + "grad_norm": 5.049949645996094, + "loss": 4.0849, + "lr": 0.0007243356643356644, + "step": 2472, + "tokens_trained": 1.215045384 + }, + { + "epoch": 0.7017941989929792, + "grad_norm": 8.333894729614258, + "loss": 4.1072, + "lr": 0.0007240559440559441, + "step": 2474, + "tokens_trained": 1.216029416 + }, + { + "epoch": 0.7023615346429332, + "grad_norm": 10.236394882202148, + "loss": 4.1144, + "lr": 0.0007237762237762238, + "step": 2476, + "tokens_trained": 1.217012872 + }, + { + "epoch": 0.702928870292887, + "grad_norm": 7.674532413482666, + "loss": 4.0948, + "lr": 0.0007234965034965035, + "step": 2478, + "tokens_trained": 1.2179988 + }, + { + "epoch": 0.7034962059428409, + "grad_norm": 8.445834159851074, + "loss": 4.0937, + "lr": 0.0007232167832167831, + "step": 2480, + "tokens_trained": 1.218980608 + }, + { + "epoch": 0.7040635415927948, + "grad_norm": 6.923468112945557, + "loss": 4.0756, + "lr": 0.000722937062937063, + "step": 2482, + "tokens_trained": 1.219966912 + }, + { + "epoch": 0.7046308772427488, + "grad_norm": 5.95997428894043, + "loss": 4.0618, + "lr": 0.0007226573426573426, + "step": 2484, + "tokens_trained": 1.220952696 + }, + { + "epoch": 0.7051982128927027, + "grad_norm": 3.7207870483398438, + "loss": 4.0869, + "lr": 0.0007223776223776224, + "step": 2486, + "tokens_trained": 1.22193476 + }, + { + "epoch": 0.7057655485426565, + "grad_norm": 8.434130668640137, + "loss": 4.0965, + "lr": 0.0007220979020979021, + "step": 2488, + "tokens_trained": 1.222914616 + }, + { + "epoch": 0.7063328841926104, + "grad_norm": 10.180377006530762, + "loss": 4.0871, + "lr": 0.0007218181818181819, + "step": 2490, + "tokens_trained": 1.22389764 + }, + { + "epoch": 0.7069002198425643, + "grad_norm": 8.211799621582031, + "loss": 4.0811, + "lr": 0.0007215384615384616, + "step": 2492, + "tokens_trained": 1.224875448 + }, + { + "epoch": 0.7074675554925183, + "grad_norm": 5.268981456756592, + "loss": 4.0926, + "lr": 0.0007212587412587412, + "step": 2494, + "tokens_trained": 1.225858112 + }, + { + "epoch": 0.7080348911424722, + "grad_norm": 7.387131690979004, + "loss": 4.1097, + "lr": 0.000720979020979021, + "step": 2496, + "tokens_trained": 1.226838472 + }, + { + "epoch": 0.708602226792426, + "grad_norm": 7.289080619812012, + "loss": 4.0566, + "lr": 0.0007206993006993006, + "step": 2498, + "tokens_trained": 1.227821848 + }, + { + "epoch": 0.7091695624423799, + "grad_norm": 6.981493949890137, + "loss": 4.062, + "lr": 0.0007204195804195805, + "step": 2500, + "tokens_trained": 1.228806208 + }, + { + "epoch": 0.7091695624423799, + "eval_loss": 1.0222537517547607, + "eval_runtime": 20.7945, + "step": 2500, + "tokens_trained": 1.228806208 + }, + { + "epoch": 0.7097368980923339, + "grad_norm": 6.244803428649902, + "loss": 4.1417, + "lr": 0.0007201398601398601, + "step": 2502, + "tokens_trained": 1.229787872 + }, + { + "epoch": 0.7103042337422878, + "grad_norm": 4.354197978973389, + "loss": 4.0663, + "lr": 0.0007198601398601399, + "step": 2504, + "tokens_trained": 1.23077076 + }, + { + "epoch": 0.7108715693922417, + "grad_norm": 4.971379280090332, + "loss": 4.0495, + "lr": 0.0007195804195804196, + "step": 2506, + "tokens_trained": 1.231752344 + }, + { + "epoch": 0.7114389050421955, + "grad_norm": 5.990703582763672, + "loss": 4.0837, + "lr": 0.0007193006993006994, + "step": 2508, + "tokens_trained": 1.232733864 + }, + { + "epoch": 0.7120062406921495, + "grad_norm": 8.498222351074219, + "loss": 4.0379, + "lr": 0.0007190209790209791, + "step": 2510, + "tokens_trained": 1.233716744 + }, + { + "epoch": 0.7125735763421034, + "grad_norm": 13.36562442779541, + "loss": 4.0187, + "lr": 0.0007187412587412587, + "step": 2512, + "tokens_trained": 1.234699872 + }, + { + "epoch": 0.7131409119920573, + "grad_norm": 8.733027458190918, + "loss": 4.092, + "lr": 0.0007184615384615385, + "step": 2514, + "tokens_trained": 1.235684584 + }, + { + "epoch": 0.7137082476420112, + "grad_norm": 4.150378227233887, + "loss": 4.1277, + "lr": 0.0007181818181818181, + "step": 2516, + "tokens_trained": 1.236669584 + }, + { + "epoch": 0.714275583291965, + "grad_norm": 5.051011085510254, + "loss": 4.0942, + "lr": 0.000717902097902098, + "step": 2518, + "tokens_trained": 1.237654456 + }, + { + "epoch": 0.714842918941919, + "grad_norm": 19.51820945739746, + "loss": 4.0784, + "lr": 0.0007176223776223776, + "step": 2520, + "tokens_trained": 1.238634888 + }, + { + "epoch": 0.7154102545918729, + "grad_norm": 12.287970542907715, + "loss": 4.1096, + "lr": 0.0007173426573426573, + "step": 2522, + "tokens_trained": 1.239617096 + }, + { + "epoch": 0.7159775902418268, + "grad_norm": 7.280889511108398, + "loss": 4.1173, + "lr": 0.0007170629370629371, + "step": 2524, + "tokens_trained": 1.240599456 + }, + { + "epoch": 0.7165449258917808, + "grad_norm": 7.321331024169922, + "loss": 4.1011, + "lr": 0.0007167832167832168, + "step": 2526, + "tokens_trained": 1.2415852 + }, + { + "epoch": 0.7171122615417346, + "grad_norm": 12.695849418640137, + "loss": 4.0652, + "lr": 0.0007165034965034966, + "step": 2528, + "tokens_trained": 1.242566296 + }, + { + "epoch": 0.7176795971916885, + "grad_norm": 10.30766487121582, + "loss": 4.0683, + "lr": 0.0007162237762237762, + "step": 2530, + "tokens_trained": 1.24354928 + }, + { + "epoch": 0.7182469328416424, + "grad_norm": 6.451354503631592, + "loss": 4.0712, + "lr": 0.000715944055944056, + "step": 2532, + "tokens_trained": 1.244534464 + }, + { + "epoch": 0.7188142684915964, + "grad_norm": 13.049304962158203, + "loss": 4.0662, + "lr": 0.0007156643356643356, + "step": 2534, + "tokens_trained": 1.245514976 + }, + { + "epoch": 0.7193816041415503, + "grad_norm": 6.242895603179932, + "loss": 4.089, + "lr": 0.0007153846153846155, + "step": 2536, + "tokens_trained": 1.246499648 + }, + { + "epoch": 0.7199489397915041, + "grad_norm": 9.09418773651123, + "loss": 4.0727, + "lr": 0.0007151048951048951, + "step": 2538, + "tokens_trained": 1.247482424 + }, + { + "epoch": 0.720516275441458, + "grad_norm": 5.704024791717529, + "loss": 4.0973, + "lr": 0.0007148251748251748, + "step": 2540, + "tokens_trained": 1.248465776 + }, + { + "epoch": 0.721083611091412, + "grad_norm": 1.818793535232544, + "loss": 4.0928, + "lr": 0.0007145454545454546, + "step": 2542, + "tokens_trained": 1.249446792 + }, + { + "epoch": 0.7216509467413659, + "grad_norm": 8.157804489135742, + "loss": 4.1082, + "lr": 0.0007142657342657343, + "step": 2544, + "tokens_trained": 1.25042832 + }, + { + "epoch": 0.7222182823913198, + "grad_norm": 12.176240921020508, + "loss": 4.0472, + "lr": 0.0007139860139860141, + "step": 2546, + "tokens_trained": 1.251411112 + }, + { + "epoch": 0.7227856180412736, + "grad_norm": 9.750322341918945, + "loss": 4.0892, + "lr": 0.0007137062937062937, + "step": 2548, + "tokens_trained": 1.25239148 + }, + { + "epoch": 0.7233529536912275, + "grad_norm": 7.636045455932617, + "loss": 4.0939, + "lr": 0.0007134265734265734, + "step": 2550, + "tokens_trained": 1.253374936 + }, + { + "epoch": 0.7239202893411815, + "grad_norm": 9.795125007629395, + "loss": 4.0542, + "lr": 0.0007131468531468531, + "step": 2552, + "tokens_trained": 1.254359048 + }, + { + "epoch": 0.7244876249911354, + "grad_norm": 7.851208686828613, + "loss": 4.0546, + "lr": 0.0007128671328671329, + "step": 2554, + "tokens_trained": 1.255343552 + }, + { + "epoch": 0.7250549606410893, + "grad_norm": 7.749396800994873, + "loss": 4.0834, + "lr": 0.0007125874125874126, + "step": 2556, + "tokens_trained": 1.256332976 + }, + { + "epoch": 0.7256222962910431, + "grad_norm": 7.826572418212891, + "loss": 4.0914, + "lr": 0.0007123076923076923, + "step": 2558, + "tokens_trained": 1.257315376 + }, + { + "epoch": 0.7261896319409971, + "grad_norm": 7.173867225646973, + "loss": 4.0721, + "lr": 0.0007120279720279721, + "step": 2560, + "tokens_trained": 1.258296944 + }, + { + "epoch": 0.726756967590951, + "grad_norm": 7.722167015075684, + "loss": 4.092, + "lr": 0.0007117482517482518, + "step": 2562, + "tokens_trained": 1.259278984 + }, + { + "epoch": 0.7273243032409049, + "grad_norm": 5.8100690841674805, + "loss": 4.0592, + "lr": 0.0007114685314685315, + "step": 2564, + "tokens_trained": 1.260261648 + }, + { + "epoch": 0.7278916388908588, + "grad_norm": 6.633793830871582, + "loss": 4.0871, + "lr": 0.0007111888111888112, + "step": 2566, + "tokens_trained": 1.261235168 + }, + { + "epoch": 0.7284589745408127, + "grad_norm": 9.645057678222656, + "loss": 4.0707, + "lr": 0.0007109090909090909, + "step": 2568, + "tokens_trained": 1.26221864 + }, + { + "epoch": 0.7290263101907666, + "grad_norm": 8.770727157592773, + "loss": 4.0757, + "lr": 0.0007106293706293706, + "step": 2570, + "tokens_trained": 1.263199256 + }, + { + "epoch": 0.7295936458407205, + "grad_norm": 6.190083980560303, + "loss": 4.0911, + "lr": 0.0007103496503496504, + "step": 2572, + "tokens_trained": 1.264180424 + }, + { + "epoch": 0.7301609814906744, + "grad_norm": 11.070337295532227, + "loss": 4.0566, + "lr": 0.0007100699300699301, + "step": 2574, + "tokens_trained": 1.265164384 + }, + { + "epoch": 0.7307283171406284, + "grad_norm": 8.301725387573242, + "loss": 4.0636, + "lr": 0.0007097902097902098, + "step": 2576, + "tokens_trained": 1.266148592 + }, + { + "epoch": 0.7312956527905822, + "grad_norm": 5.524992942810059, + "loss": 4.0974, + "lr": 0.0007095104895104895, + "step": 2578, + "tokens_trained": 1.26712948 + }, + { + "epoch": 0.7318629884405361, + "grad_norm": 11.42268180847168, + "loss": 4.0858, + "lr": 0.0007092307692307692, + "step": 2580, + "tokens_trained": 1.268107968 + }, + { + "epoch": 0.73243032409049, + "grad_norm": 6.110471725463867, + "loss": 4.0563, + "lr": 0.000708951048951049, + "step": 2582, + "tokens_trained": 1.26909272 + }, + { + "epoch": 0.732997659740444, + "grad_norm": 4.583469867706299, + "loss": 4.0907, + "lr": 0.0007086713286713287, + "step": 2584, + "tokens_trained": 1.270074432 + }, + { + "epoch": 0.7335649953903979, + "grad_norm": 4.348790645599365, + "loss": 4.0768, + "lr": 0.0007083916083916084, + "step": 2586, + "tokens_trained": 1.271059184 + }, + { + "epoch": 0.7341323310403517, + "grad_norm": 9.383113861083984, + "loss": 4.0829, + "lr": 0.000708111888111888, + "step": 2588, + "tokens_trained": 1.272044288 + }, + { + "epoch": 0.7346996666903056, + "grad_norm": 8.594022750854492, + "loss": 4.097, + "lr": 0.0007078321678321679, + "step": 2590, + "tokens_trained": 1.273026808 + }, + { + "epoch": 0.7352670023402595, + "grad_norm": 8.971443176269531, + "loss": 4.0689, + "lr": 0.0007075524475524475, + "step": 2592, + "tokens_trained": 1.274011272 + }, + { + "epoch": 0.7358343379902135, + "grad_norm": 14.21872615814209, + "loss": 4.0892, + "lr": 0.0007072727272727273, + "step": 2594, + "tokens_trained": 1.274995728 + }, + { + "epoch": 0.7364016736401674, + "grad_norm": 5.579262733459473, + "loss": 4.1151, + "lr": 0.000706993006993007, + "step": 2596, + "tokens_trained": 1.27598244 + }, + { + "epoch": 0.7369690092901212, + "grad_norm": 7.760303974151611, + "loss": 4.0923, + "lr": 0.0007067132867132867, + "step": 2598, + "tokens_trained": 1.276966176 + }, + { + "epoch": 0.7375363449400751, + "grad_norm": 8.493928909301758, + "loss": 4.1002, + "lr": 0.0007064335664335665, + "step": 2600, + "tokens_trained": 1.277946064 + }, + { + "epoch": 0.7381036805900291, + "grad_norm": 7.7460126876831055, + "loss": 4.0464, + "lr": 0.0007061538461538462, + "step": 2602, + "tokens_trained": 1.278928016 + }, + { + "epoch": 0.738671016239983, + "grad_norm": 14.752384185791016, + "loss": 4.0694, + "lr": 0.0007058741258741259, + "step": 2604, + "tokens_trained": 1.27991464 + }, + { + "epoch": 0.7392383518899369, + "grad_norm": 4.13566255569458, + "loss": 4.0852, + "lr": 0.0007055944055944055, + "step": 2606, + "tokens_trained": 1.280898424 + }, + { + "epoch": 0.7398056875398907, + "grad_norm": 9.910110473632812, + "loss": 4.0819, + "lr": 0.0007053146853146854, + "step": 2608, + "tokens_trained": 1.281880448 + }, + { + "epoch": 0.7403730231898447, + "grad_norm": 8.776302337646484, + "loss": 4.0908, + "lr": 0.000705034965034965, + "step": 2610, + "tokens_trained": 1.282866224 + }, + { + "epoch": 0.7409403588397986, + "grad_norm": 7.437447547912598, + "loss": 4.0914, + "lr": 0.0007047552447552448, + "step": 2612, + "tokens_trained": 1.283846848 + }, + { + "epoch": 0.7415076944897525, + "grad_norm": 5.371145248413086, + "loss": 4.0601, + "lr": 0.0007044755244755245, + "step": 2614, + "tokens_trained": 1.284828288 + }, + { + "epoch": 0.7420750301397064, + "grad_norm": 5.754990100860596, + "loss": 4.034, + "lr": 0.0007041958041958041, + "step": 2616, + "tokens_trained": 1.285813632 + }, + { + "epoch": 0.7426423657896603, + "grad_norm": 12.21330738067627, + "loss": 4.0893, + "lr": 0.000703916083916084, + "step": 2618, + "tokens_trained": 1.286796048 + }, + { + "epoch": 0.7432097014396142, + "grad_norm": 6.313106060028076, + "loss": 4.1348, + "lr": 0.0007036363636363636, + "step": 2620, + "tokens_trained": 1.287779984 + }, + { + "epoch": 0.7437770370895681, + "grad_norm": 3.671832323074341, + "loss": 4.0892, + "lr": 0.0007033566433566434, + "step": 2622, + "tokens_trained": 1.288763704 + }, + { + "epoch": 0.744344372739522, + "grad_norm": 7.610039710998535, + "loss": 4.0544, + "lr": 0.000703076923076923, + "step": 2624, + "tokens_trained": 1.289748608 + }, + { + "epoch": 0.7446280405644989, + "eval_loss": 1.0216281414031982, + "eval_runtime": 21.3239, + "step": 2625, + "tokens_trained": 1.290237248 + }, + { + "epoch": 0.744911708389476, + "grad_norm": 10.805936813354492, + "loss": 4.0702, + "lr": 0.0007027972027972029, + "step": 2626, + "tokens_trained": 1.290726104 + }, + { + "epoch": 0.7454790440394298, + "grad_norm": 8.497400283813477, + "loss": 4.056, + "lr": 0.0007025174825174825, + "step": 2628, + "tokens_trained": 1.291710888 + }, + { + "epoch": 0.7460463796893837, + "grad_norm": 7.71652364730835, + "loss": 4.0428, + "lr": 0.0007022377622377623, + "step": 2630, + "tokens_trained": 1.2926998 + }, + { + "epoch": 0.7466137153393376, + "grad_norm": 11.314064979553223, + "loss": 4.0442, + "lr": 0.000701958041958042, + "step": 2632, + "tokens_trained": 1.293681648 + }, + { + "epoch": 0.7471810509892916, + "grad_norm": 8.498956680297852, + "loss": 4.0806, + "lr": 0.0007016783216783216, + "step": 2634, + "tokens_trained": 1.29466332 + }, + { + "epoch": 0.7477483866392455, + "grad_norm": 8.315062522888184, + "loss": 4.0496, + "lr": 0.0007013986013986015, + "step": 2636, + "tokens_trained": 1.29565108 + }, + { + "epoch": 0.7483157222891993, + "grad_norm": 7.541136264801025, + "loss": 4.0901, + "lr": 0.0007011188811188811, + "step": 2638, + "tokens_trained": 1.296633192 + }, + { + "epoch": 0.7488830579391532, + "grad_norm": 5.977221965789795, + "loss": 4.0612, + "lr": 0.0007008391608391609, + "step": 2640, + "tokens_trained": 1.297621272 + }, + { + "epoch": 0.7494503935891071, + "grad_norm": 5.02126932144165, + "loss": 4.0944, + "lr": 0.0007005594405594405, + "step": 2642, + "tokens_trained": 1.298601744 + }, + { + "epoch": 0.7500177292390611, + "grad_norm": 6.345284938812256, + "loss": 4.0578, + "lr": 0.0007002797202797204, + "step": 2644, + "tokens_trained": 1.299583072 + }, + { + "epoch": 0.750585064889015, + "grad_norm": 7.036267280578613, + "loss": 4.0472, + "lr": 0.0007, + "step": 2646, + "tokens_trained": 1.300567448 + }, + { + "epoch": 0.7511524005389689, + "grad_norm": 2.7125253677368164, + "loss": 4.0534, + "lr": 0.0006997202797202797, + "step": 2648, + "tokens_trained": 1.301554096 + }, + { + "epoch": 0.7517197361889227, + "grad_norm": 3.862492322921753, + "loss": 4.0696, + "lr": 0.0006994405594405595, + "step": 2650, + "tokens_trained": 1.302540112 + }, + { + "epoch": 0.7522870718388767, + "grad_norm": 2.0384063720703125, + "loss": 4.0662, + "lr": 0.0006991608391608391, + "step": 2652, + "tokens_trained": 1.30352596 + }, + { + "epoch": 0.7528544074888306, + "grad_norm": 5.195199966430664, + "loss": 4.0819, + "lr": 0.000698881118881119, + "step": 2654, + "tokens_trained": 1.30450616 + }, + { + "epoch": 0.7534217431387845, + "grad_norm": 14.55208969116211, + "loss": 4.0757, + "lr": 0.0006986013986013986, + "step": 2656, + "tokens_trained": 1.305488752 + }, + { + "epoch": 0.7539890787887384, + "grad_norm": 10.982531547546387, + "loss": 4.0474, + "lr": 0.0006983216783216784, + "step": 2658, + "tokens_trained": 1.306474856 + }, + { + "epoch": 0.7545564144386923, + "grad_norm": 7.926928997039795, + "loss": 4.0497, + "lr": 0.000698041958041958, + "step": 2660, + "tokens_trained": 1.307456136 + }, + { + "epoch": 0.7551237500886462, + "grad_norm": 5.156681537628174, + "loss": 4.098, + "lr": 0.0006977622377622378, + "step": 2662, + "tokens_trained": 1.308442664 + }, + { + "epoch": 0.7556910857386001, + "grad_norm": 8.156705856323242, + "loss": 4.0828, + "lr": 0.0006974825174825175, + "step": 2664, + "tokens_trained": 1.309422976 + }, + { + "epoch": 0.756258421388554, + "grad_norm": 8.489871978759766, + "loss": 4.0668, + "lr": 0.0006972027972027972, + "step": 2666, + "tokens_trained": 1.310406152 + }, + { + "epoch": 0.756825757038508, + "grad_norm": 13.065528869628906, + "loss": 4.0915, + "lr": 0.000696923076923077, + "step": 2668, + "tokens_trained": 1.311392576 + }, + { + "epoch": 0.7573930926884618, + "grad_norm": 7.475847244262695, + "loss": 4.0308, + "lr": 0.0006966433566433566, + "step": 2670, + "tokens_trained": 1.312378776 + }, + { + "epoch": 0.7579604283384157, + "grad_norm": 7.049544334411621, + "loss": 4.0662, + "lr": 0.0006963636363636365, + "step": 2672, + "tokens_trained": 1.313358848 + }, + { + "epoch": 0.7585277639883696, + "grad_norm": 5.037269115447998, + "loss": 4.1016, + "lr": 0.0006960839160839161, + "step": 2674, + "tokens_trained": 1.3143412 + }, + { + "epoch": 0.7590950996383236, + "grad_norm": 10.421965599060059, + "loss": 4.0655, + "lr": 0.0006958041958041958, + "step": 2676, + "tokens_trained": 1.315322968 + }, + { + "epoch": 0.7596624352882775, + "grad_norm": 8.08486557006836, + "loss": 4.0933, + "lr": 0.0006955244755244755, + "step": 2678, + "tokens_trained": 1.316306592 + }, + { + "epoch": 0.7602297709382313, + "grad_norm": 10.121665954589844, + "loss": 4.0673, + "lr": 0.0006952447552447553, + "step": 2680, + "tokens_trained": 1.317292536 + }, + { + "epoch": 0.7607971065881852, + "grad_norm": 4.840561389923096, + "loss": 4.089, + "lr": 0.000694965034965035, + "step": 2682, + "tokens_trained": 1.318278512 + }, + { + "epoch": 0.7613644422381391, + "grad_norm": 5.03504753112793, + "loss": 4.0696, + "lr": 0.0006946853146853147, + "step": 2684, + "tokens_trained": 1.319263032 + }, + { + "epoch": 0.7619317778880931, + "grad_norm": 12.180596351623535, + "loss": 4.1166, + "lr": 0.0006944055944055943, + "step": 2686, + "tokens_trained": 1.320252752 + }, + { + "epoch": 0.762499113538047, + "grad_norm": 8.842597007751465, + "loss": 4.0946, + "lr": 0.0006941258741258741, + "step": 2688, + "tokens_trained": 1.321239648 + }, + { + "epoch": 0.7630664491880008, + "grad_norm": 4.742710113525391, + "loss": 4.0894, + "lr": 0.0006938461538461539, + "step": 2690, + "tokens_trained": 1.322224872 + }, + { + "epoch": 0.7636337848379547, + "grad_norm": 2.7827649116516113, + "loss": 4.0453, + "lr": 0.0006935664335664336, + "step": 2692, + "tokens_trained": 1.323211432 + }, + { + "epoch": 0.7642011204879087, + "grad_norm": 8.263550758361816, + "loss": 4.0034, + "lr": 0.0006932867132867133, + "step": 2694, + "tokens_trained": 1.324190272 + }, + { + "epoch": 0.7647684561378626, + "grad_norm": 14.927130699157715, + "loss": 4.0243, + "lr": 0.000693006993006993, + "step": 2696, + "tokens_trained": 1.325175184 + }, + { + "epoch": 0.7653357917878165, + "grad_norm": 9.046390533447266, + "loss": 4.0646, + "lr": 0.0006927272727272728, + "step": 2698, + "tokens_trained": 1.326156856 + }, + { + "epoch": 0.7659031274377703, + "grad_norm": 7.640266418457031, + "loss": 4.0581, + "lr": 0.0006924475524475524, + "step": 2700, + "tokens_trained": 1.327134224 + }, + { + "epoch": 0.7664704630877243, + "grad_norm": 11.179667472839355, + "loss": 4.0286, + "lr": 0.0006921678321678322, + "step": 2702, + "tokens_trained": 1.328119376 + }, + { + "epoch": 0.7670377987376782, + "grad_norm": 13.961971282958984, + "loss": 4.072, + "lr": 0.0006918881118881118, + "step": 2704, + "tokens_trained": 1.329097248 + }, + { + "epoch": 0.7676051343876321, + "grad_norm": 5.873361110687256, + "loss": 4.1069, + "lr": 0.0006916083916083916, + "step": 2706, + "tokens_trained": 1.330079272 + }, + { + "epoch": 0.768172470037586, + "grad_norm": 5.7134623527526855, + "loss": 4.0483, + "lr": 0.0006913286713286714, + "step": 2708, + "tokens_trained": 1.331062968 + }, + { + "epoch": 0.7687398056875399, + "grad_norm": 8.088322639465332, + "loss": 4.0806, + "lr": 0.0006910489510489511, + "step": 2710, + "tokens_trained": 1.3320508 + }, + { + "epoch": 0.7693071413374938, + "grad_norm": 12.358318328857422, + "loss": 4.0281, + "lr": 0.0006907692307692308, + "step": 2712, + "tokens_trained": 1.333034392 + }, + { + "epoch": 0.7698744769874477, + "grad_norm": 6.448056221008301, + "loss": 4.0449, + "lr": 0.0006904895104895104, + "step": 2714, + "tokens_trained": 1.334018424 + }, + { + "epoch": 0.7704418126374016, + "grad_norm": 10.305964469909668, + "loss": 4.0611, + "lr": 0.0006902097902097903, + "step": 2716, + "tokens_trained": 1.33500044 + }, + { + "epoch": 0.7710091482873556, + "grad_norm": 8.82204532623291, + "loss": 4.0697, + "lr": 0.0006899300699300699, + "step": 2718, + "tokens_trained": 1.335985304 + }, + { + "epoch": 0.7715764839373094, + "grad_norm": 11.34217643737793, + "loss": 4.0471, + "lr": 0.0006896503496503497, + "step": 2720, + "tokens_trained": 1.336971752 + }, + { + "epoch": 0.7721438195872633, + "grad_norm": 9.843841552734375, + "loss": 4.1015, + "lr": 0.0006893706293706293, + "step": 2722, + "tokens_trained": 1.337955296 + }, + { + "epoch": 0.7727111552372172, + "grad_norm": 8.029809951782227, + "loss": 4.0432, + "lr": 0.0006890909090909091, + "step": 2724, + "tokens_trained": 1.338936912 + }, + { + "epoch": 0.7732784908871712, + "grad_norm": 8.858033180236816, + "loss": 4.0841, + "lr": 0.0006888111888111889, + "step": 2726, + "tokens_trained": 1.339920296 + }, + { + "epoch": 0.7738458265371251, + "grad_norm": 6.917725086212158, + "loss": 4.0701, + "lr": 0.0006885314685314685, + "step": 2728, + "tokens_trained": 1.340910088 + }, + { + "epoch": 0.7744131621870789, + "grad_norm": 9.695552825927734, + "loss": 4.0818, + "lr": 0.0006882517482517483, + "step": 2730, + "tokens_trained": 1.341895264 + }, + { + "epoch": 0.7749804978370328, + "grad_norm": 8.998181343078613, + "loss": 4.0734, + "lr": 0.0006879720279720279, + "step": 2732, + "tokens_trained": 1.342875544 + }, + { + "epoch": 0.7755478334869867, + "grad_norm": 7.250143527984619, + "loss": 4.0511, + "lr": 0.0006876923076923078, + "step": 2734, + "tokens_trained": 1.34386044 + }, + { + "epoch": 0.7761151691369407, + "grad_norm": 8.95149040222168, + "loss": 4.0671, + "lr": 0.0006874125874125874, + "step": 2736, + "tokens_trained": 1.344844568 + }, + { + "epoch": 0.7766825047868946, + "grad_norm": 9.469155311584473, + "loss": 4.0549, + "lr": 0.0006871328671328672, + "step": 2738, + "tokens_trained": 1.3458226 + }, + { + "epoch": 0.7772498404368484, + "grad_norm": 6.303086757659912, + "loss": 4.0808, + "lr": 0.0006868531468531468, + "step": 2740, + "tokens_trained": 1.346809256 + }, + { + "epoch": 0.7778171760868023, + "grad_norm": 6.282865524291992, + "loss": 4.0425, + "lr": 0.0006865734265734265, + "step": 2742, + "tokens_trained": 1.347790504 + }, + { + "epoch": 0.7783845117367563, + "grad_norm": 6.448110103607178, + "loss": 4.0512, + "lr": 0.0006862937062937064, + "step": 2744, + "tokens_trained": 1.348770416 + }, + { + "epoch": 0.7789518473867102, + "grad_norm": 3.967651128768921, + "loss": 4.0189, + "lr": 0.000686013986013986, + "step": 2746, + "tokens_trained": 1.34975288 + }, + { + "epoch": 0.7795191830366641, + "grad_norm": 4.253781318664551, + "loss": 4.0774, + "lr": 0.0006857342657342658, + "step": 2748, + "tokens_trained": 1.350729672 + }, + { + "epoch": 0.7800865186866179, + "grad_norm": 15.237231254577637, + "loss": 4.0929, + "lr": 0.0006854545454545454, + "step": 2750, + "tokens_trained": 1.351711184 + }, + { + "epoch": 0.7800865186866179, + "eval_loss": 1.0141865015029907, + "eval_runtime": 20.7754, + "step": 2750, + "tokens_trained": 1.351711184 + }, + { + "epoch": 0.7806538543365719, + "grad_norm": 14.367753028869629, + "loss": 4.0422, + "lr": 0.0006851748251748253, + "step": 2752, + "tokens_trained": 1.352694296 + }, + { + "epoch": 0.7812211899865258, + "grad_norm": 4.344571590423584, + "loss": 4.018, + "lr": 0.0006848951048951049, + "step": 2754, + "tokens_trained": 1.353678976 + }, + { + "epoch": 0.7817885256364797, + "grad_norm": 4.031637191772461, + "loss": 4.0568, + "lr": 0.0006846153846153846, + "step": 2756, + "tokens_trained": 1.354661624 + }, + { + "epoch": 0.7823558612864336, + "grad_norm": 11.08716106414795, + "loss": 4.0717, + "lr": 0.0006843356643356643, + "step": 2758, + "tokens_trained": 1.355644416 + }, + { + "epoch": 0.7829231969363875, + "grad_norm": 10.119296073913574, + "loss": 4.0726, + "lr": 0.000684055944055944, + "step": 2760, + "tokens_trained": 1.356625632 + }, + { + "epoch": 0.7834905325863414, + "grad_norm": 14.678930282592773, + "loss": 4.065, + "lr": 0.0006837762237762239, + "step": 2762, + "tokens_trained": 1.357605968 + }, + { + "epoch": 0.7840578682362953, + "grad_norm": 2.6932129859924316, + "loss": 4.0831, + "lr": 0.0006834965034965035, + "step": 2764, + "tokens_trained": 1.358590808 + }, + { + "epoch": 0.7846252038862492, + "grad_norm": 22.138845443725586, + "loss": 4.1011, + "lr": 0.0006832167832167833, + "step": 2766, + "tokens_trained": 1.359570928 + }, + { + "epoch": 0.7851925395362032, + "grad_norm": 17.627702713012695, + "loss": 4.1441, + "lr": 0.0006829370629370629, + "step": 2768, + "tokens_trained": 1.36055716 + }, + { + "epoch": 0.785759875186157, + "grad_norm": 9.9471435546875, + "loss": 4.122, + "lr": 0.0006826573426573427, + "step": 2770, + "tokens_trained": 1.361539352 + }, + { + "epoch": 0.7863272108361109, + "grad_norm": 11.452835083007812, + "loss": 4.0928, + "lr": 0.0006823776223776224, + "step": 2772, + "tokens_trained": 1.362519 + }, + { + "epoch": 0.7868945464860648, + "grad_norm": 15.566934585571289, + "loss": 4.0816, + "lr": 0.0006820979020979021, + "step": 2774, + "tokens_trained": 1.363505808 + }, + { + "epoch": 0.7874618821360188, + "grad_norm": 8.46238899230957, + "loss": 4.0924, + "lr": 0.0006818181818181818, + "step": 2776, + "tokens_trained": 1.364484496 + }, + { + "epoch": 0.7880292177859727, + "grad_norm": 4.6673688888549805, + "loss": 4.0732, + "lr": 0.0006815384615384615, + "step": 2778, + "tokens_trained": 1.365468696 + }, + { + "epoch": 0.7885965534359265, + "grad_norm": 10.422809600830078, + "loss": 4.0285, + "lr": 0.0006812587412587414, + "step": 2780, + "tokens_trained": 1.36645104 + }, + { + "epoch": 0.7891638890858804, + "grad_norm": 11.707451820373535, + "loss": 4.0645, + "lr": 0.000680979020979021, + "step": 2782, + "tokens_trained": 1.367433136 + }, + { + "epoch": 0.7897312247358343, + "grad_norm": 6.887526988983154, + "loss": 4.0591, + "lr": 0.0006806993006993007, + "step": 2784, + "tokens_trained": 1.368420024 + }, + { + "epoch": 0.7902985603857883, + "grad_norm": 7.914979457855225, + "loss": 4.0641, + "lr": 0.0006804195804195804, + "step": 2786, + "tokens_trained": 1.369401936 + }, + { + "epoch": 0.7908658960357422, + "grad_norm": 7.964488506317139, + "loss": 4.0462, + "lr": 0.0006801398601398602, + "step": 2788, + "tokens_trained": 1.370384896 + }, + { + "epoch": 0.791433231685696, + "grad_norm": 7.16652774810791, + "loss": 4.026, + "lr": 0.0006798601398601399, + "step": 2790, + "tokens_trained": 1.371365304 + }, + { + "epoch": 0.7920005673356499, + "grad_norm": 8.604512214660645, + "loss": 4.0407, + "lr": 0.0006795804195804196, + "step": 2792, + "tokens_trained": 1.372349584 + }, + { + "epoch": 0.7925679029856039, + "grad_norm": 6.616272449493408, + "loss": 4.0417, + "lr": 0.0006793006993006992, + "step": 2794, + "tokens_trained": 1.373330584 + }, + { + "epoch": 0.7931352386355578, + "grad_norm": 3.8474340438842773, + "loss": 4.0322, + "lr": 0.000679020979020979, + "step": 2796, + "tokens_trained": 1.374312888 + }, + { + "epoch": 0.7937025742855117, + "grad_norm": 11.628402709960938, + "loss": 4.0378, + "lr": 0.0006787412587412588, + "step": 2798, + "tokens_trained": 1.375294704 + }, + { + "epoch": 0.7942699099354655, + "grad_norm": 7.480481147766113, + "loss": 4.1031, + "lr": 0.0006784615384615385, + "step": 2800, + "tokens_trained": 1.376279072 + }, + { + "epoch": 0.7948372455854195, + "grad_norm": 6.449431896209717, + "loss": 4.0397, + "lr": 0.0006781818181818182, + "step": 2802, + "tokens_trained": 1.377265568 + }, + { + "epoch": 0.7954045812353734, + "grad_norm": 5.179644584655762, + "loss": 4.0826, + "lr": 0.0006779020979020979, + "step": 2804, + "tokens_trained": 1.378250776 + }, + { + "epoch": 0.7959719168853273, + "grad_norm": 8.918203353881836, + "loss": 4.0358, + "lr": 0.0006776223776223777, + "step": 2806, + "tokens_trained": 1.379235464 + }, + { + "epoch": 0.7965392525352812, + "grad_norm": 6.065394878387451, + "loss": 4.0754, + "lr": 0.0006773426573426574, + "step": 2808, + "tokens_trained": 1.380215248 + }, + { + "epoch": 0.797106588185235, + "grad_norm": 3.9142706394195557, + "loss": 4.0274, + "lr": 0.0006770629370629371, + "step": 2810, + "tokens_trained": 1.381197872 + }, + { + "epoch": 0.797673923835189, + "grad_norm": 12.86207103729248, + "loss": 4.0471, + "lr": 0.0006767832167832167, + "step": 2812, + "tokens_trained": 1.38218364 + }, + { + "epoch": 0.7982412594851429, + "grad_norm": 10.052533149719238, + "loss": 4.0628, + "lr": 0.0006765034965034965, + "step": 2814, + "tokens_trained": 1.383170176 + }, + { + "epoch": 0.7988085951350968, + "grad_norm": 5.910792827606201, + "loss": 4.0358, + "lr": 0.0006762237762237763, + "step": 2816, + "tokens_trained": 1.384154592 + }, + { + "epoch": 0.7993759307850508, + "grad_norm": 13.312492370605469, + "loss": 4.0694, + "lr": 0.000675944055944056, + "step": 2818, + "tokens_trained": 1.385138352 + }, + { + "epoch": 0.7999432664350046, + "grad_norm": 12.467507362365723, + "loss": 4.0705, + "lr": 0.0006756643356643357, + "step": 2820, + "tokens_trained": 1.386123232 + }, + { + "epoch": 0.8005106020849585, + "grad_norm": 4.8490824699401855, + "loss": 4.0387, + "lr": 0.0006753846153846153, + "step": 2822, + "tokens_trained": 1.387107008 + }, + { + "epoch": 0.8010779377349124, + "grad_norm": 13.596024513244629, + "loss": 4.0505, + "lr": 0.0006751048951048951, + "step": 2824, + "tokens_trained": 1.388091632 + }, + { + "epoch": 0.8016452733848664, + "grad_norm": 13.633816719055176, + "loss": 4.0894, + "lr": 0.0006748251748251748, + "step": 2826, + "tokens_trained": 1.389077456 + }, + { + "epoch": 0.8022126090348203, + "grad_norm": 4.448362827301025, + "loss": 4.0623, + "lr": 0.0006745454545454546, + "step": 2828, + "tokens_trained": 1.39006124 + }, + { + "epoch": 0.8027799446847741, + "grad_norm": 21.12818717956543, + "loss": 4.1275, + "lr": 0.0006742657342657342, + "step": 2830, + "tokens_trained": 1.391043016 + }, + { + "epoch": 0.803347280334728, + "grad_norm": 10.096168518066406, + "loss": 4.0858, + "lr": 0.000673986013986014, + "step": 2832, + "tokens_trained": 1.392026656 + }, + { + "epoch": 0.803914615984682, + "grad_norm": 4.614907264709473, + "loss": 4.0075, + "lr": 0.0006737062937062938, + "step": 2834, + "tokens_trained": 1.393006784 + }, + { + "epoch": 0.8044819516346359, + "grad_norm": 13.106852531433105, + "loss": 4.1113, + "lr": 0.0006734265734265734, + "step": 2836, + "tokens_trained": 1.393990424 + }, + { + "epoch": 0.8050492872845898, + "grad_norm": 4.287477493286133, + "loss": 4.0818, + "lr": 0.0006731468531468532, + "step": 2838, + "tokens_trained": 1.39497072 + }, + { + "epoch": 0.8056166229345436, + "grad_norm": 9.295431137084961, + "loss": 4.0652, + "lr": 0.0006728671328671328, + "step": 2840, + "tokens_trained": 1.395951488 + }, + { + "epoch": 0.8061839585844975, + "grad_norm": 12.001997947692871, + "loss": 4.1061, + "lr": 0.0006725874125874126, + "step": 2842, + "tokens_trained": 1.396933744 + }, + { + "epoch": 0.8067512942344515, + "grad_norm": 15.18830680847168, + "loss": 4.0483, + "lr": 0.0006723076923076923, + "step": 2844, + "tokens_trained": 1.397915696 + }, + { + "epoch": 0.8073186298844054, + "grad_norm": 9.936029434204102, + "loss": 4.0559, + "lr": 0.0006720279720279721, + "step": 2846, + "tokens_trained": 1.398900048 + }, + { + "epoch": 0.8078859655343593, + "grad_norm": 4.903693199157715, + "loss": 4.0474, + "lr": 0.0006717482517482517, + "step": 2848, + "tokens_trained": 1.399885336 + }, + { + "epoch": 0.8084533011843131, + "grad_norm": 6.753813743591309, + "loss": 4.0365, + "lr": 0.0006714685314685314, + "step": 2850, + "tokens_trained": 1.400867432 + }, + { + "epoch": 0.8090206368342671, + "grad_norm": 10.53545093536377, + "loss": 4.0697, + "lr": 0.0006711888111888113, + "step": 2852, + "tokens_trained": 1.401849552 + }, + { + "epoch": 0.809587972484221, + "grad_norm": 7.666012763977051, + "loss": 3.9955, + "lr": 0.0006709090909090909, + "step": 2854, + "tokens_trained": 1.402832496 + }, + { + "epoch": 0.8101553081341749, + "grad_norm": 11.65257740020752, + "loss": 4.0377, + "lr": 0.0006706293706293707, + "step": 2856, + "tokens_trained": 1.403816768 + }, + { + "epoch": 0.8107226437841288, + "grad_norm": 10.997775077819824, + "loss": 4.0145, + "lr": 0.0006703496503496503, + "step": 2858, + "tokens_trained": 1.404804968 + }, + { + "epoch": 0.8112899794340827, + "grad_norm": 3.699673652648926, + "loss": 4.1053, + "lr": 0.0006700699300699301, + "step": 2860, + "tokens_trained": 1.40578656 + }, + { + "epoch": 0.8118573150840366, + "grad_norm": 17.54732894897461, + "loss": 4.121, + "lr": 0.0006697902097902098, + "step": 2862, + "tokens_trained": 1.406773056 + }, + { + "epoch": 0.8124246507339905, + "grad_norm": 10.354470252990723, + "loss": 4.0353, + "lr": 0.0006695104895104895, + "step": 2864, + "tokens_trained": 1.407756592 + }, + { + "epoch": 0.8129919863839444, + "grad_norm": 7.760607719421387, + "loss": 4.0529, + "lr": 0.0006692307692307692, + "step": 2866, + "tokens_trained": 1.408742176 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 11.074470520019531, + "loss": 4.0223, + "lr": 0.0006689510489510489, + "step": 2868, + "tokens_trained": 1.409727856 + }, + { + "epoch": 0.8141266576838522, + "grad_norm": 12.221083641052246, + "loss": 4.0228, + "lr": 0.0006686713286713288, + "step": 2870, + "tokens_trained": 1.410712016 + }, + { + "epoch": 0.8146939933338061, + "grad_norm": 8.933589935302734, + "loss": 4.1234, + "lr": 0.0006683916083916084, + "step": 2872, + "tokens_trained": 1.411694496 + }, + { + "epoch": 0.81526132898376, + "grad_norm": 12.326020240783691, + "loss": 4.0772, + "lr": 0.0006681118881118882, + "step": 2874, + "tokens_trained": 1.412676992 + }, + { + "epoch": 0.8155449968087369, + "eval_loss": 1.015201449394226, + "eval_runtime": 20.3991, + "step": 2875, + "tokens_trained": 1.413169416 + }, + { + "epoch": 0.815828664633714, + "grad_norm": 8.320648193359375, + "loss": 4.0045, + "lr": 0.0006678321678321678, + "step": 2876, + "tokens_trained": 1.413657912 + }, + { + "epoch": 0.8163960002836679, + "grad_norm": 4.708253383636475, + "loss": 4.022, + "lr": 0.0006675524475524475, + "step": 2878, + "tokens_trained": 1.414641576 + }, + { + "epoch": 0.8169633359336217, + "grad_norm": 13.005586624145508, + "loss": 4.0305, + "lr": 0.0006672727272727273, + "step": 2880, + "tokens_trained": 1.415624992 + }, + { + "epoch": 0.8175306715835756, + "grad_norm": 8.445854187011719, + "loss": 4.0723, + "lr": 0.000666993006993007, + "step": 2882, + "tokens_trained": 1.416605936 + }, + { + "epoch": 0.8180980072335295, + "grad_norm": 5.153830528259277, + "loss": 4.0766, + "lr": 0.0006667132867132867, + "step": 2884, + "tokens_trained": 1.417593408 + }, + { + "epoch": 0.8186653428834835, + "grad_norm": 13.989762306213379, + "loss": 4.043, + "lr": 0.0006664335664335664, + "step": 2886, + "tokens_trained": 1.418577984 + }, + { + "epoch": 0.8192326785334374, + "grad_norm": 6.2893805503845215, + "loss": 4.0576, + "lr": 0.0006661538461538463, + "step": 2888, + "tokens_trained": 1.419557304 + }, + { + "epoch": 0.8198000141833912, + "grad_norm": 3.1825716495513916, + "loss": 4.0216, + "lr": 0.0006658741258741259, + "step": 2890, + "tokens_trained": 1.420538736 + }, + { + "epoch": 0.8203673498333451, + "grad_norm": 13.280265808105469, + "loss": 4.0665, + "lr": 0.0006655944055944056, + "step": 2892, + "tokens_trained": 1.421523048 + }, + { + "epoch": 0.8209346854832991, + "grad_norm": 8.963871955871582, + "loss": 4.0996, + "lr": 0.0006653146853146853, + "step": 2894, + "tokens_trained": 1.422504352 + }, + { + "epoch": 0.821502021133253, + "grad_norm": 9.463395118713379, + "loss": 4.0638, + "lr": 0.000665034965034965, + "step": 2896, + "tokens_trained": 1.423490256 + }, + { + "epoch": 0.8220693567832069, + "grad_norm": 10.848092079162598, + "loss": 4.0767, + "lr": 0.0006647552447552448, + "step": 2898, + "tokens_trained": 1.424473728 + }, + { + "epoch": 0.8226366924331607, + "grad_norm": 9.271900177001953, + "loss": 4.0675, + "lr": 0.0006644755244755245, + "step": 2900, + "tokens_trained": 1.425456216 + }, + { + "epoch": 0.8232040280831147, + "grad_norm": 8.910347938537598, + "loss": 4.031, + "lr": 0.0006641958041958042, + "step": 2902, + "tokens_trained": 1.426442408 + }, + { + "epoch": 0.8237713637330686, + "grad_norm": 6.92717981338501, + "loss": 4.1025, + "lr": 0.0006639160839160839, + "step": 2904, + "tokens_trained": 1.42742624 + }, + { + "epoch": 0.8243386993830225, + "grad_norm": 6.383159637451172, + "loss": 4.0057, + "lr": 0.0006636363636363638, + "step": 2906, + "tokens_trained": 1.428414912 + }, + { + "epoch": 0.8249060350329764, + "grad_norm": 5.782074451446533, + "loss": 4.0169, + "lr": 0.0006633566433566434, + "step": 2908, + "tokens_trained": 1.42939668 + }, + { + "epoch": 0.8254733706829303, + "grad_norm": 10.663660049438477, + "loss": 4.0504, + "lr": 0.0006630769230769231, + "step": 2910, + "tokens_trained": 1.430382648 + }, + { + "epoch": 0.8260407063328842, + "grad_norm": 11.806394577026367, + "loss": 4.065, + "lr": 0.0006627972027972028, + "step": 2912, + "tokens_trained": 1.43136304 + }, + { + "epoch": 0.8266080419828381, + "grad_norm": 5.7375617027282715, + "loss": 4.0133, + "lr": 0.0006625174825174825, + "step": 2914, + "tokens_trained": 1.432347472 + }, + { + "epoch": 0.827175377632792, + "grad_norm": 6.814542293548584, + "loss": 4.0656, + "lr": 0.0006622377622377623, + "step": 2916, + "tokens_trained": 1.433329632 + }, + { + "epoch": 0.827742713282746, + "grad_norm": 8.265726089477539, + "loss": 4.0206, + "lr": 0.000661958041958042, + "step": 2918, + "tokens_trained": 1.434312216 + }, + { + "epoch": 0.8283100489326998, + "grad_norm": 6.937063694000244, + "loss": 4.0372, + "lr": 0.0006616783216783216, + "step": 2920, + "tokens_trained": 1.435294504 + }, + { + "epoch": 0.8288773845826537, + "grad_norm": 6.773707866668701, + "loss": 4.0496, + "lr": 0.0006613986013986014, + "step": 2922, + "tokens_trained": 1.436276344 + }, + { + "epoch": 0.8294447202326076, + "grad_norm": 8.471631050109863, + "loss": 4.0834, + "lr": 0.0006611188811188812, + "step": 2924, + "tokens_trained": 1.43725852 + }, + { + "epoch": 0.8300120558825616, + "grad_norm": 10.602453231811523, + "loss": 4.0445, + "lr": 0.0006608391608391609, + "step": 2926, + "tokens_trained": 1.438239768 + }, + { + "epoch": 0.8305793915325155, + "grad_norm": 8.173192977905273, + "loss": 4.0423, + "lr": 0.0006605594405594406, + "step": 2928, + "tokens_trained": 1.43921892 + }, + { + "epoch": 0.8311467271824693, + "grad_norm": 9.510146141052246, + "loss": 4.0012, + "lr": 0.0006602797202797203, + "step": 2930, + "tokens_trained": 1.440203128 + }, + { + "epoch": 0.8317140628324232, + "grad_norm": 4.894539833068848, + "loss": 4.0574, + "lr": 0.00066, + "step": 2932, + "tokens_trained": 1.441187856 + }, + { + "epoch": 0.8322813984823771, + "grad_norm": 4.4945149421691895, + "loss": 4.0107, + "lr": 0.0006597202797202797, + "step": 2934, + "tokens_trained": 1.442164056 + }, + { + "epoch": 0.8328487341323311, + "grad_norm": 7.323387145996094, + "loss": 4.0779, + "lr": 0.0006594405594405595, + "step": 2936, + "tokens_trained": 1.44314688 + }, + { + "epoch": 0.833416069782285, + "grad_norm": 9.858680725097656, + "loss": 4.03, + "lr": 0.0006591608391608391, + "step": 2938, + "tokens_trained": 1.444127552 + }, + { + "epoch": 0.8339834054322388, + "grad_norm": 8.214831352233887, + "loss": 4.0591, + "lr": 0.0006588811188811189, + "step": 2940, + "tokens_trained": 1.445109336 + }, + { + "epoch": 0.8345507410821927, + "grad_norm": 6.628262996673584, + "loss": 4.0834, + "lr": 0.0006586013986013986, + "step": 2942, + "tokens_trained": 1.4460904 + }, + { + "epoch": 0.8351180767321467, + "grad_norm": 11.043391227722168, + "loss": 4.0516, + "lr": 0.0006583216783216784, + "step": 2944, + "tokens_trained": 1.447068776 + }, + { + "epoch": 0.8356854123821006, + "grad_norm": 8.013843536376953, + "loss": 4.0309, + "lr": 0.0006580419580419581, + "step": 2946, + "tokens_trained": 1.448046952 + }, + { + "epoch": 0.8362527480320545, + "grad_norm": 4.856717586517334, + "loss": 4.0547, + "lr": 0.0006577622377622377, + "step": 2948, + "tokens_trained": 1.449033752 + }, + { + "epoch": 0.8368200836820083, + "grad_norm": 4.799930572509766, + "loss": 4.0044, + "lr": 0.0006574825174825175, + "step": 2950, + "tokens_trained": 1.450019912 + }, + { + "epoch": 0.8373874193319623, + "grad_norm": 8.492339134216309, + "loss": 4.0368, + "lr": 0.0006572027972027972, + "step": 2952, + "tokens_trained": 1.451002976 + }, + { + "epoch": 0.8379547549819162, + "grad_norm": 7.098823547363281, + "loss": 3.9807, + "lr": 0.000656923076923077, + "step": 2954, + "tokens_trained": 1.45198412 + }, + { + "epoch": 0.8385220906318701, + "grad_norm": 8.705301284790039, + "loss": 4.0749, + "lr": 0.0006566433566433566, + "step": 2956, + "tokens_trained": 1.452963832 + }, + { + "epoch": 0.839089426281824, + "grad_norm": 2.8292014598846436, + "loss": 4.0241, + "lr": 0.0006563636363636364, + "step": 2958, + "tokens_trained": 1.453947688 + }, + { + "epoch": 0.8396567619317779, + "grad_norm": 3.7414586544036865, + "loss": 4.0554, + "lr": 0.0006560839160839161, + "step": 2960, + "tokens_trained": 1.45492676 + }, + { + "epoch": 0.8402240975817318, + "grad_norm": 11.956228256225586, + "loss": 4.0343, + "lr": 0.0006558041958041958, + "step": 2962, + "tokens_trained": 1.455907464 + }, + { + "epoch": 0.8407914332316857, + "grad_norm": 11.086222648620605, + "loss": 4.0324, + "lr": 0.0006555244755244756, + "step": 2964, + "tokens_trained": 1.456891688 + }, + { + "epoch": 0.8413587688816396, + "grad_norm": 8.380780220031738, + "loss": 4.0335, + "lr": 0.0006552447552447552, + "step": 2966, + "tokens_trained": 1.457880016 + }, + { + "epoch": 0.8419261045315936, + "grad_norm": 8.568910598754883, + "loss": 4.0431, + "lr": 0.000654965034965035, + "step": 2968, + "tokens_trained": 1.458866944 + }, + { + "epoch": 0.8424934401815474, + "grad_norm": 10.840734481811523, + "loss": 4.0275, + "lr": 0.0006546853146853147, + "step": 2970, + "tokens_trained": 1.459849096 + }, + { + "epoch": 0.8430607758315013, + "grad_norm": 5.364732265472412, + "loss": 4.0464, + "lr": 0.0006544055944055945, + "step": 2972, + "tokens_trained": 1.460833976 + }, + { + "epoch": 0.8436281114814552, + "grad_norm": 8.918869018554688, + "loss": 4.0501, + "lr": 0.0006541258741258741, + "step": 2974, + "tokens_trained": 1.461811472 + }, + { + "epoch": 0.8441954471314091, + "grad_norm": 10.94211483001709, + "loss": 4.0284, + "lr": 0.0006538461538461538, + "step": 2976, + "tokens_trained": 1.462798528 + }, + { + "epoch": 0.8447627827813631, + "grad_norm": 14.475136756896973, + "loss": 4.0597, + "lr": 0.0006535664335664336, + "step": 2978, + "tokens_trained": 1.46378116 + }, + { + "epoch": 0.8453301184313169, + "grad_norm": 8.219613075256348, + "loss": 4.0499, + "lr": 0.0006532867132867133, + "step": 2980, + "tokens_trained": 1.464758752 + }, + { + "epoch": 0.8458974540812708, + "grad_norm": 8.898524284362793, + "loss": 4.0472, + "lr": 0.0006530069930069931, + "step": 2982, + "tokens_trained": 1.465737992 + }, + { + "epoch": 0.8464647897312247, + "grad_norm": 6.673952579498291, + "loss": 3.9971, + "lr": 0.0006527272727272727, + "step": 2984, + "tokens_trained": 1.466724672 + }, + { + "epoch": 0.8470321253811787, + "grad_norm": 6.514251708984375, + "loss": 4.0245, + "lr": 0.0006524475524475524, + "step": 2986, + "tokens_trained": 1.46770572 + }, + { + "epoch": 0.8475994610311326, + "grad_norm": 8.130202293395996, + "loss": 4.0332, + "lr": 0.0006521678321678322, + "step": 2988, + "tokens_trained": 1.468690624 + }, + { + "epoch": 0.8481667966810864, + "grad_norm": 4.283686637878418, + "loss": 4.0551, + "lr": 0.0006518881118881119, + "step": 2990, + "tokens_trained": 1.469674696 + }, + { + "epoch": 0.8487341323310403, + "grad_norm": 4.8144426345825195, + "loss": 4.0408, + "lr": 0.0006516083916083916, + "step": 2992, + "tokens_trained": 1.470659816 + }, + { + "epoch": 0.8493014679809943, + "grad_norm": 11.117393493652344, + "loss": 4.0423, + "lr": 0.0006513286713286713, + "step": 2994, + "tokens_trained": 1.47164192 + }, + { + "epoch": 0.8498688036309482, + "grad_norm": 8.022162437438965, + "loss": 4.064, + "lr": 0.0006510489510489511, + "step": 2996, + "tokens_trained": 1.472624344 + }, + { + "epoch": 0.8504361392809021, + "grad_norm": 5.267605304718018, + "loss": 3.9804, + "lr": 0.0006507692307692308, + "step": 2998, + "tokens_trained": 1.473606552 + }, + { + "epoch": 0.8510034749308559, + "grad_norm": 9.365017890930176, + "loss": 4.0223, + "lr": 0.0006504895104895106, + "step": 3000, + "tokens_trained": 1.474586552 + }, + { + "epoch": 0.8510034749308559, + "eval_loss": 1.0078805685043335, + "eval_runtime": 20.7752, + "step": 3000, + "tokens_trained": 1.474586552 + }, + { + "epoch": 0.8515708105808099, + "grad_norm": 10.311480522155762, + "loss": 3.969, + "lr": 0.0006502097902097902, + "step": 3002, + "tokens_trained": 1.475564304 + }, + { + "epoch": 0.8521381462307638, + "grad_norm": 5.622078895568848, + "loss": 3.9803, + "lr": 0.0006499300699300699, + "step": 3004, + "tokens_trained": 1.476547088 + }, + { + "epoch": 0.8527054818807177, + "grad_norm": 6.005502223968506, + "loss": 4.0584, + "lr": 0.0006496503496503497, + "step": 3006, + "tokens_trained": 1.477531352 + }, + { + "epoch": 0.8532728175306716, + "grad_norm": 5.769370079040527, + "loss": 4.0332, + "lr": 0.0006493706293706294, + "step": 3008, + "tokens_trained": 1.478512136 + }, + { + "epoch": 0.8538401531806254, + "grad_norm": 4.246579647064209, + "loss": 3.9848, + "lr": 0.0006490909090909091, + "step": 3010, + "tokens_trained": 1.47949464 + }, + { + "epoch": 0.8544074888305794, + "grad_norm": 3.3972086906433105, + "loss": 3.9969, + "lr": 0.0006488111888111888, + "step": 3012, + "tokens_trained": 1.4804812 + }, + { + "epoch": 0.8549748244805333, + "grad_norm": 4.793631553649902, + "loss": 3.9748, + "lr": 0.0006485314685314685, + "step": 3014, + "tokens_trained": 1.481469176 + }, + { + "epoch": 0.8555421601304872, + "grad_norm": 7.709076881408691, + "loss": 4.0399, + "lr": 0.0006482517482517483, + "step": 3016, + "tokens_trained": 1.482450232 + }, + { + "epoch": 0.8561094957804412, + "grad_norm": 9.06294059753418, + "loss": 4.0279, + "lr": 0.000647972027972028, + "step": 3018, + "tokens_trained": 1.48343416 + }, + { + "epoch": 0.856676831430395, + "grad_norm": 7.496627330780029, + "loss": 4.047, + "lr": 0.0006476923076923077, + "step": 3020, + "tokens_trained": 1.484423072 + }, + { + "epoch": 0.8572441670803489, + "grad_norm": 6.635293006896973, + "loss": 4.0583, + "lr": 0.0006474125874125874, + "step": 3022, + "tokens_trained": 1.485406296 + }, + { + "epoch": 0.8578115027303028, + "grad_norm": 6.3066864013671875, + "loss": 3.9902, + "lr": 0.0006471328671328672, + "step": 3024, + "tokens_trained": 1.486391472 + }, + { + "epoch": 0.8583788383802567, + "grad_norm": 1.1249172687530518, + "loss": 4.0032, + "lr": 0.0006468531468531469, + "step": 3026, + "tokens_trained": 1.487377128 + }, + { + "epoch": 0.8589461740302107, + "grad_norm": 2.966470241546631, + "loss": 3.9859, + "lr": 0.0006465734265734265, + "step": 3028, + "tokens_trained": 1.488359656 + }, + { + "epoch": 0.8595135096801645, + "grad_norm": 6.611581325531006, + "loss": 4.0259, + "lr": 0.0006462937062937063, + "step": 3030, + "tokens_trained": 1.489340552 + }, + { + "epoch": 0.8600808453301184, + "grad_norm": 7.76756477355957, + "loss": 4.0223, + "lr": 0.0006460139860139859, + "step": 3032, + "tokens_trained": 1.49032648 + }, + { + "epoch": 0.8606481809800723, + "grad_norm": 10.86517333984375, + "loss": 4.0457, + "lr": 0.0006457342657342658, + "step": 3034, + "tokens_trained": 1.491312608 + }, + { + "epoch": 0.8612155166300263, + "grad_norm": 4.524630546569824, + "loss": 4.0882, + "lr": 0.0006454545454545455, + "step": 3036, + "tokens_trained": 1.49229724 + }, + { + "epoch": 0.8617828522799802, + "grad_norm": 10.601529121398926, + "loss": 4.0466, + "lr": 0.0006451748251748252, + "step": 3038, + "tokens_trained": 1.49327952 + }, + { + "epoch": 0.862350187929934, + "grad_norm": 10.691457748413086, + "loss": 4.0239, + "lr": 0.0006448951048951049, + "step": 3040, + "tokens_trained": 1.494263528 + }, + { + "epoch": 0.8629175235798879, + "grad_norm": 5.371310710906982, + "loss": 4.0864, + "lr": 0.0006446153846153846, + "step": 3042, + "tokens_trained": 1.49524708 + }, + { + "epoch": 0.8634848592298419, + "grad_norm": 5.7418999671936035, + "loss": 4.0618, + "lr": 0.0006443356643356644, + "step": 3044, + "tokens_trained": 1.496229136 + }, + { + "epoch": 0.8640521948797958, + "grad_norm": 7.521689414978027, + "loss": 4.0235, + "lr": 0.000644055944055944, + "step": 3046, + "tokens_trained": 1.497212944 + }, + { + "epoch": 0.8646195305297497, + "grad_norm": 6.966773509979248, + "loss": 4.0187, + "lr": 0.0006437762237762238, + "step": 3048, + "tokens_trained": 1.498198992 + }, + { + "epoch": 0.8651868661797035, + "grad_norm": 12.514280319213867, + "loss": 4.0306, + "lr": 0.0006434965034965034, + "step": 3050, + "tokens_trained": 1.499181312 + }, + { + "epoch": 0.8657542018296575, + "grad_norm": 4.849910736083984, + "loss": 4.033, + "lr": 0.0006432167832167833, + "step": 3052, + "tokens_trained": 1.500163288 + }, + { + "epoch": 0.8663215374796114, + "grad_norm": 9.553950309753418, + "loss": 4.0465, + "lr": 0.000642937062937063, + "step": 3054, + "tokens_trained": 1.501147464 + }, + { + "epoch": 0.8668888731295653, + "grad_norm": 8.58786678314209, + "loss": 4.0584, + "lr": 0.0006426573426573426, + "step": 3056, + "tokens_trained": 1.50212956 + }, + { + "epoch": 0.8674562087795192, + "grad_norm": 11.174147605895996, + "loss": 4.0152, + "lr": 0.0006423776223776224, + "step": 3058, + "tokens_trained": 1.503112168 + }, + { + "epoch": 0.868023544429473, + "grad_norm": 1.879528522491455, + "loss": 3.999, + "lr": 0.0006420979020979021, + "step": 3060, + "tokens_trained": 1.504099584 + }, + { + "epoch": 0.868590880079427, + "grad_norm": 19.370494842529297, + "loss": 4.1039, + "lr": 0.0006418181818181819, + "step": 3062, + "tokens_trained": 1.50508356 + }, + { + "epoch": 0.8691582157293809, + "grad_norm": 10.598268508911133, + "loss": 4.0542, + "lr": 0.0006415384615384615, + "step": 3064, + "tokens_trained": 1.506063304 + }, + { + "epoch": 0.8697255513793348, + "grad_norm": 8.537477493286133, + "loss": 4.0529, + "lr": 0.0006412587412587413, + "step": 3066, + "tokens_trained": 1.507046368 + }, + { + "epoch": 0.8702928870292888, + "grad_norm": 8.395747184753418, + "loss": 3.9941, + "lr": 0.0006409790209790209, + "step": 3068, + "tokens_trained": 1.508029128 + }, + { + "epoch": 0.8708602226792426, + "grad_norm": 5.918806552886963, + "loss": 4.0078, + "lr": 0.0006406993006993007, + "step": 3070, + "tokens_trained": 1.5090132 + }, + { + "epoch": 0.8714275583291965, + "grad_norm": 3.845099925994873, + "loss": 4.0564, + "lr": 0.0006404195804195805, + "step": 3072, + "tokens_trained": 1.509994832 + }, + { + "epoch": 0.8719948939791504, + "grad_norm": 3.3807923793792725, + "loss": 4.0438, + "lr": 0.0006401398601398601, + "step": 3074, + "tokens_trained": 1.510975552 + }, + { + "epoch": 0.8725622296291043, + "grad_norm": 4.468081951141357, + "loss": 4.066, + "lr": 0.0006398601398601399, + "step": 3076, + "tokens_trained": 1.511959576 + }, + { + "epoch": 0.8731295652790583, + "grad_norm": 1.8455613851547241, + "loss": 4.0247, + "lr": 0.0006395804195804196, + "step": 3078, + "tokens_trained": 1.512939112 + }, + { + "epoch": 0.8736969009290121, + "grad_norm": 7.184399127960205, + "loss": 4.081, + "lr": 0.0006393006993006994, + "step": 3080, + "tokens_trained": 1.513924792 + }, + { + "epoch": 0.874264236578966, + "grad_norm": 8.416154861450195, + "loss": 4.0372, + "lr": 0.000639020979020979, + "step": 3082, + "tokens_trained": 1.514905096 + }, + { + "epoch": 0.8748315722289199, + "grad_norm": 6.620309829711914, + "loss": 4.0822, + "lr": 0.0006387412587412587, + "step": 3084, + "tokens_trained": 1.51588724 + }, + { + "epoch": 0.8753989078788739, + "grad_norm": 7.424724102020264, + "loss": 4.053, + "lr": 0.0006384615384615384, + "step": 3086, + "tokens_trained": 1.516871792 + }, + { + "epoch": 0.8759662435288278, + "grad_norm": 7.8764448165893555, + "loss": 4.059, + "lr": 0.0006381818181818182, + "step": 3088, + "tokens_trained": 1.517857872 + }, + { + "epoch": 0.8765335791787816, + "grad_norm": 7.330927848815918, + "loss": 4.0182, + "lr": 0.000637902097902098, + "step": 3090, + "tokens_trained": 1.518840616 + }, + { + "epoch": 0.8771009148287355, + "grad_norm": 8.612639427185059, + "loss": 4.0181, + "lr": 0.0006376223776223776, + "step": 3092, + "tokens_trained": 1.519826616 + }, + { + "epoch": 0.8776682504786895, + "grad_norm": 9.889811515808105, + "loss": 4.0434, + "lr": 0.0006373426573426574, + "step": 3094, + "tokens_trained": 1.520805784 + }, + { + "epoch": 0.8782355861286434, + "grad_norm": 5.421345233917236, + "loss": 4.0237, + "lr": 0.0006370629370629371, + "step": 3096, + "tokens_trained": 1.521789344 + }, + { + "epoch": 0.8788029217785973, + "grad_norm": 4.9160990715026855, + "loss": 4.0497, + "lr": 0.0006367832167832168, + "step": 3098, + "tokens_trained": 1.522772664 + }, + { + "epoch": 0.8793702574285511, + "grad_norm": 8.828028678894043, + "loss": 4.0381, + "lr": 0.0006365034965034965, + "step": 3100, + "tokens_trained": 1.523755712 + }, + { + "epoch": 0.879937593078505, + "grad_norm": 5.6704182624816895, + "loss": 4.0017, + "lr": 0.0006362237762237762, + "step": 3102, + "tokens_trained": 1.52473876 + }, + { + "epoch": 0.880504928728459, + "grad_norm": 4.982235908508301, + "loss": 3.9826, + "lr": 0.0006359440559440559, + "step": 3104, + "tokens_trained": 1.52571756 + }, + { + "epoch": 0.8810722643784129, + "grad_norm": 8.639644622802734, + "loss": 4.0177, + "lr": 0.0006356643356643357, + "step": 3106, + "tokens_trained": 1.526695632 + }, + { + "epoch": 0.8816396000283668, + "grad_norm": 6.1896820068359375, + "loss": 4.0248, + "lr": 0.0006353846153846155, + "step": 3108, + "tokens_trained": 1.527678296 + }, + { + "epoch": 0.8822069356783206, + "grad_norm": 3.787477731704712, + "loss": 4.0489, + "lr": 0.0006351048951048951, + "step": 3110, + "tokens_trained": 1.528665456 + }, + { + "epoch": 0.8827742713282746, + "grad_norm": 4.418561935424805, + "loss": 4.0422, + "lr": 0.0006348251748251748, + "step": 3112, + "tokens_trained": 1.529648584 + }, + { + "epoch": 0.8833416069782285, + "grad_norm": 8.951369285583496, + "loss": 4.028, + "lr": 0.0006345454545454546, + "step": 3114, + "tokens_trained": 1.530628808 + }, + { + "epoch": 0.8839089426281824, + "grad_norm": 4.903277397155762, + "loss": 4.0772, + "lr": 0.0006342657342657343, + "step": 3116, + "tokens_trained": 1.531612144 + }, + { + "epoch": 0.8844762782781364, + "grad_norm": 4.366726875305176, + "loss": 3.9975, + "lr": 0.000633986013986014, + "step": 3118, + "tokens_trained": 1.532595304 + }, + { + "epoch": 0.8850436139280902, + "grad_norm": 6.9316911697387695, + "loss": 4.0019, + "lr": 0.0006337062937062937, + "step": 3120, + "tokens_trained": 1.533578888 + }, + { + "epoch": 0.8856109495780441, + "grad_norm": 8.896012306213379, + "loss": 4.04, + "lr": 0.0006334265734265733, + "step": 3122, + "tokens_trained": 1.534557552 + }, + { + "epoch": 0.886178285227998, + "grad_norm": 5.350147724151611, + "loss": 4.0229, + "lr": 0.0006331468531468532, + "step": 3124, + "tokens_trained": 1.535539672 + }, + { + "epoch": 0.8864619530529749, + "eval_loss": 1.007444143295288, + "eval_runtime": 20.5976, + "step": 3125, + "tokens_trained": 1.53603052 + }, + { + "epoch": 0.886745620877952, + "grad_norm": 5.331796646118164, + "loss": 4.0331, + "lr": 0.0006328671328671329, + "step": 3126, + "tokens_trained": 1.536525432 + }, + { + "epoch": 0.8873129565279059, + "grad_norm": 11.335051536560059, + "loss": 4.041, + "lr": 0.0006325874125874126, + "step": 3128, + "tokens_trained": 1.537508928 + }, + { + "epoch": 0.8878802921778597, + "grad_norm": 8.185080528259277, + "loss": 4.0299, + "lr": 0.0006323076923076923, + "step": 3130, + "tokens_trained": 1.53848672 + }, + { + "epoch": 0.8884476278278136, + "grad_norm": 4.136550426483154, + "loss": 4.0268, + "lr": 0.0006320279720279721, + "step": 3132, + "tokens_trained": 1.5394682 + }, + { + "epoch": 0.8890149634777675, + "grad_norm": 4.993428707122803, + "loss": 3.9808, + "lr": 0.0006317482517482518, + "step": 3134, + "tokens_trained": 1.540449416 + }, + { + "epoch": 0.8895822991277215, + "grad_norm": 5.485887050628662, + "loss": 4.0201, + "lr": 0.0006314685314685314, + "step": 3136, + "tokens_trained": 1.541436136 + }, + { + "epoch": 0.8901496347776754, + "grad_norm": 4.517815589904785, + "loss": 3.9985, + "lr": 0.0006311888111888112, + "step": 3138, + "tokens_trained": 1.542421992 + }, + { + "epoch": 0.8907169704276292, + "grad_norm": 3.8219170570373535, + "loss": 4.0299, + "lr": 0.0006309090909090908, + "step": 3140, + "tokens_trained": 1.543399648 + }, + { + "epoch": 0.8912843060775831, + "grad_norm": 7.318249702453613, + "loss": 4.0377, + "lr": 0.0006306293706293707, + "step": 3142, + "tokens_trained": 1.54438384 + }, + { + "epoch": 0.8918516417275371, + "grad_norm": 9.09650707244873, + "loss": 4.0572, + "lr": 0.0006303496503496504, + "step": 3144, + "tokens_trained": 1.545367632 + }, + { + "epoch": 0.892418977377491, + "grad_norm": 6.241589069366455, + "loss": 4.025, + "lr": 0.0006300699300699301, + "step": 3146, + "tokens_trained": 1.546355136 + }, + { + "epoch": 0.8929863130274449, + "grad_norm": 6.9915385246276855, + "loss": 4.0177, + "lr": 0.0006297902097902098, + "step": 3148, + "tokens_trained": 1.547340304 + }, + { + "epoch": 0.8935536486773987, + "grad_norm": 5.599451541900635, + "loss": 3.9892, + "lr": 0.0006295104895104896, + "step": 3150, + "tokens_trained": 1.54832164 + }, + { + "epoch": 0.8941209843273527, + "grad_norm": 7.765986442565918, + "loss": 4.0232, + "lr": 0.0006292307692307693, + "step": 3152, + "tokens_trained": 1.54930228 + }, + { + "epoch": 0.8946883199773066, + "grad_norm": 10.365357398986816, + "loss": 4.0254, + "lr": 0.0006289510489510489, + "step": 3154, + "tokens_trained": 1.550282888 + }, + { + "epoch": 0.8952556556272605, + "grad_norm": 7.8539276123046875, + "loss": 4.008, + "lr": 0.0006286713286713287, + "step": 3156, + "tokens_trained": 1.551265008 + }, + { + "epoch": 0.8958229912772144, + "grad_norm": 8.106318473815918, + "loss": 4.0351, + "lr": 0.0006283916083916083, + "step": 3158, + "tokens_trained": 1.552245928 + }, + { + "epoch": 0.8963903269271682, + "grad_norm": 10.22494125366211, + "loss": 3.9873, + "lr": 0.0006281118881118882, + "step": 3160, + "tokens_trained": 1.553227848 + }, + { + "epoch": 0.8969576625771222, + "grad_norm": 2.8810367584228516, + "loss": 4.0399, + "lr": 0.0006278321678321679, + "step": 3162, + "tokens_trained": 1.554208112 + }, + { + "epoch": 0.8975249982270761, + "grad_norm": 10.036259651184082, + "loss": 4.0072, + "lr": 0.0006275524475524475, + "step": 3164, + "tokens_trained": 1.555186496 + }, + { + "epoch": 0.89809233387703, + "grad_norm": 6.596704006195068, + "loss": 4.0306, + "lr": 0.0006272727272727273, + "step": 3166, + "tokens_trained": 1.556170896 + }, + { + "epoch": 0.898659669526984, + "grad_norm": 4.411632537841797, + "loss": 4.035, + "lr": 0.000626993006993007, + "step": 3168, + "tokens_trained": 1.55715312 + }, + { + "epoch": 0.8992270051769378, + "grad_norm": 4.391601085662842, + "loss": 3.9973, + "lr": 0.0006267132867132868, + "step": 3170, + "tokens_trained": 1.558133552 + }, + { + "epoch": 0.8997943408268917, + "grad_norm": 9.456700325012207, + "loss": 4.0255, + "lr": 0.0006264335664335664, + "step": 3172, + "tokens_trained": 1.559115752 + }, + { + "epoch": 0.9003616764768456, + "grad_norm": 8.490089416503906, + "loss": 4.0368, + "lr": 0.0006261538461538462, + "step": 3174, + "tokens_trained": 1.560095384 + }, + { + "epoch": 0.9009290121267995, + "grad_norm": 7.3357744216918945, + "loss": 4.0528, + "lr": 0.0006258741258741258, + "step": 3176, + "tokens_trained": 1.561078856 + }, + { + "epoch": 0.9014963477767535, + "grad_norm": 6.7389092445373535, + "loss": 4.0457, + "lr": 0.0006255944055944057, + "step": 3178, + "tokens_trained": 1.562063936 + }, + { + "epoch": 0.9020636834267073, + "grad_norm": 7.586348056793213, + "loss": 4.0516, + "lr": 0.0006253146853146854, + "step": 3180, + "tokens_trained": 1.5630424 + }, + { + "epoch": 0.9026310190766612, + "grad_norm": 5.646294116973877, + "loss": 4.0048, + "lr": 0.000625034965034965, + "step": 3182, + "tokens_trained": 1.564028064 + }, + { + "epoch": 0.9031983547266151, + "grad_norm": 7.30889368057251, + "loss": 3.9952, + "lr": 0.0006247552447552448, + "step": 3184, + "tokens_trained": 1.565010296 + }, + { + "epoch": 0.9037656903765691, + "grad_norm": 6.234517574310303, + "loss": 4.0267, + "lr": 0.0006244755244755245, + "step": 3186, + "tokens_trained": 1.565993536 + }, + { + "epoch": 0.904333026026523, + "grad_norm": 4.630068302154541, + "loss": 4.0638, + "lr": 0.0006241958041958043, + "step": 3188, + "tokens_trained": 1.566973648 + }, + { + "epoch": 0.9049003616764768, + "grad_norm": 10.530085563659668, + "loss": 4.056, + "lr": 0.0006239160839160839, + "step": 3190, + "tokens_trained": 1.567954192 + }, + { + "epoch": 0.9054676973264307, + "grad_norm": 6.909562110900879, + "loss": 4.0297, + "lr": 0.0006236363636363636, + "step": 3192, + "tokens_trained": 1.568941888 + }, + { + "epoch": 0.9060350329763847, + "grad_norm": 3.382798910140991, + "loss": 3.9554, + "lr": 0.0006233566433566433, + "step": 3194, + "tokens_trained": 1.569926344 + }, + { + "epoch": 0.9066023686263386, + "grad_norm": 6.318317890167236, + "loss": 4.0313, + "lr": 0.0006230769230769231, + "step": 3196, + "tokens_trained": 1.570909072 + }, + { + "epoch": 0.9071697042762925, + "grad_norm": 8.904982566833496, + "loss": 4.0422, + "lr": 0.0006227972027972028, + "step": 3198, + "tokens_trained": 1.571891864 + }, + { + "epoch": 0.9077370399262463, + "grad_norm": 4.008038520812988, + "loss": 4.0254, + "lr": 0.0006225174825174825, + "step": 3200, + "tokens_trained": 1.572877488 + }, + { + "epoch": 0.9083043755762003, + "grad_norm": 4.28498649597168, + "loss": 3.9916, + "lr": 0.0006222377622377623, + "step": 3202, + "tokens_trained": 1.57385788 + }, + { + "epoch": 0.9088717112261542, + "grad_norm": 7.385266304016113, + "loss": 3.9841, + "lr": 0.000621958041958042, + "step": 3204, + "tokens_trained": 1.574841232 + }, + { + "epoch": 0.9094390468761081, + "grad_norm": 6.1430134773254395, + "loss": 3.9886, + "lr": 0.0006216783216783217, + "step": 3206, + "tokens_trained": 1.5758212 + }, + { + "epoch": 0.910006382526062, + "grad_norm": 4.640578746795654, + "loss": 4.036, + "lr": 0.0006213986013986014, + "step": 3208, + "tokens_trained": 1.576803856 + }, + { + "epoch": 0.9105737181760158, + "grad_norm": 2.6749765872955322, + "loss": 3.9934, + "lr": 0.0006211188811188811, + "step": 3210, + "tokens_trained": 1.577788136 + }, + { + "epoch": 0.9111410538259698, + "grad_norm": 2.5117337703704834, + "loss": 3.9924, + "lr": 0.0006208391608391608, + "step": 3212, + "tokens_trained": 1.5787728 + }, + { + "epoch": 0.9117083894759237, + "grad_norm": 9.552038192749023, + "loss": 4.0141, + "lr": 0.0006205594405594406, + "step": 3214, + "tokens_trained": 1.579757576 + }, + { + "epoch": 0.9122757251258776, + "grad_norm": 4.317904949188232, + "loss": 4.0242, + "lr": 0.0006202797202797203, + "step": 3216, + "tokens_trained": 1.580737776 + }, + { + "epoch": 0.9128430607758315, + "grad_norm": 4.847869873046875, + "loss": 4.0037, + "lr": 0.00062, + "step": 3218, + "tokens_trained": 1.58172144 + }, + { + "epoch": 0.9134103964257854, + "grad_norm": 8.135149002075195, + "loss": 4.056, + "lr": 0.0006197202797202797, + "step": 3220, + "tokens_trained": 1.58270064 + }, + { + "epoch": 0.9139777320757393, + "grad_norm": 4.46032190322876, + "loss": 4.0037, + "lr": 0.0006194405594405595, + "step": 3222, + "tokens_trained": 1.58368244 + }, + { + "epoch": 0.9145450677256932, + "grad_norm": 4.710826873779297, + "loss": 4.0083, + "lr": 0.0006191608391608392, + "step": 3224, + "tokens_trained": 1.584669984 + }, + { + "epoch": 0.9151124033756471, + "grad_norm": 6.524029731750488, + "loss": 4.0394, + "lr": 0.0006188811188811189, + "step": 3226, + "tokens_trained": 1.585651952 + }, + { + "epoch": 0.9156797390256011, + "grad_norm": 8.807348251342773, + "loss": 4.0215, + "lr": 0.0006186013986013986, + "step": 3228, + "tokens_trained": 1.586634416 + }, + { + "epoch": 0.9162470746755549, + "grad_norm": 8.313971519470215, + "loss": 4.048, + "lr": 0.0006183216783216783, + "step": 3230, + "tokens_trained": 1.587616352 + }, + { + "epoch": 0.9168144103255088, + "grad_norm": 7.2862868309021, + "loss": 4.0326, + "lr": 0.0006180419580419581, + "step": 3232, + "tokens_trained": 1.588597696 + }, + { + "epoch": 0.9173817459754627, + "grad_norm": 6.1933746337890625, + "loss": 4.0232, + "lr": 0.0006177622377622377, + "step": 3234, + "tokens_trained": 1.589579384 + }, + { + "epoch": 0.9179490816254167, + "grad_norm": 6.848970890045166, + "loss": 4.0134, + "lr": 0.0006174825174825175, + "step": 3236, + "tokens_trained": 1.590563936 + }, + { + "epoch": 0.9185164172753706, + "grad_norm": 6.213261604309082, + "loss": 3.9622, + "lr": 0.0006172027972027972, + "step": 3238, + "tokens_trained": 1.591546488 + }, + { + "epoch": 0.9190837529253244, + "grad_norm": 11.642724990844727, + "loss": 4.0487, + "lr": 0.000616923076923077, + "step": 3240, + "tokens_trained": 1.592528992 + }, + { + "epoch": 0.9196510885752783, + "grad_norm": 2.465311288833618, + "loss": 3.9996, + "lr": 0.0006166433566433567, + "step": 3242, + "tokens_trained": 1.593514088 + }, + { + "epoch": 0.9202184242252323, + "grad_norm": 14.788623809814453, + "loss": 4.1041, + "lr": 0.0006163636363636364, + "step": 3244, + "tokens_trained": 1.594498768 + }, + { + "epoch": 0.9207857598751862, + "grad_norm": 11.614027976989746, + "loss": 3.99, + "lr": 0.0006160839160839161, + "step": 3246, + "tokens_trained": 1.595477496 + }, + { + "epoch": 0.9213530955251401, + "grad_norm": 8.917405128479004, + "loss": 4.0626, + "lr": 0.0006158041958041957, + "step": 3248, + "tokens_trained": 1.596459208 + }, + { + "epoch": 0.9219204311750939, + "grad_norm": 9.843046188354492, + "loss": 4.0256, + "lr": 0.0006155244755244756, + "step": 3250, + "tokens_trained": 1.59744676 + }, + { + "epoch": 0.9219204311750939, + "eval_loss": 1.0055779218673706, + "eval_runtime": 20.5405, + "step": 3250, + "tokens_trained": 1.59744676 + }, + { + "epoch": 0.9224877668250479, + "grad_norm": 5.153568267822266, + "loss": 3.9596, + "lr": 0.0006152447552447552, + "step": 3252, + "tokens_trained": 1.598428968 + }, + { + "epoch": 0.9230551024750018, + "grad_norm": 3.321300745010376, + "loss": 3.969, + "lr": 0.000614965034965035, + "step": 3254, + "tokens_trained": 1.599406304 + }, + { + "epoch": 0.9236224381249557, + "grad_norm": 5.910068511962891, + "loss": 3.9806, + "lr": 0.0006146853146853147, + "step": 3256, + "tokens_trained": 1.60038644 + }, + { + "epoch": 0.9241897737749096, + "grad_norm": 9.364005088806152, + "loss": 3.9919, + "lr": 0.0006144055944055945, + "step": 3258, + "tokens_trained": 1.601371288 + }, + { + "epoch": 0.9247571094248634, + "grad_norm": 9.865127563476562, + "loss": 3.9827, + "lr": 0.0006141258741258742, + "step": 3260, + "tokens_trained": 1.602351528 + }, + { + "epoch": 0.9253244450748174, + "grad_norm": 6.053020000457764, + "loss": 3.9769, + "lr": 0.0006138461538461538, + "step": 3262, + "tokens_trained": 1.603337336 + }, + { + "epoch": 0.9258917807247713, + "grad_norm": 5.632033348083496, + "loss": 4.061, + "lr": 0.0006135664335664336, + "step": 3264, + "tokens_trained": 1.6043186 + }, + { + "epoch": 0.9264591163747252, + "grad_norm": 6.253534317016602, + "loss": 3.9414, + "lr": 0.0006132867132867132, + "step": 3266, + "tokens_trained": 1.605300448 + }, + { + "epoch": 0.9270264520246791, + "grad_norm": 7.757418632507324, + "loss": 4.0119, + "lr": 0.0006130069930069931, + "step": 3268, + "tokens_trained": 1.60628376 + }, + { + "epoch": 0.927593787674633, + "grad_norm": 5.378245830535889, + "loss": 3.9746, + "lr": 0.0006127272727272727, + "step": 3270, + "tokens_trained": 1.607265384 + }, + { + "epoch": 0.9281611233245869, + "grad_norm": 5.998968124389648, + "loss": 4.0218, + "lr": 0.0006124475524475525, + "step": 3272, + "tokens_trained": 1.60824544 + }, + { + "epoch": 0.9287284589745408, + "grad_norm": 6.340670585632324, + "loss": 4.0204, + "lr": 0.0006121678321678322, + "step": 3274, + "tokens_trained": 1.609232632 + }, + { + "epoch": 0.9292957946244947, + "grad_norm": 6.357148170471191, + "loss": 3.9686, + "lr": 0.0006118881118881118, + "step": 3276, + "tokens_trained": 1.610216024 + }, + { + "epoch": 0.9298631302744487, + "grad_norm": 4.993794918060303, + "loss": 3.9812, + "lr": 0.0006116083916083917, + "step": 3278, + "tokens_trained": 1.611196872 + }, + { + "epoch": 0.9304304659244025, + "grad_norm": 7.559938430786133, + "loss": 4.0018, + "lr": 0.0006113286713286713, + "step": 3280, + "tokens_trained": 1.612184944 + }, + { + "epoch": 0.9309978015743564, + "grad_norm": 3.7233004570007324, + "loss": 3.9835, + "lr": 0.0006110489510489511, + "step": 3282, + "tokens_trained": 1.613170464 + }, + { + "epoch": 0.9315651372243103, + "grad_norm": 7.3292717933654785, + "loss": 3.977, + "lr": 0.0006107692307692307, + "step": 3284, + "tokens_trained": 1.614153168 + }, + { + "epoch": 0.9321324728742643, + "grad_norm": 8.804302215576172, + "loss": 3.962, + "lr": 0.0006104895104895106, + "step": 3286, + "tokens_trained": 1.615134208 + }, + { + "epoch": 0.9326998085242182, + "grad_norm": 5.557953834533691, + "loss": 3.9729, + "lr": 0.0006102097902097902, + "step": 3288, + "tokens_trained": 1.616116248 + }, + { + "epoch": 0.933267144174172, + "grad_norm": 5.135542869567871, + "loss": 3.9855, + "lr": 0.0006099300699300699, + "step": 3290, + "tokens_trained": 1.617100064 + }, + { + "epoch": 0.9338344798241259, + "grad_norm": 10.206086158752441, + "loss": 4.0058, + "lr": 0.0006096503496503497, + "step": 3292, + "tokens_trained": 1.61808084 + }, + { + "epoch": 0.9344018154740799, + "grad_norm": 6.490070819854736, + "loss": 4.0328, + "lr": 0.0006093706293706293, + "step": 3294, + "tokens_trained": 1.619061608 + }, + { + "epoch": 0.9349691511240338, + "grad_norm": 6.246134281158447, + "loss": 3.9858, + "lr": 0.0006090909090909092, + "step": 3296, + "tokens_trained": 1.620046896 + }, + { + "epoch": 0.9355364867739877, + "grad_norm": 6.82793664932251, + "loss": 3.9416, + "lr": 0.0006088111888111888, + "step": 3298, + "tokens_trained": 1.621030544 + }, + { + "epoch": 0.9361038224239415, + "grad_norm": 5.400341510772705, + "loss": 4.0048, + "lr": 0.0006085314685314686, + "step": 3300, + "tokens_trained": 1.622010024 + }, + { + "epoch": 0.9366711580738954, + "grad_norm": 2.7493224143981934, + "loss": 3.9987, + "lr": 0.0006082517482517482, + "step": 3302, + "tokens_trained": 1.622992736 + }, + { + "epoch": 0.9372384937238494, + "grad_norm": 8.426931381225586, + "loss": 4.0074, + "lr": 0.000607972027972028, + "step": 3304, + "tokens_trained": 1.623977336 + }, + { + "epoch": 0.9378058293738033, + "grad_norm": 6.779547691345215, + "loss": 4.0041, + "lr": 0.0006076923076923077, + "step": 3306, + "tokens_trained": 1.624958504 + }, + { + "epoch": 0.9383731650237572, + "grad_norm": 5.38230562210083, + "loss": 4.0297, + "lr": 0.0006074125874125874, + "step": 3308, + "tokens_trained": 1.625948568 + }, + { + "epoch": 0.938940500673711, + "grad_norm": 5.785275936126709, + "loss": 4.0112, + "lr": 0.0006071328671328672, + "step": 3310, + "tokens_trained": 1.626932696 + }, + { + "epoch": 0.939507836323665, + "grad_norm": 14.610711097717285, + "loss": 3.9558, + "lr": 0.0006068531468531468, + "step": 3312, + "tokens_trained": 1.62791704 + }, + { + "epoch": 0.9400751719736189, + "grad_norm": 2.3301351070404053, + "loss": 4.0155, + "lr": 0.0006065734265734267, + "step": 3314, + "tokens_trained": 1.628900096 + }, + { + "epoch": 0.9406425076235728, + "grad_norm": 17.020362854003906, + "loss": 4.0244, + "lr": 0.0006062937062937063, + "step": 3316, + "tokens_trained": 1.629885888 + }, + { + "epoch": 0.9412098432735267, + "grad_norm": 8.809579849243164, + "loss": 4.0622, + "lr": 0.000606013986013986, + "step": 3318, + "tokens_trained": 1.630868992 + }, + { + "epoch": 0.9417771789234806, + "grad_norm": 4.908751964569092, + "loss": 4.0464, + "lr": 0.0006057342657342657, + "step": 3320, + "tokens_trained": 1.631855664 + }, + { + "epoch": 0.9423445145734345, + "grad_norm": 9.65546989440918, + "loss": 4.013, + "lr": 0.0006054545454545455, + "step": 3322, + "tokens_trained": 1.632839496 + }, + { + "epoch": 0.9429118502233884, + "grad_norm": 5.595473766326904, + "loss": 4.0371, + "lr": 0.0006051748251748252, + "step": 3324, + "tokens_trained": 1.633827536 + }, + { + "epoch": 0.9434791858733423, + "grad_norm": 10.249938011169434, + "loss": 4.0702, + "lr": 0.0006048951048951049, + "step": 3326, + "tokens_trained": 1.634811888 + }, + { + "epoch": 0.9440465215232963, + "grad_norm": 12.086007118225098, + "loss": 4.0042, + "lr": 0.0006046153846153846, + "step": 3328, + "tokens_trained": 1.635792824 + }, + { + "epoch": 0.9446138571732501, + "grad_norm": 3.0745136737823486, + "loss": 4.0355, + "lr": 0.0006043356643356643, + "step": 3330, + "tokens_trained": 1.636776176 + }, + { + "epoch": 0.945181192823204, + "grad_norm": 4.060697078704834, + "loss": 4.0016, + "lr": 0.0006040559440559441, + "step": 3332, + "tokens_trained": 1.637758008 + }, + { + "epoch": 0.9457485284731579, + "grad_norm": 7.648933410644531, + "loss": 3.9939, + "lr": 0.0006037762237762238, + "step": 3334, + "tokens_trained": 1.638744408 + }, + { + "epoch": 0.9463158641231119, + "grad_norm": 5.033253192901611, + "loss": 4.0245, + "lr": 0.0006034965034965035, + "step": 3336, + "tokens_trained": 1.639724776 + }, + { + "epoch": 0.9468831997730658, + "grad_norm": 4.653557300567627, + "loss": 4.0169, + "lr": 0.0006032167832167832, + "step": 3338, + "tokens_trained": 1.640708864 + }, + { + "epoch": 0.9474505354230196, + "grad_norm": 6.682651042938232, + "loss": 4.0062, + "lr": 0.000602937062937063, + "step": 3340, + "tokens_trained": 1.641689864 + }, + { + "epoch": 0.9480178710729735, + "grad_norm": 5.059361934661865, + "loss": 3.9681, + "lr": 0.0006026573426573426, + "step": 3342, + "tokens_trained": 1.64267264 + }, + { + "epoch": 0.9485852067229275, + "grad_norm": 4.165974140167236, + "loss": 3.9941, + "lr": 0.0006023776223776224, + "step": 3344, + "tokens_trained": 1.643655624 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 6.669079780578613, + "loss": 4.0258, + "lr": 0.0006020979020979021, + "step": 3346, + "tokens_trained": 1.644635752 + }, + { + "epoch": 0.9497198780228353, + "grad_norm": 5.924664497375488, + "loss": 4.0589, + "lr": 0.0006018181818181818, + "step": 3348, + "tokens_trained": 1.64561992 + }, + { + "epoch": 0.9502872136727891, + "grad_norm": 1.662906527519226, + "loss": 3.9894, + "lr": 0.0006015384615384616, + "step": 3350, + "tokens_trained": 1.646605552 + }, + { + "epoch": 0.950854549322743, + "grad_norm": 3.1677517890930176, + "loss": 4.0062, + "lr": 0.0006012587412587413, + "step": 3352, + "tokens_trained": 1.647587824 + }, + { + "epoch": 0.951421884972697, + "grad_norm": 5.4521918296813965, + "loss": 4.0244, + "lr": 0.000600979020979021, + "step": 3354, + "tokens_trained": 1.648566792 + }, + { + "epoch": 0.9519892206226509, + "grad_norm": 7.839843273162842, + "loss": 3.9954, + "lr": 0.0006006993006993006, + "step": 3356, + "tokens_trained": 1.6495504 + }, + { + "epoch": 0.9525565562726048, + "grad_norm": 5.340535640716553, + "loss": 3.9915, + "lr": 0.0006004195804195805, + "step": 3358, + "tokens_trained": 1.65053064 + }, + { + "epoch": 0.9531238919225586, + "grad_norm": 3.9342992305755615, + "loss": 3.9507, + "lr": 0.0006001398601398601, + "step": 3360, + "tokens_trained": 1.651516704 + }, + { + "epoch": 0.9536912275725126, + "grad_norm": 3.879631519317627, + "loss": 4.0369, + "lr": 0.0005998601398601399, + "step": 3362, + "tokens_trained": 1.652501248 + }, + { + "epoch": 0.9542585632224665, + "grad_norm": 4.699181079864502, + "loss": 4.0151, + "lr": 0.0005995804195804196, + "step": 3364, + "tokens_trained": 1.653486632 + }, + { + "epoch": 0.9548258988724204, + "grad_norm": 7.259454250335693, + "loss": 3.9855, + "lr": 0.0005993006993006993, + "step": 3366, + "tokens_trained": 1.654473488 + }, + { + "epoch": 0.9553932345223743, + "grad_norm": 6.6725029945373535, + "loss": 3.9972, + "lr": 0.0005990209790209791, + "step": 3368, + "tokens_trained": 1.655456328 + }, + { + "epoch": 0.9559605701723282, + "grad_norm": 5.077842712402344, + "loss": 3.9706, + "lr": 0.0005987412587412587, + "step": 3370, + "tokens_trained": 1.656442256 + }, + { + "epoch": 0.9565279058222821, + "grad_norm": 7.882787704467773, + "loss": 4.0581, + "lr": 0.0005984615384615385, + "step": 3372, + "tokens_trained": 1.657425912 + }, + { + "epoch": 0.957095241472236, + "grad_norm": 7.118039608001709, + "loss": 3.9939, + "lr": 0.0005981818181818181, + "step": 3374, + "tokens_trained": 1.658406184 + }, + { + "epoch": 0.9573789092972129, + "eval_loss": 1.0043113231658936, + "eval_runtime": 20.471, + "step": 3375, + "tokens_trained": 1.658898224 + }, + { + "epoch": 0.9576625771221899, + "grad_norm": 11.206400871276855, + "loss": 4.0073, + "lr": 0.000597902097902098, + "step": 3376, + "tokens_trained": 1.65938968 + }, + { + "epoch": 0.9582299127721439, + "grad_norm": 3.2221481800079346, + "loss": 3.9924, + "lr": 0.0005976223776223776, + "step": 3378, + "tokens_trained": 1.660372856 + }, + { + "epoch": 0.9587972484220977, + "grad_norm": 15.000614166259766, + "loss": 4.0361, + "lr": 0.0005973426573426574, + "step": 3380, + "tokens_trained": 1.66135512 + }, + { + "epoch": 0.9593645840720516, + "grad_norm": 13.365633964538574, + "loss": 4.0258, + "lr": 0.0005970629370629371, + "step": 3382, + "tokens_trained": 1.662332728 + }, + { + "epoch": 0.9599319197220055, + "grad_norm": 6.362198829650879, + "loss": 3.9868, + "lr": 0.0005967832167832167, + "step": 3384, + "tokens_trained": 1.663311392 + }, + { + "epoch": 0.9604992553719595, + "grad_norm": 16.104549407958984, + "loss": 3.9893, + "lr": 0.0005965034965034966, + "step": 3386, + "tokens_trained": 1.664296088 + }, + { + "epoch": 0.9610665910219134, + "grad_norm": 32.109375, + "loss": 4.0635, + "lr": 0.0005962237762237762, + "step": 3388, + "tokens_trained": 1.665278232 + }, + { + "epoch": 0.9616339266718672, + "grad_norm": 14.814417839050293, + "loss": 4.0545, + "lr": 0.000595944055944056, + "step": 3390, + "tokens_trained": 1.666262952 + }, + { + "epoch": 0.9622012623218211, + "grad_norm": 8.69149398803711, + "loss": 4.0214, + "lr": 0.0005956643356643356, + "step": 3392, + "tokens_trained": 1.66724224 + }, + { + "epoch": 0.962768597971775, + "grad_norm": 6.150435447692871, + "loss": 4.0675, + "lr": 0.0005953846153846155, + "step": 3394, + "tokens_trained": 1.668222488 + }, + { + "epoch": 0.963335933621729, + "grad_norm": 14.53095817565918, + "loss": 4.0293, + "lr": 0.0005951048951048951, + "step": 3396, + "tokens_trained": 1.66920572 + }, + { + "epoch": 0.9639032692716829, + "grad_norm": 14.750361442565918, + "loss": 4.0345, + "lr": 0.0005948251748251748, + "step": 3398, + "tokens_trained": 1.670191456 + }, + { + "epoch": 0.9644706049216367, + "grad_norm": 10.563243865966797, + "loss": 4.0796, + "lr": 0.0005945454545454546, + "step": 3400, + "tokens_trained": 1.671174992 + }, + { + "epoch": 0.9650379405715906, + "grad_norm": 14.203415870666504, + "loss": 4.0078, + "lr": 0.0005942657342657342, + "step": 3402, + "tokens_trained": 1.672159048 + }, + { + "epoch": 0.9656052762215446, + "grad_norm": 7.918346405029297, + "loss": 4.0015, + "lr": 0.0005939860139860141, + "step": 3404, + "tokens_trained": 1.6731408 + }, + { + "epoch": 0.9661726118714985, + "grad_norm": 3.3628811836242676, + "loss": 4.0656, + "lr": 0.0005937062937062937, + "step": 3406, + "tokens_trained": 1.674120472 + }, + { + "epoch": 0.9667399475214524, + "grad_norm": 13.740876197814941, + "loss": 4.0296, + "lr": 0.0005934265734265735, + "step": 3408, + "tokens_trained": 1.67510176 + }, + { + "epoch": 0.9673072831714062, + "grad_norm": 8.178666114807129, + "loss": 3.9804, + "lr": 0.0005931468531468531, + "step": 3410, + "tokens_trained": 1.676087336 + }, + { + "epoch": 0.9678746188213602, + "grad_norm": 6.31284761428833, + "loss": 3.9905, + "lr": 0.000592867132867133, + "step": 3412, + "tokens_trained": 1.677069328 + }, + { + "epoch": 0.9684419544713141, + "grad_norm": 10.166040420532227, + "loss": 3.9962, + "lr": 0.0005925874125874126, + "step": 3414, + "tokens_trained": 1.678049672 + }, + { + "epoch": 0.969009290121268, + "grad_norm": 6.166718006134033, + "loss": 3.9966, + "lr": 0.0005923076923076923, + "step": 3416, + "tokens_trained": 1.679035104 + }, + { + "epoch": 0.969576625771222, + "grad_norm": 3.7397615909576416, + "loss": 4.0323, + "lr": 0.0005920279720279721, + "step": 3418, + "tokens_trained": 1.680018424 + }, + { + "epoch": 0.9701439614211758, + "grad_norm": 12.122432708740234, + "loss": 4.0143, + "lr": 0.0005917482517482517, + "step": 3420, + "tokens_trained": 1.681001112 + }, + { + "epoch": 0.9707112970711297, + "grad_norm": 5.118746280670166, + "loss": 3.9909, + "lr": 0.0005914685314685316, + "step": 3422, + "tokens_trained": 1.681987648 + }, + { + "epoch": 0.9712786327210836, + "grad_norm": 5.810860633850098, + "loss": 3.9675, + "lr": 0.0005911888111888112, + "step": 3424, + "tokens_trained": 1.68296972 + }, + { + "epoch": 0.9718459683710375, + "grad_norm": 7.637686252593994, + "loss": 3.9976, + "lr": 0.0005909090909090909, + "step": 3426, + "tokens_trained": 1.683952 + }, + { + "epoch": 0.9724133040209915, + "grad_norm": 5.637698173522949, + "loss": 3.9829, + "lr": 0.0005906293706293706, + "step": 3428, + "tokens_trained": 1.684933912 + }, + { + "epoch": 0.9729806396709453, + "grad_norm": 2.2650809288024902, + "loss": 3.9656, + "lr": 0.0005903496503496504, + "step": 3430, + "tokens_trained": 1.685915176 + }, + { + "epoch": 0.9735479753208992, + "grad_norm": 6.0117058753967285, + "loss": 4.0575, + "lr": 0.0005900699300699301, + "step": 3432, + "tokens_trained": 1.686901184 + }, + { + "epoch": 0.9741153109708531, + "grad_norm": 8.301697731018066, + "loss": 3.9869, + "lr": 0.0005897902097902098, + "step": 3434, + "tokens_trained": 1.687886888 + }, + { + "epoch": 0.9746826466208071, + "grad_norm": 6.436981678009033, + "loss": 4.01, + "lr": 0.0005895104895104896, + "step": 3436, + "tokens_trained": 1.68886904 + }, + { + "epoch": 0.975249982270761, + "grad_norm": 4.290571212768555, + "loss": 3.9953, + "lr": 0.0005892307692307692, + "step": 3438, + "tokens_trained": 1.689850264 + }, + { + "epoch": 0.9758173179207148, + "grad_norm": 4.618532657623291, + "loss": 3.9995, + "lr": 0.000588951048951049, + "step": 3440, + "tokens_trained": 1.69083728 + }, + { + "epoch": 0.9763846535706687, + "grad_norm": 8.481820106506348, + "loss": 4.0019, + "lr": 0.0005886713286713287, + "step": 3442, + "tokens_trained": 1.691819976 + }, + { + "epoch": 0.9769519892206227, + "grad_norm": 4.643980503082275, + "loss": 3.9974, + "lr": 0.0005883916083916084, + "step": 3444, + "tokens_trained": 1.692803784 + }, + { + "epoch": 0.9775193248705766, + "grad_norm": 6.828413009643555, + "loss": 3.9886, + "lr": 0.0005881118881118881, + "step": 3446, + "tokens_trained": 1.69378512 + }, + { + "epoch": 0.9780866605205305, + "grad_norm": 7.530898094177246, + "loss": 4.0318, + "lr": 0.0005878321678321679, + "step": 3448, + "tokens_trained": 1.694768152 + }, + { + "epoch": 0.9786539961704843, + "grad_norm": 6.020658493041992, + "loss": 4.0057, + "lr": 0.0005875524475524476, + "step": 3450, + "tokens_trained": 1.695752832 + }, + { + "epoch": 0.9792213318204382, + "grad_norm": 5.292300224304199, + "loss": 3.9915, + "lr": 0.0005872727272727273, + "step": 3452, + "tokens_trained": 1.696735104 + }, + { + "epoch": 0.9797886674703922, + "grad_norm": 4.932474613189697, + "loss": 4.0163, + "lr": 0.0005869930069930069, + "step": 3454, + "tokens_trained": 1.697718208 + }, + { + "epoch": 0.9803560031203461, + "grad_norm": 4.504141807556152, + "loss": 3.9875, + "lr": 0.0005867132867132867, + "step": 3456, + "tokens_trained": 1.698697752 + }, + { + "epoch": 0.9809233387703, + "grad_norm": 4.826939582824707, + "loss": 3.9326, + "lr": 0.0005864335664335665, + "step": 3458, + "tokens_trained": 1.699672392 + }, + { + "epoch": 0.9814906744202538, + "grad_norm": 7.805232524871826, + "loss": 3.9695, + "lr": 0.0005861538461538462, + "step": 3460, + "tokens_trained": 1.700656392 + }, + { + "epoch": 0.9820580100702078, + "grad_norm": 6.857801914215088, + "loss": 3.995, + "lr": 0.0005858741258741259, + "step": 3462, + "tokens_trained": 1.701644848 + }, + { + "epoch": 0.9826253457201617, + "grad_norm": 4.32315731048584, + "loss": 3.9701, + "lr": 0.0005855944055944055, + "step": 3464, + "tokens_trained": 1.702624688 + }, + { + "epoch": 0.9831926813701156, + "grad_norm": 6.007495880126953, + "loss": 3.9887, + "lr": 0.0005853146853146854, + "step": 3466, + "tokens_trained": 1.703607376 + }, + { + "epoch": 0.9837600170200695, + "grad_norm": 4.779850006103516, + "loss": 3.9852, + "lr": 0.000585034965034965, + "step": 3468, + "tokens_trained": 1.704589808 + }, + { + "epoch": 0.9843273526700234, + "grad_norm": 4.593331336975098, + "loss": 4.0136, + "lr": 0.0005847552447552448, + "step": 3470, + "tokens_trained": 1.705573184 + }, + { + "epoch": 0.9848946883199773, + "grad_norm": 5.466218948364258, + "loss": 3.9426, + "lr": 0.0005844755244755244, + "step": 3472, + "tokens_trained": 1.706555864 + }, + { + "epoch": 0.9854620239699312, + "grad_norm": 8.283979415893555, + "loss": 3.9788, + "lr": 0.0005841958041958042, + "step": 3474, + "tokens_trained": 1.70754036 + }, + { + "epoch": 0.9860293596198851, + "grad_norm": 2.4386069774627686, + "loss": 3.9413, + "lr": 0.000583916083916084, + "step": 3476, + "tokens_trained": 1.708525528 + }, + { + "epoch": 0.9865966952698391, + "grad_norm": 4.485580921173096, + "loss": 3.9695, + "lr": 0.0005836363636363636, + "step": 3478, + "tokens_trained": 1.709508232 + }, + { + "epoch": 0.9871640309197929, + "grad_norm": 6.725922584533691, + "loss": 4.0084, + "lr": 0.0005833566433566434, + "step": 3480, + "tokens_trained": 1.710493288 + }, + { + "epoch": 0.9877313665697468, + "grad_norm": 5.532742023468018, + "loss": 3.9571, + "lr": 0.000583076923076923, + "step": 3482, + "tokens_trained": 1.711478792 + }, + { + "epoch": 0.9882987022197007, + "grad_norm": 5.568683624267578, + "loss": 4.0178, + "lr": 0.0005827972027972029, + "step": 3484, + "tokens_trained": 1.712464864 + }, + { + "epoch": 0.9888660378696547, + "grad_norm": 5.192487716674805, + "loss": 4.0294, + "lr": 0.0005825174825174825, + "step": 3486, + "tokens_trained": 1.713448256 + }, + { + "epoch": 0.9894333735196086, + "grad_norm": 5.584596633911133, + "loss": 3.9992, + "lr": 0.0005822377622377623, + "step": 3488, + "tokens_trained": 1.714435472 + }, + { + "epoch": 0.9900007091695624, + "grad_norm": 5.044432163238525, + "loss": 4.0119, + "lr": 0.0005819580419580419, + "step": 3490, + "tokens_trained": 1.715418784 + }, + { + "epoch": 0.9905680448195163, + "grad_norm": 3.4799540042877197, + "loss": 4.0099, + "lr": 0.0005816783216783216, + "step": 3492, + "tokens_trained": 1.716402544 + }, + { + "epoch": 0.9911353804694703, + "grad_norm": 4.949790000915527, + "loss": 3.9372, + "lr": 0.0005813986013986015, + "step": 3494, + "tokens_trained": 1.71738848 + }, + { + "epoch": 0.9917027161194242, + "grad_norm": 6.527776718139648, + "loss": 3.9938, + "lr": 0.0005811188811188811, + "step": 3496, + "tokens_trained": 1.718371984 + }, + { + "epoch": 0.9922700517693781, + "grad_norm": 5.616584300994873, + "loss": 3.9352, + "lr": 0.0005808391608391609, + "step": 3498, + "tokens_trained": 1.719358256 + }, + { + "epoch": 0.9928373874193319, + "grad_norm": 7.028440952301025, + "loss": 3.9494, + "lr": 0.0005805594405594405, + "step": 3500, + "tokens_trained": 1.720339264 + }, + { + "epoch": 0.9928373874193319, + "eval_loss": 0.999991238117218, + "eval_runtime": 20.318, + "step": 3500, + "tokens_trained": 1.720339264 + }, + { + "epoch": 0.9934047230692858, + "grad_norm": 5.338140487670898, + "loss": 3.9748, + "lr": 0.0005802797202797204, + "step": 3502, + "tokens_trained": 1.72132272 + }, + { + "epoch": 0.9939720587192398, + "grad_norm": 3.3448476791381836, + "loss": 3.96, + "lr": 0.00058, + "step": 3504, + "tokens_trained": 1.722307576 + }, + { + "epoch": 0.9945393943691937, + "grad_norm": 10.660968780517578, + "loss": 4.0199, + "lr": 0.0005797202797202797, + "step": 3506, + "tokens_trained": 1.723288472 + }, + { + "epoch": 0.9951067300191476, + "grad_norm": 7.261615753173828, + "loss": 3.9889, + "lr": 0.0005794405594405594, + "step": 3508, + "tokens_trained": 1.724272744 + }, + { + "epoch": 0.9956740656691014, + "grad_norm": 5.103553295135498, + "loss": 4.0047, + "lr": 0.0005791608391608391, + "step": 3510, + "tokens_trained": 1.725255576 + }, + { + "epoch": 0.9962414013190554, + "grad_norm": 1.5151104927062988, + "loss": 4.0228, + "lr": 0.000578881118881119, + "step": 3512, + "tokens_trained": 1.72624092 + }, + { + "epoch": 0.9968087369690093, + "grad_norm": 6.042428493499756, + "loss": 3.9699, + "lr": 0.0005786013986013986, + "step": 3514, + "tokens_trained": 1.727227176 + }, + { + "epoch": 0.9973760726189632, + "grad_norm": 10.020720481872559, + "loss": 3.9961, + "lr": 0.0005783216783216784, + "step": 3516, + "tokens_trained": 1.728205072 + }, + { + "epoch": 0.9979434082689171, + "grad_norm": 9.385619163513184, + "loss": 3.9962, + "lr": 0.000578041958041958, + "step": 3518, + "tokens_trained": 1.729187536 + }, + { + "epoch": 0.998510743918871, + "grad_norm": 1.413792371749878, + "loss": 4.0256, + "lr": 0.0005777622377622377, + "step": 3520, + "tokens_trained": 1.730168968 + }, + { + "epoch": 0.9990780795688249, + "grad_norm": 2.8461780548095703, + "loss": 3.9616, + "lr": 0.0005774825174825175, + "step": 3522, + "tokens_trained": 1.731150472 + }, + { + "epoch": 0.9996454152187788, + "grad_norm": 4.164590835571289, + "loss": 3.9786, + "lr": 0.0005772027972027972, + "step": 3524, + "tokens_trained": 1.732130536 + }, + { + "epoch": 1.0, + "grad_norm": 1.0116016864776611, + "loss": 2.5007, + "lr": 0.0005769230769230769, + "step": 3526, + "tokens_trained": 1.732744968 + }, + { + "epoch": 1.0005673356499538, + "grad_norm": 5.954165458679199, + "loss": 3.9598, + "lr": 0.0005766433566433566, + "step": 3528, + "tokens_trained": 1.733727424 + }, + { + "epoch": 1.0011346712999079, + "grad_norm": 8.648826599121094, + "loss": 3.9773, + "lr": 0.0005763636363636365, + "step": 3530, + "tokens_trained": 1.734708184 + }, + { + "epoch": 1.0017020069498617, + "grad_norm": 2.920509099960327, + "loss": 3.9745, + "lr": 0.0005760839160839161, + "step": 3532, + "tokens_trained": 1.735688616 + }, + { + "epoch": 1.0022693425998157, + "grad_norm": 9.963903427124023, + "loss": 3.9742, + "lr": 0.0005758041958041958, + "step": 3534, + "tokens_trained": 1.73667084 + }, + { + "epoch": 1.0028366782497695, + "grad_norm": 9.745009422302246, + "loss": 4.028, + "lr": 0.0005755244755244755, + "step": 3536, + "tokens_trained": 1.737656328 + }, + { + "epoch": 1.0034040138997233, + "grad_norm": 5.159154891967773, + "loss": 3.9812, + "lr": 0.0005752447552447552, + "step": 3538, + "tokens_trained": 1.738637688 + }, + { + "epoch": 1.0039713495496774, + "grad_norm": 10.829404830932617, + "loss": 3.9795, + "lr": 0.000574965034965035, + "step": 3540, + "tokens_trained": 1.739621688 + }, + { + "epoch": 1.0045386851996312, + "grad_norm": 8.493478775024414, + "loss": 3.9918, + "lr": 0.0005746853146853147, + "step": 3542, + "tokens_trained": 1.740604488 + }, + { + "epoch": 1.0051060208495852, + "grad_norm": 4.013627529144287, + "loss": 3.9928, + "lr": 0.0005744055944055944, + "step": 3544, + "tokens_trained": 1.74158764 + }, + { + "epoch": 1.005673356499539, + "grad_norm": 12.669920921325684, + "loss": 4.0114, + "lr": 0.0005741258741258741, + "step": 3546, + "tokens_trained": 1.742573592 + }, + { + "epoch": 1.0062406921494929, + "grad_norm": 6.349422931671143, + "loss": 4.0294, + "lr": 0.000573846153846154, + "step": 3548, + "tokens_trained": 1.743555672 + }, + { + "epoch": 1.006808027799447, + "grad_norm": 4.14855432510376, + "loss": 3.9963, + "lr": 0.0005735664335664336, + "step": 3550, + "tokens_trained": 1.744538384 + }, + { + "epoch": 1.0073753634494007, + "grad_norm": 9.063926696777344, + "loss": 3.9557, + "lr": 0.0005732867132867133, + "step": 3552, + "tokens_trained": 1.745523552 + }, + { + "epoch": 1.0079426990993547, + "grad_norm": 11.227505683898926, + "loss": 4.0087, + "lr": 0.000573006993006993, + "step": 3554, + "tokens_trained": 1.746510024 + }, + { + "epoch": 1.0085100347493086, + "grad_norm": 2.418097972869873, + "loss": 3.9942, + "lr": 0.0005727272727272727, + "step": 3556, + "tokens_trained": 1.747493048 + }, + { + "epoch": 1.0090773703992624, + "grad_norm": 14.376424789428711, + "loss": 3.999, + "lr": 0.0005724475524475525, + "step": 3558, + "tokens_trained": 1.748476808 + }, + { + "epoch": 1.0096447060492164, + "grad_norm": 9.035455703735352, + "loss": 4.063, + "lr": 0.0005721678321678322, + "step": 3560, + "tokens_trained": 1.749460504 + }, + { + "epoch": 1.0102120416991702, + "grad_norm": 3.8785758018493652, + "loss": 4.0269, + "lr": 0.0005718881118881118, + "step": 3562, + "tokens_trained": 1.750438936 + }, + { + "epoch": 1.0107793773491243, + "grad_norm": 15.488290786743164, + "loss": 4.0294, + "lr": 0.0005716083916083916, + "step": 3564, + "tokens_trained": 1.751420168 + }, + { + "epoch": 1.011346712999078, + "grad_norm": 10.785538673400879, + "loss": 4.0102, + "lr": 0.0005713286713286714, + "step": 3566, + "tokens_trained": 1.752405288 + }, + { + "epoch": 1.011914048649032, + "grad_norm": 5.724320888519287, + "loss": 4.0148, + "lr": 0.0005710489510489511, + "step": 3568, + "tokens_trained": 1.75338604 + }, + { + "epoch": 1.012481384298986, + "grad_norm": 11.051252365112305, + "loss": 4.022, + "lr": 0.0005707692307692308, + "step": 3570, + "tokens_trained": 1.75436632 + }, + { + "epoch": 1.0130487199489397, + "grad_norm": 10.290446281433105, + "loss": 3.9781, + "lr": 0.0005704895104895105, + "step": 3572, + "tokens_trained": 1.755349944 + }, + { + "epoch": 1.0136160555988938, + "grad_norm": 4.81416130065918, + "loss": 4.0393, + "lr": 0.0005702097902097902, + "step": 3574, + "tokens_trained": 1.756337976 + }, + { + "epoch": 1.0141833912488476, + "grad_norm": 14.237113952636719, + "loss": 4.087, + "lr": 0.0005699300699300699, + "step": 3576, + "tokens_trained": 1.75732372 + }, + { + "epoch": 1.0147507268988014, + "grad_norm": 3.973662853240967, + "loss": 3.9692, + "lr": 0.0005696503496503497, + "step": 3578, + "tokens_trained": 1.7583098 + }, + { + "epoch": 1.0153180625487555, + "grad_norm": 5.629733562469482, + "loss": 4.0003, + "lr": 0.0005693706293706293, + "step": 3580, + "tokens_trained": 1.759300416 + }, + { + "epoch": 1.0158853981987093, + "grad_norm": 7.505983352661133, + "loss": 4.011, + "lr": 0.0005690909090909091, + "step": 3582, + "tokens_trained": 1.760288632 + }, + { + "epoch": 1.0164527338486633, + "grad_norm": 5.501095294952393, + "loss": 3.994, + "lr": 0.0005688111888111889, + "step": 3584, + "tokens_trained": 1.761270328 + }, + { + "epoch": 1.0170200694986171, + "grad_norm": 4.74052619934082, + "loss": 4.0241, + "lr": 0.0005685314685314686, + "step": 3586, + "tokens_trained": 1.762252432 + }, + { + "epoch": 1.017587405148571, + "grad_norm": 8.409584045410156, + "loss": 4.0137, + "lr": 0.0005682517482517483, + "step": 3588, + "tokens_trained": 1.76323772 + }, + { + "epoch": 1.018154740798525, + "grad_norm": 5.391080379486084, + "loss": 3.9424, + "lr": 0.0005679720279720279, + "step": 3590, + "tokens_trained": 1.764220272 + }, + { + "epoch": 1.0187220764484788, + "grad_norm": 4.679509162902832, + "loss": 3.9893, + "lr": 0.0005676923076923077, + "step": 3592, + "tokens_trained": 1.765203832 + }, + { + "epoch": 1.0192894120984328, + "grad_norm": 5.354970932006836, + "loss": 4.023, + "lr": 0.0005674125874125874, + "step": 3594, + "tokens_trained": 1.76618936 + }, + { + "epoch": 1.0198567477483866, + "grad_norm": 5.1085357666015625, + "loss": 3.9995, + "lr": 0.0005671328671328672, + "step": 3596, + "tokens_trained": 1.767171216 + }, + { + "epoch": 1.0204240833983405, + "grad_norm": 3.0856151580810547, + "loss": 4.0084, + "lr": 0.0005668531468531468, + "step": 3598, + "tokens_trained": 1.76815464 + }, + { + "epoch": 1.0209914190482945, + "grad_norm": 2.330599308013916, + "loss": 3.9838, + "lr": 0.0005665734265734265, + "step": 3600, + "tokens_trained": 1.76913612 + }, + { + "epoch": 1.0215587546982483, + "grad_norm": 5.641542434692383, + "loss": 3.951, + "lr": 0.0005662937062937064, + "step": 3602, + "tokens_trained": 1.770119592 + }, + { + "epoch": 1.0221260903482023, + "grad_norm": 8.442550659179688, + "loss": 4.0088, + "lr": 0.000566013986013986, + "step": 3604, + "tokens_trained": 1.771103624 + }, + { + "epoch": 1.0226934259981562, + "grad_norm": 6.0125732421875, + "loss": 4.0243, + "lr": 0.0005657342657342658, + "step": 3606, + "tokens_trained": 1.772091496 + }, + { + "epoch": 1.02326076164811, + "grad_norm": 4.9415388107299805, + "loss": 3.9874, + "lr": 0.0005654545454545454, + "step": 3608, + "tokens_trained": 1.77307708 + }, + { + "epoch": 1.023828097298064, + "grad_norm": 5.762909889221191, + "loss": 4.0242, + "lr": 0.0005651748251748252, + "step": 3610, + "tokens_trained": 1.774058032 + }, + { + "epoch": 1.0243954329480178, + "grad_norm": 6.652433395385742, + "loss": 3.9908, + "lr": 0.0005648951048951049, + "step": 3612, + "tokens_trained": 1.775036512 + }, + { + "epoch": 1.0249627685979719, + "grad_norm": 3.539031505584717, + "loss": 3.9406, + "lr": 0.0005646153846153847, + "step": 3614, + "tokens_trained": 1.776021656 + }, + { + "epoch": 1.0255301042479257, + "grad_norm": 6.829031467437744, + "loss": 3.9839, + "lr": 0.0005643356643356643, + "step": 3616, + "tokens_trained": 1.777000824 + }, + { + "epoch": 1.0260974398978795, + "grad_norm": 3.46431040763855, + "loss": 4.0013, + "lr": 0.000564055944055944, + "step": 3618, + "tokens_trained": 1.777983504 + }, + { + "epoch": 1.0266647755478335, + "grad_norm": 5.163998126983643, + "loss": 3.9898, + "lr": 0.0005637762237762239, + "step": 3620, + "tokens_trained": 1.778966368 + }, + { + "epoch": 1.0272321111977873, + "grad_norm": 4.270689010620117, + "loss": 3.9868, + "lr": 0.0005634965034965035, + "step": 3622, + "tokens_trained": 1.77994468 + }, + { + "epoch": 1.0277994468477414, + "grad_norm": 5.297236442565918, + "loss": 3.9903, + "lr": 0.0005632167832167833, + "step": 3624, + "tokens_trained": 1.7809246 + }, + { + "epoch": 1.0280831146727183, + "eval_loss": 0.9977753162384033, + "eval_runtime": 20.5557, + "step": 3625, + "tokens_trained": 1.781418056 + }, + { + "epoch": 1.0283667824976952, + "grad_norm": 4.560519218444824, + "loss": 3.9339, + "lr": 0.0005629370629370629, + "step": 3626, + "tokens_trained": 1.781910808 + }, + { + "epoch": 1.028934118147649, + "grad_norm": 3.7894208431243896, + "loss": 3.9739, + "lr": 0.0005626573426573426, + "step": 3628, + "tokens_trained": 1.782891912 + }, + { + "epoch": 1.029501453797603, + "grad_norm": 3.9937522411346436, + "loss": 3.9734, + "lr": 0.0005623776223776224, + "step": 3630, + "tokens_trained": 1.783871032 + }, + { + "epoch": 1.0300687894475569, + "grad_norm": 5.798377990722656, + "loss": 3.9526, + "lr": 0.0005620979020979021, + "step": 3632, + "tokens_trained": 1.784855792 + }, + { + "epoch": 1.030636125097511, + "grad_norm": 3.2532927989959717, + "loss": 3.9237, + "lr": 0.0005618181818181818, + "step": 3634, + "tokens_trained": 1.785835216 + }, + { + "epoch": 1.0312034607474647, + "grad_norm": 3.2262985706329346, + "loss": 3.9676, + "lr": 0.0005615384615384615, + "step": 3636, + "tokens_trained": 1.78682184 + }, + { + "epoch": 1.0317707963974185, + "grad_norm": 2.4307727813720703, + "loss": 3.9376, + "lr": 0.0005612587412587414, + "step": 3638, + "tokens_trained": 1.787804536 + }, + { + "epoch": 1.0323381320473726, + "grad_norm": 11.10562515258789, + "loss": 4.0096, + "lr": 0.000560979020979021, + "step": 3640, + "tokens_trained": 1.788785152 + }, + { + "epoch": 1.0329054676973264, + "grad_norm": 8.139045715332031, + "loss": 3.992, + "lr": 0.0005606993006993008, + "step": 3642, + "tokens_trained": 1.789766736 + }, + { + "epoch": 1.0334728033472804, + "grad_norm": 5.561949729919434, + "loss": 3.9368, + "lr": 0.0005604195804195804, + "step": 3644, + "tokens_trained": 1.790746488 + }, + { + "epoch": 1.0340401389972342, + "grad_norm": 6.812232494354248, + "loss": 4.0185, + "lr": 0.0005601398601398601, + "step": 3646, + "tokens_trained": 1.79172608 + }, + { + "epoch": 1.034607474647188, + "grad_norm": 6.200248718261719, + "loss": 3.9072, + "lr": 0.0005598601398601399, + "step": 3648, + "tokens_trained": 1.792710784 + }, + { + "epoch": 1.035174810297142, + "grad_norm": 5.059606075286865, + "loss": 3.9334, + "lr": 0.0005595804195804196, + "step": 3650, + "tokens_trained": 1.793692736 + }, + { + "epoch": 1.035742145947096, + "grad_norm": 2.722522020339966, + "loss": 3.9438, + "lr": 0.0005593006993006993, + "step": 3652, + "tokens_trained": 1.79467536 + }, + { + "epoch": 1.03630948159705, + "grad_norm": 5.643895626068115, + "loss": 4.0213, + "lr": 0.000559020979020979, + "step": 3654, + "tokens_trained": 1.795662048 + }, + { + "epoch": 1.0368768172470038, + "grad_norm": 3.948822021484375, + "loss": 4.0022, + "lr": 0.0005587412587412589, + "step": 3656, + "tokens_trained": 1.79664468 + }, + { + "epoch": 1.0374441528969576, + "grad_norm": 2.5267179012298584, + "loss": 3.9655, + "lr": 0.0005584615384615385, + "step": 3658, + "tokens_trained": 1.7976262 + }, + { + "epoch": 1.0380114885469116, + "grad_norm": 2.7988510131835938, + "loss": 4.0161, + "lr": 0.0005581818181818182, + "step": 3660, + "tokens_trained": 1.79861132 + }, + { + "epoch": 1.0385788241968654, + "grad_norm": 8.685417175292969, + "loss": 4.0038, + "lr": 0.0005579020979020979, + "step": 3662, + "tokens_trained": 1.799592384 + }, + { + "epoch": 1.0391461598468195, + "grad_norm": 8.391874313354492, + "loss": 3.9519, + "lr": 0.0005576223776223776, + "step": 3664, + "tokens_trained": 1.800577208 + }, + { + "epoch": 1.0397134954967733, + "grad_norm": 7.6766815185546875, + "loss": 4.0119, + "lr": 0.0005573426573426574, + "step": 3666, + "tokens_trained": 1.801559128 + }, + { + "epoch": 1.040280831146727, + "grad_norm": 6.230587959289551, + "loss": 3.9528, + "lr": 0.0005570629370629371, + "step": 3668, + "tokens_trained": 1.802540608 + }, + { + "epoch": 1.0408481667966811, + "grad_norm": 7.4818010330200195, + "loss": 3.9532, + "lr": 0.0005567832167832167, + "step": 3670, + "tokens_trained": 1.80352688 + }, + { + "epoch": 1.041415502446635, + "grad_norm": 7.714044094085693, + "loss": 4.0154, + "lr": 0.0005565034965034965, + "step": 3672, + "tokens_trained": 1.804515736 + }, + { + "epoch": 1.041982838096589, + "grad_norm": 5.260356426239014, + "loss": 3.9931, + "lr": 0.0005562237762237763, + "step": 3674, + "tokens_trained": 1.805497152 + }, + { + "epoch": 1.0425501737465428, + "grad_norm": 4.576403617858887, + "loss": 4.0345, + "lr": 0.000555944055944056, + "step": 3676, + "tokens_trained": 1.806479328 + }, + { + "epoch": 1.0431175093964966, + "grad_norm": 3.378896713256836, + "loss": 3.9827, + "lr": 0.0005556643356643357, + "step": 3678, + "tokens_trained": 1.807459232 + }, + { + "epoch": 1.0436848450464506, + "grad_norm": 6.739299774169922, + "loss": 3.9811, + "lr": 0.0005553846153846154, + "step": 3680, + "tokens_trained": 1.808441944 + }, + { + "epoch": 1.0442521806964045, + "grad_norm": 4.965353012084961, + "loss": 3.9292, + "lr": 0.0005551048951048951, + "step": 3682, + "tokens_trained": 1.809423488 + }, + { + "epoch": 1.0448195163463585, + "grad_norm": 7.479167461395264, + "loss": 3.9386, + "lr": 0.0005548251748251748, + "step": 3684, + "tokens_trained": 1.810409008 + }, + { + "epoch": 1.0453868519963123, + "grad_norm": 3.754814863204956, + "loss": 3.9936, + "lr": 0.0005545454545454546, + "step": 3686, + "tokens_trained": 1.811387856 + }, + { + "epoch": 1.0459541876462661, + "grad_norm": 5.744228839874268, + "loss": 3.9761, + "lr": 0.0005542657342657342, + "step": 3688, + "tokens_trained": 1.812371104 + }, + { + "epoch": 1.0465215232962202, + "grad_norm": 5.926168918609619, + "loss": 3.904, + "lr": 0.000553986013986014, + "step": 3690, + "tokens_trained": 1.813356456 + }, + { + "epoch": 1.047088858946174, + "grad_norm": 5.209751605987549, + "loss": 3.9706, + "lr": 0.0005537062937062938, + "step": 3692, + "tokens_trained": 1.81434056 + }, + { + "epoch": 1.047656194596128, + "grad_norm": 4.979823112487793, + "loss": 3.972, + "lr": 0.0005534265734265735, + "step": 3694, + "tokens_trained": 1.815319936 + }, + { + "epoch": 1.0482235302460818, + "grad_norm": 5.393070220947266, + "loss": 3.9694, + "lr": 0.0005531468531468532, + "step": 3696, + "tokens_trained": 1.816299016 + }, + { + "epoch": 1.0487908658960357, + "grad_norm": 3.27998423576355, + "loss": 3.9706, + "lr": 0.0005528671328671328, + "step": 3698, + "tokens_trained": 1.817284696 + }, + { + "epoch": 1.0493582015459897, + "grad_norm": 6.364100456237793, + "loss": 3.9803, + "lr": 0.0005525874125874126, + "step": 3700, + "tokens_trained": 1.818268736 + }, + { + "epoch": 1.0499255371959435, + "grad_norm": 6.063296794891357, + "loss": 3.9761, + "lr": 0.0005523076923076923, + "step": 3702, + "tokens_trained": 1.819255432 + }, + { + "epoch": 1.0504928728458975, + "grad_norm": 6.279892444610596, + "loss": 3.9792, + "lr": 0.0005520279720279721, + "step": 3704, + "tokens_trained": 1.820241704 + }, + { + "epoch": 1.0510602084958514, + "grad_norm": 3.804609537124634, + "loss": 3.9763, + "lr": 0.0005517482517482517, + "step": 3706, + "tokens_trained": 1.821226584 + }, + { + "epoch": 1.0516275441458052, + "grad_norm": 5.056581497192383, + "loss": 3.9886, + "lr": 0.0005514685314685315, + "step": 3708, + "tokens_trained": 1.822208432 + }, + { + "epoch": 1.0521948797957592, + "grad_norm": 2.052483081817627, + "loss": 3.9485, + "lr": 0.0005511888111888111, + "step": 3710, + "tokens_trained": 1.823195928 + }, + { + "epoch": 1.052762215445713, + "grad_norm": 6.076491832733154, + "loss": 4.0132, + "lr": 0.0005509090909090909, + "step": 3712, + "tokens_trained": 1.824178568 + }, + { + "epoch": 1.053329551095667, + "grad_norm": 7.526022434234619, + "loss": 3.9478, + "lr": 0.0005506293706293707, + "step": 3714, + "tokens_trained": 1.82516128 + }, + { + "epoch": 1.0538968867456209, + "grad_norm": 2.7086679935455322, + "loss": 3.9913, + "lr": 0.0005503496503496503, + "step": 3716, + "tokens_trained": 1.826142864 + }, + { + "epoch": 1.0544642223955747, + "grad_norm": 1.7643057107925415, + "loss": 3.9813, + "lr": 0.0005500699300699301, + "step": 3718, + "tokens_trained": 1.82712608 + }, + { + "epoch": 1.0550315580455287, + "grad_norm": 6.2813029289245605, + "loss": 3.9772, + "lr": 0.0005497902097902098, + "step": 3720, + "tokens_trained": 1.828107616 + }, + { + "epoch": 1.0555988936954825, + "grad_norm": 7.591973781585693, + "loss": 3.938, + "lr": 0.0005495104895104896, + "step": 3722, + "tokens_trained": 1.82909308 + }, + { + "epoch": 1.0561662293454366, + "grad_norm": 4.976797580718994, + "loss": 3.9889, + "lr": 0.0005492307692307692, + "step": 3724, + "tokens_trained": 1.830079168 + }, + { + "epoch": 1.0567335649953904, + "grad_norm": 5.417744159698486, + "loss": 4.0039, + "lr": 0.0005489510489510489, + "step": 3726, + "tokens_trained": 1.831062488 + }, + { + "epoch": 1.0573009006453442, + "grad_norm": 4.516066074371338, + "loss": 3.9845, + "lr": 0.0005486713286713286, + "step": 3728, + "tokens_trained": 1.832046528 + }, + { + "epoch": 1.0578682362952982, + "grad_norm": 3.677839756011963, + "loss": 3.9446, + "lr": 0.0005483916083916084, + "step": 3730, + "tokens_trained": 1.83303104 + }, + { + "epoch": 1.058435571945252, + "grad_norm": 5.22024393081665, + "loss": 3.9746, + "lr": 0.0005481118881118882, + "step": 3732, + "tokens_trained": 1.834017736 + }, + { + "epoch": 1.059002907595206, + "grad_norm": 7.4156060218811035, + "loss": 3.9898, + "lr": 0.0005478321678321678, + "step": 3734, + "tokens_trained": 1.8349996 + }, + { + "epoch": 1.05957024324516, + "grad_norm": 3.472533702850342, + "loss": 3.9558, + "lr": 0.0005475524475524476, + "step": 3736, + "tokens_trained": 1.835979152 + }, + { + "epoch": 1.0601375788951137, + "grad_norm": 2.4360055923461914, + "loss": 3.9627, + "lr": 0.0005472727272727273, + "step": 3738, + "tokens_trained": 1.836963416 + }, + { + "epoch": 1.0607049145450678, + "grad_norm": 4.8988728523254395, + "loss": 3.9492, + "lr": 0.000546993006993007, + "step": 3740, + "tokens_trained": 1.83794088 + }, + { + "epoch": 1.0612722501950216, + "grad_norm": 5.711161136627197, + "loss": 4.002, + "lr": 0.0005467132867132867, + "step": 3742, + "tokens_trained": 1.838924456 + }, + { + "epoch": 1.0618395858449756, + "grad_norm": 4.373830318450928, + "loss": 3.9811, + "lr": 0.0005464335664335664, + "step": 3744, + "tokens_trained": 1.839902072 + }, + { + "epoch": 1.0624069214949294, + "grad_norm": 3.2446751594543457, + "loss": 3.9551, + "lr": 0.0005461538461538461, + "step": 3746, + "tokens_trained": 1.840882688 + }, + { + "epoch": 1.0629742571448832, + "grad_norm": 3.3250389099121094, + "loss": 3.9556, + "lr": 0.0005458741258741259, + "step": 3748, + "tokens_trained": 1.841863816 + }, + { + "epoch": 1.0635415927948373, + "grad_norm": 7.377841949462891, + "loss": 4.0118, + "lr": 0.0005455944055944057, + "step": 3750, + "tokens_trained": 1.842844072 + }, + { + "epoch": 1.0635415927948373, + "eval_loss": 0.994845449924469, + "eval_runtime": 20.2191, + "step": 3750, + "tokens_trained": 1.842844072 + }, + { + "epoch": 1.064108928444791, + "grad_norm": 3.671860694885254, + "loss": 3.9439, + "lr": 0.0005453146853146853, + "step": 3752, + "tokens_trained": 1.843832472 + }, + { + "epoch": 1.0646762640947451, + "grad_norm": 3.7120800018310547, + "loss": 3.9992, + "lr": 0.000545034965034965, + "step": 3754, + "tokens_trained": 1.84481192 + }, + { + "epoch": 1.065243599744699, + "grad_norm": 6.560836315155029, + "loss": 3.9594, + "lr": 0.0005447552447552448, + "step": 3756, + "tokens_trained": 1.84579436 + }, + { + "epoch": 1.0658109353946528, + "grad_norm": 1.7166560888290405, + "loss": 3.9656, + "lr": 0.0005444755244755245, + "step": 3758, + "tokens_trained": 1.84678316 + }, + { + "epoch": 1.0663782710446068, + "grad_norm": 5.579006671905518, + "loss": 4.0034, + "lr": 0.0005441958041958042, + "step": 3760, + "tokens_trained": 1.847770488 + }, + { + "epoch": 1.0669456066945606, + "grad_norm": 3.6601710319519043, + "loss": 3.9346, + "lr": 0.0005439160839160839, + "step": 3762, + "tokens_trained": 1.848747488 + }, + { + "epoch": 1.0675129423445147, + "grad_norm": 1.2449930906295776, + "loss": 3.9493, + "lr": 0.0005436363636363635, + "step": 3764, + "tokens_trained": 1.849727168 + }, + { + "epoch": 1.0680802779944685, + "grad_norm": 5.6108479499816895, + "loss": 3.9527, + "lr": 0.0005433566433566434, + "step": 3766, + "tokens_trained": 1.85070748 + }, + { + "epoch": 1.0686476136444223, + "grad_norm": 7.556972980499268, + "loss": 3.9465, + "lr": 0.0005430769230769231, + "step": 3768, + "tokens_trained": 1.851693328 + }, + { + "epoch": 1.0692149492943763, + "grad_norm": 3.7439489364624023, + "loss": 3.964, + "lr": 0.0005427972027972028, + "step": 3770, + "tokens_trained": 1.852674992 + }, + { + "epoch": 1.0697822849443301, + "grad_norm": 4.162338733673096, + "loss": 3.969, + "lr": 0.0005425174825174825, + "step": 3772, + "tokens_trained": 1.853659048 + }, + { + "epoch": 1.0703496205942842, + "grad_norm": 3.8950648307800293, + "loss": 3.9691, + "lr": 0.0005422377622377623, + "step": 3774, + "tokens_trained": 1.854644728 + }, + { + "epoch": 1.070916956244238, + "grad_norm": 4.361495018005371, + "loss": 3.9437, + "lr": 0.000541958041958042, + "step": 3776, + "tokens_trained": 1.855626632 + }, + { + "epoch": 1.0714842918941918, + "grad_norm": 3.5286366939544678, + "loss": 3.9831, + "lr": 0.0005416783216783216, + "step": 3778, + "tokens_trained": 1.856606216 + }, + { + "epoch": 1.0720516275441458, + "grad_norm": 4.972531795501709, + "loss": 4.0222, + "lr": 0.0005413986013986014, + "step": 3780, + "tokens_trained": 1.857590816 + }, + { + "epoch": 1.0726189631940997, + "grad_norm": 9.155055046081543, + "loss": 3.9442, + "lr": 0.000541118881118881, + "step": 3782, + "tokens_trained": 1.85857288 + }, + { + "epoch": 1.0731862988440537, + "grad_norm": 1.4077136516571045, + "loss": 3.9806, + "lr": 0.0005408391608391609, + "step": 3784, + "tokens_trained": 1.859555224 + }, + { + "epoch": 1.0737536344940075, + "grad_norm": 3.204779863357544, + "loss": 3.9506, + "lr": 0.0005405594405594406, + "step": 3786, + "tokens_trained": 1.860538984 + }, + { + "epoch": 1.0743209701439613, + "grad_norm": 3.988658905029297, + "loss": 4.0025, + "lr": 0.0005402797202797203, + "step": 3788, + "tokens_trained": 1.861522976 + }, + { + "epoch": 1.0748883057939154, + "grad_norm": 3.0060372352600098, + "loss": 3.9308, + "lr": 0.00054, + "step": 3790, + "tokens_trained": 1.86250564 + }, + { + "epoch": 1.0754556414438692, + "grad_norm": 2.494147777557373, + "loss": 4.0116, + "lr": 0.0005397202797202798, + "step": 3792, + "tokens_trained": 1.863491248 + }, + { + "epoch": 1.0760229770938232, + "grad_norm": 5.260354518890381, + "loss": 3.9917, + "lr": 0.0005394405594405595, + "step": 3794, + "tokens_trained": 1.864474808 + }, + { + "epoch": 1.076590312743777, + "grad_norm": 4.43446159362793, + "loss": 3.9698, + "lr": 0.0005391608391608391, + "step": 3796, + "tokens_trained": 1.865457608 + }, + { + "epoch": 1.0771576483937308, + "grad_norm": 5.485021114349365, + "loss": 3.9494, + "lr": 0.0005388811188811189, + "step": 3798, + "tokens_trained": 1.866439336 + }, + { + "epoch": 1.0777249840436849, + "grad_norm": 5.432106971740723, + "loss": 3.9749, + "lr": 0.0005386013986013985, + "step": 3800, + "tokens_trained": 1.867422432 + }, + { + "epoch": 1.0782923196936387, + "grad_norm": 5.726179122924805, + "loss": 3.9524, + "lr": 0.0005383216783216784, + "step": 3802, + "tokens_trained": 1.868404976 + }, + { + "epoch": 1.0788596553435927, + "grad_norm": 7.2211594581604, + "loss": 3.954, + "lr": 0.0005380419580419581, + "step": 3804, + "tokens_trained": 1.869387272 + }, + { + "epoch": 1.0794269909935466, + "grad_norm": 3.6406068801879883, + "loss": 4.0125, + "lr": 0.0005377622377622377, + "step": 3806, + "tokens_trained": 1.870371664 + }, + { + "epoch": 1.0799943266435004, + "grad_norm": 7.254781723022461, + "loss": 3.9535, + "lr": 0.0005374825174825175, + "step": 3808, + "tokens_trained": 1.87135524 + }, + { + "epoch": 1.0805616622934544, + "grad_norm": 7.8573079109191895, + "loss": 4.0054, + "lr": 0.0005372027972027972, + "step": 3810, + "tokens_trained": 1.872337216 + }, + { + "epoch": 1.0811289979434082, + "grad_norm": 1.049710988998413, + "loss": 3.9541, + "lr": 0.000536923076923077, + "step": 3812, + "tokens_trained": 1.873317672 + }, + { + "epoch": 1.0816963335933623, + "grad_norm": 7.515570163726807, + "loss": 3.9466, + "lr": 0.0005366433566433566, + "step": 3814, + "tokens_trained": 1.874299184 + }, + { + "epoch": 1.082263669243316, + "grad_norm": 6.041797637939453, + "loss": 3.9508, + "lr": 0.0005363636363636364, + "step": 3816, + "tokens_trained": 1.875282768 + }, + { + "epoch": 1.0828310048932699, + "grad_norm": 2.9910285472869873, + "loss": 3.9368, + "lr": 0.000536083916083916, + "step": 3818, + "tokens_trained": 1.876264312 + }, + { + "epoch": 1.083398340543224, + "grad_norm": 3.5802299976348877, + "loss": 3.9661, + "lr": 0.0005358041958041959, + "step": 3820, + "tokens_trained": 1.877245472 + }, + { + "epoch": 1.0839656761931777, + "grad_norm": 6.078779697418213, + "loss": 3.9758, + "lr": 0.0005355244755244756, + "step": 3822, + "tokens_trained": 1.87822768 + }, + { + "epoch": 1.0845330118431318, + "grad_norm": 6.143925189971924, + "loss": 3.947, + "lr": 0.0005352447552447552, + "step": 3824, + "tokens_trained": 1.879209824 + }, + { + "epoch": 1.0851003474930856, + "grad_norm": 4.272439002990723, + "loss": 4.0284, + "lr": 0.000534965034965035, + "step": 3826, + "tokens_trained": 1.88019528 + }, + { + "epoch": 1.0856676831430394, + "grad_norm": 7.169465065002441, + "loss": 3.9651, + "lr": 0.0005346853146853147, + "step": 3828, + "tokens_trained": 1.88117776 + }, + { + "epoch": 1.0862350187929934, + "grad_norm": 6.489839553833008, + "loss": 3.9505, + "lr": 0.0005344055944055945, + "step": 3830, + "tokens_trained": 1.88216468 + }, + { + "epoch": 1.0868023544429473, + "grad_norm": 2.966554880142212, + "loss": 4.0406, + "lr": 0.0005341258741258741, + "step": 3832, + "tokens_trained": 1.883147968 + }, + { + "epoch": 1.0873696900929013, + "grad_norm": 4.948841094970703, + "loss": 3.9704, + "lr": 0.0005338461538461538, + "step": 3834, + "tokens_trained": 1.884132176 + }, + { + "epoch": 1.0879370257428551, + "grad_norm": 7.666274547576904, + "loss": 4.0082, + "lr": 0.0005335664335664335, + "step": 3836, + "tokens_trained": 1.885119008 + }, + { + "epoch": 1.088504361392809, + "grad_norm": 12.454533576965332, + "loss": 3.9702, + "lr": 0.0005332867132867133, + "step": 3838, + "tokens_trained": 1.88610144 + }, + { + "epoch": 1.089071697042763, + "grad_norm": 4.42985725402832, + "loss": 3.9601, + "lr": 0.0005330069930069931, + "step": 3840, + "tokens_trained": 1.88708772 + }, + { + "epoch": 1.0896390326927168, + "grad_norm": 14.10716438293457, + "loss": 3.9942, + "lr": 0.0005327272727272727, + "step": 3842, + "tokens_trained": 1.888068192 + }, + { + "epoch": 1.0902063683426708, + "grad_norm": 6.3290910720825195, + "loss": 3.9218, + "lr": 0.0005324475524475525, + "step": 3844, + "tokens_trained": 1.88905572 + }, + { + "epoch": 1.0907737039926246, + "grad_norm": 6.61427640914917, + "loss": 4.0173, + "lr": 0.0005321678321678322, + "step": 3846, + "tokens_trained": 1.890040152 + }, + { + "epoch": 1.0913410396425784, + "grad_norm": 6.868432998657227, + "loss": 3.9553, + "lr": 0.000531888111888112, + "step": 3848, + "tokens_trained": 1.891031024 + }, + { + "epoch": 1.0919083752925325, + "grad_norm": 4.057258129119873, + "loss": 3.9839, + "lr": 0.0005316083916083916, + "step": 3850, + "tokens_trained": 1.892009904 + }, + { + "epoch": 1.0924757109424863, + "grad_norm": 3.5418479442596436, + "loss": 3.9839, + "lr": 0.0005313286713286713, + "step": 3852, + "tokens_trained": 1.892993976 + }, + { + "epoch": 1.0930430465924403, + "grad_norm": 1.231491208076477, + "loss": 3.9549, + "lr": 0.000531048951048951, + "step": 3854, + "tokens_trained": 1.893972744 + }, + { + "epoch": 1.0936103822423942, + "grad_norm": 4.056438446044922, + "loss": 3.9512, + "lr": 0.0005307692307692308, + "step": 3856, + "tokens_trained": 1.894954248 + }, + { + "epoch": 1.094177717892348, + "grad_norm": 2.9252607822418213, + "loss": 3.9201, + "lr": 0.0005304895104895106, + "step": 3858, + "tokens_trained": 1.895938816 + }, + { + "epoch": 1.094745053542302, + "grad_norm": 3.035308599472046, + "loss": 3.9367, + "lr": 0.0005302097902097902, + "step": 3860, + "tokens_trained": 1.896920832 + }, + { + "epoch": 1.0953123891922558, + "grad_norm": 2.2526092529296875, + "loss": 3.9554, + "lr": 0.0005299300699300699, + "step": 3862, + "tokens_trained": 1.897903216 + }, + { + "epoch": 1.0958797248422099, + "grad_norm": 2.882819175720215, + "loss": 3.926, + "lr": 0.0005296503496503497, + "step": 3864, + "tokens_trained": 1.898886632 + }, + { + "epoch": 1.0964470604921637, + "grad_norm": 7.817485809326172, + "loss": 3.9583, + "lr": 0.0005293706293706294, + "step": 3866, + "tokens_trained": 1.899872128 + }, + { + "epoch": 1.0970143961421175, + "grad_norm": 8.241719245910645, + "loss": 3.9391, + "lr": 0.0005290909090909091, + "step": 3868, + "tokens_trained": 1.900856544 + }, + { + "epoch": 1.0975817317920715, + "grad_norm": 4.160614013671875, + "loss": 3.9285, + "lr": 0.0005288111888111888, + "step": 3870, + "tokens_trained": 1.901838952 + }, + { + "epoch": 1.0981490674420253, + "grad_norm": 3.527678966522217, + "loss": 3.9593, + "lr": 0.0005285314685314684, + "step": 3872, + "tokens_trained": 1.902823024 + }, + { + "epoch": 1.0987164030919794, + "grad_norm": 5.290194511413574, + "loss": 3.9357, + "lr": 0.0005282517482517483, + "step": 3874, + "tokens_trained": 1.903803456 + }, + { + "epoch": 1.0990000709169563, + "eval_loss": 0.9935861229896545, + "eval_runtime": 20.2396, + "step": 3875, + "tokens_trained": 1.904295504 + }, + { + "epoch": 1.0992837387419332, + "grad_norm": 5.472379207611084, + "loss": 4.0255, + "lr": 0.000527972027972028, + "step": 3876, + "tokens_trained": 1.904786344 + }, + { + "epoch": 1.099851074391887, + "grad_norm": 6.999550819396973, + "loss": 3.9523, + "lr": 0.0005276923076923077, + "step": 3878, + "tokens_trained": 1.90576952 + }, + { + "epoch": 1.100418410041841, + "grad_norm": 3.3077871799468994, + "loss": 3.9452, + "lr": 0.0005274125874125874, + "step": 3880, + "tokens_trained": 1.906745784 + }, + { + "epoch": 1.1009857456917949, + "grad_norm": 4.513088226318359, + "loss": 3.9687, + "lr": 0.0005271328671328672, + "step": 3882, + "tokens_trained": 1.907734576 + }, + { + "epoch": 1.101553081341749, + "grad_norm": 8.249629020690918, + "loss": 3.9445, + "lr": 0.0005268531468531469, + "step": 3884, + "tokens_trained": 1.908716328 + }, + { + "epoch": 1.1021204169917027, + "grad_norm": 8.281685829162598, + "loss": 3.9906, + "lr": 0.0005265734265734266, + "step": 3886, + "tokens_trained": 1.909702984 + }, + { + "epoch": 1.1026877526416565, + "grad_norm": 6.521668910980225, + "loss": 3.9971, + "lr": 0.0005262937062937063, + "step": 3888, + "tokens_trained": 1.91068828 + }, + { + "epoch": 1.1032550882916106, + "grad_norm": 6.442141056060791, + "loss": 3.9769, + "lr": 0.0005260139860139859, + "step": 3890, + "tokens_trained": 1.911668976 + }, + { + "epoch": 1.1038224239415644, + "grad_norm": 11.120711326599121, + "loss": 3.9455, + "lr": 0.0005257342657342658, + "step": 3892, + "tokens_trained": 1.912650176 + }, + { + "epoch": 1.1043897595915184, + "grad_norm": 2.695085048675537, + "loss": 3.984, + "lr": 0.0005254545454545455, + "step": 3894, + "tokens_trained": 1.913624832 + }, + { + "epoch": 1.1049570952414722, + "grad_norm": 16.994462966918945, + "loss": 3.968, + "lr": 0.0005251748251748252, + "step": 3896, + "tokens_trained": 1.914609128 + }, + { + "epoch": 1.105524430891426, + "grad_norm": 5.866199016571045, + "loss": 3.9157, + "lr": 0.0005248951048951049, + "step": 3898, + "tokens_trained": 1.91559088 + }, + { + "epoch": 1.10609176654138, + "grad_norm": 8.222938537597656, + "loss": 3.9516, + "lr": 0.0005246153846153847, + "step": 3900, + "tokens_trained": 1.916575752 + }, + { + "epoch": 1.106659102191334, + "grad_norm": 6.4162774085998535, + "loss": 3.9761, + "lr": 0.0005243356643356644, + "step": 3902, + "tokens_trained": 1.9175578 + }, + { + "epoch": 1.107226437841288, + "grad_norm": 5.338213920593262, + "loss": 3.9804, + "lr": 0.000524055944055944, + "step": 3904, + "tokens_trained": 1.918538192 + }, + { + "epoch": 1.1077937734912418, + "grad_norm": 6.3608927726745605, + "loss": 3.9675, + "lr": 0.0005237762237762238, + "step": 3906, + "tokens_trained": 1.9195184 + }, + { + "epoch": 1.1083611091411956, + "grad_norm": 6.1585845947265625, + "loss": 3.9385, + "lr": 0.0005234965034965034, + "step": 3908, + "tokens_trained": 1.920498704 + }, + { + "epoch": 1.1089284447911496, + "grad_norm": 5.266563415527344, + "loss": 4.0169, + "lr": 0.0005232167832167833, + "step": 3910, + "tokens_trained": 1.921477824 + }, + { + "epoch": 1.1094957804411034, + "grad_norm": 3.5322930812835693, + "loss": 3.9734, + "lr": 0.000522937062937063, + "step": 3912, + "tokens_trained": 1.922456704 + }, + { + "epoch": 1.1100631160910575, + "grad_norm": 3.8564069271087646, + "loss": 3.9873, + "lr": 0.0005226573426573427, + "step": 3914, + "tokens_trained": 1.92343992 + }, + { + "epoch": 1.1106304517410113, + "grad_norm": 3.9069607257843018, + "loss": 3.9892, + "lr": 0.0005223776223776224, + "step": 3916, + "tokens_trained": 1.924424576 + }, + { + "epoch": 1.111197787390965, + "grad_norm": 6.195169925689697, + "loss": 3.9489, + "lr": 0.0005220979020979021, + "step": 3918, + "tokens_trained": 1.92540764 + }, + { + "epoch": 1.1117651230409191, + "grad_norm": 4.950653076171875, + "loss": 3.9561, + "lr": 0.0005218181818181819, + "step": 3920, + "tokens_trained": 1.926386144 + }, + { + "epoch": 1.112332458690873, + "grad_norm": 4.923401832580566, + "loss": 3.991, + "lr": 0.0005215384615384615, + "step": 3922, + "tokens_trained": 1.92736516 + }, + { + "epoch": 1.112899794340827, + "grad_norm": 4.2394561767578125, + "loss": 3.9445, + "lr": 0.0005212587412587413, + "step": 3924, + "tokens_trained": 1.928350608 + }, + { + "epoch": 1.1134671299907808, + "grad_norm": 3.4303910732269287, + "loss": 3.9871, + "lr": 0.0005209790209790209, + "step": 3926, + "tokens_trained": 1.929333008 + }, + { + "epoch": 1.1140344656407346, + "grad_norm": 6.241591453552246, + "loss": 3.9799, + "lr": 0.0005206993006993008, + "step": 3928, + "tokens_trained": 1.930315616 + }, + { + "epoch": 1.1146018012906886, + "grad_norm": 5.21243143081665, + "loss": 3.9624, + "lr": 0.0005204195804195805, + "step": 3930, + "tokens_trained": 1.931298192 + }, + { + "epoch": 1.1151691369406425, + "grad_norm": 7.095268249511719, + "loss": 3.9263, + "lr": 0.0005201398601398601, + "step": 3932, + "tokens_trained": 1.93228248 + }, + { + "epoch": 1.1157364725905965, + "grad_norm": 9.025245666503906, + "loss": 4.0058, + "lr": 0.0005198601398601399, + "step": 3934, + "tokens_trained": 1.93326592 + }, + { + "epoch": 1.1163038082405503, + "grad_norm": 3.9758048057556152, + "loss": 3.9299, + "lr": 0.0005195804195804196, + "step": 3936, + "tokens_trained": 1.93424888 + }, + { + "epoch": 1.1168711438905041, + "grad_norm": 9.68726634979248, + "loss": 3.9433, + "lr": 0.0005193006993006994, + "step": 3938, + "tokens_trained": 1.935231688 + }, + { + "epoch": 1.1174384795404582, + "grad_norm": 7.5478901863098145, + "loss": 4.0053, + "lr": 0.000519020979020979, + "step": 3940, + "tokens_trained": 1.936216832 + }, + { + "epoch": 1.118005815190412, + "grad_norm": 6.016645431518555, + "loss": 3.9481, + "lr": 0.0005187412587412588, + "step": 3942, + "tokens_trained": 1.937196632 + }, + { + "epoch": 1.118573150840366, + "grad_norm": 7.313266277313232, + "loss": 3.9539, + "lr": 0.0005184615384615384, + "step": 3944, + "tokens_trained": 1.938180424 + }, + { + "epoch": 1.1191404864903198, + "grad_norm": 4.228805065155029, + "loss": 3.9528, + "lr": 0.0005181818181818182, + "step": 3946, + "tokens_trained": 1.939165376 + }, + { + "epoch": 1.1197078221402736, + "grad_norm": 1.2050669193267822, + "loss": 3.9699, + "lr": 0.000517902097902098, + "step": 3948, + "tokens_trained": 1.940146184 + }, + { + "epoch": 1.1202751577902277, + "grad_norm": 4.581719875335693, + "loss": 3.9346, + "lr": 0.0005176223776223776, + "step": 3950, + "tokens_trained": 1.941130648 + }, + { + "epoch": 1.1208424934401815, + "grad_norm": 9.381650924682617, + "loss": 3.9294, + "lr": 0.0005173426573426574, + "step": 3952, + "tokens_trained": 1.94210952 + }, + { + "epoch": 1.1214098290901355, + "grad_norm": 5.3781585693359375, + "loss": 3.9208, + "lr": 0.000517062937062937, + "step": 3954, + "tokens_trained": 1.943096344 + }, + { + "epoch": 1.1219771647400893, + "grad_norm": 4.263558387756348, + "loss": 3.9492, + "lr": 0.0005167832167832169, + "step": 3956, + "tokens_trained": 1.94407804 + }, + { + "epoch": 1.1225445003900432, + "grad_norm": 5.920651435852051, + "loss": 3.8951, + "lr": 0.0005165034965034965, + "step": 3958, + "tokens_trained": 1.94506156 + }, + { + "epoch": 1.1231118360399972, + "grad_norm": 7.0110344886779785, + "loss": 3.9329, + "lr": 0.0005162237762237762, + "step": 3960, + "tokens_trained": 1.946040072 + }, + { + "epoch": 1.123679171689951, + "grad_norm": 4.611392021179199, + "loss": 3.9094, + "lr": 0.0005159440559440559, + "step": 3962, + "tokens_trained": 1.947023256 + }, + { + "epoch": 1.124246507339905, + "grad_norm": 5.340510845184326, + "loss": 3.9552, + "lr": 0.0005156643356643357, + "step": 3964, + "tokens_trained": 1.948006848 + }, + { + "epoch": 1.1248138429898589, + "grad_norm": 5.190691947937012, + "loss": 3.956, + "lr": 0.0005153846153846154, + "step": 3966, + "tokens_trained": 1.948991632 + }, + { + "epoch": 1.1253811786398127, + "grad_norm": 5.612351894378662, + "loss": 3.9861, + "lr": 0.0005151048951048951, + "step": 3968, + "tokens_trained": 1.949975704 + }, + { + "epoch": 1.1259485142897667, + "grad_norm": 6.097261428833008, + "loss": 3.9867, + "lr": 0.0005148251748251748, + "step": 3970, + "tokens_trained": 1.950957944 + }, + { + "epoch": 1.1265158499397205, + "grad_norm": 4.194180965423584, + "loss": 3.9242, + "lr": 0.0005145454545454545, + "step": 3972, + "tokens_trained": 1.9519416 + }, + { + "epoch": 1.1270831855896746, + "grad_norm": 4.118505477905273, + "loss": 3.9553, + "lr": 0.0005142657342657343, + "step": 3974, + "tokens_trained": 1.95292252 + }, + { + "epoch": 1.1276505212396284, + "grad_norm": 5.10177755355835, + "loss": 3.9653, + "lr": 0.000513986013986014, + "step": 3976, + "tokens_trained": 1.953902792 + }, + { + "epoch": 1.1282178568895822, + "grad_norm": 5.665530204772949, + "loss": 3.916, + "lr": 0.0005137062937062937, + "step": 3978, + "tokens_trained": 1.954888184 + }, + { + "epoch": 1.1287851925395362, + "grad_norm": 4.1443963050842285, + "loss": 3.9254, + "lr": 0.0005134265734265734, + "step": 3980, + "tokens_trained": 1.955868688 + }, + { + "epoch": 1.12935252818949, + "grad_norm": 2.4941980838775635, + "loss": 3.9502, + "lr": 0.0005131468531468532, + "step": 3982, + "tokens_trained": 1.956852472 + }, + { + "epoch": 1.129919863839444, + "grad_norm": 3.85143780708313, + "loss": 3.8926, + "lr": 0.0005128671328671328, + "step": 3984, + "tokens_trained": 1.957835808 + }, + { + "epoch": 1.130487199489398, + "grad_norm": 5.975537300109863, + "loss": 3.9926, + "lr": 0.0005125874125874126, + "step": 3986, + "tokens_trained": 1.958816736 + }, + { + "epoch": 1.1310545351393517, + "grad_norm": 6.722855567932129, + "loss": 3.986, + "lr": 0.0005123076923076923, + "step": 3988, + "tokens_trained": 1.9598008 + }, + { + "epoch": 1.1316218707893058, + "grad_norm": 3.1752729415893555, + "loss": 3.9343, + "lr": 0.000512027972027972, + "step": 3990, + "tokens_trained": 1.960783816 + }, + { + "epoch": 1.1321892064392596, + "grad_norm": 3.669602394104004, + "loss": 3.9746, + "lr": 0.0005117482517482518, + "step": 3992, + "tokens_trained": 1.96176816 + }, + { + "epoch": 1.1327565420892136, + "grad_norm": 7.3116326332092285, + "loss": 3.9829, + "lr": 0.0005114685314685315, + "step": 3994, + "tokens_trained": 1.962752696 + }, + { + "epoch": 1.1333238777391674, + "grad_norm": 5.816486358642578, + "loss": 3.9617, + "lr": 0.0005111888111888112, + "step": 3996, + "tokens_trained": 1.96373432 + }, + { + "epoch": 1.1338912133891212, + "grad_norm": 2.3524768352508545, + "loss": 3.929, + "lr": 0.0005109090909090908, + "step": 3998, + "tokens_trained": 1.964713416 + }, + { + "epoch": 1.1344585490390753, + "grad_norm": 4.908108711242676, + "loss": 3.9741, + "lr": 0.0005106293706293707, + "step": 4000, + "tokens_trained": 1.965692096 + }, + { + "epoch": 1.1344585490390753, + "eval_loss": 0.9912415146827698, + "eval_runtime": 20.338, + "step": 4000, + "tokens_trained": 1.965692096 + }, + { + "epoch": 1.135025884689029, + "grad_norm": 4.395096778869629, + "loss": 3.955, + "lr": 0.0005103496503496503, + "step": 4002, + "tokens_trained": 1.966677008 + }, + { + "epoch": 1.1355932203389831, + "grad_norm": 3.2460927963256836, + "loss": 3.9522, + "lr": 0.0005100699300699301, + "step": 4004, + "tokens_trained": 1.967662208 + }, + { + "epoch": 1.136160555988937, + "grad_norm": 3.2880218029022217, + "loss": 3.9111, + "lr": 0.0005097902097902098, + "step": 4006, + "tokens_trained": 1.968642816 + }, + { + "epoch": 1.1367278916388908, + "grad_norm": 3.694084644317627, + "loss": 3.9045, + "lr": 0.0005095104895104895, + "step": 4008, + "tokens_trained": 1.969623616 + }, + { + "epoch": 1.1372952272888448, + "grad_norm": 2.690668821334839, + "loss": 3.9534, + "lr": 0.0005092307692307693, + "step": 4010, + "tokens_trained": 1.970607456 + }, + { + "epoch": 1.1378625629387986, + "grad_norm": 3.6751973628997803, + "loss": 3.9979, + "lr": 0.0005089510489510489, + "step": 4012, + "tokens_trained": 1.971587136 + }, + { + "epoch": 1.1384298985887527, + "grad_norm": 3.0805108547210693, + "loss": 3.888, + "lr": 0.0005086713286713287, + "step": 4014, + "tokens_trained": 1.972575152 + }, + { + "epoch": 1.1389972342387065, + "grad_norm": 5.386228084564209, + "loss": 3.9586, + "lr": 0.0005083916083916083, + "step": 4016, + "tokens_trained": 1.973563872 + }, + { + "epoch": 1.1395645698886603, + "grad_norm": 5.567631721496582, + "loss": 3.9337, + "lr": 0.0005081118881118882, + "step": 4018, + "tokens_trained": 1.97454444 + }, + { + "epoch": 1.1401319055386143, + "grad_norm": 5.159145355224609, + "loss": 3.9311, + "lr": 0.0005078321678321678, + "step": 4020, + "tokens_trained": 1.975528128 + }, + { + "epoch": 1.1406992411885681, + "grad_norm": 3.8111817836761475, + "loss": 3.9542, + "lr": 0.0005075524475524476, + "step": 4022, + "tokens_trained": 1.97651136 + }, + { + "epoch": 1.1412665768385222, + "grad_norm": 5.618584156036377, + "loss": 3.9841, + "lr": 0.0005072727272727273, + "step": 4024, + "tokens_trained": 1.97749408 + }, + { + "epoch": 1.141833912488476, + "grad_norm": 5.414000511169434, + "loss": 3.9435, + "lr": 0.0005069930069930069, + "step": 4026, + "tokens_trained": 1.978478936 + }, + { + "epoch": 1.1424012481384298, + "grad_norm": 7.3321661949157715, + "loss": 3.962, + "lr": 0.0005067132867132868, + "step": 4028, + "tokens_trained": 1.979462272 + }, + { + "epoch": 1.1429685837883838, + "grad_norm": 3.5029044151306152, + "loss": 3.9399, + "lr": 0.0005064335664335664, + "step": 4030, + "tokens_trained": 1.98044648 + }, + { + "epoch": 1.1435359194383377, + "grad_norm": 6.343649387359619, + "loss": 3.9788, + "lr": 0.0005061538461538462, + "step": 4032, + "tokens_trained": 1.981432816 + }, + { + "epoch": 1.1441032550882917, + "grad_norm": 8.250723838806152, + "loss": 3.9025, + "lr": 0.0005058741258741258, + "step": 4034, + "tokens_trained": 1.982413272 + }, + { + "epoch": 1.1446705907382455, + "grad_norm": 3.6089327335357666, + "loss": 3.9855, + "lr": 0.0005055944055944057, + "step": 4036, + "tokens_trained": 1.983396296 + }, + { + "epoch": 1.1452379263881993, + "grad_norm": 5.802486896514893, + "loss": 3.9569, + "lr": 0.0005053146853146853, + "step": 4038, + "tokens_trained": 1.984378296 + }, + { + "epoch": 1.1458052620381534, + "grad_norm": 6.48319673538208, + "loss": 3.9423, + "lr": 0.000505034965034965, + "step": 4040, + "tokens_trained": 1.985356768 + }, + { + "epoch": 1.1463725976881072, + "grad_norm": 2.9942495822906494, + "loss": 3.9667, + "lr": 0.0005047552447552448, + "step": 4042, + "tokens_trained": 1.98633836 + }, + { + "epoch": 1.1469399333380612, + "grad_norm": 1.4219609498977661, + "loss": 3.9238, + "lr": 0.0005044755244755244, + "step": 4044, + "tokens_trained": 1.98732128 + }, + { + "epoch": 1.147507268988015, + "grad_norm": 2.6950814723968506, + "loss": 3.9829, + "lr": 0.0005041958041958043, + "step": 4046, + "tokens_trained": 1.988304968 + }, + { + "epoch": 1.1480746046379688, + "grad_norm": 4.490326404571533, + "loss": 3.9506, + "lr": 0.0005039160839160839, + "step": 4048, + "tokens_trained": 1.989288848 + }, + { + "epoch": 1.1486419402879229, + "grad_norm": 7.026235580444336, + "loss": 3.9374, + "lr": 0.0005036363636363637, + "step": 4050, + "tokens_trained": 1.990270344 + }, + { + "epoch": 1.1492092759378767, + "grad_norm": 6.214878082275391, + "loss": 3.9627, + "lr": 0.0005033566433566433, + "step": 4052, + "tokens_trained": 1.991250424 + }, + { + "epoch": 1.1497766115878307, + "grad_norm": 4.663200855255127, + "loss": 3.9631, + "lr": 0.0005030769230769231, + "step": 4054, + "tokens_trained": 1.9922354 + }, + { + "epoch": 1.1503439472377845, + "grad_norm": 4.318966865539551, + "loss": 4.0147, + "lr": 0.0005027972027972028, + "step": 4056, + "tokens_trained": 1.993221056 + }, + { + "epoch": 1.1509112828877384, + "grad_norm": 5.912793159484863, + "loss": 3.9639, + "lr": 0.0005025174825174825, + "step": 4058, + "tokens_trained": 1.994207552 + }, + { + "epoch": 1.1514786185376924, + "grad_norm": 3.6957592964172363, + "loss": 3.9253, + "lr": 0.0005022377622377623, + "step": 4060, + "tokens_trained": 1.99519044 + }, + { + "epoch": 1.1520459541876462, + "grad_norm": 2.9899842739105225, + "loss": 3.9874, + "lr": 0.0005019580419580419, + "step": 4062, + "tokens_trained": 1.996177368 + }, + { + "epoch": 1.1526132898376003, + "grad_norm": 6.149812698364258, + "loss": 3.9278, + "lr": 0.0005016783216783218, + "step": 4064, + "tokens_trained": 1.997162248 + }, + { + "epoch": 1.153180625487554, + "grad_norm": 3.7720232009887695, + "loss": 3.9526, + "lr": 0.0005013986013986014, + "step": 4066, + "tokens_trained": 1.99815024 + }, + { + "epoch": 1.1537479611375079, + "grad_norm": 3.3968939781188965, + "loss": 3.9522, + "lr": 0.0005011188811188811, + "step": 4068, + "tokens_trained": 1.999129208 + }, + { + "epoch": 1.154315296787462, + "grad_norm": 7.051310062408447, + "loss": 3.9545, + "lr": 0.0005008391608391608, + "step": 4070, + "tokens_trained": 2.000111232 + }, + { + "epoch": 1.1548826324374157, + "grad_norm": 4.798380374908447, + "loss": 3.9114, + "lr": 0.0005005594405594406, + "step": 4072, + "tokens_trained": 2.001098352 + }, + { + "epoch": 1.1554499680873698, + "grad_norm": 7.5074992179870605, + "loss": 3.9795, + "lr": 0.0005002797202797203, + "step": 4074, + "tokens_trained": 2.002077616 + }, + { + "epoch": 1.1560173037373236, + "grad_norm": 3.944998025894165, + "loss": 3.9208, + "lr": 0.0005, + "step": 4076, + "tokens_trained": 2.003065976 + }, + { + "epoch": 1.1565846393872774, + "grad_norm": 9.103386878967285, + "loss": 3.9577, + "lr": 0.0004997202797202798, + "step": 4078, + "tokens_trained": 2.004046568 + }, + { + "epoch": 1.1571519750372314, + "grad_norm": 8.950857162475586, + "loss": 3.9474, + "lr": 0.0004994405594405594, + "step": 4080, + "tokens_trained": 2.005031288 + }, + { + "epoch": 1.1577193106871853, + "grad_norm": 6.812939643859863, + "loss": 3.9995, + "lr": 0.0004991608391608391, + "step": 4082, + "tokens_trained": 2.00601472 + }, + { + "epoch": 1.1582866463371393, + "grad_norm": 8.14719009399414, + "loss": 3.9496, + "lr": 0.0004988811188811189, + "step": 4084, + "tokens_trained": 2.006996416 + }, + { + "epoch": 1.158853981987093, + "grad_norm": 7.125198841094971, + "loss": 3.9074, + "lr": 0.0004986013986013986, + "step": 4086, + "tokens_trained": 2.007980248 + }, + { + "epoch": 1.159421317637047, + "grad_norm": 2.4099230766296387, + "loss": 3.9675, + "lr": 0.0004983216783216784, + "step": 4088, + "tokens_trained": 2.008964792 + }, + { + "epoch": 1.159988653287001, + "grad_norm": 3.9759979248046875, + "loss": 3.9655, + "lr": 0.0004980419580419581, + "step": 4090, + "tokens_trained": 2.009945552 + }, + { + "epoch": 1.1605559889369548, + "grad_norm": 5.3169264793396, + "loss": 3.9856, + "lr": 0.0004977622377622378, + "step": 4092, + "tokens_trained": 2.010931072 + }, + { + "epoch": 1.1611233245869088, + "grad_norm": 9.010540008544922, + "loss": 3.9293, + "lr": 0.0004974825174825175, + "step": 4094, + "tokens_trained": 2.011911712 + }, + { + "epoch": 1.1616906602368626, + "grad_norm": 5.83132266998291, + "loss": 3.9725, + "lr": 0.0004972027972027972, + "step": 4096, + "tokens_trained": 2.012895208 + }, + { + "epoch": 1.1622579958868164, + "grad_norm": 8.76009750366211, + "loss": 3.9875, + "lr": 0.0004969230769230769, + "step": 4098, + "tokens_trained": 2.013881768 + }, + { + "epoch": 1.1628253315367705, + "grad_norm": 4.634799480438232, + "loss": 3.9478, + "lr": 0.0004966433566433566, + "step": 4100, + "tokens_trained": 2.014862288 + }, + { + "epoch": 1.1633926671867243, + "grad_norm": 3.717115879058838, + "loss": 3.9029, + "lr": 0.0004963636363636364, + "step": 4102, + "tokens_trained": 2.015846344 + }, + { + "epoch": 1.1639600028366783, + "grad_norm": 5.467166423797607, + "loss": 3.9561, + "lr": 0.0004960839160839161, + "step": 4104, + "tokens_trained": 2.01682528 + }, + { + "epoch": 1.1645273384866321, + "grad_norm": 5.645481109619141, + "loss": 3.9889, + "lr": 0.0004958041958041959, + "step": 4106, + "tokens_trained": 2.017809272 + }, + { + "epoch": 1.165094674136586, + "grad_norm": 4.796457767486572, + "loss": 3.9554, + "lr": 0.0004955244755244756, + "step": 4108, + "tokens_trained": 2.018791344 + }, + { + "epoch": 1.16566200978654, + "grad_norm": 6.111627578735352, + "loss": 3.9495, + "lr": 0.0004952447552447552, + "step": 4110, + "tokens_trained": 2.019777776 + }, + { + "epoch": 1.1662293454364938, + "grad_norm": 4.132344722747803, + "loss": 3.878, + "lr": 0.000494965034965035, + "step": 4112, + "tokens_trained": 2.020760032 + }, + { + "epoch": 1.1667966810864479, + "grad_norm": 4.833931922912598, + "loss": 3.9537, + "lr": 0.0004946853146853147, + "step": 4114, + "tokens_trained": 2.021745984 + }, + { + "epoch": 1.1673640167364017, + "grad_norm": 5.027078628540039, + "loss": 3.9359, + "lr": 0.0004944055944055944, + "step": 4116, + "tokens_trained": 2.022724968 + }, + { + "epoch": 1.1679313523863555, + "grad_norm": 5.339116096496582, + "loss": 3.9104, + "lr": 0.0004941258741258741, + "step": 4118, + "tokens_trained": 2.023705248 + }, + { + "epoch": 1.1684986880363095, + "grad_norm": 5.1652607917785645, + "loss": 3.9671, + "lr": 0.0004938461538461538, + "step": 4120, + "tokens_trained": 2.024688648 + }, + { + "epoch": 1.1690660236862633, + "grad_norm": 4.289709568023682, + "loss": 3.9315, + "lr": 0.0004935664335664336, + "step": 4122, + "tokens_trained": 2.025667424 + }, + { + "epoch": 1.1696333593362174, + "grad_norm": 5.6946492195129395, + "loss": 3.9498, + "lr": 0.0004932867132867133, + "step": 4124, + "tokens_trained": 2.026647168 + }, + { + "epoch": 1.1699170271611943, + "eval_loss": 0.9880662560462952, + "eval_runtime": 21.3984, + "step": 4125, + "tokens_trained": 2.027139168 + }, + { + "epoch": 1.1702006949861712, + "grad_norm": 3.798551082611084, + "loss": 3.9244, + "lr": 0.0004930069930069931, + "step": 4126, + "tokens_trained": 2.027631096 + }, + { + "epoch": 1.170768030636125, + "grad_norm": 3.644767999649048, + "loss": 3.939, + "lr": 0.0004927272727272727, + "step": 4128, + "tokens_trained": 2.028613776 + }, + { + "epoch": 1.171335366286079, + "grad_norm": 5.300503253936768, + "loss": 3.9352, + "lr": 0.0004924475524475525, + "step": 4130, + "tokens_trained": 2.0295936 + }, + { + "epoch": 1.1719027019360329, + "grad_norm": 4.033862590789795, + "loss": 3.9805, + "lr": 0.0004921678321678322, + "step": 4132, + "tokens_trained": 2.030575632 + }, + { + "epoch": 1.172470037585987, + "grad_norm": 3.5188965797424316, + "loss": 3.979, + "lr": 0.0004918881118881118, + "step": 4134, + "tokens_trained": 2.031559704 + }, + { + "epoch": 1.1730373732359407, + "grad_norm": 2.1571266651153564, + "loss": 3.9798, + "lr": 0.0004916083916083916, + "step": 4136, + "tokens_trained": 2.032544624 + }, + { + "epoch": 1.1736047088858945, + "grad_norm": 1.2364273071289062, + "loss": 3.971, + "lr": 0.0004913286713286713, + "step": 4138, + "tokens_trained": 2.033524816 + }, + { + "epoch": 1.1741720445358486, + "grad_norm": 2.3588576316833496, + "loss": 3.9631, + "lr": 0.0004910489510489511, + "step": 4140, + "tokens_trained": 2.034509784 + }, + { + "epoch": 1.1747393801858024, + "grad_norm": 1.2670316696166992, + "loss": 3.9317, + "lr": 0.0004907692307692308, + "step": 4142, + "tokens_trained": 2.035493456 + }, + { + "epoch": 1.1753067158357564, + "grad_norm": 3.2413010597229004, + "loss": 3.9778, + "lr": 0.0004904895104895106, + "step": 4144, + "tokens_trained": 2.03647368 + }, + { + "epoch": 1.1758740514857102, + "grad_norm": 4.079458713531494, + "loss": 3.9715, + "lr": 0.0004902097902097902, + "step": 4146, + "tokens_trained": 2.037452696 + }, + { + "epoch": 1.176441387135664, + "grad_norm": 2.3634743690490723, + "loss": 3.9857, + "lr": 0.00048993006993007, + "step": 4148, + "tokens_trained": 2.038437256 + }, + { + "epoch": 1.177008722785618, + "grad_norm": 1.7258849143981934, + "loss": 3.9044, + "lr": 0.0004896503496503497, + "step": 4150, + "tokens_trained": 2.039421224 + }, + { + "epoch": 1.177576058435572, + "grad_norm": 4.426620960235596, + "loss": 3.9366, + "lr": 0.0004893706293706293, + "step": 4152, + "tokens_trained": 2.040399768 + }, + { + "epoch": 1.178143394085526, + "grad_norm": 4.946300506591797, + "loss": 3.8394, + "lr": 0.0004890909090909091, + "step": 4154, + "tokens_trained": 2.041382744 + }, + { + "epoch": 1.1787107297354797, + "grad_norm": 7.814687252044678, + "loss": 3.9504, + "lr": 0.0004888111888111888, + "step": 4156, + "tokens_trained": 2.042364152 + }, + { + "epoch": 1.1792780653854336, + "grad_norm": 1.7227815389633179, + "loss": 3.8821, + "lr": 0.0004885314685314686, + "step": 4158, + "tokens_trained": 2.043344264 + }, + { + "epoch": 1.1798454010353876, + "grad_norm": 11.620087623596191, + "loss": 3.9375, + "lr": 0.0004882517482517483, + "step": 4160, + "tokens_trained": 2.04432976 + }, + { + "epoch": 1.1804127366853414, + "grad_norm": 11.146257400512695, + "loss": 3.9933, + "lr": 0.000487972027972028, + "step": 4162, + "tokens_trained": 2.0453136 + }, + { + "epoch": 1.1809800723352954, + "grad_norm": 9.995295524597168, + "loss": 3.9977, + "lr": 0.0004876923076923077, + "step": 4164, + "tokens_trained": 2.046294384 + }, + { + "epoch": 1.1815474079852493, + "grad_norm": 9.448521614074707, + "loss": 3.8709, + "lr": 0.00048741258741258743, + "step": 4166, + "tokens_trained": 2.047279192 + }, + { + "epoch": 1.182114743635203, + "grad_norm": 2.3229587078094482, + "loss": 3.9194, + "lr": 0.0004871328671328671, + "step": 4168, + "tokens_trained": 2.048260136 + }, + { + "epoch": 1.1826820792851571, + "grad_norm": 3.8930304050445557, + "loss": 3.9447, + "lr": 0.00048685314685314687, + "step": 4170, + "tokens_trained": 2.049238496 + }, + { + "epoch": 1.183249414935111, + "grad_norm": 6.03069543838501, + "loss": 3.9134, + "lr": 0.00048657342657342656, + "step": 4172, + "tokens_trained": 2.050226352 + }, + { + "epoch": 1.183816750585065, + "grad_norm": 6.509665489196777, + "loss": 3.9005, + "lr": 0.0004862937062937063, + "step": 4174, + "tokens_trained": 2.05121248 + }, + { + "epoch": 1.1843840862350188, + "grad_norm": 2.0728557109832764, + "loss": 3.9646, + "lr": 0.000486013986013986, + "step": 4176, + "tokens_trained": 2.052196784 + }, + { + "epoch": 1.1849514218849726, + "grad_norm": 1.972641944885254, + "loss": 3.9529, + "lr": 0.0004857342657342658, + "step": 4178, + "tokens_trained": 2.053177512 + }, + { + "epoch": 1.1855187575349266, + "grad_norm": 6.664553165435791, + "loss": 3.9424, + "lr": 0.0004854545454545455, + "step": 4180, + "tokens_trained": 2.054159928 + }, + { + "epoch": 1.1860860931848805, + "grad_norm": 7.182534217834473, + "loss": 3.9572, + "lr": 0.00048517482517482517, + "step": 4182, + "tokens_trained": 2.05514288 + }, + { + "epoch": 1.1866534288348345, + "grad_norm": 3.3657350540161133, + "loss": 3.9027, + "lr": 0.0004848951048951049, + "step": 4184, + "tokens_trained": 2.056127256 + }, + { + "epoch": 1.1872207644847883, + "grad_norm": 3.8826489448547363, + "loss": 3.9045, + "lr": 0.0004846153846153846, + "step": 4186, + "tokens_trained": 2.057110184 + }, + { + "epoch": 1.1877881001347421, + "grad_norm": 3.4556474685668945, + "loss": 3.9407, + "lr": 0.00048433566433566435, + "step": 4188, + "tokens_trained": 2.058090016 + }, + { + "epoch": 1.1883554357846962, + "grad_norm": 5.431522846221924, + "loss": 3.93, + "lr": 0.00048405594405594404, + "step": 4190, + "tokens_trained": 2.059071208 + }, + { + "epoch": 1.18892277143465, + "grad_norm": 3.987600803375244, + "loss": 3.9276, + "lr": 0.0004837762237762238, + "step": 4192, + "tokens_trained": 2.060047448 + }, + { + "epoch": 1.189490107084604, + "grad_norm": 5.114170074462891, + "loss": 3.9685, + "lr": 0.0004834965034965035, + "step": 4194, + "tokens_trained": 2.0610266 + }, + { + "epoch": 1.1900574427345578, + "grad_norm": 3.948340654373169, + "loss": 3.9357, + "lr": 0.0004832167832167833, + "step": 4196, + "tokens_trained": 2.062014792 + }, + { + "epoch": 1.1906247783845116, + "grad_norm": 4.607158660888672, + "loss": 3.9441, + "lr": 0.00048293706293706297, + "step": 4198, + "tokens_trained": 2.062993768 + }, + { + "epoch": 1.1911921140344657, + "grad_norm": 2.860197067260742, + "loss": 3.9469, + "lr": 0.00048265734265734266, + "step": 4200, + "tokens_trained": 2.063974352 + }, + { + "epoch": 1.1917594496844195, + "grad_norm": 4.8133544921875, + "loss": 3.9549, + "lr": 0.0004823776223776224, + "step": 4202, + "tokens_trained": 2.064955 + }, + { + "epoch": 1.1923267853343735, + "grad_norm": 3.1824069023132324, + "loss": 3.9589, + "lr": 0.0004820979020979021, + "step": 4204, + "tokens_trained": 2.065938728 + }, + { + "epoch": 1.1928941209843273, + "grad_norm": 4.413929462432861, + "loss": 3.9259, + "lr": 0.00048181818181818184, + "step": 4206, + "tokens_trained": 2.066920408 + }, + { + "epoch": 1.1934614566342812, + "grad_norm": 4.193307876586914, + "loss": 3.8911, + "lr": 0.0004815384615384615, + "step": 4208, + "tokens_trained": 2.067904384 + }, + { + "epoch": 1.1940287922842352, + "grad_norm": 3.4476332664489746, + "loss": 3.9646, + "lr": 0.00048125874125874127, + "step": 4210, + "tokens_trained": 2.068888184 + }, + { + "epoch": 1.194596127934189, + "grad_norm": 1.2195734977722168, + "loss": 3.9053, + "lr": 0.00048097902097902096, + "step": 4212, + "tokens_trained": 2.069866408 + }, + { + "epoch": 1.195163463584143, + "grad_norm": 2.1013519763946533, + "loss": 3.9806, + "lr": 0.00048069930069930076, + "step": 4214, + "tokens_trained": 2.070848272 + }, + { + "epoch": 1.1957307992340969, + "grad_norm": 6.16254186630249, + "loss": 3.99, + "lr": 0.00048041958041958045, + "step": 4216, + "tokens_trained": 2.071833968 + }, + { + "epoch": 1.1962981348840507, + "grad_norm": 4.7692179679870605, + "loss": 3.9775, + "lr": 0.00048013986013986014, + "step": 4218, + "tokens_trained": 2.07281356 + }, + { + "epoch": 1.1968654705340047, + "grad_norm": 3.336514949798584, + "loss": 4.0087, + "lr": 0.0004798601398601399, + "step": 4220, + "tokens_trained": 2.07380172 + }, + { + "epoch": 1.1974328061839585, + "grad_norm": 3.2661092281341553, + "loss": 3.9471, + "lr": 0.0004795804195804196, + "step": 4222, + "tokens_trained": 2.074785216 + }, + { + "epoch": 1.1980001418339126, + "grad_norm": 3.0861871242523193, + "loss": 3.9829, + "lr": 0.0004793006993006993, + "step": 4224, + "tokens_trained": 2.075770912 + }, + { + "epoch": 1.1985674774838664, + "grad_norm": 4.010982036590576, + "loss": 3.9013, + "lr": 0.000479020979020979, + "step": 4226, + "tokens_trained": 2.076755104 + }, + { + "epoch": 1.1991348131338202, + "grad_norm": 3.736706495285034, + "loss": 3.9455, + "lr": 0.00047874125874125875, + "step": 4228, + "tokens_trained": 2.077737472 + }, + { + "epoch": 1.1997021487837742, + "grad_norm": 2.741546392440796, + "loss": 3.929, + "lr": 0.00047846153846153844, + "step": 4230, + "tokens_trained": 2.078721008 + }, + { + "epoch": 1.200269484433728, + "grad_norm": 5.045975685119629, + "loss": 3.938, + "lr": 0.00047818181818181824, + "step": 4232, + "tokens_trained": 2.079705624 + }, + { + "epoch": 1.200836820083682, + "grad_norm": 6.466317653656006, + "loss": 3.9189, + "lr": 0.00047790209790209793, + "step": 4234, + "tokens_trained": 2.080689632 + }, + { + "epoch": 1.201404155733636, + "grad_norm": 10.680752754211426, + "loss": 3.924, + "lr": 0.0004776223776223776, + "step": 4236, + "tokens_trained": 2.0816728 + }, + { + "epoch": 1.2019714913835897, + "grad_norm": 4.394003868103027, + "loss": 3.9587, + "lr": 0.00047734265734265737, + "step": 4238, + "tokens_trained": 2.082649352 + }, + { + "epoch": 1.2025388270335438, + "grad_norm": 14.375049591064453, + "loss": 3.8901, + "lr": 0.00047706293706293706, + "step": 4240, + "tokens_trained": 2.083629016 + }, + { + "epoch": 1.2031061626834976, + "grad_norm": 6.259925365447998, + "loss": 3.9736, + "lr": 0.0004767832167832168, + "step": 4242, + "tokens_trained": 2.084612464 + }, + { + "epoch": 1.2036734983334516, + "grad_norm": 7.176869869232178, + "loss": 3.9335, + "lr": 0.0004765034965034965, + "step": 4244, + "tokens_trained": 2.085598128 + }, + { + "epoch": 1.2042408339834054, + "grad_norm": 7.3431291580200195, + "loss": 3.9129, + "lr": 0.00047622377622377624, + "step": 4246, + "tokens_trained": 2.086582144 + }, + { + "epoch": 1.2048081696333592, + "grad_norm": 3.1388702392578125, + "loss": 3.9645, + "lr": 0.00047594405594405593, + "step": 4248, + "tokens_trained": 2.087566256 + }, + { + "epoch": 1.2053755052833133, + "grad_norm": 4.360974311828613, + "loss": 3.8965, + "lr": 0.00047566433566433573, + "step": 4250, + "tokens_trained": 2.088546896 + }, + { + "epoch": 1.2053755052833133, + "eval_loss": 0.9876537919044495, + "eval_runtime": 20.2375, + "step": 4250, + "tokens_trained": 2.088546896 + }, + { + "epoch": 1.205942840933267, + "grad_norm": 6.790876388549805, + "loss": 3.8925, + "lr": 0.0004753846153846154, + "step": 4252, + "tokens_trained": 2.089529312 + }, + { + "epoch": 1.2065101765832211, + "grad_norm": 5.942895412445068, + "loss": 3.9429, + "lr": 0.0004751048951048951, + "step": 4254, + "tokens_trained": 2.090517856 + }, + { + "epoch": 1.207077512233175, + "grad_norm": 7.182357311248779, + "loss": 3.975, + "lr": 0.00047482517482517485, + "step": 4256, + "tokens_trained": 2.091501152 + }, + { + "epoch": 1.2076448478831288, + "grad_norm": 3.092268228530884, + "loss": 3.9078, + "lr": 0.00047454545454545454, + "step": 4258, + "tokens_trained": 2.0924852 + }, + { + "epoch": 1.2082121835330828, + "grad_norm": 7.483865737915039, + "loss": 3.9469, + "lr": 0.0004742657342657343, + "step": 4260, + "tokens_trained": 2.093467328 + }, + { + "epoch": 1.2087795191830366, + "grad_norm": 6.828039169311523, + "loss": 3.9683, + "lr": 0.000473986013986014, + "step": 4262, + "tokens_trained": 2.094447 + }, + { + "epoch": 1.2093468548329906, + "grad_norm": 2.1174066066741943, + "loss": 3.9575, + "lr": 0.0004737062937062937, + "step": 4264, + "tokens_trained": 2.095428552 + }, + { + "epoch": 1.2099141904829445, + "grad_norm": 1.7029787302017212, + "loss": 3.9174, + "lr": 0.0004734265734265734, + "step": 4266, + "tokens_trained": 2.096413944 + }, + { + "epoch": 1.2104815261328983, + "grad_norm": 8.107586860656738, + "loss": 3.9526, + "lr": 0.0004731468531468531, + "step": 4268, + "tokens_trained": 2.097395416 + }, + { + "epoch": 1.2110488617828523, + "grad_norm": 6.090738773345947, + "loss": 3.8711, + "lr": 0.0004728671328671329, + "step": 4270, + "tokens_trained": 2.098379488 + }, + { + "epoch": 1.2116161974328061, + "grad_norm": 3.09671950340271, + "loss": 3.9489, + "lr": 0.0004725874125874126, + "step": 4272, + "tokens_trained": 2.099365672 + }, + { + "epoch": 1.2121835330827602, + "grad_norm": 1.3280375003814697, + "loss": 3.8766, + "lr": 0.00047230769230769234, + "step": 4274, + "tokens_trained": 2.100345872 + }, + { + "epoch": 1.212750868732714, + "grad_norm": 2.2725517749786377, + "loss": 3.9298, + "lr": 0.00047202797202797203, + "step": 4276, + "tokens_trained": 2.101330144 + }, + { + "epoch": 1.2133182043826678, + "grad_norm": 7.571750164031982, + "loss": 3.9129, + "lr": 0.00047174825174825177, + "step": 4278, + "tokens_trained": 2.102310504 + }, + { + "epoch": 1.2138855400326218, + "grad_norm": 5.49086856842041, + "loss": 3.9257, + "lr": 0.00047146853146853146, + "step": 4280, + "tokens_trained": 2.10329544 + }, + { + "epoch": 1.2144528756825756, + "grad_norm": 3.936779737472534, + "loss": 3.9055, + "lr": 0.0004711888111888112, + "step": 4282, + "tokens_trained": 2.104280736 + }, + { + "epoch": 1.2150202113325297, + "grad_norm": 3.1779263019561768, + "loss": 3.9624, + "lr": 0.0004709090909090909, + "step": 4284, + "tokens_trained": 2.10526688 + }, + { + "epoch": 1.2155875469824835, + "grad_norm": 2.7246220111846924, + "loss": 3.9584, + "lr": 0.0004706293706293706, + "step": 4286, + "tokens_trained": 2.106249208 + }, + { + "epoch": 1.2161548826324373, + "grad_norm": 6.718515396118164, + "loss": 3.9084, + "lr": 0.0004703496503496504, + "step": 4288, + "tokens_trained": 2.107231312 + }, + { + "epoch": 1.2167222182823914, + "grad_norm": 5.000235080718994, + "loss": 3.9648, + "lr": 0.0004700699300699301, + "step": 4290, + "tokens_trained": 2.108215624 + }, + { + "epoch": 1.2172895539323452, + "grad_norm": 4.756376266479492, + "loss": 3.9848, + "lr": 0.0004697902097902098, + "step": 4292, + "tokens_trained": 2.10920156 + }, + { + "epoch": 1.2178568895822992, + "grad_norm": 1.9365978240966797, + "loss": 3.9517, + "lr": 0.0004695104895104895, + "step": 4294, + "tokens_trained": 2.110182936 + }, + { + "epoch": 1.218424225232253, + "grad_norm": 5.350283622741699, + "loss": 3.9737, + "lr": 0.00046923076923076926, + "step": 4296, + "tokens_trained": 2.111164808 + }, + { + "epoch": 1.2189915608822068, + "grad_norm": 4.543917655944824, + "loss": 3.9111, + "lr": 0.00046895104895104895, + "step": 4298, + "tokens_trained": 2.112146848 + }, + { + "epoch": 1.2195588965321609, + "grad_norm": 5.1316938400268555, + "loss": 3.9194, + "lr": 0.0004686713286713287, + "step": 4300, + "tokens_trained": 2.113134184 + }, + { + "epoch": 1.2201262321821147, + "grad_norm": 3.0844085216522217, + "loss": 3.8872, + "lr": 0.0004683916083916084, + "step": 4302, + "tokens_trained": 2.114120832 + }, + { + "epoch": 1.2206935678320687, + "grad_norm": 2.2305877208709717, + "loss": 3.9497, + "lr": 0.00046811188811188807, + "step": 4304, + "tokens_trained": 2.115103856 + }, + { + "epoch": 1.2212609034820225, + "grad_norm": 1.7684617042541504, + "loss": 3.9218, + "lr": 0.00046783216783216787, + "step": 4306, + "tokens_trained": 2.116086968 + }, + { + "epoch": 1.2218282391319764, + "grad_norm": 6.3064680099487305, + "loss": 3.9657, + "lr": 0.00046755244755244756, + "step": 4308, + "tokens_trained": 2.11707108 + }, + { + "epoch": 1.2223955747819304, + "grad_norm": 2.4910192489624023, + "loss": 3.8588, + "lr": 0.0004672727272727273, + "step": 4310, + "tokens_trained": 2.118053928 + }, + { + "epoch": 1.2229629104318842, + "grad_norm": 3.482459306716919, + "loss": 3.9213, + "lr": 0.000466993006993007, + "step": 4312, + "tokens_trained": 2.119037056 + }, + { + "epoch": 1.2235302460818382, + "grad_norm": 6.552737712860107, + "loss": 3.8804, + "lr": 0.00046671328671328674, + "step": 4314, + "tokens_trained": 2.120019576 + }, + { + "epoch": 1.224097581731792, + "grad_norm": 5.225849628448486, + "loss": 3.9562, + "lr": 0.00046643356643356643, + "step": 4316, + "tokens_trained": 2.121000112 + }, + { + "epoch": 1.2246649173817459, + "grad_norm": 2.1894407272338867, + "loss": 3.8752, + "lr": 0.0004661538461538462, + "step": 4318, + "tokens_trained": 2.121988376 + }, + { + "epoch": 1.2252322530317, + "grad_norm": 1.5741831064224243, + "loss": 3.953, + "lr": 0.00046587412587412587, + "step": 4320, + "tokens_trained": 2.122965864 + }, + { + "epoch": 1.2257995886816537, + "grad_norm": 4.103208065032959, + "loss": 3.9216, + "lr": 0.00046559440559440556, + "step": 4322, + "tokens_trained": 2.123950848 + }, + { + "epoch": 1.2263669243316078, + "grad_norm": 7.347278118133545, + "loss": 3.9547, + "lr": 0.00046531468531468536, + "step": 4324, + "tokens_trained": 2.124933448 + }, + { + "epoch": 1.2269342599815616, + "grad_norm": 4.8083930015563965, + "loss": 3.9711, + "lr": 0.00046503496503496505, + "step": 4326, + "tokens_trained": 2.125921528 + }, + { + "epoch": 1.2275015956315154, + "grad_norm": 5.4488654136657715, + "loss": 3.8941, + "lr": 0.0004647552447552448, + "step": 4328, + "tokens_trained": 2.126897152 + }, + { + "epoch": 1.2280689312814694, + "grad_norm": 6.24332332611084, + "loss": 3.9178, + "lr": 0.0004644755244755245, + "step": 4330, + "tokens_trained": 2.127881384 + }, + { + "epoch": 1.2286362669314232, + "grad_norm": 5.97770881652832, + "loss": 3.8804, + "lr": 0.0004641958041958042, + "step": 4332, + "tokens_trained": 2.128864008 + }, + { + "epoch": 1.2292036025813773, + "grad_norm": 3.901036500930786, + "loss": 3.8968, + "lr": 0.0004639160839160839, + "step": 4334, + "tokens_trained": 2.129847632 + }, + { + "epoch": 1.229770938231331, + "grad_norm": 5.377021789550781, + "loss": 3.9565, + "lr": 0.00046363636363636366, + "step": 4336, + "tokens_trained": 2.130832296 + }, + { + "epoch": 1.230338273881285, + "grad_norm": 4.565158367156982, + "loss": 3.9672, + "lr": 0.00046335664335664335, + "step": 4338, + "tokens_trained": 2.131814648 + }, + { + "epoch": 1.230905609531239, + "grad_norm": 1.2882499694824219, + "loss": 3.9515, + "lr": 0.00046307692307692304, + "step": 4340, + "tokens_trained": 2.132797872 + }, + { + "epoch": 1.2314729451811928, + "grad_norm": 0.9845411777496338, + "loss": 3.9057, + "lr": 0.00046279720279720284, + "step": 4342, + "tokens_trained": 2.133780992 + }, + { + "epoch": 1.2320402808311468, + "grad_norm": 3.7839152812957764, + "loss": 3.8909, + "lr": 0.00046251748251748253, + "step": 4344, + "tokens_trained": 2.134762864 + }, + { + "epoch": 1.2326076164811006, + "grad_norm": 3.8872299194335938, + "loss": 3.9262, + "lr": 0.0004622377622377623, + "step": 4346, + "tokens_trained": 2.135743504 + }, + { + "epoch": 1.2331749521310544, + "grad_norm": 4.538093566894531, + "loss": 3.9098, + "lr": 0.00046195804195804196, + "step": 4348, + "tokens_trained": 2.136727288 + }, + { + "epoch": 1.2337422877810085, + "grad_norm": 6.453696250915527, + "loss": 3.9103, + "lr": 0.0004616783216783217, + "step": 4350, + "tokens_trained": 2.137710256 + }, + { + "epoch": 1.2343096234309623, + "grad_norm": 4.033708572387695, + "loss": 3.9144, + "lr": 0.0004613986013986014, + "step": 4352, + "tokens_trained": 2.138691568 + }, + { + "epoch": 1.2348769590809163, + "grad_norm": 4.32963752746582, + "loss": 3.9154, + "lr": 0.00046111888111888114, + "step": 4354, + "tokens_trained": 2.13967628 + }, + { + "epoch": 1.2354442947308701, + "grad_norm": 3.0617220401763916, + "loss": 3.8984, + "lr": 0.00046083916083916083, + "step": 4356, + "tokens_trained": 2.140659368 + }, + { + "epoch": 1.236011630380824, + "grad_norm": 2.51361346244812, + "loss": 3.8971, + "lr": 0.0004605594405594405, + "step": 4358, + "tokens_trained": 2.141644648 + }, + { + "epoch": 1.236578966030778, + "grad_norm": 3.6975977420806885, + "loss": 3.9208, + "lr": 0.0004602797202797203, + "step": 4360, + "tokens_trained": 2.142628176 + }, + { + "epoch": 1.2371463016807318, + "grad_norm": 5.2992844581604, + "loss": 3.8855, + "lr": 0.00046, + "step": 4362, + "tokens_trained": 2.143610328 + }, + { + "epoch": 1.2377136373306858, + "grad_norm": 4.426636695861816, + "loss": 3.893, + "lr": 0.00045972027972027976, + "step": 4364, + "tokens_trained": 2.144591512 + }, + { + "epoch": 1.2382809729806397, + "grad_norm": 4.131166458129883, + "loss": 3.9098, + "lr": 0.00045944055944055945, + "step": 4366, + "tokens_trained": 2.14557312 + }, + { + "epoch": 1.2388483086305935, + "grad_norm": 2.9156816005706787, + "loss": 3.9771, + "lr": 0.0004591608391608392, + "step": 4368, + "tokens_trained": 2.146551592 + }, + { + "epoch": 1.2394156442805475, + "grad_norm": 3.8412554264068604, + "loss": 3.9584, + "lr": 0.0004588811188811189, + "step": 4370, + "tokens_trained": 2.147533032 + }, + { + "epoch": 1.2399829799305013, + "grad_norm": 3.1897640228271484, + "loss": 3.8253, + "lr": 0.0004586013986013986, + "step": 4372, + "tokens_trained": 2.148517592 + }, + { + "epoch": 1.2405503155804554, + "grad_norm": 4.066483020782471, + "loss": 3.8905, + "lr": 0.0004583216783216783, + "step": 4374, + "tokens_trained": 2.149502368 + }, + { + "epoch": 1.2408339834054323, + "eval_loss": 0.9844964146614075, + "eval_runtime": 21.0593, + "step": 4375, + "tokens_trained": 2.14999612 + }, + { + "epoch": 1.2411176512304092, + "grad_norm": 2.0596890449523926, + "loss": 3.9142, + "lr": 0.000458041958041958, + "step": 4376, + "tokens_trained": 2.15048712 + }, + { + "epoch": 1.241684986880363, + "grad_norm": 4.4018988609313965, + "loss": 3.9487, + "lr": 0.0004577622377622378, + "step": 4378, + "tokens_trained": 2.151468832 + }, + { + "epoch": 1.242252322530317, + "grad_norm": 3.294774055480957, + "loss": 3.979, + "lr": 0.0004574825174825175, + "step": 4380, + "tokens_trained": 2.152451456 + }, + { + "epoch": 1.2428196581802708, + "grad_norm": 2.5546209812164307, + "loss": 3.9135, + "lr": 0.00045720279720279724, + "step": 4382, + "tokens_trained": 2.1534348 + }, + { + "epoch": 1.2433869938302249, + "grad_norm": 2.1771605014801025, + "loss": 3.9207, + "lr": 0.00045692307692307693, + "step": 4384, + "tokens_trained": 2.154414104 + }, + { + "epoch": 1.2439543294801787, + "grad_norm": 3.5681049823760986, + "loss": 3.8632, + "lr": 0.0004566433566433567, + "step": 4386, + "tokens_trained": 2.155399088 + }, + { + "epoch": 1.2445216651301325, + "grad_norm": 5.588647365570068, + "loss": 3.9769, + "lr": 0.00045636363636363637, + "step": 4388, + "tokens_trained": 2.15638104 + }, + { + "epoch": 1.2450890007800866, + "grad_norm": 5.798253059387207, + "loss": 3.9167, + "lr": 0.00045608391608391606, + "step": 4390, + "tokens_trained": 2.157366296 + }, + { + "epoch": 1.2456563364300404, + "grad_norm": 2.425339698791504, + "loss": 3.9152, + "lr": 0.0004558041958041958, + "step": 4392, + "tokens_trained": 2.158347208 + }, + { + "epoch": 1.2462236720799944, + "grad_norm": 4.4874444007873535, + "loss": 3.9171, + "lr": 0.0004555244755244755, + "step": 4394, + "tokens_trained": 2.159329056 + }, + { + "epoch": 1.2467910077299482, + "grad_norm": 4.653798580169678, + "loss": 3.9308, + "lr": 0.00045524475524475524, + "step": 4396, + "tokens_trained": 2.160312792 + }, + { + "epoch": 1.247358343379902, + "grad_norm": 5.013849258422852, + "loss": 3.9224, + "lr": 0.000454965034965035, + "step": 4398, + "tokens_trained": 2.161298728 + }, + { + "epoch": 1.247925679029856, + "grad_norm": 3.3346633911132812, + "loss": 3.9482, + "lr": 0.0004546853146853147, + "step": 4400, + "tokens_trained": 2.162280664 + }, + { + "epoch": 1.2484930146798099, + "grad_norm": 2.408282518386841, + "loss": 3.9468, + "lr": 0.0004544055944055944, + "step": 4402, + "tokens_trained": 2.163262608 + }, + { + "epoch": 1.249060350329764, + "grad_norm": 2.3152034282684326, + "loss": 3.9346, + "lr": 0.00045412587412587416, + "step": 4404, + "tokens_trained": 2.16424488 + }, + { + "epoch": 1.2496276859797177, + "grad_norm": 4.722060680389404, + "loss": 3.93, + "lr": 0.00045384615384615385, + "step": 4406, + "tokens_trained": 2.165227184 + }, + { + "epoch": 1.2501950216296716, + "grad_norm": 2.3931281566619873, + "loss": 3.9412, + "lr": 0.00045356643356643354, + "step": 4408, + "tokens_trained": 2.166208312 + }, + { + "epoch": 1.2507623572796256, + "grad_norm": 3.703711986541748, + "loss": 3.9661, + "lr": 0.0004532867132867133, + "step": 4410, + "tokens_trained": 2.167191896 + }, + { + "epoch": 1.2513296929295794, + "grad_norm": 3.168426036834717, + "loss": 3.9108, + "lr": 0.000453006993006993, + "step": 4412, + "tokens_trained": 2.1681734 + }, + { + "epoch": 1.2518970285795334, + "grad_norm": 4.465419769287109, + "loss": 3.9224, + "lr": 0.0004527272727272727, + "step": 4414, + "tokens_trained": 2.16915824 + }, + { + "epoch": 1.2524643642294873, + "grad_norm": 3.145385265350342, + "loss": 3.9317, + "lr": 0.00045244755244755247, + "step": 4416, + "tokens_trained": 2.170140944 + }, + { + "epoch": 1.253031699879441, + "grad_norm": 3.0174384117126465, + "loss": 3.9592, + "lr": 0.0004521678321678322, + "step": 4418, + "tokens_trained": 2.171127312 + }, + { + "epoch": 1.2535990355293951, + "grad_norm": 2.9682352542877197, + "loss": 3.9248, + "lr": 0.0004518881118881119, + "step": 4420, + "tokens_trained": 2.17211552 + }, + { + "epoch": 1.254166371179349, + "grad_norm": 4.654287338256836, + "loss": 3.9592, + "lr": 0.00045160839160839165, + "step": 4422, + "tokens_trained": 2.173101456 + }, + { + "epoch": 1.254733706829303, + "grad_norm": 5.210162162780762, + "loss": 3.9463, + "lr": 0.00045132867132867134, + "step": 4424, + "tokens_trained": 2.174081192 + }, + { + "epoch": 1.2553010424792568, + "grad_norm": 1.6227176189422607, + "loss": 3.8894, + "lr": 0.000451048951048951, + "step": 4426, + "tokens_trained": 2.175063888 + }, + { + "epoch": 1.2558683781292106, + "grad_norm": 1.6847152709960938, + "loss": 3.9207, + "lr": 0.00045076923076923077, + "step": 4428, + "tokens_trained": 2.176047656 + }, + { + "epoch": 1.2564357137791646, + "grad_norm": 7.743977069854736, + "loss": 3.9202, + "lr": 0.00045048951048951046, + "step": 4430, + "tokens_trained": 2.177030728 + }, + { + "epoch": 1.2570030494291184, + "grad_norm": 5.493525981903076, + "loss": 3.8951, + "lr": 0.0004502097902097902, + "step": 4432, + "tokens_trained": 2.178010048 + }, + { + "epoch": 1.2575703850790725, + "grad_norm": 4.744298934936523, + "loss": 3.9641, + "lr": 0.00044993006993006995, + "step": 4434, + "tokens_trained": 2.178992816 + }, + { + "epoch": 1.2581377207290263, + "grad_norm": 5.230485916137695, + "loss": 3.9552, + "lr": 0.0004496503496503497, + "step": 4436, + "tokens_trained": 2.179977048 + }, + { + "epoch": 1.2587050563789801, + "grad_norm": 2.7955129146575928, + "loss": 3.9462, + "lr": 0.0004493706293706294, + "step": 4438, + "tokens_trained": 2.18096108 + }, + { + "epoch": 1.2592723920289342, + "grad_norm": 4.869340419769287, + "loss": 3.8819, + "lr": 0.00044909090909090913, + "step": 4440, + "tokens_trained": 2.181941176 + }, + { + "epoch": 1.259839727678888, + "grad_norm": 4.538938045501709, + "loss": 3.8967, + "lr": 0.0004488111888111888, + "step": 4442, + "tokens_trained": 2.182923032 + }, + { + "epoch": 1.260407063328842, + "grad_norm": 4.085853576660156, + "loss": 3.9155, + "lr": 0.0004485314685314685, + "step": 4444, + "tokens_trained": 2.183902584 + }, + { + "epoch": 1.2609743989787958, + "grad_norm": 6.15781831741333, + "loss": 3.9379, + "lr": 0.00044825174825174826, + "step": 4446, + "tokens_trained": 2.184884968 + }, + { + "epoch": 1.2615417346287496, + "grad_norm": 2.5738606452941895, + "loss": 3.9642, + "lr": 0.00044797202797202795, + "step": 4448, + "tokens_trained": 2.185870952 + }, + { + "epoch": 1.2621090702787037, + "grad_norm": 4.356530666351318, + "loss": 3.8908, + "lr": 0.0004476923076923077, + "step": 4450, + "tokens_trained": 2.186854928 + }, + { + "epoch": 1.2626764059286575, + "grad_norm": 5.518537998199463, + "loss": 3.8954, + "lr": 0.00044741258741258744, + "step": 4452, + "tokens_trained": 2.187847 + }, + { + "epoch": 1.2632437415786115, + "grad_norm": 7.3632354736328125, + "loss": 3.9363, + "lr": 0.0004471328671328672, + "step": 4454, + "tokens_trained": 2.188829592 + }, + { + "epoch": 1.2638110772285653, + "grad_norm": 0.9625980854034424, + "loss": 3.9416, + "lr": 0.00044685314685314687, + "step": 4456, + "tokens_trained": 2.189811456 + }, + { + "epoch": 1.2643784128785192, + "grad_norm": 4.0898003578186035, + "loss": 3.9133, + "lr": 0.0004465734265734266, + "step": 4458, + "tokens_trained": 2.19079428 + }, + { + "epoch": 1.2649457485284732, + "grad_norm": 6.740445137023926, + "loss": 3.9282, + "lr": 0.0004462937062937063, + "step": 4460, + "tokens_trained": 2.1917786 + }, + { + "epoch": 1.265513084178427, + "grad_norm": 6.742666244506836, + "loss": 3.9077, + "lr": 0.000446013986013986, + "step": 4462, + "tokens_trained": 2.192758016 + }, + { + "epoch": 1.266080419828381, + "grad_norm": 4.592698097229004, + "loss": 3.9123, + "lr": 0.00044573426573426574, + "step": 4464, + "tokens_trained": 2.193741496 + }, + { + "epoch": 1.2666477554783349, + "grad_norm": 8.934327125549316, + "loss": 3.9647, + "lr": 0.00044545454545454543, + "step": 4466, + "tokens_trained": 2.194723584 + }, + { + "epoch": 1.2672150911282887, + "grad_norm": 4.280580997467041, + "loss": 3.9189, + "lr": 0.0004451748251748252, + "step": 4468, + "tokens_trained": 2.195708432 + }, + { + "epoch": 1.2677824267782427, + "grad_norm": 3.257995843887329, + "loss": 3.9698, + "lr": 0.0004448951048951049, + "step": 4470, + "tokens_trained": 2.196691336 + }, + { + "epoch": 1.2683497624281965, + "grad_norm": 6.521494388580322, + "loss": 3.9676, + "lr": 0.00044461538461538466, + "step": 4472, + "tokens_trained": 2.197674528 + }, + { + "epoch": 1.2689170980781506, + "grad_norm": 6.169503211975098, + "loss": 3.9404, + "lr": 0.00044433566433566435, + "step": 4474, + "tokens_trained": 2.198658448 + }, + { + "epoch": 1.2694844337281044, + "grad_norm": 3.5009562969207764, + "loss": 3.9229, + "lr": 0.0004440559440559441, + "step": 4476, + "tokens_trained": 2.199646232 + }, + { + "epoch": 1.2700517693780582, + "grad_norm": 3.2101058959960938, + "loss": 3.9536, + "lr": 0.0004437762237762238, + "step": 4478, + "tokens_trained": 2.200630024 + }, + { + "epoch": 1.2706191050280122, + "grad_norm": 5.417990684509277, + "loss": 3.9591, + "lr": 0.0004434965034965035, + "step": 4480, + "tokens_trained": 2.2016182 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 3.1346352100372314, + "loss": 3.9408, + "lr": 0.0004432167832167832, + "step": 4482, + "tokens_trained": 2.2025994 + }, + { + "epoch": 1.27175377632792, + "grad_norm": 3.2468717098236084, + "loss": 3.922, + "lr": 0.0004429370629370629, + "step": 4484, + "tokens_trained": 2.203581424 + }, + { + "epoch": 1.272321111977874, + "grad_norm": 5.069144248962402, + "loss": 3.9616, + "lr": 0.00044265734265734266, + "step": 4486, + "tokens_trained": 2.204562264 + }, + { + "epoch": 1.2728884476278277, + "grad_norm": 4.097993850708008, + "loss": 3.931, + "lr": 0.0004423776223776224, + "step": 4488, + "tokens_trained": 2.205548376 + }, + { + "epoch": 1.2734557832777817, + "grad_norm": 2.3711421489715576, + "loss": 3.9201, + "lr": 0.00044209790209790215, + "step": 4490, + "tokens_trained": 2.206535208 + }, + { + "epoch": 1.2740231189277356, + "grad_norm": 7.32819938659668, + "loss": 3.8766, + "lr": 0.00044181818181818184, + "step": 4492, + "tokens_trained": 2.207522192 + }, + { + "epoch": 1.2745904545776896, + "grad_norm": 3.9666519165039062, + "loss": 3.894, + "lr": 0.00044153846153846153, + "step": 4494, + "tokens_trained": 2.208506616 + }, + { + "epoch": 1.2751577902276434, + "grad_norm": 2.1190407276153564, + "loss": 3.9141, + "lr": 0.0004412587412587413, + "step": 4496, + "tokens_trained": 2.209489192 + }, + { + "epoch": 1.2757251258775972, + "grad_norm": 1.3682332038879395, + "loss": 3.8666, + "lr": 0.00044097902097902096, + "step": 4498, + "tokens_trained": 2.210472392 + }, + { + "epoch": 1.2762924615275513, + "grad_norm": 2.5941426753997803, + "loss": 3.8921, + "lr": 0.0004406993006993007, + "step": 4500, + "tokens_trained": 2.211451384 + }, + { + "epoch": 1.2762924615275513, + "eval_loss": 0.9826880097389221, + "eval_runtime": 20.931, + "step": 4500, + "tokens_trained": 2.211451384 + }, + { + "epoch": 1.276859797177505, + "grad_norm": 3.0399274826049805, + "loss": 3.9331, + "lr": 0.0004404195804195804, + "step": 4502, + "tokens_trained": 2.212433736 + }, + { + "epoch": 1.2774271328274591, + "grad_norm": 4.30709981918335, + "loss": 3.9199, + "lr": 0.00044013986013986014, + "step": 4504, + "tokens_trained": 2.21341728 + }, + { + "epoch": 1.277994468477413, + "grad_norm": 3.0569705963134766, + "loss": 3.8956, + "lr": 0.0004398601398601399, + "step": 4506, + "tokens_trained": 2.214397528 + }, + { + "epoch": 1.2785618041273668, + "grad_norm": 4.9559197425842285, + "loss": 3.9736, + "lr": 0.00043958041958041963, + "step": 4508, + "tokens_trained": 2.215381712 + }, + { + "epoch": 1.2791291397773208, + "grad_norm": 2.7426505088806152, + "loss": 3.9042, + "lr": 0.0004393006993006993, + "step": 4510, + "tokens_trained": 2.216361048 + }, + { + "epoch": 1.2796964754272746, + "grad_norm": 1.8043859004974365, + "loss": 3.8892, + "lr": 0.000439020979020979, + "step": 4512, + "tokens_trained": 2.217344128 + }, + { + "epoch": 1.2802638110772286, + "grad_norm": 4.298875331878662, + "loss": 3.9133, + "lr": 0.00043874125874125876, + "step": 4514, + "tokens_trained": 2.218327112 + }, + { + "epoch": 1.2808311467271825, + "grad_norm": 2.752638339996338, + "loss": 3.9078, + "lr": 0.00043846153846153845, + "step": 4516, + "tokens_trained": 2.219311704 + }, + { + "epoch": 1.2813984823771363, + "grad_norm": 4.202718257904053, + "loss": 3.9452, + "lr": 0.0004381818181818182, + "step": 4518, + "tokens_trained": 2.220295888 + }, + { + "epoch": 1.2819658180270903, + "grad_norm": 3.5449273586273193, + "loss": 3.8367, + "lr": 0.0004379020979020979, + "step": 4520, + "tokens_trained": 2.221281456 + }, + { + "epoch": 1.2825331536770441, + "grad_norm": 2.472935199737549, + "loss": 3.8939, + "lr": 0.00043762237762237763, + "step": 4522, + "tokens_trained": 2.222262496 + }, + { + "epoch": 1.2831004893269982, + "grad_norm": 1.1959093809127808, + "loss": 3.9271, + "lr": 0.0004373426573426573, + "step": 4524, + "tokens_trained": 2.223247352 + }, + { + "epoch": 1.283667824976952, + "grad_norm": 2.553889036178589, + "loss": 3.9083, + "lr": 0.0004370629370629371, + "step": 4526, + "tokens_trained": 2.224231696 + }, + { + "epoch": 1.2842351606269058, + "grad_norm": 2.028510570526123, + "loss": 3.9004, + "lr": 0.0004367832167832168, + "step": 4528, + "tokens_trained": 2.225211808 + }, + { + "epoch": 1.2848024962768598, + "grad_norm": 2.498624086380005, + "loss": 3.9138, + "lr": 0.0004365034965034965, + "step": 4530, + "tokens_trained": 2.226197488 + }, + { + "epoch": 1.2853698319268136, + "grad_norm": 2.689389228820801, + "loss": 3.9439, + "lr": 0.00043622377622377624, + "step": 4532, + "tokens_trained": 2.227181176 + }, + { + "epoch": 1.2859371675767677, + "grad_norm": 6.014649868011475, + "loss": 3.9232, + "lr": 0.00043594405594405593, + "step": 4534, + "tokens_trained": 2.228163272 + }, + { + "epoch": 1.2865045032267215, + "grad_norm": 4.911413192749023, + "loss": 3.9831, + "lr": 0.0004356643356643357, + "step": 4536, + "tokens_trained": 2.229142248 + }, + { + "epoch": 1.2870718388766753, + "grad_norm": 3.633075714111328, + "loss": 3.944, + "lr": 0.00043538461538461537, + "step": 4538, + "tokens_trained": 2.23012372 + }, + { + "epoch": 1.2876391745266293, + "grad_norm": 2.4579458236694336, + "loss": 3.9051, + "lr": 0.0004351048951048951, + "step": 4540, + "tokens_trained": 2.231109312 + }, + { + "epoch": 1.2882065101765832, + "grad_norm": 2.5251097679138184, + "loss": 3.9248, + "lr": 0.0004348251748251748, + "step": 4542, + "tokens_trained": 2.232096216 + }, + { + "epoch": 1.2887738458265372, + "grad_norm": 4.023996353149414, + "loss": 3.9358, + "lr": 0.0004345454545454546, + "step": 4544, + "tokens_trained": 2.233081224 + }, + { + "epoch": 1.289341181476491, + "grad_norm": 3.8658416271209717, + "loss": 3.9193, + "lr": 0.0004342657342657343, + "step": 4546, + "tokens_trained": 2.234069488 + }, + { + "epoch": 1.2899085171264448, + "grad_norm": 6.1119914054870605, + "loss": 3.8991, + "lr": 0.000433986013986014, + "step": 4548, + "tokens_trained": 2.23505432 + }, + { + "epoch": 1.2904758527763989, + "grad_norm": 3.834200620651245, + "loss": 3.9844, + "lr": 0.0004337062937062937, + "step": 4550, + "tokens_trained": 2.236034064 + }, + { + "epoch": 1.2910431884263527, + "grad_norm": 3.4992194175720215, + "loss": 3.9358, + "lr": 0.0004334265734265734, + "step": 4552, + "tokens_trained": 2.23701784 + }, + { + "epoch": 1.2916105240763067, + "grad_norm": 5.517240524291992, + "loss": 3.9046, + "lr": 0.00043314685314685316, + "step": 4554, + "tokens_trained": 2.238003144 + }, + { + "epoch": 1.2921778597262605, + "grad_norm": 3.596975803375244, + "loss": 3.9073, + "lr": 0.00043286713286713285, + "step": 4556, + "tokens_trained": 2.238986056 + }, + { + "epoch": 1.2927451953762144, + "grad_norm": 6.674678325653076, + "loss": 3.9285, + "lr": 0.0004325874125874126, + "step": 4558, + "tokens_trained": 2.239968824 + }, + { + "epoch": 1.2933125310261684, + "grad_norm": 3.589822292327881, + "loss": 3.9137, + "lr": 0.0004323076923076923, + "step": 4560, + "tokens_trained": 2.24095252 + }, + { + "epoch": 1.2938798666761222, + "grad_norm": 4.785327434539795, + "loss": 3.9188, + "lr": 0.0004320279720279721, + "step": 4562, + "tokens_trained": 2.241935408 + }, + { + "epoch": 1.2944472023260762, + "grad_norm": 5.784316062927246, + "loss": 3.8804, + "lr": 0.0004317482517482518, + "step": 4564, + "tokens_trained": 2.242919696 + }, + { + "epoch": 1.29501453797603, + "grad_norm": 4.1364641189575195, + "loss": 3.9, + "lr": 0.00043146853146853147, + "step": 4566, + "tokens_trained": 2.243899048 + }, + { + "epoch": 1.2955818736259839, + "grad_norm": 4.100215435028076, + "loss": 3.875, + "lr": 0.0004311888111888112, + "step": 4568, + "tokens_trained": 2.24487848 + }, + { + "epoch": 1.296149209275938, + "grad_norm": 5.456444263458252, + "loss": 3.9252, + "lr": 0.0004309090909090909, + "step": 4570, + "tokens_trained": 2.245860712 + }, + { + "epoch": 1.2967165449258917, + "grad_norm": 4.084255695343018, + "loss": 3.8755, + "lr": 0.00043062937062937065, + "step": 4572, + "tokens_trained": 2.246846216 + }, + { + "epoch": 1.2972838805758458, + "grad_norm": 4.147522926330566, + "loss": 3.9162, + "lr": 0.00043034965034965034, + "step": 4574, + "tokens_trained": 2.24783164 + }, + { + "epoch": 1.2978512162257996, + "grad_norm": 5.48593807220459, + "loss": 3.9073, + "lr": 0.0004300699300699301, + "step": 4576, + "tokens_trained": 2.248817664 + }, + { + "epoch": 1.2984185518757534, + "grad_norm": 2.8644235134124756, + "loss": 3.9117, + "lr": 0.00042979020979020977, + "step": 4578, + "tokens_trained": 2.249802224 + }, + { + "epoch": 1.2989858875257074, + "grad_norm": 1.8577483892440796, + "loss": 3.8833, + "lr": 0.00042951048951048957, + "step": 4580, + "tokens_trained": 2.250785496 + }, + { + "epoch": 1.2995532231756612, + "grad_norm": 1.4357212781906128, + "loss": 3.8986, + "lr": 0.00042923076923076926, + "step": 4582, + "tokens_trained": 2.25177068 + }, + { + "epoch": 1.3001205588256153, + "grad_norm": 1.9124270677566528, + "loss": 3.9149, + "lr": 0.00042895104895104895, + "step": 4584, + "tokens_trained": 2.252752112 + }, + { + "epoch": 1.300687894475569, + "grad_norm": 0.6659060120582581, + "loss": 3.8666, + "lr": 0.0004286713286713287, + "step": 4586, + "tokens_trained": 2.25373604 + }, + { + "epoch": 1.301255230125523, + "grad_norm": 1.1679121255874634, + "loss": 3.9469, + "lr": 0.0004283916083916084, + "step": 4588, + "tokens_trained": 2.254720456 + }, + { + "epoch": 1.301822565775477, + "grad_norm": 2.010969877243042, + "loss": 3.9181, + "lr": 0.00042811188811188813, + "step": 4590, + "tokens_trained": 2.255698824 + }, + { + "epoch": 1.3023899014254308, + "grad_norm": 2.0586466789245605, + "loss": 3.8682, + "lr": 0.0004278321678321678, + "step": 4592, + "tokens_trained": 2.256682928 + }, + { + "epoch": 1.3029572370753848, + "grad_norm": 1.4269180297851562, + "loss": 3.935, + "lr": 0.00042755244755244756, + "step": 4594, + "tokens_trained": 2.257665184 + }, + { + "epoch": 1.3035245727253386, + "grad_norm": 3.324599504470825, + "loss": 3.9849, + "lr": 0.00042727272727272726, + "step": 4596, + "tokens_trained": 2.258650496 + }, + { + "epoch": 1.3040919083752924, + "grad_norm": 5.035736560821533, + "loss": 3.9088, + "lr": 0.00042699300699300705, + "step": 4598, + "tokens_trained": 2.259632984 + }, + { + "epoch": 1.3046592440252465, + "grad_norm": 3.3298044204711914, + "loss": 3.9033, + "lr": 0.00042671328671328674, + "step": 4600, + "tokens_trained": 2.260615528 + }, + { + "epoch": 1.3052265796752003, + "grad_norm": 1.253243088722229, + "loss": 3.9154, + "lr": 0.00042643356643356643, + "step": 4602, + "tokens_trained": 2.261605648 + }, + { + "epoch": 1.3057939153251543, + "grad_norm": 1.8505600690841675, + "loss": 3.8771, + "lr": 0.0004261538461538462, + "step": 4604, + "tokens_trained": 2.262590648 + }, + { + "epoch": 1.3063612509751081, + "grad_norm": 7.305438995361328, + "loss": 3.9323, + "lr": 0.00042587412587412587, + "step": 4606, + "tokens_trained": 2.263579136 + }, + { + "epoch": 1.306928586625062, + "grad_norm": 4.584920406341553, + "loss": 3.9097, + "lr": 0.0004255944055944056, + "step": 4608, + "tokens_trained": 2.264559496 + }, + { + "epoch": 1.307495922275016, + "grad_norm": 2.3128468990325928, + "loss": 3.8532, + "lr": 0.0004253146853146853, + "step": 4610, + "tokens_trained": 2.265543104 + }, + { + "epoch": 1.3080632579249698, + "grad_norm": 3.1513102054595947, + "loss": 3.944, + "lr": 0.00042503496503496505, + "step": 4612, + "tokens_trained": 2.266528816 + }, + { + "epoch": 1.3086305935749238, + "grad_norm": 3.1904358863830566, + "loss": 3.8706, + "lr": 0.00042475524475524474, + "step": 4614, + "tokens_trained": 2.26751496 + }, + { + "epoch": 1.3091979292248777, + "grad_norm": 2.383105516433716, + "loss": 3.925, + "lr": 0.0004244755244755245, + "step": 4616, + "tokens_trained": 2.268497744 + }, + { + "epoch": 1.3097652648748315, + "grad_norm": 2.642970561981201, + "loss": 3.918, + "lr": 0.00042419580419580423, + "step": 4618, + "tokens_trained": 2.269478888 + }, + { + "epoch": 1.3103326005247855, + "grad_norm": 1.1598117351531982, + "loss": 3.8815, + "lr": 0.0004239160839160839, + "step": 4620, + "tokens_trained": 2.270465888 + }, + { + "epoch": 1.3108999361747393, + "grad_norm": 0.9736254811286926, + "loss": 3.8866, + "lr": 0.00042363636363636366, + "step": 4622, + "tokens_trained": 2.271446656 + }, + { + "epoch": 1.3114672718246934, + "grad_norm": 2.0817017555236816, + "loss": 3.9753, + "lr": 0.00042335664335664335, + "step": 4624, + "tokens_trained": 2.272427288 + }, + { + "epoch": 1.3117509396496703, + "eval_loss": 0.9817197918891907, + "eval_runtime": 20.1783, + "step": 4625, + "tokens_trained": 2.272921208 + }, + { + "epoch": 1.3120346074746472, + "grad_norm": 4.969366550445557, + "loss": 3.8696, + "lr": 0.0004230769230769231, + "step": 4626, + "tokens_trained": 2.273412256 + }, + { + "epoch": 1.312601943124601, + "grad_norm": 3.270707130432129, + "loss": 3.9589, + "lr": 0.0004227972027972028, + "step": 4628, + "tokens_trained": 2.274396776 + }, + { + "epoch": 1.313169278774555, + "grad_norm": 2.6939852237701416, + "loss": 3.8711, + "lr": 0.00042251748251748253, + "step": 4630, + "tokens_trained": 2.27537728 + }, + { + "epoch": 1.3137366144245088, + "grad_norm": 3.0615079402923584, + "loss": 3.8899, + "lr": 0.0004222377622377622, + "step": 4632, + "tokens_trained": 2.276362448 + }, + { + "epoch": 1.3143039500744629, + "grad_norm": 3.1804049015045166, + "loss": 3.9158, + "lr": 0.00042195804195804197, + "step": 4634, + "tokens_trained": 2.277342984 + }, + { + "epoch": 1.3148712857244167, + "grad_norm": 1.3030014038085938, + "loss": 3.958, + "lr": 0.0004216783216783217, + "step": 4636, + "tokens_trained": 2.278326696 + }, + { + "epoch": 1.3154386213743705, + "grad_norm": 2.9791958332061768, + "loss": 3.9412, + "lr": 0.0004213986013986014, + "step": 4638, + "tokens_trained": 2.27930736 + }, + { + "epoch": 1.3160059570243245, + "grad_norm": 4.533553600311279, + "loss": 3.9069, + "lr": 0.00042111888111888115, + "step": 4640, + "tokens_trained": 2.280293032 + }, + { + "epoch": 1.3165732926742784, + "grad_norm": 4.159526348114014, + "loss": 3.9262, + "lr": 0.00042083916083916084, + "step": 4642, + "tokens_trained": 2.281273472 + }, + { + "epoch": 1.3171406283242324, + "grad_norm": 2.847492218017578, + "loss": 3.9244, + "lr": 0.0004205594405594406, + "step": 4644, + "tokens_trained": 2.282257288 + }, + { + "epoch": 1.3177079639741862, + "grad_norm": 3.4552533626556396, + "loss": 3.9252, + "lr": 0.00042027972027972027, + "step": 4646, + "tokens_trained": 2.28324236 + }, + { + "epoch": 1.31827529962414, + "grad_norm": 1.4335713386535645, + "loss": 3.9075, + "lr": 0.00042, + "step": 4648, + "tokens_trained": 2.284224 + }, + { + "epoch": 1.318842635274094, + "grad_norm": 3.8727214336395264, + "loss": 3.8907, + "lr": 0.0004197202797202797, + "step": 4650, + "tokens_trained": 2.285205656 + }, + { + "epoch": 1.3194099709240479, + "grad_norm": 4.415209770202637, + "loss": 3.9138, + "lr": 0.0004194405594405594, + "step": 4652, + "tokens_trained": 2.286191744 + }, + { + "epoch": 1.319977306574002, + "grad_norm": 3.026095151901245, + "loss": 3.889, + "lr": 0.0004191608391608392, + "step": 4654, + "tokens_trained": 2.287174 + }, + { + "epoch": 1.3205446422239557, + "grad_norm": 3.9142091274261475, + "loss": 3.8506, + "lr": 0.0004188811188811189, + "step": 4656, + "tokens_trained": 2.288156824 + }, + { + "epoch": 1.3211119778739095, + "grad_norm": 5.409343719482422, + "loss": 3.9258, + "lr": 0.00041860139860139863, + "step": 4658, + "tokens_trained": 2.289136776 + }, + { + "epoch": 1.3216793135238636, + "grad_norm": 1.3607697486877441, + "loss": 3.9254, + "lr": 0.0004183216783216783, + "step": 4660, + "tokens_trained": 2.290117448 + }, + { + "epoch": 1.3222466491738174, + "grad_norm": 4.911555290222168, + "loss": 3.9406, + "lr": 0.00041804195804195807, + "step": 4662, + "tokens_trained": 2.291098904 + }, + { + "epoch": 1.3228139848237714, + "grad_norm": 5.282960891723633, + "loss": 3.9109, + "lr": 0.00041776223776223776, + "step": 4664, + "tokens_trained": 2.29208568 + }, + { + "epoch": 1.3233813204737253, + "grad_norm": 4.313295364379883, + "loss": 3.9077, + "lr": 0.0004174825174825175, + "step": 4666, + "tokens_trained": 2.293067688 + }, + { + "epoch": 1.323948656123679, + "grad_norm": 2.7871968746185303, + "loss": 3.9306, + "lr": 0.0004172027972027972, + "step": 4668, + "tokens_trained": 2.294050264 + }, + { + "epoch": 1.324515991773633, + "grad_norm": 2.481030225753784, + "loss": 3.9429, + "lr": 0.0004169230769230769, + "step": 4670, + "tokens_trained": 2.295031904 + }, + { + "epoch": 1.325083327423587, + "grad_norm": 5.044018268585205, + "loss": 3.8738, + "lr": 0.0004166433566433567, + "step": 4672, + "tokens_trained": 2.296011688 + }, + { + "epoch": 1.325650663073541, + "grad_norm": 6.23581075668335, + "loss": 3.9253, + "lr": 0.00041636363636363637, + "step": 4674, + "tokens_trained": 2.296996288 + }, + { + "epoch": 1.3262179987234948, + "grad_norm": 2.041799545288086, + "loss": 3.905, + "lr": 0.0004160839160839161, + "step": 4676, + "tokens_trained": 2.297978912 + }, + { + "epoch": 1.3267853343734486, + "grad_norm": 1.1758520603179932, + "loss": 3.8992, + "lr": 0.0004158041958041958, + "step": 4678, + "tokens_trained": 2.298963184 + }, + { + "epoch": 1.3273526700234026, + "grad_norm": 2.1230716705322266, + "loss": 3.9038, + "lr": 0.00041552447552447555, + "step": 4680, + "tokens_trained": 2.299946672 + }, + { + "epoch": 1.3279200056733564, + "grad_norm": 1.821915626525879, + "loss": 3.9239, + "lr": 0.00041524475524475524, + "step": 4682, + "tokens_trained": 2.300931632 + }, + { + "epoch": 1.3284873413233105, + "grad_norm": 0.7051568627357483, + "loss": 3.9281, + "lr": 0.000414965034965035, + "step": 4684, + "tokens_trained": 2.301910304 + }, + { + "epoch": 1.3290546769732643, + "grad_norm": 1.8326458930969238, + "loss": 3.9498, + "lr": 0.0004146853146853147, + "step": 4686, + "tokens_trained": 2.302891896 + }, + { + "epoch": 1.329622012623218, + "grad_norm": 1.4614375829696655, + "loss": 3.9342, + "lr": 0.00041440559440559437, + "step": 4688, + "tokens_trained": 2.30387572 + }, + { + "epoch": 1.3301893482731721, + "grad_norm": 1.6197412014007568, + "loss": 3.8507, + "lr": 0.00041412587412587417, + "step": 4690, + "tokens_trained": 2.304857152 + }, + { + "epoch": 1.330756683923126, + "grad_norm": 0.5570790767669678, + "loss": 3.9307, + "lr": 0.00041384615384615386, + "step": 4692, + "tokens_trained": 2.305841336 + }, + { + "epoch": 1.33132401957308, + "grad_norm": 1.6550084352493286, + "loss": 3.9237, + "lr": 0.0004135664335664336, + "step": 4694, + "tokens_trained": 2.30682708 + }, + { + "epoch": 1.3318913552230338, + "grad_norm": 1.334955096244812, + "loss": 3.917, + "lr": 0.0004132867132867133, + "step": 4696, + "tokens_trained": 2.307809136 + }, + { + "epoch": 1.3324586908729876, + "grad_norm": 3.4471423625946045, + "loss": 3.9231, + "lr": 0.00041300699300699304, + "step": 4698, + "tokens_trained": 2.308789496 + }, + { + "epoch": 1.3330260265229417, + "grad_norm": 4.426776885986328, + "loss": 3.8495, + "lr": 0.0004127272727272727, + "step": 4700, + "tokens_trained": 2.3097722 + }, + { + "epoch": 1.3335933621728955, + "grad_norm": 4.349783897399902, + "loss": 3.918, + "lr": 0.00041244755244755247, + "step": 4702, + "tokens_trained": 2.310748672 + }, + { + "epoch": 1.3341606978228495, + "grad_norm": 4.3204426765441895, + "loss": 3.8733, + "lr": 0.00041216783216783216, + "step": 4704, + "tokens_trained": 2.31172956 + }, + { + "epoch": 1.3347280334728033, + "grad_norm": 4.6586480140686035, + "loss": 3.9174, + "lr": 0.00041188811188811185, + "step": 4706, + "tokens_trained": 2.312715208 + }, + { + "epoch": 1.3352953691227571, + "grad_norm": 4.72362756729126, + "loss": 3.8998, + "lr": 0.00041160839160839165, + "step": 4708, + "tokens_trained": 2.313700896 + }, + { + "epoch": 1.3358627047727112, + "grad_norm": 4.0833516120910645, + "loss": 3.8726, + "lr": 0.00041132867132867134, + "step": 4710, + "tokens_trained": 2.314683176 + }, + { + "epoch": 1.336430040422665, + "grad_norm": 3.979100227355957, + "loss": 3.9482, + "lr": 0.0004110489510489511, + "step": 4712, + "tokens_trained": 2.315667504 + }, + { + "epoch": 1.336997376072619, + "grad_norm": 2.9478790760040283, + "loss": 3.8954, + "lr": 0.0004107692307692308, + "step": 4714, + "tokens_trained": 2.316647488 + }, + { + "epoch": 1.3375647117225729, + "grad_norm": 3.2437031269073486, + "loss": 3.8961, + "lr": 0.0004104895104895105, + "step": 4716, + "tokens_trained": 2.317629888 + }, + { + "epoch": 1.3381320473725267, + "grad_norm": 3.9469761848449707, + "loss": 3.916, + "lr": 0.0004102097902097902, + "step": 4718, + "tokens_trained": 2.318613016 + }, + { + "epoch": 1.3386993830224807, + "grad_norm": 4.271415710449219, + "loss": 3.928, + "lr": 0.0004099300699300699, + "step": 4720, + "tokens_trained": 2.319592112 + }, + { + "epoch": 1.3392667186724345, + "grad_norm": 2.656351327896118, + "loss": 3.8936, + "lr": 0.00040965034965034964, + "step": 4722, + "tokens_trained": 2.320572056 + }, + { + "epoch": 1.3398340543223886, + "grad_norm": 2.6717190742492676, + "loss": 3.8753, + "lr": 0.00040937062937062934, + "step": 4724, + "tokens_trained": 2.321557736 + }, + { + "epoch": 1.3404013899723424, + "grad_norm": 4.214351654052734, + "loss": 3.8905, + "lr": 0.00040909090909090913, + "step": 4726, + "tokens_trained": 2.322539008 + }, + { + "epoch": 1.3409687256222962, + "grad_norm": 4.417314052581787, + "loss": 3.9017, + "lr": 0.0004088111888111888, + "step": 4728, + "tokens_trained": 2.323524672 + }, + { + "epoch": 1.3415360612722502, + "grad_norm": 3.1664652824401855, + "loss": 3.9226, + "lr": 0.00040853146853146857, + "step": 4730, + "tokens_trained": 2.32450916 + }, + { + "epoch": 1.342103396922204, + "grad_norm": 2.39656662940979, + "loss": 3.91, + "lr": 0.00040825174825174826, + "step": 4732, + "tokens_trained": 2.325490472 + }, + { + "epoch": 1.342670732572158, + "grad_norm": 2.9324393272399902, + "loss": 3.8945, + "lr": 0.000407972027972028, + "step": 4734, + "tokens_trained": 2.326473872 + }, + { + "epoch": 1.343238068222112, + "grad_norm": 3.534731388092041, + "loss": 3.8557, + "lr": 0.0004076923076923077, + "step": 4736, + "tokens_trained": 2.327458424 + }, + { + "epoch": 1.3438054038720657, + "grad_norm": 2.3089616298675537, + "loss": 3.8957, + "lr": 0.0004074125874125874, + "step": 4738, + "tokens_trained": 2.328444432 + }, + { + "epoch": 1.3443727395220197, + "grad_norm": 3.3014519214630127, + "loss": 3.8746, + "lr": 0.00040713286713286713, + "step": 4740, + "tokens_trained": 2.32942976 + }, + { + "epoch": 1.3449400751719736, + "grad_norm": 5.408111572265625, + "loss": 3.9117, + "lr": 0.0004068531468531468, + "step": 4742, + "tokens_trained": 2.330411736 + }, + { + "epoch": 1.3455074108219276, + "grad_norm": 4.326341152191162, + "loss": 3.8331, + "lr": 0.0004065734265734266, + "step": 4744, + "tokens_trained": 2.3313928 + }, + { + "epoch": 1.3460747464718814, + "grad_norm": 3.9538161754608154, + "loss": 3.9216, + "lr": 0.0004062937062937063, + "step": 4746, + "tokens_trained": 2.332380728 + }, + { + "epoch": 1.3466420821218352, + "grad_norm": 2.4591166973114014, + "loss": 3.8795, + "lr": 0.00040601398601398605, + "step": 4748, + "tokens_trained": 2.333363448 + }, + { + "epoch": 1.3472094177717893, + "grad_norm": 3.2325263023376465, + "loss": 3.9277, + "lr": 0.00040573426573426574, + "step": 4750, + "tokens_trained": 2.334348496 + }, + { + "epoch": 1.3472094177717893, + "eval_loss": 0.9784421920776367, + "eval_runtime": 20.3876, + "step": 4750, + "tokens_trained": 2.334348496 + }, + { + "epoch": 1.347776753421743, + "grad_norm": 2.721426486968994, + "loss": 3.9264, + "lr": 0.0004054545454545455, + "step": 4752, + "tokens_trained": 2.335333088 + }, + { + "epoch": 1.3483440890716971, + "grad_norm": 1.0679550170898438, + "loss": 3.9515, + "lr": 0.0004051748251748252, + "step": 4754, + "tokens_trained": 2.33632024 + }, + { + "epoch": 1.348911424721651, + "grad_norm": 0.6162118911743164, + "loss": 3.8984, + "lr": 0.00040489510489510487, + "step": 4756, + "tokens_trained": 2.337303072 + }, + { + "epoch": 1.3494787603716047, + "grad_norm": 1.993177890777588, + "loss": 3.8845, + "lr": 0.0004046153846153846, + "step": 4758, + "tokens_trained": 2.338281984 + }, + { + "epoch": 1.3500460960215588, + "grad_norm": 2.5877304077148438, + "loss": 3.9123, + "lr": 0.0004043356643356643, + "step": 4760, + "tokens_trained": 2.339261224 + }, + { + "epoch": 1.3506134316715126, + "grad_norm": 6.708667755126953, + "loss": 3.9087, + "lr": 0.0004040559440559441, + "step": 4762, + "tokens_trained": 2.340241416 + }, + { + "epoch": 1.3511807673214666, + "grad_norm": 4.514158248901367, + "loss": 3.9094, + "lr": 0.0004037762237762238, + "step": 4764, + "tokens_trained": 2.341223048 + }, + { + "epoch": 1.3517481029714205, + "grad_norm": 2.3245937824249268, + "loss": 3.9466, + "lr": 0.00040349650349650354, + "step": 4766, + "tokens_trained": 2.34220588 + }, + { + "epoch": 1.3523154386213743, + "grad_norm": 4.14736795425415, + "loss": 3.8754, + "lr": 0.00040321678321678323, + "step": 4768, + "tokens_trained": 2.343188752 + }, + { + "epoch": 1.3528827742713283, + "grad_norm": 6.2871880531311035, + "loss": 3.8384, + "lr": 0.00040293706293706297, + "step": 4770, + "tokens_trained": 2.344169864 + }, + { + "epoch": 1.3534501099212821, + "grad_norm": 1.5958847999572754, + "loss": 3.8735, + "lr": 0.00040265734265734266, + "step": 4772, + "tokens_trained": 2.345153104 + }, + { + "epoch": 1.3540174455712362, + "grad_norm": 5.585666179656982, + "loss": 3.9073, + "lr": 0.00040237762237762235, + "step": 4774, + "tokens_trained": 2.346137 + }, + { + "epoch": 1.35458478122119, + "grad_norm": 3.8506343364715576, + "loss": 3.9298, + "lr": 0.0004020979020979021, + "step": 4776, + "tokens_trained": 2.347124712 + }, + { + "epoch": 1.3551521168711438, + "grad_norm": 4.7482757568359375, + "loss": 3.8957, + "lr": 0.0004018181818181818, + "step": 4778, + "tokens_trained": 2.348107992 + }, + { + "epoch": 1.3557194525210978, + "grad_norm": 1.6603455543518066, + "loss": 3.9345, + "lr": 0.00040153846153846153, + "step": 4780, + "tokens_trained": 2.349091872 + }, + { + "epoch": 1.3562867881710516, + "grad_norm": 0.40717223286628723, + "loss": 3.8988, + "lr": 0.0004012587412587413, + "step": 4782, + "tokens_trained": 2.350073024 + }, + { + "epoch": 1.3568541238210057, + "grad_norm": 1.7904951572418213, + "loss": 3.9163, + "lr": 0.000400979020979021, + "step": 4784, + "tokens_trained": 2.351057608 + }, + { + "epoch": 1.3574214594709595, + "grad_norm": 1.3750170469284058, + "loss": 3.9321, + "lr": 0.0004006993006993007, + "step": 4786, + "tokens_trained": 2.352039248 + }, + { + "epoch": 1.3579887951209133, + "grad_norm": 3.6613173484802246, + "loss": 3.9463, + "lr": 0.00040041958041958046, + "step": 4788, + "tokens_trained": 2.3530216 + }, + { + "epoch": 1.3585561307708673, + "grad_norm": 3.13639497756958, + "loss": 3.8653, + "lr": 0.00040013986013986015, + "step": 4790, + "tokens_trained": 2.353999496 + }, + { + "epoch": 1.3591234664208212, + "grad_norm": 3.408346176147461, + "loss": 3.946, + "lr": 0.00039986013986013984, + "step": 4792, + "tokens_trained": 2.354983984 + }, + { + "epoch": 1.3596908020707752, + "grad_norm": 4.422549247741699, + "loss": 3.9123, + "lr": 0.0003995804195804196, + "step": 4794, + "tokens_trained": 2.355968032 + }, + { + "epoch": 1.360258137720729, + "grad_norm": 2.9923927783966064, + "loss": 3.9502, + "lr": 0.00039930069930069927, + "step": 4796, + "tokens_trained": 2.356951192 + }, + { + "epoch": 1.3608254733706828, + "grad_norm": 1.1125166416168213, + "loss": 3.8638, + "lr": 0.000399020979020979, + "step": 4798, + "tokens_trained": 2.357930688 + }, + { + "epoch": 1.3613928090206369, + "grad_norm": 2.9915504455566406, + "loss": 3.9227, + "lr": 0.00039874125874125876, + "step": 4800, + "tokens_trained": 2.358909896 + }, + { + "epoch": 1.3619601446705907, + "grad_norm": 4.443681716918945, + "loss": 3.9206, + "lr": 0.0003984615384615385, + "step": 4802, + "tokens_trained": 2.35989144 + }, + { + "epoch": 1.3625274803205447, + "grad_norm": 5.246060848236084, + "loss": 3.8391, + "lr": 0.0003981818181818182, + "step": 4804, + "tokens_trained": 2.360878576 + }, + { + "epoch": 1.3630948159704985, + "grad_norm": 7.064333915710449, + "loss": 3.9045, + "lr": 0.00039790209790209794, + "step": 4806, + "tokens_trained": 2.361860808 + }, + { + "epoch": 1.3636621516204523, + "grad_norm": 2.9516990184783936, + "loss": 3.9038, + "lr": 0.00039762237762237763, + "step": 4808, + "tokens_trained": 2.362848272 + }, + { + "epoch": 1.3642294872704064, + "grad_norm": 7.830825328826904, + "loss": 3.9086, + "lr": 0.0003973426573426573, + "step": 4810, + "tokens_trained": 2.363829304 + }, + { + "epoch": 1.3647968229203602, + "grad_norm": 3.3761377334594727, + "loss": 3.8936, + "lr": 0.00039706293706293707, + "step": 4812, + "tokens_trained": 2.364814928 + }, + { + "epoch": 1.3653641585703142, + "grad_norm": 3.8069584369659424, + "loss": 3.8805, + "lr": 0.00039678321678321676, + "step": 4814, + "tokens_trained": 2.365793656 + }, + { + "epoch": 1.365931494220268, + "grad_norm": 5.233834743499756, + "loss": 3.8868, + "lr": 0.0003965034965034965, + "step": 4816, + "tokens_trained": 2.366781064 + }, + { + "epoch": 1.3664988298702219, + "grad_norm": 5.134295463562012, + "loss": 3.883, + "lr": 0.00039622377622377625, + "step": 4818, + "tokens_trained": 2.367763392 + }, + { + "epoch": 1.367066165520176, + "grad_norm": 1.2896602153778076, + "loss": 3.8733, + "lr": 0.000395944055944056, + "step": 4820, + "tokens_trained": 2.368744936 + }, + { + "epoch": 1.3676335011701297, + "grad_norm": 6.089853763580322, + "loss": 3.943, + "lr": 0.0003956643356643357, + "step": 4822, + "tokens_trained": 2.369729952 + }, + { + "epoch": 1.3682008368200838, + "grad_norm": 4.928650379180908, + "loss": 3.9151, + "lr": 0.0003953846153846154, + "step": 4824, + "tokens_trained": 2.370710944 + }, + { + "epoch": 1.3687681724700376, + "grad_norm": 4.412777423858643, + "loss": 3.9165, + "lr": 0.0003951048951048951, + "step": 4826, + "tokens_trained": 2.371693224 + }, + { + "epoch": 1.3693355081199914, + "grad_norm": 3.940869092941284, + "loss": 3.9457, + "lr": 0.0003948251748251748, + "step": 4828, + "tokens_trained": 2.372672808 + }, + { + "epoch": 1.3699028437699454, + "grad_norm": 4.23148775100708, + "loss": 3.8893, + "lr": 0.00039454545454545455, + "step": 4830, + "tokens_trained": 2.373659816 + }, + { + "epoch": 1.3704701794198992, + "grad_norm": 2.781536817550659, + "loss": 3.8649, + "lr": 0.00039426573426573424, + "step": 4832, + "tokens_trained": 2.374645648 + }, + { + "epoch": 1.3710375150698533, + "grad_norm": 1.7263449430465698, + "loss": 3.9142, + "lr": 0.000393986013986014, + "step": 4834, + "tokens_trained": 2.375627952 + }, + { + "epoch": 1.371604850719807, + "grad_norm": 7.530355453491211, + "loss": 3.9146, + "lr": 0.00039370629370629373, + "step": 4836, + "tokens_trained": 2.376608568 + }, + { + "epoch": 1.372172186369761, + "grad_norm": 5.03418493270874, + "loss": 3.8995, + "lr": 0.0003934265734265735, + "step": 4838, + "tokens_trained": 2.377594368 + }, + { + "epoch": 1.372739522019715, + "grad_norm": 3.9235804080963135, + "loss": 3.8575, + "lr": 0.00039314685314685316, + "step": 4840, + "tokens_trained": 2.37857584 + }, + { + "epoch": 1.3733068576696688, + "grad_norm": 4.762357234954834, + "loss": 3.9044, + "lr": 0.00039286713286713286, + "step": 4842, + "tokens_trained": 2.37956052 + }, + { + "epoch": 1.3738741933196228, + "grad_norm": 4.108587741851807, + "loss": 3.8829, + "lr": 0.0003925874125874126, + "step": 4844, + "tokens_trained": 2.380537584 + }, + { + "epoch": 1.3744415289695766, + "grad_norm": 2.686072826385498, + "loss": 3.8575, + "lr": 0.0003923076923076923, + "step": 4846, + "tokens_trained": 2.381523144 + }, + { + "epoch": 1.3750088646195304, + "grad_norm": 3.9192161560058594, + "loss": 3.8674, + "lr": 0.00039202797202797203, + "step": 4848, + "tokens_trained": 2.382504752 + }, + { + "epoch": 1.3755762002694845, + "grad_norm": 3.2957770824432373, + "loss": 3.8897, + "lr": 0.0003917482517482517, + "step": 4850, + "tokens_trained": 2.383486968 + }, + { + "epoch": 1.3761435359194383, + "grad_norm": 3.0208771228790283, + "loss": 3.8923, + "lr": 0.00039146853146853147, + "step": 4852, + "tokens_trained": 2.384467168 + }, + { + "epoch": 1.3767108715693923, + "grad_norm": 1.4386385679244995, + "loss": 3.8489, + "lr": 0.0003911888111888112, + "step": 4854, + "tokens_trained": 2.385446096 + }, + { + "epoch": 1.3772782072193461, + "grad_norm": 2.494499444961548, + "loss": 3.8857, + "lr": 0.00039090909090909096, + "step": 4856, + "tokens_trained": 2.386428088 + }, + { + "epoch": 1.3778455428693, + "grad_norm": 2.573397397994995, + "loss": 3.8716, + "lr": 0.00039062937062937065, + "step": 4858, + "tokens_trained": 2.387410288 + }, + { + "epoch": 1.378412878519254, + "grad_norm": 2.8497166633605957, + "loss": 3.9172, + "lr": 0.00039034965034965034, + "step": 4860, + "tokens_trained": 2.388389632 + }, + { + "epoch": 1.3789802141692078, + "grad_norm": 1.3268458843231201, + "loss": 3.8724, + "lr": 0.0003900699300699301, + "step": 4862, + "tokens_trained": 2.389375248 + }, + { + "epoch": 1.3795475498191618, + "grad_norm": 2.5455031394958496, + "loss": 3.9061, + "lr": 0.0003897902097902098, + "step": 4864, + "tokens_trained": 2.390357104 + }, + { + "epoch": 1.3801148854691156, + "grad_norm": 2.6307923793792725, + "loss": 3.894, + "lr": 0.0003895104895104895, + "step": 4866, + "tokens_trained": 2.391334728 + }, + { + "epoch": 1.3806822211190695, + "grad_norm": 2.4805266857147217, + "loss": 3.848, + "lr": 0.0003892307692307692, + "step": 4868, + "tokens_trained": 2.392315672 + }, + { + "epoch": 1.3812495567690235, + "grad_norm": 2.6160788536071777, + "loss": 3.9057, + "lr": 0.00038895104895104895, + "step": 4870, + "tokens_trained": 2.393302152 + }, + { + "epoch": 1.3818168924189773, + "grad_norm": 1.4398711919784546, + "loss": 3.8609, + "lr": 0.0003886713286713287, + "step": 4872, + "tokens_trained": 2.394281808 + }, + { + "epoch": 1.3823842280689314, + "grad_norm": 2.4663705825805664, + "loss": 3.9316, + "lr": 0.00038839160839160844, + "step": 4874, + "tokens_trained": 2.395263216 + }, + { + "epoch": 1.3826678958939083, + "eval_loss": 0.9779003858566284, + "eval_runtime": 20.7864, + "step": 4875, + "tokens_trained": 2.395753536 + }, + { + "epoch": 1.3829515637188852, + "grad_norm": 2.455738067626953, + "loss": 3.8944, + "lr": 0.00038811188811188813, + "step": 4876, + "tokens_trained": 2.396244232 + }, + { + "epoch": 1.383518899368839, + "grad_norm": 4.183098793029785, + "loss": 3.9096, + "lr": 0.0003878321678321678, + "step": 4878, + "tokens_trained": 2.39722784 + }, + { + "epoch": 1.384086235018793, + "grad_norm": 6.7180495262146, + "loss": 3.8762, + "lr": 0.00038755244755244757, + "step": 4880, + "tokens_trained": 2.39820884 + }, + { + "epoch": 1.3846535706687468, + "grad_norm": 1.702336072921753, + "loss": 3.8616, + "lr": 0.00038727272727272726, + "step": 4882, + "tokens_trained": 2.39919008 + }, + { + "epoch": 1.3852209063187009, + "grad_norm": 10.165470123291016, + "loss": 3.9159, + "lr": 0.000386993006993007, + "step": 4884, + "tokens_trained": 2.400176128 + }, + { + "epoch": 1.3857882419686547, + "grad_norm": 4.2575297355651855, + "loss": 3.9513, + "lr": 0.0003867132867132867, + "step": 4886, + "tokens_trained": 2.401159616 + }, + { + "epoch": 1.3863555776186085, + "grad_norm": 4.321669578552246, + "loss": 3.8929, + "lr": 0.00038643356643356644, + "step": 4888, + "tokens_trained": 2.40214392 + }, + { + "epoch": 1.3869229132685625, + "grad_norm": 4.289078235626221, + "loss": 3.8378, + "lr": 0.0003861538461538462, + "step": 4890, + "tokens_trained": 2.403127904 + }, + { + "epoch": 1.3874902489185164, + "grad_norm": 6.578473091125488, + "loss": 3.8603, + "lr": 0.00038587412587412593, + "step": 4892, + "tokens_trained": 2.404109496 + }, + { + "epoch": 1.3880575845684704, + "grad_norm": 4.092262268066406, + "loss": 3.9049, + "lr": 0.0003855944055944056, + "step": 4894, + "tokens_trained": 2.405092064 + }, + { + "epoch": 1.3886249202184242, + "grad_norm": 3.304581642150879, + "loss": 3.8862, + "lr": 0.0003853146853146853, + "step": 4896, + "tokens_trained": 2.406074136 + }, + { + "epoch": 1.389192255868378, + "grad_norm": 3.7834372520446777, + "loss": 3.916, + "lr": 0.00038503496503496505, + "step": 4898, + "tokens_trained": 2.407054056 + }, + { + "epoch": 1.389759591518332, + "grad_norm": 3.30719256401062, + "loss": 3.9162, + "lr": 0.00038475524475524474, + "step": 4900, + "tokens_trained": 2.408035 + }, + { + "epoch": 1.3903269271682859, + "grad_norm": 2.2104077339172363, + "loss": 3.9094, + "lr": 0.0003844755244755245, + "step": 4902, + "tokens_trained": 2.409019864 + }, + { + "epoch": 1.39089426281824, + "grad_norm": 3.2836616039276123, + "loss": 3.8586, + "lr": 0.0003841958041958042, + "step": 4904, + "tokens_trained": 2.410002576 + }, + { + "epoch": 1.3914615984681937, + "grad_norm": 2.468010187149048, + "loss": 3.8655, + "lr": 0.0003839160839160839, + "step": 4906, + "tokens_trained": 2.410983384 + }, + { + "epoch": 1.3920289341181475, + "grad_norm": 2.7495617866516113, + "loss": 3.8934, + "lr": 0.0003836363636363636, + "step": 4908, + "tokens_trained": 2.411967688 + }, + { + "epoch": 1.3925962697681016, + "grad_norm": 2.61542010307312, + "loss": 3.8928, + "lr": 0.0003833566433566434, + "step": 4910, + "tokens_trained": 2.412946648 + }, + { + "epoch": 1.3931636054180554, + "grad_norm": 3.393087148666382, + "loss": 3.9396, + "lr": 0.0003830769230769231, + "step": 4912, + "tokens_trained": 2.413930608 + }, + { + "epoch": 1.3937309410680094, + "grad_norm": 2.1915347576141357, + "loss": 3.8685, + "lr": 0.0003827972027972028, + "step": 4914, + "tokens_trained": 2.414910456 + }, + { + "epoch": 1.3942982767179632, + "grad_norm": 1.2087231874465942, + "loss": 3.9201, + "lr": 0.00038251748251748254, + "step": 4916, + "tokens_trained": 2.41588864 + }, + { + "epoch": 1.394865612367917, + "grad_norm": 2.1861963272094727, + "loss": 3.8936, + "lr": 0.0003822377622377622, + "step": 4918, + "tokens_trained": 2.416869936 + }, + { + "epoch": 1.395432948017871, + "grad_norm": 2.2949490547180176, + "loss": 3.8818, + "lr": 0.00038195804195804197, + "step": 4920, + "tokens_trained": 2.417855472 + }, + { + "epoch": 1.396000283667825, + "grad_norm": 2.027250289916992, + "loss": 3.8758, + "lr": 0.00038167832167832166, + "step": 4922, + "tokens_trained": 2.418839744 + }, + { + "epoch": 1.396567619317779, + "grad_norm": 4.480210304260254, + "loss": 3.8769, + "lr": 0.0003813986013986014, + "step": 4924, + "tokens_trained": 2.4198266 + }, + { + "epoch": 1.3971349549677328, + "grad_norm": 2.718602180480957, + "loss": 3.9309, + "lr": 0.0003811188811188811, + "step": 4926, + "tokens_trained": 2.420809488 + }, + { + "epoch": 1.3977022906176866, + "grad_norm": 3.022064447402954, + "loss": 3.8953, + "lr": 0.0003808391608391609, + "step": 4928, + "tokens_trained": 2.421795792 + }, + { + "epoch": 1.3982696262676406, + "grad_norm": 3.6465160846710205, + "loss": 3.901, + "lr": 0.0003805594405594406, + "step": 4930, + "tokens_trained": 2.422778216 + }, + { + "epoch": 1.3988369619175944, + "grad_norm": 2.549898386001587, + "loss": 3.8641, + "lr": 0.0003802797202797203, + "step": 4932, + "tokens_trained": 2.423761104 + }, + { + "epoch": 1.3994042975675485, + "grad_norm": 2.1666665077209473, + "loss": 3.9211, + "lr": 0.00038, + "step": 4934, + "tokens_trained": 2.424746424 + }, + { + "epoch": 1.3999716332175023, + "grad_norm": 5.31266450881958, + "loss": 3.8729, + "lr": 0.0003797202797202797, + "step": 4936, + "tokens_trained": 2.425730296 + }, + { + "epoch": 1.400538968867456, + "grad_norm": 3.2631475925445557, + "loss": 3.8741, + "lr": 0.00037944055944055946, + "step": 4938, + "tokens_trained": 2.426711856 + }, + { + "epoch": 1.4011063045174101, + "grad_norm": 2.7507376670837402, + "loss": 3.8877, + "lr": 0.00037916083916083915, + "step": 4940, + "tokens_trained": 2.427695064 + }, + { + "epoch": 1.401673640167364, + "grad_norm": 2.361859083175659, + "loss": 3.8937, + "lr": 0.0003788811188811189, + "step": 4942, + "tokens_trained": 2.428680184 + }, + { + "epoch": 1.402240975817318, + "grad_norm": 3.007972240447998, + "loss": 3.8591, + "lr": 0.0003786013986013986, + "step": 4944, + "tokens_trained": 2.429668312 + }, + { + "epoch": 1.4028083114672718, + "grad_norm": 3.033128261566162, + "loss": 3.9293, + "lr": 0.0003783216783216784, + "step": 4946, + "tokens_trained": 2.430652248 + }, + { + "epoch": 1.4033756471172256, + "grad_norm": 1.0569933652877808, + "loss": 3.9047, + "lr": 0.00037804195804195807, + "step": 4948, + "tokens_trained": 2.431634048 + }, + { + "epoch": 1.4039429827671797, + "grad_norm": 1.1776299476623535, + "loss": 3.8985, + "lr": 0.00037776223776223776, + "step": 4950, + "tokens_trained": 2.432615856 + }, + { + "epoch": 1.4045103184171335, + "grad_norm": 2.139624834060669, + "loss": 3.8648, + "lr": 0.0003774825174825175, + "step": 4952, + "tokens_trained": 2.433598912 + }, + { + "epoch": 1.4050776540670875, + "grad_norm": 3.9667930603027344, + "loss": 3.9196, + "lr": 0.0003772027972027972, + "step": 4954, + "tokens_trained": 2.434583464 + }, + { + "epoch": 1.4056449897170413, + "grad_norm": 3.4130353927612305, + "loss": 3.873, + "lr": 0.00037692307692307694, + "step": 4956, + "tokens_trained": 2.435562696 + }, + { + "epoch": 1.4062123253669951, + "grad_norm": 2.91157603263855, + "loss": 3.8901, + "lr": 0.00037664335664335663, + "step": 4958, + "tokens_trained": 2.436544192 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 2.038764715194702, + "loss": 3.8951, + "lr": 0.0003763636363636364, + "step": 4960, + "tokens_trained": 2.43752728 + }, + { + "epoch": 1.407346996666903, + "grad_norm": 2.2672388553619385, + "loss": 3.8573, + "lr": 0.00037608391608391607, + "step": 4962, + "tokens_trained": 2.438511552 + }, + { + "epoch": 1.407914332316857, + "grad_norm": 2.4656710624694824, + "loss": 3.8819, + "lr": 0.0003758041958041958, + "step": 4964, + "tokens_trained": 2.4394974 + }, + { + "epoch": 1.4084816679668108, + "grad_norm": 2.4732837677001953, + "loss": 3.8761, + "lr": 0.00037552447552447555, + "step": 4966, + "tokens_trained": 2.440477752 + }, + { + "epoch": 1.4090490036167647, + "grad_norm": 4.646571636199951, + "loss": 3.8883, + "lr": 0.00037524475524475524, + "step": 4968, + "tokens_trained": 2.441464112 + }, + { + "epoch": 1.4096163392667187, + "grad_norm": 3.602743625640869, + "loss": 3.9305, + "lr": 0.000374965034965035, + "step": 4970, + "tokens_trained": 2.442441072 + }, + { + "epoch": 1.4101836749166725, + "grad_norm": 2.1577095985412598, + "loss": 3.8883, + "lr": 0.0003746853146853147, + "step": 4972, + "tokens_trained": 2.443425168 + }, + { + "epoch": 1.4107510105666266, + "grad_norm": 2.35933256149292, + "loss": 3.9124, + "lr": 0.0003744055944055944, + "step": 4974, + "tokens_trained": 2.444408448 + }, + { + "epoch": 1.4113183462165804, + "grad_norm": 7.452941417694092, + "loss": 3.8741, + "lr": 0.0003741258741258741, + "step": 4976, + "tokens_trained": 2.445390888 + }, + { + "epoch": 1.4118856818665342, + "grad_norm": 4.788355827331543, + "loss": 3.8915, + "lr": 0.00037384615384615386, + "step": 4978, + "tokens_trained": 2.44637136 + }, + { + "epoch": 1.4124530175164882, + "grad_norm": 2.7765729427337646, + "loss": 3.8878, + "lr": 0.00037356643356643355, + "step": 4980, + "tokens_trained": 2.447355568 + }, + { + "epoch": 1.413020353166442, + "grad_norm": 3.324477195739746, + "loss": 3.8546, + "lr": 0.0003732867132867133, + "step": 4982, + "tokens_trained": 2.448338952 + }, + { + "epoch": 1.413587688816396, + "grad_norm": 1.5179075002670288, + "loss": 3.9019, + "lr": 0.00037300699300699304, + "step": 4984, + "tokens_trained": 2.449324032 + }, + { + "epoch": 1.4141550244663499, + "grad_norm": 4.929554462432861, + "loss": 3.8773, + "lr": 0.00037272727272727273, + "step": 4986, + "tokens_trained": 2.450307912 + }, + { + "epoch": 1.4147223601163037, + "grad_norm": 4.763064384460449, + "loss": 3.9035, + "lr": 0.0003724475524475525, + "step": 4988, + "tokens_trained": 2.451293168 + }, + { + "epoch": 1.4152896957662577, + "grad_norm": 2.141029119491577, + "loss": 3.9224, + "lr": 0.00037216783216783216, + "step": 4990, + "tokens_trained": 2.452276952 + }, + { + "epoch": 1.4158570314162116, + "grad_norm": 3.93829607963562, + "loss": 3.8889, + "lr": 0.0003718881118881119, + "step": 4992, + "tokens_trained": 2.453264688 + }, + { + "epoch": 1.4164243670661656, + "grad_norm": 3.691845178604126, + "loss": 3.884, + "lr": 0.0003716083916083916, + "step": 4994, + "tokens_trained": 2.454252408 + }, + { + "epoch": 1.4169917027161194, + "grad_norm": 1.6449168920516968, + "loss": 3.8893, + "lr": 0.00037132867132867134, + "step": 4996, + "tokens_trained": 2.4552316 + }, + { + "epoch": 1.4175590383660732, + "grad_norm": 3.0063729286193848, + "loss": 3.8786, + "lr": 0.00037104895104895103, + "step": 4998, + "tokens_trained": 2.456215176 + }, + { + "epoch": 1.4181263740160273, + "grad_norm": 4.001911163330078, + "loss": 3.8797, + "lr": 0.0003707692307692308, + "step": 5000, + "tokens_trained": 2.4571994 + }, + { + "epoch": 1.4181263740160273, + "eval_loss": 0.9744628667831421, + "eval_runtime": 20.7577, + "step": 5000, + "tokens_trained": 2.4571994 + }, + { + "epoch": 1.418693709665981, + "grad_norm": 2.209125518798828, + "loss": 3.8578, + "lr": 0.0003704895104895105, + "step": 5002, + "tokens_trained": 2.458180112 + }, + { + "epoch": 1.419261045315935, + "grad_norm": 3.4210712909698486, + "loss": 3.8956, + "lr": 0.0003702097902097902, + "step": 5004, + "tokens_trained": 2.459162136 + }, + { + "epoch": 1.419828380965889, + "grad_norm": 3.204285144805908, + "loss": 3.8689, + "lr": 0.00036993006993006996, + "step": 5006, + "tokens_trained": 2.460144776 + }, + { + "epoch": 1.4203957166158427, + "grad_norm": 2.6957204341888428, + "loss": 3.9176, + "lr": 0.00036965034965034965, + "step": 5008, + "tokens_trained": 2.461124056 + }, + { + "epoch": 1.4209630522657968, + "grad_norm": 3.2292940616607666, + "loss": 3.8843, + "lr": 0.0003693706293706294, + "step": 5010, + "tokens_trained": 2.462105048 + }, + { + "epoch": 1.4215303879157506, + "grad_norm": 3.2393546104431152, + "loss": 3.9098, + "lr": 0.0003690909090909091, + "step": 5012, + "tokens_trained": 2.46308476 + }, + { + "epoch": 1.4220977235657046, + "grad_norm": 4.3664774894714355, + "loss": 3.8755, + "lr": 0.00036881118881118883, + "step": 5014, + "tokens_trained": 2.464068376 + }, + { + "epoch": 1.4226650592156584, + "grad_norm": 3.5531437397003174, + "loss": 3.9183, + "lr": 0.0003685314685314685, + "step": 5016, + "tokens_trained": 2.465050144 + }, + { + "epoch": 1.4232323948656123, + "grad_norm": 2.292147636413574, + "loss": 3.9113, + "lr": 0.00036825174825174826, + "step": 5018, + "tokens_trained": 2.466032864 + }, + { + "epoch": 1.4237997305155663, + "grad_norm": 3.2202541828155518, + "loss": 3.9005, + "lr": 0.000367972027972028, + "step": 5020, + "tokens_trained": 2.467016672 + }, + { + "epoch": 1.4243670661655201, + "grad_norm": 2.978670835494995, + "loss": 3.8717, + "lr": 0.0003676923076923077, + "step": 5022, + "tokens_trained": 2.468001272 + }, + { + "epoch": 1.4249344018154741, + "grad_norm": 1.9841945171356201, + "loss": 3.8642, + "lr": 0.00036741258741258744, + "step": 5024, + "tokens_trained": 2.468983928 + }, + { + "epoch": 1.425501737465428, + "grad_norm": 3.14475417137146, + "loss": 3.8952, + "lr": 0.00036713286713286713, + "step": 5026, + "tokens_trained": 2.469965368 + }, + { + "epoch": 1.4260690731153818, + "grad_norm": 2.5225462913513184, + "loss": 3.8684, + "lr": 0.0003668531468531469, + "step": 5028, + "tokens_trained": 2.470944904 + }, + { + "epoch": 1.4266364087653358, + "grad_norm": 2.162013053894043, + "loss": 3.8492, + "lr": 0.00036657342657342657, + "step": 5030, + "tokens_trained": 2.471925792 + }, + { + "epoch": 1.4272037444152896, + "grad_norm": 3.798084020614624, + "loss": 3.8492, + "lr": 0.0003662937062937063, + "step": 5032, + "tokens_trained": 2.4729112 + }, + { + "epoch": 1.4277710800652437, + "grad_norm": 3.125767707824707, + "loss": 3.8675, + "lr": 0.000366013986013986, + "step": 5034, + "tokens_trained": 2.473893608 + }, + { + "epoch": 1.4283384157151975, + "grad_norm": 2.3426859378814697, + "loss": 3.829, + "lr": 0.0003657342657342657, + "step": 5036, + "tokens_trained": 2.474873112 + }, + { + "epoch": 1.4289057513651513, + "grad_norm": 3.2585058212280273, + "loss": 3.8476, + "lr": 0.0003654545454545455, + "step": 5038, + "tokens_trained": 2.475857392 + }, + { + "epoch": 1.4294730870151053, + "grad_norm": 2.814438581466675, + "loss": 3.8467, + "lr": 0.0003651748251748252, + "step": 5040, + "tokens_trained": 2.476838664 + }, + { + "epoch": 1.4300404226650592, + "grad_norm": 1.8864086866378784, + "loss": 3.8652, + "lr": 0.0003648951048951049, + "step": 5042, + "tokens_trained": 2.47782168 + }, + { + "epoch": 1.4306077583150132, + "grad_norm": 1.7076116800308228, + "loss": 3.855, + "lr": 0.0003646153846153846, + "step": 5044, + "tokens_trained": 2.478803744 + }, + { + "epoch": 1.431175093964967, + "grad_norm": 2.2379872798919678, + "loss": 3.8486, + "lr": 0.00036433566433566436, + "step": 5046, + "tokens_trained": 2.479785216 + }, + { + "epoch": 1.4317424296149208, + "grad_norm": 2.4551987648010254, + "loss": 3.8613, + "lr": 0.00036405594405594405, + "step": 5048, + "tokens_trained": 2.480762576 + }, + { + "epoch": 1.4323097652648749, + "grad_norm": 1.9165434837341309, + "loss": 3.8691, + "lr": 0.00036377622377622374, + "step": 5050, + "tokens_trained": 2.48174416 + }, + { + "epoch": 1.4328771009148287, + "grad_norm": 3.355273723602295, + "loss": 3.9335, + "lr": 0.0003634965034965035, + "step": 5052, + "tokens_trained": 2.482725264 + }, + { + "epoch": 1.4334444365647827, + "grad_norm": 2.419801712036133, + "loss": 3.8996, + "lr": 0.0003632167832167832, + "step": 5054, + "tokens_trained": 2.483708016 + }, + { + "epoch": 1.4340117722147365, + "grad_norm": 0.953630268573761, + "loss": 3.8797, + "lr": 0.000362937062937063, + "step": 5056, + "tokens_trained": 2.484690616 + }, + { + "epoch": 1.4345791078646903, + "grad_norm": 2.454457998275757, + "loss": 3.9272, + "lr": 0.00036265734265734267, + "step": 5058, + "tokens_trained": 2.485672144 + }, + { + "epoch": 1.4351464435146444, + "grad_norm": 2.6845757961273193, + "loss": 3.8732, + "lr": 0.0003623776223776224, + "step": 5060, + "tokens_trained": 2.486659072 + }, + { + "epoch": 1.4357137791645982, + "grad_norm": 1.8361189365386963, + "loss": 3.8963, + "lr": 0.0003620979020979021, + "step": 5062, + "tokens_trained": 2.487640824 + }, + { + "epoch": 1.4362811148145522, + "grad_norm": 2.003408432006836, + "loss": 3.8925, + "lr": 0.00036181818181818185, + "step": 5064, + "tokens_trained": 2.488623064 + }, + { + "epoch": 1.436848450464506, + "grad_norm": 2.320922374725342, + "loss": 3.8839, + "lr": 0.00036153846153846154, + "step": 5066, + "tokens_trained": 2.489605856 + }, + { + "epoch": 1.4374157861144599, + "grad_norm": 3.1108357906341553, + "loss": 3.8711, + "lr": 0.0003612587412587412, + "step": 5068, + "tokens_trained": 2.490593592 + }, + { + "epoch": 1.437983121764414, + "grad_norm": 4.1830267906188965, + "loss": 3.8753, + "lr": 0.00036097902097902097, + "step": 5070, + "tokens_trained": 2.491577552 + }, + { + "epoch": 1.4385504574143677, + "grad_norm": 4.149252414703369, + "loss": 3.9214, + "lr": 0.00036069930069930066, + "step": 5072, + "tokens_trained": 2.492563048 + }, + { + "epoch": 1.4391177930643217, + "grad_norm": 2.50063419342041, + "loss": 3.9325, + "lr": 0.00036041958041958046, + "step": 5074, + "tokens_trained": 2.493544432 + }, + { + "epoch": 1.4396851287142756, + "grad_norm": 3.926102638244629, + "loss": 3.9229, + "lr": 0.00036013986013986015, + "step": 5076, + "tokens_trained": 2.494525176 + }, + { + "epoch": 1.4402524643642294, + "grad_norm": 2.9965932369232178, + "loss": 3.8654, + "lr": 0.0003598601398601399, + "step": 5078, + "tokens_trained": 2.495506904 + }, + { + "epoch": 1.4408198000141834, + "grad_norm": 3.242460250854492, + "loss": 3.8657, + "lr": 0.0003595804195804196, + "step": 5080, + "tokens_trained": 2.496486016 + }, + { + "epoch": 1.4413871356641372, + "grad_norm": 4.620968341827393, + "loss": 3.9037, + "lr": 0.00035930069930069933, + "step": 5082, + "tokens_trained": 2.497472288 + }, + { + "epoch": 1.4419544713140913, + "grad_norm": 4.284809112548828, + "loss": 3.8864, + "lr": 0.000359020979020979, + "step": 5084, + "tokens_trained": 2.49845476 + }, + { + "epoch": 1.442521806964045, + "grad_norm": 3.115851640701294, + "loss": 3.9297, + "lr": 0.0003587412587412587, + "step": 5086, + "tokens_trained": 2.499438056 + }, + { + "epoch": 1.443089142613999, + "grad_norm": 3.842564105987549, + "loss": 3.8401, + "lr": 0.00035846153846153846, + "step": 5088, + "tokens_trained": 2.500420768 + }, + { + "epoch": 1.443656478263953, + "grad_norm": 3.615903615951538, + "loss": 3.869, + "lr": 0.00035818181818181815, + "step": 5090, + "tokens_trained": 2.50140196 + }, + { + "epoch": 1.4442238139139068, + "grad_norm": 3.166294574737549, + "loss": 3.858, + "lr": 0.00035790209790209794, + "step": 5092, + "tokens_trained": 2.502388264 + }, + { + "epoch": 1.4447911495638608, + "grad_norm": 2.21025013923645, + "loss": 3.867, + "lr": 0.00035762237762237763, + "step": 5094, + "tokens_trained": 2.503375728 + }, + { + "epoch": 1.4453584852138146, + "grad_norm": 3.1004698276519775, + "loss": 3.8808, + "lr": 0.0003573426573426574, + "step": 5096, + "tokens_trained": 2.504358936 + }, + { + "epoch": 1.4459258208637684, + "grad_norm": 1.524992823600769, + "loss": 3.8603, + "lr": 0.00035706293706293707, + "step": 5098, + "tokens_trained": 2.505342432 + }, + { + "epoch": 1.4464931565137225, + "grad_norm": 2.9289309978485107, + "loss": 3.8623, + "lr": 0.0003567832167832168, + "step": 5100, + "tokens_trained": 2.50632448 + }, + { + "epoch": 1.4470604921636763, + "grad_norm": 1.872747540473938, + "loss": 3.873, + "lr": 0.0003565034965034965, + "step": 5102, + "tokens_trained": 2.507303672 + }, + { + "epoch": 1.4476278278136303, + "grad_norm": 5.076520919799805, + "loss": 3.8882, + "lr": 0.0003562237762237762, + "step": 5104, + "tokens_trained": 2.508294896 + }, + { + "epoch": 1.4481951634635841, + "grad_norm": 3.738583564758301, + "loss": 3.8517, + "lr": 0.00035594405594405594, + "step": 5106, + "tokens_trained": 2.50927976 + }, + { + "epoch": 1.448762499113538, + "grad_norm": 4.042014122009277, + "loss": 3.8544, + "lr": 0.00035566433566433563, + "step": 5108, + "tokens_trained": 2.510263368 + }, + { + "epoch": 1.449329834763492, + "grad_norm": 4.474701881408691, + "loss": 3.9099, + "lr": 0.00035538461538461543, + "step": 5110, + "tokens_trained": 2.511249408 + }, + { + "epoch": 1.4498971704134458, + "grad_norm": 2.5567240715026855, + "loss": 3.9099, + "lr": 0.0003551048951048951, + "step": 5112, + "tokens_trained": 2.51222996 + }, + { + "epoch": 1.4504645060633998, + "grad_norm": 1.9672293663024902, + "loss": 3.8462, + "lr": 0.00035482517482517486, + "step": 5114, + "tokens_trained": 2.513214328 + }, + { + "epoch": 1.4510318417133536, + "grad_norm": 3.023873805999756, + "loss": 3.8937, + "lr": 0.00035454545454545455, + "step": 5116, + "tokens_trained": 2.51419652 + }, + { + "epoch": 1.4515991773633075, + "grad_norm": 2.4916296005249023, + "loss": 3.8856, + "lr": 0.0003542657342657343, + "step": 5118, + "tokens_trained": 2.515177864 + }, + { + "epoch": 1.4521665130132615, + "grad_norm": 2.6898279190063477, + "loss": 3.8899, + "lr": 0.000353986013986014, + "step": 5120, + "tokens_trained": 2.516162416 + }, + { + "epoch": 1.4527338486632153, + "grad_norm": 2.552603244781494, + "loss": 3.9042, + "lr": 0.0003537062937062937, + "step": 5122, + "tokens_trained": 2.517143872 + }, + { + "epoch": 1.4533011843131693, + "grad_norm": 2.877371072769165, + "loss": 3.9297, + "lr": 0.0003534265734265734, + "step": 5124, + "tokens_trained": 2.518128736 + }, + { + "epoch": 1.4535848521381463, + "eval_loss": 0.97332763671875, + "eval_runtime": 20.9544, + "step": 5125, + "tokens_trained": 2.518622672 + }, + { + "epoch": 1.4538685199631232, + "grad_norm": 3.5342681407928467, + "loss": 3.9252, + "lr": 0.0003531468531468531, + "step": 5126, + "tokens_trained": 2.51911752 + }, + { + "epoch": 1.454435855613077, + "grad_norm": 3.628720998764038, + "loss": 3.859, + "lr": 0.0003528671328671329, + "step": 5128, + "tokens_trained": 2.520098472 + }, + { + "epoch": 1.455003191263031, + "grad_norm": 6.547962188720703, + "loss": 3.8856, + "lr": 0.0003525874125874126, + "step": 5130, + "tokens_trained": 2.521079136 + }, + { + "epoch": 1.4555705269129848, + "grad_norm": 2.413188934326172, + "loss": 3.8697, + "lr": 0.00035230769230769235, + "step": 5132, + "tokens_trained": 2.522058328 + }, + { + "epoch": 1.4561378625629389, + "grad_norm": 3.4512171745300293, + "loss": 3.8708, + "lr": 0.00035202797202797204, + "step": 5134, + "tokens_trained": 2.523038472 + }, + { + "epoch": 1.4567051982128927, + "grad_norm": 3.700793504714966, + "loss": 3.8337, + "lr": 0.0003517482517482518, + "step": 5136, + "tokens_trained": 2.524023792 + }, + { + "epoch": 1.4572725338628465, + "grad_norm": 3.7885332107543945, + "loss": 3.9458, + "lr": 0.0003514685314685315, + "step": 5138, + "tokens_trained": 2.525009728 + }, + { + "epoch": 1.4578398695128005, + "grad_norm": 2.7266547679901123, + "loss": 3.9023, + "lr": 0.00035118881118881116, + "step": 5140, + "tokens_trained": 2.525989656 + }, + { + "epoch": 1.4584072051627543, + "grad_norm": 3.19142746925354, + "loss": 3.8541, + "lr": 0.0003509090909090909, + "step": 5142, + "tokens_trained": 2.526971216 + }, + { + "epoch": 1.4589745408127084, + "grad_norm": 4.478598117828369, + "loss": 3.8717, + "lr": 0.0003506293706293706, + "step": 5144, + "tokens_trained": 2.527954072 + }, + { + "epoch": 1.4595418764626622, + "grad_norm": 2.4593617916107178, + "loss": 3.8894, + "lr": 0.0003503496503496504, + "step": 5146, + "tokens_trained": 2.528939184 + }, + { + "epoch": 1.460109212112616, + "grad_norm": 3.3200669288635254, + "loss": 3.8385, + "lr": 0.0003500699300699301, + "step": 5148, + "tokens_trained": 2.529926776 + }, + { + "epoch": 1.46067654776257, + "grad_norm": 4.14384651184082, + "loss": 3.8771, + "lr": 0.00034979020979020983, + "step": 5150, + "tokens_trained": 2.530912672 + }, + { + "epoch": 1.4612438834125239, + "grad_norm": 4.013224124908447, + "loss": 3.8845, + "lr": 0.0003495104895104895, + "step": 5152, + "tokens_trained": 2.531895672 + }, + { + "epoch": 1.461811219062478, + "grad_norm": 2.421576976776123, + "loss": 3.8511, + "lr": 0.00034923076923076927, + "step": 5154, + "tokens_trained": 2.53288024 + }, + { + "epoch": 1.4623785547124317, + "grad_norm": 2.5835623741149902, + "loss": 3.8596, + "lr": 0.00034895104895104896, + "step": 5156, + "tokens_trained": 2.533866024 + }, + { + "epoch": 1.4629458903623855, + "grad_norm": 4.254941940307617, + "loss": 3.8578, + "lr": 0.00034867132867132865, + "step": 5158, + "tokens_trained": 2.534849704 + }, + { + "epoch": 1.4635132260123396, + "grad_norm": 5.818271160125732, + "loss": 3.8577, + "lr": 0.0003483916083916084, + "step": 5160, + "tokens_trained": 2.535833776 + }, + { + "epoch": 1.4640805616622934, + "grad_norm": 0.8015483021736145, + "loss": 3.8521, + "lr": 0.0003481118881118881, + "step": 5162, + "tokens_trained": 2.536811552 + }, + { + "epoch": 1.4646478973122474, + "grad_norm": 2.905026435852051, + "loss": 3.9294, + "lr": 0.0003478321678321678, + "step": 5164, + "tokens_trained": 2.537794976 + }, + { + "epoch": 1.4652152329622012, + "grad_norm": 4.075428009033203, + "loss": 3.8707, + "lr": 0.00034755244755244757, + "step": 5166, + "tokens_trained": 2.538778856 + }, + { + "epoch": 1.465782568612155, + "grad_norm": 3.1994779109954834, + "loss": 3.8997, + "lr": 0.0003472727272727273, + "step": 5168, + "tokens_trained": 2.539766424 + }, + { + "epoch": 1.466349904262109, + "grad_norm": 3.8348865509033203, + "loss": 3.8407, + "lr": 0.000346993006993007, + "step": 5170, + "tokens_trained": 2.54074992 + }, + { + "epoch": 1.466917239912063, + "grad_norm": 3.057394504547119, + "loss": 3.8996, + "lr": 0.00034671328671328675, + "step": 5172, + "tokens_trained": 2.54173296 + }, + { + "epoch": 1.467484575562017, + "grad_norm": 2.629530668258667, + "loss": 3.8695, + "lr": 0.00034643356643356644, + "step": 5174, + "tokens_trained": 2.542712656 + }, + { + "epoch": 1.4680519112119708, + "grad_norm": 6.596874237060547, + "loss": 3.8993, + "lr": 0.00034615384615384613, + "step": 5176, + "tokens_trained": 2.543697248 + }, + { + "epoch": 1.4686192468619246, + "grad_norm": 6.877425670623779, + "loss": 3.8664, + "lr": 0.0003458741258741259, + "step": 5178, + "tokens_trained": 2.544676296 + }, + { + "epoch": 1.4691865825118786, + "grad_norm": 4.752718448638916, + "loss": 3.8747, + "lr": 0.00034559440559440557, + "step": 5180, + "tokens_trained": 2.54566048 + }, + { + "epoch": 1.4697539181618324, + "grad_norm": 6.17790412902832, + "loss": 3.8618, + "lr": 0.0003453146853146853, + "step": 5182, + "tokens_trained": 2.5466462 + }, + { + "epoch": 1.4703212538117865, + "grad_norm": 4.023257732391357, + "loss": 3.9337, + "lr": 0.00034503496503496506, + "step": 5184, + "tokens_trained": 2.547626248 + }, + { + "epoch": 1.4708885894617403, + "grad_norm": 5.393856048583984, + "loss": 3.9071, + "lr": 0.0003447552447552448, + "step": 5186, + "tokens_trained": 2.548609344 + }, + { + "epoch": 1.471455925111694, + "grad_norm": 3.888399124145508, + "loss": 3.8781, + "lr": 0.0003444755244755245, + "step": 5188, + "tokens_trained": 2.549590128 + }, + { + "epoch": 1.4720232607616481, + "grad_norm": 2.120105504989624, + "loss": 3.8423, + "lr": 0.0003441958041958042, + "step": 5190, + "tokens_trained": 2.550575896 + }, + { + "epoch": 1.472590596411602, + "grad_norm": 2.569045305252075, + "loss": 3.9112, + "lr": 0.0003439160839160839, + "step": 5192, + "tokens_trained": 2.551560416 + }, + { + "epoch": 1.473157932061556, + "grad_norm": 3.4651668071746826, + "loss": 3.9087, + "lr": 0.0003436363636363636, + "step": 5194, + "tokens_trained": 2.552542536 + }, + { + "epoch": 1.4737252677115098, + "grad_norm": 2.7434427738189697, + "loss": 3.8748, + "lr": 0.00034335664335664336, + "step": 5196, + "tokens_trained": 2.553529144 + }, + { + "epoch": 1.4742926033614636, + "grad_norm": 2.8238751888275146, + "loss": 3.883, + "lr": 0.00034307692307692305, + "step": 5198, + "tokens_trained": 2.554512624 + }, + { + "epoch": 1.4748599390114177, + "grad_norm": 2.6443698406219482, + "loss": 3.8933, + "lr": 0.0003427972027972028, + "step": 5200, + "tokens_trained": 2.555493584 + }, + { + "epoch": 1.4754272746613715, + "grad_norm": 3.0539839267730713, + "loss": 3.8446, + "lr": 0.00034251748251748254, + "step": 5202, + "tokens_trained": 2.556470992 + }, + { + "epoch": 1.4759946103113255, + "grad_norm": 2.7458343505859375, + "loss": 3.8937, + "lr": 0.0003422377622377623, + "step": 5204, + "tokens_trained": 2.557456112 + }, + { + "epoch": 1.4765619459612793, + "grad_norm": 2.1506590843200684, + "loss": 3.8197, + "lr": 0.000341958041958042, + "step": 5206, + "tokens_trained": 2.558440904 + }, + { + "epoch": 1.4771292816112331, + "grad_norm": 3.0944714546203613, + "loss": 3.8825, + "lr": 0.00034167832167832167, + "step": 5208, + "tokens_trained": 2.559422448 + }, + { + "epoch": 1.4776966172611872, + "grad_norm": 4.058701038360596, + "loss": 3.8541, + "lr": 0.0003413986013986014, + "step": 5210, + "tokens_trained": 2.560408408 + }, + { + "epoch": 1.478263952911141, + "grad_norm": 3.390343189239502, + "loss": 3.8573, + "lr": 0.0003411188811188811, + "step": 5212, + "tokens_trained": 2.561387848 + }, + { + "epoch": 1.478831288561095, + "grad_norm": 1.3469499349594116, + "loss": 3.8657, + "lr": 0.00034083916083916084, + "step": 5214, + "tokens_trained": 2.562369448 + }, + { + "epoch": 1.4793986242110488, + "grad_norm": 2.6407840251922607, + "loss": 3.8439, + "lr": 0.00034055944055944054, + "step": 5216, + "tokens_trained": 2.563352728 + }, + { + "epoch": 1.4799659598610027, + "grad_norm": 4.6244378089904785, + "loss": 3.8524, + "lr": 0.0003402797202797203, + "step": 5218, + "tokens_trained": 2.56433732 + }, + { + "epoch": 1.4805332955109567, + "grad_norm": 3.53739333152771, + "loss": 3.8383, + "lr": 0.00034, + "step": 5220, + "tokens_trained": 2.565318712 + }, + { + "epoch": 1.4811006311609105, + "grad_norm": 1.2742515802383423, + "loss": 3.8461, + "lr": 0.00033972027972027977, + "step": 5222, + "tokens_trained": 2.566305728 + }, + { + "epoch": 1.4816679668108645, + "grad_norm": 2.308912754058838, + "loss": 3.9021, + "lr": 0.00033944055944055946, + "step": 5224, + "tokens_trained": 2.567289704 + }, + { + "epoch": 1.4822353024608184, + "grad_norm": 4.492687225341797, + "loss": 3.942, + "lr": 0.00033916083916083915, + "step": 5226, + "tokens_trained": 2.568271784 + }, + { + "epoch": 1.4828026381107722, + "grad_norm": 3.4822142124176025, + "loss": 3.8815, + "lr": 0.0003388811188811189, + "step": 5228, + "tokens_trained": 2.569255992 + }, + { + "epoch": 1.4833699737607262, + "grad_norm": 2.584545612335205, + "loss": 3.8663, + "lr": 0.0003386013986013986, + "step": 5230, + "tokens_trained": 2.570237432 + }, + { + "epoch": 1.48393730941068, + "grad_norm": 1.3823322057724, + "loss": 3.8608, + "lr": 0.00033832167832167833, + "step": 5232, + "tokens_trained": 2.571214488 + }, + { + "epoch": 1.484504645060634, + "grad_norm": 0.8751009702682495, + "loss": 3.8788, + "lr": 0.000338041958041958, + "step": 5234, + "tokens_trained": 2.572196464 + }, + { + "epoch": 1.4850719807105879, + "grad_norm": 0.723051905632019, + "loss": 3.8176, + "lr": 0.00033776223776223776, + "step": 5236, + "tokens_trained": 2.5731788 + }, + { + "epoch": 1.4856393163605417, + "grad_norm": 1.073199987411499, + "loss": 3.8142, + "lr": 0.0003374825174825175, + "step": 5238, + "tokens_trained": 2.5741574 + }, + { + "epoch": 1.4862066520104957, + "grad_norm": 1.4350844621658325, + "loss": 3.8865, + "lr": 0.00033720279720279725, + "step": 5240, + "tokens_trained": 2.57514396 + }, + { + "epoch": 1.4867739876604495, + "grad_norm": 2.656418561935425, + "loss": 3.8975, + "lr": 0.00033692307692307694, + "step": 5242, + "tokens_trained": 2.576123752 + }, + { + "epoch": 1.4873413233104036, + "grad_norm": 2.0948193073272705, + "loss": 3.8543, + "lr": 0.00033664335664335663, + "step": 5244, + "tokens_trained": 2.577109968 + }, + { + "epoch": 1.4879086589603574, + "grad_norm": 2.3233394622802734, + "loss": 3.9017, + "lr": 0.0003363636363636364, + "step": 5246, + "tokens_trained": 2.578092128 + }, + { + "epoch": 1.4884759946103112, + "grad_norm": 2.3845908641815186, + "loss": 3.8993, + "lr": 0.00033608391608391607, + "step": 5248, + "tokens_trained": 2.579077664 + }, + { + "epoch": 1.4890433302602653, + "grad_norm": 1.0513813495635986, + "loss": 3.8597, + "lr": 0.0003358041958041958, + "step": 5250, + "tokens_trained": 2.580059224 + }, + { + "epoch": 1.4890433302602653, + "eval_loss": 0.9717268943786621, + "eval_runtime": 20.2853, + "step": 5250, + "tokens_trained": 2.580059224 + }, + { + "epoch": 1.489610665910219, + "grad_norm": 1.56052827835083, + "loss": 3.8768, + "lr": 0.0003355244755244755, + "step": 5252, + "tokens_trained": 2.581038592 + }, + { + "epoch": 1.490178001560173, + "grad_norm": 3.654672384262085, + "loss": 3.8987, + "lr": 0.00033524475524475525, + "step": 5254, + "tokens_trained": 2.58201948 + }, + { + "epoch": 1.490745337210127, + "grad_norm": 2.942765474319458, + "loss": 3.9019, + "lr": 0.000334965034965035, + "step": 5256, + "tokens_trained": 2.582999032 + }, + { + "epoch": 1.4913126728600807, + "grad_norm": 2.78128719329834, + "loss": 3.858, + "lr": 0.00033468531468531474, + "step": 5258, + "tokens_trained": 2.583980504 + }, + { + "epoch": 1.4918800085100348, + "grad_norm": 2.8371148109436035, + "loss": 3.9144, + "lr": 0.00033440559440559443, + "step": 5260, + "tokens_trained": 2.584964024 + }, + { + "epoch": 1.4924473441599886, + "grad_norm": 3.362802743911743, + "loss": 3.9014, + "lr": 0.0003341258741258741, + "step": 5262, + "tokens_trained": 2.585946728 + }, + { + "epoch": 1.4930146798099426, + "grad_norm": 2.9014971256256104, + "loss": 3.88, + "lr": 0.00033384615384615386, + "step": 5264, + "tokens_trained": 2.586928936 + }, + { + "epoch": 1.4935820154598964, + "grad_norm": 4.144679546356201, + "loss": 3.8206, + "lr": 0.00033356643356643355, + "step": 5266, + "tokens_trained": 2.587912456 + }, + { + "epoch": 1.4941493511098503, + "grad_norm": 2.4919822216033936, + "loss": 3.8968, + "lr": 0.0003332867132867133, + "step": 5268, + "tokens_trained": 2.58889736 + }, + { + "epoch": 1.4947166867598043, + "grad_norm": 4.618200778961182, + "loss": 3.8869, + "lr": 0.000333006993006993, + "step": 5270, + "tokens_trained": 2.58988292 + }, + { + "epoch": 1.495284022409758, + "grad_norm": 2.53562068939209, + "loss": 3.8692, + "lr": 0.00033272727272727273, + "step": 5272, + "tokens_trained": 2.590868032 + }, + { + "epoch": 1.4958513580597121, + "grad_norm": 2.9674575328826904, + "loss": 3.8982, + "lr": 0.0003324475524475525, + "step": 5274, + "tokens_trained": 2.591849336 + }, + { + "epoch": 1.496418693709666, + "grad_norm": 3.1153666973114014, + "loss": 3.8812, + "lr": 0.0003321678321678322, + "step": 5276, + "tokens_trained": 2.592831392 + }, + { + "epoch": 1.4969860293596198, + "grad_norm": 1.1431063413619995, + "loss": 3.8505, + "lr": 0.0003318881118881119, + "step": 5278, + "tokens_trained": 2.593812872 + }, + { + "epoch": 1.4975533650095738, + "grad_norm": 1.3542804718017578, + "loss": 3.7819, + "lr": 0.0003316083916083916, + "step": 5280, + "tokens_trained": 2.594791464 + }, + { + "epoch": 1.4981207006595276, + "grad_norm": 2.3613617420196533, + "loss": 3.8637, + "lr": 0.00033132867132867135, + "step": 5282, + "tokens_trained": 2.595773248 + }, + { + "epoch": 1.4986880363094817, + "grad_norm": 3.757537364959717, + "loss": 3.8808, + "lr": 0.00033104895104895104, + "step": 5284, + "tokens_trained": 2.596756584 + }, + { + "epoch": 1.4992553719594355, + "grad_norm": 2.953866481781006, + "loss": 3.9131, + "lr": 0.0003307692307692308, + "step": 5286, + "tokens_trained": 2.59773796 + }, + { + "epoch": 1.4998227076093893, + "grad_norm": 2.655200958251953, + "loss": 3.8643, + "lr": 0.00033048951048951047, + "step": 5288, + "tokens_trained": 2.598719536 + }, + { + "epoch": 1.5003900432593433, + "grad_norm": 2.889563798904419, + "loss": 3.9746, + "lr": 0.0003302097902097902, + "step": 5290, + "tokens_trained": 2.599697288 + }, + { + "epoch": 1.5009573789092974, + "grad_norm": 2.8182990550994873, + "loss": 3.8618, + "lr": 0.0003299300699300699, + "step": 5292, + "tokens_trained": 2.600680032 + }, + { + "epoch": 1.5015247145592512, + "grad_norm": 3.309007406234741, + "loss": 3.9308, + "lr": 0.0003296503496503497, + "step": 5294, + "tokens_trained": 2.601664832 + }, + { + "epoch": 1.502092050209205, + "grad_norm": 2.542564868927002, + "loss": 3.8754, + "lr": 0.0003293706293706294, + "step": 5296, + "tokens_trained": 2.602645048 + }, + { + "epoch": 1.5026593858591588, + "grad_norm": 1.6452852487564087, + "loss": 3.8644, + "lr": 0.0003290909090909091, + "step": 5298, + "tokens_trained": 2.603625752 + }, + { + "epoch": 1.5032267215091129, + "grad_norm": 2.0452191829681396, + "loss": 3.8651, + "lr": 0.00032881118881118883, + "step": 5300, + "tokens_trained": 2.604608136 + }, + { + "epoch": 1.5037940571590669, + "grad_norm": 3.8787152767181396, + "loss": 3.897, + "lr": 0.0003285314685314685, + "step": 5302, + "tokens_trained": 2.605591744 + }, + { + "epoch": 1.5043613928090207, + "grad_norm": 2.729074478149414, + "loss": 3.8237, + "lr": 0.00032825174825174827, + "step": 5304, + "tokens_trained": 2.606578392 + }, + { + "epoch": 1.5049287284589745, + "grad_norm": 4.348790168762207, + "loss": 3.8972, + "lr": 0.00032797202797202796, + "step": 5306, + "tokens_trained": 2.607560696 + }, + { + "epoch": 1.5054960641089283, + "grad_norm": 3.7172658443450928, + "loss": 3.9184, + "lr": 0.0003276923076923077, + "step": 5308, + "tokens_trained": 2.608540552 + }, + { + "epoch": 1.5060633997588824, + "grad_norm": 0.9424030780792236, + "loss": 3.9053, + "lr": 0.0003274125874125874, + "step": 5310, + "tokens_trained": 2.609521736 + }, + { + "epoch": 1.5066307354088364, + "grad_norm": 1.4858821630477905, + "loss": 3.9223, + "lr": 0.00032713286713286714, + "step": 5312, + "tokens_trained": 2.610500288 + }, + { + "epoch": 1.5071980710587902, + "grad_norm": 1.154492974281311, + "loss": 3.8776, + "lr": 0.0003268531468531469, + "step": 5314, + "tokens_trained": 2.611484024 + }, + { + "epoch": 1.507765406708744, + "grad_norm": 2.853030204772949, + "loss": 3.839, + "lr": 0.00032657342657342657, + "step": 5316, + "tokens_trained": 2.61246684 + }, + { + "epoch": 1.5083327423586979, + "grad_norm": 5.903510570526123, + "loss": 3.9016, + "lr": 0.0003262937062937063, + "step": 5318, + "tokens_trained": 2.61344732 + }, + { + "epoch": 1.508900078008652, + "grad_norm": 4.1008453369140625, + "loss": 3.8961, + "lr": 0.000326013986013986, + "step": 5320, + "tokens_trained": 2.614426112 + }, + { + "epoch": 1.509467413658606, + "grad_norm": 1.814429759979248, + "loss": 3.8708, + "lr": 0.00032573426573426575, + "step": 5322, + "tokens_trained": 2.615404984 + }, + { + "epoch": 1.5100347493085597, + "grad_norm": 7.854028224945068, + "loss": 3.8936, + "lr": 0.00032545454545454544, + "step": 5324, + "tokens_trained": 2.61638448 + }, + { + "epoch": 1.5106020849585136, + "grad_norm": 8.18005084991455, + "loss": 3.8511, + "lr": 0.0003251748251748252, + "step": 5326, + "tokens_trained": 2.617367336 + }, + { + "epoch": 1.5111694206084674, + "grad_norm": 6.5862135887146, + "loss": 3.8394, + "lr": 0.0003248951048951049, + "step": 5328, + "tokens_trained": 2.618350632 + }, + { + "epoch": 1.5117367562584214, + "grad_norm": 5.746713638305664, + "loss": 3.9074, + "lr": 0.0003246153846153846, + "step": 5330, + "tokens_trained": 2.619333 + }, + { + "epoch": 1.5123040919083754, + "grad_norm": 3.554576873779297, + "loss": 3.8755, + "lr": 0.00032433566433566436, + "step": 5332, + "tokens_trained": 2.62031072 + }, + { + "epoch": 1.5128714275583293, + "grad_norm": 3.7476911544799805, + "loss": 3.8923, + "lr": 0.00032405594405594406, + "step": 5334, + "tokens_trained": 2.621290952 + }, + { + "epoch": 1.513438763208283, + "grad_norm": 3.5004961490631104, + "loss": 3.8579, + "lr": 0.0003237762237762238, + "step": 5336, + "tokens_trained": 2.622267496 + }, + { + "epoch": 1.514006098858237, + "grad_norm": 2.527608633041382, + "loss": 3.8617, + "lr": 0.0003234965034965035, + "step": 5338, + "tokens_trained": 2.623250648 + }, + { + "epoch": 1.514573434508191, + "grad_norm": 1.698697805404663, + "loss": 3.8735, + "lr": 0.00032321678321678323, + "step": 5340, + "tokens_trained": 2.624232432 + }, + { + "epoch": 1.515140770158145, + "grad_norm": 1.567301630973816, + "loss": 3.8696, + "lr": 0.0003229370629370629, + "step": 5342, + "tokens_trained": 2.625214496 + }, + { + "epoch": 1.5157081058080988, + "grad_norm": 1.1091945171356201, + "loss": 3.9026, + "lr": 0.00032265734265734267, + "step": 5344, + "tokens_trained": 2.626195168 + }, + { + "epoch": 1.5162754414580526, + "grad_norm": 2.308842420578003, + "loss": 3.9068, + "lr": 0.00032237762237762236, + "step": 5346, + "tokens_trained": 2.627178136 + }, + { + "epoch": 1.5168427771080064, + "grad_norm": 1.895664930343628, + "loss": 3.8927, + "lr": 0.0003220979020979021, + "step": 5348, + "tokens_trained": 2.628160768 + }, + { + "epoch": 1.5174101127579604, + "grad_norm": 3.357377529144287, + "loss": 3.9102, + "lr": 0.00032181818181818185, + "step": 5350, + "tokens_trained": 2.629144816 + }, + { + "epoch": 1.5179774484079145, + "grad_norm": 3.45583176612854, + "loss": 3.8342, + "lr": 0.00032153846153846154, + "step": 5352, + "tokens_trained": 2.630123376 + }, + { + "epoch": 1.5185447840578683, + "grad_norm": 2.129251718521118, + "loss": 3.8917, + "lr": 0.0003212587412587413, + "step": 5354, + "tokens_trained": 2.631108104 + }, + { + "epoch": 1.5191121197078221, + "grad_norm": 3.7762246131896973, + "loss": 3.9044, + "lr": 0.000320979020979021, + "step": 5356, + "tokens_trained": 2.632086912 + }, + { + "epoch": 1.519679455357776, + "grad_norm": 3.4620509147644043, + "loss": 3.8829, + "lr": 0.0003206993006993007, + "step": 5358, + "tokens_trained": 2.633066048 + }, + { + "epoch": 1.52024679100773, + "grad_norm": 2.9604990482330322, + "loss": 3.9058, + "lr": 0.0003204195804195804, + "step": 5360, + "tokens_trained": 2.634048104 + }, + { + "epoch": 1.520814126657684, + "grad_norm": 2.3409082889556885, + "loss": 3.8871, + "lr": 0.00032013986013986015, + "step": 5362, + "tokens_trained": 2.635033888 + }, + { + "epoch": 1.5213814623076378, + "grad_norm": 2.3598854541778564, + "loss": 3.8278, + "lr": 0.00031986013986013984, + "step": 5364, + "tokens_trained": 2.636016304 + }, + { + "epoch": 1.5219487979575916, + "grad_norm": 2.3019731044769287, + "loss": 3.8662, + "lr": 0.0003195804195804196, + "step": 5366, + "tokens_trained": 2.637003712 + }, + { + "epoch": 1.5225161336075455, + "grad_norm": 1.2325515747070312, + "loss": 3.871, + "lr": 0.00031930069930069933, + "step": 5368, + "tokens_trained": 2.637982504 + }, + { + "epoch": 1.5230834692574995, + "grad_norm": 0.7675896883010864, + "loss": 3.8765, + "lr": 0.000319020979020979, + "step": 5370, + "tokens_trained": 2.638965288 + }, + { + "epoch": 1.5236508049074535, + "grad_norm": 1.4549137353897095, + "loss": 3.8722, + "lr": 0.00031874125874125877, + "step": 5372, + "tokens_trained": 2.63994804 + }, + { + "epoch": 1.5242181405574073, + "grad_norm": 3.515141248703003, + "loss": 3.8776, + "lr": 0.00031846153846153846, + "step": 5374, + "tokens_trained": 2.640932128 + }, + { + "epoch": 1.5245018083823842, + "eval_loss": 0.9721737504005432, + "eval_runtime": 20.8279, + "step": 5375, + "tokens_trained": 2.641423648 + }, + { + "epoch": 1.5247854762073612, + "grad_norm": 6.560733318328857, + "loss": 3.9073, + "lr": 0.0003181818181818182, + "step": 5376, + "tokens_trained": 2.641915928 + }, + { + "epoch": 1.525352811857315, + "grad_norm": 1.6997367143630981, + "loss": 3.8291, + "lr": 0.0003179020979020979, + "step": 5378, + "tokens_trained": 2.642900016 + }, + { + "epoch": 1.525920147507269, + "grad_norm": 9.629950523376465, + "loss": 3.8959, + "lr": 0.00031762237762237764, + "step": 5380, + "tokens_trained": 2.64388104 + }, + { + "epoch": 1.526487483157223, + "grad_norm": 3.9039199352264404, + "loss": 3.8823, + "lr": 0.00031734265734265733, + "step": 5382, + "tokens_trained": 2.644863936 + }, + { + "epoch": 1.5270548188071769, + "grad_norm": 3.925534963607788, + "loss": 3.8869, + "lr": 0.0003170629370629371, + "step": 5384, + "tokens_trained": 2.645846304 + }, + { + "epoch": 1.5276221544571307, + "grad_norm": 3.528144121170044, + "loss": 3.8808, + "lr": 0.0003167832167832168, + "step": 5386, + "tokens_trained": 2.646833528 + }, + { + "epoch": 1.5281894901070845, + "grad_norm": 4.170341968536377, + "loss": 3.9114, + "lr": 0.0003165034965034965, + "step": 5388, + "tokens_trained": 2.647815024 + }, + { + "epoch": 1.5287568257570385, + "grad_norm": 1.840369462966919, + "loss": 3.8531, + "lr": 0.00031622377622377625, + "step": 5390, + "tokens_trained": 2.64879892 + }, + { + "epoch": 1.5293241614069926, + "grad_norm": 3.2327773571014404, + "loss": 3.9133, + "lr": 0.00031594405594405594, + "step": 5392, + "tokens_trained": 2.649780992 + }, + { + "epoch": 1.5298914970569464, + "grad_norm": 4.462336540222168, + "loss": 3.8588, + "lr": 0.0003156643356643357, + "step": 5394, + "tokens_trained": 2.650768296 + }, + { + "epoch": 1.5304588327069002, + "grad_norm": 4.678606033325195, + "loss": 3.9386, + "lr": 0.0003153846153846154, + "step": 5396, + "tokens_trained": 2.651752624 + }, + { + "epoch": 1.531026168356854, + "grad_norm": 1.7649297714233398, + "loss": 3.8813, + "lr": 0.00031510489510489507, + "step": 5398, + "tokens_trained": 2.652734752 + }, + { + "epoch": 1.531593504006808, + "grad_norm": 5.314251899719238, + "loss": 3.857, + "lr": 0.0003148251748251748, + "step": 5400, + "tokens_trained": 2.653716744 + }, + { + "epoch": 1.532160839656762, + "grad_norm": 3.9521164894104004, + "loss": 3.8795, + "lr": 0.00031454545454545456, + "step": 5402, + "tokens_trained": 2.654698216 + }, + { + "epoch": 1.532728175306716, + "grad_norm": 2.3679206371307373, + "loss": 3.8809, + "lr": 0.0003142657342657343, + "step": 5404, + "tokens_trained": 2.655684352 + }, + { + "epoch": 1.5332955109566697, + "grad_norm": 5.0761871337890625, + "loss": 3.8539, + "lr": 0.000313986013986014, + "step": 5406, + "tokens_trained": 2.656668872 + }, + { + "epoch": 1.5338628466066235, + "grad_norm": 3.036986827850342, + "loss": 3.8724, + "lr": 0.00031370629370629374, + "step": 5408, + "tokens_trained": 2.657649648 + }, + { + "epoch": 1.5344301822565776, + "grad_norm": 1.9492992162704468, + "loss": 3.8559, + "lr": 0.00031342657342657343, + "step": 5410, + "tokens_trained": 2.65863172 + }, + { + "epoch": 1.5349975179065316, + "grad_norm": 5.674772262573242, + "loss": 3.9009, + "lr": 0.00031314685314685317, + "step": 5412, + "tokens_trained": 2.659614912 + }, + { + "epoch": 1.5355648535564854, + "grad_norm": 3.045802116394043, + "loss": 3.8898, + "lr": 0.00031286713286713286, + "step": 5414, + "tokens_trained": 2.660596088 + }, + { + "epoch": 1.5361321892064392, + "grad_norm": 2.8371381759643555, + "loss": 3.8409, + "lr": 0.00031258741258741255, + "step": 5416, + "tokens_trained": 2.661580752 + }, + { + "epoch": 1.536699524856393, + "grad_norm": 3.7679245471954346, + "loss": 3.8803, + "lr": 0.0003123076923076923, + "step": 5418, + "tokens_trained": 2.662563832 + }, + { + "epoch": 1.537266860506347, + "grad_norm": 3.2771692276000977, + "loss": 3.8699, + "lr": 0.000312027972027972, + "step": 5420, + "tokens_trained": 2.663544624 + }, + { + "epoch": 1.5378341961563011, + "grad_norm": 2.7474050521850586, + "loss": 3.899, + "lr": 0.0003117482517482518, + "step": 5422, + "tokens_trained": 2.664520368 + }, + { + "epoch": 1.538401531806255, + "grad_norm": 3.284118890762329, + "loss": 3.8437, + "lr": 0.0003114685314685315, + "step": 5424, + "tokens_trained": 2.665499312 + }, + { + "epoch": 1.5389688674562088, + "grad_norm": 2.7903459072113037, + "loss": 3.8609, + "lr": 0.0003111888111888112, + "step": 5426, + "tokens_trained": 2.666482416 + }, + { + "epoch": 1.5395362031061626, + "grad_norm": 3.876206398010254, + "loss": 3.876, + "lr": 0.0003109090909090909, + "step": 5428, + "tokens_trained": 2.667467008 + }, + { + "epoch": 1.5401035387561166, + "grad_norm": 1.5711065530776978, + "loss": 3.9029, + "lr": 0.00031062937062937066, + "step": 5430, + "tokens_trained": 2.668449184 + }, + { + "epoch": 1.5406708744060706, + "grad_norm": 1.2520103454589844, + "loss": 3.8487, + "lr": 0.00031034965034965035, + "step": 5432, + "tokens_trained": 2.669431912 + }, + { + "epoch": 1.5412382100560245, + "grad_norm": 0.9419916272163391, + "loss": 3.8721, + "lr": 0.00031006993006993004, + "step": 5434, + "tokens_trained": 2.670416424 + }, + { + "epoch": 1.5418055457059783, + "grad_norm": 1.9234577417373657, + "loss": 3.8729, + "lr": 0.0003097902097902098, + "step": 5436, + "tokens_trained": 2.671400888 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 3.8806726932525635, + "loss": 3.8551, + "lr": 0.00030951048951048947, + "step": 5438, + "tokens_trained": 2.672375848 + }, + { + "epoch": 1.5429402170058861, + "grad_norm": 3.5235371589660645, + "loss": 3.8662, + "lr": 0.00030923076923076927, + "step": 5440, + "tokens_trained": 2.673355968 + }, + { + "epoch": 1.5435075526558402, + "grad_norm": 2.4708411693573, + "loss": 3.844, + "lr": 0.00030895104895104896, + "step": 5442, + "tokens_trained": 2.674337336 + }, + { + "epoch": 1.544074888305794, + "grad_norm": 2.014948606491089, + "loss": 3.8766, + "lr": 0.0003086713286713287, + "step": 5444, + "tokens_trained": 2.67532064 + }, + { + "epoch": 1.5446422239557478, + "grad_norm": 2.5892093181610107, + "loss": 3.858, + "lr": 0.0003083916083916084, + "step": 5446, + "tokens_trained": 2.676301352 + }, + { + "epoch": 1.5452095596057016, + "grad_norm": 3.082036018371582, + "loss": 3.8663, + "lr": 0.00030811188811188814, + "step": 5448, + "tokens_trained": 2.67728328 + }, + { + "epoch": 1.5457768952556556, + "grad_norm": 3.072131395339966, + "loss": 3.8562, + "lr": 0.00030783216783216783, + "step": 5450, + "tokens_trained": 2.678268 + }, + { + "epoch": 1.5463442309056097, + "grad_norm": 2.331498384475708, + "loss": 3.874, + "lr": 0.0003075524475524475, + "step": 5452, + "tokens_trained": 2.6792526 + }, + { + "epoch": 1.5469115665555635, + "grad_norm": 4.706553936004639, + "loss": 3.889, + "lr": 0.00030727272727272727, + "step": 5454, + "tokens_trained": 2.680234128 + }, + { + "epoch": 1.5474789022055173, + "grad_norm": 4.815377712249756, + "loss": 3.8797, + "lr": 0.00030699300699300696, + "step": 5456, + "tokens_trained": 2.68121644 + }, + { + "epoch": 1.5480462378554711, + "grad_norm": 4.225409507751465, + "loss": 3.8561, + "lr": 0.00030671328671328675, + "step": 5458, + "tokens_trained": 2.682198312 + }, + { + "epoch": 1.5486135735054252, + "grad_norm": 2.394444227218628, + "loss": 3.9328, + "lr": 0.00030643356643356645, + "step": 5460, + "tokens_trained": 2.683179776 + }, + { + "epoch": 1.5491809091553792, + "grad_norm": 3.93528151512146, + "loss": 3.8418, + "lr": 0.0003061538461538462, + "step": 5462, + "tokens_trained": 2.684163624 + }, + { + "epoch": 1.549748244805333, + "grad_norm": 3.366722822189331, + "loss": 3.8553, + "lr": 0.0003058741258741259, + "step": 5464, + "tokens_trained": 2.68514788 + }, + { + "epoch": 1.5503155804552868, + "grad_norm": 2.567106246948242, + "loss": 3.8859, + "lr": 0.0003055944055944056, + "step": 5466, + "tokens_trained": 2.686129328 + }, + { + "epoch": 1.5508829161052406, + "grad_norm": 2.0634472370147705, + "loss": 3.8997, + "lr": 0.0003053146853146853, + "step": 5468, + "tokens_trained": 2.687110848 + }, + { + "epoch": 1.5514502517551947, + "grad_norm": 0.823783814907074, + "loss": 3.905, + "lr": 0.000305034965034965, + "step": 5470, + "tokens_trained": 2.688097216 + }, + { + "epoch": 1.5520175874051487, + "grad_norm": 1.0160223245620728, + "loss": 3.8902, + "lr": 0.00030475524475524475, + "step": 5472, + "tokens_trained": 2.689077344 + }, + { + "epoch": 1.5525849230551025, + "grad_norm": 1.5037281513214111, + "loss": 3.823, + "lr": 0.00030447552447552444, + "step": 5474, + "tokens_trained": 2.690056832 + }, + { + "epoch": 1.5531522587050564, + "grad_norm": 0.46490955352783203, + "loss": 3.8819, + "lr": 0.00030419580419580424, + "step": 5476, + "tokens_trained": 2.691035328 + }, + { + "epoch": 1.5537195943550102, + "grad_norm": 1.715409278869629, + "loss": 3.8291, + "lr": 0.00030391608391608393, + "step": 5478, + "tokens_trained": 2.6920166 + }, + { + "epoch": 1.5542869300049642, + "grad_norm": 2.430316925048828, + "loss": 3.8457, + "lr": 0.0003036363636363637, + "step": 5480, + "tokens_trained": 2.6930046 + }, + { + "epoch": 1.5548542656549182, + "grad_norm": 3.483908176422119, + "loss": 3.8864, + "lr": 0.00030335664335664336, + "step": 5482, + "tokens_trained": 2.693984624 + }, + { + "epoch": 1.555421601304872, + "grad_norm": 1.167831301689148, + "loss": 3.8714, + "lr": 0.0003030769230769231, + "step": 5484, + "tokens_trained": 2.694967672 + }, + { + "epoch": 1.5559889369548259, + "grad_norm": 1.5959419012069702, + "loss": 3.8725, + "lr": 0.0003027972027972028, + "step": 5486, + "tokens_trained": 2.69595152 + }, + { + "epoch": 1.5565562726047797, + "grad_norm": 2.6633737087249756, + "loss": 3.8369, + "lr": 0.0003025174825174825, + "step": 5488, + "tokens_trained": 2.696935328 + }, + { + "epoch": 1.5571236082547337, + "grad_norm": 4.084526062011719, + "loss": 3.8687, + "lr": 0.00030223776223776223, + "step": 5490, + "tokens_trained": 2.697917976 + }, + { + "epoch": 1.5576909439046878, + "grad_norm": 2.062319040298462, + "loss": 3.8875, + "lr": 0.0003019580419580419, + "step": 5492, + "tokens_trained": 2.698902704 + }, + { + "epoch": 1.5582582795546416, + "grad_norm": 1.9942662715911865, + "loss": 3.8643, + "lr": 0.0003016783216783217, + "step": 5494, + "tokens_trained": 2.699887184 + }, + { + "epoch": 1.5588256152045954, + "grad_norm": 3.3120384216308594, + "loss": 3.8805, + "lr": 0.0003013986013986014, + "step": 5496, + "tokens_trained": 2.700869192 + }, + { + "epoch": 1.5593929508545492, + "grad_norm": 4.658695220947266, + "loss": 3.8846, + "lr": 0.00030111888111888116, + "step": 5498, + "tokens_trained": 2.701852504 + }, + { + "epoch": 1.5599602865045032, + "grad_norm": 2.397148370742798, + "loss": 3.9023, + "lr": 0.00030083916083916085, + "step": 5500, + "tokens_trained": 2.702838232 + }, + { + "epoch": 1.5599602865045032, + "eval_loss": 0.9699593782424927, + "eval_runtime": 20.4232, + "step": 5500, + "tokens_trained": 2.702838232 + }, + { + "epoch": 1.5605276221544573, + "grad_norm": 3.4792380332946777, + "loss": 3.9054, + "lr": 0.0003005594405594406, + "step": 5502, + "tokens_trained": 2.703821968 + }, + { + "epoch": 1.561094957804411, + "grad_norm": 2.4424889087677, + "loss": 3.8883, + "lr": 0.0003002797202797203, + "step": 5504, + "tokens_trained": 2.704803952 + }, + { + "epoch": 1.561662293454365, + "grad_norm": 2.9872353076934814, + "loss": 3.8708, + "lr": 0.0003, + "step": 5506, + "tokens_trained": 2.705786744 + }, + { + "epoch": 1.5622296291043187, + "grad_norm": 2.74369740486145, + "loss": 3.8685, + "lr": 0.0002997202797202797, + "step": 5508, + "tokens_trained": 2.70677 + }, + { + "epoch": 1.5627969647542728, + "grad_norm": 3.588508367538452, + "loss": 3.8362, + "lr": 0.0002994405594405594, + "step": 5510, + "tokens_trained": 2.707756336 + }, + { + "epoch": 1.5633643004042268, + "grad_norm": 3.268918037414551, + "loss": 3.8514, + "lr": 0.0002991608391608392, + "step": 5512, + "tokens_trained": 2.708738096 + }, + { + "epoch": 1.5639316360541806, + "grad_norm": 3.9960944652557373, + "loss": 3.8601, + "lr": 0.0002988811188811189, + "step": 5514, + "tokens_trained": 2.709718392 + }, + { + "epoch": 1.5644989717041344, + "grad_norm": 1.5690975189208984, + "loss": 3.8826, + "lr": 0.00029860139860139864, + "step": 5516, + "tokens_trained": 2.710698272 + }, + { + "epoch": 1.5650663073540882, + "grad_norm": 1.7052137851715088, + "loss": 3.858, + "lr": 0.00029832167832167833, + "step": 5518, + "tokens_trained": 2.711681816 + }, + { + "epoch": 1.5656336430040423, + "grad_norm": 2.0696487426757812, + "loss": 3.8539, + "lr": 0.000298041958041958, + "step": 5520, + "tokens_trained": 2.712668992 + }, + { + "epoch": 1.5662009786539963, + "grad_norm": 3.0199241638183594, + "loss": 3.8253, + "lr": 0.00029776223776223777, + "step": 5522, + "tokens_trained": 2.713648024 + }, + { + "epoch": 1.5667683143039501, + "grad_norm": 2.88175106048584, + "loss": 3.9067, + "lr": 0.00029748251748251746, + "step": 5524, + "tokens_trained": 2.71462988 + }, + { + "epoch": 1.567335649953904, + "grad_norm": 2.287402868270874, + "loss": 3.7902, + "lr": 0.0002972027972027972, + "step": 5526, + "tokens_trained": 2.715612736 + }, + { + "epoch": 1.5679029856038578, + "grad_norm": 2.2216570377349854, + "loss": 3.8992, + "lr": 0.0002969230769230769, + "step": 5528, + "tokens_trained": 2.716597376 + }, + { + "epoch": 1.5684703212538118, + "grad_norm": 4.012553691864014, + "loss": 3.85, + "lr": 0.0002966433566433567, + "step": 5530, + "tokens_trained": 2.717578944 + }, + { + "epoch": 1.5690376569037658, + "grad_norm": 3.187795639038086, + "loss": 3.8657, + "lr": 0.0002963636363636364, + "step": 5532, + "tokens_trained": 2.718558344 + }, + { + "epoch": 1.5696049925537197, + "grad_norm": 0.5813043713569641, + "loss": 3.8923, + "lr": 0.0002960839160839161, + "step": 5534, + "tokens_trained": 2.719547264 + }, + { + "epoch": 1.5701723282036735, + "grad_norm": 2.481187105178833, + "loss": 3.8645, + "lr": 0.0002958041958041958, + "step": 5536, + "tokens_trained": 2.7205292 + }, + { + "epoch": 1.5707396638536273, + "grad_norm": 2.6428370475769043, + "loss": 3.809, + "lr": 0.0002955244755244755, + "step": 5538, + "tokens_trained": 2.721513648 + }, + { + "epoch": 1.5713069995035813, + "grad_norm": 3.8301851749420166, + "loss": 3.844, + "lr": 0.00029524475524475525, + "step": 5540, + "tokens_trained": 2.722494504 + }, + { + "epoch": 1.5718743351535354, + "grad_norm": 2.134653091430664, + "loss": 3.8471, + "lr": 0.00029496503496503494, + "step": 5542, + "tokens_trained": 2.72347492 + }, + { + "epoch": 1.5724416708034892, + "grad_norm": 0.9983079433441162, + "loss": 3.8889, + "lr": 0.0002946853146853147, + "step": 5544, + "tokens_trained": 2.724455024 + }, + { + "epoch": 1.573009006453443, + "grad_norm": 0.41518381237983704, + "loss": 3.8726, + "lr": 0.0002944055944055944, + "step": 5546, + "tokens_trained": 2.72544024 + }, + { + "epoch": 1.5735763421033968, + "grad_norm": 0.42304641008377075, + "loss": 3.8176, + "lr": 0.0002941258741258741, + "step": 5548, + "tokens_trained": 2.726424232 + }, + { + "epoch": 1.5741436777533508, + "grad_norm": 1.4611626863479614, + "loss": 3.8959, + "lr": 0.00029384615384615387, + "step": 5550, + "tokens_trained": 2.727410496 + }, + { + "epoch": 1.5747110134033049, + "grad_norm": 0.546713650226593, + "loss": 3.9122, + "lr": 0.0002935664335664336, + "step": 5552, + "tokens_trained": 2.728396976 + }, + { + "epoch": 1.5752783490532587, + "grad_norm": 1.1208237409591675, + "loss": 3.8791, + "lr": 0.0002932867132867133, + "step": 5554, + "tokens_trained": 2.729385936 + }, + { + "epoch": 1.5758456847032125, + "grad_norm": 2.6620264053344727, + "loss": 3.877, + "lr": 0.000293006993006993, + "step": 5556, + "tokens_trained": 2.730368968 + }, + { + "epoch": 1.5764130203531663, + "grad_norm": 0.7671589255332947, + "loss": 3.8491, + "lr": 0.00029272727272727274, + "step": 5558, + "tokens_trained": 2.731351336 + }, + { + "epoch": 1.5769803560031204, + "grad_norm": 0.7316055297851562, + "loss": 3.849, + "lr": 0.0002924475524475524, + "step": 5560, + "tokens_trained": 2.732334296 + }, + { + "epoch": 1.5775476916530744, + "grad_norm": 3.3884339332580566, + "loss": 3.9126, + "lr": 0.00029216783216783217, + "step": 5562, + "tokens_trained": 2.733314536 + }, + { + "epoch": 1.5781150273030282, + "grad_norm": 1.2948181629180908, + "loss": 3.9129, + "lr": 0.00029188811188811186, + "step": 5564, + "tokens_trained": 2.73429728 + }, + { + "epoch": 1.578682362952982, + "grad_norm": 3.948852777481079, + "loss": 3.845, + "lr": 0.0002916083916083916, + "step": 5566, + "tokens_trained": 2.735282408 + }, + { + "epoch": 1.5792496986029358, + "grad_norm": 4.460155963897705, + "loss": 3.8637, + "lr": 0.00029132867132867135, + "step": 5568, + "tokens_trained": 2.736265168 + }, + { + "epoch": 1.5798170342528899, + "grad_norm": 2.052924633026123, + "loss": 3.9199, + "lr": 0.0002910489510489511, + "step": 5570, + "tokens_trained": 2.737247144 + }, + { + "epoch": 1.580384369902844, + "grad_norm": 2.460111379623413, + "loss": 3.8224, + "lr": 0.0002907692307692308, + "step": 5572, + "tokens_trained": 2.73823084 + }, + { + "epoch": 1.5809517055527977, + "grad_norm": 1.7709126472473145, + "loss": 3.8537, + "lr": 0.0002904895104895105, + "step": 5574, + "tokens_trained": 2.739212296 + }, + { + "epoch": 1.5815190412027516, + "grad_norm": 2.155181884765625, + "loss": 3.9214, + "lr": 0.0002902097902097902, + "step": 5576, + "tokens_trained": 2.740193656 + }, + { + "epoch": 1.5820863768527054, + "grad_norm": 2.0963149070739746, + "loss": 3.8832, + "lr": 0.0002899300699300699, + "step": 5578, + "tokens_trained": 2.741176512 + }, + { + "epoch": 1.5826537125026594, + "grad_norm": 2.6366584300994873, + "loss": 3.8542, + "lr": 0.00028965034965034966, + "step": 5580, + "tokens_trained": 2.742157504 + }, + { + "epoch": 1.5832210481526134, + "grad_norm": 1.9845340251922607, + "loss": 3.8644, + "lr": 0.00028937062937062935, + "step": 5582, + "tokens_trained": 2.743138064 + }, + { + "epoch": 1.5837883838025673, + "grad_norm": 0.9953936338424683, + "loss": 3.8577, + "lr": 0.0002890909090909091, + "step": 5584, + "tokens_trained": 2.744115648 + }, + { + "epoch": 1.584355719452521, + "grad_norm": 1.3023415803909302, + "loss": 3.8784, + "lr": 0.00028881118881118883, + "step": 5586, + "tokens_trained": 2.745097536 + }, + { + "epoch": 1.5849230551024749, + "grad_norm": 1.2267543077468872, + "loss": 3.8312, + "lr": 0.0002885314685314686, + "step": 5588, + "tokens_trained": 2.74608184 + }, + { + "epoch": 1.585490390752429, + "grad_norm": 0.7333025932312012, + "loss": 3.8134, + "lr": 0.00028825174825174827, + "step": 5590, + "tokens_trained": 2.747067224 + }, + { + "epoch": 1.586057726402383, + "grad_norm": 3.838825225830078, + "loss": 3.8554, + "lr": 0.00028797202797202796, + "step": 5592, + "tokens_trained": 2.74804864 + }, + { + "epoch": 1.5866250620523368, + "grad_norm": 2.8580691814422607, + "loss": 3.8197, + "lr": 0.0002876923076923077, + "step": 5594, + "tokens_trained": 2.749030704 + }, + { + "epoch": 1.5871923977022906, + "grad_norm": 3.3770620822906494, + "loss": 3.845, + "lr": 0.0002874125874125874, + "step": 5596, + "tokens_trained": 2.750011464 + }, + { + "epoch": 1.5877597333522444, + "grad_norm": 2.183331251144409, + "loss": 3.8625, + "lr": 0.00028713286713286714, + "step": 5598, + "tokens_trained": 2.750993992 + }, + { + "epoch": 1.5883270690021984, + "grad_norm": 1.1044546365737915, + "loss": 3.8821, + "lr": 0.00028685314685314683, + "step": 5600, + "tokens_trained": 2.75197632 + }, + { + "epoch": 1.5888944046521525, + "grad_norm": 1.9587361812591553, + "loss": 3.9019, + "lr": 0.0002865734265734266, + "step": 5602, + "tokens_trained": 2.75295512 + }, + { + "epoch": 1.5894617403021063, + "grad_norm": 5.257344722747803, + "loss": 3.8587, + "lr": 0.0002862937062937063, + "step": 5604, + "tokens_trained": 2.75393968 + }, + { + "epoch": 1.5900290759520601, + "grad_norm": 2.98882794380188, + "loss": 3.8751, + "lr": 0.00028601398601398606, + "step": 5606, + "tokens_trained": 2.754917056 + }, + { + "epoch": 1.590596411602014, + "grad_norm": 3.215801239013672, + "loss": 3.8178, + "lr": 0.00028573426573426575, + "step": 5608, + "tokens_trained": 2.755897352 + }, + { + "epoch": 1.591163747251968, + "grad_norm": 3.7019567489624023, + "loss": 3.8578, + "lr": 0.00028545454545454544, + "step": 5610, + "tokens_trained": 2.756876792 + }, + { + "epoch": 1.591731082901922, + "grad_norm": 0.5233857035636902, + "loss": 3.84, + "lr": 0.0002851748251748252, + "step": 5612, + "tokens_trained": 2.75786064 + }, + { + "epoch": 1.5922984185518758, + "grad_norm": 1.3499095439910889, + "loss": 3.846, + "lr": 0.0002848951048951049, + "step": 5614, + "tokens_trained": 2.758843208 + }, + { + "epoch": 1.5928657542018296, + "grad_norm": 3.770670175552368, + "loss": 3.8715, + "lr": 0.0002846153846153846, + "step": 5616, + "tokens_trained": 2.759831272 + }, + { + "epoch": 1.5934330898517834, + "grad_norm": 2.2430431842803955, + "loss": 3.8457, + "lr": 0.0002843356643356643, + "step": 5618, + "tokens_trained": 2.760816576 + }, + { + "epoch": 1.5940004255017375, + "grad_norm": 2.121674060821533, + "loss": 3.8379, + "lr": 0.00028405594405594406, + "step": 5620, + "tokens_trained": 2.761806192 + }, + { + "epoch": 1.5945677611516915, + "grad_norm": 2.42568302154541, + "loss": 3.8762, + "lr": 0.0002837762237762238, + "step": 5622, + "tokens_trained": 2.762787368 + }, + { + "epoch": 1.5951350968016453, + "grad_norm": 2.4501335620880127, + "loss": 3.8985, + "lr": 0.00028349650349650355, + "step": 5624, + "tokens_trained": 2.763768648 + }, + { + "epoch": 1.5954187646266222, + "eval_loss": 0.9685465693473816, + "eval_runtime": 20.383, + "step": 5625, + "tokens_trained": 2.764258728 + }, + { + "epoch": 1.5957024324515992, + "grad_norm": 1.7675210237503052, + "loss": 3.8317, + "lr": 0.00028321678321678324, + "step": 5626, + "tokens_trained": 2.764748448 + }, + { + "epoch": 1.596269768101553, + "grad_norm": 2.069201707839966, + "loss": 3.8316, + "lr": 0.00028293706293706293, + "step": 5628, + "tokens_trained": 2.7657334 + }, + { + "epoch": 1.596837103751507, + "grad_norm": 3.7776238918304443, + "loss": 3.8705, + "lr": 0.0002826573426573427, + "step": 5630, + "tokens_trained": 2.766718144 + }, + { + "epoch": 1.597404439401461, + "grad_norm": 4.658926963806152, + "loss": 3.8201, + "lr": 0.00028237762237762236, + "step": 5632, + "tokens_trained": 2.76770032 + }, + { + "epoch": 1.5979717750514149, + "grad_norm": 2.883873462677002, + "loss": 3.8384, + "lr": 0.0002820979020979021, + "step": 5634, + "tokens_trained": 2.768682736 + }, + { + "epoch": 1.5985391107013687, + "grad_norm": 3.313469886779785, + "loss": 3.8485, + "lr": 0.0002818181818181818, + "step": 5636, + "tokens_trained": 2.769667704 + }, + { + "epoch": 1.5991064463513225, + "grad_norm": 3.279757022857666, + "loss": 3.8407, + "lr": 0.00028153846153846154, + "step": 5638, + "tokens_trained": 2.77065176 + }, + { + "epoch": 1.5996737820012765, + "grad_norm": 3.4190688133239746, + "loss": 3.8733, + "lr": 0.0002812587412587413, + "step": 5640, + "tokens_trained": 2.77163072 + }, + { + "epoch": 1.6002411176512306, + "grad_norm": 2.766123056411743, + "loss": 3.8826, + "lr": 0.000280979020979021, + "step": 5642, + "tokens_trained": 2.772610632 + }, + { + "epoch": 1.6008084533011844, + "grad_norm": 2.292541742324829, + "loss": 3.8072, + "lr": 0.0002806993006993007, + "step": 5644, + "tokens_trained": 2.77359028 + }, + { + "epoch": 1.6013757889511382, + "grad_norm": 3.0967636108398438, + "loss": 3.8529, + "lr": 0.0002804195804195804, + "step": 5646, + "tokens_trained": 2.774572744 + }, + { + "epoch": 1.601943124601092, + "grad_norm": 4.144455432891846, + "loss": 3.8964, + "lr": 0.00028013986013986016, + "step": 5648, + "tokens_trained": 2.775555344 + }, + { + "epoch": 1.602510460251046, + "grad_norm": 1.0935693979263306, + "loss": 3.8742, + "lr": 0.00027986013986013985, + "step": 5650, + "tokens_trained": 2.77653816 + }, + { + "epoch": 1.603077795901, + "grad_norm": 1.5766457319259644, + "loss": 3.854, + "lr": 0.0002795804195804196, + "step": 5652, + "tokens_trained": 2.777518952 + }, + { + "epoch": 1.603645131550954, + "grad_norm": 7.910213470458984, + "loss": 3.8816, + "lr": 0.0002793006993006993, + "step": 5654, + "tokens_trained": 2.778506272 + }, + { + "epoch": 1.6042124672009077, + "grad_norm": 4.65513277053833, + "loss": 3.8729, + "lr": 0.00027902097902097903, + "step": 5656, + "tokens_trained": 2.779489664 + }, + { + "epoch": 1.6047798028508615, + "grad_norm": 3.681711435317993, + "loss": 3.8677, + "lr": 0.00027874125874125877, + "step": 5658, + "tokens_trained": 2.780472776 + }, + { + "epoch": 1.6053471385008156, + "grad_norm": 5.058254718780518, + "loss": 3.8606, + "lr": 0.00027846153846153846, + "step": 5660, + "tokens_trained": 2.781455744 + }, + { + "epoch": 1.6059144741507696, + "grad_norm": 4.267047882080078, + "loss": 3.8373, + "lr": 0.0002781818181818182, + "step": 5662, + "tokens_trained": 2.78244188 + }, + { + "epoch": 1.6064818098007234, + "grad_norm": 3.1416563987731934, + "loss": 3.8404, + "lr": 0.0002779020979020979, + "step": 5664, + "tokens_trained": 2.783428128 + }, + { + "epoch": 1.6070491454506772, + "grad_norm": 4.125866413116455, + "loss": 3.8763, + "lr": 0.00027762237762237764, + "step": 5666, + "tokens_trained": 2.784414224 + }, + { + "epoch": 1.607616481100631, + "grad_norm": 3.6334707736968994, + "loss": 3.8564, + "lr": 0.00027734265734265733, + "step": 5668, + "tokens_trained": 2.785394264 + }, + { + "epoch": 1.608183816750585, + "grad_norm": 4.244611740112305, + "loss": 3.8709, + "lr": 0.0002770629370629371, + "step": 5670, + "tokens_trained": 2.786378176 + }, + { + "epoch": 1.6087511524005391, + "grad_norm": 2.5464348793029785, + "loss": 3.9095, + "lr": 0.00027678321678321677, + "step": 5672, + "tokens_trained": 2.787364264 + }, + { + "epoch": 1.609318488050493, + "grad_norm": 2.5525379180908203, + "loss": 3.868, + "lr": 0.0002765034965034965, + "step": 5674, + "tokens_trained": 2.78834892 + }, + { + "epoch": 1.6098858237004467, + "grad_norm": 1.4956291913986206, + "loss": 3.8024, + "lr": 0.0002762237762237762, + "step": 5676, + "tokens_trained": 2.789330144 + }, + { + "epoch": 1.6104531593504006, + "grad_norm": 1.331429362297058, + "loss": 3.8308, + "lr": 0.00027594405594405595, + "step": 5678, + "tokens_trained": 2.790313456 + }, + { + "epoch": 1.6110204950003546, + "grad_norm": 1.6636086702346802, + "loss": 3.8013, + "lr": 0.0002756643356643357, + "step": 5680, + "tokens_trained": 2.791291288 + }, + { + "epoch": 1.6115878306503086, + "grad_norm": 1.0856963396072388, + "loss": 3.8851, + "lr": 0.0002753846153846154, + "step": 5682, + "tokens_trained": 2.792272912 + }, + { + "epoch": 1.6121551663002625, + "grad_norm": 0.8681638240814209, + "loss": 3.8744, + "lr": 0.0002751048951048951, + "step": 5684, + "tokens_trained": 2.793256312 + }, + { + "epoch": 1.6127225019502163, + "grad_norm": 1.770532488822937, + "loss": 3.8362, + "lr": 0.0002748251748251748, + "step": 5686, + "tokens_trained": 2.794240288 + }, + { + "epoch": 1.61328983760017, + "grad_norm": 2.9169862270355225, + "loss": 3.8114, + "lr": 0.00027454545454545456, + "step": 5688, + "tokens_trained": 2.795223128 + }, + { + "epoch": 1.6138571732501241, + "grad_norm": 2.319213628768921, + "loss": 3.823, + "lr": 0.00027426573426573425, + "step": 5690, + "tokens_trained": 2.796204352 + }, + { + "epoch": 1.6144245089000782, + "grad_norm": 1.7466791868209839, + "loss": 3.8292, + "lr": 0.000273986013986014, + "step": 5692, + "tokens_trained": 2.797188408 + }, + { + "epoch": 1.614991844550032, + "grad_norm": 2.5481719970703125, + "loss": 3.8338, + "lr": 0.0002737062937062937, + "step": 5694, + "tokens_trained": 2.79817152 + }, + { + "epoch": 1.6155591801999858, + "grad_norm": 1.9857237339019775, + "loss": 3.8354, + "lr": 0.00027342657342657343, + "step": 5696, + "tokens_trained": 2.799152488 + }, + { + "epoch": 1.6161265158499396, + "grad_norm": 2.332441568374634, + "loss": 3.8649, + "lr": 0.0002731468531468532, + "step": 5698, + "tokens_trained": 2.800136424 + }, + { + "epoch": 1.6166938514998936, + "grad_norm": 1.6021710634231567, + "loss": 3.892, + "lr": 0.00027286713286713287, + "step": 5700, + "tokens_trained": 2.801121064 + }, + { + "epoch": 1.6172611871498477, + "grad_norm": 1.5943433046340942, + "loss": 3.8719, + "lr": 0.0002725874125874126, + "step": 5702, + "tokens_trained": 2.802104456 + }, + { + "epoch": 1.6178285227998015, + "grad_norm": 1.7614659070968628, + "loss": 3.8755, + "lr": 0.0002723076923076923, + "step": 5704, + "tokens_trained": 2.803086472 + }, + { + "epoch": 1.6183958584497553, + "grad_norm": 0.709842324256897, + "loss": 3.883, + "lr": 0.00027202797202797205, + "step": 5706, + "tokens_trained": 2.804074552 + }, + { + "epoch": 1.6189631940997091, + "grad_norm": 2.912022829055786, + "loss": 3.827, + "lr": 0.00027174825174825174, + "step": 5708, + "tokens_trained": 2.805050624 + }, + { + "epoch": 1.6195305297496632, + "grad_norm": 1.5365500450134277, + "loss": 3.8561, + "lr": 0.0002714685314685315, + "step": 5710, + "tokens_trained": 2.806032664 + }, + { + "epoch": 1.6200978653996172, + "grad_norm": 1.8530750274658203, + "loss": 3.8715, + "lr": 0.00027118881118881117, + "step": 5712, + "tokens_trained": 2.807015072 + }, + { + "epoch": 1.620665201049571, + "grad_norm": 6.598786354064941, + "loss": 3.9111, + "lr": 0.0002709090909090909, + "step": 5714, + "tokens_trained": 2.80800292 + }, + { + "epoch": 1.6212325366995248, + "grad_norm": 3.761838436126709, + "loss": 3.8632, + "lr": 0.00027062937062937066, + "step": 5716, + "tokens_trained": 2.80898492 + }, + { + "epoch": 1.6217998723494786, + "grad_norm": 1.7242389917373657, + "loss": 3.8467, + "lr": 0.00027034965034965035, + "step": 5718, + "tokens_trained": 2.80996564 + }, + { + "epoch": 1.6223672079994327, + "grad_norm": 5.131701946258545, + "loss": 3.8868, + "lr": 0.0002700699300699301, + "step": 5720, + "tokens_trained": 2.810949456 + }, + { + "epoch": 1.6229345436493867, + "grad_norm": 3.7940638065338135, + "loss": 3.8526, + "lr": 0.0002697902097902098, + "step": 5722, + "tokens_trained": 2.811933712 + }, + { + "epoch": 1.6235018792993405, + "grad_norm": 3.0134806632995605, + "loss": 3.9174, + "lr": 0.00026951048951048953, + "step": 5724, + "tokens_trained": 2.812914312 + }, + { + "epoch": 1.6240692149492943, + "grad_norm": 4.154657363891602, + "loss": 3.86, + "lr": 0.0002692307692307692, + "step": 5726, + "tokens_trained": 2.813900408 + }, + { + "epoch": 1.6246365505992482, + "grad_norm": 4.034200668334961, + "loss": 3.8983, + "lr": 0.00026895104895104896, + "step": 5728, + "tokens_trained": 2.8148846 + }, + { + "epoch": 1.6252038862492022, + "grad_norm": 2.5282604694366455, + "loss": 3.8614, + "lr": 0.00026867132867132865, + "step": 5730, + "tokens_trained": 2.81587128 + }, + { + "epoch": 1.6257712218991562, + "grad_norm": 3.9052770137786865, + "loss": 3.8696, + "lr": 0.0002683916083916084, + "step": 5732, + "tokens_trained": 2.816857792 + }, + { + "epoch": 1.62633855754911, + "grad_norm": 4.2138352394104, + "loss": 3.8902, + "lr": 0.00026811188811188814, + "step": 5734, + "tokens_trained": 2.817840552 + }, + { + "epoch": 1.6269058931990639, + "grad_norm": 1.2808244228363037, + "loss": 3.8675, + "lr": 0.00026783216783216783, + "step": 5736, + "tokens_trained": 2.818825232 + }, + { + "epoch": 1.6274732288490177, + "grad_norm": 2.491243839263916, + "loss": 3.9218, + "lr": 0.0002675524475524476, + "step": 5738, + "tokens_trained": 2.819809352 + }, + { + "epoch": 1.6280405644989717, + "grad_norm": 3.1643896102905273, + "loss": 3.8813, + "lr": 0.00026727272727272727, + "step": 5740, + "tokens_trained": 2.820793456 + }, + { + "epoch": 1.6286079001489258, + "grad_norm": 3.648646593093872, + "loss": 3.8445, + "lr": 0.000266993006993007, + "step": 5742, + "tokens_trained": 2.821780088 + }, + { + "epoch": 1.6291752357988796, + "grad_norm": 2.1239254474639893, + "loss": 3.8781, + "lr": 0.0002667132867132867, + "step": 5744, + "tokens_trained": 2.82276436 + }, + { + "epoch": 1.6297425714488334, + "grad_norm": 2.5850162506103516, + "loss": 3.8625, + "lr": 0.0002664335664335664, + "step": 5746, + "tokens_trained": 2.82375128 + }, + { + "epoch": 1.6303099070987872, + "grad_norm": 2.6930086612701416, + "loss": 3.817, + "lr": 0.00026615384615384614, + "step": 5748, + "tokens_trained": 2.824735192 + }, + { + "epoch": 1.6308772427487412, + "grad_norm": 0.6374559998512268, + "loss": 3.8555, + "lr": 0.0002658741258741259, + "step": 5750, + "tokens_trained": 2.825718392 + }, + { + "epoch": 1.6308772427487412, + "eval_loss": 0.9677565097808838, + "eval_runtime": 20.3934, + "step": 5750, + "tokens_trained": 2.825718392 + }, + { + "epoch": 1.6314445783986953, + "grad_norm": 2.324770212173462, + "loss": 3.8393, + "lr": 0.00026559440559440563, + "step": 5752, + "tokens_trained": 2.826700664 + }, + { + "epoch": 1.632011914048649, + "grad_norm": 3.6169118881225586, + "loss": 3.8783, + "lr": 0.0002653146853146853, + "step": 5754, + "tokens_trained": 2.82768492 + }, + { + "epoch": 1.632579249698603, + "grad_norm": 3.1136844158172607, + "loss": 3.8528, + "lr": 0.00026503496503496506, + "step": 5756, + "tokens_trained": 2.828668856 + }, + { + "epoch": 1.6331465853485567, + "grad_norm": 1.646531105041504, + "loss": 3.8368, + "lr": 0.00026475524475524475, + "step": 5758, + "tokens_trained": 2.82965104 + }, + { + "epoch": 1.6337139209985108, + "grad_norm": 1.9851844310760498, + "loss": 3.8839, + "lr": 0.0002644755244755245, + "step": 5760, + "tokens_trained": 2.830636984 + }, + { + "epoch": 1.6342812566484648, + "grad_norm": 5.908127307891846, + "loss": 3.8477, + "lr": 0.0002641958041958042, + "step": 5762, + "tokens_trained": 2.831616488 + }, + { + "epoch": 1.6348485922984186, + "grad_norm": 4.9002909660339355, + "loss": 3.8279, + "lr": 0.0002639160839160839, + "step": 5764, + "tokens_trained": 2.832599464 + }, + { + "epoch": 1.6354159279483724, + "grad_norm": 2.045973539352417, + "loss": 3.8317, + "lr": 0.0002636363636363636, + "step": 5766, + "tokens_trained": 2.83358544 + }, + { + "epoch": 1.6359832635983262, + "grad_norm": 1.7147414684295654, + "loss": 3.8913, + "lr": 0.00026335664335664337, + "step": 5768, + "tokens_trained": 2.83456792 + }, + { + "epoch": 1.6365505992482803, + "grad_norm": 2.8540899753570557, + "loss": 3.8896, + "lr": 0.0002630769230769231, + "step": 5770, + "tokens_trained": 2.835546528 + }, + { + "epoch": 1.6371179348982343, + "grad_norm": 2.798184633255005, + "loss": 3.8901, + "lr": 0.0002627972027972028, + "step": 5772, + "tokens_trained": 2.836531536 + }, + { + "epoch": 1.6376852705481881, + "grad_norm": 3.74381160736084, + "loss": 3.8667, + "lr": 0.00026251748251748255, + "step": 5774, + "tokens_trained": 2.837511976 + }, + { + "epoch": 1.638252606198142, + "grad_norm": 1.3036679029464722, + "loss": 3.8828, + "lr": 0.00026223776223776224, + "step": 5776, + "tokens_trained": 2.838494216 + }, + { + "epoch": 1.6388199418480958, + "grad_norm": 2.3305046558380127, + "loss": 3.8687, + "lr": 0.000261958041958042, + "step": 5778, + "tokens_trained": 2.839477616 + }, + { + "epoch": 1.6393872774980498, + "grad_norm": 1.8486007452011108, + "loss": 3.8277, + "lr": 0.00026167832167832167, + "step": 5780, + "tokens_trained": 2.840460776 + }, + { + "epoch": 1.6399546131480038, + "grad_norm": 7.9603681564331055, + "loss": 3.8558, + "lr": 0.00026139860139860136, + "step": 5782, + "tokens_trained": 2.841442784 + }, + { + "epoch": 1.6405219487979577, + "grad_norm": 6.6514410972595215, + "loss": 3.8566, + "lr": 0.0002611188811188811, + "step": 5784, + "tokens_trained": 2.842423376 + }, + { + "epoch": 1.6410892844479115, + "grad_norm": 4.3851237297058105, + "loss": 3.8145, + "lr": 0.00026083916083916085, + "step": 5786, + "tokens_trained": 2.843410992 + }, + { + "epoch": 1.6416566200978653, + "grad_norm": 6.750310897827148, + "loss": 3.8696, + "lr": 0.0002605594405594406, + "step": 5788, + "tokens_trained": 2.844393928 + }, + { + "epoch": 1.6422239557478193, + "grad_norm": 3.409925937652588, + "loss": 3.8069, + "lr": 0.0002602797202797203, + "step": 5790, + "tokens_trained": 2.845371936 + }, + { + "epoch": 1.6427912913977734, + "grad_norm": 4.318549633026123, + "loss": 3.862, + "lr": 0.00026000000000000003, + "step": 5792, + "tokens_trained": 2.846352808 + }, + { + "epoch": 1.6433586270477272, + "grad_norm": 3.3245508670806885, + "loss": 3.9211, + "lr": 0.0002597202797202797, + "step": 5794, + "tokens_trained": 2.847335992 + }, + { + "epoch": 1.643925962697681, + "grad_norm": 2.312521457672119, + "loss": 3.7887, + "lr": 0.00025944055944055947, + "step": 5796, + "tokens_trained": 2.848320616 + }, + { + "epoch": 1.6444932983476348, + "grad_norm": 1.4259709119796753, + "loss": 3.8607, + "lr": 0.00025916083916083916, + "step": 5798, + "tokens_trained": 2.849300928 + }, + { + "epoch": 1.6450606339975888, + "grad_norm": 0.9020340442657471, + "loss": 3.869, + "lr": 0.00025888111888111885, + "step": 5800, + "tokens_trained": 2.850282648 + }, + { + "epoch": 1.6456279696475429, + "grad_norm": 2.114844799041748, + "loss": 3.8049, + "lr": 0.0002586013986013986, + "step": 5802, + "tokens_trained": 2.85126584 + }, + { + "epoch": 1.6461953052974967, + "grad_norm": 4.662852764129639, + "loss": 3.8474, + "lr": 0.0002583216783216783, + "step": 5804, + "tokens_trained": 2.852253648 + }, + { + "epoch": 1.6467626409474505, + "grad_norm": 4.038625240325928, + "loss": 3.8813, + "lr": 0.0002580419580419581, + "step": 5806, + "tokens_trained": 2.853237552 + }, + { + "epoch": 1.6473299765974043, + "grad_norm": 2.922651767730713, + "loss": 3.8331, + "lr": 0.00025776223776223777, + "step": 5808, + "tokens_trained": 2.854218656 + }, + { + "epoch": 1.6478973122473584, + "grad_norm": 4.35854434967041, + "loss": 3.8623, + "lr": 0.0002574825174825175, + "step": 5810, + "tokens_trained": 2.855199264 + }, + { + "epoch": 1.6484646478973124, + "grad_norm": 2.1086177825927734, + "loss": 3.8747, + "lr": 0.0002572027972027972, + "step": 5812, + "tokens_trained": 2.856182856 + }, + { + "epoch": 1.6490319835472662, + "grad_norm": 1.4423526525497437, + "loss": 3.8822, + "lr": 0.00025692307692307695, + "step": 5814, + "tokens_trained": 2.85716116 + }, + { + "epoch": 1.64959931919722, + "grad_norm": 1.7866076231002808, + "loss": 3.8701, + "lr": 0.00025664335664335664, + "step": 5816, + "tokens_trained": 2.8581406 + }, + { + "epoch": 1.6501666548471738, + "grad_norm": 0.9082437753677368, + "loss": 3.8207, + "lr": 0.00025636363636363633, + "step": 5818, + "tokens_trained": 2.859123392 + }, + { + "epoch": 1.6507339904971279, + "grad_norm": 2.493602991104126, + "loss": 3.8473, + "lr": 0.0002560839160839161, + "step": 5820, + "tokens_trained": 2.860107752 + }, + { + "epoch": 1.651301326147082, + "grad_norm": 2.814542055130005, + "loss": 3.8977, + "lr": 0.00025580419580419577, + "step": 5822, + "tokens_trained": 2.86109204 + }, + { + "epoch": 1.6518686617970357, + "grad_norm": 3.3991076946258545, + "loss": 3.7998, + "lr": 0.00025552447552447557, + "step": 5824, + "tokens_trained": 2.862077776 + }, + { + "epoch": 1.6524359974469895, + "grad_norm": 4.02992057800293, + "loss": 3.8594, + "lr": 0.00025524475524475526, + "step": 5826, + "tokens_trained": 2.863062432 + }, + { + "epoch": 1.6530033330969434, + "grad_norm": 5.211875915527344, + "loss": 3.8718, + "lr": 0.000254965034965035, + "step": 5828, + "tokens_trained": 2.86404512 + }, + { + "epoch": 1.6535706687468974, + "grad_norm": 2.361069917678833, + "loss": 3.837, + "lr": 0.0002546853146853147, + "step": 5830, + "tokens_trained": 2.865024888 + }, + { + "epoch": 1.6541380043968514, + "grad_norm": 6.926619052886963, + "loss": 3.8268, + "lr": 0.00025440559440559443, + "step": 5832, + "tokens_trained": 2.866006128 + }, + { + "epoch": 1.6547053400468053, + "grad_norm": 3.741729974746704, + "loss": 3.8109, + "lr": 0.0002541258741258741, + "step": 5834, + "tokens_trained": 2.866990408 + }, + { + "epoch": 1.655272675696759, + "grad_norm": 4.150857448577881, + "loss": 3.9021, + "lr": 0.0002538461538461538, + "step": 5836, + "tokens_trained": 2.867972736 + }, + { + "epoch": 1.6558400113467129, + "grad_norm": 3.9393651485443115, + "loss": 3.8552, + "lr": 0.00025356643356643356, + "step": 5838, + "tokens_trained": 2.86895772 + }, + { + "epoch": 1.656407346996667, + "grad_norm": 1.9962868690490723, + "loss": 3.8358, + "lr": 0.00025328671328671325, + "step": 5840, + "tokens_trained": 2.869938184 + }, + { + "epoch": 1.656974682646621, + "grad_norm": 0.8876021504402161, + "loss": 3.8869, + "lr": 0.00025300699300699305, + "step": 5842, + "tokens_trained": 2.87091688 + }, + { + "epoch": 1.6575420182965748, + "grad_norm": 1.5319703817367554, + "loss": 3.8339, + "lr": 0.00025272727272727274, + "step": 5844, + "tokens_trained": 2.871899592 + }, + { + "epoch": 1.6581093539465286, + "grad_norm": 1.3673622608184814, + "loss": 3.8461, + "lr": 0.0002524475524475525, + "step": 5846, + "tokens_trained": 2.87288524 + }, + { + "epoch": 1.6586766895964824, + "grad_norm": 1.8747504949569702, + "loss": 3.8687, + "lr": 0.0002521678321678322, + "step": 5848, + "tokens_trained": 2.873868904 + }, + { + "epoch": 1.6592440252464364, + "grad_norm": 1.78745698928833, + "loss": 3.8289, + "lr": 0.0002518881118881119, + "step": 5850, + "tokens_trained": 2.874854024 + }, + { + "epoch": 1.6598113608963905, + "grad_norm": 1.74812650680542, + "loss": 3.8631, + "lr": 0.0002516083916083916, + "step": 5852, + "tokens_trained": 2.875837264 + }, + { + "epoch": 1.6603786965463443, + "grad_norm": 4.6655778884887695, + "loss": 3.767, + "lr": 0.0002513286713286713, + "step": 5854, + "tokens_trained": 2.876824112 + }, + { + "epoch": 1.660946032196298, + "grad_norm": 4.012164115905762, + "loss": 3.8522, + "lr": 0.00025104895104895104, + "step": 5856, + "tokens_trained": 2.877811008 + }, + { + "epoch": 1.661513367846252, + "grad_norm": 4.21424674987793, + "loss": 3.8507, + "lr": 0.00025076923076923073, + "step": 5858, + "tokens_trained": 2.87879564 + }, + { + "epoch": 1.662080703496206, + "grad_norm": 4.155895233154297, + "loss": 3.8529, + "lr": 0.00025048951048951053, + "step": 5860, + "tokens_trained": 2.879774832 + }, + { + "epoch": 1.66264803914616, + "grad_norm": 2.7593812942504883, + "loss": 3.8609, + "lr": 0.0002502097902097902, + "step": 5862, + "tokens_trained": 2.880758592 + }, + { + "epoch": 1.6632153747961138, + "grad_norm": 1.1735769510269165, + "loss": 3.8646, + "lr": 0.00024993006993006997, + "step": 5864, + "tokens_trained": 2.881743336 + }, + { + "epoch": 1.6637827104460676, + "grad_norm": 2.4293084144592285, + "loss": 3.867, + "lr": 0.00024965034965034966, + "step": 5866, + "tokens_trained": 2.882725984 + }, + { + "epoch": 1.6643500460960214, + "grad_norm": 1.6900265216827393, + "loss": 3.8467, + "lr": 0.00024937062937062935, + "step": 5868, + "tokens_trained": 2.883709024 + }, + { + "epoch": 1.6649173817459755, + "grad_norm": 1.6338657140731812, + "loss": 3.8499, + "lr": 0.0002490909090909091, + "step": 5870, + "tokens_trained": 2.884690136 + }, + { + "epoch": 1.6654847173959295, + "grad_norm": 1.3867520093917847, + "loss": 3.8497, + "lr": 0.0002488111888111888, + "step": 5872, + "tokens_trained": 2.885670192 + }, + { + "epoch": 1.6660520530458833, + "grad_norm": 2.3722336292266846, + "loss": 3.8404, + "lr": 0.00024853146853146853, + "step": 5874, + "tokens_trained": 2.88665212 + }, + { + "epoch": 1.6663357208708602, + "eval_loss": 0.9677584767341614, + "eval_runtime": 21.0454, + "step": 5875, + "tokens_trained": 2.887143888 + }, + { + "epoch": 1.6666193886958371, + "grad_norm": 3.559649705886841, + "loss": 3.8521, + "lr": 0.0002482517482517483, + "step": 5876, + "tokens_trained": 2.887632456 + }, + { + "epoch": 1.667186724345791, + "grad_norm": 3.4279959201812744, + "loss": 3.8603, + "lr": 0.00024797202797202796, + "step": 5878, + "tokens_trained": 2.888610976 + }, + { + "epoch": 1.667754059995745, + "grad_norm": 2.7501025199890137, + "loss": 3.8123, + "lr": 0.0002476923076923077, + "step": 5880, + "tokens_trained": 2.889593048 + }, + { + "epoch": 1.668321395645699, + "grad_norm": 4.056321144104004, + "loss": 3.8852, + "lr": 0.00024741258741258745, + "step": 5882, + "tokens_trained": 2.890573992 + }, + { + "epoch": 1.6688887312956528, + "grad_norm": 2.395308017730713, + "loss": 3.8352, + "lr": 0.00024713286713286714, + "step": 5884, + "tokens_trained": 2.89156064 + }, + { + "epoch": 1.6694560669456067, + "grad_norm": 1.6177494525909424, + "loss": 3.8488, + "lr": 0.00024685314685314683, + "step": 5886, + "tokens_trained": 2.892541712 + }, + { + "epoch": 1.6700234025955605, + "grad_norm": 1.648560881614685, + "loss": 3.8204, + "lr": 0.0002465734265734266, + "step": 5888, + "tokens_trained": 2.893523312 + }, + { + "epoch": 1.6705907382455145, + "grad_norm": 2.471012830734253, + "loss": 3.8384, + "lr": 0.00024629370629370627, + "step": 5890, + "tokens_trained": 2.89450544 + }, + { + "epoch": 1.6711580738954686, + "grad_norm": 3.052476644515991, + "loss": 3.9013, + "lr": 0.000246013986013986, + "step": 5892, + "tokens_trained": 2.895485296 + }, + { + "epoch": 1.6717254095454224, + "grad_norm": 2.2633492946624756, + "loss": 3.8368, + "lr": 0.00024573426573426576, + "step": 5894, + "tokens_trained": 2.896470736 + }, + { + "epoch": 1.6722927451953762, + "grad_norm": 1.6077561378479004, + "loss": 3.8656, + "lr": 0.00024545454545454545, + "step": 5896, + "tokens_trained": 2.897452728 + }, + { + "epoch": 1.67286008084533, + "grad_norm": 2.796211004257202, + "loss": 3.8573, + "lr": 0.0002451748251748252, + "step": 5898, + "tokens_trained": 2.89843608 + }, + { + "epoch": 1.673427416495284, + "grad_norm": 2.2211575508117676, + "loss": 3.8605, + "lr": 0.0002448951048951049, + "step": 5900, + "tokens_trained": 2.89941884 + }, + { + "epoch": 1.673994752145238, + "grad_norm": 3.0687623023986816, + "loss": 3.8518, + "lr": 0.00024461538461538463, + "step": 5902, + "tokens_trained": 2.90040236 + }, + { + "epoch": 1.674562087795192, + "grad_norm": 3.5390725135803223, + "loss": 3.8293, + "lr": 0.0002443356643356643, + "step": 5904, + "tokens_trained": 2.901387496 + }, + { + "epoch": 1.6751294234451457, + "grad_norm": 0.4400745630264282, + "loss": 3.8764, + "lr": 0.00024405594405594406, + "step": 5906, + "tokens_trained": 2.902366184 + }, + { + "epoch": 1.6756967590950995, + "grad_norm": 2.00661301612854, + "loss": 3.8548, + "lr": 0.00024377622377622378, + "step": 5908, + "tokens_trained": 2.90334932 + }, + { + "epoch": 1.6762640947450536, + "grad_norm": 2.0423686504364014, + "loss": 3.8576, + "lr": 0.00024349650349650352, + "step": 5910, + "tokens_trained": 2.904335264 + }, + { + "epoch": 1.6768314303950076, + "grad_norm": 4.125240325927734, + "loss": 3.8416, + "lr": 0.00024321678321678321, + "step": 5912, + "tokens_trained": 2.905314864 + }, + { + "epoch": 1.6773987660449614, + "grad_norm": 3.8097951412200928, + "loss": 3.8451, + "lr": 0.00024293706293706293, + "step": 5914, + "tokens_trained": 2.906299024 + }, + { + "epoch": 1.6779661016949152, + "grad_norm": 3.335597276687622, + "loss": 3.8206, + "lr": 0.00024265734265734265, + "step": 5916, + "tokens_trained": 2.907281608 + }, + { + "epoch": 1.678533437344869, + "grad_norm": 1.986657977104187, + "loss": 3.8006, + "lr": 0.00024237762237762237, + "step": 5918, + "tokens_trained": 2.908257288 + }, + { + "epoch": 1.679100772994823, + "grad_norm": 1.9969795942306519, + "loss": 3.909, + "lr": 0.0002420979020979021, + "step": 5920, + "tokens_trained": 2.909239544 + }, + { + "epoch": 1.6796681086447771, + "grad_norm": 2.5585694313049316, + "loss": 3.8341, + "lr": 0.00024181818181818183, + "step": 5922, + "tokens_trained": 2.910221368 + }, + { + "epoch": 1.680235444294731, + "grad_norm": 2.500028371810913, + "loss": 3.8697, + "lr": 0.00024153846153846155, + "step": 5924, + "tokens_trained": 2.911205368 + }, + { + "epoch": 1.6808027799446847, + "grad_norm": 2.8927834033966064, + "loss": 3.8504, + "lr": 0.00024125874125874126, + "step": 5926, + "tokens_trained": 2.91218908 + }, + { + "epoch": 1.6813701155946386, + "grad_norm": 3.0361721515655518, + "loss": 3.8477, + "lr": 0.000240979020979021, + "step": 5928, + "tokens_trained": 2.913171488 + }, + { + "epoch": 1.6819374512445926, + "grad_norm": 2.912531852722168, + "loss": 3.9016, + "lr": 0.0002406993006993007, + "step": 5930, + "tokens_trained": 2.914153488 + }, + { + "epoch": 1.6825047868945466, + "grad_norm": 2.563627004623413, + "loss": 3.8274, + "lr": 0.00024041958041958042, + "step": 5932, + "tokens_trained": 2.915134264 + }, + { + "epoch": 1.6830721225445004, + "grad_norm": 1.3338478803634644, + "loss": 3.7957, + "lr": 0.00024013986013986013, + "step": 5934, + "tokens_trained": 2.916116168 + }, + { + "epoch": 1.6836394581944543, + "grad_norm": 1.8714828491210938, + "loss": 3.8932, + "lr": 0.00023986013986013985, + "step": 5936, + "tokens_trained": 2.917097976 + }, + { + "epoch": 1.684206793844408, + "grad_norm": 6.701860427856445, + "loss": 3.8579, + "lr": 0.0002395804195804196, + "step": 5938, + "tokens_trained": 2.918085424 + }, + { + "epoch": 1.6847741294943621, + "grad_norm": 7.627328395843506, + "loss": 3.8301, + "lr": 0.0002393006993006993, + "step": 5940, + "tokens_trained": 2.9190662 + }, + { + "epoch": 1.6853414651443162, + "grad_norm": 7.1663713455200195, + "loss": 3.8541, + "lr": 0.00023902097902097903, + "step": 5942, + "tokens_trained": 2.920054232 + }, + { + "epoch": 1.68590880079427, + "grad_norm": 6.0305094718933105, + "loss": 3.8826, + "lr": 0.00023874125874125875, + "step": 5944, + "tokens_trained": 2.921033344 + }, + { + "epoch": 1.6864761364442238, + "grad_norm": 4.241663932800293, + "loss": 3.8372, + "lr": 0.0002384615384615385, + "step": 5946, + "tokens_trained": 2.922014616 + }, + { + "epoch": 1.6870434720941776, + "grad_norm": 4.3776984214782715, + "loss": 3.8612, + "lr": 0.00023818181818181818, + "step": 5948, + "tokens_trained": 2.922998192 + }, + { + "epoch": 1.6876108077441316, + "grad_norm": 1.773468255996704, + "loss": 3.8674, + "lr": 0.0002379020979020979, + "step": 5950, + "tokens_trained": 2.92398352 + }, + { + "epoch": 1.6881781433940857, + "grad_norm": 1.1746567487716675, + "loss": 3.8757, + "lr": 0.00023762237762237762, + "step": 5952, + "tokens_trained": 2.924967368 + }, + { + "epoch": 1.6887454790440395, + "grad_norm": 2.353240728378296, + "loss": 3.8848, + "lr": 0.00023734265734265734, + "step": 5954, + "tokens_trained": 2.925950768 + }, + { + "epoch": 1.6893128146939933, + "grad_norm": 2.495901584625244, + "loss": 3.8353, + "lr": 0.00023706293706293708, + "step": 5956, + "tokens_trained": 2.926931672 + }, + { + "epoch": 1.6898801503439471, + "grad_norm": 2.9549484252929688, + "loss": 3.8289, + "lr": 0.0002367832167832168, + "step": 5958, + "tokens_trained": 2.9279126 + }, + { + "epoch": 1.6904474859939012, + "grad_norm": 3.3719921112060547, + "loss": 3.8499, + "lr": 0.00023650349650349652, + "step": 5960, + "tokens_trained": 2.928895256 + }, + { + "epoch": 1.6910148216438552, + "grad_norm": 2.7297303676605225, + "loss": 3.8315, + "lr": 0.00023622377622377623, + "step": 5962, + "tokens_trained": 2.929876128 + }, + { + "epoch": 1.691582157293809, + "grad_norm": 2.845301389694214, + "loss": 3.8558, + "lr": 0.00023594405594405592, + "step": 5964, + "tokens_trained": 2.93085852 + }, + { + "epoch": 1.6921494929437628, + "grad_norm": 1.7312262058258057, + "loss": 3.8874, + "lr": 0.00023566433566433567, + "step": 5966, + "tokens_trained": 2.93183948 + }, + { + "epoch": 1.6927168285937166, + "grad_norm": 6.511951923370361, + "loss": 3.8787, + "lr": 0.00023538461538461538, + "step": 5968, + "tokens_trained": 2.932820528 + }, + { + "epoch": 1.6932841642436707, + "grad_norm": 2.518000841140747, + "loss": 3.8689, + "lr": 0.0002351048951048951, + "step": 5970, + "tokens_trained": 2.93380352 + }, + { + "epoch": 1.6938514998936247, + "grad_norm": 3.1675634384155273, + "loss": 3.8182, + "lr": 0.00023482517482517482, + "step": 5972, + "tokens_trained": 2.93478664 + }, + { + "epoch": 1.6944188355435785, + "grad_norm": 1.4572842121124268, + "loss": 3.8316, + "lr": 0.00023454545454545456, + "step": 5974, + "tokens_trained": 2.935771864 + }, + { + "epoch": 1.6949861711935323, + "grad_norm": 3.347806453704834, + "loss": 3.8602, + "lr": 0.00023426573426573428, + "step": 5976, + "tokens_trained": 2.936755656 + }, + { + "epoch": 1.6955535068434862, + "grad_norm": 5.49018669128418, + "loss": 3.8196, + "lr": 0.000233986013986014, + "step": 5978, + "tokens_trained": 2.937737104 + }, + { + "epoch": 1.6961208424934402, + "grad_norm": 2.272129535675049, + "loss": 3.8414, + "lr": 0.00023370629370629372, + "step": 5980, + "tokens_trained": 2.93872084 + }, + { + "epoch": 1.6966881781433942, + "grad_norm": 5.61100435256958, + "loss": 3.8377, + "lr": 0.0002334265734265734, + "step": 5982, + "tokens_trained": 2.939702104 + }, + { + "epoch": 1.697255513793348, + "grad_norm": 4.814182758331299, + "loss": 3.9013, + "lr": 0.00023314685314685315, + "step": 5984, + "tokens_trained": 2.940683392 + }, + { + "epoch": 1.6978228494433019, + "grad_norm": 2.8431384563446045, + "loss": 3.8322, + "lr": 0.00023286713286713287, + "step": 5986, + "tokens_trained": 2.941665112 + }, + { + "epoch": 1.6983901850932557, + "grad_norm": 5.4591450691223145, + "loss": 3.8187, + "lr": 0.0002325874125874126, + "step": 5988, + "tokens_trained": 2.942654464 + }, + { + "epoch": 1.6989575207432097, + "grad_norm": 2.687572956085205, + "loss": 3.8279, + "lr": 0.0002323076923076923, + "step": 5990, + "tokens_trained": 2.943639176 + }, + { + "epoch": 1.6995248563931638, + "grad_norm": 2.6767523288726807, + "loss": 3.8326, + "lr": 0.00023202797202797205, + "step": 5992, + "tokens_trained": 2.944621208 + }, + { + "epoch": 1.7000921920431176, + "grad_norm": 5.612683296203613, + "loss": 3.8468, + "lr": 0.00023174825174825177, + "step": 5994, + "tokens_trained": 2.94560208 + }, + { + "epoch": 1.7006595276930714, + "grad_norm": 3.099323272705078, + "loss": 3.8726, + "lr": 0.00023146853146853148, + "step": 5996, + "tokens_trained": 2.946583688 + }, + { + "epoch": 1.7012268633430252, + "grad_norm": 2.9504568576812744, + "loss": 3.8566, + "lr": 0.0002311888111888112, + "step": 5998, + "tokens_trained": 2.947561592 + }, + { + "epoch": 1.7017941989929792, + "grad_norm": 2.855426073074341, + "loss": 3.8686, + "lr": 0.0002309090909090909, + "step": 6000, + "tokens_trained": 2.948546968 + }, + { + "epoch": 1.7017941989929792, + "eval_loss": 0.9661399722099304, + "eval_runtime": 20.2295, + "step": 6000, + "tokens_trained": 2.948546968 + } + ], + "logging_steps": 2, + "max_steps": 7650, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 750, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}