{ "best_global_step": 6000, "best_metric": 4.926996231079102, "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-BPE/checkpoint-6000", "epoch": 1.702127659574468, "eval_steps": 125, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005673758865248227, "grad_norm": 1275.0146484375, "loss": 281.4781, "lr": 2e-06, "step": 2, "tokens_trained": 0.000192256 }, { "epoch": 0.0011347517730496454, "grad_norm": 1437.579833984375, "loss": 267.2211, "lr": 6e-06, "step": 4, "tokens_trained": 0.000382024 }, { "epoch": 0.001702127659574468, "grad_norm": 1719.271484375, "loss": 219.3822, "lr": 1e-05, "step": 6, "tokens_trained": 0.00057072 }, { "epoch": 0.0022695035460992908, "grad_norm": 1444.94970703125, "loss": 133.8172, "lr": 1.4e-05, "step": 8, "tokens_trained": 0.000761336 }, { "epoch": 0.0028368794326241137, "grad_norm": 238.9689178466797, "loss": 90.8177, "lr": 1.8e-05, "step": 10, "tokens_trained": 0.000953248 }, { "epoch": 0.003404255319148936, "grad_norm": 158.53497314453125, "loss": 84.6922, "lr": 2.2e-05, "step": 12, "tokens_trained": 0.00114424 }, { "epoch": 0.003971631205673759, "grad_norm": 146.10595703125, "loss": 76.7055, "lr": 2.6e-05, "step": 14, "tokens_trained": 0.001334104 }, { "epoch": 0.0045390070921985815, "grad_norm": 140.69964599609375, "loss": 67.9952, "lr": 3e-05, "step": 16, "tokens_trained": 0.00152392 }, { "epoch": 0.005106382978723404, "grad_norm": 108.80303192138672, "loss": 57.8088, "lr": 3.4000000000000007e-05, "step": 18, "tokens_trained": 0.001713872 }, { "epoch": 0.005673758865248227, "grad_norm": 106.82334899902344, "loss": 48.6585, "lr": 3.8e-05, "step": 20, "tokens_trained": 0.001903976 }, { "epoch": 0.00624113475177305, "grad_norm": 93.58769989013672, "loss": 41.7984, "lr": 4.2000000000000004e-05, "step": 22, "tokens_trained": 0.002094288 }, { "epoch": 0.006808510638297872, "grad_norm": 87.5854721069336, "loss": 37.6201, "lr": 4.6e-05, "step": 24, "tokens_trained": 0.002282496 }, { "epoch": 0.007375886524822695, "grad_norm": 84.12794494628906, "loss": 35.0091, "lr": 5e-05, "step": 26, "tokens_trained": 0.00247068 }, { "epoch": 0.007943262411347518, "grad_norm": 79.77535247802734, "loss": 33.2253, "lr": 5.4e-05, "step": 28, "tokens_trained": 0.002662888 }, { "epoch": 0.00851063829787234, "grad_norm": 66.42157745361328, "loss": 32.0682, "lr": 5.800000000000001e-05, "step": 30, "tokens_trained": 0.002851968 }, { "epoch": 0.009078014184397163, "grad_norm": 87.52485656738281, "loss": 30.893, "lr": 6.2e-05, "step": 32, "tokens_trained": 0.003041384 }, { "epoch": 0.009645390070921986, "grad_norm": 58.33614730834961, "loss": 30.0513, "lr": 6.6e-05, "step": 34, "tokens_trained": 0.003232872 }, { "epoch": 0.010212765957446808, "grad_norm": 54.629329681396484, "loss": 29.0115, "lr": 7.000000000000001e-05, "step": 36, "tokens_trained": 0.003423824 }, { "epoch": 0.01078014184397163, "grad_norm": 52.79097366333008, "loss": 28.2084, "lr": 7.4e-05, "step": 38, "tokens_trained": 0.003613232 }, { "epoch": 0.011347517730496455, "grad_norm": 54.481224060058594, "loss": 27.4345, "lr": 7.8e-05, "step": 40, "tokens_trained": 0.003800952 }, { "epoch": 0.011914893617021277, "grad_norm": 58.7069091796875, "loss": 26.5936, "lr": 8.2e-05, "step": 42, "tokens_trained": 0.003991512 }, { "epoch": 0.0124822695035461, "grad_norm": 49.30760955810547, "loss": 26.0608, "lr": 8.599999999999999e-05, "step": 44, "tokens_trained": 0.004180648 }, { "epoch": 0.013049645390070922, "grad_norm": 61.902587890625, "loss": 25.5363, "lr": 8.999999999999999e-05, "step": 46, "tokens_trained": 0.00437148 }, { "epoch": 0.013617021276595745, "grad_norm": 46.76111602783203, "loss": 24.9599, "lr": 9.400000000000001e-05, "step": 48, "tokens_trained": 0.004559344 }, { "epoch": 0.014184397163120567, "grad_norm": 57.06416702270508, "loss": 24.4087, "lr": 9.800000000000001e-05, "step": 50, "tokens_trained": 0.004749256 }, { "epoch": 0.01475177304964539, "grad_norm": 44.798736572265625, "loss": 24.1444, "lr": 0.000102, "step": 52, "tokens_trained": 0.004940192 }, { "epoch": 0.015319148936170212, "grad_norm": 40.29296875, "loss": 23.6011, "lr": 0.000106, "step": 54, "tokens_trained": 0.005130304 }, { "epoch": 0.015886524822695036, "grad_norm": 38.75099563598633, "loss": 23.1781, "lr": 0.00011, "step": 56, "tokens_trained": 0.005322864 }, { "epoch": 0.016453900709219857, "grad_norm": 37.470706939697266, "loss": 22.9136, "lr": 0.000114, "step": 58, "tokens_trained": 0.00551392 }, { "epoch": 0.01702127659574468, "grad_norm": 35.1894645690918, "loss": 22.6336, "lr": 0.000118, "step": 60, "tokens_trained": 0.005703096 }, { "epoch": 0.017588652482269502, "grad_norm": 35.136573791503906, "loss": 22.2998, "lr": 0.000122, "step": 62, "tokens_trained": 0.005892448 }, { "epoch": 0.018156028368794326, "grad_norm": 38.05111312866211, "loss": 21.9401, "lr": 0.000126, "step": 64, "tokens_trained": 0.006081656 }, { "epoch": 0.01872340425531915, "grad_norm": 35.63850021362305, "loss": 21.7206, "lr": 0.00013000000000000002, "step": 66, "tokens_trained": 0.006273032 }, { "epoch": 0.01929078014184397, "grad_norm": 34.327667236328125, "loss": 21.4051, "lr": 0.000134, "step": 68, "tokens_trained": 0.00646304 }, { "epoch": 0.019858156028368795, "grad_norm": 31.457059860229492, "loss": 21.0774, "lr": 0.00013800000000000002, "step": 70, "tokens_trained": 0.006652832 }, { "epoch": 0.020425531914893616, "grad_norm": 34.91672897338867, "loss": 20.8718, "lr": 0.00014199999999999998, "step": 72, "tokens_trained": 0.006843512 }, { "epoch": 0.02099290780141844, "grad_norm": 27.959579467773438, "loss": 20.6932, "lr": 0.000146, "step": 74, "tokens_trained": 0.007033584 }, { "epoch": 0.02156028368794326, "grad_norm": 26.569866180419922, "loss": 20.4072, "lr": 0.00015, "step": 76, "tokens_trained": 0.007224032 }, { "epoch": 0.022127659574468085, "grad_norm": 28.009904861450195, "loss": 20.2229, "lr": 0.000154, "step": 78, "tokens_trained": 0.00741368 }, { "epoch": 0.02269503546099291, "grad_norm": 28.892959594726562, "loss": 20.0528, "lr": 0.000158, "step": 80, "tokens_trained": 0.00760416 }, { "epoch": 0.02326241134751773, "grad_norm": 31.58131980895996, "loss": 19.8016, "lr": 0.000162, "step": 82, "tokens_trained": 0.007793952 }, { "epoch": 0.023829787234042554, "grad_norm": 31.01254653930664, "loss": 19.634, "lr": 0.00016600000000000002, "step": 84, "tokens_trained": 0.007980792 }, { "epoch": 0.024397163120567375, "grad_norm": 28.732515335083008, "loss": 19.3777, "lr": 0.00017, "step": 86, "tokens_trained": 0.008171968 }, { "epoch": 0.0249645390070922, "grad_norm": 24.31264877319336, "loss": 19.1346, "lr": 0.000174, "step": 88, "tokens_trained": 0.008361632 }, { "epoch": 0.02553191489361702, "grad_norm": 26.557010650634766, "loss": 19.0014, "lr": 0.000178, "step": 90, "tokens_trained": 0.008552328 }, { "epoch": 0.026099290780141844, "grad_norm": 21.156103134155273, "loss": 18.7032, "lr": 0.000182, "step": 92, "tokens_trained": 0.008743136 }, { "epoch": 0.02666666666666667, "grad_norm": 25.7484188079834, "loss": 18.4836, "lr": 0.000186, "step": 94, "tokens_trained": 0.008932056 }, { "epoch": 0.02723404255319149, "grad_norm": 22.27949333190918, "loss": 18.2233, "lr": 0.00019, "step": 96, "tokens_trained": 0.009121608 }, { "epoch": 0.027801418439716313, "grad_norm": 24.9247989654541, "loss": 17.9867, "lr": 0.000194, "step": 98, "tokens_trained": 0.009311008 }, { "epoch": 0.028368794326241134, "grad_norm": 24.302066802978516, "loss": 17.8016, "lr": 0.00019800000000000002, "step": 100, "tokens_trained": 0.009501456 }, { "epoch": 0.02893617021276596, "grad_norm": 23.458459854125977, "loss": 17.6295, "lr": 0.000202, "step": 102, "tokens_trained": 0.009693952 }, { "epoch": 0.02950354609929078, "grad_norm": 24.092350006103516, "loss": 17.4593, "lr": 0.000206, "step": 104, "tokens_trained": 0.009883328 }, { "epoch": 0.030070921985815603, "grad_norm": 22.54726219177246, "loss": 17.2141, "lr": 0.00021, "step": 106, "tokens_trained": 0.01007316 }, { "epoch": 0.030638297872340424, "grad_norm": 21.334760665893555, "loss": 17.044, "lr": 0.000214, "step": 108, "tokens_trained": 0.010266504 }, { "epoch": 0.031205673758865248, "grad_norm": 20.584287643432617, "loss": 16.8919, "lr": 0.000218, "step": 110, "tokens_trained": 0.010455736 }, { "epoch": 0.03177304964539007, "grad_norm": 23.51676368713379, "loss": 16.751, "lr": 0.000222, "step": 112, "tokens_trained": 0.010645208 }, { "epoch": 0.03234042553191489, "grad_norm": 23.278276443481445, "loss": 16.5997, "lr": 0.00022600000000000002, "step": 114, "tokens_trained": 0.010838928 }, { "epoch": 0.032907801418439714, "grad_norm": 25.4830265045166, "loss": 16.3416, "lr": 0.00023, "step": 116, "tokens_trained": 0.011027792 }, { "epoch": 0.03347517730496454, "grad_norm": 29.442413330078125, "loss": 16.24, "lr": 0.00023400000000000002, "step": 118, "tokens_trained": 0.011217456 }, { "epoch": 0.03404255319148936, "grad_norm": 21.77578353881836, "loss": 16.1922, "lr": 0.00023799999999999998, "step": 120, "tokens_trained": 0.01140804 }, { "epoch": 0.03460992907801418, "grad_norm": 27.040719985961914, "loss": 15.9059, "lr": 0.000242, "step": 122, "tokens_trained": 0.011597816 }, { "epoch": 0.035177304964539004, "grad_norm": 24.74480628967285, "loss": 15.7818, "lr": 0.000246, "step": 124, "tokens_trained": 0.011785624 }, { "epoch": 0.03546099290780142, "eval_loss": 15.553059577941895, "eval_runtime": 23.5485, "step": 125, "tokens_trained": 0.011880832 }, { "epoch": 0.03574468085106383, "grad_norm": 23.13482666015625, "loss": 15.5739, "lr": 0.00025, "step": 126, "tokens_trained": 0.011975976 }, { "epoch": 0.03631205673758865, "grad_norm": 22.8618106842041, "loss": 15.4302, "lr": 0.000254, "step": 128, "tokens_trained": 0.012166744 }, { "epoch": 0.03687943262411347, "grad_norm": 26.804859161376953, "loss": 15.3623, "lr": 0.00025800000000000004, "step": 130, "tokens_trained": 0.01235436 }, { "epoch": 0.0374468085106383, "grad_norm": 21.826601028442383, "loss": 15.1465, "lr": 0.000262, "step": 132, "tokens_trained": 0.012544976 }, { "epoch": 0.03801418439716312, "grad_norm": 39.447086334228516, "loss": 15.0137, "lr": 0.000266, "step": 134, "tokens_trained": 0.012736352 }, { "epoch": 0.03858156028368794, "grad_norm": 23.44275665283203, "loss": 14.9355, "lr": 0.00027, "step": 136, "tokens_trained": 0.012925008 }, { "epoch": 0.03914893617021276, "grad_norm": 21.631427764892578, "loss": 14.6825, "lr": 0.00027400000000000005, "step": 138, "tokens_trained": 0.013114672 }, { "epoch": 0.03971631205673759, "grad_norm": 23.674650192260742, "loss": 14.5194, "lr": 0.00027800000000000004, "step": 140, "tokens_trained": 0.013304016 }, { "epoch": 0.04028368794326241, "grad_norm": 23.974796295166016, "loss": 14.4829, "lr": 0.00028199999999999997, "step": 142, "tokens_trained": 0.013496696 }, { "epoch": 0.04085106382978723, "grad_norm": 26.112201690673828, "loss": 14.3027, "lr": 0.00028599999999999996, "step": 144, "tokens_trained": 0.013684816 }, { "epoch": 0.04141843971631206, "grad_norm": 20.67386817932129, "loss": 14.1499, "lr": 0.00029, "step": 146, "tokens_trained": 0.013874832 }, { "epoch": 0.04198581560283688, "grad_norm": 24.253408432006836, "loss": 13.9378, "lr": 0.000294, "step": 148, "tokens_trained": 0.014065056 }, { "epoch": 0.0425531914893617, "grad_norm": 35.716087341308594, "loss": 14.0562, "lr": 0.000298, "step": 150, "tokens_trained": 0.014256784 }, { "epoch": 0.04312056737588652, "grad_norm": 29.414331436157227, "loss": 14.0462, "lr": 0.000302, "step": 152, "tokens_trained": 0.014446312 }, { "epoch": 0.04368794326241135, "grad_norm": 30.687482833862305, "loss": 13.7603, "lr": 0.000306, "step": 154, "tokens_trained": 0.014639872 }, { "epoch": 0.04425531914893617, "grad_norm": 29.806455612182617, "loss": 13.708, "lr": 0.00031, "step": 156, "tokens_trained": 0.014831112 }, { "epoch": 0.04482269503546099, "grad_norm": 24.900897979736328, "loss": 13.548, "lr": 0.000314, "step": 158, "tokens_trained": 0.015021288 }, { "epoch": 0.04539007092198582, "grad_norm": 24.29252815246582, "loss": 13.3119, "lr": 0.00031800000000000003, "step": 160, "tokens_trained": 0.01521228 }, { "epoch": 0.04595744680851064, "grad_norm": 20.68342399597168, "loss": 13.1829, "lr": 0.000322, "step": 162, "tokens_trained": 0.015403688 }, { "epoch": 0.04652482269503546, "grad_norm": 20.822795867919922, "loss": 12.9044, "lr": 0.000326, "step": 164, "tokens_trained": 0.015593416 }, { "epoch": 0.04709219858156028, "grad_norm": 21.689916610717773, "loss": 12.6862, "lr": 0.00033, "step": 166, "tokens_trained": 0.015784408 }, { "epoch": 0.04765957446808511, "grad_norm": 17.873889923095703, "loss": 12.5502, "lr": 0.00033400000000000004, "step": 168, "tokens_trained": 0.0159744 }, { "epoch": 0.04822695035460993, "grad_norm": 18.951616287231445, "loss": 12.308, "lr": 0.00033800000000000003, "step": 170, "tokens_trained": 0.016163736 }, { "epoch": 0.04879432624113475, "grad_norm": 15.146363258361816, "loss": 12.1558, "lr": 0.000342, "step": 172, "tokens_trained": 0.016353832 }, { "epoch": 0.04936170212765958, "grad_norm": 18.336984634399414, "loss": 12.0386, "lr": 0.000346, "step": 174, "tokens_trained": 0.016545088 }, { "epoch": 0.0499290780141844, "grad_norm": 17.221126556396484, "loss": 11.8791, "lr": 0.00035, "step": 176, "tokens_trained": 0.016735704 }, { "epoch": 0.05049645390070922, "grad_norm": 19.362564086914062, "loss": 11.7224, "lr": 0.000354, "step": 178, "tokens_trained": 0.016927944 }, { "epoch": 0.05106382978723404, "grad_norm": 15.564507484436035, "loss": 11.6448, "lr": 0.000358, "step": 180, "tokens_trained": 0.017116096 }, { "epoch": 0.05163120567375887, "grad_norm": 20.711383819580078, "loss": 11.4398, "lr": 0.000362, "step": 182, "tokens_trained": 0.01730564 }, { "epoch": 0.05219858156028369, "grad_norm": 18.627403259277344, "loss": 11.3377, "lr": 0.000366, "step": 184, "tokens_trained": 0.017495864 }, { "epoch": 0.05276595744680851, "grad_norm": 15.00942325592041, "loss": 11.1416, "lr": 0.00037, "step": 186, "tokens_trained": 0.017686464 }, { "epoch": 0.05333333333333334, "grad_norm": 17.070598602294922, "loss": 11.0148, "lr": 0.000374, "step": 188, "tokens_trained": 0.017879488 }, { "epoch": 0.05390070921985816, "grad_norm": 16.101457595825195, "loss": 10.8874, "lr": 0.000378, "step": 190, "tokens_trained": 0.018068312 }, { "epoch": 0.05446808510638298, "grad_norm": 15.613334655761719, "loss": 10.7055, "lr": 0.000382, "step": 192, "tokens_trained": 0.018255752 }, { "epoch": 0.0550354609929078, "grad_norm": 17.671857833862305, "loss": 10.5706, "lr": 0.000386, "step": 194, "tokens_trained": 0.018447096 }, { "epoch": 0.05560283687943263, "grad_norm": 16.080909729003906, "loss": 10.4476, "lr": 0.00039000000000000005, "step": 196, "tokens_trained": 0.018637264 }, { "epoch": 0.05617021276595745, "grad_norm": 15.02849292755127, "loss": 10.2962, "lr": 0.00039400000000000004, "step": 198, "tokens_trained": 0.018827552 }, { "epoch": 0.05673758865248227, "grad_norm": 14.990167617797852, "loss": 10.1912, "lr": 0.000398, "step": 200, "tokens_trained": 0.019018 }, { "epoch": 0.05730496453900709, "grad_norm": 15.390633583068848, "loss": 10.0442, "lr": 0.000402, "step": 202, "tokens_trained": 0.019209864 }, { "epoch": 0.05787234042553192, "grad_norm": 16.871570587158203, "loss": 9.9685, "lr": 0.00040600000000000006, "step": 204, "tokens_trained": 0.019400176 }, { "epoch": 0.05843971631205674, "grad_norm": 20.16544532775879, "loss": 9.8531, "lr": 0.00041, "step": 206, "tokens_trained": 0.019589424 }, { "epoch": 0.05900709219858156, "grad_norm": 16.825023651123047, "loss": 9.7777, "lr": 0.000414, "step": 208, "tokens_trained": 0.019779112 }, { "epoch": 0.059574468085106386, "grad_norm": 16.43510627746582, "loss": 9.6122, "lr": 0.00041799999999999997, "step": 210, "tokens_trained": 0.019970048 }, { "epoch": 0.060141843971631206, "grad_norm": 17.340473175048828, "loss": 9.4859, "lr": 0.000422, "step": 212, "tokens_trained": 0.020160968 }, { "epoch": 0.06070921985815603, "grad_norm": 15.019119262695312, "loss": 9.3656, "lr": 0.000426, "step": 214, "tokens_trained": 0.020349664 }, { "epoch": 0.06127659574468085, "grad_norm": 13.379194259643555, "loss": 9.2348, "lr": 0.00043, "step": 216, "tokens_trained": 0.020538192 }, { "epoch": 0.061843971631205676, "grad_norm": 16.71472930908203, "loss": 9.2258, "lr": 0.00043400000000000003, "step": 218, "tokens_trained": 0.020728936 }, { "epoch": 0.062411347517730496, "grad_norm": 12.743139266967773, "loss": 9.0569, "lr": 0.000438, "step": 220, "tokens_trained": 0.020917472 }, { "epoch": 0.06297872340425532, "grad_norm": 15.739934921264648, "loss": 8.9623, "lr": 0.000442, "step": 222, "tokens_trained": 0.02110928 }, { "epoch": 0.06354609929078014, "grad_norm": 14.23620891571045, "loss": 8.8201, "lr": 0.000446, "step": 224, "tokens_trained": 0.021300168 }, { "epoch": 0.06411347517730497, "grad_norm": 13.005538940429688, "loss": 8.7235, "lr": 0.00045000000000000004, "step": 226, "tokens_trained": 0.021490272 }, { "epoch": 0.06468085106382979, "grad_norm": 17.17629051208496, "loss": 8.6907, "lr": 0.00045400000000000003, "step": 228, "tokens_trained": 0.021681552 }, { "epoch": 0.06524822695035461, "grad_norm": 14.430739402770996, "loss": 8.6196, "lr": 0.000458, "step": 230, "tokens_trained": 0.02187236 }, { "epoch": 0.06581560283687943, "grad_norm": 14.575714111328125, "loss": 8.4741, "lr": 0.000462, "step": 232, "tokens_trained": 0.022061976 }, { "epoch": 0.06638297872340425, "grad_norm": 13.892754554748535, "loss": 8.4118, "lr": 0.00046600000000000005, "step": 234, "tokens_trained": 0.022252008 }, { "epoch": 0.06695035460992908, "grad_norm": 11.58240795135498, "loss": 8.2781, "lr": 0.00047, "step": 236, "tokens_trained": 0.02244284 }, { "epoch": 0.0675177304964539, "grad_norm": 13.022644996643066, "loss": 8.2139, "lr": 0.000474, "step": 238, "tokens_trained": 0.022631152 }, { "epoch": 0.06808510638297872, "grad_norm": 11.844677925109863, "loss": 8.1134, "lr": 0.00047799999999999996, "step": 240, "tokens_trained": 0.022821096 }, { "epoch": 0.06865248226950355, "grad_norm": 13.878067016601562, "loss": 8.0221, "lr": 0.000482, "step": 242, "tokens_trained": 0.023011656 }, { "epoch": 0.06921985815602837, "grad_norm": 12.34648323059082, "loss": 7.9755, "lr": 0.000486, "step": 244, "tokens_trained": 0.023201 }, { "epoch": 0.06978723404255319, "grad_norm": 14.238297462463379, "loss": 7.8969, "lr": 0.00049, "step": 246, "tokens_trained": 0.023391128 }, { "epoch": 0.07035460992907801, "grad_norm": 14.386019706726074, "loss": 7.8627, "lr": 0.000494, "step": 248, "tokens_trained": 0.023581768 }, { "epoch": 0.07092198581560284, "grad_norm": 13.623086929321289, "loss": 7.7568, "lr": 0.000498, "step": 250, "tokens_trained": 0.023771248 }, { "epoch": 0.07092198581560284, "eval_loss": 7.70297384262085, "eval_runtime": 21.3853, "step": 250, "tokens_trained": 0.023771248 }, { "epoch": 0.07148936170212766, "grad_norm": 14.347646713256836, "loss": 7.6842, "lr": 0.0005020000000000001, "step": 252, "tokens_trained": 0.023961056 }, { "epoch": 0.07205673758865248, "grad_norm": 12.5592041015625, "loss": 7.6516, "lr": 0.000506, "step": 254, "tokens_trained": 0.024150968 }, { "epoch": 0.0726241134751773, "grad_norm": 13.219141960144043, "loss": 7.5789, "lr": 0.00051, "step": 256, "tokens_trained": 0.024340072 }, { "epoch": 0.07319148936170213, "grad_norm": 12.654081344604492, "loss": 7.5369, "lr": 0.000514, "step": 258, "tokens_trained": 0.024529296 }, { "epoch": 0.07375886524822695, "grad_norm": 13.136971473693848, "loss": 7.4949, "lr": 0.000518, "step": 260, "tokens_trained": 0.024719688 }, { "epoch": 0.07432624113475177, "grad_norm": 12.680288314819336, "loss": 7.3904, "lr": 0.000522, "step": 262, "tokens_trained": 0.024909632 }, { "epoch": 0.0748936170212766, "grad_norm": 12.754518508911133, "loss": 7.3514, "lr": 0.000526, "step": 264, "tokens_trained": 0.025098416 }, { "epoch": 0.07546099290780142, "grad_norm": 13.22311019897461, "loss": 7.2951, "lr": 0.0005300000000000001, "step": 266, "tokens_trained": 0.025287344 }, { "epoch": 0.07602836879432624, "grad_norm": 12.11903190612793, "loss": 7.2229, "lr": 0.0005340000000000001, "step": 268, "tokens_trained": 0.025477152 }, { "epoch": 0.07659574468085106, "grad_norm": 13.771833419799805, "loss": 7.1815, "lr": 0.0005380000000000001, "step": 270, "tokens_trained": 0.025668288 }, { "epoch": 0.07716312056737588, "grad_norm": 11.756864547729492, "loss": 7.1669, "lr": 0.0005420000000000001, "step": 272, "tokens_trained": 0.025858528 }, { "epoch": 0.0777304964539007, "grad_norm": 13.613094329833984, "loss": 7.1079, "lr": 0.000546, "step": 274, "tokens_trained": 0.026048616 }, { "epoch": 0.07829787234042553, "grad_norm": 10.001923561096191, "loss": 7.0508, "lr": 0.00055, "step": 276, "tokens_trained": 0.026236944 }, { "epoch": 0.07886524822695036, "grad_norm": 14.262083053588867, "loss": 6.9955, "lr": 0.000554, "step": 278, "tokens_trained": 0.026426848 }, { "epoch": 0.07943262411347518, "grad_norm": 12.381136894226074, "loss": 6.9831, "lr": 0.000558, "step": 280, "tokens_trained": 0.026616784 }, { "epoch": 0.08, "grad_norm": 9.815845489501953, "loss": 6.917, "lr": 0.0005620000000000001, "step": 282, "tokens_trained": 0.026805176 }, { "epoch": 0.08056737588652482, "grad_norm": 11.669997215270996, "loss": 6.8999, "lr": 0.000566, "step": 284, "tokens_trained": 0.02699488 }, { "epoch": 0.08113475177304964, "grad_norm": 12.770941734313965, "loss": 6.8998, "lr": 0.00057, "step": 286, "tokens_trained": 0.027185784 }, { "epoch": 0.08170212765957446, "grad_norm": 15.572457313537598, "loss": 6.841, "lr": 0.000574, "step": 288, "tokens_trained": 0.027375896 }, { "epoch": 0.08226950354609928, "grad_norm": 10.980833053588867, "loss": 6.8545, "lr": 0.000578, "step": 290, "tokens_trained": 0.02756588 }, { "epoch": 0.08283687943262412, "grad_norm": 11.678337097167969, "loss": 6.7853, "lr": 0.0005819999999999999, "step": 292, "tokens_trained": 0.02775456 }, { "epoch": 0.08340425531914894, "grad_norm": 9.77885913848877, "loss": 6.7465, "lr": 0.0005859999999999999, "step": 294, "tokens_trained": 0.027942856 }, { "epoch": 0.08397163120567376, "grad_norm": 13.62730884552002, "loss": 6.7276, "lr": 0.00059, "step": 296, "tokens_trained": 0.028133152 }, { "epoch": 0.08453900709219858, "grad_norm": 10.644404411315918, "loss": 6.6802, "lr": 0.000594, "step": 298, "tokens_trained": 0.028322192 }, { "epoch": 0.0851063829787234, "grad_norm": 11.130610466003418, "loss": 6.6548, "lr": 0.000598, "step": 300, "tokens_trained": 0.0285122 }, { "epoch": 0.08567375886524822, "grad_norm": 11.557455062866211, "loss": 6.6155, "lr": 0.000602, "step": 302, "tokens_trained": 0.028699792 }, { "epoch": 0.08624113475177304, "grad_norm": 9.276884078979492, "loss": 6.5989, "lr": 0.000606, "step": 304, "tokens_trained": 0.028889896 }, { "epoch": 0.08680851063829788, "grad_norm": 9.616179466247559, "loss": 6.5773, "lr": 0.00061, "step": 306, "tokens_trained": 0.029082272 }, { "epoch": 0.0873758865248227, "grad_norm": 10.575953483581543, "loss": 6.5358, "lr": 0.000614, "step": 308, "tokens_trained": 0.029273352 }, { "epoch": 0.08794326241134752, "grad_norm": 9.089850425720215, "loss": 6.5088, "lr": 0.0006180000000000001, "step": 310, "tokens_trained": 0.029463848 }, { "epoch": 0.08851063829787234, "grad_norm": 9.090002059936523, "loss": 6.4849, "lr": 0.000622, "step": 312, "tokens_trained": 0.029653272 }, { "epoch": 0.08907801418439716, "grad_norm": 12.038308143615723, "loss": 6.4624, "lr": 0.000626, "step": 314, "tokens_trained": 0.029841928 }, { "epoch": 0.08964539007092198, "grad_norm": 9.073866844177246, "loss": 6.4515, "lr": 0.00063, "step": 316, "tokens_trained": 0.030029808 }, { "epoch": 0.0902127659574468, "grad_norm": 8.727197647094727, "loss": 6.43, "lr": 0.000634, "step": 318, "tokens_trained": 0.030221288 }, { "epoch": 0.09078014184397164, "grad_norm": 14.558151245117188, "loss": 6.4487, "lr": 0.000638, "step": 320, "tokens_trained": 0.030410872 }, { "epoch": 0.09134751773049646, "grad_norm": 9.98914623260498, "loss": 6.4279, "lr": 0.000642, "step": 322, "tokens_trained": 0.030602376 }, { "epoch": 0.09191489361702128, "grad_norm": 10.395442962646484, "loss": 6.4311, "lr": 0.000646, "step": 324, "tokens_trained": 0.030792968 }, { "epoch": 0.0924822695035461, "grad_norm": 10.8250093460083, "loss": 6.3726, "lr": 0.0006500000000000001, "step": 326, "tokens_trained": 0.030982944 }, { "epoch": 0.09304964539007092, "grad_norm": 9.73416805267334, "loss": 6.34, "lr": 0.0006540000000000001, "step": 328, "tokens_trained": 0.031174928 }, { "epoch": 0.09361702127659574, "grad_norm": 8.596503257751465, "loss": 6.3322, "lr": 0.0006580000000000001, "step": 330, "tokens_trained": 0.031364288 }, { "epoch": 0.09418439716312056, "grad_norm": 8.49472427368164, "loss": 6.3096, "lr": 0.000662, "step": 332, "tokens_trained": 0.03155376 }, { "epoch": 0.0947517730496454, "grad_norm": 7.857503414154053, "loss": 6.2368, "lr": 0.000666, "step": 334, "tokens_trained": 0.031744368 }, { "epoch": 0.09531914893617022, "grad_norm": 9.007513999938965, "loss": 6.198, "lr": 0.00067, "step": 336, "tokens_trained": 0.031934136 }, { "epoch": 0.09588652482269504, "grad_norm": 8.185524940490723, "loss": 6.2328, "lr": 0.000674, "step": 338, "tokens_trained": 0.032124984 }, { "epoch": 0.09645390070921986, "grad_norm": 8.784396171569824, "loss": 6.1945, "lr": 0.0006780000000000001, "step": 340, "tokens_trained": 0.032316016 }, { "epoch": 0.09702127659574468, "grad_norm": 8.642311096191406, "loss": 6.218, "lr": 0.0006820000000000001, "step": 342, "tokens_trained": 0.032506224 }, { "epoch": 0.0975886524822695, "grad_norm": 8.493780136108398, "loss": 6.194, "lr": 0.0006860000000000001, "step": 344, "tokens_trained": 0.032696152 }, { "epoch": 0.09815602836879432, "grad_norm": 9.120508193969727, "loss": 6.2241, "lr": 0.00069, "step": 346, "tokens_trained": 0.032885688 }, { "epoch": 0.09872340425531916, "grad_norm": 9.34500503540039, "loss": 6.1548, "lr": 0.000694, "step": 348, "tokens_trained": 0.03307568 }, { "epoch": 0.09929078014184398, "grad_norm": 7.483356952667236, "loss": 6.1282, "lr": 0.0006979999999999999, "step": 350, "tokens_trained": 0.033267208 }, { "epoch": 0.0998581560283688, "grad_norm": 7.974069118499756, "loss": 6.1032, "lr": 0.0007019999999999999, "step": 352, "tokens_trained": 0.033458144 }, { "epoch": 0.10042553191489362, "grad_norm": 8.247384071350098, "loss": 6.1698, "lr": 0.0007059999999999999, "step": 354, "tokens_trained": 0.033650352 }, { "epoch": 0.10099290780141844, "grad_norm": 8.554885864257812, "loss": 6.1429, "lr": 0.00071, "step": 356, "tokens_trained": 0.033840232 }, { "epoch": 0.10156028368794326, "grad_norm": 7.209281921386719, "loss": 6.0997, "lr": 0.000714, "step": 358, "tokens_trained": 0.034030032 }, { "epoch": 0.10212765957446808, "grad_norm": 8.660383224487305, "loss": 6.1497, "lr": 0.000718, "step": 360, "tokens_trained": 0.034218592 }, { "epoch": 0.10269503546099291, "grad_norm": 9.382761001586914, "loss": 6.0665, "lr": 0.000722, "step": 362, "tokens_trained": 0.034408408 }, { "epoch": 0.10326241134751774, "grad_norm": 6.915714263916016, "loss": 6.0636, "lr": 0.000726, "step": 364, "tokens_trained": 0.034600016 }, { "epoch": 0.10382978723404256, "grad_norm": 7.8990631103515625, "loss": 6.0975, "lr": 0.00073, "step": 366, "tokens_trained": 0.034790792 }, { "epoch": 0.10439716312056738, "grad_norm": 8.859809875488281, "loss": 6.0754, "lr": 0.000734, "step": 368, "tokens_trained": 0.034981304 }, { "epoch": 0.1049645390070922, "grad_norm": 7.392801761627197, "loss": 6.039, "lr": 0.000738, "step": 370, "tokens_trained": 0.03516956 }, { "epoch": 0.10553191489361702, "grad_norm": 9.427324295043945, "loss": 6.084, "lr": 0.000742, "step": 372, "tokens_trained": 0.035358816 }, { "epoch": 0.10609929078014184, "grad_norm": 7.168910503387451, "loss": 6.0498, "lr": 0.000746, "step": 374, "tokens_trained": 0.035548016 }, { "epoch": 0.10638297872340426, "eval_loss": 6.038269996643066, "eval_runtime": 21.3445, "step": 375, "tokens_trained": 0.035644104 }, { "epoch": 0.10666666666666667, "grad_norm": 7.899259567260742, "loss": 6.0345, "lr": 0.00075, "step": 376, "tokens_trained": 0.035739856 }, { "epoch": 0.1072340425531915, "grad_norm": 8.91533374786377, "loss": 6.0386, "lr": 0.000754, "step": 378, "tokens_trained": 0.035930264 }, { "epoch": 0.10780141843971631, "grad_norm": 6.998043060302734, "loss": 6.0294, "lr": 0.000758, "step": 380, "tokens_trained": 0.036119616 }, { "epoch": 0.10836879432624114, "grad_norm": 7.343894958496094, "loss": 6.0116, "lr": 0.000762, "step": 382, "tokens_trained": 0.036308416 }, { "epoch": 0.10893617021276596, "grad_norm": 8.182528495788574, "loss": 5.9904, "lr": 0.0007660000000000001, "step": 384, "tokens_trained": 0.036497264 }, { "epoch": 0.10950354609929078, "grad_norm": 7.927818775177002, "loss": 6.0345, "lr": 0.0007700000000000001, "step": 386, "tokens_trained": 0.036688192 }, { "epoch": 0.1100709219858156, "grad_norm": 8.07447338104248, "loss": 5.9685, "lr": 0.0007740000000000001, "step": 388, "tokens_trained": 0.036878256 }, { "epoch": 0.11063829787234042, "grad_norm": 7.281871318817139, "loss": 6.0125, "lr": 0.000778, "step": 390, "tokens_trained": 0.037068272 }, { "epoch": 0.11120567375886525, "grad_norm": 8.298929214477539, "loss": 6.0071, "lr": 0.000782, "step": 392, "tokens_trained": 0.037259464 }, { "epoch": 0.11177304964539007, "grad_norm": 7.546716690063477, "loss": 5.9721, "lr": 0.000786, "step": 394, "tokens_trained": 0.037449696 }, { "epoch": 0.1123404255319149, "grad_norm": 8.28548526763916, "loss": 5.9819, "lr": 0.00079, "step": 396, "tokens_trained": 0.037639672 }, { "epoch": 0.11290780141843972, "grad_norm": 7.064655303955078, "loss": 5.9873, "lr": 0.0007940000000000001, "step": 398, "tokens_trained": 0.03782712 }, { "epoch": 0.11347517730496454, "grad_norm": 7.743175506591797, "loss": 5.9528, "lr": 0.0007980000000000001, "step": 400, "tokens_trained": 0.03801792 }, { "epoch": 0.11404255319148936, "grad_norm": 7.00898551940918, "loss": 5.9504, "lr": 0.0008020000000000001, "step": 402, "tokens_trained": 0.038209176 }, { "epoch": 0.11460992907801418, "grad_norm": 7.9350409507751465, "loss": 5.9555, "lr": 0.0008060000000000001, "step": 404, "tokens_trained": 0.03839824 }, { "epoch": 0.11517730496453901, "grad_norm": 7.048569679260254, "loss": 5.9787, "lr": 0.0008100000000000001, "step": 406, "tokens_trained": 0.03858732 }, { "epoch": 0.11574468085106383, "grad_norm": 7.088194370269775, "loss": 5.928, "lr": 0.0008139999999999999, "step": 408, "tokens_trained": 0.038777712 }, { "epoch": 0.11631205673758865, "grad_norm": 8.230712890625, "loss": 5.9716, "lr": 0.0008179999999999999, "step": 410, "tokens_trained": 0.038969464 }, { "epoch": 0.11687943262411347, "grad_norm": 8.076972007751465, "loss": 5.9624, "lr": 0.0008219999999999999, "step": 412, "tokens_trained": 0.039162064 }, { "epoch": 0.1174468085106383, "grad_norm": 8.065289497375488, "loss": 5.9937, "lr": 0.000826, "step": 414, "tokens_trained": 0.039348688 }, { "epoch": 0.11801418439716312, "grad_norm": 6.393420696258545, "loss": 5.9278, "lr": 0.00083, "step": 416, "tokens_trained": 0.03953732 }, { "epoch": 0.11858156028368794, "grad_norm": 7.384702682495117, "loss": 5.931, "lr": 0.000834, "step": 418, "tokens_trained": 0.039729808 }, { "epoch": 0.11914893617021277, "grad_norm": 7.007425308227539, "loss": 5.93, "lr": 0.000838, "step": 420, "tokens_trained": 0.039921096 }, { "epoch": 0.11971631205673759, "grad_norm": 7.112692832946777, "loss": 5.9625, "lr": 0.000842, "step": 422, "tokens_trained": 0.040110856 }, { "epoch": 0.12028368794326241, "grad_norm": 8.484418869018555, "loss": 5.9848, "lr": 0.000846, "step": 424, "tokens_trained": 0.040300504 }, { "epoch": 0.12085106382978723, "grad_norm": 6.633459091186523, "loss": 6.0226, "lr": 0.00085, "step": 426, "tokens_trained": 0.04049056 }, { "epoch": 0.12141843971631205, "grad_norm": 7.796964168548584, "loss": 5.9152, "lr": 0.000854, "step": 428, "tokens_trained": 0.040680544 }, { "epoch": 0.12198581560283688, "grad_norm": 7.833578586578369, "loss": 5.924, "lr": 0.000858, "step": 430, "tokens_trained": 0.040873128 }, { "epoch": 0.1225531914893617, "grad_norm": 6.7470550537109375, "loss": 5.9318, "lr": 0.000862, "step": 432, "tokens_trained": 0.041063488 }, { "epoch": 0.12312056737588653, "grad_norm": 6.066318988800049, "loss": 5.9569, "lr": 0.000866, "step": 434, "tokens_trained": 0.041254368 }, { "epoch": 0.12368794326241135, "grad_norm": 6.753541469573975, "loss": 5.8851, "lr": 0.00087, "step": 436, "tokens_trained": 0.04144516 }, { "epoch": 0.12425531914893617, "grad_norm": 6.471331596374512, "loss": 5.864, "lr": 0.000874, "step": 438, "tokens_trained": 0.041636912 }, { "epoch": 0.12482269503546099, "grad_norm": 6.129056930541992, "loss": 5.8965, "lr": 0.000878, "step": 440, "tokens_trained": 0.041828104 }, { "epoch": 0.1253900709219858, "grad_norm": 6.478890895843506, "loss": 5.8817, "lr": 0.000882, "step": 442, "tokens_trained": 0.04201808 }, { "epoch": 0.12595744680851065, "grad_norm": 6.014713287353516, "loss": 5.8268, "lr": 0.0008860000000000001, "step": 444, "tokens_trained": 0.042207328 }, { "epoch": 0.12652482269503545, "grad_norm": 5.505755424499512, "loss": 5.8684, "lr": 0.0008900000000000001, "step": 446, "tokens_trained": 0.042398152 }, { "epoch": 0.1270921985815603, "grad_norm": 10.096606254577637, "loss": 5.8608, "lr": 0.000894, "step": 448, "tokens_trained": 0.042588984 }, { "epoch": 0.1276595744680851, "grad_norm": 6.388499736785889, "loss": 5.8766, "lr": 0.000898, "step": 450, "tokens_trained": 0.042778592 }, { "epoch": 0.12822695035460993, "grad_norm": 7.145125865936279, "loss": 5.8571, "lr": 0.000902, "step": 452, "tokens_trained": 0.042967176 }, { "epoch": 0.12879432624113477, "grad_norm": 6.826383113861084, "loss": 5.8655, "lr": 0.000906, "step": 454, "tokens_trained": 0.043158952 }, { "epoch": 0.12936170212765957, "grad_norm": 6.036892414093018, "loss": 5.8775, "lr": 0.00091, "step": 456, "tokens_trained": 0.043349288 }, { "epoch": 0.1299290780141844, "grad_norm": 6.36528205871582, "loss": 5.8908, "lr": 0.0009140000000000001, "step": 458, "tokens_trained": 0.043539888 }, { "epoch": 0.13049645390070921, "grad_norm": 6.317558288574219, "loss": 5.8702, "lr": 0.0009180000000000001, "step": 460, "tokens_trained": 0.04373232 }, { "epoch": 0.13106382978723405, "grad_norm": 6.427131175994873, "loss": 5.8399, "lr": 0.0009220000000000001, "step": 462, "tokens_trained": 0.043922744 }, { "epoch": 0.13163120567375886, "grad_norm": 5.666539669036865, "loss": 5.7899, "lr": 0.0009260000000000001, "step": 464, "tokens_trained": 0.044112888 }, { "epoch": 0.1321985815602837, "grad_norm": 5.241824150085449, "loss": 5.8203, "lr": 0.00093, "step": 466, "tokens_trained": 0.04430244 }, { "epoch": 0.1327659574468085, "grad_norm": 6.072646141052246, "loss": 5.8367, "lr": 0.000934, "step": 468, "tokens_trained": 0.044493528 }, { "epoch": 0.13333333333333333, "grad_norm": 6.414418697357178, "loss": 5.8236, "lr": 0.0009379999999999999, "step": 470, "tokens_trained": 0.044682328 }, { "epoch": 0.13390070921985817, "grad_norm": 6.958801746368408, "loss": 5.8179, "lr": 0.000942, "step": 472, "tokens_trained": 0.044874256 }, { "epoch": 0.13446808510638297, "grad_norm": 5.787843227386475, "loss": 5.8478, "lr": 0.000946, "step": 474, "tokens_trained": 0.045065616 }, { "epoch": 0.1350354609929078, "grad_norm": 5.5841240882873535, "loss": 5.8307, "lr": 0.00095, "step": 476, "tokens_trained": 0.045257024 }, { "epoch": 0.13560283687943261, "grad_norm": 6.607712745666504, "loss": 5.8512, "lr": 0.000954, "step": 478, "tokens_trained": 0.045446432 }, { "epoch": 0.13617021276595745, "grad_norm": 5.473597049713135, "loss": 5.8174, "lr": 0.000958, "step": 480, "tokens_trained": 0.045636392 }, { "epoch": 0.13673758865248226, "grad_norm": 5.435728549957275, "loss": 5.8308, "lr": 0.000962, "step": 482, "tokens_trained": 0.045823784 }, { "epoch": 0.1373049645390071, "grad_norm": 6.049300670623779, "loss": 5.8293, "lr": 0.000966, "step": 484, "tokens_trained": 0.046013408 }, { "epoch": 0.13787234042553193, "grad_norm": 6.311764717102051, "loss": 5.8086, "lr": 0.0009699999999999999, "step": 486, "tokens_trained": 0.046202528 }, { "epoch": 0.13843971631205673, "grad_norm": 5.886009216308594, "loss": 5.7986, "lr": 0.000974, "step": 488, "tokens_trained": 0.04639404 }, { "epoch": 0.13900709219858157, "grad_norm": 5.438202381134033, "loss": 5.8473, "lr": 0.000978, "step": 490, "tokens_trained": 0.046586512 }, { "epoch": 0.13957446808510637, "grad_norm": 5.08393669128418, "loss": 5.7613, "lr": 0.000982, "step": 492, "tokens_trained": 0.046777448 }, { "epoch": 0.1401418439716312, "grad_norm": 5.645389080047607, "loss": 5.7723, "lr": 0.0009860000000000001, "step": 494, "tokens_trained": 0.046966096 }, { "epoch": 0.14070921985815601, "grad_norm": 6.320916652679443, "loss": 5.7772, "lr": 0.00099, "step": 496, "tokens_trained": 0.047155152 }, { "epoch": 0.14127659574468085, "grad_norm": 5.573540210723877, "loss": 5.7412, "lr": 0.000994, "step": 498, "tokens_trained": 0.047345352 }, { "epoch": 0.14184397163120568, "grad_norm": 4.939594745635986, "loss": 5.8208, "lr": 0.000998, "step": 500, "tokens_trained": 0.047535016 }, { "epoch": 0.14184397163120568, "eval_loss": 5.799490928649902, "eval_runtime": 20.8575, "step": 500, "tokens_trained": 0.047535016 }, { "epoch": 0.1424113475177305, "grad_norm": 5.805343151092529, "loss": 5.7734, "lr": 0.00099986013986014, "step": 502, "tokens_trained": 0.047724216 }, { "epoch": 0.14297872340425533, "grad_norm": 5.831176280975342, "loss": 5.8044, "lr": 0.0009995804195804196, "step": 504, "tokens_trained": 0.047914328 }, { "epoch": 0.14354609929078013, "grad_norm": 5.045091152191162, "loss": 5.8133, "lr": 0.0009993006993006994, "step": 506, "tokens_trained": 0.048105032 }, { "epoch": 0.14411347517730497, "grad_norm": 5.276819705963135, "loss": 5.7555, "lr": 0.000999020979020979, "step": 508, "tokens_trained": 0.048293104 }, { "epoch": 0.14468085106382977, "grad_norm": 5.710324287414551, "loss": 5.7619, "lr": 0.0009987412587412587, "step": 510, "tokens_trained": 0.048483888 }, { "epoch": 0.1452482269503546, "grad_norm": 4.9472527503967285, "loss": 5.767, "lr": 0.0009984615384615386, "step": 512, "tokens_trained": 0.04867336 }, { "epoch": 0.14581560283687944, "grad_norm": 5.410078525543213, "loss": 5.7238, "lr": 0.0009981818181818182, "step": 514, "tokens_trained": 0.048863104 }, { "epoch": 0.14638297872340425, "grad_norm": 6.025843143463135, "loss": 5.7664, "lr": 0.000997902097902098, "step": 516, "tokens_trained": 0.049053856 }, { "epoch": 0.14695035460992908, "grad_norm": 5.3211669921875, "loss": 5.747, "lr": 0.0009976223776223777, "step": 518, "tokens_trained": 0.049245104 }, { "epoch": 0.1475177304964539, "grad_norm": 6.059483051300049, "loss": 5.7611, "lr": 0.0009973426573426573, "step": 520, "tokens_trained": 0.049434368 }, { "epoch": 0.14808510638297873, "grad_norm": 5.362505912780762, "loss": 5.7607, "lr": 0.000997062937062937, "step": 522, "tokens_trained": 0.049622648 }, { "epoch": 0.14865248226950353, "grad_norm": 5.391371726989746, "loss": 5.7857, "lr": 0.0009967832167832168, "step": 524, "tokens_trained": 0.049812304 }, { "epoch": 0.14921985815602837, "grad_norm": 4.3839030265808105, "loss": 5.7334, "lr": 0.0009965034965034964, "step": 526, "tokens_trained": 0.05000356 }, { "epoch": 0.1497872340425532, "grad_norm": 5.008530616760254, "loss": 5.7475, "lr": 0.0009962237762237763, "step": 528, "tokens_trained": 0.050193304 }, { "epoch": 0.150354609929078, "grad_norm": 5.068671226501465, "loss": 5.7866, "lr": 0.000995944055944056, "step": 530, "tokens_trained": 0.050382856 }, { "epoch": 0.15092198581560284, "grad_norm": 5.399240493774414, "loss": 5.6857, "lr": 0.0009956643356643356, "step": 532, "tokens_trained": 0.050570864 }, { "epoch": 0.15148936170212765, "grad_norm": 5.689481735229492, "loss": 5.7586, "lr": 0.0009953846153846154, "step": 534, "tokens_trained": 0.050760384 }, { "epoch": 0.15205673758865249, "grad_norm": 4.652275562286377, "loss": 5.7866, "lr": 0.000995104895104895, "step": 536, "tokens_trained": 0.050952712 }, { "epoch": 0.1526241134751773, "grad_norm": 4.126920223236084, "loss": 5.7261, "lr": 0.000994825174825175, "step": 538, "tokens_trained": 0.051141656 }, { "epoch": 0.15319148936170213, "grad_norm": 4.233098030090332, "loss": 5.6903, "lr": 0.0009945454545454546, "step": 540, "tokens_trained": 0.051331256 }, { "epoch": 0.15375886524822696, "grad_norm": 4.271973133087158, "loss": 5.7293, "lr": 0.0009942657342657344, "step": 542, "tokens_trained": 0.051522072 }, { "epoch": 0.15432624113475177, "grad_norm": 4.653008937835693, "loss": 5.7133, "lr": 0.000993986013986014, "step": 544, "tokens_trained": 0.051711624 }, { "epoch": 0.1548936170212766, "grad_norm": 4.192624092102051, "loss": 5.6876, "lr": 0.0009937062937062937, "step": 546, "tokens_trained": 0.051901744 }, { "epoch": 0.1554609929078014, "grad_norm": 5.497848033905029, "loss": 5.7378, "lr": 0.0009934265734265735, "step": 548, "tokens_trained": 0.052092872 }, { "epoch": 0.15602836879432624, "grad_norm": 4.350259780883789, "loss": 5.6533, "lr": 0.0009931468531468532, "step": 550, "tokens_trained": 0.052281768 }, { "epoch": 0.15659574468085105, "grad_norm": 4.515641689300537, "loss": 5.7492, "lr": 0.000992867132867133, "step": 552, "tokens_trained": 0.052471848 }, { "epoch": 0.15716312056737589, "grad_norm": 4.628066539764404, "loss": 5.7113, "lr": 0.0009925874125874127, "step": 554, "tokens_trained": 0.052660168 }, { "epoch": 0.15773049645390072, "grad_norm": 4.8322930335998535, "loss": 5.6696, "lr": 0.0009923076923076923, "step": 556, "tokens_trained": 0.05284776 }, { "epoch": 0.15829787234042553, "grad_norm": 3.999706506729126, "loss": 5.7296, "lr": 0.000992027972027972, "step": 558, "tokens_trained": 0.053037344 }, { "epoch": 0.15886524822695036, "grad_norm": 4.332971572875977, "loss": 5.7362, "lr": 0.0009917482517482518, "step": 560, "tokens_trained": 0.053228168 }, { "epoch": 0.15943262411347517, "grad_norm": 4.500301361083984, "loss": 5.6982, "lr": 0.0009914685314685314, "step": 562, "tokens_trained": 0.05341856 }, { "epoch": 0.16, "grad_norm": 4.721808910369873, "loss": 5.7166, "lr": 0.0009911888111888113, "step": 564, "tokens_trained": 0.053608824 }, { "epoch": 0.1605673758865248, "grad_norm": 5.265316009521484, "loss": 5.7069, "lr": 0.000990909090909091, "step": 566, "tokens_trained": 0.053799728 }, { "epoch": 0.16113475177304964, "grad_norm": 5.024131774902344, "loss": 5.7113, "lr": 0.0009906293706293705, "step": 568, "tokens_trained": 0.05398944 }, { "epoch": 0.16170212765957448, "grad_norm": 4.063276767730713, "loss": 5.6251, "lr": 0.0009903496503496504, "step": 570, "tokens_trained": 0.054176512 }, { "epoch": 0.1622695035460993, "grad_norm": 4.15974760055542, "loss": 5.6912, "lr": 0.00099006993006993, "step": 572, "tokens_trained": 0.054367072 }, { "epoch": 0.16283687943262412, "grad_norm": 4.338894844055176, "loss": 5.6807, "lr": 0.0009897902097902099, "step": 574, "tokens_trained": 0.054559184 }, { "epoch": 0.16340425531914893, "grad_norm": 5.535487174987793, "loss": 5.6765, "lr": 0.0009895104895104895, "step": 576, "tokens_trained": 0.054748904 }, { "epoch": 0.16397163120567376, "grad_norm": 4.379040241241455, "loss": 5.6884, "lr": 0.0009892307692307694, "step": 578, "tokens_trained": 0.054936136 }, { "epoch": 0.16453900709219857, "grad_norm": 4.746179103851318, "loss": 5.6885, "lr": 0.000988951048951049, "step": 580, "tokens_trained": 0.055125584 }, { "epoch": 0.1651063829787234, "grad_norm": 4.949806213378906, "loss": 5.7061, "lr": 0.0009886713286713286, "step": 582, "tokens_trained": 0.055314608 }, { "epoch": 0.16567375886524824, "grad_norm": 4.507448196411133, "loss": 5.6339, "lr": 0.0009883916083916085, "step": 584, "tokens_trained": 0.055503992 }, { "epoch": 0.16624113475177305, "grad_norm": 4.131013870239258, "loss": 5.7122, "lr": 0.0009881118881118881, "step": 586, "tokens_trained": 0.055693376 }, { "epoch": 0.16680851063829788, "grad_norm": 5.32897424697876, "loss": 5.7192, "lr": 0.000987832167832168, "step": 588, "tokens_trained": 0.05588452 }, { "epoch": 0.1673758865248227, "grad_norm": 4.166877746582031, "loss": 5.6666, "lr": 0.0009875524475524476, "step": 590, "tokens_trained": 0.056073936 }, { "epoch": 0.16794326241134752, "grad_norm": 4.393389701843262, "loss": 5.6113, "lr": 0.0009872727272727273, "step": 592, "tokens_trained": 0.056262224 }, { "epoch": 0.16851063829787233, "grad_norm": 4.466696739196777, "loss": 5.6466, "lr": 0.000986993006993007, "step": 594, "tokens_trained": 0.056454008 }, { "epoch": 0.16907801418439716, "grad_norm": 3.9413373470306396, "loss": 5.6838, "lr": 0.0009867132867132867, "step": 596, "tokens_trained": 0.05664444 }, { "epoch": 0.169645390070922, "grad_norm": 3.594649314880371, "loss": 5.6684, "lr": 0.0009864335664335664, "step": 598, "tokens_trained": 0.056833864 }, { "epoch": 0.1702127659574468, "grad_norm": 3.5969483852386475, "loss": 5.6619, "lr": 0.0009861538461538462, "step": 600, "tokens_trained": 0.05702332 }, { "epoch": 0.17078014184397164, "grad_norm": 3.845414638519287, "loss": 5.5855, "lr": 0.0009858741258741259, "step": 602, "tokens_trained": 0.057212776 }, { "epoch": 0.17134751773049645, "grad_norm": 3.9198834896087646, "loss": 5.6551, "lr": 0.0009855944055944055, "step": 604, "tokens_trained": 0.05740152 }, { "epoch": 0.17191489361702128, "grad_norm": 3.6764986515045166, "loss": 5.6228, "lr": 0.0009853146853146854, "step": 606, "tokens_trained": 0.057595616 }, { "epoch": 0.1724822695035461, "grad_norm": 3.8210043907165527, "loss": 5.6557, "lr": 0.000985034965034965, "step": 608, "tokens_trained": 0.057783968 }, { "epoch": 0.17304964539007092, "grad_norm": 3.893644094467163, "loss": 5.6675, "lr": 0.0009847552447552449, "step": 610, "tokens_trained": 0.057974832 }, { "epoch": 0.17361702127659576, "grad_norm": 3.280839681625366, "loss": 5.6442, "lr": 0.0009844755244755245, "step": 612, "tokens_trained": 0.058166272 }, { "epoch": 0.17418439716312056, "grad_norm": 3.4350404739379883, "loss": 5.6555, "lr": 0.0009841958041958043, "step": 614, "tokens_trained": 0.058356008 }, { "epoch": 0.1747517730496454, "grad_norm": 3.7700448036193848, "loss": 5.6138, "lr": 0.000983916083916084, "step": 616, "tokens_trained": 0.058546792 }, { "epoch": 0.1753191489361702, "grad_norm": 3.8182730674743652, "loss": 5.6931, "lr": 0.0009836363636363636, "step": 618, "tokens_trained": 0.058736296 }, { "epoch": 0.17588652482269504, "grad_norm": 3.9105372428894043, "loss": 5.6431, "lr": 0.0009833566433566435, "step": 620, "tokens_trained": 0.058927576 }, { "epoch": 0.17645390070921985, "grad_norm": 3.8897712230682373, "loss": 5.6203, "lr": 0.000983076923076923, "step": 622, "tokens_trained": 0.059118416 }, { "epoch": 0.17702127659574468, "grad_norm": 3.512194871902466, "loss": 5.6292, "lr": 0.000982797202797203, "step": 624, "tokens_trained": 0.059308568 }, { "epoch": 0.1773049645390071, "eval_loss": 5.630118370056152, "eval_runtime": 21.1591, "step": 625, "tokens_trained": 0.059404056 }, { "epoch": 0.17758865248226952, "grad_norm": 2.990100383758545, "loss": 5.622, "lr": 0.0009825174825174826, "step": 626, "tokens_trained": 0.059499776 }, { "epoch": 0.17815602836879432, "grad_norm": 3.0487334728240967, "loss": 5.6629, "lr": 0.0009822377622377622, "step": 628, "tokens_trained": 0.059690208 }, { "epoch": 0.17872340425531916, "grad_norm": 3.6905510425567627, "loss": 5.6345, "lr": 0.0009819580419580419, "step": 630, "tokens_trained": 0.059881352 }, { "epoch": 0.17929078014184396, "grad_norm": 3.302255630493164, "loss": 5.6733, "lr": 0.0009816783216783217, "step": 632, "tokens_trained": 0.060071896 }, { "epoch": 0.1798581560283688, "grad_norm": 3.6833834648132324, "loss": 5.5868, "lr": 0.0009813986013986014, "step": 634, "tokens_trained": 0.060260504 }, { "epoch": 0.1804255319148936, "grad_norm": 3.1528804302215576, "loss": 5.6128, "lr": 0.0009811188811188812, "step": 636, "tokens_trained": 0.060450584 }, { "epoch": 0.18099290780141844, "grad_norm": 3.788860559463501, "loss": 5.6235, "lr": 0.0009808391608391608, "step": 638, "tokens_trained": 0.060640872 }, { "epoch": 0.18156028368794327, "grad_norm": 3.192462682723999, "loss": 5.545, "lr": 0.0009805594405594405, "step": 640, "tokens_trained": 0.060832776 }, { "epoch": 0.18212765957446808, "grad_norm": 3.505732774734497, "loss": 5.5801, "lr": 0.0009802797202797203, "step": 642, "tokens_trained": 0.06102204 }, { "epoch": 0.18269503546099292, "grad_norm": 3.9589102268218994, "loss": 5.6091, "lr": 0.00098, "step": 644, "tokens_trained": 0.061209744 }, { "epoch": 0.18326241134751772, "grad_norm": 3.4410059452056885, "loss": 5.6279, "lr": 0.0009797202797202798, "step": 646, "tokens_trained": 0.061400392 }, { "epoch": 0.18382978723404256, "grad_norm": 3.7746005058288574, "loss": 5.6124, "lr": 0.0009794405594405595, "step": 648, "tokens_trained": 0.061592232 }, { "epoch": 0.18439716312056736, "grad_norm": 3.75022292137146, "loss": 5.5826, "lr": 0.000979160839160839, "step": 650, "tokens_trained": 0.061781824 }, { "epoch": 0.1849645390070922, "grad_norm": 3.7629313468933105, "loss": 5.555, "lr": 0.000978881118881119, "step": 652, "tokens_trained": 0.061972744 }, { "epoch": 0.18553191489361703, "grad_norm": 4.5046820640563965, "loss": 5.5972, "lr": 0.0009786013986013986, "step": 654, "tokens_trained": 0.062163456 }, { "epoch": 0.18609929078014184, "grad_norm": 3.443138599395752, "loss": 5.6061, "lr": 0.0009783216783216782, "step": 656, "tokens_trained": 0.06235208 }, { "epoch": 0.18666666666666668, "grad_norm": 3.2661828994750977, "loss": 5.5479, "lr": 0.000978041958041958, "step": 658, "tokens_trained": 0.062544416 }, { "epoch": 0.18723404255319148, "grad_norm": 3.9571003913879395, "loss": 5.6069, "lr": 0.000977762237762238, "step": 660, "tokens_trained": 0.062733992 }, { "epoch": 0.18780141843971632, "grad_norm": 3.705880641937256, "loss": 5.5915, "lr": 0.0009774825174825176, "step": 662, "tokens_trained": 0.062922536 }, { "epoch": 0.18836879432624112, "grad_norm": 4.066433429718018, "loss": 5.6031, "lr": 0.0009772027972027972, "step": 664, "tokens_trained": 0.063114224 }, { "epoch": 0.18893617021276596, "grad_norm": 3.356651782989502, "loss": 5.6045, "lr": 0.0009769230769230768, "step": 666, "tokens_trained": 0.063304616 }, { "epoch": 0.1895035460992908, "grad_norm": 3.8084938526153564, "loss": 5.6138, "lr": 0.0009766433566433567, "step": 668, "tokens_trained": 0.06349476 }, { "epoch": 0.1900709219858156, "grad_norm": 4.282619953155518, "loss": 5.5704, "lr": 0.0009763636363636363, "step": 670, "tokens_trained": 0.063684848 }, { "epoch": 0.19063829787234043, "grad_norm": 3.045057773590088, "loss": 5.6427, "lr": 0.0009760839160839161, "step": 672, "tokens_trained": 0.063875192 }, { "epoch": 0.19120567375886524, "grad_norm": 3.360164165496826, "loss": 5.5778, "lr": 0.0009758041958041958, "step": 674, "tokens_trained": 0.06406636 }, { "epoch": 0.19177304964539008, "grad_norm": 3.5778472423553467, "loss": 5.5389, "lr": 0.0009755244755244756, "step": 676, "tokens_trained": 0.064254376 }, { "epoch": 0.19234042553191488, "grad_norm": 3.34869384765625, "loss": 5.5894, "lr": 0.0009752447552447553, "step": 678, "tokens_trained": 0.0644448 }, { "epoch": 0.19290780141843972, "grad_norm": 3.083582878112793, "loss": 5.5776, "lr": 0.0009749650349650349, "step": 680, "tokens_trained": 0.064633712 }, { "epoch": 0.19347517730496455, "grad_norm": 3.345973491668701, "loss": 5.5987, "lr": 0.0009746853146853148, "step": 682, "tokens_trained": 0.064824808 }, { "epoch": 0.19404255319148936, "grad_norm": 3.9262702465057373, "loss": 5.64, "lr": 0.0009744055944055944, "step": 684, "tokens_trained": 0.065016224 }, { "epoch": 0.1946099290780142, "grad_norm": 3.298543930053711, "loss": 5.587, "lr": 0.0009741258741258742, "step": 686, "tokens_trained": 0.065204216 }, { "epoch": 0.195177304964539, "grad_norm": 3.118626832962036, "loss": 5.5864, "lr": 0.0009738461538461538, "step": 688, "tokens_trained": 0.065393256 }, { "epoch": 0.19574468085106383, "grad_norm": 2.983548402786255, "loss": 5.5506, "lr": 0.0009735664335664336, "step": 690, "tokens_trained": 0.06558324 }, { "epoch": 0.19631205673758864, "grad_norm": 3.5204527378082275, "loss": 5.5336, "lr": 0.0009732867132867133, "step": 692, "tokens_trained": 0.065775624 }, { "epoch": 0.19687943262411348, "grad_norm": 3.138550281524658, "loss": 5.5677, "lr": 0.000973006993006993, "step": 694, "tokens_trained": 0.0659666 }, { "epoch": 0.1974468085106383, "grad_norm": 3.0961053371429443, "loss": 5.5714, "lr": 0.0009727272727272728, "step": 696, "tokens_trained": 0.066155512 }, { "epoch": 0.19801418439716312, "grad_norm": 3.4929685592651367, "loss": 5.5829, "lr": 0.0009724475524475524, "step": 698, "tokens_trained": 0.06634576 }, { "epoch": 0.19858156028368795, "grad_norm": 3.1820616722106934, "loss": 5.6108, "lr": 0.0009721678321678323, "step": 700, "tokens_trained": 0.066537016 }, { "epoch": 0.19914893617021276, "grad_norm": 3.4244654178619385, "loss": 5.6025, "lr": 0.0009718881118881119, "step": 702, "tokens_trained": 0.066727856 }, { "epoch": 0.1997163120567376, "grad_norm": 3.258605480194092, "loss": 5.5581, "lr": 0.0009716083916083917, "step": 704, "tokens_trained": 0.066916672 }, { "epoch": 0.2002836879432624, "grad_norm": 2.7159688472747803, "loss": 5.5478, "lr": 0.0009713286713286713, "step": 706, "tokens_trained": 0.067107704 }, { "epoch": 0.20085106382978724, "grad_norm": 3.1941912174224854, "loss": 5.6126, "lr": 0.000971048951048951, "step": 708, "tokens_trained": 0.067297896 }, { "epoch": 0.20141843971631207, "grad_norm": 3.20470929145813, "loss": 5.5628, "lr": 0.0009707692307692308, "step": 710, "tokens_trained": 0.06748608 }, { "epoch": 0.20198581560283688, "grad_norm": 3.6400153636932373, "loss": 5.5758, "lr": 0.0009704895104895105, "step": 712, "tokens_trained": 0.0676766 }, { "epoch": 0.2025531914893617, "grad_norm": 2.881639003753662, "loss": 5.5512, "lr": 0.0009702097902097903, "step": 714, "tokens_trained": 0.067865848 }, { "epoch": 0.20312056737588652, "grad_norm": 3.1113905906677246, "loss": 5.5396, "lr": 0.0009699300699300699, "step": 716, "tokens_trained": 0.068055368 }, { "epoch": 0.20368794326241135, "grad_norm": 3.135014772415161, "loss": 5.5763, "lr": 0.0009696503496503498, "step": 718, "tokens_trained": 0.068248544 }, { "epoch": 0.20425531914893616, "grad_norm": 3.1870718002319336, "loss": 5.5903, "lr": 0.0009693706293706294, "step": 720, "tokens_trained": 0.068436944 }, { "epoch": 0.204822695035461, "grad_norm": 3.125596523284912, "loss": 5.6033, "lr": 0.0009690909090909091, "step": 722, "tokens_trained": 0.06862548 }, { "epoch": 0.20539007092198583, "grad_norm": 2.897671699523926, "loss": 5.5946, "lr": 0.0009688111888111888, "step": 724, "tokens_trained": 0.068815232 }, { "epoch": 0.20595744680851064, "grad_norm": 2.855313539505005, "loss": 5.5731, "lr": 0.0009685314685314685, "step": 726, "tokens_trained": 0.06900692 }, { "epoch": 0.20652482269503547, "grad_norm": 2.7760672569274902, "loss": 5.4949, "lr": 0.0009682517482517483, "step": 728, "tokens_trained": 0.069195376 }, { "epoch": 0.20709219858156028, "grad_norm": 2.9300007820129395, "loss": 5.5491, "lr": 0.000967972027972028, "step": 730, "tokens_trained": 0.069385512 }, { "epoch": 0.2076595744680851, "grad_norm": 3.299860954284668, "loss": 5.5405, "lr": 0.0009676923076923078, "step": 732, "tokens_trained": 0.069573304 }, { "epoch": 0.20822695035460992, "grad_norm": 3.300189256668091, "loss": 5.5797, "lr": 0.0009674125874125874, "step": 734, "tokens_trained": 0.069764248 }, { "epoch": 0.20879432624113475, "grad_norm": 2.932995557785034, "loss": 5.5556, "lr": 0.0009671328671328672, "step": 736, "tokens_trained": 0.06995496 }, { "epoch": 0.2093617021276596, "grad_norm": 2.6711719036102295, "loss": 5.48, "lr": 0.0009668531468531469, "step": 738, "tokens_trained": 0.070142776 }, { "epoch": 0.2099290780141844, "grad_norm": 2.833314895629883, "loss": 5.542, "lr": 0.0009665734265734266, "step": 740, "tokens_trained": 0.070332064 }, { "epoch": 0.21049645390070923, "grad_norm": 2.899843215942383, "loss": 5.5649, "lr": 0.0009662937062937063, "step": 742, "tokens_trained": 0.070523448 }, { "epoch": 0.21106382978723404, "grad_norm": 2.96528697013855, "loss": 5.5277, "lr": 0.000966013986013986, "step": 744, "tokens_trained": 0.070713768 }, { "epoch": 0.21163120567375887, "grad_norm": 2.921109437942505, "loss": 5.5646, "lr": 0.0009657342657342657, "step": 746, "tokens_trained": 0.070905704 }, { "epoch": 0.21219858156028368, "grad_norm": 3.2725329399108887, "loss": 5.4786, "lr": 0.0009654545454545455, "step": 748, "tokens_trained": 0.071096008 }, { "epoch": 0.2127659574468085, "grad_norm": 2.8296804428100586, "loss": 5.573, "lr": 0.0009651748251748252, "step": 750, "tokens_trained": 0.07128828 }, { "epoch": 0.2127659574468085, "eval_loss": 5.535472869873047, "eval_runtime": 21.0109, "step": 750, "tokens_trained": 0.07128828 }, { "epoch": 0.21333333333333335, "grad_norm": 3.0509591102600098, "loss": 5.6037, "lr": 0.0009648951048951049, "step": 752, "tokens_trained": 0.071479496 }, { "epoch": 0.21390070921985815, "grad_norm": 2.6773571968078613, "loss": 5.5266, "lr": 0.0009646153846153846, "step": 754, "tokens_trained": 0.071668568 }, { "epoch": 0.214468085106383, "grad_norm": 2.9600210189819336, "loss": 5.5362, "lr": 0.0009643356643356644, "step": 756, "tokens_trained": 0.071860552 }, { "epoch": 0.2150354609929078, "grad_norm": 2.6674885749816895, "loss": 5.5388, "lr": 0.0009640559440559441, "step": 758, "tokens_trained": 0.07204912 }, { "epoch": 0.21560283687943263, "grad_norm": 2.50179386138916, "loss": 5.5027, "lr": 0.0009637762237762237, "step": 760, "tokens_trained": 0.072239952 }, { "epoch": 0.21617021276595744, "grad_norm": 2.843411684036255, "loss": 5.5221, "lr": 0.0009634965034965035, "step": 762, "tokens_trained": 0.07243076 }, { "epoch": 0.21673758865248227, "grad_norm": 2.8686277866363525, "loss": 5.4896, "lr": 0.0009632167832167832, "step": 764, "tokens_trained": 0.072623272 }, { "epoch": 0.2173049645390071, "grad_norm": 2.611424684524536, "loss": 5.5557, "lr": 0.000962937062937063, "step": 766, "tokens_trained": 0.07281408 }, { "epoch": 0.2178723404255319, "grad_norm": 3.013145685195923, "loss": 5.4964, "lr": 0.0009626573426573427, "step": 768, "tokens_trained": 0.073005016 }, { "epoch": 0.21843971631205675, "grad_norm": 2.8682022094726562, "loss": 5.5232, "lr": 0.0009623776223776224, "step": 770, "tokens_trained": 0.07319652 }, { "epoch": 0.21900709219858155, "grad_norm": 2.6478466987609863, "loss": 5.5517, "lr": 0.0009620979020979021, "step": 772, "tokens_trained": 0.073387048 }, { "epoch": 0.2195744680851064, "grad_norm": 2.7273097038269043, "loss": 5.5572, "lr": 0.0009618181818181818, "step": 774, "tokens_trained": 0.073577424 }, { "epoch": 0.2201418439716312, "grad_norm": 3.104907751083374, "loss": 5.5081, "lr": 0.0009615384615384616, "step": 776, "tokens_trained": 0.073766712 }, { "epoch": 0.22070921985815603, "grad_norm": 2.9616432189941406, "loss": 5.5059, "lr": 0.0009612587412587412, "step": 778, "tokens_trained": 0.073956272 }, { "epoch": 0.22127659574468084, "grad_norm": 3.330319881439209, "loss": 5.4811, "lr": 0.000960979020979021, "step": 780, "tokens_trained": 0.074144008 }, { "epoch": 0.22184397163120567, "grad_norm": 2.964371919631958, "loss": 5.4763, "lr": 0.0009606993006993007, "step": 782, "tokens_trained": 0.074333888 }, { "epoch": 0.2224113475177305, "grad_norm": 3.13899827003479, "loss": 5.5262, "lr": 0.0009604195804195805, "step": 784, "tokens_trained": 0.074523584 }, { "epoch": 0.2229787234042553, "grad_norm": 3.2576637268066406, "loss": 5.4983, "lr": 0.0009601398601398602, "step": 786, "tokens_trained": 0.074714128 }, { "epoch": 0.22354609929078015, "grad_norm": 2.916149616241455, "loss": 5.504, "lr": 0.0009598601398601398, "step": 788, "tokens_trained": 0.074905104 }, { "epoch": 0.22411347517730495, "grad_norm": 2.842733144760132, "loss": 5.4997, "lr": 0.0009595804195804196, "step": 790, "tokens_trained": 0.075096328 }, { "epoch": 0.2246808510638298, "grad_norm": 2.880695104598999, "loss": 5.5131, "lr": 0.0009593006993006993, "step": 792, "tokens_trained": 0.075286104 }, { "epoch": 0.2252482269503546, "grad_norm": 2.620516300201416, "loss": 5.5291, "lr": 0.0009590209790209791, "step": 794, "tokens_trained": 0.075477392 }, { "epoch": 0.22581560283687943, "grad_norm": 2.622455358505249, "loss": 5.5433, "lr": 0.0009587412587412587, "step": 796, "tokens_trained": 0.0756682 }, { "epoch": 0.22638297872340427, "grad_norm": 2.532047986984253, "loss": 5.5169, "lr": 0.0009584615384615385, "step": 798, "tokens_trained": 0.075856528 }, { "epoch": 0.22695035460992907, "grad_norm": 2.628110885620117, "loss": 5.5369, "lr": 0.0009581818181818182, "step": 800, "tokens_trained": 0.076046256 }, { "epoch": 0.2275177304964539, "grad_norm": 2.376600980758667, "loss": 5.4888, "lr": 0.000957902097902098, "step": 802, "tokens_trained": 0.076236016 }, { "epoch": 0.22808510638297871, "grad_norm": 2.433666706085205, "loss": 5.5044, "lr": 0.0009576223776223777, "step": 804, "tokens_trained": 0.07642324 }, { "epoch": 0.22865248226950355, "grad_norm": 2.3850929737091064, "loss": 5.4941, "lr": 0.0009573426573426573, "step": 806, "tokens_trained": 0.07661376 }, { "epoch": 0.22921985815602836, "grad_norm": 2.4664969444274902, "loss": 5.5257, "lr": 0.0009570629370629371, "step": 808, "tokens_trained": 0.076804952 }, { "epoch": 0.2297872340425532, "grad_norm": 2.8514602184295654, "loss": 5.5335, "lr": 0.0009567832167832168, "step": 810, "tokens_trained": 0.076995064 }, { "epoch": 0.23035460992907802, "grad_norm": 2.508887767791748, "loss": 5.5093, "lr": 0.0009565034965034966, "step": 812, "tokens_trained": 0.077185344 }, { "epoch": 0.23092198581560283, "grad_norm": 2.5842514038085938, "loss": 5.5246, "lr": 0.0009562237762237762, "step": 814, "tokens_trained": 0.077375232 }, { "epoch": 0.23148936170212767, "grad_norm": 2.621562957763672, "loss": 5.4948, "lr": 0.0009559440559440559, "step": 816, "tokens_trained": 0.07756528 }, { "epoch": 0.23205673758865247, "grad_norm": 2.3230698108673096, "loss": 5.5367, "lr": 0.0009556643356643357, "step": 818, "tokens_trained": 0.077754936 }, { "epoch": 0.2326241134751773, "grad_norm": 2.728039264678955, "loss": 5.4548, "lr": 0.0009553846153846154, "step": 820, "tokens_trained": 0.077944056 }, { "epoch": 0.23319148936170211, "grad_norm": 2.786271333694458, "loss": 5.4701, "lr": 0.0009551048951048952, "step": 822, "tokens_trained": 0.07813272 }, { "epoch": 0.23375886524822695, "grad_norm": 2.449995517730713, "loss": 5.5505, "lr": 0.0009548251748251748, "step": 824, "tokens_trained": 0.078321888 }, { "epoch": 0.23432624113475178, "grad_norm": 2.394447088241577, "loss": 5.4709, "lr": 0.0009545454545454546, "step": 826, "tokens_trained": 0.078510288 }, { "epoch": 0.2348936170212766, "grad_norm": 2.5857675075531006, "loss": 5.4986, "lr": 0.0009542657342657343, "step": 828, "tokens_trained": 0.078698032 }, { "epoch": 0.23546099290780143, "grad_norm": 2.728743314743042, "loss": 5.4983, "lr": 0.000953986013986014, "step": 830, "tokens_trained": 0.078890608 }, { "epoch": 0.23602836879432623, "grad_norm": 2.3619866371154785, "loss": 5.4985, "lr": 0.0009537062937062937, "step": 832, "tokens_trained": 0.079081968 }, { "epoch": 0.23659574468085107, "grad_norm": 2.6265158653259277, "loss": 5.5088, "lr": 0.0009534265734265734, "step": 834, "tokens_trained": 0.079270712 }, { "epoch": 0.23716312056737587, "grad_norm": 2.3731281757354736, "loss": 5.4682, "lr": 0.0009531468531468532, "step": 836, "tokens_trained": 0.079459912 }, { "epoch": 0.2377304964539007, "grad_norm": 2.375283718109131, "loss": 5.4278, "lr": 0.0009528671328671329, "step": 838, "tokens_trained": 0.079649408 }, { "epoch": 0.23829787234042554, "grad_norm": 2.6856729984283447, "loss": 5.5277, "lr": 0.0009525874125874127, "step": 840, "tokens_trained": 0.079839552 }, { "epoch": 0.23886524822695035, "grad_norm": 2.5037410259246826, "loss": 5.5022, "lr": 0.0009523076923076923, "step": 842, "tokens_trained": 0.08002732 }, { "epoch": 0.23943262411347518, "grad_norm": 2.25175404548645, "loss": 5.4918, "lr": 0.000952027972027972, "step": 844, "tokens_trained": 0.080216416 }, { "epoch": 0.24, "grad_norm": 2.3555264472961426, "loss": 5.5134, "lr": 0.0009517482517482518, "step": 846, "tokens_trained": 0.080406928 }, { "epoch": 0.24056737588652483, "grad_norm": 2.390998601913452, "loss": 5.4721, "lr": 0.0009514685314685315, "step": 848, "tokens_trained": 0.080596232 }, { "epoch": 0.24113475177304963, "grad_norm": 2.1585986614227295, "loss": 5.4511, "lr": 0.0009511888111888112, "step": 850, "tokens_trained": 0.080786848 }, { "epoch": 0.24170212765957447, "grad_norm": 2.7733986377716064, "loss": 5.5269, "lr": 0.0009509090909090909, "step": 852, "tokens_trained": 0.080978144 }, { "epoch": 0.2422695035460993, "grad_norm": 2.8021209239959717, "loss": 5.4751, "lr": 0.0009506293706293707, "step": 854, "tokens_trained": 0.081167712 }, { "epoch": 0.2428368794326241, "grad_norm": 2.5434224605560303, "loss": 5.5154, "lr": 0.0009503496503496504, "step": 856, "tokens_trained": 0.081357584 }, { "epoch": 0.24340425531914894, "grad_norm": 2.456421136856079, "loss": 5.5459, "lr": 0.0009500699300699301, "step": 858, "tokens_trained": 0.081545992 }, { "epoch": 0.24397163120567375, "grad_norm": 2.317312002182007, "loss": 5.4644, "lr": 0.0009497902097902098, "step": 860, "tokens_trained": 0.081735392 }, { "epoch": 0.24453900709219858, "grad_norm": 2.3580780029296875, "loss": 5.4359, "lr": 0.0009495104895104895, "step": 862, "tokens_trained": 0.081925608 }, { "epoch": 0.2451063829787234, "grad_norm": 2.6440224647521973, "loss": 5.4757, "lr": 0.0009492307692307693, "step": 864, "tokens_trained": 0.08211328 }, { "epoch": 0.24567375886524823, "grad_norm": 2.5468132495880127, "loss": 5.4115, "lr": 0.000948951048951049, "step": 866, "tokens_trained": 0.082303736 }, { "epoch": 0.24624113475177306, "grad_norm": 2.431992530822754, "loss": 5.4655, "lr": 0.0009486713286713286, "step": 868, "tokens_trained": 0.082492896 }, { "epoch": 0.24680851063829787, "grad_norm": 2.443335771560669, "loss": 5.4684, "lr": 0.0009483916083916084, "step": 870, "tokens_trained": 0.082684024 }, { "epoch": 0.2473758865248227, "grad_norm": 2.6467180252075195, "loss": 5.5017, "lr": 0.0009481118881118881, "step": 872, "tokens_trained": 0.08287444 }, { "epoch": 0.2479432624113475, "grad_norm": 2.6044974327087402, "loss": 5.4637, "lr": 0.0009478321678321679, "step": 874, "tokens_trained": 0.08306436 }, { "epoch": 0.24822695035460993, "eval_loss": 5.4816508293151855, "eval_runtime": 20.9467, "step": 875, "tokens_trained": 0.083158888 }, { "epoch": 0.24851063829787234, "grad_norm": 2.6221189498901367, "loss": 5.4785, "lr": 0.0009475524475524476, "step": 876, "tokens_trained": 0.083253472 }, { "epoch": 0.24907801418439715, "grad_norm": 2.409327983856201, "loss": 5.42, "lr": 0.0009472727272727273, "step": 878, "tokens_trained": 0.08344528 }, { "epoch": 0.24964539007092199, "grad_norm": 2.2504723072052, "loss": 5.399, "lr": 0.000946993006993007, "step": 880, "tokens_trained": 0.083635752 }, { "epoch": 0.2502127659574468, "grad_norm": 2.3018665313720703, "loss": 5.4512, "lr": 0.0009467132867132868, "step": 882, "tokens_trained": 0.08382576 }, { "epoch": 0.2507801418439716, "grad_norm": 2.5774636268615723, "loss": 5.4592, "lr": 0.0009464335664335665, "step": 884, "tokens_trained": 0.084016232 }, { "epoch": 0.25134751773049646, "grad_norm": 2.614935874938965, "loss": 5.4772, "lr": 0.0009461538461538461, "step": 886, "tokens_trained": 0.084206992 }, { "epoch": 0.2519148936170213, "grad_norm": 2.4281506538391113, "loss": 5.4972, "lr": 0.0009458741258741259, "step": 888, "tokens_trained": 0.084395848 }, { "epoch": 0.2524822695035461, "grad_norm": 2.3668100833892822, "loss": 5.4505, "lr": 0.0009455944055944056, "step": 890, "tokens_trained": 0.084583704 }, { "epoch": 0.2530496453900709, "grad_norm": 2.1937146186828613, "loss": 5.4981, "lr": 0.0009453146853146854, "step": 892, "tokens_trained": 0.08477096 }, { "epoch": 0.25361702127659574, "grad_norm": 2.2917556762695312, "loss": 5.4224, "lr": 0.000945034965034965, "step": 894, "tokens_trained": 0.084961048 }, { "epoch": 0.2541843971631206, "grad_norm": 2.1254703998565674, "loss": 5.4409, "lr": 0.0009447552447552447, "step": 896, "tokens_trained": 0.085153256 }, { "epoch": 0.2547517730496454, "grad_norm": 2.267159938812256, "loss": 5.4527, "lr": 0.0009444755244755245, "step": 898, "tokens_trained": 0.085343128 }, { "epoch": 0.2553191489361702, "grad_norm": 2.1975555419921875, "loss": 5.516, "lr": 0.0009441958041958042, "step": 900, "tokens_trained": 0.085534024 }, { "epoch": 0.255886524822695, "grad_norm": 2.3459436893463135, "loss": 5.4592, "lr": 0.000943916083916084, "step": 902, "tokens_trained": 0.085725136 }, { "epoch": 0.25645390070921986, "grad_norm": 2.4788501262664795, "loss": 5.3937, "lr": 0.0009436363636363636, "step": 904, "tokens_trained": 0.08591548 }, { "epoch": 0.2570212765957447, "grad_norm": 2.415065288543701, "loss": 5.3991, "lr": 0.0009433566433566434, "step": 906, "tokens_trained": 0.086105008 }, { "epoch": 0.25758865248226953, "grad_norm": 2.1260058879852295, "loss": 5.4122, "lr": 0.0009430769230769231, "step": 908, "tokens_trained": 0.08629424 }, { "epoch": 0.2581560283687943, "grad_norm": 2.1759092807769775, "loss": 5.4663, "lr": 0.0009427972027972029, "step": 910, "tokens_trained": 0.086485784 }, { "epoch": 0.25872340425531914, "grad_norm": 2.3481245040893555, "loss": 5.4398, "lr": 0.0009425174825174825, "step": 912, "tokens_trained": 0.086676744 }, { "epoch": 0.259290780141844, "grad_norm": 2.312612533569336, "loss": 5.4615, "lr": 0.0009422377622377622, "step": 914, "tokens_trained": 0.086866424 }, { "epoch": 0.2598581560283688, "grad_norm": 2.4709548950195312, "loss": 5.4062, "lr": 0.000941958041958042, "step": 916, "tokens_trained": 0.087055824 }, { "epoch": 0.2604255319148936, "grad_norm": 2.3664543628692627, "loss": 5.4696, "lr": 0.0009416783216783217, "step": 918, "tokens_trained": 0.087244136 }, { "epoch": 0.26099290780141843, "grad_norm": 2.423687696456909, "loss": 5.4762, "lr": 0.0009413986013986015, "step": 920, "tokens_trained": 0.087432584 }, { "epoch": 0.26156028368794326, "grad_norm": 2.4002890586853027, "loss": 5.4743, "lr": 0.0009411188811188811, "step": 922, "tokens_trained": 0.087622248 }, { "epoch": 0.2621276595744681, "grad_norm": 2.107527494430542, "loss": 5.4013, "lr": 0.0009408391608391608, "step": 924, "tokens_trained": 0.087809888 }, { "epoch": 0.26269503546099293, "grad_norm": 2.05177640914917, "loss": 5.4601, "lr": 0.0009405594405594406, "step": 926, "tokens_trained": 0.088002704 }, { "epoch": 0.2632624113475177, "grad_norm": 2.303874969482422, "loss": 5.456, "lr": 0.0009402797202797203, "step": 928, "tokens_trained": 0.088191344 }, { "epoch": 0.26382978723404255, "grad_norm": 2.4369659423828125, "loss": 5.4162, "lr": 0.00094, "step": 930, "tokens_trained": 0.088380832 }, { "epoch": 0.2643971631205674, "grad_norm": 2.4750819206237793, "loss": 5.455, "lr": 0.0009397202797202797, "step": 932, "tokens_trained": 0.088569936 }, { "epoch": 0.2649645390070922, "grad_norm": 2.09557843208313, "loss": 5.4273, "lr": 0.0009394405594405595, "step": 934, "tokens_trained": 0.08876116 }, { "epoch": 0.265531914893617, "grad_norm": 2.0984373092651367, "loss": 5.4342, "lr": 0.0009391608391608392, "step": 936, "tokens_trained": 0.088951032 }, { "epoch": 0.26609929078014183, "grad_norm": 2.1150097846984863, "loss": 5.4344, "lr": 0.000938881118881119, "step": 938, "tokens_trained": 0.08914124 }, { "epoch": 0.26666666666666666, "grad_norm": 2.1577563285827637, "loss": 5.455, "lr": 0.0009386013986013986, "step": 940, "tokens_trained": 0.089330952 }, { "epoch": 0.2672340425531915, "grad_norm": 2.0483016967773438, "loss": 5.413, "lr": 0.0009383216783216783, "step": 942, "tokens_trained": 0.08952116 }, { "epoch": 0.26780141843971633, "grad_norm": 2.3116559982299805, "loss": 5.455, "lr": 0.0009380419580419581, "step": 944, "tokens_trained": 0.089712888 }, { "epoch": 0.2683687943262411, "grad_norm": 2.2459256649017334, "loss": 5.3971, "lr": 0.0009377622377622378, "step": 946, "tokens_trained": 0.089903936 }, { "epoch": 0.26893617021276595, "grad_norm": 2.3048787117004395, "loss": 5.4454, "lr": 0.0009374825174825175, "step": 948, "tokens_trained": 0.090095888 }, { "epoch": 0.2695035460992908, "grad_norm": 2.196735143661499, "loss": 5.4101, "lr": 0.0009372027972027972, "step": 950, "tokens_trained": 0.090287472 }, { "epoch": 0.2700709219858156, "grad_norm": 2.3908562660217285, "loss": 5.4731, "lr": 0.0009369230769230769, "step": 952, "tokens_trained": 0.090476568 }, { "epoch": 0.27063829787234045, "grad_norm": 2.154932975769043, "loss": 5.4104, "lr": 0.0009366433566433567, "step": 954, "tokens_trained": 0.090665592 }, { "epoch": 0.27120567375886523, "grad_norm": 2.340907096862793, "loss": 5.3707, "lr": 0.0009363636363636364, "step": 956, "tokens_trained": 0.090853232 }, { "epoch": 0.27177304964539006, "grad_norm": 2.1736438274383545, "loss": 5.4484, "lr": 0.0009360839160839161, "step": 958, "tokens_trained": 0.091043808 }, { "epoch": 0.2723404255319149, "grad_norm": 2.3518154621124268, "loss": 5.4919, "lr": 0.0009358041958041958, "step": 960, "tokens_trained": 0.09123384 }, { "epoch": 0.27290780141843973, "grad_norm": 2.6673426628112793, "loss": 5.4008, "lr": 0.0009355244755244755, "step": 962, "tokens_trained": 0.091422544 }, { "epoch": 0.2734751773049645, "grad_norm": 2.4755311012268066, "loss": 5.4533, "lr": 0.0009352447552447553, "step": 964, "tokens_trained": 0.09161544 }, { "epoch": 0.27404255319148935, "grad_norm": 2.338452100753784, "loss": 5.4953, "lr": 0.0009349650349650349, "step": 966, "tokens_trained": 0.091806344 }, { "epoch": 0.2746099290780142, "grad_norm": 2.170426845550537, "loss": 5.4588, "lr": 0.0009346853146853147, "step": 968, "tokens_trained": 0.091996648 }, { "epoch": 0.275177304964539, "grad_norm": 2.2587599754333496, "loss": 5.4547, "lr": 0.0009344055944055944, "step": 970, "tokens_trained": 0.09218848 }, { "epoch": 0.27574468085106385, "grad_norm": 2.0009043216705322, "loss": 5.4116, "lr": 0.0009341258741258742, "step": 972, "tokens_trained": 0.092377984 }, { "epoch": 0.27631205673758863, "grad_norm": 2.0617294311523438, "loss": 5.4541, "lr": 0.0009338461538461539, "step": 974, "tokens_trained": 0.092569472 }, { "epoch": 0.27687943262411346, "grad_norm": 2.059300661087036, "loss": 5.4414, "lr": 0.0009335664335664336, "step": 976, "tokens_trained": 0.092758496 }, { "epoch": 0.2774468085106383, "grad_norm": 2.2815263271331787, "loss": 5.4435, "lr": 0.0009332867132867133, "step": 978, "tokens_trained": 0.092950368 }, { "epoch": 0.27801418439716313, "grad_norm": 2.1770365238189697, "loss": 5.4481, "lr": 0.0009330069930069929, "step": 980, "tokens_trained": 0.093140552 }, { "epoch": 0.27858156028368797, "grad_norm": 2.0089797973632812, "loss": 5.4117, "lr": 0.0009327272727272728, "step": 982, "tokens_trained": 0.093332312 }, { "epoch": 0.27914893617021275, "grad_norm": 2.2188286781311035, "loss": 5.4594, "lr": 0.0009324475524475524, "step": 984, "tokens_trained": 0.093520792 }, { "epoch": 0.2797163120567376, "grad_norm": 2.310481548309326, "loss": 5.393, "lr": 0.0009321678321678322, "step": 986, "tokens_trained": 0.093710608 }, { "epoch": 0.2802836879432624, "grad_norm": 2.3832972049713135, "loss": 5.4277, "lr": 0.0009318881118881119, "step": 988, "tokens_trained": 0.093900952 }, { "epoch": 0.28085106382978725, "grad_norm": 2.011126756668091, "loss": 5.4097, "lr": 0.0009316083916083917, "step": 990, "tokens_trained": 0.094091 }, { "epoch": 0.28141843971631203, "grad_norm": 2.2632968425750732, "loss": 5.4388, "lr": 0.0009313286713286714, "step": 992, "tokens_trained": 0.094281216 }, { "epoch": 0.28198581560283686, "grad_norm": 2.3477587699890137, "loss": 5.3728, "lr": 0.000931048951048951, "step": 994, "tokens_trained": 0.094470264 }, { "epoch": 0.2825531914893617, "grad_norm": 2.486196756362915, "loss": 5.414, "lr": 0.0009307692307692308, "step": 996, "tokens_trained": 0.094662816 }, { "epoch": 0.28312056737588653, "grad_norm": 2.5286316871643066, "loss": 5.4063, "lr": 0.0009304895104895104, "step": 998, "tokens_trained": 0.094852896 }, { "epoch": 0.28368794326241137, "grad_norm": 2.712406635284424, "loss": 5.4274, "lr": 0.0009302097902097903, "step": 1000, "tokens_trained": 0.09504336 }, { "epoch": 0.28368794326241137, "eval_loss": 5.434450626373291, "eval_runtime": 21.0388, "step": 1000, "tokens_trained": 0.09504336 }, { "epoch": 0.28425531914893615, "grad_norm": 2.245316505432129, "loss": 5.3551, "lr": 0.0009299300699300699, "step": 1002, "tokens_trained": 0.095233944 }, { "epoch": 0.284822695035461, "grad_norm": 2.335533618927002, "loss": 5.4608, "lr": 0.0009296503496503497, "step": 1004, "tokens_trained": 0.095423184 }, { "epoch": 0.2853900709219858, "grad_norm": 2.232128858566284, "loss": 5.4374, "lr": 0.0009293706293706294, "step": 1006, "tokens_trained": 0.095612672 }, { "epoch": 0.28595744680851065, "grad_norm": 2.148329257965088, "loss": 5.463, "lr": 0.0009290909090909091, "step": 1008, "tokens_trained": 0.095802784 }, { "epoch": 0.2865248226950355, "grad_norm": 1.9580810070037842, "loss": 5.291, "lr": 0.0009288111888111889, "step": 1010, "tokens_trained": 0.095990776 }, { "epoch": 0.28709219858156027, "grad_norm": 1.9873988628387451, "loss": 5.4103, "lr": 0.0009285314685314685, "step": 1012, "tokens_trained": 0.096180648 }, { "epoch": 0.2876595744680851, "grad_norm": 2.0297746658325195, "loss": 5.4078, "lr": 0.0009282517482517483, "step": 1014, "tokens_trained": 0.09637224 }, { "epoch": 0.28822695035460993, "grad_norm": 1.928497076034546, "loss": 5.3162, "lr": 0.0009279720279720279, "step": 1016, "tokens_trained": 0.096561744 }, { "epoch": 0.28879432624113477, "grad_norm": 2.1219675540924072, "loss": 5.4358, "lr": 0.0009276923076923078, "step": 1018, "tokens_trained": 0.096752296 }, { "epoch": 0.28936170212765955, "grad_norm": 2.0021066665649414, "loss": 5.4232, "lr": 0.0009274125874125874, "step": 1020, "tokens_trained": 0.096943856 }, { "epoch": 0.2899290780141844, "grad_norm": 1.9920068979263306, "loss": 5.407, "lr": 0.0009271328671328671, "step": 1022, "tokens_trained": 0.097133632 }, { "epoch": 0.2904964539007092, "grad_norm": 1.8810361623764038, "loss": 5.4293, "lr": 0.0009268531468531469, "step": 1024, "tokens_trained": 0.097325976 }, { "epoch": 0.29106382978723405, "grad_norm": 1.8560134172439575, "loss": 5.4236, "lr": 0.0009265734265734266, "step": 1026, "tokens_trained": 0.0975142 }, { "epoch": 0.2916312056737589, "grad_norm": 2.1735010147094727, "loss": 5.4252, "lr": 0.0009262937062937064, "step": 1028, "tokens_trained": 0.0977042 }, { "epoch": 0.29219858156028367, "grad_norm": 2.2467288970947266, "loss": 5.3756, "lr": 0.000926013986013986, "step": 1030, "tokens_trained": 0.097893376 }, { "epoch": 0.2927659574468085, "grad_norm": 1.9609313011169434, "loss": 5.4091, "lr": 0.0009257342657342658, "step": 1032, "tokens_trained": 0.0980824 }, { "epoch": 0.29333333333333333, "grad_norm": 2.116384267807007, "loss": 5.4001, "lr": 0.0009254545454545454, "step": 1034, "tokens_trained": 0.098271304 }, { "epoch": 0.29390070921985817, "grad_norm": 2.1869800090789795, "loss": 5.4102, "lr": 0.0009251748251748252, "step": 1036, "tokens_trained": 0.098461528 }, { "epoch": 0.294468085106383, "grad_norm": 2.2882192134857178, "loss": 5.4723, "lr": 0.0009248951048951049, "step": 1038, "tokens_trained": 0.09865268 }, { "epoch": 0.2950354609929078, "grad_norm": 2.1590888500213623, "loss": 5.3523, "lr": 0.0009246153846153846, "step": 1040, "tokens_trained": 0.098842688 }, { "epoch": 0.2956028368794326, "grad_norm": 2.284207582473755, "loss": 5.4647, "lr": 0.0009243356643356644, "step": 1042, "tokens_trained": 0.099031544 }, { "epoch": 0.29617021276595745, "grad_norm": 2.333207845687866, "loss": 5.4655, "lr": 0.0009240559440559441, "step": 1044, "tokens_trained": 0.09922264 }, { "epoch": 0.2967375886524823, "grad_norm": 2.357572555541992, "loss": 5.3909, "lr": 0.0009237762237762239, "step": 1046, "tokens_trained": 0.099411416 }, { "epoch": 0.29730496453900707, "grad_norm": 1.88053297996521, "loss": 5.4119, "lr": 0.0009234965034965035, "step": 1048, "tokens_trained": 0.099602112 }, { "epoch": 0.2978723404255319, "grad_norm": 1.8860585689544678, "loss": 5.3867, "lr": 0.0009232167832167832, "step": 1050, "tokens_trained": 0.099792552 }, { "epoch": 0.29843971631205674, "grad_norm": 2.000173568725586, "loss": 5.3773, "lr": 0.0009229370629370629, "step": 1052, "tokens_trained": 0.099981752 }, { "epoch": 0.29900709219858157, "grad_norm": 2.015394926071167, "loss": 5.3936, "lr": 0.0009226573426573427, "step": 1054, "tokens_trained": 0.10017428 }, { "epoch": 0.2995744680851064, "grad_norm": 2.0050301551818848, "loss": 5.3653, "lr": 0.0009223776223776224, "step": 1056, "tokens_trained": 0.100364544 }, { "epoch": 0.3001418439716312, "grad_norm": 1.7397475242614746, "loss": 5.3224, "lr": 0.0009220979020979021, "step": 1058, "tokens_trained": 0.100555648 }, { "epoch": 0.300709219858156, "grad_norm": 1.9808533191680908, "loss": 5.3822, "lr": 0.0009218181818181819, "step": 1060, "tokens_trained": 0.100744968 }, { "epoch": 0.30127659574468085, "grad_norm": 2.034329652786255, "loss": 5.3961, "lr": 0.0009215384615384616, "step": 1062, "tokens_trained": 0.100934176 }, { "epoch": 0.3018439716312057, "grad_norm": 2.1286778450012207, "loss": 5.4061, "lr": 0.0009212587412587413, "step": 1064, "tokens_trained": 0.101125216 }, { "epoch": 0.3024113475177305, "grad_norm": 2.131822347640991, "loss": 5.3675, "lr": 0.000920979020979021, "step": 1066, "tokens_trained": 0.101314504 }, { "epoch": 0.3029787234042553, "grad_norm": 2.162069320678711, "loss": 5.4552, "lr": 0.0009206993006993007, "step": 1068, "tokens_trained": 0.101503352 }, { "epoch": 0.30354609929078014, "grad_norm": 2.5730931758880615, "loss": 5.3978, "lr": 0.0009204195804195804, "step": 1070, "tokens_trained": 0.101691504 }, { "epoch": 0.30411347517730497, "grad_norm": 2.2053022384643555, "loss": 5.3604, "lr": 0.0009201398601398602, "step": 1072, "tokens_trained": 0.101883072 }, { "epoch": 0.3046808510638298, "grad_norm": 2.1578407287597656, "loss": 5.4236, "lr": 0.0009198601398601398, "step": 1074, "tokens_trained": 0.102075832 }, { "epoch": 0.3052482269503546, "grad_norm": 2.0061423778533936, "loss": 5.3882, "lr": 0.0009195804195804196, "step": 1076, "tokens_trained": 0.102266768 }, { "epoch": 0.3058156028368794, "grad_norm": 1.8915576934814453, "loss": 5.3539, "lr": 0.0009193006993006993, "step": 1078, "tokens_trained": 0.102457096 }, { "epoch": 0.30638297872340425, "grad_norm": 2.15053129196167, "loss": 5.4222, "lr": 0.0009190209790209791, "step": 1080, "tokens_trained": 0.102647544 }, { "epoch": 0.3069503546099291, "grad_norm": 2.241217851638794, "loss": 5.4275, "lr": 0.0009187412587412588, "step": 1082, "tokens_trained": 0.10283904 }, { "epoch": 0.3075177304964539, "grad_norm": 2.37854266166687, "loss": 5.419, "lr": 0.0009184615384615385, "step": 1084, "tokens_trained": 0.103028464 }, { "epoch": 0.3080851063829787, "grad_norm": 2.00118350982666, "loss": 5.4225, "lr": 0.0009181818181818182, "step": 1086, "tokens_trained": 0.10321896 }, { "epoch": 0.30865248226950354, "grad_norm": 2.2643723487854004, "loss": 5.4487, "lr": 0.0009179020979020978, "step": 1088, "tokens_trained": 0.103409256 }, { "epoch": 0.30921985815602837, "grad_norm": 2.4618585109710693, "loss": 5.4211, "lr": 0.0009176223776223777, "step": 1090, "tokens_trained": 0.103597272 }, { "epoch": 0.3097872340425532, "grad_norm": 2.141491174697876, "loss": 5.3758, "lr": 0.0009173426573426573, "step": 1092, "tokens_trained": 0.103786128 }, { "epoch": 0.31035460992907804, "grad_norm": 1.9777475595474243, "loss": 5.4129, "lr": 0.0009170629370629371, "step": 1094, "tokens_trained": 0.103974864 }, { "epoch": 0.3109219858156028, "grad_norm": 1.9153270721435547, "loss": 5.3912, "lr": 0.0009167832167832168, "step": 1096, "tokens_trained": 0.104163864 }, { "epoch": 0.31148936170212765, "grad_norm": 2.172558546066284, "loss": 5.3425, "lr": 0.0009165034965034966, "step": 1098, "tokens_trained": 0.104353136 }, { "epoch": 0.3120567375886525, "grad_norm": 2.049896717071533, "loss": 5.3732, "lr": 0.0009162237762237763, "step": 1100, "tokens_trained": 0.10454476 }, { "epoch": 0.3126241134751773, "grad_norm": 1.9415545463562012, "loss": 5.3873, "lr": 0.0009159440559440559, "step": 1102, "tokens_trained": 0.104734296 }, { "epoch": 0.3131914893617021, "grad_norm": 1.7280856370925903, "loss": 5.3857, "lr": 0.0009156643356643357, "step": 1104, "tokens_trained": 0.104925648 }, { "epoch": 0.31375886524822694, "grad_norm": 1.9120069742202759, "loss": 5.3216, "lr": 0.0009153846153846153, "step": 1106, "tokens_trained": 0.105115776 }, { "epoch": 0.31432624113475177, "grad_norm": 2.007101058959961, "loss": 5.4074, "lr": 0.0009151048951048952, "step": 1108, "tokens_trained": 0.105305656 }, { "epoch": 0.3148936170212766, "grad_norm": 1.9159268140792847, "loss": 5.3625, "lr": 0.0009148251748251748, "step": 1110, "tokens_trained": 0.105494632 }, { "epoch": 0.31546099290780144, "grad_norm": 1.9235239028930664, "loss": 5.3362, "lr": 0.0009145454545454546, "step": 1112, "tokens_trained": 0.105683536 }, { "epoch": 0.3160283687943262, "grad_norm": 1.8954299688339233, "loss": 5.3531, "lr": 0.0009142657342657343, "step": 1114, "tokens_trained": 0.105873176 }, { "epoch": 0.31659574468085105, "grad_norm": 2.026578426361084, "loss": 5.408, "lr": 0.000913986013986014, "step": 1116, "tokens_trained": 0.10606276 }, { "epoch": 0.3171631205673759, "grad_norm": 1.9014806747436523, "loss": 5.363, "lr": 0.0009137062937062938, "step": 1118, "tokens_trained": 0.106254616 }, { "epoch": 0.3177304964539007, "grad_norm": 1.849649429321289, "loss": 5.3811, "lr": 0.0009134265734265734, "step": 1120, "tokens_trained": 0.106445376 }, { "epoch": 0.31829787234042556, "grad_norm": 1.7405186891555786, "loss": 5.3504, "lr": 0.0009131468531468532, "step": 1122, "tokens_trained": 0.106636072 }, { "epoch": 0.31886524822695034, "grad_norm": 1.867285966873169, "loss": 5.3675, "lr": 0.0009128671328671328, "step": 1124, "tokens_trained": 0.106827896 }, { "epoch": 0.3191489361702128, "eval_loss": 5.3796281814575195, "eval_runtime": 20.7444, "step": 1125, "tokens_trained": 0.106922416 }, { "epoch": 0.31943262411347517, "grad_norm": 1.8044356107711792, "loss": 5.3717, "lr": 0.0009125874125874127, "step": 1126, "tokens_trained": 0.107016056 }, { "epoch": 0.32, "grad_norm": 1.6348600387573242, "loss": 5.4676, "lr": 0.0009123076923076923, "step": 1128, "tokens_trained": 0.107203992 }, { "epoch": 0.32056737588652484, "grad_norm": 1.7802475690841675, "loss": 5.3304, "lr": 0.000912027972027972, "step": 1130, "tokens_trained": 0.107394224 }, { "epoch": 0.3211347517730496, "grad_norm": 1.7695430517196655, "loss": 5.3611, "lr": 0.0009117482517482518, "step": 1132, "tokens_trained": 0.107583464 }, { "epoch": 0.32170212765957445, "grad_norm": 2.026853322982788, "loss": 5.363, "lr": 0.0009114685314685315, "step": 1134, "tokens_trained": 0.107776088 }, { "epoch": 0.3222695035460993, "grad_norm": 1.803208589553833, "loss": 5.3801, "lr": 0.0009111888111888113, "step": 1136, "tokens_trained": 0.107964416 }, { "epoch": 0.3228368794326241, "grad_norm": 1.812386155128479, "loss": 5.3721, "lr": 0.0009109090909090909, "step": 1138, "tokens_trained": 0.108153104 }, { "epoch": 0.32340425531914896, "grad_norm": 1.605839490890503, "loss": 5.3339, "lr": 0.0009106293706293707, "step": 1140, "tokens_trained": 0.108341408 }, { "epoch": 0.32397163120567374, "grad_norm": 1.7169313430786133, "loss": 5.4051, "lr": 0.0009103496503496503, "step": 1142, "tokens_trained": 0.108532312 }, { "epoch": 0.3245390070921986, "grad_norm": 2.0499444007873535, "loss": 5.2992, "lr": 0.0009100699300699301, "step": 1144, "tokens_trained": 0.108721864 }, { "epoch": 0.3251063829787234, "grad_norm": 1.988674521446228, "loss": 5.3862, "lr": 0.0009097902097902098, "step": 1146, "tokens_trained": 0.108912352 }, { "epoch": 0.32567375886524824, "grad_norm": 1.8733936548233032, "loss": 5.3627, "lr": 0.0009095104895104895, "step": 1148, "tokens_trained": 0.109101952 }, { "epoch": 0.3262411347517731, "grad_norm": 1.978704810142517, "loss": 5.3668, "lr": 0.0009092307692307692, "step": 1150, "tokens_trained": 0.109292712 }, { "epoch": 0.32680851063829786, "grad_norm": 1.9723341464996338, "loss": 5.3545, "lr": 0.000908951048951049, "step": 1152, "tokens_trained": 0.109484992 }, { "epoch": 0.3273758865248227, "grad_norm": 2.165728807449341, "loss": 5.3731, "lr": 0.0009086713286713288, "step": 1154, "tokens_trained": 0.109674336 }, { "epoch": 0.3279432624113475, "grad_norm": 1.9241019487380981, "loss": 5.3456, "lr": 0.0009083916083916084, "step": 1156, "tokens_trained": 0.109863368 }, { "epoch": 0.32851063829787236, "grad_norm": 1.9442275762557983, "loss": 5.4065, "lr": 0.0009081118881118881, "step": 1158, "tokens_trained": 0.110051744 }, { "epoch": 0.32907801418439714, "grad_norm": 1.7714000940322876, "loss": 5.3888, "lr": 0.0009078321678321678, "step": 1160, "tokens_trained": 0.11024344 }, { "epoch": 0.329645390070922, "grad_norm": 2.043646812438965, "loss": 5.3835, "lr": 0.0009075524475524476, "step": 1162, "tokens_trained": 0.11043488 }, { "epoch": 0.3302127659574468, "grad_norm": 1.837196946144104, "loss": 5.3554, "lr": 0.0009072727272727273, "step": 1164, "tokens_trained": 0.110626104 }, { "epoch": 0.33078014184397164, "grad_norm": 1.874135971069336, "loss": 5.3457, "lr": 0.000906993006993007, "step": 1166, "tokens_trained": 0.110814768 }, { "epoch": 0.3313475177304965, "grad_norm": 1.6493511199951172, "loss": 5.3118, "lr": 0.0009067132867132866, "step": 1168, "tokens_trained": 0.111004104 }, { "epoch": 0.33191489361702126, "grad_norm": 1.8386362791061401, "loss": 5.3422, "lr": 0.0009064335664335665, "step": 1170, "tokens_trained": 0.11119544 }, { "epoch": 0.3324822695035461, "grad_norm": 2.020859718322754, "loss": 5.3565, "lr": 0.0009061538461538462, "step": 1172, "tokens_trained": 0.111384384 }, { "epoch": 0.3330496453900709, "grad_norm": 2.049401044845581, "loss": 5.3358, "lr": 0.0009058741258741259, "step": 1174, "tokens_trained": 0.111573944 }, { "epoch": 0.33361702127659576, "grad_norm": 1.965345025062561, "loss": 5.3431, "lr": 0.0009055944055944056, "step": 1176, "tokens_trained": 0.111763504 }, { "epoch": 0.3341843971631206, "grad_norm": 1.9792066812515259, "loss": 5.3579, "lr": 0.0009053146853146853, "step": 1178, "tokens_trained": 0.111953664 }, { "epoch": 0.3347517730496454, "grad_norm": 1.7790883779525757, "loss": 5.3499, "lr": 0.0009050349650349651, "step": 1180, "tokens_trained": 0.11214324 }, { "epoch": 0.3353191489361702, "grad_norm": 1.6504682302474976, "loss": 5.3415, "lr": 0.0009047552447552448, "step": 1182, "tokens_trained": 0.112331256 }, { "epoch": 0.33588652482269504, "grad_norm": 1.9687312841415405, "loss": 5.3565, "lr": 0.0009044755244755245, "step": 1184, "tokens_trained": 0.11252208 }, { "epoch": 0.3364539007092199, "grad_norm": 1.7077507972717285, "loss": 5.3568, "lr": 0.0009041958041958041, "step": 1186, "tokens_trained": 0.112714272 }, { "epoch": 0.33702127659574466, "grad_norm": 1.6311697959899902, "loss": 5.3345, "lr": 0.000903916083916084, "step": 1188, "tokens_trained": 0.11290428 }, { "epoch": 0.3375886524822695, "grad_norm": 1.975233793258667, "loss": 5.4161, "lr": 0.0009036363636363637, "step": 1190, "tokens_trained": 0.113093984 }, { "epoch": 0.3381560283687943, "grad_norm": 1.7567362785339355, "loss": 5.3481, "lr": 0.0009033566433566434, "step": 1192, "tokens_trained": 0.113284904 }, { "epoch": 0.33872340425531916, "grad_norm": 2.121367931365967, "loss": 5.3729, "lr": 0.0009030769230769231, "step": 1194, "tokens_trained": 0.113477952 }, { "epoch": 0.339290780141844, "grad_norm": 2.143253803253174, "loss": 5.3866, "lr": 0.0009027972027972027, "step": 1196, "tokens_trained": 0.11366872 }, { "epoch": 0.3398581560283688, "grad_norm": 2.1118557453155518, "loss": 5.3501, "lr": 0.0009025174825174826, "step": 1198, "tokens_trained": 0.113861552 }, { "epoch": 0.3404255319148936, "grad_norm": 1.8132637739181519, "loss": 5.3325, "lr": 0.0009022377622377622, "step": 1200, "tokens_trained": 0.114051176 }, { "epoch": 0.34099290780141844, "grad_norm": 1.761227011680603, "loss": 5.3629, "lr": 0.000901958041958042, "step": 1202, "tokens_trained": 0.114240808 }, { "epoch": 0.3415602836879433, "grad_norm": 1.8358371257781982, "loss": 5.3644, "lr": 0.0009016783216783216, "step": 1204, "tokens_trained": 0.114430968 }, { "epoch": 0.3421276595744681, "grad_norm": 2.0768542289733887, "loss": 5.3705, "lr": 0.0009013986013986014, "step": 1206, "tokens_trained": 0.114620544 }, { "epoch": 0.3426950354609929, "grad_norm": 1.6928143501281738, "loss": 5.2534, "lr": 0.0009011188811188812, "step": 1208, "tokens_trained": 0.114811928 }, { "epoch": 0.3432624113475177, "grad_norm": 1.8634029626846313, "loss": 5.3679, "lr": 0.0009008391608391609, "step": 1210, "tokens_trained": 0.115002912 }, { "epoch": 0.34382978723404256, "grad_norm": 1.8048954010009766, "loss": 5.3049, "lr": 0.0009005594405594406, "step": 1212, "tokens_trained": 0.115192544 }, { "epoch": 0.3443971631205674, "grad_norm": 1.9170348644256592, "loss": 5.2457, "lr": 0.0009002797202797202, "step": 1214, "tokens_trained": 0.115383248 }, { "epoch": 0.3449645390070922, "grad_norm": 1.788751482963562, "loss": 5.3678, "lr": 0.0009000000000000001, "step": 1216, "tokens_trained": 0.115574304 }, { "epoch": 0.345531914893617, "grad_norm": 1.9751293659210205, "loss": 5.3352, "lr": 0.0008997202797202797, "step": 1218, "tokens_trained": 0.115766008 }, { "epoch": 0.34609929078014184, "grad_norm": 1.8202649354934692, "loss": 5.37, "lr": 0.0008994405594405595, "step": 1220, "tokens_trained": 0.11595804 }, { "epoch": 0.3466666666666667, "grad_norm": 1.656063199043274, "loss": 5.3664, "lr": 0.0008991608391608391, "step": 1222, "tokens_trained": 0.116146344 }, { "epoch": 0.3472340425531915, "grad_norm": 1.7509667873382568, "loss": 5.3636, "lr": 0.0008988811188811188, "step": 1224, "tokens_trained": 0.116334568 }, { "epoch": 0.3478014184397163, "grad_norm": 1.8556638956069946, "loss": 5.3577, "lr": 0.0008986013986013987, "step": 1226, "tokens_trained": 0.116525704 }, { "epoch": 0.3483687943262411, "grad_norm": 2.026033639907837, "loss": 5.3657, "lr": 0.0008983216783216783, "step": 1228, "tokens_trained": 0.116716032 }, { "epoch": 0.34893617021276596, "grad_norm": 1.6965924501419067, "loss": 5.304, "lr": 0.0008980419580419581, "step": 1230, "tokens_trained": 0.116904832 }, { "epoch": 0.3495035460992908, "grad_norm": 1.8144174814224243, "loss": 5.3759, "lr": 0.0008977622377622377, "step": 1232, "tokens_trained": 0.117095536 }, { "epoch": 0.3500709219858156, "grad_norm": 1.7229580879211426, "loss": 5.3244, "lr": 0.0008974825174825176, "step": 1234, "tokens_trained": 0.117285952 }, { "epoch": 0.3506382978723404, "grad_norm": 1.722578525543213, "loss": 5.3442, "lr": 0.0008972027972027972, "step": 1236, "tokens_trained": 0.117477488 }, { "epoch": 0.35120567375886524, "grad_norm": 1.8006796836853027, "loss": 5.3624, "lr": 0.000896923076923077, "step": 1238, "tokens_trained": 0.117667352 }, { "epoch": 0.3517730496453901, "grad_norm": 1.7172250747680664, "loss": 5.3002, "lr": 0.0008966433566433566, "step": 1240, "tokens_trained": 0.117856504 }, { "epoch": 0.3523404255319149, "grad_norm": 1.8281760215759277, "loss": 5.3311, "lr": 0.0008963636363636363, "step": 1242, "tokens_trained": 0.11804676 }, { "epoch": 0.3529078014184397, "grad_norm": 1.7666652202606201, "loss": 5.3847, "lr": 0.0008960839160839162, "step": 1244, "tokens_trained": 0.118235688 }, { "epoch": 0.3534751773049645, "grad_norm": 1.7723621129989624, "loss": 5.3506, "lr": 0.0008958041958041958, "step": 1246, "tokens_trained": 0.11842632 }, { "epoch": 0.35404255319148936, "grad_norm": 1.7779643535614014, "loss": 5.3066, "lr": 0.0008955244755244756, "step": 1248, "tokens_trained": 0.118616536 }, { "epoch": 0.3546099290780142, "grad_norm": 1.746245265007019, "loss": 5.2993, "lr": 0.0008952447552447552, "step": 1250, "tokens_trained": 0.118807672 }, { "epoch": 0.3546099290780142, "eval_loss": 5.34489107131958, "eval_runtime": 21.0838, "step": 1250, "tokens_trained": 0.118807672 }, { "epoch": 0.35517730496453903, "grad_norm": 1.8439521789550781, "loss": 5.3796, "lr": 0.000894965034965035, "step": 1252, "tokens_trained": 0.118996672 }, { "epoch": 0.3557446808510638, "grad_norm": 1.7830157279968262, "loss": 5.3435, "lr": 0.0008946853146853147, "step": 1254, "tokens_trained": 0.119189544 }, { "epoch": 0.35631205673758864, "grad_norm": 1.6022379398345947, "loss": 5.3772, "lr": 0.0008944055944055944, "step": 1256, "tokens_trained": 0.119379312 }, { "epoch": 0.3568794326241135, "grad_norm": 1.6100343465805054, "loss": 5.3411, "lr": 0.0008941258741258741, "step": 1258, "tokens_trained": 0.119572072 }, { "epoch": 0.3574468085106383, "grad_norm": 1.7826210260391235, "loss": 5.317, "lr": 0.0008938461538461538, "step": 1260, "tokens_trained": 0.119761304 }, { "epoch": 0.3580141843971631, "grad_norm": 1.510432243347168, "loss": 5.4018, "lr": 0.0008935664335664337, "step": 1262, "tokens_trained": 0.11994984 }, { "epoch": 0.35858156028368793, "grad_norm": 1.7209227085113525, "loss": 5.3651, "lr": 0.0008932867132867133, "step": 1264, "tokens_trained": 0.120139368 }, { "epoch": 0.35914893617021276, "grad_norm": 1.7528654336929321, "loss": 5.3329, "lr": 0.000893006993006993, "step": 1266, "tokens_trained": 0.1203308 }, { "epoch": 0.3597163120567376, "grad_norm": 1.8427083492279053, "loss": 5.3897, "lr": 0.0008927272727272727, "step": 1268, "tokens_trained": 0.12052112 }, { "epoch": 0.36028368794326243, "grad_norm": 1.530527114868164, "loss": 5.3407, "lr": 0.0008924475524475525, "step": 1270, "tokens_trained": 0.120709456 }, { "epoch": 0.3608510638297872, "grad_norm": 1.5996145009994507, "loss": 5.3697, "lr": 0.0008921678321678322, "step": 1272, "tokens_trained": 0.12089976 }, { "epoch": 0.36141843971631205, "grad_norm": 1.5235425233840942, "loss": 5.335, "lr": 0.0008918881118881119, "step": 1274, "tokens_trained": 0.121089184 }, { "epoch": 0.3619858156028369, "grad_norm": 1.757206678390503, "loss": 5.2983, "lr": 0.0008916083916083916, "step": 1276, "tokens_trained": 0.1212798 }, { "epoch": 0.3625531914893617, "grad_norm": 1.5952467918395996, "loss": 5.3593, "lr": 0.0008913286713286713, "step": 1278, "tokens_trained": 0.121472816 }, { "epoch": 0.36312056737588655, "grad_norm": 1.6975666284561157, "loss": 5.3867, "lr": 0.0008910489510489512, "step": 1280, "tokens_trained": 0.121659944 }, { "epoch": 0.36368794326241133, "grad_norm": 1.8659151792526245, "loss": 5.3032, "lr": 0.0008907692307692308, "step": 1282, "tokens_trained": 0.121848552 }, { "epoch": 0.36425531914893616, "grad_norm": 1.8692409992218018, "loss": 5.3643, "lr": 0.0008904895104895105, "step": 1284, "tokens_trained": 0.12203916 }, { "epoch": 0.364822695035461, "grad_norm": 1.786490559577942, "loss": 5.4001, "lr": 0.0008902097902097902, "step": 1286, "tokens_trained": 0.122228464 }, { "epoch": 0.36539007092198583, "grad_norm": 1.6635786294937134, "loss": 5.3158, "lr": 0.00088993006993007, "step": 1288, "tokens_trained": 0.122419768 }, { "epoch": 0.3659574468085106, "grad_norm": 1.8413279056549072, "loss": 5.315, "lr": 0.0008896503496503497, "step": 1290, "tokens_trained": 0.122608512 }, { "epoch": 0.36652482269503545, "grad_norm": 1.802370548248291, "loss": 5.3203, "lr": 0.0008893706293706294, "step": 1292, "tokens_trained": 0.122795944 }, { "epoch": 0.3670921985815603, "grad_norm": 1.5968035459518433, "loss": 5.3833, "lr": 0.000889090909090909, "step": 1294, "tokens_trained": 0.1229842 }, { "epoch": 0.3676595744680851, "grad_norm": 1.8354761600494385, "loss": 5.3365, "lr": 0.0008888111888111888, "step": 1296, "tokens_trained": 0.123175336 }, { "epoch": 0.36822695035460995, "grad_norm": 1.925227403640747, "loss": 5.3687, "lr": 0.0008885314685314686, "step": 1298, "tokens_trained": 0.123366848 }, { "epoch": 0.36879432624113473, "grad_norm": 1.7477060556411743, "loss": 5.4033, "lr": 0.0008882517482517483, "step": 1300, "tokens_trained": 0.123556656 }, { "epoch": 0.36936170212765956, "grad_norm": 1.8925527334213257, "loss": 5.2854, "lr": 0.000887972027972028, "step": 1302, "tokens_trained": 0.12374612 }, { "epoch": 0.3699290780141844, "grad_norm": 1.8904681205749512, "loss": 5.2903, "lr": 0.0008876923076923077, "step": 1304, "tokens_trained": 0.123936192 }, { "epoch": 0.37049645390070923, "grad_norm": 1.9903556108474731, "loss": 5.2994, "lr": 0.0008874125874125875, "step": 1306, "tokens_trained": 0.124126112 }, { "epoch": 0.37106382978723407, "grad_norm": 2.014011859893799, "loss": 5.353, "lr": 0.0008871328671328671, "step": 1308, "tokens_trained": 0.124314592 }, { "epoch": 0.37163120567375885, "grad_norm": 1.9086287021636963, "loss": 5.3924, "lr": 0.0008868531468531469, "step": 1310, "tokens_trained": 0.124503496 }, { "epoch": 0.3721985815602837, "grad_norm": 1.8927134275436401, "loss": 5.3098, "lr": 0.0008865734265734265, "step": 1312, "tokens_trained": 0.124693296 }, { "epoch": 0.3727659574468085, "grad_norm": 1.850883960723877, "loss": 5.356, "lr": 0.0008862937062937063, "step": 1314, "tokens_trained": 0.124883528 }, { "epoch": 0.37333333333333335, "grad_norm": 1.813315510749817, "loss": 5.3564, "lr": 0.0008860139860139861, "step": 1316, "tokens_trained": 0.125072328 }, { "epoch": 0.37390070921985813, "grad_norm": 1.6776509284973145, "loss": 5.3348, "lr": 0.0008857342657342658, "step": 1318, "tokens_trained": 0.125263128 }, { "epoch": 0.37446808510638296, "grad_norm": 1.7775620222091675, "loss": 5.298, "lr": 0.0008854545454545455, "step": 1320, "tokens_trained": 0.125453944 }, { "epoch": 0.3750354609929078, "grad_norm": 1.6916086673736572, "loss": 5.332, "lr": 0.0008851748251748251, "step": 1322, "tokens_trained": 0.125644264 }, { "epoch": 0.37560283687943263, "grad_norm": 1.7182034254074097, "loss": 5.3405, "lr": 0.000884895104895105, "step": 1324, "tokens_trained": 0.125835256 }, { "epoch": 0.37617021276595747, "grad_norm": 1.690463662147522, "loss": 5.355, "lr": 0.0008846153846153846, "step": 1326, "tokens_trained": 0.126025952 }, { "epoch": 0.37673758865248225, "grad_norm": 1.7073352336883545, "loss": 5.3304, "lr": 0.0008843356643356644, "step": 1328, "tokens_trained": 0.126217456 }, { "epoch": 0.3773049645390071, "grad_norm": 1.6633049249649048, "loss": 5.2724, "lr": 0.000884055944055944, "step": 1330, "tokens_trained": 0.126407688 }, { "epoch": 0.3778723404255319, "grad_norm": 1.618843913078308, "loss": 5.2952, "lr": 0.0008837762237762238, "step": 1332, "tokens_trained": 0.126599504 }, { "epoch": 0.37843971631205675, "grad_norm": 1.7496757507324219, "loss": 5.2846, "lr": 0.0008834965034965036, "step": 1334, "tokens_trained": 0.126787648 }, { "epoch": 0.3790070921985816, "grad_norm": 1.7284750938415527, "loss": 5.3229, "lr": 0.0008832167832167832, "step": 1336, "tokens_trained": 0.126977568 }, { "epoch": 0.37957446808510636, "grad_norm": 1.55423903465271, "loss": 5.3112, "lr": 0.000882937062937063, "step": 1338, "tokens_trained": 0.12716944 }, { "epoch": 0.3801418439716312, "grad_norm": 1.5783073902130127, "loss": 5.3002, "lr": 0.0008826573426573426, "step": 1340, "tokens_trained": 0.127357296 }, { "epoch": 0.38070921985815603, "grad_norm": 1.6970964670181274, "loss": 5.3003, "lr": 0.0008823776223776225, "step": 1342, "tokens_trained": 0.127547112 }, { "epoch": 0.38127659574468087, "grad_norm": 1.8086830377578735, "loss": 5.3018, "lr": 0.0008820979020979021, "step": 1344, "tokens_trained": 0.12773616 }, { "epoch": 0.38184397163120565, "grad_norm": 1.6589199304580688, "loss": 5.2903, "lr": 0.0008818181818181819, "step": 1346, "tokens_trained": 0.127924704 }, { "epoch": 0.3824113475177305, "grad_norm": 1.6546344757080078, "loss": 5.2639, "lr": 0.0008815384615384615, "step": 1348, "tokens_trained": 0.128114848 }, { "epoch": 0.3829787234042553, "grad_norm": 1.6867282390594482, "loss": 5.2713, "lr": 0.0008812587412587412, "step": 1350, "tokens_trained": 0.12830572 }, { "epoch": 0.38354609929078015, "grad_norm": 1.6336158514022827, "loss": 5.2688, "lr": 0.0008809790209790211, "step": 1352, "tokens_trained": 0.128497336 }, { "epoch": 0.384113475177305, "grad_norm": 1.591659665107727, "loss": 5.3073, "lr": 0.0008806993006993007, "step": 1354, "tokens_trained": 0.128689088 }, { "epoch": 0.38468085106382977, "grad_norm": 1.6427522897720337, "loss": 5.2649, "lr": 0.0008804195804195805, "step": 1356, "tokens_trained": 0.128879208 }, { "epoch": 0.3852482269503546, "grad_norm": 1.693124771118164, "loss": 5.334, "lr": 0.0008801398601398601, "step": 1358, "tokens_trained": 0.129069376 }, { "epoch": 0.38581560283687943, "grad_norm": 1.6677430868148804, "loss": 5.3091, "lr": 0.00087986013986014, "step": 1360, "tokens_trained": 0.12925972 }, { "epoch": 0.38638297872340427, "grad_norm": 1.6829359531402588, "loss": 5.3529, "lr": 0.0008795804195804196, "step": 1362, "tokens_trained": 0.129449816 }, { "epoch": 0.3869503546099291, "grad_norm": 1.6984829902648926, "loss": 5.2832, "lr": 0.0008793006993006993, "step": 1364, "tokens_trained": 0.129638736 }, { "epoch": 0.3875177304964539, "grad_norm": 1.6351298093795776, "loss": 5.3654, "lr": 0.000879020979020979, "step": 1366, "tokens_trained": 0.129831512 }, { "epoch": 0.3880851063829787, "grad_norm": 1.588394045829773, "loss": 5.3203, "lr": 0.0008787412587412587, "step": 1368, "tokens_trained": 0.130021424 }, { "epoch": 0.38865248226950355, "grad_norm": 1.7608240842819214, "loss": 5.3387, "lr": 0.0008784615384615386, "step": 1370, "tokens_trained": 0.130211848 }, { "epoch": 0.3892198581560284, "grad_norm": 1.7742120027542114, "loss": 5.3054, "lr": 0.0008781818181818182, "step": 1372, "tokens_trained": 0.130400256 }, { "epoch": 0.38978723404255317, "grad_norm": 1.8751057386398315, "loss": 5.3569, "lr": 0.000877902097902098, "step": 1374, "tokens_trained": 0.130591616 }, { "epoch": 0.3900709219858156, "eval_loss": 5.315512180328369, "eval_runtime": 20.9232, "step": 1375, "tokens_trained": 0.130685128 }, { "epoch": 0.390354609929078, "grad_norm": 1.8666746616363525, "loss": 5.3088, "lr": 0.0008776223776223776, "step": 1376, "tokens_trained": 0.130781056 }, { "epoch": 0.39092198581560283, "grad_norm": 1.8694190979003906, "loss": 5.2391, "lr": 0.0008773426573426574, "step": 1378, "tokens_trained": 0.130971152 }, { "epoch": 0.39148936170212767, "grad_norm": 2.0663323402404785, "loss": 5.3497, "lr": 0.0008770629370629371, "step": 1380, "tokens_trained": 0.131163224 }, { "epoch": 0.3920567375886525, "grad_norm": 1.956207275390625, "loss": 5.3227, "lr": 0.0008767832167832168, "step": 1382, "tokens_trained": 0.131353832 }, { "epoch": 0.3926241134751773, "grad_norm": 1.6816498041152954, "loss": 5.2626, "lr": 0.0008765034965034965, "step": 1384, "tokens_trained": 0.13154472 }, { "epoch": 0.3931914893617021, "grad_norm": 1.655116319656372, "loss": 5.3334, "lr": 0.0008762237762237762, "step": 1386, "tokens_trained": 0.131732128 }, { "epoch": 0.39375886524822695, "grad_norm": 1.6439241170883179, "loss": 5.3038, "lr": 0.0008759440559440561, "step": 1388, "tokens_trained": 0.131920728 }, { "epoch": 0.3943262411347518, "grad_norm": 1.5000464916229248, "loss": 5.2798, "lr": 0.0008756643356643357, "step": 1390, "tokens_trained": 0.1321094 }, { "epoch": 0.3948936170212766, "grad_norm": 1.7129333019256592, "loss": 5.2918, "lr": 0.0008753846153846154, "step": 1392, "tokens_trained": 0.132299832 }, { "epoch": 0.3954609929078014, "grad_norm": 1.7489241361618042, "loss": 5.3551, "lr": 0.0008751048951048951, "step": 1394, "tokens_trained": 0.13249016 }, { "epoch": 0.39602836879432624, "grad_norm": 1.6597840785980225, "loss": 5.3718, "lr": 0.0008748251748251749, "step": 1396, "tokens_trained": 0.132680568 }, { "epoch": 0.39659574468085107, "grad_norm": 1.8800175189971924, "loss": 5.3578, "lr": 0.0008745454545454546, "step": 1398, "tokens_trained": 0.132871296 }, { "epoch": 0.3971631205673759, "grad_norm": 1.8190884590148926, "loss": 5.2714, "lr": 0.0008742657342657343, "step": 1400, "tokens_trained": 0.133062288 }, { "epoch": 0.3977304964539007, "grad_norm": 1.602634310722351, "loss": 5.2914, "lr": 0.0008739860139860139, "step": 1402, "tokens_trained": 0.133252584 }, { "epoch": 0.3982978723404255, "grad_norm": 1.7363992929458618, "loss": 5.3154, "lr": 0.0008737062937062937, "step": 1404, "tokens_trained": 0.133444784 }, { "epoch": 0.39886524822695035, "grad_norm": 1.7578014135360718, "loss": 5.3735, "lr": 0.0008734265734265734, "step": 1406, "tokens_trained": 0.133636288 }, { "epoch": 0.3994326241134752, "grad_norm": 1.8847187757492065, "loss": 5.3118, "lr": 0.0008731468531468532, "step": 1408, "tokens_trained": 0.133825824 }, { "epoch": 0.4, "grad_norm": 1.750780701637268, "loss": 5.3101, "lr": 0.0008728671328671329, "step": 1410, "tokens_trained": 0.134016688 }, { "epoch": 0.4005673758865248, "grad_norm": 1.6785613298416138, "loss": 5.2823, "lr": 0.0008725874125874126, "step": 1412, "tokens_trained": 0.134208992 }, { "epoch": 0.40113475177304964, "grad_norm": 1.7419382333755493, "loss": 5.2388, "lr": 0.0008723076923076924, "step": 1414, "tokens_trained": 0.134398376 }, { "epoch": 0.40170212765957447, "grad_norm": 1.6936920881271362, "loss": 5.2824, "lr": 0.000872027972027972, "step": 1416, "tokens_trained": 0.134589264 }, { "epoch": 0.4022695035460993, "grad_norm": 1.7408183813095093, "loss": 5.2692, "lr": 0.0008717482517482518, "step": 1418, "tokens_trained": 0.134776568 }, { "epoch": 0.40283687943262414, "grad_norm": 1.7089916467666626, "loss": 5.2309, "lr": 0.0008714685314685314, "step": 1420, "tokens_trained": 0.134967616 }, { "epoch": 0.4034042553191489, "grad_norm": 1.6850922107696533, "loss": 5.3656, "lr": 0.0008711888111888112, "step": 1422, "tokens_trained": 0.135158272 }, { "epoch": 0.40397163120567375, "grad_norm": 1.546431064605713, "loss": 5.3455, "lr": 0.0008709090909090909, "step": 1424, "tokens_trained": 0.135349512 }, { "epoch": 0.4045390070921986, "grad_norm": 1.3656421899795532, "loss": 5.2842, "lr": 0.0008706293706293707, "step": 1426, "tokens_trained": 0.135538512 }, { "epoch": 0.4051063829787234, "grad_norm": 1.5918062925338745, "loss": 5.3243, "lr": 0.0008703496503496504, "step": 1428, "tokens_trained": 0.13572968 }, { "epoch": 0.4056737588652482, "grad_norm": 1.563009262084961, "loss": 5.2539, "lr": 0.00087006993006993, "step": 1430, "tokens_trained": 0.135919568 }, { "epoch": 0.40624113475177304, "grad_norm": 1.6144121885299683, "loss": 5.2844, "lr": 0.0008697902097902099, "step": 1432, "tokens_trained": 0.136109304 }, { "epoch": 0.40680851063829787, "grad_norm": 1.5911130905151367, "loss": 5.3205, "lr": 0.0008695104895104895, "step": 1434, "tokens_trained": 0.136296696 }, { "epoch": 0.4073758865248227, "grad_norm": 1.60932457447052, "loss": 5.3783, "lr": 0.0008692307692307693, "step": 1436, "tokens_trained": 0.136484912 }, { "epoch": 0.40794326241134754, "grad_norm": 1.559644341468811, "loss": 5.2785, "lr": 0.0008689510489510489, "step": 1438, "tokens_trained": 0.136675736 }, { "epoch": 0.4085106382978723, "grad_norm": 1.5167043209075928, "loss": 5.3224, "lr": 0.0008686713286713287, "step": 1440, "tokens_trained": 0.136864928 }, { "epoch": 0.40907801418439715, "grad_norm": 1.5843397378921509, "loss": 5.3075, "lr": 0.0008683916083916084, "step": 1442, "tokens_trained": 0.137056688 }, { "epoch": 0.409645390070922, "grad_norm": 1.581120491027832, "loss": 5.2863, "lr": 0.0008681118881118881, "step": 1444, "tokens_trained": 0.137244664 }, { "epoch": 0.4102127659574468, "grad_norm": 1.6355490684509277, "loss": 5.348, "lr": 0.0008678321678321679, "step": 1446, "tokens_trained": 0.13743372 }, { "epoch": 0.41078014184397166, "grad_norm": 1.5543185472488403, "loss": 5.3268, "lr": 0.0008675524475524475, "step": 1448, "tokens_trained": 0.13762696 }, { "epoch": 0.41134751773049644, "grad_norm": 1.5313750505447388, "loss": 5.2784, "lr": 0.0008672727272727273, "step": 1450, "tokens_trained": 0.137817376 }, { "epoch": 0.41191489361702127, "grad_norm": 1.7918111085891724, "loss": 5.3063, "lr": 0.000866993006993007, "step": 1452, "tokens_trained": 0.138007944 }, { "epoch": 0.4124822695035461, "grad_norm": 1.5105966329574585, "loss": 5.2432, "lr": 0.0008667132867132868, "step": 1454, "tokens_trained": 0.138199776 }, { "epoch": 0.41304964539007094, "grad_norm": 1.4441865682601929, "loss": 5.269, "lr": 0.0008664335664335664, "step": 1456, "tokens_trained": 0.13839124 }, { "epoch": 0.4136170212765957, "grad_norm": 1.473544955253601, "loss": 5.2377, "lr": 0.0008661538461538461, "step": 1458, "tokens_trained": 0.138580704 }, { "epoch": 0.41418439716312055, "grad_norm": 1.6085572242736816, "loss": 5.245, "lr": 0.0008658741258741259, "step": 1460, "tokens_trained": 0.138770176 }, { "epoch": 0.4147517730496454, "grad_norm": 1.609894871711731, "loss": 5.3124, "lr": 0.0008655944055944056, "step": 1462, "tokens_trained": 0.138961656 }, { "epoch": 0.4153191489361702, "grad_norm": 1.6923688650131226, "loss": 5.3099, "lr": 0.0008653146853146854, "step": 1464, "tokens_trained": 0.139151128 }, { "epoch": 0.41588652482269506, "grad_norm": 1.7480796575546265, "loss": 5.2608, "lr": 0.000865034965034965, "step": 1466, "tokens_trained": 0.139341168 }, { "epoch": 0.41645390070921984, "grad_norm": 1.725832223892212, "loss": 5.2863, "lr": 0.0008647552447552448, "step": 1468, "tokens_trained": 0.139530448 }, { "epoch": 0.41702127659574467, "grad_norm": 1.7886406183242798, "loss": 5.231, "lr": 0.0008644755244755245, "step": 1470, "tokens_trained": 0.13972244 }, { "epoch": 0.4175886524822695, "grad_norm": 1.803231954574585, "loss": 5.2428, "lr": 0.0008641958041958042, "step": 1472, "tokens_trained": 0.139913136 }, { "epoch": 0.41815602836879434, "grad_norm": 1.5347254276275635, "loss": 5.2215, "lr": 0.0008639160839160839, "step": 1474, "tokens_trained": 0.140104072 }, { "epoch": 0.4187234042553192, "grad_norm": 1.4485915899276733, "loss": 5.2364, "lr": 0.0008636363636363636, "step": 1476, "tokens_trained": 0.140294312 }, { "epoch": 0.41929078014184396, "grad_norm": 1.6130446195602417, "loss": 5.3088, "lr": 0.0008633566433566434, "step": 1478, "tokens_trained": 0.140482968 }, { "epoch": 0.4198581560283688, "grad_norm": 1.5839030742645264, "loss": 5.3215, "lr": 0.0008630769230769231, "step": 1480, "tokens_trained": 0.140674208 }, { "epoch": 0.4204255319148936, "grad_norm": 1.7519373893737793, "loss": 5.3331, "lr": 0.0008627972027972029, "step": 1482, "tokens_trained": 0.140864408 }, { "epoch": 0.42099290780141846, "grad_norm": 1.6718385219573975, "loss": 5.231, "lr": 0.0008625174825174825, "step": 1484, "tokens_trained": 0.141054696 }, { "epoch": 0.42156028368794324, "grad_norm": 1.5733797550201416, "loss": 5.2621, "lr": 0.0008622377622377622, "step": 1486, "tokens_trained": 0.141245712 }, { "epoch": 0.4221276595744681, "grad_norm": 1.549985647201538, "loss": 5.2574, "lr": 0.000861958041958042, "step": 1488, "tokens_trained": 0.141434232 }, { "epoch": 0.4226950354609929, "grad_norm": 1.651908278465271, "loss": 5.2953, "lr": 0.0008616783216783217, "step": 1490, "tokens_trained": 0.141623936 }, { "epoch": 0.42326241134751774, "grad_norm": 1.5680350065231323, "loss": 5.288, "lr": 0.0008613986013986014, "step": 1492, "tokens_trained": 0.141813904 }, { "epoch": 0.4238297872340426, "grad_norm": 1.5155646800994873, "loss": 5.2529, "lr": 0.0008611188811188811, "step": 1494, "tokens_trained": 0.14200372 }, { "epoch": 0.42439716312056736, "grad_norm": 1.5949562788009644, "loss": 5.3064, "lr": 0.0008608391608391609, "step": 1496, "tokens_trained": 0.142194496 }, { "epoch": 0.4249645390070922, "grad_norm": 1.6359357833862305, "loss": 5.3452, "lr": 0.0008605594405594406, "step": 1498, "tokens_trained": 0.142384592 }, { "epoch": 0.425531914893617, "grad_norm": 1.648120403289795, "loss": 5.3427, "lr": 0.0008602797202797203, "step": 1500, "tokens_trained": 0.142573368 }, { "epoch": 0.425531914893617, "eval_loss": 5.282389163970947, "eval_runtime": 20.5657, "step": 1500, "tokens_trained": 0.142573368 }, { "epoch": 0.42609929078014186, "grad_norm": 1.6313989162445068, "loss": 5.2442, "lr": 0.00086, "step": 1502, "tokens_trained": 0.142764584 }, { "epoch": 0.4266666666666667, "grad_norm": 1.447824239730835, "loss": 5.2979, "lr": 0.0008597202797202797, "step": 1504, "tokens_trained": 0.142953912 }, { "epoch": 0.4272340425531915, "grad_norm": 1.4285600185394287, "loss": 5.317, "lr": 0.0008594405594405595, "step": 1506, "tokens_trained": 0.143145944 }, { "epoch": 0.4278014184397163, "grad_norm": 1.4464077949523926, "loss": 5.2746, "lr": 0.0008591608391608392, "step": 1508, "tokens_trained": 0.1433374 }, { "epoch": 0.42836879432624114, "grad_norm": 1.3554625511169434, "loss": 5.276, "lr": 0.0008588811188811188, "step": 1510, "tokens_trained": 0.143529088 }, { "epoch": 0.428936170212766, "grad_norm": 1.4690148830413818, "loss": 5.2976, "lr": 0.0008586013986013986, "step": 1512, "tokens_trained": 0.1437192 }, { "epoch": 0.42950354609929076, "grad_norm": 1.4911222457885742, "loss": 5.2727, "lr": 0.0008583216783216783, "step": 1514, "tokens_trained": 0.143907728 }, { "epoch": 0.4300709219858156, "grad_norm": 1.5823880434036255, "loss": 5.2481, "lr": 0.0008580419580419581, "step": 1516, "tokens_trained": 0.144097048 }, { "epoch": 0.4306382978723404, "grad_norm": 1.486588716506958, "loss": 5.2561, "lr": 0.0008577622377622378, "step": 1518, "tokens_trained": 0.14428652 }, { "epoch": 0.43120567375886526, "grad_norm": 1.5762882232666016, "loss": 5.267, "lr": 0.0008574825174825175, "step": 1520, "tokens_trained": 0.144476848 }, { "epoch": 0.4317730496453901, "grad_norm": 1.6832828521728516, "loss": 5.3329, "lr": 0.0008572027972027972, "step": 1522, "tokens_trained": 0.144667568 }, { "epoch": 0.4323404255319149, "grad_norm": 1.7036137580871582, "loss": 5.2326, "lr": 0.000856923076923077, "step": 1524, "tokens_trained": 0.144860328 }, { "epoch": 0.4329078014184397, "grad_norm": 1.8102291822433472, "loss": 5.251, "lr": 0.0008566433566433567, "step": 1526, "tokens_trained": 0.1450528 }, { "epoch": 0.43347517730496454, "grad_norm": 1.667229413986206, "loss": 5.2841, "lr": 0.0008563636363636363, "step": 1528, "tokens_trained": 0.145240952 }, { "epoch": 0.4340425531914894, "grad_norm": 1.6709800958633423, "loss": 5.2387, "lr": 0.0008560839160839161, "step": 1530, "tokens_trained": 0.145431376 }, { "epoch": 0.4346099290780142, "grad_norm": 1.600885272026062, "loss": 5.2179, "lr": 0.0008558041958041958, "step": 1532, "tokens_trained": 0.145620184 }, { "epoch": 0.435177304964539, "grad_norm": 1.5783873796463013, "loss": 5.2432, "lr": 0.0008555244755244756, "step": 1534, "tokens_trained": 0.145810616 }, { "epoch": 0.4357446808510638, "grad_norm": 1.5059685707092285, "loss": 5.2604, "lr": 0.0008552447552447553, "step": 1536, "tokens_trained": 0.14600232 }, { "epoch": 0.43631205673758866, "grad_norm": 1.5880341529846191, "loss": 5.249, "lr": 0.000854965034965035, "step": 1538, "tokens_trained": 0.146192504 }, { "epoch": 0.4368794326241135, "grad_norm": 1.430004596710205, "loss": 5.2668, "lr": 0.0008546853146853147, "step": 1540, "tokens_trained": 0.146382264 }, { "epoch": 0.4374468085106383, "grad_norm": 1.4099256992340088, "loss": 5.2839, "lr": 0.0008544055944055944, "step": 1542, "tokens_trained": 0.146570432 }, { "epoch": 0.4380141843971631, "grad_norm": 1.3938827514648438, "loss": 5.2534, "lr": 0.0008541258741258742, "step": 1544, "tokens_trained": 0.146763736 }, { "epoch": 0.43858156028368794, "grad_norm": 1.4359923601150513, "loss": 5.2202, "lr": 0.0008538461538461538, "step": 1546, "tokens_trained": 0.146953944 }, { "epoch": 0.4391489361702128, "grad_norm": 1.5405043363571167, "loss": 5.2613, "lr": 0.0008535664335664336, "step": 1548, "tokens_trained": 0.147144664 }, { "epoch": 0.4397163120567376, "grad_norm": 1.6448051929473877, "loss": 5.299, "lr": 0.0008532867132867133, "step": 1550, "tokens_trained": 0.147335064 }, { "epoch": 0.4402836879432624, "grad_norm": 1.6528949737548828, "loss": 5.3004, "lr": 0.000853006993006993, "step": 1552, "tokens_trained": 0.147524088 }, { "epoch": 0.4408510638297872, "grad_norm": 1.637702226638794, "loss": 5.2298, "lr": 0.0008527272727272728, "step": 1554, "tokens_trained": 0.147716296 }, { "epoch": 0.44141843971631206, "grad_norm": 1.7230212688446045, "loss": 5.2806, "lr": 0.0008524475524475524, "step": 1556, "tokens_trained": 0.147905216 }, { "epoch": 0.4419858156028369, "grad_norm": 1.6216089725494385, "loss": 5.3062, "lr": 0.0008521678321678322, "step": 1558, "tokens_trained": 0.148092312 }, { "epoch": 0.4425531914893617, "grad_norm": 1.5734955072402954, "loss": 5.2607, "lr": 0.0008518881118881119, "step": 1560, "tokens_trained": 0.148282712 }, { "epoch": 0.4431205673758865, "grad_norm": 1.6687103509902954, "loss": 5.2737, "lr": 0.0008516083916083917, "step": 1562, "tokens_trained": 0.148474672 }, { "epoch": 0.44368794326241134, "grad_norm": 1.547277569770813, "loss": 5.3183, "lr": 0.0008513286713286713, "step": 1564, "tokens_trained": 0.148667824 }, { "epoch": 0.4442553191489362, "grad_norm": 1.3782074451446533, "loss": 5.266, "lr": 0.000851048951048951, "step": 1566, "tokens_trained": 0.14885704 }, { "epoch": 0.444822695035461, "grad_norm": 1.5648273229599, "loss": 5.2954, "lr": 0.0008507692307692308, "step": 1568, "tokens_trained": 0.14904804 }, { "epoch": 0.4453900709219858, "grad_norm": 1.5675908327102661, "loss": 5.2897, "lr": 0.0008504895104895105, "step": 1570, "tokens_trained": 0.149237048 }, { "epoch": 0.4459574468085106, "grad_norm": 1.5399287939071655, "loss": 5.2993, "lr": 0.0008502097902097903, "step": 1572, "tokens_trained": 0.149427328 }, { "epoch": 0.44652482269503546, "grad_norm": 1.7170253992080688, "loss": 5.2756, "lr": 0.0008499300699300699, "step": 1574, "tokens_trained": 0.149618448 }, { "epoch": 0.4470921985815603, "grad_norm": 1.5694142580032349, "loss": 5.2294, "lr": 0.0008496503496503497, "step": 1576, "tokens_trained": 0.149809416 }, { "epoch": 0.44765957446808513, "grad_norm": 1.5410487651824951, "loss": 5.2392, "lr": 0.0008493706293706294, "step": 1578, "tokens_trained": 0.149999608 }, { "epoch": 0.4482269503546099, "grad_norm": 1.5991896390914917, "loss": 5.2569, "lr": 0.0008490909090909091, "step": 1580, "tokens_trained": 0.150190224 }, { "epoch": 0.44879432624113474, "grad_norm": 1.5861775875091553, "loss": 5.3151, "lr": 0.0008488111888111888, "step": 1582, "tokens_trained": 0.150380592 }, { "epoch": 0.4493617021276596, "grad_norm": 1.530462622642517, "loss": 5.3242, "lr": 0.0008485314685314685, "step": 1584, "tokens_trained": 0.15056992 }, { "epoch": 0.4499290780141844, "grad_norm": 1.5658655166625977, "loss": 5.2933, "lr": 0.0008482517482517483, "step": 1586, "tokens_trained": 0.150760336 }, { "epoch": 0.4504964539007092, "grad_norm": 1.4187430143356323, "loss": 5.2235, "lr": 0.000847972027972028, "step": 1588, "tokens_trained": 0.150949088 }, { "epoch": 0.451063829787234, "grad_norm": 1.6921541690826416, "loss": 5.2496, "lr": 0.0008476923076923078, "step": 1590, "tokens_trained": 0.151140016 }, { "epoch": 0.45163120567375886, "grad_norm": 1.6049220561981201, "loss": 5.2767, "lr": 0.0008474125874125874, "step": 1592, "tokens_trained": 0.151330944 }, { "epoch": 0.4521985815602837, "grad_norm": 1.513168454170227, "loss": 5.2904, "lr": 0.0008471328671328671, "step": 1594, "tokens_trained": 0.151520152 }, { "epoch": 0.45276595744680853, "grad_norm": 1.5247087478637695, "loss": 5.2391, "lr": 0.0008468531468531469, "step": 1596, "tokens_trained": 0.151711592 }, { "epoch": 0.4533333333333333, "grad_norm": 1.5005898475646973, "loss": 5.3025, "lr": 0.0008465734265734266, "step": 1598, "tokens_trained": 0.151902736 }, { "epoch": 0.45390070921985815, "grad_norm": 1.3196156024932861, "loss": 5.3025, "lr": 0.0008462937062937063, "step": 1600, "tokens_trained": 0.152094032 }, { "epoch": 0.454468085106383, "grad_norm": 1.5037102699279785, "loss": 5.2348, "lr": 0.000846013986013986, "step": 1602, "tokens_trained": 0.15228336 }, { "epoch": 0.4550354609929078, "grad_norm": 1.404539942741394, "loss": 5.2551, "lr": 0.0008457342657342658, "step": 1604, "tokens_trained": 0.152474776 }, { "epoch": 0.45560283687943265, "grad_norm": 1.4784883260726929, "loss": 5.2927, "lr": 0.0008454545454545455, "step": 1606, "tokens_trained": 0.152663392 }, { "epoch": 0.45617021276595743, "grad_norm": 1.3743332624435425, "loss": 5.2542, "lr": 0.0008451748251748252, "step": 1608, "tokens_trained": 0.152852512 }, { "epoch": 0.45673758865248226, "grad_norm": 1.4161995649337769, "loss": 5.2518, "lr": 0.0008448951048951049, "step": 1610, "tokens_trained": 0.15304428 }, { "epoch": 0.4573049645390071, "grad_norm": 1.5045989751815796, "loss": 5.2735, "lr": 0.0008446153846153846, "step": 1612, "tokens_trained": 0.153234632 }, { "epoch": 0.45787234042553193, "grad_norm": 1.3695783615112305, "loss": 5.2294, "lr": 0.0008443356643356644, "step": 1614, "tokens_trained": 0.1534248 }, { "epoch": 0.4584397163120567, "grad_norm": 1.4551646709442139, "loss": 5.2639, "lr": 0.0008440559440559441, "step": 1616, "tokens_trained": 0.153614944 }, { "epoch": 0.45900709219858155, "grad_norm": 1.5018376111984253, "loss": 5.2989, "lr": 0.0008437762237762238, "step": 1618, "tokens_trained": 0.153803784 }, { "epoch": 0.4595744680851064, "grad_norm": 1.5295960903167725, "loss": 5.33, "lr": 0.0008434965034965035, "step": 1620, "tokens_trained": 0.153993752 }, { "epoch": 0.4601418439716312, "grad_norm": 1.417626142501831, "loss": 5.2134, "lr": 0.0008432167832167832, "step": 1622, "tokens_trained": 0.154184448 }, { "epoch": 0.46070921985815605, "grad_norm": 1.5715348720550537, "loss": 5.2782, "lr": 0.000842937062937063, "step": 1624, "tokens_trained": 0.154373632 }, { "epoch": 0.46099290780141844, "eval_loss": 5.266384601593018, "eval_runtime": 21.0916, "step": 1625, "tokens_trained": 0.154468808 }, { "epoch": 0.46127659574468083, "grad_norm": 1.5504534244537354, "loss": 5.2307, "lr": 0.0008426573426573427, "step": 1626, "tokens_trained": 0.154564864 }, { "epoch": 0.46184397163120566, "grad_norm": 1.483108401298523, "loss": 5.2578, "lr": 0.0008423776223776224, "step": 1628, "tokens_trained": 0.154755312 }, { "epoch": 0.4624113475177305, "grad_norm": 1.5631264448165894, "loss": 5.3291, "lr": 0.0008420979020979021, "step": 1630, "tokens_trained": 0.154943736 }, { "epoch": 0.46297872340425533, "grad_norm": 1.4680705070495605, "loss": 5.2256, "lr": 0.0008418181818181819, "step": 1632, "tokens_trained": 0.15513452 }, { "epoch": 0.46354609929078017, "grad_norm": 1.468338966369629, "loss": 5.2712, "lr": 0.0008415384615384616, "step": 1634, "tokens_trained": 0.155325288 }, { "epoch": 0.46411347517730495, "grad_norm": 1.4557780027389526, "loss": 5.2808, "lr": 0.0008412587412587412, "step": 1636, "tokens_trained": 0.155515328 }, { "epoch": 0.4646808510638298, "grad_norm": 1.4534999132156372, "loss": 5.2707, "lr": 0.000840979020979021, "step": 1638, "tokens_trained": 0.155706752 }, { "epoch": 0.4652482269503546, "grad_norm": 1.4011393785476685, "loss": 5.3028, "lr": 0.0008406993006993006, "step": 1640, "tokens_trained": 0.155895336 }, { "epoch": 0.46581560283687945, "grad_norm": 1.307922601699829, "loss": 5.2188, "lr": 0.0008404195804195805, "step": 1642, "tokens_trained": 0.156085936 }, { "epoch": 0.46638297872340423, "grad_norm": 1.359922170639038, "loss": 5.2863, "lr": 0.0008401398601398602, "step": 1644, "tokens_trained": 0.15627636 }, { "epoch": 0.46695035460992906, "grad_norm": 1.6204577684402466, "loss": 5.2877, "lr": 0.0008398601398601399, "step": 1646, "tokens_trained": 0.156465192 }, { "epoch": 0.4675177304964539, "grad_norm": 1.7367322444915771, "loss": 5.2501, "lr": 0.0008395804195804196, "step": 1648, "tokens_trained": 0.15665336 }, { "epoch": 0.46808510638297873, "grad_norm": 1.7013088464736938, "loss": 5.2522, "lr": 0.0008393006993006993, "step": 1650, "tokens_trained": 0.156843128 }, { "epoch": 0.46865248226950357, "grad_norm": 1.6429578065872192, "loss": 5.2978, "lr": 0.0008390209790209791, "step": 1652, "tokens_trained": 0.157034328 }, { "epoch": 0.46921985815602835, "grad_norm": 1.527243733406067, "loss": 5.2384, "lr": 0.0008387412587412587, "step": 1654, "tokens_trained": 0.157222784 }, { "epoch": 0.4697872340425532, "grad_norm": 1.4792861938476562, "loss": 5.2149, "lr": 0.0008384615384615385, "step": 1656, "tokens_trained": 0.15741308 }, { "epoch": 0.470354609929078, "grad_norm": 1.4050098657608032, "loss": 5.229, "lr": 0.0008381818181818181, "step": 1658, "tokens_trained": 0.157603872 }, { "epoch": 0.47092198581560285, "grad_norm": 1.4799182415008545, "loss": 5.2235, "lr": 0.000837902097902098, "step": 1660, "tokens_trained": 0.157793352 }, { "epoch": 0.4714893617021277, "grad_norm": 1.4031378030776978, "loss": 5.23, "lr": 0.0008376223776223776, "step": 1662, "tokens_trained": 0.157984416 }, { "epoch": 0.47205673758865246, "grad_norm": 1.5775604248046875, "loss": 5.2811, "lr": 0.0008373426573426573, "step": 1664, "tokens_trained": 0.158176048 }, { "epoch": 0.4726241134751773, "grad_norm": 1.4855432510375977, "loss": 5.2363, "lr": 0.0008370629370629371, "step": 1666, "tokens_trained": 0.158368152 }, { "epoch": 0.47319148936170213, "grad_norm": 1.5609453916549683, "loss": 5.2984, "lr": 0.0008367832167832168, "step": 1668, "tokens_trained": 0.15855684 }, { "epoch": 0.47375886524822697, "grad_norm": 1.5052629709243774, "loss": 5.213, "lr": 0.0008365034965034966, "step": 1670, "tokens_trained": 0.15874712 }, { "epoch": 0.47432624113475175, "grad_norm": 1.5655242204666138, "loss": 5.2551, "lr": 0.0008362237762237762, "step": 1672, "tokens_trained": 0.158937104 }, { "epoch": 0.4748936170212766, "grad_norm": 1.301142930984497, "loss": 5.1564, "lr": 0.000835944055944056, "step": 1674, "tokens_trained": 0.159128096 }, { "epoch": 0.4754609929078014, "grad_norm": 1.5447527170181274, "loss": 5.2547, "lr": 0.0008356643356643356, "step": 1676, "tokens_trained": 0.159318968 }, { "epoch": 0.47602836879432625, "grad_norm": 1.638100266456604, "loss": 5.2301, "lr": 0.0008353846153846154, "step": 1678, "tokens_trained": 0.159508648 }, { "epoch": 0.4765957446808511, "grad_norm": 1.6203068494796753, "loss": 5.2644, "lr": 0.0008351048951048951, "step": 1680, "tokens_trained": 0.159698648 }, { "epoch": 0.47716312056737586, "grad_norm": 1.4097110033035278, "loss": 5.2047, "lr": 0.0008348251748251748, "step": 1682, "tokens_trained": 0.159887392 }, { "epoch": 0.4777304964539007, "grad_norm": 1.3377385139465332, "loss": 5.2685, "lr": 0.0008345454545454546, "step": 1684, "tokens_trained": 0.160076904 }, { "epoch": 0.47829787234042553, "grad_norm": 1.4079371690750122, "loss": 5.2842, "lr": 0.0008342657342657343, "step": 1686, "tokens_trained": 0.160266712 }, { "epoch": 0.47886524822695037, "grad_norm": 1.6039987802505493, "loss": 5.2248, "lr": 0.0008339860139860141, "step": 1688, "tokens_trained": 0.160455464 }, { "epoch": 0.4794326241134752, "grad_norm": 1.639218807220459, "loss": 5.2007, "lr": 0.0008337062937062937, "step": 1690, "tokens_trained": 0.16064472 }, { "epoch": 0.48, "grad_norm": 1.8226710557937622, "loss": 5.2427, "lr": 0.0008334265734265734, "step": 1692, "tokens_trained": 0.160835192 }, { "epoch": 0.4805673758865248, "grad_norm": 1.6480419635772705, "loss": 5.1944, "lr": 0.0008331468531468531, "step": 1694, "tokens_trained": 0.161025272 }, { "epoch": 0.48113475177304965, "grad_norm": 1.666717290878296, "loss": 5.2879, "lr": 0.0008328671328671329, "step": 1696, "tokens_trained": 0.161214016 }, { "epoch": 0.4817021276595745, "grad_norm": 1.5092660188674927, "loss": 5.2612, "lr": 0.0008325874125874126, "step": 1698, "tokens_trained": 0.161405448 }, { "epoch": 0.48226950354609927, "grad_norm": 1.4042121171951294, "loss": 5.2373, "lr": 0.0008323076923076923, "step": 1700, "tokens_trained": 0.161595896 }, { "epoch": 0.4828368794326241, "grad_norm": 1.4937382936477661, "loss": 5.2172, "lr": 0.000832027972027972, "step": 1702, "tokens_trained": 0.161783904 }, { "epoch": 0.48340425531914893, "grad_norm": 1.4652959108352661, "loss": 5.2704, "lr": 0.0008317482517482518, "step": 1704, "tokens_trained": 0.161975888 }, { "epoch": 0.48397163120567377, "grad_norm": 1.3021745681762695, "loss": 5.2672, "lr": 0.0008314685314685315, "step": 1706, "tokens_trained": 0.162165808 }, { "epoch": 0.4845390070921986, "grad_norm": 1.3580701351165771, "loss": 5.2467, "lr": 0.0008311888111888112, "step": 1708, "tokens_trained": 0.162355152 }, { "epoch": 0.4851063829787234, "grad_norm": 1.480072259902954, "loss": 5.2797, "lr": 0.0008309090909090909, "step": 1710, "tokens_trained": 0.162544744 }, { "epoch": 0.4856737588652482, "grad_norm": 1.3532829284667969, "loss": 5.2556, "lr": 0.0008306293706293706, "step": 1712, "tokens_trained": 0.162734976 }, { "epoch": 0.48624113475177305, "grad_norm": 1.240332007408142, "loss": 5.2153, "lr": 0.0008303496503496504, "step": 1714, "tokens_trained": 0.162924992 }, { "epoch": 0.4868085106382979, "grad_norm": 1.4141086339950562, "loss": 5.2056, "lr": 0.00083006993006993, "step": 1716, "tokens_trained": 0.163114008 }, { "epoch": 0.4873758865248227, "grad_norm": 1.321721076965332, "loss": 5.2223, "lr": 0.0008297902097902098, "step": 1718, "tokens_trained": 0.163304416 }, { "epoch": 0.4879432624113475, "grad_norm": 1.5437248945236206, "loss": 5.2727, "lr": 0.0008295104895104895, "step": 1720, "tokens_trained": 0.163493816 }, { "epoch": 0.48851063829787233, "grad_norm": 1.7218859195709229, "loss": 5.2323, "lr": 0.0008292307692307693, "step": 1722, "tokens_trained": 0.163683984 }, { "epoch": 0.48907801418439717, "grad_norm": 1.5534045696258545, "loss": 5.1983, "lr": 0.000828951048951049, "step": 1724, "tokens_trained": 0.163874968 }, { "epoch": 0.489645390070922, "grad_norm": 1.3675404787063599, "loss": 5.2086, "lr": 0.0008286713286713287, "step": 1726, "tokens_trained": 0.164065152 }, { "epoch": 0.4902127659574468, "grad_norm": 1.5178970098495483, "loss": 5.2529, "lr": 0.0008283916083916084, "step": 1728, "tokens_trained": 0.164255952 }, { "epoch": 0.4907801418439716, "grad_norm": 1.4910545349121094, "loss": 5.2931, "lr": 0.000828111888111888, "step": 1730, "tokens_trained": 0.164447112 }, { "epoch": 0.49134751773049645, "grad_norm": 1.5647637844085693, "loss": 5.2603, "lr": 0.0008278321678321679, "step": 1732, "tokens_trained": 0.16463704 }, { "epoch": 0.4919148936170213, "grad_norm": 1.4607906341552734, "loss": 5.2702, "lr": 0.0008275524475524475, "step": 1734, "tokens_trained": 0.164827312 }, { "epoch": 0.4924822695035461, "grad_norm": 1.5806026458740234, "loss": 5.2356, "lr": 0.0008272727272727273, "step": 1736, "tokens_trained": 0.165015224 }, { "epoch": 0.4930496453900709, "grad_norm": 1.5417263507843018, "loss": 5.262, "lr": 0.000826993006993007, "step": 1738, "tokens_trained": 0.16520484 }, { "epoch": 0.49361702127659574, "grad_norm": 1.511680245399475, "loss": 5.2634, "lr": 0.0008267132867132868, "step": 1740, "tokens_trained": 0.165393064 }, { "epoch": 0.49418439716312057, "grad_norm": 1.4468717575073242, "loss": 5.2452, "lr": 0.0008264335664335665, "step": 1742, "tokens_trained": 0.165584472 }, { "epoch": 0.4947517730496454, "grad_norm": 1.423187017440796, "loss": 5.2533, "lr": 0.0008261538461538461, "step": 1744, "tokens_trained": 0.165773768 }, { "epoch": 0.49531914893617024, "grad_norm": 1.512462854385376, "loss": 5.2152, "lr": 0.0008258741258741259, "step": 1746, "tokens_trained": 0.165963456 }, { "epoch": 0.495886524822695, "grad_norm": 1.4620780944824219, "loss": 5.2511, "lr": 0.0008255944055944055, "step": 1748, "tokens_trained": 0.166152136 }, { "epoch": 0.49645390070921985, "grad_norm": 1.4943009614944458, "loss": 5.2829, "lr": 0.0008253146853146854, "step": 1750, "tokens_trained": 0.16634248 }, { "epoch": 0.49645390070921985, "eval_loss": 5.23966646194458, "eval_runtime": 20.5954, "step": 1750, "tokens_trained": 0.16634248 }, { "epoch": 0.4970212765957447, "grad_norm": 1.6739267110824585, "loss": 5.2306, "lr": 0.000825034965034965, "step": 1752, "tokens_trained": 0.166532864 }, { "epoch": 0.4975886524822695, "grad_norm": 1.6125763654708862, "loss": 5.2845, "lr": 0.0008247552447552448, "step": 1754, "tokens_trained": 0.166722944 }, { "epoch": 0.4981560283687943, "grad_norm": 1.5872310400009155, "loss": 5.2075, "lr": 0.0008244755244755245, "step": 1756, "tokens_trained": 0.16691184 }, { "epoch": 0.49872340425531914, "grad_norm": 1.4396610260009766, "loss": 5.2532, "lr": 0.0008241958041958042, "step": 1758, "tokens_trained": 0.167101896 }, { "epoch": 0.49929078014184397, "grad_norm": 1.363879680633545, "loss": 5.2252, "lr": 0.000823916083916084, "step": 1760, "tokens_trained": 0.167289384 }, { "epoch": 0.4998581560283688, "grad_norm": 1.395561695098877, "loss": 5.2097, "lr": 0.0008236363636363636, "step": 1762, "tokens_trained": 0.167479424 }, { "epoch": 0.5004255319148936, "grad_norm": 1.413736343383789, "loss": 5.2283, "lr": 0.0008233566433566434, "step": 1764, "tokens_trained": 0.167668256 }, { "epoch": 0.5009929078014185, "grad_norm": 1.4240859746932983, "loss": 5.2574, "lr": 0.000823076923076923, "step": 1766, "tokens_trained": 0.167858616 }, { "epoch": 0.5015602836879433, "grad_norm": 1.437165379524231, "loss": 5.2511, "lr": 0.0008227972027972029, "step": 1768, "tokens_trained": 0.168048272 }, { "epoch": 0.502127659574468, "grad_norm": 1.458575963973999, "loss": 5.2183, "lr": 0.0008225174825174825, "step": 1770, "tokens_trained": 0.168240184 }, { "epoch": 0.5026950354609929, "grad_norm": 1.5224673748016357, "loss": 5.259, "lr": 0.0008222377622377622, "step": 1772, "tokens_trained": 0.168429536 }, { "epoch": 0.5032624113475177, "grad_norm": 1.578438401222229, "loss": 5.2108, "lr": 0.000821958041958042, "step": 1774, "tokens_trained": 0.168619312 }, { "epoch": 0.5038297872340426, "grad_norm": 1.4880632162094116, "loss": 5.229, "lr": 0.0008216783216783217, "step": 1776, "tokens_trained": 0.168808344 }, { "epoch": 0.5043971631205674, "grad_norm": 1.3741049766540527, "loss": 5.2873, "lr": 0.0008213986013986015, "step": 1778, "tokens_trained": 0.168999112 }, { "epoch": 0.5049645390070922, "grad_norm": 1.4396610260009766, "loss": 5.3237, "lr": 0.0008211188811188811, "step": 1780, "tokens_trained": 0.169189288 }, { "epoch": 0.505531914893617, "grad_norm": 1.4296880960464478, "loss": 5.2228, "lr": 0.0008208391608391609, "step": 1782, "tokens_trained": 0.16937864 }, { "epoch": 0.5060992907801418, "grad_norm": 1.5704258680343628, "loss": 5.2569, "lr": 0.0008205594405594405, "step": 1784, "tokens_trained": 0.169569024 }, { "epoch": 0.5066666666666667, "grad_norm": 1.458261489868164, "loss": 5.1818, "lr": 0.0008202797202797203, "step": 1786, "tokens_trained": 0.16975932 }, { "epoch": 0.5072340425531915, "grad_norm": 1.5307244062423706, "loss": 5.2684, "lr": 0.00082, "step": 1788, "tokens_trained": 0.169949064 }, { "epoch": 0.5078014184397163, "grad_norm": 1.3966363668441772, "loss": 5.2125, "lr": 0.0008197202797202797, "step": 1790, "tokens_trained": 0.170139352 }, { "epoch": 0.5083687943262412, "grad_norm": 1.4094839096069336, "loss": 5.2518, "lr": 0.0008194405594405595, "step": 1792, "tokens_trained": 0.170330336 }, { "epoch": 0.5089361702127659, "grad_norm": 1.266122817993164, "loss": 5.2409, "lr": 0.0008191608391608392, "step": 1794, "tokens_trained": 0.170521848 }, { "epoch": 0.5095035460992908, "grad_norm": 1.3079488277435303, "loss": 5.182, "lr": 0.000818881118881119, "step": 1796, "tokens_trained": 0.170710664 }, { "epoch": 0.5100709219858156, "grad_norm": 1.2961090803146362, "loss": 5.2456, "lr": 0.0008186013986013986, "step": 1798, "tokens_trained": 0.170900016 }, { "epoch": 0.5106382978723404, "grad_norm": 1.3402773141860962, "loss": 5.1888, "lr": 0.0008183216783216783, "step": 1800, "tokens_trained": 0.171089824 }, { "epoch": 0.5112056737588653, "grad_norm": 1.386769413948059, "loss": 5.1715, "lr": 0.000818041958041958, "step": 1802, "tokens_trained": 0.171279448 }, { "epoch": 0.51177304964539, "grad_norm": 1.4280421733856201, "loss": 5.2131, "lr": 0.0008177622377622378, "step": 1804, "tokens_trained": 0.17147048 }, { "epoch": 0.512340425531915, "grad_norm": 1.4805412292480469, "loss": 5.2379, "lr": 0.0008174825174825175, "step": 1806, "tokens_trained": 0.171662264 }, { "epoch": 0.5129078014184397, "grad_norm": 1.4608936309814453, "loss": 5.2412, "lr": 0.0008172027972027972, "step": 1808, "tokens_trained": 0.171853176 }, { "epoch": 0.5134751773049645, "grad_norm": 1.550136923789978, "loss": 5.1828, "lr": 0.000816923076923077, "step": 1810, "tokens_trained": 0.172043344 }, { "epoch": 0.5140425531914894, "grad_norm": 1.4756869077682495, "loss": 5.199, "lr": 0.0008166433566433567, "step": 1812, "tokens_trained": 0.172231952 }, { "epoch": 0.5146099290780142, "grad_norm": 1.4199044704437256, "loss": 5.2074, "lr": 0.0008163636363636364, "step": 1814, "tokens_trained": 0.172420376 }, { "epoch": 0.5151773049645391, "grad_norm": 1.3477959632873535, "loss": 5.1672, "lr": 0.0008160839160839161, "step": 1816, "tokens_trained": 0.172610248 }, { "epoch": 0.5157446808510638, "grad_norm": 1.3331218957901, "loss": 5.2267, "lr": 0.0008158041958041958, "step": 1818, "tokens_trained": 0.172799168 }, { "epoch": 0.5163120567375886, "grad_norm": 1.2391384840011597, "loss": 5.2088, "lr": 0.0008155244755244755, "step": 1820, "tokens_trained": 0.172989328 }, { "epoch": 0.5168794326241135, "grad_norm": 1.3377013206481934, "loss": 5.2279, "lr": 0.0008152447552447553, "step": 1822, "tokens_trained": 0.173179376 }, { "epoch": 0.5174468085106383, "grad_norm": 1.285628318786621, "loss": 5.3006, "lr": 0.000814965034965035, "step": 1824, "tokens_trained": 0.173370408 }, { "epoch": 0.5180141843971631, "grad_norm": 1.2010120153427124, "loss": 5.2264, "lr": 0.0008146853146853147, "step": 1826, "tokens_trained": 0.173561144 }, { "epoch": 0.518581560283688, "grad_norm": 1.2953096628189087, "loss": 5.1879, "lr": 0.0008144055944055944, "step": 1828, "tokens_trained": 0.173753592 }, { "epoch": 0.5191489361702127, "grad_norm": 1.256910800933838, "loss": 5.2402, "lr": 0.0008141258741258742, "step": 1830, "tokens_trained": 0.173943752 }, { "epoch": 0.5197163120567376, "grad_norm": 1.338755488395691, "loss": 5.2556, "lr": 0.0008138461538461539, "step": 1832, "tokens_trained": 0.174130504 }, { "epoch": 0.5202836879432624, "grad_norm": 1.380715012550354, "loss": 5.2047, "lr": 0.0008135664335664336, "step": 1834, "tokens_trained": 0.174322088 }, { "epoch": 0.5208510638297872, "grad_norm": 1.4989492893218994, "loss": 5.1873, "lr": 0.0008132867132867133, "step": 1836, "tokens_trained": 0.17451164 }, { "epoch": 0.5214184397163121, "grad_norm": 1.3239110708236694, "loss": 5.202, "lr": 0.000813006993006993, "step": 1838, "tokens_trained": 0.174701896 }, { "epoch": 0.5219858156028369, "grad_norm": 1.397745132446289, "loss": 5.2259, "lr": 0.0008127272727272728, "step": 1840, "tokens_trained": 0.174892336 }, { "epoch": 0.5225531914893617, "grad_norm": 1.3992305994033813, "loss": 5.1771, "lr": 0.0008124475524475524, "step": 1842, "tokens_trained": 0.17508276 }, { "epoch": 0.5231205673758865, "grad_norm": 1.38923180103302, "loss": 5.1981, "lr": 0.0008121678321678322, "step": 1844, "tokens_trained": 0.175273272 }, { "epoch": 0.5236879432624113, "grad_norm": 1.478642225265503, "loss": 5.2533, "lr": 0.0008118881118881119, "step": 1846, "tokens_trained": 0.175462352 }, { "epoch": 0.5242553191489362, "grad_norm": 1.332709789276123, "loss": 5.2205, "lr": 0.0008116083916083917, "step": 1848, "tokens_trained": 0.175648128 }, { "epoch": 0.524822695035461, "grad_norm": 1.4612590074539185, "loss": 5.2207, "lr": 0.0008113286713286714, "step": 1850, "tokens_trained": 0.175837712 }, { "epoch": 0.5253900709219859, "grad_norm": 1.4682700634002686, "loss": 5.2576, "lr": 0.000811048951048951, "step": 1852, "tokens_trained": 0.176029512 }, { "epoch": 0.5259574468085106, "grad_norm": 1.3380264043807983, "loss": 5.2435, "lr": 0.0008107692307692308, "step": 1854, "tokens_trained": 0.176220432 }, { "epoch": 0.5265248226950354, "grad_norm": 1.2452281713485718, "loss": 5.2973, "lr": 0.0008104895104895104, "step": 1856, "tokens_trained": 0.176412144 }, { "epoch": 0.5270921985815603, "grad_norm": 1.392592191696167, "loss": 5.2028, "lr": 0.0008102097902097903, "step": 1858, "tokens_trained": 0.17660144 }, { "epoch": 0.5276595744680851, "grad_norm": 1.4258657693862915, "loss": 5.2342, "lr": 0.0008099300699300699, "step": 1860, "tokens_trained": 0.176790424 }, { "epoch": 0.52822695035461, "grad_norm": 1.4627033472061157, "loss": 5.1732, "lr": 0.0008096503496503497, "step": 1862, "tokens_trained": 0.176983296 }, { "epoch": 0.5287943262411348, "grad_norm": 1.4448645114898682, "loss": 5.2001, "lr": 0.0008093706293706294, "step": 1864, "tokens_trained": 0.177174544 }, { "epoch": 0.5293617021276595, "grad_norm": 1.3879749774932861, "loss": 5.1642, "lr": 0.0008090909090909092, "step": 1866, "tokens_trained": 0.17736428 }, { "epoch": 0.5299290780141844, "grad_norm": 1.2791417837142944, "loss": 5.1975, "lr": 0.0008088111888111889, "step": 1868, "tokens_trained": 0.177553752 }, { "epoch": 0.5304964539007092, "grad_norm": 1.3620632886886597, "loss": 5.1742, "lr": 0.0008085314685314685, "step": 1870, "tokens_trained": 0.177746448 }, { "epoch": 0.531063829787234, "grad_norm": 1.2759565114974976, "loss": 5.2076, "lr": 0.0008082517482517483, "step": 1872, "tokens_trained": 0.177937888 }, { "epoch": 0.5316312056737589, "grad_norm": 1.3390915393829346, "loss": 5.2387, "lr": 0.0008079720279720279, "step": 1874, "tokens_trained": 0.178127776 }, { "epoch": 0.5319148936170213, "eval_loss": 5.228371620178223, "eval_runtime": 20.9372, "step": 1875, "tokens_trained": 0.17822376 }, { "epoch": 0.5321985815602837, "grad_norm": 1.3872885704040527, "loss": 5.2053, "lr": 0.0008076923076923078, "step": 1876, "tokens_trained": 0.178318616 }, { "epoch": 0.5327659574468085, "grad_norm": 1.4238568544387817, "loss": 5.2091, "lr": 0.0008074125874125874, "step": 1878, "tokens_trained": 0.178509272 }, { "epoch": 0.5333333333333333, "grad_norm": 1.3352588415145874, "loss": 5.2471, "lr": 0.0008071328671328671, "step": 1880, "tokens_trained": 0.178698016 }, { "epoch": 0.5339007092198581, "grad_norm": 1.2931993007659912, "loss": 5.2315, "lr": 0.0008068531468531469, "step": 1882, "tokens_trained": 0.17888628 }, { "epoch": 0.534468085106383, "grad_norm": 1.3475919961929321, "loss": 5.2337, "lr": 0.0008065734265734265, "step": 1884, "tokens_trained": 0.179076944 }, { "epoch": 0.5350354609929078, "grad_norm": 1.3263812065124512, "loss": 5.2017, "lr": 0.0008062937062937064, "step": 1886, "tokens_trained": 0.179266128 }, { "epoch": 0.5356028368794327, "grad_norm": 1.3956594467163086, "loss": 5.1907, "lr": 0.000806013986013986, "step": 1888, "tokens_trained": 0.179454848 }, { "epoch": 0.5361702127659574, "grad_norm": 1.4399393796920776, "loss": 5.216, "lr": 0.0008057342657342658, "step": 1890, "tokens_trained": 0.179643992 }, { "epoch": 0.5367375886524822, "grad_norm": 1.278714656829834, "loss": 5.1689, "lr": 0.0008054545454545454, "step": 1892, "tokens_trained": 0.179831416 }, { "epoch": 0.5373049645390071, "grad_norm": 1.3517796993255615, "loss": 5.1319, "lr": 0.0008051748251748253, "step": 1894, "tokens_trained": 0.180022528 }, { "epoch": 0.5378723404255319, "grad_norm": 1.2710460424423218, "loss": 5.1619, "lr": 0.0008048951048951049, "step": 1896, "tokens_trained": 0.180212936 }, { "epoch": 0.5384397163120568, "grad_norm": 1.3603075742721558, "loss": 5.1615, "lr": 0.0008046153846153846, "step": 1898, "tokens_trained": 0.180404648 }, { "epoch": 0.5390070921985816, "grad_norm": 1.422122836112976, "loss": 5.1801, "lr": 0.0008043356643356644, "step": 1900, "tokens_trained": 0.18059388 }, { "epoch": 0.5395744680851063, "grad_norm": 1.4242218732833862, "loss": 5.2367, "lr": 0.000804055944055944, "step": 1902, "tokens_trained": 0.180783248 }, { "epoch": 0.5401418439716312, "grad_norm": 1.4476134777069092, "loss": 5.252, "lr": 0.0008037762237762239, "step": 1904, "tokens_trained": 0.180971152 }, { "epoch": 0.540709219858156, "grad_norm": 1.4724863767623901, "loss": 5.2042, "lr": 0.0008034965034965035, "step": 1906, "tokens_trained": 0.181159992 }, { "epoch": 0.5412765957446809, "grad_norm": 1.4014806747436523, "loss": 5.2514, "lr": 0.0008032167832167832, "step": 1908, "tokens_trained": 0.18135032 }, { "epoch": 0.5418439716312057, "grad_norm": 1.3511682748794556, "loss": 5.2036, "lr": 0.0008029370629370629, "step": 1910, "tokens_trained": 0.181540312 }, { "epoch": 0.5424113475177305, "grad_norm": 1.3011739253997803, "loss": 5.24, "lr": 0.0008026573426573427, "step": 1912, "tokens_trained": 0.181731104 }, { "epoch": 0.5429787234042553, "grad_norm": 1.2753015756607056, "loss": 5.25, "lr": 0.0008023776223776224, "step": 1914, "tokens_trained": 0.18192008 }, { "epoch": 0.5435460992907801, "grad_norm": 1.4685192108154297, "loss": 5.1619, "lr": 0.0008020979020979021, "step": 1916, "tokens_trained": 0.182110072 }, { "epoch": 0.544113475177305, "grad_norm": 1.4695900678634644, "loss": 5.2626, "lr": 0.0008018181818181818, "step": 1918, "tokens_trained": 0.182300224 }, { "epoch": 0.5446808510638298, "grad_norm": 1.4895613193511963, "loss": 5.1766, "lr": 0.0008015384615384615, "step": 1920, "tokens_trained": 0.182490712 }, { "epoch": 0.5452482269503546, "grad_norm": 1.3073184490203857, "loss": 5.2281, "lr": 0.0008012587412587414, "step": 1922, "tokens_trained": 0.182681168 }, { "epoch": 0.5458156028368795, "grad_norm": 1.2414125204086304, "loss": 5.2099, "lr": 0.000800979020979021, "step": 1924, "tokens_trained": 0.182870504 }, { "epoch": 0.5463829787234042, "grad_norm": 1.2407176494598389, "loss": 5.1116, "lr": 0.0008006993006993007, "step": 1926, "tokens_trained": 0.1830618 }, { "epoch": 0.546950354609929, "grad_norm": 1.4507744312286377, "loss": 5.1658, "lr": 0.0008004195804195804, "step": 1928, "tokens_trained": 0.183250072 }, { "epoch": 0.5475177304964539, "grad_norm": 1.348907232284546, "loss": 5.231, "lr": 0.0008001398601398602, "step": 1930, "tokens_trained": 0.18344004 }, { "epoch": 0.5480851063829787, "grad_norm": 1.4393324851989746, "loss": 5.2393, "lr": 0.0007998601398601399, "step": 1932, "tokens_trained": 0.183630032 }, { "epoch": 0.5486524822695036, "grad_norm": 1.3569602966308594, "loss": 5.2068, "lr": 0.0007995804195804196, "step": 1934, "tokens_trained": 0.183820816 }, { "epoch": 0.5492198581560284, "grad_norm": 1.362021803855896, "loss": 5.1641, "lr": 0.0007993006993006992, "step": 1936, "tokens_trained": 0.184009824 }, { "epoch": 0.5497872340425531, "grad_norm": 1.2926445007324219, "loss": 5.1983, "lr": 0.000799020979020979, "step": 1938, "tokens_trained": 0.184199544 }, { "epoch": 0.550354609929078, "grad_norm": 1.3065440654754639, "loss": 5.3009, "lr": 0.0007987412587412588, "step": 1940, "tokens_trained": 0.1843906 }, { "epoch": 0.5509219858156028, "grad_norm": 1.3288060426712036, "loss": 5.2347, "lr": 0.0007984615384615385, "step": 1942, "tokens_trained": 0.184580304 }, { "epoch": 0.5514893617021277, "grad_norm": 1.4742496013641357, "loss": 5.1497, "lr": 0.0007981818181818182, "step": 1944, "tokens_trained": 0.184771832 }, { "epoch": 0.5520567375886525, "grad_norm": 1.3907397985458374, "loss": 5.2001, "lr": 0.0007979020979020979, "step": 1946, "tokens_trained": 0.184963744 }, { "epoch": 0.5526241134751773, "grad_norm": 1.3324332237243652, "loss": 5.2056, "lr": 0.0007976223776223777, "step": 1948, "tokens_trained": 0.185152248 }, { "epoch": 0.5531914893617021, "grad_norm": 1.258155345916748, "loss": 5.1999, "lr": 0.0007973426573426573, "step": 1950, "tokens_trained": 0.18534196 }, { "epoch": 0.5537588652482269, "grad_norm": 1.3515956401824951, "loss": 5.1988, "lr": 0.0007970629370629371, "step": 1952, "tokens_trained": 0.18553156 }, { "epoch": 0.5543262411347518, "grad_norm": 1.535507321357727, "loss": 5.2198, "lr": 0.0007967832167832167, "step": 1954, "tokens_trained": 0.185719792 }, { "epoch": 0.5548936170212766, "grad_norm": 1.3124226331710815, "loss": 5.1468, "lr": 0.0007965034965034965, "step": 1956, "tokens_trained": 0.18591288 }, { "epoch": 0.5554609929078014, "grad_norm": 1.2720654010772705, "loss": 5.1939, "lr": 0.0007962237762237763, "step": 1958, "tokens_trained": 0.186102344 }, { "epoch": 0.5560283687943263, "grad_norm": 1.2731753587722778, "loss": 5.2063, "lr": 0.000795944055944056, "step": 1960, "tokens_trained": 0.186291976 }, { "epoch": 0.556595744680851, "grad_norm": 1.3020576238632202, "loss": 5.266, "lr": 0.0007956643356643357, "step": 1962, "tokens_trained": 0.186483504 }, { "epoch": 0.5571631205673759, "grad_norm": 1.300626277923584, "loss": 5.2159, "lr": 0.0007953846153846153, "step": 1964, "tokens_trained": 0.18667372 }, { "epoch": 0.5577304964539007, "grad_norm": 1.3075426816940308, "loss": 5.2136, "lr": 0.0007951048951048952, "step": 1966, "tokens_trained": 0.186864808 }, { "epoch": 0.5582978723404255, "grad_norm": 1.4623394012451172, "loss": 5.2081, "lr": 0.0007948251748251748, "step": 1968, "tokens_trained": 0.187056272 }, { "epoch": 0.5588652482269504, "grad_norm": 1.4950625896453857, "loss": 5.1885, "lr": 0.0007945454545454546, "step": 1970, "tokens_trained": 0.187244464 }, { "epoch": 0.5594326241134752, "grad_norm": 1.517152190208435, "loss": 5.2558, "lr": 0.0007942657342657342, "step": 1972, "tokens_trained": 0.187433216 }, { "epoch": 0.56, "grad_norm": 1.4226372241973877, "loss": 5.236, "lr": 0.000793986013986014, "step": 1974, "tokens_trained": 0.187622632 }, { "epoch": 0.5605673758865248, "grad_norm": 1.3692735433578491, "loss": 5.2089, "lr": 0.0007937062937062938, "step": 1976, "tokens_trained": 0.18781324 }, { "epoch": 0.5611347517730496, "grad_norm": 1.3344841003417969, "loss": 5.2052, "lr": 0.0007934265734265734, "step": 1978, "tokens_trained": 0.188002488 }, { "epoch": 0.5617021276595745, "grad_norm": 1.3929632902145386, "loss": 5.2353, "lr": 0.0007931468531468532, "step": 1980, "tokens_trained": 0.188194712 }, { "epoch": 0.5622695035460993, "grad_norm": 1.3147000074386597, "loss": 5.2071, "lr": 0.0007928671328671328, "step": 1982, "tokens_trained": 0.188387056 }, { "epoch": 0.5628368794326241, "grad_norm": 1.351483702659607, "loss": 5.2196, "lr": 0.0007925874125874127, "step": 1984, "tokens_trained": 0.188579048 }, { "epoch": 0.563404255319149, "grad_norm": 1.3840581178665161, "loss": 5.1889, "lr": 0.0007923076923076923, "step": 1986, "tokens_trained": 0.18876896 }, { "epoch": 0.5639716312056737, "grad_norm": 1.3427214622497559, "loss": 5.192, "lr": 0.000792027972027972, "step": 1988, "tokens_trained": 0.18895832 }, { "epoch": 0.5645390070921986, "grad_norm": 1.2931344509124756, "loss": 5.1942, "lr": 0.0007917482517482517, "step": 1990, "tokens_trained": 0.18915036 }, { "epoch": 0.5651063829787234, "grad_norm": 1.2408664226531982, "loss": 5.2014, "lr": 0.0007914685314685314, "step": 1992, "tokens_trained": 0.189339784 }, { "epoch": 0.5656737588652482, "grad_norm": 1.342760682106018, "loss": 5.2056, "lr": 0.0007911888111888113, "step": 1994, "tokens_trained": 0.189530776 }, { "epoch": 0.5662411347517731, "grad_norm": 1.2647815942764282, "loss": 5.2338, "lr": 0.0007909090909090909, "step": 1996, "tokens_trained": 0.189720312 }, { "epoch": 0.5668085106382978, "grad_norm": 1.1956689357757568, "loss": 5.1464, "lr": 0.0007906293706293707, "step": 1998, "tokens_trained": 0.189909592 }, { "epoch": 0.5673758865248227, "grad_norm": 1.287185549736023, "loss": 5.1919, "lr": 0.0007903496503496503, "step": 2000, "tokens_trained": 0.190100544 }, { "epoch": 0.5673758865248227, "eval_loss": 5.208409309387207, "eval_runtime": 21.1643, "step": 2000, "tokens_trained": 0.190100544 }, { "epoch": 0.5679432624113475, "grad_norm": 1.3409695625305176, "loss": 5.1723, "lr": 0.0007900699300699302, "step": 2002, "tokens_trained": 0.190291792 }, { "epoch": 0.5685106382978723, "grad_norm": 1.3951654434204102, "loss": 5.243, "lr": 0.0007897902097902098, "step": 2004, "tokens_trained": 0.190481864 }, { "epoch": 0.5690780141843972, "grad_norm": 1.2949507236480713, "loss": 5.2248, "lr": 0.0007895104895104895, "step": 2006, "tokens_trained": 0.19067228 }, { "epoch": 0.569645390070922, "grad_norm": 1.3585959672927856, "loss": 5.1889, "lr": 0.0007892307692307692, "step": 2008, "tokens_trained": 0.190860368 }, { "epoch": 0.5702127659574469, "grad_norm": 1.2834774255752563, "loss": 5.2067, "lr": 0.0007889510489510489, "step": 2010, "tokens_trained": 0.191051904 }, { "epoch": 0.5707801418439716, "grad_norm": 1.3544108867645264, "loss": 5.2041, "lr": 0.0007886713286713288, "step": 2012, "tokens_trained": 0.191242688 }, { "epoch": 0.5713475177304964, "grad_norm": 1.3536330461502075, "loss": 5.2131, "lr": 0.0007883916083916084, "step": 2014, "tokens_trained": 0.191431104 }, { "epoch": 0.5719148936170213, "grad_norm": 1.337441325187683, "loss": 5.2036, "lr": 0.0007881118881118882, "step": 2016, "tokens_trained": 0.19162204 }, { "epoch": 0.5724822695035461, "grad_norm": 1.4701579809188843, "loss": 5.2049, "lr": 0.0007878321678321678, "step": 2018, "tokens_trained": 0.191813352 }, { "epoch": 0.573049645390071, "grad_norm": 1.4354153871536255, "loss": 5.2583, "lr": 0.0007875524475524476, "step": 2020, "tokens_trained": 0.192004064 }, { "epoch": 0.5736170212765958, "grad_norm": 1.358913540840149, "loss": 5.1961, "lr": 0.0007872727272727273, "step": 2022, "tokens_trained": 0.192193232 }, { "epoch": 0.5741843971631205, "grad_norm": 1.3889496326446533, "loss": 5.1755, "lr": 0.000786993006993007, "step": 2024, "tokens_trained": 0.192385416 }, { "epoch": 0.5747517730496454, "grad_norm": 1.4138504266738892, "loss": 5.2423, "lr": 0.0007867132867132867, "step": 2026, "tokens_trained": 0.192575904 }, { "epoch": 0.5753191489361702, "grad_norm": 1.2651748657226562, "loss": 5.1574, "lr": 0.0007864335664335664, "step": 2028, "tokens_trained": 0.192765568 }, { "epoch": 0.5758865248226951, "grad_norm": 1.304296612739563, "loss": 5.1978, "lr": 0.0007861538461538463, "step": 2030, "tokens_trained": 0.192956176 }, { "epoch": 0.5764539007092199, "grad_norm": 1.2884007692337036, "loss": 5.1945, "lr": 0.0007858741258741259, "step": 2032, "tokens_trained": 0.193146208 }, { "epoch": 0.5770212765957446, "grad_norm": 1.4838171005249023, "loss": 5.1348, "lr": 0.0007855944055944056, "step": 2034, "tokens_trained": 0.193335664 }, { "epoch": 0.5775886524822695, "grad_norm": 1.456529974937439, "loss": 5.2284, "lr": 0.0007853146853146853, "step": 2036, "tokens_trained": 0.193525216 }, { "epoch": 0.5781560283687943, "grad_norm": 1.3471657037734985, "loss": 5.2101, "lr": 0.0007850349650349651, "step": 2038, "tokens_trained": 0.19371268 }, { "epoch": 0.5787234042553191, "grad_norm": 1.3996837139129639, "loss": 5.1828, "lr": 0.0007847552447552448, "step": 2040, "tokens_trained": 0.193903536 }, { "epoch": 0.579290780141844, "grad_norm": 1.4071470499038696, "loss": 5.1724, "lr": 0.0007844755244755245, "step": 2042, "tokens_trained": 0.194092384 }, { "epoch": 0.5798581560283688, "grad_norm": 1.4125159978866577, "loss": 5.1602, "lr": 0.0007841958041958041, "step": 2044, "tokens_trained": 0.19428356 }, { "epoch": 0.5804255319148937, "grad_norm": 1.3602298498153687, "loss": 5.1904, "lr": 0.0007839160839160839, "step": 2046, "tokens_trained": 0.194473352 }, { "epoch": 0.5809929078014184, "grad_norm": 1.2836074829101562, "loss": 5.1648, "lr": 0.0007836363636363637, "step": 2048, "tokens_trained": 0.194663624 }, { "epoch": 0.5815602836879432, "grad_norm": 1.306192398071289, "loss": 5.2037, "lr": 0.0007833566433566434, "step": 2050, "tokens_trained": 0.194854 }, { "epoch": 0.5821276595744681, "grad_norm": 1.3130674362182617, "loss": 5.223, "lr": 0.0007830769230769231, "step": 2052, "tokens_trained": 0.195044368 }, { "epoch": 0.5826950354609929, "grad_norm": 1.2337714433670044, "loss": 5.1609, "lr": 0.0007827972027972028, "step": 2054, "tokens_trained": 0.195237064 }, { "epoch": 0.5832624113475178, "grad_norm": 1.2249869108200073, "loss": 5.1352, "lr": 0.0007825174825174826, "step": 2056, "tokens_trained": 0.195425016 }, { "epoch": 0.5838297872340426, "grad_norm": 1.2610726356506348, "loss": 5.2304, "lr": 0.0007822377622377622, "step": 2058, "tokens_trained": 0.195614488 }, { "epoch": 0.5843971631205673, "grad_norm": 1.1917920112609863, "loss": 5.1964, "lr": 0.000781958041958042, "step": 2060, "tokens_trained": 0.19580392 }, { "epoch": 0.5849645390070922, "grad_norm": 1.2248187065124512, "loss": 5.0901, "lr": 0.0007816783216783216, "step": 2062, "tokens_trained": 0.195993096 }, { "epoch": 0.585531914893617, "grad_norm": 1.4138745069503784, "loss": 5.1806, "lr": 0.0007813986013986014, "step": 2064, "tokens_trained": 0.196183824 }, { "epoch": 0.5860992907801419, "grad_norm": 1.389195442199707, "loss": 5.1813, "lr": 0.0007811188811188812, "step": 2066, "tokens_trained": 0.196373912 }, { "epoch": 0.5866666666666667, "grad_norm": 1.2737247943878174, "loss": 5.1935, "lr": 0.0007808391608391609, "step": 2068, "tokens_trained": 0.196564696 }, { "epoch": 0.5872340425531914, "grad_norm": 1.443703293800354, "loss": 5.2376, "lr": 0.0007805594405594406, "step": 2070, "tokens_trained": 0.196754472 }, { "epoch": 0.5878014184397163, "grad_norm": 1.367251992225647, "loss": 5.2505, "lr": 0.0007802797202797202, "step": 2072, "tokens_trained": 0.196945288 }, { "epoch": 0.5883687943262411, "grad_norm": 1.4049919843673706, "loss": 5.2155, "lr": 0.0007800000000000001, "step": 2074, "tokens_trained": 0.197135328 }, { "epoch": 0.588936170212766, "grad_norm": 1.5119894742965698, "loss": 5.189, "lr": 0.0007797202797202797, "step": 2076, "tokens_trained": 0.197325152 }, { "epoch": 0.5895035460992908, "grad_norm": 1.349288821220398, "loss": 5.1626, "lr": 0.0007794405594405595, "step": 2078, "tokens_trained": 0.197514576 }, { "epoch": 0.5900709219858156, "grad_norm": 1.2594739198684692, "loss": 5.2222, "lr": 0.0007791608391608391, "step": 2080, "tokens_trained": 0.197705064 }, { "epoch": 0.5906382978723405, "grad_norm": 1.0747008323669434, "loss": 5.1669, "lr": 0.0007788811188811189, "step": 2082, "tokens_trained": 0.197895032 }, { "epoch": 0.5912056737588652, "grad_norm": 1.1089273691177368, "loss": 5.1071, "lr": 0.0007786013986013987, "step": 2084, "tokens_trained": 0.198085832 }, { "epoch": 0.5917730496453901, "grad_norm": 1.153296709060669, "loss": 5.1483, "lr": 0.0007783216783216783, "step": 2086, "tokens_trained": 0.198272104 }, { "epoch": 0.5923404255319149, "grad_norm": 1.1960811614990234, "loss": 5.1703, "lr": 0.0007780419580419581, "step": 2088, "tokens_trained": 0.198459976 }, { "epoch": 0.5929078014184397, "grad_norm": 1.073548674583435, "loss": 5.2449, "lr": 0.0007777622377622377, "step": 2090, "tokens_trained": 0.198648376 }, { "epoch": 0.5934751773049646, "grad_norm": 1.233362078666687, "loss": 5.1987, "lr": 0.0007774825174825176, "step": 2092, "tokens_trained": 0.198839144 }, { "epoch": 0.5940425531914894, "grad_norm": 1.3649506568908691, "loss": 5.183, "lr": 0.0007772027972027972, "step": 2094, "tokens_trained": 0.199029064 }, { "epoch": 0.5946099290780141, "grad_norm": 1.2620112895965576, "loss": 5.1343, "lr": 0.000776923076923077, "step": 2096, "tokens_trained": 0.199218376 }, { "epoch": 0.595177304964539, "grad_norm": 1.3836737871170044, "loss": 5.248, "lr": 0.0007766433566433566, "step": 2098, "tokens_trained": 0.199407736 }, { "epoch": 0.5957446808510638, "grad_norm": 1.3027995824813843, "loss": 5.1813, "lr": 0.0007763636363636363, "step": 2100, "tokens_trained": 0.199597888 }, { "epoch": 0.5963120567375887, "grad_norm": 1.2857698202133179, "loss": 5.2111, "lr": 0.0007760839160839162, "step": 2102, "tokens_trained": 0.19978852 }, { "epoch": 0.5968794326241135, "grad_norm": 1.3470538854599, "loss": 5.1505, "lr": 0.0007758041958041958, "step": 2104, "tokens_trained": 0.199978536 }, { "epoch": 0.5974468085106382, "grad_norm": 1.230573058128357, "loss": 5.1222, "lr": 0.0007755244755244756, "step": 2106, "tokens_trained": 0.200170024 }, { "epoch": 0.5980141843971631, "grad_norm": 1.2551500797271729, "loss": 5.1297, "lr": 0.0007752447552447552, "step": 2108, "tokens_trained": 0.20035992 }, { "epoch": 0.5985815602836879, "grad_norm": 1.2162272930145264, "loss": 5.233, "lr": 0.0007749650349650351, "step": 2110, "tokens_trained": 0.200548976 }, { "epoch": 0.5991489361702128, "grad_norm": 1.2617305517196655, "loss": 5.2118, "lr": 0.0007746853146853147, "step": 2112, "tokens_trained": 0.200740656 }, { "epoch": 0.5997163120567376, "grad_norm": 1.4057862758636475, "loss": 5.2215, "lr": 0.0007744055944055944, "step": 2114, "tokens_trained": 0.200930944 }, { "epoch": 0.6002836879432624, "grad_norm": 1.3729593753814697, "loss": 5.1773, "lr": 0.0007741258741258741, "step": 2116, "tokens_trained": 0.201122528 }, { "epoch": 0.6008510638297873, "grad_norm": 1.3300920724868774, "loss": 5.1573, "lr": 0.0007738461538461538, "step": 2118, "tokens_trained": 0.201310224 }, { "epoch": 0.601418439716312, "grad_norm": 1.33209228515625, "loss": 5.1523, "lr": 0.0007735664335664337, "step": 2120, "tokens_trained": 0.201499048 }, { "epoch": 0.6019858156028369, "grad_norm": 1.1407768726348877, "loss": 5.1453, "lr": 0.0007732867132867133, "step": 2122, "tokens_trained": 0.201688872 }, { "epoch": 0.6025531914893617, "grad_norm": 1.1250742673873901, "loss": 5.173, "lr": 0.0007730069930069931, "step": 2124, "tokens_trained": 0.201880504 }, { "epoch": 0.6028368794326241, "eval_loss": 5.190411567687988, "eval_runtime": 20.812, "step": 2125, "tokens_trained": 0.201976984 }, { "epoch": 0.6031205673758865, "grad_norm": 1.2974287271499634, "loss": 5.1878, "lr": 0.0007727272727272727, "step": 2126, "tokens_trained": 0.20207104 }, { "epoch": 0.6036879432624114, "grad_norm": 1.251120924949646, "loss": 5.203, "lr": 0.0007724475524475525, "step": 2128, "tokens_trained": 0.202261848 }, { "epoch": 0.6042553191489362, "grad_norm": 1.3494654893875122, "loss": 5.1981, "lr": 0.0007721678321678322, "step": 2130, "tokens_trained": 0.202452936 }, { "epoch": 0.604822695035461, "grad_norm": 1.2586653232574463, "loss": 5.1786, "lr": 0.0007718881118881119, "step": 2132, "tokens_trained": 0.202642168 }, { "epoch": 0.6053900709219858, "grad_norm": 1.228868842124939, "loss": 5.1651, "lr": 0.0007716083916083916, "step": 2134, "tokens_trained": 0.202830528 }, { "epoch": 0.6059574468085106, "grad_norm": 1.25627863407135, "loss": 5.2033, "lr": 0.0007713286713286713, "step": 2136, "tokens_trained": 0.203022216 }, { "epoch": 0.6065248226950355, "grad_norm": 1.1568467617034912, "loss": 5.1659, "lr": 0.0007710489510489512, "step": 2138, "tokens_trained": 0.203211696 }, { "epoch": 0.6070921985815603, "grad_norm": 1.1502138376235962, "loss": 5.1935, "lr": 0.0007707692307692308, "step": 2140, "tokens_trained": 0.203403224 }, { "epoch": 0.6076595744680852, "grad_norm": 1.2491158246994019, "loss": 5.1367, "lr": 0.0007704895104895105, "step": 2142, "tokens_trained": 0.203594912 }, { "epoch": 0.6082269503546099, "grad_norm": 1.3012075424194336, "loss": 5.1954, "lr": 0.0007702097902097902, "step": 2144, "tokens_trained": 0.203787032 }, { "epoch": 0.6087943262411347, "grad_norm": 1.2956688404083252, "loss": 5.2255, "lr": 0.0007699300699300699, "step": 2146, "tokens_trained": 0.203979064 }, { "epoch": 0.6093617021276596, "grad_norm": 1.3562579154968262, "loss": 5.2371, "lr": 0.0007696503496503497, "step": 2148, "tokens_trained": 0.20416828 }, { "epoch": 0.6099290780141844, "grad_norm": 1.2726640701293945, "loss": 5.154, "lr": 0.0007693706293706294, "step": 2150, "tokens_trained": 0.20435532 }, { "epoch": 0.6104964539007092, "grad_norm": 1.1975597143173218, "loss": 5.1559, "lr": 0.000769090909090909, "step": 2152, "tokens_trained": 0.204545416 }, { "epoch": 0.6110638297872341, "grad_norm": 1.2840410470962524, "loss": 5.2558, "lr": 0.0007688111888111888, "step": 2154, "tokens_trained": 0.204734752 }, { "epoch": 0.6116312056737588, "grad_norm": 1.4807062149047852, "loss": 5.229, "lr": 0.0007685314685314686, "step": 2156, "tokens_trained": 0.204925432 }, { "epoch": 0.6121985815602837, "grad_norm": 1.3909307718276978, "loss": 5.2128, "lr": 0.0007682517482517483, "step": 2158, "tokens_trained": 0.205117624 }, { "epoch": 0.6127659574468085, "grad_norm": 1.3998613357543945, "loss": 5.1344, "lr": 0.000767972027972028, "step": 2160, "tokens_trained": 0.205309032 }, { "epoch": 0.6133333333333333, "grad_norm": 1.3821474313735962, "loss": 5.2223, "lr": 0.0007676923076923077, "step": 2162, "tokens_trained": 0.205498112 }, { "epoch": 0.6139007092198582, "grad_norm": 1.280150294303894, "loss": 5.1357, "lr": 0.0007674125874125874, "step": 2164, "tokens_trained": 0.205686112 }, { "epoch": 0.614468085106383, "grad_norm": 1.2361094951629639, "loss": 5.1285, "lr": 0.0007671328671328672, "step": 2166, "tokens_trained": 0.20587828 }, { "epoch": 0.6150354609929078, "grad_norm": 1.1495496034622192, "loss": 5.1597, "lr": 0.0007668531468531469, "step": 2168, "tokens_trained": 0.206068272 }, { "epoch": 0.6156028368794326, "grad_norm": 1.2377156019210815, "loss": 5.1208, "lr": 0.0007665734265734265, "step": 2170, "tokens_trained": 0.206257272 }, { "epoch": 0.6161702127659574, "grad_norm": 1.226664423942566, "loss": 5.2143, "lr": 0.0007662937062937063, "step": 2172, "tokens_trained": 0.206449824 }, { "epoch": 0.6167375886524823, "grad_norm": 1.1939537525177002, "loss": 5.0847, "lr": 0.000766013986013986, "step": 2174, "tokens_trained": 0.206636992 }, { "epoch": 0.6173049645390071, "grad_norm": 1.233585238456726, "loss": 5.1647, "lr": 0.0007657342657342658, "step": 2176, "tokens_trained": 0.206828288 }, { "epoch": 0.617872340425532, "grad_norm": 1.3282006978988647, "loss": 5.1748, "lr": 0.0007654545454545455, "step": 2178, "tokens_trained": 0.207019064 }, { "epoch": 0.6184397163120567, "grad_norm": 1.2299532890319824, "loss": 5.248, "lr": 0.0007651748251748251, "step": 2180, "tokens_trained": 0.20720844 }, { "epoch": 0.6190070921985815, "grad_norm": 1.279590129852295, "loss": 5.1467, "lr": 0.0007648951048951049, "step": 2182, "tokens_trained": 0.207398952 }, { "epoch": 0.6195744680851064, "grad_norm": 1.30775785446167, "loss": 5.1981, "lr": 0.0007646153846153846, "step": 2184, "tokens_trained": 0.207589224 }, { "epoch": 0.6201418439716312, "grad_norm": 1.2829056978225708, "loss": 5.1976, "lr": 0.0007643356643356644, "step": 2186, "tokens_trained": 0.20778024 }, { "epoch": 0.6207092198581561, "grad_norm": 1.2149474620819092, "loss": 5.2186, "lr": 0.000764055944055944, "step": 2188, "tokens_trained": 0.207969176 }, { "epoch": 0.6212765957446809, "grad_norm": 1.239912748336792, "loss": 5.15, "lr": 0.0007637762237762238, "step": 2190, "tokens_trained": 0.208159016 }, { "epoch": 0.6218439716312056, "grad_norm": 1.322252869606018, "loss": 5.2447, "lr": 0.0007634965034965035, "step": 2192, "tokens_trained": 0.2083502 }, { "epoch": 0.6224113475177305, "grad_norm": 1.1804618835449219, "loss": 5.1924, "lr": 0.0007632167832167833, "step": 2194, "tokens_trained": 0.208539616 }, { "epoch": 0.6229787234042553, "grad_norm": 1.2914003133773804, "loss": 5.1559, "lr": 0.000762937062937063, "step": 2196, "tokens_trained": 0.208731032 }, { "epoch": 0.6235460992907801, "grad_norm": 1.2175878286361694, "loss": 5.1335, "lr": 0.0007626573426573426, "step": 2198, "tokens_trained": 0.208923952 }, { "epoch": 0.624113475177305, "grad_norm": 1.2267946004867554, "loss": 5.1697, "lr": 0.0007623776223776224, "step": 2200, "tokens_trained": 0.20911168 }, { "epoch": 0.6246808510638298, "grad_norm": 1.2482635974884033, "loss": 5.1986, "lr": 0.0007620979020979021, "step": 2202, "tokens_trained": 0.209299504 }, { "epoch": 0.6252482269503546, "grad_norm": 1.3256076574325562, "loss": 5.1955, "lr": 0.0007618181818181819, "step": 2204, "tokens_trained": 0.20948936 }, { "epoch": 0.6258156028368794, "grad_norm": 1.205692172050476, "loss": 5.1175, "lr": 0.0007615384615384615, "step": 2206, "tokens_trained": 0.209678072 }, { "epoch": 0.6263829787234042, "grad_norm": 1.2371326684951782, "loss": 5.1798, "lr": 0.0007612587412587412, "step": 2208, "tokens_trained": 0.209868904 }, { "epoch": 0.6269503546099291, "grad_norm": 1.1657975912094116, "loss": 5.159, "lr": 0.000760979020979021, "step": 2210, "tokens_trained": 0.210060992 }, { "epoch": 0.6275177304964539, "grad_norm": 1.18202543258667, "loss": 5.2157, "lr": 0.0007606993006993007, "step": 2212, "tokens_trained": 0.210252096 }, { "epoch": 0.6280851063829788, "grad_norm": 1.220446228981018, "loss": 5.1677, "lr": 0.0007604195804195805, "step": 2214, "tokens_trained": 0.210444176 }, { "epoch": 0.6286524822695035, "grad_norm": 1.1070069074630737, "loss": 5.1702, "lr": 0.0007601398601398601, "step": 2216, "tokens_trained": 0.210633376 }, { "epoch": 0.6292198581560283, "grad_norm": 1.3031543493270874, "loss": 5.2253, "lr": 0.0007598601398601399, "step": 2218, "tokens_trained": 0.21082368 }, { "epoch": 0.6297872340425532, "grad_norm": 1.0999404191970825, "loss": 5.1942, "lr": 0.0007595804195804196, "step": 2220, "tokens_trained": 0.211013448 }, { "epoch": 0.630354609929078, "grad_norm": 1.2241060733795166, "loss": 5.1408, "lr": 0.0007593006993006993, "step": 2222, "tokens_trained": 0.211205176 }, { "epoch": 0.6309219858156029, "grad_norm": 1.3057242631912231, "loss": 5.2234, "lr": 0.000759020979020979, "step": 2224, "tokens_trained": 0.211396464 }, { "epoch": 0.6314893617021277, "grad_norm": 1.2667888402938843, "loss": 5.1675, "lr": 0.0007587412587412587, "step": 2226, "tokens_trained": 0.211587608 }, { "epoch": 0.6320567375886524, "grad_norm": 1.1653670072555542, "loss": 5.2081, "lr": 0.0007584615384615385, "step": 2228, "tokens_trained": 0.211779832 }, { "epoch": 0.6326241134751773, "grad_norm": 1.1786928176879883, "loss": 5.1772, "lr": 0.0007581818181818182, "step": 2230, "tokens_trained": 0.211971584 }, { "epoch": 0.6331914893617021, "grad_norm": 1.242872714996338, "loss": 5.1378, "lr": 0.000757902097902098, "step": 2232, "tokens_trained": 0.212161024 }, { "epoch": 0.633758865248227, "grad_norm": 1.2831401824951172, "loss": 5.1488, "lr": 0.0007576223776223776, "step": 2234, "tokens_trained": 0.21235084 }, { "epoch": 0.6343262411347518, "grad_norm": 1.269600510597229, "loss": 5.1454, "lr": 0.0007573426573426573, "step": 2236, "tokens_trained": 0.212539504 }, { "epoch": 0.6348936170212766, "grad_norm": 1.2224805355072021, "loss": 5.1123, "lr": 0.0007570629370629371, "step": 2238, "tokens_trained": 0.21272884 }, { "epoch": 0.6354609929078014, "grad_norm": 1.2404342889785767, "loss": 5.2023, "lr": 0.0007567832167832168, "step": 2240, "tokens_trained": 0.212920128 }, { "epoch": 0.6360283687943262, "grad_norm": 1.1551696062088013, "loss": 5.1529, "lr": 0.0007565034965034965, "step": 2242, "tokens_trained": 0.213110744 }, { "epoch": 0.6365957446808511, "grad_norm": 1.2342238426208496, "loss": 5.182, "lr": 0.0007562237762237762, "step": 2244, "tokens_trained": 0.213298584 }, { "epoch": 0.6371631205673759, "grad_norm": 1.2631146907806396, "loss": 5.1442, "lr": 0.000755944055944056, "step": 2246, "tokens_trained": 0.213488512 }, { "epoch": 0.6377304964539007, "grad_norm": 1.2031443119049072, "loss": 5.1041, "lr": 0.0007556643356643357, "step": 2248, "tokens_trained": 0.21367964 }, { "epoch": 0.6382978723404256, "grad_norm": 1.127889633178711, "loss": 5.1889, "lr": 0.0007553846153846154, "step": 2250, "tokens_trained": 0.213871584 }, { "epoch": 0.6382978723404256, "eval_loss": 5.1714253425598145, "eval_runtime": 20.5005, "step": 2250, "tokens_trained": 0.213871584 }, { "epoch": 0.6388652482269503, "grad_norm": 1.1281750202178955, "loss": 5.1039, "lr": 0.0007551048951048951, "step": 2252, "tokens_trained": 0.214061624 }, { "epoch": 0.6394326241134751, "grad_norm": 1.1058608293533325, "loss": 5.1562, "lr": 0.0007548251748251748, "step": 2254, "tokens_trained": 0.214252024 }, { "epoch": 0.64, "grad_norm": 1.0579496622085571, "loss": 5.1476, "lr": 0.0007545454545454546, "step": 2256, "tokens_trained": 0.214442624 }, { "epoch": 0.6405673758865248, "grad_norm": 1.1370742321014404, "loss": 5.1948, "lr": 0.0007542657342657343, "step": 2258, "tokens_trained": 0.214634016 }, { "epoch": 0.6411347517730497, "grad_norm": 1.1118457317352295, "loss": 5.169, "lr": 0.000753986013986014, "step": 2260, "tokens_trained": 0.214823368 }, { "epoch": 0.6417021276595745, "grad_norm": 1.039004921913147, "loss": 5.1454, "lr": 0.0007537062937062937, "step": 2262, "tokens_trained": 0.21501196 }, { "epoch": 0.6422695035460992, "grad_norm": 1.2534265518188477, "loss": 5.1455, "lr": 0.0007534265734265734, "step": 2264, "tokens_trained": 0.215200808 }, { "epoch": 0.6428368794326241, "grad_norm": 1.2437689304351807, "loss": 5.1966, "lr": 0.0007531468531468532, "step": 2266, "tokens_trained": 0.21539036 }, { "epoch": 0.6434042553191489, "grad_norm": 1.1795995235443115, "loss": 5.1716, "lr": 0.0007528671328671329, "step": 2268, "tokens_trained": 0.215582088 }, { "epoch": 0.6439716312056738, "grad_norm": 1.3241360187530518, "loss": 5.1638, "lr": 0.0007525874125874126, "step": 2270, "tokens_trained": 0.215771936 }, { "epoch": 0.6445390070921986, "grad_norm": 1.2526317834854126, "loss": 5.1067, "lr": 0.0007523076923076923, "step": 2272, "tokens_trained": 0.215960792 }, { "epoch": 0.6451063829787234, "grad_norm": 1.249042272567749, "loss": 5.1466, "lr": 0.0007520279720279721, "step": 2274, "tokens_trained": 0.216151448 }, { "epoch": 0.6456737588652482, "grad_norm": 1.1926413774490356, "loss": 5.1886, "lr": 0.0007517482517482518, "step": 2276, "tokens_trained": 0.216340368 }, { "epoch": 0.646241134751773, "grad_norm": 1.1615192890167236, "loss": 5.1538, "lr": 0.0007514685314685314, "step": 2278, "tokens_trained": 0.216531264 }, { "epoch": 0.6468085106382979, "grad_norm": 1.1265521049499512, "loss": 5.1518, "lr": 0.0007511888111888112, "step": 2280, "tokens_trained": 0.216722024 }, { "epoch": 0.6473758865248227, "grad_norm": 1.0598393678665161, "loss": 5.1776, "lr": 0.0007509090909090909, "step": 2282, "tokens_trained": 0.216913232 }, { "epoch": 0.6479432624113475, "grad_norm": 1.1727370023727417, "loss": 5.2083, "lr": 0.0007506293706293707, "step": 2284, "tokens_trained": 0.217103136 }, { "epoch": 0.6485106382978724, "grad_norm": 1.1411634683609009, "loss": 5.182, "lr": 0.0007503496503496504, "step": 2286, "tokens_trained": 0.21729368 }, { "epoch": 0.6490780141843971, "grad_norm": 1.2293574810028076, "loss": 5.1725, "lr": 0.00075006993006993, "step": 2288, "tokens_trained": 0.217485624 }, { "epoch": 0.649645390070922, "grad_norm": 1.3079198598861694, "loss": 5.1531, "lr": 0.0007497902097902098, "step": 2290, "tokens_trained": 0.217675192 }, { "epoch": 0.6502127659574468, "grad_norm": 1.1579710245132446, "loss": 5.1162, "lr": 0.0007495104895104895, "step": 2292, "tokens_trained": 0.2178658 }, { "epoch": 0.6507801418439716, "grad_norm": 1.1968539953231812, "loss": 5.1652, "lr": 0.0007492307692307693, "step": 2294, "tokens_trained": 0.218057984 }, { "epoch": 0.6513475177304965, "grad_norm": 1.3666965961456299, "loss": 5.2035, "lr": 0.0007489510489510489, "step": 2296, "tokens_trained": 0.218249704 }, { "epoch": 0.6519148936170213, "grad_norm": 1.3615487813949585, "loss": 5.1704, "lr": 0.0007486713286713287, "step": 2298, "tokens_trained": 0.218441792 }, { "epoch": 0.6524822695035462, "grad_norm": 1.2289810180664062, "loss": 5.1683, "lr": 0.0007483916083916084, "step": 2300, "tokens_trained": 0.218630624 }, { "epoch": 0.6530496453900709, "grad_norm": 1.1299561262130737, "loss": 5.1672, "lr": 0.0007481118881118882, "step": 2302, "tokens_trained": 0.218819928 }, { "epoch": 0.6536170212765957, "grad_norm": 1.186132550239563, "loss": 5.1456, "lr": 0.0007478321678321679, "step": 2304, "tokens_trained": 0.219008792 }, { "epoch": 0.6541843971631206, "grad_norm": 1.2106919288635254, "loss": 5.1998, "lr": 0.0007475524475524475, "step": 2306, "tokens_trained": 0.219198584 }, { "epoch": 0.6547517730496454, "grad_norm": 1.2485368251800537, "loss": 5.1473, "lr": 0.0007472727272727273, "step": 2308, "tokens_trained": 0.219386768 }, { "epoch": 0.6553191489361702, "grad_norm": 1.1855547428131104, "loss": 5.1721, "lr": 0.000746993006993007, "step": 2310, "tokens_trained": 0.219575904 }, { "epoch": 0.655886524822695, "grad_norm": 1.3077043294906616, "loss": 5.1444, "lr": 0.0007467132867132868, "step": 2312, "tokens_trained": 0.219767712 }, { "epoch": 0.6564539007092198, "grad_norm": 1.3514399528503418, "loss": 5.198, "lr": 0.0007464335664335664, "step": 2314, "tokens_trained": 0.219959384 }, { "epoch": 0.6570212765957447, "grad_norm": 1.0906041860580444, "loss": 5.115, "lr": 0.0007461538461538462, "step": 2316, "tokens_trained": 0.2201464 }, { "epoch": 0.6575886524822695, "grad_norm": 1.154425859451294, "loss": 5.1186, "lr": 0.0007458741258741259, "step": 2318, "tokens_trained": 0.220336992 }, { "epoch": 0.6581560283687943, "grad_norm": 1.1141375303268433, "loss": 5.1709, "lr": 0.0007455944055944056, "step": 2320, "tokens_trained": 0.220525928 }, { "epoch": 0.6587234042553192, "grad_norm": 1.0958452224731445, "loss": 5.1641, "lr": 0.0007453146853146854, "step": 2322, "tokens_trained": 0.220715056 }, { "epoch": 0.659290780141844, "grad_norm": 1.168017029762268, "loss": 5.1666, "lr": 0.000745034965034965, "step": 2324, "tokens_trained": 0.220905264 }, { "epoch": 0.6598581560283688, "grad_norm": 1.044488549232483, "loss": 5.2079, "lr": 0.0007447552447552448, "step": 2326, "tokens_trained": 0.221096736 }, { "epoch": 0.6604255319148936, "grad_norm": 1.2333874702453613, "loss": 5.1166, "lr": 0.0007444755244755245, "step": 2328, "tokens_trained": 0.221287184 }, { "epoch": 0.6609929078014184, "grad_norm": 1.1800497770309448, "loss": 5.1561, "lr": 0.0007441958041958043, "step": 2330, "tokens_trained": 0.221477312 }, { "epoch": 0.6615602836879433, "grad_norm": 1.118755578994751, "loss": 5.1513, "lr": 0.0007439160839160839, "step": 2332, "tokens_trained": 0.221665208 }, { "epoch": 0.6621276595744681, "grad_norm": 1.2018475532531738, "loss": 5.1007, "lr": 0.0007436363636363636, "step": 2334, "tokens_trained": 0.221855608 }, { "epoch": 0.662695035460993, "grad_norm": 1.1832036972045898, "loss": 5.0944, "lr": 0.0007433566433566433, "step": 2336, "tokens_trained": 0.222043856 }, { "epoch": 0.6632624113475177, "grad_norm": 1.3179196119308472, "loss": 5.1645, "lr": 0.0007430769230769231, "step": 2338, "tokens_trained": 0.222235728 }, { "epoch": 0.6638297872340425, "grad_norm": 1.1313154697418213, "loss": 5.1733, "lr": 0.0007427972027972029, "step": 2340, "tokens_trained": 0.222424688 }, { "epoch": 0.6643971631205674, "grad_norm": 1.2135043144226074, "loss": 5.1291, "lr": 0.0007425174825174825, "step": 2342, "tokens_trained": 0.222611952 }, { "epoch": 0.6649645390070922, "grad_norm": 1.2418344020843506, "loss": 5.178, "lr": 0.0007422377622377622, "step": 2344, "tokens_trained": 0.222803264 }, { "epoch": 0.6655319148936171, "grad_norm": 1.2896099090576172, "loss": 5.1772, "lr": 0.000741958041958042, "step": 2346, "tokens_trained": 0.22299108 }, { "epoch": 0.6660992907801419, "grad_norm": 1.150012731552124, "loss": 5.1334, "lr": 0.0007416783216783217, "step": 2348, "tokens_trained": 0.223182336 }, { "epoch": 0.6666666666666666, "grad_norm": 1.307721495628357, "loss": 5.0898, "lr": 0.0007413986013986014, "step": 2350, "tokens_trained": 0.223371664 }, { "epoch": 0.6672340425531915, "grad_norm": 1.2633092403411865, "loss": 5.1344, "lr": 0.0007411188811188811, "step": 2352, "tokens_trained": 0.223561984 }, { "epoch": 0.6678014184397163, "grad_norm": 1.1801539659500122, "loss": 5.1242, "lr": 0.0007408391608391608, "step": 2354, "tokens_trained": 0.223750344 }, { "epoch": 0.6683687943262412, "grad_norm": 1.1279330253601074, "loss": 5.1348, "lr": 0.0007405594405594406, "step": 2356, "tokens_trained": 0.223941528 }, { "epoch": 0.668936170212766, "grad_norm": 1.193912148475647, "loss": 5.1823, "lr": 0.0007402797202797204, "step": 2358, "tokens_trained": 0.224132064 }, { "epoch": 0.6695035460992907, "grad_norm": 1.1424062252044678, "loss": 5.1452, "lr": 0.00074, "step": 2360, "tokens_trained": 0.2243216 }, { "epoch": 0.6700709219858156, "grad_norm": 1.1543093919754028, "loss": 5.1199, "lr": 0.0007397202797202797, "step": 2362, "tokens_trained": 0.224509992 }, { "epoch": 0.6706382978723404, "grad_norm": 1.2291040420532227, "loss": 5.0824, "lr": 0.0007394405594405595, "step": 2364, "tokens_trained": 0.22470124 }, { "epoch": 0.6712056737588652, "grad_norm": 1.1839559078216553, "loss": 5.1486, "lr": 0.0007391608391608392, "step": 2366, "tokens_trained": 0.224893488 }, { "epoch": 0.6717730496453901, "grad_norm": 1.1374263763427734, "loss": 5.1482, "lr": 0.0007388811188811189, "step": 2368, "tokens_trained": 0.225083304 }, { "epoch": 0.6723404255319149, "grad_norm": 1.2041044235229492, "loss": 5.1055, "lr": 0.0007386013986013986, "step": 2370, "tokens_trained": 0.225273256 }, { "epoch": 0.6729078014184398, "grad_norm": 1.1405609846115112, "loss": 5.1647, "lr": 0.0007383216783216782, "step": 2372, "tokens_trained": 0.225461976 }, { "epoch": 0.6734751773049645, "grad_norm": 1.112979531288147, "loss": 5.1232, "lr": 0.0007380419580419581, "step": 2374, "tokens_trained": 0.225651248 }, { "epoch": 0.6737588652482269, "eval_loss": 5.160866737365723, "eval_runtime": 20.3049, "step": 2375, "tokens_trained": 0.22574612 }, { "epoch": 0.6740425531914893, "grad_norm": 1.2868081331253052, "loss": 5.1802, "lr": 0.0007377622377622378, "step": 2376, "tokens_trained": 0.225840616 }, { "epoch": 0.6746099290780142, "grad_norm": 1.0904244184494019, "loss": 5.1093, "lr": 0.0007374825174825175, "step": 2378, "tokens_trained": 0.22602952 }, { "epoch": 0.675177304964539, "grad_norm": 1.182820200920105, "loss": 5.1425, "lr": 0.0007372027972027972, "step": 2380, "tokens_trained": 0.226219912 }, { "epoch": 0.6757446808510639, "grad_norm": 1.29615318775177, "loss": 5.2044, "lr": 0.000736923076923077, "step": 2382, "tokens_trained": 0.226409832 }, { "epoch": 0.6763120567375887, "grad_norm": 1.2440109252929688, "loss": 5.1722, "lr": 0.0007366433566433567, "step": 2384, "tokens_trained": 0.226600912 }, { "epoch": 0.6768794326241134, "grad_norm": 1.2176823616027832, "loss": 5.1237, "lr": 0.0007363636363636363, "step": 2386, "tokens_trained": 0.226788136 }, { "epoch": 0.6774468085106383, "grad_norm": 1.1725387573242188, "loss": 5.1334, "lr": 0.0007360839160839161, "step": 2388, "tokens_trained": 0.22697924 }, { "epoch": 0.6780141843971631, "grad_norm": 1.0678813457489014, "loss": 5.1306, "lr": 0.0007358041958041957, "step": 2390, "tokens_trained": 0.227169576 }, { "epoch": 0.678581560283688, "grad_norm": 1.1266731023788452, "loss": 5.1956, "lr": 0.0007355244755244756, "step": 2392, "tokens_trained": 0.227361776 }, { "epoch": 0.6791489361702128, "grad_norm": 1.2048848867416382, "loss": 5.1599, "lr": 0.0007352447552447553, "step": 2394, "tokens_trained": 0.227551768 }, { "epoch": 0.6797163120567375, "grad_norm": 1.2414182424545288, "loss": 5.1836, "lr": 0.000734965034965035, "step": 2396, "tokens_trained": 0.227743072 }, { "epoch": 0.6802836879432624, "grad_norm": 1.1587010622024536, "loss": 5.1589, "lr": 0.0007346853146853147, "step": 2398, "tokens_trained": 0.227933848 }, { "epoch": 0.6808510638297872, "grad_norm": 1.1487596035003662, "loss": 5.1494, "lr": 0.0007344055944055944, "step": 2400, "tokens_trained": 0.228122304 }, { "epoch": 0.6814184397163121, "grad_norm": 1.1008368730545044, "loss": 5.1614, "lr": 0.0007341258741258742, "step": 2402, "tokens_trained": 0.228311624 }, { "epoch": 0.6819858156028369, "grad_norm": 1.0571539402008057, "loss": 5.1373, "lr": 0.0007338461538461538, "step": 2404, "tokens_trained": 0.228501208 }, { "epoch": 0.6825531914893617, "grad_norm": 1.1685987710952759, "loss": 5.1439, "lr": 0.0007335664335664336, "step": 2406, "tokens_trained": 0.228691272 }, { "epoch": 0.6831205673758866, "grad_norm": 1.2319012880325317, "loss": 5.1949, "lr": 0.0007332867132867132, "step": 2408, "tokens_trained": 0.228881608 }, { "epoch": 0.6836879432624113, "grad_norm": 1.1806107759475708, "loss": 5.1467, "lr": 0.0007330069930069931, "step": 2410, "tokens_trained": 0.229073152 }, { "epoch": 0.6842553191489362, "grad_norm": 1.1616697311401367, "loss": 5.1553, "lr": 0.0007327272727272728, "step": 2412, "tokens_trained": 0.229263656 }, { "epoch": 0.684822695035461, "grad_norm": 1.143112063407898, "loss": 5.091, "lr": 0.0007324475524475524, "step": 2414, "tokens_trained": 0.229454224 }, { "epoch": 0.6853900709219858, "grad_norm": 1.2467398643493652, "loss": 5.1778, "lr": 0.0007321678321678322, "step": 2416, "tokens_trained": 0.22964568 }, { "epoch": 0.6859574468085107, "grad_norm": 1.1989973783493042, "loss": 5.146, "lr": 0.0007318881118881119, "step": 2418, "tokens_trained": 0.229836448 }, { "epoch": 0.6865248226950355, "grad_norm": 1.3296927213668823, "loss": 5.1446, "lr": 0.0007316083916083917, "step": 2420, "tokens_trained": 0.230027424 }, { "epoch": 0.6870921985815602, "grad_norm": 1.256990671157837, "loss": 5.1396, "lr": 0.0007313286713286713, "step": 2422, "tokens_trained": 0.23022012 }, { "epoch": 0.6876595744680851, "grad_norm": 1.1474595069885254, "loss": 5.1263, "lr": 0.0007310489510489511, "step": 2424, "tokens_trained": 0.230410232 }, { "epoch": 0.6882269503546099, "grad_norm": 1.2070049047470093, "loss": 5.1169, "lr": 0.0007307692307692307, "step": 2426, "tokens_trained": 0.230601056 }, { "epoch": 0.6887943262411348, "grad_norm": 1.2047003507614136, "loss": 5.1146, "lr": 0.0007304895104895105, "step": 2428, "tokens_trained": 0.230791056 }, { "epoch": 0.6893617021276596, "grad_norm": 1.3246855735778809, "loss": 5.1864, "lr": 0.0007302097902097902, "step": 2430, "tokens_trained": 0.230981904 }, { "epoch": 0.6899290780141843, "grad_norm": 1.2012712955474854, "loss": 5.168, "lr": 0.0007299300699300699, "step": 2432, "tokens_trained": 0.231170976 }, { "epoch": 0.6904964539007092, "grad_norm": 1.2258418798446655, "loss": 5.14, "lr": 0.0007296503496503497, "step": 2434, "tokens_trained": 0.231362024 }, { "epoch": 0.691063829787234, "grad_norm": 1.2767595052719116, "loss": 5.1775, "lr": 0.0007293706293706294, "step": 2436, "tokens_trained": 0.23155 }, { "epoch": 0.6916312056737589, "grad_norm": 1.204324722290039, "loss": 5.1357, "lr": 0.0007290909090909092, "step": 2438, "tokens_trained": 0.231739944 }, { "epoch": 0.6921985815602837, "grad_norm": 1.1876553297042847, "loss": 5.1185, "lr": 0.0007288111888111888, "step": 2440, "tokens_trained": 0.231930448 }, { "epoch": 0.6927659574468085, "grad_norm": 1.2512568235397339, "loss": 5.1212, "lr": 0.0007285314685314685, "step": 2442, "tokens_trained": 0.23212152 }, { "epoch": 0.6933333333333334, "grad_norm": 1.2961020469665527, "loss": 5.0622, "lr": 0.0007282517482517482, "step": 2444, "tokens_trained": 0.232310856 }, { "epoch": 0.6939007092198581, "grad_norm": 1.1042410135269165, "loss": 5.1317, "lr": 0.000727972027972028, "step": 2446, "tokens_trained": 0.232499144 }, { "epoch": 0.694468085106383, "grad_norm": 1.0408610105514526, "loss": 5.1562, "lr": 0.0007276923076923077, "step": 2448, "tokens_trained": 0.232689864 }, { "epoch": 0.6950354609929078, "grad_norm": 1.1109600067138672, "loss": 5.1463, "lr": 0.0007274125874125874, "step": 2450, "tokens_trained": 0.232878912 }, { "epoch": 0.6956028368794326, "grad_norm": 1.0867618322372437, "loss": 5.105, "lr": 0.0007271328671328672, "step": 2452, "tokens_trained": 0.233069416 }, { "epoch": 0.6961702127659575, "grad_norm": 1.0342003107070923, "loss": 5.1431, "lr": 0.0007268531468531469, "step": 2454, "tokens_trained": 0.233258552 }, { "epoch": 0.6967375886524823, "grad_norm": 1.2264306545257568, "loss": 5.1646, "lr": 0.0007265734265734266, "step": 2456, "tokens_trained": 0.233448464 }, { "epoch": 0.6973049645390071, "grad_norm": 1.1715648174285889, "loss": 5.1194, "lr": 0.0007262937062937063, "step": 2458, "tokens_trained": 0.23364024 }, { "epoch": 0.6978723404255319, "grad_norm": 1.05716872215271, "loss": 5.09, "lr": 0.000726013986013986, "step": 2460, "tokens_trained": 0.233829848 }, { "epoch": 0.6984397163120567, "grad_norm": 1.1329678297042847, "loss": 5.1303, "lr": 0.0007257342657342657, "step": 2462, "tokens_trained": 0.234021368 }, { "epoch": 0.6990070921985816, "grad_norm": 1.2084178924560547, "loss": 5.1393, "lr": 0.0007254545454545455, "step": 2464, "tokens_trained": 0.234210264 }, { "epoch": 0.6995744680851064, "grad_norm": 1.0744361877441406, "loss": 5.1067, "lr": 0.0007251748251748252, "step": 2466, "tokens_trained": 0.234399616 }, { "epoch": 0.7001418439716312, "grad_norm": 1.1711128950119019, "loss": 5.1226, "lr": 0.0007248951048951049, "step": 2468, "tokens_trained": 0.234589936 }, { "epoch": 0.700709219858156, "grad_norm": 1.2188383340835571, "loss": 5.1139, "lr": 0.0007246153846153846, "step": 2470, "tokens_trained": 0.234781376 }, { "epoch": 0.7012765957446808, "grad_norm": 1.1662676334381104, "loss": 5.137, "lr": 0.0007243356643356644, "step": 2472, "tokens_trained": 0.234972192 }, { "epoch": 0.7018439716312057, "grad_norm": 1.18717622756958, "loss": 5.1665, "lr": 0.0007240559440559441, "step": 2474, "tokens_trained": 0.235162472 }, { "epoch": 0.7024113475177305, "grad_norm": 1.1546517610549927, "loss": 5.1503, "lr": 0.0007237762237762238, "step": 2476, "tokens_trained": 0.23535256 }, { "epoch": 0.7029787234042553, "grad_norm": 1.0647573471069336, "loss": 5.155, "lr": 0.0007234965034965035, "step": 2478, "tokens_trained": 0.235543424 }, { "epoch": 0.7035460992907802, "grad_norm": 1.1157219409942627, "loss": 5.1561, "lr": 0.0007232167832167831, "step": 2480, "tokens_trained": 0.23573568 }, { "epoch": 0.7041134751773049, "grad_norm": 1.1972934007644653, "loss": 5.1271, "lr": 0.000722937062937063, "step": 2482, "tokens_trained": 0.235927072 }, { "epoch": 0.7046808510638298, "grad_norm": 1.0370620489120483, "loss": 5.1016, "lr": 0.0007226573426573426, "step": 2484, "tokens_trained": 0.236116528 }, { "epoch": 0.7052482269503546, "grad_norm": 1.1389620304107666, "loss": 5.1422, "lr": 0.0007223776223776224, "step": 2486, "tokens_trained": 0.236305864 }, { "epoch": 0.7058156028368794, "grad_norm": 1.1045559644699097, "loss": 5.1434, "lr": 0.0007220979020979021, "step": 2488, "tokens_trained": 0.236494224 }, { "epoch": 0.7063829787234043, "grad_norm": 1.1014395952224731, "loss": 5.1462, "lr": 0.0007218181818181819, "step": 2490, "tokens_trained": 0.236684376 }, { "epoch": 0.706950354609929, "grad_norm": 1.0460759401321411, "loss": 5.126, "lr": 0.0007215384615384616, "step": 2492, "tokens_trained": 0.236875272 }, { "epoch": 0.707517730496454, "grad_norm": 1.0848767757415771, "loss": 5.1387, "lr": 0.0007212587412587412, "step": 2494, "tokens_trained": 0.237065552 }, { "epoch": 0.7080851063829787, "grad_norm": 1.1626802682876587, "loss": 5.1509, "lr": 0.000720979020979021, "step": 2496, "tokens_trained": 0.237254944 }, { "epoch": 0.7086524822695035, "grad_norm": 1.1846860647201538, "loss": 5.098, "lr": 0.0007206993006993006, "step": 2498, "tokens_trained": 0.237444488 }, { "epoch": 0.7092198581560284, "grad_norm": 1.2549248933792114, "loss": 5.1104, "lr": 0.0007204195804195805, "step": 2500, "tokens_trained": 0.237633528 }, { "epoch": 0.7092198581560284, "eval_loss": 5.141824245452881, "eval_runtime": 20.5081, "step": 2500, "tokens_trained": 0.237633528 }, { "epoch": 0.7097872340425532, "grad_norm": 1.19071626663208, "loss": 5.2249, "lr": 0.0007201398601398601, "step": 2502, "tokens_trained": 0.237823136 }, { "epoch": 0.7103546099290781, "grad_norm": 1.162804365158081, "loss": 5.1099, "lr": 0.0007198601398601399, "step": 2504, "tokens_trained": 0.238012752 }, { "epoch": 0.7109219858156028, "grad_norm": 1.0964027643203735, "loss": 5.1015, "lr": 0.0007195804195804196, "step": 2506, "tokens_trained": 0.238205472 }, { "epoch": 0.7114893617021276, "grad_norm": 1.0719815492630005, "loss": 5.1425, "lr": 0.0007193006993006994, "step": 2508, "tokens_trained": 0.238394848 }, { "epoch": 0.7120567375886525, "grad_norm": 1.1835323572158813, "loss": 5.0744, "lr": 0.0007190209790209791, "step": 2510, "tokens_trained": 0.238583408 }, { "epoch": 0.7126241134751773, "grad_norm": 1.0975273847579956, "loss": 5.0346, "lr": 0.0007187412587412587, "step": 2512, "tokens_trained": 0.238773544 }, { "epoch": 0.7131914893617022, "grad_norm": 1.1507470607757568, "loss": 5.146, "lr": 0.0007184615384615385, "step": 2514, "tokens_trained": 0.238962624 }, { "epoch": 0.713758865248227, "grad_norm": 1.1186292171478271, "loss": 5.1934, "lr": 0.0007181818181818181, "step": 2516, "tokens_trained": 0.239152848 }, { "epoch": 0.7143262411347517, "grad_norm": 1.0672920942306519, "loss": 5.1488, "lr": 0.000717902097902098, "step": 2518, "tokens_trained": 0.239344248 }, { "epoch": 0.7148936170212766, "grad_norm": 1.1226296424865723, "loss": 5.0799, "lr": 0.0007176223776223776, "step": 2520, "tokens_trained": 0.239535088 }, { "epoch": 0.7154609929078014, "grad_norm": 1.134265422821045, "loss": 5.1677, "lr": 0.0007173426573426573, "step": 2522, "tokens_trained": 0.23972356 }, { "epoch": 0.7160283687943262, "grad_norm": 1.1157846450805664, "loss": 5.1576, "lr": 0.0007170629370629371, "step": 2524, "tokens_trained": 0.239914104 }, { "epoch": 0.7165957446808511, "grad_norm": 1.096637487411499, "loss": 5.1512, "lr": 0.0007167832167832168, "step": 2526, "tokens_trained": 0.24010344 }, { "epoch": 0.7171631205673759, "grad_norm": 1.0092846155166626, "loss": 5.0907, "lr": 0.0007165034965034966, "step": 2528, "tokens_trained": 0.240294496 }, { "epoch": 0.7177304964539007, "grad_norm": 0.9926803112030029, "loss": 5.112, "lr": 0.0007162237762237762, "step": 2530, "tokens_trained": 0.240484752 }, { "epoch": 0.7182978723404255, "grad_norm": 1.031894326210022, "loss": 5.13, "lr": 0.000715944055944056, "step": 2532, "tokens_trained": 0.240674024 }, { "epoch": 0.7188652482269503, "grad_norm": 1.0606821775436401, "loss": 5.1229, "lr": 0.0007156643356643356, "step": 2534, "tokens_trained": 0.24086436 }, { "epoch": 0.7194326241134752, "grad_norm": 1.0486221313476562, "loss": 5.1179, "lr": 0.0007153846153846155, "step": 2536, "tokens_trained": 0.241052096 }, { "epoch": 0.72, "grad_norm": 1.073940396308899, "loss": 5.1147, "lr": 0.0007151048951048951, "step": 2538, "tokens_trained": 0.241242064 }, { "epoch": 0.7205673758865249, "grad_norm": 1.0888422727584839, "loss": 5.1442, "lr": 0.0007148251748251748, "step": 2540, "tokens_trained": 0.241429472 }, { "epoch": 0.7211347517730496, "grad_norm": 1.0362575054168701, "loss": 5.1482, "lr": 0.0007145454545454546, "step": 2542, "tokens_trained": 0.241619464 }, { "epoch": 0.7217021276595744, "grad_norm": 1.020987629890442, "loss": 5.1809, "lr": 0.0007142657342657343, "step": 2544, "tokens_trained": 0.241810584 }, { "epoch": 0.7222695035460993, "grad_norm": 1.1145941019058228, "loss": 5.07, "lr": 0.0007139860139860141, "step": 2546, "tokens_trained": 0.242001336 }, { "epoch": 0.7228368794326241, "grad_norm": 1.114311933517456, "loss": 5.1288, "lr": 0.0007137062937062937, "step": 2548, "tokens_trained": 0.242191648 }, { "epoch": 0.723404255319149, "grad_norm": 1.2127752304077148, "loss": 5.1414, "lr": 0.0007134265734265734, "step": 2550, "tokens_trained": 0.2423814 }, { "epoch": 0.7239716312056738, "grad_norm": 1.2173429727554321, "loss": 5.0843, "lr": 0.0007131468531468531, "step": 2552, "tokens_trained": 0.242571344 }, { "epoch": 0.7245390070921985, "grad_norm": 1.269544005393982, "loss": 5.0945, "lr": 0.0007128671328671329, "step": 2554, "tokens_trained": 0.242760304 }, { "epoch": 0.7251063829787234, "grad_norm": 1.1891573667526245, "loss": 5.1301, "lr": 0.0007125874125874126, "step": 2556, "tokens_trained": 0.242950432 }, { "epoch": 0.7256737588652482, "grad_norm": 1.1826258897781372, "loss": 5.1463, "lr": 0.0007123076923076923, "step": 2558, "tokens_trained": 0.243140944 }, { "epoch": 0.7262411347517731, "grad_norm": 1.0478367805480957, "loss": 5.1082, "lr": 0.0007120279720279721, "step": 2560, "tokens_trained": 0.243331192 }, { "epoch": 0.7268085106382979, "grad_norm": 1.05866539478302, "loss": 5.135, "lr": 0.0007117482517482518, "step": 2562, "tokens_trained": 0.243519712 }, { "epoch": 0.7273758865248227, "grad_norm": 1.1300735473632812, "loss": 5.0985, "lr": 0.0007114685314685315, "step": 2564, "tokens_trained": 0.243710408 }, { "epoch": 0.7279432624113475, "grad_norm": 1.0662705898284912, "loss": 5.1482, "lr": 0.0007111888111888112, "step": 2566, "tokens_trained": 0.243899576 }, { "epoch": 0.7285106382978723, "grad_norm": 1.0905804634094238, "loss": 5.103, "lr": 0.0007109090909090909, "step": 2568, "tokens_trained": 0.244090984 }, { "epoch": 0.7290780141843972, "grad_norm": 1.2062023878097534, "loss": 5.1318, "lr": 0.0007106293706293706, "step": 2570, "tokens_trained": 0.244280584 }, { "epoch": 0.729645390070922, "grad_norm": 1.0444546937942505, "loss": 5.144, "lr": 0.0007103496503496504, "step": 2572, "tokens_trained": 0.244471384 }, { "epoch": 0.7302127659574468, "grad_norm": 1.0395665168762207, "loss": 5.0944, "lr": 0.0007100699300699301, "step": 2574, "tokens_trained": 0.24466056 }, { "epoch": 0.7307801418439717, "grad_norm": 1.0630977153778076, "loss": 5.1038, "lr": 0.0007097902097902098, "step": 2576, "tokens_trained": 0.2448524 }, { "epoch": 0.7313475177304964, "grad_norm": 1.1561299562454224, "loss": 5.1544, "lr": 0.0007095104895104895, "step": 2578, "tokens_trained": 0.245042104 }, { "epoch": 0.7319148936170212, "grad_norm": 1.1774277687072754, "loss": 5.1366, "lr": 0.0007092307692307692, "step": 2580, "tokens_trained": 0.245231832 }, { "epoch": 0.7324822695035461, "grad_norm": 1.2139825820922852, "loss": 5.1195, "lr": 0.000708951048951049, "step": 2582, "tokens_trained": 0.24542076 }, { "epoch": 0.7330496453900709, "grad_norm": 1.1340903043746948, "loss": 5.1476, "lr": 0.0007086713286713287, "step": 2584, "tokens_trained": 0.245613128 }, { "epoch": 0.7336170212765958, "grad_norm": 1.2109994888305664, "loss": 5.1359, "lr": 0.0007083916083916084, "step": 2586, "tokens_trained": 0.245803992 }, { "epoch": 0.7341843971631206, "grad_norm": 1.1087621450424194, "loss": 5.1287, "lr": 0.000708111888111888, "step": 2588, "tokens_trained": 0.245994816 }, { "epoch": 0.7347517730496453, "grad_norm": 1.206106424331665, "loss": 5.1618, "lr": 0.0007078321678321679, "step": 2590, "tokens_trained": 0.246183624 }, { "epoch": 0.7353191489361702, "grad_norm": 1.0370070934295654, "loss": 5.1103, "lr": 0.0007075524475524475, "step": 2592, "tokens_trained": 0.246375232 }, { "epoch": 0.735886524822695, "grad_norm": 0.9844968914985657, "loss": 5.1266, "lr": 0.0007072727272727273, "step": 2594, "tokens_trained": 0.246565048 }, { "epoch": 0.7364539007092199, "grad_norm": 1.0623670816421509, "loss": 5.1341, "lr": 0.000706993006993007, "step": 2596, "tokens_trained": 0.246754136 }, { "epoch": 0.7370212765957447, "grad_norm": 1.1878798007965088, "loss": 5.1178, "lr": 0.0007067132867132867, "step": 2598, "tokens_trained": 0.246944496 }, { "epoch": 0.7375886524822695, "grad_norm": 1.045849323272705, "loss": 5.1151, "lr": 0.0007064335664335665, "step": 2600, "tokens_trained": 0.247135616 }, { "epoch": 0.7381560283687943, "grad_norm": 1.1081782579421997, "loss": 5.0699, "lr": 0.0007061538461538462, "step": 2602, "tokens_trained": 0.247326864 }, { "epoch": 0.7387234042553191, "grad_norm": 1.0893741846084595, "loss": 5.0967, "lr": 0.0007058741258741259, "step": 2604, "tokens_trained": 0.247515736 }, { "epoch": 0.739290780141844, "grad_norm": 1.128481149673462, "loss": 5.1136, "lr": 0.0007055944055944055, "step": 2606, "tokens_trained": 0.24770688 }, { "epoch": 0.7398581560283688, "grad_norm": 1.0735145807266235, "loss": 5.1127, "lr": 0.0007053146853146854, "step": 2608, "tokens_trained": 0.247897584 }, { "epoch": 0.7404255319148936, "grad_norm": 1.0027481317520142, "loss": 5.1157, "lr": 0.000705034965034965, "step": 2610, "tokens_trained": 0.248088352 }, { "epoch": 0.7409929078014185, "grad_norm": 1.0782684087753296, "loss": 5.1268, "lr": 0.0007047552447552448, "step": 2612, "tokens_trained": 0.248277752 }, { "epoch": 0.7415602836879432, "grad_norm": 1.0961271524429321, "loss": 5.1024, "lr": 0.0007044755244755245, "step": 2614, "tokens_trained": 0.248466504 }, { "epoch": 0.7421276595744681, "grad_norm": 0.9727640151977539, "loss": 5.067, "lr": 0.0007041958041958041, "step": 2616, "tokens_trained": 0.248657896 }, { "epoch": 0.7426950354609929, "grad_norm": 0.9756829738616943, "loss": 5.1326, "lr": 0.000703916083916084, "step": 2618, "tokens_trained": 0.248849288 }, { "epoch": 0.7432624113475177, "grad_norm": 0.9990546703338623, "loss": 5.2016, "lr": 0.0007036363636363636, "step": 2620, "tokens_trained": 0.24903988 }, { "epoch": 0.7438297872340426, "grad_norm": 1.062199592590332, "loss": 5.1517, "lr": 0.0007033566433566434, "step": 2622, "tokens_trained": 0.24922872 }, { "epoch": 0.7443971631205674, "grad_norm": 1.138197422027588, "loss": 5.1052, "lr": 0.000703076923076923, "step": 2624, "tokens_trained": 0.249420464 }, { "epoch": 0.7446808510638298, "eval_loss": 5.127779960632324, "eval_runtime": 20.9141, "step": 2625, "tokens_trained": 0.249516464 }, { "epoch": 0.7449645390070923, "grad_norm": 1.1704756021499634, "loss": 5.1167, "lr": 0.0007027972027972029, "step": 2626, "tokens_trained": 0.249612824 }, { "epoch": 0.745531914893617, "grad_norm": 1.067280888557434, "loss": 5.0877, "lr": 0.0007025174825174825, "step": 2628, "tokens_trained": 0.249801672 }, { "epoch": 0.7460992907801418, "grad_norm": 1.0734069347381592, "loss": 5.091, "lr": 0.0007022377622377623, "step": 2630, "tokens_trained": 0.249993136 }, { "epoch": 0.7466666666666667, "grad_norm": 1.0817586183547974, "loss": 5.0894, "lr": 0.000701958041958042, "step": 2632, "tokens_trained": 0.25018232 }, { "epoch": 0.7472340425531915, "grad_norm": 1.0738139152526855, "loss": 5.1141, "lr": 0.0007016783216783216, "step": 2634, "tokens_trained": 0.250373456 }, { "epoch": 0.7478014184397163, "grad_norm": 1.0292818546295166, "loss": 5.0746, "lr": 0.0007013986013986015, "step": 2636, "tokens_trained": 0.250563552 }, { "epoch": 0.7483687943262411, "grad_norm": 1.0308977365493774, "loss": 5.1346, "lr": 0.0007011188811188811, "step": 2638, "tokens_trained": 0.25075176 }, { "epoch": 0.7489361702127659, "grad_norm": 1.0287693738937378, "loss": 5.1137, "lr": 0.0007008391608391609, "step": 2640, "tokens_trained": 0.250939456 }, { "epoch": 0.7495035460992908, "grad_norm": 1.043565273284912, "loss": 5.1251, "lr": 0.0007005594405594405, "step": 2642, "tokens_trained": 0.251130456 }, { "epoch": 0.7500709219858156, "grad_norm": 1.0977740287780762, "loss": 5.0959, "lr": 0.0007002797202797204, "step": 2644, "tokens_trained": 0.251320016 }, { "epoch": 0.7506382978723404, "grad_norm": 1.0304359197616577, "loss": 5.0893, "lr": 0.0007, "step": 2646, "tokens_trained": 0.251509824 }, { "epoch": 0.7512056737588653, "grad_norm": 1.0331344604492188, "loss": 5.1238, "lr": 0.0006997202797202797, "step": 2648, "tokens_trained": 0.251700504 }, { "epoch": 0.75177304964539, "grad_norm": 1.0405573844909668, "loss": 5.1301, "lr": 0.0006994405594405595, "step": 2650, "tokens_trained": 0.251890936 }, { "epoch": 0.7523404255319149, "grad_norm": 1.0685805082321167, "loss": 5.1354, "lr": 0.0006991608391608391, "step": 2652, "tokens_trained": 0.252081296 }, { "epoch": 0.7529078014184397, "grad_norm": 1.0597950220108032, "loss": 5.1229, "lr": 0.000698881118881119, "step": 2654, "tokens_trained": 0.252270456 }, { "epoch": 0.7534751773049645, "grad_norm": 1.0094919204711914, "loss": 5.1077, "lr": 0.0006986013986013986, "step": 2656, "tokens_trained": 0.252459416 }, { "epoch": 0.7540425531914894, "grad_norm": 1.0850694179534912, "loss": 5.0876, "lr": 0.0006983216783216784, "step": 2658, "tokens_trained": 0.252649656 }, { "epoch": 0.7546099290780142, "grad_norm": 1.0182054042816162, "loss": 5.0842, "lr": 0.000698041958041958, "step": 2660, "tokens_trained": 0.252840488 }, { "epoch": 0.755177304964539, "grad_norm": 1.074000597000122, "loss": 5.1387, "lr": 0.0006977622377622378, "step": 2662, "tokens_trained": 0.253030672 }, { "epoch": 0.7557446808510638, "grad_norm": 1.1259658336639404, "loss": 5.1334, "lr": 0.0006974825174825175, "step": 2664, "tokens_trained": 0.253221976 }, { "epoch": 0.7563120567375886, "grad_norm": 1.0146551132202148, "loss": 5.0995, "lr": 0.0006972027972027972, "step": 2666, "tokens_trained": 0.253414352 }, { "epoch": 0.7568794326241135, "grad_norm": 1.1268185377120972, "loss": 5.1201, "lr": 0.000696923076923077, "step": 2668, "tokens_trained": 0.25360448 }, { "epoch": 0.7574468085106383, "grad_norm": 1.025431752204895, "loss": 5.0546, "lr": 0.0006966433566433566, "step": 2670, "tokens_trained": 0.253791368 }, { "epoch": 0.7580141843971632, "grad_norm": 1.108112096786499, "loss": 5.0917, "lr": 0.0006963636363636365, "step": 2672, "tokens_trained": 0.253982112 }, { "epoch": 0.758581560283688, "grad_norm": 1.1009857654571533, "loss": 5.1447, "lr": 0.0006960839160839161, "step": 2674, "tokens_trained": 0.254173328 }, { "epoch": 0.7591489361702127, "grad_norm": 1.0718492269515991, "loss": 5.1093, "lr": 0.0006958041958041958, "step": 2676, "tokens_trained": 0.254363624 }, { "epoch": 0.7597163120567376, "grad_norm": 1.0715916156768799, "loss": 5.1287, "lr": 0.0006955244755244755, "step": 2678, "tokens_trained": 0.25455316 }, { "epoch": 0.7602836879432624, "grad_norm": 1.0953240394592285, "loss": 5.1031, "lr": 0.0006952447552447553, "step": 2680, "tokens_trained": 0.254742424 }, { "epoch": 0.7608510638297873, "grad_norm": 1.0574376583099365, "loss": 5.1316, "lr": 0.000694965034965035, "step": 2682, "tokens_trained": 0.254933624 }, { "epoch": 0.7614184397163121, "grad_norm": 1.1887143850326538, "loss": 5.1261, "lr": 0.0006946853146853147, "step": 2684, "tokens_trained": 0.255124424 }, { "epoch": 0.7619858156028368, "grad_norm": 1.0359193086624146, "loss": 5.1584, "lr": 0.0006944055944055943, "step": 2686, "tokens_trained": 0.255314344 }, { "epoch": 0.7625531914893617, "grad_norm": 1.1207493543624878, "loss": 5.1496, "lr": 0.0006941258741258741, "step": 2688, "tokens_trained": 0.255503192 }, { "epoch": 0.7631205673758865, "grad_norm": 1.1609482765197754, "loss": 5.1403, "lr": 0.0006938461538461539, "step": 2690, "tokens_trained": 0.25569088 }, { "epoch": 0.7636879432624113, "grad_norm": 1.0204665660858154, "loss": 5.0891, "lr": 0.0006935664335664336, "step": 2692, "tokens_trained": 0.255880216 }, { "epoch": 0.7642553191489362, "grad_norm": 1.064090371131897, "loss": 5.0507, "lr": 0.0006932867132867133, "step": 2694, "tokens_trained": 0.256070744 }, { "epoch": 0.764822695035461, "grad_norm": 1.1102992296218872, "loss": 5.062, "lr": 0.000693006993006993, "step": 2696, "tokens_trained": 0.256261136 }, { "epoch": 0.7653900709219859, "grad_norm": 1.0316580533981323, "loss": 5.0933, "lr": 0.0006927272727272728, "step": 2698, "tokens_trained": 0.256452032 }, { "epoch": 0.7659574468085106, "grad_norm": 1.0681291818618774, "loss": 5.088, "lr": 0.0006924475524475524, "step": 2700, "tokens_trained": 0.25664152 }, { "epoch": 0.7665248226950354, "grad_norm": 1.1148093938827515, "loss": 5.0389, "lr": 0.0006921678321678322, "step": 2702, "tokens_trained": 0.25683068 }, { "epoch": 0.7670921985815603, "grad_norm": 1.0831029415130615, "loss": 5.1181, "lr": 0.0006918881118881118, "step": 2704, "tokens_trained": 0.257020752 }, { "epoch": 0.7676595744680851, "grad_norm": 1.0877745151519775, "loss": 5.1822, "lr": 0.0006916083916083916, "step": 2706, "tokens_trained": 0.257209136 }, { "epoch": 0.76822695035461, "grad_norm": 1.0823218822479248, "loss": 5.0855, "lr": 0.0006913286713286714, "step": 2708, "tokens_trained": 0.257398504 }, { "epoch": 0.7687943262411348, "grad_norm": 1.0309520959854126, "loss": 5.141, "lr": 0.0006910489510489511, "step": 2710, "tokens_trained": 0.257589568 }, { "epoch": 0.7693617021276595, "grad_norm": 1.0433647632598877, "loss": 5.057, "lr": 0.0006907692307692308, "step": 2712, "tokens_trained": 0.257781368 }, { "epoch": 0.7699290780141844, "grad_norm": 1.05474054813385, "loss": 5.0639, "lr": 0.0006904895104895104, "step": 2714, "tokens_trained": 0.25797212 }, { "epoch": 0.7704964539007092, "grad_norm": 1.0005548000335693, "loss": 5.1155, "lr": 0.0006902097902097903, "step": 2716, "tokens_trained": 0.25815968 }, { "epoch": 0.7710638297872341, "grad_norm": 0.9644413590431213, "loss": 5.1092, "lr": 0.0006899300699300699, "step": 2718, "tokens_trained": 0.258350192 }, { "epoch": 0.7716312056737589, "grad_norm": 1.0715434551239014, "loss": 5.0827, "lr": 0.0006896503496503497, "step": 2720, "tokens_trained": 0.258539872 }, { "epoch": 0.7721985815602836, "grad_norm": 1.0799431800842285, "loss": 5.1489, "lr": 0.0006893706293706293, "step": 2722, "tokens_trained": 0.258728696 }, { "epoch": 0.7727659574468085, "grad_norm": 1.0224812030792236, "loss": 5.0897, "lr": 0.0006890909090909091, "step": 2724, "tokens_trained": 0.258918368 }, { "epoch": 0.7733333333333333, "grad_norm": 1.2171430587768555, "loss": 5.1283, "lr": 0.0006888111888111889, "step": 2726, "tokens_trained": 0.259107072 }, { "epoch": 0.7739007092198582, "grad_norm": 1.0420043468475342, "loss": 5.1325, "lr": 0.0006885314685314685, "step": 2728, "tokens_trained": 0.259297744 }, { "epoch": 0.774468085106383, "grad_norm": 1.0326933860778809, "loss": 5.1543, "lr": 0.0006882517482517483, "step": 2730, "tokens_trained": 0.259486832 }, { "epoch": 0.7750354609929078, "grad_norm": 1.1191221475601196, "loss": 5.1182, "lr": 0.0006879720279720279, "step": 2732, "tokens_trained": 0.2596774 }, { "epoch": 0.7756028368794327, "grad_norm": 1.089678168296814, "loss": 5.09, "lr": 0.0006876923076923078, "step": 2734, "tokens_trained": 0.259868248 }, { "epoch": 0.7761702127659574, "grad_norm": 1.0944526195526123, "loss": 5.1519, "lr": 0.0006874125874125874, "step": 2736, "tokens_trained": 0.260056992 }, { "epoch": 0.7767375886524823, "grad_norm": 1.0774682760238647, "loss": 5.0998, "lr": 0.0006871328671328672, "step": 2738, "tokens_trained": 0.2602478 }, { "epoch": 0.7773049645390071, "grad_norm": 1.0795758962631226, "loss": 5.1483, "lr": 0.0006868531468531468, "step": 2740, "tokens_trained": 0.260435896 }, { "epoch": 0.7778723404255319, "grad_norm": 1.229885458946228, "loss": 5.0991, "lr": 0.0006865734265734265, "step": 2742, "tokens_trained": 0.260624176 }, { "epoch": 0.7784397163120568, "grad_norm": 1.2816888093948364, "loss": 5.1131, "lr": 0.0006862937062937064, "step": 2744, "tokens_trained": 0.26081632 }, { "epoch": 0.7790070921985816, "grad_norm": 1.127356767654419, "loss": 5.0589, "lr": 0.000686013986013986, "step": 2746, "tokens_trained": 0.261003088 }, { "epoch": 0.7795744680851063, "grad_norm": 1.073644995689392, "loss": 5.1402, "lr": 0.0006857342657342658, "step": 2748, "tokens_trained": 0.261192088 }, { "epoch": 0.7801418439716312, "grad_norm": 1.0892105102539062, "loss": 5.1231, "lr": 0.0006854545454545454, "step": 2750, "tokens_trained": 0.261381504 }, { "epoch": 0.7801418439716312, "eval_loss": 5.11714506149292, "eval_runtime": 20.9289, "step": 2750, "tokens_trained": 0.261381504 }, { "epoch": 0.780709219858156, "grad_norm": 1.2366212606430054, "loss": 5.079, "lr": 0.0006851748251748253, "step": 2752, "tokens_trained": 0.261572936 }, { "epoch": 0.7812765957446809, "grad_norm": 1.2283895015716553, "loss": 5.0414, "lr": 0.0006848951048951049, "step": 2754, "tokens_trained": 0.26176184 }, { "epoch": 0.7818439716312057, "grad_norm": 1.2296546697616577, "loss": 5.0758, "lr": 0.0006846153846153846, "step": 2756, "tokens_trained": 0.261952224 }, { "epoch": 0.7824113475177304, "grad_norm": 1.1455234289169312, "loss": 5.0903, "lr": 0.0006843356643356643, "step": 2758, "tokens_trained": 0.262142128 }, { "epoch": 0.7829787234042553, "grad_norm": 0.9795711040496826, "loss": 5.1101, "lr": 0.000684055944055944, "step": 2760, "tokens_trained": 0.262331464 }, { "epoch": 0.7835460992907801, "grad_norm": 1.1363111734390259, "loss": 5.0948, "lr": 0.0006837762237762239, "step": 2762, "tokens_trained": 0.262523048 }, { "epoch": 0.784113475177305, "grad_norm": 1.0878827571868896, "loss": 5.0942, "lr": 0.0006834965034965035, "step": 2764, "tokens_trained": 0.26271264 }, { "epoch": 0.7846808510638298, "grad_norm": 1.1213501691818237, "loss": 5.0863, "lr": 0.0006832167832167833, "step": 2766, "tokens_trained": 0.262903952 }, { "epoch": 0.7852482269503546, "grad_norm": 1.1156904697418213, "loss": 5.1835, "lr": 0.0006829370629370629, "step": 2768, "tokens_trained": 0.2630932 }, { "epoch": 0.7858156028368795, "grad_norm": 1.2105063199996948, "loss": 5.152, "lr": 0.0006826573426573427, "step": 2770, "tokens_trained": 0.2632822 }, { "epoch": 0.7863829787234042, "grad_norm": 1.056512475013733, "loss": 5.129, "lr": 0.0006823776223776224, "step": 2772, "tokens_trained": 0.263471056 }, { "epoch": 0.7869503546099291, "grad_norm": 1.124480128288269, "loss": 5.1122, "lr": 0.0006820979020979021, "step": 2774, "tokens_trained": 0.26365952 }, { "epoch": 0.7875177304964539, "grad_norm": 1.1403707265853882, "loss": 5.1283, "lr": 0.0006818181818181818, "step": 2776, "tokens_trained": 0.263850128 }, { "epoch": 0.7880851063829787, "grad_norm": 1.0712953805923462, "loss": 5.0901, "lr": 0.0006815384615384615, "step": 2778, "tokens_trained": 0.264036944 }, { "epoch": 0.7886524822695036, "grad_norm": 1.1485860347747803, "loss": 5.0673, "lr": 0.0006812587412587414, "step": 2780, "tokens_trained": 0.2642284 }, { "epoch": 0.7892198581560284, "grad_norm": 1.144534945487976, "loss": 5.0939, "lr": 0.000680979020979021, "step": 2782, "tokens_trained": 0.264417248 }, { "epoch": 0.7897872340425532, "grad_norm": 1.0953861474990845, "loss": 5.0998, "lr": 0.0006806993006993007, "step": 2784, "tokens_trained": 0.264605776 }, { "epoch": 0.790354609929078, "grad_norm": 1.0519598722457886, "loss": 5.1059, "lr": 0.0006804195804195804, "step": 2786, "tokens_trained": 0.264795928 }, { "epoch": 0.7909219858156028, "grad_norm": 1.064609408378601, "loss": 5.1017, "lr": 0.0006801398601398602, "step": 2788, "tokens_trained": 0.264986 }, { "epoch": 0.7914893617021277, "grad_norm": 1.0485059022903442, "loss": 5.0636, "lr": 0.0006798601398601399, "step": 2790, "tokens_trained": 0.265176936 }, { "epoch": 0.7920567375886525, "grad_norm": 1.1277351379394531, "loss": 5.0689, "lr": 0.0006795804195804196, "step": 2792, "tokens_trained": 0.265366584 }, { "epoch": 0.7926241134751772, "grad_norm": 1.0692890882492065, "loss": 5.0922, "lr": 0.0006793006993006992, "step": 2794, "tokens_trained": 0.265557456 }, { "epoch": 0.7931914893617021, "grad_norm": 0.9836872220039368, "loss": 5.0702, "lr": 0.000679020979020979, "step": 2796, "tokens_trained": 0.265747056 }, { "epoch": 0.7937588652482269, "grad_norm": 1.0450890064239502, "loss": 5.0778, "lr": 0.0006787412587412588, "step": 2798, "tokens_trained": 0.265935536 }, { "epoch": 0.7943262411347518, "grad_norm": 1.017853856086731, "loss": 5.1401, "lr": 0.0006784615384615385, "step": 2800, "tokens_trained": 0.266124376 }, { "epoch": 0.7948936170212766, "grad_norm": 0.9698541760444641, "loss": 5.0882, "lr": 0.0006781818181818182, "step": 2802, "tokens_trained": 0.266312192 }, { "epoch": 0.7954609929078014, "grad_norm": 0.9696250557899475, "loss": 5.1424, "lr": 0.0006779020979020979, "step": 2804, "tokens_trained": 0.266503584 }, { "epoch": 0.7960283687943263, "grad_norm": 1.011576533317566, "loss": 5.062, "lr": 0.0006776223776223777, "step": 2806, "tokens_trained": 0.266693776 }, { "epoch": 0.796595744680851, "grad_norm": 0.9681981801986694, "loss": 5.1343, "lr": 0.0006773426573426574, "step": 2808, "tokens_trained": 0.26688324 }, { "epoch": 0.7971631205673759, "grad_norm": 0.9778586626052856, "loss": 5.0619, "lr": 0.0006770629370629371, "step": 2810, "tokens_trained": 0.267072 }, { "epoch": 0.7977304964539007, "grad_norm": 0.9624539613723755, "loss": 5.0943, "lr": 0.0006767832167832167, "step": 2812, "tokens_trained": 0.267260184 }, { "epoch": 0.7982978723404255, "grad_norm": 1.0591245889663696, "loss": 5.101, "lr": 0.0006765034965034965, "step": 2814, "tokens_trained": 0.267450632 }, { "epoch": 0.7988652482269504, "grad_norm": 1.0650452375411987, "loss": 5.0754, "lr": 0.0006762237762237763, "step": 2816, "tokens_trained": 0.267641848 }, { "epoch": 0.7994326241134752, "grad_norm": 1.0241055488586426, "loss": 5.113, "lr": 0.000675944055944056, "step": 2818, "tokens_trained": 0.267831232 }, { "epoch": 0.8, "grad_norm": 0.9588684439659119, "loss": 5.1124, "lr": 0.0006756643356643357, "step": 2820, "tokens_trained": 0.268022016 }, { "epoch": 0.8005673758865248, "grad_norm": 1.0146323442459106, "loss": 5.0773, "lr": 0.0006753846153846153, "step": 2822, "tokens_trained": 0.268211504 }, { "epoch": 0.8011347517730496, "grad_norm": 1.040366530418396, "loss": 5.0735, "lr": 0.0006751048951048951, "step": 2824, "tokens_trained": 0.268400704 }, { "epoch": 0.8017021276595745, "grad_norm": 1.0419392585754395, "loss": 5.1243, "lr": 0.0006748251748251748, "step": 2826, "tokens_trained": 0.268592936 }, { "epoch": 0.8022695035460993, "grad_norm": 1.0807193517684937, "loss": 5.0938, "lr": 0.0006745454545454546, "step": 2828, "tokens_trained": 0.26878236 }, { "epoch": 0.8028368794326242, "grad_norm": 1.0357084274291992, "loss": 5.138, "lr": 0.0006742657342657342, "step": 2830, "tokens_trained": 0.268973808 }, { "epoch": 0.8034042553191489, "grad_norm": 1.0543837547302246, "loss": 5.1219, "lr": 0.000673986013986014, "step": 2832, "tokens_trained": 0.269163576 }, { "epoch": 0.8039716312056737, "grad_norm": 0.9575244188308716, "loss": 5.0388, "lr": 0.0006737062937062938, "step": 2834, "tokens_trained": 0.26935304 }, { "epoch": 0.8045390070921986, "grad_norm": 1.0559078454971313, "loss": 5.1569, "lr": 0.0006734265734265734, "step": 2836, "tokens_trained": 0.269542488 }, { "epoch": 0.8051063829787234, "grad_norm": 1.1365549564361572, "loss": 5.1392, "lr": 0.0006731468531468532, "step": 2838, "tokens_trained": 0.269732336 }, { "epoch": 0.8056737588652483, "grad_norm": 1.0022294521331787, "loss": 5.1017, "lr": 0.0006728671328671328, "step": 2840, "tokens_trained": 0.269922384 }, { "epoch": 0.8062411347517731, "grad_norm": 0.9790627360343933, "loss": 5.1443, "lr": 0.0006725874125874126, "step": 2842, "tokens_trained": 0.270111096 }, { "epoch": 0.8068085106382978, "grad_norm": 1.0328103303909302, "loss": 5.087, "lr": 0.0006723076923076923, "step": 2844, "tokens_trained": 0.27030036 }, { "epoch": 0.8073758865248227, "grad_norm": 1.0813841819763184, "loss": 5.0995, "lr": 0.0006720279720279721, "step": 2846, "tokens_trained": 0.270490936 }, { "epoch": 0.8079432624113475, "grad_norm": 1.1210085153579712, "loss": 5.0929, "lr": 0.0006717482517482517, "step": 2848, "tokens_trained": 0.27067904 }, { "epoch": 0.8085106382978723, "grad_norm": 1.10624361038208, "loss": 5.0861, "lr": 0.0006714685314685314, "step": 2850, "tokens_trained": 0.270869664 }, { "epoch": 0.8090780141843972, "grad_norm": 0.9984250664710999, "loss": 5.1126, "lr": 0.0006711888111888113, "step": 2852, "tokens_trained": 0.271059912 }, { "epoch": 0.809645390070922, "grad_norm": 1.0100075006484985, "loss": 5.0131, "lr": 0.0006709090909090909, "step": 2854, "tokens_trained": 0.271248128 }, { "epoch": 0.8102127659574468, "grad_norm": 1.0718857049942017, "loss": 5.0978, "lr": 0.0006706293706293707, "step": 2856, "tokens_trained": 0.271437752 }, { "epoch": 0.8107801418439716, "grad_norm": 1.0277525186538696, "loss": 5.022, "lr": 0.0006703496503496503, "step": 2858, "tokens_trained": 0.271627272 }, { "epoch": 0.8113475177304964, "grad_norm": 1.1056699752807617, "loss": 5.1678, "lr": 0.0006700699300699301, "step": 2860, "tokens_trained": 0.271815032 }, { "epoch": 0.8119148936170213, "grad_norm": 0.9853792190551758, "loss": 5.1511, "lr": 0.0006697902097902098, "step": 2862, "tokens_trained": 0.272004128 }, { "epoch": 0.8124822695035461, "grad_norm": 1.0207619667053223, "loss": 5.0681, "lr": 0.0006695104895104895, "step": 2864, "tokens_trained": 0.272193024 }, { "epoch": 0.813049645390071, "grad_norm": 1.0080488920211792, "loss": 5.0712, "lr": 0.0006692307692307692, "step": 2866, "tokens_trained": 0.27238368 }, { "epoch": 0.8136170212765957, "grad_norm": 1.1197504997253418, "loss": 5.0545, "lr": 0.0006689510489510489, "step": 2868, "tokens_trained": 0.272573416 }, { "epoch": 0.8141843971631205, "grad_norm": 1.0667881965637207, "loss": 5.0492, "lr": 0.0006686713286713288, "step": 2870, "tokens_trained": 0.272762152 }, { "epoch": 0.8147517730496454, "grad_norm": 1.0861417055130005, "loss": 5.1656, "lr": 0.0006683916083916084, "step": 2872, "tokens_trained": 0.272951384 }, { "epoch": 0.8153191489361702, "grad_norm": 0.9590932130813599, "loss": 5.0987, "lr": 0.0006681118881118882, "step": 2874, "tokens_trained": 0.273142368 }, { "epoch": 0.8156028368794326, "eval_loss": 5.098834037780762, "eval_runtime": 21.1414, "step": 2875, "tokens_trained": 0.273239384 }, { "epoch": 0.8158865248226951, "grad_norm": 0.9762487411499023, "loss": 5.0346, "lr": 0.0006678321678321678, "step": 2876, "tokens_trained": 0.273334 }, { "epoch": 0.8164539007092199, "grad_norm": 1.059070110321045, "loss": 5.0466, "lr": 0.0006675524475524475, "step": 2878, "tokens_trained": 0.2735232 }, { "epoch": 0.8170212765957446, "grad_norm": 1.0384489297866821, "loss": 5.0647, "lr": 0.0006672727272727273, "step": 2880, "tokens_trained": 0.27371452 }, { "epoch": 0.8175886524822695, "grad_norm": 1.0188980102539062, "loss": 5.135, "lr": 0.000666993006993007, "step": 2882, "tokens_trained": 0.273903312 }, { "epoch": 0.8181560283687943, "grad_norm": 1.0437567234039307, "loss": 5.1251, "lr": 0.0006667132867132867, "step": 2884, "tokens_trained": 0.27409364 }, { "epoch": 0.8187234042553192, "grad_norm": 1.040148138999939, "loss": 5.0829, "lr": 0.0006664335664335664, "step": 2886, "tokens_trained": 0.274283392 }, { "epoch": 0.819290780141844, "grad_norm": 0.9796963930130005, "loss": 5.1062, "lr": 0.0006661538461538463, "step": 2888, "tokens_trained": 0.27447272 }, { "epoch": 0.8198581560283688, "grad_norm": 1.0791646242141724, "loss": 5.0677, "lr": 0.0006658741258741259, "step": 2890, "tokens_trained": 0.274661656 }, { "epoch": 0.8204255319148936, "grad_norm": 1.075614094734192, "loss": 5.0932, "lr": 0.0006655944055944056, "step": 2892, "tokens_trained": 0.274851216 }, { "epoch": 0.8209929078014184, "grad_norm": 1.0696609020233154, "loss": 5.1614, "lr": 0.0006653146853146853, "step": 2894, "tokens_trained": 0.275040176 }, { "epoch": 0.8215602836879433, "grad_norm": 1.0564289093017578, "loss": 5.1165, "lr": 0.000665034965034965, "step": 2896, "tokens_trained": 0.27522948 }, { "epoch": 0.8221276595744681, "grad_norm": 1.0135756731033325, "loss": 5.1222, "lr": 0.0006647552447552448, "step": 2898, "tokens_trained": 0.275419392 }, { "epoch": 0.8226950354609929, "grad_norm": 1.0177373886108398, "loss": 5.1085, "lr": 0.0006644755244755245, "step": 2900, "tokens_trained": 0.27561 }, { "epoch": 0.8232624113475178, "grad_norm": 0.9718354344367981, "loss": 5.069, "lr": 0.0006641958041958042, "step": 2902, "tokens_trained": 0.275800288 }, { "epoch": 0.8238297872340425, "grad_norm": 1.011567234992981, "loss": 5.1668, "lr": 0.0006639160839160839, "step": 2904, "tokens_trained": 0.275988672 }, { "epoch": 0.8243971631205673, "grad_norm": 1.0020220279693604, "loss": 5.0616, "lr": 0.0006636363636363638, "step": 2906, "tokens_trained": 0.276180112 }, { "epoch": 0.8249645390070922, "grad_norm": 0.9929330945014954, "loss": 5.0723, "lr": 0.0006633566433566434, "step": 2908, "tokens_trained": 0.276368136 }, { "epoch": 0.825531914893617, "grad_norm": 0.9768717885017395, "loss": 5.0872, "lr": 0.0006630769230769231, "step": 2910, "tokens_trained": 0.276557936 }, { "epoch": 0.8260992907801419, "grad_norm": 1.0068199634552002, "loss": 5.1279, "lr": 0.0006627972027972028, "step": 2912, "tokens_trained": 0.276748584 }, { "epoch": 0.8266666666666667, "grad_norm": 0.953273594379425, "loss": 5.0422, "lr": 0.0006625174825174825, "step": 2914, "tokens_trained": 0.276939168 }, { "epoch": 0.8272340425531914, "grad_norm": 0.9808285236358643, "loss": 5.1278, "lr": 0.0006622377622377623, "step": 2916, "tokens_trained": 0.277128728 }, { "epoch": 0.8278014184397163, "grad_norm": 0.9755997061729431, "loss": 5.0661, "lr": 0.000661958041958042, "step": 2918, "tokens_trained": 0.27731964 }, { "epoch": 0.8283687943262411, "grad_norm": 0.9573803544044495, "loss": 5.0744, "lr": 0.0006616783216783216, "step": 2920, "tokens_trained": 0.277508704 }, { "epoch": 0.828936170212766, "grad_norm": 1.1060761213302612, "loss": 5.1124, "lr": 0.0006613986013986014, "step": 2922, "tokens_trained": 0.277698576 }, { "epoch": 0.8295035460992908, "grad_norm": 1.1377017498016357, "loss": 5.1375, "lr": 0.0006611188811188812, "step": 2924, "tokens_trained": 0.277887456 }, { "epoch": 0.8300709219858156, "grad_norm": 1.0315862894058228, "loss": 5.0839, "lr": 0.0006608391608391609, "step": 2926, "tokens_trained": 0.278076232 }, { "epoch": 0.8306382978723404, "grad_norm": 0.9509685635566711, "loss": 5.0986, "lr": 0.0006605594405594406, "step": 2928, "tokens_trained": 0.278265168 }, { "epoch": 0.8312056737588652, "grad_norm": 0.9749555587768555, "loss": 5.0292, "lr": 0.0006602797202797203, "step": 2930, "tokens_trained": 0.27845612 }, { "epoch": 0.8317730496453901, "grad_norm": 0.9728718400001526, "loss": 5.1113, "lr": 0.00066, "step": 2932, "tokens_trained": 0.278647 }, { "epoch": 0.8323404255319149, "grad_norm": 0.8888244032859802, "loss": 5.0698, "lr": 0.0006597202797202797, "step": 2934, "tokens_trained": 0.278834704 }, { "epoch": 0.8329078014184397, "grad_norm": 0.9745096564292908, "loss": 5.1356, "lr": 0.0006594405594405595, "step": 2936, "tokens_trained": 0.27902504 }, { "epoch": 0.8334751773049646, "grad_norm": 1.023566484451294, "loss": 5.0733, "lr": 0.0006591608391608391, "step": 2938, "tokens_trained": 0.279214024 }, { "epoch": 0.8340425531914893, "grad_norm": 0.9370903968811035, "loss": 5.108, "lr": 0.0006588811188811189, "step": 2940, "tokens_trained": 0.279402336 }, { "epoch": 0.8346099290780142, "grad_norm": 1.037245750427246, "loss": 5.1539, "lr": 0.0006586013986013986, "step": 2942, "tokens_trained": 0.279594456 }, { "epoch": 0.835177304964539, "grad_norm": 1.1117267608642578, "loss": 5.0984, "lr": 0.0006583216783216784, "step": 2944, "tokens_trained": 0.279784736 }, { "epoch": 0.8357446808510638, "grad_norm": 1.0760383605957031, "loss": 5.0798, "lr": 0.0006580419580419581, "step": 2946, "tokens_trained": 0.279974272 }, { "epoch": 0.8363120567375887, "grad_norm": 1.0359710454940796, "loss": 5.1052, "lr": 0.0006577622377622377, "step": 2948, "tokens_trained": 0.280162576 }, { "epoch": 0.8368794326241135, "grad_norm": 1.0630141496658325, "loss": 5.0561, "lr": 0.0006574825174825175, "step": 2950, "tokens_trained": 0.280351752 }, { "epoch": 0.8374468085106384, "grad_norm": 1.0445481538772583, "loss": 5.1009, "lr": 0.0006572027972027972, "step": 2952, "tokens_trained": 0.280541392 }, { "epoch": 0.8380141843971631, "grad_norm": 1.0606142282485962, "loss": 5.0109, "lr": 0.000656923076923077, "step": 2954, "tokens_trained": 0.280732192 }, { "epoch": 0.8385815602836879, "grad_norm": 1.0462067127227783, "loss": 5.1411, "lr": 0.0006566433566433566, "step": 2956, "tokens_trained": 0.280922712 }, { "epoch": 0.8391489361702128, "grad_norm": 0.9841874241828918, "loss": 5.0773, "lr": 0.0006563636363636364, "step": 2958, "tokens_trained": 0.28111024 }, { "epoch": 0.8397163120567376, "grad_norm": 1.1026822328567505, "loss": 5.1128, "lr": 0.0006560839160839161, "step": 2960, "tokens_trained": 0.281302152 }, { "epoch": 0.8402836879432624, "grad_norm": 0.9562904834747314, "loss": 5.0521, "lr": 0.0006558041958041958, "step": 2962, "tokens_trained": 0.281490768 }, { "epoch": 0.8408510638297872, "grad_norm": 1.038006067276001, "loss": 5.0931, "lr": 0.0006555244755244756, "step": 2964, "tokens_trained": 0.281682552 }, { "epoch": 0.841418439716312, "grad_norm": 1.008678913116455, "loss": 5.0728, "lr": 0.0006552447552447552, "step": 2966, "tokens_trained": 0.281871816 }, { "epoch": 0.8419858156028369, "grad_norm": 0.9977920651435852, "loss": 5.086, "lr": 0.000654965034965035, "step": 2968, "tokens_trained": 0.2820618 }, { "epoch": 0.8425531914893617, "grad_norm": 0.9422287344932556, "loss": 5.0844, "lr": 0.0006546853146853147, "step": 2970, "tokens_trained": 0.282253032 }, { "epoch": 0.8431205673758865, "grad_norm": 1.0029969215393066, "loss": 5.0928, "lr": 0.0006544055944055945, "step": 2972, "tokens_trained": 0.282443296 }, { "epoch": 0.8436879432624114, "grad_norm": 1.0643123388290405, "loss": 5.0988, "lr": 0.0006541258741258741, "step": 2974, "tokens_trained": 0.282634024 }, { "epoch": 0.8442553191489361, "grad_norm": 1.0360649824142456, "loss": 5.0634, "lr": 0.0006538461538461538, "step": 2976, "tokens_trained": 0.282825768 }, { "epoch": 0.844822695035461, "grad_norm": 0.9609996676445007, "loss": 5.1155, "lr": 0.0006535664335664336, "step": 2978, "tokens_trained": 0.283016704 }, { "epoch": 0.8453900709219858, "grad_norm": 0.9547716379165649, "loss": 5.0769, "lr": 0.0006532867132867133, "step": 2980, "tokens_trained": 0.283205288 }, { "epoch": 0.8459574468085106, "grad_norm": 1.0286030769348145, "loss": 5.0849, "lr": 0.0006530069930069931, "step": 2982, "tokens_trained": 0.283395192 }, { "epoch": 0.8465248226950355, "grad_norm": 0.9071921706199646, "loss": 5.0308, "lr": 0.0006527272727272727, "step": 2984, "tokens_trained": 0.283587048 }, { "epoch": 0.8470921985815603, "grad_norm": 0.851090133190155, "loss": 5.0601, "lr": 0.0006524475524475524, "step": 2986, "tokens_trained": 0.28377872 }, { "epoch": 0.8476595744680852, "grad_norm": 0.946025550365448, "loss": 5.0863, "lr": 0.0006521678321678322, "step": 2988, "tokens_trained": 0.283968304 }, { "epoch": 0.8482269503546099, "grad_norm": 0.994915783405304, "loss": 5.1034, "lr": 0.0006518881118881119, "step": 2990, "tokens_trained": 0.284158704 }, { "epoch": 0.8487943262411347, "grad_norm": 0.9354639053344727, "loss": 5.0749, "lr": 0.0006516083916083916, "step": 2992, "tokens_trained": 0.284350032 }, { "epoch": 0.8493617021276596, "grad_norm": 0.9014646410942078, "loss": 5.0753, "lr": 0.0006513286713286713, "step": 2994, "tokens_trained": 0.284541136 }, { "epoch": 0.8499290780141844, "grad_norm": 0.9647039771080017, "loss": 5.1391, "lr": 0.0006510489510489511, "step": 2996, "tokens_trained": 0.28473112 }, { "epoch": 0.8504964539007093, "grad_norm": 0.9687992930412292, "loss": 5.0058, "lr": 0.0006507692307692308, "step": 2998, "tokens_trained": 0.284922608 }, { "epoch": 0.851063829787234, "grad_norm": 0.9827167987823486, "loss": 5.0597, "lr": 0.0006504895104895106, "step": 3000, "tokens_trained": 0.285112344 }, { "epoch": 0.851063829787234, "eval_loss": 5.092260837554932, "eval_runtime": 20.8128, "step": 3000, "tokens_trained": 0.285112344 }, { "epoch": 0.8516312056737588, "grad_norm": 1.1164077520370483, "loss": 4.9872, "lr": 0.0006502097902097902, "step": 3002, "tokens_trained": 0.285299144 }, { "epoch": 0.8521985815602837, "grad_norm": 1.0835845470428467, "loss": 4.999, "lr": 0.0006499300699300699, "step": 3004, "tokens_trained": 0.28548968 }, { "epoch": 0.8527659574468085, "grad_norm": 1.135926365852356, "loss": 5.1038, "lr": 0.0006496503496503497, "step": 3006, "tokens_trained": 0.28568256 }, { "epoch": 0.8533333333333334, "grad_norm": 1.0743507146835327, "loss": 5.0964, "lr": 0.0006493706293706294, "step": 3008, "tokens_trained": 0.28587348 }, { "epoch": 0.8539007092198582, "grad_norm": 0.9776538014411926, "loss": 5.0208, "lr": 0.0006490909090909091, "step": 3010, "tokens_trained": 0.286061968 }, { "epoch": 0.854468085106383, "grad_norm": 0.9797994494438171, "loss": 5.0238, "lr": 0.0006488111888111888, "step": 3012, "tokens_trained": 0.28625252 }, { "epoch": 0.8550354609929078, "grad_norm": 0.8697059154510498, "loss": 5.0017, "lr": 0.0006485314685314685, "step": 3014, "tokens_trained": 0.286443872 }, { "epoch": 0.8556028368794326, "grad_norm": 0.9378446340560913, "loss": 5.0856, "lr": 0.0006482517482517483, "step": 3016, "tokens_trained": 0.286633232 }, { "epoch": 0.8561702127659574, "grad_norm": 0.9418164491653442, "loss": 5.0637, "lr": 0.000647972027972028, "step": 3018, "tokens_trained": 0.286824032 }, { "epoch": 0.8567375886524823, "grad_norm": 0.9479710459709167, "loss": 5.0941, "lr": 0.0006476923076923077, "step": 3020, "tokens_trained": 0.2870158 }, { "epoch": 0.8573049645390071, "grad_norm": 0.9716140627861023, "loss": 5.1218, "lr": 0.0006474125874125874, "step": 3022, "tokens_trained": 0.287206184 }, { "epoch": 0.857872340425532, "grad_norm": 0.9651079177856445, "loss": 5.0137, "lr": 0.0006471328671328672, "step": 3024, "tokens_trained": 0.287395568 }, { "epoch": 0.8584397163120567, "grad_norm": 1.0485713481903076, "loss": 5.0713, "lr": 0.0006468531468531469, "step": 3026, "tokens_trained": 0.28758644 }, { "epoch": 0.8590070921985815, "grad_norm": 1.0849828720092773, "loss": 5.0241, "lr": 0.0006465734265734265, "step": 3028, "tokens_trained": 0.287773088 }, { "epoch": 0.8595744680851064, "grad_norm": 1.0668689012527466, "loss": 5.0694, "lr": 0.0006462937062937063, "step": 3030, "tokens_trained": 0.287963544 }, { "epoch": 0.8601418439716312, "grad_norm": 0.9943816065788269, "loss": 5.0807, "lr": 0.0006460139860139859, "step": 3032, "tokens_trained": 0.288152376 }, { "epoch": 0.8607092198581561, "grad_norm": 1.104642629623413, "loss": 5.1047, "lr": 0.0006457342657342658, "step": 3034, "tokens_trained": 0.288343064 }, { "epoch": 0.8612765957446809, "grad_norm": 1.0915707349777222, "loss": 5.1332, "lr": 0.0006454545454545455, "step": 3036, "tokens_trained": 0.28853308 }, { "epoch": 0.8618439716312056, "grad_norm": 0.9935365319252014, "loss": 5.0799, "lr": 0.0006451748251748252, "step": 3038, "tokens_trained": 0.288726184 }, { "epoch": 0.8624113475177305, "grad_norm": 0.9564308524131775, "loss": 5.0549, "lr": 0.0006448951048951049, "step": 3040, "tokens_trained": 0.288916368 }, { "epoch": 0.8629787234042553, "grad_norm": 1.0183926820755005, "loss": 5.1508, "lr": 0.0006446153846153846, "step": 3042, "tokens_trained": 0.28910616 }, { "epoch": 0.8635460992907802, "grad_norm": 0.8167940974235535, "loss": 5.1238, "lr": 0.0006443356643356644, "step": 3044, "tokens_trained": 0.289295008 }, { "epoch": 0.864113475177305, "grad_norm": 0.981560468673706, "loss": 5.0692, "lr": 0.000644055944055944, "step": 3046, "tokens_trained": 0.289483192 }, { "epoch": 0.8646808510638297, "grad_norm": 0.9596647024154663, "loss": 5.0557, "lr": 0.0006437762237762238, "step": 3048, "tokens_trained": 0.289672528 }, { "epoch": 0.8652482269503546, "grad_norm": 0.9322229027748108, "loss": 5.0769, "lr": 0.0006434965034965034, "step": 3050, "tokens_trained": 0.28986108 }, { "epoch": 0.8658156028368794, "grad_norm": 0.94253009557724, "loss": 5.0556, "lr": 0.0006432167832167833, "step": 3052, "tokens_trained": 0.29005032 }, { "epoch": 0.8663829787234043, "grad_norm": 0.9793356657028198, "loss": 5.0821, "lr": 0.000642937062937063, "step": 3054, "tokens_trained": 0.290238496 }, { "epoch": 0.8669503546099291, "grad_norm": 1.0270706415176392, "loss": 5.1137, "lr": 0.0006426573426573426, "step": 3056, "tokens_trained": 0.290428552 }, { "epoch": 0.8675177304964539, "grad_norm": 1.0015908479690552, "loss": 5.0426, "lr": 0.0006423776223776224, "step": 3058, "tokens_trained": 0.290617592 }, { "epoch": 0.8680851063829788, "grad_norm": 1.1663475036621094, "loss": 5.0152, "lr": 0.0006420979020979021, "step": 3060, "tokens_trained": 0.290806784 }, { "epoch": 0.8686524822695035, "grad_norm": 1.1144863367080688, "loss": 5.1324, "lr": 0.0006418181818181819, "step": 3062, "tokens_trained": 0.290997672 }, { "epoch": 0.8692198581560284, "grad_norm": 1.086949110031128, "loss": 5.0896, "lr": 0.0006415384615384615, "step": 3064, "tokens_trained": 0.291187224 }, { "epoch": 0.8697872340425532, "grad_norm": 1.0380237102508545, "loss": 5.0964, "lr": 0.0006412587412587413, "step": 3066, "tokens_trained": 0.291378608 }, { "epoch": 0.870354609929078, "grad_norm": 0.9731833338737488, "loss": 5.0113, "lr": 0.0006409790209790209, "step": 3068, "tokens_trained": 0.291568064 }, { "epoch": 0.8709219858156029, "grad_norm": 0.9414166212081909, "loss": 5.0396, "lr": 0.0006406993006993007, "step": 3070, "tokens_trained": 0.291757936 }, { "epoch": 0.8714893617021277, "grad_norm": 1.0375349521636963, "loss": 5.1187, "lr": 0.0006404195804195805, "step": 3072, "tokens_trained": 0.291948704 }, { "epoch": 0.8720567375886524, "grad_norm": 0.9412112236022949, "loss": 5.0955, "lr": 0.0006401398601398601, "step": 3074, "tokens_trained": 0.292141128 }, { "epoch": 0.8726241134751773, "grad_norm": 0.9645117521286011, "loss": 5.1278, "lr": 0.0006398601398601399, "step": 3076, "tokens_trained": 0.292331704 }, { "epoch": 0.8731914893617021, "grad_norm": 0.9918674230575562, "loss": 5.0726, "lr": 0.0006395804195804196, "step": 3078, "tokens_trained": 0.292519984 }, { "epoch": 0.873758865248227, "grad_norm": 0.8824833035469055, "loss": 5.1334, "lr": 0.0006393006993006994, "step": 3080, "tokens_trained": 0.292712064 }, { "epoch": 0.8743262411347518, "grad_norm": 1.0651589632034302, "loss": 5.0911, "lr": 0.000639020979020979, "step": 3082, "tokens_trained": 0.292901816 }, { "epoch": 0.8748936170212765, "grad_norm": 1.0067808628082275, "loss": 5.1345, "lr": 0.0006387412587412587, "step": 3084, "tokens_trained": 0.293094064 }, { "epoch": 0.8754609929078014, "grad_norm": 0.8916751146316528, "loss": 5.1117, "lr": 0.0006384615384615384, "step": 3086, "tokens_trained": 0.293284272 }, { "epoch": 0.8760283687943262, "grad_norm": 1.0009779930114746, "loss": 5.1143, "lr": 0.0006381818181818182, "step": 3088, "tokens_trained": 0.293474352 }, { "epoch": 0.8765957446808511, "grad_norm": 1.0289413928985596, "loss": 5.0551, "lr": 0.000637902097902098, "step": 3090, "tokens_trained": 0.293661976 }, { "epoch": 0.8771631205673759, "grad_norm": 0.9375638961791992, "loss": 5.0666, "lr": 0.0006376223776223776, "step": 3092, "tokens_trained": 0.293851968 }, { "epoch": 0.8777304964539007, "grad_norm": 0.9490086436271667, "loss": 5.0901, "lr": 0.0006373426573426574, "step": 3094, "tokens_trained": 0.294041608 }, { "epoch": 0.8782978723404256, "grad_norm": 0.932090163230896, "loss": 5.0783, "lr": 0.0006370629370629371, "step": 3096, "tokens_trained": 0.29423028 }, { "epoch": 0.8788652482269503, "grad_norm": 0.9120060205459595, "loss": 5.1065, "lr": 0.0006367832167832168, "step": 3098, "tokens_trained": 0.294421528 }, { "epoch": 0.8794326241134752, "grad_norm": 0.8693923950195312, "loss": 5.0689, "lr": 0.0006365034965034965, "step": 3100, "tokens_trained": 0.294609832 }, { "epoch": 0.88, "grad_norm": 0.834987223148346, "loss": 5.0542, "lr": 0.0006362237762237762, "step": 3102, "tokens_trained": 0.294799424 }, { "epoch": 0.8805673758865248, "grad_norm": 0.9196602702140808, "loss": 5.0212, "lr": 0.0006359440559440559, "step": 3104, "tokens_trained": 0.294990504 }, { "epoch": 0.8811347517730497, "grad_norm": 1.0392085313796997, "loss": 5.0734, "lr": 0.0006356643356643357, "step": 3106, "tokens_trained": 0.295181112 }, { "epoch": 0.8817021276595745, "grad_norm": 1.0879757404327393, "loss": 5.0834, "lr": 0.0006353846153846155, "step": 3108, "tokens_trained": 0.295371224 }, { "epoch": 0.8822695035460993, "grad_norm": 1.0321052074432373, "loss": 5.1132, "lr": 0.0006351048951048951, "step": 3110, "tokens_trained": 0.295563288 }, { "epoch": 0.8828368794326241, "grad_norm": 0.9930777549743652, "loss": 5.0855, "lr": 0.0006348251748251748, "step": 3112, "tokens_trained": 0.295753864 }, { "epoch": 0.8834042553191489, "grad_norm": 1.007925033569336, "loss": 5.0728, "lr": 0.0006345454545454546, "step": 3114, "tokens_trained": 0.29594536 }, { "epoch": 0.8839716312056738, "grad_norm": 1.0430697202682495, "loss": 5.161, "lr": 0.0006342657342657343, "step": 3116, "tokens_trained": 0.296135144 }, { "epoch": 0.8845390070921986, "grad_norm": 0.9607092142105103, "loss": 5.0514, "lr": 0.000633986013986014, "step": 3118, "tokens_trained": 0.296325736 }, { "epoch": 0.8851063829787233, "grad_norm": 1.0054502487182617, "loss": 5.03, "lr": 0.0006337062937062937, "step": 3120, "tokens_trained": 0.296514408 }, { "epoch": 0.8856737588652482, "grad_norm": 1.0535473823547363, "loss": 5.1082, "lr": 0.0006334265734265733, "step": 3122, "tokens_trained": 0.296702248 }, { "epoch": 0.886241134751773, "grad_norm": 0.9889388680458069, "loss": 5.0583, "lr": 0.0006331468531468532, "step": 3124, "tokens_trained": 0.296891656 }, { "epoch": 0.8865248226950354, "eval_loss": 5.07567024230957, "eval_runtime": 20.7649, "step": 3125, "tokens_trained": 0.296985944 }, { "epoch": 0.8868085106382979, "grad_norm": 1.008825421333313, "loss": 5.0698, "lr": 0.0006328671328671329, "step": 3126, "tokens_trained": 0.297081752 }, { "epoch": 0.8873758865248227, "grad_norm": 0.9656203985214233, "loss": 5.0915, "lr": 0.0006325874125874126, "step": 3128, "tokens_trained": 0.297269568 }, { "epoch": 0.8879432624113475, "grad_norm": 0.9101914167404175, "loss": 5.0821, "lr": 0.0006323076923076923, "step": 3130, "tokens_trained": 0.297457936 }, { "epoch": 0.8885106382978724, "grad_norm": 1.03163480758667, "loss": 5.07, "lr": 0.0006320279720279721, "step": 3132, "tokens_trained": 0.297646944 }, { "epoch": 0.8890780141843971, "grad_norm": 1.0470985174179077, "loss": 5.0165, "lr": 0.0006317482517482518, "step": 3134, "tokens_trained": 0.29783736 }, { "epoch": 0.889645390070922, "grad_norm": 1.0149681568145752, "loss": 5.0809, "lr": 0.0006314685314685314, "step": 3136, "tokens_trained": 0.298027048 }, { "epoch": 0.8902127659574468, "grad_norm": 1.017217993736267, "loss": 5.0323, "lr": 0.0006311888111888112, "step": 3138, "tokens_trained": 0.298218432 }, { "epoch": 0.8907801418439716, "grad_norm": 1.0002187490463257, "loss": 5.0818, "lr": 0.0006309090909090908, "step": 3140, "tokens_trained": 0.29840768 }, { "epoch": 0.8913475177304965, "grad_norm": 0.9259346723556519, "loss": 5.0863, "lr": 0.0006306293706293707, "step": 3142, "tokens_trained": 0.298599384 }, { "epoch": 0.8919148936170213, "grad_norm": 0.9437862634658813, "loss": 5.1282, "lr": 0.0006303496503496504, "step": 3144, "tokens_trained": 0.298789024 }, { "epoch": 0.8924822695035461, "grad_norm": 0.9849722981452942, "loss": 5.0658, "lr": 0.0006300699300699301, "step": 3146, "tokens_trained": 0.298979648 }, { "epoch": 0.8930496453900709, "grad_norm": 1.1129319667816162, "loss": 5.0663, "lr": 0.0006297902097902098, "step": 3148, "tokens_trained": 0.299170416 }, { "epoch": 0.8936170212765957, "grad_norm": 1.101006031036377, "loss": 5.0394, "lr": 0.0006295104895104896, "step": 3150, "tokens_trained": 0.299361408 }, { "epoch": 0.8941843971631206, "grad_norm": 1.0711042881011963, "loss": 5.0696, "lr": 0.0006292307692307693, "step": 3152, "tokens_trained": 0.29955124 }, { "epoch": 0.8947517730496454, "grad_norm": 1.0250879526138306, "loss": 5.0645, "lr": 0.0006289510489510489, "step": 3154, "tokens_trained": 0.299742168 }, { "epoch": 0.8953191489361703, "grad_norm": 1.0772818326950073, "loss": 5.06, "lr": 0.0006286713286713287, "step": 3156, "tokens_trained": 0.299931536 }, { "epoch": 0.895886524822695, "grad_norm": 1.1049630641937256, "loss": 5.0823, "lr": 0.0006283916083916083, "step": 3158, "tokens_trained": 0.300121944 }, { "epoch": 0.8964539007092198, "grad_norm": 1.0998307466506958, "loss": 5.0334, "lr": 0.0006281118881118882, "step": 3160, "tokens_trained": 0.300311336 }, { "epoch": 0.8970212765957447, "grad_norm": 1.0796667337417603, "loss": 5.1029, "lr": 0.0006278321678321679, "step": 3162, "tokens_trained": 0.300499712 }, { "epoch": 0.8975886524822695, "grad_norm": 1.054004192352295, "loss": 5.0425, "lr": 0.0006275524475524475, "step": 3164, "tokens_trained": 0.300689128 }, { "epoch": 0.8981560283687944, "grad_norm": 0.9226939082145691, "loss": 5.0712, "lr": 0.0006272727272727273, "step": 3166, "tokens_trained": 0.300878016 }, { "epoch": 0.8987234042553192, "grad_norm": 0.8905312418937683, "loss": 5.0948, "lr": 0.000626993006993007, "step": 3168, "tokens_trained": 0.301067672 }, { "epoch": 0.8992907801418439, "grad_norm": 0.92462557554245, "loss": 5.0488, "lr": 0.0006267132867132868, "step": 3170, "tokens_trained": 0.30125668 }, { "epoch": 0.8998581560283688, "grad_norm": 0.911163866519928, "loss": 5.0655, "lr": 0.0006264335664335664, "step": 3172, "tokens_trained": 0.301447736 }, { "epoch": 0.9004255319148936, "grad_norm": 1.0041508674621582, "loss": 5.1074, "lr": 0.0006261538461538462, "step": 3174, "tokens_trained": 0.301636976 }, { "epoch": 0.9009929078014184, "grad_norm": 1.1221826076507568, "loss": 5.1076, "lr": 0.0006258741258741258, "step": 3176, "tokens_trained": 0.301831152 }, { "epoch": 0.9015602836879433, "grad_norm": 1.0674721002578735, "loss": 5.1029, "lr": 0.0006255944055944057, "step": 3178, "tokens_trained": 0.302021192 }, { "epoch": 0.902127659574468, "grad_norm": 0.9207814335823059, "loss": 5.1237, "lr": 0.0006253146853146854, "step": 3180, "tokens_trained": 0.302214408 }, { "epoch": 0.902695035460993, "grad_norm": 0.9445079565048218, "loss": 5.0714, "lr": 0.000625034965034965, "step": 3182, "tokens_trained": 0.302406056 }, { "epoch": 0.9032624113475177, "grad_norm": 0.930630624294281, "loss": 5.0326, "lr": 0.0006247552447552448, "step": 3184, "tokens_trained": 0.302596376 }, { "epoch": 0.9038297872340425, "grad_norm": 0.9014614224433899, "loss": 5.0768, "lr": 0.0006244755244755245, "step": 3186, "tokens_trained": 0.302787288 }, { "epoch": 0.9043971631205674, "grad_norm": 0.9306453466415405, "loss": 5.139, "lr": 0.0006241958041958043, "step": 3188, "tokens_trained": 0.302976344 }, { "epoch": 0.9049645390070922, "grad_norm": 0.9506230354309082, "loss": 5.0932, "lr": 0.0006239160839160839, "step": 3190, "tokens_trained": 0.303166928 }, { "epoch": 0.9055319148936171, "grad_norm": 0.8852939605712891, "loss": 5.0761, "lr": 0.0006236363636363636, "step": 3192, "tokens_trained": 0.303357176 }, { "epoch": 0.9060992907801418, "grad_norm": 0.9017012119293213, "loss": 4.9965, "lr": 0.0006233566433566433, "step": 3194, "tokens_trained": 0.303547344 }, { "epoch": 0.9066666666666666, "grad_norm": 0.8619939684867859, "loss": 5.0892, "lr": 0.0006230769230769231, "step": 3196, "tokens_trained": 0.303737392 }, { "epoch": 0.9072340425531915, "grad_norm": 0.8667910695075989, "loss": 5.1222, "lr": 0.0006227972027972028, "step": 3198, "tokens_trained": 0.303926592 }, { "epoch": 0.9078014184397163, "grad_norm": 0.9172303676605225, "loss": 5.0891, "lr": 0.0006225174825174825, "step": 3200, "tokens_trained": 0.304117744 }, { "epoch": 0.9083687943262412, "grad_norm": 0.9247593879699707, "loss": 5.0528, "lr": 0.0006222377622377623, "step": 3202, "tokens_trained": 0.304304792 }, { "epoch": 0.908936170212766, "grad_norm": 0.9245242476463318, "loss": 5.027, "lr": 0.000621958041958042, "step": 3204, "tokens_trained": 0.304496016 }, { "epoch": 0.9095035460992907, "grad_norm": 0.8890556693077087, "loss": 5.0348, "lr": 0.0006216783216783217, "step": 3206, "tokens_trained": 0.304685896 }, { "epoch": 0.9100709219858156, "grad_norm": 0.9343590140342712, "loss": 5.103, "lr": 0.0006213986013986014, "step": 3208, "tokens_trained": 0.304876864 }, { "epoch": 0.9106382978723404, "grad_norm": 0.9546332955360413, "loss": 5.0456, "lr": 0.0006211188811188811, "step": 3210, "tokens_trained": 0.305067744 }, { "epoch": 0.9112056737588653, "grad_norm": 0.9404990077018738, "loss": 5.0357, "lr": 0.0006208391608391608, "step": 3212, "tokens_trained": 0.305256552 }, { "epoch": 0.9117730496453901, "grad_norm": 0.8743602633476257, "loss": 5.0564, "lr": 0.0006205594405594406, "step": 3214, "tokens_trained": 0.305446568 }, { "epoch": 0.9123404255319149, "grad_norm": 0.9437069892883301, "loss": 5.0703, "lr": 0.0006202797202797203, "step": 3216, "tokens_trained": 0.305636344 }, { "epoch": 0.9129078014184397, "grad_norm": 0.970951497554779, "loss": 5.0722, "lr": 0.00062, "step": 3218, "tokens_trained": 0.305825936 }, { "epoch": 0.9134751773049645, "grad_norm": 0.9047942757606506, "loss": 5.113, "lr": 0.0006197202797202797, "step": 3220, "tokens_trained": 0.306016936 }, { "epoch": 0.9140425531914894, "grad_norm": 0.9751421213150024, "loss": 5.0465, "lr": 0.0006194405594405595, "step": 3222, "tokens_trained": 0.306207216 }, { "epoch": 0.9146099290780142, "grad_norm": 0.9317526817321777, "loss": 5.0601, "lr": 0.0006191608391608392, "step": 3224, "tokens_trained": 0.306396832 }, { "epoch": 0.915177304964539, "grad_norm": 0.9828630685806274, "loss": 5.0857, "lr": 0.0006188811188811189, "step": 3226, "tokens_trained": 0.30658724 }, { "epoch": 0.9157446808510639, "grad_norm": 0.9108901619911194, "loss": 5.0525, "lr": 0.0006186013986013986, "step": 3228, "tokens_trained": 0.30677856 }, { "epoch": 0.9163120567375886, "grad_norm": 0.8517162203788757, "loss": 5.1157, "lr": 0.0006183216783216783, "step": 3230, "tokens_trained": 0.3069698 }, { "epoch": 0.9168794326241134, "grad_norm": 0.9589570760726929, "loss": 5.0823, "lr": 0.0006180419580419581, "step": 3232, "tokens_trained": 0.307160952 }, { "epoch": 0.9174468085106383, "grad_norm": 1.0031661987304688, "loss": 5.0808, "lr": 0.0006177622377622377, "step": 3234, "tokens_trained": 0.307352776 }, { "epoch": 0.9180141843971631, "grad_norm": 0.9295787215232849, "loss": 5.0699, "lr": 0.0006174825174825175, "step": 3236, "tokens_trained": 0.3075432 }, { "epoch": 0.918581560283688, "grad_norm": 0.9967226982116699, "loss": 5.0036, "lr": 0.0006172027972027972, "step": 3238, "tokens_trained": 0.307735016 }, { "epoch": 0.9191489361702128, "grad_norm": 1.0219292640686035, "loss": 5.1142, "lr": 0.000616923076923077, "step": 3240, "tokens_trained": 0.307926624 }, { "epoch": 0.9197163120567375, "grad_norm": 1.0547230243682861, "loss": 5.0429, "lr": 0.0006166433566433567, "step": 3242, "tokens_trained": 0.30811696 }, { "epoch": 0.9202836879432624, "grad_norm": 1.0130624771118164, "loss": 5.1345, "lr": 0.0006163636363636364, "step": 3244, "tokens_trained": 0.30830848 }, { "epoch": 0.9208510638297872, "grad_norm": 0.8802092671394348, "loss": 5.0404, "lr": 0.0006160839160839161, "step": 3246, "tokens_trained": 0.308497688 }, { "epoch": 0.9214184397163121, "grad_norm": 0.970391571521759, "loss": 5.0875, "lr": 0.0006158041958041957, "step": 3248, "tokens_trained": 0.308686352 }, { "epoch": 0.9219858156028369, "grad_norm": 0.9314327239990234, "loss": 5.0519, "lr": 0.0006155244755244756, "step": 3250, "tokens_trained": 0.308875888 }, { "epoch": 0.9219858156028369, "eval_loss": 5.063432216644287, "eval_runtime": 20.6963, "step": 3250, "tokens_trained": 0.308875888 }, { "epoch": 0.9225531914893617, "grad_norm": 0.875278890132904, "loss": 4.9958, "lr": 0.0006152447552447552, "step": 3252, "tokens_trained": 0.309068888 }, { "epoch": 0.9231205673758865, "grad_norm": 0.9115424156188965, "loss": 4.9971, "lr": 0.000614965034965035, "step": 3254, "tokens_trained": 0.309260656 }, { "epoch": 0.9236879432624113, "grad_norm": 0.9202569723129272, "loss": 5.0103, "lr": 0.0006146853146853147, "step": 3256, "tokens_trained": 0.309452672 }, { "epoch": 0.9242553191489362, "grad_norm": 0.9471083879470825, "loss": 5.0429, "lr": 0.0006144055944055945, "step": 3258, "tokens_trained": 0.30964252 }, { "epoch": 0.924822695035461, "grad_norm": 0.9518803954124451, "loss": 5.0143, "lr": 0.0006141258741258742, "step": 3260, "tokens_trained": 0.309831288 }, { "epoch": 0.9253900709219858, "grad_norm": 0.9274792671203613, "loss": 5.0121, "lr": 0.0006138461538461538, "step": 3262, "tokens_trained": 0.310021056 }, { "epoch": 0.9259574468085107, "grad_norm": 0.9414265751838684, "loss": 5.1362, "lr": 0.0006135664335664336, "step": 3264, "tokens_trained": 0.310210328 }, { "epoch": 0.9265248226950354, "grad_norm": 0.968233048915863, "loss": 4.9792, "lr": 0.0006132867132867132, "step": 3266, "tokens_trained": 0.310399616 }, { "epoch": 0.9270921985815603, "grad_norm": 0.9223787784576416, "loss": 5.0543, "lr": 0.0006130069930069931, "step": 3268, "tokens_trained": 0.310588952 }, { "epoch": 0.9276595744680851, "grad_norm": 0.9317581653594971, "loss": 5.0053, "lr": 0.0006127272727272727, "step": 3270, "tokens_trained": 0.310779576 }, { "epoch": 0.9282269503546099, "grad_norm": 0.8910759687423706, "loss": 5.1044, "lr": 0.0006124475524475525, "step": 3272, "tokens_trained": 0.310970096 }, { "epoch": 0.9287943262411348, "grad_norm": 0.8903452157974243, "loss": 5.093, "lr": 0.0006121678321678322, "step": 3274, "tokens_trained": 0.311158808 }, { "epoch": 0.9293617021276596, "grad_norm": 0.9635697603225708, "loss": 5.0149, "lr": 0.0006118881118881118, "step": 3276, "tokens_trained": 0.311348672 }, { "epoch": 0.9299290780141845, "grad_norm": 1.0122349262237549, "loss": 5.0353, "lr": 0.0006116083916083917, "step": 3278, "tokens_trained": 0.31153696 }, { "epoch": 0.9304964539007092, "grad_norm": 0.9734505414962769, "loss": 5.0531, "lr": 0.0006113286713286713, "step": 3280, "tokens_trained": 0.311728288 }, { "epoch": 0.931063829787234, "grad_norm": 0.9433160424232483, "loss": 5.0234, "lr": 0.0006110489510489511, "step": 3282, "tokens_trained": 0.311917352 }, { "epoch": 0.9316312056737589, "grad_norm": 0.9984011054039001, "loss": 5.0355, "lr": 0.0006107692307692307, "step": 3284, "tokens_trained": 0.312108504 }, { "epoch": 0.9321985815602837, "grad_norm": 1.0186588764190674, "loss": 4.9903, "lr": 0.0006104895104895106, "step": 3286, "tokens_trained": 0.312300216 }, { "epoch": 0.9327659574468085, "grad_norm": 0.984987735748291, "loss": 5.0188, "lr": 0.0006102097902097902, "step": 3288, "tokens_trained": 0.312490928 }, { "epoch": 0.9333333333333333, "grad_norm": 0.9382873773574829, "loss": 5.0223, "lr": 0.0006099300699300699, "step": 3290, "tokens_trained": 0.312681928 }, { "epoch": 0.9339007092198581, "grad_norm": 0.8770633339881897, "loss": 5.05, "lr": 0.0006096503496503497, "step": 3292, "tokens_trained": 0.312870072 }, { "epoch": 0.934468085106383, "grad_norm": 0.9703201055526733, "loss": 5.0905, "lr": 0.0006093706293706293, "step": 3294, "tokens_trained": 0.313060608 }, { "epoch": 0.9350354609929078, "grad_norm": 0.9052334427833557, "loss": 5.0416, "lr": 0.0006090909090909092, "step": 3296, "tokens_trained": 0.313251584 }, { "epoch": 0.9356028368794326, "grad_norm": 0.949390709400177, "loss": 4.9757, "lr": 0.0006088111888111888, "step": 3298, "tokens_trained": 0.313440784 }, { "epoch": 0.9361702127659575, "grad_norm": 0.9845399260520935, "loss": 5.0403, "lr": 0.0006085314685314686, "step": 3300, "tokens_trained": 0.313631088 }, { "epoch": 0.9367375886524822, "grad_norm": 0.921394407749176, "loss": 5.0464, "lr": 0.0006082517482517482, "step": 3302, "tokens_trained": 0.313821704 }, { "epoch": 0.9373049645390071, "grad_norm": 0.9639559984207153, "loss": 5.0658, "lr": 0.000607972027972028, "step": 3304, "tokens_trained": 0.314011048 }, { "epoch": 0.9378723404255319, "grad_norm": 0.9921663403511047, "loss": 5.0469, "lr": 0.0006076923076923077, "step": 3306, "tokens_trained": 0.314199264 }, { "epoch": 0.9384397163120567, "grad_norm": 0.9891427159309387, "loss": 5.0784, "lr": 0.0006074125874125874, "step": 3308, "tokens_trained": 0.314388688 }, { "epoch": 0.9390070921985816, "grad_norm": 0.966525673866272, "loss": 5.0759, "lr": 0.0006071328671328672, "step": 3310, "tokens_trained": 0.31457712 }, { "epoch": 0.9395744680851064, "grad_norm": 0.9262145757675171, "loss": 4.9822, "lr": 0.0006068531468531468, "step": 3312, "tokens_trained": 0.314768096 }, { "epoch": 0.9401418439716313, "grad_norm": 0.9138565063476562, "loss": 5.059, "lr": 0.0006065734265734267, "step": 3314, "tokens_trained": 0.314959 }, { "epoch": 0.940709219858156, "grad_norm": 0.9083120226860046, "loss": 5.0523, "lr": 0.0006062937062937063, "step": 3316, "tokens_trained": 0.315148288 }, { "epoch": 0.9412765957446808, "grad_norm": 0.9483383893966675, "loss": 5.0821, "lr": 0.000606013986013986, "step": 3318, "tokens_trained": 0.31533864 }, { "epoch": 0.9418439716312057, "grad_norm": 0.8716344833374023, "loss": 5.1046, "lr": 0.0006057342657342657, "step": 3320, "tokens_trained": 0.31552972 }, { "epoch": 0.9424113475177305, "grad_norm": 0.9287091493606567, "loss": 5.0531, "lr": 0.0006054545454545455, "step": 3322, "tokens_trained": 0.315720136 }, { "epoch": 0.9429787234042554, "grad_norm": 0.9560433030128479, "loss": 5.087, "lr": 0.0006051748251748252, "step": 3324, "tokens_trained": 0.315911024 }, { "epoch": 0.9435460992907801, "grad_norm": 0.8612940311431885, "loss": 5.1338, "lr": 0.0006048951048951049, "step": 3326, "tokens_trained": 0.316102368 }, { "epoch": 0.9441134751773049, "grad_norm": 1.0215116739273071, "loss": 5.034, "lr": 0.0006046153846153846, "step": 3328, "tokens_trained": 0.316292296 }, { "epoch": 0.9446808510638298, "grad_norm": 1.0019500255584717, "loss": 5.101, "lr": 0.0006043356643356643, "step": 3330, "tokens_trained": 0.31648088 }, { "epoch": 0.9452482269503546, "grad_norm": 0.9435174465179443, "loss": 5.0476, "lr": 0.0006040559440559441, "step": 3332, "tokens_trained": 0.316672936 }, { "epoch": 0.9458156028368795, "grad_norm": 0.9211596846580505, "loss": 5.039, "lr": 0.0006037762237762238, "step": 3334, "tokens_trained": 0.31686408 }, { "epoch": 0.9463829787234043, "grad_norm": 0.9332453608512878, "loss": 5.0857, "lr": 0.0006034965034965035, "step": 3336, "tokens_trained": 0.317053896 }, { "epoch": 0.946950354609929, "grad_norm": 0.8761624097824097, "loss": 5.0614, "lr": 0.0006032167832167832, "step": 3338, "tokens_trained": 0.317245016 }, { "epoch": 0.9475177304964539, "grad_norm": 0.9113523364067078, "loss": 5.0609, "lr": 0.000602937062937063, "step": 3340, "tokens_trained": 0.317433592 }, { "epoch": 0.9480851063829787, "grad_norm": 1.0509337186813354, "loss": 4.9984, "lr": 0.0006026573426573426, "step": 3342, "tokens_trained": 0.317623392 }, { "epoch": 0.9486524822695035, "grad_norm": 0.9496453404426575, "loss": 5.0632, "lr": 0.0006023776223776224, "step": 3344, "tokens_trained": 0.317814848 }, { "epoch": 0.9492198581560284, "grad_norm": 0.913977861404419, "loss": 5.0816, "lr": 0.0006020979020979021, "step": 3346, "tokens_trained": 0.318003232 }, { "epoch": 0.9497872340425532, "grad_norm": 0.9476690292358398, "loss": 5.1321, "lr": 0.0006018181818181818, "step": 3348, "tokens_trained": 0.31819216 }, { "epoch": 0.950354609929078, "grad_norm": 1.0221197605133057, "loss": 5.0602, "lr": 0.0006015384615384616, "step": 3350, "tokens_trained": 0.318379648 }, { "epoch": 0.9509219858156028, "grad_norm": 0.9944773316383362, "loss": 5.0595, "lr": 0.0006012587412587413, "step": 3352, "tokens_trained": 0.3185692 }, { "epoch": 0.9514893617021276, "grad_norm": 0.9641481041908264, "loss": 5.0842, "lr": 0.000600979020979021, "step": 3354, "tokens_trained": 0.318758464 }, { "epoch": 0.9520567375886525, "grad_norm": 0.8794710636138916, "loss": 5.0385, "lr": 0.0006006993006993006, "step": 3356, "tokens_trained": 0.318948528 }, { "epoch": 0.9526241134751773, "grad_norm": 0.9986928701400757, "loss": 5.0325, "lr": 0.0006004195804195805, "step": 3358, "tokens_trained": 0.319137168 }, { "epoch": 0.9531914893617022, "grad_norm": 0.9385401606559753, "loss": 4.9957, "lr": 0.0006001398601398601, "step": 3360, "tokens_trained": 0.319327992 }, { "epoch": 0.953758865248227, "grad_norm": 0.9591023921966553, "loss": 5.0883, "lr": 0.0005998601398601399, "step": 3362, "tokens_trained": 0.319518928 }, { "epoch": 0.9543262411347517, "grad_norm": 0.9454349279403687, "loss": 5.0639, "lr": 0.0005995804195804196, "step": 3364, "tokens_trained": 0.319711176 }, { "epoch": 0.9548936170212766, "grad_norm": 0.9882696270942688, "loss": 5.0326, "lr": 0.0005993006993006993, "step": 3366, "tokens_trained": 0.319901272 }, { "epoch": 0.9554609929078014, "grad_norm": 0.9254516959190369, "loss": 5.0454, "lr": 0.0005990209790209791, "step": 3368, "tokens_trained": 0.320091928 }, { "epoch": 0.9560283687943263, "grad_norm": 0.9193766117095947, "loss": 4.9996, "lr": 0.0005987412587412587, "step": 3370, "tokens_trained": 0.320282712 }, { "epoch": 0.9565957446808511, "grad_norm": 0.9373677372932434, "loss": 5.1228, "lr": 0.0005984615384615385, "step": 3372, "tokens_trained": 0.320472168 }, { "epoch": 0.9571631205673758, "grad_norm": 0.9842008352279663, "loss": 5.0338, "lr": 0.0005981818181818181, "step": 3374, "tokens_trained": 0.320662592 }, { "epoch": 0.9574468085106383, "eval_loss": 5.064303398132324, "eval_runtime": 20.617, "step": 3375, "tokens_trained": 0.320758504 }, { "epoch": 0.9577304964539007, "grad_norm": 0.9934602379798889, "loss": 5.0444, "lr": 0.000597902097902098, "step": 3376, "tokens_trained": 0.320853552 }, { "epoch": 0.9582978723404255, "grad_norm": 0.9192136526107788, "loss": 5.0502, "lr": 0.0005976223776223776, "step": 3378, "tokens_trained": 0.321043072 }, { "epoch": 0.9588652482269504, "grad_norm": 0.9416385293006897, "loss": 5.0676, "lr": 0.0005973426573426574, "step": 3380, "tokens_trained": 0.321234024 }, { "epoch": 0.9594326241134752, "grad_norm": 0.87016761302948, "loss": 5.0474, "lr": 0.0005970629370629371, "step": 3382, "tokens_trained": 0.321423504 }, { "epoch": 0.96, "grad_norm": 0.9421593546867371, "loss": 5.0148, "lr": 0.0005967832167832167, "step": 3384, "tokens_trained": 0.32161436 }, { "epoch": 0.9605673758865249, "grad_norm": 0.9040830135345459, "loss": 5.0065, "lr": 0.0005965034965034966, "step": 3386, "tokens_trained": 0.321804688 }, { "epoch": 0.9611347517730496, "grad_norm": 0.9497122764587402, "loss": 5.0882, "lr": 0.0005962237762237762, "step": 3388, "tokens_trained": 0.321994728 }, { "epoch": 0.9617021276595744, "grad_norm": 0.9700525999069214, "loss": 5.0462, "lr": 0.000595944055944056, "step": 3390, "tokens_trained": 0.322186 }, { "epoch": 0.9622695035460993, "grad_norm": 0.9304616451263428, "loss": 5.0781, "lr": 0.0005956643356643356, "step": 3392, "tokens_trained": 0.322376408 }, { "epoch": 0.9628368794326241, "grad_norm": 0.8804431557655334, "loss": 5.1449, "lr": 0.0005953846153846155, "step": 3394, "tokens_trained": 0.322566024 }, { "epoch": 0.963404255319149, "grad_norm": 0.8852412700653076, "loss": 5.0602, "lr": 0.0005951048951048951, "step": 3396, "tokens_trained": 0.322758272 }, { "epoch": 0.9639716312056738, "grad_norm": 1.015409231185913, "loss": 5.0753, "lr": 0.0005948251748251748, "step": 3398, "tokens_trained": 0.322948904 }, { "epoch": 0.9645390070921985, "grad_norm": 0.9504172205924988, "loss": 5.1003, "lr": 0.0005945454545454546, "step": 3400, "tokens_trained": 0.323140944 }, { "epoch": 0.9651063829787234, "grad_norm": 0.8708069920539856, "loss": 5.0316, "lr": 0.0005942657342657342, "step": 3402, "tokens_trained": 0.323331864 }, { "epoch": 0.9656737588652482, "grad_norm": 0.8804038166999817, "loss": 5.038, "lr": 0.0005939860139860141, "step": 3404, "tokens_trained": 0.323521296 }, { "epoch": 0.9662411347517731, "grad_norm": 0.901345431804657, "loss": 5.1247, "lr": 0.0005937062937062937, "step": 3406, "tokens_trained": 0.323713464 }, { "epoch": 0.9668085106382979, "grad_norm": 0.8839131593704224, "loss": 5.058, "lr": 0.0005934265734265735, "step": 3408, "tokens_trained": 0.323903208 }, { "epoch": 0.9673758865248226, "grad_norm": 0.9157027006149292, "loss": 5.007, "lr": 0.0005931468531468531, "step": 3410, "tokens_trained": 0.324091224 }, { "epoch": 0.9679432624113475, "grad_norm": 0.9776141047477722, "loss": 5.0244, "lr": 0.000592867132867133, "step": 3412, "tokens_trained": 0.324281696 }, { "epoch": 0.9685106382978723, "grad_norm": 0.8768822550773621, "loss": 5.0321, "lr": 0.0005925874125874126, "step": 3414, "tokens_trained": 0.324471136 }, { "epoch": 0.9690780141843972, "grad_norm": 0.7926638722419739, "loss": 5.0433, "lr": 0.0005923076923076923, "step": 3416, "tokens_trained": 0.324661816 }, { "epoch": 0.969645390070922, "grad_norm": 0.8630013465881348, "loss": 5.0876, "lr": 0.0005920279720279721, "step": 3418, "tokens_trained": 0.324852152 }, { "epoch": 0.9702127659574468, "grad_norm": 0.8769309520721436, "loss": 5.0611, "lr": 0.0005917482517482517, "step": 3420, "tokens_trained": 0.325042144 }, { "epoch": 0.9707801418439717, "grad_norm": 0.8933807611465454, "loss": 5.0288, "lr": 0.0005914685314685316, "step": 3422, "tokens_trained": 0.325232376 }, { "epoch": 0.9713475177304964, "grad_norm": 0.9544175267219543, "loss": 5.0038, "lr": 0.0005911888111888112, "step": 3424, "tokens_trained": 0.325423784 }, { "epoch": 0.9719148936170213, "grad_norm": 0.9057655930519104, "loss": 5.0613, "lr": 0.0005909090909090909, "step": 3426, "tokens_trained": 0.325614744 }, { "epoch": 0.9724822695035461, "grad_norm": 0.8956878781318665, "loss": 5.0327, "lr": 0.0005906293706293706, "step": 3428, "tokens_trained": 0.325803144 }, { "epoch": 0.9730496453900709, "grad_norm": 0.8879814147949219, "loss": 5.0018, "lr": 0.0005903496503496504, "step": 3430, "tokens_trained": 0.3259944 }, { "epoch": 0.9736170212765958, "grad_norm": 0.8801882863044739, "loss": 5.125, "lr": 0.0005900699300699301, "step": 3432, "tokens_trained": 0.326185928 }, { "epoch": 0.9741843971631206, "grad_norm": 0.8586528301239014, "loss": 5.0299, "lr": 0.0005897902097902098, "step": 3434, "tokens_trained": 0.326378416 }, { "epoch": 0.9747517730496454, "grad_norm": 0.8574861884117126, "loss": 5.0743, "lr": 0.0005895104895104896, "step": 3436, "tokens_trained": 0.326569616 }, { "epoch": 0.9753191489361702, "grad_norm": 0.8478572368621826, "loss": 5.0547, "lr": 0.0005892307692307692, "step": 3438, "tokens_trained": 0.326759744 }, { "epoch": 0.975886524822695, "grad_norm": 0.8645881414413452, "loss": 5.0504, "lr": 0.000588951048951049, "step": 3440, "tokens_trained": 0.3269478 }, { "epoch": 0.9764539007092199, "grad_norm": 0.8346559405326843, "loss": 5.0472, "lr": 0.0005886713286713287, "step": 3442, "tokens_trained": 0.32714012 }, { "epoch": 0.9770212765957447, "grad_norm": 0.8666026592254639, "loss": 5.0557, "lr": 0.0005883916083916084, "step": 3444, "tokens_trained": 0.327329992 }, { "epoch": 0.9775886524822694, "grad_norm": 0.9243910312652588, "loss": 5.0326, "lr": 0.0005881118881118881, "step": 3446, "tokens_trained": 0.327520664 }, { "epoch": 0.9781560283687943, "grad_norm": 0.8909792304039001, "loss": 5.0948, "lr": 0.0005878321678321679, "step": 3448, "tokens_trained": 0.327712056 }, { "epoch": 0.9787234042553191, "grad_norm": 0.8586627840995789, "loss": 5.0587, "lr": 0.0005875524475524476, "step": 3450, "tokens_trained": 0.327903456 }, { "epoch": 0.979290780141844, "grad_norm": 0.9551260471343994, "loss": 5.0493, "lr": 0.0005872727272727273, "step": 3452, "tokens_trained": 0.328093768 }, { "epoch": 0.9798581560283688, "grad_norm": 0.8501218557357788, "loss": 5.0725, "lr": 0.0005869930069930069, "step": 3454, "tokens_trained": 0.328281896 }, { "epoch": 0.9804255319148936, "grad_norm": 0.8573510646820068, "loss": 5.057, "lr": 0.0005867132867132867, "step": 3456, "tokens_trained": 0.32847448 }, { "epoch": 0.9809929078014185, "grad_norm": 0.8716034889221191, "loss": 4.9833, "lr": 0.0005864335664335665, "step": 3458, "tokens_trained": 0.328661304 }, { "epoch": 0.9815602836879432, "grad_norm": 0.8251221179962158, "loss": 5.0059, "lr": 0.0005861538461538462, "step": 3460, "tokens_trained": 0.328850496 }, { "epoch": 0.9821276595744681, "grad_norm": 0.8577293753623962, "loss": 5.0385, "lr": 0.0005858741258741259, "step": 3462, "tokens_trained": 0.329039896 }, { "epoch": 0.9826950354609929, "grad_norm": 0.9229962825775146, "loss": 5.0115, "lr": 0.0005855944055944055, "step": 3464, "tokens_trained": 0.329230472 }, { "epoch": 0.9832624113475177, "grad_norm": 0.8700546622276306, "loss": 5.0319, "lr": 0.0005853146853146854, "step": 3466, "tokens_trained": 0.32941888 }, { "epoch": 0.9838297872340426, "grad_norm": 0.8610907196998596, "loss": 5.0327, "lr": 0.000585034965034965, "step": 3468, "tokens_trained": 0.329611152 }, { "epoch": 0.9843971631205674, "grad_norm": 0.7971277236938477, "loss": 5.0813, "lr": 0.0005847552447552448, "step": 3470, "tokens_trained": 0.329800024 }, { "epoch": 0.9849645390070922, "grad_norm": 0.9169178009033203, "loss": 4.9764, "lr": 0.0005844755244755244, "step": 3472, "tokens_trained": 0.329991688 }, { "epoch": 0.985531914893617, "grad_norm": 0.9630699157714844, "loss": 5.0263, "lr": 0.0005841958041958042, "step": 3474, "tokens_trained": 0.33018312 }, { "epoch": 0.9860992907801418, "grad_norm": 0.9706154465675354, "loss": 4.9928, "lr": 0.000583916083916084, "step": 3476, "tokens_trained": 0.330372336 }, { "epoch": 0.9866666666666667, "grad_norm": 0.9754578471183777, "loss": 5.0122, "lr": 0.0005836363636363636, "step": 3478, "tokens_trained": 0.330564472 }, { "epoch": 0.9872340425531915, "grad_norm": 0.9906936287879944, "loss": 5.0495, "lr": 0.0005833566433566434, "step": 3480, "tokens_trained": 0.3307554 }, { "epoch": 0.9878014184397164, "grad_norm": 0.9739910960197449, "loss": 4.9801, "lr": 0.000583076923076923, "step": 3482, "tokens_trained": 0.330944608 }, { "epoch": 0.9883687943262411, "grad_norm": 1.0058059692382812, "loss": 5.0974, "lr": 0.0005827972027972029, "step": 3484, "tokens_trained": 0.331134752 }, { "epoch": 0.9889361702127659, "grad_norm": 1.0330032110214233, "loss": 5.1054, "lr": 0.0005825174825174825, "step": 3486, "tokens_trained": 0.331323744 }, { "epoch": 0.9895035460992908, "grad_norm": 0.9857019186019897, "loss": 5.0417, "lr": 0.0005822377622377623, "step": 3488, "tokens_trained": 0.33151316 }, { "epoch": 0.9900709219858156, "grad_norm": 0.8929789066314697, "loss": 5.0753, "lr": 0.0005819580419580419, "step": 3490, "tokens_trained": 0.331703136 }, { "epoch": 0.9906382978723405, "grad_norm": 0.9803673624992371, "loss": 5.0748, "lr": 0.0005816783216783216, "step": 3492, "tokens_trained": 0.331894376 }, { "epoch": 0.9912056737588653, "grad_norm": 1.0658507347106934, "loss": 4.952, "lr": 0.0005813986013986015, "step": 3494, "tokens_trained": 0.33208472 }, { "epoch": 0.99177304964539, "grad_norm": 0.9646208882331848, "loss": 5.0638, "lr": 0.0005811188811188811, "step": 3496, "tokens_trained": 0.332274704 }, { "epoch": 0.9923404255319149, "grad_norm": 0.9479737281799316, "loss": 4.9608, "lr": 0.0005808391608391609, "step": 3498, "tokens_trained": 0.332464656 }, { "epoch": 0.9929078014184397, "grad_norm": 0.9422057867050171, "loss": 4.9805, "lr": 0.0005805594405594405, "step": 3500, "tokens_trained": 0.332653056 }, { "epoch": 0.9929078014184397, "eval_loss": 5.051141738891602, "eval_runtime": 20.5005, "step": 3500, "tokens_trained": 0.332653056 }, { "epoch": 0.9934751773049645, "grad_norm": 0.8606115579605103, "loss": 5.014, "lr": 0.0005802797202797204, "step": 3502, "tokens_trained": 0.33284184 }, { "epoch": 0.9940425531914894, "grad_norm": 0.9218055009841919, "loss": 5.015, "lr": 0.00058, "step": 3504, "tokens_trained": 0.333031504 }, { "epoch": 0.9946099290780142, "grad_norm": 0.8346299529075623, "loss": 5.0793, "lr": 0.0005797202797202797, "step": 3506, "tokens_trained": 0.333222184 }, { "epoch": 0.995177304964539, "grad_norm": 0.9426013231277466, "loss": 5.0416, "lr": 0.0005794405594405594, "step": 3508, "tokens_trained": 0.333413 }, { "epoch": 0.9957446808510638, "grad_norm": 0.973893940448761, "loss": 5.0579, "lr": 0.0005791608391608391, "step": 3510, "tokens_trained": 0.333602176 }, { "epoch": 0.9963120567375886, "grad_norm": 0.9642478823661804, "loss": 5.1078, "lr": 0.000578881118881119, "step": 3512, "tokens_trained": 0.333792992 }, { "epoch": 0.9968794326241135, "grad_norm": 0.9709126949310303, "loss": 5.0379, "lr": 0.0005786013986013986, "step": 3514, "tokens_trained": 0.333982568 }, { "epoch": 0.9974468085106383, "grad_norm": 0.9238979816436768, "loss": 5.0391, "lr": 0.0005783216783216784, "step": 3516, "tokens_trained": 0.334171688 }, { "epoch": 0.9980141843971632, "grad_norm": 0.884200930595398, "loss": 5.0402, "lr": 0.000578041958041958, "step": 3518, "tokens_trained": 0.334361968 }, { "epoch": 0.9985815602836879, "grad_norm": 0.9557647705078125, "loss": 5.0816, "lr": 0.0005777622377622377, "step": 3520, "tokens_trained": 0.3345518 }, { "epoch": 0.9991489361702127, "grad_norm": 0.963729202747345, "loss": 5.0047, "lr": 0.0005774825174825175, "step": 3522, "tokens_trained": 0.334743104 }, { "epoch": 0.9997163120567376, "grad_norm": 0.8432028889656067, "loss": 5.0323, "lr": 0.0005772027972027972, "step": 3524, "tokens_trained": 0.334932336 }, { "epoch": 1.0002836879432624, "grad_norm": 0.9493524432182312, "loss": 5.0686, "lr": 0.0005769230769230769, "step": 3526, "tokens_trained": 0.335119632 }, { "epoch": 1.0008510638297872, "grad_norm": 0.8715479969978333, "loss": 4.9798, "lr": 0.0005766433566433566, "step": 3528, "tokens_trained": 0.335308752 }, { "epoch": 1.001418439716312, "grad_norm": 0.9414225816726685, "loss": 5.0294, "lr": 0.0005763636363636365, "step": 3530, "tokens_trained": 0.335499976 }, { "epoch": 1.001985815602837, "grad_norm": 0.9580470323562622, "loss": 5.0097, "lr": 0.0005760839160839161, "step": 3532, "tokens_trained": 0.335687704 }, { "epoch": 1.0025531914893617, "grad_norm": 0.8775055408477783, "loss": 5.047, "lr": 0.0005758041958041958, "step": 3534, "tokens_trained": 0.335877328 }, { "epoch": 1.0031205673758865, "grad_norm": 0.8149566054344177, "loss": 5.0598, "lr": 0.0005755244755244755, "step": 3536, "tokens_trained": 0.33606848 }, { "epoch": 1.0036879432624113, "grad_norm": 0.8992729783058167, "loss": 4.9875, "lr": 0.0005752447552447552, "step": 3538, "tokens_trained": 0.336259808 }, { "epoch": 1.004255319148936, "grad_norm": 0.913520872592926, "loss": 5.0254, "lr": 0.000574965034965035, "step": 3540, "tokens_trained": 0.336449872 }, { "epoch": 1.004822695035461, "grad_norm": 0.9528400301933289, "loss": 4.9949, "lr": 0.0005746853146853147, "step": 3542, "tokens_trained": 0.336640192 }, { "epoch": 1.0053900709219858, "grad_norm": 0.933910071849823, "loss": 5.0776, "lr": 0.0005744055944055944, "step": 3544, "tokens_trained": 0.336829088 }, { "epoch": 1.0059574468085106, "grad_norm": 0.9097418785095215, "loss": 5.0021, "lr": 0.0005741258741258741, "step": 3546, "tokens_trained": 0.337021576 }, { "epoch": 1.0065248226950354, "grad_norm": 0.8718441724777222, "loss": 5.0946, "lr": 0.000573846153846154, "step": 3548, "tokens_trained": 0.337210208 }, { "epoch": 1.0070921985815602, "grad_norm": 0.887489378452301, "loss": 4.9686, "lr": 0.0005735664335664336, "step": 3550, "tokens_trained": 0.337401176 }, { "epoch": 1.0076595744680852, "grad_norm": 0.8851242065429688, "loss": 5.0423, "lr": 0.0005732867132867133, "step": 3552, "tokens_trained": 0.337589216 }, { "epoch": 1.00822695035461, "grad_norm": 0.8381972312927246, "loss": 5.0645, "lr": 0.000573006993006993, "step": 3554, "tokens_trained": 0.337777424 }, { "epoch": 1.0087943262411347, "grad_norm": 0.8307796716690063, "loss": 5.036, "lr": 0.0005727272727272727, "step": 3556, "tokens_trained": 0.337967088 }, { "epoch": 1.0093617021276595, "grad_norm": 0.9271431565284729, "loss": 5.0384, "lr": 0.0005724475524475525, "step": 3558, "tokens_trained": 0.33815904 }, { "epoch": 1.0099290780141843, "grad_norm": 0.9501886367797852, "loss": 5.0929, "lr": 0.0005721678321678322, "step": 3560, "tokens_trained": 0.338349184 }, { "epoch": 1.0104964539007093, "grad_norm": 0.9176658987998962, "loss": 5.0721, "lr": 0.0005718881118881118, "step": 3562, "tokens_trained": 0.338539664 }, { "epoch": 1.011063829787234, "grad_norm": 0.8755439519882202, "loss": 5.0864, "lr": 0.0005716083916083916, "step": 3564, "tokens_trained": 0.33872792 }, { "epoch": 1.0116312056737589, "grad_norm": 0.9178908467292786, "loss": 5.035, "lr": 0.0005713286713286714, "step": 3566, "tokens_trained": 0.33891592 }, { "epoch": 1.0121985815602836, "grad_norm": 0.9046779870986938, "loss": 5.0286, "lr": 0.0005710489510489511, "step": 3568, "tokens_trained": 0.3391062 }, { "epoch": 1.0127659574468084, "grad_norm": 0.8680547475814819, "loss": 5.036, "lr": 0.0005707692307692308, "step": 3570, "tokens_trained": 0.339295896 }, { "epoch": 1.0133333333333334, "grad_norm": 0.8271722793579102, "loss": 5.0438, "lr": 0.0005704895104895105, "step": 3572, "tokens_trained": 0.339487368 }, { "epoch": 1.0139007092198582, "grad_norm": 0.8582717180252075, "loss": 5.1501, "lr": 0.0005702097902097902, "step": 3574, "tokens_trained": 0.339678792 }, { "epoch": 1.014468085106383, "grad_norm": 0.9433448314666748, "loss": 5.0575, "lr": 0.0005699300699300699, "step": 3576, "tokens_trained": 0.33987056 }, { "epoch": 1.0150354609929078, "grad_norm": 0.8291800022125244, "loss": 5.0284, "lr": 0.0005696503496503497, "step": 3578, "tokens_trained": 0.340059304 }, { "epoch": 1.0156028368794325, "grad_norm": 0.8057491183280945, "loss": 5.0737, "lr": 0.0005693706293706293, "step": 3580, "tokens_trained": 0.34024912 }, { "epoch": 1.0161702127659575, "grad_norm": 0.8925788998603821, "loss": 5.017, "lr": 0.0005690909090909091, "step": 3582, "tokens_trained": 0.340439688 }, { "epoch": 1.0167375886524823, "grad_norm": 0.8613091707229614, "loss": 5.0778, "lr": 0.0005688111888111889, "step": 3584, "tokens_trained": 0.34063064 }, { "epoch": 1.017304964539007, "grad_norm": 0.9694734811782837, "loss": 5.0831, "lr": 0.0005685314685314686, "step": 3586, "tokens_trained": 0.340820944 }, { "epoch": 1.0178723404255319, "grad_norm": 0.9405204653739929, "loss": 5.0819, "lr": 0.0005682517482517483, "step": 3588, "tokens_trained": 0.341008368 }, { "epoch": 1.0184397163120567, "grad_norm": 0.9191365838050842, "loss": 5.016, "lr": 0.0005679720279720279, "step": 3590, "tokens_trained": 0.341198984 }, { "epoch": 1.0190070921985817, "grad_norm": 0.9363374710083008, "loss": 5.0432, "lr": 0.0005676923076923077, "step": 3592, "tokens_trained": 0.341391808 }, { "epoch": 1.0195744680851064, "grad_norm": 0.9394513368606567, "loss": 5.0159, "lr": 0.0005674125874125874, "step": 3594, "tokens_trained": 0.34158276 }, { "epoch": 1.0201418439716312, "grad_norm": 0.8832948803901672, "loss": 5.1156, "lr": 0.0005671328671328672, "step": 3596, "tokens_trained": 0.341772296 }, { "epoch": 1.020709219858156, "grad_norm": 0.8347297310829163, "loss": 5.0666, "lr": 0.0005668531468531468, "step": 3598, "tokens_trained": 0.341959528 }, { "epoch": 1.0212765957446808, "grad_norm": 0.8295504450798035, "loss": 5.0179, "lr": 0.0005665734265734265, "step": 3600, "tokens_trained": 0.342150464 }, { "epoch": 1.0218439716312058, "grad_norm": 0.9434390068054199, "loss": 5.0127, "lr": 0.0005662937062937064, "step": 3602, "tokens_trained": 0.342339448 }, { "epoch": 1.0224113475177306, "grad_norm": 0.9653499722480774, "loss": 5.0665, "lr": 0.000566013986013986, "step": 3604, "tokens_trained": 0.342530488 }, { "epoch": 1.0229787234042553, "grad_norm": 0.8737668991088867, "loss": 5.0718, "lr": 0.0005657342657342658, "step": 3606, "tokens_trained": 0.342719696 }, { "epoch": 1.02354609929078, "grad_norm": 0.8800668716430664, "loss": 5.0302, "lr": 0.0005654545454545454, "step": 3608, "tokens_trained": 0.342909824 }, { "epoch": 1.0241134751773049, "grad_norm": 0.904245913028717, "loss": 5.0692, "lr": 0.0005651748251748252, "step": 3610, "tokens_trained": 0.343098816 }, { "epoch": 1.02468085106383, "grad_norm": 0.8640607595443726, "loss": 5.0146, "lr": 0.0005648951048951049, "step": 3612, "tokens_trained": 0.343288344 }, { "epoch": 1.0252482269503547, "grad_norm": 0.9330228567123413, "loss": 5.0123, "lr": 0.0005646153846153847, "step": 3614, "tokens_trained": 0.34347712 }, { "epoch": 1.0258156028368794, "grad_norm": 0.8850971460342407, "loss": 5.0718, "lr": 0.0005643356643356643, "step": 3616, "tokens_trained": 0.343666264 }, { "epoch": 1.0263829787234042, "grad_norm": 0.9091493487358093, "loss": 5.0508, "lr": 0.000564055944055944, "step": 3618, "tokens_trained": 0.343854008 }, { "epoch": 1.026950354609929, "grad_norm": 0.8939360976219177, "loss": 5.0492, "lr": 0.0005637762237762239, "step": 3620, "tokens_trained": 0.344046368 }, { "epoch": 1.027517730496454, "grad_norm": 0.9629043340682983, "loss": 5.0234, "lr": 0.0005634965034965035, "step": 3622, "tokens_trained": 0.344236592 }, { "epoch": 1.0280851063829788, "grad_norm": 0.955611526966095, "loss": 4.9878, "lr": 0.0005632167832167833, "step": 3624, "tokens_trained": 0.344425704 }, { "epoch": 1.0283687943262412, "eval_loss": 5.0450639724731445, "eval_runtime": 20.6963, "step": 3625, "tokens_trained": 0.344518808 }, { "epoch": 1.0286524822695036, "grad_norm": 0.9501426815986633, "loss": 5.0039, "lr": 0.0005629370629370629, "step": 3626, "tokens_trained": 0.344612688 }, { "epoch": 1.0292198581560283, "grad_norm": 0.9446471333503723, "loss": 5.0306, "lr": 0.0005626573426573426, "step": 3628, "tokens_trained": 0.344802448 }, { "epoch": 1.0297872340425531, "grad_norm": 0.9773867726325989, "loss": 5.0016, "lr": 0.0005623776223776224, "step": 3630, "tokens_trained": 0.344992872 }, { "epoch": 1.030354609929078, "grad_norm": 0.8802851438522339, "loss": 5.0263, "lr": 0.0005620979020979021, "step": 3632, "tokens_trained": 0.345182064 }, { "epoch": 1.030921985815603, "grad_norm": 0.9009132385253906, "loss": 4.9681, "lr": 0.0005618181818181818, "step": 3634, "tokens_trained": 0.345372888 }, { "epoch": 1.0314893617021277, "grad_norm": 0.9252756834030151, "loss": 4.9491, "lr": 0.0005615384615384615, "step": 3636, "tokens_trained": 0.345563088 }, { "epoch": 1.0320567375886525, "grad_norm": 0.9195572733879089, "loss": 5.0525, "lr": 0.0005612587412587414, "step": 3638, "tokens_trained": 0.345753928 }, { "epoch": 1.0326241134751772, "grad_norm": 0.8032271862030029, "loss": 5.0535, "lr": 0.000560979020979021, "step": 3640, "tokens_trained": 0.345945664 }, { "epoch": 1.033191489361702, "grad_norm": 0.7840321660041809, "loss": 4.9713, "lr": 0.0005606993006993008, "step": 3642, "tokens_trained": 0.346134096 }, { "epoch": 1.033758865248227, "grad_norm": 0.8394534587860107, "loss": 5.0695, "lr": 0.0005604195804195804, "step": 3644, "tokens_trained": 0.346325368 }, { "epoch": 1.0343262411347518, "grad_norm": 0.8543218374252319, "loss": 4.9826, "lr": 0.0005601398601398601, "step": 3646, "tokens_trained": 0.346515088 }, { "epoch": 1.0348936170212766, "grad_norm": 0.8483793139457703, "loss": 4.9956, "lr": 0.0005598601398601399, "step": 3648, "tokens_trained": 0.346705304 }, { "epoch": 1.0354609929078014, "grad_norm": 0.8377392888069153, "loss": 4.9123, "lr": 0.0005595804195804196, "step": 3650, "tokens_trained": 0.34689744 }, { "epoch": 1.0360283687943261, "grad_norm": 0.902778685092926, "loss": 5.0771, "lr": 0.0005593006993006993, "step": 3652, "tokens_trained": 0.347086984 }, { "epoch": 1.0365957446808511, "grad_norm": 0.915446937084198, "loss": 5.0235, "lr": 0.000559020979020979, "step": 3654, "tokens_trained": 0.347278816 }, { "epoch": 1.037163120567376, "grad_norm": 0.803059458732605, "loss": 5.0255, "lr": 0.0005587412587412589, "step": 3656, "tokens_trained": 0.347468136 }, { "epoch": 1.0377304964539007, "grad_norm": 0.9930711984634399, "loss": 5.0759, "lr": 0.0005584615384615385, "step": 3658, "tokens_trained": 0.347659624 }, { "epoch": 1.0382978723404255, "grad_norm": 0.9266470670700073, "loss": 5.0732, "lr": 0.0005581818181818182, "step": 3660, "tokens_trained": 0.347848536 }, { "epoch": 1.0388652482269503, "grad_norm": 0.8442680835723877, "loss": 5.0594, "lr": 0.0005579020979020979, "step": 3662, "tokens_trained": 0.348039968 }, { "epoch": 1.0394326241134753, "grad_norm": 0.8922600746154785, "loss": 4.9876, "lr": 0.0005576223776223776, "step": 3664, "tokens_trained": 0.348229432 }, { "epoch": 1.04, "grad_norm": 0.8602802753448486, "loss": 5.0332, "lr": 0.0005573426573426574, "step": 3666, "tokens_trained": 0.348420184 }, { "epoch": 1.0405673758865248, "grad_norm": 0.8762813806533813, "loss": 4.9641, "lr": 0.0005570629370629371, "step": 3668, "tokens_trained": 0.348609504 }, { "epoch": 1.0411347517730496, "grad_norm": 0.8674803972244263, "loss": 5.0701, "lr": 0.0005567832167832167, "step": 3670, "tokens_trained": 0.348799384 }, { "epoch": 1.0417021276595744, "grad_norm": 0.8296146988868713, "loss": 5.0432, "lr": 0.0005565034965034965, "step": 3672, "tokens_trained": 0.34898944 }, { "epoch": 1.0422695035460994, "grad_norm": 0.7757400870323181, "loss": 5.0742, "lr": 0.0005562237762237763, "step": 3674, "tokens_trained": 0.349178752 }, { "epoch": 1.0428368794326242, "grad_norm": 0.8509469032287598, "loss": 5.0754, "lr": 0.000555944055944056, "step": 3676, "tokens_trained": 0.349369944 }, { "epoch": 1.043404255319149, "grad_norm": 0.8896392583847046, "loss": 5.0305, "lr": 0.0005556643356643357, "step": 3678, "tokens_trained": 0.3495604 }, { "epoch": 1.0439716312056737, "grad_norm": 0.8363154530525208, "loss": 4.9969, "lr": 0.0005553846153846154, "step": 3680, "tokens_trained": 0.349749488 }, { "epoch": 1.0445390070921985, "grad_norm": 0.8382596969604492, "loss": 4.9747, "lr": 0.0005551048951048951, "step": 3682, "tokens_trained": 0.349939408 }, { "epoch": 1.0451063829787235, "grad_norm": 0.9114118218421936, "loss": 4.9993, "lr": 0.0005548251748251748, "step": 3684, "tokens_trained": 0.350129704 }, { "epoch": 1.0456737588652483, "grad_norm": 0.8570284843444824, "loss": 5.0509, "lr": 0.0005545454545454546, "step": 3686, "tokens_trained": 0.350319608 }, { "epoch": 1.046241134751773, "grad_norm": 0.8100084066390991, "loss": 4.9202, "lr": 0.0005542657342657342, "step": 3688, "tokens_trained": 0.35051 }, { "epoch": 1.0468085106382978, "grad_norm": 0.9485516548156738, "loss": 4.983, "lr": 0.000553986013986014, "step": 3690, "tokens_trained": 0.350702976 }, { "epoch": 1.0473758865248226, "grad_norm": 0.9124506115913391, "loss": 5.0354, "lr": 0.0005537062937062938, "step": 3692, "tokens_trained": 0.350894824 }, { "epoch": 1.0479432624113476, "grad_norm": 0.9002963900566101, "loss": 5.0171, "lr": 0.0005534265734265735, "step": 3694, "tokens_trained": 0.351085672 }, { "epoch": 1.0485106382978724, "grad_norm": 0.8576133251190186, "loss": 5.0411, "lr": 0.0005531468531468532, "step": 3696, "tokens_trained": 0.351274576 }, { "epoch": 1.0490780141843972, "grad_norm": 0.8824317455291748, "loss": 5.034, "lr": 0.0005528671328671328, "step": 3698, "tokens_trained": 0.351465168 }, { "epoch": 1.049645390070922, "grad_norm": 0.9119421243667603, "loss": 5.0423, "lr": 0.0005525874125874126, "step": 3700, "tokens_trained": 0.35165588 }, { "epoch": 1.0502127659574467, "grad_norm": 0.8260598182678223, "loss": 5.0213, "lr": 0.0005523076923076923, "step": 3702, "tokens_trained": 0.351846456 }, { "epoch": 1.0507801418439717, "grad_norm": 0.9968200922012329, "loss": 4.9728, "lr": 0.0005520279720279721, "step": 3704, "tokens_trained": 0.352036312 }, { "epoch": 1.0513475177304965, "grad_norm": 0.9910591840744019, "loss": 5.0692, "lr": 0.0005517482517482517, "step": 3706, "tokens_trained": 0.352227032 }, { "epoch": 1.0519148936170213, "grad_norm": 0.8656545877456665, "loss": 5.0201, "lr": 0.0005514685314685315, "step": 3708, "tokens_trained": 0.35241624 }, { "epoch": 1.052482269503546, "grad_norm": 0.9561606049537659, "loss": 5.055, "lr": 0.0005511888111888111, "step": 3710, "tokens_trained": 0.352607936 }, { "epoch": 1.0530496453900708, "grad_norm": 0.9602318406105042, "loss": 5.0372, "lr": 0.0005509090909090909, "step": 3712, "tokens_trained": 0.352797584 }, { "epoch": 1.0536170212765958, "grad_norm": 0.9743978381156921, "loss": 5.0101, "lr": 0.0005506293706293707, "step": 3714, "tokens_trained": 0.352988184 }, { "epoch": 1.0541843971631206, "grad_norm": 0.9676964282989502, "loss": 5.0518, "lr": 0.0005503496503496503, "step": 3716, "tokens_trained": 0.353180088 }, { "epoch": 1.0547517730496454, "grad_norm": 0.8736178874969482, "loss": 5.0278, "lr": 0.0005500699300699301, "step": 3718, "tokens_trained": 0.353370808 }, { "epoch": 1.0553191489361702, "grad_norm": 0.8516846895217896, "loss": 4.9893, "lr": 0.0005497902097902098, "step": 3720, "tokens_trained": 0.35356052 }, { "epoch": 1.055886524822695, "grad_norm": 1.0038187503814697, "loss": 5.0376, "lr": 0.0005495104895104896, "step": 3722, "tokens_trained": 0.353752744 }, { "epoch": 1.05645390070922, "grad_norm": 0.9077925682067871, "loss": 5.045, "lr": 0.0005492307692307692, "step": 3724, "tokens_trained": 0.353944136 }, { "epoch": 1.0570212765957447, "grad_norm": 0.8750975728034973, "loss": 5.0275, "lr": 0.0005489510489510489, "step": 3726, "tokens_trained": 0.354135648 }, { "epoch": 1.0575886524822695, "grad_norm": 0.9059204459190369, "loss": 5.0502, "lr": 0.0005486713286713286, "step": 3728, "tokens_trained": 0.354325256 }, { "epoch": 1.0581560283687943, "grad_norm": 0.8883426189422607, "loss": 5.0016, "lr": 0.0005483916083916084, "step": 3730, "tokens_trained": 0.354517776 }, { "epoch": 1.058723404255319, "grad_norm": 0.911379873752594, "loss": 5.0363, "lr": 0.0005481118881118882, "step": 3732, "tokens_trained": 0.354706528 }, { "epoch": 1.0592907801418439, "grad_norm": 0.8956911563873291, "loss": 5.0028, "lr": 0.0005478321678321678, "step": 3734, "tokens_trained": 0.354896352 }, { "epoch": 1.0598581560283689, "grad_norm": 0.9133324027061462, "loss": 5.0426, "lr": 0.0005475524475524476, "step": 3736, "tokens_trained": 0.3550884 }, { "epoch": 1.0604255319148936, "grad_norm": 0.8321526050567627, "loss": 4.9918, "lr": 0.0005472727272727273, "step": 3738, "tokens_trained": 0.355277608 }, { "epoch": 1.0609929078014184, "grad_norm": 0.8607254028320312, "loss": 5.021, "lr": 0.000546993006993007, "step": 3740, "tokens_trained": 0.355467432 }, { "epoch": 1.0615602836879432, "grad_norm": 0.8457037806510925, "loss": 5.037, "lr": 0.0005467132867132867, "step": 3742, "tokens_trained": 0.355659088 }, { "epoch": 1.0621276595744682, "grad_norm": 0.9381092190742493, "loss": 4.9878, "lr": 0.0005464335664335664, "step": 3744, "tokens_trained": 0.35585168 }, { "epoch": 1.062695035460993, "grad_norm": 0.8678731918334961, "loss": 5.0716, "lr": 0.0005461538461538461, "step": 3746, "tokens_trained": 0.356040984 }, { "epoch": 1.0632624113475178, "grad_norm": 0.8570135235786438, "loss": 5.0018, "lr": 0.0005458741258741259, "step": 3748, "tokens_trained": 0.356230064 }, { "epoch": 1.0638297872340425, "grad_norm": 0.9624084234237671, "loss": 5.0531, "lr": 0.0005455944055944057, "step": 3750, "tokens_trained": 0.356419352 }, { "epoch": 1.0638297872340425, "eval_loss": 5.037150859832764, "eval_runtime": 20.8153, "step": 3750, "tokens_trained": 0.356419352 }, { "epoch": 1.0643971631205673, "grad_norm": 0.9213569760322571, "loss": 5.0228, "lr": 0.0005453146853146853, "step": 3752, "tokens_trained": 0.356611128 }, { "epoch": 1.064964539007092, "grad_norm": 0.8769538998603821, "loss": 5.0138, "lr": 0.000545034965034965, "step": 3754, "tokens_trained": 0.356800248 }, { "epoch": 1.065531914893617, "grad_norm": 0.9480370879173279, "loss": 5.056, "lr": 0.0005447552447552448, "step": 3756, "tokens_trained": 0.35699148 }, { "epoch": 1.0660992907801419, "grad_norm": 0.8391848206520081, "loss": 5.0256, "lr": 0.0005444755244755245, "step": 3758, "tokens_trained": 0.357182168 }, { "epoch": 1.0666666666666667, "grad_norm": 0.85853111743927, "loss": 5.0147, "lr": 0.0005441958041958042, "step": 3760, "tokens_trained": 0.357373032 }, { "epoch": 1.0672340425531914, "grad_norm": 0.8469287753105164, "loss": 4.9702, "lr": 0.0005439160839160839, "step": 3762, "tokens_trained": 0.357562944 }, { "epoch": 1.0678014184397162, "grad_norm": 0.8880507349967957, "loss": 5.0123, "lr": 0.0005436363636363635, "step": 3764, "tokens_trained": 0.357753872 }, { "epoch": 1.0683687943262412, "grad_norm": 0.9235898852348328, "loss": 4.9693, "lr": 0.0005433566433566434, "step": 3766, "tokens_trained": 0.357944312 }, { "epoch": 1.068936170212766, "grad_norm": 0.8787907361984253, "loss": 4.9987, "lr": 0.0005430769230769231, "step": 3768, "tokens_trained": 0.35813388 }, { "epoch": 1.0695035460992908, "grad_norm": 0.8627321124076843, "loss": 4.9938, "lr": 0.0005427972027972028, "step": 3770, "tokens_trained": 0.35832436 }, { "epoch": 1.0700709219858155, "grad_norm": 0.8891534805297852, "loss": 4.9982, "lr": 0.0005425174825174825, "step": 3772, "tokens_trained": 0.35851672 }, { "epoch": 1.0706382978723403, "grad_norm": 0.947503387928009, "loss": 5.0114, "lr": 0.0005422377622377623, "step": 3774, "tokens_trained": 0.358705936 }, { "epoch": 1.0712056737588653, "grad_norm": 0.9056106805801392, "loss": 5.0199, "lr": 0.000541958041958042, "step": 3776, "tokens_trained": 0.358896904 }, { "epoch": 1.07177304964539, "grad_norm": 0.9422404766082764, "loss": 5.0556, "lr": 0.0005416783216783216, "step": 3778, "tokens_trained": 0.35908716 }, { "epoch": 1.0723404255319149, "grad_norm": 0.9013909101486206, "loss": 5.0516, "lr": 0.0005413986013986014, "step": 3780, "tokens_trained": 0.359276784 }, { "epoch": 1.0729078014184397, "grad_norm": 0.8561504483222961, "loss": 4.973, "lr": 0.000541118881118881, "step": 3782, "tokens_trained": 0.35946816 }, { "epoch": 1.0734751773049644, "grad_norm": 0.8561832308769226, "loss": 5.053, "lr": 0.0005408391608391609, "step": 3784, "tokens_trained": 0.3596616 }, { "epoch": 1.0740425531914894, "grad_norm": 0.7730107307434082, "loss": 5.006, "lr": 0.0005405594405594406, "step": 3786, "tokens_trained": 0.359853624 }, { "epoch": 1.0746099290780142, "grad_norm": 0.889777660369873, "loss": 5.0275, "lr": 0.0005402797202797203, "step": 3788, "tokens_trained": 0.360044616 }, { "epoch": 1.075177304964539, "grad_norm": 0.8839589357376099, "loss": 5.0635, "lr": 0.00054, "step": 3790, "tokens_trained": 0.36023484 }, { "epoch": 1.0757446808510638, "grad_norm": 0.8542807102203369, "loss": 5.0161, "lr": 0.0005397202797202798, "step": 3792, "tokens_trained": 0.3604262 }, { "epoch": 1.0763120567375886, "grad_norm": 0.8976609706878662, "loss": 5.0693, "lr": 0.0005394405594405595, "step": 3794, "tokens_trained": 0.360615912 }, { "epoch": 1.0768794326241136, "grad_norm": 0.8138758540153503, "loss": 4.9589, "lr": 0.0005391608391608391, "step": 3796, "tokens_trained": 0.360807648 }, { "epoch": 1.0774468085106383, "grad_norm": 0.8604118227958679, "loss": 5.0311, "lr": 0.0005388811188811189, "step": 3798, "tokens_trained": 0.360998824 }, { "epoch": 1.0780141843971631, "grad_norm": 0.8839350342750549, "loss": 5.0355, "lr": 0.0005386013986013985, "step": 3800, "tokens_trained": 0.36119052 }, { "epoch": 1.078581560283688, "grad_norm": 0.9019435048103333, "loss": 4.9899, "lr": 0.0005383216783216784, "step": 3802, "tokens_trained": 0.361380456 }, { "epoch": 1.0791489361702127, "grad_norm": 0.8486269116401672, "loss": 5.033, "lr": 0.0005380419580419581, "step": 3804, "tokens_trained": 0.361569216 }, { "epoch": 1.0797163120567377, "grad_norm": 0.8133941888809204, "loss": 5.0129, "lr": 0.0005377622377622377, "step": 3806, "tokens_trained": 0.361761648 }, { "epoch": 1.0802836879432625, "grad_norm": 0.8590590357780457, "loss": 5.0938, "lr": 0.0005374825174825175, "step": 3808, "tokens_trained": 0.361950784 }, { "epoch": 1.0808510638297872, "grad_norm": 0.8362659215927124, "loss": 5.0397, "lr": 0.0005372027972027972, "step": 3810, "tokens_trained": 0.3621414 }, { "epoch": 1.081418439716312, "grad_norm": 0.912358820438385, "loss": 4.9804, "lr": 0.000536923076923077, "step": 3812, "tokens_trained": 0.362330072 }, { "epoch": 1.0819858156028368, "grad_norm": 0.9518508911132812, "loss": 4.9975, "lr": 0.0005366433566433566, "step": 3814, "tokens_trained": 0.362521472 }, { "epoch": 1.0825531914893618, "grad_norm": 0.8164550065994263, "loss": 4.9745, "lr": 0.0005363636363636364, "step": 3816, "tokens_trained": 0.362710744 }, { "epoch": 1.0831205673758866, "grad_norm": 0.9007307887077332, "loss": 4.9715, "lr": 0.000536083916083916, "step": 3818, "tokens_trained": 0.362900624 }, { "epoch": 1.0836879432624114, "grad_norm": 0.8775385022163391, "loss": 5.0296, "lr": 0.0005358041958041959, "step": 3820, "tokens_trained": 0.36309048 }, { "epoch": 1.0842553191489361, "grad_norm": 0.7864426970481873, "loss": 4.9837, "lr": 0.0005355244755244756, "step": 3822, "tokens_trained": 0.363280088 }, { "epoch": 1.084822695035461, "grad_norm": 0.7757525444030762, "loss": 5.0445, "lr": 0.0005352447552447552, "step": 3824, "tokens_trained": 0.363470768 }, { "epoch": 1.085390070921986, "grad_norm": 0.7588837146759033, "loss": 5.0431, "lr": 0.000534965034965035, "step": 3826, "tokens_trained": 0.363661176 }, { "epoch": 1.0859574468085107, "grad_norm": 0.8844705820083618, "loss": 5.0192, "lr": 0.0005346853146853147, "step": 3828, "tokens_trained": 0.363852544 }, { "epoch": 1.0865248226950355, "grad_norm": 0.8446291089057922, "loss": 5.0647, "lr": 0.0005344055944055945, "step": 3830, "tokens_trained": 0.364044088 }, { "epoch": 1.0870921985815603, "grad_norm": 0.8611181974411011, "loss": 5.0475, "lr": 0.0005341258741258741, "step": 3832, "tokens_trained": 0.364234688 }, { "epoch": 1.087659574468085, "grad_norm": 0.8670753240585327, "loss": 5.0243, "lr": 0.0005338461538461538, "step": 3834, "tokens_trained": 0.364424096 }, { "epoch": 1.0882269503546098, "grad_norm": 0.8563777208328247, "loss": 5.0512, "lr": 0.0005335664335664335, "step": 3836, "tokens_trained": 0.364611896 }, { "epoch": 1.0887943262411348, "grad_norm": 0.849647581577301, "loss": 5.0089, "lr": 0.0005332867132867133, "step": 3838, "tokens_trained": 0.364800808 }, { "epoch": 1.0893617021276596, "grad_norm": 0.8674852252006531, "loss": 5.0018, "lr": 0.0005330069930069931, "step": 3840, "tokens_trained": 0.364993432 }, { "epoch": 1.0899290780141844, "grad_norm": 0.8642079830169678, "loss": 4.9989, "lr": 0.0005327272727272727, "step": 3842, "tokens_trained": 0.365182432 }, { "epoch": 1.0904964539007092, "grad_norm": 0.8550288081169128, "loss": 4.9855, "lr": 0.0005324475524475525, "step": 3844, "tokens_trained": 0.365372416 }, { "epoch": 1.0910638297872342, "grad_norm": 0.901297926902771, "loss": 5.0342, "lr": 0.0005321678321678322, "step": 3846, "tokens_trained": 0.365564576 }, { "epoch": 1.091631205673759, "grad_norm": 0.8426658511161804, "loss": 5.0301, "lr": 0.000531888111888112, "step": 3848, "tokens_trained": 0.36575708 }, { "epoch": 1.0921985815602837, "grad_norm": 0.8530165553092957, "loss": 5.071, "lr": 0.0005316083916083916, "step": 3850, "tokens_trained": 0.365947984 }, { "epoch": 1.0927659574468085, "grad_norm": 0.9010403156280518, "loss": 5.0279, "lr": 0.0005313286713286713, "step": 3852, "tokens_trained": 0.366136392 }, { "epoch": 1.0933333333333333, "grad_norm": 0.9402730464935303, "loss": 4.9896, "lr": 0.000531048951048951, "step": 3854, "tokens_trained": 0.36632536 }, { "epoch": 1.093900709219858, "grad_norm": 0.8633377552032471, "loss": 5.0093, "lr": 0.0005307692307692308, "step": 3856, "tokens_trained": 0.366515056 }, { "epoch": 1.094468085106383, "grad_norm": 0.8778465390205383, "loss": 4.9574, "lr": 0.0005304895104895106, "step": 3858, "tokens_trained": 0.366705328 }, { "epoch": 1.0950354609929078, "grad_norm": 0.8562993407249451, "loss": 4.9938, "lr": 0.0005302097902097902, "step": 3860, "tokens_trained": 0.36689668 }, { "epoch": 1.0956028368794326, "grad_norm": 0.8061450719833374, "loss": 5.0132, "lr": 0.0005299300699300699, "step": 3862, "tokens_trained": 0.367087104 }, { "epoch": 1.0961702127659574, "grad_norm": 0.9253963232040405, "loss": 4.9672, "lr": 0.0005296503496503497, "step": 3864, "tokens_trained": 0.36727676 }, { "epoch": 1.0967375886524824, "grad_norm": 0.8339546918869019, "loss": 4.9757, "lr": 0.0005293706293706294, "step": 3866, "tokens_trained": 0.367467928 }, { "epoch": 1.0973049645390072, "grad_norm": 0.7303675413131714, "loss": 4.9613, "lr": 0.0005290909090909091, "step": 3868, "tokens_trained": 0.36765876 }, { "epoch": 1.097872340425532, "grad_norm": 0.8417290449142456, "loss": 5.0074, "lr": 0.0005288111888111888, "step": 3870, "tokens_trained": 0.367848064 }, { "epoch": 1.0984397163120567, "grad_norm": 0.7773861289024353, "loss": 4.9411, "lr": 0.0005285314685314684, "step": 3872, "tokens_trained": 0.368038176 }, { "epoch": 1.0990070921985815, "grad_norm": 0.8101850152015686, "loss": 5.0479, "lr": 0.0005282517482517483, "step": 3874, "tokens_trained": 0.368228208 }, { "epoch": 1.099290780141844, "eval_loss": 5.027334690093994, "eval_runtime": 20.6629, "step": 3875, "tokens_trained": 0.368324424 }, { "epoch": 1.0995744680851063, "grad_norm": 0.8131702542304993, "loss": 5.045, "lr": 0.000527972027972028, "step": 3876, "tokens_trained": 0.368421216 }, { "epoch": 1.1001418439716313, "grad_norm": 0.7819017171859741, "loss": 5.0151, "lr": 0.0005276923076923077, "step": 3878, "tokens_trained": 0.368612904 }, { "epoch": 1.100709219858156, "grad_norm": 0.8118953108787537, "loss": 5.0233, "lr": 0.0005274125874125874, "step": 3880, "tokens_trained": 0.368803144 }, { "epoch": 1.1012765957446808, "grad_norm": 0.8203917741775513, "loss": 4.9401, "lr": 0.0005271328671328672, "step": 3882, "tokens_trained": 0.368993072 }, { "epoch": 1.1018439716312056, "grad_norm": 0.8229494690895081, "loss": 5.0605, "lr": 0.0005268531468531469, "step": 3884, "tokens_trained": 0.36918396 }, { "epoch": 1.1024113475177304, "grad_norm": 0.7224509119987488, "loss": 5.03, "lr": 0.0005265734265734266, "step": 3886, "tokens_trained": 0.36937192 }, { "epoch": 1.1029787234042554, "grad_norm": 0.8122052550315857, "loss": 5.0416, "lr": 0.0005262937062937063, "step": 3888, "tokens_trained": 0.36956204 }, { "epoch": 1.1035460992907802, "grad_norm": 0.8190508484840393, "loss": 5.0106, "lr": 0.0005260139860139859, "step": 3890, "tokens_trained": 0.369753208 }, { "epoch": 1.104113475177305, "grad_norm": 0.845341682434082, "loss": 5.0, "lr": 0.0005257342657342658, "step": 3892, "tokens_trained": 0.36994372 }, { "epoch": 1.1046808510638297, "grad_norm": 0.9587157964706421, "loss": 5.0319, "lr": 0.0005254545454545455, "step": 3894, "tokens_trained": 0.370133776 }, { "epoch": 1.1052482269503545, "grad_norm": 0.8882042169570923, "loss": 4.9617, "lr": 0.0005251748251748252, "step": 3896, "tokens_trained": 0.370322504 }, { "epoch": 1.1058156028368795, "grad_norm": 0.879010021686554, "loss": 4.9197, "lr": 0.0005248951048951049, "step": 3898, "tokens_trained": 0.370514864 }, { "epoch": 1.1063829787234043, "grad_norm": 0.9890635013580322, "loss": 5.0381, "lr": 0.0005246153846153847, "step": 3900, "tokens_trained": 0.370706568 }, { "epoch": 1.106950354609929, "grad_norm": 0.8491361737251282, "loss": 5.0187, "lr": 0.0005243356643356644, "step": 3902, "tokens_trained": 0.370899112 }, { "epoch": 1.1075177304964539, "grad_norm": 0.8746361136436462, "loss": 5.0972, "lr": 0.000524055944055944, "step": 3904, "tokens_trained": 0.37108932 }, { "epoch": 1.1080851063829786, "grad_norm": 0.9623220562934875, "loss": 5.0143, "lr": 0.0005237762237762238, "step": 3906, "tokens_trained": 0.371276808 }, { "epoch": 1.1086524822695036, "grad_norm": 0.8145681023597717, "loss": 5.0081, "lr": 0.0005234965034965034, "step": 3908, "tokens_trained": 0.3714666 }, { "epoch": 1.1092198581560284, "grad_norm": 0.8862302899360657, "loss": 5.0613, "lr": 0.0005232167832167833, "step": 3910, "tokens_trained": 0.371654632 }, { "epoch": 1.1097872340425532, "grad_norm": 0.8897994160652161, "loss": 5.0447, "lr": 0.000522937062937063, "step": 3912, "tokens_trained": 0.37184496 }, { "epoch": 1.110354609929078, "grad_norm": 0.9659616947174072, "loss": 5.0852, "lr": 0.0005226573426573427, "step": 3914, "tokens_trained": 0.372034032 }, { "epoch": 1.1109219858156028, "grad_norm": 0.8457762002944946, "loss": 4.9992, "lr": 0.0005223776223776224, "step": 3916, "tokens_trained": 0.372224056 }, { "epoch": 1.1114893617021278, "grad_norm": 0.8297874927520752, "loss": 5.0416, "lr": 0.0005220979020979021, "step": 3918, "tokens_trained": 0.372413992 }, { "epoch": 1.1120567375886525, "grad_norm": 0.8436822295188904, "loss": 5.0201, "lr": 0.0005218181818181819, "step": 3920, "tokens_trained": 0.372604784 }, { "epoch": 1.1126241134751773, "grad_norm": 0.8133619427680969, "loss": 5.0074, "lr": 0.0005215384615384615, "step": 3922, "tokens_trained": 0.372796808 }, { "epoch": 1.113191489361702, "grad_norm": 0.7879509925842285, "loss": 5.0536, "lr": 0.0005212587412587413, "step": 3924, "tokens_trained": 0.372988416 }, { "epoch": 1.1137588652482269, "grad_norm": 0.8212776780128479, "loss": 5.0259, "lr": 0.0005209790209790209, "step": 3926, "tokens_trained": 0.373178784 }, { "epoch": 1.1143262411347519, "grad_norm": 0.8426427245140076, "loss": 5.0017, "lr": 0.0005206993006993008, "step": 3928, "tokens_trained": 0.373367992 }, { "epoch": 1.1148936170212767, "grad_norm": 0.8375839591026306, "loss": 4.9984, "lr": 0.0005204195804195805, "step": 3930, "tokens_trained": 0.373558848 }, { "epoch": 1.1154609929078014, "grad_norm": 0.907742440700531, "loss": 5.0629, "lr": 0.0005201398601398601, "step": 3932, "tokens_trained": 0.373748656 }, { "epoch": 1.1160283687943262, "grad_norm": 0.8619366884231567, "loss": 4.9702, "lr": 0.0005198601398601399, "step": 3934, "tokens_trained": 0.373937232 }, { "epoch": 1.116595744680851, "grad_norm": 0.8558400273323059, "loss": 4.9833, "lr": 0.0005195804195804196, "step": 3936, "tokens_trained": 0.3741268 }, { "epoch": 1.117163120567376, "grad_norm": 0.7772043347358704, "loss": 5.0636, "lr": 0.0005193006993006994, "step": 3938, "tokens_trained": 0.374315752 }, { "epoch": 1.1177304964539008, "grad_norm": 0.9044018387794495, "loss": 5.0419, "lr": 0.000519020979020979, "step": 3940, "tokens_trained": 0.374504464 }, { "epoch": 1.1182978723404255, "grad_norm": 0.8944953083992004, "loss": 4.961, "lr": 0.0005187412587412588, "step": 3942, "tokens_trained": 0.374695528 }, { "epoch": 1.1188652482269503, "grad_norm": 0.8230746984481812, "loss": 5.0148, "lr": 0.0005184615384615384, "step": 3944, "tokens_trained": 0.374886128 }, { "epoch": 1.119432624113475, "grad_norm": 0.7891346216201782, "loss": 4.9601, "lr": 0.0005181818181818182, "step": 3946, "tokens_trained": 0.375074408 }, { "epoch": 1.12, "grad_norm": 0.8364359140396118, "loss": 5.0317, "lr": 0.000517902097902098, "step": 3948, "tokens_trained": 0.37526636 }, { "epoch": 1.1205673758865249, "grad_norm": 0.7932770252227783, "loss": 4.9399, "lr": 0.0005176223776223776, "step": 3950, "tokens_trained": 0.375455888 }, { "epoch": 1.1211347517730497, "grad_norm": 0.8276688456535339, "loss": 4.9465, "lr": 0.0005173426573426574, "step": 3952, "tokens_trained": 0.37564728 }, { "epoch": 1.1217021276595744, "grad_norm": 0.8242233991622925, "loss": 5.0069, "lr": 0.000517062937062937, "step": 3954, "tokens_trained": 0.375839296 }, { "epoch": 1.1222695035460992, "grad_norm": 0.8828895688056946, "loss": 4.9488, "lr": 0.0005167832167832169, "step": 3956, "tokens_trained": 0.376028744 }, { "epoch": 1.122836879432624, "grad_norm": 0.8730418682098389, "loss": 4.9729, "lr": 0.0005165034965034965, "step": 3958, "tokens_trained": 0.376217656 }, { "epoch": 1.123404255319149, "grad_norm": 0.7701008915901184, "loss": 4.9922, "lr": 0.0005162237762237762, "step": 3960, "tokens_trained": 0.3764066 }, { "epoch": 1.1239716312056738, "grad_norm": 0.8723980784416199, "loss": 4.9452, "lr": 0.0005159440559440559, "step": 3962, "tokens_trained": 0.376594952 }, { "epoch": 1.1245390070921986, "grad_norm": 0.9300636053085327, "loss": 5.0595, "lr": 0.0005156643356643357, "step": 3964, "tokens_trained": 0.376785256 }, { "epoch": 1.1251063829787233, "grad_norm": 0.8684858083724976, "loss": 5.0372, "lr": 0.0005153846153846154, "step": 3966, "tokens_trained": 0.376975 }, { "epoch": 1.1256737588652483, "grad_norm": 0.8335841298103333, "loss": 5.0636, "lr": 0.0005151048951048951, "step": 3968, "tokens_trained": 0.377164552 }, { "epoch": 1.1262411347517731, "grad_norm": 0.8454932570457458, "loss": 4.9603, "lr": 0.0005148251748251748, "step": 3970, "tokens_trained": 0.377353968 }, { "epoch": 1.126808510638298, "grad_norm": 0.8978991508483887, "loss": 5.0161, "lr": 0.0005145454545454545, "step": 3972, "tokens_trained": 0.377543664 }, { "epoch": 1.1273758865248227, "grad_norm": 0.863207995891571, "loss": 4.9949, "lr": 0.0005142657342657343, "step": 3974, "tokens_trained": 0.37773332 }, { "epoch": 1.1279432624113475, "grad_norm": 0.8614553213119507, "loss": 4.9812, "lr": 0.000513986013986014, "step": 3976, "tokens_trained": 0.377921272 }, { "epoch": 1.1285106382978722, "grad_norm": 0.8703583478927612, "loss": 4.9823, "lr": 0.0005137062937062937, "step": 3978, "tokens_trained": 0.378112584 }, { "epoch": 1.1290780141843972, "grad_norm": 0.7951223254203796, "loss": 4.9732, "lr": 0.0005134265734265734, "step": 3980, "tokens_trained": 0.378302088 }, { "epoch": 1.129645390070922, "grad_norm": 0.8486145734786987, "loss": 4.9422, "lr": 0.0005131468531468532, "step": 3982, "tokens_trained": 0.37849388 }, { "epoch": 1.1302127659574468, "grad_norm": 0.8054757714271545, "loss": 4.9911, "lr": 0.0005128671328671328, "step": 3984, "tokens_trained": 0.378682928 }, { "epoch": 1.1307801418439716, "grad_norm": 0.83322674036026, "loss": 5.0289, "lr": 0.0005125874125874126, "step": 3986, "tokens_trained": 0.378874296 }, { "epoch": 1.1313475177304966, "grad_norm": 0.8249304890632629, "loss": 5.0455, "lr": 0.0005123076923076923, "step": 3988, "tokens_trained": 0.379067408 }, { "epoch": 1.1319148936170214, "grad_norm": 0.8258949518203735, "loss": 4.9703, "lr": 0.000512027972027972, "step": 3990, "tokens_trained": 0.379255328 }, { "epoch": 1.1324822695035461, "grad_norm": 0.8535506725311279, "loss": 5.0652, "lr": 0.0005117482517482518, "step": 3992, "tokens_trained": 0.379446152 }, { "epoch": 1.133049645390071, "grad_norm": 0.8468305468559265, "loss": 5.0071, "lr": 0.0005114685314685315, "step": 3994, "tokens_trained": 0.379637664 }, { "epoch": 1.1336170212765957, "grad_norm": 0.8334465026855469, "loss": 5.043, "lr": 0.0005111888111888112, "step": 3996, "tokens_trained": 0.379829408 }, { "epoch": 1.1341843971631205, "grad_norm": 0.8690851926803589, "loss": 4.9637, "lr": 0.0005109090909090908, "step": 3998, "tokens_trained": 0.380021248 }, { "epoch": 1.1347517730496455, "grad_norm": 0.7997336983680725, "loss": 5.0168, "lr": 0.0005106293706293707, "step": 4000, "tokens_trained": 0.380212256 }, { "epoch": 1.1347517730496455, "eval_loss": 5.021827697753906, "eval_runtime": 20.8538, "step": 4000, "tokens_trained": 0.380212256 }, { "epoch": 1.1353191489361703, "grad_norm": 0.8898105621337891, "loss": 4.9954, "lr": 0.0005103496503496503, "step": 4002, "tokens_trained": 0.380403128 }, { "epoch": 1.135886524822695, "grad_norm": 0.8997061848640442, "loss": 5.0087, "lr": 0.0005100699300699301, "step": 4004, "tokens_trained": 0.3805902 }, { "epoch": 1.1364539007092198, "grad_norm": 0.8276216387748718, "loss": 4.9579, "lr": 0.0005097902097902098, "step": 4006, "tokens_trained": 0.380778288 }, { "epoch": 1.1370212765957446, "grad_norm": 0.8275374174118042, "loss": 4.973, "lr": 0.0005095104895104895, "step": 4008, "tokens_trained": 0.38096896 }, { "epoch": 1.1375886524822696, "grad_norm": 0.881206214427948, "loss": 5.0027, "lr": 0.0005092307692307693, "step": 4010, "tokens_trained": 0.381159008 }, { "epoch": 1.1381560283687944, "grad_norm": 0.8062921762466431, "loss": 4.9771, "lr": 0.0005089510489510489, "step": 4012, "tokens_trained": 0.381350336 }, { "epoch": 1.1387234042553191, "grad_norm": 0.8482317924499512, "loss": 4.972, "lr": 0.0005086713286713287, "step": 4014, "tokens_trained": 0.381540512 }, { "epoch": 1.139290780141844, "grad_norm": 0.8180603981018066, "loss": 5.0052, "lr": 0.0005083916083916083, "step": 4016, "tokens_trained": 0.38173168 }, { "epoch": 1.1398581560283687, "grad_norm": 0.7816891670227051, "loss": 4.9689, "lr": 0.0005081118881118882, "step": 4018, "tokens_trained": 0.381922056 }, { "epoch": 1.1404255319148937, "grad_norm": 0.831451952457428, "loss": 4.9931, "lr": 0.0005078321678321678, "step": 4020, "tokens_trained": 0.382111816 }, { "epoch": 1.1409929078014185, "grad_norm": 0.8557744026184082, "loss": 5.0101, "lr": 0.0005075524475524476, "step": 4022, "tokens_trained": 0.38230276 }, { "epoch": 1.1415602836879433, "grad_norm": 0.8070439696311951, "loss": 5.0457, "lr": 0.0005072727272727273, "step": 4024, "tokens_trained": 0.38249052 }, { "epoch": 1.142127659574468, "grad_norm": 0.9021100401878357, "loss": 4.9979, "lr": 0.0005069930069930069, "step": 4026, "tokens_trained": 0.382679696 }, { "epoch": 1.1426950354609928, "grad_norm": 0.8565911650657654, "loss": 4.9828, "lr": 0.0005067132867132868, "step": 4028, "tokens_trained": 0.382869408 }, { "epoch": 1.1432624113475178, "grad_norm": 0.8522788286209106, "loss": 5.0306, "lr": 0.0005064335664335664, "step": 4030, "tokens_trained": 0.383058416 }, { "epoch": 1.1438297872340426, "grad_norm": 0.79361891746521, "loss": 5.0027, "lr": 0.0005061538461538462, "step": 4032, "tokens_trained": 0.383248504 }, { "epoch": 1.1443971631205674, "grad_norm": 0.8457452654838562, "loss": 4.9762, "lr": 0.0005058741258741258, "step": 4034, "tokens_trained": 0.383439016 }, { "epoch": 1.1449645390070922, "grad_norm": 0.9091781377792358, "loss": 5.0534, "lr": 0.0005055944055944057, "step": 4036, "tokens_trained": 0.383630552 }, { "epoch": 1.145531914893617, "grad_norm": 0.8448526263237, "loss": 5.0068, "lr": 0.0005053146853146853, "step": 4038, "tokens_trained": 0.383817712 }, { "epoch": 1.1460992907801417, "grad_norm": 0.7852639555931091, "loss": 4.9615, "lr": 0.000505034965034965, "step": 4040, "tokens_trained": 0.384008192 }, { "epoch": 1.1466666666666667, "grad_norm": 0.7787274122238159, "loss": 5.0035, "lr": 0.0005047552447552448, "step": 4042, "tokens_trained": 0.38419848 }, { "epoch": 1.1472340425531915, "grad_norm": 0.9463234543800354, "loss": 5.0284, "lr": 0.0005044755244755244, "step": 4044, "tokens_trained": 0.384390448 }, { "epoch": 1.1478014184397163, "grad_norm": 0.9096873998641968, "loss": 5.0104, "lr": 0.0005041958041958043, "step": 4046, "tokens_trained": 0.384578688 }, { "epoch": 1.148368794326241, "grad_norm": 0.8237007856369019, "loss": 5.0225, "lr": 0.0005039160839160839, "step": 4048, "tokens_trained": 0.384769368 }, { "epoch": 1.148936170212766, "grad_norm": 0.8391951322555542, "loss": 4.9316, "lr": 0.0005036363636363637, "step": 4050, "tokens_trained": 0.384959448 }, { "epoch": 1.1495035460992908, "grad_norm": 0.8555214405059814, "loss": 5.0299, "lr": 0.0005033566433566433, "step": 4052, "tokens_trained": 0.385148392 }, { "epoch": 1.1500709219858156, "grad_norm": 0.813484251499176, "loss": 5.0792, "lr": 0.0005030769230769231, "step": 4054, "tokens_trained": 0.385338144 }, { "epoch": 1.1506382978723404, "grad_norm": 0.8149204850196838, "loss": 5.0607, "lr": 0.0005027972027972028, "step": 4056, "tokens_trained": 0.385528776 }, { "epoch": 1.1512056737588652, "grad_norm": 0.8909300565719604, "loss": 5.007, "lr": 0.0005025174825174825, "step": 4058, "tokens_trained": 0.385717672 }, { "epoch": 1.15177304964539, "grad_norm": 0.8447635173797607, "loss": 5.024, "lr": 0.0005022377622377623, "step": 4060, "tokens_trained": 0.3859074 }, { "epoch": 1.152340425531915, "grad_norm": 0.8429125547409058, "loss": 4.9871, "lr": 0.0005019580419580419, "step": 4062, "tokens_trained": 0.386096712 }, { "epoch": 1.1529078014184397, "grad_norm": 0.8532034158706665, "loss": 4.9807, "lr": 0.0005016783216783218, "step": 4064, "tokens_trained": 0.386290392 }, { "epoch": 1.1534751773049645, "grad_norm": 0.8414303064346313, "loss": 5.0426, "lr": 0.0005013986013986014, "step": 4066, "tokens_trained": 0.386484048 }, { "epoch": 1.1540425531914893, "grad_norm": 0.8659424185752869, "loss": 4.9572, "lr": 0.0005011188811188811, "step": 4068, "tokens_trained": 0.386670896 }, { "epoch": 1.1546099290780143, "grad_norm": 0.8472128510475159, "loss": 4.9993, "lr": 0.0005008391608391608, "step": 4070, "tokens_trained": 0.38686096 }, { "epoch": 1.155177304964539, "grad_norm": 0.7704010009765625, "loss": 5.0267, "lr": 0.0005005594405594406, "step": 4072, "tokens_trained": 0.387052256 }, { "epoch": 1.1557446808510639, "grad_norm": 0.8503726720809937, "loss": 4.953, "lr": 0.0005002797202797203, "step": 4074, "tokens_trained": 0.387241648 }, { "epoch": 1.1563120567375886, "grad_norm": 0.8159539699554443, "loss": 5.0096, "lr": 0.0005, "step": 4076, "tokens_trained": 0.387432368 }, { "epoch": 1.1568794326241134, "grad_norm": 0.7673088312149048, "loss": 4.9996, "lr": 0.0004997202797202798, "step": 4078, "tokens_trained": 0.387620656 }, { "epoch": 1.1574468085106382, "grad_norm": 0.8308261036872864, "loss": 5.0114, "lr": 0.0004994405594405594, "step": 4080, "tokens_trained": 0.387809712 }, { "epoch": 1.1580141843971632, "grad_norm": 0.8294357657432556, "loss": 5.0508, "lr": 0.0004991608391608391, "step": 4082, "tokens_trained": 0.387999152 }, { "epoch": 1.158581560283688, "grad_norm": 0.8797832727432251, "loss": 4.9784, "lr": 0.0004988811188811189, "step": 4084, "tokens_trained": 0.3881876 }, { "epoch": 1.1591489361702128, "grad_norm": 0.8250353932380676, "loss": 4.959, "lr": 0.0004986013986013986, "step": 4086, "tokens_trained": 0.38837592 }, { "epoch": 1.1597163120567375, "grad_norm": 0.8896451592445374, "loss": 5.0103, "lr": 0.0004983216783216784, "step": 4088, "tokens_trained": 0.388565768 }, { "epoch": 1.1602836879432625, "grad_norm": 0.7970037460327148, "loss": 5.0534, "lr": 0.0004980419580419581, "step": 4090, "tokens_trained": 0.388755536 }, { "epoch": 1.1608510638297873, "grad_norm": 0.8623605966567993, "loss": 4.986, "lr": 0.0004977622377622378, "step": 4092, "tokens_trained": 0.388947 }, { "epoch": 1.161418439716312, "grad_norm": 0.8195328712463379, "loss": 5.0193, "lr": 0.0004974825174825175, "step": 4094, "tokens_trained": 0.38913532 }, { "epoch": 1.1619858156028369, "grad_norm": 0.8058289885520935, "loss": 5.0001, "lr": 0.0004972027972027972, "step": 4096, "tokens_trained": 0.389325904 }, { "epoch": 1.1625531914893616, "grad_norm": 0.8325840830802917, "loss": 5.0711, "lr": 0.0004969230769230769, "step": 4098, "tokens_trained": 0.3895166 }, { "epoch": 1.1631205673758864, "grad_norm": 0.8684342503547668, "loss": 4.9548, "lr": 0.0004966433566433566, "step": 4100, "tokens_trained": 0.389704048 }, { "epoch": 1.1636879432624114, "grad_norm": 0.891304612159729, "loss": 4.9711, "lr": 0.0004963636363636364, "step": 4102, "tokens_trained": 0.389893816 }, { "epoch": 1.1642553191489362, "grad_norm": 0.8750278353691101, "loss": 5.0493, "lr": 0.0004960839160839161, "step": 4104, "tokens_trained": 0.390082752 }, { "epoch": 1.164822695035461, "grad_norm": 0.8391188383102417, "loss": 4.9804, "lr": 0.0004958041958041959, "step": 4106, "tokens_trained": 0.390272096 }, { "epoch": 1.1653900709219858, "grad_norm": 0.8190635442733765, "loss": 5.0121, "lr": 0.0004955244755244756, "step": 4108, "tokens_trained": 0.390462024 }, { "epoch": 1.1659574468085105, "grad_norm": 0.7800264954566956, "loss": 4.9819, "lr": 0.0004952447552447552, "step": 4110, "tokens_trained": 0.390651968 }, { "epoch": 1.1665248226950355, "grad_norm": 0.8210972547531128, "loss": 4.9929, "lr": 0.000494965034965035, "step": 4112, "tokens_trained": 0.390842776 }, { "epoch": 1.1670921985815603, "grad_norm": 0.9442235827445984, "loss": 5.0133, "lr": 0.0004946853146853147, "step": 4114, "tokens_trained": 0.391031856 }, { "epoch": 1.167659574468085, "grad_norm": 0.8627631068229675, "loss": 4.9587, "lr": 0.0004944055944055944, "step": 4116, "tokens_trained": 0.391223288 }, { "epoch": 1.1682269503546099, "grad_norm": 0.7751641869544983, "loss": 4.9934, "lr": 0.0004941258741258741, "step": 4118, "tokens_trained": 0.391412784 }, { "epoch": 1.1687943262411347, "grad_norm": 0.8243580460548401, "loss": 5.0126, "lr": 0.0004938461538461538, "step": 4120, "tokens_trained": 0.391603056 }, { "epoch": 1.1693617021276597, "grad_norm": 0.8990906476974487, "loss": 5.0234, "lr": 0.0004935664335664336, "step": 4122, "tokens_trained": 0.391793368 }, { "epoch": 1.1699290780141844, "grad_norm": 0.8721649050712585, "loss": 4.997, "lr": 0.0004932867132867133, "step": 4124, "tokens_trained": 0.39198508 }, { "epoch": 1.1702127659574468, "eval_loss": 5.014278411865234, "eval_runtime": 21.0162, "step": 4125, "tokens_trained": 0.392082752 }, { "epoch": 1.1704964539007092, "grad_norm": 0.7662192583084106, "loss": 4.9791, "lr": 0.0004930069930069931, "step": 4126, "tokens_trained": 0.392179088 }, { "epoch": 1.171063829787234, "grad_norm": 0.9081931710243225, "loss": 4.9882, "lr": 0.0004927272727272727, "step": 4128, "tokens_trained": 0.392369312 }, { "epoch": 1.1716312056737588, "grad_norm": 0.8503204584121704, "loss": 5.0403, "lr": 0.0004924475524475525, "step": 4130, "tokens_trained": 0.392557944 }, { "epoch": 1.1721985815602838, "grad_norm": 0.8676162362098694, "loss": 5.0716, "lr": 0.0004921678321678322, "step": 4132, "tokens_trained": 0.39274924 }, { "epoch": 1.1727659574468086, "grad_norm": 0.8527748584747314, "loss": 5.0416, "lr": 0.0004918881118881118, "step": 4134, "tokens_trained": 0.392939672 }, { "epoch": 1.1733333333333333, "grad_norm": 0.8113415241241455, "loss": 5.0525, "lr": 0.0004916083916083916, "step": 4136, "tokens_trained": 0.393131152 }, { "epoch": 1.1739007092198581, "grad_norm": 0.8555265665054321, "loss": 5.0734, "lr": 0.0004913286713286713, "step": 4138, "tokens_trained": 0.39332136 }, { "epoch": 1.174468085106383, "grad_norm": 0.9134076237678528, "loss": 4.9742, "lr": 0.0004910489510489511, "step": 4140, "tokens_trained": 0.393509376 }, { "epoch": 1.1750354609929077, "grad_norm": 0.8159533739089966, "loss": 5.0728, "lr": 0.0004907692307692308, "step": 4142, "tokens_trained": 0.393699616 }, { "epoch": 1.1756028368794327, "grad_norm": 0.8070579767227173, "loss": 5.0032, "lr": 0.0004904895104895106, "step": 4144, "tokens_trained": 0.393888176 }, { "epoch": 1.1761702127659575, "grad_norm": 0.8635644316673279, "loss": 5.0564, "lr": 0.0004902097902097902, "step": 4146, "tokens_trained": 0.39407804 }, { "epoch": 1.1767375886524822, "grad_norm": 0.8500214219093323, "loss": 4.9698, "lr": 0.00048993006993007, "step": 4148, "tokens_trained": 0.394268456 }, { "epoch": 1.177304964539007, "grad_norm": 0.8485430479049683, "loss": 4.9751, "lr": 0.0004896503496503497, "step": 4150, "tokens_trained": 0.394459912 }, { "epoch": 1.177872340425532, "grad_norm": 0.8265682458877563, "loss": 4.9703, "lr": 0.0004893706293706293, "step": 4152, "tokens_trained": 0.394650984 }, { "epoch": 1.1784397163120568, "grad_norm": 0.7867625951766968, "loss": 4.8901, "lr": 0.0004890909090909091, "step": 4154, "tokens_trained": 0.394843184 }, { "epoch": 1.1790070921985816, "grad_norm": 0.8666532635688782, "loss": 4.9144, "lr": 0.0004888111888111888, "step": 4156, "tokens_trained": 0.39503568 }, { "epoch": 1.1795744680851064, "grad_norm": 0.862920880317688, "loss": 4.9529, "lr": 0.0004885314685314686, "step": 4158, "tokens_trained": 0.395225424 }, { "epoch": 1.1801418439716311, "grad_norm": 0.810485303401947, "loss": 5.0165, "lr": 0.0004882517482517483, "step": 4160, "tokens_trained": 0.395415632 }, { "epoch": 1.180709219858156, "grad_norm": 0.7997188568115234, "loss": 5.0197, "lr": 0.000487972027972028, "step": 4162, "tokens_trained": 0.39560452 }, { "epoch": 1.181276595744681, "grad_norm": 0.8133664727210999, "loss": 5.0056, "lr": 0.0004876923076923077, "step": 4164, "tokens_trained": 0.395794008 }, { "epoch": 1.1818439716312057, "grad_norm": 0.8120067119598389, "loss": 4.913, "lr": 0.00048741258741258743, "step": 4166, "tokens_trained": 0.395983296 }, { "epoch": 1.1824113475177305, "grad_norm": 0.8434014320373535, "loss": 4.9777, "lr": 0.0004871328671328671, "step": 4168, "tokens_trained": 0.396175216 }, { "epoch": 1.1829787234042553, "grad_norm": 0.8452426195144653, "loss": 4.9693, "lr": 0.00048685314685314687, "step": 4170, "tokens_trained": 0.3963634 }, { "epoch": 1.1835460992907803, "grad_norm": 0.8733723759651184, "loss": 4.9757, "lr": 0.00048657342657342656, "step": 4172, "tokens_trained": 0.39655404 }, { "epoch": 1.184113475177305, "grad_norm": 0.8372209072113037, "loss": 4.9725, "lr": 0.0004862937062937063, "step": 4174, "tokens_trained": 0.396744688 }, { "epoch": 1.1846808510638298, "grad_norm": 0.7722007632255554, "loss": 5.0234, "lr": 0.000486013986013986, "step": 4176, "tokens_trained": 0.396935848 }, { "epoch": 1.1852482269503546, "grad_norm": 0.8685297966003418, "loss": 4.9777, "lr": 0.0004857342657342658, "step": 4178, "tokens_trained": 0.39712576 }, { "epoch": 1.1858156028368794, "grad_norm": 0.8083483576774597, "loss": 4.973, "lr": 0.0004854545454545455, "step": 4180, "tokens_trained": 0.397315672 }, { "epoch": 1.1863829787234041, "grad_norm": 0.8481479287147522, "loss": 5.0308, "lr": 0.00048517482517482517, "step": 4182, "tokens_trained": 0.39750464 }, { "epoch": 1.1869503546099291, "grad_norm": 0.7996193170547485, "loss": 4.9251, "lr": 0.0004848951048951049, "step": 4184, "tokens_trained": 0.397693584 }, { "epoch": 1.187517730496454, "grad_norm": 0.811189591884613, "loss": 5.0092, "lr": 0.0004846153846153846, "step": 4186, "tokens_trained": 0.397883352 }, { "epoch": 1.1880851063829787, "grad_norm": 0.9195986390113831, "loss": 4.961, "lr": 0.00048433566433566435, "step": 4188, "tokens_trained": 0.398073712 }, { "epoch": 1.1886524822695035, "grad_norm": 0.8444050550460815, "loss": 4.9707, "lr": 0.00048405594405594404, "step": 4190, "tokens_trained": 0.398265744 }, { "epoch": 1.1892198581560285, "grad_norm": 0.859663724899292, "loss": 5.0202, "lr": 0.0004837762237762238, "step": 4192, "tokens_trained": 0.39845568 }, { "epoch": 1.1897872340425533, "grad_norm": 0.8403055667877197, "loss": 4.9831, "lr": 0.0004834965034965035, "step": 4194, "tokens_trained": 0.398647696 }, { "epoch": 1.190354609929078, "grad_norm": 0.8377063870429993, "loss": 5.0545, "lr": 0.0004832167832167833, "step": 4196, "tokens_trained": 0.398838432 }, { "epoch": 1.1909219858156028, "grad_norm": 0.8102120161056519, "loss": 5.0068, "lr": 0.00048293706293706297, "step": 4198, "tokens_trained": 0.399027968 }, { "epoch": 1.1914893617021276, "grad_norm": 0.8520330190658569, "loss": 5.0102, "lr": 0.00048265734265734266, "step": 4200, "tokens_trained": 0.3992202 }, { "epoch": 1.1920567375886524, "grad_norm": 0.8204303979873657, "loss": 5.0303, "lr": 0.0004823776223776224, "step": 4202, "tokens_trained": 0.399411656 }, { "epoch": 1.1926241134751774, "grad_norm": 0.8569766879081726, "loss": 5.0097, "lr": 0.0004820979020979021, "step": 4204, "tokens_trained": 0.399602136 }, { "epoch": 1.1931914893617022, "grad_norm": 0.8269557952880859, "loss": 4.9694, "lr": 0.00048181818181818184, "step": 4206, "tokens_trained": 0.399793544 }, { "epoch": 1.193758865248227, "grad_norm": 0.9124187231063843, "loss": 4.9506, "lr": 0.0004815384615384615, "step": 4208, "tokens_trained": 0.399982856 }, { "epoch": 1.1943262411347517, "grad_norm": 0.8813201189041138, "loss": 4.9989, "lr": 0.00048125874125874127, "step": 4210, "tokens_trained": 0.400173184 }, { "epoch": 1.1948936170212765, "grad_norm": 0.8605351448059082, "loss": 5.0437, "lr": 0.00048097902097902096, "step": 4212, "tokens_trained": 0.400363824 }, { "epoch": 1.1954609929078015, "grad_norm": 0.8277431726455688, "loss": 5.0283, "lr": 0.00048069930069930076, "step": 4214, "tokens_trained": 0.400554648 }, { "epoch": 1.1960283687943263, "grad_norm": 0.828187108039856, "loss": 5.0573, "lr": 0.00048041958041958045, "step": 4216, "tokens_trained": 0.400746632 }, { "epoch": 1.196595744680851, "grad_norm": 0.8459845781326294, "loss": 5.0734, "lr": 0.00048013986013986014, "step": 4218, "tokens_trained": 0.400937568 }, { "epoch": 1.1971631205673758, "grad_norm": 0.7948288321495056, "loss": 5.011, "lr": 0.0004798601398601399, "step": 4220, "tokens_trained": 0.401127024 }, { "epoch": 1.1977304964539006, "grad_norm": 0.8868036866188049, "loss": 5.0318, "lr": 0.0004795804195804196, "step": 4222, "tokens_trained": 0.401318248 }, { "epoch": 1.1982978723404256, "grad_norm": 0.7660478353500366, "loss": 5.0656, "lr": 0.0004793006993006993, "step": 4224, "tokens_trained": 0.401506136 }, { "epoch": 1.1988652482269504, "grad_norm": 0.779299259185791, "loss": 4.9907, "lr": 0.000479020979020979, "step": 4226, "tokens_trained": 0.401696856 }, { "epoch": 1.1994326241134752, "grad_norm": 0.7903150916099548, "loss": 4.9744, "lr": 0.00047874125874125875, "step": 4228, "tokens_trained": 0.401885744 }, { "epoch": 1.2, "grad_norm": 0.7829038500785828, "loss": 4.9847, "lr": 0.00047846153846153844, "step": 4230, "tokens_trained": 0.402075072 }, { "epoch": 1.2005673758865247, "grad_norm": 0.9025991559028625, "loss": 4.9758, "lr": 0.00047818181818181824, "step": 4232, "tokens_trained": 0.4022674 }, { "epoch": 1.2011347517730497, "grad_norm": 0.8891049027442932, "loss": 4.9791, "lr": 0.00047790209790209793, "step": 4234, "tokens_trained": 0.402459792 }, { "epoch": 1.2017021276595745, "grad_norm": 0.7566952109336853, "loss": 5.0183, "lr": 0.0004776223776223776, "step": 4236, "tokens_trained": 0.402649768 }, { "epoch": 1.2022695035460993, "grad_norm": 0.80048668384552, "loss": 4.9493, "lr": 0.00047734265734265737, "step": 4238, "tokens_trained": 0.4028382 }, { "epoch": 1.202836879432624, "grad_norm": 0.7540125250816345, "loss": 4.9685, "lr": 0.00047706293706293706, "step": 4240, "tokens_trained": 0.403028848 }, { "epoch": 1.2034042553191489, "grad_norm": 0.7707799673080444, "loss": 4.984, "lr": 0.0004767832167832168, "step": 4242, "tokens_trained": 0.40321844 }, { "epoch": 1.2039716312056739, "grad_norm": 0.7681775093078613, "loss": 4.9807, "lr": 0.0004765034965034965, "step": 4244, "tokens_trained": 0.40340716 }, { "epoch": 1.2045390070921986, "grad_norm": 0.7557908892631531, "loss": 4.9912, "lr": 0.00047622377622377624, "step": 4246, "tokens_trained": 0.403600152 }, { "epoch": 1.2051063829787234, "grad_norm": 0.822948694229126, "loss": 5.0144, "lr": 0.00047594405594405593, "step": 4248, "tokens_trained": 0.403788 }, { "epoch": 1.2056737588652482, "grad_norm": 0.7625008225440979, "loss": 4.8949, "lr": 0.00047566433566433573, "step": 4250, "tokens_trained": 0.40397872 }, { "epoch": 1.2056737588652482, "eval_loss": 5.00390625, "eval_runtime": 20.2421, "step": 4250, "tokens_trained": 0.40397872 }, { "epoch": 1.206241134751773, "grad_norm": 0.7532864212989807, "loss": 5.0128, "lr": 0.0004753846153846154, "step": 4252, "tokens_trained": 0.404169384 }, { "epoch": 1.206808510638298, "grad_norm": 0.69386887550354, "loss": 4.9849, "lr": 0.0004751048951048951, "step": 4254, "tokens_trained": 0.40435968 }, { "epoch": 1.2073758865248228, "grad_norm": 0.7845306992530823, "loss": 5.0254, "lr": 0.00047482517482517485, "step": 4256, "tokens_trained": 0.404549424 }, { "epoch": 1.2079432624113475, "grad_norm": 0.8036428093910217, "loss": 4.9676, "lr": 0.00047454545454545454, "step": 4258, "tokens_trained": 0.404739344 }, { "epoch": 1.2085106382978723, "grad_norm": 0.8440237045288086, "loss": 4.9965, "lr": 0.0004742657342657343, "step": 4260, "tokens_trained": 0.40492952 }, { "epoch": 1.209078014184397, "grad_norm": 0.7936769127845764, "loss": 5.0458, "lr": 0.000473986013986014, "step": 4262, "tokens_trained": 0.405117144 }, { "epoch": 1.2096453900709219, "grad_norm": 0.8117086291313171, "loss": 5.0196, "lr": 0.0004737062937062937, "step": 4264, "tokens_trained": 0.405310184 }, { "epoch": 1.2102127659574469, "grad_norm": 0.7395413517951965, "loss": 4.9655, "lr": 0.0004734265734265734, "step": 4266, "tokens_trained": 0.405498272 }, { "epoch": 1.2107801418439716, "grad_norm": 0.8879559636116028, "loss": 4.9637, "lr": 0.0004731468531468531, "step": 4268, "tokens_trained": 0.4056908 }, { "epoch": 1.2113475177304964, "grad_norm": 0.8651279211044312, "loss": 4.945, "lr": 0.0004728671328671329, "step": 4270, "tokens_trained": 0.405879384 }, { "epoch": 1.2119148936170212, "grad_norm": 0.8421851992607117, "loss": 4.9391, "lr": 0.0004725874125874126, "step": 4272, "tokens_trained": 0.406071432 }, { "epoch": 1.2124822695035462, "grad_norm": 0.815262496471405, "loss": 5.0465, "lr": 0.00047230769230769234, "step": 4274, "tokens_trained": 0.406262776 }, { "epoch": 1.213049645390071, "grad_norm": 0.8042894005775452, "loss": 4.8908, "lr": 0.00047202797202797203, "step": 4276, "tokens_trained": 0.406452656 }, { "epoch": 1.2136170212765958, "grad_norm": 0.8514822721481323, "loss": 4.9961, "lr": 0.00047174825174825177, "step": 4278, "tokens_trained": 0.406642224 }, { "epoch": 1.2141843971631205, "grad_norm": 0.7532519102096558, "loss": 4.9658, "lr": 0.00047146853146853146, "step": 4280, "tokens_trained": 0.406830288 }, { "epoch": 1.2147517730496453, "grad_norm": 0.7978721261024475, "loss": 4.9477, "lr": 0.0004711888111888112, "step": 4282, "tokens_trained": 0.4070214 }, { "epoch": 1.21531914893617, "grad_norm": 0.8998175859451294, "loss": 5.0531, "lr": 0.0004709090909090909, "step": 4284, "tokens_trained": 0.407211064 }, { "epoch": 1.215886524822695, "grad_norm": 0.7281949520111084, "loss": 4.9474, "lr": 0.0004706293706293706, "step": 4286, "tokens_trained": 0.40740104 }, { "epoch": 1.2164539007092199, "grad_norm": 0.7590287923812866, "loss": 5.0104, "lr": 0.0004703496503496504, "step": 4288, "tokens_trained": 0.40759144 }, { "epoch": 1.2170212765957447, "grad_norm": 0.8452118039131165, "loss": 5.024, "lr": 0.0004700699300699301, "step": 4290, "tokens_trained": 0.407780576 }, { "epoch": 1.2175886524822694, "grad_norm": 0.8062863945960999, "loss": 5.0099, "lr": 0.0004697902097902098, "step": 4292, "tokens_trained": 0.407971808 }, { "epoch": 1.2181560283687944, "grad_norm": 0.8372058272361755, "loss": 5.0832, "lr": 0.0004695104895104895, "step": 4294, "tokens_trained": 0.408162104 }, { "epoch": 1.2187234042553192, "grad_norm": 0.7989845871925354, "loss": 4.971, "lr": 0.00046923076923076926, "step": 4296, "tokens_trained": 0.408351392 }, { "epoch": 1.219290780141844, "grad_norm": 0.7519237399101257, "loss": 4.9739, "lr": 0.00046895104895104895, "step": 4298, "tokens_trained": 0.408541056 }, { "epoch": 1.2198581560283688, "grad_norm": 0.769143283367157, "loss": 4.9483, "lr": 0.0004686713286713287, "step": 4300, "tokens_trained": 0.408731728 }, { "epoch": 1.2204255319148936, "grad_norm": 0.7855169177055359, "loss": 5.0007, "lr": 0.0004683916083916084, "step": 4302, "tokens_trained": 0.408921824 }, { "epoch": 1.2209929078014183, "grad_norm": 0.8531661629676819, "loss": 5.018, "lr": 0.00046811188811188807, "step": 4304, "tokens_trained": 0.409112528 }, { "epoch": 1.2215602836879433, "grad_norm": 0.8178502321243286, "loss": 4.9869, "lr": 0.00046783216783216787, "step": 4306, "tokens_trained": 0.40930284 }, { "epoch": 1.2221276595744681, "grad_norm": 0.7806143164634705, "loss": 4.9561, "lr": 0.00046755244755244756, "step": 4308, "tokens_trained": 0.409492304 }, { "epoch": 1.222695035460993, "grad_norm": 0.7506605982780457, "loss": 4.937, "lr": 0.0004672727272727273, "step": 4310, "tokens_trained": 0.409680208 }, { "epoch": 1.2232624113475177, "grad_norm": 0.8441674113273621, "loss": 4.9137, "lr": 0.000466993006993007, "step": 4312, "tokens_trained": 0.409869952 }, { "epoch": 1.2238297872340427, "grad_norm": 0.8911812901496887, "loss": 5.0072, "lr": 0.00046671328671328674, "step": 4314, "tokens_trained": 0.410058728 }, { "epoch": 1.2243971631205675, "grad_norm": 0.7732901573181152, "loss": 4.9094, "lr": 0.00046643356643356643, "step": 4316, "tokens_trained": 0.410249624 }, { "epoch": 1.2249645390070922, "grad_norm": 0.7372212409973145, "loss": 4.9646, "lr": 0.0004661538461538462, "step": 4318, "tokens_trained": 0.410440088 }, { "epoch": 1.225531914893617, "grad_norm": 0.8266177177429199, "loss": 5.01, "lr": 0.00046587412587412587, "step": 4320, "tokens_trained": 0.410630384 }, { "epoch": 1.2260992907801418, "grad_norm": 0.7471604347229004, "loss": 4.9741, "lr": 0.00046559440559440556, "step": 4322, "tokens_trained": 0.410819312 }, { "epoch": 1.2266666666666666, "grad_norm": 0.8529990911483765, "loss": 5.0115, "lr": 0.00046531468531468536, "step": 4324, "tokens_trained": 0.411007776 }, { "epoch": 1.2272340425531916, "grad_norm": 0.8250638246536255, "loss": 4.9974, "lr": 0.00046503496503496505, "step": 4326, "tokens_trained": 0.4111994 }, { "epoch": 1.2278014184397164, "grad_norm": 0.7049713730812073, "loss": 4.9412, "lr": 0.0004647552447552448, "step": 4328, "tokens_trained": 0.411387512 }, { "epoch": 1.2283687943262411, "grad_norm": 0.8164275884628296, "loss": 4.9696, "lr": 0.0004644755244755245, "step": 4330, "tokens_trained": 0.411579192 }, { "epoch": 1.228936170212766, "grad_norm": 0.786007821559906, "loss": 4.9015, "lr": 0.0004641958041958042, "step": 4332, "tokens_trained": 0.411769256 }, { "epoch": 1.2295035460992907, "grad_norm": 0.7956440448760986, "loss": 4.9864, "lr": 0.0004639160839160839, "step": 4334, "tokens_trained": 0.411958112 }, { "epoch": 1.2300709219858157, "grad_norm": 0.7968415021896362, "loss": 5.0563, "lr": 0.00046363636363636366, "step": 4336, "tokens_trained": 0.412148936 }, { "epoch": 1.2306382978723405, "grad_norm": 0.9666130542755127, "loss": 4.9907, "lr": 0.00046335664335664335, "step": 4338, "tokens_trained": 0.412337728 }, { "epoch": 1.2312056737588652, "grad_norm": 0.9147318005561829, "loss": 5.0003, "lr": 0.00046307692307692304, "step": 4340, "tokens_trained": 0.412527736 }, { "epoch": 1.23177304964539, "grad_norm": 0.7779629230499268, "loss": 4.9392, "lr": 0.00046279720279720284, "step": 4342, "tokens_trained": 0.412717944 }, { "epoch": 1.2323404255319148, "grad_norm": 0.8160842061042786, "loss": 4.9644, "lr": 0.00046251748251748253, "step": 4344, "tokens_trained": 0.412909288 }, { "epoch": 1.2329078014184398, "grad_norm": 0.8430790305137634, "loss": 4.9472, "lr": 0.0004622377622377623, "step": 4346, "tokens_trained": 0.413097912 }, { "epoch": 1.2334751773049646, "grad_norm": 0.8291404843330383, "loss": 4.9647, "lr": 0.00046195804195804196, "step": 4348, "tokens_trained": 0.413290272 }, { "epoch": 1.2340425531914894, "grad_norm": 0.8272704482078552, "loss": 4.9685, "lr": 0.0004616783216783217, "step": 4350, "tokens_trained": 0.41348152 }, { "epoch": 1.2346099290780141, "grad_norm": 0.7785531282424927, "loss": 5.0172, "lr": 0.0004613986013986014, "step": 4352, "tokens_trained": 0.413670184 }, { "epoch": 1.235177304964539, "grad_norm": 0.8512988090515137, "loss": 4.9727, "lr": 0.00046111888111888114, "step": 4354, "tokens_trained": 0.413860232 }, { "epoch": 1.235744680851064, "grad_norm": 0.7373901009559631, "loss": 4.9092, "lr": 0.00046083916083916083, "step": 4356, "tokens_trained": 0.414051312 }, { "epoch": 1.2363120567375887, "grad_norm": 0.7716902494430542, "loss": 4.9456, "lr": 0.0004605594405594405, "step": 4358, "tokens_trained": 0.414239448 }, { "epoch": 1.2368794326241135, "grad_norm": 0.8303737044334412, "loss": 4.9656, "lr": 0.0004602797202797203, "step": 4360, "tokens_trained": 0.414430488 }, { "epoch": 1.2374468085106383, "grad_norm": 0.850261926651001, "loss": 4.9407, "lr": 0.00046, "step": 4362, "tokens_trained": 0.414620536 }, { "epoch": 1.238014184397163, "grad_norm": 0.8391888737678528, "loss": 4.9772, "lr": 0.00045972027972027976, "step": 4364, "tokens_trained": 0.4148106 }, { "epoch": 1.2385815602836878, "grad_norm": 0.8289617300033569, "loss": 5.0061, "lr": 0.00045944055944055945, "step": 4366, "tokens_trained": 0.414998608 }, { "epoch": 1.2391489361702128, "grad_norm": 0.801800549030304, "loss": 5.0436, "lr": 0.0004591608391608392, "step": 4368, "tokens_trained": 0.415190568 }, { "epoch": 1.2397163120567376, "grad_norm": 0.8448522686958313, "loss": 4.9398, "lr": 0.0004588811188811189, "step": 4370, "tokens_trained": 0.415378536 }, { "epoch": 1.2402836879432624, "grad_norm": 0.8992466330528259, "loss": 4.9277, "lr": 0.0004586013986013986, "step": 4372, "tokens_trained": 0.4155704 }, { "epoch": 1.2408510638297872, "grad_norm": 0.8534346222877502, "loss": 4.8933, "lr": 0.0004583216783216783, "step": 4374, "tokens_trained": 0.41575984 }, { "epoch": 1.2411347517730495, "eval_loss": 4.997620582580566, "eval_runtime": 20.4786, "step": 4375, "tokens_trained": 0.415855704 }, { "epoch": 1.2414184397163122, "grad_norm": 0.8547607064247131, "loss": 5.0331, "lr": 0.000458041958041958, "step": 4376, "tokens_trained": 0.415951704 }, { "epoch": 1.241985815602837, "grad_norm": 0.7995121479034424, "loss": 4.9727, "lr": 0.0004577622377622378, "step": 4378, "tokens_trained": 0.416142464 }, { "epoch": 1.2425531914893617, "grad_norm": 0.7953593730926514, "loss": 5.054, "lr": 0.0004574825174825175, "step": 4380, "tokens_trained": 0.416331184 }, { "epoch": 1.2431205673758865, "grad_norm": 0.8307169079780579, "loss": 4.9694, "lr": 0.00045720279720279724, "step": 4382, "tokens_trained": 0.416522688 }, { "epoch": 1.2436879432624113, "grad_norm": 0.8380933403968811, "loss": 4.9432, "lr": 0.00045692307692307693, "step": 4384, "tokens_trained": 0.416712408 }, { "epoch": 1.244255319148936, "grad_norm": 0.8354132771492004, "loss": 4.9649, "lr": 0.0004566433566433567, "step": 4386, "tokens_trained": 0.416902056 }, { "epoch": 1.244822695035461, "grad_norm": 0.8815358877182007, "loss": 4.9998, "lr": 0.00045636363636363637, "step": 4388, "tokens_trained": 0.417090856 }, { "epoch": 1.2453900709219858, "grad_norm": 0.8799077868461609, "loss": 4.984, "lr": 0.00045608391608391606, "step": 4390, "tokens_trained": 0.417281408 }, { "epoch": 1.2459574468085106, "grad_norm": 0.9041373133659363, "loss": 4.9209, "lr": 0.0004558041958041958, "step": 4392, "tokens_trained": 0.41747192 }, { "epoch": 1.2465248226950354, "grad_norm": 0.8234816193580627, "loss": 5.022, "lr": 0.0004555244755244755, "step": 4394, "tokens_trained": 0.41766064 }, { "epoch": 1.2470921985815604, "grad_norm": 0.8067740797996521, "loss": 5.0255, "lr": 0.00045524475524475524, "step": 4396, "tokens_trained": 0.417852816 }, { "epoch": 1.2476595744680852, "grad_norm": 0.812566876411438, "loss": 4.9524, "lr": 0.000454965034965035, "step": 4398, "tokens_trained": 0.418043848 }, { "epoch": 1.24822695035461, "grad_norm": 0.7977521419525146, "loss": 5.023, "lr": 0.0004546853146853147, "step": 4400, "tokens_trained": 0.418234224 }, { "epoch": 1.2487943262411347, "grad_norm": 0.7514439225196838, "loss": 4.9909, "lr": 0.0004544055944055944, "step": 4402, "tokens_trained": 0.418424576 }, { "epoch": 1.2493617021276595, "grad_norm": 0.7931577563285828, "loss": 5.0128, "lr": 0.00045412587412587416, "step": 4404, "tokens_trained": 0.418616776 }, { "epoch": 1.2499290780141843, "grad_norm": 0.787543773651123, "loss": 4.9616, "lr": 0.00045384615384615385, "step": 4406, "tokens_trained": 0.418805744 }, { "epoch": 1.2504964539007093, "grad_norm": 0.7384114861488342, "loss": 5.0641, "lr": 0.00045356643356643354, "step": 4408, "tokens_trained": 0.418997784 }, { "epoch": 1.251063829787234, "grad_norm": 0.8014666438102722, "loss": 4.9652, "lr": 0.0004532867132867133, "step": 4410, "tokens_trained": 0.419187464 }, { "epoch": 1.2516312056737589, "grad_norm": 0.7648611068725586, "loss": 4.9813, "lr": 0.000453006993006993, "step": 4412, "tokens_trained": 0.419376864 }, { "epoch": 1.2521985815602836, "grad_norm": 0.7647461891174316, "loss": 5.0052, "lr": 0.0004527272727272727, "step": 4414, "tokens_trained": 0.419568352 }, { "epoch": 1.2527659574468086, "grad_norm": 0.7152479887008667, "loss": 4.9851, "lr": 0.00045244755244755247, "step": 4416, "tokens_trained": 0.419759464 }, { "epoch": 1.2533333333333334, "grad_norm": 0.7977505326271057, "loss": 5.0082, "lr": 0.0004521678321678322, "step": 4418, "tokens_trained": 0.419951 }, { "epoch": 1.2539007092198582, "grad_norm": 0.7556982040405273, "loss": 5.0207, "lr": 0.0004518881118881119, "step": 4420, "tokens_trained": 0.420141312 }, { "epoch": 1.254468085106383, "grad_norm": 0.8059271574020386, "loss": 5.0286, "lr": 0.00045160839160839165, "step": 4422, "tokens_trained": 0.420330672 }, { "epoch": 1.2550354609929077, "grad_norm": 0.836380660533905, "loss": 4.9406, "lr": 0.00045132867132867134, "step": 4424, "tokens_trained": 0.420519952 }, { "epoch": 1.2556028368794325, "grad_norm": 0.7693254947662354, "loss": 4.9533, "lr": 0.000451048951048951, "step": 4426, "tokens_trained": 0.42070948 }, { "epoch": 1.2561702127659575, "grad_norm": 0.8241584897041321, "loss": 5.0407, "lr": 0.00045076923076923077, "step": 4428, "tokens_trained": 0.420899504 }, { "epoch": 1.2567375886524823, "grad_norm": 0.7866604328155518, "loss": 4.9119, "lr": 0.00045048951048951046, "step": 4430, "tokens_trained": 0.421088352 }, { "epoch": 1.257304964539007, "grad_norm": 0.8286674618721008, "loss": 5.016, "lr": 0.0004502097902097902, "step": 4432, "tokens_trained": 0.421277528 }, { "epoch": 1.2578723404255319, "grad_norm": 0.7921491265296936, "loss": 4.9991, "lr": 0.00044993006993006995, "step": 4434, "tokens_trained": 0.421468272 }, { "epoch": 1.2584397163120569, "grad_norm": 0.807640016078949, "loss": 5.042, "lr": 0.0004496503496503497, "step": 4436, "tokens_trained": 0.421658096 }, { "epoch": 1.2590070921985816, "grad_norm": 0.7414442896842957, "loss": 4.9647, "lr": 0.0004493706293706294, "step": 4438, "tokens_trained": 0.421849712 }, { "epoch": 1.2595744680851064, "grad_norm": 0.8236945867538452, "loss": 4.9562, "lr": 0.00044909090909090913, "step": 4440, "tokens_trained": 0.422038344 }, { "epoch": 1.2601418439716312, "grad_norm": 0.7859675884246826, "loss": 4.9568, "lr": 0.0004488111888111888, "step": 4442, "tokens_trained": 0.422227928 }, { "epoch": 1.260709219858156, "grad_norm": 0.7467136383056641, "loss": 4.9543, "lr": 0.0004485314685314685, "step": 4444, "tokens_trained": 0.422415664 }, { "epoch": 1.2612765957446808, "grad_norm": 0.711588978767395, "loss": 5.0494, "lr": 0.00044825174825174826, "step": 4446, "tokens_trained": 0.422606696 }, { "epoch": 1.2618439716312055, "grad_norm": 0.750599205493927, "loss": 4.9878, "lr": 0.00044797202797202795, "step": 4448, "tokens_trained": 0.422796416 }, { "epoch": 1.2624113475177305, "grad_norm": 0.7823654413223267, "loss": 4.947, "lr": 0.0004476923076923077, "step": 4450, "tokens_trained": 0.422986968 }, { "epoch": 1.2629787234042553, "grad_norm": 0.8101715445518494, "loss": 4.925, "lr": 0.00044741258741258744, "step": 4452, "tokens_trained": 0.423174384 }, { "epoch": 1.26354609929078, "grad_norm": 0.8134462237358093, "loss": 5.051, "lr": 0.0004471328671328672, "step": 4454, "tokens_trained": 0.42336536 }, { "epoch": 1.264113475177305, "grad_norm": 0.8446463942527771, "loss": 4.9789, "lr": 0.00044685314685314687, "step": 4456, "tokens_trained": 0.423556136 }, { "epoch": 1.2646808510638299, "grad_norm": 0.7812824845314026, "loss": 4.9819, "lr": 0.0004465734265734266, "step": 4458, "tokens_trained": 0.423745736 }, { "epoch": 1.2652482269503547, "grad_norm": 0.7645587921142578, "loss": 4.9824, "lr": 0.0004462937062937063, "step": 4460, "tokens_trained": 0.423935408 }, { "epoch": 1.2658156028368794, "grad_norm": 0.8110623955726624, "loss": 4.9664, "lr": 0.000446013986013986, "step": 4462, "tokens_trained": 0.424125264 }, { "epoch": 1.2663829787234042, "grad_norm": 0.7860397696495056, "loss": 4.9871, "lr": 0.00044573426573426574, "step": 4464, "tokens_trained": 0.424314544 }, { "epoch": 1.266950354609929, "grad_norm": 0.7764657735824585, "loss": 5.0335, "lr": 0.00044545454545454543, "step": 4466, "tokens_trained": 0.424502264 }, { "epoch": 1.2675177304964538, "grad_norm": 0.7725886702537537, "loss": 4.9705, "lr": 0.0004451748251748252, "step": 4468, "tokens_trained": 0.424691888 }, { "epoch": 1.2680851063829788, "grad_norm": 0.8336632251739502, "loss": 5.0535, "lr": 0.0004448951048951049, "step": 4470, "tokens_trained": 0.424880992 }, { "epoch": 1.2686524822695036, "grad_norm": 0.7934354543685913, "loss": 5.0105, "lr": 0.00044461538461538466, "step": 4472, "tokens_trained": 0.425069536 }, { "epoch": 1.2692198581560283, "grad_norm": 0.7649230360984802, "loss": 4.978, "lr": 0.00044433566433566435, "step": 4474, "tokens_trained": 0.425259168 }, { "epoch": 1.2697872340425531, "grad_norm": 0.7798753976821899, "loss": 5.0526, "lr": 0.0004440559440559441, "step": 4476, "tokens_trained": 0.425450064 }, { "epoch": 1.2703546099290781, "grad_norm": 0.7455066442489624, "loss": 4.9914, "lr": 0.0004437762237762238, "step": 4478, "tokens_trained": 0.42564068 }, { "epoch": 1.270921985815603, "grad_norm": 0.7951638698577881, "loss": 5.0092, "lr": 0.0004434965034965035, "step": 4480, "tokens_trained": 0.42583048 }, { "epoch": 1.2714893617021277, "grad_norm": 0.7585451602935791, "loss": 5.016, "lr": 0.0004432167832167832, "step": 4482, "tokens_trained": 0.42602172 }, { "epoch": 1.2720567375886525, "grad_norm": 0.8267669081687927, "loss": 4.972, "lr": 0.0004429370629370629, "step": 4484, "tokens_trained": 0.426212496 }, { "epoch": 1.2726241134751772, "grad_norm": 0.7738245129585266, "loss": 5.0239, "lr": 0.00044265734265734266, "step": 4486, "tokens_trained": 0.426401408 }, { "epoch": 1.273191489361702, "grad_norm": 0.9146332144737244, "loss": 5.0361, "lr": 0.0004423776223776224, "step": 4488, "tokens_trained": 0.426591056 }, { "epoch": 1.273758865248227, "grad_norm": 0.8278553485870361, "loss": 4.9512, "lr": 0.00044209790209790215, "step": 4490, "tokens_trained": 0.42678144 }, { "epoch": 1.2743262411347518, "grad_norm": 0.7594732046127319, "loss": 4.9391, "lr": 0.00044181818181818184, "step": 4492, "tokens_trained": 0.426971472 }, { "epoch": 1.2748936170212766, "grad_norm": 0.8350242376327515, "loss": 4.9151, "lr": 0.00044153846153846153, "step": 4494, "tokens_trained": 0.427161504 }, { "epoch": 1.2754609929078013, "grad_norm": 0.85927414894104, "loss": 4.9303, "lr": 0.0004412587412587413, "step": 4496, "tokens_trained": 0.427351448 }, { "epoch": 1.2760283687943264, "grad_norm": 0.8133000135421753, "loss": 4.9668, "lr": 0.00044097902097902096, "step": 4498, "tokens_trained": 0.427539384 }, { "epoch": 1.2765957446808511, "grad_norm": 0.7529495358467102, "loss": 4.9364, "lr": 0.0004406993006993007, "step": 4500, "tokens_trained": 0.427730552 }, { "epoch": 1.2765957446808511, "eval_loss": 4.999549388885498, "eval_runtime": 20.6764, "step": 4500, "tokens_trained": 0.427730552 }, { "epoch": 1.277163120567376, "grad_norm": 0.7332281470298767, "loss": 5.024, "lr": 0.0004404195804195804, "step": 4502, "tokens_trained": 0.427922016 }, { "epoch": 1.2777304964539007, "grad_norm": 0.7735735774040222, "loss": 4.9235, "lr": 0.00044013986013986014, "step": 4504, "tokens_trained": 0.428112824 }, { "epoch": 1.2782978723404255, "grad_norm": 0.8075562119483948, "loss": 5.0712, "lr": 0.0004398601398601399, "step": 4506, "tokens_trained": 0.428306056 }, { "epoch": 1.2788652482269502, "grad_norm": 0.8019667863845825, "loss": 4.9597, "lr": 0.00043958041958041963, "step": 4508, "tokens_trained": 0.428496768 }, { "epoch": 1.2794326241134752, "grad_norm": 0.7908930778503418, "loss": 4.9471, "lr": 0.0004393006993006993, "step": 4510, "tokens_trained": 0.428685312 }, { "epoch": 1.28, "grad_norm": 0.8128061890602112, "loss": 4.9244, "lr": 0.000439020979020979, "step": 4512, "tokens_trained": 0.428875184 }, { "epoch": 1.2805673758865248, "grad_norm": 0.7859349250793457, "loss": 5.0096, "lr": 0.00043874125874125876, "step": 4514, "tokens_trained": 0.429066688 }, { "epoch": 1.2811347517730496, "grad_norm": 0.7396280169487, "loss": 4.9263, "lr": 0.00043846153846153845, "step": 4516, "tokens_trained": 0.429254336 }, { "epoch": 1.2817021276595746, "grad_norm": 0.8057092428207397, "loss": 4.9705, "lr": 0.0004381818181818182, "step": 4518, "tokens_trained": 0.429446032 }, { "epoch": 1.2822695035460994, "grad_norm": 0.8460845351219177, "loss": 4.9311, "lr": 0.0004379020979020979, "step": 4520, "tokens_trained": 0.429636152 }, { "epoch": 1.2828368794326241, "grad_norm": 0.7627289891242981, "loss": 4.9622, "lr": 0.00043762237762237763, "step": 4522, "tokens_trained": 0.429825536 }, { "epoch": 1.283404255319149, "grad_norm": 0.7211505174636841, "loss": 4.9851, "lr": 0.0004373426573426573, "step": 4524, "tokens_trained": 0.430016616 }, { "epoch": 1.2839716312056737, "grad_norm": 0.7647969722747803, "loss": 4.9708, "lr": 0.0004370629370629371, "step": 4526, "tokens_trained": 0.430208336 }, { "epoch": 1.2845390070921985, "grad_norm": 0.7541454434394836, "loss": 4.9404, "lr": 0.0004367832167832168, "step": 4528, "tokens_trained": 0.430398968 }, { "epoch": 1.2851063829787235, "grad_norm": 0.7825188636779785, "loss": 4.9741, "lr": 0.0004365034965034965, "step": 4530, "tokens_trained": 0.430589112 }, { "epoch": 1.2856737588652483, "grad_norm": 0.7198429107666016, "loss": 4.9745, "lr": 0.00043622377622377624, "step": 4532, "tokens_trained": 0.43077964 }, { "epoch": 1.286241134751773, "grad_norm": 0.7174004912376404, "loss": 5.037, "lr": 0.00043594405594405593, "step": 4534, "tokens_trained": 0.43096964 }, { "epoch": 1.2868085106382978, "grad_norm": 0.7118927240371704, "loss": 5.0456, "lr": 0.0004356643356643357, "step": 4536, "tokens_trained": 0.431160024 }, { "epoch": 1.2873758865248228, "grad_norm": 0.7081615924835205, "loss": 4.9763, "lr": 0.00043538461538461537, "step": 4538, "tokens_trained": 0.431351344 }, { "epoch": 1.2879432624113476, "grad_norm": 0.7620618343353271, "loss": 4.9863, "lr": 0.0004351048951048951, "step": 4540, "tokens_trained": 0.43154232 }, { "epoch": 1.2885106382978724, "grad_norm": 0.8104450702667236, "loss": 4.9903, "lr": 0.0004348251748251748, "step": 4542, "tokens_trained": 0.431731592 }, { "epoch": 1.2890780141843972, "grad_norm": 0.7488150000572205, "loss": 5.0189, "lr": 0.0004345454545454546, "step": 4544, "tokens_trained": 0.431922608 }, { "epoch": 1.289645390070922, "grad_norm": 0.7956752181053162, "loss": 4.9259, "lr": 0.0004342657342657343, "step": 4546, "tokens_trained": 0.432113808 }, { "epoch": 1.2902127659574467, "grad_norm": 0.7799624800682068, "loss": 5.0129, "lr": 0.000433986013986014, "step": 4548, "tokens_trained": 0.432304088 }, { "epoch": 1.2907801418439715, "grad_norm": 0.792834997177124, "loss": 5.0647, "lr": 0.0004337062937062937, "step": 4550, "tokens_trained": 0.432493096 }, { "epoch": 1.2913475177304965, "grad_norm": 0.7479969263076782, "loss": 4.9514, "lr": 0.0004334265734265734, "step": 4552, "tokens_trained": 0.432680128 }, { "epoch": 1.2919148936170213, "grad_norm": 0.7381340861320496, "loss": 4.9865, "lr": 0.00043314685314685316, "step": 4554, "tokens_trained": 0.43287188 }, { "epoch": 1.292482269503546, "grad_norm": 0.7690939903259277, "loss": 4.9704, "lr": 0.00043286713286713285, "step": 4556, "tokens_trained": 0.43306148 }, { "epoch": 1.293049645390071, "grad_norm": 0.7883870005607605, "loss": 4.9766, "lr": 0.0004325874125874126, "step": 4558, "tokens_trained": 0.433252704 }, { "epoch": 1.2936170212765958, "grad_norm": 0.782208263874054, "loss": 4.9967, "lr": 0.0004323076923076923, "step": 4560, "tokens_trained": 0.433444272 }, { "epoch": 1.2941843971631206, "grad_norm": 0.7333335280418396, "loss": 4.9237, "lr": 0.0004320279720279721, "step": 4562, "tokens_trained": 0.433634264 }, { "epoch": 1.2947517730496454, "grad_norm": 0.7663769721984863, "loss": 4.9961, "lr": 0.0004317482517482518, "step": 4564, "tokens_trained": 0.433825632 }, { "epoch": 1.2953191489361702, "grad_norm": 0.75322026014328, "loss": 4.9294, "lr": 0.00043146853146853147, "step": 4566, "tokens_trained": 0.434015424 }, { "epoch": 1.295886524822695, "grad_norm": 0.7660694718360901, "loss": 4.953, "lr": 0.0004311888111888112, "step": 4568, "tokens_trained": 0.434208048 }, { "epoch": 1.2964539007092197, "grad_norm": 0.7548807859420776, "loss": 4.9164, "lr": 0.0004309090909090909, "step": 4570, "tokens_trained": 0.434397424 }, { "epoch": 1.2970212765957447, "grad_norm": 0.760160505771637, "loss": 4.9748, "lr": 0.00043062937062937065, "step": 4572, "tokens_trained": 0.434588752 }, { "epoch": 1.2975886524822695, "grad_norm": 0.8081098198890686, "loss": 4.9596, "lr": 0.00043034965034965034, "step": 4574, "tokens_trained": 0.434779696 }, { "epoch": 1.2981560283687943, "grad_norm": 0.7557078003883362, "loss": 4.979, "lr": 0.0004300699300699301, "step": 4576, "tokens_trained": 0.434971072 }, { "epoch": 1.298723404255319, "grad_norm": 0.7966912984848022, "loss": 4.9257, "lr": 0.00042979020979020977, "step": 4578, "tokens_trained": 0.435160496 }, { "epoch": 1.299290780141844, "grad_norm": 0.8104644417762756, "loss": 4.9675, "lr": 0.00042951048951048957, "step": 4580, "tokens_trained": 0.435349392 }, { "epoch": 1.2998581560283688, "grad_norm": 0.711733877658844, "loss": 4.929, "lr": 0.00042923076923076926, "step": 4582, "tokens_trained": 0.435539752 }, { "epoch": 1.3004255319148936, "grad_norm": 0.7435249090194702, "loss": 5.0012, "lr": 0.00042895104895104895, "step": 4584, "tokens_trained": 0.435730112 }, { "epoch": 1.3009929078014184, "grad_norm": 0.8262581825256348, "loss": 4.9065, "lr": 0.0004286713286713287, "step": 4586, "tokens_trained": 0.435918552 }, { "epoch": 1.3015602836879432, "grad_norm": 0.7614077925682068, "loss": 5.022, "lr": 0.0004283916083916084, "step": 4588, "tokens_trained": 0.43611052 }, { "epoch": 1.302127659574468, "grad_norm": 0.7792633175849915, "loss": 4.9763, "lr": 0.00042811188811188813, "step": 4590, "tokens_trained": 0.43629848 }, { "epoch": 1.302695035460993, "grad_norm": 0.748753011226654, "loss": 4.9588, "lr": 0.0004278321678321678, "step": 4592, "tokens_trained": 0.436487384 }, { "epoch": 1.3032624113475177, "grad_norm": 0.6770404577255249, "loss": 5.0546, "lr": 0.00042755244755244756, "step": 4594, "tokens_trained": 0.436677688 }, { "epoch": 1.3038297872340425, "grad_norm": 0.7595148682594299, "loss": 4.9832, "lr": 0.00042727272727272726, "step": 4596, "tokens_trained": 0.436866288 }, { "epoch": 1.3043971631205673, "grad_norm": 0.7239478230476379, "loss": 4.9597, "lr": 0.00042699300699300705, "step": 4598, "tokens_trained": 0.437057192 }, { "epoch": 1.3049645390070923, "grad_norm": 0.7907828092575073, "loss": 5.0041, "lr": 0.00042671328671328674, "step": 4600, "tokens_trained": 0.437247856 }, { "epoch": 1.305531914893617, "grad_norm": 0.6975818872451782, "loss": 4.9256, "lr": 0.00042643356643356643, "step": 4602, "tokens_trained": 0.43743696 }, { "epoch": 1.3060992907801419, "grad_norm": 0.7589024305343628, "loss": 4.9781, "lr": 0.0004261538461538462, "step": 4604, "tokens_trained": 0.437627408 }, { "epoch": 1.3066666666666666, "grad_norm": 0.7332574725151062, "loss": 5.0012, "lr": 0.00042587412587412587, "step": 4606, "tokens_trained": 0.43781732 }, { "epoch": 1.3072340425531914, "grad_norm": 0.8402982950210571, "loss": 4.9202, "lr": 0.0004255944055944056, "step": 4608, "tokens_trained": 0.438006368 }, { "epoch": 1.3078014184397162, "grad_norm": 0.8018138408660889, "loss": 4.9518, "lr": 0.0004253146853146853, "step": 4610, "tokens_trained": 0.438196728 }, { "epoch": 1.3083687943262412, "grad_norm": 0.8211417198181152, "loss": 4.9916, "lr": 0.00042503496503496505, "step": 4612, "tokens_trained": 0.43838568 }, { "epoch": 1.308936170212766, "grad_norm": 0.8054932355880737, "loss": 4.9329, "lr": 0.00042475524475524474, "step": 4614, "tokens_trained": 0.438577096 }, { "epoch": 1.3095035460992908, "grad_norm": 0.795623779296875, "loss": 4.9572, "lr": 0.0004244755244755245, "step": 4616, "tokens_trained": 0.438767032 }, { "epoch": 1.3100709219858155, "grad_norm": 0.7230743169784546, "loss": 5.0013, "lr": 0.00042419580419580423, "step": 4618, "tokens_trained": 0.438955216 }, { "epoch": 1.3106382978723405, "grad_norm": 0.7714941501617432, "loss": 4.9493, "lr": 0.0004239160839160839, "step": 4620, "tokens_trained": 0.439145848 }, { "epoch": 1.3112056737588653, "grad_norm": 0.7291305661201477, "loss": 4.9792, "lr": 0.00042363636363636366, "step": 4622, "tokens_trained": 0.439334768 }, { "epoch": 1.31177304964539, "grad_norm": 0.6893495321273804, "loss": 4.9703, "lr": 0.00042335664335664335, "step": 4624, "tokens_trained": 0.439524928 }, { "epoch": 1.3120567375886525, "eval_loss": 4.985546112060547, "eval_runtime": 20.6802, "step": 4625, "tokens_trained": 0.439619056 }, { "epoch": 1.3123404255319149, "grad_norm": 0.7363048791885376, "loss": 4.9635, "lr": 0.0004230769230769231, "step": 4626, "tokens_trained": 0.439714232 }, { "epoch": 1.3129078014184397, "grad_norm": 0.7479920387268066, "loss": 5.0308, "lr": 0.0004227972027972028, "step": 4628, "tokens_trained": 0.439904056 }, { "epoch": 1.3134751773049644, "grad_norm": 0.7858623266220093, "loss": 4.9504, "lr": 0.00042251748251748253, "step": 4630, "tokens_trained": 0.440093304 }, { "epoch": 1.3140425531914894, "grad_norm": 0.7382465600967407, "loss": 4.9397, "lr": 0.0004222377622377622, "step": 4632, "tokens_trained": 0.440283584 }, { "epoch": 1.3146099290780142, "grad_norm": 0.7232691049575806, "loss": 5.0304, "lr": 0.00042195804195804197, "step": 4634, "tokens_trained": 0.440473064 }, { "epoch": 1.315177304964539, "grad_norm": 0.7827140092849731, "loss": 5.0059, "lr": 0.0004216783216783217, "step": 4636, "tokens_trained": 0.440664664 }, { "epoch": 1.3157446808510638, "grad_norm": 0.7799215316772461, "loss": 4.9534, "lr": 0.0004213986013986014, "step": 4638, "tokens_trained": 0.4408536 }, { "epoch": 1.3163120567375888, "grad_norm": 0.8065125346183777, "loss": 4.99, "lr": 0.00042111888111888115, "step": 4640, "tokens_trained": 0.441042616 }, { "epoch": 1.3168794326241136, "grad_norm": 0.7722545266151428, "loss": 4.9687, "lr": 0.00042083916083916084, "step": 4642, "tokens_trained": 0.441233296 }, { "epoch": 1.3174468085106383, "grad_norm": 0.7521271109580994, "loss": 5.0357, "lr": 0.0004205594405594406, "step": 4644, "tokens_trained": 0.441423976 }, { "epoch": 1.3180141843971631, "grad_norm": 0.7580513954162598, "loss": 4.9353, "lr": 0.00042027972027972027, "step": 4646, "tokens_trained": 0.441612488 }, { "epoch": 1.318581560283688, "grad_norm": 0.7603718638420105, "loss": 5.0189, "lr": 0.00042, "step": 4648, "tokens_trained": 0.441800944 }, { "epoch": 1.3191489361702127, "grad_norm": 0.7828201055526733, "loss": 4.941, "lr": 0.0004197202797202797, "step": 4650, "tokens_trained": 0.441990072 }, { "epoch": 1.3197163120567375, "grad_norm": 0.7227108478546143, "loss": 4.9707, "lr": 0.0004194405594405594, "step": 4652, "tokens_trained": 0.44218048 }, { "epoch": 1.3202836879432625, "grad_norm": 0.8121836185455322, "loss": 4.91, "lr": 0.0004191608391608392, "step": 4654, "tokens_trained": 0.442370728 }, { "epoch": 1.3208510638297872, "grad_norm": 0.6706936955451965, "loss": 4.907, "lr": 0.0004188811188811189, "step": 4656, "tokens_trained": 0.442560352 }, { "epoch": 1.321418439716312, "grad_norm": 0.7793337106704712, "loss": 5.0206, "lr": 0.00041860139860139863, "step": 4658, "tokens_trained": 0.442750192 }, { "epoch": 1.321985815602837, "grad_norm": 0.7981981039047241, "loss": 5.0155, "lr": 0.0004183216783216783, "step": 4660, "tokens_trained": 0.442940848 }, { "epoch": 1.3225531914893618, "grad_norm": 0.7972844243049622, "loss": 4.9879, "lr": 0.00041804195804195807, "step": 4662, "tokens_trained": 0.443128896 }, { "epoch": 1.3231205673758866, "grad_norm": 0.8017681241035461, "loss": 4.9746, "lr": 0.00041776223776223776, "step": 4664, "tokens_trained": 0.443320528 }, { "epoch": 1.3236879432624113, "grad_norm": 0.7505584955215454, "loss": 4.9819, "lr": 0.0004174825174825175, "step": 4666, "tokens_trained": 0.443510888 }, { "epoch": 1.3242553191489361, "grad_norm": 0.772155225276947, "loss": 5.0783, "lr": 0.0004172027972027972, "step": 4668, "tokens_trained": 0.443701856 }, { "epoch": 1.324822695035461, "grad_norm": 0.7051090598106384, "loss": 4.9403, "lr": 0.0004169230769230769, "step": 4670, "tokens_trained": 0.44389428 }, { "epoch": 1.3253900709219857, "grad_norm": 0.7992343902587891, "loss": 4.9498, "lr": 0.0004166433566433567, "step": 4672, "tokens_trained": 0.444087272 }, { "epoch": 1.3259574468085107, "grad_norm": 0.7696804404258728, "loss": 5.0109, "lr": 0.00041636363636363637, "step": 4674, "tokens_trained": 0.444274648 }, { "epoch": 1.3265248226950355, "grad_norm": 0.7982995510101318, "loss": 4.9506, "lr": 0.0004160839160839161, "step": 4676, "tokens_trained": 0.444464632 }, { "epoch": 1.3270921985815602, "grad_norm": 0.8207205533981323, "loss": 4.9527, "lr": 0.0004158041958041958, "step": 4678, "tokens_trained": 0.444655184 }, { "epoch": 1.327659574468085, "grad_norm": 0.7874724268913269, "loss": 4.9924, "lr": 0.00041552447552447555, "step": 4680, "tokens_trained": 0.444845096 }, { "epoch": 1.32822695035461, "grad_norm": 0.7951269149780273, "loss": 5.0061, "lr": 0.00041524475524475524, "step": 4682, "tokens_trained": 0.445034912 }, { "epoch": 1.3287943262411348, "grad_norm": 0.7952069640159607, "loss": 5.029, "lr": 0.000414965034965035, "step": 4684, "tokens_trained": 0.445224664 }, { "epoch": 1.3293617021276596, "grad_norm": 0.7753441333770752, "loss": 5.0353, "lr": 0.0004146853146853147, "step": 4686, "tokens_trained": 0.44541256 }, { "epoch": 1.3299290780141844, "grad_norm": 0.7112265229225159, "loss": 4.9221, "lr": 0.00041440559440559437, "step": 4688, "tokens_trained": 0.445604696 }, { "epoch": 1.3304964539007091, "grad_norm": 0.7774649262428284, "loss": 5.0125, "lr": 0.00041412587412587417, "step": 4690, "tokens_trained": 0.445794752 }, { "epoch": 1.331063829787234, "grad_norm": 0.8355589509010315, "loss": 4.9665, "lr": 0.00041384615384615386, "step": 4692, "tokens_trained": 0.44598544 }, { "epoch": 1.331631205673759, "grad_norm": 0.7191185355186462, "loss": 4.9798, "lr": 0.0004135664335664336, "step": 4694, "tokens_trained": 0.44617436 }, { "epoch": 1.3321985815602837, "grad_norm": 0.7386505007743835, "loss": 4.9756, "lr": 0.0004132867132867133, "step": 4696, "tokens_trained": 0.446363384 }, { "epoch": 1.3327659574468085, "grad_norm": 0.7661808133125305, "loss": 4.9374, "lr": 0.00041300699300699304, "step": 4698, "tokens_trained": 0.44655264 }, { "epoch": 1.3333333333333333, "grad_norm": 0.7530731558799744, "loss": 4.9681, "lr": 0.0004127272727272727, "step": 4700, "tokens_trained": 0.446743016 }, { "epoch": 1.3339007092198583, "grad_norm": 0.7512504458427429, "loss": 4.9827, "lr": 0.00041244755244755247, "step": 4702, "tokens_trained": 0.446932608 }, { "epoch": 1.334468085106383, "grad_norm": 0.7335140109062195, "loss": 4.9586, "lr": 0.00041216783216783216, "step": 4704, "tokens_trained": 0.447122208 }, { "epoch": 1.3350354609929078, "grad_norm": 0.7327559590339661, "loss": 4.9666, "lr": 0.00041188811188811185, "step": 4706, "tokens_trained": 0.447312824 }, { "epoch": 1.3356028368794326, "grad_norm": 0.7450160980224609, "loss": 4.9197, "lr": 0.00041160839160839165, "step": 4708, "tokens_trained": 0.447500672 }, { "epoch": 1.3361702127659574, "grad_norm": 0.6740980744361877, "loss": 5.0133, "lr": 0.00041132867132867134, "step": 4710, "tokens_trained": 0.447689552 }, { "epoch": 1.3367375886524822, "grad_norm": 0.7320116758346558, "loss": 4.9751, "lr": 0.0004110489510489511, "step": 4712, "tokens_trained": 0.447880128 }, { "epoch": 1.3373049645390072, "grad_norm": 0.7833261489868164, "loss": 4.9285, "lr": 0.0004107692307692308, "step": 4714, "tokens_trained": 0.448069496 }, { "epoch": 1.337872340425532, "grad_norm": 0.7570978999137878, "loss": 5.0047, "lr": 0.0004104895104895105, "step": 4716, "tokens_trained": 0.448258184 }, { "epoch": 1.3384397163120567, "grad_norm": 0.7320883274078369, "loss": 4.9751, "lr": 0.0004102097902097902, "step": 4718, "tokens_trained": 0.448449488 }, { "epoch": 1.3390070921985815, "grad_norm": 0.7385469675064087, "loss": 4.9712, "lr": 0.0004099300699300699, "step": 4720, "tokens_trained": 0.448638776 }, { "epoch": 1.3395744680851065, "grad_norm": 0.7620404958724976, "loss": 4.8906, "lr": 0.00040965034965034964, "step": 4722, "tokens_trained": 0.448830528 }, { "epoch": 1.3401418439716313, "grad_norm": 0.7389976382255554, "loss": 4.9994, "lr": 0.00040937062937062934, "step": 4724, "tokens_trained": 0.449018952 }, { "epoch": 1.340709219858156, "grad_norm": 0.7150964140892029, "loss": 4.9244, "lr": 0.00040909090909090913, "step": 4726, "tokens_trained": 0.44920784 }, { "epoch": 1.3412765957446808, "grad_norm": 0.7163580060005188, "loss": 5.0069, "lr": 0.0004088111888111888, "step": 4728, "tokens_trained": 0.449396696 }, { "epoch": 1.3418439716312056, "grad_norm": 0.7657668590545654, "loss": 4.9322, "lr": 0.00040853146853146857, "step": 4730, "tokens_trained": 0.449585568 }, { "epoch": 1.3424113475177304, "grad_norm": 0.7743586301803589, "loss": 4.9691, "lr": 0.00040825174825174826, "step": 4732, "tokens_trained": 0.44977396 }, { "epoch": 1.3429787234042554, "grad_norm": 0.8050113320350647, "loss": 4.9514, "lr": 0.000407972027972028, "step": 4734, "tokens_trained": 0.449964656 }, { "epoch": 1.3435460992907802, "grad_norm": 0.7641178965568542, "loss": 4.8956, "lr": 0.0004076923076923077, "step": 4736, "tokens_trained": 0.450154896 }, { "epoch": 1.344113475177305, "grad_norm": 0.8350791931152344, "loss": 4.9625, "lr": 0.0004074125874125874, "step": 4738, "tokens_trained": 0.450344528 }, { "epoch": 1.3446808510638297, "grad_norm": 0.7148427367210388, "loss": 4.9657, "lr": 0.00040713286713286713, "step": 4740, "tokens_trained": 0.450535416 }, { "epoch": 1.3452482269503547, "grad_norm": 0.7961207032203674, "loss": 4.897, "lr": 0.0004068531468531468, "step": 4742, "tokens_trained": 0.450724968 }, { "epoch": 1.3458156028368795, "grad_norm": 0.8115900754928589, "loss": 4.9171, "lr": 0.0004065734265734266, "step": 4744, "tokens_trained": 0.450916104 }, { "epoch": 1.3463829787234043, "grad_norm": 0.7608439326286316, "loss": 4.9817, "lr": 0.0004062937062937063, "step": 4746, "tokens_trained": 0.451107272 }, { "epoch": 1.346950354609929, "grad_norm": 0.7412408590316772, "loss": 5.0053, "lr": 0.00040601398601398605, "step": 4748, "tokens_trained": 0.451297408 }, { "epoch": 1.3475177304964538, "grad_norm": 0.7785027623176575, "loss": 5.0091, "lr": 0.00040573426573426574, "step": 4750, "tokens_trained": 0.451488272 }, { "epoch": 1.3475177304964538, "eval_loss": 4.9776997566223145, "eval_runtime": 20.4142, "step": 4750, "tokens_trained": 0.451488272 }, { "epoch": 1.3480851063829786, "grad_norm": 0.7034481763839722, "loss": 5.0312, "lr": 0.0004054545454545455, "step": 4752, "tokens_trained": 0.451678048 }, { "epoch": 1.3486524822695036, "grad_norm": 0.8021607398986816, "loss": 4.9923, "lr": 0.0004051748251748252, "step": 4754, "tokens_trained": 0.451867816 }, { "epoch": 1.3492198581560284, "grad_norm": 0.7409330606460571, "loss": 4.9429, "lr": 0.00040489510489510487, "step": 4756, "tokens_trained": 0.45205644 }, { "epoch": 1.3497872340425532, "grad_norm": 0.6617271900177002, "loss": 5.0044, "lr": 0.0004046153846153846, "step": 4758, "tokens_trained": 0.452247464 }, { "epoch": 1.350354609929078, "grad_norm": 0.7742848992347717, "loss": 4.9794, "lr": 0.0004043356643356643, "step": 4760, "tokens_trained": 0.452437608 }, { "epoch": 1.350921985815603, "grad_norm": 0.7627806663513184, "loss": 4.9562, "lr": 0.0004040559440559441, "step": 4762, "tokens_trained": 0.452627568 }, { "epoch": 1.3514893617021277, "grad_norm": 0.8105679750442505, "loss": 5.0514, "lr": 0.0004037762237762238, "step": 4764, "tokens_trained": 0.452817176 }, { "epoch": 1.3520567375886525, "grad_norm": 0.7783811688423157, "loss": 4.9414, "lr": 0.00040349650349650354, "step": 4766, "tokens_trained": 0.4530078 }, { "epoch": 1.3526241134751773, "grad_norm": 0.7357584238052368, "loss": 4.9184, "lr": 0.00040321678321678323, "step": 4768, "tokens_trained": 0.453196856 }, { "epoch": 1.353191489361702, "grad_norm": 0.79344242811203, "loss": 4.8904, "lr": 0.00040293706293706297, "step": 4770, "tokens_trained": 0.4533878 }, { "epoch": 1.3537588652482269, "grad_norm": 0.7372890710830688, "loss": 4.9378, "lr": 0.00040265734265734266, "step": 4772, "tokens_trained": 0.4535766 }, { "epoch": 1.3543262411347516, "grad_norm": 0.7920981049537659, "loss": 4.9701, "lr": 0.00040237762237762235, "step": 4774, "tokens_trained": 0.45376792 }, { "epoch": 1.3548936170212766, "grad_norm": 0.7568764686584473, "loss": 5.0008, "lr": 0.0004020979020979021, "step": 4776, "tokens_trained": 0.453958072 }, { "epoch": 1.3554609929078014, "grad_norm": 0.7389140129089355, "loss": 4.9886, "lr": 0.0004018181818181818, "step": 4778, "tokens_trained": 0.454147016 }, { "epoch": 1.3560283687943262, "grad_norm": 0.7528326511383057, "loss": 4.9669, "lr": 0.00040153846153846153, "step": 4780, "tokens_trained": 0.454338392 }, { "epoch": 1.3565957446808512, "grad_norm": 0.7838888764381409, "loss": 5.0034, "lr": 0.0004012587412587413, "step": 4782, "tokens_trained": 0.45452652 }, { "epoch": 1.357163120567376, "grad_norm": 0.8001760244369507, "loss": 4.969, "lr": 0.000400979020979021, "step": 4784, "tokens_trained": 0.454714896 }, { "epoch": 1.3577304964539008, "grad_norm": 0.7670722007751465, "loss": 5.0728, "lr": 0.0004006993006993007, "step": 4786, "tokens_trained": 0.45490428 }, { "epoch": 1.3582978723404255, "grad_norm": 0.7396910786628723, "loss": 4.9123, "lr": 0.00040041958041958046, "step": 4788, "tokens_trained": 0.45509412 }, { "epoch": 1.3588652482269503, "grad_norm": 0.8072660565376282, "loss": 4.9988, "lr": 0.00040013986013986015, "step": 4790, "tokens_trained": 0.455283592 }, { "epoch": 1.359432624113475, "grad_norm": 0.7714769840240479, "loss": 4.9984, "lr": 0.00039986013986013984, "step": 4792, "tokens_trained": 0.455476456 }, { "epoch": 1.3599999999999999, "grad_norm": 0.749272882938385, "loss": 5.02, "lr": 0.0003995804195804196, "step": 4794, "tokens_trained": 0.455666568 }, { "epoch": 1.3605673758865249, "grad_norm": 0.9460277557373047, "loss": 5.0115, "lr": 0.00039930069930069927, "step": 4796, "tokens_trained": 0.455855472 }, { "epoch": 1.3611347517730497, "grad_norm": 0.8013962507247925, "loss": 4.8935, "lr": 0.000399020979020979, "step": 4798, "tokens_trained": 0.456043256 }, { "epoch": 1.3617021276595744, "grad_norm": 0.8356024026870728, "loss": 5.004, "lr": 0.00039874125874125876, "step": 4800, "tokens_trained": 0.456232728 }, { "epoch": 1.3622695035460992, "grad_norm": 0.7791249752044678, "loss": 4.9025, "lr": 0.0003984615384615385, "step": 4802, "tokens_trained": 0.456422672 }, { "epoch": 1.3628368794326242, "grad_norm": 0.7426172494888306, "loss": 4.9706, "lr": 0.0003981818181818182, "step": 4804, "tokens_trained": 0.456612888 }, { "epoch": 1.363404255319149, "grad_norm": 0.8252729773521423, "loss": 4.9679, "lr": 0.00039790209790209794, "step": 4806, "tokens_trained": 0.456802432 }, { "epoch": 1.3639716312056738, "grad_norm": 0.7870017290115356, "loss": 4.9609, "lr": 0.00039762237762237763, "step": 4808, "tokens_trained": 0.456990752 }, { "epoch": 1.3645390070921986, "grad_norm": 0.815733790397644, "loss": 4.9634, "lr": 0.0003973426573426573, "step": 4810, "tokens_trained": 0.457181528 }, { "epoch": 1.3651063829787233, "grad_norm": 0.6886212825775146, "loss": 4.954, "lr": 0.00039706293706293707, "step": 4812, "tokens_trained": 0.457370064 }, { "epoch": 1.365673758865248, "grad_norm": 0.7102149724960327, "loss": 4.8986, "lr": 0.00039678321678321676, "step": 4814, "tokens_trained": 0.457559112 }, { "epoch": 1.3662411347517731, "grad_norm": 0.7671045064926147, "loss": 4.9504, "lr": 0.0003965034965034965, "step": 4816, "tokens_trained": 0.457749888 }, { "epoch": 1.366808510638298, "grad_norm": 0.7828851938247681, "loss": 4.9522, "lr": 0.00039622377622377625, "step": 4818, "tokens_trained": 0.457939616 }, { "epoch": 1.3673758865248227, "grad_norm": 0.7570793628692627, "loss": 4.9273, "lr": 0.000395944055944056, "step": 4820, "tokens_trained": 0.458131776 }, { "epoch": 1.3679432624113474, "grad_norm": 0.7246227860450745, "loss": 5.0342, "lr": 0.0003956643356643357, "step": 4822, "tokens_trained": 0.458323576 }, { "epoch": 1.3685106382978725, "grad_norm": 0.7387742400169373, "loss": 4.976, "lr": 0.0003953846153846154, "step": 4824, "tokens_trained": 0.4585148 }, { "epoch": 1.3690780141843972, "grad_norm": 0.7457069158554077, "loss": 5.0033, "lr": 0.0003951048951048951, "step": 4826, "tokens_trained": 0.458706352 }, { "epoch": 1.369645390070922, "grad_norm": 0.721156895160675, "loss": 4.9869, "lr": 0.0003948251748251748, "step": 4828, "tokens_trained": 0.45889584 }, { "epoch": 1.3702127659574468, "grad_norm": 0.7440138459205627, "loss": 4.9304, "lr": 0.00039454545454545455, "step": 4830, "tokens_trained": 0.459084496 }, { "epoch": 1.3707801418439716, "grad_norm": 0.7051060199737549, "loss": 4.9388, "lr": 0.00039426573426573424, "step": 4832, "tokens_trained": 0.459273328 }, { "epoch": 1.3713475177304963, "grad_norm": 0.7923696637153625, "loss": 4.9632, "lr": 0.000393986013986014, "step": 4834, "tokens_trained": 0.459461936 }, { "epoch": 1.3719148936170213, "grad_norm": 0.7542476654052734, "loss": 4.9849, "lr": 0.00039370629370629373, "step": 4836, "tokens_trained": 0.459654296 }, { "epoch": 1.3724822695035461, "grad_norm": 0.6460102200508118, "loss": 4.9345, "lr": 0.0003934265734265735, "step": 4838, "tokens_trained": 0.459840832 }, { "epoch": 1.373049645390071, "grad_norm": 0.6898486614227295, "loss": 4.9322, "lr": 0.00039314685314685316, "step": 4840, "tokens_trained": 0.46003016 }, { "epoch": 1.3736170212765957, "grad_norm": 0.7820252776145935, "loss": 4.9832, "lr": 0.00039286713286713286, "step": 4842, "tokens_trained": 0.460220928 }, { "epoch": 1.3741843971631207, "grad_norm": 0.681734561920166, "loss": 4.8975, "lr": 0.0003925874125874126, "step": 4844, "tokens_trained": 0.460410064 }, { "epoch": 1.3747517730496455, "grad_norm": 0.7517859935760498, "loss": 4.941, "lr": 0.0003923076923076923, "step": 4846, "tokens_trained": 0.46059848 }, { "epoch": 1.3753191489361702, "grad_norm": 0.7375074625015259, "loss": 4.9473, "lr": 0.00039202797202797203, "step": 4848, "tokens_trained": 0.46078916 }, { "epoch": 1.375886524822695, "grad_norm": 0.728672444820404, "loss": 4.962, "lr": 0.0003917482517482517, "step": 4850, "tokens_trained": 0.460977672 }, { "epoch": 1.3764539007092198, "grad_norm": 0.7166595458984375, "loss": 4.9366, "lr": 0.00039146853146853147, "step": 4852, "tokens_trained": 0.461166912 }, { "epoch": 1.3770212765957446, "grad_norm": 0.7807113528251648, "loss": 4.9279, "lr": 0.0003911888111888112, "step": 4854, "tokens_trained": 0.46135684 }, { "epoch": 1.3775886524822696, "grad_norm": 0.7296082973480225, "loss": 4.9246, "lr": 0.00039090909090909096, "step": 4856, "tokens_trained": 0.461546944 }, { "epoch": 1.3781560283687944, "grad_norm": 0.7450242638587952, "loss": 4.9474, "lr": 0.00039062937062937065, "step": 4858, "tokens_trained": 0.461736576 }, { "epoch": 1.3787234042553191, "grad_norm": 0.6994244456291199, "loss": 4.9334, "lr": 0.00039034965034965034, "step": 4860, "tokens_trained": 0.461925424 }, { "epoch": 1.379290780141844, "grad_norm": 0.7981341481208801, "loss": 4.9785, "lr": 0.0003900699300699301, "step": 4862, "tokens_trained": 0.462115912 }, { "epoch": 1.379858156028369, "grad_norm": 0.6945004463195801, "loss": 4.9581, "lr": 0.0003897902097902098, "step": 4864, "tokens_trained": 0.462306424 }, { "epoch": 1.3804255319148937, "grad_norm": 0.7116626501083374, "loss": 4.947, "lr": 0.0003895104895104895, "step": 4866, "tokens_trained": 0.462497352 }, { "epoch": 1.3809929078014185, "grad_norm": 0.7096779346466064, "loss": 4.956, "lr": 0.0003892307692307692, "step": 4868, "tokens_trained": 0.462686872 }, { "epoch": 1.3815602836879433, "grad_norm": 0.6993130445480347, "loss": 4.9038, "lr": 0.00038895104895104895, "step": 4870, "tokens_trained": 0.462877712 }, { "epoch": 1.382127659574468, "grad_norm": 0.7118195295333862, "loss": 4.9709, "lr": 0.0003886713286713287, "step": 4872, "tokens_trained": 0.463069304 }, { "epoch": 1.3826950354609928, "grad_norm": 0.760608971118927, "loss": 4.9574, "lr": 0.00038839160839160844, "step": 4874, "tokens_trained": 0.463260616 }, { "epoch": 1.3829787234042552, "eval_loss": 4.976211071014404, "eval_runtime": 20.5866, "step": 4875, "tokens_trained": 0.463356504 }, { "epoch": 1.3832624113475176, "grad_norm": 0.7358114123344421, "loss": 5.0454, "lr": 0.00038811188811188813, "step": 4876, "tokens_trained": 0.4634498 }, { "epoch": 1.3838297872340426, "grad_norm": 0.7012422680854797, "loss": 4.9396, "lr": 0.0003878321678321678, "step": 4878, "tokens_trained": 0.463639736 }, { "epoch": 1.3843971631205674, "grad_norm": 0.7740567922592163, "loss": 4.9148, "lr": 0.00038755244755244757, "step": 4880, "tokens_trained": 0.463831816 }, { "epoch": 1.3849645390070922, "grad_norm": 0.7246590852737427, "loss": 4.9392, "lr": 0.00038727272727272726, "step": 4882, "tokens_trained": 0.464022656 }, { "epoch": 1.3855319148936172, "grad_norm": 0.7365467548370361, "loss": 4.9912, "lr": 0.000386993006993007, "step": 4884, "tokens_trained": 0.464212584 }, { "epoch": 1.386099290780142, "grad_norm": 0.7027139067649841, "loss": 4.9846, "lr": 0.0003867132867132867, "step": 4886, "tokens_trained": 0.464404256 }, { "epoch": 1.3866666666666667, "grad_norm": 0.7768815755844116, "loss": 4.9042, "lr": 0.00038643356643356644, "step": 4888, "tokens_trained": 0.46459544 }, { "epoch": 1.3872340425531915, "grad_norm": 0.7037492990493774, "loss": 4.8755, "lr": 0.0003861538461538462, "step": 4890, "tokens_trained": 0.464784224 }, { "epoch": 1.3878014184397163, "grad_norm": 0.8143949508666992, "loss": 4.9742, "lr": 0.00038587412587412593, "step": 4892, "tokens_trained": 0.464975456 }, { "epoch": 1.388368794326241, "grad_norm": 0.7223230600357056, "loss": 4.9473, "lr": 0.0003855944055944056, "step": 4894, "tokens_trained": 0.465164944 }, { "epoch": 1.3889361702127658, "grad_norm": 0.7167389988899231, "loss": 4.9495, "lr": 0.0003853146853146853, "step": 4896, "tokens_trained": 0.46535596 }, { "epoch": 1.3895035460992908, "grad_norm": 0.764140248298645, "loss": 4.9759, "lr": 0.00038503496503496505, "step": 4898, "tokens_trained": 0.465545192 }, { "epoch": 1.3900709219858156, "grad_norm": 0.7822412252426147, "loss": 5.0296, "lr": 0.00038475524475524474, "step": 4900, "tokens_trained": 0.465734568 }, { "epoch": 1.3906382978723404, "grad_norm": 0.7479943633079529, "loss": 4.9059, "lr": 0.0003844755244755245, "step": 4902, "tokens_trained": 0.465923464 }, { "epoch": 1.3912056737588652, "grad_norm": 0.7703482508659363, "loss": 4.927, "lr": 0.0003841958041958042, "step": 4904, "tokens_trained": 0.466114272 }, { "epoch": 1.3917730496453902, "grad_norm": 0.7773356437683105, "loss": 4.9733, "lr": 0.0003839160839160839, "step": 4906, "tokens_trained": 0.466305032 }, { "epoch": 1.392340425531915, "grad_norm": 0.7287682294845581, "loss": 4.9497, "lr": 0.0003836363636363636, "step": 4908, "tokens_trained": 0.466495336 }, { "epoch": 1.3929078014184397, "grad_norm": 0.7540012001991272, "loss": 4.948, "lr": 0.0003833566433566434, "step": 4910, "tokens_trained": 0.466686312 }, { "epoch": 1.3934751773049645, "grad_norm": 0.6999531388282776, "loss": 5.0049, "lr": 0.0003830769230769231, "step": 4912, "tokens_trained": 0.466878424 }, { "epoch": 1.3940425531914893, "grad_norm": 0.7895733714103699, "loss": 4.9807, "lr": 0.0003827972027972028, "step": 4914, "tokens_trained": 0.467068576 }, { "epoch": 1.394609929078014, "grad_norm": 0.8046857118606567, "loss": 4.9518, "lr": 0.00038251748251748254, "step": 4916, "tokens_trained": 0.467259704 }, { "epoch": 1.395177304964539, "grad_norm": 0.6962889432907104, "loss": 4.993, "lr": 0.0003822377622377622, "step": 4918, "tokens_trained": 0.46744908 }, { "epoch": 1.3957446808510638, "grad_norm": 0.7171238660812378, "loss": 4.9422, "lr": 0.00038195804195804197, "step": 4920, "tokens_trained": 0.467637408 }, { "epoch": 1.3963120567375886, "grad_norm": 0.7095980644226074, "loss": 4.9518, "lr": 0.00038167832167832166, "step": 4922, "tokens_trained": 0.467828128 }, { "epoch": 1.3968794326241134, "grad_norm": 0.798319399356842, "loss": 4.9267, "lr": 0.0003813986013986014, "step": 4924, "tokens_trained": 0.4680194 }, { "epoch": 1.3974468085106384, "grad_norm": 0.6752556562423706, "loss": 4.9905, "lr": 0.0003811188811188811, "step": 4926, "tokens_trained": 0.468209408 }, { "epoch": 1.3980141843971632, "grad_norm": 0.7536012530326843, "loss": 4.938, "lr": 0.0003808391608391609, "step": 4928, "tokens_trained": 0.468399016 }, { "epoch": 1.398581560283688, "grad_norm": 0.7366868257522583, "loss": 4.946, "lr": 0.0003805594405594406, "step": 4930, "tokens_trained": 0.468591152 }, { "epoch": 1.3991489361702127, "grad_norm": 0.765252411365509, "loss": 4.9843, "lr": 0.0003802797202797203, "step": 4932, "tokens_trained": 0.468781688 }, { "epoch": 1.3997163120567375, "grad_norm": 0.6715340614318848, "loss": 4.955, "lr": 0.00038, "step": 4934, "tokens_trained": 0.468972624 }, { "epoch": 1.4002836879432623, "grad_norm": 0.7280968427658081, "loss": 4.9459, "lr": 0.0003797202797202797, "step": 4936, "tokens_trained": 0.46916264 }, { "epoch": 1.4008510638297873, "grad_norm": 0.7301554679870605, "loss": 4.9083, "lr": 0.00037944055944055946, "step": 4938, "tokens_trained": 0.469352208 }, { "epoch": 1.401418439716312, "grad_norm": 0.7966684103012085, "loss": 4.9554, "lr": 0.00037916083916083915, "step": 4940, "tokens_trained": 0.469542584 }, { "epoch": 1.4019858156028369, "grad_norm": 0.7339959144592285, "loss": 4.9614, "lr": 0.0003788811188811189, "step": 4942, "tokens_trained": 0.4697328 }, { "epoch": 1.4025531914893616, "grad_norm": 0.7321662902832031, "loss": 4.9159, "lr": 0.0003786013986013986, "step": 4944, "tokens_trained": 0.469922768 }, { "epoch": 1.4031205673758866, "grad_norm": 0.7663842439651489, "loss": 5.0158, "lr": 0.0003783216783216784, "step": 4946, "tokens_trained": 0.470111912 }, { "epoch": 1.4036879432624114, "grad_norm": 0.6754962801933289, "loss": 4.973, "lr": 0.00037804195804195807, "step": 4948, "tokens_trained": 0.470303544 }, { "epoch": 1.4042553191489362, "grad_norm": 0.6885703802108765, "loss": 4.9308, "lr": 0.00037776223776223776, "step": 4950, "tokens_trained": 0.470493872 }, { "epoch": 1.404822695035461, "grad_norm": 0.7635725140571594, "loss": 4.9281, "lr": 0.0003774825174825175, "step": 4952, "tokens_trained": 0.47068516 }, { "epoch": 1.4053900709219858, "grad_norm": 0.6963970065116882, "loss": 4.9579, "lr": 0.0003772027972027972, "step": 4954, "tokens_trained": 0.470875848 }, { "epoch": 1.4059574468085105, "grad_norm": 0.7530264854431152, "loss": 4.9418, "lr": 0.00037692307692307694, "step": 4956, "tokens_trained": 0.47106516 }, { "epoch": 1.4065248226950355, "grad_norm": 0.707700788974762, "loss": 5.0045, "lr": 0.00037664335664335663, "step": 4958, "tokens_trained": 0.471252432 }, { "epoch": 1.4070921985815603, "grad_norm": 0.7403944134712219, "loss": 4.9305, "lr": 0.0003763636363636364, "step": 4960, "tokens_trained": 0.471442672 }, { "epoch": 1.407659574468085, "grad_norm": 0.753716230392456, "loss": 4.9812, "lr": 0.00037608391608391607, "step": 4962, "tokens_trained": 0.471631888 }, { "epoch": 1.4082269503546099, "grad_norm": 0.8004569411277771, "loss": 4.9217, "lr": 0.0003758041958041958, "step": 4964, "tokens_trained": 0.47182072 }, { "epoch": 1.4087943262411349, "grad_norm": 0.7715573906898499, "loss": 4.9233, "lr": 0.00037552447552447555, "step": 4966, "tokens_trained": 0.472011104 }, { "epoch": 1.4093617021276597, "grad_norm": 0.6821765303611755, "loss": 4.9976, "lr": 0.00037524475524475524, "step": 4968, "tokens_trained": 0.472201568 }, { "epoch": 1.4099290780141844, "grad_norm": 0.7360137701034546, "loss": 4.9414, "lr": 0.000374965034965035, "step": 4970, "tokens_trained": 0.472390656 }, { "epoch": 1.4104964539007092, "grad_norm": 0.6912544369697571, "loss": 4.9692, "lr": 0.0003746853146853147, "step": 4972, "tokens_trained": 0.4725808 }, { "epoch": 1.411063829787234, "grad_norm": 0.7245798110961914, "loss": 4.9708, "lr": 0.0003744055944055944, "step": 4974, "tokens_trained": 0.472768096 }, { "epoch": 1.4116312056737588, "grad_norm": 0.8210451602935791, "loss": 4.9523, "lr": 0.0003741258741258741, "step": 4976, "tokens_trained": 0.472957224 }, { "epoch": 1.4121985815602836, "grad_norm": 0.7312847971916199, "loss": 4.948, "lr": 0.00037384615384615386, "step": 4978, "tokens_trained": 0.47314768 }, { "epoch": 1.4127659574468086, "grad_norm": 0.7223467826843262, "loss": 4.9328, "lr": 0.00037356643356643355, "step": 4980, "tokens_trained": 0.47333896 }, { "epoch": 1.4133333333333333, "grad_norm": 0.7106639742851257, "loss": 4.9244, "lr": 0.0003732867132867133, "step": 4982, "tokens_trained": 0.473530608 }, { "epoch": 1.413900709219858, "grad_norm": 0.717099130153656, "loss": 4.9213, "lr": 0.00037300699300699304, "step": 4984, "tokens_trained": 0.473720968 }, { "epoch": 1.414468085106383, "grad_norm": 0.7603443264961243, "loss": 4.9815, "lr": 0.00037272727272727273, "step": 4986, "tokens_trained": 0.473911448 }, { "epoch": 1.415035460992908, "grad_norm": 0.7069094181060791, "loss": 4.9781, "lr": 0.0003724475524475525, "step": 4988, "tokens_trained": 0.474101576 }, { "epoch": 1.4156028368794327, "grad_norm": 0.6874499917030334, "loss": 4.9678, "lr": 0.00037216783216783216, "step": 4990, "tokens_trained": 0.474292264 }, { "epoch": 1.4161702127659574, "grad_norm": 0.7207010984420776, "loss": 4.9998, "lr": 0.0003718881118881119, "step": 4992, "tokens_trained": 0.474482208 }, { "epoch": 1.4167375886524822, "grad_norm": 0.7269707322120667, "loss": 4.9148, "lr": 0.0003716083916083916, "step": 4994, "tokens_trained": 0.474671136 }, { "epoch": 1.417304964539007, "grad_norm": 0.6694115400314331, "loss": 4.9269, "lr": 0.00037132867132867134, "step": 4996, "tokens_trained": 0.474862056 }, { "epoch": 1.4178723404255318, "grad_norm": 0.6479254364967346, "loss": 4.9747, "lr": 0.00037104895104895103, "step": 4998, "tokens_trained": 0.475053784 }, { "epoch": 1.4184397163120568, "grad_norm": 0.660739004611969, "loss": 4.9034, "lr": 0.0003707692307692308, "step": 5000, "tokens_trained": 0.47524568 }, { "epoch": 1.4184397163120568, "eval_loss": 4.966795444488525, "eval_runtime": 21.018, "step": 5000, "tokens_trained": 0.47524568 }, { "epoch": 1.4190070921985816, "grad_norm": 0.7606148719787598, "loss": 4.991, "lr": 0.0003704895104895105, "step": 5002, "tokens_trained": 0.475437208 }, { "epoch": 1.4195744680851063, "grad_norm": 0.6917815208435059, "loss": 4.9582, "lr": 0.0003702097902097902, "step": 5004, "tokens_trained": 0.475625952 }, { "epoch": 1.4201418439716311, "grad_norm": 0.731756865978241, "loss": 4.9908, "lr": 0.00036993006993006996, "step": 5006, "tokens_trained": 0.475815792 }, { "epoch": 1.4207092198581561, "grad_norm": 0.7233264446258545, "loss": 4.936, "lr": 0.00036965034965034965, "step": 5008, "tokens_trained": 0.476005808 }, { "epoch": 1.421276595744681, "grad_norm": 0.6983084082603455, "loss": 4.9926, "lr": 0.0003693706293706294, "step": 5010, "tokens_trained": 0.476195072 }, { "epoch": 1.4218439716312057, "grad_norm": 0.752465009689331, "loss": 4.9064, "lr": 0.0003690909090909091, "step": 5012, "tokens_trained": 0.47638492 }, { "epoch": 1.4224113475177305, "grad_norm": 0.7406246662139893, "loss": 4.9646, "lr": 0.00036881118881118883, "step": 5014, "tokens_trained": 0.476575672 }, { "epoch": 1.4229787234042552, "grad_norm": 0.7230610251426697, "loss": 5.0192, "lr": 0.0003685314685314685, "step": 5016, "tokens_trained": 0.476764648 }, { "epoch": 1.42354609929078, "grad_norm": 0.7906433343887329, "loss": 4.9454, "lr": 0.00036825174825174826, "step": 5018, "tokens_trained": 0.476955176 }, { "epoch": 1.424113475177305, "grad_norm": 0.713800847530365, "loss": 4.9439, "lr": 0.000367972027972028, "step": 5020, "tokens_trained": 0.477145336 }, { "epoch": 1.4246808510638298, "grad_norm": 0.80546635389328, "loss": 4.9492, "lr": 0.0003676923076923077, "step": 5022, "tokens_trained": 0.477338024 }, { "epoch": 1.4252482269503546, "grad_norm": 0.831771969795227, "loss": 4.9498, "lr": 0.00036741258741258744, "step": 5024, "tokens_trained": 0.47752756 }, { "epoch": 1.4258156028368794, "grad_norm": 0.7554155588150024, "loss": 4.9411, "lr": 0.00036713286713286713, "step": 5026, "tokens_trained": 0.477717624 }, { "epoch": 1.4263829787234044, "grad_norm": 0.7594896554946899, "loss": 4.9751, "lr": 0.0003668531468531469, "step": 5028, "tokens_trained": 0.477908536 }, { "epoch": 1.4269503546099291, "grad_norm": 0.6471177339553833, "loss": 4.8401, "lr": 0.00036657342657342657, "step": 5030, "tokens_trained": 0.478097528 }, { "epoch": 1.427517730496454, "grad_norm": 0.7507487535476685, "loss": 4.9596, "lr": 0.0003662937062937063, "step": 5032, "tokens_trained": 0.47828772 }, { "epoch": 1.4280851063829787, "grad_norm": 0.7026324272155762, "loss": 4.9324, "lr": 0.000366013986013986, "step": 5034, "tokens_trained": 0.478478448 }, { "epoch": 1.4286524822695035, "grad_norm": 0.7535367012023926, "loss": 4.8952, "lr": 0.0003657342657342657, "step": 5036, "tokens_trained": 0.47866956 }, { "epoch": 1.4292198581560283, "grad_norm": 0.7286129593849182, "loss": 4.9075, "lr": 0.0003654545454545455, "step": 5038, "tokens_trained": 0.47885872 }, { "epoch": 1.4297872340425533, "grad_norm": 0.6919812560081482, "loss": 4.9271, "lr": 0.0003651748251748252, "step": 5040, "tokens_trained": 0.479048376 }, { "epoch": 1.430354609929078, "grad_norm": 0.7181224822998047, "loss": 4.9257, "lr": 0.0003648951048951049, "step": 5042, "tokens_trained": 0.479237688 }, { "epoch": 1.4309219858156028, "grad_norm": 0.7457099556922913, "loss": 4.8927, "lr": 0.0003646153846153846, "step": 5044, "tokens_trained": 0.479426584 }, { "epoch": 1.4314893617021276, "grad_norm": 0.7675755023956299, "loss": 4.9287, "lr": 0.00036433566433566436, "step": 5046, "tokens_trained": 0.479617216 }, { "epoch": 1.4320567375886526, "grad_norm": 0.7041569352149963, "loss": 4.9276, "lr": 0.00036405594405594405, "step": 5048, "tokens_trained": 0.479807304 }, { "epoch": 1.4326241134751774, "grad_norm": 0.7185978293418884, "loss": 4.9652, "lr": 0.00036377622377622374, "step": 5050, "tokens_trained": 0.479997504 }, { "epoch": 1.4331914893617022, "grad_norm": 0.7958099246025085, "loss": 4.94, "lr": 0.0003634965034965035, "step": 5052, "tokens_trained": 0.480189784 }, { "epoch": 1.433758865248227, "grad_norm": 0.6902858018875122, "loss": 4.9827, "lr": 0.0003632167832167832, "step": 5054, "tokens_trained": 0.480383144 }, { "epoch": 1.4343262411347517, "grad_norm": 0.6887302398681641, "loss": 5.0039, "lr": 0.000362937062937063, "step": 5056, "tokens_trained": 0.480570904 }, { "epoch": 1.4348936170212765, "grad_norm": 0.7241384983062744, "loss": 4.9248, "lr": 0.00036265734265734267, "step": 5058, "tokens_trained": 0.480759136 }, { "epoch": 1.4354609929078015, "grad_norm": 0.7790824770927429, "loss": 4.9626, "lr": 0.0003623776223776224, "step": 5060, "tokens_trained": 0.480949056 }, { "epoch": 1.4360283687943263, "grad_norm": 0.8010179400444031, "loss": 4.9652, "lr": 0.0003620979020979021, "step": 5062, "tokens_trained": 0.481139536 }, { "epoch": 1.436595744680851, "grad_norm": 0.7285072803497314, "loss": 4.9414, "lr": 0.00036181818181818185, "step": 5064, "tokens_trained": 0.481328504 }, { "epoch": 1.4371631205673758, "grad_norm": 0.7610006332397461, "loss": 4.9742, "lr": 0.00036153846153846154, "step": 5066, "tokens_trained": 0.481519352 }, { "epoch": 1.4377304964539008, "grad_norm": 0.6971138715744019, "loss": 4.998, "lr": 0.0003612587412587412, "step": 5068, "tokens_trained": 0.4817108 }, { "epoch": 1.4382978723404256, "grad_norm": 0.7477296590805054, "loss": 4.9079, "lr": 0.00036097902097902097, "step": 5070, "tokens_trained": 0.481902 }, { "epoch": 1.4388652482269504, "grad_norm": 0.7010331153869629, "loss": 5.0261, "lr": 0.00036069930069930066, "step": 5072, "tokens_trained": 0.48209196 }, { "epoch": 1.4394326241134752, "grad_norm": 0.7054550647735596, "loss": 4.9881, "lr": 0.00036041958041958046, "step": 5074, "tokens_trained": 0.48228332 }, { "epoch": 1.44, "grad_norm": 0.7022992968559265, "loss": 4.9847, "lr": 0.00036013986013986015, "step": 5076, "tokens_trained": 0.482474376 }, { "epoch": 1.4405673758865247, "grad_norm": 0.716465175151825, "loss": 4.8994, "lr": 0.0003598601398601399, "step": 5078, "tokens_trained": 0.48266244 }, { "epoch": 1.4411347517730497, "grad_norm": 0.6937554478645325, "loss": 4.9677, "lr": 0.0003595804195804196, "step": 5080, "tokens_trained": 0.482851648 }, { "epoch": 1.4417021276595745, "grad_norm": 0.7124615907669067, "loss": 4.9537, "lr": 0.00035930069930069933, "step": 5082, "tokens_trained": 0.483039216 }, { "epoch": 1.4422695035460993, "grad_norm": 0.6647019386291504, "loss": 4.9518, "lr": 0.000359020979020979, "step": 5084, "tokens_trained": 0.483229912 }, { "epoch": 1.442836879432624, "grad_norm": 0.7044801712036133, "loss": 4.9696, "lr": 0.0003587412587412587, "step": 5086, "tokens_trained": 0.483419848 }, { "epoch": 1.443404255319149, "grad_norm": 0.7027294039726257, "loss": 4.9886, "lr": 0.00035846153846153846, "step": 5088, "tokens_trained": 0.483608528 }, { "epoch": 1.4439716312056738, "grad_norm": 0.7377288341522217, "loss": 4.8964, "lr": 0.00035818181818181815, "step": 5090, "tokens_trained": 0.483799232 }, { "epoch": 1.4445390070921986, "grad_norm": 0.7174035310745239, "loss": 4.9365, "lr": 0.00035790209790209794, "step": 5092, "tokens_trained": 0.48398796 }, { "epoch": 1.4451063829787234, "grad_norm": 0.7121730446815491, "loss": 4.936, "lr": 0.00035762237762237763, "step": 5094, "tokens_trained": 0.484177608 }, { "epoch": 1.4456737588652482, "grad_norm": 0.7427595853805542, "loss": 4.968, "lr": 0.0003573426573426574, "step": 5096, "tokens_trained": 0.484368912 }, { "epoch": 1.446241134751773, "grad_norm": 0.7151100635528564, "loss": 4.866, "lr": 0.00035706293706293707, "step": 5098, "tokens_trained": 0.484558872 }, { "epoch": 1.4468085106382977, "grad_norm": 0.7151250243186951, "loss": 4.9275, "lr": 0.0003567832167832168, "step": 5100, "tokens_trained": 0.484746272 }, { "epoch": 1.4473758865248227, "grad_norm": 0.7005182504653931, "loss": 4.956, "lr": 0.0003565034965034965, "step": 5102, "tokens_trained": 0.484937928 }, { "epoch": 1.4479432624113475, "grad_norm": 0.7152060270309448, "loss": 4.9544, "lr": 0.0003562237762237762, "step": 5104, "tokens_trained": 0.485127264 }, { "epoch": 1.4485106382978723, "grad_norm": 0.7763362526893616, "loss": 4.8919, "lr": 0.00035594405594405594, "step": 5106, "tokens_trained": 0.485317256 }, { "epoch": 1.4490780141843973, "grad_norm": 0.7702814936637878, "loss": 4.9243, "lr": 0.00035566433566433563, "step": 5108, "tokens_trained": 0.485507224 }, { "epoch": 1.449645390070922, "grad_norm": 0.7871324419975281, "loss": 5.0174, "lr": 0.00035538461538461543, "step": 5110, "tokens_trained": 0.485695736 }, { "epoch": 1.4502127659574469, "grad_norm": 0.7191143035888672, "loss": 4.8973, "lr": 0.0003551048951048951, "step": 5112, "tokens_trained": 0.485886528 }, { "epoch": 1.4507801418439716, "grad_norm": 0.6869152188301086, "loss": 4.894, "lr": 0.00035482517482517486, "step": 5114, "tokens_trained": 0.486074896 }, { "epoch": 1.4513475177304964, "grad_norm": 0.7272975444793701, "loss": 4.9682, "lr": 0.00035454545454545455, "step": 5116, "tokens_trained": 0.486265864 }, { "epoch": 1.4519148936170212, "grad_norm": 0.6644308567047119, "loss": 4.9707, "lr": 0.0003542657342657343, "step": 5118, "tokens_trained": 0.4864564 }, { "epoch": 1.452482269503546, "grad_norm": 0.7381615042686462, "loss": 4.9609, "lr": 0.000353986013986014, "step": 5120, "tokens_trained": 0.486645736 }, { "epoch": 1.453049645390071, "grad_norm": 0.7426425814628601, "loss": 4.9685, "lr": 0.0003537062937062937, "step": 5122, "tokens_trained": 0.486836424 }, { "epoch": 1.4536170212765958, "grad_norm": 0.682476818561554, "loss": 5.0294, "lr": 0.0003534265734265734, "step": 5124, "tokens_trained": 0.48702528 }, { "epoch": 1.4539007092198581, "eval_loss": 4.962060451507568, "eval_runtime": 20.8404, "step": 5125, "tokens_trained": 0.487120248 }, { "epoch": 1.4541843971631205, "grad_norm": 0.7397556900978088, "loss": 4.9378, "lr": 0.0003531468531468531, "step": 5126, "tokens_trained": 0.487216208 }, { "epoch": 1.4547517730496453, "grad_norm": 0.8119034171104431, "loss": 4.9408, "lr": 0.0003528671328671329, "step": 5128, "tokens_trained": 0.487405496 }, { "epoch": 1.4553191489361703, "grad_norm": 0.7072781324386597, "loss": 4.9543, "lr": 0.0003525874125874126, "step": 5130, "tokens_trained": 0.487596552 }, { "epoch": 1.455886524822695, "grad_norm": 0.839381217956543, "loss": 4.9204, "lr": 0.00035230769230769235, "step": 5132, "tokens_trained": 0.487786936 }, { "epoch": 1.4564539007092199, "grad_norm": 0.8116129636764526, "loss": 4.8954, "lr": 0.00035202797202797204, "step": 5134, "tokens_trained": 0.48797788 }, { "epoch": 1.4570212765957447, "grad_norm": 0.6917586326599121, "loss": 4.9926, "lr": 0.0003517482517482518, "step": 5136, "tokens_trained": 0.488167992 }, { "epoch": 1.4575886524822694, "grad_norm": 0.7610443830490112, "loss": 4.9908, "lr": 0.0003514685314685315, "step": 5138, "tokens_trained": 0.488357824 }, { "epoch": 1.4581560283687942, "grad_norm": 0.6879466772079468, "loss": 4.9405, "lr": 0.00035118881118881116, "step": 5140, "tokens_trained": 0.488546976 }, { "epoch": 1.4587234042553192, "grad_norm": 0.7296876311302185, "loss": 4.9302, "lr": 0.0003509090909090909, "step": 5142, "tokens_trained": 0.488736728 }, { "epoch": 1.459290780141844, "grad_norm": 0.726078987121582, "loss": 4.953, "lr": 0.0003506293706293706, "step": 5144, "tokens_trained": 0.488924216 }, { "epoch": 1.4598581560283688, "grad_norm": 0.7201434969902039, "loss": 4.9273, "lr": 0.0003503496503496504, "step": 5146, "tokens_trained": 0.489115432 }, { "epoch": 1.4604255319148935, "grad_norm": 0.7064175605773926, "loss": 4.878, "lr": 0.0003500699300699301, "step": 5148, "tokens_trained": 0.489305352 }, { "epoch": 1.4609929078014185, "grad_norm": 0.6968701481819153, "loss": 4.9705, "lr": 0.00034979020979020983, "step": 5150, "tokens_trained": 0.489495768 }, { "epoch": 1.4615602836879433, "grad_norm": 0.6772524118423462, "loss": 4.8796, "lr": 0.0003495104895104895, "step": 5152, "tokens_trained": 0.489685904 }, { "epoch": 1.462127659574468, "grad_norm": 0.6674978137016296, "loss": 4.9093, "lr": 0.00034923076923076927, "step": 5154, "tokens_trained": 0.489876272 }, { "epoch": 1.4626950354609929, "grad_norm": 0.7403509616851807, "loss": 4.9312, "lr": 0.00034895104895104896, "step": 5156, "tokens_trained": 0.490066952 }, { "epoch": 1.4632624113475177, "grad_norm": 0.7287116050720215, "loss": 4.9127, "lr": 0.00034867132867132865, "step": 5158, "tokens_trained": 0.490257024 }, { "epoch": 1.4638297872340424, "grad_norm": 0.7658629417419434, "loss": 4.9002, "lr": 0.0003483916083916084, "step": 5160, "tokens_trained": 0.490445832 }, { "epoch": 1.4643971631205674, "grad_norm": 0.7551032304763794, "loss": 4.9557, "lr": 0.0003481118881118881, "step": 5162, "tokens_trained": 0.490636 }, { "epoch": 1.4649645390070922, "grad_norm": 0.6556824445724487, "loss": 4.968, "lr": 0.0003478321678321678, "step": 5164, "tokens_trained": 0.490826024 }, { "epoch": 1.465531914893617, "grad_norm": 0.6782544255256653, "loss": 4.9682, "lr": 0.00034755244755244757, "step": 5166, "tokens_trained": 0.49101488 }, { "epoch": 1.4660992907801418, "grad_norm": 0.6604062914848328, "loss": 4.9523, "lr": 0.0003472727272727273, "step": 5168, "tokens_trained": 0.491205576 }, { "epoch": 1.4666666666666668, "grad_norm": 0.7062127590179443, "loss": 4.9148, "lr": 0.000346993006993007, "step": 5170, "tokens_trained": 0.491397208 }, { "epoch": 1.4672340425531916, "grad_norm": 0.7056037187576294, "loss": 4.9568, "lr": 0.00034671328671328675, "step": 5172, "tokens_trained": 0.491588976 }, { "epoch": 1.4678014184397163, "grad_norm": 0.6791336536407471, "loss": 4.9242, "lr": 0.00034643356643356644, "step": 5174, "tokens_trained": 0.491779592 }, { "epoch": 1.4683687943262411, "grad_norm": 0.694888710975647, "loss": 4.9874, "lr": 0.00034615384615384613, "step": 5176, "tokens_trained": 0.49197004 }, { "epoch": 1.468936170212766, "grad_norm": 0.7048712968826294, "loss": 4.914, "lr": 0.0003458741258741259, "step": 5178, "tokens_trained": 0.492157104 }, { "epoch": 1.4695035460992907, "grad_norm": 0.6525787711143494, "loss": 4.9394, "lr": 0.00034559440559440557, "step": 5180, "tokens_trained": 0.492347352 }, { "epoch": 1.4700709219858157, "grad_norm": 0.719822883605957, "loss": 4.9142, "lr": 0.0003453146853146853, "step": 5182, "tokens_trained": 0.492540024 }, { "epoch": 1.4706382978723405, "grad_norm": 0.6324074268341064, "loss": 5.0143, "lr": 0.00034503496503496506, "step": 5184, "tokens_trained": 0.49273128 }, { "epoch": 1.4712056737588652, "grad_norm": 0.7017198204994202, "loss": 4.9955, "lr": 0.0003447552447552448, "step": 5186, "tokens_trained": 0.492921624 }, { "epoch": 1.47177304964539, "grad_norm": 0.6721644997596741, "loss": 4.8586, "lr": 0.0003444755244755245, "step": 5188, "tokens_trained": 0.493110224 }, { "epoch": 1.472340425531915, "grad_norm": 0.6856983304023743, "loss": 4.9897, "lr": 0.0003441958041958042, "step": 5190, "tokens_trained": 0.493301296 }, { "epoch": 1.4729078014184398, "grad_norm": 0.7391275763511658, "loss": 4.9807, "lr": 0.0003439160839160839, "step": 5192, "tokens_trained": 0.493490904 }, { "epoch": 1.4734751773049646, "grad_norm": 0.7362062931060791, "loss": 4.9474, "lr": 0.0003436363636363636, "step": 5194, "tokens_trained": 0.493680368 }, { "epoch": 1.4740425531914894, "grad_norm": 0.7283117175102234, "loss": 4.9375, "lr": 0.00034335664335664336, "step": 5196, "tokens_trained": 0.493869344 }, { "epoch": 1.4746099290780141, "grad_norm": 0.6644704937934875, "loss": 5.0159, "lr": 0.00034307692307692305, "step": 5198, "tokens_trained": 0.494059664 }, { "epoch": 1.475177304964539, "grad_norm": 0.7252303957939148, "loss": 4.9307, "lr": 0.0003427972027972028, "step": 5200, "tokens_trained": 0.494250216 }, { "epoch": 1.4757446808510637, "grad_norm": 0.6854642033576965, "loss": 4.9407, "lr": 0.00034251748251748254, "step": 5202, "tokens_trained": 0.494437472 }, { "epoch": 1.4763120567375887, "grad_norm": 0.7645247578620911, "loss": 4.9242, "lr": 0.0003422377622377623, "step": 5204, "tokens_trained": 0.494627872 }, { "epoch": 1.4768794326241135, "grad_norm": 0.7982824444770813, "loss": 4.9241, "lr": 0.000341958041958042, "step": 5206, "tokens_trained": 0.494819832 }, { "epoch": 1.4774468085106383, "grad_norm": 0.7241318225860596, "loss": 4.9255, "lr": 0.00034167832167832167, "step": 5208, "tokens_trained": 0.495010952 }, { "epoch": 1.4780141843971633, "grad_norm": 0.7253429293632507, "loss": 4.9332, "lr": 0.0003413986013986014, "step": 5210, "tokens_trained": 0.49520108 }, { "epoch": 1.478581560283688, "grad_norm": 0.7978675365447998, "loss": 4.9115, "lr": 0.0003411188811188811, "step": 5212, "tokens_trained": 0.495390568 }, { "epoch": 1.4791489361702128, "grad_norm": 0.7228849530220032, "loss": 4.9242, "lr": 0.00034083916083916084, "step": 5214, "tokens_trained": 0.495579944 }, { "epoch": 1.4797163120567376, "grad_norm": 0.6821274757385254, "loss": 4.9012, "lr": 0.00034055944055944054, "step": 5216, "tokens_trained": 0.495770832 }, { "epoch": 1.4802836879432624, "grad_norm": 0.7085686922073364, "loss": 4.8695, "lr": 0.0003402797202797203, "step": 5218, "tokens_trained": 0.495959744 }, { "epoch": 1.4808510638297872, "grad_norm": 0.6809284090995789, "loss": 4.9484, "lr": 0.00034, "step": 5220, "tokens_trained": 0.496149904 }, { "epoch": 1.481418439716312, "grad_norm": 0.8035463690757751, "loss": 4.9195, "lr": 0.00033972027972027977, "step": 5222, "tokens_trained": 0.496339944 }, { "epoch": 1.481985815602837, "grad_norm": 0.6803924441337585, "loss": 5.0083, "lr": 0.00033944055944055946, "step": 5224, "tokens_trained": 0.49653112 }, { "epoch": 1.4825531914893617, "grad_norm": 0.7047116756439209, "loss": 4.9358, "lr": 0.00033916083916083915, "step": 5226, "tokens_trained": 0.496722816 }, { "epoch": 1.4831205673758865, "grad_norm": 0.6624785661697388, "loss": 4.9749, "lr": 0.0003388811188811189, "step": 5228, "tokens_trained": 0.496914168 }, { "epoch": 1.4836879432624113, "grad_norm": 0.7224833965301514, "loss": 4.9043, "lr": 0.0003386013986013986, "step": 5230, "tokens_trained": 0.497105128 }, { "epoch": 1.4842553191489363, "grad_norm": 0.7224262952804565, "loss": 4.9132, "lr": 0.00033832167832167833, "step": 5232, "tokens_trained": 0.497294728 }, { "epoch": 1.484822695035461, "grad_norm": 0.7377181053161621, "loss": 4.9376, "lr": 0.000338041958041958, "step": 5234, "tokens_trained": 0.49748684 }, { "epoch": 1.4853900709219858, "grad_norm": 0.6763118505477905, "loss": 4.8603, "lr": 0.00033776223776223776, "step": 5236, "tokens_trained": 0.49767768 }, { "epoch": 1.4859574468085106, "grad_norm": 0.6546086668968201, "loss": 4.9397, "lr": 0.0003374825174825175, "step": 5238, "tokens_trained": 0.49786732 }, { "epoch": 1.4865248226950354, "grad_norm": 0.6710076928138733, "loss": 4.9352, "lr": 0.00033720279720279725, "step": 5240, "tokens_trained": 0.49805788 }, { "epoch": 1.4870921985815602, "grad_norm": 0.6867020726203918, "loss": 4.9736, "lr": 0.00033692307692307694, "step": 5242, "tokens_trained": 0.498249072 }, { "epoch": 1.4876595744680852, "grad_norm": 0.7198293209075928, "loss": 4.951, "lr": 0.00033664335664335663, "step": 5244, "tokens_trained": 0.498438568 }, { "epoch": 1.48822695035461, "grad_norm": 0.7505615949630737, "loss": 4.9478, "lr": 0.0003363636363636364, "step": 5246, "tokens_trained": 0.498628128 }, { "epoch": 1.4887943262411347, "grad_norm": 0.7085391879081726, "loss": 4.9528, "lr": 0.00033608391608391607, "step": 5248, "tokens_trained": 0.498817976 }, { "epoch": 1.4893617021276595, "grad_norm": 0.6769458651542664, "loss": 4.9422, "lr": 0.0003358041958041958, "step": 5250, "tokens_trained": 0.499007344 }, { "epoch": 1.4893617021276595, "eval_loss": 4.961794376373291, "eval_runtime": 20.5965, "step": 5250, "tokens_trained": 0.499007344 }, { "epoch": 1.4899290780141845, "grad_norm": 0.703074038028717, "loss": 4.9623, "lr": 0.0003355244755244755, "step": 5252, "tokens_trained": 0.499197168 }, { "epoch": 1.4904964539007093, "grad_norm": 0.7225844264030457, "loss": 4.9225, "lr": 0.00033524475524475525, "step": 5254, "tokens_trained": 0.499387424 }, { "epoch": 1.491063829787234, "grad_norm": 0.7070515751838684, "loss": 4.9978, "lr": 0.000334965034965035, "step": 5256, "tokens_trained": 0.49957504 }, { "epoch": 1.4916312056737588, "grad_norm": 0.6716368794441223, "loss": 4.9366, "lr": 0.00033468531468531474, "step": 5258, "tokens_trained": 0.499765288 }, { "epoch": 1.4921985815602836, "grad_norm": 0.6873813271522522, "loss": 4.9765, "lr": 0.00033440559440559443, "step": 5260, "tokens_trained": 0.499955856 }, { "epoch": 1.4927659574468084, "grad_norm": 0.7044195532798767, "loss": 4.9801, "lr": 0.0003341258741258741, "step": 5262, "tokens_trained": 0.500144536 }, { "epoch": 1.4933333333333334, "grad_norm": 0.7457751631736755, "loss": 4.9185, "lr": 0.00033384615384615386, "step": 5264, "tokens_trained": 0.500335344 }, { "epoch": 1.4939007092198582, "grad_norm": 0.6479538679122925, "loss": 4.8611, "lr": 0.00033356643356643355, "step": 5266, "tokens_trained": 0.50052428 }, { "epoch": 1.494468085106383, "grad_norm": 0.7350703477859497, "loss": 5.0101, "lr": 0.0003332867132867133, "step": 5268, "tokens_trained": 0.500715368 }, { "epoch": 1.4950354609929077, "grad_norm": 0.6749465465545654, "loss": 4.9189, "lr": 0.000333006993006993, "step": 5270, "tokens_trained": 0.5009054 }, { "epoch": 1.4956028368794327, "grad_norm": 0.7268565893173218, "loss": 4.9873, "lr": 0.00033272727272727273, "step": 5272, "tokens_trained": 0.501095304 }, { "epoch": 1.4961702127659575, "grad_norm": 0.7625619769096375, "loss": 4.9759, "lr": 0.0003324475524475525, "step": 5274, "tokens_trained": 0.501286024 }, { "epoch": 1.4967375886524823, "grad_norm": 0.7040157318115234, "loss": 4.8602, "lr": 0.0003321678321678322, "step": 5276, "tokens_trained": 0.501473928 }, { "epoch": 1.497304964539007, "grad_norm": 0.7228814363479614, "loss": 4.9073, "lr": 0.0003318881118881119, "step": 5278, "tokens_trained": 0.501665544 }, { "epoch": 1.4978723404255319, "grad_norm": 0.657363772392273, "loss": 4.9149, "lr": 0.0003316083916083916, "step": 5280, "tokens_trained": 0.501855568 }, { "epoch": 1.4984397163120566, "grad_norm": 0.7363243699073792, "loss": 4.9329, "lr": 0.00033132867132867135, "step": 5282, "tokens_trained": 0.5020458 }, { "epoch": 1.4990070921985816, "grad_norm": 0.7383313179016113, "loss": 4.9492, "lr": 0.00033104895104895104, "step": 5284, "tokens_trained": 0.50223592 }, { "epoch": 1.4995744680851064, "grad_norm": 0.7142005562782288, "loss": 4.9452, "lr": 0.0003307692307692308, "step": 5286, "tokens_trained": 0.502425872 }, { "epoch": 1.5001418439716312, "grad_norm": 0.7596315145492554, "loss": 5.0124, "lr": 0.00033048951048951047, "step": 5288, "tokens_trained": 0.502616216 }, { "epoch": 1.500709219858156, "grad_norm": 0.6722792983055115, "loss": 5.0299, "lr": 0.0003302097902097902, "step": 5290, "tokens_trained": 0.502805016 }, { "epoch": 1.501276595744681, "grad_norm": 0.7129888534545898, "loss": 4.9328, "lr": 0.0003299300699300699, "step": 5292, "tokens_trained": 0.50299348 }, { "epoch": 1.5018439716312058, "grad_norm": 0.692414402961731, "loss": 4.9892, "lr": 0.0003296503496503497, "step": 5294, "tokens_trained": 0.503183344 }, { "epoch": 1.5024113475177305, "grad_norm": 0.7669686079025269, "loss": 4.9678, "lr": 0.0003293706293706294, "step": 5296, "tokens_trained": 0.503371672 }, { "epoch": 1.5029787234042553, "grad_norm": 0.7279202342033386, "loss": 4.8896, "lr": 0.0003290909090909091, "step": 5298, "tokens_trained": 0.503561568 }, { "epoch": 1.50354609929078, "grad_norm": 0.7096788883209229, "loss": 4.9277, "lr": 0.00032881118881118883, "step": 5300, "tokens_trained": 0.50375168 }, { "epoch": 1.5041134751773049, "grad_norm": 0.6873319745063782, "loss": 4.9328, "lr": 0.0003285314685314685, "step": 5302, "tokens_trained": 0.503940848 }, { "epoch": 1.5046808510638296, "grad_norm": 0.7552497982978821, "loss": 4.9348, "lr": 0.00032825174825174827, "step": 5304, "tokens_trained": 0.504129936 }, { "epoch": 1.5052482269503547, "grad_norm": 0.7070790529251099, "loss": 4.9917, "lr": 0.00032797202797202796, "step": 5306, "tokens_trained": 0.504320448 }, { "epoch": 1.5058156028368794, "grad_norm": 0.716891348361969, "loss": 4.9646, "lr": 0.0003276923076923077, "step": 5308, "tokens_trained": 0.504509208 }, { "epoch": 1.5063829787234042, "grad_norm": 0.6974935531616211, "loss": 4.9859, "lr": 0.0003274125874125874, "step": 5310, "tokens_trained": 0.504699424 }, { "epoch": 1.5069503546099292, "grad_norm": 0.6656584143638611, "loss": 5.018, "lr": 0.00032713286713286714, "step": 5312, "tokens_trained": 0.504891008 }, { "epoch": 1.507517730496454, "grad_norm": 0.7303102016448975, "loss": 4.9347, "lr": 0.0003268531468531469, "step": 5314, "tokens_trained": 0.505079936 }, { "epoch": 1.5080851063829788, "grad_norm": 0.6668216586112976, "loss": 4.94, "lr": 0.00032657342657342657, "step": 5316, "tokens_trained": 0.50527092 }, { "epoch": 1.5086524822695035, "grad_norm": 0.6867246031761169, "loss": 4.9748, "lr": 0.0003262937062937063, "step": 5318, "tokens_trained": 0.505460696 }, { "epoch": 1.5092198581560283, "grad_norm": 0.6879433393478394, "loss": 4.9788, "lr": 0.000326013986013986, "step": 5320, "tokens_trained": 0.505651704 }, { "epoch": 1.509787234042553, "grad_norm": 0.6238239407539368, "loss": 4.9144, "lr": 0.00032573426573426575, "step": 5322, "tokens_trained": 0.505842712 }, { "epoch": 1.5103546099290779, "grad_norm": 0.6932456493377686, "loss": 4.9553, "lr": 0.00032545454545454544, "step": 5324, "tokens_trained": 0.50603356 }, { "epoch": 1.5109219858156029, "grad_norm": 0.6765711307525635, "loss": 4.8932, "lr": 0.0003251748251748252, "step": 5326, "tokens_trained": 0.506223432 }, { "epoch": 1.5114893617021277, "grad_norm": 0.6961528658866882, "loss": 4.9832, "lr": 0.0003248951048951049, "step": 5328, "tokens_trained": 0.506415744 }, { "epoch": 1.5120567375886524, "grad_norm": 0.6740815043449402, "loss": 4.9249, "lr": 0.0003246153846153846, "step": 5330, "tokens_trained": 0.506606464 }, { "epoch": 1.5126241134751774, "grad_norm": 0.7454788684844971, "loss": 4.994, "lr": 0.00032433566433566436, "step": 5332, "tokens_trained": 0.50679476 }, { "epoch": 1.5131914893617022, "grad_norm": 0.6879718899726868, "loss": 4.9346, "lr": 0.00032405594405594406, "step": 5334, "tokens_trained": 0.506985656 }, { "epoch": 1.513758865248227, "grad_norm": 0.7361749410629272, "loss": 4.9389, "lr": 0.0003237762237762238, "step": 5336, "tokens_trained": 0.507172712 }, { "epoch": 1.5143262411347518, "grad_norm": 0.6871389150619507, "loss": 4.9199, "lr": 0.0003234965034965035, "step": 5338, "tokens_trained": 0.507362736 }, { "epoch": 1.5148936170212766, "grad_norm": 0.7587972283363342, "loss": 4.9011, "lr": 0.00032321678321678323, "step": 5340, "tokens_trained": 0.507553832 }, { "epoch": 1.5154609929078013, "grad_norm": 0.7415489554405212, "loss": 4.9605, "lr": 0.0003229370629370629, "step": 5342, "tokens_trained": 0.507745568 }, { "epoch": 1.5160283687943261, "grad_norm": 0.652815580368042, "loss": 4.9904, "lr": 0.00032265734265734267, "step": 5344, "tokens_trained": 0.507935952 }, { "epoch": 1.516595744680851, "grad_norm": 0.6617633700370789, "loss": 4.9636, "lr": 0.00032237762237762236, "step": 5346, "tokens_trained": 0.508127672 }, { "epoch": 1.517163120567376, "grad_norm": 0.7277817726135254, "loss": 4.9886, "lr": 0.0003220979020979021, "step": 5348, "tokens_trained": 0.508316872 }, { "epoch": 1.5177304964539007, "grad_norm": 0.702679455280304, "loss": 4.9731, "lr": 0.00032181818181818185, "step": 5350, "tokens_trained": 0.508509352 }, { "epoch": 1.5182978723404257, "grad_norm": 0.6981706023216248, "loss": 4.9026, "lr": 0.00032153846153846154, "step": 5352, "tokens_trained": 0.508698856 }, { "epoch": 1.5188652482269505, "grad_norm": 0.6164356470108032, "loss": 4.9847, "lr": 0.0003212587412587413, "step": 5354, "tokens_trained": 0.508887864 }, { "epoch": 1.5194326241134752, "grad_norm": 0.6921977996826172, "loss": 4.9873, "lr": 0.000320979020979021, "step": 5356, "tokens_trained": 0.509076184 }, { "epoch": 1.52, "grad_norm": 0.623582124710083, "loss": 4.9509, "lr": 0.0003206993006993007, "step": 5358, "tokens_trained": 0.509265936 }, { "epoch": 1.5205673758865248, "grad_norm": 0.67270427942276, "loss": 4.954, "lr": 0.0003204195804195804, "step": 5360, "tokens_trained": 0.509454856 }, { "epoch": 1.5211347517730496, "grad_norm": 0.6615780591964722, "loss": 4.9553, "lr": 0.00032013986013986015, "step": 5362, "tokens_trained": 0.509646208 }, { "epoch": 1.5217021276595744, "grad_norm": 0.6998571157455444, "loss": 4.8435, "lr": 0.00031986013986013984, "step": 5364, "tokens_trained": 0.5098372 }, { "epoch": 1.5222695035460991, "grad_norm": 0.7160391807556152, "loss": 4.98, "lr": 0.0003195804195804196, "step": 5366, "tokens_trained": 0.510027392 }, { "epoch": 1.5228368794326241, "grad_norm": 0.6850275993347168, "loss": 4.8923, "lr": 0.00031930069930069933, "step": 5368, "tokens_trained": 0.51021744 }, { "epoch": 1.523404255319149, "grad_norm": 0.7021211385726929, "loss": 4.983, "lr": 0.000319020979020979, "step": 5370, "tokens_trained": 0.510407656 }, { "epoch": 1.523971631205674, "grad_norm": 0.774710476398468, "loss": 4.9424, "lr": 0.00031874125874125877, "step": 5372, "tokens_trained": 0.510599136 }, { "epoch": 1.5245390070921987, "grad_norm": 0.7655723094940186, "loss": 4.9945, "lr": 0.00031846153846153846, "step": 5374, "tokens_trained": 0.510788296 }, { "epoch": 1.524822695035461, "eval_loss": 4.955821990966797, "eval_runtime": 20.6924, "step": 5375, "tokens_trained": 0.5108826 }, { "epoch": 1.5251063829787235, "grad_norm": 0.7002980709075928, "loss": 4.9169, "lr": 0.0003181818181818182, "step": 5376, "tokens_trained": 0.510976216 }, { "epoch": 1.5256737588652483, "grad_norm": 0.7407814264297485, "loss": 4.9083, "lr": 0.0003179020979020979, "step": 5378, "tokens_trained": 0.51116604 }, { "epoch": 1.526241134751773, "grad_norm": 0.6531659960746765, "loss": 4.962, "lr": 0.00031762237762237764, "step": 5380, "tokens_trained": 0.511356032 }, { "epoch": 1.5268085106382978, "grad_norm": 0.7520626783370972, "loss": 4.9729, "lr": 0.00031734265734265733, "step": 5382, "tokens_trained": 0.511546912 }, { "epoch": 1.5273758865248226, "grad_norm": 0.711840808391571, "loss": 4.8914, "lr": 0.0003170629370629371, "step": 5384, "tokens_trained": 0.511735008 }, { "epoch": 1.5279432624113474, "grad_norm": 0.7245994210243225, "loss": 4.9847, "lr": 0.0003167832167832168, "step": 5386, "tokens_trained": 0.511924464 }, { "epoch": 1.5285106382978724, "grad_norm": 0.6797400712966919, "loss": 4.9264, "lr": 0.0003165034965034965, "step": 5388, "tokens_trained": 0.512114576 }, { "epoch": 1.5290780141843971, "grad_norm": 0.6849039196968079, "loss": 4.9869, "lr": 0.00031622377622377625, "step": 5390, "tokens_trained": 0.5123038 }, { "epoch": 1.5296453900709222, "grad_norm": 0.6865547299385071, "loss": 4.939, "lr": 0.00031594405594405594, "step": 5392, "tokens_trained": 0.512492256 }, { "epoch": 1.530212765957447, "grad_norm": 0.6928399801254272, "loss": 4.9971, "lr": 0.0003156643356643357, "step": 5394, "tokens_trained": 0.512681984 }, { "epoch": 1.5307801418439717, "grad_norm": 0.6588703989982605, "loss": 4.9954, "lr": 0.0003153846153846154, "step": 5396, "tokens_trained": 0.51287404 }, { "epoch": 1.5313475177304965, "grad_norm": 0.6540804505348206, "loss": 4.9124, "lr": 0.00031510489510489507, "step": 5398, "tokens_trained": 0.513062224 }, { "epoch": 1.5319148936170213, "grad_norm": 0.6257782578468323, "loss": 4.8895, "lr": 0.0003148251748251748, "step": 5400, "tokens_trained": 0.513253072 }, { "epoch": 1.532482269503546, "grad_norm": 0.7582085132598877, "loss": 4.9873, "lr": 0.00031454545454545456, "step": 5402, "tokens_trained": 0.513444424 }, { "epoch": 1.5330496453900708, "grad_norm": 0.6824280023574829, "loss": 4.9706, "lr": 0.0003142657342657343, "step": 5404, "tokens_trained": 0.513632392 }, { "epoch": 1.5336170212765956, "grad_norm": 0.7039311528205872, "loss": 4.9045, "lr": 0.000313986013986014, "step": 5406, "tokens_trained": 0.513821312 }, { "epoch": 1.5341843971631206, "grad_norm": 0.7429019808769226, "loss": 4.9177, "lr": 0.00031370629370629374, "step": 5408, "tokens_trained": 0.514013672 }, { "epoch": 1.5347517730496454, "grad_norm": 0.7027938365936279, "loss": 4.973, "lr": 0.00031342657342657343, "step": 5410, "tokens_trained": 0.514204496 }, { "epoch": 1.5353191489361702, "grad_norm": 0.6951351165771484, "loss": 4.9362, "lr": 0.00031314685314685317, "step": 5412, "tokens_trained": 0.514396032 }, { "epoch": 1.5358865248226952, "grad_norm": 0.669873833656311, "loss": 4.9528, "lr": 0.00031286713286713286, "step": 5414, "tokens_trained": 0.514585336 }, { "epoch": 1.53645390070922, "grad_norm": 0.713265597820282, "loss": 4.9212, "lr": 0.00031258741258741255, "step": 5416, "tokens_trained": 0.514775768 }, { "epoch": 1.5370212765957447, "grad_norm": 0.6735701560974121, "loss": 4.9472, "lr": 0.0003123076923076923, "step": 5418, "tokens_trained": 0.514966152 }, { "epoch": 1.5375886524822695, "grad_norm": 0.6668660044670105, "loss": 4.9723, "lr": 0.000312027972027972, "step": 5420, "tokens_trained": 0.515156704 }, { "epoch": 1.5381560283687943, "grad_norm": 0.6513842940330505, "loss": 4.9932, "lr": 0.0003117482517482518, "step": 5422, "tokens_trained": 0.515346472 }, { "epoch": 1.538723404255319, "grad_norm": 0.6403244733810425, "loss": 4.8578, "lr": 0.0003114685314685315, "step": 5424, "tokens_trained": 0.515534712 }, { "epoch": 1.5392907801418438, "grad_norm": 0.7064723372459412, "loss": 4.9676, "lr": 0.0003111888111888112, "step": 5426, "tokens_trained": 0.515722064 }, { "epoch": 1.5398581560283688, "grad_norm": 0.7145774364471436, "loss": 4.9483, "lr": 0.0003109090909090909, "step": 5428, "tokens_trained": 0.515913768 }, { "epoch": 1.5404255319148936, "grad_norm": 0.7140977382659912, "loss": 4.9531, "lr": 0.00031062937062937066, "step": 5430, "tokens_trained": 0.516104296 }, { "epoch": 1.5409929078014184, "grad_norm": 0.6778871417045593, "loss": 4.8843, "lr": 0.00031034965034965035, "step": 5432, "tokens_trained": 0.516291816 }, { "epoch": 1.5415602836879434, "grad_norm": 0.7036442160606384, "loss": 4.9779, "lr": 0.00031006993006993004, "step": 5434, "tokens_trained": 0.516483368 }, { "epoch": 1.5421276595744682, "grad_norm": 0.6736776232719421, "loss": 4.8892, "lr": 0.0003097902097902098, "step": 5436, "tokens_trained": 0.516673328 }, { "epoch": 1.542695035460993, "grad_norm": 0.7328810691833496, "loss": 5.0193, "lr": 0.00030951048951048947, "step": 5438, "tokens_trained": 0.516864888 }, { "epoch": 1.5432624113475177, "grad_norm": 0.6792168021202087, "loss": 4.8394, "lr": 0.00030923076923076927, "step": 5440, "tokens_trained": 0.517054416 }, { "epoch": 1.5438297872340425, "grad_norm": 0.7632612586021423, "loss": 4.9669, "lr": 0.00030895104895104896, "step": 5442, "tokens_trained": 0.517244824 }, { "epoch": 1.5443971631205673, "grad_norm": 0.7215416431427002, "loss": 4.8858, "lr": 0.0003086713286713287, "step": 5444, "tokens_trained": 0.517435952 }, { "epoch": 1.544964539007092, "grad_norm": 0.682598352432251, "loss": 4.9424, "lr": 0.0003083916083916084, "step": 5446, "tokens_trained": 0.517626944 }, { "epoch": 1.545531914893617, "grad_norm": 0.726121187210083, "loss": 4.9332, "lr": 0.00030811188811188814, "step": 5448, "tokens_trained": 0.517818208 }, { "epoch": 1.5460992907801419, "grad_norm": 0.6898934841156006, "loss": 4.939, "lr": 0.00030783216783216783, "step": 5450, "tokens_trained": 0.518008272 }, { "epoch": 1.5466666666666666, "grad_norm": 0.723687469959259, "loss": 4.9424, "lr": 0.0003075524475524475, "step": 5452, "tokens_trained": 0.518199648 }, { "epoch": 1.5472340425531916, "grad_norm": 0.6857745051383972, "loss": 4.9438, "lr": 0.00030727272727272727, "step": 5454, "tokens_trained": 0.518389736 }, { "epoch": 1.5478014184397164, "grad_norm": 0.7015530467033386, "loss": 4.9858, "lr": 0.00030699300699300696, "step": 5456, "tokens_trained": 0.518580544 }, { "epoch": 1.5483687943262412, "grad_norm": 0.7432133555412292, "loss": 4.9272, "lr": 0.00030671328671328675, "step": 5458, "tokens_trained": 0.518769088 }, { "epoch": 1.548936170212766, "grad_norm": 0.6339504718780518, "loss": 4.9458, "lr": 0.00030643356643356645, "step": 5460, "tokens_trained": 0.51896068 }, { "epoch": 1.5495035460992908, "grad_norm": 0.6800563931465149, "loss": 4.9094, "lr": 0.0003061538461538462, "step": 5462, "tokens_trained": 0.519151952 }, { "epoch": 1.5500709219858155, "grad_norm": 0.668978214263916, "loss": 4.961, "lr": 0.0003058741258741259, "step": 5464, "tokens_trained": 0.519339696 }, { "epoch": 1.5506382978723403, "grad_norm": 0.7214268445968628, "loss": 4.9676, "lr": 0.0003055944055944056, "step": 5466, "tokens_trained": 0.519531888 }, { "epoch": 1.551205673758865, "grad_norm": 0.7074905633926392, "loss": 4.9774, "lr": 0.0003053146853146853, "step": 5468, "tokens_trained": 0.51972192 }, { "epoch": 1.55177304964539, "grad_norm": 0.6255798935890198, "loss": 4.9697, "lr": 0.000305034965034965, "step": 5470, "tokens_trained": 0.519913584 }, { "epoch": 1.5523404255319149, "grad_norm": 0.6483328938484192, "loss": 4.933, "lr": 0.00030475524475524475, "step": 5472, "tokens_trained": 0.520102352 }, { "epoch": 1.5529078014184399, "grad_norm": 0.6571099758148193, "loss": 4.929, "lr": 0.00030447552447552444, "step": 5474, "tokens_trained": 0.520290072 }, { "epoch": 1.5534751773049646, "grad_norm": 0.6606305241584778, "loss": 4.8969, "lr": 0.00030419580419580424, "step": 5476, "tokens_trained": 0.520480936 }, { "epoch": 1.5540425531914894, "grad_norm": 0.6898429989814758, "loss": 4.8674, "lr": 0.00030391608391608393, "step": 5478, "tokens_trained": 0.520672488 }, { "epoch": 1.5546099290780142, "grad_norm": 0.6592644453048706, "loss": 4.9508, "lr": 0.0003036363636363637, "step": 5480, "tokens_trained": 0.520861416 }, { "epoch": 1.555177304964539, "grad_norm": 0.655953049659729, "loss": 4.984, "lr": 0.00030335664335664336, "step": 5482, "tokens_trained": 0.521050896 }, { "epoch": 1.5557446808510638, "grad_norm": 0.6819510459899902, "loss": 4.9191, "lr": 0.0003030769230769231, "step": 5484, "tokens_trained": 0.521240968 }, { "epoch": 1.5563120567375885, "grad_norm": 0.6007876992225647, "loss": 4.9466, "lr": 0.0003027972027972028, "step": 5486, "tokens_trained": 0.521432128 }, { "epoch": 1.5568794326241133, "grad_norm": 0.695093035697937, "loss": 4.917, "lr": 0.0003025174825174825, "step": 5488, "tokens_trained": 0.521620344 }, { "epoch": 1.5574468085106383, "grad_norm": 0.6258317828178406, "loss": 4.9805, "lr": 0.00030223776223776223, "step": 5490, "tokens_trained": 0.521809136 }, { "epoch": 1.558014184397163, "grad_norm": 0.6752353310585022, "loss": 4.9337, "lr": 0.0003019580419580419, "step": 5492, "tokens_trained": 0.52199984 }, { "epoch": 1.558581560283688, "grad_norm": 0.6715705394744873, "loss": 4.9365, "lr": 0.0003016783216783217, "step": 5494, "tokens_trained": 0.5221894 }, { "epoch": 1.5591489361702129, "grad_norm": 0.6483761668205261, "loss": 4.9399, "lr": 0.0003013986013986014, "step": 5496, "tokens_trained": 0.522378232 }, { "epoch": 1.5597163120567377, "grad_norm": 0.6588965654373169, "loss": 4.9542, "lr": 0.00030111888111888116, "step": 5498, "tokens_trained": 0.522567912 }, { "epoch": 1.5602836879432624, "grad_norm": 0.6552091836929321, "loss": 5.0228, "lr": 0.00030083916083916085, "step": 5500, "tokens_trained": 0.522758864 }, { "epoch": 1.5602836879432624, "eval_loss": 4.952324390411377, "eval_runtime": 20.7411, "step": 5500, "tokens_trained": 0.522758864 }, { "epoch": 1.5608510638297872, "grad_norm": 0.6408063769340515, "loss": 4.9222, "lr": 0.0003005594405594406, "step": 5502, "tokens_trained": 0.522948072 }, { "epoch": 1.561418439716312, "grad_norm": 0.6870996356010437, "loss": 4.9752, "lr": 0.0003002797202797203, "step": 5504, "tokens_trained": 0.52313788 }, { "epoch": 1.5619858156028368, "grad_norm": 0.6667518019676208, "loss": 4.9168, "lr": 0.0003, "step": 5506, "tokens_trained": 0.523329992 }, { "epoch": 1.5625531914893616, "grad_norm": 0.702208936214447, "loss": 4.933, "lr": 0.0002997202797202797, "step": 5508, "tokens_trained": 0.523520472 }, { "epoch": 1.5631205673758866, "grad_norm": 0.6981185674667358, "loss": 4.8785, "lr": 0.0002994405594405594, "step": 5510, "tokens_trained": 0.523710392 }, { "epoch": 1.5636879432624113, "grad_norm": 0.702541172504425, "loss": 4.935, "lr": 0.0002991608391608392, "step": 5512, "tokens_trained": 0.523899808 }, { "epoch": 1.5642553191489361, "grad_norm": 0.7196447253227234, "loss": 4.8815, "lr": 0.0002988811188811189, "step": 5514, "tokens_trained": 0.524092104 }, { "epoch": 1.5648226950354611, "grad_norm": 0.6708106398582458, "loss": 4.956, "lr": 0.00029860139860139864, "step": 5516, "tokens_trained": 0.524282736 }, { "epoch": 1.565390070921986, "grad_norm": 0.7550585269927979, "loss": 4.9753, "lr": 0.00029832167832167833, "step": 5518, "tokens_trained": 0.524474136 }, { "epoch": 1.5659574468085107, "grad_norm": 0.6838743686676025, "loss": 4.8614, "lr": 0.000298041958041958, "step": 5520, "tokens_trained": 0.524664808 }, { "epoch": 1.5665248226950355, "grad_norm": 0.6623738408088684, "loss": 4.9764, "lr": 0.00029776223776223777, "step": 5522, "tokens_trained": 0.524854544 }, { "epoch": 1.5670921985815602, "grad_norm": 0.6545519232749939, "loss": 4.8457, "lr": 0.00029748251748251746, "step": 5524, "tokens_trained": 0.5250424 }, { "epoch": 1.567659574468085, "grad_norm": 0.7389140725135803, "loss": 4.9146, "lr": 0.0002972027972027972, "step": 5526, "tokens_trained": 0.525230992 }, { "epoch": 1.5682269503546098, "grad_norm": 0.7217056155204773, "loss": 4.9467, "lr": 0.0002969230769230769, "step": 5528, "tokens_trained": 0.525420856 }, { "epoch": 1.5687943262411348, "grad_norm": 0.6547122597694397, "loss": 4.9329, "lr": 0.0002966433566433567, "step": 5530, "tokens_trained": 0.52560836 }, { "epoch": 1.5693617021276596, "grad_norm": 0.666806697845459, "loss": 4.9834, "lr": 0.0002963636363636364, "step": 5532, "tokens_trained": 0.525800296 }, { "epoch": 1.5699290780141844, "grad_norm": 0.6413487195968628, "loss": 4.962, "lr": 0.0002960839160839161, "step": 5534, "tokens_trained": 0.525990048 }, { "epoch": 1.5704964539007094, "grad_norm": 0.6668083667755127, "loss": 4.8913, "lr": 0.0002958041958041958, "step": 5536, "tokens_trained": 0.526179944 }, { "epoch": 1.5710638297872341, "grad_norm": 0.7005071640014648, "loss": 4.9072, "lr": 0.0002955244755244755, "step": 5538, "tokens_trained": 0.526370832 }, { "epoch": 1.571631205673759, "grad_norm": 0.6637274622917175, "loss": 4.8715, "lr": 0.00029524475524475525, "step": 5540, "tokens_trained": 0.526559848 }, { "epoch": 1.5721985815602837, "grad_norm": 0.7049083113670349, "loss": 4.9219, "lr": 0.00029496503496503494, "step": 5542, "tokens_trained": 0.526748064 }, { "epoch": 1.5727659574468085, "grad_norm": 0.6516308784484863, "loss": 4.9807, "lr": 0.0002946853146853147, "step": 5544, "tokens_trained": 0.526938512 }, { "epoch": 1.5733333333333333, "grad_norm": 0.6860136985778809, "loss": 4.9383, "lr": 0.0002944055944055944, "step": 5546, "tokens_trained": 0.527127952 }, { "epoch": 1.573900709219858, "grad_norm": 0.6561754941940308, "loss": 4.9285, "lr": 0.0002941258741258741, "step": 5548, "tokens_trained": 0.527316968 }, { "epoch": 1.574468085106383, "grad_norm": 0.6510263085365295, "loss": 4.9713, "lr": 0.00029384615384615387, "step": 5550, "tokens_trained": 0.527507632 }, { "epoch": 1.5750354609929078, "grad_norm": 0.6448122262954712, "loss": 4.9256, "lr": 0.0002935664335664336, "step": 5552, "tokens_trained": 0.527697672 }, { "epoch": 1.5756028368794326, "grad_norm": 0.6920211911201477, "loss": 4.9597, "lr": 0.0002932867132867133, "step": 5554, "tokens_trained": 0.527887192 }, { "epoch": 1.5761702127659576, "grad_norm": 0.6790446639060974, "loss": 4.9459, "lr": 0.000293006993006993, "step": 5556, "tokens_trained": 0.528076944 }, { "epoch": 1.5767375886524824, "grad_norm": 0.6892750263214111, "loss": 4.9409, "lr": 0.00029272727272727274, "step": 5558, "tokens_trained": 0.528268088 }, { "epoch": 1.5773049645390071, "grad_norm": 0.6887848973274231, "loss": 4.942, "lr": 0.0002924475524475524, "step": 5560, "tokens_trained": 0.528458328 }, { "epoch": 1.577872340425532, "grad_norm": 0.6473134160041809, "loss": 5.0129, "lr": 0.00029216783216783217, "step": 5562, "tokens_trained": 0.528649176 }, { "epoch": 1.5784397163120567, "grad_norm": 0.6970229744911194, "loss": 4.9428, "lr": 0.00029188811188811186, "step": 5564, "tokens_trained": 0.52884144 }, { "epoch": 1.5790070921985815, "grad_norm": 0.6593358516693115, "loss": 4.9599, "lr": 0.0002916083916083916, "step": 5566, "tokens_trained": 0.5290326 }, { "epoch": 1.5795744680851063, "grad_norm": 0.6955072283744812, "loss": 4.942, "lr": 0.00029132867132867135, "step": 5568, "tokens_trained": 0.5292212 }, { "epoch": 1.580141843971631, "grad_norm": 0.7118654847145081, "loss": 4.9216, "lr": 0.0002910489510489511, "step": 5570, "tokens_trained": 0.529408904 }, { "epoch": 1.580709219858156, "grad_norm": 0.7366807460784912, "loss": 4.8904, "lr": 0.0002907692307692308, "step": 5572, "tokens_trained": 0.529599688 }, { "epoch": 1.5812765957446808, "grad_norm": 0.6824966669082642, "loss": 4.9659, "lr": 0.0002904895104895105, "step": 5574, "tokens_trained": 0.529790952 }, { "epoch": 1.5818439716312058, "grad_norm": 0.6991832256317139, "loss": 5.024, "lr": 0.0002902097902097902, "step": 5576, "tokens_trained": 0.529981632 }, { "epoch": 1.5824113475177306, "grad_norm": 0.7548424005508423, "loss": 4.8805, "lr": 0.0002899300699300699, "step": 5578, "tokens_trained": 0.530172432 }, { "epoch": 1.5829787234042554, "grad_norm": 0.666056215763092, "loss": 4.9532, "lr": 0.00028965034965034966, "step": 5580, "tokens_trained": 0.530364048 }, { "epoch": 1.5835460992907802, "grad_norm": 0.7504513263702393, "loss": 4.9057, "lr": 0.00028937062937062935, "step": 5582, "tokens_trained": 0.53055348 }, { "epoch": 1.584113475177305, "grad_norm": 0.6936983466148376, "loss": 4.9605, "lr": 0.0002890909090909091, "step": 5584, "tokens_trained": 0.530742032 }, { "epoch": 1.5846808510638297, "grad_norm": 0.7441573143005371, "loss": 4.9377, "lr": 0.00028881118881118883, "step": 5586, "tokens_trained": 0.530933392 }, { "epoch": 1.5852482269503545, "grad_norm": 0.6817852258682251, "loss": 4.948, "lr": 0.0002885314685314686, "step": 5588, "tokens_trained": 0.531123216 }, { "epoch": 1.5858156028368793, "grad_norm": 0.6524785757064819, "loss": 4.8597, "lr": 0.00028825174825174827, "step": 5590, "tokens_trained": 0.531313264 }, { "epoch": 1.5863829787234043, "grad_norm": 0.6831153631210327, "loss": 4.8704, "lr": 0.00028797202797202796, "step": 5592, "tokens_trained": 0.531505544 }, { "epoch": 1.586950354609929, "grad_norm": 0.6845701932907104, "loss": 4.8938, "lr": 0.0002876923076923077, "step": 5594, "tokens_trained": 0.531696592 }, { "epoch": 1.587517730496454, "grad_norm": 0.704077422618866, "loss": 4.9192, "lr": 0.0002874125874125874, "step": 5596, "tokens_trained": 0.53188604 }, { "epoch": 1.5880851063829788, "grad_norm": 0.668889045715332, "loss": 4.9262, "lr": 0.00028713286713286714, "step": 5598, "tokens_trained": 0.53207592 }, { "epoch": 1.5886524822695036, "grad_norm": 0.7270148992538452, "loss": 5.021, "lr": 0.00028685314685314683, "step": 5600, "tokens_trained": 0.532265504 }, { "epoch": 1.5892198581560284, "grad_norm": 0.6982212662696838, "loss": 4.9617, "lr": 0.0002865734265734266, "step": 5602, "tokens_trained": 0.532455144 }, { "epoch": 1.5897872340425532, "grad_norm": 0.6767390370368958, "loss": 4.925, "lr": 0.0002862937062937063, "step": 5604, "tokens_trained": 0.53264576 }, { "epoch": 1.590354609929078, "grad_norm": 0.6317259073257446, "loss": 4.9388, "lr": 0.00028601398601398606, "step": 5606, "tokens_trained": 0.53283732 }, { "epoch": 1.5909219858156027, "grad_norm": 0.6900723576545715, "loss": 4.9187, "lr": 0.00028573426573426575, "step": 5608, "tokens_trained": 0.533026856 }, { "epoch": 1.5914893617021275, "grad_norm": 0.6563754677772522, "loss": 4.8987, "lr": 0.00028545454545454544, "step": 5610, "tokens_trained": 0.53321736 }, { "epoch": 1.5920567375886525, "grad_norm": 0.6413408517837524, "loss": 4.9192, "lr": 0.0002851748251748252, "step": 5612, "tokens_trained": 0.533407032 }, { "epoch": 1.5926241134751773, "grad_norm": 0.6865932941436768, "loss": 4.8934, "lr": 0.0002848951048951049, "step": 5614, "tokens_trained": 0.533597104 }, { "epoch": 1.593191489361702, "grad_norm": 0.6359190940856934, "loss": 4.9053, "lr": 0.0002846153846153846, "step": 5616, "tokens_trained": 0.533787608 }, { "epoch": 1.593758865248227, "grad_norm": 0.6963808536529541, "loss": 4.8564, "lr": 0.0002843356643356643, "step": 5618, "tokens_trained": 0.53397852 }, { "epoch": 1.5943262411347519, "grad_norm": 0.6808618307113647, "loss": 4.9829, "lr": 0.00028405594405594406, "step": 5620, "tokens_trained": 0.534168624 }, { "epoch": 1.5948936170212766, "grad_norm": 0.7262938618659973, "loss": 4.9545, "lr": 0.0002837762237762238, "step": 5622, "tokens_trained": 0.53435708 }, { "epoch": 1.5954609929078014, "grad_norm": 0.6611909866333008, "loss": 4.9651, "lr": 0.00028349650349650355, "step": 5624, "tokens_trained": 0.534548648 }, { "epoch": 1.5957446808510638, "eval_loss": 4.941013813018799, "eval_runtime": 21.1205, "step": 5625, "tokens_trained": 0.53464364 }, { "epoch": 1.5960283687943262, "grad_norm": 0.7356603741645813, "loss": 4.8811, "lr": 0.00028321678321678324, "step": 5626, "tokens_trained": 0.534739048 }, { "epoch": 1.596595744680851, "grad_norm": 0.8225935697555542, "loss": 4.9211, "lr": 0.00028293706293706293, "step": 5628, "tokens_trained": 0.534930272 }, { "epoch": 1.5971631205673757, "grad_norm": 0.7392207980155945, "loss": 4.911, "lr": 0.0002826573426573427, "step": 5630, "tokens_trained": 0.535120112 }, { "epoch": 1.5977304964539008, "grad_norm": 0.7081847786903381, "loss": 4.8499, "lr": 0.00028237762237762236, "step": 5632, "tokens_trained": 0.535308088 }, { "epoch": 1.5982978723404255, "grad_norm": 0.6932501196861267, "loss": 4.94, "lr": 0.0002820979020979021, "step": 5634, "tokens_trained": 0.535497464 }, { "epoch": 1.5988652482269503, "grad_norm": 0.7282589077949524, "loss": 4.8935, "lr": 0.0002818181818181818, "step": 5636, "tokens_trained": 0.535688928 }, { "epoch": 1.5994326241134753, "grad_norm": 0.6721267104148865, "loss": 4.9241, "lr": 0.00028153846153846154, "step": 5638, "tokens_trained": 0.535876328 }, { "epoch": 1.6, "grad_norm": 0.6456476449966431, "loss": 4.9559, "lr": 0.0002812587412587413, "step": 5640, "tokens_trained": 0.536066384 }, { "epoch": 1.6005673758865249, "grad_norm": 0.6804112792015076, "loss": 4.9085, "lr": 0.000280979020979021, "step": 5642, "tokens_trained": 0.536254128 }, { "epoch": 1.6011347517730496, "grad_norm": 0.6562972068786621, "loss": 4.872, "lr": 0.0002806993006993007, "step": 5644, "tokens_trained": 0.536446344 }, { "epoch": 1.6017021276595744, "grad_norm": 0.6613638401031494, "loss": 4.9944, "lr": 0.0002804195804195804, "step": 5646, "tokens_trained": 0.536635528 }, { "epoch": 1.6022695035460992, "grad_norm": 0.6700873374938965, "loss": 4.9456, "lr": 0.00028013986013986016, "step": 5648, "tokens_trained": 0.536824784 }, { "epoch": 1.602836879432624, "grad_norm": 0.6235386729240417, "loss": 4.9588, "lr": 0.00027986013986013985, "step": 5650, "tokens_trained": 0.537015576 }, { "epoch": 1.603404255319149, "grad_norm": 0.6628431677818298, "loss": 4.9453, "lr": 0.0002795804195804196, "step": 5652, "tokens_trained": 0.537207536 }, { "epoch": 1.6039716312056738, "grad_norm": 0.6944266557693481, "loss": 4.9151, "lr": 0.0002793006993006993, "step": 5654, "tokens_trained": 0.537397312 }, { "epoch": 1.6045390070921985, "grad_norm": 0.6461726427078247, "loss": 4.9444, "lr": 0.00027902097902097903, "step": 5656, "tokens_trained": 0.537585608 }, { "epoch": 1.6051063829787235, "grad_norm": 0.695777177810669, "loss": 4.9216, "lr": 0.00027874125874125877, "step": 5658, "tokens_trained": 0.5377756 }, { "epoch": 1.6056737588652483, "grad_norm": 0.733026921749115, "loss": 4.9423, "lr": 0.00027846153846153846, "step": 5660, "tokens_trained": 0.537964192 }, { "epoch": 1.606241134751773, "grad_norm": 0.6576181650161743, "loss": 4.9046, "lr": 0.0002781818181818182, "step": 5662, "tokens_trained": 0.538151936 }, { "epoch": 1.6068085106382979, "grad_norm": 0.6802405118942261, "loss": 4.9429, "lr": 0.0002779020979020979, "step": 5664, "tokens_trained": 0.53834172 }, { "epoch": 1.6073758865248227, "grad_norm": 0.6290127635002136, "loss": 4.9098, "lr": 0.00027762237762237764, "step": 5666, "tokens_trained": 0.53852936 }, { "epoch": 1.6079432624113474, "grad_norm": 0.6849179863929749, "loss": 4.9477, "lr": 0.00027734265734265733, "step": 5668, "tokens_trained": 0.538720736 }, { "epoch": 1.6085106382978722, "grad_norm": 0.6847065687179565, "loss": 4.9632, "lr": 0.0002770629370629371, "step": 5670, "tokens_trained": 0.53891076 }, { "epoch": 1.609078014184397, "grad_norm": 0.6859111785888672, "loss": 4.9782, "lr": 0.00027678321678321677, "step": 5672, "tokens_trained": 0.53910004 }, { "epoch": 1.609645390070922, "grad_norm": 0.687892496585846, "loss": 4.9052, "lr": 0.0002765034965034965, "step": 5674, "tokens_trained": 0.539290336 }, { "epoch": 1.6102127659574468, "grad_norm": 0.679977297782898, "loss": 4.8621, "lr": 0.0002762237762237762, "step": 5676, "tokens_trained": 0.539480408 }, { "epoch": 1.6107801418439718, "grad_norm": 0.6193852424621582, "loss": 4.8707, "lr": 0.00027594405594405595, "step": 5678, "tokens_trained": 0.53967172 }, { "epoch": 1.6113475177304966, "grad_norm": 0.6429640054702759, "loss": 4.9173, "lr": 0.0002756643356643357, "step": 5680, "tokens_trained": 0.539862872 }, { "epoch": 1.6119148936170213, "grad_norm": 0.6615781188011169, "loss": 4.9644, "lr": 0.0002753846153846154, "step": 5682, "tokens_trained": 0.540051136 }, { "epoch": 1.6124822695035461, "grad_norm": 0.6882503032684326, "loss": 4.941, "lr": 0.0002751048951048951, "step": 5684, "tokens_trained": 0.540241848 }, { "epoch": 1.613049645390071, "grad_norm": 0.6527437567710876, "loss": 4.8515, "lr": 0.0002748251748251748, "step": 5686, "tokens_trained": 0.540431136 }, { "epoch": 1.6136170212765957, "grad_norm": 0.700136125087738, "loss": 4.8908, "lr": 0.00027454545454545456, "step": 5688, "tokens_trained": 0.540623296 }, { "epoch": 1.6141843971631205, "grad_norm": 0.7063473463058472, "loss": 4.8918, "lr": 0.00027426573426573425, "step": 5690, "tokens_trained": 0.540814016 }, { "epoch": 1.6147517730496452, "grad_norm": 0.6957635879516602, "loss": 4.856, "lr": 0.000273986013986014, "step": 5692, "tokens_trained": 0.541003752 }, { "epoch": 1.6153191489361702, "grad_norm": 0.7204902768135071, "loss": 4.9044, "lr": 0.0002737062937062937, "step": 5694, "tokens_trained": 0.541193784 }, { "epoch": 1.615886524822695, "grad_norm": 0.7462215423583984, "loss": 4.9294, "lr": 0.00027342657342657343, "step": 5696, "tokens_trained": 0.541381816 }, { "epoch": 1.61645390070922, "grad_norm": 0.7395153045654297, "loss": 4.9422, "lr": 0.0002731468531468532, "step": 5698, "tokens_trained": 0.541575016 }, { "epoch": 1.6170212765957448, "grad_norm": 0.6907937526702881, "loss": 4.9753, "lr": 0.00027286713286713287, "step": 5700, "tokens_trained": 0.541763672 }, { "epoch": 1.6175886524822696, "grad_norm": 0.7379112243652344, "loss": 4.9595, "lr": 0.0002725874125874126, "step": 5702, "tokens_trained": 0.541954856 }, { "epoch": 1.6181560283687944, "grad_norm": 0.702833354473114, "loss": 4.9187, "lr": 0.0002723076923076923, "step": 5704, "tokens_trained": 0.542144208 }, { "epoch": 1.6187234042553191, "grad_norm": 0.6583960056304932, "loss": 4.9265, "lr": 0.00027202797202797205, "step": 5706, "tokens_trained": 0.542334328 }, { "epoch": 1.619290780141844, "grad_norm": 0.739059329032898, "loss": 4.9139, "lr": 0.00027174825174825174, "step": 5708, "tokens_trained": 0.54252452 }, { "epoch": 1.6198581560283687, "grad_norm": 0.7109675407409668, "loss": 4.9524, "lr": 0.0002714685314685315, "step": 5710, "tokens_trained": 0.54271396 }, { "epoch": 1.6204255319148935, "grad_norm": 0.6530102491378784, "loss": 4.9311, "lr": 0.00027118881118881117, "step": 5712, "tokens_trained": 0.542904968 }, { "epoch": 1.6209929078014185, "grad_norm": 0.7134439945220947, "loss": 4.9622, "lr": 0.0002709090909090909, "step": 5714, "tokens_trained": 0.543092176 }, { "epoch": 1.6215602836879432, "grad_norm": 0.7074549794197083, "loss": 4.9306, "lr": 0.00027062937062937066, "step": 5716, "tokens_trained": 0.543283496 }, { "epoch": 1.6221276595744683, "grad_norm": 0.6454264521598816, "loss": 4.9339, "lr": 0.00027034965034965035, "step": 5718, "tokens_trained": 0.54347484 }, { "epoch": 1.622695035460993, "grad_norm": 0.7298420667648315, "loss": 4.9364, "lr": 0.0002700699300699301, "step": 5720, "tokens_trained": 0.543666408 }, { "epoch": 1.6232624113475178, "grad_norm": 0.6454148292541504, "loss": 4.9546, "lr": 0.0002697902097902098, "step": 5722, "tokens_trained": 0.54385632 }, { "epoch": 1.6238297872340426, "grad_norm": 0.6439184546470642, "loss": 4.8989, "lr": 0.00026951048951048953, "step": 5724, "tokens_trained": 0.544048256 }, { "epoch": 1.6243971631205674, "grad_norm": 0.6580947637557983, "loss": 4.9836, "lr": 0.0002692307692307692, "step": 5726, "tokens_trained": 0.544237336 }, { "epoch": 1.6249645390070921, "grad_norm": 0.6553696990013123, "loss": 4.9504, "lr": 0.00026895104895104896, "step": 5728, "tokens_trained": 0.544426544 }, { "epoch": 1.625531914893617, "grad_norm": 0.6438179612159729, "loss": 4.9624, "lr": 0.00026867132867132865, "step": 5730, "tokens_trained": 0.544617048 }, { "epoch": 1.6260992907801417, "grad_norm": 0.6884915232658386, "loss": 4.9252, "lr": 0.0002683916083916084, "step": 5732, "tokens_trained": 0.544807008 }, { "epoch": 1.6266666666666667, "grad_norm": 0.714886486530304, "loss": 4.999, "lr": 0.00026811188811188814, "step": 5734, "tokens_trained": 0.544997864 }, { "epoch": 1.6272340425531915, "grad_norm": 0.6199847459793091, "loss": 4.9469, "lr": 0.00026783216783216783, "step": 5736, "tokens_trained": 0.545188112 }, { "epoch": 1.6278014184397163, "grad_norm": 0.6860629320144653, "loss": 4.9831, "lr": 0.0002675524475524476, "step": 5738, "tokens_trained": 0.54537904 }, { "epoch": 1.6283687943262413, "grad_norm": 0.6149401664733887, "loss": 4.8914, "lr": 0.00026727272727272727, "step": 5740, "tokens_trained": 0.545569464 }, { "epoch": 1.628936170212766, "grad_norm": 0.6258800029754639, "loss": 4.9898, "lr": 0.000266993006993007, "step": 5742, "tokens_trained": 0.54575892 }, { "epoch": 1.6295035460992908, "grad_norm": 0.6986097097396851, "loss": 4.9162, "lr": 0.0002667132867132867, "step": 5744, "tokens_trained": 0.545948856 }, { "epoch": 1.6300709219858156, "grad_norm": 0.5756875872612, "loss": 4.9078, "lr": 0.0002664335664335664, "step": 5746, "tokens_trained": 0.546138816 }, { "epoch": 1.6306382978723404, "grad_norm": 0.6775264143943787, "loss": 4.9215, "lr": 0.00026615384615384614, "step": 5748, "tokens_trained": 0.54632868 }, { "epoch": 1.6312056737588652, "grad_norm": 0.6331945061683655, "loss": 4.911, "lr": 0.0002658741258741259, "step": 5750, "tokens_trained": 0.546516976 }, { "epoch": 1.6312056737588652, "eval_loss": 4.937545299530029, "eval_runtime": 20.8228, "step": 5750, "tokens_trained": 0.546516976 }, { "epoch": 1.63177304964539, "grad_norm": 0.660851240158081, "loss": 4.9008, "lr": 0.00026559440559440563, "step": 5752, "tokens_trained": 0.546706264 }, { "epoch": 1.632340425531915, "grad_norm": 0.6905441284179688, "loss": 4.9086, "lr": 0.0002653146853146853, "step": 5754, "tokens_trained": 0.54689604 }, { "epoch": 1.6329078014184397, "grad_norm": 0.6203919053077698, "loss": 4.9058, "lr": 0.00026503496503496506, "step": 5756, "tokens_trained": 0.547087872 }, { "epoch": 1.6334751773049645, "grad_norm": 0.6931180357933044, "loss": 4.9507, "lr": 0.00026475524475524475, "step": 5758, "tokens_trained": 0.5472796 }, { "epoch": 1.6340425531914895, "grad_norm": 0.6373468637466431, "loss": 4.9398, "lr": 0.0002644755244755245, "step": 5760, "tokens_trained": 0.54746944 }, { "epoch": 1.6346099290780143, "grad_norm": 0.6498023867607117, "loss": 4.8999, "lr": 0.0002641958041958042, "step": 5762, "tokens_trained": 0.547660904 }, { "epoch": 1.635177304964539, "grad_norm": 0.6512255668640137, "loss": 4.8594, "lr": 0.0002639160839160839, "step": 5764, "tokens_trained": 0.5478504 }, { "epoch": 1.6357446808510638, "grad_norm": 0.6539003849029541, "loss": 4.9252, "lr": 0.0002636363636363636, "step": 5766, "tokens_trained": 0.548040144 }, { "epoch": 1.6363120567375886, "grad_norm": 0.6517258882522583, "loss": 4.9924, "lr": 0.00026335664335664337, "step": 5768, "tokens_trained": 0.548230744 }, { "epoch": 1.6368794326241134, "grad_norm": 0.6765470504760742, "loss": 4.9538, "lr": 0.0002630769230769231, "step": 5770, "tokens_trained": 0.548420008 }, { "epoch": 1.6374468085106382, "grad_norm": 0.6759946346282959, "loss": 4.9602, "lr": 0.0002627972027972028, "step": 5772, "tokens_trained": 0.548610312 }, { "epoch": 1.6380141843971632, "grad_norm": 0.7101868391036987, "loss": 4.9472, "lr": 0.00026251748251748255, "step": 5774, "tokens_trained": 0.548800392 }, { "epoch": 1.638581560283688, "grad_norm": 0.6968977451324463, "loss": 4.9418, "lr": 0.00026223776223776224, "step": 5776, "tokens_trained": 0.548990264 }, { "epoch": 1.6391489361702127, "grad_norm": 0.6361983418464661, "loss": 4.9036, "lr": 0.000261958041958042, "step": 5778, "tokens_trained": 0.549181104 }, { "epoch": 1.6397163120567377, "grad_norm": 0.6737280488014221, "loss": 4.8658, "lr": 0.00026167832167832167, "step": 5780, "tokens_trained": 0.549372056 }, { "epoch": 1.6402836879432625, "grad_norm": 0.7052464485168457, "loss": 4.949, "lr": 0.00026139860139860136, "step": 5782, "tokens_trained": 0.54956176 }, { "epoch": 1.6408510638297873, "grad_norm": 0.6796829104423523, "loss": 4.9296, "lr": 0.0002611188811188811, "step": 5784, "tokens_trained": 0.54975152 }, { "epoch": 1.641418439716312, "grad_norm": 0.6988239288330078, "loss": 4.8764, "lr": 0.00026083916083916085, "step": 5786, "tokens_trained": 0.54994496 }, { "epoch": 1.6419858156028369, "grad_norm": 0.6090063452720642, "loss": 4.8592, "lr": 0.0002605594405594406, "step": 5788, "tokens_trained": 0.550135928 }, { "epoch": 1.6425531914893616, "grad_norm": 0.7027743458747864, "loss": 4.8875, "lr": 0.0002602797202797203, "step": 5790, "tokens_trained": 0.550326568 }, { "epoch": 1.6431205673758864, "grad_norm": 0.6735103130340576, "loss": 4.9801, "lr": 0.00026000000000000003, "step": 5792, "tokens_trained": 0.550516616 }, { "epoch": 1.6436879432624112, "grad_norm": 0.660088062286377, "loss": 4.9159, "lr": 0.0002597202797202797, "step": 5794, "tokens_trained": 0.550705936 }, { "epoch": 1.6442553191489362, "grad_norm": 0.7521641254425049, "loss": 4.9043, "lr": 0.00025944055944055947, "step": 5796, "tokens_trained": 0.5508934 }, { "epoch": 1.644822695035461, "grad_norm": 0.641076385974884, "loss": 4.8886, "lr": 0.00025916083916083916, "step": 5798, "tokens_trained": 0.55108328 }, { "epoch": 1.645390070921986, "grad_norm": 0.6786360740661621, "loss": 4.8984, "lr": 0.00025888111888111885, "step": 5800, "tokens_trained": 0.551272536 }, { "epoch": 1.6459574468085107, "grad_norm": 0.7009422779083252, "loss": 4.8771, "lr": 0.0002586013986013986, "step": 5802, "tokens_trained": 0.55146236 }, { "epoch": 1.6465248226950355, "grad_norm": 0.676520586013794, "loss": 4.936, "lr": 0.0002583216783216783, "step": 5804, "tokens_trained": 0.551652696 }, { "epoch": 1.6470921985815603, "grad_norm": 0.7154592275619507, "loss": 4.9434, "lr": 0.0002580419580419581, "step": 5806, "tokens_trained": 0.551842848 }, { "epoch": 1.647659574468085, "grad_norm": 0.6566652655601501, "loss": 4.9044, "lr": 0.00025776223776223777, "step": 5808, "tokens_trained": 0.552033984 }, { "epoch": 1.6482269503546099, "grad_norm": 0.6657050251960754, "loss": 4.9275, "lr": 0.0002574825174825175, "step": 5810, "tokens_trained": 0.552225856 }, { "epoch": 1.6487943262411346, "grad_norm": 0.6648410558700562, "loss": 4.9252, "lr": 0.0002572027972027972, "step": 5812, "tokens_trained": 0.552416688 }, { "epoch": 1.6493617021276594, "grad_norm": 0.635155200958252, "loss": 4.934, "lr": 0.00025692307692307695, "step": 5814, "tokens_trained": 0.5526062 }, { "epoch": 1.6499290780141844, "grad_norm": 0.6842248439788818, "loss": 4.9348, "lr": 0.00025664335664335664, "step": 5816, "tokens_trained": 0.552794664 }, { "epoch": 1.6504964539007092, "grad_norm": 0.6383311152458191, "loss": 4.8907, "lr": 0.00025636363636363633, "step": 5818, "tokens_trained": 0.552984088 }, { "epoch": 1.6510638297872342, "grad_norm": 0.6503682136535645, "loss": 4.9384, "lr": 0.0002560839160839161, "step": 5820, "tokens_trained": 0.553174888 }, { "epoch": 1.651631205673759, "grad_norm": 0.6397541165351868, "loss": 4.8987, "lr": 0.00025580419580419577, "step": 5822, "tokens_trained": 0.553366336 }, { "epoch": 1.6521985815602838, "grad_norm": 0.6879380941390991, "loss": 4.9076, "lr": 0.00025552447552447557, "step": 5824, "tokens_trained": 0.553556232 }, { "epoch": 1.6527659574468085, "grad_norm": 0.6244833469390869, "loss": 4.9333, "lr": 0.00025524475524475526, "step": 5826, "tokens_trained": 0.553746288 }, { "epoch": 1.6533333333333333, "grad_norm": 0.6310114860534668, "loss": 4.9349, "lr": 0.000254965034965035, "step": 5828, "tokens_trained": 0.553936136 }, { "epoch": 1.653900709219858, "grad_norm": 0.6264005303382874, "loss": 4.8709, "lr": 0.0002546853146853147, "step": 5830, "tokens_trained": 0.554125224 }, { "epoch": 1.6544680851063829, "grad_norm": 0.6551124453544617, "loss": 4.8386, "lr": 0.00025440559440559443, "step": 5832, "tokens_trained": 0.554315128 }, { "epoch": 1.6550354609929077, "grad_norm": 0.6286723017692566, "loss": 4.926, "lr": 0.0002541258741258741, "step": 5834, "tokens_trained": 0.554506864 }, { "epoch": 1.6556028368794327, "grad_norm": 0.6238033175468445, "loss": 4.9501, "lr": 0.0002538461538461538, "step": 5836, "tokens_trained": 0.554699104 }, { "epoch": 1.6561702127659574, "grad_norm": 0.6658656597137451, "loss": 4.9057, "lr": 0.00025356643356643356, "step": 5838, "tokens_trained": 0.554890096 }, { "epoch": 1.6567375886524822, "grad_norm": 0.6604855060577393, "loss": 4.9791, "lr": 0.00025328671328671325, "step": 5840, "tokens_trained": 0.55508088 }, { "epoch": 1.6573049645390072, "grad_norm": 0.6438781023025513, "loss": 4.9047, "lr": 0.00025300699300699305, "step": 5842, "tokens_trained": 0.555271944 }, { "epoch": 1.657872340425532, "grad_norm": 0.7207959890365601, "loss": 4.8833, "lr": 0.00025272727272727274, "step": 5844, "tokens_trained": 0.555461088 }, { "epoch": 1.6584397163120568, "grad_norm": 0.638710618019104, "loss": 4.9453, "lr": 0.0002524475524475525, "step": 5846, "tokens_trained": 0.555652112 }, { "epoch": 1.6590070921985816, "grad_norm": 0.6577686071395874, "loss": 4.8861, "lr": 0.0002521678321678322, "step": 5848, "tokens_trained": 0.555840504 }, { "epoch": 1.6595744680851063, "grad_norm": 0.5944091081619263, "loss": 4.9211, "lr": 0.0002518881118881119, "step": 5850, "tokens_trained": 0.5560306 }, { "epoch": 1.6601418439716311, "grad_norm": 0.6569780111312866, "loss": 4.8638, "lr": 0.0002516083916083916, "step": 5852, "tokens_trained": 0.556220936 }, { "epoch": 1.660709219858156, "grad_norm": 0.6286190152168274, "loss": 4.8752, "lr": 0.0002513286713286713, "step": 5854, "tokens_trained": 0.556407576 }, { "epoch": 1.661276595744681, "grad_norm": 0.6778548359870911, "loss": 4.8653, "lr": 0.00025104895104895104, "step": 5856, "tokens_trained": 0.556598056 }, { "epoch": 1.6618439716312057, "grad_norm": 0.7045045495033264, "loss": 5.0023, "lr": 0.00025076923076923073, "step": 5858, "tokens_trained": 0.556786912 }, { "epoch": 1.6624113475177305, "grad_norm": 0.6449150443077087, "loss": 4.8952, "lr": 0.00025048951048951053, "step": 5860, "tokens_trained": 0.5569748 }, { "epoch": 1.6629787234042555, "grad_norm": 0.6967601180076599, "loss": 4.9314, "lr": 0.0002502097902097902, "step": 5862, "tokens_trained": 0.55716296 }, { "epoch": 1.6635460992907802, "grad_norm": 0.6434826254844666, "loss": 4.9258, "lr": 0.00024993006993006997, "step": 5864, "tokens_trained": 0.557352408 }, { "epoch": 1.664113475177305, "grad_norm": 0.6420868039131165, "loss": 4.914, "lr": 0.00024965034965034966, "step": 5866, "tokens_trained": 0.557542608 }, { "epoch": 1.6646808510638298, "grad_norm": 0.6396485567092896, "loss": 4.8814, "lr": 0.00024937062937062935, "step": 5868, "tokens_trained": 0.55773236 }, { "epoch": 1.6652482269503546, "grad_norm": 0.682684600353241, "loss": 4.9327, "lr": 0.0002490909090909091, "step": 5870, "tokens_trained": 0.557924048 }, { "epoch": 1.6658156028368793, "grad_norm": 0.5905774831771851, "loss": 4.9029, "lr": 0.0002488111888111888, "step": 5872, "tokens_trained": 0.55811432 }, { "epoch": 1.6663829787234041, "grad_norm": 0.7155118584632874, "loss": 4.9349, "lr": 0.00024853146853146853, "step": 5874, "tokens_trained": 0.558304 }, { "epoch": 1.6666666666666665, "eval_loss": 4.9358296394348145, "eval_runtime": 20.4887, "step": 5875, "tokens_trained": 0.558400232 }, { "epoch": 1.6669503546099291, "grad_norm": 0.6634054183959961, "loss": 4.8892, "lr": 0.0002482517482517483, "step": 5876, "tokens_trained": 0.558495824 }, { "epoch": 1.667517730496454, "grad_norm": 0.6477299928665161, "loss": 4.9186, "lr": 0.00024797202797202796, "step": 5878, "tokens_trained": 0.55868628 }, { "epoch": 1.6680851063829787, "grad_norm": 0.6633033156394958, "loss": 4.954, "lr": 0.0002476923076923077, "step": 5880, "tokens_trained": 0.558876288 }, { "epoch": 1.6686524822695037, "grad_norm": 0.6511030197143555, "loss": 4.9135, "lr": 0.00024741258741258745, "step": 5882, "tokens_trained": 0.559067016 }, { "epoch": 1.6692198581560285, "grad_norm": 0.662302553653717, "loss": 4.935, "lr": 0.00024713286713286714, "step": 5884, "tokens_trained": 0.559257432 }, { "epoch": 1.6697872340425532, "grad_norm": 0.624128520488739, "loss": 4.8758, "lr": 0.00024685314685314683, "step": 5886, "tokens_trained": 0.559446 }, { "epoch": 1.670354609929078, "grad_norm": 0.6498073935508728, "loss": 4.9154, "lr": 0.0002465734265734266, "step": 5888, "tokens_trained": 0.55963632 }, { "epoch": 1.6709219858156028, "grad_norm": 0.6341784596443176, "loss": 4.9037, "lr": 0.00024629370629370627, "step": 5890, "tokens_trained": 0.559825824 }, { "epoch": 1.6714893617021276, "grad_norm": 0.6473166942596436, "loss": 4.9668, "lr": 0.000246013986013986, "step": 5892, "tokens_trained": 0.560017576 }, { "epoch": 1.6720567375886524, "grad_norm": 0.6661013960838318, "loss": 4.9081, "lr": 0.00024573426573426576, "step": 5894, "tokens_trained": 0.560207408 }, { "epoch": 1.6726241134751771, "grad_norm": 0.6763671636581421, "loss": 4.9004, "lr": 0.00024545454545454545, "step": 5896, "tokens_trained": 0.560397992 }, { "epoch": 1.6731914893617021, "grad_norm": 0.6760857105255127, "loss": 4.9442, "lr": 0.0002451748251748252, "step": 5898, "tokens_trained": 0.56058784 }, { "epoch": 1.673758865248227, "grad_norm": 0.6097757816314697, "loss": 4.9087, "lr": 0.0002448951048951049, "step": 5900, "tokens_trained": 0.56077764 }, { "epoch": 1.674326241134752, "grad_norm": 0.6780499219894409, "loss": 4.9119, "lr": 0.00024461538461538463, "step": 5902, "tokens_trained": 0.56096688 }, { "epoch": 1.6748936170212767, "grad_norm": 0.6448239684104919, "loss": 4.9117, "lr": 0.0002443356643356643, "step": 5904, "tokens_trained": 0.561157992 }, { "epoch": 1.6754609929078015, "grad_norm": 0.6529706120491028, "loss": 4.9196, "lr": 0.00024405594405594406, "step": 5906, "tokens_trained": 0.56134868 }, { "epoch": 1.6760283687943263, "grad_norm": 0.6135060787200928, "loss": 4.9333, "lr": 0.00024377622377622378, "step": 5908, "tokens_trained": 0.561542176 }, { "epoch": 1.676595744680851, "grad_norm": 0.6510182023048401, "loss": 4.9107, "lr": 0.00024349650349650352, "step": 5910, "tokens_trained": 0.56173332 }, { "epoch": 1.6771631205673758, "grad_norm": 0.6503420472145081, "loss": 4.9493, "lr": 0.00024321678321678321, "step": 5912, "tokens_trained": 0.56192172 }, { "epoch": 1.6777304964539006, "grad_norm": 0.6826989054679871, "loss": 4.8839, "lr": 0.00024293706293706293, "step": 5914, "tokens_trained": 0.562110064 }, { "epoch": 1.6782978723404254, "grad_norm": 0.6254825592041016, "loss": 4.8727, "lr": 0.00024265734265734265, "step": 5916, "tokens_trained": 0.562299216 }, { "epoch": 1.6788652482269504, "grad_norm": 0.6255868077278137, "loss": 4.8653, "lr": 0.00024237762237762237, "step": 5918, "tokens_trained": 0.562489864 }, { "epoch": 1.6794326241134752, "grad_norm": 0.6201049089431763, "loss": 4.9563, "lr": 0.0002420979020979021, "step": 5920, "tokens_trained": 0.56268032 }, { "epoch": 1.6800000000000002, "grad_norm": 0.696552574634552, "loss": 4.922, "lr": 0.00024181818181818183, "step": 5922, "tokens_trained": 0.562870992 }, { "epoch": 1.680567375886525, "grad_norm": 0.7069157958030701, "loss": 4.9399, "lr": 0.00024153846153846155, "step": 5924, "tokens_trained": 0.563059256 }, { "epoch": 1.6811347517730497, "grad_norm": 0.7062954902648926, "loss": 4.9387, "lr": 0.00024125874125874126, "step": 5926, "tokens_trained": 0.563248928 }, { "epoch": 1.6817021276595745, "grad_norm": 0.6647072434425354, "loss": 4.9069, "lr": 0.000240979020979021, "step": 5928, "tokens_trained": 0.563439824 }, { "epoch": 1.6822695035460993, "grad_norm": 0.6497480869293213, "loss": 4.931, "lr": 0.0002406993006993007, "step": 5930, "tokens_trained": 0.563630672 }, { "epoch": 1.682836879432624, "grad_norm": 0.6641665101051331, "loss": 4.8775, "lr": 0.00024041958041958042, "step": 5932, "tokens_trained": 0.563820464 }, { "epoch": 1.6834042553191488, "grad_norm": 0.6873026490211487, "loss": 4.8971, "lr": 0.00024013986013986013, "step": 5934, "tokens_trained": 0.564014008 }, { "epoch": 1.6839716312056736, "grad_norm": 0.626800537109375, "loss": 4.9666, "lr": 0.00023986013986013985, "step": 5936, "tokens_trained": 0.564204536 }, { "epoch": 1.6845390070921986, "grad_norm": 0.6775664687156677, "loss": 4.8731, "lr": 0.0002395804195804196, "step": 5938, "tokens_trained": 0.564396216 }, { "epoch": 1.6851063829787234, "grad_norm": 0.6553815007209778, "loss": 4.9246, "lr": 0.0002393006993006993, "step": 5940, "tokens_trained": 0.564586632 }, { "epoch": 1.6856737588652482, "grad_norm": 0.6535831093788147, "loss": 4.8769, "lr": 0.00023902097902097903, "step": 5942, "tokens_trained": 0.564777112 }, { "epoch": 1.6862411347517732, "grad_norm": 0.696178138256073, "loss": 4.9115, "lr": 0.00023874125874125875, "step": 5944, "tokens_trained": 0.564965984 }, { "epoch": 1.686808510638298, "grad_norm": 0.6272562742233276, "loss": 4.8957, "lr": 0.0002384615384615385, "step": 5946, "tokens_trained": 0.565155928 }, { "epoch": 1.6873758865248227, "grad_norm": 0.6838794946670532, "loss": 4.9421, "lr": 0.00023818181818181818, "step": 5948, "tokens_trained": 0.5653456 }, { "epoch": 1.6879432624113475, "grad_norm": 0.6344850063323975, "loss": 4.9601, "lr": 0.0002379020979020979, "step": 5950, "tokens_trained": 0.565536296 }, { "epoch": 1.6885106382978723, "grad_norm": 0.6875824928283691, "loss": 4.9443, "lr": 0.00023762237762237762, "step": 5952, "tokens_trained": 0.565725656 }, { "epoch": 1.689078014184397, "grad_norm": 0.6602357625961304, "loss": 4.9395, "lr": 0.00023734265734265734, "step": 5954, "tokens_trained": 0.565915856 }, { "epoch": 1.6896453900709218, "grad_norm": 0.6800606846809387, "loss": 4.8741, "lr": 0.00023706293706293708, "step": 5956, "tokens_trained": 0.566105896 }, { "epoch": 1.6902127659574468, "grad_norm": 0.5828418135643005, "loss": 4.9077, "lr": 0.0002367832167832168, "step": 5958, "tokens_trained": 0.566296256 }, { "epoch": 1.6907801418439716, "grad_norm": 0.7016698718070984, "loss": 4.9157, "lr": 0.00023650349650349652, "step": 5960, "tokens_trained": 0.566485008 }, { "epoch": 1.6913475177304964, "grad_norm": 0.6452853083610535, "loss": 4.9228, "lr": 0.00023622377622377623, "step": 5962, "tokens_trained": 0.566672864 }, { "epoch": 1.6919148936170214, "grad_norm": 0.7076459527015686, "loss": 4.9212, "lr": 0.00023594405594405592, "step": 5964, "tokens_trained": 0.566862552 }, { "epoch": 1.6924822695035462, "grad_norm": 0.6401494741439819, "loss": 4.9477, "lr": 0.00023566433566433567, "step": 5966, "tokens_trained": 0.567052528 }, { "epoch": 1.693049645390071, "grad_norm": 0.6188704371452332, "loss": 4.9492, "lr": 0.00023538461538461538, "step": 5968, "tokens_trained": 0.56724216 }, { "epoch": 1.6936170212765957, "grad_norm": 0.6359750032424927, "loss": 4.8968, "lr": 0.0002351048951048951, "step": 5970, "tokens_trained": 0.567432336 }, { "epoch": 1.6941843971631205, "grad_norm": 0.5982444882392883, "loss": 4.8844, "lr": 0.00023482517482517482, "step": 5972, "tokens_trained": 0.567622688 }, { "epoch": 1.6947517730496453, "grad_norm": 0.656658411026001, "loss": 4.9238, "lr": 0.00023454545454545456, "step": 5974, "tokens_trained": 0.56781264 }, { "epoch": 1.69531914893617, "grad_norm": 0.6378044486045837, "loss": 4.8598, "lr": 0.00023426573426573428, "step": 5976, "tokens_trained": 0.568002208 }, { "epoch": 1.695886524822695, "grad_norm": 0.628505289554596, "loss": 4.907, "lr": 0.000233986013986014, "step": 5978, "tokens_trained": 0.568192648 }, { "epoch": 1.6964539007092199, "grad_norm": 0.5912485718727112, "loss": 4.8991, "lr": 0.00023370629370629372, "step": 5980, "tokens_trained": 0.568384808 }, { "epoch": 1.6970212765957446, "grad_norm": 0.6603651642799377, "loss": 4.8716, "lr": 0.0002334265734265734, "step": 5982, "tokens_trained": 0.568575728 }, { "epoch": 1.6975886524822696, "grad_norm": 0.6282093524932861, "loss": 4.9948, "lr": 0.00023314685314685315, "step": 5984, "tokens_trained": 0.56876664 }, { "epoch": 1.6981560283687944, "grad_norm": 0.7470095157623291, "loss": 4.9029, "lr": 0.00023286713286713287, "step": 5986, "tokens_trained": 0.568955864 }, { "epoch": 1.6987234042553192, "grad_norm": 0.6167851090431213, "loss": 4.8737, "lr": 0.0002325874125874126, "step": 5988, "tokens_trained": 0.569145168 }, { "epoch": 1.699290780141844, "grad_norm": 0.704911470413208, "loss": 4.8866, "lr": 0.0002323076923076923, "step": 5990, "tokens_trained": 0.569333848 }, { "epoch": 1.6998581560283688, "grad_norm": 0.619904637336731, "loss": 4.8562, "lr": 0.00023202797202797205, "step": 5992, "tokens_trained": 0.56952244 }, { "epoch": 1.7004255319148935, "grad_norm": 0.6953230500221252, "loss": 4.92, "lr": 0.00023174825174825177, "step": 5994, "tokens_trained": 0.569712192 }, { "epoch": 1.7009929078014183, "grad_norm": 0.6583752036094666, "loss": 4.9628, "lr": 0.00023146853146853148, "step": 5996, "tokens_trained": 0.569901272 }, { "epoch": 1.701560283687943, "grad_norm": 0.6207908987998962, "loss": 4.918, "lr": 0.0002311888111888112, "step": 5998, "tokens_trained": 0.57009052 }, { "epoch": 1.702127659574468, "grad_norm": 0.6498993635177612, "loss": 4.9996, "lr": 0.0002309090909090909, "step": 6000, "tokens_trained": 0.570279552 }, { "epoch": 1.702127659574468, "eval_loss": 4.926996231079102, "eval_runtime": 20.7631, "step": 6000, "tokens_trained": 0.570279552 } ], "logging_steps": 2, "max_steps": 7650, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }