{ "best_global_step": 625, "best_metric": 5.630118370056152, "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-BPE/checkpoint-625", "epoch": 0.1773049645390071, "eval_steps": 125, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005673758865248227, "grad_norm": 1275.0146484375, "loss": 281.4781, "lr": 2e-06, "step": 2, "tokens_trained": 0.000192256 }, { "epoch": 0.0011347517730496454, "grad_norm": 1437.579833984375, "loss": 267.2211, "lr": 6e-06, "step": 4, "tokens_trained": 0.000382024 }, { "epoch": 0.001702127659574468, "grad_norm": 1719.271484375, "loss": 219.3822, "lr": 1e-05, "step": 6, "tokens_trained": 0.00057072 }, { "epoch": 0.0022695035460992908, "grad_norm": 1444.94970703125, "loss": 133.8172, "lr": 1.4e-05, "step": 8, "tokens_trained": 0.000761336 }, { "epoch": 0.0028368794326241137, "grad_norm": 238.9689178466797, "loss": 90.8177, "lr": 1.8e-05, "step": 10, "tokens_trained": 0.000953248 }, { "epoch": 0.003404255319148936, "grad_norm": 158.53497314453125, "loss": 84.6922, "lr": 2.2e-05, "step": 12, "tokens_trained": 0.00114424 }, { "epoch": 0.003971631205673759, "grad_norm": 146.10595703125, "loss": 76.7055, "lr": 2.6e-05, "step": 14, "tokens_trained": 0.001334104 }, { "epoch": 0.0045390070921985815, "grad_norm": 140.69964599609375, "loss": 67.9952, "lr": 3e-05, "step": 16, "tokens_trained": 0.00152392 }, { "epoch": 0.005106382978723404, "grad_norm": 108.80303192138672, "loss": 57.8088, "lr": 3.4000000000000007e-05, "step": 18, "tokens_trained": 0.001713872 }, { "epoch": 0.005673758865248227, "grad_norm": 106.82334899902344, "loss": 48.6585, "lr": 3.8e-05, "step": 20, "tokens_trained": 0.001903976 }, { "epoch": 0.00624113475177305, "grad_norm": 93.58769989013672, "loss": 41.7984, "lr": 4.2000000000000004e-05, "step": 22, "tokens_trained": 0.002094288 }, { "epoch": 0.006808510638297872, "grad_norm": 87.5854721069336, "loss": 37.6201, "lr": 4.6e-05, "step": 24, "tokens_trained": 0.002282496 }, { "epoch": 0.007375886524822695, "grad_norm": 84.12794494628906, "loss": 35.0091, "lr": 5e-05, "step": 26, "tokens_trained": 0.00247068 }, { "epoch": 0.007943262411347518, "grad_norm": 79.77535247802734, "loss": 33.2253, "lr": 5.4e-05, "step": 28, "tokens_trained": 0.002662888 }, { "epoch": 0.00851063829787234, "grad_norm": 66.42157745361328, "loss": 32.0682, "lr": 5.800000000000001e-05, "step": 30, "tokens_trained": 0.002851968 }, { "epoch": 0.009078014184397163, "grad_norm": 87.52485656738281, "loss": 30.893, "lr": 6.2e-05, "step": 32, "tokens_trained": 0.003041384 }, { "epoch": 0.009645390070921986, "grad_norm": 58.33614730834961, "loss": 30.0513, "lr": 6.6e-05, "step": 34, "tokens_trained": 0.003232872 }, { "epoch": 0.010212765957446808, "grad_norm": 54.629329681396484, "loss": 29.0115, "lr": 7.000000000000001e-05, "step": 36, "tokens_trained": 0.003423824 }, { "epoch": 0.01078014184397163, "grad_norm": 52.79097366333008, "loss": 28.2084, "lr": 7.4e-05, "step": 38, "tokens_trained": 0.003613232 }, { "epoch": 0.011347517730496455, "grad_norm": 54.481224060058594, "loss": 27.4345, "lr": 7.8e-05, "step": 40, "tokens_trained": 0.003800952 }, { "epoch": 0.011914893617021277, "grad_norm": 58.7069091796875, "loss": 26.5936, "lr": 8.2e-05, "step": 42, "tokens_trained": 0.003991512 }, { "epoch": 0.0124822695035461, "grad_norm": 49.30760955810547, "loss": 26.0608, "lr": 8.599999999999999e-05, "step": 44, "tokens_trained": 0.004180648 }, { "epoch": 0.013049645390070922, "grad_norm": 61.902587890625, "loss": 25.5363, "lr": 8.999999999999999e-05, "step": 46, "tokens_trained": 0.00437148 }, { "epoch": 0.013617021276595745, "grad_norm": 46.76111602783203, "loss": 24.9599, "lr": 9.400000000000001e-05, "step": 48, "tokens_trained": 0.004559344 }, { "epoch": 0.014184397163120567, "grad_norm": 57.06416702270508, "loss": 24.4087, "lr": 9.800000000000001e-05, "step": 50, "tokens_trained": 0.004749256 }, { "epoch": 0.01475177304964539, "grad_norm": 44.798736572265625, "loss": 24.1444, "lr": 0.000102, "step": 52, "tokens_trained": 0.004940192 }, { "epoch": 0.015319148936170212, "grad_norm": 40.29296875, "loss": 23.6011, "lr": 0.000106, "step": 54, "tokens_trained": 0.005130304 }, { "epoch": 0.015886524822695036, "grad_norm": 38.75099563598633, "loss": 23.1781, "lr": 0.00011, "step": 56, "tokens_trained": 0.005322864 }, { "epoch": 0.016453900709219857, "grad_norm": 37.470706939697266, "loss": 22.9136, "lr": 0.000114, "step": 58, "tokens_trained": 0.00551392 }, { "epoch": 0.01702127659574468, "grad_norm": 35.1894645690918, "loss": 22.6336, "lr": 0.000118, "step": 60, "tokens_trained": 0.005703096 }, { "epoch": 0.017588652482269502, "grad_norm": 35.136573791503906, "loss": 22.2998, "lr": 0.000122, "step": 62, "tokens_trained": 0.005892448 }, { "epoch": 0.018156028368794326, "grad_norm": 38.05111312866211, "loss": 21.9401, "lr": 0.000126, "step": 64, "tokens_trained": 0.006081656 }, { "epoch": 0.01872340425531915, "grad_norm": 35.63850021362305, "loss": 21.7206, "lr": 0.00013000000000000002, "step": 66, "tokens_trained": 0.006273032 }, { "epoch": 0.01929078014184397, "grad_norm": 34.327667236328125, "loss": 21.4051, "lr": 0.000134, "step": 68, "tokens_trained": 0.00646304 }, { "epoch": 0.019858156028368795, "grad_norm": 31.457059860229492, "loss": 21.0774, "lr": 0.00013800000000000002, "step": 70, "tokens_trained": 0.006652832 }, { "epoch": 0.020425531914893616, "grad_norm": 34.91672897338867, "loss": 20.8718, "lr": 0.00014199999999999998, "step": 72, "tokens_trained": 0.006843512 }, { "epoch": 0.02099290780141844, "grad_norm": 27.959579467773438, "loss": 20.6932, "lr": 0.000146, "step": 74, "tokens_trained": 0.007033584 }, { "epoch": 0.02156028368794326, "grad_norm": 26.569866180419922, "loss": 20.4072, "lr": 0.00015, "step": 76, "tokens_trained": 0.007224032 }, { "epoch": 0.022127659574468085, "grad_norm": 28.009904861450195, "loss": 20.2229, "lr": 0.000154, "step": 78, "tokens_trained": 0.00741368 }, { "epoch": 0.02269503546099291, "grad_norm": 28.892959594726562, "loss": 20.0528, "lr": 0.000158, "step": 80, "tokens_trained": 0.00760416 }, { "epoch": 0.02326241134751773, "grad_norm": 31.58131980895996, "loss": 19.8016, "lr": 0.000162, "step": 82, "tokens_trained": 0.007793952 }, { "epoch": 0.023829787234042554, "grad_norm": 31.01254653930664, "loss": 19.634, "lr": 0.00016600000000000002, "step": 84, "tokens_trained": 0.007980792 }, { "epoch": 0.024397163120567375, "grad_norm": 28.732515335083008, "loss": 19.3777, "lr": 0.00017, "step": 86, "tokens_trained": 0.008171968 }, { "epoch": 0.0249645390070922, "grad_norm": 24.31264877319336, "loss": 19.1346, "lr": 0.000174, "step": 88, "tokens_trained": 0.008361632 }, { "epoch": 0.02553191489361702, "grad_norm": 26.557010650634766, "loss": 19.0014, "lr": 0.000178, "step": 90, "tokens_trained": 0.008552328 }, { "epoch": 0.026099290780141844, "grad_norm": 21.156103134155273, "loss": 18.7032, "lr": 0.000182, "step": 92, "tokens_trained": 0.008743136 }, { "epoch": 0.02666666666666667, "grad_norm": 25.7484188079834, "loss": 18.4836, "lr": 0.000186, "step": 94, "tokens_trained": 0.008932056 }, { "epoch": 0.02723404255319149, "grad_norm": 22.27949333190918, "loss": 18.2233, "lr": 0.00019, "step": 96, "tokens_trained": 0.009121608 }, { "epoch": 0.027801418439716313, "grad_norm": 24.9247989654541, "loss": 17.9867, "lr": 0.000194, "step": 98, "tokens_trained": 0.009311008 }, { "epoch": 0.028368794326241134, "grad_norm": 24.302066802978516, "loss": 17.8016, "lr": 0.00019800000000000002, "step": 100, "tokens_trained": 0.009501456 }, { "epoch": 0.02893617021276596, "grad_norm": 23.458459854125977, "loss": 17.6295, "lr": 0.000202, "step": 102, "tokens_trained": 0.009693952 }, { "epoch": 0.02950354609929078, "grad_norm": 24.092350006103516, "loss": 17.4593, "lr": 0.000206, "step": 104, "tokens_trained": 0.009883328 }, { "epoch": 0.030070921985815603, "grad_norm": 22.54726219177246, "loss": 17.2141, "lr": 0.00021, "step": 106, "tokens_trained": 0.01007316 }, { "epoch": 0.030638297872340424, "grad_norm": 21.334760665893555, "loss": 17.044, "lr": 0.000214, "step": 108, "tokens_trained": 0.010266504 }, { "epoch": 0.031205673758865248, "grad_norm": 20.584287643432617, "loss": 16.8919, "lr": 0.000218, "step": 110, "tokens_trained": 0.010455736 }, { "epoch": 0.03177304964539007, "grad_norm": 23.51676368713379, "loss": 16.751, "lr": 0.000222, "step": 112, "tokens_trained": 0.010645208 }, { "epoch": 0.03234042553191489, "grad_norm": 23.278276443481445, "loss": 16.5997, "lr": 0.00022600000000000002, "step": 114, "tokens_trained": 0.010838928 }, { "epoch": 0.032907801418439714, "grad_norm": 25.4830265045166, "loss": 16.3416, "lr": 0.00023, "step": 116, "tokens_trained": 0.011027792 }, { "epoch": 0.03347517730496454, "grad_norm": 29.442413330078125, "loss": 16.24, "lr": 0.00023400000000000002, "step": 118, "tokens_trained": 0.011217456 }, { "epoch": 0.03404255319148936, "grad_norm": 21.77578353881836, "loss": 16.1922, "lr": 0.00023799999999999998, "step": 120, "tokens_trained": 0.01140804 }, { "epoch": 0.03460992907801418, "grad_norm": 27.040719985961914, "loss": 15.9059, "lr": 0.000242, "step": 122, "tokens_trained": 0.011597816 }, { "epoch": 0.035177304964539004, "grad_norm": 24.74480628967285, "loss": 15.7818, "lr": 0.000246, "step": 124, "tokens_trained": 0.011785624 }, { "epoch": 0.03546099290780142, "eval_loss": 15.553059577941895, "eval_runtime": 23.5485, "step": 125, "tokens_trained": 0.011880832 }, { "epoch": 0.03574468085106383, "grad_norm": 23.13482666015625, "loss": 15.5739, "lr": 0.00025, "step": 126, "tokens_trained": 0.011975976 }, { "epoch": 0.03631205673758865, "grad_norm": 22.8618106842041, "loss": 15.4302, "lr": 0.000254, "step": 128, "tokens_trained": 0.012166744 }, { "epoch": 0.03687943262411347, "grad_norm": 26.804859161376953, "loss": 15.3623, "lr": 0.00025800000000000004, "step": 130, "tokens_trained": 0.01235436 }, { "epoch": 0.0374468085106383, "grad_norm": 21.826601028442383, "loss": 15.1465, "lr": 0.000262, "step": 132, "tokens_trained": 0.012544976 }, { "epoch": 0.03801418439716312, "grad_norm": 39.447086334228516, "loss": 15.0137, "lr": 0.000266, "step": 134, "tokens_trained": 0.012736352 }, { "epoch": 0.03858156028368794, "grad_norm": 23.44275665283203, "loss": 14.9355, "lr": 0.00027, "step": 136, "tokens_trained": 0.012925008 }, { "epoch": 0.03914893617021276, "grad_norm": 21.631427764892578, "loss": 14.6825, "lr": 0.00027400000000000005, "step": 138, "tokens_trained": 0.013114672 }, { "epoch": 0.03971631205673759, "grad_norm": 23.674650192260742, "loss": 14.5194, "lr": 0.00027800000000000004, "step": 140, "tokens_trained": 0.013304016 }, { "epoch": 0.04028368794326241, "grad_norm": 23.974796295166016, "loss": 14.4829, "lr": 0.00028199999999999997, "step": 142, "tokens_trained": 0.013496696 }, { "epoch": 0.04085106382978723, "grad_norm": 26.112201690673828, "loss": 14.3027, "lr": 0.00028599999999999996, "step": 144, "tokens_trained": 0.013684816 }, { "epoch": 0.04141843971631206, "grad_norm": 20.67386817932129, "loss": 14.1499, "lr": 0.00029, "step": 146, "tokens_trained": 0.013874832 }, { "epoch": 0.04198581560283688, "grad_norm": 24.253408432006836, "loss": 13.9378, "lr": 0.000294, "step": 148, "tokens_trained": 0.014065056 }, { "epoch": 0.0425531914893617, "grad_norm": 35.716087341308594, "loss": 14.0562, "lr": 0.000298, "step": 150, "tokens_trained": 0.014256784 }, { "epoch": 0.04312056737588652, "grad_norm": 29.414331436157227, "loss": 14.0462, "lr": 0.000302, "step": 152, "tokens_trained": 0.014446312 }, { "epoch": 0.04368794326241135, "grad_norm": 30.687482833862305, "loss": 13.7603, "lr": 0.000306, "step": 154, "tokens_trained": 0.014639872 }, { "epoch": 0.04425531914893617, "grad_norm": 29.806455612182617, "loss": 13.708, "lr": 0.00031, "step": 156, "tokens_trained": 0.014831112 }, { "epoch": 0.04482269503546099, "grad_norm": 24.900897979736328, "loss": 13.548, "lr": 0.000314, "step": 158, "tokens_trained": 0.015021288 }, { "epoch": 0.04539007092198582, "grad_norm": 24.29252815246582, "loss": 13.3119, "lr": 0.00031800000000000003, "step": 160, "tokens_trained": 0.01521228 }, { "epoch": 0.04595744680851064, "grad_norm": 20.68342399597168, "loss": 13.1829, "lr": 0.000322, "step": 162, "tokens_trained": 0.015403688 }, { "epoch": 0.04652482269503546, "grad_norm": 20.822795867919922, "loss": 12.9044, "lr": 0.000326, "step": 164, "tokens_trained": 0.015593416 }, { "epoch": 0.04709219858156028, "grad_norm": 21.689916610717773, "loss": 12.6862, "lr": 0.00033, "step": 166, "tokens_trained": 0.015784408 }, { "epoch": 0.04765957446808511, "grad_norm": 17.873889923095703, "loss": 12.5502, "lr": 0.00033400000000000004, "step": 168, "tokens_trained": 0.0159744 }, { "epoch": 0.04822695035460993, "grad_norm": 18.951616287231445, "loss": 12.308, "lr": 0.00033800000000000003, "step": 170, "tokens_trained": 0.016163736 }, { "epoch": 0.04879432624113475, "grad_norm": 15.146363258361816, "loss": 12.1558, "lr": 0.000342, "step": 172, "tokens_trained": 0.016353832 }, { "epoch": 0.04936170212765958, "grad_norm": 18.336984634399414, "loss": 12.0386, "lr": 0.000346, "step": 174, "tokens_trained": 0.016545088 }, { "epoch": 0.0499290780141844, "grad_norm": 17.221126556396484, "loss": 11.8791, "lr": 0.00035, "step": 176, "tokens_trained": 0.016735704 }, { "epoch": 0.05049645390070922, "grad_norm": 19.362564086914062, "loss": 11.7224, "lr": 0.000354, "step": 178, "tokens_trained": 0.016927944 }, { "epoch": 0.05106382978723404, "grad_norm": 15.564507484436035, "loss": 11.6448, "lr": 0.000358, "step": 180, "tokens_trained": 0.017116096 }, { "epoch": 0.05163120567375887, "grad_norm": 20.711383819580078, "loss": 11.4398, "lr": 0.000362, "step": 182, "tokens_trained": 0.01730564 }, { "epoch": 0.05219858156028369, "grad_norm": 18.627403259277344, "loss": 11.3377, "lr": 0.000366, "step": 184, "tokens_trained": 0.017495864 }, { "epoch": 0.05276595744680851, "grad_norm": 15.00942325592041, "loss": 11.1416, "lr": 0.00037, "step": 186, "tokens_trained": 0.017686464 }, { "epoch": 0.05333333333333334, "grad_norm": 17.070598602294922, "loss": 11.0148, "lr": 0.000374, "step": 188, "tokens_trained": 0.017879488 }, { "epoch": 0.05390070921985816, "grad_norm": 16.101457595825195, "loss": 10.8874, "lr": 0.000378, "step": 190, "tokens_trained": 0.018068312 }, { "epoch": 0.05446808510638298, "grad_norm": 15.613334655761719, "loss": 10.7055, "lr": 0.000382, "step": 192, "tokens_trained": 0.018255752 }, { "epoch": 0.0550354609929078, "grad_norm": 17.671857833862305, "loss": 10.5706, "lr": 0.000386, "step": 194, "tokens_trained": 0.018447096 }, { "epoch": 0.05560283687943263, "grad_norm": 16.080909729003906, "loss": 10.4476, "lr": 0.00039000000000000005, "step": 196, "tokens_trained": 0.018637264 }, { "epoch": 0.05617021276595745, "grad_norm": 15.02849292755127, "loss": 10.2962, "lr": 0.00039400000000000004, "step": 198, "tokens_trained": 0.018827552 }, { "epoch": 0.05673758865248227, "grad_norm": 14.990167617797852, "loss": 10.1912, "lr": 0.000398, "step": 200, "tokens_trained": 0.019018 }, { "epoch": 0.05730496453900709, "grad_norm": 15.390633583068848, "loss": 10.0442, "lr": 0.000402, "step": 202, "tokens_trained": 0.019209864 }, { "epoch": 0.05787234042553192, "grad_norm": 16.871570587158203, "loss": 9.9685, "lr": 0.00040600000000000006, "step": 204, "tokens_trained": 0.019400176 }, { "epoch": 0.05843971631205674, "grad_norm": 20.16544532775879, "loss": 9.8531, "lr": 0.00041, "step": 206, "tokens_trained": 0.019589424 }, { "epoch": 0.05900709219858156, "grad_norm": 16.825023651123047, "loss": 9.7777, "lr": 0.000414, "step": 208, "tokens_trained": 0.019779112 }, { "epoch": 0.059574468085106386, "grad_norm": 16.43510627746582, "loss": 9.6122, "lr": 0.00041799999999999997, "step": 210, "tokens_trained": 0.019970048 }, { "epoch": 0.060141843971631206, "grad_norm": 17.340473175048828, "loss": 9.4859, "lr": 0.000422, "step": 212, "tokens_trained": 0.020160968 }, { "epoch": 0.06070921985815603, "grad_norm": 15.019119262695312, "loss": 9.3656, "lr": 0.000426, "step": 214, "tokens_trained": 0.020349664 }, { "epoch": 0.06127659574468085, "grad_norm": 13.379194259643555, "loss": 9.2348, "lr": 0.00043, "step": 216, "tokens_trained": 0.020538192 }, { "epoch": 0.061843971631205676, "grad_norm": 16.71472930908203, "loss": 9.2258, "lr": 0.00043400000000000003, "step": 218, "tokens_trained": 0.020728936 }, { "epoch": 0.062411347517730496, "grad_norm": 12.743139266967773, "loss": 9.0569, "lr": 0.000438, "step": 220, "tokens_trained": 0.020917472 }, { "epoch": 0.06297872340425532, "grad_norm": 15.739934921264648, "loss": 8.9623, "lr": 0.000442, "step": 222, "tokens_trained": 0.02110928 }, { "epoch": 0.06354609929078014, "grad_norm": 14.23620891571045, "loss": 8.8201, "lr": 0.000446, "step": 224, "tokens_trained": 0.021300168 }, { "epoch": 0.06411347517730497, "grad_norm": 13.005538940429688, "loss": 8.7235, "lr": 0.00045000000000000004, "step": 226, "tokens_trained": 0.021490272 }, { "epoch": 0.06468085106382979, "grad_norm": 17.17629051208496, "loss": 8.6907, "lr": 0.00045400000000000003, "step": 228, "tokens_trained": 0.021681552 }, { "epoch": 0.06524822695035461, "grad_norm": 14.430739402770996, "loss": 8.6196, "lr": 0.000458, "step": 230, "tokens_trained": 0.02187236 }, { "epoch": 0.06581560283687943, "grad_norm": 14.575714111328125, "loss": 8.4741, "lr": 0.000462, "step": 232, "tokens_trained": 0.022061976 }, { "epoch": 0.06638297872340425, "grad_norm": 13.892754554748535, "loss": 8.4118, "lr": 0.00046600000000000005, "step": 234, "tokens_trained": 0.022252008 }, { "epoch": 0.06695035460992908, "grad_norm": 11.58240795135498, "loss": 8.2781, "lr": 0.00047, "step": 236, "tokens_trained": 0.02244284 }, { "epoch": 0.0675177304964539, "grad_norm": 13.022644996643066, "loss": 8.2139, "lr": 0.000474, "step": 238, "tokens_trained": 0.022631152 }, { "epoch": 0.06808510638297872, "grad_norm": 11.844677925109863, "loss": 8.1134, "lr": 0.00047799999999999996, "step": 240, "tokens_trained": 0.022821096 }, { "epoch": 0.06865248226950355, "grad_norm": 13.878067016601562, "loss": 8.0221, "lr": 0.000482, "step": 242, "tokens_trained": 0.023011656 }, { "epoch": 0.06921985815602837, "grad_norm": 12.34648323059082, "loss": 7.9755, "lr": 0.000486, "step": 244, "tokens_trained": 0.023201 }, { "epoch": 0.06978723404255319, "grad_norm": 14.238297462463379, "loss": 7.8969, "lr": 0.00049, "step": 246, "tokens_trained": 0.023391128 }, { "epoch": 0.07035460992907801, "grad_norm": 14.386019706726074, "loss": 7.8627, "lr": 0.000494, "step": 248, "tokens_trained": 0.023581768 }, { "epoch": 0.07092198581560284, "grad_norm": 13.623086929321289, "loss": 7.7568, "lr": 0.000498, "step": 250, "tokens_trained": 0.023771248 }, { "epoch": 0.07092198581560284, "eval_loss": 7.70297384262085, "eval_runtime": 21.3853, "step": 250, "tokens_trained": 0.023771248 }, { "epoch": 0.07148936170212766, "grad_norm": 14.347646713256836, "loss": 7.6842, "lr": 0.0005020000000000001, "step": 252, "tokens_trained": 0.023961056 }, { "epoch": 0.07205673758865248, "grad_norm": 12.5592041015625, "loss": 7.6516, "lr": 0.000506, "step": 254, "tokens_trained": 0.024150968 }, { "epoch": 0.0726241134751773, "grad_norm": 13.219141960144043, "loss": 7.5789, "lr": 0.00051, "step": 256, "tokens_trained": 0.024340072 }, { "epoch": 0.07319148936170213, "grad_norm": 12.654081344604492, "loss": 7.5369, "lr": 0.000514, "step": 258, "tokens_trained": 0.024529296 }, { "epoch": 0.07375886524822695, "grad_norm": 13.136971473693848, "loss": 7.4949, "lr": 0.000518, "step": 260, "tokens_trained": 0.024719688 }, { "epoch": 0.07432624113475177, "grad_norm": 12.680288314819336, "loss": 7.3904, "lr": 0.000522, "step": 262, "tokens_trained": 0.024909632 }, { "epoch": 0.0748936170212766, "grad_norm": 12.754518508911133, "loss": 7.3514, "lr": 0.000526, "step": 264, "tokens_trained": 0.025098416 }, { "epoch": 0.07546099290780142, "grad_norm": 13.22311019897461, "loss": 7.2951, "lr": 0.0005300000000000001, "step": 266, "tokens_trained": 0.025287344 }, { "epoch": 0.07602836879432624, "grad_norm": 12.11903190612793, "loss": 7.2229, "lr": 0.0005340000000000001, "step": 268, "tokens_trained": 0.025477152 }, { "epoch": 0.07659574468085106, "grad_norm": 13.771833419799805, "loss": 7.1815, "lr": 0.0005380000000000001, "step": 270, "tokens_trained": 0.025668288 }, { "epoch": 0.07716312056737588, "grad_norm": 11.756864547729492, "loss": 7.1669, "lr": 0.0005420000000000001, "step": 272, "tokens_trained": 0.025858528 }, { "epoch": 0.0777304964539007, "grad_norm": 13.613094329833984, "loss": 7.1079, "lr": 0.000546, "step": 274, "tokens_trained": 0.026048616 }, { "epoch": 0.07829787234042553, "grad_norm": 10.001923561096191, "loss": 7.0508, "lr": 0.00055, "step": 276, "tokens_trained": 0.026236944 }, { "epoch": 0.07886524822695036, "grad_norm": 14.262083053588867, "loss": 6.9955, "lr": 0.000554, "step": 278, "tokens_trained": 0.026426848 }, { "epoch": 0.07943262411347518, "grad_norm": 12.381136894226074, "loss": 6.9831, "lr": 0.000558, "step": 280, "tokens_trained": 0.026616784 }, { "epoch": 0.08, "grad_norm": 9.815845489501953, "loss": 6.917, "lr": 0.0005620000000000001, "step": 282, "tokens_trained": 0.026805176 }, { "epoch": 0.08056737588652482, "grad_norm": 11.669997215270996, "loss": 6.8999, "lr": 0.000566, "step": 284, "tokens_trained": 0.02699488 }, { "epoch": 0.08113475177304964, "grad_norm": 12.770941734313965, "loss": 6.8998, "lr": 0.00057, "step": 286, "tokens_trained": 0.027185784 }, { "epoch": 0.08170212765957446, "grad_norm": 15.572457313537598, "loss": 6.841, "lr": 0.000574, "step": 288, "tokens_trained": 0.027375896 }, { "epoch": 0.08226950354609928, "grad_norm": 10.980833053588867, "loss": 6.8545, "lr": 0.000578, "step": 290, "tokens_trained": 0.02756588 }, { "epoch": 0.08283687943262412, "grad_norm": 11.678337097167969, "loss": 6.7853, "lr": 0.0005819999999999999, "step": 292, "tokens_trained": 0.02775456 }, { "epoch": 0.08340425531914894, "grad_norm": 9.77885913848877, "loss": 6.7465, "lr": 0.0005859999999999999, "step": 294, "tokens_trained": 0.027942856 }, { "epoch": 0.08397163120567376, "grad_norm": 13.62730884552002, "loss": 6.7276, "lr": 0.00059, "step": 296, "tokens_trained": 0.028133152 }, { "epoch": 0.08453900709219858, "grad_norm": 10.644404411315918, "loss": 6.6802, "lr": 0.000594, "step": 298, "tokens_trained": 0.028322192 }, { "epoch": 0.0851063829787234, "grad_norm": 11.130610466003418, "loss": 6.6548, "lr": 0.000598, "step": 300, "tokens_trained": 0.0285122 }, { "epoch": 0.08567375886524822, "grad_norm": 11.557455062866211, "loss": 6.6155, "lr": 0.000602, "step": 302, "tokens_trained": 0.028699792 }, { "epoch": 0.08624113475177304, "grad_norm": 9.276884078979492, "loss": 6.5989, "lr": 0.000606, "step": 304, "tokens_trained": 0.028889896 }, { "epoch": 0.08680851063829788, "grad_norm": 9.616179466247559, "loss": 6.5773, "lr": 0.00061, "step": 306, "tokens_trained": 0.029082272 }, { "epoch": 0.0873758865248227, "grad_norm": 10.575953483581543, "loss": 6.5358, "lr": 0.000614, "step": 308, "tokens_trained": 0.029273352 }, { "epoch": 0.08794326241134752, "grad_norm": 9.089850425720215, "loss": 6.5088, "lr": 0.0006180000000000001, "step": 310, "tokens_trained": 0.029463848 }, { "epoch": 0.08851063829787234, "grad_norm": 9.090002059936523, "loss": 6.4849, "lr": 0.000622, "step": 312, "tokens_trained": 0.029653272 }, { "epoch": 0.08907801418439716, "grad_norm": 12.038308143615723, "loss": 6.4624, "lr": 0.000626, "step": 314, "tokens_trained": 0.029841928 }, { "epoch": 0.08964539007092198, "grad_norm": 9.073866844177246, "loss": 6.4515, "lr": 0.00063, "step": 316, "tokens_trained": 0.030029808 }, { "epoch": 0.0902127659574468, "grad_norm": 8.727197647094727, "loss": 6.43, "lr": 0.000634, "step": 318, "tokens_trained": 0.030221288 }, { "epoch": 0.09078014184397164, "grad_norm": 14.558151245117188, "loss": 6.4487, "lr": 0.000638, "step": 320, "tokens_trained": 0.030410872 }, { "epoch": 0.09134751773049646, "grad_norm": 9.98914623260498, "loss": 6.4279, "lr": 0.000642, "step": 322, "tokens_trained": 0.030602376 }, { "epoch": 0.09191489361702128, "grad_norm": 10.395442962646484, "loss": 6.4311, "lr": 0.000646, "step": 324, "tokens_trained": 0.030792968 }, { "epoch": 0.0924822695035461, "grad_norm": 10.8250093460083, "loss": 6.3726, "lr": 0.0006500000000000001, "step": 326, "tokens_trained": 0.030982944 }, { "epoch": 0.09304964539007092, "grad_norm": 9.73416805267334, "loss": 6.34, "lr": 0.0006540000000000001, "step": 328, "tokens_trained": 0.031174928 }, { "epoch": 0.09361702127659574, "grad_norm": 8.596503257751465, "loss": 6.3322, "lr": 0.0006580000000000001, "step": 330, "tokens_trained": 0.031364288 }, { "epoch": 0.09418439716312056, "grad_norm": 8.49472427368164, "loss": 6.3096, "lr": 0.000662, "step": 332, "tokens_trained": 0.03155376 }, { "epoch": 0.0947517730496454, "grad_norm": 7.857503414154053, "loss": 6.2368, "lr": 0.000666, "step": 334, "tokens_trained": 0.031744368 }, { "epoch": 0.09531914893617022, "grad_norm": 9.007513999938965, "loss": 6.198, "lr": 0.00067, "step": 336, "tokens_trained": 0.031934136 }, { "epoch": 0.09588652482269504, "grad_norm": 8.185524940490723, "loss": 6.2328, "lr": 0.000674, "step": 338, "tokens_trained": 0.032124984 }, { "epoch": 0.09645390070921986, "grad_norm": 8.784396171569824, "loss": 6.1945, "lr": 0.0006780000000000001, "step": 340, "tokens_trained": 0.032316016 }, { "epoch": 0.09702127659574468, "grad_norm": 8.642311096191406, "loss": 6.218, "lr": 0.0006820000000000001, "step": 342, "tokens_trained": 0.032506224 }, { "epoch": 0.0975886524822695, "grad_norm": 8.493780136108398, "loss": 6.194, "lr": 0.0006860000000000001, "step": 344, "tokens_trained": 0.032696152 }, { "epoch": 0.09815602836879432, "grad_norm": 9.120508193969727, "loss": 6.2241, "lr": 0.00069, "step": 346, "tokens_trained": 0.032885688 }, { "epoch": 0.09872340425531916, "grad_norm": 9.34500503540039, "loss": 6.1548, "lr": 0.000694, "step": 348, "tokens_trained": 0.03307568 }, { "epoch": 0.09929078014184398, "grad_norm": 7.483356952667236, "loss": 6.1282, "lr": 0.0006979999999999999, "step": 350, "tokens_trained": 0.033267208 }, { "epoch": 0.0998581560283688, "grad_norm": 7.974069118499756, "loss": 6.1032, "lr": 0.0007019999999999999, "step": 352, "tokens_trained": 0.033458144 }, { "epoch": 0.10042553191489362, "grad_norm": 8.247384071350098, "loss": 6.1698, "lr": 0.0007059999999999999, "step": 354, "tokens_trained": 0.033650352 }, { "epoch": 0.10099290780141844, "grad_norm": 8.554885864257812, "loss": 6.1429, "lr": 0.00071, "step": 356, "tokens_trained": 0.033840232 }, { "epoch": 0.10156028368794326, "grad_norm": 7.209281921386719, "loss": 6.0997, "lr": 0.000714, "step": 358, "tokens_trained": 0.034030032 }, { "epoch": 0.10212765957446808, "grad_norm": 8.660383224487305, "loss": 6.1497, "lr": 0.000718, "step": 360, "tokens_trained": 0.034218592 }, { "epoch": 0.10269503546099291, "grad_norm": 9.382761001586914, "loss": 6.0665, "lr": 0.000722, "step": 362, "tokens_trained": 0.034408408 }, { "epoch": 0.10326241134751774, "grad_norm": 6.915714263916016, "loss": 6.0636, "lr": 0.000726, "step": 364, "tokens_trained": 0.034600016 }, { "epoch": 0.10382978723404256, "grad_norm": 7.8990631103515625, "loss": 6.0975, "lr": 0.00073, "step": 366, "tokens_trained": 0.034790792 }, { "epoch": 0.10439716312056738, "grad_norm": 8.859809875488281, "loss": 6.0754, "lr": 0.000734, "step": 368, "tokens_trained": 0.034981304 }, { "epoch": 0.1049645390070922, "grad_norm": 7.392801761627197, "loss": 6.039, "lr": 0.000738, "step": 370, "tokens_trained": 0.03516956 }, { "epoch": 0.10553191489361702, "grad_norm": 9.427324295043945, "loss": 6.084, "lr": 0.000742, "step": 372, "tokens_trained": 0.035358816 }, { "epoch": 0.10609929078014184, "grad_norm": 7.168910503387451, "loss": 6.0498, "lr": 0.000746, "step": 374, "tokens_trained": 0.035548016 }, { "epoch": 0.10638297872340426, "eval_loss": 6.038269996643066, "eval_runtime": 21.3445, "step": 375, "tokens_trained": 0.035644104 }, { "epoch": 0.10666666666666667, "grad_norm": 7.899259567260742, "loss": 6.0345, "lr": 0.00075, "step": 376, "tokens_trained": 0.035739856 }, { "epoch": 0.1072340425531915, "grad_norm": 8.91533374786377, "loss": 6.0386, "lr": 0.000754, "step": 378, "tokens_trained": 0.035930264 }, { "epoch": 0.10780141843971631, "grad_norm": 6.998043060302734, "loss": 6.0294, "lr": 0.000758, "step": 380, "tokens_trained": 0.036119616 }, { "epoch": 0.10836879432624114, "grad_norm": 7.343894958496094, "loss": 6.0116, "lr": 0.000762, "step": 382, "tokens_trained": 0.036308416 }, { "epoch": 0.10893617021276596, "grad_norm": 8.182528495788574, "loss": 5.9904, "lr": 0.0007660000000000001, "step": 384, "tokens_trained": 0.036497264 }, { "epoch": 0.10950354609929078, "grad_norm": 7.927818775177002, "loss": 6.0345, "lr": 0.0007700000000000001, "step": 386, "tokens_trained": 0.036688192 }, { "epoch": 0.1100709219858156, "grad_norm": 8.07447338104248, "loss": 5.9685, "lr": 0.0007740000000000001, "step": 388, "tokens_trained": 0.036878256 }, { "epoch": 0.11063829787234042, "grad_norm": 7.281871318817139, "loss": 6.0125, "lr": 0.000778, "step": 390, "tokens_trained": 0.037068272 }, { "epoch": 0.11120567375886525, "grad_norm": 8.298929214477539, "loss": 6.0071, "lr": 0.000782, "step": 392, "tokens_trained": 0.037259464 }, { "epoch": 0.11177304964539007, "grad_norm": 7.546716690063477, "loss": 5.9721, "lr": 0.000786, "step": 394, "tokens_trained": 0.037449696 }, { "epoch": 0.1123404255319149, "grad_norm": 8.28548526763916, "loss": 5.9819, "lr": 0.00079, "step": 396, "tokens_trained": 0.037639672 }, { "epoch": 0.11290780141843972, "grad_norm": 7.064655303955078, "loss": 5.9873, "lr": 0.0007940000000000001, "step": 398, "tokens_trained": 0.03782712 }, { "epoch": 0.11347517730496454, "grad_norm": 7.743175506591797, "loss": 5.9528, "lr": 0.0007980000000000001, "step": 400, "tokens_trained": 0.03801792 }, { "epoch": 0.11404255319148936, "grad_norm": 7.00898551940918, "loss": 5.9504, "lr": 0.0008020000000000001, "step": 402, "tokens_trained": 0.038209176 }, { "epoch": 0.11460992907801418, "grad_norm": 7.9350409507751465, "loss": 5.9555, "lr": 0.0008060000000000001, "step": 404, "tokens_trained": 0.03839824 }, { "epoch": 0.11517730496453901, "grad_norm": 7.048569679260254, "loss": 5.9787, "lr": 0.0008100000000000001, "step": 406, "tokens_trained": 0.03858732 }, { "epoch": 0.11574468085106383, "grad_norm": 7.088194370269775, "loss": 5.928, "lr": 0.0008139999999999999, "step": 408, "tokens_trained": 0.038777712 }, { "epoch": 0.11631205673758865, "grad_norm": 8.230712890625, "loss": 5.9716, "lr": 0.0008179999999999999, "step": 410, "tokens_trained": 0.038969464 }, { "epoch": 0.11687943262411347, "grad_norm": 8.076972007751465, "loss": 5.9624, "lr": 0.0008219999999999999, "step": 412, "tokens_trained": 0.039162064 }, { "epoch": 0.1174468085106383, "grad_norm": 8.065289497375488, "loss": 5.9937, "lr": 0.000826, "step": 414, "tokens_trained": 0.039348688 }, { "epoch": 0.11801418439716312, "grad_norm": 6.393420696258545, "loss": 5.9278, "lr": 0.00083, "step": 416, "tokens_trained": 0.03953732 }, { "epoch": 0.11858156028368794, "grad_norm": 7.384702682495117, "loss": 5.931, "lr": 0.000834, "step": 418, "tokens_trained": 0.039729808 }, { "epoch": 0.11914893617021277, "grad_norm": 7.007425308227539, "loss": 5.93, "lr": 0.000838, "step": 420, "tokens_trained": 0.039921096 }, { "epoch": 0.11971631205673759, "grad_norm": 7.112692832946777, "loss": 5.9625, "lr": 0.000842, "step": 422, "tokens_trained": 0.040110856 }, { "epoch": 0.12028368794326241, "grad_norm": 8.484418869018555, "loss": 5.9848, "lr": 0.000846, "step": 424, "tokens_trained": 0.040300504 }, { "epoch": 0.12085106382978723, "grad_norm": 6.633459091186523, "loss": 6.0226, "lr": 0.00085, "step": 426, "tokens_trained": 0.04049056 }, { "epoch": 0.12141843971631205, "grad_norm": 7.796964168548584, "loss": 5.9152, "lr": 0.000854, "step": 428, "tokens_trained": 0.040680544 }, { "epoch": 0.12198581560283688, "grad_norm": 7.833578586578369, "loss": 5.924, "lr": 0.000858, "step": 430, "tokens_trained": 0.040873128 }, { "epoch": 0.1225531914893617, "grad_norm": 6.7470550537109375, "loss": 5.9318, "lr": 0.000862, "step": 432, "tokens_trained": 0.041063488 }, { "epoch": 0.12312056737588653, "grad_norm": 6.066318988800049, "loss": 5.9569, "lr": 0.000866, "step": 434, "tokens_trained": 0.041254368 }, { "epoch": 0.12368794326241135, "grad_norm": 6.753541469573975, "loss": 5.8851, "lr": 0.00087, "step": 436, "tokens_trained": 0.04144516 }, { "epoch": 0.12425531914893617, "grad_norm": 6.471331596374512, "loss": 5.864, "lr": 0.000874, "step": 438, "tokens_trained": 0.041636912 }, { "epoch": 0.12482269503546099, "grad_norm": 6.129056930541992, "loss": 5.8965, "lr": 0.000878, "step": 440, "tokens_trained": 0.041828104 }, { "epoch": 0.1253900709219858, "grad_norm": 6.478890895843506, "loss": 5.8817, "lr": 0.000882, "step": 442, "tokens_trained": 0.04201808 }, { "epoch": 0.12595744680851065, "grad_norm": 6.014713287353516, "loss": 5.8268, "lr": 0.0008860000000000001, "step": 444, "tokens_trained": 0.042207328 }, { "epoch": 0.12652482269503545, "grad_norm": 5.505755424499512, "loss": 5.8684, "lr": 0.0008900000000000001, "step": 446, "tokens_trained": 0.042398152 }, { "epoch": 0.1270921985815603, "grad_norm": 10.096606254577637, "loss": 5.8608, "lr": 0.000894, "step": 448, "tokens_trained": 0.042588984 }, { "epoch": 0.1276595744680851, "grad_norm": 6.388499736785889, "loss": 5.8766, "lr": 0.000898, "step": 450, "tokens_trained": 0.042778592 }, { "epoch": 0.12822695035460993, "grad_norm": 7.145125865936279, "loss": 5.8571, "lr": 0.000902, "step": 452, "tokens_trained": 0.042967176 }, { "epoch": 0.12879432624113477, "grad_norm": 6.826383113861084, "loss": 5.8655, "lr": 0.000906, "step": 454, "tokens_trained": 0.043158952 }, { "epoch": 0.12936170212765957, "grad_norm": 6.036892414093018, "loss": 5.8775, "lr": 0.00091, "step": 456, "tokens_trained": 0.043349288 }, { "epoch": 0.1299290780141844, "grad_norm": 6.36528205871582, "loss": 5.8908, "lr": 0.0009140000000000001, "step": 458, "tokens_trained": 0.043539888 }, { "epoch": 0.13049645390070921, "grad_norm": 6.317558288574219, "loss": 5.8702, "lr": 0.0009180000000000001, "step": 460, "tokens_trained": 0.04373232 }, { "epoch": 0.13106382978723405, "grad_norm": 6.427131175994873, "loss": 5.8399, "lr": 0.0009220000000000001, "step": 462, "tokens_trained": 0.043922744 }, { "epoch": 0.13163120567375886, "grad_norm": 5.666539669036865, "loss": 5.7899, "lr": 0.0009260000000000001, "step": 464, "tokens_trained": 0.044112888 }, { "epoch": 0.1321985815602837, "grad_norm": 5.241824150085449, "loss": 5.8203, "lr": 0.00093, "step": 466, "tokens_trained": 0.04430244 }, { "epoch": 0.1327659574468085, "grad_norm": 6.072646141052246, "loss": 5.8367, "lr": 0.000934, "step": 468, "tokens_trained": 0.044493528 }, { "epoch": 0.13333333333333333, "grad_norm": 6.414418697357178, "loss": 5.8236, "lr": 0.0009379999999999999, "step": 470, "tokens_trained": 0.044682328 }, { "epoch": 0.13390070921985817, "grad_norm": 6.958801746368408, "loss": 5.8179, "lr": 0.000942, "step": 472, "tokens_trained": 0.044874256 }, { "epoch": 0.13446808510638297, "grad_norm": 5.787843227386475, "loss": 5.8478, "lr": 0.000946, "step": 474, "tokens_trained": 0.045065616 }, { "epoch": 0.1350354609929078, "grad_norm": 5.5841240882873535, "loss": 5.8307, "lr": 0.00095, "step": 476, "tokens_trained": 0.045257024 }, { "epoch": 0.13560283687943261, "grad_norm": 6.607712745666504, "loss": 5.8512, "lr": 0.000954, "step": 478, "tokens_trained": 0.045446432 }, { "epoch": 0.13617021276595745, "grad_norm": 5.473597049713135, "loss": 5.8174, "lr": 0.000958, "step": 480, "tokens_trained": 0.045636392 }, { "epoch": 0.13673758865248226, "grad_norm": 5.435728549957275, "loss": 5.8308, "lr": 0.000962, "step": 482, "tokens_trained": 0.045823784 }, { "epoch": 0.1373049645390071, "grad_norm": 6.049300670623779, "loss": 5.8293, "lr": 0.000966, "step": 484, "tokens_trained": 0.046013408 }, { "epoch": 0.13787234042553193, "grad_norm": 6.311764717102051, "loss": 5.8086, "lr": 0.0009699999999999999, "step": 486, "tokens_trained": 0.046202528 }, { "epoch": 0.13843971631205673, "grad_norm": 5.886009216308594, "loss": 5.7986, "lr": 0.000974, "step": 488, "tokens_trained": 0.04639404 }, { "epoch": 0.13900709219858157, "grad_norm": 5.438202381134033, "loss": 5.8473, "lr": 0.000978, "step": 490, "tokens_trained": 0.046586512 }, { "epoch": 0.13957446808510637, "grad_norm": 5.08393669128418, "loss": 5.7613, "lr": 0.000982, "step": 492, "tokens_trained": 0.046777448 }, { "epoch": 0.1401418439716312, "grad_norm": 5.645389080047607, "loss": 5.7723, "lr": 0.0009860000000000001, "step": 494, "tokens_trained": 0.046966096 }, { "epoch": 0.14070921985815601, "grad_norm": 6.320916652679443, "loss": 5.7772, "lr": 0.00099, "step": 496, "tokens_trained": 0.047155152 }, { "epoch": 0.14127659574468085, "grad_norm": 5.573540210723877, "loss": 5.7412, "lr": 0.000994, "step": 498, "tokens_trained": 0.047345352 }, { "epoch": 0.14184397163120568, "grad_norm": 4.939594745635986, "loss": 5.8208, "lr": 0.000998, "step": 500, "tokens_trained": 0.047535016 }, { "epoch": 0.14184397163120568, "eval_loss": 5.799490928649902, "eval_runtime": 20.8575, "step": 500, "tokens_trained": 0.047535016 }, { "epoch": 0.1424113475177305, "grad_norm": 5.805343151092529, "loss": 5.7734, "lr": 0.00099986013986014, "step": 502, "tokens_trained": 0.047724216 }, { "epoch": 0.14297872340425533, "grad_norm": 5.831176280975342, "loss": 5.8044, "lr": 0.0009995804195804196, "step": 504, "tokens_trained": 0.047914328 }, { "epoch": 0.14354609929078013, "grad_norm": 5.045091152191162, "loss": 5.8133, "lr": 0.0009993006993006994, "step": 506, "tokens_trained": 0.048105032 }, { "epoch": 0.14411347517730497, "grad_norm": 5.276819705963135, "loss": 5.7555, "lr": 0.000999020979020979, "step": 508, "tokens_trained": 0.048293104 }, { "epoch": 0.14468085106382977, "grad_norm": 5.710324287414551, "loss": 5.7619, "lr": 0.0009987412587412587, "step": 510, "tokens_trained": 0.048483888 }, { "epoch": 0.1452482269503546, "grad_norm": 4.9472527503967285, "loss": 5.767, "lr": 0.0009984615384615386, "step": 512, "tokens_trained": 0.04867336 }, { "epoch": 0.14581560283687944, "grad_norm": 5.410078525543213, "loss": 5.7238, "lr": 0.0009981818181818182, "step": 514, "tokens_trained": 0.048863104 }, { "epoch": 0.14638297872340425, "grad_norm": 6.025843143463135, "loss": 5.7664, "lr": 0.000997902097902098, "step": 516, "tokens_trained": 0.049053856 }, { "epoch": 0.14695035460992908, "grad_norm": 5.3211669921875, "loss": 5.747, "lr": 0.0009976223776223777, "step": 518, "tokens_trained": 0.049245104 }, { "epoch": 0.1475177304964539, "grad_norm": 6.059483051300049, "loss": 5.7611, "lr": 0.0009973426573426573, "step": 520, "tokens_trained": 0.049434368 }, { "epoch": 0.14808510638297873, "grad_norm": 5.362505912780762, "loss": 5.7607, "lr": 0.000997062937062937, "step": 522, "tokens_trained": 0.049622648 }, { "epoch": 0.14865248226950353, "grad_norm": 5.391371726989746, "loss": 5.7857, "lr": 0.0009967832167832168, "step": 524, "tokens_trained": 0.049812304 }, { "epoch": 0.14921985815602837, "grad_norm": 4.3839030265808105, "loss": 5.7334, "lr": 0.0009965034965034964, "step": 526, "tokens_trained": 0.05000356 }, { "epoch": 0.1497872340425532, "grad_norm": 5.008530616760254, "loss": 5.7475, "lr": 0.0009962237762237763, "step": 528, "tokens_trained": 0.050193304 }, { "epoch": 0.150354609929078, "grad_norm": 5.068671226501465, "loss": 5.7866, "lr": 0.000995944055944056, "step": 530, "tokens_trained": 0.050382856 }, { "epoch": 0.15092198581560284, "grad_norm": 5.399240493774414, "loss": 5.6857, "lr": 0.0009956643356643356, "step": 532, "tokens_trained": 0.050570864 }, { "epoch": 0.15148936170212765, "grad_norm": 5.689481735229492, "loss": 5.7586, "lr": 0.0009953846153846154, "step": 534, "tokens_trained": 0.050760384 }, { "epoch": 0.15205673758865249, "grad_norm": 4.652275562286377, "loss": 5.7866, "lr": 0.000995104895104895, "step": 536, "tokens_trained": 0.050952712 }, { "epoch": 0.1526241134751773, "grad_norm": 4.126920223236084, "loss": 5.7261, "lr": 0.000994825174825175, "step": 538, "tokens_trained": 0.051141656 }, { "epoch": 0.15319148936170213, "grad_norm": 4.233098030090332, "loss": 5.6903, "lr": 0.0009945454545454546, "step": 540, "tokens_trained": 0.051331256 }, { "epoch": 0.15375886524822696, "grad_norm": 4.271973133087158, "loss": 5.7293, "lr": 0.0009942657342657344, "step": 542, "tokens_trained": 0.051522072 }, { "epoch": 0.15432624113475177, "grad_norm": 4.653008937835693, "loss": 5.7133, "lr": 0.000993986013986014, "step": 544, "tokens_trained": 0.051711624 }, { "epoch": 0.1548936170212766, "grad_norm": 4.192624092102051, "loss": 5.6876, "lr": 0.0009937062937062937, "step": 546, "tokens_trained": 0.051901744 }, { "epoch": 0.1554609929078014, "grad_norm": 5.497848033905029, "loss": 5.7378, "lr": 0.0009934265734265735, "step": 548, "tokens_trained": 0.052092872 }, { "epoch": 0.15602836879432624, "grad_norm": 4.350259780883789, "loss": 5.6533, "lr": 0.0009931468531468532, "step": 550, "tokens_trained": 0.052281768 }, { "epoch": 0.15659574468085105, "grad_norm": 4.515641689300537, "loss": 5.7492, "lr": 0.000992867132867133, "step": 552, "tokens_trained": 0.052471848 }, { "epoch": 0.15716312056737589, "grad_norm": 4.628066539764404, "loss": 5.7113, "lr": 0.0009925874125874127, "step": 554, "tokens_trained": 0.052660168 }, { "epoch": 0.15773049645390072, "grad_norm": 4.8322930335998535, "loss": 5.6696, "lr": 0.0009923076923076923, "step": 556, "tokens_trained": 0.05284776 }, { "epoch": 0.15829787234042553, "grad_norm": 3.999706506729126, "loss": 5.7296, "lr": 0.000992027972027972, "step": 558, "tokens_trained": 0.053037344 }, { "epoch": 0.15886524822695036, "grad_norm": 4.332971572875977, "loss": 5.7362, "lr": 0.0009917482517482518, "step": 560, "tokens_trained": 0.053228168 }, { "epoch": 0.15943262411347517, "grad_norm": 4.500301361083984, "loss": 5.6982, "lr": 0.0009914685314685314, "step": 562, "tokens_trained": 0.05341856 }, { "epoch": 0.16, "grad_norm": 4.721808910369873, "loss": 5.7166, "lr": 0.0009911888111888113, "step": 564, "tokens_trained": 0.053608824 }, { "epoch": 0.1605673758865248, "grad_norm": 5.265316009521484, "loss": 5.7069, "lr": 0.000990909090909091, "step": 566, "tokens_trained": 0.053799728 }, { "epoch": 0.16113475177304964, "grad_norm": 5.024131774902344, "loss": 5.7113, "lr": 0.0009906293706293705, "step": 568, "tokens_trained": 0.05398944 }, { "epoch": 0.16170212765957448, "grad_norm": 4.063276767730713, "loss": 5.6251, "lr": 0.0009903496503496504, "step": 570, "tokens_trained": 0.054176512 }, { "epoch": 0.1622695035460993, "grad_norm": 4.15974760055542, "loss": 5.6912, "lr": 0.00099006993006993, "step": 572, "tokens_trained": 0.054367072 }, { "epoch": 0.16283687943262412, "grad_norm": 4.338894844055176, "loss": 5.6807, "lr": 0.0009897902097902099, "step": 574, "tokens_trained": 0.054559184 }, { "epoch": 0.16340425531914893, "grad_norm": 5.535487174987793, "loss": 5.6765, "lr": 0.0009895104895104895, "step": 576, "tokens_trained": 0.054748904 }, { "epoch": 0.16397163120567376, "grad_norm": 4.379040241241455, "loss": 5.6884, "lr": 0.0009892307692307694, "step": 578, "tokens_trained": 0.054936136 }, { "epoch": 0.16453900709219857, "grad_norm": 4.746179103851318, "loss": 5.6885, "lr": 0.000988951048951049, "step": 580, "tokens_trained": 0.055125584 }, { "epoch": 0.1651063829787234, "grad_norm": 4.949806213378906, "loss": 5.7061, "lr": 0.0009886713286713286, "step": 582, "tokens_trained": 0.055314608 }, { "epoch": 0.16567375886524824, "grad_norm": 4.507448196411133, "loss": 5.6339, "lr": 0.0009883916083916085, "step": 584, "tokens_trained": 0.055503992 }, { "epoch": 0.16624113475177305, "grad_norm": 4.131013870239258, "loss": 5.7122, "lr": 0.0009881118881118881, "step": 586, "tokens_trained": 0.055693376 }, { "epoch": 0.16680851063829788, "grad_norm": 5.32897424697876, "loss": 5.7192, "lr": 0.000987832167832168, "step": 588, "tokens_trained": 0.05588452 }, { "epoch": 0.1673758865248227, "grad_norm": 4.166877746582031, "loss": 5.6666, "lr": 0.0009875524475524476, "step": 590, "tokens_trained": 0.056073936 }, { "epoch": 0.16794326241134752, "grad_norm": 4.393389701843262, "loss": 5.6113, "lr": 0.0009872727272727273, "step": 592, "tokens_trained": 0.056262224 }, { "epoch": 0.16851063829787233, "grad_norm": 4.466696739196777, "loss": 5.6466, "lr": 0.000986993006993007, "step": 594, "tokens_trained": 0.056454008 }, { "epoch": 0.16907801418439716, "grad_norm": 3.9413373470306396, "loss": 5.6838, "lr": 0.0009867132867132867, "step": 596, "tokens_trained": 0.05664444 }, { "epoch": 0.169645390070922, "grad_norm": 3.594649314880371, "loss": 5.6684, "lr": 0.0009864335664335664, "step": 598, "tokens_trained": 0.056833864 }, { "epoch": 0.1702127659574468, "grad_norm": 3.5969483852386475, "loss": 5.6619, "lr": 0.0009861538461538462, "step": 600, "tokens_trained": 0.05702332 }, { "epoch": 0.17078014184397164, "grad_norm": 3.845414638519287, "loss": 5.5855, "lr": 0.0009858741258741259, "step": 602, "tokens_trained": 0.057212776 }, { "epoch": 0.17134751773049645, "grad_norm": 3.9198834896087646, "loss": 5.6551, "lr": 0.0009855944055944055, "step": 604, "tokens_trained": 0.05740152 }, { "epoch": 0.17191489361702128, "grad_norm": 3.6764986515045166, "loss": 5.6228, "lr": 0.0009853146853146854, "step": 606, "tokens_trained": 0.057595616 }, { "epoch": 0.1724822695035461, "grad_norm": 3.8210043907165527, "loss": 5.6557, "lr": 0.000985034965034965, "step": 608, "tokens_trained": 0.057783968 }, { "epoch": 0.17304964539007092, "grad_norm": 3.893644094467163, "loss": 5.6675, "lr": 0.0009847552447552449, "step": 610, "tokens_trained": 0.057974832 }, { "epoch": 0.17361702127659576, "grad_norm": 3.280839681625366, "loss": 5.6442, "lr": 0.0009844755244755245, "step": 612, "tokens_trained": 0.058166272 }, { "epoch": 0.17418439716312056, "grad_norm": 3.4350404739379883, "loss": 5.6555, "lr": 0.0009841958041958043, "step": 614, "tokens_trained": 0.058356008 }, { "epoch": 0.1747517730496454, "grad_norm": 3.7700448036193848, "loss": 5.6138, "lr": 0.000983916083916084, "step": 616, "tokens_trained": 0.058546792 }, { "epoch": 0.1753191489361702, "grad_norm": 3.8182730674743652, "loss": 5.6931, "lr": 0.0009836363636363636, "step": 618, "tokens_trained": 0.058736296 }, { "epoch": 0.17588652482269504, "grad_norm": 3.9105372428894043, "loss": 5.6431, "lr": 0.0009833566433566435, "step": 620, "tokens_trained": 0.058927576 }, { "epoch": 0.17645390070921985, "grad_norm": 3.8897712230682373, "loss": 5.6203, "lr": 0.000983076923076923, "step": 622, "tokens_trained": 0.059118416 }, { "epoch": 0.17702127659574468, "grad_norm": 3.512194871902466, "loss": 5.6292, "lr": 0.000982797202797203, "step": 624, "tokens_trained": 0.059308568 }, { "epoch": 0.1773049645390071, "eval_loss": 5.630118370056152, "eval_runtime": 21.1591, "step": 625, "tokens_trained": 0.059404056 } ], "logging_steps": 2, "max_steps": 7650, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }