{ "best_global_step": 375, "best_metric": 6.038269996643066, "best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-BPE/checkpoint-375", "epoch": 0.10638297872340426, "eval_steps": 125, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005673758865248227, "grad_norm": 1275.0146484375, "loss": 281.4781, "lr": 2e-06, "step": 2, "tokens_trained": 0.000192256 }, { "epoch": 0.0011347517730496454, "grad_norm": 1437.579833984375, "loss": 267.2211, "lr": 6e-06, "step": 4, "tokens_trained": 0.000382024 }, { "epoch": 0.001702127659574468, "grad_norm": 1719.271484375, "loss": 219.3822, "lr": 1e-05, "step": 6, "tokens_trained": 0.00057072 }, { "epoch": 0.0022695035460992908, "grad_norm": 1444.94970703125, "loss": 133.8172, "lr": 1.4e-05, "step": 8, "tokens_trained": 0.000761336 }, { "epoch": 0.0028368794326241137, "grad_norm": 238.9689178466797, "loss": 90.8177, "lr": 1.8e-05, "step": 10, "tokens_trained": 0.000953248 }, { "epoch": 0.003404255319148936, "grad_norm": 158.53497314453125, "loss": 84.6922, "lr": 2.2e-05, "step": 12, "tokens_trained": 0.00114424 }, { "epoch": 0.003971631205673759, "grad_norm": 146.10595703125, "loss": 76.7055, "lr": 2.6e-05, "step": 14, "tokens_trained": 0.001334104 }, { "epoch": 0.0045390070921985815, "grad_norm": 140.69964599609375, "loss": 67.9952, "lr": 3e-05, "step": 16, "tokens_trained": 0.00152392 }, { "epoch": 0.005106382978723404, "grad_norm": 108.80303192138672, "loss": 57.8088, "lr": 3.4000000000000007e-05, "step": 18, "tokens_trained": 0.001713872 }, { "epoch": 0.005673758865248227, "grad_norm": 106.82334899902344, "loss": 48.6585, "lr": 3.8e-05, "step": 20, "tokens_trained": 0.001903976 }, { "epoch": 0.00624113475177305, "grad_norm": 93.58769989013672, "loss": 41.7984, "lr": 4.2000000000000004e-05, "step": 22, "tokens_trained": 0.002094288 }, { "epoch": 0.006808510638297872, "grad_norm": 87.5854721069336, "loss": 37.6201, "lr": 4.6e-05, "step": 24, "tokens_trained": 0.002282496 }, { "epoch": 0.007375886524822695, "grad_norm": 84.12794494628906, "loss": 35.0091, "lr": 5e-05, "step": 26, "tokens_trained": 0.00247068 }, { "epoch": 0.007943262411347518, "grad_norm": 79.77535247802734, "loss": 33.2253, "lr": 5.4e-05, "step": 28, "tokens_trained": 0.002662888 }, { "epoch": 0.00851063829787234, "grad_norm": 66.42157745361328, "loss": 32.0682, "lr": 5.800000000000001e-05, "step": 30, "tokens_trained": 0.002851968 }, { "epoch": 0.009078014184397163, "grad_norm": 87.52485656738281, "loss": 30.893, "lr": 6.2e-05, "step": 32, "tokens_trained": 0.003041384 }, { "epoch": 0.009645390070921986, "grad_norm": 58.33614730834961, "loss": 30.0513, "lr": 6.6e-05, "step": 34, "tokens_trained": 0.003232872 }, { "epoch": 0.010212765957446808, "grad_norm": 54.629329681396484, "loss": 29.0115, "lr": 7.000000000000001e-05, "step": 36, "tokens_trained": 0.003423824 }, { "epoch": 0.01078014184397163, "grad_norm": 52.79097366333008, "loss": 28.2084, "lr": 7.4e-05, "step": 38, "tokens_trained": 0.003613232 }, { "epoch": 0.011347517730496455, "grad_norm": 54.481224060058594, "loss": 27.4345, "lr": 7.8e-05, "step": 40, "tokens_trained": 0.003800952 }, { "epoch": 0.011914893617021277, "grad_norm": 58.7069091796875, "loss": 26.5936, "lr": 8.2e-05, "step": 42, "tokens_trained": 0.003991512 }, { "epoch": 0.0124822695035461, "grad_norm": 49.30760955810547, "loss": 26.0608, "lr": 8.599999999999999e-05, "step": 44, "tokens_trained": 0.004180648 }, { "epoch": 0.013049645390070922, "grad_norm": 61.902587890625, "loss": 25.5363, "lr": 8.999999999999999e-05, "step": 46, "tokens_trained": 0.00437148 }, { "epoch": 0.013617021276595745, "grad_norm": 46.76111602783203, "loss": 24.9599, "lr": 9.400000000000001e-05, "step": 48, "tokens_trained": 0.004559344 }, { "epoch": 0.014184397163120567, "grad_norm": 57.06416702270508, "loss": 24.4087, "lr": 9.800000000000001e-05, "step": 50, "tokens_trained": 0.004749256 }, { "epoch": 0.01475177304964539, "grad_norm": 44.798736572265625, "loss": 24.1444, "lr": 0.000102, "step": 52, "tokens_trained": 0.004940192 }, { "epoch": 0.015319148936170212, "grad_norm": 40.29296875, "loss": 23.6011, "lr": 0.000106, "step": 54, "tokens_trained": 0.005130304 }, { "epoch": 0.015886524822695036, "grad_norm": 38.75099563598633, "loss": 23.1781, "lr": 0.00011, "step": 56, "tokens_trained": 0.005322864 }, { "epoch": 0.016453900709219857, "grad_norm": 37.470706939697266, "loss": 22.9136, "lr": 0.000114, "step": 58, "tokens_trained": 0.00551392 }, { "epoch": 0.01702127659574468, "grad_norm": 35.1894645690918, "loss": 22.6336, "lr": 0.000118, "step": 60, "tokens_trained": 0.005703096 }, { "epoch": 0.017588652482269502, "grad_norm": 35.136573791503906, "loss": 22.2998, "lr": 0.000122, "step": 62, "tokens_trained": 0.005892448 }, { "epoch": 0.018156028368794326, "grad_norm": 38.05111312866211, "loss": 21.9401, "lr": 0.000126, "step": 64, "tokens_trained": 0.006081656 }, { "epoch": 0.01872340425531915, "grad_norm": 35.63850021362305, "loss": 21.7206, "lr": 0.00013000000000000002, "step": 66, "tokens_trained": 0.006273032 }, { "epoch": 0.01929078014184397, "grad_norm": 34.327667236328125, "loss": 21.4051, "lr": 0.000134, "step": 68, "tokens_trained": 0.00646304 }, { "epoch": 0.019858156028368795, "grad_norm": 31.457059860229492, "loss": 21.0774, "lr": 0.00013800000000000002, "step": 70, "tokens_trained": 0.006652832 }, { "epoch": 0.020425531914893616, "grad_norm": 34.91672897338867, "loss": 20.8718, "lr": 0.00014199999999999998, "step": 72, "tokens_trained": 0.006843512 }, { "epoch": 0.02099290780141844, "grad_norm": 27.959579467773438, "loss": 20.6932, "lr": 0.000146, "step": 74, "tokens_trained": 0.007033584 }, { "epoch": 0.02156028368794326, "grad_norm": 26.569866180419922, "loss": 20.4072, "lr": 0.00015, "step": 76, "tokens_trained": 0.007224032 }, { "epoch": 0.022127659574468085, "grad_norm": 28.009904861450195, "loss": 20.2229, "lr": 0.000154, "step": 78, "tokens_trained": 0.00741368 }, { "epoch": 0.02269503546099291, "grad_norm": 28.892959594726562, "loss": 20.0528, "lr": 0.000158, "step": 80, "tokens_trained": 0.00760416 }, { "epoch": 0.02326241134751773, "grad_norm": 31.58131980895996, "loss": 19.8016, "lr": 0.000162, "step": 82, "tokens_trained": 0.007793952 }, { "epoch": 0.023829787234042554, "grad_norm": 31.01254653930664, "loss": 19.634, "lr": 0.00016600000000000002, "step": 84, "tokens_trained": 0.007980792 }, { "epoch": 0.024397163120567375, "grad_norm": 28.732515335083008, "loss": 19.3777, "lr": 0.00017, "step": 86, "tokens_trained": 0.008171968 }, { "epoch": 0.0249645390070922, "grad_norm": 24.31264877319336, "loss": 19.1346, "lr": 0.000174, "step": 88, "tokens_trained": 0.008361632 }, { "epoch": 0.02553191489361702, "grad_norm": 26.557010650634766, "loss": 19.0014, "lr": 0.000178, "step": 90, "tokens_trained": 0.008552328 }, { "epoch": 0.026099290780141844, "grad_norm": 21.156103134155273, "loss": 18.7032, "lr": 0.000182, "step": 92, "tokens_trained": 0.008743136 }, { "epoch": 0.02666666666666667, "grad_norm": 25.7484188079834, "loss": 18.4836, "lr": 0.000186, "step": 94, "tokens_trained": 0.008932056 }, { "epoch": 0.02723404255319149, "grad_norm": 22.27949333190918, "loss": 18.2233, "lr": 0.00019, "step": 96, "tokens_trained": 0.009121608 }, { "epoch": 0.027801418439716313, "grad_norm": 24.9247989654541, "loss": 17.9867, "lr": 0.000194, "step": 98, "tokens_trained": 0.009311008 }, { "epoch": 0.028368794326241134, "grad_norm": 24.302066802978516, "loss": 17.8016, "lr": 0.00019800000000000002, "step": 100, "tokens_trained": 0.009501456 }, { "epoch": 0.02893617021276596, "grad_norm": 23.458459854125977, "loss": 17.6295, "lr": 0.000202, "step": 102, "tokens_trained": 0.009693952 }, { "epoch": 0.02950354609929078, "grad_norm": 24.092350006103516, "loss": 17.4593, "lr": 0.000206, "step": 104, "tokens_trained": 0.009883328 }, { "epoch": 0.030070921985815603, "grad_norm": 22.54726219177246, "loss": 17.2141, "lr": 0.00021, "step": 106, "tokens_trained": 0.01007316 }, { "epoch": 0.030638297872340424, "grad_norm": 21.334760665893555, "loss": 17.044, "lr": 0.000214, "step": 108, "tokens_trained": 0.010266504 }, { "epoch": 0.031205673758865248, "grad_norm": 20.584287643432617, "loss": 16.8919, "lr": 0.000218, "step": 110, "tokens_trained": 0.010455736 }, { "epoch": 0.03177304964539007, "grad_norm": 23.51676368713379, "loss": 16.751, "lr": 0.000222, "step": 112, "tokens_trained": 0.010645208 }, { "epoch": 0.03234042553191489, "grad_norm": 23.278276443481445, "loss": 16.5997, "lr": 0.00022600000000000002, "step": 114, "tokens_trained": 0.010838928 }, { "epoch": 0.032907801418439714, "grad_norm": 25.4830265045166, "loss": 16.3416, "lr": 0.00023, "step": 116, "tokens_trained": 0.011027792 }, { "epoch": 0.03347517730496454, "grad_norm": 29.442413330078125, "loss": 16.24, "lr": 0.00023400000000000002, "step": 118, "tokens_trained": 0.011217456 }, { "epoch": 0.03404255319148936, "grad_norm": 21.77578353881836, "loss": 16.1922, "lr": 0.00023799999999999998, "step": 120, "tokens_trained": 0.01140804 }, { "epoch": 0.03460992907801418, "grad_norm": 27.040719985961914, "loss": 15.9059, "lr": 0.000242, "step": 122, "tokens_trained": 0.011597816 }, { "epoch": 0.035177304964539004, "grad_norm": 24.74480628967285, "loss": 15.7818, "lr": 0.000246, "step": 124, "tokens_trained": 0.011785624 }, { "epoch": 0.03546099290780142, "eval_loss": 15.553059577941895, "eval_runtime": 23.5485, "step": 125, "tokens_trained": 0.011880832 }, { "epoch": 0.03574468085106383, "grad_norm": 23.13482666015625, "loss": 15.5739, "lr": 0.00025, "step": 126, "tokens_trained": 0.011975976 }, { "epoch": 0.03631205673758865, "grad_norm": 22.8618106842041, "loss": 15.4302, "lr": 0.000254, "step": 128, "tokens_trained": 0.012166744 }, { "epoch": 0.03687943262411347, "grad_norm": 26.804859161376953, "loss": 15.3623, "lr": 0.00025800000000000004, "step": 130, "tokens_trained": 0.01235436 }, { "epoch": 0.0374468085106383, "grad_norm": 21.826601028442383, "loss": 15.1465, "lr": 0.000262, "step": 132, "tokens_trained": 0.012544976 }, { "epoch": 0.03801418439716312, "grad_norm": 39.447086334228516, "loss": 15.0137, "lr": 0.000266, "step": 134, "tokens_trained": 0.012736352 }, { "epoch": 0.03858156028368794, "grad_norm": 23.44275665283203, "loss": 14.9355, "lr": 0.00027, "step": 136, "tokens_trained": 0.012925008 }, { "epoch": 0.03914893617021276, "grad_norm": 21.631427764892578, "loss": 14.6825, "lr": 0.00027400000000000005, "step": 138, "tokens_trained": 0.013114672 }, { "epoch": 0.03971631205673759, "grad_norm": 23.674650192260742, "loss": 14.5194, "lr": 0.00027800000000000004, "step": 140, "tokens_trained": 0.013304016 }, { "epoch": 0.04028368794326241, "grad_norm": 23.974796295166016, "loss": 14.4829, "lr": 0.00028199999999999997, "step": 142, "tokens_trained": 0.013496696 }, { "epoch": 0.04085106382978723, "grad_norm": 26.112201690673828, "loss": 14.3027, "lr": 0.00028599999999999996, "step": 144, "tokens_trained": 0.013684816 }, { "epoch": 0.04141843971631206, "grad_norm": 20.67386817932129, "loss": 14.1499, "lr": 0.00029, "step": 146, "tokens_trained": 0.013874832 }, { "epoch": 0.04198581560283688, "grad_norm": 24.253408432006836, "loss": 13.9378, "lr": 0.000294, "step": 148, "tokens_trained": 0.014065056 }, { "epoch": 0.0425531914893617, "grad_norm": 35.716087341308594, "loss": 14.0562, "lr": 0.000298, "step": 150, "tokens_trained": 0.014256784 }, { "epoch": 0.04312056737588652, "grad_norm": 29.414331436157227, "loss": 14.0462, "lr": 0.000302, "step": 152, "tokens_trained": 0.014446312 }, { "epoch": 0.04368794326241135, "grad_norm": 30.687482833862305, "loss": 13.7603, "lr": 0.000306, "step": 154, "tokens_trained": 0.014639872 }, { "epoch": 0.04425531914893617, "grad_norm": 29.806455612182617, "loss": 13.708, "lr": 0.00031, "step": 156, "tokens_trained": 0.014831112 }, { "epoch": 0.04482269503546099, "grad_norm": 24.900897979736328, "loss": 13.548, "lr": 0.000314, "step": 158, "tokens_trained": 0.015021288 }, { "epoch": 0.04539007092198582, "grad_norm": 24.29252815246582, "loss": 13.3119, "lr": 0.00031800000000000003, "step": 160, "tokens_trained": 0.01521228 }, { "epoch": 0.04595744680851064, "grad_norm": 20.68342399597168, "loss": 13.1829, "lr": 0.000322, "step": 162, "tokens_trained": 0.015403688 }, { "epoch": 0.04652482269503546, "grad_norm": 20.822795867919922, "loss": 12.9044, "lr": 0.000326, "step": 164, "tokens_trained": 0.015593416 }, { "epoch": 0.04709219858156028, "grad_norm": 21.689916610717773, "loss": 12.6862, "lr": 0.00033, "step": 166, "tokens_trained": 0.015784408 }, { "epoch": 0.04765957446808511, "grad_norm": 17.873889923095703, "loss": 12.5502, "lr": 0.00033400000000000004, "step": 168, "tokens_trained": 0.0159744 }, { "epoch": 0.04822695035460993, "grad_norm": 18.951616287231445, "loss": 12.308, "lr": 0.00033800000000000003, "step": 170, "tokens_trained": 0.016163736 }, { "epoch": 0.04879432624113475, "grad_norm": 15.146363258361816, "loss": 12.1558, "lr": 0.000342, "step": 172, "tokens_trained": 0.016353832 }, { "epoch": 0.04936170212765958, "grad_norm": 18.336984634399414, "loss": 12.0386, "lr": 0.000346, "step": 174, "tokens_trained": 0.016545088 }, { "epoch": 0.0499290780141844, "grad_norm": 17.221126556396484, "loss": 11.8791, "lr": 0.00035, "step": 176, "tokens_trained": 0.016735704 }, { "epoch": 0.05049645390070922, "grad_norm": 19.362564086914062, "loss": 11.7224, "lr": 0.000354, "step": 178, "tokens_trained": 0.016927944 }, { "epoch": 0.05106382978723404, "grad_norm": 15.564507484436035, "loss": 11.6448, "lr": 0.000358, "step": 180, "tokens_trained": 0.017116096 }, { "epoch": 0.05163120567375887, "grad_norm": 20.711383819580078, "loss": 11.4398, "lr": 0.000362, "step": 182, "tokens_trained": 0.01730564 }, { "epoch": 0.05219858156028369, "grad_norm": 18.627403259277344, "loss": 11.3377, "lr": 0.000366, "step": 184, "tokens_trained": 0.017495864 }, { "epoch": 0.05276595744680851, "grad_norm": 15.00942325592041, "loss": 11.1416, "lr": 0.00037, "step": 186, "tokens_trained": 0.017686464 }, { "epoch": 0.05333333333333334, "grad_norm": 17.070598602294922, "loss": 11.0148, "lr": 0.000374, "step": 188, "tokens_trained": 0.017879488 }, { "epoch": 0.05390070921985816, "grad_norm": 16.101457595825195, "loss": 10.8874, "lr": 0.000378, "step": 190, "tokens_trained": 0.018068312 }, { "epoch": 0.05446808510638298, "grad_norm": 15.613334655761719, "loss": 10.7055, "lr": 0.000382, "step": 192, "tokens_trained": 0.018255752 }, { "epoch": 0.0550354609929078, "grad_norm": 17.671857833862305, "loss": 10.5706, "lr": 0.000386, "step": 194, "tokens_trained": 0.018447096 }, { "epoch": 0.05560283687943263, "grad_norm": 16.080909729003906, "loss": 10.4476, "lr": 0.00039000000000000005, "step": 196, "tokens_trained": 0.018637264 }, { "epoch": 0.05617021276595745, "grad_norm": 15.02849292755127, "loss": 10.2962, "lr": 0.00039400000000000004, "step": 198, "tokens_trained": 0.018827552 }, { "epoch": 0.05673758865248227, "grad_norm": 14.990167617797852, "loss": 10.1912, "lr": 0.000398, "step": 200, "tokens_trained": 0.019018 }, { "epoch": 0.05730496453900709, "grad_norm": 15.390633583068848, "loss": 10.0442, "lr": 0.000402, "step": 202, "tokens_trained": 0.019209864 }, { "epoch": 0.05787234042553192, "grad_norm": 16.871570587158203, "loss": 9.9685, "lr": 0.00040600000000000006, "step": 204, "tokens_trained": 0.019400176 }, { "epoch": 0.05843971631205674, "grad_norm": 20.16544532775879, "loss": 9.8531, "lr": 0.00041, "step": 206, "tokens_trained": 0.019589424 }, { "epoch": 0.05900709219858156, "grad_norm": 16.825023651123047, "loss": 9.7777, "lr": 0.000414, "step": 208, "tokens_trained": 0.019779112 }, { "epoch": 0.059574468085106386, "grad_norm": 16.43510627746582, "loss": 9.6122, "lr": 0.00041799999999999997, "step": 210, "tokens_trained": 0.019970048 }, { "epoch": 0.060141843971631206, "grad_norm": 17.340473175048828, "loss": 9.4859, "lr": 0.000422, "step": 212, "tokens_trained": 0.020160968 }, { "epoch": 0.06070921985815603, "grad_norm": 15.019119262695312, "loss": 9.3656, "lr": 0.000426, "step": 214, "tokens_trained": 0.020349664 }, { "epoch": 0.06127659574468085, "grad_norm": 13.379194259643555, "loss": 9.2348, "lr": 0.00043, "step": 216, "tokens_trained": 0.020538192 }, { "epoch": 0.061843971631205676, "grad_norm": 16.71472930908203, "loss": 9.2258, "lr": 0.00043400000000000003, "step": 218, "tokens_trained": 0.020728936 }, { "epoch": 0.062411347517730496, "grad_norm": 12.743139266967773, "loss": 9.0569, "lr": 0.000438, "step": 220, "tokens_trained": 0.020917472 }, { "epoch": 0.06297872340425532, "grad_norm": 15.739934921264648, "loss": 8.9623, "lr": 0.000442, "step": 222, "tokens_trained": 0.02110928 }, { "epoch": 0.06354609929078014, "grad_norm": 14.23620891571045, "loss": 8.8201, "lr": 0.000446, "step": 224, "tokens_trained": 0.021300168 }, { "epoch": 0.06411347517730497, "grad_norm": 13.005538940429688, "loss": 8.7235, "lr": 0.00045000000000000004, "step": 226, "tokens_trained": 0.021490272 }, { "epoch": 0.06468085106382979, "grad_norm": 17.17629051208496, "loss": 8.6907, "lr": 0.00045400000000000003, "step": 228, "tokens_trained": 0.021681552 }, { "epoch": 0.06524822695035461, "grad_norm": 14.430739402770996, "loss": 8.6196, "lr": 0.000458, "step": 230, "tokens_trained": 0.02187236 }, { "epoch": 0.06581560283687943, "grad_norm": 14.575714111328125, "loss": 8.4741, "lr": 0.000462, "step": 232, "tokens_trained": 0.022061976 }, { "epoch": 0.06638297872340425, "grad_norm": 13.892754554748535, "loss": 8.4118, "lr": 0.00046600000000000005, "step": 234, "tokens_trained": 0.022252008 }, { "epoch": 0.06695035460992908, "grad_norm": 11.58240795135498, "loss": 8.2781, "lr": 0.00047, "step": 236, "tokens_trained": 0.02244284 }, { "epoch": 0.0675177304964539, "grad_norm": 13.022644996643066, "loss": 8.2139, "lr": 0.000474, "step": 238, "tokens_trained": 0.022631152 }, { "epoch": 0.06808510638297872, "grad_norm": 11.844677925109863, "loss": 8.1134, "lr": 0.00047799999999999996, "step": 240, "tokens_trained": 0.022821096 }, { "epoch": 0.06865248226950355, "grad_norm": 13.878067016601562, "loss": 8.0221, "lr": 0.000482, "step": 242, "tokens_trained": 0.023011656 }, { "epoch": 0.06921985815602837, "grad_norm": 12.34648323059082, "loss": 7.9755, "lr": 0.000486, "step": 244, "tokens_trained": 0.023201 }, { "epoch": 0.06978723404255319, "grad_norm": 14.238297462463379, "loss": 7.8969, "lr": 0.00049, "step": 246, "tokens_trained": 0.023391128 }, { "epoch": 0.07035460992907801, "grad_norm": 14.386019706726074, "loss": 7.8627, "lr": 0.000494, "step": 248, "tokens_trained": 0.023581768 }, { "epoch": 0.07092198581560284, "grad_norm": 13.623086929321289, "loss": 7.7568, "lr": 0.000498, "step": 250, "tokens_trained": 0.023771248 }, { "epoch": 0.07092198581560284, "eval_loss": 7.70297384262085, "eval_runtime": 21.3853, "step": 250, "tokens_trained": 0.023771248 }, { "epoch": 0.07148936170212766, "grad_norm": 14.347646713256836, "loss": 7.6842, "lr": 0.0005020000000000001, "step": 252, "tokens_trained": 0.023961056 }, { "epoch": 0.07205673758865248, "grad_norm": 12.5592041015625, "loss": 7.6516, "lr": 0.000506, "step": 254, "tokens_trained": 0.024150968 }, { "epoch": 0.0726241134751773, "grad_norm": 13.219141960144043, "loss": 7.5789, "lr": 0.00051, "step": 256, "tokens_trained": 0.024340072 }, { "epoch": 0.07319148936170213, "grad_norm": 12.654081344604492, "loss": 7.5369, "lr": 0.000514, "step": 258, "tokens_trained": 0.024529296 }, { "epoch": 0.07375886524822695, "grad_norm": 13.136971473693848, "loss": 7.4949, "lr": 0.000518, "step": 260, "tokens_trained": 0.024719688 }, { "epoch": 0.07432624113475177, "grad_norm": 12.680288314819336, "loss": 7.3904, "lr": 0.000522, "step": 262, "tokens_trained": 0.024909632 }, { "epoch": 0.0748936170212766, "grad_norm": 12.754518508911133, "loss": 7.3514, "lr": 0.000526, "step": 264, "tokens_trained": 0.025098416 }, { "epoch": 0.07546099290780142, "grad_norm": 13.22311019897461, "loss": 7.2951, "lr": 0.0005300000000000001, "step": 266, "tokens_trained": 0.025287344 }, { "epoch": 0.07602836879432624, "grad_norm": 12.11903190612793, "loss": 7.2229, "lr": 0.0005340000000000001, "step": 268, "tokens_trained": 0.025477152 }, { "epoch": 0.07659574468085106, "grad_norm": 13.771833419799805, "loss": 7.1815, "lr": 0.0005380000000000001, "step": 270, "tokens_trained": 0.025668288 }, { "epoch": 0.07716312056737588, "grad_norm": 11.756864547729492, "loss": 7.1669, "lr": 0.0005420000000000001, "step": 272, "tokens_trained": 0.025858528 }, { "epoch": 0.0777304964539007, "grad_norm": 13.613094329833984, "loss": 7.1079, "lr": 0.000546, "step": 274, "tokens_trained": 0.026048616 }, { "epoch": 0.07829787234042553, "grad_norm": 10.001923561096191, "loss": 7.0508, "lr": 0.00055, "step": 276, "tokens_trained": 0.026236944 }, { "epoch": 0.07886524822695036, "grad_norm": 14.262083053588867, "loss": 6.9955, "lr": 0.000554, "step": 278, "tokens_trained": 0.026426848 }, { "epoch": 0.07943262411347518, "grad_norm": 12.381136894226074, "loss": 6.9831, "lr": 0.000558, "step": 280, "tokens_trained": 0.026616784 }, { "epoch": 0.08, "grad_norm": 9.815845489501953, "loss": 6.917, "lr": 0.0005620000000000001, "step": 282, "tokens_trained": 0.026805176 }, { "epoch": 0.08056737588652482, "grad_norm": 11.669997215270996, "loss": 6.8999, "lr": 0.000566, "step": 284, "tokens_trained": 0.02699488 }, { "epoch": 0.08113475177304964, "grad_norm": 12.770941734313965, "loss": 6.8998, "lr": 0.00057, "step": 286, "tokens_trained": 0.027185784 }, { "epoch": 0.08170212765957446, "grad_norm": 15.572457313537598, "loss": 6.841, "lr": 0.000574, "step": 288, "tokens_trained": 0.027375896 }, { "epoch": 0.08226950354609928, "grad_norm": 10.980833053588867, "loss": 6.8545, "lr": 0.000578, "step": 290, "tokens_trained": 0.02756588 }, { "epoch": 0.08283687943262412, "grad_norm": 11.678337097167969, "loss": 6.7853, "lr": 0.0005819999999999999, "step": 292, "tokens_trained": 0.02775456 }, { "epoch": 0.08340425531914894, "grad_norm": 9.77885913848877, "loss": 6.7465, "lr": 0.0005859999999999999, "step": 294, "tokens_trained": 0.027942856 }, { "epoch": 0.08397163120567376, "grad_norm": 13.62730884552002, "loss": 6.7276, "lr": 0.00059, "step": 296, "tokens_trained": 0.028133152 }, { "epoch": 0.08453900709219858, "grad_norm": 10.644404411315918, "loss": 6.6802, "lr": 0.000594, "step": 298, "tokens_trained": 0.028322192 }, { "epoch": 0.0851063829787234, "grad_norm": 11.130610466003418, "loss": 6.6548, "lr": 0.000598, "step": 300, "tokens_trained": 0.0285122 }, { "epoch": 0.08567375886524822, "grad_norm": 11.557455062866211, "loss": 6.6155, "lr": 0.000602, "step": 302, "tokens_trained": 0.028699792 }, { "epoch": 0.08624113475177304, "grad_norm": 9.276884078979492, "loss": 6.5989, "lr": 0.000606, "step": 304, "tokens_trained": 0.028889896 }, { "epoch": 0.08680851063829788, "grad_norm": 9.616179466247559, "loss": 6.5773, "lr": 0.00061, "step": 306, "tokens_trained": 0.029082272 }, { "epoch": 0.0873758865248227, "grad_norm": 10.575953483581543, "loss": 6.5358, "lr": 0.000614, "step": 308, "tokens_trained": 0.029273352 }, { "epoch": 0.08794326241134752, "grad_norm": 9.089850425720215, "loss": 6.5088, "lr": 0.0006180000000000001, "step": 310, "tokens_trained": 0.029463848 }, { "epoch": 0.08851063829787234, "grad_norm": 9.090002059936523, "loss": 6.4849, "lr": 0.000622, "step": 312, "tokens_trained": 0.029653272 }, { "epoch": 0.08907801418439716, "grad_norm": 12.038308143615723, "loss": 6.4624, "lr": 0.000626, "step": 314, "tokens_trained": 0.029841928 }, { "epoch": 0.08964539007092198, "grad_norm": 9.073866844177246, "loss": 6.4515, "lr": 0.00063, "step": 316, "tokens_trained": 0.030029808 }, { "epoch": 0.0902127659574468, "grad_norm": 8.727197647094727, "loss": 6.43, "lr": 0.000634, "step": 318, "tokens_trained": 0.030221288 }, { "epoch": 0.09078014184397164, "grad_norm": 14.558151245117188, "loss": 6.4487, "lr": 0.000638, "step": 320, "tokens_trained": 0.030410872 }, { "epoch": 0.09134751773049646, "grad_norm": 9.98914623260498, "loss": 6.4279, "lr": 0.000642, "step": 322, "tokens_trained": 0.030602376 }, { "epoch": 0.09191489361702128, "grad_norm": 10.395442962646484, "loss": 6.4311, "lr": 0.000646, "step": 324, "tokens_trained": 0.030792968 }, { "epoch": 0.0924822695035461, "grad_norm": 10.8250093460083, "loss": 6.3726, "lr": 0.0006500000000000001, "step": 326, "tokens_trained": 0.030982944 }, { "epoch": 0.09304964539007092, "grad_norm": 9.73416805267334, "loss": 6.34, "lr": 0.0006540000000000001, "step": 328, "tokens_trained": 0.031174928 }, { "epoch": 0.09361702127659574, "grad_norm": 8.596503257751465, "loss": 6.3322, "lr": 0.0006580000000000001, "step": 330, "tokens_trained": 0.031364288 }, { "epoch": 0.09418439716312056, "grad_norm": 8.49472427368164, "loss": 6.3096, "lr": 0.000662, "step": 332, "tokens_trained": 0.03155376 }, { "epoch": 0.0947517730496454, "grad_norm": 7.857503414154053, "loss": 6.2368, "lr": 0.000666, "step": 334, "tokens_trained": 0.031744368 }, { "epoch": 0.09531914893617022, "grad_norm": 9.007513999938965, "loss": 6.198, "lr": 0.00067, "step": 336, "tokens_trained": 0.031934136 }, { "epoch": 0.09588652482269504, "grad_norm": 8.185524940490723, "loss": 6.2328, "lr": 0.000674, "step": 338, "tokens_trained": 0.032124984 }, { "epoch": 0.09645390070921986, "grad_norm": 8.784396171569824, "loss": 6.1945, "lr": 0.0006780000000000001, "step": 340, "tokens_trained": 0.032316016 }, { "epoch": 0.09702127659574468, "grad_norm": 8.642311096191406, "loss": 6.218, "lr": 0.0006820000000000001, "step": 342, "tokens_trained": 0.032506224 }, { "epoch": 0.0975886524822695, "grad_norm": 8.493780136108398, "loss": 6.194, "lr": 0.0006860000000000001, "step": 344, "tokens_trained": 0.032696152 }, { "epoch": 0.09815602836879432, "grad_norm": 9.120508193969727, "loss": 6.2241, "lr": 0.00069, "step": 346, "tokens_trained": 0.032885688 }, { "epoch": 0.09872340425531916, "grad_norm": 9.34500503540039, "loss": 6.1548, "lr": 0.000694, "step": 348, "tokens_trained": 0.03307568 }, { "epoch": 0.09929078014184398, "grad_norm": 7.483356952667236, "loss": 6.1282, "lr": 0.0006979999999999999, "step": 350, "tokens_trained": 0.033267208 }, { "epoch": 0.0998581560283688, "grad_norm": 7.974069118499756, "loss": 6.1032, "lr": 0.0007019999999999999, "step": 352, "tokens_trained": 0.033458144 }, { "epoch": 0.10042553191489362, "grad_norm": 8.247384071350098, "loss": 6.1698, "lr": 0.0007059999999999999, "step": 354, "tokens_trained": 0.033650352 }, { "epoch": 0.10099290780141844, "grad_norm": 8.554885864257812, "loss": 6.1429, "lr": 0.00071, "step": 356, "tokens_trained": 0.033840232 }, { "epoch": 0.10156028368794326, "grad_norm": 7.209281921386719, "loss": 6.0997, "lr": 0.000714, "step": 358, "tokens_trained": 0.034030032 }, { "epoch": 0.10212765957446808, "grad_norm": 8.660383224487305, "loss": 6.1497, "lr": 0.000718, "step": 360, "tokens_trained": 0.034218592 }, { "epoch": 0.10269503546099291, "grad_norm": 9.382761001586914, "loss": 6.0665, "lr": 0.000722, "step": 362, "tokens_trained": 0.034408408 }, { "epoch": 0.10326241134751774, "grad_norm": 6.915714263916016, "loss": 6.0636, "lr": 0.000726, "step": 364, "tokens_trained": 0.034600016 }, { "epoch": 0.10382978723404256, "grad_norm": 7.8990631103515625, "loss": 6.0975, "lr": 0.00073, "step": 366, "tokens_trained": 0.034790792 }, { "epoch": 0.10439716312056738, "grad_norm": 8.859809875488281, "loss": 6.0754, "lr": 0.000734, "step": 368, "tokens_trained": 0.034981304 }, { "epoch": 0.1049645390070922, "grad_norm": 7.392801761627197, "loss": 6.039, "lr": 0.000738, "step": 370, "tokens_trained": 0.03516956 }, { "epoch": 0.10553191489361702, "grad_norm": 9.427324295043945, "loss": 6.084, "lr": 0.000742, "step": 372, "tokens_trained": 0.035358816 }, { "epoch": 0.10609929078014184, "grad_norm": 7.168910503387451, "loss": 6.0498, "lr": 0.000746, "step": 374, "tokens_trained": 0.035548016 }, { "epoch": 0.10638297872340426, "eval_loss": 6.038269996643066, "eval_runtime": 21.3445, "step": 375, "tokens_trained": 0.035644104 } ], "logging_steps": 2, "max_steps": 7650, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }