| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 30, |
| "global_step": 400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.025078369905956112, |
| "grad_norm": 0.43725547194480896, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.9437, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.050156739811912224, |
| "grad_norm": 0.6708689332008362, |
| "learning_rate": 2.4e-05, |
| "loss": 1.849, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.07523510971786834, |
| "grad_norm": 0.3925197422504425, |
| "learning_rate": 4e-05, |
| "loss": 1.8725, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.10031347962382445, |
| "grad_norm": 0.4107086956501007, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 1.7837, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.12539184952978055, |
| "grad_norm": 0.5341728329658508, |
| "learning_rate": 7.2e-05, |
| "loss": 1.8033, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.15047021943573669, |
| "grad_norm": 0.43938425183296204, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 1.5958, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1755485893416928, |
| "grad_norm": 0.3650668263435364, |
| "learning_rate": 0.00010400000000000001, |
| "loss": 1.5566, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.2006269592476489, |
| "grad_norm": 0.3861461877822876, |
| "learning_rate": 0.00012, |
| "loss": 1.3631, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.22570532915360503, |
| "grad_norm": 0.3685659170150757, |
| "learning_rate": 0.00013600000000000003, |
| "loss": 1.3227, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2507836990595611, |
| "grad_norm": 0.4720342457294464, |
| "learning_rate": 0.000152, |
| "loss": 1.1027, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.27586206896551724, |
| "grad_norm": 0.40199315547943115, |
| "learning_rate": 0.000168, |
| "loss": 1.018, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.30094043887147337, |
| "grad_norm": 0.22768579423427582, |
| "learning_rate": 0.00018400000000000003, |
| "loss": 0.9077, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.32601880877742945, |
| "grad_norm": 0.2388496696949005, |
| "learning_rate": 0.0002, |
| "loss": 0.7587, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.3510971786833856, |
| "grad_norm": 0.19000130891799927, |
| "learning_rate": 0.0001998993710691824, |
| "loss": 0.7766, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3761755485893417, |
| "grad_norm": 0.1965627372264862, |
| "learning_rate": 0.0001997987421383648, |
| "loss": 0.7575, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.4012539184952978, |
| "grad_norm": 0.18852603435516357, |
| "learning_rate": 0.00019969811320754718, |
| "loss": 0.8292, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4263322884012539, |
| "grad_norm": 0.16236507892608643, |
| "learning_rate": 0.00019959748427672956, |
| "loss": 0.7272, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.45141065830721006, |
| "grad_norm": 0.14128465950489044, |
| "learning_rate": 0.00019949685534591195, |
| "loss": 0.751, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.47648902821316613, |
| "grad_norm": 0.14623811841011047, |
| "learning_rate": 0.00019939622641509434, |
| "loss": 0.6661, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.5015673981191222, |
| "grad_norm": 0.14945009350776672, |
| "learning_rate": 0.00019929559748427673, |
| "loss": 0.7435, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5266457680250783, |
| "grad_norm": 0.1485632061958313, |
| "learning_rate": 0.00019919496855345915, |
| "loss": 0.6466, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.5517241379310345, |
| "grad_norm": 0.14336936175823212, |
| "learning_rate": 0.00019909433962264153, |
| "loss": 0.7, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.5768025078369906, |
| "grad_norm": 0.1316949725151062, |
| "learning_rate": 0.0001989937106918239, |
| "loss": 0.6194, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.6018808777429467, |
| "grad_norm": 0.14485935866832733, |
| "learning_rate": 0.00019889308176100629, |
| "loss": 0.8044, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.6269592476489029, |
| "grad_norm": 0.12860289216041565, |
| "learning_rate": 0.00019879245283018867, |
| "loss": 0.5773, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.6520376175548589, |
| "grad_norm": 0.16889798641204834, |
| "learning_rate": 0.0001986918238993711, |
| "loss": 0.7281, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.677115987460815, |
| "grad_norm": 0.14104238152503967, |
| "learning_rate": 0.00019859119496855348, |
| "loss": 0.6654, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.7021943573667712, |
| "grad_norm": 0.1515515297651291, |
| "learning_rate": 0.00019849056603773587, |
| "loss": 0.6833, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 0.16043280065059662, |
| "learning_rate": 0.00019838993710691826, |
| "loss": 0.6599, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.7523510971786834, |
| "grad_norm": 0.15895870327949524, |
| "learning_rate": 0.00019828930817610062, |
| "loss": 0.686, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7774294670846394, |
| "grad_norm": 0.3486965298652649, |
| "learning_rate": 0.00019818867924528303, |
| "loss": 0.6835, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.8025078369905956, |
| "grad_norm": 0.1274975836277008, |
| "learning_rate": 0.00019808805031446542, |
| "loss": 0.5425, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.8275862068965517, |
| "grad_norm": 0.1669531911611557, |
| "learning_rate": 0.0001979874213836478, |
| "loss": 0.766, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.8526645768025078, |
| "grad_norm": 0.14856377243995667, |
| "learning_rate": 0.0001978867924528302, |
| "loss": 0.5993, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.877742946708464, |
| "grad_norm": 0.13947483897209167, |
| "learning_rate": 0.0001977861635220126, |
| "loss": 0.7071, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.9028213166144201, |
| "grad_norm": 0.15276968479156494, |
| "learning_rate": 0.00019768553459119498, |
| "loss": 0.687, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.9278996865203761, |
| "grad_norm": 0.14595958590507507, |
| "learning_rate": 0.00019758490566037737, |
| "loss": 0.6495, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.9529780564263323, |
| "grad_norm": 0.17588546872138977, |
| "learning_rate": 0.00019748427672955975, |
| "loss": 0.6584, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.9780564263322884, |
| "grad_norm": 0.15688568353652954, |
| "learning_rate": 0.00019738364779874214, |
| "loss": 0.6769, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.1879902333021164, |
| "learning_rate": 0.00019728301886792453, |
| "loss": 0.5924, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.025078369905956, |
| "grad_norm": 0.15292422473430634, |
| "learning_rate": 0.00019718238993710695, |
| "loss": 0.5727, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.0501567398119123, |
| "grad_norm": 0.14542542397975922, |
| "learning_rate": 0.00019708176100628934, |
| "loss": 0.5528, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.0752351097178683, |
| "grad_norm": 0.15912258625030518, |
| "learning_rate": 0.0001969811320754717, |
| "loss": 0.5404, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.1003134796238245, |
| "grad_norm": 0.16443438827991486, |
| "learning_rate": 0.00019688050314465409, |
| "loss": 0.544, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.1253918495297806, |
| "grad_norm": 0.18315915763378143, |
| "learning_rate": 0.00019677987421383647, |
| "loss": 0.5768, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.1504702194357366, |
| "grad_norm": 0.16878078877925873, |
| "learning_rate": 0.0001966792452830189, |
| "loss": 0.6918, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.1755485893416928, |
| "grad_norm": 0.1652018129825592, |
| "learning_rate": 0.00019657861635220128, |
| "loss": 0.5903, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.2006269592476488, |
| "grad_norm": 0.181439608335495, |
| "learning_rate": 0.00019647798742138367, |
| "loss": 0.5917, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.225705329153605, |
| "grad_norm": 0.15887363255023956, |
| "learning_rate": 0.00019637735849056606, |
| "loss": 0.5631, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.250783699059561, |
| "grad_norm": 0.16309796273708344, |
| "learning_rate": 0.00019627672955974842, |
| "loss": 0.6206, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.2758620689655173, |
| "grad_norm": 0.19174307584762573, |
| "learning_rate": 0.00019617610062893083, |
| "loss": 0.5375, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.3009404388714734, |
| "grad_norm": 0.2240614891052246, |
| "learning_rate": 0.00019607547169811322, |
| "loss": 0.6407, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.3260188087774294, |
| "grad_norm": 0.1673874855041504, |
| "learning_rate": 0.0001959748427672956, |
| "loss": 0.631, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.3510971786833856, |
| "grad_norm": 0.16143380105495453, |
| "learning_rate": 0.000195874213836478, |
| "loss": 0.4985, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.3761755485893417, |
| "grad_norm": 0.18511593341827393, |
| "learning_rate": 0.0001957735849056604, |
| "loss": 0.5844, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.4012539184952977, |
| "grad_norm": 0.15226313471794128, |
| "learning_rate": 0.00019567295597484278, |
| "loss": 0.5237, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.426332288401254, |
| "grad_norm": 0.16536672413349152, |
| "learning_rate": 0.00019557232704402517, |
| "loss": 0.4945, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.4514106583072102, |
| "grad_norm": 0.17802752554416656, |
| "learning_rate": 0.00019547169811320755, |
| "loss": 0.632, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.4764890282131662, |
| "grad_norm": 0.18615730106830597, |
| "learning_rate": 0.00019537106918238994, |
| "loss": 0.6778, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.5015673981191222, |
| "grad_norm": 0.16418549418449402, |
| "learning_rate": 0.00019527044025157233, |
| "loss": 0.6141, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.5266457680250785, |
| "grad_norm": 0.16458265483379364, |
| "learning_rate": 0.00019516981132075475, |
| "loss": 0.5316, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.5517241379310345, |
| "grad_norm": 0.15842436254024506, |
| "learning_rate": 0.0001950691823899371, |
| "loss": 0.4761, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.5768025078369905, |
| "grad_norm": 0.15368856489658356, |
| "learning_rate": 0.0001949685534591195, |
| "loss": 0.471, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.6018808777429467, |
| "grad_norm": 0.16104081273078918, |
| "learning_rate": 0.0001948679245283019, |
| "loss": 0.572, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.626959247648903, |
| "grad_norm": 0.1696012020111084, |
| "learning_rate": 0.00019476729559748428, |
| "loss": 0.5293, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.6520376175548588, |
| "grad_norm": 0.16601622104644775, |
| "learning_rate": 0.0001946666666666667, |
| "loss": 0.6569, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.677115987460815, |
| "grad_norm": 0.15106241405010223, |
| "learning_rate": 0.00019456603773584908, |
| "loss": 0.5687, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.7021943573667713, |
| "grad_norm": 0.18189087510108948, |
| "learning_rate": 0.00019446540880503147, |
| "loss": 0.4965, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.7272727272727273, |
| "grad_norm": 0.15463034808635712, |
| "learning_rate": 0.00019436477987421383, |
| "loss": 0.6398, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.7523510971786833, |
| "grad_norm": 0.17576032876968384, |
| "learning_rate": 0.00019426415094339622, |
| "loss": 0.6122, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.7774294670846396, |
| "grad_norm": 0.14750592410564423, |
| "learning_rate": 0.00019416352201257863, |
| "loss": 0.5475, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.8025078369905956, |
| "grad_norm": 0.15765072405338287, |
| "learning_rate": 0.00019406289308176102, |
| "loss": 0.443, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.8275862068965516, |
| "grad_norm": 0.16872242093086243, |
| "learning_rate": 0.0001939622641509434, |
| "loss": 0.5061, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.8526645768025078, |
| "grad_norm": 0.16207149624824524, |
| "learning_rate": 0.0001938616352201258, |
| "loss": 0.572, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.877742946708464, |
| "grad_norm": 0.16720207035541534, |
| "learning_rate": 0.0001937610062893082, |
| "loss": 0.5329, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.90282131661442, |
| "grad_norm": 0.1653318852186203, |
| "learning_rate": 0.00019366037735849058, |
| "loss": 0.5361, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.9278996865203761, |
| "grad_norm": 0.1918332576751709, |
| "learning_rate": 0.00019355974842767297, |
| "loss": 0.6219, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.9529780564263324, |
| "grad_norm": 0.1535947322845459, |
| "learning_rate": 0.00019345911949685536, |
| "loss": 0.586, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.9780564263322884, |
| "grad_norm": 0.16222791373729706, |
| "learning_rate": 0.00019335849056603774, |
| "loss": 0.5228, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.24010032415390015, |
| "learning_rate": 0.00019325786163522013, |
| "loss": 0.5218, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.0250783699059562, |
| "grad_norm": 0.15844006836414337, |
| "learning_rate": 0.00019315723270440255, |
| "loss": 0.5111, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.050156739811912, |
| "grad_norm": 0.1755230873823166, |
| "learning_rate": 0.0001930566037735849, |
| "loss": 0.6063, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.0752351097178683, |
| "grad_norm": 0.2025759369134903, |
| "learning_rate": 0.0001929559748427673, |
| "loss": 0.5672, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.1003134796238245, |
| "grad_norm": 0.2058378905057907, |
| "learning_rate": 0.0001928553459119497, |
| "loss": 0.4629, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.1253918495297803, |
| "grad_norm": 0.1765381544828415, |
| "learning_rate": 0.00019275471698113208, |
| "loss": 0.5236, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.1504702194357366, |
| "grad_norm": 0.20618358254432678, |
| "learning_rate": 0.0001926540880503145, |
| "loss": 0.5029, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.175548589341693, |
| "grad_norm": 0.1737968772649765, |
| "learning_rate": 0.00019255345911949688, |
| "loss": 0.3964, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.200626959247649, |
| "grad_norm": 0.20385882258415222, |
| "learning_rate": 0.00019245283018867927, |
| "loss": 0.5188, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.225705329153605, |
| "grad_norm": 0.2051456868648529, |
| "learning_rate": 0.00019235220125786163, |
| "loss": 0.4548, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.250783699059561, |
| "grad_norm": 0.18826241791248322, |
| "learning_rate": 0.00019225157232704402, |
| "loss": 0.4515, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.2758620689655173, |
| "grad_norm": 0.18653476238250732, |
| "learning_rate": 0.00019215094339622644, |
| "loss": 0.5373, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.300940438871473, |
| "grad_norm": 0.179554283618927, |
| "learning_rate": 0.00019205031446540882, |
| "loss": 0.49, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.3260188087774294, |
| "grad_norm": 0.18949083983898163, |
| "learning_rate": 0.0001919496855345912, |
| "loss": 0.4795, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.3510971786833856, |
| "grad_norm": 0.21681569516658783, |
| "learning_rate": 0.0001918490566037736, |
| "loss": 0.4826, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.376175548589342, |
| "grad_norm": 0.20997639000415802, |
| "learning_rate": 0.000191748427672956, |
| "loss": 0.3699, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.4012539184952977, |
| "grad_norm": 0.3043127954006195, |
| "learning_rate": 0.00019164779874213838, |
| "loss": 0.5264, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.426332288401254, |
| "grad_norm": 0.19533301889896393, |
| "learning_rate": 0.00019154716981132077, |
| "loss": 0.4243, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.45141065830721, |
| "grad_norm": 0.20891591906547546, |
| "learning_rate": 0.00019144654088050316, |
| "loss": 0.4748, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.476489028213166, |
| "grad_norm": 0.1940625011920929, |
| "learning_rate": 0.00019134591194968554, |
| "loss": 0.456, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.501567398119122, |
| "grad_norm": 0.2169208973646164, |
| "learning_rate": 0.00019124528301886793, |
| "loss": 0.577, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.5266457680250785, |
| "grad_norm": 0.21462920308113098, |
| "learning_rate": 0.00019114465408805032, |
| "loss": 0.3583, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.5517241379310347, |
| "grad_norm": 0.22243842482566833, |
| "learning_rate": 0.0001910440251572327, |
| "loss": 0.5473, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.5768025078369905, |
| "grad_norm": 0.20357415080070496, |
| "learning_rate": 0.0001909433962264151, |
| "loss": 0.5596, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.6018808777429467, |
| "grad_norm": 0.21374137699604034, |
| "learning_rate": 0.0001908427672955975, |
| "loss": 0.6041, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.626959247648903, |
| "grad_norm": 0.22612103819847107, |
| "learning_rate": 0.00019074213836477988, |
| "loss": 0.4825, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.652037617554859, |
| "grad_norm": 0.182185098528862, |
| "learning_rate": 0.0001906415094339623, |
| "loss": 0.5961, |
| "step": 212 |
| }, |
| { |
| "epoch": 2.677115987460815, |
| "grad_norm": 0.21316243708133698, |
| "learning_rate": 0.00019054088050314468, |
| "loss": 0.4892, |
| "step": 214 |
| }, |
| { |
| "epoch": 2.7021943573667713, |
| "grad_norm": 0.20594292879104614, |
| "learning_rate": 0.00019044025157232704, |
| "loss": 0.5294, |
| "step": 216 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 0.18579436838626862, |
| "learning_rate": 0.00019033962264150943, |
| "loss": 0.4867, |
| "step": 218 |
| }, |
| { |
| "epoch": 2.7523510971786833, |
| "grad_norm": 0.20978513360023499, |
| "learning_rate": 0.00019023899371069182, |
| "loss": 0.5459, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.7774294670846396, |
| "grad_norm": 0.20766879618167877, |
| "learning_rate": 0.00019013836477987424, |
| "loss": 0.4467, |
| "step": 222 |
| }, |
| { |
| "epoch": 2.8025078369905954, |
| "grad_norm": 0.2247876673936844, |
| "learning_rate": 0.00019003773584905662, |
| "loss": 0.4955, |
| "step": 224 |
| }, |
| { |
| "epoch": 2.8275862068965516, |
| "grad_norm": 0.20031589269638062, |
| "learning_rate": 0.00018993710691823901, |
| "loss": 0.4274, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.852664576802508, |
| "grad_norm": 0.22423385083675385, |
| "learning_rate": 0.0001898364779874214, |
| "loss": 0.4741, |
| "step": 228 |
| }, |
| { |
| "epoch": 2.877742946708464, |
| "grad_norm": 0.1920011192560196, |
| "learning_rate": 0.00018973584905660376, |
| "loss": 0.4802, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.9028213166144203, |
| "grad_norm": 0.1996566653251648, |
| "learning_rate": 0.00018963522012578615, |
| "loss": 0.5204, |
| "step": 232 |
| }, |
| { |
| "epoch": 2.927899686520376, |
| "grad_norm": 0.18659324944019318, |
| "learning_rate": 0.00018953459119496857, |
| "loss": 0.563, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.9529780564263324, |
| "grad_norm": 0.20645543932914734, |
| "learning_rate": 0.00018943396226415096, |
| "loss": 0.5968, |
| "step": 236 |
| }, |
| { |
| "epoch": 2.978056426332288, |
| "grad_norm": 0.20103755593299866, |
| "learning_rate": 0.00018933333333333335, |
| "loss": 0.4683, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.26790672540664673, |
| "learning_rate": 0.00018923270440251573, |
| "loss": 0.4214, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.0250783699059562, |
| "grad_norm": 0.24234965443611145, |
| "learning_rate": 0.00018913207547169812, |
| "loss": 0.4778, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.050156739811912, |
| "grad_norm": 0.2758055031299591, |
| "learning_rate": 0.0001890314465408805, |
| "loss": 0.4029, |
| "step": 244 |
| }, |
| { |
| "epoch": 3.0752351097178683, |
| "grad_norm": 0.2382444590330124, |
| "learning_rate": 0.0001889308176100629, |
| "loss": 0.3801, |
| "step": 246 |
| }, |
| { |
| "epoch": 3.1003134796238245, |
| "grad_norm": 0.24490897357463837, |
| "learning_rate": 0.0001888301886792453, |
| "loss": 0.3975, |
| "step": 248 |
| }, |
| { |
| "epoch": 3.1253918495297803, |
| "grad_norm": 0.26239147782325745, |
| "learning_rate": 0.00018872955974842768, |
| "loss": 0.4038, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.1504702194357366, |
| "grad_norm": 0.24605032801628113, |
| "learning_rate": 0.00018862893081761007, |
| "loss": 0.4364, |
| "step": 252 |
| }, |
| { |
| "epoch": 3.175548589341693, |
| "grad_norm": 0.28764280676841736, |
| "learning_rate": 0.00018852830188679248, |
| "loss": 0.3577, |
| "step": 254 |
| }, |
| { |
| "epoch": 3.200626959247649, |
| "grad_norm": 0.23804618418216705, |
| "learning_rate": 0.00018842767295597484, |
| "loss": 0.4155, |
| "step": 256 |
| }, |
| { |
| "epoch": 3.225705329153605, |
| "grad_norm": 0.25497862696647644, |
| "learning_rate": 0.00018832704402515723, |
| "loss": 0.4654, |
| "step": 258 |
| }, |
| { |
| "epoch": 3.250783699059561, |
| "grad_norm": 0.23537839949131012, |
| "learning_rate": 0.00018822641509433962, |
| "loss": 0.4144, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.2758620689655173, |
| "grad_norm": 0.268036812543869, |
| "learning_rate": 0.000188125786163522, |
| "loss": 0.4546, |
| "step": 262 |
| }, |
| { |
| "epoch": 3.300940438871473, |
| "grad_norm": 0.25395911931991577, |
| "learning_rate": 0.00018802515723270443, |
| "loss": 0.4604, |
| "step": 264 |
| }, |
| { |
| "epoch": 3.3260188087774294, |
| "grad_norm": 0.3395281732082367, |
| "learning_rate": 0.00018792452830188681, |
| "loss": 0.5816, |
| "step": 266 |
| }, |
| { |
| "epoch": 3.3510971786833856, |
| "grad_norm": 0.258900910615921, |
| "learning_rate": 0.0001878238993710692, |
| "loss": 0.4415, |
| "step": 268 |
| }, |
| { |
| "epoch": 3.376175548589342, |
| "grad_norm": 0.24031828343868256, |
| "learning_rate": 0.00018772327044025156, |
| "loss": 0.469, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.4012539184952977, |
| "grad_norm": 0.26624906063079834, |
| "learning_rate": 0.00018762264150943395, |
| "loss": 0.4304, |
| "step": 272 |
| }, |
| { |
| "epoch": 3.426332288401254, |
| "grad_norm": 0.2869020998477936, |
| "learning_rate": 0.00018752201257861637, |
| "loss": 0.4623, |
| "step": 274 |
| }, |
| { |
| "epoch": 3.45141065830721, |
| "grad_norm": 0.2383798062801361, |
| "learning_rate": 0.00018742138364779876, |
| "loss": 0.3973, |
| "step": 276 |
| }, |
| { |
| "epoch": 3.476489028213166, |
| "grad_norm": 0.25947991013526917, |
| "learning_rate": 0.00018732075471698115, |
| "loss": 0.4468, |
| "step": 278 |
| }, |
| { |
| "epoch": 3.501567398119122, |
| "grad_norm": 0.21950559318065643, |
| "learning_rate": 0.00018722012578616354, |
| "loss": 0.3432, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.5266457680250785, |
| "grad_norm": 0.26003995537757874, |
| "learning_rate": 0.00018711949685534592, |
| "loss": 0.4664, |
| "step": 282 |
| }, |
| { |
| "epoch": 3.5517241379310347, |
| "grad_norm": 0.2847505807876587, |
| "learning_rate": 0.0001870188679245283, |
| "loss": 0.4583, |
| "step": 284 |
| }, |
| { |
| "epoch": 3.5768025078369905, |
| "grad_norm": 0.2824760973453522, |
| "learning_rate": 0.0001869182389937107, |
| "loss": 0.4735, |
| "step": 286 |
| }, |
| { |
| "epoch": 3.6018808777429467, |
| "grad_norm": 0.268838107585907, |
| "learning_rate": 0.0001868176100628931, |
| "loss": 0.4071, |
| "step": 288 |
| }, |
| { |
| "epoch": 3.626959247648903, |
| "grad_norm": 0.24519529938697815, |
| "learning_rate": 0.00018671698113207548, |
| "loss": 0.4178, |
| "step": 290 |
| }, |
| { |
| "epoch": 3.652037617554859, |
| "grad_norm": 0.24740180373191833, |
| "learning_rate": 0.00018661635220125787, |
| "loss": 0.4716, |
| "step": 292 |
| }, |
| { |
| "epoch": 3.677115987460815, |
| "grad_norm": 0.22623687982559204, |
| "learning_rate": 0.00018651572327044026, |
| "loss": 0.3645, |
| "step": 294 |
| }, |
| { |
| "epoch": 3.7021943573667713, |
| "grad_norm": 0.2554280459880829, |
| "learning_rate": 0.00018641509433962264, |
| "loss": 0.4044, |
| "step": 296 |
| }, |
| { |
| "epoch": 3.7272727272727275, |
| "grad_norm": 0.2251761108636856, |
| "learning_rate": 0.00018631446540880503, |
| "loss": 0.3663, |
| "step": 298 |
| }, |
| { |
| "epoch": 3.7523510971786833, |
| "grad_norm": 0.20053140819072723, |
| "learning_rate": 0.00018621383647798742, |
| "loss": 0.4342, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.7774294670846396, |
| "grad_norm": 0.2692326605319977, |
| "learning_rate": 0.0001861132075471698, |
| "loss": 0.4268, |
| "step": 302 |
| }, |
| { |
| "epoch": 3.8025078369905954, |
| "grad_norm": 0.23218081891536713, |
| "learning_rate": 0.00018601257861635223, |
| "loss": 0.4848, |
| "step": 304 |
| }, |
| { |
| "epoch": 3.8275862068965516, |
| "grad_norm": 0.2571001648902893, |
| "learning_rate": 0.00018591194968553462, |
| "loss": 0.5391, |
| "step": 306 |
| }, |
| { |
| "epoch": 3.852664576802508, |
| "grad_norm": 0.20899826288223267, |
| "learning_rate": 0.00018581132075471698, |
| "loss": 0.4183, |
| "step": 308 |
| }, |
| { |
| "epoch": 3.877742946708464, |
| "grad_norm": 0.24893143773078918, |
| "learning_rate": 0.00018571069182389937, |
| "loss": 0.4314, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.9028213166144203, |
| "grad_norm": 0.26598888635635376, |
| "learning_rate": 0.00018561006289308175, |
| "loss": 0.4182, |
| "step": 312 |
| }, |
| { |
| "epoch": 3.927899686520376, |
| "grad_norm": 0.24121470749378204, |
| "learning_rate": 0.00018550943396226417, |
| "loss": 0.4327, |
| "step": 314 |
| }, |
| { |
| "epoch": 3.9529780564263324, |
| "grad_norm": 0.2874317467212677, |
| "learning_rate": 0.00018540880503144656, |
| "loss": 0.4616, |
| "step": 316 |
| }, |
| { |
| "epoch": 3.978056426332288, |
| "grad_norm": 0.22735589742660522, |
| "learning_rate": 0.00018530817610062895, |
| "loss": 0.376, |
| "step": 318 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.3424147069454193, |
| "learning_rate": 0.00018520754716981134, |
| "loss": 0.537, |
| "step": 320 |
| }, |
| { |
| "epoch": 4.025078369905956, |
| "grad_norm": 0.27403339743614197, |
| "learning_rate": 0.0001851069182389937, |
| "loss": 0.3539, |
| "step": 322 |
| }, |
| { |
| "epoch": 4.0501567398119125, |
| "grad_norm": 0.33905503153800964, |
| "learning_rate": 0.0001850062893081761, |
| "loss": 0.3367, |
| "step": 324 |
| }, |
| { |
| "epoch": 4.075235109717869, |
| "grad_norm": 0.47853460907936096, |
| "learning_rate": 0.0001849056603773585, |
| "loss": 0.393, |
| "step": 326 |
| }, |
| { |
| "epoch": 4.100313479623824, |
| "grad_norm": 0.3133102059364319, |
| "learning_rate": 0.0001848050314465409, |
| "loss": 0.3762, |
| "step": 328 |
| }, |
| { |
| "epoch": 4.12539184952978, |
| "grad_norm": 0.22834369540214539, |
| "learning_rate": 0.00018470440251572328, |
| "loss": 0.3404, |
| "step": 330 |
| }, |
| { |
| "epoch": 4.150470219435737, |
| "grad_norm": 0.2537166476249695, |
| "learning_rate": 0.00018460377358490567, |
| "loss": 0.3582, |
| "step": 332 |
| }, |
| { |
| "epoch": 4.175548589341693, |
| "grad_norm": 0.35708144307136536, |
| "learning_rate": 0.00018450314465408806, |
| "loss": 0.369, |
| "step": 334 |
| }, |
| { |
| "epoch": 4.200626959247649, |
| "grad_norm": 0.3224405348300934, |
| "learning_rate": 0.00018440251572327045, |
| "loss": 0.3458, |
| "step": 336 |
| }, |
| { |
| "epoch": 4.225705329153605, |
| "grad_norm": 0.34621739387512207, |
| "learning_rate": 0.00018430188679245283, |
| "loss": 0.4176, |
| "step": 338 |
| }, |
| { |
| "epoch": 4.250783699059561, |
| "grad_norm": 0.24818404018878937, |
| "learning_rate": 0.00018420125786163522, |
| "loss": 0.3205, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.275862068965517, |
| "grad_norm": 0.29599064588546753, |
| "learning_rate": 0.0001841006289308176, |
| "loss": 0.3696, |
| "step": 342 |
| }, |
| { |
| "epoch": 4.300940438871473, |
| "grad_norm": 0.2980504333972931, |
| "learning_rate": 0.00018400000000000003, |
| "loss": 0.3256, |
| "step": 344 |
| }, |
| { |
| "epoch": 4.326018808777429, |
| "grad_norm": 0.35454604029655457, |
| "learning_rate": 0.0001838993710691824, |
| "loss": 0.4957, |
| "step": 346 |
| }, |
| { |
| "epoch": 4.351097178683386, |
| "grad_norm": 0.3497369885444641, |
| "learning_rate": 0.00018379874213836478, |
| "loss": 0.3941, |
| "step": 348 |
| }, |
| { |
| "epoch": 4.376175548589342, |
| "grad_norm": 0.32460692524909973, |
| "learning_rate": 0.00018369811320754717, |
| "loss": 0.3615, |
| "step": 350 |
| }, |
| { |
| "epoch": 4.401253918495298, |
| "grad_norm": 0.29358094930648804, |
| "learning_rate": 0.00018359748427672955, |
| "loss": 0.3749, |
| "step": 352 |
| }, |
| { |
| "epoch": 4.4263322884012535, |
| "grad_norm": 0.2807920575141907, |
| "learning_rate": 0.00018349685534591197, |
| "loss": 0.3575, |
| "step": 354 |
| }, |
| { |
| "epoch": 4.45141065830721, |
| "grad_norm": 0.2809455096721649, |
| "learning_rate": 0.00018339622641509436, |
| "loss": 0.3097, |
| "step": 356 |
| }, |
| { |
| "epoch": 4.476489028213166, |
| "grad_norm": 0.3250884413719177, |
| "learning_rate": 0.00018329559748427675, |
| "loss": 0.4113, |
| "step": 358 |
| }, |
| { |
| "epoch": 4.501567398119122, |
| "grad_norm": 0.29040804505348206, |
| "learning_rate": 0.0001831949685534591, |
| "loss": 0.4481, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.5266457680250785, |
| "grad_norm": 0.3208359479904175, |
| "learning_rate": 0.0001830943396226415, |
| "loss": 0.4438, |
| "step": 362 |
| }, |
| { |
| "epoch": 4.551724137931035, |
| "grad_norm": 0.23080404102802277, |
| "learning_rate": 0.00018299371069182391, |
| "loss": 0.2099, |
| "step": 364 |
| }, |
| { |
| "epoch": 4.576802507836991, |
| "grad_norm": 0.2984071373939514, |
| "learning_rate": 0.0001828930817610063, |
| "loss": 0.3339, |
| "step": 366 |
| }, |
| { |
| "epoch": 4.601880877742946, |
| "grad_norm": 0.3299279808998108, |
| "learning_rate": 0.0001827924528301887, |
| "loss": 0.4208, |
| "step": 368 |
| }, |
| { |
| "epoch": 4.6269592476489025, |
| "grad_norm": 0.3243483006954193, |
| "learning_rate": 0.00018269182389937108, |
| "loss": 0.4165, |
| "step": 370 |
| }, |
| { |
| "epoch": 4.652037617554859, |
| "grad_norm": 0.29541853070259094, |
| "learning_rate": 0.00018259119496855347, |
| "loss": 0.3703, |
| "step": 372 |
| }, |
| { |
| "epoch": 4.677115987460815, |
| "grad_norm": 0.3010431230068207, |
| "learning_rate": 0.00018249056603773586, |
| "loss": 0.4034, |
| "step": 374 |
| }, |
| { |
| "epoch": 4.702194357366771, |
| "grad_norm": 0.2970607578754425, |
| "learning_rate": 0.00018238993710691825, |
| "loss": 0.3151, |
| "step": 376 |
| }, |
| { |
| "epoch": 4.7272727272727275, |
| "grad_norm": 0.2794083058834076, |
| "learning_rate": 0.00018228930817610063, |
| "loss": 0.3605, |
| "step": 378 |
| }, |
| { |
| "epoch": 4.752351097178684, |
| "grad_norm": 0.2949012219905853, |
| "learning_rate": 0.00018218867924528302, |
| "loss": 0.343, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.777429467084639, |
| "grad_norm": 0.28160709142684937, |
| "learning_rate": 0.0001820880503144654, |
| "loss": 0.4515, |
| "step": 382 |
| }, |
| { |
| "epoch": 4.802507836990595, |
| "grad_norm": 0.296051561832428, |
| "learning_rate": 0.00018198742138364783, |
| "loss": 0.3908, |
| "step": 384 |
| }, |
| { |
| "epoch": 4.827586206896552, |
| "grad_norm": 0.26115506887435913, |
| "learning_rate": 0.0001818867924528302, |
| "loss": 0.3312, |
| "step": 386 |
| }, |
| { |
| "epoch": 4.852664576802508, |
| "grad_norm": 0.27632880210876465, |
| "learning_rate": 0.00018178616352201258, |
| "loss": 0.3179, |
| "step": 388 |
| }, |
| { |
| "epoch": 4.877742946708464, |
| "grad_norm": 0.2973230481147766, |
| "learning_rate": 0.00018168553459119497, |
| "loss": 0.382, |
| "step": 390 |
| }, |
| { |
| "epoch": 4.90282131661442, |
| "grad_norm": 0.2833520472049713, |
| "learning_rate": 0.00018158490566037736, |
| "loss": 0.3363, |
| "step": 392 |
| }, |
| { |
| "epoch": 4.927899686520377, |
| "grad_norm": 0.30823326110839844, |
| "learning_rate": 0.00018148427672955977, |
| "loss": 0.3234, |
| "step": 394 |
| }, |
| { |
| "epoch": 4.952978056426332, |
| "grad_norm": 0.2736763060092926, |
| "learning_rate": 0.00018138364779874216, |
| "loss": 0.415, |
| "step": 396 |
| }, |
| { |
| "epoch": 4.978056426332288, |
| "grad_norm": 0.2832898199558258, |
| "learning_rate": 0.00018128301886792455, |
| "loss": 0.3755, |
| "step": 398 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.3281781077384949, |
| "learning_rate": 0.0001811823899371069, |
| "loss": 0.2756, |
| "step": 400 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 4000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.25570189464994e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|