| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 400, |
| "global_step": 375, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.5721463561058044, |
| "learning_rate": 0.000997326203208556, |
| "loss": 22.6723, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 1.5221903324127197, |
| "learning_rate": 0.0009919786096256684, |
| "loss": 22.7502, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 3.269012212753296, |
| "learning_rate": 0.0009866310160427808, |
| "loss": 22.2706, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 4.567020416259766, |
| "learning_rate": 0.0009812834224598931, |
| "loss": 21.3625, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 7.019204139709473, |
| "learning_rate": 0.0009759358288770054, |
| "loss": 20.7279, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 8.498096466064453, |
| "learning_rate": 0.0009705882352941176, |
| "loss": 20.8221, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 7.8151397705078125, |
| "learning_rate": 0.00096524064171123, |
| "loss": 20.4136, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 8.028499603271484, |
| "learning_rate": 0.0009598930481283422, |
| "loss": 20.2719, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 8.516434669494629, |
| "learning_rate": 0.0009545454545454546, |
| "loss": 20.1681, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 8.52490520477295, |
| "learning_rate": 0.0009491978609625669, |
| "loss": 19.8895, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 6.709629058837891, |
| "learning_rate": 0.0009438502673796791, |
| "loss": 19.93, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 6.038687705993652, |
| "learning_rate": 0.0009385026737967914, |
| "loss": 19.6312, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 5.785665512084961, |
| "learning_rate": 0.0009331550802139037, |
| "loss": 19.7683, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 5.79067850112915, |
| "learning_rate": 0.0009278074866310161, |
| "loss": 19.6965, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 5.166928291320801, |
| "learning_rate": 0.0009224598930481284, |
| "loss": 19.4005, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 4.578023433685303, |
| "learning_rate": 0.0009171122994652407, |
| "loss": 19.3963, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 4.7540693283081055, |
| "learning_rate": 0.0009117647058823529, |
| "loss": 19.4129, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 5.394408226013184, |
| "learning_rate": 0.0009064171122994653, |
| "loss": 19.5821, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 4.4902753829956055, |
| "learning_rate": 0.0009010695187165776, |
| "loss": 19.6562, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 5.49019193649292, |
| "learning_rate": 0.0008957219251336899, |
| "loss": 19.3588, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 4.184142589569092, |
| "learning_rate": 0.0008903743315508022, |
| "loss": 18.9032, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 3.98618483543396, |
| "learning_rate": 0.0008850267379679144, |
| "loss": 19.1882, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 4.851687908172607, |
| "learning_rate": 0.0008796791443850267, |
| "loss": 19.4565, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 4.108444690704346, |
| "learning_rate": 0.0008743315508021391, |
| "loss": 19.6149, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 3.7055838108062744, |
| "learning_rate": 0.0008689839572192514, |
| "loss": 18.9573, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 4.930137634277344, |
| "learning_rate": 0.0008636363636363636, |
| "loss": 19.4389, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 3.910098075866699, |
| "learning_rate": 0.000858288770053476, |
| "loss": 19.1465, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 4.0127716064453125, |
| "learning_rate": 0.0008529411764705882, |
| "loss": 19.5038, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 4.495028018951416, |
| "learning_rate": 0.0008475935828877005, |
| "loss": 19.3252, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 3.7703821659088135, |
| "learning_rate": 0.0008422459893048129, |
| "loss": 19.0238, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 3.6335291862487793, |
| "learning_rate": 0.0008368983957219252, |
| "loss": 19.1296, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 3.819183588027954, |
| "learning_rate": 0.0008315508021390374, |
| "loss": 18.4946, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.528, |
| "grad_norm": 3.3171255588531494, |
| "learning_rate": 0.0008262032085561497, |
| "loss": 18.8054, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.544, |
| "grad_norm": 4.316566467285156, |
| "learning_rate": 0.000820855614973262, |
| "loss": 19.162, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 3.39648175239563, |
| "learning_rate": 0.0008155080213903744, |
| "loss": 18.5671, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.576, |
| "grad_norm": 3.7200136184692383, |
| "learning_rate": 0.0008101604278074867, |
| "loss": 18.9179, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.592, |
| "grad_norm": 3.6730430126190186, |
| "learning_rate": 0.0008048128342245989, |
| "loss": 18.7162, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.608, |
| "grad_norm": 3.5580945014953613, |
| "learning_rate": 0.0007994652406417113, |
| "loss": 19.0574, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.624, |
| "grad_norm": 3.4793589115142822, |
| "learning_rate": 0.0007941176470588235, |
| "loss": 18.8649, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 4.074679374694824, |
| "learning_rate": 0.0007887700534759359, |
| "loss": 18.5553, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.656, |
| "grad_norm": 3.315810441970825, |
| "learning_rate": 0.0007834224598930482, |
| "loss": 18.2136, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.672, |
| "grad_norm": 4.288172721862793, |
| "learning_rate": 0.0007780748663101605, |
| "loss": 18.6089, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.688, |
| "grad_norm": 3.5749149322509766, |
| "learning_rate": 0.0007727272727272727, |
| "loss": 18.8697, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.704, |
| "grad_norm": 3.608825206756592, |
| "learning_rate": 0.000767379679144385, |
| "loss": 18.4129, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.72, |
| "grad_norm": 3.5199592113494873, |
| "learning_rate": 0.0007620320855614974, |
| "loss": 18.1619, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.736, |
| "grad_norm": 3.5022549629211426, |
| "learning_rate": 0.0007566844919786096, |
| "loss": 18.7368, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.752, |
| "grad_norm": 3.6002230644226074, |
| "learning_rate": 0.000751336898395722, |
| "loss": 18.7792, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.768, |
| "grad_norm": 4.682362079620361, |
| "learning_rate": 0.0007459893048128342, |
| "loss": 18.5495, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.784, |
| "grad_norm": 3.6108767986297607, |
| "learning_rate": 0.0007406417112299465, |
| "loss": 18.7077, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 3.4719815254211426, |
| "learning_rate": 0.0007352941176470589, |
| "loss": 18.3262, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.816, |
| "grad_norm": 4.4115986824035645, |
| "learning_rate": 0.0007299465240641712, |
| "loss": 18.3416, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.832, |
| "grad_norm": 3.324169158935547, |
| "learning_rate": 0.0007245989304812834, |
| "loss": 18.7297, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.848, |
| "grad_norm": 3.4287421703338623, |
| "learning_rate": 0.0007192513368983958, |
| "loss": 18.4499, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.864, |
| "grad_norm": 3.9451239109039307, |
| "learning_rate": 0.000713903743315508, |
| "loss": 18.2669, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 3.5031988620758057, |
| "learning_rate": 0.0007085561497326202, |
| "loss": 18.8895, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.896, |
| "grad_norm": 3.5174903869628906, |
| "learning_rate": 0.0007032085561497327, |
| "loss": 18.2961, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.912, |
| "grad_norm": 4.080729961395264, |
| "learning_rate": 0.0006978609625668449, |
| "loss": 18.5613, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.928, |
| "grad_norm": 3.7523930072784424, |
| "learning_rate": 0.0006925133689839572, |
| "loss": 18.5538, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.944, |
| "grad_norm": 3.066669225692749, |
| "learning_rate": 0.0006871657754010695, |
| "loss": 18.6904, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 4.274256706237793, |
| "learning_rate": 0.0006818181818181818, |
| "loss": 18.6147, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.976, |
| "grad_norm": 3.690139055252075, |
| "learning_rate": 0.0006764705882352942, |
| "loss": 18.1693, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.992, |
| "grad_norm": 3.6681807041168213, |
| "learning_rate": 0.0006711229946524065, |
| "loss": 18.2498, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.008, |
| "grad_norm": 3.5203354358673096, |
| "learning_rate": 0.0006657754010695187, |
| "loss": 18.4522, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.024, |
| "grad_norm": 4.650991439819336, |
| "learning_rate": 0.000660427807486631, |
| "loss": 18.2839, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 3.7944228649139404, |
| "learning_rate": 0.0006550802139037433, |
| "loss": 18.051, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.056, |
| "grad_norm": 3.2437500953674316, |
| "learning_rate": 0.0006497326203208556, |
| "loss": 18.1842, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.072, |
| "grad_norm": 3.2863543033599854, |
| "learning_rate": 0.000644385026737968, |
| "loss": 18.2304, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.088, |
| "grad_norm": 3.553260326385498, |
| "learning_rate": 0.0006390374331550802, |
| "loss": 18.1385, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.104, |
| "grad_norm": 3.4277195930480957, |
| "learning_rate": 0.0006336898395721925, |
| "loss": 18.1337, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 3.974073886871338, |
| "learning_rate": 0.0006283422459893048, |
| "loss": 18.0326, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.1360000000000001, |
| "grad_norm": 3.3450510501861572, |
| "learning_rate": 0.0006229946524064172, |
| "loss": 18.2695, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.152, |
| "grad_norm": 3.2181997299194336, |
| "learning_rate": 0.0006176470588235294, |
| "loss": 18.0315, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.168, |
| "grad_norm": 3.8346364498138428, |
| "learning_rate": 0.0006122994652406418, |
| "loss": 18.4272, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.184, |
| "grad_norm": 3.2085418701171875, |
| "learning_rate": 0.000606951871657754, |
| "loss": 18.1768, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 3.462108850479126, |
| "learning_rate": 0.0006016042780748662, |
| "loss": 18.1731, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.216, |
| "grad_norm": 3.444965362548828, |
| "learning_rate": 0.0005962566844919787, |
| "loss": 18.3599, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.232, |
| "grad_norm": 3.3701171875, |
| "learning_rate": 0.0005909090909090909, |
| "loss": 18.1495, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.248, |
| "grad_norm": 3.5145843029022217, |
| "learning_rate": 0.0005855614973262032, |
| "loss": 18.0835, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.264, |
| "grad_norm": 3.4785313606262207, |
| "learning_rate": 0.0005802139037433155, |
| "loss": 17.8138, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 3.9735538959503174, |
| "learning_rate": 0.0005748663101604278, |
| "loss": 18.0071, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.296, |
| "grad_norm": 3.650447368621826, |
| "learning_rate": 0.00056951871657754, |
| "loss": 18.0124, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.312, |
| "grad_norm": 3.6459813117980957, |
| "learning_rate": 0.0005641711229946525, |
| "loss": 18.0059, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.328, |
| "grad_norm": 3.2154831886291504, |
| "learning_rate": 0.0005588235294117647, |
| "loss": 17.9694, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.3439999999999999, |
| "grad_norm": 3.367403507232666, |
| "learning_rate": 0.0005534759358288771, |
| "loss": 17.6557, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 3.9948298931121826, |
| "learning_rate": 0.0005481283422459893, |
| "loss": 18.1942, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.376, |
| "grad_norm": 3.3495073318481445, |
| "learning_rate": 0.0005427807486631015, |
| "loss": 18.2016, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.392, |
| "grad_norm": 3.373162269592285, |
| "learning_rate": 0.000537433155080214, |
| "loss": 18.0422, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.408, |
| "grad_norm": 4.063633441925049, |
| "learning_rate": 0.0005320855614973262, |
| "loss": 18.0809, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.424, |
| "grad_norm": 3.4912514686584473, |
| "learning_rate": 0.0005267379679144385, |
| "loss": 18.0674, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 3.5900015830993652, |
| "learning_rate": 0.0005213903743315508, |
| "loss": 17.9285, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.456, |
| "grad_norm": 4.066802024841309, |
| "learning_rate": 0.0005160427807486631, |
| "loss": 18.1551, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.472, |
| "grad_norm": 3.9782357215881348, |
| "learning_rate": 0.0005106951871657754, |
| "loss": 18.0509, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.488, |
| "grad_norm": 3.314682960510254, |
| "learning_rate": 0.0005053475935828878, |
| "loss": 17.7608, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.504, |
| "grad_norm": 3.3548595905303955, |
| "learning_rate": 0.0005, |
| "loss": 17.8103, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 3.3475797176361084, |
| "learning_rate": 0.0004946524064171123, |
| "loss": 17.9465, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.536, |
| "grad_norm": 3.4256432056427, |
| "learning_rate": 0.0004893048128342246, |
| "loss": 17.6619, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.552, |
| "grad_norm": 3.390056848526001, |
| "learning_rate": 0.0004839572192513369, |
| "loss": 17.9681, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.568, |
| "grad_norm": 3.4441208839416504, |
| "learning_rate": 0.00047860962566844924, |
| "loss": 17.9407, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.584, |
| "grad_norm": 3.2374165058135986, |
| "learning_rate": 0.0004732620320855615, |
| "loss": 17.7235, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 3.5628514289855957, |
| "learning_rate": 0.0004679144385026738, |
| "loss": 18.1743, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.616, |
| "grad_norm": 3.41139554977417, |
| "learning_rate": 0.00046256684491978613, |
| "loss": 17.8456, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.6320000000000001, |
| "grad_norm": 3.423110008239746, |
| "learning_rate": 0.0004572192513368984, |
| "loss": 17.6656, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.6480000000000001, |
| "grad_norm": 3.3344337940216064, |
| "learning_rate": 0.00045187165775401067, |
| "loss": 17.962, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.6640000000000001, |
| "grad_norm": 3.5036981105804443, |
| "learning_rate": 0.000446524064171123, |
| "loss": 18.0875, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 3.4953839778900146, |
| "learning_rate": 0.0004411764705882353, |
| "loss": 17.3435, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.696, |
| "grad_norm": 3.6864068508148193, |
| "learning_rate": 0.0004358288770053476, |
| "loss": 17.9087, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.712, |
| "grad_norm": 3.4755449295043945, |
| "learning_rate": 0.0004304812834224599, |
| "loss": 17.5076, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.728, |
| "grad_norm": 3.8116891384124756, |
| "learning_rate": 0.0004251336898395722, |
| "loss": 17.9272, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.744, |
| "grad_norm": 3.18284010887146, |
| "learning_rate": 0.0004197860962566845, |
| "loss": 17.7148, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 3.2884979248046875, |
| "learning_rate": 0.0004144385026737968, |
| "loss": 17.8813, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.776, |
| "grad_norm": 3.3735768795013428, |
| "learning_rate": 0.00040909090909090913, |
| "loss": 18.0372, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.792, |
| "grad_norm": 3.2611794471740723, |
| "learning_rate": 0.00040374331550802143, |
| "loss": 17.3771, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.808, |
| "grad_norm": 3.3338570594787598, |
| "learning_rate": 0.00039839572192513367, |
| "loss": 18.4657, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.8239999999999998, |
| "grad_norm": 3.405127763748169, |
| "learning_rate": 0.000393048128342246, |
| "loss": 17.9076, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 3.561793565750122, |
| "learning_rate": 0.0003877005347593583, |
| "loss": 17.8996, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.8559999999999999, |
| "grad_norm": 3.5615479946136475, |
| "learning_rate": 0.00038235294117647055, |
| "loss": 17.6746, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.8719999999999999, |
| "grad_norm": 3.4306275844573975, |
| "learning_rate": 0.0003770053475935829, |
| "loss": 17.7182, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.888, |
| "grad_norm": 3.5057003498077393, |
| "learning_rate": 0.0003716577540106952, |
| "loss": 17.8058, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.904, |
| "grad_norm": 3.3117101192474365, |
| "learning_rate": 0.0003663101604278075, |
| "loss": 17.8643, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 3.6897945404052734, |
| "learning_rate": 0.0003609625668449198, |
| "loss": 17.8266, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.936, |
| "grad_norm": 3.7577505111694336, |
| "learning_rate": 0.0003556149732620321, |
| "loss": 18.6381, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.952, |
| "grad_norm": 3.2401480674743652, |
| "learning_rate": 0.0003502673796791444, |
| "loss": 17.6933, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.968, |
| "grad_norm": 3.6619515419006348, |
| "learning_rate": 0.0003449197860962567, |
| "loss": 18.0547, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.984, |
| "grad_norm": 3.8387668132781982, |
| "learning_rate": 0.000339572192513369, |
| "loss": 17.7932, |
| "step": 248 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 3.390653371810913, |
| "learning_rate": 0.0003342245989304813, |
| "loss": 17.2655, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.016, |
| "grad_norm": 3.40058970451355, |
| "learning_rate": 0.00032887700534759356, |
| "loss": 17.703, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.032, |
| "grad_norm": 3.568702220916748, |
| "learning_rate": 0.0003235294117647059, |
| "loss": 17.2042, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.048, |
| "grad_norm": 3.529431104660034, |
| "learning_rate": 0.0003181818181818182, |
| "loss": 17.5732, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.064, |
| "grad_norm": 3.3919003009796143, |
| "learning_rate": 0.00031283422459893044, |
| "loss": 17.6191, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.08, |
| "grad_norm": 3.878042459487915, |
| "learning_rate": 0.0003074866310160428, |
| "loss": 17.4911, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.096, |
| "grad_norm": 3.772318124771118, |
| "learning_rate": 0.0003021390374331551, |
| "loss": 17.7258, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.112, |
| "grad_norm": 3.4453060626983643, |
| "learning_rate": 0.0002967914438502674, |
| "loss": 17.4906, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.128, |
| "grad_norm": 3.4957454204559326, |
| "learning_rate": 0.0002914438502673797, |
| "loss": 17.5716, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.144, |
| "grad_norm": 3.530831813812256, |
| "learning_rate": 0.000286096256684492, |
| "loss": 17.4089, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 3.7524755001068115, |
| "learning_rate": 0.0002807486631016043, |
| "loss": 17.7712, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.176, |
| "grad_norm": 3.297961711883545, |
| "learning_rate": 0.00027540106951871656, |
| "loss": 17.4408, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.192, |
| "grad_norm": 3.3661088943481445, |
| "learning_rate": 0.0002700534759358289, |
| "loss": 17.6753, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.208, |
| "grad_norm": 3.646210193634033, |
| "learning_rate": 0.0002647058823529412, |
| "loss": 17.7821, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.224, |
| "grad_norm": 3.475140333175659, |
| "learning_rate": 0.00025935828877005345, |
| "loss": 17.6129, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.24, |
| "grad_norm": 3.4734578132629395, |
| "learning_rate": 0.0002540106951871658, |
| "loss": 17.6856, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.2560000000000002, |
| "grad_norm": 3.491572380065918, |
| "learning_rate": 0.0002486631016042781, |
| "loss": 17.6071, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.2720000000000002, |
| "grad_norm": 3.4102542400360107, |
| "learning_rate": 0.0002433155080213904, |
| "loss": 17.352, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.288, |
| "grad_norm": 3.393477439880371, |
| "learning_rate": 0.00023796791443850268, |
| "loss": 17.2612, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.304, |
| "grad_norm": 3.112462282180786, |
| "learning_rate": 0.000232620320855615, |
| "loss": 17.3272, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.32, |
| "grad_norm": 3.3398191928863525, |
| "learning_rate": 0.00022727272727272727, |
| "loss": 17.5815, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.336, |
| "grad_norm": 3.5039889812469482, |
| "learning_rate": 0.00022192513368983957, |
| "loss": 17.7557, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.352, |
| "grad_norm": 3.532892942428589, |
| "learning_rate": 0.0002165775401069519, |
| "loss": 18.0523, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.368, |
| "grad_norm": 3.2969062328338623, |
| "learning_rate": 0.00021122994652406418, |
| "loss": 17.7496, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.384, |
| "grad_norm": 3.262855291366577, |
| "learning_rate": 0.00020588235294117645, |
| "loss": 17.793, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 3.459914445877075, |
| "learning_rate": 0.00020053475935828877, |
| "loss": 17.9245, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.416, |
| "grad_norm": 3.6749696731567383, |
| "learning_rate": 0.00019518716577540107, |
| "loss": 17.7125, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.432, |
| "grad_norm": 3.266754150390625, |
| "learning_rate": 0.0001898395721925134, |
| "loss": 17.5905, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.448, |
| "grad_norm": 3.1848971843719482, |
| "learning_rate": 0.00018449197860962566, |
| "loss": 17.523, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.464, |
| "grad_norm": 3.2962844371795654, |
| "learning_rate": 0.00017914438502673795, |
| "loss": 17.5297, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 3.4688000679016113, |
| "learning_rate": 0.00017379679144385028, |
| "loss": 17.6315, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.496, |
| "grad_norm": 3.4146833419799805, |
| "learning_rate": 0.00016844919786096257, |
| "loss": 17.5776, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.512, |
| "grad_norm": 3.3122944831848145, |
| "learning_rate": 0.0001631016042780749, |
| "loss": 17.7264, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.528, |
| "grad_norm": 3.2939462661743164, |
| "learning_rate": 0.00015775401069518716, |
| "loss": 17.48, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.544, |
| "grad_norm": 3.8504631519317627, |
| "learning_rate": 0.00015240641711229946, |
| "loss": 17.3854, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.56, |
| "grad_norm": 4.062356948852539, |
| "learning_rate": 0.00014705882352941178, |
| "loss": 17.6811, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.576, |
| "grad_norm": 3.741989850997925, |
| "learning_rate": 0.00014171122994652407, |
| "loss": 17.4078, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.592, |
| "grad_norm": 3.7287967205047607, |
| "learning_rate": 0.00013636363636363637, |
| "loss": 17.3517, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.608, |
| "grad_norm": 3.6224465370178223, |
| "learning_rate": 0.00013101604278074866, |
| "loss": 17.254, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.624, |
| "grad_norm": 3.5674147605895996, |
| "learning_rate": 0.00012566844919786096, |
| "loss": 17.869, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.64, |
| "grad_norm": 3.722736358642578, |
| "learning_rate": 0.00012032085561497325, |
| "loss": 17.7399, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.656, |
| "grad_norm": 3.6463096141815186, |
| "learning_rate": 0.00011497326203208556, |
| "loss": 17.5016, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.672, |
| "grad_norm": 3.5358524322509766, |
| "learning_rate": 0.00010962566844919786, |
| "loss": 17.0355, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.6879999999999997, |
| "grad_norm": 3.5321309566497803, |
| "learning_rate": 0.00010427807486631017, |
| "loss": 17.5089, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.7039999999999997, |
| "grad_norm": 3.4019291400909424, |
| "learning_rate": 9.893048128342247e-05, |
| "loss": 17.3768, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.7199999999999998, |
| "grad_norm": 3.4486570358276367, |
| "learning_rate": 9.358288770053476e-05, |
| "loss": 17.488, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.7359999999999998, |
| "grad_norm": 3.7740256786346436, |
| "learning_rate": 8.823529411764706e-05, |
| "loss": 17.5768, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.752, |
| "grad_norm": 3.5659339427948, |
| "learning_rate": 8.288770053475936e-05, |
| "loss": 17.6865, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.768, |
| "grad_norm": 3.3678972721099854, |
| "learning_rate": 7.754010695187167e-05, |
| "loss": 17.4687, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.784, |
| "grad_norm": 3.585134506225586, |
| "learning_rate": 7.219251336898395e-05, |
| "loss": 17.536, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 3.6471846103668213, |
| "learning_rate": 6.684491978609626e-05, |
| "loss": 17.6269, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.816, |
| "grad_norm": 3.533790111541748, |
| "learning_rate": 6.149732620320857e-05, |
| "loss": 17.5771, |
| "step": 352 |
| }, |
| { |
| "epoch": 2.832, |
| "grad_norm": 3.7971367835998535, |
| "learning_rate": 5.614973262032086e-05, |
| "loss": 17.874, |
| "step": 354 |
| }, |
| { |
| "epoch": 2.848, |
| "grad_norm": 3.391874074935913, |
| "learning_rate": 5.080213903743316e-05, |
| "loss": 17.2528, |
| "step": 356 |
| }, |
| { |
| "epoch": 2.864, |
| "grad_norm": 3.069033145904541, |
| "learning_rate": 4.545454545454546e-05, |
| "loss": 17.6175, |
| "step": 358 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 3.780275821685791, |
| "learning_rate": 4.0106951871657754e-05, |
| "loss": 17.2663, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.896, |
| "grad_norm": 3.3377978801727295, |
| "learning_rate": 3.4759358288770055e-05, |
| "loss": 17.3711, |
| "step": 362 |
| }, |
| { |
| "epoch": 2.912, |
| "grad_norm": 3.356203317642212, |
| "learning_rate": 2.9411764705882354e-05, |
| "loss": 17.6077, |
| "step": 364 |
| }, |
| { |
| "epoch": 2.928, |
| "grad_norm": 3.302241563796997, |
| "learning_rate": 2.4064171122994652e-05, |
| "loss": 17.4777, |
| "step": 366 |
| }, |
| { |
| "epoch": 2.944, |
| "grad_norm": 3.73811411857605, |
| "learning_rate": 1.871657754010695e-05, |
| "loss": 17.3149, |
| "step": 368 |
| }, |
| { |
| "epoch": 2.96, |
| "grad_norm": 3.392902135848999, |
| "learning_rate": 1.336898395721925e-05, |
| "loss": 17.8118, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.976, |
| "grad_norm": 3.8080010414123535, |
| "learning_rate": 8.021390374331552e-06, |
| "loss": 17.1875, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.992, |
| "grad_norm": 3.5202646255493164, |
| "learning_rate": 2.67379679144385e-06, |
| "loss": 17.7556, |
| "step": 374 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 375, |
| "total_flos": 2.6461914289864704e+17, |
| "train_loss": 18.264725362141927, |
| "train_runtime": 1944.3243, |
| "train_samples_per_second": 24.687, |
| "train_steps_per_second": 0.193 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 2.2290163040161133, |
| "eval_runtime": 83.3238, |
| "eval_samples_per_second": 24.003, |
| "eval_steps_per_second": 3.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 2.226619243621826, |
| "eval_runtime": 83.9815, |
| "eval_samples_per_second": 23.815, |
| "eval_steps_per_second": 2.977, |
| "step": 375 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 375, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.6461914289864704e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|