diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7174 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1020, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00196078431372549, + "grad_norm": 138868.515625, + "learning_rate": 0.0, + "loss": 17.658615112304688, + "step": 1 + }, + { + "epoch": 0.00392156862745098, + "grad_norm": 14902.44140625, + "learning_rate": 3.2258064516129035e-07, + "loss": 15.435331344604492, + "step": 2 + }, + { + "epoch": 0.0058823529411764705, + "grad_norm": 18682.611328125, + "learning_rate": 6.451612903225807e-07, + "loss": 17.053977966308594, + "step": 3 + }, + { + "epoch": 0.00784313725490196, + "grad_norm": 25779.986328125, + "learning_rate": 9.67741935483871e-07, + "loss": 17.117534637451172, + "step": 4 + }, + { + "epoch": 0.00980392156862745, + "grad_norm": 15318.7373046875, + "learning_rate": 1.2903225806451614e-06, + "loss": 17.542219161987305, + "step": 5 + }, + { + "epoch": 0.011764705882352941, + "grad_norm": 19811.609375, + "learning_rate": 1.6129032258064516e-06, + "loss": 17.963178634643555, + "step": 6 + }, + { + "epoch": 0.013725490196078431, + "grad_norm": 9440.365234375, + "learning_rate": 1.935483870967742e-06, + "loss": 13.022825241088867, + "step": 7 + }, + { + "epoch": 0.01568627450980392, + "grad_norm": 88075.3046875, + "learning_rate": 2.2580645161290324e-06, + "loss": 16.652803421020508, + "step": 8 + }, + { + "epoch": 0.01764705882352941, + "grad_norm": 20173.8203125, + "learning_rate": 2.580645161290323e-06, + "loss": 16.681129455566406, + "step": 9 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 20974.09765625, + "learning_rate": 2.903225806451613e-06, + "loss": 14.24374771118164, + "step": 10 + }, + { + "epoch": 0.021568627450980392, + "grad_norm": 22168.7890625, + "learning_rate": 3.225806451612903e-06, + "loss": 13.430747985839844, + "step": 11 + }, + { + "epoch": 0.023529411764705882, + "grad_norm": 6798.2158203125, + "learning_rate": 3.548387096774194e-06, + "loss": 13.947168350219727, + "step": 12 + }, + { + "epoch": 0.025490196078431372, + "grad_norm": 11967.0244140625, + "learning_rate": 3.870967741935484e-06, + "loss": 15.127799034118652, + "step": 13 + }, + { + "epoch": 0.027450980392156862, + "grad_norm": 11873.794921875, + "learning_rate": 4.193548387096774e-06, + "loss": 13.805913925170898, + "step": 14 + }, + { + "epoch": 0.029411764705882353, + "grad_norm": 25335.591796875, + "learning_rate": 4.516129032258065e-06, + "loss": 13.47291374206543, + "step": 15 + }, + { + "epoch": 0.03137254901960784, + "grad_norm": 21837.5234375, + "learning_rate": 4.838709677419355e-06, + "loss": 14.185405731201172, + "step": 16 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 6198.25146484375, + "learning_rate": 5.161290322580646e-06, + "loss": 11.65322494506836, + "step": 17 + }, + { + "epoch": 0.03529411764705882, + "grad_norm": 5560.4794921875, + "learning_rate": 5.483870967741935e-06, + "loss": 11.970712661743164, + "step": 18 + }, + { + "epoch": 0.03725490196078431, + "grad_norm": 2361.835693359375, + "learning_rate": 5.806451612903226e-06, + "loss": 11.846226692199707, + "step": 19 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 3759.865478515625, + "learning_rate": 6.129032258064517e-06, + "loss": 10.710617065429688, + "step": 20 + }, + { + "epoch": 0.041176470588235294, + "grad_norm": 6874.8154296875, + "learning_rate": 6.451612903225806e-06, + "loss": 9.231167793273926, + "step": 21 + }, + { + "epoch": 0.043137254901960784, + "grad_norm": 19366.40625, + "learning_rate": 6.774193548387097e-06, + "loss": 11.892879486083984, + "step": 22 + }, + { + "epoch": 0.045098039215686274, + "grad_norm": 6423.419921875, + "learning_rate": 7.096774193548388e-06, + "loss": 9.792671203613281, + "step": 23 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 3562.012451171875, + "learning_rate": 7.4193548387096784e-06, + "loss": 9.525957107543945, + "step": 24 + }, + { + "epoch": 0.049019607843137254, + "grad_norm": 4905.87060546875, + "learning_rate": 7.741935483870968e-06, + "loss": 11.546586036682129, + "step": 25 + }, + { + "epoch": 0.050980392156862744, + "grad_norm": 162666.359375, + "learning_rate": 8.064516129032258e-06, + "loss": 9.598024368286133, + "step": 26 + }, + { + "epoch": 0.052941176470588235, + "grad_norm": 8230.1279296875, + "learning_rate": 8.387096774193549e-06, + "loss": 12.389551162719727, + "step": 27 + }, + { + "epoch": 0.054901960784313725, + "grad_norm": 14983.3798828125, + "learning_rate": 8.70967741935484e-06, + "loss": 14.294782638549805, + "step": 28 + }, + { + "epoch": 0.056862745098039215, + "grad_norm": 8721.31640625, + "learning_rate": 9.03225806451613e-06, + "loss": 10.350369453430176, + "step": 29 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 2975.934814453125, + "learning_rate": 9.35483870967742e-06, + "loss": 9.85280704498291, + "step": 30 + }, + { + "epoch": 0.060784313725490195, + "grad_norm": 4804.4228515625, + "learning_rate": 9.67741935483871e-06, + "loss": 8.866147994995117, + "step": 31 + }, + { + "epoch": 0.06274509803921569, + "grad_norm": 3305.45361328125, + "learning_rate": 1e-05, + "loss": 10.436628341674805, + "step": 32 + }, + { + "epoch": 0.06470588235294118, + "grad_norm": 11790.716796875, + "learning_rate": 9.999974774092107e-06, + "loss": 10.873075485229492, + "step": 33 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 4500.14111328125, + "learning_rate": 9.999899096622962e-06, + "loss": 10.806778907775879, + "step": 34 + }, + { + "epoch": 0.06862745098039216, + "grad_norm": 164.95433044433594, + "learning_rate": 9.999772968356182e-06, + "loss": 11.633756637573242, + "step": 35 + }, + { + "epoch": 0.07058823529411765, + "grad_norm": 43294.8359375, + "learning_rate": 9.999596390564446e-06, + "loss": 9.791693687438965, + "step": 36 + }, + { + "epoch": 0.07254901960784314, + "grad_norm": 39442.3984375, + "learning_rate": 9.999369365029487e-06, + "loss": 8.951934814453125, + "step": 37 + }, + { + "epoch": 0.07450980392156863, + "grad_norm": 588.51171875, + "learning_rate": 9.999091894042077e-06, + "loss": 8.9212646484375, + "step": 38 + }, + { + "epoch": 0.07647058823529412, + "grad_norm": 1923.2791748046875, + "learning_rate": 9.998763980401997e-06, + "loss": 7.225367546081543, + "step": 39 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 657.7442016601562, + "learning_rate": 9.998385627418015e-06, + "loss": 6.479832172393799, + "step": 40 + }, + { + "epoch": 0.0803921568627451, + "grad_norm": 136.12237548828125, + "learning_rate": 9.997956838907853e-06, + "loss": 6.8389177322387695, + "step": 41 + }, + { + "epoch": 0.08235294117647059, + "grad_norm": 251.0398712158203, + "learning_rate": 9.997477619198138e-06, + "loss": 6.4207072257995605, + "step": 42 + }, + { + "epoch": 0.08431372549019608, + "grad_norm": 577.7664794921875, + "learning_rate": 9.996947973124372e-06, + "loss": 7.31302547454834, + "step": 43 + }, + { + "epoch": 0.08627450980392157, + "grad_norm": 91.31795501708984, + "learning_rate": 9.996367906030879e-06, + "loss": 6.951511383056641, + "step": 44 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 6586.15673828125, + "learning_rate": 9.995737423770746e-06, + "loss": 6.607446670532227, + "step": 45 + }, + { + "epoch": 0.09019607843137255, + "grad_norm": 2296.7861328125, + "learning_rate": 9.995056532705766e-06, + "loss": 5.819401264190674, + "step": 46 + }, + { + "epoch": 0.09215686274509804, + "grad_norm": 61.191524505615234, + "learning_rate": 9.994325239706377e-06, + "loss": 5.54649019241333, + "step": 47 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 82.74103546142578, + "learning_rate": 9.993543552151594e-06, + "loss": 5.546056747436523, + "step": 48 + }, + { + "epoch": 0.09607843137254903, + "grad_norm": 2329.645263671875, + "learning_rate": 9.992711477928925e-06, + "loss": 6.310848712921143, + "step": 49 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 210.10841369628906, + "learning_rate": 9.991829025434305e-06, + "loss": 4.921277046203613, + "step": 50 + }, + { + "epoch": 0.1, + "grad_norm": 153.93605041503906, + "learning_rate": 9.990896203571994e-06, + "loss": 5.7787909507751465, + "step": 51 + }, + { + "epoch": 0.10196078431372549, + "grad_norm": 146.04843139648438, + "learning_rate": 9.98991302175451e-06, + "loss": 5.398743629455566, + "step": 52 + }, + { + "epoch": 0.10392156862745099, + "grad_norm": 84.42292022705078, + "learning_rate": 9.98887948990251e-06, + "loss": 5.348798751831055, + "step": 53 + }, + { + "epoch": 0.10588235294117647, + "grad_norm": 1806.098388671875, + "learning_rate": 9.987795618444707e-06, + "loss": 5.749485969543457, + "step": 54 + }, + { + "epoch": 0.10784313725490197, + "grad_norm": 19992.6953125, + "learning_rate": 9.986661418317759e-06, + "loss": 4.9936370849609375, + "step": 55 + }, + { + "epoch": 0.10980392156862745, + "grad_norm": 479.07501220703125, + "learning_rate": 9.985476900966156e-06, + "loss": 5.318588733673096, + "step": 56 + }, + { + "epoch": 0.11176470588235295, + "grad_norm": 161.11553955078125, + "learning_rate": 9.984242078342108e-06, + "loss": 5.936580181121826, + "step": 57 + }, + { + "epoch": 0.11372549019607843, + "grad_norm": 47.58917236328125, + "learning_rate": 9.982956962905423e-06, + "loss": 5.245980739593506, + "step": 58 + }, + { + "epoch": 0.11568627450980393, + "grad_norm": 64.91607666015625, + "learning_rate": 9.981621567623385e-06, + "loss": 5.656664848327637, + "step": 59 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 193.08592224121094, + "learning_rate": 9.980235905970615e-06, + "loss": 5.183746337890625, + "step": 60 + }, + { + "epoch": 0.11960784313725491, + "grad_norm": 197.5607452392578, + "learning_rate": 9.978799991928945e-06, + "loss": 4.836249351501465, + "step": 61 + }, + { + "epoch": 0.12156862745098039, + "grad_norm": 836.5057983398438, + "learning_rate": 9.977313839987265e-06, + "loss": 4.21888542175293, + "step": 62 + }, + { + "epoch": 0.12352941176470589, + "grad_norm": 4496.34228515625, + "learning_rate": 9.975777465141391e-06, + "loss": 4.858551979064941, + "step": 63 + }, + { + "epoch": 0.12549019607843137, + "grad_norm": 33.644187927246094, + "learning_rate": 9.974190882893901e-06, + "loss": 4.454083442687988, + "step": 64 + }, + { + "epoch": 0.12745098039215685, + "grad_norm": 389.5293273925781, + "learning_rate": 9.972554109253988e-06, + "loss": 4.7087812423706055, + "step": 65 + }, + { + "epoch": 0.12941176470588237, + "grad_norm": 234.9954833984375, + "learning_rate": 9.970867160737293e-06, + "loss": 4.443643569946289, + "step": 66 + }, + { + "epoch": 0.13137254901960785, + "grad_norm": 127.71284484863281, + "learning_rate": 9.969130054365737e-06, + "loss": 5.485596179962158, + "step": 67 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 73.74437713623047, + "learning_rate": 9.967342807667355e-06, + "loss": 5.120403289794922, + "step": 68 + }, + { + "epoch": 0.13529411764705881, + "grad_norm": 49.9027214050293, + "learning_rate": 9.965505438676115e-06, + "loss": 5.075076580047607, + "step": 69 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 138.47344970703125, + "learning_rate": 9.963617965931738e-06, + "loss": 5.561940670013428, + "step": 70 + }, + { + "epoch": 0.1392156862745098, + "grad_norm": 322.7532043457031, + "learning_rate": 9.961680408479508e-06, + "loss": 4.983658790588379, + "step": 71 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 41682.1875, + "learning_rate": 9.959692785870086e-06, + "loss": 4.346513748168945, + "step": 72 + }, + { + "epoch": 0.14313725490196078, + "grad_norm": 22.722593307495117, + "learning_rate": 9.957655118159304e-06, + "loss": 4.279910087585449, + "step": 73 + }, + { + "epoch": 0.1450980392156863, + "grad_norm": 1315.896240234375, + "learning_rate": 9.955567425907968e-06, + "loss": 5.980461597442627, + "step": 74 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 34.87533950805664, + "learning_rate": 9.953429730181653e-06, + "loss": 4.460562705993652, + "step": 75 + }, + { + "epoch": 0.14901960784313725, + "grad_norm": 34.65178680419922, + "learning_rate": 9.951242052550487e-06, + "loss": 4.673140525817871, + "step": 76 + }, + { + "epoch": 0.15098039215686274, + "grad_norm": 188.9655303955078, + "learning_rate": 9.949004415088928e-06, + "loss": 4.867604732513428, + "step": 77 + }, + { + "epoch": 0.15294117647058825, + "grad_norm": 267.5862121582031, + "learning_rate": 9.946716840375552e-06, + "loss": 4.577199935913086, + "step": 78 + }, + { + "epoch": 0.15490196078431373, + "grad_norm": 427.12872314453125, + "learning_rate": 9.944379351492818e-06, + "loss": 4.855893135070801, + "step": 79 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 119.81890869140625, + "learning_rate": 9.941991972026839e-06, + "loss": 4.051677703857422, + "step": 80 + }, + { + "epoch": 0.1588235294117647, + "grad_norm": 6054.2958984375, + "learning_rate": 9.939554726067142e-06, + "loss": 4.933249473571777, + "step": 81 + }, + { + "epoch": 0.1607843137254902, + "grad_norm": 62.10055923461914, + "learning_rate": 9.937067638206418e-06, + "loss": 4.819094657897949, + "step": 82 + }, + { + "epoch": 0.1627450980392157, + "grad_norm": 450.60992431640625, + "learning_rate": 9.934530733540293e-06, + "loss": 4.19674825668335, + "step": 83 + }, + { + "epoch": 0.16470588235294117, + "grad_norm": 350.167724609375, + "learning_rate": 9.931944037667056e-06, + "loss": 4.2217607498168945, + "step": 84 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1959.1741943359375, + "learning_rate": 9.929307576687404e-06, + "loss": 4.780778884887695, + "step": 85 + }, + { + "epoch": 0.16862745098039217, + "grad_norm": 178.00027465820312, + "learning_rate": 9.926621377204188e-06, + "loss": 4.609301567077637, + "step": 86 + }, + { + "epoch": 0.17058823529411765, + "grad_norm": 1623.9786376953125, + "learning_rate": 9.923885466322135e-06, + "loss": 5.524645805358887, + "step": 87 + }, + { + "epoch": 0.17254901960784313, + "grad_norm": 26.787124633789062, + "learning_rate": 9.921099871647582e-06, + "loss": 4.657105445861816, + "step": 88 + }, + { + "epoch": 0.17450980392156862, + "grad_norm": 438.51654052734375, + "learning_rate": 9.918264621288187e-06, + "loss": 5.135782241821289, + "step": 89 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 68.33883666992188, + "learning_rate": 9.91537974385266e-06, + "loss": 4.544101715087891, + "step": 90 + }, + { + "epoch": 0.1784313725490196, + "grad_norm": 184.18646240234375, + "learning_rate": 9.912445268450459e-06, + "loss": 5.053075790405273, + "step": 91 + }, + { + "epoch": 0.1803921568627451, + "grad_norm": 1090.8336181640625, + "learning_rate": 9.909461224691506e-06, + "loss": 4.8303327560424805, + "step": 92 + }, + { + "epoch": 0.18235294117647058, + "grad_norm": 61.77042007446289, + "learning_rate": 9.906427642685889e-06, + "loss": 4.788046360015869, + "step": 93 + }, + { + "epoch": 0.1843137254901961, + "grad_norm": 103.00730895996094, + "learning_rate": 9.90334455304355e-06, + "loss": 5.147237300872803, + "step": 94 + }, + { + "epoch": 0.18627450980392157, + "grad_norm": 46.233280181884766, + "learning_rate": 9.900211986873986e-06, + "loss": 4.351109981536865, + "step": 95 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 305.6100769042969, + "learning_rate": 9.897029975785924e-06, + "loss": 4.575442790985107, + "step": 96 + }, + { + "epoch": 0.19019607843137254, + "grad_norm": 109.36982727050781, + "learning_rate": 9.89379855188701e-06, + "loss": 4.224271774291992, + "step": 97 + }, + { + "epoch": 0.19215686274509805, + "grad_norm": 100.88874816894531, + "learning_rate": 9.89051774778349e-06, + "loss": 4.574636936187744, + "step": 98 + }, + { + "epoch": 0.19411764705882353, + "grad_norm": 285.57757568359375, + "learning_rate": 9.887187596579865e-06, + "loss": 5.0750861167907715, + "step": 99 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 87.83949279785156, + "learning_rate": 9.883808131878573e-06, + "loss": 4.645264148712158, + "step": 100 + }, + { + "epoch": 0.1980392156862745, + "grad_norm": 414.4931335449219, + "learning_rate": 9.880379387779637e-06, + "loss": 4.512279510498047, + "step": 101 + }, + { + "epoch": 0.2, + "grad_norm": 607.8229370117188, + "learning_rate": 9.87690139888033e-06, + "loss": 4.384090423583984, + "step": 102 + }, + { + "epoch": 0.2019607843137255, + "grad_norm": 33.75107192993164, + "learning_rate": 9.873374200274826e-06, + "loss": 4.4997639656066895, + "step": 103 + }, + { + "epoch": 0.20392156862745098, + "grad_norm": 302.5324401855469, + "learning_rate": 9.869797827553837e-06, + "loss": 4.900559902191162, + "step": 104 + }, + { + "epoch": 0.20588235294117646, + "grad_norm": 59.07832717895508, + "learning_rate": 9.866172316804265e-06, + "loss": 4.729743957519531, + "step": 105 + }, + { + "epoch": 0.20784313725490197, + "grad_norm": 555.4403686523438, + "learning_rate": 9.862497704608829e-06, + "loss": 4.753190517425537, + "step": 106 + }, + { + "epoch": 0.20980392156862746, + "grad_norm": 649.705810546875, + "learning_rate": 9.8587740280457e-06, + "loss": 4.613556861877441, + "step": 107 + }, + { + "epoch": 0.21176470588235294, + "grad_norm": 414.4284973144531, + "learning_rate": 9.855001324688128e-06, + "loss": 4.940046310424805, + "step": 108 + }, + { + "epoch": 0.21372549019607842, + "grad_norm": 417.20989990234375, + "learning_rate": 9.851179632604057e-06, + "loss": 4.4412970542907715, + "step": 109 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 60.889366149902344, + "learning_rate": 9.847308990355752e-06, + "loss": 4.690826416015625, + "step": 110 + }, + { + "epoch": 0.21764705882352942, + "grad_norm": 83.28646087646484, + "learning_rate": 9.843389436999396e-06, + "loss": 4.576815605163574, + "step": 111 + }, + { + "epoch": 0.2196078431372549, + "grad_norm": 144.78509521484375, + "learning_rate": 9.839421012084709e-06, + "loss": 4.536053657531738, + "step": 112 + }, + { + "epoch": 0.22156862745098038, + "grad_norm": 87.0609359741211, + "learning_rate": 9.835403755654535e-06, + "loss": 4.452672004699707, + "step": 113 + }, + { + "epoch": 0.2235294117647059, + "grad_norm": 49.286476135253906, + "learning_rate": 9.831337708244454e-06, + "loss": 4.295703887939453, + "step": 114 + }, + { + "epoch": 0.22549019607843138, + "grad_norm": 225.94625854492188, + "learning_rate": 9.827222910882358e-06, + "loss": 5.346158504486084, + "step": 115 + }, + { + "epoch": 0.22745098039215686, + "grad_norm": 116.64401245117188, + "learning_rate": 9.82305940508805e-06, + "loss": 4.404465675354004, + "step": 116 + }, + { + "epoch": 0.22941176470588234, + "grad_norm": 71.7906265258789, + "learning_rate": 9.818847232872815e-06, + "loss": 4.849125862121582, + "step": 117 + }, + { + "epoch": 0.23137254901960785, + "grad_norm": 116.41288757324219, + "learning_rate": 9.814586436738998e-06, + "loss": 4.716423988342285, + "step": 118 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 107.20303344726562, + "learning_rate": 9.81027705967958e-06, + "loss": 4.078630447387695, + "step": 119 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 44.105812072753906, + "learning_rate": 9.805919145177741e-06, + "loss": 4.686631679534912, + "step": 120 + }, + { + "epoch": 0.2372549019607843, + "grad_norm": 793.5034790039062, + "learning_rate": 9.801512737206422e-06, + "loss": 5.099960803985596, + "step": 121 + }, + { + "epoch": 0.23921568627450981, + "grad_norm": 91.11273956298828, + "learning_rate": 9.797057880227878e-06, + "loss": 4.722168922424316, + "step": 122 + }, + { + "epoch": 0.2411764705882353, + "grad_norm": 25.48920440673828, + "learning_rate": 9.792554619193235e-06, + "loss": 4.521475791931152, + "step": 123 + }, + { + "epoch": 0.24313725490196078, + "grad_norm": 9693.837890625, + "learning_rate": 9.78800299954203e-06, + "loss": 4.789237976074219, + "step": 124 + }, + { + "epoch": 0.24509803921568626, + "grad_norm": 144.1415252685547, + "learning_rate": 9.783403067201763e-06, + "loss": 4.778863906860352, + "step": 125 + }, + { + "epoch": 0.24705882352941178, + "grad_norm": 11.692220687866211, + "learning_rate": 9.778754868587414e-06, + "loss": 5.121346473693848, + "step": 126 + }, + { + "epoch": 0.24901960784313726, + "grad_norm": 35.05064010620117, + "learning_rate": 9.774058450601003e-06, + "loss": 5.001660346984863, + "step": 127 + }, + { + "epoch": 0.25098039215686274, + "grad_norm": 137.78485107421875, + "learning_rate": 9.76931386063109e-06, + "loss": 4.57066535949707, + "step": 128 + }, + { + "epoch": 0.2529411764705882, + "grad_norm": 43.179466247558594, + "learning_rate": 9.76452114655231e-06, + "loss": 4.620499610900879, + "step": 129 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 33.09333038330078, + "learning_rate": 9.759680356724888e-06, + "loss": 4.002799034118652, + "step": 130 + }, + { + "epoch": 0.2568627450980392, + "grad_norm": 61.24403762817383, + "learning_rate": 9.754791539994153e-06, + "loss": 5.060644149780273, + "step": 131 + }, + { + "epoch": 0.25882352941176473, + "grad_norm": 8565.3466796875, + "learning_rate": 9.749854745690041e-06, + "loss": 4.671350479125977, + "step": 132 + }, + { + "epoch": 0.2607843137254902, + "grad_norm": 59.654541015625, + "learning_rate": 9.744870023626598e-06, + "loss": 4.615689277648926, + "step": 133 + }, + { + "epoch": 0.2627450980392157, + "grad_norm": 58.113155364990234, + "learning_rate": 9.739837424101484e-06, + "loss": 4.944394588470459, + "step": 134 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 44.76533126831055, + "learning_rate": 9.73475699789545e-06, + "loss": 4.559343338012695, + "step": 135 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 4049.171142578125, + "learning_rate": 9.729628796271844e-06, + "loss": 4.097330093383789, + "step": 136 + }, + { + "epoch": 0.26862745098039215, + "grad_norm": 91.75375366210938, + "learning_rate": 9.724452870976084e-06, + "loss": 4.160323143005371, + "step": 137 + }, + { + "epoch": 0.27058823529411763, + "grad_norm": 92.1652603149414, + "learning_rate": 9.719229274235134e-06, + "loss": 4.578685760498047, + "step": 138 + }, + { + "epoch": 0.2725490196078431, + "grad_norm": 21.699565887451172, + "learning_rate": 9.713958058756985e-06, + "loss": 4.3331217765808105, + "step": 139 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 2818.811279296875, + "learning_rate": 9.708639277730112e-06, + "loss": 4.388368606567383, + "step": 140 + }, + { + "epoch": 0.27647058823529413, + "grad_norm": 58.03261184692383, + "learning_rate": 9.703272984822947e-06, + "loss": 4.328610897064209, + "step": 141 + }, + { + "epoch": 0.2784313725490196, + "grad_norm": 85.0673599243164, + "learning_rate": 9.697859234183336e-06, + "loss": 4.389078617095947, + "step": 142 + }, + { + "epoch": 0.2803921568627451, + "grad_norm": 47.508522033691406, + "learning_rate": 9.692398080437991e-06, + "loss": 4.623535633087158, + "step": 143 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 26.19891357421875, + "learning_rate": 9.68688957869193e-06, + "loss": 4.690242767333984, + "step": 144 + }, + { + "epoch": 0.28431372549019607, + "grad_norm": 27.067237854003906, + "learning_rate": 9.681333784527945e-06, + "loss": 4.570530414581299, + "step": 145 + }, + { + "epoch": 0.28627450980392155, + "grad_norm": 241.20358276367188, + "learning_rate": 9.67573075400601e-06, + "loss": 4.467217922210693, + "step": 146 + }, + { + "epoch": 0.28823529411764703, + "grad_norm": 14.455266952514648, + "learning_rate": 9.670080543662742e-06, + "loss": 4.641494274139404, + "step": 147 + }, + { + "epoch": 0.2901960784313726, + "grad_norm": 31.17917251586914, + "learning_rate": 9.66438321051081e-06, + "loss": 4.5704450607299805, + "step": 148 + }, + { + "epoch": 0.29215686274509806, + "grad_norm": 31.33642578125, + "learning_rate": 9.658638812038379e-06, + "loss": 4.436771869659424, + "step": 149 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 44.800392150878906, + "learning_rate": 9.652847406208514e-06, + "loss": 4.712490558624268, + "step": 150 + }, + { + "epoch": 0.296078431372549, + "grad_norm": 25.672563552856445, + "learning_rate": 9.647009051458604e-06, + "loss": 5.042919158935547, + "step": 151 + }, + { + "epoch": 0.2980392156862745, + "grad_norm": 16.580514907836914, + "learning_rate": 9.641123806699769e-06, + "loss": 4.510254859924316, + "step": 152 + }, + { + "epoch": 0.3, + "grad_norm": 15.236343383789062, + "learning_rate": 9.635191731316262e-06, + "loss": 4.711069583892822, + "step": 153 + }, + { + "epoch": 0.30196078431372547, + "grad_norm": 216.19012451171875, + "learning_rate": 9.629212885164882e-06, + "loss": 4.6602983474731445, + "step": 154 + }, + { + "epoch": 0.30392156862745096, + "grad_norm": 23.054521560668945, + "learning_rate": 9.623187328574357e-06, + "loss": 4.656505584716797, + "step": 155 + }, + { + "epoch": 0.3058823529411765, + "grad_norm": 75.67474365234375, + "learning_rate": 9.617115122344742e-06, + "loss": 4.795361042022705, + "step": 156 + }, + { + "epoch": 0.307843137254902, + "grad_norm": 10.801730155944824, + "learning_rate": 9.6109963277468e-06, + "loss": 4.377931118011475, + "step": 157 + }, + { + "epoch": 0.30980392156862746, + "grad_norm": 50.18651580810547, + "learning_rate": 9.604831006521393e-06, + "loss": 4.209827423095703, + "step": 158 + }, + { + "epoch": 0.31176470588235294, + "grad_norm": 65.79145812988281, + "learning_rate": 9.598619220878852e-06, + "loss": 4.403324127197266, + "step": 159 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 717.3584594726562, + "learning_rate": 9.592361033498349e-06, + "loss": 4.686285972595215, + "step": 160 + }, + { + "epoch": 0.3156862745098039, + "grad_norm": 6.576393127441406, + "learning_rate": 9.586056507527266e-06, + "loss": 4.493882179260254, + "step": 161 + }, + { + "epoch": 0.3176470588235294, + "grad_norm": 22.952342987060547, + "learning_rate": 9.57970570658056e-06, + "loss": 3.9739434719085693, + "step": 162 + }, + { + "epoch": 0.3196078431372549, + "grad_norm": 19.122220993041992, + "learning_rate": 9.57330869474012e-06, + "loss": 4.901614189147949, + "step": 163 + }, + { + "epoch": 0.3215686274509804, + "grad_norm": 13.962799072265625, + "learning_rate": 9.566865536554119e-06, + "loss": 4.684842109680176, + "step": 164 + }, + { + "epoch": 0.3235294117647059, + "grad_norm": 588.8753051757812, + "learning_rate": 9.560376297036362e-06, + "loss": 4.2213664054870605, + "step": 165 + }, + { + "epoch": 0.3254901960784314, + "grad_norm": 52.16958999633789, + "learning_rate": 9.553841041665632e-06, + "loss": 4.610918045043945, + "step": 166 + }, + { + "epoch": 0.32745098039215687, + "grad_norm": 10.890721321105957, + "learning_rate": 9.54725983638503e-06, + "loss": 4.533082485198975, + "step": 167 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 16.27850914001465, + "learning_rate": 9.540632747601309e-06, + "loss": 4.84617805480957, + "step": 168 + }, + { + "epoch": 0.33137254901960783, + "grad_norm": 23.593048095703125, + "learning_rate": 9.533959842184195e-06, + "loss": 4.829172611236572, + "step": 169 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 12.077108383178711, + "learning_rate": 9.527241187465735e-06, + "loss": 4.293300151824951, + "step": 170 + }, + { + "epoch": 0.3352941176470588, + "grad_norm": 21.818492889404297, + "learning_rate": 9.520476851239588e-06, + "loss": 4.222914695739746, + "step": 171 + }, + { + "epoch": 0.33725490196078434, + "grad_norm": 14.46260929107666, + "learning_rate": 9.513666901760368e-06, + "loss": 4.497089385986328, + "step": 172 + }, + { + "epoch": 0.3392156862745098, + "grad_norm": 15.194242477416992, + "learning_rate": 9.506811407742938e-06, + "loss": 4.559511661529541, + "step": 173 + }, + { + "epoch": 0.3411764705882353, + "grad_norm": 23.59486198425293, + "learning_rate": 9.49991043836172e-06, + "loss": 4.664986610412598, + "step": 174 + }, + { + "epoch": 0.3431372549019608, + "grad_norm": 12.736374855041504, + "learning_rate": 9.49296406325e-06, + "loss": 3.997081756591797, + "step": 175 + }, + { + "epoch": 0.34509803921568627, + "grad_norm": 32.026031494140625, + "learning_rate": 9.485972352499231e-06, + "loss": 3.9222970008850098, + "step": 176 + }, + { + "epoch": 0.34705882352941175, + "grad_norm": 74.40033721923828, + "learning_rate": 9.478935376658308e-06, + "loss": 4.217952728271484, + "step": 177 + }, + { + "epoch": 0.34901960784313724, + "grad_norm": 32.45917892456055, + "learning_rate": 9.471853206732875e-06, + "loss": 4.877760887145996, + "step": 178 + }, + { + "epoch": 0.3509803921568627, + "grad_norm": 133.5160675048828, + "learning_rate": 9.4647259141846e-06, + "loss": 4.599568843841553, + "step": 179 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 110.89315795898438, + "learning_rate": 9.457553570930451e-06, + "loss": 4.784282684326172, + "step": 180 + }, + { + "epoch": 0.35490196078431374, + "grad_norm": 7.933850288391113, + "learning_rate": 9.450336249341976e-06, + "loss": 4.384489059448242, + "step": 181 + }, + { + "epoch": 0.3568627450980392, + "grad_norm": 90.08252716064453, + "learning_rate": 9.443074022244573e-06, + "loss": 4.630293846130371, + "step": 182 + }, + { + "epoch": 0.3588235294117647, + "grad_norm": 203.20277404785156, + "learning_rate": 9.435766962916749e-06, + "loss": 4.612138748168945, + "step": 183 + }, + { + "epoch": 0.3607843137254902, + "grad_norm": 29.04180145263672, + "learning_rate": 9.428415145089385e-06, + "loss": 4.887096405029297, + "step": 184 + }, + { + "epoch": 0.3627450980392157, + "grad_norm": 21.732030868530273, + "learning_rate": 9.421018642944996e-06, + "loss": 4.200204372406006, + "step": 185 + }, + { + "epoch": 0.36470588235294116, + "grad_norm": 88.96598052978516, + "learning_rate": 9.413577531116973e-06, + "loss": 4.376042366027832, + "step": 186 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 97.27217102050781, + "learning_rate": 9.406091884688837e-06, + "loss": 4.695228099822998, + "step": 187 + }, + { + "epoch": 0.3686274509803922, + "grad_norm": 50.880985260009766, + "learning_rate": 9.398561779193477e-06, + "loss": 4.356112003326416, + "step": 188 + }, + { + "epoch": 0.37058823529411766, + "grad_norm": 12.541425704956055, + "learning_rate": 9.390987290612396e-06, + "loss": 4.752440452575684, + "step": 189 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 236.70387268066406, + "learning_rate": 9.38336849537493e-06, + "loss": 4.593203067779541, + "step": 190 + }, + { + "epoch": 0.37450980392156863, + "grad_norm": 2993.429931640625, + "learning_rate": 9.375705470357493e-06, + "loss": 4.44484806060791, + "step": 191 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 150.78634643554688, + "learning_rate": 9.367998292882789e-06, + "loss": 3.959789514541626, + "step": 192 + }, + { + "epoch": 0.3784313725490196, + "grad_norm": 39.33837127685547, + "learning_rate": 9.36024704071904e-06, + "loss": 4.030791759490967, + "step": 193 + }, + { + "epoch": 0.3803921568627451, + "grad_norm": 18.125226974487305, + "learning_rate": 9.35245179207919e-06, + "loss": 4.262718200683594, + "step": 194 + }, + { + "epoch": 0.38235294117647056, + "grad_norm": 70.55377960205078, + "learning_rate": 9.344612625620134e-06, + "loss": 4.606302261352539, + "step": 195 + }, + { + "epoch": 0.3843137254901961, + "grad_norm": 1980.358154296875, + "learning_rate": 9.336729620441906e-06, + "loss": 4.8002824783325195, + "step": 196 + }, + { + "epoch": 0.3862745098039216, + "grad_norm": 31.0122013092041, + "learning_rate": 9.328802856086891e-06, + "loss": 4.136668682098389, + "step": 197 + }, + { + "epoch": 0.38823529411764707, + "grad_norm": 13.777653694152832, + "learning_rate": 9.32083241253902e-06, + "loss": 4.453038692474365, + "step": 198 + }, + { + "epoch": 0.39019607843137255, + "grad_norm": 15.520337104797363, + "learning_rate": 9.312818370222962e-06, + "loss": 4.666173934936523, + "step": 199 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 25.686555862426758, + "learning_rate": 9.304760810003318e-06, + "loss": 4.567206859588623, + "step": 200 + }, + { + "epoch": 0.3941176470588235, + "grad_norm": 34.059505462646484, + "learning_rate": 9.296659813183794e-06, + "loss": 4.661189556121826, + "step": 201 + }, + { + "epoch": 0.396078431372549, + "grad_norm": 20.709781646728516, + "learning_rate": 9.28851546150639e-06, + "loss": 4.140271186828613, + "step": 202 + }, + { + "epoch": 0.3980392156862745, + "grad_norm": 52.71310043334961, + "learning_rate": 9.280327837150572e-06, + "loss": 4.564424514770508, + "step": 203 + }, + { + "epoch": 0.4, + "grad_norm": 16.125553131103516, + "learning_rate": 9.272097022732444e-06, + "loss": 4.352408409118652, + "step": 204 + }, + { + "epoch": 0.4019607843137255, + "grad_norm": 10.148744583129883, + "learning_rate": 9.263823101303911e-06, + "loss": 3.883530616760254, + "step": 205 + }, + { + "epoch": 0.403921568627451, + "grad_norm": 22.48365020751953, + "learning_rate": 9.255506156351846e-06, + "loss": 4.559526443481445, + "step": 206 + }, + { + "epoch": 0.40588235294117647, + "grad_norm": 1699.036865234375, + "learning_rate": 9.247146271797244e-06, + "loss": 4.905045509338379, + "step": 207 + }, + { + "epoch": 0.40784313725490196, + "grad_norm": 7.313532829284668, + "learning_rate": 9.238743531994378e-06, + "loss": 3.642618417739868, + "step": 208 + }, + { + "epoch": 0.40980392156862744, + "grad_norm": 35.672157287597656, + "learning_rate": 9.23029802172994e-06, + "loss": 4.230594635009766, + "step": 209 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 202.3529815673828, + "learning_rate": 9.221809826222198e-06, + "loss": 4.2360124588012695, + "step": 210 + }, + { + "epoch": 0.4137254901960784, + "grad_norm": 117.8387222290039, + "learning_rate": 9.213279031120129e-06, + "loss": 4.491461277008057, + "step": 211 + }, + { + "epoch": 0.41568627450980394, + "grad_norm": 34.6633186340332, + "learning_rate": 9.20470572250255e-06, + "loss": 4.295816898345947, + "step": 212 + }, + { + "epoch": 0.4176470588235294, + "grad_norm": 8.826363563537598, + "learning_rate": 9.196089986877262e-06, + "loss": 4.463611602783203, + "step": 213 + }, + { + "epoch": 0.4196078431372549, + "grad_norm": 26.18223762512207, + "learning_rate": 9.18743191118016e-06, + "loss": 4.4809675216674805, + "step": 214 + }, + { + "epoch": 0.4215686274509804, + "grad_norm": 23.7016544342041, + "learning_rate": 9.17873158277438e-06, + "loss": 4.53269100189209, + "step": 215 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 147.547119140625, + "learning_rate": 9.16998908944939e-06, + "loss": 4.548085689544678, + "step": 216 + }, + { + "epoch": 0.42549019607843136, + "grad_norm": 33.69279098510742, + "learning_rate": 9.161204519420126e-06, + "loss": 4.510319709777832, + "step": 217 + }, + { + "epoch": 0.42745098039215684, + "grad_norm": 21.794368743896484, + "learning_rate": 9.152377961326085e-06, + "loss": 3.977487564086914, + "step": 218 + }, + { + "epoch": 0.4294117647058823, + "grad_norm": 17.71551513671875, + "learning_rate": 9.14350950423045e-06, + "loss": 4.281040191650391, + "step": 219 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 91.0526123046875, + "learning_rate": 9.134599237619167e-06, + "loss": 4.954435348510742, + "step": 220 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 39.16487121582031, + "learning_rate": 9.125647251400068e-06, + "loss": 4.5113935470581055, + "step": 221 + }, + { + "epoch": 0.43529411764705883, + "grad_norm": 304.0317687988281, + "learning_rate": 9.11665363590194e-06, + "loss": 4.286795139312744, + "step": 222 + }, + { + "epoch": 0.4372549019607843, + "grad_norm": 64.62968444824219, + "learning_rate": 9.107618481873632e-06, + "loss": 4.302193641662598, + "step": 223 + }, + { + "epoch": 0.4392156862745098, + "grad_norm": 9.797171592712402, + "learning_rate": 9.098541880483129e-06, + "loss": 4.519267559051514, + "step": 224 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 16.81446647644043, + "learning_rate": 9.089423923316636e-06, + "loss": 4.170806884765625, + "step": 225 + }, + { + "epoch": 0.44313725490196076, + "grad_norm": 97.48379516601562, + "learning_rate": 9.08026470237765e-06, + "loss": 4.171516418457031, + "step": 226 + }, + { + "epoch": 0.44509803921568625, + "grad_norm": 8.226983070373535, + "learning_rate": 9.07106431008604e-06, + "loss": 4.536833763122559, + "step": 227 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 17.50508689880371, + "learning_rate": 9.0618228392771e-06, + "loss": 4.602504730224609, + "step": 228 + }, + { + "epoch": 0.44901960784313727, + "grad_norm": 1409.4854736328125, + "learning_rate": 9.052540383200634e-06, + "loss": 4.213375091552734, + "step": 229 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 18.96872901916504, + "learning_rate": 9.043217035519986e-06, + "loss": 4.0827131271362305, + "step": 230 + }, + { + "epoch": 0.45294117647058824, + "grad_norm": 16.178728103637695, + "learning_rate": 9.033852890311127e-06, + "loss": 4.6328125, + "step": 231 + }, + { + "epoch": 0.4549019607843137, + "grad_norm": 10.366260528564453, + "learning_rate": 9.02444804206168e-06, + "loss": 4.5647430419921875, + "step": 232 + }, + { + "epoch": 0.4568627450980392, + "grad_norm": 17.84885597229004, + "learning_rate": 9.01500258566998e-06, + "loss": 4.389437198638916, + "step": 233 + }, + { + "epoch": 0.4588235294117647, + "grad_norm": 11.878860473632812, + "learning_rate": 9.005516616444112e-06, + "loss": 4.770614147186279, + "step": 234 + }, + { + "epoch": 0.46078431372549017, + "grad_norm": 85.99885559082031, + "learning_rate": 8.99599023010095e-06, + "loss": 4.427285194396973, + "step": 235 + }, + { + "epoch": 0.4627450980392157, + "grad_norm": 77.5919189453125, + "learning_rate": 8.986423522765191e-06, + "loss": 4.447712421417236, + "step": 236 + }, + { + "epoch": 0.4647058823529412, + "grad_norm": 19.800437927246094, + "learning_rate": 8.976816590968388e-06, + "loss": 4.388566017150879, + "step": 237 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 29.633106231689453, + "learning_rate": 8.967169531647971e-06, + "loss": 4.662332534790039, + "step": 238 + }, + { + "epoch": 0.46862745098039216, + "grad_norm": 11.136107444763184, + "learning_rate": 8.957482442146271e-06, + "loss": 4.721919059753418, + "step": 239 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 23.172388076782227, + "learning_rate": 8.947755420209541e-06, + "loss": 4.55937385559082, + "step": 240 + }, + { + "epoch": 0.4725490196078431, + "grad_norm": 22.51704216003418, + "learning_rate": 8.937988563986963e-06, + "loss": 4.346927642822266, + "step": 241 + }, + { + "epoch": 0.4745098039215686, + "grad_norm": 9.842615127563477, + "learning_rate": 8.928181972029664e-06, + "loss": 4.690535545349121, + "step": 242 + }, + { + "epoch": 0.4764705882352941, + "grad_norm": 158.396484375, + "learning_rate": 8.918335743289717e-06, + "loss": 4.770160675048828, + "step": 243 + }, + { + "epoch": 0.47843137254901963, + "grad_norm": 18.87772560119629, + "learning_rate": 8.90844997711915e-06, + "loss": 4.694735527038574, + "step": 244 + }, + { + "epoch": 0.4803921568627451, + "grad_norm": 10.401981353759766, + "learning_rate": 8.898524773268926e-06, + "loss": 4.433718681335449, + "step": 245 + }, + { + "epoch": 0.4823529411764706, + "grad_norm": 35.996971130371094, + "learning_rate": 8.888560231887963e-06, + "loss": 4.435983180999756, + "step": 246 + }, + { + "epoch": 0.4843137254901961, + "grad_norm": 144.66212463378906, + "learning_rate": 8.8785564535221e-06, + "loss": 4.828408241271973, + "step": 247 + }, + { + "epoch": 0.48627450980392156, + "grad_norm": 61.79336166381836, + "learning_rate": 8.868513539113093e-06, + "loss": 4.478762149810791, + "step": 248 + }, + { + "epoch": 0.48823529411764705, + "grad_norm": 111.1070556640625, + "learning_rate": 8.858431589997597e-06, + "loss": 4.791953086853027, + "step": 249 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 43.53703308105469, + "learning_rate": 8.848310707906138e-06, + "loss": 4.221644401550293, + "step": 250 + }, + { + "epoch": 0.492156862745098, + "grad_norm": 30.130136489868164, + "learning_rate": 8.838150994962094e-06, + "loss": 3.9666197299957275, + "step": 251 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 24.61577033996582, + "learning_rate": 8.827952553680656e-06, + "loss": 4.494099140167236, + "step": 252 + }, + { + "epoch": 0.49607843137254903, + "grad_norm": 95.78424835205078, + "learning_rate": 8.817715486967803e-06, + "loss": 4.37385368347168, + "step": 253 + }, + { + "epoch": 0.4980392156862745, + "grad_norm": 12.204360008239746, + "learning_rate": 8.807439898119252e-06, + "loss": 4.433926582336426, + "step": 254 + }, + { + "epoch": 0.5, + "grad_norm": 14.268149375915527, + "learning_rate": 8.797125890819429e-06, + "loss": 4.530971527099609, + "step": 255 + }, + { + "epoch": 0.5019607843137255, + "grad_norm": 17.889156341552734, + "learning_rate": 8.786773569140414e-06, + "loss": 4.126347541809082, + "step": 256 + }, + { + "epoch": 0.503921568627451, + "grad_norm": 18.556318283081055, + "learning_rate": 8.776383037540888e-06, + "loss": 4.337622165679932, + "step": 257 + }, + { + "epoch": 0.5058823529411764, + "grad_norm": 679.375244140625, + "learning_rate": 8.765954400865093e-06, + "loss": 4.433990478515625, + "step": 258 + }, + { + "epoch": 0.5078431372549019, + "grad_norm": 42.50425338745117, + "learning_rate": 8.755487764341756e-06, + "loss": 4.115379810333252, + "step": 259 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 53.077613830566406, + "learning_rate": 8.744983233583044e-06, + "loss": 4.244760036468506, + "step": 260 + }, + { + "epoch": 0.5117647058823529, + "grad_norm": 17.79299545288086, + "learning_rate": 8.734440914583486e-06, + "loss": 4.051713943481445, + "step": 261 + }, + { + "epoch": 0.5137254901960784, + "grad_norm": 15.688224792480469, + "learning_rate": 8.72386091371891e-06, + "loss": 4.603087425231934, + "step": 262 + }, + { + "epoch": 0.515686274509804, + "grad_norm": 55.612709045410156, + "learning_rate": 8.713243337745366e-06, + "loss": 4.455329418182373, + "step": 263 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 18.093618392944336, + "learning_rate": 8.70258829379805e-06, + "loss": 4.481441497802734, + "step": 264 + }, + { + "epoch": 0.5196078431372549, + "grad_norm": 290.83355712890625, + "learning_rate": 8.691895889390228e-06, + "loss": 4.285877227783203, + "step": 265 + }, + { + "epoch": 0.5215686274509804, + "grad_norm": 17.16389274597168, + "learning_rate": 8.681166232412142e-06, + "loss": 4.445030212402344, + "step": 266 + }, + { + "epoch": 0.5235294117647059, + "grad_norm": 26.885225296020508, + "learning_rate": 8.670399431129926e-06, + "loss": 4.563932418823242, + "step": 267 + }, + { + "epoch": 0.5254901960784314, + "grad_norm": 42.71232604980469, + "learning_rate": 8.659595594184516e-06, + "loss": 3.8843421936035156, + "step": 268 + }, + { + "epoch": 0.5274509803921569, + "grad_norm": 28.694578170776367, + "learning_rate": 8.648754830590552e-06, + "loss": 3.6372265815734863, + "step": 269 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 43.32936477661133, + "learning_rate": 8.637877249735274e-06, + "loss": 4.222830295562744, + "step": 270 + }, + { + "epoch": 0.5313725490196078, + "grad_norm": 72.35948944091797, + "learning_rate": 8.626962961377423e-06, + "loss": 4.611291408538818, + "step": 271 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 15.251282691955566, + "learning_rate": 8.616012075646134e-06, + "loss": 4.276963233947754, + "step": 272 + }, + { + "epoch": 0.5352941176470588, + "grad_norm": 776.6341552734375, + "learning_rate": 8.605024703039817e-06, + "loss": 4.6572585105896, + "step": 273 + }, + { + "epoch": 0.5372549019607843, + "grad_norm": 22.79600715637207, + "learning_rate": 8.594000954425056e-06, + "loss": 4.917038917541504, + "step": 274 + }, + { + "epoch": 0.5392156862745098, + "grad_norm": 10.690701484680176, + "learning_rate": 8.582940941035476e-06, + "loss": 4.69964599609375, + "step": 275 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 86.31309509277344, + "learning_rate": 8.571844774470627e-06, + "loss": 4.586027145385742, + "step": 276 + }, + { + "epoch": 0.5431372549019607, + "grad_norm": 17.10081672668457, + "learning_rate": 8.560712566694863e-06, + "loss": 4.531658172607422, + "step": 277 + }, + { + "epoch": 0.5450980392156862, + "grad_norm": 15.138278007507324, + "learning_rate": 8.549544430036198e-06, + "loss": 4.515611171722412, + "step": 278 + }, + { + "epoch": 0.5470588235294118, + "grad_norm": 11.231842994689941, + "learning_rate": 8.538340477185191e-06, + "loss": 4.401930809020996, + "step": 279 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 9.649141311645508, + "learning_rate": 8.527100821193797e-06, + "loss": 4.2050909996032715, + "step": 280 + }, + { + "epoch": 0.5509803921568628, + "grad_norm": 44.636756896972656, + "learning_rate": 8.51582557547422e-06, + "loss": 4.522353649139404, + "step": 281 + }, + { + "epoch": 0.5529411764705883, + "grad_norm": 10.403969764709473, + "learning_rate": 8.504514853797789e-06, + "loss": 4.316591262817383, + "step": 282 + }, + { + "epoch": 0.5549019607843138, + "grad_norm": 45.07345199584961, + "learning_rate": 8.493168770293793e-06, + "loss": 4.220366477966309, + "step": 283 + }, + { + "epoch": 0.5568627450980392, + "grad_norm": 81.64495849609375, + "learning_rate": 8.481787439448332e-06, + "loss": 4.375057220458984, + "step": 284 + }, + { + "epoch": 0.5588235294117647, + "grad_norm": 22.120563507080078, + "learning_rate": 8.470370976103171e-06, + "loss": 4.166134834289551, + "step": 285 + }, + { + "epoch": 0.5607843137254902, + "grad_norm": 10.833823204040527, + "learning_rate": 8.458919495454567e-06, + "loss": 4.409770965576172, + "step": 286 + }, + { + "epoch": 0.5627450980392157, + "grad_norm": 81.50211334228516, + "learning_rate": 8.447433113052124e-06, + "loss": 4.4287190437316895, + "step": 287 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 16.984268188476562, + "learning_rate": 8.435911944797605e-06, + "loss": 4.146420955657959, + "step": 288 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 85.84491729736328, + "learning_rate": 8.42435610694379e-06, + "loss": 4.802792549133301, + "step": 289 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 1234.199951171875, + "learning_rate": 8.412765716093273e-06, + "loss": 4.3419270515441895, + "step": 290 + }, + { + "epoch": 0.5705882352941176, + "grad_norm": 12.962360382080078, + "learning_rate": 8.401140889197305e-06, + "loss": 4.3528547286987305, + "step": 291 + }, + { + "epoch": 0.5725490196078431, + "grad_norm": 26.18000030517578, + "learning_rate": 8.38948174355462e-06, + "loss": 4.065543174743652, + "step": 292 + }, + { + "epoch": 0.5745098039215686, + "grad_norm": 35.139549255371094, + "learning_rate": 8.377788396810223e-06, + "loss": 4.251129150390625, + "step": 293 + }, + { + "epoch": 0.5764705882352941, + "grad_norm": 37.94339370727539, + "learning_rate": 8.366060966954235e-06, + "loss": 4.624574661254883, + "step": 294 + }, + { + "epoch": 0.5784313725490197, + "grad_norm": 11.896288871765137, + "learning_rate": 8.354299572320679e-06, + "loss": 4.315122604370117, + "step": 295 + }, + { + "epoch": 0.5803921568627451, + "grad_norm": 41.16322708129883, + "learning_rate": 8.342504331586298e-06, + "loss": 4.489446640014648, + "step": 296 + }, + { + "epoch": 0.5823529411764706, + "grad_norm": 28.843425750732422, + "learning_rate": 8.330675363769356e-06, + "loss": 4.456976890563965, + "step": 297 + }, + { + "epoch": 0.5843137254901961, + "grad_norm": 14.19128704071045, + "learning_rate": 8.318812788228434e-06, + "loss": 4.4964447021484375, + "step": 298 + }, + { + "epoch": 0.5862745098039216, + "grad_norm": 41.6136474609375, + "learning_rate": 8.306916724661225e-06, + "loss": 4.119976043701172, + "step": 299 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 20.12372398376465, + "learning_rate": 8.294987293103334e-06, + "loss": 4.503427505493164, + "step": 300 + }, + { + "epoch": 0.5901960784313726, + "grad_norm": 96.8432846069336, + "learning_rate": 8.283024613927055e-06, + "loss": 4.409475326538086, + "step": 301 + }, + { + "epoch": 0.592156862745098, + "grad_norm": 24.83096694946289, + "learning_rate": 8.271028807840164e-06, + "loss": 4.263705730438232, + "step": 302 + }, + { + "epoch": 0.5941176470588235, + "grad_norm": 92.04113006591797, + "learning_rate": 8.258999995884706e-06, + "loss": 4.188453674316406, + "step": 303 + }, + { + "epoch": 0.596078431372549, + "grad_norm": 32.55808639526367, + "learning_rate": 8.246938299435759e-06, + "loss": 4.0705437660217285, + "step": 304 + }, + { + "epoch": 0.5980392156862745, + "grad_norm": 28.917774200439453, + "learning_rate": 8.234843840200218e-06, + "loss": 4.273771286010742, + "step": 305 + }, + { + "epoch": 0.6, + "grad_norm": 14.942693710327148, + "learning_rate": 8.222716740215573e-06, + "loss": 4.468536376953125, + "step": 306 + }, + { + "epoch": 0.6019607843137255, + "grad_norm": 76.9916763305664, + "learning_rate": 8.210557121848664e-06, + "loss": 4.393499851226807, + "step": 307 + }, + { + "epoch": 0.6039215686274509, + "grad_norm": 87.08967590332031, + "learning_rate": 8.198365107794457e-06, + "loss": 4.2165422439575195, + "step": 308 + }, + { + "epoch": 0.6058823529411764, + "grad_norm": 76.53693389892578, + "learning_rate": 8.186140821074801e-06, + "loss": 4.373100757598877, + "step": 309 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 13.614083290100098, + "learning_rate": 8.173884385037193e-06, + "loss": 4.481573581695557, + "step": 310 + }, + { + "epoch": 0.6098039215686275, + "grad_norm": 39.6025505065918, + "learning_rate": 8.161595923353516e-06, + "loss": 4.316531181335449, + "step": 311 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 41.38591003417969, + "learning_rate": 8.149275560018816e-06, + "loss": 4.617020130157471, + "step": 312 + }, + { + "epoch": 0.6137254901960785, + "grad_norm": 156.15367126464844, + "learning_rate": 8.136923419350032e-06, + "loss": 4.4222869873046875, + "step": 313 + }, + { + "epoch": 0.615686274509804, + "grad_norm": 32.320045471191406, + "learning_rate": 8.12453962598475e-06, + "loss": 4.656857967376709, + "step": 314 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 17.588327407836914, + "learning_rate": 8.112124304879938e-06, + "loss": 4.441835403442383, + "step": 315 + }, + { + "epoch": 0.6196078431372549, + "grad_norm": 30.43013572692871, + "learning_rate": 8.0996775813107e-06, + "loss": 4.392027854919434, + "step": 316 + }, + { + "epoch": 0.6215686274509804, + "grad_norm": 21.386093139648438, + "learning_rate": 8.087199580868997e-06, + "loss": 4.848608016967773, + "step": 317 + }, + { + "epoch": 0.6235294117647059, + "grad_norm": 81.71918487548828, + "learning_rate": 8.07469042946238e-06, + "loss": 4.443190097808838, + "step": 318 + }, + { + "epoch": 0.6254901960784314, + "grad_norm": 20.856964111328125, + "learning_rate": 8.062150253312735e-06, + "loss": 4.9603166580200195, + "step": 319 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 145.81161499023438, + "learning_rate": 8.04957917895499e-06, + "loss": 4.618847846984863, + "step": 320 + }, + { + "epoch": 0.6294117647058823, + "grad_norm": 14.190864562988281, + "learning_rate": 8.03697733323585e-06, + "loss": 4.593096733093262, + "step": 321 + }, + { + "epoch": 0.6313725490196078, + "grad_norm": 23.47496795654297, + "learning_rate": 8.024344843312517e-06, + "loss": 4.16273307800293, + "step": 322 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 39.30678176879883, + "learning_rate": 8.011681836651401e-06, + "loss": 4.198973655700684, + "step": 323 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 55.29441452026367, + "learning_rate": 7.99898844102684e-06, + "loss": 4.373217582702637, + "step": 324 + }, + { + "epoch": 0.6372549019607843, + "grad_norm": 87.39408111572266, + "learning_rate": 7.986264784519801e-06, + "loss": 4.491955280303955, + "step": 325 + }, + { + "epoch": 0.6392156862745098, + "grad_norm": 71.22431182861328, + "learning_rate": 7.973510995516603e-06, + "loss": 4.367103576660156, + "step": 326 + }, + { + "epoch": 0.6411764705882353, + "grad_norm": 32.032073974609375, + "learning_rate": 7.960727202707605e-06, + "loss": 4.673696041107178, + "step": 327 + }, + { + "epoch": 0.6431372549019608, + "grad_norm": 14.861653327941895, + "learning_rate": 7.947913535085925e-06, + "loss": 4.37457275390625, + "step": 328 + }, + { + "epoch": 0.6450980392156863, + "grad_norm": 43.02954864501953, + "learning_rate": 7.935070121946116e-06, + "loss": 4.4756364822387695, + "step": 329 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 40.1738395690918, + "learning_rate": 7.922197092882882e-06, + "loss": 4.393209457397461, + "step": 330 + }, + { + "epoch": 0.6490196078431373, + "grad_norm": 58.01616668701172, + "learning_rate": 7.909294577789765e-06, + "loss": 4.345884323120117, + "step": 331 + }, + { + "epoch": 0.6509803921568628, + "grad_norm": 14.342921257019043, + "learning_rate": 7.896362706857825e-06, + "loss": 4.297840118408203, + "step": 332 + }, + { + "epoch": 0.6529411764705882, + "grad_norm": 8.965250015258789, + "learning_rate": 7.883401610574338e-06, + "loss": 4.571505546569824, + "step": 333 + }, + { + "epoch": 0.6549019607843137, + "grad_norm": 21.012739181518555, + "learning_rate": 7.870411419721468e-06, + "loss": 4.734814167022705, + "step": 334 + }, + { + "epoch": 0.6568627450980392, + "grad_norm": 30.89315414428711, + "learning_rate": 7.857392265374963e-06, + "loss": 4.410980701446533, + "step": 335 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 9.218079566955566, + "learning_rate": 7.844344278902815e-06, + "loss": 4.341933250427246, + "step": 336 + }, + { + "epoch": 0.6607843137254902, + "grad_norm": 17.145496368408203, + "learning_rate": 7.83126759196395e-06, + "loss": 4.177962779998779, + "step": 337 + }, + { + "epoch": 0.6627450980392157, + "grad_norm": 16.176719665527344, + "learning_rate": 7.818162336506885e-06, + "loss": 4.377812385559082, + "step": 338 + }, + { + "epoch": 0.6647058823529411, + "grad_norm": 23.922420501708984, + "learning_rate": 7.805028644768407e-06, + "loss": 3.9010050296783447, + "step": 339 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 467.1762390136719, + "learning_rate": 7.791866649272236e-06, + "loss": 4.715754508972168, + "step": 340 + }, + { + "epoch": 0.6686274509803921, + "grad_norm": 86.71304321289062, + "learning_rate": 7.778676482827686e-06, + "loss": 4.450630187988281, + "step": 341 + }, + { + "epoch": 0.6705882352941176, + "grad_norm": 208.03579711914062, + "learning_rate": 7.765458278528327e-06, + "loss": 4.261456489562988, + "step": 342 + }, + { + "epoch": 0.6725490196078432, + "grad_norm": 10.478469848632812, + "learning_rate": 7.752212169750642e-06, + "loss": 3.6522653102874756, + "step": 343 + }, + { + "epoch": 0.6745098039215687, + "grad_norm": 109.38081359863281, + "learning_rate": 7.738938290152675e-06, + "loss": 4.505516529083252, + "step": 344 + }, + { + "epoch": 0.6764705882352942, + "grad_norm": 114.52716827392578, + "learning_rate": 7.725636773672694e-06, + "loss": 4.370604038238525, + "step": 345 + }, + { + "epoch": 0.6784313725490196, + "grad_norm": 14.925344467163086, + "learning_rate": 7.712307754527832e-06, + "loss": 4.476314544677734, + "step": 346 + }, + { + "epoch": 0.6803921568627451, + "grad_norm": 53.75554275512695, + "learning_rate": 7.69895136721273e-06, + "loss": 4.616474151611328, + "step": 347 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 42.78981399536133, + "learning_rate": 7.685567746498191e-06, + "loss": 4.426451683044434, + "step": 348 + }, + { + "epoch": 0.6843137254901961, + "grad_norm": 80.6156234741211, + "learning_rate": 7.672157027429803e-06, + "loss": 4.43165397644043, + "step": 349 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 46.6151123046875, + "learning_rate": 7.658719345326595e-06, + "loss": 4.440042495727539, + "step": 350 + }, + { + "epoch": 0.6882352941176471, + "grad_norm": 104.17079162597656, + "learning_rate": 7.645254835779657e-06, + "loss": 4.430392265319824, + "step": 351 + }, + { + "epoch": 0.6901960784313725, + "grad_norm": 502.8309020996094, + "learning_rate": 7.631763634650783e-06, + "loss": 4.152275562286377, + "step": 352 + }, + { + "epoch": 0.692156862745098, + "grad_norm": 23.371397018432617, + "learning_rate": 7.618245878071091e-06, + "loss": 4.074784278869629, + "step": 353 + }, + { + "epoch": 0.6941176470588235, + "grad_norm": 93.83110809326172, + "learning_rate": 7.604701702439652e-06, + "loss": 4.7920379638671875, + "step": 354 + }, + { + "epoch": 0.696078431372549, + "grad_norm": 20.388896942138672, + "learning_rate": 7.591131244422118e-06, + "loss": 4.258466720581055, + "step": 355 + }, + { + "epoch": 0.6980392156862745, + "grad_norm": 25.022539138793945, + "learning_rate": 7.57753464094934e-06, + "loss": 4.348616600036621, + "step": 356 + }, + { + "epoch": 0.7, + "grad_norm": 24.122034072875977, + "learning_rate": 7.563912029215983e-06, + "loss": 4.450387954711914, + "step": 357 + }, + { + "epoch": 0.7019607843137254, + "grad_norm": 47.63944625854492, + "learning_rate": 7.550263546679148e-06, + "loss": 4.950525760650635, + "step": 358 + }, + { + "epoch": 0.703921568627451, + "grad_norm": 18.94533348083496, + "learning_rate": 7.536589331056976e-06, + "loss": 5.012373924255371, + "step": 359 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 96.9211196899414, + "learning_rate": 7.522889520327275e-06, + "loss": 4.925107955932617, + "step": 360 + }, + { + "epoch": 0.707843137254902, + "grad_norm": 42.01844024658203, + "learning_rate": 7.509164252726107e-06, + "loss": 4.5356903076171875, + "step": 361 + }, + { + "epoch": 0.7098039215686275, + "grad_norm": 59.13837814331055, + "learning_rate": 7.495413666746406e-06, + "loss": 4.559690475463867, + "step": 362 + }, + { + "epoch": 0.711764705882353, + "grad_norm": 67.81266784667969, + "learning_rate": 7.481637901136578e-06, + "loss": 4.372148513793945, + "step": 363 + }, + { + "epoch": 0.7137254901960784, + "grad_norm": 35.04251480102539, + "learning_rate": 7.467837094899104e-06, + "loss": 4.26740837097168, + "step": 364 + }, + { + "epoch": 0.7156862745098039, + "grad_norm": 28.242380142211914, + "learning_rate": 7.454011387289127e-06, + "loss": 4.79606819152832, + "step": 365 + }, + { + "epoch": 0.7176470588235294, + "grad_norm": 33.4775505065918, + "learning_rate": 7.440160917813059e-06, + "loss": 4.412802696228027, + "step": 366 + }, + { + "epoch": 0.7196078431372549, + "grad_norm": 58.53046417236328, + "learning_rate": 7.426285826227171e-06, + "loss": 3.844216823577881, + "step": 367 + }, + { + "epoch": 0.7215686274509804, + "grad_norm": 67.4899673461914, + "learning_rate": 7.412386252536168e-06, + "loss": 4.131912708282471, + "step": 368 + }, + { + "epoch": 0.7235294117647059, + "grad_norm": 3803.90185546875, + "learning_rate": 7.398462336991802e-06, + "loss": 4.365766525268555, + "step": 369 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 76.64689636230469, + "learning_rate": 7.384514220091437e-06, + "loss": 4.796448707580566, + "step": 370 + }, + { + "epoch": 0.7274509803921568, + "grad_norm": 101.20423126220703, + "learning_rate": 7.370542042576635e-06, + "loss": 4.397182941436768, + "step": 371 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 73.64826965332031, + "learning_rate": 7.356545945431744e-06, + "loss": 4.440197944641113, + "step": 372 + }, + { + "epoch": 0.7313725490196078, + "grad_norm": 66.81968688964844, + "learning_rate": 7.342526069882465e-06, + "loss": 4.575042724609375, + "step": 373 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 40.752525329589844, + "learning_rate": 7.328482557394435e-06, + "loss": 4.159431457519531, + "step": 374 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 27.12578010559082, + "learning_rate": 7.314415549671795e-06, + "loss": 4.4377336502075195, + "step": 375 + }, + { + "epoch": 0.7372549019607844, + "grad_norm": 231.64231872558594, + "learning_rate": 7.300325188655762e-06, + "loss": 4.438188552856445, + "step": 376 + }, + { + "epoch": 0.7392156862745098, + "grad_norm": 23.71122932434082, + "learning_rate": 7.286211616523193e-06, + "loss": 4.190389633178711, + "step": 377 + }, + { + "epoch": 0.7411764705882353, + "grad_norm": 277.79718017578125, + "learning_rate": 7.27207497568516e-06, + "loss": 3.8417224884033203, + "step": 378 + }, + { + "epoch": 0.7431372549019608, + "grad_norm": 30.528398513793945, + "learning_rate": 7.257915408785499e-06, + "loss": 4.584486961364746, + "step": 379 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 618.6641845703125, + "learning_rate": 7.243733058699386e-06, + "loss": 4.159678936004639, + "step": 380 + }, + { + "epoch": 0.7470588235294118, + "grad_norm": 20.201461791992188, + "learning_rate": 7.229528068531881e-06, + "loss": 4.334630489349365, + "step": 381 + }, + { + "epoch": 0.7490196078431373, + "grad_norm": 69.16433715820312, + "learning_rate": 7.215300581616496e-06, + "loss": 4.4458160400390625, + "step": 382 + }, + { + "epoch": 0.7509803921568627, + "grad_norm": 50.7408332824707, + "learning_rate": 7.201050741513735e-06, + "loss": 4.584663391113281, + "step": 383 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 71.0459213256836, + "learning_rate": 7.186778692009669e-06, + "loss": 4.636325359344482, + "step": 384 + }, + { + "epoch": 0.7549019607843137, + "grad_norm": 38.0345344543457, + "learning_rate": 7.172484577114452e-06, + "loss": 4.060024261474609, + "step": 385 + }, + { + "epoch": 0.7568627450980392, + "grad_norm": 40.320499420166016, + "learning_rate": 7.1581685410609e-06, + "loss": 4.512998580932617, + "step": 386 + }, + { + "epoch": 0.7588235294117647, + "grad_norm": 33.043148040771484, + "learning_rate": 7.1438307283030106e-06, + "loss": 4.692201614379883, + "step": 387 + }, + { + "epoch": 0.7607843137254902, + "grad_norm": 137.872314453125, + "learning_rate": 7.129471283514525e-06, + "loss": 4.415122985839844, + "step": 388 + }, + { + "epoch": 0.7627450980392156, + "grad_norm": 76.47061157226562, + "learning_rate": 7.115090351587455e-06, + "loss": 4.573295593261719, + "step": 389 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 328.7730712890625, + "learning_rate": 7.100688077630628e-06, + "loss": 4.141142845153809, + "step": 390 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 43.609642028808594, + "learning_rate": 7.086264606968215e-06, + "loss": 4.211104393005371, + "step": 391 + }, + { + "epoch": 0.7686274509803922, + "grad_norm": 125.07260131835938, + "learning_rate": 7.071820085138275e-06, + "loss": 4.797672271728516, + "step": 392 + }, + { + "epoch": 0.7705882352941177, + "grad_norm": 61.34317398071289, + "learning_rate": 7.05735465789128e-06, + "loss": 4.348987579345703, + "step": 393 + }, + { + "epoch": 0.7725490196078432, + "grad_norm": 75.8901596069336, + "learning_rate": 7.042868471188642e-06, + "loss": 4.376434803009033, + "step": 394 + }, + { + "epoch": 0.7745098039215687, + "grad_norm": 172.4529571533203, + "learning_rate": 7.028361671201245e-06, + "loss": 4.280189514160156, + "step": 395 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 67.70394897460938, + "learning_rate": 7.013834404307972e-06, + "loss": 4.715417861938477, + "step": 396 + }, + { + "epoch": 0.7784313725490196, + "grad_norm": 17.541534423828125, + "learning_rate": 6.9992868170942205e-06, + "loss": 4.301790237426758, + "step": 397 + }, + { + "epoch": 0.7803921568627451, + "grad_norm": 17.7639217376709, + "learning_rate": 6.9847190563504284e-06, + "loss": 4.332895278930664, + "step": 398 + }, + { + "epoch": 0.7823529411764706, + "grad_norm": 77.33992767333984, + "learning_rate": 6.970131269070591e-06, + "loss": 4.072659492492676, + "step": 399 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 30.242053985595703, + "learning_rate": 6.95552360245078e-06, + "loss": 4.5907301902771, + "step": 400 + }, + { + "epoch": 0.7862745098039216, + "grad_norm": 157.5186767578125, + "learning_rate": 6.940896203887659e-06, + "loss": 4.161381721496582, + "step": 401 + }, + { + "epoch": 0.788235294117647, + "grad_norm": 48.990875244140625, + "learning_rate": 6.926249220976988e-06, + "loss": 4.169566631317139, + "step": 402 + }, + { + "epoch": 0.7901960784313725, + "grad_norm": 33.2501220703125, + "learning_rate": 6.911582801512146e-06, + "loss": 4.37021017074585, + "step": 403 + }, + { + "epoch": 0.792156862745098, + "grad_norm": 29.51424789428711, + "learning_rate": 6.8968970934826296e-06, + "loss": 3.938095808029175, + "step": 404 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 86.2677001953125, + "learning_rate": 6.88219224507257e-06, + "loss": 4.483772277832031, + "step": 405 + }, + { + "epoch": 0.796078431372549, + "grad_norm": 25.195167541503906, + "learning_rate": 6.867468404659222e-06, + "loss": 4.458285331726074, + "step": 406 + }, + { + "epoch": 0.7980392156862746, + "grad_norm": 22.213388442993164, + "learning_rate": 6.852725720811487e-06, + "loss": 3.883963108062744, + "step": 407 + }, + { + "epoch": 0.8, + "grad_norm": 60.982303619384766, + "learning_rate": 6.837964342288399e-06, + "loss": 4.16390323638916, + "step": 408 + }, + { + "epoch": 0.8019607843137255, + "grad_norm": 255.90065002441406, + "learning_rate": 6.823184418037625e-06, + "loss": 4.246565818786621, + "step": 409 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 80.53013610839844, + "learning_rate": 6.808386097193969e-06, + "loss": 3.941505193710327, + "step": 410 + }, + { + "epoch": 0.8058823529411765, + "grad_norm": 70.98387145996094, + "learning_rate": 6.793569529077864e-06, + "loss": 4.301395416259766, + "step": 411 + }, + { + "epoch": 0.807843137254902, + "grad_norm": 117.46578979492188, + "learning_rate": 6.778734863193862e-06, + "loss": 4.3663835525512695, + "step": 412 + }, + { + "epoch": 0.8098039215686275, + "grad_norm": 32.978851318359375, + "learning_rate": 6.76388224922913e-06, + "loss": 4.204647064208984, + "step": 413 + }, + { + "epoch": 0.8117647058823529, + "grad_norm": 21.284744262695312, + "learning_rate": 6.7490118370519356e-06, + "loss": 4.878431797027588, + "step": 414 + }, + { + "epoch": 0.8137254901960784, + "grad_norm": 116.35888671875, + "learning_rate": 6.7341237767101375e-06, + "loss": 4.751389503479004, + "step": 415 + }, + { + "epoch": 0.8156862745098039, + "grad_norm": 58.715450286865234, + "learning_rate": 6.7192182184296725e-06, + "loss": 4.176122665405273, + "step": 416 + }, + { + "epoch": 0.8176470588235294, + "grad_norm": 266.84912109375, + "learning_rate": 6.704295312613037e-06, + "loss": 4.090945243835449, + "step": 417 + }, + { + "epoch": 0.8196078431372549, + "grad_norm": 282.4599609375, + "learning_rate": 6.689355209837769e-06, + "loss": 4.8022003173828125, + "step": 418 + }, + { + "epoch": 0.8215686274509804, + "grad_norm": 17.034099578857422, + "learning_rate": 6.674398060854931e-06, + "loss": 4.382605075836182, + "step": 419 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 45.16923904418945, + "learning_rate": 6.65942401658759e-06, + "loss": 4.5414934158325195, + "step": 420 + }, + { + "epoch": 0.8254901960784313, + "grad_norm": 68.16588592529297, + "learning_rate": 6.644433228129288e-06, + "loss": 4.519162654876709, + "step": 421 + }, + { + "epoch": 0.8274509803921568, + "grad_norm": 30.64187240600586, + "learning_rate": 6.6294258467425256e-06, + "loss": 4.278877258300781, + "step": 422 + }, + { + "epoch": 0.8294117647058824, + "grad_norm": 50.78958511352539, + "learning_rate": 6.614402023857231e-06, + "loss": 4.127124786376953, + "step": 423 + }, + { + "epoch": 0.8313725490196079, + "grad_norm": 16.406599044799805, + "learning_rate": 6.599361911069235e-06, + "loss": 4.240130424499512, + "step": 424 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 8.257003784179688, + "learning_rate": 6.584305660138734e-06, + "loss": 4.398717880249023, + "step": 425 + }, + { + "epoch": 0.8352941176470589, + "grad_norm": 53.47596740722656, + "learning_rate": 6.569233422988771e-06, + "loss": 4.362873554229736, + "step": 426 + }, + { + "epoch": 0.8372549019607843, + "grad_norm": 163.6345672607422, + "learning_rate": 6.554145351703689e-06, + "loss": 4.455537796020508, + "step": 427 + }, + { + "epoch": 0.8392156862745098, + "grad_norm": 222.5769805908203, + "learning_rate": 6.539041598527612e-06, + "loss": 4.446180820465088, + "step": 428 + }, + { + "epoch": 0.8411764705882353, + "grad_norm": 193.03797912597656, + "learning_rate": 6.523922315862887e-06, + "loss": 3.9635980129241943, + "step": 429 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 259.9488830566406, + "learning_rate": 6.508787656268573e-06, + "loss": 4.20033073425293, + "step": 430 + }, + { + "epoch": 0.8450980392156863, + "grad_norm": 79.64652252197266, + "learning_rate": 6.4936377724588794e-06, + "loss": 4.132991790771484, + "step": 431 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 89.49686431884766, + "learning_rate": 6.478472817301635e-06, + "loss": 4.8201904296875, + "step": 432 + }, + { + "epoch": 0.8490196078431372, + "grad_norm": 36.88655090332031, + "learning_rate": 6.463292943816747e-06, + "loss": 4.225468635559082, + "step": 433 + }, + { + "epoch": 0.8509803921568627, + "grad_norm": 167.7744140625, + "learning_rate": 6.448098305174648e-06, + "loss": 4.24064826965332, + "step": 434 + }, + { + "epoch": 0.8529411764705882, + "grad_norm": 320.5234069824219, + "learning_rate": 6.4328890546947645e-06, + "loss": 4.498333930969238, + "step": 435 + }, + { + "epoch": 0.8549019607843137, + "grad_norm": 108.37223815917969, + "learning_rate": 6.417665345843952e-06, + "loss": 3.9274849891662598, + "step": 436 + }, + { + "epoch": 0.8568627450980392, + "grad_norm": 53.389373779296875, + "learning_rate": 6.402427332234965e-06, + "loss": 4.50510835647583, + "step": 437 + }, + { + "epoch": 0.8588235294117647, + "grad_norm": 586.7931518554688, + "learning_rate": 6.387175167624894e-06, + "loss": 4.44918966293335, + "step": 438 + }, + { + "epoch": 0.8607843137254902, + "grad_norm": 64.9321517944336, + "learning_rate": 6.371909005913618e-06, + "loss": 4.453424453735352, + "step": 439 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 21.094820022583008, + "learning_rate": 6.3566290011422515e-06, + "loss": 4.252875328063965, + "step": 440 + }, + { + "epoch": 0.8647058823529412, + "grad_norm": 302.2917785644531, + "learning_rate": 6.341335307491596e-06, + "loss": 3.92726993560791, + "step": 441 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 44.52492141723633, + "learning_rate": 6.32602807928057e-06, + "loss": 4.290216445922852, + "step": 442 + }, + { + "epoch": 0.8686274509803922, + "grad_norm": 48.71710205078125, + "learning_rate": 6.310707470964668e-06, + "loss": 4.29799747467041, + "step": 443 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 224.1940460205078, + "learning_rate": 6.29537363713439e-06, + "loss": 4.234777450561523, + "step": 444 + }, + { + "epoch": 0.8725490196078431, + "grad_norm": 198.24740600585938, + "learning_rate": 6.280026732513689e-06, + "loss": 4.185808181762695, + "step": 445 + }, + { + "epoch": 0.8745098039215686, + "grad_norm": 35.172672271728516, + "learning_rate": 6.264666911958404e-06, + "loss": 4.557499885559082, + "step": 446 + }, + { + "epoch": 0.8764705882352941, + "grad_norm": 63.40365219116211, + "learning_rate": 6.249294330454705e-06, + "loss": 4.098924160003662, + "step": 447 + }, + { + "epoch": 0.8784313725490196, + "grad_norm": 145.4202117919922, + "learning_rate": 6.233909143117521e-06, + "loss": 4.268922805786133, + "step": 448 + }, + { + "epoch": 0.8803921568627451, + "grad_norm": 149.33828735351562, + "learning_rate": 6.21851150518898e-06, + "loss": 4.938076019287109, + "step": 449 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 42.62335205078125, + "learning_rate": 6.203101572036839e-06, + "loss": 4.535097599029541, + "step": 450 + }, + { + "epoch": 0.884313725490196, + "grad_norm": 29.659095764160156, + "learning_rate": 6.18767949915292e-06, + "loss": 4.5162248611450195, + "step": 451 + }, + { + "epoch": 0.8862745098039215, + "grad_norm": 385.17120361328125, + "learning_rate": 6.172245442151541e-06, + "loss": 4.205960273742676, + "step": 452 + }, + { + "epoch": 0.888235294117647, + "grad_norm": 41.33932113647461, + "learning_rate": 6.156799556767941e-06, + "loss": 4.351683139801025, + "step": 453 + }, + { + "epoch": 0.8901960784313725, + "grad_norm": 143.6595458984375, + "learning_rate": 6.141341998856711e-06, + "loss": 4.250962257385254, + "step": 454 + }, + { + "epoch": 0.8921568627450981, + "grad_norm": 37.78490447998047, + "learning_rate": 6.125872924390226e-06, + "loss": 4.384239196777344, + "step": 455 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 515.0045166015625, + "learning_rate": 6.110392489457067e-06, + "loss": 4.019399166107178, + "step": 456 + }, + { + "epoch": 0.8960784313725491, + "grad_norm": 55.70957946777344, + "learning_rate": 6.094900850260439e-06, + "loss": 4.14704704284668, + "step": 457 + }, + { + "epoch": 0.8980392156862745, + "grad_norm": 193.2394256591797, + "learning_rate": 6.079398163116611e-06, + "loss": 4.078997611999512, + "step": 458 + }, + { + "epoch": 0.9, + "grad_norm": 5256.82421875, + "learning_rate": 6.063884584453326e-06, + "loss": 4.191615104675293, + "step": 459 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 100.2900390625, + "learning_rate": 6.048360270808226e-06, + "loss": 4.465028762817383, + "step": 460 + }, + { + "epoch": 0.903921568627451, + "grad_norm": 1331.4735107421875, + "learning_rate": 6.032825378827273e-06, + "loss": 4.066887378692627, + "step": 461 + }, + { + "epoch": 0.9058823529411765, + "grad_norm": 452.3879699707031, + "learning_rate": 6.0172800652631706e-06, + "loss": 4.527779579162598, + "step": 462 + }, + { + "epoch": 0.907843137254902, + "grad_norm": 2017.1990966796875, + "learning_rate": 6.001724486973774e-06, + "loss": 4.606598854064941, + "step": 463 + }, + { + "epoch": 0.9098039215686274, + "grad_norm": 754.1237182617188, + "learning_rate": 5.986158800920523e-06, + "loss": 4.566285133361816, + "step": 464 + }, + { + "epoch": 0.9117647058823529, + "grad_norm": 384.491943359375, + "learning_rate": 5.970583164166838e-06, + "loss": 4.586278915405273, + "step": 465 + }, + { + "epoch": 0.9137254901960784, + "grad_norm": 33.79523468017578, + "learning_rate": 5.954997733876552e-06, + "loss": 3.9631872177124023, + "step": 466 + }, + { + "epoch": 0.9156862745098039, + "grad_norm": 66.47810363769531, + "learning_rate": 5.939402667312316e-06, + "loss": 4.857361793518066, + "step": 467 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 32.95048904418945, + "learning_rate": 5.923798121834016e-06, + "loss": 4.744093418121338, + "step": 468 + }, + { + "epoch": 0.9196078431372549, + "grad_norm": 66.45060729980469, + "learning_rate": 5.908184254897183e-06, + "loss": 4.424873352050781, + "step": 469 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 160.2024688720703, + "learning_rate": 5.892561224051403e-06, + "loss": 4.0311994552612305, + "step": 470 + }, + { + "epoch": 0.9235294117647059, + "grad_norm": 26.441673278808594, + "learning_rate": 5.876929186938734e-06, + "loss": 4.408843040466309, + "step": 471 + }, + { + "epoch": 0.9254901960784314, + "grad_norm": 95.61007690429688, + "learning_rate": 5.861288301292103e-06, + "loss": 4.58468770980835, + "step": 472 + }, + { + "epoch": 0.9274509803921569, + "grad_norm": 101.88788604736328, + "learning_rate": 5.845638724933729e-06, + "loss": 4.604763031005859, + "step": 473 + }, + { + "epoch": 0.9294117647058824, + "grad_norm": 196.6406707763672, + "learning_rate": 5.82998061577352e-06, + "loss": 3.798020839691162, + "step": 474 + }, + { + "epoch": 0.9313725490196079, + "grad_norm": 64.56034088134766, + "learning_rate": 5.814314131807486e-06, + "loss": 4.227663040161133, + "step": 475 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 43.46277618408203, + "learning_rate": 5.798639431116135e-06, + "loss": 4.6275858879089355, + "step": 476 + }, + { + "epoch": 0.9352941176470588, + "grad_norm": 624.3564453125, + "learning_rate": 5.782956671862895e-06, + "loss": 4.316646575927734, + "step": 477 + }, + { + "epoch": 0.9372549019607843, + "grad_norm": 105.81096649169922, + "learning_rate": 5.767266012292496e-06, + "loss": 4.3191094398498535, + "step": 478 + }, + { + "epoch": 0.9392156862745098, + "grad_norm": 11.598875045776367, + "learning_rate": 5.751567610729398e-06, + "loss": 4.125179767608643, + "step": 479 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 80.84516143798828, + "learning_rate": 5.735861625576167e-06, + "loss": 4.346436977386475, + "step": 480 + }, + { + "epoch": 0.9431372549019608, + "grad_norm": 74.3943862915039, + "learning_rate": 5.720148215311902e-06, + "loss": 4.7361907958984375, + "step": 481 + }, + { + "epoch": 0.9450980392156862, + "grad_norm": 28.853347778320312, + "learning_rate": 5.7044275384906164e-06, + "loss": 4.320212364196777, + "step": 482 + }, + { + "epoch": 0.9470588235294117, + "grad_norm": 12.190153121948242, + "learning_rate": 5.688699753739649e-06, + "loss": 4.433542251586914, + "step": 483 + }, + { + "epoch": 0.9490196078431372, + "grad_norm": 14.953727722167969, + "learning_rate": 5.672965019758061e-06, + "loss": 4.154573440551758, + "step": 484 + }, + { + "epoch": 0.9509803921568627, + "grad_norm": 45.093048095703125, + "learning_rate": 5.657223495315031e-06, + "loss": 4.646431922912598, + "step": 485 + }, + { + "epoch": 0.9529411764705882, + "grad_norm": 67.42533111572266, + "learning_rate": 5.641475339248257e-06, + "loss": 4.0797953605651855, + "step": 486 + }, + { + "epoch": 0.9549019607843138, + "grad_norm": 31.419174194335938, + "learning_rate": 5.625720710462352e-06, + "loss": 4.35106086730957, + "step": 487 + }, + { + "epoch": 0.9568627450980393, + "grad_norm": 72.13565063476562, + "learning_rate": 5.609959767927247e-06, + "loss": 4.7546563148498535, + "step": 488 + }, + { + "epoch": 0.9588235294117647, + "grad_norm": 1067.875, + "learning_rate": 5.594192670676568e-06, + "loss": 4.418883323669434, + "step": 489 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 183.54017639160156, + "learning_rate": 5.578419577806058e-06, + "loss": 4.376974105834961, + "step": 490 + }, + { + "epoch": 0.9627450980392157, + "grad_norm": 95.38218688964844, + "learning_rate": 5.562640648471951e-06, + "loss": 4.362179756164551, + "step": 491 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 71.57412719726562, + "learning_rate": 5.546856041889374e-06, + "loss": 4.819134712219238, + "step": 492 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 16.976749420166016, + "learning_rate": 5.531065917330737e-06, + "loss": 4.120271682739258, + "step": 493 + }, + { + "epoch": 0.9686274509803922, + "grad_norm": 71.98576354980469, + "learning_rate": 5.515270434124136e-06, + "loss": 4.1797919273376465, + "step": 494 + }, + { + "epoch": 0.9705882352941176, + "grad_norm": 4.768789768218994, + "learning_rate": 5.499469751651728e-06, + "loss": 4.024587631225586, + "step": 495 + }, + { + "epoch": 0.9725490196078431, + "grad_norm": 408.6588134765625, + "learning_rate": 5.483664029348141e-06, + "loss": 4.756344795227051, + "step": 496 + }, + { + "epoch": 0.9745098039215686, + "grad_norm": 10.495368957519531, + "learning_rate": 5.467853426698852e-06, + "loss": 4.134525299072266, + "step": 497 + }, + { + "epoch": 0.9764705882352941, + "grad_norm": 183.8063201904297, + "learning_rate": 5.452038103238582e-06, + "loss": 4.6963725090026855, + "step": 498 + }, + { + "epoch": 0.9784313725490196, + "grad_norm": 83.09385681152344, + "learning_rate": 5.43621821854969e-06, + "loss": 4.915416717529297, + "step": 499 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 33.74040985107422, + "learning_rate": 5.420393932260557e-06, + "loss": 4.430484294891357, + "step": 500 + }, + { + "epoch": 0.9823529411764705, + "grad_norm": 41.18917465209961, + "learning_rate": 5.404565404043977e-06, + "loss": 4.3449602127075195, + "step": 501 + }, + { + "epoch": 0.984313725490196, + "grad_norm": 243.4974822998047, + "learning_rate": 5.388732793615551e-06, + "loss": 4.609969139099121, + "step": 502 + }, + { + "epoch": 0.9862745098039216, + "grad_norm": 24.437667846679688, + "learning_rate": 5.372896260732065e-06, + "loss": 4.368291854858398, + "step": 503 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 36.5069694519043, + "learning_rate": 5.357055965189888e-06, + "loss": 4.355335712432861, + "step": 504 + }, + { + "epoch": 0.9901960784313726, + "grad_norm": 156.41387939453125, + "learning_rate": 5.341212066823356e-06, + "loss": 4.198671340942383, + "step": 505 + }, + { + "epoch": 0.9921568627450981, + "grad_norm": 32.02665710449219, + "learning_rate": 5.325364725503155e-06, + "loss": 5.039112567901611, + "step": 506 + }, + { + "epoch": 0.9941176470588236, + "grad_norm": 28.728458404541016, + "learning_rate": 5.3095141011347155e-06, + "loss": 4.022368907928467, + "step": 507 + }, + { + "epoch": 0.996078431372549, + "grad_norm": 82.8222427368164, + "learning_rate": 5.2936603536565915e-06, + "loss": 4.288028717041016, + "step": 508 + }, + { + "epoch": 0.9980392156862745, + "grad_norm": 46.44649124145508, + "learning_rate": 5.277803643038855e-06, + "loss": 4.239832401275635, + "step": 509 + }, + { + "epoch": 1.0, + "grad_norm": 85.16124725341797, + "learning_rate": 5.261944129281474e-06, + "loss": 4.053654193878174, + "step": 510 + }, + { + "epoch": 1.0019607843137255, + "grad_norm": 33.269752502441406, + "learning_rate": 5.246081972412702e-06, + "loss": 4.511114120483398, + "step": 511 + }, + { + "epoch": 1.003921568627451, + "grad_norm": 12.197842597961426, + "learning_rate": 5.230217332487462e-06, + "loss": 4.179715633392334, + "step": 512 + }, + { + "epoch": 1.0058823529411764, + "grad_norm": 463.8089599609375, + "learning_rate": 5.214350369585731e-06, + "loss": 3.9151241779327393, + "step": 513 + }, + { + "epoch": 1.007843137254902, + "grad_norm": 41.52510070800781, + "learning_rate": 5.1984812438109274e-06, + "loss": 4.9370880126953125, + "step": 514 + }, + { + "epoch": 1.0098039215686274, + "grad_norm": 85.5319595336914, + "learning_rate": 5.182610115288296e-06, + "loss": 4.470064163208008, + "step": 515 + }, + { + "epoch": 1.011764705882353, + "grad_norm": 16.760358810424805, + "learning_rate": 5.166737144163283e-06, + "loss": 4.650325298309326, + "step": 516 + }, + { + "epoch": 1.0137254901960784, + "grad_norm": 1324.2694091796875, + "learning_rate": 5.150862490599934e-06, + "loss": 4.390547752380371, + "step": 517 + }, + { + "epoch": 1.0156862745098039, + "grad_norm": 31.39581871032715, + "learning_rate": 5.134986314779269e-06, + "loss": 4.533949851989746, + "step": 518 + }, + { + "epoch": 1.0176470588235293, + "grad_norm": 25.530866622924805, + "learning_rate": 5.119108776897665e-06, + "loss": 4.503119468688965, + "step": 519 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 44.92827224731445, + "learning_rate": 5.103230037165248e-06, + "loss": 4.454017639160156, + "step": 520 + }, + { + "epoch": 1.0215686274509803, + "grad_norm": 30.468584060668945, + "learning_rate": 5.0873502558042665e-06, + "loss": 4.6700439453125, + "step": 521 + }, + { + "epoch": 1.0235294117647058, + "grad_norm": 10.387107849121094, + "learning_rate": 5.071469593047482e-06, + "loss": 4.546551704406738, + "step": 522 + }, + { + "epoch": 1.0254901960784313, + "grad_norm": 169.88038635253906, + "learning_rate": 5.055588209136548e-06, + "loss": 4.674521446228027, + "step": 523 + }, + { + "epoch": 1.0274509803921568, + "grad_norm": 127.17510223388672, + "learning_rate": 5.0397062643204e-06, + "loss": 4.426300048828125, + "step": 524 + }, + { + "epoch": 1.0294117647058822, + "grad_norm": 15.707987785339355, + "learning_rate": 5.023823918853622e-06, + "loss": 4.438831806182861, + "step": 525 + }, + { + "epoch": 1.0313725490196077, + "grad_norm": 46.319480895996094, + "learning_rate": 5.0079413329948524e-06, + "loss": 4.169025897979736, + "step": 526 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 21.5632266998291, + "learning_rate": 4.992058667005149e-06, + "loss": 4.026384353637695, + "step": 527 + }, + { + "epoch": 1.035294117647059, + "grad_norm": 12.650291442871094, + "learning_rate": 4.976176081146379e-06, + "loss": 4.545536041259766, + "step": 528 + }, + { + "epoch": 1.0372549019607844, + "grad_norm": 36.72844314575195, + "learning_rate": 4.960293735679601e-06, + "loss": 4.455844879150391, + "step": 529 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 660.7545166015625, + "learning_rate": 4.944411790863453e-06, + "loss": 4.13068962097168, + "step": 530 + }, + { + "epoch": 1.0411764705882354, + "grad_norm": 148.22833251953125, + "learning_rate": 4.928530406952521e-06, + "loss": 3.958733081817627, + "step": 531 + }, + { + "epoch": 1.0431372549019609, + "grad_norm": 13.015345573425293, + "learning_rate": 4.912649744195735e-06, + "loss": 4.207390308380127, + "step": 532 + }, + { + "epoch": 1.0450980392156863, + "grad_norm": 29.108354568481445, + "learning_rate": 4.896769962834754e-06, + "loss": 4.251070976257324, + "step": 533 + }, + { + "epoch": 1.0470588235294118, + "grad_norm": 114.75436401367188, + "learning_rate": 4.880891223102337e-06, + "loss": 4.03155517578125, + "step": 534 + }, + { + "epoch": 1.0490196078431373, + "grad_norm": 41.20862579345703, + "learning_rate": 4.865013685220733e-06, + "loss": 4.275918483734131, + "step": 535 + }, + { + "epoch": 1.0509803921568628, + "grad_norm": 22.689228057861328, + "learning_rate": 4.8491375094000675e-06, + "loss": 4.5153985023498535, + "step": 536 + }, + { + "epoch": 1.0529411764705883, + "grad_norm": 199.50894165039062, + "learning_rate": 4.83326285583672e-06, + "loss": 4.557675361633301, + "step": 537 + }, + { + "epoch": 1.0549019607843138, + "grad_norm": 115.482666015625, + "learning_rate": 4.817389884711706e-06, + "loss": 4.072908878326416, + "step": 538 + }, + { + "epoch": 1.0568627450980392, + "grad_norm": 479.1891174316406, + "learning_rate": 4.801518756189074e-06, + "loss": 4.4816670417785645, + "step": 539 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 8993.1259765625, + "learning_rate": 4.785649630414272e-06, + "loss": 4.955939292907715, + "step": 540 + }, + { + "epoch": 1.0607843137254902, + "grad_norm": 13.87210750579834, + "learning_rate": 4.76978266751254e-06, + "loss": 4.211750507354736, + "step": 541 + }, + { + "epoch": 1.0627450980392157, + "grad_norm": 274.2032775878906, + "learning_rate": 4.7539180275873e-06, + "loss": 4.1780195236206055, + "step": 542 + }, + { + "epoch": 1.0647058823529412, + "grad_norm": 58.98638916015625, + "learning_rate": 4.7380558707185285e-06, + "loss": 4.577972412109375, + "step": 543 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 717.9498291015625, + "learning_rate": 4.7221963569611454e-06, + "loss": 4.485439300537109, + "step": 544 + }, + { + "epoch": 1.0686274509803921, + "grad_norm": 48.960872650146484, + "learning_rate": 4.70633964634341e-06, + "loss": 4.3080573081970215, + "step": 545 + }, + { + "epoch": 1.0705882352941176, + "grad_norm": 72.79077911376953, + "learning_rate": 4.690485898865288e-06, + "loss": 4.191202163696289, + "step": 546 + }, + { + "epoch": 1.072549019607843, + "grad_norm": 17.700952529907227, + "learning_rate": 4.6746352744968474e-06, + "loss": 4.54551887512207, + "step": 547 + }, + { + "epoch": 1.0745098039215686, + "grad_norm": 74.11648559570312, + "learning_rate": 4.6587879331766465e-06, + "loss": 4.341633319854736, + "step": 548 + }, + { + "epoch": 1.076470588235294, + "grad_norm": 71.6302719116211, + "learning_rate": 4.642944034810113e-06, + "loss": 4.656595230102539, + "step": 549 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 18.58173942565918, + "learning_rate": 4.627103739267935e-06, + "loss": 4.4985198974609375, + "step": 550 + }, + { + "epoch": 1.080392156862745, + "grad_norm": 29.89740562438965, + "learning_rate": 4.61126720638445e-06, + "loss": 4.266082286834717, + "step": 551 + }, + { + "epoch": 1.0823529411764705, + "grad_norm": 487.6723937988281, + "learning_rate": 4.595434595956024e-06, + "loss": 4.734857082366943, + "step": 552 + }, + { + "epoch": 1.084313725490196, + "grad_norm": 41.911346435546875, + "learning_rate": 4.579606067739445e-06, + "loss": 4.1984429359436035, + "step": 553 + }, + { + "epoch": 1.0862745098039215, + "grad_norm": 10.742730140686035, + "learning_rate": 4.563781781450312e-06, + "loss": 4.3449859619140625, + "step": 554 + }, + { + "epoch": 1.088235294117647, + "grad_norm": 108.23196411132812, + "learning_rate": 4.547961896761419e-06, + "loss": 4.055901050567627, + "step": 555 + }, + { + "epoch": 1.0901960784313725, + "grad_norm": 87.71426391601562, + "learning_rate": 4.5321465733011495e-06, + "loss": 3.7149624824523926, + "step": 556 + }, + { + "epoch": 1.0921568627450982, + "grad_norm": 47.08598709106445, + "learning_rate": 4.51633597065186e-06, + "loss": 4.83895206451416, + "step": 557 + }, + { + "epoch": 1.0941176470588236, + "grad_norm": 24.926769256591797, + "learning_rate": 4.500530248348274e-06, + "loss": 4.079520225524902, + "step": 558 + }, + { + "epoch": 1.0960784313725491, + "grad_norm": 54.99407196044922, + "learning_rate": 4.484729565875865e-06, + "loss": 4.527491569519043, + "step": 559 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 247.14097595214844, + "learning_rate": 4.468934082669265e-06, + "loss": 4.299029350280762, + "step": 560 + }, + { + "epoch": 1.1, + "grad_norm": 121.5594482421875, + "learning_rate": 4.4531439581106295e-06, + "loss": 4.263119220733643, + "step": 561 + }, + { + "epoch": 1.1019607843137256, + "grad_norm": 25.304927825927734, + "learning_rate": 4.43735935152805e-06, + "loss": 3.9582228660583496, + "step": 562 + }, + { + "epoch": 1.103921568627451, + "grad_norm": 46.14994812011719, + "learning_rate": 4.421580422193943e-06, + "loss": 3.8566017150878906, + "step": 563 + }, + { + "epoch": 1.1058823529411765, + "grad_norm": 76.16458892822266, + "learning_rate": 4.405807329323434e-06, + "loss": 4.10983943939209, + "step": 564 + }, + { + "epoch": 1.107843137254902, + "grad_norm": 83.4740982055664, + "learning_rate": 4.390040232072756e-06, + "loss": 4.510924816131592, + "step": 565 + }, + { + "epoch": 1.1098039215686275, + "grad_norm": 47.82321548461914, + "learning_rate": 4.3742792895376494e-06, + "loss": 4.188164710998535, + "step": 566 + }, + { + "epoch": 1.111764705882353, + "grad_norm": 95.69087982177734, + "learning_rate": 4.358524660751746e-06, + "loss": 4.134848594665527, + "step": 567 + }, + { + "epoch": 1.1137254901960785, + "grad_norm": 433.8464660644531, + "learning_rate": 4.3427765046849715e-06, + "loss": 4.2101240158081055, + "step": 568 + }, + { + "epoch": 1.115686274509804, + "grad_norm": 319.2967834472656, + "learning_rate": 4.327034980241941e-06, + "loss": 4.203764915466309, + "step": 569 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 65.28421783447266, + "learning_rate": 4.3113002462603525e-06, + "loss": 4.3949408531188965, + "step": 570 + }, + { + "epoch": 1.119607843137255, + "grad_norm": 255.21192932128906, + "learning_rate": 4.295572461509384e-06, + "loss": 4.426019668579102, + "step": 571 + }, + { + "epoch": 1.1215686274509804, + "grad_norm": 162.92665100097656, + "learning_rate": 4.279851784688099e-06, + "loss": 4.358798980712891, + "step": 572 + }, + { + "epoch": 1.1235294117647059, + "grad_norm": 52.00394058227539, + "learning_rate": 4.264138374423835e-06, + "loss": 4.416162014007568, + "step": 573 + }, + { + "epoch": 1.1254901960784314, + "grad_norm": 75.48197174072266, + "learning_rate": 4.248432389270604e-06, + "loss": 4.510519981384277, + "step": 574 + }, + { + "epoch": 1.1274509803921569, + "grad_norm": 24.009601593017578, + "learning_rate": 4.232733987707505e-06, + "loss": 4.026247024536133, + "step": 575 + }, + { + "epoch": 1.1294117647058823, + "grad_norm": 34.50553894042969, + "learning_rate": 4.2170433281371085e-06, + "loss": 3.6613011360168457, + "step": 576 + }, + { + "epoch": 1.1313725490196078, + "grad_norm": 155.1118621826172, + "learning_rate": 4.2013605688838656e-06, + "loss": 4.086950302124023, + "step": 577 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 33.91543197631836, + "learning_rate": 4.185685868192516e-06, + "loss": 4.358902931213379, + "step": 578 + }, + { + "epoch": 1.1352941176470588, + "grad_norm": 11.068925857543945, + "learning_rate": 4.170019384226482e-06, + "loss": 4.525607585906982, + "step": 579 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 12.911892890930176, + "learning_rate": 4.154361275066272e-06, + "loss": 4.787441730499268, + "step": 580 + }, + { + "epoch": 1.1392156862745098, + "grad_norm": 44.60051727294922, + "learning_rate": 4.138711698707899e-06, + "loss": 4.10947322845459, + "step": 581 + }, + { + "epoch": 1.1411764705882352, + "grad_norm": 14.284730911254883, + "learning_rate": 4.123070813061269e-06, + "loss": 4.306097030639648, + "step": 582 + }, + { + "epoch": 1.1431372549019607, + "grad_norm": 28.69902229309082, + "learning_rate": 4.107438775948598e-06, + "loss": 4.534627914428711, + "step": 583 + }, + { + "epoch": 1.1450980392156862, + "grad_norm": 118.47590637207031, + "learning_rate": 4.091815745102818e-06, + "loss": 4.478302001953125, + "step": 584 + }, + { + "epoch": 1.1470588235294117, + "grad_norm": 239.13079833984375, + "learning_rate": 4.076201878165985e-06, + "loss": 4.175230026245117, + "step": 585 + }, + { + "epoch": 1.1490196078431372, + "grad_norm": 11.822772979736328, + "learning_rate": 4.060597332687685e-06, + "loss": 4.232016563415527, + "step": 586 + }, + { + "epoch": 1.1509803921568627, + "grad_norm": 8.779848098754883, + "learning_rate": 4.04500226612345e-06, + "loss": 4.359222412109375, + "step": 587 + }, + { + "epoch": 1.1529411764705881, + "grad_norm": 55.4611701965332, + "learning_rate": 4.0294168358331646e-06, + "loss": 4.48436164855957, + "step": 588 + }, + { + "epoch": 1.1549019607843136, + "grad_norm": 106.90692901611328, + "learning_rate": 4.013841199079479e-06, + "loss": 4.364660263061523, + "step": 589 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 116.64163970947266, + "learning_rate": 3.998275513026227e-06, + "loss": 4.734926223754883, + "step": 590 + }, + { + "epoch": 1.1588235294117646, + "grad_norm": 49.576759338378906, + "learning_rate": 3.982719934736832e-06, + "loss": 4.57968807220459, + "step": 591 + }, + { + "epoch": 1.1607843137254903, + "grad_norm": 249.4477996826172, + "learning_rate": 3.967174621172728e-06, + "loss": 4.361348628997803, + "step": 592 + }, + { + "epoch": 1.1627450980392158, + "grad_norm": 200.72610473632812, + "learning_rate": 3.951639729191775e-06, + "loss": 4.360734939575195, + "step": 593 + }, + { + "epoch": 1.1647058823529413, + "grad_norm": 9.295411109924316, + "learning_rate": 3.936115415546676e-06, + "loss": 3.982612371444702, + "step": 594 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 24.211009979248047, + "learning_rate": 3.920601836883389e-06, + "loss": 4.072271347045898, + "step": 595 + }, + { + "epoch": 1.1686274509803922, + "grad_norm": 21.431562423706055, + "learning_rate": 3.9050991497395625e-06, + "loss": 4.5261969566345215, + "step": 596 + }, + { + "epoch": 1.1705882352941177, + "grad_norm": 30.35516929626465, + "learning_rate": 3.889607510542936e-06, + "loss": 4.187648773193359, + "step": 597 + }, + { + "epoch": 1.1725490196078432, + "grad_norm": 125.94475555419922, + "learning_rate": 3.874127075609774e-06, + "loss": 4.380338668823242, + "step": 598 + }, + { + "epoch": 1.1745098039215687, + "grad_norm": 13.184361457824707, + "learning_rate": 3.85865800114329e-06, + "loss": 4.057315826416016, + "step": 599 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 19.243846893310547, + "learning_rate": 3.8432004432320615e-06, + "loss": 4.725327491760254, + "step": 600 + }, + { + "epoch": 1.1784313725490196, + "grad_norm": 36.28742599487305, + "learning_rate": 3.82775455784846e-06, + "loss": 4.383151054382324, + "step": 601 + }, + { + "epoch": 1.1803921568627451, + "grad_norm": 19.237449645996094, + "learning_rate": 3.8123205008470814e-06, + "loss": 4.282052993774414, + "step": 602 + }, + { + "epoch": 1.1823529411764706, + "grad_norm": 5.7032790184021, + "learning_rate": 3.796898427963163e-06, + "loss": 4.363219738006592, + "step": 603 + }, + { + "epoch": 1.184313725490196, + "grad_norm": 29.10776138305664, + "learning_rate": 3.781488494811022e-06, + "loss": 4.339258193969727, + "step": 604 + }, + { + "epoch": 1.1862745098039216, + "grad_norm": 7.000486850738525, + "learning_rate": 3.7660908568824805e-06, + "loss": 3.8698220252990723, + "step": 605 + }, + { + "epoch": 1.188235294117647, + "grad_norm": 30.676071166992188, + "learning_rate": 3.7507056695452966e-06, + "loss": 4.321857929229736, + "step": 606 + }, + { + "epoch": 1.1901960784313725, + "grad_norm": 50.5842170715332, + "learning_rate": 3.7353330880415963e-06, + "loss": 4.553991317749023, + "step": 607 + }, + { + "epoch": 1.192156862745098, + "grad_norm": 45.57831573486328, + "learning_rate": 3.7199732674863126e-06, + "loss": 4.030979633331299, + "step": 608 + }, + { + "epoch": 1.1941176470588235, + "grad_norm": 7.072158336639404, + "learning_rate": 3.704626362865612e-06, + "loss": 4.507158279418945, + "step": 609 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 13.144266128540039, + "learning_rate": 3.689292529035332e-06, + "loss": 4.481673240661621, + "step": 610 + }, + { + "epoch": 1.1980392156862745, + "grad_norm": 25.689403533935547, + "learning_rate": 3.6739719207194313e-06, + "loss": 3.993607997894287, + "step": 611 + }, + { + "epoch": 1.2, + "grad_norm": 17.309415817260742, + "learning_rate": 3.6586646925084057e-06, + "loss": 4.172903060913086, + "step": 612 + }, + { + "epoch": 1.2019607843137254, + "grad_norm": 19.0247859954834, + "learning_rate": 3.643370998857748e-06, + "loss": 4.659976959228516, + "step": 613 + }, + { + "epoch": 1.203921568627451, + "grad_norm": 135.28846740722656, + "learning_rate": 3.628090994086384e-06, + "loss": 4.32747745513916, + "step": 614 + }, + { + "epoch": 1.2058823529411764, + "grad_norm": 56.14744186401367, + "learning_rate": 3.612824832375109e-06, + "loss": 4.782073020935059, + "step": 615 + }, + { + "epoch": 1.2078431372549019, + "grad_norm": 35.78535079956055, + "learning_rate": 3.5975726677650352e-06, + "loss": 4.763627052307129, + "step": 616 + }, + { + "epoch": 1.2098039215686274, + "grad_norm": 19.496564865112305, + "learning_rate": 3.5823346541560494e-06, + "loss": 4.345594882965088, + "step": 617 + }, + { + "epoch": 1.2117647058823529, + "grad_norm": 33.521278381347656, + "learning_rate": 3.5671109453052375e-06, + "loss": 4.2201056480407715, + "step": 618 + }, + { + "epoch": 1.2137254901960783, + "grad_norm": 42.489444732666016, + "learning_rate": 3.551901694825352e-06, + "loss": 4.0522074699401855, + "step": 619 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 49.21531295776367, + "learning_rate": 3.536707056183254e-06, + "loss": 4.429040908813477, + "step": 620 + }, + { + "epoch": 1.2176470588235295, + "grad_norm": 24.033802032470703, + "learning_rate": 3.5215271826983653e-06, + "loss": 4.133354663848877, + "step": 621 + }, + { + "epoch": 1.219607843137255, + "grad_norm": 990.7142944335938, + "learning_rate": 3.5063622275411214e-06, + "loss": 4.201162815093994, + "step": 622 + }, + { + "epoch": 1.2215686274509805, + "grad_norm": 15.486369132995605, + "learning_rate": 3.491212343731428e-06, + "loss": 4.364123344421387, + "step": 623 + }, + { + "epoch": 1.223529411764706, + "grad_norm": 12.343740463256836, + "learning_rate": 3.4760776841371147e-06, + "loss": 4.412334442138672, + "step": 624 + }, + { + "epoch": 1.2254901960784315, + "grad_norm": 30.658004760742188, + "learning_rate": 3.460958401472391e-06, + "loss": 4.671567916870117, + "step": 625 + }, + { + "epoch": 1.227450980392157, + "grad_norm": 22.261457443237305, + "learning_rate": 3.4458546482963117e-06, + "loss": 4.55692195892334, + "step": 626 + }, + { + "epoch": 1.2294117647058824, + "grad_norm": 24.038450241088867, + "learning_rate": 3.430766577011231e-06, + "loss": 4.43558406829834, + "step": 627 + }, + { + "epoch": 1.231372549019608, + "grad_norm": 15.328783988952637, + "learning_rate": 3.415694339861266e-06, + "loss": 4.682916164398193, + "step": 628 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 39.863975524902344, + "learning_rate": 3.4006380889307666e-06, + "loss": 4.413348197937012, + "step": 629 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 203.64805603027344, + "learning_rate": 3.3855979761427705e-06, + "loss": 4.414485454559326, + "step": 630 + }, + { + "epoch": 1.2372549019607844, + "grad_norm": 44.45320510864258, + "learning_rate": 3.3705741532574744e-06, + "loss": 3.9716179370880127, + "step": 631 + }, + { + "epoch": 1.2392156862745098, + "grad_norm": 38.30686950683594, + "learning_rate": 3.3555667718707143e-06, + "loss": 4.339261054992676, + "step": 632 + }, + { + "epoch": 1.2411764705882353, + "grad_norm": 424.18194580078125, + "learning_rate": 3.340575983412412e-06, + "loss": 4.420026779174805, + "step": 633 + }, + { + "epoch": 1.2431372549019608, + "grad_norm": 7.286797523498535, + "learning_rate": 3.3256019391450696e-06, + "loss": 4.589149475097656, + "step": 634 + }, + { + "epoch": 1.2450980392156863, + "grad_norm": 244.61492919921875, + "learning_rate": 3.3106447901622324e-06, + "loss": 4.318112373352051, + "step": 635 + }, + { + "epoch": 1.2470588235294118, + "grad_norm": 11.346695899963379, + "learning_rate": 3.2957046873869647e-06, + "loss": 4.311612129211426, + "step": 636 + }, + { + "epoch": 1.2490196078431373, + "grad_norm": 62.095428466796875, + "learning_rate": 3.280781781570328e-06, + "loss": 4.524885177612305, + "step": 637 + }, + { + "epoch": 1.2509803921568627, + "grad_norm": 17.787887573242188, + "learning_rate": 3.2658762232898646e-06, + "loss": 4.234330177307129, + "step": 638 + }, + { + "epoch": 1.2529411764705882, + "grad_norm": 16.902944564819336, + "learning_rate": 3.2509881629480674e-06, + "loss": 3.874988317489624, + "step": 639 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 98.8851547241211, + "learning_rate": 3.236117750770872e-06, + "loss": 4.235379695892334, + "step": 640 + }, + { + "epoch": 1.2568627450980392, + "grad_norm": 6.254584312438965, + "learning_rate": 3.221265136806139e-06, + "loss": 4.176527976989746, + "step": 641 + }, + { + "epoch": 1.2588235294117647, + "grad_norm": 82.6905517578125, + "learning_rate": 3.2064304709221374e-06, + "loss": 4.461231231689453, + "step": 642 + }, + { + "epoch": 1.2607843137254902, + "grad_norm": 25.304033279418945, + "learning_rate": 3.1916139028060318e-06, + "loss": 4.426841735839844, + "step": 643 + }, + { + "epoch": 1.2627450980392156, + "grad_norm": 108.71114349365234, + "learning_rate": 3.176815581962377e-06, + "loss": 3.9393110275268555, + "step": 644 + }, + { + "epoch": 1.2647058823529411, + "grad_norm": 56.15510559082031, + "learning_rate": 3.162035657711604e-06, + "loss": 3.9527595043182373, + "step": 645 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 142.79994201660156, + "learning_rate": 3.1472742791885126e-06, + "loss": 4.400339126586914, + "step": 646 + }, + { + "epoch": 1.268627450980392, + "grad_norm": 81.11062622070312, + "learning_rate": 3.1325315953407787e-06, + "loss": 4.062412738800049, + "step": 647 + }, + { + "epoch": 1.2705882352941176, + "grad_norm": 22.960067749023438, + "learning_rate": 3.117807754927433e-06, + "loss": 4.251653671264648, + "step": 648 + }, + { + "epoch": 1.272549019607843, + "grad_norm": 11.816069602966309, + "learning_rate": 3.103102906517371e-06, + "loss": 4.824273109436035, + "step": 649 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 13.434324264526367, + "learning_rate": 3.0884171984878553e-06, + "loss": 4.2310380935668945, + "step": 650 + }, + { + "epoch": 1.276470588235294, + "grad_norm": 27.62645721435547, + "learning_rate": 3.0737507790230143e-06, + "loss": 4.626526832580566, + "step": 651 + }, + { + "epoch": 1.2784313725490195, + "grad_norm": 26.819211959838867, + "learning_rate": 3.0591037961123414e-06, + "loss": 4.332751274108887, + "step": 652 + }, + { + "epoch": 1.280392156862745, + "grad_norm": 283.3738098144531, + "learning_rate": 3.044476397549221e-06, + "loss": 3.8689937591552734, + "step": 653 + }, + { + "epoch": 1.2823529411764705, + "grad_norm": 53.717491149902344, + "learning_rate": 3.0298687309294106e-06, + "loss": 4.225245475769043, + "step": 654 + }, + { + "epoch": 1.284313725490196, + "grad_norm": 19.717180252075195, + "learning_rate": 3.0152809436495732e-06, + "loss": 4.666205406188965, + "step": 655 + }, + { + "epoch": 1.2862745098039214, + "grad_norm": 266.5069274902344, + "learning_rate": 3.0007131829057807e-06, + "loss": 4.279370307922363, + "step": 656 + }, + { + "epoch": 1.288235294117647, + "grad_norm": 169.1015625, + "learning_rate": 2.9861655956920286e-06, + "loss": 4.160942554473877, + "step": 657 + }, + { + "epoch": 1.2901960784313726, + "grad_norm": 32.97356033325195, + "learning_rate": 2.971638328798755e-06, + "loss": 4.0544891357421875, + "step": 658 + }, + { + "epoch": 1.2921568627450981, + "grad_norm": 22.934980392456055, + "learning_rate": 2.95713152881136e-06, + "loss": 3.782228469848633, + "step": 659 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 243.40133666992188, + "learning_rate": 2.942645342108723e-06, + "loss": 4.141233444213867, + "step": 660 + }, + { + "epoch": 1.296078431372549, + "grad_norm": 20.202136993408203, + "learning_rate": 2.9281799148617264e-06, + "loss": 4.041909217834473, + "step": 661 + }, + { + "epoch": 1.2980392156862746, + "grad_norm": 15.2548189163208, + "learning_rate": 2.913735393031786e-06, + "loss": 4.165369987487793, + "step": 662 + }, + { + "epoch": 1.3, + "grad_norm": 70.03112030029297, + "learning_rate": 2.8993119223693756e-06, + "loss": 3.9369964599609375, + "step": 663 + }, + { + "epoch": 1.3019607843137255, + "grad_norm": 13.889703750610352, + "learning_rate": 2.884909648412545e-06, + "loss": 4.479905605316162, + "step": 664 + }, + { + "epoch": 1.303921568627451, + "grad_norm": 18.408138275146484, + "learning_rate": 2.8705287164854755e-06, + "loss": 4.415904521942139, + "step": 665 + }, + { + "epoch": 1.3058823529411765, + "grad_norm": 9.360381126403809, + "learning_rate": 2.8561692716969907e-06, + "loss": 4.0752058029174805, + "step": 666 + }, + { + "epoch": 1.307843137254902, + "grad_norm": 14.265610694885254, + "learning_rate": 2.841831458939103e-06, + "loss": 4.536774635314941, + "step": 667 + }, + { + "epoch": 1.3098039215686275, + "grad_norm": 1695.8045654296875, + "learning_rate": 2.8275154228855495e-06, + "loss": 4.553939342498779, + "step": 668 + }, + { + "epoch": 1.311764705882353, + "grad_norm": 10.43436336517334, + "learning_rate": 2.8132213079903335e-06, + "loss": 4.155887603759766, + "step": 669 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 35.32231140136719, + "learning_rate": 2.798949258486263e-06, + "loss": 4.418889999389648, + "step": 670 + }, + { + "epoch": 1.315686274509804, + "grad_norm": 48.21449279785156, + "learning_rate": 2.7846994183835073e-06, + "loss": 4.05487060546875, + "step": 671 + }, + { + "epoch": 1.3176470588235294, + "grad_norm": 29.26831817626953, + "learning_rate": 2.770471931468121e-06, + "loss": 4.465351104736328, + "step": 672 + }, + { + "epoch": 1.3196078431372549, + "grad_norm": 15.706915855407715, + "learning_rate": 2.756266941300615e-06, + "loss": 4.6231231689453125, + "step": 673 + }, + { + "epoch": 1.3215686274509804, + "grad_norm": 18.40941619873047, + "learning_rate": 2.742084591214501e-06, + "loss": 4.08518648147583, + "step": 674 + }, + { + "epoch": 1.3235294117647058, + "grad_norm": 41.988037109375, + "learning_rate": 2.7279250243148416e-06, + "loss": 4.570401191711426, + "step": 675 + }, + { + "epoch": 1.3254901960784313, + "grad_norm": 10.751540184020996, + "learning_rate": 2.7137883834768076e-06, + "loss": 4.091187953948975, + "step": 676 + }, + { + "epoch": 1.3274509803921568, + "grad_norm": 34.95832443237305, + "learning_rate": 2.6996748113442397e-06, + "loss": 4.208563327789307, + "step": 677 + }, + { + "epoch": 1.3294117647058823, + "grad_norm": 18.75754165649414, + "learning_rate": 2.6855844503282057e-06, + "loss": 4.102869987487793, + "step": 678 + }, + { + "epoch": 1.3313725490196078, + "grad_norm": 7.9432172775268555, + "learning_rate": 2.6715174426055664e-06, + "loss": 4.183775424957275, + "step": 679 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 15.478090286254883, + "learning_rate": 2.657473930117537e-06, + "loss": 4.355172157287598, + "step": 680 + }, + { + "epoch": 1.3352941176470587, + "grad_norm": 40.765968322753906, + "learning_rate": 2.6434540545682585e-06, + "loss": 4.0800347328186035, + "step": 681 + }, + { + "epoch": 1.3372549019607844, + "grad_norm": 296.0919189453125, + "learning_rate": 2.629457957423365e-06, + "loss": 4.647578716278076, + "step": 682 + }, + { + "epoch": 1.33921568627451, + "grad_norm": 575.8104858398438, + "learning_rate": 2.6154857799085643e-06, + "loss": 4.862715244293213, + "step": 683 + }, + { + "epoch": 1.3411764705882354, + "grad_norm": 12.176534652709961, + "learning_rate": 2.6015376630082e-06, + "loss": 4.206980228424072, + "step": 684 + }, + { + "epoch": 1.343137254901961, + "grad_norm": 48.13445281982422, + "learning_rate": 2.5876137474638323e-06, + "loss": 4.654312610626221, + "step": 685 + }, + { + "epoch": 1.3450980392156864, + "grad_norm": 95.93660736083984, + "learning_rate": 2.5737141737728313e-06, + "loss": 4.447317123413086, + "step": 686 + }, + { + "epoch": 1.3470588235294119, + "grad_norm": 654.4398193359375, + "learning_rate": 2.5598390821869403e-06, + "loss": 4.394565582275391, + "step": 687 + }, + { + "epoch": 1.3490196078431373, + "grad_norm": 11.787772178649902, + "learning_rate": 2.5459886127108733e-06, + "loss": 4.405200004577637, + "step": 688 + }, + { + "epoch": 1.3509803921568628, + "grad_norm": 48.73141098022461, + "learning_rate": 2.532162905100898e-06, + "loss": 4.204862594604492, + "step": 689 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 213.5713653564453, + "learning_rate": 2.518362098863423e-06, + "loss": 4.153653144836426, + "step": 690 + }, + { + "epoch": 1.3549019607843138, + "grad_norm": 106.65089416503906, + "learning_rate": 2.504586333253595e-06, + "loss": 4.212902069091797, + "step": 691 + }, + { + "epoch": 1.3568627450980393, + "grad_norm": 15.037378311157227, + "learning_rate": 2.490835747273896e-06, + "loss": 4.021491050720215, + "step": 692 + }, + { + "epoch": 1.3588235294117648, + "grad_norm": 15.249185562133789, + "learning_rate": 2.4771104796727275e-06, + "loss": 4.2650299072265625, + "step": 693 + }, + { + "epoch": 1.3607843137254902, + "grad_norm": 121.88268280029297, + "learning_rate": 2.4634106689430235e-06, + "loss": 4.424811840057373, + "step": 694 + }, + { + "epoch": 1.3627450980392157, + "grad_norm": 43.32889938354492, + "learning_rate": 2.449736453320854e-06, + "loss": 4.422557830810547, + "step": 695 + }, + { + "epoch": 1.3647058823529412, + "grad_norm": 148.86349487304688, + "learning_rate": 2.436087970784018e-06, + "loss": 4.341975212097168, + "step": 696 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 670.186767578125, + "learning_rate": 2.422465359050661e-06, + "loss": 4.281984806060791, + "step": 697 + }, + { + "epoch": 1.3686274509803922, + "grad_norm": 14.450155258178711, + "learning_rate": 2.408868755577882e-06, + "loss": 4.32998514175415, + "step": 698 + }, + { + "epoch": 1.3705882352941177, + "grad_norm": 9.539745330810547, + "learning_rate": 2.3952982975603494e-06, + "loss": 4.54522705078125, + "step": 699 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 182.08261108398438, + "learning_rate": 2.3817541219289094e-06, + "loss": 4.368851661682129, + "step": 700 + }, + { + "epoch": 1.3745098039215686, + "grad_norm": 25.275768280029297, + "learning_rate": 2.368236365349218e-06, + "loss": 4.376866340637207, + "step": 701 + }, + { + "epoch": 1.3764705882352941, + "grad_norm": 202.50473022460938, + "learning_rate": 2.3547451642203438e-06, + "loss": 3.923326253890991, + "step": 702 + }, + { + "epoch": 1.3784313725490196, + "grad_norm": 591.466796875, + "learning_rate": 2.341280654673406e-06, + "loss": 4.25909423828125, + "step": 703 + }, + { + "epoch": 1.380392156862745, + "grad_norm": 6.909029006958008, + "learning_rate": 2.327842972570198e-06, + "loss": 4.500032424926758, + "step": 704 + }, + { + "epoch": 1.3823529411764706, + "grad_norm": 10.733404159545898, + "learning_rate": 2.3144322535018126e-06, + "loss": 4.299771785736084, + "step": 705 + }, + { + "epoch": 1.384313725490196, + "grad_norm": 7.808283805847168, + "learning_rate": 2.30104863278727e-06, + "loss": 4.240073204040527, + "step": 706 + }, + { + "epoch": 1.3862745098039215, + "grad_norm": 263.0734558105469, + "learning_rate": 2.2876922454721695e-06, + "loss": 4.555202484130859, + "step": 707 + }, + { + "epoch": 1.388235294117647, + "grad_norm": 14.998809814453125, + "learning_rate": 2.2743632263273075e-06, + "loss": 4.1804022789001465, + "step": 708 + }, + { + "epoch": 1.3901960784313725, + "grad_norm": 71.89509582519531, + "learning_rate": 2.261061709847327e-06, + "loss": 4.29913330078125, + "step": 709 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 288.5723571777344, + "learning_rate": 2.247787830249361e-06, + "loss": 4.5099029541015625, + "step": 710 + }, + { + "epoch": 1.3941176470588235, + "grad_norm": 9.197834968566895, + "learning_rate": 2.2345417214716745e-06, + "loss": 4.179586887359619, + "step": 711 + }, + { + "epoch": 1.396078431372549, + "grad_norm": 11.813665390014648, + "learning_rate": 2.2213235171723135e-06, + "loss": 4.482941627502441, + "step": 712 + }, + { + "epoch": 1.3980392156862744, + "grad_norm": 28.311222076416016, + "learning_rate": 2.208133350727764e-06, + "loss": 4.279744625091553, + "step": 713 + }, + { + "epoch": 1.4, + "grad_norm": 16.056880950927734, + "learning_rate": 2.194971355231595e-06, + "loss": 4.4880499839782715, + "step": 714 + }, + { + "epoch": 1.4019607843137254, + "grad_norm": 18.576108932495117, + "learning_rate": 2.1818376634931154e-06, + "loss": 4.454833984375, + "step": 715 + }, + { + "epoch": 1.4039215686274509, + "grad_norm": 103.8512191772461, + "learning_rate": 2.1687324080360505e-06, + "loss": 4.2193450927734375, + "step": 716 + }, + { + "epoch": 1.4058823529411764, + "grad_norm": 21.82050323486328, + "learning_rate": 2.1556557210971845e-06, + "loss": 4.52620792388916, + "step": 717 + }, + { + "epoch": 1.4078431372549018, + "grad_norm": 19.718570709228516, + "learning_rate": 2.1426077346250387e-06, + "loss": 4.283022403717041, + "step": 718 + }, + { + "epoch": 1.4098039215686273, + "grad_norm": 40.729331970214844, + "learning_rate": 2.1295885802785332e-06, + "loss": 3.9489569664001465, + "step": 719 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 6.961674690246582, + "learning_rate": 2.1165983894256647e-06, + "loss": 4.545563220977783, + "step": 720 + }, + { + "epoch": 1.4137254901960783, + "grad_norm": 5.8866682052612305, + "learning_rate": 2.103637293142175e-06, + "loss": 4.608613014221191, + "step": 721 + }, + { + "epoch": 1.415686274509804, + "grad_norm": 34.60615539550781, + "learning_rate": 2.0907054222102367e-06, + "loss": 3.834550380706787, + "step": 722 + }, + { + "epoch": 1.4176470588235295, + "grad_norm": 8.542845726013184, + "learning_rate": 2.077802907117119e-06, + "loss": 4.079771995544434, + "step": 723 + }, + { + "epoch": 1.419607843137255, + "grad_norm": 7.8854289054870605, + "learning_rate": 2.064929878053885e-06, + "loss": 4.5798139572143555, + "step": 724 + }, + { + "epoch": 1.4215686274509804, + "grad_norm": 25.48946189880371, + "learning_rate": 2.0520864649140763e-06, + "loss": 4.3257293701171875, + "step": 725 + }, + { + "epoch": 1.423529411764706, + "grad_norm": 34.86707305908203, + "learning_rate": 2.039272797292394e-06, + "loss": 4.576321125030518, + "step": 726 + }, + { + "epoch": 1.4254901960784314, + "grad_norm": 1225.54443359375, + "learning_rate": 2.0264890044833995e-06, + "loss": 4.269768714904785, + "step": 727 + }, + { + "epoch": 1.427450980392157, + "grad_norm": 27.487634658813477, + "learning_rate": 2.0137352154801993e-06, + "loss": 3.9596872329711914, + "step": 728 + }, + { + "epoch": 1.4294117647058824, + "grad_norm": 32.341365814208984, + "learning_rate": 2.0010115589731614e-06, + "loss": 4.522527694702148, + "step": 729 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 17.39375114440918, + "learning_rate": 1.9883181633485994e-06, + "loss": 4.125075340270996, + "step": 730 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 17.85887336730957, + "learning_rate": 1.9756551566874837e-06, + "loss": 4.250030994415283, + "step": 731 + }, + { + "epoch": 1.4352941176470588, + "grad_norm": 7.828402519226074, + "learning_rate": 1.9630226667641516e-06, + "loss": 4.435338497161865, + "step": 732 + }, + { + "epoch": 1.4372549019607843, + "grad_norm": 55.706932067871094, + "learning_rate": 1.9504208210450126e-06, + "loss": 4.133784294128418, + "step": 733 + }, + { + "epoch": 1.4392156862745098, + "grad_norm": 26.68115234375, + "learning_rate": 1.9378497466872657e-06, + "loss": 4.250054359436035, + "step": 734 + }, + { + "epoch": 1.4411764705882353, + "grad_norm": 30.59134864807129, + "learning_rate": 1.9253095705376218e-06, + "loss": 4.763115882873535, + "step": 735 + }, + { + "epoch": 1.4431372549019608, + "grad_norm": 24.48635482788086, + "learning_rate": 1.9128004191310064e-06, + "loss": 4.577441215515137, + "step": 736 + }, + { + "epoch": 1.4450980392156862, + "grad_norm": 86.27688598632812, + "learning_rate": 1.9003224186893e-06, + "loss": 4.721240997314453, + "step": 737 + }, + { + "epoch": 1.4470588235294117, + "grad_norm": 20.05085563659668, + "learning_rate": 1.8878756951200627e-06, + "loss": 3.7817955017089844, + "step": 738 + }, + { + "epoch": 1.4490196078431372, + "grad_norm": 32.25955581665039, + "learning_rate": 1.8754603740152533e-06, + "loss": 4.532436370849609, + "step": 739 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 77.6532974243164, + "learning_rate": 1.86307658064997e-06, + "loss": 4.3144707679748535, + "step": 740 + }, + { + "epoch": 1.4529411764705882, + "grad_norm": 29.76628303527832, + "learning_rate": 1.8507244399811858e-06, + "loss": 4.315280914306641, + "step": 741 + }, + { + "epoch": 1.4549019607843137, + "grad_norm": 36.668670654296875, + "learning_rate": 1.8384040766464856e-06, + "loss": 4.625729560852051, + "step": 742 + }, + { + "epoch": 1.4568627450980391, + "grad_norm": 8.990495681762695, + "learning_rate": 1.8261156149628101e-06, + "loss": 4.665683746337891, + "step": 743 + }, + { + "epoch": 1.4588235294117646, + "grad_norm": 12.940643310546875, + "learning_rate": 1.8138591789251997e-06, + "loss": 3.9143993854522705, + "step": 744 + }, + { + "epoch": 1.4607843137254901, + "grad_norm": 16.716720581054688, + "learning_rate": 1.8016348922055448e-06, + "loss": 4.646050453186035, + "step": 745 + }, + { + "epoch": 1.4627450980392158, + "grad_norm": 7.3492584228515625, + "learning_rate": 1.7894428781513367e-06, + "loss": 4.509244918823242, + "step": 746 + }, + { + "epoch": 1.4647058823529413, + "grad_norm": 8.405111312866211, + "learning_rate": 1.7772832597844286e-06, + "loss": 3.976335287094116, + "step": 747 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 64.35211181640625, + "learning_rate": 1.7651561597997846e-06, + "loss": 4.503273010253906, + "step": 748 + }, + { + "epoch": 1.4686274509803923, + "grad_norm": 16.13751220703125, + "learning_rate": 1.7530617005642431e-06, + "loss": 4.158659934997559, + "step": 749 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 15.204669952392578, + "learning_rate": 1.7410000041152953e-06, + "loss": 4.037508010864258, + "step": 750 + }, + { + "epoch": 1.4725490196078432, + "grad_norm": 9.200127601623535, + "learning_rate": 1.7289711921598362e-06, + "loss": 4.425247669219971, + "step": 751 + }, + { + "epoch": 1.4745098039215687, + "grad_norm": 88.66690063476562, + "learning_rate": 1.716975386072947e-06, + "loss": 4.361676216125488, + "step": 752 + }, + { + "epoch": 1.4764705882352942, + "grad_norm": 22.621477127075195, + "learning_rate": 1.7050127068966681e-06, + "loss": 4.638454437255859, + "step": 753 + }, + { + "epoch": 1.4784313725490197, + "grad_norm": 34.49787521362305, + "learning_rate": 1.6930832753387767e-06, + "loss": 4.669740200042725, + "step": 754 + }, + { + "epoch": 1.4803921568627452, + "grad_norm": 14.792598724365234, + "learning_rate": 1.6811872117715672e-06, + "loss": 4.41928243637085, + "step": 755 + }, + { + "epoch": 1.4823529411764707, + "grad_norm": 14.08731460571289, + "learning_rate": 1.6693246362306465e-06, + "loss": 4.068864822387695, + "step": 756 + }, + { + "epoch": 1.4843137254901961, + "grad_norm": 1587.7181396484375, + "learning_rate": 1.6574956684137044e-06, + "loss": 4.343111038208008, + "step": 757 + }, + { + "epoch": 1.4862745098039216, + "grad_norm": 13.467636108398438, + "learning_rate": 1.6457004276793227e-06, + "loss": 4.441114902496338, + "step": 758 + }, + { + "epoch": 1.488235294117647, + "grad_norm": 12.129409790039062, + "learning_rate": 1.633939033045766e-06, + "loss": 4.096912384033203, + "step": 759 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 30.13478660583496, + "learning_rate": 1.6222116031897773e-06, + "loss": 4.392602920532227, + "step": 760 + }, + { + "epoch": 1.492156862745098, + "grad_norm": 12.348272323608398, + "learning_rate": 1.610518256445382e-06, + "loss": 3.8157711029052734, + "step": 761 + }, + { + "epoch": 1.4941176470588236, + "grad_norm": 179.19436645507812, + "learning_rate": 1.5988591108026952e-06, + "loss": 4.038779258728027, + "step": 762 + }, + { + "epoch": 1.496078431372549, + "grad_norm": 15.53244686126709, + "learning_rate": 1.5872342839067305e-06, + "loss": 4.308762073516846, + "step": 763 + }, + { + "epoch": 1.4980392156862745, + "grad_norm": 25.847118377685547, + "learning_rate": 1.575643893056213e-06, + "loss": 4.312381744384766, + "step": 764 + }, + { + "epoch": 1.5, + "grad_norm": 34.48213577270508, + "learning_rate": 1.5640880552023957e-06, + "loss": 4.298844337463379, + "step": 765 + }, + { + "epoch": 1.5019607843137255, + "grad_norm": 216.0990447998047, + "learning_rate": 1.552566886947879e-06, + "loss": 4.351778984069824, + "step": 766 + }, + { + "epoch": 1.503921568627451, + "grad_norm": 18.95107650756836, + "learning_rate": 1.541080504545433e-06, + "loss": 4.403024196624756, + "step": 767 + }, + { + "epoch": 1.5058823529411764, + "grad_norm": 75.59261322021484, + "learning_rate": 1.5296290238968303e-06, + "loss": 4.473559379577637, + "step": 768 + }, + { + "epoch": 1.507843137254902, + "grad_norm": 52.72391891479492, + "learning_rate": 1.5182125605516706e-06, + "loss": 4.5149312019348145, + "step": 769 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 12.191537857055664, + "learning_rate": 1.5068312297062089e-06, + "loss": 4.095131874084473, + "step": 770 + }, + { + "epoch": 1.511764705882353, + "grad_norm": 19.78276824951172, + "learning_rate": 1.4954851462022118e-06, + "loss": 4.230744361877441, + "step": 771 + }, + { + "epoch": 1.5137254901960784, + "grad_norm": 88.43994903564453, + "learning_rate": 1.4841744245257812e-06, + "loss": 4.566298484802246, + "step": 772 + }, + { + "epoch": 1.5156862745098039, + "grad_norm": 53.15193176269531, + "learning_rate": 1.4728991788062052e-06, + "loss": 3.882889747619629, + "step": 773 + }, + { + "epoch": 1.5176470588235293, + "grad_norm": 1745.162353515625, + "learning_rate": 1.4616595228148095e-06, + "loss": 3.846311569213867, + "step": 774 + }, + { + "epoch": 1.5196078431372548, + "grad_norm": 15.805097579956055, + "learning_rate": 1.4504555699638034e-06, + "loss": 4.322307109832764, + "step": 775 + }, + { + "epoch": 1.5215686274509803, + "grad_norm": 21.051225662231445, + "learning_rate": 1.4392874333051387e-06, + "loss": 4.375843048095703, + "step": 776 + }, + { + "epoch": 1.5235294117647058, + "grad_norm": 115.48617553710938, + "learning_rate": 1.428155225529374e-06, + "loss": 4.2751359939575195, + "step": 777 + }, + { + "epoch": 1.5254901960784313, + "grad_norm": 66.88899993896484, + "learning_rate": 1.4170590589645273e-06, + "loss": 3.909058094024658, + "step": 778 + }, + { + "epoch": 1.5274509803921568, + "grad_norm": 15.390741348266602, + "learning_rate": 1.405999045574945e-06, + "loss": 3.9388744831085205, + "step": 779 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 12.519731521606445, + "learning_rate": 1.3949752969601838e-06, + "loss": 4.760117530822754, + "step": 780 + }, + { + "epoch": 1.5313725490196077, + "grad_norm": 30.16779327392578, + "learning_rate": 1.383987924353868e-06, + "loss": 4.391450881958008, + "step": 781 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 20.49533462524414, + "learning_rate": 1.3730370386225778e-06, + "loss": 4.297346115112305, + "step": 782 + }, + { + "epoch": 1.5352941176470587, + "grad_norm": 262.9514465332031, + "learning_rate": 1.3621227502647272e-06, + "loss": 4.436494827270508, + "step": 783 + }, + { + "epoch": 1.5372549019607842, + "grad_norm": 24.261484146118164, + "learning_rate": 1.351245169409449e-06, + "loss": 4.242164611816406, + "step": 784 + }, + { + "epoch": 1.5392156862745097, + "grad_norm": 94.59295654296875, + "learning_rate": 1.3404044058154836e-06, + "loss": 4.2807841300964355, + "step": 785 + }, + { + "epoch": 1.5411764705882351, + "grad_norm": 133.6329803466797, + "learning_rate": 1.3296005688700764e-06, + "loss": 4.7687201499938965, + "step": 786 + }, + { + "epoch": 1.5431372549019606, + "grad_norm": 43.33927917480469, + "learning_rate": 1.318833767587861e-06, + "loss": 4.107509136199951, + "step": 787 + }, + { + "epoch": 1.5450980392156861, + "grad_norm": 1382.5068359375, + "learning_rate": 1.308104110609773e-06, + "loss": 4.649358749389648, + "step": 788 + }, + { + "epoch": 1.5470588235294118, + "grad_norm": 44.702369689941406, + "learning_rate": 1.2974117062019504e-06, + "loss": 4.620138168334961, + "step": 789 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 41.0037841796875, + "learning_rate": 1.2867566622546357e-06, + "loss": 4.396909713745117, + "step": 790 + }, + { + "epoch": 1.5509803921568628, + "grad_norm": 87.05127716064453, + "learning_rate": 1.2761390862810907e-06, + "loss": 4.270963668823242, + "step": 791 + }, + { + "epoch": 1.5529411764705883, + "grad_norm": 24.310062408447266, + "learning_rate": 1.2655590854165146e-06, + "loss": 3.945737838745117, + "step": 792 + }, + { + "epoch": 1.5549019607843138, + "grad_norm": 51.97520065307617, + "learning_rate": 1.2550167664169565e-06, + "loss": 4.492574691772461, + "step": 793 + }, + { + "epoch": 1.5568627450980392, + "grad_norm": 48.23164749145508, + "learning_rate": 1.244512235658245e-06, + "loss": 3.9952263832092285, + "step": 794 + }, + { + "epoch": 1.5588235294117647, + "grad_norm": 11.07433032989502, + "learning_rate": 1.2340455991349094e-06, + "loss": 4.4271416664123535, + "step": 795 + }, + { + "epoch": 1.5607843137254902, + "grad_norm": 169.1728515625, + "learning_rate": 1.2236169624591138e-06, + "loss": 4.345428466796875, + "step": 796 + }, + { + "epoch": 1.5627450980392157, + "grad_norm": 27.21241569519043, + "learning_rate": 1.2132264308595875e-06, + "loss": 3.972264289855957, + "step": 797 + }, + { + "epoch": 1.5647058823529412, + "grad_norm": 183.76351928710938, + "learning_rate": 1.2028741091805713e-06, + "loss": 4.575385093688965, + "step": 798 + }, + { + "epoch": 1.5666666666666667, + "grad_norm": 9.296713829040527, + "learning_rate": 1.1925601018807498e-06, + "loss": 4.201542854309082, + "step": 799 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 15.747394561767578, + "learning_rate": 1.182284513032198e-06, + "loss": 4.3995256423950195, + "step": 800 + }, + { + "epoch": 1.5705882352941176, + "grad_norm": 65.41732025146484, + "learning_rate": 1.1720474463193442e-06, + "loss": 4.298453330993652, + "step": 801 + }, + { + "epoch": 1.572549019607843, + "grad_norm": 33.861167907714844, + "learning_rate": 1.1618490050379073e-06, + "loss": 4.283881664276123, + "step": 802 + }, + { + "epoch": 1.5745098039215686, + "grad_norm": 25.06275749206543, + "learning_rate": 1.1516892920938627e-06, + "loss": 4.503167152404785, + "step": 803 + }, + { + "epoch": 1.576470588235294, + "grad_norm": 47.199275970458984, + "learning_rate": 1.1415684100024043e-06, + "loss": 4.034012794494629, + "step": 804 + }, + { + "epoch": 1.5784313725490198, + "grad_norm": 2318.244384765625, + "learning_rate": 1.131486460886908e-06, + "loss": 4.031803607940674, + "step": 805 + }, + { + "epoch": 1.5803921568627453, + "grad_norm": 45.9147834777832, + "learning_rate": 1.1214435464779006e-06, + "loss": 4.6407575607299805, + "step": 806 + }, + { + "epoch": 1.5823529411764707, + "grad_norm": 23.733440399169922, + "learning_rate": 1.1114397681120386e-06, + "loss": 3.8248114585876465, + "step": 807 + }, + { + "epoch": 1.5843137254901962, + "grad_norm": 50.900054931640625, + "learning_rate": 1.1014752267310757e-06, + "loss": 4.657618999481201, + "step": 808 + }, + { + "epoch": 1.5862745098039217, + "grad_norm": 14.20254135131836, + "learning_rate": 1.0915500228808523e-06, + "loss": 3.8249688148498535, + "step": 809 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 76.59568786621094, + "learning_rate": 1.0816642567102832e-06, + "loss": 4.338264465332031, + "step": 810 + }, + { + "epoch": 1.5901960784313727, + "grad_norm": 50.278507232666016, + "learning_rate": 1.0718180279703371e-06, + "loss": 4.3644843101501465, + "step": 811 + }, + { + "epoch": 1.5921568627450982, + "grad_norm": 18.936275482177734, + "learning_rate": 1.0620114360130385e-06, + "loss": 4.120285511016846, + "step": 812 + }, + { + "epoch": 1.5941176470588236, + "grad_norm": 92.27937316894531, + "learning_rate": 1.0522445797904608e-06, + "loss": 4.5084075927734375, + "step": 813 + }, + { + "epoch": 1.5960784313725491, + "grad_norm": 34.172847747802734, + "learning_rate": 1.04251755785373e-06, + "loss": 4.208546161651611, + "step": 814 + }, + { + "epoch": 1.5980392156862746, + "grad_norm": 51.73433303833008, + "learning_rate": 1.0328304683520308e-06, + "loss": 4.2955193519592285, + "step": 815 + }, + { + "epoch": 1.6, + "grad_norm": 69.9002456665039, + "learning_rate": 1.0231834090316135e-06, + "loss": 4.448221206665039, + "step": 816 + }, + { + "epoch": 1.6019607843137256, + "grad_norm": 43.196990966796875, + "learning_rate": 1.0135764772348105e-06, + "loss": 4.406120300292969, + "step": 817 + }, + { + "epoch": 1.603921568627451, + "grad_norm": 164.75355529785156, + "learning_rate": 1.004009769899051e-06, + "loss": 4.193570613861084, + "step": 818 + }, + { + "epoch": 1.6058823529411765, + "grad_norm": 10.750273704528809, + "learning_rate": 9.944833835558886e-07, + "loss": 4.542207717895508, + "step": 819 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 159.42050170898438, + "learning_rate": 9.849974143300216e-07, + "loss": 4.338168621063232, + "step": 820 + }, + { + "epoch": 1.6098039215686275, + "grad_norm": 75.44558715820312, + "learning_rate": 9.755519579383206e-07, + "loss": 4.4732279777526855, + "step": 821 + }, + { + "epoch": 1.611764705882353, + "grad_norm": 9.22712516784668, + "learning_rate": 9.661471096888735e-07, + "loss": 4.747133255004883, + "step": 822 + }, + { + "epoch": 1.6137254901960785, + "grad_norm": 17.274816513061523, + "learning_rate": 9.567829644800141e-07, + "loss": 4.6141815185546875, + "step": 823 + }, + { + "epoch": 1.615686274509804, + "grad_norm": 38.42319869995117, + "learning_rate": 9.474596167993688e-07, + "loss": 4.395506858825684, + "step": 824 + }, + { + "epoch": 1.6176470588235294, + "grad_norm": 9.850702285766602, + "learning_rate": 9.381771607229001e-07, + "loss": 4.559998512268066, + "step": 825 + }, + { + "epoch": 1.619607843137255, + "grad_norm": 25.2269344329834, + "learning_rate": 9.289356899139623e-07, + "loss": 3.902873992919922, + "step": 826 + }, + { + "epoch": 1.6215686274509804, + "grad_norm": 15.09051513671875, + "learning_rate": 9.197352976223495e-07, + "loss": 4.349954605102539, + "step": 827 + }, + { + "epoch": 1.6235294117647059, + "grad_norm": 15.10377311706543, + "learning_rate": 9.10576076683366e-07, + "loss": 4.255687236785889, + "step": 828 + }, + { + "epoch": 1.6254901960784314, + "grad_norm": 13.975825309753418, + "learning_rate": 9.014581195168726e-07, + "loss": 4.513431549072266, + "step": 829 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 55.94126510620117, + "learning_rate": 8.923815181263684e-07, + "loss": 4.227084159851074, + "step": 830 + }, + { + "epoch": 1.6294117647058823, + "grad_norm": 18.68338966369629, + "learning_rate": 8.83346364098061e-07, + "loss": 4.461234092712402, + "step": 831 + }, + { + "epoch": 1.6313725490196078, + "grad_norm": 78.93167877197266, + "learning_rate": 8.743527485999342e-07, + "loss": 4.154361724853516, + "step": 832 + }, + { + "epoch": 1.6333333333333333, + "grad_norm": 10.899136543273926, + "learning_rate": 8.654007623808335e-07, + "loss": 4.573888301849365, + "step": 833 + }, + { + "epoch": 1.6352941176470588, + "grad_norm": 27.798236846923828, + "learning_rate": 8.564904957695524e-07, + "loss": 4.1454854011535645, + "step": 834 + }, + { + "epoch": 1.6372549019607843, + "grad_norm": 36.72344207763672, + "learning_rate": 8.476220386739153e-07, + "loss": 4.31050443649292, + "step": 835 + }, + { + "epoch": 1.6392156862745098, + "grad_norm": 76.8628158569336, + "learning_rate": 8.387954805798748e-07, + "loss": 4.214484214782715, + "step": 836 + }, + { + "epoch": 1.6411764705882352, + "grad_norm": 87.1292953491211, + "learning_rate": 8.30010910550611e-07, + "loss": 4.35565185546875, + "step": 837 + }, + { + "epoch": 1.6431372549019607, + "grad_norm": 668.2094116210938, + "learning_rate": 8.212684172256219e-07, + "loss": 4.3396382331848145, + "step": 838 + }, + { + "epoch": 1.6450980392156862, + "grad_norm": 131.48623657226562, + "learning_rate": 8.125680888198395e-07, + "loss": 4.131406307220459, + "step": 839 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 26.730304718017578, + "learning_rate": 8.039100131227401e-07, + "loss": 4.347740650177002, + "step": 840 + }, + { + "epoch": 1.6490196078431372, + "grad_norm": 10.344943046569824, + "learning_rate": 7.95294277497452e-07, + "loss": 4.575904369354248, + "step": 841 + }, + { + "epoch": 1.6509803921568627, + "grad_norm": 8.698440551757812, + "learning_rate": 7.867209688798722e-07, + "loss": 4.042451858520508, + "step": 842 + }, + { + "epoch": 1.6529411764705881, + "grad_norm": 65.40000915527344, + "learning_rate": 7.781901737778014e-07, + "loss": 4.252891540527344, + "step": 843 + }, + { + "epoch": 1.6549019607843136, + "grad_norm": 1880.466064453125, + "learning_rate": 7.697019782700605e-07, + "loss": 4.086131572723389, + "step": 844 + }, + { + "epoch": 1.656862745098039, + "grad_norm": 12.000948905944824, + "learning_rate": 7.612564680056233e-07, + "loss": 4.185359001159668, + "step": 845 + }, + { + "epoch": 1.6588235294117646, + "grad_norm": 16.268617630004883, + "learning_rate": 7.52853728202756e-07, + "loss": 4.333435535430908, + "step": 846 + }, + { + "epoch": 1.66078431372549, + "grad_norm": 19.7327880859375, + "learning_rate": 7.444938436481547e-07, + "loss": 4.323735237121582, + "step": 847 + }, + { + "epoch": 1.6627450980392156, + "grad_norm": 20.108184814453125, + "learning_rate": 7.361768986960893e-07, + "loss": 3.988382339477539, + "step": 848 + }, + { + "epoch": 1.664705882352941, + "grad_norm": 35.86913299560547, + "learning_rate": 7.279029772675572e-07, + "loss": 3.8766672611236572, + "step": 849 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 215.09352111816406, + "learning_rate": 7.196721628494296e-07, + "loss": 4.467662811279297, + "step": 850 + }, + { + "epoch": 1.668627450980392, + "grad_norm": 14.033326148986816, + "learning_rate": 7.114845384936109e-07, + "loss": 3.823746681213379, + "step": 851 + }, + { + "epoch": 1.6705882352941175, + "grad_norm": 130.5819091796875, + "learning_rate": 7.033401868162071e-07, + "loss": 4.653918743133545, + "step": 852 + }, + { + "epoch": 1.6725490196078432, + "grad_norm": 7.852978229522705, + "learning_rate": 6.952391899966826e-07, + "loss": 4.3796491622924805, + "step": 853 + }, + { + "epoch": 1.6745098039215687, + "grad_norm": 14.399147987365723, + "learning_rate": 6.871816297770379e-07, + "loss": 3.9127392768859863, + "step": 854 + }, + { + "epoch": 1.6764705882352942, + "grad_norm": 276.7757873535156, + "learning_rate": 6.791675874609815e-07, + "loss": 3.9367387294769287, + "step": 855 + }, + { + "epoch": 1.6784313725490196, + "grad_norm": 23.932926177978516, + "learning_rate": 6.71197143913111e-07, + "loss": 4.5128278732299805, + "step": 856 + }, + { + "epoch": 1.6803921568627451, + "grad_norm": 39.19783401489258, + "learning_rate": 6.632703795580947e-07, + "loss": 4.048386096954346, + "step": 857 + }, + { + "epoch": 1.6823529411764706, + "grad_norm": 617.0408935546875, + "learning_rate": 6.553873743798678e-07, + "loss": 4.400557994842529, + "step": 858 + }, + { + "epoch": 1.684313725490196, + "grad_norm": 351.9881896972656, + "learning_rate": 6.475482079208112e-07, + "loss": 4.236886024475098, + "step": 859 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 12.667593955993652, + "learning_rate": 6.397529592809615e-07, + "loss": 4.2488203048706055, + "step": 860 + }, + { + "epoch": 1.688235294117647, + "grad_norm": 22.5399227142334, + "learning_rate": 6.320017071172114e-07, + "loss": 4.3423614501953125, + "step": 861 + }, + { + "epoch": 1.6901960784313725, + "grad_norm": 9.796163558959961, + "learning_rate": 6.242945296425074e-07, + "loss": 3.9263696670532227, + "step": 862 + }, + { + "epoch": 1.692156862745098, + "grad_norm": 33.729209899902344, + "learning_rate": 6.166315046250704e-07, + "loss": 4.36262321472168, + "step": 863 + }, + { + "epoch": 1.6941176470588235, + "grad_norm": 33.189064025878906, + "learning_rate": 6.090127093876058e-07, + "loss": 4.022284030914307, + "step": 864 + }, + { + "epoch": 1.696078431372549, + "grad_norm": 8.703564643859863, + "learning_rate": 6.014382208065234e-07, + "loss": 4.033292770385742, + "step": 865 + }, + { + "epoch": 1.6980392156862745, + "grad_norm": 532.3449096679688, + "learning_rate": 5.939081153111648e-07, + "loss": 4.381195545196533, + "step": 866 + }, + { + "epoch": 1.7, + "grad_norm": 17.51077651977539, + "learning_rate": 5.864224688830283e-07, + "loss": 4.547145843505859, + "step": 867 + }, + { + "epoch": 1.7019607843137254, + "grad_norm": 482.6360168457031, + "learning_rate": 5.789813570550052e-07, + "loss": 4.411568641662598, + "step": 868 + }, + { + "epoch": 1.7039215686274511, + "grad_norm": 90.01253509521484, + "learning_rate": 5.715848549106146e-07, + "loss": 4.115753173828125, + "step": 869 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 51.696807861328125, + "learning_rate": 5.642330370832521e-07, + "loss": 4.346105575561523, + "step": 870 + }, + { + "epoch": 1.707843137254902, + "grad_norm": 75.20115661621094, + "learning_rate": 5.569259777554287e-07, + "loss": 4.778619289398193, + "step": 871 + }, + { + "epoch": 1.7098039215686276, + "grad_norm": 12.626012802124023, + "learning_rate": 5.496637506580243e-07, + "loss": 3.989429473876953, + "step": 872 + }, + { + "epoch": 1.711764705882353, + "grad_norm": 11.204209327697754, + "learning_rate": 5.424464290695497e-07, + "loss": 3.9180917739868164, + "step": 873 + }, + { + "epoch": 1.7137254901960786, + "grad_norm": 24.38224220275879, + "learning_rate": 5.352740858154009e-07, + "loss": 4.300451278686523, + "step": 874 + }, + { + "epoch": 1.715686274509804, + "grad_norm": 8773.572265625, + "learning_rate": 5.281467932671253e-07, + "loss": 4.360892295837402, + "step": 875 + }, + { + "epoch": 1.7176470588235295, + "grad_norm": 5.854603290557861, + "learning_rate": 5.210646233416933e-07, + "loss": 4.384590148925781, + "step": 876 + }, + { + "epoch": 1.719607843137255, + "grad_norm": 35.005653381347656, + "learning_rate": 5.140276475007711e-07, + "loss": 4.532591819763184, + "step": 877 + }, + { + "epoch": 1.7215686274509805, + "grad_norm": 6.363423824310303, + "learning_rate": 5.070359367499994e-07, + "loss": 4.252459526062012, + "step": 878 + }, + { + "epoch": 1.723529411764706, + "grad_norm": 25.73122787475586, + "learning_rate": 5.000895616382829e-07, + "loss": 4.175426483154297, + "step": 879 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 40.31061553955078, + "learning_rate": 4.931885922570645e-07, + "loss": 4.50531005859375, + "step": 880 + }, + { + "epoch": 1.727450980392157, + "grad_norm": 86.81022644042969, + "learning_rate": 4.86333098239632e-07, + "loss": 4.1436638832092285, + "step": 881 + }, + { + "epoch": 1.7294117647058824, + "grad_norm": 47.35791015625, + "learning_rate": 4.795231487604124e-07, + "loss": 4.5163726806640625, + "step": 882 + }, + { + "epoch": 1.731372549019608, + "grad_norm": 19.143882751464844, + "learning_rate": 4.727588125342669e-07, + "loss": 4.077076435089111, + "step": 883 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 46.4240608215332, + "learning_rate": 4.660401578158053e-07, + "loss": 4.224460601806641, + "step": 884 + }, + { + "epoch": 1.7352941176470589, + "grad_norm": 522.8136596679688, + "learning_rate": 4.5936725239869364e-07, + "loss": 4.482194423675537, + "step": 885 + }, + { + "epoch": 1.7372549019607844, + "grad_norm": 2674.419677734375, + "learning_rate": 4.527401636149703e-07, + "loss": 3.8143420219421387, + "step": 886 + }, + { + "epoch": 1.7392156862745098, + "grad_norm": 72.95992279052734, + "learning_rate": 4.4615895833436784e-07, + "loss": 4.43809175491333, + "step": 887 + }, + { + "epoch": 1.7411764705882353, + "grad_norm": 31.746112823486328, + "learning_rate": 4.396237029636385e-07, + "loss": 3.784120798110962, + "step": 888 + }, + { + "epoch": 1.7431372549019608, + "grad_norm": 718.4949340820312, + "learning_rate": 4.3313446344588117e-07, + "loss": 4.468380928039551, + "step": 889 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 45.256126403808594, + "learning_rate": 4.266913052598792e-07, + "loss": 4.18980598449707, + "step": 890 + }, + { + "epoch": 1.7470588235294118, + "grad_norm": 18.33076286315918, + "learning_rate": 4.2029429341943983e-07, + "loss": 4.076150417327881, + "step": 891 + }, + { + "epoch": 1.7490196078431373, + "grad_norm": 20.862789154052734, + "learning_rate": 4.139434924727359e-07, + "loss": 3.7962851524353027, + "step": 892 + }, + { + "epoch": 1.7509803921568627, + "grad_norm": 9.703033447265625, + "learning_rate": 4.0763896650165227e-07, + "loss": 3.898289680480957, + "step": 893 + }, + { + "epoch": 1.7529411764705882, + "grad_norm": 22.09119987487793, + "learning_rate": 4.0138077912114824e-07, + "loss": 4.499690532684326, + "step": 894 + }, + { + "epoch": 1.7549019607843137, + "grad_norm": 20.46513557434082, + "learning_rate": 3.951689934786068e-07, + "loss": 4.488393783569336, + "step": 895 + }, + { + "epoch": 1.7568627450980392, + "grad_norm": 63.77817916870117, + "learning_rate": 3.8900367225320036e-07, + "loss": 4.285112380981445, + "step": 896 + }, + { + "epoch": 1.7588235294117647, + "grad_norm": 6.742626190185547, + "learning_rate": 3.828848776552596e-07, + "loss": 4.3112382888793945, + "step": 897 + }, + { + "epoch": 1.7607843137254902, + "grad_norm": 15.83242416381836, + "learning_rate": 3.768126714256437e-07, + "loss": 4.37016487121582, + "step": 898 + }, + { + "epoch": 1.7627450980392156, + "grad_norm": 111.3309555053711, + "learning_rate": 3.7078711483511833e-07, + "loss": 3.872631549835205, + "step": 899 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 30.7449951171875, + "learning_rate": 3.648082686837395e-07, + "loss": 3.9214181900024414, + "step": 900 + }, + { + "epoch": 1.7666666666666666, + "grad_norm": 14.394401550292969, + "learning_rate": 3.588761933002344e-07, + "loss": 4.099413871765137, + "step": 901 + }, + { + "epoch": 1.768627450980392, + "grad_norm": 70.64187622070312, + "learning_rate": 3.529909485413968e-07, + "loss": 4.350872039794922, + "step": 902 + }, + { + "epoch": 1.7705882352941176, + "grad_norm": 29.170866012573242, + "learning_rate": 3.4715259379148656e-07, + "loss": 4.229859352111816, + "step": 903 + }, + { + "epoch": 1.772549019607843, + "grad_norm": 22.33905029296875, + "learning_rate": 3.413611879616219e-07, + "loss": 4.710308074951172, + "step": 904 + }, + { + "epoch": 1.7745098039215685, + "grad_norm": 60.61310958862305, + "learning_rate": 3.3561678948919094e-07, + "loss": 4.223546028137207, + "step": 905 + }, + { + "epoch": 1.776470588235294, + "grad_norm": 68.75520324707031, + "learning_rate": 3.299194563372604e-07, + "loss": 4.672816276550293, + "step": 906 + }, + { + "epoch": 1.7784313725490195, + "grad_norm": 788.5811767578125, + "learning_rate": 3.2426924599399056e-07, + "loss": 4.218260288238525, + "step": 907 + }, + { + "epoch": 1.780392156862745, + "grad_norm": 28.486061096191406, + "learning_rate": 3.186662154720549e-07, + "loss": 3.965388774871826, + "step": 908 + }, + { + "epoch": 1.7823529411764705, + "grad_norm": 31.833799362182617, + "learning_rate": 3.131104213080688e-07, + "loss": 4.031269550323486, + "step": 909 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 377.443359375, + "learning_rate": 3.076019195620111e-07, + "loss": 4.190829753875732, + "step": 910 + }, + { + "epoch": 1.7862745098039214, + "grad_norm": 1760.9605712890625, + "learning_rate": 3.0214076581666364e-07, + "loss": 4.345890998840332, + "step": 911 + }, + { + "epoch": 1.788235294117647, + "grad_norm": 23.542831420898438, + "learning_rate": 2.9672701517705404e-07, + "loss": 4.38871955871582, + "step": 912 + }, + { + "epoch": 1.7901960784313724, + "grad_norm": 70.62339782714844, + "learning_rate": 2.9136072226989054e-07, + "loss": 4.383261680603027, + "step": 913 + }, + { + "epoch": 1.792156862745098, + "grad_norm": 30.117904663085938, + "learning_rate": 2.8604194124301654e-07, + "loss": 4.235455513000488, + "step": 914 + }, + { + "epoch": 1.7941176470588234, + "grad_norm": 644.7010498046875, + "learning_rate": 2.807707257648662e-07, + "loss": 4.087576866149902, + "step": 915 + }, + { + "epoch": 1.7960784313725489, + "grad_norm": 80.14166259765625, + "learning_rate": 2.7554712902391647e-07, + "loss": 3.9834418296813965, + "step": 916 + }, + { + "epoch": 1.7980392156862746, + "grad_norm": 24.340307235717773, + "learning_rate": 2.703712037281564e-07, + "loss": 3.9975290298461914, + "step": 917 + }, + { + "epoch": 1.8, + "grad_norm": 10.314132690429688, + "learning_rate": 2.65243002104551e-07, + "loss": 4.420810222625732, + "step": 918 + }, + { + "epoch": 1.8019607843137255, + "grad_norm": 11.760737419128418, + "learning_rate": 2.6016257589851825e-07, + "loss": 3.9548559188842773, + "step": 919 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 6.947646617889404, + "learning_rate": 2.551299763734011e-07, + "loss": 4.306017875671387, + "step": 920 + }, + { + "epoch": 1.8058823529411765, + "grad_norm": 214.57057189941406, + "learning_rate": 2.5014525430995915e-07, + "loss": 4.3000946044921875, + "step": 921 + }, + { + "epoch": 1.807843137254902, + "grad_norm": 18.068117141723633, + "learning_rate": 2.4520846000584795e-07, + "loss": 4.083752632141113, + "step": 922 + }, + { + "epoch": 1.8098039215686275, + "grad_norm": 41.73397445678711, + "learning_rate": 2.403196432751131e-07, + "loss": 3.7975006103515625, + "step": 923 + }, + { + "epoch": 1.811764705882353, + "grad_norm": 13.082367897033691, + "learning_rate": 2.354788534476915e-07, + "loss": 4.3696441650390625, + "step": 924 + }, + { + "epoch": 1.8137254901960784, + "grad_norm": 6.786513328552246, + "learning_rate": 2.306861393689114e-07, + "loss": 4.202939033508301, + "step": 925 + }, + { + "epoch": 1.815686274509804, + "grad_norm": 13.234094619750977, + "learning_rate": 2.2594154939899805e-07, + "loss": 4.239182949066162, + "step": 926 + }, + { + "epoch": 1.8176470588235294, + "grad_norm": 33.10063934326172, + "learning_rate": 2.2124513141258574e-07, + "loss": 4.172329425811768, + "step": 927 + }, + { + "epoch": 1.8196078431372549, + "grad_norm": 52.010643005371094, + "learning_rate": 2.1659693279823923e-07, + "loss": 4.38886833190918, + "step": 928 + }, + { + "epoch": 1.8215686274509804, + "grad_norm": 21.009357452392578, + "learning_rate": 2.1199700045797077e-07, + "loss": 4.021790981292725, + "step": 929 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 11.467341423034668, + "learning_rate": 2.0744538080676669e-07, + "loss": 4.197271347045898, + "step": 930 + }, + { + "epoch": 1.8254901960784313, + "grad_norm": 25.42687225341797, + "learning_rate": 2.0294211977212318e-07, + "loss": 4.264697551727295, + "step": 931 + }, + { + "epoch": 1.8274509803921568, + "grad_norm": 9550.0322265625, + "learning_rate": 1.9848726279357966e-07, + "loss": 4.283550262451172, + "step": 932 + }, + { + "epoch": 1.8294117647058825, + "grad_norm": 927.8634643554688, + "learning_rate": 1.9408085482225946e-07, + "loss": 4.379736423492432, + "step": 933 + }, + { + "epoch": 1.831372549019608, + "grad_norm": 43.04875564575195, + "learning_rate": 1.8972294032042092e-07, + "loss": 4.642847061157227, + "step": 934 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 237.4536895751953, + "learning_rate": 1.8541356326100436e-07, + "loss": 4.4732513427734375, + "step": 935 + }, + { + "epoch": 1.835294117647059, + "grad_norm": 41.125755310058594, + "learning_rate": 1.8115276712718622e-07, + "loss": 4.2745161056518555, + "step": 936 + }, + { + "epoch": 1.8372549019607844, + "grad_norm": 24.50762939453125, + "learning_rate": 1.7694059491195014e-07, + "loss": 4.4691877365112305, + "step": 937 + }, + { + "epoch": 1.83921568627451, + "grad_norm": 244.0862274169922, + "learning_rate": 1.7277708911764223e-07, + "loss": 4.659967422485352, + "step": 938 + }, + { + "epoch": 1.8411764705882354, + "grad_norm": 49.2494010925293, + "learning_rate": 1.686622917555475e-07, + "loss": 4.281001091003418, + "step": 939 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 33.747833251953125, + "learning_rate": 1.645962443454663e-07, + "loss": 4.207096576690674, + "step": 940 + }, + { + "epoch": 1.8450980392156864, + "grad_norm": 150.17184448242188, + "learning_rate": 1.6057898791529303e-07, + "loss": 4.32537841796875, + "step": 941 + }, + { + "epoch": 1.8470588235294119, + "grad_norm": 5.460432529449463, + "learning_rate": 1.5661056300060428e-07, + "loss": 4.553836822509766, + "step": 942 + }, + { + "epoch": 1.8490196078431373, + "grad_norm": 1245.7965087890625, + "learning_rate": 1.526910096442491e-07, + "loss": 3.7847492694854736, + "step": 943 + }, + { + "epoch": 1.8509803921568628, + "grad_norm": 88.72211456298828, + "learning_rate": 1.4882036739594374e-07, + "loss": 4.1381001472473145, + "step": 944 + }, + { + "epoch": 1.8529411764705883, + "grad_norm": 14.506940841674805, + "learning_rate": 1.4499867531187372e-07, + "loss": 4.168397903442383, + "step": 945 + }, + { + "epoch": 1.8549019607843138, + "grad_norm": 40.451839447021484, + "learning_rate": 1.4122597195430077e-07, + "loss": 4.074184894561768, + "step": 946 + }, + { + "epoch": 1.8568627450980393, + "grad_norm": 1232.6160888671875, + "learning_rate": 1.3750229539117143e-07, + "loss": 4.445058822631836, + "step": 947 + }, + { + "epoch": 1.8588235294117648, + "grad_norm": 6.072132110595703, + "learning_rate": 1.3382768319573525e-07, + "loss": 4.521212100982666, + "step": 948 + }, + { + "epoch": 1.8607843137254902, + "grad_norm": 67.43428802490234, + "learning_rate": 1.3020217244616273e-07, + "loss": 4.11070442199707, + "step": 949 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 51.582584381103516, + "learning_rate": 1.2662579972517463e-07, + "loss": 3.98960018157959, + "step": 950 + }, + { + "epoch": 1.8647058823529412, + "grad_norm": 2543.5771484375, + "learning_rate": 1.2309860111967053e-07, + "loss": 3.894416093826294, + "step": 951 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 17.300437927246094, + "learning_rate": 1.196206122203647e-07, + "loss": 4.141676902770996, + "step": 952 + }, + { + "epoch": 1.8686274509803922, + "grad_norm": 1173.831298828125, + "learning_rate": 1.1619186812142858e-07, + "loss": 4.419776439666748, + "step": 953 + }, + { + "epoch": 1.8705882352941177, + "grad_norm": 91.50953674316406, + "learning_rate": 1.1281240342013444e-07, + "loss": 4.3769402503967285, + "step": 954 + }, + { + "epoch": 1.8725490196078431, + "grad_norm": 15.492836952209473, + "learning_rate": 1.0948225221651009e-07, + "loss": 4.41178035736084, + "step": 955 + }, + { + "epoch": 1.8745098039215686, + "grad_norm": 23.23512077331543, + "learning_rate": 1.0620144811299027e-07, + "loss": 4.478861331939697, + "step": 956 + }, + { + "epoch": 1.8764705882352941, + "grad_norm": 17.37805938720703, + "learning_rate": 1.0297002421407798e-07, + "loss": 3.897634506225586, + "step": 957 + }, + { + "epoch": 1.8784313725490196, + "grad_norm": 8.621871948242188, + "learning_rate": 9.978801312601538e-08, + "loss": 4.658677101135254, + "step": 958 + }, + { + "epoch": 1.880392156862745, + "grad_norm": 34.825897216796875, + "learning_rate": 9.665544695645013e-08, + "loss": 4.049420356750488, + "step": 959 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 424.513916015625, + "learning_rate": 9.357235731411174e-08, + "loss": 4.394476413726807, + "step": 960 + }, + { + "epoch": 1.884313725490196, + "grad_norm": 56.41450881958008, + "learning_rate": 9.053877530849465e-08, + "loss": 4.252983093261719, + "step": 961 + }, + { + "epoch": 1.8862745098039215, + "grad_norm": 8.703697204589844, + "learning_rate": 8.755473154954342e-08, + "loss": 4.242513656616211, + "step": 962 + }, + { + "epoch": 1.888235294117647, + "grad_norm": 21.02497100830078, + "learning_rate": 8.462025614734193e-08, + "loss": 3.881922483444214, + "step": 963 + }, + { + "epoch": 1.8901960784313725, + "grad_norm": 30.859045028686523, + "learning_rate": 8.173537871181413e-08, + "loss": 4.2897515296936035, + "step": 964 + }, + { + "epoch": 1.892156862745098, + "grad_norm": 27.57143783569336, + "learning_rate": 7.890012835242045e-08, + "loss": 4.476958274841309, + "step": 965 + }, + { + "epoch": 1.8941176470588235, + "grad_norm": 26.67787742614746, + "learning_rate": 7.61145336778657e-08, + "loss": 4.516484260559082, + "step": 966 + }, + { + "epoch": 1.896078431372549, + "grad_norm": 88.0553970336914, + "learning_rate": 7.337862279581332e-08, + "loss": 4.290770530700684, + "step": 967 + }, + { + "epoch": 1.8980392156862744, + "grad_norm": 74.33447265625, + "learning_rate": 7.069242331259719e-08, + "loss": 4.163839340209961, + "step": 968 + }, + { + "epoch": 1.9, + "grad_norm": 167.7811737060547, + "learning_rate": 6.805596233294576e-08, + "loss": 4.354069232940674, + "step": 969 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 52.53337097167969, + "learning_rate": 6.546926645970675e-08, + "loss": 4.376225471496582, + "step": 970 + }, + { + "epoch": 1.9039215686274509, + "grad_norm": 20.528648376464844, + "learning_rate": 6.293236179358175e-08, + "loss": 4.402164936065674, + "step": 971 + }, + { + "epoch": 1.9058823529411764, + "grad_norm": 62.334659576416016, + "learning_rate": 6.044527393286037e-08, + "loss": 4.0689191818237305, + "step": 972 + }, + { + "epoch": 1.9078431372549018, + "grad_norm": 31.04817771911621, + "learning_rate": 5.800802797316152e-08, + "loss": 3.3812990188598633, + "step": 973 + }, + { + "epoch": 1.9098039215686273, + "grad_norm": 12.806366920471191, + "learning_rate": 5.5620648507182516e-08, + "loss": 4.259305477142334, + "step": 974 + }, + { + "epoch": 1.9117647058823528, + "grad_norm": 30.618995666503906, + "learning_rate": 5.3283159624448745e-08, + "loss": 4.314586162567139, + "step": 975 + }, + { + "epoch": 1.9137254901960783, + "grad_norm": 20.089120864868164, + "learning_rate": 5.09955849110727e-08, + "loss": 4.420983791351318, + "step": 976 + }, + { + "epoch": 1.9156862745098038, + "grad_norm": 9.239790916442871, + "learning_rate": 4.875794744951423e-08, + "loss": 4.013869762420654, + "step": 977 + }, + { + "epoch": 1.9176470588235293, + "grad_norm": 12.351265907287598, + "learning_rate": 4.657026981834623e-08, + "loss": 4.202646255493164, + "step": 978 + }, + { + "epoch": 1.9196078431372547, + "grad_norm": 103.25799560546875, + "learning_rate": 4.443257409203206e-08, + "loss": 4.95119571685791, + "step": 979 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 18.508987426757812, + "learning_rate": 4.2344881840697404e-08, + "loss": 4.172933578491211, + "step": 980 + }, + { + "epoch": 1.923529411764706, + "grad_norm": 44.97269058227539, + "learning_rate": 4.0307214129914896e-08, + "loss": 4.063309192657471, + "step": 981 + }, + { + "epoch": 1.9254901960784314, + "grad_norm": 46.690128326416016, + "learning_rate": 3.8319591520492025e-08, + "loss": 4.745169162750244, + "step": 982 + }, + { + "epoch": 1.927450980392157, + "grad_norm": 96.9118423461914, + "learning_rate": 3.638203406826302e-08, + "loss": 4.313328266143799, + "step": 983 + }, + { + "epoch": 1.9294117647058824, + "grad_norm": 270.73638916015625, + "learning_rate": 3.449456132388562e-08, + "loss": 4.73173713684082, + "step": 984 + }, + { + "epoch": 1.9313725490196079, + "grad_norm": 31.75279998779297, + "learning_rate": 3.265719233264575e-08, + "loss": 4.4036407470703125, + "step": 985 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 57.023155212402344, + "learning_rate": 3.086994563426371e-08, + "loss": 4.6626739501953125, + "step": 986 + }, + { + "epoch": 1.9352941176470588, + "grad_norm": 24.05242919921875, + "learning_rate": 2.9132839262707714e-08, + "loss": 4.355752944946289, + "step": 987 + }, + { + "epoch": 1.9372549019607843, + "grad_norm": 110.74296569824219, + "learning_rate": 2.7445890746011782e-08, + "loss": 4.365067005157471, + "step": 988 + }, + { + "epoch": 1.9392156862745098, + "grad_norm": 107.20745849609375, + "learning_rate": 2.5809117106099235e-08, + "loss": 4.463684558868408, + "step": 989 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 19.856996536254883, + "learning_rate": 2.4222534858610036e-08, + "loss": 4.137124061584473, + "step": 990 + }, + { + "epoch": 1.9431372549019608, + "grad_norm": 19.380189895629883, + "learning_rate": 2.2686160012735935e-08, + "loss": 4.621649265289307, + "step": 991 + }, + { + "epoch": 1.9450980392156862, + "grad_norm": 8.958719253540039, + "learning_rate": 2.120000807105671e-08, + "loss": 4.091302871704102, + "step": 992 + }, + { + "epoch": 1.9470588235294117, + "grad_norm": 231.63558959960938, + "learning_rate": 1.9764094029385285e-08, + "loss": 4.132928848266602, + "step": 993 + }, + { + "epoch": 1.9490196078431372, + "grad_norm": 13.57322883605957, + "learning_rate": 1.837843237661563e-08, + "loss": 3.730222702026367, + "step": 994 + }, + { + "epoch": 1.9509803921568627, + "grad_norm": 15.670381546020508, + "learning_rate": 1.704303709457733e-08, + "loss": 3.905336856842041, + "step": 995 + }, + { + "epoch": 1.9529411764705882, + "grad_norm": 9.398954391479492, + "learning_rate": 1.5757921657892915e-08, + "loss": 4.584706783294678, + "step": 996 + }, + { + "epoch": 1.9549019607843139, + "grad_norm": 21.601640701293945, + "learning_rate": 1.4523099033845189e-08, + "loss": 4.310138702392578, + "step": 997 + }, + { + "epoch": 1.9568627450980394, + "grad_norm": 13.652542114257812, + "learning_rate": 1.333858168224178e-08, + "loss": 4.348342418670654, + "step": 998 + }, + { + "epoch": 1.9588235294117649, + "grad_norm": 327.3991394042969, + "learning_rate": 1.2204381555293021e-08, + "loss": 4.026599884033203, + "step": 999 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 40.3809814453125, + "learning_rate": 1.1120510097490933e-08, + "loss": 4.513772964477539, + "step": 1000 + }, + { + "epoch": 1.9627450980392158, + "grad_norm": 285.50177001953125, + "learning_rate": 1.0086978245490986e-08, + "loss": 4.170161247253418, + "step": 1001 + }, + { + "epoch": 1.9647058823529413, + "grad_norm": 18.863910675048828, + "learning_rate": 9.103796428006074e-09, + "loss": 4.978179931640625, + "step": 1002 + }, + { + "epoch": 1.9666666666666668, + "grad_norm": 14.770316123962402, + "learning_rate": 8.1709745656966e-09, + "loss": 3.7664594650268555, + "step": 1003 + }, + { + "epoch": 1.9686274509803923, + "grad_norm": 296.1942138671875, + "learning_rate": 7.288522071074999e-09, + "loss": 4.17707633972168, + "step": 1004 + }, + { + "epoch": 1.9705882352941178, + "grad_norm": 12.930943489074707, + "learning_rate": 6.4564478484069326e-09, + "loss": 4.778520584106445, + "step": 1005 + }, + { + "epoch": 1.9725490196078432, + "grad_norm": 12.585882186889648, + "learning_rate": 5.6747602936230166e-09, + "loss": 4.420896530151367, + "step": 1006 + }, + { + "epoch": 1.9745098039215687, + "grad_norm": 206.85272216796875, + "learning_rate": 4.9434672942355595e-09, + "loss": 4.377656936645508, + "step": 1007 + }, + { + "epoch": 1.9764705882352942, + "grad_norm": 6.57910680770874, + "learning_rate": 4.2625762292553e-09, + "loss": 3.997027635574341, + "step": 1008 + }, + { + "epoch": 1.9784313725490197, + "grad_norm": 20.324464797973633, + "learning_rate": 3.632093969121453e-09, + "loss": 3.9947962760925293, + "step": 1009 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 13.75621223449707, + "learning_rate": 3.0520268756284444e-09, + "loss": 4.60496187210083, + "step": 1010 + }, + { + "epoch": 1.9823529411764707, + "grad_norm": 19.593032836914062, + "learning_rate": 2.522380801863733e-09, + "loss": 4.289256572723389, + "step": 1011 + }, + { + "epoch": 1.9843137254901961, + "grad_norm": 19.958444595336914, + "learning_rate": 2.043161092148971e-09, + "loss": 4.73214054107666, + "step": 1012 + }, + { + "epoch": 1.9862745098039216, + "grad_norm": 38.842655181884766, + "learning_rate": 1.6143725819850465e-09, + "loss": 4.757114410400391, + "step": 1013 + }, + { + "epoch": 1.988235294117647, + "grad_norm": 161.94960021972656, + "learning_rate": 1.2360195980032351e-09, + "loss": 4.048542022705078, + "step": 1014 + }, + { + "epoch": 1.9901960784313726, + "grad_norm": 272.5787048339844, + "learning_rate": 9.081059579235662e-10, + "loss": 4.251223564147949, + "step": 1015 + }, + { + "epoch": 1.992156862745098, + "grad_norm": 32.76935958862305, + "learning_rate": 6.306349705126335e-10, + "loss": 4.2266130447387695, + "step": 1016 + }, + { + "epoch": 1.9941176470588236, + "grad_norm": 12.375147819519043, + "learning_rate": 4.036094355541753e-10, + "loss": 4.641788959503174, + "step": 1017 + }, + { + "epoch": 1.996078431372549, + "grad_norm": 127.43289184570312, + "learning_rate": 2.2703164381743248e-10, + "loss": 4.508474826812744, + "step": 1018 + }, + { + "epoch": 1.9980392156862745, + "grad_norm": 30.30124855041504, + "learning_rate": 1.0090337703771991e-10, + "loss": 4.436887741088867, + "step": 1019 + }, + { + "epoch": 2.0, + "grad_norm": 572.6058349609375, + "learning_rate": 2.522590789422186e-11, + "loss": 4.0800886154174805, + "step": 1020 + } + ], + "logging_steps": 1, + "max_steps": 1020, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.829783862596862e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}