gyung's picture
Upload Gemma 4 31B base Terminal SFT Native Liquid 1Epoch
f64af54 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 510,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00196078431372549,
"grad_norm": 53193.03125,
"learning_rate": 0.0,
"loss": 12.563469886779785,
"step": 1
},
{
"epoch": 0.00392156862745098,
"grad_norm": 65318.5546875,
"learning_rate": 3.2258064516129035e-07,
"loss": 11.076333999633789,
"step": 2
},
{
"epoch": 0.0058823529411764705,
"grad_norm": 53597.85546875,
"learning_rate": 6.451612903225807e-07,
"loss": 13.65261459350586,
"step": 3
},
{
"epoch": 0.00784313725490196,
"grad_norm": 59171.875,
"learning_rate": 9.67741935483871e-07,
"loss": 15.897336959838867,
"step": 4
},
{
"epoch": 0.00980392156862745,
"grad_norm": 58358.25390625,
"learning_rate": 1.2903225806451614e-06,
"loss": 13.228803634643555,
"step": 5
},
{
"epoch": 0.011764705882352941,
"grad_norm": 33576.91796875,
"learning_rate": 1.6129032258064516e-06,
"loss": 12.043094635009766,
"step": 6
},
{
"epoch": 0.013725490196078431,
"grad_norm": 40702.50390625,
"learning_rate": 1.935483870967742e-06,
"loss": 9.755388259887695,
"step": 7
},
{
"epoch": 0.01568627450980392,
"grad_norm": 15023.3505859375,
"learning_rate": 2.2580645161290324e-06,
"loss": 11.674318313598633,
"step": 8
},
{
"epoch": 0.01764705882352941,
"grad_norm": 11325.5283203125,
"learning_rate": 2.580645161290323e-06,
"loss": 10.191791534423828,
"step": 9
},
{
"epoch": 0.0196078431372549,
"grad_norm": 10106.4345703125,
"learning_rate": 2.903225806451613e-06,
"loss": 9.150543212890625,
"step": 10
},
{
"epoch": 0.021568627450980392,
"grad_norm": 27759.919921875,
"learning_rate": 3.225806451612903e-06,
"loss": 9.193891525268555,
"step": 11
},
{
"epoch": 0.023529411764705882,
"grad_norm": 17614.66796875,
"learning_rate": 3.548387096774194e-06,
"loss": 8.195146560668945,
"step": 12
},
{
"epoch": 0.025490196078431372,
"grad_norm": 69656.46875,
"learning_rate": 3.870967741935484e-06,
"loss": 9.301647186279297,
"step": 13
},
{
"epoch": 0.027450980392156862,
"grad_norm": 12202.4873046875,
"learning_rate": 4.193548387096774e-06,
"loss": 9.306933403015137,
"step": 14
},
{
"epoch": 0.029411764705882353,
"grad_norm": 15527.83984375,
"learning_rate": 4.516129032258065e-06,
"loss": 11.867281913757324,
"step": 15
},
{
"epoch": 0.03137254901960784,
"grad_norm": 9547.95703125,
"learning_rate": 4.838709677419355e-06,
"loss": 7.4255876541137695,
"step": 16
},
{
"epoch": 0.03333333333333333,
"grad_norm": 7850.044921875,
"learning_rate": 5.161290322580646e-06,
"loss": 7.318220138549805,
"step": 17
},
{
"epoch": 0.03529411764705882,
"grad_norm": 19678.12109375,
"learning_rate": 5.483870967741935e-06,
"loss": 8.5791015625,
"step": 18
},
{
"epoch": 0.03725490196078431,
"grad_norm": 7771.2578125,
"learning_rate": 5.806451612903226e-06,
"loss": 7.246687412261963,
"step": 19
},
{
"epoch": 0.0392156862745098,
"grad_norm": 279435.96875,
"learning_rate": 6.129032258064517e-06,
"loss": 8.958134651184082,
"step": 20
},
{
"epoch": 0.041176470588235294,
"grad_norm": 7213.25341796875,
"learning_rate": 6.451612903225806e-06,
"loss": 6.607357025146484,
"step": 21
},
{
"epoch": 0.043137254901960784,
"grad_norm": 6548.99267578125,
"learning_rate": 6.774193548387097e-06,
"loss": 7.45709228515625,
"step": 22
},
{
"epoch": 0.045098039215686274,
"grad_norm": 6222.45751953125,
"learning_rate": 7.096774193548388e-06,
"loss": 6.48307991027832,
"step": 23
},
{
"epoch": 0.047058823529411764,
"grad_norm": 2163.545654296875,
"learning_rate": 7.4193548387096784e-06,
"loss": 6.221013069152832,
"step": 24
},
{
"epoch": 0.049019607843137254,
"grad_norm": 2340.748779296875,
"learning_rate": 7.741935483870968e-06,
"loss": 6.952737808227539,
"step": 25
},
{
"epoch": 0.050980392156862744,
"grad_norm": 4872.9873046875,
"learning_rate": 8.064516129032258e-06,
"loss": 7.317384719848633,
"step": 26
},
{
"epoch": 0.052941176470588235,
"grad_norm": 1415.2637939453125,
"learning_rate": 8.387096774193549e-06,
"loss": 7.906464576721191,
"step": 27
},
{
"epoch": 0.054901960784313725,
"grad_norm": 1782.3765869140625,
"learning_rate": 8.70967741935484e-06,
"loss": 7.230937957763672,
"step": 28
},
{
"epoch": 0.056862745098039215,
"grad_norm": 1727.0596923828125,
"learning_rate": 9.03225806451613e-06,
"loss": 6.246130466461182,
"step": 29
},
{
"epoch": 0.058823529411764705,
"grad_norm": 967.6840209960938,
"learning_rate": 9.35483870967742e-06,
"loss": 7.539262294769287,
"step": 30
},
{
"epoch": 0.060784313725490195,
"grad_norm": 7889.62060546875,
"learning_rate": 9.67741935483871e-06,
"loss": 6.116678237915039,
"step": 31
},
{
"epoch": 0.06274509803921569,
"grad_norm": 7365.2177734375,
"learning_rate": 1e-05,
"loss": 6.007411956787109,
"step": 32
},
{
"epoch": 0.06470588235294118,
"grad_norm": 341.2566833496094,
"learning_rate": 9.999974774092107e-06,
"loss": 5.7740020751953125,
"step": 33
},
{
"epoch": 0.06666666666666667,
"grad_norm": 1897.9654541015625,
"learning_rate": 9.999899096622962e-06,
"loss": 7.221264839172363,
"step": 34
},
{
"epoch": 0.06862745098039216,
"grad_norm": 2535.89208984375,
"learning_rate": 9.999772968356182e-06,
"loss": 6.483910083770752,
"step": 35
},
{
"epoch": 0.07058823529411765,
"grad_norm": 472.5924377441406,
"learning_rate": 9.999596390564446e-06,
"loss": 6.1053571701049805,
"step": 36
},
{
"epoch": 0.07254901960784314,
"grad_norm": 744.4826049804688,
"learning_rate": 9.999369365029487e-06,
"loss": 6.1797099113464355,
"step": 37
},
{
"epoch": 0.07450980392156863,
"grad_norm": 487.4558410644531,
"learning_rate": 9.999091894042077e-06,
"loss": 6.281297206878662,
"step": 38
},
{
"epoch": 0.07647058823529412,
"grad_norm": 1321.0650634765625,
"learning_rate": 9.998763980401997e-06,
"loss": 5.605555534362793,
"step": 39
},
{
"epoch": 0.0784313725490196,
"grad_norm": 1479.992919921875,
"learning_rate": 9.998385627418015e-06,
"loss": 5.528350830078125,
"step": 40
},
{
"epoch": 0.0803921568627451,
"grad_norm": 408.816650390625,
"learning_rate": 9.997956838907853e-06,
"loss": 7.773702144622803,
"step": 41
},
{
"epoch": 0.08235294117647059,
"grad_norm": 137.47793579101562,
"learning_rate": 9.997477619198138e-06,
"loss": 5.883761405944824,
"step": 42
},
{
"epoch": 0.08431372549019608,
"grad_norm": 613.70458984375,
"learning_rate": 9.996947973124372e-06,
"loss": 5.855252265930176,
"step": 43
},
{
"epoch": 0.08627450980392157,
"grad_norm": 272.45184326171875,
"learning_rate": 9.996367906030879e-06,
"loss": 6.615945816040039,
"step": 44
},
{
"epoch": 0.08823529411764706,
"grad_norm": 89.09500122070312,
"learning_rate": 9.995737423770746e-06,
"loss": 5.607078552246094,
"step": 45
},
{
"epoch": 0.09019607843137255,
"grad_norm": 313.746337890625,
"learning_rate": 9.995056532705766e-06,
"loss": 4.953719139099121,
"step": 46
},
{
"epoch": 0.09215686274509804,
"grad_norm": 4434.6171875,
"learning_rate": 9.994325239706377e-06,
"loss": 5.135598182678223,
"step": 47
},
{
"epoch": 0.09411764705882353,
"grad_norm": 305.1719055175781,
"learning_rate": 9.993543552151594e-06,
"loss": 5.021300792694092,
"step": 48
},
{
"epoch": 0.09607843137254903,
"grad_norm": 9320.9189453125,
"learning_rate": 9.992711477928925e-06,
"loss": 5.3348188400268555,
"step": 49
},
{
"epoch": 0.09803921568627451,
"grad_norm": 4146.0498046875,
"learning_rate": 9.991829025434305e-06,
"loss": 4.429060935974121,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 15375.9609375,
"learning_rate": 9.990896203571994e-06,
"loss": 5.005645275115967,
"step": 51
},
{
"epoch": 0.10196078431372549,
"grad_norm": 200.52828979492188,
"learning_rate": 9.98991302175451e-06,
"loss": 4.591862201690674,
"step": 52
},
{
"epoch": 0.10392156862745099,
"grad_norm": 778.8656616210938,
"learning_rate": 9.98887948990251e-06,
"loss": 4.922955513000488,
"step": 53
},
{
"epoch": 0.10588235294117647,
"grad_norm": 338.90618896484375,
"learning_rate": 9.987795618444707e-06,
"loss": 5.766139507293701,
"step": 54
},
{
"epoch": 0.10784313725490197,
"grad_norm": 2074.159912109375,
"learning_rate": 9.986661418317759e-06,
"loss": 4.315250873565674,
"step": 55
},
{
"epoch": 0.10980392156862745,
"grad_norm": 163.18319702148438,
"learning_rate": 9.985476900966156e-06,
"loss": 4.612663269042969,
"step": 56
},
{
"epoch": 0.11176470588235295,
"grad_norm": 366.7607116699219,
"learning_rate": 9.984242078342108e-06,
"loss": 5.53734827041626,
"step": 57
},
{
"epoch": 0.11372549019607843,
"grad_norm": 507.3059387207031,
"learning_rate": 9.982956962905423e-06,
"loss": 5.0663065910339355,
"step": 58
},
{
"epoch": 0.11568627450980393,
"grad_norm": 203.18968200683594,
"learning_rate": 9.981621567623385e-06,
"loss": 4.972416877746582,
"step": 59
},
{
"epoch": 0.11764705882352941,
"grad_norm": 170.353271484375,
"learning_rate": 9.980235905970615e-06,
"loss": 5.337252616882324,
"step": 60
},
{
"epoch": 0.11960784313725491,
"grad_norm": 138.26260375976562,
"learning_rate": 9.978799991928945e-06,
"loss": 5.116921424865723,
"step": 61
},
{
"epoch": 0.12156862745098039,
"grad_norm": 129.8578643798828,
"learning_rate": 9.977313839987265e-06,
"loss": 4.708870887756348,
"step": 62
},
{
"epoch": 0.12352941176470589,
"grad_norm": 182.51133728027344,
"learning_rate": 9.975777465141391e-06,
"loss": 5.148445129394531,
"step": 63
},
{
"epoch": 0.12549019607843137,
"grad_norm": 263.6011962890625,
"learning_rate": 9.974190882893901e-06,
"loss": 5.472186088562012,
"step": 64
},
{
"epoch": 0.12745098039215685,
"grad_norm": 410.5545349121094,
"learning_rate": 9.972554109253988e-06,
"loss": 4.765022277832031,
"step": 65
},
{
"epoch": 0.12941176470588237,
"grad_norm": 36.73834991455078,
"learning_rate": 9.970867160737293e-06,
"loss": 4.604272842407227,
"step": 66
},
{
"epoch": 0.13137254901960785,
"grad_norm": 48809.47265625,
"learning_rate": 9.969130054365737e-06,
"loss": 5.7819061279296875,
"step": 67
},
{
"epoch": 0.13333333333333333,
"grad_norm": 267.5246276855469,
"learning_rate": 9.967342807667355e-06,
"loss": 4.696765899658203,
"step": 68
},
{
"epoch": 0.13529411764705881,
"grad_norm": 5235.89404296875,
"learning_rate": 9.965505438676115e-06,
"loss": 7.0500078201293945,
"step": 69
},
{
"epoch": 0.13725490196078433,
"grad_norm": 61.30277633666992,
"learning_rate": 9.963617965931738e-06,
"loss": 5.219986915588379,
"step": 70
},
{
"epoch": 0.1392156862745098,
"grad_norm": 86.16511535644531,
"learning_rate": 9.961680408479508e-06,
"loss": 4.936437606811523,
"step": 71
},
{
"epoch": 0.1411764705882353,
"grad_norm": 1172.7030029296875,
"learning_rate": 9.959692785870086e-06,
"loss": 4.968964099884033,
"step": 72
},
{
"epoch": 0.14313725490196078,
"grad_norm": 151.62570190429688,
"learning_rate": 9.957655118159304e-06,
"loss": 4.32323694229126,
"step": 73
},
{
"epoch": 0.1450980392156863,
"grad_norm": 179.33120727539062,
"learning_rate": 9.955567425907968e-06,
"loss": 4.883161544799805,
"step": 74
},
{
"epoch": 0.14705882352941177,
"grad_norm": 48.311744689941406,
"learning_rate": 9.953429730181653e-06,
"loss": 4.122620582580566,
"step": 75
},
{
"epoch": 0.14901960784313725,
"grad_norm": 55.67217254638672,
"learning_rate": 9.951242052550487e-06,
"loss": 4.693333148956299,
"step": 76
},
{
"epoch": 0.15098039215686274,
"grad_norm": 24.245948791503906,
"learning_rate": 9.949004415088928e-06,
"loss": 4.758542060852051,
"step": 77
},
{
"epoch": 0.15294117647058825,
"grad_norm": 75.69949340820312,
"learning_rate": 9.946716840375552e-06,
"loss": 4.672502517700195,
"step": 78
},
{
"epoch": 0.15490196078431373,
"grad_norm": 47.35036849975586,
"learning_rate": 9.944379351492818e-06,
"loss": 5.108541488647461,
"step": 79
},
{
"epoch": 0.1568627450980392,
"grad_norm": 32.07907485961914,
"learning_rate": 9.941991972026839e-06,
"loss": 4.218465805053711,
"step": 80
},
{
"epoch": 0.1588235294117647,
"grad_norm": 45.03062057495117,
"learning_rate": 9.939554726067142e-06,
"loss": 5.109847068786621,
"step": 81
},
{
"epoch": 0.1607843137254902,
"grad_norm": 23.509695053100586,
"learning_rate": 9.937067638206418e-06,
"loss": 4.969597816467285,
"step": 82
},
{
"epoch": 0.1627450980392157,
"grad_norm": 22.31194496154785,
"learning_rate": 9.934530733540293e-06,
"loss": 4.279464244842529,
"step": 83
},
{
"epoch": 0.16470588235294117,
"grad_norm": 33.23004913330078,
"learning_rate": 9.931944037667056e-06,
"loss": 4.389366149902344,
"step": 84
},
{
"epoch": 0.16666666666666666,
"grad_norm": 55.22507858276367,
"learning_rate": 9.929307576687404e-06,
"loss": 4.9593353271484375,
"step": 85
},
{
"epoch": 0.16862745098039217,
"grad_norm": 18.55003547668457,
"learning_rate": 9.926621377204188e-06,
"loss": 4.743139743804932,
"step": 86
},
{
"epoch": 0.17058823529411765,
"grad_norm": 37.464111328125,
"learning_rate": 9.923885466322135e-06,
"loss": 5.466174125671387,
"step": 87
},
{
"epoch": 0.17254901960784313,
"grad_norm": 26.026020050048828,
"learning_rate": 9.921099871647582e-06,
"loss": 4.79066276550293,
"step": 88
},
{
"epoch": 0.17450980392156862,
"grad_norm": 21.373525619506836,
"learning_rate": 9.918264621288187e-06,
"loss": 4.939136505126953,
"step": 89
},
{
"epoch": 0.17647058823529413,
"grad_norm": 48.5876350402832,
"learning_rate": 9.91537974385266e-06,
"loss": 5.149735927581787,
"step": 90
},
{
"epoch": 0.1784313725490196,
"grad_norm": 19.163881301879883,
"learning_rate": 9.912445268450459e-06,
"loss": 5.1168317794799805,
"step": 91
},
{
"epoch": 0.1803921568627451,
"grad_norm": 79.20329284667969,
"learning_rate": 9.909461224691506e-06,
"loss": 5.10573673248291,
"step": 92
},
{
"epoch": 0.18235294117647058,
"grad_norm": 14.64513874053955,
"learning_rate": 9.906427642685889e-06,
"loss": 4.724274635314941,
"step": 93
},
{
"epoch": 0.1843137254901961,
"grad_norm": 114.3454818725586,
"learning_rate": 9.90334455304355e-06,
"loss": 5.02380895614624,
"step": 94
},
{
"epoch": 0.18627450980392157,
"grad_norm": 20.076950073242188,
"learning_rate": 9.900211986873986e-06,
"loss": 4.226929187774658,
"step": 95
},
{
"epoch": 0.18823529411764706,
"grad_norm": 34.134727478027344,
"learning_rate": 9.897029975785924e-06,
"loss": 4.3056230545043945,
"step": 96
},
{
"epoch": 0.19019607843137254,
"grad_norm": 1793.708251953125,
"learning_rate": 9.89379855188701e-06,
"loss": 4.075126647949219,
"step": 97
},
{
"epoch": 0.19215686274509805,
"grad_norm": 223.3547821044922,
"learning_rate": 9.89051774778349e-06,
"loss": 4.407467365264893,
"step": 98
},
{
"epoch": 0.19411764705882353,
"grad_norm": 150.83956909179688,
"learning_rate": 9.887187596579865e-06,
"loss": 4.878414630889893,
"step": 99
},
{
"epoch": 0.19607843137254902,
"grad_norm": 631.5261840820312,
"learning_rate": 9.883808131878573e-06,
"loss": 4.593829154968262,
"step": 100
},
{
"epoch": 0.1980392156862745,
"grad_norm": 424.9626770019531,
"learning_rate": 9.880379387779637e-06,
"loss": 4.420987129211426,
"step": 101
},
{
"epoch": 0.2,
"grad_norm": 156.39096069335938,
"learning_rate": 9.87690139888033e-06,
"loss": 4.282057762145996,
"step": 102
},
{
"epoch": 0.2019607843137255,
"grad_norm": 87.1181869506836,
"learning_rate": 9.873374200274826e-06,
"loss": 4.356283187866211,
"step": 103
},
{
"epoch": 0.20392156862745098,
"grad_norm": 91.06869506835938,
"learning_rate": 9.869797827553837e-06,
"loss": 4.65322732925415,
"step": 104
},
{
"epoch": 0.20588235294117646,
"grad_norm": 24.660625457763672,
"learning_rate": 9.866172316804265e-06,
"loss": 4.646048069000244,
"step": 105
},
{
"epoch": 0.20784313725490197,
"grad_norm": 228.06607055664062,
"learning_rate": 9.862497704608829e-06,
"loss": 4.7434186935424805,
"step": 106
},
{
"epoch": 0.20980392156862746,
"grad_norm": 44.946834564208984,
"learning_rate": 9.8587740280457e-06,
"loss": 4.521874904632568,
"step": 107
},
{
"epoch": 0.21176470588235294,
"grad_norm": 70.29621887207031,
"learning_rate": 9.855001324688128e-06,
"loss": 4.990371227264404,
"step": 108
},
{
"epoch": 0.21372549019607842,
"grad_norm": 229.05606079101562,
"learning_rate": 9.851179632604057e-06,
"loss": 4.608217239379883,
"step": 109
},
{
"epoch": 0.21568627450980393,
"grad_norm": 27.004846572875977,
"learning_rate": 9.847308990355752e-06,
"loss": 4.623664855957031,
"step": 110
},
{
"epoch": 0.21764705882352942,
"grad_norm": 127.99211883544922,
"learning_rate": 9.843389436999396e-06,
"loss": 4.443979263305664,
"step": 111
},
{
"epoch": 0.2196078431372549,
"grad_norm": 17364.953125,
"learning_rate": 9.839421012084709e-06,
"loss": 4.457244396209717,
"step": 112
},
{
"epoch": 0.22156862745098038,
"grad_norm": 374.42242431640625,
"learning_rate": 9.835403755654535e-06,
"loss": 4.446849822998047,
"step": 113
},
{
"epoch": 0.2235294117647059,
"grad_norm": 251.22105407714844,
"learning_rate": 9.831337708244454e-06,
"loss": 4.229968547821045,
"step": 114
},
{
"epoch": 0.22549019607843138,
"grad_norm": 49.53373718261719,
"learning_rate": 9.827222910882358e-06,
"loss": 4.880672454833984,
"step": 115
},
{
"epoch": 0.22745098039215686,
"grad_norm": 50.32280349731445,
"learning_rate": 9.82305940508805e-06,
"loss": 4.348365783691406,
"step": 116
},
{
"epoch": 0.22941176470588234,
"grad_norm": 140.9510040283203,
"learning_rate": 9.818847232872815e-06,
"loss": 4.777993202209473,
"step": 117
},
{
"epoch": 0.23137254901960785,
"grad_norm": 279.43096923828125,
"learning_rate": 9.814586436738998e-06,
"loss": 4.542442321777344,
"step": 118
},
{
"epoch": 0.23333333333333334,
"grad_norm": 67.41588592529297,
"learning_rate": 9.81027705967958e-06,
"loss": 3.9457411766052246,
"step": 119
},
{
"epoch": 0.23529411764705882,
"grad_norm": 86.58934783935547,
"learning_rate": 9.805919145177741e-06,
"loss": 4.564214706420898,
"step": 120
},
{
"epoch": 0.2372549019607843,
"grad_norm": 54.56572723388672,
"learning_rate": 9.801512737206422e-06,
"loss": 4.819596290588379,
"step": 121
},
{
"epoch": 0.23921568627450981,
"grad_norm": 301.5806884765625,
"learning_rate": 9.797057880227878e-06,
"loss": 4.497692108154297,
"step": 122
},
{
"epoch": 0.2411764705882353,
"grad_norm": 27.02676010131836,
"learning_rate": 9.792554619193235e-06,
"loss": 4.406162738800049,
"step": 123
},
{
"epoch": 0.24313725490196078,
"grad_norm": 135.9830322265625,
"learning_rate": 9.78800299954203e-06,
"loss": 4.398698806762695,
"step": 124
},
{
"epoch": 0.24509803921568626,
"grad_norm": 103.8440170288086,
"learning_rate": 9.783403067201763e-06,
"loss": 4.38757848739624,
"step": 125
},
{
"epoch": 0.24705882352941178,
"grad_norm": 34.276283264160156,
"learning_rate": 9.778754868587414e-06,
"loss": 5.003340721130371,
"step": 126
},
{
"epoch": 0.24901960784313726,
"grad_norm": 1124.2784423828125,
"learning_rate": 9.774058450601003e-06,
"loss": 4.731633186340332,
"step": 127
},
{
"epoch": 0.25098039215686274,
"grad_norm": 226.05685424804688,
"learning_rate": 9.76931386063109e-06,
"loss": 4.437061309814453,
"step": 128
},
{
"epoch": 0.2529411764705882,
"grad_norm": 119.51632690429688,
"learning_rate": 9.76452114655231e-06,
"loss": 4.532749176025391,
"step": 129
},
{
"epoch": 0.2549019607843137,
"grad_norm": 243.1568145751953,
"learning_rate": 9.759680356724888e-06,
"loss": 3.9236581325531006,
"step": 130
},
{
"epoch": 0.2568627450980392,
"grad_norm": 80.8863754272461,
"learning_rate": 9.754791539994153e-06,
"loss": 4.814586162567139,
"step": 131
},
{
"epoch": 0.25882352941176473,
"grad_norm": 61.44620895385742,
"learning_rate": 9.749854745690041e-06,
"loss": 4.452531814575195,
"step": 132
},
{
"epoch": 0.2607843137254902,
"grad_norm": 28.460981369018555,
"learning_rate": 9.744870023626598e-06,
"loss": 4.3587646484375,
"step": 133
},
{
"epoch": 0.2627450980392157,
"grad_norm": 134.26856994628906,
"learning_rate": 9.739837424101484e-06,
"loss": 4.51732873916626,
"step": 134
},
{
"epoch": 0.2647058823529412,
"grad_norm": 130.5369415283203,
"learning_rate": 9.73475699789545e-06,
"loss": 4.519060134887695,
"step": 135
},
{
"epoch": 0.26666666666666666,
"grad_norm": 44.77864074707031,
"learning_rate": 9.729628796271844e-06,
"loss": 3.988651990890503,
"step": 136
},
{
"epoch": 0.26862745098039215,
"grad_norm": 11.590542793273926,
"learning_rate": 9.724452870976084e-06,
"loss": 4.040298938751221,
"step": 137
},
{
"epoch": 0.27058823529411763,
"grad_norm": 18.744321823120117,
"learning_rate": 9.719229274235134e-06,
"loss": 4.6587629318237305,
"step": 138
},
{
"epoch": 0.2725490196078431,
"grad_norm": 20.353900909423828,
"learning_rate": 9.713958058756985e-06,
"loss": 4.333967685699463,
"step": 139
},
{
"epoch": 0.27450980392156865,
"grad_norm": 77.0707015991211,
"learning_rate": 9.708639277730112e-06,
"loss": 4.544798851013184,
"step": 140
},
{
"epoch": 0.27647058823529413,
"grad_norm": 66.58236694335938,
"learning_rate": 9.703272984822947e-06,
"loss": 4.844306945800781,
"step": 141
},
{
"epoch": 0.2784313725490196,
"grad_norm": 94.8054428100586,
"learning_rate": 9.697859234183336e-06,
"loss": 4.96964168548584,
"step": 142
},
{
"epoch": 0.2803921568627451,
"grad_norm": 20.816715240478516,
"learning_rate": 9.692398080437991e-06,
"loss": 4.7491607666015625,
"step": 143
},
{
"epoch": 0.2823529411764706,
"grad_norm": 35.99055862426758,
"learning_rate": 9.68688957869193e-06,
"loss": 4.6900787353515625,
"step": 144
},
{
"epoch": 0.28431372549019607,
"grad_norm": 86.11298370361328,
"learning_rate": 9.681333784527945e-06,
"loss": 4.595911979675293,
"step": 145
},
{
"epoch": 0.28627450980392155,
"grad_norm": 60.40806198120117,
"learning_rate": 9.67573075400601e-06,
"loss": 4.306059837341309,
"step": 146
},
{
"epoch": 0.28823529411764703,
"grad_norm": 2100.6142578125,
"learning_rate": 9.670080543662742e-06,
"loss": 4.651222229003906,
"step": 147
},
{
"epoch": 0.2901960784313726,
"grad_norm": 27.78398323059082,
"learning_rate": 9.66438321051081e-06,
"loss": 4.409613609313965,
"step": 148
},
{
"epoch": 0.29215686274509806,
"grad_norm": 60.483070373535156,
"learning_rate": 9.658638812038379e-06,
"loss": 4.674676418304443,
"step": 149
},
{
"epoch": 0.29411764705882354,
"grad_norm": 32.12427520751953,
"learning_rate": 9.652847406208514e-06,
"loss": 4.888742446899414,
"step": 150
},
{
"epoch": 0.296078431372549,
"grad_norm": 26.405981063842773,
"learning_rate": 9.647009051458604e-06,
"loss": 5.177173614501953,
"step": 151
},
{
"epoch": 0.2980392156862745,
"grad_norm": 26.46014976501465,
"learning_rate": 9.641123806699769e-06,
"loss": 4.627013683319092,
"step": 152
},
{
"epoch": 0.3,
"grad_norm": 29.641414642333984,
"learning_rate": 9.635191731316262e-06,
"loss": 4.72336483001709,
"step": 153
},
{
"epoch": 0.30196078431372547,
"grad_norm": 57.82429504394531,
"learning_rate": 9.629212885164882e-06,
"loss": 4.602277755737305,
"step": 154
},
{
"epoch": 0.30392156862745096,
"grad_norm": 15.265630722045898,
"learning_rate": 9.623187328574357e-06,
"loss": 4.607016563415527,
"step": 155
},
{
"epoch": 0.3058823529411765,
"grad_norm": 18.17546844482422,
"learning_rate": 9.617115122344742e-06,
"loss": 4.736464023590088,
"step": 156
},
{
"epoch": 0.307843137254902,
"grad_norm": 16.221704483032227,
"learning_rate": 9.6109963277468e-06,
"loss": 4.437252998352051,
"step": 157
},
{
"epoch": 0.30980392156862746,
"grad_norm": 178.49720764160156,
"learning_rate": 9.604831006521393e-06,
"loss": 4.21634578704834,
"step": 158
},
{
"epoch": 0.31176470588235294,
"grad_norm": 13.691607475280762,
"learning_rate": 9.598619220878852e-06,
"loss": 4.380677700042725,
"step": 159
},
{
"epoch": 0.3137254901960784,
"grad_norm": 20.732192993164062,
"learning_rate": 9.592361033498349e-06,
"loss": 4.700325012207031,
"step": 160
},
{
"epoch": 0.3156862745098039,
"grad_norm": 16.656484603881836,
"learning_rate": 9.586056507527266e-06,
"loss": 4.596185684204102,
"step": 161
},
{
"epoch": 0.3176470588235294,
"grad_norm": 23.23253631591797,
"learning_rate": 9.57970570658056e-06,
"loss": 3.866307497024536,
"step": 162
},
{
"epoch": 0.3196078431372549,
"grad_norm": 17.419248580932617,
"learning_rate": 9.57330869474012e-06,
"loss": 4.794591903686523,
"step": 163
},
{
"epoch": 0.3215686274509804,
"grad_norm": 16.963010787963867,
"learning_rate": 9.566865536554119e-06,
"loss": 4.635406017303467,
"step": 164
},
{
"epoch": 0.3235294117647059,
"grad_norm": 37.14400863647461,
"learning_rate": 9.560376297036362e-06,
"loss": 4.17448091506958,
"step": 165
},
{
"epoch": 0.3254901960784314,
"grad_norm": 75.3456039428711,
"learning_rate": 9.553841041665632e-06,
"loss": 4.513227462768555,
"step": 166
},
{
"epoch": 0.32745098039215687,
"grad_norm": 6.637061595916748,
"learning_rate": 9.54725983638503e-06,
"loss": 4.461617469787598,
"step": 167
},
{
"epoch": 0.32941176470588235,
"grad_norm": 50.82125473022461,
"learning_rate": 9.540632747601309e-06,
"loss": 4.77449893951416,
"step": 168
},
{
"epoch": 0.33137254901960783,
"grad_norm": 10.49996566772461,
"learning_rate": 9.533959842184195e-06,
"loss": 4.7018327713012695,
"step": 169
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1079.276611328125,
"learning_rate": 9.527241187465735e-06,
"loss": 4.265623569488525,
"step": 170
},
{
"epoch": 0.3352941176470588,
"grad_norm": 16.335859298706055,
"learning_rate": 9.520476851239588e-06,
"loss": 4.110130310058594,
"step": 171
},
{
"epoch": 0.33725490196078434,
"grad_norm": 18.83439826965332,
"learning_rate": 9.513666901760368e-06,
"loss": 4.40615177154541,
"step": 172
},
{
"epoch": 0.3392156862745098,
"grad_norm": 96.0332260131836,
"learning_rate": 9.506811407742938e-06,
"loss": 4.52007532119751,
"step": 173
},
{
"epoch": 0.3411764705882353,
"grad_norm": 148.7286376953125,
"learning_rate": 9.49991043836172e-06,
"loss": 4.527442455291748,
"step": 174
},
{
"epoch": 0.3431372549019608,
"grad_norm": 12.599474906921387,
"learning_rate": 9.49296406325e-06,
"loss": 3.911221981048584,
"step": 175
},
{
"epoch": 0.34509803921568627,
"grad_norm": 20.757164001464844,
"learning_rate": 9.485972352499231e-06,
"loss": 3.9439802169799805,
"step": 176
},
{
"epoch": 0.34705882352941175,
"grad_norm": 27.068387985229492,
"learning_rate": 9.478935376658308e-06,
"loss": 4.118377685546875,
"step": 177
},
{
"epoch": 0.34901960784313724,
"grad_norm": 10.007109642028809,
"learning_rate": 9.471853206732875e-06,
"loss": 4.880275249481201,
"step": 178
},
{
"epoch": 0.3509803921568627,
"grad_norm": 16.727500915527344,
"learning_rate": 9.4647259141846e-06,
"loss": 4.590615272521973,
"step": 179
},
{
"epoch": 0.35294117647058826,
"grad_norm": 41.481021881103516,
"learning_rate": 9.457553570930451e-06,
"loss": 4.905158042907715,
"step": 180
},
{
"epoch": 0.35490196078431374,
"grad_norm": 31.07561492919922,
"learning_rate": 9.450336249341976e-06,
"loss": 4.413531303405762,
"step": 181
},
{
"epoch": 0.3568627450980392,
"grad_norm": 24.699298858642578,
"learning_rate": 9.443074022244573e-06,
"loss": 4.389023780822754,
"step": 182
},
{
"epoch": 0.3588235294117647,
"grad_norm": 16.85030174255371,
"learning_rate": 9.435766962916749e-06,
"loss": 4.493863105773926,
"step": 183
},
{
"epoch": 0.3607843137254902,
"grad_norm": 84.7086410522461,
"learning_rate": 9.428415145089385e-06,
"loss": 4.844029426574707,
"step": 184
},
{
"epoch": 0.3627450980392157,
"grad_norm": 21.581398010253906,
"learning_rate": 9.421018642944996e-06,
"loss": 4.144974231719971,
"step": 185
},
{
"epoch": 0.36470588235294116,
"grad_norm": 41.96290969848633,
"learning_rate": 9.413577531116973e-06,
"loss": 4.406426906585693,
"step": 186
},
{
"epoch": 0.36666666666666664,
"grad_norm": 26.514503479003906,
"learning_rate": 9.406091884688837e-06,
"loss": 4.684653282165527,
"step": 187
},
{
"epoch": 0.3686274509803922,
"grad_norm": 113.76895141601562,
"learning_rate": 9.398561779193477e-06,
"loss": 4.297077178955078,
"step": 188
},
{
"epoch": 0.37058823529411766,
"grad_norm": 71.71430206298828,
"learning_rate": 9.390987290612396e-06,
"loss": 4.727826118469238,
"step": 189
},
{
"epoch": 0.37254901960784315,
"grad_norm": 3643.83251953125,
"learning_rate": 9.38336849537493e-06,
"loss": 4.563501358032227,
"step": 190
},
{
"epoch": 0.37450980392156863,
"grad_norm": 5243.2451171875,
"learning_rate": 9.375705470357493e-06,
"loss": 4.342321395874023,
"step": 191
},
{
"epoch": 0.3764705882352941,
"grad_norm": 72.02322387695312,
"learning_rate": 9.367998292882789e-06,
"loss": 3.8581736087799072,
"step": 192
},
{
"epoch": 0.3784313725490196,
"grad_norm": 18.841279983520508,
"learning_rate": 9.36024704071904e-06,
"loss": 3.925510883331299,
"step": 193
},
{
"epoch": 0.3803921568627451,
"grad_norm": 17.755449295043945,
"learning_rate": 9.35245179207919e-06,
"loss": 4.235489368438721,
"step": 194
},
{
"epoch": 0.38235294117647056,
"grad_norm": 58.992862701416016,
"learning_rate": 9.344612625620134e-06,
"loss": 4.563477516174316,
"step": 195
},
{
"epoch": 0.3843137254901961,
"grad_norm": 507.7042236328125,
"learning_rate": 9.336729620441906e-06,
"loss": 4.757030487060547,
"step": 196
},
{
"epoch": 0.3862745098039216,
"grad_norm": 15.296229362487793,
"learning_rate": 9.328802856086891e-06,
"loss": 4.090035915374756,
"step": 197
},
{
"epoch": 0.38823529411764707,
"grad_norm": 18.083436965942383,
"learning_rate": 9.32083241253902e-06,
"loss": 4.428251266479492,
"step": 198
},
{
"epoch": 0.39019607843137255,
"grad_norm": 9.689542770385742,
"learning_rate": 9.312818370222962e-06,
"loss": 4.577901840209961,
"step": 199
},
{
"epoch": 0.39215686274509803,
"grad_norm": 787.316162109375,
"learning_rate": 9.304760810003318e-06,
"loss": 4.476050853729248,
"step": 200
},
{
"epoch": 0.3941176470588235,
"grad_norm": 12.827642440795898,
"learning_rate": 9.296659813183794e-06,
"loss": 4.565194129943848,
"step": 201
},
{
"epoch": 0.396078431372549,
"grad_norm": 21.77273178100586,
"learning_rate": 9.28851546150639e-06,
"loss": 4.149844169616699,
"step": 202
},
{
"epoch": 0.3980392156862745,
"grad_norm": 19.56074333190918,
"learning_rate": 9.280327837150572e-06,
"loss": 4.48900842666626,
"step": 203
},
{
"epoch": 0.4,
"grad_norm": 99.8299560546875,
"learning_rate": 9.272097022732444e-06,
"loss": 4.36943244934082,
"step": 204
},
{
"epoch": 0.4019607843137255,
"grad_norm": 14.808595657348633,
"learning_rate": 9.263823101303911e-06,
"loss": 3.8771262168884277,
"step": 205
},
{
"epoch": 0.403921568627451,
"grad_norm": 57.59800338745117,
"learning_rate": 9.255506156351846e-06,
"loss": 4.419735908508301,
"step": 206
},
{
"epoch": 0.40588235294117647,
"grad_norm": 11.623266220092773,
"learning_rate": 9.247146271797244e-06,
"loss": 4.8247294425964355,
"step": 207
},
{
"epoch": 0.40784313725490196,
"grad_norm": 24.259164810180664,
"learning_rate": 9.238743531994378e-06,
"loss": 3.6684348583221436,
"step": 208
},
{
"epoch": 0.40980392156862744,
"grad_norm": 6.247960090637207,
"learning_rate": 9.23029802172994e-06,
"loss": 4.1336989402771,
"step": 209
},
{
"epoch": 0.4117647058823529,
"grad_norm": 51.79281997680664,
"learning_rate": 9.221809826222198e-06,
"loss": 4.226710319519043,
"step": 210
},
{
"epoch": 0.4137254901960784,
"grad_norm": 7.704559326171875,
"learning_rate": 9.213279031120129e-06,
"loss": 4.430539608001709,
"step": 211
},
{
"epoch": 0.41568627450980394,
"grad_norm": 14.719862937927246,
"learning_rate": 9.20470572250255e-06,
"loss": 4.240396499633789,
"step": 212
},
{
"epoch": 0.4176470588235294,
"grad_norm": 181.2888946533203,
"learning_rate": 9.196089986877262e-06,
"loss": 5.368552207946777,
"step": 213
},
{
"epoch": 0.4196078431372549,
"grad_norm": 57.88325119018555,
"learning_rate": 9.18743191118016e-06,
"loss": 4.403110980987549,
"step": 214
},
{
"epoch": 0.4215686274509804,
"grad_norm": 7.923624038696289,
"learning_rate": 9.17873158277438e-06,
"loss": 4.505987167358398,
"step": 215
},
{
"epoch": 0.4235294117647059,
"grad_norm": 12.484492301940918,
"learning_rate": 9.16998908944939e-06,
"loss": 4.539330005645752,
"step": 216
},
{
"epoch": 0.42549019607843136,
"grad_norm": 17.081193923950195,
"learning_rate": 9.161204519420126e-06,
"loss": 4.4862260818481445,
"step": 217
},
{
"epoch": 0.42745098039215684,
"grad_norm": 40.73018264770508,
"learning_rate": 9.152377961326085e-06,
"loss": 3.9755687713623047,
"step": 218
},
{
"epoch": 0.4294117647058823,
"grad_norm": 25.14496612548828,
"learning_rate": 9.14350950423045e-06,
"loss": 4.247672080993652,
"step": 219
},
{
"epoch": 0.43137254901960786,
"grad_norm": 42.86063003540039,
"learning_rate": 9.134599237619167e-06,
"loss": 4.861601829528809,
"step": 220
},
{
"epoch": 0.43333333333333335,
"grad_norm": 6.335762977600098,
"learning_rate": 9.125647251400068e-06,
"loss": 4.403509140014648,
"step": 221
},
{
"epoch": 0.43529411764705883,
"grad_norm": 15.092013359069824,
"learning_rate": 9.11665363590194e-06,
"loss": 4.2879791259765625,
"step": 222
},
{
"epoch": 0.4372549019607843,
"grad_norm": 21.203367233276367,
"learning_rate": 9.107618481873632e-06,
"loss": 4.234829902648926,
"step": 223
},
{
"epoch": 0.4392156862745098,
"grad_norm": 17.83528709411621,
"learning_rate": 9.098541880483129e-06,
"loss": 4.473383903503418,
"step": 224
},
{
"epoch": 0.4411764705882353,
"grad_norm": 9.45661449432373,
"learning_rate": 9.089423923316636e-06,
"loss": 4.060793876647949,
"step": 225
},
{
"epoch": 0.44313725490196076,
"grad_norm": 15.240864753723145,
"learning_rate": 9.08026470237765e-06,
"loss": 4.1823625564575195,
"step": 226
},
{
"epoch": 0.44509803921568625,
"grad_norm": 9.284879684448242,
"learning_rate": 9.07106431008604e-06,
"loss": 4.482451438903809,
"step": 227
},
{
"epoch": 0.4470588235294118,
"grad_norm": 316.4547119140625,
"learning_rate": 9.0618228392771e-06,
"loss": 4.567050933837891,
"step": 228
},
{
"epoch": 0.44901960784313727,
"grad_norm": 29.554767608642578,
"learning_rate": 9.052540383200634e-06,
"loss": 4.180408954620361,
"step": 229
},
{
"epoch": 0.45098039215686275,
"grad_norm": 6.75397253036499,
"learning_rate": 9.043217035519986e-06,
"loss": 4.032431125640869,
"step": 230
},
{
"epoch": 0.45294117647058824,
"grad_norm": 52.491146087646484,
"learning_rate": 9.033852890311127e-06,
"loss": 4.559549331665039,
"step": 231
},
{
"epoch": 0.4549019607843137,
"grad_norm": 26.352832794189453,
"learning_rate": 9.02444804206168e-06,
"loss": 4.473541736602783,
"step": 232
},
{
"epoch": 0.4568627450980392,
"grad_norm": 55.94530487060547,
"learning_rate": 9.01500258566998e-06,
"loss": 4.33009147644043,
"step": 233
},
{
"epoch": 0.4588235294117647,
"grad_norm": 561.2896728515625,
"learning_rate": 9.005516616444112e-06,
"loss": 4.711042404174805,
"step": 234
},
{
"epoch": 0.46078431372549017,
"grad_norm": 8.535683631896973,
"learning_rate": 8.99599023010095e-06,
"loss": 4.379650115966797,
"step": 235
},
{
"epoch": 0.4627450980392157,
"grad_norm": 305.4293212890625,
"learning_rate": 8.986423522765191e-06,
"loss": 4.39713191986084,
"step": 236
},
{
"epoch": 0.4647058823529412,
"grad_norm": 33.79056167602539,
"learning_rate": 8.976816590968388e-06,
"loss": 4.3444414138793945,
"step": 237
},
{
"epoch": 0.4666666666666667,
"grad_norm": 96.67849731445312,
"learning_rate": 8.967169531647971e-06,
"loss": 4.552453994750977,
"step": 238
},
{
"epoch": 0.46862745098039216,
"grad_norm": 9.912099838256836,
"learning_rate": 8.957482442146271e-06,
"loss": 4.669564723968506,
"step": 239
},
{
"epoch": 0.47058823529411764,
"grad_norm": 70.12734985351562,
"learning_rate": 8.947755420209541e-06,
"loss": 4.48826789855957,
"step": 240
},
{
"epoch": 0.4725490196078431,
"grad_norm": 82.63882446289062,
"learning_rate": 8.937988563986963e-06,
"loss": 4.289237022399902,
"step": 241
},
{
"epoch": 0.4745098039215686,
"grad_norm": 8.964136123657227,
"learning_rate": 8.928181972029664e-06,
"loss": 4.6587371826171875,
"step": 242
},
{
"epoch": 0.4764705882352941,
"grad_norm": 31.586191177368164,
"learning_rate": 8.918335743289717e-06,
"loss": 4.701657295227051,
"step": 243
},
{
"epoch": 0.47843137254901963,
"grad_norm": 11.228754043579102,
"learning_rate": 8.90844997711915e-06,
"loss": 4.612178802490234,
"step": 244
},
{
"epoch": 0.4803921568627451,
"grad_norm": 12.46069049835205,
"learning_rate": 8.898524773268926e-06,
"loss": 4.405297756195068,
"step": 245
},
{
"epoch": 0.4823529411764706,
"grad_norm": 10.117264747619629,
"learning_rate": 8.888560231887963e-06,
"loss": 4.3877387046813965,
"step": 246
},
{
"epoch": 0.4843137254901961,
"grad_norm": 10.064031600952148,
"learning_rate": 8.8785564535221e-06,
"loss": 4.765181064605713,
"step": 247
},
{
"epoch": 0.48627450980392156,
"grad_norm": 26.27206039428711,
"learning_rate": 8.868513539113093e-06,
"loss": 4.58603572845459,
"step": 248
},
{
"epoch": 0.48823529411764705,
"grad_norm": 9.178251266479492,
"learning_rate": 8.858431589997597e-06,
"loss": 4.693767070770264,
"step": 249
},
{
"epoch": 0.49019607843137253,
"grad_norm": 41.67184829711914,
"learning_rate": 8.848310707906138e-06,
"loss": 4.149996280670166,
"step": 250
},
{
"epoch": 0.492156862745098,
"grad_norm": 26.0257511138916,
"learning_rate": 8.838150994962094e-06,
"loss": 3.98866605758667,
"step": 251
},
{
"epoch": 0.49411764705882355,
"grad_norm": 11.47574234008789,
"learning_rate": 8.827952553680656e-06,
"loss": 4.481922626495361,
"step": 252
},
{
"epoch": 0.49607843137254903,
"grad_norm": 13.316975593566895,
"learning_rate": 8.817715486967803e-06,
"loss": 4.367927551269531,
"step": 253
},
{
"epoch": 0.4980392156862745,
"grad_norm": 25.555376052856445,
"learning_rate": 8.807439898119252e-06,
"loss": 4.406060218811035,
"step": 254
},
{
"epoch": 0.5,
"grad_norm": 19.18964195251465,
"learning_rate": 8.797125890819429e-06,
"loss": 4.455549240112305,
"step": 255
},
{
"epoch": 0.5019607843137255,
"grad_norm": 320.9529113769531,
"learning_rate": 8.786773569140414e-06,
"loss": 4.059732913970947,
"step": 256
},
{
"epoch": 0.503921568627451,
"grad_norm": 10.907160758972168,
"learning_rate": 8.776383037540888e-06,
"loss": 4.300107955932617,
"step": 257
},
{
"epoch": 0.5058823529411764,
"grad_norm": 10.504913330078125,
"learning_rate": 8.765954400865093e-06,
"loss": 4.379460334777832,
"step": 258
},
{
"epoch": 0.5078431372549019,
"grad_norm": 8.576491355895996,
"learning_rate": 8.755487764341756e-06,
"loss": 4.10643196105957,
"step": 259
},
{
"epoch": 0.5098039215686274,
"grad_norm": 37.79690170288086,
"learning_rate": 8.744983233583044e-06,
"loss": 4.253848075866699,
"step": 260
},
{
"epoch": 0.5117647058823529,
"grad_norm": 16.08103370666504,
"learning_rate": 8.734440914583486e-06,
"loss": 4.03438663482666,
"step": 261
},
{
"epoch": 0.5137254901960784,
"grad_norm": 16.359207153320312,
"learning_rate": 8.72386091371891e-06,
"loss": 4.546582221984863,
"step": 262
},
{
"epoch": 0.515686274509804,
"grad_norm": 14.165915489196777,
"learning_rate": 8.713243337745366e-06,
"loss": 4.424289226531982,
"step": 263
},
{
"epoch": 0.5176470588235295,
"grad_norm": 102.89573669433594,
"learning_rate": 8.70258829379805e-06,
"loss": 4.413416862487793,
"step": 264
},
{
"epoch": 0.5196078431372549,
"grad_norm": 48.94295120239258,
"learning_rate": 8.691895889390228e-06,
"loss": 4.2126665115356445,
"step": 265
},
{
"epoch": 0.5215686274509804,
"grad_norm": 4378.3564453125,
"learning_rate": 8.681166232412142e-06,
"loss": 4.454058647155762,
"step": 266
},
{
"epoch": 0.5235294117647059,
"grad_norm": 20.026926040649414,
"learning_rate": 8.670399431129926e-06,
"loss": 4.487159252166748,
"step": 267
},
{
"epoch": 0.5254901960784314,
"grad_norm": 31.539865493774414,
"learning_rate": 8.659595594184516e-06,
"loss": 3.834848642349243,
"step": 268
},
{
"epoch": 0.5274509803921569,
"grad_norm": 18.91790199279785,
"learning_rate": 8.648754830590552e-06,
"loss": 3.623075246810913,
"step": 269
},
{
"epoch": 0.5294117647058824,
"grad_norm": 225.7623748779297,
"learning_rate": 8.637877249735274e-06,
"loss": 4.196628093719482,
"step": 270
},
{
"epoch": 0.5313725490196078,
"grad_norm": 61.982601165771484,
"learning_rate": 8.626962961377423e-06,
"loss": 4.607351303100586,
"step": 271
},
{
"epoch": 0.5333333333333333,
"grad_norm": 29.32758140563965,
"learning_rate": 8.616012075646134e-06,
"loss": 4.233864784240723,
"step": 272
},
{
"epoch": 0.5352941176470588,
"grad_norm": 131.88658142089844,
"learning_rate": 8.605024703039817e-06,
"loss": 4.576198577880859,
"step": 273
},
{
"epoch": 0.5372549019607843,
"grad_norm": 28.853824615478516,
"learning_rate": 8.594000954425056e-06,
"loss": 4.819530010223389,
"step": 274
},
{
"epoch": 0.5392156862745098,
"grad_norm": 78.97745513916016,
"learning_rate": 8.582940941035476e-06,
"loss": 4.64754056930542,
"step": 275
},
{
"epoch": 0.5411764705882353,
"grad_norm": 84.47677612304688,
"learning_rate": 8.571844774470627e-06,
"loss": 4.477121829986572,
"step": 276
},
{
"epoch": 0.5431372549019607,
"grad_norm": 151.27618408203125,
"learning_rate": 8.560712566694863e-06,
"loss": 4.461421489715576,
"step": 277
},
{
"epoch": 0.5450980392156862,
"grad_norm": 67.91239166259766,
"learning_rate": 8.549544430036198e-06,
"loss": 4.4422101974487305,
"step": 278
},
{
"epoch": 0.5470588235294118,
"grad_norm": 27.76258087158203,
"learning_rate": 8.538340477185191e-06,
"loss": 4.3184404373168945,
"step": 279
},
{
"epoch": 0.5490196078431373,
"grad_norm": 52.35835266113281,
"learning_rate": 8.527100821193797e-06,
"loss": 4.155409812927246,
"step": 280
},
{
"epoch": 0.5509803921568628,
"grad_norm": 4.545929908752441,
"learning_rate": 8.51582557547422e-06,
"loss": 4.422041893005371,
"step": 281
},
{
"epoch": 0.5529411764705883,
"grad_norm": 27.044042587280273,
"learning_rate": 8.504514853797789e-06,
"loss": 4.30333948135376,
"step": 282
},
{
"epoch": 0.5549019607843138,
"grad_norm": 16.815105438232422,
"learning_rate": 8.493168770293793e-06,
"loss": 4.171971797943115,
"step": 283
},
{
"epoch": 0.5568627450980392,
"grad_norm": 13.872841835021973,
"learning_rate": 8.481787439448332e-06,
"loss": 4.339047431945801,
"step": 284
},
{
"epoch": 0.5588235294117647,
"grad_norm": 28.80792808532715,
"learning_rate": 8.470370976103171e-06,
"loss": 4.152105331420898,
"step": 285
},
{
"epoch": 0.5607843137254902,
"grad_norm": 8.963105201721191,
"learning_rate": 8.458919495454567e-06,
"loss": 4.377296447753906,
"step": 286
},
{
"epoch": 0.5627450980392157,
"grad_norm": 74.01875305175781,
"learning_rate": 8.447433113052124e-06,
"loss": 4.3900346755981445,
"step": 287
},
{
"epoch": 0.5647058823529412,
"grad_norm": 58.68815612792969,
"learning_rate": 8.435911944797605e-06,
"loss": 4.081506729125977,
"step": 288
},
{
"epoch": 0.5666666666666667,
"grad_norm": 9.66000747680664,
"learning_rate": 8.42435610694379e-06,
"loss": 4.700527191162109,
"step": 289
},
{
"epoch": 0.5686274509803921,
"grad_norm": 856.5879516601562,
"learning_rate": 8.412765716093273e-06,
"loss": 4.371489524841309,
"step": 290
},
{
"epoch": 0.5705882352941176,
"grad_norm": 54.936737060546875,
"learning_rate": 8.401140889197305e-06,
"loss": 4.387087821960449,
"step": 291
},
{
"epoch": 0.5725490196078431,
"grad_norm": 47.68635559082031,
"learning_rate": 8.38948174355462e-06,
"loss": 4.089095115661621,
"step": 292
},
{
"epoch": 0.5745098039215686,
"grad_norm": 57.09484100341797,
"learning_rate": 8.377788396810223e-06,
"loss": 4.206271171569824,
"step": 293
},
{
"epoch": 0.5764705882352941,
"grad_norm": 112.220947265625,
"learning_rate": 8.366060966954235e-06,
"loss": 4.599027156829834,
"step": 294
},
{
"epoch": 0.5784313725490197,
"grad_norm": 6.840386390686035,
"learning_rate": 8.354299572320679e-06,
"loss": 4.318846702575684,
"step": 295
},
{
"epoch": 0.5803921568627451,
"grad_norm": 9.837911605834961,
"learning_rate": 8.342504331586298e-06,
"loss": 4.517173767089844,
"step": 296
},
{
"epoch": 0.5823529411764706,
"grad_norm": 8.993966102600098,
"learning_rate": 8.330675363769356e-06,
"loss": 4.465400695800781,
"step": 297
},
{
"epoch": 0.5843137254901961,
"grad_norm": 13.560818672180176,
"learning_rate": 8.318812788228434e-06,
"loss": 4.454391002655029,
"step": 298
},
{
"epoch": 0.5862745098039216,
"grad_norm": 42.093650817871094,
"learning_rate": 8.306916724661225e-06,
"loss": 4.149091720581055,
"step": 299
},
{
"epoch": 0.5882352941176471,
"grad_norm": 26.250268936157227,
"learning_rate": 8.294987293103334e-06,
"loss": 4.4396562576293945,
"step": 300
},
{
"epoch": 0.5901960784313726,
"grad_norm": 28.954051971435547,
"learning_rate": 8.283024613927055e-06,
"loss": 4.341966152191162,
"step": 301
},
{
"epoch": 0.592156862745098,
"grad_norm": 62.48039245605469,
"learning_rate": 8.271028807840164e-06,
"loss": 4.215608596801758,
"step": 302
},
{
"epoch": 0.5941176470588235,
"grad_norm": 22.603797912597656,
"learning_rate": 8.258999995884706e-06,
"loss": 4.152694225311279,
"step": 303
},
{
"epoch": 0.596078431372549,
"grad_norm": 8.716656684875488,
"learning_rate": 8.246938299435759e-06,
"loss": 4.047298431396484,
"step": 304
},
{
"epoch": 0.5980392156862745,
"grad_norm": 18.179229736328125,
"learning_rate": 8.234843840200218e-06,
"loss": 4.296360015869141,
"step": 305
},
{
"epoch": 0.6,
"grad_norm": 56.76976013183594,
"learning_rate": 8.222716740215573e-06,
"loss": 4.411765098571777,
"step": 306
},
{
"epoch": 0.6019607843137255,
"grad_norm": 52.26530838012695,
"learning_rate": 8.210557121848664e-06,
"loss": 4.340122222900391,
"step": 307
},
{
"epoch": 0.6039215686274509,
"grad_norm": 266.9866638183594,
"learning_rate": 8.198365107794457e-06,
"loss": 4.172665596008301,
"step": 308
},
{
"epoch": 0.6058823529411764,
"grad_norm": 15.979412078857422,
"learning_rate": 8.186140821074801e-06,
"loss": 4.39658260345459,
"step": 309
},
{
"epoch": 0.6078431372549019,
"grad_norm": 11.8402738571167,
"learning_rate": 8.173884385037193e-06,
"loss": 4.514126777648926,
"step": 310
},
{
"epoch": 0.6098039215686275,
"grad_norm": 13.766944885253906,
"learning_rate": 8.161595923353516e-06,
"loss": 4.273541450500488,
"step": 311
},
{
"epoch": 0.611764705882353,
"grad_norm": 14.141230583190918,
"learning_rate": 8.149275560018816e-06,
"loss": 4.575261116027832,
"step": 312
},
{
"epoch": 0.6137254901960785,
"grad_norm": 11.983506202697754,
"learning_rate": 8.136923419350032e-06,
"loss": 4.441531181335449,
"step": 313
},
{
"epoch": 0.615686274509804,
"grad_norm": 11.801518440246582,
"learning_rate": 8.12453962598475e-06,
"loss": 4.674493789672852,
"step": 314
},
{
"epoch": 0.6176470588235294,
"grad_norm": 10.840680122375488,
"learning_rate": 8.112124304879938e-06,
"loss": 4.433037757873535,
"step": 315
},
{
"epoch": 0.6196078431372549,
"grad_norm": 10.749861717224121,
"learning_rate": 8.0996775813107e-06,
"loss": 4.395119667053223,
"step": 316
},
{
"epoch": 0.6215686274509804,
"grad_norm": 14.47747802734375,
"learning_rate": 8.087199580868997e-06,
"loss": 4.807774543762207,
"step": 317
},
{
"epoch": 0.6235294117647059,
"grad_norm": 145.16539001464844,
"learning_rate": 8.07469042946238e-06,
"loss": 4.5776777267456055,
"step": 318
},
{
"epoch": 0.6254901960784314,
"grad_norm": 16.87470054626465,
"learning_rate": 8.062150253312735e-06,
"loss": 4.936606407165527,
"step": 319
},
{
"epoch": 0.6274509803921569,
"grad_norm": 17.10848617553711,
"learning_rate": 8.04957917895499e-06,
"loss": 4.733745574951172,
"step": 320
},
{
"epoch": 0.6294117647058823,
"grad_norm": 13.901424407958984,
"learning_rate": 8.03697733323585e-06,
"loss": 4.649224758148193,
"step": 321
},
{
"epoch": 0.6313725490196078,
"grad_norm": 9.257561683654785,
"learning_rate": 8.024344843312517e-06,
"loss": 4.121101379394531,
"step": 322
},
{
"epoch": 0.6333333333333333,
"grad_norm": 20.801715850830078,
"learning_rate": 8.011681836651401e-06,
"loss": 4.233510971069336,
"step": 323
},
{
"epoch": 0.6352941176470588,
"grad_norm": 8.654178619384766,
"learning_rate": 7.99898844102684e-06,
"loss": 4.358521938323975,
"step": 324
},
{
"epoch": 0.6372549019607843,
"grad_norm": 19.770313262939453,
"learning_rate": 7.986264784519801e-06,
"loss": 4.486325263977051,
"step": 325
},
{
"epoch": 0.6392156862745098,
"grad_norm": 24.694883346557617,
"learning_rate": 7.973510995516603e-06,
"loss": 4.33124303817749,
"step": 326
},
{
"epoch": 0.6411764705882353,
"grad_norm": 5.858190536499023,
"learning_rate": 7.960727202707605e-06,
"loss": 4.653376579284668,
"step": 327
},
{
"epoch": 0.6431372549019608,
"grad_norm": 10.262151718139648,
"learning_rate": 7.947913535085925e-06,
"loss": 4.330365180969238,
"step": 328
},
{
"epoch": 0.6450980392156863,
"grad_norm": 7.667881965637207,
"learning_rate": 7.935070121946116e-06,
"loss": 4.4410929679870605,
"step": 329
},
{
"epoch": 0.6470588235294118,
"grad_norm": 8.601579666137695,
"learning_rate": 7.922197092882882e-06,
"loss": 4.40837287902832,
"step": 330
},
{
"epoch": 0.6490196078431373,
"grad_norm": 7.12674617767334,
"learning_rate": 7.909294577789765e-06,
"loss": 4.342606544494629,
"step": 331
},
{
"epoch": 0.6509803921568628,
"grad_norm": 10.018842697143555,
"learning_rate": 7.896362706857825e-06,
"loss": 4.35633659362793,
"step": 332
},
{
"epoch": 0.6529411764705882,
"grad_norm": 8.465182304382324,
"learning_rate": 7.883401610574338e-06,
"loss": 4.524230480194092,
"step": 333
},
{
"epoch": 0.6549019607843137,
"grad_norm": 11.65457534790039,
"learning_rate": 7.870411419721468e-06,
"loss": 4.69898796081543,
"step": 334
},
{
"epoch": 0.6568627450980392,
"grad_norm": 12.852667808532715,
"learning_rate": 7.857392265374963e-06,
"loss": 4.4134087562561035,
"step": 335
},
{
"epoch": 0.6588235294117647,
"grad_norm": 7.551708698272705,
"learning_rate": 7.844344278902815e-06,
"loss": 4.333009243011475,
"step": 336
},
{
"epoch": 0.6607843137254902,
"grad_norm": 8.990592002868652,
"learning_rate": 7.83126759196395e-06,
"loss": 4.12913703918457,
"step": 337
},
{
"epoch": 0.6627450980392157,
"grad_norm": 5.897281646728516,
"learning_rate": 7.818162336506885e-06,
"loss": 4.319456100463867,
"step": 338
},
{
"epoch": 0.6647058823529411,
"grad_norm": 10.881563186645508,
"learning_rate": 7.805028644768407e-06,
"loss": 3.8857927322387695,
"step": 339
},
{
"epoch": 0.6666666666666666,
"grad_norm": 11.13469409942627,
"learning_rate": 7.791866649272236e-06,
"loss": 4.694910526275635,
"step": 340
},
{
"epoch": 0.6686274509803921,
"grad_norm": 9.245570182800293,
"learning_rate": 7.778676482827686e-06,
"loss": 4.456270694732666,
"step": 341
},
{
"epoch": 0.6705882352941176,
"grad_norm": 9.138643264770508,
"learning_rate": 7.765458278528327e-06,
"loss": 4.281149864196777,
"step": 342
},
{
"epoch": 0.6725490196078432,
"grad_norm": 10.280998229980469,
"learning_rate": 7.752212169750642e-06,
"loss": 3.655160665512085,
"step": 343
},
{
"epoch": 0.6745098039215687,
"grad_norm": 12.691766738891602,
"learning_rate": 7.738938290152675e-06,
"loss": 4.443702220916748,
"step": 344
},
{
"epoch": 0.6764705882352942,
"grad_norm": 53.1502799987793,
"learning_rate": 7.725636773672694e-06,
"loss": 4.33807897567749,
"step": 345
},
{
"epoch": 0.6784313725490196,
"grad_norm": 32.06969451904297,
"learning_rate": 7.712307754527832e-06,
"loss": 4.4467267990112305,
"step": 346
},
{
"epoch": 0.6803921568627451,
"grad_norm": 9.82119083404541,
"learning_rate": 7.69895136721273e-06,
"loss": 4.563485145568848,
"step": 347
},
{
"epoch": 0.6823529411764706,
"grad_norm": 14.746912002563477,
"learning_rate": 7.685567746498191e-06,
"loss": 4.372725486755371,
"step": 348
},
{
"epoch": 0.6843137254901961,
"grad_norm": 8.75436019897461,
"learning_rate": 7.672157027429803e-06,
"loss": 4.417882919311523,
"step": 349
},
{
"epoch": 0.6862745098039216,
"grad_norm": 19.984905242919922,
"learning_rate": 7.658719345326595e-06,
"loss": 4.403292655944824,
"step": 350
},
{
"epoch": 0.6882352941176471,
"grad_norm": 8.184240341186523,
"learning_rate": 7.645254835779657e-06,
"loss": 4.424741268157959,
"step": 351
},
{
"epoch": 0.6901960784313725,
"grad_norm": 12.842850685119629,
"learning_rate": 7.631763634650783e-06,
"loss": 4.134533405303955,
"step": 352
},
{
"epoch": 0.692156862745098,
"grad_norm": 6.699388027191162,
"learning_rate": 7.618245878071091e-06,
"loss": 4.04726505279541,
"step": 353
},
{
"epoch": 0.6941176470588235,
"grad_norm": 105.13713073730469,
"learning_rate": 7.604701702439652e-06,
"loss": 4.773365020751953,
"step": 354
},
{
"epoch": 0.696078431372549,
"grad_norm": 4.884748458862305,
"learning_rate": 7.591131244422118e-06,
"loss": 4.247958660125732,
"step": 355
},
{
"epoch": 0.6980392156862745,
"grad_norm": 7.144754409790039,
"learning_rate": 7.57753464094934e-06,
"loss": 4.382415771484375,
"step": 356
},
{
"epoch": 0.7,
"grad_norm": 6.072117805480957,
"learning_rate": 7.563912029215983e-06,
"loss": 4.450734615325928,
"step": 357
},
{
"epoch": 0.7019607843137254,
"grad_norm": 12.829042434692383,
"learning_rate": 7.550263546679148e-06,
"loss": 4.965085983276367,
"step": 358
},
{
"epoch": 0.703921568627451,
"grad_norm": 14.615585327148438,
"learning_rate": 7.536589331056976e-06,
"loss": 5.021347999572754,
"step": 359
},
{
"epoch": 0.7058823529411765,
"grad_norm": 20.44240379333496,
"learning_rate": 7.522889520327275e-06,
"loss": 4.897363662719727,
"step": 360
},
{
"epoch": 0.707843137254902,
"grad_norm": 8.430086135864258,
"learning_rate": 7.509164252726107e-06,
"loss": 4.523484230041504,
"step": 361
},
{
"epoch": 0.7098039215686275,
"grad_norm": 64.25504302978516,
"learning_rate": 7.495413666746406e-06,
"loss": 4.529187202453613,
"step": 362
},
{
"epoch": 0.711764705882353,
"grad_norm": 7.60660982131958,
"learning_rate": 7.481637901136578e-06,
"loss": 4.3769965171813965,
"step": 363
},
{
"epoch": 0.7137254901960784,
"grad_norm": 10.102652549743652,
"learning_rate": 7.467837094899104e-06,
"loss": 4.237904071807861,
"step": 364
},
{
"epoch": 0.7156862745098039,
"grad_norm": 7.940488815307617,
"learning_rate": 7.454011387289127e-06,
"loss": 4.800004959106445,
"step": 365
},
{
"epoch": 0.7176470588235294,
"grad_norm": 9.199849128723145,
"learning_rate": 7.440160917813059e-06,
"loss": 4.4133710861206055,
"step": 366
},
{
"epoch": 0.7196078431372549,
"grad_norm": 13.037126541137695,
"learning_rate": 7.426285826227171e-06,
"loss": 3.827446699142456,
"step": 367
},
{
"epoch": 0.7215686274509804,
"grad_norm": 16.962772369384766,
"learning_rate": 7.412386252536168e-06,
"loss": 4.1103997230529785,
"step": 368
},
{
"epoch": 0.7235294117647059,
"grad_norm": 5.314018249511719,
"learning_rate": 7.398462336991802e-06,
"loss": 4.367024898529053,
"step": 369
},
{
"epoch": 0.7254901960784313,
"grad_norm": 7.412561893463135,
"learning_rate": 7.384514220091437e-06,
"loss": 4.72689962387085,
"step": 370
},
{
"epoch": 0.7274509803921568,
"grad_norm": 8.2675199508667,
"learning_rate": 7.370542042576635e-06,
"loss": 4.435096263885498,
"step": 371
},
{
"epoch": 0.7294117647058823,
"grad_norm": 9.654603004455566,
"learning_rate": 7.356545945431744e-06,
"loss": 4.463583946228027,
"step": 372
},
{
"epoch": 0.7313725490196078,
"grad_norm": 8.670485496520996,
"learning_rate": 7.342526069882465e-06,
"loss": 4.5643415451049805,
"step": 373
},
{
"epoch": 0.7333333333333333,
"grad_norm": 6.766948699951172,
"learning_rate": 7.328482557394435e-06,
"loss": 4.181691646575928,
"step": 374
},
{
"epoch": 0.7352941176470589,
"grad_norm": 10.84627914428711,
"learning_rate": 7.314415549671795e-06,
"loss": 4.422411918640137,
"step": 375
},
{
"epoch": 0.7372549019607844,
"grad_norm": 5.880477428436279,
"learning_rate": 7.300325188655762e-06,
"loss": 4.424395561218262,
"step": 376
},
{
"epoch": 0.7392156862745098,
"grad_norm": 20.747825622558594,
"learning_rate": 7.286211616523193e-06,
"loss": 4.11272668838501,
"step": 377
},
{
"epoch": 0.7411764705882353,
"grad_norm": 18.401670455932617,
"learning_rate": 7.27207497568516e-06,
"loss": 3.824052333831787,
"step": 378
},
{
"epoch": 0.7431372549019608,
"grad_norm": 8.974493980407715,
"learning_rate": 7.257915408785499e-06,
"loss": 4.54365348815918,
"step": 379
},
{
"epoch": 0.7450980392156863,
"grad_norm": 17.942354202270508,
"learning_rate": 7.243733058699386e-06,
"loss": 4.127044677734375,
"step": 380
},
{
"epoch": 0.7470588235294118,
"grad_norm": 25.93067169189453,
"learning_rate": 7.229528068531881e-06,
"loss": 4.338431358337402,
"step": 381
},
{
"epoch": 0.7490196078431373,
"grad_norm": 11.005992889404297,
"learning_rate": 7.215300581616496e-06,
"loss": 4.377932548522949,
"step": 382
},
{
"epoch": 0.7509803921568627,
"grad_norm": 37.77220916748047,
"learning_rate": 7.201050741513735e-06,
"loss": 4.601173400878906,
"step": 383
},
{
"epoch": 0.7529411764705882,
"grad_norm": 46.073551177978516,
"learning_rate": 7.186778692009669e-06,
"loss": 4.647587776184082,
"step": 384
},
{
"epoch": 0.7549019607843137,
"grad_norm": 20.6584529876709,
"learning_rate": 7.172484577114452e-06,
"loss": 3.963017702102661,
"step": 385
},
{
"epoch": 0.7568627450980392,
"grad_norm": 21.113977432250977,
"learning_rate": 7.1581685410609e-06,
"loss": 4.483006477355957,
"step": 386
},
{
"epoch": 0.7588235294117647,
"grad_norm": 8.748090744018555,
"learning_rate": 7.1438307283030106e-06,
"loss": 4.640547752380371,
"step": 387
},
{
"epoch": 0.7607843137254902,
"grad_norm": 6.323770046234131,
"learning_rate": 7.129471283514525e-06,
"loss": 4.377452850341797,
"step": 388
},
{
"epoch": 0.7627450980392156,
"grad_norm": 7.1361308097839355,
"learning_rate": 7.115090351587455e-06,
"loss": 4.528425216674805,
"step": 389
},
{
"epoch": 0.7647058823529411,
"grad_norm": 7.771296977996826,
"learning_rate": 7.100688077630628e-06,
"loss": 4.031993865966797,
"step": 390
},
{
"epoch": 0.7666666666666667,
"grad_norm": 11.560522079467773,
"learning_rate": 7.086264606968215e-06,
"loss": 4.186218738555908,
"step": 391
},
{
"epoch": 0.7686274509803922,
"grad_norm": 7.255620002746582,
"learning_rate": 7.071820085138275e-06,
"loss": 4.786005973815918,
"step": 392
},
{
"epoch": 0.7705882352941177,
"grad_norm": 8.37585163116455,
"learning_rate": 7.05735465789128e-06,
"loss": 4.31743860244751,
"step": 393
},
{
"epoch": 0.7725490196078432,
"grad_norm": 5.002284049987793,
"learning_rate": 7.042868471188642e-06,
"loss": 4.3620500564575195,
"step": 394
},
{
"epoch": 0.7745098039215687,
"grad_norm": 7.566415309906006,
"learning_rate": 7.028361671201245e-06,
"loss": 4.245545387268066,
"step": 395
},
{
"epoch": 0.7764705882352941,
"grad_norm": 15.03036117553711,
"learning_rate": 7.013834404307972e-06,
"loss": 4.74299955368042,
"step": 396
},
{
"epoch": 0.7784313725490196,
"grad_norm": 7.180116176605225,
"learning_rate": 6.9992868170942205e-06,
"loss": 4.2865447998046875,
"step": 397
},
{
"epoch": 0.7803921568627451,
"grad_norm": 11.753171920776367,
"learning_rate": 6.9847190563504284e-06,
"loss": 4.315492630004883,
"step": 398
},
{
"epoch": 0.7823529411764706,
"grad_norm": 109.72077178955078,
"learning_rate": 6.970131269070591e-06,
"loss": 4.001181602478027,
"step": 399
},
{
"epoch": 0.7843137254901961,
"grad_norm": 25.598417282104492,
"learning_rate": 6.95552360245078e-06,
"loss": 4.539231300354004,
"step": 400
},
{
"epoch": 0.7862745098039216,
"grad_norm": 11.810413360595703,
"learning_rate": 6.940896203887659e-06,
"loss": 4.104607105255127,
"step": 401
},
{
"epoch": 0.788235294117647,
"grad_norm": 12.639123916625977,
"learning_rate": 6.926249220976988e-06,
"loss": 4.151772499084473,
"step": 402
},
{
"epoch": 0.7901960784313725,
"grad_norm": 11.613636016845703,
"learning_rate": 6.911582801512146e-06,
"loss": 4.29304313659668,
"step": 403
},
{
"epoch": 0.792156862745098,
"grad_norm": 22.361234664916992,
"learning_rate": 6.8968970934826296e-06,
"loss": 3.9035181999206543,
"step": 404
},
{
"epoch": 0.7941176470588235,
"grad_norm": 47.97343826293945,
"learning_rate": 6.88219224507257e-06,
"loss": 4.482244491577148,
"step": 405
},
{
"epoch": 0.796078431372549,
"grad_norm": 53.55236053466797,
"learning_rate": 6.867468404659222e-06,
"loss": 4.42404317855835,
"step": 406
},
{
"epoch": 0.7980392156862746,
"grad_norm": 6.139882564544678,
"learning_rate": 6.852725720811487e-06,
"loss": 3.866445302963257,
"step": 407
},
{
"epoch": 0.8,
"grad_norm": 13.475299835205078,
"learning_rate": 6.837964342288399e-06,
"loss": 4.167118072509766,
"step": 408
},
{
"epoch": 0.8019607843137255,
"grad_norm": 13.994526863098145,
"learning_rate": 6.823184418037625e-06,
"loss": 4.223340034484863,
"step": 409
},
{
"epoch": 0.803921568627451,
"grad_norm": 21.013151168823242,
"learning_rate": 6.808386097193969e-06,
"loss": 3.9206745624542236,
"step": 410
},
{
"epoch": 0.8058823529411765,
"grad_norm": 14.670804023742676,
"learning_rate": 6.793569529077864e-06,
"loss": 4.275432586669922,
"step": 411
},
{
"epoch": 0.807843137254902,
"grad_norm": 7.984720230102539,
"learning_rate": 6.778734863193862e-06,
"loss": 4.327607154846191,
"step": 412
},
{
"epoch": 0.8098039215686275,
"grad_norm": 12.322322845458984,
"learning_rate": 6.76388224922913e-06,
"loss": 4.192738056182861,
"step": 413
},
{
"epoch": 0.8117647058823529,
"grad_norm": 14.9388427734375,
"learning_rate": 6.7490118370519356e-06,
"loss": 4.836441993713379,
"step": 414
},
{
"epoch": 0.8137254901960784,
"grad_norm": 12.723301887512207,
"learning_rate": 6.7341237767101375e-06,
"loss": 4.762911319732666,
"step": 415
},
{
"epoch": 0.8156862745098039,
"grad_norm": 14.065217018127441,
"learning_rate": 6.7192182184296725e-06,
"loss": 4.191904067993164,
"step": 416
},
{
"epoch": 0.8176470588235294,
"grad_norm": 17.27406120300293,
"learning_rate": 6.704295312613037e-06,
"loss": 4.0509748458862305,
"step": 417
},
{
"epoch": 0.8196078431372549,
"grad_norm": 6.344608783721924,
"learning_rate": 6.689355209837769e-06,
"loss": 4.776634216308594,
"step": 418
},
{
"epoch": 0.8215686274509804,
"grad_norm": 5.471117973327637,
"learning_rate": 6.674398060854931e-06,
"loss": 4.332463264465332,
"step": 419
},
{
"epoch": 0.8235294117647058,
"grad_norm": 36.830535888671875,
"learning_rate": 6.65942401658759e-06,
"loss": 4.420635223388672,
"step": 420
},
{
"epoch": 0.8254901960784313,
"grad_norm": 21.876571655273438,
"learning_rate": 6.644433228129288e-06,
"loss": 4.5363006591796875,
"step": 421
},
{
"epoch": 0.8274509803921568,
"grad_norm": 20.565128326416016,
"learning_rate": 6.6294258467425256e-06,
"loss": 4.202686309814453,
"step": 422
},
{
"epoch": 0.8294117647058824,
"grad_norm": 20.264272689819336,
"learning_rate": 6.614402023857231e-06,
"loss": 4.074251174926758,
"step": 423
},
{
"epoch": 0.8313725490196079,
"grad_norm": 43.58699417114258,
"learning_rate": 6.599361911069235e-06,
"loss": 4.196136474609375,
"step": 424
},
{
"epoch": 0.8333333333333334,
"grad_norm": 339.2962951660156,
"learning_rate": 6.584305660138734e-06,
"loss": 4.362434387207031,
"step": 425
},
{
"epoch": 0.8352941176470589,
"grad_norm": 10.97144603729248,
"learning_rate": 6.569233422988771e-06,
"loss": 4.28524923324585,
"step": 426
},
{
"epoch": 0.8372549019607843,
"grad_norm": 14.54588508605957,
"learning_rate": 6.554145351703689e-06,
"loss": 4.445708274841309,
"step": 427
},
{
"epoch": 0.8392156862745098,
"grad_norm": 60.744747161865234,
"learning_rate": 6.539041598527612e-06,
"loss": 4.457370281219482,
"step": 428
},
{
"epoch": 0.8411764705882353,
"grad_norm": 200.7458038330078,
"learning_rate": 6.523922315862887e-06,
"loss": 3.9219799041748047,
"step": 429
},
{
"epoch": 0.8431372549019608,
"grad_norm": 5.749436378479004,
"learning_rate": 6.508787656268573e-06,
"loss": 4.174837112426758,
"step": 430
},
{
"epoch": 0.8450980392156863,
"grad_norm": 6.574963092803955,
"learning_rate": 6.4936377724588794e-06,
"loss": 4.107203960418701,
"step": 431
},
{
"epoch": 0.8470588235294118,
"grad_norm": 14.352339744567871,
"learning_rate": 6.478472817301635e-06,
"loss": 4.814848899841309,
"step": 432
},
{
"epoch": 0.8490196078431372,
"grad_norm": 7.782406806945801,
"learning_rate": 6.463292943816747e-06,
"loss": 4.176547050476074,
"step": 433
},
{
"epoch": 0.8509803921568627,
"grad_norm": 62.66047668457031,
"learning_rate": 6.448098305174648e-06,
"loss": 4.225009918212891,
"step": 434
},
{
"epoch": 0.8529411764705882,
"grad_norm": 31.651609420776367,
"learning_rate": 6.4328890546947645e-06,
"loss": 4.503094673156738,
"step": 435
},
{
"epoch": 0.8549019607843137,
"grad_norm": 6.525542736053467,
"learning_rate": 6.417665345843952e-06,
"loss": 3.9161875247955322,
"step": 436
},
{
"epoch": 0.8568627450980392,
"grad_norm": 7.290045738220215,
"learning_rate": 6.402427332234965e-06,
"loss": 4.532121658325195,
"step": 437
},
{
"epoch": 0.8588235294117647,
"grad_norm": 7.032063007354736,
"learning_rate": 6.387175167624894e-06,
"loss": 4.433841705322266,
"step": 438
},
{
"epoch": 0.8607843137254902,
"grad_norm": 8.59839916229248,
"learning_rate": 6.371909005913618e-06,
"loss": 4.42300271987915,
"step": 439
},
{
"epoch": 0.8627450980392157,
"grad_norm": 6.988096714019775,
"learning_rate": 6.3566290011422515e-06,
"loss": 4.233433723449707,
"step": 440
},
{
"epoch": 0.8647058823529412,
"grad_norm": 4.5652313232421875,
"learning_rate": 6.341335307491596e-06,
"loss": 3.88616943359375,
"step": 441
},
{
"epoch": 0.8666666666666667,
"grad_norm": 4.380303382873535,
"learning_rate": 6.32602807928057e-06,
"loss": 4.275338649749756,
"step": 442
},
{
"epoch": 0.8686274509803922,
"grad_norm": 13.967598915100098,
"learning_rate": 6.310707470964668e-06,
"loss": 4.245949745178223,
"step": 443
},
{
"epoch": 0.8705882352941177,
"grad_norm": 6.2936506271362305,
"learning_rate": 6.29537363713439e-06,
"loss": 4.214742660522461,
"step": 444
},
{
"epoch": 0.8725490196078431,
"grad_norm": 6.2614970207214355,
"learning_rate": 6.280026732513689e-06,
"loss": 4.165116310119629,
"step": 445
},
{
"epoch": 0.8745098039215686,
"grad_norm": 13.735063552856445,
"learning_rate": 6.264666911958404e-06,
"loss": 4.574287414550781,
"step": 446
},
{
"epoch": 0.8764705882352941,
"grad_norm": 13.999856948852539,
"learning_rate": 6.249294330454705e-06,
"loss": 4.115286827087402,
"step": 447
},
{
"epoch": 0.8784313725490196,
"grad_norm": 9.688719749450684,
"learning_rate": 6.233909143117521e-06,
"loss": 4.250068664550781,
"step": 448
},
{
"epoch": 0.8803921568627451,
"grad_norm": 32.32737350463867,
"learning_rate": 6.21851150518898e-06,
"loss": 4.899062156677246,
"step": 449
},
{
"epoch": 0.8823529411764706,
"grad_norm": 9.234833717346191,
"learning_rate": 6.203101572036839e-06,
"loss": 4.489043235778809,
"step": 450
},
{
"epoch": 0.884313725490196,
"grad_norm": 19.185062408447266,
"learning_rate": 6.18767949915292e-06,
"loss": 4.504437446594238,
"step": 451
},
{
"epoch": 0.8862745098039215,
"grad_norm": 10.716368675231934,
"learning_rate": 6.172245442151541e-06,
"loss": 4.1727800369262695,
"step": 452
},
{
"epoch": 0.888235294117647,
"grad_norm": 12.19927978515625,
"learning_rate": 6.156799556767941e-06,
"loss": 4.317794322967529,
"step": 453
},
{
"epoch": 0.8901960784313725,
"grad_norm": 22.349681854248047,
"learning_rate": 6.141341998856711e-06,
"loss": 4.239640235900879,
"step": 454
},
{
"epoch": 0.8921568627450981,
"grad_norm": 9.220239639282227,
"learning_rate": 6.125872924390226e-06,
"loss": 4.351978778839111,
"step": 455
},
{
"epoch": 0.8941176470588236,
"grad_norm": 30.204912185668945,
"learning_rate": 6.110392489457067e-06,
"loss": 4.01255989074707,
"step": 456
},
{
"epoch": 0.8960784313725491,
"grad_norm": 34.6622428894043,
"learning_rate": 6.094900850260439e-06,
"loss": 4.116291046142578,
"step": 457
},
{
"epoch": 0.8980392156862745,
"grad_norm": 53.01099395751953,
"learning_rate": 6.079398163116611e-06,
"loss": 4.077776908874512,
"step": 458
},
{
"epoch": 0.9,
"grad_norm": 13.728572845458984,
"learning_rate": 6.063884584453326e-06,
"loss": 4.205946922302246,
"step": 459
},
{
"epoch": 0.9019607843137255,
"grad_norm": 27.227886199951172,
"learning_rate": 6.048360270808226e-06,
"loss": 4.454074859619141,
"step": 460
},
{
"epoch": 0.903921568627451,
"grad_norm": 1187.58203125,
"learning_rate": 6.032825378827273e-06,
"loss": 4.106019973754883,
"step": 461
},
{
"epoch": 0.9058823529411765,
"grad_norm": 88.40968322753906,
"learning_rate": 6.0172800652631706e-06,
"loss": 4.531243324279785,
"step": 462
},
{
"epoch": 0.907843137254902,
"grad_norm": 76.25667572021484,
"learning_rate": 6.001724486973774e-06,
"loss": 4.6429853439331055,
"step": 463
},
{
"epoch": 0.9098039215686274,
"grad_norm": 104.32091522216797,
"learning_rate": 5.986158800920523e-06,
"loss": 4.6695685386657715,
"step": 464
},
{
"epoch": 0.9117647058823529,
"grad_norm": 101.0859603881836,
"learning_rate": 5.970583164166838e-06,
"loss": 4.725400447845459,
"step": 465
},
{
"epoch": 0.9137254901960784,
"grad_norm": 67.9173583984375,
"learning_rate": 5.954997733876552e-06,
"loss": 4.044053077697754,
"step": 466
},
{
"epoch": 0.9156862745098039,
"grad_norm": 258.1031494140625,
"learning_rate": 5.939402667312316e-06,
"loss": 4.8689799308776855,
"step": 467
},
{
"epoch": 0.9176470588235294,
"grad_norm": 36.58251953125,
"learning_rate": 5.923798121834016e-06,
"loss": 4.843733787536621,
"step": 468
},
{
"epoch": 0.9196078431372549,
"grad_norm": 22.516658782958984,
"learning_rate": 5.908184254897183e-06,
"loss": 4.461618423461914,
"step": 469
},
{
"epoch": 0.9215686274509803,
"grad_norm": 54.69601821899414,
"learning_rate": 5.892561224051403e-06,
"loss": 4.002848148345947,
"step": 470
},
{
"epoch": 0.9235294117647059,
"grad_norm": 34.609493255615234,
"learning_rate": 5.876929186938734e-06,
"loss": 4.39974308013916,
"step": 471
},
{
"epoch": 0.9254901960784314,
"grad_norm": 158.92474365234375,
"learning_rate": 5.861288301292103e-06,
"loss": 4.6093525886535645,
"step": 472
},
{
"epoch": 0.9274509803921569,
"grad_norm": 358.99554443359375,
"learning_rate": 5.845638724933729e-06,
"loss": 4.667701721191406,
"step": 473
},
{
"epoch": 0.9294117647058824,
"grad_norm": 52.98344039916992,
"learning_rate": 5.82998061577352e-06,
"loss": 3.9046082496643066,
"step": 474
},
{
"epoch": 0.9313725490196079,
"grad_norm": 11.510079383850098,
"learning_rate": 5.814314131807486e-06,
"loss": 4.29728889465332,
"step": 475
},
{
"epoch": 0.9333333333333333,
"grad_norm": 52.69283676147461,
"learning_rate": 5.798639431116135e-06,
"loss": 4.667060375213623,
"step": 476
},
{
"epoch": 0.9352941176470588,
"grad_norm": 27.900066375732422,
"learning_rate": 5.782956671862895e-06,
"loss": 4.403017044067383,
"step": 477
},
{
"epoch": 0.9372549019607843,
"grad_norm": 8.282060623168945,
"learning_rate": 5.767266012292496e-06,
"loss": 4.352850914001465,
"step": 478
},
{
"epoch": 0.9392156862745098,
"grad_norm": 9.053024291992188,
"learning_rate": 5.751567610729398e-06,
"loss": 4.167609691619873,
"step": 479
},
{
"epoch": 0.9411764705882353,
"grad_norm": 39.519447326660156,
"learning_rate": 5.735861625576167e-06,
"loss": 4.330041885375977,
"step": 480
},
{
"epoch": 0.9431372549019608,
"grad_norm": 95.29867553710938,
"learning_rate": 5.720148215311902e-06,
"loss": 4.768982887268066,
"step": 481
},
{
"epoch": 0.9450980392156862,
"grad_norm": 23.550722122192383,
"learning_rate": 5.7044275384906164e-06,
"loss": 4.328036308288574,
"step": 482
},
{
"epoch": 0.9470588235294117,
"grad_norm": 30.829458236694336,
"learning_rate": 5.688699753739649e-06,
"loss": 4.415774345397949,
"step": 483
},
{
"epoch": 0.9490196078431372,
"grad_norm": 9.180082321166992,
"learning_rate": 5.672965019758061e-06,
"loss": 4.172072410583496,
"step": 484
},
{
"epoch": 0.9509803921568627,
"grad_norm": 15.722640037536621,
"learning_rate": 5.657223495315031e-06,
"loss": 4.710781574249268,
"step": 485
},
{
"epoch": 0.9529411764705882,
"grad_norm": 31.03820037841797,
"learning_rate": 5.641475339248257e-06,
"loss": 4.139684200286865,
"step": 486
},
{
"epoch": 0.9549019607843138,
"grad_norm": 2897.17333984375,
"learning_rate": 5.625720710462352e-06,
"loss": 4.369198322296143,
"step": 487
},
{
"epoch": 0.9568627450980393,
"grad_norm": 30.0218563079834,
"learning_rate": 5.609959767927247e-06,
"loss": 4.735037326812744,
"step": 488
},
{
"epoch": 0.9588235294117647,
"grad_norm": 16.19695281982422,
"learning_rate": 5.594192670676568e-06,
"loss": 4.406367301940918,
"step": 489
},
{
"epoch": 0.9607843137254902,
"grad_norm": 15.989925384521484,
"learning_rate": 5.578419577806058e-06,
"loss": 4.398193836212158,
"step": 490
},
{
"epoch": 0.9627450980392157,
"grad_norm": 30.709712982177734,
"learning_rate": 5.562640648471951e-06,
"loss": 4.407459735870361,
"step": 491
},
{
"epoch": 0.9647058823529412,
"grad_norm": 48.51979446411133,
"learning_rate": 5.546856041889374e-06,
"loss": 4.81352424621582,
"step": 492
},
{
"epoch": 0.9666666666666667,
"grad_norm": 12.244742393493652,
"learning_rate": 5.531065917330737e-06,
"loss": 4.10401725769043,
"step": 493
},
{
"epoch": 0.9686274509803922,
"grad_norm": 6.017563343048096,
"learning_rate": 5.515270434124136e-06,
"loss": 4.171624183654785,
"step": 494
},
{
"epoch": 0.9705882352941176,
"grad_norm": 5.285996437072754,
"learning_rate": 5.499469751651728e-06,
"loss": 4.021862030029297,
"step": 495
},
{
"epoch": 0.9725490196078431,
"grad_norm": 8.994955062866211,
"learning_rate": 5.483664029348141e-06,
"loss": 4.736730575561523,
"step": 496
},
{
"epoch": 0.9745098039215686,
"grad_norm": 5.072285175323486,
"learning_rate": 5.467853426698852e-06,
"loss": 4.152073383331299,
"step": 497
},
{
"epoch": 0.9764705882352941,
"grad_norm": 5.994180202484131,
"learning_rate": 5.452038103238582e-06,
"loss": 4.670340538024902,
"step": 498
},
{
"epoch": 0.9784313725490196,
"grad_norm": 10.215426445007324,
"learning_rate": 5.43621821854969e-06,
"loss": 4.869247913360596,
"step": 499
},
{
"epoch": 0.9803921568627451,
"grad_norm": 5.956417083740234,
"learning_rate": 5.420393932260557e-06,
"loss": 4.4339447021484375,
"step": 500
},
{
"epoch": 0.9823529411764705,
"grad_norm": 4.450964450836182,
"learning_rate": 5.404565404043977e-06,
"loss": 4.339052677154541,
"step": 501
},
{
"epoch": 0.984313725490196,
"grad_norm": 12.46487045288086,
"learning_rate": 5.388732793615551e-06,
"loss": 4.585082054138184,
"step": 502
},
{
"epoch": 0.9862745098039216,
"grad_norm": 10.983120918273926,
"learning_rate": 5.372896260732065e-06,
"loss": 4.340575695037842,
"step": 503
},
{
"epoch": 0.9882352941176471,
"grad_norm": 11.863996505737305,
"learning_rate": 5.357055965189888e-06,
"loss": 4.341768264770508,
"step": 504
},
{
"epoch": 0.9901960784313726,
"grad_norm": 19.986562728881836,
"learning_rate": 5.341212066823356e-06,
"loss": 4.160877704620361,
"step": 505
},
{
"epoch": 0.9921568627450981,
"grad_norm": 6.726130962371826,
"learning_rate": 5.325364725503155e-06,
"loss": 5.01240348815918,
"step": 506
},
{
"epoch": 0.9941176470588236,
"grad_norm": 17.171804428100586,
"learning_rate": 5.3095141011347155e-06,
"loss": 4.031642913818359,
"step": 507
},
{
"epoch": 0.996078431372549,
"grad_norm": 5.7268524169921875,
"learning_rate": 5.2936603536565915e-06,
"loss": 4.291274547576904,
"step": 508
},
{
"epoch": 0.9980392156862745,
"grad_norm": 7.725809097290039,
"learning_rate": 5.277803643038855e-06,
"loss": 4.2685546875,
"step": 509
},
{
"epoch": 1.0,
"grad_norm": 6.054961681365967,
"learning_rate": 5.261944129281474e-06,
"loss": 4.074590682983398,
"step": 510
}
],
"logging_steps": 1,
"max_steps": 1020,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.411555016453128e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}