Text Generation
Safetensors
English
mistral
conversational
Mistral-150M-Instruct / trainer_state.json
ZMC2019's picture
Upload folder using huggingface_hub
0ffed95 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2063,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004847309743092584,
"grad_norm": 7.171422004699707,
"learning_rate": 2.4154589371980677e-07,
"loss": 3.4607,
"step": 1
},
{
"epoch": 0.0009694619486185168,
"grad_norm": 6.8233819007873535,
"learning_rate": 4.830917874396135e-07,
"loss": 3.2371,
"step": 2
},
{
"epoch": 0.001454192922927775,
"grad_norm": 7.979361057281494,
"learning_rate": 7.246376811594203e-07,
"loss": 3.3669,
"step": 3
},
{
"epoch": 0.0019389238972370335,
"grad_norm": 6.270462512969971,
"learning_rate": 9.66183574879227e-07,
"loss": 3.3027,
"step": 4
},
{
"epoch": 0.0024236548715462916,
"grad_norm": 6.778450012207031,
"learning_rate": 1.2077294685990338e-06,
"loss": 3.2712,
"step": 5
},
{
"epoch": 0.00290838584585555,
"grad_norm": 6.3815741539001465,
"learning_rate": 1.4492753623188406e-06,
"loss": 3.3691,
"step": 6
},
{
"epoch": 0.0033931168201648087,
"grad_norm": 6.661952018737793,
"learning_rate": 1.6908212560386474e-06,
"loss": 3.3269,
"step": 7
},
{
"epoch": 0.003877847794474067,
"grad_norm": 6.819033145904541,
"learning_rate": 1.932367149758454e-06,
"loss": 3.2467,
"step": 8
},
{
"epoch": 0.004362578768783325,
"grad_norm": 5.535458564758301,
"learning_rate": 2.173913043478261e-06,
"loss": 3.1249,
"step": 9
},
{
"epoch": 0.004847309743092583,
"grad_norm": 6.2597880363464355,
"learning_rate": 2.4154589371980677e-06,
"loss": 3.4761,
"step": 10
},
{
"epoch": 0.005332040717401842,
"grad_norm": 6.067697525024414,
"learning_rate": 2.6570048309178746e-06,
"loss": 3.3821,
"step": 11
},
{
"epoch": 0.0058167716917111,
"grad_norm": 5.567847728729248,
"learning_rate": 2.898550724637681e-06,
"loss": 3.0755,
"step": 12
},
{
"epoch": 0.006301502666020358,
"grad_norm": 4.956480026245117,
"learning_rate": 3.140096618357488e-06,
"loss": 3.0035,
"step": 13
},
{
"epoch": 0.0067862336403296175,
"grad_norm": 6.243023872375488,
"learning_rate": 3.3816425120772947e-06,
"loss": 3.1336,
"step": 14
},
{
"epoch": 0.007270964614638876,
"grad_norm": 5.114922046661377,
"learning_rate": 3.6231884057971017e-06,
"loss": 2.9698,
"step": 15
},
{
"epoch": 0.007755695588948134,
"grad_norm": 5.099619388580322,
"learning_rate": 3.864734299516908e-06,
"loss": 3.022,
"step": 16
},
{
"epoch": 0.008240426563257392,
"grad_norm": 5.203329563140869,
"learning_rate": 4.106280193236716e-06,
"loss": 2.8866,
"step": 17
},
{
"epoch": 0.00872515753756665,
"grad_norm": 4.828646659851074,
"learning_rate": 4.347826086956522e-06,
"loss": 2.7138,
"step": 18
},
{
"epoch": 0.009209888511875909,
"grad_norm": 5.072122573852539,
"learning_rate": 4.589371980676329e-06,
"loss": 2.8717,
"step": 19
},
{
"epoch": 0.009694619486185167,
"grad_norm": 4.990163326263428,
"learning_rate": 4.830917874396135e-06,
"loss": 2.8924,
"step": 20
},
{
"epoch": 0.010179350460494426,
"grad_norm": 4.578018665313721,
"learning_rate": 5.072463768115943e-06,
"loss": 2.5511,
"step": 21
},
{
"epoch": 0.010664081434803683,
"grad_norm": 4.484613418579102,
"learning_rate": 5.314009661835749e-06,
"loss": 2.6689,
"step": 22
},
{
"epoch": 0.011148812409112942,
"grad_norm": 4.256691932678223,
"learning_rate": 5.555555555555556e-06,
"loss": 2.5432,
"step": 23
},
{
"epoch": 0.0116335433834222,
"grad_norm": 4.374969005584717,
"learning_rate": 5.797101449275362e-06,
"loss": 2.6499,
"step": 24
},
{
"epoch": 0.012118274357731459,
"grad_norm": 4.293645858764648,
"learning_rate": 6.038647342995169e-06,
"loss": 2.2692,
"step": 25
},
{
"epoch": 0.012603005332040717,
"grad_norm": 3.59814190864563,
"learning_rate": 6.280193236714976e-06,
"loss": 2.3972,
"step": 26
},
{
"epoch": 0.013087736306349976,
"grad_norm": 4.462075710296631,
"learning_rate": 6.521739130434783e-06,
"loss": 2.3534,
"step": 27
},
{
"epoch": 0.013572467280659235,
"grad_norm": 3.837050437927246,
"learning_rate": 6.7632850241545894e-06,
"loss": 2.1863,
"step": 28
},
{
"epoch": 0.014057198254968492,
"grad_norm": 3.4918296337127686,
"learning_rate": 7.004830917874397e-06,
"loss": 2.0585,
"step": 29
},
{
"epoch": 0.014541929229277752,
"grad_norm": 3.0016872882843018,
"learning_rate": 7.246376811594203e-06,
"loss": 2.0722,
"step": 30
},
{
"epoch": 0.015026660203587009,
"grad_norm": 3.585313320159912,
"learning_rate": 7.48792270531401e-06,
"loss": 2.2528,
"step": 31
},
{
"epoch": 0.015511391177896268,
"grad_norm": 3.0016026496887207,
"learning_rate": 7.729468599033817e-06,
"loss": 1.9671,
"step": 32
},
{
"epoch": 0.015996122152205527,
"grad_norm": 2.7799417972564697,
"learning_rate": 7.971014492753623e-06,
"loss": 2.1397,
"step": 33
},
{
"epoch": 0.016480853126514785,
"grad_norm": 2.6177585124969482,
"learning_rate": 8.212560386473431e-06,
"loss": 2.0716,
"step": 34
},
{
"epoch": 0.016965584100824042,
"grad_norm": 2.6502163410186768,
"learning_rate": 8.454106280193238e-06,
"loss": 2.0137,
"step": 35
},
{
"epoch": 0.0174503150751333,
"grad_norm": 2.5228984355926514,
"learning_rate": 8.695652173913044e-06,
"loss": 1.7984,
"step": 36
},
{
"epoch": 0.01793504604944256,
"grad_norm": 2.638493537902832,
"learning_rate": 8.93719806763285e-06,
"loss": 1.8365,
"step": 37
},
{
"epoch": 0.018419777023751818,
"grad_norm": 2.337646007537842,
"learning_rate": 9.178743961352658e-06,
"loss": 1.8024,
"step": 38
},
{
"epoch": 0.018904507998061076,
"grad_norm": 2.2700746059417725,
"learning_rate": 9.420289855072464e-06,
"loss": 1.8987,
"step": 39
},
{
"epoch": 0.019389238972370333,
"grad_norm": 2.020162343978882,
"learning_rate": 9.66183574879227e-06,
"loss": 1.6793,
"step": 40
},
{
"epoch": 0.019873969946679594,
"grad_norm": 2.015110492706299,
"learning_rate": 9.903381642512077e-06,
"loss": 1.8938,
"step": 41
},
{
"epoch": 0.02035870092098885,
"grad_norm": 2.2413482666015625,
"learning_rate": 1.0144927536231885e-05,
"loss": 2.1364,
"step": 42
},
{
"epoch": 0.02084343189529811,
"grad_norm": 1.931469202041626,
"learning_rate": 1.0386473429951692e-05,
"loss": 1.9931,
"step": 43
},
{
"epoch": 0.021328162869607366,
"grad_norm": 1.8747423887252808,
"learning_rate": 1.0628019323671499e-05,
"loss": 1.6649,
"step": 44
},
{
"epoch": 0.021812893843916627,
"grad_norm": 1.9672399759292603,
"learning_rate": 1.0869565217391305e-05,
"loss": 1.8262,
"step": 45
},
{
"epoch": 0.022297624818225885,
"grad_norm": 1.9558823108673096,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.8059,
"step": 46
},
{
"epoch": 0.022782355792535142,
"grad_norm": 1.8525190353393555,
"learning_rate": 1.1352657004830918e-05,
"loss": 1.721,
"step": 47
},
{
"epoch": 0.0232670867668444,
"grad_norm": 1.8605695962905884,
"learning_rate": 1.1594202898550725e-05,
"loss": 1.9044,
"step": 48
},
{
"epoch": 0.02375181774115366,
"grad_norm": 2.524609088897705,
"learning_rate": 1.1835748792270531e-05,
"loss": 1.6106,
"step": 49
},
{
"epoch": 0.024236548715462918,
"grad_norm": 1.8198078870773315,
"learning_rate": 1.2077294685990338e-05,
"loss": 1.822,
"step": 50
},
{
"epoch": 0.024721279689772176,
"grad_norm": 1.7705700397491455,
"learning_rate": 1.2318840579710146e-05,
"loss": 1.6892,
"step": 51
},
{
"epoch": 0.025206010664081433,
"grad_norm": 1.9049899578094482,
"learning_rate": 1.2560386473429953e-05,
"loss": 1.7649,
"step": 52
},
{
"epoch": 0.025690741638390694,
"grad_norm": 1.835711121559143,
"learning_rate": 1.2801932367149761e-05,
"loss": 1.7394,
"step": 53
},
{
"epoch": 0.02617547261269995,
"grad_norm": 1.6949882507324219,
"learning_rate": 1.3043478260869566e-05,
"loss": 1.6299,
"step": 54
},
{
"epoch": 0.02666020358700921,
"grad_norm": 1.7638367414474487,
"learning_rate": 1.3285024154589374e-05,
"loss": 1.5448,
"step": 55
},
{
"epoch": 0.02714493456131847,
"grad_norm": 2.0102837085723877,
"learning_rate": 1.3526570048309179e-05,
"loss": 1.9325,
"step": 56
},
{
"epoch": 0.027629665535627727,
"grad_norm": 1.7918657064437866,
"learning_rate": 1.3768115942028985e-05,
"loss": 1.6411,
"step": 57
},
{
"epoch": 0.028114396509936985,
"grad_norm": 2.3990306854248047,
"learning_rate": 1.4009661835748794e-05,
"loss": 1.8308,
"step": 58
},
{
"epoch": 0.028599127484246242,
"grad_norm": 2.5733494758605957,
"learning_rate": 1.4251207729468599e-05,
"loss": 2.1079,
"step": 59
},
{
"epoch": 0.029083858458555503,
"grad_norm": 1.8608803749084473,
"learning_rate": 1.4492753623188407e-05,
"loss": 1.9755,
"step": 60
},
{
"epoch": 0.02956858943286476,
"grad_norm": 1.7092028856277466,
"learning_rate": 1.4734299516908212e-05,
"loss": 1.6123,
"step": 61
},
{
"epoch": 0.030053320407174018,
"grad_norm": 1.6902247667312622,
"learning_rate": 1.497584541062802e-05,
"loss": 1.6213,
"step": 62
},
{
"epoch": 0.030538051381483276,
"grad_norm": 1.5993880033493042,
"learning_rate": 1.5217391304347828e-05,
"loss": 1.5819,
"step": 63
},
{
"epoch": 0.031022782355792537,
"grad_norm": 1.6327762603759766,
"learning_rate": 1.5458937198067633e-05,
"loss": 1.6991,
"step": 64
},
{
"epoch": 0.031507513330101794,
"grad_norm": 1.6658787727355957,
"learning_rate": 1.570048309178744e-05,
"loss": 1.9614,
"step": 65
},
{
"epoch": 0.031992244304411055,
"grad_norm": 1.5906341075897217,
"learning_rate": 1.5942028985507246e-05,
"loss": 1.5806,
"step": 66
},
{
"epoch": 0.03247697527872031,
"grad_norm": 1.7006059885025024,
"learning_rate": 1.6183574879227054e-05,
"loss": 2.1204,
"step": 67
},
{
"epoch": 0.03296170625302957,
"grad_norm": 1.8104106187820435,
"learning_rate": 1.6425120772946863e-05,
"loss": 1.7377,
"step": 68
},
{
"epoch": 0.033446437227338824,
"grad_norm": 1.7204387187957764,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.6536,
"step": 69
},
{
"epoch": 0.033931168201648085,
"grad_norm": 1.77187979221344,
"learning_rate": 1.6908212560386476e-05,
"loss": 1.7974,
"step": 70
},
{
"epoch": 0.034415899175957346,
"grad_norm": 1.7312054634094238,
"learning_rate": 1.714975845410628e-05,
"loss": 1.6036,
"step": 71
},
{
"epoch": 0.0349006301502666,
"grad_norm": 1.7339930534362793,
"learning_rate": 1.739130434782609e-05,
"loss": 1.532,
"step": 72
},
{
"epoch": 0.03538536112457586,
"grad_norm": 1.6795563697814941,
"learning_rate": 1.7632850241545894e-05,
"loss": 1.7439,
"step": 73
},
{
"epoch": 0.03587009209888512,
"grad_norm": 1.6724114418029785,
"learning_rate": 1.78743961352657e-05,
"loss": 1.7637,
"step": 74
},
{
"epoch": 0.036354823073194376,
"grad_norm": 1.5222914218902588,
"learning_rate": 1.8115942028985507e-05,
"loss": 1.4971,
"step": 75
},
{
"epoch": 0.036839554047503636,
"grad_norm": 1.6842753887176514,
"learning_rate": 1.8357487922705315e-05,
"loss": 1.525,
"step": 76
},
{
"epoch": 0.03732428502181289,
"grad_norm": 1.6687383651733398,
"learning_rate": 1.859903381642512e-05,
"loss": 1.9496,
"step": 77
},
{
"epoch": 0.03780901599612215,
"grad_norm": 1.8582350015640259,
"learning_rate": 1.8840579710144928e-05,
"loss": 1.733,
"step": 78
},
{
"epoch": 0.03829374697043141,
"grad_norm": 1.554795265197754,
"learning_rate": 1.9082125603864733e-05,
"loss": 1.492,
"step": 79
},
{
"epoch": 0.038778477944740666,
"grad_norm": 1.7551424503326416,
"learning_rate": 1.932367149758454e-05,
"loss": 1.898,
"step": 80
},
{
"epoch": 0.03926320891904993,
"grad_norm": 1.9943029880523682,
"learning_rate": 1.956521739130435e-05,
"loss": 1.6975,
"step": 81
},
{
"epoch": 0.03974793989335919,
"grad_norm": 1.6358810663223267,
"learning_rate": 1.9806763285024154e-05,
"loss": 1.4857,
"step": 82
},
{
"epoch": 0.04023267086766844,
"grad_norm": 1.7312710285186768,
"learning_rate": 2.0048309178743963e-05,
"loss": 1.5569,
"step": 83
},
{
"epoch": 0.0407174018419777,
"grad_norm": 1.6241320371627808,
"learning_rate": 2.028985507246377e-05,
"loss": 1.734,
"step": 84
},
{
"epoch": 0.041202132816286964,
"grad_norm": 1.6983885765075684,
"learning_rate": 2.0531400966183576e-05,
"loss": 1.786,
"step": 85
},
{
"epoch": 0.04168686379059622,
"grad_norm": 1.615333080291748,
"learning_rate": 2.0772946859903384e-05,
"loss": 1.8196,
"step": 86
},
{
"epoch": 0.04217159476490548,
"grad_norm": 1.6279438734054565,
"learning_rate": 2.101449275362319e-05,
"loss": 1.5445,
"step": 87
},
{
"epoch": 0.04265632573921473,
"grad_norm": 1.582491397857666,
"learning_rate": 2.1256038647342997e-05,
"loss": 1.456,
"step": 88
},
{
"epoch": 0.043141056713523994,
"grad_norm": 1.701452612876892,
"learning_rate": 2.1497584541062805e-05,
"loss": 1.8605,
"step": 89
},
{
"epoch": 0.043625787687833255,
"grad_norm": 1.845126986503601,
"learning_rate": 2.173913043478261e-05,
"loss": 1.5401,
"step": 90
},
{
"epoch": 0.04411051866214251,
"grad_norm": 1.6799591779708862,
"learning_rate": 2.198067632850242e-05,
"loss": 1.7081,
"step": 91
},
{
"epoch": 0.04459524963645177,
"grad_norm": 1.7880574464797974,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.5429,
"step": 92
},
{
"epoch": 0.04507998061076103,
"grad_norm": 1.625308632850647,
"learning_rate": 2.246376811594203e-05,
"loss": 1.5608,
"step": 93
},
{
"epoch": 0.045564711585070285,
"grad_norm": 1.8478410243988037,
"learning_rate": 2.2705314009661836e-05,
"loss": 2.0223,
"step": 94
},
{
"epoch": 0.046049442559379546,
"grad_norm": 1.5162346363067627,
"learning_rate": 2.294685990338164e-05,
"loss": 1.2781,
"step": 95
},
{
"epoch": 0.0465341735336888,
"grad_norm": 1.854674220085144,
"learning_rate": 2.318840579710145e-05,
"loss": 1.6944,
"step": 96
},
{
"epoch": 0.04701890450799806,
"grad_norm": 1.7069923877716064,
"learning_rate": 2.3429951690821258e-05,
"loss": 1.6425,
"step": 97
},
{
"epoch": 0.04750363548230732,
"grad_norm": 1.5543208122253418,
"learning_rate": 2.3671497584541063e-05,
"loss": 1.4902,
"step": 98
},
{
"epoch": 0.047988366456616575,
"grad_norm": 1.5168403387069702,
"learning_rate": 2.391304347826087e-05,
"loss": 1.3613,
"step": 99
},
{
"epoch": 0.048473097430925836,
"grad_norm": 1.7112879753112793,
"learning_rate": 2.4154589371980676e-05,
"loss": 1.9106,
"step": 100
},
{
"epoch": 0.0489578284052351,
"grad_norm": 1.8067775964736938,
"learning_rate": 2.4396135265700484e-05,
"loss": 1.8777,
"step": 101
},
{
"epoch": 0.04944255937954435,
"grad_norm": 1.528732419013977,
"learning_rate": 2.4637681159420292e-05,
"loss": 1.4165,
"step": 102
},
{
"epoch": 0.04992729035385361,
"grad_norm": 1.5768786668777466,
"learning_rate": 2.4879227053140097e-05,
"loss": 1.5526,
"step": 103
},
{
"epoch": 0.050412021328162866,
"grad_norm": 1.665514349937439,
"learning_rate": 2.5120772946859905e-05,
"loss": 1.5298,
"step": 104
},
{
"epoch": 0.05089675230247213,
"grad_norm": 1.59950852394104,
"learning_rate": 2.5362318840579714e-05,
"loss": 1.6167,
"step": 105
},
{
"epoch": 0.05138148327678139,
"grad_norm": 1.6841107606887817,
"learning_rate": 2.5603864734299522e-05,
"loss": 1.6686,
"step": 106
},
{
"epoch": 0.05186621425109064,
"grad_norm": 1.6083794832229614,
"learning_rate": 2.5845410628019323e-05,
"loss": 1.3828,
"step": 107
},
{
"epoch": 0.0523509452253999,
"grad_norm": 1.6372400522232056,
"learning_rate": 2.608695652173913e-05,
"loss": 1.7482,
"step": 108
},
{
"epoch": 0.052835676199709164,
"grad_norm": 1.7453114986419678,
"learning_rate": 2.632850241545894e-05,
"loss": 1.769,
"step": 109
},
{
"epoch": 0.05332040717401842,
"grad_norm": 1.6545413732528687,
"learning_rate": 2.6570048309178748e-05,
"loss": 2.123,
"step": 110
},
{
"epoch": 0.05380513814832768,
"grad_norm": 1.8229496479034424,
"learning_rate": 2.6811594202898553e-05,
"loss": 1.9621,
"step": 111
},
{
"epoch": 0.05428986912263694,
"grad_norm": 1.640529990196228,
"learning_rate": 2.7053140096618358e-05,
"loss": 1.8864,
"step": 112
},
{
"epoch": 0.054774600096946194,
"grad_norm": 1.4900970458984375,
"learning_rate": 2.7294685990338166e-05,
"loss": 1.9581,
"step": 113
},
{
"epoch": 0.055259331071255455,
"grad_norm": 1.60316002368927,
"learning_rate": 2.753623188405797e-05,
"loss": 1.4996,
"step": 114
},
{
"epoch": 0.05574406204556471,
"grad_norm": 1.8271881341934204,
"learning_rate": 2.777777777777778e-05,
"loss": 1.8359,
"step": 115
},
{
"epoch": 0.05622879301987397,
"grad_norm": 1.6188838481903076,
"learning_rate": 2.8019323671497587e-05,
"loss": 1.778,
"step": 116
},
{
"epoch": 0.05671352399418323,
"grad_norm": 1.672446608543396,
"learning_rate": 2.826086956521739e-05,
"loss": 1.9118,
"step": 117
},
{
"epoch": 0.057198254968492485,
"grad_norm": 1.725005030632019,
"learning_rate": 2.8502415458937197e-05,
"loss": 1.4782,
"step": 118
},
{
"epoch": 0.057682985942801746,
"grad_norm": 1.6945290565490723,
"learning_rate": 2.8743961352657005e-05,
"loss": 1.4347,
"step": 119
},
{
"epoch": 0.058167716917111006,
"grad_norm": 1.6939204931259155,
"learning_rate": 2.8985507246376814e-05,
"loss": 1.6457,
"step": 120
},
{
"epoch": 0.05865244789142026,
"grad_norm": 1.7131168842315674,
"learning_rate": 2.9227053140096622e-05,
"loss": 1.8858,
"step": 121
},
{
"epoch": 0.05913717886572952,
"grad_norm": 1.8440873622894287,
"learning_rate": 2.9468599033816423e-05,
"loss": 2.0418,
"step": 122
},
{
"epoch": 0.059621909840038775,
"grad_norm": 1.6105564832687378,
"learning_rate": 2.971014492753623e-05,
"loss": 1.5901,
"step": 123
},
{
"epoch": 0.060106640814348036,
"grad_norm": 1.7111328840255737,
"learning_rate": 2.995169082125604e-05,
"loss": 1.6965,
"step": 124
},
{
"epoch": 0.0605913717886573,
"grad_norm": 1.6334936618804932,
"learning_rate": 3.0193236714975848e-05,
"loss": 1.7441,
"step": 125
},
{
"epoch": 0.06107610276296655,
"grad_norm": 1.6838350296020508,
"learning_rate": 3.0434782608695656e-05,
"loss": 1.7894,
"step": 126
},
{
"epoch": 0.06156083373727581,
"grad_norm": 1.6963026523590088,
"learning_rate": 3.067632850241546e-05,
"loss": 1.8447,
"step": 127
},
{
"epoch": 0.06204556471158507,
"grad_norm": 1.6151043176651,
"learning_rate": 3.0917874396135266e-05,
"loss": 1.6308,
"step": 128
},
{
"epoch": 0.06253029568589433,
"grad_norm": 1.802123785018921,
"learning_rate": 3.1159420289855074e-05,
"loss": 1.5931,
"step": 129
},
{
"epoch": 0.06301502666020359,
"grad_norm": 9.797577857971191,
"learning_rate": 3.140096618357488e-05,
"loss": 1.4618,
"step": 130
},
{
"epoch": 0.06349975763451285,
"grad_norm": 1.555799126625061,
"learning_rate": 3.164251207729469e-05,
"loss": 1.4462,
"step": 131
},
{
"epoch": 0.06398448860882211,
"grad_norm": 1.631108283996582,
"learning_rate": 3.188405797101449e-05,
"loss": 1.6234,
"step": 132
},
{
"epoch": 0.06446921958313136,
"grad_norm": 1.6092970371246338,
"learning_rate": 3.21256038647343e-05,
"loss": 1.6422,
"step": 133
},
{
"epoch": 0.06495395055744062,
"grad_norm": 1.667833685874939,
"learning_rate": 3.236714975845411e-05,
"loss": 1.6484,
"step": 134
},
{
"epoch": 0.06543868153174988,
"grad_norm": 1.7068734169006348,
"learning_rate": 3.260869565217392e-05,
"loss": 1.4214,
"step": 135
},
{
"epoch": 0.06592341250605914,
"grad_norm": 1.5251973867416382,
"learning_rate": 3.2850241545893725e-05,
"loss": 1.5508,
"step": 136
},
{
"epoch": 0.0664081434803684,
"grad_norm": 1.6476942300796509,
"learning_rate": 3.3091787439613533e-05,
"loss": 1.49,
"step": 137
},
{
"epoch": 0.06689287445467765,
"grad_norm": 1.7182284593582153,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.6699,
"step": 138
},
{
"epoch": 0.06737760542898691,
"grad_norm": 1.5735374689102173,
"learning_rate": 3.357487922705314e-05,
"loss": 1.6516,
"step": 139
},
{
"epoch": 0.06786233640329617,
"grad_norm": 1.6058720350265503,
"learning_rate": 3.381642512077295e-05,
"loss": 1.6413,
"step": 140
},
{
"epoch": 0.06834706737760543,
"grad_norm": 1.596596360206604,
"learning_rate": 3.405797101449276e-05,
"loss": 1.5226,
"step": 141
},
{
"epoch": 0.06883179835191469,
"grad_norm": 1.7365012168884277,
"learning_rate": 3.429951690821256e-05,
"loss": 1.9471,
"step": 142
},
{
"epoch": 0.06931652932622395,
"grad_norm": 1.566796064376831,
"learning_rate": 3.454106280193237e-05,
"loss": 1.6039,
"step": 143
},
{
"epoch": 0.0698012603005332,
"grad_norm": 2.105440616607666,
"learning_rate": 3.478260869565218e-05,
"loss": 1.8893,
"step": 144
},
{
"epoch": 0.07028599127484246,
"grad_norm": 1.701600193977356,
"learning_rate": 3.502415458937198e-05,
"loss": 1.8973,
"step": 145
},
{
"epoch": 0.07077072224915172,
"grad_norm": 1.6586337089538574,
"learning_rate": 3.526570048309179e-05,
"loss": 1.6766,
"step": 146
},
{
"epoch": 0.07125545322346098,
"grad_norm": 1.6110674142837524,
"learning_rate": 3.5507246376811596e-05,
"loss": 1.5639,
"step": 147
},
{
"epoch": 0.07174018419777024,
"grad_norm": 1.6405996084213257,
"learning_rate": 3.57487922705314e-05,
"loss": 1.7666,
"step": 148
},
{
"epoch": 0.07222491517207949,
"grad_norm": 1.6947883367538452,
"learning_rate": 3.5990338164251205e-05,
"loss": 1.578,
"step": 149
},
{
"epoch": 0.07270964614638875,
"grad_norm": 1.6765739917755127,
"learning_rate": 3.6231884057971014e-05,
"loss": 1.7495,
"step": 150
},
{
"epoch": 0.07319437712069801,
"grad_norm": 1.8097703456878662,
"learning_rate": 3.647342995169082e-05,
"loss": 1.3894,
"step": 151
},
{
"epoch": 0.07367910809500727,
"grad_norm": 1.657542109489441,
"learning_rate": 3.671497584541063e-05,
"loss": 1.615,
"step": 152
},
{
"epoch": 0.07416383906931653,
"grad_norm": 1.6826646327972412,
"learning_rate": 3.695652173913043e-05,
"loss": 1.5399,
"step": 153
},
{
"epoch": 0.07464857004362578,
"grad_norm": 1.6431878805160522,
"learning_rate": 3.719806763285024e-05,
"loss": 1.8077,
"step": 154
},
{
"epoch": 0.07513330101793504,
"grad_norm": 1.617641568183899,
"learning_rate": 3.743961352657005e-05,
"loss": 1.7934,
"step": 155
},
{
"epoch": 0.0756180319922443,
"grad_norm": 1.6802802085876465,
"learning_rate": 3.7681159420289856e-05,
"loss": 2.0497,
"step": 156
},
{
"epoch": 0.07610276296655356,
"grad_norm": 1.5995146036148071,
"learning_rate": 3.7922705314009665e-05,
"loss": 1.5435,
"step": 157
},
{
"epoch": 0.07658749394086282,
"grad_norm": 1.6491643190383911,
"learning_rate": 3.8164251207729466e-05,
"loss": 1.6003,
"step": 158
},
{
"epoch": 0.07707222491517209,
"grad_norm": 1.671040654182434,
"learning_rate": 3.8405797101449274e-05,
"loss": 1.6768,
"step": 159
},
{
"epoch": 0.07755695588948133,
"grad_norm": 1.4470667839050293,
"learning_rate": 3.864734299516908e-05,
"loss": 1.4752,
"step": 160
},
{
"epoch": 0.0780416868637906,
"grad_norm": 1.4912703037261963,
"learning_rate": 3.888888888888889e-05,
"loss": 1.5621,
"step": 161
},
{
"epoch": 0.07852641783809985,
"grad_norm": 1.6416089534759521,
"learning_rate": 3.91304347826087e-05,
"loss": 1.7115,
"step": 162
},
{
"epoch": 0.07901114881240912,
"grad_norm": 1.738970160484314,
"learning_rate": 3.937198067632851e-05,
"loss": 1.5945,
"step": 163
},
{
"epoch": 0.07949587978671838,
"grad_norm": 1.50935959815979,
"learning_rate": 3.961352657004831e-05,
"loss": 1.4379,
"step": 164
},
{
"epoch": 0.07998061076102762,
"grad_norm": 1.8530389070510864,
"learning_rate": 3.985507246376812e-05,
"loss": 1.6197,
"step": 165
},
{
"epoch": 0.08046534173533688,
"grad_norm": 1.5483351945877075,
"learning_rate": 4.0096618357487925e-05,
"loss": 1.4075,
"step": 166
},
{
"epoch": 0.08095007270964615,
"grad_norm": 1.7282538414001465,
"learning_rate": 4.0338164251207733e-05,
"loss": 1.7884,
"step": 167
},
{
"epoch": 0.0814348036839554,
"grad_norm": 1.605089545249939,
"learning_rate": 4.057971014492754e-05,
"loss": 1.4268,
"step": 168
},
{
"epoch": 0.08191953465826467,
"grad_norm": 1.6566203832626343,
"learning_rate": 4.082125603864734e-05,
"loss": 1.5855,
"step": 169
},
{
"epoch": 0.08240426563257393,
"grad_norm": 1.7147961854934692,
"learning_rate": 4.106280193236715e-05,
"loss": 1.9024,
"step": 170
},
{
"epoch": 0.08288899660688318,
"grad_norm": 1.638307809829712,
"learning_rate": 4.130434782608696e-05,
"loss": 1.5341,
"step": 171
},
{
"epoch": 0.08337372758119244,
"grad_norm": 1.503071665763855,
"learning_rate": 4.154589371980677e-05,
"loss": 1.3162,
"step": 172
},
{
"epoch": 0.0838584585555017,
"grad_norm": 1.6790330410003662,
"learning_rate": 4.1787439613526576e-05,
"loss": 1.5488,
"step": 173
},
{
"epoch": 0.08434318952981096,
"grad_norm": 1.6894205808639526,
"learning_rate": 4.202898550724638e-05,
"loss": 1.4366,
"step": 174
},
{
"epoch": 0.08482792050412022,
"grad_norm": 1.550458550453186,
"learning_rate": 4.2270531400966186e-05,
"loss": 1.6258,
"step": 175
},
{
"epoch": 0.08531265147842947,
"grad_norm": 1.4660718441009521,
"learning_rate": 4.2512077294685994e-05,
"loss": 1.3366,
"step": 176
},
{
"epoch": 0.08579738245273873,
"grad_norm": 1.527509331703186,
"learning_rate": 4.27536231884058e-05,
"loss": 1.5319,
"step": 177
},
{
"epoch": 0.08628211342704799,
"grad_norm": 5.795914173126221,
"learning_rate": 4.299516908212561e-05,
"loss": 1.3028,
"step": 178
},
{
"epoch": 0.08676684440135725,
"grad_norm": 1.6990413665771484,
"learning_rate": 4.323671497584541e-05,
"loss": 1.8149,
"step": 179
},
{
"epoch": 0.08725157537566651,
"grad_norm": 1.704795479774475,
"learning_rate": 4.347826086956522e-05,
"loss": 1.671,
"step": 180
},
{
"epoch": 0.08773630634997576,
"grad_norm": 1.5868449211120605,
"learning_rate": 4.371980676328503e-05,
"loss": 1.5234,
"step": 181
},
{
"epoch": 0.08822103732428502,
"grad_norm": 1.7695292234420776,
"learning_rate": 4.396135265700484e-05,
"loss": 1.6046,
"step": 182
},
{
"epoch": 0.08870576829859428,
"grad_norm": 1.6047744750976562,
"learning_rate": 4.4202898550724645e-05,
"loss": 1.7261,
"step": 183
},
{
"epoch": 0.08919049927290354,
"grad_norm": 1.7648770809173584,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.6856,
"step": 184
},
{
"epoch": 0.0896752302472128,
"grad_norm": 1.8613518476486206,
"learning_rate": 4.4685990338164255e-05,
"loss": 1.8129,
"step": 185
},
{
"epoch": 0.09015996122152206,
"grad_norm": 1.7032883167266846,
"learning_rate": 4.492753623188406e-05,
"loss": 1.6468,
"step": 186
},
{
"epoch": 0.09064469219583131,
"grad_norm": 1.4716066122055054,
"learning_rate": 4.5169082125603865e-05,
"loss": 1.2393,
"step": 187
},
{
"epoch": 0.09112942317014057,
"grad_norm": 1.582453966140747,
"learning_rate": 4.541062801932367e-05,
"loss": 1.7276,
"step": 188
},
{
"epoch": 0.09161415414444983,
"grad_norm": 1.5959687232971191,
"learning_rate": 4.565217391304348e-05,
"loss": 1.4626,
"step": 189
},
{
"epoch": 0.09209888511875909,
"grad_norm": 2.014113426208496,
"learning_rate": 4.589371980676328e-05,
"loss": 2.0453,
"step": 190
},
{
"epoch": 0.09258361609306835,
"grad_norm": 1.6704699993133545,
"learning_rate": 4.613526570048309e-05,
"loss": 1.7625,
"step": 191
},
{
"epoch": 0.0930683470673776,
"grad_norm": 1.8854517936706543,
"learning_rate": 4.63768115942029e-05,
"loss": 1.7175,
"step": 192
},
{
"epoch": 0.09355307804168686,
"grad_norm": 1.588793158531189,
"learning_rate": 4.661835748792271e-05,
"loss": 1.5578,
"step": 193
},
{
"epoch": 0.09403780901599612,
"grad_norm": 1.5601580142974854,
"learning_rate": 4.6859903381642516e-05,
"loss": 1.6403,
"step": 194
},
{
"epoch": 0.09452253999030538,
"grad_norm": 1.6801401376724243,
"learning_rate": 4.710144927536232e-05,
"loss": 1.4691,
"step": 195
},
{
"epoch": 0.09500727096461464,
"grad_norm": 2.123878002166748,
"learning_rate": 4.7342995169082125e-05,
"loss": 1.5602,
"step": 196
},
{
"epoch": 0.0954920019389239,
"grad_norm": 1.666060447692871,
"learning_rate": 4.7584541062801933e-05,
"loss": 1.4702,
"step": 197
},
{
"epoch": 0.09597673291323315,
"grad_norm": 1.7319121360778809,
"learning_rate": 4.782608695652174e-05,
"loss": 1.8495,
"step": 198
},
{
"epoch": 0.09646146388754241,
"grad_norm": 1.5935922861099243,
"learning_rate": 4.806763285024155e-05,
"loss": 1.5142,
"step": 199
},
{
"epoch": 0.09694619486185167,
"grad_norm": 1.8752068281173706,
"learning_rate": 4.830917874396135e-05,
"loss": 1.9948,
"step": 200
},
{
"epoch": 0.09743092583616093,
"grad_norm": 1.8214166164398193,
"learning_rate": 4.855072463768116e-05,
"loss": 1.7985,
"step": 201
},
{
"epoch": 0.0979156568104702,
"grad_norm": 1.6320829391479492,
"learning_rate": 4.879227053140097e-05,
"loss": 1.6227,
"step": 202
},
{
"epoch": 0.09840038778477944,
"grad_norm": 1.5568000078201294,
"learning_rate": 4.9033816425120776e-05,
"loss": 1.587,
"step": 203
},
{
"epoch": 0.0988851187590887,
"grad_norm": 1.5281633138656616,
"learning_rate": 4.9275362318840584e-05,
"loss": 1.4167,
"step": 204
},
{
"epoch": 0.09936984973339796,
"grad_norm": 1.5692189931869507,
"learning_rate": 4.9516908212560386e-05,
"loss": 1.5648,
"step": 205
},
{
"epoch": 0.09985458070770722,
"grad_norm": 1.644121766090393,
"learning_rate": 4.9758454106280194e-05,
"loss": 1.8842,
"step": 206
},
{
"epoch": 0.10033931168201649,
"grad_norm": 1.654074788093567,
"learning_rate": 5e-05,
"loss": 1.3634,
"step": 207
},
{
"epoch": 0.10082404265632573,
"grad_norm": 1.6578999757766724,
"learning_rate": 4.9999964185927293e-05,
"loss": 1.4366,
"step": 208
},
{
"epoch": 0.101308773630635,
"grad_norm": 1.6873010396957397,
"learning_rate": 4.999985674381179e-05,
"loss": 1.5582,
"step": 209
},
{
"epoch": 0.10179350460494425,
"grad_norm": 1.5198674201965332,
"learning_rate": 4.999967767396132e-05,
"loss": 1.3704,
"step": 210
},
{
"epoch": 0.10227823557925352,
"grad_norm": 1.6594791412353516,
"learning_rate": 4.999942697688894e-05,
"loss": 1.7239,
"step": 211
},
{
"epoch": 0.10276296655356278,
"grad_norm": 1.6230357885360718,
"learning_rate": 4.9999104653312926e-05,
"loss": 1.5243,
"step": 212
},
{
"epoch": 0.10324769752787204,
"grad_norm": 1.62498140335083,
"learning_rate": 4.9998710704156785e-05,
"loss": 1.4242,
"step": 213
},
{
"epoch": 0.10373242850218128,
"grad_norm": 1.611889362335205,
"learning_rate": 4.9998245130549226e-05,
"loss": 1.7185,
"step": 214
},
{
"epoch": 0.10421715947649055,
"grad_norm": 1.7176823616027832,
"learning_rate": 4.999770793382418e-05,
"loss": 1.7587,
"step": 215
},
{
"epoch": 0.1047018904507998,
"grad_norm": 1.8749507665634155,
"learning_rate": 4.999709911552077e-05,
"loss": 1.5999,
"step": 216
},
{
"epoch": 0.10518662142510907,
"grad_norm": 2.380711793899536,
"learning_rate": 4.999641867738336e-05,
"loss": 1.5824,
"step": 217
},
{
"epoch": 0.10567135239941833,
"grad_norm": 1.681323528289795,
"learning_rate": 4.999566662136147e-05,
"loss": 1.6289,
"step": 218
},
{
"epoch": 0.10615608337372757,
"grad_norm": 1.7086970806121826,
"learning_rate": 4.999484294960984e-05,
"loss": 1.5793,
"step": 219
},
{
"epoch": 0.10664081434803684,
"grad_norm": 1.6258258819580078,
"learning_rate": 4.999394766448841e-05,
"loss": 1.7768,
"step": 220
},
{
"epoch": 0.1071255453223461,
"grad_norm": 1.609623670578003,
"learning_rate": 4.9992980768562256e-05,
"loss": 1.5748,
"step": 221
},
{
"epoch": 0.10761027629665536,
"grad_norm": 1.6131285429000854,
"learning_rate": 4.999194226460167e-05,
"loss": 1.7234,
"step": 222
},
{
"epoch": 0.10809500727096462,
"grad_norm": 1.7092560529708862,
"learning_rate": 4.99908321555821e-05,
"loss": 1.8781,
"step": 223
},
{
"epoch": 0.10857973824527388,
"grad_norm": 1.755306601524353,
"learning_rate": 4.998965044468414e-05,
"loss": 1.647,
"step": 224
},
{
"epoch": 0.10906446921958313,
"grad_norm": 1.637286901473999,
"learning_rate": 4.9988397135293567e-05,
"loss": 1.401,
"step": 225
},
{
"epoch": 0.10954920019389239,
"grad_norm": 1.505699634552002,
"learning_rate": 4.998707223100123e-05,
"loss": 1.3702,
"step": 226
},
{
"epoch": 0.11003393116820165,
"grad_norm": 1.5001720190048218,
"learning_rate": 4.9985675735603164e-05,
"loss": 1.6756,
"step": 227
},
{
"epoch": 0.11051866214251091,
"grad_norm": 1.6514838933944702,
"learning_rate": 4.998420765310051e-05,
"loss": 1.3885,
"step": 228
},
{
"epoch": 0.11100339311682017,
"grad_norm": 1.6999015808105469,
"learning_rate": 4.998266798769951e-05,
"loss": 1.2946,
"step": 229
},
{
"epoch": 0.11148812409112942,
"grad_norm": 1.8051220178604126,
"learning_rate": 4.998105674381148e-05,
"loss": 1.6211,
"step": 230
},
{
"epoch": 0.11197285506543868,
"grad_norm": 1.7223929166793823,
"learning_rate": 4.9979373926052865e-05,
"loss": 1.6805,
"step": 231
},
{
"epoch": 0.11245758603974794,
"grad_norm": 1.5139886140823364,
"learning_rate": 4.997761953924512e-05,
"loss": 1.4749,
"step": 232
},
{
"epoch": 0.1129423170140572,
"grad_norm": 1.6115888357162476,
"learning_rate": 4.997579358841481e-05,
"loss": 1.6298,
"step": 233
},
{
"epoch": 0.11342704798836646,
"grad_norm": 1.5873501300811768,
"learning_rate": 4.99738960787935e-05,
"loss": 1.5954,
"step": 234
},
{
"epoch": 0.11391177896267571,
"grad_norm": 1.5256075859069824,
"learning_rate": 4.99719270158178e-05,
"loss": 1.155,
"step": 235
},
{
"epoch": 0.11439650993698497,
"grad_norm": 1.633514165878296,
"learning_rate": 4.996988640512931e-05,
"loss": 1.8803,
"step": 236
},
{
"epoch": 0.11488124091129423,
"grad_norm": 1.6052523851394653,
"learning_rate": 4.996777425257465e-05,
"loss": 1.6264,
"step": 237
},
{
"epoch": 0.11536597188560349,
"grad_norm": 1.5508285760879517,
"learning_rate": 4.99655905642054e-05,
"loss": 1.6035,
"step": 238
},
{
"epoch": 0.11585070285991275,
"grad_norm": 1.9220739603042603,
"learning_rate": 4.996333534627809e-05,
"loss": 1.7689,
"step": 239
},
{
"epoch": 0.11633543383422201,
"grad_norm": 1.5728856325149536,
"learning_rate": 4.9961008605254237e-05,
"loss": 1.4674,
"step": 240
},
{
"epoch": 0.11682016480853126,
"grad_norm": 1.6598260402679443,
"learning_rate": 4.9958610347800206e-05,
"loss": 1.4091,
"step": 241
},
{
"epoch": 0.11730489578284052,
"grad_norm": 1.6171025037765503,
"learning_rate": 4.995614058078733e-05,
"loss": 1.5094,
"step": 242
},
{
"epoch": 0.11778962675714978,
"grad_norm": 1.5818380117416382,
"learning_rate": 4.995359931129179e-05,
"loss": 1.464,
"step": 243
},
{
"epoch": 0.11827435773145904,
"grad_norm": 1.5343612432479858,
"learning_rate": 4.995098654659465e-05,
"loss": 1.4005,
"step": 244
},
{
"epoch": 0.1187590887057683,
"grad_norm": 1.4373968839645386,
"learning_rate": 4.99483022941818e-05,
"loss": 1.156,
"step": 245
},
{
"epoch": 0.11924381968007755,
"grad_norm": 1.7314152717590332,
"learning_rate": 4.994554656174398e-05,
"loss": 1.7723,
"step": 246
},
{
"epoch": 0.11972855065438681,
"grad_norm": 1.6805850267410278,
"learning_rate": 4.99427193571767e-05,
"loss": 1.6244,
"step": 247
},
{
"epoch": 0.12021328162869607,
"grad_norm": 1.5888878107070923,
"learning_rate": 4.993982068858025e-05,
"loss": 1.6246,
"step": 248
},
{
"epoch": 0.12069801260300533,
"grad_norm": 1.8114043474197388,
"learning_rate": 4.9936850564259695e-05,
"loss": 1.4527,
"step": 249
},
{
"epoch": 0.1211827435773146,
"grad_norm": 1.7056317329406738,
"learning_rate": 4.99338089927248e-05,
"loss": 1.5897,
"step": 250
},
{
"epoch": 0.12166747455162386,
"grad_norm": 1.6106626987457275,
"learning_rate": 4.993069598269006e-05,
"loss": 1.5619,
"step": 251
},
{
"epoch": 0.1221522055259331,
"grad_norm": 1.6103230714797974,
"learning_rate": 4.9927511543074635e-05,
"loss": 1.3745,
"step": 252
},
{
"epoch": 0.12263693650024236,
"grad_norm": 1.699284315109253,
"learning_rate": 4.992425568300234e-05,
"loss": 1.5918,
"step": 253
},
{
"epoch": 0.12312166747455162,
"grad_norm": 1.6529169082641602,
"learning_rate": 4.992092841180164e-05,
"loss": 1.7702,
"step": 254
},
{
"epoch": 0.12360639844886089,
"grad_norm": 1.7270163297653198,
"learning_rate": 4.9917529739005574e-05,
"loss": 1.5362,
"step": 255
},
{
"epoch": 0.12409112942317015,
"grad_norm": 2.138240098953247,
"learning_rate": 4.991405967435177e-05,
"loss": 1.7812,
"step": 256
},
{
"epoch": 0.1245758603974794,
"grad_norm": 1.7157567739486694,
"learning_rate": 4.991051822778239e-05,
"loss": 1.8263,
"step": 257
},
{
"epoch": 0.12506059137178865,
"grad_norm": 1.5743305683135986,
"learning_rate": 4.990690540944414e-05,
"loss": 1.5631,
"step": 258
},
{
"epoch": 0.12554532234609792,
"grad_norm": 1.4902536869049072,
"learning_rate": 4.9903221229688194e-05,
"loss": 1.6046,
"step": 259
},
{
"epoch": 0.12603005332040718,
"grad_norm": 1.700891375541687,
"learning_rate": 4.989946569907019e-05,
"loss": 1.7547,
"step": 260
},
{
"epoch": 0.12651478429471644,
"grad_norm": 1.683803915977478,
"learning_rate": 4.989563882835019e-05,
"loss": 1.3777,
"step": 261
},
{
"epoch": 0.1269995152690257,
"grad_norm": 1.548683762550354,
"learning_rate": 4.989174062849267e-05,
"loss": 1.2059,
"step": 262
},
{
"epoch": 0.12748424624333496,
"grad_norm": 1.5738409757614136,
"learning_rate": 4.988777111066646e-05,
"loss": 1.2089,
"step": 263
},
{
"epoch": 0.12796897721764422,
"grad_norm": 1.5687381029129028,
"learning_rate": 4.9883730286244715e-05,
"loss": 1.4081,
"step": 264
},
{
"epoch": 0.12845370819195345,
"grad_norm": 1.6630191802978516,
"learning_rate": 4.987961816680492e-05,
"loss": 1.472,
"step": 265
},
{
"epoch": 0.1289384391662627,
"grad_norm": 1.692008376121521,
"learning_rate": 4.987543476412881e-05,
"loss": 1.8636,
"step": 266
},
{
"epoch": 0.12942317014057197,
"grad_norm": 1.612111210823059,
"learning_rate": 4.987118009020237e-05,
"loss": 1.5438,
"step": 267
},
{
"epoch": 0.12990790111488124,
"grad_norm": 2.01253342628479,
"learning_rate": 4.986685415721576e-05,
"loss": 1.9671,
"step": 268
},
{
"epoch": 0.1303926320891905,
"grad_norm": 1.5312128067016602,
"learning_rate": 4.986245697756333e-05,
"loss": 1.3681,
"step": 269
},
{
"epoch": 0.13087736306349976,
"grad_norm": 1.6384061574935913,
"learning_rate": 4.985798856384355e-05,
"loss": 1.6502,
"step": 270
},
{
"epoch": 0.13136209403780902,
"grad_norm": 2.0369157791137695,
"learning_rate": 4.985344892885899e-05,
"loss": 1.5976,
"step": 271
},
{
"epoch": 0.13184682501211828,
"grad_norm": 1.5668435096740723,
"learning_rate": 4.984883808561628e-05,
"loss": 1.456,
"step": 272
},
{
"epoch": 0.13233155598642754,
"grad_norm": 3.0037758350372314,
"learning_rate": 4.9844156047326054e-05,
"loss": 1.4169,
"step": 273
},
{
"epoch": 0.1328162869607368,
"grad_norm": 1.5953983068466187,
"learning_rate": 4.9839402827402947e-05,
"loss": 1.5507,
"step": 274
},
{
"epoch": 0.13330101793504606,
"grad_norm": 1.6405383348464966,
"learning_rate": 4.983457843946554e-05,
"loss": 1.5394,
"step": 275
},
{
"epoch": 0.1337857489093553,
"grad_norm": 1.5499027967453003,
"learning_rate": 4.98296828973363e-05,
"loss": 1.5463,
"step": 276
},
{
"epoch": 0.13427047988366456,
"grad_norm": 1.5397121906280518,
"learning_rate": 4.9824716215041575e-05,
"loss": 1.5493,
"step": 277
},
{
"epoch": 0.13475521085797382,
"grad_norm": 1.5781245231628418,
"learning_rate": 4.981967840681154e-05,
"loss": 1.6137,
"step": 278
},
{
"epoch": 0.13523994183228308,
"grad_norm": 1.79483962059021,
"learning_rate": 4.981456948708014e-05,
"loss": 1.6445,
"step": 279
},
{
"epoch": 0.13572467280659234,
"grad_norm": 1.5061801671981812,
"learning_rate": 4.980938947048508e-05,
"loss": 1.4163,
"step": 280
},
{
"epoch": 0.1362094037809016,
"grad_norm": 1.5987025499343872,
"learning_rate": 4.980413837186775e-05,
"loss": 1.531,
"step": 281
},
{
"epoch": 0.13669413475521086,
"grad_norm": 1.7168915271759033,
"learning_rate": 4.979881620627322e-05,
"loss": 1.4682,
"step": 282
},
{
"epoch": 0.13717886572952012,
"grad_norm": 1.4458422660827637,
"learning_rate": 4.979342298895016e-05,
"loss": 1.2247,
"step": 283
},
{
"epoch": 0.13766359670382938,
"grad_norm": 1.6409187316894531,
"learning_rate": 4.9787958735350816e-05,
"loss": 1.7199,
"step": 284
},
{
"epoch": 0.13814832767813864,
"grad_norm": 1.5449514389038086,
"learning_rate": 4.978242346113095e-05,
"loss": 1.4311,
"step": 285
},
{
"epoch": 0.1386330586524479,
"grad_norm": 1.5891187191009521,
"learning_rate": 4.977681718214984e-05,
"loss": 1.8869,
"step": 286
},
{
"epoch": 0.13911778962675714,
"grad_norm": 3.7609283924102783,
"learning_rate": 4.977113991447017e-05,
"loss": 1.7382,
"step": 287
},
{
"epoch": 0.1396025206010664,
"grad_norm": 1.7186360359191895,
"learning_rate": 4.976539167435803e-05,
"loss": 1.699,
"step": 288
},
{
"epoch": 0.14008725157537566,
"grad_norm": 1.4953521490097046,
"learning_rate": 4.9759572478282846e-05,
"loss": 1.4215,
"step": 289
},
{
"epoch": 0.14057198254968492,
"grad_norm": 1.5115928649902344,
"learning_rate": 4.975368234291734e-05,
"loss": 1.2988,
"step": 290
},
{
"epoch": 0.14105671352399418,
"grad_norm": 2.540882110595703,
"learning_rate": 4.974772128513751e-05,
"loss": 1.651,
"step": 291
},
{
"epoch": 0.14154144449830344,
"grad_norm": 1.6582648754119873,
"learning_rate": 4.974168932202252e-05,
"loss": 1.7001,
"step": 292
},
{
"epoch": 0.1420261754726127,
"grad_norm": 1.5049525499343872,
"learning_rate": 4.973558647085472e-05,
"loss": 1.7229,
"step": 293
},
{
"epoch": 0.14251090644692196,
"grad_norm": 1.6676872968673706,
"learning_rate": 4.972941274911953e-05,
"loss": 1.5541,
"step": 294
},
{
"epoch": 0.14299563742123123,
"grad_norm": 1.5429099798202515,
"learning_rate": 4.972316817450544e-05,
"loss": 1.813,
"step": 295
},
{
"epoch": 0.1434803683955405,
"grad_norm": 1.6391764879226685,
"learning_rate": 4.9716852764903955e-05,
"loss": 1.9432,
"step": 296
},
{
"epoch": 0.14396509936984975,
"grad_norm": 1.6778995990753174,
"learning_rate": 4.9710466538409505e-05,
"loss": 1.5883,
"step": 297
},
{
"epoch": 0.14444983034415898,
"grad_norm": 1.508912205696106,
"learning_rate": 4.9704009513319444e-05,
"loss": 1.6739,
"step": 298
},
{
"epoch": 0.14493456131846824,
"grad_norm": 1.5249568223953247,
"learning_rate": 4.9697481708133955e-05,
"loss": 1.3977,
"step": 299
},
{
"epoch": 0.1454192922927775,
"grad_norm": 1.429032802581787,
"learning_rate": 4.969088314155602e-05,
"loss": 1.3599,
"step": 300
},
{
"epoch": 0.14590402326708676,
"grad_norm": 1.425854206085205,
"learning_rate": 4.968421383249137e-05,
"loss": 1.4904,
"step": 301
},
{
"epoch": 0.14638875424139602,
"grad_norm": 1.5954697132110596,
"learning_rate": 4.967747380004839e-05,
"loss": 1.6036,
"step": 302
},
{
"epoch": 0.14687348521570529,
"grad_norm": 1.6121957302093506,
"learning_rate": 4.967066306353816e-05,
"loss": 1.5024,
"step": 303
},
{
"epoch": 0.14735821619001455,
"grad_norm": 1.502057433128357,
"learning_rate": 4.966378164247426e-05,
"loss": 1.4643,
"step": 304
},
{
"epoch": 0.1478429471643238,
"grad_norm": 1.4162192344665527,
"learning_rate": 4.965682955657286e-05,
"loss": 1.279,
"step": 305
},
{
"epoch": 0.14832767813863307,
"grad_norm": 1.5923371315002441,
"learning_rate": 4.964980682575253e-05,
"loss": 1.4768,
"step": 306
},
{
"epoch": 0.14881240911294233,
"grad_norm": 1.5782544612884521,
"learning_rate": 4.964271347013431e-05,
"loss": 1.3391,
"step": 307
},
{
"epoch": 0.14929714008725156,
"grad_norm": 2.188934326171875,
"learning_rate": 4.9635549510041516e-05,
"loss": 1.6648,
"step": 308
},
{
"epoch": 0.14978187106156082,
"grad_norm": 1.6566468477249146,
"learning_rate": 4.9628314965999835e-05,
"loss": 1.6523,
"step": 309
},
{
"epoch": 0.15026660203587008,
"grad_norm": 1.46741783618927,
"learning_rate": 4.9621009858737116e-05,
"loss": 1.5349,
"step": 310
},
{
"epoch": 0.15075133301017934,
"grad_norm": 1.486839771270752,
"learning_rate": 4.961363420918342e-05,
"loss": 1.4895,
"step": 311
},
{
"epoch": 0.1512360639844886,
"grad_norm": 1.4840891361236572,
"learning_rate": 4.960618803847092e-05,
"loss": 1.5155,
"step": 312
},
{
"epoch": 0.15172079495879787,
"grad_norm": 1.578395962715149,
"learning_rate": 4.959867136793384e-05,
"loss": 1.5057,
"step": 313
},
{
"epoch": 0.15220552593310713,
"grad_norm": 1.5518931150436401,
"learning_rate": 4.959108421910835e-05,
"loss": 1.8778,
"step": 314
},
{
"epoch": 0.1526902569074164,
"grad_norm": 1.491754412651062,
"learning_rate": 4.958342661373262e-05,
"loss": 1.5156,
"step": 315
},
{
"epoch": 0.15317498788172565,
"grad_norm": 1.492876648902893,
"learning_rate": 4.957569857374664e-05,
"loss": 1.5804,
"step": 316
},
{
"epoch": 0.1536597188560349,
"grad_norm": 1.7110258340835571,
"learning_rate": 4.956790012129221e-05,
"loss": 1.8366,
"step": 317
},
{
"epoch": 0.15414444983034417,
"grad_norm": 1.5975233316421509,
"learning_rate": 4.9560031278712896e-05,
"loss": 1.5372,
"step": 318
},
{
"epoch": 0.1546291808046534,
"grad_norm": 1.5784556865692139,
"learning_rate": 4.95520920685539e-05,
"loss": 1.7814,
"step": 319
},
{
"epoch": 0.15511391177896267,
"grad_norm": 1.4783300161361694,
"learning_rate": 4.9544082513562076e-05,
"loss": 1.495,
"step": 320
},
{
"epoch": 0.15559864275327193,
"grad_norm": 1.75221586227417,
"learning_rate": 4.95360026366858e-05,
"loss": 1.7768,
"step": 321
},
{
"epoch": 0.1560833737275812,
"grad_norm": 2.064201831817627,
"learning_rate": 4.952785246107494e-05,
"loss": 1.983,
"step": 322
},
{
"epoch": 0.15656810470189045,
"grad_norm": 1.7069522142410278,
"learning_rate": 4.951963201008076e-05,
"loss": 1.6382,
"step": 323
},
{
"epoch": 0.1570528356761997,
"grad_norm": 1.5614511966705322,
"learning_rate": 4.951134130725591e-05,
"loss": 1.6066,
"step": 324
},
{
"epoch": 0.15753756665050897,
"grad_norm": 1.510345697402954,
"learning_rate": 4.950298037635428e-05,
"loss": 1.6226,
"step": 325
},
{
"epoch": 0.15802229762481823,
"grad_norm": 1.7443574666976929,
"learning_rate": 4.949454924133098e-05,
"loss": 1.8012,
"step": 326
},
{
"epoch": 0.1585070285991275,
"grad_norm": 1.6052006483078003,
"learning_rate": 4.948604792634229e-05,
"loss": 1.7394,
"step": 327
},
{
"epoch": 0.15899175957343675,
"grad_norm": 1.6979362964630127,
"learning_rate": 4.947747645574555e-05,
"loss": 1.4477,
"step": 328
},
{
"epoch": 0.159476490547746,
"grad_norm": 2.022580146789551,
"learning_rate": 4.9468834854099095e-05,
"loss": 1.5245,
"step": 329
},
{
"epoch": 0.15996122152205525,
"grad_norm": 1.485262155532837,
"learning_rate": 4.94601231461622e-05,
"loss": 1.4005,
"step": 330
},
{
"epoch": 0.1604459524963645,
"grad_norm": 1.5557737350463867,
"learning_rate": 4.9451341356895e-05,
"loss": 1.5077,
"step": 331
},
{
"epoch": 0.16093068347067377,
"grad_norm": 1.7877156734466553,
"learning_rate": 4.9442489511458426e-05,
"loss": 1.62,
"step": 332
},
{
"epoch": 0.16141541444498303,
"grad_norm": 1.6162307262420654,
"learning_rate": 4.943356763521414e-05,
"loss": 1.7224,
"step": 333
},
{
"epoch": 0.1619001454192923,
"grad_norm": 1.500104308128357,
"learning_rate": 4.942457575372443e-05,
"loss": 1.4062,
"step": 334
},
{
"epoch": 0.16238487639360155,
"grad_norm": 1.463844656944275,
"learning_rate": 4.941551389275217e-05,
"loss": 1.6271,
"step": 335
},
{
"epoch": 0.1628696073679108,
"grad_norm": 1.591209888458252,
"learning_rate": 4.940638207826074e-05,
"loss": 1.5389,
"step": 336
},
{
"epoch": 0.16335433834222007,
"grad_norm": 1.4040523767471313,
"learning_rate": 4.9397180336413915e-05,
"loss": 1.355,
"step": 337
},
{
"epoch": 0.16383906931652933,
"grad_norm": 1.4741119146347046,
"learning_rate": 4.938790869357587e-05,
"loss": 1.5721,
"step": 338
},
{
"epoch": 0.1643238002908386,
"grad_norm": 1.6420493125915527,
"learning_rate": 4.937856717631102e-05,
"loss": 1.7241,
"step": 339
},
{
"epoch": 0.16480853126514786,
"grad_norm": 1.6836682558059692,
"learning_rate": 4.936915581138398e-05,
"loss": 1.5049,
"step": 340
},
{
"epoch": 0.1652932622394571,
"grad_norm": 1.529339075088501,
"learning_rate": 4.935967462575949e-05,
"loss": 1.4003,
"step": 341
},
{
"epoch": 0.16577799321376635,
"grad_norm": 1.4345910549163818,
"learning_rate": 4.9350123646602356e-05,
"loss": 1.5476,
"step": 342
},
{
"epoch": 0.1662627241880756,
"grad_norm": 1.5849047899246216,
"learning_rate": 4.934050290127733e-05,
"loss": 1.6484,
"step": 343
},
{
"epoch": 0.16674745516238487,
"grad_norm": 1.5803264379501343,
"learning_rate": 4.933081241734905e-05,
"loss": 1.5366,
"step": 344
},
{
"epoch": 0.16723218613669413,
"grad_norm": 1.4626617431640625,
"learning_rate": 4.9321052222581976e-05,
"loss": 1.5589,
"step": 345
},
{
"epoch": 0.1677169171110034,
"grad_norm": 1.4863779544830322,
"learning_rate": 4.93112223449403e-05,
"loss": 1.3602,
"step": 346
},
{
"epoch": 0.16820164808531265,
"grad_norm": 3.0919888019561768,
"learning_rate": 4.930132281258785e-05,
"loss": 1.602,
"step": 347
},
{
"epoch": 0.16868637905962192,
"grad_norm": 1.61482834815979,
"learning_rate": 4.929135365388804e-05,
"loss": 1.594,
"step": 348
},
{
"epoch": 0.16917111003393118,
"grad_norm": 1.4651432037353516,
"learning_rate": 4.928131489740375e-05,
"loss": 1.353,
"step": 349
},
{
"epoch": 0.16965584100824044,
"grad_norm": 1.6565533876419067,
"learning_rate": 4.9271206571897286e-05,
"loss": 1.7263,
"step": 350
},
{
"epoch": 0.1701405719825497,
"grad_norm": 1.62041175365448,
"learning_rate": 4.926102870633029e-05,
"loss": 1.7091,
"step": 351
},
{
"epoch": 0.17062530295685893,
"grad_norm": 1.470937728881836,
"learning_rate": 4.9250781329863606e-05,
"loss": 1.4829,
"step": 352
},
{
"epoch": 0.1711100339311682,
"grad_norm": 1.6846436262130737,
"learning_rate": 4.924046447185726e-05,
"loss": 1.6661,
"step": 353
},
{
"epoch": 0.17159476490547745,
"grad_norm": 1.5719223022460938,
"learning_rate": 4.923007816187035e-05,
"loss": 1.3444,
"step": 354
},
{
"epoch": 0.17207949587978671,
"grad_norm": 1.6110163927078247,
"learning_rate": 4.921962242966097e-05,
"loss": 1.789,
"step": 355
},
{
"epoch": 0.17256422685409598,
"grad_norm": 1.6574710607528687,
"learning_rate": 4.9209097305186094e-05,
"loss": 1.4593,
"step": 356
},
{
"epoch": 0.17304895782840524,
"grad_norm": 1.5048030614852905,
"learning_rate": 4.9198502818601547e-05,
"loss": 1.3008,
"step": 357
},
{
"epoch": 0.1735336888027145,
"grad_norm": 1.6137561798095703,
"learning_rate": 4.918783900026184e-05,
"loss": 1.4641,
"step": 358
},
{
"epoch": 0.17401841977702376,
"grad_norm": 1.6105793714523315,
"learning_rate": 4.9177105880720173e-05,
"loss": 1.6433,
"step": 359
},
{
"epoch": 0.17450315075133302,
"grad_norm": 1.8704934120178223,
"learning_rate": 4.916630349072828e-05,
"loss": 1.9321,
"step": 360
},
{
"epoch": 0.17498788172564228,
"grad_norm": 1.8688757419586182,
"learning_rate": 4.915543186123636e-05,
"loss": 1.6639,
"step": 361
},
{
"epoch": 0.1754726126999515,
"grad_norm": 1.411590576171875,
"learning_rate": 4.9144491023393016e-05,
"loss": 1.4361,
"step": 362
},
{
"epoch": 0.17595734367426077,
"grad_norm": 1.503797173500061,
"learning_rate": 4.913348100854511e-05,
"loss": 1.4521,
"step": 363
},
{
"epoch": 0.17644207464857004,
"grad_norm": 2.0284149646759033,
"learning_rate": 4.912240184823772e-05,
"loss": 1.6762,
"step": 364
},
{
"epoch": 0.1769268056228793,
"grad_norm": 1.6562719345092773,
"learning_rate": 4.911125357421405e-05,
"loss": 1.5597,
"step": 365
},
{
"epoch": 0.17741153659718856,
"grad_norm": 1.6344953775405884,
"learning_rate": 4.9100036218415285e-05,
"loss": 1.6212,
"step": 366
},
{
"epoch": 0.17789626757149782,
"grad_norm": 1.5039825439453125,
"learning_rate": 4.908874981298057e-05,
"loss": 1.4205,
"step": 367
},
{
"epoch": 0.17838099854580708,
"grad_norm": 1.5786499977111816,
"learning_rate": 4.907739439024689e-05,
"loss": 1.6106,
"step": 368
},
{
"epoch": 0.17886572952011634,
"grad_norm": 1.7766309976577759,
"learning_rate": 4.9065969982748946e-05,
"loss": 1.5542,
"step": 369
},
{
"epoch": 0.1793504604944256,
"grad_norm": 1.6327425241470337,
"learning_rate": 4.9054476623219104e-05,
"loss": 1.5334,
"step": 370
},
{
"epoch": 0.17983519146873486,
"grad_norm": 1.6132947206497192,
"learning_rate": 4.904291434458729e-05,
"loss": 1.5541,
"step": 371
},
{
"epoch": 0.18031992244304412,
"grad_norm": 1.458174467086792,
"learning_rate": 4.9031283179980874e-05,
"loss": 1.3287,
"step": 372
},
{
"epoch": 0.18080465341735336,
"grad_norm": 1.5462307929992676,
"learning_rate": 4.901958316272462e-05,
"loss": 1.6874,
"step": 373
},
{
"epoch": 0.18128938439166262,
"grad_norm": 1.3828489780426025,
"learning_rate": 4.9007814326340544e-05,
"loss": 1.4119,
"step": 374
},
{
"epoch": 0.18177411536597188,
"grad_norm": 1.6115537881851196,
"learning_rate": 4.899597670454785e-05,
"loss": 1.6936,
"step": 375
},
{
"epoch": 0.18225884634028114,
"grad_norm": 1.5310516357421875,
"learning_rate": 4.89840703312628e-05,
"loss": 1.4026,
"step": 376
},
{
"epoch": 0.1827435773145904,
"grad_norm": 1.604029655456543,
"learning_rate": 4.897209524059866e-05,
"loss": 1.5988,
"step": 377
},
{
"epoch": 0.18322830828889966,
"grad_norm": 1.6310497522354126,
"learning_rate": 4.896005146686558e-05,
"loss": 1.5445,
"step": 378
},
{
"epoch": 0.18371303926320892,
"grad_norm": 1.655869483947754,
"learning_rate": 4.8947939044570467e-05,
"loss": 1.5755,
"step": 379
},
{
"epoch": 0.18419777023751818,
"grad_norm": 1.5540688037872314,
"learning_rate": 4.893575800841695e-05,
"loss": 1.5128,
"step": 380
},
{
"epoch": 0.18468250121182744,
"grad_norm": 1.5631048679351807,
"learning_rate": 4.892350839330522e-05,
"loss": 1.645,
"step": 381
},
{
"epoch": 0.1851672321861367,
"grad_norm": 1.522678017616272,
"learning_rate": 4.891119023433198e-05,
"loss": 1.5037,
"step": 382
},
{
"epoch": 0.18565196316044597,
"grad_norm": 1.7026262283325195,
"learning_rate": 4.8898803566790296e-05,
"loss": 1.6548,
"step": 383
},
{
"epoch": 0.1861366941347552,
"grad_norm": 2.272061824798584,
"learning_rate": 4.888634842616953e-05,
"loss": 1.6816,
"step": 384
},
{
"epoch": 0.18662142510906446,
"grad_norm": 1.7039604187011719,
"learning_rate": 4.887382484815522e-05,
"loss": 1.8058,
"step": 385
},
{
"epoch": 0.18710615608337372,
"grad_norm": 1.421434760093689,
"learning_rate": 4.8861232868628994e-05,
"loss": 1.2899,
"step": 386
},
{
"epoch": 0.18759088705768298,
"grad_norm": 1.4786161184310913,
"learning_rate": 4.884857252366847e-05,
"loss": 1.5044,
"step": 387
},
{
"epoch": 0.18807561803199224,
"grad_norm": 1.6115511655807495,
"learning_rate": 4.8835843849547126e-05,
"loss": 1.4588,
"step": 388
},
{
"epoch": 0.1885603490063015,
"grad_norm": 1.4734134674072266,
"learning_rate": 4.88230468827342e-05,
"loss": 1.5804,
"step": 389
},
{
"epoch": 0.18904507998061076,
"grad_norm": 1.677364468574524,
"learning_rate": 4.8810181659894635e-05,
"loss": 1.5134,
"step": 390
},
{
"epoch": 0.18952981095492002,
"grad_norm": 1.5314624309539795,
"learning_rate": 4.879724821788889e-05,
"loss": 1.7543,
"step": 391
},
{
"epoch": 0.19001454192922929,
"grad_norm": 1.8517158031463623,
"learning_rate": 4.878424659377292e-05,
"loss": 1.641,
"step": 392
},
{
"epoch": 0.19049927290353855,
"grad_norm": 1.5239332914352417,
"learning_rate": 4.8771176824798006e-05,
"loss": 1.2542,
"step": 393
},
{
"epoch": 0.1909840038778478,
"grad_norm": 1.6505666971206665,
"learning_rate": 4.875803894841069e-05,
"loss": 1.5467,
"step": 394
},
{
"epoch": 0.19146873485215704,
"grad_norm": 1.5365902185440063,
"learning_rate": 4.8744833002252625e-05,
"loss": 1.3391,
"step": 395
},
{
"epoch": 0.1919534658264663,
"grad_norm": 1.8492168188095093,
"learning_rate": 4.8731559024160524e-05,
"loss": 1.6368,
"step": 396
},
{
"epoch": 0.19243819680077556,
"grad_norm": 1.471468210220337,
"learning_rate": 4.8718217052165985e-05,
"loss": 1.2187,
"step": 397
},
{
"epoch": 0.19292292777508482,
"grad_norm": 1.6151539087295532,
"learning_rate": 4.870480712449546e-05,
"loss": 1.6523,
"step": 398
},
{
"epoch": 0.19340765874939408,
"grad_norm": 1.5233840942382812,
"learning_rate": 4.869132927957007e-05,
"loss": 1.6125,
"step": 399
},
{
"epoch": 0.19389238972370335,
"grad_norm": 1.4746246337890625,
"learning_rate": 4.8677783556005515e-05,
"loss": 1.5412,
"step": 400
},
{
"epoch": 0.1943771206980126,
"grad_norm": 1.7926899194717407,
"learning_rate": 4.8664169992612035e-05,
"loss": 1.7692,
"step": 401
},
{
"epoch": 0.19486185167232187,
"grad_norm": 1.5407027006149292,
"learning_rate": 4.865048862839417e-05,
"loss": 1.3119,
"step": 402
},
{
"epoch": 0.19534658264663113,
"grad_norm": 1.5474193096160889,
"learning_rate": 4.8636739502550775e-05,
"loss": 1.4238,
"step": 403
},
{
"epoch": 0.1958313136209404,
"grad_norm": 1.5824707746505737,
"learning_rate": 4.862292265447481e-05,
"loss": 1.5708,
"step": 404
},
{
"epoch": 0.19631604459524965,
"grad_norm": 1.5270860195159912,
"learning_rate": 4.860903812375329e-05,
"loss": 1.5826,
"step": 405
},
{
"epoch": 0.19680077556955888,
"grad_norm": 1.4273207187652588,
"learning_rate": 4.859508595016713e-05,
"loss": 1.3348,
"step": 406
},
{
"epoch": 0.19728550654386814,
"grad_norm": 1.4353327751159668,
"learning_rate": 4.8581066173691074e-05,
"loss": 1.2723,
"step": 407
},
{
"epoch": 0.1977702375181774,
"grad_norm": 1.475465178489685,
"learning_rate": 4.856697883449355e-05,
"loss": 1.5006,
"step": 408
},
{
"epoch": 0.19825496849248667,
"grad_norm": 1.4205036163330078,
"learning_rate": 4.8552823972936545e-05,
"loss": 1.2869,
"step": 409
},
{
"epoch": 0.19873969946679593,
"grad_norm": 1.5155439376831055,
"learning_rate": 4.853860162957552e-05,
"loss": 1.4603,
"step": 410
},
{
"epoch": 0.1992244304411052,
"grad_norm": 1.5754328966140747,
"learning_rate": 4.8524311845159286e-05,
"loss": 1.4744,
"step": 411
},
{
"epoch": 0.19970916141541445,
"grad_norm": 1.5085046291351318,
"learning_rate": 4.850995466062988e-05,
"loss": 1.4843,
"step": 412
},
{
"epoch": 0.2001938923897237,
"grad_norm": 1.3275054693222046,
"learning_rate": 4.849553011712241e-05,
"loss": 1.4272,
"step": 413
},
{
"epoch": 0.20067862336403297,
"grad_norm": 1.424777626991272,
"learning_rate": 4.848103825596504e-05,
"loss": 1.4182,
"step": 414
},
{
"epoch": 0.20116335433834223,
"grad_norm": 1.498131513595581,
"learning_rate": 4.8466479118678766e-05,
"loss": 1.4618,
"step": 415
},
{
"epoch": 0.20164808531265146,
"grad_norm": 1.4873474836349487,
"learning_rate": 4.845185274697734e-05,
"loss": 1.2547,
"step": 416
},
{
"epoch": 0.20213281628696073,
"grad_norm": 1.4218641519546509,
"learning_rate": 4.843715918276717e-05,
"loss": 1.2078,
"step": 417
},
{
"epoch": 0.20261754726127,
"grad_norm": 1.585609793663025,
"learning_rate": 4.842239846814716e-05,
"loss": 1.3015,
"step": 418
},
{
"epoch": 0.20310227823557925,
"grad_norm": 1.5259923934936523,
"learning_rate": 4.840757064540862e-05,
"loss": 1.8969,
"step": 419
},
{
"epoch": 0.2035870092098885,
"grad_norm": 1.5763602256774902,
"learning_rate": 4.8392675757035114e-05,
"loss": 1.6627,
"step": 420
},
{
"epoch": 0.20407174018419777,
"grad_norm": 1.4582678079605103,
"learning_rate": 4.837771384570238e-05,
"loss": 1.4835,
"step": 421
},
{
"epoch": 0.20455647115850703,
"grad_norm": 1.6100239753723145,
"learning_rate": 4.8362684954278174e-05,
"loss": 1.3861,
"step": 422
},
{
"epoch": 0.2050412021328163,
"grad_norm": 1.6313788890838623,
"learning_rate": 4.834758912582217e-05,
"loss": 1.4019,
"step": 423
},
{
"epoch": 0.20552593310712555,
"grad_norm": 1.7470930814743042,
"learning_rate": 4.8332426403585805e-05,
"loss": 1.7705,
"step": 424
},
{
"epoch": 0.2060106640814348,
"grad_norm": 1.4268667697906494,
"learning_rate": 4.831719683101219e-05,
"loss": 1.4434,
"step": 425
},
{
"epoch": 0.20649539505574407,
"grad_norm": 1.5203051567077637,
"learning_rate": 4.830190045173596e-05,
"loss": 1.7992,
"step": 426
},
{
"epoch": 0.2069801260300533,
"grad_norm": 4.344082355499268,
"learning_rate": 4.828653730958318e-05,
"loss": 1.4597,
"step": 427
},
{
"epoch": 0.20746485700436257,
"grad_norm": 1.5200762748718262,
"learning_rate": 4.827110744857117e-05,
"loss": 1.4586,
"step": 428
},
{
"epoch": 0.20794958797867183,
"grad_norm": 1.4414129257202148,
"learning_rate": 4.825561091290844e-05,
"loss": 1.2286,
"step": 429
},
{
"epoch": 0.2084343189529811,
"grad_norm": 1.9879165887832642,
"learning_rate": 4.82400477469945e-05,
"loss": 1.6122,
"step": 430
},
{
"epoch": 0.20891904992729035,
"grad_norm": 1.4587631225585938,
"learning_rate": 4.822441799541979e-05,
"loss": 1.5816,
"step": 431
},
{
"epoch": 0.2094037809015996,
"grad_norm": 1.6219898462295532,
"learning_rate": 4.82087217029655e-05,
"loss": 1.4554,
"step": 432
},
{
"epoch": 0.20988851187590887,
"grad_norm": 1.6090373992919922,
"learning_rate": 4.819295891460349e-05,
"loss": 1.8978,
"step": 433
},
{
"epoch": 0.21037324285021813,
"grad_norm": 1.6244341135025024,
"learning_rate": 4.817712967549614e-05,
"loss": 1.5749,
"step": 434
},
{
"epoch": 0.2108579738245274,
"grad_norm": 1.5403733253479004,
"learning_rate": 4.8161234030996204e-05,
"loss": 1.5755,
"step": 435
},
{
"epoch": 0.21134270479883666,
"grad_norm": 1.6046067476272583,
"learning_rate": 4.81452720266467e-05,
"loss": 1.8461,
"step": 436
},
{
"epoch": 0.21182743577314592,
"grad_norm": 1.4461714029312134,
"learning_rate": 4.8129243708180785e-05,
"loss": 1.4177,
"step": 437
},
{
"epoch": 0.21231216674745515,
"grad_norm": 1.6043570041656494,
"learning_rate": 4.81131491215216e-05,
"loss": 1.5736,
"step": 438
},
{
"epoch": 0.2127968977217644,
"grad_norm": 1.4942891597747803,
"learning_rate": 4.8096988312782174e-05,
"loss": 1.3317,
"step": 439
},
{
"epoch": 0.21328162869607367,
"grad_norm": 1.5307039022445679,
"learning_rate": 4.808076132826524e-05,
"loss": 1.493,
"step": 440
},
{
"epoch": 0.21376635967038293,
"grad_norm": 1.3940438032150269,
"learning_rate": 4.806446821446317e-05,
"loss": 1.4157,
"step": 441
},
{
"epoch": 0.2142510906446922,
"grad_norm": 1.4989084005355835,
"learning_rate": 4.8048109018057776e-05,
"loss": 1.6034,
"step": 442
},
{
"epoch": 0.21473582161900145,
"grad_norm": 1.5145782232284546,
"learning_rate": 4.80316837859202e-05,
"loss": 1.6784,
"step": 443
},
{
"epoch": 0.21522055259331072,
"grad_norm": 1.4529794454574585,
"learning_rate": 4.801519256511082e-05,
"loss": 1.4246,
"step": 444
},
{
"epoch": 0.21570528356761998,
"grad_norm": 1.5570049285888672,
"learning_rate": 4.799863540287905e-05,
"loss": 1.582,
"step": 445
},
{
"epoch": 0.21619001454192924,
"grad_norm": 1.42445707321167,
"learning_rate": 4.798201234666324e-05,
"loss": 1.3466,
"step": 446
},
{
"epoch": 0.2166747455162385,
"grad_norm": 1.5558465719223022,
"learning_rate": 4.796532344409055e-05,
"loss": 1.5421,
"step": 447
},
{
"epoch": 0.21715947649054776,
"grad_norm": 4.552513599395752,
"learning_rate": 4.794856874297676e-05,
"loss": 2.2261,
"step": 448
},
{
"epoch": 0.217644207464857,
"grad_norm": 1.374784231185913,
"learning_rate": 4.793174829132623e-05,
"loss": 1.5355,
"step": 449
},
{
"epoch": 0.21812893843916625,
"grad_norm": 1.9197838306427002,
"learning_rate": 4.791486213733164e-05,
"loss": 1.7625,
"step": 450
},
{
"epoch": 0.21861366941347551,
"grad_norm": 1.4595524072647095,
"learning_rate": 4.789791032937397e-05,
"loss": 1.2965,
"step": 451
},
{
"epoch": 0.21909840038778478,
"grad_norm": 1.4804373979568481,
"learning_rate": 4.7880892916022265e-05,
"loss": 1.3907,
"step": 452
},
{
"epoch": 0.21958313136209404,
"grad_norm": 1.4374562501907349,
"learning_rate": 4.786380994603356e-05,
"loss": 1.4291,
"step": 453
},
{
"epoch": 0.2200678623364033,
"grad_norm": 1.697147250175476,
"learning_rate": 4.7846661468352716e-05,
"loss": 1.8611,
"step": 454
},
{
"epoch": 0.22055259331071256,
"grad_norm": 1.5372896194458008,
"learning_rate": 4.782944753211228e-05,
"loss": 1.5396,
"step": 455
},
{
"epoch": 0.22103732428502182,
"grad_norm": 1.520835518836975,
"learning_rate": 4.781216818663234e-05,
"loss": 1.5921,
"step": 456
},
{
"epoch": 0.22152205525933108,
"grad_norm": 1.7574188709259033,
"learning_rate": 4.7794823481420406e-05,
"loss": 1.644,
"step": 457
},
{
"epoch": 0.22200678623364034,
"grad_norm": 1.384774923324585,
"learning_rate": 4.7777413466171227e-05,
"loss": 1.542,
"step": 458
},
{
"epoch": 0.2224915172079496,
"grad_norm": 1.520736575126648,
"learning_rate": 4.7759938190766694e-05,
"loss": 1.674,
"step": 459
},
{
"epoch": 0.22297624818225883,
"grad_norm": 1.4717646837234497,
"learning_rate": 4.7742397705275665e-05,
"loss": 1.3583,
"step": 460
},
{
"epoch": 0.2234609791565681,
"grad_norm": 1.4752681255340576,
"learning_rate": 4.772479205995385e-05,
"loss": 1.4931,
"step": 461
},
{
"epoch": 0.22394571013087736,
"grad_norm": 1.5523077249526978,
"learning_rate": 4.7707121305243623e-05,
"loss": 1.4731,
"step": 462
},
{
"epoch": 0.22443044110518662,
"grad_norm": 1.4732838869094849,
"learning_rate": 4.768938549177393e-05,
"loss": 1.4059,
"step": 463
},
{
"epoch": 0.22491517207949588,
"grad_norm": 1.555053472518921,
"learning_rate": 4.7671584670360105e-05,
"loss": 1.3137,
"step": 464
},
{
"epoch": 0.22539990305380514,
"grad_norm": 1.4933258295059204,
"learning_rate": 4.765371889200373e-05,
"loss": 1.4378,
"step": 465
},
{
"epoch": 0.2258846340281144,
"grad_norm": 1.409632921218872,
"learning_rate": 4.763578820789253e-05,
"loss": 1.4902,
"step": 466
},
{
"epoch": 0.22636936500242366,
"grad_norm": 1.4202258586883545,
"learning_rate": 4.761779266940015e-05,
"loss": 1.3832,
"step": 467
},
{
"epoch": 0.22685409597673292,
"grad_norm": 1.3944295644760132,
"learning_rate": 4.759973232808609e-05,
"loss": 1.5828,
"step": 468
},
{
"epoch": 0.22733882695104218,
"grad_norm": 1.5394325256347656,
"learning_rate": 4.758160723569548e-05,
"loss": 1.207,
"step": 469
},
{
"epoch": 0.22782355792535142,
"grad_norm": 1.7582964897155762,
"learning_rate": 4.756341744415901e-05,
"loss": 1.7935,
"step": 470
},
{
"epoch": 0.22830828889966068,
"grad_norm": 1.4797354936599731,
"learning_rate": 4.754516300559271e-05,
"loss": 1.5384,
"step": 471
},
{
"epoch": 0.22879301987396994,
"grad_norm": 1.5122283697128296,
"learning_rate": 4.752684397229784e-05,
"loss": 1.8131,
"step": 472
},
{
"epoch": 0.2292777508482792,
"grad_norm": 1.4401196241378784,
"learning_rate": 4.750846039676075e-05,
"loss": 1.5188,
"step": 473
},
{
"epoch": 0.22976248182258846,
"grad_norm": 1.4823417663574219,
"learning_rate": 4.7490012331652675e-05,
"loss": 1.2987,
"step": 474
},
{
"epoch": 0.23024721279689772,
"grad_norm": 1.5985757112503052,
"learning_rate": 4.7471499829829666e-05,
"loss": 1.7577,
"step": 475
},
{
"epoch": 0.23073194377120698,
"grad_norm": 1.5306726694107056,
"learning_rate": 4.7452922944332355e-05,
"loss": 1.4455,
"step": 476
},
{
"epoch": 0.23121667474551624,
"grad_norm": 1.585697889328003,
"learning_rate": 4.7434281728385867e-05,
"loss": 1.4526,
"step": 477
},
{
"epoch": 0.2317014057198255,
"grad_norm": 1.5287288427352905,
"learning_rate": 4.741557623539962e-05,
"loss": 1.8082,
"step": 478
},
{
"epoch": 0.23218613669413476,
"grad_norm": 1.835727334022522,
"learning_rate": 4.73968065189672e-05,
"loss": 1.5274,
"step": 479
},
{
"epoch": 0.23267086766844403,
"grad_norm": 1.5724575519561768,
"learning_rate": 4.7377972632866226e-05,
"loss": 1.8953,
"step": 480
},
{
"epoch": 0.23315559864275326,
"grad_norm": 1.598510503768921,
"learning_rate": 4.7359074631058134e-05,
"loss": 1.7241,
"step": 481
},
{
"epoch": 0.23364032961706252,
"grad_norm": 1.5090588331222534,
"learning_rate": 4.7340112567688085e-05,
"loss": 1.3641,
"step": 482
},
{
"epoch": 0.23412506059137178,
"grad_norm": 1.5519758462905884,
"learning_rate": 4.732108649708478e-05,
"loss": 1.5011,
"step": 483
},
{
"epoch": 0.23460979156568104,
"grad_norm": 1.5860713720321655,
"learning_rate": 4.7301996473760304e-05,
"loss": 1.5786,
"step": 484
},
{
"epoch": 0.2350945225399903,
"grad_norm": 1.534163236618042,
"learning_rate": 4.728284255240996e-05,
"loss": 1.4667,
"step": 485
},
{
"epoch": 0.23557925351429956,
"grad_norm": 1.3986639976501465,
"learning_rate": 4.726362478791217e-05,
"loss": 1.4603,
"step": 486
},
{
"epoch": 0.23606398448860882,
"grad_norm": 1.4852182865142822,
"learning_rate": 4.724434323532821e-05,
"loss": 1.6303,
"step": 487
},
{
"epoch": 0.23654871546291809,
"grad_norm": 1.4321929216384888,
"learning_rate": 4.7224997949902186e-05,
"loss": 1.3958,
"step": 488
},
{
"epoch": 0.23703344643722735,
"grad_norm": 1.8100051879882812,
"learning_rate": 4.720558898706077e-05,
"loss": 1.4998,
"step": 489
},
{
"epoch": 0.2375181774115366,
"grad_norm": 1.4117873907089233,
"learning_rate": 4.7186116402413064e-05,
"loss": 1.3552,
"step": 490
},
{
"epoch": 0.23800290838584587,
"grad_norm": 1.5080662965774536,
"learning_rate": 4.716658025175049e-05,
"loss": 1.5167,
"step": 491
},
{
"epoch": 0.2384876393601551,
"grad_norm": 1.5356587171554565,
"learning_rate": 4.714698059104658e-05,
"loss": 1.3864,
"step": 492
},
{
"epoch": 0.23897237033446436,
"grad_norm": 1.686954140663147,
"learning_rate": 4.712731747645682e-05,
"loss": 1.6785,
"step": 493
},
{
"epoch": 0.23945710130877362,
"grad_norm": 1.4700133800506592,
"learning_rate": 4.7107590964318505e-05,
"loss": 1.2093,
"step": 494
},
{
"epoch": 0.23994183228308288,
"grad_norm": 1.4735498428344727,
"learning_rate": 4.708780111115057e-05,
"loss": 1.6332,
"step": 495
},
{
"epoch": 0.24042656325739215,
"grad_norm": 1.572034239768982,
"learning_rate": 4.706794797365346e-05,
"loss": 1.5175,
"step": 496
},
{
"epoch": 0.2409112942317014,
"grad_norm": 1.4558137655258179,
"learning_rate": 4.7048031608708876e-05,
"loss": 1.2924,
"step": 497
},
{
"epoch": 0.24139602520601067,
"grad_norm": 1.6685311794281006,
"learning_rate": 4.702805207337974e-05,
"loss": 1.0251,
"step": 498
},
{
"epoch": 0.24188075618031993,
"grad_norm": 1.5901682376861572,
"learning_rate": 4.7008009424909917e-05,
"loss": 2.1593,
"step": 499
},
{
"epoch": 0.2423654871546292,
"grad_norm": 1.4795153141021729,
"learning_rate": 4.698790372072411e-05,
"loss": 1.7071,
"step": 500
},
{
"epoch": 0.24285021812893845,
"grad_norm": 1.9734545946121216,
"learning_rate": 4.696773501842771e-05,
"loss": 1.4482,
"step": 501
},
{
"epoch": 0.2433349491032477,
"grad_norm": 1.4583733081817627,
"learning_rate": 4.694750337580659e-05,
"loss": 1.5411,
"step": 502
},
{
"epoch": 0.24381968007755694,
"grad_norm": 1.5093872547149658,
"learning_rate": 4.6927208850826925e-05,
"loss": 1.414,
"step": 503
},
{
"epoch": 0.2443044110518662,
"grad_norm": 1.376407504081726,
"learning_rate": 4.6906851501635106e-05,
"loss": 1.2979,
"step": 504
},
{
"epoch": 0.24478914202617547,
"grad_norm": 1.5103840827941895,
"learning_rate": 4.688643138655748e-05,
"loss": 1.502,
"step": 505
},
{
"epoch": 0.24527387300048473,
"grad_norm": 1.4392768144607544,
"learning_rate": 4.686594856410027e-05,
"loss": 1.4235,
"step": 506
},
{
"epoch": 0.245758603974794,
"grad_norm": 1.5379340648651123,
"learning_rate": 4.684540309294932e-05,
"loss": 1.5077,
"step": 507
},
{
"epoch": 0.24624333494910325,
"grad_norm": 1.5546420812606812,
"learning_rate": 4.682479503197001e-05,
"loss": 1.5388,
"step": 508
},
{
"epoch": 0.2467280659234125,
"grad_norm": 1.6801774501800537,
"learning_rate": 4.6804124440207e-05,
"loss": 1.7391,
"step": 509
},
{
"epoch": 0.24721279689772177,
"grad_norm": 1.4841091632843018,
"learning_rate": 4.678339137688416e-05,
"loss": 1.72,
"step": 510
},
{
"epoch": 0.24769752787203103,
"grad_norm": 1.4708201885223389,
"learning_rate": 4.67625959014043e-05,
"loss": 1.5798,
"step": 511
},
{
"epoch": 0.2481822588463403,
"grad_norm": 1.5190343856811523,
"learning_rate": 4.67417380733491e-05,
"loss": 1.6462,
"step": 512
},
{
"epoch": 0.24866698982064955,
"grad_norm": 1.479689121246338,
"learning_rate": 4.6720817952478854e-05,
"loss": 1.5912,
"step": 513
},
{
"epoch": 0.2491517207949588,
"grad_norm": 1.4845134019851685,
"learning_rate": 4.6699835598732325e-05,
"loss": 1.4543,
"step": 514
},
{
"epoch": 0.24963645176926805,
"grad_norm": 1.5402343273162842,
"learning_rate": 4.667879107222662e-05,
"loss": 1.4791,
"step": 515
},
{
"epoch": 0.2501211827435773,
"grad_norm": 1.627976417541504,
"learning_rate": 4.6657684433256934e-05,
"loss": 1.4067,
"step": 516
},
{
"epoch": 0.2506059137178866,
"grad_norm": 1.530739665031433,
"learning_rate": 4.6636515742296464e-05,
"loss": 1.9333,
"step": 517
},
{
"epoch": 0.25109064469219583,
"grad_norm": 1.4607384204864502,
"learning_rate": 4.661528505999615e-05,
"loss": 1.537,
"step": 518
},
{
"epoch": 0.25157537566650506,
"grad_norm": 1.5360444784164429,
"learning_rate": 4.6593992447184586e-05,
"loss": 1.3789,
"step": 519
},
{
"epoch": 0.25206010664081435,
"grad_norm": 1.4390000104904175,
"learning_rate": 4.6572637964867776e-05,
"loss": 1.4374,
"step": 520
},
{
"epoch": 0.2525448376151236,
"grad_norm": 1.4811269044876099,
"learning_rate": 4.6551221674229003e-05,
"loss": 1.561,
"step": 521
},
{
"epoch": 0.2530295685894329,
"grad_norm": 1.4047800302505493,
"learning_rate": 4.652974363662864e-05,
"loss": 1.2907,
"step": 522
},
{
"epoch": 0.2535142995637421,
"grad_norm": 1.6080119609832764,
"learning_rate": 4.650820391360396e-05,
"loss": 1.4566,
"step": 523
},
{
"epoch": 0.2539990305380514,
"grad_norm": 1.5174490213394165,
"learning_rate": 4.6486602566868975e-05,
"loss": 1.4285,
"step": 524
},
{
"epoch": 0.25448376151236063,
"grad_norm": 1.5314956903457642,
"learning_rate": 4.6464939658314274e-05,
"loss": 1.3154,
"step": 525
},
{
"epoch": 0.2549684924866699,
"grad_norm": 1.4693747758865356,
"learning_rate": 4.6443215250006806e-05,
"loss": 1.4333,
"step": 526
},
{
"epoch": 0.25545322346097915,
"grad_norm": 1.5617380142211914,
"learning_rate": 4.642142940418973e-05,
"loss": 1.4411,
"step": 527
},
{
"epoch": 0.25593795443528844,
"grad_norm": 1.4139310121536255,
"learning_rate": 4.6399582183282256e-05,
"loss": 1.6047,
"step": 528
},
{
"epoch": 0.2564226854095977,
"grad_norm": 1.6654372215270996,
"learning_rate": 4.6377673649879396e-05,
"loss": 1.7693,
"step": 529
},
{
"epoch": 0.2569074163839069,
"grad_norm": 1.595267415046692,
"learning_rate": 4.635570386675186e-05,
"loss": 1.458,
"step": 530
},
{
"epoch": 0.2573921473582162,
"grad_norm": 1.4914833307266235,
"learning_rate": 4.633367289684586e-05,
"loss": 1.4662,
"step": 531
},
{
"epoch": 0.2578768783325254,
"grad_norm": 1.764076828956604,
"learning_rate": 4.631158080328287e-05,
"loss": 2.023,
"step": 532
},
{
"epoch": 0.2583616093068347,
"grad_norm": 1.4962358474731445,
"learning_rate": 4.628942764935954e-05,
"loss": 1.1698,
"step": 533
},
{
"epoch": 0.25884634028114395,
"grad_norm": 1.628941535949707,
"learning_rate": 4.626721349854742e-05,
"loss": 1.6603,
"step": 534
},
{
"epoch": 0.25933107125545324,
"grad_norm": 1.473333477973938,
"learning_rate": 4.6244938414492875e-05,
"loss": 1.4744,
"step": 535
},
{
"epoch": 0.25981580222976247,
"grad_norm": 1.5255992412567139,
"learning_rate": 4.62226024610168e-05,
"loss": 1.6574,
"step": 536
},
{
"epoch": 0.26030053320407176,
"grad_norm": 1.4754759073257446,
"learning_rate": 4.6200205702114526e-05,
"loss": 1.3515,
"step": 537
},
{
"epoch": 0.260785264178381,
"grad_norm": 1.539143681526184,
"learning_rate": 4.617774820195557e-05,
"loss": 1.361,
"step": 538
},
{
"epoch": 0.2612699951526903,
"grad_norm": 1.4834741353988647,
"learning_rate": 4.615523002488352e-05,
"loss": 1.4823,
"step": 539
},
{
"epoch": 0.2617547261269995,
"grad_norm": 1.541680932044983,
"learning_rate": 4.6132651235415764e-05,
"loss": 1.6218,
"step": 540
},
{
"epoch": 0.26223945710130875,
"grad_norm": 1.802275538444519,
"learning_rate": 4.6110011898243374e-05,
"loss": 1.2527,
"step": 541
},
{
"epoch": 0.26272418807561804,
"grad_norm": 1.4547120332717896,
"learning_rate": 4.608731207823093e-05,
"loss": 1.2674,
"step": 542
},
{
"epoch": 0.26320891904992727,
"grad_norm": 1.4830158948898315,
"learning_rate": 4.606455184041622e-05,
"loss": 1.3814,
"step": 543
},
{
"epoch": 0.26369365002423656,
"grad_norm": 1.5940104722976685,
"learning_rate": 4.6041731250010246e-05,
"loss": 1.644,
"step": 544
},
{
"epoch": 0.2641783809985458,
"grad_norm": 1.4886668920516968,
"learning_rate": 4.601885037239683e-05,
"loss": 1.3262,
"step": 545
},
{
"epoch": 0.2646631119728551,
"grad_norm": 1.414720058441162,
"learning_rate": 4.5995909273132587e-05,
"loss": 1.3218,
"step": 546
},
{
"epoch": 0.2651478429471643,
"grad_norm": 1.3138704299926758,
"learning_rate": 4.597290801794664e-05,
"loss": 1.161,
"step": 547
},
{
"epoch": 0.2656325739214736,
"grad_norm": 1.401702880859375,
"learning_rate": 4.594984667274048e-05,
"loss": 1.6609,
"step": 548
},
{
"epoch": 0.26611730489578284,
"grad_norm": 1.4239600896835327,
"learning_rate": 4.592672530358777e-05,
"loss": 1.4492,
"step": 549
},
{
"epoch": 0.2666020358700921,
"grad_norm": 1.3764070272445679,
"learning_rate": 4.5903543976734145e-05,
"loss": 1.1719,
"step": 550
},
{
"epoch": 0.26708676684440136,
"grad_norm": 1.6861745119094849,
"learning_rate": 4.5880302758597e-05,
"loss": 1.4993,
"step": 551
},
{
"epoch": 0.2675714978187106,
"grad_norm": 1.3549953699111938,
"learning_rate": 4.585700171576538e-05,
"loss": 1.2464,
"step": 552
},
{
"epoch": 0.2680562287930199,
"grad_norm": 1.4663500785827637,
"learning_rate": 4.583364091499968e-05,
"loss": 1.7526,
"step": 553
},
{
"epoch": 0.2685409597673291,
"grad_norm": 1.8229775428771973,
"learning_rate": 4.581022042323155e-05,
"loss": 1.2188,
"step": 554
},
{
"epoch": 0.2690256907416384,
"grad_norm": 1.5518858432769775,
"learning_rate": 4.5786740307563636e-05,
"loss": 1.4877,
"step": 555
},
{
"epoch": 0.26951042171594763,
"grad_norm": 1.3994567394256592,
"learning_rate": 4.576320063526942e-05,
"loss": 1.1313,
"step": 556
},
{
"epoch": 0.2699951526902569,
"grad_norm": 1.5038384199142456,
"learning_rate": 4.573960147379304e-05,
"loss": 1.4847,
"step": 557
},
{
"epoch": 0.27047988366456616,
"grad_norm": 1.4688293933868408,
"learning_rate": 4.5715942890749045e-05,
"loss": 1.3024,
"step": 558
},
{
"epoch": 0.27096461463887545,
"grad_norm": 1.4721691608428955,
"learning_rate": 4.5692224953922266e-05,
"loss": 1.7076,
"step": 559
},
{
"epoch": 0.2714493456131847,
"grad_norm": 1.452632188796997,
"learning_rate": 4.566844773126757e-05,
"loss": 1.4929,
"step": 560
},
{
"epoch": 0.27193407658749397,
"grad_norm": 1.4588583707809448,
"learning_rate": 4.564461129090969e-05,
"loss": 1.2069,
"step": 561
},
{
"epoch": 0.2724188075618032,
"grad_norm": 1.426776647567749,
"learning_rate": 4.562071570114304e-05,
"loss": 1.2884,
"step": 562
},
{
"epoch": 0.27290353853611243,
"grad_norm": 1.4680659770965576,
"learning_rate": 4.5596761030431465e-05,
"loss": 1.8103,
"step": 563
},
{
"epoch": 0.2733882695104217,
"grad_norm": 1.7482470273971558,
"learning_rate": 4.557274734740813e-05,
"loss": 1.1736,
"step": 564
},
{
"epoch": 0.27387300048473096,
"grad_norm": 1.5107123851776123,
"learning_rate": 4.554867472087525e-05,
"loss": 1.6481,
"step": 565
},
{
"epoch": 0.27435773145904024,
"grad_norm": 1.5711710453033447,
"learning_rate": 4.552454321980394e-05,
"loss": 1.2276,
"step": 566
},
{
"epoch": 0.2748424624333495,
"grad_norm": 1.6438961029052734,
"learning_rate": 4.5500352913333974e-05,
"loss": 1.4729,
"step": 567
},
{
"epoch": 0.27532719340765877,
"grad_norm": 1.5576547384262085,
"learning_rate": 4.547610387077363e-05,
"loss": 1.6697,
"step": 568
},
{
"epoch": 0.275811924381968,
"grad_norm": 1.587242841720581,
"learning_rate": 4.5451796161599466e-05,
"loss": 1.727,
"step": 569
},
{
"epoch": 0.2762966553562773,
"grad_norm": 1.4595431089401245,
"learning_rate": 4.5427429855456125e-05,
"loss": 1.5131,
"step": 570
},
{
"epoch": 0.2767813863305865,
"grad_norm": 1.8416346311569214,
"learning_rate": 4.5403005022156145e-05,
"loss": 1.241,
"step": 571
},
{
"epoch": 0.2772661173048958,
"grad_norm": 1.4765833616256714,
"learning_rate": 4.5378521731679735e-05,
"loss": 1.268,
"step": 572
},
{
"epoch": 0.27775084827920504,
"grad_norm": 1.4958488941192627,
"learning_rate": 4.535398005417461e-05,
"loss": 1.2896,
"step": 573
},
{
"epoch": 0.2782355792535143,
"grad_norm": 1.6208523511886597,
"learning_rate": 4.5329380059955776e-05,
"loss": 1.5778,
"step": 574
},
{
"epoch": 0.27872031022782356,
"grad_norm": 1.4747179746627808,
"learning_rate": 4.530472181950528e-05,
"loss": 1.3638,
"step": 575
},
{
"epoch": 0.2792050412021328,
"grad_norm": 1.5383862257003784,
"learning_rate": 4.528000540347212e-05,
"loss": 1.2982,
"step": 576
},
{
"epoch": 0.2796897721764421,
"grad_norm": 1.5441182851791382,
"learning_rate": 4.52552308826719e-05,
"loss": 1.4339,
"step": 577
},
{
"epoch": 0.2801745031507513,
"grad_norm": 1.4504390954971313,
"learning_rate": 4.523039832808677e-05,
"loss": 1.6174,
"step": 578
},
{
"epoch": 0.2806592341250606,
"grad_norm": 1.409399151802063,
"learning_rate": 4.520550781086511e-05,
"loss": 1.1168,
"step": 579
},
{
"epoch": 0.28114396509936984,
"grad_norm": 1.5246732234954834,
"learning_rate": 4.5180559402321385e-05,
"loss": 1.5579,
"step": 580
},
{
"epoch": 0.28162869607367913,
"grad_norm": 1.6616579294204712,
"learning_rate": 4.515555317393593e-05,
"loss": 1.8633,
"step": 581
},
{
"epoch": 0.28211342704798836,
"grad_norm": 1.4933806657791138,
"learning_rate": 4.5130489197354734e-05,
"loss": 1.5078,
"step": 582
},
{
"epoch": 0.28259815802229765,
"grad_norm": 1.469255805015564,
"learning_rate": 4.510536754438923e-05,
"loss": 1.4949,
"step": 583
},
{
"epoch": 0.2830828889966069,
"grad_norm": 1.32322096824646,
"learning_rate": 4.508018828701612e-05,
"loss": 1.1069,
"step": 584
},
{
"epoch": 0.2835676199709161,
"grad_norm": 1.438407063484192,
"learning_rate": 4.5054951497377165e-05,
"loss": 1.587,
"step": 585
},
{
"epoch": 0.2840523509452254,
"grad_norm": 1.5011217594146729,
"learning_rate": 4.502965724777891e-05,
"loss": 1.7561,
"step": 586
},
{
"epoch": 0.28453708191953464,
"grad_norm": 1.436948537826538,
"learning_rate": 4.500430561069259e-05,
"loss": 1.3473,
"step": 587
},
{
"epoch": 0.28502181289384393,
"grad_norm": 1.5629382133483887,
"learning_rate": 4.497889665875382e-05,
"loss": 1.8745,
"step": 588
},
{
"epoch": 0.28550654386815316,
"grad_norm": 1.4830938577651978,
"learning_rate": 4.495343046476245e-05,
"loss": 1.5765,
"step": 589
},
{
"epoch": 0.28599127484246245,
"grad_norm": 1.4268252849578857,
"learning_rate": 4.492790710168233e-05,
"loss": 1.3325,
"step": 590
},
{
"epoch": 0.2864760058167717,
"grad_norm": 1.4147472381591797,
"learning_rate": 4.4902326642641095e-05,
"loss": 1.3003,
"step": 591
},
{
"epoch": 0.286960736791081,
"grad_norm": 1.4721652269363403,
"learning_rate": 4.487668916093e-05,
"loss": 1.3235,
"step": 592
},
{
"epoch": 0.2874454677653902,
"grad_norm": 1.5141065120697021,
"learning_rate": 4.4850994730003634e-05,
"loss": 1.2092,
"step": 593
},
{
"epoch": 0.2879301987396995,
"grad_norm": 1.4203892946243286,
"learning_rate": 4.482524342347978e-05,
"loss": 1.8299,
"step": 594
},
{
"epoch": 0.2884149297140087,
"grad_norm": 2.1075599193573,
"learning_rate": 4.479943531513918e-05,
"loss": 1.9049,
"step": 595
},
{
"epoch": 0.28889966068831796,
"grad_norm": 1.3928402662277222,
"learning_rate": 4.477357047892531e-05,
"loss": 1.4002,
"step": 596
},
{
"epoch": 0.28938439166262725,
"grad_norm": 1.4497021436691284,
"learning_rate": 4.474764898894418e-05,
"loss": 1.3542,
"step": 597
},
{
"epoch": 0.2898691226369365,
"grad_norm": 1.6207205057144165,
"learning_rate": 4.472167091946411e-05,
"loss": 1.8706,
"step": 598
},
{
"epoch": 0.29035385361124577,
"grad_norm": 1.4122729301452637,
"learning_rate": 4.469563634491554e-05,
"loss": 1.3707,
"step": 599
},
{
"epoch": 0.290838584585555,
"grad_norm": 1.7664668560028076,
"learning_rate": 4.4669545339890814e-05,
"loss": 1.2924,
"step": 600
},
{
"epoch": 0.2913233155598643,
"grad_norm": 1.4191179275512695,
"learning_rate": 4.464339797914393e-05,
"loss": 1.4986,
"step": 601
},
{
"epoch": 0.2918080465341735,
"grad_norm": 1.3508961200714111,
"learning_rate": 4.4617194337590376e-05,
"loss": 1.3907,
"step": 602
},
{
"epoch": 0.2922927775084828,
"grad_norm": 1.5529752969741821,
"learning_rate": 4.459093449030688e-05,
"loss": 1.3911,
"step": 603
},
{
"epoch": 0.29277750848279205,
"grad_norm": 1.4372109174728394,
"learning_rate": 4.4564618512531206e-05,
"loss": 1.3964,
"step": 604
},
{
"epoch": 0.2932622394571013,
"grad_norm": 1.628930687904358,
"learning_rate": 4.4538246479661936e-05,
"loss": 1.8534,
"step": 605
},
{
"epoch": 0.29374697043141057,
"grad_norm": 1.4418190717697144,
"learning_rate": 4.451181846725827e-05,
"loss": 1.4604,
"step": 606
},
{
"epoch": 0.2942317014057198,
"grad_norm": 1.503642201423645,
"learning_rate": 4.448533455103979e-05,
"loss": 1.6738,
"step": 607
},
{
"epoch": 0.2947164323800291,
"grad_norm": 1.5811353921890259,
"learning_rate": 4.445879480688625e-05,
"loss": 1.2007,
"step": 608
},
{
"epoch": 0.2952011633543383,
"grad_norm": 1.4607658386230469,
"learning_rate": 4.443219931083734e-05,
"loss": 1.3633,
"step": 609
},
{
"epoch": 0.2956858943286476,
"grad_norm": 1.4199997186660767,
"learning_rate": 4.440554813909252e-05,
"loss": 1.6503,
"step": 610
},
{
"epoch": 0.29617062530295685,
"grad_norm": 1.5280699729919434,
"learning_rate": 4.437884136801074e-05,
"loss": 1.4311,
"step": 611
},
{
"epoch": 0.29665535627726614,
"grad_norm": 1.4121835231781006,
"learning_rate": 4.435207907411026e-05,
"loss": 1.3614,
"step": 612
},
{
"epoch": 0.29714008725157537,
"grad_norm": 1.6203902959823608,
"learning_rate": 4.4325261334068426e-05,
"loss": 1.6557,
"step": 613
},
{
"epoch": 0.29762481822588466,
"grad_norm": 1.4249413013458252,
"learning_rate": 4.4298388224721435e-05,
"loss": 1.4862,
"step": 614
},
{
"epoch": 0.2981095492001939,
"grad_norm": 1.6460726261138916,
"learning_rate": 4.427145982306412e-05,
"loss": 1.7338,
"step": 615
},
{
"epoch": 0.2985942801745031,
"grad_norm": 1.5498179197311401,
"learning_rate": 4.4244476206249745e-05,
"loss": 1.1775,
"step": 616
},
{
"epoch": 0.2990790111488124,
"grad_norm": 1.4984955787658691,
"learning_rate": 4.421743745158977e-05,
"loss": 1.5588,
"step": 617
},
{
"epoch": 0.29956374212312165,
"grad_norm": 1.8243629932403564,
"learning_rate": 4.419034363655362e-05,
"loss": 1.6428,
"step": 618
},
{
"epoch": 0.30004847309743093,
"grad_norm": 1.5599781274795532,
"learning_rate": 4.4163194838768495e-05,
"loss": 1.7171,
"step": 619
},
{
"epoch": 0.30053320407174017,
"grad_norm": 1.6111764907836914,
"learning_rate": 4.4135991136019106e-05,
"loss": 1.8379,
"step": 620
},
{
"epoch": 0.30101793504604946,
"grad_norm": 1.54121994972229,
"learning_rate": 4.4108732606247495e-05,
"loss": 1.3949,
"step": 621
},
{
"epoch": 0.3015026660203587,
"grad_norm": 1.552783489227295,
"learning_rate": 4.408141932755277e-05,
"loss": 1.3488,
"step": 622
},
{
"epoch": 0.301987396994668,
"grad_norm": 1.6675286293029785,
"learning_rate": 4.4054051378190915e-05,
"loss": 1.2843,
"step": 623
},
{
"epoch": 0.3024721279689772,
"grad_norm": 1.6017731428146362,
"learning_rate": 4.402662883657454e-05,
"loss": 1.6162,
"step": 624
},
{
"epoch": 0.3029568589432865,
"grad_norm": 1.6486562490463257,
"learning_rate": 4.3999151781272694e-05,
"loss": 1.723,
"step": 625
},
{
"epoch": 0.30344158991759573,
"grad_norm": 1.474776268005371,
"learning_rate": 4.397162029101058e-05,
"loss": 1.7979,
"step": 626
},
{
"epoch": 0.30392632089190497,
"grad_norm": 1.4462790489196777,
"learning_rate": 4.3944034444669405e-05,
"loss": 1.3842,
"step": 627
},
{
"epoch": 0.30441105186621426,
"grad_norm": 1.8951647281646729,
"learning_rate": 4.391639432128606e-05,
"loss": 1.4682,
"step": 628
},
{
"epoch": 0.3048957828405235,
"grad_norm": 1.4978829622268677,
"learning_rate": 4.3888700000052996e-05,
"loss": 1.4582,
"step": 629
},
{
"epoch": 0.3053805138148328,
"grad_norm": 1.5500818490982056,
"learning_rate": 4.386095156031792e-05,
"loss": 1.6484,
"step": 630
},
{
"epoch": 0.305865244789142,
"grad_norm": 1.9269793033599854,
"learning_rate": 4.3833149081583604e-05,
"loss": 1.5815,
"step": 631
},
{
"epoch": 0.3063499757634513,
"grad_norm": 2.429781436920166,
"learning_rate": 4.3805292643507644e-05,
"loss": 1.935,
"step": 632
},
{
"epoch": 0.30683470673776053,
"grad_norm": 1.5993130207061768,
"learning_rate": 4.377738232590225e-05,
"loss": 1.3461,
"step": 633
},
{
"epoch": 0.3073194377120698,
"grad_norm": 1.588075041770935,
"learning_rate": 4.374941820873399e-05,
"loss": 1.617,
"step": 634
},
{
"epoch": 0.30780416868637905,
"grad_norm": 1.6482924222946167,
"learning_rate": 4.372140037212357e-05,
"loss": 1.4718,
"step": 635
},
{
"epoch": 0.30828889966068834,
"grad_norm": 1.5200806856155396,
"learning_rate": 4.369332889634563e-05,
"loss": 1.5187,
"step": 636
},
{
"epoch": 0.3087736306349976,
"grad_norm": 1.4226235151290894,
"learning_rate": 4.366520386182846e-05,
"loss": 1.2748,
"step": 637
},
{
"epoch": 0.3092583616093068,
"grad_norm": 1.404143214225769,
"learning_rate": 4.363702534915385e-05,
"loss": 1.3319,
"step": 638
},
{
"epoch": 0.3097430925836161,
"grad_norm": 1.511928677558899,
"learning_rate": 4.360879343905676e-05,
"loss": 1.8477,
"step": 639
},
{
"epoch": 0.31022782355792533,
"grad_norm": 1.4452472925186157,
"learning_rate": 4.358050821242517e-05,
"loss": 1.222,
"step": 640
},
{
"epoch": 0.3107125545322346,
"grad_norm": 1.5438624620437622,
"learning_rate": 4.3552169750299835e-05,
"loss": 1.5466,
"step": 641
},
{
"epoch": 0.31119728550654385,
"grad_norm": 4.010799884796143,
"learning_rate": 4.352377813387398e-05,
"loss": 2.1084,
"step": 642
},
{
"epoch": 0.31168201648085314,
"grad_norm": 1.4538034200668335,
"learning_rate": 4.349533344449318e-05,
"loss": 1.5272,
"step": 643
},
{
"epoch": 0.3121667474551624,
"grad_norm": 1.4150643348693848,
"learning_rate": 4.346683576365505e-05,
"loss": 1.5093,
"step": 644
},
{
"epoch": 0.31265147842947166,
"grad_norm": 1.3800829648971558,
"learning_rate": 4.3438285173009006e-05,
"loss": 1.6719,
"step": 645
},
{
"epoch": 0.3131362094037809,
"grad_norm": 1.3637222051620483,
"learning_rate": 4.340968175435611e-05,
"loss": 1.2698,
"step": 646
},
{
"epoch": 0.3136209403780902,
"grad_norm": 1.470177173614502,
"learning_rate": 4.338102558964876e-05,
"loss": 1.3927,
"step": 647
},
{
"epoch": 0.3141056713523994,
"grad_norm": 2.070209264755249,
"learning_rate": 4.335231676099044e-05,
"loss": 1.3481,
"step": 648
},
{
"epoch": 0.31459040232670865,
"grad_norm": 1.404061198234558,
"learning_rate": 4.332355535063559e-05,
"loss": 1.514,
"step": 649
},
{
"epoch": 0.31507513330101794,
"grad_norm": 3.9239437580108643,
"learning_rate": 4.329474144098924e-05,
"loss": 1.341,
"step": 650
},
{
"epoch": 0.3155598642753272,
"grad_norm": 2.510849952697754,
"learning_rate": 4.32658751146069e-05,
"loss": 1.4483,
"step": 651
},
{
"epoch": 0.31604459524963646,
"grad_norm": 1.472462773323059,
"learning_rate": 4.323695645419419e-05,
"loss": 1.7946,
"step": 652
},
{
"epoch": 0.3165293262239457,
"grad_norm": 1.3601329326629639,
"learning_rate": 4.320798554260674e-05,
"loss": 1.1157,
"step": 653
},
{
"epoch": 0.317014057198255,
"grad_norm": 1.462933897972107,
"learning_rate": 4.3178962462849835e-05,
"loss": 1.3288,
"step": 654
},
{
"epoch": 0.3174987881725642,
"grad_norm": 1.6446117162704468,
"learning_rate": 4.3149887298078276e-05,
"loss": 1.9576,
"step": 655
},
{
"epoch": 0.3179835191468735,
"grad_norm": 1.4152432680130005,
"learning_rate": 4.312076013159604e-05,
"loss": 1.2452,
"step": 656
},
{
"epoch": 0.31846825012118274,
"grad_norm": 2.2185819149017334,
"learning_rate": 4.309158104685614e-05,
"loss": 1.8676,
"step": 657
},
{
"epoch": 0.318952981095492,
"grad_norm": 1.3394452333450317,
"learning_rate": 4.3062350127460325e-05,
"loss": 1.2346,
"step": 658
},
{
"epoch": 0.31943771206980126,
"grad_norm": 1.6615955829620361,
"learning_rate": 4.303306745715885e-05,
"loss": 1.5486,
"step": 659
},
{
"epoch": 0.3199224430441105,
"grad_norm": 1.4415209293365479,
"learning_rate": 4.3003733119850256e-05,
"loss": 1.5477,
"step": 660
},
{
"epoch": 0.3204071740184198,
"grad_norm": 1.4911268949508667,
"learning_rate": 4.29743471995811e-05,
"loss": 1.3075,
"step": 661
},
{
"epoch": 0.320891904992729,
"grad_norm": 1.4862326383590698,
"learning_rate": 4.2944909780545754e-05,
"loss": 1.5706,
"step": 662
},
{
"epoch": 0.3213766359670383,
"grad_norm": 1.5849559307098389,
"learning_rate": 4.291542094708612e-05,
"loss": 1.4566,
"step": 663
},
{
"epoch": 0.32186136694134754,
"grad_norm": 1.786057949066162,
"learning_rate": 4.288588078369141e-05,
"loss": 1.7439,
"step": 664
},
{
"epoch": 0.3223460979156568,
"grad_norm": 1.5245951414108276,
"learning_rate": 4.2856289374997927e-05,
"loss": 1.524,
"step": 665
},
{
"epoch": 0.32283082888996606,
"grad_norm": 1.8819663524627686,
"learning_rate": 4.282664680578876e-05,
"loss": 1.4255,
"step": 666
},
{
"epoch": 0.32331555986427535,
"grad_norm": 1.5141843557357788,
"learning_rate": 4.2796953160993616e-05,
"loss": 1.7939,
"step": 667
},
{
"epoch": 0.3238002908385846,
"grad_norm": 1.4404217004776,
"learning_rate": 4.276720852568851e-05,
"loss": 1.7917,
"step": 668
},
{
"epoch": 0.32428502181289387,
"grad_norm": 1.4955337047576904,
"learning_rate": 4.273741298509557e-05,
"loss": 1.7536,
"step": 669
},
{
"epoch": 0.3247697527872031,
"grad_norm": 1.5283381938934326,
"learning_rate": 4.2707566624582774e-05,
"loss": 1.3797,
"step": 670
},
{
"epoch": 0.32525448376151234,
"grad_norm": 1.550523281097412,
"learning_rate": 4.267766952966369e-05,
"loss": 1.6906,
"step": 671
},
{
"epoch": 0.3257392147358216,
"grad_norm": 1.4687964916229248,
"learning_rate": 4.264772178599726e-05,
"loss": 1.4744,
"step": 672
},
{
"epoch": 0.32622394571013086,
"grad_norm": 1.485487937927246,
"learning_rate": 4.261772347938754e-05,
"loss": 1.675,
"step": 673
},
{
"epoch": 0.32670867668444015,
"grad_norm": 1.8627375364303589,
"learning_rate": 4.258767469578345e-05,
"loss": 1.5248,
"step": 674
},
{
"epoch": 0.3271934076587494,
"grad_norm": 1.4943833351135254,
"learning_rate": 4.255757552127855e-05,
"loss": 1.3291,
"step": 675
},
{
"epoch": 0.32767813863305867,
"grad_norm": 1.4833825826644897,
"learning_rate": 4.252742604211073e-05,
"loss": 1.3374,
"step": 676
},
{
"epoch": 0.3281628696073679,
"grad_norm": 1.7539558410644531,
"learning_rate": 4.2497226344662065e-05,
"loss": 1.9051,
"step": 677
},
{
"epoch": 0.3286476005816772,
"grad_norm": 1.5023777484893799,
"learning_rate": 4.2466976515458484e-05,
"loss": 1.416,
"step": 678
},
{
"epoch": 0.3291323315559864,
"grad_norm": 1.383313775062561,
"learning_rate": 4.243667664116956e-05,
"loss": 1.6164,
"step": 679
},
{
"epoch": 0.3296170625302957,
"grad_norm": 1.8263195753097534,
"learning_rate": 4.2406326808608225e-05,
"loss": 1.6341,
"step": 680
},
{
"epoch": 0.33010179350460495,
"grad_norm": 2.7321722507476807,
"learning_rate": 4.237592710473059e-05,
"loss": 1.2566,
"step": 681
},
{
"epoch": 0.3305865244789142,
"grad_norm": 1.4957518577575684,
"learning_rate": 4.234547761663562e-05,
"loss": 1.5391,
"step": 682
},
{
"epoch": 0.33107125545322347,
"grad_norm": 1.3792263269424438,
"learning_rate": 4.2314978431564923e-05,
"loss": 1.2249,
"step": 683
},
{
"epoch": 0.3315559864275327,
"grad_norm": 1.4482512474060059,
"learning_rate": 4.228442963690252e-05,
"loss": 1.3106,
"step": 684
},
{
"epoch": 0.332040717401842,
"grad_norm": 1.5233250856399536,
"learning_rate": 4.2253831320174534e-05,
"loss": 1.4784,
"step": 685
},
{
"epoch": 0.3325254483761512,
"grad_norm": 1.4404493570327759,
"learning_rate": 4.2223183569049005e-05,
"loss": 1.3078,
"step": 686
},
{
"epoch": 0.3330101793504605,
"grad_norm": 1.585357904434204,
"learning_rate": 4.2192486471335585e-05,
"loss": 1.6884,
"step": 687
},
{
"epoch": 0.33349491032476974,
"grad_norm": 1.4535739421844482,
"learning_rate": 4.216174011498533e-05,
"loss": 1.8015,
"step": 688
},
{
"epoch": 0.33397964129907903,
"grad_norm": 1.4156732559204102,
"learning_rate": 4.2130944588090415e-05,
"loss": 1.5549,
"step": 689
},
{
"epoch": 0.33446437227338827,
"grad_norm": 1.4804260730743408,
"learning_rate": 4.2100099978883896e-05,
"loss": 1.4739,
"step": 690
},
{
"epoch": 0.33494910324769755,
"grad_norm": 1.4360147714614868,
"learning_rate": 4.206920637573946e-05,
"loss": 1.3966,
"step": 691
},
{
"epoch": 0.3354338342220068,
"grad_norm": 1.5461288690567017,
"learning_rate": 4.203826386717118e-05,
"loss": 1.4189,
"step": 692
},
{
"epoch": 0.335918565196316,
"grad_norm": 1.321059226989746,
"learning_rate": 4.200727254183322e-05,
"loss": 1.1196,
"step": 693
},
{
"epoch": 0.3364032961706253,
"grad_norm": 1.325295329093933,
"learning_rate": 4.1976232488519626e-05,
"loss": 1.2674,
"step": 694
},
{
"epoch": 0.33688802714493454,
"grad_norm": 1.6588038206100464,
"learning_rate": 4.1945143796164076e-05,
"loss": 1.7719,
"step": 695
},
{
"epoch": 0.33737275811924383,
"grad_norm": 1.321679711341858,
"learning_rate": 4.191400655383956e-05,
"loss": 1.238,
"step": 696
},
{
"epoch": 0.33785748909355307,
"grad_norm": 1.4317057132720947,
"learning_rate": 4.188282085075821e-05,
"loss": 1.5644,
"step": 697
},
{
"epoch": 0.33834222006786235,
"grad_norm": 1.4975212812423706,
"learning_rate": 4.185158677627099e-05,
"loss": 1.5107,
"step": 698
},
{
"epoch": 0.3388269510421716,
"grad_norm": 1.4152145385742188,
"learning_rate": 4.182030441986744e-05,
"loss": 1.5376,
"step": 699
},
{
"epoch": 0.3393116820164809,
"grad_norm": 1.418017864227295,
"learning_rate": 4.178897387117546e-05,
"loss": 1.3491,
"step": 700
},
{
"epoch": 0.3397964129907901,
"grad_norm": 1.5809285640716553,
"learning_rate": 4.175759521996101e-05,
"loss": 1.487,
"step": 701
},
{
"epoch": 0.3402811439650994,
"grad_norm": 1.5660691261291504,
"learning_rate": 4.172616855612787e-05,
"loss": 1.6579,
"step": 702
},
{
"epoch": 0.34076587493940863,
"grad_norm": 1.5755140781402588,
"learning_rate": 4.169469396971739e-05,
"loss": 1.2482,
"step": 703
},
{
"epoch": 0.34125060591371786,
"grad_norm": 1.782956838607788,
"learning_rate": 4.166317155090822e-05,
"loss": 1.5582,
"step": 704
},
{
"epoch": 0.34173533688802715,
"grad_norm": 1.4416836500167847,
"learning_rate": 4.1631601390016055e-05,
"loss": 1.653,
"step": 705
},
{
"epoch": 0.3422200678623364,
"grad_norm": 1.537095069885254,
"learning_rate": 4.159998357749338e-05,
"loss": 1.4798,
"step": 706
},
{
"epoch": 0.3427047988366457,
"grad_norm": 1.4685359001159668,
"learning_rate": 4.1568318203929195e-05,
"loss": 1.3947,
"step": 707
},
{
"epoch": 0.3431895298109549,
"grad_norm": 1.462533712387085,
"learning_rate": 4.1536605360048795e-05,
"loss": 1.4598,
"step": 708
},
{
"epoch": 0.3436742607852642,
"grad_norm": 1.5037108659744263,
"learning_rate": 4.150484513671346e-05,
"loss": 1.5225,
"step": 709
},
{
"epoch": 0.34415899175957343,
"grad_norm": 1.4552758932113647,
"learning_rate": 4.147303762492022e-05,
"loss": 1.433,
"step": 710
},
{
"epoch": 0.3446437227338827,
"grad_norm": 1.6252245903015137,
"learning_rate": 4.144118291580161e-05,
"loss": 1.6058,
"step": 711
},
{
"epoch": 0.34512845370819195,
"grad_norm": 1.6239243745803833,
"learning_rate": 4.140928110062538e-05,
"loss": 1.3355,
"step": 712
},
{
"epoch": 0.3456131846825012,
"grad_norm": 1.3930360078811646,
"learning_rate": 4.137733227079423e-05,
"loss": 1.4171,
"step": 713
},
{
"epoch": 0.3460979156568105,
"grad_norm": 1.9757750034332275,
"learning_rate": 4.134533651784559e-05,
"loss": 1.3524,
"step": 714
},
{
"epoch": 0.3465826466311197,
"grad_norm": 1.4614371061325073,
"learning_rate": 4.131329393345131e-05,
"loss": 1.7791,
"step": 715
},
{
"epoch": 0.347067377605429,
"grad_norm": 1.3664659261703491,
"learning_rate": 4.1281204609417435e-05,
"loss": 1.2993,
"step": 716
},
{
"epoch": 0.34755210857973823,
"grad_norm": 1.4222404956817627,
"learning_rate": 4.1249068637683906e-05,
"loss": 1.3012,
"step": 717
},
{
"epoch": 0.3480368395540475,
"grad_norm": 1.4118279218673706,
"learning_rate": 4.1216886110324324e-05,
"loss": 1.3078,
"step": 718
},
{
"epoch": 0.34852157052835675,
"grad_norm": 1.4675920009613037,
"learning_rate": 4.118465711954569e-05,
"loss": 1.4492,
"step": 719
},
{
"epoch": 0.34900630150266604,
"grad_norm": 1.490893006324768,
"learning_rate": 4.115238175768812e-05,
"loss": 1.4637,
"step": 720
},
{
"epoch": 0.34949103247697527,
"grad_norm": 1.3924874067306519,
"learning_rate": 4.1120060117224566e-05,
"loss": 1.3949,
"step": 721
},
{
"epoch": 0.34997576345128456,
"grad_norm": 1.4966955184936523,
"learning_rate": 4.108769229076061e-05,
"loss": 1.5634,
"step": 722
},
{
"epoch": 0.3504604944255938,
"grad_norm": 1.4051393270492554,
"learning_rate": 4.105527837103414e-05,
"loss": 1.2924,
"step": 723
},
{
"epoch": 0.350945225399903,
"grad_norm": 1.4678040742874146,
"learning_rate": 4.102281845091512e-05,
"loss": 1.6048,
"step": 724
},
{
"epoch": 0.3514299563742123,
"grad_norm": 1.4021576642990112,
"learning_rate": 4.0990312623405305e-05,
"loss": 1.3513,
"step": 725
},
{
"epoch": 0.35191468734852155,
"grad_norm": 1.5062463283538818,
"learning_rate": 4.095776098163798e-05,
"loss": 1.489,
"step": 726
},
{
"epoch": 0.35239941832283084,
"grad_norm": 1.4043464660644531,
"learning_rate": 4.0925163618877695e-05,
"loss": 1.497,
"step": 727
},
{
"epoch": 0.35288414929714007,
"grad_norm": 1.5384563207626343,
"learning_rate": 4.0892520628519985e-05,
"loss": 1.8048,
"step": 728
},
{
"epoch": 0.35336888027144936,
"grad_norm": 1.4933316707611084,
"learning_rate": 4.085983210409114e-05,
"loss": 1.2942,
"step": 729
},
{
"epoch": 0.3538536112457586,
"grad_norm": 1.570822834968567,
"learning_rate": 4.082709813924789e-05,
"loss": 1.6173,
"step": 730
},
{
"epoch": 0.3543383422200679,
"grad_norm": 1.5000869035720825,
"learning_rate": 4.079431882777715e-05,
"loss": 1.3644,
"step": 731
},
{
"epoch": 0.3548230731943771,
"grad_norm": 1.4136881828308105,
"learning_rate": 4.0761494263595796e-05,
"loss": 1.7214,
"step": 732
},
{
"epoch": 0.3553078041686864,
"grad_norm": 1.5652875900268555,
"learning_rate": 4.072862454075031e-05,
"loss": 1.6255,
"step": 733
},
{
"epoch": 0.35579253514299564,
"grad_norm": 1.6801156997680664,
"learning_rate": 4.06957097534166e-05,
"loss": 1.7948,
"step": 734
},
{
"epoch": 0.35627726611730487,
"grad_norm": 1.3107026815414429,
"learning_rate": 4.0662749995899666e-05,
"loss": 1.3926,
"step": 735
},
{
"epoch": 0.35676199709161416,
"grad_norm": 1.503405213356018,
"learning_rate": 4.062974536263336e-05,
"loss": 1.5015,
"step": 736
},
{
"epoch": 0.3572467280659234,
"grad_norm": 1.5230603218078613,
"learning_rate": 4.0596695948180116e-05,
"loss": 1.3572,
"step": 737
},
{
"epoch": 0.3577314590402327,
"grad_norm": 1.4852657318115234,
"learning_rate": 4.056360184723065e-05,
"loss": 1.5162,
"step": 738
},
{
"epoch": 0.3582161900145419,
"grad_norm": 1.5229320526123047,
"learning_rate": 4.0530463154603747e-05,
"loss": 1.5939,
"step": 739
},
{
"epoch": 0.3587009209888512,
"grad_norm": 1.5715887546539307,
"learning_rate": 4.049727996524591e-05,
"loss": 1.4526,
"step": 740
},
{
"epoch": 0.35918565196316043,
"grad_norm": 1.3569464683532715,
"learning_rate": 4.046405237423116e-05,
"loss": 1.3617,
"step": 741
},
{
"epoch": 0.3596703829374697,
"grad_norm": 1.477789282798767,
"learning_rate": 4.043078047676072e-05,
"loss": 1.4437,
"step": 742
},
{
"epoch": 0.36015511391177896,
"grad_norm": 1.3856958150863647,
"learning_rate": 4.039746436816277e-05,
"loss": 1.3219,
"step": 743
},
{
"epoch": 0.36063984488608825,
"grad_norm": 1.4407496452331543,
"learning_rate": 4.036410414389215e-05,
"loss": 1.2868,
"step": 744
},
{
"epoch": 0.3611245758603975,
"grad_norm": 1.449094533920288,
"learning_rate": 4.03306998995301e-05,
"loss": 1.3048,
"step": 745
},
{
"epoch": 0.3616093068347067,
"grad_norm": 1.4655970335006714,
"learning_rate": 4.029725173078398e-05,
"loss": 1.4586,
"step": 746
},
{
"epoch": 0.362094037809016,
"grad_norm": 1.4294497966766357,
"learning_rate": 4.0263759733487015e-05,
"loss": 1.417,
"step": 747
},
{
"epoch": 0.36257876878332523,
"grad_norm": 1.5399290323257446,
"learning_rate": 4.023022400359797e-05,
"loss": 1.6722,
"step": 748
},
{
"epoch": 0.3630634997576345,
"grad_norm": 1.4106359481811523,
"learning_rate": 4.019664463720094e-05,
"loss": 1.5666,
"step": 749
},
{
"epoch": 0.36354823073194376,
"grad_norm": 1.5670846700668335,
"learning_rate": 4.0163021730505045e-05,
"loss": 1.6455,
"step": 750
},
{
"epoch": 0.36403296170625304,
"grad_norm": 1.5435312986373901,
"learning_rate": 4.012935537984414e-05,
"loss": 1.7314,
"step": 751
},
{
"epoch": 0.3645176926805623,
"grad_norm": 1.43980872631073,
"learning_rate": 4.009564568167653e-05,
"loss": 1.1749,
"step": 752
},
{
"epoch": 0.36500242365487157,
"grad_norm": 1.5904608964920044,
"learning_rate": 4.006189273258477e-05,
"loss": 1.4091,
"step": 753
},
{
"epoch": 0.3654871546291808,
"grad_norm": 1.569663643836975,
"learning_rate": 4.00280966292753e-05,
"loss": 1.447,
"step": 754
},
{
"epoch": 0.3659718856034901,
"grad_norm": 1.4279499053955078,
"learning_rate": 3.99942574685782e-05,
"loss": 1.4849,
"step": 755
},
{
"epoch": 0.3664566165777993,
"grad_norm": 1.4076343774795532,
"learning_rate": 3.9960375347446934e-05,
"loss": 1.3214,
"step": 756
},
{
"epoch": 0.36694134755210855,
"grad_norm": 1.4956542253494263,
"learning_rate": 3.9926450362958024e-05,
"loss": 1.4805,
"step": 757
},
{
"epoch": 0.36742607852641784,
"grad_norm": 1.563628911972046,
"learning_rate": 3.9892482612310836e-05,
"loss": 1.8152,
"step": 758
},
{
"epoch": 0.3679108095007271,
"grad_norm": 1.4647592306137085,
"learning_rate": 3.985847219282725e-05,
"loss": 1.4635,
"step": 759
},
{
"epoch": 0.36839554047503636,
"grad_norm": 1.4081557989120483,
"learning_rate": 3.982441920195138e-05,
"loss": 1.5819,
"step": 760
},
{
"epoch": 0.3688802714493456,
"grad_norm": 1.5153309106826782,
"learning_rate": 3.9790323737249346e-05,
"loss": 1.5674,
"step": 761
},
{
"epoch": 0.3693650024236549,
"grad_norm": 1.4655232429504395,
"learning_rate": 3.975618589640894e-05,
"loss": 1.4397,
"step": 762
},
{
"epoch": 0.3698497333979641,
"grad_norm": 1.60316801071167,
"learning_rate": 3.9722005777239354e-05,
"loss": 1.5732,
"step": 763
},
{
"epoch": 0.3703344643722734,
"grad_norm": 1.594388723373413,
"learning_rate": 3.9687783477670966e-05,
"loss": 1.6924,
"step": 764
},
{
"epoch": 0.37081919534658264,
"grad_norm": 1.587856411933899,
"learning_rate": 3.9653519095754934e-05,
"loss": 1.7377,
"step": 765
},
{
"epoch": 0.37130392632089193,
"grad_norm": 1.4031965732574463,
"learning_rate": 3.961921272966305e-05,
"loss": 1.5464,
"step": 766
},
{
"epoch": 0.37178865729520116,
"grad_norm": 1.3980727195739746,
"learning_rate": 3.958486447768736e-05,
"loss": 1.3223,
"step": 767
},
{
"epoch": 0.3722733882695104,
"grad_norm": 1.411486268043518,
"learning_rate": 3.95504744382399e-05,
"loss": 1.5066,
"step": 768
},
{
"epoch": 0.3727581192438197,
"grad_norm": 1.5289703607559204,
"learning_rate": 3.9516042709852506e-05,
"loss": 1.4962,
"step": 769
},
{
"epoch": 0.3732428502181289,
"grad_norm": 1.3834885358810425,
"learning_rate": 3.948156939117639e-05,
"loss": 1.3076,
"step": 770
},
{
"epoch": 0.3737275811924382,
"grad_norm": 1.5126419067382812,
"learning_rate": 3.944705458098194e-05,
"loss": 1.203,
"step": 771
},
{
"epoch": 0.37421231216674744,
"grad_norm": 1.4430164098739624,
"learning_rate": 3.9412498378158446e-05,
"loss": 1.2171,
"step": 772
},
{
"epoch": 0.37469704314105673,
"grad_norm": 1.4813838005065918,
"learning_rate": 3.9377900881713764e-05,
"loss": 1.4277,
"step": 773
},
{
"epoch": 0.37518177411536596,
"grad_norm": 1.431311011314392,
"learning_rate": 3.9343262190774076e-05,
"loss": 1.5013,
"step": 774
},
{
"epoch": 0.37566650508967525,
"grad_norm": 1.891334891319275,
"learning_rate": 3.93085824045836e-05,
"loss": 2.4066,
"step": 775
},
{
"epoch": 0.3761512360639845,
"grad_norm": 1.4745622873306274,
"learning_rate": 3.927386162250427e-05,
"loss": 1.3788,
"step": 776
},
{
"epoch": 0.3766359670382938,
"grad_norm": 1.437791109085083,
"learning_rate": 3.923909994401551e-05,
"loss": 1.565,
"step": 777
},
{
"epoch": 0.377120698012603,
"grad_norm": 1.3869900703430176,
"learning_rate": 3.92042974687139e-05,
"loss": 1.5087,
"step": 778
},
{
"epoch": 0.37760542898691224,
"grad_norm": 1.3274116516113281,
"learning_rate": 3.916945429631289e-05,
"loss": 1.5359,
"step": 779
},
{
"epoch": 0.37809015996122153,
"grad_norm": 1.4933420419692993,
"learning_rate": 3.9134570526642594e-05,
"loss": 1.5366,
"step": 780
},
{
"epoch": 0.37857489093553076,
"grad_norm": 1.4309301376342773,
"learning_rate": 3.9099646259649364e-05,
"loss": 1.3898,
"step": 781
},
{
"epoch": 0.37905962190984005,
"grad_norm": 1.476951241493225,
"learning_rate": 3.9064681595395634e-05,
"loss": 1.4312,
"step": 782
},
{
"epoch": 0.3795443528841493,
"grad_norm": 1.872909426689148,
"learning_rate": 3.902967663405956e-05,
"loss": 1.4043,
"step": 783
},
{
"epoch": 0.38002908385845857,
"grad_norm": 1.5244536399841309,
"learning_rate": 3.8994631475934775e-05,
"loss": 1.382,
"step": 784
},
{
"epoch": 0.3805138148327678,
"grad_norm": 1.5926803350448608,
"learning_rate": 3.895954622143004e-05,
"loss": 1.3973,
"step": 785
},
{
"epoch": 0.3809985458070771,
"grad_norm": 1.583483338356018,
"learning_rate": 3.8924420971069055e-05,
"loss": 1.6504,
"step": 786
},
{
"epoch": 0.3814832767813863,
"grad_norm": 1.3475099802017212,
"learning_rate": 3.888925582549006e-05,
"loss": 1.4933,
"step": 787
},
{
"epoch": 0.3819680077556956,
"grad_norm": 1.3581137657165527,
"learning_rate": 3.885405088544563e-05,
"loss": 1.2155,
"step": 788
},
{
"epoch": 0.38245273873000485,
"grad_norm": 1.462459921836853,
"learning_rate": 3.8818806251802334e-05,
"loss": 1.446,
"step": 789
},
{
"epoch": 0.3829374697043141,
"grad_norm": 1.4395034313201904,
"learning_rate": 3.878352202554051e-05,
"loss": 1.3661,
"step": 790
},
{
"epoch": 0.38342220067862337,
"grad_norm": 1.3227758407592773,
"learning_rate": 3.8748198307753874e-05,
"loss": 1.2516,
"step": 791
},
{
"epoch": 0.3839069316529326,
"grad_norm": 1.5062460899353027,
"learning_rate": 3.871283519964935e-05,
"loss": 1.7564,
"step": 792
},
{
"epoch": 0.3843916626272419,
"grad_norm": 1.4209660291671753,
"learning_rate": 3.867743280254666e-05,
"loss": 1.5597,
"step": 793
},
{
"epoch": 0.3848763936015511,
"grad_norm": 1.5597984790802002,
"learning_rate": 3.8641991217878154e-05,
"loss": 1.4486,
"step": 794
},
{
"epoch": 0.3853611245758604,
"grad_norm": 1.390090823173523,
"learning_rate": 3.8606510547188425e-05,
"loss": 1.361,
"step": 795
},
{
"epoch": 0.38584585555016965,
"grad_norm": 1.511312484741211,
"learning_rate": 3.857099089213405e-05,
"loss": 1.4659,
"step": 796
},
{
"epoch": 0.38633058652447894,
"grad_norm": 1.4220378398895264,
"learning_rate": 3.8535432354483313e-05,
"loss": 1.8297,
"step": 797
},
{
"epoch": 0.38681531749878817,
"grad_norm": 1.4622609615325928,
"learning_rate": 3.849983503611591e-05,
"loss": 1.6799,
"step": 798
},
{
"epoch": 0.38730004847309746,
"grad_norm": 1.4184774160385132,
"learning_rate": 3.8464199039022605e-05,
"loss": 1.3475,
"step": 799
},
{
"epoch": 0.3877847794474067,
"grad_norm": 1.6148196458816528,
"learning_rate": 3.842852446530505e-05,
"loss": 1.5014,
"step": 800
},
{
"epoch": 0.3882695104217159,
"grad_norm": 1.4410502910614014,
"learning_rate": 3.839281141717538e-05,
"loss": 1.6122,
"step": 801
},
{
"epoch": 0.3887542413960252,
"grad_norm": 1.4458303451538086,
"learning_rate": 3.835705999695595e-05,
"loss": 1.3593,
"step": 802
},
{
"epoch": 0.38923897237033445,
"grad_norm": 1.4514504671096802,
"learning_rate": 3.832127030707909e-05,
"loss": 1.2312,
"step": 803
},
{
"epoch": 0.38972370334464373,
"grad_norm": 1.5186152458190918,
"learning_rate": 3.828544245008677e-05,
"loss": 1.5824,
"step": 804
},
{
"epoch": 0.39020843431895297,
"grad_norm": 1.2924879789352417,
"learning_rate": 3.82495765286303e-05,
"loss": 1.3355,
"step": 805
},
{
"epoch": 0.39069316529326226,
"grad_norm": 1.4316754341125488,
"learning_rate": 3.821367264547006e-05,
"loss": 1.4083,
"step": 806
},
{
"epoch": 0.3911778962675715,
"grad_norm": 1.5610601902008057,
"learning_rate": 3.817773090347519e-05,
"loss": 1.662,
"step": 807
},
{
"epoch": 0.3916626272418808,
"grad_norm": 1.4195690155029297,
"learning_rate": 3.8141751405623317e-05,
"loss": 1.4629,
"step": 808
},
{
"epoch": 0.39214735821619,
"grad_norm": 1.4146546125411987,
"learning_rate": 3.8105734255000214e-05,
"loss": 1.452,
"step": 809
},
{
"epoch": 0.3926320891904993,
"grad_norm": 1.45216965675354,
"learning_rate": 3.806967955479955e-05,
"loss": 1.2171,
"step": 810
},
{
"epoch": 0.39311682016480853,
"grad_norm": 1.468102216720581,
"learning_rate": 3.803358740832257e-05,
"loss": 1.5747,
"step": 811
},
{
"epoch": 0.39360155113911777,
"grad_norm": 1.314497947692871,
"learning_rate": 3.7997457918977845e-05,
"loss": 1.351,
"step": 812
},
{
"epoch": 0.39408628211342706,
"grad_norm": 1.4871678352355957,
"learning_rate": 3.796129119028087e-05,
"loss": 1.2889,
"step": 813
},
{
"epoch": 0.3945710130877363,
"grad_norm": 1.3904635906219482,
"learning_rate": 3.79250873258539e-05,
"loss": 1.3541,
"step": 814
},
{
"epoch": 0.3950557440620456,
"grad_norm": 1.4073387384414673,
"learning_rate": 3.7888846429425546e-05,
"loss": 1.1039,
"step": 815
},
{
"epoch": 0.3955404750363548,
"grad_norm": 1.6151351928710938,
"learning_rate": 3.785256860483054e-05,
"loss": 1.6067,
"step": 816
},
{
"epoch": 0.3960252060106641,
"grad_norm": 1.456746220588684,
"learning_rate": 3.781625395600943e-05,
"loss": 1.4839,
"step": 817
},
{
"epoch": 0.39650993698497333,
"grad_norm": 1.4020472764968872,
"learning_rate": 3.7779902587008225e-05,
"loss": 1.2439,
"step": 818
},
{
"epoch": 0.3969946679592826,
"grad_norm": 1.5153536796569824,
"learning_rate": 3.774351460197819e-05,
"loss": 1.5316,
"step": 819
},
{
"epoch": 0.39747939893359185,
"grad_norm": 1.8110442161560059,
"learning_rate": 3.770709010517549e-05,
"loss": 1.4559,
"step": 820
},
{
"epoch": 0.3979641299079011,
"grad_norm": 1.497971773147583,
"learning_rate": 3.767062920096086e-05,
"loss": 1.587,
"step": 821
},
{
"epoch": 0.3984488608822104,
"grad_norm": 1.3729140758514404,
"learning_rate": 3.763413199379941e-05,
"loss": 1.27,
"step": 822
},
{
"epoch": 0.3989335918565196,
"grad_norm": 1.5104501247406006,
"learning_rate": 3.7597598588260196e-05,
"loss": 1.315,
"step": 823
},
{
"epoch": 0.3994183228308289,
"grad_norm": 1.459794521331787,
"learning_rate": 3.7561029089016055e-05,
"loss": 1.4664,
"step": 824
},
{
"epoch": 0.39990305380513813,
"grad_norm": 1.403060793876648,
"learning_rate": 3.7524423600843186e-05,
"loss": 1.6218,
"step": 825
},
{
"epoch": 0.4003877847794474,
"grad_norm": 1.4324053525924683,
"learning_rate": 3.7487782228620916e-05,
"loss": 1.274,
"step": 826
},
{
"epoch": 0.40087251575375665,
"grad_norm": 1.4029552936553955,
"learning_rate": 3.7451105077331396e-05,
"loss": 1.464,
"step": 827
},
{
"epoch": 0.40135724672806594,
"grad_norm": 1.549617052078247,
"learning_rate": 3.741439225205927e-05,
"loss": 1.4068,
"step": 828
},
{
"epoch": 0.4018419777023752,
"grad_norm": 1.4655847549438477,
"learning_rate": 3.7377643857991416e-05,
"loss": 1.5115,
"step": 829
},
{
"epoch": 0.40232670867668446,
"grad_norm": 1.5959397554397583,
"learning_rate": 3.7340860000416595e-05,
"loss": 1.5153,
"step": 830
},
{
"epoch": 0.4028114396509937,
"grad_norm": 1.6088857650756836,
"learning_rate": 3.730404078472518e-05,
"loss": 1.7593,
"step": 831
},
{
"epoch": 0.40329617062530293,
"grad_norm": 1.4206966161727905,
"learning_rate": 3.726718631640888e-05,
"loss": 1.4746,
"step": 832
},
{
"epoch": 0.4037809015996122,
"grad_norm": 1.4388668537139893,
"learning_rate": 3.723029670106036e-05,
"loss": 1.1222,
"step": 833
},
{
"epoch": 0.40426563257392145,
"grad_norm": 1.5215039253234863,
"learning_rate": 3.719337204437302e-05,
"loss": 1.3856,
"step": 834
},
{
"epoch": 0.40475036354823074,
"grad_norm": 1.470598578453064,
"learning_rate": 3.7156412452140646e-05,
"loss": 1.2346,
"step": 835
},
{
"epoch": 0.40523509452254,
"grad_norm": 1.4196856021881104,
"learning_rate": 3.711941803025712e-05,
"loss": 1.2546,
"step": 836
},
{
"epoch": 0.40571982549684926,
"grad_norm": 1.5476642847061157,
"learning_rate": 3.708238888471611e-05,
"loss": 1.7273,
"step": 837
},
{
"epoch": 0.4062045564711585,
"grad_norm": 1.4293063879013062,
"learning_rate": 3.704532512161079e-05,
"loss": 1.5743,
"step": 838
},
{
"epoch": 0.4066892874454678,
"grad_norm": 1.4300776720046997,
"learning_rate": 3.700822684713349e-05,
"loss": 1.3128,
"step": 839
},
{
"epoch": 0.407174018419777,
"grad_norm": 1.404038667678833,
"learning_rate": 3.697109416757544e-05,
"loss": 1.1517,
"step": 840
},
{
"epoch": 0.4076587493940863,
"grad_norm": 1.432827115058899,
"learning_rate": 3.6933927189326435e-05,
"loss": 1.3499,
"step": 841
},
{
"epoch": 0.40814348036839554,
"grad_norm": 1.4601900577545166,
"learning_rate": 3.689672601887455e-05,
"loss": 1.4287,
"step": 842
},
{
"epoch": 0.4086282113427048,
"grad_norm": 1.4678421020507812,
"learning_rate": 3.685949076280583e-05,
"loss": 1.5663,
"step": 843
},
{
"epoch": 0.40911294231701406,
"grad_norm": 1.538097620010376,
"learning_rate": 3.6822221527803934e-05,
"loss": 1.6855,
"step": 844
},
{
"epoch": 0.4095976732913233,
"grad_norm": 1.4171286821365356,
"learning_rate": 3.678491842064995e-05,
"loss": 1.6271,
"step": 845
},
{
"epoch": 0.4100824042656326,
"grad_norm": 1.5117058753967285,
"learning_rate": 3.674758154822194e-05,
"loss": 1.356,
"step": 846
},
{
"epoch": 0.4105671352399418,
"grad_norm": 1.442489743232727,
"learning_rate": 3.671021101749476e-05,
"loss": 1.3713,
"step": 847
},
{
"epoch": 0.4110518662142511,
"grad_norm": 1.4731215238571167,
"learning_rate": 3.667280693553967e-05,
"loss": 1.4432,
"step": 848
},
{
"epoch": 0.41153659718856034,
"grad_norm": 1.464311122894287,
"learning_rate": 3.663536940952409e-05,
"loss": 1.5066,
"step": 849
},
{
"epoch": 0.4120213281628696,
"grad_norm": 1.3426711559295654,
"learning_rate": 3.659789854671122e-05,
"loss": 1.1671,
"step": 850
},
{
"epoch": 0.41250605913717886,
"grad_norm": 1.665104866027832,
"learning_rate": 3.6560394454459814e-05,
"loss": 1.9996,
"step": 851
},
{
"epoch": 0.41299079011148815,
"grad_norm": 1.2623895406723022,
"learning_rate": 3.652285724022379e-05,
"loss": 1.1562,
"step": 852
},
{
"epoch": 0.4134755210857974,
"grad_norm": 1.361686110496521,
"learning_rate": 3.648528701155203e-05,
"loss": 1.278,
"step": 853
},
{
"epoch": 0.4139602520601066,
"grad_norm": 1.4689781665802002,
"learning_rate": 3.644768387608793e-05,
"loss": 1.417,
"step": 854
},
{
"epoch": 0.4144449830344159,
"grad_norm": 1.5472491979599,
"learning_rate": 3.6410047941569224e-05,
"loss": 1.4856,
"step": 855
},
{
"epoch": 0.41492971400872514,
"grad_norm": 1.4804879426956177,
"learning_rate": 3.637237931582759e-05,
"loss": 1.7454,
"step": 856
},
{
"epoch": 0.4154144449830344,
"grad_norm": 1.4140907526016235,
"learning_rate": 3.633467810678839e-05,
"loss": 1.4682,
"step": 857
},
{
"epoch": 0.41589917595734366,
"grad_norm": 1.4889259338378906,
"learning_rate": 3.629694442247032e-05,
"loss": 1.594,
"step": 858
},
{
"epoch": 0.41638390693165295,
"grad_norm": 1.4880090951919556,
"learning_rate": 3.6259178370985144e-05,
"loss": 1.4029,
"step": 859
},
{
"epoch": 0.4168686379059622,
"grad_norm": 1.4780687093734741,
"learning_rate": 3.6221380060537333e-05,
"loss": 1.4561,
"step": 860
},
{
"epoch": 0.41735336888027147,
"grad_norm": 1.4494913816452026,
"learning_rate": 3.6183549599423815e-05,
"loss": 1.4076,
"step": 861
},
{
"epoch": 0.4178380998545807,
"grad_norm": 1.416123390197754,
"learning_rate": 3.6145687096033634e-05,
"loss": 1.3328,
"step": 862
},
{
"epoch": 0.41832283082889,
"grad_norm": 1.4615715742111206,
"learning_rate": 3.6107792658847595e-05,
"loss": 1.3547,
"step": 863
},
{
"epoch": 0.4188075618031992,
"grad_norm": 1.3456425666809082,
"learning_rate": 3.606986639643805e-05,
"loss": 1.6143,
"step": 864
},
{
"epoch": 0.41929229277750846,
"grad_norm": 1.4085299968719482,
"learning_rate": 3.603190841746851e-05,
"loss": 1.4381,
"step": 865
},
{
"epoch": 0.41977702375181775,
"grad_norm": 1.3223602771759033,
"learning_rate": 3.599391883069335e-05,
"loss": 1.1998,
"step": 866
},
{
"epoch": 0.420261754726127,
"grad_norm": 1.3724223375320435,
"learning_rate": 3.595589774495753e-05,
"loss": 1.3599,
"step": 867
},
{
"epoch": 0.42074648570043627,
"grad_norm": 1.5481332540512085,
"learning_rate": 3.591784526919624e-05,
"loss": 1.6104,
"step": 868
},
{
"epoch": 0.4212312166747455,
"grad_norm": 1.4483201503753662,
"learning_rate": 3.58797615124346e-05,
"loss": 1.3523,
"step": 869
},
{
"epoch": 0.4217159476490548,
"grad_norm": 2.079836845397949,
"learning_rate": 3.584164658378738e-05,
"loss": 1.6446,
"step": 870
},
{
"epoch": 0.422200678623364,
"grad_norm": 1.3948813676834106,
"learning_rate": 3.580350059245864e-05,
"loss": 1.3267,
"step": 871
},
{
"epoch": 0.4226854095976733,
"grad_norm": 1.3875503540039062,
"learning_rate": 3.576532364774145e-05,
"loss": 1.2546,
"step": 872
},
{
"epoch": 0.42317014057198254,
"grad_norm": 1.590307354927063,
"learning_rate": 3.572711585901755e-05,
"loss": 1.733,
"step": 873
},
{
"epoch": 0.42365487154629183,
"grad_norm": 1.4814637899398804,
"learning_rate": 3.568887733575706e-05,
"loss": 1.5166,
"step": 874
},
{
"epoch": 0.42413960252060107,
"grad_norm": 1.3917155265808105,
"learning_rate": 3.565060818751816e-05,
"loss": 1.2831,
"step": 875
},
{
"epoch": 0.4246243334949103,
"grad_norm": 1.4022222757339478,
"learning_rate": 3.561230852394679e-05,
"loss": 1.2607,
"step": 876
},
{
"epoch": 0.4251090644692196,
"grad_norm": 1.503482460975647,
"learning_rate": 3.55739784547763e-05,
"loss": 1.5367,
"step": 877
},
{
"epoch": 0.4255937954435288,
"grad_norm": 1.544471025466919,
"learning_rate": 3.553561808982715e-05,
"loss": 1.5024,
"step": 878
},
{
"epoch": 0.4260785264178381,
"grad_norm": 1.434191346168518,
"learning_rate": 3.5497227539006614e-05,
"loss": 1.327,
"step": 879
},
{
"epoch": 0.42656325739214734,
"grad_norm": 1.3471134901046753,
"learning_rate": 3.545880691230846e-05,
"loss": 1.263,
"step": 880
},
{
"epoch": 0.42704798836645663,
"grad_norm": 1.3368321657180786,
"learning_rate": 3.542035631981261e-05,
"loss": 1.3815,
"step": 881
},
{
"epoch": 0.42753271934076587,
"grad_norm": 1.9717825651168823,
"learning_rate": 3.538187587168486e-05,
"loss": 1.6113,
"step": 882
},
{
"epoch": 0.42801745031507515,
"grad_norm": 1.412209153175354,
"learning_rate": 3.534336567817651e-05,
"loss": 1.2464,
"step": 883
},
{
"epoch": 0.4285021812893844,
"grad_norm": 1.6004551649093628,
"learning_rate": 3.530482584962414e-05,
"loss": 1.7197,
"step": 884
},
{
"epoch": 0.4289869122636937,
"grad_norm": 1.4312201738357544,
"learning_rate": 3.5266256496449186e-05,
"loss": 1.4607,
"step": 885
},
{
"epoch": 0.4294716432380029,
"grad_norm": 1.418025016784668,
"learning_rate": 3.5227657729157705e-05,
"loss": 1.441,
"step": 886
},
{
"epoch": 0.42995637421231214,
"grad_norm": 1.4603976011276245,
"learning_rate": 3.5189029658340025e-05,
"loss": 1.472,
"step": 887
},
{
"epoch": 0.43044110518662143,
"grad_norm": 1.4871946573257446,
"learning_rate": 3.5150372394670426e-05,
"loss": 1.252,
"step": 888
},
{
"epoch": 0.43092583616093066,
"grad_norm": 2.187009811401367,
"learning_rate": 3.5111686048906835e-05,
"loss": 1.3653,
"step": 889
},
{
"epoch": 0.43141056713523995,
"grad_norm": 1.4901810884475708,
"learning_rate": 3.5072970731890486e-05,
"loss": 1.7895,
"step": 890
},
{
"epoch": 0.4318952981095492,
"grad_norm": 1.3609486818313599,
"learning_rate": 3.5034226554545656e-05,
"loss": 1.2407,
"step": 891
},
{
"epoch": 0.4323800290838585,
"grad_norm": 1.5160627365112305,
"learning_rate": 3.499545362787927e-05,
"loss": 1.4348,
"step": 892
},
{
"epoch": 0.4328647600581677,
"grad_norm": 1.4251633882522583,
"learning_rate": 3.495665206298065e-05,
"loss": 1.4585,
"step": 893
},
{
"epoch": 0.433349491032477,
"grad_norm": 1.4607558250427246,
"learning_rate": 3.491782197102115e-05,
"loss": 1.3888,
"step": 894
},
{
"epoch": 0.43383422200678623,
"grad_norm": 1.49302077293396,
"learning_rate": 3.487896346325389e-05,
"loss": 1.604,
"step": 895
},
{
"epoch": 0.4343189529810955,
"grad_norm": 1.5757123231887817,
"learning_rate": 3.484007665101336e-05,
"loss": 1.5465,
"step": 896
},
{
"epoch": 0.43480368395540475,
"grad_norm": 1.4108710289001465,
"learning_rate": 3.480116164571519e-05,
"loss": 1.4156,
"step": 897
},
{
"epoch": 0.435288414929714,
"grad_norm": 1.2154583930969238,
"learning_rate": 3.476221855885576e-05,
"loss": 1.4186,
"step": 898
},
{
"epoch": 0.4357731459040233,
"grad_norm": 1.456335425376892,
"learning_rate": 3.47232475020119e-05,
"loss": 1.5292,
"step": 899
},
{
"epoch": 0.4362578768783325,
"grad_norm": 1.4850857257843018,
"learning_rate": 3.468424858684061e-05,
"loss": 1.1836,
"step": 900
},
{
"epoch": 0.4367426078526418,
"grad_norm": 1.4295960664749146,
"learning_rate": 3.4645221925078674e-05,
"loss": 1.3047,
"step": 901
},
{
"epoch": 0.43722733882695103,
"grad_norm": 1.5569299459457397,
"learning_rate": 3.4606167628542395e-05,
"loss": 1.3896,
"step": 902
},
{
"epoch": 0.4377120698012603,
"grad_norm": 1.4047545194625854,
"learning_rate": 3.456708580912725e-05,
"loss": 1.5364,
"step": 903
},
{
"epoch": 0.43819680077556955,
"grad_norm": 2.0098063945770264,
"learning_rate": 3.452797657880756e-05,
"loss": 1.7044,
"step": 904
},
{
"epoch": 0.43868153174987884,
"grad_norm": 1.4840755462646484,
"learning_rate": 3.4488840049636195e-05,
"loss": 1.4837,
"step": 905
},
{
"epoch": 0.4391662627241881,
"grad_norm": 1.7989381551742554,
"learning_rate": 3.444967633374424e-05,
"loss": 1.6995,
"step": 906
},
{
"epoch": 0.43965099369849736,
"grad_norm": 1.4424799680709839,
"learning_rate": 3.441048554334066e-05,
"loss": 1.397,
"step": 907
},
{
"epoch": 0.4401357246728066,
"grad_norm": 1.501582145690918,
"learning_rate": 3.4371267790712e-05,
"loss": 1.5452,
"step": 908
},
{
"epoch": 0.4406204556471158,
"grad_norm": 1.461760401725769,
"learning_rate": 3.433202318822207e-05,
"loss": 1.581,
"step": 909
},
{
"epoch": 0.4411051866214251,
"grad_norm": 1.3767198324203491,
"learning_rate": 3.429275184831158e-05,
"loss": 1.4953,
"step": 910
},
{
"epoch": 0.44158991759573435,
"grad_norm": 1.3784865140914917,
"learning_rate": 3.425345388349786e-05,
"loss": 1.4367,
"step": 911
},
{
"epoch": 0.44207464857004364,
"grad_norm": 1.4725292921066284,
"learning_rate": 3.421412940637453e-05,
"loss": 1.3251,
"step": 912
},
{
"epoch": 0.44255937954435287,
"grad_norm": 1.5235271453857422,
"learning_rate": 3.417477852961116e-05,
"loss": 1.7435,
"step": 913
},
{
"epoch": 0.44304411051866216,
"grad_norm": 1.5075349807739258,
"learning_rate": 3.413540136595296e-05,
"loss": 1.3295,
"step": 914
},
{
"epoch": 0.4435288414929714,
"grad_norm": 1.5114574432373047,
"learning_rate": 3.409599802822047e-05,
"loss": 1.6123,
"step": 915
},
{
"epoch": 0.4440135724672807,
"grad_norm": 1.4881386756896973,
"learning_rate": 3.40565686293092e-05,
"loss": 1.5483,
"step": 916
},
{
"epoch": 0.4444983034415899,
"grad_norm": 1.7126438617706299,
"learning_rate": 3.401711328218934e-05,
"loss": 1.7761,
"step": 917
},
{
"epoch": 0.4449830344158992,
"grad_norm": 1.6697680950164795,
"learning_rate": 3.397763209990542e-05,
"loss": 1.5235,
"step": 918
},
{
"epoch": 0.44546776539020844,
"grad_norm": 1.4794918298721313,
"learning_rate": 3.3938125195576e-05,
"loss": 1.468,
"step": 919
},
{
"epoch": 0.44595249636451767,
"grad_norm": 1.6914706230163574,
"learning_rate": 3.3898592682393336e-05,
"loss": 1.5126,
"step": 920
},
{
"epoch": 0.44643722733882696,
"grad_norm": 1.4983173608779907,
"learning_rate": 3.3859034673623045e-05,
"loss": 1.1858,
"step": 921
},
{
"epoch": 0.4469219583131362,
"grad_norm": 1.5133384466171265,
"learning_rate": 3.3819451282603805e-05,
"loss": 1.3387,
"step": 922
},
{
"epoch": 0.4474066892874455,
"grad_norm": 1.416934609413147,
"learning_rate": 3.377984262274701e-05,
"loss": 1.3361,
"step": 923
},
{
"epoch": 0.4478914202617547,
"grad_norm": 1.4019713401794434,
"learning_rate": 3.3740208807536446e-05,
"loss": 1.5387,
"step": 924
},
{
"epoch": 0.448376151236064,
"grad_norm": 1.4528837203979492,
"learning_rate": 3.3700549950527994e-05,
"loss": 1.332,
"step": 925
},
{
"epoch": 0.44886088221037324,
"grad_norm": 1.435003638267517,
"learning_rate": 3.366086616534925e-05,
"loss": 1.2677,
"step": 926
},
{
"epoch": 0.4493456131846825,
"grad_norm": 1.4747051000595093,
"learning_rate": 3.362115756569926e-05,
"loss": 1.2977,
"step": 927
},
{
"epoch": 0.44983034415899176,
"grad_norm": 1.4179006814956665,
"learning_rate": 3.358142426534817e-05,
"loss": 1.3164,
"step": 928
},
{
"epoch": 0.450315075133301,
"grad_norm": 1.5167887210845947,
"learning_rate": 3.354166637813687e-05,
"loss": 1.8859,
"step": 929
},
{
"epoch": 0.4507998061076103,
"grad_norm": 1.4719829559326172,
"learning_rate": 3.350188401797672e-05,
"loss": 1.4828,
"step": 930
},
{
"epoch": 0.4512845370819195,
"grad_norm": 1.4412150382995605,
"learning_rate": 3.346207729884918e-05,
"loss": 1.4545,
"step": 931
},
{
"epoch": 0.4517692680562288,
"grad_norm": 1.426696538925171,
"learning_rate": 3.34222463348055e-05,
"loss": 1.4738,
"step": 932
},
{
"epoch": 0.45225399903053803,
"grad_norm": 1.7029757499694824,
"learning_rate": 3.338239123996642e-05,
"loss": 1.7937,
"step": 933
},
{
"epoch": 0.4527387300048473,
"grad_norm": 1.4222897291183472,
"learning_rate": 3.3342512128521794e-05,
"loss": 1.4876,
"step": 934
},
{
"epoch": 0.45322346097915656,
"grad_norm": 1.3874694108963013,
"learning_rate": 3.33026091147303e-05,
"loss": 1.2369,
"step": 935
},
{
"epoch": 0.45370819195346584,
"grad_norm": 1.348215103149414,
"learning_rate": 3.3262682312919084e-05,
"loss": 1.3298,
"step": 936
},
{
"epoch": 0.4541929229277751,
"grad_norm": 1.54569673538208,
"learning_rate": 3.322273183748346e-05,
"loss": 1.4331,
"step": 937
},
{
"epoch": 0.45467765390208437,
"grad_norm": 1.403524398803711,
"learning_rate": 3.318275780288656e-05,
"loss": 1.1923,
"step": 938
},
{
"epoch": 0.4551623848763936,
"grad_norm": 1.4105076789855957,
"learning_rate": 3.3142760323659036e-05,
"loss": 1.3577,
"step": 939
},
{
"epoch": 0.45564711585070283,
"grad_norm": 1.3613835573196411,
"learning_rate": 3.310273951439869e-05,
"loss": 1.3631,
"step": 940
},
{
"epoch": 0.4561318468250121,
"grad_norm": 1.5017019510269165,
"learning_rate": 3.3062695489770175e-05,
"loss": 1.4085,
"step": 941
},
{
"epoch": 0.45661657779932135,
"grad_norm": 1.409903883934021,
"learning_rate": 3.302262836450466e-05,
"loss": 1.3829,
"step": 942
},
{
"epoch": 0.45710130877363064,
"grad_norm": 1.4461843967437744,
"learning_rate": 3.29825382533995e-05,
"loss": 1.4971,
"step": 943
},
{
"epoch": 0.4575860397479399,
"grad_norm": 1.486220359802246,
"learning_rate": 3.2942425271317914e-05,
"loss": 1.4184,
"step": 944
},
{
"epoch": 0.45807077072224917,
"grad_norm": 1.5135023593902588,
"learning_rate": 3.2902289533188634e-05,
"loss": 1.4369,
"step": 945
},
{
"epoch": 0.4585555016965584,
"grad_norm": 1.5079814195632935,
"learning_rate": 3.28621311540056e-05,
"loss": 1.5486,
"step": 946
},
{
"epoch": 0.4590402326708677,
"grad_norm": 1.4888428449630737,
"learning_rate": 3.282195024882764e-05,
"loss": 1.2889,
"step": 947
},
{
"epoch": 0.4595249636451769,
"grad_norm": 1.3688241243362427,
"learning_rate": 3.2781746932778076e-05,
"loss": 1.2063,
"step": 948
},
{
"epoch": 0.4600096946194862,
"grad_norm": 1.4846850633621216,
"learning_rate": 3.274152132104447e-05,
"loss": 1.5821,
"step": 949
},
{
"epoch": 0.46049442559379544,
"grad_norm": 1.5739784240722656,
"learning_rate": 3.270127352887828e-05,
"loss": 1.4057,
"step": 950
},
{
"epoch": 0.4609791565681047,
"grad_norm": 1.5579380989074707,
"learning_rate": 3.266100367159448e-05,
"loss": 1.5043,
"step": 951
},
{
"epoch": 0.46146388754241396,
"grad_norm": 1.4212524890899658,
"learning_rate": 3.2620711864571274e-05,
"loss": 1.3979,
"step": 952
},
{
"epoch": 0.4619486185167232,
"grad_norm": 1.4628902673721313,
"learning_rate": 3.258039822324977e-05,
"loss": 1.377,
"step": 953
},
{
"epoch": 0.4624333494910325,
"grad_norm": 1.5454577207565308,
"learning_rate": 3.254006286313362e-05,
"loss": 1.4185,
"step": 954
},
{
"epoch": 0.4629180804653417,
"grad_norm": 1.8712515830993652,
"learning_rate": 3.24997058997887e-05,
"loss": 1.6019,
"step": 955
},
{
"epoch": 0.463402811439651,
"grad_norm": 1.3647255897521973,
"learning_rate": 3.245932744884278e-05,
"loss": 1.1018,
"step": 956
},
{
"epoch": 0.46388754241396024,
"grad_norm": 1.5049718618392944,
"learning_rate": 3.241892762598522e-05,
"loss": 1.5208,
"step": 957
},
{
"epoch": 0.46437227338826953,
"grad_norm": 1.5857036113739014,
"learning_rate": 3.237850654696659e-05,
"loss": 1.33,
"step": 958
},
{
"epoch": 0.46485700436257876,
"grad_norm": 1.4404864311218262,
"learning_rate": 3.233806432759837e-05,
"loss": 1.3034,
"step": 959
},
{
"epoch": 0.46534173533688805,
"grad_norm": 1.4808937311172485,
"learning_rate": 3.22976010837526e-05,
"loss": 1.5562,
"step": 960
},
{
"epoch": 0.4658264663111973,
"grad_norm": 1.636323094367981,
"learning_rate": 3.225711693136156e-05,
"loss": 1.2444,
"step": 961
},
{
"epoch": 0.4663111972855065,
"grad_norm": 1.4531266689300537,
"learning_rate": 3.221661198641745e-05,
"loss": 1.2632,
"step": 962
},
{
"epoch": 0.4667959282598158,
"grad_norm": 1.3352887630462646,
"learning_rate": 3.217608636497203e-05,
"loss": 1.2413,
"step": 963
},
{
"epoch": 0.46728065923412504,
"grad_norm": 1.3544855117797852,
"learning_rate": 3.213554018313631e-05,
"loss": 1.2552,
"step": 964
},
{
"epoch": 0.46776539020843433,
"grad_norm": 1.5176138877868652,
"learning_rate": 3.209497355708019e-05,
"loss": 1.4344,
"step": 965
},
{
"epoch": 0.46825012118274356,
"grad_norm": 1.3940963745117188,
"learning_rate": 3.205438660303216e-05,
"loss": 1.418,
"step": 966
},
{
"epoch": 0.46873485215705285,
"grad_norm": 1.6681914329528809,
"learning_rate": 3.201377943727896e-05,
"loss": 1.5961,
"step": 967
},
{
"epoch": 0.4692195831313621,
"grad_norm": 1.3219623565673828,
"learning_rate": 3.1973152176165224e-05,
"loss": 1.1749,
"step": 968
},
{
"epoch": 0.46970431410567137,
"grad_norm": 1.4592280387878418,
"learning_rate": 3.1932504936093167e-05,
"loss": 1.6801,
"step": 969
},
{
"epoch": 0.4701890450799806,
"grad_norm": 1.471772313117981,
"learning_rate": 3.189183783352224e-05,
"loss": 1.4001,
"step": 970
},
{
"epoch": 0.4706737760542899,
"grad_norm": 1.5347093343734741,
"learning_rate": 3.1851150984968814e-05,
"loss": 1.4215,
"step": 971
},
{
"epoch": 0.4711585070285991,
"grad_norm": 1.481303334236145,
"learning_rate": 3.1810444507005824e-05,
"loss": 1.4193,
"step": 972
},
{
"epoch": 0.47164323800290836,
"grad_norm": 1.6206600666046143,
"learning_rate": 3.1769718516262466e-05,
"loss": 1.4644,
"step": 973
},
{
"epoch": 0.47212796897721765,
"grad_norm": 1.437705159187317,
"learning_rate": 3.172897312942381e-05,
"loss": 1.4197,
"step": 974
},
{
"epoch": 0.4726126999515269,
"grad_norm": 1.5616474151611328,
"learning_rate": 3.168820846323053e-05,
"loss": 1.574,
"step": 975
},
{
"epoch": 0.47309743092583617,
"grad_norm": 1.4508112668991089,
"learning_rate": 3.16474246344785e-05,
"loss": 1.5167,
"step": 976
},
{
"epoch": 0.4735821619001454,
"grad_norm": 1.4133033752441406,
"learning_rate": 3.1606621760018554e-05,
"loss": 1.3912,
"step": 977
},
{
"epoch": 0.4740668928744547,
"grad_norm": 1.5243926048278809,
"learning_rate": 3.156579995675603e-05,
"loss": 1.1586,
"step": 978
},
{
"epoch": 0.4745516238487639,
"grad_norm": 1.3419501781463623,
"learning_rate": 3.152495934165055e-05,
"loss": 1.434,
"step": 979
},
{
"epoch": 0.4750363548230732,
"grad_norm": 1.5045238733291626,
"learning_rate": 3.148410003171561e-05,
"loss": 1.2983,
"step": 980
},
{
"epoch": 0.47552108579738245,
"grad_norm": 1.3928799629211426,
"learning_rate": 3.1443222144018265e-05,
"loss": 1.7629,
"step": 981
},
{
"epoch": 0.47600581677169174,
"grad_norm": 1.5783344507217407,
"learning_rate": 3.1402325795678814e-05,
"loss": 1.4108,
"step": 982
},
{
"epoch": 0.47649054774600097,
"grad_norm": 1.3899141550064087,
"learning_rate": 3.1361411103870455e-05,
"loss": 1.3194,
"step": 983
},
{
"epoch": 0.4769752787203102,
"grad_norm": 1.438714623451233,
"learning_rate": 3.1320478185818896e-05,
"loss": 1.6322,
"step": 984
},
{
"epoch": 0.4774600096946195,
"grad_norm": 1.579592227935791,
"learning_rate": 3.127952715880212e-05,
"loss": 1.7614,
"step": 985
},
{
"epoch": 0.4779447406689287,
"grad_norm": 1.5455819368362427,
"learning_rate": 3.1238558140149964e-05,
"loss": 1.1737,
"step": 986
},
{
"epoch": 0.478429471643238,
"grad_norm": 1.4268600940704346,
"learning_rate": 3.119757124724384e-05,
"loss": 1.3511,
"step": 987
},
{
"epoch": 0.47891420261754725,
"grad_norm": 1.4414108991622925,
"learning_rate": 3.115656659751632e-05,
"loss": 1.6027,
"step": 988
},
{
"epoch": 0.47939893359185654,
"grad_norm": 1.382475733757019,
"learning_rate": 3.11155443084509e-05,
"loss": 1.3433,
"step": 989
},
{
"epoch": 0.47988366456616577,
"grad_norm": 1.3869885206222534,
"learning_rate": 3.10745044975816e-05,
"loss": 1.5374,
"step": 990
},
{
"epoch": 0.48036839554047506,
"grad_norm": 1.4492113590240479,
"learning_rate": 3.1033447282492646e-05,
"loss": 1.4005,
"step": 991
},
{
"epoch": 0.4808531265147843,
"grad_norm": 1.4593931436538696,
"learning_rate": 3.0992372780818113e-05,
"loss": 1.4781,
"step": 992
},
{
"epoch": 0.4813378574890936,
"grad_norm": 1.2960480451583862,
"learning_rate": 3.0951281110241634e-05,
"loss": 1.1956,
"step": 993
},
{
"epoch": 0.4818225884634028,
"grad_norm": 3.1581783294677734,
"learning_rate": 3.0910172388496e-05,
"loss": 1.2586,
"step": 994
},
{
"epoch": 0.48230731943771205,
"grad_norm": 1.422606348991394,
"learning_rate": 3.086904673336287e-05,
"loss": 1.3854,
"step": 995
},
{
"epoch": 0.48279205041202133,
"grad_norm": 1.3689360618591309,
"learning_rate": 3.082790426267243e-05,
"loss": 0.9986,
"step": 996
},
{
"epoch": 0.48327678138633057,
"grad_norm": 1.3445098400115967,
"learning_rate": 3.0786745094303035e-05,
"loss": 1.5066,
"step": 997
},
{
"epoch": 0.48376151236063986,
"grad_norm": 1.4405405521392822,
"learning_rate": 3.0745569346180876e-05,
"loss": 1.4553,
"step": 998
},
{
"epoch": 0.4842462433349491,
"grad_norm": 1.8509117364883423,
"learning_rate": 3.070437713627965e-05,
"loss": 1.5544,
"step": 999
},
{
"epoch": 0.4847309743092584,
"grad_norm": 1.540792465209961,
"learning_rate": 3.066316858262023e-05,
"loss": 1.8954,
"step": 1000
},
{
"epoch": 0.4852157052835676,
"grad_norm": 1.587246060371399,
"learning_rate": 3.0621943803270295e-05,
"loss": 1.3981,
"step": 1001
},
{
"epoch": 0.4857004362578769,
"grad_norm": 1.6329646110534668,
"learning_rate": 3.058070291634403e-05,
"loss": 1.795,
"step": 1002
},
{
"epoch": 0.48618516723218613,
"grad_norm": 1.5801475048065186,
"learning_rate": 3.053944604000177e-05,
"loss": 1.7153,
"step": 1003
},
{
"epoch": 0.4866698982064954,
"grad_norm": 1.7902394533157349,
"learning_rate": 3.0498173292449643e-05,
"loss": 1.5319,
"step": 1004
},
{
"epoch": 0.48715462918080465,
"grad_norm": 1.4669686555862427,
"learning_rate": 3.0456884791939278e-05,
"loss": 1.5914,
"step": 1005
},
{
"epoch": 0.4876393601551139,
"grad_norm": 4.603597640991211,
"learning_rate": 3.041558065676742e-05,
"loss": 1.7194,
"step": 1006
},
{
"epoch": 0.4881240911294232,
"grad_norm": 1.4835340976715088,
"learning_rate": 3.0374261005275607e-05,
"loss": 1.673,
"step": 1007
},
{
"epoch": 0.4886088221037324,
"grad_norm": 1.3735729455947876,
"learning_rate": 3.0332925955849844e-05,
"loss": 1.3659,
"step": 1008
},
{
"epoch": 0.4890935530780417,
"grad_norm": 1.5638624429702759,
"learning_rate": 3.0291575626920243e-05,
"loss": 1.4514,
"step": 1009
},
{
"epoch": 0.48957828405235093,
"grad_norm": 1.4114654064178467,
"learning_rate": 3.025021013696071e-05,
"loss": 1.4997,
"step": 1010
},
{
"epoch": 0.4900630150266602,
"grad_norm": 1.3967055082321167,
"learning_rate": 3.0208829604488563e-05,
"loss": 1.4588,
"step": 1011
},
{
"epoch": 0.49054774600096945,
"grad_norm": 1.719627022743225,
"learning_rate": 3.0167434148064254e-05,
"loss": 1.6016,
"step": 1012
},
{
"epoch": 0.49103247697527874,
"grad_norm": 1.463747262954712,
"learning_rate": 3.0126023886290955e-05,
"loss": 1.687,
"step": 1013
},
{
"epoch": 0.491517207949588,
"grad_norm": 1.4238579273223877,
"learning_rate": 3.008459893781429e-05,
"loss": 1.3301,
"step": 1014
},
{
"epoch": 0.49200193892389726,
"grad_norm": 1.401583194732666,
"learning_rate": 3.004315942132194e-05,
"loss": 1.6042,
"step": 1015
},
{
"epoch": 0.4924866698982065,
"grad_norm": 1.4220138788223267,
"learning_rate": 3.0001705455543326e-05,
"loss": 1.3374,
"step": 1016
},
{
"epoch": 0.49297140087251573,
"grad_norm": 1.4294856786727905,
"learning_rate": 2.99602371592493e-05,
"loss": 1.4055,
"step": 1017
},
{
"epoch": 0.493456131846825,
"grad_norm": 1.5217885971069336,
"learning_rate": 2.9918754651251723e-05,
"loss": 1.4845,
"step": 1018
},
{
"epoch": 0.49394086282113425,
"grad_norm": 1.4662128686904907,
"learning_rate": 2.9877258050403212e-05,
"loss": 1.2869,
"step": 1019
},
{
"epoch": 0.49442559379544354,
"grad_norm": 1.4692763090133667,
"learning_rate": 2.9835747475596743e-05,
"loss": 1.3966,
"step": 1020
},
{
"epoch": 0.4949103247697528,
"grad_norm": 1.358508586883545,
"learning_rate": 2.979422304576534e-05,
"loss": 1.3493,
"step": 1021
},
{
"epoch": 0.49539505574406206,
"grad_norm": 1.510632038116455,
"learning_rate": 2.9752684879881725e-05,
"loss": 1.6699,
"step": 1022
},
{
"epoch": 0.4958797867183713,
"grad_norm": 1.507602334022522,
"learning_rate": 2.9711133096957962e-05,
"loss": 1.6718,
"step": 1023
},
{
"epoch": 0.4963645176926806,
"grad_norm": 1.3191126585006714,
"learning_rate": 2.966956781604513e-05,
"loss": 1.2832,
"step": 1024
},
{
"epoch": 0.4968492486669898,
"grad_norm": 1.4080357551574707,
"learning_rate": 2.9627989156233006e-05,
"loss": 1.4744,
"step": 1025
},
{
"epoch": 0.4973339796412991,
"grad_norm": 1.4639397859573364,
"learning_rate": 2.9586397236649666e-05,
"loss": 1.5554,
"step": 1026
},
{
"epoch": 0.49781871061560834,
"grad_norm": 1.427006721496582,
"learning_rate": 2.9544792176461205e-05,
"loss": 1.8437,
"step": 1027
},
{
"epoch": 0.4983034415899176,
"grad_norm": 1.4412407875061035,
"learning_rate": 2.9503174094871344e-05,
"loss": 1.5907,
"step": 1028
},
{
"epoch": 0.49878817256422686,
"grad_norm": 1.5524944067001343,
"learning_rate": 2.9461543111121128e-05,
"loss": 1.5992,
"step": 1029
},
{
"epoch": 0.4992729035385361,
"grad_norm": 1.46015202999115,
"learning_rate": 2.941989934448856e-05,
"loss": 1.3932,
"step": 1030
},
{
"epoch": 0.4997576345128454,
"grad_norm": 1.3838374614715576,
"learning_rate": 2.9378242914288272e-05,
"loss": 1.2414,
"step": 1031
},
{
"epoch": 0.5002423654871546,
"grad_norm": 1.40300714969635,
"learning_rate": 2.9336573939871186e-05,
"loss": 1.2298,
"step": 1032
},
{
"epoch": 0.5007270964614638,
"grad_norm": 1.3450630903244019,
"learning_rate": 2.9294892540624147e-05,
"loss": 1.6257,
"step": 1033
},
{
"epoch": 0.5012118274357732,
"grad_norm": 1.4591161012649536,
"learning_rate": 2.9253198835969607e-05,
"loss": 1.5271,
"step": 1034
},
{
"epoch": 0.5016965584100824,
"grad_norm": 1.4087638854980469,
"learning_rate": 2.9211492945365288e-05,
"loss": 1.2425,
"step": 1035
},
{
"epoch": 0.5021812893843917,
"grad_norm": 1.501910924911499,
"learning_rate": 2.9169774988303805e-05,
"loss": 1.5748,
"step": 1036
},
{
"epoch": 0.5026660203587009,
"grad_norm": 1.5783244371414185,
"learning_rate": 2.9128045084312344e-05,
"loss": 1.3556,
"step": 1037
},
{
"epoch": 0.5031507513330101,
"grad_norm": 1.54836905002594,
"learning_rate": 2.908630335295235e-05,
"loss": 1.6043,
"step": 1038
},
{
"epoch": 0.5036354823073195,
"grad_norm": 1.2901034355163574,
"learning_rate": 2.9044549913819124e-05,
"loss": 1.5115,
"step": 1039
},
{
"epoch": 0.5041202132816287,
"grad_norm": 1.3442738056182861,
"learning_rate": 2.9002784886541517e-05,
"loss": 1.4688,
"step": 1040
},
{
"epoch": 0.5046049442559379,
"grad_norm": 1.411210298538208,
"learning_rate": 2.8961008390781603e-05,
"loss": 1.5606,
"step": 1041
},
{
"epoch": 0.5050896752302472,
"grad_norm": 1.4159587621688843,
"learning_rate": 2.8919220546234282e-05,
"loss": 1.498,
"step": 1042
},
{
"epoch": 0.5055744062045565,
"grad_norm": 1.3815311193466187,
"learning_rate": 2.8877421472626996e-05,
"loss": 1.2571,
"step": 1043
},
{
"epoch": 0.5060591371788657,
"grad_norm": 1.5053796768188477,
"learning_rate": 2.8835611289719345e-05,
"loss": 1.336,
"step": 1044
},
{
"epoch": 0.506543868153175,
"grad_norm": 1.4755936861038208,
"learning_rate": 2.8793790117302765e-05,
"loss": 1.2908,
"step": 1045
},
{
"epoch": 0.5070285991274842,
"grad_norm": 1.34824800491333,
"learning_rate": 2.8751958075200185e-05,
"loss": 1.5134,
"step": 1046
},
{
"epoch": 0.5075133301017936,
"grad_norm": 1.3312855958938599,
"learning_rate": 2.8710115283265655e-05,
"loss": 1.3463,
"step": 1047
},
{
"epoch": 0.5079980610761028,
"grad_norm": 1.3728924989700317,
"learning_rate": 2.8668261861384045e-05,
"loss": 1.7027,
"step": 1048
},
{
"epoch": 0.508482792050412,
"grad_norm": 1.781646728515625,
"learning_rate": 2.8626397929470672e-05,
"loss": 1.2995,
"step": 1049
},
{
"epoch": 0.5089675230247213,
"grad_norm": 1.4653760194778442,
"learning_rate": 2.8584523607470976e-05,
"loss": 1.3966,
"step": 1050
},
{
"epoch": 0.5094522539990305,
"grad_norm": 1.5125970840454102,
"learning_rate": 2.854263901536015e-05,
"loss": 1.4322,
"step": 1051
},
{
"epoch": 0.5099369849733398,
"grad_norm": 1.3065565824508667,
"learning_rate": 2.8500744273142833e-05,
"loss": 1.4568,
"step": 1052
},
{
"epoch": 0.5104217159476491,
"grad_norm": 1.6204395294189453,
"learning_rate": 2.845883950085271e-05,
"loss": 1.5351,
"step": 1053
},
{
"epoch": 0.5109064469219583,
"grad_norm": 1.3245327472686768,
"learning_rate": 2.8416924818552238e-05,
"loss": 1.4935,
"step": 1054
},
{
"epoch": 0.5113911778962675,
"grad_norm": 1.4986622333526611,
"learning_rate": 2.8375000346332255e-05,
"loss": 1.3269,
"step": 1055
},
{
"epoch": 0.5118759088705769,
"grad_norm": 1.5058215856552124,
"learning_rate": 2.8333066204311654e-05,
"loss": 1.7093,
"step": 1056
},
{
"epoch": 0.5123606398448861,
"grad_norm": 1.4276849031448364,
"learning_rate": 2.829112251263702e-05,
"loss": 1.5234,
"step": 1057
},
{
"epoch": 0.5128453708191953,
"grad_norm": 1.4878147840499878,
"learning_rate": 2.824916939148231e-05,
"loss": 1.1303,
"step": 1058
},
{
"epoch": 0.5133301017935046,
"grad_norm": 1.4708224534988403,
"learning_rate": 2.8207206961048494e-05,
"loss": 1.5127,
"step": 1059
},
{
"epoch": 0.5138148327678138,
"grad_norm": 1.4829494953155518,
"learning_rate": 2.8165235341563212e-05,
"loss": 1.1638,
"step": 1060
},
{
"epoch": 0.5142995637421232,
"grad_norm": 1.3504929542541504,
"learning_rate": 2.8123254653280445e-05,
"loss": 1.5974,
"step": 1061
},
{
"epoch": 0.5147842947164324,
"grad_norm": 1.5481303930282593,
"learning_rate": 2.8081265016480137e-05,
"loss": 1.4454,
"step": 1062
},
{
"epoch": 0.5152690256907416,
"grad_norm": 1.4048713445663452,
"learning_rate": 2.8039266551467873e-05,
"loss": 1.5425,
"step": 1063
},
{
"epoch": 0.5157537566650509,
"grad_norm": 1.3949064016342163,
"learning_rate": 2.7997259378574564e-05,
"loss": 1.2964,
"step": 1064
},
{
"epoch": 0.5162384876393602,
"grad_norm": 1.4568665027618408,
"learning_rate": 2.7955243618156023e-05,
"loss": 1.5504,
"step": 1065
},
{
"epoch": 0.5167232186136694,
"grad_norm": 1.3728277683258057,
"learning_rate": 2.7913219390592704e-05,
"loss": 1.19,
"step": 1066
},
{
"epoch": 0.5172079495879787,
"grad_norm": 1.5279256105422974,
"learning_rate": 2.787118681628929e-05,
"loss": 1.737,
"step": 1067
},
{
"epoch": 0.5176926805622879,
"grad_norm": 1.4620921611785889,
"learning_rate": 2.7829146015674406e-05,
"loss": 1.461,
"step": 1068
},
{
"epoch": 0.5181774115365972,
"grad_norm": 1.5227320194244385,
"learning_rate": 2.778709710920024e-05,
"loss": 1.3925,
"step": 1069
},
{
"epoch": 0.5186621425109065,
"grad_norm": 1.4971587657928467,
"learning_rate": 2.7745040217342195e-05,
"loss": 1.4625,
"step": 1070
},
{
"epoch": 0.5191468734852157,
"grad_norm": 1.531641960144043,
"learning_rate": 2.7702975460598547e-05,
"loss": 1.3207,
"step": 1071
},
{
"epoch": 0.5196316044595249,
"grad_norm": 1.3242888450622559,
"learning_rate": 2.766090295949013e-05,
"loss": 1.5634,
"step": 1072
},
{
"epoch": 0.5201163354338342,
"grad_norm": 1.3874385356903076,
"learning_rate": 2.7618822834559947e-05,
"loss": 1.35,
"step": 1073
},
{
"epoch": 0.5206010664081435,
"grad_norm": 1.45573091506958,
"learning_rate": 2.757673520637285e-05,
"loss": 1.6007,
"step": 1074
},
{
"epoch": 0.5210857973824528,
"grad_norm": 1.45209538936615,
"learning_rate": 2.75346401955152e-05,
"loss": 1.3736,
"step": 1075
},
{
"epoch": 0.521570528356762,
"grad_norm": 1.4319308996200562,
"learning_rate": 2.749253792259448e-05,
"loss": 1.6778,
"step": 1076
},
{
"epoch": 0.5220552593310712,
"grad_norm": 1.388581395149231,
"learning_rate": 2.7450428508239024e-05,
"loss": 1.6616,
"step": 1077
},
{
"epoch": 0.5225399903053806,
"grad_norm": 1.3756077289581299,
"learning_rate": 2.7408312073097574e-05,
"loss": 1.1387,
"step": 1078
},
{
"epoch": 0.5230247212796898,
"grad_norm": 1.4797683954238892,
"learning_rate": 2.7366188737839026e-05,
"loss": 1.5972,
"step": 1079
},
{
"epoch": 0.523509452253999,
"grad_norm": 1.4945778846740723,
"learning_rate": 2.7324058623152056e-05,
"loss": 1.5199,
"step": 1080
},
{
"epoch": 0.5239941832283083,
"grad_norm": 1.3682905435562134,
"learning_rate": 2.7281921849744714e-05,
"loss": 1.3174,
"step": 1081
},
{
"epoch": 0.5244789142026175,
"grad_norm": 1.3681645393371582,
"learning_rate": 2.7239778538344163e-05,
"loss": 1.3481,
"step": 1082
},
{
"epoch": 0.5249636451769268,
"grad_norm": 1.43455171585083,
"learning_rate": 2.7197628809696306e-05,
"loss": 1.4334,
"step": 1083
},
{
"epoch": 0.5254483761512361,
"grad_norm": 1.50799560546875,
"learning_rate": 2.715547278456541e-05,
"loss": 1.5683,
"step": 1084
},
{
"epoch": 0.5259331071255453,
"grad_norm": 1.4469860792160034,
"learning_rate": 2.7113310583733797e-05,
"loss": 1.4747,
"step": 1085
},
{
"epoch": 0.5264178380998545,
"grad_norm": 1.420809030532837,
"learning_rate": 2.7071142328001465e-05,
"loss": 1.6305,
"step": 1086
},
{
"epoch": 0.5269025690741639,
"grad_norm": 1.3464183807373047,
"learning_rate": 2.7028968138185782e-05,
"loss": 1.3846,
"step": 1087
},
{
"epoch": 0.5273873000484731,
"grad_norm": 1.4121391773223877,
"learning_rate": 2.6986788135121106e-05,
"loss": 1.4262,
"step": 1088
},
{
"epoch": 0.5278720310227824,
"grad_norm": 1.4404566287994385,
"learning_rate": 2.6944602439658457e-05,
"loss": 1.7716,
"step": 1089
},
{
"epoch": 0.5283567619970916,
"grad_norm": 1.4350225925445557,
"learning_rate": 2.6902411172665147e-05,
"loss": 1.3612,
"step": 1090
},
{
"epoch": 0.5288414929714008,
"grad_norm": 1.46249520778656,
"learning_rate": 2.686021445502448e-05,
"loss": 1.699,
"step": 1091
},
{
"epoch": 0.5293262239457102,
"grad_norm": 1.4543901681900024,
"learning_rate": 2.681801240763535e-05,
"loss": 1.6293,
"step": 1092
},
{
"epoch": 0.5298109549200194,
"grad_norm": 1.5145845413208008,
"learning_rate": 2.6775805151411936e-05,
"loss": 1.4227,
"step": 1093
},
{
"epoch": 0.5302956858943286,
"grad_norm": 1.4717572927474976,
"learning_rate": 2.6733592807283344e-05,
"loss": 1.4983,
"step": 1094
},
{
"epoch": 0.5307804168686379,
"grad_norm": 1.4818400144577026,
"learning_rate": 2.6691375496193234e-05,
"loss": 1.3531,
"step": 1095
},
{
"epoch": 0.5312651478429472,
"grad_norm": 1.3545438051223755,
"learning_rate": 2.6649153339099524e-05,
"loss": 1.0584,
"step": 1096
},
{
"epoch": 0.5317498788172564,
"grad_norm": 1.457953691482544,
"learning_rate": 2.6606926456974013e-05,
"loss": 1.4133,
"step": 1097
},
{
"epoch": 0.5322346097915657,
"grad_norm": 1.343666434288025,
"learning_rate": 2.656469497080202e-05,
"loss": 1.3662,
"step": 1098
},
{
"epoch": 0.5327193407658749,
"grad_norm": 1.5306185483932495,
"learning_rate": 2.6522459001582078e-05,
"loss": 1.6159,
"step": 1099
},
{
"epoch": 0.5332040717401842,
"grad_norm": 1.4223085641860962,
"learning_rate": 2.648021867032554e-05,
"loss": 1.3616,
"step": 1100
},
{
"epoch": 0.5336888027144935,
"grad_norm": 1.4389088153839111,
"learning_rate": 2.643797409805628e-05,
"loss": 1.2718,
"step": 1101
},
{
"epoch": 0.5341735336888027,
"grad_norm": 1.5863919258117676,
"learning_rate": 2.6395725405810307e-05,
"loss": 1.5832,
"step": 1102
},
{
"epoch": 0.534658264663112,
"grad_norm": 1.531956672668457,
"learning_rate": 2.635347271463544e-05,
"loss": 1.3952,
"step": 1103
},
{
"epoch": 0.5351429956374212,
"grad_norm": 1.63759446144104,
"learning_rate": 2.631121614559096e-05,
"loss": 1.3839,
"step": 1104
},
{
"epoch": 0.5356277266117305,
"grad_norm": 1.6401275396347046,
"learning_rate": 2.6268955819747247e-05,
"loss": 1.6776,
"step": 1105
},
{
"epoch": 0.5361124575860398,
"grad_norm": 1.4038505554199219,
"learning_rate": 2.6226691858185454e-05,
"loss": 1.4152,
"step": 1106
},
{
"epoch": 0.536597188560349,
"grad_norm": 1.4665429592132568,
"learning_rate": 2.6184424381997146e-05,
"loss": 1.5916,
"step": 1107
},
{
"epoch": 0.5370819195346582,
"grad_norm": 1.3302826881408691,
"learning_rate": 2.6142153512283968e-05,
"loss": 1.5929,
"step": 1108
},
{
"epoch": 0.5375666505089676,
"grad_norm": 1.4651814699172974,
"learning_rate": 2.609987937015728e-05,
"loss": 1.5793,
"step": 1109
},
{
"epoch": 0.5380513814832768,
"grad_norm": 1.3761016130447388,
"learning_rate": 2.605760207673781e-05,
"loss": 1.4432,
"step": 1110
},
{
"epoch": 0.538536112457586,
"grad_norm": 1.5528533458709717,
"learning_rate": 2.601532175315532e-05,
"loss": 1.6668,
"step": 1111
},
{
"epoch": 0.5390208434318953,
"grad_norm": 1.3150146007537842,
"learning_rate": 2.5973038520548266e-05,
"loss": 1.3136,
"step": 1112
},
{
"epoch": 0.5395055744062045,
"grad_norm": 1.4814192056655884,
"learning_rate": 2.5930752500063425e-05,
"loss": 1.721,
"step": 1113
},
{
"epoch": 0.5399903053805138,
"grad_norm": 1.5142635107040405,
"learning_rate": 2.5888463812855578e-05,
"loss": 1.5931,
"step": 1114
},
{
"epoch": 0.5404750363548231,
"grad_norm": 1.3756675720214844,
"learning_rate": 2.5846172580087112e-05,
"loss": 1.0801,
"step": 1115
},
{
"epoch": 0.5409597673291323,
"grad_norm": 1.4426296949386597,
"learning_rate": 2.5803878922927755e-05,
"loss": 1.4838,
"step": 1116
},
{
"epoch": 0.5414444983034415,
"grad_norm": 1.2579292058944702,
"learning_rate": 2.576158296255413e-05,
"loss": 1.2591,
"step": 1117
},
{
"epoch": 0.5419292292777509,
"grad_norm": 1.4666320085525513,
"learning_rate": 2.5719284820149503e-05,
"loss": 1.5415,
"step": 1118
},
{
"epoch": 0.5424139602520601,
"grad_norm": 1.5992366075515747,
"learning_rate": 2.5676984616903367e-05,
"loss": 1.327,
"step": 1119
},
{
"epoch": 0.5428986912263694,
"grad_norm": 1.4156771898269653,
"learning_rate": 2.5634682474011128e-05,
"loss": 1.2807,
"step": 1120
},
{
"epoch": 0.5433834222006786,
"grad_norm": 1.4783258438110352,
"learning_rate": 2.559237851267374e-05,
"loss": 1.5313,
"step": 1121
},
{
"epoch": 0.5438681531749879,
"grad_norm": 1.400573492050171,
"learning_rate": 2.555007285409739e-05,
"loss": 1.3329,
"step": 1122
},
{
"epoch": 0.5443528841492972,
"grad_norm": 1.3685263395309448,
"learning_rate": 2.550776561949311e-05,
"loss": 1.4368,
"step": 1123
},
{
"epoch": 0.5448376151236064,
"grad_norm": 1.5202267169952393,
"learning_rate": 2.5465456930076435e-05,
"loss": 1.5863,
"step": 1124
},
{
"epoch": 0.5453223460979156,
"grad_norm": 1.5026854276657104,
"learning_rate": 2.54231469070671e-05,
"loss": 1.6135,
"step": 1125
},
{
"epoch": 0.5458070770722249,
"grad_norm": 1.2926137447357178,
"learning_rate": 2.5380835671688628e-05,
"loss": 1.1667,
"step": 1126
},
{
"epoch": 0.5462918080465342,
"grad_norm": 1.429006814956665,
"learning_rate": 2.5338523345168048e-05,
"loss": 1.3725,
"step": 1127
},
{
"epoch": 0.5467765390208434,
"grad_norm": 1.605655550956726,
"learning_rate": 2.52962100487355e-05,
"loss": 1.9144,
"step": 1128
},
{
"epoch": 0.5472612699951527,
"grad_norm": 1.3905277252197266,
"learning_rate": 2.525389590362388e-05,
"loss": 1.4073,
"step": 1129
},
{
"epoch": 0.5477460009694619,
"grad_norm": 1.443237543106079,
"learning_rate": 2.521158103106856e-05,
"loss": 1.6917,
"step": 1130
},
{
"epoch": 0.5482307319437713,
"grad_norm": 1.544218897819519,
"learning_rate": 2.5169265552306963e-05,
"loss": 1.4208,
"step": 1131
},
{
"epoch": 0.5487154629180805,
"grad_norm": 1.4166316986083984,
"learning_rate": 2.5126949588578264e-05,
"loss": 1.3686,
"step": 1132
},
{
"epoch": 0.5492001938923897,
"grad_norm": 1.3663363456726074,
"learning_rate": 2.508463326112302e-05,
"loss": 1.3377,
"step": 1133
},
{
"epoch": 0.549684924866699,
"grad_norm": 1.3695237636566162,
"learning_rate": 2.504231669118283e-05,
"loss": 1.384,
"step": 1134
},
{
"epoch": 0.5501696558410082,
"grad_norm": 1.418594479560852,
"learning_rate": 2.5e-05,
"loss": 1.6453,
"step": 1135
},
{
"epoch": 0.5506543868153175,
"grad_norm": 1.4432318210601807,
"learning_rate": 2.495768330881717e-05,
"loss": 1.6813,
"step": 1136
},
{
"epoch": 0.5511391177896268,
"grad_norm": 1.3886134624481201,
"learning_rate": 2.4915366738876986e-05,
"loss": 1.4878,
"step": 1137
},
{
"epoch": 0.551623848763936,
"grad_norm": 1.3143367767333984,
"learning_rate": 2.4873050411421738e-05,
"loss": 1.1719,
"step": 1138
},
{
"epoch": 0.5521085797382452,
"grad_norm": 1.4947563409805298,
"learning_rate": 2.483073444769304e-05,
"loss": 1.9726,
"step": 1139
},
{
"epoch": 0.5525933107125546,
"grad_norm": 1.4314428567886353,
"learning_rate": 2.478841896893145e-05,
"loss": 1.4453,
"step": 1140
},
{
"epoch": 0.5530780416868638,
"grad_norm": 1.5481895208358765,
"learning_rate": 2.4746104096376128e-05,
"loss": 1.5185,
"step": 1141
},
{
"epoch": 0.553562772661173,
"grad_norm": 1.3574029207229614,
"learning_rate": 2.470378995126451e-05,
"loss": 1.2467,
"step": 1142
},
{
"epoch": 0.5540475036354823,
"grad_norm": 1.4265308380126953,
"learning_rate": 2.4661476654831958e-05,
"loss": 1.3669,
"step": 1143
},
{
"epoch": 0.5545322346097916,
"grad_norm": 1.789808988571167,
"learning_rate": 2.4619164328311374e-05,
"loss": 1.4957,
"step": 1144
},
{
"epoch": 0.5550169655841009,
"grad_norm": 1.401563048362732,
"learning_rate": 2.4576853092932907e-05,
"loss": 1.7368,
"step": 1145
},
{
"epoch": 0.5555016965584101,
"grad_norm": 1.3560460805892944,
"learning_rate": 2.4534543069923567e-05,
"loss": 1.3568,
"step": 1146
},
{
"epoch": 0.5559864275327193,
"grad_norm": 1.4376869201660156,
"learning_rate": 2.4492234380506894e-05,
"loss": 1.4885,
"step": 1147
},
{
"epoch": 0.5564711585070286,
"grad_norm": 1.4365642070770264,
"learning_rate": 2.4449927145902606e-05,
"loss": 1.2875,
"step": 1148
},
{
"epoch": 0.5569558894813379,
"grad_norm": 1.4295824766159058,
"learning_rate": 2.4407621487326255e-05,
"loss": 1.4832,
"step": 1149
},
{
"epoch": 0.5574406204556471,
"grad_norm": 1.3713922500610352,
"learning_rate": 2.4365317525988885e-05,
"loss": 1.1781,
"step": 1150
},
{
"epoch": 0.5579253514299564,
"grad_norm": 1.5723742246627808,
"learning_rate": 2.4323015383096643e-05,
"loss": 1.4846,
"step": 1151
},
{
"epoch": 0.5584100824042656,
"grad_norm": 1.2833575010299683,
"learning_rate": 2.4280715179850506e-05,
"loss": 1.1875,
"step": 1152
},
{
"epoch": 0.5588948133785749,
"grad_norm": 1.355686068534851,
"learning_rate": 2.4238417037445875e-05,
"loss": 1.1902,
"step": 1153
},
{
"epoch": 0.5593795443528842,
"grad_norm": 1.415571689605713,
"learning_rate": 2.419612107707225e-05,
"loss": 1.5119,
"step": 1154
},
{
"epoch": 0.5598642753271934,
"grad_norm": 1.4047833681106567,
"learning_rate": 2.415382741991289e-05,
"loss": 1.3724,
"step": 1155
},
{
"epoch": 0.5603490063015026,
"grad_norm": 1.8079580068588257,
"learning_rate": 2.4111536187144425e-05,
"loss": 1.6128,
"step": 1156
},
{
"epoch": 0.5608337372758119,
"grad_norm": 1.410330057144165,
"learning_rate": 2.406924749993657e-05,
"loss": 1.3897,
"step": 1157
},
{
"epoch": 0.5613184682501212,
"grad_norm": 1.3198529481887817,
"learning_rate": 2.4026961479451733e-05,
"loss": 1.0365,
"step": 1158
},
{
"epoch": 0.5618031992244304,
"grad_norm": 1.3807947635650635,
"learning_rate": 2.3984678246844677e-05,
"loss": 1.3613,
"step": 1159
},
{
"epoch": 0.5622879301987397,
"grad_norm": 3.3659908771514893,
"learning_rate": 2.3942397923262204e-05,
"loss": 1.4527,
"step": 1160
},
{
"epoch": 0.5627726611730489,
"grad_norm": 1.3476786613464355,
"learning_rate": 2.3900120629842732e-05,
"loss": 1.1218,
"step": 1161
},
{
"epoch": 0.5632573921473583,
"grad_norm": 1.6585768461227417,
"learning_rate": 2.3857846487716038e-05,
"loss": 2.0664,
"step": 1162
},
{
"epoch": 0.5637421231216675,
"grad_norm": 1.4534820318222046,
"learning_rate": 2.3815575618002856e-05,
"loss": 1.466,
"step": 1163
},
{
"epoch": 0.5642268540959767,
"grad_norm": 1.3563114404678345,
"learning_rate": 2.3773308141814552e-05,
"loss": 1.3486,
"step": 1164
},
{
"epoch": 0.564711585070286,
"grad_norm": 1.4399093389511108,
"learning_rate": 2.3731044180252756e-05,
"loss": 1.3011,
"step": 1165
},
{
"epoch": 0.5651963160445953,
"grad_norm": 1.3951159715652466,
"learning_rate": 2.3688783854409045e-05,
"loss": 1.159,
"step": 1166
},
{
"epoch": 0.5656810470189045,
"grad_norm": 1.4155405759811401,
"learning_rate": 2.3646527285364565e-05,
"loss": 1.3806,
"step": 1167
},
{
"epoch": 0.5661657779932138,
"grad_norm": 1.3713849782943726,
"learning_rate": 2.3604274594189695e-05,
"loss": 1.4719,
"step": 1168
},
{
"epoch": 0.566650508967523,
"grad_norm": 1.5539053678512573,
"learning_rate": 2.3562025901943726e-05,
"loss": 1.5451,
"step": 1169
},
{
"epoch": 0.5671352399418322,
"grad_norm": 1.4450373649597168,
"learning_rate": 2.351978132967447e-05,
"loss": 1.3855,
"step": 1170
},
{
"epoch": 0.5676199709161416,
"grad_norm": 1.4404420852661133,
"learning_rate": 2.347754099841793e-05,
"loss": 1.7035,
"step": 1171
},
{
"epoch": 0.5681047018904508,
"grad_norm": 1.4115735292434692,
"learning_rate": 2.3435305029197984e-05,
"loss": 1.4108,
"step": 1172
},
{
"epoch": 0.56858943286476,
"grad_norm": 1.4475492238998413,
"learning_rate": 2.3393073543025996e-05,
"loss": 1.2907,
"step": 1173
},
{
"epoch": 0.5690741638390693,
"grad_norm": 1.3643134832382202,
"learning_rate": 2.335084666090048e-05,
"loss": 1.3279,
"step": 1174
},
{
"epoch": 0.5695588948133786,
"grad_norm": 1.480979323387146,
"learning_rate": 2.3308624503806772e-05,
"loss": 1.6189,
"step": 1175
},
{
"epoch": 0.5700436257876879,
"grad_norm": 1.3969759941101074,
"learning_rate": 2.3266407192716666e-05,
"loss": 1.472,
"step": 1176
},
{
"epoch": 0.5705283567619971,
"grad_norm": 1.454965591430664,
"learning_rate": 2.3224194848588066e-05,
"loss": 1.0431,
"step": 1177
},
{
"epoch": 0.5710130877363063,
"grad_norm": 1.3337841033935547,
"learning_rate": 2.3181987592364655e-05,
"loss": 1.2462,
"step": 1178
},
{
"epoch": 0.5714978187106156,
"grad_norm": 1.3526654243469238,
"learning_rate": 2.3139785544975527e-05,
"loss": 1.3556,
"step": 1179
},
{
"epoch": 0.5719825496849249,
"grad_norm": 1.464104413986206,
"learning_rate": 2.309758882733486e-05,
"loss": 1.594,
"step": 1180
},
{
"epoch": 0.5724672806592341,
"grad_norm": 1.5070477724075317,
"learning_rate": 2.305539756034155e-05,
"loss": 1.2799,
"step": 1181
},
{
"epoch": 0.5729520116335434,
"grad_norm": 1.747273325920105,
"learning_rate": 2.30132118648789e-05,
"loss": 1.4735,
"step": 1182
},
{
"epoch": 0.5734367426078526,
"grad_norm": 2.815830945968628,
"learning_rate": 2.2971031861814223e-05,
"loss": 1.3718,
"step": 1183
},
{
"epoch": 0.573921473582162,
"grad_norm": 1.4527679681777954,
"learning_rate": 2.2928857671998538e-05,
"loss": 1.4325,
"step": 1184
},
{
"epoch": 0.5744062045564712,
"grad_norm": 1.5000076293945312,
"learning_rate": 2.288668941626621e-05,
"loss": 1.5461,
"step": 1185
},
{
"epoch": 0.5748909355307804,
"grad_norm": 1.3602912425994873,
"learning_rate": 2.2844527215434592e-05,
"loss": 1.403,
"step": 1186
},
{
"epoch": 0.5753756665050896,
"grad_norm": 1.4031354188919067,
"learning_rate": 2.2802371190303696e-05,
"loss": 1.2408,
"step": 1187
},
{
"epoch": 0.575860397479399,
"grad_norm": 1.385236382484436,
"learning_rate": 2.2760221461655833e-05,
"loss": 1.4098,
"step": 1188
},
{
"epoch": 0.5763451284537082,
"grad_norm": 1.3685550689697266,
"learning_rate": 2.27180781502553e-05,
"loss": 1.2458,
"step": 1189
},
{
"epoch": 0.5768298594280175,
"grad_norm": 1.4121043682098389,
"learning_rate": 2.267594137684796e-05,
"loss": 1.6139,
"step": 1190
},
{
"epoch": 0.5773145904023267,
"grad_norm": 1.40248703956604,
"learning_rate": 2.2633811262160977e-05,
"loss": 1.6284,
"step": 1191
},
{
"epoch": 0.5777993213766359,
"grad_norm": 1.5229443311691284,
"learning_rate": 2.2591687926902432e-05,
"loss": 1.619,
"step": 1192
},
{
"epoch": 0.5782840523509453,
"grad_norm": 1.4838701486587524,
"learning_rate": 2.2549571491760986e-05,
"loss": 1.4843,
"step": 1193
},
{
"epoch": 0.5787687833252545,
"grad_norm": 1.4755678176879883,
"learning_rate": 2.2507462077405523e-05,
"loss": 1.4363,
"step": 1194
},
{
"epoch": 0.5792535142995637,
"grad_norm": 1.4962295293807983,
"learning_rate": 2.2465359804484806e-05,
"loss": 1.3032,
"step": 1195
},
{
"epoch": 0.579738245273873,
"grad_norm": 1.3037718534469604,
"learning_rate": 2.2423264793627148e-05,
"loss": 1.3224,
"step": 1196
},
{
"epoch": 0.5802229762481823,
"grad_norm": 1.3326340913772583,
"learning_rate": 2.2381177165440055e-05,
"loss": 1.5501,
"step": 1197
},
{
"epoch": 0.5807077072224915,
"grad_norm": 1.484641432762146,
"learning_rate": 2.2339097040509882e-05,
"loss": 1.5076,
"step": 1198
},
{
"epoch": 0.5811924381968008,
"grad_norm": 1.4163663387298584,
"learning_rate": 2.2297024539401463e-05,
"loss": 1.482,
"step": 1199
},
{
"epoch": 0.58167716917111,
"grad_norm": 1.3082435131072998,
"learning_rate": 2.225495978265782e-05,
"loss": 1.2917,
"step": 1200
},
{
"epoch": 0.5821619001454192,
"grad_norm": 1.4738914966583252,
"learning_rate": 2.2212902890799767e-05,
"loss": 1.6231,
"step": 1201
},
{
"epoch": 0.5826466311197286,
"grad_norm": 1.4668748378753662,
"learning_rate": 2.2170853984325597e-05,
"loss": 1.2994,
"step": 1202
},
{
"epoch": 0.5831313620940378,
"grad_norm": 1.4222160577774048,
"learning_rate": 2.2128813183710716e-05,
"loss": 1.4073,
"step": 1203
},
{
"epoch": 0.583616093068347,
"grad_norm": 1.6111295223236084,
"learning_rate": 2.2086780609407305e-05,
"loss": 1.1423,
"step": 1204
},
{
"epoch": 0.5841008240426563,
"grad_norm": 1.3994250297546387,
"learning_rate": 2.2044756381843983e-05,
"loss": 1.6045,
"step": 1205
},
{
"epoch": 0.5845855550169656,
"grad_norm": 1.3738783597946167,
"learning_rate": 2.2002740621425442e-05,
"loss": 1.2844,
"step": 1206
},
{
"epoch": 0.5850702859912749,
"grad_norm": 1.3349037170410156,
"learning_rate": 2.1960733448532126e-05,
"loss": 1.3563,
"step": 1207
},
{
"epoch": 0.5855550169655841,
"grad_norm": 1.464147925376892,
"learning_rate": 2.1918734983519873e-05,
"loss": 1.5385,
"step": 1208
},
{
"epoch": 0.5860397479398933,
"grad_norm": 1.2928396463394165,
"learning_rate": 2.1876745346719567e-05,
"loss": 1.1307,
"step": 1209
},
{
"epoch": 0.5865244789142026,
"grad_norm": 1.4599294662475586,
"learning_rate": 2.1834764658436797e-05,
"loss": 1.3856,
"step": 1210
},
{
"epoch": 0.5870092098885119,
"grad_norm": 1.4603444337844849,
"learning_rate": 2.1792793038951515e-05,
"loss": 1.4164,
"step": 1211
},
{
"epoch": 0.5874939408628211,
"grad_norm": 1.4382938146591187,
"learning_rate": 2.1750830608517696e-05,
"loss": 1.661,
"step": 1212
},
{
"epoch": 0.5879786718371304,
"grad_norm": 1.448847770690918,
"learning_rate": 2.1708877487362987e-05,
"loss": 1.498,
"step": 1213
},
{
"epoch": 0.5884634028114396,
"grad_norm": 1.4043089151382446,
"learning_rate": 2.1666933795688352e-05,
"loss": 1.4716,
"step": 1214
},
{
"epoch": 0.588948133785749,
"grad_norm": 1.4823040962219238,
"learning_rate": 2.1624999653667747e-05,
"loss": 1.4944,
"step": 1215
},
{
"epoch": 0.5894328647600582,
"grad_norm": 1.5355263948440552,
"learning_rate": 2.1583075181447764e-05,
"loss": 1.5809,
"step": 1216
},
{
"epoch": 0.5899175957343674,
"grad_norm": 1.4335788488388062,
"learning_rate": 2.1541160499147297e-05,
"loss": 1.4292,
"step": 1217
},
{
"epoch": 0.5904023267086767,
"grad_norm": 1.435947299003601,
"learning_rate": 2.1499255726857183e-05,
"loss": 1.5298,
"step": 1218
},
{
"epoch": 0.590887057682986,
"grad_norm": 1.5709877014160156,
"learning_rate": 2.1457360984639853e-05,
"loss": 1.304,
"step": 1219
},
{
"epoch": 0.5913717886572952,
"grad_norm": 1.3494329452514648,
"learning_rate": 2.141547639252903e-05,
"loss": 1.4771,
"step": 1220
},
{
"epoch": 0.5918565196316045,
"grad_norm": 1.6605241298675537,
"learning_rate": 2.137360207052933e-05,
"loss": 1.8418,
"step": 1221
},
{
"epoch": 0.5923412506059137,
"grad_norm": 1.2385108470916748,
"learning_rate": 2.1331738138615958e-05,
"loss": 1.3536,
"step": 1222
},
{
"epoch": 0.5928259815802229,
"grad_norm": 1.4509235620498657,
"learning_rate": 2.1289884716734347e-05,
"loss": 1.4231,
"step": 1223
},
{
"epoch": 0.5933107125545323,
"grad_norm": 1.551924705505371,
"learning_rate": 2.124804192479982e-05,
"loss": 1.3416,
"step": 1224
},
{
"epoch": 0.5937954435288415,
"grad_norm": 1.4182485342025757,
"learning_rate": 2.1206209882697234e-05,
"loss": 1.7183,
"step": 1225
},
{
"epoch": 0.5942801745031507,
"grad_norm": 2.135904550552368,
"learning_rate": 2.1164388710280654e-05,
"loss": 1.2838,
"step": 1226
},
{
"epoch": 0.59476490547746,
"grad_norm": 1.4602768421173096,
"learning_rate": 2.1122578527373016e-05,
"loss": 1.3902,
"step": 1227
},
{
"epoch": 0.5952496364517693,
"grad_norm": 1.485456943511963,
"learning_rate": 2.1080779453765727e-05,
"loss": 1.4738,
"step": 1228
},
{
"epoch": 0.5957343674260785,
"grad_norm": 1.4163349866867065,
"learning_rate": 2.1038991609218407e-05,
"loss": 1.2601,
"step": 1229
},
{
"epoch": 0.5962190984003878,
"grad_norm": 1.474471926689148,
"learning_rate": 2.099721511345849e-05,
"loss": 1.633,
"step": 1230
},
{
"epoch": 0.596703829374697,
"grad_norm": 1.3297244310379028,
"learning_rate": 2.0955450086180882e-05,
"loss": 1.1103,
"step": 1231
},
{
"epoch": 0.5971885603490062,
"grad_norm": 1.4120020866394043,
"learning_rate": 2.091369664704766e-05,
"loss": 1.469,
"step": 1232
},
{
"epoch": 0.5976732913233156,
"grad_norm": 1.4585497379302979,
"learning_rate": 2.0871954915687658e-05,
"loss": 1.4362,
"step": 1233
},
{
"epoch": 0.5981580222976248,
"grad_norm": 1.4092267751693726,
"learning_rate": 2.08302250116962e-05,
"loss": 1.3138,
"step": 1234
},
{
"epoch": 0.5986427532719341,
"grad_norm": 1.3591123819351196,
"learning_rate": 2.0788507054634714e-05,
"loss": 1.3283,
"step": 1235
},
{
"epoch": 0.5991274842462433,
"grad_norm": 1.3640435934066772,
"learning_rate": 2.074680116403039e-05,
"loss": 1.4952,
"step": 1236
},
{
"epoch": 0.5996122152205526,
"grad_norm": 1.4771053791046143,
"learning_rate": 2.070510745937586e-05,
"loss": 1.6309,
"step": 1237
},
{
"epoch": 0.6000969461948619,
"grad_norm": 1.4691507816314697,
"learning_rate": 2.066342606012882e-05,
"loss": 1.348,
"step": 1238
},
{
"epoch": 0.6005816771691711,
"grad_norm": 1.3989927768707275,
"learning_rate": 2.0621757085711734e-05,
"loss": 1.4557,
"step": 1239
},
{
"epoch": 0.6010664081434803,
"grad_norm": 1.4642021656036377,
"learning_rate": 2.058010065551145e-05,
"loss": 1.5403,
"step": 1240
},
{
"epoch": 0.6015511391177897,
"grad_norm": 1.473580241203308,
"learning_rate": 2.0538456888878878e-05,
"loss": 1.3097,
"step": 1241
},
{
"epoch": 0.6020358700920989,
"grad_norm": 1.3591980934143066,
"learning_rate": 2.0496825905128665e-05,
"loss": 1.4441,
"step": 1242
},
{
"epoch": 0.6025206010664081,
"grad_norm": 1.3612762689590454,
"learning_rate": 2.04552078235388e-05,
"loss": 1.2061,
"step": 1243
},
{
"epoch": 0.6030053320407174,
"grad_norm": 1.4053421020507812,
"learning_rate": 2.0413602763350337e-05,
"loss": 1.3619,
"step": 1244
},
{
"epoch": 0.6034900630150266,
"grad_norm": 1.2905552387237549,
"learning_rate": 2.0372010843766996e-05,
"loss": 1.1999,
"step": 1245
},
{
"epoch": 0.603974793989336,
"grad_norm": 1.4193109273910522,
"learning_rate": 2.0330432183954867e-05,
"loss": 1.512,
"step": 1246
},
{
"epoch": 0.6044595249636452,
"grad_norm": 1.3693450689315796,
"learning_rate": 2.0288866903042054e-05,
"loss": 1.3532,
"step": 1247
},
{
"epoch": 0.6049442559379544,
"grad_norm": 1.4601976871490479,
"learning_rate": 2.0247315120118284e-05,
"loss": 1.4071,
"step": 1248
},
{
"epoch": 0.6054289869122637,
"grad_norm": 1.480519413948059,
"learning_rate": 2.0205776954234663e-05,
"loss": 1.5318,
"step": 1249
},
{
"epoch": 0.605913717886573,
"grad_norm": 1.3617873191833496,
"learning_rate": 2.0164252524403263e-05,
"loss": 1.3624,
"step": 1250
},
{
"epoch": 0.6063984488608822,
"grad_norm": 1.3958122730255127,
"learning_rate": 2.0122741949596797e-05,
"loss": 1.0867,
"step": 1251
},
{
"epoch": 0.6068831798351915,
"grad_norm": 1.5380889177322388,
"learning_rate": 2.0081245348748286e-05,
"loss": 1.7665,
"step": 1252
},
{
"epoch": 0.6073679108095007,
"grad_norm": 1.4936906099319458,
"learning_rate": 2.0039762840750707e-05,
"loss": 1.1883,
"step": 1253
},
{
"epoch": 0.6078526417838099,
"grad_norm": 1.3765724897384644,
"learning_rate": 1.999829454445667e-05,
"loss": 1.2834,
"step": 1254
},
{
"epoch": 0.6083373727581193,
"grad_norm": 1.4031294584274292,
"learning_rate": 1.995684057867806e-05,
"loss": 1.2124,
"step": 1255
},
{
"epoch": 0.6088221037324285,
"grad_norm": 1.514238715171814,
"learning_rate": 1.991540106218572e-05,
"loss": 1.6542,
"step": 1256
},
{
"epoch": 0.6093068347067377,
"grad_norm": 1.4218300580978394,
"learning_rate": 1.9873976113709048e-05,
"loss": 1.6589,
"step": 1257
},
{
"epoch": 0.609791565681047,
"grad_norm": 1.5005664825439453,
"learning_rate": 1.983256585193575e-05,
"loss": 1.445,
"step": 1258
},
{
"epoch": 0.6102762966553563,
"grad_norm": 1.571577548980713,
"learning_rate": 1.979117039551144e-05,
"loss": 1.7245,
"step": 1259
},
{
"epoch": 0.6107610276296656,
"grad_norm": 1.4720330238342285,
"learning_rate": 1.9749789863039297e-05,
"loss": 1.4244,
"step": 1260
},
{
"epoch": 0.6112457586039748,
"grad_norm": 1.5714093446731567,
"learning_rate": 1.970842437307976e-05,
"loss": 1.6333,
"step": 1261
},
{
"epoch": 0.611730489578284,
"grad_norm": 1.4255337715148926,
"learning_rate": 1.9667074044150165e-05,
"loss": 1.5165,
"step": 1262
},
{
"epoch": 0.6122152205525934,
"grad_norm": 1.590695858001709,
"learning_rate": 1.96257389947244e-05,
"loss": 1.33,
"step": 1263
},
{
"epoch": 0.6126999515269026,
"grad_norm": 1.4347730875015259,
"learning_rate": 1.9584419343232584e-05,
"loss": 1.3907,
"step": 1264
},
{
"epoch": 0.6131846825012118,
"grad_norm": 1.46970796585083,
"learning_rate": 1.954311520806072e-05,
"loss": 1.3961,
"step": 1265
},
{
"epoch": 0.6136694134755211,
"grad_norm": 1.3484935760498047,
"learning_rate": 1.9501826707550366e-05,
"loss": 1.4027,
"step": 1266
},
{
"epoch": 0.6141541444498303,
"grad_norm": 1.3133972883224487,
"learning_rate": 1.9460553959998244e-05,
"loss": 1.3246,
"step": 1267
},
{
"epoch": 0.6146388754241396,
"grad_norm": 1.3001095056533813,
"learning_rate": 1.9419297083655976e-05,
"loss": 1.3312,
"step": 1268
},
{
"epoch": 0.6151236063984489,
"grad_norm": 1.4388257265090942,
"learning_rate": 1.937805619672971e-05,
"loss": 1.4682,
"step": 1269
},
{
"epoch": 0.6156083373727581,
"grad_norm": 1.4479858875274658,
"learning_rate": 1.9336831417379777e-05,
"loss": 1.4709,
"step": 1270
},
{
"epoch": 0.6160930683470673,
"grad_norm": 1.2688299417495728,
"learning_rate": 1.9295622863720356e-05,
"loss": 0.9973,
"step": 1271
},
{
"epoch": 0.6165777993213767,
"grad_norm": 1.4602196216583252,
"learning_rate": 1.9254430653819127e-05,
"loss": 1.5403,
"step": 1272
},
{
"epoch": 0.6170625302956859,
"grad_norm": 1.4173598289489746,
"learning_rate": 1.9213254905696964e-05,
"loss": 1.3238,
"step": 1273
},
{
"epoch": 0.6175472612699952,
"grad_norm": 1.3910014629364014,
"learning_rate": 1.9172095737327566e-05,
"loss": 1.347,
"step": 1274
},
{
"epoch": 0.6180319922443044,
"grad_norm": 1.4783943891525269,
"learning_rate": 1.9130953266637127e-05,
"loss": 1.6262,
"step": 1275
},
{
"epoch": 0.6185167232186136,
"grad_norm": 1.5285893678665161,
"learning_rate": 1.9089827611504013e-05,
"loss": 1.624,
"step": 1276
},
{
"epoch": 0.619001454192923,
"grad_norm": 1.4484494924545288,
"learning_rate": 1.9048718889758375e-05,
"loss": 1.3973,
"step": 1277
},
{
"epoch": 0.6194861851672322,
"grad_norm": 1.3599704504013062,
"learning_rate": 1.900762721918189e-05,
"loss": 1.3223,
"step": 1278
},
{
"epoch": 0.6199709161415414,
"grad_norm": 1.4407846927642822,
"learning_rate": 1.8966552717507364e-05,
"loss": 1.2713,
"step": 1279
},
{
"epoch": 0.6204556471158507,
"grad_norm": 1.4449760913848877,
"learning_rate": 1.8925495502418406e-05,
"loss": 1.6689,
"step": 1280
},
{
"epoch": 0.62094037809016,
"grad_norm": 1.5764880180358887,
"learning_rate": 1.8884455691549105e-05,
"loss": 1.5437,
"step": 1281
},
{
"epoch": 0.6214251090644692,
"grad_norm": 1.5581961870193481,
"learning_rate": 1.8843433402483683e-05,
"loss": 1.4253,
"step": 1282
},
{
"epoch": 0.6219098400387785,
"grad_norm": 1.384125828742981,
"learning_rate": 1.8802428752756172e-05,
"loss": 1.4686,
"step": 1283
},
{
"epoch": 0.6223945710130877,
"grad_norm": 1.4908874034881592,
"learning_rate": 1.876144185985003e-05,
"loss": 1.3817,
"step": 1284
},
{
"epoch": 0.622879301987397,
"grad_norm": 1.441921591758728,
"learning_rate": 1.8720472841197884e-05,
"loss": 1.1528,
"step": 1285
},
{
"epoch": 0.6233640329617063,
"grad_norm": 5.794642925262451,
"learning_rate": 1.867952181418111e-05,
"loss": 1.5045,
"step": 1286
},
{
"epoch": 0.6238487639360155,
"grad_norm": 1.430221676826477,
"learning_rate": 1.8638588896129557e-05,
"loss": 1.3444,
"step": 1287
},
{
"epoch": 0.6243334949103247,
"grad_norm": 1.3817723989486694,
"learning_rate": 1.8597674204321185e-05,
"loss": 1.4123,
"step": 1288
},
{
"epoch": 0.624818225884634,
"grad_norm": 1.4198659658432007,
"learning_rate": 1.8556777855981737e-05,
"loss": 1.76,
"step": 1289
},
{
"epoch": 0.6253029568589433,
"grad_norm": 1.387041449546814,
"learning_rate": 1.85158999682844e-05,
"loss": 1.4046,
"step": 1290
},
{
"epoch": 0.6257876878332526,
"grad_norm": 1.4337729215621948,
"learning_rate": 1.8475040658349454e-05,
"loss": 1.6913,
"step": 1291
},
{
"epoch": 0.6262724188075618,
"grad_norm": 1.2845624685287476,
"learning_rate": 1.843420004324397e-05,
"loss": 1.3359,
"step": 1292
},
{
"epoch": 0.626757149781871,
"grad_norm": 1.3890070915222168,
"learning_rate": 1.839337823998145e-05,
"loss": 1.3585,
"step": 1293
},
{
"epoch": 0.6272418807561804,
"grad_norm": 1.6244319677352905,
"learning_rate": 1.8352575365521503e-05,
"loss": 1.302,
"step": 1294
},
{
"epoch": 0.6277266117304896,
"grad_norm": 1.373167634010315,
"learning_rate": 1.8311791536769483e-05,
"loss": 1.3026,
"step": 1295
},
{
"epoch": 0.6282113427047988,
"grad_norm": 1.3994495868682861,
"learning_rate": 1.8271026870576197e-05,
"loss": 1.273,
"step": 1296
},
{
"epoch": 0.6286960736791081,
"grad_norm": 1.373883843421936,
"learning_rate": 1.8230281483737537e-05,
"loss": 1.5305,
"step": 1297
},
{
"epoch": 0.6291808046534173,
"grad_norm": 1.4813038110733032,
"learning_rate": 1.818955549299418e-05,
"loss": 1.244,
"step": 1298
},
{
"epoch": 0.6296655356277266,
"grad_norm": 1.3685561418533325,
"learning_rate": 1.8148849015031195e-05,
"loss": 1.4916,
"step": 1299
},
{
"epoch": 0.6301502666020359,
"grad_norm": 1.4443938732147217,
"learning_rate": 1.8108162166477766e-05,
"loss": 1.2453,
"step": 1300
},
{
"epoch": 0.6306349975763451,
"grad_norm": 1.4075669050216675,
"learning_rate": 1.806749506390684e-05,
"loss": 1.5644,
"step": 1301
},
{
"epoch": 0.6311197285506543,
"grad_norm": 1.4213589429855347,
"learning_rate": 1.802684782383478e-05,
"loss": 1.3007,
"step": 1302
},
{
"epoch": 0.6316044595249637,
"grad_norm": 1.3473411798477173,
"learning_rate": 1.798622056272104e-05,
"loss": 1.1438,
"step": 1303
},
{
"epoch": 0.6320891904992729,
"grad_norm": 1.3807307481765747,
"learning_rate": 1.7945613396967837e-05,
"loss": 1.3414,
"step": 1304
},
{
"epoch": 0.6325739214735822,
"grad_norm": 1.6303166151046753,
"learning_rate": 1.790502644291982e-05,
"loss": 1.7151,
"step": 1305
},
{
"epoch": 0.6330586524478914,
"grad_norm": 1.4952675104141235,
"learning_rate": 1.78644598168637e-05,
"loss": 1.66,
"step": 1306
},
{
"epoch": 0.6335433834222006,
"grad_norm": 1.3126227855682373,
"learning_rate": 1.7823913635027973e-05,
"loss": 1.1817,
"step": 1307
},
{
"epoch": 0.63402811439651,
"grad_norm": 1.4245887994766235,
"learning_rate": 1.7783388013582553e-05,
"loss": 1.4009,
"step": 1308
},
{
"epoch": 0.6345128453708192,
"grad_norm": 1.420230507850647,
"learning_rate": 1.7742883068638447e-05,
"loss": 1.408,
"step": 1309
},
{
"epoch": 0.6349975763451284,
"grad_norm": 1.3932969570159912,
"learning_rate": 1.770239891624741e-05,
"loss": 1.5593,
"step": 1310
},
{
"epoch": 0.6354823073194377,
"grad_norm": 1.392712116241455,
"learning_rate": 1.7661935672401632e-05,
"loss": 1.2642,
"step": 1311
},
{
"epoch": 0.635967038293747,
"grad_norm": 1.4296149015426636,
"learning_rate": 1.7621493453033405e-05,
"loss": 1.4418,
"step": 1312
},
{
"epoch": 0.6364517692680562,
"grad_norm": 1.367656946182251,
"learning_rate": 1.7581072374014777e-05,
"loss": 1.3111,
"step": 1313
},
{
"epoch": 0.6369365002423655,
"grad_norm": 1.353005290031433,
"learning_rate": 1.7540672551157227e-05,
"loss": 1.1314,
"step": 1314
},
{
"epoch": 0.6374212312166747,
"grad_norm": 1.514885663986206,
"learning_rate": 1.7500294100211315e-05,
"loss": 1.265,
"step": 1315
},
{
"epoch": 0.637905962190984,
"grad_norm": 1.365065336227417,
"learning_rate": 1.7459937136866392e-05,
"loss": 1.3412,
"step": 1316
},
{
"epoch": 0.6383906931652933,
"grad_norm": 1.6574945449829102,
"learning_rate": 1.7419601776750237e-05,
"loss": 1.5467,
"step": 1317
},
{
"epoch": 0.6388754241396025,
"grad_norm": 1.4217700958251953,
"learning_rate": 1.737928813542873e-05,
"loss": 1.5412,
"step": 1318
},
{
"epoch": 0.6393601551139118,
"grad_norm": 1.3700860738754272,
"learning_rate": 1.7338996328405526e-05,
"loss": 1.3197,
"step": 1319
},
{
"epoch": 0.639844886088221,
"grad_norm": 1.380444049835205,
"learning_rate": 1.7298726471121723e-05,
"loss": 1.6721,
"step": 1320
},
{
"epoch": 0.6403296170625303,
"grad_norm": 1.3262513875961304,
"learning_rate": 1.725847867895553e-05,
"loss": 1.274,
"step": 1321
},
{
"epoch": 0.6408143480368396,
"grad_norm": 1.5101852416992188,
"learning_rate": 1.7218253067221933e-05,
"loss": 1.4451,
"step": 1322
},
{
"epoch": 0.6412990790111488,
"grad_norm": 1.5045435428619385,
"learning_rate": 1.7178049751172366e-05,
"loss": 1.5808,
"step": 1323
},
{
"epoch": 0.641783809985458,
"grad_norm": 1.5001122951507568,
"learning_rate": 1.7137868845994397e-05,
"loss": 1.3329,
"step": 1324
},
{
"epoch": 0.6422685409597674,
"grad_norm": 1.3572721481323242,
"learning_rate": 1.709771046681137e-05,
"loss": 1.0717,
"step": 1325
},
{
"epoch": 0.6427532719340766,
"grad_norm": 1.542609453201294,
"learning_rate": 1.7057574728682095e-05,
"loss": 1.3739,
"step": 1326
},
{
"epoch": 0.6432380029083858,
"grad_norm": 1.4278963804244995,
"learning_rate": 1.7017461746600506e-05,
"loss": 1.37,
"step": 1327
},
{
"epoch": 0.6437227338826951,
"grad_norm": 1.5175418853759766,
"learning_rate": 1.6977371635495347e-05,
"loss": 1.9564,
"step": 1328
},
{
"epoch": 0.6442074648570043,
"grad_norm": 1.5065232515335083,
"learning_rate": 1.6937304510229834e-05,
"loss": 1.6646,
"step": 1329
},
{
"epoch": 0.6446921958313137,
"grad_norm": 1.5196707248687744,
"learning_rate": 1.6897260485601318e-05,
"loss": 1.4758,
"step": 1330
},
{
"epoch": 0.6451769268056229,
"grad_norm": 1.4596264362335205,
"learning_rate": 1.685723967634097e-05,
"loss": 1.3208,
"step": 1331
},
{
"epoch": 0.6456616577799321,
"grad_norm": 1.43583345413208,
"learning_rate": 1.681724219711344e-05,
"loss": 1.5051,
"step": 1332
},
{
"epoch": 0.6461463887542414,
"grad_norm": 1.985482931137085,
"learning_rate": 1.6777268162516548e-05,
"loss": 1.5165,
"step": 1333
},
{
"epoch": 0.6466311197285507,
"grad_norm": 1.4382468461990356,
"learning_rate": 1.6737317687080922e-05,
"loss": 1.889,
"step": 1334
},
{
"epoch": 0.6471158507028599,
"grad_norm": 1.4122154712677002,
"learning_rate": 1.6697390885269705e-05,
"loss": 1.3209,
"step": 1335
},
{
"epoch": 0.6476005816771692,
"grad_norm": 1.277051568031311,
"learning_rate": 1.6657487871478212e-05,
"loss": 1.1666,
"step": 1336
},
{
"epoch": 0.6480853126514784,
"grad_norm": 1.3974394798278809,
"learning_rate": 1.661760876003358e-05,
"loss": 1.6071,
"step": 1337
},
{
"epoch": 0.6485700436257877,
"grad_norm": 1.503825068473816,
"learning_rate": 1.65777536651945e-05,
"loss": 1.1412,
"step": 1338
},
{
"epoch": 0.649054774600097,
"grad_norm": 1.4791207313537598,
"learning_rate": 1.6537922701150828e-05,
"loss": 1.2004,
"step": 1339
},
{
"epoch": 0.6495395055744062,
"grad_norm": 1.4980095624923706,
"learning_rate": 1.6498115982023285e-05,
"loss": 1.5021,
"step": 1340
},
{
"epoch": 0.6500242365487154,
"grad_norm": 1.372834324836731,
"learning_rate": 1.645833362186313e-05,
"loss": 1.318,
"step": 1341
},
{
"epoch": 0.6505089675230247,
"grad_norm": 1.4026696681976318,
"learning_rate": 1.6418575734651832e-05,
"loss": 1.4896,
"step": 1342
},
{
"epoch": 0.650993698497334,
"grad_norm": 1.454119324684143,
"learning_rate": 1.6378842434300746e-05,
"loss": 1.3353,
"step": 1343
},
{
"epoch": 0.6514784294716433,
"grad_norm": 1.4449537992477417,
"learning_rate": 1.633913383465076e-05,
"loss": 1.7903,
"step": 1344
},
{
"epoch": 0.6519631604459525,
"grad_norm": 1.4829782247543335,
"learning_rate": 1.6299450049472022e-05,
"loss": 1.4147,
"step": 1345
},
{
"epoch": 0.6524478914202617,
"grad_norm": 1.4531413316726685,
"learning_rate": 1.6259791192463557e-05,
"loss": 1.0013,
"step": 1346
},
{
"epoch": 0.6529326223945711,
"grad_norm": 1.5042320489883423,
"learning_rate": 1.6220157377252994e-05,
"loss": 1.7003,
"step": 1347
},
{
"epoch": 0.6534173533688803,
"grad_norm": 1.499081015586853,
"learning_rate": 1.6180548717396198e-05,
"loss": 1.4914,
"step": 1348
},
{
"epoch": 0.6539020843431895,
"grad_norm": 1.2727124691009521,
"learning_rate": 1.6140965326376954e-05,
"loss": 1.0353,
"step": 1349
},
{
"epoch": 0.6543868153174988,
"grad_norm": 1.4354982376098633,
"learning_rate": 1.6101407317606666e-05,
"loss": 1.2793,
"step": 1350
},
{
"epoch": 0.654871546291808,
"grad_norm": 1.458020806312561,
"learning_rate": 1.6061874804424e-05,
"loss": 1.502,
"step": 1351
},
{
"epoch": 0.6553562772661173,
"grad_norm": 1.4475706815719604,
"learning_rate": 1.602236790009458e-05,
"loss": 1.6151,
"step": 1352
},
{
"epoch": 0.6558410082404266,
"grad_norm": 1.4461567401885986,
"learning_rate": 1.5982886717810676e-05,
"loss": 1.6717,
"step": 1353
},
{
"epoch": 0.6563257392147358,
"grad_norm": 1.4085007905960083,
"learning_rate": 1.5943431370690815e-05,
"loss": 1.4512,
"step": 1354
},
{
"epoch": 0.656810470189045,
"grad_norm": 1.3767906427383423,
"learning_rate": 1.590400197177954e-05,
"loss": 1.2589,
"step": 1355
},
{
"epoch": 0.6572952011633544,
"grad_norm": 1.5227607488632202,
"learning_rate": 1.5864598634047046e-05,
"loss": 1.4397,
"step": 1356
},
{
"epoch": 0.6577799321376636,
"grad_norm": 1.4575637578964233,
"learning_rate": 1.5825221470388847e-05,
"loss": 1.3085,
"step": 1357
},
{
"epoch": 0.6582646631119728,
"grad_norm": 1.2845321893692017,
"learning_rate": 1.5785870593625472e-05,
"loss": 1.3451,
"step": 1358
},
{
"epoch": 0.6587493940862821,
"grad_norm": 1.3598212003707886,
"learning_rate": 1.574654611650214e-05,
"loss": 1.4667,
"step": 1359
},
{
"epoch": 0.6592341250605914,
"grad_norm": 1.384535551071167,
"learning_rate": 1.5707248151688424e-05,
"loss": 1.3087,
"step": 1360
},
{
"epoch": 0.6597188560349007,
"grad_norm": 1.3146976232528687,
"learning_rate": 1.5667976811777932e-05,
"loss": 1.4136,
"step": 1361
},
{
"epoch": 0.6602035870092099,
"grad_norm": 1.5279932022094727,
"learning_rate": 1.5628732209287993e-05,
"loss": 1.3477,
"step": 1362
},
{
"epoch": 0.6606883179835191,
"grad_norm": 1.3585039377212524,
"learning_rate": 1.558951445665935e-05,
"loss": 1.3886,
"step": 1363
},
{
"epoch": 0.6611730489578284,
"grad_norm": 1.3408361673355103,
"learning_rate": 1.555032366625577e-05,
"loss": 1.677,
"step": 1364
},
{
"epoch": 0.6616577799321377,
"grad_norm": 1.3672118186950684,
"learning_rate": 1.5511159950363814e-05,
"loss": 1.3577,
"step": 1365
},
{
"epoch": 0.6621425109064469,
"grad_norm": 1.3929879665374756,
"learning_rate": 1.5472023421192445e-05,
"loss": 1.3506,
"step": 1366
},
{
"epoch": 0.6626272418807562,
"grad_norm": 1.4646409749984741,
"learning_rate": 1.5432914190872757e-05,
"loss": 1.6497,
"step": 1367
},
{
"epoch": 0.6631119728550654,
"grad_norm": 1.426300287246704,
"learning_rate": 1.539383237145761e-05,
"loss": 1.3643,
"step": 1368
},
{
"epoch": 0.6635967038293747,
"grad_norm": 1.4488145112991333,
"learning_rate": 1.5354778074921332e-05,
"loss": 1.2896,
"step": 1369
},
{
"epoch": 0.664081434803684,
"grad_norm": 1.503942847251892,
"learning_rate": 1.5315751413159394e-05,
"loss": 1.5616,
"step": 1370
},
{
"epoch": 0.6645661657779932,
"grad_norm": 1.4608261585235596,
"learning_rate": 1.52767524979881e-05,
"loss": 1.3562,
"step": 1371
},
{
"epoch": 0.6650508967523024,
"grad_norm": 1.4247395992279053,
"learning_rate": 1.5237781441144256e-05,
"loss": 1.4961,
"step": 1372
},
{
"epoch": 0.6655356277266117,
"grad_norm": 1.3943641185760498,
"learning_rate": 1.5198838354284817e-05,
"loss": 1.5362,
"step": 1373
},
{
"epoch": 0.666020358700921,
"grad_norm": 1.42288339138031,
"learning_rate": 1.515992334898664e-05,
"loss": 1.2119,
"step": 1374
},
{
"epoch": 0.6665050896752303,
"grad_norm": 1.3145508766174316,
"learning_rate": 1.512103653674612e-05,
"loss": 1.6042,
"step": 1375
},
{
"epoch": 0.6669898206495395,
"grad_norm": 1.4619483947753906,
"learning_rate": 1.5082178028978853e-05,
"loss": 1.4172,
"step": 1376
},
{
"epoch": 0.6674745516238487,
"grad_norm": 1.375780463218689,
"learning_rate": 1.5043347937019358e-05,
"loss": 1.4588,
"step": 1377
},
{
"epoch": 0.6679592825981581,
"grad_norm": 1.339187502861023,
"learning_rate": 1.5004546372120736e-05,
"loss": 1.4457,
"step": 1378
},
{
"epoch": 0.6684440135724673,
"grad_norm": 1.310111165046692,
"learning_rate": 1.4965773445454349e-05,
"loss": 1.1335,
"step": 1379
},
{
"epoch": 0.6689287445467765,
"grad_norm": 1.339221477508545,
"learning_rate": 1.492702926810951e-05,
"loss": 1.2751,
"step": 1380
},
{
"epoch": 0.6694134755210858,
"grad_norm": 6.000275611877441,
"learning_rate": 1.4888313951093169e-05,
"loss": 1.1046,
"step": 1381
},
{
"epoch": 0.6698982064953951,
"grad_norm": 1.5101951360702515,
"learning_rate": 1.4849627605329583e-05,
"loss": 1.496,
"step": 1382
},
{
"epoch": 0.6703829374697043,
"grad_norm": 1.4664740562438965,
"learning_rate": 1.481097034165998e-05,
"loss": 1.5279,
"step": 1383
},
{
"epoch": 0.6708676684440136,
"grad_norm": 1.461575984954834,
"learning_rate": 1.4772342270842299e-05,
"loss": 1.3002,
"step": 1384
},
{
"epoch": 0.6713523994183228,
"grad_norm": 1.4403537511825562,
"learning_rate": 1.4733743503550818e-05,
"loss": 1.7751,
"step": 1385
},
{
"epoch": 0.671837130392632,
"grad_norm": 1.3830691576004028,
"learning_rate": 1.4695174150375865e-05,
"loss": 1.2238,
"step": 1386
},
{
"epoch": 0.6723218613669414,
"grad_norm": 1.3348417282104492,
"learning_rate": 1.4656634321823493e-05,
"loss": 1.3973,
"step": 1387
},
{
"epoch": 0.6728065923412506,
"grad_norm": 1.3746894598007202,
"learning_rate": 1.461812412831515e-05,
"loss": 1.114,
"step": 1388
},
{
"epoch": 0.6732913233155599,
"grad_norm": 1.3570497035980225,
"learning_rate": 1.457964368018739e-05,
"loss": 1.4,
"step": 1389
},
{
"epoch": 0.6737760542898691,
"grad_norm": 1.3715555667877197,
"learning_rate": 1.4541193087691535e-05,
"loss": 1.2383,
"step": 1390
},
{
"epoch": 0.6742607852641784,
"grad_norm": 1.3852076530456543,
"learning_rate": 1.4502772460993385e-05,
"loss": 1.2747,
"step": 1391
},
{
"epoch": 0.6747455162384877,
"grad_norm": 1.203576683998108,
"learning_rate": 1.4464381910172858e-05,
"loss": 0.9592,
"step": 1392
},
{
"epoch": 0.6752302472127969,
"grad_norm": 1.6935615539550781,
"learning_rate": 1.4426021545223712e-05,
"loss": 1.7171,
"step": 1393
},
{
"epoch": 0.6757149781871061,
"grad_norm": 1.4629912376403809,
"learning_rate": 1.438769147605322e-05,
"loss": 1.4803,
"step": 1394
},
{
"epoch": 0.6761997091614154,
"grad_norm": 1.3970519304275513,
"learning_rate": 1.434939181248184e-05,
"loss": 1.5205,
"step": 1395
},
{
"epoch": 0.6766844401357247,
"grad_norm": 1.4061120748519897,
"learning_rate": 1.4311122664242954e-05,
"loss": 1.6476,
"step": 1396
},
{
"epoch": 0.6771691711100339,
"grad_norm": 1.41211998462677,
"learning_rate": 1.4272884140982462e-05,
"loss": 1.456,
"step": 1397
},
{
"epoch": 0.6776539020843432,
"grad_norm": 1.3315012454986572,
"learning_rate": 1.423467635225856e-05,
"loss": 1.4234,
"step": 1398
},
{
"epoch": 0.6781386330586524,
"grad_norm": 1.5095267295837402,
"learning_rate": 1.4196499407541359e-05,
"loss": 1.4583,
"step": 1399
},
{
"epoch": 0.6786233640329618,
"grad_norm": 1.3793443441390991,
"learning_rate": 1.4158353416212622e-05,
"loss": 1.2173,
"step": 1400
},
{
"epoch": 0.679108095007271,
"grad_norm": 1.3824712038040161,
"learning_rate": 1.4120238487565402e-05,
"loss": 1.3028,
"step": 1401
},
{
"epoch": 0.6795928259815802,
"grad_norm": 1.375754952430725,
"learning_rate": 1.4082154730803774e-05,
"loss": 1.2412,
"step": 1402
},
{
"epoch": 0.6800775569558895,
"grad_norm": 1.4535313844680786,
"learning_rate": 1.4044102255042475e-05,
"loss": 1.5615,
"step": 1403
},
{
"epoch": 0.6805622879301988,
"grad_norm": 1.379002332687378,
"learning_rate": 1.4006081169306656e-05,
"loss": 1.5593,
"step": 1404
},
{
"epoch": 0.681047018904508,
"grad_norm": 1.3821449279785156,
"learning_rate": 1.3968091582531495e-05,
"loss": 1.2261,
"step": 1405
},
{
"epoch": 0.6815317498788173,
"grad_norm": 1.4238389730453491,
"learning_rate": 1.3930133603561957e-05,
"loss": 1.2891,
"step": 1406
},
{
"epoch": 0.6820164808531265,
"grad_norm": 1.3251632452011108,
"learning_rate": 1.3892207341152416e-05,
"loss": 1.5867,
"step": 1407
},
{
"epoch": 0.6825012118274357,
"grad_norm": 3.028153896331787,
"learning_rate": 1.3854312903966377e-05,
"loss": 1.4604,
"step": 1408
},
{
"epoch": 0.6829859428017451,
"grad_norm": 1.363827109336853,
"learning_rate": 1.381645040057619e-05,
"loss": 1.1675,
"step": 1409
},
{
"epoch": 0.6834706737760543,
"grad_norm": 1.4435791969299316,
"learning_rate": 1.3778619939462667e-05,
"loss": 1.6013,
"step": 1410
},
{
"epoch": 0.6839554047503635,
"grad_norm": 1.4359902143478394,
"learning_rate": 1.3740821629014874e-05,
"loss": 1.6007,
"step": 1411
},
{
"epoch": 0.6844401357246728,
"grad_norm": 1.3930972814559937,
"learning_rate": 1.3703055577529686e-05,
"loss": 1.479,
"step": 1412
},
{
"epoch": 0.6849248666989821,
"grad_norm": 1.5467122793197632,
"learning_rate": 1.3665321893211618e-05,
"loss": 1.3401,
"step": 1413
},
{
"epoch": 0.6854095976732913,
"grad_norm": 1.3341065645217896,
"learning_rate": 1.3627620684172407e-05,
"loss": 1.4378,
"step": 1414
},
{
"epoch": 0.6858943286476006,
"grad_norm": 1.5038310289382935,
"learning_rate": 1.3589952058430778e-05,
"loss": 1.3485,
"step": 1415
},
{
"epoch": 0.6863790596219098,
"grad_norm": 1.3664665222167969,
"learning_rate": 1.3552316123912063e-05,
"loss": 1.4734,
"step": 1416
},
{
"epoch": 0.686863790596219,
"grad_norm": 1.4052238464355469,
"learning_rate": 1.3514712988447972e-05,
"loss": 1.3877,
"step": 1417
},
{
"epoch": 0.6873485215705284,
"grad_norm": 1.478548526763916,
"learning_rate": 1.3477142759776207e-05,
"loss": 1.6317,
"step": 1418
},
{
"epoch": 0.6878332525448376,
"grad_norm": 1.338326096534729,
"learning_rate": 1.343960554554019e-05,
"loss": 1.3528,
"step": 1419
},
{
"epoch": 0.6883179835191469,
"grad_norm": 1.3133000135421753,
"learning_rate": 1.3402101453288785e-05,
"loss": 1.3344,
"step": 1420
},
{
"epoch": 0.6888027144934561,
"grad_norm": 1.4202029705047607,
"learning_rate": 1.3364630590475923e-05,
"loss": 1.3458,
"step": 1421
},
{
"epoch": 0.6892874454677654,
"grad_norm": 1.3237619400024414,
"learning_rate": 1.3327193064460342e-05,
"loss": 1.2891,
"step": 1422
},
{
"epoch": 0.6897721764420747,
"grad_norm": 1.4759186506271362,
"learning_rate": 1.328978898250525e-05,
"loss": 1.7053,
"step": 1423
},
{
"epoch": 0.6902569074163839,
"grad_norm": 1.4690028429031372,
"learning_rate": 1.325241845177807e-05,
"loss": 1.5294,
"step": 1424
},
{
"epoch": 0.6907416383906931,
"grad_norm": 1.433915376663208,
"learning_rate": 1.3215081579350058e-05,
"loss": 1.2603,
"step": 1425
},
{
"epoch": 0.6912263693650024,
"grad_norm": 1.3608239889144897,
"learning_rate": 1.3177778472196068e-05,
"loss": 1.1412,
"step": 1426
},
{
"epoch": 0.6917111003393117,
"grad_norm": 1.4877392053604126,
"learning_rate": 1.3140509237194176e-05,
"loss": 1.8585,
"step": 1427
},
{
"epoch": 0.692195831313621,
"grad_norm": 1.3863829374313354,
"learning_rate": 1.3103273981125447e-05,
"loss": 1.6264,
"step": 1428
},
{
"epoch": 0.6926805622879302,
"grad_norm": 1.4459201097488403,
"learning_rate": 1.3066072810673557e-05,
"loss": 1.4349,
"step": 1429
},
{
"epoch": 0.6931652932622394,
"grad_norm": 1.4497578144073486,
"learning_rate": 1.302890583242457e-05,
"loss": 1.2253,
"step": 1430
},
{
"epoch": 0.6936500242365488,
"grad_norm": 1.4433369636535645,
"learning_rate": 1.2991773152866515e-05,
"loss": 1.1806,
"step": 1431
},
{
"epoch": 0.694134755210858,
"grad_norm": 1.3377454280853271,
"learning_rate": 1.2954674878389223e-05,
"loss": 1.2198,
"step": 1432
},
{
"epoch": 0.6946194861851672,
"grad_norm": 1.4302222728729248,
"learning_rate": 1.2917611115283901e-05,
"loss": 1.4984,
"step": 1433
},
{
"epoch": 0.6951042171594765,
"grad_norm": 1.5178338289260864,
"learning_rate": 1.2880581969742886e-05,
"loss": 1.3589,
"step": 1434
},
{
"epoch": 0.6955889481337858,
"grad_norm": 1.3676424026489258,
"learning_rate": 1.2843587547859361e-05,
"loss": 1.18,
"step": 1435
},
{
"epoch": 0.696073679108095,
"grad_norm": 1.4502981901168823,
"learning_rate": 1.2806627955626982e-05,
"loss": 1.3871,
"step": 1436
},
{
"epoch": 0.6965584100824043,
"grad_norm": 1.550480604171753,
"learning_rate": 1.2769703298939646e-05,
"loss": 1.2154,
"step": 1437
},
{
"epoch": 0.6970431410567135,
"grad_norm": 1.3464187383651733,
"learning_rate": 1.2732813683591121e-05,
"loss": 1.3843,
"step": 1438
},
{
"epoch": 0.6975278720310227,
"grad_norm": 1.508142352104187,
"learning_rate": 1.2695959215274816e-05,
"loss": 1.3424,
"step": 1439
},
{
"epoch": 0.6980126030053321,
"grad_norm": 1.3689768314361572,
"learning_rate": 1.2659139999583414e-05,
"loss": 1.1724,
"step": 1440
},
{
"epoch": 0.6984973339796413,
"grad_norm": 1.5775456428527832,
"learning_rate": 1.2622356142008593e-05,
"loss": 1.2543,
"step": 1441
},
{
"epoch": 0.6989820649539505,
"grad_norm": 1.415062427520752,
"learning_rate": 1.2585607747940729e-05,
"loss": 1.4339,
"step": 1442
},
{
"epoch": 0.6994667959282598,
"grad_norm": 1.458363652229309,
"learning_rate": 1.2548894922668612e-05,
"loss": 1.499,
"step": 1443
},
{
"epoch": 0.6999515269025691,
"grad_norm": 1.39089834690094,
"learning_rate": 1.2512217771379087e-05,
"loss": 1.274,
"step": 1444
},
{
"epoch": 0.7004362578768784,
"grad_norm": 1.365566611289978,
"learning_rate": 1.2475576399156825e-05,
"loss": 1.2721,
"step": 1445
},
{
"epoch": 0.7009209888511876,
"grad_norm": 1.3721204996109009,
"learning_rate": 1.2438970910983957e-05,
"loss": 1.4607,
"step": 1446
},
{
"epoch": 0.7014057198254968,
"grad_norm": 1.3698453903198242,
"learning_rate": 1.2402401411739806e-05,
"loss": 1.5212,
"step": 1447
},
{
"epoch": 0.701890450799806,
"grad_norm": 2.067800521850586,
"learning_rate": 1.2365868006200603e-05,
"loss": 1.5141,
"step": 1448
},
{
"epoch": 0.7023751817741154,
"grad_norm": 1.5352057218551636,
"learning_rate": 1.232937079903914e-05,
"loss": 1.5916,
"step": 1449
},
{
"epoch": 0.7028599127484246,
"grad_norm": 1.3993531465530396,
"learning_rate": 1.2292909894824528e-05,
"loss": 1.3013,
"step": 1450
},
{
"epoch": 0.7033446437227339,
"grad_norm": 1.529046654701233,
"learning_rate": 1.2256485398021808e-05,
"loss": 1.1088,
"step": 1451
},
{
"epoch": 0.7038293746970431,
"grad_norm": 1.5416908264160156,
"learning_rate": 1.222009741299178e-05,
"loss": 1.5297,
"step": 1452
},
{
"epoch": 0.7043141056713524,
"grad_norm": 1.4917817115783691,
"learning_rate": 1.2183746043990577e-05,
"loss": 1.2723,
"step": 1453
},
{
"epoch": 0.7047988366456617,
"grad_norm": 1.3837028741836548,
"learning_rate": 1.2147431395169459e-05,
"loss": 1.2728,
"step": 1454
},
{
"epoch": 0.7052835676199709,
"grad_norm": 1.434131383895874,
"learning_rate": 1.2111153570574454e-05,
"loss": 1.3624,
"step": 1455
},
{
"epoch": 0.7057682985942801,
"grad_norm": 1.430051326751709,
"learning_rate": 1.2074912674146107e-05,
"loss": 1.3779,
"step": 1456
},
{
"epoch": 0.7062530295685895,
"grad_norm": 1.4436354637145996,
"learning_rate": 1.2038708809719137e-05,
"loss": 1.3387,
"step": 1457
},
{
"epoch": 0.7067377605428987,
"grad_norm": 1.3345868587493896,
"learning_rate": 1.2002542081022165e-05,
"loss": 1.2027,
"step": 1458
},
{
"epoch": 0.707222491517208,
"grad_norm": 1.6215115785598755,
"learning_rate": 1.196641259167743e-05,
"loss": 1.6066,
"step": 1459
},
{
"epoch": 0.7077072224915172,
"grad_norm": 1.4360243082046509,
"learning_rate": 1.1930320445200463e-05,
"loss": 1.1322,
"step": 1460
},
{
"epoch": 0.7081919534658264,
"grad_norm": 1.3410156965255737,
"learning_rate": 1.1894265744999802e-05,
"loss": 1.1842,
"step": 1461
},
{
"epoch": 0.7086766844401358,
"grad_norm": 1.6034032106399536,
"learning_rate": 1.185824859437669e-05,
"loss": 1.7069,
"step": 1462
},
{
"epoch": 0.709161415414445,
"grad_norm": 1.647525668144226,
"learning_rate": 1.1822269096524812e-05,
"loss": 1.6953,
"step": 1463
},
{
"epoch": 0.7096461463887542,
"grad_norm": 1.4244221448898315,
"learning_rate": 1.1786327354529941e-05,
"loss": 1.4405,
"step": 1464
},
{
"epoch": 0.7101308773630635,
"grad_norm": 1.4078463315963745,
"learning_rate": 1.1750423471369703e-05,
"loss": 1.2905,
"step": 1465
},
{
"epoch": 0.7106156083373728,
"grad_norm": 1.5614463090896606,
"learning_rate": 1.1714557549913229e-05,
"loss": 1.8514,
"step": 1466
},
{
"epoch": 0.711100339311682,
"grad_norm": 1.4428095817565918,
"learning_rate": 1.1678729692920911e-05,
"loss": 1.2046,
"step": 1467
},
{
"epoch": 0.7115850702859913,
"grad_norm": 1.3544590473175049,
"learning_rate": 1.164294000304406e-05,
"loss": 1.2275,
"step": 1468
},
{
"epoch": 0.7120698012603005,
"grad_norm": 1.4303261041641235,
"learning_rate": 1.1607188582824635e-05,
"loss": 1.3526,
"step": 1469
},
{
"epoch": 0.7125545322346097,
"grad_norm": 1.548387050628662,
"learning_rate": 1.1571475534694951e-05,
"loss": 1.1652,
"step": 1470
},
{
"epoch": 0.7130392632089191,
"grad_norm": 1.4661262035369873,
"learning_rate": 1.1535800960977397e-05,
"loss": 1.7172,
"step": 1471
},
{
"epoch": 0.7135239941832283,
"grad_norm": 1.3347164392471313,
"learning_rate": 1.1500164963884107e-05,
"loss": 1.397,
"step": 1472
},
{
"epoch": 0.7140087251575375,
"grad_norm": 1.4761581420898438,
"learning_rate": 1.146456764551669e-05,
"loss": 1.7281,
"step": 1473
},
{
"epoch": 0.7144934561318468,
"grad_norm": 1.443963646888733,
"learning_rate": 1.142900910786596e-05,
"loss": 1.5084,
"step": 1474
},
{
"epoch": 0.7149781871061561,
"grad_norm": 1.5460129976272583,
"learning_rate": 1.139348945281158e-05,
"loss": 1.4503,
"step": 1475
},
{
"epoch": 0.7154629180804654,
"grad_norm": 1.3878977298736572,
"learning_rate": 1.1358008782121848e-05,
"loss": 1.4604,
"step": 1476
},
{
"epoch": 0.7159476490547746,
"grad_norm": 1.199691653251648,
"learning_rate": 1.1322567197453338e-05,
"loss": 1.0213,
"step": 1477
},
{
"epoch": 0.7164323800290838,
"grad_norm": 1.3305445909500122,
"learning_rate": 1.128716480035066e-05,
"loss": 1.3534,
"step": 1478
},
{
"epoch": 0.7169171110033932,
"grad_norm": 1.2963858842849731,
"learning_rate": 1.125180169224613e-05,
"loss": 1.1662,
"step": 1479
},
{
"epoch": 0.7174018419777024,
"grad_norm": 1.676682472229004,
"learning_rate": 1.1216477974459505e-05,
"loss": 1.652,
"step": 1480
},
{
"epoch": 0.7178865729520116,
"grad_norm": 1.3726409673690796,
"learning_rate": 1.1181193748197667e-05,
"loss": 1.2268,
"step": 1481
},
{
"epoch": 0.7183713039263209,
"grad_norm": 1.3873904943466187,
"learning_rate": 1.114594911455438e-05,
"loss": 1.2706,
"step": 1482
},
{
"epoch": 0.7188560349006301,
"grad_norm": 1.3303292989730835,
"learning_rate": 1.1110744174509952e-05,
"loss": 1.256,
"step": 1483
},
{
"epoch": 0.7193407658749394,
"grad_norm": 1.3690974712371826,
"learning_rate": 1.107557902893095e-05,
"loss": 1.1057,
"step": 1484
},
{
"epoch": 0.7198254968492487,
"grad_norm": 1.5268480777740479,
"learning_rate": 1.1040453778569961e-05,
"loss": 1.3311,
"step": 1485
},
{
"epoch": 0.7203102278235579,
"grad_norm": 1.5212996006011963,
"learning_rate": 1.100536852406523e-05,
"loss": 1.3117,
"step": 1486
},
{
"epoch": 0.7207949587978671,
"grad_norm": 1.370766282081604,
"learning_rate": 1.0970323365940444e-05,
"loss": 1.4976,
"step": 1487
},
{
"epoch": 0.7212796897721765,
"grad_norm": 1.3523683547973633,
"learning_rate": 1.0935318404604375e-05,
"loss": 1.365,
"step": 1488
},
{
"epoch": 0.7217644207464857,
"grad_norm": 1.9818627834320068,
"learning_rate": 1.090035374035065e-05,
"loss": 1.1948,
"step": 1489
},
{
"epoch": 0.722249151720795,
"grad_norm": 1.465571641921997,
"learning_rate": 1.0865429473357414e-05,
"loss": 1.6397,
"step": 1490
},
{
"epoch": 0.7227338826951042,
"grad_norm": 1.360374093055725,
"learning_rate": 1.0830545703687109e-05,
"loss": 1.3702,
"step": 1491
},
{
"epoch": 0.7232186136694134,
"grad_norm": 1.2991136312484741,
"learning_rate": 1.0795702531286106e-05,
"loss": 1.2215,
"step": 1492
},
{
"epoch": 0.7237033446437228,
"grad_norm": 1.4738467931747437,
"learning_rate": 1.0760900055984496e-05,
"loss": 1.7089,
"step": 1493
},
{
"epoch": 0.724188075618032,
"grad_norm": 1.5092021226882935,
"learning_rate": 1.0726138377495728e-05,
"loss": 1.5763,
"step": 1494
},
{
"epoch": 0.7246728065923412,
"grad_norm": 1.505959391593933,
"learning_rate": 1.0691417595416407e-05,
"loss": 1.4419,
"step": 1495
},
{
"epoch": 0.7251575375666505,
"grad_norm": 1.417377233505249,
"learning_rate": 1.0656737809225928e-05,
"loss": 1.264,
"step": 1496
},
{
"epoch": 0.7256422685409598,
"grad_norm": 1.3735849857330322,
"learning_rate": 1.0622099118286239e-05,
"loss": 1.3909,
"step": 1497
},
{
"epoch": 0.726126999515269,
"grad_norm": 1.5356085300445557,
"learning_rate": 1.0587501621841558e-05,
"loss": 1.4821,
"step": 1498
},
{
"epoch": 0.7266117304895783,
"grad_norm": 1.3162624835968018,
"learning_rate": 1.0552945419018065e-05,
"loss": 1.2017,
"step": 1499
},
{
"epoch": 0.7270964614638875,
"grad_norm": 1.4844067096710205,
"learning_rate": 1.0518430608823621e-05,
"loss": 1.5073,
"step": 1500
},
{
"epoch": 0.7275811924381969,
"grad_norm": 1.652967929840088,
"learning_rate": 1.0483957290147494e-05,
"loss": 1.5602,
"step": 1501
},
{
"epoch": 0.7280659234125061,
"grad_norm": 1.4265861511230469,
"learning_rate": 1.0449525561760098e-05,
"loss": 1.5582,
"step": 1502
},
{
"epoch": 0.7285506543868153,
"grad_norm": 1.723618984222412,
"learning_rate": 1.041513552231265e-05,
"loss": 1.869,
"step": 1503
},
{
"epoch": 0.7290353853611246,
"grad_norm": 1.3224735260009766,
"learning_rate": 1.0380787270336955e-05,
"loss": 1.3943,
"step": 1504
},
{
"epoch": 0.7295201163354338,
"grad_norm": 1.6832599639892578,
"learning_rate": 1.034648090424506e-05,
"loss": 1.4809,
"step": 1505
},
{
"epoch": 0.7300048473097431,
"grad_norm": 1.4091854095458984,
"learning_rate": 1.0312216522329038e-05,
"loss": 1.426,
"step": 1506
},
{
"epoch": 0.7304895782840524,
"grad_norm": 1.501326560974121,
"learning_rate": 1.0277994222760645e-05,
"loss": 1.6879,
"step": 1507
},
{
"epoch": 0.7309743092583616,
"grad_norm": 1.2879735231399536,
"learning_rate": 1.0243814103591074e-05,
"loss": 1.4322,
"step": 1508
},
{
"epoch": 0.7314590402326708,
"grad_norm": 1.6264989376068115,
"learning_rate": 1.0209676262750658e-05,
"loss": 1.2868,
"step": 1509
},
{
"epoch": 0.7319437712069802,
"grad_norm": 1.4507060050964355,
"learning_rate": 1.0175580798048625e-05,
"loss": 1.5371,
"step": 1510
},
{
"epoch": 0.7324285021812894,
"grad_norm": 1.3660101890563965,
"learning_rate": 1.0141527807172766e-05,
"loss": 1.4603,
"step": 1511
},
{
"epoch": 0.7329132331555986,
"grad_norm": 1.5483052730560303,
"learning_rate": 1.0107517387689166e-05,
"loss": 1.2558,
"step": 1512
},
{
"epoch": 0.7333979641299079,
"grad_norm": 1.5407285690307617,
"learning_rate": 1.0073549637041985e-05,
"loss": 1.3025,
"step": 1513
},
{
"epoch": 0.7338826951042171,
"grad_norm": 1.3726648092269897,
"learning_rate": 1.0039624652553073e-05,
"loss": 1.5513,
"step": 1514
},
{
"epoch": 0.7343674260785265,
"grad_norm": 1.3971195220947266,
"learning_rate": 1.0005742531421805e-05,
"loss": 1.2535,
"step": 1515
},
{
"epoch": 0.7348521570528357,
"grad_norm": 1.4116908311843872,
"learning_rate": 9.9719033707247e-06,
"loss": 1.1999,
"step": 1516
},
{
"epoch": 0.7353368880271449,
"grad_norm": 1.2860015630722046,
"learning_rate": 9.938107267415238e-06,
"loss": 1.4093,
"step": 1517
},
{
"epoch": 0.7358216190014542,
"grad_norm": 1.4017292261123657,
"learning_rate": 9.904354318323474e-06,
"loss": 1.3041,
"step": 1518
},
{
"epoch": 0.7363063499757635,
"grad_norm": 1.479777216911316,
"learning_rate": 9.870644620155877e-06,
"loss": 1.4719,
"step": 1519
},
{
"epoch": 0.7367910809500727,
"grad_norm": 1.596462368965149,
"learning_rate": 9.836978269494956e-06,
"loss": 1.4582,
"step": 1520
},
{
"epoch": 0.737275811924382,
"grad_norm": 1.3571925163269043,
"learning_rate": 9.80335536279906e-06,
"loss": 1.4515,
"step": 1521
},
{
"epoch": 0.7377605428986912,
"grad_norm": 1.3262391090393066,
"learning_rate": 9.76977599640204e-06,
"loss": 1.5619,
"step": 1522
},
{
"epoch": 0.7382452738730004,
"grad_norm": 1.3788524866104126,
"learning_rate": 9.736240266512992e-06,
"loss": 1.2029,
"step": 1523
},
{
"epoch": 0.7387300048473098,
"grad_norm": 1.3858743906021118,
"learning_rate": 9.702748269216021e-06,
"loss": 1.466,
"step": 1524
},
{
"epoch": 0.739214735821619,
"grad_norm": 1.5450936555862427,
"learning_rate": 9.669300100469902e-06,
"loss": 1.4527,
"step": 1525
},
{
"epoch": 0.7396994667959282,
"grad_norm": 1.3684203624725342,
"learning_rate": 9.635895856107855e-06,
"loss": 1.2464,
"step": 1526
},
{
"epoch": 0.7401841977702375,
"grad_norm": 1.3800790309906006,
"learning_rate": 9.60253563183724e-06,
"loss": 1.2337,
"step": 1527
},
{
"epoch": 0.7406689287445468,
"grad_norm": 1.3781445026397705,
"learning_rate": 9.569219523239292e-06,
"loss": 1.4606,
"step": 1528
},
{
"epoch": 0.741153659718856,
"grad_norm": 1.3597383499145508,
"learning_rate": 9.535947625768851e-06,
"loss": 1.2775,
"step": 1529
},
{
"epoch": 0.7416383906931653,
"grad_norm": 1.5373189449310303,
"learning_rate": 9.5027200347541e-06,
"loss": 1.4209,
"step": 1530
},
{
"epoch": 0.7421231216674745,
"grad_norm": 1.3820744752883911,
"learning_rate": 9.46953684539626e-06,
"loss": 1.482,
"step": 1531
},
{
"epoch": 0.7426078526417839,
"grad_norm": 1.2832772731781006,
"learning_rate": 9.436398152769349e-06,
"loss": 0.9302,
"step": 1532
},
{
"epoch": 0.7430925836160931,
"grad_norm": 1.4201370477676392,
"learning_rate": 9.403304051819883e-06,
"loss": 1.6234,
"step": 1533
},
{
"epoch": 0.7435773145904023,
"grad_norm": 1.4880331754684448,
"learning_rate": 9.370254637366638e-06,
"loss": 1.3413,
"step": 1534
},
{
"epoch": 0.7440620455647116,
"grad_norm": 1.386278510093689,
"learning_rate": 9.337250004100337e-06,
"loss": 1.5433,
"step": 1535
},
{
"epoch": 0.7445467765390208,
"grad_norm": 1.4576743841171265,
"learning_rate": 9.304290246583398e-06,
"loss": 1.686,
"step": 1536
},
{
"epoch": 0.7450315075133301,
"grad_norm": 1.4495551586151123,
"learning_rate": 9.271375459249698e-06,
"loss": 1.4784,
"step": 1537
},
{
"epoch": 0.7455162384876394,
"grad_norm": 1.3655641078948975,
"learning_rate": 9.238505736404212e-06,
"loss": 1.4092,
"step": 1538
},
{
"epoch": 0.7460009694619486,
"grad_norm": 1.4185867309570312,
"learning_rate": 9.205681172222854e-06,
"loss": 1.4356,
"step": 1539
},
{
"epoch": 0.7464857004362578,
"grad_norm": 1.4596229791641235,
"learning_rate": 9.172901860752117e-06,
"loss": 1.3854,
"step": 1540
},
{
"epoch": 0.7469704314105672,
"grad_norm": 1.3605375289916992,
"learning_rate": 9.140167895908867e-06,
"loss": 1.283,
"step": 1541
},
{
"epoch": 0.7474551623848764,
"grad_norm": 1.421411156654358,
"learning_rate": 9.107479371480016e-06,
"loss": 1.5658,
"step": 1542
},
{
"epoch": 0.7479398933591856,
"grad_norm": 1.4605516195297241,
"learning_rate": 9.074836381122312e-06,
"loss": 1.5596,
"step": 1543
},
{
"epoch": 0.7484246243334949,
"grad_norm": 1.7730802297592163,
"learning_rate": 9.04223901836202e-06,
"loss": 1.5225,
"step": 1544
},
{
"epoch": 0.7489093553078041,
"grad_norm": 1.4480133056640625,
"learning_rate": 9.009687376594694e-06,
"loss": 1.0663,
"step": 1545
},
{
"epoch": 0.7493940862821135,
"grad_norm": 1.3374335765838623,
"learning_rate": 8.977181549084884e-06,
"loss": 1.0286,
"step": 1546
},
{
"epoch": 0.7498788172564227,
"grad_norm": 1.3053975105285645,
"learning_rate": 8.944721628965868e-06,
"loss": 1.3729,
"step": 1547
},
{
"epoch": 0.7503635482307319,
"grad_norm": 1.3824571371078491,
"learning_rate": 8.912307709239394e-06,
"loss": 1.3485,
"step": 1548
},
{
"epoch": 0.7508482792050412,
"grad_norm": 1.3053250312805176,
"learning_rate": 8.879939882775443e-06,
"loss": 1.314,
"step": 1549
},
{
"epoch": 0.7513330101793505,
"grad_norm": 1.4634974002838135,
"learning_rate": 8.847618242311895e-06,
"loss": 1.3767,
"step": 1550
},
{
"epoch": 0.7518177411536597,
"grad_norm": 1.7195087671279907,
"learning_rate": 8.815342880454311e-06,
"loss": 1.7894,
"step": 1551
},
{
"epoch": 0.752302472127969,
"grad_norm": 1.4599210023880005,
"learning_rate": 8.783113889675679e-06,
"loss": 1.4189,
"step": 1552
},
{
"epoch": 0.7527872031022782,
"grad_norm": 1.5068151950836182,
"learning_rate": 8.750931362316094e-06,
"loss": 1.4675,
"step": 1553
},
{
"epoch": 0.7532719340765875,
"grad_norm": 1.356566071510315,
"learning_rate": 8.718795390582569e-06,
"loss": 1.0757,
"step": 1554
},
{
"epoch": 0.7537566650508968,
"grad_norm": 1.467347264289856,
"learning_rate": 8.686706066548686e-06,
"loss": 1.4609,
"step": 1555
},
{
"epoch": 0.754241396025206,
"grad_norm": 1.4406360387802124,
"learning_rate": 8.654663482154419e-06,
"loss": 1.4178,
"step": 1556
},
{
"epoch": 0.7547261269995152,
"grad_norm": 1.337432861328125,
"learning_rate": 8.622667729205771e-06,
"loss": 1.1877,
"step": 1557
},
{
"epoch": 0.7552108579738245,
"grad_norm": 1.2744303941726685,
"learning_rate": 8.590718899374628e-06,
"loss": 1.3257,
"step": 1558
},
{
"epoch": 0.7556955889481338,
"grad_norm": 1.3614236116409302,
"learning_rate": 8.558817084198387e-06,
"loss": 1.2649,
"step": 1559
},
{
"epoch": 0.7561803199224431,
"grad_norm": 1.5040825605392456,
"learning_rate": 8.52696237507978e-06,
"loss": 1.4065,
"step": 1560
},
{
"epoch": 0.7566650508967523,
"grad_norm": 1.427902102470398,
"learning_rate": 8.495154863286548e-06,
"loss": 1.6126,
"step": 1561
},
{
"epoch": 0.7571497818710615,
"grad_norm": 1.4786641597747803,
"learning_rate": 8.463394639951206e-06,
"loss": 1.6583,
"step": 1562
},
{
"epoch": 0.7576345128453709,
"grad_norm": 1.4081834554672241,
"learning_rate": 8.431681796070809e-06,
"loss": 1.3911,
"step": 1563
},
{
"epoch": 0.7581192438196801,
"grad_norm": 1.4340317249298096,
"learning_rate": 8.400016422506624e-06,
"loss": 1.5016,
"step": 1564
},
{
"epoch": 0.7586039747939893,
"grad_norm": 1.3345164060592651,
"learning_rate": 8.368398609983945e-06,
"loss": 1.0097,
"step": 1565
},
{
"epoch": 0.7590887057682986,
"grad_norm": 1.5775506496429443,
"learning_rate": 8.336828449091786e-06,
"loss": 1.7549,
"step": 1566
},
{
"epoch": 0.7595734367426078,
"grad_norm": 1.4792371988296509,
"learning_rate": 8.305306030282617e-06,
"loss": 1.3905,
"step": 1567
},
{
"epoch": 0.7600581677169171,
"grad_norm": 1.5295783281326294,
"learning_rate": 8.273831443872132e-06,
"loss": 1.6053,
"step": 1568
},
{
"epoch": 0.7605428986912264,
"grad_norm": 1.3806251287460327,
"learning_rate": 8.242404780038996e-06,
"loss": 1.2319,
"step": 1569
},
{
"epoch": 0.7610276296655356,
"grad_norm": 1.5314702987670898,
"learning_rate": 8.211026128824539e-06,
"loss": 1.4693,
"step": 1570
},
{
"epoch": 0.7615123606398448,
"grad_norm": 1.3532733917236328,
"learning_rate": 8.179695580132563e-06,
"loss": 1.4102,
"step": 1571
},
{
"epoch": 0.7619970916141542,
"grad_norm": 1.3659793138504028,
"learning_rate": 8.14841322372901e-06,
"loss": 1.041,
"step": 1572
},
{
"epoch": 0.7624818225884634,
"grad_norm": 1.439633846282959,
"learning_rate": 8.117179149241788e-06,
"loss": 1.4129,
"step": 1573
},
{
"epoch": 0.7629665535627727,
"grad_norm": 1.3845553398132324,
"learning_rate": 8.085993446160442e-06,
"loss": 1.2368,
"step": 1574
},
{
"epoch": 0.7634512845370819,
"grad_norm": 1.5613845586776733,
"learning_rate": 8.054856203835934e-06,
"loss": 1.5857,
"step": 1575
},
{
"epoch": 0.7639360155113912,
"grad_norm": 1.3105788230895996,
"learning_rate": 8.023767511480378e-06,
"loss": 1.2001,
"step": 1576
},
{
"epoch": 0.7644207464857005,
"grad_norm": 1.5210412740707397,
"learning_rate": 7.992727458166788e-06,
"loss": 1.6572,
"step": 1577
},
{
"epoch": 0.7649054774600097,
"grad_norm": 1.3836086988449097,
"learning_rate": 7.96173613282883e-06,
"loss": 1.3445,
"step": 1578
},
{
"epoch": 0.7653902084343189,
"grad_norm": 1.2600528001785278,
"learning_rate": 7.93079362426054e-06,
"loss": 1.184,
"step": 1579
},
{
"epoch": 0.7658749394086282,
"grad_norm": 1.443926453590393,
"learning_rate": 7.89990002111611e-06,
"loss": 1.3783,
"step": 1580
},
{
"epoch": 0.7663596703829375,
"grad_norm": 1.4408482313156128,
"learning_rate": 7.86905541190959e-06,
"loss": 1.4508,
"step": 1581
},
{
"epoch": 0.7668444013572467,
"grad_norm": 1.4853960275650024,
"learning_rate": 7.838259885014676e-06,
"loss": 1.5522,
"step": 1582
},
{
"epoch": 0.767329132331556,
"grad_norm": 1.9142934083938599,
"learning_rate": 7.807513528664414e-06,
"loss": 1.2533,
"step": 1583
},
{
"epoch": 0.7678138633058652,
"grad_norm": 1.4642645120620728,
"learning_rate": 7.776816430950997e-06,
"loss": 1.2509,
"step": 1584
},
{
"epoch": 0.7682985942801746,
"grad_norm": 1.3161667585372925,
"learning_rate": 7.746168679825468e-06,
"loss": 1.1,
"step": 1585
},
{
"epoch": 0.7687833252544838,
"grad_norm": 1.469278335571289,
"learning_rate": 7.715570363097487e-06,
"loss": 1.3898,
"step": 1586
},
{
"epoch": 0.769268056228793,
"grad_norm": 1.4469612836837769,
"learning_rate": 7.685021568435074e-06,
"loss": 1.358,
"step": 1587
},
{
"epoch": 0.7697527872031023,
"grad_norm": 1.5007667541503906,
"learning_rate": 7.654522383364387e-06,
"loss": 1.5311,
"step": 1588
},
{
"epoch": 0.7702375181774115,
"grad_norm": 1.4023587703704834,
"learning_rate": 7.624072895269418e-06,
"loss": 1.3025,
"step": 1589
},
{
"epoch": 0.7707222491517208,
"grad_norm": 1.348415493965149,
"learning_rate": 7.593673191391776e-06,
"loss": 1.2927,
"step": 1590
},
{
"epoch": 0.7712069801260301,
"grad_norm": 1.487410068511963,
"learning_rate": 7.563323358830448e-06,
"loss": 1.8384,
"step": 1591
},
{
"epoch": 0.7716917111003393,
"grad_norm": 1.5158098936080933,
"learning_rate": 7.533023484541513e-06,
"loss": 1.3989,
"step": 1592
},
{
"epoch": 0.7721764420746485,
"grad_norm": 1.2837767601013184,
"learning_rate": 7.502773655337936e-06,
"loss": 1.0266,
"step": 1593
},
{
"epoch": 0.7726611730489579,
"grad_norm": 1.5535281896591187,
"learning_rate": 7.472573957889267e-06,
"loss": 1.5511,
"step": 1594
},
{
"epoch": 0.7731459040232671,
"grad_norm": 1.699720025062561,
"learning_rate": 7.4424244787214656e-06,
"loss": 1.3947,
"step": 1595
},
{
"epoch": 0.7736306349975763,
"grad_norm": 1.3902910947799683,
"learning_rate": 7.4123253042165495e-06,
"loss": 1.5077,
"step": 1596
},
{
"epoch": 0.7741153659718856,
"grad_norm": 1.4567160606384277,
"learning_rate": 7.382276520612463e-06,
"loss": 1.377,
"step": 1597
},
{
"epoch": 0.7746000969461949,
"grad_norm": 1.6392182111740112,
"learning_rate": 7.352278214002739e-06,
"loss": 1.7202,
"step": 1598
},
{
"epoch": 0.7750848279205041,
"grad_norm": 1.4235200881958008,
"learning_rate": 7.3223304703363135e-06,
"loss": 1.3698,
"step": 1599
},
{
"epoch": 0.7755695588948134,
"grad_norm": 1.3485275506973267,
"learning_rate": 7.292433375417232e-06,
"loss": 1.1991,
"step": 1600
},
{
"epoch": 0.7760542898691226,
"grad_norm": 1.5387061834335327,
"learning_rate": 7.262587014904429e-06,
"loss": 1.414,
"step": 1601
},
{
"epoch": 0.7765390208434318,
"grad_norm": 1.4910553693771362,
"learning_rate": 7.232791474311493e-06,
"loss": 1.3939,
"step": 1602
},
{
"epoch": 0.7770237518177412,
"grad_norm": 1.3060449361801147,
"learning_rate": 7.203046839006383e-06,
"loss": 1.3913,
"step": 1603
},
{
"epoch": 0.7775084827920504,
"grad_norm": 1.4318301677703857,
"learning_rate": 7.173353194211247e-06,
"loss": 1.3158,
"step": 1604
},
{
"epoch": 0.7779932137663597,
"grad_norm": 1.4487580060958862,
"learning_rate": 7.143710625002078e-06,
"loss": 1.4206,
"step": 1605
},
{
"epoch": 0.7784779447406689,
"grad_norm": 2.377192258834839,
"learning_rate": 7.114119216308593e-06,
"loss": 1.3553,
"step": 1606
},
{
"epoch": 0.7789626757149782,
"grad_norm": 1.4438358545303345,
"learning_rate": 7.084579052913884e-06,
"loss": 1.6249,
"step": 1607
},
{
"epoch": 0.7794474066892875,
"grad_norm": 1.6939022541046143,
"learning_rate": 7.0550902194542525e-06,
"loss": 1.5022,
"step": 1608
},
{
"epoch": 0.7799321376635967,
"grad_norm": 1.4086045026779175,
"learning_rate": 7.0256528004188995e-06,
"loss": 1.486,
"step": 1609
},
{
"epoch": 0.7804168686379059,
"grad_norm": 1.3695285320281982,
"learning_rate": 6.996266880149749e-06,
"loss": 1.2197,
"step": 1610
},
{
"epoch": 0.7809015996122152,
"grad_norm": 1.3833764791488647,
"learning_rate": 6.966932542841156e-06,
"loss": 1.2276,
"step": 1611
},
{
"epoch": 0.7813863305865245,
"grad_norm": 1.2639784812927246,
"learning_rate": 6.937649872539675e-06,
"loss": 1.1809,
"step": 1612
},
{
"epoch": 0.7818710615608337,
"grad_norm": 1.495296597480774,
"learning_rate": 6.908418953143861e-06,
"loss": 1.3947,
"step": 1613
},
{
"epoch": 0.782355792535143,
"grad_norm": 1.2453677654266357,
"learning_rate": 6.879239868403964e-06,
"loss": 1.3538,
"step": 1614
},
{
"epoch": 0.7828405235094522,
"grad_norm": 1.3305635452270508,
"learning_rate": 6.8501127019217346e-06,
"loss": 1.3287,
"step": 1615
},
{
"epoch": 0.7833252544837616,
"grad_norm": 1.4396018981933594,
"learning_rate": 6.8210375371501625e-06,
"loss": 1.4193,
"step": 1616
},
{
"epoch": 0.7838099854580708,
"grad_norm": 1.4076224565505981,
"learning_rate": 6.7920144573932695e-06,
"loss": 1.3454,
"step": 1617
},
{
"epoch": 0.78429471643238,
"grad_norm": 1.4297701120376587,
"learning_rate": 6.7630435458058114e-06,
"loss": 1.4004,
"step": 1618
},
{
"epoch": 0.7847794474066893,
"grad_norm": 1.3946882486343384,
"learning_rate": 6.734124885393111e-06,
"loss": 1.5085,
"step": 1619
},
{
"epoch": 0.7852641783809986,
"grad_norm": 1.538183331489563,
"learning_rate": 6.705258559010755e-06,
"loss": 1.7725,
"step": 1620
},
{
"epoch": 0.7857489093553078,
"grad_norm": 1.3094892501831055,
"learning_rate": 6.676444649364416e-06,
"loss": 1.3554,
"step": 1621
},
{
"epoch": 0.7862336403296171,
"grad_norm": 1.6407068967819214,
"learning_rate": 6.647683239009556e-06,
"loss": 1.5119,
"step": 1622
},
{
"epoch": 0.7867183713039263,
"grad_norm": 1.463111400604248,
"learning_rate": 6.618974410351247e-06,
"loss": 1.4116,
"step": 1623
},
{
"epoch": 0.7872031022782355,
"grad_norm": 1.4133925437927246,
"learning_rate": 6.590318245643887e-06,
"loss": 1.3629,
"step": 1624
},
{
"epoch": 0.7876878332525449,
"grad_norm": 1.5195677280426025,
"learning_rate": 6.561714826990998e-06,
"loss": 1.8831,
"step": 1625
},
{
"epoch": 0.7881725642268541,
"grad_norm": 1.3426353931427002,
"learning_rate": 6.533164236344966e-06,
"loss": 1.1815,
"step": 1626
},
{
"epoch": 0.7886572952011633,
"grad_norm": 1.381893277168274,
"learning_rate": 6.504666555506825e-06,
"loss": 1.4206,
"step": 1627
},
{
"epoch": 0.7891420261754726,
"grad_norm": 1.40556800365448,
"learning_rate": 6.476221866126029e-06,
"loss": 1.2124,
"step": 1628
},
{
"epoch": 0.7896267571497819,
"grad_norm": 1.4516103267669678,
"learning_rate": 6.447830249700174e-06,
"loss": 1.2965,
"step": 1629
},
{
"epoch": 0.7901114881240912,
"grad_norm": 1.3030951023101807,
"learning_rate": 6.41949178757483e-06,
"loss": 1.2891,
"step": 1630
},
{
"epoch": 0.7905962190984004,
"grad_norm": 1.4862583875656128,
"learning_rate": 6.3912065609432415e-06,
"loss": 1.4867,
"step": 1631
},
{
"epoch": 0.7910809500727096,
"grad_norm": 1.3782804012298584,
"learning_rate": 6.362974650846157e-06,
"loss": 1.4044,
"step": 1632
},
{
"epoch": 0.7915656810470189,
"grad_norm": 1.3330339193344116,
"learning_rate": 6.334796138171542e-06,
"loss": 1.4485,
"step": 1633
},
{
"epoch": 0.7920504120213282,
"grad_norm": 1.527451753616333,
"learning_rate": 6.306671103654382e-06,
"loss": 1.5054,
"step": 1634
},
{
"epoch": 0.7925351429956374,
"grad_norm": 1.3172121047973633,
"learning_rate": 6.278599627876433e-06,
"loss": 1.3527,
"step": 1635
},
{
"epoch": 0.7930198739699467,
"grad_norm": 1.3731809854507446,
"learning_rate": 6.250581791266019e-06,
"loss": 1.4858,
"step": 1636
},
{
"epoch": 0.7935046049442559,
"grad_norm": 1.3459367752075195,
"learning_rate": 6.22261767409775e-06,
"loss": 1.3209,
"step": 1637
},
{
"epoch": 0.7939893359185652,
"grad_norm": 2.2093305587768555,
"learning_rate": 6.1947073564923576e-06,
"loss": 1.2707,
"step": 1638
},
{
"epoch": 0.7944740668928745,
"grad_norm": 1.42824387550354,
"learning_rate": 6.166850918416406e-06,
"loss": 1.3117,
"step": 1639
},
{
"epoch": 0.7949587978671837,
"grad_norm": 1.436257243156433,
"learning_rate": 6.139048439682085e-06,
"loss": 1.4706,
"step": 1640
},
{
"epoch": 0.7954435288414929,
"grad_norm": 1.4861083030700684,
"learning_rate": 6.111299999947009e-06,
"loss": 1.4673,
"step": 1641
},
{
"epoch": 0.7959282598158022,
"grad_norm": 1.4589564800262451,
"learning_rate": 6.083605678713939e-06,
"loss": 1.4155,
"step": 1642
},
{
"epoch": 0.7964129907901115,
"grad_norm": 1.4518787860870361,
"learning_rate": 6.055965555330606e-06,
"loss": 1.3371,
"step": 1643
},
{
"epoch": 0.7968977217644208,
"grad_norm": 1.3608945608139038,
"learning_rate": 6.028379708989418e-06,
"loss": 1.2777,
"step": 1644
},
{
"epoch": 0.79738245273873,
"grad_norm": 1.3667936325073242,
"learning_rate": 6.000848218727312e-06,
"loss": 1.2991,
"step": 1645
},
{
"epoch": 0.7978671837130392,
"grad_norm": 1.4242197275161743,
"learning_rate": 5.973371163425456e-06,
"loss": 1.4603,
"step": 1646
},
{
"epoch": 0.7983519146873486,
"grad_norm": 1.4276293516159058,
"learning_rate": 5.945948621809091e-06,
"loss": 1.214,
"step": 1647
},
{
"epoch": 0.7988366456616578,
"grad_norm": 1.4498988389968872,
"learning_rate": 5.91858067244723e-06,
"loss": 1.3601,
"step": 1648
},
{
"epoch": 0.799321376635967,
"grad_norm": 1.4795246124267578,
"learning_rate": 5.891267393752509e-06,
"loss": 1.5095,
"step": 1649
},
{
"epoch": 0.7998061076102763,
"grad_norm": 1.371968388557434,
"learning_rate": 5.864008863980897e-06,
"loss": 1.562,
"step": 1650
},
{
"epoch": 0.8002908385845856,
"grad_norm": 1.39975106716156,
"learning_rate": 5.836805161231507e-06,
"loss": 1.4923,
"step": 1651
},
{
"epoch": 0.8007755695588948,
"grad_norm": 1.4395267963409424,
"learning_rate": 5.809656363446381e-06,
"loss": 1.2459,
"step": 1652
},
{
"epoch": 0.8012603005332041,
"grad_norm": 1.383536696434021,
"learning_rate": 5.782562548410236e-06,
"loss": 1.1381,
"step": 1653
},
{
"epoch": 0.8017450315075133,
"grad_norm": 1.2943404912948608,
"learning_rate": 5.7555237937502616e-06,
"loss": 1.2698,
"step": 1654
},
{
"epoch": 0.8022297624818225,
"grad_norm": 1.4058960676193237,
"learning_rate": 5.7285401769358845e-06,
"loss": 1.6954,
"step": 1655
},
{
"epoch": 0.8027144934561319,
"grad_norm": 1.4720020294189453,
"learning_rate": 5.701611775278573e-06,
"loss": 1.5013,
"step": 1656
},
{
"epoch": 0.8031992244304411,
"grad_norm": 1.3335487842559814,
"learning_rate": 5.674738665931575e-06,
"loss": 1.239,
"step": 1657
},
{
"epoch": 0.8036839554047503,
"grad_norm": 1.3698316812515259,
"learning_rate": 5.647920925889744e-06,
"loss": 1.0645,
"step": 1658
},
{
"epoch": 0.8041686863790596,
"grad_norm": 1.3854635953903198,
"learning_rate": 5.6211586319892625e-06,
"loss": 1.3054,
"step": 1659
},
{
"epoch": 0.8046534173533689,
"grad_norm": 1.5994597673416138,
"learning_rate": 5.594451860907485e-06,
"loss": 1.411,
"step": 1660
},
{
"epoch": 0.8051381483276782,
"grad_norm": 1.4336647987365723,
"learning_rate": 5.567800689162658e-06,
"loss": 1.4892,
"step": 1661
},
{
"epoch": 0.8056228793019874,
"grad_norm": 1.753812551498413,
"learning_rate": 5.541205193113763e-06,
"loss": 1.6698,
"step": 1662
},
{
"epoch": 0.8061076102762966,
"grad_norm": 1.2079178094863892,
"learning_rate": 5.51466544896021e-06,
"loss": 1.0658,
"step": 1663
},
{
"epoch": 0.8065923412506059,
"grad_norm": 1.3493430614471436,
"learning_rate": 5.488181532741732e-06,
"loss": 1.3803,
"step": 1664
},
{
"epoch": 0.8070770722249152,
"grad_norm": 1.5290086269378662,
"learning_rate": 5.46175352033807e-06,
"loss": 1.5866,
"step": 1665
},
{
"epoch": 0.8075618031992244,
"grad_norm": 1.3983334302902222,
"learning_rate": 5.435381487468799e-06,
"loss": 1.595,
"step": 1666
},
{
"epoch": 0.8080465341735337,
"grad_norm": 1.5040364265441895,
"learning_rate": 5.409065509693126e-06,
"loss": 1.4392,
"step": 1667
},
{
"epoch": 0.8085312651478429,
"grad_norm": 1.456047534942627,
"learning_rate": 5.382805662409623e-06,
"loss": 1.504,
"step": 1668
},
{
"epoch": 0.8090159961221522,
"grad_norm": 1.338990330696106,
"learning_rate": 5.356602020856072e-06,
"loss": 1.5103,
"step": 1669
},
{
"epoch": 0.8095007270964615,
"grad_norm": 1.4504512548446655,
"learning_rate": 5.330454660109185e-06,
"loss": 1.0707,
"step": 1670
},
{
"epoch": 0.8099854580707707,
"grad_norm": 1.4192159175872803,
"learning_rate": 5.30436365508446e-06,
"loss": 1.6072,
"step": 1671
},
{
"epoch": 0.81047018904508,
"grad_norm": 1.5492674112319946,
"learning_rate": 5.278329080535896e-06,
"loss": 1.6888,
"step": 1672
},
{
"epoch": 0.8109549200193893,
"grad_norm": 1.4452522993087769,
"learning_rate": 5.252351011055831e-06,
"loss": 1.4031,
"step": 1673
},
{
"epoch": 0.8114396509936985,
"grad_norm": 1.4986207485198975,
"learning_rate": 5.226429521074691e-06,
"loss": 1.5358,
"step": 1674
},
{
"epoch": 0.8119243819680078,
"grad_norm": 1.3144514560699463,
"learning_rate": 5.20056468486082e-06,
"loss": 1.5166,
"step": 1675
},
{
"epoch": 0.812409112942317,
"grad_norm": 1.328102946281433,
"learning_rate": 5.174756576520218e-06,
"loss": 0.9864,
"step": 1676
},
{
"epoch": 0.8128938439166262,
"grad_norm": 1.3630974292755127,
"learning_rate": 5.149005269996374e-06,
"loss": 1.5685,
"step": 1677
},
{
"epoch": 0.8133785748909356,
"grad_norm": 1.7009714841842651,
"learning_rate": 5.123310839070011e-06,
"loss": 1.2827,
"step": 1678
},
{
"epoch": 0.8138633058652448,
"grad_norm": 1.3941096067428589,
"learning_rate": 5.097673357358907e-06,
"loss": 1.3748,
"step": 1679
},
{
"epoch": 0.814348036839554,
"grad_norm": 1.5804961919784546,
"learning_rate": 5.072092898317679e-06,
"loss": 1.9325,
"step": 1680
},
{
"epoch": 0.8148327678138633,
"grad_norm": 1.3486220836639404,
"learning_rate": 5.04656953523755e-06,
"loss": 1.0934,
"step": 1681
},
{
"epoch": 0.8153174987881726,
"grad_norm": 1.475723385810852,
"learning_rate": 5.021103341246186e-06,
"loss": 1.1359,
"step": 1682
},
{
"epoch": 0.8158022297624818,
"grad_norm": 1.3328871726989746,
"learning_rate": 4.995694389307412e-06,
"loss": 1.1081,
"step": 1683
},
{
"epoch": 0.8162869607367911,
"grad_norm": 1.5600857734680176,
"learning_rate": 4.9703427522210914e-06,
"loss": 1.7475,
"step": 1684
},
{
"epoch": 0.8167716917111003,
"grad_norm": 1.3647994995117188,
"learning_rate": 4.94504850262284e-06,
"loss": 1.35,
"step": 1685
},
{
"epoch": 0.8172564226854095,
"grad_norm": 1.3073344230651855,
"learning_rate": 4.91981171298388e-06,
"loss": 1.2369,
"step": 1686
},
{
"epoch": 0.8177411536597189,
"grad_norm": 1.335671067237854,
"learning_rate": 4.894632455610773e-06,
"loss": 1.2816,
"step": 1687
},
{
"epoch": 0.8182258846340281,
"grad_norm": 1.4057748317718506,
"learning_rate": 4.8695108026452745e-06,
"loss": 1.1653,
"step": 1688
},
{
"epoch": 0.8187106156083374,
"grad_norm": 1.358802318572998,
"learning_rate": 4.8444468260640755e-06,
"loss": 1.5163,
"step": 1689
},
{
"epoch": 0.8191953465826466,
"grad_norm": 1.3890354633331299,
"learning_rate": 4.819440597678612e-06,
"loss": 1.3894,
"step": 1690
},
{
"epoch": 0.8196800775569559,
"grad_norm": 1.4253144264221191,
"learning_rate": 4.794492189134892e-06,
"loss": 1.4175,
"step": 1691
},
{
"epoch": 0.8201648085312652,
"grad_norm": 1.2631031274795532,
"learning_rate": 4.769601671913234e-06,
"loss": 1.0669,
"step": 1692
},
{
"epoch": 0.8206495395055744,
"grad_norm": 1.477522611618042,
"learning_rate": 4.744769117328107e-06,
"loss": 1.6204,
"step": 1693
},
{
"epoch": 0.8211342704798836,
"grad_norm": 1.3935991525650024,
"learning_rate": 4.719994596527894e-06,
"loss": 1.2437,
"step": 1694
},
{
"epoch": 0.821619001454193,
"grad_norm": 1.2869259119033813,
"learning_rate": 4.695278180494725e-06,
"loss": 1.383,
"step": 1695
},
{
"epoch": 0.8221037324285022,
"grad_norm": 1.3717503547668457,
"learning_rate": 4.670619940044233e-06,
"loss": 1.3475,
"step": 1696
},
{
"epoch": 0.8225884634028114,
"grad_norm": 1.3983089923858643,
"learning_rate": 4.646019945825392e-06,
"loss": 1.5844,
"step": 1697
},
{
"epoch": 0.8230731943771207,
"grad_norm": 1.434476375579834,
"learning_rate": 4.621478268320265e-06,
"loss": 1.3396,
"step": 1698
},
{
"epoch": 0.8235579253514299,
"grad_norm": 1.392899990081787,
"learning_rate": 4.5969949778438575e-06,
"loss": 1.595,
"step": 1699
},
{
"epoch": 0.8240426563257393,
"grad_norm": 1.4684665203094482,
"learning_rate": 4.5725701445438775e-06,
"loss": 1.4212,
"step": 1700
},
{
"epoch": 0.8245273873000485,
"grad_norm": 1.4998910427093506,
"learning_rate": 4.548203838400539e-06,
"loss": 1.7012,
"step": 1701
},
{
"epoch": 0.8250121182743577,
"grad_norm": 1.3950318098068237,
"learning_rate": 4.523896129226371e-06,
"loss": 1.4352,
"step": 1702
},
{
"epoch": 0.825496849248667,
"grad_norm": 1.3992888927459717,
"learning_rate": 4.499647086666029e-06,
"loss": 1.3451,
"step": 1703
},
{
"epoch": 0.8259815802229763,
"grad_norm": 1.4925978183746338,
"learning_rate": 4.475456780196066e-06,
"loss": 1.5352,
"step": 1704
},
{
"epoch": 0.8264663111972855,
"grad_norm": 1.5991015434265137,
"learning_rate": 4.451325279124749e-06,
"loss": 1.5298,
"step": 1705
},
{
"epoch": 0.8269510421715948,
"grad_norm": 1.3293036222457886,
"learning_rate": 4.427252652591876e-06,
"loss": 1.1902,
"step": 1706
},
{
"epoch": 0.827435773145904,
"grad_norm": 1.5411173105239868,
"learning_rate": 4.40323896956854e-06,
"loss": 1.3795,
"step": 1707
},
{
"epoch": 0.8279205041202132,
"grad_norm": 1.4103530645370483,
"learning_rate": 4.379284298856973e-06,
"loss": 1.5143,
"step": 1708
},
{
"epoch": 0.8284052350945226,
"grad_norm": 1.4393373727798462,
"learning_rate": 4.3553887090903075e-06,
"loss": 1.4624,
"step": 1709
},
{
"epoch": 0.8288899660688318,
"grad_norm": 1.8007562160491943,
"learning_rate": 4.331552268732433e-06,
"loss": 1.6461,
"step": 1710
},
{
"epoch": 0.829374697043141,
"grad_norm": 1.4097933769226074,
"learning_rate": 4.307775046077739e-06,
"loss": 1.1364,
"step": 1711
},
{
"epoch": 0.8298594280174503,
"grad_norm": 1.2913793325424194,
"learning_rate": 4.284057109250961e-06,
"loss": 1.37,
"step": 1712
},
{
"epoch": 0.8303441589917596,
"grad_norm": 1.320348858833313,
"learning_rate": 4.2603985262069656e-06,
"loss": 1.6305,
"step": 1713
},
{
"epoch": 0.8308288899660689,
"grad_norm": 1.3655133247375488,
"learning_rate": 4.236799364730582e-06,
"loss": 1.4158,
"step": 1714
},
{
"epoch": 0.8313136209403781,
"grad_norm": 1.4090162515640259,
"learning_rate": 4.213259692436367e-06,
"loss": 1.4813,
"step": 1715
},
{
"epoch": 0.8317983519146873,
"grad_norm": 1.4175288677215576,
"learning_rate": 4.189779576768454e-06,
"loss": 1.6809,
"step": 1716
},
{
"epoch": 0.8322830828889967,
"grad_norm": 1.5763258934020996,
"learning_rate": 4.166359085000324e-06,
"loss": 1.8811,
"step": 1717
},
{
"epoch": 0.8327678138633059,
"grad_norm": 1.3878095149993896,
"learning_rate": 4.142998284234622e-06,
"loss": 1.35,
"step": 1718
},
{
"epoch": 0.8332525448376151,
"grad_norm": 1.4590060710906982,
"learning_rate": 4.119697241402998e-06,
"loss": 1.3375,
"step": 1719
},
{
"epoch": 0.8337372758119244,
"grad_norm": 1.3511408567428589,
"learning_rate": 4.096456023265866e-06,
"loss": 1.384,
"step": 1720
},
{
"epoch": 0.8342220067862336,
"grad_norm": 1.4394463300704956,
"learning_rate": 4.073274696412235e-06,
"loss": 1.6411,
"step": 1721
},
{
"epoch": 0.8347067377605429,
"grad_norm": 1.3683598041534424,
"learning_rate": 4.05015332725952e-06,
"loss": 1.2006,
"step": 1722
},
{
"epoch": 0.8351914687348522,
"grad_norm": 1.4397015571594238,
"learning_rate": 4.027091982053369e-06,
"loss": 1.4946,
"step": 1723
},
{
"epoch": 0.8356761997091614,
"grad_norm": 1.5546510219573975,
"learning_rate": 4.004090726867416e-06,
"loss": 1.5381,
"step": 1724
},
{
"epoch": 0.8361609306834706,
"grad_norm": 1.4166737794876099,
"learning_rate": 3.98114962760317e-06,
"loss": 1.4553,
"step": 1725
},
{
"epoch": 0.83664566165778,
"grad_norm": 1.4045753479003906,
"learning_rate": 3.9582687499897545e-06,
"loss": 1.3718,
"step": 1726
},
{
"epoch": 0.8371303926320892,
"grad_norm": 1.3510545492172241,
"learning_rate": 3.935448159583774e-06,
"loss": 1.4418,
"step": 1727
},
{
"epoch": 0.8376151236063984,
"grad_norm": 1.3021389245986938,
"learning_rate": 3.912687921769082e-06,
"loss": 1.183,
"step": 1728
},
{
"epoch": 0.8380998545807077,
"grad_norm": 1.3169225454330444,
"learning_rate": 3.88998810175662e-06,
"loss": 1.5796,
"step": 1729
},
{
"epoch": 0.8385845855550169,
"grad_norm": 1.3681930303573608,
"learning_rate": 3.8673487645842415e-06,
"loss": 1.2193,
"step": 1730
},
{
"epoch": 0.8390693165293263,
"grad_norm": 1.4157400131225586,
"learning_rate": 3.844769975116488e-06,
"loss": 1.3846,
"step": 1731
},
{
"epoch": 0.8395540475036355,
"grad_norm": 1.3580200672149658,
"learning_rate": 3.8222517980444325e-06,
"loss": 1.0942,
"step": 1732
},
{
"epoch": 0.8400387784779447,
"grad_norm": 1.3693870306015015,
"learning_rate": 3.7997942978854785e-06,
"loss": 1.2999,
"step": 1733
},
{
"epoch": 0.840523509452254,
"grad_norm": 1.290262222290039,
"learning_rate": 3.7773975389832043e-06,
"loss": 1.3288,
"step": 1734
},
{
"epoch": 0.8410082404265633,
"grad_norm": 1.3575208187103271,
"learning_rate": 3.7550615855071277e-06,
"loss": 1.5665,
"step": 1735
},
{
"epoch": 0.8414929714008725,
"grad_norm": 1.3926094770431519,
"learning_rate": 3.7327865014525787e-06,
"loss": 1.3471,
"step": 1736
},
{
"epoch": 0.8419777023751818,
"grad_norm": 1.4269956350326538,
"learning_rate": 3.710572350640465e-06,
"loss": 1.618,
"step": 1737
},
{
"epoch": 0.842462433349491,
"grad_norm": 1.4620566368103027,
"learning_rate": 3.6884191967171327e-06,
"loss": 1.4229,
"step": 1738
},
{
"epoch": 0.8429471643238002,
"grad_norm": 1.3834177255630493,
"learning_rate": 3.666327103154149e-06,
"loss": 1.5917,
"step": 1739
},
{
"epoch": 0.8434318952981096,
"grad_norm": 1.575303077697754,
"learning_rate": 3.644296133248143e-06,
"loss": 2.0171,
"step": 1740
},
{
"epoch": 0.8439166262724188,
"grad_norm": 1.4172624349594116,
"learning_rate": 3.6223263501206113e-06,
"loss": 1.7448,
"step": 1741
},
{
"epoch": 0.844401357246728,
"grad_norm": 1.4966297149658203,
"learning_rate": 3.600417816717755e-06,
"loss": 1.1987,
"step": 1742
},
{
"epoch": 0.8448860882210373,
"grad_norm": 1.4303022623062134,
"learning_rate": 3.578570595810274e-06,
"loss": 1.5639,
"step": 1743
},
{
"epoch": 0.8453708191953466,
"grad_norm": 1.5101985931396484,
"learning_rate": 3.5567847499932e-06,
"loss": 1.6489,
"step": 1744
},
{
"epoch": 0.8458555501696559,
"grad_norm": 1.4596188068389893,
"learning_rate": 3.535060341685731e-06,
"loss": 1.5892,
"step": 1745
},
{
"epoch": 0.8463402811439651,
"grad_norm": 1.283461332321167,
"learning_rate": 3.513397433131024e-06,
"loss": 1.3714,
"step": 1746
},
{
"epoch": 0.8468250121182743,
"grad_norm": 1.439650535583496,
"learning_rate": 3.491796086396043e-06,
"loss": 1.2675,
"step": 1747
},
{
"epoch": 0.8473097430925837,
"grad_norm": 1.3690756559371948,
"learning_rate": 3.4702563633713577e-06,
"loss": 1.255,
"step": 1748
},
{
"epoch": 0.8477944740668929,
"grad_norm": 1.511044979095459,
"learning_rate": 3.4487783257710015e-06,
"loss": 1.1604,
"step": 1749
},
{
"epoch": 0.8482792050412021,
"grad_norm": 1.4531290531158447,
"learning_rate": 3.4273620351322257e-06,
"loss": 1.4041,
"step": 1750
},
{
"epoch": 0.8487639360155114,
"grad_norm": 1.4664944410324097,
"learning_rate": 3.406007552815421e-06,
"loss": 1.4089,
"step": 1751
},
{
"epoch": 0.8492486669898206,
"grad_norm": 1.3850915431976318,
"learning_rate": 3.3847149400038527e-06,
"loss": 1.3229,
"step": 1752
},
{
"epoch": 0.8497333979641299,
"grad_norm": 1.419942021369934,
"learning_rate": 3.3634842577035447e-06,
"loss": 1.423,
"step": 1753
},
{
"epoch": 0.8502181289384392,
"grad_norm": 1.2842981815338135,
"learning_rate": 3.3423155667430708e-06,
"loss": 1.1818,
"step": 1754
},
{
"epoch": 0.8507028599127484,
"grad_norm": 1.3830044269561768,
"learning_rate": 3.321208927773384e-06,
"loss": 1.3562,
"step": 1755
},
{
"epoch": 0.8511875908870576,
"grad_norm": 1.5061596632003784,
"learning_rate": 3.3001644012676773e-06,
"loss": 1.2338,
"step": 1756
},
{
"epoch": 0.851672321861367,
"grad_norm": 1.3219194412231445,
"learning_rate": 3.279182047521151e-06,
"loss": 1.3248,
"step": 1757
},
{
"epoch": 0.8521570528356762,
"grad_norm": 1.2967970371246338,
"learning_rate": 3.258261926650902e-06,
"loss": 1.1634,
"step": 1758
},
{
"epoch": 0.8526417838099855,
"grad_norm": 1.3681789636611938,
"learning_rate": 3.2374040985957004e-06,
"loss": 1.3534,
"step": 1759
},
{
"epoch": 0.8531265147842947,
"grad_norm": 1.4028301239013672,
"learning_rate": 3.216608623115852e-06,
"loss": 1.4824,
"step": 1760
},
{
"epoch": 0.8536112457586039,
"grad_norm": 1.3395938873291016,
"learning_rate": 3.1958755597930017e-06,
"loss": 1.3257,
"step": 1761
},
{
"epoch": 0.8540959767329133,
"grad_norm": 1.5202916860580444,
"learning_rate": 3.175204968029999e-06,
"loss": 1.4002,
"step": 1762
},
{
"epoch": 0.8545807077072225,
"grad_norm": 1.3687208890914917,
"learning_rate": 3.1545969070506747e-06,
"loss": 1.3911,
"step": 1763
},
{
"epoch": 0.8550654386815317,
"grad_norm": 1.5068252086639404,
"learning_rate": 3.1340514358997293e-06,
"loss": 1.427,
"step": 1764
},
{
"epoch": 0.855550169655841,
"grad_norm": 1.3460315465927124,
"learning_rate": 3.1135686134425134e-06,
"loss": 1.2252,
"step": 1765
},
{
"epoch": 0.8560349006301503,
"grad_norm": 1.3183640241622925,
"learning_rate": 3.093148498364898e-06,
"loss": 1.1255,
"step": 1766
},
{
"epoch": 0.8565196316044595,
"grad_norm": 1.3901616334915161,
"learning_rate": 3.0727911491730764e-06,
"loss": 1.5205,
"step": 1767
},
{
"epoch": 0.8570043625787688,
"grad_norm": 1.479112982749939,
"learning_rate": 3.0524966241934153e-06,
"loss": 1.7212,
"step": 1768
},
{
"epoch": 0.857489093553078,
"grad_norm": 1.5991971492767334,
"learning_rate": 3.0322649815722915e-06,
"loss": 1.5865,
"step": 1769
},
{
"epoch": 0.8579738245273874,
"grad_norm": 1.4487693309783936,
"learning_rate": 3.012096279275892e-06,
"loss": 1.5127,
"step": 1770
},
{
"epoch": 0.8584585555016966,
"grad_norm": 1.623945951461792,
"learning_rate": 2.991990575090095e-06,
"loss": 1.5826,
"step": 1771
},
{
"epoch": 0.8589432864760058,
"grad_norm": 1.5717607736587524,
"learning_rate": 2.9719479266202664e-06,
"loss": 1.505,
"step": 1772
},
{
"epoch": 0.859428017450315,
"grad_norm": 1.4932787418365479,
"learning_rate": 2.9519683912911266e-06,
"loss": 1.4719,
"step": 1773
},
{
"epoch": 0.8599127484246243,
"grad_norm": 1.3678524494171143,
"learning_rate": 2.9320520263465463e-06,
"loss": 1.3981,
"step": 1774
},
{
"epoch": 0.8603974793989336,
"grad_norm": 1.2875282764434814,
"learning_rate": 2.9121988888494297e-06,
"loss": 1.2622,
"step": 1775
},
{
"epoch": 0.8608822103732429,
"grad_norm": 1.5293846130371094,
"learning_rate": 2.892409035681498e-06,
"loss": 1.6768,
"step": 1776
},
{
"epoch": 0.8613669413475521,
"grad_norm": 1.5416409969329834,
"learning_rate": 2.872682523543185e-06,
"loss": 1.4337,
"step": 1777
},
{
"epoch": 0.8618516723218613,
"grad_norm": 1.515479326248169,
"learning_rate": 2.8530194089534225e-06,
"loss": 1.3993,
"step": 1778
},
{
"epoch": 0.8623364032961707,
"grad_norm": 1.349339246749878,
"learning_rate": 2.833419748249511e-06,
"loss": 1.4137,
"step": 1779
},
{
"epoch": 0.8628211342704799,
"grad_norm": 1.4984550476074219,
"learning_rate": 2.8138835975869358e-06,
"loss": 1.7335,
"step": 1780
},
{
"epoch": 0.8633058652447891,
"grad_norm": 1.4722235202789307,
"learning_rate": 2.794411012939238e-06,
"loss": 1.2962,
"step": 1781
},
{
"epoch": 0.8637905962190984,
"grad_norm": 1.4962884187698364,
"learning_rate": 2.7750020500978193e-06,
"loss": 1.3812,
"step": 1782
},
{
"epoch": 0.8642753271934076,
"grad_norm": 1.2720054388046265,
"learning_rate": 2.7556567646717907e-06,
"loss": 1.2682,
"step": 1783
},
{
"epoch": 0.864760058167717,
"grad_norm": 1.5404802560806274,
"learning_rate": 2.7363752120878437e-06,
"loss": 1.5413,
"step": 1784
},
{
"epoch": 0.8652447891420262,
"grad_norm": 1.352376103401184,
"learning_rate": 2.717157447590041e-06,
"loss": 1.3569,
"step": 1785
},
{
"epoch": 0.8657295201163354,
"grad_norm": 1.4405183792114258,
"learning_rate": 2.6980035262397037e-06,
"loss": 1.4793,
"step": 1786
},
{
"epoch": 0.8662142510906446,
"grad_norm": 1.5227733850479126,
"learning_rate": 2.6789135029152173e-06,
"loss": 1.4665,
"step": 1787
},
{
"epoch": 0.866698982064954,
"grad_norm": 1.4717252254486084,
"learning_rate": 2.659887432311917e-06,
"loss": 1.2643,
"step": 1788
},
{
"epoch": 0.8671837130392632,
"grad_norm": 1.542318344116211,
"learning_rate": 2.6409253689418656e-06,
"loss": 1.4383,
"step": 1789
},
{
"epoch": 0.8676684440135725,
"grad_norm": 1.4374393224716187,
"learning_rate": 2.6220273671337807e-06,
"loss": 1.2678,
"step": 1790
},
{
"epoch": 0.8681531749878817,
"grad_norm": 1.5301721096038818,
"learning_rate": 2.603193481032801e-06,
"loss": 1.7229,
"step": 1791
},
{
"epoch": 0.868637905962191,
"grad_norm": 1.2812771797180176,
"learning_rate": 2.584423764600391e-06,
"loss": 1.6145,
"step": 1792
},
{
"epoch": 0.8691226369365003,
"grad_norm": 1.3902934789657593,
"learning_rate": 2.5657182716141452e-06,
"loss": 1.3589,
"step": 1793
},
{
"epoch": 0.8696073679108095,
"grad_norm": 1.4703373908996582,
"learning_rate": 2.547077055667646e-06,
"loss": 1.4929,
"step": 1794
},
{
"epoch": 0.8700920988851187,
"grad_norm": 1.8839432001113892,
"learning_rate": 2.528500170170339e-06,
"loss": 1.3907,
"step": 1795
},
{
"epoch": 0.870576829859428,
"grad_norm": 1.3980306386947632,
"learning_rate": 2.5099876683473244e-06,
"loss": 1.5614,
"step": 1796
},
{
"epoch": 0.8710615608337373,
"grad_norm": 1.3951743841171265,
"learning_rate": 2.4915396032392567e-06,
"loss": 1.3248,
"step": 1797
},
{
"epoch": 0.8715462918080465,
"grad_norm": 1.453779697418213,
"learning_rate": 2.473156027702164e-06,
"loss": 1.4249,
"step": 1798
},
{
"epoch": 0.8720310227823558,
"grad_norm": 1.3146569728851318,
"learning_rate": 2.4548369944073004e-06,
"loss": 1.2215,
"step": 1799
},
{
"epoch": 0.872515753756665,
"grad_norm": 1.4370759725570679,
"learning_rate": 2.4365825558409966e-06,
"loss": 1.5261,
"step": 1800
},
{
"epoch": 0.8730004847309744,
"grad_norm": 1.3861037492752075,
"learning_rate": 2.4183927643045253e-06,
"loss": 1.3451,
"step": 1801
},
{
"epoch": 0.8734852157052836,
"grad_norm": 1.2893579006195068,
"learning_rate": 2.4002676719139166e-06,
"loss": 1.2465,
"step": 1802
},
{
"epoch": 0.8739699466795928,
"grad_norm": 1.4540156126022339,
"learning_rate": 2.3822073305998534e-06,
"loss": 1.3423,
"step": 1803
},
{
"epoch": 0.8744546776539021,
"grad_norm": 1.4508488178253174,
"learning_rate": 2.3642117921074734e-06,
"loss": 1.527,
"step": 1804
},
{
"epoch": 0.8749394086282113,
"grad_norm": 1.4460153579711914,
"learning_rate": 2.3462811079962705e-06,
"loss": 1.7063,
"step": 1805
},
{
"epoch": 0.8754241396025206,
"grad_norm": 1.468252420425415,
"learning_rate": 2.328415329639902e-06,
"loss": 1.5137,
"step": 1806
},
{
"epoch": 0.8759088705768299,
"grad_norm": 1.3156002759933472,
"learning_rate": 2.310614508226078e-06,
"loss": 1.2969,
"step": 1807
},
{
"epoch": 0.8763936015511391,
"grad_norm": 1.447495698928833,
"learning_rate": 2.292878694756384e-06,
"loss": 1.3326,
"step": 1808
},
{
"epoch": 0.8768783325254483,
"grad_norm": 1.4856112003326416,
"learning_rate": 2.2752079400461564e-06,
"loss": 1.3841,
"step": 1809
},
{
"epoch": 0.8773630634997577,
"grad_norm": 1.4724615812301636,
"learning_rate": 2.257602294724337e-06,
"loss": 1.5061,
"step": 1810
},
{
"epoch": 0.8778477944740669,
"grad_norm": 1.5109659433364868,
"learning_rate": 2.24006180923331e-06,
"loss": 1.5312,
"step": 1811
},
{
"epoch": 0.8783325254483761,
"grad_norm": 1.8590974807739258,
"learning_rate": 2.222586533828777e-06,
"loss": 1.6278,
"step": 1812
},
{
"epoch": 0.8788172564226854,
"grad_norm": 1.5733957290649414,
"learning_rate": 2.2051765185795965e-06,
"loss": 1.323,
"step": 1813
},
{
"epoch": 0.8793019873969947,
"grad_norm": 1.3470954895019531,
"learning_rate": 2.1878318133676607e-06,
"loss": 1.1649,
"step": 1814
},
{
"epoch": 0.879786718371304,
"grad_norm": 1.4746617078781128,
"learning_rate": 2.170552467887721e-06,
"loss": 1.7258,
"step": 1815
},
{
"epoch": 0.8802714493456132,
"grad_norm": 1.3480514287948608,
"learning_rate": 2.1533385316472864e-06,
"loss": 1.1594,
"step": 1816
},
{
"epoch": 0.8807561803199224,
"grad_norm": 1.4327815771102905,
"learning_rate": 2.136190053966444e-06,
"loss": 1.3378,
"step": 1817
},
{
"epoch": 0.8812409112942317,
"grad_norm": 1.433623194694519,
"learning_rate": 2.119107083977742e-06,
"loss": 1.5589,
"step": 1818
},
{
"epoch": 0.881725642268541,
"grad_norm": 1.3920966386795044,
"learning_rate": 2.1020896706260367e-06,
"loss": 1.4108,
"step": 1819
},
{
"epoch": 0.8822103732428502,
"grad_norm": 1.4291924238204956,
"learning_rate": 2.08513786266836e-06,
"loss": 1.4014,
"step": 1820
},
{
"epoch": 0.8826951042171595,
"grad_norm": 2.1731436252593994,
"learning_rate": 2.068251708673777e-06,
"loss": 1.4262,
"step": 1821
},
{
"epoch": 0.8831798351914687,
"grad_norm": 1.3496067523956299,
"learning_rate": 2.051431257023237e-06,
"loss": 1.2465,
"step": 1822
},
{
"epoch": 0.883664566165778,
"grad_norm": 1.3288339376449585,
"learning_rate": 2.0346765559094567e-06,
"loss": 1.0933,
"step": 1823
},
{
"epoch": 0.8841492971400873,
"grad_norm": 1.2372487783432007,
"learning_rate": 2.0179876533367587e-06,
"loss": 1.2749,
"step": 1824
},
{
"epoch": 0.8846340281143965,
"grad_norm": 1.4109827280044556,
"learning_rate": 2.0013645971209527e-06,
"loss": 1.3048,
"step": 1825
},
{
"epoch": 0.8851187590887057,
"grad_norm": 1.4247568845748901,
"learning_rate": 1.984807434889177e-06,
"loss": 1.3601,
"step": 1826
},
{
"epoch": 0.885603490063015,
"grad_norm": 1.3820182085037231,
"learning_rate": 1.9683162140798045e-06,
"loss": 1.288,
"step": 1827
},
{
"epoch": 0.8860882210373243,
"grad_norm": 1.4704811573028564,
"learning_rate": 1.9518909819422336e-06,
"loss": 1.4445,
"step": 1828
},
{
"epoch": 0.8865729520116336,
"grad_norm": 1.5304688215255737,
"learning_rate": 1.935531785536834e-06,
"loss": 1.8307,
"step": 1829
},
{
"epoch": 0.8870576829859428,
"grad_norm": 1.4468873739242554,
"learning_rate": 1.919238671734758e-06,
"loss": 1.3208,
"step": 1830
},
{
"epoch": 0.887542413960252,
"grad_norm": 1.4901260137557983,
"learning_rate": 1.9030116872178316e-06,
"loss": 1.5234,
"step": 1831
},
{
"epoch": 0.8880271449345614,
"grad_norm": 1.4315540790557861,
"learning_rate": 1.886850878478405e-06,
"loss": 1.2287,
"step": 1832
},
{
"epoch": 0.8885118759088706,
"grad_norm": 1.2842153310775757,
"learning_rate": 1.87075629181922e-06,
"loss": 1.4168,
"step": 1833
},
{
"epoch": 0.8889966068831798,
"grad_norm": 1.3464667797088623,
"learning_rate": 1.8547279733533042e-06,
"loss": 1.122,
"step": 1834
},
{
"epoch": 0.8894813378574891,
"grad_norm": 1.4574693441390991,
"learning_rate": 1.8387659690038e-06,
"loss": 1.496,
"step": 1835
},
{
"epoch": 0.8899660688317984,
"grad_norm": 1.374718427658081,
"learning_rate": 1.822870324503867e-06,
"loss": 1.4209,
"step": 1836
},
{
"epoch": 0.8904507998061076,
"grad_norm": 1.4177495241165161,
"learning_rate": 1.8070410853965104e-06,
"loss": 1.4393,
"step": 1837
},
{
"epoch": 0.8909355307804169,
"grad_norm": 1.4920076131820679,
"learning_rate": 1.7912782970345044e-06,
"loss": 1.4351,
"step": 1838
},
{
"epoch": 0.8914202617547261,
"grad_norm": 1.3668889999389648,
"learning_rate": 1.7755820045802145e-06,
"loss": 1.4363,
"step": 1839
},
{
"epoch": 0.8919049927290353,
"grad_norm": 1.2889249324798584,
"learning_rate": 1.7599522530055006e-06,
"loss": 1.335,
"step": 1840
},
{
"epoch": 0.8923897237033447,
"grad_norm": 1.3367114067077637,
"learning_rate": 1.744389087091558e-06,
"loss": 1.4699,
"step": 1841
},
{
"epoch": 0.8928744546776539,
"grad_norm": 1.393117904663086,
"learning_rate": 1.7288925514288262e-06,
"loss": 1.5868,
"step": 1842
},
{
"epoch": 0.8933591856519632,
"grad_norm": 1.229596734046936,
"learning_rate": 1.7134626904168228e-06,
"loss": 1.0758,
"step": 1843
},
{
"epoch": 0.8938439166262724,
"grad_norm": 1.3445252180099487,
"learning_rate": 1.6980995482640373e-06,
"loss": 1.5242,
"step": 1844
},
{
"epoch": 0.8943286476005817,
"grad_norm": 1.4988356828689575,
"learning_rate": 1.682803168987815e-06,
"loss": 1.5926,
"step": 1845
},
{
"epoch": 0.894813378574891,
"grad_norm": 1.3614110946655273,
"learning_rate": 1.6675735964142015e-06,
"loss": 1.3875,
"step": 1846
},
{
"epoch": 0.8952981095492002,
"grad_norm": 1.330135703086853,
"learning_rate": 1.6524108741778372e-06,
"loss": 1.4712,
"step": 1847
},
{
"epoch": 0.8957828405235094,
"grad_norm": 1.4532225131988525,
"learning_rate": 1.6373150457218267e-06,
"loss": 1.6974,
"step": 1848
},
{
"epoch": 0.8962675714978187,
"grad_norm": 1.4383496046066284,
"learning_rate": 1.6222861542976252e-06,
"loss": 1.637,
"step": 1849
},
{
"epoch": 0.896752302472128,
"grad_norm": 1.4017609357833862,
"learning_rate": 1.6073242429648916e-06,
"loss": 1.5166,
"step": 1850
},
{
"epoch": 0.8972370334464372,
"grad_norm": 1.5737212896347046,
"learning_rate": 1.5924293545913876e-06,
"loss": 1.4628,
"step": 1851
},
{
"epoch": 0.8977217644207465,
"grad_norm": 1.3844492435455322,
"learning_rate": 1.5776015318528403e-06,
"loss": 1.2499,
"step": 1852
},
{
"epoch": 0.8982064953950557,
"grad_norm": 1.4061801433563232,
"learning_rate": 1.5628408172328301e-06,
"loss": 1.5193,
"step": 1853
},
{
"epoch": 0.898691226369365,
"grad_norm": 1.3358350992202759,
"learning_rate": 1.5481472530226554e-06,
"loss": 1.1172,
"step": 1854
},
{
"epoch": 0.8991759573436743,
"grad_norm": 1.4957354068756104,
"learning_rate": 1.5335208813212375e-06,
"loss": 1.5576,
"step": 1855
},
{
"epoch": 0.8996606883179835,
"grad_norm": 1.3805766105651855,
"learning_rate": 1.5189617440349635e-06,
"loss": 1.4324,
"step": 1856
},
{
"epoch": 0.9001454192922927,
"grad_norm": 1.4622647762298584,
"learning_rate": 1.5044698828775932e-06,
"loss": 1.2463,
"step": 1857
},
{
"epoch": 0.900630150266602,
"grad_norm": 1.4765523672103882,
"learning_rate": 1.4900453393701358e-06,
"loss": 1.6936,
"step": 1858
},
{
"epoch": 0.9011148812409113,
"grad_norm": 1.4264954328536987,
"learning_rate": 1.4756881548407153e-06,
"loss": 1.4019,
"step": 1859
},
{
"epoch": 0.9015996122152206,
"grad_norm": 1.4087258577346802,
"learning_rate": 1.4613983704244826e-06,
"loss": 1.4478,
"step": 1860
},
{
"epoch": 0.9020843431895298,
"grad_norm": 1.7178231477737427,
"learning_rate": 1.44717602706346e-06,
"loss": 1.3309,
"step": 1861
},
{
"epoch": 0.902569074163839,
"grad_norm": 1.351010799407959,
"learning_rate": 1.4330211655064568e-06,
"loss": 1.3857,
"step": 1862
},
{
"epoch": 0.9030538051381484,
"grad_norm": 1.4305837154388428,
"learning_rate": 1.4189338263089241e-06,
"loss": 1.3635,
"step": 1863
},
{
"epoch": 0.9035385361124576,
"grad_norm": 1.2843518257141113,
"learning_rate": 1.4049140498328728e-06,
"loss": 1.1113,
"step": 1864
},
{
"epoch": 0.9040232670867668,
"grad_norm": 1.3791704177856445,
"learning_rate": 1.3909618762467186e-06,
"loss": 1.1953,
"step": 1865
},
{
"epoch": 0.9045079980610761,
"grad_norm": 1.3165531158447266,
"learning_rate": 1.3770773455251935e-06,
"loss": 1.1486,
"step": 1866
},
{
"epoch": 0.9049927290353854,
"grad_norm": 1.3703712224960327,
"learning_rate": 1.3632604974492257e-06,
"loss": 1.4523,
"step": 1867
},
{
"epoch": 0.9054774600096946,
"grad_norm": 1.28277587890625,
"learning_rate": 1.3495113716058272e-06,
"loss": 1.506,
"step": 1868
},
{
"epoch": 0.9059621909840039,
"grad_norm": 1.3652846813201904,
"learning_rate": 1.335830007387967e-06,
"loss": 1.3893,
"step": 1869
},
{
"epoch": 0.9064469219583131,
"grad_norm": 1.3692935705184937,
"learning_rate": 1.3222164439944811e-06,
"loss": 1.644,
"step": 1870
},
{
"epoch": 0.9069316529326223,
"grad_norm": 1.5364172458648682,
"learning_rate": 1.3086707204299414e-06,
"loss": 1.6484,
"step": 1871
},
{
"epoch": 0.9074163839069317,
"grad_norm": 1.4699918031692505,
"learning_rate": 1.2951928755045417e-06,
"loss": 1.593,
"step": 1872
},
{
"epoch": 0.9079011148812409,
"grad_norm": 1.437369465827942,
"learning_rate": 1.281782947834015e-06,
"loss": 1.5027,
"step": 1873
},
{
"epoch": 0.9083858458555502,
"grad_norm": 1.4166172742843628,
"learning_rate": 1.26844097583948e-06,
"loss": 1.506,
"step": 1874
},
{
"epoch": 0.9088705768298594,
"grad_norm": 1.340541124343872,
"learning_rate": 1.2551669977473813e-06,
"loss": 1.4797,
"step": 1875
},
{
"epoch": 0.9093553078041687,
"grad_norm": 1.5571600198745728,
"learning_rate": 1.241961051589316e-06,
"loss": 1.5784,
"step": 1876
},
{
"epoch": 0.909840038778478,
"grad_norm": 1.496361494064331,
"learning_rate": 1.2288231752019956e-06,
"loss": 1.3185,
"step": 1877
},
{
"epoch": 0.9103247697527872,
"grad_norm": 1.3229130506515503,
"learning_rate": 1.2157534062270798e-06,
"loss": 1.4291,
"step": 1878
},
{
"epoch": 0.9108095007270964,
"grad_norm": 1.5197927951812744,
"learning_rate": 1.2027517821111112e-06,
"loss": 1.3644,
"step": 1879
},
{
"epoch": 0.9112942317014057,
"grad_norm": 1.3992700576782227,
"learning_rate": 1.1898183401053697e-06,
"loss": 1.1666,
"step": 1880
},
{
"epoch": 0.911778962675715,
"grad_norm": 1.4355274438858032,
"learning_rate": 1.1769531172658e-06,
"loss": 1.487,
"step": 1881
},
{
"epoch": 0.9122636936500242,
"grad_norm": 1.310941219329834,
"learning_rate": 1.1641561504528803e-06,
"loss": 1.3596,
"step": 1882
},
{
"epoch": 0.9127484246243335,
"grad_norm": 1.495992660522461,
"learning_rate": 1.1514274763315292e-06,
"loss": 1.8062,
"step": 1883
},
{
"epoch": 0.9132331555986427,
"grad_norm": 2.58510684967041,
"learning_rate": 1.1387671313710075e-06,
"loss": 1.4036,
"step": 1884
},
{
"epoch": 0.913717886572952,
"grad_norm": 1.5952808856964111,
"learning_rate": 1.1261751518447882e-06,
"loss": 1.6931,
"step": 1885
},
{
"epoch": 0.9142026175472613,
"grad_norm": 1.2862507104873657,
"learning_rate": 1.113651573830482e-06,
"loss": 1.1187,
"step": 1886
},
{
"epoch": 0.9146873485215705,
"grad_norm": 1.285225510597229,
"learning_rate": 1.1011964332097114e-06,
"loss": 1.5083,
"step": 1887
},
{
"epoch": 0.9151720794958798,
"grad_norm": 1.5734096765518188,
"learning_rate": 1.0888097656680253e-06,
"loss": 1.646,
"step": 1888
},
{
"epoch": 0.9156568104701891,
"grad_norm": 1.4412444829940796,
"learning_rate": 1.0764916066947794e-06,
"loss": 1.5821,
"step": 1889
},
{
"epoch": 0.9161415414444983,
"grad_norm": 1.3290297985076904,
"learning_rate": 1.0642419915830537e-06,
"loss": 1.1081,
"step": 1890
},
{
"epoch": 0.9166262724188076,
"grad_norm": 1.4718834161758423,
"learning_rate": 1.0520609554295346e-06,
"loss": 1.6698,
"step": 1891
},
{
"epoch": 0.9171110033931168,
"grad_norm": 1.3476266860961914,
"learning_rate": 1.0399485331344273e-06,
"loss": 1.2459,
"step": 1892
},
{
"epoch": 0.917595734367426,
"grad_norm": 1.4318119287490845,
"learning_rate": 1.027904759401338e-06,
"loss": 1.3108,
"step": 1893
},
{
"epoch": 0.9180804653417354,
"grad_norm": 1.476616621017456,
"learning_rate": 1.0159296687372034e-06,
"loss": 1.7684,
"step": 1894
},
{
"epoch": 0.9185651963160446,
"grad_norm": 1.3496558666229248,
"learning_rate": 1.0040232954521557e-06,
"loss": 1.4406,
"step": 1895
},
{
"epoch": 0.9190499272903538,
"grad_norm": 1.3763827085494995,
"learning_rate": 9.92185673659457e-07,
"loss": 1.2898,
"step": 1896
},
{
"epoch": 0.9195346582646631,
"grad_norm": 1.430915117263794,
"learning_rate": 9.804168372753858e-07,
"loss": 1.4238,
"step": 1897
},
{
"epoch": 0.9200193892389724,
"grad_norm": 1.3764996528625488,
"learning_rate": 9.687168200191304e-07,
"loss": 1.5283,
"step": 1898
},
{
"epoch": 0.9205041202132817,
"grad_norm": 1.4979218244552612,
"learning_rate": 9.570856554127205e-07,
"loss": 1.853,
"step": 1899
},
{
"epoch": 0.9209888511875909,
"grad_norm": 1.4959334135055542,
"learning_rate": 9.455233767808991e-07,
"loss": 1.7275,
"step": 1900
},
{
"epoch": 0.9214735821619001,
"grad_norm": 1.2733063697814941,
"learning_rate": 9.340300172510586e-07,
"loss": 1.4255,
"step": 1901
},
{
"epoch": 0.9219583131362094,
"grad_norm": 1.451502799987793,
"learning_rate": 9.226056097531105e-07,
"loss": 1.1018,
"step": 1902
},
{
"epoch": 0.9224430441105187,
"grad_norm": 1.3837565183639526,
"learning_rate": 9.112501870194273e-07,
"loss": 1.7232,
"step": 1903
},
{
"epoch": 0.9229277750848279,
"grad_norm": 1.4380730390548706,
"learning_rate": 8.9996378158472e-07,
"loss": 1.2878,
"step": 1904
},
{
"epoch": 0.9234125060591372,
"grad_norm": 1.2848694324493408,
"learning_rate": 8.887464257859579e-07,
"loss": 1.2934,
"step": 1905
},
{
"epoch": 0.9238972370334464,
"grad_norm": 1.2803257703781128,
"learning_rate": 8.775981517622794e-07,
"loss": 1.297,
"step": 1906
},
{
"epoch": 0.9243819680077557,
"grad_norm": 1.3022807836532593,
"learning_rate": 8.665189914548955e-07,
"loss": 1.2849,
"step": 1907
},
{
"epoch": 0.924866698982065,
"grad_norm": 1.3697919845581055,
"learning_rate": 8.555089766069891e-07,
"loss": 1.3761,
"step": 1908
},
{
"epoch": 0.9253514299563742,
"grad_norm": 1.5067558288574219,
"learning_rate": 8.445681387636406e-07,
"loss": 1.31,
"step": 1909
},
{
"epoch": 0.9258361609306834,
"grad_norm": 1.403795838356018,
"learning_rate": 8.336965092717281e-07,
"loss": 1.4403,
"step": 1910
},
{
"epoch": 0.9263208919049928,
"grad_norm": 1.440237283706665,
"learning_rate": 8.228941192798323e-07,
"loss": 1.7129,
"step": 1911
},
{
"epoch": 0.926805622879302,
"grad_norm": 1.4832885265350342,
"learning_rate": 8.121609997381652e-07,
"loss": 1.5841,
"step": 1912
},
{
"epoch": 0.9272903538536112,
"grad_norm": 1.4952884912490845,
"learning_rate": 8.014971813984611e-07,
"loss": 1.6965,
"step": 1913
},
{
"epoch": 0.9277750848279205,
"grad_norm": 1.588004231452942,
"learning_rate": 7.909026948139081e-07,
"loss": 1.5482,
"step": 1914
},
{
"epoch": 0.9282598158022297,
"grad_norm": 1.387260913848877,
"learning_rate": 7.803775703390359e-07,
"loss": 1.2515,
"step": 1915
},
{
"epoch": 0.9287445467765391,
"grad_norm": 1.292482852935791,
"learning_rate": 7.699218381296531e-07,
"loss": 1.3326,
"step": 1916
},
{
"epoch": 0.9292292777508483,
"grad_norm": 1.427830696105957,
"learning_rate": 7.595355281427435e-07,
"loss": 1.5905,
"step": 1917
},
{
"epoch": 0.9297140087251575,
"grad_norm": 1.3774360418319702,
"learning_rate": 7.492186701364007e-07,
"loss": 1.6201,
"step": 1918
},
{
"epoch": 0.9301987396994668,
"grad_norm": 1.4959042072296143,
"learning_rate": 7.389712936697129e-07,
"loss": 1.4392,
"step": 1919
},
{
"epoch": 0.9306834706737761,
"grad_norm": 1.2562229633331299,
"learning_rate": 7.287934281027114e-07,
"loss": 1.2257,
"step": 1920
},
{
"epoch": 0.9311682016480853,
"grad_norm": 1.3972541093826294,
"learning_rate": 7.186851025962532e-07,
"loss": 1.1484,
"step": 1921
},
{
"epoch": 0.9316529326223946,
"grad_norm": 1.3372113704681396,
"learning_rate": 7.086463461119658e-07,
"loss": 1.2652,
"step": 1922
},
{
"epoch": 0.9321376635967038,
"grad_norm": 1.3388514518737793,
"learning_rate": 6.9867718741215e-07,
"loss": 1.5144,
"step": 1923
},
{
"epoch": 0.932622394571013,
"grad_norm": 1.3194801807403564,
"learning_rate": 6.887776550597025e-07,
"loss": 1.1255,
"step": 1924
},
{
"epoch": 0.9331071255453224,
"grad_norm": 1.4161765575408936,
"learning_rate": 6.789477774180236e-07,
"loss": 1.7097,
"step": 1925
},
{
"epoch": 0.9335918565196316,
"grad_norm": 1.3846325874328613,
"learning_rate": 6.691875826509514e-07,
"loss": 1.445,
"step": 1926
},
{
"epoch": 0.9340765874939408,
"grad_norm": 1.311930537223816,
"learning_rate": 6.59497098722675e-07,
"loss": 1.406,
"step": 1927
},
{
"epoch": 0.9345613184682501,
"grad_norm": 1.3026055097579956,
"learning_rate": 6.498763533976437e-07,
"loss": 1.2652,
"step": 1928
},
{
"epoch": 0.9350460494425594,
"grad_norm": 1.4192198514938354,
"learning_rate": 6.403253742405107e-07,
"loss": 1.6227,
"step": 1929
},
{
"epoch": 0.9355307804168687,
"grad_norm": 1.3232550621032715,
"learning_rate": 6.308441886160254e-07,
"loss": 1.4756,
"step": 1930
},
{
"epoch": 0.9360155113911779,
"grad_norm": 1.3774874210357666,
"learning_rate": 6.214328236889861e-07,
"loss": 1.3581,
"step": 1931
},
{
"epoch": 0.9365002423654871,
"grad_norm": 1.381434679031372,
"learning_rate": 6.120913064241313e-07,
"loss": 1.2108,
"step": 1932
},
{
"epoch": 0.9369849733397965,
"grad_norm": 1.3841496706008911,
"learning_rate": 6.02819663586085e-07,
"loss": 1.5833,
"step": 1933
},
{
"epoch": 0.9374697043141057,
"grad_norm": 1.4273308515548706,
"learning_rate": 5.936179217392673e-07,
"loss": 1.3397,
"step": 1934
},
{
"epoch": 0.9379544352884149,
"grad_norm": 1.4026015996932983,
"learning_rate": 5.844861072478336e-07,
"loss": 1.6483,
"step": 1935
},
{
"epoch": 0.9384391662627242,
"grad_norm": 1.4809978008270264,
"learning_rate": 5.754242462755771e-07,
"loss": 1.4236,
"step": 1936
},
{
"epoch": 0.9389238972370334,
"grad_norm": 1.4197194576263428,
"learning_rate": 5.664323647858655e-07,
"loss": 1.5785,
"step": 1937
},
{
"epoch": 0.9394086282113427,
"grad_norm": 1.4222776889801025,
"learning_rate": 5.575104885415794e-07,
"loss": 1.5917,
"step": 1938
},
{
"epoch": 0.939893359185652,
"grad_norm": 1.4109939336776733,
"learning_rate": 5.486586431050072e-07,
"loss": 1.5329,
"step": 1939
},
{
"epoch": 0.9403780901599612,
"grad_norm": 1.3708782196044922,
"learning_rate": 5.398768538378063e-07,
"loss": 1.1433,
"step": 1940
},
{
"epoch": 0.9408628211342704,
"grad_norm": 1.3859466314315796,
"learning_rate": 5.311651459009054e-07,
"loss": 1.4556,
"step": 1941
},
{
"epoch": 0.9413475521085798,
"grad_norm": 1.3821712732315063,
"learning_rate": 5.225235442544468e-07,
"loss": 1.654,
"step": 1942
},
{
"epoch": 0.941832283082889,
"grad_norm": 1.4499861001968384,
"learning_rate": 5.139520736577058e-07,
"loss": 1.7619,
"step": 1943
},
{
"epoch": 0.9423170140571983,
"grad_norm": 1.6326532363891602,
"learning_rate": 5.05450758669021e-07,
"loss": 1.8829,
"step": 1944
},
{
"epoch": 0.9428017450315075,
"grad_norm": 1.4821033477783203,
"learning_rate": 4.97019623645728e-07,
"loss": 1.3496,
"step": 1945
},
{
"epoch": 0.9432864760058167,
"grad_norm": 1.308059573173523,
"learning_rate": 4.886586927440956e-07,
"loss": 1.3063,
"step": 1946
},
{
"epoch": 0.9437712069801261,
"grad_norm": 1.3683174848556519,
"learning_rate": 4.803679899192392e-07,
"loss": 1.4177,
"step": 1947
},
{
"epoch": 0.9442559379544353,
"grad_norm": 1.246372938156128,
"learning_rate": 4.7214753892506625e-07,
"loss": 1.2904,
"step": 1948
},
{
"epoch": 0.9447406689287445,
"grad_norm": 1.3812105655670166,
"learning_rate": 4.6399736331420305e-07,
"loss": 1.4883,
"step": 1949
},
{
"epoch": 0.9452253999030538,
"grad_norm": 1.4763976335525513,
"learning_rate": 4.559174864379234e-07,
"loss": 1.5377,
"step": 1950
},
{
"epoch": 0.9457101308773631,
"grad_norm": 1.4200667142868042,
"learning_rate": 4.4790793144610097e-07,
"loss": 1.2448,
"step": 1951
},
{
"epoch": 0.9461948618516723,
"grad_norm": 1.5260330438613892,
"learning_rate": 4.399687212871123e-07,
"loss": 1.33,
"step": 1952
},
{
"epoch": 0.9466795928259816,
"grad_norm": 1.5376918315887451,
"learning_rate": 4.320998787077923e-07,
"loss": 1.3794,
"step": 1953
},
{
"epoch": 0.9471643238002908,
"grad_norm": 1.408273696899414,
"learning_rate": 4.243014262533679e-07,
"loss": 1.1804,
"step": 1954
},
{
"epoch": 0.9476490547746,
"grad_norm": 1.395007848739624,
"learning_rate": 4.165733862673854e-07,
"loss": 1.2267,
"step": 1955
},
{
"epoch": 0.9481337857489094,
"grad_norm": 1.4566540718078613,
"learning_rate": 4.0891578089164996e-07,
"loss": 1.2163,
"step": 1956
},
{
"epoch": 0.9486185167232186,
"grad_norm": 1.295602560043335,
"learning_rate": 4.0132863206616965e-07,
"loss": 1.1429,
"step": 1957
},
{
"epoch": 0.9491032476975279,
"grad_norm": 1.410298228263855,
"learning_rate": 3.938119615290753e-07,
"loss": 1.5398,
"step": 1958
},
{
"epoch": 0.9495879786718371,
"grad_norm": 1.3872941732406616,
"learning_rate": 3.8636579081657577e-07,
"loss": 1.1388,
"step": 1959
},
{
"epoch": 0.9500727096461464,
"grad_norm": 1.5268107652664185,
"learning_rate": 3.7899014126288876e-07,
"loss": 1.8655,
"step": 1960
},
{
"epoch": 0.9505574406204557,
"grad_norm": 1.472521424293518,
"learning_rate": 3.716850340001715e-07,
"loss": 1.7315,
"step": 1961
},
{
"epoch": 0.9510421715947649,
"grad_norm": 1.4450392723083496,
"learning_rate": 3.644504899584844e-07,
"loss": 1.3071,
"step": 1962
},
{
"epoch": 0.9515269025690741,
"grad_norm": 1.436614990234375,
"learning_rate": 3.5728652986570245e-07,
"loss": 1.4648,
"step": 1963
},
{
"epoch": 0.9520116335433835,
"grad_norm": 1.3284350633621216,
"learning_rate": 3.5019317424747064e-07,
"loss": 1.1009,
"step": 1964
},
{
"epoch": 0.9524963645176927,
"grad_norm": 1.3217493295669556,
"learning_rate": 3.43170443427146e-07,
"loss": 1.303,
"step": 1965
},
{
"epoch": 0.9529810954920019,
"grad_norm": 1.4290056228637695,
"learning_rate": 3.3621835752573884e-07,
"loss": 1.4079,
"step": 1966
},
{
"epoch": 0.9534658264663112,
"grad_norm": 1.4261441230773926,
"learning_rate": 3.293369364618465e-07,
"loss": 1.241,
"step": 1967
},
{
"epoch": 0.9539505574406204,
"grad_norm": 1.4014172554016113,
"learning_rate": 3.2252619995160885e-07,
"loss": 1.641,
"step": 1968
},
{
"epoch": 0.9544352884149298,
"grad_norm": 1.4024168252944946,
"learning_rate": 3.1578616750863875e-07,
"loss": 1.3255,
"step": 1969
},
{
"epoch": 0.954920019389239,
"grad_norm": 1.462719440460205,
"learning_rate": 3.0911685844398353e-07,
"loss": 1.3231,
"step": 1970
},
{
"epoch": 0.9554047503635482,
"grad_norm": 1.350813865661621,
"learning_rate": 3.025182918660496e-07,
"loss": 1.1891,
"step": 1971
},
{
"epoch": 0.9558894813378574,
"grad_norm": 1.3725783824920654,
"learning_rate": 2.9599048668055853e-07,
"loss": 1.2594,
"step": 1972
},
{
"epoch": 0.9563742123121668,
"grad_norm": 1.4037284851074219,
"learning_rate": 2.8953346159049375e-07,
"loss": 1.3825,
"step": 1973
},
{
"epoch": 0.956858943286476,
"grad_norm": 2.3289217948913574,
"learning_rate": 2.831472350960485e-07,
"loss": 1.4553,
"step": 1974
},
{
"epoch": 0.9573436742607853,
"grad_norm": 1.3470871448516846,
"learning_rate": 2.7683182549456123e-07,
"loss": 1.6123,
"step": 1975
},
{
"epoch": 0.9578284052350945,
"grad_norm": 1.5660077333450317,
"learning_rate": 2.705872508804747e-07,
"loss": 1.5749,
"step": 1976
},
{
"epoch": 0.9583131362094037,
"grad_norm": 1.4341754913330078,
"learning_rate": 2.644135291452854e-07,
"loss": 1.1978,
"step": 1977
},
{
"epoch": 0.9587978671837131,
"grad_norm": 1.4906240701675415,
"learning_rate": 2.5831067797747746e-07,
"loss": 1.6585,
"step": 1978
},
{
"epoch": 0.9592825981580223,
"grad_norm": 1.3474236726760864,
"learning_rate": 2.5227871486249164e-07,
"loss": 1.3896,
"step": 1979
},
{
"epoch": 0.9597673291323315,
"grad_norm": 1.4273277521133423,
"learning_rate": 2.463176570826592e-07,
"loss": 1.5367,
"step": 1980
},
{
"epoch": 0.9602520601066408,
"grad_norm": 1.3798199892044067,
"learning_rate": 2.404275217171625e-07,
"loss": 1.129,
"step": 1981
},
{
"epoch": 0.9607367910809501,
"grad_norm": 1.284221887588501,
"learning_rate": 2.3460832564197455e-07,
"loss": 1.2853,
"step": 1982
},
{
"epoch": 0.9612215220552593,
"grad_norm": 1.366292953491211,
"learning_rate": 2.288600855298306e-07,
"loss": 1.2893,
"step": 1983
},
{
"epoch": 0.9617062530295686,
"grad_norm": 1.3929351568222046,
"learning_rate": 2.2318281785015936e-07,
"loss": 1.5092,
"step": 1984
},
{
"epoch": 0.9621909840038778,
"grad_norm": 1.2522450685501099,
"learning_rate": 2.1757653886904927e-07,
"loss": 1.39,
"step": 1985
},
{
"epoch": 0.9626757149781872,
"grad_norm": 1.3403209447860718,
"learning_rate": 2.120412646491904e-07,
"loss": 1.3051,
"step": 1986
},
{
"epoch": 0.9631604459524964,
"grad_norm": 1.3888131380081177,
"learning_rate": 2.0657701104984384e-07,
"loss": 1.4746,
"step": 1987
},
{
"epoch": 0.9636451769268056,
"grad_norm": 1.4653536081314087,
"learning_rate": 2.0118379372678354e-07,
"loss": 1.8005,
"step": 1988
},
{
"epoch": 0.9641299079011149,
"grad_norm": 1.3622173070907593,
"learning_rate": 1.9586162813225174e-07,
"loss": 1.366,
"step": 1989
},
{
"epoch": 0.9646146388754241,
"grad_norm": 1.3387054204940796,
"learning_rate": 1.9061052951492575e-07,
"loss": 1.4404,
"step": 1990
},
{
"epoch": 0.9650993698497334,
"grad_norm": 1.361503005027771,
"learning_rate": 1.8543051291986247e-07,
"loss": 1.4713,
"step": 1991
},
{
"epoch": 0.9655841008240427,
"grad_norm": 1.3547799587249756,
"learning_rate": 1.80321593188465e-07,
"loss": 1.2779,
"step": 1992
},
{
"epoch": 0.9660688317983519,
"grad_norm": 1.6397618055343628,
"learning_rate": 1.7528378495842435e-07,
"loss": 1.4227,
"step": 1993
},
{
"epoch": 0.9665535627726611,
"grad_norm": 1.4830430746078491,
"learning_rate": 1.7031710266370016e-07,
"loss": 1.5416,
"step": 1994
},
{
"epoch": 0.9670382937469705,
"grad_norm": 1.4233936071395874,
"learning_rate": 1.6542156053446223e-07,
"loss": 1.2741,
"step": 1995
},
{
"epoch": 0.9675230247212797,
"grad_norm": 1.3970363140106201,
"learning_rate": 1.6059717259705175e-07,
"loss": 1.2843,
"step": 1996
},
{
"epoch": 0.968007755695589,
"grad_norm": 1.4266377687454224,
"learning_rate": 1.5584395267394802e-07,
"loss": 1.2624,
"step": 1997
},
{
"epoch": 0.9684924866698982,
"grad_norm": 1.3139270544052124,
"learning_rate": 1.5116191438372394e-07,
"loss": 1.3138,
"step": 1998
},
{
"epoch": 0.9689772176442074,
"grad_norm": 1.3843278884887695,
"learning_rate": 1.4655107114101007e-07,
"loss": 1.3294,
"step": 1999
},
{
"epoch": 0.9694619486185168,
"grad_norm": 1.2902089357376099,
"learning_rate": 1.4201143615645006e-07,
"loss": 1.3585,
"step": 2000
},
{
"epoch": 0.969946679592826,
"grad_norm": 1.469571590423584,
"learning_rate": 1.3754302243667304e-07,
"loss": 1.4797,
"step": 2001
},
{
"epoch": 0.9704314105671352,
"grad_norm": 1.3052209615707397,
"learning_rate": 1.331458427842408e-07,
"loss": 1.3906,
"step": 2002
},
{
"epoch": 0.9709161415414445,
"grad_norm": 1.5683739185333252,
"learning_rate": 1.2881990979763393e-07,
"loss": 1.5129,
"step": 2003
},
{
"epoch": 0.9714008725157538,
"grad_norm": 1.4021413326263428,
"learning_rate": 1.2456523587118517e-07,
"loss": 1.2832,
"step": 2004
},
{
"epoch": 0.971885603490063,
"grad_norm": 1.4269837141036987,
"learning_rate": 1.2038183319507955e-07,
"loss": 1.3047,
"step": 2005
},
{
"epoch": 0.9723703344643723,
"grad_norm": 1.4196062088012695,
"learning_rate": 1.1626971375528484e-07,
"loss": 1.2863,
"step": 2006
},
{
"epoch": 0.9728550654386815,
"grad_norm": 1.4747346639633179,
"learning_rate": 1.1222888933354602e-07,
"loss": 1.4186,
"step": 2007
},
{
"epoch": 0.9733397964129908,
"grad_norm": 1.3612735271453857,
"learning_rate": 1.0825937150732989e-07,
"loss": 1.5173,
"step": 2008
},
{
"epoch": 0.9738245273873001,
"grad_norm": 1.586757779121399,
"learning_rate": 1.043611716498083e-07,
"loss": 1.3961,
"step": 2009
},
{
"epoch": 0.9743092583616093,
"grad_norm": 1.2929972410202026,
"learning_rate": 1.0053430092981097e-07,
"loss": 1.0006,
"step": 2010
},
{
"epoch": 0.9747939893359185,
"grad_norm": 1.3124991655349731,
"learning_rate": 9.677877031180615e-08,
"loss": 1.1774,
"step": 2011
},
{
"epoch": 0.9752787203102278,
"grad_norm": 1.4618499279022217,
"learning_rate": 9.30945905558589e-08,
"loss": 1.5174,
"step": 2012
},
{
"epoch": 0.9757634512845371,
"grad_norm": 1.536896824836731,
"learning_rate": 8.948177221760889e-08,
"loss": 2.057,
"step": 2013
},
{
"epoch": 0.9762481822588464,
"grad_norm": 1.474232792854309,
"learning_rate": 8.594032564823717e-08,
"loss": 1.3491,
"step": 2014
},
{
"epoch": 0.9767329132331556,
"grad_norm": 1.3744844198226929,
"learning_rate": 8.247026099443277e-08,
"loss": 1.4189,
"step": 2015
},
{
"epoch": 0.9772176442074648,
"grad_norm": 1.3619568347930908,
"learning_rate": 7.907158819836503e-08,
"loss": 1.4581,
"step": 2016
},
{
"epoch": 0.9777023751817742,
"grad_norm": 1.3401083946228027,
"learning_rate": 7.574431699766127e-08,
"loss": 1.523,
"step": 2017
},
{
"epoch": 0.9781871061560834,
"grad_norm": 1.4377765655517578,
"learning_rate": 7.248845692537088e-08,
"loss": 1.4517,
"step": 2018
},
{
"epoch": 0.9786718371303926,
"grad_norm": 1.4789650440216064,
"learning_rate": 6.930401730994573e-08,
"loss": 1.5122,
"step": 2019
},
{
"epoch": 0.9791565681047019,
"grad_norm": 1.4197337627410889,
"learning_rate": 6.619100727520422e-08,
"loss": 1.2408,
"step": 2020
},
{
"epoch": 0.9796412990790111,
"grad_norm": 1.294545292854309,
"learning_rate": 6.314943574030896e-08,
"loss": 1.4684,
"step": 2021
},
{
"epoch": 0.9801260300533204,
"grad_norm": 1.500753402709961,
"learning_rate": 6.01793114197502e-08,
"loss": 1.6566,
"step": 2022
},
{
"epoch": 0.9806107610276297,
"grad_norm": 1.4268614053726196,
"learning_rate": 5.728064282330137e-08,
"loss": 1.4571,
"step": 2023
},
{
"epoch": 0.9810954920019389,
"grad_norm": 1.3109159469604492,
"learning_rate": 5.4453438256019115e-08,
"loss": 1.17,
"step": 2024
},
{
"epoch": 0.9815802229762481,
"grad_norm": 1.3804876804351807,
"learning_rate": 5.169770581819888e-08,
"loss": 1.3375,
"step": 2025
},
{
"epoch": 0.9820649539505575,
"grad_norm": 1.758589267730713,
"learning_rate": 4.901345340535824e-08,
"loss": 1.6656,
"step": 2026
},
{
"epoch": 0.9825496849248667,
"grad_norm": 1.500661015510559,
"learning_rate": 4.6400688708217455e-08,
"loss": 1.522,
"step": 2027
},
{
"epoch": 0.983034415899176,
"grad_norm": 1.3010058403015137,
"learning_rate": 4.385941921268011e-08,
"loss": 1.075,
"step": 2028
},
{
"epoch": 0.9835191468734852,
"grad_norm": 1.2376115322113037,
"learning_rate": 4.138965219979973e-08,
"loss": 1.0122,
"step": 2029
},
{
"epoch": 0.9840038778477945,
"grad_norm": 1.4325613975524902,
"learning_rate": 3.8991394745771516e-08,
"loss": 1.5664,
"step": 2030
},
{
"epoch": 0.9844886088221038,
"grad_norm": 1.3779929876327515,
"learning_rate": 3.666465372190453e-08,
"loss": 1.2317,
"step": 2031
},
{
"epoch": 0.984973339796413,
"grad_norm": 1.4506980180740356,
"learning_rate": 3.440943579460232e-08,
"loss": 1.3087,
"step": 2032
},
{
"epoch": 0.9854580707707222,
"grad_norm": 1.5038971900939941,
"learning_rate": 3.2225747425351785e-08,
"loss": 1.6422,
"step": 2033
},
{
"epoch": 0.9859428017450315,
"grad_norm": 1.4041410684585571,
"learning_rate": 3.011359487068987e-08,
"loss": 1.5016,
"step": 2034
},
{
"epoch": 0.9864275327193408,
"grad_norm": 1.475483775138855,
"learning_rate": 2.807298418220361e-08,
"loss": 1.5435,
"step": 2035
},
{
"epoch": 0.98691226369365,
"grad_norm": 1.4113868474960327,
"learning_rate": 2.6103921206499517e-08,
"loss": 1.5201,
"step": 2036
},
{
"epoch": 0.9873969946679593,
"grad_norm": 1.4753867387771606,
"learning_rate": 2.4206411585186996e-08,
"loss": 1.6761,
"step": 2037
},
{
"epoch": 0.9878817256422685,
"grad_norm": 1.380028247833252,
"learning_rate": 2.2380460754875544e-08,
"loss": 1.2831,
"step": 2038
},
{
"epoch": 0.9883664566165778,
"grad_norm": 1.2903367280960083,
"learning_rate": 2.0626073947138668e-08,
"loss": 1.235,
"step": 2039
},
{
"epoch": 0.9888511875908871,
"grad_norm": 1.7924153804779053,
"learning_rate": 1.8943256188516667e-08,
"loss": 1.4112,
"step": 2040
},
{
"epoch": 0.9893359185651963,
"grad_norm": 1.4166916608810425,
"learning_rate": 1.7332012300494417e-08,
"loss": 1.651,
"step": 2041
},
{
"epoch": 0.9898206495395055,
"grad_norm": 1.2969902753829956,
"learning_rate": 1.5792346899490275e-08,
"loss": 1.2091,
"step": 2042
},
{
"epoch": 0.9903053805138148,
"grad_norm": 1.426344633102417,
"learning_rate": 1.4324264396836651e-08,
"loss": 1.4629,
"step": 2043
},
{
"epoch": 0.9907901114881241,
"grad_norm": 1.4453415870666504,
"learning_rate": 1.2927768998774458e-08,
"loss": 1.4487,
"step": 2044
},
{
"epoch": 0.9912748424624334,
"grad_norm": 1.3389601707458496,
"learning_rate": 1.1602864706442008e-08,
"loss": 1.2715,
"step": 2045
},
{
"epoch": 0.9917595734367426,
"grad_norm": 1.382049560546875,
"learning_rate": 1.0349555315855574e-08,
"loss": 1.3052,
"step": 2046
},
{
"epoch": 0.9922443044110518,
"grad_norm": 1.3508886098861694,
"learning_rate": 9.167844417901083e-09,
"loss": 1.4453,
"step": 2047
},
{
"epoch": 0.9927290353853612,
"grad_norm": 1.3657429218292236,
"learning_rate": 8.057735398331324e-09,
"loss": 1.4893,
"step": 2048
},
{
"epoch": 0.9932137663596704,
"grad_norm": 1.395776629447937,
"learning_rate": 7.0192314377520715e-09,
"loss": 1.283,
"step": 2049
},
{
"epoch": 0.9936984973339796,
"grad_norm": 1.4177159070968628,
"learning_rate": 6.052335511599893e-09,
"loss": 1.2549,
"step": 2050
},
{
"epoch": 0.9941832283082889,
"grad_norm": 1.4473752975463867,
"learning_rate": 5.1570503901587905e-09,
"loss": 1.49,
"step": 2051
},
{
"epoch": 0.9946679592825982,
"grad_norm": 1.3230009078979492,
"learning_rate": 4.333378638532448e-09,
"loss": 1.7049,
"step": 2052
},
{
"epoch": 0.9951526902569074,
"grad_norm": 1.2757775783538818,
"learning_rate": 3.581322616641458e-09,
"loss": 1.1337,
"step": 2053
},
{
"epoch": 0.9956374212312167,
"grad_norm": 1.4444496631622314,
"learning_rate": 2.9008844792260957e-09,
"loss": 1.6056,
"step": 2054
},
{
"epoch": 0.9961221522055259,
"grad_norm": 1.566375970840454,
"learning_rate": 2.292066175821339e-09,
"loss": 1.3957,
"step": 2055
},
{
"epoch": 0.9966068831798351,
"grad_norm": 1.4305633306503296,
"learning_rate": 1.754869450773522e-09,
"loss": 1.4048,
"step": 2056
},
{
"epoch": 0.9970916141541445,
"grad_norm": 1.446074366569519,
"learning_rate": 1.2892958432153546e-09,
"loss": 1.6426,
"step": 2057
},
{
"epoch": 0.9975763451284537,
"grad_norm": 1.418178915977478,
"learning_rate": 8.953466870742499e-10,
"loss": 1.2807,
"step": 2058
},
{
"epoch": 0.998061076102763,
"grad_norm": 1.4988263845443726,
"learning_rate": 5.730231110639972e-10,
"loss": 1.61,
"step": 2059
},
{
"epoch": 0.9985458070770722,
"grad_norm": 1.3278765678405762,
"learning_rate": 3.2232603868476153e-10,
"loss": 1.1255,
"step": 2060
},
{
"epoch": 0.9990305380513815,
"grad_norm": 1.492288589477539,
"learning_rate": 1.4325618821198207e-10,
"loss": 1.5154,
"step": 2061
},
{
"epoch": 0.9995152690256908,
"grad_norm": 1.3655840158462524,
"learning_rate": 3.5814072707474054e-11,
"loss": 1.4809,
"step": 2062
},
{
"epoch": 1.0,
"grad_norm": 1.489664912223816,
"learning_rate": 0.0,
"loss": 1.4304,
"step": 2063
}
],
"logging_steps": 1,
"max_steps": 2063,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8633303665737728.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}