{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.994882292732855,
"eval_steps": 500,
"global_step": 2440,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020470829068577278,
"grad_norm": 5.914718943943769,
"learning_rate": 1.639344262295082e-07,
"loss": 0.89,
"step": 1
},
{
"epoch": 0.0040941658137154556,
"grad_norm": 5.711215790032282,
"learning_rate": 3.278688524590164e-07,
"loss": 0.8602,
"step": 2
},
{
"epoch": 0.006141248720573183,
"grad_norm": 6.070030223088026,
"learning_rate": 4.918032786885246e-07,
"loss": 0.8902,
"step": 3
},
{
"epoch": 0.008188331627430911,
"grad_norm": 5.739027819285582,
"learning_rate": 6.557377049180328e-07,
"loss": 0.9194,
"step": 4
},
{
"epoch": 0.01023541453428864,
"grad_norm": 5.558991142229951,
"learning_rate": 8.196721311475409e-07,
"loss": 0.8523,
"step": 5
},
{
"epoch": 0.012282497441146366,
"grad_norm": 5.622988925862499,
"learning_rate": 9.836065573770493e-07,
"loss": 0.9102,
"step": 6
},
{
"epoch": 0.014329580348004094,
"grad_norm": 5.360893456655671,
"learning_rate": 1.1475409836065575e-06,
"loss": 0.8682,
"step": 7
},
{
"epoch": 0.016376663254861822,
"grad_norm": 5.095538528802894,
"learning_rate": 1.3114754098360657e-06,
"loss": 0.8689,
"step": 8
},
{
"epoch": 0.01842374616171955,
"grad_norm": 4.59455985194371,
"learning_rate": 1.4754098360655739e-06,
"loss": 0.852,
"step": 9
},
{
"epoch": 0.02047082906857728,
"grad_norm": 4.139437078984008,
"learning_rate": 1.6393442622950819e-06,
"loss": 0.8102,
"step": 10
},
{
"epoch": 0.022517911975435005,
"grad_norm": 4.171848417796509,
"learning_rate": 1.8032786885245903e-06,
"loss": 0.8102,
"step": 11
},
{
"epoch": 0.02456499488229273,
"grad_norm": 2.4194756147420646,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.7481,
"step": 12
},
{
"epoch": 0.02661207778915046,
"grad_norm": 2.28600629836227,
"learning_rate": 2.1311475409836067e-06,
"loss": 0.7631,
"step": 13
},
{
"epoch": 0.028659160696008188,
"grad_norm": 2.075915465619067,
"learning_rate": 2.295081967213115e-06,
"loss": 0.7506,
"step": 14
},
{
"epoch": 0.030706243602865915,
"grad_norm": 1.952947905310399,
"learning_rate": 2.459016393442623e-06,
"loss": 0.7781,
"step": 15
},
{
"epoch": 0.032753326509723645,
"grad_norm": 3.070659688365803,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.7488,
"step": 16
},
{
"epoch": 0.03480040941658137,
"grad_norm": 3.5264876012722652,
"learning_rate": 2.786885245901639e-06,
"loss": 0.7746,
"step": 17
},
{
"epoch": 0.0368474923234391,
"grad_norm": 3.580605082704591,
"learning_rate": 2.9508196721311478e-06,
"loss": 0.7618,
"step": 18
},
{
"epoch": 0.038894575230296824,
"grad_norm": 3.194230800018389,
"learning_rate": 3.114754098360656e-06,
"loss": 0.7207,
"step": 19
},
{
"epoch": 0.04094165813715456,
"grad_norm": 3.116544112683001,
"learning_rate": 3.2786885245901638e-06,
"loss": 0.7352,
"step": 20
},
{
"epoch": 0.042988741044012284,
"grad_norm": 2.6314247686078094,
"learning_rate": 3.4426229508196724e-06,
"loss": 0.7432,
"step": 21
},
{
"epoch": 0.04503582395087001,
"grad_norm": 2.2137046535068037,
"learning_rate": 3.6065573770491806e-06,
"loss": 0.7444,
"step": 22
},
{
"epoch": 0.04708290685772774,
"grad_norm": 1.6649733496725676,
"learning_rate": 3.7704918032786884e-06,
"loss": 0.6646,
"step": 23
},
{
"epoch": 0.04912998976458546,
"grad_norm": 1.3074947984675702,
"learning_rate": 3.934426229508197e-06,
"loss": 0.6388,
"step": 24
},
{
"epoch": 0.0511770726714432,
"grad_norm": 1.190645815657471,
"learning_rate": 4.098360655737705e-06,
"loss": 0.6575,
"step": 25
},
{
"epoch": 0.05322415557830092,
"grad_norm": 1.2707845894729972,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.671,
"step": 26
},
{
"epoch": 0.05527123848515865,
"grad_norm": 1.4412640499522065,
"learning_rate": 4.426229508196722e-06,
"loss": 0.6526,
"step": 27
},
{
"epoch": 0.057318321392016376,
"grad_norm": 1.3167145329441712,
"learning_rate": 4.59016393442623e-06,
"loss": 0.6533,
"step": 28
},
{
"epoch": 0.0593654042988741,
"grad_norm": 1.338399336461372,
"learning_rate": 4.754098360655738e-06,
"loss": 0.6525,
"step": 29
},
{
"epoch": 0.06141248720573183,
"grad_norm": 1.1005314901400522,
"learning_rate": 4.918032786885246e-06,
"loss": 0.6513,
"step": 30
},
{
"epoch": 0.06345957011258956,
"grad_norm": 1.0341365252216648,
"learning_rate": 5.0819672131147545e-06,
"loss": 0.6517,
"step": 31
},
{
"epoch": 0.06550665301944729,
"grad_norm": 0.8191371315934305,
"learning_rate": 5.245901639344263e-06,
"loss": 0.6562,
"step": 32
},
{
"epoch": 0.06755373592630501,
"grad_norm": 1.0105078954121915,
"learning_rate": 5.409836065573772e-06,
"loss": 0.6318,
"step": 33
},
{
"epoch": 0.06960081883316274,
"grad_norm": 1.083317612523659,
"learning_rate": 5.573770491803278e-06,
"loss": 0.6528,
"step": 34
},
{
"epoch": 0.07164790174002048,
"grad_norm": 1.054453013530755,
"learning_rate": 5.737704918032787e-06,
"loss": 0.6233,
"step": 35
},
{
"epoch": 0.0736949846468782,
"grad_norm": 0.693669694861969,
"learning_rate": 5.9016393442622956e-06,
"loss": 0.5959,
"step": 36
},
{
"epoch": 0.07574206755373593,
"grad_norm": 0.6429960279803676,
"learning_rate": 6.065573770491804e-06,
"loss": 0.6031,
"step": 37
},
{
"epoch": 0.07778915046059365,
"grad_norm": 0.8326921183964671,
"learning_rate": 6.229508196721312e-06,
"loss": 0.6252,
"step": 38
},
{
"epoch": 0.07983623336745138,
"grad_norm": 0.7826089791231328,
"learning_rate": 6.393442622950821e-06,
"loss": 0.5891,
"step": 39
},
{
"epoch": 0.08188331627430911,
"grad_norm": 0.6837753928275052,
"learning_rate": 6.5573770491803276e-06,
"loss": 0.6151,
"step": 40
},
{
"epoch": 0.08393039918116683,
"grad_norm": 0.557265922988386,
"learning_rate": 6.721311475409837e-06,
"loss": 0.5832,
"step": 41
},
{
"epoch": 0.08597748208802457,
"grad_norm": 0.697548950763872,
"learning_rate": 6.885245901639345e-06,
"loss": 0.629,
"step": 42
},
{
"epoch": 0.08802456499488229,
"grad_norm": 0.7262822585766031,
"learning_rate": 7.049180327868853e-06,
"loss": 0.5531,
"step": 43
},
{
"epoch": 0.09007164790174002,
"grad_norm": 0.7701295257257081,
"learning_rate": 7.213114754098361e-06,
"loss": 0.612,
"step": 44
},
{
"epoch": 0.09211873080859775,
"grad_norm": 0.4681801446847226,
"learning_rate": 7.3770491803278695e-06,
"loss": 0.5675,
"step": 45
},
{
"epoch": 0.09416581371545547,
"grad_norm": 0.5678252337287879,
"learning_rate": 7.540983606557377e-06,
"loss": 0.6391,
"step": 46
},
{
"epoch": 0.09621289662231321,
"grad_norm": 0.5704556041531674,
"learning_rate": 7.704918032786886e-06,
"loss": 0.5773,
"step": 47
},
{
"epoch": 0.09825997952917093,
"grad_norm": 0.5739689196908508,
"learning_rate": 7.868852459016394e-06,
"loss": 0.5928,
"step": 48
},
{
"epoch": 0.10030706243602866,
"grad_norm": 0.5604483444656064,
"learning_rate": 8.032786885245902e-06,
"loss": 0.5747,
"step": 49
},
{
"epoch": 0.1023541453428864,
"grad_norm": 0.5266605058896955,
"learning_rate": 8.19672131147541e-06,
"loss": 0.5506,
"step": 50
},
{
"epoch": 0.10440122824974411,
"grad_norm": 0.6855806410785544,
"learning_rate": 8.360655737704919e-06,
"loss": 0.5974,
"step": 51
},
{
"epoch": 0.10644831115660185,
"grad_norm": 0.6492542095140276,
"learning_rate": 8.524590163934427e-06,
"loss": 0.6027,
"step": 52
},
{
"epoch": 0.10849539406345957,
"grad_norm": 0.5702781382301559,
"learning_rate": 8.688524590163935e-06,
"loss": 0.5997,
"step": 53
},
{
"epoch": 0.1105424769703173,
"grad_norm": 0.5828822136164438,
"learning_rate": 8.852459016393443e-06,
"loss": 0.5633,
"step": 54
},
{
"epoch": 0.11258955987717502,
"grad_norm": 0.5923389847320444,
"learning_rate": 9.016393442622952e-06,
"loss": 0.5842,
"step": 55
},
{
"epoch": 0.11463664278403275,
"grad_norm": 0.5636209619702331,
"learning_rate": 9.18032786885246e-06,
"loss": 0.5645,
"step": 56
},
{
"epoch": 0.11668372569089049,
"grad_norm": 0.4912175098948749,
"learning_rate": 9.344262295081968e-06,
"loss": 0.5635,
"step": 57
},
{
"epoch": 0.1187308085977482,
"grad_norm": 0.754994674395481,
"learning_rate": 9.508196721311476e-06,
"loss": 0.5658,
"step": 58
},
{
"epoch": 0.12077789150460594,
"grad_norm": 0.5456693243174974,
"learning_rate": 9.672131147540984e-06,
"loss": 0.5559,
"step": 59
},
{
"epoch": 0.12282497441146366,
"grad_norm": 0.5877326554557568,
"learning_rate": 9.836065573770493e-06,
"loss": 0.5755,
"step": 60
},
{
"epoch": 0.12487205731832139,
"grad_norm": 0.4917195015950769,
"learning_rate": 1e-05,
"loss": 0.5422,
"step": 61
},
{
"epoch": 0.1269191402251791,
"grad_norm": 0.5000347167895204,
"learning_rate": 1.0163934426229509e-05,
"loss": 0.5638,
"step": 62
},
{
"epoch": 0.12896622313203684,
"grad_norm": 0.5165061332900804,
"learning_rate": 1.0327868852459017e-05,
"loss": 0.5874,
"step": 63
},
{
"epoch": 0.13101330603889458,
"grad_norm": 0.5647596683341366,
"learning_rate": 1.0491803278688525e-05,
"loss": 0.5549,
"step": 64
},
{
"epoch": 0.1330603889457523,
"grad_norm": 0.556985780595292,
"learning_rate": 1.0655737704918034e-05,
"loss": 0.5287,
"step": 65
},
{
"epoch": 0.13510747185261002,
"grad_norm": 0.5870046275772329,
"learning_rate": 1.0819672131147544e-05,
"loss": 0.5726,
"step": 66
},
{
"epoch": 0.13715455475946775,
"grad_norm": 0.510016021993097,
"learning_rate": 1.0983606557377052e-05,
"loss": 0.641,
"step": 67
},
{
"epoch": 0.13920163766632548,
"grad_norm": 0.5757122682850916,
"learning_rate": 1.1147540983606557e-05,
"loss": 0.5493,
"step": 68
},
{
"epoch": 0.14124872057318322,
"grad_norm": 0.5572446165702184,
"learning_rate": 1.1311475409836066e-05,
"loss": 0.5231,
"step": 69
},
{
"epoch": 0.14329580348004095,
"grad_norm": 0.5000459199001162,
"learning_rate": 1.1475409836065575e-05,
"loss": 0.5221,
"step": 70
},
{
"epoch": 0.14534288638689866,
"grad_norm": 0.49933135856571004,
"learning_rate": 1.1639344262295083e-05,
"loss": 0.5254,
"step": 71
},
{
"epoch": 0.1473899692937564,
"grad_norm": 0.542314402205445,
"learning_rate": 1.1803278688524591e-05,
"loss": 0.5855,
"step": 72
},
{
"epoch": 0.14943705220061412,
"grad_norm": 0.4908515642706825,
"learning_rate": 1.19672131147541e-05,
"loss": 0.5277,
"step": 73
},
{
"epoch": 0.15148413510747186,
"grad_norm": 0.5541036948715271,
"learning_rate": 1.2131147540983608e-05,
"loss": 0.5226,
"step": 74
},
{
"epoch": 0.1535312180143296,
"grad_norm": 0.5925205612588014,
"learning_rate": 1.2295081967213116e-05,
"loss": 0.5588,
"step": 75
},
{
"epoch": 0.1555783009211873,
"grad_norm": 0.5472033034034951,
"learning_rate": 1.2459016393442624e-05,
"loss": 0.5601,
"step": 76
},
{
"epoch": 0.15762538382804503,
"grad_norm": 0.5831899459875634,
"learning_rate": 1.2622950819672132e-05,
"loss": 0.5208,
"step": 77
},
{
"epoch": 0.15967246673490276,
"grad_norm": 0.6725600970137904,
"learning_rate": 1.2786885245901642e-05,
"loss": 0.5646,
"step": 78
},
{
"epoch": 0.1617195496417605,
"grad_norm": 0.4997581515870628,
"learning_rate": 1.295081967213115e-05,
"loss": 0.5345,
"step": 79
},
{
"epoch": 0.16376663254861823,
"grad_norm": 0.6060573387394406,
"learning_rate": 1.3114754098360655e-05,
"loss": 0.5516,
"step": 80
},
{
"epoch": 0.16581371545547594,
"grad_norm": 0.5707723342314415,
"learning_rate": 1.3278688524590165e-05,
"loss": 0.5494,
"step": 81
},
{
"epoch": 0.16786079836233367,
"grad_norm": 0.5820968760684135,
"learning_rate": 1.3442622950819673e-05,
"loss": 0.5374,
"step": 82
},
{
"epoch": 0.1699078812691914,
"grad_norm": 0.801732853136766,
"learning_rate": 1.3606557377049181e-05,
"loss": 0.546,
"step": 83
},
{
"epoch": 0.17195496417604914,
"grad_norm": 0.5288994895334571,
"learning_rate": 1.377049180327869e-05,
"loss": 0.5761,
"step": 84
},
{
"epoch": 0.17400204708290687,
"grad_norm": 0.7883330207931984,
"learning_rate": 1.3934426229508198e-05,
"loss": 0.5326,
"step": 85
},
{
"epoch": 0.17604912998976457,
"grad_norm": 0.4807106883512578,
"learning_rate": 1.4098360655737706e-05,
"loss": 0.5311,
"step": 86
},
{
"epoch": 0.1780962128966223,
"grad_norm": 0.6954942427041093,
"learning_rate": 1.4262295081967214e-05,
"loss": 0.5206,
"step": 87
},
{
"epoch": 0.18014329580348004,
"grad_norm": 0.5706344929187627,
"learning_rate": 1.4426229508196722e-05,
"loss": 0.5575,
"step": 88
},
{
"epoch": 0.18219037871033777,
"grad_norm": 0.5660731771723676,
"learning_rate": 1.459016393442623e-05,
"loss": 0.5243,
"step": 89
},
{
"epoch": 0.1842374616171955,
"grad_norm": 0.7077259968257474,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.5375,
"step": 90
},
{
"epoch": 0.1862845445240532,
"grad_norm": 0.5640871854454458,
"learning_rate": 1.4918032786885249e-05,
"loss": 0.5678,
"step": 91
},
{
"epoch": 0.18833162743091095,
"grad_norm": 0.5686460669441292,
"learning_rate": 1.5081967213114754e-05,
"loss": 0.5317,
"step": 92
},
{
"epoch": 0.19037871033776868,
"grad_norm": 0.5667037000506248,
"learning_rate": 1.5245901639344264e-05,
"loss": 0.5208,
"step": 93
},
{
"epoch": 0.19242579324462641,
"grad_norm": 0.6528802562895782,
"learning_rate": 1.5409836065573772e-05,
"loss": 0.5538,
"step": 94
},
{
"epoch": 0.19447287615148415,
"grad_norm": 0.6104791131678972,
"learning_rate": 1.5573770491803278e-05,
"loss": 0.4945,
"step": 95
},
{
"epoch": 0.19651995905834185,
"grad_norm": 0.770297526442582,
"learning_rate": 1.5737704918032788e-05,
"loss": 0.5542,
"step": 96
},
{
"epoch": 0.1985670419651996,
"grad_norm": 0.639855439582837,
"learning_rate": 1.5901639344262295e-05,
"loss": 0.5604,
"step": 97
},
{
"epoch": 0.20061412487205732,
"grad_norm": 0.6516313518294781,
"learning_rate": 1.6065573770491805e-05,
"loss": 0.492,
"step": 98
},
{
"epoch": 0.20266120777891505,
"grad_norm": 0.7323248049827676,
"learning_rate": 1.6229508196721314e-05,
"loss": 0.5561,
"step": 99
},
{
"epoch": 0.2047082906857728,
"grad_norm": 0.614575250954447,
"learning_rate": 1.639344262295082e-05,
"loss": 0.5319,
"step": 100
},
{
"epoch": 0.2067553735926305,
"grad_norm": 0.6867103983793634,
"learning_rate": 1.655737704918033e-05,
"loss": 0.5087,
"step": 101
},
{
"epoch": 0.20880245649948823,
"grad_norm": 0.5125936025327898,
"learning_rate": 1.6721311475409837e-05,
"loss": 0.5187,
"step": 102
},
{
"epoch": 0.21084953940634596,
"grad_norm": 0.7791039154279353,
"learning_rate": 1.6885245901639347e-05,
"loss": 0.5831,
"step": 103
},
{
"epoch": 0.2128966223132037,
"grad_norm": 0.5610382412593208,
"learning_rate": 1.7049180327868854e-05,
"loss": 0.5979,
"step": 104
},
{
"epoch": 0.21494370522006143,
"grad_norm": 0.6872141481595387,
"learning_rate": 1.721311475409836e-05,
"loss": 0.5337,
"step": 105
},
{
"epoch": 0.21699078812691913,
"grad_norm": 0.621726895260148,
"learning_rate": 1.737704918032787e-05,
"loss": 0.5014,
"step": 106
},
{
"epoch": 0.21903787103377687,
"grad_norm": 0.6593450481678657,
"learning_rate": 1.7540983606557377e-05,
"loss": 0.5298,
"step": 107
},
{
"epoch": 0.2210849539406346,
"grad_norm": 0.6085504680048398,
"learning_rate": 1.7704918032786887e-05,
"loss": 0.5597,
"step": 108
},
{
"epoch": 0.22313203684749233,
"grad_norm": 0.5818331349760811,
"learning_rate": 1.7868852459016393e-05,
"loss": 0.5671,
"step": 109
},
{
"epoch": 0.22517911975435004,
"grad_norm": 0.5705507858388099,
"learning_rate": 1.8032786885245903e-05,
"loss": 0.5608,
"step": 110
},
{
"epoch": 0.22722620266120777,
"grad_norm": 0.566454711636141,
"learning_rate": 1.8196721311475413e-05,
"loss": 0.536,
"step": 111
},
{
"epoch": 0.2292732855680655,
"grad_norm": 0.6584736492952961,
"learning_rate": 1.836065573770492e-05,
"loss": 0.5107,
"step": 112
},
{
"epoch": 0.23132036847492324,
"grad_norm": 0.5431857503415616,
"learning_rate": 1.852459016393443e-05,
"loss": 0.5216,
"step": 113
},
{
"epoch": 0.23336745138178097,
"grad_norm": 0.7429344164092465,
"learning_rate": 1.8688524590163936e-05,
"loss": 0.5673,
"step": 114
},
{
"epoch": 0.23541453428863868,
"grad_norm": 0.6303478113244917,
"learning_rate": 1.8852459016393446e-05,
"loss": 0.5173,
"step": 115
},
{
"epoch": 0.2374616171954964,
"grad_norm": 0.6632339341308846,
"learning_rate": 1.9016393442622952e-05,
"loss": 0.526,
"step": 116
},
{
"epoch": 0.23950870010235414,
"grad_norm": 0.7178085764934704,
"learning_rate": 1.918032786885246e-05,
"loss": 0.5501,
"step": 117
},
{
"epoch": 0.24155578300921188,
"grad_norm": 0.7261199143030841,
"learning_rate": 1.934426229508197e-05,
"loss": 0.5564,
"step": 118
},
{
"epoch": 0.2436028659160696,
"grad_norm": 0.6467257123886485,
"learning_rate": 1.9508196721311475e-05,
"loss": 0.5307,
"step": 119
},
{
"epoch": 0.24564994882292732,
"grad_norm": 0.7743559427761539,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.4867,
"step": 120
},
{
"epoch": 0.24769703172978505,
"grad_norm": 0.5777069325137312,
"learning_rate": 1.9836065573770492e-05,
"loss": 0.5235,
"step": 121
},
{
"epoch": 0.24974411463664278,
"grad_norm": 0.7434807781935519,
"learning_rate": 2e-05,
"loss": 0.524,
"step": 122
},
{
"epoch": 0.2517911975435005,
"grad_norm": 0.6635906710416195,
"learning_rate": 2.0163934426229508e-05,
"loss": 0.4759,
"step": 123
},
{
"epoch": 0.2538382804503582,
"grad_norm": 0.6164296619684109,
"learning_rate": 2.0327868852459018e-05,
"loss": 0.4925,
"step": 124
},
{
"epoch": 0.25588536335721596,
"grad_norm": 0.5800314323412163,
"learning_rate": 2.0491803278688525e-05,
"loss": 0.536,
"step": 125
},
{
"epoch": 0.2579324462640737,
"grad_norm": 0.6910504700298034,
"learning_rate": 2.0655737704918034e-05,
"loss": 0.5937,
"step": 126
},
{
"epoch": 0.2599795291709314,
"grad_norm": 0.5724201379088258,
"learning_rate": 2.081967213114754e-05,
"loss": 0.5019,
"step": 127
},
{
"epoch": 0.26202661207778916,
"grad_norm": 0.7529480886936586,
"learning_rate": 2.098360655737705e-05,
"loss": 0.5643,
"step": 128
},
{
"epoch": 0.2640736949846469,
"grad_norm": 0.6170979184951091,
"learning_rate": 2.1147540983606557e-05,
"loss": 0.5449,
"step": 129
},
{
"epoch": 0.2661207778915046,
"grad_norm": 0.6541686452612759,
"learning_rate": 2.1311475409836067e-05,
"loss": 0.5377,
"step": 130
},
{
"epoch": 0.26816786079836236,
"grad_norm": 0.7744689267947779,
"learning_rate": 2.1475409836065574e-05,
"loss": 0.5846,
"step": 131
},
{
"epoch": 0.27021494370522003,
"grad_norm": 0.5945036689673261,
"learning_rate": 2.1639344262295087e-05,
"loss": 0.5283,
"step": 132
},
{
"epoch": 0.27226202661207777,
"grad_norm": 0.661953998440321,
"learning_rate": 2.180327868852459e-05,
"loss": 0.557,
"step": 133
},
{
"epoch": 0.2743091095189355,
"grad_norm": 0.5763477769339282,
"learning_rate": 2.1967213114754104e-05,
"loss": 0.511,
"step": 134
},
{
"epoch": 0.27635619242579323,
"grad_norm": 0.5514346276124723,
"learning_rate": 2.213114754098361e-05,
"loss": 0.5182,
"step": 135
},
{
"epoch": 0.27840327533265097,
"grad_norm": 0.5695095404976926,
"learning_rate": 2.2295081967213113e-05,
"loss": 0.5146,
"step": 136
},
{
"epoch": 0.2804503582395087,
"grad_norm": 0.5986561008583392,
"learning_rate": 2.2459016393442626e-05,
"loss": 0.5083,
"step": 137
},
{
"epoch": 0.28249744114636643,
"grad_norm": 0.5610678921247985,
"learning_rate": 2.2622950819672133e-05,
"loss": 0.5313,
"step": 138
},
{
"epoch": 0.28454452405322417,
"grad_norm": 0.6049817345717909,
"learning_rate": 2.2786885245901643e-05,
"loss": 0.5052,
"step": 139
},
{
"epoch": 0.2865916069600819,
"grad_norm": 0.5464694050777636,
"learning_rate": 2.295081967213115e-05,
"loss": 0.5371,
"step": 140
},
{
"epoch": 0.28863868986693964,
"grad_norm": 0.6288012925738132,
"learning_rate": 2.311475409836066e-05,
"loss": 0.4964,
"step": 141
},
{
"epoch": 0.2906857727737973,
"grad_norm": 0.6292583801203022,
"learning_rate": 2.3278688524590166e-05,
"loss": 0.524,
"step": 142
},
{
"epoch": 0.29273285568065505,
"grad_norm": 0.5989675634399306,
"learning_rate": 2.3442622950819676e-05,
"loss": 0.5347,
"step": 143
},
{
"epoch": 0.2947799385875128,
"grad_norm": 0.711475328293837,
"learning_rate": 2.3606557377049182e-05,
"loss": 0.5212,
"step": 144
},
{
"epoch": 0.2968270214943705,
"grad_norm": 0.6147945346885277,
"learning_rate": 2.3770491803278692e-05,
"loss": 0.536,
"step": 145
},
{
"epoch": 0.29887410440122825,
"grad_norm": 0.7665801712563535,
"learning_rate": 2.39344262295082e-05,
"loss": 0.506,
"step": 146
},
{
"epoch": 0.300921187308086,
"grad_norm": 0.5675791883302023,
"learning_rate": 2.4098360655737705e-05,
"loss": 0.5165,
"step": 147
},
{
"epoch": 0.3029682702149437,
"grad_norm": 0.6749613199444409,
"learning_rate": 2.4262295081967215e-05,
"loss": 0.5353,
"step": 148
},
{
"epoch": 0.30501535312180145,
"grad_norm": 0.5449017130362618,
"learning_rate": 2.442622950819672e-05,
"loss": 0.5378,
"step": 149
},
{
"epoch": 0.3070624360286592,
"grad_norm": 0.6201210295020051,
"learning_rate": 2.459016393442623e-05,
"loss": 0.5094,
"step": 150
},
{
"epoch": 0.3091095189355169,
"grad_norm": 0.577151136698421,
"learning_rate": 2.4754098360655738e-05,
"loss": 0.5196,
"step": 151
},
{
"epoch": 0.3111566018423746,
"grad_norm": 0.5313033735643246,
"learning_rate": 2.4918032786885248e-05,
"loss": 0.4937,
"step": 152
},
{
"epoch": 0.3132036847492323,
"grad_norm": 0.5101509817549174,
"learning_rate": 2.5081967213114754e-05,
"loss": 0.4788,
"step": 153
},
{
"epoch": 0.31525076765609006,
"grad_norm": 0.6444658936554309,
"learning_rate": 2.5245901639344264e-05,
"loss": 0.5275,
"step": 154
},
{
"epoch": 0.3172978505629478,
"grad_norm": 0.44154622012146943,
"learning_rate": 2.540983606557377e-05,
"loss": 0.4861,
"step": 155
},
{
"epoch": 0.3193449334698055,
"grad_norm": 0.608818705758696,
"learning_rate": 2.5573770491803284e-05,
"loss": 0.5071,
"step": 156
},
{
"epoch": 0.32139201637666326,
"grad_norm": 0.616456605045568,
"learning_rate": 2.5737704918032787e-05,
"loss": 0.5195,
"step": 157
},
{
"epoch": 0.323439099283521,
"grad_norm": 0.67099003512138,
"learning_rate": 2.59016393442623e-05,
"loss": 0.5523,
"step": 158
},
{
"epoch": 0.3254861821903787,
"grad_norm": 0.6144728851907352,
"learning_rate": 2.6065573770491807e-05,
"loss": 0.5451,
"step": 159
},
{
"epoch": 0.32753326509723646,
"grad_norm": 0.6323429696875369,
"learning_rate": 2.622950819672131e-05,
"loss": 0.5309,
"step": 160
},
{
"epoch": 0.3295803480040942,
"grad_norm": 0.5465209069864494,
"learning_rate": 2.6393442622950824e-05,
"loss": 0.5342,
"step": 161
},
{
"epoch": 0.33162743091095187,
"grad_norm": 0.7352587986716179,
"learning_rate": 2.655737704918033e-05,
"loss": 0.5181,
"step": 162
},
{
"epoch": 0.3336745138178096,
"grad_norm": 0.5130967329382123,
"learning_rate": 2.672131147540984e-05,
"loss": 0.5074,
"step": 163
},
{
"epoch": 0.33572159672466734,
"grad_norm": 0.6870511656785425,
"learning_rate": 2.6885245901639346e-05,
"loss": 0.5603,
"step": 164
},
{
"epoch": 0.33776867963152507,
"grad_norm": 0.7048410066657461,
"learning_rate": 2.7049180327868856e-05,
"loss": 0.5272,
"step": 165
},
{
"epoch": 0.3398157625383828,
"grad_norm": 0.8082743167202432,
"learning_rate": 2.7213114754098363e-05,
"loss": 0.5196,
"step": 166
},
{
"epoch": 0.34186284544524054,
"grad_norm": 0.6901700346930703,
"learning_rate": 2.7377049180327873e-05,
"loss": 0.5439,
"step": 167
},
{
"epoch": 0.34390992835209827,
"grad_norm": 0.6543817136595013,
"learning_rate": 2.754098360655738e-05,
"loss": 0.5375,
"step": 168
},
{
"epoch": 0.345957011258956,
"grad_norm": 0.6015689406714214,
"learning_rate": 2.770491803278689e-05,
"loss": 0.5014,
"step": 169
},
{
"epoch": 0.34800409416581374,
"grad_norm": 0.658713714501858,
"learning_rate": 2.7868852459016396e-05,
"loss": 0.5262,
"step": 170
},
{
"epoch": 0.3500511770726714,
"grad_norm": 0.5872370736842677,
"learning_rate": 2.8032786885245902e-05,
"loss": 0.5454,
"step": 171
},
{
"epoch": 0.35209825997952915,
"grad_norm": 0.5434995669507285,
"learning_rate": 2.8196721311475412e-05,
"loss": 0.562,
"step": 172
},
{
"epoch": 0.3541453428863869,
"grad_norm": 0.7617505474169058,
"learning_rate": 2.836065573770492e-05,
"loss": 0.5147,
"step": 173
},
{
"epoch": 0.3561924257932446,
"grad_norm": 0.6940517866702995,
"learning_rate": 2.852459016393443e-05,
"loss": 0.561,
"step": 174
},
{
"epoch": 0.35823950870010235,
"grad_norm": 0.6026045944898959,
"learning_rate": 2.8688524590163935e-05,
"loss": 0.5294,
"step": 175
},
{
"epoch": 0.3602865916069601,
"grad_norm": 0.685003134185584,
"learning_rate": 2.8852459016393445e-05,
"loss": 0.5086,
"step": 176
},
{
"epoch": 0.3623336745138178,
"grad_norm": 0.7007920129863516,
"learning_rate": 2.901639344262295e-05,
"loss": 0.5021,
"step": 177
},
{
"epoch": 0.36438075742067555,
"grad_norm": 0.7355272145052204,
"learning_rate": 2.918032786885246e-05,
"loss": 0.5007,
"step": 178
},
{
"epoch": 0.3664278403275333,
"grad_norm": 0.6487776776109249,
"learning_rate": 2.9344262295081968e-05,
"loss": 0.5524,
"step": 179
},
{
"epoch": 0.368474923234391,
"grad_norm": 0.6902433390057133,
"learning_rate": 2.9508196721311478e-05,
"loss": 0.5142,
"step": 180
},
{
"epoch": 0.3705220061412487,
"grad_norm": 0.6867948754438059,
"learning_rate": 2.9672131147540984e-05,
"loss": 0.5187,
"step": 181
},
{
"epoch": 0.3725690890481064,
"grad_norm": 0.6159722804543436,
"learning_rate": 2.9836065573770498e-05,
"loss": 0.5508,
"step": 182
},
{
"epoch": 0.37461617195496416,
"grad_norm": 0.7696599736408124,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.5348,
"step": 183
},
{
"epoch": 0.3766632548618219,
"grad_norm": 0.6954090228635682,
"learning_rate": 3.0163934426229507e-05,
"loss": 0.5447,
"step": 184
},
{
"epoch": 0.37871033776867963,
"grad_norm": 0.6435121902381328,
"learning_rate": 3.032786885245902e-05,
"loss": 0.5474,
"step": 185
},
{
"epoch": 0.38075742067553736,
"grad_norm": 0.8038288638214993,
"learning_rate": 3.0491803278688527e-05,
"loss": 0.5559,
"step": 186
},
{
"epoch": 0.3828045035823951,
"grad_norm": 0.688956722367706,
"learning_rate": 3.065573770491804e-05,
"loss": 0.4875,
"step": 187
},
{
"epoch": 0.38485158648925283,
"grad_norm": 0.7159386414570686,
"learning_rate": 3.0819672131147544e-05,
"loss": 0.5499,
"step": 188
},
{
"epoch": 0.38689866939611056,
"grad_norm": 0.9285947357940207,
"learning_rate": 3.098360655737705e-05,
"loss": 0.5144,
"step": 189
},
{
"epoch": 0.3889457523029683,
"grad_norm": 0.6303078015541651,
"learning_rate": 3.1147540983606557e-05,
"loss": 0.6274,
"step": 190
},
{
"epoch": 0.390992835209826,
"grad_norm": 0.8868869883770375,
"learning_rate": 3.131147540983607e-05,
"loss": 0.5261,
"step": 191
},
{
"epoch": 0.3930399181166837,
"grad_norm": 0.8238279905547772,
"learning_rate": 3.1475409836065576e-05,
"loss": 0.5284,
"step": 192
},
{
"epoch": 0.39508700102354144,
"grad_norm": 0.7109735935815616,
"learning_rate": 3.163934426229509e-05,
"loss": 0.4981,
"step": 193
},
{
"epoch": 0.3971340839303992,
"grad_norm": 0.8555381934750876,
"learning_rate": 3.180327868852459e-05,
"loss": 0.5184,
"step": 194
},
{
"epoch": 0.3991811668372569,
"grad_norm": 0.7950926694758862,
"learning_rate": 3.19672131147541e-05,
"loss": 0.5286,
"step": 195
},
{
"epoch": 0.40122824974411464,
"grad_norm": 0.6893785135479137,
"learning_rate": 3.213114754098361e-05,
"loss": 0.5048,
"step": 196
},
{
"epoch": 0.4032753326509724,
"grad_norm": 0.8285127939060417,
"learning_rate": 3.2295081967213116e-05,
"loss": 0.5696,
"step": 197
},
{
"epoch": 0.4053224155578301,
"grad_norm": 0.6476667354794096,
"learning_rate": 3.245901639344263e-05,
"loss": 0.5175,
"step": 198
},
{
"epoch": 0.40736949846468784,
"grad_norm": 0.9133932587744947,
"learning_rate": 3.2622950819672136e-05,
"loss": 0.5426,
"step": 199
},
{
"epoch": 0.4094165813715456,
"grad_norm": 0.7532290387705938,
"learning_rate": 3.278688524590164e-05,
"loss": 0.5087,
"step": 200
},
{
"epoch": 0.41146366427840325,
"grad_norm": 0.9389363029784095,
"learning_rate": 3.295081967213115e-05,
"loss": 0.509,
"step": 201
},
{
"epoch": 0.413510747185261,
"grad_norm": 0.5686252927682621,
"learning_rate": 3.311475409836066e-05,
"loss": 0.5193,
"step": 202
},
{
"epoch": 0.4155578300921187,
"grad_norm": 0.8087812079712846,
"learning_rate": 3.327868852459017e-05,
"loss": 0.5197,
"step": 203
},
{
"epoch": 0.41760491299897645,
"grad_norm": 0.4881097207653879,
"learning_rate": 3.3442622950819675e-05,
"loss": 0.5044,
"step": 204
},
{
"epoch": 0.4196519959058342,
"grad_norm": 0.7274067002262758,
"learning_rate": 3.360655737704918e-05,
"loss": 0.4974,
"step": 205
},
{
"epoch": 0.4216990788126919,
"grad_norm": 0.5458714268885722,
"learning_rate": 3.3770491803278695e-05,
"loss": 0.5017,
"step": 206
},
{
"epoch": 0.42374616171954965,
"grad_norm": 0.7035504070202385,
"learning_rate": 3.39344262295082e-05,
"loss": 0.4983,
"step": 207
},
{
"epoch": 0.4257932446264074,
"grad_norm": 0.5269572768909501,
"learning_rate": 3.409836065573771e-05,
"loss": 0.5484,
"step": 208
},
{
"epoch": 0.4278403275332651,
"grad_norm": 0.5550435466705637,
"learning_rate": 3.4262295081967214e-05,
"loss": 0.5152,
"step": 209
},
{
"epoch": 0.42988741044012285,
"grad_norm": 0.5658118678239209,
"learning_rate": 3.442622950819672e-05,
"loss": 0.529,
"step": 210
},
{
"epoch": 0.43193449334698053,
"grad_norm": 0.4274306239416562,
"learning_rate": 3.4590163934426234e-05,
"loss": 0.547,
"step": 211
},
{
"epoch": 0.43398157625383826,
"grad_norm": 0.6498291018145316,
"learning_rate": 3.475409836065574e-05,
"loss": 0.5255,
"step": 212
},
{
"epoch": 0.436028659160696,
"grad_norm": 0.4862132754376562,
"learning_rate": 3.491803278688525e-05,
"loss": 0.5407,
"step": 213
},
{
"epoch": 0.43807574206755373,
"grad_norm": 0.7345334765463367,
"learning_rate": 3.5081967213114754e-05,
"loss": 0.5467,
"step": 214
},
{
"epoch": 0.44012282497441146,
"grad_norm": 0.6199950312514801,
"learning_rate": 3.524590163934427e-05,
"loss": 0.5599,
"step": 215
},
{
"epoch": 0.4421699078812692,
"grad_norm": 0.7074691063675598,
"learning_rate": 3.5409836065573773e-05,
"loss": 0.5127,
"step": 216
},
{
"epoch": 0.44421699078812693,
"grad_norm": 0.6229358703039948,
"learning_rate": 3.557377049180329e-05,
"loss": 0.5186,
"step": 217
},
{
"epoch": 0.44626407369498466,
"grad_norm": 0.6395887726511317,
"learning_rate": 3.5737704918032786e-05,
"loss": 0.4989,
"step": 218
},
{
"epoch": 0.4483111566018424,
"grad_norm": 0.5458826332923155,
"learning_rate": 3.59016393442623e-05,
"loss": 0.5008,
"step": 219
},
{
"epoch": 0.4503582395087001,
"grad_norm": 0.6729015313127311,
"learning_rate": 3.6065573770491806e-05,
"loss": 0.5122,
"step": 220
},
{
"epoch": 0.4524053224155578,
"grad_norm": 0.7752024606600724,
"learning_rate": 3.622950819672131e-05,
"loss": 0.5018,
"step": 221
},
{
"epoch": 0.45445240532241554,
"grad_norm": 0.6141567713992134,
"learning_rate": 3.6393442622950826e-05,
"loss": 0.4921,
"step": 222
},
{
"epoch": 0.4564994882292733,
"grad_norm": 0.6807359145513986,
"learning_rate": 3.655737704918033e-05,
"loss": 0.5041,
"step": 223
},
{
"epoch": 0.458546571136131,
"grad_norm": 0.5856826658124886,
"learning_rate": 3.672131147540984e-05,
"loss": 0.5281,
"step": 224
},
{
"epoch": 0.46059365404298874,
"grad_norm": 0.6528386749398997,
"learning_rate": 3.6885245901639346e-05,
"loss": 0.4862,
"step": 225
},
{
"epoch": 0.4626407369498465,
"grad_norm": 0.8760949897414592,
"learning_rate": 3.704918032786886e-05,
"loss": 0.5313,
"step": 226
},
{
"epoch": 0.4646878198567042,
"grad_norm": 0.7133493187489152,
"learning_rate": 3.7213114754098365e-05,
"loss": 0.5611,
"step": 227
},
{
"epoch": 0.46673490276356194,
"grad_norm": 0.6514391258645618,
"learning_rate": 3.737704918032787e-05,
"loss": 0.5017,
"step": 228
},
{
"epoch": 0.4687819856704197,
"grad_norm": 0.712386378258888,
"learning_rate": 3.754098360655738e-05,
"loss": 0.4801,
"step": 229
},
{
"epoch": 0.47082906857727735,
"grad_norm": 0.7933509165191774,
"learning_rate": 3.770491803278689e-05,
"loss": 0.5265,
"step": 230
},
{
"epoch": 0.4728761514841351,
"grad_norm": 0.6393959136262052,
"learning_rate": 3.78688524590164e-05,
"loss": 0.533,
"step": 231
},
{
"epoch": 0.4749232343909928,
"grad_norm": 0.8336237173823177,
"learning_rate": 3.8032786885245905e-05,
"loss": 0.5067,
"step": 232
},
{
"epoch": 0.47697031729785055,
"grad_norm": 0.7946625026406952,
"learning_rate": 3.819672131147541e-05,
"loss": 0.5572,
"step": 233
},
{
"epoch": 0.4790174002047083,
"grad_norm": 0.7448514333498657,
"learning_rate": 3.836065573770492e-05,
"loss": 0.5267,
"step": 234
},
{
"epoch": 0.481064483111566,
"grad_norm": 0.6216449013018147,
"learning_rate": 3.852459016393443e-05,
"loss": 0.5007,
"step": 235
},
{
"epoch": 0.48311156601842375,
"grad_norm": 0.5847497681971316,
"learning_rate": 3.868852459016394e-05,
"loss": 0.5465,
"step": 236
},
{
"epoch": 0.4851586489252815,
"grad_norm": 0.5930045841712915,
"learning_rate": 3.8852459016393444e-05,
"loss": 0.5498,
"step": 237
},
{
"epoch": 0.4872057318321392,
"grad_norm": 0.6242247074949386,
"learning_rate": 3.901639344262295e-05,
"loss": 0.5032,
"step": 238
},
{
"epoch": 0.48925281473899696,
"grad_norm": 0.5315372089751544,
"learning_rate": 3.9180327868852464e-05,
"loss": 0.5351,
"step": 239
},
{
"epoch": 0.49129989764585463,
"grad_norm": 0.5976493830078852,
"learning_rate": 3.934426229508197e-05,
"loss": 0.5511,
"step": 240
},
{
"epoch": 0.49334698055271237,
"grad_norm": 0.5263959833035169,
"learning_rate": 3.950819672131148e-05,
"loss": 0.5095,
"step": 241
},
{
"epoch": 0.4953940634595701,
"grad_norm": 0.5164961935996712,
"learning_rate": 3.9672131147540983e-05,
"loss": 0.4903,
"step": 242
},
{
"epoch": 0.49744114636642783,
"grad_norm": 0.5030461231666816,
"learning_rate": 3.98360655737705e-05,
"loss": 0.5246,
"step": 243
},
{
"epoch": 0.49948822927328557,
"grad_norm": 0.5199256237042372,
"learning_rate": 4e-05,
"loss": 0.527,
"step": 244
},
{
"epoch": 0.5015353121801432,
"grad_norm": 0.5720072000502151,
"learning_rate": 3.999997953390434e-05,
"loss": 0.4698,
"step": 245
},
{
"epoch": 0.503582395087001,
"grad_norm": 0.4723958838410465,
"learning_rate": 3.999991813565924e-05,
"loss": 0.5021,
"step": 246
},
{
"epoch": 0.5056294779938587,
"grad_norm": 0.6664032925560375,
"learning_rate": 3.999981580539036e-05,
"loss": 0.5195,
"step": 247
},
{
"epoch": 0.5076765609007164,
"grad_norm": 0.46934376217932583,
"learning_rate": 3.999967254330713e-05,
"loss": 0.4915,
"step": 248
},
{
"epoch": 0.5097236438075742,
"grad_norm": 0.5471916125552302,
"learning_rate": 3.999948834970275e-05,
"loss": 0.5395,
"step": 249
},
{
"epoch": 0.5117707267144319,
"grad_norm": 0.5322293661429813,
"learning_rate": 3.9999263224954204e-05,
"loss": 0.5156,
"step": 250
},
{
"epoch": 0.5138178096212896,
"grad_norm": 0.48934414004740173,
"learning_rate": 3.999899716952221e-05,
"loss": 0.505,
"step": 251
},
{
"epoch": 0.5158648925281474,
"grad_norm": 0.6841239054987143,
"learning_rate": 3.9998690183951304e-05,
"loss": 0.517,
"step": 252
},
{
"epoch": 0.5179119754350051,
"grad_norm": 0.6081072200654224,
"learning_rate": 3.999834226886976e-05,
"loss": 0.5209,
"step": 253
},
{
"epoch": 0.5199590583418628,
"grad_norm": 0.591682811543655,
"learning_rate": 3.999795342498961e-05,
"loss": 0.5144,
"step": 254
},
{
"epoch": 0.5220061412487206,
"grad_norm": 0.644975243350573,
"learning_rate": 3.999752365310668e-05,
"loss": 0.5285,
"step": 255
},
{
"epoch": 0.5240532241555783,
"grad_norm": 0.5648625378625047,
"learning_rate": 3.999705295410054e-05,
"loss": 0.493,
"step": 256
},
{
"epoch": 0.526100307062436,
"grad_norm": 0.6130167811037579,
"learning_rate": 3.999654132893453e-05,
"loss": 0.5257,
"step": 257
},
{
"epoch": 0.5281473899692938,
"grad_norm": 0.5037937329537826,
"learning_rate": 3.999598877865575e-05,
"loss": 0.4947,
"step": 258
},
{
"epoch": 0.5301944728761515,
"grad_norm": 0.6388452684007601,
"learning_rate": 3.999539530439504e-05,
"loss": 0.5319,
"step": 259
},
{
"epoch": 0.5322415557830092,
"grad_norm": 0.5304888345319132,
"learning_rate": 3.9994760907367025e-05,
"loss": 0.5239,
"step": 260
},
{
"epoch": 0.534288638689867,
"grad_norm": 0.548729152916564,
"learning_rate": 3.999408558887006e-05,
"loss": 0.5182,
"step": 261
},
{
"epoch": 0.5363357215967247,
"grad_norm": 0.4946265440615839,
"learning_rate": 3.9993369350286265e-05,
"loss": 0.5211,
"step": 262
},
{
"epoch": 0.5383828045035824,
"grad_norm": 0.5513131572470374,
"learning_rate": 3.999261219308149e-05,
"loss": 0.4922,
"step": 263
},
{
"epoch": 0.5404298874104401,
"grad_norm": 0.5628821503706624,
"learning_rate": 3.999181411880536e-05,
"loss": 0.4833,
"step": 264
},
{
"epoch": 0.5424769703172978,
"grad_norm": 0.6321595406631201,
"learning_rate": 3.99909751290912e-05,
"loss": 0.5156,
"step": 265
},
{
"epoch": 0.5445240532241555,
"grad_norm": 0.49452183826279106,
"learning_rate": 3.9990095225656104e-05,
"loss": 0.4918,
"step": 266
},
{
"epoch": 0.5465711361310133,
"grad_norm": 0.5983835992691791,
"learning_rate": 3.998917441030089e-05,
"loss": 0.534,
"step": 267
},
{
"epoch": 0.548618219037871,
"grad_norm": 0.5392668568126767,
"learning_rate": 3.9988212684910107e-05,
"loss": 0.4919,
"step": 268
},
{
"epoch": 0.5506653019447287,
"grad_norm": 0.5271122702751097,
"learning_rate": 3.998721005145204e-05,
"loss": 0.5023,
"step": 269
},
{
"epoch": 0.5527123848515865,
"grad_norm": 0.613359179011921,
"learning_rate": 3.998616651197867e-05,
"loss": 0.5123,
"step": 270
},
{
"epoch": 0.5547594677584442,
"grad_norm": 0.5885866628158287,
"learning_rate": 3.9985082068625724e-05,
"loss": 0.5192,
"step": 271
},
{
"epoch": 0.5568065506653019,
"grad_norm": 0.5458709549332362,
"learning_rate": 3.998395672361264e-05,
"loss": 0.5159,
"step": 272
},
{
"epoch": 0.5588536335721597,
"grad_norm": 0.47718089897345783,
"learning_rate": 3.998279047924255e-05,
"loss": 0.4571,
"step": 273
},
{
"epoch": 0.5609007164790174,
"grad_norm": 0.5918739989559765,
"learning_rate": 3.998158333790231e-05,
"loss": 0.5093,
"step": 274
},
{
"epoch": 0.5629477993858751,
"grad_norm": 0.6171028421078789,
"learning_rate": 3.998033530206246e-05,
"loss": 0.5174,
"step": 275
},
{
"epoch": 0.5649948822927329,
"grad_norm": 0.6049929559700232,
"learning_rate": 3.9979046374277246e-05,
"loss": 0.5292,
"step": 276
},
{
"epoch": 0.5670419651995906,
"grad_norm": 0.6353516580703451,
"learning_rate": 3.99777165571846e-05,
"loss": 0.5202,
"step": 277
},
{
"epoch": 0.5690890481064483,
"grad_norm": 0.542126012081921,
"learning_rate": 3.997634585350614e-05,
"loss": 0.515,
"step": 278
},
{
"epoch": 0.5711361310133061,
"grad_norm": 0.6577209989909311,
"learning_rate": 3.997493426604715e-05,
"loss": 0.4827,
"step": 279
},
{
"epoch": 0.5731832139201638,
"grad_norm": 0.5786055028419322,
"learning_rate": 3.997348179769661e-05,
"loss": 0.4984,
"step": 280
},
{
"epoch": 0.5752302968270215,
"grad_norm": 0.5693454757360409,
"learning_rate": 3.9971988451427155e-05,
"loss": 0.4795,
"step": 281
},
{
"epoch": 0.5772773797338793,
"grad_norm": 0.523219696074873,
"learning_rate": 3.997045423029508e-05,
"loss": 0.5288,
"step": 282
},
{
"epoch": 0.579324462640737,
"grad_norm": 0.5914665747076296,
"learning_rate": 3.996887913744033e-05,
"loss": 0.5412,
"step": 283
},
{
"epoch": 0.5813715455475946,
"grad_norm": 0.504747034534071,
"learning_rate": 3.996726317608652e-05,
"loss": 0.5119,
"step": 284
},
{
"epoch": 0.5834186284544524,
"grad_norm": 0.6330512274369553,
"learning_rate": 3.996560634954088e-05,
"loss": 0.5504,
"step": 285
},
{
"epoch": 0.5854657113613101,
"grad_norm": 0.5570244494354821,
"learning_rate": 3.9963908661194285e-05,
"loss": 0.5323,
"step": 286
},
{
"epoch": 0.5875127942681678,
"grad_norm": 0.7110270240629256,
"learning_rate": 3.9962170114521246e-05,
"loss": 0.5086,
"step": 287
},
{
"epoch": 0.5895598771750256,
"grad_norm": 0.5503143307537267,
"learning_rate": 3.996039071307989e-05,
"loss": 0.5451,
"step": 288
},
{
"epoch": 0.5916069600818833,
"grad_norm": 0.6551750805922822,
"learning_rate": 3.995857046051196e-05,
"loss": 0.5375,
"step": 289
},
{
"epoch": 0.593654042988741,
"grad_norm": 0.625258001738531,
"learning_rate": 3.995670936054279e-05,
"loss": 0.5241,
"step": 290
},
{
"epoch": 0.5957011258955988,
"grad_norm": 0.6762376840884802,
"learning_rate": 3.9954807416981335e-05,
"loss": 0.5049,
"step": 291
},
{
"epoch": 0.5977482088024565,
"grad_norm": 0.6711303804476403,
"learning_rate": 3.995286463372013e-05,
"loss": 0.5117,
"step": 292
},
{
"epoch": 0.5997952917093142,
"grad_norm": 0.5151264552712931,
"learning_rate": 3.9950881014735295e-05,
"loss": 0.5053,
"step": 293
},
{
"epoch": 0.601842374616172,
"grad_norm": 0.663266741433607,
"learning_rate": 3.994885656408651e-05,
"loss": 0.4948,
"step": 294
},
{
"epoch": 0.6038894575230297,
"grad_norm": 0.567833717196775,
"learning_rate": 3.994679128591706e-05,
"loss": 0.5566,
"step": 295
},
{
"epoch": 0.6059365404298874,
"grad_norm": 0.5899154422863364,
"learning_rate": 3.9944685184453746e-05,
"loss": 0.53,
"step": 296
},
{
"epoch": 0.6079836233367452,
"grad_norm": 0.5758493539965378,
"learning_rate": 3.994253826400693e-05,
"loss": 0.5314,
"step": 297
},
{
"epoch": 0.6100307062436029,
"grad_norm": 0.5296826695926912,
"learning_rate": 3.9940350528970535e-05,
"loss": 0.5116,
"step": 298
},
{
"epoch": 0.6120777891504606,
"grad_norm": 0.5290847709355387,
"learning_rate": 3.993812198382199e-05,
"loss": 0.5028,
"step": 299
},
{
"epoch": 0.6141248720573184,
"grad_norm": 0.552336105606733,
"learning_rate": 3.993585263312227e-05,
"loss": 0.5202,
"step": 300
},
{
"epoch": 0.6161719549641761,
"grad_norm": 0.5293716014410816,
"learning_rate": 3.993354248151583e-05,
"loss": 0.4912,
"step": 301
},
{
"epoch": 0.6182190378710338,
"grad_norm": 0.6609476283232325,
"learning_rate": 3.993119153373067e-05,
"loss": 0.5438,
"step": 302
},
{
"epoch": 0.6202661207778914,
"grad_norm": 0.5469203904739622,
"learning_rate": 3.992879979457824e-05,
"loss": 0.5123,
"step": 303
},
{
"epoch": 0.6223132036847492,
"grad_norm": 0.6036405520109454,
"learning_rate": 3.9926367268953514e-05,
"loss": 0.5047,
"step": 304
},
{
"epoch": 0.6243602865916069,
"grad_norm": 0.5523585337922061,
"learning_rate": 3.9923893961834914e-05,
"loss": 0.5126,
"step": 305
},
{
"epoch": 0.6264073694984647,
"grad_norm": 0.5060340859748851,
"learning_rate": 3.992137987828434e-05,
"loss": 0.512,
"step": 306
},
{
"epoch": 0.6284544524053224,
"grad_norm": 0.4931821446334137,
"learning_rate": 3.991882502344712e-05,
"loss": 0.5086,
"step": 307
},
{
"epoch": 0.6305015353121801,
"grad_norm": 0.49318765086607474,
"learning_rate": 3.991622940255208e-05,
"loss": 0.4928,
"step": 308
},
{
"epoch": 0.6325486182190379,
"grad_norm": 0.47444503281787836,
"learning_rate": 3.991359302091141e-05,
"loss": 0.4823,
"step": 309
},
{
"epoch": 0.6345957011258956,
"grad_norm": 0.5837217244432208,
"learning_rate": 3.991091588392077e-05,
"loss": 0.553,
"step": 310
},
{
"epoch": 0.6366427840327533,
"grad_norm": 0.5943267226346076,
"learning_rate": 3.99081979970592e-05,
"loss": 0.5191,
"step": 311
},
{
"epoch": 0.638689866939611,
"grad_norm": 0.4692619037725303,
"learning_rate": 3.9905439365889176e-05,
"loss": 0.4833,
"step": 312
},
{
"epoch": 0.6407369498464688,
"grad_norm": 0.5017562232149083,
"learning_rate": 3.990263999605652e-05,
"loss": 0.4932,
"step": 313
},
{
"epoch": 0.6427840327533265,
"grad_norm": 0.4800449124898955,
"learning_rate": 3.989979989329046e-05,
"loss": 0.5475,
"step": 314
},
{
"epoch": 0.6448311156601843,
"grad_norm": 0.7077261651500286,
"learning_rate": 3.9896919063403567e-05,
"loss": 0.5656,
"step": 315
},
{
"epoch": 0.646878198567042,
"grad_norm": 0.4705479904386771,
"learning_rate": 3.989399751229179e-05,
"loss": 0.4812,
"step": 316
},
{
"epoch": 0.6489252814738997,
"grad_norm": 0.6188237485091818,
"learning_rate": 3.989103524593439e-05,
"loss": 0.5448,
"step": 317
},
{
"epoch": 0.6509723643807575,
"grad_norm": 0.5875009833381106,
"learning_rate": 3.9888032270393966e-05,
"loss": 0.5391,
"step": 318
},
{
"epoch": 0.6530194472876152,
"grad_norm": 0.6367560756626307,
"learning_rate": 3.988498859181645e-05,
"loss": 0.5857,
"step": 319
},
{
"epoch": 0.6550665301944729,
"grad_norm": 0.485432763118475,
"learning_rate": 3.988190421643105e-05,
"loss": 0.4775,
"step": 320
},
{
"epoch": 0.6571136131013307,
"grad_norm": 0.6730462808178248,
"learning_rate": 3.9878779150550306e-05,
"loss": 0.4953,
"step": 321
},
{
"epoch": 0.6591606960081884,
"grad_norm": 0.6111893546543505,
"learning_rate": 3.9875613400569975e-05,
"loss": 0.4593,
"step": 322
},
{
"epoch": 0.661207778915046,
"grad_norm": 0.7191482927871957,
"learning_rate": 3.987240697296912e-05,
"loss": 0.4943,
"step": 323
},
{
"epoch": 0.6632548618219037,
"grad_norm": 0.6827606638555512,
"learning_rate": 3.986915987431006e-05,
"loss": 0.5312,
"step": 324
},
{
"epoch": 0.6653019447287615,
"grad_norm": 0.6385480452392281,
"learning_rate": 3.986587211123833e-05,
"loss": 0.5066,
"step": 325
},
{
"epoch": 0.6673490276356192,
"grad_norm": 0.5586591391727767,
"learning_rate": 3.986254369048268e-05,
"loss": 0.519,
"step": 326
},
{
"epoch": 0.6693961105424769,
"grad_norm": 0.7119031753948682,
"learning_rate": 3.985917461885512e-05,
"loss": 0.526,
"step": 327
},
{
"epoch": 0.6714431934493347,
"grad_norm": 0.403690119122138,
"learning_rate": 3.98557649032508e-05,
"loss": 0.4941,
"step": 328
},
{
"epoch": 0.6734902763561924,
"grad_norm": 0.711228959963807,
"learning_rate": 3.985231455064809e-05,
"loss": 0.5161,
"step": 329
},
{
"epoch": 0.6755373592630501,
"grad_norm": 0.6152056684018806,
"learning_rate": 3.9848823568108515e-05,
"loss": 0.5252,
"step": 330
},
{
"epoch": 0.6775844421699079,
"grad_norm": 0.6811387482564264,
"learning_rate": 3.984529196277674e-05,
"loss": 0.5153,
"step": 331
},
{
"epoch": 0.6796315250767656,
"grad_norm": 0.715472760713951,
"learning_rate": 3.9841719741880583e-05,
"loss": 0.5136,
"step": 332
},
{
"epoch": 0.6816786079836233,
"grad_norm": 0.6117081033404244,
"learning_rate": 3.9838106912731e-05,
"loss": 0.5056,
"step": 333
},
{
"epoch": 0.6837256908904811,
"grad_norm": 0.5640133468287892,
"learning_rate": 3.983445348272203e-05,
"loss": 0.5022,
"step": 334
},
{
"epoch": 0.6857727737973388,
"grad_norm": 0.5708874168642846,
"learning_rate": 3.983075945933083e-05,
"loss": 0.5063,
"step": 335
},
{
"epoch": 0.6878198567041965,
"grad_norm": 0.5178591256000558,
"learning_rate": 3.9827024850117606e-05,
"loss": 0.5067,
"step": 336
},
{
"epoch": 0.6898669396110543,
"grad_norm": 0.5425146505052523,
"learning_rate": 3.982324966272566e-05,
"loss": 0.5112,
"step": 337
},
{
"epoch": 0.691914022517912,
"grad_norm": 0.4545857800127313,
"learning_rate": 3.9819433904881324e-05,
"loss": 0.4987,
"step": 338
},
{
"epoch": 0.6939611054247697,
"grad_norm": 0.5538231712121938,
"learning_rate": 3.981557758439396e-05,
"loss": 0.5174,
"step": 339
},
{
"epoch": 0.6960081883316275,
"grad_norm": 0.516910525115318,
"learning_rate": 3.981168070915594e-05,
"loss": 0.5205,
"step": 340
},
{
"epoch": 0.6980552712384852,
"grad_norm": 0.5341261157883666,
"learning_rate": 3.980774328714267e-05,
"loss": 0.5406,
"step": 341
},
{
"epoch": 0.7001023541453428,
"grad_norm": 0.5219326420859508,
"learning_rate": 3.9803765326412506e-05,
"loss": 0.5491,
"step": 342
},
{
"epoch": 0.7021494370522006,
"grad_norm": 0.5126045854291318,
"learning_rate": 3.979974683510677e-05,
"loss": 0.5507,
"step": 343
},
{
"epoch": 0.7041965199590583,
"grad_norm": 0.5442853681992302,
"learning_rate": 3.9795687821449754e-05,
"loss": 0.5136,
"step": 344
},
{
"epoch": 0.706243602865916,
"grad_norm": 0.4580135182917517,
"learning_rate": 3.9791588293748676e-05,
"loss": 0.4908,
"step": 345
},
{
"epoch": 0.7082906857727738,
"grad_norm": 0.5632074896697251,
"learning_rate": 3.978744826039366e-05,
"loss": 0.5046,
"step": 346
},
{
"epoch": 0.7103377686796315,
"grad_norm": 0.4381200556890962,
"learning_rate": 3.9783267729857756e-05,
"loss": 0.4994,
"step": 347
},
{
"epoch": 0.7123848515864892,
"grad_norm": 0.5608491796041225,
"learning_rate": 3.9779046710696854e-05,
"loss": 0.4813,
"step": 348
},
{
"epoch": 0.714431934493347,
"grad_norm": 0.4726444333140026,
"learning_rate": 3.977478521154974e-05,
"loss": 0.487,
"step": 349
},
{
"epoch": 0.7164790174002047,
"grad_norm": 0.5138973186415984,
"learning_rate": 3.977048324113805e-05,
"loss": 0.5418,
"step": 350
},
{
"epoch": 0.7185261003070624,
"grad_norm": 0.48636953697879376,
"learning_rate": 3.976614080826623e-05,
"loss": 0.5313,
"step": 351
},
{
"epoch": 0.7205731832139202,
"grad_norm": 0.5627556793214199,
"learning_rate": 3.9761757921821544e-05,
"loss": 0.5136,
"step": 352
},
{
"epoch": 0.7226202661207779,
"grad_norm": 0.42092677165552256,
"learning_rate": 3.975733459077405e-05,
"loss": 0.5396,
"step": 353
},
{
"epoch": 0.7246673490276356,
"grad_norm": 0.48567860686645814,
"learning_rate": 3.9752870824176585e-05,
"loss": 0.4912,
"step": 354
},
{
"epoch": 0.7267144319344934,
"grad_norm": 0.48215074888552417,
"learning_rate": 3.974836663116472e-05,
"loss": 0.5246,
"step": 355
},
{
"epoch": 0.7287615148413511,
"grad_norm": 0.4041796943755226,
"learning_rate": 3.97438220209568e-05,
"loss": 0.4697,
"step": 356
},
{
"epoch": 0.7308085977482088,
"grad_norm": 0.5033023757557246,
"learning_rate": 3.973923700285386e-05,
"loss": 0.5033,
"step": 357
},
{
"epoch": 0.7328556806550666,
"grad_norm": 0.47849562907550447,
"learning_rate": 3.973461158623963e-05,
"loss": 0.51,
"step": 358
},
{
"epoch": 0.7349027635619243,
"grad_norm": 0.4728712624318383,
"learning_rate": 3.972994578058055e-05,
"loss": 0.5183,
"step": 359
},
{
"epoch": 0.736949846468782,
"grad_norm": 0.46994907616226134,
"learning_rate": 3.972523959542569e-05,
"loss": 0.4791,
"step": 360
},
{
"epoch": 0.7389969293756398,
"grad_norm": 0.4716759611852859,
"learning_rate": 3.9720493040406786e-05,
"loss": 0.5053,
"step": 361
},
{
"epoch": 0.7410440122824974,
"grad_norm": 0.4739511859859272,
"learning_rate": 3.9715706125238164e-05,
"loss": 0.4902,
"step": 362
},
{
"epoch": 0.7430910951893551,
"grad_norm": 0.5524351394525074,
"learning_rate": 3.971087885971679e-05,
"loss": 0.5002,
"step": 363
},
{
"epoch": 0.7451381780962129,
"grad_norm": 0.5160577267882994,
"learning_rate": 3.970601125372218e-05,
"loss": 0.5077,
"step": 364
},
{
"epoch": 0.7471852610030706,
"grad_norm": 0.6036728391773393,
"learning_rate": 3.970110331721643e-05,
"loss": 0.5152,
"step": 365
},
{
"epoch": 0.7492323439099283,
"grad_norm": 0.44920724019503216,
"learning_rate": 3.9696155060244166e-05,
"loss": 0.5016,
"step": 366
},
{
"epoch": 0.7512794268167861,
"grad_norm": 0.5748889938625247,
"learning_rate": 3.9691166492932535e-05,
"loss": 0.5484,
"step": 367
},
{
"epoch": 0.7533265097236438,
"grad_norm": 0.456775282424986,
"learning_rate": 3.968613762549119e-05,
"loss": 0.4839,
"step": 368
},
{
"epoch": 0.7553735926305015,
"grad_norm": 0.5722901317947914,
"learning_rate": 3.968106846821226e-05,
"loss": 0.4961,
"step": 369
},
{
"epoch": 0.7574206755373593,
"grad_norm": 0.513431808850321,
"learning_rate": 3.9675959031470336e-05,
"loss": 0.5347,
"step": 370
},
{
"epoch": 0.759467758444217,
"grad_norm": 0.4971842267477506,
"learning_rate": 3.9670809325722425e-05,
"loss": 0.5025,
"step": 371
},
{
"epoch": 0.7615148413510747,
"grad_norm": 0.5489460962549015,
"learning_rate": 3.966561936150797e-05,
"loss": 0.527,
"step": 372
},
{
"epoch": 0.7635619242579325,
"grad_norm": 0.46580059626935816,
"learning_rate": 3.966038914944881e-05,
"loss": 0.5024,
"step": 373
},
{
"epoch": 0.7656090071647902,
"grad_norm": 0.48329497303274316,
"learning_rate": 3.9655118700249146e-05,
"loss": 0.4543,
"step": 374
},
{
"epoch": 0.7676560900716479,
"grad_norm": 0.4294347901743354,
"learning_rate": 3.964980802469552e-05,
"loss": 0.4918,
"step": 375
},
{
"epoch": 0.7697031729785057,
"grad_norm": 0.46898699035993047,
"learning_rate": 3.964445713365682e-05,
"loss": 0.5219,
"step": 376
},
{
"epoch": 0.7717502558853634,
"grad_norm": 0.5368649938570568,
"learning_rate": 3.963906603808422e-05,
"loss": 0.5491,
"step": 377
},
{
"epoch": 0.7737973387922211,
"grad_norm": 0.4589070525461554,
"learning_rate": 3.96336347490112e-05,
"loss": 0.5193,
"step": 378
},
{
"epoch": 0.7758444216990789,
"grad_norm": 0.5198618447541961,
"learning_rate": 3.9628163277553486e-05,
"loss": 0.5059,
"step": 379
},
{
"epoch": 0.7778915046059366,
"grad_norm": 0.5201151329391878,
"learning_rate": 3.962265163490903e-05,
"loss": 0.5242,
"step": 380
},
{
"epoch": 0.7799385875127943,
"grad_norm": 0.5286097719017859,
"learning_rate": 3.9617099832358035e-05,
"loss": 0.5131,
"step": 381
},
{
"epoch": 0.781985670419652,
"grad_norm": 0.4908828849732809,
"learning_rate": 3.961150788126286e-05,
"loss": 0.5247,
"step": 382
},
{
"epoch": 0.7840327533265097,
"grad_norm": 0.5105242176184168,
"learning_rate": 3.960587579306805e-05,
"loss": 0.5082,
"step": 383
},
{
"epoch": 0.7860798362333674,
"grad_norm": 0.4827529985137035,
"learning_rate": 3.960020357930028e-05,
"loss": 0.4886,
"step": 384
},
{
"epoch": 0.7881269191402251,
"grad_norm": 0.5255080063950176,
"learning_rate": 3.9594491251568376e-05,
"loss": 0.5323,
"step": 385
},
{
"epoch": 0.7901740020470829,
"grad_norm": 0.42324786024329186,
"learning_rate": 3.958873882156322e-05,
"loss": 0.4913,
"step": 386
},
{
"epoch": 0.7922210849539406,
"grad_norm": 0.4856526176027495,
"learning_rate": 3.9582946301057806e-05,
"loss": 0.5083,
"step": 387
},
{
"epoch": 0.7942681678607983,
"grad_norm": 0.4869309440084628,
"learning_rate": 3.957711370190716e-05,
"loss": 0.503,
"step": 388
},
{
"epoch": 0.7963152507676561,
"grad_norm": 0.5268029487052006,
"learning_rate": 3.957124103604833e-05,
"loss": 0.5082,
"step": 389
},
{
"epoch": 0.7983623336745138,
"grad_norm": 0.507634587065165,
"learning_rate": 3.9565328315500375e-05,
"loss": 0.5166,
"step": 390
},
{
"epoch": 0.8004094165813715,
"grad_norm": 0.5687757643916819,
"learning_rate": 3.9559375552364325e-05,
"loss": 0.5143,
"step": 391
},
{
"epoch": 0.8024564994882293,
"grad_norm": 0.4695087735006217,
"learning_rate": 3.955338275882316e-05,
"loss": 0.4713,
"step": 392
},
{
"epoch": 0.804503582395087,
"grad_norm": 0.46001610899852224,
"learning_rate": 3.9547349947141787e-05,
"loss": 0.4701,
"step": 393
},
{
"epoch": 0.8065506653019447,
"grad_norm": 0.4942091923144858,
"learning_rate": 3.954127712966702e-05,
"loss": 0.4916,
"step": 394
},
{
"epoch": 0.8085977482088025,
"grad_norm": 0.4750963479134321,
"learning_rate": 3.953516431882754e-05,
"loss": 0.535,
"step": 395
},
{
"epoch": 0.8106448311156602,
"grad_norm": 0.45303577566833647,
"learning_rate": 3.952901152713389e-05,
"loss": 0.4756,
"step": 396
},
{
"epoch": 0.812691914022518,
"grad_norm": 0.45075457713091993,
"learning_rate": 3.952281876717843e-05,
"loss": 0.5013,
"step": 397
},
{
"epoch": 0.8147389969293757,
"grad_norm": 0.4866908528841161,
"learning_rate": 3.951658605163533e-05,
"loss": 0.5159,
"step": 398
},
{
"epoch": 0.8167860798362334,
"grad_norm": 0.47273974659050305,
"learning_rate": 3.9510313393260507e-05,
"loss": 0.4876,
"step": 399
},
{
"epoch": 0.8188331627430911,
"grad_norm": 0.4599999323212464,
"learning_rate": 3.950400080489165e-05,
"loss": 0.486,
"step": 400
},
{
"epoch": 0.8208802456499488,
"grad_norm": 0.41263568282343643,
"learning_rate": 3.9497648299448174e-05,
"loss": 0.6514,
"step": 401
},
{
"epoch": 0.8229273285568065,
"grad_norm": 0.49637310910399085,
"learning_rate": 3.949125588993117e-05,
"loss": 0.5247,
"step": 402
},
{
"epoch": 0.8249744114636642,
"grad_norm": 0.407642872768924,
"learning_rate": 3.94848235894234e-05,
"loss": 0.5023,
"step": 403
},
{
"epoch": 0.827021494370522,
"grad_norm": 0.5125499802657086,
"learning_rate": 3.947835141108928e-05,
"loss": 0.5187,
"step": 404
},
{
"epoch": 0.8290685772773797,
"grad_norm": 0.46889549178993806,
"learning_rate": 3.947183936817483e-05,
"loss": 0.5089,
"step": 405
},
{
"epoch": 0.8311156601842374,
"grad_norm": 0.4793952039316711,
"learning_rate": 3.9465287474007654e-05,
"loss": 0.4946,
"step": 406
},
{
"epoch": 0.8331627430910952,
"grad_norm": 0.4616324812446371,
"learning_rate": 3.945869574199693e-05,
"loss": 0.4905,
"step": 407
},
{
"epoch": 0.8352098259979529,
"grad_norm": 0.522723518377932,
"learning_rate": 3.9452064185633345e-05,
"loss": 0.4873,
"step": 408
},
{
"epoch": 0.8372569089048106,
"grad_norm": 0.4444650907824343,
"learning_rate": 3.944539281848912e-05,
"loss": 0.4995,
"step": 409
},
{
"epoch": 0.8393039918116684,
"grad_norm": 0.5394910784111964,
"learning_rate": 3.943868165421793e-05,
"loss": 0.4597,
"step": 410
},
{
"epoch": 0.8413510747185261,
"grad_norm": 0.47150833451005714,
"learning_rate": 3.943193070655492e-05,
"loss": 0.4768,
"step": 411
},
{
"epoch": 0.8433981576253838,
"grad_norm": 0.5217385743079497,
"learning_rate": 3.942513998931663e-05,
"loss": 0.4936,
"step": 412
},
{
"epoch": 0.8454452405322416,
"grad_norm": 0.47838287732742774,
"learning_rate": 3.9418309516401015e-05,
"loss": 0.4998,
"step": 413
},
{
"epoch": 0.8474923234390993,
"grad_norm": 0.5219985412359689,
"learning_rate": 3.9411439301787383e-05,
"loss": 0.4922,
"step": 414
},
{
"epoch": 0.849539406345957,
"grad_norm": 0.5360559638934609,
"learning_rate": 3.940452935953639e-05,
"loss": 0.4932,
"step": 415
},
{
"epoch": 0.8515864892528148,
"grad_norm": 0.45167095248901046,
"learning_rate": 3.939757970378997e-05,
"loss": 0.5325,
"step": 416
},
{
"epoch": 0.8536335721596725,
"grad_norm": 0.5392146495002762,
"learning_rate": 3.9390590348771374e-05,
"loss": 0.5161,
"step": 417
},
{
"epoch": 0.8556806550665302,
"grad_norm": 0.4694690399172554,
"learning_rate": 3.9383561308785075e-05,
"loss": 0.4872,
"step": 418
},
{
"epoch": 0.857727737973388,
"grad_norm": 0.5450902044992034,
"learning_rate": 3.937649259821677e-05,
"loss": 0.5033,
"step": 419
},
{
"epoch": 0.8597748208802457,
"grad_norm": 0.4513747246745581,
"learning_rate": 3.9369384231533365e-05,
"loss": 0.5022,
"step": 420
},
{
"epoch": 0.8618219037871033,
"grad_norm": 0.47998170745321456,
"learning_rate": 3.9362236223282885e-05,
"loss": 0.488,
"step": 421
},
{
"epoch": 0.8638689866939611,
"grad_norm": 0.5703617109539353,
"learning_rate": 3.935504858809454e-05,
"loss": 0.5364,
"step": 422
},
{
"epoch": 0.8659160696008188,
"grad_norm": 0.46673348103204154,
"learning_rate": 3.9347821340678597e-05,
"loss": 0.519,
"step": 423
},
{
"epoch": 0.8679631525076765,
"grad_norm": 0.5424695733970012,
"learning_rate": 3.934055449582641e-05,
"loss": 0.5568,
"step": 424
},
{
"epoch": 0.8700102354145343,
"grad_norm": 0.4765444623240975,
"learning_rate": 3.9333248068410375e-05,
"loss": 0.4743,
"step": 425
},
{
"epoch": 0.872057318321392,
"grad_norm": 0.49864798719594966,
"learning_rate": 3.932590207338391e-05,
"loss": 0.4982,
"step": 426
},
{
"epoch": 0.8741044012282497,
"grad_norm": 0.4988338996845499,
"learning_rate": 3.931851652578137e-05,
"loss": 0.4963,
"step": 427
},
{
"epoch": 0.8761514841351075,
"grad_norm": 0.4432676805703767,
"learning_rate": 3.931109144071811e-05,
"loss": 0.4719,
"step": 428
},
{
"epoch": 0.8781985670419652,
"grad_norm": 0.48146006351463866,
"learning_rate": 3.930362683339037e-05,
"loss": 0.4863,
"step": 429
},
{
"epoch": 0.8802456499488229,
"grad_norm": 0.44888291145823134,
"learning_rate": 3.92961227190753e-05,
"loss": 0.5167,
"step": 430
},
{
"epoch": 0.8822927328556807,
"grad_norm": 0.4602238267927309,
"learning_rate": 3.928857911313088e-05,
"loss": 0.5031,
"step": 431
},
{
"epoch": 0.8843398157625384,
"grad_norm": 0.49233832361099084,
"learning_rate": 3.928099603099591e-05,
"loss": 0.5013,
"step": 432
},
{
"epoch": 0.8863868986693961,
"grad_norm": 0.4576541292218355,
"learning_rate": 3.9273373488190036e-05,
"loss": 0.5574,
"step": 433
},
{
"epoch": 0.8884339815762539,
"grad_norm": 0.4638871631773895,
"learning_rate": 3.92657115003136e-05,
"loss": 0.5253,
"step": 434
},
{
"epoch": 0.8904810644831116,
"grad_norm": 0.5178780585024065,
"learning_rate": 3.9258010083047715e-05,
"loss": 0.5485,
"step": 435
},
{
"epoch": 0.8925281473899693,
"grad_norm": 0.4598091842235503,
"learning_rate": 3.925026925215417e-05,
"loss": 0.4636,
"step": 436
},
{
"epoch": 0.8945752302968271,
"grad_norm": 0.5965207792237605,
"learning_rate": 3.924248902347541e-05,
"loss": 0.5464,
"step": 437
},
{
"epoch": 0.8966223132036848,
"grad_norm": 0.49770158650370516,
"learning_rate": 3.9234669412934546e-05,
"loss": 0.5461,
"step": 438
},
{
"epoch": 0.8986693961105425,
"grad_norm": 0.5031859956447141,
"learning_rate": 3.922681043653526e-05,
"loss": 0.5146,
"step": 439
},
{
"epoch": 0.9007164790174002,
"grad_norm": 0.4561558037016599,
"learning_rate": 3.92189121103618e-05,
"loss": 0.4898,
"step": 440
},
{
"epoch": 0.9027635619242579,
"grad_norm": 0.4247526477385047,
"learning_rate": 3.921097445057896e-05,
"loss": 0.4976,
"step": 441
},
{
"epoch": 0.9048106448311156,
"grad_norm": 0.523620896917421,
"learning_rate": 3.920299747343204e-05,
"loss": 0.4894,
"step": 442
},
{
"epoch": 0.9068577277379734,
"grad_norm": 0.4267971609347695,
"learning_rate": 3.919498119524679e-05,
"loss": 0.5029,
"step": 443
},
{
"epoch": 0.9089048106448311,
"grad_norm": 0.5817422656960313,
"learning_rate": 3.9186925632429396e-05,
"loss": 0.5477,
"step": 444
},
{
"epoch": 0.9109518935516888,
"grad_norm": 0.3812610872657855,
"learning_rate": 3.9178830801466465e-05,
"loss": 0.5147,
"step": 445
},
{
"epoch": 0.9129989764585466,
"grad_norm": 0.5210417707070376,
"learning_rate": 3.917069671892494e-05,
"loss": 0.5052,
"step": 446
},
{
"epoch": 0.9150460593654043,
"grad_norm": 0.4195818289044568,
"learning_rate": 3.9162523401452125e-05,
"loss": 0.476,
"step": 447
},
{
"epoch": 0.917093142272262,
"grad_norm": 0.4640561681219655,
"learning_rate": 3.915431086577561e-05,
"loss": 0.4811,
"step": 448
},
{
"epoch": 0.9191402251791198,
"grad_norm": 0.46049336960827775,
"learning_rate": 3.914605912870324e-05,
"loss": 0.5082,
"step": 449
},
{
"epoch": 0.9211873080859775,
"grad_norm": 0.5244765933281279,
"learning_rate": 3.913776820712309e-05,
"loss": 0.4982,
"step": 450
},
{
"epoch": 0.9232343909928352,
"grad_norm": 0.4207300716892893,
"learning_rate": 3.912943811800347e-05,
"loss": 0.477,
"step": 451
},
{
"epoch": 0.925281473899693,
"grad_norm": 0.4532179957617135,
"learning_rate": 3.912106887839278e-05,
"loss": 0.4953,
"step": 452
},
{
"epoch": 0.9273285568065507,
"grad_norm": 0.42360404758505654,
"learning_rate": 3.9112660505419626e-05,
"loss": 0.5248,
"step": 453
},
{
"epoch": 0.9293756397134084,
"grad_norm": 0.4345837890396033,
"learning_rate": 3.910421301629264e-05,
"loss": 0.4887,
"step": 454
},
{
"epoch": 0.9314227226202662,
"grad_norm": 0.43091075968385034,
"learning_rate": 3.909572642830053e-05,
"loss": 0.4987,
"step": 455
},
{
"epoch": 0.9334698055271239,
"grad_norm": 0.4111596649311637,
"learning_rate": 3.9087200758812054e-05,
"loss": 0.5429,
"step": 456
},
{
"epoch": 0.9355168884339816,
"grad_norm": 0.4114767207964879,
"learning_rate": 3.9078636025275904e-05,
"loss": 0.4868,
"step": 457
},
{
"epoch": 0.9375639713408394,
"grad_norm": 0.4511575697019446,
"learning_rate": 3.907003224522075e-05,
"loss": 0.5151,
"step": 458
},
{
"epoch": 0.9396110542476971,
"grad_norm": 0.47837137672489904,
"learning_rate": 3.906138943625519e-05,
"loss": 0.4812,
"step": 459
},
{
"epoch": 0.9416581371545547,
"grad_norm": 0.394915101475589,
"learning_rate": 3.9052707616067654e-05,
"loss": 0.5405,
"step": 460
},
{
"epoch": 0.9437052200614124,
"grad_norm": 0.5432929178652014,
"learning_rate": 3.9043986802426453e-05,
"loss": 0.4816,
"step": 461
},
{
"epoch": 0.9457523029682702,
"grad_norm": 0.3617390720179231,
"learning_rate": 3.903522701317968e-05,
"loss": 0.4864,
"step": 462
},
{
"epoch": 0.9477993858751279,
"grad_norm": 0.5239560704020834,
"learning_rate": 3.9026428266255205e-05,
"loss": 0.4979,
"step": 463
},
{
"epoch": 0.9498464687819856,
"grad_norm": 0.39348645216036965,
"learning_rate": 3.901759057966064e-05,
"loss": 0.4741,
"step": 464
},
{
"epoch": 0.9518935516888434,
"grad_norm": 0.47748699997048394,
"learning_rate": 3.9008713971483257e-05,
"loss": 0.4729,
"step": 465
},
{
"epoch": 0.9539406345957011,
"grad_norm": 0.4339281681486082,
"learning_rate": 3.899979845989003e-05,
"loss": 0.5072,
"step": 466
},
{
"epoch": 0.9559877175025588,
"grad_norm": 0.4360289845901765,
"learning_rate": 3.899084406312751e-05,
"loss": 0.4844,
"step": 467
},
{
"epoch": 0.9580348004094166,
"grad_norm": 0.49803606731717964,
"learning_rate": 3.8981850799521856e-05,
"loss": 0.5021,
"step": 468
},
{
"epoch": 0.9600818833162743,
"grad_norm": 0.4633434375152389,
"learning_rate": 3.897281868747878e-05,
"loss": 0.5003,
"step": 469
},
{
"epoch": 0.962128966223132,
"grad_norm": 0.4887467757571554,
"learning_rate": 3.896374774548348e-05,
"loss": 0.5054,
"step": 470
},
{
"epoch": 0.9641760491299898,
"grad_norm": 0.44664870237749776,
"learning_rate": 3.895463799210063e-05,
"loss": 0.5186,
"step": 471
},
{
"epoch": 0.9662231320368475,
"grad_norm": 0.5203094189929391,
"learning_rate": 3.894548944597434e-05,
"loss": 0.5227,
"step": 472
},
{
"epoch": 0.9682702149437052,
"grad_norm": 0.37329019363463944,
"learning_rate": 3.8936302125828114e-05,
"loss": 0.4836,
"step": 473
},
{
"epoch": 0.970317297850563,
"grad_norm": 0.46252269585391337,
"learning_rate": 3.892707605046482e-05,
"loss": 0.465,
"step": 474
},
{
"epoch": 0.9723643807574207,
"grad_norm": 0.4466031989756924,
"learning_rate": 3.8917811238766606e-05,
"loss": 0.5103,
"step": 475
},
{
"epoch": 0.9744114636642784,
"grad_norm": 0.49858281748974287,
"learning_rate": 3.8908507709694945e-05,
"loss": 0.4857,
"step": 476
},
{
"epoch": 0.9764585465711362,
"grad_norm": 0.5999408245173112,
"learning_rate": 3.8899165482290524e-05,
"loss": 0.5135,
"step": 477
},
{
"epoch": 0.9785056294779939,
"grad_norm": 0.44390557227435046,
"learning_rate": 3.888978457567323e-05,
"loss": 0.4826,
"step": 478
},
{
"epoch": 0.9805527123848515,
"grad_norm": 0.6043545995456223,
"learning_rate": 3.888036500904212e-05,
"loss": 0.5154,
"step": 479
},
{
"epoch": 0.9825997952917093,
"grad_norm": 0.4818312706910111,
"learning_rate": 3.887090680167537e-05,
"loss": 0.5087,
"step": 480
},
{
"epoch": 0.984646878198567,
"grad_norm": 0.6429627680182052,
"learning_rate": 3.886140997293024e-05,
"loss": 0.5033,
"step": 481
},
{
"epoch": 0.9866939611054247,
"grad_norm": 0.46009297651512515,
"learning_rate": 3.8851874542243024e-05,
"loss": 0.4737,
"step": 482
},
{
"epoch": 0.9887410440122825,
"grad_norm": 0.5257256010671767,
"learning_rate": 3.8842300529129026e-05,
"loss": 0.4922,
"step": 483
},
{
"epoch": 0.9907881269191402,
"grad_norm": 0.4952147402880594,
"learning_rate": 3.883268795318252e-05,
"loss": 0.5093,
"step": 484
},
{
"epoch": 0.9928352098259979,
"grad_norm": 0.40718187065435846,
"learning_rate": 3.882303683407669e-05,
"loss": 0.4898,
"step": 485
},
{
"epoch": 0.9948822927328557,
"grad_norm": 0.6342262964756523,
"learning_rate": 3.8813347191563615e-05,
"loss": 0.4809,
"step": 486
},
{
"epoch": 0.9969293756397134,
"grad_norm": 0.36521858199635876,
"learning_rate": 3.88036190454742e-05,
"loss": 0.4792,
"step": 487
},
{
"epoch": 0.9989764585465711,
"grad_norm": 0.49465426774165494,
"learning_rate": 3.879385241571817e-05,
"loss": 0.4945,
"step": 488
},
{
"epoch": 1.0010235414534288,
"grad_norm": 0.6823091472624835,
"learning_rate": 3.8784047322284e-05,
"loss": 0.7219,
"step": 489
},
{
"epoch": 1.0030706243602865,
"grad_norm": 0.5576204983958845,
"learning_rate": 3.8774203785238886e-05,
"loss": 0.5356,
"step": 490
},
{
"epoch": 1.0051177072671442,
"grad_norm": 0.4049517959021661,
"learning_rate": 3.8764321824728715e-05,
"loss": 0.3433,
"step": 491
},
{
"epoch": 1.007164790174002,
"grad_norm": 0.6379294758690475,
"learning_rate": 3.875440146097798e-05,
"loss": 0.4658,
"step": 492
},
{
"epoch": 1.0092118730808597,
"grad_norm": 0.48729117155574453,
"learning_rate": 3.8744442714289816e-05,
"loss": 0.423,
"step": 493
},
{
"epoch": 1.0112589559877174,
"grad_norm": 0.6905667947564962,
"learning_rate": 3.873444560504588e-05,
"loss": 0.4378,
"step": 494
},
{
"epoch": 1.0133060388945752,
"grad_norm": 0.5821283432486261,
"learning_rate": 3.872441015370635e-05,
"loss": 0.4592,
"step": 495
},
{
"epoch": 1.015353121801433,
"grad_norm": 0.6506243911845556,
"learning_rate": 3.8714336380809874e-05,
"loss": 0.4401,
"step": 496
},
{
"epoch": 1.0174002047082906,
"grad_norm": 0.5026499066925773,
"learning_rate": 3.870422430697354e-05,
"loss": 0.4082,
"step": 497
},
{
"epoch": 1.0194472876151484,
"grad_norm": 0.7706353109115344,
"learning_rate": 3.869407395289281e-05,
"loss": 0.4851,
"step": 498
},
{
"epoch": 1.021494370522006,
"grad_norm": 0.40094293966181355,
"learning_rate": 3.86838853393415e-05,
"loss": 0.3689,
"step": 499
},
{
"epoch": 1.0235414534288638,
"grad_norm": 0.7044751532962212,
"learning_rate": 3.867365848717171e-05,
"loss": 0.4298,
"step": 500
},
{
"epoch": 1.0255885363357216,
"grad_norm": 0.5123771630826552,
"learning_rate": 3.866339341731384e-05,
"loss": 0.4313,
"step": 501
},
{
"epoch": 1.0276356192425793,
"grad_norm": 0.44854290574972205,
"learning_rate": 3.865309015077645e-05,
"loss": 0.3686,
"step": 502
},
{
"epoch": 1.029682702149437,
"grad_norm": 0.5101255948957101,
"learning_rate": 3.8642748708646324e-05,
"loss": 0.4471,
"step": 503
},
{
"epoch": 1.0317297850562948,
"grad_norm": 0.44907848191294825,
"learning_rate": 3.863236911208835e-05,
"loss": 0.342,
"step": 504
},
{
"epoch": 1.0337768679631525,
"grad_norm": 0.5031368605626068,
"learning_rate": 3.862195138234551e-05,
"loss": 0.4214,
"step": 505
},
{
"epoch": 1.0358239508700102,
"grad_norm": 0.5833077927402516,
"learning_rate": 3.8611495540738835e-05,
"loss": 0.4858,
"step": 506
},
{
"epoch": 1.037871033776868,
"grad_norm": 0.41256204125595797,
"learning_rate": 3.860100160866733e-05,
"loss": 0.4171,
"step": 507
},
{
"epoch": 1.0399181166837257,
"grad_norm": 0.5124022037131871,
"learning_rate": 3.859046960760801e-05,
"loss": 0.4517,
"step": 508
},
{
"epoch": 1.0419651995905834,
"grad_norm": 0.5225640900487728,
"learning_rate": 3.857989955911574e-05,
"loss": 0.4341,
"step": 509
},
{
"epoch": 1.0440122824974412,
"grad_norm": 0.38566215827923084,
"learning_rate": 3.85692914848233e-05,
"loss": 0.3913,
"step": 510
},
{
"epoch": 1.046059365404299,
"grad_norm": 0.4633891939003197,
"learning_rate": 3.855864540644126e-05,
"loss": 0.397,
"step": 511
},
{
"epoch": 1.0481064483111566,
"grad_norm": 0.47380208701961085,
"learning_rate": 3.8547961345758e-05,
"loss": 0.4476,
"step": 512
},
{
"epoch": 1.0501535312180144,
"grad_norm": 0.5577403954270109,
"learning_rate": 3.853723932463962e-05,
"loss": 0.4559,
"step": 513
},
{
"epoch": 1.052200614124872,
"grad_norm": 0.5358423915305728,
"learning_rate": 3.8526479365029906e-05,
"loss": 0.4322,
"step": 514
},
{
"epoch": 1.0542476970317298,
"grad_norm": 0.463025872995783,
"learning_rate": 3.8515681488950286e-05,
"loss": 0.4725,
"step": 515
},
{
"epoch": 1.0562947799385876,
"grad_norm": 0.4349348376778485,
"learning_rate": 3.850484571849982e-05,
"loss": 0.3743,
"step": 516
},
{
"epoch": 1.0583418628454453,
"grad_norm": 0.49093997411971185,
"learning_rate": 3.849397207585508e-05,
"loss": 0.4704,
"step": 517
},
{
"epoch": 1.060388945752303,
"grad_norm": 0.5752884046292331,
"learning_rate": 3.848306058327016e-05,
"loss": 0.4772,
"step": 518
},
{
"epoch": 1.0624360286591608,
"grad_norm": 0.36504458066426737,
"learning_rate": 3.847211126307666e-05,
"loss": 0.4277,
"step": 519
},
{
"epoch": 1.0644831115660185,
"grad_norm": 0.5134952699143726,
"learning_rate": 3.846112413768353e-05,
"loss": 0.4094,
"step": 520
},
{
"epoch": 1.0665301944728762,
"grad_norm": 0.42686550744637813,
"learning_rate": 3.845009922957713e-05,
"loss": 0.3999,
"step": 521
},
{
"epoch": 1.068577277379734,
"grad_norm": 0.45121329177961284,
"learning_rate": 3.843903656132116e-05,
"loss": 0.4782,
"step": 522
},
{
"epoch": 1.0706243602865917,
"grad_norm": 0.4799180332218181,
"learning_rate": 3.842793615555657e-05,
"loss": 0.4344,
"step": 523
},
{
"epoch": 1.0726714431934494,
"grad_norm": 0.4906577800070559,
"learning_rate": 3.8416798035001545e-05,
"loss": 0.3999,
"step": 524
},
{
"epoch": 1.0747185261003072,
"grad_norm": 0.4381457628189835,
"learning_rate": 3.8405622222451496e-05,
"loss": 0.4867,
"step": 525
},
{
"epoch": 1.076765609007165,
"grad_norm": 0.4802227337203006,
"learning_rate": 3.8394408740778934e-05,
"loss": 0.4067,
"step": 526
},
{
"epoch": 1.0788126919140226,
"grad_norm": 0.4490101903364367,
"learning_rate": 3.838315761293348e-05,
"loss": 0.4346,
"step": 527
},
{
"epoch": 1.0808597748208801,
"grad_norm": 0.4633513621013708,
"learning_rate": 3.8371868861941795e-05,
"loss": 0.4177,
"step": 528
},
{
"epoch": 1.0829068577277379,
"grad_norm": 0.38946131051973665,
"learning_rate": 3.836054251090755e-05,
"loss": 0.3554,
"step": 529
},
{
"epoch": 1.0849539406345956,
"grad_norm": 0.4671283243776551,
"learning_rate": 3.8349178583011356e-05,
"loss": 0.4613,
"step": 530
},
{
"epoch": 1.0870010235414533,
"grad_norm": 0.42447494229810095,
"learning_rate": 3.833777710151075e-05,
"loss": 0.4056,
"step": 531
},
{
"epoch": 1.089048106448311,
"grad_norm": 0.42416375790936595,
"learning_rate": 3.83263380897401e-05,
"loss": 0.3751,
"step": 532
},
{
"epoch": 1.0910951893551688,
"grad_norm": 0.4073852074265662,
"learning_rate": 3.8314861571110604e-05,
"loss": 0.4637,
"step": 533
},
{
"epoch": 1.0931422722620265,
"grad_norm": 0.4548496033565697,
"learning_rate": 3.830334756911021e-05,
"loss": 0.4299,
"step": 534
},
{
"epoch": 1.0951893551688843,
"grad_norm": 0.376272980920323,
"learning_rate": 3.829179610730359e-05,
"loss": 0.408,
"step": 535
},
{
"epoch": 1.097236438075742,
"grad_norm": 0.427450221785291,
"learning_rate": 3.828020720933207e-05,
"loss": 0.4095,
"step": 536
},
{
"epoch": 1.0992835209825997,
"grad_norm": 0.440976279211047,
"learning_rate": 3.826858089891361e-05,
"loss": 0.494,
"step": 537
},
{
"epoch": 1.1013306038894575,
"grad_norm": 0.4209945160038491,
"learning_rate": 3.8256917199842715e-05,
"loss": 0.4586,
"step": 538
},
{
"epoch": 1.1033776867963152,
"grad_norm": 0.3962229334467954,
"learning_rate": 3.824521613599043e-05,
"loss": 0.405,
"step": 539
},
{
"epoch": 1.105424769703173,
"grad_norm": 0.40571779964971894,
"learning_rate": 3.823347773130427e-05,
"loss": 0.4275,
"step": 540
},
{
"epoch": 1.1074718526100307,
"grad_norm": 0.3618547256384093,
"learning_rate": 3.822170200980815e-05,
"loss": 0.3858,
"step": 541
},
{
"epoch": 1.1095189355168884,
"grad_norm": 0.4080407212884249,
"learning_rate": 3.820988899560239e-05,
"loss": 0.4645,
"step": 542
},
{
"epoch": 1.1115660184237461,
"grad_norm": 0.3661867427781835,
"learning_rate": 3.819803871286361e-05,
"loss": 0.4359,
"step": 543
},
{
"epoch": 1.1136131013306039,
"grad_norm": 0.3672357843320578,
"learning_rate": 3.818615118584472e-05,
"loss": 0.4266,
"step": 544
},
{
"epoch": 1.1156601842374616,
"grad_norm": 0.42948959266814246,
"learning_rate": 3.817422643887484e-05,
"loss": 0.3994,
"step": 545
},
{
"epoch": 1.1177072671443193,
"grad_norm": 0.4671751141730952,
"learning_rate": 3.816226449635927e-05,
"loss": 0.4408,
"step": 546
},
{
"epoch": 1.119754350051177,
"grad_norm": 0.41461516441504426,
"learning_rate": 3.815026538277943e-05,
"loss": 0.3956,
"step": 547
},
{
"epoch": 1.1218014329580348,
"grad_norm": 0.5835686601708205,
"learning_rate": 3.813822912269284e-05,
"loss": 0.4505,
"step": 548
},
{
"epoch": 1.1238485158648925,
"grad_norm": 0.4319406426533076,
"learning_rate": 3.812615574073301e-05,
"loss": 0.3822,
"step": 549
},
{
"epoch": 1.1258955987717503,
"grad_norm": 0.4516791429478303,
"learning_rate": 3.811404526160943e-05,
"loss": 0.4144,
"step": 550
},
{
"epoch": 1.127942681678608,
"grad_norm": 0.5360130807078863,
"learning_rate": 3.810189771010755e-05,
"loss": 0.4786,
"step": 551
},
{
"epoch": 1.1299897645854657,
"grad_norm": 0.4708744253041833,
"learning_rate": 3.808971311108865e-05,
"loss": 0.4241,
"step": 552
},
{
"epoch": 1.1320368474923235,
"grad_norm": 0.43422914565208964,
"learning_rate": 3.8077491489489835e-05,
"loss": 0.4204,
"step": 553
},
{
"epoch": 1.1340839303991812,
"grad_norm": 0.4721868771642081,
"learning_rate": 3.806523287032401e-05,
"loss": 0.3909,
"step": 554
},
{
"epoch": 1.136131013306039,
"grad_norm": 0.4196801932672472,
"learning_rate": 3.805293727867978e-05,
"loss": 0.4427,
"step": 555
},
{
"epoch": 1.1381780962128967,
"grad_norm": 0.43191530611285084,
"learning_rate": 3.8040604739721415e-05,
"loss": 0.4247,
"step": 556
},
{
"epoch": 1.1402251791197544,
"grad_norm": 0.4098386090267591,
"learning_rate": 3.8028235278688814e-05,
"loss": 0.4136,
"step": 557
},
{
"epoch": 1.1422722620266121,
"grad_norm": 0.42481716204053743,
"learning_rate": 3.8015828920897425e-05,
"loss": 0.425,
"step": 558
},
{
"epoch": 1.1443193449334699,
"grad_norm": 0.3947721096519682,
"learning_rate": 3.8003385691738227e-05,
"loss": 0.4169,
"step": 559
},
{
"epoch": 1.1463664278403276,
"grad_norm": 0.4434721832169034,
"learning_rate": 3.7990905616677644e-05,
"loss": 0.4804,
"step": 560
},
{
"epoch": 1.1484135107471853,
"grad_norm": 0.44596996028550917,
"learning_rate": 3.797838872125752e-05,
"loss": 0.4587,
"step": 561
},
{
"epoch": 1.150460593654043,
"grad_norm": 0.40781008639677546,
"learning_rate": 3.7965835031095065e-05,
"loss": 0.4614,
"step": 562
},
{
"epoch": 1.1525076765609008,
"grad_norm": 0.4036300851691864,
"learning_rate": 3.795324457188276e-05,
"loss": 0.3913,
"step": 563
},
{
"epoch": 1.1545547594677585,
"grad_norm": 0.3583696642752529,
"learning_rate": 3.794061736938837e-05,
"loss": 0.3828,
"step": 564
},
{
"epoch": 1.156601842374616,
"grad_norm": 0.5138599201629513,
"learning_rate": 3.792795344945485e-05,
"loss": 0.4861,
"step": 565
},
{
"epoch": 1.158648925281474,
"grad_norm": 0.40729637695646664,
"learning_rate": 3.79152528380003e-05,
"loss": 0.4136,
"step": 566
},
{
"epoch": 1.1606960081883315,
"grad_norm": 0.4688730059593128,
"learning_rate": 3.790251556101791e-05,
"loss": 0.3991,
"step": 567
},
{
"epoch": 1.1627430910951895,
"grad_norm": 0.41555333567521613,
"learning_rate": 3.7889741644575914e-05,
"loss": 0.4859,
"step": 568
},
{
"epoch": 1.164790174002047,
"grad_norm": 0.368128010733192,
"learning_rate": 3.787693111481753e-05,
"loss": 0.412,
"step": 569
},
{
"epoch": 1.1668372569089047,
"grad_norm": 0.40448293578209554,
"learning_rate": 3.786408399796091e-05,
"loss": 0.4812,
"step": 570
},
{
"epoch": 1.1688843398157625,
"grad_norm": 0.35706355299321096,
"learning_rate": 3.78512003202991e-05,
"loss": 0.4098,
"step": 571
},
{
"epoch": 1.1709314227226202,
"grad_norm": 0.3794649872224116,
"learning_rate": 3.783828010819993e-05,
"loss": 0.4184,
"step": 572
},
{
"epoch": 1.172978505629478,
"grad_norm": 0.4084126099380887,
"learning_rate": 3.782532338810605e-05,
"loss": 0.4279,
"step": 573
},
{
"epoch": 1.1750255885363357,
"grad_norm": 0.40248846798503674,
"learning_rate": 3.7812330186534815e-05,
"loss": 0.374,
"step": 574
},
{
"epoch": 1.1770726714431934,
"grad_norm": 0.42082880622868263,
"learning_rate": 3.779930053007821e-05,
"loss": 0.4294,
"step": 575
},
{
"epoch": 1.1791197543500511,
"grad_norm": 0.4741074023730061,
"learning_rate": 3.778623444540287e-05,
"loss": 0.4655,
"step": 576
},
{
"epoch": 1.1811668372569089,
"grad_norm": 0.40443469906849566,
"learning_rate": 3.777313195924998e-05,
"loss": 0.4313,
"step": 577
},
{
"epoch": 1.1832139201637666,
"grad_norm": 0.5301327875202605,
"learning_rate": 3.775999309843519e-05,
"loss": 0.4872,
"step": 578
},
{
"epoch": 1.1852610030706243,
"grad_norm": 0.4031400337502115,
"learning_rate": 3.774681788984863e-05,
"loss": 0.44,
"step": 579
},
{
"epoch": 1.187308085977482,
"grad_norm": 0.32666462235868937,
"learning_rate": 3.773360636045481e-05,
"loss": 0.3297,
"step": 580
},
{
"epoch": 1.1893551688843398,
"grad_norm": 0.48740976479218406,
"learning_rate": 3.7720358537292566e-05,
"loss": 0.502,
"step": 581
},
{
"epoch": 1.1914022517911975,
"grad_norm": 0.4129431656165589,
"learning_rate": 3.770707444747502e-05,
"loss": 0.3901,
"step": 582
},
{
"epoch": 1.1934493346980553,
"grad_norm": 0.37634568229429827,
"learning_rate": 3.7693754118189525e-05,
"loss": 0.406,
"step": 583
},
{
"epoch": 1.195496417604913,
"grad_norm": 0.413600210538619,
"learning_rate": 3.768039757669759e-05,
"loss": 0.4389,
"step": 584
},
{
"epoch": 1.1975435005117707,
"grad_norm": 0.41522152600174966,
"learning_rate": 3.766700485033484e-05,
"loss": 0.4213,
"step": 585
},
{
"epoch": 1.1995905834186285,
"grad_norm": 0.3994741162417736,
"learning_rate": 3.765357596651095e-05,
"loss": 0.4192,
"step": 586
},
{
"epoch": 1.2016376663254862,
"grad_norm": 0.44961799715640177,
"learning_rate": 3.764011095270962e-05,
"loss": 0.4448,
"step": 587
},
{
"epoch": 1.203684749232344,
"grad_norm": 0.4405333877210868,
"learning_rate": 3.762660983648846e-05,
"loss": 0.4425,
"step": 588
},
{
"epoch": 1.2057318321392017,
"grad_norm": 0.3647533882869788,
"learning_rate": 3.761307264547899e-05,
"loss": 0.3798,
"step": 589
},
{
"epoch": 1.2077789150460594,
"grad_norm": 0.5134897309426143,
"learning_rate": 3.759949940738655e-05,
"loss": 0.4862,
"step": 590
},
{
"epoch": 1.2098259979529171,
"grad_norm": 0.42730289018233486,
"learning_rate": 3.7585890149990265e-05,
"loss": 0.3887,
"step": 591
},
{
"epoch": 1.2118730808597749,
"grad_norm": 0.4053414794034375,
"learning_rate": 3.757224490114297e-05,
"loss": 0.4327,
"step": 592
},
{
"epoch": 1.2139201637666326,
"grad_norm": 0.4351870322953961,
"learning_rate": 3.755856368877116e-05,
"loss": 0.379,
"step": 593
},
{
"epoch": 1.2159672466734903,
"grad_norm": 0.4319979039337228,
"learning_rate": 3.7544846540874934e-05,
"loss": 0.45,
"step": 594
},
{
"epoch": 1.218014329580348,
"grad_norm": 0.3683791838302071,
"learning_rate": 3.7531093485527943e-05,
"loss": 0.4189,
"step": 595
},
{
"epoch": 1.2200614124872058,
"grad_norm": 0.47932608544530164,
"learning_rate": 3.7517304550877315e-05,
"loss": 0.4327,
"step": 596
},
{
"epoch": 1.2221084953940635,
"grad_norm": 0.46656194426712394,
"learning_rate": 3.750347976514362e-05,
"loss": 0.4774,
"step": 597
},
{
"epoch": 1.2241555783009213,
"grad_norm": 0.3933033098379041,
"learning_rate": 3.7489619156620796e-05,
"loss": 0.4224,
"step": 598
},
{
"epoch": 1.226202661207779,
"grad_norm": 0.40838601128257357,
"learning_rate": 3.74757227536761e-05,
"loss": 0.4361,
"step": 599
},
{
"epoch": 1.2282497441146367,
"grad_norm": 0.4011956657411684,
"learning_rate": 3.7461790584750036e-05,
"loss": 0.373,
"step": 600
},
{
"epoch": 1.2302968270214945,
"grad_norm": 0.4783960660544416,
"learning_rate": 3.744782267835632e-05,
"loss": 0.4497,
"step": 601
},
{
"epoch": 1.2323439099283522,
"grad_norm": 0.4426728718136347,
"learning_rate": 3.74338190630818e-05,
"loss": 0.4396,
"step": 602
},
{
"epoch": 1.23439099283521,
"grad_norm": 0.4507239891416186,
"learning_rate": 3.7419779767586406e-05,
"loss": 0.4312,
"step": 603
},
{
"epoch": 1.2364380757420674,
"grad_norm": 0.45738216770554924,
"learning_rate": 3.740570482060311e-05,
"loss": 0.4186,
"step": 604
},
{
"epoch": 1.2384851586489254,
"grad_norm": 0.4112804317839281,
"learning_rate": 3.7391594250937813e-05,
"loss": 0.4075,
"step": 605
},
{
"epoch": 1.240532241555783,
"grad_norm": 0.3910005461879264,
"learning_rate": 3.737744808746935e-05,
"loss": 0.4063,
"step": 606
},
{
"epoch": 1.2425793244626409,
"grad_norm": 0.37923695698945686,
"learning_rate": 3.73632663591494e-05,
"loss": 0.3753,
"step": 607
},
{
"epoch": 1.2446264073694984,
"grad_norm": 0.3993349892342946,
"learning_rate": 3.7349049095002414e-05,
"loss": 0.415,
"step": 608
},
{
"epoch": 1.246673490276356,
"grad_norm": 0.49340730052744397,
"learning_rate": 3.733479632412559e-05,
"loss": 0.4137,
"step": 609
},
{
"epoch": 1.2487205731832138,
"grad_norm": 0.4033663663707718,
"learning_rate": 3.732050807568878e-05,
"loss": 0.4078,
"step": 610
},
{
"epoch": 1.2507676560900716,
"grad_norm": 0.4885064983059752,
"learning_rate": 3.730618437893444e-05,
"loss": 0.479,
"step": 611
},
{
"epoch": 1.2528147389969293,
"grad_norm": 0.43160732650352407,
"learning_rate": 3.729182526317761e-05,
"loss": 0.455,
"step": 612
},
{
"epoch": 1.254861821903787,
"grad_norm": 0.45988754618419736,
"learning_rate": 3.727743075780578e-05,
"loss": 0.3783,
"step": 613
},
{
"epoch": 1.2569089048106448,
"grad_norm": 0.44846875859935137,
"learning_rate": 3.726300089227887e-05,
"loss": 0.4124,
"step": 614
},
{
"epoch": 1.2589559877175025,
"grad_norm": 0.5009012146609582,
"learning_rate": 3.72485356961292e-05,
"loss": 0.4256,
"step": 615
},
{
"epoch": 1.2610030706243602,
"grad_norm": 0.40882732802753774,
"learning_rate": 3.723403519896136e-05,
"loss": 0.3877,
"step": 616
},
{
"epoch": 1.263050153531218,
"grad_norm": 0.509277319060574,
"learning_rate": 3.721949943045223e-05,
"loss": 0.4603,
"step": 617
},
{
"epoch": 1.2650972364380757,
"grad_norm": 0.4550675308253649,
"learning_rate": 3.720492842035084e-05,
"loss": 0.4958,
"step": 618
},
{
"epoch": 1.2671443193449334,
"grad_norm": 0.5039050269537974,
"learning_rate": 3.7190322198478355e-05,
"loss": 0.5296,
"step": 619
},
{
"epoch": 1.2691914022517912,
"grad_norm": 0.42034576410985675,
"learning_rate": 3.7175680794728015e-05,
"loss": 0.4171,
"step": 620
},
{
"epoch": 1.271238485158649,
"grad_norm": 0.6194106838370393,
"learning_rate": 3.716100423906505e-05,
"loss": 0.524,
"step": 621
},
{
"epoch": 1.2732855680655066,
"grad_norm": 0.3686755041504862,
"learning_rate": 3.7146292561526654e-05,
"loss": 0.3836,
"step": 622
},
{
"epoch": 1.2753326509723644,
"grad_norm": 0.5059037426173506,
"learning_rate": 3.7131545792221864e-05,
"loss": 0.485,
"step": 623
},
{
"epoch": 1.277379733879222,
"grad_norm": 0.40419199499915215,
"learning_rate": 3.711676396133158e-05,
"loss": 0.3979,
"step": 624
},
{
"epoch": 1.2794268167860798,
"grad_norm": 0.48892649990931586,
"learning_rate": 3.7101947099108425e-05,
"loss": 0.5072,
"step": 625
},
{
"epoch": 1.2814738996929376,
"grad_norm": 0.4545220525315462,
"learning_rate": 3.708709523587674e-05,
"loss": 0.4275,
"step": 626
},
{
"epoch": 1.2835209825997953,
"grad_norm": 0.40189284591007923,
"learning_rate": 3.707220840203249e-05,
"loss": 0.3881,
"step": 627
},
{
"epoch": 1.285568065506653,
"grad_norm": 0.43194988622551983,
"learning_rate": 3.70572866280432e-05,
"loss": 0.4747,
"step": 628
},
{
"epoch": 1.2876151484135108,
"grad_norm": 0.44678464462286466,
"learning_rate": 3.7042329944447925e-05,
"loss": 0.391,
"step": 629
},
{
"epoch": 1.2896622313203685,
"grad_norm": 0.4623510609405332,
"learning_rate": 3.702733838185716e-05,
"loss": 0.423,
"step": 630
},
{
"epoch": 1.2917093142272262,
"grad_norm": 0.4314971051525567,
"learning_rate": 3.701231197095277e-05,
"loss": 0.4285,
"step": 631
},
{
"epoch": 1.293756397134084,
"grad_norm": 0.39807090722917615,
"learning_rate": 3.6997250742487955e-05,
"loss": 0.3975,
"step": 632
},
{
"epoch": 1.2958034800409417,
"grad_norm": 0.4538564539433799,
"learning_rate": 3.698215472728718e-05,
"loss": 0.4566,
"step": 633
},
{
"epoch": 1.2978505629477994,
"grad_norm": 0.387005647622149,
"learning_rate": 3.696702395624608e-05,
"loss": 0.4376,
"step": 634
},
{
"epoch": 1.2998976458546572,
"grad_norm": 0.4089593023659839,
"learning_rate": 3.6951858460331446e-05,
"loss": 0.4297,
"step": 635
},
{
"epoch": 1.301944728761515,
"grad_norm": 0.423980436557396,
"learning_rate": 3.693665827058111e-05,
"loss": 0.4407,
"step": 636
},
{
"epoch": 1.3039918116683726,
"grad_norm": 0.3939340668431382,
"learning_rate": 3.692142341810395e-05,
"loss": 0.3762,
"step": 637
},
{
"epoch": 1.3060388945752304,
"grad_norm": 0.3791993183647926,
"learning_rate": 3.690615393407975e-05,
"loss": 0.412,
"step": 638
},
{
"epoch": 1.308085977482088,
"grad_norm": 0.44464627005705176,
"learning_rate": 3.689084984975918e-05,
"loss": 0.3946,
"step": 639
},
{
"epoch": 1.3101330603889458,
"grad_norm": 0.4356188955070128,
"learning_rate": 3.6875511196463715e-05,
"loss": 0.4583,
"step": 640
},
{
"epoch": 1.3121801432958033,
"grad_norm": 0.43588824508846175,
"learning_rate": 3.686013800558561e-05,
"loss": 0.4674,
"step": 641
},
{
"epoch": 1.3142272262026613,
"grad_norm": 0.5156297864126925,
"learning_rate": 3.6844730308587776e-05,
"loss": 0.4052,
"step": 642
},
{
"epoch": 1.3162743091095188,
"grad_norm": 0.40684908578032264,
"learning_rate": 3.682928813700375e-05,
"loss": 0.4329,
"step": 643
},
{
"epoch": 1.3183213920163768,
"grad_norm": 0.42989019879872664,
"learning_rate": 3.681381152243763e-05,
"loss": 0.4264,
"step": 644
},
{
"epoch": 1.3203684749232343,
"grad_norm": 0.37812994433882907,
"learning_rate": 3.6798300496564e-05,
"loss": 0.386,
"step": 645
},
{
"epoch": 1.3224155578300922,
"grad_norm": 0.4191444772159957,
"learning_rate": 3.678275509112788e-05,
"loss": 0.4587,
"step": 646
},
{
"epoch": 1.3244626407369497,
"grad_norm": 0.34057419650855425,
"learning_rate": 3.6767175337944646e-05,
"loss": 0.4069,
"step": 647
},
{
"epoch": 1.3265097236438077,
"grad_norm": 0.4210596650083103,
"learning_rate": 3.675156126889996e-05,
"loss": 0.4614,
"step": 648
},
{
"epoch": 1.3285568065506652,
"grad_norm": 0.380672360295006,
"learning_rate": 3.6735912915949745e-05,
"loss": 0.4919,
"step": 649
},
{
"epoch": 1.330603889457523,
"grad_norm": 0.46378857138786805,
"learning_rate": 3.672023031112005e-05,
"loss": 0.4398,
"step": 650
},
{
"epoch": 1.3326509723643807,
"grad_norm": 0.3731856457734196,
"learning_rate": 3.670451348650705e-05,
"loss": 0.3786,
"step": 651
},
{
"epoch": 1.3346980552712384,
"grad_norm": 0.4439716102781381,
"learning_rate": 3.6688762474276945e-05,
"loss": 0.4175,
"step": 652
},
{
"epoch": 1.3367451381780961,
"grad_norm": 0.5191394349356976,
"learning_rate": 3.667297730666592e-05,
"loss": 0.5042,
"step": 653
},
{
"epoch": 1.3387922210849539,
"grad_norm": 0.41690574322071583,
"learning_rate": 3.665715801598004e-05,
"loss": 0.4098,
"step": 654
},
{
"epoch": 1.3408393039918116,
"grad_norm": 0.42158819550590015,
"learning_rate": 3.6641304634595216e-05,
"loss": 0.4271,
"step": 655
},
{
"epoch": 1.3428863868986693,
"grad_norm": 0.4596791286598413,
"learning_rate": 3.662541719495714e-05,
"loss": 0.4609,
"step": 656
},
{
"epoch": 1.344933469805527,
"grad_norm": 0.4858116688501969,
"learning_rate": 3.6609495729581186e-05,
"loss": 0.4909,
"step": 657
},
{
"epoch": 1.3469805527123848,
"grad_norm": 0.3811091658108338,
"learning_rate": 3.659354027105238e-05,
"loss": 0.4224,
"step": 658
},
{
"epoch": 1.3490276356192425,
"grad_norm": 0.40653713538788,
"learning_rate": 3.657755085202532e-05,
"loss": 0.4144,
"step": 659
},
{
"epoch": 1.3510747185261003,
"grad_norm": 0.4900473769452504,
"learning_rate": 3.6561527505224104e-05,
"loss": 0.4582,
"step": 660
},
{
"epoch": 1.353121801432958,
"grad_norm": 0.45088745908937394,
"learning_rate": 3.6545470263442265e-05,
"loss": 0.4345,
"step": 661
},
{
"epoch": 1.3551688843398157,
"grad_norm": 0.44409465950161797,
"learning_rate": 3.65293791595427e-05,
"loss": 0.4867,
"step": 662
},
{
"epoch": 1.3572159672466735,
"grad_norm": 0.3857032013105052,
"learning_rate": 3.651325422645763e-05,
"loss": 0.4072,
"step": 663
},
{
"epoch": 1.3592630501535312,
"grad_norm": 0.4196509296621015,
"learning_rate": 3.649709549718849e-05,
"loss": 0.4109,
"step": 664
},
{
"epoch": 1.361310133060389,
"grad_norm": 0.3711033202233833,
"learning_rate": 3.648090300480589e-05,
"loss": 0.4498,
"step": 665
},
{
"epoch": 1.3633572159672467,
"grad_norm": 0.4738363974307569,
"learning_rate": 3.646467678244954e-05,
"loss": 0.4268,
"step": 666
},
{
"epoch": 1.3654042988741044,
"grad_norm": 0.4028954103082967,
"learning_rate": 3.6448416863328186e-05,
"loss": 0.4346,
"step": 667
},
{
"epoch": 1.3674513817809621,
"grad_norm": 0.4635908886319793,
"learning_rate": 3.643212328071953e-05,
"loss": 0.4453,
"step": 668
},
{
"epoch": 1.3694984646878199,
"grad_norm": 0.38319546452013337,
"learning_rate": 3.641579606797017e-05,
"loss": 0.4054,
"step": 669
},
{
"epoch": 1.3715455475946776,
"grad_norm": 0.3700643765726001,
"learning_rate": 3.639943525849555e-05,
"loss": 0.4692,
"step": 670
},
{
"epoch": 1.3735926305015353,
"grad_norm": 0.34905600329178454,
"learning_rate": 3.638304088577984e-05,
"loss": 0.4131,
"step": 671
},
{
"epoch": 1.375639713408393,
"grad_norm": 0.37865539989560054,
"learning_rate": 3.6366612983375936e-05,
"loss": 0.4696,
"step": 672
},
{
"epoch": 1.3776867963152508,
"grad_norm": 0.34634916188478093,
"learning_rate": 3.635015158490533e-05,
"loss": 0.4206,
"step": 673
},
{
"epoch": 1.3797338792221086,
"grad_norm": 0.3790168529129947,
"learning_rate": 3.6333656724058075e-05,
"loss": 0.4311,
"step": 674
},
{
"epoch": 1.3817809621289663,
"grad_norm": 0.39245574822155366,
"learning_rate": 3.6317128434592725e-05,
"loss": 0.453,
"step": 675
},
{
"epoch": 1.383828045035824,
"grad_norm": 0.3723108820988336,
"learning_rate": 3.6300566750336225e-05,
"loss": 0.4055,
"step": 676
},
{
"epoch": 1.3858751279426818,
"grad_norm": 0.37587879714211964,
"learning_rate": 3.6283971705183884e-05,
"loss": 0.4468,
"step": 677
},
{
"epoch": 1.3879222108495395,
"grad_norm": 0.3410504468464227,
"learning_rate": 3.626734333309927e-05,
"loss": 0.3782,
"step": 678
},
{
"epoch": 1.3899692937563972,
"grad_norm": 0.4332726859166056,
"learning_rate": 3.625068166811418e-05,
"loss": 0.4419,
"step": 679
},
{
"epoch": 1.3920163766632547,
"grad_norm": 0.3869800586992885,
"learning_rate": 3.623398674432853e-05,
"loss": 0.4211,
"step": 680
},
{
"epoch": 1.3940634595701127,
"grad_norm": 0.38121810828451136,
"learning_rate": 3.621725859591031e-05,
"loss": 0.4592,
"step": 681
},
{
"epoch": 1.3961105424769702,
"grad_norm": 0.39222808854449187,
"learning_rate": 3.6200497257095504e-05,
"loss": 0.4664,
"step": 682
},
{
"epoch": 1.3981576253838282,
"grad_norm": 0.3565067379505055,
"learning_rate": 3.6183702762188045e-05,
"loss": 0.4218,
"step": 683
},
{
"epoch": 1.4002047082906857,
"grad_norm": 0.3730858903859956,
"learning_rate": 3.6166875145559684e-05,
"loss": 0.4338,
"step": 684
},
{
"epoch": 1.4022517911975436,
"grad_norm": 0.36655424256782704,
"learning_rate": 3.615001444165001e-05,
"loss": 0.4372,
"step": 685
},
{
"epoch": 1.4042988741044011,
"grad_norm": 0.4487119389407511,
"learning_rate": 3.613312068496627e-05,
"loss": 0.462,
"step": 686
},
{
"epoch": 1.406345957011259,
"grad_norm": 0.3515839553061697,
"learning_rate": 3.611619391008341e-05,
"loss": 0.3974,
"step": 687
},
{
"epoch": 1.4083930399181166,
"grad_norm": 0.3677425891572493,
"learning_rate": 3.6099234151643924e-05,
"loss": 0.4634,
"step": 688
},
{
"epoch": 1.4104401228249743,
"grad_norm": 0.3826249798074036,
"learning_rate": 3.608224144435781e-05,
"loss": 0.4338,
"step": 689
},
{
"epoch": 1.412487205731832,
"grad_norm": 0.37018522079183763,
"learning_rate": 3.606521582300252e-05,
"loss": 0.4089,
"step": 690
},
{
"epoch": 1.4145342886386898,
"grad_norm": 0.39718163395641504,
"learning_rate": 3.604815732242283e-05,
"loss": 0.4481,
"step": 691
},
{
"epoch": 1.4165813715455475,
"grad_norm": 0.4177328745486512,
"learning_rate": 3.6031065977530854e-05,
"loss": 0.4061,
"step": 692
},
{
"epoch": 1.4186284544524053,
"grad_norm": 0.43120876742640946,
"learning_rate": 3.6013941823305884e-05,
"loss": 0.4105,
"step": 693
},
{
"epoch": 1.420675537359263,
"grad_norm": 0.4330368740835022,
"learning_rate": 3.5996784894794394e-05,
"loss": 0.4329,
"step": 694
},
{
"epoch": 1.4227226202661207,
"grad_norm": 0.42455998302592957,
"learning_rate": 3.5979595227109906e-05,
"loss": 0.4045,
"step": 695
},
{
"epoch": 1.4247697031729785,
"grad_norm": 0.4072147719505155,
"learning_rate": 3.5962372855432956e-05,
"loss": 0.452,
"step": 696
},
{
"epoch": 1.4268167860798362,
"grad_norm": 0.4188081245089065,
"learning_rate": 3.594511781501103e-05,
"loss": 0.4199,
"step": 697
},
{
"epoch": 1.428863868986694,
"grad_norm": 0.38251894827311167,
"learning_rate": 3.592783014115845e-05,
"loss": 0.3955,
"step": 698
},
{
"epoch": 1.4309109518935517,
"grad_norm": 0.4441091071016442,
"learning_rate": 3.5910509869256326e-05,
"loss": 0.4398,
"step": 699
},
{
"epoch": 1.4329580348004094,
"grad_norm": 0.38058511976389736,
"learning_rate": 3.58931570347525e-05,
"loss": 0.4019,
"step": 700
},
{
"epoch": 1.4350051177072671,
"grad_norm": 0.3617807382285515,
"learning_rate": 3.587577167316146e-05,
"loss": 0.4363,
"step": 701
},
{
"epoch": 1.4370522006141249,
"grad_norm": 0.43987641158494933,
"learning_rate": 3.585835382006424e-05,
"loss": 0.4328,
"step": 702
},
{
"epoch": 1.4390992835209826,
"grad_norm": 0.3609144606393506,
"learning_rate": 3.584090351110838e-05,
"loss": 0.4104,
"step": 703
},
{
"epoch": 1.4411463664278403,
"grad_norm": 0.4091808525809948,
"learning_rate": 3.582342078200786e-05,
"loss": 0.4138,
"step": 704
},
{
"epoch": 1.443193449334698,
"grad_norm": 0.41531638710969015,
"learning_rate": 3.5805905668543e-05,
"loss": 0.4697,
"step": 705
},
{
"epoch": 1.4452405322415558,
"grad_norm": 0.4154315464780426,
"learning_rate": 3.57883582065604e-05,
"loss": 0.4844,
"step": 706
},
{
"epoch": 1.4472876151484135,
"grad_norm": 0.426486587951999,
"learning_rate": 3.577077843197285e-05,
"loss": 0.4088,
"step": 707
},
{
"epoch": 1.4493346980552713,
"grad_norm": 0.4965404868194584,
"learning_rate": 3.57531663807593e-05,
"loss": 0.4369,
"step": 708
},
{
"epoch": 1.451381780962129,
"grad_norm": 0.4695819928605211,
"learning_rate": 3.573552208896474e-05,
"loss": 0.4382,
"step": 709
},
{
"epoch": 1.4534288638689867,
"grad_norm": 0.46954254869967305,
"learning_rate": 3.571784559270014e-05,
"loss": 0.4456,
"step": 710
},
{
"epoch": 1.4554759467758445,
"grad_norm": 0.4760159462479475,
"learning_rate": 3.570013692814239e-05,
"loss": 0.4477,
"step": 711
},
{
"epoch": 1.4575230296827022,
"grad_norm": 0.3992032479219894,
"learning_rate": 3.568239613153421e-05,
"loss": 0.396,
"step": 712
},
{
"epoch": 1.45957011258956,
"grad_norm": 0.5000911965095803,
"learning_rate": 3.566462323918409e-05,
"loss": 0.4535,
"step": 713
},
{
"epoch": 1.4616171954964177,
"grad_norm": 0.43927830807739277,
"learning_rate": 3.564681828746619e-05,
"loss": 0.4579,
"step": 714
},
{
"epoch": 1.4636642784032754,
"grad_norm": 0.3840584961797973,
"learning_rate": 3.5628981312820315e-05,
"loss": 0.3861,
"step": 715
},
{
"epoch": 1.4657113613101331,
"grad_norm": 0.46545692391278676,
"learning_rate": 3.561111235175177e-05,
"loss": 0.4142,
"step": 716
},
{
"epoch": 1.4677584442169909,
"grad_norm": 0.32920533353338466,
"learning_rate": 3.5593211440831345e-05,
"loss": 0.3932,
"step": 717
},
{
"epoch": 1.4698055271238486,
"grad_norm": 0.5879408329975259,
"learning_rate": 3.557527861669522e-05,
"loss": 0.4518,
"step": 718
},
{
"epoch": 1.471852610030706,
"grad_norm": 0.3711931671196681,
"learning_rate": 3.555731391604488e-05,
"loss": 0.4135,
"step": 719
},
{
"epoch": 1.473899692937564,
"grad_norm": 0.451950604877235,
"learning_rate": 3.553931737564705e-05,
"loss": 0.3778,
"step": 720
},
{
"epoch": 1.4759467758444216,
"grad_norm": 0.3802803607853056,
"learning_rate": 3.552128903233363e-05,
"loss": 0.4528,
"step": 721
},
{
"epoch": 1.4779938587512795,
"grad_norm": 0.5401764017146911,
"learning_rate": 3.55032289230016e-05,
"loss": 0.4431,
"step": 722
},
{
"epoch": 1.480040941658137,
"grad_norm": 0.39166017929307784,
"learning_rate": 3.5485137084612945e-05,
"loss": 0.4384,
"step": 723
},
{
"epoch": 1.482088024564995,
"grad_norm": 0.44680536201066046,
"learning_rate": 3.54670135541946e-05,
"loss": 0.4017,
"step": 724
},
{
"epoch": 1.4841351074718525,
"grad_norm": 0.4126986303099099,
"learning_rate": 3.544885836883836e-05,
"loss": 0.4267,
"step": 725
},
{
"epoch": 1.4861821903787105,
"grad_norm": 0.49002764481757854,
"learning_rate": 3.5430671565700786e-05,
"loss": 0.4451,
"step": 726
},
{
"epoch": 1.488229273285568,
"grad_norm": 0.43094421524683524,
"learning_rate": 3.541245318200318e-05,
"loss": 0.4157,
"step": 727
},
{
"epoch": 1.4902763561924257,
"grad_norm": 0.4077891821232257,
"learning_rate": 3.5394203255031445e-05,
"loss": 0.4184,
"step": 728
},
{
"epoch": 1.4923234390992834,
"grad_norm": 0.4214423887733812,
"learning_rate": 3.537592182213607e-05,
"loss": 0.4404,
"step": 729
},
{
"epoch": 1.4943705220061412,
"grad_norm": 0.39779568039250524,
"learning_rate": 3.5357608920732e-05,
"loss": 0.3862,
"step": 730
},
{
"epoch": 1.496417604912999,
"grad_norm": 0.3868845645927947,
"learning_rate": 3.5339264588298606e-05,
"loss": 0.4859,
"step": 731
},
{
"epoch": 1.4984646878198566,
"grad_norm": 0.401554933480186,
"learning_rate": 3.532088886237956e-05,
"loss": 0.4605,
"step": 732
},
{
"epoch": 1.5005117707267144,
"grad_norm": 0.39625088885583626,
"learning_rate": 3.530248178058282e-05,
"loss": 0.4371,
"step": 733
},
{
"epoch": 1.5025588536335721,
"grad_norm": 0.3433544773175932,
"learning_rate": 3.528404338058046e-05,
"loss": 0.359,
"step": 734
},
{
"epoch": 1.5046059365404298,
"grad_norm": 0.3790208864361706,
"learning_rate": 3.526557370010872e-05,
"loss": 0.4668,
"step": 735
},
{
"epoch": 1.5066530194472876,
"grad_norm": 0.3463058143561582,
"learning_rate": 3.5247072776967805e-05,
"loss": 0.428,
"step": 736
},
{
"epoch": 1.5087001023541453,
"grad_norm": 0.4092673650520593,
"learning_rate": 3.522854064902189e-05,
"loss": 0.4787,
"step": 737
},
{
"epoch": 1.510747185261003,
"grad_norm": 0.3701835656091217,
"learning_rate": 3.520997735419901e-05,
"loss": 0.4335,
"step": 738
},
{
"epoch": 1.5127942681678608,
"grad_norm": 0.4036995525874622,
"learning_rate": 3.519138293049097e-05,
"loss": 0.4371,
"step": 739
},
{
"epoch": 1.5148413510747185,
"grad_norm": 0.3520959626432674,
"learning_rate": 3.51727574159533e-05,
"loss": 0.442,
"step": 740
},
{
"epoch": 1.5168884339815762,
"grad_norm": 0.3474665477504301,
"learning_rate": 3.515410084870516e-05,
"loss": 0.3833,
"step": 741
},
{
"epoch": 1.518935516888434,
"grad_norm": 0.37700117041246284,
"learning_rate": 3.513541326692925e-05,
"loss": 0.421,
"step": 742
},
{
"epoch": 1.5209825997952917,
"grad_norm": 0.3354789534984794,
"learning_rate": 3.511669470887177e-05,
"loss": 0.3646,
"step": 743
},
{
"epoch": 1.5230296827021494,
"grad_norm": 0.36336403413161944,
"learning_rate": 3.509794521284228e-05,
"loss": 0.4479,
"step": 744
},
{
"epoch": 1.5250767656090072,
"grad_norm": 0.3640226182102435,
"learning_rate": 3.5079164817213684e-05,
"loss": 0.3994,
"step": 745
},
{
"epoch": 1.527123848515865,
"grad_norm": 0.39095484712326395,
"learning_rate": 3.5060353560422137e-05,
"loss": 0.4906,
"step": 746
},
{
"epoch": 1.5291709314227226,
"grad_norm": 0.42138462191208187,
"learning_rate": 3.504151148096691e-05,
"loss": 0.4876,
"step": 747
},
{
"epoch": 1.5312180143295804,
"grad_norm": 0.36356347432593744,
"learning_rate": 3.5022638617410396e-05,
"loss": 0.4171,
"step": 748
},
{
"epoch": 1.5332650972364381,
"grad_norm": 0.402770920410115,
"learning_rate": 3.500373500837799e-05,
"loss": 0.4427,
"step": 749
},
{
"epoch": 1.5353121801432958,
"grad_norm": 0.35789791445115465,
"learning_rate": 3.4984800692557974e-05,
"loss": 0.4306,
"step": 750
},
{
"epoch": 1.5373592630501536,
"grad_norm": 0.3511318223180355,
"learning_rate": 3.496583570870152e-05,
"loss": 0.4051,
"step": 751
},
{
"epoch": 1.5394063459570113,
"grad_norm": 0.415831551492267,
"learning_rate": 3.494684009562254e-05,
"loss": 0.4738,
"step": 752
},
{
"epoch": 1.541453428863869,
"grad_norm": 0.3774452021561776,
"learning_rate": 3.492781389219763e-05,
"loss": 0.472,
"step": 753
},
{
"epoch": 1.5435005117707266,
"grad_norm": 0.38222406103253237,
"learning_rate": 3.4908757137366006e-05,
"loss": 0.3984,
"step": 754
},
{
"epoch": 1.5455475946775845,
"grad_norm": 0.4097364486307627,
"learning_rate": 3.488966987012941e-05,
"loss": 0.4436,
"step": 755
},
{
"epoch": 1.547594677584442,
"grad_norm": 0.3733136430359242,
"learning_rate": 3.487055212955201e-05,
"loss": 0.4102,
"step": 756
},
{
"epoch": 1.5496417604913,
"grad_norm": 0.49818552627556745,
"learning_rate": 3.485140395476038e-05,
"loss": 0.485,
"step": 757
},
{
"epoch": 1.5516888433981575,
"grad_norm": 0.40160664320174677,
"learning_rate": 3.4832225384943335e-05,
"loss": 0.4662,
"step": 758
},
{
"epoch": 1.5537359263050154,
"grad_norm": 0.36914648750243184,
"learning_rate": 3.481301645935193e-05,
"loss": 0.4173,
"step": 759
},
{
"epoch": 1.555783009211873,
"grad_norm": 0.40716553946286355,
"learning_rate": 3.4793777217299346e-05,
"loss": 0.417,
"step": 760
},
{
"epoch": 1.557830092118731,
"grad_norm": 0.42779403061287685,
"learning_rate": 3.477450769816077e-05,
"loss": 0.4848,
"step": 761
},
{
"epoch": 1.5598771750255884,
"grad_norm": 0.3422957268296126,
"learning_rate": 3.475520794137341e-05,
"loss": 0.4039,
"step": 762
},
{
"epoch": 1.5619242579324464,
"grad_norm": 0.4032099351305834,
"learning_rate": 3.473587798643633e-05,
"loss": 0.4378,
"step": 763
},
{
"epoch": 1.563971340839304,
"grad_norm": 0.4010793632381822,
"learning_rate": 3.4716517872910405e-05,
"loss": 0.4461,
"step": 764
},
{
"epoch": 1.5660184237461618,
"grad_norm": 0.41602009577806304,
"learning_rate": 3.4697127640418204e-05,
"loss": 0.5198,
"step": 765
},
{
"epoch": 1.5680655066530194,
"grad_norm": 0.3840607306535795,
"learning_rate": 3.467770732864399e-05,
"loss": 0.4072,
"step": 766
},
{
"epoch": 1.5701125895598773,
"grad_norm": 0.45544304460334517,
"learning_rate": 3.4658256977333536e-05,
"loss": 0.4263,
"step": 767
},
{
"epoch": 1.5721596724667348,
"grad_norm": 0.4489200382867973,
"learning_rate": 3.4638776626294134e-05,
"loss": 0.4189,
"step": 768
},
{
"epoch": 1.5742067553735928,
"grad_norm": 0.40708118209847305,
"learning_rate": 3.461926631539445e-05,
"loss": 0.4344,
"step": 769
},
{
"epoch": 1.5762538382804503,
"grad_norm": 0.47396385986382383,
"learning_rate": 3.459972608456448e-05,
"loss": 0.4318,
"step": 770
},
{
"epoch": 1.5783009211873082,
"grad_norm": 0.44275905450120784,
"learning_rate": 3.4580155973795434e-05,
"loss": 0.4024,
"step": 771
},
{
"epoch": 1.5803480040941658,
"grad_norm": 0.37512733188763275,
"learning_rate": 3.4560556023139695e-05,
"loss": 0.4646,
"step": 772
},
{
"epoch": 1.5823950870010235,
"grad_norm": 0.4718852007159965,
"learning_rate": 3.454092627271072e-05,
"loss": 0.4386,
"step": 773
},
{
"epoch": 1.5844421699078812,
"grad_norm": 0.36977406003434726,
"learning_rate": 3.4521266762682924e-05,
"loss": 0.4429,
"step": 774
},
{
"epoch": 1.586489252814739,
"grad_norm": 0.36917719793736264,
"learning_rate": 3.450157753329166e-05,
"loss": 0.4753,
"step": 775
},
{
"epoch": 1.5885363357215967,
"grad_norm": 0.466057931195573,
"learning_rate": 3.448185862483309e-05,
"loss": 0.4542,
"step": 776
},
{
"epoch": 1.5905834186284544,
"grad_norm": 0.34642675856704536,
"learning_rate": 3.446211007766412e-05,
"loss": 0.3954,
"step": 777
},
{
"epoch": 1.5926305015353122,
"grad_norm": 0.4330447451756299,
"learning_rate": 3.4442331932202326e-05,
"loss": 0.4278,
"step": 778
},
{
"epoch": 1.59467758444217,
"grad_norm": 0.3730822908863033,
"learning_rate": 3.4422524228925836e-05,
"loss": 0.4061,
"step": 779
},
{
"epoch": 1.5967246673490276,
"grad_norm": 0.37973943757397915,
"learning_rate": 3.440268700837329e-05,
"loss": 0.394,
"step": 780
},
{
"epoch": 1.5987717502558854,
"grad_norm": 0.49428851704529125,
"learning_rate": 3.438282031114374e-05,
"loss": 0.4486,
"step": 781
},
{
"epoch": 1.600818833162743,
"grad_norm": 0.41805483229206175,
"learning_rate": 3.4362924177896545e-05,
"loss": 0.4393,
"step": 782
},
{
"epoch": 1.6028659160696008,
"grad_norm": 0.47689234144727466,
"learning_rate": 3.434299864935133e-05,
"loss": 0.4354,
"step": 783
},
{
"epoch": 1.6049129989764586,
"grad_norm": 0.42155977412917267,
"learning_rate": 3.432304376628787e-05,
"loss": 0.4602,
"step": 784
},
{
"epoch": 1.6069600818833163,
"grad_norm": 0.3865188335568589,
"learning_rate": 3.430305956954602e-05,
"loss": 0.4152,
"step": 785
},
{
"epoch": 1.609007164790174,
"grad_norm": 0.4271955733661172,
"learning_rate": 3.428304610002563e-05,
"loss": 0.4408,
"step": 786
},
{
"epoch": 1.6110542476970318,
"grad_norm": 0.44963813575609907,
"learning_rate": 3.4263003398686464e-05,
"loss": 0.4958,
"step": 787
},
{
"epoch": 1.6131013306038895,
"grad_norm": 0.3733726588143654,
"learning_rate": 3.424293150654809e-05,
"loss": 0.4287,
"step": 788
},
{
"epoch": 1.6151484135107472,
"grad_norm": 0.3812115032343838,
"learning_rate": 3.422283046468985e-05,
"loss": 0.405,
"step": 789
},
{
"epoch": 1.617195496417605,
"grad_norm": 0.37005120300102184,
"learning_rate": 3.420270031425072e-05,
"loss": 0.4516,
"step": 790
},
{
"epoch": 1.6192425793244627,
"grad_norm": 0.45666000218850494,
"learning_rate": 3.4182541096429265e-05,
"loss": 0.4523,
"step": 791
},
{
"epoch": 1.6212896622313204,
"grad_norm": 0.34898311295474405,
"learning_rate": 3.416235285248352e-05,
"loss": 0.4007,
"step": 792
},
{
"epoch": 1.623336745138178,
"grad_norm": 0.3839127190696324,
"learning_rate": 3.4142135623730954e-05,
"loss": 0.4617,
"step": 793
},
{
"epoch": 1.625383828045036,
"grad_norm": 0.4387438648695345,
"learning_rate": 3.412188945154833e-05,
"loss": 0.4723,
"step": 794
},
{
"epoch": 1.6274309109518934,
"grad_norm": 0.3332803619102528,
"learning_rate": 3.410161437737166e-05,
"loss": 0.362,
"step": 795
},
{
"epoch": 1.6294779938587514,
"grad_norm": 0.3807927986402262,
"learning_rate": 3.4081310442696114e-05,
"loss": 0.4339,
"step": 796
},
{
"epoch": 1.6315250767656089,
"grad_norm": 0.3937568675037694,
"learning_rate": 3.4060977689075914e-05,
"loss": 0.4184,
"step": 797
},
{
"epoch": 1.6335721596724668,
"grad_norm": 0.40435872407490453,
"learning_rate": 3.404061615812425e-05,
"loss": 0.4906,
"step": 798
},
{
"epoch": 1.6356192425793243,
"grad_norm": 0.37907378547306464,
"learning_rate": 3.402022589151325e-05,
"loss": 0.4273,
"step": 799
},
{
"epoch": 1.6376663254861823,
"grad_norm": 0.3703864227511636,
"learning_rate": 3.399980693097383e-05,
"loss": 0.4154,
"step": 800
},
{
"epoch": 1.6397134083930398,
"grad_norm": 0.4191952674222777,
"learning_rate": 3.3979359318295605e-05,
"loss": 0.3581,
"step": 801
},
{
"epoch": 1.6417604912998978,
"grad_norm": 0.3966331751054016,
"learning_rate": 3.395888309532687e-05,
"loss": 0.4669,
"step": 802
},
{
"epoch": 1.6438075742067553,
"grad_norm": 0.445008569764165,
"learning_rate": 3.393837830397446e-05,
"loss": 0.4267,
"step": 803
},
{
"epoch": 1.6458546571136132,
"grad_norm": 0.46681180810571704,
"learning_rate": 3.391784498620369e-05,
"loss": 0.4895,
"step": 804
},
{
"epoch": 1.6479017400204707,
"grad_norm": 0.3385167810574236,
"learning_rate": 3.3897283184038215e-05,
"loss": 0.404,
"step": 805
},
{
"epoch": 1.6499488229273287,
"grad_norm": 0.5219129958909896,
"learning_rate": 3.387669293956003e-05,
"loss": 0.4305,
"step": 806
},
{
"epoch": 1.6519959058341862,
"grad_norm": 0.3668653050867526,
"learning_rate": 3.385607429490934e-05,
"loss": 0.3746,
"step": 807
},
{
"epoch": 1.6540429887410442,
"grad_norm": 0.5158311002393402,
"learning_rate": 3.3835427292284445e-05,
"loss": 0.486,
"step": 808
},
{
"epoch": 1.6560900716479017,
"grad_norm": 0.36978068539278974,
"learning_rate": 3.38147519739417e-05,
"loss": 0.4487,
"step": 809
},
{
"epoch": 1.6581371545547596,
"grad_norm": 0.3772021211105363,
"learning_rate": 3.37940483821954e-05,
"loss": 0.4017,
"step": 810
},
{
"epoch": 1.6601842374616171,
"grad_norm": 0.41401208686983515,
"learning_rate": 3.3773316559417734e-05,
"loss": 0.491,
"step": 811
},
{
"epoch": 1.6622313203684749,
"grad_norm": 0.3524038676017696,
"learning_rate": 3.375255654803864e-05,
"loss": 0.3938,
"step": 812
},
{
"epoch": 1.6642784032753326,
"grad_norm": 0.4096084241414413,
"learning_rate": 3.373176839054576e-05,
"loss": 0.4157,
"step": 813
},
{
"epoch": 1.6663254861821903,
"grad_norm": 0.4186774343886114,
"learning_rate": 3.371095212948431e-05,
"loss": 0.4681,
"step": 814
},
{
"epoch": 1.668372569089048,
"grad_norm": 0.40515797591205743,
"learning_rate": 3.3690107807457085e-05,
"loss": 0.4459,
"step": 815
},
{
"epoch": 1.6704196519959058,
"grad_norm": 0.32867520587713756,
"learning_rate": 3.366923546712426e-05,
"loss": 0.3888,
"step": 816
},
{
"epoch": 1.6724667349027635,
"grad_norm": 0.347589875409667,
"learning_rate": 3.364833515120336e-05,
"loss": 0.4083,
"step": 817
},
{
"epoch": 1.6745138178096213,
"grad_norm": 0.3591052938957919,
"learning_rate": 3.362740690246918e-05,
"loss": 0.3938,
"step": 818
},
{
"epoch": 1.676560900716479,
"grad_norm": 0.35854368626658006,
"learning_rate": 3.360645076375368e-05,
"loss": 0.4217,
"step": 819
},
{
"epoch": 1.6786079836233367,
"grad_norm": 0.44210711469086067,
"learning_rate": 3.358546677794586e-05,
"loss": 0.4752,
"step": 820
},
{
"epoch": 1.6806550665301945,
"grad_norm": 0.3418341105649144,
"learning_rate": 3.356445498799179e-05,
"loss": 0.3828,
"step": 821
},
{
"epoch": 1.6827021494370522,
"grad_norm": 0.40574252833368174,
"learning_rate": 3.354341543689438e-05,
"loss": 0.4138,
"step": 822
},
{
"epoch": 1.68474923234391,
"grad_norm": 0.41618079643123923,
"learning_rate": 3.352234816771337e-05,
"loss": 0.4559,
"step": 823
},
{
"epoch": 1.6867963152507677,
"grad_norm": 0.42761098792857727,
"learning_rate": 3.350125322356525e-05,
"loss": 0.4466,
"step": 824
},
{
"epoch": 1.6888433981576254,
"grad_norm": 0.33049589320031353,
"learning_rate": 3.348013064762312e-05,
"loss": 0.3768,
"step": 825
},
{
"epoch": 1.6908904810644831,
"grad_norm": 0.3768698742983625,
"learning_rate": 3.3458980483116664e-05,
"loss": 0.4278,
"step": 826
},
{
"epoch": 1.6929375639713409,
"grad_norm": 0.3675413592176129,
"learning_rate": 3.343780277333199e-05,
"loss": 0.4573,
"step": 827
},
{
"epoch": 1.6949846468781986,
"grad_norm": 0.39935534592316574,
"learning_rate": 3.3416597561611616e-05,
"loss": 0.4607,
"step": 828
},
{
"epoch": 1.6970317297850563,
"grad_norm": 0.3422867133996467,
"learning_rate": 3.3395364891354316e-05,
"loss": 0.3925,
"step": 829
},
{
"epoch": 1.699078812691914,
"grad_norm": 0.4421095089073154,
"learning_rate": 3.33741048060151e-05,
"loss": 0.5186,
"step": 830
},
{
"epoch": 1.7011258955987718,
"grad_norm": 0.40513609641554227,
"learning_rate": 3.3352817349105046e-05,
"loss": 0.4388,
"step": 831
},
{
"epoch": 1.7031729785056293,
"grad_norm": 0.47309198797804985,
"learning_rate": 3.333150256419127e-05,
"loss": 0.5152,
"step": 832
},
{
"epoch": 1.7052200614124873,
"grad_norm": 0.37396555660516145,
"learning_rate": 3.331016049489681e-05,
"loss": 0.3778,
"step": 833
},
{
"epoch": 1.7072671443193448,
"grad_norm": 0.46343295183867955,
"learning_rate": 3.328879118490055e-05,
"loss": 0.4181,
"step": 834
},
{
"epoch": 1.7093142272262027,
"grad_norm": 0.36985219342112263,
"learning_rate": 3.3267394677937134e-05,
"loss": 0.4519,
"step": 835
},
{
"epoch": 1.7113613101330603,
"grad_norm": 0.47168961292482453,
"learning_rate": 3.3245971017796854e-05,
"loss": 0.4754,
"step": 836
},
{
"epoch": 1.7134083930399182,
"grad_norm": 0.45545209130289294,
"learning_rate": 3.322452024832557e-05,
"loss": 0.4054,
"step": 837
},
{
"epoch": 1.7154554759467757,
"grad_norm": 0.44300545126776186,
"learning_rate": 3.320304241342464e-05,
"loss": 0.4707,
"step": 838
},
{
"epoch": 1.7175025588536337,
"grad_norm": 0.40914423787668386,
"learning_rate": 3.31815375570508e-05,
"loss": 0.4008,
"step": 839
},
{
"epoch": 1.7195496417604912,
"grad_norm": 0.42788343961512754,
"learning_rate": 3.3160005723216105e-05,
"loss": 0.4063,
"step": 840
},
{
"epoch": 1.7215967246673491,
"grad_norm": 0.3836547816282569,
"learning_rate": 3.31384469559878e-05,
"loss": 0.4471,
"step": 841
},
{
"epoch": 1.7236438075742067,
"grad_norm": 0.3712650340512891,
"learning_rate": 3.311686129948827e-05,
"loss": 0.3813,
"step": 842
},
{
"epoch": 1.7256908904810646,
"grad_norm": 0.3661541112584301,
"learning_rate": 3.3095248797894925e-05,
"loss": 0.4373,
"step": 843
},
{
"epoch": 1.7277379733879221,
"grad_norm": 0.40769069558582477,
"learning_rate": 3.307360949544012e-05,
"loss": 0.433,
"step": 844
},
{
"epoch": 1.72978505629478,
"grad_norm": 0.4384390897805754,
"learning_rate": 3.305194343641106e-05,
"loss": 0.4414,
"step": 845
},
{
"epoch": 1.7318321392016376,
"grad_norm": 0.39419405263041707,
"learning_rate": 3.30302506651497e-05,
"loss": 0.3903,
"step": 846
},
{
"epoch": 1.7338792221084955,
"grad_norm": 0.5193517899563321,
"learning_rate": 3.300853122605268e-05,
"loss": 0.5126,
"step": 847
},
{
"epoch": 1.735926305015353,
"grad_norm": 0.37470876760021676,
"learning_rate": 3.2986785163571216e-05,
"loss": 0.4088,
"step": 848
},
{
"epoch": 1.737973387922211,
"grad_norm": 0.3911340678248973,
"learning_rate": 3.2965012522211e-05,
"loss": 0.4231,
"step": 849
},
{
"epoch": 1.7400204708290685,
"grad_norm": 0.3718531256618771,
"learning_rate": 3.294321334653213e-05,
"loss": 0.4087,
"step": 850
},
{
"epoch": 1.7420675537359263,
"grad_norm": 0.44651626014948276,
"learning_rate": 3.2921387681149e-05,
"loss": 0.4623,
"step": 851
},
{
"epoch": 1.744114636642784,
"grad_norm": 0.37467196684362686,
"learning_rate": 3.289953557073024e-05,
"loss": 0.4148,
"step": 852
},
{
"epoch": 1.7461617195496417,
"grad_norm": 0.42577176527999205,
"learning_rate": 3.2877657059998584e-05,
"loss": 0.4103,
"step": 853
},
{
"epoch": 1.7482088024564995,
"grad_norm": 0.3833747846094608,
"learning_rate": 3.285575219373079e-05,
"loss": 0.4432,
"step": 854
},
{
"epoch": 1.7502558853633572,
"grad_norm": 0.4116483874002876,
"learning_rate": 3.2833821016757586e-05,
"loss": 0.4203,
"step": 855
},
{
"epoch": 1.752302968270215,
"grad_norm": 0.330063754223896,
"learning_rate": 3.281186357396351e-05,
"loss": 0.3895,
"step": 856
},
{
"epoch": 1.7543500511770727,
"grad_norm": 0.398313432569639,
"learning_rate": 3.278987991028688e-05,
"loss": 0.4367,
"step": 857
},
{
"epoch": 1.7563971340839304,
"grad_norm": 0.3618880492014595,
"learning_rate": 3.276787007071968e-05,
"loss": 0.3917,
"step": 858
},
{
"epoch": 1.7584442169907881,
"grad_norm": 0.4604401121085883,
"learning_rate": 3.274583410030745e-05,
"loss": 0.4577,
"step": 859
},
{
"epoch": 1.7604912998976459,
"grad_norm": 0.33679127692517397,
"learning_rate": 3.2723772044149224e-05,
"loss": 0.4072,
"step": 860
},
{
"epoch": 1.7625383828045036,
"grad_norm": 0.3624660321439512,
"learning_rate": 3.270168394739741e-05,
"loss": 0.4098,
"step": 861
},
{
"epoch": 1.7645854657113613,
"grad_norm": 0.343429438960415,
"learning_rate": 3.267956985525774e-05,
"loss": 0.4262,
"step": 862
},
{
"epoch": 1.766632548618219,
"grad_norm": 0.3866589432412809,
"learning_rate": 3.26574298129891e-05,
"loss": 0.4311,
"step": 863
},
{
"epoch": 1.7686796315250768,
"grad_norm": 0.3484380413267947,
"learning_rate": 3.263526386590351e-05,
"loss": 0.4265,
"step": 864
},
{
"epoch": 1.7707267144319345,
"grad_norm": 0.3727255169733587,
"learning_rate": 3.261307205936603e-05,
"loss": 0.4936,
"step": 865
},
{
"epoch": 1.7727737973387923,
"grad_norm": 0.35604276467584445,
"learning_rate": 3.2590854438794604e-05,
"loss": 0.3855,
"step": 866
},
{
"epoch": 1.77482088024565,
"grad_norm": 0.37701838111479336,
"learning_rate": 3.2568611049660046e-05,
"loss": 0.4308,
"step": 867
},
{
"epoch": 1.7768679631525077,
"grad_norm": 0.3530904170276449,
"learning_rate": 3.2546341937485884e-05,
"loss": 0.4198,
"step": 868
},
{
"epoch": 1.7789150460593655,
"grad_norm": 0.37564040686995553,
"learning_rate": 3.2524047147848284e-05,
"loss": 0.3702,
"step": 869
},
{
"epoch": 1.7809621289662232,
"grad_norm": 0.366044908240742,
"learning_rate": 3.250172672637598e-05,
"loss": 0.4561,
"step": 870
},
{
"epoch": 1.7830092118730807,
"grad_norm": 0.3683215486290299,
"learning_rate": 3.247938071875017e-05,
"loss": 0.4467,
"step": 871
},
{
"epoch": 1.7850562947799387,
"grad_norm": 0.38716018556798415,
"learning_rate": 3.24570091707044e-05,
"loss": 0.4151,
"step": 872
},
{
"epoch": 1.7871033776867962,
"grad_norm": 0.7367692001662189,
"learning_rate": 3.24346121280245e-05,
"loss": 0.438,
"step": 873
},
{
"epoch": 1.7891504605936541,
"grad_norm": 0.41338889728660405,
"learning_rate": 3.2412189636548456e-05,
"loss": 0.4629,
"step": 874
},
{
"epoch": 1.7911975435005116,
"grad_norm": 0.35357448274069847,
"learning_rate": 3.238974174216637e-05,
"loss": 0.3559,
"step": 875
},
{
"epoch": 1.7932446264073696,
"grad_norm": 0.3796163326385031,
"learning_rate": 3.236726849082032e-05,
"loss": 0.4281,
"step": 876
},
{
"epoch": 1.795291709314227,
"grad_norm": 0.3944707372507435,
"learning_rate": 3.234476992850425e-05,
"loss": 0.4537,
"step": 877
},
{
"epoch": 1.797338792221085,
"grad_norm": 0.3318296802336291,
"learning_rate": 3.232224610126396e-05,
"loss": 0.3985,
"step": 878
},
{
"epoch": 1.7993858751279426,
"grad_norm": 0.40919288715091856,
"learning_rate": 3.229969705519693e-05,
"loss": 0.4616,
"step": 879
},
{
"epoch": 1.8014329580348005,
"grad_norm": 0.39543379019412783,
"learning_rate": 3.227712283645224e-05,
"loss": 0.4883,
"step": 880
},
{
"epoch": 1.803480040941658,
"grad_norm": 0.3624564100802025,
"learning_rate": 3.225452349123051e-05,
"loss": 0.4264,
"step": 881
},
{
"epoch": 1.805527123848516,
"grad_norm": 0.35250394717776745,
"learning_rate": 3.2231899065783766e-05,
"loss": 0.3975,
"step": 882
},
{
"epoch": 1.8075742067553735,
"grad_norm": 0.41454217155818623,
"learning_rate": 3.2209249606415394e-05,
"loss": 0.4668,
"step": 883
},
{
"epoch": 1.8096212896622315,
"grad_norm": 0.33248312085973525,
"learning_rate": 3.2186575159479966e-05,
"loss": 0.3853,
"step": 884
},
{
"epoch": 1.811668372569089,
"grad_norm": 0.41602712353842625,
"learning_rate": 3.2163875771383246e-05,
"loss": 0.4615,
"step": 885
},
{
"epoch": 1.813715455475947,
"grad_norm": 0.3941084429766098,
"learning_rate": 3.214115148858201e-05,
"loss": 0.5111,
"step": 886
},
{
"epoch": 1.8157625383828044,
"grad_norm": 0.34750451682083505,
"learning_rate": 3.211840235758399e-05,
"loss": 0.4055,
"step": 887
},
{
"epoch": 1.8178096212896624,
"grad_norm": 0.45072414952251894,
"learning_rate": 3.209562842494778e-05,
"loss": 0.4673,
"step": 888
},
{
"epoch": 1.81985670419652,
"grad_norm": 0.36513950471383816,
"learning_rate": 3.207282973728273e-05,
"loss": 0.4526,
"step": 889
},
{
"epoch": 1.8219037871033776,
"grad_norm": 0.3898093803880879,
"learning_rate": 3.205000634124884e-05,
"loss": 0.4045,
"step": 890
},
{
"epoch": 1.8239508700102354,
"grad_norm": 0.36982373792147444,
"learning_rate": 3.20271582835567e-05,
"loss": 0.4079,
"step": 891
},
{
"epoch": 1.825997952917093,
"grad_norm": 0.35532947918298324,
"learning_rate": 3.200428561096737e-05,
"loss": 0.444,
"step": 892
},
{
"epoch": 1.8280450358239508,
"grad_norm": 0.3777203606733099,
"learning_rate": 3.198138837029227e-05,
"loss": 0.46,
"step": 893
},
{
"epoch": 1.8300921187308086,
"grad_norm": 0.3489126911275309,
"learning_rate": 3.195846660839311e-05,
"loss": 0.3887,
"step": 894
},
{
"epoch": 1.8321392016376663,
"grad_norm": 0.3633804424526499,
"learning_rate": 3.193552037218179e-05,
"loss": 0.5416,
"step": 895
},
{
"epoch": 1.834186284544524,
"grad_norm": 0.38704595278722626,
"learning_rate": 3.1912549708620314e-05,
"loss": 0.47,
"step": 896
},
{
"epoch": 1.8362333674513818,
"grad_norm": 0.36865192383072315,
"learning_rate": 3.188955466472063e-05,
"loss": 0.4084,
"step": 897
},
{
"epoch": 1.8382804503582395,
"grad_norm": 0.42348322893722706,
"learning_rate": 3.186653528754464e-05,
"loss": 0.4354,
"step": 898
},
{
"epoch": 1.8403275332650972,
"grad_norm": 0.37712189200316865,
"learning_rate": 3.184349162420401e-05,
"loss": 0.4011,
"step": 899
},
{
"epoch": 1.842374616171955,
"grad_norm": 0.40519216196971364,
"learning_rate": 3.182042372186013e-05,
"loss": 0.4523,
"step": 900
},
{
"epoch": 1.8444216990788127,
"grad_norm": 0.3270302439955843,
"learning_rate": 3.179733162772398e-05,
"loss": 0.3863,
"step": 901
},
{
"epoch": 1.8464687819856704,
"grad_norm": 0.36562347535937434,
"learning_rate": 3.177421538905606e-05,
"loss": 0.414,
"step": 902
},
{
"epoch": 1.8485158648925282,
"grad_norm": 0.34059679657441694,
"learning_rate": 3.17510750531663e-05,
"loss": 0.4319,
"step": 903
},
{
"epoch": 1.850562947799386,
"grad_norm": 0.3971233949934265,
"learning_rate": 3.172791066741392e-05,
"loss": 0.4051,
"step": 904
},
{
"epoch": 1.8526100307062436,
"grad_norm": 0.37489566812190156,
"learning_rate": 3.170472227920737e-05,
"loss": 0.4488,
"step": 905
},
{
"epoch": 1.8546571136131014,
"grad_norm": 0.3619651650704685,
"learning_rate": 3.168150993600424e-05,
"loss": 0.4097,
"step": 906
},
{
"epoch": 1.856704196519959,
"grad_norm": 0.4144444046604467,
"learning_rate": 3.165827368531113e-05,
"loss": 0.4838,
"step": 907
},
{
"epoch": 1.8587512794268168,
"grad_norm": 0.34162161820675707,
"learning_rate": 3.1635013574683564e-05,
"loss": 0.403,
"step": 908
},
{
"epoch": 1.8607983623336746,
"grad_norm": 0.3876122467545283,
"learning_rate": 3.161172965172591e-05,
"loss": 0.4564,
"step": 909
},
{
"epoch": 1.862845445240532,
"grad_norm": 0.34340552862244217,
"learning_rate": 3.1588421964091276e-05,
"loss": 0.397,
"step": 910
},
{
"epoch": 1.86489252814739,
"grad_norm": 0.383808899187042,
"learning_rate": 3.1565090559481396e-05,
"loss": 0.4265,
"step": 911
},
{
"epoch": 1.8669396110542475,
"grad_norm": 0.31690081389294966,
"learning_rate": 3.1541735485646536e-05,
"loss": 0.4047,
"step": 912
},
{
"epoch": 1.8689866939611055,
"grad_norm": 0.3992971993499514,
"learning_rate": 3.151835679038542e-05,
"loss": 0.439,
"step": 913
},
{
"epoch": 1.871033776867963,
"grad_norm": 0.31545614214320866,
"learning_rate": 3.149495452154512e-05,
"loss": 0.3986,
"step": 914
},
{
"epoch": 1.873080859774821,
"grad_norm": 0.33181454924376286,
"learning_rate": 3.147152872702092e-05,
"loss": 0.4364,
"step": 915
},
{
"epoch": 1.8751279426816785,
"grad_norm": 0.38596107764783966,
"learning_rate": 3.14480794547563e-05,
"loss": 0.4666,
"step": 916
},
{
"epoch": 1.8771750255885364,
"grad_norm": 0.3743307900880147,
"learning_rate": 3.142460675274275e-05,
"loss": 0.4136,
"step": 917
},
{
"epoch": 1.879222108495394,
"grad_norm": 0.39267653351820997,
"learning_rate": 3.1401110669019724e-05,
"loss": 0.4308,
"step": 918
},
{
"epoch": 1.881269191402252,
"grad_norm": 0.3865400215305747,
"learning_rate": 3.137759125167455e-05,
"loss": 0.4663,
"step": 919
},
{
"epoch": 1.8833162743091094,
"grad_norm": 0.36025373309076514,
"learning_rate": 3.135404854884226e-05,
"loss": 0.4202,
"step": 920
},
{
"epoch": 1.8853633572159674,
"grad_norm": 0.3774704107912035,
"learning_rate": 3.133048260870561e-05,
"loss": 0.4047,
"step": 921
},
{
"epoch": 1.8874104401228249,
"grad_norm": 0.3281213257339426,
"learning_rate": 3.130689347949486e-05,
"loss": 0.4088,
"step": 922
},
{
"epoch": 1.8894575230296828,
"grad_norm": 0.3593342361422244,
"learning_rate": 3.1283281209487755e-05,
"loss": 0.4475,
"step": 923
},
{
"epoch": 1.8915046059365404,
"grad_norm": 0.348849346649699,
"learning_rate": 3.1259645847009384e-05,
"loss": 0.4133,
"step": 924
},
{
"epoch": 1.8935516888433983,
"grad_norm": 0.3661633848236013,
"learning_rate": 3.123598744043211e-05,
"loss": 0.4345,
"step": 925
},
{
"epoch": 1.8955987717502558,
"grad_norm": 0.39376453806042766,
"learning_rate": 3.121230603817545e-05,
"loss": 0.4802,
"step": 926
},
{
"epoch": 1.8976458546571138,
"grad_norm": 0.4054577347062655,
"learning_rate": 3.1188601688706e-05,
"loss": 0.4861,
"step": 927
},
{
"epoch": 1.8996929375639713,
"grad_norm": 0.3244003846213942,
"learning_rate": 3.1164874440537295e-05,
"loss": 0.3988,
"step": 928
},
{
"epoch": 1.901740020470829,
"grad_norm": 0.3598304561011502,
"learning_rate": 3.114112434222976e-05,
"loss": 0.4083,
"step": 929
},
{
"epoch": 1.9037871033776868,
"grad_norm": 0.3818801716604425,
"learning_rate": 3.111735144239057e-05,
"loss": 0.424,
"step": 930
},
{
"epoch": 1.9058341862845445,
"grad_norm": 0.351513366570262,
"learning_rate": 3.109355578967356e-05,
"loss": 0.4529,
"step": 931
},
{
"epoch": 1.9078812691914022,
"grad_norm": 0.3240275031967729,
"learning_rate": 3.106973743277916e-05,
"loss": 0.4211,
"step": 932
},
{
"epoch": 1.90992835209826,
"grad_norm": 0.361341366033979,
"learning_rate": 3.104589642045422e-05,
"loss": 0.4776,
"step": 933
},
{
"epoch": 1.9119754350051177,
"grad_norm": 0.29231193321874205,
"learning_rate": 3.1022032801492e-05,
"loss": 0.3741,
"step": 934
},
{
"epoch": 1.9140225179119754,
"grad_norm": 0.3401516128458725,
"learning_rate": 3.099814662473202e-05,
"loss": 0.411,
"step": 935
},
{
"epoch": 1.9160696008188332,
"grad_norm": 0.3813395041273339,
"learning_rate": 3.0974237939059947e-05,
"loss": 0.4652,
"step": 936
},
{
"epoch": 1.9181166837256909,
"grad_norm": 0.33396626631772636,
"learning_rate": 3.095030679340751e-05,
"loss": 0.438,
"step": 937
},
{
"epoch": 1.9201637666325486,
"grad_norm": 0.27154874124436096,
"learning_rate": 3.092635323675245e-05,
"loss": 0.3297,
"step": 938
},
{
"epoch": 1.9222108495394064,
"grad_norm": 0.3729869848757238,
"learning_rate": 3.0902377318118336e-05,
"loss": 0.3925,
"step": 939
},
{
"epoch": 1.924257932446264,
"grad_norm": 0.3729082740901169,
"learning_rate": 3.0878379086574494e-05,
"loss": 0.4632,
"step": 940
},
{
"epoch": 1.9263050153531218,
"grad_norm": 0.33442817830860083,
"learning_rate": 3.085435859123596e-05,
"loss": 0.4246,
"step": 941
},
{
"epoch": 1.9283520982599796,
"grad_norm": 0.3455333402015141,
"learning_rate": 3.083031588126329e-05,
"loss": 0.4291,
"step": 942
},
{
"epoch": 1.9303991811668373,
"grad_norm": 0.33030857885870823,
"learning_rate": 3.0806251005862535e-05,
"loss": 0.4293,
"step": 943
},
{
"epoch": 1.932446264073695,
"grad_norm": 0.3313566999610316,
"learning_rate": 3.07821640142851e-05,
"loss": 0.4691,
"step": 944
},
{
"epoch": 1.9344933469805528,
"grad_norm": 0.3371774148631356,
"learning_rate": 3.0758054955827655e-05,
"loss": 0.4283,
"step": 945
},
{
"epoch": 1.9365404298874105,
"grad_norm": 0.35055569911779827,
"learning_rate": 3.073392387983202e-05,
"loss": 0.4157,
"step": 946
},
{
"epoch": 1.9385875127942682,
"grad_norm": 0.3713512962937003,
"learning_rate": 3.070977083568508e-05,
"loss": 0.4709,
"step": 947
},
{
"epoch": 1.940634595701126,
"grad_norm": 0.3121880967121788,
"learning_rate": 3.06855958728187e-05,
"loss": 0.3584,
"step": 948
},
{
"epoch": 1.9426816786079835,
"grad_norm": 0.3923276029776799,
"learning_rate": 3.0661399040709584e-05,
"loss": 0.4273,
"step": 949
},
{
"epoch": 1.9447287615148414,
"grad_norm": 0.39501548252234137,
"learning_rate": 3.0637180388879207e-05,
"loss": 0.4292,
"step": 950
},
{
"epoch": 1.946775844421699,
"grad_norm": 0.3918727723912462,
"learning_rate": 3.061293996689369e-05,
"loss": 0.4422,
"step": 951
},
{
"epoch": 1.9488229273285569,
"grad_norm": 0.3650853100344803,
"learning_rate": 3.05886778243637e-05,
"loss": 0.4355,
"step": 952
},
{
"epoch": 1.9508700102354144,
"grad_norm": 0.40314024207731314,
"learning_rate": 3.0564394010944396e-05,
"loss": 0.3964,
"step": 953
},
{
"epoch": 1.9529170931422724,
"grad_norm": 0.3321329959720213,
"learning_rate": 3.054008857633524e-05,
"loss": 0.3802,
"step": 954
},
{
"epoch": 1.9549641760491299,
"grad_norm": 0.3578935619719161,
"learning_rate": 3.051576157027998e-05,
"loss": 0.4187,
"step": 955
},
{
"epoch": 1.9570112589559878,
"grad_norm": 0.36008706778157745,
"learning_rate": 3.0491413042566492e-05,
"loss": 0.421,
"step": 956
},
{
"epoch": 1.9590583418628453,
"grad_norm": 0.35653968726696394,
"learning_rate": 3.0467043043026705e-05,
"loss": 0.3773,
"step": 957
},
{
"epoch": 1.9611054247697033,
"grad_norm": 0.398663136011159,
"learning_rate": 3.0442651621536502e-05,
"loss": 0.4867,
"step": 958
},
{
"epoch": 1.9631525076765608,
"grad_norm": 0.39803480683406417,
"learning_rate": 3.041823882801559e-05,
"loss": 0.4572,
"step": 959
},
{
"epoch": 1.9651995905834188,
"grad_norm": 0.31571236419876386,
"learning_rate": 3.039380471242743e-05,
"loss": 0.3953,
"step": 960
},
{
"epoch": 1.9672466734902763,
"grad_norm": 0.356704934998217,
"learning_rate": 3.0369349324779115e-05,
"loss": 0.4116,
"step": 961
},
{
"epoch": 1.9692937563971342,
"grad_norm": 0.38456016779817315,
"learning_rate": 3.0344872715121276e-05,
"loss": 0.4181,
"step": 962
},
{
"epoch": 1.9713408393039917,
"grad_norm": 0.42725505346001535,
"learning_rate": 3.0320374933547982e-05,
"loss": 0.4509,
"step": 963
},
{
"epoch": 1.9733879222108497,
"grad_norm": 0.3124541183454085,
"learning_rate": 3.0295856030196618e-05,
"loss": 0.3635,
"step": 964
},
{
"epoch": 1.9754350051177072,
"grad_norm": 0.38135606484411233,
"learning_rate": 3.0271316055247812e-05,
"loss": 0.4322,
"step": 965
},
{
"epoch": 1.9774820880245652,
"grad_norm": 0.3564421243258828,
"learning_rate": 3.024675505892531e-05,
"loss": 0.4706,
"step": 966
},
{
"epoch": 1.9795291709314227,
"grad_norm": 0.33614285440354186,
"learning_rate": 3.022217309149588e-05,
"loss": 0.3916,
"step": 967
},
{
"epoch": 1.9815762538382804,
"grad_norm": 0.3642272706560808,
"learning_rate": 3.019757020326921e-05,
"loss": 0.4001,
"step": 968
},
{
"epoch": 1.9836233367451381,
"grad_norm": 0.3677389037382802,
"learning_rate": 3.017294644459782e-05,
"loss": 0.4067,
"step": 969
},
{
"epoch": 1.9856704196519959,
"grad_norm": 0.38491158269193576,
"learning_rate": 3.0148301865876913e-05,
"loss": 0.4266,
"step": 970
},
{
"epoch": 1.9877175025588536,
"grad_norm": 0.4027798698020095,
"learning_rate": 3.0123636517544326e-05,
"loss": 0.5046,
"step": 971
},
{
"epoch": 1.9897645854657113,
"grad_norm": 0.36103993797318307,
"learning_rate": 3.0098950450080404e-05,
"loss": 0.3863,
"step": 972
},
{
"epoch": 1.991811668372569,
"grad_norm": 0.3202323840951822,
"learning_rate": 3.0074243714007875e-05,
"loss": 0.3562,
"step": 973
},
{
"epoch": 1.9938587512794268,
"grad_norm": 0.3949919429980765,
"learning_rate": 3.004951635989179e-05,
"loss": 0.4732,
"step": 974
},
{
"epoch": 1.9959058341862845,
"grad_norm": 0.3475495093601558,
"learning_rate": 3.0024768438339388e-05,
"loss": 0.391,
"step": 975
},
{
"epoch": 1.9979529170931423,
"grad_norm": 0.39216689067501626,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.4564,
"step": 976
},
{
"epoch": 2.0,
"grad_norm": 0.6533349441460323,
"learning_rate": 2.9975211095564955e-05,
"loss": 0.5882,
"step": 977
},
{
"epoch": 2.0020470829068575,
"grad_norm": 0.4211988149390974,
"learning_rate": 2.995040177576745e-05,
"loss": 0.3174,
"step": 978
},
{
"epoch": 2.0040941658137155,
"grad_norm": 0.6108187632741519,
"learning_rate": 2.992557209138249e-05,
"loss": 0.3367,
"step": 979
},
{
"epoch": 2.006141248720573,
"grad_norm": 0.38089369292387826,
"learning_rate": 2.9900722093226737e-05,
"loss": 0.3232,
"step": 980
},
{
"epoch": 2.008188331627431,
"grad_norm": 0.47508227914348694,
"learning_rate": 2.9875851832158428e-05,
"loss": 0.3522,
"step": 981
},
{
"epoch": 2.0102354145342884,
"grad_norm": 0.4326113199153507,
"learning_rate": 2.9850961359077293e-05,
"loss": 0.2981,
"step": 982
},
{
"epoch": 2.0122824974411464,
"grad_norm": 0.42932267247803857,
"learning_rate": 2.98260507249244e-05,
"loss": 0.3202,
"step": 983
},
{
"epoch": 2.014329580348004,
"grad_norm": 0.3951716605003508,
"learning_rate": 2.9801119980682095e-05,
"loss": 0.3509,
"step": 984
},
{
"epoch": 2.016376663254862,
"grad_norm": 0.35789828953920616,
"learning_rate": 2.977616917737388e-05,
"loss": 0.3281,
"step": 985
},
{
"epoch": 2.0184237461617194,
"grad_norm": 0.3460413679418126,
"learning_rate": 2.9751198366064304e-05,
"loss": 0.3161,
"step": 986
},
{
"epoch": 2.0204708290685773,
"grad_norm": 0.3703844711305289,
"learning_rate": 2.9726207597858872e-05,
"loss": 0.3251,
"step": 987
},
{
"epoch": 2.022517911975435,
"grad_norm": 0.39238883821417286,
"learning_rate": 2.9701196923903927e-05,
"loss": 0.3391,
"step": 988
},
{
"epoch": 2.024564994882293,
"grad_norm": 0.34303403649979675,
"learning_rate": 2.9676166395386553e-05,
"loss": 0.361,
"step": 989
},
{
"epoch": 2.0266120777891503,
"grad_norm": 0.38788579378291205,
"learning_rate": 2.965111606353447e-05,
"loss": 0.312,
"step": 990
},
{
"epoch": 2.0286591606960083,
"grad_norm": 0.4013869555066414,
"learning_rate": 2.9626045979615928e-05,
"loss": 0.3209,
"step": 991
},
{
"epoch": 2.030706243602866,
"grad_norm": 0.37089151639897344,
"learning_rate": 2.9600956194939598e-05,
"loss": 0.3498,
"step": 992
},
{
"epoch": 2.0327533265097237,
"grad_norm": 0.371332657755311,
"learning_rate": 2.957584676085447e-05,
"loss": 0.3422,
"step": 993
},
{
"epoch": 2.0348004094165812,
"grad_norm": 0.3762690812523185,
"learning_rate": 2.9550717728749768e-05,
"loss": 0.3199,
"step": 994
},
{
"epoch": 2.036847492323439,
"grad_norm": 0.3431217144511242,
"learning_rate": 2.9525569150054796e-05,
"loss": 0.2871,
"step": 995
},
{
"epoch": 2.0388945752302967,
"grad_norm": 0.42824685102967736,
"learning_rate": 2.950040107623887e-05,
"loss": 0.3268,
"step": 996
},
{
"epoch": 2.0409416581371547,
"grad_norm": 0.3459406390165573,
"learning_rate": 2.947521355881122e-05,
"loss": 0.3268,
"step": 997
},
{
"epoch": 2.042988741044012,
"grad_norm": 0.3484085391017404,
"learning_rate": 2.9450006649320862e-05,
"loss": 0.325,
"step": 998
},
{
"epoch": 2.04503582395087,
"grad_norm": 0.3537579300316069,
"learning_rate": 2.9424780399356497e-05,
"loss": 0.3144,
"step": 999
},
{
"epoch": 2.0470829068577276,
"grad_norm": 0.3540499619721327,
"learning_rate": 2.9399534860546404e-05,
"loss": 0.3324,
"step": 1000
},
{
"epoch": 2.0491299897645856,
"grad_norm": 0.3146082628297793,
"learning_rate": 2.937427008455835e-05,
"loss": 0.3458,
"step": 1001
},
{
"epoch": 2.051177072671443,
"grad_norm": 0.41424472646515875,
"learning_rate": 2.9348986123099462e-05,
"loss": 0.3638,
"step": 1002
},
{
"epoch": 2.053224155578301,
"grad_norm": 0.3640670691803175,
"learning_rate": 2.932368302791614e-05,
"loss": 0.3596,
"step": 1003
},
{
"epoch": 2.0552712384851586,
"grad_norm": 0.38223463417283743,
"learning_rate": 2.9298360850793944e-05,
"loss": 0.3048,
"step": 1004
},
{
"epoch": 2.0573183213920165,
"grad_norm": 0.35066959498537137,
"learning_rate": 2.9273019643557474e-05,
"loss": 0.3154,
"step": 1005
},
{
"epoch": 2.059365404298874,
"grad_norm": 0.3912112568074411,
"learning_rate": 2.92476594580703e-05,
"loss": 0.3397,
"step": 1006
},
{
"epoch": 2.061412487205732,
"grad_norm": 0.40688373618091567,
"learning_rate": 2.9222280346234816e-05,
"loss": 0.3315,
"step": 1007
},
{
"epoch": 2.0634595701125895,
"grad_norm": 0.3408605767984647,
"learning_rate": 2.919688235999215e-05,
"loss": 0.3334,
"step": 1008
},
{
"epoch": 2.0655066530194475,
"grad_norm": 0.35811936384341014,
"learning_rate": 2.917146555132206e-05,
"loss": 0.3386,
"step": 1009
},
{
"epoch": 2.067553735926305,
"grad_norm": 0.3364760604746556,
"learning_rate": 2.914602997224285e-05,
"loss": 0.3199,
"step": 1010
},
{
"epoch": 2.069600818833163,
"grad_norm": 0.4118145924928188,
"learning_rate": 2.912057567481119e-05,
"loss": 0.3507,
"step": 1011
},
{
"epoch": 2.0716479017400204,
"grad_norm": 0.3612606023218023,
"learning_rate": 2.909510271112212e-05,
"loss": 0.3462,
"step": 1012
},
{
"epoch": 2.0736949846468784,
"grad_norm": 0.3222144557445549,
"learning_rate": 2.906961113330883e-05,
"loss": 0.3271,
"step": 1013
},
{
"epoch": 2.075742067553736,
"grad_norm": 0.4762234619066632,
"learning_rate": 2.904410099354263e-05,
"loss": 0.311,
"step": 1014
},
{
"epoch": 2.0777891504605934,
"grad_norm": 0.3407988463597387,
"learning_rate": 2.9018572344032823e-05,
"loss": 0.3242,
"step": 1015
},
{
"epoch": 2.0798362333674514,
"grad_norm": 0.38477863373629423,
"learning_rate": 2.8993025237026578e-05,
"loss": 0.3472,
"step": 1016
},
{
"epoch": 2.0818833162743093,
"grad_norm": 0.4212662322514699,
"learning_rate": 2.8967459724808856e-05,
"loss": 0.3055,
"step": 1017
},
{
"epoch": 2.083930399181167,
"grad_norm": 0.34644422983431267,
"learning_rate": 2.8941875859702283e-05,
"loss": 0.3099,
"step": 1018
},
{
"epoch": 2.0859774820880244,
"grad_norm": 0.38660401266521843,
"learning_rate": 2.891627369406703e-05,
"loss": 0.3301,
"step": 1019
},
{
"epoch": 2.0880245649948823,
"grad_norm": 0.33570902053280804,
"learning_rate": 2.889065328030074e-05,
"loss": 0.3559,
"step": 1020
},
{
"epoch": 2.09007164790174,
"grad_norm": 0.3938573985227862,
"learning_rate": 2.88650146708384e-05,
"loss": 0.3301,
"step": 1021
},
{
"epoch": 2.092118730808598,
"grad_norm": 0.37756710702648894,
"learning_rate": 2.883935791815222e-05,
"loss": 0.3413,
"step": 1022
},
{
"epoch": 2.0941658137154553,
"grad_norm": 0.36055751260111174,
"learning_rate": 2.8813683074751578e-05,
"loss": 0.3469,
"step": 1023
},
{
"epoch": 2.0962128966223132,
"grad_norm": 0.34337336038390814,
"learning_rate": 2.878799019318283e-05,
"loss": 0.3339,
"step": 1024
},
{
"epoch": 2.0982599795291708,
"grad_norm": 0.37844750997650106,
"learning_rate": 2.8762279326029293e-05,
"loss": 0.3278,
"step": 1025
},
{
"epoch": 2.1003070624360287,
"grad_norm": 0.3641190698552649,
"learning_rate": 2.8736550525911066e-05,
"loss": 0.3548,
"step": 1026
},
{
"epoch": 2.1023541453428862,
"grad_norm": 0.36462562710150886,
"learning_rate": 2.8710803845484955e-05,
"loss": 0.321,
"step": 1027
},
{
"epoch": 2.104401228249744,
"grad_norm": 0.3706428526021544,
"learning_rate": 2.8685039337444368e-05,
"loss": 0.3174,
"step": 1028
},
{
"epoch": 2.1064483111566017,
"grad_norm": 0.3276491863527004,
"learning_rate": 2.8659257054519182e-05,
"loss": 0.3046,
"step": 1029
},
{
"epoch": 2.1084953940634596,
"grad_norm": 0.3247212720041268,
"learning_rate": 2.8633457049475678e-05,
"loss": 0.3526,
"step": 1030
},
{
"epoch": 2.110542476970317,
"grad_norm": 0.3326121138866842,
"learning_rate": 2.8607639375116388e-05,
"loss": 0.3179,
"step": 1031
},
{
"epoch": 2.112589559877175,
"grad_norm": 0.36702351353785473,
"learning_rate": 2.858180408428001e-05,
"loss": 0.3393,
"step": 1032
},
{
"epoch": 2.1146366427840326,
"grad_norm": 0.31780331970447684,
"learning_rate": 2.855595122984129e-05,
"loss": 0.3248,
"step": 1033
},
{
"epoch": 2.1166837256908906,
"grad_norm": 0.32446381302593813,
"learning_rate": 2.853008086471094e-05,
"loss": 0.3283,
"step": 1034
},
{
"epoch": 2.118730808597748,
"grad_norm": 0.3428912531231067,
"learning_rate": 2.8504193041835497e-05,
"loss": 0.3048,
"step": 1035
},
{
"epoch": 2.120777891504606,
"grad_norm": 0.4004823401099236,
"learning_rate": 2.847828781419722e-05,
"loss": 0.3667,
"step": 1036
},
{
"epoch": 2.1228249744114636,
"grad_norm": 0.36467750504991164,
"learning_rate": 2.8452365234813992e-05,
"loss": 0.3601,
"step": 1037
},
{
"epoch": 2.1248720573183215,
"grad_norm": 0.3853260717775913,
"learning_rate": 2.842642535673922e-05,
"loss": 0.3289,
"step": 1038
},
{
"epoch": 2.126919140225179,
"grad_norm": 0.3693312153767782,
"learning_rate": 2.8400468233061708e-05,
"loss": 0.3147,
"step": 1039
},
{
"epoch": 2.128966223132037,
"grad_norm": 0.38807123819436246,
"learning_rate": 2.8374493916905544e-05,
"loss": 0.3269,
"step": 1040
},
{
"epoch": 2.1310133060388945,
"grad_norm": 0.3324036292794174,
"learning_rate": 2.834850246143002e-05,
"loss": 0.3076,
"step": 1041
},
{
"epoch": 2.1330603889457525,
"grad_norm": 0.36633069839093463,
"learning_rate": 2.832249391982949e-05,
"loss": 0.3315,
"step": 1042
},
{
"epoch": 2.13510747185261,
"grad_norm": 0.348911620524954,
"learning_rate": 2.8296468345333298e-05,
"loss": 0.2945,
"step": 1043
},
{
"epoch": 2.137154554759468,
"grad_norm": 0.4313978670199707,
"learning_rate": 2.827042579120562e-05,
"loss": 0.3556,
"step": 1044
},
{
"epoch": 2.1392016376663254,
"grad_norm": 0.33673070921204956,
"learning_rate": 2.8244366310745398e-05,
"loss": 0.3301,
"step": 1045
},
{
"epoch": 2.1412487205731834,
"grad_norm": 0.4048102626926484,
"learning_rate": 2.8218289957286226e-05,
"loss": 0.3672,
"step": 1046
},
{
"epoch": 2.143295803480041,
"grad_norm": 0.41846672934257156,
"learning_rate": 2.8192196784196198e-05,
"loss": 0.3148,
"step": 1047
},
{
"epoch": 2.145342886386899,
"grad_norm": 0.3744830737938391,
"learning_rate": 2.816608684487787e-05,
"loss": 0.3252,
"step": 1048
},
{
"epoch": 2.1473899692937564,
"grad_norm": 0.40525534049659034,
"learning_rate": 2.813996019276809e-05,
"loss": 0.3354,
"step": 1049
},
{
"epoch": 2.1494370522006143,
"grad_norm": 0.3733330999629505,
"learning_rate": 2.8113816881337902e-05,
"loss": 0.3146,
"step": 1050
},
{
"epoch": 2.151484135107472,
"grad_norm": 0.39634148645678874,
"learning_rate": 2.8087656964092472e-05,
"loss": 0.3041,
"step": 1051
},
{
"epoch": 2.15353121801433,
"grad_norm": 0.3852615546043453,
"learning_rate": 2.806148049457093e-05,
"loss": 0.3639,
"step": 1052
},
{
"epoch": 2.1555783009211873,
"grad_norm": 0.3896444126363705,
"learning_rate": 2.803528752634629e-05,
"loss": 0.3317,
"step": 1053
},
{
"epoch": 2.1576253838280453,
"grad_norm": 0.38474286304985633,
"learning_rate": 2.8009078113025335e-05,
"loss": 0.3363,
"step": 1054
},
{
"epoch": 2.1596724667349028,
"grad_norm": 0.36285165391849056,
"learning_rate": 2.798285230824849e-05,
"loss": 0.3088,
"step": 1055
},
{
"epoch": 2.1617195496417603,
"grad_norm": 0.39881869082842497,
"learning_rate": 2.795661016568975e-05,
"loss": 0.3472,
"step": 1056
},
{
"epoch": 2.1637666325486182,
"grad_norm": 0.36428100096066146,
"learning_rate": 2.7930351739056533e-05,
"loss": 0.347,
"step": 1057
},
{
"epoch": 2.1658137154554757,
"grad_norm": 0.4335731332830444,
"learning_rate": 2.7904077082089574e-05,
"loss": 0.325,
"step": 1058
},
{
"epoch": 2.1678607983623337,
"grad_norm": 0.3984251614786555,
"learning_rate": 2.787778624856286e-05,
"loss": 0.3066,
"step": 1059
},
{
"epoch": 2.169907881269191,
"grad_norm": 0.42180327285845043,
"learning_rate": 2.7851479292283442e-05,
"loss": 0.3415,
"step": 1060
},
{
"epoch": 2.171954964176049,
"grad_norm": 0.41842751411141604,
"learning_rate": 2.782515626709139e-05,
"loss": 0.3498,
"step": 1061
},
{
"epoch": 2.1740020470829067,
"grad_norm": 0.3982687240967601,
"learning_rate": 2.7798817226859678e-05,
"loss": 0.3311,
"step": 1062
},
{
"epoch": 2.1760491299897646,
"grad_norm": 0.3653496091806209,
"learning_rate": 2.7772462225494013e-05,
"loss": 0.3393,
"step": 1063
},
{
"epoch": 2.178096212896622,
"grad_norm": 0.35694917670185916,
"learning_rate": 2.7746091316932807e-05,
"loss": 0.2938,
"step": 1064
},
{
"epoch": 2.18014329580348,
"grad_norm": 0.36076433028006755,
"learning_rate": 2.7719704555147012e-05,
"loss": 0.3176,
"step": 1065
},
{
"epoch": 2.1821903787103376,
"grad_norm": 0.35991273790931844,
"learning_rate": 2.7693301994140026e-05,
"loss": 0.3369,
"step": 1066
},
{
"epoch": 2.1842374616171956,
"grad_norm": 0.3605864665846731,
"learning_rate": 2.7666883687947588e-05,
"loss": 0.308,
"step": 1067
},
{
"epoch": 2.186284544524053,
"grad_norm": 0.3868682601764087,
"learning_rate": 2.7640449690637642e-05,
"loss": 0.335,
"step": 1068
},
{
"epoch": 2.188331627430911,
"grad_norm": 0.37080898315589017,
"learning_rate": 2.761400005631028e-05,
"loss": 0.3339,
"step": 1069
},
{
"epoch": 2.1903787103377685,
"grad_norm": 0.35560785487130986,
"learning_rate": 2.7587534839097556e-05,
"loss": 0.3348,
"step": 1070
},
{
"epoch": 2.1924257932446265,
"grad_norm": 0.36780049563494116,
"learning_rate": 2.756105409316345e-05,
"loss": 0.3042,
"step": 1071
},
{
"epoch": 2.194472876151484,
"grad_norm": 0.3750046262445992,
"learning_rate": 2.7534557872703705e-05,
"loss": 0.3286,
"step": 1072
},
{
"epoch": 2.196519959058342,
"grad_norm": 0.3643852622155405,
"learning_rate": 2.750804623194574e-05,
"loss": 0.3202,
"step": 1073
},
{
"epoch": 2.1985670419651995,
"grad_norm": 0.35562711733903674,
"learning_rate": 2.7481519225148537e-05,
"loss": 0.3158,
"step": 1074
},
{
"epoch": 2.2006141248720574,
"grad_norm": 0.34699724820587735,
"learning_rate": 2.7454976906602513e-05,
"loss": 0.3635,
"step": 1075
},
{
"epoch": 2.202661207778915,
"grad_norm": 0.3632869672814209,
"learning_rate": 2.742841933062944e-05,
"loss": 0.3378,
"step": 1076
},
{
"epoch": 2.204708290685773,
"grad_norm": 0.35456528308188723,
"learning_rate": 2.7401846551582304e-05,
"loss": 0.3078,
"step": 1077
},
{
"epoch": 2.2067553735926304,
"grad_norm": 0.4081395245932041,
"learning_rate": 2.7375258623845207e-05,
"loss": 0.3429,
"step": 1078
},
{
"epoch": 2.2088024564994884,
"grad_norm": 0.34135550119349445,
"learning_rate": 2.7348655601833255e-05,
"loss": 0.3338,
"step": 1079
},
{
"epoch": 2.210849539406346,
"grad_norm": 0.42118938931480804,
"learning_rate": 2.7322037539992457e-05,
"loss": 0.3327,
"step": 1080
},
{
"epoch": 2.212896622313204,
"grad_norm": 0.37577320629275823,
"learning_rate": 2.7295404492799575e-05,
"loss": 0.3002,
"step": 1081
},
{
"epoch": 2.2149437052200613,
"grad_norm": 0.3303091254072807,
"learning_rate": 2.726875651476207e-05,
"loss": 0.3106,
"step": 1082
},
{
"epoch": 2.2169907881269193,
"grad_norm": 0.35777928959386923,
"learning_rate": 2.7242093660417954e-05,
"loss": 0.364,
"step": 1083
},
{
"epoch": 2.219037871033777,
"grad_norm": 0.3963714636770776,
"learning_rate": 2.721541598433567e-05,
"loss": 0.2969,
"step": 1084
},
{
"epoch": 2.2210849539406348,
"grad_norm": 0.3409183879753277,
"learning_rate": 2.718872354111401e-05,
"loss": 0.3346,
"step": 1085
},
{
"epoch": 2.2231320368474923,
"grad_norm": 0.35338149712684697,
"learning_rate": 2.7162016385381975e-05,
"loss": 0.3648,
"step": 1086
},
{
"epoch": 2.2251791197543502,
"grad_norm": 0.3315730951687613,
"learning_rate": 2.7135294571798706e-05,
"loss": 0.3063,
"step": 1087
},
{
"epoch": 2.2272262026612077,
"grad_norm": 0.3449763315274245,
"learning_rate": 2.7108558155053296e-05,
"loss": 0.3403,
"step": 1088
},
{
"epoch": 2.2292732855680657,
"grad_norm": 0.37034261955083203,
"learning_rate": 2.7081807189864764e-05,
"loss": 0.3583,
"step": 1089
},
{
"epoch": 2.231320368474923,
"grad_norm": 0.3621281223181069,
"learning_rate": 2.70550417309819e-05,
"loss": 0.3144,
"step": 1090
},
{
"epoch": 2.233367451381781,
"grad_norm": 0.39746428647523574,
"learning_rate": 2.7028261833183132e-05,
"loss": 0.3376,
"step": 1091
},
{
"epoch": 2.2354145342886387,
"grad_norm": 0.34536466760948237,
"learning_rate": 2.7001467551276464e-05,
"loss": 0.2973,
"step": 1092
},
{
"epoch": 2.237461617195496,
"grad_norm": 0.3255654437979655,
"learning_rate": 2.6974658940099337e-05,
"loss": 0.3222,
"step": 1093
},
{
"epoch": 2.239508700102354,
"grad_norm": 0.34570256484501904,
"learning_rate": 2.6947836054518484e-05,
"loss": 0.3585,
"step": 1094
},
{
"epoch": 2.241555783009212,
"grad_norm": 0.3325137281496525,
"learning_rate": 2.6920998949429913e-05,
"loss": 0.309,
"step": 1095
},
{
"epoch": 2.2436028659160696,
"grad_norm": 0.34364314604723273,
"learning_rate": 2.6894147679758678e-05,
"loss": 0.332,
"step": 1096
},
{
"epoch": 2.245649948822927,
"grad_norm": 0.3323431139976392,
"learning_rate": 2.6867282300458853e-05,
"loss": 0.3365,
"step": 1097
},
{
"epoch": 2.247697031729785,
"grad_norm": 0.36326221718241064,
"learning_rate": 2.684040286651338e-05,
"loss": 0.3361,
"step": 1098
},
{
"epoch": 2.2497441146366426,
"grad_norm": 0.3309544898142775,
"learning_rate": 2.6813509432933957e-05,
"loss": 0.3142,
"step": 1099
},
{
"epoch": 2.2517911975435005,
"grad_norm": 0.3686682735992276,
"learning_rate": 2.6786602054760952e-05,
"loss": 0.3078,
"step": 1100
},
{
"epoch": 2.253838280450358,
"grad_norm": 0.3515712454533351,
"learning_rate": 2.675968078706326e-05,
"loss": 0.3151,
"step": 1101
},
{
"epoch": 2.255885363357216,
"grad_norm": 0.3495717010136385,
"learning_rate": 2.673274568493821e-05,
"loss": 0.3243,
"step": 1102
},
{
"epoch": 2.2579324462640735,
"grad_norm": 0.34663490722766044,
"learning_rate": 2.670579680351143e-05,
"loss": 0.3284,
"step": 1103
},
{
"epoch": 2.2599795291709315,
"grad_norm": 0.34744444318737083,
"learning_rate": 2.667883419793676e-05,
"loss": 0.313,
"step": 1104
},
{
"epoch": 2.262026612077789,
"grad_norm": 0.38465408810908674,
"learning_rate": 2.6651857923396132e-05,
"loss": 0.3759,
"step": 1105
},
{
"epoch": 2.264073694984647,
"grad_norm": 0.3414863910860875,
"learning_rate": 2.6624868035099445e-05,
"loss": 0.3336,
"step": 1106
},
{
"epoch": 2.2661207778915045,
"grad_norm": 0.33861934140219296,
"learning_rate": 2.659786458828446e-05,
"loss": 0.3587,
"step": 1107
},
{
"epoch": 2.2681678607983624,
"grad_norm": 0.32698315646381576,
"learning_rate": 2.6570847638216698e-05,
"loss": 0.3506,
"step": 1108
},
{
"epoch": 2.27021494370522,
"grad_norm": 0.3303135891963801,
"learning_rate": 2.65438172401893e-05,
"loss": 0.3451,
"step": 1109
},
{
"epoch": 2.272262026612078,
"grad_norm": 0.3769317246092125,
"learning_rate": 2.6516773449522936e-05,
"loss": 0.3257,
"step": 1110
},
{
"epoch": 2.2743091095189354,
"grad_norm": 0.3511492114236981,
"learning_rate": 2.648971632156569e-05,
"loss": 0.3147,
"step": 1111
},
{
"epoch": 2.2763561924257933,
"grad_norm": 0.3675620800860699,
"learning_rate": 2.6462645911692938e-05,
"loss": 0.2979,
"step": 1112
},
{
"epoch": 2.278403275332651,
"grad_norm": 0.3792036243843211,
"learning_rate": 2.643556227530724e-05,
"loss": 0.3065,
"step": 1113
},
{
"epoch": 2.280450358239509,
"grad_norm": 0.32407850095411667,
"learning_rate": 2.6408465467838225e-05,
"loss": 0.332,
"step": 1114
},
{
"epoch": 2.2824974411463663,
"grad_norm": 0.31664427206527734,
"learning_rate": 2.6381355544742482e-05,
"loss": 0.3963,
"step": 1115
},
{
"epoch": 2.2845445240532243,
"grad_norm": 0.34686617702001726,
"learning_rate": 2.6354232561503433e-05,
"loss": 0.3357,
"step": 1116
},
{
"epoch": 2.286591606960082,
"grad_norm": 0.33621332629180944,
"learning_rate": 2.632709657363124e-05,
"loss": 0.3089,
"step": 1117
},
{
"epoch": 2.2886386898669397,
"grad_norm": 0.33153383111460555,
"learning_rate": 2.6299947636662673e-05,
"loss": 0.3054,
"step": 1118
},
{
"epoch": 2.2906857727737973,
"grad_norm": 0.3416358925904056,
"learning_rate": 2.6272785806161005e-05,
"loss": 0.3278,
"step": 1119
},
{
"epoch": 2.292732855680655,
"grad_norm": 0.40808146160514075,
"learning_rate": 2.6245611137715897e-05,
"loss": 0.3519,
"step": 1120
},
{
"epoch": 2.2947799385875127,
"grad_norm": 0.32524026497753233,
"learning_rate": 2.621842368694329e-05,
"loss": 0.3239,
"step": 1121
},
{
"epoch": 2.2968270214943707,
"grad_norm": 0.3658777033819554,
"learning_rate": 2.6191223509485273e-05,
"loss": 0.3286,
"step": 1122
},
{
"epoch": 2.298874104401228,
"grad_norm": 0.36322923087428066,
"learning_rate": 2.6164010661010007e-05,
"loss": 0.3364,
"step": 1123
},
{
"epoch": 2.300921187308086,
"grad_norm": 0.3278681613056945,
"learning_rate": 2.613678519721155e-05,
"loss": 0.3086,
"step": 1124
},
{
"epoch": 2.3029682702149437,
"grad_norm": 0.3547085800078984,
"learning_rate": 2.61095471738098e-05,
"loss": 0.3215,
"step": 1125
},
{
"epoch": 2.3050153531218016,
"grad_norm": 0.31393691776472127,
"learning_rate": 2.6082296646550364e-05,
"loss": 0.3114,
"step": 1126
},
{
"epoch": 2.307062436028659,
"grad_norm": 0.35347094972995313,
"learning_rate": 2.605503367120442e-05,
"loss": 0.3621,
"step": 1127
},
{
"epoch": 2.309109518935517,
"grad_norm": 0.3314912682101731,
"learning_rate": 2.6027758303568643e-05,
"loss": 0.3196,
"step": 1128
},
{
"epoch": 2.3111566018423746,
"grad_norm": 0.30778697872934085,
"learning_rate": 2.6000470599465065e-05,
"loss": 0.3068,
"step": 1129
},
{
"epoch": 2.313203684749232,
"grad_norm": 0.37088270704077186,
"learning_rate": 2.5973170614740946e-05,
"loss": 0.373,
"step": 1130
},
{
"epoch": 2.31525076765609,
"grad_norm": 0.34268176472285344,
"learning_rate": 2.5945858405268714e-05,
"loss": 0.3413,
"step": 1131
},
{
"epoch": 2.317297850562948,
"grad_norm": 0.3248279403276539,
"learning_rate": 2.5918534026945787e-05,
"loss": 0.3559,
"step": 1132
},
{
"epoch": 2.3193449334698055,
"grad_norm": 0.30464581495860377,
"learning_rate": 2.5891197535694507e-05,
"loss": 0.3367,
"step": 1133
},
{
"epoch": 2.321392016376663,
"grad_norm": 0.35318148922449216,
"learning_rate": 2.5863848987461993e-05,
"loss": 0.3529,
"step": 1134
},
{
"epoch": 2.323439099283521,
"grad_norm": 0.37807352066757405,
"learning_rate": 2.5836488438220044e-05,
"loss": 0.3347,
"step": 1135
},
{
"epoch": 2.325486182190379,
"grad_norm": 0.3608071303513835,
"learning_rate": 2.5809115943965027e-05,
"loss": 0.3366,
"step": 1136
},
{
"epoch": 2.3275332650972365,
"grad_norm": 0.3499879431406946,
"learning_rate": 2.5781731560717745e-05,
"loss": 0.3106,
"step": 1137
},
{
"epoch": 2.329580348004094,
"grad_norm": 0.3591909142140177,
"learning_rate": 2.575433534452334e-05,
"loss": 0.3396,
"step": 1138
},
{
"epoch": 2.331627430910952,
"grad_norm": 0.33212351658248346,
"learning_rate": 2.5726927351451178e-05,
"loss": 0.3439,
"step": 1139
},
{
"epoch": 2.3336745138178094,
"grad_norm": 0.36284882910248367,
"learning_rate": 2.5699507637594706e-05,
"loss": 0.304,
"step": 1140
},
{
"epoch": 2.3357215967246674,
"grad_norm": 0.34302795456823765,
"learning_rate": 2.5672076259071385e-05,
"loss": 0.3276,
"step": 1141
},
{
"epoch": 2.337768679631525,
"grad_norm": 0.36338586044821625,
"learning_rate": 2.5644633272022536e-05,
"loss": 0.36,
"step": 1142
},
{
"epoch": 2.339815762538383,
"grad_norm": 0.3428090299047305,
"learning_rate": 2.561717873261323e-05,
"loss": 0.3062,
"step": 1143
},
{
"epoch": 2.3418628454452404,
"grad_norm": 0.38038284417246715,
"learning_rate": 2.558971269703219e-05,
"loss": 0.3555,
"step": 1144
},
{
"epoch": 2.3439099283520983,
"grad_norm": 0.35045801763083695,
"learning_rate": 2.556223522149168e-05,
"loss": 0.3433,
"step": 1145
},
{
"epoch": 2.345957011258956,
"grad_norm": 0.36682906880387844,
"learning_rate": 2.5534746362227355e-05,
"loss": 0.3507,
"step": 1146
},
{
"epoch": 2.348004094165814,
"grad_norm": 0.34210491558572537,
"learning_rate": 2.5507246175498174e-05,
"loss": 0.3178,
"step": 1147
},
{
"epoch": 2.3500511770726713,
"grad_norm": 0.3421821678613336,
"learning_rate": 2.5479734717586285e-05,
"loss": 0.3124,
"step": 1148
},
{
"epoch": 2.3520982599795293,
"grad_norm": 0.37370205112447163,
"learning_rate": 2.5452212044796912e-05,
"loss": 0.3179,
"step": 1149
},
{
"epoch": 2.3541453428863868,
"grad_norm": 0.3575054301368409,
"learning_rate": 2.5424678213458202e-05,
"loss": 0.2982,
"step": 1150
},
{
"epoch": 2.3561924257932447,
"grad_norm": 0.4005011767035321,
"learning_rate": 2.539713327992117e-05,
"loss": 0.329,
"step": 1151
},
{
"epoch": 2.3582395087001022,
"grad_norm": 0.3796694210246127,
"learning_rate": 2.5369577300559544e-05,
"loss": 0.3495,
"step": 1152
},
{
"epoch": 2.36028659160696,
"grad_norm": 0.38531974828237286,
"learning_rate": 2.5342010331769635e-05,
"loss": 0.3218,
"step": 1153
},
{
"epoch": 2.3623336745138177,
"grad_norm": 0.4150417947081143,
"learning_rate": 2.531443242997029e-05,
"loss": 0.3714,
"step": 1154
},
{
"epoch": 2.3643807574206757,
"grad_norm": 0.3665969184427945,
"learning_rate": 2.5286843651602688e-05,
"loss": 0.33,
"step": 1155
},
{
"epoch": 2.366427840327533,
"grad_norm": 0.3668439390982461,
"learning_rate": 2.5259244053130295e-05,
"loss": 0.3338,
"step": 1156
},
{
"epoch": 2.368474923234391,
"grad_norm": 0.4105022406361347,
"learning_rate": 2.5231633691038716e-05,
"loss": 0.3303,
"step": 1157
},
{
"epoch": 2.3705220061412486,
"grad_norm": 0.3610884307782202,
"learning_rate": 2.5204012621835575e-05,
"loss": 0.3108,
"step": 1158
},
{
"epoch": 2.3725690890481066,
"grad_norm": 0.39634461411533756,
"learning_rate": 2.5176380902050418e-05,
"loss": 0.3398,
"step": 1159
},
{
"epoch": 2.374616171954964,
"grad_norm": 0.3956297902185592,
"learning_rate": 2.5148738588234593e-05,
"loss": 0.3199,
"step": 1160
},
{
"epoch": 2.376663254861822,
"grad_norm": 0.33517128768421744,
"learning_rate": 2.5121085736961112e-05,
"loss": 0.3288,
"step": 1161
},
{
"epoch": 2.3787103377686796,
"grad_norm": 0.3543225737911076,
"learning_rate": 2.5093422404824574e-05,
"loss": 0.3483,
"step": 1162
},
{
"epoch": 2.3807574206755375,
"grad_norm": 0.35329611481723555,
"learning_rate": 2.506574864844102e-05,
"loss": 0.3439,
"step": 1163
},
{
"epoch": 2.382804503582395,
"grad_norm": 0.3415016772188138,
"learning_rate": 2.5038064524447827e-05,
"loss": 0.3461,
"step": 1164
},
{
"epoch": 2.384851586489253,
"grad_norm": 0.3618115220444992,
"learning_rate": 2.5010370089503578e-05,
"loss": 0.3243,
"step": 1165
},
{
"epoch": 2.3868986693961105,
"grad_norm": 0.36205472648695425,
"learning_rate": 2.4982665400287972e-05,
"loss": 0.3411,
"step": 1166
},
{
"epoch": 2.3889457523029685,
"grad_norm": 0.43026672506995006,
"learning_rate": 2.4954950513501697e-05,
"loss": 0.3319,
"step": 1167
},
{
"epoch": 2.390992835209826,
"grad_norm": 0.350243563017003,
"learning_rate": 2.4927225485866297e-05,
"loss": 0.3479,
"step": 1168
},
{
"epoch": 2.393039918116684,
"grad_norm": 0.3746353739097793,
"learning_rate": 2.4899490374124085e-05,
"loss": 0.3429,
"step": 1169
},
{
"epoch": 2.3950870010235414,
"grad_norm": 0.3260898872775567,
"learning_rate": 2.4871745235038006e-05,
"loss": 0.3472,
"step": 1170
},
{
"epoch": 2.397134083930399,
"grad_norm": 0.33558457363741073,
"learning_rate": 2.4843990125391516e-05,
"loss": 0.328,
"step": 1171
},
{
"epoch": 2.399181166837257,
"grad_norm": 0.38996942202586005,
"learning_rate": 2.4816225101988506e-05,
"loss": 0.3391,
"step": 1172
},
{
"epoch": 2.401228249744115,
"grad_norm": 0.3566387434855101,
"learning_rate": 2.478845022165313e-05,
"loss": 0.3667,
"step": 1173
},
{
"epoch": 2.4032753326509724,
"grad_norm": 0.3444970227104489,
"learning_rate": 2.4760665541229712e-05,
"loss": 0.3301,
"step": 1174
},
{
"epoch": 2.40532241555783,
"grad_norm": 0.3092553635632143,
"learning_rate": 2.473287111758267e-05,
"loss": 0.3401,
"step": 1175
},
{
"epoch": 2.407369498464688,
"grad_norm": 0.3394352190867357,
"learning_rate": 2.470506700759631e-05,
"loss": 0.3218,
"step": 1176
},
{
"epoch": 2.409416581371546,
"grad_norm": 0.3393116279567721,
"learning_rate": 2.467725326817481e-05,
"loss": 0.3113,
"step": 1177
},
{
"epoch": 2.4114636642784033,
"grad_norm": 0.399667522842199,
"learning_rate": 2.464942995624203e-05,
"loss": 0.3269,
"step": 1178
},
{
"epoch": 2.413510747185261,
"grad_norm": 0.3505124387280659,
"learning_rate": 2.462159712874142e-05,
"loss": 0.3411,
"step": 1179
},
{
"epoch": 2.4155578300921188,
"grad_norm": 0.3519791912196212,
"learning_rate": 2.4593754842635917e-05,
"loss": 0.3036,
"step": 1180
},
{
"epoch": 2.4176049129989763,
"grad_norm": 0.36680184492885926,
"learning_rate": 2.4565903154907807e-05,
"loss": 0.3898,
"step": 1181
},
{
"epoch": 2.4196519959058342,
"grad_norm": 0.3778349635443584,
"learning_rate": 2.453804212255862e-05,
"loss": 0.3427,
"step": 1182
},
{
"epoch": 2.4216990788126918,
"grad_norm": 0.3389030241399457,
"learning_rate": 2.451017180260902e-05,
"loss": 0.3271,
"step": 1183
},
{
"epoch": 2.4237461617195497,
"grad_norm": 0.42513595322376974,
"learning_rate": 2.448229225209865e-05,
"loss": 0.3298,
"step": 1184
},
{
"epoch": 2.425793244626407,
"grad_norm": 0.3712368776548631,
"learning_rate": 2.4454403528086088e-05,
"loss": 0.3323,
"step": 1185
},
{
"epoch": 2.427840327533265,
"grad_norm": 0.3200914239556184,
"learning_rate": 2.4426505687648653e-05,
"loss": 0.3387,
"step": 1186
},
{
"epoch": 2.4298874104401227,
"grad_norm": 0.346315254552554,
"learning_rate": 2.4398598787882334e-05,
"loss": 0.3449,
"step": 1187
},
{
"epoch": 2.4319344933469806,
"grad_norm": 0.3184464070741258,
"learning_rate": 2.4370682885901657e-05,
"loss": 0.3006,
"step": 1188
},
{
"epoch": 2.433981576253838,
"grad_norm": 0.34720894372009287,
"learning_rate": 2.4342758038839573e-05,
"loss": 0.3354,
"step": 1189
},
{
"epoch": 2.436028659160696,
"grad_norm": 0.3963098146391974,
"learning_rate": 2.4314824303847342e-05,
"loss": 0.3273,
"step": 1190
},
{
"epoch": 2.4380757420675536,
"grad_norm": 0.3419114471543989,
"learning_rate": 2.4286881738094418e-05,
"loss": 0.3334,
"step": 1191
},
{
"epoch": 2.4401228249744116,
"grad_norm": 0.3476204838650686,
"learning_rate": 2.4258930398768317e-05,
"loss": 0.3405,
"step": 1192
},
{
"epoch": 2.442169907881269,
"grad_norm": 0.3398409708251329,
"learning_rate": 2.423097034307452e-05,
"loss": 0.2964,
"step": 1193
},
{
"epoch": 2.444216990788127,
"grad_norm": 0.33597865186885006,
"learning_rate": 2.4203001628236346e-05,
"loss": 0.3122,
"step": 1194
},
{
"epoch": 2.4462640736949846,
"grad_norm": 0.7573334739644073,
"learning_rate": 2.4175024311494835e-05,
"loss": 0.37,
"step": 1195
},
{
"epoch": 2.4483111566018425,
"grad_norm": 0.3096397452376292,
"learning_rate": 2.4147038450108627e-05,
"loss": 0.3462,
"step": 1196
},
{
"epoch": 2.4503582395087,
"grad_norm": 0.32363631370662416,
"learning_rate": 2.4119044101353853e-05,
"loss": 0.3089,
"step": 1197
},
{
"epoch": 2.452405322415558,
"grad_norm": 0.44476946502851955,
"learning_rate": 2.4091041322524023e-05,
"loss": 0.3891,
"step": 1198
},
{
"epoch": 2.4544524053224155,
"grad_norm": 0.34282272355962545,
"learning_rate": 2.406303017092988e-05,
"loss": 0.3672,
"step": 1199
},
{
"epoch": 2.4564994882292734,
"grad_norm": 0.3458362218172388,
"learning_rate": 2.403501070389932e-05,
"loss": 0.3446,
"step": 1200
},
{
"epoch": 2.458546571136131,
"grad_norm": 0.330522133423883,
"learning_rate": 2.4006982978777263e-05,
"loss": 0.3547,
"step": 1201
},
{
"epoch": 2.460593654042989,
"grad_norm": 0.33086606944472513,
"learning_rate": 2.39789470529255e-05,
"loss": 0.3145,
"step": 1202
},
{
"epoch": 2.4626407369498464,
"grad_norm": 0.3611588885012983,
"learning_rate": 2.3950902983722645e-05,
"loss": 0.3512,
"step": 1203
},
{
"epoch": 2.4646878198567044,
"grad_norm": 0.35160175468657195,
"learning_rate": 2.392285082856394e-05,
"loss": 0.331,
"step": 1204
},
{
"epoch": 2.466734902763562,
"grad_norm": 0.3329347867861998,
"learning_rate": 2.389479064486121e-05,
"loss": 0.3697,
"step": 1205
},
{
"epoch": 2.46878198567042,
"grad_norm": 0.35641819087178006,
"learning_rate": 2.3866722490042685e-05,
"loss": 0.3606,
"step": 1206
},
{
"epoch": 2.4708290685772774,
"grad_norm": 0.3604571656733162,
"learning_rate": 2.3838646421552917e-05,
"loss": 0.3377,
"step": 1207
},
{
"epoch": 2.472876151484135,
"grad_norm": 0.3352126815168747,
"learning_rate": 2.3810562496852666e-05,
"loss": 0.3262,
"step": 1208
},
{
"epoch": 2.474923234390993,
"grad_norm": 0.35666331619036534,
"learning_rate": 2.3782470773418756e-05,
"loss": 0.339,
"step": 1209
},
{
"epoch": 2.4769703172978508,
"grad_norm": 0.39672935648110513,
"learning_rate": 2.3754371308743975e-05,
"loss": 0.33,
"step": 1210
},
{
"epoch": 2.4790174002047083,
"grad_norm": 0.3591230233614195,
"learning_rate": 2.372626416033696e-05,
"loss": 0.3209,
"step": 1211
},
{
"epoch": 2.481064483111566,
"grad_norm": 0.34331298511904995,
"learning_rate": 2.3698149385722067e-05,
"loss": 0.3376,
"step": 1212
},
{
"epoch": 2.4831115660184238,
"grad_norm": 0.3934047338427704,
"learning_rate": 2.367002704243927e-05,
"loss": 0.3198,
"step": 1213
},
{
"epoch": 2.4851586489252817,
"grad_norm": 0.34587155423784893,
"learning_rate": 2.3641897188044018e-05,
"loss": 0.3442,
"step": 1214
},
{
"epoch": 2.487205731832139,
"grad_norm": 0.3681467355523078,
"learning_rate": 2.3613759880107133e-05,
"loss": 0.343,
"step": 1215
},
{
"epoch": 2.4892528147389967,
"grad_norm": 0.3249289712237619,
"learning_rate": 2.3585615176214716e-05,
"loss": 0.3066,
"step": 1216
},
{
"epoch": 2.4912998976458547,
"grad_norm": 0.37477832639872893,
"learning_rate": 2.3557463133967976e-05,
"loss": 0.3325,
"step": 1217
},
{
"epoch": 2.493346980552712,
"grad_norm": 0.36178361521501945,
"learning_rate": 2.3529303810983154e-05,
"loss": 0.3127,
"step": 1218
},
{
"epoch": 2.49539406345957,
"grad_norm": 0.32831527326831483,
"learning_rate": 2.3501137264891396e-05,
"loss": 0.3248,
"step": 1219
},
{
"epoch": 2.4974411463664277,
"grad_norm": 0.4027644606883135,
"learning_rate": 2.3472963553338614e-05,
"loss": 0.3023,
"step": 1220
},
{
"epoch": 2.4994882292732856,
"grad_norm": 0.3439537192088776,
"learning_rate": 2.3444782733985396e-05,
"loss": 0.3252,
"step": 1221
},
{
"epoch": 2.501535312180143,
"grad_norm": 0.32953719599686015,
"learning_rate": 2.3416594864506887e-05,
"loss": 0.3043,
"step": 1222
},
{
"epoch": 2.503582395087001,
"grad_norm": 0.3490669906957581,
"learning_rate": 2.338840000259264e-05,
"loss": 0.3133,
"step": 1223
},
{
"epoch": 2.5056294779938586,
"grad_norm": 0.36299216934576684,
"learning_rate": 2.3360198205946542e-05,
"loss": 0.3141,
"step": 1224
},
{
"epoch": 2.5076765609007166,
"grad_norm": 0.3180197591727672,
"learning_rate": 2.333198953228664e-05,
"loss": 0.3535,
"step": 1225
},
{
"epoch": 2.509723643807574,
"grad_norm": 0.5171123946314492,
"learning_rate": 2.3303774039345098e-05,
"loss": 0.3367,
"step": 1226
},
{
"epoch": 2.511770726714432,
"grad_norm": 0.3840513193856834,
"learning_rate": 2.3275551784867997e-05,
"loss": 0.3188,
"step": 1227
},
{
"epoch": 2.5138178096212895,
"grad_norm": 0.3346463848714001,
"learning_rate": 2.3247322826615276e-05,
"loss": 0.3596,
"step": 1228
},
{
"epoch": 2.5158648925281475,
"grad_norm": 0.3505848734275376,
"learning_rate": 2.3219087222360603e-05,
"loss": 0.3395,
"step": 1229
},
{
"epoch": 2.517911975435005,
"grad_norm": 0.3415846729855278,
"learning_rate": 2.3190845029891218e-05,
"loss": 0.3326,
"step": 1230
},
{
"epoch": 2.519959058341863,
"grad_norm": 0.3273579902423157,
"learning_rate": 2.316259630700787e-05,
"loss": 0.3344,
"step": 1231
},
{
"epoch": 2.5220061412487205,
"grad_norm": 0.3247531510829717,
"learning_rate": 2.313434111152467e-05,
"loss": 0.3346,
"step": 1232
},
{
"epoch": 2.5240532241555784,
"grad_norm": 0.3503893886731608,
"learning_rate": 2.310607950126896e-05,
"loss": 0.3448,
"step": 1233
},
{
"epoch": 2.526100307062436,
"grad_norm": 0.35713820818797615,
"learning_rate": 2.307781153408124e-05,
"loss": 0.3359,
"step": 1234
},
{
"epoch": 2.528147389969294,
"grad_norm": 0.34251280908459203,
"learning_rate": 2.3049537267814984e-05,
"loss": 0.3431,
"step": 1235
},
{
"epoch": 2.5301944728761514,
"grad_norm": 0.34661535982225733,
"learning_rate": 2.3021256760336583e-05,
"loss": 0.3604,
"step": 1236
},
{
"epoch": 2.5322415557830094,
"grad_norm": 0.3630834914867728,
"learning_rate": 2.2992970069525202e-05,
"loss": 0.3472,
"step": 1237
},
{
"epoch": 2.534288638689867,
"grad_norm": 0.318836844861738,
"learning_rate": 2.296467725327264e-05,
"loss": 0.3174,
"step": 1238
},
{
"epoch": 2.536335721596725,
"grad_norm": 0.3365825133835808,
"learning_rate": 2.293637836948325e-05,
"loss": 0.3093,
"step": 1239
},
{
"epoch": 2.5383828045035823,
"grad_norm": 0.35403182852053533,
"learning_rate": 2.29080734760738e-05,
"loss": 0.3382,
"step": 1240
},
{
"epoch": 2.54042988741044,
"grad_norm": 0.3562520797769209,
"learning_rate": 2.2879762630973355e-05,
"loss": 0.3315,
"step": 1241
},
{
"epoch": 2.542476970317298,
"grad_norm": 0.3188066365019869,
"learning_rate": 2.285144589212316e-05,
"loss": 0.3079,
"step": 1242
},
{
"epoch": 2.5445240532241558,
"grad_norm": 0.3556195009937043,
"learning_rate": 2.2823123317476522e-05,
"loss": 0.3422,
"step": 1243
},
{
"epoch": 2.5465711361310133,
"grad_norm": 0.3474070822067162,
"learning_rate": 2.2794794964998705e-05,
"loss": 0.3049,
"step": 1244
},
{
"epoch": 2.548618219037871,
"grad_norm": 0.3464039443053579,
"learning_rate": 2.276646089266677e-05,
"loss": 0.2992,
"step": 1245
},
{
"epoch": 2.5506653019447287,
"grad_norm": 0.3767709540930306,
"learning_rate": 2.273812115846951e-05,
"loss": 0.3726,
"step": 1246
},
{
"epoch": 2.5527123848515867,
"grad_norm": 0.3206160362666913,
"learning_rate": 2.2709775820407292e-05,
"loss": 0.2909,
"step": 1247
},
{
"epoch": 2.554759467758444,
"grad_norm": 0.3673509781890066,
"learning_rate": 2.2681424936491954e-05,
"loss": 0.3669,
"step": 1248
},
{
"epoch": 2.5568065506653017,
"grad_norm": 0.3401943174394405,
"learning_rate": 2.2653068564746692e-05,
"loss": 0.3403,
"step": 1249
},
{
"epoch": 2.5588536335721597,
"grad_norm": 0.3490447254588359,
"learning_rate": 2.2624706763205935e-05,
"loss": 0.3603,
"step": 1250
},
{
"epoch": 2.5609007164790176,
"grad_norm": 0.3482076297056933,
"learning_rate": 2.2596339589915197e-05,
"loss": 0.3554,
"step": 1251
},
{
"epoch": 2.562947799385875,
"grad_norm": 0.3115817757213185,
"learning_rate": 2.2567967102931025e-05,
"loss": 0.3136,
"step": 1252
},
{
"epoch": 2.5649948822927326,
"grad_norm": 0.32599919557680007,
"learning_rate": 2.2539589360320802e-05,
"loss": 0.3256,
"step": 1253
},
{
"epoch": 2.5670419651995906,
"grad_norm": 0.3039164892771023,
"learning_rate": 2.2511206420162716e-05,
"loss": 0.3414,
"step": 1254
},
{
"epoch": 2.5690890481064486,
"grad_norm": 0.31157751749513196,
"learning_rate": 2.2482818340545534e-05,
"loss": 0.3102,
"step": 1255
},
{
"epoch": 2.571136131013306,
"grad_norm": 0.33852273483094864,
"learning_rate": 2.2454425179568594e-05,
"loss": 0.3434,
"step": 1256
},
{
"epoch": 2.5731832139201636,
"grad_norm": 0.30672315678878886,
"learning_rate": 2.2426026995341602e-05,
"loss": 0.318,
"step": 1257
},
{
"epoch": 2.5752302968270215,
"grad_norm": 0.31206788352544473,
"learning_rate": 2.2397623845984548e-05,
"loss": 0.3749,
"step": 1258
},
{
"epoch": 2.5772773797338795,
"grad_norm": 0.30841479866018223,
"learning_rate": 2.2369215789627593e-05,
"loss": 0.298,
"step": 1259
},
{
"epoch": 2.579324462640737,
"grad_norm": 0.32527208718455825,
"learning_rate": 2.234080288441095e-05,
"loss": 0.3419,
"step": 1260
},
{
"epoch": 2.5813715455475945,
"grad_norm": 0.3124189339174951,
"learning_rate": 2.2312385188484718e-05,
"loss": 0.3501,
"step": 1261
},
{
"epoch": 2.5834186284544525,
"grad_norm": 0.3352181407147001,
"learning_rate": 2.2283962760008845e-05,
"loss": 0.339,
"step": 1262
},
{
"epoch": 2.58546571136131,
"grad_norm": 0.3548208328356842,
"learning_rate": 2.225553565715294e-05,
"loss": 0.3334,
"step": 1263
},
{
"epoch": 2.587512794268168,
"grad_norm": 0.32210075431253954,
"learning_rate": 2.2227103938096176e-05,
"loss": 0.3139,
"step": 1264
},
{
"epoch": 2.5895598771750254,
"grad_norm": 0.3342742670935017,
"learning_rate": 2.2198667661027193e-05,
"loss": 0.3232,
"step": 1265
},
{
"epoch": 2.5916069600818834,
"grad_norm": 0.3332141930024874,
"learning_rate": 2.2170226884143942e-05,
"loss": 0.3089,
"step": 1266
},
{
"epoch": 2.593654042988741,
"grad_norm": 0.3259468299127148,
"learning_rate": 2.2141781665653584e-05,
"loss": 0.3069,
"step": 1267
},
{
"epoch": 2.595701125895599,
"grad_norm": 0.3756435234426335,
"learning_rate": 2.2113332063772387e-05,
"loss": 0.3343,
"step": 1268
},
{
"epoch": 2.5977482088024564,
"grad_norm": 0.38118415339444334,
"learning_rate": 2.208487813672557e-05,
"loss": 0.3246,
"step": 1269
},
{
"epoch": 2.5997952917093143,
"grad_norm": 0.33475197046166133,
"learning_rate": 2.205641994274721e-05,
"loss": 0.3543,
"step": 1270
},
{
"epoch": 2.601842374616172,
"grad_norm": 0.3609114565434103,
"learning_rate": 2.2027957540080125e-05,
"loss": 0.3412,
"step": 1271
},
{
"epoch": 2.60388945752303,
"grad_norm": 0.35268666756991185,
"learning_rate": 2.199949098697574e-05,
"loss": 0.321,
"step": 1272
},
{
"epoch": 2.6059365404298873,
"grad_norm": 0.3218634099552252,
"learning_rate": 2.1971020341693973e-05,
"loss": 0.321,
"step": 1273
},
{
"epoch": 2.6079836233367453,
"grad_norm": 0.3886771490605891,
"learning_rate": 2.1942545662503115e-05,
"loss": 0.3366,
"step": 1274
},
{
"epoch": 2.610030706243603,
"grad_norm": 0.3542254238953694,
"learning_rate": 2.1914067007679733e-05,
"loss": 0.346,
"step": 1275
},
{
"epoch": 2.6120777891504607,
"grad_norm": 0.33442059420132036,
"learning_rate": 2.188558443550849e-05,
"loss": 0.3471,
"step": 1276
},
{
"epoch": 2.6141248720573182,
"grad_norm": 0.31543081486502833,
"learning_rate": 2.185709800428211e-05,
"loss": 0.3523,
"step": 1277
},
{
"epoch": 2.616171954964176,
"grad_norm": 0.32475295410492505,
"learning_rate": 2.1828607772301187e-05,
"loss": 0.3456,
"step": 1278
},
{
"epoch": 2.6182190378710337,
"grad_norm": 0.35139374027023634,
"learning_rate": 2.180011379787411e-05,
"loss": 0.3309,
"step": 1279
},
{
"epoch": 2.6202661207778917,
"grad_norm": 0.3057519944170325,
"learning_rate": 2.1771616139316903e-05,
"loss": 0.3351,
"step": 1280
},
{
"epoch": 2.622313203684749,
"grad_norm": 0.363923325870162,
"learning_rate": 2.174311485495317e-05,
"loss": 0.3046,
"step": 1281
},
{
"epoch": 2.6243602865916067,
"grad_norm": 0.3291114320223632,
"learning_rate": 2.1714610003113887e-05,
"loss": 0.303,
"step": 1282
},
{
"epoch": 2.6264073694984647,
"grad_norm": 0.3289632021661567,
"learning_rate": 2.168610164213738e-05,
"loss": 0.3213,
"step": 1283
},
{
"epoch": 2.6284544524053226,
"grad_norm": 0.33306777417969263,
"learning_rate": 2.1657589830369113e-05,
"loss": 0.351,
"step": 1284
},
{
"epoch": 2.63050153531218,
"grad_norm": 0.3612382108372884,
"learning_rate": 2.1629074626161647e-05,
"loss": 0.3868,
"step": 1285
},
{
"epoch": 2.6325486182190376,
"grad_norm": 0.3580608267992191,
"learning_rate": 2.1600556087874472e-05,
"loss": 0.3175,
"step": 1286
},
{
"epoch": 2.6345957011258956,
"grad_norm": 0.31536303959925943,
"learning_rate": 2.1572034273873893e-05,
"loss": 0.3262,
"step": 1287
},
{
"epoch": 2.6366427840327535,
"grad_norm": 0.3578677663211092,
"learning_rate": 2.1543509242532932e-05,
"loss": 0.3716,
"step": 1288
},
{
"epoch": 2.638689866939611,
"grad_norm": 0.31607653936815944,
"learning_rate": 2.1514981052231187e-05,
"loss": 0.3166,
"step": 1289
},
{
"epoch": 2.6407369498464686,
"grad_norm": 0.35206333188454375,
"learning_rate": 2.1486449761354727e-05,
"loss": 0.3315,
"step": 1290
},
{
"epoch": 2.6427840327533265,
"grad_norm": 0.34365776192029646,
"learning_rate": 2.145791542829597e-05,
"loss": 0.3225,
"step": 1291
},
{
"epoch": 2.6448311156601845,
"grad_norm": 0.40791136566579844,
"learning_rate": 2.142937811145354e-05,
"loss": 0.3839,
"step": 1292
},
{
"epoch": 2.646878198567042,
"grad_norm": 0.3334669459335626,
"learning_rate": 2.140083786923221e-05,
"loss": 0.3277,
"step": 1293
},
{
"epoch": 2.6489252814738995,
"grad_norm": 0.33721029481105136,
"learning_rate": 2.1372294760042686e-05,
"loss": 0.3396,
"step": 1294
},
{
"epoch": 2.6509723643807575,
"grad_norm": 0.33022887238565724,
"learning_rate": 2.1343748842301575e-05,
"loss": 0.3199,
"step": 1295
},
{
"epoch": 2.6530194472876154,
"grad_norm": 0.3578342406147216,
"learning_rate": 2.1315200174431235e-05,
"loss": 0.3264,
"step": 1296
},
{
"epoch": 2.655066530194473,
"grad_norm": 0.2843605578164525,
"learning_rate": 2.1286648814859636e-05,
"loss": 0.3196,
"step": 1297
},
{
"epoch": 2.6571136131013304,
"grad_norm": 0.3320141991189249,
"learning_rate": 2.1258094822020263e-05,
"loss": 0.3132,
"step": 1298
},
{
"epoch": 2.6591606960081884,
"grad_norm": 0.37866233560469814,
"learning_rate": 2.1229538254351995e-05,
"loss": 0.3238,
"step": 1299
},
{
"epoch": 2.661207778915046,
"grad_norm": 0.327131535806631,
"learning_rate": 2.120097917029897e-05,
"loss": 0.3843,
"step": 1300
},
{
"epoch": 2.663254861821904,
"grad_norm": 0.3036315122393342,
"learning_rate": 2.1172417628310487e-05,
"loss": 0.3292,
"step": 1301
},
{
"epoch": 2.6653019447287614,
"grad_norm": 0.3515024908888374,
"learning_rate": 2.1143853686840874e-05,
"loss": 0.3102,
"step": 1302
},
{
"epoch": 2.6673490276356193,
"grad_norm": 0.34581222342499085,
"learning_rate": 2.1115287404349357e-05,
"loss": 0.3156,
"step": 1303
},
{
"epoch": 2.669396110542477,
"grad_norm": 0.35858989267221897,
"learning_rate": 2.1086718839299972e-05,
"loss": 0.3461,
"step": 1304
},
{
"epoch": 2.671443193449335,
"grad_norm": 0.3475194620828264,
"learning_rate": 2.1058148050161412e-05,
"loss": 0.3357,
"step": 1305
},
{
"epoch": 2.6734902763561923,
"grad_norm": 0.3585594367622285,
"learning_rate": 2.1029575095406933e-05,
"loss": 0.3454,
"step": 1306
},
{
"epoch": 2.6755373592630503,
"grad_norm": 0.3384161065196781,
"learning_rate": 2.1001000033514215e-05,
"loss": 0.3403,
"step": 1307
},
{
"epoch": 2.6775844421699078,
"grad_norm": 0.33004176854360945,
"learning_rate": 2.097242292296525e-05,
"loss": 0.3643,
"step": 1308
},
{
"epoch": 2.6796315250767657,
"grad_norm": 0.33026393179643815,
"learning_rate": 2.0943843822246234e-05,
"loss": 0.3224,
"step": 1309
},
{
"epoch": 2.6816786079836232,
"grad_norm": 0.34661171805934476,
"learning_rate": 2.0915262789847414e-05,
"loss": 0.3368,
"step": 1310
},
{
"epoch": 2.683725690890481,
"grad_norm": 0.3157977388993276,
"learning_rate": 2.088667988426302e-05,
"loss": 0.311,
"step": 1311
},
{
"epoch": 2.6857727737973387,
"grad_norm": 0.32541710739849666,
"learning_rate": 2.0858095163991094e-05,
"loss": 0.3145,
"step": 1312
},
{
"epoch": 2.6878198567041967,
"grad_norm": 0.3317095591959358,
"learning_rate": 2.0829508687533387e-05,
"loss": 0.343,
"step": 1313
},
{
"epoch": 2.689866939611054,
"grad_norm": 0.33508091050613437,
"learning_rate": 2.0800920513395276e-05,
"loss": 0.3102,
"step": 1314
},
{
"epoch": 2.691914022517912,
"grad_norm": 0.3092939306017529,
"learning_rate": 2.077233070008557e-05,
"loss": 0.3628,
"step": 1315
},
{
"epoch": 2.6939611054247696,
"grad_norm": 0.3644518282154973,
"learning_rate": 2.074373930611647e-05,
"loss": 0.3611,
"step": 1316
},
{
"epoch": 2.6960081883316276,
"grad_norm": 0.3268481968948613,
"learning_rate": 2.0715146390003395e-05,
"loss": 0.4382,
"step": 1317
},
{
"epoch": 2.698055271238485,
"grad_norm": 0.32366874488936354,
"learning_rate": 2.0686552010264872e-05,
"loss": 0.3316,
"step": 1318
},
{
"epoch": 2.7001023541453426,
"grad_norm": 0.3655397054326707,
"learning_rate": 2.0657956225422438e-05,
"loss": 0.3241,
"step": 1319
},
{
"epoch": 2.7021494370522006,
"grad_norm": 0.3511986539192374,
"learning_rate": 2.0629359094000502e-05,
"loss": 0.3634,
"step": 1320
},
{
"epoch": 2.7041965199590585,
"grad_norm": 0.33590806443682913,
"learning_rate": 2.060076067452622e-05,
"loss": 0.3434,
"step": 1321
},
{
"epoch": 2.706243602865916,
"grad_norm": 0.34940605685458204,
"learning_rate": 2.0572161025529396e-05,
"loss": 0.3592,
"step": 1322
},
{
"epoch": 2.7082906857727735,
"grad_norm": 0.33025728634487234,
"learning_rate": 2.0543560205542338e-05,
"loss": 0.3273,
"step": 1323
},
{
"epoch": 2.7103377686796315,
"grad_norm": 0.3111565817907451,
"learning_rate": 2.0514958273099778e-05,
"loss": 0.3528,
"step": 1324
},
{
"epoch": 2.7123848515864895,
"grad_norm": 0.34384380691800237,
"learning_rate": 2.0486355286738675e-05,
"loss": 0.3279,
"step": 1325
},
{
"epoch": 2.714431934493347,
"grad_norm": 0.34483761426361903,
"learning_rate": 2.0457751304998196e-05,
"loss": 0.3154,
"step": 1326
},
{
"epoch": 2.7164790174002045,
"grad_norm": 0.34739319472868174,
"learning_rate": 2.042914638641952e-05,
"loss": 0.3122,
"step": 1327
},
{
"epoch": 2.7185261003070624,
"grad_norm": 0.34204787000879766,
"learning_rate": 2.0400540589545738e-05,
"loss": 0.2987,
"step": 1328
},
{
"epoch": 2.7205731832139204,
"grad_norm": 0.31149494456320415,
"learning_rate": 2.0371933972921756e-05,
"loss": 0.3651,
"step": 1329
},
{
"epoch": 2.722620266120778,
"grad_norm": 0.3206059172994117,
"learning_rate": 2.0343326595094154e-05,
"loss": 0.3056,
"step": 1330
},
{
"epoch": 2.7246673490276354,
"grad_norm": 0.3460394891552501,
"learning_rate": 2.031471851461105e-05,
"loss": 0.3078,
"step": 1331
},
{
"epoch": 2.7267144319344934,
"grad_norm": 0.31434160088392427,
"learning_rate": 2.0286109790022023e-05,
"loss": 0.3019,
"step": 1332
},
{
"epoch": 2.7287615148413513,
"grad_norm": 0.30988899965990013,
"learning_rate": 2.0257500479877965e-05,
"loss": 0.3606,
"step": 1333
},
{
"epoch": 2.730808597748209,
"grad_norm": 0.3038902310146715,
"learning_rate": 2.0228890642730967e-05,
"loss": 0.3188,
"step": 1334
},
{
"epoch": 2.7328556806550663,
"grad_norm": 0.33615481049383383,
"learning_rate": 2.020028033713418e-05,
"loss": 0.3233,
"step": 1335
},
{
"epoch": 2.7349027635619243,
"grad_norm": 0.3416566631514737,
"learning_rate": 2.0171669621641743e-05,
"loss": 0.3563,
"step": 1336
},
{
"epoch": 2.7369498464687823,
"grad_norm": 0.33625693354138464,
"learning_rate": 2.0143058554808622e-05,
"loss": 0.3107,
"step": 1337
},
{
"epoch": 2.7389969293756398,
"grad_norm": 0.32608262090854195,
"learning_rate": 2.0114447195190486e-05,
"loss": 0.3445,
"step": 1338
},
{
"epoch": 2.7410440122824973,
"grad_norm": 0.3157455564515132,
"learning_rate": 2.0085835601343627e-05,
"loss": 0.3426,
"step": 1339
},
{
"epoch": 2.7430910951893552,
"grad_norm": 0.3108979356348658,
"learning_rate": 2.005722383182481e-05,
"loss": 0.3216,
"step": 1340
},
{
"epoch": 2.7451381780962127,
"grad_norm": 0.31525583618025826,
"learning_rate": 2.002861194519114e-05,
"loss": 0.3888,
"step": 1341
},
{
"epoch": 2.7471852610030707,
"grad_norm": 0.35286260637825495,
"learning_rate": 2e-05,
"loss": 0.3448,
"step": 1342
},
{
"epoch": 2.749232343909928,
"grad_norm": 0.33612012963227933,
"learning_rate": 1.9971388054808863e-05,
"loss": 0.3303,
"step": 1343
},
{
"epoch": 2.751279426816786,
"grad_norm": 0.3293365486688113,
"learning_rate": 1.99427761681752e-05,
"loss": 0.3378,
"step": 1344
},
{
"epoch": 2.7533265097236437,
"grad_norm": 0.3702026149273537,
"learning_rate": 1.9914164398656383e-05,
"loss": 0.3204,
"step": 1345
},
{
"epoch": 2.7553735926305016,
"grad_norm": 0.3008607535549162,
"learning_rate": 1.988555280480952e-05,
"loss": 0.3245,
"step": 1346
},
{
"epoch": 2.757420675537359,
"grad_norm": 0.3346226482861992,
"learning_rate": 1.9856941445191388e-05,
"loss": 0.2973,
"step": 1347
},
{
"epoch": 2.759467758444217,
"grad_norm": 0.333107377703825,
"learning_rate": 1.9828330378358264e-05,
"loss": 0.3462,
"step": 1348
},
{
"epoch": 2.7615148413510746,
"grad_norm": 0.33602464320692405,
"learning_rate": 1.9799719662865828e-05,
"loss": 0.3348,
"step": 1349
},
{
"epoch": 2.7635619242579326,
"grad_norm": 0.3460744761311549,
"learning_rate": 1.9771109357269047e-05,
"loss": 0.3041,
"step": 1350
},
{
"epoch": 2.76560900716479,
"grad_norm": 0.3398214327127035,
"learning_rate": 1.974249952012204e-05,
"loss": 0.3442,
"step": 1351
},
{
"epoch": 2.767656090071648,
"grad_norm": 0.32323094867372437,
"learning_rate": 1.9713890209977977e-05,
"loss": 0.3133,
"step": 1352
},
{
"epoch": 2.7697031729785055,
"grad_norm": 0.3439688117561741,
"learning_rate": 1.9685281485388955e-05,
"loss": 0.3091,
"step": 1353
},
{
"epoch": 2.7717502558853635,
"grad_norm": 0.29746767666002316,
"learning_rate": 1.9656673404905852e-05,
"loss": 0.2957,
"step": 1354
},
{
"epoch": 2.773797338792221,
"grad_norm": 0.32765073618687846,
"learning_rate": 1.9628066027078247e-05,
"loss": 0.3413,
"step": 1355
},
{
"epoch": 2.775844421699079,
"grad_norm": 0.3473220628635939,
"learning_rate": 1.9599459410454266e-05,
"loss": 0.2997,
"step": 1356
},
{
"epoch": 2.7778915046059365,
"grad_norm": 0.34784120186994494,
"learning_rate": 1.957085361358049e-05,
"loss": 0.3397,
"step": 1357
},
{
"epoch": 2.7799385875127944,
"grad_norm": 0.32647960906928786,
"learning_rate": 1.9542248695001808e-05,
"loss": 0.3269,
"step": 1358
},
{
"epoch": 2.781985670419652,
"grad_norm": 0.3386091012629272,
"learning_rate": 1.9513644713261328e-05,
"loss": 0.3398,
"step": 1359
},
{
"epoch": 2.7840327533265095,
"grad_norm": 0.31112512902574707,
"learning_rate": 1.9485041726900232e-05,
"loss": 0.3247,
"step": 1360
},
{
"epoch": 2.7860798362333674,
"grad_norm": 0.3267971495507797,
"learning_rate": 1.9456439794457665e-05,
"loss": 0.3699,
"step": 1361
},
{
"epoch": 2.7881269191402254,
"grad_norm": 0.34461638842492454,
"learning_rate": 1.942783897447061e-05,
"loss": 0.3315,
"step": 1362
},
{
"epoch": 2.790174002047083,
"grad_norm": 0.30853412340590924,
"learning_rate": 1.939923932547379e-05,
"loss": 0.3325,
"step": 1363
},
{
"epoch": 2.7922210849539404,
"grad_norm": 0.32960376681405235,
"learning_rate": 1.93706409059995e-05,
"loss": 0.3048,
"step": 1364
},
{
"epoch": 2.7942681678607983,
"grad_norm": 0.3216608733849923,
"learning_rate": 1.9342043774577562e-05,
"loss": 0.3478,
"step": 1365
},
{
"epoch": 2.7963152507676563,
"grad_norm": 0.369356903841249,
"learning_rate": 1.931344798973513e-05,
"loss": 0.3428,
"step": 1366
},
{
"epoch": 2.798362333674514,
"grad_norm": 0.3021279849899095,
"learning_rate": 1.928485360999661e-05,
"loss": 0.3641,
"step": 1367
},
{
"epoch": 2.8004094165813713,
"grad_norm": 0.3573666945556479,
"learning_rate": 1.9256260693883534e-05,
"loss": 0.2993,
"step": 1368
},
{
"epoch": 2.8024564994882293,
"grad_norm": 0.31061810018877584,
"learning_rate": 1.922766929991443e-05,
"loss": 0.3298,
"step": 1369
},
{
"epoch": 2.8045035823950872,
"grad_norm": 0.32086151227350046,
"learning_rate": 1.9199079486604727e-05,
"loss": 0.293,
"step": 1370
},
{
"epoch": 2.8065506653019447,
"grad_norm": 0.3339285911403012,
"learning_rate": 1.9170491312466616e-05,
"loss": 0.3239,
"step": 1371
},
{
"epoch": 2.8085977482088023,
"grad_norm": 0.3273867205248011,
"learning_rate": 1.914190483600891e-05,
"loss": 0.3502,
"step": 1372
},
{
"epoch": 2.81064483111566,
"grad_norm": 0.3707945367816578,
"learning_rate": 1.9113320115736986e-05,
"loss": 0.3357,
"step": 1373
},
{
"epoch": 2.812691914022518,
"grad_norm": 0.34848249764459066,
"learning_rate": 1.9084737210152593e-05,
"loss": 0.3185,
"step": 1374
},
{
"epoch": 2.8147389969293757,
"grad_norm": 0.32760861423269866,
"learning_rate": 1.9056156177753776e-05,
"loss": 0.3228,
"step": 1375
},
{
"epoch": 2.816786079836233,
"grad_norm": 0.3391917065894771,
"learning_rate": 1.902757707703475e-05,
"loss": 0.3475,
"step": 1376
},
{
"epoch": 2.818833162743091,
"grad_norm": 0.3257314300037398,
"learning_rate": 1.899899996648579e-05,
"loss": 0.3325,
"step": 1377
},
{
"epoch": 2.8208802456499487,
"grad_norm": 0.3453324225261829,
"learning_rate": 1.897042490459307e-05,
"loss": 0.3301,
"step": 1378
},
{
"epoch": 2.8229273285568066,
"grad_norm": 0.35448623951366837,
"learning_rate": 1.8941851949838595e-05,
"loss": 0.3261,
"step": 1379
},
{
"epoch": 2.824974411463664,
"grad_norm": 0.322963697569429,
"learning_rate": 1.8913281160700038e-05,
"loss": 0.3602,
"step": 1380
},
{
"epoch": 2.827021494370522,
"grad_norm": 0.36959087755198944,
"learning_rate": 1.8884712595650653e-05,
"loss": 0.3173,
"step": 1381
},
{
"epoch": 2.8290685772773796,
"grad_norm": 0.3404440796977021,
"learning_rate": 1.885614631315914e-05,
"loss": 0.3549,
"step": 1382
},
{
"epoch": 2.8311156601842375,
"grad_norm": 0.31042921976093146,
"learning_rate": 1.8827582371689516e-05,
"loss": 0.3202,
"step": 1383
},
{
"epoch": 2.833162743091095,
"grad_norm": 0.3733330139685621,
"learning_rate": 1.8799020829701036e-05,
"loss": 0.3704,
"step": 1384
},
{
"epoch": 2.835209825997953,
"grad_norm": 0.35311166747172257,
"learning_rate": 1.8770461745648012e-05,
"loss": 0.3159,
"step": 1385
},
{
"epoch": 2.8372569089048105,
"grad_norm": 0.30009262162818595,
"learning_rate": 1.8741905177979743e-05,
"loss": 0.347,
"step": 1386
},
{
"epoch": 2.8393039918116685,
"grad_norm": 0.3431225449790101,
"learning_rate": 1.871335118514037e-05,
"loss": 0.3259,
"step": 1387
},
{
"epoch": 2.841351074718526,
"grad_norm": 0.3379879693204325,
"learning_rate": 1.8684799825568775e-05,
"loss": 0.3354,
"step": 1388
},
{
"epoch": 2.843398157625384,
"grad_norm": 0.32284443139667096,
"learning_rate": 1.8656251157698425e-05,
"loss": 0.3332,
"step": 1389
},
{
"epoch": 2.8454452405322415,
"grad_norm": 0.3366299858528211,
"learning_rate": 1.862770523995732e-05,
"loss": 0.3379,
"step": 1390
},
{
"epoch": 2.8474923234390994,
"grad_norm": 0.3306626186125099,
"learning_rate": 1.85991621307678e-05,
"loss": 0.3304,
"step": 1391
},
{
"epoch": 2.849539406345957,
"grad_norm": 0.3154237186664138,
"learning_rate": 1.8570621888546464e-05,
"loss": 0.3162,
"step": 1392
},
{
"epoch": 2.851586489252815,
"grad_norm": 0.3070396971440444,
"learning_rate": 1.854208457170404e-05,
"loss": 0.3341,
"step": 1393
},
{
"epoch": 2.8536335721596724,
"grad_norm": 0.31391792776294475,
"learning_rate": 1.8513550238645283e-05,
"loss": 0.3385,
"step": 1394
},
{
"epoch": 2.8556806550665303,
"grad_norm": 0.3604543553956966,
"learning_rate": 1.8485018947768817e-05,
"loss": 0.3242,
"step": 1395
},
{
"epoch": 2.857727737973388,
"grad_norm": 0.33119690178793554,
"learning_rate": 1.8456490757467075e-05,
"loss": 0.3172,
"step": 1396
},
{
"epoch": 2.859774820880246,
"grad_norm": 0.3634627791673887,
"learning_rate": 1.8427965726126114e-05,
"loss": 0.3256,
"step": 1397
},
{
"epoch": 2.8618219037871033,
"grad_norm": 0.30347193226396807,
"learning_rate": 1.839944391212553e-05,
"loss": 0.3375,
"step": 1398
},
{
"epoch": 2.8638689866939613,
"grad_norm": 0.3273120420881456,
"learning_rate": 1.8370925373838356e-05,
"loss": 0.3388,
"step": 1399
},
{
"epoch": 2.865916069600819,
"grad_norm": 0.31525363609472923,
"learning_rate": 1.834241016963089e-05,
"loss": 0.361,
"step": 1400
},
{
"epoch": 2.8679631525076763,
"grad_norm": 0.32941174535440465,
"learning_rate": 1.8313898357862623e-05,
"loss": 0.3292,
"step": 1401
},
{
"epoch": 2.8700102354145343,
"grad_norm": 0.33198094596750977,
"learning_rate": 1.8285389996886113e-05,
"loss": 0.3239,
"step": 1402
},
{
"epoch": 2.872057318321392,
"grad_norm": 0.31365437902540855,
"learning_rate": 1.8256885145046837e-05,
"loss": 0.3442,
"step": 1403
},
{
"epoch": 2.8741044012282497,
"grad_norm": 0.30554897801238856,
"learning_rate": 1.82283838606831e-05,
"loss": 0.3387,
"step": 1404
},
{
"epoch": 2.8761514841351072,
"grad_norm": 0.32230864173441504,
"learning_rate": 1.8199886202125897e-05,
"loss": 0.364,
"step": 1405
},
{
"epoch": 2.878198567041965,
"grad_norm": 0.3283203936597496,
"learning_rate": 1.817139222769882e-05,
"loss": 0.3135,
"step": 1406
},
{
"epoch": 2.880245649948823,
"grad_norm": 0.3352622595322451,
"learning_rate": 1.8142901995717894e-05,
"loss": 0.3293,
"step": 1407
},
{
"epoch": 2.8822927328556807,
"grad_norm": 0.3199512613271915,
"learning_rate": 1.8114415564491513e-05,
"loss": 0.3382,
"step": 1408
},
{
"epoch": 2.884339815762538,
"grad_norm": 0.3243396061475919,
"learning_rate": 1.8085932992320273e-05,
"loss": 0.3346,
"step": 1409
},
{
"epoch": 2.886386898669396,
"grad_norm": 0.3177758192621551,
"learning_rate": 1.805745433749689e-05,
"loss": 0.3193,
"step": 1410
},
{
"epoch": 2.888433981576254,
"grad_norm": 0.36128950608945015,
"learning_rate": 1.8028979658306033e-05,
"loss": 0.3352,
"step": 1411
},
{
"epoch": 2.8904810644831116,
"grad_norm": 0.34067398643062763,
"learning_rate": 1.8000509013024266e-05,
"loss": 0.3704,
"step": 1412
},
{
"epoch": 2.892528147389969,
"grad_norm": 0.36758322105150537,
"learning_rate": 1.7972042459919878e-05,
"loss": 0.3548,
"step": 1413
},
{
"epoch": 2.894575230296827,
"grad_norm": 0.3446360571011566,
"learning_rate": 1.794358005725279e-05,
"loss": 0.3379,
"step": 1414
},
{
"epoch": 2.896622313203685,
"grad_norm": 0.33967697044665596,
"learning_rate": 1.791512186327444e-05,
"loss": 0.3685,
"step": 1415
},
{
"epoch": 2.8986693961105425,
"grad_norm": 0.32938434012320356,
"learning_rate": 1.7886667936227616e-05,
"loss": 0.3224,
"step": 1416
},
{
"epoch": 2.9007164790174,
"grad_norm": 0.3492413938668141,
"learning_rate": 1.785821833434642e-05,
"loss": 0.3234,
"step": 1417
},
{
"epoch": 2.902763561924258,
"grad_norm": 0.3556331967459014,
"learning_rate": 1.7829773115856065e-05,
"loss": 0.305,
"step": 1418
},
{
"epoch": 2.9048106448311155,
"grad_norm": 0.3322076227286167,
"learning_rate": 1.7801332338972813e-05,
"loss": 0.3463,
"step": 1419
},
{
"epoch": 2.9068577277379735,
"grad_norm": 0.44401272792536567,
"learning_rate": 1.7772896061903824e-05,
"loss": 0.3441,
"step": 1420
},
{
"epoch": 2.908904810644831,
"grad_norm": 0.3346724756218441,
"learning_rate": 1.7744464342847062e-05,
"loss": 0.3243,
"step": 1421
},
{
"epoch": 2.910951893551689,
"grad_norm": 0.37288189845084224,
"learning_rate": 1.771603723999116e-05,
"loss": 0.328,
"step": 1422
},
{
"epoch": 2.9129989764585464,
"grad_norm": 0.3543864660540506,
"learning_rate": 1.768761481151529e-05,
"loss": 0.3239,
"step": 1423
},
{
"epoch": 2.9150460593654044,
"grad_norm": 0.33338686726894085,
"learning_rate": 1.765919711558906e-05,
"loss": 0.3373,
"step": 1424
},
{
"epoch": 2.917093142272262,
"grad_norm": 0.32542482524106037,
"learning_rate": 1.7630784210372413e-05,
"loss": 0.3473,
"step": 1425
},
{
"epoch": 2.91914022517912,
"grad_norm": 0.333470225286717,
"learning_rate": 1.7602376154015456e-05,
"loss": 0.3285,
"step": 1426
},
{
"epoch": 2.9211873080859774,
"grad_norm": 0.33573772915244177,
"learning_rate": 1.7573973004658404e-05,
"loss": 0.3024,
"step": 1427
},
{
"epoch": 2.9232343909928353,
"grad_norm": 0.34259215508028346,
"learning_rate": 1.7545574820431412e-05,
"loss": 0.3315,
"step": 1428
},
{
"epoch": 2.925281473899693,
"grad_norm": 0.33102210911596164,
"learning_rate": 1.751718165945447e-05,
"loss": 0.3519,
"step": 1429
},
{
"epoch": 2.927328556806551,
"grad_norm": 0.32436438467262657,
"learning_rate": 1.7488793579837297e-05,
"loss": 0.3661,
"step": 1430
},
{
"epoch": 2.9293756397134083,
"grad_norm": 0.31331782808616476,
"learning_rate": 1.74604106396792e-05,
"loss": 0.3141,
"step": 1431
},
{
"epoch": 2.9314227226202663,
"grad_norm": 0.4141533412404601,
"learning_rate": 1.743203289706898e-05,
"loss": 0.3557,
"step": 1432
},
{
"epoch": 2.9334698055271238,
"grad_norm": 0.4622088658579696,
"learning_rate": 1.7403660410084806e-05,
"loss": 0.399,
"step": 1433
},
{
"epoch": 2.9355168884339817,
"grad_norm": 0.3236744034505386,
"learning_rate": 1.737529323679407e-05,
"loss": 0.3286,
"step": 1434
},
{
"epoch": 2.9375639713408392,
"grad_norm": 0.2993041210404907,
"learning_rate": 1.734693143525331e-05,
"loss": 0.3259,
"step": 1435
},
{
"epoch": 2.939611054247697,
"grad_norm": 0.31919342750079005,
"learning_rate": 1.731857506350805e-05,
"loss": 0.3438,
"step": 1436
},
{
"epoch": 2.9416581371545547,
"grad_norm": 0.3479704658589382,
"learning_rate": 1.7290224179592718e-05,
"loss": 0.3561,
"step": 1437
},
{
"epoch": 2.943705220061412,
"grad_norm": 0.32794236732374565,
"learning_rate": 1.7261878841530494e-05,
"loss": 0.2956,
"step": 1438
},
{
"epoch": 2.94575230296827,
"grad_norm": 0.3348567062695728,
"learning_rate": 1.7233539107333234e-05,
"loss": 0.3207,
"step": 1439
},
{
"epoch": 2.947799385875128,
"grad_norm": 0.3107019732230144,
"learning_rate": 1.72052050350013e-05,
"loss": 0.3324,
"step": 1440
},
{
"epoch": 2.9498464687819856,
"grad_norm": 0.334794121032537,
"learning_rate": 1.717687668252348e-05,
"loss": 0.3296,
"step": 1441
},
{
"epoch": 2.951893551688843,
"grad_norm": 0.34219772695899153,
"learning_rate": 1.7148554107876847e-05,
"loss": 0.3504,
"step": 1442
},
{
"epoch": 2.953940634595701,
"grad_norm": 0.33351281779847475,
"learning_rate": 1.7120237369026655e-05,
"loss": 0.378,
"step": 1443
},
{
"epoch": 2.955987717502559,
"grad_norm": 0.3475392701501788,
"learning_rate": 1.7091926523926205e-05,
"loss": 0.3437,
"step": 1444
},
{
"epoch": 2.9580348004094166,
"grad_norm": 0.33100597785259966,
"learning_rate": 1.7063621630516755e-05,
"loss": 0.3289,
"step": 1445
},
{
"epoch": 2.960081883316274,
"grad_norm": 0.3615110501855717,
"learning_rate": 1.7035322746727366e-05,
"loss": 0.3148,
"step": 1446
},
{
"epoch": 2.962128966223132,
"grad_norm": 0.31694424871677895,
"learning_rate": 1.7007029930474804e-05,
"loss": 0.3389,
"step": 1447
},
{
"epoch": 2.96417604912999,
"grad_norm": 0.3428230537054886,
"learning_rate": 1.697874323966342e-05,
"loss": 0.3286,
"step": 1448
},
{
"epoch": 2.9662231320368475,
"grad_norm": 0.33741669479723163,
"learning_rate": 1.6950462732185023e-05,
"loss": 0.3197,
"step": 1449
},
{
"epoch": 2.968270214943705,
"grad_norm": 0.3203471822470365,
"learning_rate": 1.6922188465918763e-05,
"loss": 0.3297,
"step": 1450
},
{
"epoch": 2.970317297850563,
"grad_norm": 0.34824872554820474,
"learning_rate": 1.689392049873104e-05,
"loss": 0.3577,
"step": 1451
},
{
"epoch": 2.972364380757421,
"grad_norm": 0.33484691097376085,
"learning_rate": 1.6865658888475334e-05,
"loss": 0.3252,
"step": 1452
},
{
"epoch": 2.9744114636642784,
"grad_norm": 0.3184369996466899,
"learning_rate": 1.6837403692992136e-05,
"loss": 0.3267,
"step": 1453
},
{
"epoch": 2.976458546571136,
"grad_norm": 0.34524241797219873,
"learning_rate": 1.680915497010879e-05,
"loss": 0.362,
"step": 1454
},
{
"epoch": 2.978505629477994,
"grad_norm": 0.3517522038568365,
"learning_rate": 1.6780912777639407e-05,
"loss": 0.3455,
"step": 1455
},
{
"epoch": 2.9805527123848514,
"grad_norm": 0.35084133720055133,
"learning_rate": 1.6752677173384734e-05,
"loss": 0.3476,
"step": 1456
},
{
"epoch": 2.9825997952917094,
"grad_norm": 0.33510673606401725,
"learning_rate": 1.6724448215132006e-05,
"loss": 0.373,
"step": 1457
},
{
"epoch": 2.984646878198567,
"grad_norm": 0.32173623731436524,
"learning_rate": 1.669622596065491e-05,
"loss": 0.3416,
"step": 1458
},
{
"epoch": 2.986693961105425,
"grad_norm": 0.3405913760888214,
"learning_rate": 1.6668010467713363e-05,
"loss": 0.3336,
"step": 1459
},
{
"epoch": 2.9887410440122824,
"grad_norm": 0.33253196052599165,
"learning_rate": 1.6639801794053468e-05,
"loss": 0.3281,
"step": 1460
},
{
"epoch": 2.9907881269191403,
"grad_norm": 0.3127274815254226,
"learning_rate": 1.6611599997407366e-05,
"loss": 0.3219,
"step": 1461
},
{
"epoch": 2.992835209825998,
"grad_norm": 0.35589359153787264,
"learning_rate": 1.658340513549312e-05,
"loss": 0.3253,
"step": 1462
},
{
"epoch": 2.9948822927328558,
"grad_norm": 0.33592684199714334,
"learning_rate": 1.6555217266014604e-05,
"loss": 0.3679,
"step": 1463
},
{
"epoch": 2.9969293756397133,
"grad_norm": 0.3293658658288109,
"learning_rate": 1.6527036446661396e-05,
"loss": 0.3107,
"step": 1464
},
{
"epoch": 2.9989764585465712,
"grad_norm": 0.32358581638845413,
"learning_rate": 1.649886273510861e-05,
"loss": 0.3537,
"step": 1465
},
{
"epoch": 3.0010235414534288,
"grad_norm": 0.6530375339161546,
"learning_rate": 1.6470696189016853e-05,
"loss": 0.4219,
"step": 1466
},
{
"epoch": 3.0030706243602867,
"grad_norm": 0.41349403660281325,
"learning_rate": 1.6442536866032027e-05,
"loss": 0.2489,
"step": 1467
},
{
"epoch": 3.0051177072671442,
"grad_norm": 0.49742564287488156,
"learning_rate": 1.641438482378529e-05,
"loss": 0.2308,
"step": 1468
},
{
"epoch": 3.007164790174002,
"grad_norm": 0.5603085497770336,
"learning_rate": 1.6386240119892867e-05,
"loss": 0.2283,
"step": 1469
},
{
"epoch": 3.0092118730808597,
"grad_norm": 0.37112551523365644,
"learning_rate": 1.6358102811955985e-05,
"loss": 0.2255,
"step": 1470
},
{
"epoch": 3.0112589559877176,
"grad_norm": 0.4964687759956586,
"learning_rate": 1.6329972957560736e-05,
"loss": 0.2499,
"step": 1471
},
{
"epoch": 3.013306038894575,
"grad_norm": 0.5159224089011558,
"learning_rate": 1.6301850614277936e-05,
"loss": 0.2899,
"step": 1472
},
{
"epoch": 3.015353121801433,
"grad_norm": 0.3865653836943384,
"learning_rate": 1.6273735839663044e-05,
"loss": 0.23,
"step": 1473
},
{
"epoch": 3.0174002047082906,
"grad_norm": 0.36109737314160684,
"learning_rate": 1.6245628691256032e-05,
"loss": 0.2087,
"step": 1474
},
{
"epoch": 3.0194472876151486,
"grad_norm": 0.44209392274842507,
"learning_rate": 1.6217529226581247e-05,
"loss": 0.2523,
"step": 1475
},
{
"epoch": 3.021494370522006,
"grad_norm": 0.4472352934034512,
"learning_rate": 1.6189437503147338e-05,
"loss": 0.284,
"step": 1476
},
{
"epoch": 3.023541453428864,
"grad_norm": 0.3324235418829143,
"learning_rate": 1.616135357844709e-05,
"loss": 0.203,
"step": 1477
},
{
"epoch": 3.0255885363357216,
"grad_norm": 0.36268455499664337,
"learning_rate": 1.613327750995732e-05,
"loss": 0.2223,
"step": 1478
},
{
"epoch": 3.0276356192425795,
"grad_norm": 0.38026408150755925,
"learning_rate": 1.61052093551388e-05,
"loss": 0.2122,
"step": 1479
},
{
"epoch": 3.029682702149437,
"grad_norm": 0.3766577760390122,
"learning_rate": 1.6077149171436063e-05,
"loss": 0.263,
"step": 1480
},
{
"epoch": 3.031729785056295,
"grad_norm": 0.36393521520091426,
"learning_rate": 1.6049097016277358e-05,
"loss": 0.2729,
"step": 1481
},
{
"epoch": 3.0337768679631525,
"grad_norm": 0.36192400255610996,
"learning_rate": 1.60210529470745e-05,
"loss": 0.2318,
"step": 1482
},
{
"epoch": 3.0358239508700104,
"grad_norm": 0.35713778733325363,
"learning_rate": 1.599301702122274e-05,
"loss": 0.2356,
"step": 1483
},
{
"epoch": 3.037871033776868,
"grad_norm": 0.3518563642429611,
"learning_rate": 1.5964989296100682e-05,
"loss": 0.2367,
"step": 1484
},
{
"epoch": 3.039918116683726,
"grad_norm": 0.40298228736514674,
"learning_rate": 1.5936969829070125e-05,
"loss": 0.3027,
"step": 1485
},
{
"epoch": 3.0419651995905834,
"grad_norm": 0.31807307761105635,
"learning_rate": 1.590895867747599e-05,
"loss": 0.2541,
"step": 1486
},
{
"epoch": 3.044012282497441,
"grad_norm": 0.3409445938787892,
"learning_rate": 1.588095589864615e-05,
"loss": 0.2289,
"step": 1487
},
{
"epoch": 3.046059365404299,
"grad_norm": 0.3600773888625388,
"learning_rate": 1.5852961549891376e-05,
"loss": 0.2341,
"step": 1488
},
{
"epoch": 3.0481064483111564,
"grad_norm": 0.32453736981408465,
"learning_rate": 1.582497568850517e-05,
"loss": 0.2797,
"step": 1489
},
{
"epoch": 3.0501535312180144,
"grad_norm": 0.3331181801930424,
"learning_rate": 1.579699837176366e-05,
"loss": 0.2036,
"step": 1490
},
{
"epoch": 3.052200614124872,
"grad_norm": 0.4136442196087828,
"learning_rate": 1.5769029656925486e-05,
"loss": 0.2599,
"step": 1491
},
{
"epoch": 3.05424769703173,
"grad_norm": 0.35863640915527445,
"learning_rate": 1.574106960123169e-05,
"loss": 0.2526,
"step": 1492
},
{
"epoch": 3.0562947799385873,
"grad_norm": 0.35564688451602927,
"learning_rate": 1.571311826190559e-05,
"loss": 0.2503,
"step": 1493
},
{
"epoch": 3.0583418628454453,
"grad_norm": 0.33302867243814094,
"learning_rate": 1.5685175696152657e-05,
"loss": 0.2283,
"step": 1494
},
{
"epoch": 3.060388945752303,
"grad_norm": 0.36946655881677004,
"learning_rate": 1.5657241961160434e-05,
"loss": 0.272,
"step": 1495
},
{
"epoch": 3.0624360286591608,
"grad_norm": 0.33870469772598516,
"learning_rate": 1.562931711409835e-05,
"loss": 0.2527,
"step": 1496
},
{
"epoch": 3.0644831115660183,
"grad_norm": 0.29057383945914955,
"learning_rate": 1.5601401212117676e-05,
"loss": 0.2468,
"step": 1497
},
{
"epoch": 3.0665301944728762,
"grad_norm": 0.3538607726440218,
"learning_rate": 1.557349431235135e-05,
"loss": 0.2527,
"step": 1498
},
{
"epoch": 3.0685772773797337,
"grad_norm": 0.36881305364134004,
"learning_rate": 1.554559647191392e-05,
"loss": 0.2572,
"step": 1499
},
{
"epoch": 3.0706243602865917,
"grad_norm": 0.31303360312121764,
"learning_rate": 1.5517707747901352e-05,
"loss": 0.2015,
"step": 1500
},
{
"epoch": 3.072671443193449,
"grad_norm": 0.3468523227469292,
"learning_rate": 1.5489828197390988e-05,
"loss": 0.2522,
"step": 1501
},
{
"epoch": 3.074718526100307,
"grad_norm": 0.3189025943422717,
"learning_rate": 1.5461957877441387e-05,
"loss": 0.2467,
"step": 1502
},
{
"epoch": 3.0767656090071647,
"grad_norm": 0.36693872460390387,
"learning_rate": 1.5434096845092203e-05,
"loss": 0.2266,
"step": 1503
},
{
"epoch": 3.0788126919140226,
"grad_norm": 0.33990261356941326,
"learning_rate": 1.5406245157364093e-05,
"loss": 0.2511,
"step": 1504
},
{
"epoch": 3.08085977482088,
"grad_norm": 0.3224675841478402,
"learning_rate": 1.537840287125859e-05,
"loss": 0.2013,
"step": 1505
},
{
"epoch": 3.082906857727738,
"grad_norm": 0.3392490287207665,
"learning_rate": 1.5350570043757976e-05,
"loss": 0.2222,
"step": 1506
},
{
"epoch": 3.0849539406345956,
"grad_norm": 0.33119892853448313,
"learning_rate": 1.5322746731825195e-05,
"loss": 0.2403,
"step": 1507
},
{
"epoch": 3.0870010235414536,
"grad_norm": 0.3189197704594123,
"learning_rate": 1.5294932992403695e-05,
"loss": 0.2156,
"step": 1508
},
{
"epoch": 3.089048106448311,
"grad_norm": 0.33202442707430363,
"learning_rate": 1.526712888241734e-05,
"loss": 0.2449,
"step": 1509
},
{
"epoch": 3.091095189355169,
"grad_norm": 0.3126714573879982,
"learning_rate": 1.5239334458770291e-05,
"loss": 0.2345,
"step": 1510
},
{
"epoch": 3.0931422722620265,
"grad_norm": 0.3199987863370429,
"learning_rate": 1.5211549778346882e-05,
"loss": 0.2547,
"step": 1511
},
{
"epoch": 3.0951893551688845,
"grad_norm": 0.2865206932540882,
"learning_rate": 1.5183774898011496e-05,
"loss": 0.2262,
"step": 1512
},
{
"epoch": 3.097236438075742,
"grad_norm": 0.2978122286180525,
"learning_rate": 1.5156009874608484e-05,
"loss": 0.2454,
"step": 1513
},
{
"epoch": 3.0992835209826,
"grad_norm": 0.33009627047245504,
"learning_rate": 1.5128254764962e-05,
"loss": 0.2762,
"step": 1514
},
{
"epoch": 3.1013306038894575,
"grad_norm": 0.328880261572203,
"learning_rate": 1.5100509625875921e-05,
"loss": 0.3284,
"step": 1515
},
{
"epoch": 3.1033776867963154,
"grad_norm": 0.31916303688606795,
"learning_rate": 1.5072774514133708e-05,
"loss": 0.2299,
"step": 1516
},
{
"epoch": 3.105424769703173,
"grad_norm": 0.3226893313357512,
"learning_rate": 1.5045049486498311e-05,
"loss": 0.2338,
"step": 1517
},
{
"epoch": 3.107471852610031,
"grad_norm": 0.31484899101790875,
"learning_rate": 1.5017334599712028e-05,
"loss": 0.2039,
"step": 1518
},
{
"epoch": 3.1095189355168884,
"grad_norm": 0.32998837828193894,
"learning_rate": 1.4989629910496424e-05,
"loss": 0.2345,
"step": 1519
},
{
"epoch": 3.1115660184237464,
"grad_norm": 0.3027257775351982,
"learning_rate": 1.4961935475552178e-05,
"loss": 0.2285,
"step": 1520
},
{
"epoch": 3.113613101330604,
"grad_norm": 0.3071518323366728,
"learning_rate": 1.4934251351558983e-05,
"loss": 0.2384,
"step": 1521
},
{
"epoch": 3.115660184237462,
"grad_norm": 0.30152633341709373,
"learning_rate": 1.4906577595175428e-05,
"loss": 0.2201,
"step": 1522
},
{
"epoch": 3.1177072671443193,
"grad_norm": 0.33663282454591203,
"learning_rate": 1.4878914263038895e-05,
"loss": 0.2324,
"step": 1523
},
{
"epoch": 3.119754350051177,
"grad_norm": 0.33606866012096387,
"learning_rate": 1.4851261411765414e-05,
"loss": 0.2629,
"step": 1524
},
{
"epoch": 3.121801432958035,
"grad_norm": 0.32073469626766654,
"learning_rate": 1.4823619097949584e-05,
"loss": 0.2309,
"step": 1525
},
{
"epoch": 3.1238485158648923,
"grad_norm": 0.34825982979835035,
"learning_rate": 1.4795987378164432e-05,
"loss": 0.2361,
"step": 1526
},
{
"epoch": 3.1258955987717503,
"grad_norm": 0.32245272558823557,
"learning_rate": 1.4768366308961288e-05,
"loss": 0.2281,
"step": 1527
},
{
"epoch": 3.127942681678608,
"grad_norm": 0.35652651564540333,
"learning_rate": 1.4740755946869708e-05,
"loss": 0.2508,
"step": 1528
},
{
"epoch": 3.1299897645854657,
"grad_norm": 0.3215049499878307,
"learning_rate": 1.4713156348397317e-05,
"loss": 0.2144,
"step": 1529
},
{
"epoch": 3.1320368474923233,
"grad_norm": 0.33346781365441597,
"learning_rate": 1.468556757002972e-05,
"loss": 0.2425,
"step": 1530
},
{
"epoch": 3.134083930399181,
"grad_norm": 0.35639968613552386,
"learning_rate": 1.4657989668230363e-05,
"loss": 0.2227,
"step": 1531
},
{
"epoch": 3.1361310133060387,
"grad_norm": 0.3010242077787357,
"learning_rate": 1.4630422699440461e-05,
"loss": 0.2742,
"step": 1532
},
{
"epoch": 3.1381780962128967,
"grad_norm": 0.36016875940221305,
"learning_rate": 1.4602866720078832e-05,
"loss": 0.2747,
"step": 1533
},
{
"epoch": 3.140225179119754,
"grad_norm": 0.35237386827329403,
"learning_rate": 1.4575321786541801e-05,
"loss": 0.2408,
"step": 1534
},
{
"epoch": 3.142272262026612,
"grad_norm": 0.319112209857982,
"learning_rate": 1.45477879552031e-05,
"loss": 0.2488,
"step": 1535
},
{
"epoch": 3.1443193449334697,
"grad_norm": 0.2967734684654477,
"learning_rate": 1.4520265282413722e-05,
"loss": 0.213,
"step": 1536
},
{
"epoch": 3.1463664278403276,
"grad_norm": 0.3524504353601547,
"learning_rate": 1.4492753824501833e-05,
"loss": 0.222,
"step": 1537
},
{
"epoch": 3.148413510747185,
"grad_norm": 0.33500821048449647,
"learning_rate": 1.4465253637772651e-05,
"loss": 0.2513,
"step": 1538
},
{
"epoch": 3.150460593654043,
"grad_norm": 0.34077100422476553,
"learning_rate": 1.443776477850833e-05,
"loss": 0.2701,
"step": 1539
},
{
"epoch": 3.1525076765609006,
"grad_norm": 0.34275143758140053,
"learning_rate": 1.4410287302967813e-05,
"loss": 0.3137,
"step": 1540
},
{
"epoch": 3.1545547594677585,
"grad_norm": 0.31464152797503897,
"learning_rate": 1.4382821267386781e-05,
"loss": 0.2329,
"step": 1541
},
{
"epoch": 3.156601842374616,
"grad_norm": 0.29997611145802033,
"learning_rate": 1.4355366727977473e-05,
"loss": 0.2355,
"step": 1542
},
{
"epoch": 3.158648925281474,
"grad_norm": 0.3098740828854044,
"learning_rate": 1.4327923740928613e-05,
"loss": 0.2364,
"step": 1543
},
{
"epoch": 3.1606960081883315,
"grad_norm": 0.3182081076670239,
"learning_rate": 1.4300492362405296e-05,
"loss": 0.2307,
"step": 1544
},
{
"epoch": 3.1627430910951895,
"grad_norm": 0.33289523643049107,
"learning_rate": 1.4273072648548827e-05,
"loss": 0.2658,
"step": 1545
},
{
"epoch": 3.164790174002047,
"grad_norm": 0.3483349343296588,
"learning_rate": 1.4245664655476663e-05,
"loss": 0.239,
"step": 1546
},
{
"epoch": 3.166837256908905,
"grad_norm": 0.3344228654090122,
"learning_rate": 1.4218268439282259e-05,
"loss": 0.2136,
"step": 1547
},
{
"epoch": 3.1688843398157625,
"grad_norm": 0.31443022141500176,
"learning_rate": 1.4190884056034983e-05,
"loss": 0.2642,
"step": 1548
},
{
"epoch": 3.1709314227226204,
"grad_norm": 0.32146571298562293,
"learning_rate": 1.4163511561779956e-05,
"loss": 0.2532,
"step": 1549
},
{
"epoch": 3.172978505629478,
"grad_norm": 0.3550608808689066,
"learning_rate": 1.4136151012538008e-05,
"loss": 0.2358,
"step": 1550
},
{
"epoch": 3.175025588536336,
"grad_norm": 0.3166912892735251,
"learning_rate": 1.4108802464305496e-05,
"loss": 0.2128,
"step": 1551
},
{
"epoch": 3.1770726714431934,
"grad_norm": 0.3332651741225492,
"learning_rate": 1.4081465973054216e-05,
"loss": 0.2423,
"step": 1552
},
{
"epoch": 3.1791197543500513,
"grad_norm": 0.3369578277421136,
"learning_rate": 1.4054141594731289e-05,
"loss": 0.205,
"step": 1553
},
{
"epoch": 3.181166837256909,
"grad_norm": 0.3549410163798208,
"learning_rate": 1.402682938525906e-05,
"loss": 0.2587,
"step": 1554
},
{
"epoch": 3.183213920163767,
"grad_norm": 0.31908575274366446,
"learning_rate": 1.3999529400534941e-05,
"loss": 0.2669,
"step": 1555
},
{
"epoch": 3.1852610030706243,
"grad_norm": 0.33778438982482334,
"learning_rate": 1.3972241696431357e-05,
"loss": 0.244,
"step": 1556
},
{
"epoch": 3.1873080859774823,
"grad_norm": 0.3536190099748012,
"learning_rate": 1.3944966328795584e-05,
"loss": 0.243,
"step": 1557
},
{
"epoch": 3.18935516888434,
"grad_norm": 0.3395539666321202,
"learning_rate": 1.3917703353449646e-05,
"loss": 0.2231,
"step": 1558
},
{
"epoch": 3.1914022517911977,
"grad_norm": 0.3107794197012843,
"learning_rate": 1.3890452826190208e-05,
"loss": 0.203,
"step": 1559
},
{
"epoch": 3.1934493346980553,
"grad_norm": 0.33383730315910054,
"learning_rate": 1.3863214802788459e-05,
"loss": 0.2239,
"step": 1560
},
{
"epoch": 3.1954964176049128,
"grad_norm": 0.3236027308800084,
"learning_rate": 1.3835989338989996e-05,
"loss": 0.2602,
"step": 1561
},
{
"epoch": 3.1975435005117707,
"grad_norm": 0.3383450416141259,
"learning_rate": 1.3808776490514727e-05,
"loss": 0.2775,
"step": 1562
},
{
"epoch": 3.1995905834186287,
"grad_norm": 0.29556678254047786,
"learning_rate": 1.3781576313056713e-05,
"loss": 0.2305,
"step": 1563
},
{
"epoch": 3.201637666325486,
"grad_norm": 0.34240014338764224,
"learning_rate": 1.375438886228411e-05,
"loss": 0.2284,
"step": 1564
},
{
"epoch": 3.2036847492323437,
"grad_norm": 0.35927171229864285,
"learning_rate": 1.3727214193839002e-05,
"loss": 0.2669,
"step": 1565
},
{
"epoch": 3.2057318321392017,
"grad_norm": 0.295141784625082,
"learning_rate": 1.3700052363337337e-05,
"loss": 0.2237,
"step": 1566
},
{
"epoch": 3.207778915046059,
"grad_norm": 0.31352782892999664,
"learning_rate": 1.3672903426368773e-05,
"loss": 0.2105,
"step": 1567
},
{
"epoch": 3.209825997952917,
"grad_norm": 0.31282387528552047,
"learning_rate": 1.3645767438496567e-05,
"loss": 0.2252,
"step": 1568
},
{
"epoch": 3.2118730808597746,
"grad_norm": 0.3487407042028981,
"learning_rate": 1.3618644455257521e-05,
"loss": 0.2717,
"step": 1569
},
{
"epoch": 3.2139201637666326,
"grad_norm": 0.34749040489450855,
"learning_rate": 1.3591534532161781e-05,
"loss": 0.2463,
"step": 1570
},
{
"epoch": 3.21596724667349,
"grad_norm": 0.32899297696851715,
"learning_rate": 1.3564437724692766e-05,
"loss": 0.2275,
"step": 1571
},
{
"epoch": 3.218014329580348,
"grad_norm": 0.2725890794790335,
"learning_rate": 1.353735408830707e-05,
"loss": 0.2153,
"step": 1572
},
{
"epoch": 3.2200614124872056,
"grad_norm": 0.37569994527165246,
"learning_rate": 1.3510283678434317e-05,
"loss": 0.2445,
"step": 1573
},
{
"epoch": 3.2221084953940635,
"grad_norm": 0.3527821656094053,
"learning_rate": 1.348322655047707e-05,
"loss": 0.2088,
"step": 1574
},
{
"epoch": 3.224155578300921,
"grad_norm": 0.3319266141186732,
"learning_rate": 1.3456182759810708e-05,
"loss": 0.2336,
"step": 1575
},
{
"epoch": 3.226202661207779,
"grad_norm": 0.3545486115689844,
"learning_rate": 1.3429152361783307e-05,
"loss": 0.2681,
"step": 1576
},
{
"epoch": 3.2282497441146365,
"grad_norm": 0.3385783041962988,
"learning_rate": 1.3402135411715545e-05,
"loss": 0.2315,
"step": 1577
},
{
"epoch": 3.2302968270214945,
"grad_norm": 0.32938060023769156,
"learning_rate": 1.337513196490056e-05,
"loss": 0.2498,
"step": 1578
},
{
"epoch": 3.232343909928352,
"grad_norm": 0.30316278108042816,
"learning_rate": 1.3348142076603876e-05,
"loss": 0.1928,
"step": 1579
},
{
"epoch": 3.23439099283521,
"grad_norm": 0.34764955355779054,
"learning_rate": 1.3321165802063243e-05,
"loss": 0.28,
"step": 1580
},
{
"epoch": 3.2364380757420674,
"grad_norm": 0.3268398726034169,
"learning_rate": 1.3294203196488576e-05,
"loss": 0.2804,
"step": 1581
},
{
"epoch": 3.2384851586489254,
"grad_norm": 0.31999282008130525,
"learning_rate": 1.3267254315061797e-05,
"loss": 0.26,
"step": 1582
},
{
"epoch": 3.240532241555783,
"grad_norm": 0.31712302189517827,
"learning_rate": 1.324031921293674e-05,
"loss": 0.23,
"step": 1583
},
{
"epoch": 3.242579324462641,
"grad_norm": 0.3137795221764201,
"learning_rate": 1.3213397945239053e-05,
"loss": 0.243,
"step": 1584
},
{
"epoch": 3.2446264073694984,
"grad_norm": 0.3236833391733087,
"learning_rate": 1.318649056706605e-05,
"loss": 0.2621,
"step": 1585
},
{
"epoch": 3.2466734902763563,
"grad_norm": 0.28341768420158997,
"learning_rate": 1.3159597133486628e-05,
"loss": 0.2105,
"step": 1586
},
{
"epoch": 3.248720573183214,
"grad_norm": 0.34778024409649644,
"learning_rate": 1.313271769954115e-05,
"loss": 0.2899,
"step": 1587
},
{
"epoch": 3.250767656090072,
"grad_norm": 0.3072274493404741,
"learning_rate": 1.3105852320241326e-05,
"loss": 0.2141,
"step": 1588
},
{
"epoch": 3.2528147389969293,
"grad_norm": 0.32380150432128973,
"learning_rate": 1.307900105057009e-05,
"loss": 0.2218,
"step": 1589
},
{
"epoch": 3.2548618219037873,
"grad_norm": 0.3234313545922506,
"learning_rate": 1.3052163945481517e-05,
"loss": 0.2301,
"step": 1590
},
{
"epoch": 3.2569089048106448,
"grad_norm": 0.3409588063960833,
"learning_rate": 1.3025341059900675e-05,
"loss": 0.2331,
"step": 1591
},
{
"epoch": 3.2589559877175027,
"grad_norm": 0.33214628078621744,
"learning_rate": 1.2998532448723536e-05,
"loss": 0.2253,
"step": 1592
},
{
"epoch": 3.2610030706243602,
"grad_norm": 0.3495327746489498,
"learning_rate": 1.2971738166816871e-05,
"loss": 0.2369,
"step": 1593
},
{
"epoch": 3.263050153531218,
"grad_norm": 0.35093939156017323,
"learning_rate": 1.2944958269018103e-05,
"loss": 0.2329,
"step": 1594
},
{
"epoch": 3.2650972364380757,
"grad_norm": 0.3204843846779589,
"learning_rate": 1.291819281013524e-05,
"loss": 0.2144,
"step": 1595
},
{
"epoch": 3.2671443193449337,
"grad_norm": 0.3484251329111784,
"learning_rate": 1.289144184494671e-05,
"loss": 0.2531,
"step": 1596
},
{
"epoch": 3.269191402251791,
"grad_norm": 0.29945051356789365,
"learning_rate": 1.2864705428201307e-05,
"loss": 0.2293,
"step": 1597
},
{
"epoch": 3.2712384851586487,
"grad_norm": 0.3521856225970276,
"learning_rate": 1.2837983614618023e-05,
"loss": 0.3006,
"step": 1598
},
{
"epoch": 3.2732855680655066,
"grad_norm": 0.3221884922752649,
"learning_rate": 1.2811276458885993e-05,
"loss": 0.2331,
"step": 1599
},
{
"epoch": 3.2753326509723646,
"grad_norm": 0.30627973138617387,
"learning_rate": 1.2784584015664337e-05,
"loss": 0.222,
"step": 1600
},
{
"epoch": 3.277379733879222,
"grad_norm": 0.3650912580402862,
"learning_rate": 1.2757906339582053e-05,
"loss": 0.269,
"step": 1601
},
{
"epoch": 3.2794268167860796,
"grad_norm": 0.2847330580470038,
"learning_rate": 1.2731243485237932e-05,
"loss": 0.2161,
"step": 1602
},
{
"epoch": 3.2814738996929376,
"grad_norm": 0.3140163871014674,
"learning_rate": 1.2704595507200435e-05,
"loss": 0.271,
"step": 1603
},
{
"epoch": 3.2835209825997955,
"grad_norm": 0.324180451733573,
"learning_rate": 1.2677962460007555e-05,
"loss": 0.2124,
"step": 1604
},
{
"epoch": 3.285568065506653,
"grad_norm": 0.32841827310726324,
"learning_rate": 1.2651344398166745e-05,
"loss": 0.2905,
"step": 1605
},
{
"epoch": 3.2876151484135105,
"grad_norm": 0.2992524116524214,
"learning_rate": 1.26247413761548e-05,
"loss": 0.1951,
"step": 1606
},
{
"epoch": 3.2896622313203685,
"grad_norm": 0.3536568449695457,
"learning_rate": 1.2598153448417701e-05,
"loss": 0.3008,
"step": 1607
},
{
"epoch": 3.291709314227226,
"grad_norm": 0.2745017989746043,
"learning_rate": 1.2571580669370565e-05,
"loss": 0.218,
"step": 1608
},
{
"epoch": 3.293756397134084,
"grad_norm": 0.336785618203528,
"learning_rate": 1.254502309339749e-05,
"loss": 0.2163,
"step": 1609
},
{
"epoch": 3.2958034800409415,
"grad_norm": 0.29711300837975474,
"learning_rate": 1.2518480774851472e-05,
"loss": 0.2217,
"step": 1610
},
{
"epoch": 3.2978505629477994,
"grad_norm": 0.32116672616384045,
"learning_rate": 1.2491953768054263e-05,
"loss": 0.2595,
"step": 1611
},
{
"epoch": 3.299897645854657,
"grad_norm": 0.3367433516635412,
"learning_rate": 1.2465442127296297e-05,
"loss": 0.2293,
"step": 1612
},
{
"epoch": 3.301944728761515,
"grad_norm": 0.3300984778131218,
"learning_rate": 1.2438945906836557e-05,
"loss": 0.2443,
"step": 1613
},
{
"epoch": 3.3039918116683724,
"grad_norm": 0.31907195158009316,
"learning_rate": 1.241246516090245e-05,
"loss": 0.2321,
"step": 1614
},
{
"epoch": 3.3060388945752304,
"grad_norm": 0.3096668811954514,
"learning_rate": 1.2385999943689732e-05,
"loss": 0.2216,
"step": 1615
},
{
"epoch": 3.308085977482088,
"grad_norm": 0.3776462069541184,
"learning_rate": 1.2359550309362368e-05,
"loss": 0.2104,
"step": 1616
},
{
"epoch": 3.310133060388946,
"grad_norm": 0.3196294164019087,
"learning_rate": 1.2333116312052416e-05,
"loss": 0.2403,
"step": 1617
},
{
"epoch": 3.3121801432958033,
"grad_norm": 0.34176563178349245,
"learning_rate": 1.2306698005859975e-05,
"loss": 0.2409,
"step": 1618
},
{
"epoch": 3.3142272262026613,
"grad_norm": 0.34417655045127915,
"learning_rate": 1.2280295444852994e-05,
"loss": 0.2899,
"step": 1619
},
{
"epoch": 3.316274309109519,
"grad_norm": 0.34668338984896196,
"learning_rate": 1.22539086830672e-05,
"loss": 0.216,
"step": 1620
},
{
"epoch": 3.3183213920163768,
"grad_norm": 0.307563527231833,
"learning_rate": 1.2227537774505996e-05,
"loss": 0.204,
"step": 1621
},
{
"epoch": 3.3203684749232343,
"grad_norm": 0.36360020699523277,
"learning_rate": 1.2201182773140334e-05,
"loss": 0.2321,
"step": 1622
},
{
"epoch": 3.3224155578300922,
"grad_norm": 0.34959023754084967,
"learning_rate": 1.2174843732908609e-05,
"loss": 0.2417,
"step": 1623
},
{
"epoch": 3.3244626407369497,
"grad_norm": 0.35791334099991956,
"learning_rate": 1.2148520707716567e-05,
"loss": 0.267,
"step": 1624
},
{
"epoch": 3.3265097236438077,
"grad_norm": 0.31882901909973,
"learning_rate": 1.2122213751437147e-05,
"loss": 0.2368,
"step": 1625
},
{
"epoch": 3.328556806550665,
"grad_norm": 0.31734552594196086,
"learning_rate": 1.2095922917910427e-05,
"loss": 0.2437,
"step": 1626
},
{
"epoch": 3.330603889457523,
"grad_norm": 0.3303738558908603,
"learning_rate": 1.2069648260943473e-05,
"loss": 0.2326,
"step": 1627
},
{
"epoch": 3.3326509723643807,
"grad_norm": 0.32163501618546503,
"learning_rate": 1.2043389834310257e-05,
"loss": 0.2579,
"step": 1628
},
{
"epoch": 3.3346980552712386,
"grad_norm": 0.33400441651552865,
"learning_rate": 1.2017147691751512e-05,
"loss": 0.2572,
"step": 1629
},
{
"epoch": 3.336745138178096,
"grad_norm": 0.3062361134778084,
"learning_rate": 1.1990921886974669e-05,
"loss": 0.1971,
"step": 1630
},
{
"epoch": 3.338792221084954,
"grad_norm": 0.3617880140065343,
"learning_rate": 1.1964712473653713e-05,
"loss": 0.2752,
"step": 1631
},
{
"epoch": 3.3408393039918116,
"grad_norm": 0.31208903084798195,
"learning_rate": 1.1938519505429072e-05,
"loss": 0.1891,
"step": 1632
},
{
"epoch": 3.3428863868986696,
"grad_norm": 0.3846438154154162,
"learning_rate": 1.1912343035907535e-05,
"loss": 0.2663,
"step": 1633
},
{
"epoch": 3.344933469805527,
"grad_norm": 0.3360119347175398,
"learning_rate": 1.1886183118662108e-05,
"loss": 0.225,
"step": 1634
},
{
"epoch": 3.346980552712385,
"grad_norm": 0.34434286466245134,
"learning_rate": 1.1860039807231923e-05,
"loss": 0.2311,
"step": 1635
},
{
"epoch": 3.3490276356192425,
"grad_norm": 0.3193089695495996,
"learning_rate": 1.1833913155122132e-05,
"loss": 0.1958,
"step": 1636
},
{
"epoch": 3.3510747185261005,
"grad_norm": 0.34254339904952036,
"learning_rate": 1.1807803215803806e-05,
"loss": 0.2301,
"step": 1637
},
{
"epoch": 3.353121801432958,
"grad_norm": 0.37284073286468844,
"learning_rate": 1.1781710042713783e-05,
"loss": 0.2321,
"step": 1638
},
{
"epoch": 3.3551688843398155,
"grad_norm": 0.3400024012806843,
"learning_rate": 1.1755633689254609e-05,
"loss": 0.2481,
"step": 1639
},
{
"epoch": 3.3572159672466735,
"grad_norm": 0.3381525204885461,
"learning_rate": 1.1729574208794388e-05,
"loss": 0.2486,
"step": 1640
},
{
"epoch": 3.3592630501535314,
"grad_norm": 0.3197266149899627,
"learning_rate": 1.1703531654666714e-05,
"loss": 0.2368,
"step": 1641
},
{
"epoch": 3.361310133060389,
"grad_norm": 0.3130217545123078,
"learning_rate": 1.1677506080170512e-05,
"loss": 0.2342,
"step": 1642
},
{
"epoch": 3.3633572159672465,
"grad_norm": 0.32947434661394787,
"learning_rate": 1.1651497538569984e-05,
"loss": 0.2124,
"step": 1643
},
{
"epoch": 3.3654042988741044,
"grad_norm": 0.344686988795317,
"learning_rate": 1.162550608309446e-05,
"loss": 0.2464,
"step": 1644
},
{
"epoch": 3.3674513817809624,
"grad_norm": 0.30351992821717133,
"learning_rate": 1.1599531766938306e-05,
"loss": 0.2378,
"step": 1645
},
{
"epoch": 3.36949846468782,
"grad_norm": 0.31537723127663564,
"learning_rate": 1.1573574643260787e-05,
"loss": 0.2683,
"step": 1646
},
{
"epoch": 3.3715455475946774,
"grad_norm": 0.3179109407008773,
"learning_rate": 1.1547634765186016e-05,
"loss": 0.21,
"step": 1647
},
{
"epoch": 3.3735926305015353,
"grad_norm": 0.3414400925992681,
"learning_rate": 1.1521712185802789e-05,
"loss": 0.2467,
"step": 1648
},
{
"epoch": 3.375639713408393,
"grad_norm": 0.3015513958093698,
"learning_rate": 1.1495806958164508e-05,
"loss": 0.2333,
"step": 1649
},
{
"epoch": 3.377686796315251,
"grad_norm": 0.3221671987582906,
"learning_rate": 1.1469919135289058e-05,
"loss": 0.2697,
"step": 1650
},
{
"epoch": 3.3797338792221083,
"grad_norm": 0.29450678032670125,
"learning_rate": 1.1444048770158718e-05,
"loss": 0.2255,
"step": 1651
},
{
"epoch": 3.3817809621289663,
"grad_norm": 0.39235810212847905,
"learning_rate": 1.1418195915720002e-05,
"loss": 0.2599,
"step": 1652
},
{
"epoch": 3.383828045035824,
"grad_norm": 0.2945867088296247,
"learning_rate": 1.139236062488362e-05,
"loss": 0.2336,
"step": 1653
},
{
"epoch": 3.3858751279426818,
"grad_norm": 0.3178722215887932,
"learning_rate": 1.136654295052433e-05,
"loss": 0.266,
"step": 1654
},
{
"epoch": 3.3879222108495393,
"grad_norm": 0.32262451442066525,
"learning_rate": 1.134074294548082e-05,
"loss": 0.2282,
"step": 1655
},
{
"epoch": 3.389969293756397,
"grad_norm": 0.3260714484305257,
"learning_rate": 1.1314960662555639e-05,
"loss": 0.2294,
"step": 1656
},
{
"epoch": 3.3920163766632547,
"grad_norm": 0.29552572043468256,
"learning_rate": 1.1289196154515048e-05,
"loss": 0.1852,
"step": 1657
},
{
"epoch": 3.3940634595701127,
"grad_norm": 0.32257518359378057,
"learning_rate": 1.1263449474088944e-05,
"loss": 0.2122,
"step": 1658
},
{
"epoch": 3.39611054247697,
"grad_norm": 0.31678272789529893,
"learning_rate": 1.1237720673970713e-05,
"loss": 0.2391,
"step": 1659
},
{
"epoch": 3.398157625383828,
"grad_norm": 0.3294063880106706,
"learning_rate": 1.1212009806817163e-05,
"loss": 0.2582,
"step": 1660
},
{
"epoch": 3.4002047082906857,
"grad_norm": 0.33445038877415256,
"learning_rate": 1.118631692524843e-05,
"loss": 0.2325,
"step": 1661
},
{
"epoch": 3.4022517911975436,
"grad_norm": 0.31780769894415345,
"learning_rate": 1.1160642081847782e-05,
"loss": 0.2114,
"step": 1662
},
{
"epoch": 3.404298874104401,
"grad_norm": 0.34936980398469303,
"learning_rate": 1.1134985329161608e-05,
"loss": 0.2633,
"step": 1663
},
{
"epoch": 3.406345957011259,
"grad_norm": 0.25267194298381324,
"learning_rate": 1.1109346719699263e-05,
"loss": 0.1672,
"step": 1664
},
{
"epoch": 3.4083930399181166,
"grad_norm": 0.39414125375432546,
"learning_rate": 1.108372630593298e-05,
"loss": 0.3388,
"step": 1665
},
{
"epoch": 3.4104401228249746,
"grad_norm": 0.31231427401371586,
"learning_rate": 1.1058124140297718e-05,
"loss": 0.247,
"step": 1666
},
{
"epoch": 3.412487205731832,
"grad_norm": 0.2819687858861257,
"learning_rate": 1.1032540275191148e-05,
"loss": 0.2171,
"step": 1667
},
{
"epoch": 3.41453428863869,
"grad_norm": 0.32854044476275013,
"learning_rate": 1.1006974762973425e-05,
"loss": 0.2265,
"step": 1668
},
{
"epoch": 3.4165813715455475,
"grad_norm": 0.34353435192327664,
"learning_rate": 1.0981427655967183e-05,
"loss": 0.2469,
"step": 1669
},
{
"epoch": 3.4186284544524055,
"grad_norm": 0.32710285779281467,
"learning_rate": 1.0955899006457373e-05,
"loss": 0.2437,
"step": 1670
},
{
"epoch": 3.420675537359263,
"grad_norm": 0.3122882795616273,
"learning_rate": 1.0930388866691181e-05,
"loss": 0.2433,
"step": 1671
},
{
"epoch": 3.422722620266121,
"grad_norm": 0.29841602691342367,
"learning_rate": 1.0904897288877891e-05,
"loss": 0.2373,
"step": 1672
},
{
"epoch": 3.4247697031729785,
"grad_norm": 0.32318146749811455,
"learning_rate": 1.0879424325188805e-05,
"loss": 0.2477,
"step": 1673
},
{
"epoch": 3.4268167860798364,
"grad_norm": 0.32962691544440564,
"learning_rate": 1.085397002775716e-05,
"loss": 0.2577,
"step": 1674
},
{
"epoch": 3.428863868986694,
"grad_norm": 0.3028821115102944,
"learning_rate": 1.0828534448677942e-05,
"loss": 0.256,
"step": 1675
},
{
"epoch": 3.4309109518935514,
"grad_norm": 0.31035238566507123,
"learning_rate": 1.080311764000786e-05,
"loss": 0.223,
"step": 1676
},
{
"epoch": 3.4329580348004094,
"grad_norm": 0.30168173543643445,
"learning_rate": 1.0777719653765191e-05,
"loss": 0.2389,
"step": 1677
},
{
"epoch": 3.4350051177072674,
"grad_norm": 0.28500495929546144,
"learning_rate": 1.0752340541929711e-05,
"loss": 0.226,
"step": 1678
},
{
"epoch": 3.437052200614125,
"grad_norm": 0.3141146687901098,
"learning_rate": 1.0726980356442524e-05,
"loss": 0.2684,
"step": 1679
},
{
"epoch": 3.4390992835209824,
"grad_norm": 0.4682196146944551,
"learning_rate": 1.0701639149206061e-05,
"loss": 0.2647,
"step": 1680
},
{
"epoch": 3.4411463664278403,
"grad_norm": 0.3365183925944955,
"learning_rate": 1.0676316972083867e-05,
"loss": 0.2432,
"step": 1681
},
{
"epoch": 3.4431934493346983,
"grad_norm": 0.28313743388646095,
"learning_rate": 1.0651013876900546e-05,
"loss": 0.2099,
"step": 1682
},
{
"epoch": 3.445240532241556,
"grad_norm": 0.34192081983514505,
"learning_rate": 1.0625729915441659e-05,
"loss": 0.2482,
"step": 1683
},
{
"epoch": 3.4472876151484133,
"grad_norm": 0.31305659538484126,
"learning_rate": 1.060046513945361e-05,
"loss": 0.2137,
"step": 1684
},
{
"epoch": 3.4493346980552713,
"grad_norm": 0.3191034853933784,
"learning_rate": 1.0575219600643508e-05,
"loss": 0.2329,
"step": 1685
},
{
"epoch": 3.4513817809621288,
"grad_norm": 0.3245403446485034,
"learning_rate": 1.0549993350679138e-05,
"loss": 0.235,
"step": 1686
},
{
"epoch": 3.4534288638689867,
"grad_norm": 0.3088632467059862,
"learning_rate": 1.0524786441188786e-05,
"loss": 0.2155,
"step": 1687
},
{
"epoch": 3.4554759467758442,
"grad_norm": 0.3354339197261192,
"learning_rate": 1.0499598923761139e-05,
"loss": 0.2341,
"step": 1688
},
{
"epoch": 3.457523029682702,
"grad_norm": 0.3090008978999402,
"learning_rate": 1.0474430849945214e-05,
"loss": 0.2081,
"step": 1689
},
{
"epoch": 3.4595701125895597,
"grad_norm": 0.30640868139315103,
"learning_rate": 1.0449282271250239e-05,
"loss": 0.2258,
"step": 1690
},
{
"epoch": 3.4616171954964177,
"grad_norm": 0.325313731285426,
"learning_rate": 1.0424153239145527e-05,
"loss": 0.267,
"step": 1691
},
{
"epoch": 3.463664278403275,
"grad_norm": 0.3226758771977029,
"learning_rate": 1.0399043805060406e-05,
"loss": 0.2761,
"step": 1692
},
{
"epoch": 3.465711361310133,
"grad_norm": 0.2751548117016905,
"learning_rate": 1.0373954020384073e-05,
"loss": 0.2102,
"step": 1693
},
{
"epoch": 3.4677584442169906,
"grad_norm": 0.3197825524327039,
"learning_rate": 1.0348883936465537e-05,
"loss": 0.2558,
"step": 1694
},
{
"epoch": 3.4698055271238486,
"grad_norm": 0.3009633405603159,
"learning_rate": 1.0323833604613454e-05,
"loss": 0.2214,
"step": 1695
},
{
"epoch": 3.471852610030706,
"grad_norm": 0.3104955641697045,
"learning_rate": 1.029880307609608e-05,
"loss": 0.2686,
"step": 1696
},
{
"epoch": 3.473899692937564,
"grad_norm": 0.2692226200882447,
"learning_rate": 1.0273792402141134e-05,
"loss": 0.2312,
"step": 1697
},
{
"epoch": 3.4759467758444216,
"grad_norm": 0.31761762398562665,
"learning_rate": 1.0248801633935699e-05,
"loss": 0.2536,
"step": 1698
},
{
"epoch": 3.4779938587512795,
"grad_norm": 0.3196706676343915,
"learning_rate": 1.0223830822626124e-05,
"loss": 0.2556,
"step": 1699
},
{
"epoch": 3.480040941658137,
"grad_norm": 0.2983026635325646,
"learning_rate": 1.0198880019317913e-05,
"loss": 0.2468,
"step": 1700
},
{
"epoch": 3.482088024564995,
"grad_norm": 0.304844430252478,
"learning_rate": 1.017394927507561e-05,
"loss": 0.2252,
"step": 1701
},
{
"epoch": 3.4841351074718525,
"grad_norm": 0.32219494329823706,
"learning_rate": 1.0149038640922715e-05,
"loss": 0.2576,
"step": 1702
},
{
"epoch": 3.4861821903787105,
"grad_norm": 0.27799343218915756,
"learning_rate": 1.0124148167841577e-05,
"loss": 0.1703,
"step": 1703
},
{
"epoch": 3.488229273285568,
"grad_norm": 0.32497007511792664,
"learning_rate": 1.009927790677327e-05,
"loss": 0.2366,
"step": 1704
},
{
"epoch": 3.490276356192426,
"grad_norm": 0.3095778446432627,
"learning_rate": 1.0074427908617515e-05,
"loss": 0.2806,
"step": 1705
},
{
"epoch": 3.4923234390992834,
"grad_norm": 0.3249340961496839,
"learning_rate": 1.004959822423255e-05,
"loss": 0.2406,
"step": 1706
},
{
"epoch": 3.4943705220061414,
"grad_norm": 0.3092490415262631,
"learning_rate": 1.0024788904435054e-05,
"loss": 0.225,
"step": 1707
},
{
"epoch": 3.496417604912999,
"grad_norm": 0.3376859697600405,
"learning_rate": 1.0000000000000006e-05,
"loss": 0.2859,
"step": 1708
},
{
"epoch": 3.498464687819857,
"grad_norm": 0.2986922022588596,
"learning_rate": 9.975231561660617e-06,
"loss": 0.2241,
"step": 1709
},
{
"epoch": 3.5005117707267144,
"grad_norm": 0.3304776655435591,
"learning_rate": 9.950483640108215e-06,
"loss": 0.2865,
"step": 1710
},
{
"epoch": 3.5025588536335723,
"grad_norm": 0.30674425880777934,
"learning_rate": 9.92575628599213e-06,
"loss": 0.204,
"step": 1711
},
{
"epoch": 3.50460593654043,
"grad_norm": 0.3372001770402179,
"learning_rate": 9.901049549919601e-06,
"loss": 0.2705,
"step": 1712
},
{
"epoch": 3.5066530194472874,
"grad_norm": 0.28957256516814167,
"learning_rate": 9.876363482455675e-06,
"loss": 0.215,
"step": 1713
},
{
"epoch": 3.5087001023541453,
"grad_norm": 0.36096631037593485,
"learning_rate": 9.851698134123095e-06,
"loss": 0.2522,
"step": 1714
},
{
"epoch": 3.5107471852610033,
"grad_norm": 0.3128276504087451,
"learning_rate": 9.827053555402191e-06,
"loss": 0.2661,
"step": 1715
},
{
"epoch": 3.512794268167861,
"grad_norm": 0.3126033507485691,
"learning_rate": 9.802429796730792e-06,
"loss": 0.2329,
"step": 1716
},
{
"epoch": 3.5148413510747183,
"grad_norm": 0.33709412127432403,
"learning_rate": 9.777826908504126e-06,
"loss": 0.2673,
"step": 1717
},
{
"epoch": 3.5168884339815762,
"grad_norm": 0.31504566512999976,
"learning_rate": 9.753244941074696e-06,
"loss": 0.1942,
"step": 1718
},
{
"epoch": 3.518935516888434,
"grad_norm": 0.3081794499055162,
"learning_rate": 9.728683944752193e-06,
"loss": 0.23,
"step": 1719
},
{
"epoch": 3.5209825997952917,
"grad_norm": 0.3205819253659646,
"learning_rate": 9.704143969803392e-06,
"loss": 0.2599,
"step": 1720
},
{
"epoch": 3.5230296827021492,
"grad_norm": 0.3179566007920735,
"learning_rate": 9.679625066452028e-06,
"loss": 0.2468,
"step": 1721
},
{
"epoch": 3.525076765609007,
"grad_norm": 0.33546341224030785,
"learning_rate": 9.655127284878723e-06,
"loss": 0.2285,
"step": 1722
},
{
"epoch": 3.527123848515865,
"grad_norm": 0.3406356093850999,
"learning_rate": 9.630650675220892e-06,
"loss": 0.2277,
"step": 1723
},
{
"epoch": 3.5291709314227226,
"grad_norm": 0.297528564327505,
"learning_rate": 9.606195287572577e-06,
"loss": 0.199,
"step": 1724
},
{
"epoch": 3.53121801432958,
"grad_norm": 0.3221532376937218,
"learning_rate": 9.581761171984416e-06,
"loss": 0.2157,
"step": 1725
},
{
"epoch": 3.533265097236438,
"grad_norm": 0.35339763123663803,
"learning_rate": 9.557348378463503e-06,
"loss": 0.2377,
"step": 1726
},
{
"epoch": 3.535312180143296,
"grad_norm": 0.3011217732360328,
"learning_rate": 9.532956956973302e-06,
"loss": 0.2316,
"step": 1727
},
{
"epoch": 3.5373592630501536,
"grad_norm": 0.31883004437994317,
"learning_rate": 9.50858695743351e-06,
"loss": 0.2358,
"step": 1728
},
{
"epoch": 3.539406345957011,
"grad_norm": 0.4148367793712331,
"learning_rate": 9.484238429720018e-06,
"loss": 0.2412,
"step": 1729
},
{
"epoch": 3.541453428863869,
"grad_norm": 0.3651190389759453,
"learning_rate": 9.459911423664763e-06,
"loss": 0.2496,
"step": 1730
},
{
"epoch": 3.5435005117707266,
"grad_norm": 0.2782188953479953,
"learning_rate": 9.435605989055607e-06,
"loss": 0.2028,
"step": 1731
},
{
"epoch": 3.5455475946775845,
"grad_norm": 0.31391987453704123,
"learning_rate": 9.411322175636298e-06,
"loss": 0.2561,
"step": 1732
},
{
"epoch": 3.547594677584442,
"grad_norm": 0.3158415427773136,
"learning_rate": 9.387060033106321e-06,
"loss": 0.2956,
"step": 1733
},
{
"epoch": 3.5496417604913,
"grad_norm": 0.3175891657245625,
"learning_rate": 9.362819611120793e-06,
"loss": 0.2566,
"step": 1734
},
{
"epoch": 3.5516888433981575,
"grad_norm": 0.3312219072193364,
"learning_rate": 9.338600959290414e-06,
"loss": 0.2317,
"step": 1735
},
{
"epoch": 3.5537359263050154,
"grad_norm": 0.3341176350083145,
"learning_rate": 9.314404127181307e-06,
"loss": 0.293,
"step": 1736
},
{
"epoch": 3.555783009211873,
"grad_norm": 0.29940835888683603,
"learning_rate": 9.290229164314928e-06,
"loss": 0.2221,
"step": 1737
},
{
"epoch": 3.557830092118731,
"grad_norm": 0.3046910176714613,
"learning_rate": 9.266076120167992e-06,
"loss": 0.2472,
"step": 1738
},
{
"epoch": 3.5598771750255884,
"grad_norm": 0.2841913350596404,
"learning_rate": 9.241945044172353e-06,
"loss": 0.2277,
"step": 1739
},
{
"epoch": 3.5619242579324464,
"grad_norm": 0.323279705340557,
"learning_rate": 9.217835985714898e-06,
"loss": 0.2709,
"step": 1740
},
{
"epoch": 3.563971340839304,
"grad_norm": 0.29421984064136286,
"learning_rate": 9.193748994137462e-06,
"loss": 0.2064,
"step": 1741
},
{
"epoch": 3.566018423746162,
"grad_norm": 0.3431113829919363,
"learning_rate": 9.169684118736708e-06,
"loss": 0.2581,
"step": 1742
},
{
"epoch": 3.5680655066530194,
"grad_norm": 0.31454166876772843,
"learning_rate": 9.145641408764048e-06,
"loss": 0.2135,
"step": 1743
},
{
"epoch": 3.5701125895598773,
"grad_norm": 0.3294672393878416,
"learning_rate": 9.121620913425508e-06,
"loss": 0.2607,
"step": 1744
},
{
"epoch": 3.572159672466735,
"grad_norm": 0.29027976941333605,
"learning_rate": 9.097622681881673e-06,
"loss": 0.1969,
"step": 1745
},
{
"epoch": 3.574206755373593,
"grad_norm": 0.3177811895638415,
"learning_rate": 9.073646763247558e-06,
"loss": 0.2103,
"step": 1746
},
{
"epoch": 3.5762538382804503,
"grad_norm": 0.3196047544970432,
"learning_rate": 9.04969320659249e-06,
"loss": 0.239,
"step": 1747
},
{
"epoch": 3.5783009211873082,
"grad_norm": 0.2925767290546702,
"learning_rate": 9.025762060940062e-06,
"loss": 0.198,
"step": 1748
},
{
"epoch": 3.5803480040941658,
"grad_norm": 0.33053764466506,
"learning_rate": 9.001853375267989e-06,
"loss": 0.2366,
"step": 1749
},
{
"epoch": 3.5823950870010233,
"grad_norm": 0.3174304755008507,
"learning_rate": 8.977967198508001e-06,
"loss": 0.2256,
"step": 1750
},
{
"epoch": 3.5844421699078812,
"grad_norm": 0.343170859287156,
"learning_rate": 8.954103579545785e-06,
"loss": 0.2341,
"step": 1751
},
{
"epoch": 3.586489252814739,
"grad_norm": 0.3215320683527329,
"learning_rate": 8.93026256722085e-06,
"loss": 0.2045,
"step": 1752
},
{
"epoch": 3.5885363357215967,
"grad_norm": 0.345488281284008,
"learning_rate": 8.906444210326441e-06,
"loss": 0.2708,
"step": 1753
},
{
"epoch": 3.590583418628454,
"grad_norm": 0.29374650212717546,
"learning_rate": 8.882648557609434e-06,
"loss": 0.2144,
"step": 1754
},
{
"epoch": 3.592630501535312,
"grad_norm": 0.30791903954132194,
"learning_rate": 8.858875657770241e-06,
"loss": 0.2196,
"step": 1755
},
{
"epoch": 3.59467758444217,
"grad_norm": 0.37604604250081547,
"learning_rate": 8.83512555946271e-06,
"loss": 0.2842,
"step": 1756
},
{
"epoch": 3.5967246673490276,
"grad_norm": 0.2815977468791065,
"learning_rate": 8.811398311294008e-06,
"loss": 0.2128,
"step": 1757
},
{
"epoch": 3.598771750255885,
"grad_norm": 0.32463396760239493,
"learning_rate": 8.787693961824555e-06,
"loss": 0.2635,
"step": 1758
},
{
"epoch": 3.600818833162743,
"grad_norm": 0.3178563263559145,
"learning_rate": 8.764012559567899e-06,
"loss": 0.2749,
"step": 1759
},
{
"epoch": 3.602865916069601,
"grad_norm": 0.32193828000242514,
"learning_rate": 8.740354152990624e-06,
"loss": 0.232,
"step": 1760
},
{
"epoch": 3.6049129989764586,
"grad_norm": 0.35115161370306397,
"learning_rate": 8.716718790512251e-06,
"loss": 0.2421,
"step": 1761
},
{
"epoch": 3.606960081883316,
"grad_norm": 0.3150133989578825,
"learning_rate": 8.693106520505147e-06,
"loss": 0.2078,
"step": 1762
},
{
"epoch": 3.609007164790174,
"grad_norm": 0.34022251620199256,
"learning_rate": 8.669517391294397e-06,
"loss": 0.248,
"step": 1763
},
{
"epoch": 3.611054247697032,
"grad_norm": 0.3039508397036727,
"learning_rate": 8.645951451157741e-06,
"loss": 0.187,
"step": 1764
},
{
"epoch": 3.6131013306038895,
"grad_norm": 0.3287212950737981,
"learning_rate": 8.622408748325461e-06,
"loss": 0.2774,
"step": 1765
},
{
"epoch": 3.615148413510747,
"grad_norm": 0.3159497126746659,
"learning_rate": 8.598889330980277e-06,
"loss": 0.2251,
"step": 1766
},
{
"epoch": 3.617195496417605,
"grad_norm": 0.2931231286120822,
"learning_rate": 8.575393247257256e-06,
"loss": 0.2267,
"step": 1767
},
{
"epoch": 3.619242579324463,
"grad_norm": 0.3287345655736662,
"learning_rate": 8.551920545243704e-06,
"loss": 0.24,
"step": 1768
},
{
"epoch": 3.6212896622313204,
"grad_norm": 0.30400959536179484,
"learning_rate": 8.528471272979083e-06,
"loss": 0.2133,
"step": 1769
},
{
"epoch": 3.623336745138178,
"grad_norm": 0.31203010646854484,
"learning_rate": 8.50504547845489e-06,
"loss": 0.2404,
"step": 1770
},
{
"epoch": 3.625383828045036,
"grad_norm": 0.30015537192427894,
"learning_rate": 8.481643209614576e-06,
"loss": 0.2059,
"step": 1771
},
{
"epoch": 3.6274309109518934,
"grad_norm": 0.295949459736467,
"learning_rate": 8.45826451435347e-06,
"loss": 0.2209,
"step": 1772
},
{
"epoch": 3.6294779938587514,
"grad_norm": 0.32651774554006335,
"learning_rate": 8.434909440518613e-06,
"loss": 0.258,
"step": 1773
},
{
"epoch": 3.631525076765609,
"grad_norm": 0.28180464315900705,
"learning_rate": 8.411578035908728e-06,
"loss": 0.205,
"step": 1774
},
{
"epoch": 3.633572159672467,
"grad_norm": 0.3413634143023636,
"learning_rate": 8.388270348274092e-06,
"loss": 0.2769,
"step": 1775
},
{
"epoch": 3.6356192425793243,
"grad_norm": 0.33236322758981973,
"learning_rate": 8.364986425316448e-06,
"loss": 0.2234,
"step": 1776
},
{
"epoch": 3.6376663254861823,
"grad_norm": 0.3020595253520059,
"learning_rate": 8.341726314688875e-06,
"loss": 0.2509,
"step": 1777
},
{
"epoch": 3.63971340839304,
"grad_norm": 0.31852165056270737,
"learning_rate": 8.318490063995761e-06,
"loss": 0.2537,
"step": 1778
},
{
"epoch": 3.6417604912998978,
"grad_norm": 0.2944306330035562,
"learning_rate": 8.295277720792634e-06,
"loss": 0.2222,
"step": 1779
},
{
"epoch": 3.6438075742067553,
"grad_norm": 0.3191896831407699,
"learning_rate": 8.272089332586089e-06,
"loss": 0.2437,
"step": 1780
},
{
"epoch": 3.6458546571136132,
"grad_norm": 0.3341784697234189,
"learning_rate": 8.248924946833705e-06,
"loss": 0.25,
"step": 1781
},
{
"epoch": 3.6479017400204707,
"grad_norm": 0.28526708897000397,
"learning_rate": 8.225784610943948e-06,
"loss": 0.2586,
"step": 1782
},
{
"epoch": 3.6499488229273287,
"grad_norm": 0.30819644727048756,
"learning_rate": 8.20266837227603e-06,
"loss": 0.2482,
"step": 1783
},
{
"epoch": 3.651995905834186,
"grad_norm": 0.3391130155927512,
"learning_rate": 8.179576278139872e-06,
"loss": 0.2703,
"step": 1784
},
{
"epoch": 3.654042988741044,
"grad_norm": 0.3457772155494689,
"learning_rate": 8.156508375795995e-06,
"loss": 0.2138,
"step": 1785
},
{
"epoch": 3.6560900716479017,
"grad_norm": 0.3545988817947354,
"learning_rate": 8.133464712455364e-06,
"loss": 0.2381,
"step": 1786
},
{
"epoch": 3.6581371545547596,
"grad_norm": 0.3051355101331329,
"learning_rate": 8.11044533527937e-06,
"loss": 0.2212,
"step": 1787
},
{
"epoch": 3.660184237461617,
"grad_norm": 0.31093454380049773,
"learning_rate": 8.087450291379693e-06,
"loss": 0.2782,
"step": 1788
},
{
"epoch": 3.662231320368475,
"grad_norm": 0.3076008520431847,
"learning_rate": 8.064479627818213e-06,
"loss": 0.2563,
"step": 1789
},
{
"epoch": 3.6642784032753326,
"grad_norm": 0.2914759795466477,
"learning_rate": 8.041533391606892e-06,
"loss": 0.237,
"step": 1790
},
{
"epoch": 3.66632548618219,
"grad_norm": 0.32811153076900396,
"learning_rate": 8.018611629707735e-06,
"loss": 0.2192,
"step": 1791
},
{
"epoch": 3.668372569089048,
"grad_norm": 0.3349979236041854,
"learning_rate": 7.995714389032638e-06,
"loss": 0.2544,
"step": 1792
},
{
"epoch": 3.670419651995906,
"grad_norm": 0.30481283300379364,
"learning_rate": 7.972841716443304e-06,
"loss": 0.2021,
"step": 1793
},
{
"epoch": 3.6724667349027635,
"grad_norm": 0.31038423151434946,
"learning_rate": 7.949993658751168e-06,
"loss": 0.2714,
"step": 1794
},
{
"epoch": 3.674513817809621,
"grad_norm": 0.3046182836563986,
"learning_rate": 7.927170262717284e-06,
"loss": 0.2486,
"step": 1795
},
{
"epoch": 3.676560900716479,
"grad_norm": 0.321493522566335,
"learning_rate": 7.904371575052224e-06,
"loss": 0.257,
"step": 1796
},
{
"epoch": 3.678607983623337,
"grad_norm": 0.32116628196410996,
"learning_rate": 7.881597642416012e-06,
"loss": 0.2351,
"step": 1797
},
{
"epoch": 3.6806550665301945,
"grad_norm": 0.3225460384671046,
"learning_rate": 7.858848511417998e-06,
"loss": 0.2787,
"step": 1798
},
{
"epoch": 3.682702149437052,
"grad_norm": 0.3065477845223782,
"learning_rate": 7.836124228616762e-06,
"loss": 0.2059,
"step": 1799
},
{
"epoch": 3.68474923234391,
"grad_norm": 0.2820736922840727,
"learning_rate": 7.81342484052004e-06,
"loss": 0.2065,
"step": 1800
},
{
"epoch": 3.686796315250768,
"grad_norm": 0.3199543711113544,
"learning_rate": 7.790750393584616e-06,
"loss": 0.2482,
"step": 1801
},
{
"epoch": 3.6888433981576254,
"grad_norm": 0.3041948190106927,
"learning_rate": 7.768100934216234e-06,
"loss": 0.2278,
"step": 1802
},
{
"epoch": 3.690890481064483,
"grad_norm": 0.3053390429608892,
"learning_rate": 7.745476508769494e-06,
"loss": 0.2356,
"step": 1803
},
{
"epoch": 3.692937563971341,
"grad_norm": 0.3110750944178115,
"learning_rate": 7.72287716354776e-06,
"loss": 0.2402,
"step": 1804
},
{
"epoch": 3.694984646878199,
"grad_norm": 0.28367962348790254,
"learning_rate": 7.700302944803076e-06,
"loss": 0.1827,
"step": 1805
},
{
"epoch": 3.6970317297850563,
"grad_norm": 0.33323381054790285,
"learning_rate": 7.67775389873604e-06,
"loss": 0.2293,
"step": 1806
},
{
"epoch": 3.699078812691914,
"grad_norm": 0.3521970989279458,
"learning_rate": 7.65523007149575e-06,
"loss": 0.2657,
"step": 1807
},
{
"epoch": 3.701125895598772,
"grad_norm": 0.30699833196534315,
"learning_rate": 7.63273150917969e-06,
"loss": 0.2421,
"step": 1808
},
{
"epoch": 3.7031729785056293,
"grad_norm": 0.31416536601357237,
"learning_rate": 7.6102582578336315e-06,
"loss": 0.1997,
"step": 1809
},
{
"epoch": 3.7052200614124873,
"grad_norm": 0.3803922818493654,
"learning_rate": 7.587810363451544e-06,
"loss": 0.2428,
"step": 1810
},
{
"epoch": 3.707267144319345,
"grad_norm": 0.32978409370696515,
"learning_rate": 7.565387871975511e-06,
"loss": 0.2037,
"step": 1811
},
{
"epoch": 3.7093142272262027,
"grad_norm": 0.33553313939376517,
"learning_rate": 7.5429908292956045e-06,
"loss": 0.2675,
"step": 1812
},
{
"epoch": 3.7113613101330603,
"grad_norm": 0.3164776677964802,
"learning_rate": 7.5206192812498345e-06,
"loss": 0.262,
"step": 1813
},
{
"epoch": 3.713408393039918,
"grad_norm": 0.29867646878594656,
"learning_rate": 7.498273273624022e-06,
"loss": 0.2468,
"step": 1814
},
{
"epoch": 3.7154554759467757,
"grad_norm": 0.31216831365580633,
"learning_rate": 7.475952852151722e-06,
"loss": 0.2225,
"step": 1815
},
{
"epoch": 3.7175025588536337,
"grad_norm": 0.2854940367151503,
"learning_rate": 7.4536580625141244e-06,
"loss": 0.2302,
"step": 1816
},
{
"epoch": 3.719549641760491,
"grad_norm": 0.3133497229188554,
"learning_rate": 7.431388950339955e-06,
"loss": 0.2188,
"step": 1817
},
{
"epoch": 3.721596724667349,
"grad_norm": 0.3115695729233575,
"learning_rate": 7.409145561205402e-06,
"loss": 0.2251,
"step": 1818
},
{
"epoch": 3.7236438075742067,
"grad_norm": 0.3211680619051747,
"learning_rate": 7.386927940633981e-06,
"loss": 0.244,
"step": 1819
},
{
"epoch": 3.7256908904810646,
"grad_norm": 0.3048950617764936,
"learning_rate": 7.364736134096497e-06,
"loss": 0.264,
"step": 1820
},
{
"epoch": 3.727737973387922,
"grad_norm": 0.2939363289247235,
"learning_rate": 7.342570187010913e-06,
"loss": 0.235,
"step": 1821
},
{
"epoch": 3.72978505629478,
"grad_norm": 0.3233466134307953,
"learning_rate": 7.32043014474227e-06,
"loss": 0.306,
"step": 1822
},
{
"epoch": 3.7318321392016376,
"grad_norm": 0.30575032489620757,
"learning_rate": 7.2983160526025854e-06,
"loss": 0.2719,
"step": 1823
},
{
"epoch": 3.7338792221084955,
"grad_norm": 0.297249466993302,
"learning_rate": 7.276227955850774e-06,
"loss": 0.235,
"step": 1824
},
{
"epoch": 3.735926305015353,
"grad_norm": 0.29688130457735584,
"learning_rate": 7.254165899692554e-06,
"loss": 0.2313,
"step": 1825
},
{
"epoch": 3.737973387922211,
"grad_norm": 0.315867334334272,
"learning_rate": 7.2321299292803275e-06,
"loss": 0.2554,
"step": 1826
},
{
"epoch": 3.7400204708290685,
"grad_norm": 0.35865826681363316,
"learning_rate": 7.210120089713117e-06,
"loss": 0.2657,
"step": 1827
},
{
"epoch": 3.742067553735926,
"grad_norm": 0.29556390069528465,
"learning_rate": 7.188136426036498e-06,
"loss": 0.2309,
"step": 1828
},
{
"epoch": 3.744114636642784,
"grad_norm": 0.3062750470379352,
"learning_rate": 7.166178983242425e-06,
"loss": 0.25,
"step": 1829
},
{
"epoch": 3.746161719549642,
"grad_norm": 0.3388760658555746,
"learning_rate": 7.1442478062692135e-06,
"loss": 0.2837,
"step": 1830
},
{
"epoch": 3.7482088024564995,
"grad_norm": 0.3198475086242932,
"learning_rate": 7.12234294000143e-06,
"loss": 0.2334,
"step": 1831
},
{
"epoch": 3.750255885363357,
"grad_norm": 0.3089175153993069,
"learning_rate": 7.100464429269769e-06,
"loss": 0.2647,
"step": 1832
},
{
"epoch": 3.752302968270215,
"grad_norm": 0.28196699295142263,
"learning_rate": 7.078612318850999e-06,
"loss": 0.2168,
"step": 1833
},
{
"epoch": 3.754350051177073,
"grad_norm": 0.297420997662474,
"learning_rate": 7.056786653467882e-06,
"loss": 0.2028,
"step": 1834
},
{
"epoch": 3.7563971340839304,
"grad_norm": 0.28045674846802615,
"learning_rate": 7.034987477789008e-06,
"loss": 0.1939,
"step": 1835
},
{
"epoch": 3.758444216990788,
"grad_norm": 0.311062703079177,
"learning_rate": 7.01321483642879e-06,
"loss": 0.2611,
"step": 1836
},
{
"epoch": 3.760491299897646,
"grad_norm": 0.297760397539936,
"learning_rate": 6.991468773947321e-06,
"loss": 0.2575,
"step": 1837
},
{
"epoch": 3.762538382804504,
"grad_norm": 0.2972100075369938,
"learning_rate": 6.969749334850308e-06,
"loss": 0.2088,
"step": 1838
},
{
"epoch": 3.7645854657113613,
"grad_norm": 0.3155194168733677,
"learning_rate": 6.948056563588943e-06,
"loss": 0.2469,
"step": 1839
},
{
"epoch": 3.766632548618219,
"grad_norm": 0.33253411719664255,
"learning_rate": 6.926390504559879e-06,
"loss": 0.2066,
"step": 1840
},
{
"epoch": 3.768679631525077,
"grad_norm": 0.31443745809454954,
"learning_rate": 6.90475120210508e-06,
"loss": 0.2462,
"step": 1841
},
{
"epoch": 3.7707267144319347,
"grad_norm": 0.2916854017686706,
"learning_rate": 6.883138700511735e-06,
"loss": 0.206,
"step": 1842
},
{
"epoch": 3.7727737973387923,
"grad_norm": 0.29896805378427554,
"learning_rate": 6.861553044012206e-06,
"loss": 0.2458,
"step": 1843
},
{
"epoch": 3.7748208802456498,
"grad_norm": 0.3070231834047609,
"learning_rate": 6.8399942767839075e-06,
"loss": 0.2375,
"step": 1844
},
{
"epoch": 3.7768679631525077,
"grad_norm": 0.35269614289266477,
"learning_rate": 6.818462442949203e-06,
"loss": 0.2354,
"step": 1845
},
{
"epoch": 3.7789150460593657,
"grad_norm": 0.2885923676811397,
"learning_rate": 6.796957586575364e-06,
"loss": 0.2669,
"step": 1846
},
{
"epoch": 3.780962128966223,
"grad_norm": 0.31946737683114335,
"learning_rate": 6.775479751674439e-06,
"loss": 0.2292,
"step": 1847
},
{
"epoch": 3.7830092118730807,
"grad_norm": 0.31027789747129964,
"learning_rate": 6.754028982203154e-06,
"loss": 0.2204,
"step": 1848
},
{
"epoch": 3.7850562947799387,
"grad_norm": 0.28163894937364203,
"learning_rate": 6.732605322062869e-06,
"loss": 0.2131,
"step": 1849
},
{
"epoch": 3.787103377686796,
"grad_norm": 0.32382848241213846,
"learning_rate": 6.711208815099451e-06,
"loss": 0.2349,
"step": 1850
},
{
"epoch": 3.789150460593654,
"grad_norm": 0.3138159392771698,
"learning_rate": 6.689839505103195e-06,
"loss": 0.2214,
"step": 1851
},
{
"epoch": 3.7911975435005116,
"grad_norm": 0.3336295131707369,
"learning_rate": 6.668497435808736e-06,
"loss": 0.2318,
"step": 1852
},
{
"epoch": 3.7932446264073696,
"grad_norm": 0.2805986894444167,
"learning_rate": 6.647182650894956e-06,
"loss": 0.1898,
"step": 1853
},
{
"epoch": 3.795291709314227,
"grad_norm": 0.311652087268774,
"learning_rate": 6.6258951939849055e-06,
"loss": 0.2244,
"step": 1854
},
{
"epoch": 3.797338792221085,
"grad_norm": 0.33194907539363816,
"learning_rate": 6.604635108645683e-06,
"loss": 0.272,
"step": 1855
},
{
"epoch": 3.7993858751279426,
"grad_norm": 0.2974799213005746,
"learning_rate": 6.583402438388391e-06,
"loss": 0.2967,
"step": 1856
},
{
"epoch": 3.8014329580348005,
"grad_norm": 0.31133998039092214,
"learning_rate": 6.562197226668015e-06,
"loss": 0.2587,
"step": 1857
},
{
"epoch": 3.803480040941658,
"grad_norm": 0.30821960779378943,
"learning_rate": 6.5410195168833425e-06,
"loss": 0.242,
"step": 1858
},
{
"epoch": 3.805527123848516,
"grad_norm": 0.30511264324428783,
"learning_rate": 6.519869352376878e-06,
"loss": 0.2318,
"step": 1859
},
{
"epoch": 3.8075742067553735,
"grad_norm": 0.2885137726399915,
"learning_rate": 6.498746776434759e-06,
"loss": 0.2412,
"step": 1860
},
{
"epoch": 3.8096212896622315,
"grad_norm": 0.30572665106073443,
"learning_rate": 6.477651832286633e-06,
"loss": 0.2714,
"step": 1861
},
{
"epoch": 3.811668372569089,
"grad_norm": 0.2999528878346436,
"learning_rate": 6.456584563105628e-06,
"loss": 0.225,
"step": 1862
},
{
"epoch": 3.813715455475947,
"grad_norm": 0.306996947567515,
"learning_rate": 6.435545012008213e-06,
"loss": 0.2228,
"step": 1863
},
{
"epoch": 3.8157625383828044,
"grad_norm": 0.30156526722849053,
"learning_rate": 6.414533222054138e-06,
"loss": 0.2271,
"step": 1864
},
{
"epoch": 3.8178096212896624,
"grad_norm": 0.3027861977559771,
"learning_rate": 6.393549236246333e-06,
"loss": 0.2358,
"step": 1865
},
{
"epoch": 3.81985670419652,
"grad_norm": 0.3100676988224775,
"learning_rate": 6.372593097530822e-06,
"loss": 0.2224,
"step": 1866
},
{
"epoch": 3.821903787103378,
"grad_norm": 0.30881142890916535,
"learning_rate": 6.3516648487966456e-06,
"loss": 0.212,
"step": 1867
},
{
"epoch": 3.8239508700102354,
"grad_norm": 0.3074430788928016,
"learning_rate": 6.330764532875748e-06,
"loss": 0.2559,
"step": 1868
},
{
"epoch": 3.825997952917093,
"grad_norm": 0.29993534561417257,
"learning_rate": 6.309892192542919e-06,
"loss": 0.2414,
"step": 1869
},
{
"epoch": 3.828045035823951,
"grad_norm": 0.2948226082874635,
"learning_rate": 6.289047870515692e-06,
"loss": 0.2634,
"step": 1870
},
{
"epoch": 3.830092118730809,
"grad_norm": 0.35876524638729923,
"learning_rate": 6.268231609454254e-06,
"loss": 0.3221,
"step": 1871
},
{
"epoch": 3.8321392016376663,
"grad_norm": 0.29964527729201973,
"learning_rate": 6.247443451961366e-06,
"loss": 0.2046,
"step": 1872
},
{
"epoch": 3.834186284544524,
"grad_norm": 0.32365319216721733,
"learning_rate": 6.226683440582268e-06,
"loss": 0.2233,
"step": 1873
},
{
"epoch": 3.8362333674513818,
"grad_norm": 0.3064343796948702,
"learning_rate": 6.2059516178046064e-06,
"loss": 0.2548,
"step": 1874
},
{
"epoch": 3.8382804503582397,
"grad_norm": 0.31074022748353647,
"learning_rate": 6.185248026058312e-06,
"loss": 0.2682,
"step": 1875
},
{
"epoch": 3.8403275332650972,
"grad_norm": 0.3001350448200465,
"learning_rate": 6.164572707715564e-06,
"loss": 0.2287,
"step": 1876
},
{
"epoch": 3.8423746161719547,
"grad_norm": 0.3230753170136733,
"learning_rate": 6.143925705090666e-06,
"loss": 0.3244,
"step": 1877
},
{
"epoch": 3.8444216990788127,
"grad_norm": 0.2998343099705012,
"learning_rate": 6.123307060439967e-06,
"loss": 0.2163,
"step": 1878
},
{
"epoch": 3.8464687819856707,
"grad_norm": 0.3171615427794975,
"learning_rate": 6.102716815961787e-06,
"loss": 0.2126,
"step": 1879
},
{
"epoch": 3.848515864892528,
"grad_norm": 0.30038516037265045,
"learning_rate": 6.082155013796323e-06,
"loss": 0.2197,
"step": 1880
},
{
"epoch": 3.8505629477993857,
"grad_norm": 0.2706760721698934,
"learning_rate": 6.061621696025539e-06,
"loss": 0.203,
"step": 1881
},
{
"epoch": 3.8526100307062436,
"grad_norm": 0.3019695460717632,
"learning_rate": 6.041116904673125e-06,
"loss": 0.243,
"step": 1882
},
{
"epoch": 3.8546571136131016,
"grad_norm": 0.3238748626818905,
"learning_rate": 6.020640681704402e-06,
"loss": 0.2387,
"step": 1883
},
{
"epoch": 3.856704196519959,
"grad_norm": 0.2845226343722909,
"learning_rate": 6.000193069026181e-06,
"loss": 0.1939,
"step": 1884
},
{
"epoch": 3.8587512794268166,
"grad_norm": 0.30062466940994315,
"learning_rate": 5.979774108486751e-06,
"loss": 0.2417,
"step": 1885
},
{
"epoch": 3.8607983623336746,
"grad_norm": 0.29657229767791343,
"learning_rate": 5.95938384187575e-06,
"loss": 0.2513,
"step": 1886
},
{
"epoch": 3.862845445240532,
"grad_norm": 0.2965578800271511,
"learning_rate": 5.939022310924099e-06,
"loss": 0.238,
"step": 1887
},
{
"epoch": 3.86489252814739,
"grad_norm": 0.28944984642689814,
"learning_rate": 5.918689557303885e-06,
"loss": 0.233,
"step": 1888
},
{
"epoch": 3.8669396110542475,
"grad_norm": 0.3405343709910588,
"learning_rate": 5.898385622628336e-06,
"loss": 0.2568,
"step": 1889
},
{
"epoch": 3.8689866939611055,
"grad_norm": 0.31192910196034185,
"learning_rate": 5.878110548451675e-06,
"loss": 0.255,
"step": 1890
},
{
"epoch": 3.871033776867963,
"grad_norm": 0.3090737941594832,
"learning_rate": 5.857864376269051e-06,
"loss": 0.2356,
"step": 1891
},
{
"epoch": 3.873080859774821,
"grad_norm": 0.27438554656264486,
"learning_rate": 5.837647147516483e-06,
"loss": 0.19,
"step": 1892
},
{
"epoch": 3.8751279426816785,
"grad_norm": 0.3021169809249445,
"learning_rate": 5.817458903570747e-06,
"loss": 0.2248,
"step": 1893
},
{
"epoch": 3.8771750255885364,
"grad_norm": 0.3213447653550202,
"learning_rate": 5.7972996857492896e-06,
"loss": 0.2269,
"step": 1894
},
{
"epoch": 3.879222108495394,
"grad_norm": 0.33058912837962084,
"learning_rate": 5.777169535310152e-06,
"loss": 0.2359,
"step": 1895
},
{
"epoch": 3.881269191402252,
"grad_norm": 0.28572806654091426,
"learning_rate": 5.7570684934519135e-06,
"loss": 0.2147,
"step": 1896
},
{
"epoch": 3.8833162743091094,
"grad_norm": 0.32405838839318063,
"learning_rate": 5.736996601313545e-06,
"loss": 0.25,
"step": 1897
},
{
"epoch": 3.8853633572159674,
"grad_norm": 0.31407112083239963,
"learning_rate": 5.716953899974371e-06,
"loss": 0.2628,
"step": 1898
},
{
"epoch": 3.887410440122825,
"grad_norm": 0.24848541316900655,
"learning_rate": 5.696940430453981e-06,
"loss": 0.1664,
"step": 1899
},
{
"epoch": 3.889457523029683,
"grad_norm": 0.3315987651736212,
"learning_rate": 5.676956233712139e-06,
"loss": 0.2637,
"step": 1900
},
{
"epoch": 3.8915046059365404,
"grad_norm": 0.2855594139469349,
"learning_rate": 5.657001350648674e-06,
"loss": 0.207,
"step": 1901
},
{
"epoch": 3.8935516888433983,
"grad_norm": 0.3282465345015928,
"learning_rate": 5.6370758221034595e-06,
"loss": 0.2535,
"step": 1902
},
{
"epoch": 3.895598771750256,
"grad_norm": 0.30475560401896684,
"learning_rate": 5.617179688856271e-06,
"loss": 0.2432,
"step": 1903
},
{
"epoch": 3.8976458546571138,
"grad_norm": 0.30871123340361434,
"learning_rate": 5.597312991626713e-06,
"loss": 0.2134,
"step": 1904
},
{
"epoch": 3.8996929375639713,
"grad_norm": 0.33995724036296693,
"learning_rate": 5.577475771074168e-06,
"loss": 0.2485,
"step": 1905
},
{
"epoch": 3.901740020470829,
"grad_norm": 0.2811551664838514,
"learning_rate": 5.557668067797677e-06,
"loss": 0.2453,
"step": 1906
},
{
"epoch": 3.9037871033776868,
"grad_norm": 0.3085482770411968,
"learning_rate": 5.537889922335877e-06,
"loss": 0.2841,
"step": 1907
},
{
"epoch": 3.9058341862845447,
"grad_norm": 0.291552651807712,
"learning_rate": 5.5181413751669125e-06,
"loss": 0.2519,
"step": 1908
},
{
"epoch": 3.907881269191402,
"grad_norm": 0.301795002796971,
"learning_rate": 5.498422466708349e-06,
"loss": 0.2157,
"step": 1909
},
{
"epoch": 3.9099283520982597,
"grad_norm": 0.33010524936332025,
"learning_rate": 5.478733237317084e-06,
"loss": 0.2272,
"step": 1910
},
{
"epoch": 3.9119754350051177,
"grad_norm": 0.30279565810368547,
"learning_rate": 5.459073727289291e-06,
"loss": 0.2462,
"step": 1911
},
{
"epoch": 3.9140225179119756,
"grad_norm": 0.3180781810526128,
"learning_rate": 5.439443976860306e-06,
"loss": 0.2358,
"step": 1912
},
{
"epoch": 3.916069600818833,
"grad_norm": 0.3040038712809124,
"learning_rate": 5.419844026204568e-06,
"loss": 0.2266,
"step": 1913
},
{
"epoch": 3.9181166837256907,
"grad_norm": 0.31115682004986517,
"learning_rate": 5.400273915435526e-06,
"loss": 0.2706,
"step": 1914
},
{
"epoch": 3.9201637666325486,
"grad_norm": 0.2723072282278328,
"learning_rate": 5.38073368460555e-06,
"loss": 0.2172,
"step": 1915
},
{
"epoch": 3.9222108495394066,
"grad_norm": 0.3236989253933042,
"learning_rate": 5.361223373705873e-06,
"loss": 0.2671,
"step": 1916
},
{
"epoch": 3.924257932446264,
"grad_norm": 0.2949286813519759,
"learning_rate": 5.341743022666468e-06,
"loss": 0.2575,
"step": 1917
},
{
"epoch": 3.9263050153531216,
"grad_norm": 0.2929617720522718,
"learning_rate": 5.32229267135602e-06,
"loss": 0.2477,
"step": 1918
},
{
"epoch": 3.9283520982599796,
"grad_norm": 0.29944445931066044,
"learning_rate": 5.302872359581799e-06,
"loss": 0.3016,
"step": 1919
},
{
"epoch": 3.9303991811668375,
"grad_norm": 0.2792530693391496,
"learning_rate": 5.283482127089603e-06,
"loss": 0.2116,
"step": 1920
},
{
"epoch": 3.932446264073695,
"grad_norm": 0.3148045265972349,
"learning_rate": 5.2641220135636685e-06,
"loss": 0.251,
"step": 1921
},
{
"epoch": 3.9344933469805525,
"grad_norm": 0.29519836172160036,
"learning_rate": 5.244792058626587e-06,
"loss": 0.2379,
"step": 1922
},
{
"epoch": 3.9365404298874105,
"grad_norm": 0.27640580348684785,
"learning_rate": 5.2254923018392344e-06,
"loss": 0.2094,
"step": 1923
},
{
"epoch": 3.9385875127942684,
"grad_norm": 0.3216119033954627,
"learning_rate": 5.206222782700667e-06,
"loss": 0.2226,
"step": 1924
},
{
"epoch": 3.940634595701126,
"grad_norm": 0.28724430058767236,
"learning_rate": 5.186983540648074e-06,
"loss": 0.229,
"step": 1925
},
{
"epoch": 3.9426816786079835,
"grad_norm": 0.30838073757141554,
"learning_rate": 5.167774615056669e-06,
"loss": 0.2471,
"step": 1926
},
{
"epoch": 3.9447287615148414,
"grad_norm": 0.3050237811038075,
"learning_rate": 5.1485960452396266e-06,
"loss": 0.2516,
"step": 1927
},
{
"epoch": 3.946775844421699,
"grad_norm": 0.3269582811451654,
"learning_rate": 5.1294478704479896e-06,
"loss": 0.2757,
"step": 1928
},
{
"epoch": 3.948822927328557,
"grad_norm": 0.32559352766006666,
"learning_rate": 5.1103301298705995e-06,
"loss": 0.2234,
"step": 1929
},
{
"epoch": 3.9508700102354144,
"grad_norm": 0.3018696607635989,
"learning_rate": 5.091242862634e-06,
"loss": 0.2112,
"step": 1930
},
{
"epoch": 3.9529170931422724,
"grad_norm": 0.28807273157869273,
"learning_rate": 5.072186107802377e-06,
"loss": 0.2698,
"step": 1931
},
{
"epoch": 3.95496417604913,
"grad_norm": 0.324456956245341,
"learning_rate": 5.05315990437747e-06,
"loss": 0.2451,
"step": 1932
},
{
"epoch": 3.957011258955988,
"grad_norm": 0.314429989537044,
"learning_rate": 5.0341642912984844e-06,
"loss": 0.2311,
"step": 1933
},
{
"epoch": 3.9590583418628453,
"grad_norm": 0.2926963524885184,
"learning_rate": 5.015199307442027e-06,
"loss": 0.2418,
"step": 1934
},
{
"epoch": 3.9611054247697033,
"grad_norm": 0.30442791024762783,
"learning_rate": 4.996264991622015e-06,
"loss": 0.2513,
"step": 1935
},
{
"epoch": 3.963152507676561,
"grad_norm": 0.30348062095693085,
"learning_rate": 4.977361382589607e-06,
"loss": 0.217,
"step": 1936
},
{
"epoch": 3.9651995905834188,
"grad_norm": 0.2876460419928197,
"learning_rate": 4.958488519033096e-06,
"loss": 0.2143,
"step": 1937
},
{
"epoch": 3.9672466734902763,
"grad_norm": 0.3075426518738084,
"learning_rate": 4.939646439577868e-06,
"loss": 0.2275,
"step": 1938
},
{
"epoch": 3.969293756397134,
"grad_norm": 0.3060349427123557,
"learning_rate": 4.920835182786316e-06,
"loss": 0.2371,
"step": 1939
},
{
"epoch": 3.9713408393039917,
"grad_norm": 0.31280082710142615,
"learning_rate": 4.9020547871577265e-06,
"loss": 0.2234,
"step": 1940
},
{
"epoch": 3.9733879222108497,
"grad_norm": 0.3134880106721245,
"learning_rate": 4.8833052911282375e-06,
"loss": 0.2492,
"step": 1941
},
{
"epoch": 3.975435005117707,
"grad_norm": 0.2953871360194199,
"learning_rate": 4.864586733070755e-06,
"loss": 0.221,
"step": 1942
},
{
"epoch": 3.977482088024565,
"grad_norm": 0.31671009607399275,
"learning_rate": 4.845899151294848e-06,
"loss": 0.2335,
"step": 1943
},
{
"epoch": 3.9795291709314227,
"grad_norm": 0.2908681144449216,
"learning_rate": 4.827242584046698e-06,
"loss": 0.2767,
"step": 1944
},
{
"epoch": 3.9815762538382806,
"grad_norm": 0.2931420599842891,
"learning_rate": 4.808617069509034e-06,
"loss": 0.2475,
"step": 1945
},
{
"epoch": 3.983623336745138,
"grad_norm": 0.2955583206381109,
"learning_rate": 4.790022645800994e-06,
"loss": 0.2737,
"step": 1946
},
{
"epoch": 3.9856704196519956,
"grad_norm": 0.2873161139594403,
"learning_rate": 4.77145935097811e-06,
"loss": 0.2116,
"step": 1947
},
{
"epoch": 3.9877175025588536,
"grad_norm": 0.30419560839821447,
"learning_rate": 4.752927223032196e-06,
"loss": 0.2261,
"step": 1948
},
{
"epoch": 3.9897645854657116,
"grad_norm": 0.3199774705798064,
"learning_rate": 4.7344262998912885e-06,
"loss": 0.2478,
"step": 1949
},
{
"epoch": 3.991811668372569,
"grad_norm": 0.31331370811795206,
"learning_rate": 4.715956619419539e-06,
"loss": 0.2427,
"step": 1950
},
{
"epoch": 3.9938587512794266,
"grad_norm": 0.3035495734347721,
"learning_rate": 4.697518219417188e-06,
"loss": 0.2412,
"step": 1951
},
{
"epoch": 3.9959058341862845,
"grad_norm": 0.2835191629580971,
"learning_rate": 4.679111137620442e-06,
"loss": 0.2054,
"step": 1952
},
{
"epoch": 3.9979529170931425,
"grad_norm": 0.29245602269669696,
"learning_rate": 4.660735411701398e-06,
"loss": 0.2047,
"step": 1953
},
{
"epoch": 4.0,
"grad_norm": 0.7074262659570464,
"learning_rate": 4.6423910792680005e-06,
"loss": 0.2856,
"step": 1954
},
{
"epoch": 4.0020470829068575,
"grad_norm": 0.4712597575605154,
"learning_rate": 4.62407817786394e-06,
"loss": 0.1814,
"step": 1955
},
{
"epoch": 4.004094165813715,
"grad_norm": 0.38151984418088236,
"learning_rate": 4.605796744968556e-06,
"loss": 0.1724,
"step": 1956
},
{
"epoch": 4.006141248720573,
"grad_norm": 0.34020967642434874,
"learning_rate": 4.587546817996826e-06,
"loss": 0.149,
"step": 1957
},
{
"epoch": 4.008188331627431,
"grad_norm": 0.31262949437875953,
"learning_rate": 4.56932843429922e-06,
"loss": 0.161,
"step": 1958
},
{
"epoch": 4.0102354145342884,
"grad_norm": 0.4303302435498056,
"learning_rate": 4.551141631161651e-06,
"loss": 0.1656,
"step": 1959
},
{
"epoch": 4.012282497441146,
"grad_norm": 0.4285520812331891,
"learning_rate": 4.532986445805405e-06,
"loss": 0.1627,
"step": 1960
},
{
"epoch": 4.014329580348004,
"grad_norm": 0.4192066755161973,
"learning_rate": 4.514862915387059e-06,
"loss": 0.1827,
"step": 1961
},
{
"epoch": 4.016376663254862,
"grad_norm": 0.3658567404594509,
"learning_rate": 4.496771076998405e-06,
"loss": 0.2017,
"step": 1962
},
{
"epoch": 4.018423746161719,
"grad_norm": 0.3317640866707838,
"learning_rate": 4.478710967666371e-06,
"loss": 0.1817,
"step": 1963
},
{
"epoch": 4.020470829068577,
"grad_norm": 0.38931119420729465,
"learning_rate": 4.460682624352952e-06,
"loss": 0.1582,
"step": 1964
},
{
"epoch": 4.022517911975435,
"grad_norm": 0.4025748777608708,
"learning_rate": 4.442686083955132e-06,
"loss": 0.1692,
"step": 1965
},
{
"epoch": 4.024564994882293,
"grad_norm": 0.3465330209700674,
"learning_rate": 4.424721383304791e-06,
"loss": 0.1623,
"step": 1966
},
{
"epoch": 4.02661207778915,
"grad_norm": 0.33474185562540504,
"learning_rate": 4.4067885591686625e-06,
"loss": 0.1629,
"step": 1967
},
{
"epoch": 4.028659160696008,
"grad_norm": 0.2994162000223648,
"learning_rate": 4.388887648248237e-06,
"loss": 0.1943,
"step": 1968
},
{
"epoch": 4.030706243602866,
"grad_norm": 0.30244570971558177,
"learning_rate": 4.371018687179689e-06,
"loss": 0.2009,
"step": 1969
},
{
"epoch": 4.032753326509724,
"grad_norm": 0.31048799941390864,
"learning_rate": 4.353181712533807e-06,
"loss": 0.1763,
"step": 1970
},
{
"epoch": 4.034800409416581,
"grad_norm": 0.31183964535945935,
"learning_rate": 4.3353767608159125e-06,
"loss": 0.1695,
"step": 1971
},
{
"epoch": 4.036847492323439,
"grad_norm": 0.31495339654929116,
"learning_rate": 4.317603868465794e-06,
"loss": 0.156,
"step": 1972
},
{
"epoch": 4.038894575230297,
"grad_norm": 0.2985528590711877,
"learning_rate": 4.299863071857617e-06,
"loss": 0.1687,
"step": 1973
},
{
"epoch": 4.040941658137155,
"grad_norm": 0.2898165046108191,
"learning_rate": 4.2821544072998655e-06,
"loss": 0.1689,
"step": 1974
},
{
"epoch": 4.042988741044012,
"grad_norm": 0.2806552847148077,
"learning_rate": 4.264477911035265e-06,
"loss": 0.1463,
"step": 1975
},
{
"epoch": 4.04503582395087,
"grad_norm": 0.2973917660709716,
"learning_rate": 4.246833619240702e-06,
"loss": 0.1452,
"step": 1976
},
{
"epoch": 4.047082906857728,
"grad_norm": 0.3137432130391123,
"learning_rate": 4.229221568027151e-06,
"loss": 0.1821,
"step": 1977
},
{
"epoch": 4.049129989764586,
"grad_norm": 0.29853330078810963,
"learning_rate": 4.211641793439609e-06,
"loss": 0.168,
"step": 1978
},
{
"epoch": 4.051177072671443,
"grad_norm": 0.3165013554197021,
"learning_rate": 4.194094331457004e-06,
"loss": 0.1753,
"step": 1979
},
{
"epoch": 4.053224155578301,
"grad_norm": 0.30677661034631387,
"learning_rate": 4.176579217992143e-06,
"loss": 0.1525,
"step": 1980
},
{
"epoch": 4.055271238485159,
"grad_norm": 0.29846779652611327,
"learning_rate": 4.159096488891623e-06,
"loss": 0.1559,
"step": 1981
},
{
"epoch": 4.0573183213920165,
"grad_norm": 0.3014892204744538,
"learning_rate": 4.1416461799357675e-06,
"loss": 0.1732,
"step": 1982
},
{
"epoch": 4.059365404298874,
"grad_norm": 0.29810838609292906,
"learning_rate": 4.124228326838544e-06,
"loss": 0.1442,
"step": 1983
},
{
"epoch": 4.061412487205732,
"grad_norm": 0.28754100275614525,
"learning_rate": 4.106842965247497e-06,
"loss": 0.1587,
"step": 1984
},
{
"epoch": 4.06345957011259,
"grad_norm": 0.28330676946462274,
"learning_rate": 4.0894901307436805e-06,
"loss": 0.1697,
"step": 1985
},
{
"epoch": 4.0655066530194475,
"grad_norm": 0.285301416935174,
"learning_rate": 4.072169858841561e-06,
"loss": 0.1595,
"step": 1986
},
{
"epoch": 4.067553735926305,
"grad_norm": 0.2837041098864574,
"learning_rate": 4.054882184988971e-06,
"loss": 0.1871,
"step": 1987
},
{
"epoch": 4.0696008188331625,
"grad_norm": 0.28867050594006555,
"learning_rate": 4.0376271445670465e-06,
"loss": 0.1805,
"step": 1988
},
{
"epoch": 4.071647901740021,
"grad_norm": 0.27159571765823515,
"learning_rate": 4.020404772890101e-06,
"loss": 0.1782,
"step": 1989
},
{
"epoch": 4.073694984646878,
"grad_norm": 0.29070033660022687,
"learning_rate": 4.003215105205613e-06,
"loss": 0.1971,
"step": 1990
},
{
"epoch": 4.075742067553736,
"grad_norm": 0.29431870068785426,
"learning_rate": 3.986058176694123e-06,
"loss": 0.1836,
"step": 1991
},
{
"epoch": 4.077789150460593,
"grad_norm": 0.30901872319472756,
"learning_rate": 3.968934022469157e-06,
"loss": 0.1615,
"step": 1992
},
{
"epoch": 4.079836233367452,
"grad_norm": 0.26872512623266737,
"learning_rate": 3.951842677577171e-06,
"loss": 0.1571,
"step": 1993
},
{
"epoch": 4.081883316274309,
"grad_norm": 0.277601430071484,
"learning_rate": 3.9347841769974925e-06,
"loss": 0.1993,
"step": 1994
},
{
"epoch": 4.083930399181167,
"grad_norm": 0.2815481603482528,
"learning_rate": 3.917758555642195e-06,
"loss": 0.1776,
"step": 1995
},
{
"epoch": 4.085977482088024,
"grad_norm": 0.31199877087236383,
"learning_rate": 3.900765848356083e-06,
"loss": 0.1807,
"step": 1996
},
{
"epoch": 4.088024564994882,
"grad_norm": 0.3049249771323251,
"learning_rate": 3.883806089916593e-06,
"loss": 0.1738,
"step": 1997
},
{
"epoch": 4.09007164790174,
"grad_norm": 0.29235847173840074,
"learning_rate": 3.866879315033738e-06,
"loss": 0.178,
"step": 1998
},
{
"epoch": 4.092118730808598,
"grad_norm": 0.2849050552270467,
"learning_rate": 3.849985558349998e-06,
"loss": 0.1679,
"step": 1999
},
{
"epoch": 4.094165813715455,
"grad_norm": 0.2977669293917896,
"learning_rate": 3.8331248544403135e-06,
"loss": 0.1556,
"step": 2000
},
{
"epoch": 4.096212896622313,
"grad_norm": 0.2927932662986805,
"learning_rate": 3.8162972378119635e-06,
"loss": 0.1894,
"step": 2001
},
{
"epoch": 4.098259979529171,
"grad_norm": 0.29328702976060467,
"learning_rate": 3.799502742904497e-06,
"loss": 0.173,
"step": 2002
},
{
"epoch": 4.100307062436029,
"grad_norm": 0.2969906241666511,
"learning_rate": 3.7827414040896958e-06,
"loss": 0.1812,
"step": 2003
},
{
"epoch": 4.102354145342886,
"grad_norm": 0.32948789795871786,
"learning_rate": 3.766013255671479e-06,
"loss": 0.1825,
"step": 2004
},
{
"epoch": 4.104401228249744,
"grad_norm": 0.28692228882067206,
"learning_rate": 3.749318331885825e-06,
"loss": 0.1654,
"step": 2005
},
{
"epoch": 4.106448311156602,
"grad_norm": 0.27400445663207174,
"learning_rate": 3.7326566669007268e-06,
"loss": 0.1913,
"step": 2006
},
{
"epoch": 4.10849539406346,
"grad_norm": 0.30606472444023525,
"learning_rate": 3.716028294816119e-06,
"loss": 0.1545,
"step": 2007
},
{
"epoch": 4.110542476970317,
"grad_norm": 0.3123611845587673,
"learning_rate": 3.699433249663775e-06,
"loss": 0.201,
"step": 2008
},
{
"epoch": 4.112589559877175,
"grad_norm": 0.31139724775462246,
"learning_rate": 3.6828715654072776e-06,
"loss": 0.1618,
"step": 2009
},
{
"epoch": 4.114636642784033,
"grad_norm": 0.29336426025625717,
"learning_rate": 3.666343275941926e-06,
"loss": 0.1903,
"step": 2010
},
{
"epoch": 4.116683725690891,
"grad_norm": 0.2893384058971807,
"learning_rate": 3.649848415094681e-06,
"loss": 0.17,
"step": 2011
},
{
"epoch": 4.118730808597748,
"grad_norm": 0.29125235390587384,
"learning_rate": 3.6333870166240703e-06,
"loss": 0.1996,
"step": 2012
},
{
"epoch": 4.120777891504606,
"grad_norm": 0.29871612249436,
"learning_rate": 3.616959114220162e-06,
"loss": 0.1985,
"step": 2013
},
{
"epoch": 4.122824974411464,
"grad_norm": 0.271555858016362,
"learning_rate": 3.60056474150446e-06,
"loss": 0.1818,
"step": 2014
},
{
"epoch": 4.1248720573183215,
"grad_norm": 0.3012910833873169,
"learning_rate": 3.5842039320298327e-06,
"loss": 0.1414,
"step": 2015
},
{
"epoch": 4.126919140225179,
"grad_norm": 0.27667654273766706,
"learning_rate": 3.5678767192804764e-06,
"loss": 0.1882,
"step": 2016
},
{
"epoch": 4.1289662231320365,
"grad_norm": 0.2869593416102505,
"learning_rate": 3.551583136671817e-06,
"loss": 0.1906,
"step": 2017
},
{
"epoch": 4.131013306038895,
"grad_norm": 0.2822576095654785,
"learning_rate": 3.5353232175504614e-06,
"loss": 0.1828,
"step": 2018
},
{
"epoch": 4.1330603889457525,
"grad_norm": 0.3121511458643644,
"learning_rate": 3.5190969951941113e-06,
"loss": 0.161,
"step": 2019
},
{
"epoch": 4.13510747185261,
"grad_norm": 0.2829822535321191,
"learning_rate": 3.5029045028115105e-06,
"loss": 0.1514,
"step": 2020
},
{
"epoch": 4.1371545547594675,
"grad_norm": 0.2837746021156371,
"learning_rate": 3.486745773542375e-06,
"loss": 0.1935,
"step": 2021
},
{
"epoch": 4.139201637666326,
"grad_norm": 0.3025896496647199,
"learning_rate": 3.470620840457304e-06,
"loss": 0.208,
"step": 2022
},
{
"epoch": 4.141248720573183,
"grad_norm": 0.28695976072627655,
"learning_rate": 3.4545297365577437e-06,
"loss": 0.18,
"step": 2023
},
{
"epoch": 4.143295803480041,
"grad_norm": 0.29084046593921437,
"learning_rate": 3.438472494775902e-06,
"loss": 0.1797,
"step": 2024
},
{
"epoch": 4.145342886386898,
"grad_norm": 0.2964669445537806,
"learning_rate": 3.4224491479746822e-06,
"loss": 0.2066,
"step": 2025
},
{
"epoch": 4.147389969293757,
"grad_norm": 0.2735467093182229,
"learning_rate": 3.406459728947622e-06,
"loss": 0.1805,
"step": 2026
},
{
"epoch": 4.149437052200614,
"grad_norm": 0.2883440394869004,
"learning_rate": 3.390504270418822e-06,
"loss": 0.1935,
"step": 2027
},
{
"epoch": 4.151484135107472,
"grad_norm": 0.28347253308933157,
"learning_rate": 3.3745828050428675e-06,
"loss": 0.2042,
"step": 2028
},
{
"epoch": 4.153531218014329,
"grad_norm": 0.287714374967884,
"learning_rate": 3.358695365404785e-06,
"loss": 0.1965,
"step": 2029
},
{
"epoch": 4.155578300921187,
"grad_norm": 0.29340681821156966,
"learning_rate": 3.3428419840199623e-06,
"loss": 0.1603,
"step": 2030
},
{
"epoch": 4.157625383828045,
"grad_norm": 0.2727874360906274,
"learning_rate": 3.327022693334083e-06,
"loss": 0.155,
"step": 2031
},
{
"epoch": 4.159672466734903,
"grad_norm": 0.30308908503638726,
"learning_rate": 3.3112375257230547e-06,
"loss": 0.1909,
"step": 2032
},
{
"epoch": 4.16171954964176,
"grad_norm": 0.2782743108107041,
"learning_rate": 3.295486513492954e-06,
"loss": 0.1912,
"step": 2033
},
{
"epoch": 4.163766632548619,
"grad_norm": 0.2960778023128933,
"learning_rate": 3.279769688879959e-06,
"loss": 0.1706,
"step": 2034
},
{
"epoch": 4.165813715455476,
"grad_norm": 0.2855041966453197,
"learning_rate": 3.2640870840502646e-06,
"loss": 0.1812,
"step": 2035
},
{
"epoch": 4.167860798362334,
"grad_norm": 0.2982604591341762,
"learning_rate": 3.2484387311000364e-06,
"loss": 0.1651,
"step": 2036
},
{
"epoch": 4.169907881269191,
"grad_norm": 0.2957570058273647,
"learning_rate": 3.2328246620553605e-06,
"loss": 0.1632,
"step": 2037
},
{
"epoch": 4.171954964176049,
"grad_norm": 0.2951616591774616,
"learning_rate": 3.2172449088721235e-06,
"loss": 0.1624,
"step": 2038
},
{
"epoch": 4.174002047082907,
"grad_norm": 0.2851463082934784,
"learning_rate": 3.2016995034360045e-06,
"loss": 0.1808,
"step": 2039
},
{
"epoch": 4.176049129989765,
"grad_norm": 0.2845084237530719,
"learning_rate": 3.186188477562382e-06,
"loss": 0.1786,
"step": 2040
},
{
"epoch": 4.178096212896622,
"grad_norm": 0.30650921370752987,
"learning_rate": 3.1707118629962607e-06,
"loss": 0.1472,
"step": 2041
},
{
"epoch": 4.18014329580348,
"grad_norm": 0.2777664159680818,
"learning_rate": 3.1552696914122327e-06,
"loss": 0.1531,
"step": 2042
},
{
"epoch": 4.182190378710338,
"grad_norm": 0.28828599604321864,
"learning_rate": 3.139861994414397e-06,
"loss": 0.1845,
"step": 2043
},
{
"epoch": 4.184237461617196,
"grad_norm": 0.30773699136825977,
"learning_rate": 3.1244888035362875e-06,
"loss": 0.1769,
"step": 2044
},
{
"epoch": 4.186284544524053,
"grad_norm": 0.3155552426235748,
"learning_rate": 3.1091501502408293e-06,
"loss": 0.1643,
"step": 2045
},
{
"epoch": 4.188331627430911,
"grad_norm": 0.2898415631190067,
"learning_rate": 3.093846065920254e-06,
"loss": 0.168,
"step": 2046
},
{
"epoch": 4.190378710337769,
"grad_norm": 0.3080290062858047,
"learning_rate": 3.0785765818960534e-06,
"loss": 0.215,
"step": 2047
},
{
"epoch": 4.1924257932446265,
"grad_norm": 0.26531974258516,
"learning_rate": 3.0633417294188896e-06,
"loss": 0.1902,
"step": 2048
},
{
"epoch": 4.194472876151484,
"grad_norm": 0.29101494010309975,
"learning_rate": 3.0481415396685564e-06,
"loss": 0.1851,
"step": 2049
},
{
"epoch": 4.1965199590583415,
"grad_norm": 0.2805440252695212,
"learning_rate": 3.0329760437539233e-06,
"loss": 0.2106,
"step": 2050
},
{
"epoch": 4.1985670419652,
"grad_norm": 0.2954059617809206,
"learning_rate": 3.017845272712825e-06,
"loss": 0.178,
"step": 2051
},
{
"epoch": 4.200614124872057,
"grad_norm": 0.2750668596805512,
"learning_rate": 3.0027492575120453e-06,
"loss": 0.1639,
"step": 2052
},
{
"epoch": 4.202661207778915,
"grad_norm": 0.2954652665384484,
"learning_rate": 2.9876880290472376e-06,
"loss": 0.1654,
"step": 2053
},
{
"epoch": 4.2047082906857725,
"grad_norm": 0.2844611490184932,
"learning_rate": 2.9726616181428515e-06,
"loss": 0.1824,
"step": 2054
},
{
"epoch": 4.206755373592631,
"grad_norm": 0.28973919117898683,
"learning_rate": 2.957670055552078e-06,
"loss": 0.1785,
"step": 2055
},
{
"epoch": 4.208802456499488,
"grad_norm": 0.3065738121168008,
"learning_rate": 2.942713371956809e-06,
"loss": 0.1466,
"step": 2056
},
{
"epoch": 4.210849539406346,
"grad_norm": 0.27595990092382205,
"learning_rate": 2.927791597967522e-06,
"loss": 0.1674,
"step": 2057
},
{
"epoch": 4.212896622313203,
"grad_norm": 0.2787886162407018,
"learning_rate": 2.9129047641232653e-06,
"loss": 0.1739,
"step": 2058
},
{
"epoch": 4.214943705220062,
"grad_norm": 0.2835601286496118,
"learning_rate": 2.8980529008915793e-06,
"loss": 0.1851,
"step": 2059
},
{
"epoch": 4.216990788126919,
"grad_norm": 0.2734235721997136,
"learning_rate": 2.8832360386684287e-06,
"loss": 0.1894,
"step": 2060
},
{
"epoch": 4.219037871033777,
"grad_norm": 0.313282561765329,
"learning_rate": 2.8684542077781376e-06,
"loss": 0.1844,
"step": 2061
},
{
"epoch": 4.221084953940634,
"grad_norm": 0.29657107157085116,
"learning_rate": 2.853707438473352e-06,
"loss": 0.1861,
"step": 2062
},
{
"epoch": 4.223132036847493,
"grad_norm": 0.2835879428086791,
"learning_rate": 2.838995760934953e-06,
"loss": 0.1992,
"step": 2063
},
{
"epoch": 4.22517911975435,
"grad_norm": 0.28406764855143096,
"learning_rate": 2.8243192052719902e-06,
"loss": 0.1743,
"step": 2064
},
{
"epoch": 4.227226202661208,
"grad_norm": 0.2735358759641007,
"learning_rate": 2.8096778015216484e-06,
"loss": 0.1663,
"step": 2065
},
{
"epoch": 4.229273285568065,
"grad_norm": 0.28994987108300085,
"learning_rate": 2.7950715796491623e-06,
"loss": 0.1693,
"step": 2066
},
{
"epoch": 4.231320368474924,
"grad_norm": 0.2918242884998421,
"learning_rate": 2.7805005695477704e-06,
"loss": 0.1659,
"step": 2067
},
{
"epoch": 4.233367451381781,
"grad_norm": 0.2914555807372611,
"learning_rate": 2.7659648010386365e-06,
"loss": 0.2082,
"step": 2068
},
{
"epoch": 4.235414534288639,
"grad_norm": 0.31049832846223957,
"learning_rate": 2.75146430387081e-06,
"loss": 0.1745,
"step": 2069
},
{
"epoch": 4.237461617195496,
"grad_norm": 0.2786106859690272,
"learning_rate": 2.736999107721137e-06,
"loss": 0.1689,
"step": 2070
},
{
"epoch": 4.239508700102354,
"grad_norm": 0.2813229103286878,
"learning_rate": 2.7225692421942306e-06,
"loss": 0.1938,
"step": 2071
},
{
"epoch": 4.241555783009212,
"grad_norm": 0.2911201694179184,
"learning_rate": 2.7081747368223953e-06,
"loss": 0.1835,
"step": 2072
},
{
"epoch": 4.24360286591607,
"grad_norm": 0.29816674195659204,
"learning_rate": 2.6938156210655584e-06,
"loss": 0.1842,
"step": 2073
},
{
"epoch": 4.245649948822927,
"grad_norm": 0.2996092573579164,
"learning_rate": 2.679491924311226e-06,
"loss": 0.1847,
"step": 2074
},
{
"epoch": 4.247697031729785,
"grad_norm": 0.2895248563688147,
"learning_rate": 2.6652036758744148e-06,
"loss": 0.1827,
"step": 2075
},
{
"epoch": 4.249744114636643,
"grad_norm": 0.28226805129886656,
"learning_rate": 2.6509509049975913e-06,
"loss": 0.1765,
"step": 2076
},
{
"epoch": 4.2517911975435005,
"grad_norm": 0.2949640489147984,
"learning_rate": 2.6367336408506063e-06,
"loss": 0.1705,
"step": 2077
},
{
"epoch": 4.253838280450358,
"grad_norm": 0.2753720710190184,
"learning_rate": 2.622551912530653e-06,
"loss": 0.186,
"step": 2078
},
{
"epoch": 4.255885363357216,
"grad_norm": 0.2936131556811062,
"learning_rate": 2.608405749062193e-06,
"loss": 0.1855,
"step": 2079
},
{
"epoch": 4.257932446264074,
"grad_norm": 0.28222624495514065,
"learning_rate": 2.594295179396895e-06,
"loss": 0.1743,
"step": 2080
},
{
"epoch": 4.2599795291709315,
"grad_norm": 0.29367758291953516,
"learning_rate": 2.5802202324135926e-06,
"loss": 0.1926,
"step": 2081
},
{
"epoch": 4.262026612077789,
"grad_norm": 0.29329088174637774,
"learning_rate": 2.566180936918203e-06,
"loss": 0.1985,
"step": 2082
},
{
"epoch": 4.2640736949846465,
"grad_norm": 0.27495980250181845,
"learning_rate": 2.5521773216436875e-06,
"loss": 0.1694,
"step": 2083
},
{
"epoch": 4.266120777891505,
"grad_norm": 0.2745597317735457,
"learning_rate": 2.5382094152499705e-06,
"loss": 0.1623,
"step": 2084
},
{
"epoch": 4.268167860798362,
"grad_norm": 0.2860214870247153,
"learning_rate": 2.5242772463239075e-06,
"loss": 0.1759,
"step": 2085
},
{
"epoch": 4.27021494370522,
"grad_norm": 0.2829523123556956,
"learning_rate": 2.5103808433792075e-06,
"loss": 0.1953,
"step": 2086
},
{
"epoch": 4.272262026612077,
"grad_norm": 0.29162058098697563,
"learning_rate": 2.4965202348563834e-06,
"loss": 0.1851,
"step": 2087
},
{
"epoch": 4.274309109518936,
"grad_norm": 0.28401444187787156,
"learning_rate": 2.4826954491226875e-06,
"loss": 0.1625,
"step": 2088
},
{
"epoch": 4.276356192425793,
"grad_norm": 0.29640280698764426,
"learning_rate": 2.468906514472065e-06,
"loss": 0.1733,
"step": 2089
},
{
"epoch": 4.278403275332651,
"grad_norm": 0.28541928385296134,
"learning_rate": 2.4551534591250725e-06,
"loss": 0.2083,
"step": 2090
},
{
"epoch": 4.280450358239508,
"grad_norm": 0.2927844886794109,
"learning_rate": 2.4414363112288464e-06,
"loss": 0.1518,
"step": 2091
},
{
"epoch": 4.282497441146367,
"grad_norm": 0.29701394558657035,
"learning_rate": 2.4277550988570362e-06,
"loss": 0.1621,
"step": 2092
},
{
"epoch": 4.284544524053224,
"grad_norm": 0.290771795085261,
"learning_rate": 2.4141098500097403e-06,
"loss": 0.1648,
"step": 2093
},
{
"epoch": 4.286591606960082,
"grad_norm": 0.27774161175473394,
"learning_rate": 2.400500592613455e-06,
"loss": 0.1711,
"step": 2094
},
{
"epoch": 4.288638689866939,
"grad_norm": 0.30883225453627294,
"learning_rate": 2.3869273545210158e-06,
"loss": 0.134,
"step": 2095
},
{
"epoch": 4.290685772773798,
"grad_norm": 0.2797146271613822,
"learning_rate": 2.3733901635115486e-06,
"loss": 0.1692,
"step": 2096
},
{
"epoch": 4.292732855680655,
"grad_norm": 0.2969804226131455,
"learning_rate": 2.359889047290389e-06,
"loss": 0.1671,
"step": 2097
},
{
"epoch": 4.294779938587513,
"grad_norm": 0.3217916741414342,
"learning_rate": 2.3464240334890496e-06,
"loss": 0.1575,
"step": 2098
},
{
"epoch": 4.29682702149437,
"grad_norm": 0.27919309597905084,
"learning_rate": 2.332995149665169e-06,
"loss": 0.2121,
"step": 2099
},
{
"epoch": 4.298874104401229,
"grad_norm": 0.2800459009067448,
"learning_rate": 2.3196024233024185e-06,
"loss": 0.1837,
"step": 2100
},
{
"epoch": 4.300921187308086,
"grad_norm": 0.2671979507192257,
"learning_rate": 2.3062458818104804e-06,
"loss": 0.1952,
"step": 2101
},
{
"epoch": 4.302968270214944,
"grad_norm": 0.28262802438305323,
"learning_rate": 2.2929255525249894e-06,
"loss": 0.1713,
"step": 2102
},
{
"epoch": 4.305015353121801,
"grad_norm": 0.3005757045715249,
"learning_rate": 2.279641462707445e-06,
"loss": 0.1668,
"step": 2103
},
{
"epoch": 4.30706243602866,
"grad_norm": 0.286800196637234,
"learning_rate": 2.266393639545197e-06,
"loss": 0.1896,
"step": 2104
},
{
"epoch": 4.309109518935517,
"grad_norm": 0.2833256809700336,
"learning_rate": 2.2531821101513796e-06,
"loss": 0.1417,
"step": 2105
},
{
"epoch": 4.311156601842375,
"grad_norm": 0.31197523215491646,
"learning_rate": 2.2400069015648173e-06,
"loss": 0.1952,
"step": 2106
},
{
"epoch": 4.313203684749232,
"grad_norm": 0.2678003946049261,
"learning_rate": 2.22686804075003e-06,
"loss": 0.1807,
"step": 2107
},
{
"epoch": 4.3152507676560905,
"grad_norm": 0.30659909160474746,
"learning_rate": 2.213765554597129e-06,
"loss": 0.1873,
"step": 2108
},
{
"epoch": 4.317297850562948,
"grad_norm": 0.27320285898761,
"learning_rate": 2.2006994699217963e-06,
"loss": 0.1783,
"step": 2109
},
{
"epoch": 4.3193449334698055,
"grad_norm": 0.2952250752859734,
"learning_rate": 2.187669813465192e-06,
"loss": 0.1666,
"step": 2110
},
{
"epoch": 4.321392016376663,
"grad_norm": 0.2648240745330133,
"learning_rate": 2.174676611893947e-06,
"loss": 0.1671,
"step": 2111
},
{
"epoch": 4.3234390992835205,
"grad_norm": 0.2811036034280304,
"learning_rate": 2.1617198918000737e-06,
"loss": 0.1765,
"step": 2112
},
{
"epoch": 4.325486182190379,
"grad_norm": 0.29793182339880603,
"learning_rate": 2.1487996797009103e-06,
"loss": 0.167,
"step": 2113
},
{
"epoch": 4.3275332650972365,
"grad_norm": 0.2752900542965174,
"learning_rate": 2.135916002039089e-06,
"loss": 0.1821,
"step": 2114
},
{
"epoch": 4.329580348004094,
"grad_norm": 0.27908851781361865,
"learning_rate": 2.123068885182471e-06,
"loss": 0.1875,
"step": 2115
},
{
"epoch": 4.3316274309109515,
"grad_norm": 0.30369045741991585,
"learning_rate": 2.110258355424093e-06,
"loss": 0.1565,
"step": 2116
},
{
"epoch": 4.33367451381781,
"grad_norm": 0.27387260120165463,
"learning_rate": 2.0974844389820914e-06,
"loss": 0.2037,
"step": 2117
},
{
"epoch": 4.335721596724667,
"grad_norm": 0.2695259971121429,
"learning_rate": 2.084747161999703e-06,
"loss": 0.1883,
"step": 2118
},
{
"epoch": 4.337768679631525,
"grad_norm": 0.27646790728531134,
"learning_rate": 2.0720465505451524e-06,
"loss": 0.1674,
"step": 2119
},
{
"epoch": 4.339815762538382,
"grad_norm": 0.27363984511519157,
"learning_rate": 2.0593826306116328e-06,
"loss": 0.1987,
"step": 2120
},
{
"epoch": 4.341862845445241,
"grad_norm": 0.3058041319945061,
"learning_rate": 2.0467554281172443e-06,
"loss": 0.1668,
"step": 2121
},
{
"epoch": 4.343909928352098,
"grad_norm": 0.28711732195110085,
"learning_rate": 2.0341649689049458e-06,
"loss": 0.1589,
"step": 2122
},
{
"epoch": 4.345957011258956,
"grad_norm": 0.29676194594555955,
"learning_rate": 2.021611278742479e-06,
"loss": 0.2006,
"step": 2123
},
{
"epoch": 4.348004094165813,
"grad_norm": 0.2894509415136224,
"learning_rate": 2.009094383322356e-06,
"loss": 0.1759,
"step": 2124
},
{
"epoch": 4.350051177072672,
"grad_norm": 0.27051010203429876,
"learning_rate": 1.9966143082617797e-06,
"loss": 0.2051,
"step": 2125
},
{
"epoch": 4.352098259979529,
"grad_norm": 0.29097733973563933,
"learning_rate": 1.9841710791025793e-06,
"loss": 0.1718,
"step": 2126
},
{
"epoch": 4.354145342886387,
"grad_norm": 0.321019324398248,
"learning_rate": 1.971764721311191e-06,
"loss": 0.2109,
"step": 2127
},
{
"epoch": 4.356192425793244,
"grad_norm": 0.30390810271040936,
"learning_rate": 1.959395260278587e-06,
"loss": 0.1489,
"step": 2128
},
{
"epoch": 4.358239508700103,
"grad_norm": 0.29799569461187914,
"learning_rate": 1.947062721320221e-06,
"loss": 0.1661,
"step": 2129
},
{
"epoch": 4.36028659160696,
"grad_norm": 0.2832110200623686,
"learning_rate": 1.9347671296759896e-06,
"loss": 0.1714,
"step": 2130
},
{
"epoch": 4.362333674513818,
"grad_norm": 0.2951503600181551,
"learning_rate": 1.922508510510166e-06,
"loss": 0.157,
"step": 2131
},
{
"epoch": 4.364380757420675,
"grad_norm": 0.28017904966905216,
"learning_rate": 1.9102868889113613e-06,
"loss": 0.1822,
"step": 2132
},
{
"epoch": 4.366427840327534,
"grad_norm": 0.27123684319205094,
"learning_rate": 1.8981022898924562e-06,
"loss": 0.2035,
"step": 2133
},
{
"epoch": 4.368474923234391,
"grad_norm": 0.3005307169297444,
"learning_rate": 1.885954738390572e-06,
"loss": 0.178,
"step": 2134
},
{
"epoch": 4.370522006141249,
"grad_norm": 0.30373485662664135,
"learning_rate": 1.8738442592670014e-06,
"loss": 0.1791,
"step": 2135
},
{
"epoch": 4.372569089048106,
"grad_norm": 0.29560692277511486,
"learning_rate": 1.8617708773071698e-06,
"loss": 0.1702,
"step": 2136
},
{
"epoch": 4.3746161719549645,
"grad_norm": 0.2799036916467131,
"learning_rate": 1.8497346172205733e-06,
"loss": 0.1757,
"step": 2137
},
{
"epoch": 4.376663254861822,
"grad_norm": 0.2914915460350074,
"learning_rate": 1.8377355036407408e-06,
"loss": 0.1537,
"step": 2138
},
{
"epoch": 4.37871033776868,
"grad_norm": 0.28848134194988,
"learning_rate": 1.8257735611251704e-06,
"loss": 0.1794,
"step": 2139
},
{
"epoch": 4.380757420675537,
"grad_norm": 0.2663575035541803,
"learning_rate": 1.8138488141552856e-06,
"loss": 0.1895,
"step": 2140
},
{
"epoch": 4.3828045035823955,
"grad_norm": 0.28041126513615366,
"learning_rate": 1.801961287136391e-06,
"loss": 0.1705,
"step": 2141
},
{
"epoch": 4.384851586489253,
"grad_norm": 0.2878226194974552,
"learning_rate": 1.7901110043976122e-06,
"loss": 0.1869,
"step": 2142
},
{
"epoch": 4.3868986693961105,
"grad_norm": 0.2845987197363273,
"learning_rate": 1.7782979901918507e-06,
"loss": 0.1822,
"step": 2143
},
{
"epoch": 4.388945752302968,
"grad_norm": 0.2750503273666346,
"learning_rate": 1.7665222686957362e-06,
"loss": 0.1778,
"step": 2144
},
{
"epoch": 4.3909928352098255,
"grad_norm": 0.3136573248950073,
"learning_rate": 1.754783864009575e-06,
"loss": 0.1569,
"step": 2145
},
{
"epoch": 4.393039918116684,
"grad_norm": 0.2957114276760941,
"learning_rate": 1.7430828001572897e-06,
"loss": 0.2259,
"step": 2146
},
{
"epoch": 4.395087001023541,
"grad_norm": 0.306095495611476,
"learning_rate": 1.7314191010863933e-06,
"loss": 0.2185,
"step": 2147
},
{
"epoch": 4.397134083930399,
"grad_norm": 0.2870140982518843,
"learning_rate": 1.7197927906679335e-06,
"loss": 0.2054,
"step": 2148
},
{
"epoch": 4.399181166837257,
"grad_norm": 0.2679693208920003,
"learning_rate": 1.7082038926964162e-06,
"loss": 0.1553,
"step": 2149
},
{
"epoch": 4.401228249744115,
"grad_norm": 0.30186153363388674,
"learning_rate": 1.6966524308897935e-06,
"loss": 0.1927,
"step": 2150
},
{
"epoch": 4.403275332650972,
"grad_norm": 0.2784578317629372,
"learning_rate": 1.6851384288894058e-06,
"loss": 0.1776,
"step": 2151
},
{
"epoch": 4.40532241555783,
"grad_norm": 0.2843049715212868,
"learning_rate": 1.6736619102599073e-06,
"loss": 0.1672,
"step": 2152
},
{
"epoch": 4.407369498464687,
"grad_norm": 0.31694116691154817,
"learning_rate": 1.6622228984892585e-06,
"loss": 0.1539,
"step": 2153
},
{
"epoch": 4.409416581371546,
"grad_norm": 0.26686752025138655,
"learning_rate": 1.6508214169886483e-06,
"loss": 0.1754,
"step": 2154
},
{
"epoch": 4.411463664278403,
"grad_norm": 0.2654670971586521,
"learning_rate": 1.6394574890924574e-06,
"loss": 0.2013,
"step": 2155
},
{
"epoch": 4.413510747185261,
"grad_norm": 0.29492009598586416,
"learning_rate": 1.6281311380582087e-06,
"loss": 0.2082,
"step": 2156
},
{
"epoch": 4.415557830092118,
"grad_norm": 0.30045313874805496,
"learning_rate": 1.616842387066524e-06,
"loss": 0.1767,
"step": 2157
},
{
"epoch": 4.417604912998977,
"grad_norm": 0.2910283873769807,
"learning_rate": 1.605591259221071e-06,
"loss": 0.1766,
"step": 2158
},
{
"epoch": 4.419651995905834,
"grad_norm": 0.28823745158894704,
"learning_rate": 1.5943777775485058e-06,
"loss": 0.1868,
"step": 2159
},
{
"epoch": 4.421699078812692,
"grad_norm": 0.26486227426645287,
"learning_rate": 1.583201964998451e-06,
"loss": 0.2016,
"step": 2160
},
{
"epoch": 4.423746161719549,
"grad_norm": 0.28343110323179144,
"learning_rate": 1.572063844443441e-06,
"loss": 0.1855,
"step": 2161
},
{
"epoch": 4.425793244626408,
"grad_norm": 0.2863936086901088,
"learning_rate": 1.5609634386788485e-06,
"loss": 0.1952,
"step": 2162
},
{
"epoch": 4.427840327533265,
"grad_norm": 0.29827769122199016,
"learning_rate": 1.5499007704228742e-06,
"loss": 0.1679,
"step": 2163
},
{
"epoch": 4.429887410440123,
"grad_norm": 0.2492563784192527,
"learning_rate": 1.5388758623164802e-06,
"loss": 0.1679,
"step": 2164
},
{
"epoch": 4.43193449334698,
"grad_norm": 0.2802794951076192,
"learning_rate": 1.5278887369233509e-06,
"loss": 0.1792,
"step": 2165
},
{
"epoch": 4.433981576253839,
"grad_norm": 0.2859554141734814,
"learning_rate": 1.5169394167298367e-06,
"loss": 0.1771,
"step": 2166
},
{
"epoch": 4.436028659160696,
"grad_norm": 0.3090415351876153,
"learning_rate": 1.5060279241449304e-06,
"loss": 0.1803,
"step": 2167
},
{
"epoch": 4.438075742067554,
"grad_norm": 0.2838386004821354,
"learning_rate": 1.4951542815001886e-06,
"loss": 0.159,
"step": 2168
},
{
"epoch": 4.440122824974411,
"grad_norm": 0.28615497006671264,
"learning_rate": 1.4843185110497139e-06,
"loss": 0.1654,
"step": 2169
},
{
"epoch": 4.4421699078812695,
"grad_norm": 0.2828928258486327,
"learning_rate": 1.4735206349701003e-06,
"loss": 0.166,
"step": 2170
},
{
"epoch": 4.444216990788127,
"grad_norm": 0.294693410920466,
"learning_rate": 1.4627606753603886e-06,
"loss": 0.1708,
"step": 2171
},
{
"epoch": 4.4462640736949846,
"grad_norm": 0.28832385318223086,
"learning_rate": 1.4520386542420006e-06,
"loss": 0.173,
"step": 2172
},
{
"epoch": 4.448311156601842,
"grad_norm": 0.2750702995509294,
"learning_rate": 1.4413545935587415e-06,
"loss": 0.1612,
"step": 2173
},
{
"epoch": 4.4503582395087005,
"grad_norm": 0.2827730533019191,
"learning_rate": 1.4307085151767086e-06,
"loss": 0.1568,
"step": 2174
},
{
"epoch": 4.452405322415558,
"grad_norm": 0.2724089303161446,
"learning_rate": 1.4201004408842644e-06,
"loss": 0.1577,
"step": 2175
},
{
"epoch": 4.4544524053224155,
"grad_norm": 0.27196453176116103,
"learning_rate": 1.4095303923919956e-06,
"loss": 0.1773,
"step": 2176
},
{
"epoch": 4.456499488229273,
"grad_norm": 0.3178765316382206,
"learning_rate": 1.3989983913326665e-06,
"loss": 0.159,
"step": 2177
},
{
"epoch": 4.458546571136131,
"grad_norm": 0.29327822729767095,
"learning_rate": 1.3885044592611706e-06,
"loss": 0.1431,
"step": 2178
},
{
"epoch": 4.460593654042989,
"grad_norm": 0.29769593027711694,
"learning_rate": 1.3780486176544905e-06,
"loss": 0.1985,
"step": 2179
},
{
"epoch": 4.462640736949846,
"grad_norm": 0.2844568268296535,
"learning_rate": 1.3676308879116507e-06,
"loss": 0.1652,
"step": 2180
},
{
"epoch": 4.464687819856704,
"grad_norm": 0.2824834673707818,
"learning_rate": 1.3572512913536783e-06,
"loss": 0.1957,
"step": 2181
},
{
"epoch": 4.466734902763562,
"grad_norm": 0.2944499941869074,
"learning_rate": 1.3469098492235521e-06,
"loss": 0.1663,
"step": 2182
},
{
"epoch": 4.46878198567042,
"grad_norm": 0.2860326191078485,
"learning_rate": 1.3366065826861685e-06,
"loss": 0.1508,
"step": 2183
},
{
"epoch": 4.470829068577277,
"grad_norm": 0.29391486499526526,
"learning_rate": 1.3263415128282908e-06,
"loss": 0.1643,
"step": 2184
},
{
"epoch": 4.472876151484135,
"grad_norm": 0.28228277616536795,
"learning_rate": 1.316114660658505e-06,
"loss": 0.165,
"step": 2185
},
{
"epoch": 4.474923234390992,
"grad_norm": 0.28104222228106285,
"learning_rate": 1.305926047107191e-06,
"loss": 0.1787,
"step": 2186
},
{
"epoch": 4.476970317297851,
"grad_norm": 0.3067397142838367,
"learning_rate": 1.2957756930264642e-06,
"loss": 0.1708,
"step": 2187
},
{
"epoch": 4.479017400204708,
"grad_norm": 0.287797881234288,
"learning_rate": 1.2856636191901296e-06,
"loss": 0.1778,
"step": 2188
},
{
"epoch": 4.481064483111566,
"grad_norm": 0.29131274478787034,
"learning_rate": 1.2755898462936544e-06,
"loss": 0.1754,
"step": 2189
},
{
"epoch": 4.483111566018424,
"grad_norm": 0.2774665709211303,
"learning_rate": 1.265554394954125e-06,
"loss": 0.1702,
"step": 2190
},
{
"epoch": 4.485158648925282,
"grad_norm": 0.2895032215404063,
"learning_rate": 1.255557285710185e-06,
"loss": 0.1572,
"step": 2191
},
{
"epoch": 4.487205731832139,
"grad_norm": 0.29001265695476014,
"learning_rate": 1.2455985390220193e-06,
"loss": 0.2107,
"step": 2192
},
{
"epoch": 4.489252814738997,
"grad_norm": 0.28177925431653544,
"learning_rate": 1.2356781752712932e-06,
"loss": 0.1821,
"step": 2193
},
{
"epoch": 4.491299897645854,
"grad_norm": 0.3086719314394619,
"learning_rate": 1.225796214761117e-06,
"loss": 0.1513,
"step": 2194
},
{
"epoch": 4.493346980552713,
"grad_norm": 0.29208730292486546,
"learning_rate": 1.2159526777160036e-06,
"loss": 0.1387,
"step": 2195
},
{
"epoch": 4.49539406345957,
"grad_norm": 0.3397441507088236,
"learning_rate": 1.2061475842818337e-06,
"loss": 0.1566,
"step": 2196
},
{
"epoch": 4.497441146366428,
"grad_norm": 0.2873481591587359,
"learning_rate": 1.196380954525802e-06,
"loss": 0.1726,
"step": 2197
},
{
"epoch": 4.499488229273285,
"grad_norm": 0.27576099473692967,
"learning_rate": 1.1866528084363881e-06,
"loss": 0.1549,
"step": 2198
},
{
"epoch": 4.501535312180144,
"grad_norm": 0.3686313423170628,
"learning_rate": 1.1769631659233104e-06,
"loss": 0.1567,
"step": 2199
},
{
"epoch": 4.503582395087001,
"grad_norm": 0.29827782847327655,
"learning_rate": 1.1673120468174837e-06,
"loss": 0.1872,
"step": 2200
},
{
"epoch": 4.505629477993859,
"grad_norm": 0.28857294210571843,
"learning_rate": 1.1576994708709766e-06,
"loss": 0.182,
"step": 2201
},
{
"epoch": 4.507676560900716,
"grad_norm": 0.27379258065859163,
"learning_rate": 1.148125457756981e-06,
"loss": 0.1732,
"step": 2202
},
{
"epoch": 4.5097236438075745,
"grad_norm": 0.2746324365416677,
"learning_rate": 1.1385900270697658e-06,
"loss": 0.1962,
"step": 2203
},
{
"epoch": 4.511770726714432,
"grad_norm": 0.29241951735417715,
"learning_rate": 1.1290931983246334e-06,
"loss": 0.1652,
"step": 2204
},
{
"epoch": 4.5138178096212895,
"grad_norm": 0.3084497347616217,
"learning_rate": 1.119634990957883e-06,
"loss": 0.1849,
"step": 2205
},
{
"epoch": 4.515864892528147,
"grad_norm": 0.3071333917103865,
"learning_rate": 1.110215424326775e-06,
"loss": 0.1584,
"step": 2206
},
{
"epoch": 4.5179119754350054,
"grad_norm": 0.2857404999852703,
"learning_rate": 1.1008345177094859e-06,
"loss": 0.195,
"step": 2207
},
{
"epoch": 4.519959058341863,
"grad_norm": 0.2705119565200689,
"learning_rate": 1.091492290305063e-06,
"loss": 0.1665,
"step": 2208
},
{
"epoch": 4.5220061412487205,
"grad_norm": 0.3090579005503219,
"learning_rate": 1.0821887612333959e-06,
"loss": 0.1802,
"step": 2209
},
{
"epoch": 4.524053224155578,
"grad_norm": 0.27905532018019086,
"learning_rate": 1.0729239495351917e-06,
"loss": 0.1786,
"step": 2210
},
{
"epoch": 4.526100307062436,
"grad_norm": 0.2818996052768231,
"learning_rate": 1.0636978741718873e-06,
"loss": 0.1951,
"step": 2211
},
{
"epoch": 4.528147389969294,
"grad_norm": 0.28519212105139324,
"learning_rate": 1.0545105540256628e-06,
"loss": 0.1718,
"step": 2212
},
{
"epoch": 4.530194472876151,
"grad_norm": 0.2752841091314753,
"learning_rate": 1.0453620078993755e-06,
"loss": 0.1904,
"step": 2213
},
{
"epoch": 4.532241555783009,
"grad_norm": 0.27656267859889894,
"learning_rate": 1.0362522545165276e-06,
"loss": 0.1563,
"step": 2214
},
{
"epoch": 4.534288638689867,
"grad_norm": 0.3034310152471821,
"learning_rate": 1.0271813125212237e-06,
"loss": 0.1967,
"step": 2215
},
{
"epoch": 4.536335721596725,
"grad_norm": 0.28706993728461716,
"learning_rate": 1.0181492004781467e-06,
"loss": 0.159,
"step": 2216
},
{
"epoch": 4.538382804503582,
"grad_norm": 0.2782931986242475,
"learning_rate": 1.009155936872499e-06,
"loss": 0.1926,
"step": 2217
},
{
"epoch": 4.54042988741044,
"grad_norm": 0.2836768256218939,
"learning_rate": 1.0002015401099797e-06,
"loss": 0.1697,
"step": 2218
},
{
"epoch": 4.542476970317297,
"grad_norm": 0.2708987463201031,
"learning_rate": 9.91286028516747e-07,
"loss": 0.1936,
"step": 2219
},
{
"epoch": 4.544524053224156,
"grad_norm": 0.28619987569766664,
"learning_rate": 9.824094203393697e-07,
"loss": 0.1849,
"step": 2220
},
{
"epoch": 4.546571136131013,
"grad_norm": 0.2707540756476124,
"learning_rate": 9.735717337447981e-07,
"loss": 0.1751,
"step": 2221
},
{
"epoch": 4.548618219037871,
"grad_norm": 0.27410601443733645,
"learning_rate": 9.647729868203238e-07,
"loss": 0.1868,
"step": 2222
},
{
"epoch": 4.550665301944729,
"grad_norm": 0.2957604221652705,
"learning_rate": 9.56013197573553e-07,
"loss": 0.1462,
"step": 2223
},
{
"epoch": 4.552712384851587,
"grad_norm": 0.26338269002342407,
"learning_rate": 9.4729238393235e-07,
"loss": 0.193,
"step": 2224
},
{
"epoch": 4.554759467758444,
"grad_norm": 0.2833184625486789,
"learning_rate": 9.386105637448151e-07,
"loss": 0.1621,
"step": 2225
},
{
"epoch": 4.556806550665302,
"grad_norm": 0.2789058820952134,
"learning_rate": 9.299677547792463e-07,
"loss": 0.1593,
"step": 2226
},
{
"epoch": 4.558853633572159,
"grad_norm": 0.2720509751943245,
"learning_rate": 9.21363974724101e-07,
"loss": 0.1802,
"step": 2227
},
{
"epoch": 4.560900716479018,
"grad_norm": 0.2861949558256626,
"learning_rate": 9.127992411879494e-07,
"loss": 0.2003,
"step": 2228
},
{
"epoch": 4.562947799385875,
"grad_norm": 0.28455385830088115,
"learning_rate": 9.042735716994678e-07,
"loss": 0.1772,
"step": 2229
},
{
"epoch": 4.564994882292733,
"grad_norm": 0.2762848657480311,
"learning_rate": 8.957869837073673e-07,
"loss": 0.153,
"step": 2230
},
{
"epoch": 4.567041965199591,
"grad_norm": 0.28684432410227595,
"learning_rate": 8.873394945803793e-07,
"loss": 0.139,
"step": 2231
},
{
"epoch": 4.569089048106449,
"grad_norm": 0.27952234592413544,
"learning_rate": 8.789311216072183e-07,
"loss": 0.1655,
"step": 2232
},
{
"epoch": 4.571136131013306,
"grad_norm": 0.2712591839850155,
"learning_rate": 8.705618819965411e-07,
"loss": 0.1687,
"step": 2233
},
{
"epoch": 4.573183213920164,
"grad_norm": 0.31661374577038626,
"learning_rate": 8.622317928769086e-07,
"loss": 0.1797,
"step": 2234
},
{
"epoch": 4.575230296827021,
"grad_norm": 0.30388013921486445,
"learning_rate": 8.539408712967679e-07,
"loss": 0.205,
"step": 2235
},
{
"epoch": 4.5772773797338795,
"grad_norm": 0.29761363560711057,
"learning_rate": 8.456891342243945e-07,
"loss": 0.1323,
"step": 2236
},
{
"epoch": 4.579324462640737,
"grad_norm": 0.2749824129298373,
"learning_rate": 8.374765985478728e-07,
"loss": 0.1662,
"step": 2237
},
{
"epoch": 4.5813715455475945,
"grad_norm": 0.288719742941428,
"learning_rate": 8.293032810750579e-07,
"loss": 0.185,
"step": 2238
},
{
"epoch": 4.583418628454452,
"grad_norm": 0.29201488163729705,
"learning_rate": 8.211691985335357e-07,
"loss": 0.1763,
"step": 2239
},
{
"epoch": 4.58546571136131,
"grad_norm": 0.29009215527637505,
"learning_rate": 8.130743675706032e-07,
"loss": 0.1485,
"step": 2240
},
{
"epoch": 4.587512794268168,
"grad_norm": 0.2855149297724901,
"learning_rate": 8.050188047532148e-07,
"loss": 0.172,
"step": 2241
},
{
"epoch": 4.5895598771750254,
"grad_norm": 0.3379024479854454,
"learning_rate": 7.970025265679648e-07,
"loss": 0.173,
"step": 2242
},
{
"epoch": 4.591606960081883,
"grad_norm": 0.2783179411940068,
"learning_rate": 7.890255494210453e-07,
"loss": 0.1801,
"step": 2243
},
{
"epoch": 4.593654042988741,
"grad_norm": 0.30188617728523903,
"learning_rate": 7.810878896382101e-07,
"loss": 0.1668,
"step": 2244
},
{
"epoch": 4.595701125895599,
"grad_norm": 0.27939878533555546,
"learning_rate": 7.731895634647513e-07,
"loss": 0.1544,
"step": 2245
},
{
"epoch": 4.597748208802456,
"grad_norm": 0.29721546644084723,
"learning_rate": 7.653305870654604e-07,
"loss": 0.1459,
"step": 2246
},
{
"epoch": 4.599795291709314,
"grad_norm": 0.2761040647920302,
"learning_rate": 7.575109765245936e-07,
"loss": 0.1991,
"step": 2247
},
{
"epoch": 4.601842374616172,
"grad_norm": 0.26790523775903885,
"learning_rate": 7.497307478458382e-07,
"loss": 0.1881,
"step": 2248
},
{
"epoch": 4.60388945752303,
"grad_norm": 0.26902868723836015,
"learning_rate": 7.419899169522903e-07,
"loss": 0.1956,
"step": 2249
},
{
"epoch": 4.605936540429887,
"grad_norm": 0.27200231013008247,
"learning_rate": 7.342884996863997e-07,
"loss": 0.1656,
"step": 2250
},
{
"epoch": 4.607983623336745,
"grad_norm": 0.3124482032601389,
"learning_rate": 7.266265118099669e-07,
"loss": 0.1753,
"step": 2251
},
{
"epoch": 4.610030706243603,
"grad_norm": 0.2938080759180912,
"learning_rate": 7.190039690040884e-07,
"loss": 0.1864,
"step": 2252
},
{
"epoch": 4.612077789150461,
"grad_norm": 0.28057707746789584,
"learning_rate": 7.114208868691319e-07,
"loss": 0.1655,
"step": 2253
},
{
"epoch": 4.614124872057318,
"grad_norm": 0.3670804523977461,
"learning_rate": 7.038772809247075e-07,
"loss": 0.2006,
"step": 2254
},
{
"epoch": 4.616171954964176,
"grad_norm": 0.3156535130425976,
"learning_rate": 6.963731666096318e-07,
"loss": 0.1873,
"step": 2255
},
{
"epoch": 4.618219037871034,
"grad_norm": 0.29444479967748544,
"learning_rate": 6.889085592818956e-07,
"loss": 0.1698,
"step": 2256
},
{
"epoch": 4.620266120777892,
"grad_norm": 0.2743736628864778,
"learning_rate": 6.814834742186361e-07,
"loss": 0.1851,
"step": 2257
},
{
"epoch": 4.622313203684749,
"grad_norm": 0.2788944245236952,
"learning_rate": 6.740979266161018e-07,
"loss": 0.1649,
"step": 2258
},
{
"epoch": 4.624360286591607,
"grad_norm": 0.28277515238178413,
"learning_rate": 6.667519315896264e-07,
"loss": 0.1707,
"step": 2259
},
{
"epoch": 4.626407369498464,
"grad_norm": 0.29229311294327376,
"learning_rate": 6.594455041735925e-07,
"loss": 0.1577,
"step": 2260
},
{
"epoch": 4.628454452405323,
"grad_norm": 0.2826367197666537,
"learning_rate": 6.521786593214075e-07,
"loss": 0.1694,
"step": 2261
},
{
"epoch": 4.63050153531218,
"grad_norm": 0.27183167715863765,
"learning_rate": 6.449514119054634e-07,
"loss": 0.1821,
"step": 2262
},
{
"epoch": 4.632548618219038,
"grad_norm": 0.2705799558624456,
"learning_rate": 6.377637767171152e-07,
"loss": 0.16,
"step": 2263
},
{
"epoch": 4.634595701125896,
"grad_norm": 0.312451093844958,
"learning_rate": 6.306157684666425e-07,
"loss": 0.146,
"step": 2264
},
{
"epoch": 4.6366427840327535,
"grad_norm": 0.30534769600917144,
"learning_rate": 6.235074017832299e-07,
"loss": 0.2026,
"step": 2265
},
{
"epoch": 4.638689866939611,
"grad_norm": 0.2749885611375992,
"learning_rate": 6.164386912149289e-07,
"loss": 0.149,
"step": 2266
},
{
"epoch": 4.640736949846469,
"grad_norm": 0.3043305615254814,
"learning_rate": 6.094096512286297e-07,
"loss": 0.1931,
"step": 2267
},
{
"epoch": 4.642784032753326,
"grad_norm": 0.2926074707796185,
"learning_rate": 6.024202962100312e-07,
"loss": 0.1704,
"step": 2268
},
{
"epoch": 4.6448311156601845,
"grad_norm": 0.27826474993716993,
"learning_rate": 5.954706404636179e-07,
"loss": 0.1868,
"step": 2269
},
{
"epoch": 4.646878198567042,
"grad_norm": 0.2676299230869566,
"learning_rate": 5.88560698212619e-07,
"loss": 0.1656,
"step": 2270
},
{
"epoch": 4.6489252814738995,
"grad_norm": 0.3004879943235446,
"learning_rate": 5.816904835989867e-07,
"loss": 0.1462,
"step": 2271
},
{
"epoch": 4.650972364380758,
"grad_norm": 0.28512914647917514,
"learning_rate": 5.748600106833735e-07,
"loss": 0.1784,
"step": 2272
},
{
"epoch": 4.653019447287615,
"grad_norm": 0.2810502748083855,
"learning_rate": 5.680692934450837e-07,
"loss": 0.2186,
"step": 2273
},
{
"epoch": 4.655066530194473,
"grad_norm": 0.2855042987462666,
"learning_rate": 5.613183457820714e-07,
"loss": 0.1911,
"step": 2274
},
{
"epoch": 4.65711361310133,
"grad_norm": 0.30832919924707025,
"learning_rate": 5.546071815108845e-07,
"loss": 0.1853,
"step": 2275
},
{
"epoch": 4.659160696008188,
"grad_norm": 0.27863470748148456,
"learning_rate": 5.479358143666602e-07,
"loss": 0.1446,
"step": 2276
},
{
"epoch": 4.661207778915046,
"grad_norm": 0.2954741773503375,
"learning_rate": 5.413042580030792e-07,
"loss": 0.1861,
"step": 2277
},
{
"epoch": 4.663254861821904,
"grad_norm": 0.292054066538448,
"learning_rate": 5.347125259923491e-07,
"loss": 0.1693,
"step": 2278
},
{
"epoch": 4.665301944728761,
"grad_norm": 0.2736497121658198,
"learning_rate": 5.281606318251764e-07,
"loss": 0.1548,
"step": 2279
},
{
"epoch": 4.667349027635619,
"grad_norm": 0.30678020029811015,
"learning_rate": 5.216485889107214e-07,
"loss": 0.1982,
"step": 2280
},
{
"epoch": 4.669396110542477,
"grad_norm": 0.3007630538627819,
"learning_rate": 5.151764105766011e-07,
"loss": 0.2082,
"step": 2281
},
{
"epoch": 4.671443193449335,
"grad_norm": 0.30472720971722367,
"learning_rate": 5.087441100688351e-07,
"loss": 0.1913,
"step": 2282
},
{
"epoch": 4.673490276356192,
"grad_norm": 0.28472393369780435,
"learning_rate": 5.023517005518264e-07,
"loss": 0.1795,
"step": 2283
},
{
"epoch": 4.67553735926305,
"grad_norm": 0.30834546215081027,
"learning_rate": 4.959991951083498e-07,
"loss": 0.1677,
"step": 2284
},
{
"epoch": 4.677584442169908,
"grad_norm": 0.2707613967004538,
"learning_rate": 4.89686606739499e-07,
"loss": 0.1885,
"step": 2285
},
{
"epoch": 4.679631525076766,
"grad_norm": 0.2925354294386155,
"learning_rate": 4.834139483646793e-07,
"loss": 0.1369,
"step": 2286
},
{
"epoch": 4.681678607983623,
"grad_norm": 0.295052171708121,
"learning_rate": 4.771812328215708e-07,
"loss": 0.1684,
"step": 2287
},
{
"epoch": 4.683725690890481,
"grad_norm": 0.2816846080734314,
"learning_rate": 4.709884728661118e-07,
"loss": 0.1634,
"step": 2288
},
{
"epoch": 4.685772773797339,
"grad_norm": 0.29537583958782165,
"learning_rate": 4.648356811724619e-07,
"loss": 0.1501,
"step": 2289
},
{
"epoch": 4.687819856704197,
"grad_norm": 0.2776645155887449,
"learning_rate": 4.587228703329838e-07,
"loss": 0.1731,
"step": 2290
},
{
"epoch": 4.689866939611054,
"grad_norm": 0.27540787792841026,
"learning_rate": 4.5265005285821674e-07,
"loss": 0.1688,
"step": 2291
},
{
"epoch": 4.691914022517912,
"grad_norm": 0.2774319968994503,
"learning_rate": 4.4661724117684545e-07,
"loss": 0.1736,
"step": 2292
},
{
"epoch": 4.69396110542477,
"grad_norm": 0.27404159260754235,
"learning_rate": 4.40624447635678e-07,
"loss": 0.1473,
"step": 2293
},
{
"epoch": 4.696008188331628,
"grad_norm": 0.2944590643348067,
"learning_rate": 4.346716844996279e-07,
"loss": 0.1594,
"step": 2294
},
{
"epoch": 4.698055271238485,
"grad_norm": 0.2881991440998457,
"learning_rate": 4.2875896395167427e-07,
"loss": 0.1988,
"step": 2295
},
{
"epoch": 4.700102354145343,
"grad_norm": 0.28026776497774936,
"learning_rate": 4.228862980928439e-07,
"loss": 0.1784,
"step": 2296
},
{
"epoch": 4.702149437052201,
"grad_norm": 0.2955331369567261,
"learning_rate": 4.1705369894219584e-07,
"loss": 0.1786,
"step": 2297
},
{
"epoch": 4.7041965199590585,
"grad_norm": 0.292319027121268,
"learning_rate": 4.112611784367837e-07,
"loss": 0.1677,
"step": 2298
},
{
"epoch": 4.706243602865916,
"grad_norm": 0.31348235722088247,
"learning_rate": 4.0550874843163337e-07,
"loss": 0.1796,
"step": 2299
},
{
"epoch": 4.7082906857727735,
"grad_norm": 0.2903513190722839,
"learning_rate": 3.997964206997207e-07,
"loss": 0.1804,
"step": 2300
},
{
"epoch": 4.710337768679631,
"grad_norm": 0.2879799305264078,
"learning_rate": 3.941242069319562e-07,
"loss": 0.1895,
"step": 2301
},
{
"epoch": 4.7123848515864895,
"grad_norm": 0.3577665248128591,
"learning_rate": 3.8849211873714266e-07,
"loss": 0.1765,
"step": 2302
},
{
"epoch": 4.714431934493347,
"grad_norm": 0.2903669946216332,
"learning_rate": 3.8290016764196637e-07,
"loss": 0.1716,
"step": 2303
},
{
"epoch": 4.7164790174002045,
"grad_norm": 0.3014373493296439,
"learning_rate": 3.7734836509096596e-07,
"loss": 0.1388,
"step": 2304
},
{
"epoch": 4.718526100307063,
"grad_norm": 0.3083437231349992,
"learning_rate": 3.7183672244652135e-07,
"loss": 0.1903,
"step": 2305
},
{
"epoch": 4.72057318321392,
"grad_norm": 0.2957154629258413,
"learning_rate": 3.663652509888027e-07,
"loss": 0.1718,
"step": 2306
},
{
"epoch": 4.722620266120778,
"grad_norm": 0.3046648653682266,
"learning_rate": 3.6093396191578366e-07,
"loss": 0.1979,
"step": 2307
},
{
"epoch": 4.724667349027635,
"grad_norm": 0.28017443242244167,
"learning_rate": 3.5554286634318814e-07,
"loss": 0.1728,
"step": 2308
},
{
"epoch": 4.726714431934493,
"grad_norm": 0.2748026640801257,
"learning_rate": 3.501919753044836e-07,
"loss": 0.2096,
"step": 2309
},
{
"epoch": 4.728761514841351,
"grad_norm": 0.2900862431852642,
"learning_rate": 3.448812997508588e-07,
"loss": 0.1655,
"step": 2310
},
{
"epoch": 4.730808597748209,
"grad_norm": 0.2833002934703294,
"learning_rate": 3.3961085055119083e-07,
"loss": 0.1527,
"step": 2311
},
{
"epoch": 4.732855680655066,
"grad_norm": 0.27909247839420187,
"learning_rate": 3.3438063849203116e-07,
"loss": 0.1449,
"step": 2312
},
{
"epoch": 4.734902763561925,
"grad_norm": 0.2997312028695653,
"learning_rate": 3.2919067427758186e-07,
"loss": 0.153,
"step": 2313
},
{
"epoch": 4.736949846468782,
"grad_norm": 0.2788661977637299,
"learning_rate": 3.2404096852967305e-07,
"loss": 0.1686,
"step": 2314
},
{
"epoch": 4.73899692937564,
"grad_norm": 0.2844108440502707,
"learning_rate": 3.189315317877428e-07,
"loss": 0.1575,
"step": 2315
},
{
"epoch": 4.741044012282497,
"grad_norm": 0.29542954551351697,
"learning_rate": 3.138623745088132e-07,
"loss": 0.1489,
"step": 2316
},
{
"epoch": 4.743091095189355,
"grad_norm": 0.2931309390733155,
"learning_rate": 3.0883350706746973e-07,
"loss": 0.1793,
"step": 2317
},
{
"epoch": 4.745138178096213,
"grad_norm": 0.274362618181376,
"learning_rate": 3.038449397558396e-07,
"loss": 0.1635,
"step": 2318
},
{
"epoch": 4.747185261003071,
"grad_norm": 0.27428372927016115,
"learning_rate": 2.9889668278357376e-07,
"loss": 0.1588,
"step": 2319
},
{
"epoch": 4.749232343909928,
"grad_norm": 0.29732674571188733,
"learning_rate": 2.9398874627782014e-07,
"loss": 0.1708,
"step": 2320
},
{
"epoch": 4.751279426816786,
"grad_norm": 0.3003256230614381,
"learning_rate": 2.891211402832128e-07,
"loss": 0.1725,
"step": 2321
},
{
"epoch": 4.753326509723644,
"grad_norm": 0.3084720969154894,
"learning_rate": 2.8429387476183624e-07,
"loss": 0.1483,
"step": 2322
},
{
"epoch": 4.755373592630502,
"grad_norm": 0.3036705097265483,
"learning_rate": 2.7950695959322093e-07,
"loss": 0.1623,
"step": 2323
},
{
"epoch": 4.757420675537359,
"grad_norm": 0.26881070505536797,
"learning_rate": 2.747604045743102e-07,
"loss": 0.171,
"step": 2324
},
{
"epoch": 4.759467758444217,
"grad_norm": 0.2880629649819299,
"learning_rate": 2.7005421941945555e-07,
"loss": 0.1646,
"step": 2325
},
{
"epoch": 4.761514841351075,
"grad_norm": 0.28081087918300657,
"learning_rate": 2.653884137603702e-07,
"loss": 0.1427,
"step": 2326
},
{
"epoch": 4.763561924257933,
"grad_norm": 0.29698506829638605,
"learning_rate": 2.6076299714614673e-07,
"loss": 0.1612,
"step": 2327
},
{
"epoch": 4.76560900716479,
"grad_norm": 0.2830061118167646,
"learning_rate": 2.5617797904320396e-07,
"loss": 0.1731,
"step": 2328
},
{
"epoch": 4.767656090071648,
"grad_norm": 0.3123860044528765,
"learning_rate": 2.516333688352801e-07,
"loss": 0.1561,
"step": 2329
},
{
"epoch": 4.769703172978506,
"grad_norm": 0.2798082098938455,
"learning_rate": 2.471291758234218e-07,
"loss": 0.1902,
"step": 2330
},
{
"epoch": 4.7717502558853635,
"grad_norm": 0.2987315382953632,
"learning_rate": 2.426654092259528e-07,
"loss": 0.1551,
"step": 2331
},
{
"epoch": 4.773797338792221,
"grad_norm": 0.2808063055566694,
"learning_rate": 2.382420781784589e-07,
"loss": 0.1749,
"step": 2332
},
{
"epoch": 4.7758444216990785,
"grad_norm": 0.2796699754953347,
"learning_rate": 2.338591917337696e-07,
"loss": 0.1727,
"step": 2333
},
{
"epoch": 4.777891504605937,
"grad_norm": 0.277747972615087,
"learning_rate": 2.295167588619518e-07,
"loss": 0.1507,
"step": 2334
},
{
"epoch": 4.779938587512794,
"grad_norm": 0.2987700878833016,
"learning_rate": 2.2521478845025867e-07,
"loss": 0.1798,
"step": 2335
},
{
"epoch": 4.781985670419652,
"grad_norm": 0.2728906304255605,
"learning_rate": 2.2095328930315184e-07,
"loss": 0.171,
"step": 2336
},
{
"epoch": 4.7840327533265095,
"grad_norm": 0.300885418994469,
"learning_rate": 2.167322701422525e-07,
"loss": 0.163,
"step": 2337
},
{
"epoch": 4.786079836233368,
"grad_norm": 0.2843769306835108,
"learning_rate": 2.1255173960634146e-07,
"loss": 0.1788,
"step": 2338
},
{
"epoch": 4.788126919140225,
"grad_norm": 0.29006108226073735,
"learning_rate": 2.08411706251328e-07,
"loss": 0.177,
"step": 2339
},
{
"epoch": 4.790174002047083,
"grad_norm": 0.28369183368200673,
"learning_rate": 2.0431217855025e-07,
"loss": 0.18,
"step": 2340
},
{
"epoch": 4.79222108495394,
"grad_norm": 0.28761647181705274,
"learning_rate": 2.0025316489323597e-07,
"loss": 0.1656,
"step": 2341
},
{
"epoch": 4.794268167860798,
"grad_norm": 0.2839547733000503,
"learning_rate": 1.9623467358750315e-07,
"loss": 0.1743,
"step": 2342
},
{
"epoch": 4.796315250767656,
"grad_norm": 0.2792802390584324,
"learning_rate": 1.9225671285733272e-07,
"loss": 0.1793,
"step": 2343
},
{
"epoch": 4.798362333674514,
"grad_norm": 0.2953632540852749,
"learning_rate": 1.8831929084406119e-07,
"loss": 0.1697,
"step": 2344
},
{
"epoch": 4.800409416581371,
"grad_norm": 0.26668806789326216,
"learning_rate": 1.8442241560604922e-07,
"loss": 0.1517,
"step": 2345
},
{
"epoch": 4.80245649948823,
"grad_norm": 0.27943039216113114,
"learning_rate": 1.8056609511868163e-07,
"loss": 0.2131,
"step": 2346
},
{
"epoch": 4.804503582395087,
"grad_norm": 0.26345026664491444,
"learning_rate": 1.7675033727434288e-07,
"loss": 0.1688,
"step": 2347
},
{
"epoch": 4.806550665301945,
"grad_norm": 0.28602587264323004,
"learning_rate": 1.7297514988239505e-07,
"loss": 0.1958,
"step": 2348
},
{
"epoch": 4.808597748208802,
"grad_norm": 0.27558338217328315,
"learning_rate": 1.692405406691755e-07,
"loss": 0.1796,
"step": 2349
},
{
"epoch": 4.81064483111566,
"grad_norm": 0.2966627143037965,
"learning_rate": 1.655465172779702e-07,
"loss": 0.192,
"step": 2350
},
{
"epoch": 4.812691914022518,
"grad_norm": 0.2944395579726434,
"learning_rate": 1.6189308726900277e-07,
"loss": 0.172,
"step": 2351
},
{
"epoch": 4.814738996929376,
"grad_norm": 0.2711839256534087,
"learning_rate": 1.5828025811941872e-07,
"loss": 0.1901,
"step": 2352
},
{
"epoch": 4.816786079836233,
"grad_norm": 0.29155015705326437,
"learning_rate": 1.547080372232679e-07,
"loss": 0.167,
"step": 2353
},
{
"epoch": 4.818833162743092,
"grad_norm": 0.2815618639741477,
"learning_rate": 1.5117643189149546e-07,
"loss": 0.1516,
"step": 2354
},
{
"epoch": 4.820880245649949,
"grad_norm": 0.29038125469870796,
"learning_rate": 1.4768544935191088e-07,
"loss": 0.1657,
"step": 2355
},
{
"epoch": 4.822927328556807,
"grad_norm": 0.2867465831992582,
"learning_rate": 1.44235096749199e-07,
"loss": 0.1824,
"step": 2356
},
{
"epoch": 4.824974411463664,
"grad_norm": 0.2708843561414753,
"learning_rate": 1.408253811448823e-07,
"loss": 0.1735,
"step": 2357
},
{
"epoch": 4.827021494370522,
"grad_norm": 0.2891677780067326,
"learning_rate": 1.374563095173187e-07,
"loss": 0.1594,
"step": 2358
},
{
"epoch": 4.82906857727738,
"grad_norm": 0.29721811114798363,
"learning_rate": 1.3412788876167925e-07,
"loss": 0.1681,
"step": 2359
},
{
"epoch": 4.8311156601842375,
"grad_norm": 0.2819368121687383,
"learning_rate": 1.3084012568994608e-07,
"loss": 0.2588,
"step": 2360
},
{
"epoch": 4.833162743091095,
"grad_norm": 0.2637703601334377,
"learning_rate": 1.2759302703088117e-07,
"loss": 0.1686,
"step": 2361
},
{
"epoch": 4.835209825997953,
"grad_norm": 0.2790376537691832,
"learning_rate": 1.2438659943003306e-07,
"loss": 0.1824,
"step": 2362
},
{
"epoch": 4.837256908904811,
"grad_norm": 0.2855413561087752,
"learning_rate": 1.212208494497036e-07,
"loss": 0.18,
"step": 2363
},
{
"epoch": 4.8393039918116685,
"grad_norm": 0.2966645113711715,
"learning_rate": 1.180957835689478e-07,
"loss": 0.1462,
"step": 2364
},
{
"epoch": 4.841351074718526,
"grad_norm": 0.28073303467220956,
"learning_rate": 1.1501140818355627e-07,
"loss": 0.2037,
"step": 2365
},
{
"epoch": 4.8433981576253835,
"grad_norm": 0.27323730338581254,
"learning_rate": 1.1196772960603952e-07,
"loss": 0.1552,
"step": 2366
},
{
"epoch": 4.845445240532242,
"grad_norm": 0.27979495089452416,
"learning_rate": 1.0896475406562135e-07,
"loss": 0.1911,
"step": 2367
},
{
"epoch": 4.847492323439099,
"grad_norm": 0.27315088792468245,
"learning_rate": 1.0600248770821886e-07,
"loss": 0.1945,
"step": 2368
},
{
"epoch": 4.849539406345957,
"grad_norm": 0.29508373388397574,
"learning_rate": 1.0308093659643582e-07,
"loss": 0.158,
"step": 2369
},
{
"epoch": 4.851586489252814,
"grad_norm": 0.2790657492666217,
"learning_rate": 1.0020010670954483e-07,
"loss": 0.1798,
"step": 2370
},
{
"epoch": 4.853633572159673,
"grad_norm": 0.2863828008417254,
"learning_rate": 9.736000394348299e-08,
"loss": 0.1688,
"step": 2371
},
{
"epoch": 4.85568065506653,
"grad_norm": 0.2837857995206189,
"learning_rate": 9.456063411082738e-08,
"loss": 0.1818,
"step": 2372
},
{
"epoch": 4.857727737973388,
"grad_norm": 0.26660018975441796,
"learning_rate": 9.180200294079955e-08,
"loss": 0.1681,
"step": 2373
},
{
"epoch": 4.859774820880245,
"grad_norm": 0.30489283035917186,
"learning_rate": 8.908411607923884e-08,
"loss": 0.1724,
"step": 2374
},
{
"epoch": 4.861821903787103,
"grad_norm": 0.27668845133173764,
"learning_rate": 8.640697908859575e-08,
"loss": 0.1871,
"step": 2375
},
{
"epoch": 4.863868986693961,
"grad_norm": 0.27941980307619757,
"learning_rate": 8.377059744792748e-08,
"loss": 0.1875,
"step": 2376
},
{
"epoch": 4.865916069600819,
"grad_norm": 0.2991779658323062,
"learning_rate": 8.117497655287798e-08,
"loss": 0.1607,
"step": 2377
},
{
"epoch": 4.867963152507676,
"grad_norm": 0.2960605235453732,
"learning_rate": 7.862012171566902e-08,
"loss": 0.2025,
"step": 2378
},
{
"epoch": 4.870010235414535,
"grad_norm": 0.2909596199436068,
"learning_rate": 7.61060381650891e-08,
"loss": 0.1976,
"step": 2379
},
{
"epoch": 4.872057318321392,
"grad_norm": 0.3022247341372479,
"learning_rate": 7.363273104648904e-08,
"loss": 0.1873,
"step": 2380
},
{
"epoch": 4.87410440122825,
"grad_norm": 0.27635276227359323,
"learning_rate": 7.120020542176198e-08,
"loss": 0.1815,
"step": 2381
},
{
"epoch": 4.876151484135107,
"grad_norm": 0.3235414628513566,
"learning_rate": 6.880846626933668e-08,
"loss": 0.1526,
"step": 2382
},
{
"epoch": 4.878198567041965,
"grad_norm": 0.3061161602814933,
"learning_rate": 6.645751848417093e-08,
"loss": 0.1672,
"step": 2383
},
{
"epoch": 4.880245649948823,
"grad_norm": 0.2857967006307856,
"learning_rate": 6.414736687773371e-08,
"loss": 0.1662,
"step": 2384
},
{
"epoch": 4.882292732855681,
"grad_norm": 0.27457282899975494,
"learning_rate": 6.187801617800748e-08,
"loss": 0.1564,
"step": 2385
},
{
"epoch": 4.884339815762538,
"grad_norm": 0.2990684370251101,
"learning_rate": 5.964947102946594e-08,
"loss": 0.193,
"step": 2386
},
{
"epoch": 4.886386898669397,
"grad_norm": 0.27143331999482234,
"learning_rate": 5.746173599307181e-08,
"loss": 0.172,
"step": 2387
},
{
"epoch": 4.888433981576254,
"grad_norm": 0.2720324730535623,
"learning_rate": 5.531481554626128e-08,
"loss": 0.1466,
"step": 2388
},
{
"epoch": 4.890481064483112,
"grad_norm": 0.29144216168390374,
"learning_rate": 5.320871408294403e-08,
"loss": 0.1622,
"step": 2389
},
{
"epoch": 4.892528147389969,
"grad_norm": 0.27957166407177925,
"learning_rate": 5.114343591348769e-08,
"loss": 0.1744,
"step": 2390
},
{
"epoch": 4.894575230296827,
"grad_norm": 0.2743372447650692,
"learning_rate": 4.9118985264711147e-08,
"loss": 0.1779,
"step": 2391
},
{
"epoch": 4.896622313203685,
"grad_norm": 0.2864112361999076,
"learning_rate": 4.713536627987347e-08,
"loss": 0.1783,
"step": 2392
},
{
"epoch": 4.8986693961105425,
"grad_norm": 0.2820613867682801,
"learning_rate": 4.519258301866947e-08,
"loss": 0.1764,
"step": 2393
},
{
"epoch": 4.9007164790174,
"grad_norm": 0.2790446443208781,
"learning_rate": 4.3290639457214125e-08,
"loss": 0.1983,
"step": 2394
},
{
"epoch": 4.9027635619242576,
"grad_norm": 0.3009212756955286,
"learning_rate": 4.1429539488047066e-08,
"loss": 0.1632,
"step": 2395
},
{
"epoch": 4.904810644831116,
"grad_norm": 0.3048977634842906,
"learning_rate": 3.960928692011257e-08,
"loss": 0.1775,
"step": 2396
},
{
"epoch": 4.9068577277379735,
"grad_norm": 0.2999700691804944,
"learning_rate": 3.7829885478757324e-08,
"loss": 0.1811,
"step": 2397
},
{
"epoch": 4.908904810644831,
"grad_norm": 0.2867226352655605,
"learning_rate": 3.6091338805719356e-08,
"loss": 0.1749,
"step": 2398
},
{
"epoch": 4.9109518935516885,
"grad_norm": 0.27853817331380126,
"learning_rate": 3.439365045912801e-08,
"loss": 0.1715,
"step": 2399
},
{
"epoch": 4.912998976458547,
"grad_norm": 0.26671396491728905,
"learning_rate": 3.273682391348398e-08,
"loss": 0.1736,
"step": 2400
},
{
"epoch": 4.915046059365404,
"grad_norm": 0.2981072341365868,
"learning_rate": 3.1120862559670396e-08,
"loss": 0.1608,
"step": 2401
},
{
"epoch": 4.917093142272262,
"grad_norm": 0.3006161506174685,
"learning_rate": 2.9545769704923954e-08,
"loss": 0.174,
"step": 2402
},
{
"epoch": 4.919140225179119,
"grad_norm": 0.2700128965804573,
"learning_rate": 2.8011548572846047e-08,
"loss": 0.1666,
"step": 2403
},
{
"epoch": 4.921187308085978,
"grad_norm": 0.27640788721099124,
"learning_rate": 2.651820230338942e-08,
"loss": 0.1786,
"step": 2404
},
{
"epoch": 4.923234390992835,
"grad_norm": 0.30195005723908613,
"learning_rate": 2.50657339528515e-08,
"loss": 0.1683,
"step": 2405
},
{
"epoch": 4.925281473899693,
"grad_norm": 0.27803519346079886,
"learning_rate": 2.365414649386555e-08,
"loss": 0.2196,
"step": 2406
},
{
"epoch": 4.92732855680655,
"grad_norm": 0.27873854091593087,
"learning_rate": 2.2283442815402845e-08,
"loss": 0.1772,
"step": 2407
},
{
"epoch": 4.929375639713409,
"grad_norm": 0.31272442814047646,
"learning_rate": 2.0953625722754943e-08,
"loss": 0.1917,
"step": 2408
},
{
"epoch": 4.931422722620266,
"grad_norm": 0.27074887610829274,
"learning_rate": 1.9664697937542554e-08,
"loss": 0.177,
"step": 2409
},
{
"epoch": 4.933469805527124,
"grad_norm": 0.2809859827116871,
"learning_rate": 1.8416662097693326e-08,
"loss": 0.1735,
"step": 2410
},
{
"epoch": 4.935516888433981,
"grad_norm": 0.2922562803880012,
"learning_rate": 1.720952075745075e-08,
"loss": 0.164,
"step": 2411
},
{
"epoch": 4.93756397134084,
"grad_norm": 0.27866762770170544,
"learning_rate": 1.604327638736525e-08,
"loss": 0.1509,
"step": 2412
},
{
"epoch": 4.939611054247697,
"grad_norm": 0.2757657750150201,
"learning_rate": 1.491793137427866e-08,
"loss": 0.1694,
"step": 2413
},
{
"epoch": 4.941658137154555,
"grad_norm": 0.3690880615237033,
"learning_rate": 1.3833488021335328e-08,
"loss": 0.1459,
"step": 2414
},
{
"epoch": 4.943705220061412,
"grad_norm": 0.28975968339512975,
"learning_rate": 1.2789948547968779e-08,
"loss": 0.1723,
"step": 2415
},
{
"epoch": 4.94575230296827,
"grad_norm": 0.2676510309996025,
"learning_rate": 1.1787315089895057e-08,
"loss": 0.1587,
"step": 2416
},
{
"epoch": 4.947799385875128,
"grad_norm": 0.29535503732393725,
"learning_rate": 1.0825589699112737e-08,
"loss": 0.1429,
"step": 2417
},
{
"epoch": 4.949846468781986,
"grad_norm": 0.2851419625396727,
"learning_rate": 9.904774343898471e-09,
"loss": 0.1529,
"step": 2418
},
{
"epoch": 4.951893551688843,
"grad_norm": 0.25772957787571077,
"learning_rate": 9.024870908802552e-09,
"loss": 0.187,
"step": 2419
},
{
"epoch": 4.9539406345957016,
"grad_norm": 0.2900373956230706,
"learning_rate": 8.185881194644474e-09,
"loss": 0.1541,
"step": 2420
},
{
"epoch": 4.955987717502559,
"grad_norm": 0.291463457270851,
"learning_rate": 7.387806918508489e-09,
"loss": 0.1585,
"step": 2421
},
{
"epoch": 4.958034800409417,
"grad_norm": 0.2796933717518039,
"learning_rate": 6.630649713739168e-09,
"loss": 0.2045,
"step": 2422
},
{
"epoch": 4.960081883316274,
"grad_norm": 0.2809689443794419,
"learning_rate": 5.9144111299414e-09,
"loss": 0.169,
"step": 2423
},
{
"epoch": 4.962128966223132,
"grad_norm": 0.3074472408506574,
"learning_rate": 5.239092632980391e-09,
"loss": 0.1593,
"step": 2424
},
{
"epoch": 4.96417604912999,
"grad_norm": 0.28551605159632193,
"learning_rate": 4.6046956049639045e-09,
"loss": 0.1518,
"step": 2425
},
{
"epoch": 4.9662231320368475,
"grad_norm": 0.2889141054976907,
"learning_rate": 4.011221344257799e-09,
"loss": 0.1781,
"step": 2426
},
{
"epoch": 4.968270214943705,
"grad_norm": 0.25486448368902137,
"learning_rate": 3.4586710654727074e-09,
"loss": 0.1688,
"step": 2427
},
{
"epoch": 4.970317297850563,
"grad_norm": 0.3024410370860751,
"learning_rate": 2.94704589946182e-09,
"loss": 0.2105,
"step": 2428
},
{
"epoch": 4.972364380757421,
"grad_norm": 0.29484058319269135,
"learning_rate": 2.4763468933231005e-09,
"loss": 0.178,
"step": 2429
},
{
"epoch": 4.974411463664278,
"grad_norm": 0.28427681285996675,
"learning_rate": 2.0465750103926263e-09,
"loss": 0.2151,
"step": 2430
},
{
"epoch": 4.976458546571136,
"grad_norm": 0.2758137593263903,
"learning_rate": 1.657731130246809e-09,
"loss": 0.1934,
"step": 2431
},
{
"epoch": 4.9785056294779935,
"grad_norm": 0.3024473446528619,
"learning_rate": 1.309816048697954e-09,
"loss": 0.1732,
"step": 2432
},
{
"epoch": 4.980552712384852,
"grad_norm": 0.26746264280387694,
"learning_rate": 1.0028304777875975e-09,
"loss": 0.1807,
"step": 2433
},
{
"epoch": 4.982599795291709,
"grad_norm": 0.2689810857541332,
"learning_rate": 7.367750458020518e-10,
"loss": 0.1713,
"step": 2434
},
{
"epoch": 4.984646878198567,
"grad_norm": 0.30444153746664904,
"learning_rate": 5.116502972479787e-10,
"loss": 0.172,
"step": 2435
},
{
"epoch": 4.986693961105424,
"grad_norm": 0.2724209053996081,
"learning_rate": 3.2745669287237435e-10,
"loss": 0.1994,
"step": 2436
},
{
"epoch": 4.988741044012283,
"grad_norm": 0.2666986989444724,
"learning_rate": 1.8419460964258505e-10,
"loss": 0.1735,
"step": 2437
},
{
"epoch": 4.99078812691914,
"grad_norm": 0.2846154192735168,
"learning_rate": 8.186434076185024e-11,
"loss": 0.1539,
"step": 2438
},
{
"epoch": 4.992835209825998,
"grad_norm": 0.28158014093768946,
"learning_rate": 2.046609566264124e-11,
"loss": 0.1756,
"step": 2439
},
{
"epoch": 4.994882292732855,
"grad_norm": 0.3150094705291199,
"learning_rate": 0.0,
"loss": 0.2073,
"step": 2440
},
{
"epoch": 4.994882292732855,
"step": 2440,
"total_flos": 2619216084533248.0,
"train_loss": 0.34357271391715183,
"train_runtime": 45861.8772,
"train_samples_per_second": 6.814,
"train_steps_per_second": 0.053
}
],
"logging_steps": 1,
"max_steps": 2440,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2619216084533248.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}