{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1962,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030581039755351682,
"grad_norm": 0.18201100826263428,
"learning_rate": 2.0202020202020205e-07,
"loss": 1.8941408395767212,
"step": 2
},
{
"epoch": 0.0061162079510703364,
"grad_norm": 0.11798238009214401,
"learning_rate": 6.060606060606061e-07,
"loss": 1.8559235334396362,
"step": 4
},
{
"epoch": 0.009174311926605505,
"grad_norm": 0.22169165313243866,
"learning_rate": 1.01010101010101e-06,
"loss": 1.8905575275421143,
"step": 6
},
{
"epoch": 0.012232415902140673,
"grad_norm": 0.12514109909534454,
"learning_rate": 1.4141414141414143e-06,
"loss": 1.8444780111312866,
"step": 8
},
{
"epoch": 0.01529051987767584,
"grad_norm": 0.3371029496192932,
"learning_rate": 1.8181818181818183e-06,
"loss": 2.009756326675415,
"step": 10
},
{
"epoch": 0.01834862385321101,
"grad_norm": 0.9957485198974609,
"learning_rate": 2.222222222222222e-06,
"loss": 1.8797118663787842,
"step": 12
},
{
"epoch": 0.021406727828746176,
"grad_norm": 0.3545893132686615,
"learning_rate": 2.6262626262626267e-06,
"loss": 1.880387544631958,
"step": 14
},
{
"epoch": 0.024464831804281346,
"grad_norm": 0.21946458518505096,
"learning_rate": 3.0303030303030305e-06,
"loss": 1.9453247785568237,
"step": 16
},
{
"epoch": 0.027522935779816515,
"grad_norm": 0.7614877223968506,
"learning_rate": 3.4343434343434347e-06,
"loss": 1.829978108406067,
"step": 18
},
{
"epoch": 0.03058103975535168,
"grad_norm": 0.18568752706050873,
"learning_rate": 3.8383838383838385e-06,
"loss": 1.8531824350357056,
"step": 20
},
{
"epoch": 0.03363914373088685,
"grad_norm": 0.21273010969161987,
"learning_rate": 4.242424242424243e-06,
"loss": 1.9913711547851562,
"step": 22
},
{
"epoch": 0.03669724770642202,
"grad_norm": 0.28394240140914917,
"learning_rate": 4.646464646464647e-06,
"loss": 2.005105495452881,
"step": 24
},
{
"epoch": 0.039755351681957186,
"grad_norm": 0.1409633308649063,
"learning_rate": 5.0505050505050515e-06,
"loss": 1.7043734788894653,
"step": 26
},
{
"epoch": 0.04281345565749235,
"grad_norm": 0.20944717526435852,
"learning_rate": 5.4545454545454545e-06,
"loss": 1.8792476654052734,
"step": 28
},
{
"epoch": 0.045871559633027525,
"grad_norm": 0.352851003408432,
"learning_rate": 5.858585858585859e-06,
"loss": 2.1714513301849365,
"step": 30
},
{
"epoch": 0.04892966360856269,
"grad_norm": 0.1783100664615631,
"learning_rate": 6.262626262626264e-06,
"loss": 1.8468539714813232,
"step": 32
},
{
"epoch": 0.05198776758409786,
"grad_norm": 0.163264200091362,
"learning_rate": 6.666666666666667e-06,
"loss": 1.9490365982055664,
"step": 34
},
{
"epoch": 0.05504587155963303,
"grad_norm": 0.13849298655986786,
"learning_rate": 7.070707070707071e-06,
"loss": 1.7920082807540894,
"step": 36
},
{
"epoch": 0.0581039755351682,
"grad_norm": 0.15713374316692352,
"learning_rate": 7.474747474747476e-06,
"loss": 1.8174947500228882,
"step": 38
},
{
"epoch": 0.06116207951070336,
"grad_norm": 0.5049633979797363,
"learning_rate": 7.87878787878788e-06,
"loss": 1.812342643737793,
"step": 40
},
{
"epoch": 0.06422018348623854,
"grad_norm": 0.17551669478416443,
"learning_rate": 8.282828282828283e-06,
"loss": 1.860647201538086,
"step": 42
},
{
"epoch": 0.0672782874617737,
"grad_norm": 0.23912598192691803,
"learning_rate": 8.686868686868687e-06,
"loss": 1.943070650100708,
"step": 44
},
{
"epoch": 0.07033639143730887,
"grad_norm": 0.24952426552772522,
"learning_rate": 9.090909090909091e-06,
"loss": 2.0365545749664307,
"step": 46
},
{
"epoch": 0.07339449541284404,
"grad_norm": 0.48846498131752014,
"learning_rate": 9.494949494949497e-06,
"loss": 2.0339832305908203,
"step": 48
},
{
"epoch": 0.0764525993883792,
"grad_norm": 0.1678168624639511,
"learning_rate": 9.8989898989899e-06,
"loss": 1.83395254611969,
"step": 50
},
{
"epoch": 0.07951070336391437,
"grad_norm": 0.24815726280212402,
"learning_rate": 1.0303030303030304e-05,
"loss": 1.8216886520385742,
"step": 52
},
{
"epoch": 0.08256880733944955,
"grad_norm": 0.18020159006118774,
"learning_rate": 1.0707070707070708e-05,
"loss": 1.9608432054519653,
"step": 54
},
{
"epoch": 0.0856269113149847,
"grad_norm": 0.13843238353729248,
"learning_rate": 1.1111111111111113e-05,
"loss": 1.8869292736053467,
"step": 56
},
{
"epoch": 0.08868501529051988,
"grad_norm": 0.1923639327287674,
"learning_rate": 1.1515151515151517e-05,
"loss": 1.920361042022705,
"step": 58
},
{
"epoch": 0.09174311926605505,
"grad_norm": 0.42597687244415283,
"learning_rate": 1.191919191919192e-05,
"loss": 1.854645013809204,
"step": 60
},
{
"epoch": 0.09480122324159021,
"grad_norm": 0.14682704210281372,
"learning_rate": 1.2323232323232323e-05,
"loss": 1.785409688949585,
"step": 62
},
{
"epoch": 0.09785932721712538,
"grad_norm": 0.17863060534000397,
"learning_rate": 1.2727272727272728e-05,
"loss": 1.6720788478851318,
"step": 64
},
{
"epoch": 0.10091743119266056,
"grad_norm": 0.19281210005283356,
"learning_rate": 1.3131313131313132e-05,
"loss": 1.7950663566589355,
"step": 66
},
{
"epoch": 0.10397553516819572,
"grad_norm": 0.18981774151325226,
"learning_rate": 1.3535353535353538e-05,
"loss": 1.9100995063781738,
"step": 68
},
{
"epoch": 0.10703363914373089,
"grad_norm": 0.12145815044641495,
"learning_rate": 1.3939393939393942e-05,
"loss": 1.773590087890625,
"step": 70
},
{
"epoch": 0.11009174311926606,
"grad_norm": 0.24626584351062775,
"learning_rate": 1.4343434343434344e-05,
"loss": 1.9656442403793335,
"step": 72
},
{
"epoch": 0.11314984709480122,
"grad_norm": 0.18296314775943756,
"learning_rate": 1.4747474747474747e-05,
"loss": 2.0457160472869873,
"step": 74
},
{
"epoch": 0.1162079510703364,
"grad_norm": 0.9755333065986633,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.8354032039642334,
"step": 76
},
{
"epoch": 0.11926605504587157,
"grad_norm": 0.32339081168174744,
"learning_rate": 1.555555555555556e-05,
"loss": 2.1483542919158936,
"step": 78
},
{
"epoch": 0.12232415902140673,
"grad_norm": 0.15254797041416168,
"learning_rate": 1.595959595959596e-05,
"loss": 1.8094338178634644,
"step": 80
},
{
"epoch": 0.12538226299694188,
"grad_norm": 0.2284584939479828,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.5790177583694458,
"step": 82
},
{
"epoch": 0.12844036697247707,
"grad_norm": 0.14264698326587677,
"learning_rate": 1.6767676767676768e-05,
"loss": 1.726978063583374,
"step": 84
},
{
"epoch": 0.13149847094801223,
"grad_norm": 0.17337217926979065,
"learning_rate": 1.7171717171717173e-05,
"loss": 1.7587239742279053,
"step": 86
},
{
"epoch": 0.1345565749235474,
"grad_norm": 0.17287808656692505,
"learning_rate": 1.7575757575757576e-05,
"loss": 1.6768524646759033,
"step": 88
},
{
"epoch": 0.13761467889908258,
"grad_norm": 0.10812447965145111,
"learning_rate": 1.797979797979798e-05,
"loss": 1.7222340106964111,
"step": 90
},
{
"epoch": 0.14067278287461774,
"grad_norm": 0.12938623130321503,
"learning_rate": 1.8383838383838387e-05,
"loss": 1.7632179260253906,
"step": 92
},
{
"epoch": 0.1437308868501529,
"grad_norm": 0.12679119408130646,
"learning_rate": 1.8787878787878792e-05,
"loss": 1.8440016508102417,
"step": 94
},
{
"epoch": 0.14678899082568808,
"grad_norm": 0.2494174838066101,
"learning_rate": 1.9191919191919194e-05,
"loss": 1.7742732763290405,
"step": 96
},
{
"epoch": 0.14984709480122324,
"grad_norm": 0.10247253626585007,
"learning_rate": 1.9595959595959596e-05,
"loss": 1.709675908088684,
"step": 98
},
{
"epoch": 0.1529051987767584,
"grad_norm": 0.12246488779783249,
"learning_rate": 2e-05,
"loss": 1.8481563329696655,
"step": 100
},
{
"epoch": 0.1559633027522936,
"grad_norm": 0.1887405663728714,
"learning_rate": 1.999994881459676e-05,
"loss": 1.7512952089309692,
"step": 102
},
{
"epoch": 0.15902140672782875,
"grad_norm": 0.14479632675647736,
"learning_rate": 1.9999795258969242e-05,
"loss": 1.7125545740127563,
"step": 104
},
{
"epoch": 0.1620795107033639,
"grad_norm": 0.1601811647415161,
"learning_rate": 1.9999539334864075e-05,
"loss": 1.7362236976623535,
"step": 106
},
{
"epoch": 0.1651376146788991,
"grad_norm": 0.13637490570545197,
"learning_rate": 1.9999181045192272e-05,
"loss": 1.775484561920166,
"step": 108
},
{
"epoch": 0.16819571865443425,
"grad_norm": 0.12772946059703827,
"learning_rate": 1.9998720394029214e-05,
"loss": 1.7764359712600708,
"step": 110
},
{
"epoch": 0.1712538226299694,
"grad_norm": 0.3635137975215912,
"learning_rate": 1.9998157386614592e-05,
"loss": 1.6467103958129883,
"step": 112
},
{
"epoch": 0.1743119266055046,
"grad_norm": 0.13494336605072021,
"learning_rate": 1.999749202935236e-05,
"loss": 1.7608314752578735,
"step": 114
},
{
"epoch": 0.17737003058103976,
"grad_norm": 0.10225271433591843,
"learning_rate": 1.9996724329810635e-05,
"loss": 1.6330885887145996,
"step": 116
},
{
"epoch": 0.18042813455657492,
"grad_norm": 0.15280012786388397,
"learning_rate": 1.999585429672165e-05,
"loss": 1.8569954633712769,
"step": 118
},
{
"epoch": 0.1834862385321101,
"grad_norm": 0.21181870996952057,
"learning_rate": 1.999488193998162e-05,
"loss": 1.684548020362854,
"step": 120
},
{
"epoch": 0.18654434250764526,
"grad_norm": 0.13093267381191254,
"learning_rate": 1.9993807270650653e-05,
"loss": 1.7649790048599243,
"step": 122
},
{
"epoch": 0.18960244648318042,
"grad_norm": 0.1822010576725006,
"learning_rate": 1.9992630300952616e-05,
"loss": 1.772727370262146,
"step": 124
},
{
"epoch": 0.1926605504587156,
"grad_norm": 0.14621378481388092,
"learning_rate": 1.9991351044274984e-05,
"loss": 1.7147706747055054,
"step": 126
},
{
"epoch": 0.19571865443425077,
"grad_norm": 0.1720152050256729,
"learning_rate": 1.9989969515168707e-05,
"loss": 1.6797459125518799,
"step": 128
},
{
"epoch": 0.19877675840978593,
"grad_norm": 0.155650332570076,
"learning_rate": 1.9988485729348042e-05,
"loss": 1.6960707902908325,
"step": 130
},
{
"epoch": 0.2018348623853211,
"grad_norm": 0.1650742143392563,
"learning_rate": 1.998689970369035e-05,
"loss": 1.6708898544311523,
"step": 132
},
{
"epoch": 0.20489296636085627,
"grad_norm": 0.16835254430770874,
"learning_rate": 1.9985211456235943e-05,
"loss": 1.7707663774490356,
"step": 134
},
{
"epoch": 0.20795107033639143,
"grad_norm": 0.1444658637046814,
"learning_rate": 1.9983421006187847e-05,
"loss": 1.6729985475540161,
"step": 136
},
{
"epoch": 0.21100917431192662,
"grad_norm": 0.12244903296232224,
"learning_rate": 1.9981528373911593e-05,
"loss": 1.5752283334732056,
"step": 138
},
{
"epoch": 0.21406727828746178,
"grad_norm": 0.13254615664482117,
"learning_rate": 1.9979533580934997e-05,
"loss": 1.5328928232192993,
"step": 140
},
{
"epoch": 0.21712538226299694,
"grad_norm": 0.11054181307554245,
"learning_rate": 1.9977436649947894e-05,
"loss": 1.6198745965957642,
"step": 142
},
{
"epoch": 0.22018348623853212,
"grad_norm": 0.13271930813789368,
"learning_rate": 1.99752376048019e-05,
"loss": 1.5421935319900513,
"step": 144
},
{
"epoch": 0.22324159021406728,
"grad_norm": 0.13614420592784882,
"learning_rate": 1.997293647051013e-05,
"loss": 1.6007068157196045,
"step": 146
},
{
"epoch": 0.22629969418960244,
"grad_norm": 0.18991592526435852,
"learning_rate": 1.9970533273246915e-05,
"loss": 1.5232698917388916,
"step": 148
},
{
"epoch": 0.22935779816513763,
"grad_norm": 1.61968195438385,
"learning_rate": 1.9968028040347495e-05,
"loss": 1.6931004524230957,
"step": 150
},
{
"epoch": 0.2324159021406728,
"grad_norm": 0.13931135833263397,
"learning_rate": 1.996542080030774e-05,
"loss": 1.7459183931350708,
"step": 152
},
{
"epoch": 0.23547400611620795,
"grad_norm": 0.3036521375179291,
"learning_rate": 1.9962711582783782e-05,
"loss": 1.7661356925964355,
"step": 154
},
{
"epoch": 0.23853211009174313,
"grad_norm": 0.18819795548915863,
"learning_rate": 1.995990041859171e-05,
"loss": 1.6686315536499023,
"step": 156
},
{
"epoch": 0.2415902140672783,
"grad_norm": 0.21112458407878876,
"learning_rate": 1.9956987339707212e-05,
"loss": 1.7403184175491333,
"step": 158
},
{
"epoch": 0.24464831804281345,
"grad_norm": 0.15459787845611572,
"learning_rate": 1.9953972379265195e-05,
"loss": 1.6565409898757935,
"step": 160
},
{
"epoch": 0.24770642201834864,
"grad_norm": 0.1960821896791458,
"learning_rate": 1.9950855571559434e-05,
"loss": 1.7828407287597656,
"step": 162
},
{
"epoch": 0.25076452599388377,
"grad_norm": 0.27554914355278015,
"learning_rate": 1.994763695204216e-05,
"loss": 1.5625255107879639,
"step": 164
},
{
"epoch": 0.25382262996941896,
"grad_norm": 0.32836204767227173,
"learning_rate": 1.9944316557323676e-05,
"loss": 1.7131232023239136,
"step": 166
},
{
"epoch": 0.25688073394495414,
"grad_norm": 0.169178307056427,
"learning_rate": 1.9940894425171923e-05,
"loss": 1.4519933462142944,
"step": 168
},
{
"epoch": 0.2599388379204893,
"grad_norm": 0.21049527823925018,
"learning_rate": 1.9937370594512054e-05,
"loss": 1.6664998531341553,
"step": 170
},
{
"epoch": 0.26299694189602446,
"grad_norm": 0.1832091212272644,
"learning_rate": 1.9933745105426012e-05,
"loss": 1.2968833446502686,
"step": 172
},
{
"epoch": 0.26605504587155965,
"grad_norm": 0.20308178663253784,
"learning_rate": 1.9930017999152035e-05,
"loss": 1.2826684713363647,
"step": 174
},
{
"epoch": 0.2691131498470948,
"grad_norm": 0.42270585894584656,
"learning_rate": 1.9926189318084225e-05,
"loss": 1.5076080560684204,
"step": 176
},
{
"epoch": 0.27217125382262997,
"grad_norm": 0.18236956000328064,
"learning_rate": 1.992225910577205e-05,
"loss": 1.4853439331054688,
"step": 178
},
{
"epoch": 0.27522935779816515,
"grad_norm": 1.155766487121582,
"learning_rate": 1.9918227406919834e-05,
"loss": 1.6843595504760742,
"step": 180
},
{
"epoch": 0.2782874617737003,
"grad_norm": 0.23917227983474731,
"learning_rate": 1.9914094267386282e-05,
"loss": 1.6130729913711548,
"step": 182
},
{
"epoch": 0.28134556574923547,
"grad_norm": 0.16265490651130676,
"learning_rate": 1.9909859734183922e-05,
"loss": 1.3471795320510864,
"step": 184
},
{
"epoch": 0.28440366972477066,
"grad_norm": 0.1218869760632515,
"learning_rate": 1.9905523855478605e-05,
"loss": 1.2923707962036133,
"step": 186
},
{
"epoch": 0.2874617737003058,
"grad_norm": 0.12545672059059143,
"learning_rate": 1.990108668058892e-05,
"loss": 1.4676196575164795,
"step": 188
},
{
"epoch": 0.290519877675841,
"grad_norm": 0.21261335909366608,
"learning_rate": 1.9896548259985677e-05,
"loss": 1.396953821182251,
"step": 190
},
{
"epoch": 0.29357798165137616,
"grad_norm": 0.2187412828207016,
"learning_rate": 1.9891908645291285e-05,
"loss": 1.4410208463668823,
"step": 192
},
{
"epoch": 0.2966360856269113,
"grad_norm": 0.12113740295171738,
"learning_rate": 1.98871678892792e-05,
"loss": 1.6053173542022705,
"step": 194
},
{
"epoch": 0.2996941896024465,
"grad_norm": 0.4814097285270691,
"learning_rate": 1.9882326045873318e-05,
"loss": 1.540165901184082,
"step": 196
},
{
"epoch": 0.30275229357798167,
"grad_norm": 0.1228162944316864,
"learning_rate": 1.9877383170147354e-05,
"loss": 1.2737184762954712,
"step": 198
},
{
"epoch": 0.3058103975535168,
"grad_norm": 0.1502322554588318,
"learning_rate": 1.987233931832421e-05,
"loss": 1.4496139287948608,
"step": 200
},
{
"epoch": 0.308868501529052,
"grad_norm": 0.18339720368385315,
"learning_rate": 1.9867194547775352e-05,
"loss": 1.5935065746307373,
"step": 202
},
{
"epoch": 0.3119266055045872,
"grad_norm": 0.11243823170661926,
"learning_rate": 1.9861948917020147e-05,
"loss": 1.7492598295211792,
"step": 204
},
{
"epoch": 0.3149847094801223,
"grad_norm": 0.13702593743801117,
"learning_rate": 1.98566024857252e-05,
"loss": 1.7874646186828613,
"step": 206
},
{
"epoch": 0.3180428134556575,
"grad_norm": 0.13499164581298828,
"learning_rate": 1.985115531470368e-05,
"loss": 1.877272605895996,
"step": 208
},
{
"epoch": 0.3211009174311927,
"grad_norm": 0.15725181996822357,
"learning_rate": 1.9845607465914617e-05,
"loss": 1.4388186931610107,
"step": 210
},
{
"epoch": 0.3241590214067278,
"grad_norm": 0.11166012287139893,
"learning_rate": 1.9839959002462204e-05,
"loss": 1.5455048084259033,
"step": 212
},
{
"epoch": 0.327217125382263,
"grad_norm": 0.5011727809906006,
"learning_rate": 1.9834209988595086e-05,
"loss": 1.3708510398864746,
"step": 214
},
{
"epoch": 0.3302752293577982,
"grad_norm": 0.09888817369937897,
"learning_rate": 1.982836048970561e-05,
"loss": 1.289783239364624,
"step": 216
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.17895136773586273,
"learning_rate": 1.9822410572329106e-05,
"loss": 1.7309335470199585,
"step": 218
},
{
"epoch": 0.3363914373088685,
"grad_norm": 0.1677575707435608,
"learning_rate": 1.9816360304143107e-05,
"loss": 1.5802499055862427,
"step": 220
},
{
"epoch": 0.3394495412844037,
"grad_norm": 0.15030327439308167,
"learning_rate": 1.98102097539666e-05,
"loss": 1.738713026046753,
"step": 222
},
{
"epoch": 0.3425076452599388,
"grad_norm": 0.16578397154808044,
"learning_rate": 1.9803958991759223e-05,
"loss": 1.2134374380111694,
"step": 224
},
{
"epoch": 0.345565749235474,
"grad_norm": 0.27223920822143555,
"learning_rate": 1.979760808862049e-05,
"loss": 1.493201732635498,
"step": 226
},
{
"epoch": 0.3486238532110092,
"grad_norm": 0.1591493785381317,
"learning_rate": 1.979115711678896e-05,
"loss": 1.7763992547988892,
"step": 228
},
{
"epoch": 0.3516819571865443,
"grad_norm": 0.16624890267848969,
"learning_rate": 1.9784606149641425e-05,
"loss": 1.5089151859283447,
"step": 230
},
{
"epoch": 0.3547400611620795,
"grad_norm": 0.1464773416519165,
"learning_rate": 1.9777955261692096e-05,
"loss": 1.6361924409866333,
"step": 232
},
{
"epoch": 0.3577981651376147,
"grad_norm": 0.21099774539470673,
"learning_rate": 1.977120452859172e-05,
"loss": 1.5302180051803589,
"step": 234
},
{
"epoch": 0.36085626911314983,
"grad_norm": 0.13680118322372437,
"learning_rate": 1.976435402712674e-05,
"loss": 1.4461219310760498,
"step": 236
},
{
"epoch": 0.363914373088685,
"grad_norm": 0.12054964900016785,
"learning_rate": 1.9757403835218416e-05,
"loss": 1.5349360704421997,
"step": 238
},
{
"epoch": 0.3669724770642202,
"grad_norm": 0.1747889220714569,
"learning_rate": 1.9750354031921945e-05,
"loss": 1.5896707773208618,
"step": 240
},
{
"epoch": 0.37003058103975534,
"grad_norm": 0.14816083014011383,
"learning_rate": 1.9743204697425555e-05,
"loss": 1.451588749885559,
"step": 242
},
{
"epoch": 0.3730886850152905,
"grad_norm": 0.1631159782409668,
"learning_rate": 1.9735955913049596e-05,
"loss": 1.5556179285049438,
"step": 244
},
{
"epoch": 0.3761467889908257,
"grad_norm": 0.22455169260501862,
"learning_rate": 1.972860776124561e-05,
"loss": 1.828192949295044,
"step": 246
},
{
"epoch": 0.37920489296636084,
"grad_norm": 0.2173646092414856,
"learning_rate": 1.97211603255954e-05,
"loss": 2.273862838745117,
"step": 248
},
{
"epoch": 0.382262996941896,
"grad_norm": 0.18272121250629425,
"learning_rate": 1.971361369081008e-05,
"loss": 1.4451524019241333,
"step": 250
},
{
"epoch": 0.3853211009174312,
"grad_norm": 0.18003606796264648,
"learning_rate": 1.9705967942729097e-05,
"loss": 1.4289908409118652,
"step": 252
},
{
"epoch": 0.38837920489296635,
"grad_norm": 0.1590876430273056,
"learning_rate": 1.969822316831928e-05,
"loss": 1.4974957704544067,
"step": 254
},
{
"epoch": 0.39143730886850153,
"grad_norm": 0.3142800033092499,
"learning_rate": 1.969037945567383e-05,
"loss": 1.6579951047897339,
"step": 256
},
{
"epoch": 0.3944954128440367,
"grad_norm": 0.16492165625095367,
"learning_rate": 1.9682436894011314e-05,
"loss": 1.5667517185211182,
"step": 258
},
{
"epoch": 0.39755351681957185,
"grad_norm": 0.15221764147281647,
"learning_rate": 1.9674395573674682e-05,
"loss": 1.4063596725463867,
"step": 260
},
{
"epoch": 0.40061162079510704,
"grad_norm": 0.3218546509742737,
"learning_rate": 1.9666255586130196e-05,
"loss": 1.2971922159194946,
"step": 262
},
{
"epoch": 0.4036697247706422,
"grad_norm": 0.10011743754148483,
"learning_rate": 1.9658017023966428e-05,
"loss": 1.218963623046875,
"step": 264
},
{
"epoch": 0.40672782874617736,
"grad_norm": 0.12353604286909103,
"learning_rate": 1.964967998089318e-05,
"loss": 1.3903040885925293,
"step": 266
},
{
"epoch": 0.40978593272171254,
"grad_norm": 0.326667845249176,
"learning_rate": 1.9641244551740438e-05,
"loss": 1.6172282695770264,
"step": 268
},
{
"epoch": 0.41284403669724773,
"grad_norm": 0.26776137948036194,
"learning_rate": 1.9632710832457272e-05,
"loss": 1.6435128450393677,
"step": 270
},
{
"epoch": 0.41590214067278286,
"grad_norm": 0.1551942378282547,
"learning_rate": 1.9624078920110766e-05,
"loss": 1.648958444595337,
"step": 272
},
{
"epoch": 0.41896024464831805,
"grad_norm": 0.17337119579315186,
"learning_rate": 1.9615348912884897e-05,
"loss": 1.6705131530761719,
"step": 274
},
{
"epoch": 0.42201834862385323,
"grad_norm": 0.2657426595687866,
"learning_rate": 1.960652091007944e-05,
"loss": 1.5089123249053955,
"step": 276
},
{
"epoch": 0.42507645259938837,
"grad_norm": 0.2546260952949524,
"learning_rate": 1.9597595012108797e-05,
"loss": 1.6476012468338013,
"step": 278
},
{
"epoch": 0.42813455657492355,
"grad_norm": 0.4865143895149231,
"learning_rate": 1.9588571320500914e-05,
"loss": 1.5658520460128784,
"step": 280
},
{
"epoch": 0.43119266055045874,
"grad_norm": 0.1443634331226349,
"learning_rate": 1.9579449937896067e-05,
"loss": 1.4523909091949463,
"step": 282
},
{
"epoch": 0.43425076452599387,
"grad_norm": 0.2230675220489502,
"learning_rate": 1.957023096804574e-05,
"loss": 1.4425302743911743,
"step": 284
},
{
"epoch": 0.43730886850152906,
"grad_norm": 0.19679437577724457,
"learning_rate": 1.9560914515811416e-05,
"loss": 1.6431429386138916,
"step": 286
},
{
"epoch": 0.44036697247706424,
"grad_norm": 0.3200703561306,
"learning_rate": 1.9551500687163404e-05,
"loss": 1.3619184494018555,
"step": 288
},
{
"epoch": 0.4434250764525994,
"grad_norm": 0.2082187533378601,
"learning_rate": 1.9541989589179608e-05,
"loss": 1.590578317642212,
"step": 290
},
{
"epoch": 0.44648318042813456,
"grad_norm": 0.1061125323176384,
"learning_rate": 1.9532381330044346e-05,
"loss": 1.4594062566757202,
"step": 292
},
{
"epoch": 0.44954128440366975,
"grad_norm": 0.12709374725818634,
"learning_rate": 1.9522676019047084e-05,
"loss": 1.4876629114151,
"step": 294
},
{
"epoch": 0.4525993883792049,
"grad_norm": 0.27205580472946167,
"learning_rate": 1.9512873766581216e-05,
"loss": 1.611258864402771,
"step": 296
},
{
"epoch": 0.45565749235474007,
"grad_norm": 0.21540172398090363,
"learning_rate": 1.9502974684142787e-05,
"loss": 1.6235052347183228,
"step": 298
},
{
"epoch": 0.45871559633027525,
"grad_norm": 0.15379998087882996,
"learning_rate": 1.949297888432926e-05,
"loss": 1.4504597187042236,
"step": 300
},
{
"epoch": 0.4617737003058104,
"grad_norm": 0.15325340628623962,
"learning_rate": 1.9482886480838193e-05,
"loss": 1.3728998899459839,
"step": 302
},
{
"epoch": 0.4648318042813456,
"grad_norm": 0.14561216533184052,
"learning_rate": 1.947269758846597e-05,
"loss": 1.5321768522262573,
"step": 304
},
{
"epoch": 0.46788990825688076,
"grad_norm": 0.1464478224515915,
"learning_rate": 1.9462412323106506e-05,
"loss": 1.4658679962158203,
"step": 306
},
{
"epoch": 0.4709480122324159,
"grad_norm": 0.16449898481369019,
"learning_rate": 1.945203080174989e-05,
"loss": 1.6214468479156494,
"step": 308
},
{
"epoch": 0.4740061162079511,
"grad_norm": 0.11995477229356766,
"learning_rate": 1.94415531424811e-05,
"loss": 1.4710017442703247,
"step": 310
},
{
"epoch": 0.47706422018348627,
"grad_norm": 0.12246715277433395,
"learning_rate": 1.9430979464478618e-05,
"loss": 1.4265179634094238,
"step": 312
},
{
"epoch": 0.4801223241590214,
"grad_norm": 0.1928575336933136,
"learning_rate": 1.9420309888013115e-05,
"loss": 1.4053140878677368,
"step": 314
},
{
"epoch": 0.4831804281345566,
"grad_norm": 0.1138468086719513,
"learning_rate": 1.940954453444604e-05,
"loss": 1.501997709274292,
"step": 316
},
{
"epoch": 0.48623853211009177,
"grad_norm": 0.18235905468463898,
"learning_rate": 1.9398683526228283e-05,
"loss": 1.4911972284317017,
"step": 318
},
{
"epoch": 0.4892966360856269,
"grad_norm": 0.25072285532951355,
"learning_rate": 1.9387726986898753e-05,
"loss": 1.4921306371688843,
"step": 320
},
{
"epoch": 0.4923547400611621,
"grad_norm": 0.14147193729877472,
"learning_rate": 1.9376675041082974e-05,
"loss": 1.6467393636703491,
"step": 322
},
{
"epoch": 0.4954128440366973,
"grad_norm": 0.16818289458751678,
"learning_rate": 1.936552781449168e-05,
"loss": 1.8290669918060303,
"step": 324
},
{
"epoch": 0.4984709480122324,
"grad_norm": 0.1758740097284317,
"learning_rate": 1.935428543391938e-05,
"loss": 1.7090046405792236,
"step": 326
},
{
"epoch": 0.5015290519877675,
"grad_norm": 0.12498500198125839,
"learning_rate": 1.9342948027242923e-05,
"loss": 1.687024474143982,
"step": 328
},
{
"epoch": 0.5045871559633027,
"grad_norm": 0.10503566265106201,
"learning_rate": 1.9331515723420016e-05,
"loss": 1.6114351749420166,
"step": 330
},
{
"epoch": 0.5076452599388379,
"grad_norm": 0.2537882626056671,
"learning_rate": 1.9319988652487794e-05,
"loss": 1.307665228843689,
"step": 332
},
{
"epoch": 0.5107033639143731,
"grad_norm": 0.12492425739765167,
"learning_rate": 1.930836694556131e-05,
"loss": 1.2778944969177246,
"step": 334
},
{
"epoch": 0.5137614678899083,
"grad_norm": 0.13973812758922577,
"learning_rate": 1.929665073483208e-05,
"loss": 1.7095977067947388,
"step": 336
},
{
"epoch": 0.5168195718654435,
"grad_norm": 0.17106389999389648,
"learning_rate": 1.9284840153566533e-05,
"loss": 1.7058213949203491,
"step": 338
},
{
"epoch": 0.5198776758409785,
"grad_norm": 0.20474772155284882,
"learning_rate": 1.9272935336104526e-05,
"loss": 1.788483738899231,
"step": 340
},
{
"epoch": 0.5229357798165137,
"grad_norm": 0.3535018265247345,
"learning_rate": 1.926093641785781e-05,
"loss": 1.7168432474136353,
"step": 342
},
{
"epoch": 0.5259938837920489,
"grad_norm": 0.17024751007556915,
"learning_rate": 1.9248843535308494e-05,
"loss": 1.7606186866760254,
"step": 344
},
{
"epoch": 0.5290519877675841,
"grad_norm": 0.49967944622039795,
"learning_rate": 1.9236656826007483e-05,
"loss": 1.6816507577896118,
"step": 346
},
{
"epoch": 0.5321100917431193,
"grad_norm": 0.5842476487159729,
"learning_rate": 1.9224376428572914e-05,
"loss": 1.6088945865631104,
"step": 348
},
{
"epoch": 0.5351681957186545,
"grad_norm": 0.25457754731178284,
"learning_rate": 1.9212002482688586e-05,
"loss": 1.5031757354736328,
"step": 350
},
{
"epoch": 0.5382262996941896,
"grad_norm": 0.20663785934448242,
"learning_rate": 1.919953512910237e-05,
"loss": 1.7612838745117188,
"step": 352
},
{
"epoch": 0.5412844036697247,
"grad_norm": 0.2548372447490692,
"learning_rate": 1.9186974509624596e-05,
"loss": 1.6867271661758423,
"step": 354
},
{
"epoch": 0.5443425076452599,
"grad_norm": 0.8678698539733887,
"learning_rate": 1.917432076712647e-05,
"loss": 1.2227782011032104,
"step": 356
},
{
"epoch": 0.5474006116207951,
"grad_norm": 0.21147370338439941,
"learning_rate": 1.916157404553841e-05,
"loss": 1.8059334754943848,
"step": 358
},
{
"epoch": 0.5504587155963303,
"grad_norm": 0.1419752538204193,
"learning_rate": 1.914873448984843e-05,
"loss": 1.8255892992019653,
"step": 360
},
{
"epoch": 0.5535168195718655,
"grad_norm": 0.16539934277534485,
"learning_rate": 1.913580224610051e-05,
"loss": 1.8986237049102783,
"step": 362
},
{
"epoch": 0.5565749235474006,
"grad_norm": 0.3205839991569519,
"learning_rate": 1.912277746139288e-05,
"loss": 1.822392225265503,
"step": 364
},
{
"epoch": 0.5596330275229358,
"grad_norm": 0.29032111167907715,
"learning_rate": 1.9109660283876402e-05,
"loss": 1.796310305595398,
"step": 366
},
{
"epoch": 0.5626911314984709,
"grad_norm": 0.1845773160457611,
"learning_rate": 1.909645086275286e-05,
"loss": 1.6601674556732178,
"step": 368
},
{
"epoch": 0.5657492354740061,
"grad_norm": 0.18452957272529602,
"learning_rate": 1.9083149348273267e-05,
"loss": 1.5303943157196045,
"step": 370
},
{
"epoch": 0.5688073394495413,
"grad_norm": 0.2641131281852722,
"learning_rate": 1.906975589173615e-05,
"loss": 1.095422387123108,
"step": 372
},
{
"epoch": 0.5718654434250765,
"grad_norm": 0.14679329097270966,
"learning_rate": 1.9056270645485832e-05,
"loss": 1.0895999670028687,
"step": 374
},
{
"epoch": 0.5749235474006116,
"grad_norm": 0.0957442969083786,
"learning_rate": 1.904269376291071e-05,
"loss": 1.3464511632919312,
"step": 376
},
{
"epoch": 0.5779816513761468,
"grad_norm": 0.1374763697385788,
"learning_rate": 1.9029025398441502e-05,
"loss": 1.2797412872314453,
"step": 378
},
{
"epoch": 0.581039755351682,
"grad_norm": 0.13921880722045898,
"learning_rate": 1.9015265707549475e-05,
"loss": 1.2325642108917236,
"step": 380
},
{
"epoch": 0.5840978593272171,
"grad_norm": 0.09560864418745041,
"learning_rate": 1.9001414846744708e-05,
"loss": 1.2352911233901978,
"step": 382
},
{
"epoch": 0.5871559633027523,
"grad_norm": 0.374447226524353,
"learning_rate": 1.898747297357429e-05,
"loss": 1.5163378715515137,
"step": 384
},
{
"epoch": 0.5902140672782875,
"grad_norm": 0.17736363410949707,
"learning_rate": 1.8973440246620527e-05,
"loss": 1.4300881624221802,
"step": 386
},
{
"epoch": 0.5932721712538226,
"grad_norm": 0.15392173826694489,
"learning_rate": 1.895931682549915e-05,
"loss": 1.245898962020874,
"step": 388
},
{
"epoch": 0.5963302752293578,
"grad_norm": 0.24752528965473175,
"learning_rate": 1.8945102870857502e-05,
"loss": 1.5547707080841064,
"step": 390
},
{
"epoch": 0.599388379204893,
"grad_norm": 0.08202062547206879,
"learning_rate": 1.8930798544372683e-05,
"loss": 1.5418813228607178,
"step": 392
},
{
"epoch": 0.6024464831804281,
"grad_norm": 0.2755289077758789,
"learning_rate": 1.891640400874975e-05,
"loss": 1.3220683336257935,
"step": 394
},
{
"epoch": 0.6055045871559633,
"grad_norm": 0.2508130669593811,
"learning_rate": 1.8901919427719835e-05,
"loss": 1.497948169708252,
"step": 396
},
{
"epoch": 0.6085626911314985,
"grad_norm": 0.18379831314086914,
"learning_rate": 1.8887344966038293e-05,
"loss": 1.6999335289001465,
"step": 398
},
{
"epoch": 0.6116207951070336,
"grad_norm": 0.22920790314674377,
"learning_rate": 1.8872680789482847e-05,
"loss": 1.2188191413879395,
"step": 400
},
{
"epoch": 0.6146788990825688,
"grad_norm": 0.16860808432102203,
"learning_rate": 1.8857927064851663e-05,
"loss": 1.4792815446853638,
"step": 402
},
{
"epoch": 0.617737003058104,
"grad_norm": 0.1248302087187767,
"learning_rate": 1.8843083959961487e-05,
"loss": 1.5485862493515015,
"step": 404
},
{
"epoch": 0.6207951070336392,
"grad_norm": 0.258060097694397,
"learning_rate": 1.8828151643645723e-05,
"loss": 1.4772653579711914,
"step": 406
},
{
"epoch": 0.6238532110091743,
"grad_norm": 0.11846158653497696,
"learning_rate": 1.8813130285752504e-05,
"loss": 1.3414539098739624,
"step": 408
},
{
"epoch": 0.6269113149847095,
"grad_norm": 0.4850642681121826,
"learning_rate": 1.8798020057142787e-05,
"loss": 0.9788084626197815,
"step": 410
},
{
"epoch": 0.6299694189602446,
"grad_norm": 0.15223458409309387,
"learning_rate": 1.8782821129688378e-05,
"loss": 1.6087661981582642,
"step": 412
},
{
"epoch": 0.6330275229357798,
"grad_norm": 0.239080011844635,
"learning_rate": 1.8767533676269994e-05,
"loss": 1.5469305515289307,
"step": 414
},
{
"epoch": 0.636085626911315,
"grad_norm": 0.12797395884990692,
"learning_rate": 1.8752157870775293e-05,
"loss": 1.4467060565948486,
"step": 416
},
{
"epoch": 0.6391437308868502,
"grad_norm": 0.1488076150417328,
"learning_rate": 1.87366938880969e-05,
"loss": 1.5522997379302979,
"step": 418
},
{
"epoch": 0.6422018348623854,
"grad_norm": 0.11536487936973572,
"learning_rate": 1.872114190413041e-05,
"loss": 1.4859579801559448,
"step": 420
},
{
"epoch": 0.6452599388379205,
"grad_norm": 0.15605668723583221,
"learning_rate": 1.87055020957724e-05,
"loss": 1.4567848443984985,
"step": 422
},
{
"epoch": 0.6483180428134556,
"grad_norm": 0.166889950633049,
"learning_rate": 1.86897746409184e-05,
"loss": 1.6576907634735107,
"step": 424
},
{
"epoch": 0.6513761467889908,
"grad_norm": 0.1812712550163269,
"learning_rate": 1.8673959718460877e-05,
"loss": 1.3640563488006592,
"step": 426
},
{
"epoch": 0.654434250764526,
"grad_norm": 0.11476635187864304,
"learning_rate": 1.865805750828721e-05,
"loss": 1.0880959033966064,
"step": 428
},
{
"epoch": 0.6574923547400612,
"grad_norm": 0.19921736419200897,
"learning_rate": 1.8642068191277632e-05,
"loss": 1.7264765501022339,
"step": 430
},
{
"epoch": 0.6605504587155964,
"grad_norm": 0.21837309002876282,
"learning_rate": 1.8625991949303163e-05,
"loss": 1.481621503829956,
"step": 432
},
{
"epoch": 0.6636085626911316,
"grad_norm": 0.14236745238304138,
"learning_rate": 1.8609828965223577e-05,
"loss": 1.6409680843353271,
"step": 434
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.15380296111106873,
"learning_rate": 1.8593579422885282e-05,
"loss": 1.4764446020126343,
"step": 436
},
{
"epoch": 0.6697247706422018,
"grad_norm": 0.22593624889850616,
"learning_rate": 1.857724350711925e-05,
"loss": 1.679504632949829,
"step": 438
},
{
"epoch": 0.672782874617737,
"grad_norm": 0.19800275564193726,
"learning_rate": 1.8560821403738913e-05,
"loss": 1.5476343631744385,
"step": 440
},
{
"epoch": 0.6758409785932722,
"grad_norm": 0.373758465051651,
"learning_rate": 1.854431329953804e-05,
"loss": 1.579092264175415,
"step": 442
},
{
"epoch": 0.6788990825688074,
"grad_norm": 0.22904565930366516,
"learning_rate": 1.852771938228863e-05,
"loss": 1.4693065881729126,
"step": 444
},
{
"epoch": 0.6819571865443425,
"grad_norm": 0.20299802720546722,
"learning_rate": 1.851103984073876e-05,
"loss": 1.129746675491333,
"step": 446
},
{
"epoch": 0.6850152905198776,
"grad_norm": 0.6228739619255066,
"learning_rate": 1.8494274864610442e-05,
"loss": 1.571781039237976,
"step": 448
},
{
"epoch": 0.6880733944954128,
"grad_norm": 0.16735827922821045,
"learning_rate": 1.8477424644597466e-05,
"loss": 1.5506985187530518,
"step": 450
},
{
"epoch": 0.691131498470948,
"grad_norm": 0.18025986850261688,
"learning_rate": 1.8460489372363233e-05,
"loss": 1.5530122518539429,
"step": 452
},
{
"epoch": 0.6941896024464832,
"grad_norm": 0.16754700243473053,
"learning_rate": 1.844346924053858e-05,
"loss": 1.5570106506347656,
"step": 454
},
{
"epoch": 0.6972477064220184,
"grad_norm": 0.14615465700626373,
"learning_rate": 1.842636444271957e-05,
"loss": 1.384924292564392,
"step": 456
},
{
"epoch": 0.7003058103975535,
"grad_norm": 0.1849491447210312,
"learning_rate": 1.8409175173465305e-05,
"loss": 1.5958186388015747,
"step": 458
},
{
"epoch": 0.7033639143730887,
"grad_norm": 0.3052046000957489,
"learning_rate": 1.8391901628295723e-05,
"loss": 1.687294602394104,
"step": 460
},
{
"epoch": 0.7064220183486238,
"grad_norm": 0.23065337538719177,
"learning_rate": 1.8374544003689346e-05,
"loss": 1.4123787879943848,
"step": 462
},
{
"epoch": 0.709480122324159,
"grad_norm": 0.1883080154657364,
"learning_rate": 1.8357102497081068e-05,
"loss": 1.6628724336624146,
"step": 464
},
{
"epoch": 0.7125382262996942,
"grad_norm": 0.2225079983472824,
"learning_rate": 1.8339577306859898e-05,
"loss": 1.4429267644882202,
"step": 466
},
{
"epoch": 0.7155963302752294,
"grad_norm": 0.3100110590457916,
"learning_rate": 1.832196863236671e-05,
"loss": 1.5697983503341675,
"step": 468
},
{
"epoch": 0.7186544342507645,
"grad_norm": 0.2065575122833252,
"learning_rate": 1.830427667389197e-05,
"loss": 1.507987380027771,
"step": 470
},
{
"epoch": 0.7217125382262997,
"grad_norm": 0.14007803797721863,
"learning_rate": 1.8286501632673467e-05,
"loss": 1.6372132301330566,
"step": 472
},
{
"epoch": 0.7247706422018348,
"grad_norm": 0.16152538359165192,
"learning_rate": 1.8268643710894008e-05,
"loss": 1.7255295515060425,
"step": 474
},
{
"epoch": 0.72782874617737,
"grad_norm": 0.16280680894851685,
"learning_rate": 1.8250703111679135e-05,
"loss": 1.619618535041809,
"step": 476
},
{
"epoch": 0.7308868501529052,
"grad_norm": 0.1520742028951645,
"learning_rate": 1.8232680039094807e-05,
"loss": 1.4603939056396484,
"step": 478
},
{
"epoch": 0.7339449541284404,
"grad_norm": 0.20179243385791779,
"learning_rate": 1.821457469814507e-05,
"loss": 1.5757310390472412,
"step": 480
},
{
"epoch": 0.7370030581039755,
"grad_norm": 0.16924500465393066,
"learning_rate": 1.8196387294769744e-05,
"loss": 1.5796566009521484,
"step": 482
},
{
"epoch": 0.7400611620795107,
"grad_norm": 0.4897945821285248,
"learning_rate": 1.8178118035842068e-05,
"loss": 1.5773838758468628,
"step": 484
},
{
"epoch": 0.7431192660550459,
"grad_norm": 0.17398764193058014,
"learning_rate": 1.8159767129166353e-05,
"loss": 1.576639175415039,
"step": 486
},
{
"epoch": 0.746177370030581,
"grad_norm": 0.2364799827337265,
"learning_rate": 1.8141334783475608e-05,
"loss": 1.576772689819336,
"step": 488
},
{
"epoch": 0.7492354740061162,
"grad_norm": 0.4591314196586609,
"learning_rate": 1.8122821208429177e-05,
"loss": 1.371825098991394,
"step": 490
},
{
"epoch": 0.7522935779816514,
"grad_norm": 0.220738023519516,
"learning_rate": 1.8104226614610355e-05,
"loss": 1.4610284566879272,
"step": 492
},
{
"epoch": 0.7553516819571865,
"grad_norm": 0.18214958906173706,
"learning_rate": 1.808555121352398e-05,
"loss": 1.3964916467666626,
"step": 494
},
{
"epoch": 0.7584097859327217,
"grad_norm": 0.23065899312496185,
"learning_rate": 1.806679521759403e-05,
"loss": 1.4891163110733032,
"step": 496
},
{
"epoch": 0.7614678899082569,
"grad_norm": 0.18206670880317688,
"learning_rate": 1.804795884016123e-05,
"loss": 1.5172436237335205,
"step": 498
},
{
"epoch": 0.764525993883792,
"grad_norm": 0.256682813167572,
"learning_rate": 1.802904229548059e-05,
"loss": 1.343529224395752,
"step": 500
},
{
"epoch": 0.7675840978593272,
"grad_norm": 0.20497728884220123,
"learning_rate": 1.8010045798718996e-05,
"loss": 1.0024524927139282,
"step": 502
},
{
"epoch": 0.7706422018348624,
"grad_norm": 1.0883042812347412,
"learning_rate": 1.7990969565952744e-05,
"loss": 1.281205415725708,
"step": 504
},
{
"epoch": 0.7737003058103975,
"grad_norm": 0.1716405302286148,
"learning_rate": 1.7971813814165096e-05,
"loss": 1.3423949480056763,
"step": 506
},
{
"epoch": 0.7767584097859327,
"grad_norm": 0.18879351019859314,
"learning_rate": 1.79525787612438e-05,
"loss": 1.388129472732544,
"step": 508
},
{
"epoch": 0.7798165137614679,
"grad_norm": 0.1109057292342186,
"learning_rate": 1.793326462597862e-05,
"loss": 1.4014989137649536,
"step": 510
},
{
"epoch": 0.7828746177370031,
"grad_norm": 0.08534067124128342,
"learning_rate": 1.7913871628058852e-05,
"loss": 1.4662851095199585,
"step": 512
},
{
"epoch": 0.7859327217125383,
"grad_norm": 0.106519415974617,
"learning_rate": 1.7894399988070804e-05,
"loss": 1.3783475160598755,
"step": 514
},
{
"epoch": 0.7889908256880734,
"grad_norm": 0.09660910069942474,
"learning_rate": 1.7874849927495312e-05,
"loss": 1.4017391204833984,
"step": 516
},
{
"epoch": 0.7920489296636085,
"grad_norm": 0.14216075837612152,
"learning_rate": 1.78552216687052e-05,
"loss": 1.365959882736206,
"step": 518
},
{
"epoch": 0.7951070336391437,
"grad_norm": 0.1868741363286972,
"learning_rate": 1.7835515434962775e-05,
"loss": 1.3767143487930298,
"step": 520
},
{
"epoch": 0.7981651376146789,
"grad_norm": 0.08062469214200974,
"learning_rate": 1.781573145041726e-05,
"loss": 1.330634593963623,
"step": 522
},
{
"epoch": 0.8012232415902141,
"grad_norm": 0.1081153079867363,
"learning_rate": 1.7795869940102256e-05,
"loss": 1.3757586479187012,
"step": 524
},
{
"epoch": 0.8042813455657493,
"grad_norm": 0.35133278369903564,
"learning_rate": 1.77759311299332e-05,
"loss": 1.3196890354156494,
"step": 526
},
{
"epoch": 0.8073394495412844,
"grad_norm": 0.05468370392918587,
"learning_rate": 1.775591524670475e-05,
"loss": 1.183956503868103,
"step": 528
},
{
"epoch": 0.8103975535168195,
"grad_norm": 0.23623742163181305,
"learning_rate": 1.773582251808827e-05,
"loss": 1.2204488515853882,
"step": 530
},
{
"epoch": 0.8134556574923547,
"grad_norm": 0.0645361989736557,
"learning_rate": 1.7715653172629172e-05,
"loss": 1.0032373666763306,
"step": 532
},
{
"epoch": 0.8165137614678899,
"grad_norm": 0.12667137384414673,
"learning_rate": 1.7695407439744367e-05,
"loss": 1.2794767618179321,
"step": 534
},
{
"epoch": 0.8195718654434251,
"grad_norm": 0.10390239208936691,
"learning_rate": 1.7675085549719638e-05,
"loss": 1.3293739557266235,
"step": 536
},
{
"epoch": 0.8226299694189603,
"grad_norm": 0.1387118250131607,
"learning_rate": 1.765468773370701e-05,
"loss": 1.3429309129714966,
"step": 538
},
{
"epoch": 0.8256880733944955,
"grad_norm": 0.0737949088215828,
"learning_rate": 1.7634214223722136e-05,
"loss": 1.3086459636688232,
"step": 540
},
{
"epoch": 0.8287461773700305,
"grad_norm": 0.09957081079483032,
"learning_rate": 1.7613665252641656e-05,
"loss": 1.3245176076889038,
"step": 542
},
{
"epoch": 0.8318042813455657,
"grad_norm": 0.10918257385492325,
"learning_rate": 1.7593041054200535e-05,
"loss": 1.2970821857452393,
"step": 544
},
{
"epoch": 0.8348623853211009,
"grad_norm": 0.08030971139669418,
"learning_rate": 1.757234186298943e-05,
"loss": 1.2689207792282104,
"step": 546
},
{
"epoch": 0.8379204892966361,
"grad_norm": 0.10478982329368591,
"learning_rate": 1.7551567914451982e-05,
"loss": 1.3340964317321777,
"step": 548
},
{
"epoch": 0.8409785932721713,
"grad_norm": 0.3195957541465759,
"learning_rate": 1.7530719444882192e-05,
"loss": 1.3039358854293823,
"step": 550
},
{
"epoch": 0.8440366972477065,
"grad_norm": 0.12156535685062408,
"learning_rate": 1.7509796691421677e-05,
"loss": 1.2771456241607666,
"step": 552
},
{
"epoch": 0.8470948012232415,
"grad_norm": 0.12202958762645721,
"learning_rate": 1.7488799892057012e-05,
"loss": 1.2614632844924927,
"step": 554
},
{
"epoch": 0.8501529051987767,
"grad_norm": 0.10148178786039352,
"learning_rate": 1.746772928561701e-05,
"loss": 1.2788347005844116,
"step": 556
},
{
"epoch": 0.8532110091743119,
"grad_norm": 0.14937330782413483,
"learning_rate": 1.7446585111769994e-05,
"loss": 1.2756626605987549,
"step": 558
},
{
"epoch": 0.8562691131498471,
"grad_norm": 0.0912046805024147,
"learning_rate": 1.7425367611021095e-05,
"loss": 1.2301669120788574,
"step": 560
},
{
"epoch": 0.8593272171253823,
"grad_norm": 0.09572380036115646,
"learning_rate": 1.740407702470949e-05,
"loss": 1.2921226024627686,
"step": 562
},
{
"epoch": 0.8623853211009175,
"grad_norm": 0.12295603007078171,
"learning_rate": 1.738271359500569e-05,
"loss": 1.3243293762207031,
"step": 564
},
{
"epoch": 0.8654434250764526,
"grad_norm": 0.10404468327760696,
"learning_rate": 1.7361277564908746e-05,
"loss": 1.322561502456665,
"step": 566
},
{
"epoch": 0.8685015290519877,
"grad_norm": 0.16732335090637207,
"learning_rate": 1.7339769178243513e-05,
"loss": 1.316751480102539,
"step": 568
},
{
"epoch": 0.8715596330275229,
"grad_norm": 0.3713286519050598,
"learning_rate": 1.7318188679657868e-05,
"loss": 1.3337935209274292,
"step": 570
},
{
"epoch": 0.8746177370030581,
"grad_norm": 0.43876761198043823,
"learning_rate": 1.7296536314619927e-05,
"loss": 1.3191556930541992,
"step": 572
},
{
"epoch": 0.8776758409785933,
"grad_norm": 0.13432131707668304,
"learning_rate": 1.7274812329415256e-05,
"loss": 1.3426291942596436,
"step": 574
},
{
"epoch": 0.8807339449541285,
"grad_norm": 0.10545923560857773,
"learning_rate": 1.725301697114406e-05,
"loss": 1.2635884284973145,
"step": 576
},
{
"epoch": 0.8837920489296636,
"grad_norm": 0.16690693795681,
"learning_rate": 1.7231150487718388e-05,
"loss": 1.3099809885025024,
"step": 578
},
{
"epoch": 0.8868501529051988,
"grad_norm": 0.10333248227834702,
"learning_rate": 1.7209213127859298e-05,
"loss": 1.2673903703689575,
"step": 580
},
{
"epoch": 0.8899082568807339,
"grad_norm": 0.11379291117191315,
"learning_rate": 1.718720514109404e-05,
"loss": 1.2961304187774658,
"step": 582
},
{
"epoch": 0.8929663608562691,
"grad_norm": 0.13509656488895416,
"learning_rate": 1.7165126777753205e-05,
"loss": 1.3033212423324585,
"step": 584
},
{
"epoch": 0.8960244648318043,
"grad_norm": 0.1093890517950058,
"learning_rate": 1.714297828896789e-05,
"loss": 1.3269572257995605,
"step": 586
},
{
"epoch": 0.8990825688073395,
"grad_norm": 0.10886257141828537,
"learning_rate": 1.7120759926666833e-05,
"loss": 1.3087824583053589,
"step": 588
},
{
"epoch": 0.9021406727828746,
"grad_norm": 0.10905133932828903,
"learning_rate": 1.7098471943573554e-05,
"loss": 1.247659683227539,
"step": 590
},
{
"epoch": 0.9051987767584098,
"grad_norm": 2.196148157119751,
"learning_rate": 1.7076114593203477e-05,
"loss": 1.2718437910079956,
"step": 592
},
{
"epoch": 0.908256880733945,
"grad_norm": 0.1503346711397171,
"learning_rate": 1.7053688129861047e-05,
"loss": 1.310463309288025,
"step": 594
},
{
"epoch": 0.9113149847094801,
"grad_norm": 0.17786924540996552,
"learning_rate": 1.703119280863683e-05,
"loss": 1.3340353965759277,
"step": 596
},
{
"epoch": 0.9143730886850153,
"grad_norm": 0.30375856161117554,
"learning_rate": 1.700862888540463e-05,
"loss": 1.3078449964523315,
"step": 598
},
{
"epoch": 0.9174311926605505,
"grad_norm": 0.11228691041469574,
"learning_rate": 1.698599661681855e-05,
"loss": 1.2841458320617676,
"step": 600
},
{
"epoch": 0.9204892966360856,
"grad_norm": 0.09515351802110672,
"learning_rate": 1.6963296260310108e-05,
"loss": 1.2543302774429321,
"step": 602
},
{
"epoch": 0.9235474006116208,
"grad_norm": 0.12382373213768005,
"learning_rate": 1.6940528074085277e-05,
"loss": 1.2844551801681519,
"step": 604
},
{
"epoch": 0.926605504587156,
"grad_norm": 0.10872837156057358,
"learning_rate": 1.6917692317121574e-05,
"loss": 1.3093620538711548,
"step": 606
},
{
"epoch": 0.9296636085626911,
"grad_norm": 0.10462535917758942,
"learning_rate": 1.6894789249165088e-05,
"loss": 1.2586979866027832,
"step": 608
},
{
"epoch": 0.9327217125382263,
"grad_norm": 0.19008956849575043,
"learning_rate": 1.6871819130727543e-05,
"loss": 1.2832432985305786,
"step": 610
},
{
"epoch": 0.9357798165137615,
"grad_norm": 0.28194817900657654,
"learning_rate": 1.6848782223083346e-05,
"loss": 1.2822047472000122,
"step": 612
},
{
"epoch": 0.9388379204892966,
"grad_norm": 0.1536101996898651,
"learning_rate": 1.682567878826657e-05,
"loss": 1.3289425373077393,
"step": 614
},
{
"epoch": 0.9418960244648318,
"grad_norm": 0.10441045463085175,
"learning_rate": 1.6802509089068037e-05,
"loss": 1.3459938764572144,
"step": 616
},
{
"epoch": 0.944954128440367,
"grad_norm": 0.12564106285572052,
"learning_rate": 1.6779273389032268e-05,
"loss": 1.3037604093551636,
"step": 618
},
{
"epoch": 0.9480122324159022,
"grad_norm": 0.1721193939447403,
"learning_rate": 1.675597195245453e-05,
"loss": 1.3236896991729736,
"step": 620
},
{
"epoch": 0.9510703363914373,
"grad_norm": 0.1637701392173767,
"learning_rate": 1.6732605044377804e-05,
"loss": 1.290778636932373,
"step": 622
},
{
"epoch": 0.9541284403669725,
"grad_norm": 0.1784583181142807,
"learning_rate": 1.670917293058979e-05,
"loss": 1.2351161241531372,
"step": 624
},
{
"epoch": 0.9571865443425076,
"grad_norm": 0.16748499870300293,
"learning_rate": 1.668567587761985e-05,
"loss": 1.263080358505249,
"step": 626
},
{
"epoch": 0.9602446483180428,
"grad_norm": 0.2232455611228943,
"learning_rate": 1.6662114152736025e-05,
"loss": 1.2841684818267822,
"step": 628
},
{
"epoch": 0.963302752293578,
"grad_norm": 0.16638226807117462,
"learning_rate": 1.663848802394195e-05,
"loss": 1.258873462677002,
"step": 630
},
{
"epoch": 0.9663608562691132,
"grad_norm": 0.11573649197816849,
"learning_rate": 1.6614797759973834e-05,
"loss": 1.269798755645752,
"step": 632
},
{
"epoch": 0.9694189602446484,
"grad_norm": 0.1356838494539261,
"learning_rate": 1.6591043630297394e-05,
"loss": 1.3235660791397095,
"step": 634
},
{
"epoch": 0.9724770642201835,
"grad_norm": 0.25283753871917725,
"learning_rate": 1.6567225905104785e-05,
"loss": 1.303951382637024,
"step": 636
},
{
"epoch": 0.9755351681957186,
"grad_norm": 0.1507134586572647,
"learning_rate": 1.654334485531153e-05,
"loss": 1.319949746131897,
"step": 638
},
{
"epoch": 0.9785932721712538,
"grad_norm": 0.18651549518108368,
"learning_rate": 1.651940075255345e-05,
"loss": 1.274340271949768,
"step": 640
},
{
"epoch": 0.981651376146789,
"grad_norm": 0.11500085890293121,
"learning_rate": 1.649539386918355e-05,
"loss": 1.2713245153427124,
"step": 642
},
{
"epoch": 0.9847094801223242,
"grad_norm": 0.18969926238059998,
"learning_rate": 1.6471324478268946e-05,
"loss": 1.2918071746826172,
"step": 644
},
{
"epoch": 0.9877675840978594,
"grad_norm": 0.33583584427833557,
"learning_rate": 1.644719285358774e-05,
"loss": 1.3432663679122925,
"step": 646
},
{
"epoch": 0.9908256880733946,
"grad_norm": 0.16686177253723145,
"learning_rate": 1.642299926962593e-05,
"loss": 1.3730149269104004,
"step": 648
},
{
"epoch": 0.9938837920489296,
"grad_norm": 0.2564650774002075,
"learning_rate": 1.639874400157425e-05,
"loss": 2.0030324459075928,
"step": 650
},
{
"epoch": 0.9969418960244648,
"grad_norm": 0.7696052193641663,
"learning_rate": 1.6374427325325078e-05,
"loss": 1.9848356246948242,
"step": 652
},
{
"epoch": 1.0,
"grad_norm": 0.39214804768562317,
"learning_rate": 1.635004951746927e-05,
"loss": 1.952000379562378,
"step": 654
},
{
"epoch": 1.003058103975535,
"grad_norm": 0.26579877734184265,
"learning_rate": 1.632561085529304e-05,
"loss": 1.4564932584762573,
"step": 656
},
{
"epoch": 1.0061162079510704,
"grad_norm": 0.09789416193962097,
"learning_rate": 1.6301111616774778e-05,
"loss": 1.418447494506836,
"step": 658
},
{
"epoch": 1.0091743119266054,
"grad_norm": 0.20214584469795227,
"learning_rate": 1.6276552080581905e-05,
"loss": 1.4369993209838867,
"step": 660
},
{
"epoch": 1.0122324159021407,
"grad_norm": 0.1257963627576828,
"learning_rate": 1.6251932526067705e-05,
"loss": 1.36565363407135,
"step": 662
},
{
"epoch": 1.0152905198776758,
"grad_norm": 0.11626733839511871,
"learning_rate": 1.622725323326814e-05,
"loss": 1.5537068843841553,
"step": 664
},
{
"epoch": 1.018348623853211,
"grad_norm": 0.10738101601600647,
"learning_rate": 1.6202514482898665e-05,
"loss": 1.4378470182418823,
"step": 666
},
{
"epoch": 1.0214067278287462,
"grad_norm": 0.10652212798595428,
"learning_rate": 1.617771655635104e-05,
"loss": 1.3821481466293335,
"step": 668
},
{
"epoch": 1.0244648318042813,
"grad_norm": 0.1619665026664734,
"learning_rate": 1.615285973569012e-05,
"loss": 1.4641979932785034,
"step": 670
},
{
"epoch": 1.0275229357798166,
"grad_norm": 0.0878918468952179,
"learning_rate": 1.6127944303650665e-05,
"loss": 1.3324640989303589,
"step": 672
},
{
"epoch": 1.0305810397553516,
"grad_norm": 0.12292367219924927,
"learning_rate": 1.61029705436341e-05,
"loss": 1.4031749963760376,
"step": 674
},
{
"epoch": 1.033639143730887,
"grad_norm": 0.201751247048378,
"learning_rate": 1.607793873970531e-05,
"loss": 1.5163370370864868,
"step": 676
},
{
"epoch": 1.036697247706422,
"grad_norm": 0.1418297439813614,
"learning_rate": 1.6052849176589402e-05,
"loss": 1.4317665100097656,
"step": 678
},
{
"epoch": 1.039755351681957,
"grad_norm": 0.12598387897014618,
"learning_rate": 1.6027702139668467e-05,
"loss": 1.2285103797912598,
"step": 680
},
{
"epoch": 1.0428134556574924,
"grad_norm": 0.13934196531772614,
"learning_rate": 1.600249791497833e-05,
"loss": 1.3943346738815308,
"step": 682
},
{
"epoch": 1.0458715596330275,
"grad_norm": 0.26236793398857117,
"learning_rate": 1.5977236789205305e-05,
"loss": 1.6373569965362549,
"step": 684
},
{
"epoch": 1.0489296636085628,
"grad_norm": 0.1907891184091568,
"learning_rate": 1.595191904968293e-05,
"loss": 1.3017511367797852,
"step": 686
},
{
"epoch": 1.0519877675840978,
"grad_norm": 0.14807197451591492,
"learning_rate": 1.592654498438869e-05,
"loss": 1.4270445108413696,
"step": 688
},
{
"epoch": 1.0550458715596331,
"grad_norm": 0.0998062789440155,
"learning_rate": 1.5901114881940755e-05,
"loss": 1.318896770477295,
"step": 690
},
{
"epoch": 1.0581039755351682,
"grad_norm": 0.11834441125392914,
"learning_rate": 1.5875629031594695e-05,
"loss": 1.3081272840499878,
"step": 692
},
{
"epoch": 1.0611620795107033,
"grad_norm": 0.18614527583122253,
"learning_rate": 1.585008772324018e-05,
"loss": 1.31736159324646,
"step": 694
},
{
"epoch": 1.0642201834862386,
"grad_norm": 0.1135493814945221,
"learning_rate": 1.5824491247397693e-05,
"loss": 1.3496593236923218,
"step": 696
},
{
"epoch": 1.0672782874617737,
"grad_norm": 0.10874086618423462,
"learning_rate": 1.5798839895215222e-05,
"loss": 1.533265233039856,
"step": 698
},
{
"epoch": 1.070336391437309,
"grad_norm": 0.2451164424419403,
"learning_rate": 1.5773133958464943e-05,
"loss": 1.5562183856964111,
"step": 700
},
{
"epoch": 1.073394495412844,
"grad_norm": 0.16339105367660522,
"learning_rate": 1.574737372953991e-05,
"loss": 1.3957347869873047,
"step": 702
},
{
"epoch": 1.0764525993883791,
"grad_norm": 0.5332375764846802,
"learning_rate": 1.5721559501450725e-05,
"loss": 1.3628031015396118,
"step": 704
},
{
"epoch": 1.0795107033639144,
"grad_norm": 0.16171111166477203,
"learning_rate": 1.56956915678222e-05,
"loss": 1.3461381196975708,
"step": 706
},
{
"epoch": 1.0825688073394495,
"grad_norm": 0.2861359417438507,
"learning_rate": 1.5669770222890033e-05,
"loss": 1.4217514991760254,
"step": 708
},
{
"epoch": 1.0856269113149848,
"grad_norm": 0.28001871705055237,
"learning_rate": 1.564379576149744e-05,
"loss": 1.3966766595840454,
"step": 710
},
{
"epoch": 1.0886850152905199,
"grad_norm": 0.2379312813282013,
"learning_rate": 1.561776847909182e-05,
"loss": 1.4265292882919312,
"step": 712
},
{
"epoch": 1.091743119266055,
"grad_norm": 0.17415215075016022,
"learning_rate": 1.5591688671721382e-05,
"loss": 1.4175716638565063,
"step": 714
},
{
"epoch": 1.0948012232415902,
"grad_norm": 0.1268319934606552,
"learning_rate": 1.5565556636031784e-05,
"loss": 1.2845792770385742,
"step": 716
},
{
"epoch": 1.0978593272171253,
"grad_norm": 0.11549645662307739,
"learning_rate": 1.553937266926275e-05,
"loss": 1.2449455261230469,
"step": 718
},
{
"epoch": 1.1009174311926606,
"grad_norm": 0.12862299382686615,
"learning_rate": 1.551313706924471e-05,
"loss": 1.400179147720337,
"step": 720
},
{
"epoch": 1.1039755351681957,
"grad_norm": 0.219200998544693,
"learning_rate": 1.5486850134395386e-05,
"loss": 1.521613359451294,
"step": 722
},
{
"epoch": 1.107033639143731,
"grad_norm": 0.24224898219108582,
"learning_rate": 1.5460512163716413e-05,
"loss": 1.3650974035263062,
"step": 724
},
{
"epoch": 1.110091743119266,
"grad_norm": 0.20005325973033905,
"learning_rate": 1.5434123456789935e-05,
"loss": 1.5584301948547363,
"step": 726
},
{
"epoch": 1.1131498470948011,
"grad_norm": 0.19992578029632568,
"learning_rate": 1.54076843137752e-05,
"loss": 1.5724374055862427,
"step": 728
},
{
"epoch": 1.1162079510703364,
"grad_norm": 0.20727121829986572,
"learning_rate": 1.5381195035405138e-05,
"loss": 1.335442066192627,
"step": 730
},
{
"epoch": 1.1192660550458715,
"grad_norm": 0.23304890096187592,
"learning_rate": 1.535465592298295e-05,
"loss": 1.309862494468689,
"step": 732
},
{
"epoch": 1.1223241590214068,
"grad_norm": 0.11819928884506226,
"learning_rate": 1.5328067278378672e-05,
"loss": 1.4315496683120728,
"step": 734
},
{
"epoch": 1.1253822629969419,
"grad_norm": 0.17882581055164337,
"learning_rate": 1.5301429404025752e-05,
"loss": 1.2392085790634155,
"step": 736
},
{
"epoch": 1.1284403669724772,
"grad_norm": 0.13341587781906128,
"learning_rate": 1.5274742602917594e-05,
"loss": 1.329017996788025,
"step": 738
},
{
"epoch": 1.1314984709480123,
"grad_norm": 0.7237756848335266,
"learning_rate": 1.5248007178604125e-05,
"loss": 1.2809618711471558,
"step": 740
},
{
"epoch": 1.1345565749235473,
"grad_norm": 0.17202049493789673,
"learning_rate": 1.5221223435188346e-05,
"loss": 1.3543680906295776,
"step": 742
},
{
"epoch": 1.1376146788990826,
"grad_norm": 0.207048237323761,
"learning_rate": 1.5194391677322852e-05,
"loss": 1.2619272470474243,
"step": 744
},
{
"epoch": 1.1406727828746177,
"grad_norm": 0.10717354714870453,
"learning_rate": 1.516751221020639e-05,
"loss": 1.3127491474151611,
"step": 746
},
{
"epoch": 1.143730886850153,
"grad_norm": 0.21290293335914612,
"learning_rate": 1.5140585339580372e-05,
"loss": 1.4598865509033203,
"step": 748
},
{
"epoch": 1.146788990825688,
"grad_norm": 0.11727327853441238,
"learning_rate": 1.5113611371725405e-05,
"loss": 1.3367241621017456,
"step": 750
},
{
"epoch": 1.1498470948012232,
"grad_norm": 0.18146538734436035,
"learning_rate": 1.5086590613457808e-05,
"loss": 1.2867947816848755,
"step": 752
},
{
"epoch": 1.1529051987767585,
"grad_norm": 0.16245512664318085,
"learning_rate": 1.5059523372126112e-05,
"loss": 1.509689211845398,
"step": 754
},
{
"epoch": 1.1559633027522935,
"grad_norm": 0.2850814759731293,
"learning_rate": 1.5032409955607578e-05,
"loss": 1.3928141593933105,
"step": 756
},
{
"epoch": 1.1590214067278288,
"grad_norm": 0.0971517339348793,
"learning_rate": 1.5005250672304687e-05,
"loss": 1.3027610778808594,
"step": 758
},
{
"epoch": 1.162079510703364,
"grad_norm": 0.14859257638454437,
"learning_rate": 1.4978045831141626e-05,
"loss": 1.409261703491211,
"step": 760
},
{
"epoch": 1.165137614678899,
"grad_norm": 0.1967761367559433,
"learning_rate": 1.4950795741560793e-05,
"loss": 1.3910449743270874,
"step": 762
},
{
"epoch": 1.1681957186544343,
"grad_norm": 0.11445457488298416,
"learning_rate": 1.4923500713519259e-05,
"loss": 1.4382435083389282,
"step": 764
},
{
"epoch": 1.1712538226299694,
"grad_norm": 0.1770360916852951,
"learning_rate": 1.4896161057485248e-05,
"loss": 1.3225045204162598,
"step": 766
},
{
"epoch": 1.1743119266055047,
"grad_norm": 0.22804246842861176,
"learning_rate": 1.4868777084434607e-05,
"loss": 1.4178036451339722,
"step": 768
},
{
"epoch": 1.1773700305810397,
"grad_norm": 0.1586090326309204,
"learning_rate": 1.4841349105847275e-05,
"loss": 1.3072750568389893,
"step": 770
},
{
"epoch": 1.1804281345565748,
"grad_norm": 0.2554394602775574,
"learning_rate": 1.4813877433703723e-05,
"loss": 1.6272152662277222,
"step": 772
},
{
"epoch": 1.18348623853211,
"grad_norm": 0.2529110610485077,
"learning_rate": 1.4786362380481427e-05,
"loss": 1.3914612531661987,
"step": 774
},
{
"epoch": 1.1865443425076452,
"grad_norm": 0.13387520611286163,
"learning_rate": 1.475880425915129e-05,
"loss": 1.4425235986709595,
"step": 776
},
{
"epoch": 1.1896024464831805,
"grad_norm": 0.14389349520206451,
"learning_rate": 1.4731203383174109e-05,
"loss": 1.4251344203948975,
"step": 778
},
{
"epoch": 1.1926605504587156,
"grad_norm": 0.13777580857276917,
"learning_rate": 1.4703560066496982e-05,
"loss": 1.3878992795944214,
"step": 780
},
{
"epoch": 1.1957186544342508,
"grad_norm": 0.1566278487443924,
"learning_rate": 1.467587462354976e-05,
"loss": 1.3862793445587158,
"step": 782
},
{
"epoch": 1.198776758409786,
"grad_norm": 0.13618597388267517,
"learning_rate": 1.4648147369241452e-05,
"loss": 1.3946163654327393,
"step": 784
},
{
"epoch": 1.2018348623853212,
"grad_norm": 0.192558154463768,
"learning_rate": 1.4620378618956663e-05,
"loss": 1.3836755752563477,
"step": 786
},
{
"epoch": 1.2048929663608563,
"grad_norm": 0.13239143788814545,
"learning_rate": 1.4592568688551982e-05,
"loss": 1.4628338813781738,
"step": 788
},
{
"epoch": 1.2079510703363914,
"grad_norm": 0.17559094727039337,
"learning_rate": 1.4564717894352414e-05,
"loss": 1.3984802961349487,
"step": 790
},
{
"epoch": 1.2110091743119267,
"grad_norm": 0.23624469339847565,
"learning_rate": 1.4536826553147762e-05,
"loss": 1.2977172136306763,
"step": 792
},
{
"epoch": 1.2140672782874617,
"grad_norm": 0.1116776168346405,
"learning_rate": 1.450889498218904e-05,
"loss": 1.2677760124206543,
"step": 794
},
{
"epoch": 1.217125382262997,
"grad_norm": 0.12481163442134857,
"learning_rate": 1.4480923499184851e-05,
"loss": 1.318403720855713,
"step": 796
},
{
"epoch": 1.2201834862385321,
"grad_norm": 0.13348488509655,
"learning_rate": 1.4452912422297783e-05,
"loss": 1.2807520627975464,
"step": 798
},
{
"epoch": 1.2232415902140672,
"grad_norm": 0.11903538554906845,
"learning_rate": 1.4424862070140782e-05,
"loss": 1.3351408243179321,
"step": 800
},
{
"epoch": 1.2262996941896025,
"grad_norm": 0.15349438786506653,
"learning_rate": 1.439677276177353e-05,
"loss": 1.293311357498169,
"step": 802
},
{
"epoch": 1.2293577981651376,
"grad_norm": 0.8263071179389954,
"learning_rate": 1.4368644816698831e-05,
"loss": 1.5547124147415161,
"step": 804
},
{
"epoch": 1.2324159021406729,
"grad_norm": 0.15303385257720947,
"learning_rate": 1.4340478554858948e-05,
"loss": 1.5602731704711914,
"step": 806
},
{
"epoch": 1.235474006116208,
"grad_norm": 0.18527425825595856,
"learning_rate": 1.4312274296631986e-05,
"loss": 1.6077568531036377,
"step": 808
},
{
"epoch": 1.238532110091743,
"grad_norm": 0.1729760617017746,
"learning_rate": 1.428403236282824e-05,
"loss": 1.5241308212280273,
"step": 810
},
{
"epoch": 1.2415902140672783,
"grad_norm": 0.1840292066335678,
"learning_rate": 1.4255753074686554e-05,
"loss": 1.5814104080200195,
"step": 812
},
{
"epoch": 1.2446483180428134,
"grad_norm": 0.14792004227638245,
"learning_rate": 1.4227436753870645e-05,
"loss": 1.469613790512085,
"step": 814
},
{
"epoch": 1.2477064220183487,
"grad_norm": 0.2009243667125702,
"learning_rate": 1.4199083722465473e-05,
"loss": 1.575390338897705,
"step": 816
},
{
"epoch": 1.2507645259938838,
"grad_norm": 0.18052861094474792,
"learning_rate": 1.4170694302973558e-05,
"loss": 1.3542118072509766,
"step": 818
},
{
"epoch": 1.2538226299694188,
"grad_norm": 0.25438863039016724,
"learning_rate": 1.4142268818311318e-05,
"loss": 1.4728432893753052,
"step": 820
},
{
"epoch": 1.2568807339449541,
"grad_norm": 0.1723494827747345,
"learning_rate": 1.4113807591805403e-05,
"loss": 1.2633655071258545,
"step": 822
},
{
"epoch": 1.2599388379204892,
"grad_norm": 0.1885925829410553,
"learning_rate": 1.408531094718899e-05,
"loss": 1.4775038957595825,
"step": 824
},
{
"epoch": 1.2629969418960245,
"grad_norm": 0.17333459854125977,
"learning_rate": 1.4056779208598148e-05,
"loss": 1.0904916524887085,
"step": 826
},
{
"epoch": 1.2660550458715596,
"grad_norm": 0.23691454529762268,
"learning_rate": 1.40282127005681e-05,
"loss": 1.0575618743896484,
"step": 828
},
{
"epoch": 1.2691131498470947,
"grad_norm": 0.19390061497688293,
"learning_rate": 1.3999611748029567e-05,
"loss": 1.275180697441101,
"step": 830
},
{
"epoch": 1.27217125382263,
"grad_norm": 0.30520501732826233,
"learning_rate": 1.3970976676305057e-05,
"loss": 1.3261710405349731,
"step": 832
},
{
"epoch": 1.2752293577981653,
"grad_norm": 0.339356929063797,
"learning_rate": 1.3942307811105174e-05,
"loss": 1.4489119052886963,
"step": 834
},
{
"epoch": 1.2782874617737003,
"grad_norm": 0.31700074672698975,
"learning_rate": 1.3913605478524893e-05,
"loss": 1.375788927078247,
"step": 836
},
{
"epoch": 1.2813455657492354,
"grad_norm": 0.1832026243209839,
"learning_rate": 1.3884870005039876e-05,
"loss": 1.1603834629058838,
"step": 838
},
{
"epoch": 1.2844036697247707,
"grad_norm": 0.16457735002040863,
"learning_rate": 1.3856101717502745e-05,
"loss": 1.1277961730957031,
"step": 840
},
{
"epoch": 1.2874617737003058,
"grad_norm": 0.40114447474479675,
"learning_rate": 1.3827300943139368e-05,
"loss": 1.3474359512329102,
"step": 842
},
{
"epoch": 1.290519877675841,
"grad_norm": 0.3017509877681732,
"learning_rate": 1.3798468009545132e-05,
"loss": 1.258968472480774,
"step": 844
},
{
"epoch": 1.2935779816513762,
"grad_norm": 0.2743600308895111,
"learning_rate": 1.3769603244681224e-05,
"loss": 1.216719388961792,
"step": 846
},
{
"epoch": 1.2966360856269112,
"grad_norm": 0.28029799461364746,
"learning_rate": 1.3740706976870894e-05,
"loss": 1.4269287586212158,
"step": 848
},
{
"epoch": 1.2996941896024465,
"grad_norm": 0.34066903591156006,
"learning_rate": 1.3711779534795726e-05,
"loss": 1.4010690450668335,
"step": 850
},
{
"epoch": 1.3027522935779816,
"grad_norm": 0.1889609545469284,
"learning_rate": 1.3682821247491888e-05,
"loss": 1.128299355506897,
"step": 852
},
{
"epoch": 1.305810397553517,
"grad_norm": 0.13515017926692963,
"learning_rate": 1.365383244434641e-05,
"loss": 1.3370952606201172,
"step": 854
},
{
"epoch": 1.308868501529052,
"grad_norm": 0.2397429496049881,
"learning_rate": 1.3624813455093426e-05,
"loss": 1.4829299449920654,
"step": 856
},
{
"epoch": 1.311926605504587,
"grad_norm": 0.1517391949892044,
"learning_rate": 1.3595764609810409e-05,
"loss": 1.6207728385925293,
"step": 858
},
{
"epoch": 1.3149847094801224,
"grad_norm": 0.14302465319633484,
"learning_rate": 1.3566686238914442e-05,
"loss": 1.6425838470458984,
"step": 860
},
{
"epoch": 1.3180428134556574,
"grad_norm": 0.13647185266017914,
"learning_rate": 1.3537578673158447e-05,
"loss": 1.7624022960662842,
"step": 862
},
{
"epoch": 1.3211009174311927,
"grad_norm": 0.15156084299087524,
"learning_rate": 1.3508442243627414e-05,
"loss": 1.3077445030212402,
"step": 864
},
{
"epoch": 1.3241590214067278,
"grad_norm": 0.16375960409641266,
"learning_rate": 1.3479277281734665e-05,
"loss": 1.387640118598938,
"step": 866
},
{
"epoch": 1.3272171253822629,
"grad_norm": 0.13412417471408844,
"learning_rate": 1.345008411921804e-05,
"loss": 1.2285324335098267,
"step": 868
},
{
"epoch": 1.3302752293577982,
"grad_norm": 0.09673840552568436,
"learning_rate": 1.342086308813617e-05,
"loss": 1.1523722410202026,
"step": 870
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.24676595628261566,
"learning_rate": 1.3391614520864665e-05,
"loss": 1.5277175903320312,
"step": 872
},
{
"epoch": 1.3363914373088686,
"grad_norm": 0.1344609558582306,
"learning_rate": 1.3362338750092345e-05,
"loss": 1.4308637380599976,
"step": 874
},
{
"epoch": 1.3394495412844036,
"grad_norm": 0.18598589301109314,
"learning_rate": 1.3333036108817468e-05,
"loss": 1.6210284233093262,
"step": 876
},
{
"epoch": 1.3425076452599387,
"grad_norm": 0.22593791782855988,
"learning_rate": 1.330370693034392e-05,
"loss": 1.092522144317627,
"step": 878
},
{
"epoch": 1.345565749235474,
"grad_norm": 0.20722009241580963,
"learning_rate": 1.3274351548277444e-05,
"loss": 1.3288477659225464,
"step": 880
},
{
"epoch": 1.3486238532110093,
"grad_norm": 0.3268890976905823,
"learning_rate": 1.3244970296521832e-05,
"loss": 1.5945448875427246,
"step": 882
},
{
"epoch": 1.3516819571865444,
"grad_norm": 0.13788160681724548,
"learning_rate": 1.3215563509275134e-05,
"loss": 1.3352100849151611,
"step": 884
},
{
"epoch": 1.3547400611620795,
"grad_norm": 0.16543197631835938,
"learning_rate": 1.3186131521025848e-05,
"loss": 1.4955792427062988,
"step": 886
},
{
"epoch": 1.3577981651376148,
"grad_norm": 0.21935272216796875,
"learning_rate": 1.3156674666549131e-05,
"loss": 1.3688589334487915,
"step": 888
},
{
"epoch": 1.3608562691131498,
"grad_norm": 0.15441736578941345,
"learning_rate": 1.3127193280902977e-05,
"loss": 1.2942179441452026,
"step": 890
},
{
"epoch": 1.3639143730886851,
"grad_norm": 0.13213297724723816,
"learning_rate": 1.3097687699424411e-05,
"loss": 1.3552772998809814,
"step": 892
},
{
"epoch": 1.3669724770642202,
"grad_norm": 0.17006580531597137,
"learning_rate": 1.306815825772567e-05,
"loss": 1.413638949394226,
"step": 894
},
{
"epoch": 1.3700305810397553,
"grad_norm": 0.1837081015110016,
"learning_rate": 1.3038605291690401e-05,
"loss": 1.2679128646850586,
"step": 896
},
{
"epoch": 1.3730886850152906,
"grad_norm": 0.11259419471025467,
"learning_rate": 1.300902913746982e-05,
"loss": 1.3821946382522583,
"step": 898
},
{
"epoch": 1.3761467889908257,
"grad_norm": 0.2403029501438141,
"learning_rate": 1.2979430131478895e-05,
"loss": 1.6863353252410889,
"step": 900
},
{
"epoch": 1.379204892966361,
"grad_norm": 0.31875738501548767,
"learning_rate": 1.2949808610392536e-05,
"loss": 2.005361318588257,
"step": 902
},
{
"epoch": 1.382262996941896,
"grad_norm": 0.16625094413757324,
"learning_rate": 1.2920164911141739e-05,
"loss": 1.313822865486145,
"step": 904
},
{
"epoch": 1.385321100917431,
"grad_norm": 0.1672082543373108,
"learning_rate": 1.289049937090977e-05,
"loss": 1.3048782348632812,
"step": 906
},
{
"epoch": 1.3883792048929664,
"grad_norm": 0.17883723974227905,
"learning_rate": 1.2860812327128329e-05,
"loss": 1.370732307434082,
"step": 908
},
{
"epoch": 1.3914373088685015,
"grad_norm": 0.34061485528945923,
"learning_rate": 1.2831104117473708e-05,
"loss": 1.4522621631622314,
"step": 910
},
{
"epoch": 1.3944954128440368,
"grad_norm": 0.2218160331249237,
"learning_rate": 1.2801375079862941e-05,
"loss": 1.3893041610717773,
"step": 912
},
{
"epoch": 1.3975535168195719,
"grad_norm": 0.19148877263069153,
"learning_rate": 1.2771625552449989e-05,
"loss": 1.2561511993408203,
"step": 914
},
{
"epoch": 1.400611620795107,
"grad_norm": 0.14251859486103058,
"learning_rate": 1.2741855873621853e-05,
"loss": 1.1840838193893433,
"step": 916
},
{
"epoch": 1.4036697247706422,
"grad_norm": 0.09337247163057327,
"learning_rate": 1.2712066381994771e-05,
"loss": 1.1102322340011597,
"step": 918
},
{
"epoch": 1.4067278287461773,
"grad_norm": 0.11131396144628525,
"learning_rate": 1.2682257416410324e-05,
"loss": 1.2444500923156738,
"step": 920
},
{
"epoch": 1.4097859327217126,
"grad_norm": 0.1941288560628891,
"learning_rate": 1.2652429315931607e-05,
"loss": 1.4857803583145142,
"step": 922
},
{
"epoch": 1.4128440366972477,
"grad_norm": 0.2464672476053238,
"learning_rate": 1.2622582419839364e-05,
"loss": 1.5179466009140015,
"step": 924
},
{
"epoch": 1.4159021406727827,
"grad_norm": 0.40092432498931885,
"learning_rate": 1.259271706762813e-05,
"loss": 1.49838387966156,
"step": 926
},
{
"epoch": 1.418960244648318,
"grad_norm": 0.18188023567199707,
"learning_rate": 1.2562833599002376e-05,
"loss": 1.5421233177185059,
"step": 928
},
{
"epoch": 1.4220183486238533,
"grad_norm": 0.16129463911056519,
"learning_rate": 1.2532932353872626e-05,
"loss": 1.3665364980697632,
"step": 930
},
{
"epoch": 1.4250764525993884,
"grad_norm": 0.36101001501083374,
"learning_rate": 1.2503013672351614e-05,
"loss": 1.4816341400146484,
"step": 932
},
{
"epoch": 1.4281345565749235,
"grad_norm": 0.3823509216308594,
"learning_rate": 1.2473077894750406e-05,
"loss": 1.381542682647705,
"step": 934
},
{
"epoch": 1.4311926605504588,
"grad_norm": 0.2345127910375595,
"learning_rate": 1.2443125361574516e-05,
"loss": 1.3099571466445923,
"step": 936
},
{
"epoch": 1.4342507645259939,
"grad_norm": 0.18030600249767303,
"learning_rate": 1.241315641352006e-05,
"loss": 1.3212041854858398,
"step": 938
},
{
"epoch": 1.4373088685015292,
"grad_norm": 0.2411879003047943,
"learning_rate": 1.238317139146985e-05,
"loss": 1.51852548122406,
"step": 940
},
{
"epoch": 1.4403669724770642,
"grad_norm": 0.2498498558998108,
"learning_rate": 1.235317063648955e-05,
"loss": 1.2771378755569458,
"step": 942
},
{
"epoch": 1.4434250764525993,
"grad_norm": 0.29515254497528076,
"learning_rate": 1.2323154489823766e-05,
"loss": 1.4409904479980469,
"step": 944
},
{
"epoch": 1.4464831804281346,
"grad_norm": 0.23676812648773193,
"learning_rate": 1.2293123292892176e-05,
"loss": 1.3410083055496216,
"step": 946
},
{
"epoch": 1.4495412844036697,
"grad_norm": 0.20357346534729004,
"learning_rate": 1.2263077387285656e-05,
"loss": 1.3485143184661865,
"step": 948
},
{
"epoch": 1.452599388379205,
"grad_norm": 0.21395525336265564,
"learning_rate": 1.2233017114762383e-05,
"loss": 1.5053271055221558,
"step": 950
},
{
"epoch": 1.45565749235474,
"grad_norm": 0.14866068959236145,
"learning_rate": 1.2202942817243945e-05,
"loss": 1.5033762454986572,
"step": 952
},
{
"epoch": 1.4587155963302751,
"grad_norm": 0.1509505659341812,
"learning_rate": 1.217285483681147e-05,
"loss": 1.32400381565094,
"step": 954
},
{
"epoch": 1.4617737003058104,
"grad_norm": 0.2283921092748642,
"learning_rate": 1.2142753515701715e-05,
"loss": 1.2706825733184814,
"step": 956
},
{
"epoch": 1.4648318042813455,
"grad_norm": 0.19454039633274078,
"learning_rate": 1.2112639196303177e-05,
"loss": 1.403527021408081,
"step": 958
},
{
"epoch": 1.4678899082568808,
"grad_norm": 0.19340156018733978,
"learning_rate": 1.2082512221152211e-05,
"loss": 1.3496915102005005,
"step": 960
},
{
"epoch": 1.470948012232416,
"grad_norm": 0.28565388917922974,
"learning_rate": 1.2052372932929124e-05,
"loss": 1.5003544092178345,
"step": 962
},
{
"epoch": 1.474006116207951,
"grad_norm": 0.2407606989145279,
"learning_rate": 1.2022221674454276e-05,
"loss": 1.3663601875305176,
"step": 964
},
{
"epoch": 1.4770642201834863,
"grad_norm": 0.24161574244499207,
"learning_rate": 1.1992058788684178e-05,
"loss": 1.311216950416565,
"step": 966
},
{
"epoch": 1.4801223241590213,
"grad_norm": 0.6812675595283508,
"learning_rate": 1.1961884618707606e-05,
"loss": 1.2701431512832642,
"step": 968
},
{
"epoch": 1.4831804281345566,
"grad_norm": 0.15686868131160736,
"learning_rate": 1.1931699507741681e-05,
"loss": 1.4032564163208008,
"step": 970
},
{
"epoch": 1.4862385321100917,
"grad_norm": 0.29336854815483093,
"learning_rate": 1.1901503799127978e-05,
"loss": 1.3898736238479614,
"step": 972
},
{
"epoch": 1.4892966360856268,
"grad_norm": 0.21223317086696625,
"learning_rate": 1.1871297836328615e-05,
"loss": 1.357151746749878,
"step": 974
},
{
"epoch": 1.492354740061162,
"grad_norm": 0.3396297097206116,
"learning_rate": 1.1841081962922339e-05,
"loss": 1.5504416227340698,
"step": 976
},
{
"epoch": 1.4954128440366974,
"grad_norm": 0.15393349528312683,
"learning_rate": 1.1810856522600633e-05,
"loss": 1.7415130138397217,
"step": 978
},
{
"epoch": 1.4984709480122325,
"grad_norm": 0.17905078828334808,
"learning_rate": 1.1780621859163799e-05,
"loss": 1.6198244094848633,
"step": 980
},
{
"epoch": 1.5015290519877675,
"grad_norm": 0.19629204273223877,
"learning_rate": 1.1750378316517042e-05,
"loss": 1.6088056564331055,
"step": 982
},
{
"epoch": 1.5045871559633026,
"grad_norm": 0.13399019837379456,
"learning_rate": 1.1720126238666574e-05,
"loss": 1.5122345685958862,
"step": 984
},
{
"epoch": 1.507645259938838,
"grad_norm": 0.205459326505661,
"learning_rate": 1.1689865969715682e-05,
"loss": 1.183510661125183,
"step": 986
},
{
"epoch": 1.5107033639143732,
"grad_norm": 0.19832122325897217,
"learning_rate": 1.1659597853860822e-05,
"loss": 1.1826776266098022,
"step": 988
},
{
"epoch": 1.5137614678899083,
"grad_norm": 0.217342808842659,
"learning_rate": 1.1629322235387712e-05,
"loss": 1.5985417366027832,
"step": 990
},
{
"epoch": 1.5168195718654434,
"grad_norm": 0.21424347162246704,
"learning_rate": 1.1599039458667404e-05,
"loss": 1.5874334573745728,
"step": 992
},
{
"epoch": 1.5198776758409784,
"grad_norm": 0.21046607196331024,
"learning_rate": 1.1568749868152376e-05,
"loss": 1.6973150968551636,
"step": 994
},
{
"epoch": 1.5229357798165137,
"grad_norm": 0.22607657313346863,
"learning_rate": 1.1538453808372601e-05,
"loss": 1.6257494688034058,
"step": 996
},
{
"epoch": 1.525993883792049,
"grad_norm": 0.2784031927585602,
"learning_rate": 1.1508151623931652e-05,
"loss": 1.6771817207336426,
"step": 998
},
{
"epoch": 1.529051987767584,
"grad_norm": 0.3850814700126648,
"learning_rate": 1.1477843659502748e-05,
"loss": 1.5088847875595093,
"step": 1000
},
{
"epoch": 1.5321100917431192,
"grad_norm": 0.389369398355484,
"learning_rate": 1.1447530259824867e-05,
"loss": 1.507793664932251,
"step": 1002
},
{
"epoch": 1.5351681957186545,
"grad_norm": 0.596508800983429,
"learning_rate": 1.1417211769698803e-05,
"loss": 1.3859407901763916,
"step": 1004
},
{
"epoch": 1.5382262996941896,
"grad_norm": 0.34961917996406555,
"learning_rate": 1.1386888533983263e-05,
"loss": 1.6432298421859741,
"step": 1006
},
{
"epoch": 1.5412844036697249,
"grad_norm": 0.3803319036960602,
"learning_rate": 1.1356560897590914e-05,
"loss": 1.4720622301101685,
"step": 1008
},
{
"epoch": 1.54434250764526,
"grad_norm": 0.18877731263637543,
"learning_rate": 1.1326229205484494e-05,
"loss": 1.1493924856185913,
"step": 1010
},
{
"epoch": 1.547400611620795,
"grad_norm": 0.29330477118492126,
"learning_rate": 1.1295893802672867e-05,
"loss": 1.6615839004516602,
"step": 1012
},
{
"epoch": 1.5504587155963303,
"grad_norm": 0.21881945431232452,
"learning_rate": 1.1265555034207103e-05,
"loss": 1.749032974243164,
"step": 1014
},
{
"epoch": 1.5535168195718656,
"grad_norm": 0.28221604228019714,
"learning_rate": 1.1235213245176564e-05,
"loss": 1.8175487518310547,
"step": 1016
},
{
"epoch": 1.5565749235474007,
"grad_norm": 0.2191021740436554,
"learning_rate": 1.1204868780704952e-05,
"loss": 1.7448463439941406,
"step": 1018
},
{
"epoch": 1.5596330275229358,
"grad_norm": 0.2853221893310547,
"learning_rate": 1.117452198594642e-05,
"loss": 1.7113560438156128,
"step": 1020
},
{
"epoch": 1.5626911314984708,
"grad_norm": 0.30422237515449524,
"learning_rate": 1.1144173206081619e-05,
"loss": 1.5799381732940674,
"step": 1022
},
{
"epoch": 1.5657492354740061,
"grad_norm": 0.2810017466545105,
"learning_rate": 1.111382278631377e-05,
"loss": 1.4372574090957642,
"step": 1024
},
{
"epoch": 1.5688073394495414,
"grad_norm": 0.2811414897441864,
"learning_rate": 1.1083471071864766e-05,
"loss": 0.9963301420211792,
"step": 1026
},
{
"epoch": 1.5718654434250765,
"grad_norm": 0.18028278648853302,
"learning_rate": 1.105311840797121e-05,
"loss": 0.9933477640151978,
"step": 1028
},
{
"epoch": 1.5749235474006116,
"grad_norm": 0.1357865184545517,
"learning_rate": 1.1022765139880517e-05,
"loss": 1.221966028213501,
"step": 1030
},
{
"epoch": 1.5779816513761467,
"grad_norm": 0.14980663359165192,
"learning_rate": 1.0992411612846962e-05,
"loss": 1.1657860279083252,
"step": 1032
},
{
"epoch": 1.581039755351682,
"grad_norm": 0.18947246670722961,
"learning_rate": 1.0962058172127774e-05,
"loss": 1.1021173000335693,
"step": 1034
},
{
"epoch": 1.5840978593272173,
"grad_norm": 0.12680017948150635,
"learning_rate": 1.0931705162979203e-05,
"loss": 1.1529592275619507,
"step": 1036
},
{
"epoch": 1.5871559633027523,
"grad_norm": 0.31863412261009216,
"learning_rate": 1.090135293065258e-05,
"loss": 1.38621985912323,
"step": 1038
},
{
"epoch": 1.5902140672782874,
"grad_norm": 0.18417641520500183,
"learning_rate": 1.0871001820390406e-05,
"loss": 1.3420405387878418,
"step": 1040
},
{
"epoch": 1.5932721712538225,
"grad_norm": 0.1281033605337143,
"learning_rate": 1.0840652177422418e-05,
"loss": 1.1849461793899536,
"step": 1042
},
{
"epoch": 1.5963302752293578,
"grad_norm": 0.3351232409477234,
"learning_rate": 1.0810304346961666e-05,
"loss": 1.3912733793258667,
"step": 1044
},
{
"epoch": 1.599388379204893,
"grad_norm": 0.15044710040092468,
"learning_rate": 1.0779958674200577e-05,
"loss": 1.4560588598251343,
"step": 1046
},
{
"epoch": 1.6024464831804281,
"grad_norm": 0.2629024386405945,
"learning_rate": 1.0749615504307044e-05,
"loss": 1.2233479022979736,
"step": 1048
},
{
"epoch": 1.6055045871559632,
"grad_norm": 0.23600272834300995,
"learning_rate": 1.0719275182420484e-05,
"loss": 1.4159035682678223,
"step": 1050
},
{
"epoch": 1.6085626911314985,
"grad_norm": 1.801208734512329,
"learning_rate": 1.0688938053647919e-05,
"loss": 1.6346092224121094,
"step": 1052
},
{
"epoch": 1.6116207951070336,
"grad_norm": 0.21306540071964264,
"learning_rate": 1.0658604463060059e-05,
"loss": 1.1381094455718994,
"step": 1054
},
{
"epoch": 1.614678899082569,
"grad_norm": 0.20768539607524872,
"learning_rate": 1.062827475568736e-05,
"loss": 1.3859405517578125,
"step": 1056
},
{
"epoch": 1.617737003058104,
"grad_norm": 0.12193287909030914,
"learning_rate": 1.059794927651611e-05,
"loss": 1.4775490760803223,
"step": 1058
},
{
"epoch": 1.620795107033639,
"grad_norm": 0.2431986778974533,
"learning_rate": 1.0567628370484503e-05,
"loss": 1.3453044891357422,
"step": 1060
},
{
"epoch": 1.6238532110091743,
"grad_norm": 0.1215621829032898,
"learning_rate": 1.0537312382478721e-05,
"loss": 1.2004613876342773,
"step": 1062
},
{
"epoch": 1.6269113149847096,
"grad_norm": 0.59897381067276,
"learning_rate": 1.0507001657329003e-05,
"loss": 0.810043215751648,
"step": 1064
},
{
"epoch": 1.6299694189602447,
"grad_norm": 0.19294363260269165,
"learning_rate": 1.047669653980572e-05,
"loss": 1.5248199701309204,
"step": 1066
},
{
"epoch": 1.6330275229357798,
"grad_norm": 0.27523353695869446,
"learning_rate": 1.0446397374615466e-05,
"loss": 1.4369803667068481,
"step": 1068
},
{
"epoch": 1.6360856269113149,
"grad_norm": 0.1609538048505783,
"learning_rate": 1.0416104506397127e-05,
"loss": 1.3286679983139038,
"step": 1070
},
{
"epoch": 1.6391437308868502,
"grad_norm": 0.30778738856315613,
"learning_rate": 1.0385818279717963e-05,
"loss": 1.4519555568695068,
"step": 1072
},
{
"epoch": 1.6422018348623855,
"grad_norm": 0.15987016260623932,
"learning_rate": 1.0355539039069692e-05,
"loss": 1.389966368675232,
"step": 1074
},
{
"epoch": 1.6452599388379205,
"grad_norm": 0.15622813999652863,
"learning_rate": 1.032526712886457e-05,
"loss": 1.352725625038147,
"step": 1076
},
{
"epoch": 1.6483180428134556,
"grad_norm": 0.36926910281181335,
"learning_rate": 1.0295002893431465e-05,
"loss": 1.5491305589675903,
"step": 1078
},
{
"epoch": 1.6513761467889907,
"grad_norm": 0.2159455269575119,
"learning_rate": 1.0264746677011957e-05,
"loss": 1.1885015964508057,
"step": 1080
},
{
"epoch": 1.654434250764526,
"grad_norm": 0.16300049424171448,
"learning_rate": 1.0234498823756409e-05,
"loss": 0.9729296565055847,
"step": 1082
},
{
"epoch": 1.6574923547400613,
"grad_norm": 0.2133103460073471,
"learning_rate": 1.020425967772006e-05,
"loss": 1.6012141704559326,
"step": 1084
},
{
"epoch": 1.6605504587155964,
"grad_norm": 0.21847309172153473,
"learning_rate": 1.0174029582859104e-05,
"loss": 1.391322135925293,
"step": 1086
},
{
"epoch": 1.6636085626911314,
"grad_norm": 0.2506503462791443,
"learning_rate": 1.0143808883026785e-05,
"loss": 1.5196988582611084,
"step": 1088
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.19972681999206543,
"learning_rate": 1.0113597921969482e-05,
"loss": 1.3515968322753906,
"step": 1090
},
{
"epoch": 1.6697247706422018,
"grad_norm": 0.20533211529254913,
"learning_rate": 1.0083397043322802e-05,
"loss": 1.5942573547363281,
"step": 1092
},
{
"epoch": 1.6727828746177371,
"grad_norm": 0.16217419505119324,
"learning_rate": 1.0053206590607667e-05,
"loss": 1.4510009288787842,
"step": 1094
},
{
"epoch": 1.6758409785932722,
"grad_norm": 0.3597978353500366,
"learning_rate": 1.002302690722641e-05,
"loss": 1.4673590660095215,
"step": 1096
},
{
"epoch": 1.6788990825688073,
"grad_norm": 0.23254692554473877,
"learning_rate": 9.992858336458863e-06,
"loss": 1.3361679315567017,
"step": 1098
},
{
"epoch": 1.6819571865443423,
"grad_norm": 0.36849313974380493,
"learning_rate": 9.962701221458468e-06,
"loss": 1.006578803062439,
"step": 1100
},
{
"epoch": 1.6850152905198776,
"grad_norm": 0.8467805981636047,
"learning_rate": 9.932555905248359e-06,
"loss": 1.468936800956726,
"step": 1102
},
{
"epoch": 1.688073394495413,
"grad_norm": 0.3970109820365906,
"learning_rate": 9.902422730717447e-06,
"loss": 1.4364168643951416,
"step": 1104
},
{
"epoch": 1.691131498470948,
"grad_norm": 0.2257806360721588,
"learning_rate": 9.872302040616564e-06,
"loss": 1.458873987197876,
"step": 1106
},
{
"epoch": 1.694189602446483,
"grad_norm": 0.3083685040473938,
"learning_rate": 9.842194177554522e-06,
"loss": 1.4550821781158447,
"step": 1108
},
{
"epoch": 1.6972477064220184,
"grad_norm": 0.15511758625507355,
"learning_rate": 9.812099483994237e-06,
"loss": 1.2649579048156738,
"step": 1110
},
{
"epoch": 1.7003058103975535,
"grad_norm": 0.18423175811767578,
"learning_rate": 9.782018302248823e-06,
"loss": 1.490966558456421,
"step": 1112
},
{
"epoch": 1.7033639143730888,
"grad_norm": 0.2790970206260681,
"learning_rate": 9.751950974477706e-06,
"loss": 1.5698015689849854,
"step": 1114
},
{
"epoch": 1.7064220183486238,
"grad_norm": 0.2892129123210907,
"learning_rate": 9.721897842682733e-06,
"loss": 1.2804194688796997,
"step": 1116
},
{
"epoch": 1.709480122324159,
"grad_norm": 0.38898563385009766,
"learning_rate": 9.691859248704271e-06,
"loss": 1.5459824800491333,
"step": 1118
},
{
"epoch": 1.7125382262996942,
"grad_norm": 0.2886713147163391,
"learning_rate": 9.661835534217332e-06,
"loss": 1.3230183124542236,
"step": 1120
},
{
"epoch": 1.7155963302752295,
"grad_norm": 0.1743742972612381,
"learning_rate": 9.631827040727679e-06,
"loss": 1.4699349403381348,
"step": 1122
},
{
"epoch": 1.7186544342507646,
"grad_norm": 0.23916998505592346,
"learning_rate": 9.601834109567942e-06,
"loss": 1.4023747444152832,
"step": 1124
},
{
"epoch": 1.7217125382262997,
"grad_norm": 0.16461840271949768,
"learning_rate": 9.571857081893739e-06,
"loss": 1.5375235080718994,
"step": 1126
},
{
"epoch": 1.7247706422018347,
"grad_norm": 0.1851189136505127,
"learning_rate": 9.541896298679794e-06,
"loss": 1.6321358680725098,
"step": 1128
},
{
"epoch": 1.72782874617737,
"grad_norm": 0.2205365002155304,
"learning_rate": 9.511952100716051e-06,
"loss": 1.5237758159637451,
"step": 1130
},
{
"epoch": 1.7308868501529053,
"grad_norm": 0.21927416324615479,
"learning_rate": 9.482024828603813e-06,
"loss": 1.351357340812683,
"step": 1132
},
{
"epoch": 1.7339449541284404,
"grad_norm": 0.14752982556819916,
"learning_rate": 9.452114822751854e-06,
"loss": 1.465145230293274,
"step": 1134
},
{
"epoch": 1.7370030581039755,
"grad_norm": 0.1750420182943344,
"learning_rate": 9.422222423372557e-06,
"loss": 1.4611705541610718,
"step": 1136
},
{
"epoch": 1.7400611620795106,
"grad_norm": 0.2836110293865204,
"learning_rate": 9.392347970478035e-06,
"loss": 1.4611400365829468,
"step": 1138
},
{
"epoch": 1.7431192660550459,
"grad_norm": 0.4955594837665558,
"learning_rate": 9.362491803876267e-06,
"loss": 1.486251950263977,
"step": 1140
},
{
"epoch": 1.7461773700305812,
"grad_norm": 0.1564791202545166,
"learning_rate": 9.332654263167242e-06,
"loss": 1.4786931276321411,
"step": 1142
},
{
"epoch": 1.7492354740061162,
"grad_norm": 0.21143656969070435,
"learning_rate": 9.30283568773908e-06,
"loss": 1.2680423259735107,
"step": 1144
},
{
"epoch": 1.7522935779816513,
"grad_norm": 0.21294501423835754,
"learning_rate": 9.273036416764182e-06,
"loss": 1.3377047777175903,
"step": 1146
},
{
"epoch": 1.7553516819571864,
"grad_norm": 0.6836228966712952,
"learning_rate": 9.243256789195374e-06,
"loss": 1.2617762088775635,
"step": 1148
},
{
"epoch": 1.7584097859327217,
"grad_norm": 0.21216517686843872,
"learning_rate": 9.213497143762036e-06,
"loss": 1.3481056690216064,
"step": 1150
},
{
"epoch": 1.761467889908257,
"grad_norm": 0.21078504621982574,
"learning_rate": 9.18375781896628e-06,
"loss": 1.321746587753296,
"step": 1152
},
{
"epoch": 1.764525993883792,
"grad_norm": 0.4202200472354889,
"learning_rate": 9.154039153079054e-06,
"loss": 1.193610668182373,
"step": 1154
},
{
"epoch": 1.7675840978593271,
"grad_norm": 0.1040918305516243,
"learning_rate": 9.12434148413635e-06,
"loss": 0.8776851892471313,
"step": 1156
},
{
"epoch": 1.7706422018348624,
"grad_norm": 0.1633862406015396,
"learning_rate": 9.094665149935307e-06,
"loss": 1.1803946495056152,
"step": 1158
},
{
"epoch": 1.7737003058103975,
"grad_norm": 0.16494464874267578,
"learning_rate": 9.065010488030397e-06,
"loss": 1.2437876462936401,
"step": 1160
},
{
"epoch": 1.7767584097859328,
"grad_norm": 0.15143194794654846,
"learning_rate": 9.035377835729588e-06,
"loss": 1.2997621297836304,
"step": 1162
},
{
"epoch": 1.7798165137614679,
"grad_norm": 0.18123859167099,
"learning_rate": 9.005767530090489e-06,
"loss": 1.3120684623718262,
"step": 1164
},
{
"epoch": 1.782874617737003,
"grad_norm": 0.1501539647579193,
"learning_rate": 8.976179907916528e-06,
"loss": 1.3702399730682373,
"step": 1166
},
{
"epoch": 1.7859327217125383,
"grad_norm": 0.11408812552690506,
"learning_rate": 8.946615305753127e-06,
"loss": 1.2892541885375977,
"step": 1168
},
{
"epoch": 1.7889908256880735,
"grad_norm": 0.11254674196243286,
"learning_rate": 8.917074059883852e-06,
"loss": 1.308501124382019,
"step": 1170
},
{
"epoch": 1.7920489296636086,
"grad_norm": 0.14027130603790283,
"learning_rate": 8.887556506326615e-06,
"loss": 1.2796250581741333,
"step": 1172
},
{
"epoch": 1.7951070336391437,
"grad_norm": 0.1700981706380844,
"learning_rate": 8.858062980829838e-06,
"loss": 1.2928704023361206,
"step": 1174
},
{
"epoch": 1.7981651376146788,
"grad_norm": 0.18721206486225128,
"learning_rate": 8.828593818868622e-06,
"loss": 1.2547078132629395,
"step": 1176
},
{
"epoch": 1.801223241590214,
"grad_norm": 0.17795971035957336,
"learning_rate": 8.799149355640961e-06,
"loss": 1.2972519397735596,
"step": 1178
},
{
"epoch": 1.8042813455657494,
"grad_norm": 0.22595000267028809,
"learning_rate": 8.769729926063904e-06,
"loss": 1.2488983869552612,
"step": 1180
},
{
"epoch": 1.8073394495412844,
"grad_norm": 0.07708975672721863,
"learning_rate": 8.740335864769747e-06,
"loss": 1.1340965032577515,
"step": 1182
},
{
"epoch": 1.8103975535168195,
"grad_norm": 0.28360649943351746,
"learning_rate": 8.71096750610225e-06,
"loss": 1.1929219961166382,
"step": 1184
},
{
"epoch": 1.8134556574923546,
"grad_norm": 0.06053072586655617,
"learning_rate": 8.681625184112803e-06,
"loss": 0.976668655872345,
"step": 1186
},
{
"epoch": 1.81651376146789,
"grad_norm": 0.10698408633470535,
"learning_rate": 8.652309232556651e-06,
"loss": 1.198448896408081,
"step": 1188
},
{
"epoch": 1.8195718654434252,
"grad_norm": 0.08489919453859329,
"learning_rate": 8.623019984889078e-06,
"loss": 1.2306997776031494,
"step": 1190
},
{
"epoch": 1.8226299694189603,
"grad_norm": 0.11401335895061493,
"learning_rate": 8.593757774261638e-06,
"loss": 1.244828462600708,
"step": 1192
},
{
"epoch": 1.8256880733944953,
"grad_norm": 0.1129307895898819,
"learning_rate": 8.56452293351833e-06,
"loss": 1.2172696590423584,
"step": 1194
},
{
"epoch": 1.8287461773700304,
"grad_norm": 0.11858973652124405,
"learning_rate": 8.535315795191858e-06,
"loss": 1.2330491542816162,
"step": 1196
},
{
"epoch": 1.8318042813455657,
"grad_norm": 0.1406945139169693,
"learning_rate": 8.506136691499805e-06,
"loss": 1.2052946090698242,
"step": 1198
},
{
"epoch": 1.834862385321101,
"grad_norm": 0.09237688034772873,
"learning_rate": 8.476985954340877e-06,
"loss": 1.1807793378829956,
"step": 1200
},
{
"epoch": 1.837920489296636,
"grad_norm": 0.09736914187669754,
"learning_rate": 8.447863915291133e-06,
"loss": 1.2424228191375732,
"step": 1202
},
{
"epoch": 1.8409785932721712,
"grad_norm": 0.10175871849060059,
"learning_rate": 8.418770905600191e-06,
"loss": 1.2142945528030396,
"step": 1204
},
{
"epoch": 1.8440366972477065,
"grad_norm": 0.1257796734571457,
"learning_rate": 8.389707256187484e-06,
"loss": 1.188672661781311,
"step": 1206
},
{
"epoch": 1.8470948012232415,
"grad_norm": 0.12363885343074799,
"learning_rate": 8.360673297638484e-06,
"loss": 1.177275538444519,
"step": 1208
},
{
"epoch": 1.8501529051987768,
"grad_norm": 0.188978374004364,
"learning_rate": 8.331669360200937e-06,
"loss": 1.2045999765396118,
"step": 1210
},
{
"epoch": 1.853211009174312,
"grad_norm": 0.1187102198600769,
"learning_rate": 8.302695773781124e-06,
"loss": 1.1938765048980713,
"step": 1212
},
{
"epoch": 1.856269113149847,
"grad_norm": 0.13880708813667297,
"learning_rate": 8.273752867940081e-06,
"loss": 1.1506744623184204,
"step": 1214
},
{
"epoch": 1.8593272171253823,
"grad_norm": 0.13549721240997314,
"learning_rate": 8.244840971889885e-06,
"loss": 1.2162597179412842,
"step": 1216
},
{
"epoch": 1.8623853211009176,
"grad_norm": 0.25943684577941895,
"learning_rate": 8.215960414489873e-06,
"loss": 1.2452492713928223,
"step": 1218
},
{
"epoch": 1.8654434250764527,
"grad_norm": 0.12404591590166092,
"learning_rate": 8.187111524242938e-06,
"loss": 1.2478712797164917,
"step": 1220
},
{
"epoch": 1.8685015290519877,
"grad_norm": 0.13704179227352142,
"learning_rate": 8.15829462929176e-06,
"loss": 1.2387086153030396,
"step": 1222
},
{
"epoch": 1.8715596330275228,
"grad_norm": 0.11567935347557068,
"learning_rate": 8.129510057415091e-06,
"loss": 1.257190227508545,
"step": 1224
},
{
"epoch": 1.8746177370030581,
"grad_norm": 0.6642646789550781,
"learning_rate": 8.100758136024027e-06,
"loss": 1.239441990852356,
"step": 1226
},
{
"epoch": 1.8776758409785934,
"grad_norm": 0.17069149017333984,
"learning_rate": 8.072039192158272e-06,
"loss": 1.2593530416488647,
"step": 1228
},
{
"epoch": 1.8807339449541285,
"grad_norm": 0.14976222813129425,
"learning_rate": 8.043353552482435e-06,
"loss": 1.1915827989578247,
"step": 1230
},
{
"epoch": 1.8837920489296636,
"grad_norm": 0.2598312497138977,
"learning_rate": 8.014701543282302e-06,
"loss": 1.2334834337234497,
"step": 1232
},
{
"epoch": 1.8868501529051986,
"grad_norm": 0.1409589648246765,
"learning_rate": 7.986083490461124e-06,
"loss": 1.1979966163635254,
"step": 1234
},
{
"epoch": 1.889908256880734,
"grad_norm": 0.1519106924533844,
"learning_rate": 7.957499719535922e-06,
"loss": 1.224788784980774,
"step": 1236
},
{
"epoch": 1.8929663608562692,
"grad_norm": 0.14410492777824402,
"learning_rate": 7.928950555633767e-06,
"loss": 1.2332391738891602,
"step": 1238
},
{
"epoch": 1.8960244648318043,
"grad_norm": 0.15549832582473755,
"learning_rate": 7.900436323488098e-06,
"loss": 1.252604603767395,
"step": 1240
},
{
"epoch": 1.8990825688073394,
"grad_norm": 0.11451930552721024,
"learning_rate": 7.871957347435025e-06,
"loss": 1.2335529327392578,
"step": 1242
},
{
"epoch": 1.9021406727828745,
"grad_norm": 0.13380469381809235,
"learning_rate": 7.843513951409618e-06,
"loss": 1.1709686517715454,
"step": 1244
},
{
"epoch": 1.9051987767584098,
"grad_norm": 0.12940165400505066,
"learning_rate": 7.815106458942265e-06,
"loss": 1.211228609085083,
"step": 1246
},
{
"epoch": 1.908256880733945,
"grad_norm": 0.1105319932103157,
"learning_rate": 7.78673519315495e-06,
"loss": 1.2363883256912231,
"step": 1248
},
{
"epoch": 1.9113149847094801,
"grad_norm": 0.10502426326274872,
"learning_rate": 7.758400476757609e-06,
"loss": 1.262728214263916,
"step": 1250
},
{
"epoch": 1.9143730886850152,
"grad_norm": 0.1284359097480774,
"learning_rate": 7.73010263204443e-06,
"loss": 1.234637975692749,
"step": 1252
},
{
"epoch": 1.9174311926605505,
"grad_norm": 0.19072557985782623,
"learning_rate": 7.70184198089022e-06,
"loss": 1.2130069732666016,
"step": 1254
},
{
"epoch": 1.9204892966360856,
"grad_norm": 0.15589360892772675,
"learning_rate": 7.673618844746709e-06,
"loss": 1.1858221292495728,
"step": 1256
},
{
"epoch": 1.9235474006116209,
"grad_norm": 0.12436363846063614,
"learning_rate": 7.645433544638926e-06,
"loss": 1.2181921005249023,
"step": 1258
},
{
"epoch": 1.926605504587156,
"grad_norm": 0.1155422255396843,
"learning_rate": 7.617286401161523e-06,
"loss": 1.2427356243133545,
"step": 1260
},
{
"epoch": 1.929663608562691,
"grad_norm": 0.10325782001018524,
"learning_rate": 7.589177734475148e-06,
"loss": 1.192740797996521,
"step": 1262
},
{
"epoch": 1.9327217125382263,
"grad_norm": 0.1549818515777588,
"learning_rate": 7.561107864302784e-06,
"loss": 1.2155026197433472,
"step": 1264
},
{
"epoch": 1.9357798165137616,
"grad_norm": 0.11942024528980255,
"learning_rate": 7.533077109926124e-06,
"loss": 1.2154382467269897,
"step": 1266
},
{
"epoch": 1.9388379204892967,
"grad_norm": 0.12078223377466202,
"learning_rate": 7.505085790181938e-06,
"loss": 1.2634786367416382,
"step": 1268
},
{
"epoch": 1.9418960244648318,
"grad_norm": 0.22967126965522766,
"learning_rate": 7.477134223458449e-06,
"loss": 1.273752212524414,
"step": 1270
},
{
"epoch": 1.9449541284403669,
"grad_norm": 0.11903467774391174,
"learning_rate": 7.4492227276917e-06,
"loss": 1.234639048576355,
"step": 1272
},
{
"epoch": 1.9480122324159022,
"grad_norm": 0.11574736982584,
"learning_rate": 7.421351620361954e-06,
"loss": 1.2574361562728882,
"step": 1274
},
{
"epoch": 1.9510703363914375,
"grad_norm": 0.11663772165775299,
"learning_rate": 7.39352121849007e-06,
"loss": 1.2257782220840454,
"step": 1276
},
{
"epoch": 1.9541284403669725,
"grad_norm": 0.1800021380186081,
"learning_rate": 7.3657318386339e-06,
"loss": 1.175177812576294,
"step": 1278
},
{
"epoch": 1.9571865443425076,
"grad_norm": 0.17682303488254547,
"learning_rate": 7.337983796884694e-06,
"loss": 1.2020514011383057,
"step": 1280
},
{
"epoch": 1.9602446483180427,
"grad_norm": 0.13004066050052643,
"learning_rate": 7.310277408863493e-06,
"loss": 1.225712537765503,
"step": 1282
},
{
"epoch": 1.963302752293578,
"grad_norm": 0.396966814994812,
"learning_rate": 7.282612989717555e-06,
"loss": 1.2043389081954956,
"step": 1284
},
{
"epoch": 1.9663608562691133,
"grad_norm": 0.1417648047208786,
"learning_rate": 7.254990854116759e-06,
"loss": 1.2117154598236084,
"step": 1286
},
{
"epoch": 1.9694189602446484,
"grad_norm": 0.13368594646453857,
"learning_rate": 7.2274113162500285e-06,
"loss": 1.2600475549697876,
"step": 1288
},
{
"epoch": 1.9724770642201834,
"grad_norm": 0.1391812562942505,
"learning_rate": 7.199874689821744e-06,
"loss": 1.2455639839172363,
"step": 1290
},
{
"epoch": 1.9755351681957185,
"grad_norm": 0.13156801462173462,
"learning_rate": 7.1723812880482114e-06,
"loss": 1.258913278579712,
"step": 1292
},
{
"epoch": 1.9785932721712538,
"grad_norm": 0.12165912240743637,
"learning_rate": 7.144931423654069e-06,
"loss": 1.2114229202270508,
"step": 1294
},
{
"epoch": 1.981651376146789,
"grad_norm": 0.17182543873786926,
"learning_rate": 7.117525408868722e-06,
"loss": 1.219508409500122,
"step": 1296
},
{
"epoch": 1.9847094801223242,
"grad_norm": 0.1712474673986435,
"learning_rate": 7.090163555422824e-06,
"loss": 1.2391793727874756,
"step": 1298
},
{
"epoch": 1.9877675840978593,
"grad_norm": 0.14425022900104523,
"learning_rate": 7.062846174544713e-06,
"loss": 1.2954765558242798,
"step": 1300
},
{
"epoch": 1.9908256880733946,
"grad_norm": 0.17153722047805786,
"learning_rate": 7.035573576956867e-06,
"loss": 1.322161316871643,
"step": 1302
},
{
"epoch": 1.9938837920489296,
"grad_norm": 0.19832384586334229,
"learning_rate": 7.008346072872372e-06,
"loss": 1.478272557258606,
"step": 1304
},
{
"epoch": 1.996941896024465,
"grad_norm": 0.3216697871685028,
"learning_rate": 6.9811639719914004e-06,
"loss": 1.4260848760604858,
"step": 1306
},
{
"epoch": 2.0,
"grad_norm": 0.706434428691864,
"learning_rate": 6.954027583497691e-06,
"loss": 1.4685496091842651,
"step": 1308
},
{
"epoch": 2.003058103975535,
"grad_norm": 0.17712196707725525,
"learning_rate": 6.92693721605501e-06,
"loss": 1.394515037536621,
"step": 1310
},
{
"epoch": 2.00611620795107,
"grad_norm": 0.12309785187244415,
"learning_rate": 6.899893177803667e-06,
"loss": 1.3557714223861694,
"step": 1312
},
{
"epoch": 2.0091743119266057,
"grad_norm": 0.1349668651819229,
"learning_rate": 6.8728957763570005e-06,
"loss": 1.3635830879211426,
"step": 1314
},
{
"epoch": 2.0122324159021407,
"grad_norm": 0.15826421976089478,
"learning_rate": 6.8459453187978706e-06,
"loss": 1.2972722053527832,
"step": 1316
},
{
"epoch": 2.015290519877676,
"grad_norm": 0.11746937036514282,
"learning_rate": 6.819042111675172e-06,
"loss": 1.4896833896636963,
"step": 1318
},
{
"epoch": 2.018348623853211,
"grad_norm": 0.10701940208673477,
"learning_rate": 6.792186461000352e-06,
"loss": 1.3726967573165894,
"step": 1320
},
{
"epoch": 2.021406727828746,
"grad_norm": 0.18223977088928223,
"learning_rate": 6.765378672243923e-06,
"loss": 1.3199728727340698,
"step": 1322
},
{
"epoch": 2.0244648318042815,
"grad_norm": 0.15080475807189941,
"learning_rate": 6.738619050331995e-06,
"loss": 1.3884634971618652,
"step": 1324
},
{
"epoch": 2.0275229357798166,
"grad_norm": 0.1777096688747406,
"learning_rate": 6.711907899642793e-06,
"loss": 1.2657767534255981,
"step": 1326
},
{
"epoch": 2.0305810397553516,
"grad_norm": 0.15124447643756866,
"learning_rate": 6.685245524003212e-06,
"loss": 1.3411420583724976,
"step": 1328
},
{
"epoch": 2.0336391437308867,
"grad_norm": 0.16805733740329742,
"learning_rate": 6.658632226685355e-06,
"loss": 1.4487394094467163,
"step": 1330
},
{
"epoch": 2.036697247706422,
"grad_norm": 0.15020951628684998,
"learning_rate": 6.632068310403075e-06,
"loss": 1.3471310138702393,
"step": 1332
},
{
"epoch": 2.0397553516819573,
"grad_norm": 0.11654051393270493,
"learning_rate": 6.605554077308541e-06,
"loss": 1.1645305156707764,
"step": 1334
},
{
"epoch": 2.0428134556574924,
"grad_norm": 0.47177380323410034,
"learning_rate": 6.579089828988806e-06,
"loss": 1.3331588506698608,
"step": 1336
},
{
"epoch": 2.0458715596330275,
"grad_norm": 0.18365582823753357,
"learning_rate": 6.552675866462358e-06,
"loss": 1.568501353263855,
"step": 1338
},
{
"epoch": 2.0489296636085625,
"grad_norm": 0.16884642839431763,
"learning_rate": 6.526312490175719e-06,
"loss": 1.2164785861968994,
"step": 1340
},
{
"epoch": 2.051987767584098,
"grad_norm": 0.18424928188323975,
"learning_rate": 6.500000000000003e-06,
"loss": 1.3639503717422485,
"step": 1342
},
{
"epoch": 2.055045871559633,
"grad_norm": 0.14070400595664978,
"learning_rate": 6.473738695227528e-06,
"loss": 1.2531183958053589,
"step": 1344
},
{
"epoch": 2.058103975535168,
"grad_norm": 0.1320883184671402,
"learning_rate": 6.447528874568403e-06,
"loss": 1.2422581911087036,
"step": 1346
},
{
"epoch": 2.0611620795107033,
"grad_norm": 0.31850379705429077,
"learning_rate": 6.421370836147125e-06,
"loss": 1.2550984621047974,
"step": 1348
},
{
"epoch": 2.0642201834862384,
"grad_norm": 0.12996889650821686,
"learning_rate": 6.3952648774991895e-06,
"loss": 1.2892459630966187,
"step": 1350
},
{
"epoch": 2.067278287461774,
"grad_norm": 0.11520091444253922,
"learning_rate": 6.3692112955677145e-06,
"loss": 1.4742932319641113,
"step": 1352
},
{
"epoch": 2.070336391437309,
"grad_norm": 0.20727252960205078,
"learning_rate": 6.343210386700056e-06,
"loss": 1.4938790798187256,
"step": 1354
},
{
"epoch": 2.073394495412844,
"grad_norm": 0.27067703008651733,
"learning_rate": 6.317262446644432e-06,
"loss": 1.289554238319397,
"step": 1356
},
{
"epoch": 2.076452599388379,
"grad_norm": 0.20865699648857117,
"learning_rate": 6.291367770546576e-06,
"loss": 1.3036962747573853,
"step": 1358
},
{
"epoch": 2.079510703363914,
"grad_norm": 0.25244075059890747,
"learning_rate": 6.265526652946361e-06,
"loss": 1.2870533466339111,
"step": 1360
},
{
"epoch": 2.0825688073394497,
"grad_norm": 0.18456417322158813,
"learning_rate": 6.23973938777446e-06,
"loss": 1.3479864597320557,
"step": 1362
},
{
"epoch": 2.085626911314985,
"grad_norm": 0.18370430171489716,
"learning_rate": 6.214006268348997e-06,
"loss": 1.3356621265411377,
"step": 1364
},
{
"epoch": 2.08868501529052,
"grad_norm": 0.19261838495731354,
"learning_rate": 6.188327587372216e-06,
"loss": 1.3640661239624023,
"step": 1366
},
{
"epoch": 2.091743119266055,
"grad_norm": 0.1763276904821396,
"learning_rate": 6.162703636927147e-06,
"loss": 1.3478913307189941,
"step": 1368
},
{
"epoch": 2.09480122324159,
"grad_norm": 0.10882332921028137,
"learning_rate": 6.137134708474293e-06,
"loss": 1.2278797626495361,
"step": 1370
},
{
"epoch": 2.0978593272171255,
"grad_norm": 0.12949886918067932,
"learning_rate": 6.111621092848293e-06,
"loss": 1.1902759075164795,
"step": 1372
},
{
"epoch": 2.1009174311926606,
"grad_norm": 0.15533407032489777,
"learning_rate": 6.086163080254641e-06,
"loss": 1.345337152481079,
"step": 1374
},
{
"epoch": 2.1039755351681957,
"grad_norm": 0.20214258134365082,
"learning_rate": 6.060760960266372e-06,
"loss": 1.4601398706436157,
"step": 1376
},
{
"epoch": 2.1070336391437308,
"grad_norm": 0.13242913782596588,
"learning_rate": 6.035415021820756e-06,
"loss": 1.2989829778671265,
"step": 1378
},
{
"epoch": 2.1100917431192663,
"grad_norm": 0.2398076355457306,
"learning_rate": 6.0101255532160376e-06,
"loss": 1.4964780807495117,
"step": 1380
},
{
"epoch": 2.1131498470948014,
"grad_norm": 0.20416386425495148,
"learning_rate": 5.984892842108143e-06,
"loss": 1.4913971424102783,
"step": 1382
},
{
"epoch": 2.1162079510703364,
"grad_norm": 0.18165673315525055,
"learning_rate": 5.959717175507396e-06,
"loss": 1.239193320274353,
"step": 1384
},
{
"epoch": 2.1192660550458715,
"grad_norm": 0.2294483482837677,
"learning_rate": 5.93459883977528e-06,
"loss": 1.1719461679458618,
"step": 1386
},
{
"epoch": 2.1223241590214066,
"grad_norm": 0.30271396040916443,
"learning_rate": 5.909538120621155e-06,
"loss": 1.3723602294921875,
"step": 1388
},
{
"epoch": 2.1253822629969417,
"grad_norm": 0.13012711703777313,
"learning_rate": 5.884535303099026e-06,
"loss": 1.186031460762024,
"step": 1390
},
{
"epoch": 2.128440366972477,
"grad_norm": 0.1800045669078827,
"learning_rate": 5.859590671604297e-06,
"loss": 1.257864236831665,
"step": 1392
},
{
"epoch": 2.1314984709480123,
"grad_norm": 0.1988981068134308,
"learning_rate": 5.8347045098705216e-06,
"loss": 1.2152053117752075,
"step": 1394
},
{
"epoch": 2.1345565749235473,
"grad_norm": 0.13755300641059875,
"learning_rate": 5.809877100966197e-06,
"loss": 1.299118995666504,
"step": 1396
},
{
"epoch": 2.1376146788990824,
"grad_norm": 0.1406605839729309,
"learning_rate": 5.785108727291532e-06,
"loss": 1.2066929340362549,
"step": 1398
},
{
"epoch": 2.140672782874618,
"grad_norm": 0.14786066114902496,
"learning_rate": 5.760399670575236e-06,
"loss": 1.255595088005066,
"step": 1400
},
{
"epoch": 2.143730886850153,
"grad_norm": 0.14521285891532898,
"learning_rate": 5.735750211871316e-06,
"loss": 1.3966695070266724,
"step": 1402
},
{
"epoch": 2.146788990825688,
"grad_norm": 0.156136155128479,
"learning_rate": 5.711160631555877e-06,
"loss": 1.2653884887695312,
"step": 1404
},
{
"epoch": 2.149847094801223,
"grad_norm": 0.11567161232233047,
"learning_rate": 5.686631209323941e-06,
"loss": 1.227649450302124,
"step": 1406
},
{
"epoch": 2.1529051987767582,
"grad_norm": 0.1110580638051033,
"learning_rate": 5.662162224186258e-06,
"loss": 1.4484609365463257,
"step": 1408
},
{
"epoch": 2.1559633027522938,
"grad_norm": 0.1445491760969162,
"learning_rate": 5.637753954466127e-06,
"loss": 1.3130481243133545,
"step": 1410
},
{
"epoch": 2.159021406727829,
"grad_norm": 0.1265811175107956,
"learning_rate": 5.613406677796246e-06,
"loss": 1.248986840248108,
"step": 1412
},
{
"epoch": 2.162079510703364,
"grad_norm": 0.12633687257766724,
"learning_rate": 5.589120671115542e-06,
"loss": 1.3525274991989136,
"step": 1414
},
{
"epoch": 2.165137614678899,
"grad_norm": 0.23789754509925842,
"learning_rate": 5.564896210666031e-06,
"loss": 1.3208624124526978,
"step": 1416
},
{
"epoch": 2.168195718654434,
"grad_norm": 0.13444137573242188,
"learning_rate": 5.540733571989654e-06,
"loss": 1.3796360492706299,
"step": 1418
},
{
"epoch": 2.1712538226299696,
"grad_norm": 0.18880939483642578,
"learning_rate": 5.51663302992517e-06,
"loss": 1.2581044435501099,
"step": 1420
},
{
"epoch": 2.1743119266055047,
"grad_norm": 0.12744680047035217,
"learning_rate": 5.4925948586050224e-06,
"loss": 1.3596972227096558,
"step": 1422
},
{
"epoch": 2.1773700305810397,
"grad_norm": 0.18471305072307587,
"learning_rate": 5.4686193314522e-06,
"loss": 1.256497859954834,
"step": 1424
},
{
"epoch": 2.180428134556575,
"grad_norm": 0.18038369715213776,
"learning_rate": 5.444706721177157e-06,
"loss": 1.5690301656723022,
"step": 1426
},
{
"epoch": 2.18348623853211,
"grad_norm": 0.15947416424751282,
"learning_rate": 5.420857299774696e-06,
"loss": 1.3116461038589478,
"step": 1428
},
{
"epoch": 2.1865443425076454,
"grad_norm": 0.14132535457611084,
"learning_rate": 5.397071338520867e-06,
"loss": 1.3790884017944336,
"step": 1430
},
{
"epoch": 2.1896024464831805,
"grad_norm": 0.32931429147720337,
"learning_rate": 5.373349107969902e-06,
"loss": 1.356053113937378,
"step": 1432
},
{
"epoch": 2.1926605504587156,
"grad_norm": 0.14345382153987885,
"learning_rate": 5.349690877951115e-06,
"loss": 1.3247517347335815,
"step": 1434
},
{
"epoch": 2.1957186544342506,
"grad_norm": 0.15944042801856995,
"learning_rate": 5.326096917565853e-06,
"loss": 1.3286181688308716,
"step": 1436
},
{
"epoch": 2.198776758409786,
"grad_norm": 0.21919000148773193,
"learning_rate": 5.302567495184422e-06,
"loss": 1.330199122428894,
"step": 1438
},
{
"epoch": 2.2018348623853212,
"grad_norm": 0.1875789612531662,
"learning_rate": 5.279102878443032e-06,
"loss": 1.332606554031372,
"step": 1440
},
{
"epoch": 2.2048929663608563,
"grad_norm": 0.33878740668296814,
"learning_rate": 5.255703334240774e-06,
"loss": 1.3914042711257935,
"step": 1442
},
{
"epoch": 2.2079510703363914,
"grad_norm": 0.5218708515167236,
"learning_rate": 5.232369128736553e-06,
"loss": 1.3419737815856934,
"step": 1444
},
{
"epoch": 2.2110091743119265,
"grad_norm": 0.11105236411094666,
"learning_rate": 5.2091005273460914e-06,
"loss": 1.2455813884735107,
"step": 1446
},
{
"epoch": 2.214067278287462,
"grad_norm": 0.11201157420873642,
"learning_rate": 5.185897794738881e-06,
"loss": 1.2253139019012451,
"step": 1448
},
{
"epoch": 2.217125382262997,
"grad_norm": 0.1912904530763626,
"learning_rate": 5.162761194835198e-06,
"loss": 1.2669559717178345,
"step": 1450
},
{
"epoch": 2.220183486238532,
"grad_norm": 0.24734018743038177,
"learning_rate": 5.139690990803084e-06,
"loss": 1.233301043510437,
"step": 1452
},
{
"epoch": 2.223241590214067,
"grad_norm": 0.16191165149211884,
"learning_rate": 5.1166874450553635e-06,
"loss": 1.2840200662612915,
"step": 1454
},
{
"epoch": 2.2262996941896023,
"grad_norm": 0.12160076200962067,
"learning_rate": 5.093750819246648e-06,
"loss": 1.2450309991836548,
"step": 1456
},
{
"epoch": 2.229357798165138,
"grad_norm": 0.36477330327033997,
"learning_rate": 5.0708813742703666e-06,
"loss": 1.4992554187774658,
"step": 1458
},
{
"epoch": 2.232415902140673,
"grad_norm": 0.1379183977842331,
"learning_rate": 5.0480793702558085e-06,
"loss": 1.5028156042099,
"step": 1460
},
{
"epoch": 2.235474006116208,
"grad_norm": 0.22662517428398132,
"learning_rate": 5.025345066565135e-06,
"loss": 1.5423188209533691,
"step": 1462
},
{
"epoch": 2.238532110091743,
"grad_norm": 0.22309836745262146,
"learning_rate": 5.002678721790462e-06,
"loss": 1.4739760160446167,
"step": 1464
},
{
"epoch": 2.241590214067278,
"grad_norm": 0.18657968938350677,
"learning_rate": 4.980080593750901e-06,
"loss": 1.5198191404342651,
"step": 1466
},
{
"epoch": 2.2446483180428136,
"grad_norm": 0.1312897652387619,
"learning_rate": 4.9575509394896306e-06,
"loss": 1.412659764289856,
"step": 1468
},
{
"epoch": 2.2477064220183487,
"grad_norm": 0.2041846066713333,
"learning_rate": 4.9350900152709644e-06,
"loss": 1.501435399055481,
"step": 1470
},
{
"epoch": 2.2507645259938838,
"grad_norm": 0.21802227199077606,
"learning_rate": 4.9126980765774535e-06,
"loss": 1.2920466661453247,
"step": 1472
},
{
"epoch": 2.253822629969419,
"grad_norm": 0.32734358310699463,
"learning_rate": 4.890375378106969e-06,
"loss": 1.3840155601501465,
"step": 1474
},
{
"epoch": 2.2568807339449544,
"grad_norm": 0.18576525151729584,
"learning_rate": 4.8681221737698e-06,
"loss": 1.1896015405654907,
"step": 1476
},
{
"epoch": 2.2599388379204894,
"grad_norm": 0.25952601432800293,
"learning_rate": 4.845938716685783e-06,
"loss": 1.390250563621521,
"step": 1478
},
{
"epoch": 2.2629969418960245,
"grad_norm": 0.2618364691734314,
"learning_rate": 4.8238252591813994e-06,
"loss": 0.9939552545547485,
"step": 1480
},
{
"epoch": 2.2660550458715596,
"grad_norm": 0.7130351066589355,
"learning_rate": 4.801782052786928e-06,
"loss": 0.9591231942176819,
"step": 1482
},
{
"epoch": 2.2691131498470947,
"grad_norm": 0.5174583196640015,
"learning_rate": 4.7798093482335736e-06,
"loss": 1.1732577085494995,
"step": 1484
},
{
"epoch": 2.2721712538226297,
"grad_norm": 0.1944684386253357,
"learning_rate": 4.757907395450607e-06,
"loss": 1.254443645477295,
"step": 1486
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.32820016145706177,
"learning_rate": 4.736076443562537e-06,
"loss": 1.3228447437286377,
"step": 1488
},
{
"epoch": 2.2782874617737003,
"grad_norm": 0.27493736147880554,
"learning_rate": 4.714316740886271e-06,
"loss": 1.2684073448181152,
"step": 1490
},
{
"epoch": 2.2813455657492354,
"grad_norm": 0.8473254442214966,
"learning_rate": 4.69262853492829e-06,
"loss": 1.0940097570419312,
"step": 1492
},
{
"epoch": 2.2844036697247705,
"grad_norm": 0.14267265796661377,
"learning_rate": 4.671012072381827e-06,
"loss": 1.062652349472046,
"step": 1494
},
{
"epoch": 2.287461773700306,
"grad_norm": 0.14857089519500732,
"learning_rate": 4.6494675991240725e-06,
"loss": 1.3027708530426025,
"step": 1496
},
{
"epoch": 2.290519877675841,
"grad_norm": 0.2024240493774414,
"learning_rate": 4.627995360213372e-06,
"loss": 1.1870825290679932,
"step": 1498
},
{
"epoch": 2.293577981651376,
"grad_norm": 0.47134989500045776,
"learning_rate": 4.606595599886441e-06,
"loss": 1.1387802362442017,
"step": 1500
},
{
"epoch": 2.2966360856269112,
"grad_norm": 0.12811291217803955,
"learning_rate": 4.585268561555577e-06,
"loss": 1.3463492393493652,
"step": 1502
},
{
"epoch": 2.2996941896024463,
"grad_norm": 0.22761978209018707,
"learning_rate": 4.564014487805905e-06,
"loss": 1.3002067804336548,
"step": 1504
},
{
"epoch": 2.302752293577982,
"grad_norm": 0.11687177419662476,
"learning_rate": 4.542833620392616e-06,
"loss": 1.0505836009979248,
"step": 1506
},
{
"epoch": 2.305810397553517,
"grad_norm": 0.5785715579986572,
"learning_rate": 4.521726200238199e-06,
"loss": 1.2982921600341797,
"step": 1508
},
{
"epoch": 2.308868501529052,
"grad_norm": 0.5210697650909424,
"learning_rate": 4.5006924674297285e-06,
"loss": 1.4383094310760498,
"step": 1510
},
{
"epoch": 2.311926605504587,
"grad_norm": 0.13508401811122894,
"learning_rate": 4.479732661216114e-06,
"loss": 1.5722585916519165,
"step": 1512
},
{
"epoch": 2.314984709480122,
"grad_norm": 0.16143162548542023,
"learning_rate": 4.458847020005387e-06,
"loss": 1.5961930751800537,
"step": 1514
},
{
"epoch": 2.3180428134556577,
"grad_norm": 0.17354567348957062,
"learning_rate": 4.43803578136198e-06,
"loss": 1.7155699729919434,
"step": 1516
},
{
"epoch": 2.3211009174311927,
"grad_norm": 0.14079655706882477,
"learning_rate": 4.4172991820040385e-06,
"loss": 1.248162865638733,
"step": 1518
},
{
"epoch": 2.324159021406728,
"grad_norm": 0.4320622980594635,
"learning_rate": 4.396637457800717e-06,
"loss": 1.3336325883865356,
"step": 1520
},
{
"epoch": 2.327217125382263,
"grad_norm": 0.23424510657787323,
"learning_rate": 4.376050843769508e-06,
"loss": 1.1769382953643799,
"step": 1522
},
{
"epoch": 2.330275229357798,
"grad_norm": 0.1386423110961914,
"learning_rate": 4.355539574073543e-06,
"loss": 1.1032593250274658,
"step": 1524
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.3263895809650421,
"learning_rate": 4.3351038820189605e-06,
"loss": 1.4229612350463867,
"step": 1526
},
{
"epoch": 2.3363914373088686,
"grad_norm": 0.18773163855075836,
"learning_rate": 4.314744000052238e-06,
"loss": 1.3791815042495728,
"step": 1528
},
{
"epoch": 2.3394495412844036,
"grad_norm": 0.28852972388267517,
"learning_rate": 4.294460159757549e-06,
"loss": 1.5685712099075317,
"step": 1530
},
{
"epoch": 2.3425076452599387,
"grad_norm": 0.1875099092721939,
"learning_rate": 4.274252591854119e-06,
"loss": 1.0389281511306763,
"step": 1532
},
{
"epoch": 2.3455657492354742,
"grad_norm": 0.3121390640735626,
"learning_rate": 4.254121526193621e-06,
"loss": 1.2694977521896362,
"step": 1534
},
{
"epoch": 2.3486238532110093,
"grad_norm": 0.18623915314674377,
"learning_rate": 4.234067191757547e-06,
"loss": 1.515195608139038,
"step": 1536
},
{
"epoch": 2.3516819571865444,
"grad_norm": 0.27571722865104675,
"learning_rate": 4.2140898166546094e-06,
"loss": 1.2839555740356445,
"step": 1538
},
{
"epoch": 2.3547400611620795,
"grad_norm": 0.18172216415405273,
"learning_rate": 4.1941896281181345e-06,
"loss": 1.4342246055603027,
"step": 1540
},
{
"epoch": 2.3577981651376145,
"grad_norm": 0.1874471753835678,
"learning_rate": 4.1743668525035e-06,
"loss": 1.302701473236084,
"step": 1542
},
{
"epoch": 2.3608562691131496,
"grad_norm": 0.13664644956588745,
"learning_rate": 4.154621715285544e-06,
"loss": 1.2458817958831787,
"step": 1544
},
{
"epoch": 2.363914373088685,
"grad_norm": 0.19140246510505676,
"learning_rate": 4.134954441055996e-06,
"loss": 1.2903972864151,
"step": 1546
},
{
"epoch": 2.36697247706422,
"grad_norm": 0.1618933528661728,
"learning_rate": 4.11536525352094e-06,
"loss": 1.347403645515442,
"step": 1548
},
{
"epoch": 2.3700305810397553,
"grad_norm": 0.35864803194999695,
"learning_rate": 4.0958543754982555e-06,
"loss": 1.2057493925094604,
"step": 1550
},
{
"epoch": 2.3730886850152904,
"grad_norm": 0.24179436266422272,
"learning_rate": 4.076422028915092e-06,
"loss": 1.33254873752594,
"step": 1552
},
{
"epoch": 2.376146788990826,
"grad_norm": 0.8190633654594421,
"learning_rate": 4.057068434805334e-06,
"loss": 1.6208769083023071,
"step": 1554
},
{
"epoch": 2.379204892966361,
"grad_norm": 0.3764900863170624,
"learning_rate": 4.037793813307097e-06,
"loss": 1.8496865034103394,
"step": 1556
},
{
"epoch": 2.382262996941896,
"grad_norm": 0.2258998155593872,
"learning_rate": 4.018598383660221e-06,
"loss": 1.2403137683868408,
"step": 1558
},
{
"epoch": 2.385321100917431,
"grad_norm": 0.2579484283924103,
"learning_rate": 3.999482364203777e-06,
"loss": 1.2474881410598755,
"step": 1560
},
{
"epoch": 2.388379204892966,
"grad_norm": 0.20246495306491852,
"learning_rate": 3.980445972373572e-06,
"loss": 1.3078337907791138,
"step": 1562
},
{
"epoch": 2.3914373088685017,
"grad_norm": 0.4740979075431824,
"learning_rate": 3.961489424699698e-06,
"loss": 1.3576768636703491,
"step": 1564
},
{
"epoch": 2.3944954128440368,
"grad_norm": 0.2766645848751068,
"learning_rate": 3.9426129368040525e-06,
"loss": 1.3143256902694702,
"step": 1566
},
{
"epoch": 2.397553516819572,
"grad_norm": 0.25092101097106934,
"learning_rate": 3.923816723397891e-06,
"loss": 1.193142056465149,
"step": 1568
},
{
"epoch": 2.400611620795107,
"grad_norm": 0.17161835730075836,
"learning_rate": 3.905100998279378e-06,
"loss": 1.1418907642364502,
"step": 1570
},
{
"epoch": 2.4036697247706424,
"grad_norm": 0.11211515963077545,
"learning_rate": 3.8864659743311674e-06,
"loss": 1.0700639486312866,
"step": 1572
},
{
"epoch": 2.4067278287461775,
"grad_norm": 0.10698743164539337,
"learning_rate": 3.867911863517976e-06,
"loss": 1.1931581497192383,
"step": 1574
},
{
"epoch": 2.4097859327217126,
"grad_norm": 0.29255542159080505,
"learning_rate": 3.849438876884171e-06,
"loss": 1.4374973773956299,
"step": 1576
},
{
"epoch": 2.4128440366972477,
"grad_norm": 0.18589414656162262,
"learning_rate": 3.831047224551362e-06,
"loss": 1.4687811136245728,
"step": 1578
},
{
"epoch": 2.4159021406727827,
"grad_norm": 0.2537822127342224,
"learning_rate": 3.8127371157160274e-06,
"loss": 1.4451251029968262,
"step": 1580
},
{
"epoch": 2.418960244648318,
"grad_norm": 0.2604762613773346,
"learning_rate": 3.794508758647125e-06,
"loss": 1.4975537061691284,
"step": 1582
},
{
"epoch": 2.4220183486238533,
"grad_norm": 0.18013572692871094,
"learning_rate": 3.776362360683725e-06,
"loss": 1.312493085861206,
"step": 1584
},
{
"epoch": 2.4250764525993884,
"grad_norm": 0.26494479179382324,
"learning_rate": 3.7582981282326436e-06,
"loss": 1.4119056463241577,
"step": 1586
},
{
"epoch": 2.4281345565749235,
"grad_norm": 0.22667403519153595,
"learning_rate": 3.74031626676611e-06,
"loss": 1.2957755327224731,
"step": 1588
},
{
"epoch": 2.4311926605504586,
"grad_norm": 0.09898483008146286,
"learning_rate": 3.7224169808194234e-06,
"loss": 1.249149203300476,
"step": 1590
},
{
"epoch": 2.434250764525994,
"grad_norm": 0.25444644689559937,
"learning_rate": 3.704600473988616e-06,
"loss": 1.2711833715438843,
"step": 1592
},
{
"epoch": 2.437308868501529,
"grad_norm": 0.20375902950763702,
"learning_rate": 3.6868669489281526e-06,
"loss": 1.457493543624878,
"step": 1594
},
{
"epoch": 2.4403669724770642,
"grad_norm": 0.16325809061527252,
"learning_rate": 3.6692166073486207e-06,
"loss": 1.2258182764053345,
"step": 1596
},
{
"epoch": 2.4434250764525993,
"grad_norm": 0.31586337089538574,
"learning_rate": 3.6516496500144315e-06,
"loss": 1.3641780614852905,
"step": 1598
},
{
"epoch": 2.4464831804281344,
"grad_norm": 0.3672979772090912,
"learning_rate": 3.6341662767415366e-06,
"loss": 1.2819935083389282,
"step": 1600
},
{
"epoch": 2.44954128440367,
"grad_norm": 0.29007235169410706,
"learning_rate": 3.616766686395161e-06,
"loss": 1.2883577346801758,
"step": 1602
},
{
"epoch": 2.452599388379205,
"grad_norm": 0.22274695336818695,
"learning_rate": 3.599451076887539e-06,
"loss": 1.4647866487503052,
"step": 1604
},
{
"epoch": 2.45565749235474,
"grad_norm": 0.2635745108127594,
"learning_rate": 3.5822196451756617e-06,
"loss": 1.4505290985107422,
"step": 1606
},
{
"epoch": 2.458715596330275,
"grad_norm": 0.6367509961128235,
"learning_rate": 3.5650725872590343e-06,
"loss": 1.266322374343872,
"step": 1608
},
{
"epoch": 2.46177370030581,
"grad_norm": 0.167319193482399,
"learning_rate": 3.54801009817745e-06,
"loss": 1.228219747543335,
"step": 1610
},
{
"epoch": 2.4648318042813457,
"grad_norm": 0.25401708483695984,
"learning_rate": 3.5310323720087747e-06,
"loss": 1.3591912984848022,
"step": 1612
},
{
"epoch": 2.467889908256881,
"grad_norm": 0.1403985321521759,
"learning_rate": 3.5141396018667327e-06,
"loss": 1.3101667165756226,
"step": 1614
},
{
"epoch": 2.470948012232416,
"grad_norm": 0.15048815310001373,
"learning_rate": 3.4973319798987075e-06,
"loss": 1.4515373706817627,
"step": 1616
},
{
"epoch": 2.474006116207951,
"grad_norm": 0.3306581377983093,
"learning_rate": 3.480609697283574e-06,
"loss": 1.3238542079925537,
"step": 1618
},
{
"epoch": 2.477064220183486,
"grad_norm": 0.2043750286102295,
"learning_rate": 3.463972944229502e-06,
"loss": 1.2536604404449463,
"step": 1620
},
{
"epoch": 2.4801223241590216,
"grad_norm": 0.15457271039485931,
"learning_rate": 3.4474219099718085e-06,
"loss": 1.2132182121276855,
"step": 1622
},
{
"epoch": 2.4831804281345566,
"grad_norm": 0.36772802472114563,
"learning_rate": 3.4309567827707936e-06,
"loss": 1.3583812713623047,
"step": 1624
},
{
"epoch": 2.4862385321100917,
"grad_norm": 0.137724831700325,
"learning_rate": 3.4145777499096066e-06,
"loss": 1.347251057624817,
"step": 1626
},
{
"epoch": 2.489296636085627,
"grad_norm": 0.4675873816013336,
"learning_rate": 3.3982849976921185e-06,
"loss": 1.2851288318634033,
"step": 1628
},
{
"epoch": 2.4923547400611623,
"grad_norm": 0.20795711874961853,
"learning_rate": 3.3820787114407927e-06,
"loss": 1.5017863512039185,
"step": 1630
},
{
"epoch": 2.4954128440366974,
"grad_norm": 0.24633866548538208,
"learning_rate": 3.3659590754945816e-06,
"loss": 1.6977734565734863,
"step": 1632
},
{
"epoch": 2.4984709480122325,
"grad_norm": 0.20416969060897827,
"learning_rate": 3.349926273206834e-06,
"loss": 1.5675647258758545,
"step": 1634
},
{
"epoch": 2.5015290519877675,
"grad_norm": 0.17218895256519318,
"learning_rate": 3.3339804869432092e-06,
"loss": 1.568835735321045,
"step": 1636
},
{
"epoch": 2.5045871559633026,
"grad_norm": 0.3312041759490967,
"learning_rate": 3.3181218980795915e-06,
"loss": 1.4730901718139648,
"step": 1638
},
{
"epoch": 2.5076452599388377,
"grad_norm": 0.47724974155426025,
"learning_rate": 3.302350687000041e-06,
"loss": 1.1118125915527344,
"step": 1640
},
{
"epoch": 2.510703363914373,
"grad_norm": 0.22525407373905182,
"learning_rate": 3.2866670330947372e-06,
"loss": 1.1350667476654053,
"step": 1642
},
{
"epoch": 2.5137614678899083,
"grad_norm": 0.5922534465789795,
"learning_rate": 3.271071114757936e-06,
"loss": 1.5340875387191772,
"step": 1644
},
{
"epoch": 2.5168195718654434,
"grad_norm": 0.20079486072063446,
"learning_rate": 3.2555631093859376e-06,
"loss": 1.5291383266448975,
"step": 1646
},
{
"epoch": 2.5198776758409784,
"grad_norm": 0.2685466408729553,
"learning_rate": 3.240143193375079e-06,
"loss": 1.650044322013855,
"step": 1648
},
{
"epoch": 2.522935779816514,
"grad_norm": 0.29007527232170105,
"learning_rate": 3.2248115421197207e-06,
"loss": 1.580130934715271,
"step": 1650
},
{
"epoch": 2.525993883792049,
"grad_norm": 0.17674541473388672,
"learning_rate": 3.2095683300102544e-06,
"loss": 1.6336654424667358,
"step": 1652
},
{
"epoch": 2.529051987767584,
"grad_norm": 0.6385888457298279,
"learning_rate": 3.194413730431111e-06,
"loss": 1.4276736974716187,
"step": 1654
},
{
"epoch": 2.532110091743119,
"grad_norm": 0.18316228687763214,
"learning_rate": 3.1793479157588e-06,
"loss": 1.4557141065597534,
"step": 1656
},
{
"epoch": 2.5351681957186543,
"grad_norm": 0.2798449397087097,
"learning_rate": 3.1643710573599484e-06,
"loss": 1.3273746967315674,
"step": 1658
},
{
"epoch": 2.5382262996941893,
"grad_norm": 0.25256890058517456,
"learning_rate": 3.1494833255893347e-06,
"loss": 1.5899957418441772,
"step": 1660
},
{
"epoch": 2.541284403669725,
"grad_norm": 0.22966250777244568,
"learning_rate": 3.1346848897879773e-06,
"loss": 1.3582658767700195,
"step": 1662
},
{
"epoch": 2.54434250764526,
"grad_norm": 0.23401792347431183,
"learning_rate": 3.1199759182811835e-06,
"loss": 1.1058764457702637,
"step": 1664
},
{
"epoch": 2.547400611620795,
"grad_norm": 0.23885783553123474,
"learning_rate": 3.105356578376652e-06,
"loss": 1.587262511253357,
"step": 1666
},
{
"epoch": 2.5504587155963305,
"grad_norm": 0.1748657077550888,
"learning_rate": 3.090827036362566e-06,
"loss": 1.7135778665542603,
"step": 1668
},
{
"epoch": 2.5535168195718656,
"grad_norm": 0.1924404799938202,
"learning_rate": 3.0763874575056897e-06,
"loss": 1.7697184085845947,
"step": 1670
},
{
"epoch": 2.5565749235474007,
"grad_norm": 0.22128871083259583,
"learning_rate": 3.062038006049509e-06,
"loss": 1.6986305713653564,
"step": 1672
},
{
"epoch": 2.5596330275229358,
"grad_norm": 0.3884623646736145,
"learning_rate": 3.0477788452123474e-06,
"loss": 1.6635832786560059,
"step": 1674
},
{
"epoch": 2.562691131498471,
"grad_norm": 0.193667471408844,
"learning_rate": 3.0336101371855132e-06,
"loss": 1.5323315858840942,
"step": 1676
},
{
"epoch": 2.565749235474006,
"grad_norm": 0.21260865032672882,
"learning_rate": 3.019532043131461e-06,
"loss": 1.3841668367385864,
"step": 1678
},
{
"epoch": 2.5688073394495414,
"grad_norm": 0.7379730343818665,
"learning_rate": 3.005544723181949e-06,
"loss": 0.9397176504135132,
"step": 1680
},
{
"epoch": 2.5718654434250765,
"grad_norm": 0.21575972437858582,
"learning_rate": 2.9916483364362273e-06,
"loss": 0.9356784820556641,
"step": 1682
},
{
"epoch": 2.5749235474006116,
"grad_norm": 0.18256239593029022,
"learning_rate": 2.9778430409592165e-06,
"loss": 1.1633483171463013,
"step": 1684
},
{
"epoch": 2.5779816513761467,
"grad_norm": 0.18563584983348846,
"learning_rate": 2.964128993779721e-06,
"loss": 1.1023343801498413,
"step": 1686
},
{
"epoch": 2.581039755351682,
"grad_norm": 0.17169539630413055,
"learning_rate": 2.95050635088864e-06,
"loss": 1.0343772172927856,
"step": 1688
},
{
"epoch": 2.5840978593272173,
"grad_norm": 0.11842609941959381,
"learning_rate": 2.936975267237188e-06,
"loss": 1.114422082901001,
"step": 1690
},
{
"epoch": 2.5871559633027523,
"grad_norm": 0.5754178762435913,
"learning_rate": 2.9235358967351346e-06,
"loss": 1.3243212699890137,
"step": 1692
},
{
"epoch": 2.5902140672782874,
"grad_norm": 0.22367584705352783,
"learning_rate": 2.9101883922490577e-06,
"loss": 1.3066273927688599,
"step": 1694
},
{
"epoch": 2.5932721712538225,
"grad_norm": 0.11838769167661667,
"learning_rate": 2.8969329056006052e-06,
"loss": 1.1478899717330933,
"step": 1696
},
{
"epoch": 2.5963302752293576,
"grad_norm": 0.6228997707366943,
"learning_rate": 2.883769587564757e-06,
"loss": 1.308228850364685,
"step": 1698
},
{
"epoch": 2.599388379204893,
"grad_norm": 0.14409951865673065,
"learning_rate": 2.8706985878681236e-06,
"loss": 1.4127944707870483,
"step": 1700
},
{
"epoch": 2.602446483180428,
"grad_norm": 0.2419777512550354,
"learning_rate": 2.857720055187237e-06,
"loss": 1.1605547666549683,
"step": 1702
},
{
"epoch": 2.6055045871559632,
"grad_norm": 0.17849405109882355,
"learning_rate": 2.8448341371468606e-06,
"loss": 1.3722493648529053,
"step": 1704
},
{
"epoch": 2.6085626911314987,
"grad_norm": 0.584975004196167,
"learning_rate": 2.832040980318304e-06,
"loss": 1.5971640348434448,
"step": 1706
},
{
"epoch": 2.611620795107034,
"grad_norm": 0.20485441386699677,
"learning_rate": 2.8193407302177696e-06,
"loss": 1.0920323133468628,
"step": 1708
},
{
"epoch": 2.614678899082569,
"grad_norm": 0.37276148796081543,
"learning_rate": 2.806733531304681e-06,
"loss": 1.333469271659851,
"step": 1710
},
{
"epoch": 2.617737003058104,
"grad_norm": 0.16045695543289185,
"learning_rate": 2.7942195269800524e-06,
"loss": 1.4399563074111938,
"step": 1712
},
{
"epoch": 2.620795107033639,
"grad_norm": 0.29774409532546997,
"learning_rate": 2.781798859584855e-06,
"loss": 1.2612054347991943,
"step": 1714
},
{
"epoch": 2.623853211009174,
"grad_norm": 0.10159888118505478,
"learning_rate": 2.769471670398389e-06,
"loss": 1.138002872467041,
"step": 1716
},
{
"epoch": 2.6269113149847096,
"grad_norm": 0.9622639417648315,
"learning_rate": 2.757238099636689e-06,
"loss": 0.7187841534614563,
"step": 1718
},
{
"epoch": 2.6299694189602447,
"grad_norm": 0.15207388997077942,
"learning_rate": 2.7450982864509253e-06,
"loss": 1.4917500019073486,
"step": 1720
},
{
"epoch": 2.63302752293578,
"grad_norm": 0.5151544809341431,
"learning_rate": 2.7330523689258106e-06,
"loss": 1.3835726976394653,
"step": 1722
},
{
"epoch": 2.636085626911315,
"grad_norm": 0.19729852676391602,
"learning_rate": 2.721100484078048e-06,
"loss": 1.2811280488967896,
"step": 1724
},
{
"epoch": 2.6391437308868504,
"grad_norm": 0.22213585674762726,
"learning_rate": 2.709242767854758e-06,
"loss": 1.4066439867019653,
"step": 1726
},
{
"epoch": 2.6422018348623855,
"grad_norm": 0.1790645569562912,
"learning_rate": 2.6974793551319383e-06,
"loss": 1.3520299196243286,
"step": 1728
},
{
"epoch": 2.6452599388379205,
"grad_norm": 0.19265353679656982,
"learning_rate": 2.6858103797129246e-06,
"loss": 1.3133292198181152,
"step": 1730
},
{
"epoch": 2.6483180428134556,
"grad_norm": 0.23618127405643463,
"learning_rate": 2.674235974326878e-06,
"loss": 1.5000425577163696,
"step": 1732
},
{
"epoch": 2.6513761467889907,
"grad_norm": 0.17804878950119019,
"learning_rate": 2.6627562706272657e-06,
"loss": 1.1008257865905762,
"step": 1734
},
{
"epoch": 2.6544342507645258,
"grad_norm": 0.12927761673927307,
"learning_rate": 2.6513713991903705e-06,
"loss": 0.9210459589958191,
"step": 1736
},
{
"epoch": 2.6574923547400613,
"grad_norm": 0.16620349884033203,
"learning_rate": 2.640081489513797e-06,
"loss": 1.5412592887878418,
"step": 1738
},
{
"epoch": 2.6605504587155964,
"grad_norm": 0.3116919994354248,
"learning_rate": 2.628886670015009e-06,
"loss": 1.3480628728866577,
"step": 1740
},
{
"epoch": 2.6636085626911314,
"grad_norm": 0.2775146961212158,
"learning_rate": 2.6177870680298624e-06,
"loss": 1.4652796983718872,
"step": 1742
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.17939536273479462,
"learning_rate": 2.606782809811155e-06,
"loss": 1.297588586807251,
"step": 1744
},
{
"epoch": 2.669724770642202,
"grad_norm": 0.22709611058235168,
"learning_rate": 2.5958740205272003e-06,
"loss": 1.5550439357757568,
"step": 1746
},
{
"epoch": 2.672782874617737,
"grad_norm": 0.23353682458400726,
"learning_rate": 2.5850608242603913e-06,
"loss": 1.4042333364486694,
"step": 1748
},
{
"epoch": 2.675840978593272,
"grad_norm": 0.3190506100654602,
"learning_rate": 2.5743433440058002e-06,
"loss": 1.4088410139083862,
"step": 1750
},
{
"epoch": 2.6788990825688073,
"grad_norm": 0.3655577600002289,
"learning_rate": 2.5637217016697663e-06,
"loss": 1.2669798135757446,
"step": 1752
},
{
"epoch": 2.6819571865443423,
"grad_norm": 0.20551763474941254,
"learning_rate": 2.5531960180685276e-06,
"loss": 0.9370750188827515,
"step": 1754
},
{
"epoch": 2.6850152905198774,
"grad_norm": 0.2501385807991028,
"learning_rate": 2.5427664129268253e-06,
"loss": 1.4236401319503784,
"step": 1756
},
{
"epoch": 2.688073394495413,
"grad_norm": 0.17783670127391815,
"learning_rate": 2.5324330048765626e-06,
"loss": 1.3898571729660034,
"step": 1758
},
{
"epoch": 2.691131498470948,
"grad_norm": 0.19069063663482666,
"learning_rate": 2.522195911455437e-06,
"loss": 1.418738842010498,
"step": 1760
},
{
"epoch": 2.694189602446483,
"grad_norm": 0.24323132634162903,
"learning_rate": 2.5120552491056197e-06,
"loss": 1.4022393226623535,
"step": 1762
},
{
"epoch": 2.6972477064220186,
"grad_norm": 0.1714673787355423,
"learning_rate": 2.502011133172418e-06,
"loss": 1.213725209236145,
"step": 1764
},
{
"epoch": 2.7003058103975537,
"grad_norm": 0.21391062438488007,
"learning_rate": 2.4920636779029736e-06,
"loss": 1.4528888463974,
"step": 1766
},
{
"epoch": 2.7033639143730888,
"grad_norm": 0.3517923951148987,
"learning_rate": 2.482212996444952e-06,
"loss": 1.5154540538787842,
"step": 1768
},
{
"epoch": 2.706422018348624,
"grad_norm": 0.21076905727386475,
"learning_rate": 2.4724592008452655e-06,
"loss": 1.2251962423324585,
"step": 1770
},
{
"epoch": 2.709480122324159,
"grad_norm": 0.2092084437608719,
"learning_rate": 2.4628024020487946e-06,
"loss": 1.4913671016693115,
"step": 1772
},
{
"epoch": 2.712538226299694,
"grad_norm": 0.2707628011703491,
"learning_rate": 2.4532427098971276e-06,
"loss": 1.2693325281143188,
"step": 1774
},
{
"epoch": 2.7155963302752295,
"grad_norm": 0.23994937539100647,
"learning_rate": 2.4437802331273052e-06,
"loss": 1.426426649093628,
"step": 1776
},
{
"epoch": 2.7186544342507646,
"grad_norm": 0.3275426924228668,
"learning_rate": 2.4344150793705944e-06,
"loss": 1.3486037254333496,
"step": 1778
},
{
"epoch": 2.7217125382262997,
"grad_norm": 0.1596205085515976,
"learning_rate": 2.425147355151254e-06,
"loss": 1.4933159351348877,
"step": 1780
},
{
"epoch": 2.7247706422018347,
"grad_norm": 0.3981785476207733,
"learning_rate": 2.4159771658853306e-06,
"loss": 1.5875604152679443,
"step": 1782
},
{
"epoch": 2.7278287461773703,
"grad_norm": 0.8734309673309326,
"learning_rate": 2.406904615879453e-06,
"loss": 1.4748692512512207,
"step": 1784
},
{
"epoch": 2.7308868501529053,
"grad_norm": 0.21883857250213623,
"learning_rate": 2.3979298083296488e-06,
"loss": 1.3090273141860962,
"step": 1786
},
{
"epoch": 2.7339449541284404,
"grad_norm": 0.4178310036659241,
"learning_rate": 2.3890528453201756e-06,
"loss": 1.419610619544983,
"step": 1788
},
{
"epoch": 2.7370030581039755,
"grad_norm": 0.2704829275608063,
"learning_rate": 2.3802738278223474e-06,
"loss": 1.4093573093414307,
"step": 1790
},
{
"epoch": 2.7400611620795106,
"grad_norm": 0.37049105763435364,
"learning_rate": 2.3715928556934005e-06,
"loss": 1.4148246049880981,
"step": 1792
},
{
"epoch": 2.7431192660550456,
"grad_norm": 0.42498084902763367,
"learning_rate": 2.3630100276753463e-06,
"loss": 1.4419108629226685,
"step": 1794
},
{
"epoch": 2.746177370030581,
"grad_norm": 0.16743339598178864,
"learning_rate": 2.354525441393857e-06,
"loss": 1.4382030963897705,
"step": 1796
},
{
"epoch": 2.7492354740061162,
"grad_norm": 0.17311187088489532,
"learning_rate": 2.346139193357145e-06,
"loss": 1.2192634344100952,
"step": 1798
},
{
"epoch": 2.7522935779816513,
"grad_norm": 0.38412225246429443,
"learning_rate": 2.337851378954877e-06,
"loss": 1.2814158201217651,
"step": 1800
},
{
"epoch": 2.7553516819571864,
"grad_norm": 0.2552420496940613,
"learning_rate": 2.3296620924570772e-06,
"loss": 1.1930724382400513,
"step": 1802
},
{
"epoch": 2.758409785932722,
"grad_norm": 0.22476616501808167,
"learning_rate": 2.3215714270130673e-06,
"loss": 1.2869510650634766,
"step": 1804
},
{
"epoch": 2.761467889908257,
"grad_norm": 0.22635005414485931,
"learning_rate": 2.3135794746503934e-06,
"loss": 1.2187225818634033,
"step": 1806
},
{
"epoch": 2.764525993883792,
"grad_norm": 0.20397242903709412,
"learning_rate": 2.3056863262737915e-06,
"loss": 1.131676435470581,
"step": 1808
},
{
"epoch": 2.767584097859327,
"grad_norm": 0.16007225215435028,
"learning_rate": 2.2978920716641456e-06,
"loss": 0.8253338932991028,
"step": 1810
},
{
"epoch": 2.770642201834862,
"grad_norm": 0.2556101679801941,
"learning_rate": 2.290196799477473e-06,
"loss": 1.1432273387908936,
"step": 1812
},
{
"epoch": 2.7737003058103973,
"grad_norm": 0.15398050844669342,
"learning_rate": 2.2826005972439056e-06,
"loss": 1.2073218822479248,
"step": 1814
},
{
"epoch": 2.776758409785933,
"grad_norm": 0.14769600331783295,
"learning_rate": 2.2751035513667067e-06,
"loss": 1.2682545185089111,
"step": 1816
},
{
"epoch": 2.779816513761468,
"grad_norm": 0.15157395601272583,
"learning_rate": 2.2677057471212783e-06,
"loss": 1.2814412117004395,
"step": 1818
},
{
"epoch": 2.782874617737003,
"grad_norm": 0.11602704972028732,
"learning_rate": 2.2604072686541992e-06,
"loss": 1.337765097618103,
"step": 1820
},
{
"epoch": 2.7859327217125385,
"grad_norm": 0.11175990104675293,
"learning_rate": 2.25320819898226e-06,
"loss": 1.2580088376998901,
"step": 1822
},
{
"epoch": 2.7889908256880735,
"grad_norm": 0.2158500850200653,
"learning_rate": 2.2461086199915215e-06,
"loss": 1.280817985534668,
"step": 1824
},
{
"epoch": 2.7920489296636086,
"grad_norm": 0.10697898268699646,
"learning_rate": 2.2391086124363907e-06,
"loss": 1.2506331205368042,
"step": 1826
},
{
"epoch": 2.7951070336391437,
"grad_norm": 0.1721104234457016,
"learning_rate": 2.232208255938689e-06,
"loss": 1.2656058073043823,
"step": 1828
},
{
"epoch": 2.7981651376146788,
"grad_norm": 0.16275018453598022,
"learning_rate": 2.2254076289867574e-06,
"loss": 1.2296828031539917,
"step": 1830
},
{
"epoch": 2.801223241590214,
"grad_norm": 0.12005674839019775,
"learning_rate": 2.218706808934559e-06,
"loss": 1.2741050720214844,
"step": 1832
},
{
"epoch": 2.8042813455657494,
"grad_norm": 0.20490582287311554,
"learning_rate": 2.2121058720008005e-06,
"loss": 1.2242677211761475,
"step": 1834
},
{
"epoch": 2.8073394495412844,
"grad_norm": 0.08420670032501221,
"learning_rate": 2.205604893268061e-06,
"loss": 1.115134358406067,
"step": 1836
},
{
"epoch": 2.8103975535168195,
"grad_norm": 0.09198635816574097,
"learning_rate": 2.1992039466819464e-06,
"loss": 1.1836574077606201,
"step": 1838
},
{
"epoch": 2.8134556574923546,
"grad_norm": 0.12914858758449554,
"learning_rate": 2.192903105050242e-06,
"loss": 0.9696236252784729,
"step": 1840
},
{
"epoch": 2.81651376146789,
"grad_norm": 0.09582311660051346,
"learning_rate": 2.186702440042086e-06,
"loss": 1.174141764640808,
"step": 1842
},
{
"epoch": 2.819571865443425,
"grad_norm": 0.12264709919691086,
"learning_rate": 2.18060202218715e-06,
"loss": 1.2044894695281982,
"step": 1844
},
{
"epoch": 2.8226299694189603,
"grad_norm": 0.16786755621433258,
"learning_rate": 2.174601920874849e-06,
"loss": 1.2202996015548706,
"step": 1846
},
{
"epoch": 2.8256880733944953,
"grad_norm": 0.09281696379184723,
"learning_rate": 2.168702204353538e-06,
"loss": 1.1890109777450562,
"step": 1848
},
{
"epoch": 2.8287461773700304,
"grad_norm": 0.10081319510936737,
"learning_rate": 2.162902939729744e-06,
"loss": 1.2033485174179077,
"step": 1850
},
{
"epoch": 2.8318042813455655,
"grad_norm": 0.11747318506240845,
"learning_rate": 2.1572041929673983e-06,
"loss": 1.1819500923156738,
"step": 1852
},
{
"epoch": 2.834862385321101,
"grad_norm": 0.10887467861175537,
"learning_rate": 2.151606028887092e-06,
"loss": 1.1508567333221436,
"step": 1854
},
{
"epoch": 2.837920489296636,
"grad_norm": 0.14138160645961761,
"learning_rate": 2.146108511165331e-06,
"loss": 1.2135510444641113,
"step": 1856
},
{
"epoch": 2.840978593272171,
"grad_norm": 0.11023754626512527,
"learning_rate": 2.14071170233382e-06,
"loss": 1.1841347217559814,
"step": 1858
},
{
"epoch": 2.8440366972477067,
"grad_norm": 0.20284757018089294,
"learning_rate": 2.135415663778743e-06,
"loss": 1.157168984413147,
"step": 1860
},
{
"epoch": 2.8470948012232418,
"grad_norm": 0.1384735405445099,
"learning_rate": 2.1302204557400727e-06,
"loss": 1.1463501453399658,
"step": 1862
},
{
"epoch": 2.850152905198777,
"grad_norm": 0.1709088236093521,
"learning_rate": 2.125126137310878e-06,
"loss": 1.1752015352249146,
"step": 1864
},
{
"epoch": 2.853211009174312,
"grad_norm": 0.13996273279190063,
"learning_rate": 2.1201327664366585e-06,
"loss": 1.162741780281067,
"step": 1866
},
{
"epoch": 2.856269113149847,
"grad_norm": 0.13266918063163757,
"learning_rate": 2.115240399914681e-06,
"loss": 1.1238844394683838,
"step": 1868
},
{
"epoch": 2.859327217125382,
"grad_norm": 0.25243642926216125,
"learning_rate": 2.1104490933933357e-06,
"loss": 1.188450813293457,
"step": 1870
},
{
"epoch": 2.8623853211009176,
"grad_norm": 0.12690605223178864,
"learning_rate": 2.1057589013715016e-06,
"loss": 1.2131199836730957,
"step": 1872
},
{
"epoch": 2.8654434250764527,
"grad_norm": 0.22158950567245483,
"learning_rate": 2.101169877197926e-06,
"loss": 1.222786545753479,
"step": 1874
},
{
"epoch": 2.8685015290519877,
"grad_norm": 0.12401717156171799,
"learning_rate": 2.096682073070622e-06,
"loss": 1.2122353315353394,
"step": 1876
},
{
"epoch": 2.871559633027523,
"grad_norm": 0.39234742522239685,
"learning_rate": 2.092295540036271e-06,
"loss": 1.2304370403289795,
"step": 1878
},
{
"epoch": 2.8746177370030583,
"grad_norm": 0.14366573095321655,
"learning_rate": 2.088010327989642e-06,
"loss": 1.208174228668213,
"step": 1880
},
{
"epoch": 2.8776758409785934,
"grad_norm": 0.13212910294532776,
"learning_rate": 2.0838264856730233e-06,
"loss": 1.2271817922592163,
"step": 1882
},
{
"epoch": 2.8807339449541285,
"grad_norm": 0.09124042838811874,
"learning_rate": 2.0797440606756747e-06,
"loss": 1.1685302257537842,
"step": 1884
},
{
"epoch": 2.8837920489296636,
"grad_norm": 0.13218353688716888,
"learning_rate": 2.075763099433277e-06,
"loss": 1.2074134349822998,
"step": 1886
},
{
"epoch": 2.8868501529051986,
"grad_norm": 0.13880771398544312,
"learning_rate": 2.0718836472274094e-06,
"loss": 1.1736494302749634,
"step": 1888
},
{
"epoch": 2.8899082568807337,
"grad_norm": 0.0981225073337555,
"learning_rate": 2.0681057481850338e-06,
"loss": 1.1989068984985352,
"step": 1890
},
{
"epoch": 2.8929663608562692,
"grad_norm": 0.16569402813911438,
"learning_rate": 2.0644294452779904e-06,
"loss": 1.207861304283142,
"step": 1892
},
{
"epoch": 2.8960244648318043,
"grad_norm": 0.15494751930236816,
"learning_rate": 2.060854780322513e-06,
"loss": 1.2271397113800049,
"step": 1894
},
{
"epoch": 2.8990825688073394,
"grad_norm": 0.11410652101039886,
"learning_rate": 2.05738179397875e-06,
"loss": 1.2082901000976562,
"step": 1896
},
{
"epoch": 2.9021406727828745,
"grad_norm": 0.11099483072757721,
"learning_rate": 2.054010525750302e-06,
"loss": 1.1450985670089722,
"step": 1898
},
{
"epoch": 2.90519877675841,
"grad_norm": 0.12173059582710266,
"learning_rate": 2.050741013983773e-06,
"loss": 1.187538981437683,
"step": 1900
},
{
"epoch": 2.908256880733945,
"grad_norm": 0.14034679532051086,
"learning_rate": 2.0475732958683374e-06,
"loss": 1.2137997150421143,
"step": 1902
},
{
"epoch": 2.91131498470948,
"grad_norm": 0.1078169122338295,
"learning_rate": 2.0445074074353143e-06,
"loss": 1.2389276027679443,
"step": 1904
},
{
"epoch": 2.914373088685015,
"grad_norm": 0.1030258983373642,
"learning_rate": 2.0415433835577536e-06,
"loss": 1.2125961780548096,
"step": 1906
},
{
"epoch": 2.9174311926605503,
"grad_norm": 0.14901351928710938,
"learning_rate": 2.038681257950046e-06,
"loss": 1.1889057159423828,
"step": 1908
},
{
"epoch": 2.9204892966360854,
"grad_norm": 0.1964503526687622,
"learning_rate": 2.035921063167539e-06,
"loss": 1.1632894277572632,
"step": 1910
},
{
"epoch": 2.923547400611621,
"grad_norm": 0.15398406982421875,
"learning_rate": 2.0332628306061598e-06,
"loss": 1.1950072050094604,
"step": 1912
},
{
"epoch": 2.926605504587156,
"grad_norm": 0.21499764919281006,
"learning_rate": 2.0307065905020655e-06,
"loss": 1.2189935445785522,
"step": 1914
},
{
"epoch": 2.929663608562691,
"grad_norm": 0.18662309646606445,
"learning_rate": 2.028252371931297e-06,
"loss": 1.1687815189361572,
"step": 1916
},
{
"epoch": 2.9327217125382266,
"grad_norm": 0.13196606934070587,
"learning_rate": 2.025900202809447e-06,
"loss": 1.1919829845428467,
"step": 1918
},
{
"epoch": 2.9357798165137616,
"grad_norm": 0.1411059945821762,
"learning_rate": 2.0236501098913433e-06,
"loss": 1.193024754524231,
"step": 1920
},
{
"epoch": 2.9388379204892967,
"grad_norm": 0.3092004060745239,
"learning_rate": 2.021502118770743e-06,
"loss": 1.2424871921539307,
"step": 1922
},
{
"epoch": 2.941896024464832,
"grad_norm": 0.12397109717130661,
"learning_rate": 2.019456253880047e-06,
"loss": 1.2504069805145264,
"step": 1924
},
{
"epoch": 2.944954128440367,
"grad_norm": 0.2362269163131714,
"learning_rate": 2.0175125384900125e-06,
"loss": 1.2087825536727905,
"step": 1926
},
{
"epoch": 2.948012232415902,
"grad_norm": 0.1304045021533966,
"learning_rate": 2.015670994709497e-06,
"loss": 1.2350709438323975,
"step": 1928
},
{
"epoch": 2.9510703363914375,
"grad_norm": 0.14016836881637573,
"learning_rate": 2.0139316434852034e-06,
"loss": 1.203549861907959,
"step": 1930
},
{
"epoch": 2.9541284403669725,
"grad_norm": 0.16034546494483948,
"learning_rate": 2.0122945046014427e-06,
"loss": 1.1559655666351318,
"step": 1932
},
{
"epoch": 2.9571865443425076,
"grad_norm": 0.11549582332372665,
"learning_rate": 2.0107595966799047e-06,
"loss": 1.1843830347061157,
"step": 1934
},
{
"epoch": 2.9602446483180427,
"grad_norm": 0.1061682254076004,
"learning_rate": 2.009326937179452e-06,
"loss": 1.2071914672851562,
"step": 1936
},
{
"epoch": 2.963302752293578,
"grad_norm": 0.13189159333705902,
"learning_rate": 2.0079965423959206e-06,
"loss": 1.187361240386963,
"step": 1938
},
{
"epoch": 2.9663608562691133,
"grad_norm": 0.16287079453468323,
"learning_rate": 2.0067684274619298e-06,
"loss": 1.1922651529312134,
"step": 1940
},
{
"epoch": 2.9694189602446484,
"grad_norm": 0.13288040459156036,
"learning_rate": 2.0056426063467157e-06,
"loss": 1.2393989562988281,
"step": 1942
},
{
"epoch": 2.9724770642201834,
"grad_norm": 0.21218396723270416,
"learning_rate": 2.0046190918559676e-06,
"loss": 1.229328989982605,
"step": 1944
},
{
"epoch": 2.9755351681957185,
"grad_norm": 0.11764626204967499,
"learning_rate": 2.0036978956316867e-06,
"loss": 1.2407135963439941,
"step": 1946
},
{
"epoch": 2.9785932721712536,
"grad_norm": 0.23697015643119812,
"learning_rate": 2.002879028152051e-06,
"loss": 1.1935560703277588,
"step": 1948
},
{
"epoch": 2.981651376146789,
"grad_norm": 0.11776944994926453,
"learning_rate": 2.0021624987312975e-06,
"loss": 1.204485297203064,
"step": 1950
},
{
"epoch": 2.984709480122324,
"grad_norm": 0.3524181544780731,
"learning_rate": 2.001548315519612e-06,
"loss": 1.2240840196609497,
"step": 1952
},
{
"epoch": 2.9877675840978593,
"grad_norm": 0.16863545775413513,
"learning_rate": 2.0010364855030445e-06,
"loss": 1.2831623554229736,
"step": 1954
},
{
"epoch": 2.9908256880733948,
"grad_norm": 0.14657853543758392,
"learning_rate": 2.0006270145034217e-06,
"loss": 1.3105254173278809,
"step": 1956
},
{
"epoch": 2.99388379204893,
"grad_norm": 0.1979057639837265,
"learning_rate": 2.000319907178286e-06,
"loss": 1.2615809440612793,
"step": 1958
},
{
"epoch": 2.996941896024465,
"grad_norm": 0.2079269289970398,
"learning_rate": 2.00011516702084e-06,
"loss": 1.1906858682632446,
"step": 1960
},
{
"epoch": 3.0,
"grad_norm": 0.3183704912662506,
"learning_rate": 2.0000127963599083e-06,
"loss": 1.2669897079467773,
"step": 1962
},
{
"epoch": 3.0,
"step": 1962,
"total_flos": 2.4882019145802056e+18,
"train_loss": 1.4038474378721917,
"train_runtime": 17010.8289,
"train_samples_per_second": 1.845,
"train_steps_per_second": 0.115
}
],
"logging_steps": 2,
"max_steps": 1962,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 9999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4882019145802056e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}