Kallisti-35B-A3B / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 159,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018867924528301886,
"grad_norm": 15.087599797333977,
"learning_rate": 0.0,
"loss": 1.3308970928192139,
"step": 1
},
{
"epoch": 0.03773584905660377,
"grad_norm": 13.905749218469701,
"learning_rate": 1e-07,
"loss": 1.285962700843811,
"step": 2
},
{
"epoch": 0.05660377358490566,
"grad_norm": 12.056842971309038,
"learning_rate": 2e-07,
"loss": 1.3014495372772217,
"step": 3
},
{
"epoch": 0.07547169811320754,
"grad_norm": 13.244790508016711,
"learning_rate": 3e-07,
"loss": 1.306698203086853,
"step": 4
},
{
"epoch": 0.09433962264150944,
"grad_norm": 13.343404572220207,
"learning_rate": 4e-07,
"loss": 1.316192388534546,
"step": 5
},
{
"epoch": 0.11320754716981132,
"grad_norm": 11.45228080585811,
"learning_rate": 5e-07,
"loss": 1.3045374155044556,
"step": 6
},
{
"epoch": 0.1320754716981132,
"grad_norm": 14.318691524925551,
"learning_rate": 6e-07,
"loss": 1.3311705589294434,
"step": 7
},
{
"epoch": 0.1509433962264151,
"grad_norm": 12.050953431316902,
"learning_rate": 7e-07,
"loss": 1.2908077239990234,
"step": 8
},
{
"epoch": 0.16981132075471697,
"grad_norm": 10.724349690276135,
"learning_rate": 8e-07,
"loss": 1.3058435916900635,
"step": 9
},
{
"epoch": 0.18867924528301888,
"grad_norm": 9.473154789049158,
"learning_rate": 9e-07,
"loss": 1.2856130599975586,
"step": 10
},
{
"epoch": 0.20754716981132076,
"grad_norm": 6.5491980764342275,
"learning_rate": 1e-06,
"loss": 1.2199636697769165,
"step": 11
},
{
"epoch": 0.22641509433962265,
"grad_norm": 6.462810113541478,
"learning_rate": 9.99888864929809e-07,
"loss": 1.1673463582992554,
"step": 12
},
{
"epoch": 0.24528301886792453,
"grad_norm": 6.9296822460672045,
"learning_rate": 9.995555091232516e-07,
"loss": 1.1699671745300293,
"step": 13
},
{
"epoch": 0.2641509433962264,
"grad_norm": 6.0515568106146596,
"learning_rate": 9.990000807704114e-07,
"loss": 1.1814613342285156,
"step": 14
},
{
"epoch": 0.2830188679245283,
"grad_norm": 4.743020637878028,
"learning_rate": 9.982228267815643e-07,
"loss": 1.0652694702148438,
"step": 15
},
{
"epoch": 0.3018867924528302,
"grad_norm": 4.526630266791274,
"learning_rate": 9.972240926774166e-07,
"loss": 1.0635337829589844,
"step": 16
},
{
"epoch": 0.32075471698113206,
"grad_norm": 4.609514753545406,
"learning_rate": 9.96004322435508e-07,
"loss": 1.0902111530303955,
"step": 17
},
{
"epoch": 0.33962264150943394,
"grad_norm": 4.34097054100359,
"learning_rate": 9.945640582928437e-07,
"loss": 1.06702721118927,
"step": 18
},
{
"epoch": 0.3584905660377358,
"grad_norm": 3.864434007517437,
"learning_rate": 9.9290394050485e-07,
"loss": 1.0476477146148682,
"step": 19
},
{
"epoch": 0.37735849056603776,
"grad_norm": 3.8969527857656656,
"learning_rate": 9.91024707060755e-07,
"loss": 1.0617330074310303,
"step": 20
},
{
"epoch": 0.39622641509433965,
"grad_norm": 3.911924948199517,
"learning_rate": 9.889271933555212e-07,
"loss": 1.07832932472229,
"step": 21
},
{
"epoch": 0.41509433962264153,
"grad_norm": 3.833567236811609,
"learning_rate": 9.8661233181848e-07,
"loss": 1.0324124097824097,
"step": 22
},
{
"epoch": 0.4339622641509434,
"grad_norm": 3.6955004487569396,
"learning_rate": 9.840811514988293e-07,
"loss": 0.9815853834152222,
"step": 23
},
{
"epoch": 0.4528301886792453,
"grad_norm": 4.023404125176107,
"learning_rate": 9.813347776081788e-07,
"loss": 1.0266845226287842,
"step": 24
},
{
"epoch": 0.4716981132075472,
"grad_norm": 3.712461139695743,
"learning_rate": 9.78374431020349e-07,
"loss": 1.0085935592651367,
"step": 25
},
{
"epoch": 0.49056603773584906,
"grad_norm": 3.7543864084874596,
"learning_rate": 9.752014277286431e-07,
"loss": 0.9968965649604797,
"step": 26
},
{
"epoch": 0.5094339622641509,
"grad_norm": 3.8046734306564467,
"learning_rate": 9.718171782608353e-07,
"loss": 0.9803509712219238,
"step": 27
},
{
"epoch": 0.5283018867924528,
"grad_norm": 3.6105782650336433,
"learning_rate": 9.682231870521345e-07,
"loss": 0.9759021997451782,
"step": 28
},
{
"epoch": 0.5471698113207547,
"grad_norm": 3.3896428780092753,
"learning_rate": 9.644210517764013e-07,
"loss": 0.9812103509902954,
"step": 29
},
{
"epoch": 0.5660377358490566,
"grad_norm": 3.118079780719029,
"learning_rate": 9.60412462635919e-07,
"loss": 0.9091012477874756,
"step": 30
},
{
"epoch": 0.5849056603773585,
"grad_norm": 3.3662986364845,
"learning_rate": 9.561992016100291e-07,
"loss": 0.9503388404846191,
"step": 31
},
{
"epoch": 0.6037735849056604,
"grad_norm": 2.9779547004368196,
"learning_rate": 9.517831416629716e-07,
"loss": 0.9247981309890747,
"step": 32
},
{
"epoch": 0.6226415094339622,
"grad_norm": 3.468415170701323,
"learning_rate": 9.471662459112745e-07,
"loss": 0.9473499655723572,
"step": 33
},
{
"epoch": 0.6415094339622641,
"grad_norm": 2.8573918489427688,
"learning_rate": 9.423505667510723e-07,
"loss": 0.9340516328811646,
"step": 34
},
{
"epoch": 0.660377358490566,
"grad_norm": 2.949529150108781,
"learning_rate": 9.373382449457303e-07,
"loss": 0.9248940348625183,
"step": 35
},
{
"epoch": 0.6792452830188679,
"grad_norm": 2.9658340262784697,
"learning_rate": 9.321315086741915e-07,
"loss": 0.9420664310455322,
"step": 36
},
{
"epoch": 0.6981132075471698,
"grad_norm": 3.019712899281778,
"learning_rate": 9.267326725404598e-07,
"loss": 0.9231287240982056,
"step": 37
},
{
"epoch": 0.7169811320754716,
"grad_norm": 2.827563138085356,
"learning_rate": 9.21144136544666e-07,
"loss": 0.9293084740638733,
"step": 38
},
{
"epoch": 0.7358490566037735,
"grad_norm": 3.126960585054511,
"learning_rate": 9.153683850161705e-07,
"loss": 0.9372609853744507,
"step": 39
},
{
"epoch": 0.7547169811320755,
"grad_norm": 2.7757572634358456,
"learning_rate": 9.094079855091797e-07,
"loss": 0.9204014539718628,
"step": 40
},
{
"epoch": 0.7735849056603774,
"grad_norm": 2.86268897243828,
"learning_rate": 9.032655876613635e-07,
"loss": 0.9143469333648682,
"step": 41
},
{
"epoch": 0.7924528301886793,
"grad_norm": 2.899411491265449,
"learning_rate": 8.96943922015986e-07,
"loss": 0.901626467704773,
"step": 42
},
{
"epoch": 0.8113207547169812,
"grad_norm": 3.0296165470958494,
"learning_rate": 8.90445798808068e-07,
"loss": 0.9193109273910522,
"step": 43
},
{
"epoch": 0.8301886792452831,
"grad_norm": 2.832066082274235,
"learning_rate": 8.837741067151249e-07,
"loss": 0.9078618288040161,
"step": 44
},
{
"epoch": 0.8490566037735849,
"grad_norm": 2.9792386000035083,
"learning_rate": 8.769318115730328e-07,
"loss": 0.9032235145568848,
"step": 45
},
{
"epoch": 0.8679245283018868,
"grad_norm": 2.8570785041355373,
"learning_rate": 8.699219550575952e-07,
"loss": 0.8799638152122498,
"step": 46
},
{
"epoch": 0.8867924528301887,
"grad_norm": 2.8898604537645185,
"learning_rate": 8.627476533323956e-07,
"loss": 0.9072629809379578,
"step": 47
},
{
"epoch": 0.9056603773584906,
"grad_norm": 2.819489131324746,
"learning_rate": 8.554120956635374e-07,
"loss": 0.879642128944397,
"step": 48
},
{
"epoch": 0.9245283018867925,
"grad_norm": 2.884576949261456,
"learning_rate": 8.479185430018858e-07,
"loss": 0.9129672050476074,
"step": 49
},
{
"epoch": 0.9433962264150944,
"grad_norm": 2.8206974490824663,
"learning_rate": 8.402703265334454e-07,
"loss": 0.9072036147117615,
"step": 50
},
{
"epoch": 0.9622641509433962,
"grad_norm": 2.8666837714043414,
"learning_rate": 8.324708461985124e-07,
"loss": 0.8936312198638916,
"step": 51
},
{
"epoch": 0.9811320754716981,
"grad_norm": 2.75278105425475,
"learning_rate": 8.245235691802643e-07,
"loss": 0.886029839515686,
"step": 52
},
{
"epoch": 1.0,
"grad_norm": 2.9063116637756807,
"learning_rate": 8.164320283634585e-07,
"loss": 0.886949360370636,
"step": 53
},
{
"epoch": 1.0188679245283019,
"grad_norm": 2.8027377644406104,
"learning_rate": 8.081998207639212e-07,
"loss": 0.8734487891197205,
"step": 54
},
{
"epoch": 1.0377358490566038,
"grad_norm": 2.975237594360833,
"learning_rate": 7.998306059295302e-07,
"loss": 0.8541756868362427,
"step": 55
},
{
"epoch": 1.0566037735849056,
"grad_norm": 2.7212092257296785,
"learning_rate": 7.913281043133977e-07,
"loss": 0.855162501335144,
"step": 56
},
{
"epoch": 1.0754716981132075,
"grad_norm": 4.004522306787069,
"learning_rate": 7.826960956199794e-07,
"loss": 0.8469276428222656,
"step": 57
},
{
"epoch": 1.0943396226415094,
"grad_norm": 2.789521379215554,
"learning_rate": 7.739384171248434e-07,
"loss": 0.8612252473831177,
"step": 58
},
{
"epoch": 1.1132075471698113,
"grad_norm": 3.0001618191920008,
"learning_rate": 7.650589619688468e-07,
"loss": 0.8504967093467712,
"step": 59
},
{
"epoch": 1.1320754716981132,
"grad_norm": 2.803340918384437,
"learning_rate": 7.560616774274774e-07,
"loss": 0.8487892150878906,
"step": 60
},
{
"epoch": 1.150943396226415,
"grad_norm": 2.7872996717171112,
"learning_rate": 7.469505631561317e-07,
"loss": 0.8430064916610718,
"step": 61
},
{
"epoch": 1.169811320754717,
"grad_norm": 2.767338948376076,
"learning_rate": 7.377296694121058e-07,
"loss": 0.834577202796936,
"step": 62
},
{
"epoch": 1.1886792452830188,
"grad_norm": 2.7744551402453883,
"learning_rate": 7.284030952540936e-07,
"loss": 0.8389214277267456,
"step": 63
},
{
"epoch": 1.2075471698113207,
"grad_norm": 2.94391173341089,
"learning_rate": 7.189749867199898e-07,
"loss": 0.8442764282226562,
"step": 64
},
{
"epoch": 1.2264150943396226,
"grad_norm": 2.9244734720758285,
"learning_rate": 7.094495349838092e-07,
"loss": 0.802047848701477,
"step": 65
},
{
"epoch": 1.2452830188679245,
"grad_norm": 2.997891576167027,
"learning_rate": 6.998309744925411e-07,
"loss": 0.8562427163124084,
"step": 66
},
{
"epoch": 1.2641509433962264,
"grad_norm": 2.7454101056544618,
"learning_rate": 6.901235810837667e-07,
"loss": 0.8214827179908752,
"step": 67
},
{
"epoch": 1.2830188679245282,
"grad_norm": 2.9952605769764853,
"learning_rate": 6.803316700848778e-07,
"loss": 0.7995479702949524,
"step": 68
},
{
"epoch": 1.3018867924528301,
"grad_norm": 2.86683247629566,
"learning_rate": 6.704595943947385e-07,
"loss": 0.8077808022499084,
"step": 69
},
{
"epoch": 1.320754716981132,
"grad_norm": 2.7702979738330322,
"learning_rate": 6.605117425486481e-07,
"loss": 0.8417398929595947,
"step": 70
},
{
"epoch": 1.3396226415094339,
"grad_norm": 2.725158428984504,
"learning_rate": 6.504925367674594e-07,
"loss": 0.8494030833244324,
"step": 71
},
{
"epoch": 1.3584905660377358,
"grad_norm": 2.8106277256279255,
"learning_rate": 6.40406430991723e-07,
"loss": 0.8620424866676331,
"step": 72
},
{
"epoch": 1.3773584905660377,
"grad_norm": 2.818628329932316,
"learning_rate": 6.302579089017327e-07,
"loss": 0.8398749232292175,
"step": 73
},
{
"epoch": 1.3962264150943398,
"grad_norm": 2.745904001646307,
"learning_rate": 6.200514819243475e-07,
"loss": 0.8420323133468628,
"step": 74
},
{
"epoch": 1.4150943396226414,
"grad_norm": 2.7850840819985416,
"learning_rate": 6.097916872274814e-07,
"loss": 0.8359158635139465,
"step": 75
},
{
"epoch": 1.4339622641509435,
"grad_norm": 2.793048578545994,
"learning_rate": 5.994830857031499e-07,
"loss": 0.8336814641952515,
"step": 76
},
{
"epoch": 1.4528301886792452,
"grad_norm": 2.8505241824701826,
"learning_rate": 5.891302599399684e-07,
"loss": 0.7930982112884521,
"step": 77
},
{
"epoch": 1.4716981132075473,
"grad_norm": 2.6769256052426615,
"learning_rate": 5.78737812186009e-07,
"loss": 0.8192281723022461,
"step": 78
},
{
"epoch": 1.490566037735849,
"grad_norm": 2.7762595596745916,
"learning_rate": 5.683103623029134e-07,
"loss": 0.8389377593994141,
"step": 79
},
{
"epoch": 1.509433962264151,
"grad_norm": 2.8899154085340166,
"learning_rate": 5.578525457121806e-07,
"loss": 0.8256187438964844,
"step": 80
},
{
"epoch": 1.5283018867924527,
"grad_norm": 2.7720983651750917,
"learning_rate": 5.473690113345342e-07,
"loss": 0.8473238945007324,
"step": 81
},
{
"epoch": 1.5471698113207548,
"grad_norm": 2.8065774463241495,
"learning_rate": 5.368644195232895e-07,
"loss": 0.8165145516395569,
"step": 82
},
{
"epoch": 1.5660377358490565,
"grad_norm": 2.9614754969968016,
"learning_rate": 5.263434399926398e-07,
"loss": 0.8529609441757202,
"step": 83
},
{
"epoch": 1.5849056603773586,
"grad_norm": 2.90447128441676,
"learning_rate": 5.158107497417794e-07,
"loss": 0.8249980211257935,
"step": 84
},
{
"epoch": 1.6037735849056602,
"grad_norm": 2.7563670691746767,
"learning_rate": 5.052710309757898e-07,
"loss": 0.7900608777999878,
"step": 85
},
{
"epoch": 1.6226415094339623,
"grad_norm": 2.781624786647774,
"learning_rate": 4.947289690242102e-07,
"loss": 0.7917711734771729,
"step": 86
},
{
"epoch": 1.641509433962264,
"grad_norm": 2.8227831992064165,
"learning_rate": 4.841892502582205e-07,
"loss": 0.8228881359100342,
"step": 87
},
{
"epoch": 1.6603773584905661,
"grad_norm": 3.0626612203128687,
"learning_rate": 4.736565600073602e-07,
"loss": 0.8176588416099548,
"step": 88
},
{
"epoch": 1.6792452830188678,
"grad_norm": 2.7691999193756316,
"learning_rate": 4.6313558047671047e-07,
"loss": 0.8315557837486267,
"step": 89
},
{
"epoch": 1.6981132075471699,
"grad_norm": 2.9603416787137276,
"learning_rate": 4.5263098866546586e-07,
"loss": 0.8079712390899658,
"step": 90
},
{
"epoch": 1.7169811320754715,
"grad_norm": 2.7648310195075023,
"learning_rate": 4.421474542878194e-07,
"loss": 0.7854694128036499,
"step": 91
},
{
"epoch": 1.7358490566037736,
"grad_norm": 2.9565749840190736,
"learning_rate": 4.316896376970866e-07,
"loss": 0.8382487297058105,
"step": 92
},
{
"epoch": 1.7547169811320755,
"grad_norm": 2.904524931485949,
"learning_rate": 4.2126218781399114e-07,
"loss": 0.8337287902832031,
"step": 93
},
{
"epoch": 1.7735849056603774,
"grad_norm": 2.9419686201700794,
"learning_rate": 4.1086974006003154e-07,
"loss": 0.8450314402580261,
"step": 94
},
{
"epoch": 1.7924528301886793,
"grad_norm": 2.738066358519684,
"learning_rate": 4.0051691429685023e-07,
"loss": 0.7846765518188477,
"step": 95
},
{
"epoch": 1.8113207547169812,
"grad_norm": 2.7276079074380895,
"learning_rate": 3.902083127725186e-07,
"loss": 0.814504861831665,
"step": 96
},
{
"epoch": 1.830188679245283,
"grad_norm": 2.8093937971147835,
"learning_rate": 3.799485180756525e-07,
"loss": 0.8011671304702759,
"step": 97
},
{
"epoch": 1.849056603773585,
"grad_norm": 2.842796846086812,
"learning_rate": 3.697420910982672e-07,
"loss": 0.8165295124053955,
"step": 98
},
{
"epoch": 1.8679245283018868,
"grad_norm": 2.8189503982268977,
"learning_rate": 3.5959356900827687e-07,
"loss": 0.8199301958084106,
"step": 99
},
{
"epoch": 1.8867924528301887,
"grad_norm": 2.910644604198592,
"learning_rate": 3.4950746323254063e-07,
"loss": 0.8019869327545166,
"step": 100
},
{
"epoch": 1.9056603773584906,
"grad_norm": 2.863904675767849,
"learning_rate": 3.394882574513519e-07,
"loss": 0.8060827255249023,
"step": 101
},
{
"epoch": 1.9245283018867925,
"grad_norm": 2.8904123754351723,
"learning_rate": 3.295404056052616e-07,
"loss": 0.8078351020812988,
"step": 102
},
{
"epoch": 1.9433962264150944,
"grad_norm": 2.8850916542883778,
"learning_rate": 3.1966832991512225e-07,
"loss": 0.8068495988845825,
"step": 103
},
{
"epoch": 1.9622641509433962,
"grad_norm": 2.9528533111592865,
"learning_rate": 3.0987641891623315e-07,
"loss": 0.8184278011322021,
"step": 104
},
{
"epoch": 1.9811320754716981,
"grad_norm": 2.869159446180868,
"learning_rate": 3.0016902550745895e-07,
"loss": 0.8299746513366699,
"step": 105
},
{
"epoch": 2.0,
"grad_norm": 2.778568933671074,
"learning_rate": 2.9055046501619083e-07,
"loss": 0.785747766494751,
"step": 106
},
{
"epoch": 2.018867924528302,
"grad_norm": 2.9408610818195062,
"learning_rate": 2.810250132800103e-07,
"loss": 0.7670397758483887,
"step": 107
},
{
"epoch": 2.0377358490566038,
"grad_norm": 2.6257935800346694,
"learning_rate": 2.715969047459066e-07,
"loss": 0.7878092527389526,
"step": 108
},
{
"epoch": 2.056603773584906,
"grad_norm": 3.058449053263793,
"learning_rate": 2.6227033058789403e-07,
"loss": 0.7904379367828369,
"step": 109
},
{
"epoch": 2.0754716981132075,
"grad_norm": 2.88973427193669,
"learning_rate": 2.5304943684386825e-07,
"loss": 0.8011707067489624,
"step": 110
},
{
"epoch": 2.0943396226415096,
"grad_norm": 2.723021754211135,
"learning_rate": 2.439383225725225e-07,
"loss": 0.7658779621124268,
"step": 111
},
{
"epoch": 2.1132075471698113,
"grad_norm": 2.787460559434829,
"learning_rate": 2.3494103803115318e-07,
"loss": 0.7720337510108948,
"step": 112
},
{
"epoch": 2.1320754716981134,
"grad_norm": 2.7422069166294802,
"learning_rate": 2.2606158287515658e-07,
"loss": 0.7842212915420532,
"step": 113
},
{
"epoch": 2.150943396226415,
"grad_norm": 3.381034950183202,
"learning_rate": 2.1730390438002056e-07,
"loss": 0.7690730094909668,
"step": 114
},
{
"epoch": 2.169811320754717,
"grad_norm": 2.7764924352985663,
"learning_rate": 2.0867189568660236e-07,
"loss": 0.7737655639648438,
"step": 115
},
{
"epoch": 2.188679245283019,
"grad_norm": 2.8245587551592264,
"learning_rate": 2.0016939407046986e-07,
"loss": 0.7852470278739929,
"step": 116
},
{
"epoch": 2.207547169811321,
"grad_norm": 3.429004827616326,
"learning_rate": 1.9180017923607883e-07,
"loss": 0.7893455624580383,
"step": 117
},
{
"epoch": 2.2264150943396226,
"grad_norm": 3.1969648790899408,
"learning_rate": 1.835679716365417e-07,
"loss": 0.7634609937667847,
"step": 118
},
{
"epoch": 2.2452830188679247,
"grad_norm": 2.70318214433158,
"learning_rate": 1.7547643081973578e-07,
"loss": 0.7859703898429871,
"step": 119
},
{
"epoch": 2.2641509433962264,
"grad_norm": 2.961996890522788,
"learning_rate": 1.6752915380148768e-07,
"loss": 0.7709099650382996,
"step": 120
},
{
"epoch": 2.2830188679245285,
"grad_norm": 2.8177889556978095,
"learning_rate": 1.5972967346655448e-07,
"loss": 0.7789061069488525,
"step": 121
},
{
"epoch": 2.30188679245283,
"grad_norm": 3.320024417308839,
"learning_rate": 1.5208145699811415e-07,
"loss": 0.7862054705619812,
"step": 122
},
{
"epoch": 2.3207547169811322,
"grad_norm": 2.8631784669698415,
"learning_rate": 1.4458790433646263e-07,
"loss": 0.7816888689994812,
"step": 123
},
{
"epoch": 2.339622641509434,
"grad_norm": 2.902161614336072,
"learning_rate": 1.3725234666760427e-07,
"loss": 0.7391059398651123,
"step": 124
},
{
"epoch": 2.358490566037736,
"grad_norm": 2.882470659827849,
"learning_rate": 1.3007804494240476e-07,
"loss": 0.7627633810043335,
"step": 125
},
{
"epoch": 2.3773584905660377,
"grad_norm": 2.8433427591245284,
"learning_rate": 1.2306818842696715e-07,
"loss": 0.7769066095352173,
"step": 126
},
{
"epoch": 2.3962264150943398,
"grad_norm": 2.8617729260756573,
"learning_rate": 1.1622589328487503e-07,
"loss": 0.7934216856956482,
"step": 127
},
{
"epoch": 2.4150943396226414,
"grad_norm": 2.8509595069990823,
"learning_rate": 1.0955420119193198e-07,
"loss": 0.7673547863960266,
"step": 128
},
{
"epoch": 2.4339622641509435,
"grad_norm": 2.874293982355328,
"learning_rate": 1.03056077984014e-07,
"loss": 0.7849991917610168,
"step": 129
},
{
"epoch": 2.452830188679245,
"grad_norm": 3.0937215388279,
"learning_rate": 9.673441233863661e-08,
"loss": 0.7473263740539551,
"step": 130
},
{
"epoch": 2.4716981132075473,
"grad_norm": 2.9292035796935054,
"learning_rate": 9.059201449082043e-08,
"loss": 0.784021258354187,
"step": 131
},
{
"epoch": 2.490566037735849,
"grad_norm": 2.810444173384006,
"learning_rate": 8.463161498382949e-08,
"loss": 0.7882828712463379,
"step": 132
},
{
"epoch": 2.509433962264151,
"grad_norm": 2.829313317652292,
"learning_rate": 7.885586345533396e-08,
"loss": 0.7572199702262878,
"step": 133
},
{
"epoch": 2.5283018867924527,
"grad_norm": 2.6656369607187567,
"learning_rate": 7.326732745954e-08,
"loss": 0.7826784253120422,
"step": 134
},
{
"epoch": 2.547169811320755,
"grad_norm": 2.7036355808226173,
"learning_rate": 6.786849132580841e-08,
"loss": 0.7726486325263977,
"step": 135
},
{
"epoch": 2.5660377358490565,
"grad_norm": 2.805033772692598,
"learning_rate": 6.266175505426957e-08,
"loss": 0.7736940383911133,
"step": 136
},
{
"epoch": 2.5849056603773586,
"grad_norm": 2.8181269221147396,
"learning_rate": 5.7649433248927794e-08,
"loss": 0.7888213396072388,
"step": 137
},
{
"epoch": 2.6037735849056602,
"grad_norm": 2.9760303324315256,
"learning_rate": 5.283375408872537e-08,
"loss": 0.7611340284347534,
"step": 138
},
{
"epoch": 2.6226415094339623,
"grad_norm": 2.828152013200315,
"learning_rate": 4.821685833702849e-08,
"loss": 0.779454231262207,
"step": 139
},
{
"epoch": 2.641509433962264,
"grad_norm": 2.8581322420761786,
"learning_rate": 4.3800798389970863e-08,
"loss": 0.769560694694519,
"step": 140
},
{
"epoch": 2.660377358490566,
"grad_norm": 2.8125888801619103,
"learning_rate": 3.958753736408105e-08,
"loss": 0.7890896797180176,
"step": 141
},
{
"epoch": 2.6792452830188678,
"grad_norm": 2.757727954638762,
"learning_rate": 3.557894822359864e-08,
"loss": 0.7476776838302612,
"step": 142
},
{
"epoch": 2.69811320754717,
"grad_norm": 2.802525331124496,
"learning_rate": 3.1776812947865384e-08,
"loss": 0.7551087737083435,
"step": 143
},
{
"epoch": 2.7169811320754715,
"grad_norm": 3.172109709327269,
"learning_rate": 2.818282173916453e-08,
"loss": 0.7675119638442993,
"step": 144
},
{
"epoch": 2.7358490566037736,
"grad_norm": 2.836017838014085,
"learning_rate": 2.4798572271356843e-08,
"loss": 0.7670686841011047,
"step": 145
},
{
"epoch": 2.7547169811320753,
"grad_norm": 2.9198667506437905,
"learning_rate": 2.162556897965101e-08,
"loss": 0.7993500828742981,
"step": 146
},
{
"epoch": 2.7735849056603774,
"grad_norm": 2.795471164301072,
"learning_rate": 1.8665222391821166e-08,
"loss": 0.7754116654396057,
"step": 147
},
{
"epoch": 2.7924528301886795,
"grad_norm": 2.7725526525432787,
"learning_rate": 1.5918848501170644e-08,
"loss": 0.7710179090499878,
"step": 148
},
{
"epoch": 2.811320754716981,
"grad_norm": 2.784214561225124,
"learning_rate": 1.3387668181519818e-08,
"loss": 0.7384580969810486,
"step": 149
},
{
"epoch": 2.830188679245283,
"grad_norm": 2.8847249743481833,
"learning_rate": 1.1072806644478738e-08,
"loss": 0.7740883827209473,
"step": 150
},
{
"epoch": 2.849056603773585,
"grad_norm": 2.8315645307075945,
"learning_rate": 8.975292939244927e-09,
"loss": 0.7919697165489197,
"step": 151
},
{
"epoch": 2.867924528301887,
"grad_norm": 2.9085892225722034,
"learning_rate": 7.096059495149853e-09,
"loss": 0.781722903251648,
"step": 152
},
{
"epoch": 2.8867924528301887,
"grad_norm": 2.7506543384708224,
"learning_rate": 5.435941707156388e-09,
"loss": 0.7471998929977417,
"step": 153
},
{
"epoch": 2.9056603773584904,
"grad_norm": 2.8426972222396136,
"learning_rate": 3.995677564492039e-09,
"loss": 0.7751771807670593,
"step": 154
},
{
"epoch": 2.9245283018867925,
"grad_norm": 2.844363880881091,
"learning_rate": 2.7759073225832597e-09,
"loss": 0.7668254375457764,
"step": 155
},
{
"epoch": 2.9433962264150946,
"grad_norm": 3.278094344932399,
"learning_rate": 1.7771732184357901e-09,
"loss": 0.7961957454681396,
"step": 156
},
{
"epoch": 2.9622641509433962,
"grad_norm": 2.9897635623753955,
"learning_rate": 9.999192295886971e-10,
"loss": 0.7848834991455078,
"step": 157
},
{
"epoch": 2.981132075471698,
"grad_norm": 2.748244107712091,
"learning_rate": 4.4449087674847117e-10,
"loss": 0.777495801448822,
"step": 158
},
{
"epoch": 3.0,
"grad_norm": 2.9554977361208974,
"learning_rate": 1.1113507019094858e-10,
"loss": 0.7618961334228516,
"step": 159
},
{
"epoch": 3.0,
"step": 159,
"total_flos": 23335512768512.0,
"train_loss": 0.8809327138294963,
"train_runtime": 1440.1859,
"train_samples_per_second": 3.485,
"train_steps_per_second": 0.11
}
],
"logging_steps": 1.0,
"max_steps": 159,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 23335512768512.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
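
A minimal sketch, assuming standard Python with matplotlib installed, of how one might inspect this file; the filename and the keys used below ("log_history", "step", "loss", "learning_rate") simply mirror the structure of the JSON above, and the plotting choices are illustrative rather than part of the original training run.

import json

import matplotlib.pyplot as plt

# Load the trainer state written at the end of training.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step entries carry "loss"; the final summary entry carries
# "train_loss"/"train_runtime" instead, so filter on "loss".
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

# Plot training loss and the learning-rate schedule against global step.
fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
plt.tight_layout()
plt.show()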