{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997582789460963,
"eval_steps": 500,
"global_step": 517,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019337684312303602,
"grad_norm": 18.9990177154541,
"learning_rate": 0.0,
"loss": 1.0504,
"step": 1
},
{
"epoch": 0.0038675368624607204,
"grad_norm": 18.651086807250977,
"learning_rate": 1.25e-07,
"loss": 1.0819,
"step": 2
},
{
"epoch": 0.005801305293691081,
"grad_norm": 17.61154556274414,
"learning_rate": 2.5e-07,
"loss": 1.0378,
"step": 3
},
{
"epoch": 0.007735073724921441,
"grad_norm": 18.019412994384766,
"learning_rate": 3.75e-07,
"loss": 1.0519,
"step": 4
},
{
"epoch": 0.0096688421561518,
"grad_norm": 18.217206954956055,
"learning_rate": 5e-07,
"loss": 1.0353,
"step": 5
},
{
"epoch": 0.011602610587382161,
"grad_norm": 17.47523307800293,
"learning_rate": 6.249999999999999e-07,
"loss": 1.0829,
"step": 6
},
{
"epoch": 0.01353637901861252,
"grad_norm": 18.857463836669922,
"learning_rate": 7.5e-07,
"loss": 1.0694,
"step": 7
},
{
"epoch": 0.015470147449842882,
"grad_norm": 17.953575134277344,
"learning_rate": 8.75e-07,
"loss": 1.0873,
"step": 8
},
{
"epoch": 0.01740391588107324,
"grad_norm": 16.11162757873535,
"learning_rate": 1e-06,
"loss": 0.9523,
"step": 9
},
{
"epoch": 0.0193376843123036,
"grad_norm": 14.633333206176758,
"learning_rate": 1.125e-06,
"loss": 1.0021,
"step": 10
},
{
"epoch": 0.021271452743533963,
"grad_norm": 12.60569953918457,
"learning_rate": 1.2499999999999999e-06,
"loss": 0.9455,
"step": 11
},
{
"epoch": 0.023205221174764323,
"grad_norm": 12.926105499267578,
"learning_rate": 1.375e-06,
"loss": 0.9779,
"step": 12
},
{
"epoch": 0.025138989605994682,
"grad_norm": 8.501869201660156,
"learning_rate": 1.5e-06,
"loss": 0.852,
"step": 13
},
{
"epoch": 0.02707275803722504,
"grad_norm": 8.153008460998535,
"learning_rate": 1.625e-06,
"loss": 0.9161,
"step": 14
},
{
"epoch": 0.029006526468455404,
"grad_norm": 8.377603530883789,
"learning_rate": 1.75e-06,
"loss": 0.9677,
"step": 15
},
{
"epoch": 0.030940294899685764,
"grad_norm": 8.171313285827637,
"learning_rate": 1.8749999999999998e-06,
"loss": 0.9456,
"step": 16
},
{
"epoch": 0.03287406333091612,
"grad_norm": 7.480187892913818,
"learning_rate": 2e-06,
"loss": 0.8556,
"step": 17
},
{
"epoch": 0.03480783176214648,
"grad_norm": 6.254351615905762,
"learning_rate": 1.9999803395762148e-06,
"loss": 0.8907,
"step": 18
},
{
"epoch": 0.03674160019337684,
"grad_norm": 7.525010585784912,
"learning_rate": 1.9999213590779243e-06,
"loss": 0.8208,
"step": 19
},
{
"epoch": 0.0386753686246072,
"grad_norm": 8.508758544921875,
"learning_rate": 1.9998230608242915e-06,
"loss": 0.8717,
"step": 20
},
{
"epoch": 0.04060913705583756,
"grad_norm": 8.445762634277344,
"learning_rate": 1.999685448680487e-06,
"loss": 0.8676,
"step": 21
},
{
"epoch": 0.04254290548706793,
"grad_norm": 7.846978187561035,
"learning_rate": 1.999508528057537e-06,
"loss": 0.8169,
"step": 22
},
{
"epoch": 0.044476673918298286,
"grad_norm": 7.429245948791504,
"learning_rate": 1.9992923059121105e-06,
"loss": 0.7503,
"step": 23
},
{
"epoch": 0.046410442349528645,
"grad_norm": 6.933453559875488,
"learning_rate": 1.999036790746245e-06,
"loss": 0.7799,
"step": 24
},
{
"epoch": 0.048344210780759005,
"grad_norm": 6.4085774421691895,
"learning_rate": 1.9987419926070138e-06,
"loss": 0.785,
"step": 25
},
{
"epoch": 0.050277979211989364,
"grad_norm": 5.818970203399658,
"learning_rate": 1.9984079230861295e-06,
"loss": 0.7994,
"step": 26
},
{
"epoch": 0.05221174764321972,
"grad_norm": 6.150722026824951,
"learning_rate": 1.998034595319489e-06,
"loss": 0.7996,
"step": 27
},
{
"epoch": 0.05414551607445008,
"grad_norm": 5.317642688751221,
"learning_rate": 1.997622023986656e-06,
"loss": 0.774,
"step": 28
},
{
"epoch": 0.05607928450568044,
"grad_norm": 4.694331169128418,
"learning_rate": 1.9971702253102855e-06,
"loss": 0.7098,
"step": 29
},
{
"epoch": 0.05801305293691081,
"grad_norm": 4.669729709625244,
"learning_rate": 1.9966792170554842e-06,
"loss": 0.7263,
"step": 30
},
{
"epoch": 0.05994682136814117,
"grad_norm": 5.054605960845947,
"learning_rate": 1.9961490185291127e-06,
"loss": 0.7084,
"step": 31
},
{
"epoch": 0.06188058979937153,
"grad_norm": 4.530153751373291,
"learning_rate": 1.9955796505790267e-06,
"loss": 0.653,
"step": 32
},
{
"epoch": 0.06381435823060189,
"grad_norm": 4.292491436004639,
"learning_rate": 1.9949711355932563e-06,
"loss": 0.7146,
"step": 33
},
{
"epoch": 0.06574812666183225,
"grad_norm": 3.924863815307617,
"learning_rate": 1.9943234974991266e-06,
"loss": 0.6936,
"step": 34
},
{
"epoch": 0.0676818950930626,
"grad_norm": 4.0565314292907715,
"learning_rate": 1.993636761762317e-06,
"loss": 0.6899,
"step": 35
},
{
"epoch": 0.06961566352429296,
"grad_norm": 4.169654846191406,
"learning_rate": 1.9929109553858574e-06,
"loss": 0.7224,
"step": 36
},
{
"epoch": 0.07154943195552332,
"grad_norm": 4.1086578369140625,
"learning_rate": 1.9921461069090715e-06,
"loss": 0.7267,
"step": 37
},
{
"epoch": 0.07348320038675368,
"grad_norm": 3.86490535736084,
"learning_rate": 1.991342246406448e-06,
"loss": 0.7308,
"step": 38
},
{
"epoch": 0.07541696881798404,
"grad_norm": 3.4781484603881836,
"learning_rate": 1.990499405486464e-06,
"loss": 0.6204,
"step": 39
},
{
"epoch": 0.0773507372492144,
"grad_norm": 3.520286798477173,
"learning_rate": 1.989617617290338e-06,
"loss": 0.6711,
"step": 40
},
{
"epoch": 0.07928450568044476,
"grad_norm": 3.5273187160491943,
"learning_rate": 1.98869691649073e-06,
"loss": 0.6004,
"step": 41
},
{
"epoch": 0.08121827411167512,
"grad_norm": 3.4485585689544678,
"learning_rate": 1.9877373392903765e-06,
"loss": 0.7075,
"step": 42
},
{
"epoch": 0.0831520425429055,
"grad_norm": 3.598468780517578,
"learning_rate": 1.986738923420665e-06,
"loss": 0.6459,
"step": 43
},
{
"epoch": 0.08508581097413585,
"grad_norm": 3.1944539546966553,
"learning_rate": 1.9857017081401545e-06,
"loss": 0.5869,
"step": 44
},
{
"epoch": 0.08701957940536621,
"grad_norm": 3.8986239433288574,
"learning_rate": 1.984625734233029e-06,
"loss": 0.6738,
"step": 45
},
{
"epoch": 0.08895334783659657,
"grad_norm": 3.00274920463562,
"learning_rate": 1.9835110440074937e-06,
"loss": 0.6038,
"step": 46
},
{
"epoch": 0.09088711626782693,
"grad_norm": 3.3365049362182617,
"learning_rate": 1.9823576812941135e-06,
"loss": 0.6489,
"step": 47
},
{
"epoch": 0.09282088469905729,
"grad_norm": 3.1739261150360107,
"learning_rate": 1.9811656914440883e-06,
"loss": 0.6558,
"step": 48
},
{
"epoch": 0.09475465313028765,
"grad_norm": 3.4007365703582764,
"learning_rate": 1.9799351213274686e-06,
"loss": 0.6043,
"step": 49
},
{
"epoch": 0.09668842156151801,
"grad_norm": 3.241675853729248,
"learning_rate": 1.9786660193313147e-06,
"loss": 0.6038,
"step": 50
},
{
"epoch": 0.09862218999274837,
"grad_norm": 3.3647406101226807,
"learning_rate": 1.9773584353577927e-06,
"loss": 0.6712,
"step": 51
},
{
"epoch": 0.10055595842397873,
"grad_norm": 3.1126842498779297,
"learning_rate": 1.9760124208222127e-06,
"loss": 0.6281,
"step": 52
},
{
"epoch": 0.10248972685520909,
"grad_norm": 3.2202467918395996,
"learning_rate": 1.974628028651007e-06,
"loss": 0.616,
"step": 53
},
{
"epoch": 0.10442349528643945,
"grad_norm": 3.008822441101074,
"learning_rate": 1.9732053132796493e-06,
"loss": 0.6192,
"step": 54
},
{
"epoch": 0.1063572637176698,
"grad_norm": 3.3741564750671387,
"learning_rate": 1.971744330650514e-06,
"loss": 0.6312,
"step": 55
},
{
"epoch": 0.10829103214890017,
"grad_norm": 3.326568603515625,
"learning_rate": 1.970245138210676e-06,
"loss": 0.6039,
"step": 56
},
{
"epoch": 0.11022480058013052,
"grad_norm": 2.9236772060394287,
"learning_rate": 1.968707794909653e-06,
"loss": 0.6182,
"step": 57
},
{
"epoch": 0.11215856901136088,
"grad_norm": 2.947068452835083,
"learning_rate": 1.967132361197086e-06,
"loss": 0.6027,
"step": 58
},
{
"epoch": 0.11409233744259124,
"grad_norm": 3.112914562225342,
"learning_rate": 1.9655188990203645e-06,
"loss": 0.6059,
"step": 59
},
{
"epoch": 0.11602610587382162,
"grad_norm": 3.3346080780029297,
"learning_rate": 1.963867471822189e-06,
"loss": 0.602,
"step": 60
},
{
"epoch": 0.11795987430505198,
"grad_norm": 2.8977630138397217,
"learning_rate": 1.962178144538076e-06,
"loss": 0.5868,
"step": 61
},
{
"epoch": 0.11989364273628234,
"grad_norm": 2.880448818206787,
"learning_rate": 1.9604509835938064e-06,
"loss": 0.5378,
"step": 62
},
{
"epoch": 0.1218274111675127,
"grad_norm": 2.6951026916503906,
"learning_rate": 1.9586860569028124e-06,
"loss": 0.5642,
"step": 63
},
{
"epoch": 0.12376117959874305,
"grad_norm": 3.1341307163238525,
"learning_rate": 1.9568834338635075e-06,
"loss": 0.6254,
"step": 64
},
{
"epoch": 0.1256949480299734,
"grad_norm": 2.9430906772613525,
"learning_rate": 1.9550431853565574e-06,
"loss": 0.5638,
"step": 65
},
{
"epoch": 0.12762871646120377,
"grad_norm": 3.170628309249878,
"learning_rate": 1.953165383742093e-06,
"loss": 0.6531,
"step": 66
},
{
"epoch": 0.12956248489243413,
"grad_norm": 2.6802241802215576,
"learning_rate": 1.9512501028568656e-06,
"loss": 0.5603,
"step": 67
},
{
"epoch": 0.1314962533236645,
"grad_norm": 2.698767900466919,
"learning_rate": 1.9492974180113425e-06,
"loss": 0.5635,
"step": 68
},
{
"epoch": 0.13343002175489485,
"grad_norm": 2.9341378211975098,
"learning_rate": 1.9473074059867473e-06,
"loss": 0.643,
"step": 69
},
{
"epoch": 0.1353637901861252,
"grad_norm": 3.148897171020508,
"learning_rate": 1.9452801450320396e-06,
"loss": 0.6171,
"step": 70
},
{
"epoch": 0.13729755861735557,
"grad_norm": 2.996152400970459,
"learning_rate": 1.943215714860838e-06,
"loss": 0.645,
"step": 71
},
{
"epoch": 0.13923132704858593,
"grad_norm": 2.855325222015381,
"learning_rate": 1.941114196648286e-06,
"loss": 0.5198,
"step": 72
},
{
"epoch": 0.1411650954798163,
"grad_norm": 2.8352253437042236,
"learning_rate": 1.9389756730278625e-06,
"loss": 0.6044,
"step": 73
},
{
"epoch": 0.14309886391104665,
"grad_norm": 3.1634292602539062,
"learning_rate": 1.9368002280881276e-06,
"loss": 0.6285,
"step": 74
},
{
"epoch": 0.145032632342277,
"grad_norm": 2.8662703037261963,
"learning_rate": 1.9345879473694204e-06,
"loss": 0.539,
"step": 75
},
{
"epoch": 0.14696640077350737,
"grad_norm": 2.915189266204834,
"learning_rate": 1.932338917860494e-06,
"loss": 0.586,
"step": 76
},
{
"epoch": 0.14890016920473773,
"grad_norm": 2.995396852493286,
"learning_rate": 1.9300532279950945e-06,
"loss": 0.5723,
"step": 77
},
{
"epoch": 0.15083393763596809,
"grad_norm": 3.11187481880188,
"learning_rate": 1.9277309676484854e-06,
"loss": 0.6344,
"step": 78
},
{
"epoch": 0.15276770606719844,
"grad_norm": 2.9698314666748047,
"learning_rate": 1.9253722281339115e-06,
"loss": 0.5782,
"step": 79
},
{
"epoch": 0.1547014744984288,
"grad_norm": 2.8995749950408936,
"learning_rate": 1.9229771021990093e-06,
"loss": 0.5567,
"step": 80
},
{
"epoch": 0.15663524292965916,
"grad_norm": 3.096269369125366,
"learning_rate": 1.920545684022161e-06,
"loss": 0.5743,
"step": 81
},
{
"epoch": 0.15856901136088952,
"grad_norm": 2.9514002799987793,
"learning_rate": 1.91807806920879e-06,
"loss": 0.5647,
"step": 82
},
{
"epoch": 0.16050277979211988,
"grad_norm": 3.0244557857513428,
"learning_rate": 1.915574354787602e-06,
"loss": 0.5775,
"step": 83
},
{
"epoch": 0.16243654822335024,
"grad_norm": 2.788266181945801,
"learning_rate": 1.913034639206771e-06,
"loss": 0.5589,
"step": 84
},
{
"epoch": 0.1643703166545806,
"grad_norm": 3.122509241104126,
"learning_rate": 1.910459022330065e-06,
"loss": 0.5745,
"step": 85
},
{
"epoch": 0.166304085085811,
"grad_norm": 2.8160808086395264,
"learning_rate": 1.9078476054329236e-06,
"loss": 0.5431,
"step": 86
},
{
"epoch": 0.16823785351704135,
"grad_norm": 2.9235658645629883,
"learning_rate": 1.9052004911984719e-06,
"loss": 0.5591,
"step": 87
},
{
"epoch": 0.1701716219482717,
"grad_norm": 3.0429940223693848,
"learning_rate": 1.9025177837134855e-06,
"loss": 0.5745,
"step": 88
},
{
"epoch": 0.17210539037950207,
"grad_norm": 2.805288791656494,
"learning_rate": 1.8997995884642967e-06,
"loss": 0.579,
"step": 89
},
{
"epoch": 0.17403915881073242,
"grad_norm": 2.8548991680145264,
"learning_rate": 1.897046012332646e-06,
"loss": 0.5793,
"step": 90
},
{
"epoch": 0.17597292724196278,
"grad_norm": 2.9402284622192383,
"learning_rate": 1.8942571635914812e-06,
"loss": 0.5541,
"step": 91
},
{
"epoch": 0.17790669567319314,
"grad_norm": 2.88116717338562,
"learning_rate": 1.8914331519006983e-06,
"loss": 0.5372,
"step": 92
},
{
"epoch": 0.1798404641044235,
"grad_norm": 2.8997554779052734,
"learning_rate": 1.8885740883028306e-06,
"loss": 0.5603,
"step": 93
},
{
"epoch": 0.18177423253565386,
"grad_norm": 3.0519931316375732,
"learning_rate": 1.8856800852186826e-06,
"loss": 0.5942,
"step": 94
},
{
"epoch": 0.18370800096688422,
"grad_norm": 3.0380802154541016,
"learning_rate": 1.8827512564429074e-06,
"loss": 0.5427,
"step": 95
},
{
"epoch": 0.18564176939811458,
"grad_norm": 3.1358187198638916,
"learning_rate": 1.8797877171395353e-06,
"loss": 0.5735,
"step": 96
},
{
"epoch": 0.18757553782934494,
"grad_norm": 2.9399850368499756,
"learning_rate": 1.8767895838374437e-06,
"loss": 0.5616,
"step": 97
},
{
"epoch": 0.1895093062605753,
"grad_norm": 3.1031243801116943,
"learning_rate": 1.8737569744257752e-06,
"loss": 0.5919,
"step": 98
},
{
"epoch": 0.19144307469180566,
"grad_norm": 2.8527615070343018,
"learning_rate": 1.8706900081493017e-06,
"loss": 0.5165,
"step": 99
},
{
"epoch": 0.19337684312303602,
"grad_norm": 2.9103434085845947,
"learning_rate": 1.8675888056037374e-06,
"loss": 0.5361,
"step": 100
},
{
"epoch": 0.19531061155426638,
"grad_norm": 2.798142910003662,
"learning_rate": 1.8644534887309945e-06,
"loss": 0.5274,
"step": 101
},
{
"epoch": 0.19724437998549674,
"grad_norm": 2.727302312850952,
"learning_rate": 1.8612841808143894e-06,
"loss": 0.5681,
"step": 102
},
{
"epoch": 0.1991781484167271,
"grad_norm": 2.993396759033203,
"learning_rate": 1.8580810064737963e-06,
"loss": 0.5323,
"step": 103
},
{
"epoch": 0.20111191684795746,
"grad_norm": 2.8348159790039062,
"learning_rate": 1.8548440916607452e-06,
"loss": 0.5679,
"step": 104
},
{
"epoch": 0.20304568527918782,
"grad_norm": 2.7694900035858154,
"learning_rate": 1.8515735636534693e-06,
"loss": 0.5343,
"step": 105
},
{
"epoch": 0.20497945371041817,
"grad_norm": 3.0484673976898193,
"learning_rate": 1.8482695510519025e-06,
"loss": 0.5595,
"step": 106
},
{
"epoch": 0.20691322214164853,
"grad_norm": 3.0022387504577637,
"learning_rate": 1.8449321837726205e-06,
"loss": 0.5666,
"step": 107
},
{
"epoch": 0.2088469905728789,
"grad_norm": 2.9040303230285645,
"learning_rate": 1.8415615930437334e-06,
"loss": 0.57,
"step": 108
},
{
"epoch": 0.21078075900410925,
"grad_norm": 2.8118133544921875,
"learning_rate": 1.8381579113997252e-06,
"loss": 0.5479,
"step": 109
},
{
"epoch": 0.2127145274353396,
"grad_norm": 2.882737159729004,
"learning_rate": 1.8347212726762436e-06,
"loss": 0.5641,
"step": 110
},
{
"epoch": 0.21464829586656997,
"grad_norm": 2.9052658081054688,
"learning_rate": 1.8312518120048353e-06,
"loss": 0.4957,
"step": 111
},
{
"epoch": 0.21658206429780033,
"grad_norm": 2.898360013961792,
"learning_rate": 1.827749665807635e-06,
"loss": 0.5477,
"step": 112
},
{
"epoch": 0.2185158327290307,
"grad_norm": 2.7747766971588135,
"learning_rate": 1.8242149717919991e-06,
"loss": 0.5579,
"step": 113
},
{
"epoch": 0.22044960116026105,
"grad_norm": 3.0171499252319336,
"learning_rate": 1.8206478689450929e-06,
"loss": 0.5669,
"step": 114
},
{
"epoch": 0.2223833695914914,
"grad_norm": 2.9386541843414307,
"learning_rate": 1.817048497528423e-06,
"loss": 0.5213,
"step": 115
},
{
"epoch": 0.22431713802272177,
"grad_norm": 2.888003349304199,
"learning_rate": 1.8134169990723246e-06,
"loss": 0.5299,
"step": 116
},
{
"epoch": 0.22625090645395213,
"grad_norm": 3.0927133560180664,
"learning_rate": 1.8097535163703945e-06,
"loss": 0.5599,
"step": 117
},
{
"epoch": 0.2281846748851825,
"grad_norm": 2.846223831176758,
"learning_rate": 1.8060581934738781e-06,
"loss": 0.5416,
"step": 118
},
{
"epoch": 0.23011844331641285,
"grad_norm": 2.8315625190734863,
"learning_rate": 1.8023311756860035e-06,
"loss": 0.5483,
"step": 119
},
{
"epoch": 0.23205221174764323,
"grad_norm": 2.946509599685669,
"learning_rate": 1.798572609556269e-06,
"loss": 0.5825,
"step": 120
},
{
"epoch": 0.2339859801788736,
"grad_norm": 3.0473732948303223,
"learning_rate": 1.7947826428746806e-06,
"loss": 0.5535,
"step": 121
},
{
"epoch": 0.23591974861010395,
"grad_norm": 3.0193090438842773,
"learning_rate": 1.7909614246659406e-06,
"loss": 0.5517,
"step": 122
},
{
"epoch": 0.2378535170413343,
"grad_norm": 2.919999837875366,
"learning_rate": 1.787109105183587e-06,
"loss": 0.5755,
"step": 123
},
{
"epoch": 0.23978728547256467,
"grad_norm": 2.945788860321045,
"learning_rate": 1.7832258359040882e-06,
"loss": 0.5206,
"step": 124
},
{
"epoch": 0.24172105390379503,
"grad_norm": 2.968233108520508,
"learning_rate": 1.7793117695208826e-06,
"loss": 0.5674,
"step": 125
},
{
"epoch": 0.2436548223350254,
"grad_norm": 2.9779975414276123,
"learning_rate": 1.775367059938378e-06,
"loss": 0.5777,
"step": 126
},
{
"epoch": 0.24558859076625575,
"grad_norm": 3.014735221862793,
"learning_rate": 1.771391862265899e-06,
"loss": 0.55,
"step": 127
},
{
"epoch": 0.2475223591974861,
"grad_norm": 3.1135857105255127,
"learning_rate": 1.7673863328115867e-06,
"loss": 0.5403,
"step": 128
},
{
"epoch": 0.24945612762871647,
"grad_norm": 2.9737236499786377,
"learning_rate": 1.763350629076255e-06,
"loss": 0.554,
"step": 129
},
{
"epoch": 0.2513898960599468,
"grad_norm": 2.74530291557312,
"learning_rate": 1.7592849097471947e-06,
"loss": 0.5474,
"step": 130
},
{
"epoch": 0.25332366449117716,
"grad_norm": 3.2034261226654053,
"learning_rate": 1.755189334691936e-06,
"loss": 0.5603,
"step": 131
},
{
"epoch": 0.25525743292240755,
"grad_norm": 2.845120668411255,
"learning_rate": 1.7510640649519614e-06,
"loss": 0.536,
"step": 132
},
{
"epoch": 0.2571912013536379,
"grad_norm": 2.771730661392212,
"learning_rate": 1.7469092627363735e-06,
"loss": 0.5016,
"step": 133
},
{
"epoch": 0.25912496978486826,
"grad_norm": 2.9173460006713867,
"learning_rate": 1.742725091415517e-06,
"loss": 0.5249,
"step": 134
},
{
"epoch": 0.2610587382160986,
"grad_norm": 3.0819239616394043,
"learning_rate": 1.7385117155145545e-06,
"loss": 0.5169,
"step": 135
},
{
"epoch": 0.262992506647329,
"grad_norm": 3.031985282897949,
"learning_rate": 1.7342693007069973e-06,
"loss": 0.5824,
"step": 136
},
{
"epoch": 0.26492627507855937,
"grad_norm": 2.9924063682556152,
"learning_rate": 1.729998013808192e-06,
"loss": 0.5805,
"step": 137
},
{
"epoch": 0.2668600435097897,
"grad_norm": 3.3764071464538574,
"learning_rate": 1.7256980227687591e-06,
"loss": 0.644,
"step": 138
},
{
"epoch": 0.2687938119410201,
"grad_norm": 2.775237798690796,
"learning_rate": 1.7213694966679912e-06,
"loss": 0.495,
"step": 139
},
{
"epoch": 0.2707275803722504,
"grad_norm": 3.067394733428955,
"learning_rate": 1.717012605707203e-06,
"loss": 0.5125,
"step": 140
},
{
"epoch": 0.2726613488034808,
"grad_norm": 3.1399190425872803,
"learning_rate": 1.71262752120304e-06,
"loss": 0.5661,
"step": 141
},
{
"epoch": 0.27459511723471114,
"grad_norm": 3.03717303276062,
"learning_rate": 1.7082144155807416e-06,
"loss": 0.6028,
"step": 142
},
{
"epoch": 0.2765288856659415,
"grad_norm": 2.9807446002960205,
"learning_rate": 1.7037734623673613e-06,
"loss": 0.5505,
"step": 143
},
{
"epoch": 0.27846265409717186,
"grad_norm": 2.834151268005371,
"learning_rate": 1.6993048361849437e-06,
"loss": 0.5391,
"step": 144
},
{
"epoch": 0.28039642252840224,
"grad_norm": 2.9190399646759033,
"learning_rate": 1.694808712743657e-06,
"loss": 0.5122,
"step": 145
},
{
"epoch": 0.2823301909596326,
"grad_norm": 3.0294294357299805,
"learning_rate": 1.6902852688348864e-06,
"loss": 0.5764,
"step": 146
},
{
"epoch": 0.28426395939086296,
"grad_norm": 2.864239454269409,
"learning_rate": 1.68573468232428e-06,
"loss": 0.542,
"step": 147
},
{
"epoch": 0.2861977278220933,
"grad_norm": 3.1250009536743164,
"learning_rate": 1.6811571321447565e-06,
"loss": 0.5638,
"step": 148
},
{
"epoch": 0.2881314962533237,
"grad_norm": 2.9388275146484375,
"learning_rate": 1.6765527982894687e-06,
"loss": 0.5452,
"step": 149
},
{
"epoch": 0.290065264684554,
"grad_norm": 3.0746877193450928,
"learning_rate": 1.6719218618047262e-06,
"loss": 0.5515,
"step": 150
},
{
"epoch": 0.2919990331157844,
"grad_norm": 2.9160993099212646,
"learning_rate": 1.6672645047828769e-06,
"loss": 0.5759,
"step": 151
},
{
"epoch": 0.29393280154701473,
"grad_norm": 2.942295551300049,
"learning_rate": 1.662580910355146e-06,
"loss": 0.5791,
"step": 152
},
{
"epoch": 0.2958665699782451,
"grad_norm": 3.1533167362213135,
"learning_rate": 1.6578712626844363e-06,
"loss": 0.5355,
"step": 153
},
{
"epoch": 0.29780033840947545,
"grad_norm": 2.9345664978027344,
"learning_rate": 1.6531357469580857e-06,
"loss": 0.54,
"step": 154
},
{
"epoch": 0.29973410684070584,
"grad_norm": 2.933525562286377,
"learning_rate": 1.6483745493805868e-06,
"loss": 0.5085,
"step": 155
},
{
"epoch": 0.30166787527193617,
"grad_norm": 3.0294647216796875,
"learning_rate": 1.643587857166263e-06,
"loss": 0.5245,
"step": 156
},
{
"epoch": 0.30360164370316656,
"grad_norm": 2.784876585006714,
"learning_rate": 1.6387758585319098e-06,
"loss": 0.5279,
"step": 157
},
{
"epoch": 0.3055354121343969,
"grad_norm": 2.812796115875244,
"learning_rate": 1.6339387426893917e-06,
"loss": 0.4981,
"step": 158
},
{
"epoch": 0.3074691805656273,
"grad_norm": 3.0488643646240234,
"learning_rate": 1.629076699838204e-06,
"loss": 0.5355,
"step": 159
},
{
"epoch": 0.3094029489968576,
"grad_norm": 2.980043888092041,
"learning_rate": 1.624189921157992e-06,
"loss": 0.5306,
"step": 160
},
{
"epoch": 0.311336717428088,
"grad_norm": 2.963444709777832,
"learning_rate": 1.6192785988010352e-06,
"loss": 0.5569,
"step": 161
},
{
"epoch": 0.3132704858593183,
"grad_norm": 2.8613176345825195,
"learning_rate": 1.6143429258846915e-06,
"loss": 0.5242,
"step": 162
},
{
"epoch": 0.3152042542905487,
"grad_norm": 2.9481241703033447,
"learning_rate": 1.6093830964838033e-06,
"loss": 0.5459,
"step": 163
},
{
"epoch": 0.31713802272177904,
"grad_norm": 2.9568634033203125,
"learning_rate": 1.6043993056230666e-06,
"loss": 0.5866,
"step": 164
},
{
"epoch": 0.31907179115300943,
"grad_norm": 2.9647092819213867,
"learning_rate": 1.5993917492693623e-06,
"loss": 0.5505,
"step": 165
},
{
"epoch": 0.32100555958423976,
"grad_norm": 2.9472572803497314,
"learning_rate": 1.59436062432405e-06,
"loss": 0.5905,
"step": 166
},
{
"epoch": 0.32293932801547015,
"grad_norm": 2.9131317138671875,
"learning_rate": 1.5893061286152273e-06,
"loss": 0.5303,
"step": 167
},
{
"epoch": 0.3248730964467005,
"grad_norm": 2.945516586303711,
"learning_rate": 1.5842284608899488e-06,
"loss": 0.5336,
"step": 168
},
{
"epoch": 0.32680686487793087,
"grad_norm": 2.8020524978637695,
"learning_rate": 1.579127820806414e-06,
"loss": 0.4651,
"step": 169
},
{
"epoch": 0.3287406333091612,
"grad_norm": 3.0330491065979004,
"learning_rate": 1.5740044089261136e-06,
"loss": 0.4867,
"step": 170
},
{
"epoch": 0.3306744017403916,
"grad_norm": 2.847795009613037,
"learning_rate": 1.5688584267059453e-06,
"loss": 0.5472,
"step": 171
},
{
"epoch": 0.332608170171622,
"grad_norm": 2.9428656101226807,
"learning_rate": 1.5636900764902914e-06,
"loss": 0.5629,
"step": 172
},
{
"epoch": 0.3345419386028523,
"grad_norm": 3.1021251678466797,
"learning_rate": 1.5584995615030632e-06,
"loss": 0.5857,
"step": 173
},
{
"epoch": 0.3364757070340827,
"grad_norm": 3.0706946849823,
"learning_rate": 1.5532870858397092e-06,
"loss": 0.5518,
"step": 174
},
{
"epoch": 0.338409475465313,
"grad_norm": 3.023850440979004,
"learning_rate": 1.5480528544591905e-06,
"loss": 0.5307,
"step": 175
},
{
"epoch": 0.3403432438965434,
"grad_norm": 2.7942423820495605,
"learning_rate": 1.542797073175921e-06,
"loss": 0.528,
"step": 176
},
{
"epoch": 0.34227701232777374,
"grad_norm": 3.0587525367736816,
"learning_rate": 1.5375199486516759e-06,
"loss": 0.5768,
"step": 177
},
{
"epoch": 0.34421078075900413,
"grad_norm": 3.1753013134002686,
"learning_rate": 1.532221688387464e-06,
"loss": 0.5658,
"step": 178
},
{
"epoch": 0.34614454919023446,
"grad_norm": 2.8351001739501953,
"learning_rate": 1.5269025007153698e-06,
"loss": 0.5008,
"step": 179
},
{
"epoch": 0.34807831762146485,
"grad_norm": 2.830599069595337,
"learning_rate": 1.5215625947903603e-06,
"loss": 0.4855,
"step": 180
},
{
"epoch": 0.3500120860526952,
"grad_norm": 2.871882200241089,
"learning_rate": 1.516202180582063e-06,
"loss": 0.5144,
"step": 181
},
{
"epoch": 0.35194585448392557,
"grad_norm": 2.947722911834717,
"learning_rate": 1.5108214688665077e-06,
"loss": 0.5819,
"step": 182
},
{
"epoch": 0.3538796229151559,
"grad_norm": 2.912292003631592,
"learning_rate": 1.5054206712178399e-06,
"loss": 0.5282,
"step": 183
},
{
"epoch": 0.3558133913463863,
"grad_norm": 2.9988412857055664,
"learning_rate": 1.5e-06,
"loss": 0.5249,
"step": 184
},
{
"epoch": 0.3577471597776166,
"grad_norm": 2.934715986251831,
"learning_rate": 1.4945596683583753e-06,
"loss": 0.5424,
"step": 185
},
{
"epoch": 0.359680928208847,
"grad_norm": 2.935274839401245,
"learning_rate": 1.4890998902114165e-06,
"loss": 0.5147,
"step": 186
},
{
"epoch": 0.36161469664007734,
"grad_norm": 3.1173348426818848,
"learning_rate": 1.4836208802422286e-06,
"loss": 0.5402,
"step": 187
},
{
"epoch": 0.3635484650713077,
"grad_norm": 2.9309182167053223,
"learning_rate": 1.4781228538901265e-06,
"loss": 0.4956,
"step": 188
},
{
"epoch": 0.36548223350253806,
"grad_norm": 2.8888142108917236,
"learning_rate": 1.4726060273421673e-06,
"loss": 0.534,
"step": 189
},
{
"epoch": 0.36741600193376844,
"grad_norm": 2.891580820083618,
"learning_rate": 1.4670706175246462e-06,
"loss": 0.5188,
"step": 190
},
{
"epoch": 0.3693497703649988,
"grad_norm": 3.278562545776367,
"learning_rate": 1.4615168420945687e-06,
"loss": 0.572,
"step": 191
},
{
"epoch": 0.37128353879622916,
"grad_norm": 3.0605547428131104,
"learning_rate": 1.4559449194310924e-06,
"loss": 0.6035,
"step": 192
},
{
"epoch": 0.3732173072274595,
"grad_norm": 2.988506317138672,
"learning_rate": 1.4503550686269388e-06,
"loss": 0.5552,
"step": 193
},
{
"epoch": 0.3751510756586899,
"grad_norm": 2.906756639480591,
"learning_rate": 1.4447475094797793e-06,
"loss": 0.507,
"step": 194
},
{
"epoch": 0.3770848440899202,
"grad_norm": 2.8822944164276123,
"learning_rate": 1.4391224624835928e-06,
"loss": 0.5025,
"step": 195
},
{
"epoch": 0.3790186125211506,
"grad_norm": 3.0325863361358643,
"learning_rate": 1.4334801488199939e-06,
"loss": 0.5193,
"step": 196
},
{
"epoch": 0.38095238095238093,
"grad_norm": 2.892662763595581,
"learning_rate": 1.4278207903495387e-06,
"loss": 0.5187,
"step": 197
},
{
"epoch": 0.3828861493836113,
"grad_norm": 2.7827394008636475,
"learning_rate": 1.422144609602999e-06,
"loss": 0.5296,
"step": 198
},
{
"epoch": 0.38481991781484165,
"grad_norm": 2.9269039630889893,
"learning_rate": 1.4164518297726126e-06,
"loss": 0.4882,
"step": 199
},
{
"epoch": 0.38675368624607204,
"grad_norm": 3.1903462409973145,
"learning_rate": 1.4107426747033072e-06,
"loss": 0.5382,
"step": 200
},
{
"epoch": 0.38868745467730237,
"grad_norm": 2.9497694969177246,
"learning_rate": 1.4050173688838998e-06,
"loss": 0.5187,
"step": 201
},
{
"epoch": 0.39062122310853276,
"grad_norm": 2.956583023071289,
"learning_rate": 1.399276137438267e-06,
"loss": 0.5251,
"step": 202
},
{
"epoch": 0.3925549915397631,
"grad_norm": 2.959354877471924,
"learning_rate": 1.3935192061164955e-06,
"loss": 0.6077,
"step": 203
},
{
"epoch": 0.3944887599709935,
"grad_norm": 2.9722297191619873,
"learning_rate": 1.3877468012860046e-06,
"loss": 0.5582,
"step": 204
},
{
"epoch": 0.3964225284022238,
"grad_norm": 2.9446754455566406,
"learning_rate": 1.3819591499226444e-06,
"loss": 0.5277,
"step": 205
},
{
"epoch": 0.3983562968334542,
"grad_norm": 2.925280809402466,
"learning_rate": 1.376156479601772e-06,
"loss": 0.5251,
"step": 206
},
{
"epoch": 0.4002900652646846,
"grad_norm": 2.7258141040802,
"learning_rate": 1.3703390184893033e-06,
"loss": 0.497,
"step": 207
},
{
"epoch": 0.4022238336959149,
"grad_norm": 2.9930813312530518,
"learning_rate": 1.3645069953327388e-06,
"loss": 0.575,
"step": 208
},
{
"epoch": 0.4041576021271453,
"grad_norm": 2.80450177192688,
"learning_rate": 1.3586606394521727e-06,
"loss": 0.4993,
"step": 209
},
{
"epoch": 0.40609137055837563,
"grad_norm": 2.997321605682373,
"learning_rate": 1.3528001807312732e-06,
"loss": 0.5231,
"step": 210
},
{
"epoch": 0.408025138989606,
"grad_norm": 3.1863949298858643,
"learning_rate": 1.3469258496082449e-06,
"loss": 0.5803,
"step": 211
},
{
"epoch": 0.40995890742083635,
"grad_norm": 2.7617061138153076,
"learning_rate": 1.341037877066766e-06,
"loss": 0.5061,
"step": 212
},
{
"epoch": 0.41189267585206674,
"grad_norm": 2.9016385078430176,
"learning_rate": 1.335136494626907e-06,
"loss": 0.4804,
"step": 213
},
{
"epoch": 0.41382644428329707,
"grad_norm": 3.061267614364624,
"learning_rate": 1.3292219343360278e-06,
"loss": 0.55,
"step": 214
},
{
"epoch": 0.41576021271452746,
"grad_norm": 3.161426544189453,
"learning_rate": 1.323294428759652e-06,
"loss": 0.5507,
"step": 215
},
{
"epoch": 0.4176939811457578,
"grad_norm": 3.2181830406188965,
"learning_rate": 1.3173542109723227e-06,
"loss": 0.5543,
"step": 216
},
{
"epoch": 0.4196277495769882,
"grad_norm": 3.0244412422180176,
"learning_rate": 1.3114015145484378e-06,
"loss": 0.5716,
"step": 217
},
{
"epoch": 0.4215615180082185,
"grad_norm": 2.8925223350524902,
"learning_rate": 1.3054365735530664e-06,
"loss": 0.4452,
"step": 218
},
{
"epoch": 0.4234952864394489,
"grad_norm": 3.0561466217041016,
"learning_rate": 1.2994596225327442e-06,
"loss": 0.4996,
"step": 219
},
{
"epoch": 0.4254290548706792,
"grad_norm": 2.930914878845215,
"learning_rate": 1.2934708965062507e-06,
"loss": 0.5131,
"step": 220
},
{
"epoch": 0.4273628233019096,
"grad_norm": 2.9865005016326904,
"learning_rate": 1.2874706309553697e-06,
"loss": 0.5286,
"step": 221
},
{
"epoch": 0.42929659173313994,
"grad_norm": 3.2668237686157227,
"learning_rate": 1.2814590618156275e-06,
"loss": 0.5774,
"step": 222
},
{
"epoch": 0.43123036016437033,
"grad_norm": 2.931473970413208,
"learning_rate": 1.2754364254670192e-06,
"loss": 0.5236,
"step": 223
},
{
"epoch": 0.43316412859560066,
"grad_norm": 2.954760789871216,
"learning_rate": 1.2694029587247095e-06,
"loss": 0.4893,
"step": 224
},
{
"epoch": 0.43509789702683105,
"grad_norm": 2.9701437950134277,
"learning_rate": 1.2633588988297247e-06,
"loss": 0.5199,
"step": 225
},
{
"epoch": 0.4370316654580614,
"grad_norm": 3.1630728244781494,
"learning_rate": 1.2573044834396229e-06,
"loss": 0.5607,
"step": 226
},
{
"epoch": 0.43896543388929177,
"grad_norm": 2.9174458980560303,
"learning_rate": 1.251239950619149e-06,
"loss": 0.4932,
"step": 227
},
{
"epoch": 0.4408992023205221,
"grad_norm": 2.9808075428009033,
"learning_rate": 1.245165538830873e-06,
"loss": 0.5203,
"step": 228
},
{
"epoch": 0.4428329707517525,
"grad_norm": 2.8363564014434814,
"learning_rate": 1.2390814869258154e-06,
"loss": 0.54,
"step": 229
},
{
"epoch": 0.4447667391829828,
"grad_norm": 2.8673954010009766,
"learning_rate": 1.232988034134053e-06,
"loss": 0.5408,
"step": 230
},
{
"epoch": 0.4467005076142132,
"grad_norm": 2.936624526977539,
"learning_rate": 1.2268854200553156e-06,
"loss": 0.4853,
"step": 231
},
{
"epoch": 0.44863427604544354,
"grad_norm": 3.0259556770324707,
"learning_rate": 1.2207738846495599e-06,
"loss": 0.5135,
"step": 232
},
{
"epoch": 0.4505680444766739,
"grad_norm": 2.8267462253570557,
"learning_rate": 1.2146536682275385e-06,
"loss": 0.5327,
"step": 233
},
{
"epoch": 0.45250181290790426,
"grad_norm": 3.01057767868042,
"learning_rate": 1.2085250114413484e-06,
"loss": 0.5043,
"step": 234
},
{
"epoch": 0.45443558133913464,
"grad_norm": 2.9250426292419434,
"learning_rate": 1.202388155274969e-06,
"loss": 0.5282,
"step": 235
},
{
"epoch": 0.456369349770365,
"grad_norm": 3.085334539413452,
"learning_rate": 1.1962433410347858e-06,
"loss": 0.536,
"step": 236
},
{
"epoch": 0.45830311820159536,
"grad_norm": 3.0255112648010254,
"learning_rate": 1.1900908103401036e-06,
"loss": 0.5392,
"step": 237
},
{
"epoch": 0.4602368866328257,
"grad_norm": 2.8932268619537354,
"learning_rate": 1.1839308051136429e-06,
"loss": 0.5142,
"step": 238
},
{
"epoch": 0.4621706550640561,
"grad_norm": 3.051480293273926,
"learning_rate": 1.1777635675720313e-06,
"loss": 0.5509,
"step": 239
},
{
"epoch": 0.46410442349528647,
"grad_norm": 2.9904825687408447,
"learning_rate": 1.1715893402162757e-06,
"loss": 0.5316,
"step": 240
},
{
"epoch": 0.4660381919265168,
"grad_norm": 2.8626015186309814,
"learning_rate": 1.1654083658222287e-06,
"loss": 0.4737,
"step": 241
},
{
"epoch": 0.4679719603577472,
"grad_norm": 2.847097158432007,
"learning_rate": 1.1592208874310425e-06,
"loss": 0.5018,
"step": 242
},
{
"epoch": 0.4699057287889775,
"grad_norm": 3.1590757369995117,
"learning_rate": 1.1530271483396115e-06,
"loss": 0.5399,
"step": 243
},
{
"epoch": 0.4718394972202079,
"grad_norm": 3.036067247390747,
"learning_rate": 1.1468273920910067e-06,
"loss": 0.5831,
"step": 244
},
{
"epoch": 0.47377326565143824,
"grad_norm": 3.331260919570923,
"learning_rate": 1.1406218624648985e-06,
"loss": 0.55,
"step": 245
},
{
"epoch": 0.4757070340826686,
"grad_norm": 2.8437914848327637,
"learning_rate": 1.1344108034679709e-06,
"loss": 0.4947,
"step": 246
},
{
"epoch": 0.47764080251389895,
"grad_norm": 3.1023781299591064,
"learning_rate": 1.1281944593243287e-06,
"loss": 0.5565,
"step": 247
},
{
"epoch": 0.47957457094512934,
"grad_norm": 3.0982506275177,
"learning_rate": 1.121973074465892e-06,
"loss": 0.555,
"step": 248
},
{
"epoch": 0.4815083393763597,
"grad_norm": 3.2432122230529785,
"learning_rate": 1.1157468935227865e-06,
"loss": 0.5621,
"step": 249
},
{
"epoch": 0.48344210780759006,
"grad_norm": 2.9969475269317627,
"learning_rate": 1.109516161313724e-06,
"loss": 0.5244,
"step": 250
},
{
"epoch": 0.4853758762388204,
"grad_norm": 2.941561222076416,
"learning_rate": 1.1032811228363764e-06,
"loss": 0.5513,
"step": 251
},
{
"epoch": 0.4873096446700508,
"grad_norm": 2.8772778511047363,
"learning_rate": 1.0970420232577406e-06,
"loss": 0.4848,
"step": 252
},
{
"epoch": 0.4892434131012811,
"grad_norm": 3.026517391204834,
"learning_rate": 1.0907991079045006e-06,
"loss": 0.4887,
"step": 253
},
{
"epoch": 0.4911771815325115,
"grad_norm": 2.914242744445801,
"learning_rate": 1.084552622253379e-06,
"loss": 0.5066,
"step": 254
},
{
"epoch": 0.49311094996374183,
"grad_norm": 3.085822105407715,
"learning_rate": 1.0783028119214861e-06,
"loss": 0.5018,
"step": 255
},
{
"epoch": 0.4950447183949722,
"grad_norm": 2.869805097579956,
"learning_rate": 1.0720499226566616e-06,
"loss": 0.5241,
"step": 256
},
{
"epoch": 0.49697848682620255,
"grad_norm": 3.0161194801330566,
"learning_rate": 1.0657942003278106e-06,
"loss": 0.4984,
"step": 257
},
{
"epoch": 0.49891225525743294,
"grad_norm": 2.8013997077941895,
"learning_rate": 1.0595358909152377e-06,
"loss": 0.519,
"step": 258
},
{
"epoch": 0.5008460236886633,
"grad_norm": 2.783162832260132,
"learning_rate": 1.053275240500973e-06,
"loss": 0.4913,
"step": 259
},
{
"epoch": 0.5027797921198937,
"grad_norm": 3.2985990047454834,
"learning_rate": 1.0470124952590974e-06,
"loss": 0.5137,
"step": 260
},
{
"epoch": 0.504713560551124,
"grad_norm": 2.9076266288757324,
"learning_rate": 1.040747901446062e-06,
"loss": 0.5054,
"step": 261
},
{
"epoch": 0.5066473289823543,
"grad_norm": 2.830371141433716,
"learning_rate": 1.0344817053910048e-06,
"loss": 0.5071,
"step": 262
},
{
"epoch": 0.5085810974135847,
"grad_norm": 2.898216485977173,
"learning_rate": 1.028214153486066e-06,
"loss": 0.5002,
"step": 263
},
{
"epoch": 0.5105148658448151,
"grad_norm": 2.9580469131469727,
"learning_rate": 1.021945492176699e-06,
"loss": 0.4903,
"step": 264
},
{
"epoch": 0.5124486342760455,
"grad_norm": 3.2471764087677,
"learning_rate": 1.0156759679519789e-06,
"loss": 0.5037,
"step": 265
},
{
"epoch": 0.5143824027072758,
"grad_norm": 2.984576940536499,
"learning_rate": 1.0094058273349125e-06,
"loss": 0.5115,
"step": 266
},
{
"epoch": 0.5163161711385061,
"grad_norm": 2.9899098873138428,
"learning_rate": 1.003135316872743e-06,
"loss": 0.4761,
"step": 267
},
{
"epoch": 0.5182499395697365,
"grad_norm": 3.1444497108459473,
"learning_rate": 9.96864683127257e-07,
"loss": 0.5092,
"step": 268
},
{
"epoch": 0.5201837080009669,
"grad_norm": 2.837801456451416,
"learning_rate": 9.905941726650879e-07,
"loss": 0.4743,
"step": 269
},
{
"epoch": 0.5221174764321972,
"grad_norm": 2.847886085510254,
"learning_rate": 9.843240320480212e-07,
"loss": 0.4526,
"step": 270
},
{
"epoch": 0.5240512448634276,
"grad_norm": 2.8540098667144775,
"learning_rate": 9.780545078233012e-07,
"loss": 0.4964,
"step": 271
},
{
"epoch": 0.525985013294658,
"grad_norm": 2.9519143104553223,
"learning_rate": 9.717858465139341e-07,
"loss": 0.4792,
"step": 272
},
{
"epoch": 0.5279187817258884,
"grad_norm": 2.9407591819763184,
"learning_rate": 9.655182946089955e-07,
"loss": 0.5136,
"step": 273
},
{
"epoch": 0.5298525501571187,
"grad_norm": 2.884398937225342,
"learning_rate": 9.592520985539381e-07,
"loss": 0.5249,
"step": 274
},
{
"epoch": 0.531786318588349,
"grad_norm": 2.9702608585357666,
"learning_rate": 9.529875047409026e-07,
"loss": 0.5022,
"step": 275
},
{
"epoch": 0.5337200870195794,
"grad_norm": 2.78511118888855,
"learning_rate": 9.467247594990271e-07,
"loss": 0.5296,
"step": 276
},
{
"epoch": 0.5356538554508098,
"grad_norm": 3.069162607192993,
"learning_rate": 9.404641090847626e-07,
"loss": 0.5199,
"step": 277
},
{
"epoch": 0.5375876238820402,
"grad_norm": 3.088228940963745,
"learning_rate": 9.342057996721894e-07,
"loss": 0.5884,
"step": 278
},
{
"epoch": 0.5395213923132705,
"grad_norm": 2.84122633934021,
"learning_rate": 9.279500773433385e-07,
"loss": 0.479,
"step": 279
},
{
"epoch": 0.5414551607445008,
"grad_norm": 3.0056533813476562,
"learning_rate": 9.216971880785139e-07,
"loss": 0.4961,
"step": 280
},
{
"epoch": 0.5433889291757312,
"grad_norm": 2.8740074634552,
"learning_rate": 9.154473777466209e-07,
"loss": 0.4876,
"step": 281
},
{
"epoch": 0.5453226976069616,
"grad_norm": 2.9126172065734863,
"learning_rate": 9.092008920954995e-07,
"loss": 0.5112,
"step": 282
},
{
"epoch": 0.5472564660381919,
"grad_norm": 3.0587427616119385,
"learning_rate": 9.029579767422592e-07,
"loss": 0.4757,
"step": 283
},
{
"epoch": 0.5491902344694223,
"grad_norm": 2.970956563949585,
"learning_rate": 8.967188771636236e-07,
"loss": 0.5116,
"step": 284
},
{
"epoch": 0.5511240029006527,
"grad_norm": 2.7673499584198,
"learning_rate": 8.904838386862757e-07,
"loss": 0.507,
"step": 285
},
{
"epoch": 0.553057771331883,
"grad_norm": 2.853449821472168,
"learning_rate": 8.842531064772136e-07,
"loss": 0.4738,
"step": 286
},
{
"epoch": 0.5549915397631133,
"grad_norm": 2.9352164268493652,
"learning_rate": 8.78026925534108e-07,
"loss": 0.534,
"step": 287
},
{
"epoch": 0.5569253081943437,
"grad_norm": 3.007397413253784,
"learning_rate": 8.718055406756713e-07,
"loss": 0.513,
"step": 288
},
{
"epoch": 0.5588590766255741,
"grad_norm": 2.853698253631592,
"learning_rate": 8.65589196532029e-07,
"loss": 0.5263,
"step": 289
},
{
"epoch": 0.5607928450568045,
"grad_norm": 3.211627244949341,
"learning_rate": 8.593781375351019e-07,
"loss": 0.5094,
"step": 290
},
{
"epoch": 0.5627266134880348,
"grad_norm": 2.8546507358551025,
"learning_rate": 8.531726079089933e-07,
"loss": 0.4698,
"step": 291
},
{
"epoch": 0.5646603819192652,
"grad_norm": 2.8525969982147217,
"learning_rate": 8.469728516603886e-07,
"loss": 0.5319,
"step": 292
},
{
"epoch": 0.5665941503504955,
"grad_norm": 2.925687313079834,
"learning_rate": 8.407791125689576e-07,
"loss": 0.5478,
"step": 293
},
{
"epoch": 0.5685279187817259,
"grad_norm": 2.994581460952759,
"learning_rate": 8.345916341777714e-07,
"loss": 0.5365,
"step": 294
},
{
"epoch": 0.5704616872129562,
"grad_norm": 3.1099655628204346,
"learning_rate": 8.284106597837242e-07,
"loss": 0.5256,
"step": 295
},
{
"epoch": 0.5723954556441866,
"grad_norm": 2.8576478958129883,
"learning_rate": 8.222364324279689e-07,
"loss": 0.5182,
"step": 296
},
{
"epoch": 0.574329224075417,
"grad_norm": 2.6711535453796387,
"learning_rate": 8.16069194886357e-07,
"loss": 0.4789,
"step": 297
},
{
"epoch": 0.5762629925066474,
"grad_norm": 3.22711181640625,
"learning_rate": 8.099091896598964e-07,
"loss": 0.5506,
"step": 298
},
{
"epoch": 0.5781967609378776,
"grad_norm": 3.0444905757904053,
"learning_rate": 8.037566589652141e-07,
"loss": 0.5388,
"step": 299
},
{
"epoch": 0.580130529369108,
"grad_norm": 3.0839669704437256,
"learning_rate": 7.97611844725031e-07,
"loss": 0.507,
"step": 300
},
{
"epoch": 0.5820642978003384,
"grad_norm": 3.067812442779541,
"learning_rate": 7.914749885586515e-07,
"loss": 0.5522,
"step": 301
},
{
"epoch": 0.5839980662315688,
"grad_norm": 2.7645392417907715,
"learning_rate": 7.853463317724614e-07,
"loss": 0.4622,
"step": 302
},
{
"epoch": 0.5859318346627991,
"grad_norm": 2.6988883018493652,
"learning_rate": 7.792261153504401e-07,
"loss": 0.4774,
"step": 303
},
{
"epoch": 0.5878656030940295,
"grad_norm": 2.8341736793518066,
"learning_rate": 7.731145799446844e-07,
"loss": 0.5248,
"step": 304
},
{
"epoch": 0.5897993715252599,
"grad_norm": 2.75606632232666,
"learning_rate": 7.670119658659469e-07,
"loss": 0.525,
"step": 305
},
{
"epoch": 0.5917331399564902,
"grad_norm": 2.893043279647827,
"learning_rate": 7.609185130741846e-07,
"loss": 0.5375,
"step": 306
},
{
"epoch": 0.5936669083877205,
"grad_norm": 3.2129299640655518,
"learning_rate": 7.548344611691271e-07,
"loss": 0.5252,
"step": 307
},
{
"epoch": 0.5956006768189509,
"grad_norm": 2.8687853813171387,
"learning_rate": 7.487600493808513e-07,
"loss": 0.5052,
"step": 308
},
{
"epoch": 0.5975344452501813,
"grad_norm": 2.8696491718292236,
"learning_rate": 7.426955165603772e-07,
"loss": 0.4912,
"step": 309
},
{
"epoch": 0.5994682136814117,
"grad_norm": 2.9154770374298096,
"learning_rate": 7.366411011702753e-07,
"loss": 0.509,
"step": 310
},
{
"epoch": 0.6014019821126421,
"grad_norm": 2.958178997039795,
"learning_rate": 7.305970412752909e-07,
"loss": 0.5168,
"step": 311
},
{
"epoch": 0.6033357505438723,
"grad_norm": 3.04964542388916,
"learning_rate": 7.245635745329809e-07,
"loss": 0.4598,
"step": 312
},
{
"epoch": 0.6052695189751027,
"grad_norm": 2.9642903804779053,
"learning_rate": 7.185409381843725e-07,
"loss": 0.4998,
"step": 313
},
{
"epoch": 0.6072032874063331,
"grad_norm": 2.8785696029663086,
"learning_rate": 7.125293690446306e-07,
"loss": 0.5129,
"step": 314
},
{
"epoch": 0.6091370558375635,
"grad_norm": 3.1034622192382812,
"learning_rate": 7.065291034937495e-07,
"loss": 0.5073,
"step": 315
},
{
"epoch": 0.6110708242687938,
"grad_norm": 2.8777222633361816,
"learning_rate": 7.005403774672559e-07,
"loss": 0.5226,
"step": 316
},
{
"epoch": 0.6130045927000242,
"grad_norm": 3.067791223526001,
"learning_rate": 6.945634264469337e-07,
"loss": 0.4938,
"step": 317
},
{
"epoch": 0.6149383611312546,
"grad_norm": 3.056411027908325,
"learning_rate": 6.885984854515623e-07,
"loss": 0.5097,
"step": 318
},
{
"epoch": 0.6168721295624849,
"grad_norm": 3.0727953910827637,
"learning_rate": 6.826457890276772e-07,
"loss": 0.5011,
"step": 319
},
{
"epoch": 0.6188058979937152,
"grad_norm": 2.822244167327881,
"learning_rate": 6.76705571240348e-07,
"loss": 0.4943,
"step": 320
},
{
"epoch": 0.6207396664249456,
"grad_norm": 3.038038969039917,
"learning_rate": 6.707780656639721e-07,
"loss": 0.5453,
"step": 321
},
{
"epoch": 0.622673434856176,
"grad_norm": 2.872443437576294,
"learning_rate": 6.64863505373093e-07,
"loss": 0.5338,
"step": 322
},
{
"epoch": 0.6246072032874064,
"grad_norm": 3.021986961364746,
"learning_rate": 6.58962122933234e-07,
"loss": 0.5245,
"step": 323
},
{
"epoch": 0.6265409717186367,
"grad_norm": 2.882286787033081,
"learning_rate": 6.53074150391755e-07,
"loss": 0.4942,
"step": 324
},
{
"epoch": 0.628474740149867,
"grad_norm": 2.975586414337158,
"learning_rate": 6.471998192687265e-07,
"loss": 0.5537,
"step": 325
},
{
"epoch": 0.6304085085810974,
"grad_norm": 2.871222734451294,
"learning_rate": 6.413393605478274e-07,
"loss": 0.4847,
"step": 326
},
{
"epoch": 0.6323422770123278,
"grad_norm": 2.9107320308685303,
"learning_rate": 6.35493004667261e-07,
"loss": 0.493,
"step": 327
},
{
"epoch": 0.6342760454435581,
"grad_norm": 2.997209310531616,
"learning_rate": 6.29660981510697e-07,
"loss": 0.504,
"step": 328
},
{
"epoch": 0.6362098138747885,
"grad_norm": 2.9243879318237305,
"learning_rate": 6.238435203982277e-07,
"loss": 0.517,
"step": 329
},
{
"epoch": 0.6381435823060189,
"grad_norm": 2.822188377380371,
"learning_rate": 6.180408500773557e-07,
"loss": 0.4736,
"step": 330
},
{
"epoch": 0.6400773507372493,
"grad_norm": 2.937476873397827,
"learning_rate": 6.122531987139954e-07,
"loss": 0.4467,
"step": 331
},
{
"epoch": 0.6420111191684795,
"grad_norm": 2.9560484886169434,
"learning_rate": 6.064807938835046e-07,
"loss": 0.5076,
"step": 332
},
{
"epoch": 0.6439448875997099,
"grad_norm": 2.906756639480591,
"learning_rate": 6.007238625617333e-07,
"loss": 0.5161,
"step": 333
},
{
"epoch": 0.6458786560309403,
"grad_norm": 2.875453233718872,
"learning_rate": 5.949826311161006e-07,
"loss": 0.5277,
"step": 334
},
{
"epoch": 0.6478124244621707,
"grad_norm": 3.082592248916626,
"learning_rate": 5.892573252966926e-07,
"loss": 0.4835,
"step": 335
},
{
"epoch": 0.649746192893401,
"grad_norm": 2.8646011352539062,
"learning_rate": 5.835481702273878e-07,
"loss": 0.5205,
"step": 336
},
{
"epoch": 0.6516799613246314,
"grad_norm": 2.8505988121032715,
"learning_rate": 5.778553903970009e-07,
"loss": 0.5463,
"step": 337
},
{
"epoch": 0.6536137297558617,
"grad_norm": 2.866309642791748,
"learning_rate": 5.72179209650461e-07,
"loss": 0.5178,
"step": 338
},
{
"epoch": 0.6555474981870921,
"grad_norm": 2.6980485916137695,
"learning_rate": 5.665198511800063e-07,
"loss": 0.4907,
"step": 339
},
{
"epoch": 0.6574812666183224,
"grad_norm": 3.0001935958862305,
"learning_rate": 5.608775375164072e-07,
"loss": 0.5227,
"step": 340
},
{
"epoch": 0.6594150350495528,
"grad_norm": 2.840331554412842,
"learning_rate": 5.552524905202207e-07,
"loss": 0.4867,
"step": 341
},
{
"epoch": 0.6613488034807832,
"grad_norm": 2.9088988304138184,
"learning_rate": 5.496449313730608e-07,
"loss": 0.5486,
"step": 342
},
{
"epoch": 0.6632825719120136,
"grad_norm": 2.773866653442383,
"learning_rate": 5.440550805689075e-07,
"loss": 0.5258,
"step": 343
},
{
"epoch": 0.665216340343244,
"grad_norm": 3.12353777885437,
"learning_rate": 5.384831579054312e-07,
"loss": 0.5713,
"step": 344
},
{
"epoch": 0.6671501087744742,
"grad_norm": 2.9296865463256836,
"learning_rate": 5.329293824753538e-07,
"loss": 0.5194,
"step": 345
},
{
"epoch": 0.6690838772057046,
"grad_norm": 3.0712759494781494,
"learning_rate": 5.273939726578326e-07,
"loss": 0.553,
"step": 346
},
{
"epoch": 0.671017645636935,
"grad_norm": 3.2009084224700928,
"learning_rate": 5.218771461098732e-07,
"loss": 0.5197,
"step": 347
},
{
"epoch": 0.6729514140681654,
"grad_norm": 2.914090394973755,
"learning_rate": 5.163791197577713e-07,
"loss": 0.5196,
"step": 348
},
{
"epoch": 0.6748851824993957,
"grad_norm": 2.893239974975586,
"learning_rate": 5.109001097885837e-07,
"loss": 0.5387,
"step": 349
},
{
"epoch": 0.676818950930626,
"grad_norm": 2.7517759799957275,
"learning_rate": 5.054403316416247e-07,
"loss": 0.4814,
"step": 350
},
{
"epoch": 0.6787527193618564,
"grad_norm": 2.873094081878662,
"learning_rate": 5.000000000000002e-07,
"loss": 0.5122,
"step": 351
},
{
"epoch": 0.6806864877930868,
"grad_norm": 2.7496497631073,
"learning_rate": 4.945793287821604e-07,
"loss": 0.5086,
"step": 352
},
{
"epoch": 0.6826202562243171,
"grad_norm": 2.7671315670013428,
"learning_rate": 4.891785311334922e-07,
"loss": 0.4881,
"step": 353
},
{
"epoch": 0.6845540246555475,
"grad_norm": 2.9487366676330566,
"learning_rate": 4.837978194179369e-07,
"loss": 0.5329,
"step": 354
},
{
"epoch": 0.6864877930867779,
"grad_norm": 2.971665143966675,
"learning_rate": 4.784374052096396e-07,
"loss": 0.5509,
"step": 355
},
{
"epoch": 0.6884215615180083,
"grad_norm": 3.0846035480499268,
"learning_rate": 4.730974992846303e-07,
"loss": 0.4538,
"step": 356
},
{
"epoch": 0.6903553299492385,
"grad_norm": 2.890350103378296,
"learning_rate": 4.677783116125361e-07,
"loss": 0.5101,
"step": 357
},
{
"epoch": 0.6922890983804689,
"grad_norm": 2.8605659008026123,
"learning_rate": 4.624800513483239e-07,
"loss": 0.5224,
"step": 358
},
{
"epoch": 0.6942228668116993,
"grad_norm": 2.946995735168457,
"learning_rate": 4.572029268240787e-07,
"loss": 0.505,
"step": 359
},
{
"epoch": 0.6961566352429297,
"grad_norm": 2.8882555961608887,
"learning_rate": 4.519471455408098e-07,
"loss": 0.5041,
"step": 360
},
{
"epoch": 0.69809040367416,
"grad_norm": 2.8211588859558105,
"learning_rate": 4.4671291416029055e-07,
"loss": 0.5471,
"step": 361
},
{
"epoch": 0.7000241721053904,
"grad_norm": 3.0928735733032227,
"learning_rate": 4.4150043849693695e-07,
"loss": 0.4627,
"step": 362
},
{
"epoch": 0.7019579405366208,
"grad_norm": 2.9646997451782227,
"learning_rate": 4.3630992350970865e-07,
"loss": 0.5327,
"step": 363
},
{
"epoch": 0.7038917089678511,
"grad_norm": 3.0351743698120117,
"learning_rate": 4.3114157329405487e-07,
"loss": 0.5292,
"step": 364
},
{
"epoch": 0.7058254773990814,
"grad_norm": 2.928863525390625,
"learning_rate": 4.259955910738864e-07,
"loss": 0.5245,
"step": 365
},
{
"epoch": 0.7077592458303118,
"grad_norm": 2.761915445327759,
"learning_rate": 4.20872179193586e-07,
"loss": 0.4653,
"step": 366
},
{
"epoch": 0.7096930142615422,
"grad_norm": 2.9401397705078125,
"learning_rate": 4.157715391100509e-07,
"loss": 0.518,
"step": 367
},
{
"epoch": 0.7116267826927726,
"grad_norm": 2.8561716079711914,
"learning_rate": 4.1069387138477297e-07,
"loss": 0.4474,
"step": 368
},
{
"epoch": 0.7135605511240029,
"grad_norm": 3.0250120162963867,
"learning_rate": 4.0563937567594974e-07,
"loss": 0.4947,
"step": 369
},
{
"epoch": 0.7154943195552332,
"grad_norm": 3.0572850704193115,
"learning_rate": 4.0060825073063785e-07,
"loss": 0.4763,
"step": 370
},
{
"epoch": 0.7174280879864636,
"grad_norm": 3.2508955001831055,
"learning_rate": 3.95600694376933e-07,
"loss": 0.5261,
"step": 371
},
{
"epoch": 0.719361856417694,
"grad_norm": 2.9887571334838867,
"learning_rate": 3.906169035161967e-07,
"loss": 0.5028,
"step": 372
},
{
"epoch": 0.7212956248489243,
"grad_norm": 3.0212152004241943,
"learning_rate": 3.8565707411530866e-07,
"loss": 0.5344,
"step": 373
},
{
"epoch": 0.7232293932801547,
"grad_norm": 2.8374485969543457,
"learning_rate": 3.80721401198965e-07,
"loss": 0.457,
"step": 374
},
{
"epoch": 0.7251631617113851,
"grad_norm": 2.9122722148895264,
"learning_rate": 3.7581007884200824e-07,
"loss": 0.496,
"step": 375
},
{
"epoch": 0.7270969301426154,
"grad_norm": 3.086984157562256,
"learning_rate": 3.7092330016179605e-07,
"loss": 0.5244,
"step": 376
},
{
"epoch": 0.7290306985738458,
"grad_norm": 2.797271490097046,
"learning_rate": 3.660612573106081e-07,
"loss": 0.4423,
"step": 377
},
{
"epoch": 0.7309644670050761,
"grad_norm": 3.072042942047119,
"learning_rate": 3.612241414680901e-07,
"loss": 0.5059,
"step": 378
},
{
"epoch": 0.7328982354363065,
"grad_norm": 2.745339870452881,
"learning_rate": 3.564121428337369e-07,
"loss": 0.4813,
"step": 379
},
{
"epoch": 0.7348320038675369,
"grad_norm": 3.1032216548919678,
"learning_rate": 3.5162545061941327e-07,
"loss": 0.5213,
"step": 380
},
{
"epoch": 0.7367657722987673,
"grad_norm": 2.9812653064727783,
"learning_rate": 3.4686425304191436e-07,
"loss": 0.4848,
"step": 381
},
{
"epoch": 0.7386995407299976,
"grad_norm": 3.1721439361572266,
"learning_rate": 3.421287373155636e-07,
"loss": 0.4748,
"step": 382
},
{
"epoch": 0.7406333091612279,
"grad_norm": 2.7179818153381348,
"learning_rate": 3.374190896448541e-07,
"loss": 0.4638,
"step": 383
},
{
"epoch": 0.7425670775924583,
"grad_norm": 2.983214855194092,
"learning_rate": 3.327354952171232e-07,
"loss": 0.4854,
"step": 384
},
{
"epoch": 0.7445008460236887,
"grad_norm": 2.9129269123077393,
"learning_rate": 3.2807813819527374e-07,
"loss": 0.5304,
"step": 385
},
{
"epoch": 0.746434614454919,
"grad_norm": 2.8850767612457275,
"learning_rate": 3.234472017105313e-07,
"loss": 0.5264,
"step": 386
},
{
"epoch": 0.7483683828861494,
"grad_norm": 2.805777072906494,
"learning_rate": 3.188428678552435e-07,
"loss": 0.5166,
"step": 387
},
{
"epoch": 0.7503021513173798,
"grad_norm": 2.8849081993103027,
"learning_rate": 3.1426531767572e-07,
"loss": 0.4284,
"step": 388
},
{
"epoch": 0.7522359197486101,
"grad_norm": 3.0169715881347656,
"learning_rate": 3.0971473116511393e-07,
"loss": 0.522,
"step": 389
},
{
"epoch": 0.7541696881798404,
"grad_norm": 3.2479677200317383,
"learning_rate": 3.0519128725634293e-07,
"loss": 0.5187,
"step": 390
},
{
"epoch": 0.7561034566110708,
"grad_norm": 3.006490468978882,
"learning_rate": 3.006951638150567e-07,
"loss": 0.5139,
"step": 391
},
{
"epoch": 0.7580372250423012,
"grad_norm": 3.2848265171051025,
"learning_rate": 2.9622653763263873e-07,
"loss": 0.5388,
"step": 392
},
{
"epoch": 0.7599709934735316,
"grad_norm": 3.011970281600952,
"learning_rate": 2.917855844192584e-07,
"loss": 0.5163,
"step": 393
},
{
"epoch": 0.7619047619047619,
"grad_norm": 3.037081480026245,
"learning_rate": 2.8737247879696e-07,
"loss": 0.5292,
"step": 394
},
{
"epoch": 0.7638385303359922,
"grad_norm": 3.168532133102417,
"learning_rate": 2.8298739429279705e-07,
"loss": 0.5267,
"step": 395
},
{
"epoch": 0.7657722987672226,
"grad_norm": 3.2073962688446045,
"learning_rate": 2.786305033320089e-07,
"loss": 0.5159,
"step": 396
},
{
"epoch": 0.767706067198453,
"grad_norm": 3.0780181884765625,
"learning_rate": 2.7430197723124115e-07,
"loss": 0.5512,
"step": 397
},
{
"epoch": 0.7696398356296833,
"grad_norm": 2.9060723781585693,
"learning_rate": 2.700019861918079e-07,
"loss": 0.5404,
"step": 398
},
{
"epoch": 0.7715736040609137,
"grad_norm": 2.9640395641326904,
"learning_rate": 2.657306992930024e-07,
"loss": 0.496,
"step": 399
},
{
"epoch": 0.7735073724921441,
"grad_norm": 3.119464159011841,
"learning_rate": 2.6148828448544566e-07,
"loss": 0.5233,
"step": 400
},
{
"epoch": 0.7754411409233745,
"grad_norm": 3.170423746109009,
"learning_rate": 2.5727490858448285e-07,
"loss": 0.5412,
"step": 401
},
{
"epoch": 0.7773749093546047,
"grad_norm": 3.0757274627685547,
"learning_rate": 2.530907372636265e-07,
"loss": 0.4974,
"step": 402
},
{
"epoch": 0.7793086777858351,
"grad_norm": 3.033518075942993,
"learning_rate": 2.4893593504803823e-07,
"loss": 0.5243,
"step": 403
},
{
"epoch": 0.7812424462170655,
"grad_norm": 2.8962435722351074,
"learning_rate": 2.4481066530806393e-07,
"loss": 0.4652,
"step": 404
},
{
"epoch": 0.7831762146482959,
"grad_norm": 2.9113543033599854,
"learning_rate": 2.407150902528052e-07,
"loss": 0.4438,
"step": 405
},
{
"epoch": 0.7851099830795262,
"grad_norm": 3.034170627593994,
"learning_rate": 2.3664937092374492e-07,
"loss": 0.5339,
"step": 406
},
{
"epoch": 0.7870437515107566,
"grad_norm": 2.8349618911743164,
"learning_rate": 2.3261366718841303e-07,
"loss": 0.5154,
"step": 407
},
{
"epoch": 0.788977519941987,
"grad_norm": 2.9790327548980713,
"learning_rate": 2.2860813773410103e-07,
"loss": 0.5295,
"step": 408
},
{
"epoch": 0.7909112883732173,
"grad_norm": 2.9119043350219727,
"learning_rate": 2.2463294006162182e-07,
"loss": 0.5236,
"step": 409
},
{
"epoch": 0.7928450568044476,
"grad_norm": 3.071845531463623,
"learning_rate": 2.2068823047911755e-07,
"loss": 0.4924,
"step": 410
},
{
"epoch": 0.794778825235678,
"grad_norm": 2.9019951820373535,
"learning_rate": 2.1677416409591165e-07,
"loss": 0.5215,
"step": 411
},
{
"epoch": 0.7967125936669084,
"grad_norm": 2.825124740600586,
"learning_rate": 2.1289089481641277e-07,
"loss": 0.4671,
"step": 412
},
{
"epoch": 0.7986463620981388,
"grad_norm": 2.964590311050415,
"learning_rate": 2.0903857533405954e-07,
"loss": 0.5312,
"step": 413
},
{
"epoch": 0.8005801305293692,
"grad_norm": 3.303433656692505,
"learning_rate": 2.052173571253193e-07,
"loss": 0.5184,
"step": 414
},
{
"epoch": 0.8025138989605994,
"grad_norm": 3.0383520126342773,
"learning_rate": 2.0142739044373092e-07,
"loss": 0.509,
"step": 415
},
{
"epoch": 0.8044476673918298,
"grad_norm": 2.9449782371520996,
"learning_rate": 1.9766882431399646e-07,
"loss": 0.507,
"step": 416
},
{
"epoch": 0.8063814358230602,
"grad_norm": 2.8558340072631836,
"learning_rate": 1.939418065261219e-07,
"loss": 0.4788,
"step": 417
},
{
"epoch": 0.8083152042542906,
"grad_norm": 2.813002109527588,
"learning_rate": 1.9024648362960539e-07,
"loss": 0.5176,
"step": 418
},
{
"epoch": 0.8102489726855209,
"grad_norm": 3.0548245906829834,
"learning_rate": 1.8658300092767543e-07,
"loss": 0.5363,
"step": 419
},
{
"epoch": 0.8121827411167513,
"grad_norm": 2.9029524326324463,
"learning_rate": 1.8295150247157686e-07,
"loss": 0.5361,
"step": 420
},
{
"epoch": 0.8141165095479816,
"grad_norm": 3.0287888050079346,
"learning_rate": 1.7935213105490722e-07,
"loss": 0.5176,
"step": 421
},
{
"epoch": 0.816050277979212,
"grad_norm": 2.9426074028015137,
"learning_rate": 1.7578502820800046e-07,
"loss": 0.4806,
"step": 422
},
{
"epoch": 0.8179840464104423,
"grad_norm": 3.0299363136291504,
"learning_rate": 1.72250334192365e-07,
"loss": 0.4972,
"step": 423
},
{
"epoch": 0.8199178148416727,
"grad_norm": 2.845245838165283,
"learning_rate": 1.6874818799516465e-07,
"loss": 0.5033,
"step": 424
},
{
"epoch": 0.8218515832729031,
"grad_norm": 2.8781020641326904,
"learning_rate": 1.6527872732375648e-07,
"loss": 0.508,
"step": 425
},
{
"epoch": 0.8237853517041335,
"grad_norm": 2.969726324081421,
"learning_rate": 1.618420886002747e-07,
"loss": 0.5408,
"step": 426
},
{
"epoch": 0.8257191201353637,
"grad_norm": 2.890504837036133,
"learning_rate": 1.5843840695626666e-07,
"loss": 0.5224,
"step": 427
},
{
"epoch": 0.8276528885665941,
"grad_norm": 2.945122241973877,
"learning_rate": 1.5506781622737942e-07,
"loss": 0.4949,
"step": 428
},
{
"epoch": 0.8295866569978245,
"grad_norm": 2.883887767791748,
"learning_rate": 1.5173044894809762e-07,
"loss": 0.492,
"step": 429
},
{
"epoch": 0.8315204254290549,
"grad_norm": 2.9001541137695312,
"learning_rate": 1.4842643634653052e-07,
"loss": 0.5115,
"step": 430
},
{
"epoch": 0.8334541938602852,
"grad_norm": 2.7559828758239746,
"learning_rate": 1.4515590833925506e-07,
"loss": 0.4998,
"step": 431
},
{
"epoch": 0.8353879622915156,
"grad_norm": 2.8526365756988525,
"learning_rate": 1.419189935262034e-07,
"loss": 0.5091,
"step": 432
},
{
"epoch": 0.837321730722746,
"grad_norm": 2.8609507083892822,
"learning_rate": 1.3871581918561048e-07,
"loss": 0.4773,
"step": 433
},
{
"epoch": 0.8392554991539763,
"grad_norm": 3.009993314743042,
"learning_rate": 1.3554651126900564e-07,
"loss": 0.4892,
"step": 434
},
{
"epoch": 0.8411892675852066,
"grad_norm": 3.0874485969543457,
"learning_rate": 1.3241119439626258e-07,
"loss": 0.4492,
"step": 435
},
{
"epoch": 0.843123036016437,
"grad_norm": 2.902095079421997,
"learning_rate": 1.29309991850698e-07,
"loss": 0.4865,
"step": 436
},
{
"epoch": 0.8450568044476674,
"grad_norm": 2.789257764816284,
"learning_rate": 1.2624302557422472e-07,
"loss": 0.5034,
"step": 437
},
{
"epoch": 0.8469905728788978,
"grad_norm": 2.9948341846466064,
"learning_rate": 1.232104161625561e-07,
"loss": 0.5115,
"step": 438
},
{
"epoch": 0.8489243413101281,
"grad_norm": 3.0060694217681885,
"learning_rate": 1.202122828604646e-07,
"loss": 0.4999,
"step": 439
},
{
"epoch": 0.8508581097413584,
"grad_norm": 3.115152597427368,
"learning_rate": 1.1724874355709258e-07,
"loss": 0.4844,
"step": 440
},
{
"epoch": 0.8527918781725888,
"grad_norm": 2.870953321456909,
"learning_rate": 1.1431991478131752e-07,
"loss": 0.5252,
"step": 441
},
{
"epoch": 0.8547256466038192,
"grad_norm": 2.8833727836608887,
"learning_rate": 1.114259116971693e-07,
"loss": 0.481,
"step": 442
},
{
"epoch": 0.8566594150350495,
"grad_norm": 3.122464179992676,
"learning_rate": 1.085668480993015e-07,
"loss": 0.5057,
"step": 443
},
{
"epoch": 0.8585931834662799,
"grad_norm": 2.969245672225952,
"learning_rate": 1.0574283640851889e-07,
"loss": 0.5017,
"step": 444
},
{
"epoch": 0.8605269518975103,
"grad_norm": 2.9073445796966553,
"learning_rate": 1.0295398766735409e-07,
"loss": 0.525,
"step": 445
},
{
"epoch": 0.8624607203287407,
"grad_norm": 2.967390775680542,
"learning_rate": 1.0020041153570347e-07,
"loss": 0.5246,
"step": 446
},
{
"epoch": 0.864394488759971,
"grad_norm": 2.8257503509521484,
"learning_rate": 9.748221628651443e-08,
"loss": 0.4992,
"step": 447
},
{
"epoch": 0.8663282571912013,
"grad_norm": 2.900615930557251,
"learning_rate": 9.479950880152809e-08,
"loss": 0.5078,
"step": 448
},
{
"epoch": 0.8682620256224317,
"grad_norm": 2.8263025283813477,
"learning_rate": 9.215239456707636e-08,
"loss": 0.4673,
"step": 449
},
{
"epoch": 0.8701957940536621,
"grad_norm": 2.6599619388580322,
"learning_rate": 8.954097766993496e-08,
"loss": 0.4837,
"step": 450
},
{
"epoch": 0.8721295624848925,
"grad_norm": 2.919358491897583,
"learning_rate": 8.696536079322902e-08,
"loss": 0.4759,
"step": 451
},
{
"epoch": 0.8740633309161228,
"grad_norm": 3.0110116004943848,
"learning_rate": 8.442564521239781e-08,
"loss": 0.509,
"step": 452
},
{
"epoch": 0.8759970993473531,
"grad_norm": 2.8107213973999023,
"learning_rate": 8.192193079121002e-08,
"loss": 0.4791,
"step": 453
},
{
"epoch": 0.8779308677785835,
"grad_norm": 2.792571783065796,
"learning_rate": 7.945431597783902e-08,
"loss": 0.4883,
"step": 454
},
{
"epoch": 0.8798646362098139,
"grad_norm": 3.048387289047241,
"learning_rate": 7.70228978009907e-08,
"loss": 0.5173,
"step": 455
},
{
"epoch": 0.8817984046410442,
"grad_norm": 2.8160572052001953,
"learning_rate": 7.462777186608849e-08,
"loss": 0.4777,
"step": 456
},
{
"epoch": 0.8837321730722746,
"grad_norm": 2.8880162239074707,
"learning_rate": 7.226903235151438e-08,
"loss": 0.4429,
"step": 457
},
{
"epoch": 0.885665941503505,
"grad_norm": 3.1718862056732178,
"learning_rate": 6.994677200490507e-08,
"loss": 0.5493,
"step": 458
},
{
"epoch": 0.8875997099347354,
"grad_norm": 3.0768024921417236,
"learning_rate": 6.766108213950583e-08,
"loss": 0.5329,
"step": 459
},
{
"epoch": 0.8895334783659656,
"grad_norm": 3.097865581512451,
"learning_rate": 6.541205263057936e-08,
"loss": 0.5059,
"step": 460
},
{
"epoch": 0.891467246797196,
"grad_norm": 2.9065606594085693,
"learning_rate": 6.319977191187231e-08,
"loss": 0.5083,
"step": 461
},
{
"epoch": 0.8934010152284264,
"grad_norm": 2.651416540145874,
"learning_rate": 6.102432697213733e-08,
"loss": 0.4076,
"step": 462
},
{
"epoch": 0.8953347836596568,
"grad_norm": 3.1337976455688477,
"learning_rate": 5.888580335171367e-08,
"loss": 0.5136,
"step": 463
},
{
"epoch": 0.8972685520908871,
"grad_norm": 2.9478445053100586,
"learning_rate": 5.6784285139162116e-08,
"loss": 0.5605,
"step": 464
},
{
"epoch": 0.8992023205221175,
"grad_norm": 2.8331518173217773,
"learning_rate": 5.47198549679605e-08,
"loss": 0.4823,
"step": 465
},
{
"epoch": 0.9011360889533478,
"grad_norm": 2.9675228595733643,
"learning_rate": 5.269259401325254e-08,
"loss": 0.5216,
"step": 466
},
{
"epoch": 0.9030698573845782,
"grad_norm": 3.129606246948242,
"learning_rate": 5.07025819886574e-08,
"loss": 0.5425,
"step": 467
},
{
"epoch": 0.9050036258158085,
"grad_norm": 2.961496353149414,
"learning_rate": 4.8749897143134486e-08,
"loss": 0.5506,
"step": 468
},
{
"epoch": 0.9069373942470389,
"grad_norm": 2.621377944946289,
"learning_rate": 4.6834616257906966e-08,
"loss": 0.5214,
"step": 469
},
{
"epoch": 0.9088711626782693,
"grad_norm": 2.75648832321167,
"learning_rate": 4.495681464344259e-08,
"loss": 0.4796,
"step": 470
},
{
"epoch": 0.9108049311094997,
"grad_norm": 2.8078525066375732,
"learning_rate": 4.3116566136492506e-08,
"loss": 0.4867,
"step": 471
},
{
"epoch": 0.91273869954073,
"grad_norm": 2.8183205127716064,
"learning_rate": 4.1313943097187495e-08,
"loss": 0.4663,
"step": 472
},
{
"epoch": 0.9146724679719603,
"grad_norm": 2.8927807807922363,
"learning_rate": 3.954901640619368e-08,
"loss": 0.4743,
"step": 473
},
{
"epoch": 0.9166062364031907,
"grad_norm": 2.9635002613067627,
"learning_rate": 3.782185546192407e-08,
"loss": 0.4839,
"step": 474
},
{
"epoch": 0.9185400048344211,
"grad_norm": 2.9816319942474365,
"learning_rate": 3.6132528177811094e-08,
"loss": 0.5449,
"step": 475
},
{
"epoch": 0.9204737732656514,
"grad_norm": 3.050773859024048,
"learning_rate": 3.44811009796353e-08,
"loss": 0.4857,
"step": 476
},
{
"epoch": 0.9224075416968818,
"grad_norm": 2.933197259902954,
"learning_rate": 3.286763880291399e-08,
"loss": 0.4848,
"step": 477
},
{
"epoch": 0.9243413101281122,
"grad_norm": 2.923037528991699,
"learning_rate": 3.129220509034724e-08,
"loss": 0.5111,
"step": 478
},
{
"epoch": 0.9262750785593425,
"grad_norm": 2.874183177947998,
"learning_rate": 2.975486178932407e-08,
"loss": 0.4919,
"step": 479
},
{
"epoch": 0.9282088469905729,
"grad_norm": 2.9910435676574707,
"learning_rate": 2.8255669349485978e-08,
"loss": 0.4473,
"step": 480
},
{
"epoch": 0.9301426154218032,
"grad_norm": 2.9716756343841553,
"learning_rate": 2.679468672035057e-08,
"loss": 0.4508,
"step": 481
},
{
"epoch": 0.9320763838530336,
"grad_norm": 2.8846945762634277,
"learning_rate": 2.5371971348992938e-08,
"loss": 0.463,
"step": 482
},
{
"epoch": 0.934010152284264,
"grad_norm": 2.8598673343658447,
"learning_rate": 2.3987579177787263e-08,
"loss": 0.5179,
"step": 483
},
{
"epoch": 0.9359439207154944,
"grad_norm": 2.910398006439209,
"learning_rate": 2.2641564642207346e-08,
"loss": 0.5172,
"step": 484
},
{
"epoch": 0.9378776891467246,
"grad_norm": 3.295294761657715,
"learning_rate": 2.1333980668685413e-08,
"loss": 0.5162,
"step": 485
},
{
"epoch": 0.939811457577955,
"grad_norm": 3.001046895980835,
"learning_rate": 2.0064878672531483e-08,
"loss": 0.5116,
"step": 486
},
{
"epoch": 0.9417452260091854,
"grad_norm": 2.902860641479492,
"learning_rate": 1.8834308555911728e-08,
"loss": 0.5198,
"step": 487
},
{
"epoch": 0.9436789944404158,
"grad_norm": 2.8829078674316406,
"learning_rate": 1.7642318705886284e-08,
"loss": 0.5099,
"step": 488
},
{
"epoch": 0.9456127628716461,
"grad_norm": 2.825139045715332,
"learning_rate": 1.6488955992506216e-08,
"loss": 0.5034,
"step": 489
},
{
"epoch": 0.9475465313028765,
"grad_norm": 2.812983274459839,
"learning_rate": 1.5374265766971096e-08,
"loss": 0.4804,
"step": 490
},
{
"epoch": 0.9494802997341069,
"grad_norm": 3.0010793209075928,
"learning_rate": 1.4298291859845212e-08,
"loss": 0.4755,
"step": 491
},
{
"epoch": 0.9514140681653372,
"grad_norm": 2.9898478984832764,
"learning_rate": 1.3261076579334729e-08,
"loss": 0.5148,
"step": 492
},
{
"epoch": 0.9533478365965675,
"grad_norm": 2.897951364517212,
"learning_rate": 1.2262660709623229e-08,
"loss": 0.5036,
"step": 493
},
{
"epoch": 0.9552816050277979,
"grad_norm": 2.879439115524292,
"learning_rate": 1.1303083509269451e-08,
"loss": 0.5003,
"step": 494
},
{
"epoch": 0.9572153734590283,
"grad_norm": 3.1193387508392334,
"learning_rate": 1.0382382709661853e-08,
"loss": 0.4836,
"step": 495
},
{
"epoch": 0.9591491418902587,
"grad_norm": 2.995630979537964,
"learning_rate": 9.500594513536352e-09,
"loss": 0.5322,
"step": 496
},
{
"epoch": 0.961082910321489,
"grad_norm": 2.8320207595825195,
"learning_rate": 8.657753593552142e-09,
"loss": 0.5023,
"step": 497
},
{
"epoch": 0.9630166787527193,
"grad_norm": 2.8688549995422363,
"learning_rate": 7.853893090928654e-09,
"loss": 0.5112,
"step": 498
},
{
"epoch": 0.9649504471839497,
"grad_norm": 2.9314334392547607,
"learning_rate": 7.0890446141421704e-09,
"loss": 0.4959,
"step": 499
},
{
"epoch": 0.9668842156151801,
"grad_norm": 3.1379406452178955,
"learning_rate": 6.363238237683033e-09,
"loss": 0.5223,
"step": 500
},
{
"epoch": 0.9688179840464104,
"grad_norm": 3.0119411945343018,
"learning_rate": 5.676502500873037e-09,
"loss": 0.4634,
"step": 501
},
{
"epoch": 0.9707517524776408,
"grad_norm": 2.8588473796844482,
"learning_rate": 5.028864406743549e-09,
"loss": 0.5054,
"step": 502
},
{
"epoch": 0.9726855209088712,
"grad_norm": 3.05617618560791,
"learning_rate": 4.420349420973357e-09,
"loss": 0.56,
"step": 503
},
{
"epoch": 0.9746192893401016,
"grad_norm": 2.8269994258880615,
"learning_rate": 3.85098147088736e-09,
"loss": 0.4795,
"step": 504
},
{
"epoch": 0.9765530577713318,
"grad_norm": 3.081051826477051,
"learning_rate": 3.3207829445159872e-09,
"loss": 0.532,
"step": 505
},
{
"epoch": 0.9784868262025622,
"grad_norm": 2.9081430435180664,
"learning_rate": 2.8297746897146813e-09,
"loss": 0.4936,
"step": 506
},
{
"epoch": 0.9804205946337926,
"grad_norm": 2.7562308311462402,
"learning_rate": 2.3779760133441075e-09,
"loss": 0.4376,
"step": 507
},
{
"epoch": 0.982354363065023,
"grad_norm": 3.053764820098877,
"learning_rate": 1.9654046805112067e-09,
"loss": 0.5243,
"step": 508
},
{
"epoch": 0.9842881314962533,
"grad_norm": 3.0876071453094482,
"learning_rate": 1.5920769138706436e-09,
"loss": 0.4791,
"step": 509
},
{
"epoch": 0.9862218999274837,
"grad_norm": 2.8327271938323975,
"learning_rate": 1.2580073929863156e-09,
"loss": 0.4757,
"step": 510
},
{
"epoch": 0.988155668358714,
"grad_norm": 2.7686331272125244,
"learning_rate": 9.632092537551483e-10,
"loss": 0.5029,
"step": 511
},
{
"epoch": 0.9900894367899444,
"grad_norm": 2.8857767581939697,
"learning_rate": 7.076940878896209e-10,
"loss": 0.4867,
"step": 512
},
{
"epoch": 0.9920232052211748,
"grad_norm": 2.8375306129455566,
"learning_rate": 4.914719424629066e-10,
"loss": 0.5285,
"step": 513
},
{
"epoch": 0.9939569736524051,
"grad_norm": 2.8377671241760254,
"learning_rate": 3.1455131951285685e-10,
"loss": 0.5036,
"step": 514
},
{
"epoch": 0.9958907420836355,
"grad_norm": 3.5421719551086426,
"learning_rate": 1.7693917570837936e-10,
"loss": 0.5094,
"step": 515
},
{
"epoch": 0.9978245105148659,
"grad_norm": 3.072711229324341,
"learning_rate": 7.864092207554573e-11,
"loss": 0.5009,
"step": 516
},
{
"epoch": 0.9997582789460963,
"grad_norm": 2.859100341796875,
"learning_rate": 1.9660423784984005e-11,
"loss": 0.5103,
"step": 517
},
{
"epoch": 0.9997582789460963,
"step": 517,
"total_flos": 1.8974566875817574e+18,
"train_loss": 0.5522731098722904,
"train_runtime": 18088.4019,
"train_samples_per_second": 1.83,
"train_steps_per_second": 0.029
}
],
"logging_steps": 1.0,
"max_steps": 517,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8974566875817574e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}