{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018867924528301886, "grad_norm": 15.087599797333977, "learning_rate": 0.0, "loss": 1.3308970928192139, "step": 1 }, { "epoch": 0.03773584905660377, "grad_norm": 13.905749218469701, "learning_rate": 1e-07, "loss": 1.285962700843811, "step": 2 }, { "epoch": 0.05660377358490566, "grad_norm": 12.056842971309038, "learning_rate": 2e-07, "loss": 1.3014495372772217, "step": 3 }, { "epoch": 0.07547169811320754, "grad_norm": 13.244790508016711, "learning_rate": 3e-07, "loss": 1.306698203086853, "step": 4 }, { "epoch": 0.09433962264150944, "grad_norm": 13.343404572220207, "learning_rate": 4e-07, "loss": 1.316192388534546, "step": 5 }, { "epoch": 0.11320754716981132, "grad_norm": 11.45228080585811, "learning_rate": 5e-07, "loss": 1.3045374155044556, "step": 6 }, { "epoch": 0.1320754716981132, "grad_norm": 14.318691524925551, "learning_rate": 6e-07, "loss": 1.3311705589294434, "step": 7 }, { "epoch": 0.1509433962264151, "grad_norm": 12.050953431316902, "learning_rate": 7e-07, "loss": 1.2908077239990234, "step": 8 }, { "epoch": 0.16981132075471697, "grad_norm": 10.724349690276135, "learning_rate": 8e-07, "loss": 1.3058435916900635, "step": 9 }, { "epoch": 0.18867924528301888, "grad_norm": 9.473154789049158, "learning_rate": 9e-07, "loss": 1.2856130599975586, "step": 10 }, { "epoch": 0.20754716981132076, "grad_norm": 6.5491980764342275, "learning_rate": 1e-06, "loss": 1.2199636697769165, "step": 11 }, { "epoch": 0.22641509433962265, "grad_norm": 6.462810113541478, "learning_rate": 9.99888864929809e-07, "loss": 1.1673463582992554, "step": 12 }, { "epoch": 0.24528301886792453, "grad_norm": 6.9296822460672045, "learning_rate": 9.995555091232516e-07, "loss": 1.1699671745300293, "step": 13 }, { "epoch": 0.2641509433962264, "grad_norm": 6.0515568106146596, "learning_rate": 9.990000807704114e-07, "loss": 1.1814613342285156, "step": 14 }, { "epoch": 0.2830188679245283, "grad_norm": 4.743020637878028, "learning_rate": 9.982228267815643e-07, "loss": 1.0652694702148438, "step": 15 }, { "epoch": 0.3018867924528302, "grad_norm": 4.526630266791274, "learning_rate": 9.972240926774166e-07, "loss": 1.0635337829589844, "step": 16 }, { "epoch": 0.32075471698113206, "grad_norm": 4.609514753545406, "learning_rate": 9.96004322435508e-07, "loss": 1.0902111530303955, "step": 17 }, { "epoch": 0.33962264150943394, "grad_norm": 4.34097054100359, "learning_rate": 9.945640582928437e-07, "loss": 1.06702721118927, "step": 18 }, { "epoch": 0.3584905660377358, "grad_norm": 3.864434007517437, "learning_rate": 9.9290394050485e-07, "loss": 1.0476477146148682, "step": 19 }, { "epoch": 0.37735849056603776, "grad_norm": 3.8969527857656656, "learning_rate": 9.91024707060755e-07, "loss": 1.0617330074310303, "step": 20 }, { "epoch": 0.39622641509433965, "grad_norm": 3.911924948199517, "learning_rate": 9.889271933555212e-07, "loss": 1.07832932472229, "step": 21 }, { "epoch": 0.41509433962264153, "grad_norm": 3.833567236811609, "learning_rate": 9.8661233181848e-07, "loss": 1.0324124097824097, "step": 22 }, { "epoch": 0.4339622641509434, "grad_norm": 3.6955004487569396, "learning_rate": 9.840811514988293e-07, "loss": 0.9815853834152222, "step": 23 }, { "epoch": 0.4528301886792453, "grad_norm": 4.023404125176107, "learning_rate": 9.813347776081788e-07, "loss": 1.0266845226287842, "step": 24 }, { "epoch": 0.4716981132075472, "grad_norm": 3.712461139695743, "learning_rate": 9.78374431020349e-07, "loss": 1.0085935592651367, "step": 25 }, { "epoch": 0.49056603773584906, "grad_norm": 3.7543864084874596, "learning_rate": 9.752014277286431e-07, "loss": 0.9968965649604797, "step": 26 }, { "epoch": 0.5094339622641509, "grad_norm": 3.8046734306564467, "learning_rate": 9.718171782608353e-07, "loss": 0.9803509712219238, "step": 27 }, { "epoch": 0.5283018867924528, "grad_norm": 3.6105782650336433, "learning_rate": 9.682231870521345e-07, "loss": 0.9759021997451782, "step": 28 }, { "epoch": 0.5471698113207547, "grad_norm": 3.3896428780092753, "learning_rate": 9.644210517764013e-07, "loss": 0.9812103509902954, "step": 29 }, { "epoch": 0.5660377358490566, "grad_norm": 3.118079780719029, "learning_rate": 9.60412462635919e-07, "loss": 0.9091012477874756, "step": 30 }, { "epoch": 0.5849056603773585, "grad_norm": 3.3662986364845, "learning_rate": 9.561992016100291e-07, "loss": 0.9503388404846191, "step": 31 }, { "epoch": 0.6037735849056604, "grad_norm": 2.9779547004368196, "learning_rate": 9.517831416629716e-07, "loss": 0.9247981309890747, "step": 32 }, { "epoch": 0.6226415094339622, "grad_norm": 3.468415170701323, "learning_rate": 9.471662459112745e-07, "loss": 0.9473499655723572, "step": 33 }, { "epoch": 0.6415094339622641, "grad_norm": 2.8573918489427688, "learning_rate": 9.423505667510723e-07, "loss": 0.9340516328811646, "step": 34 }, { "epoch": 0.660377358490566, "grad_norm": 2.949529150108781, "learning_rate": 9.373382449457303e-07, "loss": 0.9248940348625183, "step": 35 }, { "epoch": 0.6792452830188679, "grad_norm": 2.9658340262784697, "learning_rate": 9.321315086741915e-07, "loss": 0.9420664310455322, "step": 36 }, { "epoch": 0.6981132075471698, "grad_norm": 3.019712899281778, "learning_rate": 9.267326725404598e-07, "loss": 0.9231287240982056, "step": 37 }, { "epoch": 0.7169811320754716, "grad_norm": 2.827563138085356, "learning_rate": 9.21144136544666e-07, "loss": 0.9293084740638733, "step": 38 }, { "epoch": 0.7358490566037735, "grad_norm": 3.126960585054511, "learning_rate": 9.153683850161705e-07, "loss": 0.9372609853744507, "step": 39 }, { "epoch": 0.7547169811320755, "grad_norm": 2.7757572634358456, "learning_rate": 9.094079855091797e-07, "loss": 0.9204014539718628, "step": 40 }, { "epoch": 0.7735849056603774, "grad_norm": 2.86268897243828, "learning_rate": 9.032655876613635e-07, "loss": 0.9143469333648682, "step": 41 }, { "epoch": 0.7924528301886793, "grad_norm": 2.899411491265449, "learning_rate": 8.96943922015986e-07, "loss": 0.901626467704773, "step": 42 }, { "epoch": 0.8113207547169812, "grad_norm": 3.0296165470958494, "learning_rate": 8.90445798808068e-07, "loss": 0.9193109273910522, "step": 43 }, { "epoch": 0.8301886792452831, "grad_norm": 2.832066082274235, "learning_rate": 8.837741067151249e-07, "loss": 0.9078618288040161, "step": 44 }, { "epoch": 0.8490566037735849, "grad_norm": 2.9792386000035083, "learning_rate": 8.769318115730328e-07, "loss": 0.9032235145568848, "step": 45 }, { "epoch": 0.8679245283018868, "grad_norm": 2.8570785041355373, "learning_rate": 8.699219550575952e-07, "loss": 0.8799638152122498, "step": 46 }, { "epoch": 0.8867924528301887, "grad_norm": 2.8898604537645185, "learning_rate": 8.627476533323956e-07, "loss": 0.9072629809379578, "step": 47 }, { "epoch": 0.9056603773584906, "grad_norm": 2.819489131324746, "learning_rate": 8.554120956635374e-07, "loss": 0.879642128944397, "step": 48 }, { "epoch": 0.9245283018867925, "grad_norm": 2.884576949261456, "learning_rate": 8.479185430018858e-07, "loss": 0.9129672050476074, "step": 49 }, { "epoch": 0.9433962264150944, "grad_norm": 2.8206974490824663, "learning_rate": 8.402703265334454e-07, "loss": 0.9072036147117615, "step": 50 }, { "epoch": 0.9622641509433962, "grad_norm": 2.8666837714043414, "learning_rate": 8.324708461985124e-07, "loss": 0.8936312198638916, "step": 51 }, { "epoch": 0.9811320754716981, "grad_norm": 2.75278105425475, "learning_rate": 8.245235691802643e-07, "loss": 0.886029839515686, "step": 52 }, { "epoch": 1.0, "grad_norm": 2.9063116637756807, "learning_rate": 8.164320283634585e-07, "loss": 0.886949360370636, "step": 53 }, { "epoch": 1.0188679245283019, "grad_norm": 2.8027377644406104, "learning_rate": 8.081998207639212e-07, "loss": 0.8734487891197205, "step": 54 }, { "epoch": 1.0377358490566038, "grad_norm": 2.975237594360833, "learning_rate": 7.998306059295302e-07, "loss": 0.8541756868362427, "step": 55 }, { "epoch": 1.0566037735849056, "grad_norm": 2.7212092257296785, "learning_rate": 7.913281043133977e-07, "loss": 0.855162501335144, "step": 56 }, { "epoch": 1.0754716981132075, "grad_norm": 4.004522306787069, "learning_rate": 7.826960956199794e-07, "loss": 0.8469276428222656, "step": 57 }, { "epoch": 1.0943396226415094, "grad_norm": 2.789521379215554, "learning_rate": 7.739384171248434e-07, "loss": 0.8612252473831177, "step": 58 }, { "epoch": 1.1132075471698113, "grad_norm": 3.0001618191920008, "learning_rate": 7.650589619688468e-07, "loss": 0.8504967093467712, "step": 59 }, { "epoch": 1.1320754716981132, "grad_norm": 2.803340918384437, "learning_rate": 7.560616774274774e-07, "loss": 0.8487892150878906, "step": 60 }, { "epoch": 1.150943396226415, "grad_norm": 2.7872996717171112, "learning_rate": 7.469505631561317e-07, "loss": 0.8430064916610718, "step": 61 }, { "epoch": 1.169811320754717, "grad_norm": 2.767338948376076, "learning_rate": 7.377296694121058e-07, "loss": 0.834577202796936, "step": 62 }, { "epoch": 1.1886792452830188, "grad_norm": 2.7744551402453883, "learning_rate": 7.284030952540936e-07, "loss": 0.8389214277267456, "step": 63 }, { "epoch": 1.2075471698113207, "grad_norm": 2.94391173341089, "learning_rate": 7.189749867199898e-07, "loss": 0.8442764282226562, "step": 64 }, { "epoch": 1.2264150943396226, "grad_norm": 2.9244734720758285, "learning_rate": 7.094495349838092e-07, "loss": 0.802047848701477, "step": 65 }, { "epoch": 1.2452830188679245, "grad_norm": 2.997891576167027, "learning_rate": 6.998309744925411e-07, "loss": 0.8562427163124084, "step": 66 }, { "epoch": 1.2641509433962264, "grad_norm": 2.7454101056544618, "learning_rate": 6.901235810837667e-07, "loss": 0.8214827179908752, "step": 67 }, { "epoch": 1.2830188679245282, "grad_norm": 2.9952605769764853, "learning_rate": 6.803316700848778e-07, "loss": 0.7995479702949524, "step": 68 }, { "epoch": 1.3018867924528301, "grad_norm": 2.86683247629566, "learning_rate": 6.704595943947385e-07, "loss": 0.8077808022499084, "step": 69 }, { "epoch": 1.320754716981132, "grad_norm": 2.7702979738330322, "learning_rate": 6.605117425486481e-07, "loss": 0.8417398929595947, "step": 70 }, { "epoch": 1.3396226415094339, "grad_norm": 2.725158428984504, "learning_rate": 6.504925367674594e-07, "loss": 0.8494030833244324, "step": 71 }, { "epoch": 1.3584905660377358, "grad_norm": 2.8106277256279255, "learning_rate": 6.40406430991723e-07, "loss": 0.8620424866676331, "step": 72 }, { "epoch": 1.3773584905660377, "grad_norm": 2.818628329932316, "learning_rate": 6.302579089017327e-07, "loss": 0.8398749232292175, "step": 73 }, { "epoch": 1.3962264150943398, "grad_norm": 2.745904001646307, "learning_rate": 6.200514819243475e-07, "loss": 0.8420323133468628, "step": 74 }, { "epoch": 1.4150943396226414, "grad_norm": 2.7850840819985416, "learning_rate": 6.097916872274814e-07, "loss": 0.8359158635139465, "step": 75 }, { "epoch": 1.4339622641509435, "grad_norm": 2.793048578545994, "learning_rate": 5.994830857031499e-07, "loss": 0.8336814641952515, "step": 76 }, { "epoch": 1.4528301886792452, "grad_norm": 2.8505241824701826, "learning_rate": 5.891302599399684e-07, "loss": 0.7930982112884521, "step": 77 }, { "epoch": 1.4716981132075473, "grad_norm": 2.6769256052426615, "learning_rate": 5.78737812186009e-07, "loss": 0.8192281723022461, "step": 78 }, { "epoch": 1.490566037735849, "grad_norm": 2.7762595596745916, "learning_rate": 5.683103623029134e-07, "loss": 0.8389377593994141, "step": 79 }, { "epoch": 1.509433962264151, "grad_norm": 2.8899154085340166, "learning_rate": 5.578525457121806e-07, "loss": 0.8256187438964844, "step": 80 }, { "epoch": 1.5283018867924527, "grad_norm": 2.7720983651750917, "learning_rate": 5.473690113345342e-07, "loss": 0.8473238945007324, "step": 81 }, { "epoch": 1.5471698113207548, "grad_norm": 2.8065774463241495, "learning_rate": 5.368644195232895e-07, "loss": 0.8165145516395569, "step": 82 }, { "epoch": 1.5660377358490565, "grad_norm": 2.9614754969968016, "learning_rate": 5.263434399926398e-07, "loss": 0.8529609441757202, "step": 83 }, { "epoch": 1.5849056603773586, "grad_norm": 2.90447128441676, "learning_rate": 5.158107497417794e-07, "loss": 0.8249980211257935, "step": 84 }, { "epoch": 1.6037735849056602, "grad_norm": 2.7563670691746767, "learning_rate": 5.052710309757898e-07, "loss": 0.7900608777999878, "step": 85 }, { "epoch": 1.6226415094339623, "grad_norm": 2.781624786647774, "learning_rate": 4.947289690242102e-07, "loss": 0.7917711734771729, "step": 86 }, { "epoch": 1.641509433962264, "grad_norm": 2.8227831992064165, "learning_rate": 4.841892502582205e-07, "loss": 0.8228881359100342, "step": 87 }, { "epoch": 1.6603773584905661, "grad_norm": 3.0626612203128687, "learning_rate": 4.736565600073602e-07, "loss": 0.8176588416099548, "step": 88 }, { "epoch": 1.6792452830188678, "grad_norm": 2.7691999193756316, "learning_rate": 4.6313558047671047e-07, "loss": 0.8315557837486267, "step": 89 }, { "epoch": 1.6981132075471699, "grad_norm": 2.9603416787137276, "learning_rate": 4.5263098866546586e-07, "loss": 0.8079712390899658, "step": 90 }, { "epoch": 1.7169811320754715, "grad_norm": 2.7648310195075023, "learning_rate": 4.421474542878194e-07, "loss": 0.7854694128036499, "step": 91 }, { "epoch": 1.7358490566037736, "grad_norm": 2.9565749840190736, "learning_rate": 4.316896376970866e-07, "loss": 0.8382487297058105, "step": 92 }, { "epoch": 1.7547169811320755, "grad_norm": 2.904524931485949, "learning_rate": 4.2126218781399114e-07, "loss": 0.8337287902832031, "step": 93 }, { "epoch": 1.7735849056603774, "grad_norm": 2.9419686201700794, "learning_rate": 4.1086974006003154e-07, "loss": 0.8450314402580261, "step": 94 }, { "epoch": 1.7924528301886793, "grad_norm": 2.738066358519684, "learning_rate": 4.0051691429685023e-07, "loss": 0.7846765518188477, "step": 95 }, { "epoch": 1.8113207547169812, "grad_norm": 2.7276079074380895, "learning_rate": 3.902083127725186e-07, "loss": 0.814504861831665, "step": 96 }, { "epoch": 1.830188679245283, "grad_norm": 2.8093937971147835, "learning_rate": 3.799485180756525e-07, "loss": 0.8011671304702759, "step": 97 }, { "epoch": 1.849056603773585, "grad_norm": 2.842796846086812, "learning_rate": 3.697420910982672e-07, "loss": 0.8165295124053955, "step": 98 }, { "epoch": 1.8679245283018868, "grad_norm": 2.8189503982268977, "learning_rate": 3.5959356900827687e-07, "loss": 0.8199301958084106, "step": 99 }, { "epoch": 1.8867924528301887, "grad_norm": 2.910644604198592, "learning_rate": 3.4950746323254063e-07, "loss": 0.8019869327545166, "step": 100 }, { "epoch": 1.9056603773584906, "grad_norm": 2.863904675767849, "learning_rate": 3.394882574513519e-07, "loss": 0.8060827255249023, "step": 101 }, { "epoch": 1.9245283018867925, "grad_norm": 2.8904123754351723, "learning_rate": 3.295404056052616e-07, "loss": 0.8078351020812988, "step": 102 }, { "epoch": 1.9433962264150944, "grad_norm": 2.8850916542883778, "learning_rate": 3.1966832991512225e-07, "loss": 0.8068495988845825, "step": 103 }, { "epoch": 1.9622641509433962, "grad_norm": 2.9528533111592865, "learning_rate": 3.0987641891623315e-07, "loss": 0.8184278011322021, "step": 104 }, { "epoch": 1.9811320754716981, "grad_norm": 2.869159446180868, "learning_rate": 3.0016902550745895e-07, "loss": 0.8299746513366699, "step": 105 }, { "epoch": 2.0, "grad_norm": 2.778568933671074, "learning_rate": 2.9055046501619083e-07, "loss": 0.785747766494751, "step": 106 }, { "epoch": 2.018867924528302, "grad_norm": 2.9408610818195062, "learning_rate": 2.810250132800103e-07, "loss": 0.7670397758483887, "step": 107 }, { "epoch": 2.0377358490566038, "grad_norm": 2.6257935800346694, "learning_rate": 2.715969047459066e-07, "loss": 0.7878092527389526, "step": 108 }, { "epoch": 2.056603773584906, "grad_norm": 3.058449053263793, "learning_rate": 2.6227033058789403e-07, "loss": 0.7904379367828369, "step": 109 }, { "epoch": 2.0754716981132075, "grad_norm": 2.88973427193669, "learning_rate": 2.5304943684386825e-07, "loss": 0.8011707067489624, "step": 110 }, { "epoch": 2.0943396226415096, "grad_norm": 2.723021754211135, "learning_rate": 2.439383225725225e-07, "loss": 0.7658779621124268, "step": 111 }, { "epoch": 2.1132075471698113, "grad_norm": 2.787460559434829, "learning_rate": 2.3494103803115318e-07, "loss": 0.7720337510108948, "step": 112 }, { "epoch": 2.1320754716981134, "grad_norm": 2.7422069166294802, "learning_rate": 2.2606158287515658e-07, "loss": 0.7842212915420532, "step": 113 }, { "epoch": 2.150943396226415, "grad_norm": 3.381034950183202, "learning_rate": 2.1730390438002056e-07, "loss": 0.7690730094909668, "step": 114 }, { "epoch": 2.169811320754717, "grad_norm": 2.7764924352985663, "learning_rate": 2.0867189568660236e-07, "loss": 0.7737655639648438, "step": 115 }, { "epoch": 2.188679245283019, "grad_norm": 2.8245587551592264, "learning_rate": 2.0016939407046986e-07, "loss": 0.7852470278739929, "step": 116 }, { "epoch": 2.207547169811321, "grad_norm": 3.429004827616326, "learning_rate": 1.9180017923607883e-07, "loss": 0.7893455624580383, "step": 117 }, { "epoch": 2.2264150943396226, "grad_norm": 3.1969648790899408, "learning_rate": 1.835679716365417e-07, "loss": 0.7634609937667847, "step": 118 }, { "epoch": 2.2452830188679247, "grad_norm": 2.70318214433158, "learning_rate": 1.7547643081973578e-07, "loss": 0.7859703898429871, "step": 119 }, { "epoch": 2.2641509433962264, "grad_norm": 2.961996890522788, "learning_rate": 1.6752915380148768e-07, "loss": 0.7709099650382996, "step": 120 }, { "epoch": 2.2830188679245285, "grad_norm": 2.8177889556978095, "learning_rate": 1.5972967346655448e-07, "loss": 0.7789061069488525, "step": 121 }, { "epoch": 2.30188679245283, "grad_norm": 3.320024417308839, "learning_rate": 1.5208145699811415e-07, "loss": 0.7862054705619812, "step": 122 }, { "epoch": 2.3207547169811322, "grad_norm": 2.8631784669698415, "learning_rate": 1.4458790433646263e-07, "loss": 0.7816888689994812, "step": 123 }, { "epoch": 2.339622641509434, "grad_norm": 2.902161614336072, "learning_rate": 1.3725234666760427e-07, "loss": 0.7391059398651123, "step": 124 }, { "epoch": 2.358490566037736, "grad_norm": 2.882470659827849, "learning_rate": 1.3007804494240476e-07, "loss": 0.7627633810043335, "step": 125 }, { "epoch": 2.3773584905660377, "grad_norm": 2.8433427591245284, "learning_rate": 1.2306818842696715e-07, "loss": 0.7769066095352173, "step": 126 }, { "epoch": 2.3962264150943398, "grad_norm": 2.8617729260756573, "learning_rate": 1.1622589328487503e-07, "loss": 0.7934216856956482, "step": 127 }, { "epoch": 2.4150943396226414, "grad_norm": 2.8509595069990823, "learning_rate": 1.0955420119193198e-07, "loss": 0.7673547863960266, "step": 128 }, { "epoch": 2.4339622641509435, "grad_norm": 2.874293982355328, "learning_rate": 1.03056077984014e-07, "loss": 0.7849991917610168, "step": 129 }, { "epoch": 2.452830188679245, "grad_norm": 3.0937215388279, "learning_rate": 9.673441233863661e-08, "loss": 0.7473263740539551, "step": 130 }, { "epoch": 2.4716981132075473, "grad_norm": 2.9292035796935054, "learning_rate": 9.059201449082043e-08, "loss": 0.784021258354187, "step": 131 }, { "epoch": 2.490566037735849, "grad_norm": 2.810444173384006, "learning_rate": 8.463161498382949e-08, "loss": 0.7882828712463379, "step": 132 }, { "epoch": 2.509433962264151, "grad_norm": 2.829313317652292, "learning_rate": 7.885586345533396e-08, "loss": 0.7572199702262878, "step": 133 }, { "epoch": 2.5283018867924527, "grad_norm": 2.6656369607187567, "learning_rate": 7.326732745954e-08, "loss": 0.7826784253120422, "step": 134 }, { "epoch": 2.547169811320755, "grad_norm": 2.7036355808226173, "learning_rate": 6.786849132580841e-08, "loss": 0.7726486325263977, "step": 135 }, { "epoch": 2.5660377358490565, "grad_norm": 2.805033772692598, "learning_rate": 6.266175505426957e-08, "loss": 0.7736940383911133, "step": 136 }, { "epoch": 2.5849056603773586, "grad_norm": 2.8181269221147396, "learning_rate": 5.7649433248927794e-08, "loss": 0.7888213396072388, "step": 137 }, { "epoch": 2.6037735849056602, "grad_norm": 2.9760303324315256, "learning_rate": 5.283375408872537e-08, "loss": 0.7611340284347534, "step": 138 }, { "epoch": 2.6226415094339623, "grad_norm": 2.828152013200315, "learning_rate": 4.821685833702849e-08, "loss": 0.779454231262207, "step": 139 }, { "epoch": 2.641509433962264, "grad_norm": 2.8581322420761786, "learning_rate": 4.3800798389970863e-08, "loss": 0.769560694694519, "step": 140 }, { "epoch": 2.660377358490566, "grad_norm": 2.8125888801619103, "learning_rate": 3.958753736408105e-08, "loss": 0.7890896797180176, "step": 141 }, { "epoch": 2.6792452830188678, "grad_norm": 2.757727954638762, "learning_rate": 3.557894822359864e-08, "loss": 0.7476776838302612, "step": 142 }, { "epoch": 2.69811320754717, "grad_norm": 2.802525331124496, "learning_rate": 3.1776812947865384e-08, "loss": 0.7551087737083435, "step": 143 }, { "epoch": 2.7169811320754715, "grad_norm": 3.172109709327269, "learning_rate": 2.818282173916453e-08, "loss": 0.7675119638442993, "step": 144 }, { "epoch": 2.7358490566037736, "grad_norm": 2.836017838014085, "learning_rate": 2.4798572271356843e-08, "loss": 0.7670686841011047, "step": 145 }, { "epoch": 2.7547169811320753, "grad_norm": 2.9198667506437905, "learning_rate": 2.162556897965101e-08, "loss": 0.7993500828742981, "step": 146 }, { "epoch": 2.7735849056603774, "grad_norm": 2.795471164301072, "learning_rate": 1.8665222391821166e-08, "loss": 0.7754116654396057, "step": 147 }, { "epoch": 2.7924528301886795, "grad_norm": 2.7725526525432787, "learning_rate": 1.5918848501170644e-08, "loss": 0.7710179090499878, "step": 148 }, { "epoch": 2.811320754716981, "grad_norm": 2.784214561225124, "learning_rate": 1.3387668181519818e-08, "loss": 0.7384580969810486, "step": 149 }, { "epoch": 2.830188679245283, "grad_norm": 2.8847249743481833, "learning_rate": 1.1072806644478738e-08, "loss": 0.7740883827209473, "step": 150 }, { "epoch": 2.849056603773585, "grad_norm": 2.8315645307075945, "learning_rate": 8.975292939244927e-09, "loss": 0.7919697165489197, "step": 151 }, { "epoch": 2.867924528301887, "grad_norm": 2.9085892225722034, "learning_rate": 7.096059495149853e-09, "loss": 0.781722903251648, "step": 152 }, { "epoch": 2.8867924528301887, "grad_norm": 2.7506543384708224, "learning_rate": 5.435941707156388e-09, "loss": 0.7471998929977417, "step": 153 }, { "epoch": 2.9056603773584904, "grad_norm": 2.8426972222396136, "learning_rate": 3.995677564492039e-09, "loss": 0.7751771807670593, "step": 154 }, { "epoch": 2.9245283018867925, "grad_norm": 2.844363880881091, "learning_rate": 2.7759073225832597e-09, "loss": 0.7668254375457764, "step": 155 }, { "epoch": 2.9433962264150946, "grad_norm": 3.278094344932399, "learning_rate": 1.7771732184357901e-09, "loss": 0.7961957454681396, "step": 156 }, { "epoch": 2.9622641509433962, "grad_norm": 2.9897635623753955, "learning_rate": 9.999192295886971e-10, "loss": 0.7848834991455078, "step": 157 }, { "epoch": 2.981132075471698, "grad_norm": 2.748244107712091, "learning_rate": 4.4449087674847117e-10, "loss": 0.777495801448822, "step": 158 }, { "epoch": 3.0, "grad_norm": 2.9554977361208974, "learning_rate": 1.1113507019094858e-10, "loss": 0.7618961334228516, "step": 159 }, { "epoch": 3.0, "step": 159, "total_flos": 23335512768512.0, "train_loss": 0.8809327138294963, "train_runtime": 1440.1859, "train_samples_per_second": 3.485, "train_steps_per_second": 0.11 } ], "logging_steps": 1.0, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 23335512768512.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }