{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9971862689926843, "eval_steps": 500, "global_step": 2664, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011254924029262803, "grad_norm": 52.80103639717517, "learning_rate": 1.8726591760299626e-07, "loss": 11.1109, "step": 1 }, { "epoch": 0.0022509848058525606, "grad_norm": 54.14343134228205, "learning_rate": 3.7453183520599253e-07, "loss": 11.1518, "step": 2 }, { "epoch": 0.0033764772087788407, "grad_norm": 55.956204989803545, "learning_rate": 5.617977528089887e-07, "loss": 11.0032, "step": 3 }, { "epoch": 0.004501969611705121, "grad_norm": 52.83455219346365, "learning_rate": 7.490636704119851e-07, "loss": 10.9916, "step": 4 }, { "epoch": 0.005627462014631401, "grad_norm": 55.14267600803135, "learning_rate": 9.363295880149814e-07, "loss": 11.0876, "step": 5 }, { "epoch": 0.006752954417557681, "grad_norm": 54.398858725586614, "learning_rate": 1.1235955056179775e-06, "loss": 11.1606, "step": 6 }, { "epoch": 0.007878446820483961, "grad_norm": 54.32209978082585, "learning_rate": 1.310861423220974e-06, "loss": 11.0376, "step": 7 }, { "epoch": 0.009003939223410242, "grad_norm": 58.95322793439154, "learning_rate": 1.4981273408239701e-06, "loss": 10.7967, "step": 8 }, { "epoch": 0.010129431626336522, "grad_norm": 61.239998437535036, "learning_rate": 1.6853932584269663e-06, "loss": 10.6462, "step": 9 }, { "epoch": 0.011254924029262802, "grad_norm": 61.97431021564583, "learning_rate": 1.8726591760299627e-06, "loss": 10.7333, "step": 10 }, { "epoch": 0.012380416432189083, "grad_norm": 81.14048682798334, "learning_rate": 2.0599250936329587e-06, "loss": 9.5671, "step": 11 }, { "epoch": 0.013505908835115363, "grad_norm": 86.6613892998916, "learning_rate": 2.247191011235955e-06, "loss": 9.3288, "step": 12 }, { "epoch": 0.014631401238041642, "grad_norm": 97.56062675082107, "learning_rate": 2.4344569288389516e-06, "loss": 8.8569, "step": 13 }, { "epoch": 0.015756893640967922, "grad_norm": 99.72668638976701, "learning_rate": 2.621722846441948e-06, "loss": 8.8229, "step": 14 }, { "epoch": 0.016882386043894203, "grad_norm": 68.62441737325906, "learning_rate": 2.808988764044944e-06, "loss": 4.106, "step": 15 }, { "epoch": 0.018007878446820485, "grad_norm": 60.37331191202543, "learning_rate": 2.9962546816479402e-06, "loss": 3.608, "step": 16 }, { "epoch": 0.019133370849746763, "grad_norm": 48.17406200608835, "learning_rate": 3.1835205992509364e-06, "loss": 3.0803, "step": 17 }, { "epoch": 0.020258863252673044, "grad_norm": 36.4342554407753, "learning_rate": 3.3707865168539327e-06, "loss": 2.5875, "step": 18 }, { "epoch": 0.021384355655599326, "grad_norm": 31.524091079255932, "learning_rate": 3.558052434456929e-06, "loss": 2.33, "step": 19 }, { "epoch": 0.022509848058525603, "grad_norm": 6.74346485385345, "learning_rate": 3.7453183520599255e-06, "loss": 1.3777, "step": 20 }, { "epoch": 0.023635340461451885, "grad_norm": 5.1275756648686786, "learning_rate": 3.932584269662922e-06, "loss": 1.3449, "step": 21 }, { "epoch": 0.024760832864378166, "grad_norm": 4.059089851221617, "learning_rate": 4.1198501872659175e-06, "loss": 1.248, "step": 22 }, { "epoch": 0.025886325267304444, "grad_norm": 3.461699822238443, "learning_rate": 4.307116104868914e-06, "loss": 1.2294, "step": 23 }, { "epoch": 0.027011817670230726, "grad_norm": 2.6194967883079197, "learning_rate": 4.49438202247191e-06, "loss": 1.1185, "step": 24 }, { "epoch": 0.028137310073157007, "grad_norm": 2.1602870716248015, "learning_rate": 4.6816479400749066e-06, "loss": 1.1193, "step": 25 }, { "epoch": 0.029262802476083285, "grad_norm": 1.8135748254982695, "learning_rate": 4.868913857677903e-06, "loss": 1.0589, "step": 26 }, { "epoch": 0.030388294879009566, "grad_norm": 1.321133306398482, "learning_rate": 5.056179775280899e-06, "loss": 0.9617, "step": 27 }, { "epoch": 0.031513787281935844, "grad_norm": 31.284838936554156, "learning_rate": 5.243445692883896e-06, "loss": 0.9937, "step": 28 }, { "epoch": 0.032639279684862126, "grad_norm": 1.8832749187420972, "learning_rate": 5.430711610486891e-06, "loss": 0.9144, "step": 29 }, { "epoch": 0.03376477208778841, "grad_norm": 1.4607030575064903, "learning_rate": 5.617977528089888e-06, "loss": 0.8985, "step": 30 }, { "epoch": 0.03489026449071469, "grad_norm": 1.0370555540895343, "learning_rate": 5.805243445692885e-06, "loss": 0.8404, "step": 31 }, { "epoch": 0.03601575689364097, "grad_norm": 0.9098468238765742, "learning_rate": 5.9925093632958805e-06, "loss": 0.8352, "step": 32 }, { "epoch": 0.03714124929656725, "grad_norm": 0.8661052907885602, "learning_rate": 6.179775280898876e-06, "loss": 0.8258, "step": 33 }, { "epoch": 0.038266741699493526, "grad_norm": 0.7740808609488935, "learning_rate": 6.367041198501873e-06, "loss": 0.8324, "step": 34 }, { "epoch": 0.03939223410241981, "grad_norm": 0.7782713659204045, "learning_rate": 6.554307116104869e-06, "loss": 0.7588, "step": 35 }, { "epoch": 0.04051772650534609, "grad_norm": 0.6841020945645767, "learning_rate": 6.741573033707865e-06, "loss": 0.7682, "step": 36 }, { "epoch": 0.04164321890827237, "grad_norm": 0.6004121622938939, "learning_rate": 6.928838951310862e-06, "loss": 0.7549, "step": 37 }, { "epoch": 0.04276871131119865, "grad_norm": 0.6229597097596257, "learning_rate": 7.116104868913858e-06, "loss": 0.7376, "step": 38 }, { "epoch": 0.04389420371412493, "grad_norm": 0.7141033532392286, "learning_rate": 7.303370786516854e-06, "loss": 0.7535, "step": 39 }, { "epoch": 0.04501969611705121, "grad_norm": 0.5725153155935927, "learning_rate": 7.490636704119851e-06, "loss": 0.7185, "step": 40 }, { "epoch": 0.04614518851997749, "grad_norm": 0.5549438958370185, "learning_rate": 7.677902621722846e-06, "loss": 0.7518, "step": 41 }, { "epoch": 0.04727068092290377, "grad_norm": 0.4660101265627369, "learning_rate": 7.865168539325843e-06, "loss": 0.6787, "step": 42 }, { "epoch": 0.04839617332583005, "grad_norm": 0.4908539170309294, "learning_rate": 8.05243445692884e-06, "loss": 0.7032, "step": 43 }, { "epoch": 0.04952166572875633, "grad_norm": 0.48924522260651016, "learning_rate": 8.239700374531835e-06, "loss": 0.6803, "step": 44 }, { "epoch": 0.050647158131682614, "grad_norm": 0.475140896111031, "learning_rate": 8.426966292134832e-06, "loss": 0.6475, "step": 45 }, { "epoch": 0.05177265053460889, "grad_norm": 0.4644093059355716, "learning_rate": 8.614232209737828e-06, "loss": 0.7013, "step": 46 }, { "epoch": 0.05289814293753517, "grad_norm": 0.40301032630352857, "learning_rate": 8.801498127340826e-06, "loss": 0.6463, "step": 47 }, { "epoch": 0.05402363534046145, "grad_norm": 0.43480363638927505, "learning_rate": 8.98876404494382e-06, "loss": 0.6775, "step": 48 }, { "epoch": 0.05514912774338773, "grad_norm": 0.43971181451177166, "learning_rate": 9.176029962546817e-06, "loss": 0.7007, "step": 49 }, { "epoch": 0.056274620146314014, "grad_norm": 0.41896418177510275, "learning_rate": 9.363295880149813e-06, "loss": 0.6468, "step": 50 }, { "epoch": 0.057400112549240295, "grad_norm": 0.4149971177588748, "learning_rate": 9.550561797752809e-06, "loss": 0.628, "step": 51 }, { "epoch": 0.05852560495216657, "grad_norm": 0.37242192155253623, "learning_rate": 9.737827715355806e-06, "loss": 0.652, "step": 52 }, { "epoch": 0.05965109735509285, "grad_norm": 0.327485240758468, "learning_rate": 9.925093632958802e-06, "loss": 0.6334, "step": 53 }, { "epoch": 0.06077658975801913, "grad_norm": 0.36141343502753065, "learning_rate": 1.0112359550561798e-05, "loss": 0.6259, "step": 54 }, { "epoch": 0.061902082160945414, "grad_norm": 0.38897211704559004, "learning_rate": 1.0299625468164795e-05, "loss": 0.6226, "step": 55 }, { "epoch": 0.06302757456387169, "grad_norm": 0.36207952707026725, "learning_rate": 1.0486891385767791e-05, "loss": 0.6289, "step": 56 }, { "epoch": 0.06415306696679797, "grad_norm": 0.28595916020001694, "learning_rate": 1.0674157303370787e-05, "loss": 0.6149, "step": 57 }, { "epoch": 0.06527855936972425, "grad_norm": 0.29359683815567633, "learning_rate": 1.0861423220973783e-05, "loss": 0.5888, "step": 58 }, { "epoch": 0.06640405177265053, "grad_norm": 0.3228509053817298, "learning_rate": 1.104868913857678e-05, "loss": 0.6328, "step": 59 }, { "epoch": 0.06752954417557681, "grad_norm": 0.3068303518794903, "learning_rate": 1.1235955056179776e-05, "loss": 0.571, "step": 60 }, { "epoch": 0.0686550365785031, "grad_norm": 0.3231501093567655, "learning_rate": 1.1423220973782772e-05, "loss": 0.5728, "step": 61 }, { "epoch": 0.06978052898142938, "grad_norm": 0.2827526067701556, "learning_rate": 1.161048689138577e-05, "loss": 0.5919, "step": 62 }, { "epoch": 0.07090602138435566, "grad_norm": 0.3490733036925077, "learning_rate": 1.1797752808988765e-05, "loss": 0.6319, "step": 63 }, { "epoch": 0.07203151378728194, "grad_norm": 0.36049201575238243, "learning_rate": 1.1985018726591761e-05, "loss": 0.6065, "step": 64 }, { "epoch": 0.07315700619020822, "grad_norm": 0.2817612900392732, "learning_rate": 1.2172284644194758e-05, "loss": 0.6022, "step": 65 }, { "epoch": 0.0742824985931345, "grad_norm": 0.27300283931060443, "learning_rate": 1.2359550561797752e-05, "loss": 0.5783, "step": 66 }, { "epoch": 0.07540799099606077, "grad_norm": 0.3421112627990278, "learning_rate": 1.254681647940075e-05, "loss": 0.576, "step": 67 }, { "epoch": 0.07653348339898705, "grad_norm": 0.33598705329341366, "learning_rate": 1.2734082397003746e-05, "loss": 0.5835, "step": 68 }, { "epoch": 0.07765897580191333, "grad_norm": 0.27960476280957486, "learning_rate": 1.2921348314606743e-05, "loss": 0.5987, "step": 69 }, { "epoch": 0.07878446820483961, "grad_norm": 0.2965350125129841, "learning_rate": 1.3108614232209737e-05, "loss": 0.6026, "step": 70 }, { "epoch": 0.0799099606077659, "grad_norm": 0.3122772390396813, "learning_rate": 1.3295880149812733e-05, "loss": 0.574, "step": 71 }, { "epoch": 0.08103545301069218, "grad_norm": 0.3021816040434541, "learning_rate": 1.348314606741573e-05, "loss": 0.5771, "step": 72 }, { "epoch": 0.08216094541361846, "grad_norm": 0.2831578746374877, "learning_rate": 1.3670411985018728e-05, "loss": 0.5675, "step": 73 }, { "epoch": 0.08328643781654474, "grad_norm": 0.32441513984635667, "learning_rate": 1.3857677902621724e-05, "loss": 0.5652, "step": 74 }, { "epoch": 0.08441193021947102, "grad_norm": 0.31509832756589373, "learning_rate": 1.4044943820224721e-05, "loss": 0.5725, "step": 75 }, { "epoch": 0.0855374226223973, "grad_norm": 0.3068003737845105, "learning_rate": 1.4232209737827715e-05, "loss": 0.5921, "step": 76 }, { "epoch": 0.08666291502532358, "grad_norm": 0.28569121242288503, "learning_rate": 1.4419475655430711e-05, "loss": 0.5517, "step": 77 }, { "epoch": 0.08778840742824986, "grad_norm": 0.30318713510099926, "learning_rate": 1.4606741573033709e-05, "loss": 0.5786, "step": 78 }, { "epoch": 0.08891389983117615, "grad_norm": 0.32791686866753017, "learning_rate": 1.4794007490636705e-05, "loss": 0.5835, "step": 79 }, { "epoch": 0.09003939223410241, "grad_norm": 0.34541735995694495, "learning_rate": 1.4981273408239702e-05, "loss": 0.6003, "step": 80 }, { "epoch": 0.0911648846370287, "grad_norm": 0.24219057822403553, "learning_rate": 1.5168539325842698e-05, "loss": 0.5634, "step": 81 }, { "epoch": 0.09229037703995498, "grad_norm": 0.3066124460269189, "learning_rate": 1.5355805243445692e-05, "loss": 0.5385, "step": 82 }, { "epoch": 0.09341586944288126, "grad_norm": 0.36004311246679105, "learning_rate": 1.554307116104869e-05, "loss": 0.542, "step": 83 }, { "epoch": 0.09454136184580754, "grad_norm": 0.277294813524559, "learning_rate": 1.5730337078651687e-05, "loss": 0.5467, "step": 84 }, { "epoch": 0.09566685424873382, "grad_norm": 0.2742529403377881, "learning_rate": 1.591760299625468e-05, "loss": 0.5337, "step": 85 }, { "epoch": 0.0967923466516601, "grad_norm": 0.37776459034853405, "learning_rate": 1.610486891385768e-05, "loss": 0.5392, "step": 86 }, { "epoch": 0.09791783905458638, "grad_norm": 0.29713498839858976, "learning_rate": 1.6292134831460676e-05, "loss": 0.5513, "step": 87 }, { "epoch": 0.09904333145751266, "grad_norm": 0.2677802103514856, "learning_rate": 1.647940074906367e-05, "loss": 0.5435, "step": 88 }, { "epoch": 0.10016882386043895, "grad_norm": 0.3282651538789268, "learning_rate": 1.6666666666666667e-05, "loss": 0.5556, "step": 89 }, { "epoch": 0.10129431626336523, "grad_norm": 0.2903898300830952, "learning_rate": 1.6853932584269665e-05, "loss": 0.5182, "step": 90 }, { "epoch": 0.10241980866629151, "grad_norm": 0.32940772248776146, "learning_rate": 1.704119850187266e-05, "loss": 0.5651, "step": 91 }, { "epoch": 0.10354530106921778, "grad_norm": 0.29877064796568714, "learning_rate": 1.7228464419475657e-05, "loss": 0.5232, "step": 92 }, { "epoch": 0.10467079347214406, "grad_norm": 0.3033306544759112, "learning_rate": 1.7415730337078654e-05, "loss": 0.5415, "step": 93 }, { "epoch": 0.10579628587507034, "grad_norm": 0.298699393195244, "learning_rate": 1.760299625468165e-05, "loss": 0.5351, "step": 94 }, { "epoch": 0.10692177827799662, "grad_norm": 0.27344653088956217, "learning_rate": 1.7790262172284646e-05, "loss": 0.5401, "step": 95 }, { "epoch": 0.1080472706809229, "grad_norm": 0.2901283593528549, "learning_rate": 1.797752808988764e-05, "loss": 0.5548, "step": 96 }, { "epoch": 0.10917276308384918, "grad_norm": 0.2955399438690073, "learning_rate": 1.8164794007490637e-05, "loss": 0.5336, "step": 97 }, { "epoch": 0.11029825548677546, "grad_norm": 0.3044365394746884, "learning_rate": 1.8352059925093635e-05, "loss": 0.5095, "step": 98 }, { "epoch": 0.11142374788970175, "grad_norm": 0.26929920330702195, "learning_rate": 1.8539325842696632e-05, "loss": 0.5569, "step": 99 }, { "epoch": 0.11254924029262803, "grad_norm": 0.36727845819131605, "learning_rate": 1.8726591760299626e-05, "loss": 0.5818, "step": 100 }, { "epoch": 0.11367473269555431, "grad_norm": 0.2836581651837986, "learning_rate": 1.891385767790262e-05, "loss": 0.5373, "step": 101 }, { "epoch": 0.11480022509848059, "grad_norm": 0.29593257115280464, "learning_rate": 1.9101123595505618e-05, "loss": 0.5131, "step": 102 }, { "epoch": 0.11592571750140687, "grad_norm": 0.29964886160890525, "learning_rate": 1.9288389513108615e-05, "loss": 0.5044, "step": 103 }, { "epoch": 0.11705120990433314, "grad_norm": 0.30009105696644967, "learning_rate": 1.9475655430711613e-05, "loss": 0.536, "step": 104 }, { "epoch": 0.11817670230725942, "grad_norm": 0.29291717707624504, "learning_rate": 1.9662921348314607e-05, "loss": 0.5505, "step": 105 }, { "epoch": 0.1193021947101857, "grad_norm": 0.3294836067555843, "learning_rate": 1.9850187265917604e-05, "loss": 0.5505, "step": 106 }, { "epoch": 0.12042768711311198, "grad_norm": 0.29401137621422074, "learning_rate": 2.00374531835206e-05, "loss": 0.528, "step": 107 }, { "epoch": 0.12155317951603826, "grad_norm": 0.3030811720009754, "learning_rate": 2.0224719101123596e-05, "loss": 0.538, "step": 108 }, { "epoch": 0.12267867191896455, "grad_norm": 0.32674282662604665, "learning_rate": 2.0411985018726593e-05, "loss": 0.541, "step": 109 }, { "epoch": 0.12380416432189083, "grad_norm": 0.30319983351235286, "learning_rate": 2.059925093632959e-05, "loss": 0.504, "step": 110 }, { "epoch": 0.12492965672481711, "grad_norm": 0.3402565154469349, "learning_rate": 2.0786516853932585e-05, "loss": 0.5251, "step": 111 }, { "epoch": 0.12605514912774338, "grad_norm": 0.31872601282001034, "learning_rate": 2.0973782771535582e-05, "loss": 0.5286, "step": 112 }, { "epoch": 0.12718064153066966, "grad_norm": 0.34754536732763297, "learning_rate": 2.1161048689138577e-05, "loss": 0.5235, "step": 113 }, { "epoch": 0.12830613393359594, "grad_norm": 0.30998860710868686, "learning_rate": 2.1348314606741574e-05, "loss": 0.525, "step": 114 }, { "epoch": 0.12943162633652222, "grad_norm": 0.32990918540472725, "learning_rate": 2.153558052434457e-05, "loss": 0.5265, "step": 115 }, { "epoch": 0.1305571187394485, "grad_norm": 0.3423710738146026, "learning_rate": 2.1722846441947566e-05, "loss": 0.5338, "step": 116 }, { "epoch": 0.13168261114237478, "grad_norm": 0.2872199647047314, "learning_rate": 2.1910112359550563e-05, "loss": 0.5299, "step": 117 }, { "epoch": 0.13280810354530106, "grad_norm": 0.3317448545714626, "learning_rate": 2.209737827715356e-05, "loss": 0.4959, "step": 118 }, { "epoch": 0.13393359594822735, "grad_norm": 0.31417498563521173, "learning_rate": 2.2284644194756555e-05, "loss": 0.52, "step": 119 }, { "epoch": 0.13505908835115363, "grad_norm": 0.3645759776734259, "learning_rate": 2.2471910112359552e-05, "loss": 0.5296, "step": 120 }, { "epoch": 0.1361845807540799, "grad_norm": 0.3180662213331512, "learning_rate": 2.2659176029962546e-05, "loss": 0.5063, "step": 121 }, { "epoch": 0.1373100731570062, "grad_norm": 0.3716923342200342, "learning_rate": 2.2846441947565544e-05, "loss": 0.5046, "step": 122 }, { "epoch": 0.13843556555993247, "grad_norm": 0.39150702044794555, "learning_rate": 2.303370786516854e-05, "loss": 0.4959, "step": 123 }, { "epoch": 0.13956105796285875, "grad_norm": 0.3713739740316023, "learning_rate": 2.322097378277154e-05, "loss": 0.5015, "step": 124 }, { "epoch": 0.14068655036578503, "grad_norm": 0.355150041365192, "learning_rate": 2.3408239700374533e-05, "loss": 0.5029, "step": 125 }, { "epoch": 0.14181204276871132, "grad_norm": 0.47357406433732624, "learning_rate": 2.359550561797753e-05, "loss": 0.519, "step": 126 }, { "epoch": 0.1429375351716376, "grad_norm": 0.35841513558308474, "learning_rate": 2.3782771535580524e-05, "loss": 0.517, "step": 127 }, { "epoch": 0.14406302757456388, "grad_norm": 0.32127121635614614, "learning_rate": 2.3970037453183522e-05, "loss": 0.5068, "step": 128 }, { "epoch": 0.14518851997749016, "grad_norm": 0.41380038534756397, "learning_rate": 2.415730337078652e-05, "loss": 0.53, "step": 129 }, { "epoch": 0.14631401238041644, "grad_norm": 0.3342860962607464, "learning_rate": 2.4344569288389517e-05, "loss": 0.5131, "step": 130 }, { "epoch": 0.14743950478334272, "grad_norm": 0.328086226882181, "learning_rate": 2.453183520599251e-05, "loss": 0.5359, "step": 131 }, { "epoch": 0.148564997186269, "grad_norm": 0.3980527154392636, "learning_rate": 2.4719101123595505e-05, "loss": 0.4915, "step": 132 }, { "epoch": 0.14969048958919529, "grad_norm": 0.3664150255854856, "learning_rate": 2.4906367041198502e-05, "loss": 0.5239, "step": 133 }, { "epoch": 0.15081598199212154, "grad_norm": 0.36032405515932203, "learning_rate": 2.50936329588015e-05, "loss": 0.5085, "step": 134 }, { "epoch": 0.15194147439504782, "grad_norm": 0.4406027959320581, "learning_rate": 2.5280898876404497e-05, "loss": 0.5126, "step": 135 }, { "epoch": 0.1530669667979741, "grad_norm": 0.344695754779841, "learning_rate": 2.546816479400749e-05, "loss": 0.5122, "step": 136 }, { "epoch": 0.15419245920090038, "grad_norm": 0.3726483933183008, "learning_rate": 2.565543071161049e-05, "loss": 0.4905, "step": 137 }, { "epoch": 0.15531795160382666, "grad_norm": 0.3449312763960655, "learning_rate": 2.5842696629213486e-05, "loss": 0.4987, "step": 138 }, { "epoch": 0.15644344400675295, "grad_norm": 0.35328970504291957, "learning_rate": 2.6029962546816484e-05, "loss": 0.5054, "step": 139 }, { "epoch": 0.15756893640967923, "grad_norm": 0.3700337092111675, "learning_rate": 2.6217228464419475e-05, "loss": 0.509, "step": 140 }, { "epoch": 0.1586944288126055, "grad_norm": 0.301320056673764, "learning_rate": 2.6404494382022472e-05, "loss": 0.4958, "step": 141 }, { "epoch": 0.1598199212155318, "grad_norm": 0.4191378953980472, "learning_rate": 2.6591760299625466e-05, "loss": 0.5387, "step": 142 }, { "epoch": 0.16094541361845807, "grad_norm": 0.3880541184543602, "learning_rate": 2.6779026217228464e-05, "loss": 0.5227, "step": 143 }, { "epoch": 0.16207090602138435, "grad_norm": 0.39927231059272483, "learning_rate": 2.696629213483146e-05, "loss": 0.5237, "step": 144 }, { "epoch": 0.16319639842431063, "grad_norm": 0.3961271339819255, "learning_rate": 2.715355805243446e-05, "loss": 0.5194, "step": 145 }, { "epoch": 0.16432189082723692, "grad_norm": 0.4376696251019293, "learning_rate": 2.7340823970037456e-05, "loss": 0.5178, "step": 146 }, { "epoch": 0.1654473832301632, "grad_norm": 0.44058938921182966, "learning_rate": 2.752808988764045e-05, "loss": 0.4998, "step": 147 }, { "epoch": 0.16657287563308948, "grad_norm": 0.35261095257281155, "learning_rate": 2.7715355805243448e-05, "loss": 0.499, "step": 148 }, { "epoch": 0.16769836803601576, "grad_norm": 0.5218533410763981, "learning_rate": 2.7902621722846445e-05, "loss": 0.5273, "step": 149 }, { "epoch": 0.16882386043894204, "grad_norm": 0.4737891842741366, "learning_rate": 2.8089887640449443e-05, "loss": 0.5003, "step": 150 }, { "epoch": 0.16994935284186832, "grad_norm": 0.392922001496729, "learning_rate": 2.8277153558052437e-05, "loss": 0.5016, "step": 151 }, { "epoch": 0.1710748452447946, "grad_norm": 0.5302514501231146, "learning_rate": 2.846441947565543e-05, "loss": 0.5172, "step": 152 }, { "epoch": 0.17220033764772089, "grad_norm": 0.49803115823639127, "learning_rate": 2.8651685393258425e-05, "loss": 0.4946, "step": 153 }, { "epoch": 0.17332583005064717, "grad_norm": 0.4128451133760804, "learning_rate": 2.8838951310861422e-05, "loss": 0.5232, "step": 154 }, { "epoch": 0.17445132245357345, "grad_norm": 0.6316627266885098, "learning_rate": 2.902621722846442e-05, "loss": 0.5059, "step": 155 }, { "epoch": 0.17557681485649973, "grad_norm": 0.5295204042861669, "learning_rate": 2.9213483146067417e-05, "loss": 0.5243, "step": 156 }, { "epoch": 0.176702307259426, "grad_norm": 0.45607245497823934, "learning_rate": 2.940074906367041e-05, "loss": 0.4821, "step": 157 }, { "epoch": 0.1778277996623523, "grad_norm": 0.6021144875229769, "learning_rate": 2.958801498127341e-05, "loss": 0.5103, "step": 158 }, { "epoch": 0.17895329206527855, "grad_norm": 0.48529780373586173, "learning_rate": 2.9775280898876406e-05, "loss": 0.4922, "step": 159 }, { "epoch": 0.18007878446820483, "grad_norm": 0.4250055623471545, "learning_rate": 2.9962546816479404e-05, "loss": 0.4904, "step": 160 }, { "epoch": 0.1812042768711311, "grad_norm": 0.6512919492582171, "learning_rate": 3.01498127340824e-05, "loss": 0.5145, "step": 161 }, { "epoch": 0.1823297692740574, "grad_norm": 0.45356537836570343, "learning_rate": 3.0337078651685396e-05, "loss": 0.489, "step": 162 }, { "epoch": 0.18345526167698367, "grad_norm": 0.4587778769232854, "learning_rate": 3.052434456928839e-05, "loss": 0.5031, "step": 163 }, { "epoch": 0.18458075407990995, "grad_norm": 0.5209259547751122, "learning_rate": 3.0711610486891384e-05, "loss": 0.5122, "step": 164 }, { "epoch": 0.18570624648283623, "grad_norm": 0.3205075873383086, "learning_rate": 3.089887640449438e-05, "loss": 0.484, "step": 165 }, { "epoch": 0.18683173888576252, "grad_norm": 0.44421922243323253, "learning_rate": 3.108614232209738e-05, "loss": 0.4885, "step": 166 }, { "epoch": 0.1879572312886888, "grad_norm": 0.38376560722257824, "learning_rate": 3.1273408239700376e-05, "loss": 0.5137, "step": 167 }, { "epoch": 0.18908272369161508, "grad_norm": 0.33178548545336345, "learning_rate": 3.1460674157303374e-05, "loss": 0.5214, "step": 168 }, { "epoch": 0.19020821609454136, "grad_norm": 0.3543354285220051, "learning_rate": 3.164794007490637e-05, "loss": 0.4652, "step": 169 }, { "epoch": 0.19133370849746764, "grad_norm": 0.34821873695435235, "learning_rate": 3.183520599250936e-05, "loss": 0.4695, "step": 170 }, { "epoch": 0.19245920090039392, "grad_norm": 0.346452239854666, "learning_rate": 3.202247191011236e-05, "loss": 0.4891, "step": 171 }, { "epoch": 0.1935846933033202, "grad_norm": 0.4398933317388218, "learning_rate": 3.220973782771536e-05, "loss": 0.4911, "step": 172 }, { "epoch": 0.19471018570624649, "grad_norm": 0.3624677233826849, "learning_rate": 3.2397003745318354e-05, "loss": 0.4912, "step": 173 }, { "epoch": 0.19583567810917277, "grad_norm": 0.3699640798637125, "learning_rate": 3.258426966292135e-05, "loss": 0.5004, "step": 174 }, { "epoch": 0.19696117051209905, "grad_norm": 0.41958584077529965, "learning_rate": 3.277153558052435e-05, "loss": 0.4752, "step": 175 }, { "epoch": 0.19808666291502533, "grad_norm": 0.42502324118725465, "learning_rate": 3.295880149812734e-05, "loss": 0.4922, "step": 176 }, { "epoch": 0.1992121553179516, "grad_norm": 0.36517865445954867, "learning_rate": 3.314606741573034e-05, "loss": 0.5048, "step": 177 }, { "epoch": 0.2003376477208779, "grad_norm": 0.41574946856579004, "learning_rate": 3.3333333333333335e-05, "loss": 0.4821, "step": 178 }, { "epoch": 0.20146314012380417, "grad_norm": 0.4378349227779501, "learning_rate": 3.352059925093633e-05, "loss": 0.4933, "step": 179 }, { "epoch": 0.20258863252673046, "grad_norm": 0.4776232193190497, "learning_rate": 3.370786516853933e-05, "loss": 0.4751, "step": 180 }, { "epoch": 0.20371412492965674, "grad_norm": 0.43848154415790724, "learning_rate": 3.389513108614232e-05, "loss": 0.4807, "step": 181 }, { "epoch": 0.20483961733258302, "grad_norm": 0.5845165253893854, "learning_rate": 3.408239700374532e-05, "loss": 0.5009, "step": 182 }, { "epoch": 0.20596510973550927, "grad_norm": 0.4082870415882244, "learning_rate": 3.4269662921348316e-05, "loss": 0.4979, "step": 183 }, { "epoch": 0.20709060213843555, "grad_norm": 0.4718495231442118, "learning_rate": 3.445692883895131e-05, "loss": 0.5038, "step": 184 }, { "epoch": 0.20821609454136183, "grad_norm": 0.473455230356082, "learning_rate": 3.464419475655431e-05, "loss": 0.4833, "step": 185 }, { "epoch": 0.20934158694428812, "grad_norm": 0.46125301819415737, "learning_rate": 3.483146067415731e-05, "loss": 0.4889, "step": 186 }, { "epoch": 0.2104670793472144, "grad_norm": 0.4432990112364735, "learning_rate": 3.5018726591760305e-05, "loss": 0.4807, "step": 187 }, { "epoch": 0.21159257175014068, "grad_norm": 0.5322159815450351, "learning_rate": 3.52059925093633e-05, "loss": 0.5219, "step": 188 }, { "epoch": 0.21271806415306696, "grad_norm": 0.3936101857237881, "learning_rate": 3.5393258426966294e-05, "loss": 0.4922, "step": 189 }, { "epoch": 0.21384355655599324, "grad_norm": 0.53345431181708, "learning_rate": 3.558052434456929e-05, "loss": 0.4897, "step": 190 }, { "epoch": 0.21496904895891952, "grad_norm": 0.6142180117371377, "learning_rate": 3.576779026217228e-05, "loss": 0.493, "step": 191 }, { "epoch": 0.2160945413618458, "grad_norm": 0.41631870227046885, "learning_rate": 3.595505617977528e-05, "loss": 0.4893, "step": 192 }, { "epoch": 0.21722003376477209, "grad_norm": 0.4305104245523169, "learning_rate": 3.614232209737828e-05, "loss": 0.4866, "step": 193 }, { "epoch": 0.21834552616769837, "grad_norm": 0.5169903617970245, "learning_rate": 3.6329588014981274e-05, "loss": 0.4901, "step": 194 }, { "epoch": 0.21947101857062465, "grad_norm": 0.3860200825591215, "learning_rate": 3.651685393258427e-05, "loss": 0.4549, "step": 195 }, { "epoch": 0.22059651097355093, "grad_norm": 0.5230520554579277, "learning_rate": 3.670411985018727e-05, "loss": 0.4724, "step": 196 }, { "epoch": 0.2217220033764772, "grad_norm": 0.39548431249473126, "learning_rate": 3.689138576779027e-05, "loss": 0.4911, "step": 197 }, { "epoch": 0.2228474957794035, "grad_norm": 0.48800271319592975, "learning_rate": 3.7078651685393264e-05, "loss": 0.4572, "step": 198 }, { "epoch": 0.22397298818232977, "grad_norm": 0.41978987611240903, "learning_rate": 3.726591760299626e-05, "loss": 0.4829, "step": 199 }, { "epoch": 0.22509848058525606, "grad_norm": 0.5469472170008755, "learning_rate": 3.745318352059925e-05, "loss": 0.5274, "step": 200 }, { "epoch": 0.22622397298818234, "grad_norm": 0.3918679709299485, "learning_rate": 3.764044943820225e-05, "loss": 0.4766, "step": 201 }, { "epoch": 0.22734946539110862, "grad_norm": 0.4611366578168398, "learning_rate": 3.782771535580524e-05, "loss": 0.4533, "step": 202 }, { "epoch": 0.2284749577940349, "grad_norm": 0.35525042824778863, "learning_rate": 3.801498127340824e-05, "loss": 0.4801, "step": 203 }, { "epoch": 0.22960045019696118, "grad_norm": 0.39795327337608555, "learning_rate": 3.8202247191011236e-05, "loss": 0.4796, "step": 204 }, { "epoch": 0.23072594259988746, "grad_norm": 0.40314796565206873, "learning_rate": 3.838951310861423e-05, "loss": 0.4746, "step": 205 }, { "epoch": 0.23185143500281374, "grad_norm": 0.6186856651894296, "learning_rate": 3.857677902621723e-05, "loss": 0.483, "step": 206 }, { "epoch": 0.23297692740574, "grad_norm": 0.6262230101782875, "learning_rate": 3.876404494382023e-05, "loss": 0.5026, "step": 207 }, { "epoch": 0.23410241980866628, "grad_norm": 0.6885063622065476, "learning_rate": 3.8951310861423226e-05, "loss": 0.4961, "step": 208 }, { "epoch": 0.23522791221159256, "grad_norm": 0.46434262483818484, "learning_rate": 3.913857677902622e-05, "loss": 0.4824, "step": 209 }, { "epoch": 0.23635340461451884, "grad_norm": 0.5876521011749303, "learning_rate": 3.9325842696629214e-05, "loss": 0.4799, "step": 210 }, { "epoch": 0.23747889701744512, "grad_norm": 0.5679577524617186, "learning_rate": 3.951310861423221e-05, "loss": 0.4976, "step": 211 }, { "epoch": 0.2386043894203714, "grad_norm": 0.4948818608996542, "learning_rate": 3.970037453183521e-05, "loss": 0.4689, "step": 212 }, { "epoch": 0.23972988182329769, "grad_norm": 0.5366944392366912, "learning_rate": 3.98876404494382e-05, "loss": 0.5052, "step": 213 }, { "epoch": 0.24085537422622397, "grad_norm": 0.46091893449282645, "learning_rate": 4.00749063670412e-05, "loss": 0.504, "step": 214 }, { "epoch": 0.24198086662915025, "grad_norm": 0.6227113043840353, "learning_rate": 4.0262172284644194e-05, "loss": 0.4947, "step": 215 }, { "epoch": 0.24310635903207653, "grad_norm": 0.37975248168226977, "learning_rate": 4.044943820224719e-05, "loss": 0.4681, "step": 216 }, { "epoch": 0.2442318514350028, "grad_norm": 0.6602796166859184, "learning_rate": 4.063670411985019e-05, "loss": 0.4809, "step": 217 }, { "epoch": 0.2453573438379291, "grad_norm": 0.46707379726848597, "learning_rate": 4.082397003745319e-05, "loss": 0.4586, "step": 218 }, { "epoch": 0.24648283624085537, "grad_norm": 0.58153678436466, "learning_rate": 4.1011235955056184e-05, "loss": 0.508, "step": 219 }, { "epoch": 0.24760832864378166, "grad_norm": 0.6402167998756934, "learning_rate": 4.119850187265918e-05, "loss": 0.5039, "step": 220 }, { "epoch": 0.24873382104670794, "grad_norm": 0.5794603595581886, "learning_rate": 4.138576779026217e-05, "loss": 0.4653, "step": 221 }, { "epoch": 0.24985931344963422, "grad_norm": 0.5230659913502629, "learning_rate": 4.157303370786517e-05, "loss": 0.5038, "step": 222 }, { "epoch": 0.2509848058525605, "grad_norm": 0.6799656883961334, "learning_rate": 4.176029962546817e-05, "loss": 0.4898, "step": 223 }, { "epoch": 0.25211029825548675, "grad_norm": 0.6621661466046944, "learning_rate": 4.1947565543071165e-05, "loss": 0.4844, "step": 224 }, { "epoch": 0.25323579065841306, "grad_norm": 0.5616167950816823, "learning_rate": 4.2134831460674156e-05, "loss": 0.488, "step": 225 }, { "epoch": 0.2543612830613393, "grad_norm": 0.6575771702191943, "learning_rate": 4.232209737827715e-05, "loss": 0.4791, "step": 226 }, { "epoch": 0.2554867754642656, "grad_norm": 0.5488369093936996, "learning_rate": 4.250936329588015e-05, "loss": 0.5005, "step": 227 }, { "epoch": 0.2566122678671919, "grad_norm": 0.6144786136277036, "learning_rate": 4.269662921348315e-05, "loss": 0.4663, "step": 228 }, { "epoch": 0.2577377602701182, "grad_norm": 0.600777544617447, "learning_rate": 4.2883895131086146e-05, "loss": 0.4786, "step": 229 }, { "epoch": 0.25886325267304444, "grad_norm": 0.8023306491917337, "learning_rate": 4.307116104868914e-05, "loss": 0.4861, "step": 230 }, { "epoch": 0.25998874507597075, "grad_norm": 0.5046450430408941, "learning_rate": 4.3258426966292134e-05, "loss": 0.4727, "step": 231 }, { "epoch": 0.261114237478897, "grad_norm": 0.7089229481882726, "learning_rate": 4.344569288389513e-05, "loss": 0.4907, "step": 232 }, { "epoch": 0.2622397298818233, "grad_norm": 0.7578774630778147, "learning_rate": 4.363295880149813e-05, "loss": 0.4568, "step": 233 }, { "epoch": 0.26336522228474957, "grad_norm": 0.43469225376554593, "learning_rate": 4.3820224719101126e-05, "loss": 0.4882, "step": 234 }, { "epoch": 0.2644907146876759, "grad_norm": 0.7064451875198454, "learning_rate": 4.4007490636704124e-05, "loss": 0.4777, "step": 235 }, { "epoch": 0.26561620709060213, "grad_norm": 0.5236443057755517, "learning_rate": 4.419475655430712e-05, "loss": 0.4541, "step": 236 }, { "epoch": 0.26674169949352844, "grad_norm": 0.49987222413307647, "learning_rate": 4.438202247191011e-05, "loss": 0.4813, "step": 237 }, { "epoch": 0.2678671918964547, "grad_norm": 0.4887524110978959, "learning_rate": 4.456928838951311e-05, "loss": 0.4904, "step": 238 }, { "epoch": 0.268992684299381, "grad_norm": 0.43081504689209343, "learning_rate": 4.475655430711611e-05, "loss": 0.4828, "step": 239 }, { "epoch": 0.27011817670230726, "grad_norm": 0.4927726763052006, "learning_rate": 4.4943820224719104e-05, "loss": 0.4661, "step": 240 }, { "epoch": 0.27124366910523356, "grad_norm": 0.48347879754332407, "learning_rate": 4.51310861423221e-05, "loss": 0.4698, "step": 241 }, { "epoch": 0.2723691615081598, "grad_norm": 0.4867564616696551, "learning_rate": 4.531835205992509e-05, "loss": 0.49, "step": 242 }, { "epoch": 0.2734946539110861, "grad_norm": 0.5374635091852026, "learning_rate": 4.550561797752809e-05, "loss": 0.4559, "step": 243 }, { "epoch": 0.2746201463140124, "grad_norm": 0.4772645954109197, "learning_rate": 4.569288389513109e-05, "loss": 0.4717, "step": 244 }, { "epoch": 0.27574563871693863, "grad_norm": 0.4347548915476938, "learning_rate": 4.5880149812734085e-05, "loss": 0.4532, "step": 245 }, { "epoch": 0.27687113111986494, "grad_norm": 0.4759288672038084, "learning_rate": 4.606741573033708e-05, "loss": 0.4945, "step": 246 }, { "epoch": 0.2779966235227912, "grad_norm": 0.5792894788506712, "learning_rate": 4.625468164794008e-05, "loss": 0.4983, "step": 247 }, { "epoch": 0.2791221159257175, "grad_norm": 0.3980178738338264, "learning_rate": 4.644194756554308e-05, "loss": 0.4827, "step": 248 }, { "epoch": 0.28024760832864376, "grad_norm": 0.5408869118408165, "learning_rate": 4.662921348314607e-05, "loss": 0.4528, "step": 249 }, { "epoch": 0.28137310073157007, "grad_norm": 0.5689847204055498, "learning_rate": 4.6816479400749066e-05, "loss": 0.4598, "step": 250 }, { "epoch": 0.2824985931344963, "grad_norm": 0.5617698646408457, "learning_rate": 4.700374531835206e-05, "loss": 0.5022, "step": 251 }, { "epoch": 0.28362408553742263, "grad_norm": 0.45230255865587565, "learning_rate": 4.719101123595506e-05, "loss": 0.47, "step": 252 }, { "epoch": 0.2847495779403489, "grad_norm": 0.42374590684633967, "learning_rate": 4.737827715355805e-05, "loss": 0.466, "step": 253 }, { "epoch": 0.2858750703432752, "grad_norm": 0.4180084231861174, "learning_rate": 4.756554307116105e-05, "loss": 0.4747, "step": 254 }, { "epoch": 0.28700056274620145, "grad_norm": 0.4787386782007702, "learning_rate": 4.7752808988764046e-05, "loss": 0.4611, "step": 255 }, { "epoch": 0.28812605514912776, "grad_norm": 0.40084601864669134, "learning_rate": 4.7940074906367044e-05, "loss": 0.4577, "step": 256 }, { "epoch": 0.289251547552054, "grad_norm": 0.5597034903024327, "learning_rate": 4.812734082397004e-05, "loss": 0.4843, "step": 257 }, { "epoch": 0.2903770399549803, "grad_norm": 0.4389515417232518, "learning_rate": 4.831460674157304e-05, "loss": 0.4667, "step": 258 }, { "epoch": 0.2915025323579066, "grad_norm": 0.5153267551952044, "learning_rate": 4.8501872659176036e-05, "loss": 0.4543, "step": 259 }, { "epoch": 0.2926280247608329, "grad_norm": 0.4414645886637002, "learning_rate": 4.8689138576779034e-05, "loss": 0.4687, "step": 260 }, { "epoch": 0.29375351716375914, "grad_norm": 0.5441323388581608, "learning_rate": 4.8876404494382024e-05, "loss": 0.4904, "step": 261 }, { "epoch": 0.29487900956668545, "grad_norm": 0.47357188882841866, "learning_rate": 4.906367041198502e-05, "loss": 0.4646, "step": 262 }, { "epoch": 0.2960045019696117, "grad_norm": 0.43582942744547837, "learning_rate": 4.925093632958801e-05, "loss": 0.4747, "step": 263 }, { "epoch": 0.297129994372538, "grad_norm": 0.4659511261837298, "learning_rate": 4.943820224719101e-05, "loss": 0.4613, "step": 264 }, { "epoch": 0.29825548677546426, "grad_norm": 0.4561502948161637, "learning_rate": 4.962546816479401e-05, "loss": 0.4691, "step": 265 }, { "epoch": 0.29938097917839057, "grad_norm": 0.4541481932977169, "learning_rate": 4.9812734082397005e-05, "loss": 0.4661, "step": 266 }, { "epoch": 0.3005064715813168, "grad_norm": 0.47547350717861037, "learning_rate": 5e-05, "loss": 0.4678, "step": 267 }, { "epoch": 0.3016319639842431, "grad_norm": 0.406819603933647, "learning_rate": 4.997914059240717e-05, "loss": 0.4381, "step": 268 }, { "epoch": 0.3027574563871694, "grad_norm": 0.4460099057965906, "learning_rate": 4.9958281184814356e-05, "loss": 0.468, "step": 269 }, { "epoch": 0.30388294879009564, "grad_norm": 0.5142189061381415, "learning_rate": 4.9937421777221527e-05, "loss": 0.4714, "step": 270 }, { "epoch": 0.30500844119302195, "grad_norm": 0.5665935978369622, "learning_rate": 4.9916562369628704e-05, "loss": 0.4818, "step": 271 }, { "epoch": 0.3061339335959482, "grad_norm": 0.47186870063336683, "learning_rate": 4.989570296203588e-05, "loss": 0.4793, "step": 272 }, { "epoch": 0.3072594259988745, "grad_norm": 0.457167782453592, "learning_rate": 4.987484355444306e-05, "loss": 0.4404, "step": 273 }, { "epoch": 0.30838491840180077, "grad_norm": 0.5127134764884651, "learning_rate": 4.985398414685023e-05, "loss": 0.465, "step": 274 }, { "epoch": 0.3095104108047271, "grad_norm": 0.6532795598287193, "learning_rate": 4.983312473925741e-05, "loss": 0.518, "step": 275 }, { "epoch": 0.31063590320765333, "grad_norm": 0.6930677368699536, "learning_rate": 4.981226533166458e-05, "loss": 0.4866, "step": 276 }, { "epoch": 0.31176139561057964, "grad_norm": 0.5061640947852638, "learning_rate": 4.979140592407176e-05, "loss": 0.4609, "step": 277 }, { "epoch": 0.3128868880135059, "grad_norm": 0.5257849117764034, "learning_rate": 4.9770546516478936e-05, "loss": 0.457, "step": 278 }, { "epoch": 0.3140123804164322, "grad_norm": 0.4321326701858845, "learning_rate": 4.974968710888611e-05, "loss": 0.4454, "step": 279 }, { "epoch": 0.31513787281935846, "grad_norm": 0.38723391299657206, "learning_rate": 4.972882770129328e-05, "loss": 0.4643, "step": 280 }, { "epoch": 0.31626336522228476, "grad_norm": 0.5298180060499573, "learning_rate": 4.970796829370046e-05, "loss": 0.4746, "step": 281 }, { "epoch": 0.317388857625211, "grad_norm": 0.4061129177315826, "learning_rate": 4.968710888610764e-05, "loss": 0.4483, "step": 282 }, { "epoch": 0.3185143500281373, "grad_norm": 0.44266904421962466, "learning_rate": 4.9666249478514814e-05, "loss": 0.4462, "step": 283 }, { "epoch": 0.3196398424310636, "grad_norm": 0.5723516835538393, "learning_rate": 4.964539007092199e-05, "loss": 0.4912, "step": 284 }, { "epoch": 0.3207653348339899, "grad_norm": 0.4268190071809671, "learning_rate": 4.962453066332917e-05, "loss": 0.4747, "step": 285 }, { "epoch": 0.32189082723691614, "grad_norm": 0.5119510744401135, "learning_rate": 4.960367125573634e-05, "loss": 0.4822, "step": 286 }, { "epoch": 0.32301631963984245, "grad_norm": 0.6845433911954164, "learning_rate": 4.9582811848143515e-05, "loss": 0.4784, "step": 287 }, { "epoch": 0.3241418120427687, "grad_norm": 0.5512193560031954, "learning_rate": 4.956195244055069e-05, "loss": 0.4652, "step": 288 }, { "epoch": 0.325267304445695, "grad_norm": 0.44176750937437276, "learning_rate": 4.954109303295786e-05, "loss": 0.4591, "step": 289 }, { "epoch": 0.32639279684862127, "grad_norm": 0.4802436465499188, "learning_rate": 4.952023362536504e-05, "loss": 0.4646, "step": 290 }, { "epoch": 0.3275182892515476, "grad_norm": 0.5130069834143126, "learning_rate": 4.9499374217772216e-05, "loss": 0.4632, "step": 291 }, { "epoch": 0.32864378165447383, "grad_norm": 0.3492902519652929, "learning_rate": 4.947851481017939e-05, "loss": 0.4479, "step": 292 }, { "epoch": 0.3297692740574001, "grad_norm": 0.44213997370659125, "learning_rate": 4.945765540258657e-05, "loss": 0.5066, "step": 293 }, { "epoch": 0.3308947664603264, "grad_norm": 0.42377286032567374, "learning_rate": 4.943679599499375e-05, "loss": 0.4302, "step": 294 }, { "epoch": 0.33202025886325265, "grad_norm": 0.394814830036649, "learning_rate": 4.941593658740092e-05, "loss": 0.4767, "step": 295 }, { "epoch": 0.33314575126617896, "grad_norm": 0.4318002819787604, "learning_rate": 4.9395077179808094e-05, "loss": 0.4672, "step": 296 }, { "epoch": 0.3342712436691052, "grad_norm": 0.4312528420970919, "learning_rate": 4.937421777221527e-05, "loss": 0.4301, "step": 297 }, { "epoch": 0.3353967360720315, "grad_norm": 0.4506557238069578, "learning_rate": 4.935335836462245e-05, "loss": 0.4799, "step": 298 }, { "epoch": 0.3365222284749578, "grad_norm": 0.4035767985622815, "learning_rate": 4.933249895702962e-05, "loss": 0.4685, "step": 299 }, { "epoch": 0.3376477208778841, "grad_norm": 0.3974620013501897, "learning_rate": 4.93116395494368e-05, "loss": 0.4597, "step": 300 }, { "epoch": 0.33877321328081034, "grad_norm": 0.4262423662726608, "learning_rate": 4.929078014184397e-05, "loss": 0.5115, "step": 301 }, { "epoch": 0.33989870568373665, "grad_norm": 0.46574622767948337, "learning_rate": 4.926992073425115e-05, "loss": 0.4633, "step": 302 }, { "epoch": 0.3410241980866629, "grad_norm": 0.3662096934434811, "learning_rate": 4.9249061326658326e-05, "loss": 0.459, "step": 303 }, { "epoch": 0.3421496904895892, "grad_norm": 0.4345723771900289, "learning_rate": 4.92282019190655e-05, "loss": 0.4912, "step": 304 }, { "epoch": 0.34327518289251546, "grad_norm": 0.43786190381782847, "learning_rate": 4.9207342511472674e-05, "loss": 0.4433, "step": 305 }, { "epoch": 0.34440067529544177, "grad_norm": 0.502359362326431, "learning_rate": 4.918648310387986e-05, "loss": 0.4656, "step": 306 }, { "epoch": 0.345526167698368, "grad_norm": 0.36663337654610195, "learning_rate": 4.916562369628703e-05, "loss": 0.4902, "step": 307 }, { "epoch": 0.34665166010129433, "grad_norm": 0.5430348378224088, "learning_rate": 4.9144764288694205e-05, "loss": 0.4762, "step": 308 }, { "epoch": 0.3477771525042206, "grad_norm": 0.5126322468885988, "learning_rate": 4.912390488110138e-05, "loss": 0.4537, "step": 309 }, { "epoch": 0.3489026449071469, "grad_norm": 0.5095172872961993, "learning_rate": 4.910304547350855e-05, "loss": 0.4741, "step": 310 }, { "epoch": 0.35002813731007315, "grad_norm": 0.48284683021884783, "learning_rate": 4.908218606591573e-05, "loss": 0.4726, "step": 311 }, { "epoch": 0.35115362971299946, "grad_norm": 0.489802898549214, "learning_rate": 4.9061326658322906e-05, "loss": 0.4686, "step": 312 }, { "epoch": 0.3522791221159257, "grad_norm": 0.39249156296212534, "learning_rate": 4.904046725073008e-05, "loss": 0.4695, "step": 313 }, { "epoch": 0.353404614518852, "grad_norm": 0.5323916362107045, "learning_rate": 4.901960784313725e-05, "loss": 0.4965, "step": 314 }, { "epoch": 0.3545301069217783, "grad_norm": 0.4533239040835354, "learning_rate": 4.899874843554444e-05, "loss": 0.4707, "step": 315 }, { "epoch": 0.3556555993247046, "grad_norm": 0.4206387502485126, "learning_rate": 4.897788902795161e-05, "loss": 0.4579, "step": 316 }, { "epoch": 0.35678109172763084, "grad_norm": 0.6269190751003352, "learning_rate": 4.8957029620358784e-05, "loss": 0.4693, "step": 317 }, { "epoch": 0.3579065841305571, "grad_norm": 0.5737738257462925, "learning_rate": 4.893617021276596e-05, "loss": 0.4618, "step": 318 }, { "epoch": 0.3590320765334834, "grad_norm": 0.4441915442589318, "learning_rate": 4.891531080517314e-05, "loss": 0.4873, "step": 319 }, { "epoch": 0.36015756893640966, "grad_norm": 0.4262860610042779, "learning_rate": 4.889445139758031e-05, "loss": 0.4878, "step": 320 }, { "epoch": 0.36128306133933596, "grad_norm": 0.4858584345693206, "learning_rate": 4.8873591989987485e-05, "loss": 0.4669, "step": 321 }, { "epoch": 0.3624085537422622, "grad_norm": 0.3534387563975198, "learning_rate": 4.885273258239466e-05, "loss": 0.4811, "step": 322 }, { "epoch": 0.3635340461451885, "grad_norm": 0.5271950734928447, "learning_rate": 4.883187317480184e-05, "loss": 0.4711, "step": 323 }, { "epoch": 0.3646595385481148, "grad_norm": 0.409281575876073, "learning_rate": 4.8811013767209016e-05, "loss": 0.4659, "step": 324 }, { "epoch": 0.3657850309510411, "grad_norm": 0.539894161794808, "learning_rate": 4.879015435961619e-05, "loss": 0.4617, "step": 325 }, { "epoch": 0.36691052335396734, "grad_norm": 0.5540340872215955, "learning_rate": 4.876929495202336e-05, "loss": 0.4488, "step": 326 }, { "epoch": 0.36803601575689365, "grad_norm": 0.5450024514176801, "learning_rate": 4.874843554443054e-05, "loss": 0.4443, "step": 327 }, { "epoch": 0.3691615081598199, "grad_norm": 0.4022020669388415, "learning_rate": 4.872757613683772e-05, "loss": 0.4447, "step": 328 }, { "epoch": 0.3702870005627462, "grad_norm": 0.5015493685184448, "learning_rate": 4.8706716729244894e-05, "loss": 0.4805, "step": 329 }, { "epoch": 0.37141249296567247, "grad_norm": 0.5205564906682338, "learning_rate": 4.8685857321652064e-05, "loss": 0.4727, "step": 330 }, { "epoch": 0.3725379853685988, "grad_norm": 0.48440539535018856, "learning_rate": 4.866499791405924e-05, "loss": 0.4789, "step": 331 }, { "epoch": 0.37366347777152503, "grad_norm": 0.5749114846228484, "learning_rate": 4.864413850646642e-05, "loss": 0.4452, "step": 332 }, { "epoch": 0.37478897017445134, "grad_norm": 0.5347894114064871, "learning_rate": 4.8623279098873595e-05, "loss": 0.466, "step": 333 }, { "epoch": 0.3759144625773776, "grad_norm": 0.5219184880333937, "learning_rate": 4.860241969128077e-05, "loss": 0.4297, "step": 334 }, { "epoch": 0.3770399549803039, "grad_norm": 0.4540713981584441, "learning_rate": 4.858156028368794e-05, "loss": 0.4716, "step": 335 }, { "epoch": 0.37816544738323016, "grad_norm": 0.65356051165203, "learning_rate": 4.856070087609512e-05, "loss": 0.4709, "step": 336 }, { "epoch": 0.37929093978615647, "grad_norm": 0.4310409303487337, "learning_rate": 4.8539841468502296e-05, "loss": 0.4931, "step": 337 }, { "epoch": 0.3804164321890827, "grad_norm": 0.7081733344034011, "learning_rate": 4.851898206090947e-05, "loss": 0.4787, "step": 338 }, { "epoch": 0.38154192459200903, "grad_norm": 0.4155006654801174, "learning_rate": 4.8498122653316644e-05, "loss": 0.4547, "step": 339 }, { "epoch": 0.3826674169949353, "grad_norm": 0.6789073369172818, "learning_rate": 4.847726324572383e-05, "loss": 0.4526, "step": 340 }, { "epoch": 0.38379290939786154, "grad_norm": 0.4700941252868989, "learning_rate": 4.8456403838131e-05, "loss": 0.4585, "step": 341 }, { "epoch": 0.38491840180078785, "grad_norm": 0.5721349237605509, "learning_rate": 4.8435544430538175e-05, "loss": 0.4305, "step": 342 }, { "epoch": 0.3860438942037141, "grad_norm": 0.6679337639014323, "learning_rate": 4.841468502294535e-05, "loss": 0.4727, "step": 343 }, { "epoch": 0.3871693866066404, "grad_norm": 0.6136310708197658, "learning_rate": 4.839382561535253e-05, "loss": 0.4632, "step": 344 }, { "epoch": 0.38829487900956666, "grad_norm": 0.560884620312814, "learning_rate": 4.83729662077597e-05, "loss": 0.4438, "step": 345 }, { "epoch": 0.38942037141249297, "grad_norm": 0.6098279474363337, "learning_rate": 4.835210680016688e-05, "loss": 0.4407, "step": 346 }, { "epoch": 0.3905458638154192, "grad_norm": 0.5154661466104475, "learning_rate": 4.833124739257405e-05, "loss": 0.446, "step": 347 }, { "epoch": 0.39167135621834553, "grad_norm": 0.7038644519573083, "learning_rate": 4.831038798498123e-05, "loss": 0.4646, "step": 348 }, { "epoch": 0.3927968486212718, "grad_norm": 0.5207244024602116, "learning_rate": 4.828952857738841e-05, "loss": 0.4476, "step": 349 }, { "epoch": 0.3939223410241981, "grad_norm": 0.5084794430652734, "learning_rate": 4.8268669169795584e-05, "loss": 0.4478, "step": 350 }, { "epoch": 0.39504783342712435, "grad_norm": 0.5120708559114392, "learning_rate": 4.8247809762202754e-05, "loss": 0.4622, "step": 351 }, { "epoch": 0.39617332583005066, "grad_norm": 0.6265345182325885, "learning_rate": 4.822695035460993e-05, "loss": 0.4706, "step": 352 }, { "epoch": 0.3972988182329769, "grad_norm": 0.5750184584832099, "learning_rate": 4.820609094701711e-05, "loss": 0.4882, "step": 353 }, { "epoch": 0.3984243106359032, "grad_norm": 0.5490268536187386, "learning_rate": 4.818523153942428e-05, "loss": 0.4601, "step": 354 }, { "epoch": 0.3995498030388295, "grad_norm": 0.5317594975111523, "learning_rate": 4.816437213183146e-05, "loss": 0.4587, "step": 355 }, { "epoch": 0.4006752954417558, "grad_norm": 0.49454469908724474, "learning_rate": 4.814351272423863e-05, "loss": 0.4559, "step": 356 }, { "epoch": 0.40180078784468204, "grad_norm": 0.5764930655203424, "learning_rate": 4.812265331664581e-05, "loss": 0.4677, "step": 357 }, { "epoch": 0.40292628024760835, "grad_norm": 0.4254928567571142, "learning_rate": 4.8101793909052986e-05, "loss": 0.4468, "step": 358 }, { "epoch": 0.4040517726505346, "grad_norm": 0.5563885643276592, "learning_rate": 4.808093450146016e-05, "loss": 0.4333, "step": 359 }, { "epoch": 0.4051772650534609, "grad_norm": 0.4150167393933345, "learning_rate": 4.806007509386733e-05, "loss": 0.4515, "step": 360 }, { "epoch": 0.40630275745638716, "grad_norm": 0.4751390092870927, "learning_rate": 4.803921568627452e-05, "loss": 0.4776, "step": 361 }, { "epoch": 0.4074282498593135, "grad_norm": 0.43506496230734293, "learning_rate": 4.801835627868169e-05, "loss": 0.4533, "step": 362 }, { "epoch": 0.4085537422622397, "grad_norm": 0.4488704280500811, "learning_rate": 4.7997496871088864e-05, "loss": 0.4555, "step": 363 }, { "epoch": 0.40967923466516604, "grad_norm": 0.4888805568606264, "learning_rate": 4.797663746349604e-05, "loss": 0.4529, "step": 364 }, { "epoch": 0.4108047270680923, "grad_norm": 0.47826665202586255, "learning_rate": 4.795577805590322e-05, "loss": 0.4565, "step": 365 }, { "epoch": 0.41193021947101854, "grad_norm": 0.427518205471488, "learning_rate": 4.793491864831039e-05, "loss": 0.4622, "step": 366 }, { "epoch": 0.41305571187394485, "grad_norm": 0.4657645019409657, "learning_rate": 4.7914059240717565e-05, "loss": 0.4731, "step": 367 }, { "epoch": 0.4141812042768711, "grad_norm": 0.4980798737202691, "learning_rate": 4.789319983312474e-05, "loss": 0.4496, "step": 368 }, { "epoch": 0.4153066966797974, "grad_norm": 0.4106122018359277, "learning_rate": 4.787234042553192e-05, "loss": 0.4537, "step": 369 }, { "epoch": 0.41643218908272367, "grad_norm": 0.46050479994884624, "learning_rate": 4.785148101793909e-05, "loss": 0.4434, "step": 370 }, { "epoch": 0.41755768148565, "grad_norm": 0.5221350337150601, "learning_rate": 4.783062161034627e-05, "loss": 0.458, "step": 371 }, { "epoch": 0.41868317388857623, "grad_norm": 0.44101630868691777, "learning_rate": 4.780976220275344e-05, "loss": 0.4614, "step": 372 }, { "epoch": 0.41980866629150254, "grad_norm": 0.5767546396305836, "learning_rate": 4.778890279516062e-05, "loss": 0.4829, "step": 373 }, { "epoch": 0.4209341586944288, "grad_norm": 0.47996101159798066, "learning_rate": 4.77680433875678e-05, "loss": 0.4569, "step": 374 }, { "epoch": 0.4220596510973551, "grad_norm": 0.5033820590275159, "learning_rate": 4.774718397997497e-05, "loss": 0.453, "step": 375 }, { "epoch": 0.42318514350028136, "grad_norm": 0.4218904797929267, "learning_rate": 4.7726324572382145e-05, "loss": 0.4936, "step": 376 }, { "epoch": 0.42431063590320767, "grad_norm": 0.41845341895601695, "learning_rate": 4.770546516478932e-05, "loss": 0.4371, "step": 377 }, { "epoch": 0.4254361283061339, "grad_norm": 0.38945322748226296, "learning_rate": 4.76846057571965e-05, "loss": 0.4391, "step": 378 }, { "epoch": 0.42656162070906023, "grad_norm": 0.4161185110299941, "learning_rate": 4.766374634960367e-05, "loss": 0.4485, "step": 379 }, { "epoch": 0.4276871131119865, "grad_norm": 0.3864477310593941, "learning_rate": 4.764288694201085e-05, "loss": 0.4543, "step": 380 }, { "epoch": 0.4288126055149128, "grad_norm": 0.48714053443872946, "learning_rate": 4.762202753441802e-05, "loss": 0.4809, "step": 381 }, { "epoch": 0.42993809791783905, "grad_norm": 0.44027599652018634, "learning_rate": 4.76011681268252e-05, "loss": 0.4664, "step": 382 }, { "epoch": 0.43106359032076536, "grad_norm": 0.46260149439173776, "learning_rate": 4.758030871923238e-05, "loss": 0.4358, "step": 383 }, { "epoch": 0.4321890827236916, "grad_norm": 0.37340303441017136, "learning_rate": 4.7559449311639554e-05, "loss": 0.4444, "step": 384 }, { "epoch": 0.4333145751266179, "grad_norm": 0.4414988549473453, "learning_rate": 4.7538589904046724e-05, "loss": 0.4338, "step": 385 }, { "epoch": 0.43444006752954417, "grad_norm": 0.4002550060272223, "learning_rate": 4.751773049645391e-05, "loss": 0.4494, "step": 386 }, { "epoch": 0.4355655599324705, "grad_norm": 0.4158146887262931, "learning_rate": 4.749687108886108e-05, "loss": 0.4454, "step": 387 }, { "epoch": 0.43669105233539673, "grad_norm": 0.35977608941282263, "learning_rate": 4.7476011681268255e-05, "loss": 0.4533, "step": 388 }, { "epoch": 0.43781654473832304, "grad_norm": 0.4764697673218214, "learning_rate": 4.745515227367543e-05, "loss": 0.4835, "step": 389 }, { "epoch": 0.4389420371412493, "grad_norm": 0.35081968018481574, "learning_rate": 4.743429286608261e-05, "loss": 0.4579, "step": 390 }, { "epoch": 0.44006752954417555, "grad_norm": 0.4170219497175011, "learning_rate": 4.741343345848978e-05, "loss": 0.4398, "step": 391 }, { "epoch": 0.44119302194710186, "grad_norm": 0.44456892241843987, "learning_rate": 4.739257405089696e-05, "loss": 0.4805, "step": 392 }, { "epoch": 0.4423185143500281, "grad_norm": 0.3810686961824828, "learning_rate": 4.737171464330413e-05, "loss": 0.4687, "step": 393 }, { "epoch": 0.4434440067529544, "grad_norm": 0.4459516182156416, "learning_rate": 4.73508552357113e-05, "loss": 0.4441, "step": 394 }, { "epoch": 0.4445694991558807, "grad_norm": 0.36574072948327524, "learning_rate": 4.732999582811849e-05, "loss": 0.4759, "step": 395 }, { "epoch": 0.445694991558807, "grad_norm": 0.46519670122225776, "learning_rate": 4.730913642052566e-05, "loss": 0.4622, "step": 396 }, { "epoch": 0.44682048396173324, "grad_norm": 0.3782284810519757, "learning_rate": 4.7288277012932834e-05, "loss": 0.4518, "step": 397 }, { "epoch": 0.44794597636465955, "grad_norm": 0.4321697226823169, "learning_rate": 4.726741760534001e-05, "loss": 0.4213, "step": 398 }, { "epoch": 0.4490714687675858, "grad_norm": 0.3846389059595841, "learning_rate": 4.724655819774719e-05, "loss": 0.4473, "step": 399 }, { "epoch": 0.4501969611705121, "grad_norm": 0.4148349323542458, "learning_rate": 4.722569879015436e-05, "loss": 0.447, "step": 400 }, { "epoch": 0.45132245357343836, "grad_norm": 0.3987423433461808, "learning_rate": 4.720483938256154e-05, "loss": 0.4428, "step": 401 }, { "epoch": 0.4524479459763647, "grad_norm": 0.42246987876628445, "learning_rate": 4.718397997496871e-05, "loss": 0.4456, "step": 402 }, { "epoch": 0.4535734383792909, "grad_norm": 0.4060448399568812, "learning_rate": 4.716312056737589e-05, "loss": 0.4734, "step": 403 }, { "epoch": 0.45469893078221724, "grad_norm": 0.38939419691921573, "learning_rate": 4.7142261159783066e-05, "loss": 0.4506, "step": 404 }, { "epoch": 0.4558244231851435, "grad_norm": 0.39441558158161155, "learning_rate": 4.712140175219024e-05, "loss": 0.4611, "step": 405 }, { "epoch": 0.4569499155880698, "grad_norm": 0.37043790127930454, "learning_rate": 4.7100542344597413e-05, "loss": 0.4446, "step": 406 }, { "epoch": 0.45807540799099605, "grad_norm": 0.39081323070794516, "learning_rate": 4.707968293700459e-05, "loss": 0.4586, "step": 407 }, { "epoch": 0.45920090039392236, "grad_norm": 0.38815613346341743, "learning_rate": 4.705882352941177e-05, "loss": 0.4731, "step": 408 }, { "epoch": 0.4603263927968486, "grad_norm": 0.4081757852974463, "learning_rate": 4.7037964121818944e-05, "loss": 0.4392, "step": 409 }, { "epoch": 0.4614518851997749, "grad_norm": 0.3789626983185206, "learning_rate": 4.7017104714226115e-05, "loss": 0.4561, "step": 410 }, { "epoch": 0.4625773776027012, "grad_norm": 0.5000455667230893, "learning_rate": 4.69962453066333e-05, "loss": 0.4642, "step": 411 }, { "epoch": 0.4637028700056275, "grad_norm": 0.3422337438589666, "learning_rate": 4.697538589904047e-05, "loss": 0.4592, "step": 412 }, { "epoch": 0.46482836240855374, "grad_norm": 0.5638947084171662, "learning_rate": 4.6954526491447646e-05, "loss": 0.451, "step": 413 }, { "epoch": 0.46595385481148, "grad_norm": 0.38536737227105394, "learning_rate": 4.693366708385482e-05, "loss": 0.4386, "step": 414 }, { "epoch": 0.4670793472144063, "grad_norm": 0.46615900085704925, "learning_rate": 4.691280767626199e-05, "loss": 0.4572, "step": 415 }, { "epoch": 0.46820483961733256, "grad_norm": 0.45954601145736806, "learning_rate": 4.689194826866917e-05, "loss": 0.4745, "step": 416 }, { "epoch": 0.46933033202025887, "grad_norm": 0.3925870159696147, "learning_rate": 4.687108886107635e-05, "loss": 0.4383, "step": 417 }, { "epoch": 0.4704558244231851, "grad_norm": 0.4232172013685177, "learning_rate": 4.6850229453483524e-05, "loss": 0.4455, "step": 418 }, { "epoch": 0.47158131682611143, "grad_norm": 0.4709258500108095, "learning_rate": 4.6829370045890694e-05, "loss": 0.4329, "step": 419 }, { "epoch": 0.4727068092290377, "grad_norm": 0.5478084541084817, "learning_rate": 4.680851063829788e-05, "loss": 0.4778, "step": 420 }, { "epoch": 0.473832301631964, "grad_norm": 0.39060968027365583, "learning_rate": 4.678765123070505e-05, "loss": 0.446, "step": 421 }, { "epoch": 0.47495779403489025, "grad_norm": 0.43252322301543766, "learning_rate": 4.6766791823112225e-05, "loss": 0.4606, "step": 422 }, { "epoch": 0.47608328643781656, "grad_norm": 0.48537861169690405, "learning_rate": 4.67459324155194e-05, "loss": 0.4845, "step": 423 }, { "epoch": 0.4772087788407428, "grad_norm": 0.34601404275255593, "learning_rate": 4.672507300792658e-05, "loss": 0.4357, "step": 424 }, { "epoch": 0.4783342712436691, "grad_norm": 0.42339913946057167, "learning_rate": 4.670421360033375e-05, "loss": 0.4421, "step": 425 }, { "epoch": 0.47945976364659537, "grad_norm": 0.39857659754496044, "learning_rate": 4.668335419274093e-05, "loss": 0.4448, "step": 426 }, { "epoch": 0.4805852560495217, "grad_norm": 0.38982322860737306, "learning_rate": 4.66624947851481e-05, "loss": 0.4449, "step": 427 }, { "epoch": 0.48171074845244793, "grad_norm": 0.4167533082716713, "learning_rate": 4.664163537755528e-05, "loss": 0.4538, "step": 428 }, { "epoch": 0.48283624085537424, "grad_norm": 0.38396785885791673, "learning_rate": 4.662077596996246e-05, "loss": 0.4665, "step": 429 }, { "epoch": 0.4839617332583005, "grad_norm": 0.4460443959564988, "learning_rate": 4.6599916562369634e-05, "loss": 0.4695, "step": 430 }, { "epoch": 0.4850872256612268, "grad_norm": 0.4307496077176856, "learning_rate": 4.6579057154776804e-05, "loss": 0.479, "step": 431 }, { "epoch": 0.48621271806415306, "grad_norm": 0.4703944597323029, "learning_rate": 4.655819774718399e-05, "loss": 0.4616, "step": 432 }, { "epoch": 0.48733821046707937, "grad_norm": 0.4532939669627873, "learning_rate": 4.653733833959116e-05, "loss": 0.4386, "step": 433 }, { "epoch": 0.4884637028700056, "grad_norm": 0.38992923384312006, "learning_rate": 4.651647893199833e-05, "loss": 0.4403, "step": 434 }, { "epoch": 0.48958919527293193, "grad_norm": 0.41316331078388, "learning_rate": 4.649561952440551e-05, "loss": 0.431, "step": 435 }, { "epoch": 0.4907146876758582, "grad_norm": 0.36589748301197256, "learning_rate": 4.647476011681268e-05, "loss": 0.4487, "step": 436 }, { "epoch": 0.4918401800787845, "grad_norm": 0.4790306414346754, "learning_rate": 4.645390070921986e-05, "loss": 0.465, "step": 437 }, { "epoch": 0.49296567248171075, "grad_norm": 0.3884802942940033, "learning_rate": 4.6433041301627036e-05, "loss": 0.4488, "step": 438 }, { "epoch": 0.494091164884637, "grad_norm": 0.43931802766911515, "learning_rate": 4.641218189403421e-05, "loss": 0.4844, "step": 439 }, { "epoch": 0.4952166572875633, "grad_norm": 0.35209964530255544, "learning_rate": 4.6391322486441383e-05, "loss": 0.4442, "step": 440 }, { "epoch": 0.49634214969048956, "grad_norm": 0.38004709563408534, "learning_rate": 4.637046307884857e-05, "loss": 0.4753, "step": 441 }, { "epoch": 0.4974676420934159, "grad_norm": 0.3409798351543027, "learning_rate": 4.634960367125574e-05, "loss": 0.4342, "step": 442 }, { "epoch": 0.4985931344963421, "grad_norm": 0.39326837683822974, "learning_rate": 4.6328744263662914e-05, "loss": 0.4539, "step": 443 }, { "epoch": 0.49971862689926844, "grad_norm": 0.34187980768631865, "learning_rate": 4.630788485607009e-05, "loss": 0.4551, "step": 444 }, { "epoch": 0.5008441193021947, "grad_norm": 0.3788406979843315, "learning_rate": 4.628702544847727e-05, "loss": 0.4536, "step": 445 }, { "epoch": 0.501969611705121, "grad_norm": 0.37725539074157804, "learning_rate": 4.626616604088444e-05, "loss": 0.4391, "step": 446 }, { "epoch": 0.5030951041080473, "grad_norm": 0.3294085025027009, "learning_rate": 4.6245306633291616e-05, "loss": 0.4702, "step": 447 }, { "epoch": 0.5042205965109735, "grad_norm": 0.33534954747479645, "learning_rate": 4.622444722569879e-05, "loss": 0.437, "step": 448 }, { "epoch": 0.5053460889138999, "grad_norm": 0.4077362372846078, "learning_rate": 4.620358781810597e-05, "loss": 0.4335, "step": 449 }, { "epoch": 0.5064715813168261, "grad_norm": 0.39599735165417416, "learning_rate": 4.618272841051314e-05, "loss": 0.4428, "step": 450 }, { "epoch": 0.5075970737197524, "grad_norm": 0.3481976377046397, "learning_rate": 4.6161869002920323e-05, "loss": 0.4344, "step": 451 }, { "epoch": 0.5087225661226786, "grad_norm": 0.35576918023033427, "learning_rate": 4.6141009595327494e-05, "loss": 0.4343, "step": 452 }, { "epoch": 0.509848058525605, "grad_norm": 0.4458349270928396, "learning_rate": 4.612015018773467e-05, "loss": 0.4418, "step": 453 }, { "epoch": 0.5109735509285313, "grad_norm": 0.39940538094885114, "learning_rate": 4.609929078014185e-05, "loss": 0.4466, "step": 454 }, { "epoch": 0.5120990433314575, "grad_norm": 0.401792941347298, "learning_rate": 4.607843137254902e-05, "loss": 0.4317, "step": 455 }, { "epoch": 0.5132245357343838, "grad_norm": 0.3570336540962956, "learning_rate": 4.6057571964956195e-05, "loss": 0.4116, "step": 456 }, { "epoch": 0.5143500281373101, "grad_norm": 0.4132726931400482, "learning_rate": 4.603671255736337e-05, "loss": 0.4451, "step": 457 }, { "epoch": 0.5154755205402364, "grad_norm": 0.3450781738437834, "learning_rate": 4.601585314977055e-05, "loss": 0.4245, "step": 458 }, { "epoch": 0.5166010129431626, "grad_norm": 0.4044295769667828, "learning_rate": 4.599499374217772e-05, "loss": 0.4261, "step": 459 }, { "epoch": 0.5177265053460889, "grad_norm": 0.4460134360979799, "learning_rate": 4.59741343345849e-05, "loss": 0.4569, "step": 460 }, { "epoch": 0.5188519977490152, "grad_norm": 0.35709408492200145, "learning_rate": 4.595327492699207e-05, "loss": 0.447, "step": 461 }, { "epoch": 0.5199774901519415, "grad_norm": 0.49622852545171614, "learning_rate": 4.593241551939925e-05, "loss": 0.4482, "step": 462 }, { "epoch": 0.5211029825548678, "grad_norm": 0.43774205674931815, "learning_rate": 4.591155611180643e-05, "loss": 0.4447, "step": 463 }, { "epoch": 0.522228474957794, "grad_norm": 0.4071440395299347, "learning_rate": 4.5890696704213604e-05, "loss": 0.428, "step": 464 }, { "epoch": 0.5233539673607203, "grad_norm": 0.42816990064501337, "learning_rate": 4.5869837296620774e-05, "loss": 0.4264, "step": 465 }, { "epoch": 0.5244794597636466, "grad_norm": 0.4003821811209746, "learning_rate": 4.584897788902796e-05, "loss": 0.4502, "step": 466 }, { "epoch": 0.5256049521665729, "grad_norm": 0.4306225774416482, "learning_rate": 4.582811848143513e-05, "loss": 0.448, "step": 467 }, { "epoch": 0.5267304445694991, "grad_norm": 0.438000243330178, "learning_rate": 4.5807259073842305e-05, "loss": 0.4461, "step": 468 }, { "epoch": 0.5278559369724254, "grad_norm": 0.5459912564891531, "learning_rate": 4.578639966624948e-05, "loss": 0.4454, "step": 469 }, { "epoch": 0.5289814293753518, "grad_norm": 0.3937540607846447, "learning_rate": 4.576554025865666e-05, "loss": 0.4511, "step": 470 }, { "epoch": 0.530106921778278, "grad_norm": 0.5255901368328048, "learning_rate": 4.574468085106383e-05, "loss": 0.4656, "step": 471 }, { "epoch": 0.5312324141812043, "grad_norm": 0.37290888598540667, "learning_rate": 4.572382144347101e-05, "loss": 0.4346, "step": 472 }, { "epoch": 0.5323579065841305, "grad_norm": 0.5151271720318875, "learning_rate": 4.570296203587818e-05, "loss": 0.4386, "step": 473 }, { "epoch": 0.5334833989870569, "grad_norm": 0.5196455814853196, "learning_rate": 4.568210262828536e-05, "loss": 0.4281, "step": 474 }, { "epoch": 0.5346088913899831, "grad_norm": 0.5516694216088329, "learning_rate": 4.566124322069254e-05, "loss": 0.4678, "step": 475 }, { "epoch": 0.5357343837929094, "grad_norm": 0.40935239231865317, "learning_rate": 4.564038381309971e-05, "loss": 0.4392, "step": 476 }, { "epoch": 0.5368598761958356, "grad_norm": 0.4232251188780467, "learning_rate": 4.5619524405506884e-05, "loss": 0.4541, "step": 477 }, { "epoch": 0.537985368598762, "grad_norm": 0.47065714592515695, "learning_rate": 4.559866499791406e-05, "loss": 0.4573, "step": 478 }, { "epoch": 0.5391108610016883, "grad_norm": 0.45139662325934604, "learning_rate": 4.557780559032124e-05, "loss": 0.466, "step": 479 }, { "epoch": 0.5402363534046145, "grad_norm": 0.43277954798040297, "learning_rate": 4.555694618272841e-05, "loss": 0.4395, "step": 480 }, { "epoch": 0.5413618458075408, "grad_norm": 0.3937402339467187, "learning_rate": 4.553608677513559e-05, "loss": 0.4598, "step": 481 }, { "epoch": 0.5424873382104671, "grad_norm": 0.43149504891962365, "learning_rate": 4.551522736754276e-05, "loss": 0.4292, "step": 482 }, { "epoch": 0.5436128306133934, "grad_norm": 0.3833426447527127, "learning_rate": 4.549436795994994e-05, "loss": 0.4462, "step": 483 }, { "epoch": 0.5447383230163196, "grad_norm": 0.5753891830767618, "learning_rate": 4.5473508552357116e-05, "loss": 0.4674, "step": 484 }, { "epoch": 0.5458638154192459, "grad_norm": 0.37095342775133894, "learning_rate": 4.5452649144764293e-05, "loss": 0.4502, "step": 485 }, { "epoch": 0.5469893078221723, "grad_norm": 0.44452090514956777, "learning_rate": 4.5431789737171464e-05, "loss": 0.4195, "step": 486 }, { "epoch": 0.5481148002250985, "grad_norm": 0.39266318915026655, "learning_rate": 4.541093032957864e-05, "loss": 0.4308, "step": 487 }, { "epoch": 0.5492402926280248, "grad_norm": 0.38579575811998595, "learning_rate": 4.539007092198582e-05, "loss": 0.4732, "step": 488 }, { "epoch": 0.550365785030951, "grad_norm": 0.3927716846528752, "learning_rate": 4.5369211514392995e-05, "loss": 0.4554, "step": 489 }, { "epoch": 0.5514912774338773, "grad_norm": 0.3518677438969378, "learning_rate": 4.5348352106800165e-05, "loss": 0.435, "step": 490 }, { "epoch": 0.5526167698368036, "grad_norm": 0.3989470078182982, "learning_rate": 4.532749269920735e-05, "loss": 0.4581, "step": 491 }, { "epoch": 0.5537422622397299, "grad_norm": 0.31132596342015495, "learning_rate": 4.530663329161452e-05, "loss": 0.4166, "step": 492 }, { "epoch": 0.5548677546426561, "grad_norm": 0.42773832735938333, "learning_rate": 4.5285773884021696e-05, "loss": 0.4498, "step": 493 }, { "epoch": 0.5559932470455824, "grad_norm": 0.3337455720428052, "learning_rate": 4.526491447642887e-05, "loss": 0.4959, "step": 494 }, { "epoch": 0.5571187394485088, "grad_norm": 0.3784028479466481, "learning_rate": 4.524405506883605e-05, "loss": 0.4528, "step": 495 }, { "epoch": 0.558244231851435, "grad_norm": 0.3649036934635355, "learning_rate": 4.522319566124322e-05, "loss": 0.4382, "step": 496 }, { "epoch": 0.5593697242543613, "grad_norm": 0.37624738124672374, "learning_rate": 4.52023362536504e-05, "loss": 0.444, "step": 497 }, { "epoch": 0.5604952166572875, "grad_norm": 0.41375280657115326, "learning_rate": 4.5181476846057574e-05, "loss": 0.445, "step": 498 }, { "epoch": 0.5616207090602139, "grad_norm": 0.4473059694404265, "learning_rate": 4.5160617438464744e-05, "loss": 0.425, "step": 499 }, { "epoch": 0.5627462014631401, "grad_norm": 0.37225084914483775, "learning_rate": 4.513975803087193e-05, "loss": 0.441, "step": 500 }, { "epoch": 0.5638716938660664, "grad_norm": 0.3940588853331884, "learning_rate": 4.51188986232791e-05, "loss": 0.4466, "step": 501 }, { "epoch": 0.5649971862689926, "grad_norm": 0.3470104737718654, "learning_rate": 4.5098039215686275e-05, "loss": 0.4474, "step": 502 }, { "epoch": 0.566122678671919, "grad_norm": 0.4164834773144051, "learning_rate": 4.507717980809345e-05, "loss": 0.4442, "step": 503 }, { "epoch": 0.5672481710748453, "grad_norm": 0.3652420299854053, "learning_rate": 4.505632040050063e-05, "loss": 0.4436, "step": 504 }, { "epoch": 0.5683736634777715, "grad_norm": 0.4103075119748004, "learning_rate": 4.50354609929078e-05, "loss": 0.4459, "step": 505 }, { "epoch": 0.5694991558806978, "grad_norm": 0.39102170524673335, "learning_rate": 4.501460158531498e-05, "loss": 0.4268, "step": 506 }, { "epoch": 0.5706246482836241, "grad_norm": 0.4942727267066722, "learning_rate": 4.499374217772215e-05, "loss": 0.4722, "step": 507 }, { "epoch": 0.5717501406865504, "grad_norm": 0.3465319015459766, "learning_rate": 4.497288277012933e-05, "loss": 0.4408, "step": 508 }, { "epoch": 0.5728756330894766, "grad_norm": 0.4074806411985911, "learning_rate": 4.495202336253651e-05, "loss": 0.4212, "step": 509 }, { "epoch": 0.5740011254924029, "grad_norm": 0.38192085376045243, "learning_rate": 4.4931163954943684e-05, "loss": 0.41, "step": 510 }, { "epoch": 0.5751266178953293, "grad_norm": 0.3702590158057979, "learning_rate": 4.4910304547350854e-05, "loss": 0.4255, "step": 511 }, { "epoch": 0.5762521102982555, "grad_norm": 0.3436403538534127, "learning_rate": 4.488944513975804e-05, "loss": 0.4538, "step": 512 }, { "epoch": 0.5773776027011818, "grad_norm": 0.3877342893162592, "learning_rate": 4.486858573216521e-05, "loss": 0.4182, "step": 513 }, { "epoch": 0.578503095104108, "grad_norm": 0.3460201187876074, "learning_rate": 4.4847726324572385e-05, "loss": 0.4432, "step": 514 }, { "epoch": 0.5796285875070343, "grad_norm": 0.34511398785310915, "learning_rate": 4.482686691697956e-05, "loss": 0.4469, "step": 515 }, { "epoch": 0.5807540799099606, "grad_norm": 0.4258487344474797, "learning_rate": 4.480600750938674e-05, "loss": 0.4583, "step": 516 }, { "epoch": 0.5818795723128869, "grad_norm": 0.36803297271961477, "learning_rate": 4.478514810179391e-05, "loss": 0.4428, "step": 517 }, { "epoch": 0.5830050647158131, "grad_norm": 0.46401852203645827, "learning_rate": 4.4764288694201086e-05, "loss": 0.4456, "step": 518 }, { "epoch": 0.5841305571187394, "grad_norm": 0.39205048802946624, "learning_rate": 4.4743429286608263e-05, "loss": 0.441, "step": 519 }, { "epoch": 0.5852560495216658, "grad_norm": 0.39757611365031714, "learning_rate": 4.4722569879015434e-05, "loss": 0.4428, "step": 520 }, { "epoch": 0.586381541924592, "grad_norm": 0.3647536671953435, "learning_rate": 4.470171047142262e-05, "loss": 0.4513, "step": 521 }, { "epoch": 0.5875070343275183, "grad_norm": 0.39429072510874175, "learning_rate": 4.468085106382979e-05, "loss": 0.4132, "step": 522 }, { "epoch": 0.5886325267304445, "grad_norm": 0.40901642747342404, "learning_rate": 4.4659991656236965e-05, "loss": 0.4481, "step": 523 }, { "epoch": 0.5897580191333709, "grad_norm": 0.3992749524524198, "learning_rate": 4.463913224864414e-05, "loss": 0.4468, "step": 524 }, { "epoch": 0.5908835115362971, "grad_norm": 0.4722275927889856, "learning_rate": 4.461827284105132e-05, "loss": 0.4428, "step": 525 }, { "epoch": 0.5920090039392234, "grad_norm": 0.42866183958875864, "learning_rate": 4.459741343345849e-05, "loss": 0.4194, "step": 526 }, { "epoch": 0.5931344963421497, "grad_norm": 0.38204868156886707, "learning_rate": 4.4576554025865666e-05, "loss": 0.4402, "step": 527 }, { "epoch": 0.594259988745076, "grad_norm": 0.35148215802167393, "learning_rate": 4.455569461827284e-05, "loss": 0.4542, "step": 528 }, { "epoch": 0.5953854811480023, "grad_norm": 0.40153400690617524, "learning_rate": 4.453483521068002e-05, "loss": 0.4102, "step": 529 }, { "epoch": 0.5965109735509285, "grad_norm": 0.46986899886821576, "learning_rate": 4.45139758030872e-05, "loss": 0.4436, "step": 530 }, { "epoch": 0.5976364659538548, "grad_norm": 0.35390475462960685, "learning_rate": 4.4493116395494374e-05, "loss": 0.4398, "step": 531 }, { "epoch": 0.5987619583567811, "grad_norm": 0.4482185977258061, "learning_rate": 4.4472256987901544e-05, "loss": 0.4326, "step": 532 }, { "epoch": 0.5998874507597074, "grad_norm": 0.44232865264761434, "learning_rate": 4.445139758030872e-05, "loss": 0.4325, "step": 533 }, { "epoch": 0.6010129431626337, "grad_norm": 0.4183843016810463, "learning_rate": 4.44305381727159e-05, "loss": 0.4553, "step": 534 }, { "epoch": 0.6021384355655599, "grad_norm": 0.4242250812536985, "learning_rate": 4.4409678765123075e-05, "loss": 0.4232, "step": 535 }, { "epoch": 0.6032639279684862, "grad_norm": 0.3888142076123292, "learning_rate": 4.4388819357530245e-05, "loss": 0.4241, "step": 536 }, { "epoch": 0.6043894203714125, "grad_norm": 0.40486855004609845, "learning_rate": 4.436795994993743e-05, "loss": 0.4191, "step": 537 }, { "epoch": 0.6055149127743388, "grad_norm": 0.47154131963320084, "learning_rate": 4.43471005423446e-05, "loss": 0.4595, "step": 538 }, { "epoch": 0.606640405177265, "grad_norm": 0.38490507840256083, "learning_rate": 4.432624113475177e-05, "loss": 0.4199, "step": 539 }, { "epoch": 0.6077658975801913, "grad_norm": 0.46096486506497264, "learning_rate": 4.430538172715895e-05, "loss": 0.4448, "step": 540 }, { "epoch": 0.6088913899831176, "grad_norm": 0.4947895759240074, "learning_rate": 4.428452231956612e-05, "loss": 0.4342, "step": 541 }, { "epoch": 0.6100168823860439, "grad_norm": 0.3829854963511767, "learning_rate": 4.42636629119733e-05, "loss": 0.4186, "step": 542 }, { "epoch": 0.6111423747889702, "grad_norm": 0.6245507343869451, "learning_rate": 4.424280350438048e-05, "loss": 0.441, "step": 543 }, { "epoch": 0.6122678671918964, "grad_norm": 0.5300235385565563, "learning_rate": 4.4221944096787654e-05, "loss": 0.4375, "step": 544 }, { "epoch": 0.6133933595948228, "grad_norm": 0.4930881980261961, "learning_rate": 4.4201084689194824e-05, "loss": 0.4621, "step": 545 }, { "epoch": 0.614518851997749, "grad_norm": 0.5638424830870375, "learning_rate": 4.418022528160201e-05, "loss": 0.4411, "step": 546 }, { "epoch": 0.6156443444006753, "grad_norm": 0.3716115037856444, "learning_rate": 4.415936587400918e-05, "loss": 0.4528, "step": 547 }, { "epoch": 0.6167698368036015, "grad_norm": 0.5223401927324024, "learning_rate": 4.4138506466416355e-05, "loss": 0.4327, "step": 548 }, { "epoch": 0.6178953292065279, "grad_norm": 0.37311721165933265, "learning_rate": 4.411764705882353e-05, "loss": 0.4058, "step": 549 }, { "epoch": 0.6190208216094542, "grad_norm": 0.532332931429002, "learning_rate": 4.409678765123071e-05, "loss": 0.4445, "step": 550 }, { "epoch": 0.6201463140123804, "grad_norm": 0.5059754011866813, "learning_rate": 4.407592824363788e-05, "loss": 0.4257, "step": 551 }, { "epoch": 0.6212718064153067, "grad_norm": 0.4904818838015066, "learning_rate": 4.405506883604506e-05, "loss": 0.4353, "step": 552 }, { "epoch": 0.622397298818233, "grad_norm": 0.6200335434273374, "learning_rate": 4.4034209428452233e-05, "loss": 0.4416, "step": 553 }, { "epoch": 0.6235227912211593, "grad_norm": 0.3199203022808196, "learning_rate": 4.401335002085941e-05, "loss": 0.4355, "step": 554 }, { "epoch": 0.6246482836240855, "grad_norm": 0.5681807784529108, "learning_rate": 4.399249061326659e-05, "loss": 0.431, "step": 555 }, { "epoch": 0.6257737760270118, "grad_norm": 0.3995337627796738, "learning_rate": 4.3971631205673764e-05, "loss": 0.4312, "step": 556 }, { "epoch": 0.6268992684299382, "grad_norm": 0.5466993132659691, "learning_rate": 4.3950771798080935e-05, "loss": 0.4311, "step": 557 }, { "epoch": 0.6280247608328644, "grad_norm": 0.5670240814298136, "learning_rate": 4.392991239048811e-05, "loss": 0.4564, "step": 558 }, { "epoch": 0.6291502532357907, "grad_norm": 0.47107566738859724, "learning_rate": 4.390905298289529e-05, "loss": 0.4436, "step": 559 }, { "epoch": 0.6302757456387169, "grad_norm": 0.5380491861675493, "learning_rate": 4.388819357530246e-05, "loss": 0.4024, "step": 560 }, { "epoch": 0.6314012380416432, "grad_norm": 0.37407644137036594, "learning_rate": 4.386733416770964e-05, "loss": 0.4276, "step": 561 }, { "epoch": 0.6325267304445695, "grad_norm": 0.5179459476960132, "learning_rate": 4.384647476011681e-05, "loss": 0.4231, "step": 562 }, { "epoch": 0.6336522228474958, "grad_norm": 0.3832305989594554, "learning_rate": 4.382561535252399e-05, "loss": 0.4254, "step": 563 }, { "epoch": 0.634777715250422, "grad_norm": 0.48824132268901227, "learning_rate": 4.380475594493117e-05, "loss": 0.4414, "step": 564 }, { "epoch": 0.6359032076533483, "grad_norm": 0.45846104242587143, "learning_rate": 4.3783896537338344e-05, "loss": 0.4374, "step": 565 }, { "epoch": 0.6370287000562747, "grad_norm": 0.5017380646906237, "learning_rate": 4.3763037129745514e-05, "loss": 0.4478, "step": 566 }, { "epoch": 0.6381541924592009, "grad_norm": 0.4706523687823463, "learning_rate": 4.374217772215269e-05, "loss": 0.4393, "step": 567 }, { "epoch": 0.6392796848621272, "grad_norm": 0.43746034371341663, "learning_rate": 4.372131831455987e-05, "loss": 0.4289, "step": 568 }, { "epoch": 0.6404051772650534, "grad_norm": 0.4971311348473273, "learning_rate": 4.3700458906967045e-05, "loss": 0.4632, "step": 569 }, { "epoch": 0.6415306696679798, "grad_norm": 0.32424868625443787, "learning_rate": 4.367959949937422e-05, "loss": 0.4439, "step": 570 }, { "epoch": 0.642656162070906, "grad_norm": 0.5530000470387829, "learning_rate": 4.36587400917814e-05, "loss": 0.4438, "step": 571 }, { "epoch": 0.6437816544738323, "grad_norm": 0.3619983421314401, "learning_rate": 4.363788068418857e-05, "loss": 0.4193, "step": 572 }, { "epoch": 0.6449071468767585, "grad_norm": 0.46202193194933755, "learning_rate": 4.3617021276595746e-05, "loss": 0.4308, "step": 573 }, { "epoch": 0.6460326392796849, "grad_norm": 0.4798799400653708, "learning_rate": 4.359616186900292e-05, "loss": 0.4072, "step": 574 }, { "epoch": 0.6471581316826112, "grad_norm": 0.42761886423074474, "learning_rate": 4.35753024614101e-05, "loss": 0.4357, "step": 575 }, { "epoch": 0.6482836240855374, "grad_norm": 0.4906300910854437, "learning_rate": 4.355444305381727e-05, "loss": 0.441, "step": 576 }, { "epoch": 0.6494091164884637, "grad_norm": 0.4312074811449326, "learning_rate": 4.3533583646224454e-05, "loss": 0.468, "step": 577 }, { "epoch": 0.65053460889139, "grad_norm": 0.4999437976070137, "learning_rate": 4.3512724238631624e-05, "loss": 0.4442, "step": 578 }, { "epoch": 0.6516601012943163, "grad_norm": 0.45200142374904256, "learning_rate": 4.34918648310388e-05, "loss": 0.4351, "step": 579 }, { "epoch": 0.6527855936972425, "grad_norm": 0.4481417460480344, "learning_rate": 4.347100542344598e-05, "loss": 0.4351, "step": 580 }, { "epoch": 0.6539110861001688, "grad_norm": 0.416680484799885, "learning_rate": 4.345014601585315e-05, "loss": 0.4726, "step": 581 }, { "epoch": 0.6550365785030952, "grad_norm": 0.45466741269285743, "learning_rate": 4.3429286608260325e-05, "loss": 0.4445, "step": 582 }, { "epoch": 0.6561620709060214, "grad_norm": 0.3767132482639794, "learning_rate": 4.34084272006675e-05, "loss": 0.4494, "step": 583 }, { "epoch": 0.6572875633089477, "grad_norm": 0.4045713565741537, "learning_rate": 4.338756779307468e-05, "loss": 0.4478, "step": 584 }, { "epoch": 0.6584130557118739, "grad_norm": 0.41406546702832436, "learning_rate": 4.336670838548185e-05, "loss": 0.4296, "step": 585 }, { "epoch": 0.6595385481148002, "grad_norm": 0.45192122020443987, "learning_rate": 4.334584897788903e-05, "loss": 0.4548, "step": 586 }, { "epoch": 0.6606640405177265, "grad_norm": 0.42522165235824544, "learning_rate": 4.3324989570296203e-05, "loss": 0.4545, "step": 587 }, { "epoch": 0.6617895329206528, "grad_norm": 0.4025019554306989, "learning_rate": 4.330413016270338e-05, "loss": 0.427, "step": 588 }, { "epoch": 0.662915025323579, "grad_norm": 0.40092550396367915, "learning_rate": 4.328327075511056e-05, "loss": 0.4357, "step": 589 }, { "epoch": 0.6640405177265053, "grad_norm": 0.4029073566780126, "learning_rate": 4.3262411347517734e-05, "loss": 0.4437, "step": 590 }, { "epoch": 0.6651660101294317, "grad_norm": 0.3754421567640776, "learning_rate": 4.3241551939924905e-05, "loss": 0.4488, "step": 591 }, { "epoch": 0.6662915025323579, "grad_norm": 0.4093131149759515, "learning_rate": 4.322069253233209e-05, "loss": 0.4296, "step": 592 }, { "epoch": 0.6674169949352842, "grad_norm": 0.37396980661829454, "learning_rate": 4.319983312473926e-05, "loss": 0.4135, "step": 593 }, { "epoch": 0.6685424873382104, "grad_norm": 0.39676170583430237, "learning_rate": 4.3178973717146436e-05, "loss": 0.4407, "step": 594 }, { "epoch": 0.6696679797411368, "grad_norm": 0.3324304115520877, "learning_rate": 4.315811430955361e-05, "loss": 0.4272, "step": 595 }, { "epoch": 0.670793472144063, "grad_norm": 0.41321744590045745, "learning_rate": 4.313725490196079e-05, "loss": 0.4535, "step": 596 }, { "epoch": 0.6719189645469893, "grad_norm": 0.37423186701221084, "learning_rate": 4.311639549436796e-05, "loss": 0.4243, "step": 597 }, { "epoch": 0.6730444569499155, "grad_norm": 0.34707644350816663, "learning_rate": 4.309553608677514e-05, "loss": 0.4224, "step": 598 }, { "epoch": 0.6741699493528419, "grad_norm": 0.39162388441219653, "learning_rate": 4.3074676679182314e-05, "loss": 0.4117, "step": 599 }, { "epoch": 0.6752954417557682, "grad_norm": 0.3757134091896751, "learning_rate": 4.305381727158949e-05, "loss": 0.4372, "step": 600 }, { "epoch": 0.6764209341586944, "grad_norm": 0.486157183762819, "learning_rate": 4.303295786399667e-05, "loss": 0.4487, "step": 601 }, { "epoch": 0.6775464265616207, "grad_norm": 0.34615222028756854, "learning_rate": 4.301209845640384e-05, "loss": 0.438, "step": 602 }, { "epoch": 0.678671918964547, "grad_norm": 0.4148015924613456, "learning_rate": 4.2991239048811015e-05, "loss": 0.4545, "step": 603 }, { "epoch": 0.6797974113674733, "grad_norm": 0.3870669252002002, "learning_rate": 4.297037964121819e-05, "loss": 0.4078, "step": 604 }, { "epoch": 0.6809229037703995, "grad_norm": 0.31630147919989027, "learning_rate": 4.294952023362537e-05, "loss": 0.4179, "step": 605 }, { "epoch": 0.6820483961733258, "grad_norm": 0.4078672238404797, "learning_rate": 4.292866082603254e-05, "loss": 0.4363, "step": 606 }, { "epoch": 0.6831738885762522, "grad_norm": 0.38181818903469905, "learning_rate": 4.2907801418439716e-05, "loss": 0.4387, "step": 607 }, { "epoch": 0.6842993809791784, "grad_norm": 0.40887483819289494, "learning_rate": 4.288694201084689e-05, "loss": 0.4279, "step": 608 }, { "epoch": 0.6854248733821047, "grad_norm": 0.45835023477255316, "learning_rate": 4.286608260325407e-05, "loss": 0.4553, "step": 609 }, { "epoch": 0.6865503657850309, "grad_norm": 0.4496240755238681, "learning_rate": 4.284522319566125e-05, "loss": 0.4511, "step": 610 }, { "epoch": 0.6876758581879572, "grad_norm": 0.47923459811565877, "learning_rate": 4.2824363788068424e-05, "loss": 0.4494, "step": 611 }, { "epoch": 0.6888013505908835, "grad_norm": 0.4563499971704832, "learning_rate": 4.2803504380475594e-05, "loss": 0.4498, "step": 612 }, { "epoch": 0.6899268429938098, "grad_norm": 0.4658484510143094, "learning_rate": 4.278264497288277e-05, "loss": 0.446, "step": 613 }, { "epoch": 0.691052335396736, "grad_norm": 0.40099697936257683, "learning_rate": 4.276178556528995e-05, "loss": 0.4138, "step": 614 }, { "epoch": 0.6921778277996623, "grad_norm": 0.40681610293383885, "learning_rate": 4.2740926157697125e-05, "loss": 0.4428, "step": 615 }, { "epoch": 0.6933033202025887, "grad_norm": 0.492856289321406, "learning_rate": 4.2720066750104295e-05, "loss": 0.429, "step": 616 }, { "epoch": 0.6944288126055149, "grad_norm": 0.40198116454411964, "learning_rate": 4.269920734251148e-05, "loss": 0.4319, "step": 617 }, { "epoch": 0.6955543050084412, "grad_norm": 0.4049661414838683, "learning_rate": 4.267834793491865e-05, "loss": 0.4371, "step": 618 }, { "epoch": 0.6966797974113674, "grad_norm": 0.4200912676835283, "learning_rate": 4.2657488527325826e-05, "loss": 0.4273, "step": 619 }, { "epoch": 0.6978052898142938, "grad_norm": 0.3579260644405867, "learning_rate": 4.2636629119733e-05, "loss": 0.436, "step": 620 }, { "epoch": 0.69893078221722, "grad_norm": 0.41261145773033614, "learning_rate": 4.261576971214018e-05, "loss": 0.4355, "step": 621 }, { "epoch": 0.7000562746201463, "grad_norm": 0.38195673870959623, "learning_rate": 4.259491030454735e-05, "loss": 0.4407, "step": 622 }, { "epoch": 0.7011817670230726, "grad_norm": 0.47251318617526117, "learning_rate": 4.257405089695453e-05, "loss": 0.4434, "step": 623 }, { "epoch": 0.7023072594259989, "grad_norm": 0.413024502756469, "learning_rate": 4.2553191489361704e-05, "loss": 0.4228, "step": 624 }, { "epoch": 0.7034327518289252, "grad_norm": 0.4129659836054336, "learning_rate": 4.2532332081768875e-05, "loss": 0.4298, "step": 625 }, { "epoch": 0.7045582442318514, "grad_norm": 0.4371192692750543, "learning_rate": 4.251147267417606e-05, "loss": 0.422, "step": 626 }, { "epoch": 0.7056837366347777, "grad_norm": 0.3209464880480147, "learning_rate": 4.249061326658323e-05, "loss": 0.4159, "step": 627 }, { "epoch": 0.706809229037704, "grad_norm": 0.38213551742408286, "learning_rate": 4.2469753858990406e-05, "loss": 0.4651, "step": 628 }, { "epoch": 0.7079347214406303, "grad_norm": 0.37077014672780895, "learning_rate": 4.244889445139758e-05, "loss": 0.428, "step": 629 }, { "epoch": 0.7090602138435566, "grad_norm": 0.37388919361570394, "learning_rate": 4.242803504380476e-05, "loss": 0.4487, "step": 630 }, { "epoch": 0.7101857062464828, "grad_norm": 0.355919224811824, "learning_rate": 4.240717563621193e-05, "loss": 0.4273, "step": 631 }, { "epoch": 0.7113111986494092, "grad_norm": 0.3479874917806637, "learning_rate": 4.2386316228619114e-05, "loss": 0.4298, "step": 632 }, { "epoch": 0.7124366910523354, "grad_norm": 0.39097161117850043, "learning_rate": 4.2365456821026284e-05, "loss": 0.4251, "step": 633 }, { "epoch": 0.7135621834552617, "grad_norm": 0.39131656322095426, "learning_rate": 4.234459741343346e-05, "loss": 0.4261, "step": 634 }, { "epoch": 0.7146876758581879, "grad_norm": 1.9337556498338822, "learning_rate": 4.232373800584064e-05, "loss": 0.4553, "step": 635 }, { "epoch": 0.7158131682611142, "grad_norm": 0.7715880476594418, "learning_rate": 4.2302878598247815e-05, "loss": 0.4349, "step": 636 }, { "epoch": 0.7169386606640406, "grad_norm": 0.4198490504250616, "learning_rate": 4.2282019190654985e-05, "loss": 0.4427, "step": 637 }, { "epoch": 0.7180641530669668, "grad_norm": 0.6436591462942758, "learning_rate": 4.226115978306216e-05, "loss": 0.4428, "step": 638 }, { "epoch": 0.7191896454698931, "grad_norm": 0.46958357266306217, "learning_rate": 4.224030037546934e-05, "loss": 0.4096, "step": 639 }, { "epoch": 0.7203151378728193, "grad_norm": 0.5409557375822074, "learning_rate": 4.2219440967876516e-05, "loss": 0.4165, "step": 640 }, { "epoch": 0.7214406302757457, "grad_norm": 0.505386305383113, "learning_rate": 4.219858156028369e-05, "loss": 0.4232, "step": 641 }, { "epoch": 0.7225661226786719, "grad_norm": 0.47036754544713516, "learning_rate": 4.217772215269087e-05, "loss": 0.4187, "step": 642 }, { "epoch": 0.7236916150815982, "grad_norm": 0.5935180204625328, "learning_rate": 4.215686274509804e-05, "loss": 0.4326, "step": 643 }, { "epoch": 0.7248171074845244, "grad_norm": 0.37111793924942255, "learning_rate": 4.213600333750522e-05, "loss": 0.4235, "step": 644 }, { "epoch": 0.7259425998874508, "grad_norm": 0.6111195959607152, "learning_rate": 4.2115143929912394e-05, "loss": 0.4254, "step": 645 }, { "epoch": 0.727068092290377, "grad_norm": 0.35910288955770575, "learning_rate": 4.2094284522319564e-05, "loss": 0.4244, "step": 646 }, { "epoch": 0.7281935846933033, "grad_norm": 0.4804262191052388, "learning_rate": 4.207342511472674e-05, "loss": 0.4357, "step": 647 }, { "epoch": 0.7293190770962296, "grad_norm": 0.43546853881795533, "learning_rate": 4.205256570713392e-05, "loss": 0.4471, "step": 648 }, { "epoch": 0.7304445694991559, "grad_norm": 0.36651215549293115, "learning_rate": 4.2031706299541095e-05, "loss": 0.4359, "step": 649 }, { "epoch": 0.7315700619020822, "grad_norm": 0.5416106337436408, "learning_rate": 4.201084689194827e-05, "loss": 0.4514, "step": 650 }, { "epoch": 0.7326955543050084, "grad_norm": 0.37666903051702594, "learning_rate": 4.198998748435545e-05, "loss": 0.4423, "step": 651 }, { "epoch": 0.7338210467079347, "grad_norm": 0.44989927473315283, "learning_rate": 4.196912807676262e-05, "loss": 0.4268, "step": 652 }, { "epoch": 0.734946539110861, "grad_norm": 0.3864335324626091, "learning_rate": 4.1948268669169796e-05, "loss": 0.4494, "step": 653 }, { "epoch": 0.7360720315137873, "grad_norm": 0.4000593109156678, "learning_rate": 4.192740926157697e-05, "loss": 0.4434, "step": 654 }, { "epoch": 0.7371975239167136, "grad_norm": 0.423242298419072, "learning_rate": 4.190654985398415e-05, "loss": 0.4328, "step": 655 }, { "epoch": 0.7383230163196398, "grad_norm": 0.44706912801056875, "learning_rate": 4.188569044639132e-05, "loss": 0.4254, "step": 656 }, { "epoch": 0.7394485087225662, "grad_norm": 0.5086338570156853, "learning_rate": 4.1864831038798504e-05, "loss": 0.4425, "step": 657 }, { "epoch": 0.7405740011254924, "grad_norm": 0.4676027167307538, "learning_rate": 4.1843971631205674e-05, "loss": 0.4491, "step": 658 }, { "epoch": 0.7416994935284187, "grad_norm": 0.46458396727329027, "learning_rate": 4.182311222361285e-05, "loss": 0.4015, "step": 659 }, { "epoch": 0.7428249859313449, "grad_norm": 0.390783744931949, "learning_rate": 4.180225281602003e-05, "loss": 0.4184, "step": 660 }, { "epoch": 0.7439504783342712, "grad_norm": 0.44526805252316143, "learning_rate": 4.1781393408427205e-05, "loss": 0.4035, "step": 661 }, { "epoch": 0.7450759707371976, "grad_norm": 0.4217385671488669, "learning_rate": 4.1760534000834376e-05, "loss": 0.4332, "step": 662 }, { "epoch": 0.7462014631401238, "grad_norm": 0.44487860783732935, "learning_rate": 4.173967459324156e-05, "loss": 0.4266, "step": 663 }, { "epoch": 0.7473269555430501, "grad_norm": 0.4296879305918086, "learning_rate": 4.171881518564873e-05, "loss": 0.4205, "step": 664 }, { "epoch": 0.7484524479459763, "grad_norm": 0.4948881491751457, "learning_rate": 4.16979557780559e-05, "loss": 0.4447, "step": 665 }, { "epoch": 0.7495779403489027, "grad_norm": 0.41381310448412767, "learning_rate": 4.1677096370463084e-05, "loss": 0.435, "step": 666 }, { "epoch": 0.7507034327518289, "grad_norm": 0.4138662471855155, "learning_rate": 4.1656236962870254e-05, "loss": 0.4351, "step": 667 }, { "epoch": 0.7518289251547552, "grad_norm": 0.3869476402415003, "learning_rate": 4.163537755527743e-05, "loss": 0.4319, "step": 668 }, { "epoch": 0.7529544175576814, "grad_norm": 0.4882528682989917, "learning_rate": 4.161451814768461e-05, "loss": 0.4123, "step": 669 }, { "epoch": 0.7540799099606078, "grad_norm": 0.3739890771080639, "learning_rate": 4.1593658740091785e-05, "loss": 0.4268, "step": 670 }, { "epoch": 0.7552054023635341, "grad_norm": 0.5032273771625602, "learning_rate": 4.1572799332498955e-05, "loss": 0.4404, "step": 671 }, { "epoch": 0.7563308947664603, "grad_norm": 0.38387128180956526, "learning_rate": 4.155193992490614e-05, "loss": 0.4505, "step": 672 }, { "epoch": 0.7574563871693866, "grad_norm": 0.4995032503495298, "learning_rate": 4.153108051731331e-05, "loss": 0.4211, "step": 673 }, { "epoch": 0.7585818795723129, "grad_norm": 0.46352751067691306, "learning_rate": 4.1510221109720486e-05, "loss": 0.4253, "step": 674 }, { "epoch": 0.7597073719752392, "grad_norm": 0.4661239773263893, "learning_rate": 4.148936170212766e-05, "loss": 0.4533, "step": 675 }, { "epoch": 0.7608328643781654, "grad_norm": 0.42916960855475605, "learning_rate": 4.146850229453484e-05, "loss": 0.4333, "step": 676 }, { "epoch": 0.7619583567810917, "grad_norm": 0.40989406943220275, "learning_rate": 4.144764288694201e-05, "loss": 0.4413, "step": 677 }, { "epoch": 0.7630838491840181, "grad_norm": 0.7522787094637527, "learning_rate": 4.1426783479349194e-05, "loss": 0.4522, "step": 678 }, { "epoch": 0.7642093415869443, "grad_norm": 0.4277705459587538, "learning_rate": 4.1405924071756364e-05, "loss": 0.4348, "step": 679 }, { "epoch": 0.7653348339898706, "grad_norm": 0.4684118417529332, "learning_rate": 4.138506466416354e-05, "loss": 0.422, "step": 680 }, { "epoch": 0.7664603263927968, "grad_norm": 0.5197963821538139, "learning_rate": 4.136420525657072e-05, "loss": 0.4299, "step": 681 }, { "epoch": 0.7675858187957231, "grad_norm": 0.5235576475586984, "learning_rate": 4.1343345848977895e-05, "loss": 0.438, "step": 682 }, { "epoch": 0.7687113111986494, "grad_norm": 0.46712550772065836, "learning_rate": 4.1322486441385065e-05, "loss": 0.4344, "step": 683 }, { "epoch": 0.7698368036015757, "grad_norm": 0.3222703692853798, "learning_rate": 4.130162703379224e-05, "loss": 0.4263, "step": 684 }, { "epoch": 0.770962296004502, "grad_norm": 0.5188367404561216, "learning_rate": 4.128076762619942e-05, "loss": 0.425, "step": 685 }, { "epoch": 0.7720877884074282, "grad_norm": 0.5386961427613608, "learning_rate": 4.125990821860659e-05, "loss": 0.4276, "step": 686 }, { "epoch": 0.7732132808103546, "grad_norm": 0.42911439453279915, "learning_rate": 4.1239048811013766e-05, "loss": 0.4309, "step": 687 }, { "epoch": 0.7743387732132808, "grad_norm": 0.5088405648165022, "learning_rate": 4.121818940342094e-05, "loss": 0.4493, "step": 688 }, { "epoch": 0.7754642656162071, "grad_norm": 0.3815644681020926, "learning_rate": 4.119732999582812e-05, "loss": 0.4077, "step": 689 }, { "epoch": 0.7765897580191333, "grad_norm": 0.4840279343366164, "learning_rate": 4.11764705882353e-05, "loss": 0.4172, "step": 690 }, { "epoch": 0.7777152504220597, "grad_norm": 0.333716982007624, "learning_rate": 4.1155611180642474e-05, "loss": 0.42, "step": 691 }, { "epoch": 0.7788407428249859, "grad_norm": 0.5086503847022227, "learning_rate": 4.1134751773049644e-05, "loss": 0.4388, "step": 692 }, { "epoch": 0.7799662352279122, "grad_norm": 0.5138077690790301, "learning_rate": 4.111389236545682e-05, "loss": 0.4472, "step": 693 }, { "epoch": 0.7810917276308385, "grad_norm": 0.5073604041295958, "learning_rate": 4.1093032957864e-05, "loss": 0.4319, "step": 694 }, { "epoch": 0.7822172200337648, "grad_norm": 0.5070487690193936, "learning_rate": 4.1072173550271175e-05, "loss": 0.4406, "step": 695 }, { "epoch": 0.7833427124366911, "grad_norm": 0.39744464693598625, "learning_rate": 4.1051314142678346e-05, "loss": 0.434, "step": 696 }, { "epoch": 0.7844682048396173, "grad_norm": 0.4541031658226192, "learning_rate": 4.103045473508553e-05, "loss": 0.4454, "step": 697 }, { "epoch": 0.7855936972425436, "grad_norm": 0.3491750229319607, "learning_rate": 4.10095953274927e-05, "loss": 0.4332, "step": 698 }, { "epoch": 0.7867191896454699, "grad_norm": 0.4022760008208042, "learning_rate": 4.0988735919899877e-05, "loss": 0.4296, "step": 699 }, { "epoch": 0.7878446820483962, "grad_norm": 0.34684627806001544, "learning_rate": 4.0967876512307054e-05, "loss": 0.4331, "step": 700 }, { "epoch": 0.7889701744513224, "grad_norm": 0.4050405845879203, "learning_rate": 4.094701710471423e-05, "loss": 0.4464, "step": 701 }, { "epoch": 0.7900956668542487, "grad_norm": 0.36395612381945763, "learning_rate": 4.09261576971214e-05, "loss": 0.4444, "step": 702 }, { "epoch": 0.7912211592571751, "grad_norm": 0.398848237592344, "learning_rate": 4.0905298289528585e-05, "loss": 0.4288, "step": 703 }, { "epoch": 0.7923466516601013, "grad_norm": 0.40745644685078164, "learning_rate": 4.0884438881935755e-05, "loss": 0.4329, "step": 704 }, { "epoch": 0.7934721440630276, "grad_norm": 0.3547156716364725, "learning_rate": 4.0863579474342925e-05, "loss": 0.4191, "step": 705 }, { "epoch": 0.7945976364659538, "grad_norm": 0.377680056161795, "learning_rate": 4.084272006675011e-05, "loss": 0.4376, "step": 706 }, { "epoch": 0.7957231288688801, "grad_norm": 0.4073180644016936, "learning_rate": 4.082186065915728e-05, "loss": 0.4559, "step": 707 }, { "epoch": 0.7968486212718064, "grad_norm": 0.45186446852813356, "learning_rate": 4.0801001251564456e-05, "loss": 0.4277, "step": 708 }, { "epoch": 0.7979741136747327, "grad_norm": 0.36933911451661233, "learning_rate": 4.078014184397163e-05, "loss": 0.45, "step": 709 }, { "epoch": 0.799099606077659, "grad_norm": 0.35833391487238614, "learning_rate": 4.075928243637881e-05, "loss": 0.4403, "step": 710 }, { "epoch": 0.8002250984805852, "grad_norm": 0.3901982149558614, "learning_rate": 4.073842302878598e-05, "loss": 0.4216, "step": 711 }, { "epoch": 0.8013505908835116, "grad_norm": 0.40940384251834244, "learning_rate": 4.0717563621193164e-05, "loss": 0.4021, "step": 712 }, { "epoch": 0.8024760832864378, "grad_norm": 0.42919683308516116, "learning_rate": 4.0696704213600334e-05, "loss": 0.4376, "step": 713 }, { "epoch": 0.8036015756893641, "grad_norm": 0.4073165345943137, "learning_rate": 4.067584480600751e-05, "loss": 0.4153, "step": 714 }, { "epoch": 0.8047270680922903, "grad_norm": 0.4178501498334503, "learning_rate": 4.065498539841469e-05, "loss": 0.414, "step": 715 }, { "epoch": 0.8058525604952167, "grad_norm": 0.4403993787350139, "learning_rate": 4.0634125990821865e-05, "loss": 0.4399, "step": 716 }, { "epoch": 0.806978052898143, "grad_norm": 0.4114972670954794, "learning_rate": 4.0613266583229035e-05, "loss": 0.439, "step": 717 }, { "epoch": 0.8081035453010692, "grad_norm": 0.407394844667869, "learning_rate": 4.059240717563622e-05, "loss": 0.4123, "step": 718 }, { "epoch": 0.8092290377039955, "grad_norm": 0.39800729593005324, "learning_rate": 4.057154776804339e-05, "loss": 0.4236, "step": 719 }, { "epoch": 0.8103545301069218, "grad_norm": 0.4287708410386054, "learning_rate": 4.0550688360450566e-05, "loss": 0.4256, "step": 720 }, { "epoch": 0.8114800225098481, "grad_norm": 0.4016484816358628, "learning_rate": 4.052982895285774e-05, "loss": 0.4281, "step": 721 }, { "epoch": 0.8126055149127743, "grad_norm": 0.3719724351542615, "learning_rate": 4.050896954526492e-05, "loss": 0.4077, "step": 722 }, { "epoch": 0.8137310073157006, "grad_norm": 0.4023100055568255, "learning_rate": 4.048811013767209e-05, "loss": 0.4461, "step": 723 }, { "epoch": 0.814856499718627, "grad_norm": 0.4117093051704328, "learning_rate": 4.046725073007927e-05, "loss": 0.4175, "step": 724 }, { "epoch": 0.8159819921215532, "grad_norm": 0.34286385689334, "learning_rate": 4.0446391322486444e-05, "loss": 0.4356, "step": 725 }, { "epoch": 0.8171074845244795, "grad_norm": 0.35591813739094097, "learning_rate": 4.0425531914893614e-05, "loss": 0.4434, "step": 726 }, { "epoch": 0.8182329769274057, "grad_norm": 0.43567208149763015, "learning_rate": 4.040467250730079e-05, "loss": 0.437, "step": 727 }, { "epoch": 0.8193584693303321, "grad_norm": 0.3799825439351934, "learning_rate": 4.038381309970797e-05, "loss": 0.4248, "step": 728 }, { "epoch": 0.8204839617332583, "grad_norm": 0.38216998051723755, "learning_rate": 4.0362953692115145e-05, "loss": 0.4253, "step": 729 }, { "epoch": 0.8216094541361846, "grad_norm": 0.39231774228135824, "learning_rate": 4.034209428452232e-05, "loss": 0.4223, "step": 730 }, { "epoch": 0.8227349465391108, "grad_norm": 0.4102144130938295, "learning_rate": 4.03212348769295e-05, "loss": 0.4348, "step": 731 }, { "epoch": 0.8238604389420371, "grad_norm": 0.37115430835787877, "learning_rate": 4.030037546933667e-05, "loss": 0.409, "step": 732 }, { "epoch": 0.8249859313449635, "grad_norm": 0.40499256266698164, "learning_rate": 4.0279516061743847e-05, "loss": 0.4089, "step": 733 }, { "epoch": 0.8261114237478897, "grad_norm": 0.4916550738089128, "learning_rate": 4.0258656654151024e-05, "loss": 0.4272, "step": 734 }, { "epoch": 0.827236916150816, "grad_norm": 0.3681620907401364, "learning_rate": 4.02377972465582e-05, "loss": 0.4446, "step": 735 }, { "epoch": 0.8283624085537422, "grad_norm": 0.4795384990908562, "learning_rate": 4.021693783896537e-05, "loss": 0.4357, "step": 736 }, { "epoch": 0.8294879009566686, "grad_norm": 0.3684736183097587, "learning_rate": 4.0196078431372555e-05, "loss": 0.4119, "step": 737 }, { "epoch": 0.8306133933595948, "grad_norm": 0.43877380657382786, "learning_rate": 4.0175219023779725e-05, "loss": 0.4403, "step": 738 }, { "epoch": 0.8317388857625211, "grad_norm": 0.37814204050253025, "learning_rate": 4.01543596161869e-05, "loss": 0.434, "step": 739 }, { "epoch": 0.8328643781654473, "grad_norm": 0.45099287248352765, "learning_rate": 4.013350020859408e-05, "loss": 0.4149, "step": 740 }, { "epoch": 0.8339898705683737, "grad_norm": 0.34915848381393966, "learning_rate": 4.0112640801001256e-05, "loss": 0.4136, "step": 741 }, { "epoch": 0.8351153629713, "grad_norm": 0.5037598255538088, "learning_rate": 4.0091781393408426e-05, "loss": 0.4398, "step": 742 }, { "epoch": 0.8362408553742262, "grad_norm": 0.3612809802844246, "learning_rate": 4.007092198581561e-05, "loss": 0.4162, "step": 743 }, { "epoch": 0.8373663477771525, "grad_norm": 0.3979488796549818, "learning_rate": 4.005006257822278e-05, "loss": 0.3981, "step": 744 }, { "epoch": 0.8384918401800788, "grad_norm": 0.4440135625243805, "learning_rate": 4.002920317062996e-05, "loss": 0.429, "step": 745 }, { "epoch": 0.8396173325830051, "grad_norm": 0.3448234757480279, "learning_rate": 4.0008343763037134e-05, "loss": 0.4324, "step": 746 }, { "epoch": 0.8407428249859313, "grad_norm": 0.4775835287224156, "learning_rate": 3.9987484355444304e-05, "loss": 0.4249, "step": 747 }, { "epoch": 0.8418683173888576, "grad_norm": 0.3566220202478078, "learning_rate": 3.996662494785148e-05, "loss": 0.4211, "step": 748 }, { "epoch": 0.842993809791784, "grad_norm": 0.5285144169481172, "learning_rate": 3.994576554025866e-05, "loss": 0.4168, "step": 749 }, { "epoch": 0.8441193021947102, "grad_norm": 0.33354278924631714, "learning_rate": 3.9924906132665835e-05, "loss": 0.4261, "step": 750 }, { "epoch": 0.8452447945976365, "grad_norm": 0.3372581050524173, "learning_rate": 3.9904046725073005e-05, "loss": 0.4269, "step": 751 }, { "epoch": 0.8463702870005627, "grad_norm": 0.3146244454332402, "learning_rate": 3.988318731748019e-05, "loss": 0.4204, "step": 752 }, { "epoch": 0.8474957794034891, "grad_norm": 0.3706478564537626, "learning_rate": 3.986232790988736e-05, "loss": 0.4521, "step": 753 }, { "epoch": 0.8486212718064153, "grad_norm": 0.34630012288221157, "learning_rate": 3.9841468502294536e-05, "loss": 0.4142, "step": 754 }, { "epoch": 0.8497467642093416, "grad_norm": 0.36373433568245944, "learning_rate": 3.982060909470171e-05, "loss": 0.4252, "step": 755 }, { "epoch": 0.8508722566122678, "grad_norm": 0.3554211790643752, "learning_rate": 3.979974968710889e-05, "loss": 0.4474, "step": 756 }, { "epoch": 0.8519977490151941, "grad_norm": 0.30960141279598913, "learning_rate": 3.977889027951606e-05, "loss": 0.4167, "step": 757 }, { "epoch": 0.8531232414181205, "grad_norm": 0.37614788680975125, "learning_rate": 3.9758030871923244e-05, "loss": 0.4505, "step": 758 }, { "epoch": 0.8542487338210467, "grad_norm": 0.3938651785575828, "learning_rate": 3.9737171464330414e-05, "loss": 0.4349, "step": 759 }, { "epoch": 0.855374226223973, "grad_norm": 0.3460524380953148, "learning_rate": 3.971631205673759e-05, "loss": 0.4396, "step": 760 }, { "epoch": 0.8564997186268992, "grad_norm": 0.430535629585179, "learning_rate": 3.969545264914477e-05, "loss": 0.4154, "step": 761 }, { "epoch": 0.8576252110298256, "grad_norm": 0.34446139273212933, "learning_rate": 3.9674593241551945e-05, "loss": 0.3931, "step": 762 }, { "epoch": 0.8587507034327518, "grad_norm": 0.42192087717244775, "learning_rate": 3.9653733833959115e-05, "loss": 0.4261, "step": 763 }, { "epoch": 0.8598761958356781, "grad_norm": 0.40550449281201056, "learning_rate": 3.963287442636629e-05, "loss": 0.4569, "step": 764 }, { "epoch": 0.8610016882386043, "grad_norm": 0.3566914781532168, "learning_rate": 3.961201501877347e-05, "loss": 0.4141, "step": 765 }, { "epoch": 0.8621271806415307, "grad_norm": 0.3843475406384751, "learning_rate": 3.9591155611180646e-05, "loss": 0.4332, "step": 766 }, { "epoch": 0.863252673044457, "grad_norm": 0.3366748222918633, "learning_rate": 3.9570296203587817e-05, "loss": 0.4123, "step": 767 }, { "epoch": 0.8643781654473832, "grad_norm": 0.41416075386046697, "learning_rate": 3.9549436795994994e-05, "loss": 0.426, "step": 768 }, { "epoch": 0.8655036578503095, "grad_norm": 0.3752366359688814, "learning_rate": 3.952857738840217e-05, "loss": 0.4402, "step": 769 }, { "epoch": 0.8666291502532358, "grad_norm": 0.37688991154499113, "learning_rate": 3.950771798080935e-05, "loss": 0.4244, "step": 770 }, { "epoch": 0.8677546426561621, "grad_norm": 0.42637480636595876, "learning_rate": 3.9486858573216525e-05, "loss": 0.438, "step": 771 }, { "epoch": 0.8688801350590883, "grad_norm": 0.3568635983835573, "learning_rate": 3.9465999165623695e-05, "loss": 0.424, "step": 772 }, { "epoch": 0.8700056274620146, "grad_norm": 0.38797711927011286, "learning_rate": 3.944513975803087e-05, "loss": 0.4321, "step": 773 }, { "epoch": 0.871131119864941, "grad_norm": 0.3904824359653345, "learning_rate": 3.942428035043805e-05, "loss": 0.4161, "step": 774 }, { "epoch": 0.8722566122678672, "grad_norm": 0.47345678909928446, "learning_rate": 3.9403420942845226e-05, "loss": 0.4503, "step": 775 }, { "epoch": 0.8733821046707935, "grad_norm": 0.37497244213020403, "learning_rate": 3.9382561535252396e-05, "loss": 0.4255, "step": 776 }, { "epoch": 0.8745075970737197, "grad_norm": 0.4047268746098847, "learning_rate": 3.936170212765958e-05, "loss": 0.4327, "step": 777 }, { "epoch": 0.8756330894766461, "grad_norm": 0.3834914449330313, "learning_rate": 3.934084272006675e-05, "loss": 0.4079, "step": 778 }, { "epoch": 0.8767585818795723, "grad_norm": 0.43021072579406455, "learning_rate": 3.931998331247393e-05, "loss": 0.4143, "step": 779 }, { "epoch": 0.8778840742824986, "grad_norm": 0.3793510230856374, "learning_rate": 3.9299123904881104e-05, "loss": 0.431, "step": 780 }, { "epoch": 0.8790095666854248, "grad_norm": 0.37164807483969337, "learning_rate": 3.927826449728828e-05, "loss": 0.4341, "step": 781 }, { "epoch": 0.8801350590883511, "grad_norm": 0.3807695648271021, "learning_rate": 3.925740508969545e-05, "loss": 0.4096, "step": 782 }, { "epoch": 0.8812605514912775, "grad_norm": 0.3502384590348891, "learning_rate": 3.9236545682102635e-05, "loss": 0.4117, "step": 783 }, { "epoch": 0.8823860438942037, "grad_norm": 0.41955082958695283, "learning_rate": 3.9215686274509805e-05, "loss": 0.4179, "step": 784 }, { "epoch": 0.88351153629713, "grad_norm": 0.3435394433133878, "learning_rate": 3.919482686691698e-05, "loss": 0.4215, "step": 785 }, { "epoch": 0.8846370287000562, "grad_norm": 0.44230838156044133, "learning_rate": 3.917396745932416e-05, "loss": 0.436, "step": 786 }, { "epoch": 0.8857625211029826, "grad_norm": 0.3248857597519066, "learning_rate": 3.9153108051731336e-05, "loss": 0.41, "step": 787 }, { "epoch": 0.8868880135059088, "grad_norm": 0.48949666561437843, "learning_rate": 3.9132248644138506e-05, "loss": 0.4348, "step": 788 }, { "epoch": 0.8880135059088351, "grad_norm": 0.32922044316292487, "learning_rate": 3.911138923654568e-05, "loss": 0.4084, "step": 789 }, { "epoch": 0.8891389983117614, "grad_norm": 0.4097616554209572, "learning_rate": 3.909052982895286e-05, "loss": 0.4127, "step": 790 }, { "epoch": 0.8902644907146877, "grad_norm": 0.3847502404740843, "learning_rate": 3.906967042136003e-05, "loss": 0.4322, "step": 791 }, { "epoch": 0.891389983117614, "grad_norm": 0.39373480839252734, "learning_rate": 3.9048811013767214e-05, "loss": 0.4085, "step": 792 }, { "epoch": 0.8925154755205402, "grad_norm": 0.4665639076471283, "learning_rate": 3.9027951606174384e-05, "loss": 0.4281, "step": 793 }, { "epoch": 0.8936409679234665, "grad_norm": 0.32986547499650304, "learning_rate": 3.900709219858156e-05, "loss": 0.4261, "step": 794 }, { "epoch": 0.8947664603263928, "grad_norm": 0.484689789318943, "learning_rate": 3.898623279098874e-05, "loss": 0.4236, "step": 795 }, { "epoch": 0.8958919527293191, "grad_norm": 0.3386378151140954, "learning_rate": 3.8965373383395915e-05, "loss": 0.4117, "step": 796 }, { "epoch": 0.8970174451322454, "grad_norm": 0.4810985090257936, "learning_rate": 3.8944513975803085e-05, "loss": 0.4199, "step": 797 }, { "epoch": 0.8981429375351716, "grad_norm": 0.34069553000131625, "learning_rate": 3.892365456821027e-05, "loss": 0.4279, "step": 798 }, { "epoch": 0.899268429938098, "grad_norm": 0.39752219752724677, "learning_rate": 3.890279516061744e-05, "loss": 0.4172, "step": 799 }, { "epoch": 0.9003939223410242, "grad_norm": 0.39022425914879927, "learning_rate": 3.8881935753024616e-05, "loss": 0.3978, "step": 800 }, { "epoch": 0.9015194147439505, "grad_norm": 0.3458579209805956, "learning_rate": 3.8861076345431793e-05, "loss": 0.4135, "step": 801 }, { "epoch": 0.9026449071468767, "grad_norm": 0.4407202352189913, "learning_rate": 3.884021693783897e-05, "loss": 0.4154, "step": 802 }, { "epoch": 0.9037703995498031, "grad_norm": 0.47173587942543654, "learning_rate": 3.881935753024614e-05, "loss": 0.4572, "step": 803 }, { "epoch": 0.9048958919527293, "grad_norm": 0.5188592329216469, "learning_rate": 3.879849812265332e-05, "loss": 0.42, "step": 804 }, { "epoch": 0.9060213843556556, "grad_norm": 0.35403721006820305, "learning_rate": 3.8777638715060495e-05, "loss": 0.3938, "step": 805 }, { "epoch": 0.9071468767585819, "grad_norm": 0.4545974955129778, "learning_rate": 3.875677930746767e-05, "loss": 0.4285, "step": 806 }, { "epoch": 0.9082723691615081, "grad_norm": 0.38332622486859297, "learning_rate": 3.873591989987485e-05, "loss": 0.4324, "step": 807 }, { "epoch": 0.9093978615644345, "grad_norm": 0.4520007540189171, "learning_rate": 3.8715060492282026e-05, "loss": 0.4305, "step": 808 }, { "epoch": 0.9105233539673607, "grad_norm": 0.4148933007482292, "learning_rate": 3.8694201084689196e-05, "loss": 0.4218, "step": 809 }, { "epoch": 0.911648846370287, "grad_norm": 0.35859066112911336, "learning_rate": 3.867334167709637e-05, "loss": 0.4395, "step": 810 }, { "epoch": 0.9127743387732132, "grad_norm": 0.3384384473732276, "learning_rate": 3.865248226950355e-05, "loss": 0.4331, "step": 811 }, { "epoch": 0.9138998311761396, "grad_norm": 0.3212850763014723, "learning_rate": 3.863162286191072e-05, "loss": 0.4371, "step": 812 }, { "epoch": 0.9150253235790659, "grad_norm": 0.34283786993488946, "learning_rate": 3.86107634543179e-05, "loss": 0.4213, "step": 813 }, { "epoch": 0.9161508159819921, "grad_norm": 0.3578410656828841, "learning_rate": 3.8589904046725074e-05, "loss": 0.4264, "step": 814 }, { "epoch": 0.9172763083849184, "grad_norm": 0.33865929644502085, "learning_rate": 3.856904463913225e-05, "loss": 0.4242, "step": 815 }, { "epoch": 0.9184018007878447, "grad_norm": 0.3392167511998851, "learning_rate": 3.854818523153942e-05, "loss": 0.435, "step": 816 }, { "epoch": 0.919527293190771, "grad_norm": 0.4361222901548229, "learning_rate": 3.8527325823946605e-05, "loss": 0.4351, "step": 817 }, { "epoch": 0.9206527855936972, "grad_norm": 0.38626109347018045, "learning_rate": 3.8506466416353775e-05, "loss": 0.4413, "step": 818 }, { "epoch": 0.9217782779966235, "grad_norm": 0.376739528222001, "learning_rate": 3.848560700876095e-05, "loss": 0.4162, "step": 819 }, { "epoch": 0.9229037703995498, "grad_norm": 0.38666458978007023, "learning_rate": 3.846474760116813e-05, "loss": 0.4308, "step": 820 }, { "epoch": 0.9240292628024761, "grad_norm": 0.49211116299516156, "learning_rate": 3.8443888193575306e-05, "loss": 0.4319, "step": 821 }, { "epoch": 0.9251547552054024, "grad_norm": 0.35408915013798653, "learning_rate": 3.8423028785982476e-05, "loss": 0.4095, "step": 822 }, { "epoch": 0.9262802476083286, "grad_norm": 0.4801831963166499, "learning_rate": 3.840216937838966e-05, "loss": 0.4357, "step": 823 }, { "epoch": 0.927405740011255, "grad_norm": 0.355137877995065, "learning_rate": 3.838130997079683e-05, "loss": 0.4089, "step": 824 }, { "epoch": 0.9285312324141812, "grad_norm": 0.39619886118735476, "learning_rate": 3.836045056320401e-05, "loss": 0.4275, "step": 825 }, { "epoch": 0.9296567248171075, "grad_norm": 0.4149029728443111, "learning_rate": 3.8339591155611184e-05, "loss": 0.4271, "step": 826 }, { "epoch": 0.9307822172200337, "grad_norm": 0.3576650599906339, "learning_rate": 3.831873174801836e-05, "loss": 0.4132, "step": 827 }, { "epoch": 0.93190770962296, "grad_norm": 0.3906733425105834, "learning_rate": 3.829787234042553e-05, "loss": 0.4344, "step": 828 }, { "epoch": 0.9330332020258864, "grad_norm": 0.3593657860758568, "learning_rate": 3.8277012932832715e-05, "loss": 0.4149, "step": 829 }, { "epoch": 0.9341586944288126, "grad_norm": 0.3817439606842503, "learning_rate": 3.8256153525239885e-05, "loss": 0.4069, "step": 830 }, { "epoch": 0.9352841868317389, "grad_norm": 0.3973105618276613, "learning_rate": 3.8235294117647055e-05, "loss": 0.4324, "step": 831 }, { "epoch": 0.9364096792346651, "grad_norm": 0.366999411331023, "learning_rate": 3.821443471005424e-05, "loss": 0.4222, "step": 832 }, { "epoch": 0.9375351716375915, "grad_norm": 0.3464567606261278, "learning_rate": 3.819357530246141e-05, "loss": 0.4246, "step": 833 }, { "epoch": 0.9386606640405177, "grad_norm": 0.4438404878074898, "learning_rate": 3.8172715894868586e-05, "loss": 0.4143, "step": 834 }, { "epoch": 0.939786156443444, "grad_norm": 0.3129033931624516, "learning_rate": 3.8151856487275763e-05, "loss": 0.4307, "step": 835 }, { "epoch": 0.9409116488463702, "grad_norm": 0.4970325181275813, "learning_rate": 3.813099707968294e-05, "loss": 0.4212, "step": 836 }, { "epoch": 0.9420371412492966, "grad_norm": 0.3998089884639407, "learning_rate": 3.811013767209011e-05, "loss": 0.4208, "step": 837 }, { "epoch": 0.9431626336522229, "grad_norm": 0.40099412686189473, "learning_rate": 3.8089278264497294e-05, "loss": 0.422, "step": 838 }, { "epoch": 0.9442881260551491, "grad_norm": 0.5330625499056586, "learning_rate": 3.8068418856904465e-05, "loss": 0.4132, "step": 839 }, { "epoch": 0.9454136184580754, "grad_norm": 0.3036089387603486, "learning_rate": 3.804755944931164e-05, "loss": 0.3962, "step": 840 }, { "epoch": 0.9465391108610017, "grad_norm": 0.5044966493127429, "learning_rate": 3.802670004171882e-05, "loss": 0.4228, "step": 841 }, { "epoch": 0.947664603263928, "grad_norm": 0.3885120765788415, "learning_rate": 3.8005840634125996e-05, "loss": 0.4228, "step": 842 }, { "epoch": 0.9487900956668542, "grad_norm": 0.3285021365606724, "learning_rate": 3.7984981226533166e-05, "loss": 0.4287, "step": 843 }, { "epoch": 0.9499155880697805, "grad_norm": 0.3980934273222264, "learning_rate": 3.796412181894034e-05, "loss": 0.409, "step": 844 }, { "epoch": 0.9510410804727069, "grad_norm": 0.31490077533714694, "learning_rate": 3.794326241134752e-05, "loss": 0.4364, "step": 845 }, { "epoch": 0.9521665728756331, "grad_norm": 0.35959204795616695, "learning_rate": 3.79224030037547e-05, "loss": 0.4283, "step": 846 }, { "epoch": 0.9532920652785594, "grad_norm": 0.4126210300706387, "learning_rate": 3.7901543596161874e-05, "loss": 0.4197, "step": 847 }, { "epoch": 0.9544175576814856, "grad_norm": 0.3222558281528205, "learning_rate": 3.788068418856905e-05, "loss": 0.4277, "step": 848 }, { "epoch": 0.955543050084412, "grad_norm": 0.41145117521139, "learning_rate": 3.785982478097622e-05, "loss": 0.4257, "step": 849 }, { "epoch": 0.9566685424873382, "grad_norm": 0.351379058945545, "learning_rate": 3.78389653733834e-05, "loss": 0.4181, "step": 850 }, { "epoch": 0.9577940348902645, "grad_norm": 0.31029125166165955, "learning_rate": 3.7818105965790575e-05, "loss": 0.4297, "step": 851 }, { "epoch": 0.9589195272931907, "grad_norm": 0.3285912637236667, "learning_rate": 3.7797246558197745e-05, "loss": 0.4138, "step": 852 }, { "epoch": 0.960045019696117, "grad_norm": 0.4036547080190634, "learning_rate": 3.777638715060492e-05, "loss": 0.4366, "step": 853 }, { "epoch": 0.9611705120990434, "grad_norm": 0.3686324816741614, "learning_rate": 3.77555277430121e-05, "loss": 0.4337, "step": 854 }, { "epoch": 0.9622960045019696, "grad_norm": 0.48923825142344834, "learning_rate": 3.7734668335419276e-05, "loss": 0.4343, "step": 855 }, { "epoch": 0.9634214969048959, "grad_norm": 0.3013034390380091, "learning_rate": 3.7713808927826446e-05, "loss": 0.4216, "step": 856 }, { "epoch": 0.9645469893078221, "grad_norm": 0.4352224768520518, "learning_rate": 3.769294952023363e-05, "loss": 0.3941, "step": 857 }, { "epoch": 0.9656724817107485, "grad_norm": 0.3513506338023819, "learning_rate": 3.76720901126408e-05, "loss": 0.4166, "step": 858 }, { "epoch": 0.9667979741136747, "grad_norm": 0.40350199029637573, "learning_rate": 3.765123070504798e-05, "loss": 0.413, "step": 859 }, { "epoch": 0.967923466516601, "grad_norm": 0.362061939817286, "learning_rate": 3.7630371297455154e-05, "loss": 0.411, "step": 860 }, { "epoch": 0.9690489589195272, "grad_norm": 0.3399440337787816, "learning_rate": 3.760951188986233e-05, "loss": 0.4392, "step": 861 }, { "epoch": 0.9701744513224536, "grad_norm": 0.37743255775219053, "learning_rate": 3.75886524822695e-05, "loss": 0.4433, "step": 862 }, { "epoch": 0.9712999437253799, "grad_norm": 0.3638973275713123, "learning_rate": 3.7567793074676685e-05, "loss": 0.4203, "step": 863 }, { "epoch": 0.9724254361283061, "grad_norm": 0.3277233424581398, "learning_rate": 3.7546933667083855e-05, "loss": 0.398, "step": 864 }, { "epoch": 0.9735509285312324, "grad_norm": 0.3141565988423171, "learning_rate": 3.752607425949103e-05, "loss": 0.3959, "step": 865 }, { "epoch": 0.9746764209341587, "grad_norm": 0.35936283889585385, "learning_rate": 3.750521485189821e-05, "loss": 0.432, "step": 866 }, { "epoch": 0.975801913337085, "grad_norm": 0.31770357894398493, "learning_rate": 3.7484355444305386e-05, "loss": 0.431, "step": 867 }, { "epoch": 0.9769274057400112, "grad_norm": 0.3123167816580969, "learning_rate": 3.7463496036712556e-05, "loss": 0.4025, "step": 868 }, { "epoch": 0.9780528981429375, "grad_norm": 0.3692934723238402, "learning_rate": 3.744263662911974e-05, "loss": 0.4221, "step": 869 }, { "epoch": 0.9791783905458639, "grad_norm": 0.354161129420181, "learning_rate": 3.742177722152691e-05, "loss": 0.4172, "step": 870 }, { "epoch": 0.9803038829487901, "grad_norm": 0.36175776122206693, "learning_rate": 3.740091781393409e-05, "loss": 0.4247, "step": 871 }, { "epoch": 0.9814293753517164, "grad_norm": 0.33883677517413535, "learning_rate": 3.7380058406341264e-05, "loss": 0.4068, "step": 872 }, { "epoch": 0.9825548677546426, "grad_norm": 0.42954345350848233, "learning_rate": 3.7359198998748435e-05, "loss": 0.422, "step": 873 }, { "epoch": 0.983680360157569, "grad_norm": 0.3555531618337076, "learning_rate": 3.733833959115561e-05, "loss": 0.4117, "step": 874 }, { "epoch": 0.9848058525604952, "grad_norm": 0.3137468970852999, "learning_rate": 3.731748018356279e-05, "loss": 0.4044, "step": 875 }, { "epoch": 0.9859313449634215, "grad_norm": 0.32456521220251544, "learning_rate": 3.7296620775969966e-05, "loss": 0.4133, "step": 876 }, { "epoch": 0.9870568373663478, "grad_norm": 0.31014819015532874, "learning_rate": 3.7275761368377136e-05, "loss": 0.4141, "step": 877 }, { "epoch": 0.988182329769274, "grad_norm": 0.32436938468507787, "learning_rate": 3.725490196078432e-05, "loss": 0.4095, "step": 878 }, { "epoch": 0.9893078221722004, "grad_norm": 0.33188432959790465, "learning_rate": 3.723404255319149e-05, "loss": 0.4029, "step": 879 }, { "epoch": 0.9904333145751266, "grad_norm": 0.3654774295033461, "learning_rate": 3.721318314559867e-05, "loss": 0.4255, "step": 880 }, { "epoch": 0.9915588069780529, "grad_norm": 0.366785518306503, "learning_rate": 3.7192323738005844e-05, "loss": 0.4449, "step": 881 }, { "epoch": 0.9926842993809791, "grad_norm": 0.29826436924819194, "learning_rate": 3.717146433041302e-05, "loss": 0.4048, "step": 882 }, { "epoch": 0.9938097917839055, "grad_norm": 0.3152150195685499, "learning_rate": 3.715060492282019e-05, "loss": 0.4156, "step": 883 }, { "epoch": 0.9949352841868317, "grad_norm": 0.3790269660933605, "learning_rate": 3.712974551522737e-05, "loss": 0.4258, "step": 884 }, { "epoch": 0.996060776589758, "grad_norm": 0.3234490218718985, "learning_rate": 3.7108886107634545e-05, "loss": 0.4225, "step": 885 }, { "epoch": 0.9971862689926843, "grad_norm": 0.3692387676152948, "learning_rate": 3.708802670004172e-05, "loss": 0.4193, "step": 886 }, { "epoch": 0.9983117613956106, "grad_norm": 0.3548734847141469, "learning_rate": 3.70671672924489e-05, "loss": 0.4223, "step": 887 }, { "epoch": 0.9994372537985369, "grad_norm": 0.41390225631408695, "learning_rate": 3.7046307884856076e-05, "loss": 0.4223, "step": 888 }, { "epoch": 1.0, "grad_norm": 0.41390225631408695, "learning_rate": 3.7025448477263246e-05, "loss": 0.43, "step": 889 }, { "epoch": 1.0011254924029263, "grad_norm": 0.5580430959772991, "learning_rate": 3.700458906967042e-05, "loss": 0.3606, "step": 890 }, { "epoch": 1.0022509848058525, "grad_norm": 0.38786233021722444, "learning_rate": 3.69837296620776e-05, "loss": 0.3516, "step": 891 }, { "epoch": 1.0033764772087788, "grad_norm": 0.3607061049938279, "learning_rate": 3.696287025448478e-05, "loss": 0.3586, "step": 892 }, { "epoch": 1.004501969611705, "grad_norm": 0.30722252549464857, "learning_rate": 3.694201084689195e-05, "loss": 0.3566, "step": 893 }, { "epoch": 1.0056274620146315, "grad_norm": 0.4162073977345431, "learning_rate": 3.6921151439299124e-05, "loss": 0.3517, "step": 894 }, { "epoch": 1.0067529544175577, "grad_norm": 0.3477012359425953, "learning_rate": 3.69002920317063e-05, "loss": 0.356, "step": 895 }, { "epoch": 1.007878446820484, "grad_norm": 0.34334848103470345, "learning_rate": 3.687943262411347e-05, "loss": 0.3529, "step": 896 }, { "epoch": 1.0090039392234103, "grad_norm": 0.3303966213040783, "learning_rate": 3.6858573216520655e-05, "loss": 0.3775, "step": 897 }, { "epoch": 1.0101294316263365, "grad_norm": 0.41641276263804705, "learning_rate": 3.6837713808927825e-05, "loss": 0.3482, "step": 898 }, { "epoch": 1.0112549240292628, "grad_norm": 0.3475211971953784, "learning_rate": 3.6816854401335e-05, "loss": 0.3469, "step": 899 }, { "epoch": 1.012380416432189, "grad_norm": 0.3630650367930452, "learning_rate": 3.679599499374218e-05, "loss": 0.3733, "step": 900 }, { "epoch": 1.0135059088351153, "grad_norm": 0.33917879336611284, "learning_rate": 3.6775135586149356e-05, "loss": 0.3613, "step": 901 }, { "epoch": 1.0146314012380417, "grad_norm": 0.3916615454670656, "learning_rate": 3.6754276178556526e-05, "loss": 0.3642, "step": 902 }, { "epoch": 1.015756893640968, "grad_norm": 0.387700709428207, "learning_rate": 3.673341677096371e-05, "loss": 0.337, "step": 903 }, { "epoch": 1.0168823860438942, "grad_norm": 0.311008874794384, "learning_rate": 3.671255736337088e-05, "loss": 0.3464, "step": 904 }, { "epoch": 1.0180078784468205, "grad_norm": 0.34204508328431077, "learning_rate": 3.669169795577806e-05, "loss": 0.3493, "step": 905 }, { "epoch": 1.0191333708497468, "grad_norm": 0.35056912513693533, "learning_rate": 3.6670838548185234e-05, "loss": 0.3847, "step": 906 }, { "epoch": 1.020258863252673, "grad_norm": 0.3603063090886555, "learning_rate": 3.664997914059241e-05, "loss": 0.3696, "step": 907 }, { "epoch": 1.0213843556555993, "grad_norm": 0.3406429440812445, "learning_rate": 3.662911973299958e-05, "loss": 0.3585, "step": 908 }, { "epoch": 1.0225098480585255, "grad_norm": 0.4146200559571759, "learning_rate": 3.6608260325406765e-05, "loss": 0.3617, "step": 909 }, { "epoch": 1.023635340461452, "grad_norm": 0.30025908743312035, "learning_rate": 3.6587400917813936e-05, "loss": 0.3416, "step": 910 }, { "epoch": 1.0247608328643782, "grad_norm": 0.4720811356812383, "learning_rate": 3.656654151022111e-05, "loss": 0.3533, "step": 911 }, { "epoch": 1.0258863252673045, "grad_norm": 0.29184795311941897, "learning_rate": 3.654568210262829e-05, "loss": 0.3493, "step": 912 }, { "epoch": 1.0270118176702308, "grad_norm": 0.385289462825186, "learning_rate": 3.6524822695035466e-05, "loss": 0.393, "step": 913 }, { "epoch": 1.028137310073157, "grad_norm": 0.3107082501520784, "learning_rate": 3.650396328744264e-05, "loss": 0.3677, "step": 914 }, { "epoch": 1.0292628024760833, "grad_norm": 0.2892635060197107, "learning_rate": 3.6483103879849814e-05, "loss": 0.3484, "step": 915 }, { "epoch": 1.0303882948790095, "grad_norm": 0.37383301152112214, "learning_rate": 3.646224447225699e-05, "loss": 0.3741, "step": 916 }, { "epoch": 1.0315137872819358, "grad_norm": 0.32042127431190587, "learning_rate": 3.644138506466416e-05, "loss": 0.3453, "step": 917 }, { "epoch": 1.032639279684862, "grad_norm": 0.3227805251806716, "learning_rate": 3.6420525657071345e-05, "loss": 0.347, "step": 918 }, { "epoch": 1.0337647720877885, "grad_norm": 0.33975552005827825, "learning_rate": 3.6399666249478515e-05, "loss": 0.342, "step": 919 }, { "epoch": 1.0348902644907148, "grad_norm": 0.3053184721102955, "learning_rate": 3.637880684188569e-05, "loss": 0.368, "step": 920 }, { "epoch": 1.036015756893641, "grad_norm": 0.4171758873578538, "learning_rate": 3.635794743429287e-05, "loss": 0.3506, "step": 921 }, { "epoch": 1.0371412492965673, "grad_norm": 0.35788110167643483, "learning_rate": 3.6337088026700046e-05, "loss": 0.3678, "step": 922 }, { "epoch": 1.0382667416994935, "grad_norm": 0.40422162482455976, "learning_rate": 3.6316228619107216e-05, "loss": 0.3841, "step": 923 }, { "epoch": 1.0393922341024198, "grad_norm": 0.42302051382729106, "learning_rate": 3.629536921151439e-05, "loss": 0.3609, "step": 924 }, { "epoch": 1.040517726505346, "grad_norm": 0.3002900676912074, "learning_rate": 3.627450980392157e-05, "loss": 0.3764, "step": 925 }, { "epoch": 1.0416432189082723, "grad_norm": 0.4216178632940728, "learning_rate": 3.625365039632875e-05, "loss": 0.3525, "step": 926 }, { "epoch": 1.0427687113111987, "grad_norm": 0.36722403261101394, "learning_rate": 3.6232790988735924e-05, "loss": 0.3651, "step": 927 }, { "epoch": 1.043894203714125, "grad_norm": 0.37487765396444256, "learning_rate": 3.62119315811431e-05, "loss": 0.3732, "step": 928 }, { "epoch": 1.0450196961170513, "grad_norm": 0.40248279158053446, "learning_rate": 3.619107217355027e-05, "loss": 0.3514, "step": 929 }, { "epoch": 1.0461451885199775, "grad_norm": 0.34487298402942634, "learning_rate": 3.617021276595745e-05, "loss": 0.3453, "step": 930 }, { "epoch": 1.0472706809229038, "grad_norm": 0.35894348708147356, "learning_rate": 3.6149353358364625e-05, "loss": 0.3445, "step": 931 }, { "epoch": 1.04839617332583, "grad_norm": 0.46543989700724425, "learning_rate": 3.61284939507718e-05, "loss": 0.3554, "step": 932 }, { "epoch": 1.0495216657287563, "grad_norm": 0.32251577447042856, "learning_rate": 3.610763454317897e-05, "loss": 0.3571, "step": 933 }, { "epoch": 1.0506471581316825, "grad_norm": 0.3539766683535758, "learning_rate": 3.608677513558615e-05, "loss": 0.3291, "step": 934 }, { "epoch": 1.051772650534609, "grad_norm": 0.34471085249350447, "learning_rate": 3.6065915727993326e-05, "loss": 0.3764, "step": 935 }, { "epoch": 1.0528981429375353, "grad_norm": 0.33468302525089494, "learning_rate": 3.6045056320400496e-05, "loss": 0.3479, "step": 936 }, { "epoch": 1.0540236353404615, "grad_norm": 0.36538591134232934, "learning_rate": 3.602419691280768e-05, "loss": 0.3642, "step": 937 }, { "epoch": 1.0551491277433878, "grad_norm": 0.35282922968280045, "learning_rate": 3.600333750521485e-05, "loss": 0.3446, "step": 938 }, { "epoch": 1.056274620146314, "grad_norm": 0.35478764255979334, "learning_rate": 3.598247809762203e-05, "loss": 0.3752, "step": 939 }, { "epoch": 1.0574001125492403, "grad_norm": 0.3565613451966995, "learning_rate": 3.5961618690029204e-05, "loss": 0.362, "step": 940 }, { "epoch": 1.0585256049521665, "grad_norm": 0.33132722259601055, "learning_rate": 3.594075928243638e-05, "loss": 0.3559, "step": 941 }, { "epoch": 1.0596510973550928, "grad_norm": 0.34347700089780575, "learning_rate": 3.591989987484355e-05, "loss": 0.3641, "step": 942 }, { "epoch": 1.060776589758019, "grad_norm": 0.2772476546624268, "learning_rate": 3.5899040467250735e-05, "loss": 0.3433, "step": 943 }, { "epoch": 1.0619020821609455, "grad_norm": 0.36078868188752466, "learning_rate": 3.5878181059657906e-05, "loss": 0.36, "step": 944 }, { "epoch": 1.0630275745638718, "grad_norm": 0.2927763816273808, "learning_rate": 3.585732165206508e-05, "loss": 0.3757, "step": 945 }, { "epoch": 1.064153066966798, "grad_norm": 0.31067799008966573, "learning_rate": 3.583646224447226e-05, "loss": 0.3375, "step": 946 }, { "epoch": 1.0652785593697243, "grad_norm": 0.30786259543828726, "learning_rate": 3.5815602836879437e-05, "loss": 0.3691, "step": 947 }, { "epoch": 1.0664040517726505, "grad_norm": 0.34927488285962766, "learning_rate": 3.579474342928661e-05, "loss": 0.3512, "step": 948 }, { "epoch": 1.0675295441755768, "grad_norm": 0.3134128528998366, "learning_rate": 3.577388402169379e-05, "loss": 0.3684, "step": 949 }, { "epoch": 1.068655036578503, "grad_norm": 0.3684381541500359, "learning_rate": 3.575302461410096e-05, "loss": 0.3635, "step": 950 }, { "epoch": 1.0697805289814293, "grad_norm": 0.3071501276127385, "learning_rate": 3.573216520650814e-05, "loss": 0.3629, "step": 951 }, { "epoch": 1.0709060213843558, "grad_norm": 0.3650935121688607, "learning_rate": 3.5711305798915315e-05, "loss": 0.352, "step": 952 }, { "epoch": 1.072031513787282, "grad_norm": 0.3004157301630184, "learning_rate": 3.569044639132249e-05, "loss": 0.3627, "step": 953 }, { "epoch": 1.0731570061902083, "grad_norm": 0.3588467213474463, "learning_rate": 3.566958698372966e-05, "loss": 0.378, "step": 954 }, { "epoch": 1.0742824985931345, "grad_norm": 0.38695693104692636, "learning_rate": 3.564872757613684e-05, "loss": 0.3558, "step": 955 }, { "epoch": 1.0754079909960608, "grad_norm": 0.30329694533620805, "learning_rate": 3.5627868168544016e-05, "loss": 0.3841, "step": 956 }, { "epoch": 1.076533483398987, "grad_norm": 0.34905611952609783, "learning_rate": 3.5607008760951186e-05, "loss": 0.3689, "step": 957 }, { "epoch": 1.0776589758019133, "grad_norm": 0.28800778538826344, "learning_rate": 3.558614935335837e-05, "loss": 0.3543, "step": 958 }, { "epoch": 1.0787844682048395, "grad_norm": 0.3746527261236155, "learning_rate": 3.556528994576554e-05, "loss": 0.366, "step": 959 }, { "epoch": 1.079909960607766, "grad_norm": 0.32663591501026235, "learning_rate": 3.554443053817272e-05, "loss": 0.3499, "step": 960 }, { "epoch": 1.0810354530106923, "grad_norm": 0.3328189109583666, "learning_rate": 3.5523571130579894e-05, "loss": 0.353, "step": 961 }, { "epoch": 1.0821609454136185, "grad_norm": 0.31964664375303664, "learning_rate": 3.550271172298707e-05, "loss": 0.3672, "step": 962 }, { "epoch": 1.0832864378165448, "grad_norm": 0.36918332363958006, "learning_rate": 3.548185231539424e-05, "loss": 0.3798, "step": 963 }, { "epoch": 1.084411930219471, "grad_norm": 0.3254223917013834, "learning_rate": 3.546099290780142e-05, "loss": 0.3559, "step": 964 }, { "epoch": 1.0855374226223973, "grad_norm": 0.3008814703536633, "learning_rate": 3.5440133500208595e-05, "loss": 0.3609, "step": 965 }, { "epoch": 1.0866629150253235, "grad_norm": 0.35240736109329646, "learning_rate": 3.541927409261577e-05, "loss": 0.3777, "step": 966 }, { "epoch": 1.0877884074282498, "grad_norm": 0.3869312281732699, "learning_rate": 3.539841468502295e-05, "loss": 0.3724, "step": 967 }, { "epoch": 1.088913899831176, "grad_norm": 0.30726021570614737, "learning_rate": 3.5377555277430126e-05, "loss": 0.3531, "step": 968 }, { "epoch": 1.0900393922341025, "grad_norm": 0.34236583353183286, "learning_rate": 3.5356695869837296e-05, "loss": 0.3608, "step": 969 }, { "epoch": 1.0911648846370288, "grad_norm": 0.2916866803109591, "learning_rate": 3.533583646224447e-05, "loss": 0.3624, "step": 970 }, { "epoch": 1.092290377039955, "grad_norm": 0.3145203080926422, "learning_rate": 3.531497705465165e-05, "loss": 0.3684, "step": 971 }, { "epoch": 1.0934158694428813, "grad_norm": 0.2873541218671502, "learning_rate": 3.529411764705883e-05, "loss": 0.3617, "step": 972 }, { "epoch": 1.0945413618458075, "grad_norm": 0.3506652103429166, "learning_rate": 3.5273258239466e-05, "loss": 0.3583, "step": 973 }, { "epoch": 1.0956668542487338, "grad_norm": 0.3025123158669694, "learning_rate": 3.525239883187318e-05, "loss": 0.3472, "step": 974 }, { "epoch": 1.09679234665166, "grad_norm": 0.2899074126357094, "learning_rate": 3.523153942428035e-05, "loss": 0.3675, "step": 975 }, { "epoch": 1.0979178390545863, "grad_norm": 0.3150990472406033, "learning_rate": 3.521068001668753e-05, "loss": 0.3636, "step": 976 }, { "epoch": 1.0990433314575128, "grad_norm": 0.35489391655027186, "learning_rate": 3.5189820609094705e-05, "loss": 0.3384, "step": 977 }, { "epoch": 1.100168823860439, "grad_norm": 0.3041199542435297, "learning_rate": 3.5168961201501876e-05, "loss": 0.3571, "step": 978 }, { "epoch": 1.1012943162633653, "grad_norm": 0.31637443077212757, "learning_rate": 3.514810179390905e-05, "loss": 0.3703, "step": 979 }, { "epoch": 1.1024198086662915, "grad_norm": 0.33113581691565325, "learning_rate": 3.512724238631623e-05, "loss": 0.35, "step": 980 }, { "epoch": 1.1035453010692178, "grad_norm": 0.3300457711599469, "learning_rate": 3.5106382978723407e-05, "loss": 0.3485, "step": 981 }, { "epoch": 1.104670793472144, "grad_norm": 0.37342013448224476, "learning_rate": 3.508552357113058e-05, "loss": 0.3543, "step": 982 }, { "epoch": 1.1057962858750703, "grad_norm": 0.36084265787497494, "learning_rate": 3.506466416353776e-05, "loss": 0.3499, "step": 983 }, { "epoch": 1.1069217782779965, "grad_norm": 0.36650053348727774, "learning_rate": 3.504380475594493e-05, "loss": 0.3727, "step": 984 }, { "epoch": 1.108047270680923, "grad_norm": 0.38335191540233127, "learning_rate": 3.502294534835211e-05, "loss": 0.3557, "step": 985 }, { "epoch": 1.1091727630838493, "grad_norm": 0.36320976195356514, "learning_rate": 3.5002085940759285e-05, "loss": 0.382, "step": 986 }, { "epoch": 1.1102982554867755, "grad_norm": 0.38636958474248506, "learning_rate": 3.498122653316646e-05, "loss": 0.3402, "step": 987 }, { "epoch": 1.1114237478897018, "grad_norm": 0.38017701551768956, "learning_rate": 3.496036712557363e-05, "loss": 0.3742, "step": 988 }, { "epoch": 1.112549240292628, "grad_norm": 0.3198258149962093, "learning_rate": 3.4939507717980816e-05, "loss": 0.3432, "step": 989 }, { "epoch": 1.1136747326955543, "grad_norm": 0.38060186204014107, "learning_rate": 3.4918648310387986e-05, "loss": 0.364, "step": 990 }, { "epoch": 1.1148002250984805, "grad_norm": 0.3522538503310745, "learning_rate": 3.489778890279516e-05, "loss": 0.3862, "step": 991 }, { "epoch": 1.1159257175014068, "grad_norm": 0.34893950721299544, "learning_rate": 3.487692949520234e-05, "loss": 0.3674, "step": 992 }, { "epoch": 1.117051209904333, "grad_norm": 0.3145664530999275, "learning_rate": 3.485607008760952e-05, "loss": 0.3623, "step": 993 }, { "epoch": 1.1181767023072595, "grad_norm": 0.38231007603706296, "learning_rate": 3.483521068001669e-05, "loss": 0.3513, "step": 994 }, { "epoch": 1.1193021947101858, "grad_norm": 0.29574406471189, "learning_rate": 3.481435127242387e-05, "loss": 0.3686, "step": 995 }, { "epoch": 1.120427687113112, "grad_norm": 0.3786384191919254, "learning_rate": 3.479349186483104e-05, "loss": 0.3496, "step": 996 }, { "epoch": 1.1215531795160383, "grad_norm": 0.27933782961377807, "learning_rate": 3.477263245723821e-05, "loss": 0.3865, "step": 997 }, { "epoch": 1.1226786719189645, "grad_norm": 0.3796958540762593, "learning_rate": 3.4751773049645395e-05, "loss": 0.3701, "step": 998 }, { "epoch": 1.1238041643218908, "grad_norm": 0.31019085193512064, "learning_rate": 3.4730913642052565e-05, "loss": 0.3544, "step": 999 }, { "epoch": 1.124929656724817, "grad_norm": 0.3894747761447629, "learning_rate": 3.471005423445974e-05, "loss": 0.3613, "step": 1000 }, { "epoch": 1.1260551491277433, "grad_norm": 0.3848999285142024, "learning_rate": 3.468919482686692e-05, "loss": 0.3572, "step": 1001 }, { "epoch": 1.1271806415306695, "grad_norm": 0.4075083886945119, "learning_rate": 3.4668335419274096e-05, "loss": 0.3534, "step": 1002 }, { "epoch": 1.128306133933596, "grad_norm": 0.4244922841249029, "learning_rate": 3.4647476011681266e-05, "loss": 0.3857, "step": 1003 }, { "epoch": 1.1294316263365223, "grad_norm": 0.3575947287049676, "learning_rate": 3.462661660408844e-05, "loss": 0.3494, "step": 1004 }, { "epoch": 1.1305571187394485, "grad_norm": 0.3920246518678635, "learning_rate": 3.460575719649562e-05, "loss": 0.3693, "step": 1005 }, { "epoch": 1.1316826111423748, "grad_norm": 0.3065280136400847, "learning_rate": 3.45848977889028e-05, "loss": 0.3352, "step": 1006 }, { "epoch": 1.132808103545301, "grad_norm": 0.38525744406438595, "learning_rate": 3.4564038381309974e-05, "loss": 0.353, "step": 1007 }, { "epoch": 1.1339335959482273, "grad_norm": 0.47272322177864035, "learning_rate": 3.454317897371715e-05, "loss": 0.3673, "step": 1008 }, { "epoch": 1.1350590883511535, "grad_norm": 0.3327944075995892, "learning_rate": 3.452231956612432e-05, "loss": 0.3523, "step": 1009 }, { "epoch": 1.13618458075408, "grad_norm": 0.42906579303424525, "learning_rate": 3.45014601585315e-05, "loss": 0.3577, "step": 1010 }, { "epoch": 1.1373100731570063, "grad_norm": 0.31630743768076713, "learning_rate": 3.4480600750938675e-05, "loss": 0.3571, "step": 1011 }, { "epoch": 1.1384355655599325, "grad_norm": 0.41005007736044136, "learning_rate": 3.445974134334585e-05, "loss": 0.362, "step": 1012 }, { "epoch": 1.1395610579628588, "grad_norm": 0.3846148750924408, "learning_rate": 3.443888193575302e-05, "loss": 0.3554, "step": 1013 }, { "epoch": 1.140686550365785, "grad_norm": 0.39499988480138304, "learning_rate": 3.4418022528160206e-05, "loss": 0.367, "step": 1014 }, { "epoch": 1.1418120427687113, "grad_norm": 0.35657946077097175, "learning_rate": 3.4397163120567377e-05, "loss": 0.3694, "step": 1015 }, { "epoch": 1.1429375351716375, "grad_norm": 0.3728438143327632, "learning_rate": 3.4376303712974554e-05, "loss": 0.3713, "step": 1016 }, { "epoch": 1.1440630275745638, "grad_norm": 0.34659822653002426, "learning_rate": 3.435544430538173e-05, "loss": 0.3584, "step": 1017 }, { "epoch": 1.14518851997749, "grad_norm": 0.3828982028856398, "learning_rate": 3.43345848977889e-05, "loss": 0.357, "step": 1018 }, { "epoch": 1.1463140123804165, "grad_norm": 0.35840428604352054, "learning_rate": 3.431372549019608e-05, "loss": 0.3658, "step": 1019 }, { "epoch": 1.1474395047833428, "grad_norm": 0.3642341763560189, "learning_rate": 3.4292866082603255e-05, "loss": 0.3768, "step": 1020 }, { "epoch": 1.148564997186269, "grad_norm": 0.47028026081900165, "learning_rate": 3.427200667501043e-05, "loss": 0.3448, "step": 1021 }, { "epoch": 1.1496904895891953, "grad_norm": 0.33137638092807364, "learning_rate": 3.42511472674176e-05, "loss": 0.3855, "step": 1022 }, { "epoch": 1.1508159819921215, "grad_norm": 0.4049631157313659, "learning_rate": 3.4230287859824786e-05, "loss": 0.3801, "step": 1023 }, { "epoch": 1.1519414743950478, "grad_norm": 0.3829633936239526, "learning_rate": 3.4209428452231956e-05, "loss": 0.3791, "step": 1024 }, { "epoch": 1.153066966797974, "grad_norm": 0.42759635786809663, "learning_rate": 3.418856904463913e-05, "loss": 0.3676, "step": 1025 }, { "epoch": 1.1541924592009003, "grad_norm": 0.3728776125817692, "learning_rate": 3.416770963704631e-05, "loss": 0.3622, "step": 1026 }, { "epoch": 1.1553179516038266, "grad_norm": 0.39380341402257635, "learning_rate": 3.414685022945349e-05, "loss": 0.3785, "step": 1027 }, { "epoch": 1.156443444006753, "grad_norm": 0.32076593702973827, "learning_rate": 3.412599082186066e-05, "loss": 0.3745, "step": 1028 }, { "epoch": 1.1575689364096793, "grad_norm": 0.32908758752319733, "learning_rate": 3.410513141426784e-05, "loss": 0.3496, "step": 1029 }, { "epoch": 1.1586944288126055, "grad_norm": 0.41768970871312155, "learning_rate": 3.408427200667501e-05, "loss": 0.3575, "step": 1030 }, { "epoch": 1.1598199212155318, "grad_norm": 0.3106359891104045, "learning_rate": 3.406341259908219e-05, "loss": 0.3516, "step": 1031 }, { "epoch": 1.160945413618458, "grad_norm": 0.3870701068020313, "learning_rate": 3.4042553191489365e-05, "loss": 0.3356, "step": 1032 }, { "epoch": 1.1620709060213843, "grad_norm": 0.38611106269123546, "learning_rate": 3.402169378389654e-05, "loss": 0.3469, "step": 1033 }, { "epoch": 1.1631963984243106, "grad_norm": 0.3255124156021805, "learning_rate": 3.400083437630371e-05, "loss": 0.3722, "step": 1034 }, { "epoch": 1.164321890827237, "grad_norm": 0.32836642792719567, "learning_rate": 3.3979974968710896e-05, "loss": 0.3544, "step": 1035 }, { "epoch": 1.1654473832301633, "grad_norm": 0.3805911934596958, "learning_rate": 3.3959115561118066e-05, "loss": 0.3982, "step": 1036 }, { "epoch": 1.1665728756330895, "grad_norm": 0.3368162160417577, "learning_rate": 3.393825615352524e-05, "loss": 0.3679, "step": 1037 }, { "epoch": 1.1676983680360158, "grad_norm": 0.31363563073754847, "learning_rate": 3.391739674593242e-05, "loss": 0.3529, "step": 1038 }, { "epoch": 1.168823860438942, "grad_norm": 0.34006739877010494, "learning_rate": 3.389653733833959e-05, "loss": 0.3463, "step": 1039 }, { "epoch": 1.1699493528418683, "grad_norm": 0.3100061821836274, "learning_rate": 3.387567793074677e-05, "loss": 0.3381, "step": 1040 }, { "epoch": 1.1710748452447945, "grad_norm": 0.3065807803890228, "learning_rate": 3.3854818523153944e-05, "loss": 0.3651, "step": 1041 }, { "epoch": 1.1722003376477208, "grad_norm": 0.32611882573130585, "learning_rate": 3.383395911556112e-05, "loss": 0.3529, "step": 1042 }, { "epoch": 1.173325830050647, "grad_norm": 0.28895452201759864, "learning_rate": 3.381309970796829e-05, "loss": 0.3307, "step": 1043 }, { "epoch": 1.1744513224535735, "grad_norm": 0.31616663311663623, "learning_rate": 3.379224030037547e-05, "loss": 0.3615, "step": 1044 }, { "epoch": 1.1755768148564998, "grad_norm": 0.2999011173077538, "learning_rate": 3.3771380892782645e-05, "loss": 0.3527, "step": 1045 }, { "epoch": 1.176702307259426, "grad_norm": 0.28604936736274933, "learning_rate": 3.375052148518982e-05, "loss": 0.361, "step": 1046 }, { "epoch": 1.1778277996623523, "grad_norm": 0.3028269137775988, "learning_rate": 3.3729662077597e-05, "loss": 0.3668, "step": 1047 }, { "epoch": 1.1789532920652785, "grad_norm": 0.36698195409495143, "learning_rate": 3.3708802670004176e-05, "loss": 0.352, "step": 1048 }, { "epoch": 1.1800787844682048, "grad_norm": 0.2951939270230831, "learning_rate": 3.3687943262411347e-05, "loss": 0.3533, "step": 1049 }, { "epoch": 1.181204276871131, "grad_norm": 0.4064761843327334, "learning_rate": 3.3667083854818524e-05, "loss": 0.3601, "step": 1050 }, { "epoch": 1.1823297692740573, "grad_norm": 0.325934767924338, "learning_rate": 3.36462244472257e-05, "loss": 0.366, "step": 1051 }, { "epoch": 1.1834552616769836, "grad_norm": 0.3444374492643726, "learning_rate": 3.362536503963288e-05, "loss": 0.3591, "step": 1052 }, { "epoch": 1.18458075407991, "grad_norm": 0.3902013079098464, "learning_rate": 3.360450563204005e-05, "loss": 0.3609, "step": 1053 }, { "epoch": 1.1857062464828363, "grad_norm": 0.3552567977795283, "learning_rate": 3.358364622444723e-05, "loss": 0.3824, "step": 1054 }, { "epoch": 1.1868317388857625, "grad_norm": 0.5473634143542325, "learning_rate": 3.35627868168544e-05, "loss": 0.344, "step": 1055 }, { "epoch": 1.1879572312886888, "grad_norm": 0.31822857141954713, "learning_rate": 3.354192740926158e-05, "loss": 0.34, "step": 1056 }, { "epoch": 1.189082723691615, "grad_norm": 0.35648383062484057, "learning_rate": 3.3521068001668756e-05, "loss": 0.3664, "step": 1057 }, { "epoch": 1.1902082160945413, "grad_norm": 0.3533726981414865, "learning_rate": 3.350020859407593e-05, "loss": 0.3643, "step": 1058 }, { "epoch": 1.1913337084974676, "grad_norm": 0.38846901904691766, "learning_rate": 3.34793491864831e-05, "loss": 0.364, "step": 1059 }, { "epoch": 1.192459200900394, "grad_norm": 0.32829805282614477, "learning_rate": 3.345848977889028e-05, "loss": 0.3505, "step": 1060 }, { "epoch": 1.1935846933033203, "grad_norm": 0.3371243132688832, "learning_rate": 3.343763037129746e-05, "loss": 0.3706, "step": 1061 }, { "epoch": 1.1947101857062465, "grad_norm": 0.29390329610439453, "learning_rate": 3.341677096370463e-05, "loss": 0.3513, "step": 1062 }, { "epoch": 1.1958356781091728, "grad_norm": 0.3589333659631211, "learning_rate": 3.339591155611181e-05, "loss": 0.364, "step": 1063 }, { "epoch": 1.196961170512099, "grad_norm": 0.3025901807833534, "learning_rate": 3.337505214851898e-05, "loss": 0.3716, "step": 1064 }, { "epoch": 1.1980866629150253, "grad_norm": 0.2990903113895738, "learning_rate": 3.335419274092616e-05, "loss": 0.3703, "step": 1065 }, { "epoch": 1.1992121553179516, "grad_norm": 0.3084522992492389, "learning_rate": 3.3333333333333335e-05, "loss": 0.3423, "step": 1066 }, { "epoch": 1.2003376477208778, "grad_norm": 0.2833543979726358, "learning_rate": 3.331247392574051e-05, "loss": 0.3794, "step": 1067 }, { "epoch": 1.201463140123804, "grad_norm": 0.3502254927161911, "learning_rate": 3.329161451814768e-05, "loss": 0.3588, "step": 1068 }, { "epoch": 1.2025886325267305, "grad_norm": 0.2824861573083505, "learning_rate": 3.3270755110554866e-05, "loss": 0.3375, "step": 1069 }, { "epoch": 1.2037141249296568, "grad_norm": 0.32275485870283527, "learning_rate": 3.3249895702962036e-05, "loss": 0.3659, "step": 1070 }, { "epoch": 1.204839617332583, "grad_norm": 0.2831771258197277, "learning_rate": 3.322903629536921e-05, "loss": 0.3608, "step": 1071 }, { "epoch": 1.2059651097355093, "grad_norm": 0.4099026461053303, "learning_rate": 3.320817688777639e-05, "loss": 0.3657, "step": 1072 }, { "epoch": 1.2070906021384356, "grad_norm": 0.2988459528156424, "learning_rate": 3.318731748018357e-05, "loss": 0.3612, "step": 1073 }, { "epoch": 1.2082160945413618, "grad_norm": 0.3285103143387034, "learning_rate": 3.316645807259074e-05, "loss": 0.3243, "step": 1074 }, { "epoch": 1.209341586944288, "grad_norm": 0.3140866233456728, "learning_rate": 3.314559866499792e-05, "loss": 0.3605, "step": 1075 }, { "epoch": 1.2104670793472143, "grad_norm": 0.3136304362130377, "learning_rate": 3.312473925740509e-05, "loss": 0.3518, "step": 1076 }, { "epoch": 1.2115925717501406, "grad_norm": 0.37704920383565566, "learning_rate": 3.310387984981227e-05, "loss": 0.3505, "step": 1077 }, { "epoch": 1.212718064153067, "grad_norm": 0.3386484501892276, "learning_rate": 3.3083020442219445e-05, "loss": 0.3581, "step": 1078 }, { "epoch": 1.2138435565559933, "grad_norm": 0.2756287445653643, "learning_rate": 3.306216103462662e-05, "loss": 0.3508, "step": 1079 }, { "epoch": 1.2149690489589196, "grad_norm": 0.3727446701400803, "learning_rate": 3.304130162703379e-05, "loss": 0.3637, "step": 1080 }, { "epoch": 1.2160945413618458, "grad_norm": 0.34205106067470487, "learning_rate": 3.302044221944097e-05, "loss": 0.3719, "step": 1081 }, { "epoch": 1.217220033764772, "grad_norm": 0.277943785807938, "learning_rate": 3.2999582811848146e-05, "loss": 0.3663, "step": 1082 }, { "epoch": 1.2183455261676983, "grad_norm": 0.32778887865165535, "learning_rate": 3.2978723404255317e-05, "loss": 0.3788, "step": 1083 }, { "epoch": 1.2194710185706246, "grad_norm": 0.35850613973050943, "learning_rate": 3.2957863996662494e-05, "loss": 0.3516, "step": 1084 }, { "epoch": 1.220596510973551, "grad_norm": 0.32265446214334986, "learning_rate": 3.293700458906967e-05, "loss": 0.3637, "step": 1085 }, { "epoch": 1.2217220033764773, "grad_norm": 0.32337082624436203, "learning_rate": 3.291614518147685e-05, "loss": 0.353, "step": 1086 }, { "epoch": 1.2228474957794035, "grad_norm": 0.3566976026538077, "learning_rate": 3.2895285773884024e-05, "loss": 0.3702, "step": 1087 }, { "epoch": 1.2239729881823298, "grad_norm": 0.3602820801303339, "learning_rate": 3.28744263662912e-05, "loss": 0.3478, "step": 1088 }, { "epoch": 1.225098480585256, "grad_norm": 0.3167430696040855, "learning_rate": 3.285356695869837e-05, "loss": 0.3659, "step": 1089 }, { "epoch": 1.2262239729881823, "grad_norm": 0.3018055329469023, "learning_rate": 3.283270755110555e-05, "loss": 0.3588, "step": 1090 }, { "epoch": 1.2273494653911086, "grad_norm": 0.3230175863991661, "learning_rate": 3.2811848143512726e-05, "loss": 0.3843, "step": 1091 }, { "epoch": 1.2284749577940348, "grad_norm": 0.33437635395778137, "learning_rate": 3.27909887359199e-05, "loss": 0.3767, "step": 1092 }, { "epoch": 1.229600450196961, "grad_norm": 0.2981951069574055, "learning_rate": 3.277012932832707e-05, "loss": 0.3595, "step": 1093 }, { "epoch": 1.2307259425998875, "grad_norm": 0.3380111260459277, "learning_rate": 3.2749269920734257e-05, "loss": 0.3709, "step": 1094 }, { "epoch": 1.2318514350028138, "grad_norm": 0.3105746941727841, "learning_rate": 3.272841051314143e-05, "loss": 0.3683, "step": 1095 }, { "epoch": 1.23297692740574, "grad_norm": 0.30933018515689015, "learning_rate": 3.2707551105548604e-05, "loss": 0.3592, "step": 1096 }, { "epoch": 1.2341024198086663, "grad_norm": 0.3281758704709362, "learning_rate": 3.268669169795578e-05, "loss": 0.3606, "step": 1097 }, { "epoch": 1.2352279122115926, "grad_norm": 0.27019773248184603, "learning_rate": 3.266583229036296e-05, "loss": 0.3761, "step": 1098 }, { "epoch": 1.2363534046145188, "grad_norm": 0.3336597777910512, "learning_rate": 3.264497288277013e-05, "loss": 0.3959, "step": 1099 }, { "epoch": 1.237478897017445, "grad_norm": 0.31907940795687006, "learning_rate": 3.262411347517731e-05, "loss": 0.3615, "step": 1100 }, { "epoch": 1.2386043894203713, "grad_norm": 0.3307050654215005, "learning_rate": 3.260325406758448e-05, "loss": 0.3705, "step": 1101 }, { "epoch": 1.2397298818232976, "grad_norm": 0.31539392998297533, "learning_rate": 3.258239465999165e-05, "loss": 0.3495, "step": 1102 }, { "epoch": 1.240855374226224, "grad_norm": 0.33795761226174353, "learning_rate": 3.2561535252398836e-05, "loss": 0.3268, "step": 1103 }, { "epoch": 1.2419808666291503, "grad_norm": 0.3027820655749601, "learning_rate": 3.2540675844806006e-05, "loss": 0.3408, "step": 1104 }, { "epoch": 1.2431063590320766, "grad_norm": 0.3679583282318403, "learning_rate": 3.251981643721318e-05, "loss": 0.3677, "step": 1105 }, { "epoch": 1.2442318514350028, "grad_norm": 0.3613263755121883, "learning_rate": 3.249895702962036e-05, "loss": 0.3829, "step": 1106 }, { "epoch": 1.245357343837929, "grad_norm": 0.24750292423243272, "learning_rate": 3.247809762202754e-05, "loss": 0.3482, "step": 1107 }, { "epoch": 1.2464828362408553, "grad_norm": 0.3522480210531916, "learning_rate": 3.245723821443471e-05, "loss": 0.3625, "step": 1108 }, { "epoch": 1.2476083286437816, "grad_norm": 0.31652713432842655, "learning_rate": 3.243637880684189e-05, "loss": 0.3671, "step": 1109 }, { "epoch": 1.248733821046708, "grad_norm": 0.2933496822923214, "learning_rate": 3.241551939924906e-05, "loss": 0.3497, "step": 1110 }, { "epoch": 1.2498593134496343, "grad_norm": 0.32050895251241057, "learning_rate": 3.239465999165624e-05, "loss": 0.3716, "step": 1111 }, { "epoch": 1.2509848058525606, "grad_norm": 0.3121935413715743, "learning_rate": 3.2373800584063415e-05, "loss": 0.3771, "step": 1112 }, { "epoch": 1.2521102982554868, "grad_norm": 0.32265588805772627, "learning_rate": 3.235294117647059e-05, "loss": 0.3439, "step": 1113 }, { "epoch": 1.253235790658413, "grad_norm": 0.3064712619565091, "learning_rate": 3.233208176887776e-05, "loss": 0.3546, "step": 1114 }, { "epoch": 1.2543612830613393, "grad_norm": 0.2996084699077036, "learning_rate": 3.2311222361284946e-05, "loss": 0.3833, "step": 1115 }, { "epoch": 1.2554867754642656, "grad_norm": 0.3107489193677045, "learning_rate": 3.2290362953692116e-05, "loss": 0.364, "step": 1116 }, { "epoch": 1.2566122678671918, "grad_norm": 0.29187918061969403, "learning_rate": 3.226950354609929e-05, "loss": 0.3504, "step": 1117 }, { "epoch": 1.257737760270118, "grad_norm": 0.2941379965996245, "learning_rate": 3.224864413850647e-05, "loss": 0.3392, "step": 1118 }, { "epoch": 1.2588632526730446, "grad_norm": 0.25115923486308955, "learning_rate": 3.222778473091365e-05, "loss": 0.3876, "step": 1119 }, { "epoch": 1.2599887450759708, "grad_norm": 0.3252010811279875, "learning_rate": 3.220692532332082e-05, "loss": 0.3375, "step": 1120 }, { "epoch": 1.261114237478897, "grad_norm": 0.29814156629055977, "learning_rate": 3.2186065915727994e-05, "loss": 0.3401, "step": 1121 }, { "epoch": 1.2622397298818233, "grad_norm": 0.31902570430326976, "learning_rate": 3.216520650813517e-05, "loss": 0.3732, "step": 1122 }, { "epoch": 1.2633652222847496, "grad_norm": 0.3010703802720578, "learning_rate": 3.214434710054234e-05, "loss": 0.358, "step": 1123 }, { "epoch": 1.2644907146876758, "grad_norm": 0.32852710550779146, "learning_rate": 3.2123487692949525e-05, "loss": 0.3525, "step": 1124 }, { "epoch": 1.265616207090602, "grad_norm": 0.32212180119638056, "learning_rate": 3.2102628285356696e-05, "loss": 0.3749, "step": 1125 }, { "epoch": 1.2667416994935286, "grad_norm": 0.378384113691716, "learning_rate": 3.208176887776387e-05, "loss": 0.3669, "step": 1126 }, { "epoch": 1.2678671918964546, "grad_norm": 0.31165403587755924, "learning_rate": 3.206090947017105e-05, "loss": 0.3559, "step": 1127 }, { "epoch": 1.268992684299381, "grad_norm": 0.3679615615830758, "learning_rate": 3.2040050062578227e-05, "loss": 0.3623, "step": 1128 }, { "epoch": 1.2701181767023073, "grad_norm": 0.3467806488910905, "learning_rate": 3.20191906549854e-05, "loss": 0.3771, "step": 1129 }, { "epoch": 1.2712436691052336, "grad_norm": 1.9333645104311041, "learning_rate": 3.1998331247392574e-05, "loss": 0.3809, "step": 1130 }, { "epoch": 1.2723691615081598, "grad_norm": 0.4215069325465578, "learning_rate": 3.197747183979975e-05, "loss": 0.3489, "step": 1131 }, { "epoch": 1.273494653911086, "grad_norm": 0.2879811482225369, "learning_rate": 3.195661243220693e-05, "loss": 0.3627, "step": 1132 }, { "epoch": 1.2746201463140123, "grad_norm": 0.4477759704739148, "learning_rate": 3.19357530246141e-05, "loss": 0.3623, "step": 1133 }, { "epoch": 1.2757456387169386, "grad_norm": 0.3424164269682256, "learning_rate": 3.191489361702128e-05, "loss": 0.3476, "step": 1134 }, { "epoch": 1.276871131119865, "grad_norm": 0.32862691867356353, "learning_rate": 3.189403420942845e-05, "loss": 0.3649, "step": 1135 }, { "epoch": 1.277996623522791, "grad_norm": 0.3209270264744574, "learning_rate": 3.187317480183563e-05, "loss": 0.3535, "step": 1136 }, { "epoch": 1.2791221159257176, "grad_norm": 0.3565891148820592, "learning_rate": 3.1852315394242806e-05, "loss": 0.3443, "step": 1137 }, { "epoch": 1.2802476083286438, "grad_norm": 0.28408074419058515, "learning_rate": 3.183145598664998e-05, "loss": 0.369, "step": 1138 }, { "epoch": 1.28137310073157, "grad_norm": 0.3637840011075196, "learning_rate": 3.181059657905715e-05, "loss": 0.3608, "step": 1139 }, { "epoch": 1.2824985931344963, "grad_norm": 0.3595209908718878, "learning_rate": 3.178973717146434e-05, "loss": 0.3493, "step": 1140 }, { "epoch": 1.2836240855374226, "grad_norm": 0.26496883846043384, "learning_rate": 3.176887776387151e-05, "loss": 0.3633, "step": 1141 }, { "epoch": 1.2847495779403488, "grad_norm": 0.3336179309407727, "learning_rate": 3.1748018356278684e-05, "loss": 0.3662, "step": 1142 }, { "epoch": 1.285875070343275, "grad_norm": 0.32668676414933834, "learning_rate": 3.172715894868586e-05, "loss": 0.3671, "step": 1143 }, { "epoch": 1.2870005627462016, "grad_norm": 0.31252062188747054, "learning_rate": 3.170629954109303e-05, "loss": 0.3647, "step": 1144 }, { "epoch": 1.2881260551491278, "grad_norm": 0.31744497936057164, "learning_rate": 3.168544013350021e-05, "loss": 0.3622, "step": 1145 }, { "epoch": 1.289251547552054, "grad_norm": 0.2862050055745393, "learning_rate": 3.1664580725907385e-05, "loss": 0.3883, "step": 1146 }, { "epoch": 1.2903770399549803, "grad_norm": 0.30021118499678395, "learning_rate": 3.164372131831456e-05, "loss": 0.3579, "step": 1147 }, { "epoch": 1.2915025323579066, "grad_norm": 0.2910467656286127, "learning_rate": 3.162286191072173e-05, "loss": 0.3534, "step": 1148 }, { "epoch": 1.2926280247608328, "grad_norm": 0.28678455388133556, "learning_rate": 3.1602002503128916e-05, "loss": 0.3497, "step": 1149 }, { "epoch": 1.293753517163759, "grad_norm": 0.27836486011517614, "learning_rate": 3.1581143095536086e-05, "loss": 0.3443, "step": 1150 }, { "epoch": 1.2948790095666856, "grad_norm": 0.30812952315806486, "learning_rate": 3.156028368794326e-05, "loss": 0.3893, "step": 1151 }, { "epoch": 1.2960045019696116, "grad_norm": 0.2874885069684301, "learning_rate": 3.153942428035044e-05, "loss": 0.3685, "step": 1152 }, { "epoch": 1.297129994372538, "grad_norm": 0.3347706854010768, "learning_rate": 3.151856487275762e-05, "loss": 0.3927, "step": 1153 }, { "epoch": 1.2982554867754643, "grad_norm": 0.32176469835749927, "learning_rate": 3.149770546516479e-05, "loss": 0.38, "step": 1154 }, { "epoch": 1.2993809791783906, "grad_norm": 0.2898256632538439, "learning_rate": 3.147684605757197e-05, "loss": 0.3616, "step": 1155 }, { "epoch": 1.3005064715813168, "grad_norm": 0.33352792276895776, "learning_rate": 3.145598664997914e-05, "loss": 0.3748, "step": 1156 }, { "epoch": 1.301631963984243, "grad_norm": 0.3099488701941323, "learning_rate": 3.143512724238632e-05, "loss": 0.372, "step": 1157 }, { "epoch": 1.3027574563871693, "grad_norm": 0.30363398725151736, "learning_rate": 3.1414267834793495e-05, "loss": 0.3701, "step": 1158 }, { "epoch": 1.3038829487900956, "grad_norm": 0.28447681088914367, "learning_rate": 3.139340842720067e-05, "loss": 0.3774, "step": 1159 }, { "epoch": 1.305008441193022, "grad_norm": 0.3003448198337203, "learning_rate": 3.137254901960784e-05, "loss": 0.3598, "step": 1160 }, { "epoch": 1.306133933595948, "grad_norm": 0.3379084483923677, "learning_rate": 3.135168961201502e-05, "loss": 0.3708, "step": 1161 }, { "epoch": 1.3072594259988746, "grad_norm": 0.28091894310377574, "learning_rate": 3.13308302044222e-05, "loss": 0.3718, "step": 1162 }, { "epoch": 1.3083849184018008, "grad_norm": 0.33666696805419777, "learning_rate": 3.1309970796829374e-05, "loss": 0.3521, "step": 1163 }, { "epoch": 1.309510410804727, "grad_norm": 0.2784271381389026, "learning_rate": 3.128911138923655e-05, "loss": 0.365, "step": 1164 }, { "epoch": 1.3106359032076533, "grad_norm": 0.32996125555463496, "learning_rate": 3.126825198164372e-05, "loss": 0.3663, "step": 1165 }, { "epoch": 1.3117613956105796, "grad_norm": 0.29339874231665497, "learning_rate": 3.12473925740509e-05, "loss": 0.3579, "step": 1166 }, { "epoch": 1.3128868880135058, "grad_norm": 0.27539689603551204, "learning_rate": 3.1226533166458075e-05, "loss": 0.3428, "step": 1167 }, { "epoch": 1.314012380416432, "grad_norm": 0.30586051604779685, "learning_rate": 3.120567375886525e-05, "loss": 0.3742, "step": 1168 }, { "epoch": 1.3151378728193586, "grad_norm": 0.318506663490862, "learning_rate": 3.118481435127242e-05, "loss": 0.3555, "step": 1169 }, { "epoch": 1.3162633652222848, "grad_norm": 0.32193633165736774, "learning_rate": 3.11639549436796e-05, "loss": 0.3798, "step": 1170 }, { "epoch": 1.317388857625211, "grad_norm": 0.26006223637970205, "learning_rate": 3.1143095536086776e-05, "loss": 0.3688, "step": 1171 }, { "epoch": 1.3185143500281373, "grad_norm": 0.3091644393869938, "learning_rate": 3.112223612849395e-05, "loss": 0.344, "step": 1172 }, { "epoch": 1.3196398424310636, "grad_norm": 0.2950119209666744, "learning_rate": 3.110137672090112e-05, "loss": 0.3807, "step": 1173 }, { "epoch": 1.3207653348339898, "grad_norm": 0.2975497460062189, "learning_rate": 3.108051731330831e-05, "loss": 0.3581, "step": 1174 }, { "epoch": 1.321890827236916, "grad_norm": 0.3248317526622501, "learning_rate": 3.105965790571548e-05, "loss": 0.3455, "step": 1175 }, { "epoch": 1.3230163196398426, "grad_norm": 0.28753503994393237, "learning_rate": 3.1038798498122654e-05, "loss": 0.3526, "step": 1176 }, { "epoch": 1.3241418120427686, "grad_norm": 0.309321520103074, "learning_rate": 3.101793909052983e-05, "loss": 0.3671, "step": 1177 }, { "epoch": 1.325267304445695, "grad_norm": 0.31093843252643993, "learning_rate": 3.099707968293701e-05, "loss": 0.3797, "step": 1178 }, { "epoch": 1.3263927968486213, "grad_norm": 0.2941320554481767, "learning_rate": 3.097622027534418e-05, "loss": 0.3656, "step": 1179 }, { "epoch": 1.3275182892515476, "grad_norm": 0.33353760439258306, "learning_rate": 3.095536086775136e-05, "loss": 0.3659, "step": 1180 }, { "epoch": 1.3286437816544738, "grad_norm": 0.2569769588199655, "learning_rate": 3.093450146015853e-05, "loss": 0.3842, "step": 1181 }, { "epoch": 1.3297692740574, "grad_norm": 0.3330169931726158, "learning_rate": 3.091364205256571e-05, "loss": 0.3643, "step": 1182 }, { "epoch": 1.3308947664603263, "grad_norm": 0.2876950022849873, "learning_rate": 3.0892782644972886e-05, "loss": 0.357, "step": 1183 }, { "epoch": 1.3320202588632526, "grad_norm": 0.31915621662192034, "learning_rate": 3.087192323738006e-05, "loss": 0.3817, "step": 1184 }, { "epoch": 1.333145751266179, "grad_norm": 0.31039603557721346, "learning_rate": 3.085106382978723e-05, "loss": 0.3705, "step": 1185 }, { "epoch": 1.334271243669105, "grad_norm": 0.35493105743167, "learning_rate": 3.083020442219441e-05, "loss": 0.375, "step": 1186 }, { "epoch": 1.3353967360720316, "grad_norm": 0.2887229895411605, "learning_rate": 3.080934501460159e-05, "loss": 0.3705, "step": 1187 }, { "epoch": 1.3365222284749578, "grad_norm": 0.33105501034844814, "learning_rate": 3.078848560700876e-05, "loss": 0.3517, "step": 1188 }, { "epoch": 1.337647720877884, "grad_norm": 0.296938807012652, "learning_rate": 3.076762619941594e-05, "loss": 0.3569, "step": 1189 }, { "epoch": 1.3387732132808103, "grad_norm": 0.2631825890999598, "learning_rate": 3.074676679182311e-05, "loss": 0.3719, "step": 1190 }, { "epoch": 1.3398987056837366, "grad_norm": 0.3302060927278823, "learning_rate": 3.072590738423029e-05, "loss": 0.357, "step": 1191 }, { "epoch": 1.3410241980866628, "grad_norm": 0.284964127664028, "learning_rate": 3.0705047976637465e-05, "loss": 0.3637, "step": 1192 }, { "epoch": 1.342149690489589, "grad_norm": 0.3450486482153124, "learning_rate": 3.068418856904464e-05, "loss": 0.3545, "step": 1193 }, { "epoch": 1.3432751828925156, "grad_norm": 0.3097272198664404, "learning_rate": 3.066332916145181e-05, "loss": 0.3615, "step": 1194 }, { "epoch": 1.3444006752954418, "grad_norm": 0.31428949130893125, "learning_rate": 3.0642469753858996e-05, "loss": 0.3577, "step": 1195 }, { "epoch": 1.345526167698368, "grad_norm": 0.3459630485656923, "learning_rate": 3.062161034626617e-05, "loss": 0.38, "step": 1196 }, { "epoch": 1.3466516601012943, "grad_norm": 0.34840227455144135, "learning_rate": 3.0600750938673344e-05, "loss": 0.3731, "step": 1197 }, { "epoch": 1.3477771525042206, "grad_norm": 0.36198967619880806, "learning_rate": 3.057989153108052e-05, "loss": 0.3637, "step": 1198 }, { "epoch": 1.3489026449071468, "grad_norm": 0.36980762481338214, "learning_rate": 3.05590321234877e-05, "loss": 0.3545, "step": 1199 }, { "epoch": 1.350028137310073, "grad_norm": 0.34738553940185973, "learning_rate": 3.053817271589487e-05, "loss": 0.358, "step": 1200 }, { "epoch": 1.3511536297129996, "grad_norm": 0.28899221690961746, "learning_rate": 3.0517313308302048e-05, "loss": 0.3692, "step": 1201 }, { "epoch": 1.3522791221159256, "grad_norm": 0.348414484092682, "learning_rate": 3.0496453900709222e-05, "loss": 0.3562, "step": 1202 }, { "epoch": 1.353404614518852, "grad_norm": 0.28821763744716605, "learning_rate": 3.04755944931164e-05, "loss": 0.3623, "step": 1203 }, { "epoch": 1.3545301069217783, "grad_norm": 0.34701713808150375, "learning_rate": 3.0454735085523572e-05, "loss": 0.3667, "step": 1204 }, { "epoch": 1.3556555993247046, "grad_norm": 0.34731368442368854, "learning_rate": 3.0433875677930746e-05, "loss": 0.3767, "step": 1205 }, { "epoch": 1.3567810917276308, "grad_norm": 0.2873367344993991, "learning_rate": 3.0413016270337923e-05, "loss": 0.3594, "step": 1206 }, { "epoch": 1.357906584130557, "grad_norm": 0.33455871764963324, "learning_rate": 3.0392156862745097e-05, "loss": 0.3649, "step": 1207 }, { "epoch": 1.3590320765334833, "grad_norm": 0.3444401979121362, "learning_rate": 3.0371297455152277e-05, "loss": 0.3656, "step": 1208 }, { "epoch": 1.3601575689364096, "grad_norm": 0.2899964778052406, "learning_rate": 3.0350438047559447e-05, "loss": 0.3457, "step": 1209 }, { "epoch": 1.361283061339336, "grad_norm": 0.33907151317470086, "learning_rate": 3.0329578639966627e-05, "loss": 0.3627, "step": 1210 }, { "epoch": 1.362408553742262, "grad_norm": 0.3044719983267248, "learning_rate": 3.03087192323738e-05, "loss": 0.3654, "step": 1211 }, { "epoch": 1.3635340461451886, "grad_norm": 0.3161562860694256, "learning_rate": 3.0287859824780978e-05, "loss": 0.342, "step": 1212 }, { "epoch": 1.3646595385481148, "grad_norm": 0.3068022604762919, "learning_rate": 3.026700041718815e-05, "loss": 0.3539, "step": 1213 }, { "epoch": 1.365785030951041, "grad_norm": 0.2850677267934718, "learning_rate": 3.024614100959533e-05, "loss": 0.3725, "step": 1214 }, { "epoch": 1.3669105233539673, "grad_norm": 0.3160554970609396, "learning_rate": 3.0225281602002502e-05, "loss": 0.3868, "step": 1215 }, { "epoch": 1.3680360157568936, "grad_norm": 0.3373572652606873, "learning_rate": 3.0204422194409683e-05, "loss": 0.3711, "step": 1216 }, { "epoch": 1.3691615081598199, "grad_norm": 0.27352074329674897, "learning_rate": 3.0183562786816856e-05, "loss": 0.3593, "step": 1217 }, { "epoch": 1.370287000562746, "grad_norm": 0.32866686300985715, "learning_rate": 3.0162703379224033e-05, "loss": 0.3547, "step": 1218 }, { "epoch": 1.3714124929656726, "grad_norm": 0.28785334578687116, "learning_rate": 3.0141843971631207e-05, "loss": 0.3703, "step": 1219 }, { "epoch": 1.3725379853685988, "grad_norm": 0.3336610653280944, "learning_rate": 3.0120984564038384e-05, "loss": 0.3594, "step": 1220 }, { "epoch": 1.373663477771525, "grad_norm": 0.3252899566864213, "learning_rate": 3.0100125156445557e-05, "loss": 0.3666, "step": 1221 }, { "epoch": 1.3747889701744513, "grad_norm": 0.34272700759634595, "learning_rate": 3.0079265748852738e-05, "loss": 0.3673, "step": 1222 }, { "epoch": 1.3759144625773776, "grad_norm": 0.2839456991740852, "learning_rate": 3.0058406341259908e-05, "loss": 0.3731, "step": 1223 }, { "epoch": 1.3770399549803038, "grad_norm": 0.34144486456169987, "learning_rate": 3.0037546933667088e-05, "loss": 0.3535, "step": 1224 }, { "epoch": 1.37816544738323, "grad_norm": 0.3510452096605386, "learning_rate": 3.0016687526074262e-05, "loss": 0.3783, "step": 1225 }, { "epoch": 1.3792909397861566, "grad_norm": 0.3219709429564443, "learning_rate": 2.9995828118481435e-05, "loss": 0.3653, "step": 1226 }, { "epoch": 1.3804164321890826, "grad_norm": 0.2922536824083754, "learning_rate": 2.9974968710888612e-05, "loss": 0.3487, "step": 1227 }, { "epoch": 1.381541924592009, "grad_norm": 0.3146465080311366, "learning_rate": 2.9954109303295786e-05, "loss": 0.3582, "step": 1228 }, { "epoch": 1.3826674169949353, "grad_norm": 0.3202141542926466, "learning_rate": 2.9933249895702963e-05, "loss": 0.3767, "step": 1229 }, { "epoch": 1.3837929093978616, "grad_norm": 0.3126664207698992, "learning_rate": 2.9912390488110137e-05, "loss": 0.3861, "step": 1230 }, { "epoch": 1.3849184018007878, "grad_norm": 0.3006754510665695, "learning_rate": 2.9891531080517317e-05, "loss": 0.3826, "step": 1231 }, { "epoch": 1.386043894203714, "grad_norm": 0.34183231562741717, "learning_rate": 2.9870671672924487e-05, "loss": 0.3384, "step": 1232 }, { "epoch": 1.3871693866066404, "grad_norm": 0.2981637621431096, "learning_rate": 2.9849812265331668e-05, "loss": 0.3644, "step": 1233 }, { "epoch": 1.3882948790095666, "grad_norm": 0.32927113911951866, "learning_rate": 2.982895285773884e-05, "loss": 0.37, "step": 1234 }, { "epoch": 1.389420371412493, "grad_norm": 0.3516964621170918, "learning_rate": 2.9808093450146018e-05, "loss": 0.3734, "step": 1235 }, { "epoch": 1.3905458638154191, "grad_norm": 0.28294383540669815, "learning_rate": 2.9787234042553192e-05, "loss": 0.3545, "step": 1236 }, { "epoch": 1.3916713562183456, "grad_norm": 0.36437808290704293, "learning_rate": 2.976637463496037e-05, "loss": 0.3626, "step": 1237 }, { "epoch": 1.3927968486212718, "grad_norm": 0.31704230935830585, "learning_rate": 2.9745515227367542e-05, "loss": 0.3536, "step": 1238 }, { "epoch": 1.393922341024198, "grad_norm": 0.3234586646771036, "learning_rate": 2.9724655819774723e-05, "loss": 0.3373, "step": 1239 }, { "epoch": 1.3950478334271244, "grad_norm": 0.3108185575862165, "learning_rate": 2.9703796412181893e-05, "loss": 0.3685, "step": 1240 }, { "epoch": 1.3961733258300506, "grad_norm": 0.3752154178147501, "learning_rate": 2.9682937004589073e-05, "loss": 0.3676, "step": 1241 }, { "epoch": 1.3972988182329769, "grad_norm": 0.2884249248162915, "learning_rate": 2.9662077596996247e-05, "loss": 0.3806, "step": 1242 }, { "epoch": 1.3984243106359031, "grad_norm": 0.34853899665658195, "learning_rate": 2.9641218189403424e-05, "loss": 0.35, "step": 1243 }, { "epoch": 1.3995498030388296, "grad_norm": 0.38250560263603123, "learning_rate": 2.9620358781810597e-05, "loss": 0.3671, "step": 1244 }, { "epoch": 1.4006752954417558, "grad_norm": 0.31599487532627424, "learning_rate": 2.9599499374217778e-05, "loss": 0.3833, "step": 1245 }, { "epoch": 1.401800787844682, "grad_norm": 0.3500438918178945, "learning_rate": 2.9578639966624948e-05, "loss": 0.365, "step": 1246 }, { "epoch": 1.4029262802476083, "grad_norm": 0.41585464581458353, "learning_rate": 2.955778055903212e-05, "loss": 0.3918, "step": 1247 }, { "epoch": 1.4040517726505346, "grad_norm": 0.314777082319376, "learning_rate": 2.9536921151439302e-05, "loss": 0.3657, "step": 1248 }, { "epoch": 1.4051772650534609, "grad_norm": 0.3941954750192581, "learning_rate": 2.9516061743846472e-05, "loss": 0.3472, "step": 1249 }, { "epoch": 1.406302757456387, "grad_norm": 0.40917669615827007, "learning_rate": 2.9495202336253653e-05, "loss": 0.3646, "step": 1250 }, { "epoch": 1.4074282498593136, "grad_norm": 0.32821878108438296, "learning_rate": 2.9474342928660826e-05, "loss": 0.3636, "step": 1251 }, { "epoch": 1.4085537422622396, "grad_norm": 0.4247196049076011, "learning_rate": 2.9453483521068003e-05, "loss": 0.3412, "step": 1252 }, { "epoch": 1.409679234665166, "grad_norm": 0.3851171427422802, "learning_rate": 2.9432624113475177e-05, "loss": 0.3819, "step": 1253 }, { "epoch": 1.4108047270680923, "grad_norm": 0.3469070969772743, "learning_rate": 2.9411764705882354e-05, "loss": 0.3616, "step": 1254 }, { "epoch": 1.4119302194710186, "grad_norm": 0.444050908424801, "learning_rate": 2.9390905298289527e-05, "loss": 0.3565, "step": 1255 }, { "epoch": 1.4130557118739449, "grad_norm": 0.2879248830093118, "learning_rate": 2.9370045890696708e-05, "loss": 0.3601, "step": 1256 }, { "epoch": 1.414181204276871, "grad_norm": 0.4483867224495224, "learning_rate": 2.934918648310388e-05, "loss": 0.3682, "step": 1257 }, { "epoch": 1.4153066966797974, "grad_norm": 0.3000179145995747, "learning_rate": 2.9328327075511058e-05, "loss": 0.3636, "step": 1258 }, { "epoch": 1.4164321890827236, "grad_norm": 0.38516280700639255, "learning_rate": 2.9307467667918232e-05, "loss": 0.3641, "step": 1259 }, { "epoch": 1.41755768148565, "grad_norm": 0.3233722496864056, "learning_rate": 2.928660826032541e-05, "loss": 0.353, "step": 1260 }, { "epoch": 1.4186831738885761, "grad_norm": 0.35206171266037123, "learning_rate": 2.9265748852732582e-05, "loss": 0.3715, "step": 1261 }, { "epoch": 1.4198086662915026, "grad_norm": 0.3416243699378654, "learning_rate": 2.9244889445139763e-05, "loss": 0.374, "step": 1262 }, { "epoch": 1.4209341586944289, "grad_norm": 0.354667895796559, "learning_rate": 2.9224030037546933e-05, "loss": 0.3587, "step": 1263 }, { "epoch": 1.422059651097355, "grad_norm": 0.35090199403773553, "learning_rate": 2.9203170629954113e-05, "loss": 0.3679, "step": 1264 }, { "epoch": 1.4231851435002814, "grad_norm": 0.37870594240912114, "learning_rate": 2.9182311222361287e-05, "loss": 0.3497, "step": 1265 }, { "epoch": 1.4243106359032076, "grad_norm": 0.3281629547838438, "learning_rate": 2.9161451814768464e-05, "loss": 0.3457, "step": 1266 }, { "epoch": 1.4254361283061339, "grad_norm": 0.35899900559464903, "learning_rate": 2.9140592407175638e-05, "loss": 0.3627, "step": 1267 }, { "epoch": 1.4265616207090601, "grad_norm": 0.30995421196825085, "learning_rate": 2.911973299958281e-05, "loss": 0.3939, "step": 1268 }, { "epoch": 1.4276871131119866, "grad_norm": 0.3707647704012859, "learning_rate": 2.9098873591989988e-05, "loss": 0.3683, "step": 1269 }, { "epoch": 1.4288126055149128, "grad_norm": 0.3493957956050667, "learning_rate": 2.9078014184397162e-05, "loss": 0.3922, "step": 1270 }, { "epoch": 1.429938097917839, "grad_norm": 0.3392405036440076, "learning_rate": 2.9057154776804342e-05, "loss": 0.3675, "step": 1271 }, { "epoch": 1.4310635903207654, "grad_norm": 0.42908164729503306, "learning_rate": 2.9036295369211512e-05, "loss": 0.3497, "step": 1272 }, { "epoch": 1.4321890827236916, "grad_norm": 0.3217604733902604, "learning_rate": 2.9015435961618693e-05, "loss": 0.3419, "step": 1273 }, { "epoch": 1.4333145751266179, "grad_norm": 0.285513498828739, "learning_rate": 2.8994576554025866e-05, "loss": 0.3525, "step": 1274 }, { "epoch": 1.4344400675295441, "grad_norm": 0.31722207811402275, "learning_rate": 2.8973717146433043e-05, "loss": 0.3537, "step": 1275 }, { "epoch": 1.4355655599324706, "grad_norm": 0.3425240574013163, "learning_rate": 2.8952857738840217e-05, "loss": 0.3536, "step": 1276 }, { "epoch": 1.4366910523353966, "grad_norm": 0.37106807604623343, "learning_rate": 2.8931998331247394e-05, "loss": 0.3495, "step": 1277 }, { "epoch": 1.437816544738323, "grad_norm": 0.28250215963205166, "learning_rate": 2.8911138923654567e-05, "loss": 0.3481, "step": 1278 }, { "epoch": 1.4389420371412494, "grad_norm": 0.3549294685688371, "learning_rate": 2.8890279516061748e-05, "loss": 0.376, "step": 1279 }, { "epoch": 1.4400675295441756, "grad_norm": 0.30171989887856726, "learning_rate": 2.886942010846892e-05, "loss": 0.3755, "step": 1280 }, { "epoch": 1.4411930219471019, "grad_norm": 0.3139221398315422, "learning_rate": 2.88485607008761e-05, "loss": 0.3513, "step": 1281 }, { "epoch": 1.4423185143500281, "grad_norm": 0.3174768026324268, "learning_rate": 2.8827701293283272e-05, "loss": 0.3569, "step": 1282 }, { "epoch": 1.4434440067529544, "grad_norm": 0.31543813617403643, "learning_rate": 2.880684188569045e-05, "loss": 0.3551, "step": 1283 }, { "epoch": 1.4445694991558806, "grad_norm": 0.34405513448928665, "learning_rate": 2.8785982478097623e-05, "loss": 0.3615, "step": 1284 }, { "epoch": 1.445694991558807, "grad_norm": 0.3314207471763474, "learning_rate": 2.8765123070504803e-05, "loss": 0.364, "step": 1285 }, { "epoch": 1.4468204839617331, "grad_norm": 0.30011672219405916, "learning_rate": 2.8744263662911973e-05, "loss": 0.3541, "step": 1286 }, { "epoch": 1.4479459763646596, "grad_norm": 0.36939176440073757, "learning_rate": 2.8723404255319154e-05, "loss": 0.3466, "step": 1287 }, { "epoch": 1.4490714687675859, "grad_norm": 0.38877177781745204, "learning_rate": 2.8702544847726327e-05, "loss": 0.3786, "step": 1288 }, { "epoch": 1.4501969611705121, "grad_norm": 0.3409728207807626, "learning_rate": 2.8681685440133497e-05, "loss": 0.3791, "step": 1289 }, { "epoch": 1.4513224535734384, "grad_norm": 0.42817414825924877, "learning_rate": 2.8660826032540678e-05, "loss": 0.3491, "step": 1290 }, { "epoch": 1.4524479459763646, "grad_norm": 0.35198856950809654, "learning_rate": 2.863996662494785e-05, "loss": 0.3679, "step": 1291 }, { "epoch": 1.4535734383792909, "grad_norm": 0.32113222683338927, "learning_rate": 2.861910721735503e-05, "loss": 0.3467, "step": 1292 }, { "epoch": 1.4546989307822171, "grad_norm": 0.321401722942131, "learning_rate": 2.8598247809762202e-05, "loss": 0.3565, "step": 1293 }, { "epoch": 1.4558244231851436, "grad_norm": 0.3506970283170861, "learning_rate": 2.857738840216938e-05, "loss": 0.3691, "step": 1294 }, { "epoch": 1.4569499155880699, "grad_norm": 0.3366249220650027, "learning_rate": 2.8556528994576552e-05, "loss": 0.3679, "step": 1295 }, { "epoch": 1.458075407990996, "grad_norm": 0.3863825955582127, "learning_rate": 2.8535669586983733e-05, "loss": 0.3656, "step": 1296 }, { "epoch": 1.4592009003939224, "grad_norm": 0.3294265653501134, "learning_rate": 2.8514810179390906e-05, "loss": 0.363, "step": 1297 }, { "epoch": 1.4603263927968486, "grad_norm": 0.3400753421715317, "learning_rate": 2.8493950771798083e-05, "loss": 0.3677, "step": 1298 }, { "epoch": 1.4614518851997749, "grad_norm": 0.34166651813969356, "learning_rate": 2.8473091364205257e-05, "loss": 0.3975, "step": 1299 }, { "epoch": 1.4625773776027011, "grad_norm": 0.32157145849641844, "learning_rate": 2.8452231956612434e-05, "loss": 0.3599, "step": 1300 }, { "epoch": 1.4637028700056276, "grad_norm": 0.29318800149996266, "learning_rate": 2.8431372549019608e-05, "loss": 0.3639, "step": 1301 }, { "epoch": 1.4648283624085536, "grad_norm": 0.3275747590035082, "learning_rate": 2.8410513141426788e-05, "loss": 0.3405, "step": 1302 }, { "epoch": 1.46595385481148, "grad_norm": 0.2669444641392893, "learning_rate": 2.8389653733833958e-05, "loss": 0.3556, "step": 1303 }, { "epoch": 1.4670793472144064, "grad_norm": 0.2984242508636962, "learning_rate": 2.836879432624114e-05, "loss": 0.3475, "step": 1304 }, { "epoch": 1.4682048396173326, "grad_norm": 0.34309575853919444, "learning_rate": 2.8347934918648312e-05, "loss": 0.3236, "step": 1305 }, { "epoch": 1.4693303320202589, "grad_norm": 0.3386670126038393, "learning_rate": 2.832707551105549e-05, "loss": 0.3692, "step": 1306 }, { "epoch": 1.4704558244231851, "grad_norm": 0.3041272096720939, "learning_rate": 2.8306216103462663e-05, "loss": 0.3672, "step": 1307 }, { "epoch": 1.4715813168261114, "grad_norm": 0.4280504900617411, "learning_rate": 2.828535669586984e-05, "loss": 0.3532, "step": 1308 }, { "epoch": 1.4727068092290376, "grad_norm": 0.28299915352373894, "learning_rate": 2.8264497288277013e-05, "loss": 0.3389, "step": 1309 }, { "epoch": 1.473832301631964, "grad_norm": 0.33312026594342037, "learning_rate": 2.8243637880684187e-05, "loss": 0.3711, "step": 1310 }, { "epoch": 1.4749577940348901, "grad_norm": 0.3324677079496402, "learning_rate": 2.8222778473091367e-05, "loss": 0.3637, "step": 1311 }, { "epoch": 1.4760832864378166, "grad_norm": 0.3180122895020907, "learning_rate": 2.8201919065498537e-05, "loss": 0.3681, "step": 1312 }, { "epoch": 1.4772087788407429, "grad_norm": 0.3454913512736821, "learning_rate": 2.8181059657905718e-05, "loss": 0.3788, "step": 1313 }, { "epoch": 1.4783342712436691, "grad_norm": 0.2954651043434191, "learning_rate": 2.816020025031289e-05, "loss": 0.3629, "step": 1314 }, { "epoch": 1.4794597636465954, "grad_norm": 0.31225437993089217, "learning_rate": 2.813934084272007e-05, "loss": 0.3483, "step": 1315 }, { "epoch": 1.4805852560495216, "grad_norm": 0.2931420257748457, "learning_rate": 2.8118481435127242e-05, "loss": 0.3443, "step": 1316 }, { "epoch": 1.4817107484524479, "grad_norm": 0.3077463608704642, "learning_rate": 2.809762202753442e-05, "loss": 0.3562, "step": 1317 }, { "epoch": 1.4828362408553741, "grad_norm": 0.2868052518006215, "learning_rate": 2.8076762619941593e-05, "loss": 0.3532, "step": 1318 }, { "epoch": 1.4839617332583006, "grad_norm": 0.28223423866564457, "learning_rate": 2.8055903212348773e-05, "loss": 0.3334, "step": 1319 }, { "epoch": 1.4850872256612269, "grad_norm": 0.2934968437108151, "learning_rate": 2.8035043804755947e-05, "loss": 0.3609, "step": 1320 }, { "epoch": 1.4862127180641531, "grad_norm": 0.3726867856164999, "learning_rate": 2.8014184397163124e-05, "loss": 0.3658, "step": 1321 }, { "epoch": 1.4873382104670794, "grad_norm": 0.31940065928357514, "learning_rate": 2.7993324989570297e-05, "loss": 0.3752, "step": 1322 }, { "epoch": 1.4884637028700056, "grad_norm": 0.343528935258811, "learning_rate": 2.7972465581977474e-05, "loss": 0.3689, "step": 1323 }, { "epoch": 1.4895891952729319, "grad_norm": 0.29324201562634045, "learning_rate": 2.7951606174384648e-05, "loss": 0.3701, "step": 1324 }, { "epoch": 1.4907146876758581, "grad_norm": 0.307447149562183, "learning_rate": 2.7930746766791828e-05, "loss": 0.3623, "step": 1325 }, { "epoch": 1.4918401800787846, "grad_norm": 0.3370769636245937, "learning_rate": 2.7909887359199e-05, "loss": 0.3599, "step": 1326 }, { "epoch": 1.4929656724817106, "grad_norm": 0.2871673492029565, "learning_rate": 2.788902795160618e-05, "loss": 0.3719, "step": 1327 }, { "epoch": 1.4940911648846371, "grad_norm": 0.36895913560450455, "learning_rate": 2.7868168544013352e-05, "loss": 0.3678, "step": 1328 }, { "epoch": 1.4952166572875634, "grad_norm": 0.30425325809005394, "learning_rate": 2.784730913642053e-05, "loss": 0.3654, "step": 1329 }, { "epoch": 1.4963421496904896, "grad_norm": 0.3331261517980334, "learning_rate": 2.7826449728827703e-05, "loss": 0.3693, "step": 1330 }, { "epoch": 1.4974676420934159, "grad_norm": 0.2798679646502201, "learning_rate": 2.7805590321234876e-05, "loss": 0.3709, "step": 1331 }, { "epoch": 1.4985931344963421, "grad_norm": 0.37466709997642333, "learning_rate": 2.7784730913642053e-05, "loss": 0.363, "step": 1332 }, { "epoch": 1.4997186268992684, "grad_norm": 0.35357276036020097, "learning_rate": 2.7763871506049227e-05, "loss": 0.3683, "step": 1333 }, { "epoch": 1.5008441193021946, "grad_norm": 0.3354334941577856, "learning_rate": 2.7743012098456404e-05, "loss": 0.3438, "step": 1334 }, { "epoch": 1.501969611705121, "grad_norm": 0.4101041564365979, "learning_rate": 2.7722152690863578e-05, "loss": 0.3835, "step": 1335 }, { "epoch": 1.5030951041080471, "grad_norm": 0.36255802157025624, "learning_rate": 2.7701293283270758e-05, "loss": 0.3477, "step": 1336 }, { "epoch": 1.5042205965109736, "grad_norm": 0.4061869558301693, "learning_rate": 2.768043387567793e-05, "loss": 0.3688, "step": 1337 }, { "epoch": 1.5053460889138999, "grad_norm": 0.41388849066334216, "learning_rate": 2.765957446808511e-05, "loss": 0.3672, "step": 1338 }, { "epoch": 1.5064715813168261, "grad_norm": 0.330932817498522, "learning_rate": 2.7638715060492282e-05, "loss": 0.3643, "step": 1339 }, { "epoch": 1.5075970737197524, "grad_norm": 0.3399783880240679, "learning_rate": 2.761785565289946e-05, "loss": 0.3529, "step": 1340 }, { "epoch": 1.5087225661226786, "grad_norm": 0.28341696113530734, "learning_rate": 2.7596996245306633e-05, "loss": 0.3496, "step": 1341 }, { "epoch": 1.509848058525605, "grad_norm": 0.334967765352759, "learning_rate": 2.7576136837713813e-05, "loss": 0.3415, "step": 1342 }, { "epoch": 1.5109735509285311, "grad_norm": 0.2668604468971969, "learning_rate": 2.7555277430120983e-05, "loss": 0.3634, "step": 1343 }, { "epoch": 1.5120990433314576, "grad_norm": 0.31156721053710673, "learning_rate": 2.7534418022528164e-05, "loss": 0.3641, "step": 1344 }, { "epoch": 1.5132245357343836, "grad_norm": 0.29754957914675184, "learning_rate": 2.7513558614935337e-05, "loss": 0.3785, "step": 1345 }, { "epoch": 1.5143500281373101, "grad_norm": 0.2872566093068787, "learning_rate": 2.7492699207342514e-05, "loss": 0.3623, "step": 1346 }, { "epoch": 1.5154755205402364, "grad_norm": 0.3526852777204813, "learning_rate": 2.7471839799749688e-05, "loss": 0.3723, "step": 1347 }, { "epoch": 1.5166010129431626, "grad_norm": 0.31241125025784733, "learning_rate": 2.7450980392156865e-05, "loss": 0.3507, "step": 1348 }, { "epoch": 1.5177265053460889, "grad_norm": 0.3508625079587985, "learning_rate": 2.743012098456404e-05, "loss": 0.3608, "step": 1349 }, { "epoch": 1.5188519977490151, "grad_norm": 0.32157619105794166, "learning_rate": 2.740926157697122e-05, "loss": 0.3581, "step": 1350 }, { "epoch": 1.5199774901519416, "grad_norm": 0.3494380418250329, "learning_rate": 2.7388402169378392e-05, "loss": 0.3609, "step": 1351 }, { "epoch": 1.5211029825548676, "grad_norm": 0.3055065567786005, "learning_rate": 2.7367542761785563e-05, "loss": 0.3672, "step": 1352 }, { "epoch": 1.5222284749577941, "grad_norm": 0.3950982220672214, "learning_rate": 2.7346683354192743e-05, "loss": 0.376, "step": 1353 }, { "epoch": 1.5233539673607202, "grad_norm": 0.27852848240062467, "learning_rate": 2.7325823946599917e-05, "loss": 0.3485, "step": 1354 }, { "epoch": 1.5244794597636466, "grad_norm": 0.3737867565000807, "learning_rate": 2.7304964539007094e-05, "loss": 0.3664, "step": 1355 }, { "epoch": 1.5256049521665729, "grad_norm": 0.3119606266731146, "learning_rate": 2.7284105131414267e-05, "loss": 0.3619, "step": 1356 }, { "epoch": 1.5267304445694991, "grad_norm": 0.33933519597699924, "learning_rate": 2.7263245723821444e-05, "loss": 0.4008, "step": 1357 }, { "epoch": 1.5278559369724254, "grad_norm": 0.3275255812573412, "learning_rate": 2.7242386316228618e-05, "loss": 0.3702, "step": 1358 }, { "epoch": 1.5289814293753516, "grad_norm": 0.3747569415524062, "learning_rate": 2.7221526908635798e-05, "loss": 0.3553, "step": 1359 }, { "epoch": 1.5301069217782781, "grad_norm": 0.30992658499062065, "learning_rate": 2.7200667501042972e-05, "loss": 0.3556, "step": 1360 }, { "epoch": 1.5312324141812041, "grad_norm": 0.36837860346575607, "learning_rate": 2.717980809345015e-05, "loss": 0.3536, "step": 1361 }, { "epoch": 1.5323579065841306, "grad_norm": 0.3422637051978812, "learning_rate": 2.7158948685857322e-05, "loss": 0.3676, "step": 1362 }, { "epoch": 1.5334833989870569, "grad_norm": 0.2882475832928599, "learning_rate": 2.71380892782645e-05, "loss": 0.3714, "step": 1363 }, { "epoch": 1.5346088913899831, "grad_norm": 0.4680385431354928, "learning_rate": 2.7117229870671673e-05, "loss": 0.3728, "step": 1364 }, { "epoch": 1.5357343837929094, "grad_norm": 0.28687340173500175, "learning_rate": 2.7096370463078853e-05, "loss": 0.3677, "step": 1365 }, { "epoch": 1.5368598761958356, "grad_norm": 0.3168437934125687, "learning_rate": 2.7075511055486023e-05, "loss": 0.3516, "step": 1366 }, { "epoch": 1.5379853685987621, "grad_norm": 0.34254703558592353, "learning_rate": 2.7054651647893204e-05, "loss": 0.3733, "step": 1367 }, { "epoch": 1.5391108610016881, "grad_norm": 0.32210383347863225, "learning_rate": 2.7033792240300377e-05, "loss": 0.3657, "step": 1368 }, { "epoch": 1.5402363534046146, "grad_norm": 0.2951642244458056, "learning_rate": 2.7012932832707554e-05, "loss": 0.3624, "step": 1369 }, { "epoch": 1.5413618458075407, "grad_norm": 0.32973184204270484, "learning_rate": 2.6992073425114728e-05, "loss": 0.3466, "step": 1370 }, { "epoch": 1.5424873382104671, "grad_norm": 0.32937201569972335, "learning_rate": 2.6971214017521905e-05, "loss": 0.3609, "step": 1371 }, { "epoch": 1.5436128306133934, "grad_norm": 0.294240889891016, "learning_rate": 2.695035460992908e-05, "loss": 0.3528, "step": 1372 }, { "epoch": 1.5447383230163196, "grad_norm": 0.38730632898384704, "learning_rate": 2.6929495202336252e-05, "loss": 0.3592, "step": 1373 }, { "epoch": 1.545863815419246, "grad_norm": 0.265405748658469, "learning_rate": 2.6908635794743433e-05, "loss": 0.3523, "step": 1374 }, { "epoch": 1.5469893078221721, "grad_norm": 0.3090293159321234, "learning_rate": 2.6887776387150603e-05, "loss": 0.373, "step": 1375 }, { "epoch": 1.5481148002250986, "grad_norm": 0.33125373511524786, "learning_rate": 2.6866916979557783e-05, "loss": 0.3376, "step": 1376 }, { "epoch": 1.5492402926280247, "grad_norm": 0.3859675477375762, "learning_rate": 2.6846057571964957e-05, "loss": 0.3595, "step": 1377 }, { "epoch": 1.5503657850309511, "grad_norm": 0.2702204865287381, "learning_rate": 2.6825198164372134e-05, "loss": 0.3526, "step": 1378 }, { "epoch": 1.5514912774338772, "grad_norm": 0.4216493180934553, "learning_rate": 2.6804338756779307e-05, "loss": 0.3634, "step": 1379 }, { "epoch": 1.5526167698368036, "grad_norm": 0.3402054598291514, "learning_rate": 2.6783479349186484e-05, "loss": 0.3814, "step": 1380 }, { "epoch": 1.5537422622397299, "grad_norm": 0.3634322127130347, "learning_rate": 2.6762619941593658e-05, "loss": 0.3739, "step": 1381 }, { "epoch": 1.5548677546426561, "grad_norm": 0.4033902015465824, "learning_rate": 2.6741760534000838e-05, "loss": 0.3731, "step": 1382 }, { "epoch": 1.5559932470455824, "grad_norm": 0.40808104649969373, "learning_rate": 2.672090112640801e-05, "loss": 0.3522, "step": 1383 }, { "epoch": 1.5571187394485086, "grad_norm": 0.39575554889808146, "learning_rate": 2.670004171881519e-05, "loss": 0.3466, "step": 1384 }, { "epoch": 1.5582442318514351, "grad_norm": 0.33725542522344404, "learning_rate": 2.6679182311222362e-05, "loss": 0.3594, "step": 1385 }, { "epoch": 1.5593697242543612, "grad_norm": 0.3562002474404248, "learning_rate": 2.665832290362954e-05, "loss": 0.3766, "step": 1386 }, { "epoch": 1.5604952166572876, "grad_norm": 0.2792679981992388, "learning_rate": 2.6637463496036713e-05, "loss": 0.354, "step": 1387 }, { "epoch": 1.5616207090602139, "grad_norm": 0.3631975807941906, "learning_rate": 2.661660408844389e-05, "loss": 0.3628, "step": 1388 }, { "epoch": 1.5627462014631401, "grad_norm": 0.2922697632757867, "learning_rate": 2.6595744680851064e-05, "loss": 0.3786, "step": 1389 }, { "epoch": 1.5638716938660664, "grad_norm": 0.3930094259783832, "learning_rate": 2.6574885273258244e-05, "loss": 0.349, "step": 1390 }, { "epoch": 1.5649971862689926, "grad_norm": 0.2753952015092564, "learning_rate": 2.6554025865665418e-05, "loss": 0.3606, "step": 1391 }, { "epoch": 1.5661226786719191, "grad_norm": 0.323233762383296, "learning_rate": 2.6533166458072595e-05, "loss": 0.3584, "step": 1392 }, { "epoch": 1.5672481710748452, "grad_norm": 0.3065899573190829, "learning_rate": 2.6512307050479768e-05, "loss": 0.3569, "step": 1393 }, { "epoch": 1.5683736634777716, "grad_norm": 0.29359629957776534, "learning_rate": 2.6491447642886942e-05, "loss": 0.3721, "step": 1394 }, { "epoch": 1.5694991558806977, "grad_norm": 0.3453639950913077, "learning_rate": 2.647058823529412e-05, "loss": 0.3674, "step": 1395 }, { "epoch": 1.5706246482836241, "grad_norm": 0.29618728974968406, "learning_rate": 2.6449728827701292e-05, "loss": 0.3728, "step": 1396 }, { "epoch": 1.5717501406865504, "grad_norm": 0.4022340400841394, "learning_rate": 2.642886942010847e-05, "loss": 0.3599, "step": 1397 }, { "epoch": 1.5728756330894766, "grad_norm": 0.34040909178052503, "learning_rate": 2.6408010012515643e-05, "loss": 0.3452, "step": 1398 }, { "epoch": 1.574001125492403, "grad_norm": 0.39633565400793064, "learning_rate": 2.6387150604922823e-05, "loss": 0.3638, "step": 1399 }, { "epoch": 1.5751266178953292, "grad_norm": 0.3469815814003443, "learning_rate": 2.6366291197329997e-05, "loss": 0.3617, "step": 1400 }, { "epoch": 1.5762521102982556, "grad_norm": 0.3858237301262129, "learning_rate": 2.6345431789737174e-05, "loss": 0.3592, "step": 1401 }, { "epoch": 1.5773776027011817, "grad_norm": 0.36968305499637627, "learning_rate": 2.6324572382144347e-05, "loss": 0.3506, "step": 1402 }, { "epoch": 1.5785030951041081, "grad_norm": 0.3505404658131974, "learning_rate": 2.6303712974551524e-05, "loss": 0.3686, "step": 1403 }, { "epoch": 1.5796285875070342, "grad_norm": 0.33758728331020843, "learning_rate": 2.6282853566958698e-05, "loss": 0.3527, "step": 1404 }, { "epoch": 1.5807540799099606, "grad_norm": 0.3435492065868497, "learning_rate": 2.626199415936588e-05, "loss": 0.3475, "step": 1405 }, { "epoch": 1.581879572312887, "grad_norm": 0.3490084416143491, "learning_rate": 2.624113475177305e-05, "loss": 0.3607, "step": 1406 }, { "epoch": 1.5830050647158131, "grad_norm": 0.31414180653905893, "learning_rate": 2.622027534418023e-05, "loss": 0.3504, "step": 1407 }, { "epoch": 1.5841305571187394, "grad_norm": 0.3599821696826535, "learning_rate": 2.6199415936587403e-05, "loss": 0.3615, "step": 1408 }, { "epoch": 1.5852560495216657, "grad_norm": 0.42310764019699615, "learning_rate": 2.617855652899458e-05, "loss": 0.3724, "step": 1409 }, { "epoch": 1.5863815419245921, "grad_norm": 0.2833525199592301, "learning_rate": 2.6157697121401753e-05, "loss": 0.3617, "step": 1410 }, { "epoch": 1.5875070343275182, "grad_norm": 0.3619653752728842, "learning_rate": 2.613683771380893e-05, "loss": 0.3535, "step": 1411 }, { "epoch": 1.5886325267304446, "grad_norm": 0.31893555877641494, "learning_rate": 2.6115978306216104e-05, "loss": 0.3739, "step": 1412 }, { "epoch": 1.589758019133371, "grad_norm": 0.367002811604332, "learning_rate": 2.6095118898623284e-05, "loss": 0.3553, "step": 1413 }, { "epoch": 1.5908835115362971, "grad_norm": 0.27151097727860346, "learning_rate": 2.6074259491030458e-05, "loss": 0.3347, "step": 1414 }, { "epoch": 1.5920090039392234, "grad_norm": 0.3131896896726996, "learning_rate": 2.6053400083437628e-05, "loss": 0.3546, "step": 1415 }, { "epoch": 1.5931344963421497, "grad_norm": 0.36676987492115576, "learning_rate": 2.6032540675844808e-05, "loss": 0.3675, "step": 1416 }, { "epoch": 1.5942599887450761, "grad_norm": 0.2950227483896426, "learning_rate": 2.6011681268251982e-05, "loss": 0.3648, "step": 1417 }, { "epoch": 1.5953854811480022, "grad_norm": 0.34344487884738795, "learning_rate": 2.599082186065916e-05, "loss": 0.3597, "step": 1418 }, { "epoch": 1.5965109735509286, "grad_norm": 0.320230789996618, "learning_rate": 2.5969962453066332e-05, "loss": 0.3524, "step": 1419 }, { "epoch": 1.5976364659538547, "grad_norm": 0.32035648740276107, "learning_rate": 2.594910304547351e-05, "loss": 0.3595, "step": 1420 }, { "epoch": 1.5987619583567811, "grad_norm": 0.2888199453121108, "learning_rate": 2.5928243637880683e-05, "loss": 0.3862, "step": 1421 }, { "epoch": 1.5998874507597074, "grad_norm": 0.32236255339509834, "learning_rate": 2.5907384230287863e-05, "loss": 0.3476, "step": 1422 }, { "epoch": 1.6010129431626337, "grad_norm": 0.3203989927659959, "learning_rate": 2.5886524822695034e-05, "loss": 0.3702, "step": 1423 }, { "epoch": 1.60213843556556, "grad_norm": 0.2911113101367755, "learning_rate": 2.5865665415102214e-05, "loss": 0.3688, "step": 1424 }, { "epoch": 1.6032639279684862, "grad_norm": 0.35071227735634586, "learning_rate": 2.5844806007509388e-05, "loss": 0.3808, "step": 1425 }, { "epoch": 1.6043894203714126, "grad_norm": 1.359117007518664, "learning_rate": 2.5823946599916565e-05, "loss": 0.3551, "step": 1426 }, { "epoch": 1.6055149127743387, "grad_norm": 0.33498969187479993, "learning_rate": 2.5803087192323738e-05, "loss": 0.3593, "step": 1427 }, { "epoch": 1.6066404051772651, "grad_norm": 0.30337597464705507, "learning_rate": 2.5782227784730915e-05, "loss": 0.3674, "step": 1428 }, { "epoch": 1.6077658975801912, "grad_norm": 0.3207844519265783, "learning_rate": 2.576136837713809e-05, "loss": 0.3557, "step": 1429 }, { "epoch": 1.6088913899831176, "grad_norm": 0.3185723538525886, "learning_rate": 2.574050896954527e-05, "loss": 0.3633, "step": 1430 }, { "epoch": 1.610016882386044, "grad_norm": 0.3110802343229136, "learning_rate": 2.5719649561952443e-05, "loss": 0.3621, "step": 1431 }, { "epoch": 1.6111423747889702, "grad_norm": 0.39120392030901746, "learning_rate": 2.569879015435962e-05, "loss": 0.3718, "step": 1432 }, { "epoch": 1.6122678671918964, "grad_norm": 0.3044483498179327, "learning_rate": 2.5677930746766793e-05, "loss": 0.3525, "step": 1433 }, { "epoch": 1.6133933595948227, "grad_norm": 0.36593260259263516, "learning_rate": 2.565707133917397e-05, "loss": 0.3724, "step": 1434 }, { "epoch": 1.6145188519977491, "grad_norm": 0.34991456432334755, "learning_rate": 2.5636211931581144e-05, "loss": 0.3682, "step": 1435 }, { "epoch": 1.6156443444006752, "grad_norm": 0.32304123149901537, "learning_rate": 2.5615352523988317e-05, "loss": 0.3496, "step": 1436 }, { "epoch": 1.6167698368036016, "grad_norm": 0.34708749419764806, "learning_rate": 2.5594493116395494e-05, "loss": 0.3913, "step": 1437 }, { "epoch": 1.617895329206528, "grad_norm": 0.32488187134050506, "learning_rate": 2.5573633708802668e-05, "loss": 0.3469, "step": 1438 }, { "epoch": 1.6190208216094542, "grad_norm": 0.31694764933224345, "learning_rate": 2.555277430120985e-05, "loss": 0.3903, "step": 1439 }, { "epoch": 1.6201463140123804, "grad_norm": 0.2966648293508749, "learning_rate": 2.5531914893617022e-05, "loss": 0.3434, "step": 1440 }, { "epoch": 1.6212718064153067, "grad_norm": 0.3130351777750274, "learning_rate": 2.55110554860242e-05, "loss": 0.3642, "step": 1441 }, { "epoch": 1.6223972988182331, "grad_norm": 0.288157295810494, "learning_rate": 2.5490196078431373e-05, "loss": 0.3515, "step": 1442 }, { "epoch": 1.6235227912211592, "grad_norm": 0.34698217632629985, "learning_rate": 2.546933667083855e-05, "loss": 0.3733, "step": 1443 }, { "epoch": 1.6246482836240856, "grad_norm": 0.2724092253095966, "learning_rate": 2.5448477263245723e-05, "loss": 0.3497, "step": 1444 }, { "epoch": 1.6257737760270117, "grad_norm": 0.24953001796720836, "learning_rate": 2.5427617855652904e-05, "loss": 0.3573, "step": 1445 }, { "epoch": 1.6268992684299382, "grad_norm": 0.299260486745094, "learning_rate": 2.5406758448060074e-05, "loss": 0.3873, "step": 1446 }, { "epoch": 1.6280247608328644, "grad_norm": 0.26925589680552175, "learning_rate": 2.5385899040467254e-05, "loss": 0.3508, "step": 1447 }, { "epoch": 1.6291502532357907, "grad_norm": 0.29454604423730374, "learning_rate": 2.5365039632874428e-05, "loss": 0.3591, "step": 1448 }, { "epoch": 1.630275745638717, "grad_norm": 0.27324874812018735, "learning_rate": 2.5344180225281605e-05, "loss": 0.3625, "step": 1449 }, { "epoch": 1.6314012380416432, "grad_norm": 0.27258225073759196, "learning_rate": 2.5323320817688778e-05, "loss": 0.3554, "step": 1450 }, { "epoch": 1.6325267304445696, "grad_norm": 0.3035610321463261, "learning_rate": 2.5302461410095955e-05, "loss": 0.3535, "step": 1451 }, { "epoch": 1.6336522228474957, "grad_norm": 0.3628567082505913, "learning_rate": 2.528160200250313e-05, "loss": 0.358, "step": 1452 }, { "epoch": 1.6347777152504221, "grad_norm": 0.26138414055253223, "learning_rate": 2.526074259491031e-05, "loss": 0.3664, "step": 1453 }, { "epoch": 1.6359032076533482, "grad_norm": 0.3503328861643792, "learning_rate": 2.5239883187317483e-05, "loss": 0.3377, "step": 1454 }, { "epoch": 1.6370287000562747, "grad_norm": 0.2673845892434079, "learning_rate": 2.521902377972466e-05, "loss": 0.3474, "step": 1455 }, { "epoch": 1.638154192459201, "grad_norm": 0.27470271868463625, "learning_rate": 2.5198164372131833e-05, "loss": 0.3639, "step": 1456 }, { "epoch": 1.6392796848621272, "grad_norm": 0.3112867744755204, "learning_rate": 2.5177304964539007e-05, "loss": 0.3662, "step": 1457 }, { "epoch": 1.6404051772650534, "grad_norm": 0.29872249045203997, "learning_rate": 2.5156445556946184e-05, "loss": 0.3569, "step": 1458 }, { "epoch": 1.6415306696679797, "grad_norm": 0.2950030580824579, "learning_rate": 2.5135586149353358e-05, "loss": 0.3877, "step": 1459 }, { "epoch": 1.6426561620709061, "grad_norm": 0.30740378724405815, "learning_rate": 2.5114726741760535e-05, "loss": 0.368, "step": 1460 }, { "epoch": 1.6437816544738322, "grad_norm": 0.43735074719358724, "learning_rate": 2.5093867334167708e-05, "loss": 0.3862, "step": 1461 }, { "epoch": 1.6449071468767587, "grad_norm": 0.344358904604338, "learning_rate": 2.507300792657489e-05, "loss": 0.3556, "step": 1462 }, { "epoch": 1.646032639279685, "grad_norm": 0.3606518532796079, "learning_rate": 2.505214851898206e-05, "loss": 0.3506, "step": 1463 }, { "epoch": 1.6471581316826112, "grad_norm": 0.26793935225288906, "learning_rate": 2.503128911138924e-05, "loss": 0.3644, "step": 1464 }, { "epoch": 1.6482836240855374, "grad_norm": 0.36553458630391006, "learning_rate": 2.5010429703796413e-05, "loss": 0.3786, "step": 1465 }, { "epoch": 1.6494091164884637, "grad_norm": 0.3032742387012001, "learning_rate": 2.4989570296203586e-05, "loss": 0.3606, "step": 1466 }, { "epoch": 1.6505346088913901, "grad_norm": 0.2573644911193979, "learning_rate": 2.4968710888610763e-05, "loss": 0.3758, "step": 1467 }, { "epoch": 1.6516601012943162, "grad_norm": 0.3260439897844004, "learning_rate": 2.494785148101794e-05, "loss": 0.3701, "step": 1468 }, { "epoch": 1.6527855936972426, "grad_norm": 0.2791366230994869, "learning_rate": 2.4926992073425114e-05, "loss": 0.3608, "step": 1469 }, { "epoch": 1.6539110861001687, "grad_norm": 0.28073773442639216, "learning_rate": 2.490613266583229e-05, "loss": 0.3552, "step": 1470 }, { "epoch": 1.6550365785030952, "grad_norm": 0.2751936808067321, "learning_rate": 2.4885273258239468e-05, "loss": 0.3551, "step": 1471 }, { "epoch": 1.6561620709060214, "grad_norm": 0.31105318511449315, "learning_rate": 2.486441385064664e-05, "loss": 0.3846, "step": 1472 }, { "epoch": 1.6572875633089477, "grad_norm": 0.2779436567942526, "learning_rate": 2.484355444305382e-05, "loss": 0.342, "step": 1473 }, { "epoch": 1.658413055711874, "grad_norm": 0.260118994793512, "learning_rate": 2.4822695035460995e-05, "loss": 0.3416, "step": 1474 }, { "epoch": 1.6595385481148002, "grad_norm": 0.30797304765243294, "learning_rate": 2.480183562786817e-05, "loss": 0.3649, "step": 1475 }, { "epoch": 1.6606640405177266, "grad_norm": 0.27879300341701935, "learning_rate": 2.4780976220275346e-05, "loss": 0.3577, "step": 1476 }, { "epoch": 1.6617895329206527, "grad_norm": 0.2618302523010228, "learning_rate": 2.476011681268252e-05, "loss": 0.373, "step": 1477 }, { "epoch": 1.6629150253235792, "grad_norm": 0.2691572921484226, "learning_rate": 2.4739257405089697e-05, "loss": 0.3382, "step": 1478 }, { "epoch": 1.6640405177265052, "grad_norm": 0.3021887597303646, "learning_rate": 2.4718397997496874e-05, "loss": 0.3561, "step": 1479 }, { "epoch": 1.6651660101294317, "grad_norm": 0.29571070245395525, "learning_rate": 2.4697538589904047e-05, "loss": 0.3666, "step": 1480 }, { "epoch": 1.666291502532358, "grad_norm": 0.3060388532862541, "learning_rate": 2.4676679182311224e-05, "loss": 0.3574, "step": 1481 }, { "epoch": 1.6674169949352842, "grad_norm": 0.262863158327581, "learning_rate": 2.46558197747184e-05, "loss": 0.3515, "step": 1482 }, { "epoch": 1.6685424873382104, "grad_norm": 0.26211725924142215, "learning_rate": 2.4634960367125575e-05, "loss": 0.3673, "step": 1483 }, { "epoch": 1.6696679797411367, "grad_norm": 0.27559909119280296, "learning_rate": 2.461410095953275e-05, "loss": 0.3699, "step": 1484 }, { "epoch": 1.6707934721440632, "grad_norm": 0.3286258665699544, "learning_rate": 2.459324155193993e-05, "loss": 0.3547, "step": 1485 }, { "epoch": 1.6719189645469892, "grad_norm": 0.28394671282033973, "learning_rate": 2.4572382144347102e-05, "loss": 0.3699, "step": 1486 }, { "epoch": 1.6730444569499157, "grad_norm": 0.28904710622589413, "learning_rate": 2.4551522736754276e-05, "loss": 0.3498, "step": 1487 }, { "epoch": 1.674169949352842, "grad_norm": 0.3427205931479807, "learning_rate": 2.4530663329161453e-05, "loss": 0.3501, "step": 1488 }, { "epoch": 1.6752954417557682, "grad_norm": 0.29275848448510483, "learning_rate": 2.4509803921568626e-05, "loss": 0.3436, "step": 1489 }, { "epoch": 1.6764209341586944, "grad_norm": 0.281092401245526, "learning_rate": 2.4488944513975803e-05, "loss": 0.3618, "step": 1490 }, { "epoch": 1.6775464265616207, "grad_norm": 0.3028349680860752, "learning_rate": 2.446808510638298e-05, "loss": 0.36, "step": 1491 }, { "epoch": 1.6786719189645471, "grad_norm": 0.2740774488090583, "learning_rate": 2.4447225698790154e-05, "loss": 0.3761, "step": 1492 }, { "epoch": 1.6797974113674732, "grad_norm": 0.3565931288895132, "learning_rate": 2.442636629119733e-05, "loss": 0.3656, "step": 1493 }, { "epoch": 1.6809229037703997, "grad_norm": 0.30994295777900555, "learning_rate": 2.4405506883604508e-05, "loss": 0.3709, "step": 1494 }, { "epoch": 1.6820483961733257, "grad_norm": 0.2770746159082683, "learning_rate": 2.438464747601168e-05, "loss": 0.3668, "step": 1495 }, { "epoch": 1.6831738885762522, "grad_norm": 0.28851250362528635, "learning_rate": 2.436378806841886e-05, "loss": 0.3375, "step": 1496 }, { "epoch": 1.6842993809791784, "grad_norm": 0.30502905628031945, "learning_rate": 2.4342928660826032e-05, "loss": 0.372, "step": 1497 }, { "epoch": 1.6854248733821047, "grad_norm": 0.27606414842777804, "learning_rate": 2.432206925323321e-05, "loss": 0.3667, "step": 1498 }, { "epoch": 1.686550365785031, "grad_norm": 0.2807248595539354, "learning_rate": 2.4301209845640386e-05, "loss": 0.3372, "step": 1499 }, { "epoch": 1.6876758581879572, "grad_norm": 0.3189944644623768, "learning_rate": 2.428035043804756e-05, "loss": 0.3556, "step": 1500 }, { "epoch": 1.6888013505908837, "grad_norm": 0.3542996839432631, "learning_rate": 2.4259491030454737e-05, "loss": 0.3657, "step": 1501 }, { "epoch": 1.6899268429938097, "grad_norm": 0.26759487532851395, "learning_rate": 2.4238631622861914e-05, "loss": 0.3537, "step": 1502 }, { "epoch": 1.6910523353967362, "grad_norm": 0.32892071648122306, "learning_rate": 2.4217772215269087e-05, "loss": 0.3679, "step": 1503 }, { "epoch": 1.6921778277996622, "grad_norm": 0.27325117871239496, "learning_rate": 2.4196912807676264e-05, "loss": 0.3636, "step": 1504 }, { "epoch": 1.6933033202025887, "grad_norm": 0.31473981377419813, "learning_rate": 2.417605340008344e-05, "loss": 0.3546, "step": 1505 }, { "epoch": 1.694428812605515, "grad_norm": 0.6213973467005295, "learning_rate": 2.4155193992490615e-05, "loss": 0.3569, "step": 1506 }, { "epoch": 1.6955543050084412, "grad_norm": 0.29664784115736215, "learning_rate": 2.4134334584897792e-05, "loss": 0.3541, "step": 1507 }, { "epoch": 1.6966797974113674, "grad_norm": 0.30075562806982764, "learning_rate": 2.4113475177304965e-05, "loss": 0.3442, "step": 1508 }, { "epoch": 1.6978052898142937, "grad_norm": 0.2798816842619607, "learning_rate": 2.409261576971214e-05, "loss": 0.3723, "step": 1509 }, { "epoch": 1.6989307822172202, "grad_norm": 0.3125716574597028, "learning_rate": 2.4071756362119316e-05, "loss": 0.3534, "step": 1510 }, { "epoch": 1.7000562746201462, "grad_norm": 0.2695382685076537, "learning_rate": 2.4050896954526493e-05, "loss": 0.3501, "step": 1511 }, { "epoch": 1.7011817670230727, "grad_norm": 0.30428973956664224, "learning_rate": 2.4030037546933667e-05, "loss": 0.361, "step": 1512 }, { "epoch": 1.702307259425999, "grad_norm": 0.2954859753709326, "learning_rate": 2.4009178139340844e-05, "loss": 0.348, "step": 1513 }, { "epoch": 1.7034327518289252, "grad_norm": 0.2535522065599469, "learning_rate": 2.398831873174802e-05, "loss": 0.3448, "step": 1514 }, { "epoch": 1.7045582442318514, "grad_norm": 0.2877878849798194, "learning_rate": 2.3967459324155194e-05, "loss": 0.3615, "step": 1515 }, { "epoch": 1.7056837366347777, "grad_norm": 0.2679693175700858, "learning_rate": 2.394659991656237e-05, "loss": 0.3575, "step": 1516 }, { "epoch": 1.7068092290377042, "grad_norm": 0.270042339489181, "learning_rate": 2.3925740508969545e-05, "loss": 0.3612, "step": 1517 }, { "epoch": 1.7079347214406302, "grad_norm": 0.3277570559960174, "learning_rate": 2.390488110137672e-05, "loss": 0.3539, "step": 1518 }, { "epoch": 1.7090602138435567, "grad_norm": 0.273010908537002, "learning_rate": 2.38840216937839e-05, "loss": 0.3813, "step": 1519 }, { "epoch": 1.7101857062464827, "grad_norm": 0.3163418829636289, "learning_rate": 2.3863162286191072e-05, "loss": 0.367, "step": 1520 }, { "epoch": 1.7113111986494092, "grad_norm": 0.2790546740132572, "learning_rate": 2.384230287859825e-05, "loss": 0.3616, "step": 1521 }, { "epoch": 1.7124366910523354, "grad_norm": 0.3794647847000264, "learning_rate": 2.3821443471005426e-05, "loss": 0.3264, "step": 1522 }, { "epoch": 1.7135621834552617, "grad_norm": 0.27180490681435693, "learning_rate": 2.38005840634126e-05, "loss": 0.3667, "step": 1523 }, { "epoch": 1.714687675858188, "grad_norm": 0.3192761006046379, "learning_rate": 2.3779724655819777e-05, "loss": 0.3516, "step": 1524 }, { "epoch": 1.7158131682611142, "grad_norm": 0.29770507590073336, "learning_rate": 2.3758865248226954e-05, "loss": 0.3616, "step": 1525 }, { "epoch": 1.7169386606640407, "grad_norm": 0.3198828879152863, "learning_rate": 2.3738005840634127e-05, "loss": 0.3529, "step": 1526 }, { "epoch": 1.7180641530669667, "grad_norm": 0.3090153579359256, "learning_rate": 2.3717146433041304e-05, "loss": 0.3526, "step": 1527 }, { "epoch": 1.7191896454698932, "grad_norm": 0.3212232642206978, "learning_rate": 2.369628702544848e-05, "loss": 0.3607, "step": 1528 }, { "epoch": 1.7203151378728192, "grad_norm": 0.30043128684782044, "learning_rate": 2.367542761785565e-05, "loss": 0.352, "step": 1529 }, { "epoch": 1.7214406302757457, "grad_norm": 0.29295625581516677, "learning_rate": 2.365456821026283e-05, "loss": 0.3523, "step": 1530 }, { "epoch": 1.722566122678672, "grad_norm": 0.3148385769404437, "learning_rate": 2.3633708802670006e-05, "loss": 0.3428, "step": 1531 }, { "epoch": 1.7236916150815982, "grad_norm": 0.2809729795961225, "learning_rate": 2.361284939507718e-05, "loss": 0.3501, "step": 1532 }, { "epoch": 1.7248171074845244, "grad_norm": 0.26779520094077724, "learning_rate": 2.3591989987484356e-05, "loss": 0.3692, "step": 1533 }, { "epoch": 1.7259425998874507, "grad_norm": 0.34366805506707354, "learning_rate": 2.3571130579891533e-05, "loss": 0.3487, "step": 1534 }, { "epoch": 1.7270680922903772, "grad_norm": 0.31386821776914015, "learning_rate": 2.3550271172298707e-05, "loss": 0.3738, "step": 1535 }, { "epoch": 1.7281935846933032, "grad_norm": 0.27129248750888196, "learning_rate": 2.3529411764705884e-05, "loss": 0.347, "step": 1536 }, { "epoch": 1.7293190770962297, "grad_norm": 0.30115138922829, "learning_rate": 2.3508552357113057e-05, "loss": 0.3644, "step": 1537 }, { "epoch": 1.730444569499156, "grad_norm": 0.33400000355343035, "learning_rate": 2.3487692949520234e-05, "loss": 0.3735, "step": 1538 }, { "epoch": 1.7315700619020822, "grad_norm": 0.2817713843812286, "learning_rate": 2.346683354192741e-05, "loss": 0.3536, "step": 1539 }, { "epoch": 1.7326955543050084, "grad_norm": 0.2536336062497862, "learning_rate": 2.3445974134334585e-05, "loss": 0.3531, "step": 1540 }, { "epoch": 1.7338210467079347, "grad_norm": 0.292119578282618, "learning_rate": 2.3425114726741762e-05, "loss": 0.3917, "step": 1541 }, { "epoch": 1.7349465391108612, "grad_norm": 0.301736936214816, "learning_rate": 2.340425531914894e-05, "loss": 0.3368, "step": 1542 }, { "epoch": 1.7360720315137872, "grad_norm": 0.2834782995265929, "learning_rate": 2.3383395911556112e-05, "loss": 0.3627, "step": 1543 }, { "epoch": 1.7371975239167137, "grad_norm": 0.3472332663859999, "learning_rate": 2.336253650396329e-05, "loss": 0.3563, "step": 1544 }, { "epoch": 1.7383230163196397, "grad_norm": 0.2770080632091572, "learning_rate": 2.3341677096370466e-05, "loss": 0.3638, "step": 1545 }, { "epoch": 1.7394485087225662, "grad_norm": 0.28038474675505726, "learning_rate": 2.332081768877764e-05, "loss": 0.3578, "step": 1546 }, { "epoch": 1.7405740011254924, "grad_norm": 0.29413387062574675, "learning_rate": 2.3299958281184817e-05, "loss": 0.3581, "step": 1547 }, { "epoch": 1.7416994935284187, "grad_norm": 0.250154894365378, "learning_rate": 2.3279098873591994e-05, "loss": 0.3524, "step": 1548 }, { "epoch": 1.742824985931345, "grad_norm": 0.27004730168507385, "learning_rate": 2.3258239465999164e-05, "loss": 0.3592, "step": 1549 }, { "epoch": 1.7439504783342712, "grad_norm": 0.30931998115710535, "learning_rate": 2.323738005840634e-05, "loss": 0.3633, "step": 1550 }, { "epoch": 1.7450759707371977, "grad_norm": 0.260094920014104, "learning_rate": 2.3216520650813518e-05, "loss": 0.36, "step": 1551 }, { "epoch": 1.7462014631401237, "grad_norm": 0.28020792072208933, "learning_rate": 2.3195661243220692e-05, "loss": 0.3619, "step": 1552 }, { "epoch": 1.7473269555430502, "grad_norm": 0.29150594274575353, "learning_rate": 2.317480183562787e-05, "loss": 0.3752, "step": 1553 }, { "epoch": 1.7484524479459762, "grad_norm": 0.2780077227889234, "learning_rate": 2.3153942428035046e-05, "loss": 0.3404, "step": 1554 }, { "epoch": 1.7495779403489027, "grad_norm": 0.26577200333767786, "learning_rate": 2.313308302044222e-05, "loss": 0.3457, "step": 1555 }, { "epoch": 1.750703432751829, "grad_norm": 0.297363502447975, "learning_rate": 2.3112223612849396e-05, "loss": 0.3473, "step": 1556 }, { "epoch": 1.7518289251547552, "grad_norm": 0.26278420558469534, "learning_rate": 2.309136420525657e-05, "loss": 0.3474, "step": 1557 }, { "epoch": 1.7529544175576814, "grad_norm": 0.26900103936760594, "learning_rate": 2.3070504797663747e-05, "loss": 0.3531, "step": 1558 }, { "epoch": 1.7540799099606077, "grad_norm": 0.32212325532836394, "learning_rate": 2.3049645390070924e-05, "loss": 0.3671, "step": 1559 }, { "epoch": 1.7552054023635342, "grad_norm": 0.2970498028319336, "learning_rate": 2.3028785982478097e-05, "loss": 0.3488, "step": 1560 }, { "epoch": 1.7563308947664602, "grad_norm": 0.3127346620449437, "learning_rate": 2.3007926574885274e-05, "loss": 0.3585, "step": 1561 }, { "epoch": 1.7574563871693867, "grad_norm": 0.2765174597912786, "learning_rate": 2.298706716729245e-05, "loss": 0.3394, "step": 1562 }, { "epoch": 1.758581879572313, "grad_norm": 0.3009783707148293, "learning_rate": 2.2966207759699625e-05, "loss": 0.3629, "step": 1563 }, { "epoch": 1.7597073719752392, "grad_norm": 0.30115522865418154, "learning_rate": 2.2945348352106802e-05, "loss": 0.3725, "step": 1564 }, { "epoch": 1.7608328643781654, "grad_norm": 0.2930519854916032, "learning_rate": 2.292448894451398e-05, "loss": 0.3569, "step": 1565 }, { "epoch": 1.7619583567810917, "grad_norm": 0.3047405558309698, "learning_rate": 2.2903629536921153e-05, "loss": 0.3714, "step": 1566 }, { "epoch": 1.7630838491840182, "grad_norm": 0.2590925307869418, "learning_rate": 2.288277012932833e-05, "loss": 0.3429, "step": 1567 }, { "epoch": 1.7642093415869442, "grad_norm": 0.26569956950346013, "learning_rate": 2.2861910721735507e-05, "loss": 0.3563, "step": 1568 }, { "epoch": 1.7653348339898707, "grad_norm": 0.3212454261162196, "learning_rate": 2.284105131414268e-05, "loss": 0.3628, "step": 1569 }, { "epoch": 1.7664603263927967, "grad_norm": 0.25121736290737545, "learning_rate": 2.2820191906549854e-05, "loss": 0.3407, "step": 1570 }, { "epoch": 1.7675858187957232, "grad_norm": 0.2568310971026976, "learning_rate": 2.279933249895703e-05, "loss": 0.3556, "step": 1571 }, { "epoch": 1.7687113111986494, "grad_norm": 0.2766368973128219, "learning_rate": 2.2778473091364204e-05, "loss": 0.3488, "step": 1572 }, { "epoch": 1.7698368036015757, "grad_norm": 0.2830150867726597, "learning_rate": 2.275761368377138e-05, "loss": 0.3563, "step": 1573 }, { "epoch": 1.770962296004502, "grad_norm": 0.2704782966697743, "learning_rate": 2.2736754276178558e-05, "loss": 0.3785, "step": 1574 }, { "epoch": 1.7720877884074282, "grad_norm": 0.28693834596503254, "learning_rate": 2.2715894868585732e-05, "loss": 0.3568, "step": 1575 }, { "epoch": 1.7732132808103547, "grad_norm": 0.300591423339274, "learning_rate": 2.269503546099291e-05, "loss": 0.3401, "step": 1576 }, { "epoch": 1.7743387732132807, "grad_norm": 0.25477856684248135, "learning_rate": 2.2674176053400082e-05, "loss": 0.3485, "step": 1577 }, { "epoch": 1.7754642656162072, "grad_norm": 0.27289380951433195, "learning_rate": 2.265331664580726e-05, "loss": 0.3607, "step": 1578 }, { "epoch": 1.7765897580191332, "grad_norm": 0.28248295859121, "learning_rate": 2.2632457238214436e-05, "loss": 0.3511, "step": 1579 }, { "epoch": 1.7777152504220597, "grad_norm": 0.2658629597762577, "learning_rate": 2.261159783062161e-05, "loss": 0.377, "step": 1580 }, { "epoch": 1.778840742824986, "grad_norm": 0.27220952413476507, "learning_rate": 2.2590738423028787e-05, "loss": 0.3557, "step": 1581 }, { "epoch": 1.7799662352279122, "grad_norm": 0.2328823780748166, "learning_rate": 2.2569879015435964e-05, "loss": 0.3437, "step": 1582 }, { "epoch": 1.7810917276308385, "grad_norm": 0.27552976720286626, "learning_rate": 2.2549019607843138e-05, "loss": 0.3499, "step": 1583 }, { "epoch": 1.7822172200337647, "grad_norm": 0.26988928502984605, "learning_rate": 2.2528160200250315e-05, "loss": 0.3739, "step": 1584 }, { "epoch": 1.7833427124366912, "grad_norm": 0.28360069868054577, "learning_rate": 2.250730079265749e-05, "loss": 0.3586, "step": 1585 }, { "epoch": 1.7844682048396172, "grad_norm": 0.30703362231564924, "learning_rate": 2.2486441385064665e-05, "loss": 0.3402, "step": 1586 }, { "epoch": 1.7855936972425437, "grad_norm": 0.24229886888660893, "learning_rate": 2.2465581977471842e-05, "loss": 0.3749, "step": 1587 }, { "epoch": 1.78671918964547, "grad_norm": 0.3052949362012416, "learning_rate": 2.244472256987902e-05, "loss": 0.3432, "step": 1588 }, { "epoch": 1.7878446820483962, "grad_norm": 0.30727845879387705, "learning_rate": 2.2423863162286193e-05, "loss": 0.352, "step": 1589 }, { "epoch": 1.7889701744513224, "grad_norm": 0.2871867401825979, "learning_rate": 2.240300375469337e-05, "loss": 0.3654, "step": 1590 }, { "epoch": 1.7900956668542487, "grad_norm": 0.29179973425408784, "learning_rate": 2.2382144347100543e-05, "loss": 0.3616, "step": 1591 }, { "epoch": 1.7912211592571752, "grad_norm": 0.3214829947161843, "learning_rate": 2.2361284939507717e-05, "loss": 0.3512, "step": 1592 }, { "epoch": 1.7923466516601012, "grad_norm": 0.24147795550593532, "learning_rate": 2.2340425531914894e-05, "loss": 0.3462, "step": 1593 }, { "epoch": 1.7934721440630277, "grad_norm": 0.27689467276611157, "learning_rate": 2.231956612432207e-05, "loss": 0.36, "step": 1594 }, { "epoch": 1.7945976364659537, "grad_norm": 0.28150686561848237, "learning_rate": 2.2298706716729244e-05, "loss": 0.3532, "step": 1595 }, { "epoch": 1.7957231288688802, "grad_norm": 0.2581225749795623, "learning_rate": 2.227784730913642e-05, "loss": 0.3559, "step": 1596 }, { "epoch": 1.7968486212718064, "grad_norm": 0.3039816245853392, "learning_rate": 2.22569879015436e-05, "loss": 0.3538, "step": 1597 }, { "epoch": 1.7979741136747327, "grad_norm": 0.25714237851869526, "learning_rate": 2.2236128493950772e-05, "loss": 0.3442, "step": 1598 }, { "epoch": 1.799099606077659, "grad_norm": 0.24074871831754024, "learning_rate": 2.221526908635795e-05, "loss": 0.349, "step": 1599 }, { "epoch": 1.8002250984805852, "grad_norm": 0.28044366274540433, "learning_rate": 2.2194409678765123e-05, "loss": 0.3476, "step": 1600 }, { "epoch": 1.8013505908835117, "grad_norm": 0.3055206598860284, "learning_rate": 2.21735502711723e-05, "loss": 0.329, "step": 1601 }, { "epoch": 1.8024760832864377, "grad_norm": 0.32077315537142925, "learning_rate": 2.2152690863579477e-05, "loss": 0.3632, "step": 1602 }, { "epoch": 1.8036015756893642, "grad_norm": 0.3191853556743268, "learning_rate": 2.213183145598665e-05, "loss": 0.3724, "step": 1603 }, { "epoch": 1.8047270680922902, "grad_norm": 0.309559125522351, "learning_rate": 2.2110972048393827e-05, "loss": 0.36, "step": 1604 }, { "epoch": 1.8058525604952167, "grad_norm": 0.31187663837864876, "learning_rate": 2.2090112640801004e-05, "loss": 0.344, "step": 1605 }, { "epoch": 1.806978052898143, "grad_norm": 0.3041182224443529, "learning_rate": 2.2069253233208178e-05, "loss": 0.353, "step": 1606 }, { "epoch": 1.8081035453010692, "grad_norm": 0.29282481275876526, "learning_rate": 2.2048393825615355e-05, "loss": 0.3486, "step": 1607 }, { "epoch": 1.8092290377039955, "grad_norm": 0.29147172423218604, "learning_rate": 2.202753441802253e-05, "loss": 0.3672, "step": 1608 }, { "epoch": 1.8103545301069217, "grad_norm": 0.28412350307097217, "learning_rate": 2.2006675010429705e-05, "loss": 0.3353, "step": 1609 }, { "epoch": 1.8114800225098482, "grad_norm": 0.3067481559384326, "learning_rate": 2.1985815602836882e-05, "loss": 0.3768, "step": 1610 }, { "epoch": 1.8126055149127742, "grad_norm": 0.28038315675437364, "learning_rate": 2.1964956195244056e-05, "loss": 0.3654, "step": 1611 }, { "epoch": 1.8137310073157007, "grad_norm": 0.35539158393187636, "learning_rate": 2.194409678765123e-05, "loss": 0.3538, "step": 1612 }, { "epoch": 1.814856499718627, "grad_norm": 0.25885136003612047, "learning_rate": 2.1923237380058406e-05, "loss": 0.3644, "step": 1613 }, { "epoch": 1.8159819921215532, "grad_norm": 0.26093239365043436, "learning_rate": 2.1902377972465583e-05, "loss": 0.3708, "step": 1614 }, { "epoch": 1.8171074845244795, "grad_norm": 0.2961872877279803, "learning_rate": 2.1881518564872757e-05, "loss": 0.3596, "step": 1615 }, { "epoch": 1.8182329769274057, "grad_norm": 0.25680902610020434, "learning_rate": 2.1860659157279934e-05, "loss": 0.3577, "step": 1616 }, { "epoch": 1.8193584693303322, "grad_norm": 0.25991234140815395, "learning_rate": 2.183979974968711e-05, "loss": 0.3633, "step": 1617 }, { "epoch": 1.8204839617332582, "grad_norm": 0.2613869530348512, "learning_rate": 2.1818940342094285e-05, "loss": 0.3418, "step": 1618 }, { "epoch": 1.8216094541361847, "grad_norm": 0.24790904990988194, "learning_rate": 2.179808093450146e-05, "loss": 0.3632, "step": 1619 }, { "epoch": 1.8227349465391107, "grad_norm": 0.28799463478351406, "learning_rate": 2.1777221526908635e-05, "loss": 0.3699, "step": 1620 }, { "epoch": 1.8238604389420372, "grad_norm": 0.25548160538250764, "learning_rate": 2.1756362119315812e-05, "loss": 0.3442, "step": 1621 }, { "epoch": 1.8249859313449635, "grad_norm": 0.2985142619761683, "learning_rate": 2.173550271172299e-05, "loss": 0.3486, "step": 1622 }, { "epoch": 1.8261114237478897, "grad_norm": 0.2972946035959545, "learning_rate": 2.1714643304130163e-05, "loss": 0.3414, "step": 1623 }, { "epoch": 1.827236916150816, "grad_norm": 0.26170651498968683, "learning_rate": 2.169378389653734e-05, "loss": 0.3284, "step": 1624 }, { "epoch": 1.8283624085537422, "grad_norm": 0.2524407858115918, "learning_rate": 2.1672924488944517e-05, "loss": 0.3599, "step": 1625 }, { "epoch": 1.8294879009566687, "grad_norm": 0.3335691621333924, "learning_rate": 2.165206508135169e-05, "loss": 0.3603, "step": 1626 }, { "epoch": 1.8306133933595947, "grad_norm": 0.2768913167073537, "learning_rate": 2.1631205673758867e-05, "loss": 0.3588, "step": 1627 }, { "epoch": 1.8317388857625212, "grad_norm": 0.30050684042922793, "learning_rate": 2.1610346266166044e-05, "loss": 0.372, "step": 1628 }, { "epoch": 1.8328643781654472, "grad_norm": 0.2901843574196796, "learning_rate": 2.1589486858573218e-05, "loss": 0.3639, "step": 1629 }, { "epoch": 1.8339898705683737, "grad_norm": 0.29902669217912486, "learning_rate": 2.1568627450980395e-05, "loss": 0.3735, "step": 1630 }, { "epoch": 1.8351153629713, "grad_norm": 0.30980781618970216, "learning_rate": 2.154776804338757e-05, "loss": 0.3572, "step": 1631 }, { "epoch": 1.8362408553742262, "grad_norm": 0.26616420601594276, "learning_rate": 2.1526908635794745e-05, "loss": 0.3582, "step": 1632 }, { "epoch": 1.8373663477771525, "grad_norm": 0.29096782812841715, "learning_rate": 2.150604922820192e-05, "loss": 0.3533, "step": 1633 }, { "epoch": 1.8384918401800787, "grad_norm": 0.29936454913412547, "learning_rate": 2.1485189820609096e-05, "loss": 0.3441, "step": 1634 }, { "epoch": 1.8396173325830052, "grad_norm": 0.34946000879087785, "learning_rate": 2.146433041301627e-05, "loss": 0.3628, "step": 1635 }, { "epoch": 1.8407428249859312, "grad_norm": 0.2623712677205065, "learning_rate": 2.1443471005423447e-05, "loss": 0.3545, "step": 1636 }, { "epoch": 1.8418683173888577, "grad_norm": 0.2753735634753566, "learning_rate": 2.1422611597830624e-05, "loss": 0.3528, "step": 1637 }, { "epoch": 1.842993809791784, "grad_norm": 0.31812525886192866, "learning_rate": 2.1401752190237797e-05, "loss": 0.3697, "step": 1638 }, { "epoch": 1.8441193021947102, "grad_norm": 0.29105961621045684, "learning_rate": 2.1380892782644974e-05, "loss": 0.3546, "step": 1639 }, { "epoch": 1.8452447945976365, "grad_norm": 0.2691984264982239, "learning_rate": 2.1360033375052148e-05, "loss": 0.3536, "step": 1640 }, { "epoch": 1.8463702870005627, "grad_norm": 0.2993538772854178, "learning_rate": 2.1339173967459325e-05, "loss": 0.363, "step": 1641 }, { "epoch": 1.8474957794034892, "grad_norm": 0.29783181788963287, "learning_rate": 2.13183145598665e-05, "loss": 0.3592, "step": 1642 }, { "epoch": 1.8486212718064152, "grad_norm": 0.2775688059074239, "learning_rate": 2.1297455152273675e-05, "loss": 0.3581, "step": 1643 }, { "epoch": 1.8497467642093417, "grad_norm": 0.3133614801924746, "learning_rate": 2.1276595744680852e-05, "loss": 0.3498, "step": 1644 }, { "epoch": 1.8508722566122677, "grad_norm": 0.2772230818911116, "learning_rate": 2.125573633708803e-05, "loss": 0.3558, "step": 1645 }, { "epoch": 1.8519977490151942, "grad_norm": 0.30827283116401644, "learning_rate": 2.1234876929495203e-05, "loss": 0.3614, "step": 1646 }, { "epoch": 1.8531232414181205, "grad_norm": 0.24090218764810817, "learning_rate": 2.121401752190238e-05, "loss": 0.3662, "step": 1647 }, { "epoch": 1.8542487338210467, "grad_norm": 0.28761910481188807, "learning_rate": 2.1193158114309557e-05, "loss": 0.3441, "step": 1648 }, { "epoch": 1.855374226223973, "grad_norm": 0.2560509442786654, "learning_rate": 2.117229870671673e-05, "loss": 0.3547, "step": 1649 }, { "epoch": 1.8564997186268992, "grad_norm": 0.30034883076743724, "learning_rate": 2.1151439299123907e-05, "loss": 0.3449, "step": 1650 }, { "epoch": 1.8576252110298257, "grad_norm": 0.34444462233589906, "learning_rate": 2.113057989153108e-05, "loss": 0.3837, "step": 1651 }, { "epoch": 1.8587507034327517, "grad_norm": 0.27692690682489174, "learning_rate": 2.1109720483938258e-05, "loss": 0.3607, "step": 1652 }, { "epoch": 1.8598761958356782, "grad_norm": 0.26001142796077303, "learning_rate": 2.1088861076345435e-05, "loss": 0.3398, "step": 1653 }, { "epoch": 1.8610016882386042, "grad_norm": 0.25366060360784753, "learning_rate": 2.106800166875261e-05, "loss": 0.3656, "step": 1654 }, { "epoch": 1.8621271806415307, "grad_norm": 0.25058815872177637, "learning_rate": 2.1047142261159782e-05, "loss": 0.3439, "step": 1655 }, { "epoch": 1.863252673044457, "grad_norm": 0.28664975028041284, "learning_rate": 2.102628285356696e-05, "loss": 0.3583, "step": 1656 }, { "epoch": 1.8643781654473832, "grad_norm": 0.2732549675529288, "learning_rate": 2.1005423445974136e-05, "loss": 0.3305, "step": 1657 }, { "epoch": 1.8655036578503095, "grad_norm": 0.2773666490469463, "learning_rate": 2.098456403838131e-05, "loss": 0.3591, "step": 1658 }, { "epoch": 1.8666291502532357, "grad_norm": 0.2690002427002813, "learning_rate": 2.0963704630788487e-05, "loss": 0.3684, "step": 1659 }, { "epoch": 1.8677546426561622, "grad_norm": 0.27085097978896006, "learning_rate": 2.094284522319566e-05, "loss": 0.3384, "step": 1660 }, { "epoch": 1.8688801350590882, "grad_norm": 0.24697707069643743, "learning_rate": 2.0921985815602837e-05, "loss": 0.3572, "step": 1661 }, { "epoch": 1.8700056274620147, "grad_norm": 0.2764605247602527, "learning_rate": 2.0901126408010014e-05, "loss": 0.3552, "step": 1662 }, { "epoch": 1.871131119864941, "grad_norm": 0.2902550139143697, "learning_rate": 2.0880267000417188e-05, "loss": 0.3581, "step": 1663 }, { "epoch": 1.8722566122678672, "grad_norm": 0.25734658506325125, "learning_rate": 2.0859407592824365e-05, "loss": 0.3509, "step": 1664 }, { "epoch": 1.8733821046707935, "grad_norm": 0.29290615718137913, "learning_rate": 2.0838548185231542e-05, "loss": 0.3448, "step": 1665 }, { "epoch": 1.8745075970737197, "grad_norm": 0.2633403418767797, "learning_rate": 2.0817688777638715e-05, "loss": 0.3556, "step": 1666 }, { "epoch": 1.8756330894766462, "grad_norm": 0.3044255909775045, "learning_rate": 2.0796829370045892e-05, "loss": 0.3451, "step": 1667 }, { "epoch": 1.8767585818795722, "grad_norm": 0.2932864685525451, "learning_rate": 2.077596996245307e-05, "loss": 0.3657, "step": 1668 }, { "epoch": 1.8778840742824987, "grad_norm": 0.31135509455954635, "learning_rate": 2.0755110554860243e-05, "loss": 0.3734, "step": 1669 }, { "epoch": 1.8790095666854247, "grad_norm": 0.2664061935893102, "learning_rate": 2.073425114726742e-05, "loss": 0.3629, "step": 1670 }, { "epoch": 1.8801350590883512, "grad_norm": 0.2707969930148503, "learning_rate": 2.0713391739674597e-05, "loss": 0.3483, "step": 1671 }, { "epoch": 1.8812605514912775, "grad_norm": 0.2582761473461036, "learning_rate": 2.069253233208177e-05, "loss": 0.366, "step": 1672 }, { "epoch": 1.8823860438942037, "grad_norm": 0.2818191859830275, "learning_rate": 2.0671672924488947e-05, "loss": 0.3606, "step": 1673 }, { "epoch": 1.88351153629713, "grad_norm": 0.274907626023918, "learning_rate": 2.065081351689612e-05, "loss": 0.3733, "step": 1674 }, { "epoch": 1.8846370287000562, "grad_norm": 0.25302448281459705, "learning_rate": 2.0629954109303295e-05, "loss": 0.344, "step": 1675 }, { "epoch": 1.8857625211029827, "grad_norm": 0.2601145397643824, "learning_rate": 2.060909470171047e-05, "loss": 0.3655, "step": 1676 }, { "epoch": 1.8868880135059087, "grad_norm": 0.2598011168749623, "learning_rate": 2.058823529411765e-05, "loss": 0.3583, "step": 1677 }, { "epoch": 1.8880135059088352, "grad_norm": 0.2764045861628215, "learning_rate": 2.0567375886524822e-05, "loss": 0.3358, "step": 1678 }, { "epoch": 1.8891389983117612, "grad_norm": 0.2505563945259788, "learning_rate": 2.0546516478932e-05, "loss": 0.3326, "step": 1679 }, { "epoch": 1.8902644907146877, "grad_norm": 0.2593385914562438, "learning_rate": 2.0525657071339173e-05, "loss": 0.344, "step": 1680 }, { "epoch": 1.891389983117614, "grad_norm": 0.32013539903668187, "learning_rate": 2.050479766374635e-05, "loss": 0.3678, "step": 1681 }, { "epoch": 1.8925154755205402, "grad_norm": 0.2850992099914004, "learning_rate": 2.0483938256153527e-05, "loss": 0.397, "step": 1682 }, { "epoch": 1.8936409679234665, "grad_norm": 0.3016034620250037, "learning_rate": 2.04630788485607e-05, "loss": 0.3358, "step": 1683 }, { "epoch": 1.8947664603263927, "grad_norm": 0.322626269426066, "learning_rate": 2.0442219440967877e-05, "loss": 0.3493, "step": 1684 }, { "epoch": 1.8958919527293192, "grad_norm": 0.27415129738901345, "learning_rate": 2.0421360033375054e-05, "loss": 0.3612, "step": 1685 }, { "epoch": 1.8970174451322452, "grad_norm": 0.3202508460747489, "learning_rate": 2.0400500625782228e-05, "loss": 0.3449, "step": 1686 }, { "epoch": 1.8981429375351717, "grad_norm": 0.2610128644172156, "learning_rate": 2.0379641218189405e-05, "loss": 0.334, "step": 1687 }, { "epoch": 1.899268429938098, "grad_norm": 0.26431886989489495, "learning_rate": 2.0358781810596582e-05, "loss": 0.3701, "step": 1688 }, { "epoch": 1.9003939223410242, "grad_norm": 0.32289025222752066, "learning_rate": 2.0337922403003756e-05, "loss": 0.3772, "step": 1689 }, { "epoch": 1.9015194147439505, "grad_norm": 0.27620099175466095, "learning_rate": 2.0317062995410932e-05, "loss": 0.3634, "step": 1690 }, { "epoch": 1.9026449071468767, "grad_norm": 0.30452855448211125, "learning_rate": 2.029620358781811e-05, "loss": 0.3619, "step": 1691 }, { "epoch": 1.9037703995498032, "grad_norm": 0.30999319017283444, "learning_rate": 2.0275344180225283e-05, "loss": 0.3472, "step": 1692 }, { "epoch": 1.9048958919527292, "grad_norm": 0.34073549354424293, "learning_rate": 2.025448477263246e-05, "loss": 0.3417, "step": 1693 }, { "epoch": 1.9060213843556557, "grad_norm": 0.28162550986145274, "learning_rate": 2.0233625365039634e-05, "loss": 0.3536, "step": 1694 }, { "epoch": 1.9071468767585817, "grad_norm": 0.3215339598711887, "learning_rate": 2.0212765957446807e-05, "loss": 0.3682, "step": 1695 }, { "epoch": 1.9082723691615082, "grad_norm": 0.34154514944007364, "learning_rate": 2.0191906549853984e-05, "loss": 0.3573, "step": 1696 }, { "epoch": 1.9093978615644345, "grad_norm": 0.27450876997174517, "learning_rate": 2.017104714226116e-05, "loss": 0.3656, "step": 1697 }, { "epoch": 1.9105233539673607, "grad_norm": 0.32973694211143484, "learning_rate": 2.0150187734668335e-05, "loss": 0.3729, "step": 1698 }, { "epoch": 1.911648846370287, "grad_norm": 0.33057591238589434, "learning_rate": 2.0129328327075512e-05, "loss": 0.371, "step": 1699 }, { "epoch": 1.9127743387732132, "grad_norm": 0.28948186161364625, "learning_rate": 2.0108468919482685e-05, "loss": 0.3397, "step": 1700 }, { "epoch": 1.9138998311761397, "grad_norm": 0.3007970569880779, "learning_rate": 2.0087609511889862e-05, "loss": 0.3643, "step": 1701 }, { "epoch": 1.9150253235790657, "grad_norm": 0.2612518404162693, "learning_rate": 2.006675010429704e-05, "loss": 0.3532, "step": 1702 }, { "epoch": 1.9161508159819922, "grad_norm": 0.31521980587085163, "learning_rate": 2.0045890696704213e-05, "loss": 0.3572, "step": 1703 }, { "epoch": 1.9172763083849182, "grad_norm": 0.32716978204799535, "learning_rate": 2.002503128911139e-05, "loss": 0.3655, "step": 1704 }, { "epoch": 1.9184018007878447, "grad_norm": 0.2848312721456602, "learning_rate": 2.0004171881518567e-05, "loss": 0.3293, "step": 1705 }, { "epoch": 1.919527293190771, "grad_norm": 0.2849516222624094, "learning_rate": 1.998331247392574e-05, "loss": 0.3493, "step": 1706 }, { "epoch": 1.9206527855936972, "grad_norm": 0.2748223750385321, "learning_rate": 1.9962453066332917e-05, "loss": 0.3361, "step": 1707 }, { "epoch": 1.9217782779966235, "grad_norm": 0.3052533145067581, "learning_rate": 1.9941593658740094e-05, "loss": 0.3697, "step": 1708 }, { "epoch": 1.9229037703995497, "grad_norm": 0.2819225673013518, "learning_rate": 1.9920734251147268e-05, "loss": 0.3598, "step": 1709 }, { "epoch": 1.9240292628024762, "grad_norm": 0.28297852832083414, "learning_rate": 1.9899874843554445e-05, "loss": 0.3421, "step": 1710 }, { "epoch": 1.9251547552054022, "grad_norm": 0.32135792331365465, "learning_rate": 1.9879015435961622e-05, "loss": 0.3728, "step": 1711 }, { "epoch": 1.9262802476083287, "grad_norm": 0.2485116486993189, "learning_rate": 1.9858156028368796e-05, "loss": 0.3494, "step": 1712 }, { "epoch": 1.927405740011255, "grad_norm": 0.2749683711636245, "learning_rate": 1.9837296620775973e-05, "loss": 0.346, "step": 1713 }, { "epoch": 1.9285312324141812, "grad_norm": 0.2642179410888402, "learning_rate": 1.9816437213183146e-05, "loss": 0.3548, "step": 1714 }, { "epoch": 1.9296567248171075, "grad_norm": 0.25158261695078715, "learning_rate": 1.9795577805590323e-05, "loss": 0.359, "step": 1715 }, { "epoch": 1.9307822172200337, "grad_norm": 0.27223176041458313, "learning_rate": 1.9774718397997497e-05, "loss": 0.3414, "step": 1716 }, { "epoch": 1.93190770962296, "grad_norm": 0.2782144577617854, "learning_rate": 1.9753858990404674e-05, "loss": 0.3561, "step": 1717 }, { "epoch": 1.9330332020258862, "grad_norm": 0.27538099734788146, "learning_rate": 1.9732999582811847e-05, "loss": 0.3472, "step": 1718 }, { "epoch": 1.9341586944288127, "grad_norm": 0.2960828119915571, "learning_rate": 1.9712140175219024e-05, "loss": 0.3496, "step": 1719 }, { "epoch": 1.9352841868317388, "grad_norm": 0.258095045594745, "learning_rate": 1.9691280767626198e-05, "loss": 0.3517, "step": 1720 }, { "epoch": 1.9364096792346652, "grad_norm": 0.3024256600541793, "learning_rate": 1.9670421360033375e-05, "loss": 0.3586, "step": 1721 }, { "epoch": 1.9375351716375915, "grad_norm": 0.29098939153442666, "learning_rate": 1.9649561952440552e-05, "loss": 0.3643, "step": 1722 }, { "epoch": 1.9386606640405177, "grad_norm": 0.25782898610022725, "learning_rate": 1.9628702544847726e-05, "loss": 0.367, "step": 1723 }, { "epoch": 1.939786156443444, "grad_norm": 0.3495526740430891, "learning_rate": 1.9607843137254903e-05, "loss": 0.3577, "step": 1724 }, { "epoch": 1.9409116488463702, "grad_norm": 0.2728973828660973, "learning_rate": 1.958698372966208e-05, "loss": 0.3554, "step": 1725 }, { "epoch": 1.9420371412492967, "grad_norm": 0.2901290142358023, "learning_rate": 1.9566124322069253e-05, "loss": 0.37, "step": 1726 }, { "epoch": 1.9431626336522227, "grad_norm": 0.3031752356222974, "learning_rate": 1.954526491447643e-05, "loss": 0.3638, "step": 1727 }, { "epoch": 1.9442881260551492, "grad_norm": 0.260909753207997, "learning_rate": 1.9524405506883607e-05, "loss": 0.3618, "step": 1728 }, { "epoch": 1.9454136184580753, "grad_norm": 0.28948350014768964, "learning_rate": 1.950354609929078e-05, "loss": 0.3401, "step": 1729 }, { "epoch": 1.9465391108610017, "grad_norm": 0.2623446580726307, "learning_rate": 1.9482686691697958e-05, "loss": 0.3618, "step": 1730 }, { "epoch": 1.947664603263928, "grad_norm": 0.2666588748626957, "learning_rate": 1.9461827284105135e-05, "loss": 0.3424, "step": 1731 }, { "epoch": 1.9487900956668542, "grad_norm": 0.23758227129892492, "learning_rate": 1.9440967876512308e-05, "loss": 0.3647, "step": 1732 }, { "epoch": 1.9499155880697805, "grad_norm": 1.0235070433552647, "learning_rate": 1.9420108468919485e-05, "loss": 0.362, "step": 1733 }, { "epoch": 1.9510410804727067, "grad_norm": 0.28481161066229677, "learning_rate": 1.939924906132666e-05, "loss": 0.3631, "step": 1734 }, { "epoch": 1.9521665728756332, "grad_norm": 0.2848122389618838, "learning_rate": 1.9378389653733836e-05, "loss": 0.3469, "step": 1735 }, { "epoch": 1.9532920652785593, "grad_norm": 0.2759014719515173, "learning_rate": 1.9357530246141013e-05, "loss": 0.3425, "step": 1736 }, { "epoch": 1.9544175576814857, "grad_norm": 0.27874949300316715, "learning_rate": 1.9336670838548186e-05, "loss": 0.3855, "step": 1737 }, { "epoch": 1.955543050084412, "grad_norm": 0.31363642679753656, "learning_rate": 1.931581143095536e-05, "loss": 0.3536, "step": 1738 }, { "epoch": 1.9566685424873382, "grad_norm": 0.2556224324207228, "learning_rate": 1.9294952023362537e-05, "loss": 0.3432, "step": 1739 }, { "epoch": 1.9577940348902645, "grad_norm": 0.2670888092453423, "learning_rate": 1.927409261576971e-05, "loss": 0.3509, "step": 1740 }, { "epoch": 1.9589195272931907, "grad_norm": 0.25001267900165874, "learning_rate": 1.9253233208176888e-05, "loss": 0.3323, "step": 1741 }, { "epoch": 1.960045019696117, "grad_norm": 0.2974207872384669, "learning_rate": 1.9232373800584064e-05, "loss": 0.3544, "step": 1742 }, { "epoch": 1.9611705120990433, "grad_norm": 0.27472747483190185, "learning_rate": 1.9211514392991238e-05, "loss": 0.3521, "step": 1743 }, { "epoch": 1.9622960045019697, "grad_norm": 0.2683475797146492, "learning_rate": 1.9190654985398415e-05, "loss": 0.3682, "step": 1744 }, { "epoch": 1.9634214969048958, "grad_norm": 0.3822465905741808, "learning_rate": 1.9169795577805592e-05, "loss": 0.3535, "step": 1745 }, { "epoch": 1.9645469893078222, "grad_norm": 0.29811948702966473, "learning_rate": 1.9148936170212766e-05, "loss": 0.3732, "step": 1746 }, { "epoch": 1.9656724817107485, "grad_norm": 0.30142259657958986, "learning_rate": 1.9128076762619943e-05, "loss": 0.3432, "step": 1747 }, { "epoch": 1.9667979741136747, "grad_norm": 0.3786818892770981, "learning_rate": 1.910721735502712e-05, "loss": 0.3654, "step": 1748 }, { "epoch": 1.967923466516601, "grad_norm": 0.27029496791481233, "learning_rate": 1.9086357947434293e-05, "loss": 0.3682, "step": 1749 }, { "epoch": 1.9690489589195272, "grad_norm": 0.262798379544428, "learning_rate": 1.906549853984147e-05, "loss": 0.3568, "step": 1750 }, { "epoch": 1.9701744513224537, "grad_norm": 0.3135712581670641, "learning_rate": 1.9044639132248647e-05, "loss": 0.3511, "step": 1751 }, { "epoch": 1.9712999437253798, "grad_norm": 0.3158145619580369, "learning_rate": 1.902377972465582e-05, "loss": 0.36, "step": 1752 }, { "epoch": 1.9724254361283062, "grad_norm": 0.318706113946463, "learning_rate": 1.9002920317062998e-05, "loss": 0.3562, "step": 1753 }, { "epoch": 1.9735509285312323, "grad_norm": 0.310806681735437, "learning_rate": 1.898206090947017e-05, "loss": 0.3514, "step": 1754 }, { "epoch": 1.9746764209341587, "grad_norm": 0.2849866940009224, "learning_rate": 1.896120150187735e-05, "loss": 0.3477, "step": 1755 }, { "epoch": 1.975801913337085, "grad_norm": 0.2810634482745697, "learning_rate": 1.8940342094284525e-05, "loss": 0.3644, "step": 1756 }, { "epoch": 1.9769274057400112, "grad_norm": 0.2878137639733897, "learning_rate": 1.89194826866917e-05, "loss": 0.3594, "step": 1757 }, { "epoch": 1.9780528981429375, "grad_norm": 0.26057909113445293, "learning_rate": 1.8898623279098873e-05, "loss": 0.3755, "step": 1758 }, { "epoch": 1.9791783905458638, "grad_norm": 0.27092989925442396, "learning_rate": 1.887776387150605e-05, "loss": 0.3648, "step": 1759 }, { "epoch": 1.9803038829487902, "grad_norm": 0.2845108154959281, "learning_rate": 1.8856904463913223e-05, "loss": 0.3449, "step": 1760 }, { "epoch": 1.9814293753517163, "grad_norm": 0.24467445189735315, "learning_rate": 1.88360450563204e-05, "loss": 0.3558, "step": 1761 }, { "epoch": 1.9825548677546427, "grad_norm": 0.2715643743977259, "learning_rate": 1.8815185648727577e-05, "loss": 0.3567, "step": 1762 }, { "epoch": 1.983680360157569, "grad_norm": 0.2613996036084293, "learning_rate": 1.879432624113475e-05, "loss": 0.3467, "step": 1763 }, { "epoch": 1.9848058525604952, "grad_norm": 0.2816357872833296, "learning_rate": 1.8773466833541928e-05, "loss": 0.3417, "step": 1764 }, { "epoch": 1.9859313449634215, "grad_norm": 0.29529698315579805, "learning_rate": 1.8752607425949105e-05, "loss": 0.3527, "step": 1765 }, { "epoch": 1.9870568373663478, "grad_norm": 0.27238727861070106, "learning_rate": 1.8731748018356278e-05, "loss": 0.3717, "step": 1766 }, { "epoch": 1.988182329769274, "grad_norm": 0.27577156414013015, "learning_rate": 1.8710888610763455e-05, "loss": 0.3632, "step": 1767 }, { "epoch": 1.9893078221722003, "grad_norm": 0.31287278872365587, "learning_rate": 1.8690029203170632e-05, "loss": 0.3678, "step": 1768 }, { "epoch": 1.9904333145751267, "grad_norm": 0.2910024485455243, "learning_rate": 1.8669169795577806e-05, "loss": 0.3661, "step": 1769 }, { "epoch": 1.9915588069780528, "grad_norm": 0.29522751930001573, "learning_rate": 1.8648310387984983e-05, "loss": 0.3733, "step": 1770 }, { "epoch": 1.9926842993809792, "grad_norm": 0.2931943333929543, "learning_rate": 1.862745098039216e-05, "loss": 0.3554, "step": 1771 }, { "epoch": 1.9938097917839055, "grad_norm": 0.29961502454826516, "learning_rate": 1.8606591572799333e-05, "loss": 0.3534, "step": 1772 }, { "epoch": 1.9949352841868317, "grad_norm": 0.3016308875068367, "learning_rate": 1.858573216520651e-05, "loss": 0.3868, "step": 1773 }, { "epoch": 1.996060776589758, "grad_norm": 0.3051815491365933, "learning_rate": 1.8564872757613684e-05, "loss": 0.3573, "step": 1774 }, { "epoch": 1.9971862689926843, "grad_norm": 0.3463472368237023, "learning_rate": 1.854401335002086e-05, "loss": 0.3568, "step": 1775 }, { "epoch": 1.9983117613956107, "grad_norm": 0.30250184431483823, "learning_rate": 1.8523153942428038e-05, "loss": 0.3679, "step": 1776 }, { "epoch": 1.9994372537985368, "grad_norm": 0.27076812267453526, "learning_rate": 1.850229453483521e-05, "loss": 0.359, "step": 1777 }, { "epoch": 2.0, "grad_norm": 0.41375343672066456, "learning_rate": 1.848143512724239e-05, "loss": 0.3219, "step": 1778 }, { "epoch": 2.0011254924029265, "grad_norm": 0.30690460412174675, "learning_rate": 1.8460575719649562e-05, "loss": 0.2847, "step": 1779 }, { "epoch": 2.0022509848058525, "grad_norm": 0.29252851536455965, "learning_rate": 1.8439716312056736e-05, "loss": 0.2916, "step": 1780 }, { "epoch": 2.003376477208779, "grad_norm": 0.2867585999652241, "learning_rate": 1.8418856904463913e-05, "loss": 0.304, "step": 1781 }, { "epoch": 2.004501969611705, "grad_norm": 0.3147976773039966, "learning_rate": 1.839799749687109e-05, "loss": 0.2891, "step": 1782 }, { "epoch": 2.0056274620146315, "grad_norm": 0.2441846828289504, "learning_rate": 1.8377138089278263e-05, "loss": 0.2909, "step": 1783 }, { "epoch": 2.0067529544175575, "grad_norm": 0.2593896216365388, "learning_rate": 1.835627868168544e-05, "loss": 0.2753, "step": 1784 }, { "epoch": 2.007878446820484, "grad_norm": 0.2893905461877493, "learning_rate": 1.8335419274092617e-05, "loss": 0.2818, "step": 1785 }, { "epoch": 2.00900393922341, "grad_norm": 0.2846562929483248, "learning_rate": 1.831455986649979e-05, "loss": 0.2907, "step": 1786 }, { "epoch": 2.0101294316263365, "grad_norm": 0.2566724532797832, "learning_rate": 1.8293700458906968e-05, "loss": 0.2865, "step": 1787 }, { "epoch": 2.011254924029263, "grad_norm": 0.30986557763389416, "learning_rate": 1.8272841051314145e-05, "loss": 0.299, "step": 1788 }, { "epoch": 2.012380416432189, "grad_norm": 0.2790346837879426, "learning_rate": 1.825198164372132e-05, "loss": 0.2872, "step": 1789 }, { "epoch": 2.0135059088351155, "grad_norm": 0.28965248515971675, "learning_rate": 1.8231122236128495e-05, "loss": 0.282, "step": 1790 }, { "epoch": 2.0146314012380415, "grad_norm": 0.26758447999158064, "learning_rate": 1.8210262828535672e-05, "loss": 0.2854, "step": 1791 }, { "epoch": 2.015756893640968, "grad_norm": 0.25752829835015667, "learning_rate": 1.8189403420942846e-05, "loss": 0.2875, "step": 1792 }, { "epoch": 2.016882386043894, "grad_norm": 0.26237094621373575, "learning_rate": 1.8168544013350023e-05, "loss": 0.2861, "step": 1793 }, { "epoch": 2.0180078784468205, "grad_norm": 0.25324822624548066, "learning_rate": 1.8147684605757196e-05, "loss": 0.2804, "step": 1794 }, { "epoch": 2.019133370849747, "grad_norm": 0.27650509711437854, "learning_rate": 1.8126825198164373e-05, "loss": 0.298, "step": 1795 }, { "epoch": 2.020258863252673, "grad_norm": 0.271607108362916, "learning_rate": 1.810596579057155e-05, "loss": 0.2793, "step": 1796 }, { "epoch": 2.0213843556555995, "grad_norm": 0.2763902863245182, "learning_rate": 1.8085106382978724e-05, "loss": 0.2824, "step": 1797 }, { "epoch": 2.0225098480585255, "grad_norm": 0.29074430245042243, "learning_rate": 1.80642469753859e-05, "loss": 0.2847, "step": 1798 }, { "epoch": 2.023635340461452, "grad_norm": 0.252760394282513, "learning_rate": 1.8043387567793075e-05, "loss": 0.2729, "step": 1799 }, { "epoch": 2.024760832864378, "grad_norm": 0.25115826895976634, "learning_rate": 1.8022528160200248e-05, "loss": 0.2903, "step": 1800 }, { "epoch": 2.0258863252673045, "grad_norm": 0.31665556656306054, "learning_rate": 1.8001668752607425e-05, "loss": 0.2806, "step": 1801 }, { "epoch": 2.0270118176702305, "grad_norm": 0.27565102328032076, "learning_rate": 1.7980809345014602e-05, "loss": 0.2781, "step": 1802 }, { "epoch": 2.028137310073157, "grad_norm": 0.26334129144996565, "learning_rate": 1.7959949937421776e-05, "loss": 0.2865, "step": 1803 }, { "epoch": 2.0292628024760835, "grad_norm": 0.29084203177119927, "learning_rate": 1.7939090529828953e-05, "loss": 0.2915, "step": 1804 }, { "epoch": 2.0303882948790095, "grad_norm": 0.24821063662817036, "learning_rate": 1.791823112223613e-05, "loss": 0.2784, "step": 1805 }, { "epoch": 2.031513787281936, "grad_norm": 0.2550931735301453, "learning_rate": 1.7897371714643303e-05, "loss": 0.2836, "step": 1806 }, { "epoch": 2.032639279684862, "grad_norm": 0.27634727649104684, "learning_rate": 1.787651230705048e-05, "loss": 0.3069, "step": 1807 }, { "epoch": 2.0337647720877885, "grad_norm": 0.24014034990048097, "learning_rate": 1.7855652899457657e-05, "loss": 0.2858, "step": 1808 }, { "epoch": 2.0348902644907145, "grad_norm": 0.23529224395747875, "learning_rate": 1.783479349186483e-05, "loss": 0.292, "step": 1809 }, { "epoch": 2.036015756893641, "grad_norm": 0.2226918871531934, "learning_rate": 1.7813934084272008e-05, "loss": 0.289, "step": 1810 }, { "epoch": 2.037141249296567, "grad_norm": 0.24875514083553227, "learning_rate": 1.7793074676679185e-05, "loss": 0.2879, "step": 1811 }, { "epoch": 2.0382667416994935, "grad_norm": 0.22101380287283037, "learning_rate": 1.777221526908636e-05, "loss": 0.2785, "step": 1812 }, { "epoch": 2.03939223410242, "grad_norm": 0.24344041835452335, "learning_rate": 1.7751355861493535e-05, "loss": 0.2768, "step": 1813 }, { "epoch": 2.040517726505346, "grad_norm": 0.24709305555007302, "learning_rate": 1.773049645390071e-05, "loss": 0.2785, "step": 1814 }, { "epoch": 2.0416432189082725, "grad_norm": 0.23036508957897686, "learning_rate": 1.7709637046307886e-05, "loss": 0.2829, "step": 1815 }, { "epoch": 2.0427687113111985, "grad_norm": 0.304777104086667, "learning_rate": 1.7688777638715063e-05, "loss": 0.2842, "step": 1816 }, { "epoch": 2.043894203714125, "grad_norm": 0.24563307593084063, "learning_rate": 1.7667918231122237e-05, "loss": 0.2842, "step": 1817 }, { "epoch": 2.045019696117051, "grad_norm": 0.24049156495572827, "learning_rate": 1.7647058823529414e-05, "loss": 0.29, "step": 1818 }, { "epoch": 2.0461451885199775, "grad_norm": 0.26152397114334225, "learning_rate": 1.762619941593659e-05, "loss": 0.2943, "step": 1819 }, { "epoch": 2.047270680922904, "grad_norm": 0.24701566468961217, "learning_rate": 1.7605340008343764e-05, "loss": 0.28, "step": 1820 }, { "epoch": 2.04839617332583, "grad_norm": 0.22113320376779072, "learning_rate": 1.7584480600750938e-05, "loss": 0.2824, "step": 1821 }, { "epoch": 2.0495216657287565, "grad_norm": 0.2498303273769485, "learning_rate": 1.7563621193158115e-05, "loss": 0.2764, "step": 1822 }, { "epoch": 2.0506471581316825, "grad_norm": 0.2613079367123678, "learning_rate": 1.754276178556529e-05, "loss": 0.3029, "step": 1823 }, { "epoch": 2.051772650534609, "grad_norm": 0.2533549657170249, "learning_rate": 1.7521902377972465e-05, "loss": 0.2941, "step": 1824 }, { "epoch": 2.052898142937535, "grad_norm": 0.24525113996522538, "learning_rate": 1.7501042970379642e-05, "loss": 0.2791, "step": 1825 }, { "epoch": 2.0540236353404615, "grad_norm": 0.22636672236346222, "learning_rate": 1.7480183562786816e-05, "loss": 0.2708, "step": 1826 }, { "epoch": 2.0551491277433875, "grad_norm": 0.2318404892918077, "learning_rate": 1.7459324155193993e-05, "loss": 0.2831, "step": 1827 }, { "epoch": 2.056274620146314, "grad_norm": 0.22908482292345286, "learning_rate": 1.743846474760117e-05, "loss": 0.2791, "step": 1828 }, { "epoch": 2.0574001125492405, "grad_norm": 0.23199016490767796, "learning_rate": 1.7417605340008343e-05, "loss": 0.2899, "step": 1829 }, { "epoch": 2.0585256049521665, "grad_norm": 0.22679432927238993, "learning_rate": 1.739674593241552e-05, "loss": 0.2705, "step": 1830 }, { "epoch": 2.059651097355093, "grad_norm": 0.240936280203786, "learning_rate": 1.7375886524822697e-05, "loss": 0.2796, "step": 1831 }, { "epoch": 2.060776589758019, "grad_norm": 0.23052791316981805, "learning_rate": 1.735502711722987e-05, "loss": 0.298, "step": 1832 }, { "epoch": 2.0619020821609455, "grad_norm": 0.22399826316835342, "learning_rate": 1.7334167709637048e-05, "loss": 0.2768, "step": 1833 }, { "epoch": 2.0630275745638715, "grad_norm": 0.24389711598920422, "learning_rate": 1.731330830204422e-05, "loss": 0.2789, "step": 1834 }, { "epoch": 2.064153066966798, "grad_norm": 0.24531794065173357, "learning_rate": 1.72924488944514e-05, "loss": 0.2841, "step": 1835 }, { "epoch": 2.065278559369724, "grad_norm": 0.2857308138585535, "learning_rate": 1.7271589486858576e-05, "loss": 0.2746, "step": 1836 }, { "epoch": 2.0664040517726505, "grad_norm": 0.2331548964731216, "learning_rate": 1.725073007926575e-05, "loss": 0.2779, "step": 1837 }, { "epoch": 2.067529544175577, "grad_norm": 0.23649426513105923, "learning_rate": 1.7229870671672926e-05, "loss": 0.2913, "step": 1838 }, { "epoch": 2.068655036578503, "grad_norm": 0.2777841981435879, "learning_rate": 1.7209011264080103e-05, "loss": 0.2826, "step": 1839 }, { "epoch": 2.0697805289814295, "grad_norm": 0.27066327686914066, "learning_rate": 1.7188151856487277e-05, "loss": 0.2893, "step": 1840 }, { "epoch": 2.0709060213843555, "grad_norm": 0.23134899380353294, "learning_rate": 1.716729244889445e-05, "loss": 0.2804, "step": 1841 }, { "epoch": 2.072031513787282, "grad_norm": 0.29223852513335047, "learning_rate": 1.7146433041301627e-05, "loss": 0.2916, "step": 1842 }, { "epoch": 2.073157006190208, "grad_norm": 0.2735960908953659, "learning_rate": 1.71255736337088e-05, "loss": 0.2852, "step": 1843 }, { "epoch": 2.0742824985931345, "grad_norm": 0.26821528502754455, "learning_rate": 1.7104714226115978e-05, "loss": 0.2891, "step": 1844 }, { "epoch": 2.0754079909960605, "grad_norm": 0.26154260311021144, "learning_rate": 1.7083854818523155e-05, "loss": 0.2875, "step": 1845 }, { "epoch": 2.076533483398987, "grad_norm": 0.31021830521974225, "learning_rate": 1.706299541093033e-05, "loss": 0.2776, "step": 1846 }, { "epoch": 2.0776589758019135, "grad_norm": 0.2788988641156972, "learning_rate": 1.7042136003337505e-05, "loss": 0.2886, "step": 1847 }, { "epoch": 2.0787844682048395, "grad_norm": 0.2907858072020635, "learning_rate": 1.7021276595744682e-05, "loss": 0.2895, "step": 1848 }, { "epoch": 2.079909960607766, "grad_norm": 0.2542410475178318, "learning_rate": 1.7000417188151856e-05, "loss": 0.2856, "step": 1849 }, { "epoch": 2.081035453010692, "grad_norm": 0.24197984345301113, "learning_rate": 1.6979557780559033e-05, "loss": 0.2824, "step": 1850 }, { "epoch": 2.0821609454136185, "grad_norm": 0.2557692899387776, "learning_rate": 1.695869837296621e-05, "loss": 0.2909, "step": 1851 }, { "epoch": 2.0832864378165445, "grad_norm": 0.23793678801447735, "learning_rate": 1.6937838965373384e-05, "loss": 0.2689, "step": 1852 }, { "epoch": 2.084411930219471, "grad_norm": 0.29107842473943085, "learning_rate": 1.691697955778056e-05, "loss": 0.284, "step": 1853 }, { "epoch": 2.0855374226223975, "grad_norm": 0.24607914318213508, "learning_rate": 1.6896120150187734e-05, "loss": 0.2957, "step": 1854 }, { "epoch": 2.0866629150253235, "grad_norm": 0.21651709890692455, "learning_rate": 1.687526074259491e-05, "loss": 0.2677, "step": 1855 }, { "epoch": 2.08778840742825, "grad_norm": 0.22707602957596063, "learning_rate": 1.6854401335002088e-05, "loss": 0.2854, "step": 1856 }, { "epoch": 2.088913899831176, "grad_norm": 0.24846772345755247, "learning_rate": 1.6833541927409262e-05, "loss": 0.2679, "step": 1857 }, { "epoch": 2.0900393922341025, "grad_norm": 0.27573122817807033, "learning_rate": 1.681268251981644e-05, "loss": 0.3007, "step": 1858 }, { "epoch": 2.0911648846370285, "grad_norm": 0.23927598344656173, "learning_rate": 1.6791823112223616e-05, "loss": 0.2754, "step": 1859 }, { "epoch": 2.092290377039955, "grad_norm": 0.23518656387715997, "learning_rate": 1.677096370463079e-05, "loss": 0.2766, "step": 1860 }, { "epoch": 2.093415869442881, "grad_norm": 0.24448942505615562, "learning_rate": 1.6750104297037966e-05, "loss": 0.2913, "step": 1861 }, { "epoch": 2.0945413618458075, "grad_norm": 0.2336572648593039, "learning_rate": 1.672924488944514e-05, "loss": 0.2942, "step": 1862 }, { "epoch": 2.095666854248734, "grad_norm": 0.22716116914003923, "learning_rate": 1.6708385481852313e-05, "loss": 0.2881, "step": 1863 }, { "epoch": 2.09679234665166, "grad_norm": 0.2849566981299875, "learning_rate": 1.668752607425949e-05, "loss": 0.2805, "step": 1864 }, { "epoch": 2.0979178390545865, "grad_norm": 0.21858945358126292, "learning_rate": 1.6666666666666667e-05, "loss": 0.2807, "step": 1865 }, { "epoch": 2.0990433314575125, "grad_norm": 0.23697889851760048, "learning_rate": 1.664580725907384e-05, "loss": 0.2791, "step": 1866 }, { "epoch": 2.100168823860439, "grad_norm": 0.23126363606688877, "learning_rate": 1.6624947851481018e-05, "loss": 0.2878, "step": 1867 }, { "epoch": 2.101294316263365, "grad_norm": 0.2651888938021785, "learning_rate": 1.6604088443888195e-05, "loss": 0.2859, "step": 1868 }, { "epoch": 2.1024198086662915, "grad_norm": 0.23270730320241492, "learning_rate": 1.658322903629537e-05, "loss": 0.296, "step": 1869 }, { "epoch": 2.103545301069218, "grad_norm": 0.3042844752542814, "learning_rate": 1.6562369628702546e-05, "loss": 0.3132, "step": 1870 }, { "epoch": 2.104670793472144, "grad_norm": 0.24339368794150576, "learning_rate": 1.6541510221109723e-05, "loss": 0.2766, "step": 1871 }, { "epoch": 2.1057962858750705, "grad_norm": 0.26285249575918274, "learning_rate": 1.6520650813516896e-05, "loss": 0.285, "step": 1872 }, { "epoch": 2.1069217782779965, "grad_norm": 0.25652733532840694, "learning_rate": 1.6499791405924073e-05, "loss": 0.294, "step": 1873 }, { "epoch": 2.108047270680923, "grad_norm": 0.2543828171392836, "learning_rate": 1.6478931998331247e-05, "loss": 0.2746, "step": 1874 }, { "epoch": 2.109172763083849, "grad_norm": 0.2352973105601401, "learning_rate": 1.6458072590738424e-05, "loss": 0.2743, "step": 1875 }, { "epoch": 2.1102982554867755, "grad_norm": 0.25036103571955426, "learning_rate": 1.64372131831456e-05, "loss": 0.2909, "step": 1876 }, { "epoch": 2.1114237478897016, "grad_norm": 0.30189436640559725, "learning_rate": 1.6416353775552774e-05, "loss": 0.2969, "step": 1877 }, { "epoch": 2.112549240292628, "grad_norm": 0.24769249438614963, "learning_rate": 1.639549436795995e-05, "loss": 0.282, "step": 1878 }, { "epoch": 2.1136747326955545, "grad_norm": 0.25738252269107714, "learning_rate": 1.6374634960367128e-05, "loss": 0.2987, "step": 1879 }, { "epoch": 2.1148002250984805, "grad_norm": 0.3340073534477609, "learning_rate": 1.6353775552774302e-05, "loss": 0.2962, "step": 1880 }, { "epoch": 2.115925717501407, "grad_norm": 0.2641953651128952, "learning_rate": 1.633291614518148e-05, "loss": 0.2827, "step": 1881 }, { "epoch": 2.117051209904333, "grad_norm": 0.24068009657515393, "learning_rate": 1.6312056737588656e-05, "loss": 0.2795, "step": 1882 }, { "epoch": 2.1181767023072595, "grad_norm": 0.2507429416561855, "learning_rate": 1.6291197329995826e-05, "loss": 0.285, "step": 1883 }, { "epoch": 2.1193021947101855, "grad_norm": 0.2533889963610055, "learning_rate": 1.6270337922403003e-05, "loss": 0.2941, "step": 1884 }, { "epoch": 2.120427687113112, "grad_norm": 0.24688775417233455, "learning_rate": 1.624947851481018e-05, "loss": 0.3012, "step": 1885 }, { "epoch": 2.121553179516038, "grad_norm": 0.24915605314972172, "learning_rate": 1.6228619107217354e-05, "loss": 0.2993, "step": 1886 }, { "epoch": 2.1226786719189645, "grad_norm": 0.279903262549959, "learning_rate": 1.620775969962453e-05, "loss": 0.2834, "step": 1887 }, { "epoch": 2.123804164321891, "grad_norm": 0.23182376171306077, "learning_rate": 1.6186900292031708e-05, "loss": 0.2916, "step": 1888 }, { "epoch": 2.124929656724817, "grad_norm": 0.23000197495504152, "learning_rate": 1.616604088443888e-05, "loss": 0.2747, "step": 1889 }, { "epoch": 2.1260551491277435, "grad_norm": 0.23823286526237422, "learning_rate": 1.6145181476846058e-05, "loss": 0.2781, "step": 1890 }, { "epoch": 2.1271806415306695, "grad_norm": 0.21627846743690535, "learning_rate": 1.6124322069253235e-05, "loss": 0.2706, "step": 1891 }, { "epoch": 2.128306133933596, "grad_norm": 0.23892946673013887, "learning_rate": 1.610346266166041e-05, "loss": 0.2914, "step": 1892 }, { "epoch": 2.129431626336522, "grad_norm": 0.2656649096475775, "learning_rate": 1.6082603254067586e-05, "loss": 0.2866, "step": 1893 }, { "epoch": 2.1305571187394485, "grad_norm": 0.2227237824020542, "learning_rate": 1.6061743846474763e-05, "loss": 0.2625, "step": 1894 }, { "epoch": 2.1316826111423746, "grad_norm": 0.23235111483198348, "learning_rate": 1.6040884438881936e-05, "loss": 0.2859, "step": 1895 }, { "epoch": 2.132808103545301, "grad_norm": 0.25920239106869064, "learning_rate": 1.6020025031289113e-05, "loss": 0.2959, "step": 1896 }, { "epoch": 2.1339335959482275, "grad_norm": 0.23719185530213213, "learning_rate": 1.5999165623696287e-05, "loss": 0.2897, "step": 1897 }, { "epoch": 2.1350590883511535, "grad_norm": 0.22876937310915393, "learning_rate": 1.5978306216103464e-05, "loss": 0.2788, "step": 1898 }, { "epoch": 2.13618458075408, "grad_norm": 0.26616238576961354, "learning_rate": 1.595744680851064e-05, "loss": 0.2889, "step": 1899 }, { "epoch": 2.137310073157006, "grad_norm": 0.2166404539813475, "learning_rate": 1.5936587400917814e-05, "loss": 0.282, "step": 1900 }, { "epoch": 2.1384355655599325, "grad_norm": 0.23700101129905038, "learning_rate": 1.591572799332499e-05, "loss": 0.2968, "step": 1901 }, { "epoch": 2.1395610579628586, "grad_norm": 0.2285745331225241, "learning_rate": 1.589486858573217e-05, "loss": 0.2841, "step": 1902 }, { "epoch": 2.140686550365785, "grad_norm": 0.23783838496188303, "learning_rate": 1.5874009178139342e-05, "loss": 0.2909, "step": 1903 }, { "epoch": 2.1418120427687115, "grad_norm": 0.23082103720915573, "learning_rate": 1.5853149770546516e-05, "loss": 0.2824, "step": 1904 }, { "epoch": 2.1429375351716375, "grad_norm": 0.25094828821607146, "learning_rate": 1.5832290362953693e-05, "loss": 0.285, "step": 1905 }, { "epoch": 2.144063027574564, "grad_norm": 0.22431109979899386, "learning_rate": 1.5811430955360866e-05, "loss": 0.2737, "step": 1906 }, { "epoch": 2.14518851997749, "grad_norm": 0.22492379294000237, "learning_rate": 1.5790571547768043e-05, "loss": 0.2726, "step": 1907 }, { "epoch": 2.1463140123804165, "grad_norm": 0.2314053442523269, "learning_rate": 1.576971214017522e-05, "loss": 0.2754, "step": 1908 }, { "epoch": 2.1474395047833426, "grad_norm": 0.24673230264182605, "learning_rate": 1.5748852732582394e-05, "loss": 0.2921, "step": 1909 }, { "epoch": 2.148564997186269, "grad_norm": 0.23606707383444092, "learning_rate": 1.572799332498957e-05, "loss": 0.2804, "step": 1910 }, { "epoch": 2.1496904895891955, "grad_norm": 0.235724127482375, "learning_rate": 1.5707133917396748e-05, "loss": 0.2861, "step": 1911 }, { "epoch": 2.1508159819921215, "grad_norm": 0.24483607505245927, "learning_rate": 1.568627450980392e-05, "loss": 0.2878, "step": 1912 }, { "epoch": 2.151941474395048, "grad_norm": 0.2552535556772291, "learning_rate": 1.56654151022111e-05, "loss": 0.2857, "step": 1913 }, { "epoch": 2.153066966797974, "grad_norm": 0.22983484882907804, "learning_rate": 1.5644555694618275e-05, "loss": 0.2872, "step": 1914 }, { "epoch": 2.1541924592009005, "grad_norm": 0.25772716013553465, "learning_rate": 1.562369628702545e-05, "loss": 0.2959, "step": 1915 }, { "epoch": 2.1553179516038266, "grad_norm": 0.29415752414459184, "learning_rate": 1.5602836879432626e-05, "loss": 0.2909, "step": 1916 }, { "epoch": 2.156443444006753, "grad_norm": 0.2564449243204837, "learning_rate": 1.55819774718398e-05, "loss": 0.3047, "step": 1917 }, { "epoch": 2.157568936409679, "grad_norm": 0.2357261136445965, "learning_rate": 1.5561118064246976e-05, "loss": 0.2891, "step": 1918 }, { "epoch": 2.1586944288126055, "grad_norm": 0.23236268840383198, "learning_rate": 1.5540258656654153e-05, "loss": 0.2936, "step": 1919 }, { "epoch": 2.159819921215532, "grad_norm": 0.2526992651991741, "learning_rate": 1.5519399249061327e-05, "loss": 0.279, "step": 1920 }, { "epoch": 2.160945413618458, "grad_norm": 0.26601612523224494, "learning_rate": 1.5498539841468504e-05, "loss": 0.2805, "step": 1921 }, { "epoch": 2.1620709060213845, "grad_norm": 0.23000298824921414, "learning_rate": 1.547768043387568e-05, "loss": 0.2837, "step": 1922 }, { "epoch": 2.1631963984243106, "grad_norm": 0.24154706706349996, "learning_rate": 1.5456821026282855e-05, "loss": 0.2846, "step": 1923 }, { "epoch": 2.164321890827237, "grad_norm": 0.2360397761989054, "learning_rate": 1.543596161869003e-05, "loss": 0.283, "step": 1924 }, { "epoch": 2.165447383230163, "grad_norm": 0.2218092507800359, "learning_rate": 1.5415102211097205e-05, "loss": 0.2771, "step": 1925 }, { "epoch": 2.1665728756330895, "grad_norm": 0.2386052630849636, "learning_rate": 1.539424280350438e-05, "loss": 0.2924, "step": 1926 }, { "epoch": 2.1676983680360156, "grad_norm": 0.24947895655376598, "learning_rate": 1.5373383395911556e-05, "loss": 0.2925, "step": 1927 }, { "epoch": 2.168823860438942, "grad_norm": 0.2800300772115473, "learning_rate": 1.5352523988318733e-05, "loss": 0.2931, "step": 1928 }, { "epoch": 2.1699493528418685, "grad_norm": 0.22636221415787847, "learning_rate": 1.5331664580725906e-05, "loss": 0.2875, "step": 1929 }, { "epoch": 2.1710748452447945, "grad_norm": 0.24386646248262941, "learning_rate": 1.5310805173133083e-05, "loss": 0.2905, "step": 1930 }, { "epoch": 2.172200337647721, "grad_norm": 0.22365055654311475, "learning_rate": 1.528994576554026e-05, "loss": 0.2802, "step": 1931 }, { "epoch": 2.173325830050647, "grad_norm": 0.25602509803802304, "learning_rate": 1.5269086357947434e-05, "loss": 0.2724, "step": 1932 }, { "epoch": 2.1744513224535735, "grad_norm": 0.20551321646228457, "learning_rate": 1.5248226950354611e-05, "loss": 0.2721, "step": 1933 }, { "epoch": 2.1755768148564996, "grad_norm": 0.22807897179549413, "learning_rate": 1.5227367542761786e-05, "loss": 0.2892, "step": 1934 }, { "epoch": 2.176702307259426, "grad_norm": 0.25852860308404757, "learning_rate": 1.5206508135168961e-05, "loss": 0.3027, "step": 1935 }, { "epoch": 2.177827799662352, "grad_norm": 0.2279934128116949, "learning_rate": 1.5185648727576138e-05, "loss": 0.2801, "step": 1936 }, { "epoch": 2.1789532920652785, "grad_norm": 0.2351411289469844, "learning_rate": 1.5164789319983314e-05, "loss": 0.28, "step": 1937 }, { "epoch": 2.180078784468205, "grad_norm": 0.21552640522315936, "learning_rate": 1.5143929912390489e-05, "loss": 0.2947, "step": 1938 }, { "epoch": 2.181204276871131, "grad_norm": 0.24626801791538866, "learning_rate": 1.5123070504797664e-05, "loss": 0.2783, "step": 1939 }, { "epoch": 2.1823297692740575, "grad_norm": 0.23012350879449098, "learning_rate": 1.5102211097204841e-05, "loss": 0.2774, "step": 1940 }, { "epoch": 2.1834552616769836, "grad_norm": 0.23081070838683507, "learning_rate": 1.5081351689612017e-05, "loss": 0.2799, "step": 1941 }, { "epoch": 2.18458075407991, "grad_norm": 0.2490023091368887, "learning_rate": 1.5060492282019192e-05, "loss": 0.2916, "step": 1942 }, { "epoch": 2.185706246482836, "grad_norm": 0.23226830279863933, "learning_rate": 1.5039632874426369e-05, "loss": 0.274, "step": 1943 }, { "epoch": 2.1868317388857625, "grad_norm": 0.23814945426894574, "learning_rate": 1.5018773466833544e-05, "loss": 0.2987, "step": 1944 }, { "epoch": 2.1879572312886886, "grad_norm": 0.22888208424137457, "learning_rate": 1.4997914059240718e-05, "loss": 0.2809, "step": 1945 }, { "epoch": 2.189082723691615, "grad_norm": 0.22117598909045155, "learning_rate": 1.4977054651647893e-05, "loss": 0.2869, "step": 1946 }, { "epoch": 2.1902082160945415, "grad_norm": 0.2635412507153887, "learning_rate": 1.4956195244055068e-05, "loss": 0.3102, "step": 1947 }, { "epoch": 2.1913337084974676, "grad_norm": 0.21434697577713013, "learning_rate": 1.4935335836462244e-05, "loss": 0.2748, "step": 1948 }, { "epoch": 2.192459200900394, "grad_norm": 0.23605470994586675, "learning_rate": 1.491447642886942e-05, "loss": 0.2859, "step": 1949 }, { "epoch": 2.19358469330332, "grad_norm": 0.2405759189766832, "learning_rate": 1.4893617021276596e-05, "loss": 0.2942, "step": 1950 }, { "epoch": 2.1947101857062465, "grad_norm": 0.22131821842232993, "learning_rate": 1.4872757613683771e-05, "loss": 0.2884, "step": 1951 }, { "epoch": 2.1958356781091726, "grad_norm": 0.23216071326486187, "learning_rate": 1.4851898206090946e-05, "loss": 0.2741, "step": 1952 }, { "epoch": 2.196961170512099, "grad_norm": 0.2261133526570407, "learning_rate": 1.4831038798498123e-05, "loss": 0.2963, "step": 1953 }, { "epoch": 2.1980866629150255, "grad_norm": 0.2302291451269135, "learning_rate": 1.4810179390905299e-05, "loss": 0.2828, "step": 1954 }, { "epoch": 2.1992121553179516, "grad_norm": 0.2535578449757302, "learning_rate": 1.4789319983312474e-05, "loss": 0.3054, "step": 1955 }, { "epoch": 2.200337647720878, "grad_norm": 0.2353316415549731, "learning_rate": 1.4768460575719651e-05, "loss": 0.2851, "step": 1956 }, { "epoch": 2.201463140123804, "grad_norm": 0.22300891391695027, "learning_rate": 1.4747601168126826e-05, "loss": 0.2685, "step": 1957 }, { "epoch": 2.2025886325267305, "grad_norm": 0.24986486980542502, "learning_rate": 1.4726741760534002e-05, "loss": 0.2956, "step": 1958 }, { "epoch": 2.2037141249296566, "grad_norm": 0.2180771271590878, "learning_rate": 1.4705882352941177e-05, "loss": 0.2922, "step": 1959 }, { "epoch": 2.204839617332583, "grad_norm": 0.22680565869396152, "learning_rate": 1.4685022945348354e-05, "loss": 0.2909, "step": 1960 }, { "epoch": 2.205965109735509, "grad_norm": 0.23513680764714112, "learning_rate": 1.4664163537755529e-05, "loss": 0.2786, "step": 1961 }, { "epoch": 2.2070906021384356, "grad_norm": 0.24973876085692792, "learning_rate": 1.4643304130162704e-05, "loss": 0.2853, "step": 1962 }, { "epoch": 2.208216094541362, "grad_norm": 0.22544610054019418, "learning_rate": 1.4622444722569881e-05, "loss": 0.2831, "step": 1963 }, { "epoch": 2.209341586944288, "grad_norm": 0.22330625417293162, "learning_rate": 1.4601585314977057e-05, "loss": 0.2867, "step": 1964 }, { "epoch": 2.2104670793472145, "grad_norm": 0.22525152317015681, "learning_rate": 1.4580725907384232e-05, "loss": 0.29, "step": 1965 }, { "epoch": 2.2115925717501406, "grad_norm": 0.22249714982600474, "learning_rate": 1.4559866499791406e-05, "loss": 0.299, "step": 1966 }, { "epoch": 2.212718064153067, "grad_norm": 0.24092275848280195, "learning_rate": 1.4539007092198581e-05, "loss": 0.2896, "step": 1967 }, { "epoch": 2.213843556555993, "grad_norm": 0.22252299992217103, "learning_rate": 1.4518147684605756e-05, "loss": 0.258, "step": 1968 }, { "epoch": 2.2149690489589196, "grad_norm": 0.23636046190697863, "learning_rate": 1.4497288277012933e-05, "loss": 0.2908, "step": 1969 }, { "epoch": 2.216094541361846, "grad_norm": 0.2596597997389332, "learning_rate": 1.4476428869420108e-05, "loss": 0.27, "step": 1970 }, { "epoch": 2.217220033764772, "grad_norm": 0.2531683961826357, "learning_rate": 1.4455569461827284e-05, "loss": 0.2832, "step": 1971 }, { "epoch": 2.2183455261676985, "grad_norm": 0.2593605264440698, "learning_rate": 1.443471005423446e-05, "loss": 0.2841, "step": 1972 }, { "epoch": 2.2194710185706246, "grad_norm": 0.26699737148304314, "learning_rate": 1.4413850646641636e-05, "loss": 0.2799, "step": 1973 }, { "epoch": 2.220596510973551, "grad_norm": 0.2294951675397686, "learning_rate": 1.4392991239048811e-05, "loss": 0.2909, "step": 1974 }, { "epoch": 2.221722003376477, "grad_norm": 0.2245538365625567, "learning_rate": 1.4372131831455987e-05, "loss": 0.2799, "step": 1975 }, { "epoch": 2.2228474957794035, "grad_norm": 0.2646561800422373, "learning_rate": 1.4351272423863164e-05, "loss": 0.2765, "step": 1976 }, { "epoch": 2.2239729881823296, "grad_norm": 0.21416819505340884, "learning_rate": 1.4330413016270339e-05, "loss": 0.2801, "step": 1977 }, { "epoch": 2.225098480585256, "grad_norm": 0.21948393418095735, "learning_rate": 1.4309553608677514e-05, "loss": 0.2866, "step": 1978 }, { "epoch": 2.2262239729881825, "grad_norm": 0.22822972297920066, "learning_rate": 1.428869420108469e-05, "loss": 0.287, "step": 1979 }, { "epoch": 2.2273494653911086, "grad_norm": 0.2160982046115744, "learning_rate": 1.4267834793491866e-05, "loss": 0.2925, "step": 1980 }, { "epoch": 2.228474957794035, "grad_norm": 0.23144554832269953, "learning_rate": 1.4246975385899042e-05, "loss": 0.3105, "step": 1981 }, { "epoch": 2.229600450196961, "grad_norm": 0.2419669093281673, "learning_rate": 1.4226115978306217e-05, "loss": 0.2795, "step": 1982 }, { "epoch": 2.2307259425998875, "grad_norm": 0.2333075873767841, "learning_rate": 1.4205256570713394e-05, "loss": 0.2879, "step": 1983 }, { "epoch": 2.2318514350028136, "grad_norm": 0.2381375140609149, "learning_rate": 1.418439716312057e-05, "loss": 0.2969, "step": 1984 }, { "epoch": 2.23297692740574, "grad_norm": 0.22837890307254083, "learning_rate": 1.4163537755527745e-05, "loss": 0.283, "step": 1985 }, { "epoch": 2.234102419808666, "grad_norm": 0.24448338514717682, "learning_rate": 1.414267834793492e-05, "loss": 0.3013, "step": 1986 }, { "epoch": 2.2352279122115926, "grad_norm": 0.23420776111487138, "learning_rate": 1.4121818940342093e-05, "loss": 0.2878, "step": 1987 }, { "epoch": 2.236353404614519, "grad_norm": 0.23579942650757943, "learning_rate": 1.4100959532749269e-05, "loss": 0.288, "step": 1988 }, { "epoch": 2.237478897017445, "grad_norm": 0.23043040793992384, "learning_rate": 1.4080100125156446e-05, "loss": 0.305, "step": 1989 }, { "epoch": 2.2386043894203715, "grad_norm": 0.24659768389490117, "learning_rate": 1.4059240717563621e-05, "loss": 0.2778, "step": 1990 }, { "epoch": 2.2397298818232976, "grad_norm": 0.2525101439952681, "learning_rate": 1.4038381309970796e-05, "loss": 0.2797, "step": 1991 }, { "epoch": 2.240855374226224, "grad_norm": 0.2180718742006463, "learning_rate": 1.4017521902377973e-05, "loss": 0.2824, "step": 1992 }, { "epoch": 2.24198086662915, "grad_norm": 0.23039632064460322, "learning_rate": 1.3996662494785149e-05, "loss": 0.2732, "step": 1993 }, { "epoch": 2.2431063590320766, "grad_norm": 0.24390939737808814, "learning_rate": 1.3975803087192324e-05, "loss": 0.2942, "step": 1994 }, { "epoch": 2.2442318514350026, "grad_norm": 0.22495659632157705, "learning_rate": 1.39549436795995e-05, "loss": 0.2786, "step": 1995 }, { "epoch": 2.245357343837929, "grad_norm": 0.2220671184762533, "learning_rate": 1.3934084272006676e-05, "loss": 0.2902, "step": 1996 }, { "epoch": 2.2464828362408555, "grad_norm": 0.2063740174423525, "learning_rate": 1.3913224864413851e-05, "loss": 0.2794, "step": 1997 }, { "epoch": 2.2476083286437816, "grad_norm": 0.22864397206918258, "learning_rate": 1.3892365456821027e-05, "loss": 0.2899, "step": 1998 }, { "epoch": 2.248733821046708, "grad_norm": 0.22641553859678237, "learning_rate": 1.3871506049228202e-05, "loss": 0.2913, "step": 1999 }, { "epoch": 2.249859313449634, "grad_norm": 0.23273883384894037, "learning_rate": 1.3850646641635379e-05, "loss": 0.2896, "step": 2000 }, { "epoch": 2.2509848058525606, "grad_norm": 0.25883856114515486, "learning_rate": 1.3829787234042554e-05, "loss": 0.2853, "step": 2001 }, { "epoch": 2.2521102982554866, "grad_norm": 0.24543011458191846, "learning_rate": 1.380892782644973e-05, "loss": 0.2921, "step": 2002 }, { "epoch": 2.253235790658413, "grad_norm": 0.2573780345794268, "learning_rate": 1.3788068418856907e-05, "loss": 0.2987, "step": 2003 }, { "epoch": 2.254361283061339, "grad_norm": 0.2344713538028616, "learning_rate": 1.3767209011264082e-05, "loss": 0.2788, "step": 2004 }, { "epoch": 2.2554867754642656, "grad_norm": 0.2655728653325266, "learning_rate": 1.3746349603671257e-05, "loss": 0.2891, "step": 2005 }, { "epoch": 2.256612267867192, "grad_norm": 0.24696226562693468, "learning_rate": 1.3725490196078432e-05, "loss": 0.2832, "step": 2006 }, { "epoch": 2.257737760270118, "grad_norm": 0.27074526345994904, "learning_rate": 1.370463078848561e-05, "loss": 0.2991, "step": 2007 }, { "epoch": 2.2588632526730446, "grad_norm": 0.21307871627731073, "learning_rate": 1.3683771380892781e-05, "loss": 0.2802, "step": 2008 }, { "epoch": 2.2599887450759706, "grad_norm": 0.6066306419781285, "learning_rate": 1.3662911973299958e-05, "loss": 0.3119, "step": 2009 }, { "epoch": 2.261114237478897, "grad_norm": 0.2354767198892578, "learning_rate": 1.3642052565707134e-05, "loss": 0.2931, "step": 2010 }, { "epoch": 2.2622397298818235, "grad_norm": 0.3113759768538715, "learning_rate": 1.3621193158114309e-05, "loss": 0.3097, "step": 2011 }, { "epoch": 2.2633652222847496, "grad_norm": 0.23868520954039024, "learning_rate": 1.3600333750521486e-05, "loss": 0.295, "step": 2012 }, { "epoch": 2.264490714687676, "grad_norm": 0.25599462430091524, "learning_rate": 1.3579474342928661e-05, "loss": 0.2934, "step": 2013 }, { "epoch": 2.265616207090602, "grad_norm": 0.2378852312729475, "learning_rate": 1.3558614935335836e-05, "loss": 0.2905, "step": 2014 }, { "epoch": 2.2667416994935286, "grad_norm": 0.23537606322412846, "learning_rate": 1.3537755527743012e-05, "loss": 0.2933, "step": 2015 }, { "epoch": 2.2678671918964546, "grad_norm": 0.24999030117110338, "learning_rate": 1.3516896120150189e-05, "loss": 0.2907, "step": 2016 }, { "epoch": 2.268992684299381, "grad_norm": 0.2189977535068501, "learning_rate": 1.3496036712557364e-05, "loss": 0.2768, "step": 2017 }, { "epoch": 2.270118176702307, "grad_norm": 0.21605143200933585, "learning_rate": 1.347517730496454e-05, "loss": 0.2825, "step": 2018 }, { "epoch": 2.2712436691052336, "grad_norm": 0.2190715820513759, "learning_rate": 1.3454317897371716e-05, "loss": 0.2985, "step": 2019 }, { "epoch": 2.27236916150816, "grad_norm": 0.2419287070815025, "learning_rate": 1.3433458489778892e-05, "loss": 0.2812, "step": 2020 }, { "epoch": 2.273494653911086, "grad_norm": 0.23856366222450073, "learning_rate": 1.3412599082186067e-05, "loss": 0.2754, "step": 2021 }, { "epoch": 2.2746201463140125, "grad_norm": 0.23128552323354076, "learning_rate": 1.3391739674593242e-05, "loss": 0.2967, "step": 2022 }, { "epoch": 2.2757456387169386, "grad_norm": 0.23110186859812204, "learning_rate": 1.3370880267000419e-05, "loss": 0.2905, "step": 2023 }, { "epoch": 2.276871131119865, "grad_norm": 0.23791496512553711, "learning_rate": 1.3350020859407594e-05, "loss": 0.2956, "step": 2024 }, { "epoch": 2.277996623522791, "grad_norm": 0.270895607021542, "learning_rate": 1.332916145181477e-05, "loss": 0.2979, "step": 2025 }, { "epoch": 2.2791221159257176, "grad_norm": 0.2622847660820458, "learning_rate": 1.3308302044221945e-05, "loss": 0.2805, "step": 2026 }, { "epoch": 2.2802476083286436, "grad_norm": 0.2451853343226485, "learning_rate": 1.3287442636629122e-05, "loss": 0.2849, "step": 2027 }, { "epoch": 2.28137310073157, "grad_norm": 0.2181534341062286, "learning_rate": 1.3266583229036297e-05, "loss": 0.2843, "step": 2028 }, { "epoch": 2.2824985931344965, "grad_norm": 0.2350791322319216, "learning_rate": 1.3245723821443471e-05, "loss": 0.2804, "step": 2029 }, { "epoch": 2.2836240855374226, "grad_norm": 0.24384265303411898, "learning_rate": 1.3224864413850646e-05, "loss": 0.2844, "step": 2030 }, { "epoch": 2.284749577940349, "grad_norm": 0.21471389480099612, "learning_rate": 1.3204005006257821e-05, "loss": 0.2766, "step": 2031 }, { "epoch": 2.285875070343275, "grad_norm": 0.2558686689697758, "learning_rate": 1.3183145598664998e-05, "loss": 0.3006, "step": 2032 }, { "epoch": 2.2870005627462016, "grad_norm": 0.24596519958308774, "learning_rate": 1.3162286191072174e-05, "loss": 0.2791, "step": 2033 }, { "epoch": 2.2881260551491276, "grad_norm": 0.22178993068098377, "learning_rate": 1.3141426783479349e-05, "loss": 0.3006, "step": 2034 }, { "epoch": 2.289251547552054, "grad_norm": 0.21211849808178426, "learning_rate": 1.3120567375886524e-05, "loss": 0.2879, "step": 2035 }, { "epoch": 2.29037703995498, "grad_norm": 0.26189329024450775, "learning_rate": 1.3099707968293701e-05, "loss": 0.2919, "step": 2036 }, { "epoch": 2.2915025323579066, "grad_norm": 0.24020801441451947, "learning_rate": 1.3078848560700877e-05, "loss": 0.2936, "step": 2037 }, { "epoch": 2.292628024760833, "grad_norm": 0.2444872387207359, "learning_rate": 1.3057989153108052e-05, "loss": 0.3098, "step": 2038 }, { "epoch": 2.293753517163759, "grad_norm": 0.21895214125433066, "learning_rate": 1.3037129745515229e-05, "loss": 0.2743, "step": 2039 }, { "epoch": 2.2948790095666856, "grad_norm": 0.2496911198777528, "learning_rate": 1.3016270337922404e-05, "loss": 0.2918, "step": 2040 }, { "epoch": 2.2960045019696116, "grad_norm": 2.8468163932596022, "learning_rate": 1.299541093032958e-05, "loss": 0.2845, "step": 2041 }, { "epoch": 2.297129994372538, "grad_norm": 0.2414636263089686, "learning_rate": 1.2974551522736755e-05, "loss": 0.2854, "step": 2042 }, { "epoch": 2.298255486775464, "grad_norm": 0.22863532322963662, "learning_rate": 1.2953692115143932e-05, "loss": 0.2769, "step": 2043 }, { "epoch": 2.2993809791783906, "grad_norm": 0.20797566641270696, "learning_rate": 1.2932832707551107e-05, "loss": 0.2738, "step": 2044 }, { "epoch": 2.3005064715813166, "grad_norm": 0.2813082678198765, "learning_rate": 1.2911973299958282e-05, "loss": 0.2919, "step": 2045 }, { "epoch": 2.301631963984243, "grad_norm": 0.21880645593009593, "learning_rate": 1.2891113892365458e-05, "loss": 0.2845, "step": 2046 }, { "epoch": 2.3027574563871696, "grad_norm": 0.21662277253245404, "learning_rate": 1.2870254484772635e-05, "loss": 0.2999, "step": 2047 }, { "epoch": 2.3038829487900956, "grad_norm": 0.23410910980013766, "learning_rate": 1.284939507717981e-05, "loss": 0.2803, "step": 2048 }, { "epoch": 2.305008441193022, "grad_norm": 0.24807871237848997, "learning_rate": 1.2828535669586985e-05, "loss": 0.2887, "step": 2049 }, { "epoch": 2.306133933595948, "grad_norm": 0.23958912163692958, "learning_rate": 1.2807676261994159e-05, "loss": 0.2806, "step": 2050 }, { "epoch": 2.3072594259988746, "grad_norm": 0.24170572287325667, "learning_rate": 1.2786816854401334e-05, "loss": 0.2911, "step": 2051 }, { "epoch": 2.3083849184018006, "grad_norm": 0.2071987326770734, "learning_rate": 1.2765957446808511e-05, "loss": 0.281, "step": 2052 }, { "epoch": 2.309510410804727, "grad_norm": 0.2685294387603238, "learning_rate": 1.2745098039215686e-05, "loss": 0.2988, "step": 2053 }, { "epoch": 2.310635903207653, "grad_norm": 0.24356419885452857, "learning_rate": 1.2724238631622862e-05, "loss": 0.2918, "step": 2054 }, { "epoch": 2.3117613956105796, "grad_norm": 0.22854669119255341, "learning_rate": 1.2703379224030037e-05, "loss": 0.2906, "step": 2055 }, { "epoch": 2.312886888013506, "grad_norm": 0.24689465925397477, "learning_rate": 1.2682519816437214e-05, "loss": 0.2935, "step": 2056 }, { "epoch": 2.314012380416432, "grad_norm": 0.21811055770500665, "learning_rate": 1.2661660408844389e-05, "loss": 0.3016, "step": 2057 }, { "epoch": 2.3151378728193586, "grad_norm": 0.2493408748518838, "learning_rate": 1.2640801001251564e-05, "loss": 0.2834, "step": 2058 }, { "epoch": 2.3162633652222846, "grad_norm": 0.25721873798899103, "learning_rate": 1.2619941593658741e-05, "loss": 0.2983, "step": 2059 }, { "epoch": 2.317388857625211, "grad_norm": 0.22179194109950803, "learning_rate": 1.2599082186065917e-05, "loss": 0.2753, "step": 2060 }, { "epoch": 2.3185143500281375, "grad_norm": 0.266595262773116, "learning_rate": 1.2578222778473092e-05, "loss": 0.2839, "step": 2061 }, { "epoch": 2.3196398424310636, "grad_norm": 0.24206558428702046, "learning_rate": 1.2557363370880267e-05, "loss": 0.2853, "step": 2062 }, { "epoch": 2.32076533483399, "grad_norm": 0.2454398984492763, "learning_rate": 1.2536503963287444e-05, "loss": 0.3028, "step": 2063 }, { "epoch": 2.321890827236916, "grad_norm": 0.2321058588488482, "learning_rate": 1.251564455569462e-05, "loss": 0.269, "step": 2064 }, { "epoch": 2.3230163196398426, "grad_norm": 0.27267795334721745, "learning_rate": 1.2494785148101793e-05, "loss": 0.2792, "step": 2065 }, { "epoch": 2.3241418120427686, "grad_norm": 0.23234175584418776, "learning_rate": 1.247392574050897e-05, "loss": 0.28, "step": 2066 }, { "epoch": 2.325267304445695, "grad_norm": 0.2063643654191112, "learning_rate": 1.2453066332916145e-05, "loss": 0.2742, "step": 2067 }, { "epoch": 2.326392796848621, "grad_norm": 0.22497515405636748, "learning_rate": 1.243220692532332e-05, "loss": 0.2904, "step": 2068 }, { "epoch": 2.3275182892515476, "grad_norm": 0.20800896572524227, "learning_rate": 1.2411347517730498e-05, "loss": 0.2974, "step": 2069 }, { "epoch": 2.328643781654474, "grad_norm": 0.22460235366838985, "learning_rate": 1.2390488110137673e-05, "loss": 0.2777, "step": 2070 }, { "epoch": 2.3297692740574, "grad_norm": 0.23776076812455357, "learning_rate": 1.2369628702544848e-05, "loss": 0.2829, "step": 2071 }, { "epoch": 2.3308947664603266, "grad_norm": 0.2570845084981786, "learning_rate": 1.2348769294952024e-05, "loss": 0.2945, "step": 2072 }, { "epoch": 2.3320202588632526, "grad_norm": 0.2385004836723248, "learning_rate": 1.23279098873592e-05, "loss": 0.2867, "step": 2073 }, { "epoch": 2.333145751266179, "grad_norm": 0.24982697079123078, "learning_rate": 1.2307050479766376e-05, "loss": 0.2857, "step": 2074 }, { "epoch": 2.334271243669105, "grad_norm": 0.24642888230370272, "learning_rate": 1.2286191072173551e-05, "loss": 0.3053, "step": 2075 }, { "epoch": 2.3353967360720316, "grad_norm": 0.26130363264507717, "learning_rate": 1.2265331664580726e-05, "loss": 0.2916, "step": 2076 }, { "epoch": 2.3365222284749576, "grad_norm": 0.2124033043327759, "learning_rate": 1.2244472256987902e-05, "loss": 0.2764, "step": 2077 }, { "epoch": 2.337647720877884, "grad_norm": 0.2440455128961208, "learning_rate": 1.2223612849395077e-05, "loss": 0.3075, "step": 2078 }, { "epoch": 2.3387732132808106, "grad_norm": 0.245304116279532, "learning_rate": 1.2202753441802254e-05, "loss": 0.2895, "step": 2079 }, { "epoch": 2.3398987056837366, "grad_norm": 0.2372202784047367, "learning_rate": 1.218189403420943e-05, "loss": 0.2785, "step": 2080 }, { "epoch": 2.341024198086663, "grad_norm": 0.23688709955054182, "learning_rate": 1.2161034626616605e-05, "loss": 0.2777, "step": 2081 }, { "epoch": 2.342149690489589, "grad_norm": 0.2482625923726943, "learning_rate": 1.214017521902378e-05, "loss": 0.2833, "step": 2082 }, { "epoch": 2.3432751828925156, "grad_norm": 0.22738968926541633, "learning_rate": 1.2119315811430957e-05, "loss": 0.2709, "step": 2083 }, { "epoch": 2.3444006752954416, "grad_norm": 0.25147592555620085, "learning_rate": 1.2098456403838132e-05, "loss": 0.3008, "step": 2084 }, { "epoch": 2.345526167698368, "grad_norm": 0.22363924741115862, "learning_rate": 1.2077596996245307e-05, "loss": 0.2872, "step": 2085 }, { "epoch": 2.346651660101294, "grad_norm": 0.23011558044098404, "learning_rate": 1.2056737588652483e-05, "loss": 0.2901, "step": 2086 }, { "epoch": 2.3477771525042206, "grad_norm": 0.232959308790496, "learning_rate": 1.2035878181059658e-05, "loss": 0.2859, "step": 2087 }, { "epoch": 2.348902644907147, "grad_norm": 0.24124633231018813, "learning_rate": 1.2015018773466833e-05, "loss": 0.2946, "step": 2088 }, { "epoch": 2.350028137310073, "grad_norm": 0.23315070454396, "learning_rate": 1.199415936587401e-05, "loss": 0.2719, "step": 2089 }, { "epoch": 2.3511536297129996, "grad_norm": 0.21247783763819528, "learning_rate": 1.1973299958281186e-05, "loss": 0.28, "step": 2090 }, { "epoch": 2.3522791221159256, "grad_norm": 0.23387492479149327, "learning_rate": 1.195244055068836e-05, "loss": 0.2792, "step": 2091 }, { "epoch": 2.353404614518852, "grad_norm": 0.22205981665359048, "learning_rate": 1.1931581143095536e-05, "loss": 0.2869, "step": 2092 }, { "epoch": 2.354530106921778, "grad_norm": 0.2407814917985092, "learning_rate": 1.1910721735502713e-05, "loss": 0.2837, "step": 2093 }, { "epoch": 2.3556555993247046, "grad_norm": 0.22636696159410108, "learning_rate": 1.1889862327909888e-05, "loss": 0.2717, "step": 2094 }, { "epoch": 2.3567810917276306, "grad_norm": 0.20010784358214667, "learning_rate": 1.1869002920317064e-05, "loss": 0.2638, "step": 2095 }, { "epoch": 2.357906584130557, "grad_norm": 0.2302624047508185, "learning_rate": 1.184814351272424e-05, "loss": 0.2889, "step": 2096 }, { "epoch": 2.3590320765334836, "grad_norm": 0.21841431009246395, "learning_rate": 1.1827284105131414e-05, "loss": 0.2836, "step": 2097 }, { "epoch": 2.3601575689364096, "grad_norm": 0.21162560341411857, "learning_rate": 1.180642469753859e-05, "loss": 0.2744, "step": 2098 }, { "epoch": 2.361283061339336, "grad_norm": 0.23437617625703946, "learning_rate": 1.1785565289945767e-05, "loss": 0.2897, "step": 2099 }, { "epoch": 2.362408553742262, "grad_norm": 0.2443861444498022, "learning_rate": 1.1764705882352942e-05, "loss": 0.2773, "step": 2100 }, { "epoch": 2.3635340461451886, "grad_norm": 0.20195447071682132, "learning_rate": 1.1743846474760117e-05, "loss": 0.2852, "step": 2101 }, { "epoch": 2.3646595385481146, "grad_norm": 0.22050201926310495, "learning_rate": 1.1722987067167292e-05, "loss": 0.2811, "step": 2102 }, { "epoch": 2.365785030951041, "grad_norm": 0.24700086215612232, "learning_rate": 1.170212765957447e-05, "loss": 0.2944, "step": 2103 }, { "epoch": 2.366910523353967, "grad_norm": 0.21968358349344858, "learning_rate": 1.1681268251981645e-05, "loss": 0.2876, "step": 2104 }, { "epoch": 2.3680360157568936, "grad_norm": 0.22004217949350546, "learning_rate": 1.166040884438882e-05, "loss": 0.298, "step": 2105 }, { "epoch": 2.36916150815982, "grad_norm": 0.25166613532562226, "learning_rate": 1.1639549436795997e-05, "loss": 0.3039, "step": 2106 }, { "epoch": 2.370287000562746, "grad_norm": 0.2470279532584483, "learning_rate": 1.161869002920317e-05, "loss": 0.2905, "step": 2107 }, { "epoch": 2.3714124929656726, "grad_norm": 0.22645956564251657, "learning_rate": 1.1597830621610346e-05, "loss": 0.2857, "step": 2108 }, { "epoch": 2.3725379853685986, "grad_norm": 0.236072091244946, "learning_rate": 1.1576971214017523e-05, "loss": 0.2876, "step": 2109 }, { "epoch": 2.373663477771525, "grad_norm": 0.22344125624637598, "learning_rate": 1.1556111806424698e-05, "loss": 0.2871, "step": 2110 }, { "epoch": 2.3747889701744516, "grad_norm": 0.2740686740181796, "learning_rate": 1.1535252398831873e-05, "loss": 0.3108, "step": 2111 }, { "epoch": 2.3759144625773776, "grad_norm": 0.2633526483901452, "learning_rate": 1.1514392991239049e-05, "loss": 0.2776, "step": 2112 }, { "epoch": 2.377039954980304, "grad_norm": 0.23782573779211985, "learning_rate": 1.1493533583646226e-05, "loss": 0.2882, "step": 2113 }, { "epoch": 2.37816544738323, "grad_norm": 0.20792260803969095, "learning_rate": 1.1472674176053401e-05, "loss": 0.2711, "step": 2114 }, { "epoch": 2.3792909397861566, "grad_norm": 0.27657171997481406, "learning_rate": 1.1451814768460576e-05, "loss": 0.2866, "step": 2115 }, { "epoch": 2.3804164321890826, "grad_norm": 0.23403563768428892, "learning_rate": 1.1430955360867753e-05, "loss": 0.2936, "step": 2116 }, { "epoch": 2.381541924592009, "grad_norm": 0.2268435928373139, "learning_rate": 1.1410095953274927e-05, "loss": 0.2856, "step": 2117 }, { "epoch": 2.382667416994935, "grad_norm": 0.2547588426300782, "learning_rate": 1.1389236545682102e-05, "loss": 0.2825, "step": 2118 }, { "epoch": 2.3837929093978616, "grad_norm": 0.2251742508238687, "learning_rate": 1.1368377138089279e-05, "loss": 0.27, "step": 2119 }, { "epoch": 2.384918401800788, "grad_norm": 0.23999465674072157, "learning_rate": 1.1347517730496454e-05, "loss": 0.3047, "step": 2120 }, { "epoch": 2.386043894203714, "grad_norm": 0.23662614777077606, "learning_rate": 1.132665832290363e-05, "loss": 0.2988, "step": 2121 }, { "epoch": 2.3871693866066406, "grad_norm": 0.2186602406031677, "learning_rate": 1.1305798915310805e-05, "loss": 0.2728, "step": 2122 }, { "epoch": 2.3882948790095666, "grad_norm": 0.23328788430189215, "learning_rate": 1.1284939507717982e-05, "loss": 0.2899, "step": 2123 }, { "epoch": 2.389420371412493, "grad_norm": 0.22192987090662689, "learning_rate": 1.1264080100125157e-05, "loss": 0.2988, "step": 2124 }, { "epoch": 2.390545863815419, "grad_norm": 0.2250119571074059, "learning_rate": 1.1243220692532333e-05, "loss": 0.2956, "step": 2125 }, { "epoch": 2.3916713562183456, "grad_norm": 0.23070732043801884, "learning_rate": 1.122236128493951e-05, "loss": 0.2902, "step": 2126 }, { "epoch": 2.3927968486212716, "grad_norm": 0.22518386141677244, "learning_rate": 1.1201501877346685e-05, "loss": 0.2659, "step": 2127 }, { "epoch": 2.393922341024198, "grad_norm": 0.21584313221933796, "learning_rate": 1.1180642469753858e-05, "loss": 0.288, "step": 2128 }, { "epoch": 2.3950478334271246, "grad_norm": 0.24985482700142989, "learning_rate": 1.1159783062161035e-05, "loss": 0.2874, "step": 2129 }, { "epoch": 2.3961733258300506, "grad_norm": 0.23472182953400522, "learning_rate": 1.113892365456821e-05, "loss": 0.2797, "step": 2130 }, { "epoch": 2.397298818232977, "grad_norm": 0.23998025758676889, "learning_rate": 1.1118064246975386e-05, "loss": 0.2919, "step": 2131 }, { "epoch": 2.398424310635903, "grad_norm": 0.21809256274475072, "learning_rate": 1.1097204839382561e-05, "loss": 0.2758, "step": 2132 }, { "epoch": 2.3995498030388296, "grad_norm": 0.23454882483715764, "learning_rate": 1.1076345431789738e-05, "loss": 0.2952, "step": 2133 }, { "epoch": 2.4006752954417556, "grad_norm": 0.2246557652335286, "learning_rate": 1.1055486024196914e-05, "loss": 0.2981, "step": 2134 }, { "epoch": 2.401800787844682, "grad_norm": 0.25315434629928985, "learning_rate": 1.1034626616604089e-05, "loss": 0.3066, "step": 2135 }, { "epoch": 2.402926280247608, "grad_norm": 0.22665010188162998, "learning_rate": 1.1013767209011266e-05, "loss": 0.2923, "step": 2136 }, { "epoch": 2.4040517726505346, "grad_norm": 0.2501297143991106, "learning_rate": 1.0992907801418441e-05, "loss": 0.2782, "step": 2137 }, { "epoch": 2.405177265053461, "grad_norm": 0.23355470062481642, "learning_rate": 1.0972048393825615e-05, "loss": 0.2958, "step": 2138 }, { "epoch": 2.406302757456387, "grad_norm": 0.22524426706184972, "learning_rate": 1.0951188986232792e-05, "loss": 0.2971, "step": 2139 }, { "epoch": 2.4074282498593136, "grad_norm": 0.2652706460468126, "learning_rate": 1.0930329578639967e-05, "loss": 0.3063, "step": 2140 }, { "epoch": 2.4085537422622396, "grad_norm": 0.27147074305958385, "learning_rate": 1.0909470171047142e-05, "loss": 0.2835, "step": 2141 }, { "epoch": 2.409679234665166, "grad_norm": 0.21263510668944327, "learning_rate": 1.0888610763454318e-05, "loss": 0.2759, "step": 2142 }, { "epoch": 2.410804727068092, "grad_norm": 0.2414280469462777, "learning_rate": 1.0867751355861495e-05, "loss": 0.3011, "step": 2143 }, { "epoch": 2.4119302194710186, "grad_norm": 0.2563072903091181, "learning_rate": 1.084689194826867e-05, "loss": 0.3011, "step": 2144 }, { "epoch": 2.4130557118739446, "grad_norm": 0.26752154229648717, "learning_rate": 1.0826032540675845e-05, "loss": 0.3025, "step": 2145 }, { "epoch": 2.414181204276871, "grad_norm": 0.2191490869328681, "learning_rate": 1.0805173133083022e-05, "loss": 0.2838, "step": 2146 }, { "epoch": 2.4153066966797976, "grad_norm": 0.2754295487570369, "learning_rate": 1.0784313725490197e-05, "loss": 0.2745, "step": 2147 }, { "epoch": 2.4164321890827236, "grad_norm": 0.27496282852437165, "learning_rate": 1.0763454317897373e-05, "loss": 0.2949, "step": 2148 }, { "epoch": 2.41755768148565, "grad_norm": 0.200617793351321, "learning_rate": 1.0742594910304548e-05, "loss": 0.2844, "step": 2149 }, { "epoch": 2.418683173888576, "grad_norm": 0.2261161966006784, "learning_rate": 1.0721735502711723e-05, "loss": 0.3074, "step": 2150 }, { "epoch": 2.4198086662915026, "grad_norm": 0.2721691004576223, "learning_rate": 1.0700876095118899e-05, "loss": 0.2887, "step": 2151 }, { "epoch": 2.4209341586944286, "grad_norm": 0.2468891901297125, "learning_rate": 1.0680016687526074e-05, "loss": 0.2826, "step": 2152 }, { "epoch": 2.422059651097355, "grad_norm": 0.22981129965663172, "learning_rate": 1.065915727993325e-05, "loss": 0.2982, "step": 2153 }, { "epoch": 2.423185143500281, "grad_norm": 0.23778012142265284, "learning_rate": 1.0638297872340426e-05, "loss": 0.2803, "step": 2154 }, { "epoch": 2.4243106359032076, "grad_norm": 0.2502133834738445, "learning_rate": 1.0617438464747601e-05, "loss": 0.2894, "step": 2155 }, { "epoch": 2.425436128306134, "grad_norm": 0.24021651572443242, "learning_rate": 1.0596579057154778e-05, "loss": 0.2792, "step": 2156 }, { "epoch": 2.42656162070906, "grad_norm": 0.21287174233579118, "learning_rate": 1.0575719649561954e-05, "loss": 0.2653, "step": 2157 }, { "epoch": 2.4276871131119866, "grad_norm": 0.2596480129053586, "learning_rate": 1.0554860241969129e-05, "loss": 0.2812, "step": 2158 }, { "epoch": 2.4288126055149126, "grad_norm": 0.22839461536852768, "learning_rate": 1.0534000834376304e-05, "loss": 0.2887, "step": 2159 }, { "epoch": 2.429938097917839, "grad_norm": 0.25082900774514266, "learning_rate": 1.051314142678348e-05, "loss": 0.2848, "step": 2160 }, { "epoch": 2.4310635903207656, "grad_norm": 0.21582263533702323, "learning_rate": 1.0492282019190655e-05, "loss": 0.2848, "step": 2161 }, { "epoch": 2.4321890827236916, "grad_norm": 0.20981491769940364, "learning_rate": 1.047142261159783e-05, "loss": 0.2771, "step": 2162 }, { "epoch": 2.433314575126618, "grad_norm": 0.2478690946929455, "learning_rate": 1.0450563204005007e-05, "loss": 0.2978, "step": 2163 }, { "epoch": 2.434440067529544, "grad_norm": 0.23623868199579823, "learning_rate": 1.0429703796412182e-05, "loss": 0.2868, "step": 2164 }, { "epoch": 2.4355655599324706, "grad_norm": 0.22479499127056093, "learning_rate": 1.0408844388819358e-05, "loss": 0.2864, "step": 2165 }, { "epoch": 2.4366910523353966, "grad_norm": 0.22778228885333549, "learning_rate": 1.0387984981226535e-05, "loss": 0.2703, "step": 2166 }, { "epoch": 2.437816544738323, "grad_norm": 0.24955225194107739, "learning_rate": 1.036712557363371e-05, "loss": 0.2881, "step": 2167 }, { "epoch": 2.438942037141249, "grad_norm": 0.24688296808661256, "learning_rate": 1.0346266166040885e-05, "loss": 0.2892, "step": 2168 }, { "epoch": 2.4400675295441756, "grad_norm": 0.21066675955547629, "learning_rate": 1.032540675844806e-05, "loss": 0.2665, "step": 2169 }, { "epoch": 2.441193021947102, "grad_norm": 0.23152630032898566, "learning_rate": 1.0304547350855236e-05, "loss": 0.2879, "step": 2170 }, { "epoch": 2.442318514350028, "grad_norm": 0.23881733868242846, "learning_rate": 1.0283687943262411e-05, "loss": 0.284, "step": 2171 }, { "epoch": 2.4434440067529546, "grad_norm": 0.24727876228693577, "learning_rate": 1.0262828535669586e-05, "loss": 0.2799, "step": 2172 }, { "epoch": 2.4445694991558806, "grad_norm": 0.2237223246325913, "learning_rate": 1.0241969128076763e-05, "loss": 0.2839, "step": 2173 }, { "epoch": 2.445694991558807, "grad_norm": 0.2255880979174184, "learning_rate": 1.0221109720483939e-05, "loss": 0.3039, "step": 2174 }, { "epoch": 2.446820483961733, "grad_norm": 0.2170555923070572, "learning_rate": 1.0200250312891114e-05, "loss": 0.2768, "step": 2175 }, { "epoch": 2.4479459763646596, "grad_norm": 0.20774037005388524, "learning_rate": 1.0179390905298291e-05, "loss": 0.2834, "step": 2176 }, { "epoch": 2.4490714687675856, "grad_norm": 0.2265465766383895, "learning_rate": 1.0158531497705466e-05, "loss": 0.3058, "step": 2177 }, { "epoch": 2.450196961170512, "grad_norm": 0.20568423154158125, "learning_rate": 1.0137672090112642e-05, "loss": 0.2692, "step": 2178 }, { "epoch": 2.4513224535734386, "grad_norm": 0.22834881557663556, "learning_rate": 1.0116812682519817e-05, "loss": 0.2892, "step": 2179 }, { "epoch": 2.4524479459763646, "grad_norm": 0.22028619420108753, "learning_rate": 1.0095953274926992e-05, "loss": 0.2919, "step": 2180 }, { "epoch": 2.453573438379291, "grad_norm": 0.2425115217082142, "learning_rate": 1.0075093867334167e-05, "loss": 0.2764, "step": 2181 }, { "epoch": 2.454698930782217, "grad_norm": 0.22791631771168733, "learning_rate": 1.0054234459741343e-05, "loss": 0.267, "step": 2182 }, { "epoch": 2.4558244231851436, "grad_norm": 0.1926906774199996, "learning_rate": 1.003337505214852e-05, "loss": 0.2683, "step": 2183 }, { "epoch": 2.4569499155880696, "grad_norm": 0.23290868443818466, "learning_rate": 1.0012515644555695e-05, "loss": 0.2791, "step": 2184 }, { "epoch": 2.458075407990996, "grad_norm": 0.2968317001207595, "learning_rate": 9.99165623696287e-06, "loss": 0.2959, "step": 2185 }, { "epoch": 2.459200900393922, "grad_norm": 0.250149366010179, "learning_rate": 9.970796829370047e-06, "loss": 0.2968, "step": 2186 }, { "epoch": 2.4603263927968486, "grad_norm": 0.23676392349962846, "learning_rate": 9.949937421777223e-06, "loss": 0.2766, "step": 2187 }, { "epoch": 2.461451885199775, "grad_norm": 0.2968220951755795, "learning_rate": 9.929078014184398e-06, "loss": 0.2858, "step": 2188 }, { "epoch": 2.462577377602701, "grad_norm": 0.24526431357390857, "learning_rate": 9.908218606591573e-06, "loss": 0.2776, "step": 2189 }, { "epoch": 2.4637028700056276, "grad_norm": 0.2072075588604563, "learning_rate": 9.887359198998748e-06, "loss": 0.2786, "step": 2190 }, { "epoch": 2.4648283624085536, "grad_norm": 0.24560787407943072, "learning_rate": 9.866499791405924e-06, "loss": 0.3076, "step": 2191 }, { "epoch": 2.46595385481148, "grad_norm": 0.2807855048371902, "learning_rate": 9.845640383813099e-06, "loss": 0.3059, "step": 2192 }, { "epoch": 2.467079347214406, "grad_norm": 0.21339940056568182, "learning_rate": 9.824780976220276e-06, "loss": 0.2835, "step": 2193 }, { "epoch": 2.4682048396173326, "grad_norm": 0.23237003408073176, "learning_rate": 9.803921568627451e-06, "loss": 0.2774, "step": 2194 }, { "epoch": 2.4693303320202586, "grad_norm": 0.22471960654724552, "learning_rate": 9.783062161034627e-06, "loss": 0.3059, "step": 2195 }, { "epoch": 2.470455824423185, "grad_norm": 0.21133212055331363, "learning_rate": 9.762202753441804e-06, "loss": 0.2804, "step": 2196 }, { "epoch": 2.4715813168261116, "grad_norm": 0.22866555875952663, "learning_rate": 9.741343345848979e-06, "loss": 0.2668, "step": 2197 }, { "epoch": 2.4727068092290376, "grad_norm": 0.23022775610838142, "learning_rate": 9.720483938256154e-06, "loss": 0.2941, "step": 2198 }, { "epoch": 2.473832301631964, "grad_norm": 0.24916655338248048, "learning_rate": 9.69962453066333e-06, "loss": 0.2875, "step": 2199 }, { "epoch": 2.47495779403489, "grad_norm": 0.22598137001947038, "learning_rate": 9.678765123070506e-06, "loss": 0.287, "step": 2200 }, { "epoch": 2.4760832864378166, "grad_norm": 0.19562266451832722, "learning_rate": 9.65790571547768e-06, "loss": 0.2726, "step": 2201 }, { "epoch": 2.4772087788407426, "grad_norm": 0.22330052278775112, "learning_rate": 9.637046307884855e-06, "loss": 0.2862, "step": 2202 }, { "epoch": 2.478334271243669, "grad_norm": 0.22895521592496432, "learning_rate": 9.616186900292032e-06, "loss": 0.2858, "step": 2203 }, { "epoch": 2.479459763646595, "grad_norm": 0.22023179481636448, "learning_rate": 9.595327492699208e-06, "loss": 0.2768, "step": 2204 }, { "epoch": 2.4805852560495216, "grad_norm": 0.23642223233900708, "learning_rate": 9.574468085106383e-06, "loss": 0.2878, "step": 2205 }, { "epoch": 2.481710748452448, "grad_norm": 0.2391107708431571, "learning_rate": 9.55360867751356e-06, "loss": 0.2879, "step": 2206 }, { "epoch": 2.482836240855374, "grad_norm": 0.24152975198499732, "learning_rate": 9.532749269920735e-06, "loss": 0.2955, "step": 2207 }, { "epoch": 2.4839617332583006, "grad_norm": 0.23299532148669774, "learning_rate": 9.51188986232791e-06, "loss": 0.2962, "step": 2208 }, { "epoch": 2.4850872256612266, "grad_norm": 0.20896130963456966, "learning_rate": 9.491030454735086e-06, "loss": 0.284, "step": 2209 }, { "epoch": 2.486212718064153, "grad_norm": 0.2105385871507124, "learning_rate": 9.470171047142263e-06, "loss": 0.2729, "step": 2210 }, { "epoch": 2.4873382104670796, "grad_norm": 0.21484947461149867, "learning_rate": 9.449311639549436e-06, "loss": 0.2926, "step": 2211 }, { "epoch": 2.4884637028700056, "grad_norm": 0.2190385482446419, "learning_rate": 9.428452231956612e-06, "loss": 0.2825, "step": 2212 }, { "epoch": 2.489589195272932, "grad_norm": 0.2142534982080354, "learning_rate": 9.407592824363789e-06, "loss": 0.2751, "step": 2213 }, { "epoch": 2.490714687675858, "grad_norm": 0.21708738862041638, "learning_rate": 9.386733416770964e-06, "loss": 0.2786, "step": 2214 }, { "epoch": 2.4918401800787846, "grad_norm": 0.2181940682924344, "learning_rate": 9.365874009178139e-06, "loss": 0.2891, "step": 2215 }, { "epoch": 2.4929656724817106, "grad_norm": 0.24361785849546538, "learning_rate": 9.345014601585316e-06, "loss": 0.2862, "step": 2216 }, { "epoch": 2.494091164884637, "grad_norm": 0.2074874339468701, "learning_rate": 9.324155193992491e-06, "loss": 0.2779, "step": 2217 }, { "epoch": 2.495216657287563, "grad_norm": 0.22232685525965187, "learning_rate": 9.303295786399667e-06, "loss": 0.2872, "step": 2218 }, { "epoch": 2.4963421496904896, "grad_norm": 0.22940288362612324, "learning_rate": 9.282436378806842e-06, "loss": 0.2675, "step": 2219 }, { "epoch": 2.497467642093416, "grad_norm": 0.22467443084840247, "learning_rate": 9.261576971214019e-06, "loss": 0.2801, "step": 2220 }, { "epoch": 2.498593134496342, "grad_norm": 0.23139305058585594, "learning_rate": 9.240717563621194e-06, "loss": 0.2743, "step": 2221 }, { "epoch": 2.4997186268992686, "grad_norm": 0.2360404033010022, "learning_rate": 9.219858156028368e-06, "loss": 0.286, "step": 2222 }, { "epoch": 2.5008441193021946, "grad_norm": 0.20605066820343487, "learning_rate": 9.198998748435545e-06, "loss": 0.2823, "step": 2223 }, { "epoch": 2.501969611705121, "grad_norm": 0.2564735378536905, "learning_rate": 9.17813934084272e-06, "loss": 0.2947, "step": 2224 }, { "epoch": 2.503095104108047, "grad_norm": 0.2320837293470589, "learning_rate": 9.157279933249895e-06, "loss": 0.2768, "step": 2225 }, { "epoch": 2.5042205965109736, "grad_norm": 0.208589920793005, "learning_rate": 9.136420525657072e-06, "loss": 0.29, "step": 2226 }, { "epoch": 2.5053460889138996, "grad_norm": 0.21389293826499295, "learning_rate": 9.115561118064248e-06, "loss": 0.2798, "step": 2227 }, { "epoch": 2.506471581316826, "grad_norm": 0.22046720544274087, "learning_rate": 9.094701710471423e-06, "loss": 0.2937, "step": 2228 }, { "epoch": 2.5075970737197526, "grad_norm": 0.22495729889410385, "learning_rate": 9.073842302878598e-06, "loss": 0.2879, "step": 2229 }, { "epoch": 2.5087225661226786, "grad_norm": 0.20269539252904967, "learning_rate": 9.052982895285775e-06, "loss": 0.2774, "step": 2230 }, { "epoch": 2.509848058525605, "grad_norm": 0.1980840443630393, "learning_rate": 9.03212348769295e-06, "loss": 0.281, "step": 2231 }, { "epoch": 2.510973550928531, "grad_norm": 0.22695316930947035, "learning_rate": 9.011264080100124e-06, "loss": 0.2922, "step": 2232 }, { "epoch": 2.5120990433314576, "grad_norm": 0.20934803359715298, "learning_rate": 8.990404672507301e-06, "loss": 0.2794, "step": 2233 }, { "epoch": 2.5132245357343836, "grad_norm": 0.21112109217582253, "learning_rate": 8.969545264914476e-06, "loss": 0.2907, "step": 2234 }, { "epoch": 2.51435002813731, "grad_norm": 0.22573731590530483, "learning_rate": 8.948685857321652e-06, "loss": 0.291, "step": 2235 }, { "epoch": 2.515475520540236, "grad_norm": 0.23892740697159065, "learning_rate": 8.927826449728829e-06, "loss": 0.273, "step": 2236 }, { "epoch": 2.5166010129431626, "grad_norm": 0.22535585510058634, "learning_rate": 8.906967042136004e-06, "loss": 0.2837, "step": 2237 }, { "epoch": 2.517726505346089, "grad_norm": 0.2194038445722204, "learning_rate": 8.88610763454318e-06, "loss": 0.2927, "step": 2238 }, { "epoch": 2.518851997749015, "grad_norm": 0.22905667951438685, "learning_rate": 8.865248226950355e-06, "loss": 0.2789, "step": 2239 }, { "epoch": 2.5199774901519416, "grad_norm": 0.22625912351056832, "learning_rate": 8.844388819357532e-06, "loss": 0.2755, "step": 2240 }, { "epoch": 2.5211029825548676, "grad_norm": 0.23804689181224994, "learning_rate": 8.823529411764707e-06, "loss": 0.2792, "step": 2241 }, { "epoch": 2.522228474957794, "grad_norm": 0.2105408688549035, "learning_rate": 8.802670004171882e-06, "loss": 0.2972, "step": 2242 }, { "epoch": 2.52335396736072, "grad_norm": 0.22340033958156802, "learning_rate": 8.781810596579057e-06, "loss": 0.2946, "step": 2243 }, { "epoch": 2.5244794597636466, "grad_norm": 0.2297895889368776, "learning_rate": 8.760951188986233e-06, "loss": 0.2938, "step": 2244 }, { "epoch": 2.5256049521665727, "grad_norm": 0.2259147810494066, "learning_rate": 8.740091781393408e-06, "loss": 0.2813, "step": 2245 }, { "epoch": 2.526730444569499, "grad_norm": 0.23547288393006746, "learning_rate": 8.719232373800585e-06, "loss": 0.2994, "step": 2246 }, { "epoch": 2.5278559369724256, "grad_norm": 0.22543446780315715, "learning_rate": 8.69837296620776e-06, "loss": 0.2846, "step": 2247 }, { "epoch": 2.5289814293753516, "grad_norm": 0.2154532957908738, "learning_rate": 8.677513558614936e-06, "loss": 0.2879, "step": 2248 }, { "epoch": 2.530106921778278, "grad_norm": 0.2351801079174597, "learning_rate": 8.65665415102211e-06, "loss": 0.2765, "step": 2249 }, { "epoch": 2.531232414181204, "grad_norm": 0.21366786894791512, "learning_rate": 8.635794743429288e-06, "loss": 0.2734, "step": 2250 }, { "epoch": 2.5323579065841306, "grad_norm": 0.23645349161640047, "learning_rate": 8.614935335836463e-06, "loss": 0.2984, "step": 2251 }, { "epoch": 2.533483398987057, "grad_norm": 0.23434820101602807, "learning_rate": 8.594075928243638e-06, "loss": 0.2968, "step": 2252 }, { "epoch": 2.534608891389983, "grad_norm": 0.23800902126311332, "learning_rate": 8.573216520650814e-06, "loss": 0.2828, "step": 2253 }, { "epoch": 2.535734383792909, "grad_norm": 0.2538132352809376, "learning_rate": 8.552357113057989e-06, "loss": 0.2843, "step": 2254 }, { "epoch": 2.5368598761958356, "grad_norm": 0.21371163966598017, "learning_rate": 8.531497705465164e-06, "loss": 0.2751, "step": 2255 }, { "epoch": 2.537985368598762, "grad_norm": 0.21482253029817228, "learning_rate": 8.510638297872341e-06, "loss": 0.3006, "step": 2256 }, { "epoch": 2.539110861001688, "grad_norm": 0.21834391394392152, "learning_rate": 8.489778890279517e-06, "loss": 0.288, "step": 2257 }, { "epoch": 2.5402363534046146, "grad_norm": 0.2385102842630092, "learning_rate": 8.468919482686692e-06, "loss": 0.2873, "step": 2258 }, { "epoch": 2.5413618458075407, "grad_norm": 0.2496691464287376, "learning_rate": 8.448060075093867e-06, "loss": 0.2868, "step": 2259 }, { "epoch": 2.542487338210467, "grad_norm": 0.2014232955964171, "learning_rate": 8.427200667501044e-06, "loss": 0.2851, "step": 2260 }, { "epoch": 2.5436128306133936, "grad_norm": 0.23384968447549695, "learning_rate": 8.40634125990822e-06, "loss": 0.2842, "step": 2261 }, { "epoch": 2.5447383230163196, "grad_norm": 0.21977668018953103, "learning_rate": 8.385481852315395e-06, "loss": 0.2883, "step": 2262 }, { "epoch": 2.5458638154192457, "grad_norm": 0.21776563647468017, "learning_rate": 8.36462244472257e-06, "loss": 0.2856, "step": 2263 }, { "epoch": 2.546989307822172, "grad_norm": 0.2027944392061715, "learning_rate": 8.343763037129745e-06, "loss": 0.2765, "step": 2264 }, { "epoch": 2.5481148002250986, "grad_norm": 0.21029054091165603, "learning_rate": 8.32290362953692e-06, "loss": 0.2767, "step": 2265 }, { "epoch": 2.5492402926280247, "grad_norm": 0.21418622748856342, "learning_rate": 8.302044221944098e-06, "loss": 0.2948, "step": 2266 }, { "epoch": 2.550365785030951, "grad_norm": 0.21907388139154874, "learning_rate": 8.281184814351273e-06, "loss": 0.2736, "step": 2267 }, { "epoch": 2.551491277433877, "grad_norm": 0.21904845452521604, "learning_rate": 8.260325406758448e-06, "loss": 0.3056, "step": 2268 }, { "epoch": 2.5526167698368036, "grad_norm": 0.20459659904962244, "learning_rate": 8.239465999165623e-06, "loss": 0.2807, "step": 2269 }, { "epoch": 2.55374226223973, "grad_norm": 0.20176624512330674, "learning_rate": 8.2186065915728e-06, "loss": 0.282, "step": 2270 }, { "epoch": 2.554867754642656, "grad_norm": 0.2171053854970344, "learning_rate": 8.197747183979976e-06, "loss": 0.2867, "step": 2271 }, { "epoch": 2.555993247045582, "grad_norm": 0.21608909264471945, "learning_rate": 8.176887776387151e-06, "loss": 0.2843, "step": 2272 }, { "epoch": 2.5571187394485086, "grad_norm": 0.22363745774157207, "learning_rate": 8.156028368794328e-06, "loss": 0.292, "step": 2273 }, { "epoch": 2.558244231851435, "grad_norm": 0.1967157457122503, "learning_rate": 8.135168961201502e-06, "loss": 0.2785, "step": 2274 }, { "epoch": 2.559369724254361, "grad_norm": 0.21025592482731642, "learning_rate": 8.114309553608677e-06, "loss": 0.2758, "step": 2275 }, { "epoch": 2.5604952166572876, "grad_norm": 0.21847518826316134, "learning_rate": 8.093450146015854e-06, "loss": 0.2969, "step": 2276 }, { "epoch": 2.5616207090602137, "grad_norm": 0.22602160924202305, "learning_rate": 8.072590738423029e-06, "loss": 0.2883, "step": 2277 }, { "epoch": 2.56274620146314, "grad_norm": 0.20256712231044452, "learning_rate": 8.051731330830204e-06, "loss": 0.2695, "step": 2278 }, { "epoch": 2.5638716938660666, "grad_norm": 0.20681151204540096, "learning_rate": 8.030871923237381e-06, "loss": 0.2654, "step": 2279 }, { "epoch": 2.5649971862689926, "grad_norm": 0.23344582380587078, "learning_rate": 8.010012515644557e-06, "loss": 0.2986, "step": 2280 }, { "epoch": 2.566122678671919, "grad_norm": 0.22256302367590555, "learning_rate": 7.989153108051732e-06, "loss": 0.2833, "step": 2281 }, { "epoch": 2.567248171074845, "grad_norm": 0.20447744073654678, "learning_rate": 7.968293700458907e-06, "loss": 0.2859, "step": 2282 }, { "epoch": 2.5683736634777716, "grad_norm": 0.20565529180207448, "learning_rate": 7.947434292866084e-06, "loss": 0.2742, "step": 2283 }, { "epoch": 2.5694991558806977, "grad_norm": 0.21066765721313158, "learning_rate": 7.926574885273258e-06, "loss": 0.2944, "step": 2284 }, { "epoch": 2.570624648283624, "grad_norm": 0.21517637060390432, "learning_rate": 7.905715477680433e-06, "loss": 0.2875, "step": 2285 }, { "epoch": 2.57175014068655, "grad_norm": 0.21947956446898098, "learning_rate": 7.88485607008761e-06, "loss": 0.3004, "step": 2286 }, { "epoch": 2.5728756330894766, "grad_norm": 0.22114557622949502, "learning_rate": 7.863996662494785e-06, "loss": 0.2976, "step": 2287 }, { "epoch": 2.574001125492403, "grad_norm": 0.22379469537623312, "learning_rate": 7.84313725490196e-06, "loss": 0.2878, "step": 2288 }, { "epoch": 2.575126617895329, "grad_norm": 0.2071839477449149, "learning_rate": 7.822277847309138e-06, "loss": 0.2795, "step": 2289 }, { "epoch": 2.5762521102982556, "grad_norm": 0.2237931852947739, "learning_rate": 7.801418439716313e-06, "loss": 0.2971, "step": 2290 }, { "epoch": 2.5773776027011817, "grad_norm": 0.21266520141625195, "learning_rate": 7.780559032123488e-06, "loss": 0.2853, "step": 2291 }, { "epoch": 2.578503095104108, "grad_norm": 0.2486160020515366, "learning_rate": 7.759699624530664e-06, "loss": 0.2961, "step": 2292 }, { "epoch": 2.579628587507034, "grad_norm": 0.2068308805691666, "learning_rate": 7.73884021693784e-06, "loss": 0.2691, "step": 2293 }, { "epoch": 2.5807540799099606, "grad_norm": 0.2230851463060974, "learning_rate": 7.717980809345016e-06, "loss": 0.2995, "step": 2294 }, { "epoch": 2.5818795723128867, "grad_norm": 0.2374977031933618, "learning_rate": 7.69712140175219e-06, "loss": 0.283, "step": 2295 }, { "epoch": 2.583005064715813, "grad_norm": 0.24062860705542086, "learning_rate": 7.676261994159366e-06, "loss": 0.2957, "step": 2296 }, { "epoch": 2.5841305571187396, "grad_norm": 0.20537260389777368, "learning_rate": 7.655402586566542e-06, "loss": 0.2921, "step": 2297 }, { "epoch": 2.5852560495216657, "grad_norm": 0.21853998967769137, "learning_rate": 7.634543178973717e-06, "loss": 0.2894, "step": 2298 }, { "epoch": 2.586381541924592, "grad_norm": 0.21880735610653707, "learning_rate": 7.613683771380893e-06, "loss": 0.2791, "step": 2299 }, { "epoch": 2.587507034327518, "grad_norm": 0.22402757654717384, "learning_rate": 7.592824363788069e-06, "loss": 0.289, "step": 2300 }, { "epoch": 2.5886325267304446, "grad_norm": 0.2147892961394563, "learning_rate": 7.5719649561952445e-06, "loss": 0.2812, "step": 2301 }, { "epoch": 2.589758019133371, "grad_norm": 0.22876144329979556, "learning_rate": 7.551105548602421e-06, "loss": 0.2933, "step": 2302 }, { "epoch": 2.590883511536297, "grad_norm": 0.23442708833949216, "learning_rate": 7.530246141009596e-06, "loss": 0.2814, "step": 2303 }, { "epoch": 2.592009003939223, "grad_norm": 0.23484614963727998, "learning_rate": 7.509386733416772e-06, "loss": 0.3144, "step": 2304 }, { "epoch": 2.5931344963421497, "grad_norm": 0.20921707796315442, "learning_rate": 7.4885273258239465e-06, "loss": 0.286, "step": 2305 }, { "epoch": 2.594259988745076, "grad_norm": 0.22775240735379326, "learning_rate": 7.467667918231122e-06, "loss": 0.2788, "step": 2306 }, { "epoch": 2.595385481148002, "grad_norm": 0.22230000059940203, "learning_rate": 7.446808510638298e-06, "loss": 0.2815, "step": 2307 }, { "epoch": 2.5965109735509286, "grad_norm": 0.24545298078462746, "learning_rate": 7.425949103045473e-06, "loss": 0.2735, "step": 2308 }, { "epoch": 2.5976364659538547, "grad_norm": 0.19625632990047406, "learning_rate": 7.405089695452649e-06, "loss": 0.2731, "step": 2309 }, { "epoch": 2.598761958356781, "grad_norm": 0.20900090173879718, "learning_rate": 7.3842302878598255e-06, "loss": 0.2721, "step": 2310 }, { "epoch": 2.5998874507597076, "grad_norm": 0.2123621289927944, "learning_rate": 7.363370880267001e-06, "loss": 0.2698, "step": 2311 }, { "epoch": 2.6010129431626337, "grad_norm": 0.21369756285267333, "learning_rate": 7.342511472674177e-06, "loss": 0.2834, "step": 2312 }, { "epoch": 2.6021384355655597, "grad_norm": 0.22793059698710658, "learning_rate": 7.321652065081352e-06, "loss": 0.2851, "step": 2313 }, { "epoch": 2.603263927968486, "grad_norm": 0.2134184284204459, "learning_rate": 7.300792657488528e-06, "loss": 0.2811, "step": 2314 }, { "epoch": 2.6043894203714126, "grad_norm": 0.21325834093200643, "learning_rate": 7.279933249895703e-06, "loss": 0.2936, "step": 2315 }, { "epoch": 2.6055149127743387, "grad_norm": 0.1991068712411994, "learning_rate": 7.259073842302878e-06, "loss": 0.2834, "step": 2316 }, { "epoch": 2.606640405177265, "grad_norm": 0.22901278666536629, "learning_rate": 7.238214434710054e-06, "loss": 0.3015, "step": 2317 }, { "epoch": 2.607765897580191, "grad_norm": 0.21881134986820416, "learning_rate": 7.21735502711723e-06, "loss": 0.2876, "step": 2318 }, { "epoch": 2.6088913899831176, "grad_norm": 0.22029025156059676, "learning_rate": 7.196495619524406e-06, "loss": 0.2988, "step": 2319 }, { "epoch": 2.610016882386044, "grad_norm": 0.21007112294863065, "learning_rate": 7.175636211931582e-06, "loss": 0.2763, "step": 2320 }, { "epoch": 2.61114237478897, "grad_norm": 0.2126401817051627, "learning_rate": 7.154776804338757e-06, "loss": 0.2805, "step": 2321 }, { "epoch": 2.612267867191896, "grad_norm": 0.20852511391858303, "learning_rate": 7.133917396745933e-06, "loss": 0.2936, "step": 2322 }, { "epoch": 2.6133933595948227, "grad_norm": 0.21781244059761962, "learning_rate": 7.1130579891531085e-06, "loss": 0.2892, "step": 2323 }, { "epoch": 2.614518851997749, "grad_norm": 0.22501438470662116, "learning_rate": 7.092198581560285e-06, "loss": 0.2839, "step": 2324 }, { "epoch": 2.615644344400675, "grad_norm": 0.20568012937631386, "learning_rate": 7.07133917396746e-06, "loss": 0.2927, "step": 2325 }, { "epoch": 2.6167698368036016, "grad_norm": 0.21222804494470973, "learning_rate": 7.050479766374634e-06, "loss": 0.281, "step": 2326 }, { "epoch": 2.6178953292065277, "grad_norm": 0.20938841222313492, "learning_rate": 7.0296203587818105e-06, "loss": 0.2845, "step": 2327 }, { "epoch": 2.619020821609454, "grad_norm": 0.21620523521239354, "learning_rate": 7.008760951188987e-06, "loss": 0.2801, "step": 2328 }, { "epoch": 2.6201463140123806, "grad_norm": 0.2506118426158015, "learning_rate": 6.987901543596162e-06, "loss": 0.2954, "step": 2329 }, { "epoch": 2.6212718064153067, "grad_norm": 0.1973550955823624, "learning_rate": 6.967042136003338e-06, "loss": 0.2686, "step": 2330 }, { "epoch": 2.622397298818233, "grad_norm": 0.2066937107804017, "learning_rate": 6.946182728410513e-06, "loss": 0.2774, "step": 2331 }, { "epoch": 2.623522791221159, "grad_norm": 0.2202250524273311, "learning_rate": 6.9253233208176895e-06, "loss": 0.3068, "step": 2332 }, { "epoch": 2.6246482836240856, "grad_norm": 0.21755861209547087, "learning_rate": 6.904463913224865e-06, "loss": 0.2723, "step": 2333 }, { "epoch": 2.6257737760270117, "grad_norm": 0.21926633058957373, "learning_rate": 6.883604505632041e-06, "loss": 0.2903, "step": 2334 }, { "epoch": 2.626899268429938, "grad_norm": 0.2130377637928427, "learning_rate": 6.862745098039216e-06, "loss": 0.2804, "step": 2335 }, { "epoch": 2.628024760832864, "grad_norm": 0.19225627811370669, "learning_rate": 6.841885690446391e-06, "loss": 0.2887, "step": 2336 }, { "epoch": 2.6291502532357907, "grad_norm": 0.20057254466754687, "learning_rate": 6.821026282853567e-06, "loss": 0.2837, "step": 2337 }, { "epoch": 2.630275745638717, "grad_norm": 0.23011758988414296, "learning_rate": 6.800166875260743e-06, "loss": 0.2931, "step": 2338 }, { "epoch": 2.631401238041643, "grad_norm": 0.23666344897934585, "learning_rate": 6.779307467667918e-06, "loss": 0.2782, "step": 2339 }, { "epoch": 2.6325267304445696, "grad_norm": 0.2147014731643971, "learning_rate": 6.758448060075094e-06, "loss": 0.2912, "step": 2340 }, { "epoch": 2.6336522228474957, "grad_norm": 0.21234094073131748, "learning_rate": 6.73758865248227e-06, "loss": 0.2914, "step": 2341 }, { "epoch": 2.634777715250422, "grad_norm": 0.20060222937545014, "learning_rate": 6.716729244889446e-06, "loss": 0.2766, "step": 2342 }, { "epoch": 2.635903207653348, "grad_norm": 0.21917036852395436, "learning_rate": 6.695869837296621e-06, "loss": 0.3013, "step": 2343 }, { "epoch": 2.6370287000562747, "grad_norm": 0.20864615144591028, "learning_rate": 6.675010429703797e-06, "loss": 0.2792, "step": 2344 }, { "epoch": 2.6381541924592007, "grad_norm": 0.2192912221143167, "learning_rate": 6.6541510221109725e-06, "loss": 0.284, "step": 2345 }, { "epoch": 2.639279684862127, "grad_norm": 0.2039309149630558, "learning_rate": 6.633291614518149e-06, "loss": 0.2948, "step": 2346 }, { "epoch": 2.6404051772650536, "grad_norm": 0.22259191163490286, "learning_rate": 6.612432206925323e-06, "loss": 0.2737, "step": 2347 }, { "epoch": 2.6415306696679797, "grad_norm": 0.2419705952514684, "learning_rate": 6.591572799332499e-06, "loss": 0.2894, "step": 2348 }, { "epoch": 2.642656162070906, "grad_norm": 0.20985587856472956, "learning_rate": 6.5707133917396745e-06, "loss": 0.2947, "step": 2349 }, { "epoch": 2.643781654473832, "grad_norm": 0.20042601124344012, "learning_rate": 6.549853984146851e-06, "loss": 0.271, "step": 2350 }, { "epoch": 2.6449071468767587, "grad_norm": 0.20416712565695233, "learning_rate": 6.528994576554026e-06, "loss": 0.2743, "step": 2351 }, { "epoch": 2.646032639279685, "grad_norm": 0.2184086174145368, "learning_rate": 6.508135168961202e-06, "loss": 0.3015, "step": 2352 }, { "epoch": 2.647158131682611, "grad_norm": 0.24131101578961572, "learning_rate": 6.487275761368377e-06, "loss": 0.2775, "step": 2353 }, { "epoch": 2.648283624085537, "grad_norm": 0.22701304755475593, "learning_rate": 6.4664163537755535e-06, "loss": 0.2857, "step": 2354 }, { "epoch": 2.6494091164884637, "grad_norm": 0.20712379248467747, "learning_rate": 6.445556946182729e-06, "loss": 0.2758, "step": 2355 }, { "epoch": 2.65053460889139, "grad_norm": 0.23341164321770264, "learning_rate": 6.424697538589905e-06, "loss": 0.268, "step": 2356 }, { "epoch": 2.651660101294316, "grad_norm": 0.24310923952152994, "learning_rate": 6.403838130997079e-06, "loss": 0.2833, "step": 2357 }, { "epoch": 2.6527855936972426, "grad_norm": 0.229839416220484, "learning_rate": 6.3829787234042555e-06, "loss": 0.2889, "step": 2358 }, { "epoch": 2.6539110861001687, "grad_norm": 0.23215272147883, "learning_rate": 6.362119315811431e-06, "loss": 0.2975, "step": 2359 }, { "epoch": 2.655036578503095, "grad_norm": 0.24954803338960216, "learning_rate": 6.341259908218607e-06, "loss": 0.2946, "step": 2360 }, { "epoch": 2.6561620709060216, "grad_norm": 0.23026522507576283, "learning_rate": 6.320400500625782e-06, "loss": 0.268, "step": 2361 }, { "epoch": 2.6572875633089477, "grad_norm": 0.23021270773997743, "learning_rate": 6.299541093032958e-06, "loss": 0.261, "step": 2362 }, { "epoch": 2.6584130557118737, "grad_norm": 0.21115861014586346, "learning_rate": 6.278681685440134e-06, "loss": 0.2771, "step": 2363 }, { "epoch": 2.6595385481148, "grad_norm": 0.2405585243153947, "learning_rate": 6.25782227784731e-06, "loss": 0.2806, "step": 2364 }, { "epoch": 2.6606640405177266, "grad_norm": 0.2497609269658003, "learning_rate": 6.236962870254485e-06, "loss": 0.2874, "step": 2365 }, { "epoch": 2.6617895329206527, "grad_norm": 0.22645791008309762, "learning_rate": 6.21610346266166e-06, "loss": 0.3004, "step": 2366 }, { "epoch": 2.662915025323579, "grad_norm": 0.2197914591989606, "learning_rate": 6.1952440550688365e-06, "loss": 0.2794, "step": 2367 }, { "epoch": 2.664040517726505, "grad_norm": 0.22234883908095063, "learning_rate": 6.174384647476012e-06, "loss": 0.3004, "step": 2368 }, { "epoch": 2.6651660101294317, "grad_norm": 0.24165293762861514, "learning_rate": 6.153525239883188e-06, "loss": 0.2822, "step": 2369 }, { "epoch": 2.666291502532358, "grad_norm": 0.2552276571924829, "learning_rate": 6.132665832290363e-06, "loss": 0.2909, "step": 2370 }, { "epoch": 2.667416994935284, "grad_norm": 0.21066092081346655, "learning_rate": 6.1118064246975385e-06, "loss": 0.2798, "step": 2371 }, { "epoch": 2.66854248733821, "grad_norm": 0.2142596843222076, "learning_rate": 6.090947017104715e-06, "loss": 0.2776, "step": 2372 }, { "epoch": 2.6696679797411367, "grad_norm": 0.24551341038876937, "learning_rate": 6.07008760951189e-06, "loss": 0.2865, "step": 2373 }, { "epoch": 2.670793472144063, "grad_norm": 0.2340361094417635, "learning_rate": 6.049228201919066e-06, "loss": 0.2904, "step": 2374 }, { "epoch": 2.671918964546989, "grad_norm": 0.21307302351051388, "learning_rate": 6.028368794326241e-06, "loss": 0.2937, "step": 2375 }, { "epoch": 2.6730444569499157, "grad_norm": 0.2512900946420438, "learning_rate": 6.007509386733417e-06, "loss": 0.2842, "step": 2376 }, { "epoch": 2.6741699493528417, "grad_norm": 0.20979466873445987, "learning_rate": 5.986649979140593e-06, "loss": 0.2931, "step": 2377 }, { "epoch": 2.675295441755768, "grad_norm": 0.21119960362679138, "learning_rate": 5.965790571547768e-06, "loss": 0.2842, "step": 2378 }, { "epoch": 2.6764209341586946, "grad_norm": 0.19883138313973867, "learning_rate": 5.944931163954944e-06, "loss": 0.2771, "step": 2379 }, { "epoch": 2.6775464265616207, "grad_norm": 0.19968752507295803, "learning_rate": 5.92407175636212e-06, "loss": 0.2751, "step": 2380 }, { "epoch": 2.678671918964547, "grad_norm": 0.22015736000540867, "learning_rate": 5.903212348769295e-06, "loss": 0.2799, "step": 2381 }, { "epoch": 2.679797411367473, "grad_norm": 0.21339223053410786, "learning_rate": 5.882352941176471e-06, "loss": 0.2869, "step": 2382 }, { "epoch": 2.6809229037703997, "grad_norm": 0.19762335590197813, "learning_rate": 5.861493533583646e-06, "loss": 0.2829, "step": 2383 }, { "epoch": 2.6820483961733257, "grad_norm": 0.2074037589352283, "learning_rate": 5.840634125990822e-06, "loss": 0.2909, "step": 2384 }, { "epoch": 2.683173888576252, "grad_norm": 0.2117788165142603, "learning_rate": 5.8197747183979985e-06, "loss": 0.2732, "step": 2385 }, { "epoch": 2.684299380979178, "grad_norm": 0.23282240403764579, "learning_rate": 5.798915310805173e-06, "loss": 0.2939, "step": 2386 }, { "epoch": 2.6854248733821047, "grad_norm": 0.21921536525716026, "learning_rate": 5.778055903212349e-06, "loss": 0.2844, "step": 2387 }, { "epoch": 2.686550365785031, "grad_norm": 0.2104234762422923, "learning_rate": 5.757196495619524e-06, "loss": 0.2893, "step": 2388 }, { "epoch": 2.687675858187957, "grad_norm": 0.20466965592113787, "learning_rate": 5.7363370880267005e-06, "loss": 0.2839, "step": 2389 }, { "epoch": 2.6888013505908837, "grad_norm": 0.21641871937130808, "learning_rate": 5.715477680433877e-06, "loss": 0.2813, "step": 2390 }, { "epoch": 2.6899268429938097, "grad_norm": 0.19783335996013016, "learning_rate": 5.694618272841051e-06, "loss": 0.2681, "step": 2391 }, { "epoch": 2.691052335396736, "grad_norm": 0.23211934348643096, "learning_rate": 5.673758865248227e-06, "loss": 0.2737, "step": 2392 }, { "epoch": 2.692177827799662, "grad_norm": 0.21211198195733516, "learning_rate": 5.6528994576554025e-06, "loss": 0.293, "step": 2393 }, { "epoch": 2.6933033202025887, "grad_norm": 0.19763398145044694, "learning_rate": 5.632040050062579e-06, "loss": 0.2669, "step": 2394 }, { "epoch": 2.6944288126055147, "grad_norm": 0.2084177545225768, "learning_rate": 5.611180642469755e-06, "loss": 0.2727, "step": 2395 }, { "epoch": 2.695554305008441, "grad_norm": 0.23874699170463168, "learning_rate": 5.590321234876929e-06, "loss": 0.2871, "step": 2396 }, { "epoch": 2.6966797974113677, "grad_norm": 0.22847881193172173, "learning_rate": 5.569461827284105e-06, "loss": 0.2704, "step": 2397 }, { "epoch": 2.6978052898142937, "grad_norm": 0.21260192114331203, "learning_rate": 5.548602419691281e-06, "loss": 0.2765, "step": 2398 }, { "epoch": 2.69893078221722, "grad_norm": 0.2481199298540293, "learning_rate": 5.527743012098457e-06, "loss": 0.2978, "step": 2399 }, { "epoch": 2.700056274620146, "grad_norm": 0.22837599699280178, "learning_rate": 5.506883604505633e-06, "loss": 0.2778, "step": 2400 }, { "epoch": 2.7011817670230727, "grad_norm": 0.20862397407681496, "learning_rate": 5.486024196912807e-06, "loss": 0.282, "step": 2401 }, { "epoch": 2.702307259425999, "grad_norm": 0.21499220301713443, "learning_rate": 5.4651647893199835e-06, "loss": 0.2946, "step": 2402 }, { "epoch": 2.703432751828925, "grad_norm": 0.21876537448943154, "learning_rate": 5.444305381727159e-06, "loss": 0.2999, "step": 2403 }, { "epoch": 2.704558244231851, "grad_norm": 0.19584715347664308, "learning_rate": 5.423445974134335e-06, "loss": 0.2811, "step": 2404 }, { "epoch": 2.7056837366347777, "grad_norm": 0.199638455026977, "learning_rate": 5.402586566541511e-06, "loss": 0.2922, "step": 2405 }, { "epoch": 2.706809229037704, "grad_norm": 0.1959593267413218, "learning_rate": 5.381727158948686e-06, "loss": 0.2803, "step": 2406 }, { "epoch": 2.70793472144063, "grad_norm": 0.2172940746080715, "learning_rate": 5.360867751355862e-06, "loss": 0.2998, "step": 2407 }, { "epoch": 2.7090602138435567, "grad_norm": 0.21226736116643366, "learning_rate": 5.340008343763037e-06, "loss": 0.2793, "step": 2408 }, { "epoch": 2.7101857062464827, "grad_norm": 0.2292349157751999, "learning_rate": 5.319148936170213e-06, "loss": 0.2925, "step": 2409 }, { "epoch": 2.711311198649409, "grad_norm": 0.2065757335406282, "learning_rate": 5.298289528577389e-06, "loss": 0.2852, "step": 2410 }, { "epoch": 2.7124366910523356, "grad_norm": 0.21318268769213233, "learning_rate": 5.2774301209845645e-06, "loss": 0.3101, "step": 2411 }, { "epoch": 2.7135621834552617, "grad_norm": 0.19490619907298698, "learning_rate": 5.25657071339174e-06, "loss": 0.2845, "step": 2412 }, { "epoch": 2.7146876758581877, "grad_norm": 0.20387752435372739, "learning_rate": 5.235711305798915e-06, "loss": 0.2891, "step": 2413 }, { "epoch": 2.715813168261114, "grad_norm": 0.21901749886212424, "learning_rate": 5.214851898206091e-06, "loss": 0.2829, "step": 2414 }, { "epoch": 2.7169386606640407, "grad_norm": 0.21269033641949117, "learning_rate": 5.193992490613267e-06, "loss": 0.2866, "step": 2415 }, { "epoch": 2.7180641530669667, "grad_norm": 0.204672643305428, "learning_rate": 5.173133083020443e-06, "loss": 0.2928, "step": 2416 }, { "epoch": 2.719189645469893, "grad_norm": 0.22695422804231133, "learning_rate": 5.152273675427618e-06, "loss": 0.3034, "step": 2417 }, { "epoch": 2.720315137872819, "grad_norm": 0.20992539451243944, "learning_rate": 5.131414267834793e-06, "loss": 0.2756, "step": 2418 }, { "epoch": 2.7214406302757457, "grad_norm": 0.23946364092165132, "learning_rate": 5.110554860241969e-06, "loss": 0.2619, "step": 2419 }, { "epoch": 2.722566122678672, "grad_norm": 0.22133441280710264, "learning_rate": 5.0896954526491455e-06, "loss": 0.3024, "step": 2420 }, { "epoch": 2.723691615081598, "grad_norm": 0.19132609073407808, "learning_rate": 5.068836045056321e-06, "loss": 0.2809, "step": 2421 }, { "epoch": 2.724817107484524, "grad_norm": 0.20537504751422384, "learning_rate": 5.047976637463496e-06, "loss": 0.2981, "step": 2422 }, { "epoch": 2.7259425998874507, "grad_norm": 0.2026641684698212, "learning_rate": 5.027117229870671e-06, "loss": 0.2831, "step": 2423 }, { "epoch": 2.727068092290377, "grad_norm": 0.220392207872778, "learning_rate": 5.0062578222778475e-06, "loss": 0.2974, "step": 2424 }, { "epoch": 2.728193584693303, "grad_norm": 0.20374793230025023, "learning_rate": 4.985398414685024e-06, "loss": 0.2843, "step": 2425 }, { "epoch": 2.7293190770962297, "grad_norm": 0.2182187646308083, "learning_rate": 4.964539007092199e-06, "loss": 0.2827, "step": 2426 }, { "epoch": 2.7304445694991557, "grad_norm": 0.20515095934912667, "learning_rate": 4.943679599499374e-06, "loss": 0.2823, "step": 2427 }, { "epoch": 2.731570061902082, "grad_norm": 0.2274911617803538, "learning_rate": 4.9228201919065495e-06, "loss": 0.2874, "step": 2428 }, { "epoch": 2.7326955543050087, "grad_norm": 0.20240468754950888, "learning_rate": 4.901960784313726e-06, "loss": 0.2901, "step": 2429 }, { "epoch": 2.7338210467079347, "grad_norm": 0.21135908550005916, "learning_rate": 4.881101376720902e-06, "loss": 0.2792, "step": 2430 }, { "epoch": 2.734946539110861, "grad_norm": 0.21034155921896855, "learning_rate": 4.860241969128077e-06, "loss": 0.2911, "step": 2431 }, { "epoch": 2.736072031513787, "grad_norm": 0.2073170761162975, "learning_rate": 4.839382561535253e-06, "loss": 0.2768, "step": 2432 }, { "epoch": 2.7371975239167137, "grad_norm": 0.20922141980047607, "learning_rate": 4.818523153942428e-06, "loss": 0.2818, "step": 2433 }, { "epoch": 2.7383230163196397, "grad_norm": 0.20817722346637343, "learning_rate": 4.797663746349604e-06, "loss": 0.2799, "step": 2434 }, { "epoch": 2.739448508722566, "grad_norm": 0.20367161604931538, "learning_rate": 4.77680433875678e-06, "loss": 0.3002, "step": 2435 }, { "epoch": 2.740574001125492, "grad_norm": 0.20301772018260328, "learning_rate": 4.755944931163955e-06, "loss": 0.2734, "step": 2436 }, { "epoch": 2.7416994935284187, "grad_norm": 0.19056871488718577, "learning_rate": 4.735085523571131e-06, "loss": 0.2753, "step": 2437 }, { "epoch": 2.742824985931345, "grad_norm": 0.25569233009986714, "learning_rate": 4.714226115978306e-06, "loss": 0.2991, "step": 2438 }, { "epoch": 2.743950478334271, "grad_norm": 0.20649414202471206, "learning_rate": 4.693366708385482e-06, "loss": 0.2754, "step": 2439 }, { "epoch": 2.7450759707371977, "grad_norm": 0.20526360074859962, "learning_rate": 4.672507300792658e-06, "loss": 0.2846, "step": 2440 }, { "epoch": 2.7462014631401237, "grad_norm": 0.19992362065471733, "learning_rate": 4.651647893199833e-06, "loss": 0.2883, "step": 2441 }, { "epoch": 2.74732695554305, "grad_norm": 0.20760115590523082, "learning_rate": 4.6307884856070095e-06, "loss": 0.2734, "step": 2442 }, { "epoch": 2.748452447945976, "grad_norm": 0.20022172726588305, "learning_rate": 4.609929078014184e-06, "loss": 0.2746, "step": 2443 }, { "epoch": 2.7495779403489027, "grad_norm": 0.21742210771505113, "learning_rate": 4.58906967042136e-06, "loss": 0.2712, "step": 2444 }, { "epoch": 2.7507034327518287, "grad_norm": 0.1894221520336593, "learning_rate": 4.568210262828536e-06, "loss": 0.269, "step": 2445 }, { "epoch": 2.751828925154755, "grad_norm": 0.2014338015112663, "learning_rate": 4.5473508552357115e-06, "loss": 0.2754, "step": 2446 }, { "epoch": 2.7529544175576817, "grad_norm": 0.2767717819280576, "learning_rate": 4.526491447642888e-06, "loss": 0.2704, "step": 2447 }, { "epoch": 2.7540799099606077, "grad_norm": 0.2380476325461093, "learning_rate": 4.505632040050062e-06, "loss": 0.2861, "step": 2448 }, { "epoch": 2.755205402363534, "grad_norm": 0.21164580868587568, "learning_rate": 4.484772632457238e-06, "loss": 0.2768, "step": 2449 }, { "epoch": 2.75633089476646, "grad_norm": 0.2008040936085221, "learning_rate": 4.463913224864414e-06, "loss": 0.275, "step": 2450 }, { "epoch": 2.7574563871693867, "grad_norm": 0.227682709177952, "learning_rate": 4.44305381727159e-06, "loss": 0.3032, "step": 2451 }, { "epoch": 2.758581879572313, "grad_norm": 0.24775815808302365, "learning_rate": 4.422194409678766e-06, "loss": 0.2952, "step": 2452 }, { "epoch": 2.759707371975239, "grad_norm": 0.23280742706720892, "learning_rate": 4.401335002085941e-06, "loss": 0.2773, "step": 2453 }, { "epoch": 2.760832864378165, "grad_norm": 0.19451290711254568, "learning_rate": 4.380475594493116e-06, "loss": 0.2693, "step": 2454 }, { "epoch": 2.7619583567810917, "grad_norm": 0.20939579388836216, "learning_rate": 4.3596161869002925e-06, "loss": 0.2866, "step": 2455 }, { "epoch": 2.763083849184018, "grad_norm": 0.20672270738688484, "learning_rate": 4.338756779307468e-06, "loss": 0.2904, "step": 2456 }, { "epoch": 2.764209341586944, "grad_norm": 0.19033409152598357, "learning_rate": 4.317897371714644e-06, "loss": 0.2718, "step": 2457 }, { "epoch": 2.7653348339898707, "grad_norm": 0.2166789978324445, "learning_rate": 4.297037964121819e-06, "loss": 0.2983, "step": 2458 }, { "epoch": 2.7664603263927967, "grad_norm": 0.216447417269072, "learning_rate": 4.2761785565289945e-06, "loss": 0.2858, "step": 2459 }, { "epoch": 2.767585818795723, "grad_norm": 0.362675866181273, "learning_rate": 4.255319148936171e-06, "loss": 0.2881, "step": 2460 }, { "epoch": 2.7687113111986497, "grad_norm": 0.19521272525143493, "learning_rate": 4.234459741343346e-06, "loss": 0.2724, "step": 2461 }, { "epoch": 2.7698368036015757, "grad_norm": 0.19682316401858674, "learning_rate": 4.213600333750522e-06, "loss": 0.2859, "step": 2462 }, { "epoch": 2.7709622960045017, "grad_norm": 0.21277271670047132, "learning_rate": 4.192740926157697e-06, "loss": 0.2974, "step": 2463 }, { "epoch": 2.772087788407428, "grad_norm": 0.21323098001422092, "learning_rate": 4.171881518564873e-06, "loss": 0.2858, "step": 2464 }, { "epoch": 2.7732132808103547, "grad_norm": 0.2443899119261561, "learning_rate": 4.151022110972049e-06, "loss": 0.2851, "step": 2465 }, { "epoch": 2.7743387732132807, "grad_norm": 0.2139564808006101, "learning_rate": 4.130162703379224e-06, "loss": 0.2949, "step": 2466 }, { "epoch": 2.775464265616207, "grad_norm": 0.2212119000303061, "learning_rate": 4.1093032957864e-06, "loss": 0.2751, "step": 2467 }, { "epoch": 2.776589758019133, "grad_norm": 0.20484020499228098, "learning_rate": 4.0884438881935755e-06, "loss": 0.2771, "step": 2468 }, { "epoch": 2.7777152504220597, "grad_norm": 0.20123952910830462, "learning_rate": 4.067584480600751e-06, "loss": 0.3006, "step": 2469 }, { "epoch": 2.778840742824986, "grad_norm": 0.21577294384729115, "learning_rate": 4.046725073007927e-06, "loss": 0.2996, "step": 2470 }, { "epoch": 2.779966235227912, "grad_norm": 0.22582357217712795, "learning_rate": 4.025865665415102e-06, "loss": 0.2717, "step": 2471 }, { "epoch": 2.7810917276308382, "grad_norm": 0.2134397866045082, "learning_rate": 4.005006257822278e-06, "loss": 0.2825, "step": 2472 }, { "epoch": 2.7822172200337647, "grad_norm": 0.20324964622528435, "learning_rate": 3.984146850229454e-06, "loss": 0.2799, "step": 2473 }, { "epoch": 2.783342712436691, "grad_norm": 0.20795174662693527, "learning_rate": 3.963287442636629e-06, "loss": 0.2854, "step": 2474 }, { "epoch": 2.784468204839617, "grad_norm": 0.1997012004956189, "learning_rate": 3.942428035043805e-06, "loss": 0.2764, "step": 2475 }, { "epoch": 2.7855936972425437, "grad_norm": 0.2096005762129211, "learning_rate": 3.92156862745098e-06, "loss": 0.276, "step": 2476 }, { "epoch": 2.7867191896454697, "grad_norm": 0.2177380985810915, "learning_rate": 3.9007092198581565e-06, "loss": 0.2893, "step": 2477 }, { "epoch": 2.787844682048396, "grad_norm": 0.22482324020736472, "learning_rate": 3.879849812265332e-06, "loss": 0.2657, "step": 2478 }, { "epoch": 2.7889701744513227, "grad_norm": 0.19885926541241514, "learning_rate": 3.858990404672508e-06, "loss": 0.2884, "step": 2479 }, { "epoch": 2.7900956668542487, "grad_norm": 0.19908222649936524, "learning_rate": 3.838130997079683e-06, "loss": 0.2742, "step": 2480 }, { "epoch": 2.791221159257175, "grad_norm": 0.20760067726563736, "learning_rate": 3.8172715894868585e-06, "loss": 0.296, "step": 2481 }, { "epoch": 2.792346651660101, "grad_norm": 0.2533774016177821, "learning_rate": 3.7964121818940346e-06, "loss": 0.2861, "step": 2482 }, { "epoch": 2.7934721440630277, "grad_norm": 0.21052580972405535, "learning_rate": 3.7755527743012103e-06, "loss": 0.288, "step": 2483 }, { "epoch": 2.7945976364659537, "grad_norm": 0.206699855575665, "learning_rate": 3.754693366708386e-06, "loss": 0.288, "step": 2484 }, { "epoch": 2.79572312886888, "grad_norm": 0.19585545352446415, "learning_rate": 3.733833959115561e-06, "loss": 0.2747, "step": 2485 }, { "epoch": 2.7968486212718062, "grad_norm": 0.2062750655598761, "learning_rate": 3.7129745515227366e-06, "loss": 0.275, "step": 2486 }, { "epoch": 2.7979741136747327, "grad_norm": 0.19740523384916275, "learning_rate": 3.6921151439299128e-06, "loss": 0.2762, "step": 2487 }, { "epoch": 2.799099606077659, "grad_norm": 0.23460327350417823, "learning_rate": 3.6712557363370885e-06, "loss": 0.2663, "step": 2488 }, { "epoch": 2.800225098480585, "grad_norm": 0.27745757559360934, "learning_rate": 3.650396328744264e-06, "loss": 0.318, "step": 2489 }, { "epoch": 2.8013505908835117, "grad_norm": 0.20040376636692844, "learning_rate": 3.629536921151439e-06, "loss": 0.2947, "step": 2490 }, { "epoch": 2.8024760832864377, "grad_norm": 0.2065083158200927, "learning_rate": 3.608677513558615e-06, "loss": 0.2856, "step": 2491 }, { "epoch": 2.803601575689364, "grad_norm": 0.2609652931637163, "learning_rate": 3.587818105965791e-06, "loss": 0.2778, "step": 2492 }, { "epoch": 2.80472706809229, "grad_norm": 0.19707420225094988, "learning_rate": 3.5669586983729666e-06, "loss": 0.2847, "step": 2493 }, { "epoch": 2.8058525604952167, "grad_norm": 0.214409640748111, "learning_rate": 3.5460992907801423e-06, "loss": 0.2728, "step": 2494 }, { "epoch": 2.8069780528981427, "grad_norm": 0.21397007514173588, "learning_rate": 3.525239883187317e-06, "loss": 0.2939, "step": 2495 }, { "epoch": 2.808103545301069, "grad_norm": 0.2211780277225568, "learning_rate": 3.5043804755944933e-06, "loss": 0.2941, "step": 2496 }, { "epoch": 2.8092290377039957, "grad_norm": 0.19746449192767923, "learning_rate": 3.483521068001669e-06, "loss": 0.2865, "step": 2497 }, { "epoch": 2.8103545301069217, "grad_norm": 0.24933709829603812, "learning_rate": 3.4626616604088447e-06, "loss": 0.292, "step": 2498 }, { "epoch": 2.811480022509848, "grad_norm": 0.20025420830978435, "learning_rate": 3.4418022528160205e-06, "loss": 0.2936, "step": 2499 }, { "epoch": 2.812605514912774, "grad_norm": 0.21106860467217742, "learning_rate": 3.4209428452231953e-06, "loss": 0.2749, "step": 2500 }, { "epoch": 2.8137310073157007, "grad_norm": 0.244173358210336, "learning_rate": 3.4000834376303715e-06, "loss": 0.2921, "step": 2501 }, { "epoch": 2.814856499718627, "grad_norm": 0.23708488619881102, "learning_rate": 3.379224030037547e-06, "loss": 0.286, "step": 2502 }, { "epoch": 2.815981992121553, "grad_norm": 0.20203389839535874, "learning_rate": 3.358364622444723e-06, "loss": 0.2823, "step": 2503 }, { "epoch": 2.8171074845244792, "grad_norm": 0.20180941230046498, "learning_rate": 3.3375052148518986e-06, "loss": 0.2795, "step": 2504 }, { "epoch": 2.8182329769274057, "grad_norm": 0.1923974204466329, "learning_rate": 3.3166458072590743e-06, "loss": 0.2719, "step": 2505 }, { "epoch": 2.819358469330332, "grad_norm": 0.210276870318239, "learning_rate": 3.2957863996662496e-06, "loss": 0.2831, "step": 2506 }, { "epoch": 2.820483961733258, "grad_norm": 0.21238988385557064, "learning_rate": 3.2749269920734253e-06, "loss": 0.278, "step": 2507 }, { "epoch": 2.8216094541361847, "grad_norm": 0.19338349502478416, "learning_rate": 3.254067584480601e-06, "loss": 0.2717, "step": 2508 }, { "epoch": 2.8227349465391107, "grad_norm": 0.19141784353147265, "learning_rate": 3.2332081768877767e-06, "loss": 0.2687, "step": 2509 }, { "epoch": 2.823860438942037, "grad_norm": 0.21299210269078242, "learning_rate": 3.2123487692949525e-06, "loss": 0.273, "step": 2510 }, { "epoch": 2.8249859313449637, "grad_norm": 0.19645234999837438, "learning_rate": 3.1914893617021277e-06, "loss": 0.274, "step": 2511 }, { "epoch": 2.8261114237478897, "grad_norm": 0.2154736738045129, "learning_rate": 3.1706299541093035e-06, "loss": 0.2798, "step": 2512 }, { "epoch": 2.8272369161508157, "grad_norm": 0.20747340932057978, "learning_rate": 3.149770546516479e-06, "loss": 0.2911, "step": 2513 }, { "epoch": 2.828362408553742, "grad_norm": 0.21668977371189152, "learning_rate": 3.128911138923655e-06, "loss": 0.2809, "step": 2514 }, { "epoch": 2.8294879009566687, "grad_norm": 0.19752342856018376, "learning_rate": 3.10805173133083e-06, "loss": 0.2913, "step": 2515 }, { "epoch": 2.8306133933595947, "grad_norm": 0.2129254469727926, "learning_rate": 3.087192323738006e-06, "loss": 0.2991, "step": 2516 }, { "epoch": 2.831738885762521, "grad_norm": 0.2135496695763609, "learning_rate": 3.0663329161451816e-06, "loss": 0.3121, "step": 2517 }, { "epoch": 2.8328643781654472, "grad_norm": 0.20217705970631564, "learning_rate": 3.0454735085523573e-06, "loss": 0.2699, "step": 2518 }, { "epoch": 2.8339898705683737, "grad_norm": 0.18655748638369685, "learning_rate": 3.024614100959533e-06, "loss": 0.281, "step": 2519 }, { "epoch": 2.8351153629713, "grad_norm": 0.20710373842178917, "learning_rate": 3.0037546933667083e-06, "loss": 0.2818, "step": 2520 }, { "epoch": 2.836240855374226, "grad_norm": 0.1889143314236119, "learning_rate": 2.982895285773884e-06, "loss": 0.2686, "step": 2521 }, { "epoch": 2.8373663477771522, "grad_norm": 0.1916678750234142, "learning_rate": 2.96203587818106e-06, "loss": 0.28, "step": 2522 }, { "epoch": 2.8384918401800787, "grad_norm": 0.19425390855255784, "learning_rate": 2.9411764705882355e-06, "loss": 0.2979, "step": 2523 }, { "epoch": 2.839617332583005, "grad_norm": 0.20681634843217112, "learning_rate": 2.920317062995411e-06, "loss": 0.2871, "step": 2524 }, { "epoch": 2.8407428249859312, "grad_norm": 0.20023152119316995, "learning_rate": 2.8994576554025865e-06, "loss": 0.2801, "step": 2525 }, { "epoch": 2.8418683173888577, "grad_norm": 0.20177659743093546, "learning_rate": 2.878598247809762e-06, "loss": 0.2816, "step": 2526 }, { "epoch": 2.8429938097917837, "grad_norm": 0.19585899941288504, "learning_rate": 2.8577388402169383e-06, "loss": 0.2679, "step": 2527 }, { "epoch": 2.84411930219471, "grad_norm": 0.17859425580387345, "learning_rate": 2.8368794326241136e-06, "loss": 0.2707, "step": 2528 }, { "epoch": 2.8452447945976367, "grad_norm": 0.2039828332143888, "learning_rate": 2.8160200250312893e-06, "loss": 0.2991, "step": 2529 }, { "epoch": 2.8463702870005627, "grad_norm": 0.2030074794985201, "learning_rate": 2.7951606174384646e-06, "loss": 0.2793, "step": 2530 }, { "epoch": 2.847495779403489, "grad_norm": 0.19766777443355418, "learning_rate": 2.7743012098456403e-06, "loss": 0.2823, "step": 2531 }, { "epoch": 2.8486212718064152, "grad_norm": 0.19892663863890306, "learning_rate": 2.7534418022528165e-06, "loss": 0.2806, "step": 2532 }, { "epoch": 2.8497467642093417, "grad_norm": 0.21275315081509835, "learning_rate": 2.7325823946599917e-06, "loss": 0.2701, "step": 2533 }, { "epoch": 2.8508722566122677, "grad_norm": 0.21033439554694824, "learning_rate": 2.7117229870671675e-06, "loss": 0.2854, "step": 2534 }, { "epoch": 2.851997749015194, "grad_norm": 0.19767645497600447, "learning_rate": 2.690863579474343e-06, "loss": 0.288, "step": 2535 }, { "epoch": 2.8531232414181202, "grad_norm": 0.2081071860608737, "learning_rate": 2.6700041718815185e-06, "loss": 0.2853, "step": 2536 }, { "epoch": 2.8542487338210467, "grad_norm": 0.19156924866913874, "learning_rate": 2.6491447642886946e-06, "loss": 0.2768, "step": 2537 }, { "epoch": 2.855374226223973, "grad_norm": 0.21948381030849073, "learning_rate": 2.62828535669587e-06, "loss": 0.2862, "step": 2538 }, { "epoch": 2.856499718626899, "grad_norm": 0.194801788917978, "learning_rate": 2.6074259491030456e-06, "loss": 0.2797, "step": 2539 }, { "epoch": 2.8576252110298257, "grad_norm": 0.22618411821723028, "learning_rate": 2.5865665415102213e-06, "loss": 0.3074, "step": 2540 }, { "epoch": 2.8587507034327517, "grad_norm": 0.20254836873582036, "learning_rate": 2.5657071339173966e-06, "loss": 0.2699, "step": 2541 }, { "epoch": 2.859876195835678, "grad_norm": 0.19524970967882507, "learning_rate": 2.5448477263245727e-06, "loss": 0.2781, "step": 2542 }, { "epoch": 2.8610016882386042, "grad_norm": 0.19523879122973248, "learning_rate": 2.523988318731748e-06, "loss": 0.2901, "step": 2543 }, { "epoch": 2.8621271806415307, "grad_norm": 0.19385408771091103, "learning_rate": 2.5031289111389237e-06, "loss": 0.2861, "step": 2544 }, { "epoch": 2.8632526730444567, "grad_norm": 0.19805715117689787, "learning_rate": 2.4822695035460995e-06, "loss": 0.2692, "step": 2545 }, { "epoch": 2.864378165447383, "grad_norm": 0.19729671382210393, "learning_rate": 2.4614100959532747e-06, "loss": 0.293, "step": 2546 }, { "epoch": 2.8655036578503097, "grad_norm": 0.19120385629590778, "learning_rate": 2.440550688360451e-06, "loss": 0.2809, "step": 2547 }, { "epoch": 2.8666291502532357, "grad_norm": 0.20169331917856845, "learning_rate": 2.4196912807676266e-06, "loss": 0.2865, "step": 2548 }, { "epoch": 2.867754642656162, "grad_norm": 0.20308102680675588, "learning_rate": 2.398831873174802e-06, "loss": 0.292, "step": 2549 }, { "epoch": 2.8688801350590882, "grad_norm": 0.21920991074207272, "learning_rate": 2.3779724655819776e-06, "loss": 0.2769, "step": 2550 }, { "epoch": 2.8700056274620147, "grad_norm": 0.1909570999276725, "learning_rate": 2.357113057989153e-06, "loss": 0.2826, "step": 2551 }, { "epoch": 2.871131119864941, "grad_norm": 0.20820870859741275, "learning_rate": 2.336253650396329e-06, "loss": 0.2811, "step": 2552 }, { "epoch": 2.872256612267867, "grad_norm": 0.19636795603937476, "learning_rate": 2.3153942428035047e-06, "loss": 0.2817, "step": 2553 }, { "epoch": 2.8733821046707932, "grad_norm": 0.2072299777143624, "learning_rate": 2.29453483521068e-06, "loss": 0.2782, "step": 2554 }, { "epoch": 2.8745075970737197, "grad_norm": 0.21169030898396585, "learning_rate": 2.2736754276178557e-06, "loss": 0.2674, "step": 2555 }, { "epoch": 2.875633089476646, "grad_norm": 0.21706530639456728, "learning_rate": 2.252816020025031e-06, "loss": 0.2759, "step": 2556 }, { "epoch": 2.8767585818795722, "grad_norm": 0.2028602319447674, "learning_rate": 2.231956612432207e-06, "loss": 0.2785, "step": 2557 }, { "epoch": 2.8778840742824987, "grad_norm": 0.19144432960054086, "learning_rate": 2.211097204839383e-06, "loss": 0.2833, "step": 2558 }, { "epoch": 2.8790095666854247, "grad_norm": 0.2093653426820825, "learning_rate": 2.190237797246558e-06, "loss": 0.2815, "step": 2559 }, { "epoch": 2.880135059088351, "grad_norm": 0.18440025227997578, "learning_rate": 2.169378389653734e-06, "loss": 0.2786, "step": 2560 }, { "epoch": 2.8812605514912777, "grad_norm": 0.19656732367474106, "learning_rate": 2.1485189820609096e-06, "loss": 0.2803, "step": 2561 }, { "epoch": 2.8823860438942037, "grad_norm": 0.19783394536320123, "learning_rate": 2.1276595744680853e-06, "loss": 0.2812, "step": 2562 }, { "epoch": 2.8835115362971298, "grad_norm": 0.18647997820338216, "learning_rate": 2.106800166875261e-06, "loss": 0.2781, "step": 2563 }, { "epoch": 2.8846370287000562, "grad_norm": 0.19982575823634316, "learning_rate": 2.0859407592824363e-06, "loss": 0.2816, "step": 2564 }, { "epoch": 2.8857625211029827, "grad_norm": 0.195813225255546, "learning_rate": 2.065081351689612e-06, "loss": 0.2788, "step": 2565 }, { "epoch": 2.8868880135059087, "grad_norm": 0.2043720027738115, "learning_rate": 2.0442219440967877e-06, "loss": 0.286, "step": 2566 }, { "epoch": 2.888013505908835, "grad_norm": 0.19093315445014475, "learning_rate": 2.0233625365039634e-06, "loss": 0.2707, "step": 2567 }, { "epoch": 2.8891389983117612, "grad_norm": 0.21027820905380143, "learning_rate": 2.002503128911139e-06, "loss": 0.3025, "step": 2568 }, { "epoch": 2.8902644907146877, "grad_norm": 0.2026225980219709, "learning_rate": 1.9816437213183145e-06, "loss": 0.2853, "step": 2569 }, { "epoch": 2.891389983117614, "grad_norm": 0.1861779080477095, "learning_rate": 1.96078431372549e-06, "loss": 0.2783, "step": 2570 }, { "epoch": 2.8925154755205402, "grad_norm": 0.20932901602980997, "learning_rate": 1.939924906132666e-06, "loss": 0.2885, "step": 2571 }, { "epoch": 2.8936409679234663, "grad_norm": 0.2073789548556899, "learning_rate": 1.9190654985398416e-06, "loss": 0.2801, "step": 2572 }, { "epoch": 2.8947664603263927, "grad_norm": 0.18609584484982244, "learning_rate": 1.8982060909470173e-06, "loss": 0.2744, "step": 2573 }, { "epoch": 2.895891952729319, "grad_norm": 0.1812033482553102, "learning_rate": 1.877346683354193e-06, "loss": 0.2859, "step": 2574 }, { "epoch": 2.8970174451322452, "grad_norm": 0.20994333414010466, "learning_rate": 1.8564872757613683e-06, "loss": 0.2904, "step": 2575 }, { "epoch": 2.8981429375351717, "grad_norm": 0.19743578503548526, "learning_rate": 1.8356278681685442e-06, "loss": 0.2761, "step": 2576 }, { "epoch": 2.8992684299380977, "grad_norm": 0.19166570858524684, "learning_rate": 1.8147684605757195e-06, "loss": 0.2918, "step": 2577 }, { "epoch": 2.9003939223410242, "grad_norm": 0.20463881347239937, "learning_rate": 1.7939090529828954e-06, "loss": 0.2889, "step": 2578 }, { "epoch": 2.9015194147439507, "grad_norm": 0.19047750537612482, "learning_rate": 1.7730496453900712e-06, "loss": 0.2864, "step": 2579 }, { "epoch": 2.9026449071468767, "grad_norm": 0.19091573754700542, "learning_rate": 1.7521902377972467e-06, "loss": 0.2927, "step": 2580 }, { "epoch": 2.903770399549803, "grad_norm": 0.1876198954341774, "learning_rate": 1.7313308302044224e-06, "loss": 0.268, "step": 2581 }, { "epoch": 2.9048958919527292, "grad_norm": 0.20218741071231236, "learning_rate": 1.7104714226115977e-06, "loss": 0.2818, "step": 2582 }, { "epoch": 2.9060213843556557, "grad_norm": 0.19751340574083814, "learning_rate": 1.6896120150187736e-06, "loss": 0.289, "step": 2583 }, { "epoch": 2.9071468767585817, "grad_norm": 0.1929293500822755, "learning_rate": 1.6687526074259493e-06, "loss": 0.2701, "step": 2584 }, { "epoch": 2.908272369161508, "grad_norm": 0.19409315370373764, "learning_rate": 1.6478931998331248e-06, "loss": 0.294, "step": 2585 }, { "epoch": 2.9093978615644343, "grad_norm": 0.19513663853793442, "learning_rate": 1.6270337922403005e-06, "loss": 0.2909, "step": 2586 }, { "epoch": 2.9105233539673607, "grad_norm": 0.18520192324031312, "learning_rate": 1.6061743846474762e-06, "loss": 0.2879, "step": 2587 }, { "epoch": 2.911648846370287, "grad_norm": 0.19393576190790643, "learning_rate": 1.5853149770546517e-06, "loss": 0.2818, "step": 2588 }, { "epoch": 2.9127743387732132, "grad_norm": 0.1913327489426411, "learning_rate": 1.5644555694618274e-06, "loss": 0.2845, "step": 2589 }, { "epoch": 2.9138998311761397, "grad_norm": 0.19600196772862405, "learning_rate": 1.543596161869003e-06, "loss": 0.2877, "step": 2590 }, { "epoch": 2.9150253235790657, "grad_norm": 0.2003116343512408, "learning_rate": 1.5227367542761787e-06, "loss": 0.2761, "step": 2591 }, { "epoch": 2.916150815981992, "grad_norm": 0.18645182971152968, "learning_rate": 1.5018773466833542e-06, "loss": 0.2851, "step": 2592 }, { "epoch": 2.9172763083849182, "grad_norm": 0.20894705309538278, "learning_rate": 1.48101793909053e-06, "loss": 0.2903, "step": 2593 }, { "epoch": 2.9184018007878447, "grad_norm": 0.19120582673378814, "learning_rate": 1.4601585314977056e-06, "loss": 0.2776, "step": 2594 }, { "epoch": 2.9195272931907708, "grad_norm": 0.20278259638182897, "learning_rate": 1.439299123904881e-06, "loss": 0.2787, "step": 2595 }, { "epoch": 2.9206527855936972, "grad_norm": 0.19583937073430013, "learning_rate": 1.4184397163120568e-06, "loss": 0.2811, "step": 2596 }, { "epoch": 2.9217782779966237, "grad_norm": 0.1941542530021111, "learning_rate": 1.3975803087192323e-06, "loss": 0.2658, "step": 2597 }, { "epoch": 2.9229037703995497, "grad_norm": 0.20963262803457552, "learning_rate": 1.3767209011264082e-06, "loss": 0.278, "step": 2598 }, { "epoch": 2.924029262802476, "grad_norm": 0.2018274661438912, "learning_rate": 1.3558614935335837e-06, "loss": 0.294, "step": 2599 }, { "epoch": 2.9251547552054022, "grad_norm": 0.19436944269264386, "learning_rate": 1.3350020859407592e-06, "loss": 0.2834, "step": 2600 }, { "epoch": 2.9262802476083287, "grad_norm": 0.1878287577440724, "learning_rate": 1.314142678347935e-06, "loss": 0.2765, "step": 2601 }, { "epoch": 2.927405740011255, "grad_norm": 0.18341006129215123, "learning_rate": 1.2932832707551107e-06, "loss": 0.2746, "step": 2602 }, { "epoch": 2.9285312324141812, "grad_norm": 0.19735771783507766, "learning_rate": 1.2724238631622864e-06, "loss": 0.2913, "step": 2603 }, { "epoch": 2.9296567248171073, "grad_norm": 0.18771599689886934, "learning_rate": 1.2515644555694619e-06, "loss": 0.2753, "step": 2604 }, { "epoch": 2.9307822172200337, "grad_norm": 0.19841768486183753, "learning_rate": 1.2307050479766374e-06, "loss": 0.2814, "step": 2605 }, { "epoch": 2.93190770962296, "grad_norm": 0.1956614956245663, "learning_rate": 1.2098456403838133e-06, "loss": 0.2743, "step": 2606 }, { "epoch": 2.9330332020258862, "grad_norm": 0.2002743148871214, "learning_rate": 1.1889862327909888e-06, "loss": 0.2888, "step": 2607 }, { "epoch": 2.9341586944288127, "grad_norm": 0.21318426731074547, "learning_rate": 1.1681268251981645e-06, "loss": 0.2959, "step": 2608 }, { "epoch": 2.9352841868317388, "grad_norm": 0.18809272462436055, "learning_rate": 1.14726741760534e-06, "loss": 0.277, "step": 2609 }, { "epoch": 2.9364096792346652, "grad_norm": 0.19427439279930914, "learning_rate": 1.1264080100125155e-06, "loss": 0.2927, "step": 2610 }, { "epoch": 2.9375351716375917, "grad_norm": 0.2079310357704345, "learning_rate": 1.1055486024196914e-06, "loss": 0.2828, "step": 2611 }, { "epoch": 2.9386606640405177, "grad_norm": 0.19416657363268003, "learning_rate": 1.084689194826867e-06, "loss": 0.2911, "step": 2612 }, { "epoch": 2.9397861564434438, "grad_norm": 0.19916119078493613, "learning_rate": 1.0638297872340427e-06, "loss": 0.2924, "step": 2613 }, { "epoch": 2.9409116488463702, "grad_norm": 0.1983245462408925, "learning_rate": 1.0429703796412182e-06, "loss": 0.2708, "step": 2614 }, { "epoch": 2.9420371412492967, "grad_norm": 0.18590780784131763, "learning_rate": 1.0221109720483939e-06, "loss": 0.2863, "step": 2615 }, { "epoch": 2.9431626336522227, "grad_norm": 0.1872997802264514, "learning_rate": 1.0012515644555696e-06, "loss": 0.2811, "step": 2616 }, { "epoch": 2.9442881260551492, "grad_norm": 0.178184423835207, "learning_rate": 9.80392156862745e-07, "loss": 0.2731, "step": 2617 }, { "epoch": 2.9454136184580753, "grad_norm": 0.20370420485130178, "learning_rate": 9.595327492699208e-07, "loss": 0.298, "step": 2618 }, { "epoch": 2.9465391108610017, "grad_norm": 0.19363752795605113, "learning_rate": 9.386733416770965e-07, "loss": 0.2862, "step": 2619 }, { "epoch": 2.947664603263928, "grad_norm": 0.19337040777721937, "learning_rate": 9.178139340842721e-07, "loss": 0.2938, "step": 2620 }, { "epoch": 2.9487900956668542, "grad_norm": 0.20062145944273124, "learning_rate": 8.969545264914477e-07, "loss": 0.2854, "step": 2621 }, { "epoch": 2.9499155880697803, "grad_norm": 0.19780261743838537, "learning_rate": 8.760951188986233e-07, "loss": 0.2853, "step": 2622 }, { "epoch": 2.9510410804727067, "grad_norm": 0.18300708396430374, "learning_rate": 8.552357113057988e-07, "loss": 0.2859, "step": 2623 }, { "epoch": 2.952166572875633, "grad_norm": 0.19659081949531576, "learning_rate": 8.343763037129747e-07, "loss": 0.2873, "step": 2624 }, { "epoch": 2.9532920652785593, "grad_norm": 0.18879456754476104, "learning_rate": 8.135168961201503e-07, "loss": 0.3059, "step": 2625 }, { "epoch": 2.9544175576814857, "grad_norm": 0.1970187549688716, "learning_rate": 7.926574885273259e-07, "loss": 0.2751, "step": 2626 }, { "epoch": 2.9555430500844118, "grad_norm": 0.18765481178184712, "learning_rate": 7.717980809345015e-07, "loss": 0.2828, "step": 2627 }, { "epoch": 2.9566685424873382, "grad_norm": 0.18024019057588778, "learning_rate": 7.509386733416771e-07, "loss": 0.2796, "step": 2628 }, { "epoch": 2.9577940348902647, "grad_norm": 0.1911404613455637, "learning_rate": 7.300792657488528e-07, "loss": 0.2857, "step": 2629 }, { "epoch": 2.9589195272931907, "grad_norm": 0.19138951788504163, "learning_rate": 7.092198581560284e-07, "loss": 0.2792, "step": 2630 }, { "epoch": 2.9600450196961168, "grad_norm": 0.1910935556506434, "learning_rate": 6.883604505632041e-07, "loss": 0.2845, "step": 2631 }, { "epoch": 2.9611705120990433, "grad_norm": 0.19203721334554208, "learning_rate": 6.675010429703796e-07, "loss": 0.291, "step": 2632 }, { "epoch": 2.9622960045019697, "grad_norm": 0.1928645139619539, "learning_rate": 6.466416353775553e-07, "loss": 0.2929, "step": 2633 }, { "epoch": 2.9634214969048958, "grad_norm": 0.19918761206527566, "learning_rate": 6.257822277847309e-07, "loss": 0.3019, "step": 2634 }, { "epoch": 2.9645469893078222, "grad_norm": 0.1854244511600504, "learning_rate": 6.049228201919066e-07, "loss": 0.2762, "step": 2635 }, { "epoch": 2.9656724817107483, "grad_norm": 0.1790914058015419, "learning_rate": 5.840634125990823e-07, "loss": 0.2775, "step": 2636 }, { "epoch": 2.9667979741136747, "grad_norm": 0.18939335360021642, "learning_rate": 5.632040050062578e-07, "loss": 0.2868, "step": 2637 }, { "epoch": 2.967923466516601, "grad_norm": 0.18950602005484965, "learning_rate": 5.423445974134335e-07, "loss": 0.2806, "step": 2638 }, { "epoch": 2.9690489589195272, "grad_norm": 0.2057097341756207, "learning_rate": 5.214851898206091e-07, "loss": 0.2959, "step": 2639 }, { "epoch": 2.9701744513224537, "grad_norm": 0.18914510377229687, "learning_rate": 5.006257822277848e-07, "loss": 0.2813, "step": 2640 }, { "epoch": 2.9712999437253798, "grad_norm": 0.21148991319548016, "learning_rate": 4.797663746349604e-07, "loss": 0.2921, "step": 2641 }, { "epoch": 2.9724254361283062, "grad_norm": 0.19281924718786836, "learning_rate": 4.5890696704213606e-07, "loss": 0.2855, "step": 2642 }, { "epoch": 2.9735509285312323, "grad_norm": 0.18955123443150448, "learning_rate": 4.3804755944931167e-07, "loss": 0.2959, "step": 2643 }, { "epoch": 2.9746764209341587, "grad_norm": 0.18330368999509078, "learning_rate": 4.171881518564873e-07, "loss": 0.2867, "step": 2644 }, { "epoch": 2.9758019133370848, "grad_norm": 0.19150353774187667, "learning_rate": 3.9632874426366293e-07, "loss": 0.2807, "step": 2645 }, { "epoch": 2.9769274057400112, "grad_norm": 0.18078724381796288, "learning_rate": 3.7546933667083854e-07, "loss": 0.2659, "step": 2646 }, { "epoch": 2.9780528981429377, "grad_norm": 0.1956344780829508, "learning_rate": 3.546099290780142e-07, "loss": 0.2836, "step": 2647 }, { "epoch": 2.9791783905458638, "grad_norm": 0.18154947344603503, "learning_rate": 3.337505214851898e-07, "loss": 0.2691, "step": 2648 }, { "epoch": 2.9803038829487902, "grad_norm": 0.19970667773834722, "learning_rate": 3.1289111389236547e-07, "loss": 0.2939, "step": 2649 }, { "epoch": 2.9814293753517163, "grad_norm": 0.1970797047209464, "learning_rate": 2.9203170629954113e-07, "loss": 0.2973, "step": 2650 }, { "epoch": 2.9825548677546427, "grad_norm": 0.17558302415161703, "learning_rate": 2.7117229870671674e-07, "loss": 0.2687, "step": 2651 }, { "epoch": 2.983680360157569, "grad_norm": 0.19984560884749847, "learning_rate": 2.503128911138924e-07, "loss": 0.2917, "step": 2652 }, { "epoch": 2.9848058525604952, "grad_norm": 0.18904253639700785, "learning_rate": 2.2945348352106803e-07, "loss": 0.2881, "step": 2653 }, { "epoch": 2.9859313449634213, "grad_norm": 0.18998537420423053, "learning_rate": 2.0859407592824366e-07, "loss": 0.2899, "step": 2654 }, { "epoch": 2.9870568373663478, "grad_norm": 0.18137614988061299, "learning_rate": 1.8773466833541927e-07, "loss": 0.2673, "step": 2655 }, { "epoch": 2.9881823297692742, "grad_norm": 0.20380937660099302, "learning_rate": 1.668752607425949e-07, "loss": 0.2855, "step": 2656 }, { "epoch": 2.9893078221722003, "grad_norm": 0.18697756553685133, "learning_rate": 1.4601585314977056e-07, "loss": 0.2855, "step": 2657 }, { "epoch": 2.9904333145751267, "grad_norm": 0.18757566803524747, "learning_rate": 1.251564455569462e-07, "loss": 0.2881, "step": 2658 }, { "epoch": 2.9915588069780528, "grad_norm": 0.18601807443604287, "learning_rate": 1.0429703796412183e-07, "loss": 0.2848, "step": 2659 }, { "epoch": 2.9926842993809792, "grad_norm": 0.19183363220520985, "learning_rate": 8.343763037129745e-08, "loss": 0.2958, "step": 2660 }, { "epoch": 2.9938097917839057, "grad_norm": 0.20102940785890344, "learning_rate": 6.25782227784731e-08, "loss": 0.281, "step": 2661 }, { "epoch": 2.9949352841868317, "grad_norm": 0.189431904457012, "learning_rate": 4.1718815185648726e-08, "loss": 0.2813, "step": 2662 }, { "epoch": 2.996060776589758, "grad_norm": 0.17662567105058288, "learning_rate": 2.0859407592824363e-08, "loss": 0.2782, "step": 2663 }, { "epoch": 2.9971862689926843, "grad_norm": 0.18268732320356856, "learning_rate": 0.0, "loss": 0.2705, "step": 2664 }, { "epoch": 2.9971862689926843, "step": 2664, "total_flos": 2.27802848659977e+18, "train_loss": 0.43048976833845404, "train_runtime": 155129.8221, "train_samples_per_second": 0.275, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 2664, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.27802848659977e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }