diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18690 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9971862689926843, + "eval_steps": 500, + "global_step": 2664, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011254924029262803, + "grad_norm": 52.80103639717517, + "learning_rate": 1.8726591760299626e-07, + "loss": 11.1109, + "step": 1 + }, + { + "epoch": 0.0022509848058525606, + "grad_norm": 54.14343134228205, + "learning_rate": 3.7453183520599253e-07, + "loss": 11.1518, + "step": 2 + }, + { + "epoch": 0.0033764772087788407, + "grad_norm": 55.956204989803545, + "learning_rate": 5.617977528089887e-07, + "loss": 11.0032, + "step": 3 + }, + { + "epoch": 0.004501969611705121, + "grad_norm": 52.83455219346365, + "learning_rate": 7.490636704119851e-07, + "loss": 10.9916, + "step": 4 + }, + { + "epoch": 0.005627462014631401, + "grad_norm": 55.14267600803135, + "learning_rate": 9.363295880149814e-07, + "loss": 11.0876, + "step": 5 + }, + { + "epoch": 0.006752954417557681, + "grad_norm": 54.398858725586614, + "learning_rate": 1.1235955056179775e-06, + "loss": 11.1606, + "step": 6 + }, + { + "epoch": 0.007878446820483961, + "grad_norm": 54.32209978082585, + "learning_rate": 1.310861423220974e-06, + "loss": 11.0376, + "step": 7 + }, + { + "epoch": 0.009003939223410242, + "grad_norm": 58.95322793439154, + "learning_rate": 1.4981273408239701e-06, + "loss": 10.7967, + "step": 8 + }, + { + "epoch": 0.010129431626336522, + "grad_norm": 61.239998437535036, + "learning_rate": 1.6853932584269663e-06, + "loss": 10.6462, + "step": 9 + }, + { + "epoch": 0.011254924029262802, + "grad_norm": 61.97431021564583, + "learning_rate": 1.8726591760299627e-06, + "loss": 10.7333, + "step": 10 + }, + { + "epoch": 0.012380416432189083, + "grad_norm": 81.14048682798334, + "learning_rate": 2.0599250936329587e-06, + "loss": 9.5671, + "step": 11 + }, + { + "epoch": 0.013505908835115363, + "grad_norm": 86.6613892998916, + "learning_rate": 2.247191011235955e-06, + "loss": 9.3288, + "step": 12 + }, + { + "epoch": 0.014631401238041642, + "grad_norm": 97.56062675082107, + "learning_rate": 2.4344569288389516e-06, + "loss": 8.8569, + "step": 13 + }, + { + "epoch": 0.015756893640967922, + "grad_norm": 99.72668638976701, + "learning_rate": 2.621722846441948e-06, + "loss": 8.8229, + "step": 14 + }, + { + "epoch": 0.016882386043894203, + "grad_norm": 68.62441737325906, + "learning_rate": 2.808988764044944e-06, + "loss": 4.106, + "step": 15 + }, + { + "epoch": 0.018007878446820485, + "grad_norm": 60.37331191202543, + "learning_rate": 2.9962546816479402e-06, + "loss": 3.608, + "step": 16 + }, + { + "epoch": 0.019133370849746763, + "grad_norm": 48.17406200608835, + "learning_rate": 3.1835205992509364e-06, + "loss": 3.0803, + "step": 17 + }, + { + "epoch": 0.020258863252673044, + "grad_norm": 36.4342554407753, + "learning_rate": 3.3707865168539327e-06, + "loss": 2.5875, + "step": 18 + }, + { + "epoch": 0.021384355655599326, + "grad_norm": 31.524091079255932, + "learning_rate": 3.558052434456929e-06, + "loss": 2.33, + "step": 19 + }, + { + "epoch": 0.022509848058525603, + "grad_norm": 6.74346485385345, + "learning_rate": 3.7453183520599255e-06, + "loss": 1.3777, + "step": 20 + }, + { + "epoch": 0.023635340461451885, + "grad_norm": 5.1275756648686786, + "learning_rate": 3.932584269662922e-06, + "loss": 1.3449, + "step": 21 + }, + { + "epoch": 0.024760832864378166, + "grad_norm": 4.059089851221617, + "learning_rate": 4.1198501872659175e-06, + "loss": 1.248, + "step": 22 + }, + { + "epoch": 0.025886325267304444, + "grad_norm": 3.461699822238443, + "learning_rate": 4.307116104868914e-06, + "loss": 1.2294, + "step": 23 + }, + { + "epoch": 0.027011817670230726, + "grad_norm": 2.6194967883079197, + "learning_rate": 4.49438202247191e-06, + "loss": 1.1185, + "step": 24 + }, + { + "epoch": 0.028137310073157007, + "grad_norm": 2.1602870716248015, + "learning_rate": 4.6816479400749066e-06, + "loss": 1.1193, + "step": 25 + }, + { + "epoch": 0.029262802476083285, + "grad_norm": 1.8135748254982695, + "learning_rate": 4.868913857677903e-06, + "loss": 1.0589, + "step": 26 + }, + { + "epoch": 0.030388294879009566, + "grad_norm": 1.321133306398482, + "learning_rate": 5.056179775280899e-06, + "loss": 0.9617, + "step": 27 + }, + { + "epoch": 0.031513787281935844, + "grad_norm": 31.284838936554156, + "learning_rate": 5.243445692883896e-06, + "loss": 0.9937, + "step": 28 + }, + { + "epoch": 0.032639279684862126, + "grad_norm": 1.8832749187420972, + "learning_rate": 5.430711610486891e-06, + "loss": 0.9144, + "step": 29 + }, + { + "epoch": 0.03376477208778841, + "grad_norm": 1.4607030575064903, + "learning_rate": 5.617977528089888e-06, + "loss": 0.8985, + "step": 30 + }, + { + "epoch": 0.03489026449071469, + "grad_norm": 1.0370555540895343, + "learning_rate": 5.805243445692885e-06, + "loss": 0.8404, + "step": 31 + }, + { + "epoch": 0.03601575689364097, + "grad_norm": 0.9098468238765742, + "learning_rate": 5.9925093632958805e-06, + "loss": 0.8352, + "step": 32 + }, + { + "epoch": 0.03714124929656725, + "grad_norm": 0.8661052907885602, + "learning_rate": 6.179775280898876e-06, + "loss": 0.8258, + "step": 33 + }, + { + "epoch": 0.038266741699493526, + "grad_norm": 0.7740808609488935, + "learning_rate": 6.367041198501873e-06, + "loss": 0.8324, + "step": 34 + }, + { + "epoch": 0.03939223410241981, + "grad_norm": 0.7782713659204045, + "learning_rate": 6.554307116104869e-06, + "loss": 0.7588, + "step": 35 + }, + { + "epoch": 0.04051772650534609, + "grad_norm": 0.6841020945645767, + "learning_rate": 6.741573033707865e-06, + "loss": 0.7682, + "step": 36 + }, + { + "epoch": 0.04164321890827237, + "grad_norm": 0.6004121622938939, + "learning_rate": 6.928838951310862e-06, + "loss": 0.7549, + "step": 37 + }, + { + "epoch": 0.04276871131119865, + "grad_norm": 0.6229597097596257, + "learning_rate": 7.116104868913858e-06, + "loss": 0.7376, + "step": 38 + }, + { + "epoch": 0.04389420371412493, + "grad_norm": 0.7141033532392286, + "learning_rate": 7.303370786516854e-06, + "loss": 0.7535, + "step": 39 + }, + { + "epoch": 0.04501969611705121, + "grad_norm": 0.5725153155935927, + "learning_rate": 7.490636704119851e-06, + "loss": 0.7185, + "step": 40 + }, + { + "epoch": 0.04614518851997749, + "grad_norm": 0.5549438958370185, + "learning_rate": 7.677902621722846e-06, + "loss": 0.7518, + "step": 41 + }, + { + "epoch": 0.04727068092290377, + "grad_norm": 0.4660101265627369, + "learning_rate": 7.865168539325843e-06, + "loss": 0.6787, + "step": 42 + }, + { + "epoch": 0.04839617332583005, + "grad_norm": 0.4908539170309294, + "learning_rate": 8.05243445692884e-06, + "loss": 0.7032, + "step": 43 + }, + { + "epoch": 0.04952166572875633, + "grad_norm": 0.48924522260651016, + "learning_rate": 8.239700374531835e-06, + "loss": 0.6803, + "step": 44 + }, + { + "epoch": 0.050647158131682614, + "grad_norm": 0.475140896111031, + "learning_rate": 8.426966292134832e-06, + "loss": 0.6475, + "step": 45 + }, + { + "epoch": 0.05177265053460889, + "grad_norm": 0.4644093059355716, + "learning_rate": 8.614232209737828e-06, + "loss": 0.7013, + "step": 46 + }, + { + "epoch": 0.05289814293753517, + "grad_norm": 0.40301032630352857, + "learning_rate": 8.801498127340826e-06, + "loss": 0.6463, + "step": 47 + }, + { + "epoch": 0.05402363534046145, + "grad_norm": 0.43480363638927505, + "learning_rate": 8.98876404494382e-06, + "loss": 0.6775, + "step": 48 + }, + { + "epoch": 0.05514912774338773, + "grad_norm": 0.43971181451177166, + "learning_rate": 9.176029962546817e-06, + "loss": 0.7007, + "step": 49 + }, + { + "epoch": 0.056274620146314014, + "grad_norm": 0.41896418177510275, + "learning_rate": 9.363295880149813e-06, + "loss": 0.6468, + "step": 50 + }, + { + "epoch": 0.057400112549240295, + "grad_norm": 0.4149971177588748, + "learning_rate": 9.550561797752809e-06, + "loss": 0.628, + "step": 51 + }, + { + "epoch": 0.05852560495216657, + "grad_norm": 0.37242192155253623, + "learning_rate": 9.737827715355806e-06, + "loss": 0.652, + "step": 52 + }, + { + "epoch": 0.05965109735509285, + "grad_norm": 0.327485240758468, + "learning_rate": 9.925093632958802e-06, + "loss": 0.6334, + "step": 53 + }, + { + "epoch": 0.06077658975801913, + "grad_norm": 0.36141343502753065, + "learning_rate": 1.0112359550561798e-05, + "loss": 0.6259, + "step": 54 + }, + { + "epoch": 0.061902082160945414, + "grad_norm": 0.38897211704559004, + "learning_rate": 1.0299625468164795e-05, + "loss": 0.6226, + "step": 55 + }, + { + "epoch": 0.06302757456387169, + "grad_norm": 0.36207952707026725, + "learning_rate": 1.0486891385767791e-05, + "loss": 0.6289, + "step": 56 + }, + { + "epoch": 0.06415306696679797, + "grad_norm": 0.28595916020001694, + "learning_rate": 1.0674157303370787e-05, + "loss": 0.6149, + "step": 57 + }, + { + "epoch": 0.06527855936972425, + "grad_norm": 0.29359683815567633, + "learning_rate": 1.0861423220973783e-05, + "loss": 0.5888, + "step": 58 + }, + { + "epoch": 0.06640405177265053, + "grad_norm": 0.3228509053817298, + "learning_rate": 1.104868913857678e-05, + "loss": 0.6328, + "step": 59 + }, + { + "epoch": 0.06752954417557681, + "grad_norm": 0.3068303518794903, + "learning_rate": 1.1235955056179776e-05, + "loss": 0.571, + "step": 60 + }, + { + "epoch": 0.0686550365785031, + "grad_norm": 0.3231501093567655, + "learning_rate": 1.1423220973782772e-05, + "loss": 0.5728, + "step": 61 + }, + { + "epoch": 0.06978052898142938, + "grad_norm": 0.2827526067701556, + "learning_rate": 1.161048689138577e-05, + "loss": 0.5919, + "step": 62 + }, + { + "epoch": 0.07090602138435566, + "grad_norm": 0.3490733036925077, + "learning_rate": 1.1797752808988765e-05, + "loss": 0.6319, + "step": 63 + }, + { + "epoch": 0.07203151378728194, + "grad_norm": 0.36049201575238243, + "learning_rate": 1.1985018726591761e-05, + "loss": 0.6065, + "step": 64 + }, + { + "epoch": 0.07315700619020822, + "grad_norm": 0.2817612900392732, + "learning_rate": 1.2172284644194758e-05, + "loss": 0.6022, + "step": 65 + }, + { + "epoch": 0.0742824985931345, + "grad_norm": 0.27300283931060443, + "learning_rate": 1.2359550561797752e-05, + "loss": 0.5783, + "step": 66 + }, + { + "epoch": 0.07540799099606077, + "grad_norm": 0.3421112627990278, + "learning_rate": 1.254681647940075e-05, + "loss": 0.576, + "step": 67 + }, + { + "epoch": 0.07653348339898705, + "grad_norm": 0.33598705329341366, + "learning_rate": 1.2734082397003746e-05, + "loss": 0.5835, + "step": 68 + }, + { + "epoch": 0.07765897580191333, + "grad_norm": 0.27960476280957486, + "learning_rate": 1.2921348314606743e-05, + "loss": 0.5987, + "step": 69 + }, + { + "epoch": 0.07878446820483961, + "grad_norm": 0.2965350125129841, + "learning_rate": 1.3108614232209737e-05, + "loss": 0.6026, + "step": 70 + }, + { + "epoch": 0.0799099606077659, + "grad_norm": 0.3122772390396813, + "learning_rate": 1.3295880149812733e-05, + "loss": 0.574, + "step": 71 + }, + { + "epoch": 0.08103545301069218, + "grad_norm": 0.3021816040434541, + "learning_rate": 1.348314606741573e-05, + "loss": 0.5771, + "step": 72 + }, + { + "epoch": 0.08216094541361846, + "grad_norm": 0.2831578746374877, + "learning_rate": 1.3670411985018728e-05, + "loss": 0.5675, + "step": 73 + }, + { + "epoch": 0.08328643781654474, + "grad_norm": 0.32441513984635667, + "learning_rate": 1.3857677902621724e-05, + "loss": 0.5652, + "step": 74 + }, + { + "epoch": 0.08441193021947102, + "grad_norm": 0.31509832756589373, + "learning_rate": 1.4044943820224721e-05, + "loss": 0.5725, + "step": 75 + }, + { + "epoch": 0.0855374226223973, + "grad_norm": 0.3068003737845105, + "learning_rate": 1.4232209737827715e-05, + "loss": 0.5921, + "step": 76 + }, + { + "epoch": 0.08666291502532358, + "grad_norm": 0.28569121242288503, + "learning_rate": 1.4419475655430711e-05, + "loss": 0.5517, + "step": 77 + }, + { + "epoch": 0.08778840742824986, + "grad_norm": 0.30318713510099926, + "learning_rate": 1.4606741573033709e-05, + "loss": 0.5786, + "step": 78 + }, + { + "epoch": 0.08891389983117615, + "grad_norm": 0.32791686866753017, + "learning_rate": 1.4794007490636705e-05, + "loss": 0.5835, + "step": 79 + }, + { + "epoch": 0.09003939223410241, + "grad_norm": 0.34541735995694495, + "learning_rate": 1.4981273408239702e-05, + "loss": 0.6003, + "step": 80 + }, + { + "epoch": 0.0911648846370287, + "grad_norm": 0.24219057822403553, + "learning_rate": 1.5168539325842698e-05, + "loss": 0.5634, + "step": 81 + }, + { + "epoch": 0.09229037703995498, + "grad_norm": 0.3066124460269189, + "learning_rate": 1.5355805243445692e-05, + "loss": 0.5385, + "step": 82 + }, + { + "epoch": 0.09341586944288126, + "grad_norm": 0.36004311246679105, + "learning_rate": 1.554307116104869e-05, + "loss": 0.542, + "step": 83 + }, + { + "epoch": 0.09454136184580754, + "grad_norm": 0.277294813524559, + "learning_rate": 1.5730337078651687e-05, + "loss": 0.5467, + "step": 84 + }, + { + "epoch": 0.09566685424873382, + "grad_norm": 0.2742529403377881, + "learning_rate": 1.591760299625468e-05, + "loss": 0.5337, + "step": 85 + }, + { + "epoch": 0.0967923466516601, + "grad_norm": 0.37776459034853405, + "learning_rate": 1.610486891385768e-05, + "loss": 0.5392, + "step": 86 + }, + { + "epoch": 0.09791783905458638, + "grad_norm": 0.29713498839858976, + "learning_rate": 1.6292134831460676e-05, + "loss": 0.5513, + "step": 87 + }, + { + "epoch": 0.09904333145751266, + "grad_norm": 0.2677802103514856, + "learning_rate": 1.647940074906367e-05, + "loss": 0.5435, + "step": 88 + }, + { + "epoch": 0.10016882386043895, + "grad_norm": 0.3282651538789268, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.5556, + "step": 89 + }, + { + "epoch": 0.10129431626336523, + "grad_norm": 0.2903898300830952, + "learning_rate": 1.6853932584269665e-05, + "loss": 0.5182, + "step": 90 + }, + { + "epoch": 0.10241980866629151, + "grad_norm": 0.32940772248776146, + "learning_rate": 1.704119850187266e-05, + "loss": 0.5651, + "step": 91 + }, + { + "epoch": 0.10354530106921778, + "grad_norm": 0.29877064796568714, + "learning_rate": 1.7228464419475657e-05, + "loss": 0.5232, + "step": 92 + }, + { + "epoch": 0.10467079347214406, + "grad_norm": 0.3033306544759112, + "learning_rate": 1.7415730337078654e-05, + "loss": 0.5415, + "step": 93 + }, + { + "epoch": 0.10579628587507034, + "grad_norm": 0.298699393195244, + "learning_rate": 1.760299625468165e-05, + "loss": 0.5351, + "step": 94 + }, + { + "epoch": 0.10692177827799662, + "grad_norm": 0.27344653088956217, + "learning_rate": 1.7790262172284646e-05, + "loss": 0.5401, + "step": 95 + }, + { + "epoch": 0.1080472706809229, + "grad_norm": 0.2901283593528549, + "learning_rate": 1.797752808988764e-05, + "loss": 0.5548, + "step": 96 + }, + { + "epoch": 0.10917276308384918, + "grad_norm": 0.2955399438690073, + "learning_rate": 1.8164794007490637e-05, + "loss": 0.5336, + "step": 97 + }, + { + "epoch": 0.11029825548677546, + "grad_norm": 0.3044365394746884, + "learning_rate": 1.8352059925093635e-05, + "loss": 0.5095, + "step": 98 + }, + { + "epoch": 0.11142374788970175, + "grad_norm": 0.26929920330702195, + "learning_rate": 1.8539325842696632e-05, + "loss": 0.5569, + "step": 99 + }, + { + "epoch": 0.11254924029262803, + "grad_norm": 0.36727845819131605, + "learning_rate": 1.8726591760299626e-05, + "loss": 0.5818, + "step": 100 + }, + { + "epoch": 0.11367473269555431, + "grad_norm": 0.2836581651837986, + "learning_rate": 1.891385767790262e-05, + "loss": 0.5373, + "step": 101 + }, + { + "epoch": 0.11480022509848059, + "grad_norm": 0.29593257115280464, + "learning_rate": 1.9101123595505618e-05, + "loss": 0.5131, + "step": 102 + }, + { + "epoch": 0.11592571750140687, + "grad_norm": 0.29964886160890525, + "learning_rate": 1.9288389513108615e-05, + "loss": 0.5044, + "step": 103 + }, + { + "epoch": 0.11705120990433314, + "grad_norm": 0.30009105696644967, + "learning_rate": 1.9475655430711613e-05, + "loss": 0.536, + "step": 104 + }, + { + "epoch": 0.11817670230725942, + "grad_norm": 0.29291717707624504, + "learning_rate": 1.9662921348314607e-05, + "loss": 0.5505, + "step": 105 + }, + { + "epoch": 0.1193021947101857, + "grad_norm": 0.3294836067555843, + "learning_rate": 1.9850187265917604e-05, + "loss": 0.5505, + "step": 106 + }, + { + "epoch": 0.12042768711311198, + "grad_norm": 0.29401137621422074, + "learning_rate": 2.00374531835206e-05, + "loss": 0.528, + "step": 107 + }, + { + "epoch": 0.12155317951603826, + "grad_norm": 0.3030811720009754, + "learning_rate": 2.0224719101123596e-05, + "loss": 0.538, + "step": 108 + }, + { + "epoch": 0.12267867191896455, + "grad_norm": 0.32674282662604665, + "learning_rate": 2.0411985018726593e-05, + "loss": 0.541, + "step": 109 + }, + { + "epoch": 0.12380416432189083, + "grad_norm": 0.30319983351235286, + "learning_rate": 2.059925093632959e-05, + "loss": 0.504, + "step": 110 + }, + { + "epoch": 0.12492965672481711, + "grad_norm": 0.3402565154469349, + "learning_rate": 2.0786516853932585e-05, + "loss": 0.5251, + "step": 111 + }, + { + "epoch": 0.12605514912774338, + "grad_norm": 0.31872601282001034, + "learning_rate": 2.0973782771535582e-05, + "loss": 0.5286, + "step": 112 + }, + { + "epoch": 0.12718064153066966, + "grad_norm": 0.34754536732763297, + "learning_rate": 2.1161048689138577e-05, + "loss": 0.5235, + "step": 113 + }, + { + "epoch": 0.12830613393359594, + "grad_norm": 0.30998860710868686, + "learning_rate": 2.1348314606741574e-05, + "loss": 0.525, + "step": 114 + }, + { + "epoch": 0.12943162633652222, + "grad_norm": 0.32990918540472725, + "learning_rate": 2.153558052434457e-05, + "loss": 0.5265, + "step": 115 + }, + { + "epoch": 0.1305571187394485, + "grad_norm": 0.3423710738146026, + "learning_rate": 2.1722846441947566e-05, + "loss": 0.5338, + "step": 116 + }, + { + "epoch": 0.13168261114237478, + "grad_norm": 0.2872199647047314, + "learning_rate": 2.1910112359550563e-05, + "loss": 0.5299, + "step": 117 + }, + { + "epoch": 0.13280810354530106, + "grad_norm": 0.3317448545714626, + "learning_rate": 2.209737827715356e-05, + "loss": 0.4959, + "step": 118 + }, + { + "epoch": 0.13393359594822735, + "grad_norm": 0.31417498563521173, + "learning_rate": 2.2284644194756555e-05, + "loss": 0.52, + "step": 119 + }, + { + "epoch": 0.13505908835115363, + "grad_norm": 0.3645759776734259, + "learning_rate": 2.2471910112359552e-05, + "loss": 0.5296, + "step": 120 + }, + { + "epoch": 0.1361845807540799, + "grad_norm": 0.3180662213331512, + "learning_rate": 2.2659176029962546e-05, + "loss": 0.5063, + "step": 121 + }, + { + "epoch": 0.1373100731570062, + "grad_norm": 0.3716923342200342, + "learning_rate": 2.2846441947565544e-05, + "loss": 0.5046, + "step": 122 + }, + { + "epoch": 0.13843556555993247, + "grad_norm": 0.39150702044794555, + "learning_rate": 2.303370786516854e-05, + "loss": 0.4959, + "step": 123 + }, + { + "epoch": 0.13956105796285875, + "grad_norm": 0.3713739740316023, + "learning_rate": 2.322097378277154e-05, + "loss": 0.5015, + "step": 124 + }, + { + "epoch": 0.14068655036578503, + "grad_norm": 0.355150041365192, + "learning_rate": 2.3408239700374533e-05, + "loss": 0.5029, + "step": 125 + }, + { + "epoch": 0.14181204276871132, + "grad_norm": 0.47357406433732624, + "learning_rate": 2.359550561797753e-05, + "loss": 0.519, + "step": 126 + }, + { + "epoch": 0.1429375351716376, + "grad_norm": 0.35841513558308474, + "learning_rate": 2.3782771535580524e-05, + "loss": 0.517, + "step": 127 + }, + { + "epoch": 0.14406302757456388, + "grad_norm": 0.32127121635614614, + "learning_rate": 2.3970037453183522e-05, + "loss": 0.5068, + "step": 128 + }, + { + "epoch": 0.14518851997749016, + "grad_norm": 0.41380038534756397, + "learning_rate": 2.415730337078652e-05, + "loss": 0.53, + "step": 129 + }, + { + "epoch": 0.14631401238041644, + "grad_norm": 0.3342860962607464, + "learning_rate": 2.4344569288389517e-05, + "loss": 0.5131, + "step": 130 + }, + { + "epoch": 0.14743950478334272, + "grad_norm": 0.328086226882181, + "learning_rate": 2.453183520599251e-05, + "loss": 0.5359, + "step": 131 + }, + { + "epoch": 0.148564997186269, + "grad_norm": 0.3980527154392636, + "learning_rate": 2.4719101123595505e-05, + "loss": 0.4915, + "step": 132 + }, + { + "epoch": 0.14969048958919529, + "grad_norm": 0.3664150255854856, + "learning_rate": 2.4906367041198502e-05, + "loss": 0.5239, + "step": 133 + }, + { + "epoch": 0.15081598199212154, + "grad_norm": 0.36032405515932203, + "learning_rate": 2.50936329588015e-05, + "loss": 0.5085, + "step": 134 + }, + { + "epoch": 0.15194147439504782, + "grad_norm": 0.4406027959320581, + "learning_rate": 2.5280898876404497e-05, + "loss": 0.5126, + "step": 135 + }, + { + "epoch": 0.1530669667979741, + "grad_norm": 0.344695754779841, + "learning_rate": 2.546816479400749e-05, + "loss": 0.5122, + "step": 136 + }, + { + "epoch": 0.15419245920090038, + "grad_norm": 0.3726483933183008, + "learning_rate": 2.565543071161049e-05, + "loss": 0.4905, + "step": 137 + }, + { + "epoch": 0.15531795160382666, + "grad_norm": 0.3449312763960655, + "learning_rate": 2.5842696629213486e-05, + "loss": 0.4987, + "step": 138 + }, + { + "epoch": 0.15644344400675295, + "grad_norm": 0.35328970504291957, + "learning_rate": 2.6029962546816484e-05, + "loss": 0.5054, + "step": 139 + }, + { + "epoch": 0.15756893640967923, + "grad_norm": 0.3700337092111675, + "learning_rate": 2.6217228464419475e-05, + "loss": 0.509, + "step": 140 + }, + { + "epoch": 0.1586944288126055, + "grad_norm": 0.301320056673764, + "learning_rate": 2.6404494382022472e-05, + "loss": 0.4958, + "step": 141 + }, + { + "epoch": 0.1598199212155318, + "grad_norm": 0.4191378953980472, + "learning_rate": 2.6591760299625466e-05, + "loss": 0.5387, + "step": 142 + }, + { + "epoch": 0.16094541361845807, + "grad_norm": 0.3880541184543602, + "learning_rate": 2.6779026217228464e-05, + "loss": 0.5227, + "step": 143 + }, + { + "epoch": 0.16207090602138435, + "grad_norm": 0.39927231059272483, + "learning_rate": 2.696629213483146e-05, + "loss": 0.5237, + "step": 144 + }, + { + "epoch": 0.16319639842431063, + "grad_norm": 0.3961271339819255, + "learning_rate": 2.715355805243446e-05, + "loss": 0.5194, + "step": 145 + }, + { + "epoch": 0.16432189082723692, + "grad_norm": 0.4376696251019293, + "learning_rate": 2.7340823970037456e-05, + "loss": 0.5178, + "step": 146 + }, + { + "epoch": 0.1654473832301632, + "grad_norm": 0.44058938921182966, + "learning_rate": 2.752808988764045e-05, + "loss": 0.4998, + "step": 147 + }, + { + "epoch": 0.16657287563308948, + "grad_norm": 0.35261095257281155, + "learning_rate": 2.7715355805243448e-05, + "loss": 0.499, + "step": 148 + }, + { + "epoch": 0.16769836803601576, + "grad_norm": 0.5218533410763981, + "learning_rate": 2.7902621722846445e-05, + "loss": 0.5273, + "step": 149 + }, + { + "epoch": 0.16882386043894204, + "grad_norm": 0.4737891842741366, + "learning_rate": 2.8089887640449443e-05, + "loss": 0.5003, + "step": 150 + }, + { + "epoch": 0.16994935284186832, + "grad_norm": 0.392922001496729, + "learning_rate": 2.8277153558052437e-05, + "loss": 0.5016, + "step": 151 + }, + { + "epoch": 0.1710748452447946, + "grad_norm": 0.5302514501231146, + "learning_rate": 2.846441947565543e-05, + "loss": 0.5172, + "step": 152 + }, + { + "epoch": 0.17220033764772089, + "grad_norm": 0.49803115823639127, + "learning_rate": 2.8651685393258425e-05, + "loss": 0.4946, + "step": 153 + }, + { + "epoch": 0.17332583005064717, + "grad_norm": 0.4128451133760804, + "learning_rate": 2.8838951310861422e-05, + "loss": 0.5232, + "step": 154 + }, + { + "epoch": 0.17445132245357345, + "grad_norm": 0.6316627266885098, + "learning_rate": 2.902621722846442e-05, + "loss": 0.5059, + "step": 155 + }, + { + "epoch": 0.17557681485649973, + "grad_norm": 0.5295204042861669, + "learning_rate": 2.9213483146067417e-05, + "loss": 0.5243, + "step": 156 + }, + { + "epoch": 0.176702307259426, + "grad_norm": 0.45607245497823934, + "learning_rate": 2.940074906367041e-05, + "loss": 0.4821, + "step": 157 + }, + { + "epoch": 0.1778277996623523, + "grad_norm": 0.6021144875229769, + "learning_rate": 2.958801498127341e-05, + "loss": 0.5103, + "step": 158 + }, + { + "epoch": 0.17895329206527855, + "grad_norm": 0.48529780373586173, + "learning_rate": 2.9775280898876406e-05, + "loss": 0.4922, + "step": 159 + }, + { + "epoch": 0.18007878446820483, + "grad_norm": 0.4250055623471545, + "learning_rate": 2.9962546816479404e-05, + "loss": 0.4904, + "step": 160 + }, + { + "epoch": 0.1812042768711311, + "grad_norm": 0.6512919492582171, + "learning_rate": 3.01498127340824e-05, + "loss": 0.5145, + "step": 161 + }, + { + "epoch": 0.1823297692740574, + "grad_norm": 0.45356537836570343, + "learning_rate": 3.0337078651685396e-05, + "loss": 0.489, + "step": 162 + }, + { + "epoch": 0.18345526167698367, + "grad_norm": 0.4587778769232854, + "learning_rate": 3.052434456928839e-05, + "loss": 0.5031, + "step": 163 + }, + { + "epoch": 0.18458075407990995, + "grad_norm": 0.5209259547751122, + "learning_rate": 3.0711610486891384e-05, + "loss": 0.5122, + "step": 164 + }, + { + "epoch": 0.18570624648283623, + "grad_norm": 0.3205075873383086, + "learning_rate": 3.089887640449438e-05, + "loss": 0.484, + "step": 165 + }, + { + "epoch": 0.18683173888576252, + "grad_norm": 0.44421922243323253, + "learning_rate": 3.108614232209738e-05, + "loss": 0.4885, + "step": 166 + }, + { + "epoch": 0.1879572312886888, + "grad_norm": 0.38376560722257824, + "learning_rate": 3.1273408239700376e-05, + "loss": 0.5137, + "step": 167 + }, + { + "epoch": 0.18908272369161508, + "grad_norm": 0.33178548545336345, + "learning_rate": 3.1460674157303374e-05, + "loss": 0.5214, + "step": 168 + }, + { + "epoch": 0.19020821609454136, + "grad_norm": 0.3543354285220051, + "learning_rate": 3.164794007490637e-05, + "loss": 0.4652, + "step": 169 + }, + { + "epoch": 0.19133370849746764, + "grad_norm": 0.34821873695435235, + "learning_rate": 3.183520599250936e-05, + "loss": 0.4695, + "step": 170 + }, + { + "epoch": 0.19245920090039392, + "grad_norm": 0.346452239854666, + "learning_rate": 3.202247191011236e-05, + "loss": 0.4891, + "step": 171 + }, + { + "epoch": 0.1935846933033202, + "grad_norm": 0.4398933317388218, + "learning_rate": 3.220973782771536e-05, + "loss": 0.4911, + "step": 172 + }, + { + "epoch": 0.19471018570624649, + "grad_norm": 0.3624677233826849, + "learning_rate": 3.2397003745318354e-05, + "loss": 0.4912, + "step": 173 + }, + { + "epoch": 0.19583567810917277, + "grad_norm": 0.3699640798637125, + "learning_rate": 3.258426966292135e-05, + "loss": 0.5004, + "step": 174 + }, + { + "epoch": 0.19696117051209905, + "grad_norm": 0.41958584077529965, + "learning_rate": 3.277153558052435e-05, + "loss": 0.4752, + "step": 175 + }, + { + "epoch": 0.19808666291502533, + "grad_norm": 0.42502324118725465, + "learning_rate": 3.295880149812734e-05, + "loss": 0.4922, + "step": 176 + }, + { + "epoch": 0.1992121553179516, + "grad_norm": 0.36517865445954867, + "learning_rate": 3.314606741573034e-05, + "loss": 0.5048, + "step": 177 + }, + { + "epoch": 0.2003376477208779, + "grad_norm": 0.41574946856579004, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4821, + "step": 178 + }, + { + "epoch": 0.20146314012380417, + "grad_norm": 0.4378349227779501, + "learning_rate": 3.352059925093633e-05, + "loss": 0.4933, + "step": 179 + }, + { + "epoch": 0.20258863252673046, + "grad_norm": 0.4776232193190497, + "learning_rate": 3.370786516853933e-05, + "loss": 0.4751, + "step": 180 + }, + { + "epoch": 0.20371412492965674, + "grad_norm": 0.43848154415790724, + "learning_rate": 3.389513108614232e-05, + "loss": 0.4807, + "step": 181 + }, + { + "epoch": 0.20483961733258302, + "grad_norm": 0.5845165253893854, + "learning_rate": 3.408239700374532e-05, + "loss": 0.5009, + "step": 182 + }, + { + "epoch": 0.20596510973550927, + "grad_norm": 0.4082870415882244, + "learning_rate": 3.4269662921348316e-05, + "loss": 0.4979, + "step": 183 + }, + { + "epoch": 0.20709060213843555, + "grad_norm": 0.4718495231442118, + "learning_rate": 3.445692883895131e-05, + "loss": 0.5038, + "step": 184 + }, + { + "epoch": 0.20821609454136183, + "grad_norm": 0.473455230356082, + "learning_rate": 3.464419475655431e-05, + "loss": 0.4833, + "step": 185 + }, + { + "epoch": 0.20934158694428812, + "grad_norm": 0.46125301819415737, + "learning_rate": 3.483146067415731e-05, + "loss": 0.4889, + "step": 186 + }, + { + "epoch": 0.2104670793472144, + "grad_norm": 0.4432990112364735, + "learning_rate": 3.5018726591760305e-05, + "loss": 0.4807, + "step": 187 + }, + { + "epoch": 0.21159257175014068, + "grad_norm": 0.5322159815450351, + "learning_rate": 3.52059925093633e-05, + "loss": 0.5219, + "step": 188 + }, + { + "epoch": 0.21271806415306696, + "grad_norm": 0.3936101857237881, + "learning_rate": 3.5393258426966294e-05, + "loss": 0.4922, + "step": 189 + }, + { + "epoch": 0.21384355655599324, + "grad_norm": 0.53345431181708, + "learning_rate": 3.558052434456929e-05, + "loss": 0.4897, + "step": 190 + }, + { + "epoch": 0.21496904895891952, + "grad_norm": 0.6142180117371377, + "learning_rate": 3.576779026217228e-05, + "loss": 0.493, + "step": 191 + }, + { + "epoch": 0.2160945413618458, + "grad_norm": 0.41631870227046885, + "learning_rate": 3.595505617977528e-05, + "loss": 0.4893, + "step": 192 + }, + { + "epoch": 0.21722003376477209, + "grad_norm": 0.4305104245523169, + "learning_rate": 3.614232209737828e-05, + "loss": 0.4866, + "step": 193 + }, + { + "epoch": 0.21834552616769837, + "grad_norm": 0.5169903617970245, + "learning_rate": 3.6329588014981274e-05, + "loss": 0.4901, + "step": 194 + }, + { + "epoch": 0.21947101857062465, + "grad_norm": 0.3860200825591215, + "learning_rate": 3.651685393258427e-05, + "loss": 0.4549, + "step": 195 + }, + { + "epoch": 0.22059651097355093, + "grad_norm": 0.5230520554579277, + "learning_rate": 3.670411985018727e-05, + "loss": 0.4724, + "step": 196 + }, + { + "epoch": 0.2217220033764772, + "grad_norm": 0.39548431249473126, + "learning_rate": 3.689138576779027e-05, + "loss": 0.4911, + "step": 197 + }, + { + "epoch": 0.2228474957794035, + "grad_norm": 0.48800271319592975, + "learning_rate": 3.7078651685393264e-05, + "loss": 0.4572, + "step": 198 + }, + { + "epoch": 0.22397298818232977, + "grad_norm": 0.41978987611240903, + "learning_rate": 3.726591760299626e-05, + "loss": 0.4829, + "step": 199 + }, + { + "epoch": 0.22509848058525606, + "grad_norm": 0.5469472170008755, + "learning_rate": 3.745318352059925e-05, + "loss": 0.5274, + "step": 200 + }, + { + "epoch": 0.22622397298818234, + "grad_norm": 0.3918679709299485, + "learning_rate": 3.764044943820225e-05, + "loss": 0.4766, + "step": 201 + }, + { + "epoch": 0.22734946539110862, + "grad_norm": 0.4611366578168398, + "learning_rate": 3.782771535580524e-05, + "loss": 0.4533, + "step": 202 + }, + { + "epoch": 0.2284749577940349, + "grad_norm": 0.35525042824778863, + "learning_rate": 3.801498127340824e-05, + "loss": 0.4801, + "step": 203 + }, + { + "epoch": 0.22960045019696118, + "grad_norm": 0.39795327337608555, + "learning_rate": 3.8202247191011236e-05, + "loss": 0.4796, + "step": 204 + }, + { + "epoch": 0.23072594259988746, + "grad_norm": 0.40314796565206873, + "learning_rate": 3.838951310861423e-05, + "loss": 0.4746, + "step": 205 + }, + { + "epoch": 0.23185143500281374, + "grad_norm": 0.6186856651894296, + "learning_rate": 3.857677902621723e-05, + "loss": 0.483, + "step": 206 + }, + { + "epoch": 0.23297692740574, + "grad_norm": 0.6262230101782875, + "learning_rate": 3.876404494382023e-05, + "loss": 0.5026, + "step": 207 + }, + { + "epoch": 0.23410241980866628, + "grad_norm": 0.6885063622065476, + "learning_rate": 3.8951310861423226e-05, + "loss": 0.4961, + "step": 208 + }, + { + "epoch": 0.23522791221159256, + "grad_norm": 0.46434262483818484, + "learning_rate": 3.913857677902622e-05, + "loss": 0.4824, + "step": 209 + }, + { + "epoch": 0.23635340461451884, + "grad_norm": 0.5876521011749303, + "learning_rate": 3.9325842696629214e-05, + "loss": 0.4799, + "step": 210 + }, + { + "epoch": 0.23747889701744512, + "grad_norm": 0.5679577524617186, + "learning_rate": 3.951310861423221e-05, + "loss": 0.4976, + "step": 211 + }, + { + "epoch": 0.2386043894203714, + "grad_norm": 0.4948818608996542, + "learning_rate": 3.970037453183521e-05, + "loss": 0.4689, + "step": 212 + }, + { + "epoch": 0.23972988182329769, + "grad_norm": 0.5366944392366912, + "learning_rate": 3.98876404494382e-05, + "loss": 0.5052, + "step": 213 + }, + { + "epoch": 0.24085537422622397, + "grad_norm": 0.46091893449282645, + "learning_rate": 4.00749063670412e-05, + "loss": 0.504, + "step": 214 + }, + { + "epoch": 0.24198086662915025, + "grad_norm": 0.6227113043840353, + "learning_rate": 4.0262172284644194e-05, + "loss": 0.4947, + "step": 215 + }, + { + "epoch": 0.24310635903207653, + "grad_norm": 0.37975248168226977, + "learning_rate": 4.044943820224719e-05, + "loss": 0.4681, + "step": 216 + }, + { + "epoch": 0.2442318514350028, + "grad_norm": 0.6602796166859184, + "learning_rate": 4.063670411985019e-05, + "loss": 0.4809, + "step": 217 + }, + { + "epoch": 0.2453573438379291, + "grad_norm": 0.46707379726848597, + "learning_rate": 4.082397003745319e-05, + "loss": 0.4586, + "step": 218 + }, + { + "epoch": 0.24648283624085537, + "grad_norm": 0.58153678436466, + "learning_rate": 4.1011235955056184e-05, + "loss": 0.508, + "step": 219 + }, + { + "epoch": 0.24760832864378166, + "grad_norm": 0.6402167998756934, + "learning_rate": 4.119850187265918e-05, + "loss": 0.5039, + "step": 220 + }, + { + "epoch": 0.24873382104670794, + "grad_norm": 0.5794603595581886, + "learning_rate": 4.138576779026217e-05, + "loss": 0.4653, + "step": 221 + }, + { + "epoch": 0.24985931344963422, + "grad_norm": 0.5230659913502629, + "learning_rate": 4.157303370786517e-05, + "loss": 0.5038, + "step": 222 + }, + { + "epoch": 0.2509848058525605, + "grad_norm": 0.6799656883961334, + "learning_rate": 4.176029962546817e-05, + "loss": 0.4898, + "step": 223 + }, + { + "epoch": 0.25211029825548675, + "grad_norm": 0.6621661466046944, + "learning_rate": 4.1947565543071165e-05, + "loss": 0.4844, + "step": 224 + }, + { + "epoch": 0.25323579065841306, + "grad_norm": 0.5616167950816823, + "learning_rate": 4.2134831460674156e-05, + "loss": 0.488, + "step": 225 + }, + { + "epoch": 0.2543612830613393, + "grad_norm": 0.6575771702191943, + "learning_rate": 4.232209737827715e-05, + "loss": 0.4791, + "step": 226 + }, + { + "epoch": 0.2554867754642656, + "grad_norm": 0.5488369093936996, + "learning_rate": 4.250936329588015e-05, + "loss": 0.5005, + "step": 227 + }, + { + "epoch": 0.2566122678671919, + "grad_norm": 0.6144786136277036, + "learning_rate": 4.269662921348315e-05, + "loss": 0.4663, + "step": 228 + }, + { + "epoch": 0.2577377602701182, + "grad_norm": 0.600777544617447, + "learning_rate": 4.2883895131086146e-05, + "loss": 0.4786, + "step": 229 + }, + { + "epoch": 0.25886325267304444, + "grad_norm": 0.8023306491917337, + "learning_rate": 4.307116104868914e-05, + "loss": 0.4861, + "step": 230 + }, + { + "epoch": 0.25998874507597075, + "grad_norm": 0.5046450430408941, + "learning_rate": 4.3258426966292134e-05, + "loss": 0.4727, + "step": 231 + }, + { + "epoch": 0.261114237478897, + "grad_norm": 0.7089229481882726, + "learning_rate": 4.344569288389513e-05, + "loss": 0.4907, + "step": 232 + }, + { + "epoch": 0.2622397298818233, + "grad_norm": 0.7578774630778147, + "learning_rate": 4.363295880149813e-05, + "loss": 0.4568, + "step": 233 + }, + { + "epoch": 0.26336522228474957, + "grad_norm": 0.43469225376554593, + "learning_rate": 4.3820224719101126e-05, + "loss": 0.4882, + "step": 234 + }, + { + "epoch": 0.2644907146876759, + "grad_norm": 0.7064451875198454, + "learning_rate": 4.4007490636704124e-05, + "loss": 0.4777, + "step": 235 + }, + { + "epoch": 0.26561620709060213, + "grad_norm": 0.5236443057755517, + "learning_rate": 4.419475655430712e-05, + "loss": 0.4541, + "step": 236 + }, + { + "epoch": 0.26674169949352844, + "grad_norm": 0.49987222413307647, + "learning_rate": 4.438202247191011e-05, + "loss": 0.4813, + "step": 237 + }, + { + "epoch": 0.2678671918964547, + "grad_norm": 0.4887524110978959, + "learning_rate": 4.456928838951311e-05, + "loss": 0.4904, + "step": 238 + }, + { + "epoch": 0.268992684299381, + "grad_norm": 0.43081504689209343, + "learning_rate": 4.475655430711611e-05, + "loss": 0.4828, + "step": 239 + }, + { + "epoch": 0.27011817670230726, + "grad_norm": 0.4927726763052006, + "learning_rate": 4.4943820224719104e-05, + "loss": 0.4661, + "step": 240 + }, + { + "epoch": 0.27124366910523356, + "grad_norm": 0.48347879754332407, + "learning_rate": 4.51310861423221e-05, + "loss": 0.4698, + "step": 241 + }, + { + "epoch": 0.2723691615081598, + "grad_norm": 0.4867564616696551, + "learning_rate": 4.531835205992509e-05, + "loss": 0.49, + "step": 242 + }, + { + "epoch": 0.2734946539110861, + "grad_norm": 0.5374635091852026, + "learning_rate": 4.550561797752809e-05, + "loss": 0.4559, + "step": 243 + }, + { + "epoch": 0.2746201463140124, + "grad_norm": 0.4772645954109197, + "learning_rate": 4.569288389513109e-05, + "loss": 0.4717, + "step": 244 + }, + { + "epoch": 0.27574563871693863, + "grad_norm": 0.4347548915476938, + "learning_rate": 4.5880149812734085e-05, + "loss": 0.4532, + "step": 245 + }, + { + "epoch": 0.27687113111986494, + "grad_norm": 0.4759288672038084, + "learning_rate": 4.606741573033708e-05, + "loss": 0.4945, + "step": 246 + }, + { + "epoch": 0.2779966235227912, + "grad_norm": 0.5792894788506712, + "learning_rate": 4.625468164794008e-05, + "loss": 0.4983, + "step": 247 + }, + { + "epoch": 0.2791221159257175, + "grad_norm": 0.3980178738338264, + "learning_rate": 4.644194756554308e-05, + "loss": 0.4827, + "step": 248 + }, + { + "epoch": 0.28024760832864376, + "grad_norm": 0.5408869118408165, + "learning_rate": 4.662921348314607e-05, + "loss": 0.4528, + "step": 249 + }, + { + "epoch": 0.28137310073157007, + "grad_norm": 0.5689847204055498, + "learning_rate": 4.6816479400749066e-05, + "loss": 0.4598, + "step": 250 + }, + { + "epoch": 0.2824985931344963, + "grad_norm": 0.5617698646408457, + "learning_rate": 4.700374531835206e-05, + "loss": 0.5022, + "step": 251 + }, + { + "epoch": 0.28362408553742263, + "grad_norm": 0.45230255865587565, + "learning_rate": 4.719101123595506e-05, + "loss": 0.47, + "step": 252 + }, + { + "epoch": 0.2847495779403489, + "grad_norm": 0.42374590684633967, + "learning_rate": 4.737827715355805e-05, + "loss": 0.466, + "step": 253 + }, + { + "epoch": 0.2858750703432752, + "grad_norm": 0.4180084231861174, + "learning_rate": 4.756554307116105e-05, + "loss": 0.4747, + "step": 254 + }, + { + "epoch": 0.28700056274620145, + "grad_norm": 0.4787386782007702, + "learning_rate": 4.7752808988764046e-05, + "loss": 0.4611, + "step": 255 + }, + { + "epoch": 0.28812605514912776, + "grad_norm": 0.40084601864669134, + "learning_rate": 4.7940074906367044e-05, + "loss": 0.4577, + "step": 256 + }, + { + "epoch": 0.289251547552054, + "grad_norm": 0.5597034903024327, + "learning_rate": 4.812734082397004e-05, + "loss": 0.4843, + "step": 257 + }, + { + "epoch": 0.2903770399549803, + "grad_norm": 0.4389515417232518, + "learning_rate": 4.831460674157304e-05, + "loss": 0.4667, + "step": 258 + }, + { + "epoch": 0.2915025323579066, + "grad_norm": 0.5153267551952044, + "learning_rate": 4.8501872659176036e-05, + "loss": 0.4543, + "step": 259 + }, + { + "epoch": 0.2926280247608329, + "grad_norm": 0.4414645886637002, + "learning_rate": 4.8689138576779034e-05, + "loss": 0.4687, + "step": 260 + }, + { + "epoch": 0.29375351716375914, + "grad_norm": 0.5441323388581608, + "learning_rate": 4.8876404494382024e-05, + "loss": 0.4904, + "step": 261 + }, + { + "epoch": 0.29487900956668545, + "grad_norm": 0.47357188882841866, + "learning_rate": 4.906367041198502e-05, + "loss": 0.4646, + "step": 262 + }, + { + "epoch": 0.2960045019696117, + "grad_norm": 0.43582942744547837, + "learning_rate": 4.925093632958801e-05, + "loss": 0.4747, + "step": 263 + }, + { + "epoch": 0.297129994372538, + "grad_norm": 0.4659511261837298, + "learning_rate": 4.943820224719101e-05, + "loss": 0.4613, + "step": 264 + }, + { + "epoch": 0.29825548677546426, + "grad_norm": 0.4561502948161637, + "learning_rate": 4.962546816479401e-05, + "loss": 0.4691, + "step": 265 + }, + { + "epoch": 0.29938097917839057, + "grad_norm": 0.4541481932977169, + "learning_rate": 4.9812734082397005e-05, + "loss": 0.4661, + "step": 266 + }, + { + "epoch": 0.3005064715813168, + "grad_norm": 0.47547350717861037, + "learning_rate": 5e-05, + "loss": 0.4678, + "step": 267 + }, + { + "epoch": 0.3016319639842431, + "grad_norm": 0.406819603933647, + "learning_rate": 4.997914059240717e-05, + "loss": 0.4381, + "step": 268 + }, + { + "epoch": 0.3027574563871694, + "grad_norm": 0.4460099057965906, + "learning_rate": 4.9958281184814356e-05, + "loss": 0.468, + "step": 269 + }, + { + "epoch": 0.30388294879009564, + "grad_norm": 0.5142189061381415, + "learning_rate": 4.9937421777221527e-05, + "loss": 0.4714, + "step": 270 + }, + { + "epoch": 0.30500844119302195, + "grad_norm": 0.5665935978369622, + "learning_rate": 4.9916562369628704e-05, + "loss": 0.4818, + "step": 271 + }, + { + "epoch": 0.3061339335959482, + "grad_norm": 0.47186870063336683, + "learning_rate": 4.989570296203588e-05, + "loss": 0.4793, + "step": 272 + }, + { + "epoch": 0.3072594259988745, + "grad_norm": 0.457167782453592, + "learning_rate": 4.987484355444306e-05, + "loss": 0.4404, + "step": 273 + }, + { + "epoch": 0.30838491840180077, + "grad_norm": 0.5127134764884651, + "learning_rate": 4.985398414685023e-05, + "loss": 0.465, + "step": 274 + }, + { + "epoch": 0.3095104108047271, + "grad_norm": 0.6532795598287193, + "learning_rate": 4.983312473925741e-05, + "loss": 0.518, + "step": 275 + }, + { + "epoch": 0.31063590320765333, + "grad_norm": 0.6930677368699536, + "learning_rate": 4.981226533166458e-05, + "loss": 0.4866, + "step": 276 + }, + { + "epoch": 0.31176139561057964, + "grad_norm": 0.5061640947852638, + "learning_rate": 4.979140592407176e-05, + "loss": 0.4609, + "step": 277 + }, + { + "epoch": 0.3128868880135059, + "grad_norm": 0.5257849117764034, + "learning_rate": 4.9770546516478936e-05, + "loss": 0.457, + "step": 278 + }, + { + "epoch": 0.3140123804164322, + "grad_norm": 0.4321326701858845, + "learning_rate": 4.974968710888611e-05, + "loss": 0.4454, + "step": 279 + }, + { + "epoch": 0.31513787281935846, + "grad_norm": 0.38723391299657206, + "learning_rate": 4.972882770129328e-05, + "loss": 0.4643, + "step": 280 + }, + { + "epoch": 0.31626336522228476, + "grad_norm": 0.5298180060499573, + "learning_rate": 4.970796829370046e-05, + "loss": 0.4746, + "step": 281 + }, + { + "epoch": 0.317388857625211, + "grad_norm": 0.4061129177315826, + "learning_rate": 4.968710888610764e-05, + "loss": 0.4483, + "step": 282 + }, + { + "epoch": 0.3185143500281373, + "grad_norm": 0.44266904421962466, + "learning_rate": 4.9666249478514814e-05, + "loss": 0.4462, + "step": 283 + }, + { + "epoch": 0.3196398424310636, + "grad_norm": 0.5723516835538393, + "learning_rate": 4.964539007092199e-05, + "loss": 0.4912, + "step": 284 + }, + { + "epoch": 0.3207653348339899, + "grad_norm": 0.4268190071809671, + "learning_rate": 4.962453066332917e-05, + "loss": 0.4747, + "step": 285 + }, + { + "epoch": 0.32189082723691614, + "grad_norm": 0.5119510744401135, + "learning_rate": 4.960367125573634e-05, + "loss": 0.4822, + "step": 286 + }, + { + "epoch": 0.32301631963984245, + "grad_norm": 0.6845433911954164, + "learning_rate": 4.9582811848143515e-05, + "loss": 0.4784, + "step": 287 + }, + { + "epoch": 0.3241418120427687, + "grad_norm": 0.5512193560031954, + "learning_rate": 4.956195244055069e-05, + "loss": 0.4652, + "step": 288 + }, + { + "epoch": 0.325267304445695, + "grad_norm": 0.44176750937437276, + "learning_rate": 4.954109303295786e-05, + "loss": 0.4591, + "step": 289 + }, + { + "epoch": 0.32639279684862127, + "grad_norm": 0.4802436465499188, + "learning_rate": 4.952023362536504e-05, + "loss": 0.4646, + "step": 290 + }, + { + "epoch": 0.3275182892515476, + "grad_norm": 0.5130069834143126, + "learning_rate": 4.9499374217772216e-05, + "loss": 0.4632, + "step": 291 + }, + { + "epoch": 0.32864378165447383, + "grad_norm": 0.3492902519652929, + "learning_rate": 4.947851481017939e-05, + "loss": 0.4479, + "step": 292 + }, + { + "epoch": 0.3297692740574001, + "grad_norm": 0.44213997370659125, + "learning_rate": 4.945765540258657e-05, + "loss": 0.5066, + "step": 293 + }, + { + "epoch": 0.3308947664603264, + "grad_norm": 0.42377286032567374, + "learning_rate": 4.943679599499375e-05, + "loss": 0.4302, + "step": 294 + }, + { + "epoch": 0.33202025886325265, + "grad_norm": 0.394814830036649, + "learning_rate": 4.941593658740092e-05, + "loss": 0.4767, + "step": 295 + }, + { + "epoch": 0.33314575126617896, + "grad_norm": 0.4318002819787604, + "learning_rate": 4.9395077179808094e-05, + "loss": 0.4672, + "step": 296 + }, + { + "epoch": 0.3342712436691052, + "grad_norm": 0.4312528420970919, + "learning_rate": 4.937421777221527e-05, + "loss": 0.4301, + "step": 297 + }, + { + "epoch": 0.3353967360720315, + "grad_norm": 0.4506557238069578, + "learning_rate": 4.935335836462245e-05, + "loss": 0.4799, + "step": 298 + }, + { + "epoch": 0.3365222284749578, + "grad_norm": 0.4035767985622815, + "learning_rate": 4.933249895702962e-05, + "loss": 0.4685, + "step": 299 + }, + { + "epoch": 0.3376477208778841, + "grad_norm": 0.3974620013501897, + "learning_rate": 4.93116395494368e-05, + "loss": 0.4597, + "step": 300 + }, + { + "epoch": 0.33877321328081034, + "grad_norm": 0.4262423662726608, + "learning_rate": 4.929078014184397e-05, + "loss": 0.5115, + "step": 301 + }, + { + "epoch": 0.33989870568373665, + "grad_norm": 0.46574622767948337, + "learning_rate": 4.926992073425115e-05, + "loss": 0.4633, + "step": 302 + }, + { + "epoch": 0.3410241980866629, + "grad_norm": 0.3662096934434811, + "learning_rate": 4.9249061326658326e-05, + "loss": 0.459, + "step": 303 + }, + { + "epoch": 0.3421496904895892, + "grad_norm": 0.4345723771900289, + "learning_rate": 4.92282019190655e-05, + "loss": 0.4912, + "step": 304 + }, + { + "epoch": 0.34327518289251546, + "grad_norm": 0.43786190381782847, + "learning_rate": 4.9207342511472674e-05, + "loss": 0.4433, + "step": 305 + }, + { + "epoch": 0.34440067529544177, + "grad_norm": 0.502359362326431, + "learning_rate": 4.918648310387986e-05, + "loss": 0.4656, + "step": 306 + }, + { + "epoch": 0.345526167698368, + "grad_norm": 0.36663337654610195, + "learning_rate": 4.916562369628703e-05, + "loss": 0.4902, + "step": 307 + }, + { + "epoch": 0.34665166010129433, + "grad_norm": 0.5430348378224088, + "learning_rate": 4.9144764288694205e-05, + "loss": 0.4762, + "step": 308 + }, + { + "epoch": 0.3477771525042206, + "grad_norm": 0.5126322468885988, + "learning_rate": 4.912390488110138e-05, + "loss": 0.4537, + "step": 309 + }, + { + "epoch": 0.3489026449071469, + "grad_norm": 0.5095172872961993, + "learning_rate": 4.910304547350855e-05, + "loss": 0.4741, + "step": 310 + }, + { + "epoch": 0.35002813731007315, + "grad_norm": 0.48284683021884783, + "learning_rate": 4.908218606591573e-05, + "loss": 0.4726, + "step": 311 + }, + { + "epoch": 0.35115362971299946, + "grad_norm": 0.489802898549214, + "learning_rate": 4.9061326658322906e-05, + "loss": 0.4686, + "step": 312 + }, + { + "epoch": 0.3522791221159257, + "grad_norm": 0.39249156296212534, + "learning_rate": 4.904046725073008e-05, + "loss": 0.4695, + "step": 313 + }, + { + "epoch": 0.353404614518852, + "grad_norm": 0.5323916362107045, + "learning_rate": 4.901960784313725e-05, + "loss": 0.4965, + "step": 314 + }, + { + "epoch": 0.3545301069217783, + "grad_norm": 0.4533239040835354, + "learning_rate": 4.899874843554444e-05, + "loss": 0.4707, + "step": 315 + }, + { + "epoch": 0.3556555993247046, + "grad_norm": 0.4206387502485126, + "learning_rate": 4.897788902795161e-05, + "loss": 0.4579, + "step": 316 + }, + { + "epoch": 0.35678109172763084, + "grad_norm": 0.6269190751003352, + "learning_rate": 4.8957029620358784e-05, + "loss": 0.4693, + "step": 317 + }, + { + "epoch": 0.3579065841305571, + "grad_norm": 0.5737738257462925, + "learning_rate": 4.893617021276596e-05, + "loss": 0.4618, + "step": 318 + }, + { + "epoch": 0.3590320765334834, + "grad_norm": 0.4441915442589318, + "learning_rate": 4.891531080517314e-05, + "loss": 0.4873, + "step": 319 + }, + { + "epoch": 0.36015756893640966, + "grad_norm": 0.4262860610042779, + "learning_rate": 4.889445139758031e-05, + "loss": 0.4878, + "step": 320 + }, + { + "epoch": 0.36128306133933596, + "grad_norm": 0.4858584345693206, + "learning_rate": 4.8873591989987485e-05, + "loss": 0.4669, + "step": 321 + }, + { + "epoch": 0.3624085537422622, + "grad_norm": 0.3534387563975198, + "learning_rate": 4.885273258239466e-05, + "loss": 0.4811, + "step": 322 + }, + { + "epoch": 0.3635340461451885, + "grad_norm": 0.5271950734928447, + "learning_rate": 4.883187317480184e-05, + "loss": 0.4711, + "step": 323 + }, + { + "epoch": 0.3646595385481148, + "grad_norm": 0.409281575876073, + "learning_rate": 4.8811013767209016e-05, + "loss": 0.4659, + "step": 324 + }, + { + "epoch": 0.3657850309510411, + "grad_norm": 0.539894161794808, + "learning_rate": 4.879015435961619e-05, + "loss": 0.4617, + "step": 325 + }, + { + "epoch": 0.36691052335396734, + "grad_norm": 0.5540340872215955, + "learning_rate": 4.876929495202336e-05, + "loss": 0.4488, + "step": 326 + }, + { + "epoch": 0.36803601575689365, + "grad_norm": 0.5450024514176801, + "learning_rate": 4.874843554443054e-05, + "loss": 0.4443, + "step": 327 + }, + { + "epoch": 0.3691615081598199, + "grad_norm": 0.4022020669388415, + "learning_rate": 4.872757613683772e-05, + "loss": 0.4447, + "step": 328 + }, + { + "epoch": 0.3702870005627462, + "grad_norm": 0.5015493685184448, + "learning_rate": 4.8706716729244894e-05, + "loss": 0.4805, + "step": 329 + }, + { + "epoch": 0.37141249296567247, + "grad_norm": 0.5205564906682338, + "learning_rate": 4.8685857321652064e-05, + "loss": 0.4727, + "step": 330 + }, + { + "epoch": 0.3725379853685988, + "grad_norm": 0.48440539535018856, + "learning_rate": 4.866499791405924e-05, + "loss": 0.4789, + "step": 331 + }, + { + "epoch": 0.37366347777152503, + "grad_norm": 0.5749114846228484, + "learning_rate": 4.864413850646642e-05, + "loss": 0.4452, + "step": 332 + }, + { + "epoch": 0.37478897017445134, + "grad_norm": 0.5347894114064871, + "learning_rate": 4.8623279098873595e-05, + "loss": 0.466, + "step": 333 + }, + { + "epoch": 0.3759144625773776, + "grad_norm": 0.5219184880333937, + "learning_rate": 4.860241969128077e-05, + "loss": 0.4297, + "step": 334 + }, + { + "epoch": 0.3770399549803039, + "grad_norm": 0.4540713981584441, + "learning_rate": 4.858156028368794e-05, + "loss": 0.4716, + "step": 335 + }, + { + "epoch": 0.37816544738323016, + "grad_norm": 0.65356051165203, + "learning_rate": 4.856070087609512e-05, + "loss": 0.4709, + "step": 336 + }, + { + "epoch": 0.37929093978615647, + "grad_norm": 0.4310409303487337, + "learning_rate": 4.8539841468502296e-05, + "loss": 0.4931, + "step": 337 + }, + { + "epoch": 0.3804164321890827, + "grad_norm": 0.7081733344034011, + "learning_rate": 4.851898206090947e-05, + "loss": 0.4787, + "step": 338 + }, + { + "epoch": 0.38154192459200903, + "grad_norm": 0.4155006654801174, + "learning_rate": 4.8498122653316644e-05, + "loss": 0.4547, + "step": 339 + }, + { + "epoch": 0.3826674169949353, + "grad_norm": 0.6789073369172818, + "learning_rate": 4.847726324572383e-05, + "loss": 0.4526, + "step": 340 + }, + { + "epoch": 0.38379290939786154, + "grad_norm": 0.4700941252868989, + "learning_rate": 4.8456403838131e-05, + "loss": 0.4585, + "step": 341 + }, + { + "epoch": 0.38491840180078785, + "grad_norm": 0.5721349237605509, + "learning_rate": 4.8435544430538175e-05, + "loss": 0.4305, + "step": 342 + }, + { + "epoch": 0.3860438942037141, + "grad_norm": 0.6679337639014323, + "learning_rate": 4.841468502294535e-05, + "loss": 0.4727, + "step": 343 + }, + { + "epoch": 0.3871693866066404, + "grad_norm": 0.6136310708197658, + "learning_rate": 4.839382561535253e-05, + "loss": 0.4632, + "step": 344 + }, + { + "epoch": 0.38829487900956666, + "grad_norm": 0.560884620312814, + "learning_rate": 4.83729662077597e-05, + "loss": 0.4438, + "step": 345 + }, + { + "epoch": 0.38942037141249297, + "grad_norm": 0.6098279474363337, + "learning_rate": 4.835210680016688e-05, + "loss": 0.4407, + "step": 346 + }, + { + "epoch": 0.3905458638154192, + "grad_norm": 0.5154661466104475, + "learning_rate": 4.833124739257405e-05, + "loss": 0.446, + "step": 347 + }, + { + "epoch": 0.39167135621834553, + "grad_norm": 0.7038644519573083, + "learning_rate": 4.831038798498123e-05, + "loss": 0.4646, + "step": 348 + }, + { + "epoch": 0.3927968486212718, + "grad_norm": 0.5207244024602116, + "learning_rate": 4.828952857738841e-05, + "loss": 0.4476, + "step": 349 + }, + { + "epoch": 0.3939223410241981, + "grad_norm": 0.5084794430652734, + "learning_rate": 4.8268669169795584e-05, + "loss": 0.4478, + "step": 350 + }, + { + "epoch": 0.39504783342712435, + "grad_norm": 0.5120708559114392, + "learning_rate": 4.8247809762202754e-05, + "loss": 0.4622, + "step": 351 + }, + { + "epoch": 0.39617332583005066, + "grad_norm": 0.6265345182325885, + "learning_rate": 4.822695035460993e-05, + "loss": 0.4706, + "step": 352 + }, + { + "epoch": 0.3972988182329769, + "grad_norm": 0.5750184584832099, + "learning_rate": 4.820609094701711e-05, + "loss": 0.4882, + "step": 353 + }, + { + "epoch": 0.3984243106359032, + "grad_norm": 0.5490268536187386, + "learning_rate": 4.818523153942428e-05, + "loss": 0.4601, + "step": 354 + }, + { + "epoch": 0.3995498030388295, + "grad_norm": 0.5317594975111523, + "learning_rate": 4.816437213183146e-05, + "loss": 0.4587, + "step": 355 + }, + { + "epoch": 0.4006752954417558, + "grad_norm": 0.49454469908724474, + "learning_rate": 4.814351272423863e-05, + "loss": 0.4559, + "step": 356 + }, + { + "epoch": 0.40180078784468204, + "grad_norm": 0.5764930655203424, + "learning_rate": 4.812265331664581e-05, + "loss": 0.4677, + "step": 357 + }, + { + "epoch": 0.40292628024760835, + "grad_norm": 0.4254928567571142, + "learning_rate": 4.8101793909052986e-05, + "loss": 0.4468, + "step": 358 + }, + { + "epoch": 0.4040517726505346, + "grad_norm": 0.5563885643276592, + "learning_rate": 4.808093450146016e-05, + "loss": 0.4333, + "step": 359 + }, + { + "epoch": 0.4051772650534609, + "grad_norm": 0.4150167393933345, + "learning_rate": 4.806007509386733e-05, + "loss": 0.4515, + "step": 360 + }, + { + "epoch": 0.40630275745638716, + "grad_norm": 0.4751390092870927, + "learning_rate": 4.803921568627452e-05, + "loss": 0.4776, + "step": 361 + }, + { + "epoch": 0.4074282498593135, + "grad_norm": 0.43506496230734293, + "learning_rate": 4.801835627868169e-05, + "loss": 0.4533, + "step": 362 + }, + { + "epoch": 0.4085537422622397, + "grad_norm": 0.4488704280500811, + "learning_rate": 4.7997496871088864e-05, + "loss": 0.4555, + "step": 363 + }, + { + "epoch": 0.40967923466516604, + "grad_norm": 0.4888805568606264, + "learning_rate": 4.797663746349604e-05, + "loss": 0.4529, + "step": 364 + }, + { + "epoch": 0.4108047270680923, + "grad_norm": 0.47826665202586255, + "learning_rate": 4.795577805590322e-05, + "loss": 0.4565, + "step": 365 + }, + { + "epoch": 0.41193021947101854, + "grad_norm": 0.427518205471488, + "learning_rate": 4.793491864831039e-05, + "loss": 0.4622, + "step": 366 + }, + { + "epoch": 0.41305571187394485, + "grad_norm": 0.4657645019409657, + "learning_rate": 4.7914059240717565e-05, + "loss": 0.4731, + "step": 367 + }, + { + "epoch": 0.4141812042768711, + "grad_norm": 0.4980798737202691, + "learning_rate": 4.789319983312474e-05, + "loss": 0.4496, + "step": 368 + }, + { + "epoch": 0.4153066966797974, + "grad_norm": 0.4106122018359277, + "learning_rate": 4.787234042553192e-05, + "loss": 0.4537, + "step": 369 + }, + { + "epoch": 0.41643218908272367, + "grad_norm": 0.46050479994884624, + "learning_rate": 4.785148101793909e-05, + "loss": 0.4434, + "step": 370 + }, + { + "epoch": 0.41755768148565, + "grad_norm": 0.5221350337150601, + "learning_rate": 4.783062161034627e-05, + "loss": 0.458, + "step": 371 + }, + { + "epoch": 0.41868317388857623, + "grad_norm": 0.44101630868691777, + "learning_rate": 4.780976220275344e-05, + "loss": 0.4614, + "step": 372 + }, + { + "epoch": 0.41980866629150254, + "grad_norm": 0.5767546396305836, + "learning_rate": 4.778890279516062e-05, + "loss": 0.4829, + "step": 373 + }, + { + "epoch": 0.4209341586944288, + "grad_norm": 0.47996101159798066, + "learning_rate": 4.77680433875678e-05, + "loss": 0.4569, + "step": 374 + }, + { + "epoch": 0.4220596510973551, + "grad_norm": 0.5033820590275159, + "learning_rate": 4.774718397997497e-05, + "loss": 0.453, + "step": 375 + }, + { + "epoch": 0.42318514350028136, + "grad_norm": 0.4218904797929267, + "learning_rate": 4.7726324572382145e-05, + "loss": 0.4936, + "step": 376 + }, + { + "epoch": 0.42431063590320767, + "grad_norm": 0.41845341895601695, + "learning_rate": 4.770546516478932e-05, + "loss": 0.4371, + "step": 377 + }, + { + "epoch": 0.4254361283061339, + "grad_norm": 0.38945322748226296, + "learning_rate": 4.76846057571965e-05, + "loss": 0.4391, + "step": 378 + }, + { + "epoch": 0.42656162070906023, + "grad_norm": 0.4161185110299941, + "learning_rate": 4.766374634960367e-05, + "loss": 0.4485, + "step": 379 + }, + { + "epoch": 0.4276871131119865, + "grad_norm": 0.3864477310593941, + "learning_rate": 4.764288694201085e-05, + "loss": 0.4543, + "step": 380 + }, + { + "epoch": 0.4288126055149128, + "grad_norm": 0.48714053443872946, + "learning_rate": 4.762202753441802e-05, + "loss": 0.4809, + "step": 381 + }, + { + "epoch": 0.42993809791783905, + "grad_norm": 0.44027599652018634, + "learning_rate": 4.76011681268252e-05, + "loss": 0.4664, + "step": 382 + }, + { + "epoch": 0.43106359032076536, + "grad_norm": 0.46260149439173776, + "learning_rate": 4.758030871923238e-05, + "loss": 0.4358, + "step": 383 + }, + { + "epoch": 0.4321890827236916, + "grad_norm": 0.37340303441017136, + "learning_rate": 4.7559449311639554e-05, + "loss": 0.4444, + "step": 384 + }, + { + "epoch": 0.4333145751266179, + "grad_norm": 0.4414988549473453, + "learning_rate": 4.7538589904046724e-05, + "loss": 0.4338, + "step": 385 + }, + { + "epoch": 0.43444006752954417, + "grad_norm": 0.4002550060272223, + "learning_rate": 4.751773049645391e-05, + "loss": 0.4494, + "step": 386 + }, + { + "epoch": 0.4355655599324705, + "grad_norm": 0.4158146887262931, + "learning_rate": 4.749687108886108e-05, + "loss": 0.4454, + "step": 387 + }, + { + "epoch": 0.43669105233539673, + "grad_norm": 0.35977608941282263, + "learning_rate": 4.7476011681268255e-05, + "loss": 0.4533, + "step": 388 + }, + { + "epoch": 0.43781654473832304, + "grad_norm": 0.4764697673218214, + "learning_rate": 4.745515227367543e-05, + "loss": 0.4835, + "step": 389 + }, + { + "epoch": 0.4389420371412493, + "grad_norm": 0.35081968018481574, + "learning_rate": 4.743429286608261e-05, + "loss": 0.4579, + "step": 390 + }, + { + "epoch": 0.44006752954417555, + "grad_norm": 0.4170219497175011, + "learning_rate": 4.741343345848978e-05, + "loss": 0.4398, + "step": 391 + }, + { + "epoch": 0.44119302194710186, + "grad_norm": 0.44456892241843987, + "learning_rate": 4.739257405089696e-05, + "loss": 0.4805, + "step": 392 + }, + { + "epoch": 0.4423185143500281, + "grad_norm": 0.3810686961824828, + "learning_rate": 4.737171464330413e-05, + "loss": 0.4687, + "step": 393 + }, + { + "epoch": 0.4434440067529544, + "grad_norm": 0.4459516182156416, + "learning_rate": 4.73508552357113e-05, + "loss": 0.4441, + "step": 394 + }, + { + "epoch": 0.4445694991558807, + "grad_norm": 0.36574072948327524, + "learning_rate": 4.732999582811849e-05, + "loss": 0.4759, + "step": 395 + }, + { + "epoch": 0.445694991558807, + "grad_norm": 0.46519670122225776, + "learning_rate": 4.730913642052566e-05, + "loss": 0.4622, + "step": 396 + }, + { + "epoch": 0.44682048396173324, + "grad_norm": 0.3782284810519757, + "learning_rate": 4.7288277012932834e-05, + "loss": 0.4518, + "step": 397 + }, + { + "epoch": 0.44794597636465955, + "grad_norm": 0.4321697226823169, + "learning_rate": 4.726741760534001e-05, + "loss": 0.4213, + "step": 398 + }, + { + "epoch": 0.4490714687675858, + "grad_norm": 0.3846389059595841, + "learning_rate": 4.724655819774719e-05, + "loss": 0.4473, + "step": 399 + }, + { + "epoch": 0.4501969611705121, + "grad_norm": 0.4148349323542458, + "learning_rate": 4.722569879015436e-05, + "loss": 0.447, + "step": 400 + }, + { + "epoch": 0.45132245357343836, + "grad_norm": 0.3987423433461808, + "learning_rate": 4.720483938256154e-05, + "loss": 0.4428, + "step": 401 + }, + { + "epoch": 0.4524479459763647, + "grad_norm": 0.42246987876628445, + "learning_rate": 4.718397997496871e-05, + "loss": 0.4456, + "step": 402 + }, + { + "epoch": 0.4535734383792909, + "grad_norm": 0.4060448399568812, + "learning_rate": 4.716312056737589e-05, + "loss": 0.4734, + "step": 403 + }, + { + "epoch": 0.45469893078221724, + "grad_norm": 0.38939419691921573, + "learning_rate": 4.7142261159783066e-05, + "loss": 0.4506, + "step": 404 + }, + { + "epoch": 0.4558244231851435, + "grad_norm": 0.39441558158161155, + "learning_rate": 4.712140175219024e-05, + "loss": 0.4611, + "step": 405 + }, + { + "epoch": 0.4569499155880698, + "grad_norm": 0.37043790127930454, + "learning_rate": 4.7100542344597413e-05, + "loss": 0.4446, + "step": 406 + }, + { + "epoch": 0.45807540799099605, + "grad_norm": 0.39081323070794516, + "learning_rate": 4.707968293700459e-05, + "loss": 0.4586, + "step": 407 + }, + { + "epoch": 0.45920090039392236, + "grad_norm": 0.38815613346341743, + "learning_rate": 4.705882352941177e-05, + "loss": 0.4731, + "step": 408 + }, + { + "epoch": 0.4603263927968486, + "grad_norm": 0.4081757852974463, + "learning_rate": 4.7037964121818944e-05, + "loss": 0.4392, + "step": 409 + }, + { + "epoch": 0.4614518851997749, + "grad_norm": 0.3789626983185206, + "learning_rate": 4.7017104714226115e-05, + "loss": 0.4561, + "step": 410 + }, + { + "epoch": 0.4625773776027012, + "grad_norm": 0.5000455667230893, + "learning_rate": 4.69962453066333e-05, + "loss": 0.4642, + "step": 411 + }, + { + "epoch": 0.4637028700056275, + "grad_norm": 0.3422337438589666, + "learning_rate": 4.697538589904047e-05, + "loss": 0.4592, + "step": 412 + }, + { + "epoch": 0.46482836240855374, + "grad_norm": 0.5638947084171662, + "learning_rate": 4.6954526491447646e-05, + "loss": 0.451, + "step": 413 + }, + { + "epoch": 0.46595385481148, + "grad_norm": 0.38536737227105394, + "learning_rate": 4.693366708385482e-05, + "loss": 0.4386, + "step": 414 + }, + { + "epoch": 0.4670793472144063, + "grad_norm": 0.46615900085704925, + "learning_rate": 4.691280767626199e-05, + "loss": 0.4572, + "step": 415 + }, + { + "epoch": 0.46820483961733256, + "grad_norm": 0.45954601145736806, + "learning_rate": 4.689194826866917e-05, + "loss": 0.4745, + "step": 416 + }, + { + "epoch": 0.46933033202025887, + "grad_norm": 0.3925870159696147, + "learning_rate": 4.687108886107635e-05, + "loss": 0.4383, + "step": 417 + }, + { + "epoch": 0.4704558244231851, + "grad_norm": 0.4232172013685177, + "learning_rate": 4.6850229453483524e-05, + "loss": 0.4455, + "step": 418 + }, + { + "epoch": 0.47158131682611143, + "grad_norm": 0.4709258500108095, + "learning_rate": 4.6829370045890694e-05, + "loss": 0.4329, + "step": 419 + }, + { + "epoch": 0.4727068092290377, + "grad_norm": 0.5478084541084817, + "learning_rate": 4.680851063829788e-05, + "loss": 0.4778, + "step": 420 + }, + { + "epoch": 0.473832301631964, + "grad_norm": 0.39060968027365583, + "learning_rate": 4.678765123070505e-05, + "loss": 0.446, + "step": 421 + }, + { + "epoch": 0.47495779403489025, + "grad_norm": 0.43252322301543766, + "learning_rate": 4.6766791823112225e-05, + "loss": 0.4606, + "step": 422 + }, + { + "epoch": 0.47608328643781656, + "grad_norm": 0.48537861169690405, + "learning_rate": 4.67459324155194e-05, + "loss": 0.4845, + "step": 423 + }, + { + "epoch": 0.4772087788407428, + "grad_norm": 0.34601404275255593, + "learning_rate": 4.672507300792658e-05, + "loss": 0.4357, + "step": 424 + }, + { + "epoch": 0.4783342712436691, + "grad_norm": 0.42339913946057167, + "learning_rate": 4.670421360033375e-05, + "loss": 0.4421, + "step": 425 + }, + { + "epoch": 0.47945976364659537, + "grad_norm": 0.39857659754496044, + "learning_rate": 4.668335419274093e-05, + "loss": 0.4448, + "step": 426 + }, + { + "epoch": 0.4805852560495217, + "grad_norm": 0.38982322860737306, + "learning_rate": 4.66624947851481e-05, + "loss": 0.4449, + "step": 427 + }, + { + "epoch": 0.48171074845244793, + "grad_norm": 0.4167533082716713, + "learning_rate": 4.664163537755528e-05, + "loss": 0.4538, + "step": 428 + }, + { + "epoch": 0.48283624085537424, + "grad_norm": 0.38396785885791673, + "learning_rate": 4.662077596996246e-05, + "loss": 0.4665, + "step": 429 + }, + { + "epoch": 0.4839617332583005, + "grad_norm": 0.4460443959564988, + "learning_rate": 4.6599916562369634e-05, + "loss": 0.4695, + "step": 430 + }, + { + "epoch": 0.4850872256612268, + "grad_norm": 0.4307496077176856, + "learning_rate": 4.6579057154776804e-05, + "loss": 0.479, + "step": 431 + }, + { + "epoch": 0.48621271806415306, + "grad_norm": 0.4703944597323029, + "learning_rate": 4.655819774718399e-05, + "loss": 0.4616, + "step": 432 + }, + { + "epoch": 0.48733821046707937, + "grad_norm": 0.4532939669627873, + "learning_rate": 4.653733833959116e-05, + "loss": 0.4386, + "step": 433 + }, + { + "epoch": 0.4884637028700056, + "grad_norm": 0.38992923384312006, + "learning_rate": 4.651647893199833e-05, + "loss": 0.4403, + "step": 434 + }, + { + "epoch": 0.48958919527293193, + "grad_norm": 0.41316331078388, + "learning_rate": 4.649561952440551e-05, + "loss": 0.431, + "step": 435 + }, + { + "epoch": 0.4907146876758582, + "grad_norm": 0.36589748301197256, + "learning_rate": 4.647476011681268e-05, + "loss": 0.4487, + "step": 436 + }, + { + "epoch": 0.4918401800787845, + "grad_norm": 0.4790306414346754, + "learning_rate": 4.645390070921986e-05, + "loss": 0.465, + "step": 437 + }, + { + "epoch": 0.49296567248171075, + "grad_norm": 0.3884802942940033, + "learning_rate": 4.6433041301627036e-05, + "loss": 0.4488, + "step": 438 + }, + { + "epoch": 0.494091164884637, + "grad_norm": 0.43931802766911515, + "learning_rate": 4.641218189403421e-05, + "loss": 0.4844, + "step": 439 + }, + { + "epoch": 0.4952166572875633, + "grad_norm": 0.35209964530255544, + "learning_rate": 4.6391322486441383e-05, + "loss": 0.4442, + "step": 440 + }, + { + "epoch": 0.49634214969048956, + "grad_norm": 0.38004709563408534, + "learning_rate": 4.637046307884857e-05, + "loss": 0.4753, + "step": 441 + }, + { + "epoch": 0.4974676420934159, + "grad_norm": 0.3409798351543027, + "learning_rate": 4.634960367125574e-05, + "loss": 0.4342, + "step": 442 + }, + { + "epoch": 0.4985931344963421, + "grad_norm": 0.39326837683822974, + "learning_rate": 4.6328744263662914e-05, + "loss": 0.4539, + "step": 443 + }, + { + "epoch": 0.49971862689926844, + "grad_norm": 0.34187980768631865, + "learning_rate": 4.630788485607009e-05, + "loss": 0.4551, + "step": 444 + }, + { + "epoch": 0.5008441193021947, + "grad_norm": 0.3788406979843315, + "learning_rate": 4.628702544847727e-05, + "loss": 0.4536, + "step": 445 + }, + { + "epoch": 0.501969611705121, + "grad_norm": 0.37725539074157804, + "learning_rate": 4.626616604088444e-05, + "loss": 0.4391, + "step": 446 + }, + { + "epoch": 0.5030951041080473, + "grad_norm": 0.3294085025027009, + "learning_rate": 4.6245306633291616e-05, + "loss": 0.4702, + "step": 447 + }, + { + "epoch": 0.5042205965109735, + "grad_norm": 0.33534954747479645, + "learning_rate": 4.622444722569879e-05, + "loss": 0.437, + "step": 448 + }, + { + "epoch": 0.5053460889138999, + "grad_norm": 0.4077362372846078, + "learning_rate": 4.620358781810597e-05, + "loss": 0.4335, + "step": 449 + }, + { + "epoch": 0.5064715813168261, + "grad_norm": 0.39599735165417416, + "learning_rate": 4.618272841051314e-05, + "loss": 0.4428, + "step": 450 + }, + { + "epoch": 0.5075970737197524, + "grad_norm": 0.3481976377046397, + "learning_rate": 4.6161869002920323e-05, + "loss": 0.4344, + "step": 451 + }, + { + "epoch": 0.5087225661226786, + "grad_norm": 0.35576918023033427, + "learning_rate": 4.6141009595327494e-05, + "loss": 0.4343, + "step": 452 + }, + { + "epoch": 0.509848058525605, + "grad_norm": 0.4458349270928396, + "learning_rate": 4.612015018773467e-05, + "loss": 0.4418, + "step": 453 + }, + { + "epoch": 0.5109735509285313, + "grad_norm": 0.39940538094885114, + "learning_rate": 4.609929078014185e-05, + "loss": 0.4466, + "step": 454 + }, + { + "epoch": 0.5120990433314575, + "grad_norm": 0.401792941347298, + "learning_rate": 4.607843137254902e-05, + "loss": 0.4317, + "step": 455 + }, + { + "epoch": 0.5132245357343838, + "grad_norm": 0.3570336540962956, + "learning_rate": 4.6057571964956195e-05, + "loss": 0.4116, + "step": 456 + }, + { + "epoch": 0.5143500281373101, + "grad_norm": 0.4132726931400482, + "learning_rate": 4.603671255736337e-05, + "loss": 0.4451, + "step": 457 + }, + { + "epoch": 0.5154755205402364, + "grad_norm": 0.3450781738437834, + "learning_rate": 4.601585314977055e-05, + "loss": 0.4245, + "step": 458 + }, + { + "epoch": 0.5166010129431626, + "grad_norm": 0.4044295769667828, + "learning_rate": 4.599499374217772e-05, + "loss": 0.4261, + "step": 459 + }, + { + "epoch": 0.5177265053460889, + "grad_norm": 0.4460134360979799, + "learning_rate": 4.59741343345849e-05, + "loss": 0.4569, + "step": 460 + }, + { + "epoch": 0.5188519977490152, + "grad_norm": 0.35709408492200145, + "learning_rate": 4.595327492699207e-05, + "loss": 0.447, + "step": 461 + }, + { + "epoch": 0.5199774901519415, + "grad_norm": 0.49622852545171614, + "learning_rate": 4.593241551939925e-05, + "loss": 0.4482, + "step": 462 + }, + { + "epoch": 0.5211029825548678, + "grad_norm": 0.43774205674931815, + "learning_rate": 4.591155611180643e-05, + "loss": 0.4447, + "step": 463 + }, + { + "epoch": 0.522228474957794, + "grad_norm": 0.4071440395299347, + "learning_rate": 4.5890696704213604e-05, + "loss": 0.428, + "step": 464 + }, + { + "epoch": 0.5233539673607203, + "grad_norm": 0.42816990064501337, + "learning_rate": 4.5869837296620774e-05, + "loss": 0.4264, + "step": 465 + }, + { + "epoch": 0.5244794597636466, + "grad_norm": 0.4003821811209746, + "learning_rate": 4.584897788902796e-05, + "loss": 0.4502, + "step": 466 + }, + { + "epoch": 0.5256049521665729, + "grad_norm": 0.4306225774416482, + "learning_rate": 4.582811848143513e-05, + "loss": 0.448, + "step": 467 + }, + { + "epoch": 0.5267304445694991, + "grad_norm": 0.438000243330178, + "learning_rate": 4.5807259073842305e-05, + "loss": 0.4461, + "step": 468 + }, + { + "epoch": 0.5278559369724254, + "grad_norm": 0.5459912564891531, + "learning_rate": 4.578639966624948e-05, + "loss": 0.4454, + "step": 469 + }, + { + "epoch": 0.5289814293753518, + "grad_norm": 0.3937540607846447, + "learning_rate": 4.576554025865666e-05, + "loss": 0.4511, + "step": 470 + }, + { + "epoch": 0.530106921778278, + "grad_norm": 0.5255901368328048, + "learning_rate": 4.574468085106383e-05, + "loss": 0.4656, + "step": 471 + }, + { + "epoch": 0.5312324141812043, + "grad_norm": 0.37290888598540667, + "learning_rate": 4.572382144347101e-05, + "loss": 0.4346, + "step": 472 + }, + { + "epoch": 0.5323579065841305, + "grad_norm": 0.5151271720318875, + "learning_rate": 4.570296203587818e-05, + "loss": 0.4386, + "step": 473 + }, + { + "epoch": 0.5334833989870569, + "grad_norm": 0.5196455814853196, + "learning_rate": 4.568210262828536e-05, + "loss": 0.4281, + "step": 474 + }, + { + "epoch": 0.5346088913899831, + "grad_norm": 0.5516694216088329, + "learning_rate": 4.566124322069254e-05, + "loss": 0.4678, + "step": 475 + }, + { + "epoch": 0.5357343837929094, + "grad_norm": 0.40935239231865317, + "learning_rate": 4.564038381309971e-05, + "loss": 0.4392, + "step": 476 + }, + { + "epoch": 0.5368598761958356, + "grad_norm": 0.4232251188780467, + "learning_rate": 4.5619524405506884e-05, + "loss": 0.4541, + "step": 477 + }, + { + "epoch": 0.537985368598762, + "grad_norm": 0.47065714592515695, + "learning_rate": 4.559866499791406e-05, + "loss": 0.4573, + "step": 478 + }, + { + "epoch": 0.5391108610016883, + "grad_norm": 0.45139662325934604, + "learning_rate": 4.557780559032124e-05, + "loss": 0.466, + "step": 479 + }, + { + "epoch": 0.5402363534046145, + "grad_norm": 0.43277954798040297, + "learning_rate": 4.555694618272841e-05, + "loss": 0.4395, + "step": 480 + }, + { + "epoch": 0.5413618458075408, + "grad_norm": 0.3937402339467187, + "learning_rate": 4.553608677513559e-05, + "loss": 0.4598, + "step": 481 + }, + { + "epoch": 0.5424873382104671, + "grad_norm": 0.43149504891962365, + "learning_rate": 4.551522736754276e-05, + "loss": 0.4292, + "step": 482 + }, + { + "epoch": 0.5436128306133934, + "grad_norm": 0.3833426447527127, + "learning_rate": 4.549436795994994e-05, + "loss": 0.4462, + "step": 483 + }, + { + "epoch": 0.5447383230163196, + "grad_norm": 0.5753891830767618, + "learning_rate": 4.5473508552357116e-05, + "loss": 0.4674, + "step": 484 + }, + { + "epoch": 0.5458638154192459, + "grad_norm": 0.37095342775133894, + "learning_rate": 4.5452649144764293e-05, + "loss": 0.4502, + "step": 485 + }, + { + "epoch": 0.5469893078221723, + "grad_norm": 0.44452090514956777, + "learning_rate": 4.5431789737171464e-05, + "loss": 0.4195, + "step": 486 + }, + { + "epoch": 0.5481148002250985, + "grad_norm": 0.39266318915026655, + "learning_rate": 4.541093032957864e-05, + "loss": 0.4308, + "step": 487 + }, + { + "epoch": 0.5492402926280248, + "grad_norm": 0.38579575811998595, + "learning_rate": 4.539007092198582e-05, + "loss": 0.4732, + "step": 488 + }, + { + "epoch": 0.550365785030951, + "grad_norm": 0.3927716846528752, + "learning_rate": 4.5369211514392995e-05, + "loss": 0.4554, + "step": 489 + }, + { + "epoch": 0.5514912774338773, + "grad_norm": 0.3518677438969378, + "learning_rate": 4.5348352106800165e-05, + "loss": 0.435, + "step": 490 + }, + { + "epoch": 0.5526167698368036, + "grad_norm": 0.3989470078182982, + "learning_rate": 4.532749269920735e-05, + "loss": 0.4581, + "step": 491 + }, + { + "epoch": 0.5537422622397299, + "grad_norm": 0.31132596342015495, + "learning_rate": 4.530663329161452e-05, + "loss": 0.4166, + "step": 492 + }, + { + "epoch": 0.5548677546426561, + "grad_norm": 0.42773832735938333, + "learning_rate": 4.5285773884021696e-05, + "loss": 0.4498, + "step": 493 + }, + { + "epoch": 0.5559932470455824, + "grad_norm": 0.3337455720428052, + "learning_rate": 4.526491447642887e-05, + "loss": 0.4959, + "step": 494 + }, + { + "epoch": 0.5571187394485088, + "grad_norm": 0.3784028479466481, + "learning_rate": 4.524405506883605e-05, + "loss": 0.4528, + "step": 495 + }, + { + "epoch": 0.558244231851435, + "grad_norm": 0.3649036934635355, + "learning_rate": 4.522319566124322e-05, + "loss": 0.4382, + "step": 496 + }, + { + "epoch": 0.5593697242543613, + "grad_norm": 0.37624738124672374, + "learning_rate": 4.52023362536504e-05, + "loss": 0.444, + "step": 497 + }, + { + "epoch": 0.5604952166572875, + "grad_norm": 0.41375280657115326, + "learning_rate": 4.5181476846057574e-05, + "loss": 0.445, + "step": 498 + }, + { + "epoch": 0.5616207090602139, + "grad_norm": 0.4473059694404265, + "learning_rate": 4.5160617438464744e-05, + "loss": 0.425, + "step": 499 + }, + { + "epoch": 0.5627462014631401, + "grad_norm": 0.37225084914483775, + "learning_rate": 4.513975803087193e-05, + "loss": 0.441, + "step": 500 + }, + { + "epoch": 0.5638716938660664, + "grad_norm": 0.3940588853331884, + "learning_rate": 4.51188986232791e-05, + "loss": 0.4466, + "step": 501 + }, + { + "epoch": 0.5649971862689926, + "grad_norm": 0.3470104737718654, + "learning_rate": 4.5098039215686275e-05, + "loss": 0.4474, + "step": 502 + }, + { + "epoch": 0.566122678671919, + "grad_norm": 0.4164834773144051, + "learning_rate": 4.507717980809345e-05, + "loss": 0.4442, + "step": 503 + }, + { + "epoch": 0.5672481710748453, + "grad_norm": 0.3652420299854053, + "learning_rate": 4.505632040050063e-05, + "loss": 0.4436, + "step": 504 + }, + { + "epoch": 0.5683736634777715, + "grad_norm": 0.4103075119748004, + "learning_rate": 4.50354609929078e-05, + "loss": 0.4459, + "step": 505 + }, + { + "epoch": 0.5694991558806978, + "grad_norm": 0.39102170524673335, + "learning_rate": 4.501460158531498e-05, + "loss": 0.4268, + "step": 506 + }, + { + "epoch": 0.5706246482836241, + "grad_norm": 0.4942727267066722, + "learning_rate": 4.499374217772215e-05, + "loss": 0.4722, + "step": 507 + }, + { + "epoch": 0.5717501406865504, + "grad_norm": 0.3465319015459766, + "learning_rate": 4.497288277012933e-05, + "loss": 0.4408, + "step": 508 + }, + { + "epoch": 0.5728756330894766, + "grad_norm": 0.4074806411985911, + "learning_rate": 4.495202336253651e-05, + "loss": 0.4212, + "step": 509 + }, + { + "epoch": 0.5740011254924029, + "grad_norm": 0.38192085376045243, + "learning_rate": 4.4931163954943684e-05, + "loss": 0.41, + "step": 510 + }, + { + "epoch": 0.5751266178953293, + "grad_norm": 0.3702590158057979, + "learning_rate": 4.4910304547350854e-05, + "loss": 0.4255, + "step": 511 + }, + { + "epoch": 0.5762521102982555, + "grad_norm": 0.3436403538534127, + "learning_rate": 4.488944513975804e-05, + "loss": 0.4538, + "step": 512 + }, + { + "epoch": 0.5773776027011818, + "grad_norm": 0.3877342893162592, + "learning_rate": 4.486858573216521e-05, + "loss": 0.4182, + "step": 513 + }, + { + "epoch": 0.578503095104108, + "grad_norm": 0.3460201187876074, + "learning_rate": 4.4847726324572385e-05, + "loss": 0.4432, + "step": 514 + }, + { + "epoch": 0.5796285875070343, + "grad_norm": 0.34511398785310915, + "learning_rate": 4.482686691697956e-05, + "loss": 0.4469, + "step": 515 + }, + { + "epoch": 0.5807540799099606, + "grad_norm": 0.4258487344474797, + "learning_rate": 4.480600750938674e-05, + "loss": 0.4583, + "step": 516 + }, + { + "epoch": 0.5818795723128869, + "grad_norm": 0.36803297271961477, + "learning_rate": 4.478514810179391e-05, + "loss": 0.4428, + "step": 517 + }, + { + "epoch": 0.5830050647158131, + "grad_norm": 0.46401852203645827, + "learning_rate": 4.4764288694201086e-05, + "loss": 0.4456, + "step": 518 + }, + { + "epoch": 0.5841305571187394, + "grad_norm": 0.39205048802946624, + "learning_rate": 4.4743429286608263e-05, + "loss": 0.441, + "step": 519 + }, + { + "epoch": 0.5852560495216658, + "grad_norm": 0.39757611365031714, + "learning_rate": 4.4722569879015434e-05, + "loss": 0.4428, + "step": 520 + }, + { + "epoch": 0.586381541924592, + "grad_norm": 0.3647536671953435, + "learning_rate": 4.470171047142262e-05, + "loss": 0.4513, + "step": 521 + }, + { + "epoch": 0.5875070343275183, + "grad_norm": 0.39429072510874175, + "learning_rate": 4.468085106382979e-05, + "loss": 0.4132, + "step": 522 + }, + { + "epoch": 0.5886325267304445, + "grad_norm": 0.40901642747342404, + "learning_rate": 4.4659991656236965e-05, + "loss": 0.4481, + "step": 523 + }, + { + "epoch": 0.5897580191333709, + "grad_norm": 0.3992749524524198, + "learning_rate": 4.463913224864414e-05, + "loss": 0.4468, + "step": 524 + }, + { + "epoch": 0.5908835115362971, + "grad_norm": 0.4722275927889856, + "learning_rate": 4.461827284105132e-05, + "loss": 0.4428, + "step": 525 + }, + { + "epoch": 0.5920090039392234, + "grad_norm": 0.42866183958875864, + "learning_rate": 4.459741343345849e-05, + "loss": 0.4194, + "step": 526 + }, + { + "epoch": 0.5931344963421497, + "grad_norm": 0.38204868156886707, + "learning_rate": 4.4576554025865666e-05, + "loss": 0.4402, + "step": 527 + }, + { + "epoch": 0.594259988745076, + "grad_norm": 0.35148215802167393, + "learning_rate": 4.455569461827284e-05, + "loss": 0.4542, + "step": 528 + }, + { + "epoch": 0.5953854811480023, + "grad_norm": 0.40153400690617524, + "learning_rate": 4.453483521068002e-05, + "loss": 0.4102, + "step": 529 + }, + { + "epoch": 0.5965109735509285, + "grad_norm": 0.46986899886821576, + "learning_rate": 4.45139758030872e-05, + "loss": 0.4436, + "step": 530 + }, + { + "epoch": 0.5976364659538548, + "grad_norm": 0.35390475462960685, + "learning_rate": 4.4493116395494374e-05, + "loss": 0.4398, + "step": 531 + }, + { + "epoch": 0.5987619583567811, + "grad_norm": 0.4482185977258061, + "learning_rate": 4.4472256987901544e-05, + "loss": 0.4326, + "step": 532 + }, + { + "epoch": 0.5998874507597074, + "grad_norm": 0.44232865264761434, + "learning_rate": 4.445139758030872e-05, + "loss": 0.4325, + "step": 533 + }, + { + "epoch": 0.6010129431626337, + "grad_norm": 0.4183843016810463, + "learning_rate": 4.44305381727159e-05, + "loss": 0.4553, + "step": 534 + }, + { + "epoch": 0.6021384355655599, + "grad_norm": 0.4242250812536985, + "learning_rate": 4.4409678765123075e-05, + "loss": 0.4232, + "step": 535 + }, + { + "epoch": 0.6032639279684862, + "grad_norm": 0.3888142076123292, + "learning_rate": 4.4388819357530245e-05, + "loss": 0.4241, + "step": 536 + }, + { + "epoch": 0.6043894203714125, + "grad_norm": 0.40486855004609845, + "learning_rate": 4.436795994993743e-05, + "loss": 0.4191, + "step": 537 + }, + { + "epoch": 0.6055149127743388, + "grad_norm": 0.47154131963320084, + "learning_rate": 4.43471005423446e-05, + "loss": 0.4595, + "step": 538 + }, + { + "epoch": 0.606640405177265, + "grad_norm": 0.38490507840256083, + "learning_rate": 4.432624113475177e-05, + "loss": 0.4199, + "step": 539 + }, + { + "epoch": 0.6077658975801913, + "grad_norm": 0.46096486506497264, + "learning_rate": 4.430538172715895e-05, + "loss": 0.4448, + "step": 540 + }, + { + "epoch": 0.6088913899831176, + "grad_norm": 0.4947895759240074, + "learning_rate": 4.428452231956612e-05, + "loss": 0.4342, + "step": 541 + }, + { + "epoch": 0.6100168823860439, + "grad_norm": 0.3829854963511767, + "learning_rate": 4.42636629119733e-05, + "loss": 0.4186, + "step": 542 + }, + { + "epoch": 0.6111423747889702, + "grad_norm": 0.6245507343869451, + "learning_rate": 4.424280350438048e-05, + "loss": 0.441, + "step": 543 + }, + { + "epoch": 0.6122678671918964, + "grad_norm": 0.5300235385565563, + "learning_rate": 4.4221944096787654e-05, + "loss": 0.4375, + "step": 544 + }, + { + "epoch": 0.6133933595948228, + "grad_norm": 0.4930881980261961, + "learning_rate": 4.4201084689194824e-05, + "loss": 0.4621, + "step": 545 + }, + { + "epoch": 0.614518851997749, + "grad_norm": 0.5638424830870375, + "learning_rate": 4.418022528160201e-05, + "loss": 0.4411, + "step": 546 + }, + { + "epoch": 0.6156443444006753, + "grad_norm": 0.3716115037856444, + "learning_rate": 4.415936587400918e-05, + "loss": 0.4528, + "step": 547 + }, + { + "epoch": 0.6167698368036015, + "grad_norm": 0.5223401927324024, + "learning_rate": 4.4138506466416355e-05, + "loss": 0.4327, + "step": 548 + }, + { + "epoch": 0.6178953292065279, + "grad_norm": 0.37311721165933265, + "learning_rate": 4.411764705882353e-05, + "loss": 0.4058, + "step": 549 + }, + { + "epoch": 0.6190208216094542, + "grad_norm": 0.532332931429002, + "learning_rate": 4.409678765123071e-05, + "loss": 0.4445, + "step": 550 + }, + { + "epoch": 0.6201463140123804, + "grad_norm": 0.5059754011866813, + "learning_rate": 4.407592824363788e-05, + "loss": 0.4257, + "step": 551 + }, + { + "epoch": 0.6212718064153067, + "grad_norm": 0.4904818838015066, + "learning_rate": 4.405506883604506e-05, + "loss": 0.4353, + "step": 552 + }, + { + "epoch": 0.622397298818233, + "grad_norm": 0.6200335434273374, + "learning_rate": 4.4034209428452233e-05, + "loss": 0.4416, + "step": 553 + }, + { + "epoch": 0.6235227912211593, + "grad_norm": 0.3199203022808196, + "learning_rate": 4.401335002085941e-05, + "loss": 0.4355, + "step": 554 + }, + { + "epoch": 0.6246482836240855, + "grad_norm": 0.5681807784529108, + "learning_rate": 4.399249061326659e-05, + "loss": 0.431, + "step": 555 + }, + { + "epoch": 0.6257737760270118, + "grad_norm": 0.3995337627796738, + "learning_rate": 4.3971631205673764e-05, + "loss": 0.4312, + "step": 556 + }, + { + "epoch": 0.6268992684299382, + "grad_norm": 0.5466993132659691, + "learning_rate": 4.3950771798080935e-05, + "loss": 0.4311, + "step": 557 + }, + { + "epoch": 0.6280247608328644, + "grad_norm": 0.5670240814298136, + "learning_rate": 4.392991239048811e-05, + "loss": 0.4564, + "step": 558 + }, + { + "epoch": 0.6291502532357907, + "grad_norm": 0.47107566738859724, + "learning_rate": 4.390905298289529e-05, + "loss": 0.4436, + "step": 559 + }, + { + "epoch": 0.6302757456387169, + "grad_norm": 0.5380491861675493, + "learning_rate": 4.388819357530246e-05, + "loss": 0.4024, + "step": 560 + }, + { + "epoch": 0.6314012380416432, + "grad_norm": 0.37407644137036594, + "learning_rate": 4.386733416770964e-05, + "loss": 0.4276, + "step": 561 + }, + { + "epoch": 0.6325267304445695, + "grad_norm": 0.5179459476960132, + "learning_rate": 4.384647476011681e-05, + "loss": 0.4231, + "step": 562 + }, + { + "epoch": 0.6336522228474958, + "grad_norm": 0.3832305989594554, + "learning_rate": 4.382561535252399e-05, + "loss": 0.4254, + "step": 563 + }, + { + "epoch": 0.634777715250422, + "grad_norm": 0.48824132268901227, + "learning_rate": 4.380475594493117e-05, + "loss": 0.4414, + "step": 564 + }, + { + "epoch": 0.6359032076533483, + "grad_norm": 0.45846104242587143, + "learning_rate": 4.3783896537338344e-05, + "loss": 0.4374, + "step": 565 + }, + { + "epoch": 0.6370287000562747, + "grad_norm": 0.5017380646906237, + "learning_rate": 4.3763037129745514e-05, + "loss": 0.4478, + "step": 566 + }, + { + "epoch": 0.6381541924592009, + "grad_norm": 0.4706523687823463, + "learning_rate": 4.374217772215269e-05, + "loss": 0.4393, + "step": 567 + }, + { + "epoch": 0.6392796848621272, + "grad_norm": 0.43746034371341663, + "learning_rate": 4.372131831455987e-05, + "loss": 0.4289, + "step": 568 + }, + { + "epoch": 0.6404051772650534, + "grad_norm": 0.4971311348473273, + "learning_rate": 4.3700458906967045e-05, + "loss": 0.4632, + "step": 569 + }, + { + "epoch": 0.6415306696679798, + "grad_norm": 0.32424868625443787, + "learning_rate": 4.367959949937422e-05, + "loss": 0.4439, + "step": 570 + }, + { + "epoch": 0.642656162070906, + "grad_norm": 0.5530000470387829, + "learning_rate": 4.36587400917814e-05, + "loss": 0.4438, + "step": 571 + }, + { + "epoch": 0.6437816544738323, + "grad_norm": 0.3619983421314401, + "learning_rate": 4.363788068418857e-05, + "loss": 0.4193, + "step": 572 + }, + { + "epoch": 0.6449071468767585, + "grad_norm": 0.46202193194933755, + "learning_rate": 4.3617021276595746e-05, + "loss": 0.4308, + "step": 573 + }, + { + "epoch": 0.6460326392796849, + "grad_norm": 0.4798799400653708, + "learning_rate": 4.359616186900292e-05, + "loss": 0.4072, + "step": 574 + }, + { + "epoch": 0.6471581316826112, + "grad_norm": 0.42761886423074474, + "learning_rate": 4.35753024614101e-05, + "loss": 0.4357, + "step": 575 + }, + { + "epoch": 0.6482836240855374, + "grad_norm": 0.4906300910854437, + "learning_rate": 4.355444305381727e-05, + "loss": 0.441, + "step": 576 + }, + { + "epoch": 0.6494091164884637, + "grad_norm": 0.4312074811449326, + "learning_rate": 4.3533583646224454e-05, + "loss": 0.468, + "step": 577 + }, + { + "epoch": 0.65053460889139, + "grad_norm": 0.4999437976070137, + "learning_rate": 4.3512724238631624e-05, + "loss": 0.4442, + "step": 578 + }, + { + "epoch": 0.6516601012943163, + "grad_norm": 0.45200142374904256, + "learning_rate": 4.34918648310388e-05, + "loss": 0.4351, + "step": 579 + }, + { + "epoch": 0.6527855936972425, + "grad_norm": 0.4481417460480344, + "learning_rate": 4.347100542344598e-05, + "loss": 0.4351, + "step": 580 + }, + { + "epoch": 0.6539110861001688, + "grad_norm": 0.416680484799885, + "learning_rate": 4.345014601585315e-05, + "loss": 0.4726, + "step": 581 + }, + { + "epoch": 0.6550365785030952, + "grad_norm": 0.45466741269285743, + "learning_rate": 4.3429286608260325e-05, + "loss": 0.4445, + "step": 582 + }, + { + "epoch": 0.6561620709060214, + "grad_norm": 0.3767132482639794, + "learning_rate": 4.34084272006675e-05, + "loss": 0.4494, + "step": 583 + }, + { + "epoch": 0.6572875633089477, + "grad_norm": 0.4045713565741537, + "learning_rate": 4.338756779307468e-05, + "loss": 0.4478, + "step": 584 + }, + { + "epoch": 0.6584130557118739, + "grad_norm": 0.41406546702832436, + "learning_rate": 4.336670838548185e-05, + "loss": 0.4296, + "step": 585 + }, + { + "epoch": 0.6595385481148002, + "grad_norm": 0.45192122020443987, + "learning_rate": 4.334584897788903e-05, + "loss": 0.4548, + "step": 586 + }, + { + "epoch": 0.6606640405177265, + "grad_norm": 0.42522165235824544, + "learning_rate": 4.3324989570296203e-05, + "loss": 0.4545, + "step": 587 + }, + { + "epoch": 0.6617895329206528, + "grad_norm": 0.4025019554306989, + "learning_rate": 4.330413016270338e-05, + "loss": 0.427, + "step": 588 + }, + { + "epoch": 0.662915025323579, + "grad_norm": 0.40092550396367915, + "learning_rate": 4.328327075511056e-05, + "loss": 0.4357, + "step": 589 + }, + { + "epoch": 0.6640405177265053, + "grad_norm": 0.4029073566780126, + "learning_rate": 4.3262411347517734e-05, + "loss": 0.4437, + "step": 590 + }, + { + "epoch": 0.6651660101294317, + "grad_norm": 0.3754421567640776, + "learning_rate": 4.3241551939924905e-05, + "loss": 0.4488, + "step": 591 + }, + { + "epoch": 0.6662915025323579, + "grad_norm": 0.4093131149759515, + "learning_rate": 4.322069253233209e-05, + "loss": 0.4296, + "step": 592 + }, + { + "epoch": 0.6674169949352842, + "grad_norm": 0.37396980661829454, + "learning_rate": 4.319983312473926e-05, + "loss": 0.4135, + "step": 593 + }, + { + "epoch": 0.6685424873382104, + "grad_norm": 0.39676170583430237, + "learning_rate": 4.3178973717146436e-05, + "loss": 0.4407, + "step": 594 + }, + { + "epoch": 0.6696679797411368, + "grad_norm": 0.3324304115520877, + "learning_rate": 4.315811430955361e-05, + "loss": 0.4272, + "step": 595 + }, + { + "epoch": 0.670793472144063, + "grad_norm": 0.41321744590045745, + "learning_rate": 4.313725490196079e-05, + "loss": 0.4535, + "step": 596 + }, + { + "epoch": 0.6719189645469893, + "grad_norm": 0.37423186701221084, + "learning_rate": 4.311639549436796e-05, + "loss": 0.4243, + "step": 597 + }, + { + "epoch": 0.6730444569499155, + "grad_norm": 0.34707644350816663, + "learning_rate": 4.309553608677514e-05, + "loss": 0.4224, + "step": 598 + }, + { + "epoch": 0.6741699493528419, + "grad_norm": 0.39162388441219653, + "learning_rate": 4.3074676679182314e-05, + "loss": 0.4117, + "step": 599 + }, + { + "epoch": 0.6752954417557682, + "grad_norm": 0.3757134091896751, + "learning_rate": 4.305381727158949e-05, + "loss": 0.4372, + "step": 600 + }, + { + "epoch": 0.6764209341586944, + "grad_norm": 0.486157183762819, + "learning_rate": 4.303295786399667e-05, + "loss": 0.4487, + "step": 601 + }, + { + "epoch": 0.6775464265616207, + "grad_norm": 0.34615222028756854, + "learning_rate": 4.301209845640384e-05, + "loss": 0.438, + "step": 602 + }, + { + "epoch": 0.678671918964547, + "grad_norm": 0.4148015924613456, + "learning_rate": 4.2991239048811015e-05, + "loss": 0.4545, + "step": 603 + }, + { + "epoch": 0.6797974113674733, + "grad_norm": 0.3870669252002002, + "learning_rate": 4.297037964121819e-05, + "loss": 0.4078, + "step": 604 + }, + { + "epoch": 0.6809229037703995, + "grad_norm": 0.31630147919989027, + "learning_rate": 4.294952023362537e-05, + "loss": 0.4179, + "step": 605 + }, + { + "epoch": 0.6820483961733258, + "grad_norm": 0.4078672238404797, + "learning_rate": 4.292866082603254e-05, + "loss": 0.4363, + "step": 606 + }, + { + "epoch": 0.6831738885762522, + "grad_norm": 0.38181818903469905, + "learning_rate": 4.2907801418439716e-05, + "loss": 0.4387, + "step": 607 + }, + { + "epoch": 0.6842993809791784, + "grad_norm": 0.40887483819289494, + "learning_rate": 4.288694201084689e-05, + "loss": 0.4279, + "step": 608 + }, + { + "epoch": 0.6854248733821047, + "grad_norm": 0.45835023477255316, + "learning_rate": 4.286608260325407e-05, + "loss": 0.4553, + "step": 609 + }, + { + "epoch": 0.6865503657850309, + "grad_norm": 0.4496240755238681, + "learning_rate": 4.284522319566125e-05, + "loss": 0.4511, + "step": 610 + }, + { + "epoch": 0.6876758581879572, + "grad_norm": 0.47923459811565877, + "learning_rate": 4.2824363788068424e-05, + "loss": 0.4494, + "step": 611 + }, + { + "epoch": 0.6888013505908835, + "grad_norm": 0.4563499971704832, + "learning_rate": 4.2803504380475594e-05, + "loss": 0.4498, + "step": 612 + }, + { + "epoch": 0.6899268429938098, + "grad_norm": 0.4658484510143094, + "learning_rate": 4.278264497288277e-05, + "loss": 0.446, + "step": 613 + }, + { + "epoch": 0.691052335396736, + "grad_norm": 0.40099697936257683, + "learning_rate": 4.276178556528995e-05, + "loss": 0.4138, + "step": 614 + }, + { + "epoch": 0.6921778277996623, + "grad_norm": 0.40681610293383885, + "learning_rate": 4.2740926157697125e-05, + "loss": 0.4428, + "step": 615 + }, + { + "epoch": 0.6933033202025887, + "grad_norm": 0.492856289321406, + "learning_rate": 4.2720066750104295e-05, + "loss": 0.429, + "step": 616 + }, + { + "epoch": 0.6944288126055149, + "grad_norm": 0.40198116454411964, + "learning_rate": 4.269920734251148e-05, + "loss": 0.4319, + "step": 617 + }, + { + "epoch": 0.6955543050084412, + "grad_norm": 0.4049661414838683, + "learning_rate": 4.267834793491865e-05, + "loss": 0.4371, + "step": 618 + }, + { + "epoch": 0.6966797974113674, + "grad_norm": 0.4200912676835283, + "learning_rate": 4.2657488527325826e-05, + "loss": 0.4273, + "step": 619 + }, + { + "epoch": 0.6978052898142938, + "grad_norm": 0.3579260644405867, + "learning_rate": 4.2636629119733e-05, + "loss": 0.436, + "step": 620 + }, + { + "epoch": 0.69893078221722, + "grad_norm": 0.41261145773033614, + "learning_rate": 4.261576971214018e-05, + "loss": 0.4355, + "step": 621 + }, + { + "epoch": 0.7000562746201463, + "grad_norm": 0.38195673870959623, + "learning_rate": 4.259491030454735e-05, + "loss": 0.4407, + "step": 622 + }, + { + "epoch": 0.7011817670230726, + "grad_norm": 0.47251318617526117, + "learning_rate": 4.257405089695453e-05, + "loss": 0.4434, + "step": 623 + }, + { + "epoch": 0.7023072594259989, + "grad_norm": 0.413024502756469, + "learning_rate": 4.2553191489361704e-05, + "loss": 0.4228, + "step": 624 + }, + { + "epoch": 0.7034327518289252, + "grad_norm": 0.4129659836054336, + "learning_rate": 4.2532332081768875e-05, + "loss": 0.4298, + "step": 625 + }, + { + "epoch": 0.7045582442318514, + "grad_norm": 0.4371192692750543, + "learning_rate": 4.251147267417606e-05, + "loss": 0.422, + "step": 626 + }, + { + "epoch": 0.7056837366347777, + "grad_norm": 0.3209464880480147, + "learning_rate": 4.249061326658323e-05, + "loss": 0.4159, + "step": 627 + }, + { + "epoch": 0.706809229037704, + "grad_norm": 0.38213551742408286, + "learning_rate": 4.2469753858990406e-05, + "loss": 0.4651, + "step": 628 + }, + { + "epoch": 0.7079347214406303, + "grad_norm": 0.37077014672780895, + "learning_rate": 4.244889445139758e-05, + "loss": 0.428, + "step": 629 + }, + { + "epoch": 0.7090602138435566, + "grad_norm": 0.37388919361570394, + "learning_rate": 4.242803504380476e-05, + "loss": 0.4487, + "step": 630 + }, + { + "epoch": 0.7101857062464828, + "grad_norm": 0.355919224811824, + "learning_rate": 4.240717563621193e-05, + "loss": 0.4273, + "step": 631 + }, + { + "epoch": 0.7113111986494092, + "grad_norm": 0.3479874917806637, + "learning_rate": 4.2386316228619114e-05, + "loss": 0.4298, + "step": 632 + }, + { + "epoch": 0.7124366910523354, + "grad_norm": 0.39097161117850043, + "learning_rate": 4.2365456821026284e-05, + "loss": 0.4251, + "step": 633 + }, + { + "epoch": 0.7135621834552617, + "grad_norm": 0.39131656322095426, + "learning_rate": 4.234459741343346e-05, + "loss": 0.4261, + "step": 634 + }, + { + "epoch": 0.7146876758581879, + "grad_norm": 1.9337556498338822, + "learning_rate": 4.232373800584064e-05, + "loss": 0.4553, + "step": 635 + }, + { + "epoch": 0.7158131682611142, + "grad_norm": 0.7715880476594418, + "learning_rate": 4.2302878598247815e-05, + "loss": 0.4349, + "step": 636 + }, + { + "epoch": 0.7169386606640406, + "grad_norm": 0.4198490504250616, + "learning_rate": 4.2282019190654985e-05, + "loss": 0.4427, + "step": 637 + }, + { + "epoch": 0.7180641530669668, + "grad_norm": 0.6436591462942758, + "learning_rate": 4.226115978306216e-05, + "loss": 0.4428, + "step": 638 + }, + { + "epoch": 0.7191896454698931, + "grad_norm": 0.46958357266306217, + "learning_rate": 4.224030037546934e-05, + "loss": 0.4096, + "step": 639 + }, + { + "epoch": 0.7203151378728193, + "grad_norm": 0.5409557375822074, + "learning_rate": 4.2219440967876516e-05, + "loss": 0.4165, + "step": 640 + }, + { + "epoch": 0.7214406302757457, + "grad_norm": 0.505386305383113, + "learning_rate": 4.219858156028369e-05, + "loss": 0.4232, + "step": 641 + }, + { + "epoch": 0.7225661226786719, + "grad_norm": 0.47036754544713516, + "learning_rate": 4.217772215269087e-05, + "loss": 0.4187, + "step": 642 + }, + { + "epoch": 0.7236916150815982, + "grad_norm": 0.5935180204625328, + "learning_rate": 4.215686274509804e-05, + "loss": 0.4326, + "step": 643 + }, + { + "epoch": 0.7248171074845244, + "grad_norm": 0.37111793924942255, + "learning_rate": 4.213600333750522e-05, + "loss": 0.4235, + "step": 644 + }, + { + "epoch": 0.7259425998874508, + "grad_norm": 0.6111195959607152, + "learning_rate": 4.2115143929912394e-05, + "loss": 0.4254, + "step": 645 + }, + { + "epoch": 0.727068092290377, + "grad_norm": 0.35910288955770575, + "learning_rate": 4.2094284522319564e-05, + "loss": 0.4244, + "step": 646 + }, + { + "epoch": 0.7281935846933033, + "grad_norm": 0.4804262191052388, + "learning_rate": 4.207342511472674e-05, + "loss": 0.4357, + "step": 647 + }, + { + "epoch": 0.7293190770962296, + "grad_norm": 0.43546853881795533, + "learning_rate": 4.205256570713392e-05, + "loss": 0.4471, + "step": 648 + }, + { + "epoch": 0.7304445694991559, + "grad_norm": 0.36651215549293115, + "learning_rate": 4.2031706299541095e-05, + "loss": 0.4359, + "step": 649 + }, + { + "epoch": 0.7315700619020822, + "grad_norm": 0.5416106337436408, + "learning_rate": 4.201084689194827e-05, + "loss": 0.4514, + "step": 650 + }, + { + "epoch": 0.7326955543050084, + "grad_norm": 0.37666903051702594, + "learning_rate": 4.198998748435545e-05, + "loss": 0.4423, + "step": 651 + }, + { + "epoch": 0.7338210467079347, + "grad_norm": 0.44989927473315283, + "learning_rate": 4.196912807676262e-05, + "loss": 0.4268, + "step": 652 + }, + { + "epoch": 0.734946539110861, + "grad_norm": 0.3864335324626091, + "learning_rate": 4.1948268669169796e-05, + "loss": 0.4494, + "step": 653 + }, + { + "epoch": 0.7360720315137873, + "grad_norm": 0.4000593109156678, + "learning_rate": 4.192740926157697e-05, + "loss": 0.4434, + "step": 654 + }, + { + "epoch": 0.7371975239167136, + "grad_norm": 0.423242298419072, + "learning_rate": 4.190654985398415e-05, + "loss": 0.4328, + "step": 655 + }, + { + "epoch": 0.7383230163196398, + "grad_norm": 0.44706912801056875, + "learning_rate": 4.188569044639132e-05, + "loss": 0.4254, + "step": 656 + }, + { + "epoch": 0.7394485087225662, + "grad_norm": 0.5086338570156853, + "learning_rate": 4.1864831038798504e-05, + "loss": 0.4425, + "step": 657 + }, + { + "epoch": 0.7405740011254924, + "grad_norm": 0.4676027167307538, + "learning_rate": 4.1843971631205674e-05, + "loss": 0.4491, + "step": 658 + }, + { + "epoch": 0.7416994935284187, + "grad_norm": 0.46458396727329027, + "learning_rate": 4.182311222361285e-05, + "loss": 0.4015, + "step": 659 + }, + { + "epoch": 0.7428249859313449, + "grad_norm": 0.390783744931949, + "learning_rate": 4.180225281602003e-05, + "loss": 0.4184, + "step": 660 + }, + { + "epoch": 0.7439504783342712, + "grad_norm": 0.44526805252316143, + "learning_rate": 4.1781393408427205e-05, + "loss": 0.4035, + "step": 661 + }, + { + "epoch": 0.7450759707371976, + "grad_norm": 0.4217385671488669, + "learning_rate": 4.1760534000834376e-05, + "loss": 0.4332, + "step": 662 + }, + { + "epoch": 0.7462014631401238, + "grad_norm": 0.44487860783732935, + "learning_rate": 4.173967459324156e-05, + "loss": 0.4266, + "step": 663 + }, + { + "epoch": 0.7473269555430501, + "grad_norm": 0.4296879305918086, + "learning_rate": 4.171881518564873e-05, + "loss": 0.4205, + "step": 664 + }, + { + "epoch": 0.7484524479459763, + "grad_norm": 0.4948881491751457, + "learning_rate": 4.16979557780559e-05, + "loss": 0.4447, + "step": 665 + }, + { + "epoch": 0.7495779403489027, + "grad_norm": 0.41381310448412767, + "learning_rate": 4.1677096370463084e-05, + "loss": 0.435, + "step": 666 + }, + { + "epoch": 0.7507034327518289, + "grad_norm": 0.4138662471855155, + "learning_rate": 4.1656236962870254e-05, + "loss": 0.4351, + "step": 667 + }, + { + "epoch": 0.7518289251547552, + "grad_norm": 0.3869476402415003, + "learning_rate": 4.163537755527743e-05, + "loss": 0.4319, + "step": 668 + }, + { + "epoch": 0.7529544175576814, + "grad_norm": 0.4882528682989917, + "learning_rate": 4.161451814768461e-05, + "loss": 0.4123, + "step": 669 + }, + { + "epoch": 0.7540799099606078, + "grad_norm": 0.3739890771080639, + "learning_rate": 4.1593658740091785e-05, + "loss": 0.4268, + "step": 670 + }, + { + "epoch": 0.7552054023635341, + "grad_norm": 0.5032273771625602, + "learning_rate": 4.1572799332498955e-05, + "loss": 0.4404, + "step": 671 + }, + { + "epoch": 0.7563308947664603, + "grad_norm": 0.38387128180956526, + "learning_rate": 4.155193992490614e-05, + "loss": 0.4505, + "step": 672 + }, + { + "epoch": 0.7574563871693866, + "grad_norm": 0.4995032503495298, + "learning_rate": 4.153108051731331e-05, + "loss": 0.4211, + "step": 673 + }, + { + "epoch": 0.7585818795723129, + "grad_norm": 0.46352751067691306, + "learning_rate": 4.1510221109720486e-05, + "loss": 0.4253, + "step": 674 + }, + { + "epoch": 0.7597073719752392, + "grad_norm": 0.4661239773263893, + "learning_rate": 4.148936170212766e-05, + "loss": 0.4533, + "step": 675 + }, + { + "epoch": 0.7608328643781654, + "grad_norm": 0.42916960855475605, + "learning_rate": 4.146850229453484e-05, + "loss": 0.4333, + "step": 676 + }, + { + "epoch": 0.7619583567810917, + "grad_norm": 0.40989406943220275, + "learning_rate": 4.144764288694201e-05, + "loss": 0.4413, + "step": 677 + }, + { + "epoch": 0.7630838491840181, + "grad_norm": 0.7522787094637527, + "learning_rate": 4.1426783479349194e-05, + "loss": 0.4522, + "step": 678 + }, + { + "epoch": 0.7642093415869443, + "grad_norm": 0.4277705459587538, + "learning_rate": 4.1405924071756364e-05, + "loss": 0.4348, + "step": 679 + }, + { + "epoch": 0.7653348339898706, + "grad_norm": 0.4684118417529332, + "learning_rate": 4.138506466416354e-05, + "loss": 0.422, + "step": 680 + }, + { + "epoch": 0.7664603263927968, + "grad_norm": 0.5197963821538139, + "learning_rate": 4.136420525657072e-05, + "loss": 0.4299, + "step": 681 + }, + { + "epoch": 0.7675858187957231, + "grad_norm": 0.5235576475586984, + "learning_rate": 4.1343345848977895e-05, + "loss": 0.438, + "step": 682 + }, + { + "epoch": 0.7687113111986494, + "grad_norm": 0.46712550772065836, + "learning_rate": 4.1322486441385065e-05, + "loss": 0.4344, + "step": 683 + }, + { + "epoch": 0.7698368036015757, + "grad_norm": 0.3222703692853798, + "learning_rate": 4.130162703379224e-05, + "loss": 0.4263, + "step": 684 + }, + { + "epoch": 0.770962296004502, + "grad_norm": 0.5188367404561216, + "learning_rate": 4.128076762619942e-05, + "loss": 0.425, + "step": 685 + }, + { + "epoch": 0.7720877884074282, + "grad_norm": 0.5386961427613608, + "learning_rate": 4.125990821860659e-05, + "loss": 0.4276, + "step": 686 + }, + { + "epoch": 0.7732132808103546, + "grad_norm": 0.42911439453279915, + "learning_rate": 4.1239048811013766e-05, + "loss": 0.4309, + "step": 687 + }, + { + "epoch": 0.7743387732132808, + "grad_norm": 0.5088405648165022, + "learning_rate": 4.121818940342094e-05, + "loss": 0.4493, + "step": 688 + }, + { + "epoch": 0.7754642656162071, + "grad_norm": 0.3815644681020926, + "learning_rate": 4.119732999582812e-05, + "loss": 0.4077, + "step": 689 + }, + { + "epoch": 0.7765897580191333, + "grad_norm": 0.4840279343366164, + "learning_rate": 4.11764705882353e-05, + "loss": 0.4172, + "step": 690 + }, + { + "epoch": 0.7777152504220597, + "grad_norm": 0.333716982007624, + "learning_rate": 4.1155611180642474e-05, + "loss": 0.42, + "step": 691 + }, + { + "epoch": 0.7788407428249859, + "grad_norm": 0.5086503847022227, + "learning_rate": 4.1134751773049644e-05, + "loss": 0.4388, + "step": 692 + }, + { + "epoch": 0.7799662352279122, + "grad_norm": 0.5138077690790301, + "learning_rate": 4.111389236545682e-05, + "loss": 0.4472, + "step": 693 + }, + { + "epoch": 0.7810917276308385, + "grad_norm": 0.5073604041295958, + "learning_rate": 4.1093032957864e-05, + "loss": 0.4319, + "step": 694 + }, + { + "epoch": 0.7822172200337648, + "grad_norm": 0.5070487690193936, + "learning_rate": 4.1072173550271175e-05, + "loss": 0.4406, + "step": 695 + }, + { + "epoch": 0.7833427124366911, + "grad_norm": 0.39744464693598625, + "learning_rate": 4.1051314142678346e-05, + "loss": 0.434, + "step": 696 + }, + { + "epoch": 0.7844682048396173, + "grad_norm": 0.4541031658226192, + "learning_rate": 4.103045473508553e-05, + "loss": 0.4454, + "step": 697 + }, + { + "epoch": 0.7855936972425436, + "grad_norm": 0.3491750229319607, + "learning_rate": 4.10095953274927e-05, + "loss": 0.4332, + "step": 698 + }, + { + "epoch": 0.7867191896454699, + "grad_norm": 0.4022760008208042, + "learning_rate": 4.0988735919899877e-05, + "loss": 0.4296, + "step": 699 + }, + { + "epoch": 0.7878446820483962, + "grad_norm": 0.34684627806001544, + "learning_rate": 4.0967876512307054e-05, + "loss": 0.4331, + "step": 700 + }, + { + "epoch": 0.7889701744513224, + "grad_norm": 0.4050405845879203, + "learning_rate": 4.094701710471423e-05, + "loss": 0.4464, + "step": 701 + }, + { + "epoch": 0.7900956668542487, + "grad_norm": 0.36395612381945763, + "learning_rate": 4.09261576971214e-05, + "loss": 0.4444, + "step": 702 + }, + { + "epoch": 0.7912211592571751, + "grad_norm": 0.398848237592344, + "learning_rate": 4.0905298289528585e-05, + "loss": 0.4288, + "step": 703 + }, + { + "epoch": 0.7923466516601013, + "grad_norm": 0.40745644685078164, + "learning_rate": 4.0884438881935755e-05, + "loss": 0.4329, + "step": 704 + }, + { + "epoch": 0.7934721440630276, + "grad_norm": 0.3547156716364725, + "learning_rate": 4.0863579474342925e-05, + "loss": 0.4191, + "step": 705 + }, + { + "epoch": 0.7945976364659538, + "grad_norm": 0.377680056161795, + "learning_rate": 4.084272006675011e-05, + "loss": 0.4376, + "step": 706 + }, + { + "epoch": 0.7957231288688801, + "grad_norm": 0.4073180644016936, + "learning_rate": 4.082186065915728e-05, + "loss": 0.4559, + "step": 707 + }, + { + "epoch": 0.7968486212718064, + "grad_norm": 0.45186446852813356, + "learning_rate": 4.0801001251564456e-05, + "loss": 0.4277, + "step": 708 + }, + { + "epoch": 0.7979741136747327, + "grad_norm": 0.36933911451661233, + "learning_rate": 4.078014184397163e-05, + "loss": 0.45, + "step": 709 + }, + { + "epoch": 0.799099606077659, + "grad_norm": 0.35833391487238614, + "learning_rate": 4.075928243637881e-05, + "loss": 0.4403, + "step": 710 + }, + { + "epoch": 0.8002250984805852, + "grad_norm": 0.3901982149558614, + "learning_rate": 4.073842302878598e-05, + "loss": 0.4216, + "step": 711 + }, + { + "epoch": 0.8013505908835116, + "grad_norm": 0.40940384251834244, + "learning_rate": 4.0717563621193164e-05, + "loss": 0.4021, + "step": 712 + }, + { + "epoch": 0.8024760832864378, + "grad_norm": 0.42919683308516116, + "learning_rate": 4.0696704213600334e-05, + "loss": 0.4376, + "step": 713 + }, + { + "epoch": 0.8036015756893641, + "grad_norm": 0.4073165345943137, + "learning_rate": 4.067584480600751e-05, + "loss": 0.4153, + "step": 714 + }, + { + "epoch": 0.8047270680922903, + "grad_norm": 0.4178501498334503, + "learning_rate": 4.065498539841469e-05, + "loss": 0.414, + "step": 715 + }, + { + "epoch": 0.8058525604952167, + "grad_norm": 0.4403993787350139, + "learning_rate": 4.0634125990821865e-05, + "loss": 0.4399, + "step": 716 + }, + { + "epoch": 0.806978052898143, + "grad_norm": 0.4114972670954794, + "learning_rate": 4.0613266583229035e-05, + "loss": 0.439, + "step": 717 + }, + { + "epoch": 0.8081035453010692, + "grad_norm": 0.407394844667869, + "learning_rate": 4.059240717563622e-05, + "loss": 0.4123, + "step": 718 + }, + { + "epoch": 0.8092290377039955, + "grad_norm": 0.39800729593005324, + "learning_rate": 4.057154776804339e-05, + "loss": 0.4236, + "step": 719 + }, + { + "epoch": 0.8103545301069218, + "grad_norm": 0.4287708410386054, + "learning_rate": 4.0550688360450566e-05, + "loss": 0.4256, + "step": 720 + }, + { + "epoch": 0.8114800225098481, + "grad_norm": 0.4016484816358628, + "learning_rate": 4.052982895285774e-05, + "loss": 0.4281, + "step": 721 + }, + { + "epoch": 0.8126055149127743, + "grad_norm": 0.3719724351542615, + "learning_rate": 4.050896954526492e-05, + "loss": 0.4077, + "step": 722 + }, + { + "epoch": 0.8137310073157006, + "grad_norm": 0.4023100055568255, + "learning_rate": 4.048811013767209e-05, + "loss": 0.4461, + "step": 723 + }, + { + "epoch": 0.814856499718627, + "grad_norm": 0.4117093051704328, + "learning_rate": 4.046725073007927e-05, + "loss": 0.4175, + "step": 724 + }, + { + "epoch": 0.8159819921215532, + "grad_norm": 0.34286385689334, + "learning_rate": 4.0446391322486444e-05, + "loss": 0.4356, + "step": 725 + }, + { + "epoch": 0.8171074845244795, + "grad_norm": 0.35591813739094097, + "learning_rate": 4.0425531914893614e-05, + "loss": 0.4434, + "step": 726 + }, + { + "epoch": 0.8182329769274057, + "grad_norm": 0.43567208149763015, + "learning_rate": 4.040467250730079e-05, + "loss": 0.437, + "step": 727 + }, + { + "epoch": 0.8193584693303321, + "grad_norm": 0.3799825439351934, + "learning_rate": 4.038381309970797e-05, + "loss": 0.4248, + "step": 728 + }, + { + "epoch": 0.8204839617332583, + "grad_norm": 0.38216998051723755, + "learning_rate": 4.0362953692115145e-05, + "loss": 0.4253, + "step": 729 + }, + { + "epoch": 0.8216094541361846, + "grad_norm": 0.39231774228135824, + "learning_rate": 4.034209428452232e-05, + "loss": 0.4223, + "step": 730 + }, + { + "epoch": 0.8227349465391108, + "grad_norm": 0.4102144130938295, + "learning_rate": 4.03212348769295e-05, + "loss": 0.4348, + "step": 731 + }, + { + "epoch": 0.8238604389420371, + "grad_norm": 0.37115430835787877, + "learning_rate": 4.030037546933667e-05, + "loss": 0.409, + "step": 732 + }, + { + "epoch": 0.8249859313449635, + "grad_norm": 0.40499256266698164, + "learning_rate": 4.0279516061743847e-05, + "loss": 0.4089, + "step": 733 + }, + { + "epoch": 0.8261114237478897, + "grad_norm": 0.4916550738089128, + "learning_rate": 4.0258656654151024e-05, + "loss": 0.4272, + "step": 734 + }, + { + "epoch": 0.827236916150816, + "grad_norm": 0.3681620907401364, + "learning_rate": 4.02377972465582e-05, + "loss": 0.4446, + "step": 735 + }, + { + "epoch": 0.8283624085537422, + "grad_norm": 0.4795384990908562, + "learning_rate": 4.021693783896537e-05, + "loss": 0.4357, + "step": 736 + }, + { + "epoch": 0.8294879009566686, + "grad_norm": 0.3684736183097587, + "learning_rate": 4.0196078431372555e-05, + "loss": 0.4119, + "step": 737 + }, + { + "epoch": 0.8306133933595948, + "grad_norm": 0.43877380657382786, + "learning_rate": 4.0175219023779725e-05, + "loss": 0.4403, + "step": 738 + }, + { + "epoch": 0.8317388857625211, + "grad_norm": 0.37814204050253025, + "learning_rate": 4.01543596161869e-05, + "loss": 0.434, + "step": 739 + }, + { + "epoch": 0.8328643781654473, + "grad_norm": 0.45099287248352765, + "learning_rate": 4.013350020859408e-05, + "loss": 0.4149, + "step": 740 + }, + { + "epoch": 0.8339898705683737, + "grad_norm": 0.34915848381393966, + "learning_rate": 4.0112640801001256e-05, + "loss": 0.4136, + "step": 741 + }, + { + "epoch": 0.8351153629713, + "grad_norm": 0.5037598255538088, + "learning_rate": 4.0091781393408426e-05, + "loss": 0.4398, + "step": 742 + }, + { + "epoch": 0.8362408553742262, + "grad_norm": 0.3612809802844246, + "learning_rate": 4.007092198581561e-05, + "loss": 0.4162, + "step": 743 + }, + { + "epoch": 0.8373663477771525, + "grad_norm": 0.3979488796549818, + "learning_rate": 4.005006257822278e-05, + "loss": 0.3981, + "step": 744 + }, + { + "epoch": 0.8384918401800788, + "grad_norm": 0.4440135625243805, + "learning_rate": 4.002920317062996e-05, + "loss": 0.429, + "step": 745 + }, + { + "epoch": 0.8396173325830051, + "grad_norm": 0.3448234757480279, + "learning_rate": 4.0008343763037134e-05, + "loss": 0.4324, + "step": 746 + }, + { + "epoch": 0.8407428249859313, + "grad_norm": 0.4775835287224156, + "learning_rate": 3.9987484355444304e-05, + "loss": 0.4249, + "step": 747 + }, + { + "epoch": 0.8418683173888576, + "grad_norm": 0.3566220202478078, + "learning_rate": 3.996662494785148e-05, + "loss": 0.4211, + "step": 748 + }, + { + "epoch": 0.842993809791784, + "grad_norm": 0.5285144169481172, + "learning_rate": 3.994576554025866e-05, + "loss": 0.4168, + "step": 749 + }, + { + "epoch": 0.8441193021947102, + "grad_norm": 0.33354278924631714, + "learning_rate": 3.9924906132665835e-05, + "loss": 0.4261, + "step": 750 + }, + { + "epoch": 0.8452447945976365, + "grad_norm": 0.3372581050524173, + "learning_rate": 3.9904046725073005e-05, + "loss": 0.4269, + "step": 751 + }, + { + "epoch": 0.8463702870005627, + "grad_norm": 0.3146244454332402, + "learning_rate": 3.988318731748019e-05, + "loss": 0.4204, + "step": 752 + }, + { + "epoch": 0.8474957794034891, + "grad_norm": 0.3706478564537626, + "learning_rate": 3.986232790988736e-05, + "loss": 0.4521, + "step": 753 + }, + { + "epoch": 0.8486212718064153, + "grad_norm": 0.34630012288221157, + "learning_rate": 3.9841468502294536e-05, + "loss": 0.4142, + "step": 754 + }, + { + "epoch": 0.8497467642093416, + "grad_norm": 0.36373433568245944, + "learning_rate": 3.982060909470171e-05, + "loss": 0.4252, + "step": 755 + }, + { + "epoch": 0.8508722566122678, + "grad_norm": 0.3554211790643752, + "learning_rate": 3.979974968710889e-05, + "loss": 0.4474, + "step": 756 + }, + { + "epoch": 0.8519977490151941, + "grad_norm": 0.30960141279598913, + "learning_rate": 3.977889027951606e-05, + "loss": 0.4167, + "step": 757 + }, + { + "epoch": 0.8531232414181205, + "grad_norm": 0.37614788680975125, + "learning_rate": 3.9758030871923244e-05, + "loss": 0.4505, + "step": 758 + }, + { + "epoch": 0.8542487338210467, + "grad_norm": 0.3938651785575828, + "learning_rate": 3.9737171464330414e-05, + "loss": 0.4349, + "step": 759 + }, + { + "epoch": 0.855374226223973, + "grad_norm": 0.3460524380953148, + "learning_rate": 3.971631205673759e-05, + "loss": 0.4396, + "step": 760 + }, + { + "epoch": 0.8564997186268992, + "grad_norm": 0.430535629585179, + "learning_rate": 3.969545264914477e-05, + "loss": 0.4154, + "step": 761 + }, + { + "epoch": 0.8576252110298256, + "grad_norm": 0.34446139273212933, + "learning_rate": 3.9674593241551945e-05, + "loss": 0.3931, + "step": 762 + }, + { + "epoch": 0.8587507034327518, + "grad_norm": 0.42192087717244775, + "learning_rate": 3.9653733833959115e-05, + "loss": 0.4261, + "step": 763 + }, + { + "epoch": 0.8598761958356781, + "grad_norm": 0.40550449281201056, + "learning_rate": 3.963287442636629e-05, + "loss": 0.4569, + "step": 764 + }, + { + "epoch": 0.8610016882386043, + "grad_norm": 0.3566914781532168, + "learning_rate": 3.961201501877347e-05, + "loss": 0.4141, + "step": 765 + }, + { + "epoch": 0.8621271806415307, + "grad_norm": 0.3843475406384751, + "learning_rate": 3.9591155611180646e-05, + "loss": 0.4332, + "step": 766 + }, + { + "epoch": 0.863252673044457, + "grad_norm": 0.3366748222918633, + "learning_rate": 3.9570296203587817e-05, + "loss": 0.4123, + "step": 767 + }, + { + "epoch": 0.8643781654473832, + "grad_norm": 0.41416075386046697, + "learning_rate": 3.9549436795994994e-05, + "loss": 0.426, + "step": 768 + }, + { + "epoch": 0.8655036578503095, + "grad_norm": 0.3752366359688814, + "learning_rate": 3.952857738840217e-05, + "loss": 0.4402, + "step": 769 + }, + { + "epoch": 0.8666291502532358, + "grad_norm": 0.37688991154499113, + "learning_rate": 3.950771798080935e-05, + "loss": 0.4244, + "step": 770 + }, + { + "epoch": 0.8677546426561621, + "grad_norm": 0.42637480636595876, + "learning_rate": 3.9486858573216525e-05, + "loss": 0.438, + "step": 771 + }, + { + "epoch": 0.8688801350590883, + "grad_norm": 0.3568635983835573, + "learning_rate": 3.9465999165623695e-05, + "loss": 0.424, + "step": 772 + }, + { + "epoch": 0.8700056274620146, + "grad_norm": 0.38797711927011286, + "learning_rate": 3.944513975803087e-05, + "loss": 0.4321, + "step": 773 + }, + { + "epoch": 0.871131119864941, + "grad_norm": 0.3904824359653345, + "learning_rate": 3.942428035043805e-05, + "loss": 0.4161, + "step": 774 + }, + { + "epoch": 0.8722566122678672, + "grad_norm": 0.47345678909928446, + "learning_rate": 3.9403420942845226e-05, + "loss": 0.4503, + "step": 775 + }, + { + "epoch": 0.8733821046707935, + "grad_norm": 0.37497244213020403, + "learning_rate": 3.9382561535252396e-05, + "loss": 0.4255, + "step": 776 + }, + { + "epoch": 0.8745075970737197, + "grad_norm": 0.4047268746098847, + "learning_rate": 3.936170212765958e-05, + "loss": 0.4327, + "step": 777 + }, + { + "epoch": 0.8756330894766461, + "grad_norm": 0.3834914449330313, + "learning_rate": 3.934084272006675e-05, + "loss": 0.4079, + "step": 778 + }, + { + "epoch": 0.8767585818795723, + "grad_norm": 0.43021072579406455, + "learning_rate": 3.931998331247393e-05, + "loss": 0.4143, + "step": 779 + }, + { + "epoch": 0.8778840742824986, + "grad_norm": 0.3793510230856374, + "learning_rate": 3.9299123904881104e-05, + "loss": 0.431, + "step": 780 + }, + { + "epoch": 0.8790095666854248, + "grad_norm": 0.37164807483969337, + "learning_rate": 3.927826449728828e-05, + "loss": 0.4341, + "step": 781 + }, + { + "epoch": 0.8801350590883511, + "grad_norm": 0.3807695648271021, + "learning_rate": 3.925740508969545e-05, + "loss": 0.4096, + "step": 782 + }, + { + "epoch": 0.8812605514912775, + "grad_norm": 0.3502384590348891, + "learning_rate": 3.9236545682102635e-05, + "loss": 0.4117, + "step": 783 + }, + { + "epoch": 0.8823860438942037, + "grad_norm": 0.41955082958695283, + "learning_rate": 3.9215686274509805e-05, + "loss": 0.4179, + "step": 784 + }, + { + "epoch": 0.88351153629713, + "grad_norm": 0.3435394433133878, + "learning_rate": 3.919482686691698e-05, + "loss": 0.4215, + "step": 785 + }, + { + "epoch": 0.8846370287000562, + "grad_norm": 0.44230838156044133, + "learning_rate": 3.917396745932416e-05, + "loss": 0.436, + "step": 786 + }, + { + "epoch": 0.8857625211029826, + "grad_norm": 0.3248857597519066, + "learning_rate": 3.9153108051731336e-05, + "loss": 0.41, + "step": 787 + }, + { + "epoch": 0.8868880135059088, + "grad_norm": 0.48949666561437843, + "learning_rate": 3.9132248644138506e-05, + "loss": 0.4348, + "step": 788 + }, + { + "epoch": 0.8880135059088351, + "grad_norm": 0.32922044316292487, + "learning_rate": 3.911138923654568e-05, + "loss": 0.4084, + "step": 789 + }, + { + "epoch": 0.8891389983117614, + "grad_norm": 0.4097616554209572, + "learning_rate": 3.909052982895286e-05, + "loss": 0.4127, + "step": 790 + }, + { + "epoch": 0.8902644907146877, + "grad_norm": 0.3847502404740843, + "learning_rate": 3.906967042136003e-05, + "loss": 0.4322, + "step": 791 + }, + { + "epoch": 0.891389983117614, + "grad_norm": 0.39373480839252734, + "learning_rate": 3.9048811013767214e-05, + "loss": 0.4085, + "step": 792 + }, + { + "epoch": 0.8925154755205402, + "grad_norm": 0.4665639076471283, + "learning_rate": 3.9027951606174384e-05, + "loss": 0.4281, + "step": 793 + }, + { + "epoch": 0.8936409679234665, + "grad_norm": 0.32986547499650304, + "learning_rate": 3.900709219858156e-05, + "loss": 0.4261, + "step": 794 + }, + { + "epoch": 0.8947664603263928, + "grad_norm": 0.484689789318943, + "learning_rate": 3.898623279098874e-05, + "loss": 0.4236, + "step": 795 + }, + { + "epoch": 0.8958919527293191, + "grad_norm": 0.3386378151140954, + "learning_rate": 3.8965373383395915e-05, + "loss": 0.4117, + "step": 796 + }, + { + "epoch": 0.8970174451322454, + "grad_norm": 0.4810985090257936, + "learning_rate": 3.8944513975803085e-05, + "loss": 0.4199, + "step": 797 + }, + { + "epoch": 0.8981429375351716, + "grad_norm": 0.34069553000131625, + "learning_rate": 3.892365456821027e-05, + "loss": 0.4279, + "step": 798 + }, + { + "epoch": 0.899268429938098, + "grad_norm": 0.39752219752724677, + "learning_rate": 3.890279516061744e-05, + "loss": 0.4172, + "step": 799 + }, + { + "epoch": 0.9003939223410242, + "grad_norm": 0.39022425914879927, + "learning_rate": 3.8881935753024616e-05, + "loss": 0.3978, + "step": 800 + }, + { + "epoch": 0.9015194147439505, + "grad_norm": 0.3458579209805956, + "learning_rate": 3.8861076345431793e-05, + "loss": 0.4135, + "step": 801 + }, + { + "epoch": 0.9026449071468767, + "grad_norm": 0.4407202352189913, + "learning_rate": 3.884021693783897e-05, + "loss": 0.4154, + "step": 802 + }, + { + "epoch": 0.9037703995498031, + "grad_norm": 0.47173587942543654, + "learning_rate": 3.881935753024614e-05, + "loss": 0.4572, + "step": 803 + }, + { + "epoch": 0.9048958919527293, + "grad_norm": 0.5188592329216469, + "learning_rate": 3.879849812265332e-05, + "loss": 0.42, + "step": 804 + }, + { + "epoch": 0.9060213843556556, + "grad_norm": 0.35403721006820305, + "learning_rate": 3.8777638715060495e-05, + "loss": 0.3938, + "step": 805 + }, + { + "epoch": 0.9071468767585819, + "grad_norm": 0.4545974955129778, + "learning_rate": 3.875677930746767e-05, + "loss": 0.4285, + "step": 806 + }, + { + "epoch": 0.9082723691615081, + "grad_norm": 0.38332622486859297, + "learning_rate": 3.873591989987485e-05, + "loss": 0.4324, + "step": 807 + }, + { + "epoch": 0.9093978615644345, + "grad_norm": 0.4520007540189171, + "learning_rate": 3.8715060492282026e-05, + "loss": 0.4305, + "step": 808 + }, + { + "epoch": 0.9105233539673607, + "grad_norm": 0.4148933007482292, + "learning_rate": 3.8694201084689196e-05, + "loss": 0.4218, + "step": 809 + }, + { + "epoch": 0.911648846370287, + "grad_norm": 0.35859066112911336, + "learning_rate": 3.867334167709637e-05, + "loss": 0.4395, + "step": 810 + }, + { + "epoch": 0.9127743387732132, + "grad_norm": 0.3384384473732276, + "learning_rate": 3.865248226950355e-05, + "loss": 0.4331, + "step": 811 + }, + { + "epoch": 0.9138998311761396, + "grad_norm": 0.3212850763014723, + "learning_rate": 3.863162286191072e-05, + "loss": 0.4371, + "step": 812 + }, + { + "epoch": 0.9150253235790659, + "grad_norm": 0.34283786993488946, + "learning_rate": 3.86107634543179e-05, + "loss": 0.4213, + "step": 813 + }, + { + "epoch": 0.9161508159819921, + "grad_norm": 0.3578410656828841, + "learning_rate": 3.8589904046725074e-05, + "loss": 0.4264, + "step": 814 + }, + { + "epoch": 0.9172763083849184, + "grad_norm": 0.33865929644502085, + "learning_rate": 3.856904463913225e-05, + "loss": 0.4242, + "step": 815 + }, + { + "epoch": 0.9184018007878447, + "grad_norm": 0.3392167511998851, + "learning_rate": 3.854818523153942e-05, + "loss": 0.435, + "step": 816 + }, + { + "epoch": 0.919527293190771, + "grad_norm": 0.4361222901548229, + "learning_rate": 3.8527325823946605e-05, + "loss": 0.4351, + "step": 817 + }, + { + "epoch": 0.9206527855936972, + "grad_norm": 0.38626109347018045, + "learning_rate": 3.8506466416353775e-05, + "loss": 0.4413, + "step": 818 + }, + { + "epoch": 0.9217782779966235, + "grad_norm": 0.376739528222001, + "learning_rate": 3.848560700876095e-05, + "loss": 0.4162, + "step": 819 + }, + { + "epoch": 0.9229037703995498, + "grad_norm": 0.38666458978007023, + "learning_rate": 3.846474760116813e-05, + "loss": 0.4308, + "step": 820 + }, + { + "epoch": 0.9240292628024761, + "grad_norm": 0.49211116299516156, + "learning_rate": 3.8443888193575306e-05, + "loss": 0.4319, + "step": 821 + }, + { + "epoch": 0.9251547552054024, + "grad_norm": 0.35408915013798653, + "learning_rate": 3.8423028785982476e-05, + "loss": 0.4095, + "step": 822 + }, + { + "epoch": 0.9262802476083286, + "grad_norm": 0.4801831963166499, + "learning_rate": 3.840216937838966e-05, + "loss": 0.4357, + "step": 823 + }, + { + "epoch": 0.927405740011255, + "grad_norm": 0.355137877995065, + "learning_rate": 3.838130997079683e-05, + "loss": 0.4089, + "step": 824 + }, + { + "epoch": 0.9285312324141812, + "grad_norm": 0.39619886118735476, + "learning_rate": 3.836045056320401e-05, + "loss": 0.4275, + "step": 825 + }, + { + "epoch": 0.9296567248171075, + "grad_norm": 0.4149029728443111, + "learning_rate": 3.8339591155611184e-05, + "loss": 0.4271, + "step": 826 + }, + { + "epoch": 0.9307822172200337, + "grad_norm": 0.3576650599906339, + "learning_rate": 3.831873174801836e-05, + "loss": 0.4132, + "step": 827 + }, + { + "epoch": 0.93190770962296, + "grad_norm": 0.3906733425105834, + "learning_rate": 3.829787234042553e-05, + "loss": 0.4344, + "step": 828 + }, + { + "epoch": 0.9330332020258864, + "grad_norm": 0.3593657860758568, + "learning_rate": 3.8277012932832715e-05, + "loss": 0.4149, + "step": 829 + }, + { + "epoch": 0.9341586944288126, + "grad_norm": 0.3817439606842503, + "learning_rate": 3.8256153525239885e-05, + "loss": 0.4069, + "step": 830 + }, + { + "epoch": 0.9352841868317389, + "grad_norm": 0.3973105618276613, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.4324, + "step": 831 + }, + { + "epoch": 0.9364096792346651, + "grad_norm": 0.366999411331023, + "learning_rate": 3.821443471005424e-05, + "loss": 0.4222, + "step": 832 + }, + { + "epoch": 0.9375351716375915, + "grad_norm": 0.3464567606261278, + "learning_rate": 3.819357530246141e-05, + "loss": 0.4246, + "step": 833 + }, + { + "epoch": 0.9386606640405177, + "grad_norm": 0.4438404878074898, + "learning_rate": 3.8172715894868586e-05, + "loss": 0.4143, + "step": 834 + }, + { + "epoch": 0.939786156443444, + "grad_norm": 0.3129033931624516, + "learning_rate": 3.8151856487275763e-05, + "loss": 0.4307, + "step": 835 + }, + { + "epoch": 0.9409116488463702, + "grad_norm": 0.4970325181275813, + "learning_rate": 3.813099707968294e-05, + "loss": 0.4212, + "step": 836 + }, + { + "epoch": 0.9420371412492966, + "grad_norm": 0.3998089884639407, + "learning_rate": 3.811013767209011e-05, + "loss": 0.4208, + "step": 837 + }, + { + "epoch": 0.9431626336522229, + "grad_norm": 0.40099412686189473, + "learning_rate": 3.8089278264497294e-05, + "loss": 0.422, + "step": 838 + }, + { + "epoch": 0.9442881260551491, + "grad_norm": 0.5330625499056586, + "learning_rate": 3.8068418856904465e-05, + "loss": 0.4132, + "step": 839 + }, + { + "epoch": 0.9454136184580754, + "grad_norm": 0.3036089387603486, + "learning_rate": 3.804755944931164e-05, + "loss": 0.3962, + "step": 840 + }, + { + "epoch": 0.9465391108610017, + "grad_norm": 0.5044966493127429, + "learning_rate": 3.802670004171882e-05, + "loss": 0.4228, + "step": 841 + }, + { + "epoch": 0.947664603263928, + "grad_norm": 0.3885120765788415, + "learning_rate": 3.8005840634125996e-05, + "loss": 0.4228, + "step": 842 + }, + { + "epoch": 0.9487900956668542, + "grad_norm": 0.3285021365606724, + "learning_rate": 3.7984981226533166e-05, + "loss": 0.4287, + "step": 843 + }, + { + "epoch": 0.9499155880697805, + "grad_norm": 0.3980934273222264, + "learning_rate": 3.796412181894034e-05, + "loss": 0.409, + "step": 844 + }, + { + "epoch": 0.9510410804727069, + "grad_norm": 0.31490077533714694, + "learning_rate": 3.794326241134752e-05, + "loss": 0.4364, + "step": 845 + }, + { + "epoch": 0.9521665728756331, + "grad_norm": 0.35959204795616695, + "learning_rate": 3.79224030037547e-05, + "loss": 0.4283, + "step": 846 + }, + { + "epoch": 0.9532920652785594, + "grad_norm": 0.4126210300706387, + "learning_rate": 3.7901543596161874e-05, + "loss": 0.4197, + "step": 847 + }, + { + "epoch": 0.9544175576814856, + "grad_norm": 0.3222558281528205, + "learning_rate": 3.788068418856905e-05, + "loss": 0.4277, + "step": 848 + }, + { + "epoch": 0.955543050084412, + "grad_norm": 0.41145117521139, + "learning_rate": 3.785982478097622e-05, + "loss": 0.4257, + "step": 849 + }, + { + "epoch": 0.9566685424873382, + "grad_norm": 0.351379058945545, + "learning_rate": 3.78389653733834e-05, + "loss": 0.4181, + "step": 850 + }, + { + "epoch": 0.9577940348902645, + "grad_norm": 0.31029125166165955, + "learning_rate": 3.7818105965790575e-05, + "loss": 0.4297, + "step": 851 + }, + { + "epoch": 0.9589195272931907, + "grad_norm": 0.3285912637236667, + "learning_rate": 3.7797246558197745e-05, + "loss": 0.4138, + "step": 852 + }, + { + "epoch": 0.960045019696117, + "grad_norm": 0.4036547080190634, + "learning_rate": 3.777638715060492e-05, + "loss": 0.4366, + "step": 853 + }, + { + "epoch": 0.9611705120990434, + "grad_norm": 0.3686324816741614, + "learning_rate": 3.77555277430121e-05, + "loss": 0.4337, + "step": 854 + }, + { + "epoch": 0.9622960045019696, + "grad_norm": 0.48923825142344834, + "learning_rate": 3.7734668335419276e-05, + "loss": 0.4343, + "step": 855 + }, + { + "epoch": 0.9634214969048959, + "grad_norm": 0.3013034390380091, + "learning_rate": 3.7713808927826446e-05, + "loss": 0.4216, + "step": 856 + }, + { + "epoch": 0.9645469893078221, + "grad_norm": 0.4352224768520518, + "learning_rate": 3.769294952023363e-05, + "loss": 0.3941, + "step": 857 + }, + { + "epoch": 0.9656724817107485, + "grad_norm": 0.3513506338023819, + "learning_rate": 3.76720901126408e-05, + "loss": 0.4166, + "step": 858 + }, + { + "epoch": 0.9667979741136747, + "grad_norm": 0.40350199029637573, + "learning_rate": 3.765123070504798e-05, + "loss": 0.413, + "step": 859 + }, + { + "epoch": 0.967923466516601, + "grad_norm": 0.362061939817286, + "learning_rate": 3.7630371297455154e-05, + "loss": 0.411, + "step": 860 + }, + { + "epoch": 0.9690489589195272, + "grad_norm": 0.3399440337787816, + "learning_rate": 3.760951188986233e-05, + "loss": 0.4392, + "step": 861 + }, + { + "epoch": 0.9701744513224536, + "grad_norm": 0.37743255775219053, + "learning_rate": 3.75886524822695e-05, + "loss": 0.4433, + "step": 862 + }, + { + "epoch": 0.9712999437253799, + "grad_norm": 0.3638973275713123, + "learning_rate": 3.7567793074676685e-05, + "loss": 0.4203, + "step": 863 + }, + { + "epoch": 0.9724254361283061, + "grad_norm": 0.3277233424581398, + "learning_rate": 3.7546933667083855e-05, + "loss": 0.398, + "step": 864 + }, + { + "epoch": 0.9735509285312324, + "grad_norm": 0.3141565988423171, + "learning_rate": 3.752607425949103e-05, + "loss": 0.3959, + "step": 865 + }, + { + "epoch": 0.9746764209341587, + "grad_norm": 0.35936283889585385, + "learning_rate": 3.750521485189821e-05, + "loss": 0.432, + "step": 866 + }, + { + "epoch": 0.975801913337085, + "grad_norm": 0.31770357894398493, + "learning_rate": 3.7484355444305386e-05, + "loss": 0.431, + "step": 867 + }, + { + "epoch": 0.9769274057400112, + "grad_norm": 0.3123167816580969, + "learning_rate": 3.7463496036712556e-05, + "loss": 0.4025, + "step": 868 + }, + { + "epoch": 0.9780528981429375, + "grad_norm": 0.3692934723238402, + "learning_rate": 3.744263662911974e-05, + "loss": 0.4221, + "step": 869 + }, + { + "epoch": 0.9791783905458639, + "grad_norm": 0.354161129420181, + "learning_rate": 3.742177722152691e-05, + "loss": 0.4172, + "step": 870 + }, + { + "epoch": 0.9803038829487901, + "grad_norm": 0.36175776122206693, + "learning_rate": 3.740091781393409e-05, + "loss": 0.4247, + "step": 871 + }, + { + "epoch": 0.9814293753517164, + "grad_norm": 0.33883677517413535, + "learning_rate": 3.7380058406341264e-05, + "loss": 0.4068, + "step": 872 + }, + { + "epoch": 0.9825548677546426, + "grad_norm": 0.42954345350848233, + "learning_rate": 3.7359198998748435e-05, + "loss": 0.422, + "step": 873 + }, + { + "epoch": 0.983680360157569, + "grad_norm": 0.3555531618337076, + "learning_rate": 3.733833959115561e-05, + "loss": 0.4117, + "step": 874 + }, + { + "epoch": 0.9848058525604952, + "grad_norm": 0.3137468970852999, + "learning_rate": 3.731748018356279e-05, + "loss": 0.4044, + "step": 875 + }, + { + "epoch": 0.9859313449634215, + "grad_norm": 0.32456521220251544, + "learning_rate": 3.7296620775969966e-05, + "loss": 0.4133, + "step": 876 + }, + { + "epoch": 0.9870568373663478, + "grad_norm": 0.31014819015532874, + "learning_rate": 3.7275761368377136e-05, + "loss": 0.4141, + "step": 877 + }, + { + "epoch": 0.988182329769274, + "grad_norm": 0.32436938468507787, + "learning_rate": 3.725490196078432e-05, + "loss": 0.4095, + "step": 878 + }, + { + "epoch": 0.9893078221722004, + "grad_norm": 0.33188432959790465, + "learning_rate": 3.723404255319149e-05, + "loss": 0.4029, + "step": 879 + }, + { + "epoch": 0.9904333145751266, + "grad_norm": 0.3654774295033461, + "learning_rate": 3.721318314559867e-05, + "loss": 0.4255, + "step": 880 + }, + { + "epoch": 0.9915588069780529, + "grad_norm": 0.366785518306503, + "learning_rate": 3.7192323738005844e-05, + "loss": 0.4449, + "step": 881 + }, + { + "epoch": 0.9926842993809791, + "grad_norm": 0.29826436924819194, + "learning_rate": 3.717146433041302e-05, + "loss": 0.4048, + "step": 882 + }, + { + "epoch": 0.9938097917839055, + "grad_norm": 0.3152150195685499, + "learning_rate": 3.715060492282019e-05, + "loss": 0.4156, + "step": 883 + }, + { + "epoch": 0.9949352841868317, + "grad_norm": 0.3790269660933605, + "learning_rate": 3.712974551522737e-05, + "loss": 0.4258, + "step": 884 + }, + { + "epoch": 0.996060776589758, + "grad_norm": 0.3234490218718985, + "learning_rate": 3.7108886107634545e-05, + "loss": 0.4225, + "step": 885 + }, + { + "epoch": 0.9971862689926843, + "grad_norm": 0.3692387676152948, + "learning_rate": 3.708802670004172e-05, + "loss": 0.4193, + "step": 886 + }, + { + "epoch": 0.9983117613956106, + "grad_norm": 0.3548734847141469, + "learning_rate": 3.70671672924489e-05, + "loss": 0.4223, + "step": 887 + }, + { + "epoch": 0.9994372537985369, + "grad_norm": 0.41390225631408695, + "learning_rate": 3.7046307884856076e-05, + "loss": 0.4223, + "step": 888 + }, + { + "epoch": 1.0, + "grad_norm": 0.41390225631408695, + "learning_rate": 3.7025448477263246e-05, + "loss": 0.43, + "step": 889 + }, + { + "epoch": 1.0011254924029263, + "grad_norm": 0.5580430959772991, + "learning_rate": 3.700458906967042e-05, + "loss": 0.3606, + "step": 890 + }, + { + "epoch": 1.0022509848058525, + "grad_norm": 0.38786233021722444, + "learning_rate": 3.69837296620776e-05, + "loss": 0.3516, + "step": 891 + }, + { + "epoch": 1.0033764772087788, + "grad_norm": 0.3607061049938279, + "learning_rate": 3.696287025448478e-05, + "loss": 0.3586, + "step": 892 + }, + { + "epoch": 1.004501969611705, + "grad_norm": 0.30722252549464857, + "learning_rate": 3.694201084689195e-05, + "loss": 0.3566, + "step": 893 + }, + { + "epoch": 1.0056274620146315, + "grad_norm": 0.4162073977345431, + "learning_rate": 3.6921151439299124e-05, + "loss": 0.3517, + "step": 894 + }, + { + "epoch": 1.0067529544175577, + "grad_norm": 0.3477012359425953, + "learning_rate": 3.69002920317063e-05, + "loss": 0.356, + "step": 895 + }, + { + "epoch": 1.007878446820484, + "grad_norm": 0.34334848103470345, + "learning_rate": 3.687943262411347e-05, + "loss": 0.3529, + "step": 896 + }, + { + "epoch": 1.0090039392234103, + "grad_norm": 0.3303966213040783, + "learning_rate": 3.6858573216520655e-05, + "loss": 0.3775, + "step": 897 + }, + { + "epoch": 1.0101294316263365, + "grad_norm": 0.41641276263804705, + "learning_rate": 3.6837713808927825e-05, + "loss": 0.3482, + "step": 898 + }, + { + "epoch": 1.0112549240292628, + "grad_norm": 0.3475211971953784, + "learning_rate": 3.6816854401335e-05, + "loss": 0.3469, + "step": 899 + }, + { + "epoch": 1.012380416432189, + "grad_norm": 0.3630650367930452, + "learning_rate": 3.679599499374218e-05, + "loss": 0.3733, + "step": 900 + }, + { + "epoch": 1.0135059088351153, + "grad_norm": 0.33917879336611284, + "learning_rate": 3.6775135586149356e-05, + "loss": 0.3613, + "step": 901 + }, + { + "epoch": 1.0146314012380417, + "grad_norm": 0.3916615454670656, + "learning_rate": 3.6754276178556526e-05, + "loss": 0.3642, + "step": 902 + }, + { + "epoch": 1.015756893640968, + "grad_norm": 0.387700709428207, + "learning_rate": 3.673341677096371e-05, + "loss": 0.337, + "step": 903 + }, + { + "epoch": 1.0168823860438942, + "grad_norm": 0.311008874794384, + "learning_rate": 3.671255736337088e-05, + "loss": 0.3464, + "step": 904 + }, + { + "epoch": 1.0180078784468205, + "grad_norm": 0.34204508328431077, + "learning_rate": 3.669169795577806e-05, + "loss": 0.3493, + "step": 905 + }, + { + "epoch": 1.0191333708497468, + "grad_norm": 0.35056912513693533, + "learning_rate": 3.6670838548185234e-05, + "loss": 0.3847, + "step": 906 + }, + { + "epoch": 1.020258863252673, + "grad_norm": 0.3603063090886555, + "learning_rate": 3.664997914059241e-05, + "loss": 0.3696, + "step": 907 + }, + { + "epoch": 1.0213843556555993, + "grad_norm": 0.3406429440812445, + "learning_rate": 3.662911973299958e-05, + "loss": 0.3585, + "step": 908 + }, + { + "epoch": 1.0225098480585255, + "grad_norm": 0.4146200559571759, + "learning_rate": 3.6608260325406765e-05, + "loss": 0.3617, + "step": 909 + }, + { + "epoch": 1.023635340461452, + "grad_norm": 0.30025908743312035, + "learning_rate": 3.6587400917813936e-05, + "loss": 0.3416, + "step": 910 + }, + { + "epoch": 1.0247608328643782, + "grad_norm": 0.4720811356812383, + "learning_rate": 3.656654151022111e-05, + "loss": 0.3533, + "step": 911 + }, + { + "epoch": 1.0258863252673045, + "grad_norm": 0.29184795311941897, + "learning_rate": 3.654568210262829e-05, + "loss": 0.3493, + "step": 912 + }, + { + "epoch": 1.0270118176702308, + "grad_norm": 0.385289462825186, + "learning_rate": 3.6524822695035466e-05, + "loss": 0.393, + "step": 913 + }, + { + "epoch": 1.028137310073157, + "grad_norm": 0.3107082501520784, + "learning_rate": 3.650396328744264e-05, + "loss": 0.3677, + "step": 914 + }, + { + "epoch": 1.0292628024760833, + "grad_norm": 0.2892635060197107, + "learning_rate": 3.6483103879849814e-05, + "loss": 0.3484, + "step": 915 + }, + { + "epoch": 1.0303882948790095, + "grad_norm": 0.37383301152112214, + "learning_rate": 3.646224447225699e-05, + "loss": 0.3741, + "step": 916 + }, + { + "epoch": 1.0315137872819358, + "grad_norm": 0.32042127431190587, + "learning_rate": 3.644138506466416e-05, + "loss": 0.3453, + "step": 917 + }, + { + "epoch": 1.032639279684862, + "grad_norm": 0.3227805251806716, + "learning_rate": 3.6420525657071345e-05, + "loss": 0.347, + "step": 918 + }, + { + "epoch": 1.0337647720877885, + "grad_norm": 0.33975552005827825, + "learning_rate": 3.6399666249478515e-05, + "loss": 0.342, + "step": 919 + }, + { + "epoch": 1.0348902644907148, + "grad_norm": 0.3053184721102955, + "learning_rate": 3.637880684188569e-05, + "loss": 0.368, + "step": 920 + }, + { + "epoch": 1.036015756893641, + "grad_norm": 0.4171758873578538, + "learning_rate": 3.635794743429287e-05, + "loss": 0.3506, + "step": 921 + }, + { + "epoch": 1.0371412492965673, + "grad_norm": 0.35788110167643483, + "learning_rate": 3.6337088026700046e-05, + "loss": 0.3678, + "step": 922 + }, + { + "epoch": 1.0382667416994935, + "grad_norm": 0.40422162482455976, + "learning_rate": 3.6316228619107216e-05, + "loss": 0.3841, + "step": 923 + }, + { + "epoch": 1.0393922341024198, + "grad_norm": 0.42302051382729106, + "learning_rate": 3.629536921151439e-05, + "loss": 0.3609, + "step": 924 + }, + { + "epoch": 1.040517726505346, + "grad_norm": 0.3002900676912074, + "learning_rate": 3.627450980392157e-05, + "loss": 0.3764, + "step": 925 + }, + { + "epoch": 1.0416432189082723, + "grad_norm": 0.4216178632940728, + "learning_rate": 3.625365039632875e-05, + "loss": 0.3525, + "step": 926 + }, + { + "epoch": 1.0427687113111987, + "grad_norm": 0.36722403261101394, + "learning_rate": 3.6232790988735924e-05, + "loss": 0.3651, + "step": 927 + }, + { + "epoch": 1.043894203714125, + "grad_norm": 0.37487765396444256, + "learning_rate": 3.62119315811431e-05, + "loss": 0.3732, + "step": 928 + }, + { + "epoch": 1.0450196961170513, + "grad_norm": 0.40248279158053446, + "learning_rate": 3.619107217355027e-05, + "loss": 0.3514, + "step": 929 + }, + { + "epoch": 1.0461451885199775, + "grad_norm": 0.34487298402942634, + "learning_rate": 3.617021276595745e-05, + "loss": 0.3453, + "step": 930 + }, + { + "epoch": 1.0472706809229038, + "grad_norm": 0.35894348708147356, + "learning_rate": 3.6149353358364625e-05, + "loss": 0.3445, + "step": 931 + }, + { + "epoch": 1.04839617332583, + "grad_norm": 0.46543989700724425, + "learning_rate": 3.61284939507718e-05, + "loss": 0.3554, + "step": 932 + }, + { + "epoch": 1.0495216657287563, + "grad_norm": 0.32251577447042856, + "learning_rate": 3.610763454317897e-05, + "loss": 0.3571, + "step": 933 + }, + { + "epoch": 1.0506471581316825, + "grad_norm": 0.3539766683535758, + "learning_rate": 3.608677513558615e-05, + "loss": 0.3291, + "step": 934 + }, + { + "epoch": 1.051772650534609, + "grad_norm": 0.34471085249350447, + "learning_rate": 3.6065915727993326e-05, + "loss": 0.3764, + "step": 935 + }, + { + "epoch": 1.0528981429375353, + "grad_norm": 0.33468302525089494, + "learning_rate": 3.6045056320400496e-05, + "loss": 0.3479, + "step": 936 + }, + { + "epoch": 1.0540236353404615, + "grad_norm": 0.36538591134232934, + "learning_rate": 3.602419691280768e-05, + "loss": 0.3642, + "step": 937 + }, + { + "epoch": 1.0551491277433878, + "grad_norm": 0.35282922968280045, + "learning_rate": 3.600333750521485e-05, + "loss": 0.3446, + "step": 938 + }, + { + "epoch": 1.056274620146314, + "grad_norm": 0.35478764255979334, + "learning_rate": 3.598247809762203e-05, + "loss": 0.3752, + "step": 939 + }, + { + "epoch": 1.0574001125492403, + "grad_norm": 0.3565613451966995, + "learning_rate": 3.5961618690029204e-05, + "loss": 0.362, + "step": 940 + }, + { + "epoch": 1.0585256049521665, + "grad_norm": 0.33132722259601055, + "learning_rate": 3.594075928243638e-05, + "loss": 0.3559, + "step": 941 + }, + { + "epoch": 1.0596510973550928, + "grad_norm": 0.34347700089780575, + "learning_rate": 3.591989987484355e-05, + "loss": 0.3641, + "step": 942 + }, + { + "epoch": 1.060776589758019, + "grad_norm": 0.2772476546624268, + "learning_rate": 3.5899040467250735e-05, + "loss": 0.3433, + "step": 943 + }, + { + "epoch": 1.0619020821609455, + "grad_norm": 0.36078868188752466, + "learning_rate": 3.5878181059657906e-05, + "loss": 0.36, + "step": 944 + }, + { + "epoch": 1.0630275745638718, + "grad_norm": 0.2927763816273808, + "learning_rate": 3.585732165206508e-05, + "loss": 0.3757, + "step": 945 + }, + { + "epoch": 1.064153066966798, + "grad_norm": 0.31067799008966573, + "learning_rate": 3.583646224447226e-05, + "loss": 0.3375, + "step": 946 + }, + { + "epoch": 1.0652785593697243, + "grad_norm": 0.30786259543828726, + "learning_rate": 3.5815602836879437e-05, + "loss": 0.3691, + "step": 947 + }, + { + "epoch": 1.0664040517726505, + "grad_norm": 0.34927488285962766, + "learning_rate": 3.579474342928661e-05, + "loss": 0.3512, + "step": 948 + }, + { + "epoch": 1.0675295441755768, + "grad_norm": 0.3134128528998366, + "learning_rate": 3.577388402169379e-05, + "loss": 0.3684, + "step": 949 + }, + { + "epoch": 1.068655036578503, + "grad_norm": 0.3684381541500359, + "learning_rate": 3.575302461410096e-05, + "loss": 0.3635, + "step": 950 + }, + { + "epoch": 1.0697805289814293, + "grad_norm": 0.3071501276127385, + "learning_rate": 3.573216520650814e-05, + "loss": 0.3629, + "step": 951 + }, + { + "epoch": 1.0709060213843558, + "grad_norm": 0.3650935121688607, + "learning_rate": 3.5711305798915315e-05, + "loss": 0.352, + "step": 952 + }, + { + "epoch": 1.072031513787282, + "grad_norm": 0.3004157301630184, + "learning_rate": 3.569044639132249e-05, + "loss": 0.3627, + "step": 953 + }, + { + "epoch": 1.0731570061902083, + "grad_norm": 0.3588467213474463, + "learning_rate": 3.566958698372966e-05, + "loss": 0.378, + "step": 954 + }, + { + "epoch": 1.0742824985931345, + "grad_norm": 0.38695693104692636, + "learning_rate": 3.564872757613684e-05, + "loss": 0.3558, + "step": 955 + }, + { + "epoch": 1.0754079909960608, + "grad_norm": 0.30329694533620805, + "learning_rate": 3.5627868168544016e-05, + "loss": 0.3841, + "step": 956 + }, + { + "epoch": 1.076533483398987, + "grad_norm": 0.34905611952609783, + "learning_rate": 3.5607008760951186e-05, + "loss": 0.3689, + "step": 957 + }, + { + "epoch": 1.0776589758019133, + "grad_norm": 0.28800778538826344, + "learning_rate": 3.558614935335837e-05, + "loss": 0.3543, + "step": 958 + }, + { + "epoch": 1.0787844682048395, + "grad_norm": 0.3746527261236155, + "learning_rate": 3.556528994576554e-05, + "loss": 0.366, + "step": 959 + }, + { + "epoch": 1.079909960607766, + "grad_norm": 0.32663591501026235, + "learning_rate": 3.554443053817272e-05, + "loss": 0.3499, + "step": 960 + }, + { + "epoch": 1.0810354530106923, + "grad_norm": 0.3328189109583666, + "learning_rate": 3.5523571130579894e-05, + "loss": 0.353, + "step": 961 + }, + { + "epoch": 1.0821609454136185, + "grad_norm": 0.31964664375303664, + "learning_rate": 3.550271172298707e-05, + "loss": 0.3672, + "step": 962 + }, + { + "epoch": 1.0832864378165448, + "grad_norm": 0.36918332363958006, + "learning_rate": 3.548185231539424e-05, + "loss": 0.3798, + "step": 963 + }, + { + "epoch": 1.084411930219471, + "grad_norm": 0.3254223917013834, + "learning_rate": 3.546099290780142e-05, + "loss": 0.3559, + "step": 964 + }, + { + "epoch": 1.0855374226223973, + "grad_norm": 0.3008814703536633, + "learning_rate": 3.5440133500208595e-05, + "loss": 0.3609, + "step": 965 + }, + { + "epoch": 1.0866629150253235, + "grad_norm": 0.35240736109329646, + "learning_rate": 3.541927409261577e-05, + "loss": 0.3777, + "step": 966 + }, + { + "epoch": 1.0877884074282498, + "grad_norm": 0.3869312281732699, + "learning_rate": 3.539841468502295e-05, + "loss": 0.3724, + "step": 967 + }, + { + "epoch": 1.088913899831176, + "grad_norm": 0.30726021570614737, + "learning_rate": 3.5377555277430126e-05, + "loss": 0.3531, + "step": 968 + }, + { + "epoch": 1.0900393922341025, + "grad_norm": 0.34236583353183286, + "learning_rate": 3.5356695869837296e-05, + "loss": 0.3608, + "step": 969 + }, + { + "epoch": 1.0911648846370288, + "grad_norm": 0.2916866803109591, + "learning_rate": 3.533583646224447e-05, + "loss": 0.3624, + "step": 970 + }, + { + "epoch": 1.092290377039955, + "grad_norm": 0.3145203080926422, + "learning_rate": 3.531497705465165e-05, + "loss": 0.3684, + "step": 971 + }, + { + "epoch": 1.0934158694428813, + "grad_norm": 0.2873541218671502, + "learning_rate": 3.529411764705883e-05, + "loss": 0.3617, + "step": 972 + }, + { + "epoch": 1.0945413618458075, + "grad_norm": 0.3506652103429166, + "learning_rate": 3.5273258239466e-05, + "loss": 0.3583, + "step": 973 + }, + { + "epoch": 1.0956668542487338, + "grad_norm": 0.3025123158669694, + "learning_rate": 3.525239883187318e-05, + "loss": 0.3472, + "step": 974 + }, + { + "epoch": 1.09679234665166, + "grad_norm": 0.2899074126357094, + "learning_rate": 3.523153942428035e-05, + "loss": 0.3675, + "step": 975 + }, + { + "epoch": 1.0979178390545863, + "grad_norm": 0.3150990472406033, + "learning_rate": 3.521068001668753e-05, + "loss": 0.3636, + "step": 976 + }, + { + "epoch": 1.0990433314575128, + "grad_norm": 0.35489391655027186, + "learning_rate": 3.5189820609094705e-05, + "loss": 0.3384, + "step": 977 + }, + { + "epoch": 1.100168823860439, + "grad_norm": 0.3041199542435297, + "learning_rate": 3.5168961201501876e-05, + "loss": 0.3571, + "step": 978 + }, + { + "epoch": 1.1012943162633653, + "grad_norm": 0.31637443077212757, + "learning_rate": 3.514810179390905e-05, + "loss": 0.3703, + "step": 979 + }, + { + "epoch": 1.1024198086662915, + "grad_norm": 0.33113581691565325, + "learning_rate": 3.512724238631623e-05, + "loss": 0.35, + "step": 980 + }, + { + "epoch": 1.1035453010692178, + "grad_norm": 0.3300457711599469, + "learning_rate": 3.5106382978723407e-05, + "loss": 0.3485, + "step": 981 + }, + { + "epoch": 1.104670793472144, + "grad_norm": 0.37342013448224476, + "learning_rate": 3.508552357113058e-05, + "loss": 0.3543, + "step": 982 + }, + { + "epoch": 1.1057962858750703, + "grad_norm": 0.36084265787497494, + "learning_rate": 3.506466416353776e-05, + "loss": 0.3499, + "step": 983 + }, + { + "epoch": 1.1069217782779965, + "grad_norm": 0.36650053348727774, + "learning_rate": 3.504380475594493e-05, + "loss": 0.3727, + "step": 984 + }, + { + "epoch": 1.108047270680923, + "grad_norm": 0.38335191540233127, + "learning_rate": 3.502294534835211e-05, + "loss": 0.3557, + "step": 985 + }, + { + "epoch": 1.1091727630838493, + "grad_norm": 0.36320976195356514, + "learning_rate": 3.5002085940759285e-05, + "loss": 0.382, + "step": 986 + }, + { + "epoch": 1.1102982554867755, + "grad_norm": 0.38636958474248506, + "learning_rate": 3.498122653316646e-05, + "loss": 0.3402, + "step": 987 + }, + { + "epoch": 1.1114237478897018, + "grad_norm": 0.38017701551768956, + "learning_rate": 3.496036712557363e-05, + "loss": 0.3742, + "step": 988 + }, + { + "epoch": 1.112549240292628, + "grad_norm": 0.3198258149962093, + "learning_rate": 3.4939507717980816e-05, + "loss": 0.3432, + "step": 989 + }, + { + "epoch": 1.1136747326955543, + "grad_norm": 0.38060186204014107, + "learning_rate": 3.4918648310387986e-05, + "loss": 0.364, + "step": 990 + }, + { + "epoch": 1.1148002250984805, + "grad_norm": 0.3522538503310745, + "learning_rate": 3.489778890279516e-05, + "loss": 0.3862, + "step": 991 + }, + { + "epoch": 1.1159257175014068, + "grad_norm": 0.34893950721299544, + "learning_rate": 3.487692949520234e-05, + "loss": 0.3674, + "step": 992 + }, + { + "epoch": 1.117051209904333, + "grad_norm": 0.3145664530999275, + "learning_rate": 3.485607008760952e-05, + "loss": 0.3623, + "step": 993 + }, + { + "epoch": 1.1181767023072595, + "grad_norm": 0.38231007603706296, + "learning_rate": 3.483521068001669e-05, + "loss": 0.3513, + "step": 994 + }, + { + "epoch": 1.1193021947101858, + "grad_norm": 0.29574406471189, + "learning_rate": 3.481435127242387e-05, + "loss": 0.3686, + "step": 995 + }, + { + "epoch": 1.120427687113112, + "grad_norm": 0.3786384191919254, + "learning_rate": 3.479349186483104e-05, + "loss": 0.3496, + "step": 996 + }, + { + "epoch": 1.1215531795160383, + "grad_norm": 0.27933782961377807, + "learning_rate": 3.477263245723821e-05, + "loss": 0.3865, + "step": 997 + }, + { + "epoch": 1.1226786719189645, + "grad_norm": 0.3796958540762593, + "learning_rate": 3.4751773049645395e-05, + "loss": 0.3701, + "step": 998 + }, + { + "epoch": 1.1238041643218908, + "grad_norm": 0.31019085193512064, + "learning_rate": 3.4730913642052565e-05, + "loss": 0.3544, + "step": 999 + }, + { + "epoch": 1.124929656724817, + "grad_norm": 0.3894747761447629, + "learning_rate": 3.471005423445974e-05, + "loss": 0.3613, + "step": 1000 + }, + { + "epoch": 1.1260551491277433, + "grad_norm": 0.3848999285142024, + "learning_rate": 3.468919482686692e-05, + "loss": 0.3572, + "step": 1001 + }, + { + "epoch": 1.1271806415306695, + "grad_norm": 0.4075083886945119, + "learning_rate": 3.4668335419274096e-05, + "loss": 0.3534, + "step": 1002 + }, + { + "epoch": 1.128306133933596, + "grad_norm": 0.4244922841249029, + "learning_rate": 3.4647476011681266e-05, + "loss": 0.3857, + "step": 1003 + }, + { + "epoch": 1.1294316263365223, + "grad_norm": 0.3575947287049676, + "learning_rate": 3.462661660408844e-05, + "loss": 0.3494, + "step": 1004 + }, + { + "epoch": 1.1305571187394485, + "grad_norm": 0.3920246518678635, + "learning_rate": 3.460575719649562e-05, + "loss": 0.3693, + "step": 1005 + }, + { + "epoch": 1.1316826111423748, + "grad_norm": 0.3065280136400847, + "learning_rate": 3.45848977889028e-05, + "loss": 0.3352, + "step": 1006 + }, + { + "epoch": 1.132808103545301, + "grad_norm": 0.38525744406438595, + "learning_rate": 3.4564038381309974e-05, + "loss": 0.353, + "step": 1007 + }, + { + "epoch": 1.1339335959482273, + "grad_norm": 0.47272322177864035, + "learning_rate": 3.454317897371715e-05, + "loss": 0.3673, + "step": 1008 + }, + { + "epoch": 1.1350590883511535, + "grad_norm": 0.3327944075995892, + "learning_rate": 3.452231956612432e-05, + "loss": 0.3523, + "step": 1009 + }, + { + "epoch": 1.13618458075408, + "grad_norm": 0.42906579303424525, + "learning_rate": 3.45014601585315e-05, + "loss": 0.3577, + "step": 1010 + }, + { + "epoch": 1.1373100731570063, + "grad_norm": 0.31630743768076713, + "learning_rate": 3.4480600750938675e-05, + "loss": 0.3571, + "step": 1011 + }, + { + "epoch": 1.1384355655599325, + "grad_norm": 0.41005007736044136, + "learning_rate": 3.445974134334585e-05, + "loss": 0.362, + "step": 1012 + }, + { + "epoch": 1.1395610579628588, + "grad_norm": 0.3846148750924408, + "learning_rate": 3.443888193575302e-05, + "loss": 0.3554, + "step": 1013 + }, + { + "epoch": 1.140686550365785, + "grad_norm": 0.39499988480138304, + "learning_rate": 3.4418022528160206e-05, + "loss": 0.367, + "step": 1014 + }, + { + "epoch": 1.1418120427687113, + "grad_norm": 0.35657946077097175, + "learning_rate": 3.4397163120567377e-05, + "loss": 0.3694, + "step": 1015 + }, + { + "epoch": 1.1429375351716375, + "grad_norm": 0.3728438143327632, + "learning_rate": 3.4376303712974554e-05, + "loss": 0.3713, + "step": 1016 + }, + { + "epoch": 1.1440630275745638, + "grad_norm": 0.34659822653002426, + "learning_rate": 3.435544430538173e-05, + "loss": 0.3584, + "step": 1017 + }, + { + "epoch": 1.14518851997749, + "grad_norm": 0.3828982028856398, + "learning_rate": 3.43345848977889e-05, + "loss": 0.357, + "step": 1018 + }, + { + "epoch": 1.1463140123804165, + "grad_norm": 0.35840428604352054, + "learning_rate": 3.431372549019608e-05, + "loss": 0.3658, + "step": 1019 + }, + { + "epoch": 1.1474395047833428, + "grad_norm": 0.3642341763560189, + "learning_rate": 3.4292866082603255e-05, + "loss": 0.3768, + "step": 1020 + }, + { + "epoch": 1.148564997186269, + "grad_norm": 0.47028026081900165, + "learning_rate": 3.427200667501043e-05, + "loss": 0.3448, + "step": 1021 + }, + { + "epoch": 1.1496904895891953, + "grad_norm": 0.33137638092807364, + "learning_rate": 3.42511472674176e-05, + "loss": 0.3855, + "step": 1022 + }, + { + "epoch": 1.1508159819921215, + "grad_norm": 0.4049631157313659, + "learning_rate": 3.4230287859824786e-05, + "loss": 0.3801, + "step": 1023 + }, + { + "epoch": 1.1519414743950478, + "grad_norm": 0.3829633936239526, + "learning_rate": 3.4209428452231956e-05, + "loss": 0.3791, + "step": 1024 + }, + { + "epoch": 1.153066966797974, + "grad_norm": 0.42759635786809663, + "learning_rate": 3.418856904463913e-05, + "loss": 0.3676, + "step": 1025 + }, + { + "epoch": 1.1541924592009003, + "grad_norm": 0.3728776125817692, + "learning_rate": 3.416770963704631e-05, + "loss": 0.3622, + "step": 1026 + }, + { + "epoch": 1.1553179516038266, + "grad_norm": 0.39380341402257635, + "learning_rate": 3.414685022945349e-05, + "loss": 0.3785, + "step": 1027 + }, + { + "epoch": 1.156443444006753, + "grad_norm": 0.32076593702973827, + "learning_rate": 3.412599082186066e-05, + "loss": 0.3745, + "step": 1028 + }, + { + "epoch": 1.1575689364096793, + "grad_norm": 0.32908758752319733, + "learning_rate": 3.410513141426784e-05, + "loss": 0.3496, + "step": 1029 + }, + { + "epoch": 1.1586944288126055, + "grad_norm": 0.41768970871312155, + "learning_rate": 3.408427200667501e-05, + "loss": 0.3575, + "step": 1030 + }, + { + "epoch": 1.1598199212155318, + "grad_norm": 0.3106359891104045, + "learning_rate": 3.406341259908219e-05, + "loss": 0.3516, + "step": 1031 + }, + { + "epoch": 1.160945413618458, + "grad_norm": 0.3870701068020313, + "learning_rate": 3.4042553191489365e-05, + "loss": 0.3356, + "step": 1032 + }, + { + "epoch": 1.1620709060213843, + "grad_norm": 0.38611106269123546, + "learning_rate": 3.402169378389654e-05, + "loss": 0.3469, + "step": 1033 + }, + { + "epoch": 1.1631963984243106, + "grad_norm": 0.3255124156021805, + "learning_rate": 3.400083437630371e-05, + "loss": 0.3722, + "step": 1034 + }, + { + "epoch": 1.164321890827237, + "grad_norm": 0.32836642792719567, + "learning_rate": 3.3979974968710896e-05, + "loss": 0.3544, + "step": 1035 + }, + { + "epoch": 1.1654473832301633, + "grad_norm": 0.3805911934596958, + "learning_rate": 3.3959115561118066e-05, + "loss": 0.3982, + "step": 1036 + }, + { + "epoch": 1.1665728756330895, + "grad_norm": 0.3368162160417577, + "learning_rate": 3.393825615352524e-05, + "loss": 0.3679, + "step": 1037 + }, + { + "epoch": 1.1676983680360158, + "grad_norm": 0.31363563073754847, + "learning_rate": 3.391739674593242e-05, + "loss": 0.3529, + "step": 1038 + }, + { + "epoch": 1.168823860438942, + "grad_norm": 0.34006739877010494, + "learning_rate": 3.389653733833959e-05, + "loss": 0.3463, + "step": 1039 + }, + { + "epoch": 1.1699493528418683, + "grad_norm": 0.3100061821836274, + "learning_rate": 3.387567793074677e-05, + "loss": 0.3381, + "step": 1040 + }, + { + "epoch": 1.1710748452447945, + "grad_norm": 0.3065807803890228, + "learning_rate": 3.3854818523153944e-05, + "loss": 0.3651, + "step": 1041 + }, + { + "epoch": 1.1722003376477208, + "grad_norm": 0.32611882573130585, + "learning_rate": 3.383395911556112e-05, + "loss": 0.3529, + "step": 1042 + }, + { + "epoch": 1.173325830050647, + "grad_norm": 0.28895452201759864, + "learning_rate": 3.381309970796829e-05, + "loss": 0.3307, + "step": 1043 + }, + { + "epoch": 1.1744513224535735, + "grad_norm": 0.31616663311663623, + "learning_rate": 3.379224030037547e-05, + "loss": 0.3615, + "step": 1044 + }, + { + "epoch": 1.1755768148564998, + "grad_norm": 0.2999011173077538, + "learning_rate": 3.3771380892782645e-05, + "loss": 0.3527, + "step": 1045 + }, + { + "epoch": 1.176702307259426, + "grad_norm": 0.28604936736274933, + "learning_rate": 3.375052148518982e-05, + "loss": 0.361, + "step": 1046 + }, + { + "epoch": 1.1778277996623523, + "grad_norm": 0.3028269137775988, + "learning_rate": 3.3729662077597e-05, + "loss": 0.3668, + "step": 1047 + }, + { + "epoch": 1.1789532920652785, + "grad_norm": 0.36698195409495143, + "learning_rate": 3.3708802670004176e-05, + "loss": 0.352, + "step": 1048 + }, + { + "epoch": 1.1800787844682048, + "grad_norm": 0.2951939270230831, + "learning_rate": 3.3687943262411347e-05, + "loss": 0.3533, + "step": 1049 + }, + { + "epoch": 1.181204276871131, + "grad_norm": 0.4064761843327334, + "learning_rate": 3.3667083854818524e-05, + "loss": 0.3601, + "step": 1050 + }, + { + "epoch": 1.1823297692740573, + "grad_norm": 0.325934767924338, + "learning_rate": 3.36462244472257e-05, + "loss": 0.366, + "step": 1051 + }, + { + "epoch": 1.1834552616769836, + "grad_norm": 0.3444374492643726, + "learning_rate": 3.362536503963288e-05, + "loss": 0.3591, + "step": 1052 + }, + { + "epoch": 1.18458075407991, + "grad_norm": 0.3902013079098464, + "learning_rate": 3.360450563204005e-05, + "loss": 0.3609, + "step": 1053 + }, + { + "epoch": 1.1857062464828363, + "grad_norm": 0.3552567977795283, + "learning_rate": 3.358364622444723e-05, + "loss": 0.3824, + "step": 1054 + }, + { + "epoch": 1.1868317388857625, + "grad_norm": 0.5473634143542325, + "learning_rate": 3.35627868168544e-05, + "loss": 0.344, + "step": 1055 + }, + { + "epoch": 1.1879572312886888, + "grad_norm": 0.31822857141954713, + "learning_rate": 3.354192740926158e-05, + "loss": 0.34, + "step": 1056 + }, + { + "epoch": 1.189082723691615, + "grad_norm": 0.35648383062484057, + "learning_rate": 3.3521068001668756e-05, + "loss": 0.3664, + "step": 1057 + }, + { + "epoch": 1.1902082160945413, + "grad_norm": 0.3533726981414865, + "learning_rate": 3.350020859407593e-05, + "loss": 0.3643, + "step": 1058 + }, + { + "epoch": 1.1913337084974676, + "grad_norm": 0.38846901904691766, + "learning_rate": 3.34793491864831e-05, + "loss": 0.364, + "step": 1059 + }, + { + "epoch": 1.192459200900394, + "grad_norm": 0.32829805282614477, + "learning_rate": 3.345848977889028e-05, + "loss": 0.3505, + "step": 1060 + }, + { + "epoch": 1.1935846933033203, + "grad_norm": 0.3371243132688832, + "learning_rate": 3.343763037129746e-05, + "loss": 0.3706, + "step": 1061 + }, + { + "epoch": 1.1947101857062465, + "grad_norm": 0.29390329610439453, + "learning_rate": 3.341677096370463e-05, + "loss": 0.3513, + "step": 1062 + }, + { + "epoch": 1.1958356781091728, + "grad_norm": 0.3589333659631211, + "learning_rate": 3.339591155611181e-05, + "loss": 0.364, + "step": 1063 + }, + { + "epoch": 1.196961170512099, + "grad_norm": 0.3025901807833534, + "learning_rate": 3.337505214851898e-05, + "loss": 0.3716, + "step": 1064 + }, + { + "epoch": 1.1980866629150253, + "grad_norm": 0.2990903113895738, + "learning_rate": 3.335419274092616e-05, + "loss": 0.3703, + "step": 1065 + }, + { + "epoch": 1.1992121553179516, + "grad_norm": 0.3084522992492389, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.3423, + "step": 1066 + }, + { + "epoch": 1.2003376477208778, + "grad_norm": 0.2833543979726358, + "learning_rate": 3.331247392574051e-05, + "loss": 0.3794, + "step": 1067 + }, + { + "epoch": 1.201463140123804, + "grad_norm": 0.3502254927161911, + "learning_rate": 3.329161451814768e-05, + "loss": 0.3588, + "step": 1068 + }, + { + "epoch": 1.2025886325267305, + "grad_norm": 0.2824861573083505, + "learning_rate": 3.3270755110554866e-05, + "loss": 0.3375, + "step": 1069 + }, + { + "epoch": 1.2037141249296568, + "grad_norm": 0.32275485870283527, + "learning_rate": 3.3249895702962036e-05, + "loss": 0.3659, + "step": 1070 + }, + { + "epoch": 1.204839617332583, + "grad_norm": 0.2831771258197277, + "learning_rate": 3.322903629536921e-05, + "loss": 0.3608, + "step": 1071 + }, + { + "epoch": 1.2059651097355093, + "grad_norm": 0.4099026461053303, + "learning_rate": 3.320817688777639e-05, + "loss": 0.3657, + "step": 1072 + }, + { + "epoch": 1.2070906021384356, + "grad_norm": 0.2988459528156424, + "learning_rate": 3.318731748018357e-05, + "loss": 0.3612, + "step": 1073 + }, + { + "epoch": 1.2082160945413618, + "grad_norm": 0.3285103143387034, + "learning_rate": 3.316645807259074e-05, + "loss": 0.3243, + "step": 1074 + }, + { + "epoch": 1.209341586944288, + "grad_norm": 0.3140866233456728, + "learning_rate": 3.314559866499792e-05, + "loss": 0.3605, + "step": 1075 + }, + { + "epoch": 1.2104670793472143, + "grad_norm": 0.3136304362130377, + "learning_rate": 3.312473925740509e-05, + "loss": 0.3518, + "step": 1076 + }, + { + "epoch": 1.2115925717501406, + "grad_norm": 0.37704920383565566, + "learning_rate": 3.310387984981227e-05, + "loss": 0.3505, + "step": 1077 + }, + { + "epoch": 1.212718064153067, + "grad_norm": 0.3386484501892276, + "learning_rate": 3.3083020442219445e-05, + "loss": 0.3581, + "step": 1078 + }, + { + "epoch": 1.2138435565559933, + "grad_norm": 0.2756287445653643, + "learning_rate": 3.306216103462662e-05, + "loss": 0.3508, + "step": 1079 + }, + { + "epoch": 1.2149690489589196, + "grad_norm": 0.3727446701400803, + "learning_rate": 3.304130162703379e-05, + "loss": 0.3637, + "step": 1080 + }, + { + "epoch": 1.2160945413618458, + "grad_norm": 0.34205106067470487, + "learning_rate": 3.302044221944097e-05, + "loss": 0.3719, + "step": 1081 + }, + { + "epoch": 1.217220033764772, + "grad_norm": 0.277943785807938, + "learning_rate": 3.2999582811848146e-05, + "loss": 0.3663, + "step": 1082 + }, + { + "epoch": 1.2183455261676983, + "grad_norm": 0.32778887865165535, + "learning_rate": 3.2978723404255317e-05, + "loss": 0.3788, + "step": 1083 + }, + { + "epoch": 1.2194710185706246, + "grad_norm": 0.35850613973050943, + "learning_rate": 3.2957863996662494e-05, + "loss": 0.3516, + "step": 1084 + }, + { + "epoch": 1.220596510973551, + "grad_norm": 0.32265446214334986, + "learning_rate": 3.293700458906967e-05, + "loss": 0.3637, + "step": 1085 + }, + { + "epoch": 1.2217220033764773, + "grad_norm": 0.32337082624436203, + "learning_rate": 3.291614518147685e-05, + "loss": 0.353, + "step": 1086 + }, + { + "epoch": 1.2228474957794035, + "grad_norm": 0.3566976026538077, + "learning_rate": 3.2895285773884024e-05, + "loss": 0.3702, + "step": 1087 + }, + { + "epoch": 1.2239729881823298, + "grad_norm": 0.3602820801303339, + "learning_rate": 3.28744263662912e-05, + "loss": 0.3478, + "step": 1088 + }, + { + "epoch": 1.225098480585256, + "grad_norm": 0.3167430696040855, + "learning_rate": 3.285356695869837e-05, + "loss": 0.3659, + "step": 1089 + }, + { + "epoch": 1.2262239729881823, + "grad_norm": 0.3018055329469023, + "learning_rate": 3.283270755110555e-05, + "loss": 0.3588, + "step": 1090 + }, + { + "epoch": 1.2273494653911086, + "grad_norm": 0.3230175863991661, + "learning_rate": 3.2811848143512726e-05, + "loss": 0.3843, + "step": 1091 + }, + { + "epoch": 1.2284749577940348, + "grad_norm": 0.33437635395778137, + "learning_rate": 3.27909887359199e-05, + "loss": 0.3767, + "step": 1092 + }, + { + "epoch": 1.229600450196961, + "grad_norm": 0.2981951069574055, + "learning_rate": 3.277012932832707e-05, + "loss": 0.3595, + "step": 1093 + }, + { + "epoch": 1.2307259425998875, + "grad_norm": 0.3380111260459277, + "learning_rate": 3.2749269920734257e-05, + "loss": 0.3709, + "step": 1094 + }, + { + "epoch": 1.2318514350028138, + "grad_norm": 0.3105746941727841, + "learning_rate": 3.272841051314143e-05, + "loss": 0.3683, + "step": 1095 + }, + { + "epoch": 1.23297692740574, + "grad_norm": 0.30933018515689015, + "learning_rate": 3.2707551105548604e-05, + "loss": 0.3592, + "step": 1096 + }, + { + "epoch": 1.2341024198086663, + "grad_norm": 0.3281758704709362, + "learning_rate": 3.268669169795578e-05, + "loss": 0.3606, + "step": 1097 + }, + { + "epoch": 1.2352279122115926, + "grad_norm": 0.27019773248184603, + "learning_rate": 3.266583229036296e-05, + "loss": 0.3761, + "step": 1098 + }, + { + "epoch": 1.2363534046145188, + "grad_norm": 0.3336597777910512, + "learning_rate": 3.264497288277013e-05, + "loss": 0.3959, + "step": 1099 + }, + { + "epoch": 1.237478897017445, + "grad_norm": 0.31907940795687006, + "learning_rate": 3.262411347517731e-05, + "loss": 0.3615, + "step": 1100 + }, + { + "epoch": 1.2386043894203713, + "grad_norm": 0.3307050654215005, + "learning_rate": 3.260325406758448e-05, + "loss": 0.3705, + "step": 1101 + }, + { + "epoch": 1.2397298818232976, + "grad_norm": 0.31539392998297533, + "learning_rate": 3.258239465999165e-05, + "loss": 0.3495, + "step": 1102 + }, + { + "epoch": 1.240855374226224, + "grad_norm": 0.33795761226174353, + "learning_rate": 3.2561535252398836e-05, + "loss": 0.3268, + "step": 1103 + }, + { + "epoch": 1.2419808666291503, + "grad_norm": 0.3027820655749601, + "learning_rate": 3.2540675844806006e-05, + "loss": 0.3408, + "step": 1104 + }, + { + "epoch": 1.2431063590320766, + "grad_norm": 0.3679583282318403, + "learning_rate": 3.251981643721318e-05, + "loss": 0.3677, + "step": 1105 + }, + { + "epoch": 1.2442318514350028, + "grad_norm": 0.3613263755121883, + "learning_rate": 3.249895702962036e-05, + "loss": 0.3829, + "step": 1106 + }, + { + "epoch": 1.245357343837929, + "grad_norm": 0.24750292423243272, + "learning_rate": 3.247809762202754e-05, + "loss": 0.3482, + "step": 1107 + }, + { + "epoch": 1.2464828362408553, + "grad_norm": 0.3522480210531916, + "learning_rate": 3.245723821443471e-05, + "loss": 0.3625, + "step": 1108 + }, + { + "epoch": 1.2476083286437816, + "grad_norm": 0.31652713432842655, + "learning_rate": 3.243637880684189e-05, + "loss": 0.3671, + "step": 1109 + }, + { + "epoch": 1.248733821046708, + "grad_norm": 0.2933496822923214, + "learning_rate": 3.241551939924906e-05, + "loss": 0.3497, + "step": 1110 + }, + { + "epoch": 1.2498593134496343, + "grad_norm": 0.32050895251241057, + "learning_rate": 3.239465999165624e-05, + "loss": 0.3716, + "step": 1111 + }, + { + "epoch": 1.2509848058525606, + "grad_norm": 0.3121935413715743, + "learning_rate": 3.2373800584063415e-05, + "loss": 0.3771, + "step": 1112 + }, + { + "epoch": 1.2521102982554868, + "grad_norm": 0.32265588805772627, + "learning_rate": 3.235294117647059e-05, + "loss": 0.3439, + "step": 1113 + }, + { + "epoch": 1.253235790658413, + "grad_norm": 0.3064712619565091, + "learning_rate": 3.233208176887776e-05, + "loss": 0.3546, + "step": 1114 + }, + { + "epoch": 1.2543612830613393, + "grad_norm": 0.2996084699077036, + "learning_rate": 3.2311222361284946e-05, + "loss": 0.3833, + "step": 1115 + }, + { + "epoch": 1.2554867754642656, + "grad_norm": 0.3107489193677045, + "learning_rate": 3.2290362953692116e-05, + "loss": 0.364, + "step": 1116 + }, + { + "epoch": 1.2566122678671918, + "grad_norm": 0.29187918061969403, + "learning_rate": 3.226950354609929e-05, + "loss": 0.3504, + "step": 1117 + }, + { + "epoch": 1.257737760270118, + "grad_norm": 0.2941379965996245, + "learning_rate": 3.224864413850647e-05, + "loss": 0.3392, + "step": 1118 + }, + { + "epoch": 1.2588632526730446, + "grad_norm": 0.25115923486308955, + "learning_rate": 3.222778473091365e-05, + "loss": 0.3876, + "step": 1119 + }, + { + "epoch": 1.2599887450759708, + "grad_norm": 0.3252010811279875, + "learning_rate": 3.220692532332082e-05, + "loss": 0.3375, + "step": 1120 + }, + { + "epoch": 1.261114237478897, + "grad_norm": 0.29814156629055977, + "learning_rate": 3.2186065915727994e-05, + "loss": 0.3401, + "step": 1121 + }, + { + "epoch": 1.2622397298818233, + "grad_norm": 0.31902570430326976, + "learning_rate": 3.216520650813517e-05, + "loss": 0.3732, + "step": 1122 + }, + { + "epoch": 1.2633652222847496, + "grad_norm": 0.3010703802720578, + "learning_rate": 3.214434710054234e-05, + "loss": 0.358, + "step": 1123 + }, + { + "epoch": 1.2644907146876758, + "grad_norm": 0.32852710550779146, + "learning_rate": 3.2123487692949525e-05, + "loss": 0.3525, + "step": 1124 + }, + { + "epoch": 1.265616207090602, + "grad_norm": 0.32212180119638056, + "learning_rate": 3.2102628285356696e-05, + "loss": 0.3749, + "step": 1125 + }, + { + "epoch": 1.2667416994935286, + "grad_norm": 0.378384113691716, + "learning_rate": 3.208176887776387e-05, + "loss": 0.3669, + "step": 1126 + }, + { + "epoch": 1.2678671918964546, + "grad_norm": 0.31165403587755924, + "learning_rate": 3.206090947017105e-05, + "loss": 0.3559, + "step": 1127 + }, + { + "epoch": 1.268992684299381, + "grad_norm": 0.3679615615830758, + "learning_rate": 3.2040050062578227e-05, + "loss": 0.3623, + "step": 1128 + }, + { + "epoch": 1.2701181767023073, + "grad_norm": 0.3467806488910905, + "learning_rate": 3.20191906549854e-05, + "loss": 0.3771, + "step": 1129 + }, + { + "epoch": 1.2712436691052336, + "grad_norm": 1.9333645104311041, + "learning_rate": 3.1998331247392574e-05, + "loss": 0.3809, + "step": 1130 + }, + { + "epoch": 1.2723691615081598, + "grad_norm": 0.4215069325465578, + "learning_rate": 3.197747183979975e-05, + "loss": 0.3489, + "step": 1131 + }, + { + "epoch": 1.273494653911086, + "grad_norm": 0.2879811482225369, + "learning_rate": 3.195661243220693e-05, + "loss": 0.3627, + "step": 1132 + }, + { + "epoch": 1.2746201463140123, + "grad_norm": 0.4477759704739148, + "learning_rate": 3.19357530246141e-05, + "loss": 0.3623, + "step": 1133 + }, + { + "epoch": 1.2757456387169386, + "grad_norm": 0.3424164269682256, + "learning_rate": 3.191489361702128e-05, + "loss": 0.3476, + "step": 1134 + }, + { + "epoch": 1.276871131119865, + "grad_norm": 0.32862691867356353, + "learning_rate": 3.189403420942845e-05, + "loss": 0.3649, + "step": 1135 + }, + { + "epoch": 1.277996623522791, + "grad_norm": 0.3209270264744574, + "learning_rate": 3.187317480183563e-05, + "loss": 0.3535, + "step": 1136 + }, + { + "epoch": 1.2791221159257176, + "grad_norm": 0.3565891148820592, + "learning_rate": 3.1852315394242806e-05, + "loss": 0.3443, + "step": 1137 + }, + { + "epoch": 1.2802476083286438, + "grad_norm": 0.28408074419058515, + "learning_rate": 3.183145598664998e-05, + "loss": 0.369, + "step": 1138 + }, + { + "epoch": 1.28137310073157, + "grad_norm": 0.3637840011075196, + "learning_rate": 3.181059657905715e-05, + "loss": 0.3608, + "step": 1139 + }, + { + "epoch": 1.2824985931344963, + "grad_norm": 0.3595209908718878, + "learning_rate": 3.178973717146434e-05, + "loss": 0.3493, + "step": 1140 + }, + { + "epoch": 1.2836240855374226, + "grad_norm": 0.26496883846043384, + "learning_rate": 3.176887776387151e-05, + "loss": 0.3633, + "step": 1141 + }, + { + "epoch": 1.2847495779403488, + "grad_norm": 0.3336179309407727, + "learning_rate": 3.1748018356278684e-05, + "loss": 0.3662, + "step": 1142 + }, + { + "epoch": 1.285875070343275, + "grad_norm": 0.32668676414933834, + "learning_rate": 3.172715894868586e-05, + "loss": 0.3671, + "step": 1143 + }, + { + "epoch": 1.2870005627462016, + "grad_norm": 0.31252062188747054, + "learning_rate": 3.170629954109303e-05, + "loss": 0.3647, + "step": 1144 + }, + { + "epoch": 1.2881260551491278, + "grad_norm": 0.31744497936057164, + "learning_rate": 3.168544013350021e-05, + "loss": 0.3622, + "step": 1145 + }, + { + "epoch": 1.289251547552054, + "grad_norm": 0.2862050055745393, + "learning_rate": 3.1664580725907385e-05, + "loss": 0.3883, + "step": 1146 + }, + { + "epoch": 1.2903770399549803, + "grad_norm": 0.30021118499678395, + "learning_rate": 3.164372131831456e-05, + "loss": 0.3579, + "step": 1147 + }, + { + "epoch": 1.2915025323579066, + "grad_norm": 0.2910467656286127, + "learning_rate": 3.162286191072173e-05, + "loss": 0.3534, + "step": 1148 + }, + { + "epoch": 1.2926280247608328, + "grad_norm": 0.28678455388133556, + "learning_rate": 3.1602002503128916e-05, + "loss": 0.3497, + "step": 1149 + }, + { + "epoch": 1.293753517163759, + "grad_norm": 0.27836486011517614, + "learning_rate": 3.1581143095536086e-05, + "loss": 0.3443, + "step": 1150 + }, + { + "epoch": 1.2948790095666856, + "grad_norm": 0.30812952315806486, + "learning_rate": 3.156028368794326e-05, + "loss": 0.3893, + "step": 1151 + }, + { + "epoch": 1.2960045019696116, + "grad_norm": 0.2874885069684301, + "learning_rate": 3.153942428035044e-05, + "loss": 0.3685, + "step": 1152 + }, + { + "epoch": 1.297129994372538, + "grad_norm": 0.3347706854010768, + "learning_rate": 3.151856487275762e-05, + "loss": 0.3927, + "step": 1153 + }, + { + "epoch": 1.2982554867754643, + "grad_norm": 0.32176469835749927, + "learning_rate": 3.149770546516479e-05, + "loss": 0.38, + "step": 1154 + }, + { + "epoch": 1.2993809791783906, + "grad_norm": 0.2898256632538439, + "learning_rate": 3.147684605757197e-05, + "loss": 0.3616, + "step": 1155 + }, + { + "epoch": 1.3005064715813168, + "grad_norm": 0.33352792276895776, + "learning_rate": 3.145598664997914e-05, + "loss": 0.3748, + "step": 1156 + }, + { + "epoch": 1.301631963984243, + "grad_norm": 0.3099488701941323, + "learning_rate": 3.143512724238632e-05, + "loss": 0.372, + "step": 1157 + }, + { + "epoch": 1.3027574563871693, + "grad_norm": 0.30363398725151736, + "learning_rate": 3.1414267834793495e-05, + "loss": 0.3701, + "step": 1158 + }, + { + "epoch": 1.3038829487900956, + "grad_norm": 0.28447681088914367, + "learning_rate": 3.139340842720067e-05, + "loss": 0.3774, + "step": 1159 + }, + { + "epoch": 1.305008441193022, + "grad_norm": 0.3003448198337203, + "learning_rate": 3.137254901960784e-05, + "loss": 0.3598, + "step": 1160 + }, + { + "epoch": 1.306133933595948, + "grad_norm": 0.3379084483923677, + "learning_rate": 3.135168961201502e-05, + "loss": 0.3708, + "step": 1161 + }, + { + "epoch": 1.3072594259988746, + "grad_norm": 0.28091894310377574, + "learning_rate": 3.13308302044222e-05, + "loss": 0.3718, + "step": 1162 + }, + { + "epoch": 1.3083849184018008, + "grad_norm": 0.33666696805419777, + "learning_rate": 3.1309970796829374e-05, + "loss": 0.3521, + "step": 1163 + }, + { + "epoch": 1.309510410804727, + "grad_norm": 0.2784271381389026, + "learning_rate": 3.128911138923655e-05, + "loss": 0.365, + "step": 1164 + }, + { + "epoch": 1.3106359032076533, + "grad_norm": 0.32996125555463496, + "learning_rate": 3.126825198164372e-05, + "loss": 0.3663, + "step": 1165 + }, + { + "epoch": 1.3117613956105796, + "grad_norm": 0.29339874231665497, + "learning_rate": 3.12473925740509e-05, + "loss": 0.3579, + "step": 1166 + }, + { + "epoch": 1.3128868880135058, + "grad_norm": 0.27539689603551204, + "learning_rate": 3.1226533166458075e-05, + "loss": 0.3428, + "step": 1167 + }, + { + "epoch": 1.314012380416432, + "grad_norm": 0.30586051604779685, + "learning_rate": 3.120567375886525e-05, + "loss": 0.3742, + "step": 1168 + }, + { + "epoch": 1.3151378728193586, + "grad_norm": 0.318506663490862, + "learning_rate": 3.118481435127242e-05, + "loss": 0.3555, + "step": 1169 + }, + { + "epoch": 1.3162633652222848, + "grad_norm": 0.32193633165736774, + "learning_rate": 3.11639549436796e-05, + "loss": 0.3798, + "step": 1170 + }, + { + "epoch": 1.317388857625211, + "grad_norm": 0.26006223637970205, + "learning_rate": 3.1143095536086776e-05, + "loss": 0.3688, + "step": 1171 + }, + { + "epoch": 1.3185143500281373, + "grad_norm": 0.3091644393869938, + "learning_rate": 3.112223612849395e-05, + "loss": 0.344, + "step": 1172 + }, + { + "epoch": 1.3196398424310636, + "grad_norm": 0.2950119209666744, + "learning_rate": 3.110137672090112e-05, + "loss": 0.3807, + "step": 1173 + }, + { + "epoch": 1.3207653348339898, + "grad_norm": 0.2975497460062189, + "learning_rate": 3.108051731330831e-05, + "loss": 0.3581, + "step": 1174 + }, + { + "epoch": 1.321890827236916, + "grad_norm": 0.3248317526622501, + "learning_rate": 3.105965790571548e-05, + "loss": 0.3455, + "step": 1175 + }, + { + "epoch": 1.3230163196398426, + "grad_norm": 0.28753503994393237, + "learning_rate": 3.1038798498122654e-05, + "loss": 0.3526, + "step": 1176 + }, + { + "epoch": 1.3241418120427686, + "grad_norm": 0.309321520103074, + "learning_rate": 3.101793909052983e-05, + "loss": 0.3671, + "step": 1177 + }, + { + "epoch": 1.325267304445695, + "grad_norm": 0.31093843252643993, + "learning_rate": 3.099707968293701e-05, + "loss": 0.3797, + "step": 1178 + }, + { + "epoch": 1.3263927968486213, + "grad_norm": 0.2941320554481767, + "learning_rate": 3.097622027534418e-05, + "loss": 0.3656, + "step": 1179 + }, + { + "epoch": 1.3275182892515476, + "grad_norm": 0.33353760439258306, + "learning_rate": 3.095536086775136e-05, + "loss": 0.3659, + "step": 1180 + }, + { + "epoch": 1.3286437816544738, + "grad_norm": 0.2569769588199655, + "learning_rate": 3.093450146015853e-05, + "loss": 0.3842, + "step": 1181 + }, + { + "epoch": 1.3297692740574, + "grad_norm": 0.3330169931726158, + "learning_rate": 3.091364205256571e-05, + "loss": 0.3643, + "step": 1182 + }, + { + "epoch": 1.3308947664603263, + "grad_norm": 0.2876950022849873, + "learning_rate": 3.0892782644972886e-05, + "loss": 0.357, + "step": 1183 + }, + { + "epoch": 1.3320202588632526, + "grad_norm": 0.31915621662192034, + "learning_rate": 3.087192323738006e-05, + "loss": 0.3817, + "step": 1184 + }, + { + "epoch": 1.333145751266179, + "grad_norm": 0.31039603557721346, + "learning_rate": 3.085106382978723e-05, + "loss": 0.3705, + "step": 1185 + }, + { + "epoch": 1.334271243669105, + "grad_norm": 0.35493105743167, + "learning_rate": 3.083020442219441e-05, + "loss": 0.375, + "step": 1186 + }, + { + "epoch": 1.3353967360720316, + "grad_norm": 0.2887229895411605, + "learning_rate": 3.080934501460159e-05, + "loss": 0.3705, + "step": 1187 + }, + { + "epoch": 1.3365222284749578, + "grad_norm": 0.33105501034844814, + "learning_rate": 3.078848560700876e-05, + "loss": 0.3517, + "step": 1188 + }, + { + "epoch": 1.337647720877884, + "grad_norm": 0.296938807012652, + "learning_rate": 3.076762619941594e-05, + "loss": 0.3569, + "step": 1189 + }, + { + "epoch": 1.3387732132808103, + "grad_norm": 0.2631825890999598, + "learning_rate": 3.074676679182311e-05, + "loss": 0.3719, + "step": 1190 + }, + { + "epoch": 1.3398987056837366, + "grad_norm": 0.3302060927278823, + "learning_rate": 3.072590738423029e-05, + "loss": 0.357, + "step": 1191 + }, + { + "epoch": 1.3410241980866628, + "grad_norm": 0.284964127664028, + "learning_rate": 3.0705047976637465e-05, + "loss": 0.3637, + "step": 1192 + }, + { + "epoch": 1.342149690489589, + "grad_norm": 0.3450486482153124, + "learning_rate": 3.068418856904464e-05, + "loss": 0.3545, + "step": 1193 + }, + { + "epoch": 1.3432751828925156, + "grad_norm": 0.3097272198664404, + "learning_rate": 3.066332916145181e-05, + "loss": 0.3615, + "step": 1194 + }, + { + "epoch": 1.3444006752954418, + "grad_norm": 0.31428949130893125, + "learning_rate": 3.0642469753858996e-05, + "loss": 0.3577, + "step": 1195 + }, + { + "epoch": 1.345526167698368, + "grad_norm": 0.3459630485656923, + "learning_rate": 3.062161034626617e-05, + "loss": 0.38, + "step": 1196 + }, + { + "epoch": 1.3466516601012943, + "grad_norm": 0.34840227455144135, + "learning_rate": 3.0600750938673344e-05, + "loss": 0.3731, + "step": 1197 + }, + { + "epoch": 1.3477771525042206, + "grad_norm": 0.36198967619880806, + "learning_rate": 3.057989153108052e-05, + "loss": 0.3637, + "step": 1198 + }, + { + "epoch": 1.3489026449071468, + "grad_norm": 0.36980762481338214, + "learning_rate": 3.05590321234877e-05, + "loss": 0.3545, + "step": 1199 + }, + { + "epoch": 1.350028137310073, + "grad_norm": 0.34738553940185973, + "learning_rate": 3.053817271589487e-05, + "loss": 0.358, + "step": 1200 + }, + { + "epoch": 1.3511536297129996, + "grad_norm": 0.28899221690961746, + "learning_rate": 3.0517313308302048e-05, + "loss": 0.3692, + "step": 1201 + }, + { + "epoch": 1.3522791221159256, + "grad_norm": 0.348414484092682, + "learning_rate": 3.0496453900709222e-05, + "loss": 0.3562, + "step": 1202 + }, + { + "epoch": 1.353404614518852, + "grad_norm": 0.28821763744716605, + "learning_rate": 3.04755944931164e-05, + "loss": 0.3623, + "step": 1203 + }, + { + "epoch": 1.3545301069217783, + "grad_norm": 0.34701713808150375, + "learning_rate": 3.0454735085523572e-05, + "loss": 0.3667, + "step": 1204 + }, + { + "epoch": 1.3556555993247046, + "grad_norm": 0.34731368442368854, + "learning_rate": 3.0433875677930746e-05, + "loss": 0.3767, + "step": 1205 + }, + { + "epoch": 1.3567810917276308, + "grad_norm": 0.2873367344993991, + "learning_rate": 3.0413016270337923e-05, + "loss": 0.3594, + "step": 1206 + }, + { + "epoch": 1.357906584130557, + "grad_norm": 0.33455871764963324, + "learning_rate": 3.0392156862745097e-05, + "loss": 0.3649, + "step": 1207 + }, + { + "epoch": 1.3590320765334833, + "grad_norm": 0.3444401979121362, + "learning_rate": 3.0371297455152277e-05, + "loss": 0.3656, + "step": 1208 + }, + { + "epoch": 1.3601575689364096, + "grad_norm": 0.2899964778052406, + "learning_rate": 3.0350438047559447e-05, + "loss": 0.3457, + "step": 1209 + }, + { + "epoch": 1.361283061339336, + "grad_norm": 0.33907151317470086, + "learning_rate": 3.0329578639966627e-05, + "loss": 0.3627, + "step": 1210 + }, + { + "epoch": 1.362408553742262, + "grad_norm": 0.3044719983267248, + "learning_rate": 3.03087192323738e-05, + "loss": 0.3654, + "step": 1211 + }, + { + "epoch": 1.3635340461451886, + "grad_norm": 0.3161562860694256, + "learning_rate": 3.0287859824780978e-05, + "loss": 0.342, + "step": 1212 + }, + { + "epoch": 1.3646595385481148, + "grad_norm": 0.3068022604762919, + "learning_rate": 3.026700041718815e-05, + "loss": 0.3539, + "step": 1213 + }, + { + "epoch": 1.365785030951041, + "grad_norm": 0.2850677267934718, + "learning_rate": 3.024614100959533e-05, + "loss": 0.3725, + "step": 1214 + }, + { + "epoch": 1.3669105233539673, + "grad_norm": 0.3160554970609396, + "learning_rate": 3.0225281602002502e-05, + "loss": 0.3868, + "step": 1215 + }, + { + "epoch": 1.3680360157568936, + "grad_norm": 0.3373572652606873, + "learning_rate": 3.0204422194409683e-05, + "loss": 0.3711, + "step": 1216 + }, + { + "epoch": 1.3691615081598199, + "grad_norm": 0.27352074329674897, + "learning_rate": 3.0183562786816856e-05, + "loss": 0.3593, + "step": 1217 + }, + { + "epoch": 1.370287000562746, + "grad_norm": 0.32866686300985715, + "learning_rate": 3.0162703379224033e-05, + "loss": 0.3547, + "step": 1218 + }, + { + "epoch": 1.3714124929656726, + "grad_norm": 0.28785334578687116, + "learning_rate": 3.0141843971631207e-05, + "loss": 0.3703, + "step": 1219 + }, + { + "epoch": 1.3725379853685988, + "grad_norm": 0.3336610653280944, + "learning_rate": 3.0120984564038384e-05, + "loss": 0.3594, + "step": 1220 + }, + { + "epoch": 1.373663477771525, + "grad_norm": 0.3252899566864213, + "learning_rate": 3.0100125156445557e-05, + "loss": 0.3666, + "step": 1221 + }, + { + "epoch": 1.3747889701744513, + "grad_norm": 0.34272700759634595, + "learning_rate": 3.0079265748852738e-05, + "loss": 0.3673, + "step": 1222 + }, + { + "epoch": 1.3759144625773776, + "grad_norm": 0.2839456991740852, + "learning_rate": 3.0058406341259908e-05, + "loss": 0.3731, + "step": 1223 + }, + { + "epoch": 1.3770399549803038, + "grad_norm": 0.34144486456169987, + "learning_rate": 3.0037546933667088e-05, + "loss": 0.3535, + "step": 1224 + }, + { + "epoch": 1.37816544738323, + "grad_norm": 0.3510452096605386, + "learning_rate": 3.0016687526074262e-05, + "loss": 0.3783, + "step": 1225 + }, + { + "epoch": 1.3792909397861566, + "grad_norm": 0.3219709429564443, + "learning_rate": 2.9995828118481435e-05, + "loss": 0.3653, + "step": 1226 + }, + { + "epoch": 1.3804164321890826, + "grad_norm": 0.2922536824083754, + "learning_rate": 2.9974968710888612e-05, + "loss": 0.3487, + "step": 1227 + }, + { + "epoch": 1.381541924592009, + "grad_norm": 0.3146465080311366, + "learning_rate": 2.9954109303295786e-05, + "loss": 0.3582, + "step": 1228 + }, + { + "epoch": 1.3826674169949353, + "grad_norm": 0.3202141542926466, + "learning_rate": 2.9933249895702963e-05, + "loss": 0.3767, + "step": 1229 + }, + { + "epoch": 1.3837929093978616, + "grad_norm": 0.3126664207698992, + "learning_rate": 2.9912390488110137e-05, + "loss": 0.3861, + "step": 1230 + }, + { + "epoch": 1.3849184018007878, + "grad_norm": 0.3006754510665695, + "learning_rate": 2.9891531080517317e-05, + "loss": 0.3826, + "step": 1231 + }, + { + "epoch": 1.386043894203714, + "grad_norm": 0.34183231562741717, + "learning_rate": 2.9870671672924487e-05, + "loss": 0.3384, + "step": 1232 + }, + { + "epoch": 1.3871693866066404, + "grad_norm": 0.2981637621431096, + "learning_rate": 2.9849812265331668e-05, + "loss": 0.3644, + "step": 1233 + }, + { + "epoch": 1.3882948790095666, + "grad_norm": 0.32927113911951866, + "learning_rate": 2.982895285773884e-05, + "loss": 0.37, + "step": 1234 + }, + { + "epoch": 1.389420371412493, + "grad_norm": 0.3516964621170918, + "learning_rate": 2.9808093450146018e-05, + "loss": 0.3734, + "step": 1235 + }, + { + "epoch": 1.3905458638154191, + "grad_norm": 0.28294383540669815, + "learning_rate": 2.9787234042553192e-05, + "loss": 0.3545, + "step": 1236 + }, + { + "epoch": 1.3916713562183456, + "grad_norm": 0.36437808290704293, + "learning_rate": 2.976637463496037e-05, + "loss": 0.3626, + "step": 1237 + }, + { + "epoch": 1.3927968486212718, + "grad_norm": 0.31704230935830585, + "learning_rate": 2.9745515227367542e-05, + "loss": 0.3536, + "step": 1238 + }, + { + "epoch": 1.393922341024198, + "grad_norm": 0.3234586646771036, + "learning_rate": 2.9724655819774723e-05, + "loss": 0.3373, + "step": 1239 + }, + { + "epoch": 1.3950478334271244, + "grad_norm": 0.3108185575862165, + "learning_rate": 2.9703796412181893e-05, + "loss": 0.3685, + "step": 1240 + }, + { + "epoch": 1.3961733258300506, + "grad_norm": 0.3752154178147501, + "learning_rate": 2.9682937004589073e-05, + "loss": 0.3676, + "step": 1241 + }, + { + "epoch": 1.3972988182329769, + "grad_norm": 0.2884249248162915, + "learning_rate": 2.9662077596996247e-05, + "loss": 0.3806, + "step": 1242 + }, + { + "epoch": 1.3984243106359031, + "grad_norm": 0.34853899665658195, + "learning_rate": 2.9641218189403424e-05, + "loss": 0.35, + "step": 1243 + }, + { + "epoch": 1.3995498030388296, + "grad_norm": 0.38250560263603123, + "learning_rate": 2.9620358781810597e-05, + "loss": 0.3671, + "step": 1244 + }, + { + "epoch": 1.4006752954417558, + "grad_norm": 0.31599487532627424, + "learning_rate": 2.9599499374217778e-05, + "loss": 0.3833, + "step": 1245 + }, + { + "epoch": 1.401800787844682, + "grad_norm": 0.3500438918178945, + "learning_rate": 2.9578639966624948e-05, + "loss": 0.365, + "step": 1246 + }, + { + "epoch": 1.4029262802476083, + "grad_norm": 0.41585464581458353, + "learning_rate": 2.955778055903212e-05, + "loss": 0.3918, + "step": 1247 + }, + { + "epoch": 1.4040517726505346, + "grad_norm": 0.314777082319376, + "learning_rate": 2.9536921151439302e-05, + "loss": 0.3657, + "step": 1248 + }, + { + "epoch": 1.4051772650534609, + "grad_norm": 0.3941954750192581, + "learning_rate": 2.9516061743846472e-05, + "loss": 0.3472, + "step": 1249 + }, + { + "epoch": 1.406302757456387, + "grad_norm": 0.40917669615827007, + "learning_rate": 2.9495202336253653e-05, + "loss": 0.3646, + "step": 1250 + }, + { + "epoch": 1.4074282498593136, + "grad_norm": 0.32821878108438296, + "learning_rate": 2.9474342928660826e-05, + "loss": 0.3636, + "step": 1251 + }, + { + "epoch": 1.4085537422622396, + "grad_norm": 0.4247196049076011, + "learning_rate": 2.9453483521068003e-05, + "loss": 0.3412, + "step": 1252 + }, + { + "epoch": 1.409679234665166, + "grad_norm": 0.3851171427422802, + "learning_rate": 2.9432624113475177e-05, + "loss": 0.3819, + "step": 1253 + }, + { + "epoch": 1.4108047270680923, + "grad_norm": 0.3469070969772743, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.3616, + "step": 1254 + }, + { + "epoch": 1.4119302194710186, + "grad_norm": 0.444050908424801, + "learning_rate": 2.9390905298289527e-05, + "loss": 0.3565, + "step": 1255 + }, + { + "epoch": 1.4130557118739449, + "grad_norm": 0.2879248830093118, + "learning_rate": 2.9370045890696708e-05, + "loss": 0.3601, + "step": 1256 + }, + { + "epoch": 1.414181204276871, + "grad_norm": 0.4483867224495224, + "learning_rate": 2.934918648310388e-05, + "loss": 0.3682, + "step": 1257 + }, + { + "epoch": 1.4153066966797974, + "grad_norm": 0.3000179145995747, + "learning_rate": 2.9328327075511058e-05, + "loss": 0.3636, + "step": 1258 + }, + { + "epoch": 1.4164321890827236, + "grad_norm": 0.38516280700639255, + "learning_rate": 2.9307467667918232e-05, + "loss": 0.3641, + "step": 1259 + }, + { + "epoch": 1.41755768148565, + "grad_norm": 0.3233722496864056, + "learning_rate": 2.928660826032541e-05, + "loss": 0.353, + "step": 1260 + }, + { + "epoch": 1.4186831738885761, + "grad_norm": 0.35206171266037123, + "learning_rate": 2.9265748852732582e-05, + "loss": 0.3715, + "step": 1261 + }, + { + "epoch": 1.4198086662915026, + "grad_norm": 0.3416243699378654, + "learning_rate": 2.9244889445139763e-05, + "loss": 0.374, + "step": 1262 + }, + { + "epoch": 1.4209341586944289, + "grad_norm": 0.354667895796559, + "learning_rate": 2.9224030037546933e-05, + "loss": 0.3587, + "step": 1263 + }, + { + "epoch": 1.422059651097355, + "grad_norm": 0.35090199403773553, + "learning_rate": 2.9203170629954113e-05, + "loss": 0.3679, + "step": 1264 + }, + { + "epoch": 1.4231851435002814, + "grad_norm": 0.37870594240912114, + "learning_rate": 2.9182311222361287e-05, + "loss": 0.3497, + "step": 1265 + }, + { + "epoch": 1.4243106359032076, + "grad_norm": 0.3281629547838438, + "learning_rate": 2.9161451814768464e-05, + "loss": 0.3457, + "step": 1266 + }, + { + "epoch": 1.4254361283061339, + "grad_norm": 0.35899900559464903, + "learning_rate": 2.9140592407175638e-05, + "loss": 0.3627, + "step": 1267 + }, + { + "epoch": 1.4265616207090601, + "grad_norm": 0.30995421196825085, + "learning_rate": 2.911973299958281e-05, + "loss": 0.3939, + "step": 1268 + }, + { + "epoch": 1.4276871131119866, + "grad_norm": 0.3707647704012859, + "learning_rate": 2.9098873591989988e-05, + "loss": 0.3683, + "step": 1269 + }, + { + "epoch": 1.4288126055149128, + "grad_norm": 0.3493957956050667, + "learning_rate": 2.9078014184397162e-05, + "loss": 0.3922, + "step": 1270 + }, + { + "epoch": 1.429938097917839, + "grad_norm": 0.3392405036440076, + "learning_rate": 2.9057154776804342e-05, + "loss": 0.3675, + "step": 1271 + }, + { + "epoch": 1.4310635903207654, + "grad_norm": 0.42908164729503306, + "learning_rate": 2.9036295369211512e-05, + "loss": 0.3497, + "step": 1272 + }, + { + "epoch": 1.4321890827236916, + "grad_norm": 0.3217604733902604, + "learning_rate": 2.9015435961618693e-05, + "loss": 0.3419, + "step": 1273 + }, + { + "epoch": 1.4333145751266179, + "grad_norm": 0.285513498828739, + "learning_rate": 2.8994576554025866e-05, + "loss": 0.3525, + "step": 1274 + }, + { + "epoch": 1.4344400675295441, + "grad_norm": 0.31722207811402275, + "learning_rate": 2.8973717146433043e-05, + "loss": 0.3537, + "step": 1275 + }, + { + "epoch": 1.4355655599324706, + "grad_norm": 0.3425240574013163, + "learning_rate": 2.8952857738840217e-05, + "loss": 0.3536, + "step": 1276 + }, + { + "epoch": 1.4366910523353966, + "grad_norm": 0.37106807604623343, + "learning_rate": 2.8931998331247394e-05, + "loss": 0.3495, + "step": 1277 + }, + { + "epoch": 1.437816544738323, + "grad_norm": 0.28250215963205166, + "learning_rate": 2.8911138923654567e-05, + "loss": 0.3481, + "step": 1278 + }, + { + "epoch": 1.4389420371412494, + "grad_norm": 0.3549294685688371, + "learning_rate": 2.8890279516061748e-05, + "loss": 0.376, + "step": 1279 + }, + { + "epoch": 1.4400675295441756, + "grad_norm": 0.30171989887856726, + "learning_rate": 2.886942010846892e-05, + "loss": 0.3755, + "step": 1280 + }, + { + "epoch": 1.4411930219471019, + "grad_norm": 0.3139221398315422, + "learning_rate": 2.88485607008761e-05, + "loss": 0.3513, + "step": 1281 + }, + { + "epoch": 1.4423185143500281, + "grad_norm": 0.3174768026324268, + "learning_rate": 2.8827701293283272e-05, + "loss": 0.3569, + "step": 1282 + }, + { + "epoch": 1.4434440067529544, + "grad_norm": 0.31543813617403643, + "learning_rate": 2.880684188569045e-05, + "loss": 0.3551, + "step": 1283 + }, + { + "epoch": 1.4445694991558806, + "grad_norm": 0.34405513448928665, + "learning_rate": 2.8785982478097623e-05, + "loss": 0.3615, + "step": 1284 + }, + { + "epoch": 1.445694991558807, + "grad_norm": 0.3314207471763474, + "learning_rate": 2.8765123070504803e-05, + "loss": 0.364, + "step": 1285 + }, + { + "epoch": 1.4468204839617331, + "grad_norm": 0.30011672219405916, + "learning_rate": 2.8744263662911973e-05, + "loss": 0.3541, + "step": 1286 + }, + { + "epoch": 1.4479459763646596, + "grad_norm": 0.36939176440073757, + "learning_rate": 2.8723404255319154e-05, + "loss": 0.3466, + "step": 1287 + }, + { + "epoch": 1.4490714687675859, + "grad_norm": 0.38877177781745204, + "learning_rate": 2.8702544847726327e-05, + "loss": 0.3786, + "step": 1288 + }, + { + "epoch": 1.4501969611705121, + "grad_norm": 0.3409728207807626, + "learning_rate": 2.8681685440133497e-05, + "loss": 0.3791, + "step": 1289 + }, + { + "epoch": 1.4513224535734384, + "grad_norm": 0.42817414825924877, + "learning_rate": 2.8660826032540678e-05, + "loss": 0.3491, + "step": 1290 + }, + { + "epoch": 1.4524479459763646, + "grad_norm": 0.35198856950809654, + "learning_rate": 2.863996662494785e-05, + "loss": 0.3679, + "step": 1291 + }, + { + "epoch": 1.4535734383792909, + "grad_norm": 0.32113222683338927, + "learning_rate": 2.861910721735503e-05, + "loss": 0.3467, + "step": 1292 + }, + { + "epoch": 1.4546989307822171, + "grad_norm": 0.321401722942131, + "learning_rate": 2.8598247809762202e-05, + "loss": 0.3565, + "step": 1293 + }, + { + "epoch": 1.4558244231851436, + "grad_norm": 0.3506970283170861, + "learning_rate": 2.857738840216938e-05, + "loss": 0.3691, + "step": 1294 + }, + { + "epoch": 1.4569499155880699, + "grad_norm": 0.3366249220650027, + "learning_rate": 2.8556528994576552e-05, + "loss": 0.3679, + "step": 1295 + }, + { + "epoch": 1.458075407990996, + "grad_norm": 0.3863825955582127, + "learning_rate": 2.8535669586983733e-05, + "loss": 0.3656, + "step": 1296 + }, + { + "epoch": 1.4592009003939224, + "grad_norm": 0.3294265653501134, + "learning_rate": 2.8514810179390906e-05, + "loss": 0.363, + "step": 1297 + }, + { + "epoch": 1.4603263927968486, + "grad_norm": 0.3400753421715317, + "learning_rate": 2.8493950771798083e-05, + "loss": 0.3677, + "step": 1298 + }, + { + "epoch": 1.4614518851997749, + "grad_norm": 0.34166651813969356, + "learning_rate": 2.8473091364205257e-05, + "loss": 0.3975, + "step": 1299 + }, + { + "epoch": 1.4625773776027011, + "grad_norm": 0.32157145849641844, + "learning_rate": 2.8452231956612434e-05, + "loss": 0.3599, + "step": 1300 + }, + { + "epoch": 1.4637028700056276, + "grad_norm": 0.29318800149996266, + "learning_rate": 2.8431372549019608e-05, + "loss": 0.3639, + "step": 1301 + }, + { + "epoch": 1.4648283624085536, + "grad_norm": 0.3275747590035082, + "learning_rate": 2.8410513141426788e-05, + "loss": 0.3405, + "step": 1302 + }, + { + "epoch": 1.46595385481148, + "grad_norm": 0.2669444641392893, + "learning_rate": 2.8389653733833958e-05, + "loss": 0.3556, + "step": 1303 + }, + { + "epoch": 1.4670793472144064, + "grad_norm": 0.2984242508636962, + "learning_rate": 2.836879432624114e-05, + "loss": 0.3475, + "step": 1304 + }, + { + "epoch": 1.4682048396173326, + "grad_norm": 0.34309575853919444, + "learning_rate": 2.8347934918648312e-05, + "loss": 0.3236, + "step": 1305 + }, + { + "epoch": 1.4693303320202589, + "grad_norm": 0.3386670126038393, + "learning_rate": 2.832707551105549e-05, + "loss": 0.3692, + "step": 1306 + }, + { + "epoch": 1.4704558244231851, + "grad_norm": 0.3041272096720939, + "learning_rate": 2.8306216103462663e-05, + "loss": 0.3672, + "step": 1307 + }, + { + "epoch": 1.4715813168261114, + "grad_norm": 0.4280504900617411, + "learning_rate": 2.828535669586984e-05, + "loss": 0.3532, + "step": 1308 + }, + { + "epoch": 1.4727068092290376, + "grad_norm": 0.28299915352373894, + "learning_rate": 2.8264497288277013e-05, + "loss": 0.3389, + "step": 1309 + }, + { + "epoch": 1.473832301631964, + "grad_norm": 0.33312026594342037, + "learning_rate": 2.8243637880684187e-05, + "loss": 0.3711, + "step": 1310 + }, + { + "epoch": 1.4749577940348901, + "grad_norm": 0.3324677079496402, + "learning_rate": 2.8222778473091367e-05, + "loss": 0.3637, + "step": 1311 + }, + { + "epoch": 1.4760832864378166, + "grad_norm": 0.3180122895020907, + "learning_rate": 2.8201919065498537e-05, + "loss": 0.3681, + "step": 1312 + }, + { + "epoch": 1.4772087788407429, + "grad_norm": 0.3454913512736821, + "learning_rate": 2.8181059657905718e-05, + "loss": 0.3788, + "step": 1313 + }, + { + "epoch": 1.4783342712436691, + "grad_norm": 0.2954651043434191, + "learning_rate": 2.816020025031289e-05, + "loss": 0.3629, + "step": 1314 + }, + { + "epoch": 1.4794597636465954, + "grad_norm": 0.31225437993089217, + "learning_rate": 2.813934084272007e-05, + "loss": 0.3483, + "step": 1315 + }, + { + "epoch": 1.4805852560495216, + "grad_norm": 0.2931420257748457, + "learning_rate": 2.8118481435127242e-05, + "loss": 0.3443, + "step": 1316 + }, + { + "epoch": 1.4817107484524479, + "grad_norm": 0.3077463608704642, + "learning_rate": 2.809762202753442e-05, + "loss": 0.3562, + "step": 1317 + }, + { + "epoch": 1.4828362408553741, + "grad_norm": 0.2868052518006215, + "learning_rate": 2.8076762619941593e-05, + "loss": 0.3532, + "step": 1318 + }, + { + "epoch": 1.4839617332583006, + "grad_norm": 0.28223423866564457, + "learning_rate": 2.8055903212348773e-05, + "loss": 0.3334, + "step": 1319 + }, + { + "epoch": 1.4850872256612269, + "grad_norm": 0.2934968437108151, + "learning_rate": 2.8035043804755947e-05, + "loss": 0.3609, + "step": 1320 + }, + { + "epoch": 1.4862127180641531, + "grad_norm": 0.3726867856164999, + "learning_rate": 2.8014184397163124e-05, + "loss": 0.3658, + "step": 1321 + }, + { + "epoch": 1.4873382104670794, + "grad_norm": 0.31940065928357514, + "learning_rate": 2.7993324989570297e-05, + "loss": 0.3752, + "step": 1322 + }, + { + "epoch": 1.4884637028700056, + "grad_norm": 0.343528935258811, + "learning_rate": 2.7972465581977474e-05, + "loss": 0.3689, + "step": 1323 + }, + { + "epoch": 1.4895891952729319, + "grad_norm": 0.29324201562634045, + "learning_rate": 2.7951606174384648e-05, + "loss": 0.3701, + "step": 1324 + }, + { + "epoch": 1.4907146876758581, + "grad_norm": 0.307447149562183, + "learning_rate": 2.7930746766791828e-05, + "loss": 0.3623, + "step": 1325 + }, + { + "epoch": 1.4918401800787846, + "grad_norm": 0.3370769636245937, + "learning_rate": 2.7909887359199e-05, + "loss": 0.3599, + "step": 1326 + }, + { + "epoch": 1.4929656724817106, + "grad_norm": 0.2871673492029565, + "learning_rate": 2.788902795160618e-05, + "loss": 0.3719, + "step": 1327 + }, + { + "epoch": 1.4940911648846371, + "grad_norm": 0.36895913560450455, + "learning_rate": 2.7868168544013352e-05, + "loss": 0.3678, + "step": 1328 + }, + { + "epoch": 1.4952166572875634, + "grad_norm": 0.30425325809005394, + "learning_rate": 2.784730913642053e-05, + "loss": 0.3654, + "step": 1329 + }, + { + "epoch": 1.4963421496904896, + "grad_norm": 0.3331261517980334, + "learning_rate": 2.7826449728827703e-05, + "loss": 0.3693, + "step": 1330 + }, + { + "epoch": 1.4974676420934159, + "grad_norm": 0.2798679646502201, + "learning_rate": 2.7805590321234876e-05, + "loss": 0.3709, + "step": 1331 + }, + { + "epoch": 1.4985931344963421, + "grad_norm": 0.37466709997642333, + "learning_rate": 2.7784730913642053e-05, + "loss": 0.363, + "step": 1332 + }, + { + "epoch": 1.4997186268992684, + "grad_norm": 0.35357276036020097, + "learning_rate": 2.7763871506049227e-05, + "loss": 0.3683, + "step": 1333 + }, + { + "epoch": 1.5008441193021946, + "grad_norm": 0.3354334941577856, + "learning_rate": 2.7743012098456404e-05, + "loss": 0.3438, + "step": 1334 + }, + { + "epoch": 1.501969611705121, + "grad_norm": 0.4101041564365979, + "learning_rate": 2.7722152690863578e-05, + "loss": 0.3835, + "step": 1335 + }, + { + "epoch": 1.5030951041080471, + "grad_norm": 0.36255802157025624, + "learning_rate": 2.7701293283270758e-05, + "loss": 0.3477, + "step": 1336 + }, + { + "epoch": 1.5042205965109736, + "grad_norm": 0.4061869558301693, + "learning_rate": 2.768043387567793e-05, + "loss": 0.3688, + "step": 1337 + }, + { + "epoch": 1.5053460889138999, + "grad_norm": 0.41388849066334216, + "learning_rate": 2.765957446808511e-05, + "loss": 0.3672, + "step": 1338 + }, + { + "epoch": 1.5064715813168261, + "grad_norm": 0.330932817498522, + "learning_rate": 2.7638715060492282e-05, + "loss": 0.3643, + "step": 1339 + }, + { + "epoch": 1.5075970737197524, + "grad_norm": 0.3399783880240679, + "learning_rate": 2.761785565289946e-05, + "loss": 0.3529, + "step": 1340 + }, + { + "epoch": 1.5087225661226786, + "grad_norm": 0.28341696113530734, + "learning_rate": 2.7596996245306633e-05, + "loss": 0.3496, + "step": 1341 + }, + { + "epoch": 1.509848058525605, + "grad_norm": 0.334967765352759, + "learning_rate": 2.7576136837713813e-05, + "loss": 0.3415, + "step": 1342 + }, + { + "epoch": 1.5109735509285311, + "grad_norm": 0.2668604468971969, + "learning_rate": 2.7555277430120983e-05, + "loss": 0.3634, + "step": 1343 + }, + { + "epoch": 1.5120990433314576, + "grad_norm": 0.31156721053710673, + "learning_rate": 2.7534418022528164e-05, + "loss": 0.3641, + "step": 1344 + }, + { + "epoch": 1.5132245357343836, + "grad_norm": 0.29754957914675184, + "learning_rate": 2.7513558614935337e-05, + "loss": 0.3785, + "step": 1345 + }, + { + "epoch": 1.5143500281373101, + "grad_norm": 0.2872566093068787, + "learning_rate": 2.7492699207342514e-05, + "loss": 0.3623, + "step": 1346 + }, + { + "epoch": 1.5154755205402364, + "grad_norm": 0.3526852777204813, + "learning_rate": 2.7471839799749688e-05, + "loss": 0.3723, + "step": 1347 + }, + { + "epoch": 1.5166010129431626, + "grad_norm": 0.31241125025784733, + "learning_rate": 2.7450980392156865e-05, + "loss": 0.3507, + "step": 1348 + }, + { + "epoch": 1.5177265053460889, + "grad_norm": 0.3508625079587985, + "learning_rate": 2.743012098456404e-05, + "loss": 0.3608, + "step": 1349 + }, + { + "epoch": 1.5188519977490151, + "grad_norm": 0.32157619105794166, + "learning_rate": 2.740926157697122e-05, + "loss": 0.3581, + "step": 1350 + }, + { + "epoch": 1.5199774901519416, + "grad_norm": 0.3494380418250329, + "learning_rate": 2.7388402169378392e-05, + "loss": 0.3609, + "step": 1351 + }, + { + "epoch": 1.5211029825548676, + "grad_norm": 0.3055065567786005, + "learning_rate": 2.7367542761785563e-05, + "loss": 0.3672, + "step": 1352 + }, + { + "epoch": 1.5222284749577941, + "grad_norm": 0.3950982220672214, + "learning_rate": 2.7346683354192743e-05, + "loss": 0.376, + "step": 1353 + }, + { + "epoch": 1.5233539673607202, + "grad_norm": 0.27852848240062467, + "learning_rate": 2.7325823946599917e-05, + "loss": 0.3485, + "step": 1354 + }, + { + "epoch": 1.5244794597636466, + "grad_norm": 0.3737867565000807, + "learning_rate": 2.7304964539007094e-05, + "loss": 0.3664, + "step": 1355 + }, + { + "epoch": 1.5256049521665729, + "grad_norm": 0.3119606266731146, + "learning_rate": 2.7284105131414267e-05, + "loss": 0.3619, + "step": 1356 + }, + { + "epoch": 1.5267304445694991, + "grad_norm": 0.33933519597699924, + "learning_rate": 2.7263245723821444e-05, + "loss": 0.4008, + "step": 1357 + }, + { + "epoch": 1.5278559369724254, + "grad_norm": 0.3275255812573412, + "learning_rate": 2.7242386316228618e-05, + "loss": 0.3702, + "step": 1358 + }, + { + "epoch": 1.5289814293753516, + "grad_norm": 0.3747569415524062, + "learning_rate": 2.7221526908635798e-05, + "loss": 0.3553, + "step": 1359 + }, + { + "epoch": 1.5301069217782781, + "grad_norm": 0.30992658499062065, + "learning_rate": 2.7200667501042972e-05, + "loss": 0.3556, + "step": 1360 + }, + { + "epoch": 1.5312324141812041, + "grad_norm": 0.36837860346575607, + "learning_rate": 2.717980809345015e-05, + "loss": 0.3536, + "step": 1361 + }, + { + "epoch": 1.5323579065841306, + "grad_norm": 0.3422637051978812, + "learning_rate": 2.7158948685857322e-05, + "loss": 0.3676, + "step": 1362 + }, + { + "epoch": 1.5334833989870569, + "grad_norm": 0.2882475832928599, + "learning_rate": 2.71380892782645e-05, + "loss": 0.3714, + "step": 1363 + }, + { + "epoch": 1.5346088913899831, + "grad_norm": 0.4680385431354928, + "learning_rate": 2.7117229870671673e-05, + "loss": 0.3728, + "step": 1364 + }, + { + "epoch": 1.5357343837929094, + "grad_norm": 0.28687340173500175, + "learning_rate": 2.7096370463078853e-05, + "loss": 0.3677, + "step": 1365 + }, + { + "epoch": 1.5368598761958356, + "grad_norm": 0.3168437934125687, + "learning_rate": 2.7075511055486023e-05, + "loss": 0.3516, + "step": 1366 + }, + { + "epoch": 1.5379853685987621, + "grad_norm": 0.34254703558592353, + "learning_rate": 2.7054651647893204e-05, + "loss": 0.3733, + "step": 1367 + }, + { + "epoch": 1.5391108610016881, + "grad_norm": 0.32210383347863225, + "learning_rate": 2.7033792240300377e-05, + "loss": 0.3657, + "step": 1368 + }, + { + "epoch": 1.5402363534046146, + "grad_norm": 0.2951642244458056, + "learning_rate": 2.7012932832707554e-05, + "loss": 0.3624, + "step": 1369 + }, + { + "epoch": 1.5413618458075407, + "grad_norm": 0.32973184204270484, + "learning_rate": 2.6992073425114728e-05, + "loss": 0.3466, + "step": 1370 + }, + { + "epoch": 1.5424873382104671, + "grad_norm": 0.32937201569972335, + "learning_rate": 2.6971214017521905e-05, + "loss": 0.3609, + "step": 1371 + }, + { + "epoch": 1.5436128306133934, + "grad_norm": 0.294240889891016, + "learning_rate": 2.695035460992908e-05, + "loss": 0.3528, + "step": 1372 + }, + { + "epoch": 1.5447383230163196, + "grad_norm": 0.38730632898384704, + "learning_rate": 2.6929495202336252e-05, + "loss": 0.3592, + "step": 1373 + }, + { + "epoch": 1.545863815419246, + "grad_norm": 0.265405748658469, + "learning_rate": 2.6908635794743433e-05, + "loss": 0.3523, + "step": 1374 + }, + { + "epoch": 1.5469893078221721, + "grad_norm": 0.3090293159321234, + "learning_rate": 2.6887776387150603e-05, + "loss": 0.373, + "step": 1375 + }, + { + "epoch": 1.5481148002250986, + "grad_norm": 0.33125373511524786, + "learning_rate": 2.6866916979557783e-05, + "loss": 0.3376, + "step": 1376 + }, + { + "epoch": 1.5492402926280247, + "grad_norm": 0.3859675477375762, + "learning_rate": 2.6846057571964957e-05, + "loss": 0.3595, + "step": 1377 + }, + { + "epoch": 1.5503657850309511, + "grad_norm": 0.2702204865287381, + "learning_rate": 2.6825198164372134e-05, + "loss": 0.3526, + "step": 1378 + }, + { + "epoch": 1.5514912774338772, + "grad_norm": 0.4216493180934553, + "learning_rate": 2.6804338756779307e-05, + "loss": 0.3634, + "step": 1379 + }, + { + "epoch": 1.5526167698368036, + "grad_norm": 0.3402054598291514, + "learning_rate": 2.6783479349186484e-05, + "loss": 0.3814, + "step": 1380 + }, + { + "epoch": 1.5537422622397299, + "grad_norm": 0.3634322127130347, + "learning_rate": 2.6762619941593658e-05, + "loss": 0.3739, + "step": 1381 + }, + { + "epoch": 1.5548677546426561, + "grad_norm": 0.4033902015465824, + "learning_rate": 2.6741760534000838e-05, + "loss": 0.3731, + "step": 1382 + }, + { + "epoch": 1.5559932470455824, + "grad_norm": 0.40808104649969373, + "learning_rate": 2.672090112640801e-05, + "loss": 0.3522, + "step": 1383 + }, + { + "epoch": 1.5571187394485086, + "grad_norm": 0.39575554889808146, + "learning_rate": 2.670004171881519e-05, + "loss": 0.3466, + "step": 1384 + }, + { + "epoch": 1.5582442318514351, + "grad_norm": 0.33725542522344404, + "learning_rate": 2.6679182311222362e-05, + "loss": 0.3594, + "step": 1385 + }, + { + "epoch": 1.5593697242543612, + "grad_norm": 0.3562002474404248, + "learning_rate": 2.665832290362954e-05, + "loss": 0.3766, + "step": 1386 + }, + { + "epoch": 1.5604952166572876, + "grad_norm": 0.2792679981992388, + "learning_rate": 2.6637463496036713e-05, + "loss": 0.354, + "step": 1387 + }, + { + "epoch": 1.5616207090602139, + "grad_norm": 0.3631975807941906, + "learning_rate": 2.661660408844389e-05, + "loss": 0.3628, + "step": 1388 + }, + { + "epoch": 1.5627462014631401, + "grad_norm": 0.2922697632757867, + "learning_rate": 2.6595744680851064e-05, + "loss": 0.3786, + "step": 1389 + }, + { + "epoch": 1.5638716938660664, + "grad_norm": 0.3930094259783832, + "learning_rate": 2.6574885273258244e-05, + "loss": 0.349, + "step": 1390 + }, + { + "epoch": 1.5649971862689926, + "grad_norm": 0.2753952015092564, + "learning_rate": 2.6554025865665418e-05, + "loss": 0.3606, + "step": 1391 + }, + { + "epoch": 1.5661226786719191, + "grad_norm": 0.323233762383296, + "learning_rate": 2.6533166458072595e-05, + "loss": 0.3584, + "step": 1392 + }, + { + "epoch": 1.5672481710748452, + "grad_norm": 0.3065899573190829, + "learning_rate": 2.6512307050479768e-05, + "loss": 0.3569, + "step": 1393 + }, + { + "epoch": 1.5683736634777716, + "grad_norm": 0.29359629957776534, + "learning_rate": 2.6491447642886942e-05, + "loss": 0.3721, + "step": 1394 + }, + { + "epoch": 1.5694991558806977, + "grad_norm": 0.3453639950913077, + "learning_rate": 2.647058823529412e-05, + "loss": 0.3674, + "step": 1395 + }, + { + "epoch": 1.5706246482836241, + "grad_norm": 0.29618728974968406, + "learning_rate": 2.6449728827701292e-05, + "loss": 0.3728, + "step": 1396 + }, + { + "epoch": 1.5717501406865504, + "grad_norm": 0.4022340400841394, + "learning_rate": 2.642886942010847e-05, + "loss": 0.3599, + "step": 1397 + }, + { + "epoch": 1.5728756330894766, + "grad_norm": 0.34040909178052503, + "learning_rate": 2.6408010012515643e-05, + "loss": 0.3452, + "step": 1398 + }, + { + "epoch": 1.574001125492403, + "grad_norm": 0.39633565400793064, + "learning_rate": 2.6387150604922823e-05, + "loss": 0.3638, + "step": 1399 + }, + { + "epoch": 1.5751266178953292, + "grad_norm": 0.3469815814003443, + "learning_rate": 2.6366291197329997e-05, + "loss": 0.3617, + "step": 1400 + }, + { + "epoch": 1.5762521102982556, + "grad_norm": 0.3858237301262129, + "learning_rate": 2.6345431789737174e-05, + "loss": 0.3592, + "step": 1401 + }, + { + "epoch": 1.5773776027011817, + "grad_norm": 0.36968305499637627, + "learning_rate": 2.6324572382144347e-05, + "loss": 0.3506, + "step": 1402 + }, + { + "epoch": 1.5785030951041081, + "grad_norm": 0.3505404658131974, + "learning_rate": 2.6303712974551524e-05, + "loss": 0.3686, + "step": 1403 + }, + { + "epoch": 1.5796285875070342, + "grad_norm": 0.33758728331020843, + "learning_rate": 2.6282853566958698e-05, + "loss": 0.3527, + "step": 1404 + }, + { + "epoch": 1.5807540799099606, + "grad_norm": 0.3435492065868497, + "learning_rate": 2.626199415936588e-05, + "loss": 0.3475, + "step": 1405 + }, + { + "epoch": 1.581879572312887, + "grad_norm": 0.3490084416143491, + "learning_rate": 2.624113475177305e-05, + "loss": 0.3607, + "step": 1406 + }, + { + "epoch": 1.5830050647158131, + "grad_norm": 0.31414180653905893, + "learning_rate": 2.622027534418023e-05, + "loss": 0.3504, + "step": 1407 + }, + { + "epoch": 1.5841305571187394, + "grad_norm": 0.3599821696826535, + "learning_rate": 2.6199415936587403e-05, + "loss": 0.3615, + "step": 1408 + }, + { + "epoch": 1.5852560495216657, + "grad_norm": 0.42310764019699615, + "learning_rate": 2.617855652899458e-05, + "loss": 0.3724, + "step": 1409 + }, + { + "epoch": 1.5863815419245921, + "grad_norm": 0.2833525199592301, + "learning_rate": 2.6157697121401753e-05, + "loss": 0.3617, + "step": 1410 + }, + { + "epoch": 1.5875070343275182, + "grad_norm": 0.3619653752728842, + "learning_rate": 2.613683771380893e-05, + "loss": 0.3535, + "step": 1411 + }, + { + "epoch": 1.5886325267304446, + "grad_norm": 0.31893555877641494, + "learning_rate": 2.6115978306216104e-05, + "loss": 0.3739, + "step": 1412 + }, + { + "epoch": 1.589758019133371, + "grad_norm": 0.367002811604332, + "learning_rate": 2.6095118898623284e-05, + "loss": 0.3553, + "step": 1413 + }, + { + "epoch": 1.5908835115362971, + "grad_norm": 0.27151097727860346, + "learning_rate": 2.6074259491030458e-05, + "loss": 0.3347, + "step": 1414 + }, + { + "epoch": 1.5920090039392234, + "grad_norm": 0.3131896896726996, + "learning_rate": 2.6053400083437628e-05, + "loss": 0.3546, + "step": 1415 + }, + { + "epoch": 1.5931344963421497, + "grad_norm": 0.36676987492115576, + "learning_rate": 2.6032540675844808e-05, + "loss": 0.3675, + "step": 1416 + }, + { + "epoch": 1.5942599887450761, + "grad_norm": 0.2950227483896426, + "learning_rate": 2.6011681268251982e-05, + "loss": 0.3648, + "step": 1417 + }, + { + "epoch": 1.5953854811480022, + "grad_norm": 0.34344487884738795, + "learning_rate": 2.599082186065916e-05, + "loss": 0.3597, + "step": 1418 + }, + { + "epoch": 1.5965109735509286, + "grad_norm": 0.320230789996618, + "learning_rate": 2.5969962453066332e-05, + "loss": 0.3524, + "step": 1419 + }, + { + "epoch": 1.5976364659538547, + "grad_norm": 0.32035648740276107, + "learning_rate": 2.594910304547351e-05, + "loss": 0.3595, + "step": 1420 + }, + { + "epoch": 1.5987619583567811, + "grad_norm": 0.2888199453121108, + "learning_rate": 2.5928243637880683e-05, + "loss": 0.3862, + "step": 1421 + }, + { + "epoch": 1.5998874507597074, + "grad_norm": 0.32236255339509834, + "learning_rate": 2.5907384230287863e-05, + "loss": 0.3476, + "step": 1422 + }, + { + "epoch": 1.6010129431626337, + "grad_norm": 0.3203989927659959, + "learning_rate": 2.5886524822695034e-05, + "loss": 0.3702, + "step": 1423 + }, + { + "epoch": 1.60213843556556, + "grad_norm": 0.2911113101367755, + "learning_rate": 2.5865665415102214e-05, + "loss": 0.3688, + "step": 1424 + }, + { + "epoch": 1.6032639279684862, + "grad_norm": 0.35071227735634586, + "learning_rate": 2.5844806007509388e-05, + "loss": 0.3808, + "step": 1425 + }, + { + "epoch": 1.6043894203714126, + "grad_norm": 1.359117007518664, + "learning_rate": 2.5823946599916565e-05, + "loss": 0.3551, + "step": 1426 + }, + { + "epoch": 1.6055149127743387, + "grad_norm": 0.33498969187479993, + "learning_rate": 2.5803087192323738e-05, + "loss": 0.3593, + "step": 1427 + }, + { + "epoch": 1.6066404051772651, + "grad_norm": 0.30337597464705507, + "learning_rate": 2.5782227784730915e-05, + "loss": 0.3674, + "step": 1428 + }, + { + "epoch": 1.6077658975801912, + "grad_norm": 0.3207844519265783, + "learning_rate": 2.576136837713809e-05, + "loss": 0.3557, + "step": 1429 + }, + { + "epoch": 1.6088913899831176, + "grad_norm": 0.3185723538525886, + "learning_rate": 2.574050896954527e-05, + "loss": 0.3633, + "step": 1430 + }, + { + "epoch": 1.610016882386044, + "grad_norm": 0.3110802343229136, + "learning_rate": 2.5719649561952443e-05, + "loss": 0.3621, + "step": 1431 + }, + { + "epoch": 1.6111423747889702, + "grad_norm": 0.39120392030901746, + "learning_rate": 2.569879015435962e-05, + "loss": 0.3718, + "step": 1432 + }, + { + "epoch": 1.6122678671918964, + "grad_norm": 0.3044483498179327, + "learning_rate": 2.5677930746766793e-05, + "loss": 0.3525, + "step": 1433 + }, + { + "epoch": 1.6133933595948227, + "grad_norm": 0.36593260259263516, + "learning_rate": 2.565707133917397e-05, + "loss": 0.3724, + "step": 1434 + }, + { + "epoch": 1.6145188519977491, + "grad_norm": 0.34991456432334755, + "learning_rate": 2.5636211931581144e-05, + "loss": 0.3682, + "step": 1435 + }, + { + "epoch": 1.6156443444006752, + "grad_norm": 0.32304123149901537, + "learning_rate": 2.5615352523988317e-05, + "loss": 0.3496, + "step": 1436 + }, + { + "epoch": 1.6167698368036016, + "grad_norm": 0.34708749419764806, + "learning_rate": 2.5594493116395494e-05, + "loss": 0.3913, + "step": 1437 + }, + { + "epoch": 1.617895329206528, + "grad_norm": 0.32488187134050506, + "learning_rate": 2.5573633708802668e-05, + "loss": 0.3469, + "step": 1438 + }, + { + "epoch": 1.6190208216094542, + "grad_norm": 0.31694764933224345, + "learning_rate": 2.555277430120985e-05, + "loss": 0.3903, + "step": 1439 + }, + { + "epoch": 1.6201463140123804, + "grad_norm": 0.2966648293508749, + "learning_rate": 2.5531914893617022e-05, + "loss": 0.3434, + "step": 1440 + }, + { + "epoch": 1.6212718064153067, + "grad_norm": 0.3130351777750274, + "learning_rate": 2.55110554860242e-05, + "loss": 0.3642, + "step": 1441 + }, + { + "epoch": 1.6223972988182331, + "grad_norm": 0.288157295810494, + "learning_rate": 2.5490196078431373e-05, + "loss": 0.3515, + "step": 1442 + }, + { + "epoch": 1.6235227912211592, + "grad_norm": 0.34698217632629985, + "learning_rate": 2.546933667083855e-05, + "loss": 0.3733, + "step": 1443 + }, + { + "epoch": 1.6246482836240856, + "grad_norm": 0.2724092253095966, + "learning_rate": 2.5448477263245723e-05, + "loss": 0.3497, + "step": 1444 + }, + { + "epoch": 1.6257737760270117, + "grad_norm": 0.24953001796720836, + "learning_rate": 2.5427617855652904e-05, + "loss": 0.3573, + "step": 1445 + }, + { + "epoch": 1.6268992684299382, + "grad_norm": 0.299260486745094, + "learning_rate": 2.5406758448060074e-05, + "loss": 0.3873, + "step": 1446 + }, + { + "epoch": 1.6280247608328644, + "grad_norm": 0.26925589680552175, + "learning_rate": 2.5385899040467254e-05, + "loss": 0.3508, + "step": 1447 + }, + { + "epoch": 1.6291502532357907, + "grad_norm": 0.29454604423730374, + "learning_rate": 2.5365039632874428e-05, + "loss": 0.3591, + "step": 1448 + }, + { + "epoch": 1.630275745638717, + "grad_norm": 0.27324874812018735, + "learning_rate": 2.5344180225281605e-05, + "loss": 0.3625, + "step": 1449 + }, + { + "epoch": 1.6314012380416432, + "grad_norm": 0.27258225073759196, + "learning_rate": 2.5323320817688778e-05, + "loss": 0.3554, + "step": 1450 + }, + { + "epoch": 1.6325267304445696, + "grad_norm": 0.3035610321463261, + "learning_rate": 2.5302461410095955e-05, + "loss": 0.3535, + "step": 1451 + }, + { + "epoch": 1.6336522228474957, + "grad_norm": 0.3628567082505913, + "learning_rate": 2.528160200250313e-05, + "loss": 0.358, + "step": 1452 + }, + { + "epoch": 1.6347777152504221, + "grad_norm": 0.26138414055253223, + "learning_rate": 2.526074259491031e-05, + "loss": 0.3664, + "step": 1453 + }, + { + "epoch": 1.6359032076533482, + "grad_norm": 0.3503328861643792, + "learning_rate": 2.5239883187317483e-05, + "loss": 0.3377, + "step": 1454 + }, + { + "epoch": 1.6370287000562747, + "grad_norm": 0.2673845892434079, + "learning_rate": 2.521902377972466e-05, + "loss": 0.3474, + "step": 1455 + }, + { + "epoch": 1.638154192459201, + "grad_norm": 0.27470271868463625, + "learning_rate": 2.5198164372131833e-05, + "loss": 0.3639, + "step": 1456 + }, + { + "epoch": 1.6392796848621272, + "grad_norm": 0.3112867744755204, + "learning_rate": 2.5177304964539007e-05, + "loss": 0.3662, + "step": 1457 + }, + { + "epoch": 1.6404051772650534, + "grad_norm": 0.29872249045203997, + "learning_rate": 2.5156445556946184e-05, + "loss": 0.3569, + "step": 1458 + }, + { + "epoch": 1.6415306696679797, + "grad_norm": 0.2950030580824579, + "learning_rate": 2.5135586149353358e-05, + "loss": 0.3877, + "step": 1459 + }, + { + "epoch": 1.6426561620709061, + "grad_norm": 0.30740378724405815, + "learning_rate": 2.5114726741760535e-05, + "loss": 0.368, + "step": 1460 + }, + { + "epoch": 1.6437816544738322, + "grad_norm": 0.43735074719358724, + "learning_rate": 2.5093867334167708e-05, + "loss": 0.3862, + "step": 1461 + }, + { + "epoch": 1.6449071468767587, + "grad_norm": 0.344358904604338, + "learning_rate": 2.507300792657489e-05, + "loss": 0.3556, + "step": 1462 + }, + { + "epoch": 1.646032639279685, + "grad_norm": 0.3606518532796079, + "learning_rate": 2.505214851898206e-05, + "loss": 0.3506, + "step": 1463 + }, + { + "epoch": 1.6471581316826112, + "grad_norm": 0.26793935225288906, + "learning_rate": 2.503128911138924e-05, + "loss": 0.3644, + "step": 1464 + }, + { + "epoch": 1.6482836240855374, + "grad_norm": 0.36553458630391006, + "learning_rate": 2.5010429703796413e-05, + "loss": 0.3786, + "step": 1465 + }, + { + "epoch": 1.6494091164884637, + "grad_norm": 0.3032742387012001, + "learning_rate": 2.4989570296203586e-05, + "loss": 0.3606, + "step": 1466 + }, + { + "epoch": 1.6505346088913901, + "grad_norm": 0.2573644911193979, + "learning_rate": 2.4968710888610763e-05, + "loss": 0.3758, + "step": 1467 + }, + { + "epoch": 1.6516601012943162, + "grad_norm": 0.3260439897844004, + "learning_rate": 2.494785148101794e-05, + "loss": 0.3701, + "step": 1468 + }, + { + "epoch": 1.6527855936972426, + "grad_norm": 0.2791366230994869, + "learning_rate": 2.4926992073425114e-05, + "loss": 0.3608, + "step": 1469 + }, + { + "epoch": 1.6539110861001687, + "grad_norm": 0.28073773442639216, + "learning_rate": 2.490613266583229e-05, + "loss": 0.3552, + "step": 1470 + }, + { + "epoch": 1.6550365785030952, + "grad_norm": 0.2751936808067321, + "learning_rate": 2.4885273258239468e-05, + "loss": 0.3551, + "step": 1471 + }, + { + "epoch": 1.6561620709060214, + "grad_norm": 0.31105318511449315, + "learning_rate": 2.486441385064664e-05, + "loss": 0.3846, + "step": 1472 + }, + { + "epoch": 1.6572875633089477, + "grad_norm": 0.2779436567942526, + "learning_rate": 2.484355444305382e-05, + "loss": 0.342, + "step": 1473 + }, + { + "epoch": 1.658413055711874, + "grad_norm": 0.260118994793512, + "learning_rate": 2.4822695035460995e-05, + "loss": 0.3416, + "step": 1474 + }, + { + "epoch": 1.6595385481148002, + "grad_norm": 0.30797304765243294, + "learning_rate": 2.480183562786817e-05, + "loss": 0.3649, + "step": 1475 + }, + { + "epoch": 1.6606640405177266, + "grad_norm": 0.27879300341701935, + "learning_rate": 2.4780976220275346e-05, + "loss": 0.3577, + "step": 1476 + }, + { + "epoch": 1.6617895329206527, + "grad_norm": 0.2618302523010228, + "learning_rate": 2.476011681268252e-05, + "loss": 0.373, + "step": 1477 + }, + { + "epoch": 1.6629150253235792, + "grad_norm": 0.2691572921484226, + "learning_rate": 2.4739257405089697e-05, + "loss": 0.3382, + "step": 1478 + }, + { + "epoch": 1.6640405177265052, + "grad_norm": 0.3021887597303646, + "learning_rate": 2.4718397997496874e-05, + "loss": 0.3561, + "step": 1479 + }, + { + "epoch": 1.6651660101294317, + "grad_norm": 0.29571070245395525, + "learning_rate": 2.4697538589904047e-05, + "loss": 0.3666, + "step": 1480 + }, + { + "epoch": 1.666291502532358, + "grad_norm": 0.3060388532862541, + "learning_rate": 2.4676679182311224e-05, + "loss": 0.3574, + "step": 1481 + }, + { + "epoch": 1.6674169949352842, + "grad_norm": 0.262863158327581, + "learning_rate": 2.46558197747184e-05, + "loss": 0.3515, + "step": 1482 + }, + { + "epoch": 1.6685424873382104, + "grad_norm": 0.26211725924142215, + "learning_rate": 2.4634960367125575e-05, + "loss": 0.3673, + "step": 1483 + }, + { + "epoch": 1.6696679797411367, + "grad_norm": 0.27559909119280296, + "learning_rate": 2.461410095953275e-05, + "loss": 0.3699, + "step": 1484 + }, + { + "epoch": 1.6707934721440632, + "grad_norm": 0.3286258665699544, + "learning_rate": 2.459324155193993e-05, + "loss": 0.3547, + "step": 1485 + }, + { + "epoch": 1.6719189645469892, + "grad_norm": 0.28394671282033973, + "learning_rate": 2.4572382144347102e-05, + "loss": 0.3699, + "step": 1486 + }, + { + "epoch": 1.6730444569499157, + "grad_norm": 0.28904710622589413, + "learning_rate": 2.4551522736754276e-05, + "loss": 0.3498, + "step": 1487 + }, + { + "epoch": 1.674169949352842, + "grad_norm": 0.3427205931479807, + "learning_rate": 2.4530663329161453e-05, + "loss": 0.3501, + "step": 1488 + }, + { + "epoch": 1.6752954417557682, + "grad_norm": 0.29275848448510483, + "learning_rate": 2.4509803921568626e-05, + "loss": 0.3436, + "step": 1489 + }, + { + "epoch": 1.6764209341586944, + "grad_norm": 0.281092401245526, + "learning_rate": 2.4488944513975803e-05, + "loss": 0.3618, + "step": 1490 + }, + { + "epoch": 1.6775464265616207, + "grad_norm": 0.3028349680860752, + "learning_rate": 2.446808510638298e-05, + "loss": 0.36, + "step": 1491 + }, + { + "epoch": 1.6786719189645471, + "grad_norm": 0.2740774488090583, + "learning_rate": 2.4447225698790154e-05, + "loss": 0.3761, + "step": 1492 + }, + { + "epoch": 1.6797974113674732, + "grad_norm": 0.3565931288895132, + "learning_rate": 2.442636629119733e-05, + "loss": 0.3656, + "step": 1493 + }, + { + "epoch": 1.6809229037703997, + "grad_norm": 0.30994295777900555, + "learning_rate": 2.4405506883604508e-05, + "loss": 0.3709, + "step": 1494 + }, + { + "epoch": 1.6820483961733257, + "grad_norm": 0.2770746159082683, + "learning_rate": 2.438464747601168e-05, + "loss": 0.3668, + "step": 1495 + }, + { + "epoch": 1.6831738885762522, + "grad_norm": 0.28851250362528635, + "learning_rate": 2.436378806841886e-05, + "loss": 0.3375, + "step": 1496 + }, + { + "epoch": 1.6842993809791784, + "grad_norm": 0.30502905628031945, + "learning_rate": 2.4342928660826032e-05, + "loss": 0.372, + "step": 1497 + }, + { + "epoch": 1.6854248733821047, + "grad_norm": 0.27606414842777804, + "learning_rate": 2.432206925323321e-05, + "loss": 0.3667, + "step": 1498 + }, + { + "epoch": 1.686550365785031, + "grad_norm": 0.2807248595539354, + "learning_rate": 2.4301209845640386e-05, + "loss": 0.3372, + "step": 1499 + }, + { + "epoch": 1.6876758581879572, + "grad_norm": 0.3189944644623768, + "learning_rate": 2.428035043804756e-05, + "loss": 0.3556, + "step": 1500 + }, + { + "epoch": 1.6888013505908837, + "grad_norm": 0.3542996839432631, + "learning_rate": 2.4259491030454737e-05, + "loss": 0.3657, + "step": 1501 + }, + { + "epoch": 1.6899268429938097, + "grad_norm": 0.26759487532851395, + "learning_rate": 2.4238631622861914e-05, + "loss": 0.3537, + "step": 1502 + }, + { + "epoch": 1.6910523353967362, + "grad_norm": 0.32892071648122306, + "learning_rate": 2.4217772215269087e-05, + "loss": 0.3679, + "step": 1503 + }, + { + "epoch": 1.6921778277996622, + "grad_norm": 0.27325117871239496, + "learning_rate": 2.4196912807676264e-05, + "loss": 0.3636, + "step": 1504 + }, + { + "epoch": 1.6933033202025887, + "grad_norm": 0.31473981377419813, + "learning_rate": 2.417605340008344e-05, + "loss": 0.3546, + "step": 1505 + }, + { + "epoch": 1.694428812605515, + "grad_norm": 0.6213973467005295, + "learning_rate": 2.4155193992490615e-05, + "loss": 0.3569, + "step": 1506 + }, + { + "epoch": 1.6955543050084412, + "grad_norm": 0.29664784115736215, + "learning_rate": 2.4134334584897792e-05, + "loss": 0.3541, + "step": 1507 + }, + { + "epoch": 1.6966797974113674, + "grad_norm": 0.30075562806982764, + "learning_rate": 2.4113475177304965e-05, + "loss": 0.3442, + "step": 1508 + }, + { + "epoch": 1.6978052898142937, + "grad_norm": 0.2798816842619607, + "learning_rate": 2.409261576971214e-05, + "loss": 0.3723, + "step": 1509 + }, + { + "epoch": 1.6989307822172202, + "grad_norm": 0.3125716574597028, + "learning_rate": 2.4071756362119316e-05, + "loss": 0.3534, + "step": 1510 + }, + { + "epoch": 1.7000562746201462, + "grad_norm": 0.2695382685076537, + "learning_rate": 2.4050896954526493e-05, + "loss": 0.3501, + "step": 1511 + }, + { + "epoch": 1.7011817670230727, + "grad_norm": 0.30428973956664224, + "learning_rate": 2.4030037546933667e-05, + "loss": 0.361, + "step": 1512 + }, + { + "epoch": 1.702307259425999, + "grad_norm": 0.2954859753709326, + "learning_rate": 2.4009178139340844e-05, + "loss": 0.348, + "step": 1513 + }, + { + "epoch": 1.7034327518289252, + "grad_norm": 0.2535522065599469, + "learning_rate": 2.398831873174802e-05, + "loss": 0.3448, + "step": 1514 + }, + { + "epoch": 1.7045582442318514, + "grad_norm": 0.2877878849798194, + "learning_rate": 2.3967459324155194e-05, + "loss": 0.3615, + "step": 1515 + }, + { + "epoch": 1.7056837366347777, + "grad_norm": 0.2679693175700858, + "learning_rate": 2.394659991656237e-05, + "loss": 0.3575, + "step": 1516 + }, + { + "epoch": 1.7068092290377042, + "grad_norm": 0.270042339489181, + "learning_rate": 2.3925740508969545e-05, + "loss": 0.3612, + "step": 1517 + }, + { + "epoch": 1.7079347214406302, + "grad_norm": 0.3277570559960174, + "learning_rate": 2.390488110137672e-05, + "loss": 0.3539, + "step": 1518 + }, + { + "epoch": 1.7090602138435567, + "grad_norm": 0.273010908537002, + "learning_rate": 2.38840216937839e-05, + "loss": 0.3813, + "step": 1519 + }, + { + "epoch": 1.7101857062464827, + "grad_norm": 0.3163418829636289, + "learning_rate": 2.3863162286191072e-05, + "loss": 0.367, + "step": 1520 + }, + { + "epoch": 1.7113111986494092, + "grad_norm": 0.2790546740132572, + "learning_rate": 2.384230287859825e-05, + "loss": 0.3616, + "step": 1521 + }, + { + "epoch": 1.7124366910523354, + "grad_norm": 0.3794647847000264, + "learning_rate": 2.3821443471005426e-05, + "loss": 0.3264, + "step": 1522 + }, + { + "epoch": 1.7135621834552617, + "grad_norm": 0.27180490681435693, + "learning_rate": 2.38005840634126e-05, + "loss": 0.3667, + "step": 1523 + }, + { + "epoch": 1.714687675858188, + "grad_norm": 0.3192761006046379, + "learning_rate": 2.3779724655819777e-05, + "loss": 0.3516, + "step": 1524 + }, + { + "epoch": 1.7158131682611142, + "grad_norm": 0.29770507590073336, + "learning_rate": 2.3758865248226954e-05, + "loss": 0.3616, + "step": 1525 + }, + { + "epoch": 1.7169386606640407, + "grad_norm": 0.3198828879152863, + "learning_rate": 2.3738005840634127e-05, + "loss": 0.3529, + "step": 1526 + }, + { + "epoch": 1.7180641530669667, + "grad_norm": 0.3090153579359256, + "learning_rate": 2.3717146433041304e-05, + "loss": 0.3526, + "step": 1527 + }, + { + "epoch": 1.7191896454698932, + "grad_norm": 0.3212232642206978, + "learning_rate": 2.369628702544848e-05, + "loss": 0.3607, + "step": 1528 + }, + { + "epoch": 1.7203151378728192, + "grad_norm": 0.30043128684782044, + "learning_rate": 2.367542761785565e-05, + "loss": 0.352, + "step": 1529 + }, + { + "epoch": 1.7214406302757457, + "grad_norm": 0.29295625581516677, + "learning_rate": 2.365456821026283e-05, + "loss": 0.3523, + "step": 1530 + }, + { + "epoch": 1.722566122678672, + "grad_norm": 0.3148385769404437, + "learning_rate": 2.3633708802670006e-05, + "loss": 0.3428, + "step": 1531 + }, + { + "epoch": 1.7236916150815982, + "grad_norm": 0.2809729795961225, + "learning_rate": 2.361284939507718e-05, + "loss": 0.3501, + "step": 1532 + }, + { + "epoch": 1.7248171074845244, + "grad_norm": 0.26779520094077724, + "learning_rate": 2.3591989987484356e-05, + "loss": 0.3692, + "step": 1533 + }, + { + "epoch": 1.7259425998874507, + "grad_norm": 0.34366805506707354, + "learning_rate": 2.3571130579891533e-05, + "loss": 0.3487, + "step": 1534 + }, + { + "epoch": 1.7270680922903772, + "grad_norm": 0.31386821776914015, + "learning_rate": 2.3550271172298707e-05, + "loss": 0.3738, + "step": 1535 + }, + { + "epoch": 1.7281935846933032, + "grad_norm": 0.27129248750888196, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.347, + "step": 1536 + }, + { + "epoch": 1.7293190770962297, + "grad_norm": 0.30115138922829, + "learning_rate": 2.3508552357113057e-05, + "loss": 0.3644, + "step": 1537 + }, + { + "epoch": 1.730444569499156, + "grad_norm": 0.33400000355343035, + "learning_rate": 2.3487692949520234e-05, + "loss": 0.3735, + "step": 1538 + }, + { + "epoch": 1.7315700619020822, + "grad_norm": 0.2817713843812286, + "learning_rate": 2.346683354192741e-05, + "loss": 0.3536, + "step": 1539 + }, + { + "epoch": 1.7326955543050084, + "grad_norm": 0.2536336062497862, + "learning_rate": 2.3445974134334585e-05, + "loss": 0.3531, + "step": 1540 + }, + { + "epoch": 1.7338210467079347, + "grad_norm": 0.292119578282618, + "learning_rate": 2.3425114726741762e-05, + "loss": 0.3917, + "step": 1541 + }, + { + "epoch": 1.7349465391108612, + "grad_norm": 0.301736936214816, + "learning_rate": 2.340425531914894e-05, + "loss": 0.3368, + "step": 1542 + }, + { + "epoch": 1.7360720315137872, + "grad_norm": 0.2834782995265929, + "learning_rate": 2.3383395911556112e-05, + "loss": 0.3627, + "step": 1543 + }, + { + "epoch": 1.7371975239167137, + "grad_norm": 0.3472332663859999, + "learning_rate": 2.336253650396329e-05, + "loss": 0.3563, + "step": 1544 + }, + { + "epoch": 1.7383230163196397, + "grad_norm": 0.2770080632091572, + "learning_rate": 2.3341677096370466e-05, + "loss": 0.3638, + "step": 1545 + }, + { + "epoch": 1.7394485087225662, + "grad_norm": 0.28038474675505726, + "learning_rate": 2.332081768877764e-05, + "loss": 0.3578, + "step": 1546 + }, + { + "epoch": 1.7405740011254924, + "grad_norm": 0.29413387062574675, + "learning_rate": 2.3299958281184817e-05, + "loss": 0.3581, + "step": 1547 + }, + { + "epoch": 1.7416994935284187, + "grad_norm": 0.250154894365378, + "learning_rate": 2.3279098873591994e-05, + "loss": 0.3524, + "step": 1548 + }, + { + "epoch": 1.742824985931345, + "grad_norm": 0.27004730168507385, + "learning_rate": 2.3258239465999164e-05, + "loss": 0.3592, + "step": 1549 + }, + { + "epoch": 1.7439504783342712, + "grad_norm": 0.30931998115710535, + "learning_rate": 2.323738005840634e-05, + "loss": 0.3633, + "step": 1550 + }, + { + "epoch": 1.7450759707371977, + "grad_norm": 0.260094920014104, + "learning_rate": 2.3216520650813518e-05, + "loss": 0.36, + "step": 1551 + }, + { + "epoch": 1.7462014631401237, + "grad_norm": 0.28020792072208933, + "learning_rate": 2.3195661243220692e-05, + "loss": 0.3619, + "step": 1552 + }, + { + "epoch": 1.7473269555430502, + "grad_norm": 0.29150594274575353, + "learning_rate": 2.317480183562787e-05, + "loss": 0.3752, + "step": 1553 + }, + { + "epoch": 1.7484524479459762, + "grad_norm": 0.2780077227889234, + "learning_rate": 2.3153942428035046e-05, + "loss": 0.3404, + "step": 1554 + }, + { + "epoch": 1.7495779403489027, + "grad_norm": 0.26577200333767786, + "learning_rate": 2.313308302044222e-05, + "loss": 0.3457, + "step": 1555 + }, + { + "epoch": 1.750703432751829, + "grad_norm": 0.297363502447975, + "learning_rate": 2.3112223612849396e-05, + "loss": 0.3473, + "step": 1556 + }, + { + "epoch": 1.7518289251547552, + "grad_norm": 0.26278420558469534, + "learning_rate": 2.309136420525657e-05, + "loss": 0.3474, + "step": 1557 + }, + { + "epoch": 1.7529544175576814, + "grad_norm": 0.26900103936760594, + "learning_rate": 2.3070504797663747e-05, + "loss": 0.3531, + "step": 1558 + }, + { + "epoch": 1.7540799099606077, + "grad_norm": 0.32212325532836394, + "learning_rate": 2.3049645390070924e-05, + "loss": 0.3671, + "step": 1559 + }, + { + "epoch": 1.7552054023635342, + "grad_norm": 0.2970498028319336, + "learning_rate": 2.3028785982478097e-05, + "loss": 0.3488, + "step": 1560 + }, + { + "epoch": 1.7563308947664602, + "grad_norm": 0.3127346620449437, + "learning_rate": 2.3007926574885274e-05, + "loss": 0.3585, + "step": 1561 + }, + { + "epoch": 1.7574563871693867, + "grad_norm": 0.2765174597912786, + "learning_rate": 2.298706716729245e-05, + "loss": 0.3394, + "step": 1562 + }, + { + "epoch": 1.758581879572313, + "grad_norm": 0.3009783707148293, + "learning_rate": 2.2966207759699625e-05, + "loss": 0.3629, + "step": 1563 + }, + { + "epoch": 1.7597073719752392, + "grad_norm": 0.30115522865418154, + "learning_rate": 2.2945348352106802e-05, + "loss": 0.3725, + "step": 1564 + }, + { + "epoch": 1.7608328643781654, + "grad_norm": 0.2930519854916032, + "learning_rate": 2.292448894451398e-05, + "loss": 0.3569, + "step": 1565 + }, + { + "epoch": 1.7619583567810917, + "grad_norm": 0.3047405558309698, + "learning_rate": 2.2903629536921153e-05, + "loss": 0.3714, + "step": 1566 + }, + { + "epoch": 1.7630838491840182, + "grad_norm": 0.2590925307869418, + "learning_rate": 2.288277012932833e-05, + "loss": 0.3429, + "step": 1567 + }, + { + "epoch": 1.7642093415869442, + "grad_norm": 0.26569956950346013, + "learning_rate": 2.2861910721735507e-05, + "loss": 0.3563, + "step": 1568 + }, + { + "epoch": 1.7653348339898707, + "grad_norm": 0.3212454261162196, + "learning_rate": 2.284105131414268e-05, + "loss": 0.3628, + "step": 1569 + }, + { + "epoch": 1.7664603263927967, + "grad_norm": 0.25121736290737545, + "learning_rate": 2.2820191906549854e-05, + "loss": 0.3407, + "step": 1570 + }, + { + "epoch": 1.7675858187957232, + "grad_norm": 0.2568310971026976, + "learning_rate": 2.279933249895703e-05, + "loss": 0.3556, + "step": 1571 + }, + { + "epoch": 1.7687113111986494, + "grad_norm": 0.2766368973128219, + "learning_rate": 2.2778473091364204e-05, + "loss": 0.3488, + "step": 1572 + }, + { + "epoch": 1.7698368036015757, + "grad_norm": 0.2830150867726597, + "learning_rate": 2.275761368377138e-05, + "loss": 0.3563, + "step": 1573 + }, + { + "epoch": 1.770962296004502, + "grad_norm": 0.2704782966697743, + "learning_rate": 2.2736754276178558e-05, + "loss": 0.3785, + "step": 1574 + }, + { + "epoch": 1.7720877884074282, + "grad_norm": 0.28693834596503254, + "learning_rate": 2.2715894868585732e-05, + "loss": 0.3568, + "step": 1575 + }, + { + "epoch": 1.7732132808103547, + "grad_norm": 0.300591423339274, + "learning_rate": 2.269503546099291e-05, + "loss": 0.3401, + "step": 1576 + }, + { + "epoch": 1.7743387732132807, + "grad_norm": 0.25477856684248135, + "learning_rate": 2.2674176053400082e-05, + "loss": 0.3485, + "step": 1577 + }, + { + "epoch": 1.7754642656162072, + "grad_norm": 0.27289380951433195, + "learning_rate": 2.265331664580726e-05, + "loss": 0.3607, + "step": 1578 + }, + { + "epoch": 1.7765897580191332, + "grad_norm": 0.28248295859121, + "learning_rate": 2.2632457238214436e-05, + "loss": 0.3511, + "step": 1579 + }, + { + "epoch": 1.7777152504220597, + "grad_norm": 0.2658629597762577, + "learning_rate": 2.261159783062161e-05, + "loss": 0.377, + "step": 1580 + }, + { + "epoch": 1.778840742824986, + "grad_norm": 0.27220952413476507, + "learning_rate": 2.2590738423028787e-05, + "loss": 0.3557, + "step": 1581 + }, + { + "epoch": 1.7799662352279122, + "grad_norm": 0.2328823780748166, + "learning_rate": 2.2569879015435964e-05, + "loss": 0.3437, + "step": 1582 + }, + { + "epoch": 1.7810917276308385, + "grad_norm": 0.27552976720286626, + "learning_rate": 2.2549019607843138e-05, + "loss": 0.3499, + "step": 1583 + }, + { + "epoch": 1.7822172200337647, + "grad_norm": 0.26988928502984605, + "learning_rate": 2.2528160200250315e-05, + "loss": 0.3739, + "step": 1584 + }, + { + "epoch": 1.7833427124366912, + "grad_norm": 0.28360069868054577, + "learning_rate": 2.250730079265749e-05, + "loss": 0.3586, + "step": 1585 + }, + { + "epoch": 1.7844682048396172, + "grad_norm": 0.30703362231564924, + "learning_rate": 2.2486441385064665e-05, + "loss": 0.3402, + "step": 1586 + }, + { + "epoch": 1.7855936972425437, + "grad_norm": 0.24229886888660893, + "learning_rate": 2.2465581977471842e-05, + "loss": 0.3749, + "step": 1587 + }, + { + "epoch": 1.78671918964547, + "grad_norm": 0.3052949362012416, + "learning_rate": 2.244472256987902e-05, + "loss": 0.3432, + "step": 1588 + }, + { + "epoch": 1.7878446820483962, + "grad_norm": 0.30727845879387705, + "learning_rate": 2.2423863162286193e-05, + "loss": 0.352, + "step": 1589 + }, + { + "epoch": 1.7889701744513224, + "grad_norm": 0.2871867401825979, + "learning_rate": 2.240300375469337e-05, + "loss": 0.3654, + "step": 1590 + }, + { + "epoch": 1.7900956668542487, + "grad_norm": 0.29179973425408784, + "learning_rate": 2.2382144347100543e-05, + "loss": 0.3616, + "step": 1591 + }, + { + "epoch": 1.7912211592571752, + "grad_norm": 0.3214829947161843, + "learning_rate": 2.2361284939507717e-05, + "loss": 0.3512, + "step": 1592 + }, + { + "epoch": 1.7923466516601012, + "grad_norm": 0.24147795550593532, + "learning_rate": 2.2340425531914894e-05, + "loss": 0.3462, + "step": 1593 + }, + { + "epoch": 1.7934721440630277, + "grad_norm": 0.27689467276611157, + "learning_rate": 2.231956612432207e-05, + "loss": 0.36, + "step": 1594 + }, + { + "epoch": 1.7945976364659537, + "grad_norm": 0.28150686561848237, + "learning_rate": 2.2298706716729244e-05, + "loss": 0.3532, + "step": 1595 + }, + { + "epoch": 1.7957231288688802, + "grad_norm": 0.2581225749795623, + "learning_rate": 2.227784730913642e-05, + "loss": 0.3559, + "step": 1596 + }, + { + "epoch": 1.7968486212718064, + "grad_norm": 0.3039816245853392, + "learning_rate": 2.22569879015436e-05, + "loss": 0.3538, + "step": 1597 + }, + { + "epoch": 1.7979741136747327, + "grad_norm": 0.25714237851869526, + "learning_rate": 2.2236128493950772e-05, + "loss": 0.3442, + "step": 1598 + }, + { + "epoch": 1.799099606077659, + "grad_norm": 0.24074871831754024, + "learning_rate": 2.221526908635795e-05, + "loss": 0.349, + "step": 1599 + }, + { + "epoch": 1.8002250984805852, + "grad_norm": 0.28044366274540433, + "learning_rate": 2.2194409678765123e-05, + "loss": 0.3476, + "step": 1600 + }, + { + "epoch": 1.8013505908835117, + "grad_norm": 0.3055206598860284, + "learning_rate": 2.21735502711723e-05, + "loss": 0.329, + "step": 1601 + }, + { + "epoch": 1.8024760832864377, + "grad_norm": 0.32077315537142925, + "learning_rate": 2.2152690863579477e-05, + "loss": 0.3632, + "step": 1602 + }, + { + "epoch": 1.8036015756893642, + "grad_norm": 0.3191853556743268, + "learning_rate": 2.213183145598665e-05, + "loss": 0.3724, + "step": 1603 + }, + { + "epoch": 1.8047270680922902, + "grad_norm": 0.309559125522351, + "learning_rate": 2.2110972048393827e-05, + "loss": 0.36, + "step": 1604 + }, + { + "epoch": 1.8058525604952167, + "grad_norm": 0.31187663837864876, + "learning_rate": 2.2090112640801004e-05, + "loss": 0.344, + "step": 1605 + }, + { + "epoch": 1.806978052898143, + "grad_norm": 0.3041182224443529, + "learning_rate": 2.2069253233208178e-05, + "loss": 0.353, + "step": 1606 + }, + { + "epoch": 1.8081035453010692, + "grad_norm": 0.29282481275876526, + "learning_rate": 2.2048393825615355e-05, + "loss": 0.3486, + "step": 1607 + }, + { + "epoch": 1.8092290377039955, + "grad_norm": 0.29147172423218604, + "learning_rate": 2.202753441802253e-05, + "loss": 0.3672, + "step": 1608 + }, + { + "epoch": 1.8103545301069217, + "grad_norm": 0.28412350307097217, + "learning_rate": 2.2006675010429705e-05, + "loss": 0.3353, + "step": 1609 + }, + { + "epoch": 1.8114800225098482, + "grad_norm": 0.3067481559384326, + "learning_rate": 2.1985815602836882e-05, + "loss": 0.3768, + "step": 1610 + }, + { + "epoch": 1.8126055149127742, + "grad_norm": 0.28038315675437364, + "learning_rate": 2.1964956195244056e-05, + "loss": 0.3654, + "step": 1611 + }, + { + "epoch": 1.8137310073157007, + "grad_norm": 0.35539158393187636, + "learning_rate": 2.194409678765123e-05, + "loss": 0.3538, + "step": 1612 + }, + { + "epoch": 1.814856499718627, + "grad_norm": 0.25885136003612047, + "learning_rate": 2.1923237380058406e-05, + "loss": 0.3644, + "step": 1613 + }, + { + "epoch": 1.8159819921215532, + "grad_norm": 0.26093239365043436, + "learning_rate": 2.1902377972465583e-05, + "loss": 0.3708, + "step": 1614 + }, + { + "epoch": 1.8171074845244795, + "grad_norm": 0.2961872877279803, + "learning_rate": 2.1881518564872757e-05, + "loss": 0.3596, + "step": 1615 + }, + { + "epoch": 1.8182329769274057, + "grad_norm": 0.25680902610020434, + "learning_rate": 2.1860659157279934e-05, + "loss": 0.3577, + "step": 1616 + }, + { + "epoch": 1.8193584693303322, + "grad_norm": 0.25991234140815395, + "learning_rate": 2.183979974968711e-05, + "loss": 0.3633, + "step": 1617 + }, + { + "epoch": 1.8204839617332582, + "grad_norm": 0.2613869530348512, + "learning_rate": 2.1818940342094285e-05, + "loss": 0.3418, + "step": 1618 + }, + { + "epoch": 1.8216094541361847, + "grad_norm": 0.24790904990988194, + "learning_rate": 2.179808093450146e-05, + "loss": 0.3632, + "step": 1619 + }, + { + "epoch": 1.8227349465391107, + "grad_norm": 0.28799463478351406, + "learning_rate": 2.1777221526908635e-05, + "loss": 0.3699, + "step": 1620 + }, + { + "epoch": 1.8238604389420372, + "grad_norm": 0.25548160538250764, + "learning_rate": 2.1756362119315812e-05, + "loss": 0.3442, + "step": 1621 + }, + { + "epoch": 1.8249859313449635, + "grad_norm": 0.2985142619761683, + "learning_rate": 2.173550271172299e-05, + "loss": 0.3486, + "step": 1622 + }, + { + "epoch": 1.8261114237478897, + "grad_norm": 0.2972946035959545, + "learning_rate": 2.1714643304130163e-05, + "loss": 0.3414, + "step": 1623 + }, + { + "epoch": 1.827236916150816, + "grad_norm": 0.26170651498968683, + "learning_rate": 2.169378389653734e-05, + "loss": 0.3284, + "step": 1624 + }, + { + "epoch": 1.8283624085537422, + "grad_norm": 0.2524407858115918, + "learning_rate": 2.1672924488944517e-05, + "loss": 0.3599, + "step": 1625 + }, + { + "epoch": 1.8294879009566687, + "grad_norm": 0.3335691621333924, + "learning_rate": 2.165206508135169e-05, + "loss": 0.3603, + "step": 1626 + }, + { + "epoch": 1.8306133933595947, + "grad_norm": 0.2768913167073537, + "learning_rate": 2.1631205673758867e-05, + "loss": 0.3588, + "step": 1627 + }, + { + "epoch": 1.8317388857625212, + "grad_norm": 0.30050684042922793, + "learning_rate": 2.1610346266166044e-05, + "loss": 0.372, + "step": 1628 + }, + { + "epoch": 1.8328643781654472, + "grad_norm": 0.2901843574196796, + "learning_rate": 2.1589486858573218e-05, + "loss": 0.3639, + "step": 1629 + }, + { + "epoch": 1.8339898705683737, + "grad_norm": 0.29902669217912486, + "learning_rate": 2.1568627450980395e-05, + "loss": 0.3735, + "step": 1630 + }, + { + "epoch": 1.8351153629713, + "grad_norm": 0.30980781618970216, + "learning_rate": 2.154776804338757e-05, + "loss": 0.3572, + "step": 1631 + }, + { + "epoch": 1.8362408553742262, + "grad_norm": 0.26616420601594276, + "learning_rate": 2.1526908635794745e-05, + "loss": 0.3582, + "step": 1632 + }, + { + "epoch": 1.8373663477771525, + "grad_norm": 0.29096782812841715, + "learning_rate": 2.150604922820192e-05, + "loss": 0.3533, + "step": 1633 + }, + { + "epoch": 1.8384918401800787, + "grad_norm": 0.29936454913412547, + "learning_rate": 2.1485189820609096e-05, + "loss": 0.3441, + "step": 1634 + }, + { + "epoch": 1.8396173325830052, + "grad_norm": 0.34946000879087785, + "learning_rate": 2.146433041301627e-05, + "loss": 0.3628, + "step": 1635 + }, + { + "epoch": 1.8407428249859312, + "grad_norm": 0.2623712677205065, + "learning_rate": 2.1443471005423447e-05, + "loss": 0.3545, + "step": 1636 + }, + { + "epoch": 1.8418683173888577, + "grad_norm": 0.2753735634753566, + "learning_rate": 2.1422611597830624e-05, + "loss": 0.3528, + "step": 1637 + }, + { + "epoch": 1.842993809791784, + "grad_norm": 0.31812525886192866, + "learning_rate": 2.1401752190237797e-05, + "loss": 0.3697, + "step": 1638 + }, + { + "epoch": 1.8441193021947102, + "grad_norm": 0.29105961621045684, + "learning_rate": 2.1380892782644974e-05, + "loss": 0.3546, + "step": 1639 + }, + { + "epoch": 1.8452447945976365, + "grad_norm": 0.2691984264982239, + "learning_rate": 2.1360033375052148e-05, + "loss": 0.3536, + "step": 1640 + }, + { + "epoch": 1.8463702870005627, + "grad_norm": 0.2993538772854178, + "learning_rate": 2.1339173967459325e-05, + "loss": 0.363, + "step": 1641 + }, + { + "epoch": 1.8474957794034892, + "grad_norm": 0.29783181788963287, + "learning_rate": 2.13183145598665e-05, + "loss": 0.3592, + "step": 1642 + }, + { + "epoch": 1.8486212718064152, + "grad_norm": 0.2775688059074239, + "learning_rate": 2.1297455152273675e-05, + "loss": 0.3581, + "step": 1643 + }, + { + "epoch": 1.8497467642093417, + "grad_norm": 0.3133614801924746, + "learning_rate": 2.1276595744680852e-05, + "loss": 0.3498, + "step": 1644 + }, + { + "epoch": 1.8508722566122677, + "grad_norm": 0.2772230818911116, + "learning_rate": 2.125573633708803e-05, + "loss": 0.3558, + "step": 1645 + }, + { + "epoch": 1.8519977490151942, + "grad_norm": 0.30827283116401644, + "learning_rate": 2.1234876929495203e-05, + "loss": 0.3614, + "step": 1646 + }, + { + "epoch": 1.8531232414181205, + "grad_norm": 0.24090218764810817, + "learning_rate": 2.121401752190238e-05, + "loss": 0.3662, + "step": 1647 + }, + { + "epoch": 1.8542487338210467, + "grad_norm": 0.28761910481188807, + "learning_rate": 2.1193158114309557e-05, + "loss": 0.3441, + "step": 1648 + }, + { + "epoch": 1.855374226223973, + "grad_norm": 0.2560509442786654, + "learning_rate": 2.117229870671673e-05, + "loss": 0.3547, + "step": 1649 + }, + { + "epoch": 1.8564997186268992, + "grad_norm": 0.30034883076743724, + "learning_rate": 2.1151439299123907e-05, + "loss": 0.3449, + "step": 1650 + }, + { + "epoch": 1.8576252110298257, + "grad_norm": 0.34444462233589906, + "learning_rate": 2.113057989153108e-05, + "loss": 0.3837, + "step": 1651 + }, + { + "epoch": 1.8587507034327517, + "grad_norm": 0.27692690682489174, + "learning_rate": 2.1109720483938258e-05, + "loss": 0.3607, + "step": 1652 + }, + { + "epoch": 1.8598761958356782, + "grad_norm": 0.26001142796077303, + "learning_rate": 2.1088861076345435e-05, + "loss": 0.3398, + "step": 1653 + }, + { + "epoch": 1.8610016882386042, + "grad_norm": 0.25366060360784753, + "learning_rate": 2.106800166875261e-05, + "loss": 0.3656, + "step": 1654 + }, + { + "epoch": 1.8621271806415307, + "grad_norm": 0.25058815872177637, + "learning_rate": 2.1047142261159782e-05, + "loss": 0.3439, + "step": 1655 + }, + { + "epoch": 1.863252673044457, + "grad_norm": 0.28664975028041284, + "learning_rate": 2.102628285356696e-05, + "loss": 0.3583, + "step": 1656 + }, + { + "epoch": 1.8643781654473832, + "grad_norm": 0.2732549675529288, + "learning_rate": 2.1005423445974136e-05, + "loss": 0.3305, + "step": 1657 + }, + { + "epoch": 1.8655036578503095, + "grad_norm": 0.2773666490469463, + "learning_rate": 2.098456403838131e-05, + "loss": 0.3591, + "step": 1658 + }, + { + "epoch": 1.8666291502532357, + "grad_norm": 0.2690002427002813, + "learning_rate": 2.0963704630788487e-05, + "loss": 0.3684, + "step": 1659 + }, + { + "epoch": 1.8677546426561622, + "grad_norm": 0.27085097978896006, + "learning_rate": 2.094284522319566e-05, + "loss": 0.3384, + "step": 1660 + }, + { + "epoch": 1.8688801350590882, + "grad_norm": 0.24697707069643743, + "learning_rate": 2.0921985815602837e-05, + "loss": 0.3572, + "step": 1661 + }, + { + "epoch": 1.8700056274620147, + "grad_norm": 0.2764605247602527, + "learning_rate": 2.0901126408010014e-05, + "loss": 0.3552, + "step": 1662 + }, + { + "epoch": 1.871131119864941, + "grad_norm": 0.2902550139143697, + "learning_rate": 2.0880267000417188e-05, + "loss": 0.3581, + "step": 1663 + }, + { + "epoch": 1.8722566122678672, + "grad_norm": 0.25734658506325125, + "learning_rate": 2.0859407592824365e-05, + "loss": 0.3509, + "step": 1664 + }, + { + "epoch": 1.8733821046707935, + "grad_norm": 0.29290615718137913, + "learning_rate": 2.0838548185231542e-05, + "loss": 0.3448, + "step": 1665 + }, + { + "epoch": 1.8745075970737197, + "grad_norm": 0.2633403418767797, + "learning_rate": 2.0817688777638715e-05, + "loss": 0.3556, + "step": 1666 + }, + { + "epoch": 1.8756330894766462, + "grad_norm": 0.3044255909775045, + "learning_rate": 2.0796829370045892e-05, + "loss": 0.3451, + "step": 1667 + }, + { + "epoch": 1.8767585818795722, + "grad_norm": 0.2932864685525451, + "learning_rate": 2.077596996245307e-05, + "loss": 0.3657, + "step": 1668 + }, + { + "epoch": 1.8778840742824987, + "grad_norm": 0.31135509455954635, + "learning_rate": 2.0755110554860243e-05, + "loss": 0.3734, + "step": 1669 + }, + { + "epoch": 1.8790095666854247, + "grad_norm": 0.2664061935893102, + "learning_rate": 2.073425114726742e-05, + "loss": 0.3629, + "step": 1670 + }, + { + "epoch": 1.8801350590883512, + "grad_norm": 0.2707969930148503, + "learning_rate": 2.0713391739674597e-05, + "loss": 0.3483, + "step": 1671 + }, + { + "epoch": 1.8812605514912775, + "grad_norm": 0.2582761473461036, + "learning_rate": 2.069253233208177e-05, + "loss": 0.366, + "step": 1672 + }, + { + "epoch": 1.8823860438942037, + "grad_norm": 0.2818191859830275, + "learning_rate": 2.0671672924488947e-05, + "loss": 0.3606, + "step": 1673 + }, + { + "epoch": 1.88351153629713, + "grad_norm": 0.274907626023918, + "learning_rate": 2.065081351689612e-05, + "loss": 0.3733, + "step": 1674 + }, + { + "epoch": 1.8846370287000562, + "grad_norm": 0.25302448281459705, + "learning_rate": 2.0629954109303295e-05, + "loss": 0.344, + "step": 1675 + }, + { + "epoch": 1.8857625211029827, + "grad_norm": 0.2601145397643824, + "learning_rate": 2.060909470171047e-05, + "loss": 0.3655, + "step": 1676 + }, + { + "epoch": 1.8868880135059087, + "grad_norm": 0.2598011168749623, + "learning_rate": 2.058823529411765e-05, + "loss": 0.3583, + "step": 1677 + }, + { + "epoch": 1.8880135059088352, + "grad_norm": 0.2764045861628215, + "learning_rate": 2.0567375886524822e-05, + "loss": 0.3358, + "step": 1678 + }, + { + "epoch": 1.8891389983117612, + "grad_norm": 0.2505563945259788, + "learning_rate": 2.0546516478932e-05, + "loss": 0.3326, + "step": 1679 + }, + { + "epoch": 1.8902644907146877, + "grad_norm": 0.2593385914562438, + "learning_rate": 2.0525657071339173e-05, + "loss": 0.344, + "step": 1680 + }, + { + "epoch": 1.891389983117614, + "grad_norm": 0.32013539903668187, + "learning_rate": 2.050479766374635e-05, + "loss": 0.3678, + "step": 1681 + }, + { + "epoch": 1.8925154755205402, + "grad_norm": 0.2850992099914004, + "learning_rate": 2.0483938256153527e-05, + "loss": 0.397, + "step": 1682 + }, + { + "epoch": 1.8936409679234665, + "grad_norm": 0.3016034620250037, + "learning_rate": 2.04630788485607e-05, + "loss": 0.3358, + "step": 1683 + }, + { + "epoch": 1.8947664603263927, + "grad_norm": 0.322626269426066, + "learning_rate": 2.0442219440967877e-05, + "loss": 0.3493, + "step": 1684 + }, + { + "epoch": 1.8958919527293192, + "grad_norm": 0.27415129738901345, + "learning_rate": 2.0421360033375054e-05, + "loss": 0.3612, + "step": 1685 + }, + { + "epoch": 1.8970174451322452, + "grad_norm": 0.3202508460747489, + "learning_rate": 2.0400500625782228e-05, + "loss": 0.3449, + "step": 1686 + }, + { + "epoch": 1.8981429375351717, + "grad_norm": 0.2610128644172156, + "learning_rate": 2.0379641218189405e-05, + "loss": 0.334, + "step": 1687 + }, + { + "epoch": 1.899268429938098, + "grad_norm": 0.26431886989489495, + "learning_rate": 2.0358781810596582e-05, + "loss": 0.3701, + "step": 1688 + }, + { + "epoch": 1.9003939223410242, + "grad_norm": 0.32289025222752066, + "learning_rate": 2.0337922403003756e-05, + "loss": 0.3772, + "step": 1689 + }, + { + "epoch": 1.9015194147439505, + "grad_norm": 0.27620099175466095, + "learning_rate": 2.0317062995410932e-05, + "loss": 0.3634, + "step": 1690 + }, + { + "epoch": 1.9026449071468767, + "grad_norm": 0.30452855448211125, + "learning_rate": 2.029620358781811e-05, + "loss": 0.3619, + "step": 1691 + }, + { + "epoch": 1.9037703995498032, + "grad_norm": 0.30999319017283444, + "learning_rate": 2.0275344180225283e-05, + "loss": 0.3472, + "step": 1692 + }, + { + "epoch": 1.9048958919527292, + "grad_norm": 0.34073549354424293, + "learning_rate": 2.025448477263246e-05, + "loss": 0.3417, + "step": 1693 + }, + { + "epoch": 1.9060213843556557, + "grad_norm": 0.28162550986145274, + "learning_rate": 2.0233625365039634e-05, + "loss": 0.3536, + "step": 1694 + }, + { + "epoch": 1.9071468767585817, + "grad_norm": 0.3215339598711887, + "learning_rate": 2.0212765957446807e-05, + "loss": 0.3682, + "step": 1695 + }, + { + "epoch": 1.9082723691615082, + "grad_norm": 0.34154514944007364, + "learning_rate": 2.0191906549853984e-05, + "loss": 0.3573, + "step": 1696 + }, + { + "epoch": 1.9093978615644345, + "grad_norm": 0.27450876997174517, + "learning_rate": 2.017104714226116e-05, + "loss": 0.3656, + "step": 1697 + }, + { + "epoch": 1.9105233539673607, + "grad_norm": 0.32973694211143484, + "learning_rate": 2.0150187734668335e-05, + "loss": 0.3729, + "step": 1698 + }, + { + "epoch": 1.911648846370287, + "grad_norm": 0.33057591238589434, + "learning_rate": 2.0129328327075512e-05, + "loss": 0.371, + "step": 1699 + }, + { + "epoch": 1.9127743387732132, + "grad_norm": 0.28948186161364625, + "learning_rate": 2.0108468919482685e-05, + "loss": 0.3397, + "step": 1700 + }, + { + "epoch": 1.9138998311761397, + "grad_norm": 0.3007970569880779, + "learning_rate": 2.0087609511889862e-05, + "loss": 0.3643, + "step": 1701 + }, + { + "epoch": 1.9150253235790657, + "grad_norm": 0.2612518404162693, + "learning_rate": 2.006675010429704e-05, + "loss": 0.3532, + "step": 1702 + }, + { + "epoch": 1.9161508159819922, + "grad_norm": 0.31521980587085163, + "learning_rate": 2.0045890696704213e-05, + "loss": 0.3572, + "step": 1703 + }, + { + "epoch": 1.9172763083849182, + "grad_norm": 0.32716978204799535, + "learning_rate": 2.002503128911139e-05, + "loss": 0.3655, + "step": 1704 + }, + { + "epoch": 1.9184018007878447, + "grad_norm": 0.2848312721456602, + "learning_rate": 2.0004171881518567e-05, + "loss": 0.3293, + "step": 1705 + }, + { + "epoch": 1.919527293190771, + "grad_norm": 0.2849516222624094, + "learning_rate": 1.998331247392574e-05, + "loss": 0.3493, + "step": 1706 + }, + { + "epoch": 1.9206527855936972, + "grad_norm": 0.2748223750385321, + "learning_rate": 1.9962453066332917e-05, + "loss": 0.3361, + "step": 1707 + }, + { + "epoch": 1.9217782779966235, + "grad_norm": 0.3052533145067581, + "learning_rate": 1.9941593658740094e-05, + "loss": 0.3697, + "step": 1708 + }, + { + "epoch": 1.9229037703995497, + "grad_norm": 0.2819225673013518, + "learning_rate": 1.9920734251147268e-05, + "loss": 0.3598, + "step": 1709 + }, + { + "epoch": 1.9240292628024762, + "grad_norm": 0.28297852832083414, + "learning_rate": 1.9899874843554445e-05, + "loss": 0.3421, + "step": 1710 + }, + { + "epoch": 1.9251547552054022, + "grad_norm": 0.32135792331365465, + "learning_rate": 1.9879015435961622e-05, + "loss": 0.3728, + "step": 1711 + }, + { + "epoch": 1.9262802476083287, + "grad_norm": 0.2485116486993189, + "learning_rate": 1.9858156028368796e-05, + "loss": 0.3494, + "step": 1712 + }, + { + "epoch": 1.927405740011255, + "grad_norm": 0.2749683711636245, + "learning_rate": 1.9837296620775973e-05, + "loss": 0.346, + "step": 1713 + }, + { + "epoch": 1.9285312324141812, + "grad_norm": 0.2642179410888402, + "learning_rate": 1.9816437213183146e-05, + "loss": 0.3548, + "step": 1714 + }, + { + "epoch": 1.9296567248171075, + "grad_norm": 0.25158261695078715, + "learning_rate": 1.9795577805590323e-05, + "loss": 0.359, + "step": 1715 + }, + { + "epoch": 1.9307822172200337, + "grad_norm": 0.27223176041458313, + "learning_rate": 1.9774718397997497e-05, + "loss": 0.3414, + "step": 1716 + }, + { + "epoch": 1.93190770962296, + "grad_norm": 0.2782144577617854, + "learning_rate": 1.9753858990404674e-05, + "loss": 0.3561, + "step": 1717 + }, + { + "epoch": 1.9330332020258862, + "grad_norm": 0.27538099734788146, + "learning_rate": 1.9732999582811847e-05, + "loss": 0.3472, + "step": 1718 + }, + { + "epoch": 1.9341586944288127, + "grad_norm": 0.2960828119915571, + "learning_rate": 1.9712140175219024e-05, + "loss": 0.3496, + "step": 1719 + }, + { + "epoch": 1.9352841868317388, + "grad_norm": 0.258095045594745, + "learning_rate": 1.9691280767626198e-05, + "loss": 0.3517, + "step": 1720 + }, + { + "epoch": 1.9364096792346652, + "grad_norm": 0.3024256600541793, + "learning_rate": 1.9670421360033375e-05, + "loss": 0.3586, + "step": 1721 + }, + { + "epoch": 1.9375351716375915, + "grad_norm": 0.29098939153442666, + "learning_rate": 1.9649561952440552e-05, + "loss": 0.3643, + "step": 1722 + }, + { + "epoch": 1.9386606640405177, + "grad_norm": 0.25782898610022725, + "learning_rate": 1.9628702544847726e-05, + "loss": 0.367, + "step": 1723 + }, + { + "epoch": 1.939786156443444, + "grad_norm": 0.3495526740430891, + "learning_rate": 1.9607843137254903e-05, + "loss": 0.3577, + "step": 1724 + }, + { + "epoch": 1.9409116488463702, + "grad_norm": 0.2728973828660973, + "learning_rate": 1.958698372966208e-05, + "loss": 0.3554, + "step": 1725 + }, + { + "epoch": 1.9420371412492967, + "grad_norm": 0.2901290142358023, + "learning_rate": 1.9566124322069253e-05, + "loss": 0.37, + "step": 1726 + }, + { + "epoch": 1.9431626336522227, + "grad_norm": 0.3031752356222974, + "learning_rate": 1.954526491447643e-05, + "loss": 0.3638, + "step": 1727 + }, + { + "epoch": 1.9442881260551492, + "grad_norm": 0.260909753207997, + "learning_rate": 1.9524405506883607e-05, + "loss": 0.3618, + "step": 1728 + }, + { + "epoch": 1.9454136184580753, + "grad_norm": 0.28948350014768964, + "learning_rate": 1.950354609929078e-05, + "loss": 0.3401, + "step": 1729 + }, + { + "epoch": 1.9465391108610017, + "grad_norm": 0.2623446580726307, + "learning_rate": 1.9482686691697958e-05, + "loss": 0.3618, + "step": 1730 + }, + { + "epoch": 1.947664603263928, + "grad_norm": 0.2666588748626957, + "learning_rate": 1.9461827284105135e-05, + "loss": 0.3424, + "step": 1731 + }, + { + "epoch": 1.9487900956668542, + "grad_norm": 0.23758227129892492, + "learning_rate": 1.9440967876512308e-05, + "loss": 0.3647, + "step": 1732 + }, + { + "epoch": 1.9499155880697805, + "grad_norm": 1.0235070433552647, + "learning_rate": 1.9420108468919485e-05, + "loss": 0.362, + "step": 1733 + }, + { + "epoch": 1.9510410804727067, + "grad_norm": 0.28481161066229677, + "learning_rate": 1.939924906132666e-05, + "loss": 0.3631, + "step": 1734 + }, + { + "epoch": 1.9521665728756332, + "grad_norm": 0.2848122389618838, + "learning_rate": 1.9378389653733836e-05, + "loss": 0.3469, + "step": 1735 + }, + { + "epoch": 1.9532920652785593, + "grad_norm": 0.2759014719515173, + "learning_rate": 1.9357530246141013e-05, + "loss": 0.3425, + "step": 1736 + }, + { + "epoch": 1.9544175576814857, + "grad_norm": 0.27874949300316715, + "learning_rate": 1.9336670838548186e-05, + "loss": 0.3855, + "step": 1737 + }, + { + "epoch": 1.955543050084412, + "grad_norm": 0.31363642679753656, + "learning_rate": 1.931581143095536e-05, + "loss": 0.3536, + "step": 1738 + }, + { + "epoch": 1.9566685424873382, + "grad_norm": 0.2556224324207228, + "learning_rate": 1.9294952023362537e-05, + "loss": 0.3432, + "step": 1739 + }, + { + "epoch": 1.9577940348902645, + "grad_norm": 0.2670888092453423, + "learning_rate": 1.927409261576971e-05, + "loss": 0.3509, + "step": 1740 + }, + { + "epoch": 1.9589195272931907, + "grad_norm": 0.25001267900165874, + "learning_rate": 1.9253233208176888e-05, + "loss": 0.3323, + "step": 1741 + }, + { + "epoch": 1.960045019696117, + "grad_norm": 0.2974207872384669, + "learning_rate": 1.9232373800584064e-05, + "loss": 0.3544, + "step": 1742 + }, + { + "epoch": 1.9611705120990433, + "grad_norm": 0.27472747483190185, + "learning_rate": 1.9211514392991238e-05, + "loss": 0.3521, + "step": 1743 + }, + { + "epoch": 1.9622960045019697, + "grad_norm": 0.2683475797146492, + "learning_rate": 1.9190654985398415e-05, + "loss": 0.3682, + "step": 1744 + }, + { + "epoch": 1.9634214969048958, + "grad_norm": 0.3822465905741808, + "learning_rate": 1.9169795577805592e-05, + "loss": 0.3535, + "step": 1745 + }, + { + "epoch": 1.9645469893078222, + "grad_norm": 0.29811948702966473, + "learning_rate": 1.9148936170212766e-05, + "loss": 0.3732, + "step": 1746 + }, + { + "epoch": 1.9656724817107485, + "grad_norm": 0.30142259657958986, + "learning_rate": 1.9128076762619943e-05, + "loss": 0.3432, + "step": 1747 + }, + { + "epoch": 1.9667979741136747, + "grad_norm": 0.3786818892770981, + "learning_rate": 1.910721735502712e-05, + "loss": 0.3654, + "step": 1748 + }, + { + "epoch": 1.967923466516601, + "grad_norm": 0.27029496791481233, + "learning_rate": 1.9086357947434293e-05, + "loss": 0.3682, + "step": 1749 + }, + { + "epoch": 1.9690489589195272, + "grad_norm": 0.262798379544428, + "learning_rate": 1.906549853984147e-05, + "loss": 0.3568, + "step": 1750 + }, + { + "epoch": 1.9701744513224537, + "grad_norm": 0.3135712581670641, + "learning_rate": 1.9044639132248647e-05, + "loss": 0.3511, + "step": 1751 + }, + { + "epoch": 1.9712999437253798, + "grad_norm": 0.3158145619580369, + "learning_rate": 1.902377972465582e-05, + "loss": 0.36, + "step": 1752 + }, + { + "epoch": 1.9724254361283062, + "grad_norm": 0.318706113946463, + "learning_rate": 1.9002920317062998e-05, + "loss": 0.3562, + "step": 1753 + }, + { + "epoch": 1.9735509285312323, + "grad_norm": 0.310806681735437, + "learning_rate": 1.898206090947017e-05, + "loss": 0.3514, + "step": 1754 + }, + { + "epoch": 1.9746764209341587, + "grad_norm": 0.2849866940009224, + "learning_rate": 1.896120150187735e-05, + "loss": 0.3477, + "step": 1755 + }, + { + "epoch": 1.975801913337085, + "grad_norm": 0.2810634482745697, + "learning_rate": 1.8940342094284525e-05, + "loss": 0.3644, + "step": 1756 + }, + { + "epoch": 1.9769274057400112, + "grad_norm": 0.2878137639733897, + "learning_rate": 1.89194826866917e-05, + "loss": 0.3594, + "step": 1757 + }, + { + "epoch": 1.9780528981429375, + "grad_norm": 0.26057909113445293, + "learning_rate": 1.8898623279098873e-05, + "loss": 0.3755, + "step": 1758 + }, + { + "epoch": 1.9791783905458638, + "grad_norm": 0.27092989925442396, + "learning_rate": 1.887776387150605e-05, + "loss": 0.3648, + "step": 1759 + }, + { + "epoch": 1.9803038829487902, + "grad_norm": 0.2845108154959281, + "learning_rate": 1.8856904463913223e-05, + "loss": 0.3449, + "step": 1760 + }, + { + "epoch": 1.9814293753517163, + "grad_norm": 0.24467445189735315, + "learning_rate": 1.88360450563204e-05, + "loss": 0.3558, + "step": 1761 + }, + { + "epoch": 1.9825548677546427, + "grad_norm": 0.2715643743977259, + "learning_rate": 1.8815185648727577e-05, + "loss": 0.3567, + "step": 1762 + }, + { + "epoch": 1.983680360157569, + "grad_norm": 0.2613996036084293, + "learning_rate": 1.879432624113475e-05, + "loss": 0.3467, + "step": 1763 + }, + { + "epoch": 1.9848058525604952, + "grad_norm": 0.2816357872833296, + "learning_rate": 1.8773466833541928e-05, + "loss": 0.3417, + "step": 1764 + }, + { + "epoch": 1.9859313449634215, + "grad_norm": 0.29529698315579805, + "learning_rate": 1.8752607425949105e-05, + "loss": 0.3527, + "step": 1765 + }, + { + "epoch": 1.9870568373663478, + "grad_norm": 0.27238727861070106, + "learning_rate": 1.8731748018356278e-05, + "loss": 0.3717, + "step": 1766 + }, + { + "epoch": 1.988182329769274, + "grad_norm": 0.27577156414013015, + "learning_rate": 1.8710888610763455e-05, + "loss": 0.3632, + "step": 1767 + }, + { + "epoch": 1.9893078221722003, + "grad_norm": 0.31287278872365587, + "learning_rate": 1.8690029203170632e-05, + "loss": 0.3678, + "step": 1768 + }, + { + "epoch": 1.9904333145751267, + "grad_norm": 0.2910024485455243, + "learning_rate": 1.8669169795577806e-05, + "loss": 0.3661, + "step": 1769 + }, + { + "epoch": 1.9915588069780528, + "grad_norm": 0.29522751930001573, + "learning_rate": 1.8648310387984983e-05, + "loss": 0.3733, + "step": 1770 + }, + { + "epoch": 1.9926842993809792, + "grad_norm": 0.2931943333929543, + "learning_rate": 1.862745098039216e-05, + "loss": 0.3554, + "step": 1771 + }, + { + "epoch": 1.9938097917839055, + "grad_norm": 0.29961502454826516, + "learning_rate": 1.8606591572799333e-05, + "loss": 0.3534, + "step": 1772 + }, + { + "epoch": 1.9949352841868317, + "grad_norm": 0.3016308875068367, + "learning_rate": 1.858573216520651e-05, + "loss": 0.3868, + "step": 1773 + }, + { + "epoch": 1.996060776589758, + "grad_norm": 0.3051815491365933, + "learning_rate": 1.8564872757613684e-05, + "loss": 0.3573, + "step": 1774 + }, + { + "epoch": 1.9971862689926843, + "grad_norm": 0.3463472368237023, + "learning_rate": 1.854401335002086e-05, + "loss": 0.3568, + "step": 1775 + }, + { + "epoch": 1.9983117613956107, + "grad_norm": 0.30250184431483823, + "learning_rate": 1.8523153942428038e-05, + "loss": 0.3679, + "step": 1776 + }, + { + "epoch": 1.9994372537985368, + "grad_norm": 0.27076812267453526, + "learning_rate": 1.850229453483521e-05, + "loss": 0.359, + "step": 1777 + }, + { + "epoch": 2.0, + "grad_norm": 0.41375343672066456, + "learning_rate": 1.848143512724239e-05, + "loss": 0.3219, + "step": 1778 + }, + { + "epoch": 2.0011254924029265, + "grad_norm": 0.30690460412174675, + "learning_rate": 1.8460575719649562e-05, + "loss": 0.2847, + "step": 1779 + }, + { + "epoch": 2.0022509848058525, + "grad_norm": 0.29252851536455965, + "learning_rate": 1.8439716312056736e-05, + "loss": 0.2916, + "step": 1780 + }, + { + "epoch": 2.003376477208779, + "grad_norm": 0.2867585999652241, + "learning_rate": 1.8418856904463913e-05, + "loss": 0.304, + "step": 1781 + }, + { + "epoch": 2.004501969611705, + "grad_norm": 0.3147976773039966, + "learning_rate": 1.839799749687109e-05, + "loss": 0.2891, + "step": 1782 + }, + { + "epoch": 2.0056274620146315, + "grad_norm": 0.2441846828289504, + "learning_rate": 1.8377138089278263e-05, + "loss": 0.2909, + "step": 1783 + }, + { + "epoch": 2.0067529544175575, + "grad_norm": 0.2593896216365388, + "learning_rate": 1.835627868168544e-05, + "loss": 0.2753, + "step": 1784 + }, + { + "epoch": 2.007878446820484, + "grad_norm": 0.2893905461877493, + "learning_rate": 1.8335419274092617e-05, + "loss": 0.2818, + "step": 1785 + }, + { + "epoch": 2.00900393922341, + "grad_norm": 0.2846562929483248, + "learning_rate": 1.831455986649979e-05, + "loss": 0.2907, + "step": 1786 + }, + { + "epoch": 2.0101294316263365, + "grad_norm": 0.2566724532797832, + "learning_rate": 1.8293700458906968e-05, + "loss": 0.2865, + "step": 1787 + }, + { + "epoch": 2.011254924029263, + "grad_norm": 0.30986557763389416, + "learning_rate": 1.8272841051314145e-05, + "loss": 0.299, + "step": 1788 + }, + { + "epoch": 2.012380416432189, + "grad_norm": 0.2790346837879426, + "learning_rate": 1.825198164372132e-05, + "loss": 0.2872, + "step": 1789 + }, + { + "epoch": 2.0135059088351155, + "grad_norm": 0.28965248515971675, + "learning_rate": 1.8231122236128495e-05, + "loss": 0.282, + "step": 1790 + }, + { + "epoch": 2.0146314012380415, + "grad_norm": 0.26758447999158064, + "learning_rate": 1.8210262828535672e-05, + "loss": 0.2854, + "step": 1791 + }, + { + "epoch": 2.015756893640968, + "grad_norm": 0.25752829835015667, + "learning_rate": 1.8189403420942846e-05, + "loss": 0.2875, + "step": 1792 + }, + { + "epoch": 2.016882386043894, + "grad_norm": 0.26237094621373575, + "learning_rate": 1.8168544013350023e-05, + "loss": 0.2861, + "step": 1793 + }, + { + "epoch": 2.0180078784468205, + "grad_norm": 0.25324822624548066, + "learning_rate": 1.8147684605757196e-05, + "loss": 0.2804, + "step": 1794 + }, + { + "epoch": 2.019133370849747, + "grad_norm": 0.27650509711437854, + "learning_rate": 1.8126825198164373e-05, + "loss": 0.298, + "step": 1795 + }, + { + "epoch": 2.020258863252673, + "grad_norm": 0.271607108362916, + "learning_rate": 1.810596579057155e-05, + "loss": 0.2793, + "step": 1796 + }, + { + "epoch": 2.0213843556555995, + "grad_norm": 0.2763902863245182, + "learning_rate": 1.8085106382978724e-05, + "loss": 0.2824, + "step": 1797 + }, + { + "epoch": 2.0225098480585255, + "grad_norm": 0.29074430245042243, + "learning_rate": 1.80642469753859e-05, + "loss": 0.2847, + "step": 1798 + }, + { + "epoch": 2.023635340461452, + "grad_norm": 0.252760394282513, + "learning_rate": 1.8043387567793075e-05, + "loss": 0.2729, + "step": 1799 + }, + { + "epoch": 2.024760832864378, + "grad_norm": 0.25115826895976634, + "learning_rate": 1.8022528160200248e-05, + "loss": 0.2903, + "step": 1800 + }, + { + "epoch": 2.0258863252673045, + "grad_norm": 0.31665556656306054, + "learning_rate": 1.8001668752607425e-05, + "loss": 0.2806, + "step": 1801 + }, + { + "epoch": 2.0270118176702305, + "grad_norm": 0.27565102328032076, + "learning_rate": 1.7980809345014602e-05, + "loss": 0.2781, + "step": 1802 + }, + { + "epoch": 2.028137310073157, + "grad_norm": 0.26334129144996565, + "learning_rate": 1.7959949937421776e-05, + "loss": 0.2865, + "step": 1803 + }, + { + "epoch": 2.0292628024760835, + "grad_norm": 0.29084203177119927, + "learning_rate": 1.7939090529828953e-05, + "loss": 0.2915, + "step": 1804 + }, + { + "epoch": 2.0303882948790095, + "grad_norm": 0.24821063662817036, + "learning_rate": 1.791823112223613e-05, + "loss": 0.2784, + "step": 1805 + }, + { + "epoch": 2.031513787281936, + "grad_norm": 0.2550931735301453, + "learning_rate": 1.7897371714643303e-05, + "loss": 0.2836, + "step": 1806 + }, + { + "epoch": 2.032639279684862, + "grad_norm": 0.27634727649104684, + "learning_rate": 1.787651230705048e-05, + "loss": 0.3069, + "step": 1807 + }, + { + "epoch": 2.0337647720877885, + "grad_norm": 0.24014034990048097, + "learning_rate": 1.7855652899457657e-05, + "loss": 0.2858, + "step": 1808 + }, + { + "epoch": 2.0348902644907145, + "grad_norm": 0.23529224395747875, + "learning_rate": 1.783479349186483e-05, + "loss": 0.292, + "step": 1809 + }, + { + "epoch": 2.036015756893641, + "grad_norm": 0.2226918871531934, + "learning_rate": 1.7813934084272008e-05, + "loss": 0.289, + "step": 1810 + }, + { + "epoch": 2.037141249296567, + "grad_norm": 0.24875514083553227, + "learning_rate": 1.7793074676679185e-05, + "loss": 0.2879, + "step": 1811 + }, + { + "epoch": 2.0382667416994935, + "grad_norm": 0.22101380287283037, + "learning_rate": 1.777221526908636e-05, + "loss": 0.2785, + "step": 1812 + }, + { + "epoch": 2.03939223410242, + "grad_norm": 0.24344041835452335, + "learning_rate": 1.7751355861493535e-05, + "loss": 0.2768, + "step": 1813 + }, + { + "epoch": 2.040517726505346, + "grad_norm": 0.24709305555007302, + "learning_rate": 1.773049645390071e-05, + "loss": 0.2785, + "step": 1814 + }, + { + "epoch": 2.0416432189082725, + "grad_norm": 0.23036508957897686, + "learning_rate": 1.7709637046307886e-05, + "loss": 0.2829, + "step": 1815 + }, + { + "epoch": 2.0427687113111985, + "grad_norm": 0.304777104086667, + "learning_rate": 1.7688777638715063e-05, + "loss": 0.2842, + "step": 1816 + }, + { + "epoch": 2.043894203714125, + "grad_norm": 0.24563307593084063, + "learning_rate": 1.7667918231122237e-05, + "loss": 0.2842, + "step": 1817 + }, + { + "epoch": 2.045019696117051, + "grad_norm": 0.24049156495572827, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.29, + "step": 1818 + }, + { + "epoch": 2.0461451885199775, + "grad_norm": 0.26152397114334225, + "learning_rate": 1.762619941593659e-05, + "loss": 0.2943, + "step": 1819 + }, + { + "epoch": 2.047270680922904, + "grad_norm": 0.24701566468961217, + "learning_rate": 1.7605340008343764e-05, + "loss": 0.28, + "step": 1820 + }, + { + "epoch": 2.04839617332583, + "grad_norm": 0.22113320376779072, + "learning_rate": 1.7584480600750938e-05, + "loss": 0.2824, + "step": 1821 + }, + { + "epoch": 2.0495216657287565, + "grad_norm": 0.2498303273769485, + "learning_rate": 1.7563621193158115e-05, + "loss": 0.2764, + "step": 1822 + }, + { + "epoch": 2.0506471581316825, + "grad_norm": 0.2613079367123678, + "learning_rate": 1.754276178556529e-05, + "loss": 0.3029, + "step": 1823 + }, + { + "epoch": 2.051772650534609, + "grad_norm": 0.2533549657170249, + "learning_rate": 1.7521902377972465e-05, + "loss": 0.2941, + "step": 1824 + }, + { + "epoch": 2.052898142937535, + "grad_norm": 0.24525113996522538, + "learning_rate": 1.7501042970379642e-05, + "loss": 0.2791, + "step": 1825 + }, + { + "epoch": 2.0540236353404615, + "grad_norm": 0.22636672236346222, + "learning_rate": 1.7480183562786816e-05, + "loss": 0.2708, + "step": 1826 + }, + { + "epoch": 2.0551491277433875, + "grad_norm": 0.2318404892918077, + "learning_rate": 1.7459324155193993e-05, + "loss": 0.2831, + "step": 1827 + }, + { + "epoch": 2.056274620146314, + "grad_norm": 0.22908482292345286, + "learning_rate": 1.743846474760117e-05, + "loss": 0.2791, + "step": 1828 + }, + { + "epoch": 2.0574001125492405, + "grad_norm": 0.23199016490767796, + "learning_rate": 1.7417605340008343e-05, + "loss": 0.2899, + "step": 1829 + }, + { + "epoch": 2.0585256049521665, + "grad_norm": 0.22679432927238993, + "learning_rate": 1.739674593241552e-05, + "loss": 0.2705, + "step": 1830 + }, + { + "epoch": 2.059651097355093, + "grad_norm": 0.240936280203786, + "learning_rate": 1.7375886524822697e-05, + "loss": 0.2796, + "step": 1831 + }, + { + "epoch": 2.060776589758019, + "grad_norm": 0.23052791316981805, + "learning_rate": 1.735502711722987e-05, + "loss": 0.298, + "step": 1832 + }, + { + "epoch": 2.0619020821609455, + "grad_norm": 0.22399826316835342, + "learning_rate": 1.7334167709637048e-05, + "loss": 0.2768, + "step": 1833 + }, + { + "epoch": 2.0630275745638715, + "grad_norm": 0.24389711598920422, + "learning_rate": 1.731330830204422e-05, + "loss": 0.2789, + "step": 1834 + }, + { + "epoch": 2.064153066966798, + "grad_norm": 0.24531794065173357, + "learning_rate": 1.72924488944514e-05, + "loss": 0.2841, + "step": 1835 + }, + { + "epoch": 2.065278559369724, + "grad_norm": 0.2857308138585535, + "learning_rate": 1.7271589486858576e-05, + "loss": 0.2746, + "step": 1836 + }, + { + "epoch": 2.0664040517726505, + "grad_norm": 0.2331548964731216, + "learning_rate": 1.725073007926575e-05, + "loss": 0.2779, + "step": 1837 + }, + { + "epoch": 2.067529544175577, + "grad_norm": 0.23649426513105923, + "learning_rate": 1.7229870671672926e-05, + "loss": 0.2913, + "step": 1838 + }, + { + "epoch": 2.068655036578503, + "grad_norm": 0.2777841981435879, + "learning_rate": 1.7209011264080103e-05, + "loss": 0.2826, + "step": 1839 + }, + { + "epoch": 2.0697805289814295, + "grad_norm": 0.27066327686914066, + "learning_rate": 1.7188151856487277e-05, + "loss": 0.2893, + "step": 1840 + }, + { + "epoch": 2.0709060213843555, + "grad_norm": 0.23134899380353294, + "learning_rate": 1.716729244889445e-05, + "loss": 0.2804, + "step": 1841 + }, + { + "epoch": 2.072031513787282, + "grad_norm": 0.29223852513335047, + "learning_rate": 1.7146433041301627e-05, + "loss": 0.2916, + "step": 1842 + }, + { + "epoch": 2.073157006190208, + "grad_norm": 0.2735960908953659, + "learning_rate": 1.71255736337088e-05, + "loss": 0.2852, + "step": 1843 + }, + { + "epoch": 2.0742824985931345, + "grad_norm": 0.26821528502754455, + "learning_rate": 1.7104714226115978e-05, + "loss": 0.2891, + "step": 1844 + }, + { + "epoch": 2.0754079909960605, + "grad_norm": 0.26154260311021144, + "learning_rate": 1.7083854818523155e-05, + "loss": 0.2875, + "step": 1845 + }, + { + "epoch": 2.076533483398987, + "grad_norm": 0.31021830521974225, + "learning_rate": 1.706299541093033e-05, + "loss": 0.2776, + "step": 1846 + }, + { + "epoch": 2.0776589758019135, + "grad_norm": 0.2788988641156972, + "learning_rate": 1.7042136003337505e-05, + "loss": 0.2886, + "step": 1847 + }, + { + "epoch": 2.0787844682048395, + "grad_norm": 0.2907858072020635, + "learning_rate": 1.7021276595744682e-05, + "loss": 0.2895, + "step": 1848 + }, + { + "epoch": 2.079909960607766, + "grad_norm": 0.2542410475178318, + "learning_rate": 1.7000417188151856e-05, + "loss": 0.2856, + "step": 1849 + }, + { + "epoch": 2.081035453010692, + "grad_norm": 0.24197984345301113, + "learning_rate": 1.6979557780559033e-05, + "loss": 0.2824, + "step": 1850 + }, + { + "epoch": 2.0821609454136185, + "grad_norm": 0.2557692899387776, + "learning_rate": 1.695869837296621e-05, + "loss": 0.2909, + "step": 1851 + }, + { + "epoch": 2.0832864378165445, + "grad_norm": 0.23793678801447735, + "learning_rate": 1.6937838965373384e-05, + "loss": 0.2689, + "step": 1852 + }, + { + "epoch": 2.084411930219471, + "grad_norm": 0.29107842473943085, + "learning_rate": 1.691697955778056e-05, + "loss": 0.284, + "step": 1853 + }, + { + "epoch": 2.0855374226223975, + "grad_norm": 0.24607914318213508, + "learning_rate": 1.6896120150187734e-05, + "loss": 0.2957, + "step": 1854 + }, + { + "epoch": 2.0866629150253235, + "grad_norm": 0.21651709890692455, + "learning_rate": 1.687526074259491e-05, + "loss": 0.2677, + "step": 1855 + }, + { + "epoch": 2.08778840742825, + "grad_norm": 0.22707602957596063, + "learning_rate": 1.6854401335002088e-05, + "loss": 0.2854, + "step": 1856 + }, + { + "epoch": 2.088913899831176, + "grad_norm": 0.24846772345755247, + "learning_rate": 1.6833541927409262e-05, + "loss": 0.2679, + "step": 1857 + }, + { + "epoch": 2.0900393922341025, + "grad_norm": 0.27573122817807033, + "learning_rate": 1.681268251981644e-05, + "loss": 0.3007, + "step": 1858 + }, + { + "epoch": 2.0911648846370285, + "grad_norm": 0.23927598344656173, + "learning_rate": 1.6791823112223616e-05, + "loss": 0.2754, + "step": 1859 + }, + { + "epoch": 2.092290377039955, + "grad_norm": 0.23518656387715997, + "learning_rate": 1.677096370463079e-05, + "loss": 0.2766, + "step": 1860 + }, + { + "epoch": 2.093415869442881, + "grad_norm": 0.24448942505615562, + "learning_rate": 1.6750104297037966e-05, + "loss": 0.2913, + "step": 1861 + }, + { + "epoch": 2.0945413618458075, + "grad_norm": 0.2336572648593039, + "learning_rate": 1.672924488944514e-05, + "loss": 0.2942, + "step": 1862 + }, + { + "epoch": 2.095666854248734, + "grad_norm": 0.22716116914003923, + "learning_rate": 1.6708385481852313e-05, + "loss": 0.2881, + "step": 1863 + }, + { + "epoch": 2.09679234665166, + "grad_norm": 0.2849566981299875, + "learning_rate": 1.668752607425949e-05, + "loss": 0.2805, + "step": 1864 + }, + { + "epoch": 2.0979178390545865, + "grad_norm": 0.21858945358126292, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.2807, + "step": 1865 + }, + { + "epoch": 2.0990433314575125, + "grad_norm": 0.23697889851760048, + "learning_rate": 1.664580725907384e-05, + "loss": 0.2791, + "step": 1866 + }, + { + "epoch": 2.100168823860439, + "grad_norm": 0.23126363606688877, + "learning_rate": 1.6624947851481018e-05, + "loss": 0.2878, + "step": 1867 + }, + { + "epoch": 2.101294316263365, + "grad_norm": 0.2651888938021785, + "learning_rate": 1.6604088443888195e-05, + "loss": 0.2859, + "step": 1868 + }, + { + "epoch": 2.1024198086662915, + "grad_norm": 0.23270730320241492, + "learning_rate": 1.658322903629537e-05, + "loss": 0.296, + "step": 1869 + }, + { + "epoch": 2.103545301069218, + "grad_norm": 0.3042844752542814, + "learning_rate": 1.6562369628702546e-05, + "loss": 0.3132, + "step": 1870 + }, + { + "epoch": 2.104670793472144, + "grad_norm": 0.24339368794150576, + "learning_rate": 1.6541510221109723e-05, + "loss": 0.2766, + "step": 1871 + }, + { + "epoch": 2.1057962858750705, + "grad_norm": 0.26285249575918274, + "learning_rate": 1.6520650813516896e-05, + "loss": 0.285, + "step": 1872 + }, + { + "epoch": 2.1069217782779965, + "grad_norm": 0.25652733532840694, + "learning_rate": 1.6499791405924073e-05, + "loss": 0.294, + "step": 1873 + }, + { + "epoch": 2.108047270680923, + "grad_norm": 0.2543828171392836, + "learning_rate": 1.6478931998331247e-05, + "loss": 0.2746, + "step": 1874 + }, + { + "epoch": 2.109172763083849, + "grad_norm": 0.2352973105601401, + "learning_rate": 1.6458072590738424e-05, + "loss": 0.2743, + "step": 1875 + }, + { + "epoch": 2.1102982554867755, + "grad_norm": 0.25036103571955426, + "learning_rate": 1.64372131831456e-05, + "loss": 0.2909, + "step": 1876 + }, + { + "epoch": 2.1114237478897016, + "grad_norm": 0.30189436640559725, + "learning_rate": 1.6416353775552774e-05, + "loss": 0.2969, + "step": 1877 + }, + { + "epoch": 2.112549240292628, + "grad_norm": 0.24769249438614963, + "learning_rate": 1.639549436795995e-05, + "loss": 0.282, + "step": 1878 + }, + { + "epoch": 2.1136747326955545, + "grad_norm": 0.25738252269107714, + "learning_rate": 1.6374634960367128e-05, + "loss": 0.2987, + "step": 1879 + }, + { + "epoch": 2.1148002250984805, + "grad_norm": 0.3340073534477609, + "learning_rate": 1.6353775552774302e-05, + "loss": 0.2962, + "step": 1880 + }, + { + "epoch": 2.115925717501407, + "grad_norm": 0.2641953651128952, + "learning_rate": 1.633291614518148e-05, + "loss": 0.2827, + "step": 1881 + }, + { + "epoch": 2.117051209904333, + "grad_norm": 0.24068009657515393, + "learning_rate": 1.6312056737588656e-05, + "loss": 0.2795, + "step": 1882 + }, + { + "epoch": 2.1181767023072595, + "grad_norm": 0.2507429416561855, + "learning_rate": 1.6291197329995826e-05, + "loss": 0.285, + "step": 1883 + }, + { + "epoch": 2.1193021947101855, + "grad_norm": 0.2533889963610055, + "learning_rate": 1.6270337922403003e-05, + "loss": 0.2941, + "step": 1884 + }, + { + "epoch": 2.120427687113112, + "grad_norm": 0.24688775417233455, + "learning_rate": 1.624947851481018e-05, + "loss": 0.3012, + "step": 1885 + }, + { + "epoch": 2.121553179516038, + "grad_norm": 0.24915605314972172, + "learning_rate": 1.6228619107217354e-05, + "loss": 0.2993, + "step": 1886 + }, + { + "epoch": 2.1226786719189645, + "grad_norm": 0.279903262549959, + "learning_rate": 1.620775969962453e-05, + "loss": 0.2834, + "step": 1887 + }, + { + "epoch": 2.123804164321891, + "grad_norm": 0.23182376171306077, + "learning_rate": 1.6186900292031708e-05, + "loss": 0.2916, + "step": 1888 + }, + { + "epoch": 2.124929656724817, + "grad_norm": 0.23000197495504152, + "learning_rate": 1.616604088443888e-05, + "loss": 0.2747, + "step": 1889 + }, + { + "epoch": 2.1260551491277435, + "grad_norm": 0.23823286526237422, + "learning_rate": 1.6145181476846058e-05, + "loss": 0.2781, + "step": 1890 + }, + { + "epoch": 2.1271806415306695, + "grad_norm": 0.21627846743690535, + "learning_rate": 1.6124322069253235e-05, + "loss": 0.2706, + "step": 1891 + }, + { + "epoch": 2.128306133933596, + "grad_norm": 0.23892946673013887, + "learning_rate": 1.610346266166041e-05, + "loss": 0.2914, + "step": 1892 + }, + { + "epoch": 2.129431626336522, + "grad_norm": 0.2656649096475775, + "learning_rate": 1.6082603254067586e-05, + "loss": 0.2866, + "step": 1893 + }, + { + "epoch": 2.1305571187394485, + "grad_norm": 0.2227237824020542, + "learning_rate": 1.6061743846474763e-05, + "loss": 0.2625, + "step": 1894 + }, + { + "epoch": 2.1316826111423746, + "grad_norm": 0.23235111483198348, + "learning_rate": 1.6040884438881936e-05, + "loss": 0.2859, + "step": 1895 + }, + { + "epoch": 2.132808103545301, + "grad_norm": 0.25920239106869064, + "learning_rate": 1.6020025031289113e-05, + "loss": 0.2959, + "step": 1896 + }, + { + "epoch": 2.1339335959482275, + "grad_norm": 0.23719185530213213, + "learning_rate": 1.5999165623696287e-05, + "loss": 0.2897, + "step": 1897 + }, + { + "epoch": 2.1350590883511535, + "grad_norm": 0.22876937310915393, + "learning_rate": 1.5978306216103464e-05, + "loss": 0.2788, + "step": 1898 + }, + { + "epoch": 2.13618458075408, + "grad_norm": 0.26616238576961354, + "learning_rate": 1.595744680851064e-05, + "loss": 0.2889, + "step": 1899 + }, + { + "epoch": 2.137310073157006, + "grad_norm": 0.2166404539813475, + "learning_rate": 1.5936587400917814e-05, + "loss": 0.282, + "step": 1900 + }, + { + "epoch": 2.1384355655599325, + "grad_norm": 0.23700101129905038, + "learning_rate": 1.591572799332499e-05, + "loss": 0.2968, + "step": 1901 + }, + { + "epoch": 2.1395610579628586, + "grad_norm": 0.2285745331225241, + "learning_rate": 1.589486858573217e-05, + "loss": 0.2841, + "step": 1902 + }, + { + "epoch": 2.140686550365785, + "grad_norm": 0.23783838496188303, + "learning_rate": 1.5874009178139342e-05, + "loss": 0.2909, + "step": 1903 + }, + { + "epoch": 2.1418120427687115, + "grad_norm": 0.23082103720915573, + "learning_rate": 1.5853149770546516e-05, + "loss": 0.2824, + "step": 1904 + }, + { + "epoch": 2.1429375351716375, + "grad_norm": 0.25094828821607146, + "learning_rate": 1.5832290362953693e-05, + "loss": 0.285, + "step": 1905 + }, + { + "epoch": 2.144063027574564, + "grad_norm": 0.22431109979899386, + "learning_rate": 1.5811430955360866e-05, + "loss": 0.2737, + "step": 1906 + }, + { + "epoch": 2.14518851997749, + "grad_norm": 0.22492379294000237, + "learning_rate": 1.5790571547768043e-05, + "loss": 0.2726, + "step": 1907 + }, + { + "epoch": 2.1463140123804165, + "grad_norm": 0.2314053442523269, + "learning_rate": 1.576971214017522e-05, + "loss": 0.2754, + "step": 1908 + }, + { + "epoch": 2.1474395047833426, + "grad_norm": 0.24673230264182605, + "learning_rate": 1.5748852732582394e-05, + "loss": 0.2921, + "step": 1909 + }, + { + "epoch": 2.148564997186269, + "grad_norm": 0.23606707383444092, + "learning_rate": 1.572799332498957e-05, + "loss": 0.2804, + "step": 1910 + }, + { + "epoch": 2.1496904895891955, + "grad_norm": 0.235724127482375, + "learning_rate": 1.5707133917396748e-05, + "loss": 0.2861, + "step": 1911 + }, + { + "epoch": 2.1508159819921215, + "grad_norm": 0.24483607505245927, + "learning_rate": 1.568627450980392e-05, + "loss": 0.2878, + "step": 1912 + }, + { + "epoch": 2.151941474395048, + "grad_norm": 0.2552535556772291, + "learning_rate": 1.56654151022111e-05, + "loss": 0.2857, + "step": 1913 + }, + { + "epoch": 2.153066966797974, + "grad_norm": 0.22983484882907804, + "learning_rate": 1.5644555694618275e-05, + "loss": 0.2872, + "step": 1914 + }, + { + "epoch": 2.1541924592009005, + "grad_norm": 0.25772716013553465, + "learning_rate": 1.562369628702545e-05, + "loss": 0.2959, + "step": 1915 + }, + { + "epoch": 2.1553179516038266, + "grad_norm": 0.29415752414459184, + "learning_rate": 1.5602836879432626e-05, + "loss": 0.2909, + "step": 1916 + }, + { + "epoch": 2.156443444006753, + "grad_norm": 0.2564449243204837, + "learning_rate": 1.55819774718398e-05, + "loss": 0.3047, + "step": 1917 + }, + { + "epoch": 2.157568936409679, + "grad_norm": 0.2357261136445965, + "learning_rate": 1.5561118064246976e-05, + "loss": 0.2891, + "step": 1918 + }, + { + "epoch": 2.1586944288126055, + "grad_norm": 0.23236268840383198, + "learning_rate": 1.5540258656654153e-05, + "loss": 0.2936, + "step": 1919 + }, + { + "epoch": 2.159819921215532, + "grad_norm": 0.2526992651991741, + "learning_rate": 1.5519399249061327e-05, + "loss": 0.279, + "step": 1920 + }, + { + "epoch": 2.160945413618458, + "grad_norm": 0.26601612523224494, + "learning_rate": 1.5498539841468504e-05, + "loss": 0.2805, + "step": 1921 + }, + { + "epoch": 2.1620709060213845, + "grad_norm": 0.23000298824921414, + "learning_rate": 1.547768043387568e-05, + "loss": 0.2837, + "step": 1922 + }, + { + "epoch": 2.1631963984243106, + "grad_norm": 0.24154706706349996, + "learning_rate": 1.5456821026282855e-05, + "loss": 0.2846, + "step": 1923 + }, + { + "epoch": 2.164321890827237, + "grad_norm": 0.2360397761989054, + "learning_rate": 1.543596161869003e-05, + "loss": 0.283, + "step": 1924 + }, + { + "epoch": 2.165447383230163, + "grad_norm": 0.2218092507800359, + "learning_rate": 1.5415102211097205e-05, + "loss": 0.2771, + "step": 1925 + }, + { + "epoch": 2.1665728756330895, + "grad_norm": 0.2386052630849636, + "learning_rate": 1.539424280350438e-05, + "loss": 0.2924, + "step": 1926 + }, + { + "epoch": 2.1676983680360156, + "grad_norm": 0.24947895655376598, + "learning_rate": 1.5373383395911556e-05, + "loss": 0.2925, + "step": 1927 + }, + { + "epoch": 2.168823860438942, + "grad_norm": 0.2800300772115473, + "learning_rate": 1.5352523988318733e-05, + "loss": 0.2931, + "step": 1928 + }, + { + "epoch": 2.1699493528418685, + "grad_norm": 0.22636221415787847, + "learning_rate": 1.5331664580725906e-05, + "loss": 0.2875, + "step": 1929 + }, + { + "epoch": 2.1710748452447945, + "grad_norm": 0.24386646248262941, + "learning_rate": 1.5310805173133083e-05, + "loss": 0.2905, + "step": 1930 + }, + { + "epoch": 2.172200337647721, + "grad_norm": 0.22365055654311475, + "learning_rate": 1.528994576554026e-05, + "loss": 0.2802, + "step": 1931 + }, + { + "epoch": 2.173325830050647, + "grad_norm": 0.25602509803802304, + "learning_rate": 1.5269086357947434e-05, + "loss": 0.2724, + "step": 1932 + }, + { + "epoch": 2.1744513224535735, + "grad_norm": 0.20551321646228457, + "learning_rate": 1.5248226950354611e-05, + "loss": 0.2721, + "step": 1933 + }, + { + "epoch": 2.1755768148564996, + "grad_norm": 0.22807897179549413, + "learning_rate": 1.5227367542761786e-05, + "loss": 0.2892, + "step": 1934 + }, + { + "epoch": 2.176702307259426, + "grad_norm": 0.25852860308404757, + "learning_rate": 1.5206508135168961e-05, + "loss": 0.3027, + "step": 1935 + }, + { + "epoch": 2.177827799662352, + "grad_norm": 0.2279934128116949, + "learning_rate": 1.5185648727576138e-05, + "loss": 0.2801, + "step": 1936 + }, + { + "epoch": 2.1789532920652785, + "grad_norm": 0.2351411289469844, + "learning_rate": 1.5164789319983314e-05, + "loss": 0.28, + "step": 1937 + }, + { + "epoch": 2.180078784468205, + "grad_norm": 0.21552640522315936, + "learning_rate": 1.5143929912390489e-05, + "loss": 0.2947, + "step": 1938 + }, + { + "epoch": 2.181204276871131, + "grad_norm": 0.24626801791538866, + "learning_rate": 1.5123070504797664e-05, + "loss": 0.2783, + "step": 1939 + }, + { + "epoch": 2.1823297692740575, + "grad_norm": 0.23012350879449098, + "learning_rate": 1.5102211097204841e-05, + "loss": 0.2774, + "step": 1940 + }, + { + "epoch": 2.1834552616769836, + "grad_norm": 0.23081070838683507, + "learning_rate": 1.5081351689612017e-05, + "loss": 0.2799, + "step": 1941 + }, + { + "epoch": 2.18458075407991, + "grad_norm": 0.2490023091368887, + "learning_rate": 1.5060492282019192e-05, + "loss": 0.2916, + "step": 1942 + }, + { + "epoch": 2.185706246482836, + "grad_norm": 0.23226830279863933, + "learning_rate": 1.5039632874426369e-05, + "loss": 0.274, + "step": 1943 + }, + { + "epoch": 2.1868317388857625, + "grad_norm": 0.23814945426894574, + "learning_rate": 1.5018773466833544e-05, + "loss": 0.2987, + "step": 1944 + }, + { + "epoch": 2.1879572312886886, + "grad_norm": 0.22888208424137457, + "learning_rate": 1.4997914059240718e-05, + "loss": 0.2809, + "step": 1945 + }, + { + "epoch": 2.189082723691615, + "grad_norm": 0.22117598909045155, + "learning_rate": 1.4977054651647893e-05, + "loss": 0.2869, + "step": 1946 + }, + { + "epoch": 2.1902082160945415, + "grad_norm": 0.2635412507153887, + "learning_rate": 1.4956195244055068e-05, + "loss": 0.3102, + "step": 1947 + }, + { + "epoch": 2.1913337084974676, + "grad_norm": 0.21434697577713013, + "learning_rate": 1.4935335836462244e-05, + "loss": 0.2748, + "step": 1948 + }, + { + "epoch": 2.192459200900394, + "grad_norm": 0.23605470994586675, + "learning_rate": 1.491447642886942e-05, + "loss": 0.2859, + "step": 1949 + }, + { + "epoch": 2.19358469330332, + "grad_norm": 0.2405759189766832, + "learning_rate": 1.4893617021276596e-05, + "loss": 0.2942, + "step": 1950 + }, + { + "epoch": 2.1947101857062465, + "grad_norm": 0.22131821842232993, + "learning_rate": 1.4872757613683771e-05, + "loss": 0.2884, + "step": 1951 + }, + { + "epoch": 2.1958356781091726, + "grad_norm": 0.23216071326486187, + "learning_rate": 1.4851898206090946e-05, + "loss": 0.2741, + "step": 1952 + }, + { + "epoch": 2.196961170512099, + "grad_norm": 0.2261133526570407, + "learning_rate": 1.4831038798498123e-05, + "loss": 0.2963, + "step": 1953 + }, + { + "epoch": 2.1980866629150255, + "grad_norm": 0.2302291451269135, + "learning_rate": 1.4810179390905299e-05, + "loss": 0.2828, + "step": 1954 + }, + { + "epoch": 2.1992121553179516, + "grad_norm": 0.2535578449757302, + "learning_rate": 1.4789319983312474e-05, + "loss": 0.3054, + "step": 1955 + }, + { + "epoch": 2.200337647720878, + "grad_norm": 0.2353316415549731, + "learning_rate": 1.4768460575719651e-05, + "loss": 0.2851, + "step": 1956 + }, + { + "epoch": 2.201463140123804, + "grad_norm": 0.22300891391695027, + "learning_rate": 1.4747601168126826e-05, + "loss": 0.2685, + "step": 1957 + }, + { + "epoch": 2.2025886325267305, + "grad_norm": 0.24986486980542502, + "learning_rate": 1.4726741760534002e-05, + "loss": 0.2956, + "step": 1958 + }, + { + "epoch": 2.2037141249296566, + "grad_norm": 0.2180771271590878, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.2922, + "step": 1959 + }, + { + "epoch": 2.204839617332583, + "grad_norm": 0.22680565869396152, + "learning_rate": 1.4685022945348354e-05, + "loss": 0.2909, + "step": 1960 + }, + { + "epoch": 2.205965109735509, + "grad_norm": 0.23513680764714112, + "learning_rate": 1.4664163537755529e-05, + "loss": 0.2786, + "step": 1961 + }, + { + "epoch": 2.2070906021384356, + "grad_norm": 0.24973876085692792, + "learning_rate": 1.4643304130162704e-05, + "loss": 0.2853, + "step": 1962 + }, + { + "epoch": 2.208216094541362, + "grad_norm": 0.22544610054019418, + "learning_rate": 1.4622444722569881e-05, + "loss": 0.2831, + "step": 1963 + }, + { + "epoch": 2.209341586944288, + "grad_norm": 0.22330625417293162, + "learning_rate": 1.4601585314977057e-05, + "loss": 0.2867, + "step": 1964 + }, + { + "epoch": 2.2104670793472145, + "grad_norm": 0.22525152317015681, + "learning_rate": 1.4580725907384232e-05, + "loss": 0.29, + "step": 1965 + }, + { + "epoch": 2.2115925717501406, + "grad_norm": 0.22249714982600474, + "learning_rate": 1.4559866499791406e-05, + "loss": 0.299, + "step": 1966 + }, + { + "epoch": 2.212718064153067, + "grad_norm": 0.24092275848280195, + "learning_rate": 1.4539007092198581e-05, + "loss": 0.2896, + "step": 1967 + }, + { + "epoch": 2.213843556555993, + "grad_norm": 0.22252299992217103, + "learning_rate": 1.4518147684605756e-05, + "loss": 0.258, + "step": 1968 + }, + { + "epoch": 2.2149690489589196, + "grad_norm": 0.23636046190697863, + "learning_rate": 1.4497288277012933e-05, + "loss": 0.2908, + "step": 1969 + }, + { + "epoch": 2.216094541361846, + "grad_norm": 0.2596597997389332, + "learning_rate": 1.4476428869420108e-05, + "loss": 0.27, + "step": 1970 + }, + { + "epoch": 2.217220033764772, + "grad_norm": 0.2531683961826357, + "learning_rate": 1.4455569461827284e-05, + "loss": 0.2832, + "step": 1971 + }, + { + "epoch": 2.2183455261676985, + "grad_norm": 0.2593605264440698, + "learning_rate": 1.443471005423446e-05, + "loss": 0.2841, + "step": 1972 + }, + { + "epoch": 2.2194710185706246, + "grad_norm": 0.26699737148304314, + "learning_rate": 1.4413850646641636e-05, + "loss": 0.2799, + "step": 1973 + }, + { + "epoch": 2.220596510973551, + "grad_norm": 0.2294951675397686, + "learning_rate": 1.4392991239048811e-05, + "loss": 0.2909, + "step": 1974 + }, + { + "epoch": 2.221722003376477, + "grad_norm": 0.2245538365625567, + "learning_rate": 1.4372131831455987e-05, + "loss": 0.2799, + "step": 1975 + }, + { + "epoch": 2.2228474957794035, + "grad_norm": 0.2646561800422373, + "learning_rate": 1.4351272423863164e-05, + "loss": 0.2765, + "step": 1976 + }, + { + "epoch": 2.2239729881823296, + "grad_norm": 0.21416819505340884, + "learning_rate": 1.4330413016270339e-05, + "loss": 0.2801, + "step": 1977 + }, + { + "epoch": 2.225098480585256, + "grad_norm": 0.21948393418095735, + "learning_rate": 1.4309553608677514e-05, + "loss": 0.2866, + "step": 1978 + }, + { + "epoch": 2.2262239729881825, + "grad_norm": 0.22822972297920066, + "learning_rate": 1.428869420108469e-05, + "loss": 0.287, + "step": 1979 + }, + { + "epoch": 2.2273494653911086, + "grad_norm": 0.2160982046115744, + "learning_rate": 1.4267834793491866e-05, + "loss": 0.2925, + "step": 1980 + }, + { + "epoch": 2.228474957794035, + "grad_norm": 0.23144554832269953, + "learning_rate": 1.4246975385899042e-05, + "loss": 0.3105, + "step": 1981 + }, + { + "epoch": 2.229600450196961, + "grad_norm": 0.2419669093281673, + "learning_rate": 1.4226115978306217e-05, + "loss": 0.2795, + "step": 1982 + }, + { + "epoch": 2.2307259425998875, + "grad_norm": 0.2333075873767841, + "learning_rate": 1.4205256570713394e-05, + "loss": 0.2879, + "step": 1983 + }, + { + "epoch": 2.2318514350028136, + "grad_norm": 0.2381375140609149, + "learning_rate": 1.418439716312057e-05, + "loss": 0.2969, + "step": 1984 + }, + { + "epoch": 2.23297692740574, + "grad_norm": 0.22837890307254083, + "learning_rate": 1.4163537755527745e-05, + "loss": 0.283, + "step": 1985 + }, + { + "epoch": 2.234102419808666, + "grad_norm": 0.24448338514717682, + "learning_rate": 1.414267834793492e-05, + "loss": 0.3013, + "step": 1986 + }, + { + "epoch": 2.2352279122115926, + "grad_norm": 0.23420776111487138, + "learning_rate": 1.4121818940342093e-05, + "loss": 0.2878, + "step": 1987 + }, + { + "epoch": 2.236353404614519, + "grad_norm": 0.23579942650757943, + "learning_rate": 1.4100959532749269e-05, + "loss": 0.288, + "step": 1988 + }, + { + "epoch": 2.237478897017445, + "grad_norm": 0.23043040793992384, + "learning_rate": 1.4080100125156446e-05, + "loss": 0.305, + "step": 1989 + }, + { + "epoch": 2.2386043894203715, + "grad_norm": 0.24659768389490117, + "learning_rate": 1.4059240717563621e-05, + "loss": 0.2778, + "step": 1990 + }, + { + "epoch": 2.2397298818232976, + "grad_norm": 0.2525101439952681, + "learning_rate": 1.4038381309970796e-05, + "loss": 0.2797, + "step": 1991 + }, + { + "epoch": 2.240855374226224, + "grad_norm": 0.2180718742006463, + "learning_rate": 1.4017521902377973e-05, + "loss": 0.2824, + "step": 1992 + }, + { + "epoch": 2.24198086662915, + "grad_norm": 0.23039632064460322, + "learning_rate": 1.3996662494785149e-05, + "loss": 0.2732, + "step": 1993 + }, + { + "epoch": 2.2431063590320766, + "grad_norm": 0.24390939737808814, + "learning_rate": 1.3975803087192324e-05, + "loss": 0.2942, + "step": 1994 + }, + { + "epoch": 2.2442318514350026, + "grad_norm": 0.22495659632157705, + "learning_rate": 1.39549436795995e-05, + "loss": 0.2786, + "step": 1995 + }, + { + "epoch": 2.245357343837929, + "grad_norm": 0.2220671184762533, + "learning_rate": 1.3934084272006676e-05, + "loss": 0.2902, + "step": 1996 + }, + { + "epoch": 2.2464828362408555, + "grad_norm": 0.2063740174423525, + "learning_rate": 1.3913224864413851e-05, + "loss": 0.2794, + "step": 1997 + }, + { + "epoch": 2.2476083286437816, + "grad_norm": 0.22864397206918258, + "learning_rate": 1.3892365456821027e-05, + "loss": 0.2899, + "step": 1998 + }, + { + "epoch": 2.248733821046708, + "grad_norm": 0.22641553859678237, + "learning_rate": 1.3871506049228202e-05, + "loss": 0.2913, + "step": 1999 + }, + { + "epoch": 2.249859313449634, + "grad_norm": 0.23273883384894037, + "learning_rate": 1.3850646641635379e-05, + "loss": 0.2896, + "step": 2000 + }, + { + "epoch": 2.2509848058525606, + "grad_norm": 0.25883856114515486, + "learning_rate": 1.3829787234042554e-05, + "loss": 0.2853, + "step": 2001 + }, + { + "epoch": 2.2521102982554866, + "grad_norm": 0.24543011458191846, + "learning_rate": 1.380892782644973e-05, + "loss": 0.2921, + "step": 2002 + }, + { + "epoch": 2.253235790658413, + "grad_norm": 0.2573780345794268, + "learning_rate": 1.3788068418856907e-05, + "loss": 0.2987, + "step": 2003 + }, + { + "epoch": 2.254361283061339, + "grad_norm": 0.2344713538028616, + "learning_rate": 1.3767209011264082e-05, + "loss": 0.2788, + "step": 2004 + }, + { + "epoch": 2.2554867754642656, + "grad_norm": 0.2655728653325266, + "learning_rate": 1.3746349603671257e-05, + "loss": 0.2891, + "step": 2005 + }, + { + "epoch": 2.256612267867192, + "grad_norm": 0.24696226562693468, + "learning_rate": 1.3725490196078432e-05, + "loss": 0.2832, + "step": 2006 + }, + { + "epoch": 2.257737760270118, + "grad_norm": 0.27074526345994904, + "learning_rate": 1.370463078848561e-05, + "loss": 0.2991, + "step": 2007 + }, + { + "epoch": 2.2588632526730446, + "grad_norm": 0.21307871627731073, + "learning_rate": 1.3683771380892781e-05, + "loss": 0.2802, + "step": 2008 + }, + { + "epoch": 2.2599887450759706, + "grad_norm": 0.6066306419781285, + "learning_rate": 1.3662911973299958e-05, + "loss": 0.3119, + "step": 2009 + }, + { + "epoch": 2.261114237478897, + "grad_norm": 0.2354767198892578, + "learning_rate": 1.3642052565707134e-05, + "loss": 0.2931, + "step": 2010 + }, + { + "epoch": 2.2622397298818235, + "grad_norm": 0.3113759768538715, + "learning_rate": 1.3621193158114309e-05, + "loss": 0.3097, + "step": 2011 + }, + { + "epoch": 2.2633652222847496, + "grad_norm": 0.23868520954039024, + "learning_rate": 1.3600333750521486e-05, + "loss": 0.295, + "step": 2012 + }, + { + "epoch": 2.264490714687676, + "grad_norm": 0.25599462430091524, + "learning_rate": 1.3579474342928661e-05, + "loss": 0.2934, + "step": 2013 + }, + { + "epoch": 2.265616207090602, + "grad_norm": 0.2378852312729475, + "learning_rate": 1.3558614935335836e-05, + "loss": 0.2905, + "step": 2014 + }, + { + "epoch": 2.2667416994935286, + "grad_norm": 0.23537606322412846, + "learning_rate": 1.3537755527743012e-05, + "loss": 0.2933, + "step": 2015 + }, + { + "epoch": 2.2678671918964546, + "grad_norm": 0.24999030117110338, + "learning_rate": 1.3516896120150189e-05, + "loss": 0.2907, + "step": 2016 + }, + { + "epoch": 2.268992684299381, + "grad_norm": 0.2189977535068501, + "learning_rate": 1.3496036712557364e-05, + "loss": 0.2768, + "step": 2017 + }, + { + "epoch": 2.270118176702307, + "grad_norm": 0.21605143200933585, + "learning_rate": 1.347517730496454e-05, + "loss": 0.2825, + "step": 2018 + }, + { + "epoch": 2.2712436691052336, + "grad_norm": 0.2190715820513759, + "learning_rate": 1.3454317897371716e-05, + "loss": 0.2985, + "step": 2019 + }, + { + "epoch": 2.27236916150816, + "grad_norm": 0.2419287070815025, + "learning_rate": 1.3433458489778892e-05, + "loss": 0.2812, + "step": 2020 + }, + { + "epoch": 2.273494653911086, + "grad_norm": 0.23856366222450073, + "learning_rate": 1.3412599082186067e-05, + "loss": 0.2754, + "step": 2021 + }, + { + "epoch": 2.2746201463140125, + "grad_norm": 0.23128552323354076, + "learning_rate": 1.3391739674593242e-05, + "loss": 0.2967, + "step": 2022 + }, + { + "epoch": 2.2757456387169386, + "grad_norm": 0.23110186859812204, + "learning_rate": 1.3370880267000419e-05, + "loss": 0.2905, + "step": 2023 + }, + { + "epoch": 2.276871131119865, + "grad_norm": 0.23791496512553711, + "learning_rate": 1.3350020859407594e-05, + "loss": 0.2956, + "step": 2024 + }, + { + "epoch": 2.277996623522791, + "grad_norm": 0.270895607021542, + "learning_rate": 1.332916145181477e-05, + "loss": 0.2979, + "step": 2025 + }, + { + "epoch": 2.2791221159257176, + "grad_norm": 0.2622847660820458, + "learning_rate": 1.3308302044221945e-05, + "loss": 0.2805, + "step": 2026 + }, + { + "epoch": 2.2802476083286436, + "grad_norm": 0.2451853343226485, + "learning_rate": 1.3287442636629122e-05, + "loss": 0.2849, + "step": 2027 + }, + { + "epoch": 2.28137310073157, + "grad_norm": 0.2181534341062286, + "learning_rate": 1.3266583229036297e-05, + "loss": 0.2843, + "step": 2028 + }, + { + "epoch": 2.2824985931344965, + "grad_norm": 0.2350791322319216, + "learning_rate": 1.3245723821443471e-05, + "loss": 0.2804, + "step": 2029 + }, + { + "epoch": 2.2836240855374226, + "grad_norm": 0.24384265303411898, + "learning_rate": 1.3224864413850646e-05, + "loss": 0.2844, + "step": 2030 + }, + { + "epoch": 2.284749577940349, + "grad_norm": 0.21471389480099612, + "learning_rate": 1.3204005006257821e-05, + "loss": 0.2766, + "step": 2031 + }, + { + "epoch": 2.285875070343275, + "grad_norm": 0.2558686689697758, + "learning_rate": 1.3183145598664998e-05, + "loss": 0.3006, + "step": 2032 + }, + { + "epoch": 2.2870005627462016, + "grad_norm": 0.24596519958308774, + "learning_rate": 1.3162286191072174e-05, + "loss": 0.2791, + "step": 2033 + }, + { + "epoch": 2.2881260551491276, + "grad_norm": 0.22178993068098377, + "learning_rate": 1.3141426783479349e-05, + "loss": 0.3006, + "step": 2034 + }, + { + "epoch": 2.289251547552054, + "grad_norm": 0.21211849808178426, + "learning_rate": 1.3120567375886524e-05, + "loss": 0.2879, + "step": 2035 + }, + { + "epoch": 2.29037703995498, + "grad_norm": 0.26189329024450775, + "learning_rate": 1.3099707968293701e-05, + "loss": 0.2919, + "step": 2036 + }, + { + "epoch": 2.2915025323579066, + "grad_norm": 0.24020801441451947, + "learning_rate": 1.3078848560700877e-05, + "loss": 0.2936, + "step": 2037 + }, + { + "epoch": 2.292628024760833, + "grad_norm": 0.2444872387207359, + "learning_rate": 1.3057989153108052e-05, + "loss": 0.3098, + "step": 2038 + }, + { + "epoch": 2.293753517163759, + "grad_norm": 0.21895214125433066, + "learning_rate": 1.3037129745515229e-05, + "loss": 0.2743, + "step": 2039 + }, + { + "epoch": 2.2948790095666856, + "grad_norm": 0.2496911198777528, + "learning_rate": 1.3016270337922404e-05, + "loss": 0.2918, + "step": 2040 + }, + { + "epoch": 2.2960045019696116, + "grad_norm": 2.8468163932596022, + "learning_rate": 1.299541093032958e-05, + "loss": 0.2845, + "step": 2041 + }, + { + "epoch": 2.297129994372538, + "grad_norm": 0.2414636263089686, + "learning_rate": 1.2974551522736755e-05, + "loss": 0.2854, + "step": 2042 + }, + { + "epoch": 2.298255486775464, + "grad_norm": 0.22863532322963662, + "learning_rate": 1.2953692115143932e-05, + "loss": 0.2769, + "step": 2043 + }, + { + "epoch": 2.2993809791783906, + "grad_norm": 0.20797566641270696, + "learning_rate": 1.2932832707551107e-05, + "loss": 0.2738, + "step": 2044 + }, + { + "epoch": 2.3005064715813166, + "grad_norm": 0.2813082678198765, + "learning_rate": 1.2911973299958282e-05, + "loss": 0.2919, + "step": 2045 + }, + { + "epoch": 2.301631963984243, + "grad_norm": 0.21880645593009593, + "learning_rate": 1.2891113892365458e-05, + "loss": 0.2845, + "step": 2046 + }, + { + "epoch": 2.3027574563871696, + "grad_norm": 0.21662277253245404, + "learning_rate": 1.2870254484772635e-05, + "loss": 0.2999, + "step": 2047 + }, + { + "epoch": 2.3038829487900956, + "grad_norm": 0.23410910980013766, + "learning_rate": 1.284939507717981e-05, + "loss": 0.2803, + "step": 2048 + }, + { + "epoch": 2.305008441193022, + "grad_norm": 0.24807871237848997, + "learning_rate": 1.2828535669586985e-05, + "loss": 0.2887, + "step": 2049 + }, + { + "epoch": 2.306133933595948, + "grad_norm": 0.23958912163692958, + "learning_rate": 1.2807676261994159e-05, + "loss": 0.2806, + "step": 2050 + }, + { + "epoch": 2.3072594259988746, + "grad_norm": 0.24170572287325667, + "learning_rate": 1.2786816854401334e-05, + "loss": 0.2911, + "step": 2051 + }, + { + "epoch": 2.3083849184018006, + "grad_norm": 0.2071987326770734, + "learning_rate": 1.2765957446808511e-05, + "loss": 0.281, + "step": 2052 + }, + { + "epoch": 2.309510410804727, + "grad_norm": 0.2685294387603238, + "learning_rate": 1.2745098039215686e-05, + "loss": 0.2988, + "step": 2053 + }, + { + "epoch": 2.310635903207653, + "grad_norm": 0.24356419885452857, + "learning_rate": 1.2724238631622862e-05, + "loss": 0.2918, + "step": 2054 + }, + { + "epoch": 2.3117613956105796, + "grad_norm": 0.22854669119255341, + "learning_rate": 1.2703379224030037e-05, + "loss": 0.2906, + "step": 2055 + }, + { + "epoch": 2.312886888013506, + "grad_norm": 0.24689465925397477, + "learning_rate": 1.2682519816437214e-05, + "loss": 0.2935, + "step": 2056 + }, + { + "epoch": 2.314012380416432, + "grad_norm": 0.21811055770500665, + "learning_rate": 1.2661660408844389e-05, + "loss": 0.3016, + "step": 2057 + }, + { + "epoch": 2.3151378728193586, + "grad_norm": 0.2493408748518838, + "learning_rate": 1.2640801001251564e-05, + "loss": 0.2834, + "step": 2058 + }, + { + "epoch": 2.3162633652222846, + "grad_norm": 0.25721873798899103, + "learning_rate": 1.2619941593658741e-05, + "loss": 0.2983, + "step": 2059 + }, + { + "epoch": 2.317388857625211, + "grad_norm": 0.22179194109950803, + "learning_rate": 1.2599082186065917e-05, + "loss": 0.2753, + "step": 2060 + }, + { + "epoch": 2.3185143500281375, + "grad_norm": 0.266595262773116, + "learning_rate": 1.2578222778473092e-05, + "loss": 0.2839, + "step": 2061 + }, + { + "epoch": 2.3196398424310636, + "grad_norm": 0.24206558428702046, + "learning_rate": 1.2557363370880267e-05, + "loss": 0.2853, + "step": 2062 + }, + { + "epoch": 2.32076533483399, + "grad_norm": 0.2454398984492763, + "learning_rate": 1.2536503963287444e-05, + "loss": 0.3028, + "step": 2063 + }, + { + "epoch": 2.321890827236916, + "grad_norm": 0.2321058588488482, + "learning_rate": 1.251564455569462e-05, + "loss": 0.269, + "step": 2064 + }, + { + "epoch": 2.3230163196398426, + "grad_norm": 0.27267795334721745, + "learning_rate": 1.2494785148101793e-05, + "loss": 0.2792, + "step": 2065 + }, + { + "epoch": 2.3241418120427686, + "grad_norm": 0.23234175584418776, + "learning_rate": 1.247392574050897e-05, + "loss": 0.28, + "step": 2066 + }, + { + "epoch": 2.325267304445695, + "grad_norm": 0.2063643654191112, + "learning_rate": 1.2453066332916145e-05, + "loss": 0.2742, + "step": 2067 + }, + { + "epoch": 2.326392796848621, + "grad_norm": 0.22497515405636748, + "learning_rate": 1.243220692532332e-05, + "loss": 0.2904, + "step": 2068 + }, + { + "epoch": 2.3275182892515476, + "grad_norm": 0.20800896572524227, + "learning_rate": 1.2411347517730498e-05, + "loss": 0.2974, + "step": 2069 + }, + { + "epoch": 2.328643781654474, + "grad_norm": 0.22460235366838985, + "learning_rate": 1.2390488110137673e-05, + "loss": 0.2777, + "step": 2070 + }, + { + "epoch": 2.3297692740574, + "grad_norm": 0.23776076812455357, + "learning_rate": 1.2369628702544848e-05, + "loss": 0.2829, + "step": 2071 + }, + { + "epoch": 2.3308947664603266, + "grad_norm": 0.2570845084981786, + "learning_rate": 1.2348769294952024e-05, + "loss": 0.2945, + "step": 2072 + }, + { + "epoch": 2.3320202588632526, + "grad_norm": 0.2385004836723248, + "learning_rate": 1.23279098873592e-05, + "loss": 0.2867, + "step": 2073 + }, + { + "epoch": 2.333145751266179, + "grad_norm": 0.24982697079123078, + "learning_rate": 1.2307050479766376e-05, + "loss": 0.2857, + "step": 2074 + }, + { + "epoch": 2.334271243669105, + "grad_norm": 0.24642888230370272, + "learning_rate": 1.2286191072173551e-05, + "loss": 0.3053, + "step": 2075 + }, + { + "epoch": 2.3353967360720316, + "grad_norm": 0.26130363264507717, + "learning_rate": 1.2265331664580726e-05, + "loss": 0.2916, + "step": 2076 + }, + { + "epoch": 2.3365222284749576, + "grad_norm": 0.2124033043327759, + "learning_rate": 1.2244472256987902e-05, + "loss": 0.2764, + "step": 2077 + }, + { + "epoch": 2.337647720877884, + "grad_norm": 0.2440455128961208, + "learning_rate": 1.2223612849395077e-05, + "loss": 0.3075, + "step": 2078 + }, + { + "epoch": 2.3387732132808106, + "grad_norm": 0.245304116279532, + "learning_rate": 1.2202753441802254e-05, + "loss": 0.2895, + "step": 2079 + }, + { + "epoch": 2.3398987056837366, + "grad_norm": 0.2372202784047367, + "learning_rate": 1.218189403420943e-05, + "loss": 0.2785, + "step": 2080 + }, + { + "epoch": 2.341024198086663, + "grad_norm": 0.23688709955054182, + "learning_rate": 1.2161034626616605e-05, + "loss": 0.2777, + "step": 2081 + }, + { + "epoch": 2.342149690489589, + "grad_norm": 0.2482625923726943, + "learning_rate": 1.214017521902378e-05, + "loss": 0.2833, + "step": 2082 + }, + { + "epoch": 2.3432751828925156, + "grad_norm": 0.22738968926541633, + "learning_rate": 1.2119315811430957e-05, + "loss": 0.2709, + "step": 2083 + }, + { + "epoch": 2.3444006752954416, + "grad_norm": 0.25147592555620085, + "learning_rate": 1.2098456403838132e-05, + "loss": 0.3008, + "step": 2084 + }, + { + "epoch": 2.345526167698368, + "grad_norm": 0.22363924741115862, + "learning_rate": 1.2077596996245307e-05, + "loss": 0.2872, + "step": 2085 + }, + { + "epoch": 2.346651660101294, + "grad_norm": 0.23011558044098404, + "learning_rate": 1.2056737588652483e-05, + "loss": 0.2901, + "step": 2086 + }, + { + "epoch": 2.3477771525042206, + "grad_norm": 0.232959308790496, + "learning_rate": 1.2035878181059658e-05, + "loss": 0.2859, + "step": 2087 + }, + { + "epoch": 2.348902644907147, + "grad_norm": 0.24124633231018813, + "learning_rate": 1.2015018773466833e-05, + "loss": 0.2946, + "step": 2088 + }, + { + "epoch": 2.350028137310073, + "grad_norm": 0.23315070454396, + "learning_rate": 1.199415936587401e-05, + "loss": 0.2719, + "step": 2089 + }, + { + "epoch": 2.3511536297129996, + "grad_norm": 0.21247783763819528, + "learning_rate": 1.1973299958281186e-05, + "loss": 0.28, + "step": 2090 + }, + { + "epoch": 2.3522791221159256, + "grad_norm": 0.23387492479149327, + "learning_rate": 1.195244055068836e-05, + "loss": 0.2792, + "step": 2091 + }, + { + "epoch": 2.353404614518852, + "grad_norm": 0.22205981665359048, + "learning_rate": 1.1931581143095536e-05, + "loss": 0.2869, + "step": 2092 + }, + { + "epoch": 2.354530106921778, + "grad_norm": 0.2407814917985092, + "learning_rate": 1.1910721735502713e-05, + "loss": 0.2837, + "step": 2093 + }, + { + "epoch": 2.3556555993247046, + "grad_norm": 0.22636696159410108, + "learning_rate": 1.1889862327909888e-05, + "loss": 0.2717, + "step": 2094 + }, + { + "epoch": 2.3567810917276306, + "grad_norm": 0.20010784358214667, + "learning_rate": 1.1869002920317064e-05, + "loss": 0.2638, + "step": 2095 + }, + { + "epoch": 2.357906584130557, + "grad_norm": 0.2302624047508185, + "learning_rate": 1.184814351272424e-05, + "loss": 0.2889, + "step": 2096 + }, + { + "epoch": 2.3590320765334836, + "grad_norm": 0.21841431009246395, + "learning_rate": 1.1827284105131414e-05, + "loss": 0.2836, + "step": 2097 + }, + { + "epoch": 2.3601575689364096, + "grad_norm": 0.21162560341411857, + "learning_rate": 1.180642469753859e-05, + "loss": 0.2744, + "step": 2098 + }, + { + "epoch": 2.361283061339336, + "grad_norm": 0.23437617625703946, + "learning_rate": 1.1785565289945767e-05, + "loss": 0.2897, + "step": 2099 + }, + { + "epoch": 2.362408553742262, + "grad_norm": 0.2443861444498022, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.2773, + "step": 2100 + }, + { + "epoch": 2.3635340461451886, + "grad_norm": 0.20195447071682132, + "learning_rate": 1.1743846474760117e-05, + "loss": 0.2852, + "step": 2101 + }, + { + "epoch": 2.3646595385481146, + "grad_norm": 0.22050201926310495, + "learning_rate": 1.1722987067167292e-05, + "loss": 0.2811, + "step": 2102 + }, + { + "epoch": 2.365785030951041, + "grad_norm": 0.24700086215612232, + "learning_rate": 1.170212765957447e-05, + "loss": 0.2944, + "step": 2103 + }, + { + "epoch": 2.366910523353967, + "grad_norm": 0.21968358349344858, + "learning_rate": 1.1681268251981645e-05, + "loss": 0.2876, + "step": 2104 + }, + { + "epoch": 2.3680360157568936, + "grad_norm": 0.22004217949350546, + "learning_rate": 1.166040884438882e-05, + "loss": 0.298, + "step": 2105 + }, + { + "epoch": 2.36916150815982, + "grad_norm": 0.25166613532562226, + "learning_rate": 1.1639549436795997e-05, + "loss": 0.3039, + "step": 2106 + }, + { + "epoch": 2.370287000562746, + "grad_norm": 0.2470279532584483, + "learning_rate": 1.161869002920317e-05, + "loss": 0.2905, + "step": 2107 + }, + { + "epoch": 2.3714124929656726, + "grad_norm": 0.22645956564251657, + "learning_rate": 1.1597830621610346e-05, + "loss": 0.2857, + "step": 2108 + }, + { + "epoch": 2.3725379853685986, + "grad_norm": 0.236072091244946, + "learning_rate": 1.1576971214017523e-05, + "loss": 0.2876, + "step": 2109 + }, + { + "epoch": 2.373663477771525, + "grad_norm": 0.22344125624637598, + "learning_rate": 1.1556111806424698e-05, + "loss": 0.2871, + "step": 2110 + }, + { + "epoch": 2.3747889701744516, + "grad_norm": 0.2740686740181796, + "learning_rate": 1.1535252398831873e-05, + "loss": 0.3108, + "step": 2111 + }, + { + "epoch": 2.3759144625773776, + "grad_norm": 0.2633526483901452, + "learning_rate": 1.1514392991239049e-05, + "loss": 0.2776, + "step": 2112 + }, + { + "epoch": 2.377039954980304, + "grad_norm": 0.23782573779211985, + "learning_rate": 1.1493533583646226e-05, + "loss": 0.2882, + "step": 2113 + }, + { + "epoch": 2.37816544738323, + "grad_norm": 0.20792260803969095, + "learning_rate": 1.1472674176053401e-05, + "loss": 0.2711, + "step": 2114 + }, + { + "epoch": 2.3792909397861566, + "grad_norm": 0.27657171997481406, + "learning_rate": 1.1451814768460576e-05, + "loss": 0.2866, + "step": 2115 + }, + { + "epoch": 2.3804164321890826, + "grad_norm": 0.23403563768428892, + "learning_rate": 1.1430955360867753e-05, + "loss": 0.2936, + "step": 2116 + }, + { + "epoch": 2.381541924592009, + "grad_norm": 0.2268435928373139, + "learning_rate": 1.1410095953274927e-05, + "loss": 0.2856, + "step": 2117 + }, + { + "epoch": 2.382667416994935, + "grad_norm": 0.2547588426300782, + "learning_rate": 1.1389236545682102e-05, + "loss": 0.2825, + "step": 2118 + }, + { + "epoch": 2.3837929093978616, + "grad_norm": 0.2251742508238687, + "learning_rate": 1.1368377138089279e-05, + "loss": 0.27, + "step": 2119 + }, + { + "epoch": 2.384918401800788, + "grad_norm": 0.23999465674072157, + "learning_rate": 1.1347517730496454e-05, + "loss": 0.3047, + "step": 2120 + }, + { + "epoch": 2.386043894203714, + "grad_norm": 0.23662614777077606, + "learning_rate": 1.132665832290363e-05, + "loss": 0.2988, + "step": 2121 + }, + { + "epoch": 2.3871693866066406, + "grad_norm": 0.2186602406031677, + "learning_rate": 1.1305798915310805e-05, + "loss": 0.2728, + "step": 2122 + }, + { + "epoch": 2.3882948790095666, + "grad_norm": 0.23328788430189215, + "learning_rate": 1.1284939507717982e-05, + "loss": 0.2899, + "step": 2123 + }, + { + "epoch": 2.389420371412493, + "grad_norm": 0.22192987090662689, + "learning_rate": 1.1264080100125157e-05, + "loss": 0.2988, + "step": 2124 + }, + { + "epoch": 2.390545863815419, + "grad_norm": 0.2250119571074059, + "learning_rate": 1.1243220692532333e-05, + "loss": 0.2956, + "step": 2125 + }, + { + "epoch": 2.3916713562183456, + "grad_norm": 0.23070732043801884, + "learning_rate": 1.122236128493951e-05, + "loss": 0.2902, + "step": 2126 + }, + { + "epoch": 2.3927968486212716, + "grad_norm": 0.22518386141677244, + "learning_rate": 1.1201501877346685e-05, + "loss": 0.2659, + "step": 2127 + }, + { + "epoch": 2.393922341024198, + "grad_norm": 0.21584313221933796, + "learning_rate": 1.1180642469753858e-05, + "loss": 0.288, + "step": 2128 + }, + { + "epoch": 2.3950478334271246, + "grad_norm": 0.24985482700142989, + "learning_rate": 1.1159783062161035e-05, + "loss": 0.2874, + "step": 2129 + }, + { + "epoch": 2.3961733258300506, + "grad_norm": 0.23472182953400522, + "learning_rate": 1.113892365456821e-05, + "loss": 0.2797, + "step": 2130 + }, + { + "epoch": 2.397298818232977, + "grad_norm": 0.23998025758676889, + "learning_rate": 1.1118064246975386e-05, + "loss": 0.2919, + "step": 2131 + }, + { + "epoch": 2.398424310635903, + "grad_norm": 0.21809256274475072, + "learning_rate": 1.1097204839382561e-05, + "loss": 0.2758, + "step": 2132 + }, + { + "epoch": 2.3995498030388296, + "grad_norm": 0.23454882483715764, + "learning_rate": 1.1076345431789738e-05, + "loss": 0.2952, + "step": 2133 + }, + { + "epoch": 2.4006752954417556, + "grad_norm": 0.2246557652335286, + "learning_rate": 1.1055486024196914e-05, + "loss": 0.2981, + "step": 2134 + }, + { + "epoch": 2.401800787844682, + "grad_norm": 0.25315434629928985, + "learning_rate": 1.1034626616604089e-05, + "loss": 0.3066, + "step": 2135 + }, + { + "epoch": 2.402926280247608, + "grad_norm": 0.22665010188162998, + "learning_rate": 1.1013767209011266e-05, + "loss": 0.2923, + "step": 2136 + }, + { + "epoch": 2.4040517726505346, + "grad_norm": 0.2501297143991106, + "learning_rate": 1.0992907801418441e-05, + "loss": 0.2782, + "step": 2137 + }, + { + "epoch": 2.405177265053461, + "grad_norm": 0.23355470062481642, + "learning_rate": 1.0972048393825615e-05, + "loss": 0.2958, + "step": 2138 + }, + { + "epoch": 2.406302757456387, + "grad_norm": 0.22524426706184972, + "learning_rate": 1.0951188986232792e-05, + "loss": 0.2971, + "step": 2139 + }, + { + "epoch": 2.4074282498593136, + "grad_norm": 0.2652706460468126, + "learning_rate": 1.0930329578639967e-05, + "loss": 0.3063, + "step": 2140 + }, + { + "epoch": 2.4085537422622396, + "grad_norm": 0.27147074305958385, + "learning_rate": 1.0909470171047142e-05, + "loss": 0.2835, + "step": 2141 + }, + { + "epoch": 2.409679234665166, + "grad_norm": 0.21263510668944327, + "learning_rate": 1.0888610763454318e-05, + "loss": 0.2759, + "step": 2142 + }, + { + "epoch": 2.410804727068092, + "grad_norm": 0.2414280469462777, + "learning_rate": 1.0867751355861495e-05, + "loss": 0.3011, + "step": 2143 + }, + { + "epoch": 2.4119302194710186, + "grad_norm": 0.2563072903091181, + "learning_rate": 1.084689194826867e-05, + "loss": 0.3011, + "step": 2144 + }, + { + "epoch": 2.4130557118739446, + "grad_norm": 0.26752154229648717, + "learning_rate": 1.0826032540675845e-05, + "loss": 0.3025, + "step": 2145 + }, + { + "epoch": 2.414181204276871, + "grad_norm": 0.2191490869328681, + "learning_rate": 1.0805173133083022e-05, + "loss": 0.2838, + "step": 2146 + }, + { + "epoch": 2.4153066966797976, + "grad_norm": 0.2754295487570369, + "learning_rate": 1.0784313725490197e-05, + "loss": 0.2745, + "step": 2147 + }, + { + "epoch": 2.4164321890827236, + "grad_norm": 0.27496282852437165, + "learning_rate": 1.0763454317897373e-05, + "loss": 0.2949, + "step": 2148 + }, + { + "epoch": 2.41755768148565, + "grad_norm": 0.200617793351321, + "learning_rate": 1.0742594910304548e-05, + "loss": 0.2844, + "step": 2149 + }, + { + "epoch": 2.418683173888576, + "grad_norm": 0.2261161966006784, + "learning_rate": 1.0721735502711723e-05, + "loss": 0.3074, + "step": 2150 + }, + { + "epoch": 2.4198086662915026, + "grad_norm": 0.2721691004576223, + "learning_rate": 1.0700876095118899e-05, + "loss": 0.2887, + "step": 2151 + }, + { + "epoch": 2.4209341586944286, + "grad_norm": 0.2468891901297125, + "learning_rate": 1.0680016687526074e-05, + "loss": 0.2826, + "step": 2152 + }, + { + "epoch": 2.422059651097355, + "grad_norm": 0.22981129965663172, + "learning_rate": 1.065915727993325e-05, + "loss": 0.2982, + "step": 2153 + }, + { + "epoch": 2.423185143500281, + "grad_norm": 0.23778012142265284, + "learning_rate": 1.0638297872340426e-05, + "loss": 0.2803, + "step": 2154 + }, + { + "epoch": 2.4243106359032076, + "grad_norm": 0.2502133834738445, + "learning_rate": 1.0617438464747601e-05, + "loss": 0.2894, + "step": 2155 + }, + { + "epoch": 2.425436128306134, + "grad_norm": 0.24021651572443242, + "learning_rate": 1.0596579057154778e-05, + "loss": 0.2792, + "step": 2156 + }, + { + "epoch": 2.42656162070906, + "grad_norm": 0.21287174233579118, + "learning_rate": 1.0575719649561954e-05, + "loss": 0.2653, + "step": 2157 + }, + { + "epoch": 2.4276871131119866, + "grad_norm": 0.2596480129053586, + "learning_rate": 1.0554860241969129e-05, + "loss": 0.2812, + "step": 2158 + }, + { + "epoch": 2.4288126055149126, + "grad_norm": 0.22839461536852768, + "learning_rate": 1.0534000834376304e-05, + "loss": 0.2887, + "step": 2159 + }, + { + "epoch": 2.429938097917839, + "grad_norm": 0.25082900774514266, + "learning_rate": 1.051314142678348e-05, + "loss": 0.2848, + "step": 2160 + }, + { + "epoch": 2.4310635903207656, + "grad_norm": 0.21582263533702323, + "learning_rate": 1.0492282019190655e-05, + "loss": 0.2848, + "step": 2161 + }, + { + "epoch": 2.4321890827236916, + "grad_norm": 0.20981491769940364, + "learning_rate": 1.047142261159783e-05, + "loss": 0.2771, + "step": 2162 + }, + { + "epoch": 2.433314575126618, + "grad_norm": 0.2478690946929455, + "learning_rate": 1.0450563204005007e-05, + "loss": 0.2978, + "step": 2163 + }, + { + "epoch": 2.434440067529544, + "grad_norm": 0.23623868199579823, + "learning_rate": 1.0429703796412182e-05, + "loss": 0.2868, + "step": 2164 + }, + { + "epoch": 2.4355655599324706, + "grad_norm": 0.22479499127056093, + "learning_rate": 1.0408844388819358e-05, + "loss": 0.2864, + "step": 2165 + }, + { + "epoch": 2.4366910523353966, + "grad_norm": 0.22778228885333549, + "learning_rate": 1.0387984981226535e-05, + "loss": 0.2703, + "step": 2166 + }, + { + "epoch": 2.437816544738323, + "grad_norm": 0.24955225194107739, + "learning_rate": 1.036712557363371e-05, + "loss": 0.2881, + "step": 2167 + }, + { + "epoch": 2.438942037141249, + "grad_norm": 0.24688296808661256, + "learning_rate": 1.0346266166040885e-05, + "loss": 0.2892, + "step": 2168 + }, + { + "epoch": 2.4400675295441756, + "grad_norm": 0.21066675955547629, + "learning_rate": 1.032540675844806e-05, + "loss": 0.2665, + "step": 2169 + }, + { + "epoch": 2.441193021947102, + "grad_norm": 0.23152630032898566, + "learning_rate": 1.0304547350855236e-05, + "loss": 0.2879, + "step": 2170 + }, + { + "epoch": 2.442318514350028, + "grad_norm": 0.23881733868242846, + "learning_rate": 1.0283687943262411e-05, + "loss": 0.284, + "step": 2171 + }, + { + "epoch": 2.4434440067529546, + "grad_norm": 0.24727876228693577, + "learning_rate": 1.0262828535669586e-05, + "loss": 0.2799, + "step": 2172 + }, + { + "epoch": 2.4445694991558806, + "grad_norm": 0.2237223246325913, + "learning_rate": 1.0241969128076763e-05, + "loss": 0.2839, + "step": 2173 + }, + { + "epoch": 2.445694991558807, + "grad_norm": 0.2255880979174184, + "learning_rate": 1.0221109720483939e-05, + "loss": 0.3039, + "step": 2174 + }, + { + "epoch": 2.446820483961733, + "grad_norm": 0.2170555923070572, + "learning_rate": 1.0200250312891114e-05, + "loss": 0.2768, + "step": 2175 + }, + { + "epoch": 2.4479459763646596, + "grad_norm": 0.20774037005388524, + "learning_rate": 1.0179390905298291e-05, + "loss": 0.2834, + "step": 2176 + }, + { + "epoch": 2.4490714687675856, + "grad_norm": 0.2265465766383895, + "learning_rate": 1.0158531497705466e-05, + "loss": 0.3058, + "step": 2177 + }, + { + "epoch": 2.450196961170512, + "grad_norm": 0.20568423154158125, + "learning_rate": 1.0137672090112642e-05, + "loss": 0.2692, + "step": 2178 + }, + { + "epoch": 2.4513224535734386, + "grad_norm": 0.22834881557663556, + "learning_rate": 1.0116812682519817e-05, + "loss": 0.2892, + "step": 2179 + }, + { + "epoch": 2.4524479459763646, + "grad_norm": 0.22028619420108753, + "learning_rate": 1.0095953274926992e-05, + "loss": 0.2919, + "step": 2180 + }, + { + "epoch": 2.453573438379291, + "grad_norm": 0.2425115217082142, + "learning_rate": 1.0075093867334167e-05, + "loss": 0.2764, + "step": 2181 + }, + { + "epoch": 2.454698930782217, + "grad_norm": 0.22791631771168733, + "learning_rate": 1.0054234459741343e-05, + "loss": 0.267, + "step": 2182 + }, + { + "epoch": 2.4558244231851436, + "grad_norm": 0.1926906774199996, + "learning_rate": 1.003337505214852e-05, + "loss": 0.2683, + "step": 2183 + }, + { + "epoch": 2.4569499155880696, + "grad_norm": 0.23290868443818466, + "learning_rate": 1.0012515644555695e-05, + "loss": 0.2791, + "step": 2184 + }, + { + "epoch": 2.458075407990996, + "grad_norm": 0.2968317001207595, + "learning_rate": 9.99165623696287e-06, + "loss": 0.2959, + "step": 2185 + }, + { + "epoch": 2.459200900393922, + "grad_norm": 0.250149366010179, + "learning_rate": 9.970796829370047e-06, + "loss": 0.2968, + "step": 2186 + }, + { + "epoch": 2.4603263927968486, + "grad_norm": 0.23676392349962846, + "learning_rate": 9.949937421777223e-06, + "loss": 0.2766, + "step": 2187 + }, + { + "epoch": 2.461451885199775, + "grad_norm": 0.2968220951755795, + "learning_rate": 9.929078014184398e-06, + "loss": 0.2858, + "step": 2188 + }, + { + "epoch": 2.462577377602701, + "grad_norm": 0.24526431357390857, + "learning_rate": 9.908218606591573e-06, + "loss": 0.2776, + "step": 2189 + }, + { + "epoch": 2.4637028700056276, + "grad_norm": 0.2072075588604563, + "learning_rate": 9.887359198998748e-06, + "loss": 0.2786, + "step": 2190 + }, + { + "epoch": 2.4648283624085536, + "grad_norm": 0.24560787407943072, + "learning_rate": 9.866499791405924e-06, + "loss": 0.3076, + "step": 2191 + }, + { + "epoch": 2.46595385481148, + "grad_norm": 0.2807855048371902, + "learning_rate": 9.845640383813099e-06, + "loss": 0.3059, + "step": 2192 + }, + { + "epoch": 2.467079347214406, + "grad_norm": 0.21339940056568182, + "learning_rate": 9.824780976220276e-06, + "loss": 0.2835, + "step": 2193 + }, + { + "epoch": 2.4682048396173326, + "grad_norm": 0.23237003408073176, + "learning_rate": 9.803921568627451e-06, + "loss": 0.2774, + "step": 2194 + }, + { + "epoch": 2.4693303320202586, + "grad_norm": 0.22471960654724552, + "learning_rate": 9.783062161034627e-06, + "loss": 0.3059, + "step": 2195 + }, + { + "epoch": 2.470455824423185, + "grad_norm": 0.21133212055331363, + "learning_rate": 9.762202753441804e-06, + "loss": 0.2804, + "step": 2196 + }, + { + "epoch": 2.4715813168261116, + "grad_norm": 0.22866555875952663, + "learning_rate": 9.741343345848979e-06, + "loss": 0.2668, + "step": 2197 + }, + { + "epoch": 2.4727068092290376, + "grad_norm": 0.23022775610838142, + "learning_rate": 9.720483938256154e-06, + "loss": 0.2941, + "step": 2198 + }, + { + "epoch": 2.473832301631964, + "grad_norm": 0.24916655338248048, + "learning_rate": 9.69962453066333e-06, + "loss": 0.2875, + "step": 2199 + }, + { + "epoch": 2.47495779403489, + "grad_norm": 0.22598137001947038, + "learning_rate": 9.678765123070506e-06, + "loss": 0.287, + "step": 2200 + }, + { + "epoch": 2.4760832864378166, + "grad_norm": 0.19562266451832722, + "learning_rate": 9.65790571547768e-06, + "loss": 0.2726, + "step": 2201 + }, + { + "epoch": 2.4772087788407426, + "grad_norm": 0.22330052278775112, + "learning_rate": 9.637046307884855e-06, + "loss": 0.2862, + "step": 2202 + }, + { + "epoch": 2.478334271243669, + "grad_norm": 0.22895521592496432, + "learning_rate": 9.616186900292032e-06, + "loss": 0.2858, + "step": 2203 + }, + { + "epoch": 2.479459763646595, + "grad_norm": 0.22023179481636448, + "learning_rate": 9.595327492699208e-06, + "loss": 0.2768, + "step": 2204 + }, + { + "epoch": 2.4805852560495216, + "grad_norm": 0.23642223233900708, + "learning_rate": 9.574468085106383e-06, + "loss": 0.2878, + "step": 2205 + }, + { + "epoch": 2.481710748452448, + "grad_norm": 0.2391107708431571, + "learning_rate": 9.55360867751356e-06, + "loss": 0.2879, + "step": 2206 + }, + { + "epoch": 2.482836240855374, + "grad_norm": 0.24152975198499732, + "learning_rate": 9.532749269920735e-06, + "loss": 0.2955, + "step": 2207 + }, + { + "epoch": 2.4839617332583006, + "grad_norm": 0.23299532148669774, + "learning_rate": 9.51188986232791e-06, + "loss": 0.2962, + "step": 2208 + }, + { + "epoch": 2.4850872256612266, + "grad_norm": 0.20896130963456966, + "learning_rate": 9.491030454735086e-06, + "loss": 0.284, + "step": 2209 + }, + { + "epoch": 2.486212718064153, + "grad_norm": 0.2105385871507124, + "learning_rate": 9.470171047142263e-06, + "loss": 0.2729, + "step": 2210 + }, + { + "epoch": 2.4873382104670796, + "grad_norm": 0.21484947461149867, + "learning_rate": 9.449311639549436e-06, + "loss": 0.2926, + "step": 2211 + }, + { + "epoch": 2.4884637028700056, + "grad_norm": 0.2190385482446419, + "learning_rate": 9.428452231956612e-06, + "loss": 0.2825, + "step": 2212 + }, + { + "epoch": 2.489589195272932, + "grad_norm": 0.2142534982080354, + "learning_rate": 9.407592824363789e-06, + "loss": 0.2751, + "step": 2213 + }, + { + "epoch": 2.490714687675858, + "grad_norm": 0.21708738862041638, + "learning_rate": 9.386733416770964e-06, + "loss": 0.2786, + "step": 2214 + }, + { + "epoch": 2.4918401800787846, + "grad_norm": 0.2181940682924344, + "learning_rate": 9.365874009178139e-06, + "loss": 0.2891, + "step": 2215 + }, + { + "epoch": 2.4929656724817106, + "grad_norm": 0.24361785849546538, + "learning_rate": 9.345014601585316e-06, + "loss": 0.2862, + "step": 2216 + }, + { + "epoch": 2.494091164884637, + "grad_norm": 0.2074874339468701, + "learning_rate": 9.324155193992491e-06, + "loss": 0.2779, + "step": 2217 + }, + { + "epoch": 2.495216657287563, + "grad_norm": 0.22232685525965187, + "learning_rate": 9.303295786399667e-06, + "loss": 0.2872, + "step": 2218 + }, + { + "epoch": 2.4963421496904896, + "grad_norm": 0.22940288362612324, + "learning_rate": 9.282436378806842e-06, + "loss": 0.2675, + "step": 2219 + }, + { + "epoch": 2.497467642093416, + "grad_norm": 0.22467443084840247, + "learning_rate": 9.261576971214019e-06, + "loss": 0.2801, + "step": 2220 + }, + { + "epoch": 2.498593134496342, + "grad_norm": 0.23139305058585594, + "learning_rate": 9.240717563621194e-06, + "loss": 0.2743, + "step": 2221 + }, + { + "epoch": 2.4997186268992686, + "grad_norm": 0.2360404033010022, + "learning_rate": 9.219858156028368e-06, + "loss": 0.286, + "step": 2222 + }, + { + "epoch": 2.5008441193021946, + "grad_norm": 0.20605066820343487, + "learning_rate": 9.198998748435545e-06, + "loss": 0.2823, + "step": 2223 + }, + { + "epoch": 2.501969611705121, + "grad_norm": 0.2564735378536905, + "learning_rate": 9.17813934084272e-06, + "loss": 0.2947, + "step": 2224 + }, + { + "epoch": 2.503095104108047, + "grad_norm": 0.2320837293470589, + "learning_rate": 9.157279933249895e-06, + "loss": 0.2768, + "step": 2225 + }, + { + "epoch": 2.5042205965109736, + "grad_norm": 0.208589920793005, + "learning_rate": 9.136420525657072e-06, + "loss": 0.29, + "step": 2226 + }, + { + "epoch": 2.5053460889138996, + "grad_norm": 0.21389293826499295, + "learning_rate": 9.115561118064248e-06, + "loss": 0.2798, + "step": 2227 + }, + { + "epoch": 2.506471581316826, + "grad_norm": 0.22046720544274087, + "learning_rate": 9.094701710471423e-06, + "loss": 0.2937, + "step": 2228 + }, + { + "epoch": 2.5075970737197526, + "grad_norm": 0.22495729889410385, + "learning_rate": 9.073842302878598e-06, + "loss": 0.2879, + "step": 2229 + }, + { + "epoch": 2.5087225661226786, + "grad_norm": 0.20269539252904967, + "learning_rate": 9.052982895285775e-06, + "loss": 0.2774, + "step": 2230 + }, + { + "epoch": 2.509848058525605, + "grad_norm": 0.1980840443630393, + "learning_rate": 9.03212348769295e-06, + "loss": 0.281, + "step": 2231 + }, + { + "epoch": 2.510973550928531, + "grad_norm": 0.22695316930947035, + "learning_rate": 9.011264080100124e-06, + "loss": 0.2922, + "step": 2232 + }, + { + "epoch": 2.5120990433314576, + "grad_norm": 0.20934803359715298, + "learning_rate": 8.990404672507301e-06, + "loss": 0.2794, + "step": 2233 + }, + { + "epoch": 2.5132245357343836, + "grad_norm": 0.21112109217582253, + "learning_rate": 8.969545264914476e-06, + "loss": 0.2907, + "step": 2234 + }, + { + "epoch": 2.51435002813731, + "grad_norm": 0.22573731590530483, + "learning_rate": 8.948685857321652e-06, + "loss": 0.291, + "step": 2235 + }, + { + "epoch": 2.515475520540236, + "grad_norm": 0.23892740697159065, + "learning_rate": 8.927826449728829e-06, + "loss": 0.273, + "step": 2236 + }, + { + "epoch": 2.5166010129431626, + "grad_norm": 0.22535585510058634, + "learning_rate": 8.906967042136004e-06, + "loss": 0.2837, + "step": 2237 + }, + { + "epoch": 2.517726505346089, + "grad_norm": 0.2194038445722204, + "learning_rate": 8.88610763454318e-06, + "loss": 0.2927, + "step": 2238 + }, + { + "epoch": 2.518851997749015, + "grad_norm": 0.22905667951438685, + "learning_rate": 8.865248226950355e-06, + "loss": 0.2789, + "step": 2239 + }, + { + "epoch": 2.5199774901519416, + "grad_norm": 0.22625912351056832, + "learning_rate": 8.844388819357532e-06, + "loss": 0.2755, + "step": 2240 + }, + { + "epoch": 2.5211029825548676, + "grad_norm": 0.23804689181224994, + "learning_rate": 8.823529411764707e-06, + "loss": 0.2792, + "step": 2241 + }, + { + "epoch": 2.522228474957794, + "grad_norm": 0.2105408688549035, + "learning_rate": 8.802670004171882e-06, + "loss": 0.2972, + "step": 2242 + }, + { + "epoch": 2.52335396736072, + "grad_norm": 0.22340033958156802, + "learning_rate": 8.781810596579057e-06, + "loss": 0.2946, + "step": 2243 + }, + { + "epoch": 2.5244794597636466, + "grad_norm": 0.2297895889368776, + "learning_rate": 8.760951188986233e-06, + "loss": 0.2938, + "step": 2244 + }, + { + "epoch": 2.5256049521665727, + "grad_norm": 0.2259147810494066, + "learning_rate": 8.740091781393408e-06, + "loss": 0.2813, + "step": 2245 + }, + { + "epoch": 2.526730444569499, + "grad_norm": 0.23547288393006746, + "learning_rate": 8.719232373800585e-06, + "loss": 0.2994, + "step": 2246 + }, + { + "epoch": 2.5278559369724256, + "grad_norm": 0.22543446780315715, + "learning_rate": 8.69837296620776e-06, + "loss": 0.2846, + "step": 2247 + }, + { + "epoch": 2.5289814293753516, + "grad_norm": 0.2154532957908738, + "learning_rate": 8.677513558614936e-06, + "loss": 0.2879, + "step": 2248 + }, + { + "epoch": 2.530106921778278, + "grad_norm": 0.2351801079174597, + "learning_rate": 8.65665415102211e-06, + "loss": 0.2765, + "step": 2249 + }, + { + "epoch": 2.531232414181204, + "grad_norm": 0.21366786894791512, + "learning_rate": 8.635794743429288e-06, + "loss": 0.2734, + "step": 2250 + }, + { + "epoch": 2.5323579065841306, + "grad_norm": 0.23645349161640047, + "learning_rate": 8.614935335836463e-06, + "loss": 0.2984, + "step": 2251 + }, + { + "epoch": 2.533483398987057, + "grad_norm": 0.23434820101602807, + "learning_rate": 8.594075928243638e-06, + "loss": 0.2968, + "step": 2252 + }, + { + "epoch": 2.534608891389983, + "grad_norm": 0.23800902126311332, + "learning_rate": 8.573216520650814e-06, + "loss": 0.2828, + "step": 2253 + }, + { + "epoch": 2.535734383792909, + "grad_norm": 0.2538132352809376, + "learning_rate": 8.552357113057989e-06, + "loss": 0.2843, + "step": 2254 + }, + { + "epoch": 2.5368598761958356, + "grad_norm": 0.21371163966598017, + "learning_rate": 8.531497705465164e-06, + "loss": 0.2751, + "step": 2255 + }, + { + "epoch": 2.537985368598762, + "grad_norm": 0.21482253029817228, + "learning_rate": 8.510638297872341e-06, + "loss": 0.3006, + "step": 2256 + }, + { + "epoch": 2.539110861001688, + "grad_norm": 0.21834391394392152, + "learning_rate": 8.489778890279517e-06, + "loss": 0.288, + "step": 2257 + }, + { + "epoch": 2.5402363534046146, + "grad_norm": 0.2385102842630092, + "learning_rate": 8.468919482686692e-06, + "loss": 0.2873, + "step": 2258 + }, + { + "epoch": 2.5413618458075407, + "grad_norm": 0.2496691464287376, + "learning_rate": 8.448060075093867e-06, + "loss": 0.2868, + "step": 2259 + }, + { + "epoch": 2.542487338210467, + "grad_norm": 0.2014232955964171, + "learning_rate": 8.427200667501044e-06, + "loss": 0.2851, + "step": 2260 + }, + { + "epoch": 2.5436128306133936, + "grad_norm": 0.23384968447549695, + "learning_rate": 8.40634125990822e-06, + "loss": 0.2842, + "step": 2261 + }, + { + "epoch": 2.5447383230163196, + "grad_norm": 0.21977668018953103, + "learning_rate": 8.385481852315395e-06, + "loss": 0.2883, + "step": 2262 + }, + { + "epoch": 2.5458638154192457, + "grad_norm": 0.21776563647468017, + "learning_rate": 8.36462244472257e-06, + "loss": 0.2856, + "step": 2263 + }, + { + "epoch": 2.546989307822172, + "grad_norm": 0.2027944392061715, + "learning_rate": 8.343763037129745e-06, + "loss": 0.2765, + "step": 2264 + }, + { + "epoch": 2.5481148002250986, + "grad_norm": 0.21029054091165603, + "learning_rate": 8.32290362953692e-06, + "loss": 0.2767, + "step": 2265 + }, + { + "epoch": 2.5492402926280247, + "grad_norm": 0.21418622748856342, + "learning_rate": 8.302044221944098e-06, + "loss": 0.2948, + "step": 2266 + }, + { + "epoch": 2.550365785030951, + "grad_norm": 0.21907388139154874, + "learning_rate": 8.281184814351273e-06, + "loss": 0.2736, + "step": 2267 + }, + { + "epoch": 2.551491277433877, + "grad_norm": 0.21904845452521604, + "learning_rate": 8.260325406758448e-06, + "loss": 0.3056, + "step": 2268 + }, + { + "epoch": 2.5526167698368036, + "grad_norm": 0.20459659904962244, + "learning_rate": 8.239465999165623e-06, + "loss": 0.2807, + "step": 2269 + }, + { + "epoch": 2.55374226223973, + "grad_norm": 0.20176624512330674, + "learning_rate": 8.2186065915728e-06, + "loss": 0.282, + "step": 2270 + }, + { + "epoch": 2.554867754642656, + "grad_norm": 0.2171053854970344, + "learning_rate": 8.197747183979976e-06, + "loss": 0.2867, + "step": 2271 + }, + { + "epoch": 2.555993247045582, + "grad_norm": 0.21608909264471945, + "learning_rate": 8.176887776387151e-06, + "loss": 0.2843, + "step": 2272 + }, + { + "epoch": 2.5571187394485086, + "grad_norm": 0.22363745774157207, + "learning_rate": 8.156028368794328e-06, + "loss": 0.292, + "step": 2273 + }, + { + "epoch": 2.558244231851435, + "grad_norm": 0.1967157457122503, + "learning_rate": 8.135168961201502e-06, + "loss": 0.2785, + "step": 2274 + }, + { + "epoch": 2.559369724254361, + "grad_norm": 0.21025592482731642, + "learning_rate": 8.114309553608677e-06, + "loss": 0.2758, + "step": 2275 + }, + { + "epoch": 2.5604952166572876, + "grad_norm": 0.21847518826316134, + "learning_rate": 8.093450146015854e-06, + "loss": 0.2969, + "step": 2276 + }, + { + "epoch": 2.5616207090602137, + "grad_norm": 0.22602160924202305, + "learning_rate": 8.072590738423029e-06, + "loss": 0.2883, + "step": 2277 + }, + { + "epoch": 2.56274620146314, + "grad_norm": 0.20256712231044452, + "learning_rate": 8.051731330830204e-06, + "loss": 0.2695, + "step": 2278 + }, + { + "epoch": 2.5638716938660666, + "grad_norm": 0.20681151204540096, + "learning_rate": 8.030871923237381e-06, + "loss": 0.2654, + "step": 2279 + }, + { + "epoch": 2.5649971862689926, + "grad_norm": 0.23344582380587078, + "learning_rate": 8.010012515644557e-06, + "loss": 0.2986, + "step": 2280 + }, + { + "epoch": 2.566122678671919, + "grad_norm": 0.22256302367590555, + "learning_rate": 7.989153108051732e-06, + "loss": 0.2833, + "step": 2281 + }, + { + "epoch": 2.567248171074845, + "grad_norm": 0.20447744073654678, + "learning_rate": 7.968293700458907e-06, + "loss": 0.2859, + "step": 2282 + }, + { + "epoch": 2.5683736634777716, + "grad_norm": 0.20565529180207448, + "learning_rate": 7.947434292866084e-06, + "loss": 0.2742, + "step": 2283 + }, + { + "epoch": 2.5694991558806977, + "grad_norm": 0.21066765721313158, + "learning_rate": 7.926574885273258e-06, + "loss": 0.2944, + "step": 2284 + }, + { + "epoch": 2.570624648283624, + "grad_norm": 0.21517637060390432, + "learning_rate": 7.905715477680433e-06, + "loss": 0.2875, + "step": 2285 + }, + { + "epoch": 2.57175014068655, + "grad_norm": 0.21947956446898098, + "learning_rate": 7.88485607008761e-06, + "loss": 0.3004, + "step": 2286 + }, + { + "epoch": 2.5728756330894766, + "grad_norm": 0.22114557622949502, + "learning_rate": 7.863996662494785e-06, + "loss": 0.2976, + "step": 2287 + }, + { + "epoch": 2.574001125492403, + "grad_norm": 0.22379469537623312, + "learning_rate": 7.84313725490196e-06, + "loss": 0.2878, + "step": 2288 + }, + { + "epoch": 2.575126617895329, + "grad_norm": 0.2071839477449149, + "learning_rate": 7.822277847309138e-06, + "loss": 0.2795, + "step": 2289 + }, + { + "epoch": 2.5762521102982556, + "grad_norm": 0.2237931852947739, + "learning_rate": 7.801418439716313e-06, + "loss": 0.2971, + "step": 2290 + }, + { + "epoch": 2.5773776027011817, + "grad_norm": 0.21266520141625195, + "learning_rate": 7.780559032123488e-06, + "loss": 0.2853, + "step": 2291 + }, + { + "epoch": 2.578503095104108, + "grad_norm": 0.2486160020515366, + "learning_rate": 7.759699624530664e-06, + "loss": 0.2961, + "step": 2292 + }, + { + "epoch": 2.579628587507034, + "grad_norm": 0.2068308805691666, + "learning_rate": 7.73884021693784e-06, + "loss": 0.2691, + "step": 2293 + }, + { + "epoch": 2.5807540799099606, + "grad_norm": 0.2230851463060974, + "learning_rate": 7.717980809345016e-06, + "loss": 0.2995, + "step": 2294 + }, + { + "epoch": 2.5818795723128867, + "grad_norm": 0.2374977031933618, + "learning_rate": 7.69712140175219e-06, + "loss": 0.283, + "step": 2295 + }, + { + "epoch": 2.583005064715813, + "grad_norm": 0.24062860705542086, + "learning_rate": 7.676261994159366e-06, + "loss": 0.2957, + "step": 2296 + }, + { + "epoch": 2.5841305571187396, + "grad_norm": 0.20537260389777368, + "learning_rate": 7.655402586566542e-06, + "loss": 0.2921, + "step": 2297 + }, + { + "epoch": 2.5852560495216657, + "grad_norm": 0.21853998967769137, + "learning_rate": 7.634543178973717e-06, + "loss": 0.2894, + "step": 2298 + }, + { + "epoch": 2.586381541924592, + "grad_norm": 0.21880735610653707, + "learning_rate": 7.613683771380893e-06, + "loss": 0.2791, + "step": 2299 + }, + { + "epoch": 2.587507034327518, + "grad_norm": 0.22402757654717384, + "learning_rate": 7.592824363788069e-06, + "loss": 0.289, + "step": 2300 + }, + { + "epoch": 2.5886325267304446, + "grad_norm": 0.2147892961394563, + "learning_rate": 7.5719649561952445e-06, + "loss": 0.2812, + "step": 2301 + }, + { + "epoch": 2.589758019133371, + "grad_norm": 0.22876144329979556, + "learning_rate": 7.551105548602421e-06, + "loss": 0.2933, + "step": 2302 + }, + { + "epoch": 2.590883511536297, + "grad_norm": 0.23442708833949216, + "learning_rate": 7.530246141009596e-06, + "loss": 0.2814, + "step": 2303 + }, + { + "epoch": 2.592009003939223, + "grad_norm": 0.23484614963727998, + "learning_rate": 7.509386733416772e-06, + "loss": 0.3144, + "step": 2304 + }, + { + "epoch": 2.5931344963421497, + "grad_norm": 0.20921707796315442, + "learning_rate": 7.4885273258239465e-06, + "loss": 0.286, + "step": 2305 + }, + { + "epoch": 2.594259988745076, + "grad_norm": 0.22775240735379326, + "learning_rate": 7.467667918231122e-06, + "loss": 0.2788, + "step": 2306 + }, + { + "epoch": 2.595385481148002, + "grad_norm": 0.22230000059940203, + "learning_rate": 7.446808510638298e-06, + "loss": 0.2815, + "step": 2307 + }, + { + "epoch": 2.5965109735509286, + "grad_norm": 0.24545298078462746, + "learning_rate": 7.425949103045473e-06, + "loss": 0.2735, + "step": 2308 + }, + { + "epoch": 2.5976364659538547, + "grad_norm": 0.19625632990047406, + "learning_rate": 7.405089695452649e-06, + "loss": 0.2731, + "step": 2309 + }, + { + "epoch": 2.598761958356781, + "grad_norm": 0.20900090173879718, + "learning_rate": 7.3842302878598255e-06, + "loss": 0.2721, + "step": 2310 + }, + { + "epoch": 2.5998874507597076, + "grad_norm": 0.2123621289927944, + "learning_rate": 7.363370880267001e-06, + "loss": 0.2698, + "step": 2311 + }, + { + "epoch": 2.6010129431626337, + "grad_norm": 0.21369756285267333, + "learning_rate": 7.342511472674177e-06, + "loss": 0.2834, + "step": 2312 + }, + { + "epoch": 2.6021384355655597, + "grad_norm": 0.22793059698710658, + "learning_rate": 7.321652065081352e-06, + "loss": 0.2851, + "step": 2313 + }, + { + "epoch": 2.603263927968486, + "grad_norm": 0.2134184284204459, + "learning_rate": 7.300792657488528e-06, + "loss": 0.2811, + "step": 2314 + }, + { + "epoch": 2.6043894203714126, + "grad_norm": 0.21325834093200643, + "learning_rate": 7.279933249895703e-06, + "loss": 0.2936, + "step": 2315 + }, + { + "epoch": 2.6055149127743387, + "grad_norm": 0.1991068712411994, + "learning_rate": 7.259073842302878e-06, + "loss": 0.2834, + "step": 2316 + }, + { + "epoch": 2.606640405177265, + "grad_norm": 0.22901278666536629, + "learning_rate": 7.238214434710054e-06, + "loss": 0.3015, + "step": 2317 + }, + { + "epoch": 2.607765897580191, + "grad_norm": 0.21881134986820416, + "learning_rate": 7.21735502711723e-06, + "loss": 0.2876, + "step": 2318 + }, + { + "epoch": 2.6088913899831176, + "grad_norm": 0.22029025156059676, + "learning_rate": 7.196495619524406e-06, + "loss": 0.2988, + "step": 2319 + }, + { + "epoch": 2.610016882386044, + "grad_norm": 0.21007112294863065, + "learning_rate": 7.175636211931582e-06, + "loss": 0.2763, + "step": 2320 + }, + { + "epoch": 2.61114237478897, + "grad_norm": 0.2126401817051627, + "learning_rate": 7.154776804338757e-06, + "loss": 0.2805, + "step": 2321 + }, + { + "epoch": 2.612267867191896, + "grad_norm": 0.20852511391858303, + "learning_rate": 7.133917396745933e-06, + "loss": 0.2936, + "step": 2322 + }, + { + "epoch": 2.6133933595948227, + "grad_norm": 0.21781244059761962, + "learning_rate": 7.1130579891531085e-06, + "loss": 0.2892, + "step": 2323 + }, + { + "epoch": 2.614518851997749, + "grad_norm": 0.22501438470662116, + "learning_rate": 7.092198581560285e-06, + "loss": 0.2839, + "step": 2324 + }, + { + "epoch": 2.615644344400675, + "grad_norm": 0.20568012937631386, + "learning_rate": 7.07133917396746e-06, + "loss": 0.2927, + "step": 2325 + }, + { + "epoch": 2.6167698368036016, + "grad_norm": 0.21222804494470973, + "learning_rate": 7.050479766374634e-06, + "loss": 0.281, + "step": 2326 + }, + { + "epoch": 2.6178953292065277, + "grad_norm": 0.20938841222313492, + "learning_rate": 7.0296203587818105e-06, + "loss": 0.2845, + "step": 2327 + }, + { + "epoch": 2.619020821609454, + "grad_norm": 0.21620523521239354, + "learning_rate": 7.008760951188987e-06, + "loss": 0.2801, + "step": 2328 + }, + { + "epoch": 2.6201463140123806, + "grad_norm": 0.2506118426158015, + "learning_rate": 6.987901543596162e-06, + "loss": 0.2954, + "step": 2329 + }, + { + "epoch": 2.6212718064153067, + "grad_norm": 0.1973550955823624, + "learning_rate": 6.967042136003338e-06, + "loss": 0.2686, + "step": 2330 + }, + { + "epoch": 2.622397298818233, + "grad_norm": 0.2066937107804017, + "learning_rate": 6.946182728410513e-06, + "loss": 0.2774, + "step": 2331 + }, + { + "epoch": 2.623522791221159, + "grad_norm": 0.2202250524273311, + "learning_rate": 6.9253233208176895e-06, + "loss": 0.3068, + "step": 2332 + }, + { + "epoch": 2.6246482836240856, + "grad_norm": 0.21755861209547087, + "learning_rate": 6.904463913224865e-06, + "loss": 0.2723, + "step": 2333 + }, + { + "epoch": 2.6257737760270117, + "grad_norm": 0.21926633058957373, + "learning_rate": 6.883604505632041e-06, + "loss": 0.2903, + "step": 2334 + }, + { + "epoch": 2.626899268429938, + "grad_norm": 0.2130377637928427, + "learning_rate": 6.862745098039216e-06, + "loss": 0.2804, + "step": 2335 + }, + { + "epoch": 2.628024760832864, + "grad_norm": 0.19225627811370669, + "learning_rate": 6.841885690446391e-06, + "loss": 0.2887, + "step": 2336 + }, + { + "epoch": 2.6291502532357907, + "grad_norm": 0.20057254466754687, + "learning_rate": 6.821026282853567e-06, + "loss": 0.2837, + "step": 2337 + }, + { + "epoch": 2.630275745638717, + "grad_norm": 0.23011758988414296, + "learning_rate": 6.800166875260743e-06, + "loss": 0.2931, + "step": 2338 + }, + { + "epoch": 2.631401238041643, + "grad_norm": 0.23666344897934585, + "learning_rate": 6.779307467667918e-06, + "loss": 0.2782, + "step": 2339 + }, + { + "epoch": 2.6325267304445696, + "grad_norm": 0.2147014731643971, + "learning_rate": 6.758448060075094e-06, + "loss": 0.2912, + "step": 2340 + }, + { + "epoch": 2.6336522228474957, + "grad_norm": 0.21234094073131748, + "learning_rate": 6.73758865248227e-06, + "loss": 0.2914, + "step": 2341 + }, + { + "epoch": 2.634777715250422, + "grad_norm": 0.20060222937545014, + "learning_rate": 6.716729244889446e-06, + "loss": 0.2766, + "step": 2342 + }, + { + "epoch": 2.635903207653348, + "grad_norm": 0.21917036852395436, + "learning_rate": 6.695869837296621e-06, + "loss": 0.3013, + "step": 2343 + }, + { + "epoch": 2.6370287000562747, + "grad_norm": 0.20864615144591028, + "learning_rate": 6.675010429703797e-06, + "loss": 0.2792, + "step": 2344 + }, + { + "epoch": 2.6381541924592007, + "grad_norm": 0.2192912221143167, + "learning_rate": 6.6541510221109725e-06, + "loss": 0.284, + "step": 2345 + }, + { + "epoch": 2.639279684862127, + "grad_norm": 0.2039309149630558, + "learning_rate": 6.633291614518149e-06, + "loss": 0.2948, + "step": 2346 + }, + { + "epoch": 2.6404051772650536, + "grad_norm": 0.22259191163490286, + "learning_rate": 6.612432206925323e-06, + "loss": 0.2737, + "step": 2347 + }, + { + "epoch": 2.6415306696679797, + "grad_norm": 0.2419705952514684, + "learning_rate": 6.591572799332499e-06, + "loss": 0.2894, + "step": 2348 + }, + { + "epoch": 2.642656162070906, + "grad_norm": 0.20985587856472956, + "learning_rate": 6.5707133917396745e-06, + "loss": 0.2947, + "step": 2349 + }, + { + "epoch": 2.643781654473832, + "grad_norm": 0.20042601124344012, + "learning_rate": 6.549853984146851e-06, + "loss": 0.271, + "step": 2350 + }, + { + "epoch": 2.6449071468767587, + "grad_norm": 0.20416712565695233, + "learning_rate": 6.528994576554026e-06, + "loss": 0.2743, + "step": 2351 + }, + { + "epoch": 2.646032639279685, + "grad_norm": 0.2184086174145368, + "learning_rate": 6.508135168961202e-06, + "loss": 0.3015, + "step": 2352 + }, + { + "epoch": 2.647158131682611, + "grad_norm": 0.24131101578961572, + "learning_rate": 6.487275761368377e-06, + "loss": 0.2775, + "step": 2353 + }, + { + "epoch": 2.648283624085537, + "grad_norm": 0.22701304755475593, + "learning_rate": 6.4664163537755535e-06, + "loss": 0.2857, + "step": 2354 + }, + { + "epoch": 2.6494091164884637, + "grad_norm": 0.20712379248467747, + "learning_rate": 6.445556946182729e-06, + "loss": 0.2758, + "step": 2355 + }, + { + "epoch": 2.65053460889139, + "grad_norm": 0.23341164321770264, + "learning_rate": 6.424697538589905e-06, + "loss": 0.268, + "step": 2356 + }, + { + "epoch": 2.651660101294316, + "grad_norm": 0.24310923952152994, + "learning_rate": 6.403838130997079e-06, + "loss": 0.2833, + "step": 2357 + }, + { + "epoch": 2.6527855936972426, + "grad_norm": 0.229839416220484, + "learning_rate": 6.3829787234042555e-06, + "loss": 0.2889, + "step": 2358 + }, + { + "epoch": 2.6539110861001687, + "grad_norm": 0.23215272147883, + "learning_rate": 6.362119315811431e-06, + "loss": 0.2975, + "step": 2359 + }, + { + "epoch": 2.655036578503095, + "grad_norm": 0.24954803338960216, + "learning_rate": 6.341259908218607e-06, + "loss": 0.2946, + "step": 2360 + }, + { + "epoch": 2.6561620709060216, + "grad_norm": 0.23026522507576283, + "learning_rate": 6.320400500625782e-06, + "loss": 0.268, + "step": 2361 + }, + { + "epoch": 2.6572875633089477, + "grad_norm": 0.23021270773997743, + "learning_rate": 6.299541093032958e-06, + "loss": 0.261, + "step": 2362 + }, + { + "epoch": 2.6584130557118737, + "grad_norm": 0.21115861014586346, + "learning_rate": 6.278681685440134e-06, + "loss": 0.2771, + "step": 2363 + }, + { + "epoch": 2.6595385481148, + "grad_norm": 0.2405585243153947, + "learning_rate": 6.25782227784731e-06, + "loss": 0.2806, + "step": 2364 + }, + { + "epoch": 2.6606640405177266, + "grad_norm": 0.2497609269658003, + "learning_rate": 6.236962870254485e-06, + "loss": 0.2874, + "step": 2365 + }, + { + "epoch": 2.6617895329206527, + "grad_norm": 0.22645791008309762, + "learning_rate": 6.21610346266166e-06, + "loss": 0.3004, + "step": 2366 + }, + { + "epoch": 2.662915025323579, + "grad_norm": 0.2197914591989606, + "learning_rate": 6.1952440550688365e-06, + "loss": 0.2794, + "step": 2367 + }, + { + "epoch": 2.664040517726505, + "grad_norm": 0.22234883908095063, + "learning_rate": 6.174384647476012e-06, + "loss": 0.3004, + "step": 2368 + }, + { + "epoch": 2.6651660101294317, + "grad_norm": 0.24165293762861514, + "learning_rate": 6.153525239883188e-06, + "loss": 0.2822, + "step": 2369 + }, + { + "epoch": 2.666291502532358, + "grad_norm": 0.2552276571924829, + "learning_rate": 6.132665832290363e-06, + "loss": 0.2909, + "step": 2370 + }, + { + "epoch": 2.667416994935284, + "grad_norm": 0.21066092081346655, + "learning_rate": 6.1118064246975385e-06, + "loss": 0.2798, + "step": 2371 + }, + { + "epoch": 2.66854248733821, + "grad_norm": 0.2142596843222076, + "learning_rate": 6.090947017104715e-06, + "loss": 0.2776, + "step": 2372 + }, + { + "epoch": 2.6696679797411367, + "grad_norm": 0.24551341038876937, + "learning_rate": 6.07008760951189e-06, + "loss": 0.2865, + "step": 2373 + }, + { + "epoch": 2.670793472144063, + "grad_norm": 0.2340361094417635, + "learning_rate": 6.049228201919066e-06, + "loss": 0.2904, + "step": 2374 + }, + { + "epoch": 2.671918964546989, + "grad_norm": 0.21307302351051388, + "learning_rate": 6.028368794326241e-06, + "loss": 0.2937, + "step": 2375 + }, + { + "epoch": 2.6730444569499157, + "grad_norm": 0.2512900946420438, + "learning_rate": 6.007509386733417e-06, + "loss": 0.2842, + "step": 2376 + }, + { + "epoch": 2.6741699493528417, + "grad_norm": 0.20979466873445987, + "learning_rate": 5.986649979140593e-06, + "loss": 0.2931, + "step": 2377 + }, + { + "epoch": 2.675295441755768, + "grad_norm": 0.21119960362679138, + "learning_rate": 5.965790571547768e-06, + "loss": 0.2842, + "step": 2378 + }, + { + "epoch": 2.6764209341586946, + "grad_norm": 0.19883138313973867, + "learning_rate": 5.944931163954944e-06, + "loss": 0.2771, + "step": 2379 + }, + { + "epoch": 2.6775464265616207, + "grad_norm": 0.19968752507295803, + "learning_rate": 5.92407175636212e-06, + "loss": 0.2751, + "step": 2380 + }, + { + "epoch": 2.678671918964547, + "grad_norm": 0.22015736000540867, + "learning_rate": 5.903212348769295e-06, + "loss": 0.2799, + "step": 2381 + }, + { + "epoch": 2.679797411367473, + "grad_norm": 0.21339223053410786, + "learning_rate": 5.882352941176471e-06, + "loss": 0.2869, + "step": 2382 + }, + { + "epoch": 2.6809229037703997, + "grad_norm": 0.19762335590197813, + "learning_rate": 5.861493533583646e-06, + "loss": 0.2829, + "step": 2383 + }, + { + "epoch": 2.6820483961733257, + "grad_norm": 0.2074037589352283, + "learning_rate": 5.840634125990822e-06, + "loss": 0.2909, + "step": 2384 + }, + { + "epoch": 2.683173888576252, + "grad_norm": 0.2117788165142603, + "learning_rate": 5.8197747183979985e-06, + "loss": 0.2732, + "step": 2385 + }, + { + "epoch": 2.684299380979178, + "grad_norm": 0.23282240403764579, + "learning_rate": 5.798915310805173e-06, + "loss": 0.2939, + "step": 2386 + }, + { + "epoch": 2.6854248733821047, + "grad_norm": 0.21921536525716026, + "learning_rate": 5.778055903212349e-06, + "loss": 0.2844, + "step": 2387 + }, + { + "epoch": 2.686550365785031, + "grad_norm": 0.2104234762422923, + "learning_rate": 5.757196495619524e-06, + "loss": 0.2893, + "step": 2388 + }, + { + "epoch": 2.687675858187957, + "grad_norm": 0.20466965592113787, + "learning_rate": 5.7363370880267005e-06, + "loss": 0.2839, + "step": 2389 + }, + { + "epoch": 2.6888013505908837, + "grad_norm": 0.21641871937130808, + "learning_rate": 5.715477680433877e-06, + "loss": 0.2813, + "step": 2390 + }, + { + "epoch": 2.6899268429938097, + "grad_norm": 0.19783335996013016, + "learning_rate": 5.694618272841051e-06, + "loss": 0.2681, + "step": 2391 + }, + { + "epoch": 2.691052335396736, + "grad_norm": 0.23211934348643096, + "learning_rate": 5.673758865248227e-06, + "loss": 0.2737, + "step": 2392 + }, + { + "epoch": 2.692177827799662, + "grad_norm": 0.21211198195733516, + "learning_rate": 5.6528994576554025e-06, + "loss": 0.293, + "step": 2393 + }, + { + "epoch": 2.6933033202025887, + "grad_norm": 0.19763398145044694, + "learning_rate": 5.632040050062579e-06, + "loss": 0.2669, + "step": 2394 + }, + { + "epoch": 2.6944288126055147, + "grad_norm": 0.2084177545225768, + "learning_rate": 5.611180642469755e-06, + "loss": 0.2727, + "step": 2395 + }, + { + "epoch": 2.695554305008441, + "grad_norm": 0.23874699170463168, + "learning_rate": 5.590321234876929e-06, + "loss": 0.2871, + "step": 2396 + }, + { + "epoch": 2.6966797974113677, + "grad_norm": 0.22847881193172173, + "learning_rate": 5.569461827284105e-06, + "loss": 0.2704, + "step": 2397 + }, + { + "epoch": 2.6978052898142937, + "grad_norm": 0.21260192114331203, + "learning_rate": 5.548602419691281e-06, + "loss": 0.2765, + "step": 2398 + }, + { + "epoch": 2.69893078221722, + "grad_norm": 0.2481199298540293, + "learning_rate": 5.527743012098457e-06, + "loss": 0.2978, + "step": 2399 + }, + { + "epoch": 2.700056274620146, + "grad_norm": 0.22837599699280178, + "learning_rate": 5.506883604505633e-06, + "loss": 0.2778, + "step": 2400 + }, + { + "epoch": 2.7011817670230727, + "grad_norm": 0.20862397407681496, + "learning_rate": 5.486024196912807e-06, + "loss": 0.282, + "step": 2401 + }, + { + "epoch": 2.702307259425999, + "grad_norm": 0.21499220301713443, + "learning_rate": 5.4651647893199835e-06, + "loss": 0.2946, + "step": 2402 + }, + { + "epoch": 2.703432751828925, + "grad_norm": 0.21876537448943154, + "learning_rate": 5.444305381727159e-06, + "loss": 0.2999, + "step": 2403 + }, + { + "epoch": 2.704558244231851, + "grad_norm": 0.19584715347664308, + "learning_rate": 5.423445974134335e-06, + "loss": 0.2811, + "step": 2404 + }, + { + "epoch": 2.7056837366347777, + "grad_norm": 0.199638455026977, + "learning_rate": 5.402586566541511e-06, + "loss": 0.2922, + "step": 2405 + }, + { + "epoch": 2.706809229037704, + "grad_norm": 0.1959593267413218, + "learning_rate": 5.381727158948686e-06, + "loss": 0.2803, + "step": 2406 + }, + { + "epoch": 2.70793472144063, + "grad_norm": 0.2172940746080715, + "learning_rate": 5.360867751355862e-06, + "loss": 0.2998, + "step": 2407 + }, + { + "epoch": 2.7090602138435567, + "grad_norm": 0.21226736116643366, + "learning_rate": 5.340008343763037e-06, + "loss": 0.2793, + "step": 2408 + }, + { + "epoch": 2.7101857062464827, + "grad_norm": 0.2292349157751999, + "learning_rate": 5.319148936170213e-06, + "loss": 0.2925, + "step": 2409 + }, + { + "epoch": 2.711311198649409, + "grad_norm": 0.2065757335406282, + "learning_rate": 5.298289528577389e-06, + "loss": 0.2852, + "step": 2410 + }, + { + "epoch": 2.7124366910523356, + "grad_norm": 0.21318268769213233, + "learning_rate": 5.2774301209845645e-06, + "loss": 0.3101, + "step": 2411 + }, + { + "epoch": 2.7135621834552617, + "grad_norm": 0.19490619907298698, + "learning_rate": 5.25657071339174e-06, + "loss": 0.2845, + "step": 2412 + }, + { + "epoch": 2.7146876758581877, + "grad_norm": 0.20387752435372739, + "learning_rate": 5.235711305798915e-06, + "loss": 0.2891, + "step": 2413 + }, + { + "epoch": 2.715813168261114, + "grad_norm": 0.21901749886212424, + "learning_rate": 5.214851898206091e-06, + "loss": 0.2829, + "step": 2414 + }, + { + "epoch": 2.7169386606640407, + "grad_norm": 0.21269033641949117, + "learning_rate": 5.193992490613267e-06, + "loss": 0.2866, + "step": 2415 + }, + { + "epoch": 2.7180641530669667, + "grad_norm": 0.204672643305428, + "learning_rate": 5.173133083020443e-06, + "loss": 0.2928, + "step": 2416 + }, + { + "epoch": 2.719189645469893, + "grad_norm": 0.22695422804231133, + "learning_rate": 5.152273675427618e-06, + "loss": 0.3034, + "step": 2417 + }, + { + "epoch": 2.720315137872819, + "grad_norm": 0.20992539451243944, + "learning_rate": 5.131414267834793e-06, + "loss": 0.2756, + "step": 2418 + }, + { + "epoch": 2.7214406302757457, + "grad_norm": 0.23946364092165132, + "learning_rate": 5.110554860241969e-06, + "loss": 0.2619, + "step": 2419 + }, + { + "epoch": 2.722566122678672, + "grad_norm": 0.22133441280710264, + "learning_rate": 5.0896954526491455e-06, + "loss": 0.3024, + "step": 2420 + }, + { + "epoch": 2.723691615081598, + "grad_norm": 0.19132609073407808, + "learning_rate": 5.068836045056321e-06, + "loss": 0.2809, + "step": 2421 + }, + { + "epoch": 2.724817107484524, + "grad_norm": 0.20537504751422384, + "learning_rate": 5.047976637463496e-06, + "loss": 0.2981, + "step": 2422 + }, + { + "epoch": 2.7259425998874507, + "grad_norm": 0.2026641684698212, + "learning_rate": 5.027117229870671e-06, + "loss": 0.2831, + "step": 2423 + }, + { + "epoch": 2.727068092290377, + "grad_norm": 0.220392207872778, + "learning_rate": 5.0062578222778475e-06, + "loss": 0.2974, + "step": 2424 + }, + { + "epoch": 2.728193584693303, + "grad_norm": 0.20374793230025023, + "learning_rate": 4.985398414685024e-06, + "loss": 0.2843, + "step": 2425 + }, + { + "epoch": 2.7293190770962297, + "grad_norm": 0.2182187646308083, + "learning_rate": 4.964539007092199e-06, + "loss": 0.2827, + "step": 2426 + }, + { + "epoch": 2.7304445694991557, + "grad_norm": 0.20515095934912667, + "learning_rate": 4.943679599499374e-06, + "loss": 0.2823, + "step": 2427 + }, + { + "epoch": 2.731570061902082, + "grad_norm": 0.2274911617803538, + "learning_rate": 4.9228201919065495e-06, + "loss": 0.2874, + "step": 2428 + }, + { + "epoch": 2.7326955543050087, + "grad_norm": 0.20240468754950888, + "learning_rate": 4.901960784313726e-06, + "loss": 0.2901, + "step": 2429 + }, + { + "epoch": 2.7338210467079347, + "grad_norm": 0.21135908550005916, + "learning_rate": 4.881101376720902e-06, + "loss": 0.2792, + "step": 2430 + }, + { + "epoch": 2.734946539110861, + "grad_norm": 0.21034155921896855, + "learning_rate": 4.860241969128077e-06, + "loss": 0.2911, + "step": 2431 + }, + { + "epoch": 2.736072031513787, + "grad_norm": 0.2073170761162975, + "learning_rate": 4.839382561535253e-06, + "loss": 0.2768, + "step": 2432 + }, + { + "epoch": 2.7371975239167137, + "grad_norm": 0.20922141980047607, + "learning_rate": 4.818523153942428e-06, + "loss": 0.2818, + "step": 2433 + }, + { + "epoch": 2.7383230163196397, + "grad_norm": 0.20817722346637343, + "learning_rate": 4.797663746349604e-06, + "loss": 0.2799, + "step": 2434 + }, + { + "epoch": 2.739448508722566, + "grad_norm": 0.20367161604931538, + "learning_rate": 4.77680433875678e-06, + "loss": 0.3002, + "step": 2435 + }, + { + "epoch": 2.740574001125492, + "grad_norm": 0.20301772018260328, + "learning_rate": 4.755944931163955e-06, + "loss": 0.2734, + "step": 2436 + }, + { + "epoch": 2.7416994935284187, + "grad_norm": 0.19056871488718577, + "learning_rate": 4.735085523571131e-06, + "loss": 0.2753, + "step": 2437 + }, + { + "epoch": 2.742824985931345, + "grad_norm": 0.25569233009986714, + "learning_rate": 4.714226115978306e-06, + "loss": 0.2991, + "step": 2438 + }, + { + "epoch": 2.743950478334271, + "grad_norm": 0.20649414202471206, + "learning_rate": 4.693366708385482e-06, + "loss": 0.2754, + "step": 2439 + }, + { + "epoch": 2.7450759707371977, + "grad_norm": 0.20526360074859962, + "learning_rate": 4.672507300792658e-06, + "loss": 0.2846, + "step": 2440 + }, + { + "epoch": 2.7462014631401237, + "grad_norm": 0.19992362065471733, + "learning_rate": 4.651647893199833e-06, + "loss": 0.2883, + "step": 2441 + }, + { + "epoch": 2.74732695554305, + "grad_norm": 0.20760115590523082, + "learning_rate": 4.6307884856070095e-06, + "loss": 0.2734, + "step": 2442 + }, + { + "epoch": 2.748452447945976, + "grad_norm": 0.20022172726588305, + "learning_rate": 4.609929078014184e-06, + "loss": 0.2746, + "step": 2443 + }, + { + "epoch": 2.7495779403489027, + "grad_norm": 0.21742210771505113, + "learning_rate": 4.58906967042136e-06, + "loss": 0.2712, + "step": 2444 + }, + { + "epoch": 2.7507034327518287, + "grad_norm": 0.1894221520336593, + "learning_rate": 4.568210262828536e-06, + "loss": 0.269, + "step": 2445 + }, + { + "epoch": 2.751828925154755, + "grad_norm": 0.2014338015112663, + "learning_rate": 4.5473508552357115e-06, + "loss": 0.2754, + "step": 2446 + }, + { + "epoch": 2.7529544175576817, + "grad_norm": 0.2767717819280576, + "learning_rate": 4.526491447642888e-06, + "loss": 0.2704, + "step": 2447 + }, + { + "epoch": 2.7540799099606077, + "grad_norm": 0.2380476325461093, + "learning_rate": 4.505632040050062e-06, + "loss": 0.2861, + "step": 2448 + }, + { + "epoch": 2.755205402363534, + "grad_norm": 0.21164580868587568, + "learning_rate": 4.484772632457238e-06, + "loss": 0.2768, + "step": 2449 + }, + { + "epoch": 2.75633089476646, + "grad_norm": 0.2008040936085221, + "learning_rate": 4.463913224864414e-06, + "loss": 0.275, + "step": 2450 + }, + { + "epoch": 2.7574563871693867, + "grad_norm": 0.227682709177952, + "learning_rate": 4.44305381727159e-06, + "loss": 0.3032, + "step": 2451 + }, + { + "epoch": 2.758581879572313, + "grad_norm": 0.24775815808302365, + "learning_rate": 4.422194409678766e-06, + "loss": 0.2952, + "step": 2452 + }, + { + "epoch": 2.759707371975239, + "grad_norm": 0.23280742706720892, + "learning_rate": 4.401335002085941e-06, + "loss": 0.2773, + "step": 2453 + }, + { + "epoch": 2.760832864378165, + "grad_norm": 0.19451290711254568, + "learning_rate": 4.380475594493116e-06, + "loss": 0.2693, + "step": 2454 + }, + { + "epoch": 2.7619583567810917, + "grad_norm": 0.20939579388836216, + "learning_rate": 4.3596161869002925e-06, + "loss": 0.2866, + "step": 2455 + }, + { + "epoch": 2.763083849184018, + "grad_norm": 0.20672270738688484, + "learning_rate": 4.338756779307468e-06, + "loss": 0.2904, + "step": 2456 + }, + { + "epoch": 2.764209341586944, + "grad_norm": 0.19033409152598357, + "learning_rate": 4.317897371714644e-06, + "loss": 0.2718, + "step": 2457 + }, + { + "epoch": 2.7653348339898707, + "grad_norm": 0.2166789978324445, + "learning_rate": 4.297037964121819e-06, + "loss": 0.2983, + "step": 2458 + }, + { + "epoch": 2.7664603263927967, + "grad_norm": 0.216447417269072, + "learning_rate": 4.2761785565289945e-06, + "loss": 0.2858, + "step": 2459 + }, + { + "epoch": 2.767585818795723, + "grad_norm": 0.362675866181273, + "learning_rate": 4.255319148936171e-06, + "loss": 0.2881, + "step": 2460 + }, + { + "epoch": 2.7687113111986497, + "grad_norm": 0.19521272525143493, + "learning_rate": 4.234459741343346e-06, + "loss": 0.2724, + "step": 2461 + }, + { + "epoch": 2.7698368036015757, + "grad_norm": 0.19682316401858674, + "learning_rate": 4.213600333750522e-06, + "loss": 0.2859, + "step": 2462 + }, + { + "epoch": 2.7709622960045017, + "grad_norm": 0.21277271670047132, + "learning_rate": 4.192740926157697e-06, + "loss": 0.2974, + "step": 2463 + }, + { + "epoch": 2.772087788407428, + "grad_norm": 0.21323098001422092, + "learning_rate": 4.171881518564873e-06, + "loss": 0.2858, + "step": 2464 + }, + { + "epoch": 2.7732132808103547, + "grad_norm": 0.2443899119261561, + "learning_rate": 4.151022110972049e-06, + "loss": 0.2851, + "step": 2465 + }, + { + "epoch": 2.7743387732132807, + "grad_norm": 0.2139564808006101, + "learning_rate": 4.130162703379224e-06, + "loss": 0.2949, + "step": 2466 + }, + { + "epoch": 2.775464265616207, + "grad_norm": 0.2212119000303061, + "learning_rate": 4.1093032957864e-06, + "loss": 0.2751, + "step": 2467 + }, + { + "epoch": 2.776589758019133, + "grad_norm": 0.20484020499228098, + "learning_rate": 4.0884438881935755e-06, + "loss": 0.2771, + "step": 2468 + }, + { + "epoch": 2.7777152504220597, + "grad_norm": 0.20123952910830462, + "learning_rate": 4.067584480600751e-06, + "loss": 0.3006, + "step": 2469 + }, + { + "epoch": 2.778840742824986, + "grad_norm": 0.21577294384729115, + "learning_rate": 4.046725073007927e-06, + "loss": 0.2996, + "step": 2470 + }, + { + "epoch": 2.779966235227912, + "grad_norm": 0.22582357217712795, + "learning_rate": 4.025865665415102e-06, + "loss": 0.2717, + "step": 2471 + }, + { + "epoch": 2.7810917276308382, + "grad_norm": 0.2134397866045082, + "learning_rate": 4.005006257822278e-06, + "loss": 0.2825, + "step": 2472 + }, + { + "epoch": 2.7822172200337647, + "grad_norm": 0.20324964622528435, + "learning_rate": 3.984146850229454e-06, + "loss": 0.2799, + "step": 2473 + }, + { + "epoch": 2.783342712436691, + "grad_norm": 0.20795174662693527, + "learning_rate": 3.963287442636629e-06, + "loss": 0.2854, + "step": 2474 + }, + { + "epoch": 2.784468204839617, + "grad_norm": 0.1997012004956189, + "learning_rate": 3.942428035043805e-06, + "loss": 0.2764, + "step": 2475 + }, + { + "epoch": 2.7855936972425437, + "grad_norm": 0.2096005762129211, + "learning_rate": 3.92156862745098e-06, + "loss": 0.276, + "step": 2476 + }, + { + "epoch": 2.7867191896454697, + "grad_norm": 0.2177380985810915, + "learning_rate": 3.9007092198581565e-06, + "loss": 0.2893, + "step": 2477 + }, + { + "epoch": 2.787844682048396, + "grad_norm": 0.22482324020736472, + "learning_rate": 3.879849812265332e-06, + "loss": 0.2657, + "step": 2478 + }, + { + "epoch": 2.7889701744513227, + "grad_norm": 0.19885926541241514, + "learning_rate": 3.858990404672508e-06, + "loss": 0.2884, + "step": 2479 + }, + { + "epoch": 2.7900956668542487, + "grad_norm": 0.19908222649936524, + "learning_rate": 3.838130997079683e-06, + "loss": 0.2742, + "step": 2480 + }, + { + "epoch": 2.791221159257175, + "grad_norm": 0.20760067726563736, + "learning_rate": 3.8172715894868585e-06, + "loss": 0.296, + "step": 2481 + }, + { + "epoch": 2.792346651660101, + "grad_norm": 0.2533774016177821, + "learning_rate": 3.7964121818940346e-06, + "loss": 0.2861, + "step": 2482 + }, + { + "epoch": 2.7934721440630277, + "grad_norm": 0.21052580972405535, + "learning_rate": 3.7755527743012103e-06, + "loss": 0.288, + "step": 2483 + }, + { + "epoch": 2.7945976364659537, + "grad_norm": 0.206699855575665, + "learning_rate": 3.754693366708386e-06, + "loss": 0.288, + "step": 2484 + }, + { + "epoch": 2.79572312886888, + "grad_norm": 0.19585545352446415, + "learning_rate": 3.733833959115561e-06, + "loss": 0.2747, + "step": 2485 + }, + { + "epoch": 2.7968486212718062, + "grad_norm": 0.2062750655598761, + "learning_rate": 3.7129745515227366e-06, + "loss": 0.275, + "step": 2486 + }, + { + "epoch": 2.7979741136747327, + "grad_norm": 0.19740523384916275, + "learning_rate": 3.6921151439299128e-06, + "loss": 0.2762, + "step": 2487 + }, + { + "epoch": 2.799099606077659, + "grad_norm": 0.23460327350417823, + "learning_rate": 3.6712557363370885e-06, + "loss": 0.2663, + "step": 2488 + }, + { + "epoch": 2.800225098480585, + "grad_norm": 0.27745757559360934, + "learning_rate": 3.650396328744264e-06, + "loss": 0.318, + "step": 2489 + }, + { + "epoch": 2.8013505908835117, + "grad_norm": 0.20040376636692844, + "learning_rate": 3.629536921151439e-06, + "loss": 0.2947, + "step": 2490 + }, + { + "epoch": 2.8024760832864377, + "grad_norm": 0.2065083158200927, + "learning_rate": 3.608677513558615e-06, + "loss": 0.2856, + "step": 2491 + }, + { + "epoch": 2.803601575689364, + "grad_norm": 0.2609652931637163, + "learning_rate": 3.587818105965791e-06, + "loss": 0.2778, + "step": 2492 + }, + { + "epoch": 2.80472706809229, + "grad_norm": 0.19707420225094988, + "learning_rate": 3.5669586983729666e-06, + "loss": 0.2847, + "step": 2493 + }, + { + "epoch": 2.8058525604952167, + "grad_norm": 0.214409640748111, + "learning_rate": 3.5460992907801423e-06, + "loss": 0.2728, + "step": 2494 + }, + { + "epoch": 2.8069780528981427, + "grad_norm": 0.21397007514173588, + "learning_rate": 3.525239883187317e-06, + "loss": 0.2939, + "step": 2495 + }, + { + "epoch": 2.808103545301069, + "grad_norm": 0.2211780277225568, + "learning_rate": 3.5043804755944933e-06, + "loss": 0.2941, + "step": 2496 + }, + { + "epoch": 2.8092290377039957, + "grad_norm": 0.19746449192767923, + "learning_rate": 3.483521068001669e-06, + "loss": 0.2865, + "step": 2497 + }, + { + "epoch": 2.8103545301069217, + "grad_norm": 0.24933709829603812, + "learning_rate": 3.4626616604088447e-06, + "loss": 0.292, + "step": 2498 + }, + { + "epoch": 2.811480022509848, + "grad_norm": 0.20025420830978435, + "learning_rate": 3.4418022528160205e-06, + "loss": 0.2936, + "step": 2499 + }, + { + "epoch": 2.812605514912774, + "grad_norm": 0.21106860467217742, + "learning_rate": 3.4209428452231953e-06, + "loss": 0.2749, + "step": 2500 + }, + { + "epoch": 2.8137310073157007, + "grad_norm": 0.244173358210336, + "learning_rate": 3.4000834376303715e-06, + "loss": 0.2921, + "step": 2501 + }, + { + "epoch": 2.814856499718627, + "grad_norm": 0.23708488619881102, + "learning_rate": 3.379224030037547e-06, + "loss": 0.286, + "step": 2502 + }, + { + "epoch": 2.815981992121553, + "grad_norm": 0.20203389839535874, + "learning_rate": 3.358364622444723e-06, + "loss": 0.2823, + "step": 2503 + }, + { + "epoch": 2.8171074845244792, + "grad_norm": 0.20180941230046498, + "learning_rate": 3.3375052148518986e-06, + "loss": 0.2795, + "step": 2504 + }, + { + "epoch": 2.8182329769274057, + "grad_norm": 0.1923974204466329, + "learning_rate": 3.3166458072590743e-06, + "loss": 0.2719, + "step": 2505 + }, + { + "epoch": 2.819358469330332, + "grad_norm": 0.210276870318239, + "learning_rate": 3.2957863996662496e-06, + "loss": 0.2831, + "step": 2506 + }, + { + "epoch": 2.820483961733258, + "grad_norm": 0.21238988385557064, + "learning_rate": 3.2749269920734253e-06, + "loss": 0.278, + "step": 2507 + }, + { + "epoch": 2.8216094541361847, + "grad_norm": 0.19338349502478416, + "learning_rate": 3.254067584480601e-06, + "loss": 0.2717, + "step": 2508 + }, + { + "epoch": 2.8227349465391107, + "grad_norm": 0.19141784353147265, + "learning_rate": 3.2332081768877767e-06, + "loss": 0.2687, + "step": 2509 + }, + { + "epoch": 2.823860438942037, + "grad_norm": 0.21299210269078242, + "learning_rate": 3.2123487692949525e-06, + "loss": 0.273, + "step": 2510 + }, + { + "epoch": 2.8249859313449637, + "grad_norm": 0.19645234999837438, + "learning_rate": 3.1914893617021277e-06, + "loss": 0.274, + "step": 2511 + }, + { + "epoch": 2.8261114237478897, + "grad_norm": 0.2154736738045129, + "learning_rate": 3.1706299541093035e-06, + "loss": 0.2798, + "step": 2512 + }, + { + "epoch": 2.8272369161508157, + "grad_norm": 0.20747340932057978, + "learning_rate": 3.149770546516479e-06, + "loss": 0.2911, + "step": 2513 + }, + { + "epoch": 2.828362408553742, + "grad_norm": 0.21668977371189152, + "learning_rate": 3.128911138923655e-06, + "loss": 0.2809, + "step": 2514 + }, + { + "epoch": 2.8294879009566687, + "grad_norm": 0.19752342856018376, + "learning_rate": 3.10805173133083e-06, + "loss": 0.2913, + "step": 2515 + }, + { + "epoch": 2.8306133933595947, + "grad_norm": 0.2129254469727926, + "learning_rate": 3.087192323738006e-06, + "loss": 0.2991, + "step": 2516 + }, + { + "epoch": 2.831738885762521, + "grad_norm": 0.2135496695763609, + "learning_rate": 3.0663329161451816e-06, + "loss": 0.3121, + "step": 2517 + }, + { + "epoch": 2.8328643781654472, + "grad_norm": 0.20217705970631564, + "learning_rate": 3.0454735085523573e-06, + "loss": 0.2699, + "step": 2518 + }, + { + "epoch": 2.8339898705683737, + "grad_norm": 0.18655748638369685, + "learning_rate": 3.024614100959533e-06, + "loss": 0.281, + "step": 2519 + }, + { + "epoch": 2.8351153629713, + "grad_norm": 0.20710373842178917, + "learning_rate": 3.0037546933667083e-06, + "loss": 0.2818, + "step": 2520 + }, + { + "epoch": 2.836240855374226, + "grad_norm": 0.1889143314236119, + "learning_rate": 2.982895285773884e-06, + "loss": 0.2686, + "step": 2521 + }, + { + "epoch": 2.8373663477771522, + "grad_norm": 0.1916678750234142, + "learning_rate": 2.96203587818106e-06, + "loss": 0.28, + "step": 2522 + }, + { + "epoch": 2.8384918401800787, + "grad_norm": 0.19425390855255784, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.2979, + "step": 2523 + }, + { + "epoch": 2.839617332583005, + "grad_norm": 0.20681634843217112, + "learning_rate": 2.920317062995411e-06, + "loss": 0.2871, + "step": 2524 + }, + { + "epoch": 2.8407428249859312, + "grad_norm": 0.20023152119316995, + "learning_rate": 2.8994576554025865e-06, + "loss": 0.2801, + "step": 2525 + }, + { + "epoch": 2.8418683173888577, + "grad_norm": 0.20177659743093546, + "learning_rate": 2.878598247809762e-06, + "loss": 0.2816, + "step": 2526 + }, + { + "epoch": 2.8429938097917837, + "grad_norm": 0.19585899941288504, + "learning_rate": 2.8577388402169383e-06, + "loss": 0.2679, + "step": 2527 + }, + { + "epoch": 2.84411930219471, + "grad_norm": 0.17859425580387345, + "learning_rate": 2.8368794326241136e-06, + "loss": 0.2707, + "step": 2528 + }, + { + "epoch": 2.8452447945976367, + "grad_norm": 0.2039828332143888, + "learning_rate": 2.8160200250312893e-06, + "loss": 0.2991, + "step": 2529 + }, + { + "epoch": 2.8463702870005627, + "grad_norm": 0.2030074794985201, + "learning_rate": 2.7951606174384646e-06, + "loss": 0.2793, + "step": 2530 + }, + { + "epoch": 2.847495779403489, + "grad_norm": 0.19766777443355418, + "learning_rate": 2.7743012098456403e-06, + "loss": 0.2823, + "step": 2531 + }, + { + "epoch": 2.8486212718064152, + "grad_norm": 0.19892663863890306, + "learning_rate": 2.7534418022528165e-06, + "loss": 0.2806, + "step": 2532 + }, + { + "epoch": 2.8497467642093417, + "grad_norm": 0.21275315081509835, + "learning_rate": 2.7325823946599917e-06, + "loss": 0.2701, + "step": 2533 + }, + { + "epoch": 2.8508722566122677, + "grad_norm": 0.21033439554694824, + "learning_rate": 2.7117229870671675e-06, + "loss": 0.2854, + "step": 2534 + }, + { + "epoch": 2.851997749015194, + "grad_norm": 0.19767645497600447, + "learning_rate": 2.690863579474343e-06, + "loss": 0.288, + "step": 2535 + }, + { + "epoch": 2.8531232414181202, + "grad_norm": 0.2081071860608737, + "learning_rate": 2.6700041718815185e-06, + "loss": 0.2853, + "step": 2536 + }, + { + "epoch": 2.8542487338210467, + "grad_norm": 0.19156924866913874, + "learning_rate": 2.6491447642886946e-06, + "loss": 0.2768, + "step": 2537 + }, + { + "epoch": 2.855374226223973, + "grad_norm": 0.21948381030849073, + "learning_rate": 2.62828535669587e-06, + "loss": 0.2862, + "step": 2538 + }, + { + "epoch": 2.856499718626899, + "grad_norm": 0.194801788917978, + "learning_rate": 2.6074259491030456e-06, + "loss": 0.2797, + "step": 2539 + }, + { + "epoch": 2.8576252110298257, + "grad_norm": 0.22618411821723028, + "learning_rate": 2.5865665415102213e-06, + "loss": 0.3074, + "step": 2540 + }, + { + "epoch": 2.8587507034327517, + "grad_norm": 0.20254836873582036, + "learning_rate": 2.5657071339173966e-06, + "loss": 0.2699, + "step": 2541 + }, + { + "epoch": 2.859876195835678, + "grad_norm": 0.19524970967882507, + "learning_rate": 2.5448477263245727e-06, + "loss": 0.2781, + "step": 2542 + }, + { + "epoch": 2.8610016882386042, + "grad_norm": 0.19523879122973248, + "learning_rate": 2.523988318731748e-06, + "loss": 0.2901, + "step": 2543 + }, + { + "epoch": 2.8621271806415307, + "grad_norm": 0.19385408771091103, + "learning_rate": 2.5031289111389237e-06, + "loss": 0.2861, + "step": 2544 + }, + { + "epoch": 2.8632526730444567, + "grad_norm": 0.19805715117689787, + "learning_rate": 2.4822695035460995e-06, + "loss": 0.2692, + "step": 2545 + }, + { + "epoch": 2.864378165447383, + "grad_norm": 0.19729671382210393, + "learning_rate": 2.4614100959532747e-06, + "loss": 0.293, + "step": 2546 + }, + { + "epoch": 2.8655036578503097, + "grad_norm": 0.19120385629590778, + "learning_rate": 2.440550688360451e-06, + "loss": 0.2809, + "step": 2547 + }, + { + "epoch": 2.8666291502532357, + "grad_norm": 0.20169331917856845, + "learning_rate": 2.4196912807676266e-06, + "loss": 0.2865, + "step": 2548 + }, + { + "epoch": 2.867754642656162, + "grad_norm": 0.20308102680675588, + "learning_rate": 2.398831873174802e-06, + "loss": 0.292, + "step": 2549 + }, + { + "epoch": 2.8688801350590882, + "grad_norm": 0.21920991074207272, + "learning_rate": 2.3779724655819776e-06, + "loss": 0.2769, + "step": 2550 + }, + { + "epoch": 2.8700056274620147, + "grad_norm": 0.1909570999276725, + "learning_rate": 2.357113057989153e-06, + "loss": 0.2826, + "step": 2551 + }, + { + "epoch": 2.871131119864941, + "grad_norm": 0.20820870859741275, + "learning_rate": 2.336253650396329e-06, + "loss": 0.2811, + "step": 2552 + }, + { + "epoch": 2.872256612267867, + "grad_norm": 0.19636795603937476, + "learning_rate": 2.3153942428035047e-06, + "loss": 0.2817, + "step": 2553 + }, + { + "epoch": 2.8733821046707932, + "grad_norm": 0.2072299777143624, + "learning_rate": 2.29453483521068e-06, + "loss": 0.2782, + "step": 2554 + }, + { + "epoch": 2.8745075970737197, + "grad_norm": 0.21169030898396585, + "learning_rate": 2.2736754276178557e-06, + "loss": 0.2674, + "step": 2555 + }, + { + "epoch": 2.875633089476646, + "grad_norm": 0.21706530639456728, + "learning_rate": 2.252816020025031e-06, + "loss": 0.2759, + "step": 2556 + }, + { + "epoch": 2.8767585818795722, + "grad_norm": 0.2028602319447674, + "learning_rate": 2.231956612432207e-06, + "loss": 0.2785, + "step": 2557 + }, + { + "epoch": 2.8778840742824987, + "grad_norm": 0.19144432960054086, + "learning_rate": 2.211097204839383e-06, + "loss": 0.2833, + "step": 2558 + }, + { + "epoch": 2.8790095666854247, + "grad_norm": 0.2093653426820825, + "learning_rate": 2.190237797246558e-06, + "loss": 0.2815, + "step": 2559 + }, + { + "epoch": 2.880135059088351, + "grad_norm": 0.18440025227997578, + "learning_rate": 2.169378389653734e-06, + "loss": 0.2786, + "step": 2560 + }, + { + "epoch": 2.8812605514912777, + "grad_norm": 0.19656732367474106, + "learning_rate": 2.1485189820609096e-06, + "loss": 0.2803, + "step": 2561 + }, + { + "epoch": 2.8823860438942037, + "grad_norm": 0.19783394536320123, + "learning_rate": 2.1276595744680853e-06, + "loss": 0.2812, + "step": 2562 + }, + { + "epoch": 2.8835115362971298, + "grad_norm": 0.18647997820338216, + "learning_rate": 2.106800166875261e-06, + "loss": 0.2781, + "step": 2563 + }, + { + "epoch": 2.8846370287000562, + "grad_norm": 0.19982575823634316, + "learning_rate": 2.0859407592824363e-06, + "loss": 0.2816, + "step": 2564 + }, + { + "epoch": 2.8857625211029827, + "grad_norm": 0.195813225255546, + "learning_rate": 2.065081351689612e-06, + "loss": 0.2788, + "step": 2565 + }, + { + "epoch": 2.8868880135059087, + "grad_norm": 0.2043720027738115, + "learning_rate": 2.0442219440967877e-06, + "loss": 0.286, + "step": 2566 + }, + { + "epoch": 2.888013505908835, + "grad_norm": 0.19093315445014475, + "learning_rate": 2.0233625365039634e-06, + "loss": 0.2707, + "step": 2567 + }, + { + "epoch": 2.8891389983117612, + "grad_norm": 0.21027820905380143, + "learning_rate": 2.002503128911139e-06, + "loss": 0.3025, + "step": 2568 + }, + { + "epoch": 2.8902644907146877, + "grad_norm": 0.2026225980219709, + "learning_rate": 1.9816437213183145e-06, + "loss": 0.2853, + "step": 2569 + }, + { + "epoch": 2.891389983117614, + "grad_norm": 0.1861779080477095, + "learning_rate": 1.96078431372549e-06, + "loss": 0.2783, + "step": 2570 + }, + { + "epoch": 2.8925154755205402, + "grad_norm": 0.20932901602980997, + "learning_rate": 1.939924906132666e-06, + "loss": 0.2885, + "step": 2571 + }, + { + "epoch": 2.8936409679234663, + "grad_norm": 0.2073789548556899, + "learning_rate": 1.9190654985398416e-06, + "loss": 0.2801, + "step": 2572 + }, + { + "epoch": 2.8947664603263927, + "grad_norm": 0.18609584484982244, + "learning_rate": 1.8982060909470173e-06, + "loss": 0.2744, + "step": 2573 + }, + { + "epoch": 2.895891952729319, + "grad_norm": 0.1812033482553102, + "learning_rate": 1.877346683354193e-06, + "loss": 0.2859, + "step": 2574 + }, + { + "epoch": 2.8970174451322452, + "grad_norm": 0.20994333414010466, + "learning_rate": 1.8564872757613683e-06, + "loss": 0.2904, + "step": 2575 + }, + { + "epoch": 2.8981429375351717, + "grad_norm": 0.19743578503548526, + "learning_rate": 1.8356278681685442e-06, + "loss": 0.2761, + "step": 2576 + }, + { + "epoch": 2.8992684299380977, + "grad_norm": 0.19166570858524684, + "learning_rate": 1.8147684605757195e-06, + "loss": 0.2918, + "step": 2577 + }, + { + "epoch": 2.9003939223410242, + "grad_norm": 0.20463881347239937, + "learning_rate": 1.7939090529828954e-06, + "loss": 0.2889, + "step": 2578 + }, + { + "epoch": 2.9015194147439507, + "grad_norm": 0.19047750537612482, + "learning_rate": 1.7730496453900712e-06, + "loss": 0.2864, + "step": 2579 + }, + { + "epoch": 2.9026449071468767, + "grad_norm": 0.19091573754700542, + "learning_rate": 1.7521902377972467e-06, + "loss": 0.2927, + "step": 2580 + }, + { + "epoch": 2.903770399549803, + "grad_norm": 0.1876198954341774, + "learning_rate": 1.7313308302044224e-06, + "loss": 0.268, + "step": 2581 + }, + { + "epoch": 2.9048958919527292, + "grad_norm": 0.20218741071231236, + "learning_rate": 1.7104714226115977e-06, + "loss": 0.2818, + "step": 2582 + }, + { + "epoch": 2.9060213843556557, + "grad_norm": 0.19751340574083814, + "learning_rate": 1.6896120150187736e-06, + "loss": 0.289, + "step": 2583 + }, + { + "epoch": 2.9071468767585817, + "grad_norm": 0.1929293500822755, + "learning_rate": 1.6687526074259493e-06, + "loss": 0.2701, + "step": 2584 + }, + { + "epoch": 2.908272369161508, + "grad_norm": 0.19409315370373764, + "learning_rate": 1.6478931998331248e-06, + "loss": 0.294, + "step": 2585 + }, + { + "epoch": 2.9093978615644343, + "grad_norm": 0.19513663853793442, + "learning_rate": 1.6270337922403005e-06, + "loss": 0.2909, + "step": 2586 + }, + { + "epoch": 2.9105233539673607, + "grad_norm": 0.18520192324031312, + "learning_rate": 1.6061743846474762e-06, + "loss": 0.2879, + "step": 2587 + }, + { + "epoch": 2.911648846370287, + "grad_norm": 0.19393576190790643, + "learning_rate": 1.5853149770546517e-06, + "loss": 0.2818, + "step": 2588 + }, + { + "epoch": 2.9127743387732132, + "grad_norm": 0.1913327489426411, + "learning_rate": 1.5644555694618274e-06, + "loss": 0.2845, + "step": 2589 + }, + { + "epoch": 2.9138998311761397, + "grad_norm": 0.19600196772862405, + "learning_rate": 1.543596161869003e-06, + "loss": 0.2877, + "step": 2590 + }, + { + "epoch": 2.9150253235790657, + "grad_norm": 0.2003116343512408, + "learning_rate": 1.5227367542761787e-06, + "loss": 0.2761, + "step": 2591 + }, + { + "epoch": 2.916150815981992, + "grad_norm": 0.18645182971152968, + "learning_rate": 1.5018773466833542e-06, + "loss": 0.2851, + "step": 2592 + }, + { + "epoch": 2.9172763083849182, + "grad_norm": 0.20894705309538278, + "learning_rate": 1.48101793909053e-06, + "loss": 0.2903, + "step": 2593 + }, + { + "epoch": 2.9184018007878447, + "grad_norm": 0.19120582673378814, + "learning_rate": 1.4601585314977056e-06, + "loss": 0.2776, + "step": 2594 + }, + { + "epoch": 2.9195272931907708, + "grad_norm": 0.20278259638182897, + "learning_rate": 1.439299123904881e-06, + "loss": 0.2787, + "step": 2595 + }, + { + "epoch": 2.9206527855936972, + "grad_norm": 0.19583937073430013, + "learning_rate": 1.4184397163120568e-06, + "loss": 0.2811, + "step": 2596 + }, + { + "epoch": 2.9217782779966237, + "grad_norm": 0.1941542530021111, + "learning_rate": 1.3975803087192323e-06, + "loss": 0.2658, + "step": 2597 + }, + { + "epoch": 2.9229037703995497, + "grad_norm": 0.20963262803457552, + "learning_rate": 1.3767209011264082e-06, + "loss": 0.278, + "step": 2598 + }, + { + "epoch": 2.924029262802476, + "grad_norm": 0.2018274661438912, + "learning_rate": 1.3558614935335837e-06, + "loss": 0.294, + "step": 2599 + }, + { + "epoch": 2.9251547552054022, + "grad_norm": 0.19436944269264386, + "learning_rate": 1.3350020859407592e-06, + "loss": 0.2834, + "step": 2600 + }, + { + "epoch": 2.9262802476083287, + "grad_norm": 0.1878287577440724, + "learning_rate": 1.314142678347935e-06, + "loss": 0.2765, + "step": 2601 + }, + { + "epoch": 2.927405740011255, + "grad_norm": 0.18341006129215123, + "learning_rate": 1.2932832707551107e-06, + "loss": 0.2746, + "step": 2602 + }, + { + "epoch": 2.9285312324141812, + "grad_norm": 0.19735771783507766, + "learning_rate": 1.2724238631622864e-06, + "loss": 0.2913, + "step": 2603 + }, + { + "epoch": 2.9296567248171073, + "grad_norm": 0.18771599689886934, + "learning_rate": 1.2515644555694619e-06, + "loss": 0.2753, + "step": 2604 + }, + { + "epoch": 2.9307822172200337, + "grad_norm": 0.19841768486183753, + "learning_rate": 1.2307050479766374e-06, + "loss": 0.2814, + "step": 2605 + }, + { + "epoch": 2.93190770962296, + "grad_norm": 0.1956614956245663, + "learning_rate": 1.2098456403838133e-06, + "loss": 0.2743, + "step": 2606 + }, + { + "epoch": 2.9330332020258862, + "grad_norm": 0.2002743148871214, + "learning_rate": 1.1889862327909888e-06, + "loss": 0.2888, + "step": 2607 + }, + { + "epoch": 2.9341586944288127, + "grad_norm": 0.21318426731074547, + "learning_rate": 1.1681268251981645e-06, + "loss": 0.2959, + "step": 2608 + }, + { + "epoch": 2.9352841868317388, + "grad_norm": 0.18809272462436055, + "learning_rate": 1.14726741760534e-06, + "loss": 0.277, + "step": 2609 + }, + { + "epoch": 2.9364096792346652, + "grad_norm": 0.19427439279930914, + "learning_rate": 1.1264080100125155e-06, + "loss": 0.2927, + "step": 2610 + }, + { + "epoch": 2.9375351716375917, + "grad_norm": 0.2079310357704345, + "learning_rate": 1.1055486024196914e-06, + "loss": 0.2828, + "step": 2611 + }, + { + "epoch": 2.9386606640405177, + "grad_norm": 0.19416657363268003, + "learning_rate": 1.084689194826867e-06, + "loss": 0.2911, + "step": 2612 + }, + { + "epoch": 2.9397861564434438, + "grad_norm": 0.19916119078493613, + "learning_rate": 1.0638297872340427e-06, + "loss": 0.2924, + "step": 2613 + }, + { + "epoch": 2.9409116488463702, + "grad_norm": 0.1983245462408925, + "learning_rate": 1.0429703796412182e-06, + "loss": 0.2708, + "step": 2614 + }, + { + "epoch": 2.9420371412492967, + "grad_norm": 0.18590780784131763, + "learning_rate": 1.0221109720483939e-06, + "loss": 0.2863, + "step": 2615 + }, + { + "epoch": 2.9431626336522227, + "grad_norm": 0.1872997802264514, + "learning_rate": 1.0012515644555696e-06, + "loss": 0.2811, + "step": 2616 + }, + { + "epoch": 2.9442881260551492, + "grad_norm": 0.178184423835207, + "learning_rate": 9.80392156862745e-07, + "loss": 0.2731, + "step": 2617 + }, + { + "epoch": 2.9454136184580753, + "grad_norm": 0.20370420485130178, + "learning_rate": 9.595327492699208e-07, + "loss": 0.298, + "step": 2618 + }, + { + "epoch": 2.9465391108610017, + "grad_norm": 0.19363752795605113, + "learning_rate": 9.386733416770965e-07, + "loss": 0.2862, + "step": 2619 + }, + { + "epoch": 2.947664603263928, + "grad_norm": 0.19337040777721937, + "learning_rate": 9.178139340842721e-07, + "loss": 0.2938, + "step": 2620 + }, + { + "epoch": 2.9487900956668542, + "grad_norm": 0.20062145944273124, + "learning_rate": 8.969545264914477e-07, + "loss": 0.2854, + "step": 2621 + }, + { + "epoch": 2.9499155880697803, + "grad_norm": 0.19780261743838537, + "learning_rate": 8.760951188986233e-07, + "loss": 0.2853, + "step": 2622 + }, + { + "epoch": 2.9510410804727067, + "grad_norm": 0.18300708396430374, + "learning_rate": 8.552357113057988e-07, + "loss": 0.2859, + "step": 2623 + }, + { + "epoch": 2.952166572875633, + "grad_norm": 0.19659081949531576, + "learning_rate": 8.343763037129747e-07, + "loss": 0.2873, + "step": 2624 + }, + { + "epoch": 2.9532920652785593, + "grad_norm": 0.18879456754476104, + "learning_rate": 8.135168961201503e-07, + "loss": 0.3059, + "step": 2625 + }, + { + "epoch": 2.9544175576814857, + "grad_norm": 0.1970187549688716, + "learning_rate": 7.926574885273259e-07, + "loss": 0.2751, + "step": 2626 + }, + { + "epoch": 2.9555430500844118, + "grad_norm": 0.18765481178184712, + "learning_rate": 7.717980809345015e-07, + "loss": 0.2828, + "step": 2627 + }, + { + "epoch": 2.9566685424873382, + "grad_norm": 0.18024019057588778, + "learning_rate": 7.509386733416771e-07, + "loss": 0.2796, + "step": 2628 + }, + { + "epoch": 2.9577940348902647, + "grad_norm": 0.1911404613455637, + "learning_rate": 7.300792657488528e-07, + "loss": 0.2857, + "step": 2629 + }, + { + "epoch": 2.9589195272931907, + "grad_norm": 0.19138951788504163, + "learning_rate": 7.092198581560284e-07, + "loss": 0.2792, + "step": 2630 + }, + { + "epoch": 2.9600450196961168, + "grad_norm": 0.1910935556506434, + "learning_rate": 6.883604505632041e-07, + "loss": 0.2845, + "step": 2631 + }, + { + "epoch": 2.9611705120990433, + "grad_norm": 0.19203721334554208, + "learning_rate": 6.675010429703796e-07, + "loss": 0.291, + "step": 2632 + }, + { + "epoch": 2.9622960045019697, + "grad_norm": 0.1928645139619539, + "learning_rate": 6.466416353775553e-07, + "loss": 0.2929, + "step": 2633 + }, + { + "epoch": 2.9634214969048958, + "grad_norm": 0.19918761206527566, + "learning_rate": 6.257822277847309e-07, + "loss": 0.3019, + "step": 2634 + }, + { + "epoch": 2.9645469893078222, + "grad_norm": 0.1854244511600504, + "learning_rate": 6.049228201919066e-07, + "loss": 0.2762, + "step": 2635 + }, + { + "epoch": 2.9656724817107483, + "grad_norm": 0.1790914058015419, + "learning_rate": 5.840634125990823e-07, + "loss": 0.2775, + "step": 2636 + }, + { + "epoch": 2.9667979741136747, + "grad_norm": 0.18939335360021642, + "learning_rate": 5.632040050062578e-07, + "loss": 0.2868, + "step": 2637 + }, + { + "epoch": 2.967923466516601, + "grad_norm": 0.18950602005484965, + "learning_rate": 5.423445974134335e-07, + "loss": 0.2806, + "step": 2638 + }, + { + "epoch": 2.9690489589195272, + "grad_norm": 0.2057097341756207, + "learning_rate": 5.214851898206091e-07, + "loss": 0.2959, + "step": 2639 + }, + { + "epoch": 2.9701744513224537, + "grad_norm": 0.18914510377229687, + "learning_rate": 5.006257822277848e-07, + "loss": 0.2813, + "step": 2640 + }, + { + "epoch": 2.9712999437253798, + "grad_norm": 0.21148991319548016, + "learning_rate": 4.797663746349604e-07, + "loss": 0.2921, + "step": 2641 + }, + { + "epoch": 2.9724254361283062, + "grad_norm": 0.19281924718786836, + "learning_rate": 4.5890696704213606e-07, + "loss": 0.2855, + "step": 2642 + }, + { + "epoch": 2.9735509285312323, + "grad_norm": 0.18955123443150448, + "learning_rate": 4.3804755944931167e-07, + "loss": 0.2959, + "step": 2643 + }, + { + "epoch": 2.9746764209341587, + "grad_norm": 0.18330368999509078, + "learning_rate": 4.171881518564873e-07, + "loss": 0.2867, + "step": 2644 + }, + { + "epoch": 2.9758019133370848, + "grad_norm": 0.19150353774187667, + "learning_rate": 3.9632874426366293e-07, + "loss": 0.2807, + "step": 2645 + }, + { + "epoch": 2.9769274057400112, + "grad_norm": 0.18078724381796288, + "learning_rate": 3.7546933667083854e-07, + "loss": 0.2659, + "step": 2646 + }, + { + "epoch": 2.9780528981429377, + "grad_norm": 0.1956344780829508, + "learning_rate": 3.546099290780142e-07, + "loss": 0.2836, + "step": 2647 + }, + { + "epoch": 2.9791783905458638, + "grad_norm": 0.18154947344603503, + "learning_rate": 3.337505214851898e-07, + "loss": 0.2691, + "step": 2648 + }, + { + "epoch": 2.9803038829487902, + "grad_norm": 0.19970667773834722, + "learning_rate": 3.1289111389236547e-07, + "loss": 0.2939, + "step": 2649 + }, + { + "epoch": 2.9814293753517163, + "grad_norm": 0.1970797047209464, + "learning_rate": 2.9203170629954113e-07, + "loss": 0.2973, + "step": 2650 + }, + { + "epoch": 2.9825548677546427, + "grad_norm": 0.17558302415161703, + "learning_rate": 2.7117229870671674e-07, + "loss": 0.2687, + "step": 2651 + }, + { + "epoch": 2.983680360157569, + "grad_norm": 0.19984560884749847, + "learning_rate": 2.503128911138924e-07, + "loss": 0.2917, + "step": 2652 + }, + { + "epoch": 2.9848058525604952, + "grad_norm": 0.18904253639700785, + "learning_rate": 2.2945348352106803e-07, + "loss": 0.2881, + "step": 2653 + }, + { + "epoch": 2.9859313449634213, + "grad_norm": 0.18998537420423053, + "learning_rate": 2.0859407592824366e-07, + "loss": 0.2899, + "step": 2654 + }, + { + "epoch": 2.9870568373663478, + "grad_norm": 0.18137614988061299, + "learning_rate": 1.8773466833541927e-07, + "loss": 0.2673, + "step": 2655 + }, + { + "epoch": 2.9881823297692742, + "grad_norm": 0.20380937660099302, + "learning_rate": 1.668752607425949e-07, + "loss": 0.2855, + "step": 2656 + }, + { + "epoch": 2.9893078221722003, + "grad_norm": 0.18697756553685133, + "learning_rate": 1.4601585314977056e-07, + "loss": 0.2855, + "step": 2657 + }, + { + "epoch": 2.9904333145751267, + "grad_norm": 0.18757566803524747, + "learning_rate": 1.251564455569462e-07, + "loss": 0.2881, + "step": 2658 + }, + { + "epoch": 2.9915588069780528, + "grad_norm": 0.18601807443604287, + "learning_rate": 1.0429703796412183e-07, + "loss": 0.2848, + "step": 2659 + }, + { + "epoch": 2.9926842993809792, + "grad_norm": 0.19183363220520985, + "learning_rate": 8.343763037129745e-08, + "loss": 0.2958, + "step": 2660 + }, + { + "epoch": 2.9938097917839057, + "grad_norm": 0.20102940785890344, + "learning_rate": 6.25782227784731e-08, + "loss": 0.281, + "step": 2661 + }, + { + "epoch": 2.9949352841868317, + "grad_norm": 0.189431904457012, + "learning_rate": 4.1718815185648726e-08, + "loss": 0.2813, + "step": 2662 + }, + { + "epoch": 2.996060776589758, + "grad_norm": 0.17662567105058288, + "learning_rate": 2.0859407592824363e-08, + "loss": 0.2782, + "step": 2663 + }, + { + "epoch": 2.9971862689926843, + "grad_norm": 0.18268732320356856, + "learning_rate": 0.0, + "loss": 0.2705, + "step": 2664 + }, + { + "epoch": 2.9971862689926843, + "step": 2664, + "total_flos": 2.27802848659977e+18, + "train_loss": 0.43048976833845404, + "train_runtime": 155129.8221, + "train_samples_per_second": 0.275, + "train_steps_per_second": 0.017 + } + ], + "logging_steps": 1, + "max_steps": 2664, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.27802848659977e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}