{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1308, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0061162079510703364, "grad_norm": 0.796875, "learning_rate": 1.5151515151515152e-07, "loss": 1.9698597192764282, "step": 2 }, { "epoch": 0.012232415902140673, "grad_norm": 1.625, "learning_rate": 4.5454545454545457e-07, "loss": 2.038839101791382, "step": 4 }, { "epoch": 0.01834862385321101, "grad_norm": 0.70703125, "learning_rate": 7.575757575757576e-07, "loss": 2.013974189758301, "step": 6 }, { "epoch": 0.024464831804281346, "grad_norm": 0.83984375, "learning_rate": 1.0606060606060608e-06, "loss": 2.130162239074707, "step": 8 }, { "epoch": 0.03058103975535168, "grad_norm": 0.7890625, "learning_rate": 1.3636363636363636e-06, "loss": 2.004484176635742, "step": 10 }, { "epoch": 0.03669724770642202, "grad_norm": 0.8671875, "learning_rate": 1.6666666666666667e-06, "loss": 2.008946180343628, "step": 12 }, { "epoch": 0.04281345565749235, "grad_norm": 0.74609375, "learning_rate": 1.96969696969697e-06, "loss": 2.0183446407318115, "step": 14 }, { "epoch": 0.04892966360856269, "grad_norm": 1.078125, "learning_rate": 2.2727272727272728e-06, "loss": 1.9834390878677368, "step": 16 }, { "epoch": 0.05504587155963303, "grad_norm": 0.75390625, "learning_rate": 2.575757575757576e-06, "loss": 2.044725179672241, "step": 18 }, { "epoch": 0.06116207951070336, "grad_norm": 0.8828125, "learning_rate": 2.8787878787878793e-06, "loss": 2.3765246868133545, "step": 20 }, { "epoch": 0.0672782874617737, "grad_norm": 0.62890625, "learning_rate": 3.181818181818182e-06, "loss": 2.135927438735962, "step": 22 }, { "epoch": 0.07339449541284404, "grad_norm": 1.1484375, "learning_rate": 3.4848484848484854e-06, "loss": 2.1705098152160645, "step": 24 }, { "epoch": 0.07951070336391437, "grad_norm": 1.2421875, "learning_rate": 3.7878787878787882e-06, "loss": 2.0761096477508545, "step": 26 }, { "epoch": 0.0856269113149847, "grad_norm": 2.03125, "learning_rate": 4.0909090909090915e-06, "loss": 2.0886521339416504, "step": 28 }, { "epoch": 0.09174311926605505, "grad_norm": 0.921875, "learning_rate": 4.393939393939394e-06, "loss": 2.201143980026245, "step": 30 }, { "epoch": 0.09785932721712538, "grad_norm": 8.625, "learning_rate": 4.696969696969698e-06, "loss": 2.045245885848999, "step": 32 }, { "epoch": 0.10397553516819572, "grad_norm": 0.65234375, "learning_rate": 5e-06, "loss": 1.907004475593567, "step": 34 }, { "epoch": 0.11009174311926606, "grad_norm": 1.2265625, "learning_rate": 5.303030303030303e-06, "loss": 1.9484505653381348, "step": 36 }, { "epoch": 0.1162079510703364, "grad_norm": 0.77734375, "learning_rate": 5.606060606060606e-06, "loss": 1.963153600692749, "step": 38 }, { "epoch": 0.12232415902140673, "grad_norm": 0.51953125, "learning_rate": 5.90909090909091e-06, "loss": 2.0666375160217285, "step": 40 }, { "epoch": 0.12844036697247707, "grad_norm": 0.4453125, "learning_rate": 6.212121212121213e-06, "loss": 1.9147648811340332, "step": 42 }, { "epoch": 0.1345565749235474, "grad_norm": 0.59765625, "learning_rate": 6.515151515151516e-06, "loss": 1.8978981971740723, "step": 44 }, { "epoch": 0.14067278287461774, "grad_norm": 0.61328125, "learning_rate": 6.818181818181818e-06, "loss": 1.8800417184829712, "step": 46 }, { "epoch": 0.14678899082568808, "grad_norm": 0.78125, "learning_rate": 7.121212121212122e-06, "loss": 2.1724555492401123, "step": 48 }, { "epoch": 0.1529051987767584, "grad_norm": 0.8046875, "learning_rate": 7.424242424242425e-06, "loss": 2.0132710933685303, "step": 50 }, { "epoch": 0.15902140672782875, "grad_norm": 0.53515625, "learning_rate": 7.727272727272727e-06, "loss": 1.9249347448349, "step": 52 }, { "epoch": 0.1651376146788991, "grad_norm": 0.5625, "learning_rate": 8.03030303030303e-06, "loss": 2.0230674743652344, "step": 54 }, { "epoch": 0.1712538226299694, "grad_norm": 0.46484375, "learning_rate": 8.333333333333334e-06, "loss": 1.8684461116790771, "step": 56 }, { "epoch": 0.17737003058103976, "grad_norm": 0.68359375, "learning_rate": 8.636363636363637e-06, "loss": 1.9816838502883911, "step": 58 }, { "epoch": 0.1834862385321101, "grad_norm": 0.70703125, "learning_rate": 8.93939393939394e-06, "loss": 1.8070955276489258, "step": 60 }, { "epoch": 0.18960244648318042, "grad_norm": 0.71875, "learning_rate": 9.242424242424244e-06, "loss": 1.9102303981781006, "step": 62 }, { "epoch": 0.19571865443425077, "grad_norm": 0.703125, "learning_rate": 9.545454545454547e-06, "loss": 1.761095643043518, "step": 64 }, { "epoch": 0.2018348623853211, "grad_norm": 0.9765625, "learning_rate": 9.84848484848485e-06, "loss": 1.8172270059585571, "step": 66 }, { "epoch": 0.20795107033639143, "grad_norm": 0.69921875, "learning_rate": 9.99998560409937e-06, "loss": 1.7951654195785522, "step": 68 }, { "epoch": 0.21406727828746178, "grad_norm": 0.59375, "learning_rate": 9.999870437446959e-06, "loss": 1.7246266603469849, "step": 70 }, { "epoch": 0.22018348623853212, "grad_norm": 0.61328125, "learning_rate": 9.99964010708956e-06, "loss": 1.7382261753082275, "step": 72 }, { "epoch": 0.22629969418960244, "grad_norm": 0.93359375, "learning_rate": 9.999294618921943e-06, "loss": 1.8094028234481812, "step": 74 }, { "epoch": 0.2324159021406728, "grad_norm": 0.69140625, "learning_rate": 9.998833981786072e-06, "loss": 1.7889823913574219, "step": 76 }, { "epoch": 0.23853211009174313, "grad_norm": 0.515625, "learning_rate": 9.998258207470882e-06, "loss": 1.7645984888076782, "step": 78 }, { "epoch": 0.24464831804281345, "grad_norm": 1.6328125, "learning_rate": 9.997567310711977e-06, "loss": 1.692162275314331, "step": 80 }, { "epoch": 0.25076452599388377, "grad_norm": 0.38671875, "learning_rate": 9.996761309191248e-06, "loss": 1.6656694412231445, "step": 82 }, { "epoch": 0.25688073394495414, "grad_norm": 1.0859375, "learning_rate": 9.995840223536428e-06, "loss": 1.69821035861969, "step": 84 }, { "epoch": 0.26299694189602446, "grad_norm": 0.55078125, "learning_rate": 9.99480407732056e-06, "loss": 1.693019986152649, "step": 86 }, { "epoch": 0.2691131498470948, "grad_norm": 0.66796875, "learning_rate": 9.993652897061394e-06, "loss": 1.585938572883606, "step": 88 }, { "epoch": 0.27522935779816515, "grad_norm": 0.55859375, "learning_rate": 9.99238671222071e-06, "loss": 1.5834678411483765, "step": 90 }, { "epoch": 0.28134556574923547, "grad_norm": 0.72265625, "learning_rate": 9.991005555203553e-06, "loss": 1.5904253721237183, "step": 92 }, { "epoch": 0.2874617737003058, "grad_norm": 0.93359375, "learning_rate": 9.989509461357428e-06, "loss": 1.7213293313980103, "step": 94 }, { "epoch": 0.29357798165137616, "grad_norm": 0.96875, "learning_rate": 9.98789846897137e-06, "loss": 1.59124755859375, "step": 96 }, { "epoch": 0.2996941896024465, "grad_norm": 0.328125, "learning_rate": 9.986172619274977e-06, "loss": 1.4882735013961792, "step": 98 }, { "epoch": 0.3058103975535168, "grad_norm": 0.490234375, "learning_rate": 9.984331956437354e-06, "loss": 1.6401163339614868, "step": 100 }, { "epoch": 0.3119266055045872, "grad_norm": 0.484375, "learning_rate": 9.982376527565981e-06, "loss": 1.6229268312454224, "step": 102 }, { "epoch": 0.3180428134556575, "grad_norm": 0.6953125, "learning_rate": 9.980306382705504e-06, "loss": 1.6486362218856812, "step": 104 }, { "epoch": 0.3241590214067278, "grad_norm": 0.4921875, "learning_rate": 9.978121574836463e-06, "loss": 1.7563343048095703, "step": 106 }, { "epoch": 0.3302752293577982, "grad_norm": 0.55078125, "learning_rate": 9.975822159873925e-06, "loss": 1.5931520462036133, "step": 108 }, { "epoch": 0.3363914373088685, "grad_norm": 0.490234375, "learning_rate": 9.973408196666062e-06, "loss": 1.6376924514770508, "step": 110 }, { "epoch": 0.3425076452599388, "grad_norm": 1.0234375, "learning_rate": 9.970879746992641e-06, "loss": 1.6083383560180664, "step": 112 }, { "epoch": 0.3486238532110092, "grad_norm": 0.408203125, "learning_rate": 9.968236875563444e-06, "loss": 1.5672008991241455, "step": 114 }, { "epoch": 0.3547400611620795, "grad_norm": 0.400390625, "learning_rate": 9.965479650016611e-06, "loss": 1.5744966268539429, "step": 116 }, { "epoch": 0.36085626911314983, "grad_norm": 0.51171875, "learning_rate": 9.962608140916906e-06, "loss": 1.6350196599960327, "step": 118 }, { "epoch": 0.3669724770642202, "grad_norm": 0.4609375, "learning_rate": 9.959622421753922e-06, "loss": 1.4963032007217407, "step": 120 }, { "epoch": 0.3730886850152905, "grad_norm": 0.486328125, "learning_rate": 9.956522568940185e-06, "loss": 1.5451488494873047, "step": 122 }, { "epoch": 0.37920489296636084, "grad_norm": 0.439453125, "learning_rate": 9.953308661809209e-06, "loss": 1.599358320236206, "step": 124 }, { "epoch": 0.3853211009174312, "grad_norm": 0.37890625, "learning_rate": 9.949980782613466e-06, "loss": 1.5644880533218384, "step": 126 }, { "epoch": 0.39143730886850153, "grad_norm": 0.6171875, "learning_rate": 9.94653901652227e-06, "loss": 1.6034414768218994, "step": 128 }, { "epoch": 0.39755351681957185, "grad_norm": 0.55078125, "learning_rate": 9.942983451619614e-06, "loss": 1.6047066450119019, "step": 130 }, { "epoch": 0.4036697247706422, "grad_norm": 0.4140625, "learning_rate": 9.939314178901898e-06, "loss": 1.5338762998580933, "step": 132 }, { "epoch": 0.40978593272171254, "grad_norm": 0.60546875, "learning_rate": 9.935531292275615e-06, "loss": 1.5983346700668335, "step": 134 }, { "epoch": 0.41590214067278286, "grad_norm": 0.390625, "learning_rate": 9.931634888554937e-06, "loss": 1.4490175247192383, "step": 136 }, { "epoch": 0.42201834862385323, "grad_norm": 0.86328125, "learning_rate": 9.927625067459245e-06, "loss": 1.43030846118927, "step": 138 }, { "epoch": 0.42813455657492355, "grad_norm": 0.369140625, "learning_rate": 9.923501931610571e-06, "loss": 1.5441913604736328, "step": 140 }, { "epoch": 0.43425076452599387, "grad_norm": 1.296875, "learning_rate": 9.919265586530977e-06, "loss": 1.5886114835739136, "step": 142 }, { "epoch": 0.44036697247706424, "grad_norm": 0.8203125, "learning_rate": 9.914916140639849e-06, "loss": 1.5252549648284912, "step": 144 }, { "epoch": 0.44648318042813456, "grad_norm": 0.353515625, "learning_rate": 9.910453705251127e-06, "loss": 1.4197413921356201, "step": 146 }, { "epoch": 0.4525993883792049, "grad_norm": 0.5390625, "learning_rate": 9.905878394570453e-06, "loss": 1.5738030672073364, "step": 148 }, { "epoch": 0.45871559633027525, "grad_norm": 0.55859375, "learning_rate": 9.90119032569225e-06, "loss": 1.595241904258728, "step": 150 }, { "epoch": 0.4648318042813456, "grad_norm": 0.515625, "learning_rate": 9.89638961859672e-06, "loss": 1.5898534059524536, "step": 152 }, { "epoch": 0.4709480122324159, "grad_norm": 0.5078125, "learning_rate": 9.891476396146785e-06, "loss": 1.5508402585983276, "step": 154 }, { "epoch": 0.47706422018348627, "grad_norm": 0.404296875, "learning_rate": 9.886450784084934e-06, "loss": 1.5691711902618408, "step": 156 }, { "epoch": 0.4831804281345566, "grad_norm": 0.85546875, "learning_rate": 9.88131291103e-06, "loss": 1.5895097255706787, "step": 158 }, { "epoch": 0.4892966360856269, "grad_norm": 0.58984375, "learning_rate": 9.876062908473883e-06, "loss": 1.5543285608291626, "step": 160 }, { "epoch": 0.4954128440366973, "grad_norm": 0.375, "learning_rate": 9.870700910778169e-06, "loss": 1.4683598279953003, "step": 162 }, { "epoch": 0.5015290519877675, "grad_norm": 1.328125, "learning_rate": 9.865227055170706e-06, "loss": 1.4957642555236816, "step": 164 }, { "epoch": 0.5076452599388379, "grad_norm": 0.54296875, "learning_rate": 9.85964148174208e-06, "loss": 1.448598027229309, "step": 166 }, { "epoch": 0.5137614678899083, "grad_norm": 0.54296875, "learning_rate": 9.853944333442036e-06, "loss": 1.4433187246322632, "step": 168 }, { "epoch": 0.5198776758409785, "grad_norm": 5.46875, "learning_rate": 9.848135756075816e-06, "loss": 1.500611424446106, "step": 170 }, { "epoch": 0.5259938837920489, "grad_norm": 0.435546875, "learning_rate": 9.842215898300434e-06, "loss": 1.4782170057296753, "step": 172 }, { "epoch": 0.5321100917431193, "grad_norm": 0.35546875, "learning_rate": 9.836184911620863e-06, "loss": 1.485479712486267, "step": 174 }, { "epoch": 0.5382262996941896, "grad_norm": 1.53125, "learning_rate": 9.830042950386162e-06, "loss": 1.5060051679611206, "step": 176 }, { "epoch": 0.5443425076452599, "grad_norm": 0.3671875, "learning_rate": 9.823790171785527e-06, "loss": 1.4704962968826294, "step": 178 }, { "epoch": 0.5504587155963303, "grad_norm": 0.4765625, "learning_rate": 9.817426735844265e-06, "loss": 1.4355278015136719, "step": 180 }, { "epoch": 0.5565749235474006, "grad_norm": 0.55078125, "learning_rate": 9.810952805419701e-06, "loss": 1.5194344520568848, "step": 182 }, { "epoch": 0.5626911314984709, "grad_norm": 0.455078125, "learning_rate": 9.804368546197007e-06, "loss": 1.5073320865631104, "step": 184 }, { "epoch": 0.5688073394495413, "grad_norm": 0.6640625, "learning_rate": 9.797674126684967e-06, "loss": 1.522252082824707, "step": 186 }, { "epoch": 0.5749235474006116, "grad_norm": 0.5390625, "learning_rate": 9.790869718211657e-06, "loss": 1.6073163747787476, "step": 188 }, { "epoch": 0.581039755351682, "grad_norm": 0.416015625, "learning_rate": 9.783955494920067e-06, "loss": 1.4052844047546387, "step": 190 }, { "epoch": 0.5871559633027523, "grad_norm": 0.3515625, "learning_rate": 9.77693163376364e-06, "loss": 1.4193068742752075, "step": 192 }, { "epoch": 0.5932721712538226, "grad_norm": 0.4609375, "learning_rate": 9.76979831450175e-06, "loss": 1.5307352542877197, "step": 194 }, { "epoch": 0.599388379204893, "grad_norm": 0.5625, "learning_rate": 9.76255571969509e-06, "loss": 1.424899697303772, "step": 196 }, { "epoch": 0.6055045871559633, "grad_norm": 0.52734375, "learning_rate": 9.755204034701004e-06, "loss": 1.359844446182251, "step": 198 }, { "epoch": 0.6116207951070336, "grad_norm": 0.5078125, "learning_rate": 9.747743447668755e-06, "loss": 1.582168459892273, "step": 200 }, { "epoch": 0.617737003058104, "grad_norm": 0.470703125, "learning_rate": 9.740174149534694e-06, "loss": 1.488830327987671, "step": 202 }, { "epoch": 0.6238532110091743, "grad_norm": 0.5, "learning_rate": 9.732496334017376e-06, "loss": 1.4927191734313965, "step": 204 }, { "epoch": 0.6299694189602446, "grad_norm": 0.45703125, "learning_rate": 9.724710197612615e-06, "loss": 1.4716768264770508, "step": 206 }, { "epoch": 0.636085626911315, "grad_norm": 0.408203125, "learning_rate": 9.716815939588437e-06, "loss": 1.3903311491012573, "step": 208 }, { "epoch": 0.6422018348623854, "grad_norm": 0.341796875, "learning_rate": 9.708813761979992e-06, "loss": 1.5344760417938232, "step": 210 }, { "epoch": 0.6483180428134556, "grad_norm": 0.73046875, "learning_rate": 9.700703869584386e-06, "loss": 1.4522379636764526, "step": 212 }, { "epoch": 0.654434250764526, "grad_norm": 0.35546875, "learning_rate": 9.692486469955425e-06, "loss": 1.3874422311782837, "step": 214 }, { "epoch": 0.6605504587155964, "grad_norm": 0.2734375, "learning_rate": 9.684161773398321e-06, "loss": 1.3861643075942993, "step": 216 }, { "epoch": 0.6666666666666666, "grad_norm": 0.55859375, "learning_rate": 9.675729992964292e-06, "loss": 1.5152150392532349, "step": 218 }, { "epoch": 0.672782874617737, "grad_norm": 0.98828125, "learning_rate": 9.667191344445123e-06, "loss": 1.3514238595962524, "step": 220 }, { "epoch": 0.6788990825688074, "grad_norm": 0.70703125, "learning_rate": 9.658546046367646e-06, "loss": 1.39436936378479, "step": 222 }, { "epoch": 0.6850152905198776, "grad_norm": 0.69140625, "learning_rate": 9.649794319988121e-06, "loss": 1.4995126724243164, "step": 224 }, { "epoch": 0.691131498470948, "grad_norm": 0.490234375, "learning_rate": 9.640936389286617e-06, "loss": 1.4583836793899536, "step": 226 }, { "epoch": 0.6972477064220184, "grad_norm": 0.61328125, "learning_rate": 9.631972480961235e-06, "loss": 1.4303733110427856, "step": 228 }, { "epoch": 0.7033639143730887, "grad_norm": 0.421875, "learning_rate": 9.622902824422336e-06, "loss": 1.393810749053955, "step": 230 }, { "epoch": 0.709480122324159, "grad_norm": 0.453125, "learning_rate": 9.613727651786659e-06, "loss": 1.51703679561615, "step": 232 }, { "epoch": 0.7155963302752294, "grad_norm": 0.5, "learning_rate": 9.604447197871382e-06, "loss": 1.373485803604126, "step": 234 }, { "epoch": 0.7217125382262997, "grad_norm": 0.384765625, "learning_rate": 9.59506170018811e-06, "loss": 1.4396356344223022, "step": 236 }, { "epoch": 0.72782874617737, "grad_norm": 0.5859375, "learning_rate": 9.5855713989368e-06, "loss": 1.5568106174468994, "step": 238 }, { "epoch": 0.7339449541284404, "grad_norm": 0.94140625, "learning_rate": 9.575976536999616e-06, "loss": 1.4187113046646118, "step": 240 }, { "epoch": 0.7400611620795107, "grad_norm": 0.423828125, "learning_rate": 9.566277359934703e-06, "loss": 1.4353150129318237, "step": 242 }, { "epoch": 0.746177370030581, "grad_norm": 0.62109375, "learning_rate": 9.556474115969911e-06, "loss": 1.5181076526641846, "step": 244 }, { "epoch": 0.7522935779816514, "grad_norm": 0.408203125, "learning_rate": 9.546567055996441e-06, "loss": 1.4269428253173828, "step": 246 }, { "epoch": 0.7584097859327217, "grad_norm": 0.4765625, "learning_rate": 9.536556433562422e-06, "loss": 1.4407360553741455, "step": 248 }, { "epoch": 0.764525993883792, "grad_norm": 0.373046875, "learning_rate": 9.526442504866427e-06, "loss": 1.3571839332580566, "step": 250 }, { "epoch": 0.7706422018348624, "grad_norm": 0.41796875, "learning_rate": 9.516225528750904e-06, "loss": 1.4300589561462402, "step": 252 }, { "epoch": 0.7767584097859327, "grad_norm": 0.59375, "learning_rate": 9.505905766695564e-06, "loss": 1.5078905820846558, "step": 254 }, { "epoch": 0.7828746177370031, "grad_norm": 0.64453125, "learning_rate": 9.495483482810688e-06, "loss": 1.456427812576294, "step": 256 }, { "epoch": 0.7889908256880734, "grad_norm": 0.546875, "learning_rate": 9.484958943830363e-06, "loss": 1.4158270359039307, "step": 258 }, { "epoch": 0.7951070336391437, "grad_norm": 0.3984375, "learning_rate": 9.474332419105652e-06, "loss": 1.3977278470993042, "step": 260 }, { "epoch": 0.8012232415902141, "grad_norm": 0.51953125, "learning_rate": 9.463604180597712e-06, "loss": 1.3898099660873413, "step": 262 }, { "epoch": 0.8073394495412844, "grad_norm": 0.58203125, "learning_rate": 9.452774502870822e-06, "loss": 1.4355534315109253, "step": 264 }, { "epoch": 0.8134556574923547, "grad_norm": 0.59375, "learning_rate": 9.441843663085368e-06, "loss": 1.454459309577942, "step": 266 }, { "epoch": 0.8195718654434251, "grad_norm": 0.4453125, "learning_rate": 9.430811940990736e-06, "loss": 1.4455972909927368, "step": 268 }, { "epoch": 0.8256880733944955, "grad_norm": 0.57421875, "learning_rate": 9.419679618918164e-06, "loss": 1.381105661392212, "step": 270 }, { "epoch": 0.8318042813455657, "grad_norm": 0.46484375, "learning_rate": 9.408446981773514e-06, "loss": 1.4196290969848633, "step": 272 }, { "epoch": 0.8379204892966361, "grad_norm": 1.1328125, "learning_rate": 9.397114317029975e-06, "loss": 1.4939439296722412, "step": 274 }, { "epoch": 0.8440366972477065, "grad_norm": 0.5859375, "learning_rate": 9.38568191472071e-06, "loss": 1.4450997114181519, "step": 276 }, { "epoch": 0.8501529051987767, "grad_norm": 0.6640625, "learning_rate": 9.374150067431433e-06, "loss": 1.4556881189346313, "step": 278 }, { "epoch": 0.8562691131498471, "grad_norm": 0.36328125, "learning_rate": 9.362519070292924e-06, "loss": 1.3958441019058228, "step": 280 }, { "epoch": 0.8623853211009175, "grad_norm": 0.55078125, "learning_rate": 9.350789220973468e-06, "loss": 1.492562174797058, "step": 282 }, { "epoch": 0.8685015290519877, "grad_norm": 0.33203125, "learning_rate": 9.33896081967124e-06, "loss": 1.3768280744552612, "step": 284 }, { "epoch": 0.8746177370030581, "grad_norm": 0.94140625, "learning_rate": 9.32703416910663e-06, "loss": 1.3635163307189941, "step": 286 }, { "epoch": 0.8807339449541285, "grad_norm": 0.703125, "learning_rate": 9.315009574514487e-06, "loss": 1.3402776718139648, "step": 288 }, { "epoch": 0.8868501529051988, "grad_norm": 0.328125, "learning_rate": 9.302887343636305e-06, "loss": 1.4155652523040771, "step": 290 }, { "epoch": 0.8929663608562691, "grad_norm": 0.453125, "learning_rate": 9.290667786712354e-06, "loss": 1.5360904932022095, "step": 292 }, { "epoch": 0.8990825688073395, "grad_norm": 0.447265625, "learning_rate": 9.278351216473737e-06, "loss": 1.4269368648529053, "step": 294 }, { "epoch": 0.9051987767584098, "grad_norm": 2.40625, "learning_rate": 9.265937948134393e-06, "loss": 1.4990252256393433, "step": 296 }, { "epoch": 0.9113149847094801, "grad_norm": 0.37890625, "learning_rate": 9.253428299383013e-06, "loss": 1.4629848003387451, "step": 298 }, { "epoch": 0.9174311926605505, "grad_norm": 0.66015625, "learning_rate": 9.240822590374927e-06, "loss": 1.3986918926239014, "step": 300 }, { "epoch": 0.9235474006116208, "grad_norm": 0.40234375, "learning_rate": 9.228121143723901e-06, "loss": 1.374011754989624, "step": 302 }, { "epoch": 0.9296636085626911, "grad_norm": 0.33203125, "learning_rate": 9.215324284493888e-06, "loss": 1.3964948654174805, "step": 304 }, { "epoch": 0.9357798165137615, "grad_norm": 0.43359375, "learning_rate": 9.202432340190696e-06, "loss": 1.3667786121368408, "step": 306 }, { "epoch": 0.9418960244648318, "grad_norm": 0.380859375, "learning_rate": 9.18944564075362e-06, "loss": 1.3175499439239502, "step": 308 }, { "epoch": 0.9480122324159022, "grad_norm": 0.376953125, "learning_rate": 9.17636451854699e-06, "loss": 1.3974062204360962, "step": 310 }, { "epoch": 0.9541284403669725, "grad_norm": 0.578125, "learning_rate": 9.163189308351666e-06, "loss": 1.405277132987976, "step": 312 }, { "epoch": 0.9602446483180428, "grad_norm": 0.59375, "learning_rate": 9.149920347356472e-06, "loss": 1.4029018878936768, "step": 314 }, { "epoch": 0.9663608562691132, "grad_norm": 0.41015625, "learning_rate": 9.136557975149563e-06, "loss": 1.3701725006103516, "step": 316 }, { "epoch": 0.9724770642201835, "grad_norm": 0.5859375, "learning_rate": 9.12310253370974e-06, "loss": 1.4639108180999756, "step": 318 }, { "epoch": 0.9785932721712538, "grad_norm": 1.140625, "learning_rate": 9.109554367397699e-06, "loss": 1.3428951501846313, "step": 320 }, { "epoch": 0.9847094801223242, "grad_norm": 0.9296875, "learning_rate": 9.095913822947197e-06, "loss": 1.2543865442276, "step": 322 }, { "epoch": 0.9908256880733946, "grad_norm": 0.4296875, "learning_rate": 9.082181249456211e-06, "loss": 1.287245512008667, "step": 324 }, { "epoch": 0.9969418960244648, "grad_norm": 0.404296875, "learning_rate": 9.06835699837798e-06, "loss": 1.3998972177505493, "step": 326 }, { "epoch": 1.003058103975535, "grad_norm": 0.4921875, "learning_rate": 9.054441423512015e-06, "loss": 1.381530523300171, "step": 328 }, { "epoch": 1.0091743119266054, "grad_norm": 0.380859375, "learning_rate": 9.040434880995052e-06, "loss": 1.388320803642273, "step": 330 }, { "epoch": 1.0152905198776758, "grad_norm": 0.478515625, "learning_rate": 9.026337729291927e-06, "loss": 1.3628325462341309, "step": 332 }, { "epoch": 1.0214067278287462, "grad_norm": 0.447265625, "learning_rate": 9.012150329186412e-06, "loss": 1.3321391344070435, "step": 334 }, { "epoch": 1.0275229357798166, "grad_norm": 0.53515625, "learning_rate": 8.997873043771974e-06, "loss": 1.4002933502197266, "step": 336 }, { "epoch": 1.033639143730887, "grad_norm": 0.55859375, "learning_rate": 8.983506238442486e-06, "loss": 1.3850795030593872, "step": 338 }, { "epoch": 1.039755351681957, "grad_norm": 0.8984375, "learning_rate": 8.969050280882873e-06, "loss": 1.3575987815856934, "step": 340 }, { "epoch": 1.0458715596330275, "grad_norm": 1.671875, "learning_rate": 8.954505541059707e-06, "loss": 1.448436975479126, "step": 342 }, { "epoch": 1.0519877675840978, "grad_norm": 0.51953125, "learning_rate": 8.939872391211732e-06, "loss": 1.3202804327011108, "step": 344 }, { "epoch": 1.0581039755351682, "grad_norm": 0.5703125, "learning_rate": 8.925151205840343e-06, "loss": 1.335026741027832, "step": 346 }, { "epoch": 1.0642201834862386, "grad_norm": 0.412109375, "learning_rate": 8.910342361699996e-06, "loss": 1.3237738609313965, "step": 348 }, { "epoch": 1.070336391437309, "grad_norm": 0.51171875, "learning_rate": 8.895446237788574e-06, "loss": 1.3708387613296509, "step": 350 }, { "epoch": 1.0764525993883791, "grad_norm": 0.400390625, "learning_rate": 8.88046321533768e-06, "loss": 1.3443958759307861, "step": 352 }, { "epoch": 1.0825688073394495, "grad_norm": 1.2578125, "learning_rate": 8.865393677802882e-06, "loss": 1.231789231300354, "step": 354 }, { "epoch": 1.0886850152905199, "grad_norm": 0.455078125, "learning_rate": 8.850238010853902e-06, "loss": 1.3118000030517578, "step": 356 }, { "epoch": 1.0948012232415902, "grad_norm": 0.46484375, "learning_rate": 8.834996602364738e-06, "loss": 1.449766993522644, "step": 358 }, { "epoch": 1.1009174311926606, "grad_norm": 0.53515625, "learning_rate": 8.81966984240375e-06, "loss": 1.3435068130493164, "step": 360 }, { "epoch": 1.107033639143731, "grad_norm": 5.9375, "learning_rate": 8.80425812322367e-06, "loss": 1.2645937204360962, "step": 362 }, { "epoch": 1.1131498470948011, "grad_norm": 0.64453125, "learning_rate": 8.78876183925156e-06, "loss": 1.2984048128128052, "step": 364 }, { "epoch": 1.1192660550458715, "grad_norm": 0.5546875, "learning_rate": 8.77318138707872e-06, "loss": 1.3319660425186157, "step": 366 }, { "epoch": 1.1253822629969419, "grad_norm": 0.97265625, "learning_rate": 8.757517165450543e-06, "loss": 1.3149017095565796, "step": 368 }, { "epoch": 1.1314984709480123, "grad_norm": 0.4375, "learning_rate": 8.741769575256306e-06, "loss": 1.3030190467834473, "step": 370 }, { "epoch": 1.1376146788990826, "grad_norm": 0.45703125, "learning_rate": 8.725939019518902e-06, "loss": 1.2816126346588135, "step": 372 }, { "epoch": 1.143730886850153, "grad_norm": 0.640625, "learning_rate": 8.710025903384548e-06, "loss": 1.3657718896865845, "step": 374 }, { "epoch": 1.1498470948012232, "grad_norm": 0.408203125, "learning_rate": 8.69403063411239e-06, "loss": 1.2346255779266357, "step": 376 }, { "epoch": 1.1559633027522935, "grad_norm": 0.35546875, "learning_rate": 8.6779536210641e-06, "loss": 1.2943626642227173, "step": 378 }, { "epoch": 1.162079510703364, "grad_norm": 0.515625, "learning_rate": 8.661795275693385e-06, "loss": 1.3616715669631958, "step": 380 }, { "epoch": 1.1681957186544343, "grad_norm": 0.447265625, "learning_rate": 8.64555601153547e-06, "loss": 1.2533824443817139, "step": 382 }, { "epoch": 1.1743119266055047, "grad_norm": 0.56640625, "learning_rate": 8.629236244196502e-06, "loss": 1.287404179573059, "step": 384 }, { "epoch": 1.1804281345565748, "grad_norm": 0.62890625, "learning_rate": 8.612836391342925e-06, "loss": 1.3631038665771484, "step": 386 }, { "epoch": 1.1865443425076452, "grad_norm": 0.41015625, "learning_rate": 8.596356872690779e-06, "loss": 1.3277571201324463, "step": 388 }, { "epoch": 1.1926605504587156, "grad_norm": 0.50390625, "learning_rate": 8.579798109994968e-06, "loss": 1.3345115184783936, "step": 390 }, { "epoch": 1.198776758409786, "grad_norm": 0.458984375, "learning_rate": 8.563160527038467e-06, "loss": 1.2454558610916138, "step": 392 }, { "epoch": 1.2048929663608563, "grad_norm": 0.53125, "learning_rate": 8.546444549621467e-06, "loss": 1.3097434043884277, "step": 394 }, { "epoch": 1.2110091743119267, "grad_norm": 0.470703125, "learning_rate": 8.529650605550478e-06, "loss": 1.2673131227493286, "step": 396 }, { "epoch": 1.217125382262997, "grad_norm": 0.55859375, "learning_rate": 8.512779124627395e-06, "loss": 1.4371856451034546, "step": 398 }, { "epoch": 1.2232415902140672, "grad_norm": 0.392578125, "learning_rate": 8.495830538638481e-06, "loss": 1.2818241119384766, "step": 400 }, { "epoch": 1.2293577981651376, "grad_norm": 0.48046875, "learning_rate": 8.478805281343335e-06, "loss": 1.215641736984253, "step": 402 }, { "epoch": 1.235474006116208, "grad_norm": 2.296875, "learning_rate": 8.461703788463757e-06, "loss": 1.2823781967163086, "step": 404 }, { "epoch": 1.2415902140672783, "grad_norm": 0.671875, "learning_rate": 8.44452649767264e-06, "loss": 1.3114620447158813, "step": 406 }, { "epoch": 1.2477064220183487, "grad_norm": 0.62890625, "learning_rate": 8.427273848582744e-06, "loss": 1.2511239051818848, "step": 408 }, { "epoch": 1.2538226299694188, "grad_norm": 0.5234375, "learning_rate": 8.40994628273544e-06, "loss": 1.2478758096694946, "step": 410 }, { "epoch": 1.2599388379204892, "grad_norm": 0.5390625, "learning_rate": 8.392544243589428e-06, "loss": 1.3285698890686035, "step": 412 }, { "epoch": 1.2660550458715596, "grad_norm": 0.74609375, "learning_rate": 8.375068176509375e-06, "loss": 1.3709665536880493, "step": 414 }, { "epoch": 1.27217125382263, "grad_norm": 0.703125, "learning_rate": 8.357518528754524e-06, "loss": 1.3329336643218994, "step": 416 }, { "epoch": 1.2782874617737003, "grad_norm": 0.53515625, "learning_rate": 8.339895749467238e-06, "loss": 1.2674789428710938, "step": 418 }, { "epoch": 1.2844036697247707, "grad_norm": 0.80859375, "learning_rate": 8.322200289661517e-06, "loss": 1.152662992477417, "step": 420 }, { "epoch": 1.290519877675841, "grad_norm": 0.45703125, "learning_rate": 8.304432602211446e-06, "loss": 1.3445444107055664, "step": 422 }, { "epoch": 1.2966360856269112, "grad_norm": 0.578125, "learning_rate": 8.28659314183961e-06, "loss": 1.3826080560684204, "step": 424 }, { "epoch": 1.3027522935779816, "grad_norm": 0.3828125, "learning_rate": 8.268682365105453e-06, "loss": 1.3560914993286133, "step": 426 }, { "epoch": 1.308868501529052, "grad_norm": 0.62890625, "learning_rate": 8.250700730393599e-06, "loss": 1.2076865434646606, "step": 428 }, { "epoch": 1.3149847094801224, "grad_norm": 0.408203125, "learning_rate": 8.232648697902113e-06, "loss": 1.3048980236053467, "step": 430 }, { "epoch": 1.3211009174311927, "grad_norm": 0.4921875, "learning_rate": 8.21452672963073e-06, "loss": 1.352384328842163, "step": 432 }, { "epoch": 1.3272171253822629, "grad_norm": 0.3984375, "learning_rate": 8.196335289369027e-06, "loss": 1.390981674194336, "step": 434 }, { "epoch": 1.3333333333333333, "grad_norm": 1.0625, "learning_rate": 8.178074842684554e-06, "loss": 1.32779860496521, "step": 436 }, { "epoch": 1.3394495412844036, "grad_norm": 1.0390625, "learning_rate": 8.159745856910922e-06, "loss": 1.2868674993515015, "step": 438 }, { "epoch": 1.345565749235474, "grad_norm": 0.419921875, "learning_rate": 8.14134880113584e-06, "loss": 1.305415153503418, "step": 440 }, { "epoch": 1.3516819571865444, "grad_norm": 0.53125, "learning_rate": 8.122884146189104e-06, "loss": 1.3808095455169678, "step": 442 }, { "epoch": 1.3577981651376148, "grad_norm": 0.4921875, "learning_rate": 8.104352364630565e-06, "loss": 1.2937378883361816, "step": 444 }, { "epoch": 1.3639143730886851, "grad_norm": 0.32421875, "learning_rate": 8.085753930738013e-06, "loss": 1.2958605289459229, "step": 446 }, { "epoch": 1.3700305810397553, "grad_norm": 0.60546875, "learning_rate": 8.067089320495057e-06, "loss": 1.3038794994354248, "step": 448 }, { "epoch": 1.3761467889908257, "grad_norm": 0.52734375, "learning_rate": 8.048359011578927e-06, "loss": 1.2670778036117554, "step": 450 }, { "epoch": 1.382262996941896, "grad_norm": 1.390625, "learning_rate": 8.029563483348268e-06, "loss": 1.3002293109893799, "step": 452 }, { "epoch": 1.3883792048929664, "grad_norm": 0.73046875, "learning_rate": 8.010703216830852e-06, "loss": 1.3091164827346802, "step": 454 }, { "epoch": 1.3944954128440368, "grad_norm": 0.4921875, "learning_rate": 7.991778694711278e-06, "loss": 1.2860240936279297, "step": 456 }, { "epoch": 1.400611620795107, "grad_norm": 0.421875, "learning_rate": 7.972790401318627e-06, "loss": 1.2974958419799805, "step": 458 }, { "epoch": 1.4067278287461773, "grad_norm": 1.4609375, "learning_rate": 7.953738822614048e-06, "loss": 1.3687572479248047, "step": 460 }, { "epoch": 1.4128440366972477, "grad_norm": 0.447265625, "learning_rate": 7.934624446178328e-06, "loss": 1.2588635683059692, "step": 462 }, { "epoch": 1.418960244648318, "grad_norm": 0.427734375, "learning_rate": 7.915447761199427e-06, "loss": 1.3145904541015625, "step": 464 }, { "epoch": 1.4250764525993884, "grad_norm": 0.318359375, "learning_rate": 7.896209258459934e-06, "loss": 1.2143771648406982, "step": 466 }, { "epoch": 1.4311926605504588, "grad_norm": 0.40625, "learning_rate": 7.876909430324527e-06, "loss": 1.2713569402694702, "step": 468 }, { "epoch": 1.4373088685015292, "grad_norm": 0.64453125, "learning_rate": 7.85754877072737e-06, "loss": 1.3136000633239746, "step": 470 }, { "epoch": 1.4434250764525993, "grad_norm": 0.46484375, "learning_rate": 7.838127775159451e-06, "loss": 1.2473974227905273, "step": 472 }, { "epoch": 1.4495412844036697, "grad_norm": 0.5078125, "learning_rate": 7.818646940655933e-06, "loss": 1.3004451990127563, "step": 474 }, { "epoch": 1.45565749235474, "grad_norm": 1.4296875, "learning_rate": 7.799106765783407e-06, "loss": 1.3775520324707031, "step": 476 }, { "epoch": 1.4617737003058104, "grad_norm": 0.515625, "learning_rate": 7.779507750627145e-06, "loss": 1.409247875213623, "step": 478 }, { "epoch": 1.4678899082568808, "grad_norm": 3.953125, "learning_rate": 7.7598503967783e-06, "loss": 1.282897710800171, "step": 480 }, { "epoch": 1.474006116207951, "grad_norm": 0.4453125, "learning_rate": 7.74013520732107e-06, "loss": 1.2685235738754272, "step": 482 }, { "epoch": 1.4801223241590213, "grad_norm": 0.54296875, "learning_rate": 7.720362686819814e-06, "loss": 1.202805995941162, "step": 484 }, { "epoch": 1.4862385321100917, "grad_norm": 0.359375, "learning_rate": 7.700533341306155e-06, "loss": 1.3179457187652588, "step": 486 }, { "epoch": 1.492354740061162, "grad_norm": 0.4140625, "learning_rate": 7.680647678266011e-06, "loss": 1.3416056632995605, "step": 488 }, { "epoch": 1.4984709480122325, "grad_norm": 0.609375, "learning_rate": 7.66070620662662e-06, "loss": 1.2907155752182007, "step": 490 }, { "epoch": 1.5045871559633026, "grad_norm": 0.6640625, "learning_rate": 7.640709436743512e-06, "loss": 1.2985384464263916, "step": 492 }, { "epoch": 1.5107033639143732, "grad_norm": 0.54296875, "learning_rate": 7.620657880387448e-06, "loss": 1.2733287811279297, "step": 494 }, { "epoch": 1.5168195718654434, "grad_norm": 0.55859375, "learning_rate": 7.600552050731315e-06, "loss": 1.2120338678359985, "step": 496 }, { "epoch": 1.5229357798165137, "grad_norm": 0.859375, "learning_rate": 7.5803924623370025e-06, "loss": 1.2848923206329346, "step": 498 }, { "epoch": 1.529051987767584, "grad_norm": 0.69140625, "learning_rate": 7.5601796311422325e-06, "loss": 1.3336488008499146, "step": 500 }, { "epoch": 1.5351681957186545, "grad_norm": 0.45703125, "learning_rate": 7.539914074447349e-06, "loss": 1.2442420721054077, "step": 502 }, { "epoch": 1.5412844036697249, "grad_norm": 0.68359375, "learning_rate": 7.519596310902081e-06, "loss": 1.266619324684143, "step": 504 }, { "epoch": 1.547400611620795, "grad_norm": 0.65625, "learning_rate": 7.499226860492273e-06, "loss": 1.374267816543579, "step": 506 }, { "epoch": 1.5535168195718656, "grad_norm": 0.46484375, "learning_rate": 7.478806244526576e-06, "loss": 1.3529757261276245, "step": 508 }, { "epoch": 1.5596330275229358, "grad_norm": 0.455078125, "learning_rate": 7.458334985623102e-06, "loss": 1.2986624240875244, "step": 510 }, { "epoch": 1.5657492354740061, "grad_norm": 0.37109375, "learning_rate": 7.437813607696049e-06, "loss": 1.2934763431549072, "step": 512 }, { "epoch": 1.5718654434250765, "grad_norm": 0.90234375, "learning_rate": 7.4172426359422976e-06, "loss": 1.3502346277236938, "step": 514 }, { "epoch": 1.5779816513761467, "grad_norm": 0.58203125, "learning_rate": 7.396622596827967e-06, "loss": 1.2319389581680298, "step": 516 }, { "epoch": 1.5840978593272173, "grad_norm": 0.625, "learning_rate": 7.375954018074941e-06, "loss": 1.3282928466796875, "step": 518 }, { "epoch": 1.5902140672782874, "grad_norm": 0.6875, "learning_rate": 7.3552374286473595e-06, "loss": 1.3678048849105835, "step": 520 }, { "epoch": 1.5963302752293578, "grad_norm": 0.30859375, "learning_rate": 7.3344733587380875e-06, "loss": 1.2744084596633911, "step": 522 }, { "epoch": 1.6024464831804281, "grad_norm": 0.84375, "learning_rate": 7.31366233975514e-06, "loss": 1.281977891921997, "step": 524 }, { "epoch": 1.6085626911314985, "grad_norm": 0.61328125, "learning_rate": 7.292804904308087e-06, "loss": 1.2926934957504272, "step": 526 }, { "epoch": 1.614678899082569, "grad_norm": 0.484375, "learning_rate": 7.271901586194417e-06, "loss": 1.3355308771133423, "step": 528 }, { "epoch": 1.620795107033639, "grad_norm": 0.62890625, "learning_rate": 7.2509529203858794e-06, "loss": 1.2734055519104004, "step": 530 }, { "epoch": 1.6269113149847096, "grad_norm": 0.412109375, "learning_rate": 7.229959443014793e-06, "loss": 1.2471139430999756, "step": 532 }, { "epoch": 1.6330275229357798, "grad_norm": 0.376953125, "learning_rate": 7.208921691360323e-06, "loss": 1.3476160764694214, "step": 534 }, { "epoch": 1.6391437308868502, "grad_norm": 0.408203125, "learning_rate": 7.187840203834732e-06, "loss": 1.2233093976974487, "step": 536 }, { "epoch": 1.6452599388379205, "grad_norm": 0.66015625, "learning_rate": 7.166715519969601e-06, "loss": 1.2761595249176025, "step": 538 }, { "epoch": 1.6513761467889907, "grad_norm": 1.84375, "learning_rate": 7.145548180402021e-06, "loss": 1.3554096221923828, "step": 540 }, { "epoch": 1.6574923547400613, "grad_norm": 0.474609375, "learning_rate": 7.124338726860755e-06, "loss": 1.3470004796981812, "step": 542 }, { "epoch": 1.6636085626911314, "grad_norm": 0.796875, "learning_rate": 7.103087702152377e-06, "loss": 1.312508225440979, "step": 544 }, { "epoch": 1.6697247706422018, "grad_norm": 2.484375, "learning_rate": 7.081795650147375e-06, "loss": 1.2889965772628784, "step": 546 }, { "epoch": 1.6758409785932722, "grad_norm": 0.75, "learning_rate": 7.060463115766239e-06, "loss": 1.3792515993118286, "step": 548 }, { "epoch": 1.6819571865443423, "grad_norm": 1.96875, "learning_rate": 7.0390906449655104e-06, "loss": 1.321378469467163, "step": 550 }, { "epoch": 1.688073394495413, "grad_norm": 0.515625, "learning_rate": 7.017678784723806e-06, "loss": 1.3485661745071411, "step": 552 }, { "epoch": 1.694189602446483, "grad_norm": 0.5546875, "learning_rate": 6.99622808302783e-06, "loss": 1.3221888542175293, "step": 554 }, { "epoch": 1.7003058103975535, "grad_norm": 0.46875, "learning_rate": 6.974739088858338e-06, "loss": 1.3821053504943848, "step": 556 }, { "epoch": 1.7064220183486238, "grad_norm": 0.466796875, "learning_rate": 6.9532123521760944e-06, "loss": 1.272276759147644, "step": 558 }, { "epoch": 1.7125382262996942, "grad_norm": 0.4375, "learning_rate": 6.931648423907796e-06, "loss": 1.2930102348327637, "step": 560 }, { "epoch": 1.7186544342507646, "grad_norm": 0.361328125, "learning_rate": 6.91004785593197e-06, "loss": 1.2617864608764648, "step": 562 }, { "epoch": 1.7247706422018347, "grad_norm": 0.640625, "learning_rate": 6.888411201064854e-06, "loss": 1.3153817653656006, "step": 564 }, { "epoch": 1.7308868501529053, "grad_norm": 0.412109375, "learning_rate": 6.866739013046243e-06, "loss": 1.2653061151504517, "step": 566 }, { "epoch": 1.7370030581039755, "grad_norm": 0.5078125, "learning_rate": 6.845031846525322e-06, "loss": 1.2796239852905273, "step": 568 }, { "epoch": 1.7431192660550459, "grad_norm": 0.328125, "learning_rate": 6.823290257046467e-06, "loss": 1.2797678709030151, "step": 570 }, { "epoch": 1.7492354740061162, "grad_norm": 0.671875, "learning_rate": 6.801514801035031e-06, "loss": 1.2564300298690796, "step": 572 }, { "epoch": 1.7553516819571864, "grad_norm": 0.53515625, "learning_rate": 6.7797060357831045e-06, "loss": 1.3716152906417847, "step": 574 }, { "epoch": 1.761467889908257, "grad_norm": 0.64453125, "learning_rate": 6.757864519435245e-06, "loss": 1.3831623792648315, "step": 576 }, { "epoch": 1.7675840978593271, "grad_norm": 1.4765625, "learning_rate": 6.735990810974205e-06, "loss": 1.3119230270385742, "step": 578 }, { "epoch": 1.7737003058103975, "grad_norm": 0.69140625, "learning_rate": 6.71408547020661e-06, "loss": 1.2804102897644043, "step": 580 }, { "epoch": 1.7798165137614679, "grad_norm": 0.578125, "learning_rate": 6.6921490577486495e-06, "loss": 1.403084635734558, "step": 582 }, { "epoch": 1.7859327217125383, "grad_norm": 0.78125, "learning_rate": 6.6701821350117155e-06, "loss": 1.2526099681854248, "step": 584 }, { "epoch": 1.7920489296636086, "grad_norm": 0.7890625, "learning_rate": 6.648185264188043e-06, "loss": 1.2811146974563599, "step": 586 }, { "epoch": 1.7981651376146788, "grad_norm": 0.6171875, "learning_rate": 6.626159008236316e-06, "loss": 1.2454664707183838, "step": 588 }, { "epoch": 1.8042813455657494, "grad_norm": 0.71875, "learning_rate": 6.60410393086726e-06, "loss": 1.2602325677871704, "step": 590 }, { "epoch": 1.8103975535168195, "grad_norm": 0.421875, "learning_rate": 6.582020596529224e-06, "loss": 1.2364270687103271, "step": 592 }, { "epoch": 1.81651376146789, "grad_norm": 0.462890625, "learning_rate": 6.559909570393723e-06, "loss": 1.2236618995666504, "step": 594 }, { "epoch": 1.8226299694189603, "grad_norm": 1.109375, "learning_rate": 6.537771418340981e-06, "loss": 1.3950483798980713, "step": 596 }, { "epoch": 1.8287461773700304, "grad_norm": 0.578125, "learning_rate": 6.515606706945448e-06, "loss": 1.2344207763671875, "step": 598 }, { "epoch": 1.834862385321101, "grad_norm": 0.62890625, "learning_rate": 6.493416003461296e-06, "loss": 1.335288643836975, "step": 600 }, { "epoch": 1.8409785932721712, "grad_norm": 0.515625, "learning_rate": 6.4711998758079064e-06, "loss": 1.255522608757019, "step": 602 }, { "epoch": 1.8470948012232415, "grad_norm": 0.392578125, "learning_rate": 6.448958892555332e-06, "loss": 1.2738847732543945, "step": 604 }, { "epoch": 1.853211009174312, "grad_norm": 0.37890625, "learning_rate": 6.426693622909742e-06, "loss": 1.2251421213150024, "step": 606 }, { "epoch": 1.8593272171253823, "grad_norm": 0.58203125, "learning_rate": 6.404404636698869e-06, "loss": 1.1613845825195312, "step": 608 }, { "epoch": 1.8654434250764527, "grad_norm": 0.47265625, "learning_rate": 6.3820925043574074e-06, "loss": 1.288172721862793, "step": 610 }, { "epoch": 1.8715596330275228, "grad_norm": 1.2109375, "learning_rate": 6.35975779691243e-06, "loss": 1.2886998653411865, "step": 612 }, { "epoch": 1.8776758409785934, "grad_norm": 0.72265625, "learning_rate": 6.337401085968759e-06, "loss": 1.286860466003418, "step": 614 }, { "epoch": 1.8837920489296636, "grad_norm": 0.46484375, "learning_rate": 6.3150229436943514e-06, "loss": 1.2472259998321533, "step": 616 }, { "epoch": 1.889908256880734, "grad_norm": 0.796875, "learning_rate": 6.2926239428056456e-06, "loss": 1.309545874595642, "step": 618 }, { "epoch": 1.8960244648318043, "grad_norm": 0.546875, "learning_rate": 6.270204656552908e-06, "loss": 1.2884358167648315, "step": 620 }, { "epoch": 1.9021406727828745, "grad_norm": 0.61328125, "learning_rate": 6.247765658705564e-06, "loss": 1.2543675899505615, "step": 622 }, { "epoch": 1.908256880733945, "grad_norm": 0.3515625, "learning_rate": 6.225307523537509e-06, "loss": 1.1704795360565186, "step": 624 }, { "epoch": 1.9143730886850152, "grad_norm": 0.921875, "learning_rate": 6.2028308258124135e-06, "loss": 1.362220048904419, "step": 626 }, { "epoch": 1.9204892966360856, "grad_norm": 1.078125, "learning_rate": 6.180336140769015e-06, "loss": 1.3805466890335083, "step": 628 }, { "epoch": 1.926605504587156, "grad_norm": 0.60546875, "learning_rate": 6.157824044106394e-06, "loss": 1.3186891078948975, "step": 630 }, { "epoch": 1.9327217125382263, "grad_norm": 1.0625, "learning_rate": 6.13529511196924e-06, "loss": 1.1534855365753174, "step": 632 }, { "epoch": 1.9388379204892967, "grad_norm": 0.80859375, "learning_rate": 6.112749920933111e-06, "loss": 1.2515051364898682, "step": 634 }, { "epoch": 1.9449541284403669, "grad_norm": 1.4140625, "learning_rate": 6.090189047989665e-06, "loss": 1.4018653631210327, "step": 636 }, { "epoch": 1.9510703363914375, "grad_norm": 0.5625, "learning_rate": 6.067613070531912e-06, "loss": 1.300402283668518, "step": 638 }, { "epoch": 1.9571865443425076, "grad_norm": 0.6015625, "learning_rate": 6.045022566339419e-06, "loss": 1.3779313564300537, "step": 640 }, { "epoch": 1.963302752293578, "grad_norm": 0.439453125, "learning_rate": 6.022418113563536e-06, "loss": 1.2664169073104858, "step": 642 }, { "epoch": 1.9694189602446484, "grad_norm": 0.72265625, "learning_rate": 5.999800290712594e-06, "loss": 1.2255876064300537, "step": 644 }, { "epoch": 1.9755351681957185, "grad_norm": 0.34375, "learning_rate": 5.9771696766370965e-06, "loss": 1.3016749620437622, "step": 646 }, { "epoch": 1.981651376146789, "grad_norm": 0.443359375, "learning_rate": 5.9545268505149114e-06, "loss": 1.2409298419952393, "step": 648 }, { "epoch": 1.9877675840978593, "grad_norm": 0.85546875, "learning_rate": 5.931872391836446e-06, "loss": 1.3296973705291748, "step": 650 }, { "epoch": 1.9938837920489296, "grad_norm": 0.5703125, "learning_rate": 5.909206880389813e-06, "loss": 1.376185655593872, "step": 652 }, { "epoch": 2.0, "grad_norm": 0.80078125, "learning_rate": 5.8865308962459976e-06, "loss": 1.2528204917907715, "step": 654 }, { "epoch": 2.00611620795107, "grad_norm": 0.45703125, "learning_rate": 5.863845019744007e-06, "loss": 1.1687815189361572, "step": 656 }, { "epoch": 2.0122324159021407, "grad_norm": 0.7734375, "learning_rate": 5.841149831476024e-06, "loss": 1.2196176052093506, "step": 658 }, { "epoch": 2.018348623853211, "grad_norm": 0.4453125, "learning_rate": 5.81844591227254e-06, "loss": 1.261337399482727, "step": 660 }, { "epoch": 2.0244648318042815, "grad_norm": 0.490234375, "learning_rate": 5.795733843187496e-06, "loss": 1.2313090562820435, "step": 662 }, { "epoch": 2.0305810397553516, "grad_norm": 1.1640625, "learning_rate": 5.773014205483414e-06, "loss": 1.2076407670974731, "step": 664 }, { "epoch": 2.036697247706422, "grad_norm": 0.5859375, "learning_rate": 5.750287580616511e-06, "loss": 1.1940546035766602, "step": 666 }, { "epoch": 2.0428134556574924, "grad_norm": 1.3203125, "learning_rate": 5.7275545502218274e-06, "loss": 1.0421754121780396, "step": 668 }, { "epoch": 2.0489296636085625, "grad_norm": 0.54296875, "learning_rate": 5.704815696098337e-06, "loss": 1.2445980310440063, "step": 670 }, { "epoch": 2.055045871559633, "grad_norm": 0.50390625, "learning_rate": 5.68207160019406e-06, "loss": 1.2573648691177368, "step": 672 }, { "epoch": 2.0611620795107033, "grad_norm": 0.498046875, "learning_rate": 5.659322844591166e-06, "loss": 1.3194655179977417, "step": 674 }, { "epoch": 2.067278287461774, "grad_norm": 0.51171875, "learning_rate": 5.636570011491082e-06, "loss": 1.2315115928649902, "step": 676 }, { "epoch": 2.073394495412844, "grad_norm": 0.486328125, "learning_rate": 5.613813683199582e-06, "loss": 1.1668107509613037, "step": 678 }, { "epoch": 2.079510703363914, "grad_norm": 0.51953125, "learning_rate": 5.591054442111901e-06, "loss": 1.1951708793640137, "step": 680 }, { "epoch": 2.085626911314985, "grad_norm": 0.384765625, "learning_rate": 5.568292870697812e-06, "loss": 1.1300991773605347, "step": 682 }, { "epoch": 2.091743119266055, "grad_norm": 0.71484375, "learning_rate": 5.545529551486731e-06, "loss": 1.269416332244873, "step": 684 }, { "epoch": 2.0978593272171255, "grad_norm": 0.81640625, "learning_rate": 5.522765067052805e-06, "loss": 1.1883726119995117, "step": 686 }, { "epoch": 2.1039755351681957, "grad_norm": 0.482421875, "learning_rate": 5.500000000000001e-06, "loss": 1.172653317451477, "step": 688 }, { "epoch": 2.1100917431192663, "grad_norm": 0.6796875, "learning_rate": 5.477234932947196e-06, "loss": 1.2290334701538086, "step": 690 }, { "epoch": 2.1162079510703364, "grad_norm": 1.4453125, "learning_rate": 5.45447044851327e-06, "loss": 1.1191812753677368, "step": 692 }, { "epoch": 2.1223241590214066, "grad_norm": 0.43359375, "learning_rate": 5.431707129302188e-06, "loss": 1.3137654066085815, "step": 694 }, { "epoch": 2.128440366972477, "grad_norm": 0.734375, "learning_rate": 5.4089455578881005e-06, "loss": 1.1786179542541504, "step": 696 }, { "epoch": 2.1345565749235473, "grad_norm": 0.419921875, "learning_rate": 5.386186316800418e-06, "loss": 1.1776201725006104, "step": 698 }, { "epoch": 2.140672782874618, "grad_norm": 0.482421875, "learning_rate": 5.36342998850892e-06, "loss": 1.18330979347229, "step": 700 }, { "epoch": 2.146788990825688, "grad_norm": 0.68359375, "learning_rate": 5.340677155408835e-06, "loss": 1.2524994611740112, "step": 702 }, { "epoch": 2.1529051987767582, "grad_norm": 0.62109375, "learning_rate": 5.317928399805943e-06, "loss": 1.2536473274230957, "step": 704 }, { "epoch": 2.159021406727829, "grad_norm": 0.46875, "learning_rate": 5.295184303901665e-06, "loss": 1.1864341497421265, "step": 706 }, { "epoch": 2.165137614678899, "grad_norm": 0.439453125, "learning_rate": 5.272445449778175e-06, "loss": 1.2302113771438599, "step": 708 }, { "epoch": 2.1712538226299696, "grad_norm": 0.447265625, "learning_rate": 5.249712419383492e-06, "loss": 1.153498888015747, "step": 710 }, { "epoch": 2.1773700305810397, "grad_norm": 0.4765625, "learning_rate": 5.226985794516587e-06, "loss": 1.2334654331207275, "step": 712 }, { "epoch": 2.18348623853211, "grad_norm": 0.4453125, "learning_rate": 5.204266156812504e-06, "loss": 1.1027376651763916, "step": 714 }, { "epoch": 2.1896024464831805, "grad_norm": 4.0625, "learning_rate": 5.181554087727462e-06, "loss": 1.2752158641815186, "step": 716 }, { "epoch": 2.1957186544342506, "grad_norm": 0.5625, "learning_rate": 5.158850168523979e-06, "loss": 1.2342238426208496, "step": 718 }, { "epoch": 2.2018348623853212, "grad_norm": 0.7421875, "learning_rate": 5.136154980255995e-06, "loss": 1.2153668403625488, "step": 720 }, { "epoch": 2.2079510703363914, "grad_norm": 0.6796875, "learning_rate": 5.1134691037540055e-06, "loss": 1.2085171937942505, "step": 722 }, { "epoch": 2.214067278287462, "grad_norm": 0.80859375, "learning_rate": 5.090793119610189e-06, "loss": 1.230190634727478, "step": 724 }, { "epoch": 2.220183486238532, "grad_norm": 0.65625, "learning_rate": 5.068127608163557e-06, "loss": 1.1547964811325073, "step": 726 }, { "epoch": 2.2262996941896023, "grad_norm": 0.57421875, "learning_rate": 5.045473149485091e-06, "loss": 1.2784456014633179, "step": 728 }, { "epoch": 2.232415902140673, "grad_norm": 0.734375, "learning_rate": 5.022830323362905e-06, "loss": 1.1994041204452515, "step": 730 }, { "epoch": 2.238532110091743, "grad_norm": 0.59375, "learning_rate": 5.000199709287408e-06, "loss": 1.1957271099090576, "step": 732 }, { "epoch": 2.2446483180428136, "grad_norm": 0.70703125, "learning_rate": 4.9775818864364635e-06, "loss": 1.2446789741516113, "step": 734 }, { "epoch": 2.2507645259938838, "grad_norm": 0.486328125, "learning_rate": 4.954977433660583e-06, "loss": 1.1822783946990967, "step": 736 }, { "epoch": 2.2568807339449544, "grad_norm": 0.60546875, "learning_rate": 4.9323869294680915e-06, "loss": 1.1413577795028687, "step": 738 }, { "epoch": 2.2629969418960245, "grad_norm": 0.49609375, "learning_rate": 4.909810952010336e-06, "loss": 1.1892144680023193, "step": 740 }, { "epoch": 2.2691131498470947, "grad_norm": 0.498046875, "learning_rate": 4.887250079066892e-06, "loss": 1.2589919567108154, "step": 742 }, { "epoch": 2.2752293577981653, "grad_norm": 0.9375, "learning_rate": 4.86470488803076e-06, "loss": 1.2584980726242065, "step": 744 }, { "epoch": 2.2813455657492354, "grad_norm": 0.435546875, "learning_rate": 4.842175955893608e-06, "loss": 1.1710209846496582, "step": 746 }, { "epoch": 2.287461773700306, "grad_norm": 0.59765625, "learning_rate": 4.819663859230986e-06, "loss": 1.2968641519546509, "step": 748 }, { "epoch": 2.293577981651376, "grad_norm": 1.2109375, "learning_rate": 4.797169174187588e-06, "loss": 1.198433756828308, "step": 750 }, { "epoch": 2.2996941896024463, "grad_norm": 0.6875, "learning_rate": 4.774692476462493e-06, "loss": 1.296976089477539, "step": 752 }, { "epoch": 2.305810397553517, "grad_norm": 0.63671875, "learning_rate": 4.752234341294438e-06, "loss": 1.2286152839660645, "step": 754 }, { "epoch": 2.311926605504587, "grad_norm": 0.87890625, "learning_rate": 4.729795343447093e-06, "loss": 1.2850275039672852, "step": 756 }, { "epoch": 2.3180428134556577, "grad_norm": 0.578125, "learning_rate": 4.707376057194356e-06, "loss": 1.2537508010864258, "step": 758 }, { "epoch": 2.324159021406728, "grad_norm": 0.58984375, "learning_rate": 4.68497705630565e-06, "loss": 1.1948941946029663, "step": 760 }, { "epoch": 2.330275229357798, "grad_norm": 0.51171875, "learning_rate": 4.662598914031241e-06, "loss": 1.2438340187072754, "step": 762 }, { "epoch": 2.3363914373088686, "grad_norm": 0.62890625, "learning_rate": 4.6402422030875704e-06, "loss": 1.3103235960006714, "step": 764 }, { "epoch": 2.3425076452599387, "grad_norm": 1.1328125, "learning_rate": 4.617907495642594e-06, "loss": 1.1827704906463623, "step": 766 }, { "epoch": 2.3486238532110093, "grad_norm": 0.515625, "learning_rate": 4.595595363301133e-06, "loss": 1.1387625932693481, "step": 768 }, { "epoch": 2.3547400611620795, "grad_norm": 0.68359375, "learning_rate": 4.5733063770902595e-06, "loss": 1.2371636629104614, "step": 770 }, { "epoch": 2.3608562691131496, "grad_norm": 0.54296875, "learning_rate": 4.551041107444671e-06, "loss": 1.1606448888778687, "step": 772 }, { "epoch": 2.36697247706422, "grad_norm": 0.60546875, "learning_rate": 4.528800124192095e-06, "loss": 1.3499796390533447, "step": 774 }, { "epoch": 2.3730886850152904, "grad_norm": 0.388671875, "learning_rate": 4.506583996538705e-06, "loss": 1.1447316408157349, "step": 776 }, { "epoch": 2.379204892966361, "grad_norm": 0.6640625, "learning_rate": 4.484393293054553e-06, "loss": 1.190900444984436, "step": 778 }, { "epoch": 2.385321100917431, "grad_norm": 0.54296875, "learning_rate": 4.462228581659019e-06, "loss": 1.2503337860107422, "step": 780 }, { "epoch": 2.3914373088685017, "grad_norm": 0.609375, "learning_rate": 4.440090429606278e-06, "loss": 1.1737557649612427, "step": 782 }, { "epoch": 2.397553516819572, "grad_norm": 0.75390625, "learning_rate": 4.417979403470778e-06, "loss": 1.239940881729126, "step": 784 }, { "epoch": 2.4036697247706424, "grad_norm": 0.419921875, "learning_rate": 4.3958960691327425e-06, "loss": 1.1777243614196777, "step": 786 }, { "epoch": 2.4097859327217126, "grad_norm": 0.5859375, "learning_rate": 4.373840991763686e-06, "loss": 1.1661309003829956, "step": 788 }, { "epoch": 2.4159021406727827, "grad_norm": 0.466796875, "learning_rate": 4.3518147358119575e-06, "loss": 1.2908847332000732, "step": 790 }, { "epoch": 2.4220183486238533, "grad_norm": 0.58984375, "learning_rate": 4.329817864988285e-06, "loss": 1.261257290840149, "step": 792 }, { "epoch": 2.4281345565749235, "grad_norm": 0.486328125, "learning_rate": 4.307850942251351e-06, "loss": 1.0505046844482422, "step": 794 }, { "epoch": 2.434250764525994, "grad_norm": 0.68359375, "learning_rate": 4.285914529793392e-06, "loss": 1.260128378868103, "step": 796 }, { "epoch": 2.4403669724770642, "grad_norm": 0.72265625, "learning_rate": 4.2640091890257984e-06, "loss": 1.336702823638916, "step": 798 }, { "epoch": 2.4464831804281344, "grad_norm": 0.50390625, "learning_rate": 4.242135480564756e-06, "loss": 1.2336891889572144, "step": 800 }, { "epoch": 2.452599388379205, "grad_norm": 0.75390625, "learning_rate": 4.220293964216899e-06, "loss": 1.1661975383758545, "step": 802 }, { "epoch": 2.458715596330275, "grad_norm": 0.59765625, "learning_rate": 4.198485198964971e-06, "loss": 1.2408455610275269, "step": 804 }, { "epoch": 2.4648318042813457, "grad_norm": 0.48828125, "learning_rate": 4.176709742953536e-06, "loss": 1.1344859600067139, "step": 806 }, { "epoch": 2.470948012232416, "grad_norm": 5.84375, "learning_rate": 4.15496815347468e-06, "loss": 1.1564085483551025, "step": 808 }, { "epoch": 2.477064220183486, "grad_norm": 0.69921875, "learning_rate": 4.133260986953759e-06, "loss": 1.3386648893356323, "step": 810 }, { "epoch": 2.4831804281345566, "grad_norm": 0.423828125, "learning_rate": 4.111588798935146e-06, "loss": 1.1828325986862183, "step": 812 }, { "epoch": 2.489296636085627, "grad_norm": 0.703125, "learning_rate": 4.089952144068031e-06, "loss": 1.1244158744812012, "step": 814 }, { "epoch": 2.4954128440366974, "grad_norm": 0.68359375, "learning_rate": 4.068351576092204e-06, "loss": 1.2698228359222412, "step": 816 }, { "epoch": 2.5015290519877675, "grad_norm": 0.74609375, "learning_rate": 4.046787647823906e-06, "loss": 1.2006717920303345, "step": 818 }, { "epoch": 2.5076452599388377, "grad_norm": 1.375, "learning_rate": 4.025260911141664e-06, "loss": 1.217053771018982, "step": 820 }, { "epoch": 2.5137614678899083, "grad_norm": 0.53125, "learning_rate": 4.003771916972171e-06, "loss": 1.2399015426635742, "step": 822 }, { "epoch": 2.5198776758409784, "grad_norm": 0.59765625, "learning_rate": 3.982321215276195e-06, "loss": 1.1872673034667969, "step": 824 }, { "epoch": 2.525993883792049, "grad_norm": 1.2421875, "learning_rate": 3.960909355034491e-06, "loss": 1.2071783542633057, "step": 826 }, { "epoch": 2.532110091743119, "grad_norm": 0.58984375, "learning_rate": 3.939536884233762e-06, "loss": 1.2099813222885132, "step": 828 }, { "epoch": 2.5382262996941893, "grad_norm": 0.8203125, "learning_rate": 3.918204349852626e-06, "loss": 1.2038205862045288, "step": 830 }, { "epoch": 2.54434250764526, "grad_norm": 1.1015625, "learning_rate": 3.896912297847626e-06, "loss": 1.1809529066085815, "step": 832 }, { "epoch": 2.5504587155963305, "grad_norm": 0.57421875, "learning_rate": 3.875661273139246e-06, "loss": 1.1591264009475708, "step": 834 }, { "epoch": 2.5565749235474007, "grad_norm": 0.8359375, "learning_rate": 3.854451819597981e-06, "loss": 1.0593103170394897, "step": 836 }, { "epoch": 2.562691131498471, "grad_norm": 0.466796875, "learning_rate": 3.833284480030401e-06, "loss": 1.2778112888336182, "step": 838 }, { "epoch": 2.5688073394495414, "grad_norm": 0.6015625, "learning_rate": 3.81215979616527e-06, "loss": 1.153441309928894, "step": 840 }, { "epoch": 2.5749235474006116, "grad_norm": 0.71875, "learning_rate": 3.79107830863968e-06, "loss": 1.251842975616455, "step": 842 }, { "epoch": 2.581039755351682, "grad_norm": 0.77734375, "learning_rate": 3.7700405569852082e-06, "loss": 1.1608760356903076, "step": 844 }, { "epoch": 2.5871559633027523, "grad_norm": 0.96875, "learning_rate": 3.749047079614121e-06, "loss": 1.1455830335617065, "step": 846 }, { "epoch": 2.5932721712538225, "grad_norm": 0.390625, "learning_rate": 3.7280984138055842e-06, "loss": 1.201966643333435, "step": 848 }, { "epoch": 2.599388379204893, "grad_norm": 0.62890625, "learning_rate": 3.707195095691913e-06, "loss": 1.232427954673767, "step": 850 }, { "epoch": 2.6055045871559632, "grad_norm": 2.890625, "learning_rate": 3.6863376602448607e-06, "loss": 1.257423758506775, "step": 852 }, { "epoch": 2.611620795107034, "grad_norm": 0.50390625, "learning_rate": 3.665526641261914e-06, "loss": 1.154307246208191, "step": 854 }, { "epoch": 2.617737003058104, "grad_norm": 0.56640625, "learning_rate": 3.6447625713526415e-06, "loss": 1.3352923393249512, "step": 856 }, { "epoch": 2.623853211009174, "grad_norm": 0.59765625, "learning_rate": 3.6240459819250605e-06, "loss": 1.2940092086791992, "step": 858 }, { "epoch": 2.6299694189602447, "grad_norm": 1.984375, "learning_rate": 3.603377403172035e-06, "loss": 1.3129587173461914, "step": 860 }, { "epoch": 2.636085626911315, "grad_norm": 0.66015625, "learning_rate": 3.582757364057704e-06, "loss": 1.1294050216674805, "step": 862 }, { "epoch": 2.6422018348623855, "grad_norm": 1.0625, "learning_rate": 3.5621863923039533e-06, "loss": 1.1990245580673218, "step": 864 }, { "epoch": 2.6483180428134556, "grad_norm": 0.609375, "learning_rate": 3.5416650143768994e-06, "loss": 1.1870311498641968, "step": 866 }, { "epoch": 2.6544342507645258, "grad_norm": 0.44921875, "learning_rate": 3.5211937554734234e-06, "loss": 1.199330449104309, "step": 868 }, { "epoch": 2.6605504587155964, "grad_norm": 0.609375, "learning_rate": 3.5007731395077273e-06, "loss": 1.247740387916565, "step": 870 }, { "epoch": 2.6666666666666665, "grad_norm": 0.447265625, "learning_rate": 3.4804036890979207e-06, "loss": 1.1606550216674805, "step": 872 }, { "epoch": 2.672782874617737, "grad_norm": 0.48046875, "learning_rate": 3.460085925552653e-06, "loss": 1.1833080053329468, "step": 874 }, { "epoch": 2.6788990825688073, "grad_norm": 0.8984375, "learning_rate": 3.439820368857768e-06, "loss": 1.199750304222107, "step": 876 }, { "epoch": 2.6850152905198774, "grad_norm": 0.69140625, "learning_rate": 3.4196075376629976e-06, "loss": 1.1525050401687622, "step": 878 }, { "epoch": 2.691131498470948, "grad_norm": 1.7578125, "learning_rate": 3.3994479492686867e-06, "loss": 1.251511812210083, "step": 880 }, { "epoch": 2.6972477064220186, "grad_norm": 1.1640625, "learning_rate": 3.379342119612553e-06, "loss": 1.2259825468063354, "step": 882 }, { "epoch": 2.7033639143730888, "grad_norm": 0.39453125, "learning_rate": 3.3592905632564874e-06, "loss": 1.2564154863357544, "step": 884 }, { "epoch": 2.709480122324159, "grad_norm": 0.75, "learning_rate": 3.3392937933733804e-06, "loss": 1.1841342449188232, "step": 886 }, { "epoch": 2.7155963302752295, "grad_norm": 0.76953125, "learning_rate": 3.319352321733989e-06, "loss": 1.194476842880249, "step": 888 }, { "epoch": 2.7217125382262997, "grad_norm": 0.59765625, "learning_rate": 3.2994666586938473e-06, "loss": 1.254859209060669, "step": 890 }, { "epoch": 2.7278287461773703, "grad_norm": 0.58203125, "learning_rate": 3.2796373131801873e-06, "loss": 1.255743384361267, "step": 892 }, { "epoch": 2.7339449541284404, "grad_norm": 0.86328125, "learning_rate": 3.259864792678933e-06, "loss": 1.2186676263809204, "step": 894 }, { "epoch": 2.7400611620795106, "grad_norm": 0.6015625, "learning_rate": 3.2401496032217017e-06, "loss": 1.3046661615371704, "step": 896 }, { "epoch": 2.746177370030581, "grad_norm": 0.50390625, "learning_rate": 3.2204922493728576e-06, "loss": 1.3042587041854858, "step": 898 }, { "epoch": 2.7522935779816513, "grad_norm": 0.5234375, "learning_rate": 3.200893234216596e-06, "loss": 1.179953694343567, "step": 900 }, { "epoch": 2.758409785932722, "grad_norm": 0.515625, "learning_rate": 3.1813530593440693e-06, "loss": 1.2110344171524048, "step": 902 }, { "epoch": 2.764525993883792, "grad_norm": 0.8984375, "learning_rate": 3.1618722248405504e-06, "loss": 1.155335783958435, "step": 904 }, { "epoch": 2.770642201834862, "grad_norm": 0.578125, "learning_rate": 3.1424512292726315e-06, "loss": 1.1856063604354858, "step": 906 }, { "epoch": 2.776758409785933, "grad_norm": 1.1796875, "learning_rate": 3.123090569675472e-06, "loss": 1.142336368560791, "step": 908 }, { "epoch": 2.782874617737003, "grad_norm": 0.93359375, "learning_rate": 3.1037907415400674e-06, "loss": 1.143799066543579, "step": 910 }, { "epoch": 2.7889908256880735, "grad_norm": 1.0, "learning_rate": 3.0845522388005756e-06, "loss": 1.2811185121536255, "step": 912 }, { "epoch": 2.7951070336391437, "grad_norm": 0.81640625, "learning_rate": 3.0653755538216724e-06, "loss": 1.099307894706726, "step": 914 }, { "epoch": 2.801223241590214, "grad_norm": 0.58984375, "learning_rate": 3.046261177385954e-06, "loss": 1.1932672262191772, "step": 916 }, { "epoch": 2.8073394495412844, "grad_norm": 0.5859375, "learning_rate": 3.027209598681373e-06, "loss": 1.1614950895309448, "step": 918 }, { "epoch": 2.8134556574923546, "grad_norm": 1.625, "learning_rate": 3.008221305288722e-06, "loss": 1.3066401481628418, "step": 920 }, { "epoch": 2.819571865443425, "grad_norm": 0.71484375, "learning_rate": 2.9892967831691506e-06, "loss": 1.261734962463379, "step": 922 }, { "epoch": 2.8256880733944953, "grad_norm": 0.87109375, "learning_rate": 2.9704365166517337e-06, "loss": 1.2576831579208374, "step": 924 }, { "epoch": 2.8318042813455655, "grad_norm": 0.578125, "learning_rate": 2.9516409884210726e-06, "loss": 1.1941940784454346, "step": 926 }, { "epoch": 2.837920489296636, "grad_norm": 0.78515625, "learning_rate": 2.9329106795049445e-06, "loss": 1.2333204746246338, "step": 928 }, { "epoch": 2.8440366972477067, "grad_norm": 0.5234375, "learning_rate": 2.914246069261988e-06, "loss": 1.2176916599273682, "step": 930 }, { "epoch": 2.850152905198777, "grad_norm": 0.75390625, "learning_rate": 2.8956476353694368e-06, "loss": 1.2780966758728027, "step": 932 }, { "epoch": 2.856269113149847, "grad_norm": 1.09375, "learning_rate": 2.877115853810898e-06, "loss": 1.2115226984024048, "step": 934 }, { "epoch": 2.8623853211009176, "grad_norm": 0.65625, "learning_rate": 2.8586511988641634e-06, "loss": 1.189244031906128, "step": 936 }, { "epoch": 2.8685015290519877, "grad_norm": 0.56640625, "learning_rate": 2.8402541430890794e-06, "loss": 1.2004551887512207, "step": 938 }, { "epoch": 2.8746177370030583, "grad_norm": 0.84375, "learning_rate": 2.821925157315447e-06, "loss": 1.228663682937622, "step": 940 }, { "epoch": 2.8807339449541285, "grad_norm": 0.5703125, "learning_rate": 2.8036647106309744e-06, "loss": 1.2045689821243286, "step": 942 }, { "epoch": 2.8868501529051986, "grad_norm": 0.5859375, "learning_rate": 2.78547327036927e-06, "loss": 1.2150650024414062, "step": 944 }, { "epoch": 2.8929663608562692, "grad_norm": 0.65625, "learning_rate": 2.767351302097887e-06, "loss": 1.199387788772583, "step": 946 }, { "epoch": 2.8990825688073394, "grad_norm": 0.62109375, "learning_rate": 2.7492992696064013e-06, "loss": 1.2142434120178223, "step": 948 }, { "epoch": 2.90519877675841, "grad_norm": 0.76953125, "learning_rate": 2.731317634894548e-06, "loss": 1.2693067789077759, "step": 950 }, { "epoch": 2.91131498470948, "grad_norm": 1.4921875, "learning_rate": 2.7134068581603936e-06, "loss": 1.2424131631851196, "step": 952 }, { "epoch": 2.9174311926605503, "grad_norm": 0.625, "learning_rate": 2.6955673977885566e-06, "loss": 1.2381134033203125, "step": 954 }, { "epoch": 2.923547400611621, "grad_norm": 0.8046875, "learning_rate": 2.677799710338486e-06, "loss": 1.2375258207321167, "step": 956 }, { "epoch": 2.929663608562691, "grad_norm": 1.125, "learning_rate": 2.660104250532764e-06, "loss": 1.129172921180725, "step": 958 }, { "epoch": 2.9357798165137616, "grad_norm": 0.5, "learning_rate": 2.6424814712454773e-06, "loss": 1.1203192472457886, "step": 960 }, { "epoch": 2.941896024464832, "grad_norm": 1.046875, "learning_rate": 2.624931823490625e-06, "loss": 1.2383675575256348, "step": 962 }, { "epoch": 2.948012232415902, "grad_norm": 0.6171875, "learning_rate": 2.607455756410573e-06, "loss": 1.1556285619735718, "step": 964 }, { "epoch": 2.9541284403669725, "grad_norm": 0.90625, "learning_rate": 2.5900537172645624e-06, "loss": 1.211835503578186, "step": 966 }, { "epoch": 2.9602446483180427, "grad_norm": 0.8046875, "learning_rate": 2.5727261514172586e-06, "loss": 1.1909599304199219, "step": 968 }, { "epoch": 2.9663608562691133, "grad_norm": 0.7265625, "learning_rate": 2.55547350232736e-06, "loss": 1.2073407173156738, "step": 970 }, { "epoch": 2.9724770642201834, "grad_norm": 0.498046875, "learning_rate": 2.5382962115362454e-06, "loss": 1.202832818031311, "step": 972 }, { "epoch": 2.9785932721712536, "grad_norm": 0.62109375, "learning_rate": 2.521194718656669e-06, "loss": 1.2078254222869873, "step": 974 }, { "epoch": 2.984709480122324, "grad_norm": 0.498046875, "learning_rate": 2.504169461361518e-06, "loss": 1.1780730485916138, "step": 976 }, { "epoch": 2.9908256880733948, "grad_norm": 1.0546875, "learning_rate": 2.487220875372606e-06, "loss": 1.1677711009979248, "step": 978 }, { "epoch": 2.996941896024465, "grad_norm": 0.984375, "learning_rate": 2.470349394449524e-06, "loss": 1.2224700450897217, "step": 980 }, { "epoch": 3.003058103975535, "grad_norm": 0.5859375, "learning_rate": 2.453555450378535e-06, "loss": 1.2254760265350342, "step": 982 }, { "epoch": 3.0091743119266057, "grad_norm": 1.359375, "learning_rate": 2.436839472961534e-06, "loss": 1.235056757926941, "step": 984 }, { "epoch": 3.015290519877676, "grad_norm": 0.53125, "learning_rate": 2.4202018900050327e-06, "loss": 1.2022202014923096, "step": 986 }, { "epoch": 3.021406727828746, "grad_norm": 0.470703125, "learning_rate": 2.4036431273092238e-06, "loss": 1.2913790941238403, "step": 988 }, { "epoch": 3.0275229357798166, "grad_norm": 0.6640625, "learning_rate": 2.387163608657078e-06, "loss": 1.2257859706878662, "step": 990 }, { "epoch": 3.0336391437308867, "grad_norm": 0.453125, "learning_rate": 2.3707637558034994e-06, "loss": 1.173649787902832, "step": 992 }, { "epoch": 3.0397553516819573, "grad_norm": 0.89453125, "learning_rate": 2.3544439884645317e-06, "loss": 1.2261406183242798, "step": 994 }, { "epoch": 3.0458715596330275, "grad_norm": 1.421875, "learning_rate": 2.3382047243066163e-06, "loss": 1.132150650024414, "step": 996 }, { "epoch": 3.051987767584098, "grad_norm": 0.470703125, "learning_rate": 2.3220463789359014e-06, "loss": 1.1366033554077148, "step": 998 }, { "epoch": 3.058103975535168, "grad_norm": 0.953125, "learning_rate": 2.30596936588761e-06, "loss": 1.1769903898239136, "step": 1000 }, { "epoch": 3.0642201834862384, "grad_norm": 0.427734375, "learning_rate": 2.2899740966154526e-06, "loss": 1.2203010320663452, "step": 1002 }, { "epoch": 3.070336391437309, "grad_norm": 0.578125, "learning_rate": 2.274060980481098e-06, "loss": 1.077088475227356, "step": 1004 }, { "epoch": 3.076452599388379, "grad_norm": 0.384765625, "learning_rate": 2.2582304247436963e-06, "loss": 1.177517056465149, "step": 1006 }, { "epoch": 3.0825688073394497, "grad_norm": 0.408203125, "learning_rate": 2.2424828345494575e-06, "loss": 1.0615501403808594, "step": 1008 }, { "epoch": 3.08868501529052, "grad_norm": 0.51953125, "learning_rate": 2.226818612921281e-06, "loss": 1.257022738456726, "step": 1010 }, { "epoch": 3.09480122324159, "grad_norm": 0.57421875, "learning_rate": 2.2112381607484417e-06, "loss": 1.3333863019943237, "step": 1012 }, { "epoch": 3.1009174311926606, "grad_norm": 0.53125, "learning_rate": 2.195741876776331e-06, "loss": 1.116982102394104, "step": 1014 }, { "epoch": 3.1070336391437308, "grad_norm": 0.484375, "learning_rate": 2.180330157596251e-06, "loss": 1.1025663614273071, "step": 1016 }, { "epoch": 3.1131498470948014, "grad_norm": 0.84375, "learning_rate": 2.1650033976352645e-06, "loss": 1.1931098699569702, "step": 1018 }, { "epoch": 3.1192660550458715, "grad_norm": 0.58984375, "learning_rate": 2.1497619891461016e-06, "loss": 1.2750816345214844, "step": 1020 }, { "epoch": 3.1253822629969417, "grad_norm": 0.578125, "learning_rate": 2.134606322197119e-06, "loss": 1.200748324394226, "step": 1022 }, { "epoch": 3.1314984709480123, "grad_norm": 0.9375, "learning_rate": 2.119536784662321e-06, "loss": 1.1820026636123657, "step": 1024 }, { "epoch": 3.1376146788990824, "grad_norm": 0.50390625, "learning_rate": 2.1045537622114265e-06, "loss": 1.072840929031372, "step": 1026 }, { "epoch": 3.143730886850153, "grad_norm": 0.7265625, "learning_rate": 2.089657638300005e-06, "loss": 1.1731314659118652, "step": 1028 }, { "epoch": 3.149847094801223, "grad_norm": 0.55859375, "learning_rate": 2.0748487941596596e-06, "loss": 1.1329575777053833, "step": 1030 }, { "epoch": 3.1559633027522938, "grad_norm": 0.6328125, "learning_rate": 2.06012760878827e-06, "loss": 1.2068923711776733, "step": 1032 }, { "epoch": 3.162079510703364, "grad_norm": 0.5703125, "learning_rate": 2.045494458940295e-06, "loss": 1.1488394737243652, "step": 1034 }, { "epoch": 3.168195718654434, "grad_norm": 0.65625, "learning_rate": 2.0309497191171285e-06, "loss": 1.1287355422973633, "step": 1036 }, { "epoch": 3.1743119266055047, "grad_norm": 0.453125, "learning_rate": 2.0164937615575148e-06, "loss": 1.182981014251709, "step": 1038 }, { "epoch": 3.180428134556575, "grad_norm": 0.66015625, "learning_rate": 2.002126956228026e-06, "loss": 1.159349799156189, "step": 1040 }, { "epoch": 3.1865443425076454, "grad_norm": 0.89453125, "learning_rate": 1.9878496708135885e-06, "loss": 1.1993876695632935, "step": 1042 }, { "epoch": 3.1926605504587156, "grad_norm": 0.435546875, "learning_rate": 1.973662270708074e-06, "loss": 1.1298656463623047, "step": 1044 }, { "epoch": 3.198776758409786, "grad_norm": 0.5078125, "learning_rate": 1.959565119004951e-06, "loss": 1.1985409259796143, "step": 1046 }, { "epoch": 3.2048929663608563, "grad_norm": 0.9453125, "learning_rate": 1.9455585764879877e-06, "loss": 1.1955678462982178, "step": 1048 }, { "epoch": 3.2110091743119265, "grad_norm": 0.5703125, "learning_rate": 1.9316430016220223e-06, "loss": 1.1202224493026733, "step": 1050 }, { "epoch": 3.217125382262997, "grad_norm": 0.72265625, "learning_rate": 1.91781875054379e-06, "loss": 1.157238245010376, "step": 1052 }, { "epoch": 3.223241590214067, "grad_norm": 1.0390625, "learning_rate": 1.9040861770528047e-06, "loss": 1.1316120624542236, "step": 1054 }, { "epoch": 3.229357798165138, "grad_norm": 0.58984375, "learning_rate": 1.890445632602303e-06, "loss": 1.1989833116531372, "step": 1056 }, { "epoch": 3.235474006116208, "grad_norm": 1.828125, "learning_rate": 1.876897466290259e-06, "loss": 1.222222089767456, "step": 1058 }, { "epoch": 3.241590214067278, "grad_norm": 0.63671875, "learning_rate": 1.8634420248504382e-06, "loss": 1.2111024856567383, "step": 1060 }, { "epoch": 3.2477064220183487, "grad_norm": 0.5859375, "learning_rate": 1.8500796526435305e-06, "loss": 1.172393560409546, "step": 1062 }, { "epoch": 3.253822629969419, "grad_norm": 2.328125, "learning_rate": 1.8368106916483358e-06, "loss": 1.1863235235214233, "step": 1064 }, { "epoch": 3.2599388379204894, "grad_norm": 0.53125, "learning_rate": 1.8236354814530113e-06, "loss": 1.29865562915802, "step": 1066 }, { "epoch": 3.2660550458715596, "grad_norm": 0.5, "learning_rate": 1.8105543592463803e-06, "loss": 1.261027455329895, "step": 1068 }, { "epoch": 3.2721712538226297, "grad_norm": 0.421875, "learning_rate": 1.7975676598093042e-06, "loss": 1.2421050071716309, "step": 1070 }, { "epoch": 3.2782874617737003, "grad_norm": 1.2578125, "learning_rate": 1.784675715506113e-06, "loss": 1.274834394454956, "step": 1072 }, { "epoch": 3.2844036697247705, "grad_norm": 0.87890625, "learning_rate": 1.7718788562760992e-06, "loss": 1.2069604396820068, "step": 1074 }, { "epoch": 3.290519877675841, "grad_norm": 0.63671875, "learning_rate": 1.7591774096250736e-06, "loss": 1.1289021968841553, "step": 1076 }, { "epoch": 3.2966360856269112, "grad_norm": 0.79296875, "learning_rate": 1.7465717006169887e-06, "loss": 1.2350070476531982, "step": 1078 }, { "epoch": 3.302752293577982, "grad_norm": 0.69921875, "learning_rate": 1.734062051865609e-06, "loss": 1.1916759014129639, "step": 1080 }, { "epoch": 3.308868501529052, "grad_norm": 0.5390625, "learning_rate": 1.7216487835262635e-06, "loss": 1.1767183542251587, "step": 1082 }, { "epoch": 3.314984709480122, "grad_norm": 1.0859375, "learning_rate": 1.7093322132876485e-06, "loss": 1.1724700927734375, "step": 1084 }, { "epoch": 3.3211009174311927, "grad_norm": 0.494140625, "learning_rate": 1.6971126563636977e-06, "loss": 1.1266517639160156, "step": 1086 }, { "epoch": 3.327217125382263, "grad_norm": 0.61328125, "learning_rate": 1.6849904254855151e-06, "loss": 1.211061716079712, "step": 1088 }, { "epoch": 3.3333333333333335, "grad_norm": 1.609375, "learning_rate": 1.6729658308933706e-06, "loss": 1.2213722467422485, "step": 1090 }, { "epoch": 3.3394495412844036, "grad_norm": 0.419921875, "learning_rate": 1.6610391803287611e-06, "loss": 1.1516450643539429, "step": 1092 }, { "epoch": 3.3455657492354742, "grad_norm": 1.03125, "learning_rate": 1.6492107790265338e-06, "loss": 1.1679214239120483, "step": 1094 }, { "epoch": 3.3516819571865444, "grad_norm": 0.796875, "learning_rate": 1.6374809297070766e-06, "loss": 1.2308049201965332, "step": 1096 }, { "epoch": 3.3577981651376145, "grad_norm": 0.53125, "learning_rate": 1.6258499325685673e-06, "loss": 1.181188941001892, "step": 1098 }, { "epoch": 3.363914373088685, "grad_norm": 0.9609375, "learning_rate": 1.6143180852792911e-06, "loss": 1.2255089282989502, "step": 1100 }, { "epoch": 3.3700305810397553, "grad_norm": 0.68359375, "learning_rate": 1.602885682970026e-06, "loss": 1.2569115161895752, "step": 1102 }, { "epoch": 3.376146788990826, "grad_norm": 0.92578125, "learning_rate": 1.5915530182264868e-06, "loss": 1.2300969362258911, "step": 1104 }, { "epoch": 3.382262996941896, "grad_norm": 2.84375, "learning_rate": 1.5803203810818366e-06, "loss": 1.2431167364120483, "step": 1106 }, { "epoch": 3.388379204892966, "grad_norm": 0.7890625, "learning_rate": 1.5691880590092671e-06, "loss": 1.2489876747131348, "step": 1108 }, { "epoch": 3.3944954128440368, "grad_norm": 0.5703125, "learning_rate": 1.558156336914634e-06, "loss": 1.1550531387329102, "step": 1110 }, { "epoch": 3.400611620795107, "grad_norm": 0.62890625, "learning_rate": 1.547225497129179e-06, "loss": 1.1249154806137085, "step": 1112 }, { "epoch": 3.4067278287461775, "grad_norm": 1.84375, "learning_rate": 1.5363958194022896e-06, "loss": 1.1979522705078125, "step": 1114 }, { "epoch": 3.4128440366972477, "grad_norm": 0.89453125, "learning_rate": 1.5256675808943488e-06, "loss": 1.1001931428909302, "step": 1116 }, { "epoch": 3.418960244648318, "grad_norm": 0.6484375, "learning_rate": 1.5150410561696382e-06, "loss": 1.1855971813201904, "step": 1118 }, { "epoch": 3.4250764525993884, "grad_norm": 0.69140625, "learning_rate": 1.5045165171893117e-06, "loss": 1.197637677192688, "step": 1120 }, { "epoch": 3.4311926605504586, "grad_norm": 0.61328125, "learning_rate": 1.4940942333044367e-06, "loss": 1.1402236223220825, "step": 1122 }, { "epoch": 3.437308868501529, "grad_norm": 0.50390625, "learning_rate": 1.4837744712490983e-06, "loss": 1.1059956550598145, "step": 1124 }, { "epoch": 3.4434250764525993, "grad_norm": 0.7734375, "learning_rate": 1.4735574951335752e-06, "loss": 1.1585502624511719, "step": 1126 }, { "epoch": 3.44954128440367, "grad_norm": 0.55859375, "learning_rate": 1.4634435664375784e-06, "loss": 1.2298681735992432, "step": 1128 }, { "epoch": 3.45565749235474, "grad_norm": 0.4765625, "learning_rate": 1.4534329440035599e-06, "loss": 1.1276212930679321, "step": 1130 }, { "epoch": 3.46177370030581, "grad_norm": 0.7421875, "learning_rate": 1.4435258840300897e-06, "loss": 1.1073015928268433, "step": 1132 }, { "epoch": 3.467889908256881, "grad_norm": 0.5703125, "learning_rate": 1.4337226400652977e-06, "loss": 1.1824053525924683, "step": 1134 }, { "epoch": 3.474006116207951, "grad_norm": 1.2265625, "learning_rate": 1.424023463000384e-06, "loss": 1.2478643655776978, "step": 1136 }, { "epoch": 3.4801223241590216, "grad_norm": 0.458984375, "learning_rate": 1.4144286010631993e-06, "loss": 1.2114766836166382, "step": 1138 }, { "epoch": 3.4862385321100917, "grad_norm": 0.57421875, "learning_rate": 1.4049382998118919e-06, "loss": 1.2164137363433838, "step": 1140 }, { "epoch": 3.4923547400611623, "grad_norm": 0.8046875, "learning_rate": 1.3955528021286208e-06, "loss": 1.115936517715454, "step": 1142 }, { "epoch": 3.4984709480122325, "grad_norm": 0.7421875, "learning_rate": 1.3862723482133437e-06, "loss": 1.1582000255584717, "step": 1144 }, { "epoch": 3.5045871559633026, "grad_norm": 1.203125, "learning_rate": 1.3770971755776667e-06, "loss": 1.1616395711898804, "step": 1146 }, { "epoch": 3.510703363914373, "grad_norm": 0.73828125, "learning_rate": 1.3680275190387677e-06, "loss": 1.20869779586792, "step": 1148 }, { "epoch": 3.5168195718654434, "grad_norm": 0.5859375, "learning_rate": 1.3590636107133849e-06, "loss": 1.2474617958068848, "step": 1150 }, { "epoch": 3.522935779816514, "grad_norm": 1.2578125, "learning_rate": 1.3502056800118784e-06, "loss": 1.2327600717544556, "step": 1152 }, { "epoch": 3.529051987767584, "grad_norm": 0.46484375, "learning_rate": 1.3414539536323568e-06, "loss": 1.1355574131011963, "step": 1154 }, { "epoch": 3.5351681957186543, "grad_norm": 1.0625, "learning_rate": 1.3328086555548764e-06, "loss": 1.1376428604125977, "step": 1156 }, { "epoch": 3.541284403669725, "grad_norm": 0.48828125, "learning_rate": 1.3242700070357098e-06, "loss": 1.128600001335144, "step": 1158 }, { "epoch": 3.547400611620795, "grad_norm": 1.65625, "learning_rate": 1.3158382266016803e-06, "loss": 1.2273775339126587, "step": 1160 }, { "epoch": 3.5535168195718656, "grad_norm": 0.53125, "learning_rate": 1.3075135300445746e-06, "loss": 1.1972393989562988, "step": 1162 }, { "epoch": 3.5596330275229358, "grad_norm": 0.62890625, "learning_rate": 1.2992961304156146e-06, "loss": 1.2698583602905273, "step": 1164 }, { "epoch": 3.565749235474006, "grad_norm": 0.54296875, "learning_rate": 1.2911862380200076e-06, "loss": 1.215325117111206, "step": 1166 }, { "epoch": 3.5718654434250765, "grad_norm": 0.53125, "learning_rate": 1.2831840604115647e-06, "loss": 1.1836117506027222, "step": 1168 }, { "epoch": 3.5779816513761467, "grad_norm": 1.0859375, "learning_rate": 1.2752898023873873e-06, "loss": 1.1673725843429565, "step": 1170 }, { "epoch": 3.5840978593272173, "grad_norm": 0.6171875, "learning_rate": 1.2675036659826251e-06, "loss": 1.1013611555099487, "step": 1172 }, { "epoch": 3.5902140672782874, "grad_norm": 0.55859375, "learning_rate": 1.2598258504653082e-06, "loss": 1.2070239782333374, "step": 1174 }, { "epoch": 3.5963302752293576, "grad_norm": 0.5234375, "learning_rate": 1.2522565523312456e-06, "loss": 1.1760621070861816, "step": 1176 }, { "epoch": 3.602446483180428, "grad_norm": 1.0625, "learning_rate": 1.2447959652989963e-06, "loss": 1.2546082735061646, "step": 1178 }, { "epoch": 3.6085626911314987, "grad_norm": 1.40625, "learning_rate": 1.2374442803049125e-06, "loss": 1.11211359500885, "step": 1180 }, { "epoch": 3.614678899082569, "grad_norm": 0.67578125, "learning_rate": 1.2302016854982504e-06, "loss": 1.1653016805648804, "step": 1182 }, { "epoch": 3.620795107033639, "grad_norm": 0.76953125, "learning_rate": 1.2230683662363599e-06, "loss": 1.0931107997894287, "step": 1184 }, { "epoch": 3.6269113149847096, "grad_norm": 0.53125, "learning_rate": 1.2160445050799346e-06, "loss": 1.1593706607818604, "step": 1186 }, { "epoch": 3.63302752293578, "grad_norm": 0.5859375, "learning_rate": 1.2091302817883444e-06, "loss": 1.2466744184494019, "step": 1188 }, { "epoch": 3.6391437308868504, "grad_norm": 0.69921875, "learning_rate": 1.2023258733150345e-06, "loss": 1.1520183086395264, "step": 1190 }, { "epoch": 3.6452599388379205, "grad_norm": 0.6875, "learning_rate": 1.195631453802994e-06, "loss": 1.1501617431640625, "step": 1192 }, { "epoch": 3.6513761467889907, "grad_norm": 0.5078125, "learning_rate": 1.1890471945803e-06, "loss": 1.1947115659713745, "step": 1194 }, { "epoch": 3.6574923547400613, "grad_norm": 0.82421875, "learning_rate": 1.1825732641557358e-06, "loss": 1.090171217918396, "step": 1196 }, { "epoch": 3.6636085626911314, "grad_norm": 0.486328125, "learning_rate": 1.1762098282144735e-06, "loss": 1.231759786605835, "step": 1198 }, { "epoch": 3.669724770642202, "grad_norm": 0.5859375, "learning_rate": 1.169957049613839e-06, "loss": 1.2382960319519043, "step": 1200 }, { "epoch": 3.675840978593272, "grad_norm": 0.46484375, "learning_rate": 1.1638150883791386e-06, "loss": 1.1713348627090454, "step": 1202 }, { "epoch": 3.6819571865443423, "grad_norm": 0.625, "learning_rate": 1.157784101699567e-06, "loss": 1.1755608320236206, "step": 1204 }, { "epoch": 3.688073394495413, "grad_norm": 0.609375, "learning_rate": 1.1518642439241849e-06, "loss": 1.2025344371795654, "step": 1206 }, { "epoch": 3.694189602446483, "grad_norm": 1.2265625, "learning_rate": 1.146055666557966e-06, "loss": 1.2071685791015625, "step": 1208 }, { "epoch": 3.7003058103975537, "grad_norm": 0.703125, "learning_rate": 1.140358518257922e-06, "loss": 1.1952728033065796, "step": 1210 }, { "epoch": 3.706422018348624, "grad_norm": 0.60546875, "learning_rate": 1.1347729448292953e-06, "loss": 1.21987783908844, "step": 1212 }, { "epoch": 3.712538226299694, "grad_norm": 0.7421875, "learning_rate": 1.129299089221832e-06, "loss": 1.2178161144256592, "step": 1214 }, { "epoch": 3.7186544342507646, "grad_norm": 0.68359375, "learning_rate": 1.1239370915261196e-06, "loss": 1.1406751871109009, "step": 1216 }, { "epoch": 3.7247706422018347, "grad_norm": 1.6484375, "learning_rate": 1.1186870889700013e-06, "loss": 1.1654596328735352, "step": 1218 }, { "epoch": 3.7308868501529053, "grad_norm": 2.328125, "learning_rate": 1.1135492159150676e-06, "loss": 1.2073957920074463, "step": 1220 }, { "epoch": 3.7370030581039755, "grad_norm": 0.609375, "learning_rate": 1.108523603853215e-06, "loss": 1.1250100135803223, "step": 1222 }, { "epoch": 3.7431192660550456, "grad_norm": 0.60546875, "learning_rate": 1.1036103814032804e-06, "loss": 1.2246984243392944, "step": 1224 }, { "epoch": 3.7492354740061162, "grad_norm": 0.58203125, "learning_rate": 1.0988096743077513e-06, "loss": 1.1390925645828247, "step": 1226 }, { "epoch": 3.7553516819571864, "grad_norm": 0.58203125, "learning_rate": 1.094121605429547e-06, "loss": 1.1992175579071045, "step": 1228 }, { "epoch": 3.761467889908257, "grad_norm": 0.4609375, "learning_rate": 1.089546294748873e-06, "loss": 1.0999352931976318, "step": 1230 }, { "epoch": 3.767584097859327, "grad_norm": 0.451171875, "learning_rate": 1.085083859360151e-06, "loss": 1.1122483015060425, "step": 1232 }, { "epoch": 3.7737003058103973, "grad_norm": 1.0625, "learning_rate": 1.0807344134690236e-06, "loss": 1.1888892650604248, "step": 1234 }, { "epoch": 3.779816513761468, "grad_norm": 0.59765625, "learning_rate": 1.0764980683894297e-06, "loss": 1.1580041646957397, "step": 1236 }, { "epoch": 3.7859327217125385, "grad_norm": 0.76171875, "learning_rate": 1.0723749325407564e-06, "loss": 1.176745891571045, "step": 1238 }, { "epoch": 3.7920489296636086, "grad_norm": 0.65234375, "learning_rate": 1.0683651114450641e-06, "loss": 1.1710706949234009, "step": 1240 }, { "epoch": 3.7981651376146788, "grad_norm": 2.578125, "learning_rate": 1.0644687077243864e-06, "loss": 1.1870887279510498, "step": 1242 }, { "epoch": 3.8042813455657494, "grad_norm": 0.76953125, "learning_rate": 1.0606858210981025e-06, "loss": 1.169495940208435, "step": 1244 }, { "epoch": 3.8103975535168195, "grad_norm": 0.66015625, "learning_rate": 1.0570165483803867e-06, "loss": 1.1190178394317627, "step": 1246 }, { "epoch": 3.81651376146789, "grad_norm": 1.15625, "learning_rate": 1.05346098347773e-06, "loss": 1.1436067819595337, "step": 1248 }, { "epoch": 3.8226299694189603, "grad_norm": 0.66796875, "learning_rate": 1.050019217386535e-06, "loss": 1.2288410663604736, "step": 1250 }, { "epoch": 3.8287461773700304, "grad_norm": 1.25, "learning_rate": 1.0466913381907914e-06, "loss": 1.2218413352966309, "step": 1252 }, { "epoch": 3.834862385321101, "grad_norm": 0.53515625, "learning_rate": 1.0434774310598166e-06, "loss": 1.208377480506897, "step": 1254 }, { "epoch": 3.840978593272171, "grad_norm": 0.5859375, "learning_rate": 1.04037757824608e-06, "loss": 1.1784342527389526, "step": 1256 }, { "epoch": 3.8470948012232418, "grad_norm": 1.0703125, "learning_rate": 1.0373918590830952e-06, "loss": 1.2136183977127075, "step": 1258 }, { "epoch": 3.853211009174312, "grad_norm": 0.5390625, "learning_rate": 1.0345203499833913e-06, "loss": 1.2747994661331177, "step": 1260 }, { "epoch": 3.859327217125382, "grad_norm": 0.73828125, "learning_rate": 1.0317631244365575e-06, "loss": 1.1638200283050537, "step": 1262 }, { "epoch": 3.8654434250764527, "grad_norm": 0.466796875, "learning_rate": 1.0291202530073602e-06, "loss": 1.2454450130462646, "step": 1264 }, { "epoch": 3.871559633027523, "grad_norm": 0.64453125, "learning_rate": 1.0265918033339392e-06, "loss": 1.1502002477645874, "step": 1266 }, { "epoch": 3.8776758409785934, "grad_norm": 0.9609375, "learning_rate": 1.0241778401260764e-06, "loss": 1.1322892904281616, "step": 1268 }, { "epoch": 3.8837920489296636, "grad_norm": 0.61328125, "learning_rate": 1.0218784251635382e-06, "loss": 1.1245934963226318, "step": 1270 }, { "epoch": 3.8899082568807337, "grad_norm": 0.82421875, "learning_rate": 1.0196936172944962e-06, "loss": 1.2275093793869019, "step": 1272 }, { "epoch": 3.8960244648318043, "grad_norm": 0.6328125, "learning_rate": 1.0176234724340201e-06, "loss": 1.2591514587402344, "step": 1274 }, { "epoch": 3.9021406727828745, "grad_norm": 0.46484375, "learning_rate": 1.0156680435626468e-06, "loss": 1.1828017234802246, "step": 1276 }, { "epoch": 3.908256880733945, "grad_norm": 0.6875, "learning_rate": 1.0138273807250244e-06, "loss": 1.1989636421203613, "step": 1278 }, { "epoch": 3.914373088685015, "grad_norm": 0.5234375, "learning_rate": 1.0121015310286318e-06, "loss": 1.1318210363388062, "step": 1280 }, { "epoch": 3.9204892966360854, "grad_norm": 0.51953125, "learning_rate": 1.0104905386425735e-06, "loss": 1.1387715339660645, "step": 1282 }, { "epoch": 3.926605504587156, "grad_norm": 0.486328125, "learning_rate": 1.0089944447964479e-06, "loss": 1.0994793176651, "step": 1284 }, { "epoch": 3.9327217125382266, "grad_norm": 0.5078125, "learning_rate": 1.0076132877792933e-06, "loss": 1.2001361846923828, "step": 1286 }, { "epoch": 3.9388379204892967, "grad_norm": 2.203125, "learning_rate": 1.0063471029386065e-06, "loss": 1.1622974872589111, "step": 1288 }, { "epoch": 3.944954128440367, "grad_norm": 1.015625, "learning_rate": 1.0051959226794407e-06, "loss": 1.170785903930664, "step": 1290 }, { "epoch": 3.9510703363914375, "grad_norm": 0.546875, "learning_rate": 1.004159776463573e-06, "loss": 1.1379996538162231, "step": 1292 }, { "epoch": 3.9571865443425076, "grad_norm": 1.046875, "learning_rate": 1.003238690808754e-06, "loss": 1.2565704584121704, "step": 1294 }, { "epoch": 3.963302752293578, "grad_norm": 0.609375, "learning_rate": 1.0024326892880253e-06, "loss": 1.1255217790603638, "step": 1296 }, { "epoch": 3.9694189602446484, "grad_norm": 0.58984375, "learning_rate": 1.0017417925291187e-06, "loss": 1.149346113204956, "step": 1298 }, { "epoch": 3.9755351681957185, "grad_norm": 0.46875, "learning_rate": 1.001166018213929e-06, "loss": 1.1946812868118286, "step": 1300 }, { "epoch": 3.981651376146789, "grad_norm": 1.3828125, "learning_rate": 1.0007053810780578e-06, "loss": 1.1319454908370972, "step": 1302 }, { "epoch": 3.9877675840978593, "grad_norm": 0.8203125, "learning_rate": 1.0003598929104407e-06, "loss": 1.1686453819274902, "step": 1304 }, { "epoch": 3.99388379204893, "grad_norm": 0.52734375, "learning_rate": 1.0001295625530423e-06, "loss": 1.130082607269287, "step": 1306 }, { "epoch": 4.0, "grad_norm": 0.78515625, "learning_rate": 1.0000143959006323e-06, "loss": 1.2041049003601074, "step": 1308 }, { "epoch": 4.0, "step": 1308, "total_flos": 3.2734142949973033e+18, "train_loss": 1.3248716153500641, "train_runtime": 16065.8623, "train_samples_per_second": 2.605, "train_steps_per_second": 0.081 } ], "logging_steps": 2, "max_steps": 1308, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 9999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2734142949973033e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }