{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9870598856455013, "eval_steps": 500, "global_step": 621, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004814926271441469, "grad_norm": 0.3190668225288391, "learning_rate": 9.98389694041868e-06, "loss": 14.3722, "step": 1 }, { "epoch": 0.009629852542882938, "grad_norm": 0.704807698726654, "learning_rate": 9.96779388083736e-06, "loss": 17.3468, "step": 2 }, { "epoch": 0.014444778814324405, "grad_norm": 0.5227249264717102, "learning_rate": 9.95169082125604e-06, "loss": 18.7806, "step": 3 }, { "epoch": 0.019259705085765876, "grad_norm": 0.34830373525619507, "learning_rate": 9.93558776167472e-06, "loss": 18.8868, "step": 4 }, { "epoch": 0.024074631357207343, "grad_norm": 0.36558371782302856, "learning_rate": 9.919484702093398e-06, "loss": 17.0113, "step": 5 }, { "epoch": 0.02888955762864881, "grad_norm": 0.464693546295166, "learning_rate": 9.903381642512077e-06, "loss": 16.2247, "step": 6 }, { "epoch": 0.03370448390009028, "grad_norm": 0.45501771569252014, "learning_rate": 9.887278582930757e-06, "loss": 15.7179, "step": 7 }, { "epoch": 0.03851941017153175, "grad_norm": 0.6688278317451477, "learning_rate": 9.871175523349438e-06, "loss": 18.7794, "step": 8 }, { "epoch": 0.043334336442973215, "grad_norm": 0.40696507692337036, "learning_rate": 9.855072463768118e-06, "loss": 16.3889, "step": 9 }, { "epoch": 0.048149262714414685, "grad_norm": 0.38113319873809814, "learning_rate": 9.838969404186796e-06, "loss": 16.7515, "step": 10 }, { "epoch": 0.052964188985856156, "grad_norm": 0.35052913427352905, "learning_rate": 9.822866344605476e-06, "loss": 15.7069, "step": 11 }, { "epoch": 0.05777911525729762, "grad_norm": 0.47708237171173096, "learning_rate": 9.806763285024155e-06, "loss": 16.9549, "step": 12 }, { "epoch": 0.0625940415287391, "grad_norm": 0.4960598945617676, "learning_rate": 9.790660225442835e-06, "loss": 16.8876, "step": 13 }, { "epoch": 0.06740896780018056, "grad_norm": 0.39951273798942566, "learning_rate": 9.774557165861515e-06, "loss": 15.1888, "step": 14 }, { "epoch": 0.07222389407162202, "grad_norm": 0.21550379693508148, "learning_rate": 9.758454106280194e-06, "loss": 14.6054, "step": 15 }, { "epoch": 0.0770388203430635, "grad_norm": 0.30602937936782837, "learning_rate": 9.742351046698874e-06, "loss": 16.4524, "step": 16 }, { "epoch": 0.08185374661450497, "grad_norm": 0.30777233839035034, "learning_rate": 9.726247987117554e-06, "loss": 14.3263, "step": 17 }, { "epoch": 0.08666867288594643, "grad_norm": 0.35533130168914795, "learning_rate": 9.710144927536233e-06, "loss": 16.2459, "step": 18 }, { "epoch": 0.09148359915738791, "grad_norm": 0.23820991814136505, "learning_rate": 9.694041867954911e-06, "loss": 14.2147, "step": 19 }, { "epoch": 0.09629852542882937, "grad_norm": 0.2193877398967743, "learning_rate": 9.677938808373591e-06, "loss": 14.4833, "step": 20 }, { "epoch": 0.10111345170027083, "grad_norm": 0.24645549058914185, "learning_rate": 9.66183574879227e-06, "loss": 14.6868, "step": 21 }, { "epoch": 0.10592837797171231, "grad_norm": 0.2614218592643738, "learning_rate": 9.64573268921095e-06, "loss": 15.4011, "step": 22 }, { "epoch": 0.11074330424315378, "grad_norm": 0.3114742040634155, "learning_rate": 9.62962962962963e-06, "loss": 14.9497, "step": 23 }, { "epoch": 0.11555823051459524, "grad_norm": 0.20465250313282013, "learning_rate": 9.61352657004831e-06, "loss": 13.4598, "step": 24 }, { "epoch": 0.12037315678603672, "grad_norm": 0.3349449336528778, "learning_rate": 9.59742351046699e-06, "loss": 14.3437, "step": 25 }, { "epoch": 0.1251880830574782, "grad_norm": 0.4164576828479767, "learning_rate": 9.581320450885669e-06, "loss": 14.5754, "step": 26 }, { "epoch": 0.13000300932891964, "grad_norm": 0.3533851206302643, "learning_rate": 9.565217391304349e-06, "loss": 14.7727, "step": 27 }, { "epoch": 0.13481793560036112, "grad_norm": 0.3998354375362396, "learning_rate": 9.549114331723028e-06, "loss": 13.1538, "step": 28 }, { "epoch": 0.1396328618718026, "grad_norm": 0.3069708049297333, "learning_rate": 9.533011272141708e-06, "loss": 13.2611, "step": 29 }, { "epoch": 0.14444778814324405, "grad_norm": 0.20584744215011597, "learning_rate": 9.516908212560388e-06, "loss": 14.7522, "step": 30 }, { "epoch": 0.14926271441468553, "grad_norm": 0.2097318172454834, "learning_rate": 9.500805152979067e-06, "loss": 15.0184, "step": 31 }, { "epoch": 0.154077640686127, "grad_norm": 0.3266746401786804, "learning_rate": 9.484702093397747e-06, "loss": 13.2641, "step": 32 }, { "epoch": 0.15889256695756845, "grad_norm": 0.2459367960691452, "learning_rate": 9.468599033816425e-06, "loss": 15.1449, "step": 33 }, { "epoch": 0.16370749322900993, "grad_norm": 0.4554983973503113, "learning_rate": 9.452495974235105e-06, "loss": 14.6538, "step": 34 }, { "epoch": 0.1685224195004514, "grad_norm": 0.3142286241054535, "learning_rate": 9.436392914653784e-06, "loss": 14.0927, "step": 35 }, { "epoch": 0.17333734577189286, "grad_norm": 0.2828330993652344, "learning_rate": 9.420289855072464e-06, "loss": 14.5898, "step": 36 }, { "epoch": 0.17815227204333434, "grad_norm": 0.25663697719573975, "learning_rate": 9.404186795491144e-06, "loss": 12.7267, "step": 37 }, { "epoch": 0.18296719831477581, "grad_norm": 0.4929574728012085, "learning_rate": 9.388083735909823e-06, "loss": 14.2236, "step": 38 }, { "epoch": 0.18778212458621726, "grad_norm": 0.414725661277771, "learning_rate": 9.371980676328503e-06, "loss": 13.9036, "step": 39 }, { "epoch": 0.19259705085765874, "grad_norm": 0.2808246910572052, "learning_rate": 9.355877616747183e-06, "loss": 14.7754, "step": 40 }, { "epoch": 0.19741197712910022, "grad_norm": 0.2846072316169739, "learning_rate": 9.339774557165862e-06, "loss": 13.519, "step": 41 }, { "epoch": 0.20222690340054167, "grad_norm": 0.2638435661792755, "learning_rate": 9.323671497584542e-06, "loss": 14.5145, "step": 42 }, { "epoch": 0.20704182967198315, "grad_norm": 0.22342148423194885, "learning_rate": 9.307568438003222e-06, "loss": 12.7617, "step": 43 }, { "epoch": 0.21185675594342462, "grad_norm": 0.2732909619808197, "learning_rate": 9.291465378421901e-06, "loss": 13.6826, "step": 44 }, { "epoch": 0.21667168221486607, "grad_norm": 0.23550738394260406, "learning_rate": 9.275362318840581e-06, "loss": 13.1958, "step": 45 }, { "epoch": 0.22148660848630755, "grad_norm": 0.2673870027065277, "learning_rate": 9.25925925925926e-06, "loss": 14.1934, "step": 46 }, { "epoch": 0.22630153475774903, "grad_norm": 0.303568571805954, "learning_rate": 9.243156199677939e-06, "loss": 11.8819, "step": 47 }, { "epoch": 0.23111646102919048, "grad_norm": 0.27822041511535645, "learning_rate": 9.227053140096618e-06, "loss": 11.8434, "step": 48 }, { "epoch": 0.23593138730063196, "grad_norm": 0.21598580479621887, "learning_rate": 9.210950080515298e-06, "loss": 13.0898, "step": 49 }, { "epoch": 0.24074631357207343, "grad_norm": 0.22329603135585785, "learning_rate": 9.194847020933978e-06, "loss": 12.1004, "step": 50 }, { "epoch": 0.24556123984351488, "grad_norm": 0.22152170538902283, "learning_rate": 9.178743961352658e-06, "loss": 13.8467, "step": 51 }, { "epoch": 0.2503761661149564, "grad_norm": 0.2549304664134979, "learning_rate": 9.162640901771337e-06, "loss": 13.3853, "step": 52 }, { "epoch": 0.2551910923863978, "grad_norm": 0.2308962047100067, "learning_rate": 9.146537842190017e-06, "loss": 12.9115, "step": 53 }, { "epoch": 0.2600060186578393, "grad_norm": 0.19011437892913818, "learning_rate": 9.130434782608697e-06, "loss": 12.7455, "step": 54 }, { "epoch": 0.26482094492928077, "grad_norm": 0.21280792355537415, "learning_rate": 9.114331723027376e-06, "loss": 13.4157, "step": 55 }, { "epoch": 0.26963587120072224, "grad_norm": 0.35571521520614624, "learning_rate": 9.098228663446056e-06, "loss": 13.961, "step": 56 }, { "epoch": 0.2744507974721637, "grad_norm": 0.25055205821990967, "learning_rate": 9.082125603864736e-06, "loss": 13.5945, "step": 57 }, { "epoch": 0.2792657237436052, "grad_norm": 0.22618041932582855, "learning_rate": 9.066022544283415e-06, "loss": 13.6034, "step": 58 }, { "epoch": 0.2840806500150466, "grad_norm": 0.2419959455728531, "learning_rate": 9.049919484702095e-06, "loss": 13.5142, "step": 59 }, { "epoch": 0.2888955762864881, "grad_norm": 0.3027523458003998, "learning_rate": 9.033816425120775e-06, "loss": 12.9825, "step": 60 }, { "epoch": 0.2937105025579296, "grad_norm": 0.1812627613544464, "learning_rate": 9.017713365539453e-06, "loss": 12.4197, "step": 61 }, { "epoch": 0.29852542882937105, "grad_norm": 0.2510731518268585, "learning_rate": 9.001610305958132e-06, "loss": 13.2717, "step": 62 }, { "epoch": 0.30334035510081253, "grad_norm": 0.2064312994480133, "learning_rate": 8.985507246376812e-06, "loss": 13.3493, "step": 63 }, { "epoch": 0.308155281372254, "grad_norm": 0.2627861797809601, "learning_rate": 8.969404186795492e-06, "loss": 13.3364, "step": 64 }, { "epoch": 0.31297020764369543, "grad_norm": 0.22463975846767426, "learning_rate": 8.953301127214171e-06, "loss": 12.0759, "step": 65 }, { "epoch": 0.3177851339151369, "grad_norm": 0.3166675865650177, "learning_rate": 8.937198067632851e-06, "loss": 12.8393, "step": 66 }, { "epoch": 0.3226000601865784, "grad_norm": 0.16428841650485992, "learning_rate": 8.92109500805153e-06, "loss": 12.6723, "step": 67 }, { "epoch": 0.32741498645801986, "grad_norm": 0.1815037578344345, "learning_rate": 8.90499194847021e-06, "loss": 12.6212, "step": 68 }, { "epoch": 0.33222991272946134, "grad_norm": 0.2504093050956726, "learning_rate": 8.888888888888888e-06, "loss": 12.6547, "step": 69 }, { "epoch": 0.3370448390009028, "grad_norm": 0.17379416525363922, "learning_rate": 8.87278582930757e-06, "loss": 10.5245, "step": 70 }, { "epoch": 0.34185976527234424, "grad_norm": 0.20780153572559357, "learning_rate": 8.85668276972625e-06, "loss": 11.1868, "step": 71 }, { "epoch": 0.3466746915437857, "grad_norm": 0.2680881917476654, "learning_rate": 8.840579710144929e-06, "loss": 11.9582, "step": 72 }, { "epoch": 0.3514896178152272, "grad_norm": 0.1777425855398178, "learning_rate": 8.824476650563609e-06, "loss": 11.3178, "step": 73 }, { "epoch": 0.3563045440866687, "grad_norm": 0.20199166238307953, "learning_rate": 8.808373590982288e-06, "loss": 12.5066, "step": 74 }, { "epoch": 0.36111947035811015, "grad_norm": 0.23542606830596924, "learning_rate": 8.792270531400966e-06, "loss": 11.885, "step": 75 }, { "epoch": 0.36593439662955163, "grad_norm": 0.23038695752620697, "learning_rate": 8.776167471819646e-06, "loss": 11.1026, "step": 76 }, { "epoch": 0.3707493229009931, "grad_norm": 0.2536081075668335, "learning_rate": 8.760064412238326e-06, "loss": 13.065, "step": 77 }, { "epoch": 0.37556424917243453, "grad_norm": 0.2599170207977295, "learning_rate": 8.743961352657005e-06, "loss": 12.4683, "step": 78 }, { "epoch": 0.380379175443876, "grad_norm": 0.23882345855236053, "learning_rate": 8.727858293075685e-06, "loss": 11.6778, "step": 79 }, { "epoch": 0.3851941017153175, "grad_norm": 0.23855774104595184, "learning_rate": 8.711755233494365e-06, "loss": 13.026, "step": 80 }, { "epoch": 0.39000902798675896, "grad_norm": 0.26537057757377625, "learning_rate": 8.695652173913044e-06, "loss": 12.4535, "step": 81 }, { "epoch": 0.39482395425820044, "grad_norm": 0.21693478524684906, "learning_rate": 8.679549114331724e-06, "loss": 12.9436, "step": 82 }, { "epoch": 0.3996388805296419, "grad_norm": 0.162302166223526, "learning_rate": 8.663446054750402e-06, "loss": 11.3558, "step": 83 }, { "epoch": 0.40445380680108334, "grad_norm": 0.271846741437912, "learning_rate": 8.647342995169082e-06, "loss": 11.0237, "step": 84 }, { "epoch": 0.4092687330725248, "grad_norm": 0.16958190500736237, "learning_rate": 8.631239935587761e-06, "loss": 11.3822, "step": 85 }, { "epoch": 0.4140836593439663, "grad_norm": 0.19066102802753448, "learning_rate": 8.615136876006443e-06, "loss": 11.6137, "step": 86 }, { "epoch": 0.41889858561540777, "grad_norm": 0.21410760283470154, "learning_rate": 8.599033816425122e-06, "loss": 11.1353, "step": 87 }, { "epoch": 0.42371351188684925, "grad_norm": 0.17947272956371307, "learning_rate": 8.582930756843802e-06, "loss": 11.0955, "step": 88 }, { "epoch": 0.4285284381582907, "grad_norm": 0.2798727750778198, "learning_rate": 8.56682769726248e-06, "loss": 11.5026, "step": 89 }, { "epoch": 0.43334336442973215, "grad_norm": 0.19547878205776215, "learning_rate": 8.55072463768116e-06, "loss": 11.2341, "step": 90 }, { "epoch": 0.4381582907011736, "grad_norm": 0.20346851646900177, "learning_rate": 8.53462157809984e-06, "loss": 11.7612, "step": 91 }, { "epoch": 0.4429732169726151, "grad_norm": 0.22177843749523163, "learning_rate": 8.518518518518519e-06, "loss": 12.027, "step": 92 }, { "epoch": 0.4477881432440566, "grad_norm": 0.14566639065742493, "learning_rate": 8.502415458937199e-06, "loss": 12.1414, "step": 93 }, { "epoch": 0.45260306951549806, "grad_norm": 0.19193682074546814, "learning_rate": 8.486312399355879e-06, "loss": 11.2928, "step": 94 }, { "epoch": 0.45741799578693954, "grad_norm": 0.18830566108226776, "learning_rate": 8.470209339774558e-06, "loss": 12.3402, "step": 95 }, { "epoch": 0.46223292205838096, "grad_norm": 0.19319747388362885, "learning_rate": 8.454106280193238e-06, "loss": 11.4159, "step": 96 }, { "epoch": 0.46704784832982243, "grad_norm": 0.2581634521484375, "learning_rate": 8.438003220611916e-06, "loss": 12.5042, "step": 97 }, { "epoch": 0.4718627746012639, "grad_norm": 0.2127319574356079, "learning_rate": 8.421900161030596e-06, "loss": 11.8059, "step": 98 }, { "epoch": 0.4766777008727054, "grad_norm": 0.18906573951244354, "learning_rate": 8.405797101449275e-06, "loss": 12.773, "step": 99 }, { "epoch": 0.48149262714414687, "grad_norm": 0.2039322406053543, "learning_rate": 8.389694041867955e-06, "loss": 11.2793, "step": 100 }, { "epoch": 0.48630755341558835, "grad_norm": 0.17869459092617035, "learning_rate": 8.373590982286636e-06, "loss": 12.1488, "step": 101 }, { "epoch": 0.49112247968702977, "grad_norm": 0.24505895376205444, "learning_rate": 8.357487922705316e-06, "loss": 12.6911, "step": 102 }, { "epoch": 0.49593740595847124, "grad_norm": 0.24129539728164673, "learning_rate": 8.341384863123994e-06, "loss": 11.1825, "step": 103 }, { "epoch": 0.5007523322299128, "grad_norm": 0.20321142673492432, "learning_rate": 8.325281803542674e-06, "loss": 11.3817, "step": 104 }, { "epoch": 0.5055672585013542, "grad_norm": 0.2557075321674347, "learning_rate": 8.309178743961353e-06, "loss": 13.0008, "step": 105 }, { "epoch": 0.5103821847727956, "grad_norm": 0.27801477909088135, "learning_rate": 8.293075684380033e-06, "loss": 10.5208, "step": 106 }, { "epoch": 0.5151971110442372, "grad_norm": 0.18863140046596527, "learning_rate": 8.276972624798713e-06, "loss": 11.17, "step": 107 }, { "epoch": 0.5200120373156786, "grad_norm": 0.1997506022453308, "learning_rate": 8.260869565217392e-06, "loss": 11.094, "step": 108 }, { "epoch": 0.5248269635871201, "grad_norm": 0.17764043807983398, "learning_rate": 8.244766505636072e-06, "loss": 10.9546, "step": 109 }, { "epoch": 0.5296418898585615, "grad_norm": 0.22004744410514832, "learning_rate": 8.228663446054752e-06, "loss": 10.8977, "step": 110 }, { "epoch": 0.534456816130003, "grad_norm": 0.20619215071201324, "learning_rate": 8.212560386473431e-06, "loss": 12.0217, "step": 111 }, { "epoch": 0.5392717424014445, "grad_norm": 0.1944962590932846, "learning_rate": 8.19645732689211e-06, "loss": 11.5528, "step": 112 }, { "epoch": 0.5440866686728859, "grad_norm": 0.13986949622631073, "learning_rate": 8.180354267310789e-06, "loss": 10.8501, "step": 113 }, { "epoch": 0.5489015949443274, "grad_norm": 0.18104106187820435, "learning_rate": 8.164251207729469e-06, "loss": 12.0401, "step": 114 }, { "epoch": 0.5537165212157689, "grad_norm": 0.22354455292224884, "learning_rate": 8.148148148148148e-06, "loss": 12.3038, "step": 115 }, { "epoch": 0.5585314474872104, "grad_norm": 0.21359990537166595, "learning_rate": 8.132045088566828e-06, "loss": 10.9812, "step": 116 }, { "epoch": 0.5633463737586518, "grad_norm": 0.25966572761535645, "learning_rate": 8.115942028985508e-06, "loss": 11.0717, "step": 117 }, { "epoch": 0.5681613000300932, "grad_norm": 0.18161477148532867, "learning_rate": 8.099838969404187e-06, "loss": 10.8503, "step": 118 }, { "epoch": 0.5729762263015348, "grad_norm": 0.30178895592689514, "learning_rate": 8.083735909822867e-06, "loss": 12.6225, "step": 119 }, { "epoch": 0.5777911525729762, "grad_norm": 0.13033385574817657, "learning_rate": 8.067632850241547e-06, "loss": 11.2823, "step": 120 }, { "epoch": 0.5826060788444177, "grad_norm": 0.2345341593027115, "learning_rate": 8.051529790660226e-06, "loss": 10.6418, "step": 121 }, { "epoch": 0.5874210051158592, "grad_norm": 0.23290252685546875, "learning_rate": 8.035426731078906e-06, "loss": 10.7231, "step": 122 }, { "epoch": 0.5922359313873007, "grad_norm": 0.19367018342018127, "learning_rate": 8.019323671497586e-06, "loss": 10.2351, "step": 123 }, { "epoch": 0.5970508576587421, "grad_norm": 0.22510769963264465, "learning_rate": 8.003220611916265e-06, "loss": 10.3216, "step": 124 }, { "epoch": 0.6018657839301835, "grad_norm": 0.21876239776611328, "learning_rate": 7.987117552334945e-06, "loss": 11.0453, "step": 125 }, { "epoch": 0.6066807102016251, "grad_norm": 0.23988570272922516, "learning_rate": 7.971014492753623e-06, "loss": 10.9186, "step": 126 }, { "epoch": 0.6114956364730665, "grad_norm": 0.1909828633069992, "learning_rate": 7.954911433172303e-06, "loss": 10.7444, "step": 127 }, { "epoch": 0.616310562744508, "grad_norm": 0.2268180102109909, "learning_rate": 7.938808373590982e-06, "loss": 12.1826, "step": 128 }, { "epoch": 0.6211254890159494, "grad_norm": 0.18531453609466553, "learning_rate": 7.922705314009662e-06, "loss": 11.0919, "step": 129 }, { "epoch": 0.6259404152873909, "grad_norm": 0.24563215672969818, "learning_rate": 7.906602254428342e-06, "loss": 10.825, "step": 130 }, { "epoch": 0.6307553415588324, "grad_norm": 0.26069939136505127, "learning_rate": 7.890499194847021e-06, "loss": 10.9237, "step": 131 }, { "epoch": 0.6355702678302738, "grad_norm": 0.18118217587471008, "learning_rate": 7.874396135265701e-06, "loss": 11.2994, "step": 132 }, { "epoch": 0.6403851941017153, "grad_norm": 0.2178242951631546, "learning_rate": 7.85829307568438e-06, "loss": 10.6764, "step": 133 }, { "epoch": 0.6452001203731568, "grad_norm": 0.18861421942710876, "learning_rate": 7.84219001610306e-06, "loss": 11.7684, "step": 134 }, { "epoch": 0.6500150466445983, "grad_norm": 0.2540731430053711, "learning_rate": 7.82608695652174e-06, "loss": 10.4613, "step": 135 }, { "epoch": 0.6548299729160397, "grad_norm": 0.22468675673007965, "learning_rate": 7.80998389694042e-06, "loss": 11.0479, "step": 136 }, { "epoch": 0.6596448991874811, "grad_norm": 0.18307951092720032, "learning_rate": 7.7938808373591e-06, "loss": 11.7074, "step": 137 }, { "epoch": 0.6644598254589227, "grad_norm": 0.2777751088142395, "learning_rate": 7.77777777777778e-06, "loss": 11.5034, "step": 138 }, { "epoch": 0.6692747517303641, "grad_norm": 0.20376338064670563, "learning_rate": 7.761674718196459e-06, "loss": 11.178, "step": 139 }, { "epoch": 0.6740896780018056, "grad_norm": 0.19434967637062073, "learning_rate": 7.745571658615137e-06, "loss": 10.198, "step": 140 }, { "epoch": 0.6789046042732471, "grad_norm": 0.28449344635009766, "learning_rate": 7.729468599033817e-06, "loss": 11.1956, "step": 141 }, { "epoch": 0.6837195305446885, "grad_norm": 0.18125340342521667, "learning_rate": 7.713365539452496e-06, "loss": 12.0773, "step": 142 }, { "epoch": 0.68853445681613, "grad_norm": 0.2260919064283371, "learning_rate": 7.697262479871176e-06, "loss": 11.2763, "step": 143 }, { "epoch": 0.6933493830875714, "grad_norm": 0.23274123668670654, "learning_rate": 7.681159420289856e-06, "loss": 11.9908, "step": 144 }, { "epoch": 0.698164309359013, "grad_norm": 0.16333813965320587, "learning_rate": 7.665056360708535e-06, "loss": 12.1754, "step": 145 }, { "epoch": 0.7029792356304544, "grad_norm": 0.19147440791130066, "learning_rate": 7.648953301127215e-06, "loss": 10.8012, "step": 146 }, { "epoch": 0.7077941619018959, "grad_norm": 0.24757863581180573, "learning_rate": 7.632850241545895e-06, "loss": 11.222, "step": 147 }, { "epoch": 0.7126090881733373, "grad_norm": 0.2936674952507019, "learning_rate": 7.616747181964574e-06, "loss": 10.7104, "step": 148 }, { "epoch": 0.7174240144447788, "grad_norm": 0.25289615988731384, "learning_rate": 7.600644122383254e-06, "loss": 11.2394, "step": 149 }, { "epoch": 0.7222389407162203, "grad_norm": 0.16242274641990662, "learning_rate": 7.584541062801934e-06, "loss": 10.9335, "step": 150 }, { "epoch": 0.7270538669876617, "grad_norm": 0.16051234304904938, "learning_rate": 7.568438003220613e-06, "loss": 10.5078, "step": 151 }, { "epoch": 0.7318687932591033, "grad_norm": 0.19001922011375427, "learning_rate": 7.552334943639292e-06, "loss": 10.2024, "step": 152 }, { "epoch": 0.7366837195305447, "grad_norm": 0.1944311112165451, "learning_rate": 7.536231884057972e-06, "loss": 10.3439, "step": 153 }, { "epoch": 0.7414986458019862, "grad_norm": 0.22597943246364594, "learning_rate": 7.5201288244766514e-06, "loss": 9.8315, "step": 154 }, { "epoch": 0.7463135720734276, "grad_norm": 0.16061653196811676, "learning_rate": 7.504025764895331e-06, "loss": 10.1577, "step": 155 }, { "epoch": 0.7511284983448691, "grad_norm": 0.18217833340168, "learning_rate": 7.48792270531401e-06, "loss": 11.1101, "step": 156 }, { "epoch": 0.7559434246163106, "grad_norm": 0.24722352623939514, "learning_rate": 7.47181964573269e-06, "loss": 9.7077, "step": 157 }, { "epoch": 0.760758350887752, "grad_norm": 0.19641828536987305, "learning_rate": 7.455716586151369e-06, "loss": 10.4689, "step": 158 }, { "epoch": 0.7655732771591935, "grad_norm": 0.2800208330154419, "learning_rate": 7.439613526570049e-06, "loss": 11.402, "step": 159 }, { "epoch": 0.770388203430635, "grad_norm": 0.19170229136943817, "learning_rate": 7.423510466988728e-06, "loss": 10.1995, "step": 160 }, { "epoch": 0.7752031297020764, "grad_norm": 0.1706549972295761, "learning_rate": 7.4074074074074075e-06, "loss": 10.3468, "step": 161 }, { "epoch": 0.7800180559735179, "grad_norm": 0.21024712920188904, "learning_rate": 7.391304347826087e-06, "loss": 11.7541, "step": 162 }, { "epoch": 0.7848329822449593, "grad_norm": 0.22287265956401825, "learning_rate": 7.375201288244767e-06, "loss": 11.3616, "step": 163 }, { "epoch": 0.7896479085164009, "grad_norm": 0.195387065410614, "learning_rate": 7.359098228663447e-06, "loss": 9.8747, "step": 164 }, { "epoch": 0.7944628347878423, "grad_norm": 0.2072424590587616, "learning_rate": 7.342995169082127e-06, "loss": 11.2013, "step": 165 }, { "epoch": 0.7992777610592838, "grad_norm": 0.17055857181549072, "learning_rate": 7.326892109500806e-06, "loss": 9.3551, "step": 166 }, { "epoch": 0.8040926873307253, "grad_norm": 0.2913988530635834, "learning_rate": 7.3107890499194855e-06, "loss": 9.5701, "step": 167 }, { "epoch": 0.8089076136021667, "grad_norm": 0.27838587760925293, "learning_rate": 7.294685990338165e-06, "loss": 9.1273, "step": 168 }, { "epoch": 0.8137225398736082, "grad_norm": 0.16759181022644043, "learning_rate": 7.278582930756845e-06, "loss": 12.0986, "step": 169 }, { "epoch": 0.8185374661450496, "grad_norm": 0.2335626184940338, "learning_rate": 7.262479871175524e-06, "loss": 9.4595, "step": 170 }, { "epoch": 0.8233523924164912, "grad_norm": 0.22770944237709045, "learning_rate": 7.246376811594203e-06, "loss": 11.482, "step": 171 }, { "epoch": 0.8281673186879326, "grad_norm": 0.16300161182880402, "learning_rate": 7.230273752012883e-06, "loss": 10.084, "step": 172 }, { "epoch": 0.832982244959374, "grad_norm": 0.1577334851026535, "learning_rate": 7.214170692431563e-06, "loss": 11.7738, "step": 173 }, { "epoch": 0.8377971712308155, "grad_norm": 0.26999086141586304, "learning_rate": 7.1980676328502416e-06, "loss": 11.7664, "step": 174 }, { "epoch": 0.842612097502257, "grad_norm": 0.17184922099113464, "learning_rate": 7.181964573268921e-06, "loss": 10.0153, "step": 175 }, { "epoch": 0.8474270237736985, "grad_norm": 0.19260835647583008, "learning_rate": 7.165861513687601e-06, "loss": 11.0206, "step": 176 }, { "epoch": 0.8522419500451399, "grad_norm": 0.13800834119319916, "learning_rate": 7.149758454106281e-06, "loss": 11.2491, "step": 177 }, { "epoch": 0.8570568763165815, "grad_norm": 0.18511894345283508, "learning_rate": 7.1336553945249594e-06, "loss": 10.0002, "step": 178 }, { "epoch": 0.8618718025880229, "grad_norm": 0.19319257140159607, "learning_rate": 7.117552334943641e-06, "loss": 10.3143, "step": 179 }, { "epoch": 0.8666867288594643, "grad_norm": 0.23793131113052368, "learning_rate": 7.10144927536232e-06, "loss": 9.9405, "step": 180 }, { "epoch": 0.8715016551309058, "grad_norm": 0.22520898282527924, "learning_rate": 7.085346215780999e-06, "loss": 11.7933, "step": 181 }, { "epoch": 0.8763165814023472, "grad_norm": 0.1998303085565567, "learning_rate": 7.069243156199679e-06, "loss": 10.8949, "step": 182 }, { "epoch": 0.8811315076737888, "grad_norm": 0.2205827236175537, "learning_rate": 7.053140096618359e-06, "loss": 11.2439, "step": 183 }, { "epoch": 0.8859464339452302, "grad_norm": 0.18895015120506287, "learning_rate": 7.0370370370370375e-06, "loss": 11.1144, "step": 184 }, { "epoch": 0.8907613602166716, "grad_norm": 0.17686723172664642, "learning_rate": 7.020933977455717e-06, "loss": 10.2206, "step": 185 }, { "epoch": 0.8955762864881132, "grad_norm": 0.2033979296684265, "learning_rate": 7.004830917874397e-06, "loss": 11.4571, "step": 186 }, { "epoch": 0.9003912127595546, "grad_norm": 0.19752806425094604, "learning_rate": 6.9887278582930765e-06, "loss": 10.0299, "step": 187 }, { "epoch": 0.9052061390309961, "grad_norm": 0.26918548345565796, "learning_rate": 6.972624798711755e-06, "loss": 10.0987, "step": 188 }, { "epoch": 0.9100210653024375, "grad_norm": 0.14812156558036804, "learning_rate": 6.956521739130435e-06, "loss": 10.1505, "step": 189 }, { "epoch": 0.9148359915738791, "grad_norm": 0.21255257725715637, "learning_rate": 6.940418679549115e-06, "loss": 10.48, "step": 190 }, { "epoch": 0.9196509178453205, "grad_norm": 0.20056240260601044, "learning_rate": 6.924315619967794e-06, "loss": 10.615, "step": 191 }, { "epoch": 0.9244658441167619, "grad_norm": 0.2510916590690613, "learning_rate": 6.908212560386473e-06, "loss": 9.2769, "step": 192 }, { "epoch": 0.9292807703882034, "grad_norm": 0.19624245166778564, "learning_rate": 6.892109500805153e-06, "loss": 10.3494, "step": 193 }, { "epoch": 0.9340956966596449, "grad_norm": 0.19198696315288544, "learning_rate": 6.8760064412238326e-06, "loss": 11.5993, "step": 194 }, { "epoch": 0.9389106229310864, "grad_norm": 0.18541178107261658, "learning_rate": 6.859903381642513e-06, "loss": 11.1401, "step": 195 }, { "epoch": 0.9437255492025278, "grad_norm": 0.2111266553401947, "learning_rate": 6.843800322061193e-06, "loss": 10.4827, "step": 196 }, { "epoch": 0.9485404754739694, "grad_norm": 0.19431617856025696, "learning_rate": 6.8276972624798724e-06, "loss": 9.9673, "step": 197 }, { "epoch": 0.9533554017454108, "grad_norm": 0.20121034979820251, "learning_rate": 6.811594202898551e-06, "loss": 10.0902, "step": 198 }, { "epoch": 0.9581703280168522, "grad_norm": 0.24719102680683136, "learning_rate": 6.795491143317231e-06, "loss": 9.7116, "step": 199 }, { "epoch": 0.9629852542882937, "grad_norm": 0.14550495147705078, "learning_rate": 6.779388083735911e-06, "loss": 10.3404, "step": 200 }, { "epoch": 0.9678001805597352, "grad_norm": 0.19170908629894257, "learning_rate": 6.76328502415459e-06, "loss": 10.6444, "step": 201 }, { "epoch": 0.9726151068311767, "grad_norm": 0.23954305052757263, "learning_rate": 6.747181964573269e-06, "loss": 10.3116, "step": 202 }, { "epoch": 0.9774300331026181, "grad_norm": 0.15414614975452423, "learning_rate": 6.731078904991949e-06, "loss": 10.4329, "step": 203 }, { "epoch": 0.9822449593740595, "grad_norm": 0.19790370762348175, "learning_rate": 6.7149758454106285e-06, "loss": 10.6693, "step": 204 }, { "epoch": 0.9870598856455011, "grad_norm": 0.23332847654819489, "learning_rate": 6.698872785829308e-06, "loss": 11.7078, "step": 205 }, { "epoch": 0.9918748119169425, "grad_norm": 0.1728251725435257, "learning_rate": 6.682769726247987e-06, "loss": 10.1582, "step": 206 }, { "epoch": 0.996689738188384, "grad_norm": 0.18887676298618317, "learning_rate": 6.666666666666667e-06, "loss": 10.0072, "step": 207 }, { "epoch": 1.0, "grad_norm": 0.18887676298618317, "learning_rate": 6.666666666666667e-06, "loss": 7.3169, "step": 208 }, { "epoch": 1.0048149262714414, "grad_norm": 0.22321873903274536, "learning_rate": 6.650563607085346e-06, "loss": 9.9937, "step": 209 }, { "epoch": 1.0096298525428828, "grad_norm": 0.23789285123348236, "learning_rate": 6.634460547504026e-06, "loss": 10.1548, "step": 210 }, { "epoch": 1.0144447788143245, "grad_norm": 0.2545947730541229, "learning_rate": 6.6183574879227065e-06, "loss": 10.1225, "step": 211 }, { "epoch": 1.019259705085766, "grad_norm": 0.19479095935821533, "learning_rate": 6.602254428341386e-06, "loss": 9.4662, "step": 212 }, { "epoch": 1.0240746313572073, "grad_norm": 0.1563379466533661, "learning_rate": 6.586151368760065e-06, "loss": 10.5629, "step": 213 }, { "epoch": 1.0288895576286488, "grad_norm": 0.25045880675315857, "learning_rate": 6.570048309178745e-06, "loss": 9.0693, "step": 214 }, { "epoch": 1.0337044839000902, "grad_norm": 0.20094619691371918, "learning_rate": 6.553945249597424e-06, "loss": 10.7736, "step": 215 }, { "epoch": 1.0385194101715318, "grad_norm": 0.2038065642118454, "learning_rate": 6.537842190016104e-06, "loss": 11.2619, "step": 216 }, { "epoch": 1.0433343364429732, "grad_norm": 0.1970120072364807, "learning_rate": 6.521739130434783e-06, "loss": 10.3251, "step": 217 }, { "epoch": 1.0481492627144147, "grad_norm": 0.19979062676429749, "learning_rate": 6.5056360708534626e-06, "loss": 10.6034, "step": 218 }, { "epoch": 1.052964188985856, "grad_norm": 0.16085349023342133, "learning_rate": 6.489533011272142e-06, "loss": 9.5934, "step": 219 }, { "epoch": 1.0577791152572975, "grad_norm": 0.20374242961406708, "learning_rate": 6.473429951690822e-06, "loss": 8.9761, "step": 220 }, { "epoch": 1.0625940415287392, "grad_norm": 0.19417604804039001, "learning_rate": 6.457326892109501e-06, "loss": 9.8925, "step": 221 }, { "epoch": 1.0674089678001806, "grad_norm": 0.1641014963388443, "learning_rate": 6.44122383252818e-06, "loss": 10.019, "step": 222 }, { "epoch": 1.072223894071622, "grad_norm": 0.15444359183311462, "learning_rate": 6.42512077294686e-06, "loss": 9.8515, "step": 223 }, { "epoch": 1.0770388203430634, "grad_norm": 0.31960421800613403, "learning_rate": 6.40901771336554e-06, "loss": 9.6703, "step": 224 }, { "epoch": 1.081853746614505, "grad_norm": 0.18809086084365845, "learning_rate": 6.3929146537842194e-06, "loss": 10.5771, "step": 225 }, { "epoch": 1.0866686728859465, "grad_norm": 0.2899991571903229, "learning_rate": 6.376811594202898e-06, "loss": 9.9195, "step": 226 }, { "epoch": 1.091483599157388, "grad_norm": 0.20936541259288788, "learning_rate": 6.360708534621579e-06, "loss": 9.9802, "step": 227 }, { "epoch": 1.0962985254288293, "grad_norm": 0.20921356976032257, "learning_rate": 6.3446054750402585e-06, "loss": 10.7929, "step": 228 }, { "epoch": 1.1011134517002708, "grad_norm": 0.16953137516975403, "learning_rate": 6.328502415458938e-06, "loss": 9.2881, "step": 229 }, { "epoch": 1.1059283779717124, "grad_norm": 0.16596080362796783, "learning_rate": 6.312399355877618e-06, "loss": 9.9087, "step": 230 }, { "epoch": 1.1107433042431538, "grad_norm": 0.17415396869182587, "learning_rate": 6.296296296296297e-06, "loss": 10.5827, "step": 231 }, { "epoch": 1.1155582305145952, "grad_norm": 0.1941956877708435, "learning_rate": 6.280193236714976e-06, "loss": 10.6707, "step": 232 }, { "epoch": 1.1203731567860367, "grad_norm": 0.2597000300884247, "learning_rate": 6.264090177133656e-06, "loss": 12.2734, "step": 233 }, { "epoch": 1.1251880830574783, "grad_norm": 0.1953185349702835, "learning_rate": 6.247987117552336e-06, "loss": 9.6042, "step": 234 }, { "epoch": 1.1300030093289197, "grad_norm": 0.19797232747077942, "learning_rate": 6.2318840579710145e-06, "loss": 10.6643, "step": 235 }, { "epoch": 1.1348179356003611, "grad_norm": 0.18180033564567566, "learning_rate": 6.215780998389694e-06, "loss": 10.2799, "step": 236 }, { "epoch": 1.1396328618718026, "grad_norm": 0.17393337190151215, "learning_rate": 6.199677938808374e-06, "loss": 9.4656, "step": 237 }, { "epoch": 1.144447788143244, "grad_norm": 0.1834544539451599, "learning_rate": 6.1835748792270535e-06, "loss": 8.9495, "step": 238 }, { "epoch": 1.1492627144146854, "grad_norm": 0.14842462539672852, "learning_rate": 6.167471819645733e-06, "loss": 9.6697, "step": 239 }, { "epoch": 1.154077640686127, "grad_norm": 0.2158040702342987, "learning_rate": 6.151368760064412e-06, "loss": 9.1501, "step": 240 }, { "epoch": 1.1588925669575685, "grad_norm": 0.18131056427955627, "learning_rate": 6.135265700483092e-06, "loss": 10.3998, "step": 241 }, { "epoch": 1.16370749322901, "grad_norm": 0.22484710812568665, "learning_rate": 6.119162640901772e-06, "loss": 10.0693, "step": 242 }, { "epoch": 1.1685224195004513, "grad_norm": 0.18370361626148224, "learning_rate": 6.103059581320452e-06, "loss": 10.4377, "step": 243 }, { "epoch": 1.173337345771893, "grad_norm": 0.2081800103187561, "learning_rate": 6.086956521739132e-06, "loss": 9.8576, "step": 244 }, { "epoch": 1.1781522720433344, "grad_norm": 0.1726984828710556, "learning_rate": 6.0708534621578104e-06, "loss": 10.4158, "step": 245 }, { "epoch": 1.1829671983147758, "grad_norm": 0.22167733311653137, "learning_rate": 6.05475040257649e-06, "loss": 9.2583, "step": 246 }, { "epoch": 1.1877821245862172, "grad_norm": 0.24326634407043457, "learning_rate": 6.03864734299517e-06, "loss": 10.1546, "step": 247 }, { "epoch": 1.1925970508576587, "grad_norm": 0.20624417066574097, "learning_rate": 6.0225442834138495e-06, "loss": 10.3518, "step": 248 }, { "epoch": 1.1974119771291003, "grad_norm": 0.22262895107269287, "learning_rate": 6.006441223832528e-06, "loss": 10.2671, "step": 249 }, { "epoch": 1.2022269034005417, "grad_norm": 0.18244238197803497, "learning_rate": 5.990338164251208e-06, "loss": 10.3412, "step": 250 }, { "epoch": 1.2070418296719831, "grad_norm": 0.12642191350460052, "learning_rate": 5.974235104669888e-06, "loss": 8.3136, "step": 251 }, { "epoch": 1.2118567559434246, "grad_norm": 0.22949941456317902, "learning_rate": 5.958132045088567e-06, "loss": 9.8669, "step": 252 }, { "epoch": 1.216671682214866, "grad_norm": 0.17824606597423553, "learning_rate": 5.942028985507247e-06, "loss": 8.9438, "step": 253 }, { "epoch": 1.2214866084863076, "grad_norm": 0.21706126630306244, "learning_rate": 5.925925925925926e-06, "loss": 10.5671, "step": 254 }, { "epoch": 1.226301534757749, "grad_norm": 0.1777815967798233, "learning_rate": 5.9098228663446055e-06, "loss": 9.0566, "step": 255 }, { "epoch": 1.2311164610291905, "grad_norm": 0.16943249106407166, "learning_rate": 5.893719806763285e-06, "loss": 10.5991, "step": 256 }, { "epoch": 1.235931387300632, "grad_norm": 0.19475321471691132, "learning_rate": 5.877616747181965e-06, "loss": 9.7751, "step": 257 }, { "epoch": 1.2407463135720733, "grad_norm": 0.15499532222747803, "learning_rate": 5.861513687600645e-06, "loss": 9.2155, "step": 258 }, { "epoch": 1.245561239843515, "grad_norm": 0.21997332572937012, "learning_rate": 5.845410628019324e-06, "loss": 9.9653, "step": 259 }, { "epoch": 1.2503761661149564, "grad_norm": 0.2071482390165329, "learning_rate": 5.829307568438004e-06, "loss": 9.9446, "step": 260 }, { "epoch": 1.2551910923863978, "grad_norm": 0.18931487202644348, "learning_rate": 5.8132045088566835e-06, "loss": 10.322, "step": 261 }, { "epoch": 1.2600060186578392, "grad_norm": 0.14098307490348816, "learning_rate": 5.797101449275363e-06, "loss": 9.7707, "step": 262 }, { "epoch": 1.2648209449292809, "grad_norm": 0.22090758383274078, "learning_rate": 5.780998389694043e-06, "loss": 9.702, "step": 263 }, { "epoch": 1.2696358712007223, "grad_norm": 0.181729257106781, "learning_rate": 5.764895330112722e-06, "loss": 9.5123, "step": 264 }, { "epoch": 1.2744507974721637, "grad_norm": 0.1258496791124344, "learning_rate": 5.748792270531401e-06, "loss": 9.0927, "step": 265 }, { "epoch": 1.2792657237436051, "grad_norm": 0.21762683987617493, "learning_rate": 5.732689210950081e-06, "loss": 8.5398, "step": 266 }, { "epoch": 1.2840806500150466, "grad_norm": 0.14968731999397278, "learning_rate": 5.716586151368761e-06, "loss": 9.486, "step": 267 }, { "epoch": 1.288895576286488, "grad_norm": 0.17779159545898438, "learning_rate": 5.70048309178744e-06, "loss": 10.4383, "step": 268 }, { "epoch": 1.2937105025579296, "grad_norm": 0.19466915726661682, "learning_rate": 5.684380032206119e-06, "loss": 10.2354, "step": 269 }, { "epoch": 1.298525428829371, "grad_norm": 0.22139185667037964, "learning_rate": 5.668276972624799e-06, "loss": 9.7784, "step": 270 }, { "epoch": 1.3033403551008125, "grad_norm": 0.21013078093528748, "learning_rate": 5.652173913043479e-06, "loss": 11.4605, "step": 271 }, { "epoch": 1.3081552813722541, "grad_norm": 0.17095215618610382, "learning_rate": 5.6360708534621574e-06, "loss": 9.6968, "step": 272 }, { "epoch": 1.3129702076436955, "grad_norm": 0.15703898668289185, "learning_rate": 5.619967793880838e-06, "loss": 8.9945, "step": 273 }, { "epoch": 1.317785133915137, "grad_norm": 0.16166311502456665, "learning_rate": 5.603864734299518e-06, "loss": 8.9587, "step": 274 }, { "epoch": 1.3226000601865784, "grad_norm": 0.18226633965969086, "learning_rate": 5.587761674718197e-06, "loss": 9.5976, "step": 275 }, { "epoch": 1.3274149864580198, "grad_norm": 0.16516032814979553, "learning_rate": 5.571658615136877e-06, "loss": 8.9366, "step": 276 }, { "epoch": 1.3322299127294612, "grad_norm": 0.18485237658023834, "learning_rate": 5.555555555555557e-06, "loss": 8.6867, "step": 277 }, { "epoch": 1.3370448390009029, "grad_norm": 0.16183756291866302, "learning_rate": 5.5394524959742355e-06, "loss": 10.0091, "step": 278 }, { "epoch": 1.3418597652723443, "grad_norm": 0.18236857652664185, "learning_rate": 5.523349436392915e-06, "loss": 11.0512, "step": 279 }, { "epoch": 1.3466746915437857, "grad_norm": 0.16111883521080017, "learning_rate": 5.507246376811595e-06, "loss": 9.9661, "step": 280 }, { "epoch": 1.3514896178152271, "grad_norm": 0.17416836321353912, "learning_rate": 5.4911433172302745e-06, "loss": 8.5023, "step": 281 }, { "epoch": 1.3563045440866688, "grad_norm": 0.1845031976699829, "learning_rate": 5.475040257648953e-06, "loss": 8.1757, "step": 282 }, { "epoch": 1.3611194703581102, "grad_norm": 0.14829057455062866, "learning_rate": 5.458937198067633e-06, "loss": 10.4167, "step": 283 }, { "epoch": 1.3659343966295516, "grad_norm": 0.18102510273456573, "learning_rate": 5.442834138486313e-06, "loss": 10.1862, "step": 284 }, { "epoch": 1.370749322900993, "grad_norm": 0.1877845823764801, "learning_rate": 5.426731078904992e-06, "loss": 10.4398, "step": 285 }, { "epoch": 1.3755642491724345, "grad_norm": 0.19289150834083557, "learning_rate": 5.410628019323671e-06, "loss": 10.1863, "step": 286 }, { "epoch": 1.380379175443876, "grad_norm": 0.14551950991153717, "learning_rate": 5.394524959742351e-06, "loss": 8.6225, "step": 287 }, { "epoch": 1.3851941017153175, "grad_norm": 0.15998440980911255, "learning_rate": 5.3784219001610306e-06, "loss": 9.6626, "step": 288 }, { "epoch": 1.390009027986759, "grad_norm": 0.15218336880207062, "learning_rate": 5.362318840579711e-06, "loss": 9.0365, "step": 289 }, { "epoch": 1.3948239542582004, "grad_norm": 0.19268082082271576, "learning_rate": 5.346215780998391e-06, "loss": 9.069, "step": 290 }, { "epoch": 1.399638880529642, "grad_norm": 0.15415695309638977, "learning_rate": 5.3301127214170704e-06, "loss": 10.0018, "step": 291 }, { "epoch": 1.4044538068010834, "grad_norm": 0.1783796101808548, "learning_rate": 5.314009661835749e-06, "loss": 8.5794, "step": 292 }, { "epoch": 1.4092687330725249, "grad_norm": 0.23539525270462036, "learning_rate": 5.297906602254429e-06, "loss": 9.2077, "step": 293 }, { "epoch": 1.4140836593439663, "grad_norm": 0.19150039553642273, "learning_rate": 5.281803542673109e-06, "loss": 8.6828, "step": 294 }, { "epoch": 1.4188985856154077, "grad_norm": 0.18820087611675262, "learning_rate": 5.265700483091788e-06, "loss": 9.4677, "step": 295 }, { "epoch": 1.4237135118868491, "grad_norm": 0.5018635988235474, "learning_rate": 5.249597423510467e-06, "loss": 9.6455, "step": 296 }, { "epoch": 1.4285284381582908, "grad_norm": 0.17721492052078247, "learning_rate": 5.233494363929147e-06, "loss": 8.3182, "step": 297 }, { "epoch": 1.4333433644297322, "grad_norm": 0.20144477486610413, "learning_rate": 5.2173913043478265e-06, "loss": 9.1568, "step": 298 }, { "epoch": 1.4381582907011736, "grad_norm": 0.18805253505706787, "learning_rate": 5.201288244766506e-06, "loss": 9.7496, "step": 299 }, { "epoch": 1.442973216972615, "grad_norm": 0.1500595659017563, "learning_rate": 5.185185185185185e-06, "loss": 9.9708, "step": 300 }, { "epoch": 1.4477881432440567, "grad_norm": 0.19444873929023743, "learning_rate": 5.169082125603865e-06, "loss": 9.5143, "step": 301 }, { "epoch": 1.4526030695154981, "grad_norm": 0.18682818114757538, "learning_rate": 5.152979066022544e-06, "loss": 10.2596, "step": 302 }, { "epoch": 1.4574179957869395, "grad_norm": 0.17984358966350555, "learning_rate": 5.136876006441224e-06, "loss": 10.2697, "step": 303 }, { "epoch": 1.462232922058381, "grad_norm": 0.17564424872398376, "learning_rate": 5.1207729468599045e-06, "loss": 9.9508, "step": 304 }, { "epoch": 1.4670478483298224, "grad_norm": 0.1954619437456131, "learning_rate": 5.104669887278584e-06, "loss": 10.603, "step": 305 }, { "epoch": 1.4718627746012638, "grad_norm": 0.16032911837100983, "learning_rate": 5.088566827697263e-06, "loss": 10.1388, "step": 306 }, { "epoch": 1.4766777008727054, "grad_norm": 0.18712233006954193, "learning_rate": 5.072463768115943e-06, "loss": 10.603, "step": 307 }, { "epoch": 1.4814926271441469, "grad_norm": 0.18479761481285095, "learning_rate": 5.056360708534622e-06, "loss": 9.8074, "step": 308 }, { "epoch": 1.4863075534155883, "grad_norm": 0.14700675010681152, "learning_rate": 5.040257648953302e-06, "loss": 10.4248, "step": 309 }, { "epoch": 1.4911224796870297, "grad_norm": 0.13533058762550354, "learning_rate": 5.024154589371981e-06, "loss": 9.1913, "step": 310 }, { "epoch": 1.4959374059584714, "grad_norm": 0.1617136150598526, "learning_rate": 5.0080515297906606e-06, "loss": 8.7997, "step": 311 }, { "epoch": 1.5007523322299128, "grad_norm": 0.14999867975711823, "learning_rate": 4.99194847020934e-06, "loss": 10.7806, "step": 312 }, { "epoch": 1.5055672585013542, "grad_norm": 0.1483631134033203, "learning_rate": 4.97584541062802e-06, "loss": 8.9508, "step": 313 }, { "epoch": 1.5103821847727956, "grad_norm": 0.1401262730360031, "learning_rate": 4.959742351046699e-06, "loss": 9.3086, "step": 314 }, { "epoch": 1.515197111044237, "grad_norm": 0.20340582728385925, "learning_rate": 4.9436392914653784e-06, "loss": 9.6934, "step": 315 }, { "epoch": 1.5200120373156785, "grad_norm": 0.10809484124183655, "learning_rate": 4.927536231884059e-06, "loss": 8.5021, "step": 316 }, { "epoch": 1.52482696358712, "grad_norm": 0.18179920315742493, "learning_rate": 4.911433172302738e-06, "loss": 8.7153, "step": 317 }, { "epoch": 1.5296418898585615, "grad_norm": 0.1383148580789566, "learning_rate": 4.8953301127214175e-06, "loss": 10.6062, "step": 318 }, { "epoch": 1.534456816130003, "grad_norm": 0.21121209859848022, "learning_rate": 4.879227053140097e-06, "loss": 8.7374, "step": 319 }, { "epoch": 1.5392717424014446, "grad_norm": 0.19276529550552368, "learning_rate": 4.863123993558777e-06, "loss": 8.7942, "step": 320 }, { "epoch": 1.544086668672886, "grad_norm": 0.18534629046916962, "learning_rate": 4.847020933977456e-06, "loss": 7.3924, "step": 321 }, { "epoch": 1.5489015949443274, "grad_norm": 0.11499077826738358, "learning_rate": 4.830917874396135e-06, "loss": 9.042, "step": 322 }, { "epoch": 1.5537165212157689, "grad_norm": 0.19323264062404633, "learning_rate": 4.814814814814815e-06, "loss": 9.5308, "step": 323 }, { "epoch": 1.5585314474872103, "grad_norm": 0.163632333278656, "learning_rate": 4.798711755233495e-06, "loss": 9.5654, "step": 324 }, { "epoch": 1.5633463737586517, "grad_norm": 0.24960660934448242, "learning_rate": 4.782608695652174e-06, "loss": 8.9639, "step": 325 }, { "epoch": 1.5681613000300931, "grad_norm": 0.13659049570560455, "learning_rate": 4.766505636070854e-06, "loss": 9.2311, "step": 326 }, { "epoch": 1.5729762263015348, "grad_norm": 0.19566506147384644, "learning_rate": 4.750402576489534e-06, "loss": 9.9125, "step": 327 }, { "epoch": 1.5777911525729762, "grad_norm": 0.13559715449810028, "learning_rate": 4.7342995169082125e-06, "loss": 10.2432, "step": 328 }, { "epoch": 1.5826060788444178, "grad_norm": 0.20595477521419525, "learning_rate": 4.718196457326892e-06, "loss": 9.3677, "step": 329 }, { "epoch": 1.5874210051158593, "grad_norm": 0.1580948680639267, "learning_rate": 4.702093397745572e-06, "loss": 10.0316, "step": 330 }, { "epoch": 1.5922359313873007, "grad_norm": 0.1536228209733963, "learning_rate": 4.6859903381642516e-06, "loss": 10.8169, "step": 331 }, { "epoch": 1.597050857658742, "grad_norm": 0.17159651219844818, "learning_rate": 4.669887278582931e-06, "loss": 9.8849, "step": 332 }, { "epoch": 1.6018657839301835, "grad_norm": 0.14754590392112732, "learning_rate": 4.653784219001611e-06, "loss": 8.2419, "step": 333 }, { "epoch": 1.606680710201625, "grad_norm": 0.15272633731365204, "learning_rate": 4.637681159420291e-06, "loss": 10.358, "step": 334 }, { "epoch": 1.6114956364730664, "grad_norm": 0.23571325838565826, "learning_rate": 4.621578099838969e-06, "loss": 8.9846, "step": 335 }, { "epoch": 1.616310562744508, "grad_norm": 0.1520383059978485, "learning_rate": 4.605475040257649e-06, "loss": 9.627, "step": 336 }, { "epoch": 1.6211254890159494, "grad_norm": 0.16789157688617706, "learning_rate": 4.589371980676329e-06, "loss": 8.9584, "step": 337 }, { "epoch": 1.6259404152873909, "grad_norm": 0.23156379163265228, "learning_rate": 4.5732689210950084e-06, "loss": 9.4165, "step": 338 }, { "epoch": 1.6307553415588325, "grad_norm": 0.2569849491119385, "learning_rate": 4.557165861513688e-06, "loss": 8.6545, "step": 339 }, { "epoch": 1.635570267830274, "grad_norm": 0.1471448540687561, "learning_rate": 4.541062801932368e-06, "loss": 9.7279, "step": 340 }, { "epoch": 1.6403851941017153, "grad_norm": 0.19168996810913086, "learning_rate": 4.5249597423510475e-06, "loss": 8.6823, "step": 341 }, { "epoch": 1.6452001203731568, "grad_norm": 0.16900351643562317, "learning_rate": 4.508856682769726e-06, "loss": 9.3926, "step": 342 }, { "epoch": 1.6500150466445982, "grad_norm": 0.1279803216457367, "learning_rate": 4.492753623188406e-06, "loss": 8.8222, "step": 343 }, { "epoch": 1.6548299729160396, "grad_norm": 0.16592150926589966, "learning_rate": 4.476650563607086e-06, "loss": 9.3739, "step": 344 }, { "epoch": 1.659644899187481, "grad_norm": 0.18117226660251617, "learning_rate": 4.460547504025765e-06, "loss": 8.6028, "step": 345 }, { "epoch": 1.6644598254589227, "grad_norm": 0.15843939781188965, "learning_rate": 4.444444444444444e-06, "loss": 7.3552, "step": 346 }, { "epoch": 1.669274751730364, "grad_norm": 0.1672333925962448, "learning_rate": 4.428341384863125e-06, "loss": 8.1781, "step": 347 }, { "epoch": 1.6740896780018057, "grad_norm": 0.1798122376203537, "learning_rate": 4.412238325281804e-06, "loss": 9.5976, "step": 348 }, { "epoch": 1.6789046042732472, "grad_norm": 0.15125727653503418, "learning_rate": 4.396135265700483e-06, "loss": 9.2915, "step": 349 }, { "epoch": 1.6837195305446886, "grad_norm": 0.15909244120121002, "learning_rate": 4.380032206119163e-06, "loss": 8.5171, "step": 350 }, { "epoch": 1.68853445681613, "grad_norm": 0.19835767149925232, "learning_rate": 4.3639291465378425e-06, "loss": 9.3835, "step": 351 }, { "epoch": 1.6933493830875714, "grad_norm": 0.30680009722709656, "learning_rate": 4.347826086956522e-06, "loss": 9.3733, "step": 352 }, { "epoch": 1.6981643093590129, "grad_norm": 0.13429707288742065, "learning_rate": 4.331723027375201e-06, "loss": 9.6266, "step": 353 }, { "epoch": 1.7029792356304543, "grad_norm": 0.16423039138317108, "learning_rate": 4.315619967793881e-06, "loss": 9.4546, "step": 354 }, { "epoch": 1.707794161901896, "grad_norm": 0.14078310132026672, "learning_rate": 4.299516908212561e-06, "loss": 9.1663, "step": 355 }, { "epoch": 1.7126090881733373, "grad_norm": 0.2016141414642334, "learning_rate": 4.28341384863124e-06, "loss": 8.6698, "step": 356 }, { "epoch": 1.7174240144447788, "grad_norm": 0.13229703903198242, "learning_rate": 4.26731078904992e-06, "loss": 8.5468, "step": 357 }, { "epoch": 1.7222389407162204, "grad_norm": 0.22356487810611725, "learning_rate": 4.251207729468599e-06, "loss": 9.311, "step": 358 }, { "epoch": 1.7270538669876618, "grad_norm": 0.19844292104244232, "learning_rate": 4.235104669887279e-06, "loss": 9.054, "step": 359 }, { "epoch": 1.7318687932591033, "grad_norm": 0.18081983923912048, "learning_rate": 4.219001610305958e-06, "loss": 8.9678, "step": 360 }, { "epoch": 1.7366837195305447, "grad_norm": 0.2216968685388565, "learning_rate": 4.202898550724638e-06, "loss": 8.6121, "step": 361 }, { "epoch": 1.741498645801986, "grad_norm": 0.14121295511722565, "learning_rate": 4.186795491143318e-06, "loss": 9.2074, "step": 362 }, { "epoch": 1.7463135720734275, "grad_norm": 0.148764505982399, "learning_rate": 4.170692431561997e-06, "loss": 9.1965, "step": 363 }, { "epoch": 1.751128498344869, "grad_norm": 0.20818910002708435, "learning_rate": 4.154589371980677e-06, "loss": 8.5382, "step": 364 }, { "epoch": 1.7559434246163106, "grad_norm": 0.1755458116531372, "learning_rate": 4.138486312399356e-06, "loss": 9.0389, "step": 365 }, { "epoch": 1.760758350887752, "grad_norm": 0.15656408667564392, "learning_rate": 4.122383252818036e-06, "loss": 9.226, "step": 366 }, { "epoch": 1.7655732771591937, "grad_norm": 0.14213398098945618, "learning_rate": 4.106280193236716e-06, "loss": 8.2302, "step": 367 }, { "epoch": 1.770388203430635, "grad_norm": 0.1693073809146881, "learning_rate": 4.0901771336553945e-06, "loss": 9.6989, "step": 368 }, { "epoch": 1.7752031297020765, "grad_norm": 0.15878278017044067, "learning_rate": 4.074074074074074e-06, "loss": 9.4632, "step": 369 }, { "epoch": 1.780018055973518, "grad_norm": 0.22463774681091309, "learning_rate": 4.057971014492754e-06, "loss": 9.7328, "step": 370 }, { "epoch": 1.7848329822449593, "grad_norm": 0.4724883437156677, "learning_rate": 4.0418679549114335e-06, "loss": 10.2544, "step": 371 }, { "epoch": 1.7896479085164008, "grad_norm": 0.17619994282722473, "learning_rate": 4.025764895330113e-06, "loss": 9.6433, "step": 372 }, { "epoch": 1.7944628347878422, "grad_norm": 0.16114237904548645, "learning_rate": 4.009661835748793e-06, "loss": 10.651, "step": 373 }, { "epoch": 1.7992777610592838, "grad_norm": 0.2053680568933487, "learning_rate": 3.9935587761674725e-06, "loss": 8.8208, "step": 374 }, { "epoch": 1.8040926873307253, "grad_norm": 0.17200101912021637, "learning_rate": 3.977455716586151e-06, "loss": 8.9841, "step": 375 }, { "epoch": 1.8089076136021667, "grad_norm": 0.12033673375844955, "learning_rate": 3.961352657004831e-06, "loss": 8.552, "step": 376 }, { "epoch": 1.8137225398736083, "grad_norm": 0.17469695210456848, "learning_rate": 3.945249597423511e-06, "loss": 8.6438, "step": 377 }, { "epoch": 1.8185374661450497, "grad_norm": 0.19993340969085693, "learning_rate": 3.92914653784219e-06, "loss": 9.326, "step": 378 }, { "epoch": 1.8233523924164912, "grad_norm": 0.18282270431518555, "learning_rate": 3.91304347826087e-06, "loss": 8.6382, "step": 379 }, { "epoch": 1.8281673186879326, "grad_norm": 0.21918214857578278, "learning_rate": 3.89694041867955e-06, "loss": 9.7637, "step": 380 }, { "epoch": 1.832982244959374, "grad_norm": 0.19311483204364777, "learning_rate": 3.880837359098229e-06, "loss": 9.7215, "step": 381 }, { "epoch": 1.8377971712308154, "grad_norm": 0.2024223506450653, "learning_rate": 3.864734299516908e-06, "loss": 9.1813, "step": 382 }, { "epoch": 1.8426120975022569, "grad_norm": 0.15196166932582855, "learning_rate": 3.848631239935588e-06, "loss": 9.4212, "step": 383 }, { "epoch": 1.8474270237736985, "grad_norm": 0.20014698803424835, "learning_rate": 3.832528180354268e-06, "loss": 9.0854, "step": 384 }, { "epoch": 1.85224195004514, "grad_norm": 0.2045230120420456, "learning_rate": 3.816425120772947e-06, "loss": 9.3168, "step": 385 }, { "epoch": 1.8570568763165816, "grad_norm": 0.13044817745685577, "learning_rate": 3.800322061191627e-06, "loss": 8.4085, "step": 386 }, { "epoch": 1.861871802588023, "grad_norm": 0.19362546503543854, "learning_rate": 3.7842190016103066e-06, "loss": 8.5539, "step": 387 }, { "epoch": 1.8666867288594644, "grad_norm": 0.19143155217170715, "learning_rate": 3.768115942028986e-06, "loss": 9.0545, "step": 388 }, { "epoch": 1.8715016551309058, "grad_norm": 0.18278856575489044, "learning_rate": 3.7520128824476656e-06, "loss": 8.6361, "step": 389 }, { "epoch": 1.8763165814023472, "grad_norm": 0.20836183428764343, "learning_rate": 3.735909822866345e-06, "loss": 9.0898, "step": 390 }, { "epoch": 1.8811315076737887, "grad_norm": 0.18853327631950378, "learning_rate": 3.7198067632850245e-06, "loss": 9.8428, "step": 391 }, { "epoch": 1.88594643394523, "grad_norm": 0.13650333881378174, "learning_rate": 3.7037037037037037e-06, "loss": 8.8437, "step": 392 }, { "epoch": 1.8907613602166715, "grad_norm": 0.20635420083999634, "learning_rate": 3.6876006441223834e-06, "loss": 9.0691, "step": 393 }, { "epoch": 1.8955762864881132, "grad_norm": 0.16736768186092377, "learning_rate": 3.6714975845410635e-06, "loss": 9.318, "step": 394 }, { "epoch": 1.9003912127595546, "grad_norm": 0.21544639766216278, "learning_rate": 3.6553945249597428e-06, "loss": 8.9387, "step": 395 }, { "epoch": 1.9052061390309962, "grad_norm": 0.17389844357967377, "learning_rate": 3.6392914653784224e-06, "loss": 9.549, "step": 396 }, { "epoch": 1.9100210653024376, "grad_norm": 0.21728019416332245, "learning_rate": 3.6231884057971017e-06, "loss": 8.0753, "step": 397 }, { "epoch": 1.914835991573879, "grad_norm": 0.199959859251976, "learning_rate": 3.6070853462157814e-06, "loss": 9.9683, "step": 398 }, { "epoch": 1.9196509178453205, "grad_norm": 0.16808640956878662, "learning_rate": 3.5909822866344606e-06, "loss": 9.2667, "step": 399 }, { "epoch": 1.924465844116762, "grad_norm": 0.15371474623680115, "learning_rate": 3.5748792270531403e-06, "loss": 8.9782, "step": 400 }, { "epoch": 1.9292807703882033, "grad_norm": 0.22420039772987366, "learning_rate": 3.5587761674718204e-06, "loss": 9.5041, "step": 401 }, { "epoch": 1.9340956966596448, "grad_norm": 0.19234929978847504, "learning_rate": 3.5426731078904997e-06, "loss": 8.8785, "step": 402 }, { "epoch": 1.9389106229310864, "grad_norm": 0.13435740768909454, "learning_rate": 3.5265700483091793e-06, "loss": 9.3544, "step": 403 }, { "epoch": 1.9437255492025278, "grad_norm": 0.21900928020477295, "learning_rate": 3.5104669887278586e-06, "loss": 8.1942, "step": 404 }, { "epoch": 1.9485404754739695, "grad_norm": 0.16180120408535004, "learning_rate": 3.4943639291465383e-06, "loss": 9.4132, "step": 405 }, { "epoch": 1.953355401745411, "grad_norm": 0.2743014991283417, "learning_rate": 3.4782608695652175e-06, "loss": 10.1588, "step": 406 }, { "epoch": 1.9581703280168523, "grad_norm": 0.14160144329071045, "learning_rate": 3.462157809983897e-06, "loss": 9.0612, "step": 407 }, { "epoch": 1.9629852542882937, "grad_norm": 0.1383216828107834, "learning_rate": 3.4460547504025764e-06, "loss": 7.8391, "step": 408 }, { "epoch": 1.9678001805597352, "grad_norm": 0.16990961134433746, "learning_rate": 3.4299516908212565e-06, "loss": 9.6392, "step": 409 }, { "epoch": 1.9726151068311766, "grad_norm": 0.17103661596775055, "learning_rate": 3.4138486312399362e-06, "loss": 9.8119, "step": 410 }, { "epoch": 1.977430033102618, "grad_norm": 0.13866282999515533, "learning_rate": 3.3977455716586155e-06, "loss": 8.2033, "step": 411 }, { "epoch": 1.9822449593740594, "grad_norm": 0.21080395579338074, "learning_rate": 3.381642512077295e-06, "loss": 10.3113, "step": 412 }, { "epoch": 1.987059885645501, "grad_norm": 0.19845469295978546, "learning_rate": 3.3655394524959744e-06, "loss": 8.2103, "step": 413 }, { "epoch": 1.9918748119169425, "grad_norm": 0.1903708279132843, "learning_rate": 3.349436392914654e-06, "loss": 9.1371, "step": 414 }, { "epoch": 1.9966897381883841, "grad_norm": 0.16223041713237762, "learning_rate": 3.3333333333333333e-06, "loss": 7.368, "step": 415 }, { "epoch": 2.0, "grad_norm": 0.14595092833042145, "learning_rate": 3.317230273752013e-06, "loss": 5.8598, "step": 416 }, { "epoch": 2.0048149262714414, "grad_norm": 0.15010525286197662, "learning_rate": 3.301127214170693e-06, "loss": 8.1315, "step": 417 }, { "epoch": 2.009629852542883, "grad_norm": 0.23141905665397644, "learning_rate": 3.2850241545893724e-06, "loss": 9.3878, "step": 418 }, { "epoch": 2.0144447788143243, "grad_norm": 0.11268898099660873, "learning_rate": 3.268921095008052e-06, "loss": 7.8098, "step": 419 }, { "epoch": 2.0192597050857657, "grad_norm": 0.16212859749794006, "learning_rate": 3.2528180354267313e-06, "loss": 8.5319, "step": 420 }, { "epoch": 2.0240746313572076, "grad_norm": 0.1565706580877304, "learning_rate": 3.236714975845411e-06, "loss": 7.8942, "step": 421 }, { "epoch": 2.028889557628649, "grad_norm": 0.1680455058813095, "learning_rate": 3.22061191626409e-06, "loss": 8.2839, "step": 422 }, { "epoch": 2.0337044839000904, "grad_norm": 0.2539815306663513, "learning_rate": 3.20450885668277e-06, "loss": 8.9937, "step": 423 }, { "epoch": 2.038519410171532, "grad_norm": 0.238030806183815, "learning_rate": 3.188405797101449e-06, "loss": 8.4321, "step": 424 }, { "epoch": 2.0433343364429732, "grad_norm": 0.19473034143447876, "learning_rate": 3.1723027375201292e-06, "loss": 8.0307, "step": 425 }, { "epoch": 2.0481492627144147, "grad_norm": 0.16554652154445648, "learning_rate": 3.156199677938809e-06, "loss": 9.5945, "step": 426 }, { "epoch": 2.052964188985856, "grad_norm": 0.19130951166152954, "learning_rate": 3.140096618357488e-06, "loss": 8.0234, "step": 427 }, { "epoch": 2.0577791152572975, "grad_norm": 0.14681276679039001, "learning_rate": 3.123993558776168e-06, "loss": 8.6784, "step": 428 }, { "epoch": 2.062594041528739, "grad_norm": 0.10328257828950882, "learning_rate": 3.107890499194847e-06, "loss": 8.0287, "step": 429 }, { "epoch": 2.0674089678001804, "grad_norm": 0.19125495851039886, "learning_rate": 3.0917874396135268e-06, "loss": 8.0046, "step": 430 }, { "epoch": 2.072223894071622, "grad_norm": 0.1793103963136673, "learning_rate": 3.075684380032206e-06, "loss": 7.0518, "step": 431 }, { "epoch": 2.0770388203430636, "grad_norm": 0.2568497657775879, "learning_rate": 3.059581320450886e-06, "loss": 8.2794, "step": 432 }, { "epoch": 2.081853746614505, "grad_norm": 0.18120069801807404, "learning_rate": 3.043478260869566e-06, "loss": 10.1022, "step": 433 }, { "epoch": 2.0866686728859465, "grad_norm": 0.27532005310058594, "learning_rate": 3.027375201288245e-06, "loss": 9.3059, "step": 434 }, { "epoch": 2.091483599157388, "grad_norm": 0.15648192167282104, "learning_rate": 3.0112721417069247e-06, "loss": 7.8617, "step": 435 }, { "epoch": 2.0962985254288293, "grad_norm": 0.17381350696086884, "learning_rate": 2.995169082125604e-06, "loss": 9.0082, "step": 436 }, { "epoch": 2.1011134517002708, "grad_norm": 0.13711951673030853, "learning_rate": 2.9790660225442837e-06, "loss": 7.764, "step": 437 }, { "epoch": 2.105928377971712, "grad_norm": 0.23948128521442413, "learning_rate": 2.962962962962963e-06, "loss": 8.4041, "step": 438 }, { "epoch": 2.1107433042431536, "grad_norm": 0.15631070733070374, "learning_rate": 2.9468599033816426e-06, "loss": 8.1708, "step": 439 }, { "epoch": 2.115558230514595, "grad_norm": 0.1608411967754364, "learning_rate": 2.9307568438003227e-06, "loss": 8.301, "step": 440 }, { "epoch": 2.120373156786037, "grad_norm": 0.16660411655902863, "learning_rate": 2.914653784219002e-06, "loss": 9.5365, "step": 441 }, { "epoch": 2.1251880830574783, "grad_norm": 0.17191386222839355, "learning_rate": 2.8985507246376816e-06, "loss": 8.9256, "step": 442 }, { "epoch": 2.1300030093289197, "grad_norm": 0.18492081761360168, "learning_rate": 2.882447665056361e-06, "loss": 9.0577, "step": 443 }, { "epoch": 2.134817935600361, "grad_norm": 0.2561168670654297, "learning_rate": 2.8663446054750405e-06, "loss": 8.8758, "step": 444 }, { "epoch": 2.1396328618718026, "grad_norm": 0.1588340848684311, "learning_rate": 2.85024154589372e-06, "loss": 8.1364, "step": 445 }, { "epoch": 2.144447788143244, "grad_norm": 0.1650805026292801, "learning_rate": 2.8341384863123995e-06, "loss": 8.337, "step": 446 }, { "epoch": 2.1492627144146854, "grad_norm": 0.2011885941028595, "learning_rate": 2.8180354267310787e-06, "loss": 8.6296, "step": 447 }, { "epoch": 2.154077640686127, "grad_norm": 0.18557001650333405, "learning_rate": 2.801932367149759e-06, "loss": 8.6218, "step": 448 }, { "epoch": 2.1588925669575683, "grad_norm": 0.1598547399044037, "learning_rate": 2.7858293075684385e-06, "loss": 8.4741, "step": 449 }, { "epoch": 2.16370749322901, "grad_norm": 0.17089636623859406, "learning_rate": 2.7697262479871177e-06, "loss": 9.0788, "step": 450 }, { "epoch": 2.1685224195004515, "grad_norm": 0.1817985475063324, "learning_rate": 2.7536231884057974e-06, "loss": 9.9717, "step": 451 }, { "epoch": 2.173337345771893, "grad_norm": 0.23914600908756256, "learning_rate": 2.7375201288244767e-06, "loss": 8.7548, "step": 452 }, { "epoch": 2.1781522720433344, "grad_norm": 0.17113572359085083, "learning_rate": 2.7214170692431564e-06, "loss": 7.7566, "step": 453 }, { "epoch": 2.182967198314776, "grad_norm": 0.14485716819763184, "learning_rate": 2.7053140096618356e-06, "loss": 8.9532, "step": 454 }, { "epoch": 2.1877821245862172, "grad_norm": 0.14129236340522766, "learning_rate": 2.6892109500805153e-06, "loss": 9.2833, "step": 455 }, { "epoch": 2.1925970508576587, "grad_norm": 0.23692472279071808, "learning_rate": 2.6731078904991954e-06, "loss": 8.3895, "step": 456 }, { "epoch": 2.1974119771291, "grad_norm": 0.16027197241783142, "learning_rate": 2.6570048309178746e-06, "loss": 7.7012, "step": 457 }, { "epoch": 2.2022269034005415, "grad_norm": 0.1416737139225006, "learning_rate": 2.6409017713365543e-06, "loss": 9.0799, "step": 458 }, { "epoch": 2.207041829671983, "grad_norm": 0.20678099989891052, "learning_rate": 2.6247987117552336e-06, "loss": 9.3679, "step": 459 }, { "epoch": 2.211856755943425, "grad_norm": 0.1649148017168045, "learning_rate": 2.6086956521739132e-06, "loss": 8.0503, "step": 460 }, { "epoch": 2.216671682214866, "grad_norm": 0.21159884333610535, "learning_rate": 2.5925925925925925e-06, "loss": 9.0968, "step": 461 }, { "epoch": 2.2214866084863076, "grad_norm": 0.13705681264400482, "learning_rate": 2.576489533011272e-06, "loss": 8.9948, "step": 462 }, { "epoch": 2.226301534757749, "grad_norm": 0.16624397039413452, "learning_rate": 2.5603864734299523e-06, "loss": 8.6079, "step": 463 }, { "epoch": 2.2311164610291905, "grad_norm": 0.1475958675146103, "learning_rate": 2.5442834138486315e-06, "loss": 8.0187, "step": 464 }, { "epoch": 2.235931387300632, "grad_norm": 0.13494673371315002, "learning_rate": 2.528180354267311e-06, "loss": 8.6545, "step": 465 }, { "epoch": 2.2407463135720733, "grad_norm": 0.17623811960220337, "learning_rate": 2.5120772946859904e-06, "loss": 9.4341, "step": 466 }, { "epoch": 2.2455612398435147, "grad_norm": 0.1706833392381668, "learning_rate": 2.49597423510467e-06, "loss": 8.7199, "step": 467 }, { "epoch": 2.2503761661149566, "grad_norm": 0.1953025609254837, "learning_rate": 2.4798711755233494e-06, "loss": 8.9361, "step": 468 }, { "epoch": 2.255191092386398, "grad_norm": 0.20142245292663574, "learning_rate": 2.4637681159420295e-06, "loss": 8.0552, "step": 469 }, { "epoch": 2.2600060186578395, "grad_norm": 0.20138177275657654, "learning_rate": 2.4476650563607087e-06, "loss": 8.5942, "step": 470 }, { "epoch": 2.264820944929281, "grad_norm": 0.16559800505638123, "learning_rate": 2.4315619967793884e-06, "loss": 8.8228, "step": 471 }, { "epoch": 2.2696358712007223, "grad_norm": 0.19990870356559753, "learning_rate": 2.4154589371980677e-06, "loss": 8.8207, "step": 472 }, { "epoch": 2.2744507974721637, "grad_norm": 0.21723681688308716, "learning_rate": 2.3993558776167473e-06, "loss": 8.4973, "step": 473 }, { "epoch": 2.279265723743605, "grad_norm": 0.17915472388267517, "learning_rate": 2.383252818035427e-06, "loss": 9.6049, "step": 474 }, { "epoch": 2.2840806500150466, "grad_norm": 0.16757084429264069, "learning_rate": 2.3671497584541063e-06, "loss": 9.7332, "step": 475 }, { "epoch": 2.288895576286488, "grad_norm": 0.16891081631183624, "learning_rate": 2.351046698872786e-06, "loss": 8.8673, "step": 476 }, { "epoch": 2.2937105025579294, "grad_norm": 0.20567509531974792, "learning_rate": 2.3349436392914656e-06, "loss": 7.8363, "step": 477 }, { "epoch": 2.298525428829371, "grad_norm": 0.1999160349369049, "learning_rate": 2.3188405797101453e-06, "loss": 8.7728, "step": 478 }, { "epoch": 2.3033403551008127, "grad_norm": 0.2348831444978714, "learning_rate": 2.3027375201288245e-06, "loss": 9.1277, "step": 479 }, { "epoch": 2.308155281372254, "grad_norm": 0.1700768917798996, "learning_rate": 2.2866344605475042e-06, "loss": 8.6687, "step": 480 }, { "epoch": 2.3129702076436955, "grad_norm": 0.16349351406097412, "learning_rate": 2.270531400966184e-06, "loss": 8.0606, "step": 481 }, { "epoch": 2.317785133915137, "grad_norm": 0.1540592461824417, "learning_rate": 2.254428341384863e-06, "loss": 7.3249, "step": 482 }, { "epoch": 2.3226000601865784, "grad_norm": 0.1774080991744995, "learning_rate": 2.238325281803543e-06, "loss": 8.3134, "step": 483 }, { "epoch": 2.32741498645802, "grad_norm": 0.14969424903392792, "learning_rate": 2.222222222222222e-06, "loss": 7.8938, "step": 484 }, { "epoch": 2.3322299127294612, "grad_norm": 0.20331765711307526, "learning_rate": 2.206119162640902e-06, "loss": 8.3293, "step": 485 }, { "epoch": 2.3370448390009027, "grad_norm": 0.1849997490644455, "learning_rate": 2.1900161030595814e-06, "loss": 7.811, "step": 486 }, { "epoch": 2.341859765272344, "grad_norm": 0.1732867807149887, "learning_rate": 2.173913043478261e-06, "loss": 10.516, "step": 487 }, { "epoch": 2.346674691543786, "grad_norm": 0.21279215812683105, "learning_rate": 2.1578099838969404e-06, "loss": 9.1675, "step": 488 }, { "epoch": 2.3514896178152274, "grad_norm": 0.1616515964269638, "learning_rate": 2.14170692431562e-06, "loss": 7.8694, "step": 489 }, { "epoch": 2.356304544086669, "grad_norm": 0.1548496037721634, "learning_rate": 2.1256038647342997e-06, "loss": 9.4236, "step": 490 }, { "epoch": 2.36111947035811, "grad_norm": 0.19034922122955322, "learning_rate": 2.109500805152979e-06, "loss": 8.6999, "step": 491 }, { "epoch": 2.3659343966295516, "grad_norm": 0.15850062668323517, "learning_rate": 2.093397745571659e-06, "loss": 9.3389, "step": 492 }, { "epoch": 2.370749322900993, "grad_norm": 0.17764140665531158, "learning_rate": 2.0772946859903383e-06, "loss": 8.3777, "step": 493 }, { "epoch": 2.3755642491724345, "grad_norm": 0.1516241729259491, "learning_rate": 2.061191626409018e-06, "loss": 8.215, "step": 494 }, { "epoch": 2.380379175443876, "grad_norm": 0.19306409358978271, "learning_rate": 2.0450885668276972e-06, "loss": 8.5159, "step": 495 }, { "epoch": 2.3851941017153173, "grad_norm": 0.18563927710056305, "learning_rate": 2.028985507246377e-06, "loss": 9.1431, "step": 496 }, { "epoch": 2.3900090279867587, "grad_norm": 0.2177901268005371, "learning_rate": 2.0128824476650566e-06, "loss": 8.7708, "step": 497 }, { "epoch": 2.3948239542582006, "grad_norm": 0.18854300677776337, "learning_rate": 1.9967793880837363e-06, "loss": 7.7517, "step": 498 }, { "epoch": 2.399638880529642, "grad_norm": 0.19311924278736115, "learning_rate": 1.9806763285024155e-06, "loss": 8.5485, "step": 499 }, { "epoch": 2.4044538068010834, "grad_norm": 0.1653197556734085, "learning_rate": 1.964573268921095e-06, "loss": 8.2121, "step": 500 }, { "epoch": 2.409268733072525, "grad_norm": 0.14467386901378632, "learning_rate": 1.948470209339775e-06, "loss": 7.212, "step": 501 }, { "epoch": 2.4140836593439663, "grad_norm": 0.127033531665802, "learning_rate": 1.932367149758454e-06, "loss": 7.9942, "step": 502 }, { "epoch": 2.4188985856154077, "grad_norm": 0.22416523098945618, "learning_rate": 1.916264090177134e-06, "loss": 9.2825, "step": 503 }, { "epoch": 2.423713511886849, "grad_norm": 0.15797053277492523, "learning_rate": 1.9001610305958135e-06, "loss": 9.0045, "step": 504 }, { "epoch": 2.4285284381582906, "grad_norm": 0.16567374765872955, "learning_rate": 1.884057971014493e-06, "loss": 7.8467, "step": 505 }, { "epoch": 2.433343364429732, "grad_norm": 0.2187729775905609, "learning_rate": 1.8679549114331724e-06, "loss": 6.8691, "step": 506 }, { "epoch": 2.438158290701174, "grad_norm": 0.1330510675907135, "learning_rate": 1.8518518518518519e-06, "loss": 8.6586, "step": 507 }, { "epoch": 2.4429732169726153, "grad_norm": 0.18938250839710236, "learning_rate": 1.8357487922705318e-06, "loss": 8.7543, "step": 508 }, { "epoch": 2.4477881432440567, "grad_norm": 0.16788271069526672, "learning_rate": 1.8196457326892112e-06, "loss": 7.234, "step": 509 }, { "epoch": 2.452603069515498, "grad_norm": 0.13278517127037048, "learning_rate": 1.8035426731078907e-06, "loss": 8.4826, "step": 510 }, { "epoch": 2.4574179957869395, "grad_norm": 0.12632611393928528, "learning_rate": 1.7874396135265702e-06, "loss": 8.6997, "step": 511 }, { "epoch": 2.462232922058381, "grad_norm": 0.21339954435825348, "learning_rate": 1.7713365539452498e-06, "loss": 8.8112, "step": 512 }, { "epoch": 2.4670478483298224, "grad_norm": 0.17126010358333588, "learning_rate": 1.7552334943639293e-06, "loss": 7.5743, "step": 513 }, { "epoch": 2.471862774601264, "grad_norm": 0.13244563341140747, "learning_rate": 1.7391304347826088e-06, "loss": 7.7622, "step": 514 }, { "epoch": 2.4766777008727052, "grad_norm": 0.21267832815647125, "learning_rate": 1.7230273752012882e-06, "loss": 6.7007, "step": 515 }, { "epoch": 2.4814926271441466, "grad_norm": 0.12102889269590378, "learning_rate": 1.7069243156199681e-06, "loss": 9.1424, "step": 516 }, { "epoch": 2.4863075534155885, "grad_norm": 0.13392595946788788, "learning_rate": 1.6908212560386476e-06, "loss": 8.5965, "step": 517 }, { "epoch": 2.49112247968703, "grad_norm": 0.1512872725725174, "learning_rate": 1.674718196457327e-06, "loss": 8.3953, "step": 518 }, { "epoch": 2.4959374059584714, "grad_norm": 0.13532410562038422, "learning_rate": 1.6586151368760065e-06, "loss": 8.184, "step": 519 }, { "epoch": 2.5007523322299128, "grad_norm": 0.1816960871219635, "learning_rate": 1.6425120772946862e-06, "loss": 9.9139, "step": 520 }, { "epoch": 2.505567258501354, "grad_norm": 0.11753327399492264, "learning_rate": 1.6264090177133656e-06, "loss": 8.4936, "step": 521 }, { "epoch": 2.5103821847727956, "grad_norm": 0.20234891772270203, "learning_rate": 1.610305958132045e-06, "loss": 8.1004, "step": 522 }, { "epoch": 2.515197111044237, "grad_norm": 0.14017826318740845, "learning_rate": 1.5942028985507246e-06, "loss": 8.4294, "step": 523 }, { "epoch": 2.5200120373156785, "grad_norm": 0.1481131762266159, "learning_rate": 1.5780998389694045e-06, "loss": 8.3886, "step": 524 }, { "epoch": 2.5248269635871203, "grad_norm": 0.2701749801635742, "learning_rate": 1.561996779388084e-06, "loss": 8.1065, "step": 525 }, { "epoch": 2.5296418898585618, "grad_norm": 0.16109466552734375, "learning_rate": 1.5458937198067634e-06, "loss": 8.4212, "step": 526 }, { "epoch": 2.534456816130003, "grad_norm": 0.18063953518867493, "learning_rate": 1.529790660225443e-06, "loss": 7.8353, "step": 527 }, { "epoch": 2.5392717424014446, "grad_norm": 0.16267195343971252, "learning_rate": 1.5136876006441225e-06, "loss": 7.9062, "step": 528 }, { "epoch": 2.544086668672886, "grad_norm": 0.1997467577457428, "learning_rate": 1.497584541062802e-06, "loss": 8.6919, "step": 529 }, { "epoch": 2.5489015949443274, "grad_norm": 0.15415464341640472, "learning_rate": 1.4814814814814815e-06, "loss": 8.2469, "step": 530 }, { "epoch": 2.553716521215769, "grad_norm": 0.1869962513446808, "learning_rate": 1.4653784219001613e-06, "loss": 8.216, "step": 531 }, { "epoch": 2.5585314474872103, "grad_norm": 0.14521171152591705, "learning_rate": 1.4492753623188408e-06, "loss": 8.2956, "step": 532 }, { "epoch": 2.5633463737586517, "grad_norm": 0.1761654019355774, "learning_rate": 1.4331723027375203e-06, "loss": 8.3107, "step": 533 }, { "epoch": 2.568161300030093, "grad_norm": 0.1776813566684723, "learning_rate": 1.4170692431561997e-06, "loss": 7.2249, "step": 534 }, { "epoch": 2.5729762263015346, "grad_norm": 0.19041168689727783, "learning_rate": 1.4009661835748794e-06, "loss": 8.6567, "step": 535 }, { "epoch": 2.577791152572976, "grad_norm": 0.1729832887649536, "learning_rate": 1.3848631239935589e-06, "loss": 9.1775, "step": 536 }, { "epoch": 2.582606078844418, "grad_norm": 0.1917349100112915, "learning_rate": 1.3687600644122383e-06, "loss": 8.1724, "step": 537 }, { "epoch": 2.5874210051158593, "grad_norm": 0.19829866290092468, "learning_rate": 1.3526570048309178e-06, "loss": 9.4741, "step": 538 }, { "epoch": 2.5922359313873007, "grad_norm": 0.17467886209487915, "learning_rate": 1.3365539452495977e-06, "loss": 8.3608, "step": 539 }, { "epoch": 2.597050857658742, "grad_norm": 0.25771814584732056, "learning_rate": 1.3204508856682772e-06, "loss": 8.5837, "step": 540 }, { "epoch": 2.6018657839301835, "grad_norm": 0.13524986803531647, "learning_rate": 1.3043478260869566e-06, "loss": 7.8586, "step": 541 }, { "epoch": 2.606680710201625, "grad_norm": 0.20528331398963928, "learning_rate": 1.288244766505636e-06, "loss": 8.9202, "step": 542 }, { "epoch": 2.6114956364730664, "grad_norm": 0.18491816520690918, "learning_rate": 1.2721417069243158e-06, "loss": 8.1091, "step": 543 }, { "epoch": 2.6163105627445082, "grad_norm": 0.14208512008190155, "learning_rate": 1.2560386473429952e-06, "loss": 8.8829, "step": 544 }, { "epoch": 2.6211254890159497, "grad_norm": 0.22715114057064056, "learning_rate": 1.2399355877616747e-06, "loss": 8.4472, "step": 545 }, { "epoch": 2.625940415287391, "grad_norm": 0.18286040425300598, "learning_rate": 1.2238325281803544e-06, "loss": 8.129, "step": 546 }, { "epoch": 2.6307553415588325, "grad_norm": 0.18549402058124542, "learning_rate": 1.2077294685990338e-06, "loss": 8.3491, "step": 547 }, { "epoch": 2.635570267830274, "grad_norm": 0.16227751970291138, "learning_rate": 1.1916264090177135e-06, "loss": 8.0191, "step": 548 }, { "epoch": 2.6403851941017153, "grad_norm": 0.17795391380786896, "learning_rate": 1.175523349436393e-06, "loss": 7.1223, "step": 549 }, { "epoch": 2.6452001203731568, "grad_norm": 0.17126573622226715, "learning_rate": 1.1594202898550726e-06, "loss": 8.2455, "step": 550 }, { "epoch": 2.650015046644598, "grad_norm": 0.17369426786899567, "learning_rate": 1.1433172302737521e-06, "loss": 9.6182, "step": 551 }, { "epoch": 2.6548299729160396, "grad_norm": 0.14956361055374146, "learning_rate": 1.1272141706924316e-06, "loss": 7.7231, "step": 552 }, { "epoch": 2.659644899187481, "grad_norm": 0.17787741124629974, "learning_rate": 1.111111111111111e-06, "loss": 8.531, "step": 553 }, { "epoch": 2.6644598254589225, "grad_norm": 0.16423143446445465, "learning_rate": 1.0950080515297907e-06, "loss": 9.07, "step": 554 }, { "epoch": 2.669274751730364, "grad_norm": 0.18575292825698853, "learning_rate": 1.0789049919484702e-06, "loss": 9.2116, "step": 555 }, { "epoch": 2.6740896780018057, "grad_norm": 0.1774529069662094, "learning_rate": 1.0628019323671499e-06, "loss": 7.6522, "step": 556 }, { "epoch": 2.678904604273247, "grad_norm": 0.12618403136730194, "learning_rate": 1.0466988727858295e-06, "loss": 8.2429, "step": 557 }, { "epoch": 2.6837195305446886, "grad_norm": 0.1379764825105667, "learning_rate": 1.030595813204509e-06, "loss": 9.0101, "step": 558 }, { "epoch": 2.68853445681613, "grad_norm": 0.1804221123456955, "learning_rate": 1.0144927536231885e-06, "loss": 7.1618, "step": 559 }, { "epoch": 2.6933493830875714, "grad_norm": 0.2020816057920456, "learning_rate": 9.983896940418681e-07, "loss": 8.0899, "step": 560 }, { "epoch": 2.698164309359013, "grad_norm": 0.1975187063217163, "learning_rate": 9.822866344605476e-07, "loss": 6.9638, "step": 561 }, { "epoch": 2.7029792356304543, "grad_norm": 0.21582917869091034, "learning_rate": 9.66183574879227e-07, "loss": 8.3501, "step": 562 }, { "epoch": 2.707794161901896, "grad_norm": 0.1378657966852188, "learning_rate": 9.500805152979067e-07, "loss": 7.8302, "step": 563 }, { "epoch": 2.7126090881733376, "grad_norm": 0.17029066383838654, "learning_rate": 9.339774557165862e-07, "loss": 8.3873, "step": 564 }, { "epoch": 2.717424014444779, "grad_norm": 0.1723220944404602, "learning_rate": 9.178743961352659e-07, "loss": 7.8308, "step": 565 }, { "epoch": 2.7222389407162204, "grad_norm": 0.15351563692092896, "learning_rate": 9.017713365539453e-07, "loss": 7.8458, "step": 566 }, { "epoch": 2.727053866987662, "grad_norm": 0.15215708315372467, "learning_rate": 8.856682769726249e-07, "loss": 8.4561, "step": 567 }, { "epoch": 2.7318687932591033, "grad_norm": 0.18015427887439728, "learning_rate": 8.695652173913044e-07, "loss": 7.986, "step": 568 }, { "epoch": 2.7366837195305447, "grad_norm": 0.18228279054164886, "learning_rate": 8.534621578099841e-07, "loss": 8.3809, "step": 569 }, { "epoch": 2.741498645801986, "grad_norm": 0.17855818569660187, "learning_rate": 8.373590982286635e-07, "loss": 7.2222, "step": 570 }, { "epoch": 2.7463135720734275, "grad_norm": 0.12565724551677704, "learning_rate": 8.212560386473431e-07, "loss": 8.319, "step": 571 }, { "epoch": 2.751128498344869, "grad_norm": 0.1549467146396637, "learning_rate": 8.051529790660226e-07, "loss": 7.4813, "step": 572 }, { "epoch": 2.7559434246163104, "grad_norm": 0.19094257056713104, "learning_rate": 7.890499194847022e-07, "loss": 7.5127, "step": 573 }, { "epoch": 2.760758350887752, "grad_norm": 0.18528102338314056, "learning_rate": 7.729468599033817e-07, "loss": 8.4165, "step": 574 }, { "epoch": 2.7655732771591937, "grad_norm": 0.17467372119426727, "learning_rate": 7.568438003220613e-07, "loss": 8.4328, "step": 575 }, { "epoch": 2.770388203430635, "grad_norm": 0.1786053627729416, "learning_rate": 7.407407407407407e-07, "loss": 7.6748, "step": 576 }, { "epoch": 2.7752031297020765, "grad_norm": 0.2303641140460968, "learning_rate": 7.246376811594204e-07, "loss": 7.6819, "step": 577 }, { "epoch": 2.780018055973518, "grad_norm": 0.20672529935836792, "learning_rate": 7.085346215780999e-07, "loss": 7.6466, "step": 578 }, { "epoch": 2.7848329822449593, "grad_norm": 0.20678630471229553, "learning_rate": 6.924315619967794e-07, "loss": 7.83, "step": 579 }, { "epoch": 2.7896479085164008, "grad_norm": 0.22579342126846313, "learning_rate": 6.763285024154589e-07, "loss": 8.352, "step": 580 }, { "epoch": 2.794462834787842, "grad_norm": 0.21970775723457336, "learning_rate": 6.602254428341386e-07, "loss": 9.0786, "step": 581 }, { "epoch": 2.799277761059284, "grad_norm": 0.15649309754371643, "learning_rate": 6.44122383252818e-07, "loss": 8.0365, "step": 582 }, { "epoch": 2.8040926873307255, "grad_norm": 0.15020275115966797, "learning_rate": 6.280193236714976e-07, "loss": 7.5352, "step": 583 }, { "epoch": 2.808907613602167, "grad_norm": 0.1699695736169815, "learning_rate": 6.119162640901772e-07, "loss": 7.9234, "step": 584 }, { "epoch": 2.8137225398736083, "grad_norm": 0.14597013592720032, "learning_rate": 5.958132045088568e-07, "loss": 8.406, "step": 585 }, { "epoch": 2.8185374661450497, "grad_norm": 0.1936945766210556, "learning_rate": 5.797101449275363e-07, "loss": 7.1909, "step": 586 }, { "epoch": 2.823352392416491, "grad_norm": 0.1677147001028061, "learning_rate": 5.636070853462158e-07, "loss": 8.3866, "step": 587 }, { "epoch": 2.8281673186879326, "grad_norm": 0.1816486269235611, "learning_rate": 5.475040257648954e-07, "loss": 8.0837, "step": 588 }, { "epoch": 2.832982244959374, "grad_norm": 0.18202394247055054, "learning_rate": 5.314009661835749e-07, "loss": 9.1406, "step": 589 }, { "epoch": 2.8377971712308154, "grad_norm": 0.2390686720609665, "learning_rate": 5.152979066022545e-07, "loss": 8.5755, "step": 590 }, { "epoch": 2.842612097502257, "grad_norm": 0.18315307796001434, "learning_rate": 4.991948470209341e-07, "loss": 8.1501, "step": 591 }, { "epoch": 2.8474270237736983, "grad_norm": 0.17412015795707703, "learning_rate": 4.830917874396135e-07, "loss": 8.2024, "step": 592 }, { "epoch": 2.8522419500451397, "grad_norm": 0.18761633336544037, "learning_rate": 4.669887278582931e-07, "loss": 7.597, "step": 593 }, { "epoch": 2.8570568763165816, "grad_norm": 0.1563250869512558, "learning_rate": 4.5088566827697267e-07, "loss": 8.4617, "step": 594 }, { "epoch": 2.861871802588023, "grad_norm": 0.13112574815750122, "learning_rate": 4.347826086956522e-07, "loss": 7.6717, "step": 595 }, { "epoch": 2.8666867288594644, "grad_norm": 0.1944950670003891, "learning_rate": 4.1867954911433176e-07, "loss": 8.5494, "step": 596 }, { "epoch": 2.871501655130906, "grad_norm": 0.18215830624103546, "learning_rate": 4.025764895330113e-07, "loss": 8.4453, "step": 597 }, { "epoch": 2.8763165814023472, "grad_norm": 0.15392394363880157, "learning_rate": 3.8647342995169085e-07, "loss": 7.0528, "step": 598 }, { "epoch": 2.8811315076737887, "grad_norm": 0.17069800198078156, "learning_rate": 3.7037037037037036e-07, "loss": 9.8743, "step": 599 }, { "epoch": 2.88594643394523, "grad_norm": 0.13186608254909515, "learning_rate": 3.5426731078904993e-07, "loss": 7.9741, "step": 600 }, { "epoch": 2.8907613602166715, "grad_norm": 0.15300041437149048, "learning_rate": 3.3816425120772945e-07, "loss": 8.788, "step": 601 }, { "epoch": 2.8955762864881134, "grad_norm": 0.15090717375278473, "learning_rate": 3.22061191626409e-07, "loss": 7.801, "step": 602 }, { "epoch": 2.900391212759555, "grad_norm": 0.1606573611497879, "learning_rate": 3.059581320450886e-07, "loss": 8.5165, "step": 603 }, { "epoch": 2.9052061390309962, "grad_norm": 0.15746456384658813, "learning_rate": 2.8985507246376816e-07, "loss": 7.3699, "step": 604 }, { "epoch": 2.9100210653024376, "grad_norm": 0.1550646871328354, "learning_rate": 2.737520128824477e-07, "loss": 7.6848, "step": 605 }, { "epoch": 2.914835991573879, "grad_norm": 0.14871163666248322, "learning_rate": 2.5764895330112725e-07, "loss": 7.6812, "step": 606 }, { "epoch": 2.9196509178453205, "grad_norm": 0.2426673322916031, "learning_rate": 2.4154589371980677e-07, "loss": 7.7157, "step": 607 }, { "epoch": 2.924465844116762, "grad_norm": 0.19695597887039185, "learning_rate": 2.2544283413848634e-07, "loss": 7.9716, "step": 608 }, { "epoch": 2.9292807703882033, "grad_norm": 0.18192477524280548, "learning_rate": 2.0933977455716588e-07, "loss": 7.4395, "step": 609 }, { "epoch": 2.9340956966596448, "grad_norm": 0.18087869882583618, "learning_rate": 1.9323671497584542e-07, "loss": 7.6177, "step": 610 }, { "epoch": 2.938910622931086, "grad_norm": 0.1489817202091217, "learning_rate": 1.7713365539452497e-07, "loss": 7.4342, "step": 611 }, { "epoch": 2.9437255492025276, "grad_norm": 0.12941974401474, "learning_rate": 1.610305958132045e-07, "loss": 9.0655, "step": 612 }, { "epoch": 2.9485404754739695, "grad_norm": 0.1680421680212021, "learning_rate": 1.4492753623188408e-07, "loss": 7.2567, "step": 613 }, { "epoch": 2.953355401745411, "grad_norm": 0.18065397441387177, "learning_rate": 1.2882447665056362e-07, "loss": 7.9862, "step": 614 }, { "epoch": 2.9581703280168523, "grad_norm": 0.1599837988615036, "learning_rate": 1.1272141706924317e-07, "loss": 8.2424, "step": 615 }, { "epoch": 2.9629852542882937, "grad_norm": 0.1959857940673828, "learning_rate": 9.661835748792271e-08, "loss": 7.4787, "step": 616 }, { "epoch": 2.967800180559735, "grad_norm": 0.15649034082889557, "learning_rate": 8.051529790660226e-08, "loss": 8.1575, "step": 617 }, { "epoch": 2.9726151068311766, "grad_norm": 0.1679297834634781, "learning_rate": 6.441223832528181e-08, "loss": 7.2253, "step": 618 }, { "epoch": 2.977430033102618, "grad_norm": 0.15790539979934692, "learning_rate": 4.8309178743961356e-08, "loss": 7.6764, "step": 619 }, { "epoch": 2.9822449593740594, "grad_norm": 0.1694169044494629, "learning_rate": 3.2206119162640906e-08, "loss": 8.6771, "step": 620 }, { "epoch": 2.9870598856455013, "grad_norm": 0.17527279257774353, "learning_rate": 1.6103059581320453e-08, "loss": 10.7562, "step": 621 }, { "epoch": 2.9870598856455013, "step": 621, "total_flos": 2.802876100504453e+18, "train_loss": 10.002562027622536, "train_runtime": 59343.0737, "train_samples_per_second": 1.344, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 621, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.802876100504453e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }