diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,32872 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 4690, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010660980810234541, + "grad_norm": 5.4528142214861735, + "learning_rate": 1.7057569296375268e-07, + "loss": 0.837, + "step": 1 + }, + { + "epoch": 0.0021321961620469083, + "grad_norm": 5.511675498609771, + "learning_rate": 3.4115138592750537e-07, + "loss": 0.8508, + "step": 2 + }, + { + "epoch": 0.0031982942430703624, + "grad_norm": 5.464511849599797, + "learning_rate": 5.11727078891258e-07, + "loss": 0.8447, + "step": 3 + }, + { + "epoch": 0.0042643923240938165, + "grad_norm": 5.447268946229452, + "learning_rate": 6.823027718550107e-07, + "loss": 0.836, + "step": 4 + }, + { + "epoch": 0.005330490405117271, + "grad_norm": 5.381379257626523, + "learning_rate": 8.528784648187634e-07, + "loss": 0.8463, + "step": 5 + }, + { + "epoch": 0.006396588486140725, + "grad_norm": 5.4441126548183725, + "learning_rate": 1.023454157782516e-06, + "loss": 0.8514, + "step": 6 + }, + { + "epoch": 0.007462686567164179, + "grad_norm": 5.008014360316408, + "learning_rate": 1.1940298507462686e-06, + "loss": 0.8215, + "step": 7 + }, + { + "epoch": 0.008528784648187633, + "grad_norm": 5.000283670760778, + "learning_rate": 1.3646055437100215e-06, + "loss": 0.8361, + "step": 8 + }, + { + "epoch": 0.009594882729211088, + "grad_norm": 3.964243964413472, + "learning_rate": 1.5351812366737743e-06, + "loss": 0.7919, + "step": 9 + }, + { + "epoch": 0.010660980810234541, + "grad_norm": 3.7864879773612823, + "learning_rate": 1.7057569296375267e-06, + "loss": 0.7889, + "step": 10 + }, + { + "epoch": 0.011727078891257996, + "grad_norm": 2.512068820205037, + "learning_rate": 1.8763326226012796e-06, + "loss": 0.7586, + "step": 11 + }, + { + "epoch": 0.01279317697228145, + "grad_norm": 2.259289045541819, + "learning_rate": 2.046908315565032e-06, + "loss": 0.7527, + "step": 12 + }, + { + "epoch": 0.013859275053304905, + "grad_norm": 2.138291149917742, + "learning_rate": 2.217484008528785e-06, + "loss": 0.7504, + "step": 13 + }, + { + "epoch": 0.014925373134328358, + "grad_norm": 1.9179684528050671, + "learning_rate": 2.3880597014925373e-06, + "loss": 0.7449, + "step": 14 + }, + { + "epoch": 0.015991471215351813, + "grad_norm": 2.0680127043179235, + "learning_rate": 2.55863539445629e-06, + "loss": 0.7272, + "step": 15 + }, + { + "epoch": 0.017057569296375266, + "grad_norm": 3.1604717469534447, + "learning_rate": 2.729211087420043e-06, + "loss": 0.7213, + "step": 16 + }, + { + "epoch": 0.01812366737739872, + "grad_norm": 3.278566086414656, + "learning_rate": 2.8997867803837954e-06, + "loss": 0.7202, + "step": 17 + }, + { + "epoch": 0.019189765458422176, + "grad_norm": 3.1700600356728033, + "learning_rate": 3.0703624733475486e-06, + "loss": 0.7189, + "step": 18 + }, + { + "epoch": 0.02025586353944563, + "grad_norm": 3.0282075577668857, + "learning_rate": 3.240938166311301e-06, + "loss": 0.7239, + "step": 19 + }, + { + "epoch": 0.021321961620469083, + "grad_norm": 2.652594115580126, + "learning_rate": 3.4115138592750535e-06, + "loss": 0.701, + "step": 20 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 1.9034175357194445, + "learning_rate": 3.582089552238806e-06, + "loss": 0.6933, + "step": 21 + }, + { + "epoch": 0.023454157782515993, + "grad_norm": 1.5538529247302069, + "learning_rate": 3.752665245202559e-06, + "loss": 0.6571, + "step": 22 + }, + { + "epoch": 0.024520255863539446, + "grad_norm": 1.2849096876637858, + "learning_rate": 3.9232409381663116e-06, + "loss": 0.6563, + "step": 23 + }, + { + "epoch": 0.0255863539445629, + "grad_norm": 1.0948864208805895, + "learning_rate": 4.093816631130064e-06, + "loss": 0.6466, + "step": 24 + }, + { + "epoch": 0.026652452025586353, + "grad_norm": 1.0624300872894943, + "learning_rate": 4.264392324093816e-06, + "loss": 0.6424, + "step": 25 + }, + { + "epoch": 0.02771855010660981, + "grad_norm": 1.131312650641839, + "learning_rate": 4.43496801705757e-06, + "loss": 0.6369, + "step": 26 + }, + { + "epoch": 0.028784648187633263, + "grad_norm": 1.0990409054108197, + "learning_rate": 4.605543710021322e-06, + "loss": 0.624, + "step": 27 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 0.9649930973741134, + "learning_rate": 4.7761194029850745e-06, + "loss": 0.6149, + "step": 28 + }, + { + "epoch": 0.03091684434968017, + "grad_norm": 0.7025445114737157, + "learning_rate": 4.946695095948828e-06, + "loss": 0.6143, + "step": 29 + }, + { + "epoch": 0.031982942430703626, + "grad_norm": 0.7153092735896356, + "learning_rate": 5.11727078891258e-06, + "loss": 0.6087, + "step": 30 + }, + { + "epoch": 0.03304904051172708, + "grad_norm": 0.8673451563666814, + "learning_rate": 5.2878464818763335e-06, + "loss": 0.6062, + "step": 31 + }, + { + "epoch": 0.03411513859275053, + "grad_norm": 0.8051803897020771, + "learning_rate": 5.458422174840086e-06, + "loss": 0.5961, + "step": 32 + }, + { + "epoch": 0.035181236673773986, + "grad_norm": 0.6282947287043709, + "learning_rate": 5.628997867803838e-06, + "loss": 0.5945, + "step": 33 + }, + { + "epoch": 0.03624733475479744, + "grad_norm": 0.5284036506183948, + "learning_rate": 5.799573560767591e-06, + "loss": 0.5873, + "step": 34 + }, + { + "epoch": 0.03731343283582089, + "grad_norm": 0.6560915453306406, + "learning_rate": 5.970149253731343e-06, + "loss": 0.5926, + "step": 35 + }, + { + "epoch": 0.03837953091684435, + "grad_norm": 0.6254389118129263, + "learning_rate": 6.140724946695097e-06, + "loss": 0.5925, + "step": 36 + }, + { + "epoch": 0.039445628997867806, + "grad_norm": 0.4747128371854551, + "learning_rate": 6.31130063965885e-06, + "loss": 0.5789, + "step": 37 + }, + { + "epoch": 0.04051172707889126, + "grad_norm": 0.43328964711656565, + "learning_rate": 6.481876332622602e-06, + "loss": 0.5777, + "step": 38 + }, + { + "epoch": 0.04157782515991471, + "grad_norm": 0.5041187156835155, + "learning_rate": 6.6524520255863545e-06, + "loss": 0.5731, + "step": 39 + }, + { + "epoch": 0.042643923240938165, + "grad_norm": 0.4996573987096877, + "learning_rate": 6.823027718550107e-06, + "loss": 0.5704, + "step": 40 + }, + { + "epoch": 0.04371002132196162, + "grad_norm": 0.40178653209472504, + "learning_rate": 6.993603411513859e-06, + "loss": 0.5687, + "step": 41 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 0.39678565068179716, + "learning_rate": 7.164179104477612e-06, + "loss": 0.5606, + "step": 42 + }, + { + "epoch": 0.04584221748400853, + "grad_norm": 0.3838477339215644, + "learning_rate": 7.334754797441366e-06, + "loss": 0.5551, + "step": 43 + }, + { + "epoch": 0.046908315565031986, + "grad_norm": 0.39376377958875114, + "learning_rate": 7.505330490405118e-06, + "loss": 0.5586, + "step": 44 + }, + { + "epoch": 0.04797441364605544, + "grad_norm": 0.4089383682673747, + "learning_rate": 7.67590618336887e-06, + "loss": 0.544, + "step": 45 + }, + { + "epoch": 0.04904051172707889, + "grad_norm": 0.3674905464499231, + "learning_rate": 7.846481876332623e-06, + "loss": 0.5562, + "step": 46 + }, + { + "epoch": 0.050106609808102345, + "grad_norm": 0.3391811295227044, + "learning_rate": 8.017057569296376e-06, + "loss": 0.5489, + "step": 47 + }, + { + "epoch": 0.0511727078891258, + "grad_norm": 0.3771487479148055, + "learning_rate": 8.187633262260128e-06, + "loss": 0.5553, + "step": 48 + }, + { + "epoch": 0.05223880597014925, + "grad_norm": 0.3264015224929875, + "learning_rate": 8.35820895522388e-06, + "loss": 0.5505, + "step": 49 + }, + { + "epoch": 0.053304904051172705, + "grad_norm": 0.3331723647408018, + "learning_rate": 8.528784648187633e-06, + "loss": 0.5357, + "step": 50 + }, + { + "epoch": 0.054371002132196165, + "grad_norm": 0.37485261764922745, + "learning_rate": 8.699360341151387e-06, + "loss": 0.5441, + "step": 51 + }, + { + "epoch": 0.05543710021321962, + "grad_norm": 0.9120774222301191, + "learning_rate": 8.86993603411514e-06, + "loss": 0.5324, + "step": 52 + }, + { + "epoch": 0.05650319829424307, + "grad_norm": 0.3263354859744661, + "learning_rate": 9.040511727078892e-06, + "loss": 0.5352, + "step": 53 + }, + { + "epoch": 0.057569296375266525, + "grad_norm": 0.35885753984901503, + "learning_rate": 9.211087420042644e-06, + "loss": 0.5405, + "step": 54 + }, + { + "epoch": 0.05863539445628998, + "grad_norm": 0.29649061716771913, + "learning_rate": 9.381663113006397e-06, + "loss": 0.5361, + "step": 55 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 0.24951946598497796, + "learning_rate": 9.552238805970149e-06, + "loss": 0.5301, + "step": 56 + }, + { + "epoch": 0.060767590618336885, + "grad_norm": 0.3772047271710351, + "learning_rate": 9.722814498933903e-06, + "loss": 0.5448, + "step": 57 + }, + { + "epoch": 0.06183368869936034, + "grad_norm": 0.2493874964259867, + "learning_rate": 9.893390191897656e-06, + "loss": 0.522, + "step": 58 + }, + { + "epoch": 0.0628997867803838, + "grad_norm": 0.24173737961013422, + "learning_rate": 1.0063965884861408e-05, + "loss": 0.5295, + "step": 59 + }, + { + "epoch": 0.06396588486140725, + "grad_norm": 0.3214864858366852, + "learning_rate": 1.023454157782516e-05, + "loss": 0.534, + "step": 60 + }, + { + "epoch": 0.0650319829424307, + "grad_norm": 0.22660214109074844, + "learning_rate": 1.0405117270788913e-05, + "loss": 0.5207, + "step": 61 + }, + { + "epoch": 0.06609808102345416, + "grad_norm": 0.22692046089904563, + "learning_rate": 1.0575692963752667e-05, + "loss": 0.524, + "step": 62 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 0.267684560903461, + "learning_rate": 1.074626865671642e-05, + "loss": 0.5241, + "step": 63 + }, + { + "epoch": 0.06823027718550106, + "grad_norm": 0.2145100827755043, + "learning_rate": 1.0916844349680172e-05, + "loss": 0.5267, + "step": 64 + }, + { + "epoch": 0.06929637526652452, + "grad_norm": 0.2581454974388823, + "learning_rate": 1.1087420042643924e-05, + "loss": 0.5263, + "step": 65 + }, + { + "epoch": 0.07036247334754797, + "grad_norm": 0.23991967383468807, + "learning_rate": 1.1257995735607677e-05, + "loss": 0.5165, + "step": 66 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.19942140384569496, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.5168, + "step": 67 + }, + { + "epoch": 0.07249466950959488, + "grad_norm": 0.2412614596538889, + "learning_rate": 1.1599147121535181e-05, + "loss": 0.5137, + "step": 68 + }, + { + "epoch": 0.07356076759061833, + "grad_norm": 0.213075163701682, + "learning_rate": 1.1769722814498934e-05, + "loss": 0.5177, + "step": 69 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 0.2078353315515298, + "learning_rate": 1.1940298507462686e-05, + "loss": 0.5203, + "step": 70 + }, + { + "epoch": 0.07569296375266525, + "grad_norm": 0.24803061207482333, + "learning_rate": 1.2110874200426442e-05, + "loss": 0.5127, + "step": 71 + }, + { + "epoch": 0.0767590618336887, + "grad_norm": 0.200388439144567, + "learning_rate": 1.2281449893390195e-05, + "loss": 0.524, + "step": 72 + }, + { + "epoch": 0.07782515991471216, + "grad_norm": 0.26609196507744726, + "learning_rate": 1.2452025586353947e-05, + "loss": 0.5166, + "step": 73 + }, + { + "epoch": 0.07889125799573561, + "grad_norm": 0.2746212953481589, + "learning_rate": 1.26226012793177e-05, + "loss": 0.5097, + "step": 74 + }, + { + "epoch": 0.07995735607675906, + "grad_norm": 0.25408109337829365, + "learning_rate": 1.2793176972281452e-05, + "loss": 0.5124, + "step": 75 + }, + { + "epoch": 0.08102345415778252, + "grad_norm": 0.23041327392909428, + "learning_rate": 1.2963752665245204e-05, + "loss": 0.5114, + "step": 76 + }, + { + "epoch": 0.08208955223880597, + "grad_norm": 0.25359601440739266, + "learning_rate": 1.3134328358208957e-05, + "loss": 0.5126, + "step": 77 + }, + { + "epoch": 0.08315565031982942, + "grad_norm": 0.26693374625690547, + "learning_rate": 1.3304904051172709e-05, + "loss": 0.5026, + "step": 78 + }, + { + "epoch": 0.08422174840085288, + "grad_norm": 0.32689327987808636, + "learning_rate": 1.3475479744136461e-05, + "loss": 0.51, + "step": 79 + }, + { + "epoch": 0.08528784648187633, + "grad_norm": 0.3372086808231497, + "learning_rate": 1.3646055437100214e-05, + "loss": 0.5026, + "step": 80 + }, + { + "epoch": 0.08635394456289978, + "grad_norm": 0.3813830415514882, + "learning_rate": 1.3816631130063966e-05, + "loss": 0.5066, + "step": 81 + }, + { + "epoch": 0.08742004264392324, + "grad_norm": 0.5438093416524072, + "learning_rate": 1.3987206823027719e-05, + "loss": 0.5164, + "step": 82 + }, + { + "epoch": 0.08848614072494669, + "grad_norm": 0.6495392831189992, + "learning_rate": 1.4157782515991471e-05, + "loss": 0.5043, + "step": 83 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 0.6977147730974188, + "learning_rate": 1.4328358208955224e-05, + "loss": 0.5036, + "step": 84 + }, + { + "epoch": 0.0906183368869936, + "grad_norm": 0.6222679447369283, + "learning_rate": 1.4498933901918976e-05, + "loss": 0.5098, + "step": 85 + }, + { + "epoch": 0.09168443496801706, + "grad_norm": 0.47152661245187916, + "learning_rate": 1.4669509594882732e-05, + "loss": 0.5055, + "step": 86 + }, + { + "epoch": 0.09275053304904052, + "grad_norm": 0.3720647971705006, + "learning_rate": 1.4840085287846484e-05, + "loss": 0.5093, + "step": 87 + }, + { + "epoch": 0.09381663113006397, + "grad_norm": 0.429318588389719, + "learning_rate": 1.5010660980810237e-05, + "loss": 0.5012, + "step": 88 + }, + { + "epoch": 0.09488272921108742, + "grad_norm": 0.5114262103736821, + "learning_rate": 1.5181236673773989e-05, + "loss": 0.5057, + "step": 89 + }, + { + "epoch": 0.09594882729211088, + "grad_norm": 0.4452652595986177, + "learning_rate": 1.535181236673774e-05, + "loss": 0.5044, + "step": 90 + }, + { + "epoch": 0.09701492537313433, + "grad_norm": 0.30399112681775725, + "learning_rate": 1.5522388059701494e-05, + "loss": 0.5039, + "step": 91 + }, + { + "epoch": 0.09808102345415778, + "grad_norm": 0.44638035180999586, + "learning_rate": 1.5692963752665246e-05, + "loss": 0.5088, + "step": 92 + }, + { + "epoch": 0.09914712153518124, + "grad_norm": 0.5355105792075635, + "learning_rate": 1.5863539445629e-05, + "loss": 0.5068, + "step": 93 + }, + { + "epoch": 0.10021321961620469, + "grad_norm": 1.6435340842916062, + "learning_rate": 1.603411513859275e-05, + "loss": 0.5044, + "step": 94 + }, + { + "epoch": 0.10127931769722814, + "grad_norm": 0.41709741899745906, + "learning_rate": 1.6204690831556504e-05, + "loss": 0.4993, + "step": 95 + }, + { + "epoch": 0.1023454157782516, + "grad_norm": 0.8489238666616422, + "learning_rate": 1.6375266524520256e-05, + "loss": 0.4982, + "step": 96 + }, + { + "epoch": 0.10341151385927505, + "grad_norm": 0.7950895739539379, + "learning_rate": 1.654584221748401e-05, + "loss": 0.4955, + "step": 97 + }, + { + "epoch": 0.1044776119402985, + "grad_norm": 0.3524964920301634, + "learning_rate": 1.671641791044776e-05, + "loss": 0.4896, + "step": 98 + }, + { + "epoch": 0.10554371002132196, + "grad_norm": 0.48403087660030325, + "learning_rate": 1.6886993603411513e-05, + "loss": 0.4938, + "step": 99 + }, + { + "epoch": 0.10660980810234541, + "grad_norm": 0.8862953843822765, + "learning_rate": 1.7057569296375266e-05, + "loss": 0.5003, + "step": 100 + }, + { + "epoch": 0.10767590618336886, + "grad_norm": 0.84687190937384, + "learning_rate": 1.7228144989339018e-05, + "loss": 0.498, + "step": 101 + }, + { + "epoch": 0.10874200426439233, + "grad_norm": 0.7104553983279852, + "learning_rate": 1.7398720682302774e-05, + "loss": 0.5002, + "step": 102 + }, + { + "epoch": 0.10980810234541578, + "grad_norm": 1.1434719823658013, + "learning_rate": 1.7569296375266526e-05, + "loss": 0.4992, + "step": 103 + }, + { + "epoch": 0.11087420042643924, + "grad_norm": 0.8704506872447706, + "learning_rate": 1.773987206823028e-05, + "loss": 0.4951, + "step": 104 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.897490279226691, + "learning_rate": 1.791044776119403e-05, + "loss": 0.4916, + "step": 105 + }, + { + "epoch": 0.11300639658848614, + "grad_norm": 1.484307049969091, + "learning_rate": 1.8081023454157784e-05, + "loss": 0.4943, + "step": 106 + }, + { + "epoch": 0.1140724946695096, + "grad_norm": 0.6421017701917927, + "learning_rate": 1.8251599147121536e-05, + "loss": 0.4906, + "step": 107 + }, + { + "epoch": 0.11513859275053305, + "grad_norm": 1.6387082014606487, + "learning_rate": 1.842217484008529e-05, + "loss": 0.4976, + "step": 108 + }, + { + "epoch": 0.1162046908315565, + "grad_norm": 0.7508434661676452, + "learning_rate": 1.859275053304904e-05, + "loss": 0.4893, + "step": 109 + }, + { + "epoch": 0.11727078891257996, + "grad_norm": 1.5878659815817608, + "learning_rate": 1.8763326226012793e-05, + "loss": 0.4936, + "step": 110 + }, + { + "epoch": 0.11833688699360341, + "grad_norm": 1.2700012445274647, + "learning_rate": 1.8933901918976546e-05, + "loss": 0.4976, + "step": 111 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 1.4468232793600246, + "learning_rate": 1.9104477611940298e-05, + "loss": 0.4916, + "step": 112 + }, + { + "epoch": 0.12046908315565032, + "grad_norm": 1.2073925718708678, + "learning_rate": 1.9275053304904054e-05, + "loss": 0.4924, + "step": 113 + }, + { + "epoch": 0.12153518123667377, + "grad_norm": 1.382482149756203, + "learning_rate": 1.9445628997867806e-05, + "loss": 0.495, + "step": 114 + }, + { + "epoch": 0.12260127931769722, + "grad_norm": 1.1681250872190037, + "learning_rate": 1.961620469083156e-05, + "loss": 0.4929, + "step": 115 + }, + { + "epoch": 0.12366737739872068, + "grad_norm": 1.1644844991590728, + "learning_rate": 1.978678038379531e-05, + "loss": 0.487, + "step": 116 + }, + { + "epoch": 0.12473347547974413, + "grad_norm": 1.1590131735118943, + "learning_rate": 1.9957356076759064e-05, + "loss": 0.491, + "step": 117 + }, + { + "epoch": 0.1257995735607676, + "grad_norm": 0.8845517269114772, + "learning_rate": 2.0127931769722816e-05, + "loss": 0.4899, + "step": 118 + }, + { + "epoch": 0.12686567164179105, + "grad_norm": 0.8720137285450001, + "learning_rate": 2.029850746268657e-05, + "loss": 0.4948, + "step": 119 + }, + { + "epoch": 0.1279317697228145, + "grad_norm": 0.8270825293355472, + "learning_rate": 2.046908315565032e-05, + "loss": 0.4873, + "step": 120 + }, + { + "epoch": 0.12899786780383796, + "grad_norm": 0.7112733086734145, + "learning_rate": 2.0639658848614073e-05, + "loss": 0.4852, + "step": 121 + }, + { + "epoch": 0.1300639658848614, + "grad_norm": 0.63087466396226, + "learning_rate": 2.0810234541577826e-05, + "loss": 0.4842, + "step": 122 + }, + { + "epoch": 0.13113006396588486, + "grad_norm": 0.6768967900200205, + "learning_rate": 2.098081023454158e-05, + "loss": 0.4892, + "step": 123 + }, + { + "epoch": 0.13219616204690832, + "grad_norm": 0.4441105086375026, + "learning_rate": 2.1151385927505334e-05, + "loss": 0.49, + "step": 124 + }, + { + "epoch": 0.13326226012793177, + "grad_norm": 0.7871223305057534, + "learning_rate": 2.1321961620469086e-05, + "loss": 0.4861, + "step": 125 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 0.7473471277752516, + "learning_rate": 2.149253731343284e-05, + "loss": 0.4859, + "step": 126 + }, + { + "epoch": 0.13539445628997868, + "grad_norm": 0.6860378579705178, + "learning_rate": 2.166311300639659e-05, + "loss": 0.483, + "step": 127 + }, + { + "epoch": 0.13646055437100213, + "grad_norm": 0.6877050783906112, + "learning_rate": 2.1833688699360344e-05, + "loss": 0.4778, + "step": 128 + }, + { + "epoch": 0.13752665245202558, + "grad_norm": 0.7802953077555831, + "learning_rate": 2.2004264392324096e-05, + "loss": 0.4841, + "step": 129 + }, + { + "epoch": 0.13859275053304904, + "grad_norm": 0.6105421403859265, + "learning_rate": 2.217484008528785e-05, + "loss": 0.4888, + "step": 130 + }, + { + "epoch": 0.1396588486140725, + "grad_norm": 0.8085739454604434, + "learning_rate": 2.23454157782516e-05, + "loss": 0.4844, + "step": 131 + }, + { + "epoch": 0.14072494669509594, + "grad_norm": 0.599568911512429, + "learning_rate": 2.2515991471215353e-05, + "loss": 0.4859, + "step": 132 + }, + { + "epoch": 0.1417910447761194, + "grad_norm": 0.7298411511687625, + "learning_rate": 2.2686567164179106e-05, + "loss": 0.4839, + "step": 133 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.5022968348693084, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.4799, + "step": 134 + }, + { + "epoch": 0.1439232409381663, + "grad_norm": 0.7782509559217928, + "learning_rate": 2.302771855010661e-05, + "loss": 0.4811, + "step": 135 + }, + { + "epoch": 0.14498933901918976, + "grad_norm": 0.8576912283821915, + "learning_rate": 2.3198294243070363e-05, + "loss": 0.4855, + "step": 136 + }, + { + "epoch": 0.1460554371002132, + "grad_norm": 0.75095345536952, + "learning_rate": 2.3368869936034115e-05, + "loss": 0.4831, + "step": 137 + }, + { + "epoch": 0.14712153518123666, + "grad_norm": 1.1021094209824167, + "learning_rate": 2.3539445628997868e-05, + "loss": 0.485, + "step": 138 + }, + { + "epoch": 0.14818763326226012, + "grad_norm": 1.3850416902760037, + "learning_rate": 2.371002132196162e-05, + "loss": 0.478, + "step": 139 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.5373300308852162, + "learning_rate": 2.3880597014925373e-05, + "loss": 0.4743, + "step": 140 + }, + { + "epoch": 0.15031982942430705, + "grad_norm": 1.2965983051499783, + "learning_rate": 2.405117270788913e-05, + "loss": 0.4877, + "step": 141 + }, + { + "epoch": 0.1513859275053305, + "grad_norm": 0.9983577194279637, + "learning_rate": 2.4221748400852884e-05, + "loss": 0.4814, + "step": 142 + }, + { + "epoch": 0.15245202558635396, + "grad_norm": 0.720371040224616, + "learning_rate": 2.4392324093816637e-05, + "loss": 0.4809, + "step": 143 + }, + { + "epoch": 0.1535181236673774, + "grad_norm": 0.8215017978389719, + "learning_rate": 2.456289978678039e-05, + "loss": 0.4751, + "step": 144 + }, + { + "epoch": 0.15458422174840086, + "grad_norm": 0.8563504324634484, + "learning_rate": 2.473347547974414e-05, + "loss": 0.4788, + "step": 145 + }, + { + "epoch": 0.15565031982942432, + "grad_norm": 1.0397801044745016, + "learning_rate": 2.4904051172707894e-05, + "loss": 0.4828, + "step": 146 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 0.8499495288588741, + "learning_rate": 2.5074626865671646e-05, + "loss": 0.4823, + "step": 147 + }, + { + "epoch": 0.15778251599147122, + "grad_norm": 0.7506494495443314, + "learning_rate": 2.52452025586354e-05, + "loss": 0.4719, + "step": 148 + }, + { + "epoch": 0.15884861407249468, + "grad_norm": 0.8572573387032592, + "learning_rate": 2.541577825159915e-05, + "loss": 0.4779, + "step": 149 + }, + { + "epoch": 0.15991471215351813, + "grad_norm": 0.8980484862912236, + "learning_rate": 2.5586353944562904e-05, + "loss": 0.4669, + "step": 150 + }, + { + "epoch": 0.16098081023454158, + "grad_norm": 1.3008931725410442, + "learning_rate": 2.5756929637526656e-05, + "loss": 0.4778, + "step": 151 + }, + { + "epoch": 0.16204690831556504, + "grad_norm": 0.5404519887326864, + "learning_rate": 2.592750533049041e-05, + "loss": 0.4731, + "step": 152 + }, + { + "epoch": 0.1631130063965885, + "grad_norm": 1.0716812716539528, + "learning_rate": 2.609808102345416e-05, + "loss": 0.4798, + "step": 153 + }, + { + "epoch": 0.16417910447761194, + "grad_norm": 1.2709697777670215, + "learning_rate": 2.6268656716417913e-05, + "loss": 0.4871, + "step": 154 + }, + { + "epoch": 0.1652452025586354, + "grad_norm": 0.537320261984439, + "learning_rate": 2.6439232409381666e-05, + "loss": 0.4749, + "step": 155 + }, + { + "epoch": 0.16631130063965885, + "grad_norm": 1.1514684368533086, + "learning_rate": 2.6609808102345418e-05, + "loss": 0.4753, + "step": 156 + }, + { + "epoch": 0.1673773987206823, + "grad_norm": 0.9432669540338412, + "learning_rate": 2.678038379530917e-05, + "loss": 0.4807, + "step": 157 + }, + { + "epoch": 0.16844349680170576, + "grad_norm": 0.642515332753071, + "learning_rate": 2.6950959488272923e-05, + "loss": 0.477, + "step": 158 + }, + { + "epoch": 0.1695095948827292, + "grad_norm": 0.8211012768546435, + "learning_rate": 2.7121535181236675e-05, + "loss": 0.4766, + "step": 159 + }, + { + "epoch": 0.17057569296375266, + "grad_norm": 0.8776070226460816, + "learning_rate": 2.7292110874200428e-05, + "loss": 0.4717, + "step": 160 + }, + { + "epoch": 0.17164179104477612, + "grad_norm": 0.7718966074497755, + "learning_rate": 2.746268656716418e-05, + "loss": 0.4687, + "step": 161 + }, + { + "epoch": 0.17270788912579957, + "grad_norm": 0.7779381456798092, + "learning_rate": 2.7633262260127933e-05, + "loss": 0.4679, + "step": 162 + }, + { + "epoch": 0.17377398720682302, + "grad_norm": 0.5901156093872161, + "learning_rate": 2.7803837953091685e-05, + "loss": 0.4751, + "step": 163 + }, + { + "epoch": 0.17484008528784648, + "grad_norm": 0.7154596410625235, + "learning_rate": 2.7974413646055437e-05, + "loss": 0.4728, + "step": 164 + }, + { + "epoch": 0.17590618336886993, + "grad_norm": 0.7000348159687018, + "learning_rate": 2.814498933901919e-05, + "loss": 0.4725, + "step": 165 + }, + { + "epoch": 0.17697228144989338, + "grad_norm": 0.7657735939109871, + "learning_rate": 2.8315565031982942e-05, + "loss": 0.4784, + "step": 166 + }, + { + "epoch": 0.17803837953091683, + "grad_norm": 0.8475989590871966, + "learning_rate": 2.8486140724946695e-05, + "loss": 0.4711, + "step": 167 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 1.0922356892583558, + "learning_rate": 2.8656716417910447e-05, + "loss": 0.478, + "step": 168 + }, + { + "epoch": 0.18017057569296374, + "grad_norm": 0.970068377938896, + "learning_rate": 2.88272921108742e-05, + "loss": 0.4653, + "step": 169 + }, + { + "epoch": 0.1812366737739872, + "grad_norm": 0.7059217860390857, + "learning_rate": 2.8997867803837952e-05, + "loss": 0.474, + "step": 170 + }, + { + "epoch": 0.18230277185501065, + "grad_norm": 1.0368427776415303, + "learning_rate": 2.9168443496801708e-05, + "loss": 0.4745, + "step": 171 + }, + { + "epoch": 0.18336886993603413, + "grad_norm": 1.1739249199645363, + "learning_rate": 2.9339019189765464e-05, + "loss": 0.4713, + "step": 172 + }, + { + "epoch": 0.18443496801705758, + "grad_norm": 0.772697311844866, + "learning_rate": 2.9509594882729216e-05, + "loss": 0.4718, + "step": 173 + }, + { + "epoch": 0.18550106609808104, + "grad_norm": 0.8330939252501977, + "learning_rate": 2.968017057569297e-05, + "loss": 0.4722, + "step": 174 + }, + { + "epoch": 0.1865671641791045, + "grad_norm": 0.831387960842226, + "learning_rate": 2.985074626865672e-05, + "loss": 0.472, + "step": 175 + }, + { + "epoch": 0.18763326226012794, + "grad_norm": 0.7976609685235961, + "learning_rate": 3.0021321961620473e-05, + "loss": 0.47, + "step": 176 + }, + { + "epoch": 0.1886993603411514, + "grad_norm": 0.8823919254760567, + "learning_rate": 3.0191897654584226e-05, + "loss": 0.4759, + "step": 177 + }, + { + "epoch": 0.18976545842217485, + "grad_norm": 0.8804820784317922, + "learning_rate": 3.0362473347547978e-05, + "loss": 0.4659, + "step": 178 + }, + { + "epoch": 0.1908315565031983, + "grad_norm": 0.9681666262840064, + "learning_rate": 3.053304904051173e-05, + "loss": 0.4729, + "step": 179 + }, + { + "epoch": 0.19189765458422176, + "grad_norm": 1.1048871958759772, + "learning_rate": 3.070362473347548e-05, + "loss": 0.4665, + "step": 180 + }, + { + "epoch": 0.1929637526652452, + "grad_norm": 0.8870408442356555, + "learning_rate": 3.0874200426439235e-05, + "loss": 0.4745, + "step": 181 + }, + { + "epoch": 0.19402985074626866, + "grad_norm": 0.9631165384838795, + "learning_rate": 3.104477611940299e-05, + "loss": 0.4698, + "step": 182 + }, + { + "epoch": 0.19509594882729211, + "grad_norm": 0.9385999582540799, + "learning_rate": 3.121535181236674e-05, + "loss": 0.4705, + "step": 183 + }, + { + "epoch": 0.19616204690831557, + "grad_norm": 0.7939219655274056, + "learning_rate": 3.138592750533049e-05, + "loss": 0.4732, + "step": 184 + }, + { + "epoch": 0.19722814498933902, + "grad_norm": 0.736357203214717, + "learning_rate": 3.1556503198294245e-05, + "loss": 0.4746, + "step": 185 + }, + { + "epoch": 0.19829424307036247, + "grad_norm": 0.8037466041799969, + "learning_rate": 3.1727078891258e-05, + "loss": 0.4636, + "step": 186 + }, + { + "epoch": 0.19936034115138593, + "grad_norm": 1.0193532786616932, + "learning_rate": 3.189765458422175e-05, + "loss": 0.4721, + "step": 187 + }, + { + "epoch": 0.20042643923240938, + "grad_norm": 1.4921054947249843, + "learning_rate": 3.20682302771855e-05, + "loss": 0.4718, + "step": 188 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 0.7621564304858446, + "learning_rate": 3.2238805970149255e-05, + "loss": 0.4653, + "step": 189 + }, + { + "epoch": 0.2025586353944563, + "grad_norm": 1.070754972082962, + "learning_rate": 3.240938166311301e-05, + "loss": 0.4654, + "step": 190 + }, + { + "epoch": 0.20362473347547974, + "grad_norm": 1.5652147505824308, + "learning_rate": 3.257995735607676e-05, + "loss": 0.472, + "step": 191 + }, + { + "epoch": 0.2046908315565032, + "grad_norm": 0.869980401463994, + "learning_rate": 3.275053304904051e-05, + "loss": 0.4646, + "step": 192 + }, + { + "epoch": 0.20575692963752665, + "grad_norm": 1.4005482830515732, + "learning_rate": 3.2921108742004264e-05, + "loss": 0.4697, + "step": 193 + }, + { + "epoch": 0.2068230277185501, + "grad_norm": 1.2810330542613626, + "learning_rate": 3.309168443496802e-05, + "loss": 0.4631, + "step": 194 + }, + { + "epoch": 0.20788912579957355, + "grad_norm": 1.036250299110791, + "learning_rate": 3.326226012793177e-05, + "loss": 0.4757, + "step": 195 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 0.8072040874296945, + "learning_rate": 3.343283582089552e-05, + "loss": 0.4629, + "step": 196 + }, + { + "epoch": 0.21002132196162046, + "grad_norm": 1.4196573201874585, + "learning_rate": 3.3603411513859274e-05, + "loss": 0.4684, + "step": 197 + }, + { + "epoch": 0.21108742004264391, + "grad_norm": 0.6191497943271066, + "learning_rate": 3.3773987206823026e-05, + "loss": 0.4631, + "step": 198 + }, + { + "epoch": 0.21215351812366737, + "grad_norm": 1.2158443187059038, + "learning_rate": 3.394456289978678e-05, + "loss": 0.4671, + "step": 199 + }, + { + "epoch": 0.21321961620469082, + "grad_norm": 0.8129012874793476, + "learning_rate": 3.411513859275053e-05, + "loss": 0.4618, + "step": 200 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.0518786895781704, + "learning_rate": 3.4285714285714284e-05, + "loss": 0.4697, + "step": 201 + }, + { + "epoch": 0.21535181236673773, + "grad_norm": 0.9688679826055264, + "learning_rate": 3.4456289978678036e-05, + "loss": 0.4683, + "step": 202 + }, + { + "epoch": 0.21641791044776118, + "grad_norm": 1.3724882136741294, + "learning_rate": 3.462686567164179e-05, + "loss": 0.4718, + "step": 203 + }, + { + "epoch": 0.21748400852878466, + "grad_norm": 0.5641264907256553, + "learning_rate": 3.479744136460555e-05, + "loss": 0.4551, + "step": 204 + }, + { + "epoch": 0.21855010660980811, + "grad_norm": 1.0701546739161645, + "learning_rate": 3.49680170575693e-05, + "loss": 0.4612, + "step": 205 + }, + { + "epoch": 0.21961620469083157, + "grad_norm": 1.049942985757254, + "learning_rate": 3.513859275053305e-05, + "loss": 0.4696, + "step": 206 + }, + { + "epoch": 0.22068230277185502, + "grad_norm": 1.1283220333372375, + "learning_rate": 3.5309168443496805e-05, + "loss": 0.4632, + "step": 207 + }, + { + "epoch": 0.22174840085287847, + "grad_norm": 0.9308516582288439, + "learning_rate": 3.547974413646056e-05, + "loss": 0.4661, + "step": 208 + }, + { + "epoch": 0.22281449893390193, + "grad_norm": 1.1185548874871707, + "learning_rate": 3.565031982942431e-05, + "loss": 0.4641, + "step": 209 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.919634938330087, + "learning_rate": 3.582089552238806e-05, + "loss": 0.4707, + "step": 210 + }, + { + "epoch": 0.22494669509594883, + "grad_norm": 1.4776402736329493, + "learning_rate": 3.5991471215351815e-05, + "loss": 0.4652, + "step": 211 + }, + { + "epoch": 0.2260127931769723, + "grad_norm": 0.7167195593546435, + "learning_rate": 3.616204690831557e-05, + "loss": 0.4595, + "step": 212 + }, + { + "epoch": 0.22707889125799574, + "grad_norm": 1.2140834565384866, + "learning_rate": 3.633262260127932e-05, + "loss": 0.4711, + "step": 213 + }, + { + "epoch": 0.2281449893390192, + "grad_norm": 1.0029544450885874, + "learning_rate": 3.650319829424307e-05, + "loss": 0.4596, + "step": 214 + }, + { + "epoch": 0.22921108742004265, + "grad_norm": 1.5043815628272939, + "learning_rate": 3.6673773987206824e-05, + "loss": 0.4638, + "step": 215 + }, + { + "epoch": 0.2302771855010661, + "grad_norm": 0.8569767572345512, + "learning_rate": 3.684434968017058e-05, + "loss": 0.4598, + "step": 216 + }, + { + "epoch": 0.23134328358208955, + "grad_norm": 1.3925636332034894, + "learning_rate": 3.701492537313433e-05, + "loss": 0.4604, + "step": 217 + }, + { + "epoch": 0.232409381663113, + "grad_norm": 1.1816721080629815, + "learning_rate": 3.718550106609808e-05, + "loss": 0.4518, + "step": 218 + }, + { + "epoch": 0.23347547974413646, + "grad_norm": 0.8651413005301738, + "learning_rate": 3.7356076759061834e-05, + "loss": 0.465, + "step": 219 + }, + { + "epoch": 0.2345415778251599, + "grad_norm": 1.1820130745722497, + "learning_rate": 3.7526652452025586e-05, + "loss": 0.4637, + "step": 220 + }, + { + "epoch": 0.23560767590618337, + "grad_norm": 0.8427968342892499, + "learning_rate": 3.769722814498934e-05, + "loss": 0.4617, + "step": 221 + }, + { + "epoch": 0.23667377398720682, + "grad_norm": 1.11073913884723, + "learning_rate": 3.786780383795309e-05, + "loss": 0.4655, + "step": 222 + }, + { + "epoch": 0.23773987206823027, + "grad_norm": 0.9189934131123165, + "learning_rate": 3.8038379530916844e-05, + "loss": 0.4766, + "step": 223 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 1.1535170456681654, + "learning_rate": 3.8208955223880596e-05, + "loss": 0.4625, + "step": 224 + }, + { + "epoch": 0.23987206823027718, + "grad_norm": 0.9374181623692117, + "learning_rate": 3.8379530916844355e-05, + "loss": 0.4669, + "step": 225 + }, + { + "epoch": 0.24093816631130063, + "grad_norm": 1.0563319134423954, + "learning_rate": 3.855010660980811e-05, + "loss": 0.4611, + "step": 226 + }, + { + "epoch": 0.2420042643923241, + "grad_norm": 1.168785335326744, + "learning_rate": 3.872068230277186e-05, + "loss": 0.4634, + "step": 227 + }, + { + "epoch": 0.24307036247334754, + "grad_norm": 1.0248065818014773, + "learning_rate": 3.889125799573561e-05, + "loss": 0.4593, + "step": 228 + }, + { + "epoch": 0.244136460554371, + "grad_norm": 0.8637465871329612, + "learning_rate": 3.9061833688699365e-05, + "loss": 0.4632, + "step": 229 + }, + { + "epoch": 0.24520255863539445, + "grad_norm": 0.8460415005099113, + "learning_rate": 3.923240938166312e-05, + "loss": 0.4624, + "step": 230 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 0.8955587399970688, + "learning_rate": 3.940298507462687e-05, + "loss": 0.457, + "step": 231 + }, + { + "epoch": 0.24733475479744135, + "grad_norm": 1.1657262128526866, + "learning_rate": 3.957356076759062e-05, + "loss": 0.4645, + "step": 232 + }, + { + "epoch": 0.2484008528784648, + "grad_norm": 1.0166441196536635, + "learning_rate": 3.9744136460554375e-05, + "loss": 0.4634, + "step": 233 + }, + { + "epoch": 0.24946695095948826, + "grad_norm": 0.9826615671115314, + "learning_rate": 3.991471215351813e-05, + "loss": 0.4614, + "step": 234 + }, + { + "epoch": 0.2505330490405117, + "grad_norm": 1.0599689795674345, + "learning_rate": 4.008528784648188e-05, + "loss": 0.4731, + "step": 235 + }, + { + "epoch": 0.2515991471215352, + "grad_norm": 0.9093543668098233, + "learning_rate": 4.025586353944563e-05, + "loss": 0.4566, + "step": 236 + }, + { + "epoch": 0.2526652452025586, + "grad_norm": 1.013585300158083, + "learning_rate": 4.0426439232409384e-05, + "loss": 0.468, + "step": 237 + }, + { + "epoch": 0.2537313432835821, + "grad_norm": 1.036723508797436, + "learning_rate": 4.059701492537314e-05, + "loss": 0.4584, + "step": 238 + }, + { + "epoch": 0.2547974413646055, + "grad_norm": 1.316129392280748, + "learning_rate": 4.076759061833689e-05, + "loss": 0.4675, + "step": 239 + }, + { + "epoch": 0.255863539445629, + "grad_norm": 1.0896330595803854, + "learning_rate": 4.093816631130064e-05, + "loss": 0.4623, + "step": 240 + }, + { + "epoch": 0.25692963752665243, + "grad_norm": 1.1510385991266068, + "learning_rate": 4.1108742004264394e-05, + "loss": 0.4649, + "step": 241 + }, + { + "epoch": 0.2579957356076759, + "grad_norm": 0.8229957414542415, + "learning_rate": 4.1279317697228146e-05, + "loss": 0.4584, + "step": 242 + }, + { + "epoch": 0.25906183368869934, + "grad_norm": 0.9568171883082784, + "learning_rate": 4.14498933901919e-05, + "loss": 0.4628, + "step": 243 + }, + { + "epoch": 0.2601279317697228, + "grad_norm": 0.9570238151905792, + "learning_rate": 4.162046908315565e-05, + "loss": 0.4544, + "step": 244 + }, + { + "epoch": 0.26119402985074625, + "grad_norm": 1.1790086045153048, + "learning_rate": 4.1791044776119404e-05, + "loss": 0.4565, + "step": 245 + }, + { + "epoch": 0.2622601279317697, + "grad_norm": 0.772255483862789, + "learning_rate": 4.196162046908316e-05, + "loss": 0.4575, + "step": 246 + }, + { + "epoch": 0.26332622601279315, + "grad_norm": 1.1363053470021465, + "learning_rate": 4.213219616204691e-05, + "loss": 0.4612, + "step": 247 + }, + { + "epoch": 0.26439232409381663, + "grad_norm": 0.7565375099785713, + "learning_rate": 4.230277185501067e-05, + "loss": 0.456, + "step": 248 + }, + { + "epoch": 0.26545842217484006, + "grad_norm": 0.6892425990836365, + "learning_rate": 4.247334754797441e-05, + "loss": 0.4559, + "step": 249 + }, + { + "epoch": 0.26652452025586354, + "grad_norm": 0.7417047962613619, + "learning_rate": 4.264392324093817e-05, + "loss": 0.4508, + "step": 250 + }, + { + "epoch": 0.267590618336887, + "grad_norm": 0.7470072441966399, + "learning_rate": 4.2814498933901925e-05, + "loss": 0.461, + "step": 251 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 0.9617786110698198, + "learning_rate": 4.298507462686568e-05, + "loss": 0.4513, + "step": 252 + }, + { + "epoch": 0.2697228144989339, + "grad_norm": 1.327440065045842, + "learning_rate": 4.315565031982943e-05, + "loss": 0.4638, + "step": 253 + }, + { + "epoch": 0.27078891257995735, + "grad_norm": 0.7442017560708861, + "learning_rate": 4.332622601279318e-05, + "loss": 0.4557, + "step": 254 + }, + { + "epoch": 0.27185501066098083, + "grad_norm": 0.7202532764515195, + "learning_rate": 4.3496801705756935e-05, + "loss": 0.4543, + "step": 255 + }, + { + "epoch": 0.27292110874200426, + "grad_norm": 0.8784340624592276, + "learning_rate": 4.366737739872069e-05, + "loss": 0.4579, + "step": 256 + }, + { + "epoch": 0.27398720682302774, + "grad_norm": 0.8848099666534693, + "learning_rate": 4.383795309168444e-05, + "loss": 0.4527, + "step": 257 + }, + { + "epoch": 0.27505330490405117, + "grad_norm": 0.7690402407029407, + "learning_rate": 4.400852878464819e-05, + "loss": 0.4576, + "step": 258 + }, + { + "epoch": 0.27611940298507465, + "grad_norm": 0.828823436313651, + "learning_rate": 4.4179104477611944e-05, + "loss": 0.4601, + "step": 259 + }, + { + "epoch": 0.2771855010660981, + "grad_norm": 1.186866608247454, + "learning_rate": 4.43496801705757e-05, + "loss": 0.4606, + "step": 260 + }, + { + "epoch": 0.27825159914712155, + "grad_norm": 0.9191614369421419, + "learning_rate": 4.452025586353945e-05, + "loss": 0.457, + "step": 261 + }, + { + "epoch": 0.279317697228145, + "grad_norm": 0.9275759467424034, + "learning_rate": 4.46908315565032e-05, + "loss": 0.4531, + "step": 262 + }, + { + "epoch": 0.28038379530916846, + "grad_norm": 1.3018942692140634, + "learning_rate": 4.4861407249466954e-05, + "loss": 0.4519, + "step": 263 + }, + { + "epoch": 0.2814498933901919, + "grad_norm": 1.2828905543825992, + "learning_rate": 4.5031982942430706e-05, + "loss": 0.4517, + "step": 264 + }, + { + "epoch": 0.28251599147121537, + "grad_norm": 0.8444164365212923, + "learning_rate": 4.5202558635394466e-05, + "loss": 0.4579, + "step": 265 + }, + { + "epoch": 0.2835820895522388, + "grad_norm": 0.7390719742847955, + "learning_rate": 4.537313432835821e-05, + "loss": 0.4564, + "step": 266 + }, + { + "epoch": 0.2846481876332623, + "grad_norm": 0.804814881156404, + "learning_rate": 4.554371002132197e-05, + "loss": 0.4516, + "step": 267 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.232886488524498, + "learning_rate": 4.5714285714285716e-05, + "loss": 0.4552, + "step": 268 + }, + { + "epoch": 0.2867803837953092, + "grad_norm": 1.1058003798103109, + "learning_rate": 4.5884861407249475e-05, + "loss": 0.4531, + "step": 269 + }, + { + "epoch": 0.2878464818763326, + "grad_norm": 1.1891457338823357, + "learning_rate": 4.605543710021322e-05, + "loss": 0.4511, + "step": 270 + }, + { + "epoch": 0.2889125799573561, + "grad_norm": 1.0630830374386846, + "learning_rate": 4.622601279317698e-05, + "loss": 0.4506, + "step": 271 + }, + { + "epoch": 0.2899786780383795, + "grad_norm": 0.8558226373887493, + "learning_rate": 4.6396588486140726e-05, + "loss": 0.4533, + "step": 272 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 0.8431544624031982, + "learning_rate": 4.6567164179104485e-05, + "loss": 0.4489, + "step": 273 + }, + { + "epoch": 0.2921108742004264, + "grad_norm": 1.0780225196774107, + "learning_rate": 4.673773987206823e-05, + "loss": 0.4535, + "step": 274 + }, + { + "epoch": 0.2931769722814499, + "grad_norm": 1.040484818452518, + "learning_rate": 4.690831556503199e-05, + "loss": 0.4526, + "step": 275 + }, + { + "epoch": 0.2942430703624733, + "grad_norm": 1.4317419430672074, + "learning_rate": 4.7078891257995735e-05, + "loss": 0.4491, + "step": 276 + }, + { + "epoch": 0.2953091684434968, + "grad_norm": 0.7350004141525754, + "learning_rate": 4.7249466950959495e-05, + "loss": 0.4508, + "step": 277 + }, + { + "epoch": 0.29637526652452023, + "grad_norm": 1.0597151973403933, + "learning_rate": 4.742004264392324e-05, + "loss": 0.4536, + "step": 278 + }, + { + "epoch": 0.2974413646055437, + "grad_norm": 1.7726271804234848, + "learning_rate": 4.7590618336887e-05, + "loss": 0.4533, + "step": 279 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.7058843328989329, + "learning_rate": 4.7761194029850745e-05, + "loss": 0.4546, + "step": 280 + }, + { + "epoch": 0.2995735607675906, + "grad_norm": 2.0525677674617597, + "learning_rate": 4.7931769722814504e-05, + "loss": 0.4591, + "step": 281 + }, + { + "epoch": 0.3006396588486141, + "grad_norm": 1.0197143782821263, + "learning_rate": 4.810234541577826e-05, + "loss": 0.4539, + "step": 282 + }, + { + "epoch": 0.3017057569296375, + "grad_norm": 2.682566079537771, + "learning_rate": 4.827292110874201e-05, + "loss": 0.4696, + "step": 283 + }, + { + "epoch": 0.302771855010661, + "grad_norm": 2.7347833899867395, + "learning_rate": 4.844349680170577e-05, + "loss": 0.4854, + "step": 284 + }, + { + "epoch": 0.30383795309168443, + "grad_norm": 1.4147621114370599, + "learning_rate": 4.8614072494669514e-05, + "loss": 0.4613, + "step": 285 + }, + { + "epoch": 0.3049040511727079, + "grad_norm": 1.4281612301500841, + "learning_rate": 4.878464818763327e-05, + "loss": 0.4609, + "step": 286 + }, + { + "epoch": 0.30597014925373134, + "grad_norm": 1.0989723638165922, + "learning_rate": 4.895522388059702e-05, + "loss": 0.4627, + "step": 287 + }, + { + "epoch": 0.3070362473347548, + "grad_norm": 1.025429892081604, + "learning_rate": 4.912579957356078e-05, + "loss": 0.4516, + "step": 288 + }, + { + "epoch": 0.30810234541577824, + "grad_norm": 1.1574096426201355, + "learning_rate": 4.9296375266524524e-05, + "loss": 0.463, + "step": 289 + }, + { + "epoch": 0.3091684434968017, + "grad_norm": 0.9660522145703263, + "learning_rate": 4.946695095948828e-05, + "loss": 0.4551, + "step": 290 + }, + { + "epoch": 0.31023454157782515, + "grad_norm": 0.7154708715646663, + "learning_rate": 4.963752665245203e-05, + "loss": 0.4592, + "step": 291 + }, + { + "epoch": 0.31130063965884863, + "grad_norm": 0.6930388295609867, + "learning_rate": 4.980810234541579e-05, + "loss": 0.4541, + "step": 292 + }, + { + "epoch": 0.31236673773987206, + "grad_norm": 0.9114648684646235, + "learning_rate": 4.997867803837953e-05, + "loss": 0.4574, + "step": 293 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 1.245129676815517, + "learning_rate": 5.014925373134329e-05, + "loss": 0.4542, + "step": 294 + }, + { + "epoch": 0.31449893390191896, + "grad_norm": 0.9624168019988285, + "learning_rate": 5.031982942430704e-05, + "loss": 0.4536, + "step": 295 + }, + { + "epoch": 0.31556503198294245, + "grad_norm": 0.8009138019519386, + "learning_rate": 5.04904051172708e-05, + "loss": 0.4512, + "step": 296 + }, + { + "epoch": 0.31663113006396587, + "grad_norm": 0.6547749384104303, + "learning_rate": 5.066098081023454e-05, + "loss": 0.4519, + "step": 297 + }, + { + "epoch": 0.31769722814498935, + "grad_norm": 0.83857949506653, + "learning_rate": 5.08315565031983e-05, + "loss": 0.4513, + "step": 298 + }, + { + "epoch": 0.3187633262260128, + "grad_norm": 0.934818630813169, + "learning_rate": 5.100213219616205e-05, + "loss": 0.4547, + "step": 299 + }, + { + "epoch": 0.31982942430703626, + "grad_norm": 1.1298040583791609, + "learning_rate": 5.117270788912581e-05, + "loss": 0.4573, + "step": 300 + }, + { + "epoch": 0.3208955223880597, + "grad_norm": 1.1242753661227198, + "learning_rate": 5.134328358208955e-05, + "loss": 0.4555, + "step": 301 + }, + { + "epoch": 0.32196162046908317, + "grad_norm": 0.8386711391896823, + "learning_rate": 5.151385927505331e-05, + "loss": 0.4454, + "step": 302 + }, + { + "epoch": 0.3230277185501066, + "grad_norm": 1.0644491026122258, + "learning_rate": 5.168443496801706e-05, + "loss": 0.4544, + "step": 303 + }, + { + "epoch": 0.32409381663113007, + "grad_norm": 1.0964593961058677, + "learning_rate": 5.185501066098082e-05, + "loss": 0.4556, + "step": 304 + }, + { + "epoch": 0.3251599147121535, + "grad_norm": 0.9831083339112521, + "learning_rate": 5.202558635394456e-05, + "loss": 0.4467, + "step": 305 + }, + { + "epoch": 0.326226012793177, + "grad_norm": 0.9900349752767852, + "learning_rate": 5.219616204690832e-05, + "loss": 0.4516, + "step": 306 + }, + { + "epoch": 0.3272921108742004, + "grad_norm": 1.0598608792836142, + "learning_rate": 5.236673773987207e-05, + "loss": 0.4498, + "step": 307 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.9889355725212928, + "learning_rate": 5.2537313432835826e-05, + "loss": 0.4514, + "step": 308 + }, + { + "epoch": 0.3294243070362473, + "grad_norm": 0.9865582250975005, + "learning_rate": 5.270788912579957e-05, + "loss": 0.4513, + "step": 309 + }, + { + "epoch": 0.3304904051172708, + "grad_norm": 1.252376286795114, + "learning_rate": 5.287846481876333e-05, + "loss": 0.4523, + "step": 310 + }, + { + "epoch": 0.3315565031982942, + "grad_norm": 0.6493116364566459, + "learning_rate": 5.304904051172708e-05, + "loss": 0.445, + "step": 311 + }, + { + "epoch": 0.3326226012793177, + "grad_norm": 1.0832555318476744, + "learning_rate": 5.3219616204690836e-05, + "loss": 0.4551, + "step": 312 + }, + { + "epoch": 0.3336886993603412, + "grad_norm": 1.0891590791019572, + "learning_rate": 5.3390191897654595e-05, + "loss": 0.4501, + "step": 313 + }, + { + "epoch": 0.3347547974413646, + "grad_norm": 0.5965442300858297, + "learning_rate": 5.356076759061834e-05, + "loss": 0.4433, + "step": 314 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.9796043883687924, + "learning_rate": 5.37313432835821e-05, + "loss": 0.4517, + "step": 315 + }, + { + "epoch": 0.3368869936034115, + "grad_norm": 0.993909895121157, + "learning_rate": 5.3901918976545846e-05, + "loss": 0.452, + "step": 316 + }, + { + "epoch": 0.337953091684435, + "grad_norm": 0.6471597791846867, + "learning_rate": 5.4072494669509605e-05, + "loss": 0.4489, + "step": 317 + }, + { + "epoch": 0.3390191897654584, + "grad_norm": 1.200049501099017, + "learning_rate": 5.424307036247335e-05, + "loss": 0.4519, + "step": 318 + }, + { + "epoch": 0.3400852878464819, + "grad_norm": 0.8547891891381943, + "learning_rate": 5.441364605543711e-05, + "loss": 0.4491, + "step": 319 + }, + { + "epoch": 0.3411513859275053, + "grad_norm": 0.8872211588826856, + "learning_rate": 5.4584221748400855e-05, + "loss": 0.4522, + "step": 320 + }, + { + "epoch": 0.3422174840085288, + "grad_norm": 1.114364988812215, + "learning_rate": 5.4754797441364615e-05, + "loss": 0.4503, + "step": 321 + }, + { + "epoch": 0.34328358208955223, + "grad_norm": 0.9431484820101103, + "learning_rate": 5.492537313432836e-05, + "loss": 0.4433, + "step": 322 + }, + { + "epoch": 0.3443496801705757, + "grad_norm": 1.0650041065819467, + "learning_rate": 5.509594882729212e-05, + "loss": 0.4484, + "step": 323 + }, + { + "epoch": 0.34541577825159914, + "grad_norm": 1.1014629307818748, + "learning_rate": 5.5266524520255865e-05, + "loss": 0.4439, + "step": 324 + }, + { + "epoch": 0.3464818763326226, + "grad_norm": 1.3284550365482601, + "learning_rate": 5.5437100213219624e-05, + "loss": 0.4489, + "step": 325 + }, + { + "epoch": 0.34754797441364604, + "grad_norm": 0.6740188587682626, + "learning_rate": 5.560767590618337e-05, + "loss": 0.4461, + "step": 326 + }, + { + "epoch": 0.3486140724946695, + "grad_norm": 0.8535552014500319, + "learning_rate": 5.577825159914713e-05, + "loss": 0.4469, + "step": 327 + }, + { + "epoch": 0.34968017057569295, + "grad_norm": 0.9344951335512115, + "learning_rate": 5.5948827292110875e-05, + "loss": 0.4403, + "step": 328 + }, + { + "epoch": 0.35074626865671643, + "grad_norm": 1.6493682506419003, + "learning_rate": 5.6119402985074634e-05, + "loss": 0.4471, + "step": 329 + }, + { + "epoch": 0.35181236673773986, + "grad_norm": 0.5980828549596681, + "learning_rate": 5.628997867803838e-05, + "loss": 0.4407, + "step": 330 + }, + { + "epoch": 0.35287846481876334, + "grad_norm": 1.3993066518146489, + "learning_rate": 5.646055437100214e-05, + "loss": 0.4463, + "step": 331 + }, + { + "epoch": 0.35394456289978676, + "grad_norm": 1.364713422133084, + "learning_rate": 5.6631130063965884e-05, + "loss": 0.4515, + "step": 332 + }, + { + "epoch": 0.35501066098081024, + "grad_norm": 0.6912706432665139, + "learning_rate": 5.6801705756929644e-05, + "loss": 0.4495, + "step": 333 + }, + { + "epoch": 0.35607675906183367, + "grad_norm": 1.0070213135643369, + "learning_rate": 5.697228144989339e-05, + "loss": 0.4444, + "step": 334 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 1.60072329010338, + "learning_rate": 5.714285714285715e-05, + "loss": 0.4491, + "step": 335 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.7418596944182502, + "learning_rate": 5.7313432835820894e-05, + "loss": 0.4426, + "step": 336 + }, + { + "epoch": 0.35927505330490406, + "grad_norm": 1.5682849618752128, + "learning_rate": 5.748400852878465e-05, + "loss": 0.4557, + "step": 337 + }, + { + "epoch": 0.3603411513859275, + "grad_norm": 0.9757128890719955, + "learning_rate": 5.76545842217484e-05, + "loss": 0.4434, + "step": 338 + }, + { + "epoch": 0.36140724946695096, + "grad_norm": 1.023456270027187, + "learning_rate": 5.782515991471216e-05, + "loss": 0.4493, + "step": 339 + }, + { + "epoch": 0.3624733475479744, + "grad_norm": 1.1069512289563967, + "learning_rate": 5.7995735607675904e-05, + "loss": 0.4487, + "step": 340 + }, + { + "epoch": 0.36353944562899787, + "grad_norm": 1.2762171846197148, + "learning_rate": 5.816631130063966e-05, + "loss": 0.4494, + "step": 341 + }, + { + "epoch": 0.3646055437100213, + "grad_norm": 0.9563297488524058, + "learning_rate": 5.8336886993603415e-05, + "loss": 0.4538, + "step": 342 + }, + { + "epoch": 0.3656716417910448, + "grad_norm": 0.8844708150282824, + "learning_rate": 5.850746268656717e-05, + "loss": 0.4599, + "step": 343 + }, + { + "epoch": 0.36673773987206826, + "grad_norm": 0.9248540817564378, + "learning_rate": 5.867803837953093e-05, + "loss": 0.4532, + "step": 344 + }, + { + "epoch": 0.3678038379530917, + "grad_norm": 0.897676935037186, + "learning_rate": 5.884861407249467e-05, + "loss": 0.4424, + "step": 345 + }, + { + "epoch": 0.36886993603411516, + "grad_norm": 1.3371410250762796, + "learning_rate": 5.901918976545843e-05, + "loss": 0.4528, + "step": 346 + }, + { + "epoch": 0.3699360341151386, + "grad_norm": 0.710536412391709, + "learning_rate": 5.918976545842218e-05, + "loss": 0.4496, + "step": 347 + }, + { + "epoch": 0.37100213219616207, + "grad_norm": 0.9252679189981835, + "learning_rate": 5.936034115138594e-05, + "loss": 0.4512, + "step": 348 + }, + { + "epoch": 0.3720682302771855, + "grad_norm": 1.2517055018731635, + "learning_rate": 5.953091684434968e-05, + "loss": 0.4473, + "step": 349 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.9298497655115758, + "learning_rate": 5.970149253731344e-05, + "loss": 0.4411, + "step": 350 + }, + { + "epoch": 0.3742004264392324, + "grad_norm": 0.7999639695039473, + "learning_rate": 5.987206823027719e-05, + "loss": 0.4525, + "step": 351 + }, + { + "epoch": 0.3752665245202559, + "grad_norm": 0.9596155583258207, + "learning_rate": 6.0042643923240946e-05, + "loss": 0.4472, + "step": 352 + }, + { + "epoch": 0.3763326226012793, + "grad_norm": 1.4841986378845615, + "learning_rate": 6.021321961620469e-05, + "loss": 0.453, + "step": 353 + }, + { + "epoch": 0.3773987206823028, + "grad_norm": 0.704856727131918, + "learning_rate": 6.038379530916845e-05, + "loss": 0.4418, + "step": 354 + }, + { + "epoch": 0.3784648187633262, + "grad_norm": 1.1708028615880235, + "learning_rate": 6.05543710021322e-05, + "loss": 0.4483, + "step": 355 + }, + { + "epoch": 0.3795309168443497, + "grad_norm": 1.6336721778285583, + "learning_rate": 6.0724946695095956e-05, + "loss": 0.4523, + "step": 356 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 0.7634972844271057, + "learning_rate": 6.08955223880597e-05, + "loss": 0.438, + "step": 357 + }, + { + "epoch": 0.3816631130063966, + "grad_norm": 2.0551249748572276, + "learning_rate": 6.106609808102346e-05, + "loss": 0.4465, + "step": 358 + }, + { + "epoch": 0.38272921108742003, + "grad_norm": 1.1247068935515665, + "learning_rate": 6.123667377398721e-05, + "loss": 0.4528, + "step": 359 + }, + { + "epoch": 0.3837953091684435, + "grad_norm": 2.399781202126181, + "learning_rate": 6.140724946695097e-05, + "loss": 0.4638, + "step": 360 + }, + { + "epoch": 0.38486140724946694, + "grad_norm": 2.227691194378861, + "learning_rate": 6.157782515991472e-05, + "loss": 0.4605, + "step": 361 + }, + { + "epoch": 0.3859275053304904, + "grad_norm": 1.286105573757533, + "learning_rate": 6.174840085287847e-05, + "loss": 0.4549, + "step": 362 + }, + { + "epoch": 0.38699360341151384, + "grad_norm": 1.5661635396554183, + "learning_rate": 6.191897654584222e-05, + "loss": 0.4553, + "step": 363 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 1.1356847004536101, + "learning_rate": 6.208955223880598e-05, + "loss": 0.4551, + "step": 364 + }, + { + "epoch": 0.38912579957356075, + "grad_norm": 1.5226959517373695, + "learning_rate": 6.226012793176973e-05, + "loss": 0.4527, + "step": 365 + }, + { + "epoch": 0.39019189765458423, + "grad_norm": 1.2678150824491392, + "learning_rate": 6.243070362473348e-05, + "loss": 0.4535, + "step": 366 + }, + { + "epoch": 0.39125799573560766, + "grad_norm": 1.2767687308223976, + "learning_rate": 6.260127931769723e-05, + "loss": 0.4533, + "step": 367 + }, + { + "epoch": 0.39232409381663114, + "grad_norm": 1.2583112180411133, + "learning_rate": 6.277185501066099e-05, + "loss": 0.4479, + "step": 368 + }, + { + "epoch": 0.39339019189765456, + "grad_norm": 0.8712817882443821, + "learning_rate": 6.294243070362474e-05, + "loss": 0.4487, + "step": 369 + }, + { + "epoch": 0.39445628997867804, + "grad_norm": 1.367324065405987, + "learning_rate": 6.311300639658849e-05, + "loss": 0.4478, + "step": 370 + }, + { + "epoch": 0.39552238805970147, + "grad_norm": 0.8890072622408898, + "learning_rate": 6.328358208955224e-05, + "loss": 0.448, + "step": 371 + }, + { + "epoch": 0.39658848614072495, + "grad_norm": 1.3147387007679658, + "learning_rate": 6.3454157782516e-05, + "loss": 0.4462, + "step": 372 + }, + { + "epoch": 0.3976545842217484, + "grad_norm": 1.0068738388038798, + "learning_rate": 6.362473347547975e-05, + "loss": 0.4477, + "step": 373 + }, + { + "epoch": 0.39872068230277186, + "grad_norm": 0.8943784235638984, + "learning_rate": 6.37953091684435e-05, + "loss": 0.4386, + "step": 374 + }, + { + "epoch": 0.3997867803837953, + "grad_norm": 0.9000653302119703, + "learning_rate": 6.396588486140725e-05, + "loss": 0.4473, + "step": 375 + }, + { + "epoch": 0.40085287846481876, + "grad_norm": 0.9661659593224818, + "learning_rate": 6.4136460554371e-05, + "loss": 0.438, + "step": 376 + }, + { + "epoch": 0.40191897654584224, + "grad_norm": 1.2385940487697282, + "learning_rate": 6.430703624733477e-05, + "loss": 0.4466, + "step": 377 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 0.7438645095725858, + "learning_rate": 6.447761194029851e-05, + "loss": 0.4419, + "step": 378 + }, + { + "epoch": 0.40405117270788915, + "grad_norm": 0.7506871300171193, + "learning_rate": 6.464818763326228e-05, + "loss": 0.4448, + "step": 379 + }, + { + "epoch": 0.4051172707889126, + "grad_norm": 0.9063302132407942, + "learning_rate": 6.481876332622601e-05, + "loss": 0.443, + "step": 380 + }, + { + "epoch": 0.40618336886993606, + "grad_norm": 0.7598737346973297, + "learning_rate": 6.498933901918978e-05, + "loss": 0.4387, + "step": 381 + }, + { + "epoch": 0.4072494669509595, + "grad_norm": 0.7700124455332041, + "learning_rate": 6.515991471215352e-05, + "loss": 0.4427, + "step": 382 + }, + { + "epoch": 0.40831556503198296, + "grad_norm": 0.8602508992172722, + "learning_rate": 6.533049040511728e-05, + "loss": 0.4417, + "step": 383 + }, + { + "epoch": 0.4093816631130064, + "grad_norm": 1.0830008793393127, + "learning_rate": 6.550106609808102e-05, + "loss": 0.4438, + "step": 384 + }, + { + "epoch": 0.41044776119402987, + "grad_norm": 1.556613073128131, + "learning_rate": 6.567164179104479e-05, + "loss": 0.4564, + "step": 385 + }, + { + "epoch": 0.4115138592750533, + "grad_norm": 0.6513365090394059, + "learning_rate": 6.584221748400853e-05, + "loss": 0.4549, + "step": 386 + }, + { + "epoch": 0.4125799573560768, + "grad_norm": 1.7019947089510525, + "learning_rate": 6.60127931769723e-05, + "loss": 0.4577, + "step": 387 + }, + { + "epoch": 0.4136460554371002, + "grad_norm": 0.8602634122130145, + "learning_rate": 6.618336886993603e-05, + "loss": 0.4453, + "step": 388 + }, + { + "epoch": 0.4147121535181237, + "grad_norm": 1.4701074651429544, + "learning_rate": 6.63539445628998e-05, + "loss": 0.4553, + "step": 389 + }, + { + "epoch": 0.4157782515991471, + "grad_norm": 1.0042942676179272, + "learning_rate": 6.652452025586354e-05, + "loss": 0.4517, + "step": 390 + }, + { + "epoch": 0.4168443496801706, + "grad_norm": 1.2563707215527287, + "learning_rate": 6.66950959488273e-05, + "loss": 0.4583, + "step": 391 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 0.8702664926874648, + "learning_rate": 6.686567164179104e-05, + "loss": 0.4447, + "step": 392 + }, + { + "epoch": 0.4189765458422175, + "grad_norm": 1.2431851878497833, + "learning_rate": 6.703624733475481e-05, + "loss": 0.4561, + "step": 393 + }, + { + "epoch": 0.4200426439232409, + "grad_norm": 0.74543869723929, + "learning_rate": 6.720682302771855e-05, + "loss": 0.4433, + "step": 394 + }, + { + "epoch": 0.4211087420042644, + "grad_norm": 1.0726381467701929, + "learning_rate": 6.737739872068231e-05, + "loss": 0.4435, + "step": 395 + }, + { + "epoch": 0.42217484008528783, + "grad_norm": 0.8897029348079197, + "learning_rate": 6.754797441364605e-05, + "loss": 0.4432, + "step": 396 + }, + { + "epoch": 0.4232409381663113, + "grad_norm": 0.9052325689021318, + "learning_rate": 6.771855010660982e-05, + "loss": 0.4468, + "step": 397 + }, + { + "epoch": 0.42430703624733473, + "grad_norm": 0.9903919600049477, + "learning_rate": 6.788912579957356e-05, + "loss": 0.4421, + "step": 398 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 1.2565089097200555, + "learning_rate": 6.805970149253732e-05, + "loss": 0.4484, + "step": 399 + }, + { + "epoch": 0.42643923240938164, + "grad_norm": 1.217499983640722, + "learning_rate": 6.823027718550106e-05, + "loss": 0.45, + "step": 400 + }, + { + "epoch": 0.4275053304904051, + "grad_norm": 0.8648975399735686, + "learning_rate": 6.840085287846483e-05, + "loss": 0.4411, + "step": 401 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.7803010542574316, + "learning_rate": 6.857142857142857e-05, + "loss": 0.4452, + "step": 402 + }, + { + "epoch": 0.42963752665245203, + "grad_norm": 1.070484531211215, + "learning_rate": 6.874200426439233e-05, + "loss": 0.4415, + "step": 403 + }, + { + "epoch": 0.43070362473347545, + "grad_norm": 1.0518099835702677, + "learning_rate": 6.891257995735607e-05, + "loss": 0.4483, + "step": 404 + }, + { + "epoch": 0.43176972281449894, + "grad_norm": 1.0987178262200696, + "learning_rate": 6.908315565031984e-05, + "loss": 0.4504, + "step": 405 + }, + { + "epoch": 0.43283582089552236, + "grad_norm": 1.298631454730131, + "learning_rate": 6.925373134328358e-05, + "loss": 0.4484, + "step": 406 + }, + { + "epoch": 0.43390191897654584, + "grad_norm": 0.7350226964695964, + "learning_rate": 6.942430703624734e-05, + "loss": 0.4469, + "step": 407 + }, + { + "epoch": 0.4349680170575693, + "grad_norm": 1.003473508893585, + "learning_rate": 6.95948827292111e-05, + "loss": 0.4473, + "step": 408 + }, + { + "epoch": 0.43603411513859275, + "grad_norm": 1.5274417692020188, + "learning_rate": 6.976545842217485e-05, + "loss": 0.4406, + "step": 409 + }, + { + "epoch": 0.43710021321961623, + "grad_norm": 0.795881436552545, + "learning_rate": 6.99360341151386e-05, + "loss": 0.4458, + "step": 410 + }, + { + "epoch": 0.43816631130063965, + "grad_norm": 1.8802864262284982, + "learning_rate": 7.010660980810235e-05, + "loss": 0.452, + "step": 411 + }, + { + "epoch": 0.43923240938166314, + "grad_norm": 1.0226364397036434, + "learning_rate": 7.02771855010661e-05, + "loss": 0.4439, + "step": 412 + }, + { + "epoch": 0.44029850746268656, + "grad_norm": 2.2181657239884736, + "learning_rate": 7.044776119402986e-05, + "loss": 0.4635, + "step": 413 + }, + { + "epoch": 0.44136460554371004, + "grad_norm": 1.9407812835735618, + "learning_rate": 7.061833688699361e-05, + "loss": 0.4592, + "step": 414 + }, + { + "epoch": 0.44243070362473347, + "grad_norm": 1.309070344848072, + "learning_rate": 7.078891257995736e-05, + "loss": 0.4465, + "step": 415 + }, + { + "epoch": 0.44349680170575695, + "grad_norm": 1.2883250095052161, + "learning_rate": 7.095948827292111e-05, + "loss": 0.4444, + "step": 416 + }, + { + "epoch": 0.4445628997867804, + "grad_norm": 1.0903343923529414, + "learning_rate": 7.113006396588487e-05, + "loss": 0.4496, + "step": 417 + }, + { + "epoch": 0.44562899786780386, + "grad_norm": 1.183664382780786, + "learning_rate": 7.130063965884862e-05, + "loss": 0.4438, + "step": 418 + }, + { + "epoch": 0.4466950959488273, + "grad_norm": 0.658411366668824, + "learning_rate": 7.147121535181237e-05, + "loss": 0.4487, + "step": 419 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.9629596016665242, + "learning_rate": 7.164179104477612e-05, + "loss": 0.4492, + "step": 420 + }, + { + "epoch": 0.4488272921108742, + "grad_norm": 0.6918711094052541, + "learning_rate": 7.181236673773988e-05, + "loss": 0.4459, + "step": 421 + }, + { + "epoch": 0.44989339019189767, + "grad_norm": 0.8563536919057261, + "learning_rate": 7.198294243070363e-05, + "loss": 0.4414, + "step": 422 + }, + { + "epoch": 0.4509594882729211, + "grad_norm": 0.6997094585258404, + "learning_rate": 7.215351812366738e-05, + "loss": 0.4371, + "step": 423 + }, + { + "epoch": 0.4520255863539446, + "grad_norm": 0.9078702301189076, + "learning_rate": 7.232409381663113e-05, + "loss": 0.4385, + "step": 424 + }, + { + "epoch": 0.453091684434968, + "grad_norm": 0.9105908002111917, + "learning_rate": 7.249466950959489e-05, + "loss": 0.4435, + "step": 425 + }, + { + "epoch": 0.4541577825159915, + "grad_norm": 1.17133908131614, + "learning_rate": 7.266524520255864e-05, + "loss": 0.4436, + "step": 426 + }, + { + "epoch": 0.4552238805970149, + "grad_norm": 0.9871418897305207, + "learning_rate": 7.283582089552239e-05, + "loss": 0.4382, + "step": 427 + }, + { + "epoch": 0.4562899786780384, + "grad_norm": 1.1416965993237809, + "learning_rate": 7.300639658848614e-05, + "loss": 0.4413, + "step": 428 + }, + { + "epoch": 0.4573560767590618, + "grad_norm": 0.8995759612900418, + "learning_rate": 7.31769722814499e-05, + "loss": 0.441, + "step": 429 + }, + { + "epoch": 0.4584221748400853, + "grad_norm": 0.9218576549939405, + "learning_rate": 7.334754797441365e-05, + "loss": 0.4384, + "step": 430 + }, + { + "epoch": 0.4594882729211087, + "grad_norm": 0.858746072858543, + "learning_rate": 7.35181236673774e-05, + "loss": 0.4431, + "step": 431 + }, + { + "epoch": 0.4605543710021322, + "grad_norm": 0.935134912271826, + "learning_rate": 7.368869936034115e-05, + "loss": 0.4419, + "step": 432 + }, + { + "epoch": 0.4616204690831556, + "grad_norm": 0.9646962439556347, + "learning_rate": 7.38592750533049e-05, + "loss": 0.4447, + "step": 433 + }, + { + "epoch": 0.4626865671641791, + "grad_norm": 1.1656061718106945, + "learning_rate": 7.402985074626866e-05, + "loss": 0.4449, + "step": 434 + }, + { + "epoch": 0.46375266524520253, + "grad_norm": 0.7326350434710983, + "learning_rate": 7.420042643923241e-05, + "loss": 0.4306, + "step": 435 + }, + { + "epoch": 0.464818763326226, + "grad_norm": 0.4426357716461027, + "learning_rate": 7.437100213219616e-05, + "loss": 0.4391, + "step": 436 + }, + { + "epoch": 0.46588486140724944, + "grad_norm": 0.6078030487605691, + "learning_rate": 7.454157782515992e-05, + "loss": 0.4347, + "step": 437 + }, + { + "epoch": 0.4669509594882729, + "grad_norm": 0.8052535102563257, + "learning_rate": 7.471215351812367e-05, + "loss": 0.4399, + "step": 438 + }, + { + "epoch": 0.4680170575692964, + "grad_norm": 0.993986917853917, + "learning_rate": 7.488272921108743e-05, + "loss": 0.4491, + "step": 439 + }, + { + "epoch": 0.4690831556503198, + "grad_norm": 1.1313255325891831, + "learning_rate": 7.505330490405117e-05, + "loss": 0.4493, + "step": 440 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 0.6842654100869138, + "learning_rate": 7.522388059701494e-05, + "loss": 0.4381, + "step": 441 + }, + { + "epoch": 0.47121535181236673, + "grad_norm": 0.914667240873549, + "learning_rate": 7.539445628997868e-05, + "loss": 0.4298, + "step": 442 + }, + { + "epoch": 0.4722814498933902, + "grad_norm": 1.079253155327782, + "learning_rate": 7.556503198294244e-05, + "loss": 0.4419, + "step": 443 + }, + { + "epoch": 0.47334754797441364, + "grad_norm": 0.7735237895647086, + "learning_rate": 7.573560767590618e-05, + "loss": 0.4363, + "step": 444 + }, + { + "epoch": 0.4744136460554371, + "grad_norm": 0.8494163950632265, + "learning_rate": 7.590618336886995e-05, + "loss": 0.4424, + "step": 445 + }, + { + "epoch": 0.47547974413646055, + "grad_norm": 0.8832423465229242, + "learning_rate": 7.607675906183369e-05, + "loss": 0.4433, + "step": 446 + }, + { + "epoch": 0.47654584221748403, + "grad_norm": 0.9230538736492173, + "learning_rate": 7.624733475479745e-05, + "loss": 0.4373, + "step": 447 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 1.0547271971615721, + "learning_rate": 7.641791044776119e-05, + "loss": 0.4383, + "step": 448 + }, + { + "epoch": 0.47867803837953093, + "grad_norm": 0.8897022777657935, + "learning_rate": 7.658848614072496e-05, + "loss": 0.442, + "step": 449 + }, + { + "epoch": 0.47974413646055436, + "grad_norm": 0.9036068567889025, + "learning_rate": 7.675906183368871e-05, + "loss": 0.4438, + "step": 450 + }, + { + "epoch": 0.48081023454157784, + "grad_norm": 0.881957048697129, + "learning_rate": 7.692963752665246e-05, + "loss": 0.4417, + "step": 451 + }, + { + "epoch": 0.48187633262260127, + "grad_norm": 0.9500253572061279, + "learning_rate": 7.710021321961622e-05, + "loss": 0.4366, + "step": 452 + }, + { + "epoch": 0.48294243070362475, + "grad_norm": 1.2095267797004439, + "learning_rate": 7.727078891257997e-05, + "loss": 0.4362, + "step": 453 + }, + { + "epoch": 0.4840085287846482, + "grad_norm": 1.0482792150572497, + "learning_rate": 7.744136460554372e-05, + "loss": 0.4322, + "step": 454 + }, + { + "epoch": 0.48507462686567165, + "grad_norm": 1.0807249109863046, + "learning_rate": 7.761194029850747e-05, + "loss": 0.4463, + "step": 455 + }, + { + "epoch": 0.4861407249466951, + "grad_norm": 1.0949378713445743, + "learning_rate": 7.778251599147123e-05, + "loss": 0.4384, + "step": 456 + }, + { + "epoch": 0.48720682302771856, + "grad_norm": 1.1230076391837633, + "learning_rate": 7.795309168443498e-05, + "loss": 0.4328, + "step": 457 + }, + { + "epoch": 0.488272921108742, + "grad_norm": 0.9284196769926588, + "learning_rate": 7.812366737739873e-05, + "loss": 0.433, + "step": 458 + }, + { + "epoch": 0.48933901918976547, + "grad_norm": 0.8307602310830237, + "learning_rate": 7.829424307036248e-05, + "loss": 0.4355, + "step": 459 + }, + { + "epoch": 0.4904051172707889, + "grad_norm": 0.6845844913178624, + "learning_rate": 7.846481876332623e-05, + "loss": 0.4324, + "step": 460 + }, + { + "epoch": 0.4914712153518124, + "grad_norm": 0.6727473022234879, + "learning_rate": 7.863539445628999e-05, + "loss": 0.4343, + "step": 461 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 0.9118111002510331, + "learning_rate": 7.880597014925374e-05, + "loss": 0.4332, + "step": 462 + }, + { + "epoch": 0.4936034115138593, + "grad_norm": 1.0468419215009597, + "learning_rate": 7.897654584221749e-05, + "loss": 0.4303, + "step": 463 + }, + { + "epoch": 0.4946695095948827, + "grad_norm": 1.0423557702670259, + "learning_rate": 7.914712153518124e-05, + "loss": 0.4468, + "step": 464 + }, + { + "epoch": 0.4957356076759062, + "grad_norm": 1.0941125408721255, + "learning_rate": 7.9317697228145e-05, + "loss": 0.437, + "step": 465 + }, + { + "epoch": 0.4968017057569296, + "grad_norm": 1.009365737040848, + "learning_rate": 7.948827292110875e-05, + "loss": 0.4344, + "step": 466 + }, + { + "epoch": 0.4978678038379531, + "grad_norm": 0.976490369885011, + "learning_rate": 7.96588486140725e-05, + "loss": 0.4405, + "step": 467 + }, + { + "epoch": 0.4989339019189765, + "grad_norm": 1.0336971441325562, + "learning_rate": 7.982942430703625e-05, + "loss": 0.4415, + "step": 468 + }, + { + "epoch": 0.5, + "grad_norm": 1.1162353727508352, + "learning_rate": 8e-05, + "loss": 0.441, + "step": 469 + }, + { + "epoch": 0.5010660980810234, + "grad_norm": 0.870590499557518, + "learning_rate": 7.999998892103944e-05, + "loss": 0.4385, + "step": 470 + }, + { + "epoch": 0.502132196162047, + "grad_norm": 1.069571130953163, + "learning_rate": 7.999995568416386e-05, + "loss": 0.436, + "step": 471 + }, + { + "epoch": 0.5031982942430704, + "grad_norm": 0.9278286961514632, + "learning_rate": 7.99999002893917e-05, + "loss": 0.439, + "step": 472 + }, + { + "epoch": 0.5042643923240938, + "grad_norm": 0.7961408606111287, + "learning_rate": 7.999982273675363e-05, + "loss": 0.4411, + "step": 473 + }, + { + "epoch": 0.5053304904051172, + "grad_norm": 0.8234137572564111, + "learning_rate": 7.999972302629264e-05, + "loss": 0.4301, + "step": 474 + }, + { + "epoch": 0.5063965884861408, + "grad_norm": 0.8624956407119708, + "learning_rate": 7.999960115806391e-05, + "loss": 0.4329, + "step": 475 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 0.8970331272898654, + "learning_rate": 7.999945713213499e-05, + "loss": 0.4326, + "step": 476 + }, + { + "epoch": 0.5085287846481876, + "grad_norm": 1.0727116898193436, + "learning_rate": 7.999929094858566e-05, + "loss": 0.4345, + "step": 477 + }, + { + "epoch": 0.509594882729211, + "grad_norm": 1.3157426521337816, + "learning_rate": 7.999910260750796e-05, + "loss": 0.4431, + "step": 478 + }, + { + "epoch": 0.5106609808102346, + "grad_norm": 0.8323751755101396, + "learning_rate": 7.999889210900623e-05, + "loss": 0.4368, + "step": 479 + }, + { + "epoch": 0.511727078891258, + "grad_norm": 0.7429493009987966, + "learning_rate": 7.999865945319708e-05, + "loss": 0.4352, + "step": 480 + }, + { + "epoch": 0.5127931769722814, + "grad_norm": 0.7605461290267354, + "learning_rate": 7.999840464020936e-05, + "loss": 0.438, + "step": 481 + }, + { + "epoch": 0.5138592750533049, + "grad_norm": 1.0898292270229712, + "learning_rate": 7.999812767018428e-05, + "loss": 0.4397, + "step": 482 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 1.1380886467744538, + "learning_rate": 7.999782854327523e-05, + "loss": 0.4367, + "step": 483 + }, + { + "epoch": 0.5159914712153518, + "grad_norm": 0.923359986404487, + "learning_rate": 7.99975072596479e-05, + "loss": 0.4384, + "step": 484 + }, + { + "epoch": 0.5170575692963753, + "grad_norm": 0.8375674330038942, + "learning_rate": 7.999716381948029e-05, + "loss": 0.436, + "step": 485 + }, + { + "epoch": 0.5181236673773987, + "grad_norm": 0.7570588135036903, + "learning_rate": 7.999679822296263e-05, + "loss": 0.4333, + "step": 486 + }, + { + "epoch": 0.5191897654584222, + "grad_norm": 0.8549659681016898, + "learning_rate": 7.999641047029747e-05, + "loss": 0.4356, + "step": 487 + }, + { + "epoch": 0.5202558635394456, + "grad_norm": 0.8852253500570002, + "learning_rate": 7.999600056169956e-05, + "loss": 0.4367, + "step": 488 + }, + { + "epoch": 0.5213219616204691, + "grad_norm": 0.9048960575709987, + "learning_rate": 7.9995568497396e-05, + "loss": 0.435, + "step": 489 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.88758605404917, + "learning_rate": 7.999511427762612e-05, + "loss": 0.4326, + "step": 490 + }, + { + "epoch": 0.523454157782516, + "grad_norm": 0.9051344896994227, + "learning_rate": 7.999463790264155e-05, + "loss": 0.4345, + "step": 491 + }, + { + "epoch": 0.5245202558635395, + "grad_norm": 0.9863125025310046, + "learning_rate": 7.999413937270616e-05, + "loss": 0.4302, + "step": 492 + }, + { + "epoch": 0.5255863539445629, + "grad_norm": 1.0414867859886516, + "learning_rate": 7.999361868809611e-05, + "loss": 0.4339, + "step": 493 + }, + { + "epoch": 0.5266524520255863, + "grad_norm": 0.9324905753531538, + "learning_rate": 7.999307584909985e-05, + "loss": 0.4411, + "step": 494 + }, + { + "epoch": 0.5277185501066098, + "grad_norm": 0.8218386585042996, + "learning_rate": 7.999251085601804e-05, + "loss": 0.4349, + "step": 495 + }, + { + "epoch": 0.5287846481876333, + "grad_norm": 0.6954291556313583, + "learning_rate": 7.999192370916371e-05, + "loss": 0.4406, + "step": 496 + }, + { + "epoch": 0.5298507462686567, + "grad_norm": 0.599463593583278, + "learning_rate": 7.999131440886208e-05, + "loss": 0.4294, + "step": 497 + }, + { + "epoch": 0.5309168443496801, + "grad_norm": 0.6547393051285391, + "learning_rate": 7.999068295545068e-05, + "loss": 0.4257, + "step": 498 + }, + { + "epoch": 0.5319829424307037, + "grad_norm": 0.5821330571530523, + "learning_rate": 7.99900293492793e-05, + "loss": 0.4283, + "step": 499 + }, + { + "epoch": 0.5330490405117271, + "grad_norm": 0.6804915712665963, + "learning_rate": 7.998935359071001e-05, + "loss": 0.4328, + "step": 500 + }, + { + "epoch": 0.5341151385927505, + "grad_norm": 0.8261518180898303, + "learning_rate": 7.998865568011713e-05, + "loss": 0.4272, + "step": 501 + }, + { + "epoch": 0.535181236673774, + "grad_norm": 0.984885443645453, + "learning_rate": 7.998793561788727e-05, + "loss": 0.4304, + "step": 502 + }, + { + "epoch": 0.5362473347547975, + "grad_norm": 1.0533079413867938, + "learning_rate": 7.998719340441933e-05, + "loss": 0.4325, + "step": 503 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.7930549662599339, + "learning_rate": 7.998642904012442e-05, + "loss": 0.4318, + "step": 504 + }, + { + "epoch": 0.5383795309168443, + "grad_norm": 0.7826200841082803, + "learning_rate": 7.998564252542599e-05, + "loss": 0.4335, + "step": 505 + }, + { + "epoch": 0.5394456289978679, + "grad_norm": 0.9783531802962122, + "learning_rate": 7.998483386075972e-05, + "loss": 0.4359, + "step": 506 + }, + { + "epoch": 0.5405117270788913, + "grad_norm": 1.0305053743252428, + "learning_rate": 7.998400304657356e-05, + "loss": 0.4295, + "step": 507 + }, + { + "epoch": 0.5415778251599147, + "grad_norm": 1.0480888416790912, + "learning_rate": 7.998315008332773e-05, + "loss": 0.4312, + "step": 508 + }, + { + "epoch": 0.5426439232409381, + "grad_norm": 1.0913103402452111, + "learning_rate": 7.998227497149475e-05, + "loss": 0.4359, + "step": 509 + }, + { + "epoch": 0.5437100213219617, + "grad_norm": 0.7828175941941907, + "learning_rate": 7.998137771155938e-05, + "loss": 0.4253, + "step": 510 + }, + { + "epoch": 0.5447761194029851, + "grad_norm": 0.753244557765304, + "learning_rate": 7.998045830401864e-05, + "loss": 0.4393, + "step": 511 + }, + { + "epoch": 0.5458422174840085, + "grad_norm": 0.7846311196148755, + "learning_rate": 7.997951674938185e-05, + "loss": 0.4381, + "step": 512 + }, + { + "epoch": 0.5469083155650319, + "grad_norm": 0.5773342532146049, + "learning_rate": 7.997855304817059e-05, + "loss": 0.4324, + "step": 513 + }, + { + "epoch": 0.5479744136460555, + "grad_norm": 0.5590862741693943, + "learning_rate": 7.997756720091866e-05, + "loss": 0.4347, + "step": 514 + }, + { + "epoch": 0.5490405117270789, + "grad_norm": 0.5750952292279558, + "learning_rate": 7.997655920817222e-05, + "loss": 0.4316, + "step": 515 + }, + { + "epoch": 0.5501066098081023, + "grad_norm": 0.6003198820780814, + "learning_rate": 7.997552907048961e-05, + "loss": 0.4315, + "step": 516 + }, + { + "epoch": 0.5511727078891258, + "grad_norm": 0.7339386833494144, + "learning_rate": 7.997447678844148e-05, + "loss": 0.4334, + "step": 517 + }, + { + "epoch": 0.5522388059701493, + "grad_norm": 0.901683791960714, + "learning_rate": 7.997340236261076e-05, + "loss": 0.4301, + "step": 518 + }, + { + "epoch": 0.5533049040511727, + "grad_norm": 1.1497802402889508, + "learning_rate": 7.997230579359261e-05, + "loss": 0.4305, + "step": 519 + }, + { + "epoch": 0.5543710021321961, + "grad_norm": 1.0986758974484423, + "learning_rate": 7.997118708199447e-05, + "loss": 0.4319, + "step": 520 + }, + { + "epoch": 0.5554371002132196, + "grad_norm": 0.9768374453514556, + "learning_rate": 7.997004622843603e-05, + "loss": 0.4298, + "step": 521 + }, + { + "epoch": 0.5565031982942431, + "grad_norm": 0.8738815441198616, + "learning_rate": 7.996888323354932e-05, + "loss": 0.4334, + "step": 522 + }, + { + "epoch": 0.5575692963752665, + "grad_norm": 0.7838623932210756, + "learning_rate": 7.996769809797851e-05, + "loss": 0.4188, + "step": 523 + }, + { + "epoch": 0.55863539445629, + "grad_norm": 0.7503706641165158, + "learning_rate": 7.996649082238015e-05, + "loss": 0.4293, + "step": 524 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.7732501093769599, + "learning_rate": 7.9965261407423e-05, + "loss": 0.4222, + "step": 525 + }, + { + "epoch": 0.5607675906183369, + "grad_norm": 0.7009167747253003, + "learning_rate": 7.996400985378807e-05, + "loss": 0.4208, + "step": 526 + }, + { + "epoch": 0.5618336886993603, + "grad_norm": 0.6810945772619069, + "learning_rate": 7.996273616216868e-05, + "loss": 0.4327, + "step": 527 + }, + { + "epoch": 0.5628997867803838, + "grad_norm": 0.683953064924211, + "learning_rate": 7.996144033327038e-05, + "loss": 0.432, + "step": 528 + }, + { + "epoch": 0.5639658848614072, + "grad_norm": 0.6899323492718954, + "learning_rate": 7.9960122367811e-05, + "loss": 0.4213, + "step": 529 + }, + { + "epoch": 0.5650319829424307, + "grad_norm": 0.7321597867806313, + "learning_rate": 7.995878226652061e-05, + "loss": 0.4324, + "step": 530 + }, + { + "epoch": 0.5660980810234542, + "grad_norm": 0.8962870617353619, + "learning_rate": 7.995742003014156e-05, + "loss": 0.4331, + "step": 531 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 0.9482010305629667, + "learning_rate": 7.995603565942846e-05, + "loss": 0.4305, + "step": 532 + }, + { + "epoch": 0.5682302771855011, + "grad_norm": 0.8547510485520907, + "learning_rate": 7.995462915514819e-05, + "loss": 0.428, + "step": 533 + }, + { + "epoch": 0.5692963752665245, + "grad_norm": 0.7205166965219559, + "learning_rate": 7.995320051807987e-05, + "loss": 0.4274, + "step": 534 + }, + { + "epoch": 0.570362473347548, + "grad_norm": 0.6501576811903161, + "learning_rate": 7.995174974901489e-05, + "loss": 0.4349, + "step": 535 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7124693365482168, + "learning_rate": 7.99502768487569e-05, + "loss": 0.4332, + "step": 536 + }, + { + "epoch": 0.5724946695095949, + "grad_norm": 0.9137269789343048, + "learning_rate": 7.994878181812181e-05, + "loss": 0.4329, + "step": 537 + }, + { + "epoch": 0.5735607675906184, + "grad_norm": 1.042428645077687, + "learning_rate": 7.994726465793782e-05, + "loss": 0.4251, + "step": 538 + }, + { + "epoch": 0.5746268656716418, + "grad_norm": 0.9004373071226697, + "learning_rate": 7.994572536904529e-05, + "loss": 0.4285, + "step": 539 + }, + { + "epoch": 0.5756929637526652, + "grad_norm": 0.8473522603996991, + "learning_rate": 7.994416395229696e-05, + "loss": 0.4366, + "step": 540 + }, + { + "epoch": 0.5767590618336887, + "grad_norm": 0.7717855677683063, + "learning_rate": 7.994258040855776e-05, + "loss": 0.426, + "step": 541 + }, + { + "epoch": 0.5778251599147122, + "grad_norm": 0.7447673936934801, + "learning_rate": 7.99409747387049e-05, + "loss": 0.4301, + "step": 542 + }, + { + "epoch": 0.5788912579957356, + "grad_norm": 0.7651727619884234, + "learning_rate": 7.993934694362782e-05, + "loss": 0.429, + "step": 543 + }, + { + "epoch": 0.579957356076759, + "grad_norm": 0.771334240782246, + "learning_rate": 7.993769702422824e-05, + "loss": 0.4351, + "step": 544 + }, + { + "epoch": 0.5810234541577826, + "grad_norm": 0.8138260174231243, + "learning_rate": 7.993602498142015e-05, + "loss": 0.4296, + "step": 545 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 0.9083617257757822, + "learning_rate": 7.993433081612975e-05, + "loss": 0.4207, + "step": 546 + }, + { + "epoch": 0.5831556503198294, + "grad_norm": 1.015947743360487, + "learning_rate": 7.993261452929551e-05, + "loss": 0.435, + "step": 547 + }, + { + "epoch": 0.5842217484008528, + "grad_norm": 0.9636126108742088, + "learning_rate": 7.99308761218682e-05, + "loss": 0.43, + "step": 548 + }, + { + "epoch": 0.5852878464818764, + "grad_norm": 0.9480436655111312, + "learning_rate": 7.992911559481077e-05, + "loss": 0.4333, + "step": 549 + }, + { + "epoch": 0.5863539445628998, + "grad_norm": 0.9004301184201612, + "learning_rate": 7.992733294909848e-05, + "loss": 0.4287, + "step": 550 + }, + { + "epoch": 0.5874200426439232, + "grad_norm": 0.8680389446297649, + "learning_rate": 7.992552818571883e-05, + "loss": 0.4286, + "step": 551 + }, + { + "epoch": 0.5884861407249466, + "grad_norm": 0.8221740761869751, + "learning_rate": 7.992370130567155e-05, + "loss": 0.4264, + "step": 552 + }, + { + "epoch": 0.5895522388059702, + "grad_norm": 0.6856491729915255, + "learning_rate": 7.992185230996864e-05, + "loss": 0.4271, + "step": 553 + }, + { + "epoch": 0.5906183368869936, + "grad_norm": 0.587285396525197, + "learning_rate": 7.991998119963436e-05, + "loss": 0.4249, + "step": 554 + }, + { + "epoch": 0.591684434968017, + "grad_norm": 0.6202171961164545, + "learning_rate": 7.991808797570519e-05, + "loss": 0.437, + "step": 555 + }, + { + "epoch": 0.5927505330490405, + "grad_norm": 0.5703467722471379, + "learning_rate": 7.991617263922988e-05, + "loss": 0.4261, + "step": 556 + }, + { + "epoch": 0.593816631130064, + "grad_norm": 0.5467752825405972, + "learning_rate": 7.991423519126945e-05, + "loss": 0.4261, + "step": 557 + }, + { + "epoch": 0.5948827292110874, + "grad_norm": 0.584586914451865, + "learning_rate": 7.991227563289713e-05, + "loss": 0.4306, + "step": 558 + }, + { + "epoch": 0.5959488272921108, + "grad_norm": 0.5995377319354194, + "learning_rate": 7.991029396519839e-05, + "loss": 0.4295, + "step": 559 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.5825912670190095, + "learning_rate": 7.9908290189271e-05, + "loss": 0.4246, + "step": 560 + }, + { + "epoch": 0.5980810234541578, + "grad_norm": 0.5445221492611624, + "learning_rate": 7.990626430622494e-05, + "loss": 0.4258, + "step": 561 + }, + { + "epoch": 0.5991471215351812, + "grad_norm": 0.6388885844122285, + "learning_rate": 7.990421631718244e-05, + "loss": 0.4237, + "step": 562 + }, + { + "epoch": 0.6002132196162047, + "grad_norm": 0.7985864321661731, + "learning_rate": 7.9902146223278e-05, + "loss": 0.4263, + "step": 563 + }, + { + "epoch": 0.6012793176972282, + "grad_norm": 0.9243564293353863, + "learning_rate": 7.990005402565831e-05, + "loss": 0.4319, + "step": 564 + }, + { + "epoch": 0.6023454157782516, + "grad_norm": 1.0041597610953659, + "learning_rate": 7.989793972548236e-05, + "loss": 0.4317, + "step": 565 + }, + { + "epoch": 0.603411513859275, + "grad_norm": 0.9913301041254353, + "learning_rate": 7.989580332392137e-05, + "loss": 0.4228, + "step": 566 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 0.9215600177004183, + "learning_rate": 7.989364482215878e-05, + "loss": 0.4349, + "step": 567 + }, + { + "epoch": 0.605543710021322, + "grad_norm": 0.8910563956821462, + "learning_rate": 7.989146422139029e-05, + "loss": 0.4256, + "step": 568 + }, + { + "epoch": 0.6066098081023454, + "grad_norm": 0.9133910919719787, + "learning_rate": 7.988926152282384e-05, + "loss": 0.4269, + "step": 569 + }, + { + "epoch": 0.6076759061833689, + "grad_norm": 0.964984044461557, + "learning_rate": 7.988703672767962e-05, + "loss": 0.4271, + "step": 570 + }, + { + "epoch": 0.6087420042643923, + "grad_norm": 1.0759011919787849, + "learning_rate": 7.988478983719003e-05, + "loss": 0.4252, + "step": 571 + }, + { + "epoch": 0.6098081023454158, + "grad_norm": 0.8837314544429662, + "learning_rate": 7.988252085259976e-05, + "loss": 0.4297, + "step": 572 + }, + { + "epoch": 0.6108742004264393, + "grad_norm": 0.6734363195402464, + "learning_rate": 7.988022977516569e-05, + "loss": 0.4295, + "step": 573 + }, + { + "epoch": 0.6119402985074627, + "grad_norm": 0.6034655355722486, + "learning_rate": 7.987791660615695e-05, + "loss": 0.4251, + "step": 574 + }, + { + "epoch": 0.6130063965884861, + "grad_norm": 0.5138841355005869, + "learning_rate": 7.987558134685494e-05, + "loss": 0.4257, + "step": 575 + }, + { + "epoch": 0.6140724946695096, + "grad_norm": 0.5645600852787206, + "learning_rate": 7.987322399855324e-05, + "loss": 0.4241, + "step": 576 + }, + { + "epoch": 0.6151385927505331, + "grad_norm": 0.5906802943834981, + "learning_rate": 7.987084456255773e-05, + "loss": 0.4315, + "step": 577 + }, + { + "epoch": 0.6162046908315565, + "grad_norm": 0.6711742756532632, + "learning_rate": 7.986844304018649e-05, + "loss": 0.429, + "step": 578 + }, + { + "epoch": 0.6172707889125799, + "grad_norm": 0.8364188220124186, + "learning_rate": 7.986601943276982e-05, + "loss": 0.4195, + "step": 579 + }, + { + "epoch": 0.6183368869936035, + "grad_norm": 0.9774403295859477, + "learning_rate": 7.986357374165028e-05, + "loss": 0.4332, + "step": 580 + }, + { + "epoch": 0.6194029850746269, + "grad_norm": 1.04494654978909, + "learning_rate": 7.986110596818265e-05, + "loss": 0.428, + "step": 581 + }, + { + "epoch": 0.6204690831556503, + "grad_norm": 0.9878512856031877, + "learning_rate": 7.985861611373397e-05, + "loss": 0.4248, + "step": 582 + }, + { + "epoch": 0.6215351812366737, + "grad_norm": 0.9524713066725912, + "learning_rate": 7.985610417968348e-05, + "loss": 0.4251, + "step": 583 + }, + { + "epoch": 0.6226012793176973, + "grad_norm": 0.74751402505921, + "learning_rate": 7.985357016742264e-05, + "loss": 0.4319, + "step": 584 + }, + { + "epoch": 0.6236673773987207, + "grad_norm": 0.4034413623064052, + "learning_rate": 7.985101407835519e-05, + "loss": 0.4271, + "step": 585 + }, + { + "epoch": 0.6247334754797441, + "grad_norm": 0.43538759493027, + "learning_rate": 7.984843591389706e-05, + "loss": 0.4295, + "step": 586 + }, + { + "epoch": 0.6257995735607675, + "grad_norm": 0.5770360682304801, + "learning_rate": 7.984583567547643e-05, + "loss": 0.4247, + "step": 587 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 0.601220491680326, + "learning_rate": 7.984321336453364e-05, + "loss": 0.4217, + "step": 588 + }, + { + "epoch": 0.6279317697228145, + "grad_norm": 0.5416809102850237, + "learning_rate": 7.984056898252141e-05, + "loss": 0.4178, + "step": 589 + }, + { + "epoch": 0.6289978678038379, + "grad_norm": 0.5991300962093767, + "learning_rate": 7.983790253090452e-05, + "loss": 0.4268, + "step": 590 + }, + { + "epoch": 0.6300639658848614, + "grad_norm": 0.5939600973200748, + "learning_rate": 7.983521401116005e-05, + "loss": 0.4201, + "step": 591 + }, + { + "epoch": 0.6311300639658849, + "grad_norm": 0.5745808408760033, + "learning_rate": 7.983250342477733e-05, + "loss": 0.4263, + "step": 592 + }, + { + "epoch": 0.6321961620469083, + "grad_norm": 0.6369439414081867, + "learning_rate": 7.982977077325788e-05, + "loss": 0.4221, + "step": 593 + }, + { + "epoch": 0.6332622601279317, + "grad_norm": 0.7105979728558217, + "learning_rate": 7.98270160581154e-05, + "loss": 0.4235, + "step": 594 + }, + { + "epoch": 0.6343283582089553, + "grad_norm": 0.6814030775883104, + "learning_rate": 7.982423928087593e-05, + "loss": 0.4255, + "step": 595 + }, + { + "epoch": 0.6353944562899787, + "grad_norm": 0.6048922616611062, + "learning_rate": 7.982144044307762e-05, + "loss": 0.4216, + "step": 596 + }, + { + "epoch": 0.6364605543710021, + "grad_norm": 0.6271148450150956, + "learning_rate": 7.981861954627088e-05, + "loss": 0.429, + "step": 597 + }, + { + "epoch": 0.6375266524520256, + "grad_norm": 0.7648541301808355, + "learning_rate": 7.981577659201833e-05, + "loss": 0.425, + "step": 598 + }, + { + "epoch": 0.6385927505330491, + "grad_norm": 0.7722433311546377, + "learning_rate": 7.981291158189486e-05, + "loss": 0.4266, + "step": 599 + }, + { + "epoch": 0.6396588486140725, + "grad_norm": 0.8045597536473933, + "learning_rate": 7.98100245174875e-05, + "loss": 0.4226, + "step": 600 + }, + { + "epoch": 0.6407249466950959, + "grad_norm": 0.985587897420224, + "learning_rate": 7.980711540039554e-05, + "loss": 0.422, + "step": 601 + }, + { + "epoch": 0.6417910447761194, + "grad_norm": 1.1286184314617196, + "learning_rate": 7.980418423223049e-05, + "loss": 0.4215, + "step": 602 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.7080020035877171, + "learning_rate": 7.980123101461606e-05, + "loss": 0.4247, + "step": 603 + }, + { + "epoch": 0.6439232409381663, + "grad_norm": 0.5626555829828844, + "learning_rate": 7.979825574918818e-05, + "loss": 0.419, + "step": 604 + }, + { + "epoch": 0.6449893390191898, + "grad_norm": 0.636169281039428, + "learning_rate": 7.979525843759499e-05, + "loss": 0.4278, + "step": 605 + }, + { + "epoch": 0.6460554371002132, + "grad_norm": 0.7198635719149019, + "learning_rate": 7.979223908149685e-05, + "loss": 0.4255, + "step": 606 + }, + { + "epoch": 0.6471215351812367, + "grad_norm": 0.8342343801146002, + "learning_rate": 7.978919768256631e-05, + "loss": 0.4227, + "step": 607 + }, + { + "epoch": 0.6481876332622601, + "grad_norm": 0.9031065878045232, + "learning_rate": 7.978613424248818e-05, + "loss": 0.4308, + "step": 608 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 0.8023633786215478, + "learning_rate": 7.978304876295941e-05, + "loss": 0.419, + "step": 609 + }, + { + "epoch": 0.650319829424307, + "grad_norm": 0.9239599844896783, + "learning_rate": 7.977994124568922e-05, + "loss": 0.424, + "step": 610 + }, + { + "epoch": 0.6513859275053305, + "grad_norm": 0.9482115919888203, + "learning_rate": 7.9776811692399e-05, + "loss": 0.4186, + "step": 611 + }, + { + "epoch": 0.652452025586354, + "grad_norm": 0.786317946938952, + "learning_rate": 7.977366010482236e-05, + "loss": 0.4141, + "step": 612 + }, + { + "epoch": 0.6535181236673774, + "grad_norm": 0.6264488608057049, + "learning_rate": 7.977048648470513e-05, + "loss": 0.4209, + "step": 613 + }, + { + "epoch": 0.6545842217484008, + "grad_norm": 0.7117064313531933, + "learning_rate": 7.976729083380532e-05, + "loss": 0.4262, + "step": 614 + }, + { + "epoch": 0.6556503198294243, + "grad_norm": 0.7175291742270824, + "learning_rate": 7.976407315389314e-05, + "loss": 0.4251, + "step": 615 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 0.6490475195317971, + "learning_rate": 7.976083344675105e-05, + "loss": 0.4137, + "step": 616 + }, + { + "epoch": 0.6577825159914712, + "grad_norm": 0.6621191439676352, + "learning_rate": 7.975757171417365e-05, + "loss": 0.4216, + "step": 617 + }, + { + "epoch": 0.6588486140724946, + "grad_norm": 0.6960151950050518, + "learning_rate": 7.97542879579678e-05, + "loss": 0.4276, + "step": 618 + }, + { + "epoch": 0.6599147121535182, + "grad_norm": 0.6648695880576607, + "learning_rate": 7.975098217995248e-05, + "loss": 0.4209, + "step": 619 + }, + { + "epoch": 0.6609808102345416, + "grad_norm": 0.6121386791069903, + "learning_rate": 7.974765438195897e-05, + "loss": 0.419, + "step": 620 + }, + { + "epoch": 0.662046908315565, + "grad_norm": 0.5748855152032502, + "learning_rate": 7.974430456583069e-05, + "loss": 0.4249, + "step": 621 + }, + { + "epoch": 0.6631130063965884, + "grad_norm": 0.5788717188362661, + "learning_rate": 7.974093273342325e-05, + "loss": 0.4204, + "step": 622 + }, + { + "epoch": 0.664179104477612, + "grad_norm": 0.499053345156499, + "learning_rate": 7.973753888660446e-05, + "loss": 0.4258, + "step": 623 + }, + { + "epoch": 0.6652452025586354, + "grad_norm": 0.6143099826003396, + "learning_rate": 7.973412302725435e-05, + "loss": 0.4233, + "step": 624 + }, + { + "epoch": 0.6663113006396588, + "grad_norm": 0.7815047847063621, + "learning_rate": 7.973068515726514e-05, + "loss": 0.4226, + "step": 625 + }, + { + "epoch": 0.6673773987206824, + "grad_norm": 0.8746470159019124, + "learning_rate": 7.972722527854119e-05, + "loss": 0.4286, + "step": 626 + }, + { + "epoch": 0.6684434968017058, + "grad_norm": 0.8412210250981513, + "learning_rate": 7.972374339299915e-05, + "loss": 0.4222, + "step": 627 + }, + { + "epoch": 0.6695095948827292, + "grad_norm": 0.8583591463497402, + "learning_rate": 7.972023950256775e-05, + "loss": 0.4248, + "step": 628 + }, + { + "epoch": 0.6705756929637526, + "grad_norm": 0.8975533200078734, + "learning_rate": 7.9716713609188e-05, + "loss": 0.4332, + "step": 629 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.9787622637115202, + "learning_rate": 7.971316571481306e-05, + "loss": 0.4305, + "step": 630 + }, + { + "epoch": 0.6727078891257996, + "grad_norm": 0.9814190373327007, + "learning_rate": 7.970959582140825e-05, + "loss": 0.4339, + "step": 631 + }, + { + "epoch": 0.673773987206823, + "grad_norm": 0.9082356663533697, + "learning_rate": 7.970600393095113e-05, + "loss": 0.4262, + "step": 632 + }, + { + "epoch": 0.6748400852878464, + "grad_norm": 0.7973168134733778, + "learning_rate": 7.970239004543141e-05, + "loss": 0.4263, + "step": 633 + }, + { + "epoch": 0.67590618336887, + "grad_norm": 0.7223873106620373, + "learning_rate": 7.969875416685101e-05, + "loss": 0.4174, + "step": 634 + }, + { + "epoch": 0.6769722814498934, + "grad_norm": 0.7431984730465593, + "learning_rate": 7.9695096297224e-05, + "loss": 0.4296, + "step": 635 + }, + { + "epoch": 0.6780383795309168, + "grad_norm": 0.7282657781320044, + "learning_rate": 7.969141643857665e-05, + "loss": 0.4267, + "step": 636 + }, + { + "epoch": 0.6791044776119403, + "grad_norm": 0.5869814576553907, + "learning_rate": 7.968771459294742e-05, + "loss": 0.4191, + "step": 637 + }, + { + "epoch": 0.6801705756929638, + "grad_norm": 0.5004088468685179, + "learning_rate": 7.968399076238694e-05, + "loss": 0.4198, + "step": 638 + }, + { + "epoch": 0.6812366737739872, + "grad_norm": 0.6199279637183518, + "learning_rate": 7.968024494895802e-05, + "loss": 0.423, + "step": 639 + }, + { + "epoch": 0.6823027718550106, + "grad_norm": 0.6026223540177856, + "learning_rate": 7.967647715473563e-05, + "loss": 0.4252, + "step": 640 + }, + { + "epoch": 0.6833688699360341, + "grad_norm": 0.5355415437778251, + "learning_rate": 7.967268738180694e-05, + "loss": 0.4131, + "step": 641 + }, + { + "epoch": 0.6844349680170576, + "grad_norm": 0.5663522144573513, + "learning_rate": 7.966887563227132e-05, + "loss": 0.4252, + "step": 642 + }, + { + "epoch": 0.685501066098081, + "grad_norm": 0.5690136220036935, + "learning_rate": 7.966504190824021e-05, + "loss": 0.4186, + "step": 643 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 0.6635581651982301, + "learning_rate": 7.966118621183735e-05, + "loss": 0.4161, + "step": 644 + }, + { + "epoch": 0.6876332622601279, + "grad_norm": 0.7754746339129004, + "learning_rate": 7.96573085451986e-05, + "loss": 0.4285, + "step": 645 + }, + { + "epoch": 0.6886993603411514, + "grad_norm": 0.9415320034442117, + "learning_rate": 7.965340891047196e-05, + "loss": 0.4242, + "step": 646 + }, + { + "epoch": 0.6897654584221748, + "grad_norm": 1.133439191591166, + "learning_rate": 7.964948730981763e-05, + "loss": 0.4264, + "step": 647 + }, + { + "epoch": 0.6908315565031983, + "grad_norm": 0.8248991981565036, + "learning_rate": 7.964554374540797e-05, + "loss": 0.4271, + "step": 648 + }, + { + "epoch": 0.6918976545842217, + "grad_norm": 0.49941087039661575, + "learning_rate": 7.964157821942752e-05, + "loss": 0.4185, + "step": 649 + }, + { + "epoch": 0.6929637526652452, + "grad_norm": 0.5088744299454488, + "learning_rate": 7.963759073407297e-05, + "loss": 0.4228, + "step": 650 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 0.5561088080079999, + "learning_rate": 7.963358129155318e-05, + "loss": 0.4199, + "step": 651 + }, + { + "epoch": 0.6950959488272921, + "grad_norm": 0.5380479849912571, + "learning_rate": 7.962954989408916e-05, + "loss": 0.4251, + "step": 652 + }, + { + "epoch": 0.6961620469083155, + "grad_norm": 0.5750528453656307, + "learning_rate": 7.962549654391412e-05, + "loss": 0.4199, + "step": 653 + }, + { + "epoch": 0.697228144989339, + "grad_norm": 0.6325892582842803, + "learning_rate": 7.962142124327338e-05, + "loss": 0.4206, + "step": 654 + }, + { + "epoch": 0.6982942430703625, + "grad_norm": 0.7898343271492005, + "learning_rate": 7.961732399442448e-05, + "loss": 0.422, + "step": 655 + }, + { + "epoch": 0.6993603411513859, + "grad_norm": 0.9578185478631844, + "learning_rate": 7.961320479963703e-05, + "loss": 0.4158, + "step": 656 + }, + { + "epoch": 0.7004264392324094, + "grad_norm": 1.064875777304307, + "learning_rate": 7.96090636611929e-05, + "loss": 0.4212, + "step": 657 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 0.870822886466798, + "learning_rate": 7.960490058138604e-05, + "loss": 0.4294, + "step": 658 + }, + { + "epoch": 0.7025586353944563, + "grad_norm": 0.7665000673413538, + "learning_rate": 7.960071556252259e-05, + "loss": 0.4293, + "step": 659 + }, + { + "epoch": 0.7036247334754797, + "grad_norm": 0.7943565964649919, + "learning_rate": 7.959650860692082e-05, + "loss": 0.4153, + "step": 660 + }, + { + "epoch": 0.7046908315565032, + "grad_norm": 0.820866857806031, + "learning_rate": 7.959227971691118e-05, + "loss": 0.4218, + "step": 661 + }, + { + "epoch": 0.7057569296375267, + "grad_norm": 0.6751476503197974, + "learning_rate": 7.958802889483626e-05, + "loss": 0.4193, + "step": 662 + }, + { + "epoch": 0.7068230277185501, + "grad_norm": 0.3829844790409102, + "learning_rate": 7.958375614305076e-05, + "loss": 0.42, + "step": 663 + }, + { + "epoch": 0.7078891257995735, + "grad_norm": 0.5522600827629381, + "learning_rate": 7.957946146392159e-05, + "loss": 0.4224, + "step": 664 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 0.7878606313595656, + "learning_rate": 7.957514485982778e-05, + "loss": 0.4144, + "step": 665 + }, + { + "epoch": 0.7100213219616205, + "grad_norm": 0.7385093188465528, + "learning_rate": 7.95708063331605e-05, + "loss": 0.4259, + "step": 666 + }, + { + "epoch": 0.7110874200426439, + "grad_norm": 0.58178433551703, + "learning_rate": 7.956644588632307e-05, + "loss": 0.422, + "step": 667 + }, + { + "epoch": 0.7121535181236673, + "grad_norm": 0.6284258703892865, + "learning_rate": 7.956206352173093e-05, + "loss": 0.421, + "step": 668 + }, + { + "epoch": 0.7132196162046909, + "grad_norm": 0.7936414413541673, + "learning_rate": 7.95576592418117e-05, + "loss": 0.4259, + "step": 669 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.8521993846503206, + "learning_rate": 7.955323304900514e-05, + "loss": 0.4228, + "step": 670 + }, + { + "epoch": 0.7153518123667377, + "grad_norm": 0.7127889848840783, + "learning_rate": 7.954878494576312e-05, + "loss": 0.4201, + "step": 671 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.6074747505915871, + "learning_rate": 7.954431493454964e-05, + "loss": 0.4213, + "step": 672 + }, + { + "epoch": 0.7174840085287847, + "grad_norm": 0.7129990882836668, + "learning_rate": 7.953982301784085e-05, + "loss": 0.4234, + "step": 673 + }, + { + "epoch": 0.7185501066098081, + "grad_norm": 0.7722302015202298, + "learning_rate": 7.953530919812506e-05, + "loss": 0.4191, + "step": 674 + }, + { + "epoch": 0.7196162046908315, + "grad_norm": 0.7433421116192664, + "learning_rate": 7.95307734779027e-05, + "loss": 0.4211, + "step": 675 + }, + { + "epoch": 0.720682302771855, + "grad_norm": 0.8264421753718556, + "learning_rate": 7.95262158596863e-05, + "loss": 0.4218, + "step": 676 + }, + { + "epoch": 0.7217484008528785, + "grad_norm": 1.0203826309064437, + "learning_rate": 7.952163634600055e-05, + "loss": 0.4171, + "step": 677 + }, + { + "epoch": 0.7228144989339019, + "grad_norm": 0.8482809756462577, + "learning_rate": 7.951703493938226e-05, + "loss": 0.4194, + "step": 678 + }, + { + "epoch": 0.7238805970149254, + "grad_norm": 0.5426953720506598, + "learning_rate": 7.951241164238039e-05, + "loss": 0.4191, + "step": 679 + }, + { + "epoch": 0.7249466950959488, + "grad_norm": 0.7193984844467518, + "learning_rate": 7.950776645755596e-05, + "loss": 0.4213, + "step": 680 + }, + { + "epoch": 0.7260127931769723, + "grad_norm": 0.7142598516288384, + "learning_rate": 7.950309938748221e-05, + "loss": 0.4205, + "step": 681 + }, + { + "epoch": 0.7270788912579957, + "grad_norm": 0.39915429482120396, + "learning_rate": 7.949841043474445e-05, + "loss": 0.4147, + "step": 682 + }, + { + "epoch": 0.7281449893390192, + "grad_norm": 0.5327714348171071, + "learning_rate": 7.949369960194009e-05, + "loss": 0.4183, + "step": 683 + }, + { + "epoch": 0.7292110874200426, + "grad_norm": 0.5612054527272023, + "learning_rate": 7.94889668916787e-05, + "loss": 0.4185, + "step": 684 + }, + { + "epoch": 0.7302771855010661, + "grad_norm": 0.4666415892203282, + "learning_rate": 7.948421230658196e-05, + "loss": 0.4182, + "step": 685 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 0.520314689388758, + "learning_rate": 7.947943584928364e-05, + "loss": 0.4237, + "step": 686 + }, + { + "epoch": 0.732409381663113, + "grad_norm": 0.41852601792232424, + "learning_rate": 7.947463752242968e-05, + "loss": 0.4136, + "step": 687 + }, + { + "epoch": 0.7334754797441365, + "grad_norm": 0.44187419017345353, + "learning_rate": 7.946981732867808e-05, + "loss": 0.4188, + "step": 688 + }, + { + "epoch": 0.7345415778251599, + "grad_norm": 0.5519060149192045, + "learning_rate": 7.946497527069898e-05, + "loss": 0.4178, + "step": 689 + }, + { + "epoch": 0.7356076759061834, + "grad_norm": 0.5189998274103562, + "learning_rate": 7.946011135117466e-05, + "loss": 0.4156, + "step": 690 + }, + { + "epoch": 0.7366737739872068, + "grad_norm": 0.5845444307770654, + "learning_rate": 7.945522557279944e-05, + "loss": 0.413, + "step": 691 + }, + { + "epoch": 0.7377398720682303, + "grad_norm": 0.7034648713070897, + "learning_rate": 7.94503179382798e-05, + "loss": 0.4234, + "step": 692 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 0.8180017083563943, + "learning_rate": 7.944538845033431e-05, + "loss": 0.4181, + "step": 693 + }, + { + "epoch": 0.7398720682302772, + "grad_norm": 1.0411030646276598, + "learning_rate": 7.944043711169367e-05, + "loss": 0.4177, + "step": 694 + }, + { + "epoch": 0.7409381663113006, + "grad_norm": 1.1183438708338787, + "learning_rate": 7.943546392510065e-05, + "loss": 0.4245, + "step": 695 + }, + { + "epoch": 0.7420042643923241, + "grad_norm": 0.7568820848776477, + "learning_rate": 7.943046889331013e-05, + "loss": 0.4223, + "step": 696 + }, + { + "epoch": 0.7430703624733476, + "grad_norm": 0.4694996552198355, + "learning_rate": 7.94254520190891e-05, + "loss": 0.4204, + "step": 697 + }, + { + "epoch": 0.744136460554371, + "grad_norm": 0.42506834068311544, + "learning_rate": 7.942041330521665e-05, + "loss": 0.4133, + "step": 698 + }, + { + "epoch": 0.7452025586353944, + "grad_norm": 0.7432323993182501, + "learning_rate": 7.941535275448399e-05, + "loss": 0.4204, + "step": 699 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 1.017099030937784, + "learning_rate": 7.941027036969437e-05, + "loss": 0.4151, + "step": 700 + }, + { + "epoch": 0.7473347547974414, + "grad_norm": 1.042873613244398, + "learning_rate": 7.940516615366318e-05, + "loss": 0.4225, + "step": 701 + }, + { + "epoch": 0.7484008528784648, + "grad_norm": 0.672037071535835, + "learning_rate": 7.940004010921787e-05, + "loss": 0.4155, + "step": 702 + }, + { + "epoch": 0.7494669509594882, + "grad_norm": 0.47977822737782094, + "learning_rate": 7.939489223919803e-05, + "loss": 0.4239, + "step": 703 + }, + { + "epoch": 0.7505330490405118, + "grad_norm": 0.5839565536019953, + "learning_rate": 7.938972254645529e-05, + "loss": 0.4157, + "step": 704 + }, + { + "epoch": 0.7515991471215352, + "grad_norm": 0.659589798795434, + "learning_rate": 7.938453103385343e-05, + "loss": 0.4297, + "step": 705 + }, + { + "epoch": 0.7526652452025586, + "grad_norm": 0.6224572447097061, + "learning_rate": 7.937931770426825e-05, + "loss": 0.4178, + "step": 706 + }, + { + "epoch": 0.753731343283582, + "grad_norm": 0.5223926429775323, + "learning_rate": 7.937408256058764e-05, + "loss": 0.4195, + "step": 707 + }, + { + "epoch": 0.7547974413646056, + "grad_norm": 0.5263236599515704, + "learning_rate": 7.936882560571165e-05, + "loss": 0.4225, + "step": 708 + }, + { + "epoch": 0.755863539445629, + "grad_norm": 0.6701920085872527, + "learning_rate": 7.936354684255231e-05, + "loss": 0.423, + "step": 709 + }, + { + "epoch": 0.7569296375266524, + "grad_norm": 0.8271768873467236, + "learning_rate": 7.935824627403382e-05, + "loss": 0.4197, + "step": 710 + }, + { + "epoch": 0.7579957356076759, + "grad_norm": 0.8903386742919085, + "learning_rate": 7.935292390309239e-05, + "loss": 0.4161, + "step": 711 + }, + { + "epoch": 0.7590618336886994, + "grad_norm": 0.8971908647303243, + "learning_rate": 7.934757973267636e-05, + "loss": 0.4212, + "step": 712 + }, + { + "epoch": 0.7601279317697228, + "grad_norm": 0.8738601854484014, + "learning_rate": 7.93422137657461e-05, + "loss": 0.4145, + "step": 713 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 0.8238424939346762, + "learning_rate": 7.93368260052741e-05, + "loss": 0.4231, + "step": 714 + }, + { + "epoch": 0.7622601279317697, + "grad_norm": 0.8322879935982136, + "learning_rate": 7.933141645424489e-05, + "loss": 0.4123, + "step": 715 + }, + { + "epoch": 0.7633262260127932, + "grad_norm": 0.7762441733310584, + "learning_rate": 7.932598511565506e-05, + "loss": 0.4139, + "step": 716 + }, + { + "epoch": 0.7643923240938166, + "grad_norm": 0.7714495728551265, + "learning_rate": 7.932053199251332e-05, + "loss": 0.4172, + "step": 717 + }, + { + "epoch": 0.7654584221748401, + "grad_norm": 0.6994068976148212, + "learning_rate": 7.931505708784042e-05, + "loss": 0.4209, + "step": 718 + }, + { + "epoch": 0.7665245202558635, + "grad_norm": 0.4918911877930488, + "learning_rate": 7.930956040466912e-05, + "loss": 0.4187, + "step": 719 + }, + { + "epoch": 0.767590618336887, + "grad_norm": 0.4206999757056118, + "learning_rate": 7.930404194604436e-05, + "loss": 0.4197, + "step": 720 + }, + { + "epoch": 0.7686567164179104, + "grad_norm": 0.491102787820834, + "learning_rate": 7.929850171502304e-05, + "loss": 0.4223, + "step": 721 + }, + { + "epoch": 0.7697228144989339, + "grad_norm": 0.5414023421551015, + "learning_rate": 7.92929397146742e-05, + "loss": 0.4158, + "step": 722 + }, + { + "epoch": 0.7707889125799574, + "grad_norm": 0.43007919824776014, + "learning_rate": 7.928735594807885e-05, + "loss": 0.4197, + "step": 723 + }, + { + "epoch": 0.7718550106609808, + "grad_norm": 0.357619381539661, + "learning_rate": 7.928175041833014e-05, + "loss": 0.4111, + "step": 724 + }, + { + "epoch": 0.7729211087420043, + "grad_norm": 0.44213212514382777, + "learning_rate": 7.927612312853321e-05, + "loss": 0.4233, + "step": 725 + }, + { + "epoch": 0.7739872068230277, + "grad_norm": 0.4580051310888454, + "learning_rate": 7.927047408180533e-05, + "loss": 0.409, + "step": 726 + }, + { + "epoch": 0.7750533049040512, + "grad_norm": 0.5077929282870794, + "learning_rate": 7.926480328127573e-05, + "loss": 0.4198, + "step": 727 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 0.6011826055428627, + "learning_rate": 7.925911073008578e-05, + "loss": 0.4217, + "step": 728 + }, + { + "epoch": 0.7771855010660981, + "grad_norm": 0.9129504733266989, + "learning_rate": 7.925339643138885e-05, + "loss": 0.4231, + "step": 729 + }, + { + "epoch": 0.7782515991471215, + "grad_norm": 1.2218378663823979, + "learning_rate": 7.924766038835035e-05, + "loss": 0.4243, + "step": 730 + }, + { + "epoch": 0.779317697228145, + "grad_norm": 0.7903329161767807, + "learning_rate": 7.924190260414776e-05, + "loss": 0.4208, + "step": 731 + }, + { + "epoch": 0.7803837953091685, + "grad_norm": 0.6805821512299308, + "learning_rate": 7.923612308197058e-05, + "loss": 0.42, + "step": 732 + }, + { + "epoch": 0.7814498933901919, + "grad_norm": 0.6416299900106649, + "learning_rate": 7.923032182502037e-05, + "loss": 0.4201, + "step": 733 + }, + { + "epoch": 0.7825159914712153, + "grad_norm": 0.6659708417529658, + "learning_rate": 7.922449883651074e-05, + "loss": 0.4144, + "step": 734 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.5293729604666099, + "learning_rate": 7.92186541196673e-05, + "loss": 0.4219, + "step": 735 + }, + { + "epoch": 0.7846481876332623, + "grad_norm": 0.4286984074785958, + "learning_rate": 7.921278767772774e-05, + "loss": 0.4141, + "step": 736 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.49672004967076516, + "learning_rate": 7.920689951394175e-05, + "loss": 0.4222, + "step": 737 + }, + { + "epoch": 0.7867803837953091, + "grad_norm": 0.6284915886717239, + "learning_rate": 7.920098963157108e-05, + "loss": 0.4252, + "step": 738 + }, + { + "epoch": 0.7878464818763327, + "grad_norm": 0.787902960458551, + "learning_rate": 7.919505803388949e-05, + "loss": 0.4248, + "step": 739 + }, + { + "epoch": 0.7889125799573561, + "grad_norm": 0.8686899347398345, + "learning_rate": 7.918910472418278e-05, + "loss": 0.415, + "step": 740 + }, + { + "epoch": 0.7899786780383795, + "grad_norm": 0.8437670042808346, + "learning_rate": 7.918312970574875e-05, + "loss": 0.4164, + "step": 741 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 0.8540882179405255, + "learning_rate": 7.917713298189728e-05, + "loss": 0.4253, + "step": 742 + }, + { + "epoch": 0.7921108742004265, + "grad_norm": 0.9478085360927803, + "learning_rate": 7.917111455595023e-05, + "loss": 0.4196, + "step": 743 + }, + { + "epoch": 0.7931769722814499, + "grad_norm": 0.9327562081026817, + "learning_rate": 7.916507443124153e-05, + "loss": 0.4242, + "step": 744 + }, + { + "epoch": 0.7942430703624733, + "grad_norm": 0.8551898160448683, + "learning_rate": 7.915901261111703e-05, + "loss": 0.4178, + "step": 745 + }, + { + "epoch": 0.7953091684434968, + "grad_norm": 0.6425189560126273, + "learning_rate": 7.91529290989347e-05, + "loss": 0.4213, + "step": 746 + }, + { + "epoch": 0.7963752665245203, + "grad_norm": 0.5781816257700045, + "learning_rate": 7.91468238980645e-05, + "loss": 0.4291, + "step": 747 + }, + { + "epoch": 0.7974413646055437, + "grad_norm": 0.6921643667155062, + "learning_rate": 7.914069701188837e-05, + "loss": 0.4197, + "step": 748 + }, + { + "epoch": 0.7985074626865671, + "grad_norm": 0.7614603203693651, + "learning_rate": 7.913454844380031e-05, + "loss": 0.4156, + "step": 749 + }, + { + "epoch": 0.7995735607675906, + "grad_norm": 0.7095354236567498, + "learning_rate": 7.912837819720628e-05, + "loss": 0.4127, + "step": 750 + }, + { + "epoch": 0.8006396588486141, + "grad_norm": 0.6873164412119117, + "learning_rate": 7.91221862755243e-05, + "loss": 0.417, + "step": 751 + }, + { + "epoch": 0.8017057569296375, + "grad_norm": 0.718103908521537, + "learning_rate": 7.911597268218435e-05, + "loss": 0.4234, + "step": 752 + }, + { + "epoch": 0.802771855010661, + "grad_norm": 0.6911031732156341, + "learning_rate": 7.910973742062847e-05, + "loss": 0.4142, + "step": 753 + }, + { + "epoch": 0.8038379530916845, + "grad_norm": 0.7537929051659943, + "learning_rate": 7.910348049431064e-05, + "loss": 0.4161, + "step": 754 + }, + { + "epoch": 0.8049040511727079, + "grad_norm": 0.845377153307119, + "learning_rate": 7.909720190669689e-05, + "loss": 0.4135, + "step": 755 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 0.9386808147663165, + "learning_rate": 7.909090166126523e-05, + "loss": 0.4175, + "step": 756 + }, + { + "epoch": 0.8070362473347548, + "grad_norm": 0.874198077040556, + "learning_rate": 7.908457976150565e-05, + "loss": 0.4168, + "step": 757 + }, + { + "epoch": 0.8081023454157783, + "grad_norm": 0.734655468857772, + "learning_rate": 7.907823621092017e-05, + "loss": 0.4218, + "step": 758 + }, + { + "epoch": 0.8091684434968017, + "grad_norm": 0.664396615528394, + "learning_rate": 7.907187101302279e-05, + "loss": 0.4095, + "step": 759 + }, + { + "epoch": 0.8102345415778252, + "grad_norm": 0.5039424928453715, + "learning_rate": 7.90654841713395e-05, + "loss": 0.4102, + "step": 760 + }, + { + "epoch": 0.8113006396588486, + "grad_norm": 0.38736014140762187, + "learning_rate": 7.905907568940825e-05, + "loss": 0.4075, + "step": 761 + }, + { + "epoch": 0.8123667377398721, + "grad_norm": 0.5398794260824558, + "learning_rate": 7.905264557077905e-05, + "loss": 0.4185, + "step": 762 + }, + { + "epoch": 0.8134328358208955, + "grad_norm": 0.6961113732246285, + "learning_rate": 7.904619381901382e-05, + "loss": 0.419, + "step": 763 + }, + { + "epoch": 0.814498933901919, + "grad_norm": 0.7099747167015686, + "learning_rate": 7.903972043768652e-05, + "loss": 0.4192, + "step": 764 + }, + { + "epoch": 0.8155650319829424, + "grad_norm": 0.7111718390365964, + "learning_rate": 7.903322543038302e-05, + "loss": 0.4231, + "step": 765 + }, + { + "epoch": 0.8166311300639659, + "grad_norm": 0.7243885981248829, + "learning_rate": 7.902670880070126e-05, + "loss": 0.4213, + "step": 766 + }, + { + "epoch": 0.8176972281449894, + "grad_norm": 0.8443210902757722, + "learning_rate": 7.902017055225111e-05, + "loss": 0.4173, + "step": 767 + }, + { + "epoch": 0.8187633262260128, + "grad_norm": 0.9889792776736421, + "learning_rate": 7.901361068865441e-05, + "loss": 0.4188, + "step": 768 + }, + { + "epoch": 0.8198294243070362, + "grad_norm": 1.1016684504340584, + "learning_rate": 7.9007029213545e-05, + "loss": 0.4206, + "step": 769 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.6894548537797883, + "learning_rate": 7.900042613056864e-05, + "loss": 0.4147, + "step": 770 + }, + { + "epoch": 0.8219616204690832, + "grad_norm": 0.4780239669310881, + "learning_rate": 7.899380144338313e-05, + "loss": 0.4098, + "step": 771 + }, + { + "epoch": 0.8230277185501066, + "grad_norm": 0.7367707580027851, + "learning_rate": 7.898715515565817e-05, + "loss": 0.425, + "step": 772 + }, + { + "epoch": 0.82409381663113, + "grad_norm": 0.9588196347102074, + "learning_rate": 7.898048727107549e-05, + "loss": 0.4201, + "step": 773 + }, + { + "epoch": 0.8251599147121536, + "grad_norm": 0.9328164536411601, + "learning_rate": 7.897379779332873e-05, + "loss": 0.4086, + "step": 774 + }, + { + "epoch": 0.826226012793177, + "grad_norm": 0.7663371340787454, + "learning_rate": 7.896708672612352e-05, + "loss": 0.4157, + "step": 775 + }, + { + "epoch": 0.8272921108742004, + "grad_norm": 0.5139058112934316, + "learning_rate": 7.896035407317746e-05, + "loss": 0.417, + "step": 776 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 0.5600567127054864, + "learning_rate": 7.895359983822004e-05, + "loss": 0.4164, + "step": 777 + }, + { + "epoch": 0.8294243070362474, + "grad_norm": 0.7393036162635956, + "learning_rate": 7.894682402499283e-05, + "loss": 0.4254, + "step": 778 + }, + { + "epoch": 0.8304904051172708, + "grad_norm": 0.7834777517715102, + "learning_rate": 7.894002663724921e-05, + "loss": 0.4156, + "step": 779 + }, + { + "epoch": 0.8315565031982942, + "grad_norm": 0.760746777211738, + "learning_rate": 7.89332076787546e-05, + "loss": 0.4183, + "step": 780 + }, + { + "epoch": 0.8326226012793176, + "grad_norm": 0.7072386884303129, + "learning_rate": 7.892636715328638e-05, + "loss": 0.4085, + "step": 781 + }, + { + "epoch": 0.8336886993603412, + "grad_norm": 0.5891467963351983, + "learning_rate": 7.89195050646338e-05, + "loss": 0.4174, + "step": 782 + }, + { + "epoch": 0.8347547974413646, + "grad_norm": 0.4907232824657599, + "learning_rate": 7.891262141659812e-05, + "loss": 0.4172, + "step": 783 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 0.46554454666441136, + "learning_rate": 7.890571621299252e-05, + "loss": 0.4135, + "step": 784 + }, + { + "epoch": 0.8368869936034116, + "grad_norm": 0.448212012101914, + "learning_rate": 7.889878945764215e-05, + "loss": 0.4187, + "step": 785 + }, + { + "epoch": 0.837953091684435, + "grad_norm": 0.6062486130131177, + "learning_rate": 7.889184115438403e-05, + "loss": 0.4184, + "step": 786 + }, + { + "epoch": 0.8390191897654584, + "grad_norm": 0.6256523573917208, + "learning_rate": 7.888487130706719e-05, + "loss": 0.4108, + "step": 787 + }, + { + "epoch": 0.8400852878464818, + "grad_norm": 0.5794981444661234, + "learning_rate": 7.887787991955254e-05, + "loss": 0.4156, + "step": 788 + }, + { + "epoch": 0.8411513859275054, + "grad_norm": 0.5092450399277311, + "learning_rate": 7.887086699571297e-05, + "loss": 0.4138, + "step": 789 + }, + { + "epoch": 0.8422174840085288, + "grad_norm": 0.3853513561237604, + "learning_rate": 7.886383253943326e-05, + "loss": 0.4101, + "step": 790 + }, + { + "epoch": 0.8432835820895522, + "grad_norm": 0.5174800575237039, + "learning_rate": 7.885677655461013e-05, + "loss": 0.415, + "step": 791 + }, + { + "epoch": 0.8443496801705757, + "grad_norm": 0.7565196574017777, + "learning_rate": 7.884969904515224e-05, + "loss": 0.4163, + "step": 792 + }, + { + "epoch": 0.8454157782515992, + "grad_norm": 0.9209372045007519, + "learning_rate": 7.884260001498015e-05, + "loss": 0.4153, + "step": 793 + }, + { + "epoch": 0.8464818763326226, + "grad_norm": 1.0713514936667083, + "learning_rate": 7.883547946802637e-05, + "loss": 0.4158, + "step": 794 + }, + { + "epoch": 0.847547974413646, + "grad_norm": 0.8424265293921103, + "learning_rate": 7.882833740823531e-05, + "loss": 0.4085, + "step": 795 + }, + { + "epoch": 0.8486140724946695, + "grad_norm": 0.4698049947846306, + "learning_rate": 7.882117383956328e-05, + "loss": 0.4176, + "step": 796 + }, + { + "epoch": 0.849680170575693, + "grad_norm": 0.45769071058539357, + "learning_rate": 7.881398876597855e-05, + "loss": 0.4122, + "step": 797 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 0.5978831801648813, + "learning_rate": 7.880678219146125e-05, + "loss": 0.4134, + "step": 798 + }, + { + "epoch": 0.8518123667377399, + "grad_norm": 0.6151035364335734, + "learning_rate": 7.879955412000348e-05, + "loss": 0.413, + "step": 799 + }, + { + "epoch": 0.8528784648187633, + "grad_norm": 0.5747882183128944, + "learning_rate": 7.87923045556092e-05, + "loss": 0.4119, + "step": 800 + }, + { + "epoch": 0.8539445628997868, + "grad_norm": 0.5283977724250003, + "learning_rate": 7.878503350229428e-05, + "loss": 0.4123, + "step": 801 + }, + { + "epoch": 0.8550106609808102, + "grad_norm": 0.5661813646706929, + "learning_rate": 7.877774096408652e-05, + "loss": 0.4173, + "step": 802 + }, + { + "epoch": 0.8560767590618337, + "grad_norm": 0.7555232731803762, + "learning_rate": 7.87704269450256e-05, + "loss": 0.414, + "step": 803 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.9330140626616096, + "learning_rate": 7.876309144916312e-05, + "loss": 0.4174, + "step": 804 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 1.0609474276194846, + "learning_rate": 7.875573448056255e-05, + "loss": 0.4184, + "step": 805 + }, + { + "epoch": 0.8592750533049041, + "grad_norm": 0.9156452459182889, + "learning_rate": 7.874835604329928e-05, + "loss": 0.4129, + "step": 806 + }, + { + "epoch": 0.8603411513859275, + "grad_norm": 0.7492113965064349, + "learning_rate": 7.874095614146057e-05, + "loss": 0.4134, + "step": 807 + }, + { + "epoch": 0.8614072494669509, + "grad_norm": 0.5003348192655799, + "learning_rate": 7.873353477914559e-05, + "loss": 0.4194, + "step": 808 + }, + { + "epoch": 0.8624733475479744, + "grad_norm": 0.5127452250412682, + "learning_rate": 7.872609196046537e-05, + "loss": 0.4098, + "step": 809 + }, + { + "epoch": 0.8635394456289979, + "grad_norm": 0.6332821308118973, + "learning_rate": 7.871862768954285e-05, + "loss": 0.4158, + "step": 810 + }, + { + "epoch": 0.8646055437100213, + "grad_norm": 0.7546855273362159, + "learning_rate": 7.871114197051289e-05, + "loss": 0.4182, + "step": 811 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 0.7220245458809873, + "learning_rate": 7.870363480752214e-05, + "loss": 0.4088, + "step": 812 + }, + { + "epoch": 0.8667377398720683, + "grad_norm": 0.6636256075414101, + "learning_rate": 7.869610620472918e-05, + "loss": 0.4169, + "step": 813 + }, + { + "epoch": 0.8678038379530917, + "grad_norm": 0.5862374738875737, + "learning_rate": 7.86885561663045e-05, + "loss": 0.4182, + "step": 814 + }, + { + "epoch": 0.8688699360341151, + "grad_norm": 0.47483648705484865, + "learning_rate": 7.868098469643039e-05, + "loss": 0.4151, + "step": 815 + }, + { + "epoch": 0.8699360341151386, + "grad_norm": 0.45874603100581357, + "learning_rate": 7.867339179930108e-05, + "loss": 0.4139, + "step": 816 + }, + { + "epoch": 0.8710021321961621, + "grad_norm": 0.5617556695876103, + "learning_rate": 7.866577747912262e-05, + "loss": 0.4123, + "step": 817 + }, + { + "epoch": 0.8720682302771855, + "grad_norm": 0.6132829210691816, + "learning_rate": 7.865814174011295e-05, + "loss": 0.4102, + "step": 818 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 0.6010015927279606, + "learning_rate": 7.86504845865019e-05, + "loss": 0.419, + "step": 819 + }, + { + "epoch": 0.8742004264392325, + "grad_norm": 0.7239480812231027, + "learning_rate": 7.864280602253109e-05, + "loss": 0.4208, + "step": 820 + }, + { + "epoch": 0.8752665245202559, + "grad_norm": 0.8304065457671317, + "learning_rate": 7.863510605245409e-05, + "loss": 0.4241, + "step": 821 + }, + { + "epoch": 0.8763326226012793, + "grad_norm": 0.9073014145416783, + "learning_rate": 7.862738468053625e-05, + "loss": 0.4175, + "step": 822 + }, + { + "epoch": 0.8773987206823027, + "grad_norm": 0.8823598817021182, + "learning_rate": 7.861964191105483e-05, + "loss": 0.4191, + "step": 823 + }, + { + "epoch": 0.8784648187633263, + "grad_norm": 0.7779060734831246, + "learning_rate": 7.861187774829891e-05, + "loss": 0.4101, + "step": 824 + }, + { + "epoch": 0.8795309168443497, + "grad_norm": 0.7580593362013702, + "learning_rate": 7.860409219656942e-05, + "loss": 0.4122, + "step": 825 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 0.877095795887976, + "learning_rate": 7.85962852601792e-05, + "loss": 0.4186, + "step": 826 + }, + { + "epoch": 0.8816631130063965, + "grad_norm": 0.965490750532556, + "learning_rate": 7.858845694345283e-05, + "loss": 0.4114, + "step": 827 + }, + { + "epoch": 0.8827292110874201, + "grad_norm": 0.9339934525386837, + "learning_rate": 7.858060725072682e-05, + "loss": 0.4119, + "step": 828 + }, + { + "epoch": 0.8837953091684435, + "grad_norm": 0.7470003853390679, + "learning_rate": 7.857273618634949e-05, + "loss": 0.411, + "step": 829 + }, + { + "epoch": 0.8848614072494669, + "grad_norm": 0.5531734893024891, + "learning_rate": 7.8564843754681e-05, + "loss": 0.4061, + "step": 830 + }, + { + "epoch": 0.8859275053304904, + "grad_norm": 0.569407252019787, + "learning_rate": 7.855692996009332e-05, + "loss": 0.409, + "step": 831 + }, + { + "epoch": 0.8869936034115139, + "grad_norm": 0.6880810896792248, + "learning_rate": 7.854899480697033e-05, + "loss": 0.4112, + "step": 832 + }, + { + "epoch": 0.8880597014925373, + "grad_norm": 0.7188833271346624, + "learning_rate": 7.854103829970765e-05, + "loss": 0.4159, + "step": 833 + }, + { + "epoch": 0.8891257995735607, + "grad_norm": 0.6200962748194073, + "learning_rate": 7.853306044271281e-05, + "loss": 0.4104, + "step": 834 + }, + { + "epoch": 0.8901918976545842, + "grad_norm": 0.6029255662836998, + "learning_rate": 7.852506124040509e-05, + "loss": 0.41, + "step": 835 + }, + { + "epoch": 0.8912579957356077, + "grad_norm": 0.5455766236674676, + "learning_rate": 7.851704069721567e-05, + "loss": 0.4155, + "step": 836 + }, + { + "epoch": 0.8923240938166311, + "grad_norm": 0.59815834722666, + "learning_rate": 7.850899881758746e-05, + "loss": 0.4099, + "step": 837 + }, + { + "epoch": 0.8933901918976546, + "grad_norm": 0.6269668637980782, + "learning_rate": 7.850093560597529e-05, + "loss": 0.4086, + "step": 838 + }, + { + "epoch": 0.894456289978678, + "grad_norm": 0.5257832120028597, + "learning_rate": 7.849285106684576e-05, + "loss": 0.41, + "step": 839 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.6116996403200065, + "learning_rate": 7.848474520467727e-05, + "loss": 0.4201, + "step": 840 + }, + { + "epoch": 0.896588486140725, + "grad_norm": 0.6371232956894439, + "learning_rate": 7.847661802396004e-05, + "loss": 0.412, + "step": 841 + }, + { + "epoch": 0.8976545842217484, + "grad_norm": 0.5766017052591853, + "learning_rate": 7.84684695291961e-05, + "loss": 0.4166, + "step": 842 + }, + { + "epoch": 0.8987206823027718, + "grad_norm": 0.6967805160059684, + "learning_rate": 7.846029972489932e-05, + "loss": 0.4134, + "step": 843 + }, + { + "epoch": 0.8997867803837953, + "grad_norm": 0.7884289858058776, + "learning_rate": 7.845210861559533e-05, + "loss": 0.4128, + "step": 844 + }, + { + "epoch": 0.9008528784648188, + "grad_norm": 0.7180820620379316, + "learning_rate": 7.84438962058216e-05, + "loss": 0.4151, + "step": 845 + }, + { + "epoch": 0.9019189765458422, + "grad_norm": 0.595379884431672, + "learning_rate": 7.843566250012734e-05, + "loss": 0.4197, + "step": 846 + }, + { + "epoch": 0.9029850746268657, + "grad_norm": 0.44736210153445316, + "learning_rate": 7.842740750307362e-05, + "loss": 0.4096, + "step": 847 + }, + { + "epoch": 0.9040511727078892, + "grad_norm": 0.41043276693031083, + "learning_rate": 7.841913121923327e-05, + "loss": 0.4179, + "step": 848 + }, + { + "epoch": 0.9051172707889126, + "grad_norm": 0.3848671265951802, + "learning_rate": 7.841083365319093e-05, + "loss": 0.4119, + "step": 849 + }, + { + "epoch": 0.906183368869936, + "grad_norm": 0.3978522634845732, + "learning_rate": 7.840251480954302e-05, + "loss": 0.4109, + "step": 850 + }, + { + "epoch": 0.9072494669509595, + "grad_norm": 0.5471539558932556, + "learning_rate": 7.839417469289773e-05, + "loss": 0.4137, + "step": 851 + }, + { + "epoch": 0.908315565031983, + "grad_norm": 0.6676260565465565, + "learning_rate": 7.838581330787508e-05, + "loss": 0.4132, + "step": 852 + }, + { + "epoch": 0.9093816631130064, + "grad_norm": 0.5826596489602703, + "learning_rate": 7.837743065910682e-05, + "loss": 0.4066, + "step": 853 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 0.5230535811454794, + "learning_rate": 7.83690267512365e-05, + "loss": 0.405, + "step": 854 + }, + { + "epoch": 0.9115138592750534, + "grad_norm": 0.5626160739139322, + "learning_rate": 7.836060158891947e-05, + "loss": 0.4089, + "step": 855 + }, + { + "epoch": 0.9125799573560768, + "grad_norm": 0.4697100322492265, + "learning_rate": 7.835215517682282e-05, + "loss": 0.4143, + "step": 856 + }, + { + "epoch": 0.9136460554371002, + "grad_norm": 0.36866479086378523, + "learning_rate": 7.834368751962542e-05, + "loss": 0.405, + "step": 857 + }, + { + "epoch": 0.9147121535181236, + "grad_norm": 0.4787182490306844, + "learning_rate": 7.833519862201791e-05, + "loss": 0.4144, + "step": 858 + }, + { + "epoch": 0.9157782515991472, + "grad_norm": 0.5878869019859624, + "learning_rate": 7.83266884887027e-05, + "loss": 0.4183, + "step": 859 + }, + { + "epoch": 0.9168443496801706, + "grad_norm": 0.7152505035199795, + "learning_rate": 7.831815712439397e-05, + "loss": 0.4095, + "step": 860 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 0.7805590910529553, + "learning_rate": 7.830960453381764e-05, + "loss": 0.4106, + "step": 861 + }, + { + "epoch": 0.9189765458422174, + "grad_norm": 0.7572280791863363, + "learning_rate": 7.830103072171142e-05, + "loss": 0.4102, + "step": 862 + }, + { + "epoch": 0.920042643923241, + "grad_norm": 0.7493776976712094, + "learning_rate": 7.829243569282473e-05, + "loss": 0.4088, + "step": 863 + }, + { + "epoch": 0.9211087420042644, + "grad_norm": 0.7609395686522328, + "learning_rate": 7.828381945191879e-05, + "loss": 0.4111, + "step": 864 + }, + { + "epoch": 0.9221748400852878, + "grad_norm": 0.8239887481859134, + "learning_rate": 7.827518200376654e-05, + "loss": 0.4195, + "step": 865 + }, + { + "epoch": 0.9232409381663113, + "grad_norm": 0.8016884574752428, + "learning_rate": 7.826652335315268e-05, + "loss": 0.4056, + "step": 866 + }, + { + "epoch": 0.9243070362473348, + "grad_norm": 0.7736961735336623, + "learning_rate": 7.825784350487365e-05, + "loss": 0.4184, + "step": 867 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 0.8458442232897897, + "learning_rate": 7.824914246373764e-05, + "loss": 0.4159, + "step": 868 + }, + { + "epoch": 0.9264392324093816, + "grad_norm": 0.9770784658028318, + "learning_rate": 7.824042023456458e-05, + "loss": 0.414, + "step": 869 + }, + { + "epoch": 0.9275053304904051, + "grad_norm": 0.9265743591320977, + "learning_rate": 7.823167682218611e-05, + "loss": 0.4145, + "step": 870 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.7609304260280078, + "learning_rate": 7.822291223144564e-05, + "loss": 0.4133, + "step": 871 + }, + { + "epoch": 0.929637526652452, + "grad_norm": 0.6283447272354892, + "learning_rate": 7.821412646719829e-05, + "loss": 0.412, + "step": 872 + }, + { + "epoch": 0.9307036247334755, + "grad_norm": 0.6090160541584229, + "learning_rate": 7.820531953431093e-05, + "loss": 0.4083, + "step": 873 + }, + { + "epoch": 0.9317697228144989, + "grad_norm": 0.7086782043281862, + "learning_rate": 7.819649143766215e-05, + "loss": 0.4121, + "step": 874 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 0.6602890170546788, + "learning_rate": 7.818764218214224e-05, + "loss": 0.403, + "step": 875 + }, + { + "epoch": 0.9339019189765458, + "grad_norm": 0.5431433609726145, + "learning_rate": 7.817877177265323e-05, + "loss": 0.4145, + "step": 876 + }, + { + "epoch": 0.9349680170575693, + "grad_norm": 0.5717886780149007, + "learning_rate": 7.816988021410885e-05, + "loss": 0.4162, + "step": 877 + }, + { + "epoch": 0.9360341151385928, + "grad_norm": 0.518417714023232, + "learning_rate": 7.816096751143459e-05, + "loss": 0.4058, + "step": 878 + }, + { + "epoch": 0.9371002132196162, + "grad_norm": 0.43803136693398226, + "learning_rate": 7.815203366956762e-05, + "loss": 0.4071, + "step": 879 + }, + { + "epoch": 0.9381663113006397, + "grad_norm": 0.5244904028290258, + "learning_rate": 7.814307869345682e-05, + "loss": 0.4214, + "step": 880 + }, + { + "epoch": 0.9392324093816631, + "grad_norm": 0.637533228472145, + "learning_rate": 7.813410258806275e-05, + "loss": 0.4178, + "step": 881 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 0.7047581414477792, + "learning_rate": 7.812510535835775e-05, + "loss": 0.4073, + "step": 882 + }, + { + "epoch": 0.94136460554371, + "grad_norm": 0.641172550785535, + "learning_rate": 7.811608700932582e-05, + "loss": 0.4103, + "step": 883 + }, + { + "epoch": 0.9424307036247335, + "grad_norm": 0.5994428458639327, + "learning_rate": 7.810704754596263e-05, + "loss": 0.4135, + "step": 884 + }, + { + "epoch": 0.9434968017057569, + "grad_norm": 0.6070504903362325, + "learning_rate": 7.809798697327558e-05, + "loss": 0.4168, + "step": 885 + }, + { + "epoch": 0.9445628997867804, + "grad_norm": 0.6389004815644979, + "learning_rate": 7.808890529628374e-05, + "loss": 0.4126, + "step": 886 + }, + { + "epoch": 0.9456289978678039, + "grad_norm": 0.5841071688845674, + "learning_rate": 7.807980252001791e-05, + "loss": 0.4163, + "step": 887 + }, + { + "epoch": 0.9466950959488273, + "grad_norm": 0.5441565394837324, + "learning_rate": 7.807067864952055e-05, + "loss": 0.4073, + "step": 888 + }, + { + "epoch": 0.9477611940298507, + "grad_norm": 0.5817224526693675, + "learning_rate": 7.806153368984583e-05, + "loss": 0.4095, + "step": 889 + }, + { + "epoch": 0.9488272921108742, + "grad_norm": 0.6150981739518016, + "learning_rate": 7.805236764605954e-05, + "loss": 0.4111, + "step": 890 + }, + { + "epoch": 0.9498933901918977, + "grad_norm": 0.6063228912727467, + "learning_rate": 7.804318052323922e-05, + "loss": 0.4118, + "step": 891 + }, + { + "epoch": 0.9509594882729211, + "grad_norm": 0.5902912758191065, + "learning_rate": 7.803397232647406e-05, + "loss": 0.4145, + "step": 892 + }, + { + "epoch": 0.9520255863539445, + "grad_norm": 0.5500221760128661, + "learning_rate": 7.80247430608649e-05, + "loss": 0.4173, + "step": 893 + }, + { + "epoch": 0.9530916844349681, + "grad_norm": 0.5579754203833759, + "learning_rate": 7.80154927315243e-05, + "loss": 0.4088, + "step": 894 + }, + { + "epoch": 0.9541577825159915, + "grad_norm": 0.6650648355002524, + "learning_rate": 7.800622134357644e-05, + "loss": 0.405, + "step": 895 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 0.7556125255322795, + "learning_rate": 7.799692890215721e-05, + "loss": 0.4125, + "step": 896 + }, + { + "epoch": 0.9562899786780383, + "grad_norm": 0.7165371046432817, + "learning_rate": 7.798761541241413e-05, + "loss": 0.4086, + "step": 897 + }, + { + "epoch": 0.9573560767590619, + "grad_norm": 0.6697196433407722, + "learning_rate": 7.797828087950637e-05, + "loss": 0.4125, + "step": 898 + }, + { + "epoch": 0.9584221748400853, + "grad_norm": 0.6629917240602252, + "learning_rate": 7.79689253086048e-05, + "loss": 0.4172, + "step": 899 + }, + { + "epoch": 0.9594882729211087, + "grad_norm": 0.6424942713817352, + "learning_rate": 7.795954870489191e-05, + "loss": 0.4131, + "step": 900 + }, + { + "epoch": 0.9605543710021321, + "grad_norm": 0.4594700765306279, + "learning_rate": 7.795015107356186e-05, + "loss": 0.4122, + "step": 901 + }, + { + "epoch": 0.9616204690831557, + "grad_norm": 0.40226566645245615, + "learning_rate": 7.794073241982043e-05, + "loss": 0.4141, + "step": 902 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 0.5260998304778581, + "learning_rate": 7.793129274888508e-05, + "loss": 0.4119, + "step": 903 + }, + { + "epoch": 0.9637526652452025, + "grad_norm": 0.628398237724531, + "learning_rate": 7.792183206598491e-05, + "loss": 0.4088, + "step": 904 + }, + { + "epoch": 0.964818763326226, + "grad_norm": 0.6607508208979607, + "learning_rate": 7.791235037636062e-05, + "loss": 0.4126, + "step": 905 + }, + { + "epoch": 0.9658848614072495, + "grad_norm": 0.7250208135809759, + "learning_rate": 7.79028476852646e-05, + "loss": 0.4159, + "step": 906 + }, + { + "epoch": 0.9669509594882729, + "grad_norm": 0.7410413741786345, + "learning_rate": 7.789332399796079e-05, + "loss": 0.4099, + "step": 907 + }, + { + "epoch": 0.9680170575692963, + "grad_norm": 0.6585611107689859, + "learning_rate": 7.78837793197249e-05, + "loss": 0.4099, + "step": 908 + }, + { + "epoch": 0.9690831556503199, + "grad_norm": 0.6090162859858071, + "learning_rate": 7.787421365584414e-05, + "loss": 0.4113, + "step": 909 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.6733319020369772, + "learning_rate": 7.786462701161738e-05, + "loss": 0.4093, + "step": 910 + }, + { + "epoch": 0.9712153518123667, + "grad_norm": 0.7485470209020791, + "learning_rate": 7.785501939235513e-05, + "loss": 0.4141, + "step": 911 + }, + { + "epoch": 0.9722814498933902, + "grad_norm": 0.6601939210618039, + "learning_rate": 7.784539080337955e-05, + "loss": 0.4079, + "step": 912 + }, + { + "epoch": 0.9733475479744137, + "grad_norm": 0.5436166438340034, + "learning_rate": 7.783574125002432e-05, + "loss": 0.4041, + "step": 913 + }, + { + "epoch": 0.9744136460554371, + "grad_norm": 0.617167761753248, + "learning_rate": 7.782607073763484e-05, + "loss": 0.4086, + "step": 914 + }, + { + "epoch": 0.9754797441364605, + "grad_norm": 0.6443414561615247, + "learning_rate": 7.781637927156804e-05, + "loss": 0.4104, + "step": 915 + }, + { + "epoch": 0.976545842217484, + "grad_norm": 0.5741265835867478, + "learning_rate": 7.780666685719249e-05, + "loss": 0.4148, + "step": 916 + }, + { + "epoch": 0.9776119402985075, + "grad_norm": 0.42635079769878254, + "learning_rate": 7.779693349988839e-05, + "loss": 0.4048, + "step": 917 + }, + { + "epoch": 0.9786780383795309, + "grad_norm": 0.47310627157488677, + "learning_rate": 7.77871792050475e-05, + "loss": 0.409, + "step": 918 + }, + { + "epoch": 0.9797441364605544, + "grad_norm": 0.5694527852620955, + "learning_rate": 7.777740397807316e-05, + "loss": 0.4093, + "step": 919 + }, + { + "epoch": 0.9808102345415778, + "grad_norm": 0.5924875514072048, + "learning_rate": 7.776760782438038e-05, + "loss": 0.4111, + "step": 920 + }, + { + "epoch": 0.9818763326226013, + "grad_norm": 0.49411522824636867, + "learning_rate": 7.775779074939571e-05, + "loss": 0.4154, + "step": 921 + }, + { + "epoch": 0.9829424307036247, + "grad_norm": 0.5511529858288959, + "learning_rate": 7.77479527585573e-05, + "loss": 0.4132, + "step": 922 + }, + { + "epoch": 0.9840085287846482, + "grad_norm": 0.6881028326855904, + "learning_rate": 7.773809385731487e-05, + "loss": 0.4013, + "step": 923 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.8172075890856048, + "learning_rate": 7.772821405112974e-05, + "loss": 0.417, + "step": 924 + }, + { + "epoch": 0.9861407249466951, + "grad_norm": 0.9223848954361227, + "learning_rate": 7.771831334547483e-05, + "loss": 0.4071, + "step": 925 + }, + { + "epoch": 0.9872068230277186, + "grad_norm": 0.9603810119097379, + "learning_rate": 7.77083917458346e-05, + "loss": 0.4114, + "step": 926 + }, + { + "epoch": 0.988272921108742, + "grad_norm": 1.0088704018315873, + "learning_rate": 7.769844925770512e-05, + "loss": 0.4168, + "step": 927 + }, + { + "epoch": 0.9893390191897654, + "grad_norm": 0.9263247355978909, + "learning_rate": 7.7688485886594e-05, + "loss": 0.4068, + "step": 928 + }, + { + "epoch": 0.990405117270789, + "grad_norm": 0.6336714165848655, + "learning_rate": 7.76785016380204e-05, + "loss": 0.4039, + "step": 929 + }, + { + "epoch": 0.9914712153518124, + "grad_norm": 0.3609254347454348, + "learning_rate": 7.766849651751512e-05, + "loss": 0.4094, + "step": 930 + }, + { + "epoch": 0.9925373134328358, + "grad_norm": 0.37973843808767777, + "learning_rate": 7.765847053062046e-05, + "loss": 0.4124, + "step": 931 + }, + { + "epoch": 0.9936034115138592, + "grad_norm": 0.6199111977710898, + "learning_rate": 7.764842368289028e-05, + "loss": 0.4065, + "step": 932 + }, + { + "epoch": 0.9946695095948828, + "grad_norm": 0.7388812044022451, + "learning_rate": 7.763835597989005e-05, + "loss": 0.4126, + "step": 933 + }, + { + "epoch": 0.9957356076759062, + "grad_norm": 0.6631941161213446, + "learning_rate": 7.762826742719672e-05, + "loss": 0.4077, + "step": 934 + }, + { + "epoch": 0.9968017057569296, + "grad_norm": 0.5013692544152526, + "learning_rate": 7.761815803039883e-05, + "loss": 0.4088, + "step": 935 + }, + { + "epoch": 0.997867803837953, + "grad_norm": 0.5138742991856071, + "learning_rate": 7.760802779509647e-05, + "loss": 0.4109, + "step": 936 + }, + { + "epoch": 0.9989339019189766, + "grad_norm": 0.5647725207929322, + "learning_rate": 7.759787672690124e-05, + "loss": 0.4136, + "step": 937 + }, + { + "epoch": 1.0, + "grad_norm": 0.49646274447841793, + "learning_rate": 7.758770483143634e-05, + "loss": 0.4046, + "step": 938 + }, + { + "epoch": 1.0010660980810235, + "grad_norm": 0.41415428574276214, + "learning_rate": 7.757751211433646e-05, + "loss": 0.3978, + "step": 939 + }, + { + "epoch": 1.0021321961620469, + "grad_norm": 0.4418990401143746, + "learning_rate": 7.75672985812478e-05, + "loss": 0.4007, + "step": 940 + }, + { + "epoch": 1.0031982942430704, + "grad_norm": 0.5702542511470434, + "learning_rate": 7.75570642378282e-05, + "loss": 0.4008, + "step": 941 + }, + { + "epoch": 1.004264392324094, + "grad_norm": 0.6510908049787664, + "learning_rate": 7.754680908974687e-05, + "loss": 0.4006, + "step": 942 + }, + { + "epoch": 1.0053304904051172, + "grad_norm": 0.7117289661008723, + "learning_rate": 7.75365331426847e-05, + "loss": 0.4029, + "step": 943 + }, + { + "epoch": 1.0063965884861408, + "grad_norm": 0.8003621078354415, + "learning_rate": 7.752623640233398e-05, + "loss": 0.4028, + "step": 944 + }, + { + "epoch": 1.007462686567164, + "grad_norm": 0.7457196896824791, + "learning_rate": 7.751591887439859e-05, + "loss": 0.4077, + "step": 945 + }, + { + "epoch": 1.0085287846481876, + "grad_norm": 0.650649322119117, + "learning_rate": 7.75055805645939e-05, + "loss": 0.4018, + "step": 946 + }, + { + "epoch": 1.0095948827292112, + "grad_norm": 0.7354343157171254, + "learning_rate": 7.749522147864681e-05, + "loss": 0.397, + "step": 947 + }, + { + "epoch": 1.0106609808102345, + "grad_norm": 0.8392238195122467, + "learning_rate": 7.748484162229572e-05, + "loss": 0.3956, + "step": 948 + }, + { + "epoch": 1.011727078891258, + "grad_norm": 0.6811125765381959, + "learning_rate": 7.747444100129048e-05, + "loss": 0.4008, + "step": 949 + }, + { + "epoch": 1.0127931769722816, + "grad_norm": 0.5072610455516853, + "learning_rate": 7.746401962139255e-05, + "loss": 0.3975, + "step": 950 + }, + { + "epoch": 1.0138592750533049, + "grad_norm": 0.475527294382562, + "learning_rate": 7.745357748837482e-05, + "loss": 0.3977, + "step": 951 + }, + { + "epoch": 1.0149253731343284, + "grad_norm": 0.6012477231886632, + "learning_rate": 7.744311460802166e-05, + "loss": 0.4016, + "step": 952 + }, + { + "epoch": 1.0159914712153517, + "grad_norm": 0.6774620182327961, + "learning_rate": 7.7432630986129e-05, + "loss": 0.395, + "step": 953 + }, + { + "epoch": 1.0170575692963753, + "grad_norm": 0.6361012701129908, + "learning_rate": 7.742212662850421e-05, + "loss": 0.3961, + "step": 954 + }, + { + "epoch": 1.0181236673773988, + "grad_norm": 0.4593566880462273, + "learning_rate": 7.741160154096614e-05, + "loss": 0.4015, + "step": 955 + }, + { + "epoch": 1.019189765458422, + "grad_norm": 0.4253384790692246, + "learning_rate": 7.740105572934516e-05, + "loss": 0.3969, + "step": 956 + }, + { + "epoch": 1.0202558635394456, + "grad_norm": 0.5377056528363119, + "learning_rate": 7.739048919948309e-05, + "loss": 0.3925, + "step": 957 + }, + { + "epoch": 1.0213219616204692, + "grad_norm": 0.4792515921280752, + "learning_rate": 7.737990195723325e-05, + "loss": 0.3954, + "step": 958 + }, + { + "epoch": 1.0223880597014925, + "grad_norm": 0.37572937988220495, + "learning_rate": 7.736929400846041e-05, + "loss": 0.3946, + "step": 959 + }, + { + "epoch": 1.023454157782516, + "grad_norm": 0.4748049142013816, + "learning_rate": 7.735866535904083e-05, + "loss": 0.3938, + "step": 960 + }, + { + "epoch": 1.0245202558635393, + "grad_norm": 0.5674490975829488, + "learning_rate": 7.734801601486224e-05, + "loss": 0.4053, + "step": 961 + }, + { + "epoch": 1.0255863539445629, + "grad_norm": 0.49630911394420096, + "learning_rate": 7.733734598182379e-05, + "loss": 0.391, + "step": 962 + }, + { + "epoch": 1.0266524520255864, + "grad_norm": 0.41816138182017965, + "learning_rate": 7.732665526583616e-05, + "loss": 0.3974, + "step": 963 + }, + { + "epoch": 1.0277185501066097, + "grad_norm": 0.48628841101926257, + "learning_rate": 7.731594387282144e-05, + "loss": 0.3897, + "step": 964 + }, + { + "epoch": 1.0287846481876333, + "grad_norm": 0.5327676523101473, + "learning_rate": 7.730521180871317e-05, + "loss": 0.3958, + "step": 965 + }, + { + "epoch": 1.0298507462686568, + "grad_norm": 0.5698192622774071, + "learning_rate": 7.729445907945637e-05, + "loss": 0.3996, + "step": 966 + }, + { + "epoch": 1.0309168443496801, + "grad_norm": 0.6600227288417884, + "learning_rate": 7.728368569100749e-05, + "loss": 0.398, + "step": 967 + }, + { + "epoch": 1.0319829424307037, + "grad_norm": 0.7199584858108914, + "learning_rate": 7.727289164933443e-05, + "loss": 0.3952, + "step": 968 + }, + { + "epoch": 1.033049040511727, + "grad_norm": 0.6780630536094696, + "learning_rate": 7.726207696041653e-05, + "loss": 0.3901, + "step": 969 + }, + { + "epoch": 1.0341151385927505, + "grad_norm": 0.6051350564389102, + "learning_rate": 7.725124163024456e-05, + "loss": 0.4027, + "step": 970 + }, + { + "epoch": 1.035181236673774, + "grad_norm": 0.6703569464507834, + "learning_rate": 7.724038566482073e-05, + "loss": 0.3982, + "step": 971 + }, + { + "epoch": 1.0362473347547974, + "grad_norm": 0.701997393773405, + "learning_rate": 7.722950907015867e-05, + "loss": 0.4064, + "step": 972 + }, + { + "epoch": 1.037313432835821, + "grad_norm": 0.7033610282992381, + "learning_rate": 7.721861185228347e-05, + "loss": 0.4024, + "step": 973 + }, + { + "epoch": 1.0383795309168444, + "grad_norm": 0.7094358785590401, + "learning_rate": 7.72076940172316e-05, + "loss": 0.4075, + "step": 974 + }, + { + "epoch": 1.0394456289978677, + "grad_norm": 0.5778245595035633, + "learning_rate": 7.719675557105101e-05, + "loss": 0.399, + "step": 975 + }, + { + "epoch": 1.0405117270788913, + "grad_norm": 0.495341844608196, + "learning_rate": 7.718579651980099e-05, + "loss": 0.3987, + "step": 976 + }, + { + "epoch": 1.0415778251599148, + "grad_norm": 0.5705131082078807, + "learning_rate": 7.717481686955231e-05, + "loss": 0.4012, + "step": 977 + }, + { + "epoch": 1.0426439232409381, + "grad_norm": 0.7460655829453172, + "learning_rate": 7.71638166263871e-05, + "loss": 0.3996, + "step": 978 + }, + { + "epoch": 1.0437100213219617, + "grad_norm": 0.8323692131007797, + "learning_rate": 7.715279579639895e-05, + "loss": 0.3893, + "step": 979 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.7432520305478516, + "learning_rate": 7.714175438569282e-05, + "loss": 0.398, + "step": 980 + }, + { + "epoch": 1.0458422174840085, + "grad_norm": 0.6728948975330855, + "learning_rate": 7.713069240038506e-05, + "loss": 0.4017, + "step": 981 + }, + { + "epoch": 1.046908315565032, + "grad_norm": 0.4983139656283151, + "learning_rate": 7.711960984660346e-05, + "loss": 0.394, + "step": 982 + }, + { + "epoch": 1.0479744136460554, + "grad_norm": 0.33754531605342536, + "learning_rate": 7.710850673048717e-05, + "loss": 0.3944, + "step": 983 + }, + { + "epoch": 1.049040511727079, + "grad_norm": 0.3725573631734465, + "learning_rate": 7.709738305818674e-05, + "loss": 0.4005, + "step": 984 + }, + { + "epoch": 1.0501066098081024, + "grad_norm": 0.4551248038249476, + "learning_rate": 7.708623883586409e-05, + "loss": 0.3971, + "step": 985 + }, + { + "epoch": 1.0511727078891258, + "grad_norm": 0.41338624681327313, + "learning_rate": 7.707507406969256e-05, + "loss": 0.4075, + "step": 986 + }, + { + "epoch": 1.0522388059701493, + "grad_norm": 0.3693936866798565, + "learning_rate": 7.706388876585685e-05, + "loss": 0.4035, + "step": 987 + }, + { + "epoch": 1.0533049040511726, + "grad_norm": 0.3457954659560392, + "learning_rate": 7.705268293055302e-05, + "loss": 0.3988, + "step": 988 + }, + { + "epoch": 1.0543710021321961, + "grad_norm": 0.3988913386622119, + "learning_rate": 7.704145656998853e-05, + "loss": 0.3896, + "step": 989 + }, + { + "epoch": 1.0554371002132197, + "grad_norm": 0.48450611938317545, + "learning_rate": 7.703020969038222e-05, + "loss": 0.392, + "step": 990 + }, + { + "epoch": 1.056503198294243, + "grad_norm": 0.5311541916427419, + "learning_rate": 7.701894229796424e-05, + "loss": 0.3952, + "step": 991 + }, + { + "epoch": 1.0575692963752665, + "grad_norm": 0.5299103995739938, + "learning_rate": 7.700765439897616e-05, + "loss": 0.3959, + "step": 992 + }, + { + "epoch": 1.05863539445629, + "grad_norm": 0.5180100310098522, + "learning_rate": 7.69963459996709e-05, + "loss": 0.3953, + "step": 993 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 0.6335245544459096, + "learning_rate": 7.69850171063127e-05, + "loss": 0.3963, + "step": 994 + }, + { + "epoch": 1.060767590618337, + "grad_norm": 0.7565263057674985, + "learning_rate": 7.697366772517719e-05, + "loss": 0.4011, + "step": 995 + }, + { + "epoch": 1.0618336886993602, + "grad_norm": 0.7626681326172062, + "learning_rate": 7.696229786255136e-05, + "loss": 0.3973, + "step": 996 + }, + { + "epoch": 1.0628997867803838, + "grad_norm": 0.7180860950610816, + "learning_rate": 7.695090752473348e-05, + "loss": 0.4045, + "step": 997 + }, + { + "epoch": 1.0639658848614073, + "grad_norm": 0.6227425406154337, + "learning_rate": 7.693949671803323e-05, + "loss": 0.3976, + "step": 998 + }, + { + "epoch": 1.0650319829424306, + "grad_norm": 0.7201441487555086, + "learning_rate": 7.69280654487716e-05, + "loss": 0.3985, + "step": 999 + }, + { + "epoch": 1.0660980810234542, + "grad_norm": 0.8997609053026269, + "learning_rate": 7.691661372328093e-05, + "loss": 0.3986, + "step": 1000 + }, + { + "epoch": 1.0671641791044777, + "grad_norm": 1.000428399051643, + "learning_rate": 7.690514154790485e-05, + "loss": 0.3952, + "step": 1001 + }, + { + "epoch": 1.068230277185501, + "grad_norm": 0.8713614689438668, + "learning_rate": 7.689364892899838e-05, + "loss": 0.397, + "step": 1002 + }, + { + "epoch": 1.0692963752665245, + "grad_norm": 0.7060560170201216, + "learning_rate": 7.688213587292783e-05, + "loss": 0.398, + "step": 1003 + }, + { + "epoch": 1.070362473347548, + "grad_norm": 0.5741051379265467, + "learning_rate": 7.687060238607082e-05, + "loss": 0.3979, + "step": 1004 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.46041215316462103, + "learning_rate": 7.685904847481631e-05, + "loss": 0.3959, + "step": 1005 + }, + { + "epoch": 1.072494669509595, + "grad_norm": 0.3666940747393567, + "learning_rate": 7.684747414556457e-05, + "loss": 0.3933, + "step": 1006 + }, + { + "epoch": 1.0735607675906182, + "grad_norm": 0.3941609183280189, + "learning_rate": 7.683587940472716e-05, + "loss": 0.4004, + "step": 1007 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 0.46186462913236076, + "learning_rate": 7.6824264258727e-05, + "loss": 0.4004, + "step": 1008 + }, + { + "epoch": 1.0756929637526653, + "grad_norm": 0.5333763291785089, + "learning_rate": 7.681262871399824e-05, + "loss": 0.3989, + "step": 1009 + }, + { + "epoch": 1.0767590618336886, + "grad_norm": 0.4861997016882574, + "learning_rate": 7.680097277698637e-05, + "loss": 0.3974, + "step": 1010 + }, + { + "epoch": 1.0778251599147122, + "grad_norm": 0.420953916020597, + "learning_rate": 7.678929645414822e-05, + "loss": 0.4036, + "step": 1011 + }, + { + "epoch": 1.0788912579957357, + "grad_norm": 0.41234855570784623, + "learning_rate": 7.67775997519518e-05, + "loss": 0.3946, + "step": 1012 + }, + { + "epoch": 1.079957356076759, + "grad_norm": 0.542278036057536, + "learning_rate": 7.676588267687651e-05, + "loss": 0.3957, + "step": 1013 + }, + { + "epoch": 1.0810234541577826, + "grad_norm": 0.6900107120803527, + "learning_rate": 7.6754145235413e-05, + "loss": 0.3974, + "step": 1014 + }, + { + "epoch": 1.0820895522388059, + "grad_norm": 0.7866757471169271, + "learning_rate": 7.674238743406319e-05, + "loss": 0.4015, + "step": 1015 + }, + { + "epoch": 1.0831556503198294, + "grad_norm": 0.8352910554164142, + "learning_rate": 7.673060927934032e-05, + "loss": 0.404, + "step": 1016 + }, + { + "epoch": 1.084221748400853, + "grad_norm": 0.7695238174542417, + "learning_rate": 7.671881077776884e-05, + "loss": 0.3926, + "step": 1017 + }, + { + "epoch": 1.0852878464818763, + "grad_norm": 0.5536485102257792, + "learning_rate": 7.670699193588453e-05, + "loss": 0.3985, + "step": 1018 + }, + { + "epoch": 1.0863539445628998, + "grad_norm": 0.4228656833300754, + "learning_rate": 7.66951527602344e-05, + "loss": 0.3996, + "step": 1019 + }, + { + "epoch": 1.0874200426439233, + "grad_norm": 0.46124002487349797, + "learning_rate": 7.668329325737674e-05, + "loss": 0.3891, + "step": 1020 + }, + { + "epoch": 1.0884861407249466, + "grad_norm": 0.4882167434505241, + "learning_rate": 7.667141343388111e-05, + "loss": 0.4026, + "step": 1021 + }, + { + "epoch": 1.0895522388059702, + "grad_norm": 0.44551496006705965, + "learning_rate": 7.665951329632829e-05, + "loss": 0.4028, + "step": 1022 + }, + { + "epoch": 1.0906183368869935, + "grad_norm": 0.3735062426653466, + "learning_rate": 7.664759285131039e-05, + "loss": 0.3934, + "step": 1023 + }, + { + "epoch": 1.091684434968017, + "grad_norm": 0.3517866272882918, + "learning_rate": 7.663565210543065e-05, + "loss": 0.3908, + "step": 1024 + }, + { + "epoch": 1.0927505330490406, + "grad_norm": 0.36578234461021986, + "learning_rate": 7.662369106530367e-05, + "loss": 0.401, + "step": 1025 + }, + { + "epoch": 1.0938166311300639, + "grad_norm": 0.37034761078040107, + "learning_rate": 7.661170973755523e-05, + "loss": 0.3963, + "step": 1026 + }, + { + "epoch": 1.0948827292110874, + "grad_norm": 0.44787182009231896, + "learning_rate": 7.659970812882236e-05, + "loss": 0.396, + "step": 1027 + }, + { + "epoch": 1.095948827292111, + "grad_norm": 0.46802593434023804, + "learning_rate": 7.658768624575331e-05, + "loss": 0.3895, + "step": 1028 + }, + { + "epoch": 1.0970149253731343, + "grad_norm": 0.46152503018911306, + "learning_rate": 7.657564409500763e-05, + "loss": 0.4017, + "step": 1029 + }, + { + "epoch": 1.0980810234541578, + "grad_norm": 0.46014261998376804, + "learning_rate": 7.6563581683256e-05, + "loss": 0.3986, + "step": 1030 + }, + { + "epoch": 1.0991471215351813, + "grad_norm": 0.5602013316636195, + "learning_rate": 7.655149901718038e-05, + "loss": 0.3998, + "step": 1031 + }, + { + "epoch": 1.1002132196162047, + "grad_norm": 0.6361354791978112, + "learning_rate": 7.653939610347393e-05, + "loss": 0.3931, + "step": 1032 + }, + { + "epoch": 1.1012793176972282, + "grad_norm": 0.6096074276843847, + "learning_rate": 7.652727294884107e-05, + "loss": 0.403, + "step": 1033 + }, + { + "epoch": 1.1023454157782515, + "grad_norm": 0.6436284856894914, + "learning_rate": 7.651512955999737e-05, + "loss": 0.4062, + "step": 1034 + }, + { + "epoch": 1.103411513859275, + "grad_norm": 0.7789944154844247, + "learning_rate": 7.650296594366962e-05, + "loss": 0.4001, + "step": 1035 + }, + { + "epoch": 1.1044776119402986, + "grad_norm": 0.7656806635135831, + "learning_rate": 7.649078210659587e-05, + "loss": 0.3941, + "step": 1036 + }, + { + "epoch": 1.105543710021322, + "grad_norm": 0.6743206765604671, + "learning_rate": 7.647857805552532e-05, + "loss": 0.3982, + "step": 1037 + }, + { + "epoch": 1.1066098081023454, + "grad_norm": 0.6738594516444768, + "learning_rate": 7.646635379721837e-05, + "loss": 0.3995, + "step": 1038 + }, + { + "epoch": 1.1076759061833688, + "grad_norm": 0.5665383403100596, + "learning_rate": 7.645410933844663e-05, + "loss": 0.3996, + "step": 1039 + }, + { + "epoch": 1.1087420042643923, + "grad_norm": 0.4665621519906685, + "learning_rate": 7.644184468599289e-05, + "loss": 0.3986, + "step": 1040 + }, + { + "epoch": 1.1098081023454158, + "grad_norm": 0.39730689152255527, + "learning_rate": 7.642955984665113e-05, + "loss": 0.4026, + "step": 1041 + }, + { + "epoch": 1.1108742004264391, + "grad_norm": 0.4095924968703311, + "learning_rate": 7.641725482722651e-05, + "loss": 0.3944, + "step": 1042 + }, + { + "epoch": 1.1119402985074627, + "grad_norm": 0.42742701569060193, + "learning_rate": 7.640492963453538e-05, + "loss": 0.4004, + "step": 1043 + }, + { + "epoch": 1.1130063965884862, + "grad_norm": 0.4714508774381032, + "learning_rate": 7.639258427540526e-05, + "loss": 0.3923, + "step": 1044 + }, + { + "epoch": 1.1140724946695095, + "grad_norm": 0.43458932184535304, + "learning_rate": 7.638021875667483e-05, + "loss": 0.4013, + "step": 1045 + }, + { + "epoch": 1.115138592750533, + "grad_norm": 0.4337664712771011, + "learning_rate": 7.636783308519394e-05, + "loss": 0.3971, + "step": 1046 + }, + { + "epoch": 1.1162046908315566, + "grad_norm": 0.5374420364179895, + "learning_rate": 7.63554272678236e-05, + "loss": 0.3987, + "step": 1047 + }, + { + "epoch": 1.11727078891258, + "grad_norm": 0.599484068723148, + "learning_rate": 7.634300131143601e-05, + "loss": 0.4031, + "step": 1048 + }, + { + "epoch": 1.1183368869936035, + "grad_norm": 0.5397453012699273, + "learning_rate": 7.63305552229145e-05, + "loss": 0.3929, + "step": 1049 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.49396537617545705, + "learning_rate": 7.631808900915355e-05, + "loss": 0.4011, + "step": 1050 + }, + { + "epoch": 1.1204690831556503, + "grad_norm": 0.5246477358086288, + "learning_rate": 7.63056026770588e-05, + "loss": 0.395, + "step": 1051 + }, + { + "epoch": 1.1215351812366738, + "grad_norm": 0.4694349641844175, + "learning_rate": 7.6293096233547e-05, + "loss": 0.4, + "step": 1052 + }, + { + "epoch": 1.1226012793176972, + "grad_norm": 0.49101259479940984, + "learning_rate": 7.628056968554613e-05, + "loss": 0.4023, + "step": 1053 + }, + { + "epoch": 1.1236673773987207, + "grad_norm": 0.5743556431588469, + "learning_rate": 7.626802303999519e-05, + "loss": 0.3978, + "step": 1054 + }, + { + "epoch": 1.1247334754797442, + "grad_norm": 0.5806632736863245, + "learning_rate": 7.62554563038444e-05, + "loss": 0.3934, + "step": 1055 + }, + { + "epoch": 1.1257995735607675, + "grad_norm": 0.5458834450664017, + "learning_rate": 7.624286948405506e-05, + "loss": 0.4, + "step": 1056 + }, + { + "epoch": 1.126865671641791, + "grad_norm": 0.5294205958455052, + "learning_rate": 7.623026258759963e-05, + "loss": 0.4058, + "step": 1057 + }, + { + "epoch": 1.1279317697228146, + "grad_norm": 0.517163593116652, + "learning_rate": 7.621763562146167e-05, + "loss": 0.3987, + "step": 1058 + }, + { + "epoch": 1.128997867803838, + "grad_norm": 0.4383142153955423, + "learning_rate": 7.620498859263584e-05, + "loss": 0.3953, + "step": 1059 + }, + { + "epoch": 1.1300639658848615, + "grad_norm": 0.48624167313347716, + "learning_rate": 7.619232150812799e-05, + "loss": 0.4012, + "step": 1060 + }, + { + "epoch": 1.1311300639658848, + "grad_norm": 0.5650158980091758, + "learning_rate": 7.617963437495498e-05, + "loss": 0.3989, + "step": 1061 + }, + { + "epoch": 1.1321961620469083, + "grad_norm": 0.6366745345835356, + "learning_rate": 7.616692720014484e-05, + "loss": 0.3948, + "step": 1062 + }, + { + "epoch": 1.1332622601279319, + "grad_norm": 0.7232155780875018, + "learning_rate": 7.615419999073667e-05, + "loss": 0.3894, + "step": 1063 + }, + { + "epoch": 1.1343283582089552, + "grad_norm": 0.8621803906529987, + "learning_rate": 7.614145275378072e-05, + "loss": 0.4004, + "step": 1064 + }, + { + "epoch": 1.1353944562899787, + "grad_norm": 1.031817218923734, + "learning_rate": 7.612868549633825e-05, + "loss": 0.3913, + "step": 1065 + }, + { + "epoch": 1.136460554371002, + "grad_norm": 0.995716991471701, + "learning_rate": 7.611589822548168e-05, + "loss": 0.3989, + "step": 1066 + }, + { + "epoch": 1.1375266524520256, + "grad_norm": 0.9135720092202024, + "learning_rate": 7.61030909482945e-05, + "loss": 0.4013, + "step": 1067 + }, + { + "epoch": 1.138592750533049, + "grad_norm": 0.7949810308021543, + "learning_rate": 7.609026367187125e-05, + "loss": 0.3977, + "step": 1068 + }, + { + "epoch": 1.1396588486140724, + "grad_norm": 0.7103297637427972, + "learning_rate": 7.607741640331761e-05, + "loss": 0.3925, + "step": 1069 + }, + { + "epoch": 1.140724946695096, + "grad_norm": 0.6765477797632053, + "learning_rate": 7.606454914975029e-05, + "loss": 0.3957, + "step": 1070 + }, + { + "epoch": 1.1417910447761195, + "grad_norm": 0.6061563911680509, + "learning_rate": 7.605166191829705e-05, + "loss": 0.3952, + "step": 1071 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.6115108481298734, + "learning_rate": 7.603875471609677e-05, + "loss": 0.4006, + "step": 1072 + }, + { + "epoch": 1.1439232409381663, + "grad_norm": 0.7609527472496548, + "learning_rate": 7.602582755029938e-05, + "loss": 0.3987, + "step": 1073 + }, + { + "epoch": 1.1449893390191899, + "grad_norm": 0.7173940431562806, + "learning_rate": 7.601288042806582e-05, + "loss": 0.3995, + "step": 1074 + }, + { + "epoch": 1.1460554371002132, + "grad_norm": 0.5756472561097495, + "learning_rate": 7.599991335656817e-05, + "loss": 0.3929, + "step": 1075 + }, + { + "epoch": 1.1471215351812367, + "grad_norm": 0.5051191561774657, + "learning_rate": 7.598692634298949e-05, + "loss": 0.3974, + "step": 1076 + }, + { + "epoch": 1.14818763326226, + "grad_norm": 0.506878322162707, + "learning_rate": 7.59739193945239e-05, + "loss": 0.3988, + "step": 1077 + }, + { + "epoch": 1.1492537313432836, + "grad_norm": 0.4945186749103627, + "learning_rate": 7.596089251837659e-05, + "loss": 0.3967, + "step": 1078 + }, + { + "epoch": 1.150319829424307, + "grad_norm": 0.5094561991761961, + "learning_rate": 7.594784572176378e-05, + "loss": 0.3945, + "step": 1079 + }, + { + "epoch": 1.1513859275053304, + "grad_norm": 0.5005613849300656, + "learning_rate": 7.593477901191268e-05, + "loss": 0.3962, + "step": 1080 + }, + { + "epoch": 1.152452025586354, + "grad_norm": 0.39722111509884156, + "learning_rate": 7.592169239606161e-05, + "loss": 0.407, + "step": 1081 + }, + { + "epoch": 1.1535181236673775, + "grad_norm": 0.3086629383868203, + "learning_rate": 7.590858588145985e-05, + "loss": 0.3968, + "step": 1082 + }, + { + "epoch": 1.1545842217484008, + "grad_norm": 0.32105906743248047, + "learning_rate": 7.589545947536774e-05, + "loss": 0.4023, + "step": 1083 + }, + { + "epoch": 1.1556503198294243, + "grad_norm": 0.393821577059538, + "learning_rate": 7.588231318505661e-05, + "loss": 0.4, + "step": 1084 + }, + { + "epoch": 1.1567164179104479, + "grad_norm": 0.44231624171196116, + "learning_rate": 7.586914701780886e-05, + "loss": 0.4008, + "step": 1085 + }, + { + "epoch": 1.1577825159914712, + "grad_norm": 0.4026841091750872, + "learning_rate": 7.585596098091782e-05, + "loss": 0.3966, + "step": 1086 + }, + { + "epoch": 1.1588486140724947, + "grad_norm": 0.4378761127318457, + "learning_rate": 7.58427550816879e-05, + "loss": 0.3967, + "step": 1087 + }, + { + "epoch": 1.159914712153518, + "grad_norm": 0.5373953761219459, + "learning_rate": 7.582952932743445e-05, + "loss": 0.4052, + "step": 1088 + }, + { + "epoch": 1.1609808102345416, + "grad_norm": 0.6376217391121629, + "learning_rate": 7.581628372548388e-05, + "loss": 0.3968, + "step": 1089 + }, + { + "epoch": 1.1620469083155651, + "grad_norm": 0.7124571886953409, + "learning_rate": 7.580301828317354e-05, + "loss": 0.3967, + "step": 1090 + }, + { + "epoch": 1.1631130063965884, + "grad_norm": 0.710369838726693, + "learning_rate": 7.578973300785182e-05, + "loss": 0.3926, + "step": 1091 + }, + { + "epoch": 1.164179104477612, + "grad_norm": 0.601219772248201, + "learning_rate": 7.577642790687805e-05, + "loss": 0.3977, + "step": 1092 + }, + { + "epoch": 1.1652452025586353, + "grad_norm": 0.4939938541558801, + "learning_rate": 7.57631029876226e-05, + "loss": 0.3976, + "step": 1093 + }, + { + "epoch": 1.1663113006396588, + "grad_norm": 0.5843429305407729, + "learning_rate": 7.574975825746673e-05, + "loss": 0.3985, + "step": 1094 + }, + { + "epoch": 1.1673773987206824, + "grad_norm": 0.7423520288508058, + "learning_rate": 7.573639372380277e-05, + "loss": 0.3937, + "step": 1095 + }, + { + "epoch": 1.1684434968017057, + "grad_norm": 0.6911804134372755, + "learning_rate": 7.572300939403395e-05, + "loss": 0.3998, + "step": 1096 + }, + { + "epoch": 1.1695095948827292, + "grad_norm": 0.5980230638871175, + "learning_rate": 7.570960527557452e-05, + "loss": 0.3927, + "step": 1097 + }, + { + "epoch": 1.1705756929637527, + "grad_norm": 0.5740905237257593, + "learning_rate": 7.569618137584964e-05, + "loss": 0.3966, + "step": 1098 + }, + { + "epoch": 1.171641791044776, + "grad_norm": 0.5934276451380195, + "learning_rate": 7.568273770229546e-05, + "loss": 0.3958, + "step": 1099 + }, + { + "epoch": 1.1727078891257996, + "grad_norm": 0.6018407068991795, + "learning_rate": 7.566927426235909e-05, + "loss": 0.392, + "step": 1100 + }, + { + "epoch": 1.1737739872068231, + "grad_norm": 0.5584205973152891, + "learning_rate": 7.565579106349857e-05, + "loss": 0.403, + "step": 1101 + }, + { + "epoch": 1.1748400852878464, + "grad_norm": 0.5611127016532973, + "learning_rate": 7.564228811318288e-05, + "loss": 0.3946, + "step": 1102 + }, + { + "epoch": 1.17590618336887, + "grad_norm": 0.6160481911439404, + "learning_rate": 7.562876541889195e-05, + "loss": 0.3942, + "step": 1103 + }, + { + "epoch": 1.1769722814498933, + "grad_norm": 0.5055559528539143, + "learning_rate": 7.561522298811667e-05, + "loss": 0.3928, + "step": 1104 + }, + { + "epoch": 1.1780383795309168, + "grad_norm": 0.4031249818431398, + "learning_rate": 7.560166082835883e-05, + "loss": 0.3947, + "step": 1105 + }, + { + "epoch": 1.1791044776119404, + "grad_norm": 0.4844578748616443, + "learning_rate": 7.558807894713116e-05, + "loss": 0.4, + "step": 1106 + }, + { + "epoch": 1.1801705756929637, + "grad_norm": 0.5923421522905422, + "learning_rate": 7.557447735195732e-05, + "loss": 0.3932, + "step": 1107 + }, + { + "epoch": 1.1812366737739872, + "grad_norm": 0.5732059132565734, + "learning_rate": 7.556085605037191e-05, + "loss": 0.396, + "step": 1108 + }, + { + "epoch": 1.1823027718550105, + "grad_norm": 0.5727038238228728, + "learning_rate": 7.554721504992038e-05, + "loss": 0.3942, + "step": 1109 + }, + { + "epoch": 1.183368869936034, + "grad_norm": 0.5941550331742329, + "learning_rate": 7.553355435815915e-05, + "loss": 0.3921, + "step": 1110 + }, + { + "epoch": 1.1844349680170576, + "grad_norm": 0.6162721480823822, + "learning_rate": 7.551987398265554e-05, + "loss": 0.3973, + "step": 1111 + }, + { + "epoch": 1.1855010660980811, + "grad_norm": 0.6503625421341107, + "learning_rate": 7.550617393098777e-05, + "loss": 0.4, + "step": 1112 + }, + { + "epoch": 1.1865671641791045, + "grad_norm": 0.6428847327632391, + "learning_rate": 7.549245421074496e-05, + "loss": 0.4032, + "step": 1113 + }, + { + "epoch": 1.187633262260128, + "grad_norm": 0.6516650235926786, + "learning_rate": 7.54787148295271e-05, + "loss": 0.3972, + "step": 1114 + }, + { + "epoch": 1.1886993603411513, + "grad_norm": 0.6996227895669477, + "learning_rate": 7.546495579494512e-05, + "loss": 0.3977, + "step": 1115 + }, + { + "epoch": 1.1897654584221748, + "grad_norm": 0.6011748919765059, + "learning_rate": 7.54511771146208e-05, + "loss": 0.3937, + "step": 1116 + }, + { + "epoch": 1.1908315565031984, + "grad_norm": 0.4608169210715356, + "learning_rate": 7.54373787961868e-05, + "loss": 0.4012, + "step": 1117 + }, + { + "epoch": 1.1918976545842217, + "grad_norm": 0.41047190471676254, + "learning_rate": 7.542356084728669e-05, + "loss": 0.3999, + "step": 1118 + }, + { + "epoch": 1.1929637526652452, + "grad_norm": 0.41683875837640383, + "learning_rate": 7.540972327557487e-05, + "loss": 0.3962, + "step": 1119 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.4017382310711894, + "learning_rate": 7.539586608871667e-05, + "loss": 0.3971, + "step": 1120 + }, + { + "epoch": 1.195095948827292, + "grad_norm": 0.3327575746497003, + "learning_rate": 7.538198929438823e-05, + "loss": 0.3998, + "step": 1121 + }, + { + "epoch": 1.1961620469083156, + "grad_norm": 0.36060866682655995, + "learning_rate": 7.536809290027657e-05, + "loss": 0.3892, + "step": 1122 + }, + { + "epoch": 1.197228144989339, + "grad_norm": 0.4016423989687354, + "learning_rate": 7.53541769140796e-05, + "loss": 0.3928, + "step": 1123 + }, + { + "epoch": 1.1982942430703625, + "grad_norm": 0.4667699426682389, + "learning_rate": 7.5340241343506e-05, + "loss": 0.3927, + "step": 1124 + }, + { + "epoch": 1.199360341151386, + "grad_norm": 0.52739114765906, + "learning_rate": 7.532628619627541e-05, + "loss": 0.3965, + "step": 1125 + }, + { + "epoch": 1.2004264392324093, + "grad_norm": 0.5145519240447523, + "learning_rate": 7.531231148011821e-05, + "loss": 0.3934, + "step": 1126 + }, + { + "epoch": 1.2014925373134329, + "grad_norm": 0.46867941356549736, + "learning_rate": 7.529831720277569e-05, + "loss": 0.4006, + "step": 1127 + }, + { + "epoch": 1.2025586353944564, + "grad_norm": 0.394042649849781, + "learning_rate": 7.528430337199995e-05, + "loss": 0.4003, + "step": 1128 + }, + { + "epoch": 1.2036247334754797, + "grad_norm": 0.449136494508411, + "learning_rate": 7.527026999555393e-05, + "loss": 0.3978, + "step": 1129 + }, + { + "epoch": 1.2046908315565032, + "grad_norm": 0.5439656094014544, + "learning_rate": 7.525621708121136e-05, + "loss": 0.3974, + "step": 1130 + }, + { + "epoch": 1.2057569296375266, + "grad_norm": 0.6999516585400055, + "learning_rate": 7.524214463675686e-05, + "loss": 0.4066, + "step": 1131 + }, + { + "epoch": 1.20682302771855, + "grad_norm": 0.9223862712790244, + "learning_rate": 7.522805266998582e-05, + "loss": 0.3935, + "step": 1132 + }, + { + "epoch": 1.2078891257995736, + "grad_norm": 1.005522314616768, + "learning_rate": 7.521394118870446e-05, + "loss": 0.3996, + "step": 1133 + }, + { + "epoch": 1.208955223880597, + "grad_norm": 0.9189093535473554, + "learning_rate": 7.519981020072979e-05, + "loss": 0.3986, + "step": 1134 + }, + { + "epoch": 1.2100213219616205, + "grad_norm": 0.7474366825523275, + "learning_rate": 7.518565971388967e-05, + "loss": 0.3975, + "step": 1135 + }, + { + "epoch": 1.2110874200426438, + "grad_norm": 0.517722383461889, + "learning_rate": 7.51714897360227e-05, + "loss": 0.3915, + "step": 1136 + }, + { + "epoch": 1.2121535181236673, + "grad_norm": 0.460287240625528, + "learning_rate": 7.515730027497836e-05, + "loss": 0.3991, + "step": 1137 + }, + { + "epoch": 1.2132196162046909, + "grad_norm": 0.5513920397892742, + "learning_rate": 7.514309133861684e-05, + "loss": 0.3876, + "step": 1138 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.5926185140798839, + "learning_rate": 7.512886293480914e-05, + "loss": 0.3987, + "step": 1139 + }, + { + "epoch": 1.2153518123667377, + "grad_norm": 0.6015497667677062, + "learning_rate": 7.51146150714371e-05, + "loss": 0.3942, + "step": 1140 + }, + { + "epoch": 1.2164179104477613, + "grad_norm": 0.4979357814188255, + "learning_rate": 7.510034775639324e-05, + "loss": 0.3962, + "step": 1141 + }, + { + "epoch": 1.2174840085287846, + "grad_norm": 0.3795778689452729, + "learning_rate": 7.508606099758097e-05, + "loss": 0.4005, + "step": 1142 + }, + { + "epoch": 1.2185501066098081, + "grad_norm": 0.3928656441581091, + "learning_rate": 7.507175480291437e-05, + "loss": 0.3984, + "step": 1143 + }, + { + "epoch": 1.2196162046908317, + "grad_norm": 0.3930681703186637, + "learning_rate": 7.505742918031836e-05, + "loss": 0.3984, + "step": 1144 + }, + { + "epoch": 1.220682302771855, + "grad_norm": 0.4091879483768399, + "learning_rate": 7.504308413772856e-05, + "loss": 0.3933, + "step": 1145 + }, + { + "epoch": 1.2217484008528785, + "grad_norm": 0.46891356577284954, + "learning_rate": 7.502871968309139e-05, + "loss": 0.3988, + "step": 1146 + }, + { + "epoch": 1.2228144989339018, + "grad_norm": 0.5103860857976652, + "learning_rate": 7.5014335824364e-05, + "loss": 0.3935, + "step": 1147 + }, + { + "epoch": 1.2238805970149254, + "grad_norm": 0.5082917446932953, + "learning_rate": 7.499993256951433e-05, + "loss": 0.3927, + "step": 1148 + }, + { + "epoch": 1.224946695095949, + "grad_norm": 0.49379137994921035, + "learning_rate": 7.498550992652101e-05, + "loss": 0.4001, + "step": 1149 + }, + { + "epoch": 1.2260127931769722, + "grad_norm": 0.5353013303840448, + "learning_rate": 7.497106790337345e-05, + "loss": 0.3929, + "step": 1150 + }, + { + "epoch": 1.2270788912579957, + "grad_norm": 0.5876139459836854, + "learning_rate": 7.495660650807174e-05, + "loss": 0.3994, + "step": 1151 + }, + { + "epoch": 1.2281449893390193, + "grad_norm": 0.734210823795574, + "learning_rate": 7.494212574862682e-05, + "loss": 0.3967, + "step": 1152 + }, + { + "epoch": 1.2292110874200426, + "grad_norm": 0.7913917318938646, + "learning_rate": 7.49276256330602e-05, + "loss": 0.3936, + "step": 1153 + }, + { + "epoch": 1.2302771855010661, + "grad_norm": 0.7405044856069717, + "learning_rate": 7.491310616940422e-05, + "loss": 0.4, + "step": 1154 + }, + { + "epoch": 1.2313432835820897, + "grad_norm": 0.6663912293042208, + "learning_rate": 7.489856736570192e-05, + "loss": 0.4047, + "step": 1155 + }, + { + "epoch": 1.232409381663113, + "grad_norm": 0.6021454402370039, + "learning_rate": 7.488400923000703e-05, + "loss": 0.3907, + "step": 1156 + }, + { + "epoch": 1.2334754797441365, + "grad_norm": 0.5514602002833817, + "learning_rate": 7.4869431770384e-05, + "loss": 0.3978, + "step": 1157 + }, + { + "epoch": 1.2345415778251598, + "grad_norm": 0.48903597186198305, + "learning_rate": 7.485483499490799e-05, + "loss": 0.4005, + "step": 1158 + }, + { + "epoch": 1.2356076759061834, + "grad_norm": 0.5332694417143288, + "learning_rate": 7.484021891166486e-05, + "loss": 0.3952, + "step": 1159 + }, + { + "epoch": 1.236673773987207, + "grad_norm": 0.49543835364305194, + "learning_rate": 7.482558352875113e-05, + "loss": 0.3986, + "step": 1160 + }, + { + "epoch": 1.2377398720682302, + "grad_norm": 0.4261853368714419, + "learning_rate": 7.481092885427408e-05, + "loss": 0.39, + "step": 1161 + }, + { + "epoch": 1.2388059701492538, + "grad_norm": 0.4265314566032864, + "learning_rate": 7.479625489635162e-05, + "loss": 0.3911, + "step": 1162 + }, + { + "epoch": 1.239872068230277, + "grad_norm": 0.5029650666087466, + "learning_rate": 7.478156166311236e-05, + "loss": 0.3994, + "step": 1163 + }, + { + "epoch": 1.2409381663113006, + "grad_norm": 0.4455729452597588, + "learning_rate": 7.476684916269559e-05, + "loss": 0.3952, + "step": 1164 + }, + { + "epoch": 1.2420042643923241, + "grad_norm": 0.47338288349681196, + "learning_rate": 7.475211740325127e-05, + "loss": 0.3969, + "step": 1165 + }, + { + "epoch": 1.2430703624733475, + "grad_norm": 0.4305124151121982, + "learning_rate": 7.473736639294004e-05, + "loss": 0.3967, + "step": 1166 + }, + { + "epoch": 1.244136460554371, + "grad_norm": 0.40696000141373095, + "learning_rate": 7.472259613993316e-05, + "loss": 0.393, + "step": 1167 + }, + { + "epoch": 1.2452025586353945, + "grad_norm": 0.45477308879606393, + "learning_rate": 7.470780665241262e-05, + "loss": 0.3958, + "step": 1168 + }, + { + "epoch": 1.2462686567164178, + "grad_norm": 0.5704960820413824, + "learning_rate": 7.469299793857101e-05, + "loss": 0.3989, + "step": 1169 + }, + { + "epoch": 1.2473347547974414, + "grad_norm": 0.6135689230632373, + "learning_rate": 7.467817000661159e-05, + "loss": 0.3921, + "step": 1170 + }, + { + "epoch": 1.248400852878465, + "grad_norm": 0.6777534760469583, + "learning_rate": 7.466332286474826e-05, + "loss": 0.3962, + "step": 1171 + }, + { + "epoch": 1.2494669509594882, + "grad_norm": 0.6898047595171254, + "learning_rate": 7.464845652120557e-05, + "loss": 0.3915, + "step": 1172 + }, + { + "epoch": 1.2505330490405118, + "grad_norm": 0.7123893467359865, + "learning_rate": 7.46335709842187e-05, + "loss": 0.4025, + "step": 1173 + }, + { + "epoch": 1.251599147121535, + "grad_norm": 0.6820123806207717, + "learning_rate": 7.461866626203348e-05, + "loss": 0.3941, + "step": 1174 + }, + { + "epoch": 1.2526652452025586, + "grad_norm": 0.6533571683107087, + "learning_rate": 7.460374236290631e-05, + "loss": 0.4025, + "step": 1175 + }, + { + "epoch": 1.2537313432835822, + "grad_norm": 0.6571872810240704, + "learning_rate": 7.45887992951043e-05, + "loss": 0.3976, + "step": 1176 + }, + { + "epoch": 1.2547974413646055, + "grad_norm": 0.6849183239389355, + "learning_rate": 7.457383706690511e-05, + "loss": 0.3976, + "step": 1177 + }, + { + "epoch": 1.255863539445629, + "grad_norm": 0.6130858125092414, + "learning_rate": 7.455885568659705e-05, + "loss": 0.3917, + "step": 1178 + }, + { + "epoch": 1.2569296375266523, + "grad_norm": 0.5653602132939001, + "learning_rate": 7.454385516247899e-05, + "loss": 0.3898, + "step": 1179 + }, + { + "epoch": 1.2579957356076759, + "grad_norm": 0.6091279178319541, + "learning_rate": 7.452883550286049e-05, + "loss": 0.3928, + "step": 1180 + }, + { + "epoch": 1.2590618336886994, + "grad_norm": 0.5595753993224933, + "learning_rate": 7.451379671606162e-05, + "loss": 0.3957, + "step": 1181 + }, + { + "epoch": 1.260127931769723, + "grad_norm": 0.5079986278587565, + "learning_rate": 7.449873881041312e-05, + "loss": 0.3893, + "step": 1182 + }, + { + "epoch": 1.2611940298507462, + "grad_norm": 0.464090085159201, + "learning_rate": 7.448366179425628e-05, + "loss": 0.3926, + "step": 1183 + }, + { + "epoch": 1.2622601279317698, + "grad_norm": 0.510243217894761, + "learning_rate": 7.446856567594294e-05, + "loss": 0.3937, + "step": 1184 + }, + { + "epoch": 1.263326226012793, + "grad_norm": 0.46150844552833925, + "learning_rate": 7.445345046383563e-05, + "loss": 0.3947, + "step": 1185 + }, + { + "epoch": 1.2643923240938166, + "grad_norm": 0.3949887975107038, + "learning_rate": 7.443831616630734e-05, + "loss": 0.3884, + "step": 1186 + }, + { + "epoch": 1.2654584221748402, + "grad_norm": 0.5026720516501154, + "learning_rate": 7.442316279174172e-05, + "loss": 0.3906, + "step": 1187 + }, + { + "epoch": 1.2665245202558635, + "grad_norm": 0.502955271204735, + "learning_rate": 7.440799034853294e-05, + "loss": 0.3906, + "step": 1188 + }, + { + "epoch": 1.267590618336887, + "grad_norm": 0.43070711981801013, + "learning_rate": 7.439279884508573e-05, + "loss": 0.3959, + "step": 1189 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.3992942313925878, + "learning_rate": 7.437758828981542e-05, + "loss": 0.3979, + "step": 1190 + }, + { + "epoch": 1.2697228144989339, + "grad_norm": 0.41776106633422166, + "learning_rate": 7.436235869114785e-05, + "loss": 0.3888, + "step": 1191 + }, + { + "epoch": 1.2707889125799574, + "grad_norm": 0.3678243374968389, + "learning_rate": 7.434711005751942e-05, + "loss": 0.3929, + "step": 1192 + }, + { + "epoch": 1.271855010660981, + "grad_norm": 0.4462778916558436, + "learning_rate": 7.43318423973771e-05, + "loss": 0.4052, + "step": 1193 + }, + { + "epoch": 1.2729211087420043, + "grad_norm": 0.4072295381721561, + "learning_rate": 7.431655571917835e-05, + "loss": 0.3901, + "step": 1194 + }, + { + "epoch": 1.2739872068230278, + "grad_norm": 0.33668391253724705, + "learning_rate": 7.430125003139124e-05, + "loss": 0.395, + "step": 1195 + }, + { + "epoch": 1.275053304904051, + "grad_norm": 0.49984395156609207, + "learning_rate": 7.428592534249429e-05, + "loss": 0.3921, + "step": 1196 + }, + { + "epoch": 1.2761194029850746, + "grad_norm": 0.4961401301018006, + "learning_rate": 7.42705816609766e-05, + "loss": 0.3986, + "step": 1197 + }, + { + "epoch": 1.2771855010660982, + "grad_norm": 0.36987736865515364, + "learning_rate": 7.425521899533776e-05, + "loss": 0.3863, + "step": 1198 + }, + { + "epoch": 1.2782515991471215, + "grad_norm": 0.37433483688551117, + "learning_rate": 7.42398373540879e-05, + "loss": 0.4008, + "step": 1199 + }, + { + "epoch": 1.279317697228145, + "grad_norm": 0.3960664301566479, + "learning_rate": 7.422443674574764e-05, + "loss": 0.3936, + "step": 1200 + }, + { + "epoch": 1.2803837953091683, + "grad_norm": 0.42962763499848106, + "learning_rate": 7.42090171788481e-05, + "loss": 0.3968, + "step": 1201 + }, + { + "epoch": 1.2814498933901919, + "grad_norm": 0.4947438283060391, + "learning_rate": 7.419357866193097e-05, + "loss": 0.3974, + "step": 1202 + }, + { + "epoch": 1.2825159914712154, + "grad_norm": 0.7638002002790177, + "learning_rate": 7.417812120354833e-05, + "loss": 0.3987, + "step": 1203 + }, + { + "epoch": 1.2835820895522387, + "grad_norm": 0.6618293847952098, + "learning_rate": 7.416264481226284e-05, + "loss": 0.3954, + "step": 1204 + }, + { + "epoch": 1.2846481876332623, + "grad_norm": 0.7564644303522643, + "learning_rate": 7.414714949664761e-05, + "loss": 0.4025, + "step": 1205 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.7341957818072022, + "learning_rate": 7.413163526528623e-05, + "loss": 0.3972, + "step": 1206 + }, + { + "epoch": 1.2867803837953091, + "grad_norm": 0.7172296615238862, + "learning_rate": 7.41161021267728e-05, + "loss": 0.4024, + "step": 1207 + }, + { + "epoch": 1.2878464818763327, + "grad_norm": 0.6773018518559085, + "learning_rate": 7.410055008971186e-05, + "loss": 0.3986, + "step": 1208 + }, + { + "epoch": 1.2889125799573562, + "grad_norm": 0.7499881724261244, + "learning_rate": 7.40849791627184e-05, + "loss": 0.401, + "step": 1209 + }, + { + "epoch": 1.2899786780383795, + "grad_norm": 0.6552329889222517, + "learning_rate": 7.406938935441795e-05, + "loss": 0.399, + "step": 1210 + }, + { + "epoch": 1.291044776119403, + "grad_norm": 0.6452927503574722, + "learning_rate": 7.405378067344645e-05, + "loss": 0.4023, + "step": 1211 + }, + { + "epoch": 1.2921108742004264, + "grad_norm": 0.6122823637184173, + "learning_rate": 7.403815312845027e-05, + "loss": 0.4003, + "step": 1212 + }, + { + "epoch": 1.29317697228145, + "grad_norm": 0.4885206292712561, + "learning_rate": 7.402250672808627e-05, + "loss": 0.3948, + "step": 1213 + }, + { + "epoch": 1.2942430703624734, + "grad_norm": 0.36971065365308836, + "learning_rate": 7.400684148102175e-05, + "loss": 0.3983, + "step": 1214 + }, + { + "epoch": 1.2953091684434968, + "grad_norm": 0.5341249123573403, + "learning_rate": 7.399115739593444e-05, + "loss": 0.3998, + "step": 1215 + }, + { + "epoch": 1.2963752665245203, + "grad_norm": 0.6951670150170282, + "learning_rate": 7.397545448151249e-05, + "loss": 0.3917, + "step": 1216 + }, + { + "epoch": 1.2974413646055436, + "grad_norm": 0.6307408026599629, + "learning_rate": 7.395973274645452e-05, + "loss": 0.3966, + "step": 1217 + }, + { + "epoch": 1.2985074626865671, + "grad_norm": 0.41244623390075813, + "learning_rate": 7.394399219946955e-05, + "loss": 0.3943, + "step": 1218 + }, + { + "epoch": 1.2995735607675907, + "grad_norm": 0.4959351860092428, + "learning_rate": 7.392823284927704e-05, + "loss": 0.3969, + "step": 1219 + }, + { + "epoch": 1.3006396588486142, + "grad_norm": 0.6354141360008508, + "learning_rate": 7.391245470460682e-05, + "loss": 0.3912, + "step": 1220 + }, + { + "epoch": 1.3017057569296375, + "grad_norm": 0.5545244238798931, + "learning_rate": 7.389665777419916e-05, + "loss": 0.3959, + "step": 1221 + }, + { + "epoch": 1.302771855010661, + "grad_norm": 0.4981798925494521, + "learning_rate": 7.388084206680477e-05, + "loss": 0.3975, + "step": 1222 + }, + { + "epoch": 1.3038379530916844, + "grad_norm": 0.6996743346801269, + "learning_rate": 7.386500759118472e-05, + "loss": 0.3969, + "step": 1223 + }, + { + "epoch": 1.304904051172708, + "grad_norm": 0.8545480674302531, + "learning_rate": 7.384915435611047e-05, + "loss": 0.3911, + "step": 1224 + }, + { + "epoch": 1.3059701492537314, + "grad_norm": 0.9034425556959509, + "learning_rate": 7.38332823703639e-05, + "loss": 0.3984, + "step": 1225 + }, + { + "epoch": 1.3070362473347548, + "grad_norm": 0.8405317648162198, + "learning_rate": 7.381739164273727e-05, + "loss": 0.3919, + "step": 1226 + }, + { + "epoch": 1.3081023454157783, + "grad_norm": 0.7611193691575167, + "learning_rate": 7.380148218203321e-05, + "loss": 0.3936, + "step": 1227 + }, + { + "epoch": 1.3091684434968016, + "grad_norm": 0.7529452227808444, + "learning_rate": 7.378555399706473e-05, + "loss": 0.3976, + "step": 1228 + }, + { + "epoch": 1.3102345415778252, + "grad_norm": 0.7498395498416355, + "learning_rate": 7.376960709665522e-05, + "loss": 0.3938, + "step": 1229 + }, + { + "epoch": 1.3113006396588487, + "grad_norm": 0.617409199350578, + "learning_rate": 7.375364148963845e-05, + "loss": 0.4, + "step": 1230 + }, + { + "epoch": 1.312366737739872, + "grad_norm": 0.5012176007830088, + "learning_rate": 7.373765718485851e-05, + "loss": 0.3898, + "step": 1231 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 0.43946542231558117, + "learning_rate": 7.37216541911699e-05, + "loss": 0.4011, + "step": 1232 + }, + { + "epoch": 1.3144989339019189, + "grad_norm": 0.37173647672592797, + "learning_rate": 7.370563251743743e-05, + "loss": 0.4035, + "step": 1233 + }, + { + "epoch": 1.3155650319829424, + "grad_norm": 0.3453534441885958, + "learning_rate": 7.368959217253627e-05, + "loss": 0.3997, + "step": 1234 + }, + { + "epoch": 1.316631130063966, + "grad_norm": 0.35475360394383565, + "learning_rate": 7.367353316535195e-05, + "loss": 0.396, + "step": 1235 + }, + { + "epoch": 1.3176972281449895, + "grad_norm": 0.4807258068112041, + "learning_rate": 7.365745550478034e-05, + "loss": 0.3981, + "step": 1236 + }, + { + "epoch": 1.3187633262260128, + "grad_norm": 0.5770643962715295, + "learning_rate": 7.364135919972759e-05, + "loss": 0.397, + "step": 1237 + }, + { + "epoch": 1.3198294243070363, + "grad_norm": 0.5494777777905933, + "learning_rate": 7.362524425911024e-05, + "loss": 0.3937, + "step": 1238 + }, + { + "epoch": 1.3208955223880596, + "grad_norm": 0.4823030916464439, + "learning_rate": 7.360911069185513e-05, + "loss": 0.3946, + "step": 1239 + }, + { + "epoch": 1.3219616204690832, + "grad_norm": 0.44738974205793797, + "learning_rate": 7.35929585068994e-05, + "loss": 0.3987, + "step": 1240 + }, + { + "epoch": 1.3230277185501067, + "grad_norm": 0.4647985462455625, + "learning_rate": 7.357678771319055e-05, + "loss": 0.3989, + "step": 1241 + }, + { + "epoch": 1.32409381663113, + "grad_norm": 0.4907957965266697, + "learning_rate": 7.356059831968634e-05, + "loss": 0.3928, + "step": 1242 + }, + { + "epoch": 1.3251599147121536, + "grad_norm": 0.4834632280662174, + "learning_rate": 7.354439033535486e-05, + "loss": 0.3925, + "step": 1243 + }, + { + "epoch": 1.3262260127931769, + "grad_norm": 0.5312494947802512, + "learning_rate": 7.352816376917448e-05, + "loss": 0.395, + "step": 1244 + }, + { + "epoch": 1.3272921108742004, + "grad_norm": 0.6134924022779664, + "learning_rate": 7.351191863013387e-05, + "loss": 0.3952, + "step": 1245 + }, + { + "epoch": 1.328358208955224, + "grad_norm": 0.6854173044473056, + "learning_rate": 7.349565492723204e-05, + "loss": 0.3993, + "step": 1246 + }, + { + "epoch": 1.3294243070362473, + "grad_norm": 0.6787952149521692, + "learning_rate": 7.347937266947817e-05, + "loss": 0.388, + "step": 1247 + }, + { + "epoch": 1.3304904051172708, + "grad_norm": 0.5985162637767774, + "learning_rate": 7.346307186589183e-05, + "loss": 0.3964, + "step": 1248 + }, + { + "epoch": 1.331556503198294, + "grad_norm": 0.5267115292559772, + "learning_rate": 7.344675252550278e-05, + "loss": 0.397, + "step": 1249 + }, + { + "epoch": 1.3326226012793176, + "grad_norm": 0.4627942903946516, + "learning_rate": 7.343041465735115e-05, + "loss": 0.398, + "step": 1250 + }, + { + "epoch": 1.3336886993603412, + "grad_norm": 0.38809726070928446, + "learning_rate": 7.34140582704872e-05, + "loss": 0.3912, + "step": 1251 + }, + { + "epoch": 1.3347547974413647, + "grad_norm": 0.38768792360594717, + "learning_rate": 7.339768337397156e-05, + "loss": 0.3976, + "step": 1252 + }, + { + "epoch": 1.335820895522388, + "grad_norm": 0.8280978007735911, + "learning_rate": 7.338128997687505e-05, + "loss": 0.3916, + "step": 1253 + }, + { + "epoch": 1.3368869936034116, + "grad_norm": 0.5768542990929353, + "learning_rate": 7.336487808827878e-05, + "loss": 0.4006, + "step": 1254 + }, + { + "epoch": 1.3379530916844349, + "grad_norm": 0.5660915592706494, + "learning_rate": 7.334844771727407e-05, + "loss": 0.3889, + "step": 1255 + }, + { + "epoch": 1.3390191897654584, + "grad_norm": 0.5472277393608467, + "learning_rate": 7.333199887296249e-05, + "loss": 0.3966, + "step": 1256 + }, + { + "epoch": 1.340085287846482, + "grad_norm": 0.591414528568585, + "learning_rate": 7.331553156445585e-05, + "loss": 0.3942, + "step": 1257 + }, + { + "epoch": 1.3411513859275053, + "grad_norm": 0.551915916206759, + "learning_rate": 7.329904580087618e-05, + "loss": 0.3981, + "step": 1258 + }, + { + "epoch": 1.3422174840085288, + "grad_norm": 0.5979571239616868, + "learning_rate": 7.328254159135575e-05, + "loss": 0.391, + "step": 1259 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.6293378397487646, + "learning_rate": 7.326601894503703e-05, + "loss": 0.39, + "step": 1260 + }, + { + "epoch": 1.3443496801705757, + "grad_norm": 0.6163722703678122, + "learning_rate": 7.324947787107267e-05, + "loss": 0.3967, + "step": 1261 + }, + { + "epoch": 1.3454157782515992, + "grad_norm": 0.5746957535175811, + "learning_rate": 7.323291837862561e-05, + "loss": 0.3948, + "step": 1262 + }, + { + "epoch": 1.3464818763326227, + "grad_norm": 0.5421145237930761, + "learning_rate": 7.321634047686895e-05, + "loss": 0.3914, + "step": 1263 + }, + { + "epoch": 1.347547974413646, + "grad_norm": 0.5487443991159977, + "learning_rate": 7.319974417498594e-05, + "loss": 0.3895, + "step": 1264 + }, + { + "epoch": 1.3486140724946696, + "grad_norm": 0.5644289944018167, + "learning_rate": 7.318312948217012e-05, + "loss": 0.3928, + "step": 1265 + }, + { + "epoch": 1.349680170575693, + "grad_norm": 0.6211957603031802, + "learning_rate": 7.316649640762515e-05, + "loss": 0.3914, + "step": 1266 + }, + { + "epoch": 1.3507462686567164, + "grad_norm": 0.6243920858668569, + "learning_rate": 7.314984496056487e-05, + "loss": 0.3918, + "step": 1267 + }, + { + "epoch": 1.35181236673774, + "grad_norm": 0.48121327241122974, + "learning_rate": 7.313317515021334e-05, + "loss": 0.3949, + "step": 1268 + }, + { + "epoch": 1.3528784648187633, + "grad_norm": 0.42705279500592713, + "learning_rate": 7.311648698580475e-05, + "loss": 0.3914, + "step": 1269 + }, + { + "epoch": 1.3539445628997868, + "grad_norm": 0.4868001076889179, + "learning_rate": 7.309978047658348e-05, + "loss": 0.3969, + "step": 1270 + }, + { + "epoch": 1.3550106609808101, + "grad_norm": 0.5141089848205457, + "learning_rate": 7.308305563180409e-05, + "loss": 0.3947, + "step": 1271 + }, + { + "epoch": 1.3560767590618337, + "grad_norm": 0.5120767848632203, + "learning_rate": 7.306631246073125e-05, + "loss": 0.393, + "step": 1272 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.5231030391185801, + "learning_rate": 7.30495509726398e-05, + "loss": 0.3897, + "step": 1273 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.43134602113074105, + "learning_rate": 7.303277117681475e-05, + "loss": 0.3955, + "step": 1274 + }, + { + "epoch": 1.359275053304904, + "grad_norm": 0.2918563082914735, + "learning_rate": 7.301597308255124e-05, + "loss": 0.3933, + "step": 1275 + }, + { + "epoch": 1.3603411513859274, + "grad_norm": 0.2886464392254564, + "learning_rate": 7.299915669915454e-05, + "loss": 0.3947, + "step": 1276 + }, + { + "epoch": 1.361407249466951, + "grad_norm": 0.3719373094507802, + "learning_rate": 7.298232203594003e-05, + "loss": 0.3972, + "step": 1277 + }, + { + "epoch": 1.3624733475479744, + "grad_norm": 0.3893538092048672, + "learning_rate": 7.296546910223327e-05, + "loss": 0.3976, + "step": 1278 + }, + { + "epoch": 1.363539445628998, + "grad_norm": 0.33287474885809093, + "learning_rate": 7.294859790736989e-05, + "loss": 0.3899, + "step": 1279 + }, + { + "epoch": 1.3646055437100213, + "grad_norm": 0.44445251040661327, + "learning_rate": 7.293170846069564e-05, + "loss": 0.3982, + "step": 1280 + }, + { + "epoch": 1.3656716417910448, + "grad_norm": 0.5752395269544717, + "learning_rate": 7.291480077156642e-05, + "loss": 0.3968, + "step": 1281 + }, + { + "epoch": 1.3667377398720681, + "grad_norm": 0.5884059035360013, + "learning_rate": 7.289787484934823e-05, + "loss": 0.387, + "step": 1282 + }, + { + "epoch": 1.3678038379530917, + "grad_norm": 0.6411711941306089, + "learning_rate": 7.288093070341709e-05, + "loss": 0.3979, + "step": 1283 + }, + { + "epoch": 1.3688699360341152, + "grad_norm": 0.7297554798439111, + "learning_rate": 7.286396834315925e-05, + "loss": 0.3975, + "step": 1284 + }, + { + "epoch": 1.3699360341151385, + "grad_norm": 0.8133268333459146, + "learning_rate": 7.284698777797091e-05, + "loss": 0.3958, + "step": 1285 + }, + { + "epoch": 1.371002132196162, + "grad_norm": 0.8171389709979388, + "learning_rate": 7.282998901725846e-05, + "loss": 0.3935, + "step": 1286 + }, + { + "epoch": 1.3720682302771854, + "grad_norm": 0.6715918102841708, + "learning_rate": 7.281297207043832e-05, + "loss": 0.3963, + "step": 1287 + }, + { + "epoch": 1.373134328358209, + "grad_norm": 0.5821302267603919, + "learning_rate": 7.279593694693698e-05, + "loss": 0.3957, + "step": 1288 + }, + { + "epoch": 1.3742004264392325, + "grad_norm": 0.5793056879165892, + "learning_rate": 7.277888365619104e-05, + "loss": 0.399, + "step": 1289 + }, + { + "epoch": 1.375266524520256, + "grad_norm": 0.6131326531548551, + "learning_rate": 7.276181220764713e-05, + "loss": 0.391, + "step": 1290 + }, + { + "epoch": 1.3763326226012793, + "grad_norm": 0.6655360716979257, + "learning_rate": 7.274472261076192e-05, + "loss": 0.3936, + "step": 1291 + }, + { + "epoch": 1.3773987206823028, + "grad_norm": 0.6514488453949313, + "learning_rate": 7.272761487500219e-05, + "loss": 0.389, + "step": 1292 + }, + { + "epoch": 1.3784648187633262, + "grad_norm": 0.6004752538724129, + "learning_rate": 7.271048900984473e-05, + "loss": 0.3979, + "step": 1293 + }, + { + "epoch": 1.3795309168443497, + "grad_norm": 0.5286751186792054, + "learning_rate": 7.269334502477636e-05, + "loss": 0.3975, + "step": 1294 + }, + { + "epoch": 1.3805970149253732, + "grad_norm": 0.3782372078245128, + "learning_rate": 7.267618292929398e-05, + "loss": 0.3965, + "step": 1295 + }, + { + "epoch": 1.3816631130063965, + "grad_norm": 0.4478939359799176, + "learning_rate": 7.265900273290448e-05, + "loss": 0.3882, + "step": 1296 + }, + { + "epoch": 1.38272921108742, + "grad_norm": 0.40504400102254523, + "learning_rate": 7.264180444512481e-05, + "loss": 0.3979, + "step": 1297 + }, + { + "epoch": 1.3837953091684434, + "grad_norm": 0.38764841393473026, + "learning_rate": 7.262458807548191e-05, + "loss": 0.3963, + "step": 1298 + }, + { + "epoch": 1.384861407249467, + "grad_norm": 0.3656453484587496, + "learning_rate": 7.260735363351278e-05, + "loss": 0.3938, + "step": 1299 + }, + { + "epoch": 1.3859275053304905, + "grad_norm": 0.48752680564499556, + "learning_rate": 7.259010112876437e-05, + "loss": 0.3929, + "step": 1300 + }, + { + "epoch": 1.3869936034115138, + "grad_norm": 0.5660988157671498, + "learning_rate": 7.257283057079371e-05, + "loss": 0.3965, + "step": 1301 + }, + { + "epoch": 1.3880597014925373, + "grad_norm": 0.5415116199650241, + "learning_rate": 7.255554196916777e-05, + "loss": 0.3969, + "step": 1302 + }, + { + "epoch": 1.3891257995735606, + "grad_norm": 0.48579681088844257, + "learning_rate": 7.253823533346353e-05, + "loss": 0.3963, + "step": 1303 + }, + { + "epoch": 1.3901918976545842, + "grad_norm": 0.4521402830070269, + "learning_rate": 7.2520910673268e-05, + "loss": 0.3889, + "step": 1304 + }, + { + "epoch": 1.3912579957356077, + "grad_norm": 0.3868095009917062, + "learning_rate": 7.250356799817811e-05, + "loss": 0.3847, + "step": 1305 + }, + { + "epoch": 1.3923240938166312, + "grad_norm": 0.3772982157721458, + "learning_rate": 7.24862073178008e-05, + "loss": 0.3898, + "step": 1306 + }, + { + "epoch": 1.3933901918976546, + "grad_norm": 0.3776649103582633, + "learning_rate": 7.2468828641753e-05, + "loss": 0.3917, + "step": 1307 + }, + { + "epoch": 1.394456289978678, + "grad_norm": 0.3824919655119669, + "learning_rate": 7.245143197966158e-05, + "loss": 0.3938, + "step": 1308 + }, + { + "epoch": 1.3955223880597014, + "grad_norm": 0.5015303372462967, + "learning_rate": 7.243401734116341e-05, + "loss": 0.3896, + "step": 1309 + }, + { + "epoch": 1.396588486140725, + "grad_norm": 0.594587592040505, + "learning_rate": 7.241658473590526e-05, + "loss": 0.3927, + "step": 1310 + }, + { + "epoch": 1.3976545842217485, + "grad_norm": 0.560051836440119, + "learning_rate": 7.239913417354393e-05, + "loss": 0.3956, + "step": 1311 + }, + { + "epoch": 1.3987206823027718, + "grad_norm": 0.4940699663851546, + "learning_rate": 7.238166566374607e-05, + "loss": 0.3976, + "step": 1312 + }, + { + "epoch": 1.3997867803837953, + "grad_norm": 0.4683362628695071, + "learning_rate": 7.236417921618839e-05, + "loss": 0.3934, + "step": 1313 + }, + { + "epoch": 1.4008528784648187, + "grad_norm": 0.46229731243535377, + "learning_rate": 7.234667484055742e-05, + "loss": 0.3897, + "step": 1314 + }, + { + "epoch": 1.4019189765458422, + "grad_norm": 0.4363083782167336, + "learning_rate": 7.232915254654968e-05, + "loss": 0.3925, + "step": 1315 + }, + { + "epoch": 1.4029850746268657, + "grad_norm": 0.4257678057099821, + "learning_rate": 7.231161234387165e-05, + "loss": 0.3926, + "step": 1316 + }, + { + "epoch": 1.4040511727078893, + "grad_norm": 0.4846979047099743, + "learning_rate": 7.229405424223967e-05, + "loss": 0.3913, + "step": 1317 + }, + { + "epoch": 1.4051172707889126, + "grad_norm": 0.5325656515388444, + "learning_rate": 7.227647825137998e-05, + "loss": 0.3922, + "step": 1318 + }, + { + "epoch": 1.4061833688699361, + "grad_norm": 0.544343816677594, + "learning_rate": 7.225888438102882e-05, + "loss": 0.3888, + "step": 1319 + }, + { + "epoch": 1.4072494669509594, + "grad_norm": 0.5779022828781076, + "learning_rate": 7.224127264093225e-05, + "loss": 0.4001, + "step": 1320 + }, + { + "epoch": 1.408315565031983, + "grad_norm": 0.7411524315870592, + "learning_rate": 7.222364304084627e-05, + "loss": 0.3919, + "step": 1321 + }, + { + "epoch": 1.4093816631130065, + "grad_norm": 0.9208203333396868, + "learning_rate": 7.220599559053676e-05, + "loss": 0.3985, + "step": 1322 + }, + { + "epoch": 1.4104477611940298, + "grad_norm": 0.9562178280419171, + "learning_rate": 7.218833029977948e-05, + "loss": 0.4012, + "step": 1323 + }, + { + "epoch": 1.4115138592750534, + "grad_norm": 0.8517901376976156, + "learning_rate": 7.217064717836009e-05, + "loss": 0.3941, + "step": 1324 + }, + { + "epoch": 1.4125799573560767, + "grad_norm": 0.6475331249527876, + "learning_rate": 7.215294623607414e-05, + "loss": 0.3952, + "step": 1325 + }, + { + "epoch": 1.4136460554371002, + "grad_norm": 0.4701406210795358, + "learning_rate": 7.213522748272699e-05, + "loss": 0.3912, + "step": 1326 + }, + { + "epoch": 1.4147121535181237, + "grad_norm": 0.519797694341973, + "learning_rate": 7.211749092813395e-05, + "loss": 0.3941, + "step": 1327 + }, + { + "epoch": 1.415778251599147, + "grad_norm": 0.5546483656534481, + "learning_rate": 7.20997365821201e-05, + "loss": 0.39, + "step": 1328 + }, + { + "epoch": 1.4168443496801706, + "grad_norm": 0.5322694569447141, + "learning_rate": 7.208196445452048e-05, + "loss": 0.3933, + "step": 1329 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.5615390005781126, + "learning_rate": 7.20641745551799e-05, + "loss": 0.389, + "step": 1330 + }, + { + "epoch": 1.4189765458422174, + "grad_norm": 0.5359170050084836, + "learning_rate": 7.204636689395304e-05, + "loss": 0.3915, + "step": 1331 + }, + { + "epoch": 1.420042643923241, + "grad_norm": 0.46419011837159624, + "learning_rate": 7.202854148070443e-05, + "loss": 0.3879, + "step": 1332 + }, + { + "epoch": 1.4211087420042645, + "grad_norm": 0.40341695125405613, + "learning_rate": 7.201069832530838e-05, + "loss": 0.3926, + "step": 1333 + }, + { + "epoch": 1.4221748400852878, + "grad_norm": 0.37929724565888073, + "learning_rate": 7.199283743764913e-05, + "loss": 0.3876, + "step": 1334 + }, + { + "epoch": 1.4232409381663114, + "grad_norm": 0.3708060382902423, + "learning_rate": 7.197495882762065e-05, + "loss": 0.394, + "step": 1335 + }, + { + "epoch": 1.4243070362473347, + "grad_norm": 0.42952036732595084, + "learning_rate": 7.195706250512676e-05, + "loss": 0.3911, + "step": 1336 + }, + { + "epoch": 1.4253731343283582, + "grad_norm": 0.3633415202092657, + "learning_rate": 7.19391484800811e-05, + "loss": 0.3881, + "step": 1337 + }, + { + "epoch": 1.4264392324093818, + "grad_norm": 0.3233744886500194, + "learning_rate": 7.192121676240713e-05, + "loss": 0.3969, + "step": 1338 + }, + { + "epoch": 1.427505330490405, + "grad_norm": 0.35414304904628113, + "learning_rate": 7.190326736203805e-05, + "loss": 0.3968, + "step": 1339 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.36418321869973685, + "learning_rate": 7.188530028891691e-05, + "loss": 0.3932, + "step": 1340 + }, + { + "epoch": 1.429637526652452, + "grad_norm": 0.32413601902229694, + "learning_rate": 7.186731555299654e-05, + "loss": 0.3936, + "step": 1341 + }, + { + "epoch": 1.4307036247334755, + "grad_norm": 0.3875245106191369, + "learning_rate": 7.184931316423955e-05, + "loss": 0.3956, + "step": 1342 + }, + { + "epoch": 1.431769722814499, + "grad_norm": 0.40455914476109633, + "learning_rate": 7.183129313261833e-05, + "loss": 0.393, + "step": 1343 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 0.4415785397804514, + "learning_rate": 7.181325546811503e-05, + "loss": 0.3961, + "step": 1344 + }, + { + "epoch": 1.4339019189765458, + "grad_norm": 0.5500841020492822, + "learning_rate": 7.179520018072158e-05, + "loss": 0.3954, + "step": 1345 + }, + { + "epoch": 1.4349680170575694, + "grad_norm": 0.6263260139567476, + "learning_rate": 7.177712728043967e-05, + "loss": 0.3981, + "step": 1346 + }, + { + "epoch": 1.4360341151385927, + "grad_norm": 0.7401499402434726, + "learning_rate": 7.175903677728077e-05, + "loss": 0.3974, + "step": 1347 + }, + { + "epoch": 1.4371002132196162, + "grad_norm": 0.8726338431226237, + "learning_rate": 7.174092868126604e-05, + "loss": 0.3871, + "step": 1348 + }, + { + "epoch": 1.4381663113006398, + "grad_norm": 0.8511811546689546, + "learning_rate": 7.172280300242648e-05, + "loss": 0.3935, + "step": 1349 + }, + { + "epoch": 1.439232409381663, + "grad_norm": 0.8021327414666732, + "learning_rate": 7.17046597508027e-05, + "loss": 0.3989, + "step": 1350 + }, + { + "epoch": 1.4402985074626866, + "grad_norm": 0.7465635016649216, + "learning_rate": 7.168649893644517e-05, + "loss": 0.3911, + "step": 1351 + }, + { + "epoch": 1.44136460554371, + "grad_norm": 0.722112156696109, + "learning_rate": 7.166832056941405e-05, + "loss": 0.4008, + "step": 1352 + }, + { + "epoch": 1.4424307036247335, + "grad_norm": 0.6928780087951198, + "learning_rate": 7.165012465977916e-05, + "loss": 0.3944, + "step": 1353 + }, + { + "epoch": 1.443496801705757, + "grad_norm": 0.5324488681456964, + "learning_rate": 7.163191121762012e-05, + "loss": 0.3923, + "step": 1354 + }, + { + "epoch": 1.4445628997867803, + "grad_norm": 0.37600301998098723, + "learning_rate": 7.161368025302622e-05, + "loss": 0.3931, + "step": 1355 + }, + { + "epoch": 1.4456289978678039, + "grad_norm": 0.43884431846324246, + "learning_rate": 7.159543177609648e-05, + "loss": 0.4019, + "step": 1356 + }, + { + "epoch": 1.4466950959488272, + "grad_norm": 0.4943768560008293, + "learning_rate": 7.15771657969396e-05, + "loss": 0.3918, + "step": 1357 + }, + { + "epoch": 1.4477611940298507, + "grad_norm": 0.5078112364255435, + "learning_rate": 7.155888232567396e-05, + "loss": 0.3931, + "step": 1358 + }, + { + "epoch": 1.4488272921108742, + "grad_norm": 0.5262670681715143, + "learning_rate": 7.15405813724277e-05, + "loss": 0.3901, + "step": 1359 + }, + { + "epoch": 1.4498933901918978, + "grad_norm": 0.44931799250014814, + "learning_rate": 7.152226294733857e-05, + "loss": 0.3928, + "step": 1360 + }, + { + "epoch": 1.450959488272921, + "grad_norm": 0.3766185113949834, + "learning_rate": 7.150392706055401e-05, + "loss": 0.3888, + "step": 1361 + }, + { + "epoch": 1.4520255863539446, + "grad_norm": 0.31545132670855125, + "learning_rate": 7.148557372223118e-05, + "loss": 0.3917, + "step": 1362 + }, + { + "epoch": 1.453091684434968, + "grad_norm": 0.4218667315766686, + "learning_rate": 7.146720294253687e-05, + "loss": 0.3958, + "step": 1363 + }, + { + "epoch": 1.4541577825159915, + "grad_norm": 0.5982279310643812, + "learning_rate": 7.14488147316475e-05, + "loss": 0.3923, + "step": 1364 + }, + { + "epoch": 1.455223880597015, + "grad_norm": 0.6848592048351014, + "learning_rate": 7.143040909974923e-05, + "loss": 0.3984, + "step": 1365 + }, + { + "epoch": 1.4562899786780383, + "grad_norm": 0.6819349249907524, + "learning_rate": 7.14119860570378e-05, + "loss": 0.3939, + "step": 1366 + }, + { + "epoch": 1.4573560767590619, + "grad_norm": 0.6440204606115185, + "learning_rate": 7.139354561371863e-05, + "loss": 0.3997, + "step": 1367 + }, + { + "epoch": 1.4584221748400852, + "grad_norm": 0.630295208498841, + "learning_rate": 7.137508778000676e-05, + "loss": 0.3948, + "step": 1368 + }, + { + "epoch": 1.4594882729211087, + "grad_norm": 0.5314317110342841, + "learning_rate": 7.135661256612688e-05, + "loss": 0.3889, + "step": 1369 + }, + { + "epoch": 1.4605543710021323, + "grad_norm": 0.34981486673823153, + "learning_rate": 7.133811998231327e-05, + "loss": 0.3935, + "step": 1370 + }, + { + "epoch": 1.4616204690831556, + "grad_norm": 0.35709223587817546, + "learning_rate": 7.131961003880989e-05, + "loss": 0.3934, + "step": 1371 + }, + { + "epoch": 1.462686567164179, + "grad_norm": 0.45371546795069073, + "learning_rate": 7.130108274587027e-05, + "loss": 0.3968, + "step": 1372 + }, + { + "epoch": 1.4637526652452024, + "grad_norm": 0.5253418087756354, + "learning_rate": 7.128253811375759e-05, + "loss": 0.394, + "step": 1373 + }, + { + "epoch": 1.464818763326226, + "grad_norm": 0.4991498770169075, + "learning_rate": 7.126397615274459e-05, + "loss": 0.3846, + "step": 1374 + }, + { + "epoch": 1.4658848614072495, + "grad_norm": 0.49528918882508394, + "learning_rate": 7.124539687311362e-05, + "loss": 0.3945, + "step": 1375 + }, + { + "epoch": 1.466950959488273, + "grad_norm": 0.5068485592396924, + "learning_rate": 7.122680028515668e-05, + "loss": 0.3959, + "step": 1376 + }, + { + "epoch": 1.4680170575692963, + "grad_norm": 0.5465623764731636, + "learning_rate": 7.120818639917527e-05, + "loss": 0.3983, + "step": 1377 + }, + { + "epoch": 1.4690831556503199, + "grad_norm": 0.6180418289803238, + "learning_rate": 7.118955522548053e-05, + "loss": 0.3906, + "step": 1378 + }, + { + "epoch": 1.4701492537313432, + "grad_norm": 0.6374392706839199, + "learning_rate": 7.117090677439317e-05, + "loss": 0.4016, + "step": 1379 + }, + { + "epoch": 1.4712153518123667, + "grad_norm": 0.5780281794791482, + "learning_rate": 7.115224105624346e-05, + "loss": 0.3891, + "step": 1380 + }, + { + "epoch": 1.4722814498933903, + "grad_norm": 0.5215481763118731, + "learning_rate": 7.113355808137122e-05, + "loss": 0.3937, + "step": 1381 + }, + { + "epoch": 1.4733475479744136, + "grad_norm": 0.5508793693931563, + "learning_rate": 7.111485786012588e-05, + "loss": 0.3993, + "step": 1382 + }, + { + "epoch": 1.4744136460554371, + "grad_norm": 0.6235158688200941, + "learning_rate": 7.109614040286636e-05, + "loss": 0.3909, + "step": 1383 + }, + { + "epoch": 1.4754797441364604, + "grad_norm": 0.6200848574811243, + "learning_rate": 7.107740571996118e-05, + "loss": 0.4002, + "step": 1384 + }, + { + "epoch": 1.476545842217484, + "grad_norm": 0.5904012350207049, + "learning_rate": 7.105865382178836e-05, + "loss": 0.3934, + "step": 1385 + }, + { + "epoch": 1.4776119402985075, + "grad_norm": 0.5289964058379455, + "learning_rate": 7.10398847187355e-05, + "loss": 0.3914, + "step": 1386 + }, + { + "epoch": 1.478678038379531, + "grad_norm": 0.443464050960444, + "learning_rate": 7.10210984211997e-05, + "loss": 0.392, + "step": 1387 + }, + { + "epoch": 1.4797441364605544, + "grad_norm": 0.4427955685061585, + "learning_rate": 7.100229493958757e-05, + "loss": 0.3969, + "step": 1388 + }, + { + "epoch": 1.480810234541578, + "grad_norm": 0.46393678647424846, + "learning_rate": 7.09834742843153e-05, + "loss": 0.3934, + "step": 1389 + }, + { + "epoch": 1.4818763326226012, + "grad_norm": 0.39212911960500413, + "learning_rate": 7.096463646580853e-05, + "loss": 0.3969, + "step": 1390 + }, + { + "epoch": 1.4829424307036247, + "grad_norm": 0.31518826807678046, + "learning_rate": 7.094578149450243e-05, + "loss": 0.3965, + "step": 1391 + }, + { + "epoch": 1.4840085287846483, + "grad_norm": 0.35137446772893605, + "learning_rate": 7.092690938084168e-05, + "loss": 0.394, + "step": 1392 + }, + { + "epoch": 1.4850746268656716, + "grad_norm": 0.43189084089863594, + "learning_rate": 7.090802013528047e-05, + "loss": 0.3907, + "step": 1393 + }, + { + "epoch": 1.4861407249466951, + "grad_norm": 0.4502661111740289, + "learning_rate": 7.088911376828241e-05, + "loss": 0.3946, + "step": 1394 + }, + { + "epoch": 1.4872068230277184, + "grad_norm": 0.44527023350907197, + "learning_rate": 7.087019029032071e-05, + "loss": 0.3977, + "step": 1395 + }, + { + "epoch": 1.488272921108742, + "grad_norm": 0.45794515563438, + "learning_rate": 7.085124971187794e-05, + "loss": 0.3974, + "step": 1396 + }, + { + "epoch": 1.4893390191897655, + "grad_norm": 0.46227336496153126, + "learning_rate": 7.083229204344623e-05, + "loss": 0.3861, + "step": 1397 + }, + { + "epoch": 1.4904051172707888, + "grad_norm": 0.4503424372012637, + "learning_rate": 7.081331729552712e-05, + "loss": 0.3951, + "step": 1398 + }, + { + "epoch": 1.4914712153518124, + "grad_norm": 0.4826116426207179, + "learning_rate": 7.079432547863164e-05, + "loss": 0.3883, + "step": 1399 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.526692112887762, + "learning_rate": 7.077531660328028e-05, + "loss": 0.3908, + "step": 1400 + }, + { + "epoch": 1.4936034115138592, + "grad_norm": 0.4637243917340167, + "learning_rate": 7.075629068000297e-05, + "loss": 0.3939, + "step": 1401 + }, + { + "epoch": 1.4946695095948828, + "grad_norm": 0.4348762733252528, + "learning_rate": 7.073724771933906e-05, + "loss": 0.3859, + "step": 1402 + }, + { + "epoch": 1.4957356076759063, + "grad_norm": 0.4277816665773673, + "learning_rate": 7.071818773183738e-05, + "loss": 0.3969, + "step": 1403 + }, + { + "epoch": 1.4968017057569296, + "grad_norm": 0.4767523309718055, + "learning_rate": 7.069911072805618e-05, + "loss": 0.3918, + "step": 1404 + }, + { + "epoch": 1.4978678038379531, + "grad_norm": 0.4905037191035888, + "learning_rate": 7.068001671856309e-05, + "loss": 0.3906, + "step": 1405 + }, + { + "epoch": 1.4989339019189765, + "grad_norm": 0.4362857431277552, + "learning_rate": 7.066090571393524e-05, + "loss": 0.3909, + "step": 1406 + }, + { + "epoch": 1.5, + "grad_norm": 0.3290996660217568, + "learning_rate": 7.064177772475912e-05, + "loss": 0.3976, + "step": 1407 + }, + { + "epoch": 1.5010660980810235, + "grad_norm": 0.3744727365069323, + "learning_rate": 7.062263276163064e-05, + "loss": 0.3962, + "step": 1408 + }, + { + "epoch": 1.502132196162047, + "grad_norm": 0.5302924684211919, + "learning_rate": 7.060347083515511e-05, + "loss": 0.3847, + "step": 1409 + }, + { + "epoch": 1.5031982942430704, + "grad_norm": 0.636023047266614, + "learning_rate": 7.058429195594727e-05, + "loss": 0.3952, + "step": 1410 + }, + { + "epoch": 1.5042643923240937, + "grad_norm": 0.5944045338529551, + "learning_rate": 7.056509613463118e-05, + "loss": 0.3901, + "step": 1411 + }, + { + "epoch": 1.5053304904051172, + "grad_norm": 0.4426101595482755, + "learning_rate": 7.054588338184034e-05, + "loss": 0.3949, + "step": 1412 + }, + { + "epoch": 1.5063965884861408, + "grad_norm": 0.3582450742335668, + "learning_rate": 7.052665370821764e-05, + "loss": 0.3882, + "step": 1413 + }, + { + "epoch": 1.5074626865671643, + "grad_norm": 0.3954777880467866, + "learning_rate": 7.050740712441528e-05, + "loss": 0.3972, + "step": 1414 + }, + { + "epoch": 1.5085287846481876, + "grad_norm": 0.4416647029712508, + "learning_rate": 7.04881436410949e-05, + "loss": 0.39, + "step": 1415 + }, + { + "epoch": 1.509594882729211, + "grad_norm": 0.44707077361093805, + "learning_rate": 7.046886326892747e-05, + "loss": 0.3855, + "step": 1416 + }, + { + "epoch": 1.5106609808102345, + "grad_norm": 0.40402387918207455, + "learning_rate": 7.044956601859329e-05, + "loss": 0.3928, + "step": 1417 + }, + { + "epoch": 1.511727078891258, + "grad_norm": 0.31989191868470707, + "learning_rate": 7.043025190078205e-05, + "loss": 0.3919, + "step": 1418 + }, + { + "epoch": 1.5127931769722816, + "grad_norm": 0.2810822260181101, + "learning_rate": 7.041092092619277e-05, + "loss": 0.3852, + "step": 1419 + }, + { + "epoch": 1.5138592750533049, + "grad_norm": 0.38427721507681256, + "learning_rate": 7.039157310553378e-05, + "loss": 0.3975, + "step": 1420 + }, + { + "epoch": 1.5149253731343284, + "grad_norm": 0.43787056804713514, + "learning_rate": 7.03722084495228e-05, + "loss": 0.3986, + "step": 1421 + }, + { + "epoch": 1.5159914712153517, + "grad_norm": 0.39271707463092426, + "learning_rate": 7.035282696888684e-05, + "loss": 0.393, + "step": 1422 + }, + { + "epoch": 1.5170575692963753, + "grad_norm": 0.41954871590765713, + "learning_rate": 7.033342867436221e-05, + "loss": 0.3979, + "step": 1423 + }, + { + "epoch": 1.5181236673773988, + "grad_norm": 0.488489761777447, + "learning_rate": 7.031401357669456e-05, + "loss": 0.3881, + "step": 1424 + }, + { + "epoch": 1.5191897654584223, + "grad_norm": 0.5515408415736078, + "learning_rate": 7.029458168663887e-05, + "loss": 0.3862, + "step": 1425 + }, + { + "epoch": 1.5202558635394456, + "grad_norm": 0.6316116354397406, + "learning_rate": 7.027513301495937e-05, + "loss": 0.3889, + "step": 1426 + }, + { + "epoch": 1.521321961620469, + "grad_norm": 0.6866508216670858, + "learning_rate": 7.025566757242962e-05, + "loss": 0.3927, + "step": 1427 + }, + { + "epoch": 1.5223880597014925, + "grad_norm": 0.6861740365624607, + "learning_rate": 7.023618536983249e-05, + "loss": 0.3976, + "step": 1428 + }, + { + "epoch": 1.523454157782516, + "grad_norm": 0.6832689017816802, + "learning_rate": 7.021668641796008e-05, + "loss": 0.3935, + "step": 1429 + }, + { + "epoch": 1.5245202558635396, + "grad_norm": 0.678454190085777, + "learning_rate": 7.019717072761377e-05, + "loss": 0.3879, + "step": 1430 + }, + { + "epoch": 1.5255863539445629, + "grad_norm": 0.6514965485012345, + "learning_rate": 7.01776383096043e-05, + "loss": 0.3917, + "step": 1431 + }, + { + "epoch": 1.5266524520255862, + "grad_norm": 0.5811169015448752, + "learning_rate": 7.01580891747516e-05, + "loss": 0.3942, + "step": 1432 + }, + { + "epoch": 1.5277185501066097, + "grad_norm": 0.569097620467461, + "learning_rate": 7.013852333388483e-05, + "loss": 0.3955, + "step": 1433 + }, + { + "epoch": 1.5287846481876333, + "grad_norm": 0.4808827995992214, + "learning_rate": 7.011894079784248e-05, + "loss": 0.3908, + "step": 1434 + }, + { + "epoch": 1.5298507462686568, + "grad_norm": 0.3656185727390282, + "learning_rate": 7.009934157747227e-05, + "loss": 0.3946, + "step": 1435 + }, + { + "epoch": 1.5309168443496801, + "grad_norm": 0.5145348764394406, + "learning_rate": 7.007972568363112e-05, + "loss": 0.3882, + "step": 1436 + }, + { + "epoch": 1.5319829424307037, + "grad_norm": 0.6632114181696835, + "learning_rate": 7.006009312718525e-05, + "loss": 0.3912, + "step": 1437 + }, + { + "epoch": 1.533049040511727, + "grad_norm": 0.6714409788868576, + "learning_rate": 7.004044391901005e-05, + "loss": 0.3946, + "step": 1438 + }, + { + "epoch": 1.5341151385927505, + "grad_norm": 0.5938831067807631, + "learning_rate": 7.002077806999016e-05, + "loss": 0.3911, + "step": 1439 + }, + { + "epoch": 1.535181236673774, + "grad_norm": 0.5658150891169751, + "learning_rate": 7.000109559101944e-05, + "loss": 0.39, + "step": 1440 + }, + { + "epoch": 1.5362473347547976, + "grad_norm": 0.5719079089059944, + "learning_rate": 6.998139649300097e-05, + "loss": 0.3963, + "step": 1441 + }, + { + "epoch": 1.537313432835821, + "grad_norm": 0.4711947234722605, + "learning_rate": 6.996168078684702e-05, + "loss": 0.3909, + "step": 1442 + }, + { + "epoch": 1.5383795309168442, + "grad_norm": 0.4358846322039788, + "learning_rate": 6.994194848347908e-05, + "loss": 0.3933, + "step": 1443 + }, + { + "epoch": 1.5394456289978677, + "grad_norm": 0.6131004479680616, + "learning_rate": 6.99221995938278e-05, + "loss": 0.3995, + "step": 1444 + }, + { + "epoch": 1.5405117270788913, + "grad_norm": 0.6962404795880364, + "learning_rate": 6.990243412883304e-05, + "loss": 0.3925, + "step": 1445 + }, + { + "epoch": 1.5415778251599148, + "grad_norm": 0.6694951018990504, + "learning_rate": 6.988265209944387e-05, + "loss": 0.3892, + "step": 1446 + }, + { + "epoch": 1.5426439232409381, + "grad_norm": 0.5156094912368259, + "learning_rate": 6.986285351661847e-05, + "loss": 0.3892, + "step": 1447 + }, + { + "epoch": 1.5437100213219617, + "grad_norm": 0.41423025778526606, + "learning_rate": 6.984303839132425e-05, + "loss": 0.3964, + "step": 1448 + }, + { + "epoch": 1.544776119402985, + "grad_norm": 0.3696131621291919, + "learning_rate": 6.982320673453773e-05, + "loss": 0.3905, + "step": 1449 + }, + { + "epoch": 1.5458422174840085, + "grad_norm": 0.37566704365434295, + "learning_rate": 6.980335855724465e-05, + "loss": 0.3898, + "step": 1450 + }, + { + "epoch": 1.546908315565032, + "grad_norm": 0.44656110415033884, + "learning_rate": 6.978349387043986e-05, + "loss": 0.3954, + "step": 1451 + }, + { + "epoch": 1.5479744136460556, + "grad_norm": 0.5097702199954142, + "learning_rate": 6.976361268512735e-05, + "loss": 0.3916, + "step": 1452 + }, + { + "epoch": 1.549040511727079, + "grad_norm": 0.6115774145379382, + "learning_rate": 6.974371501232027e-05, + "loss": 0.4001, + "step": 1453 + }, + { + "epoch": 1.5501066098081022, + "grad_norm": 0.6298592763126765, + "learning_rate": 6.97238008630409e-05, + "loss": 0.3913, + "step": 1454 + }, + { + "epoch": 1.5511727078891258, + "grad_norm": 0.5640252014215751, + "learning_rate": 6.970387024832066e-05, + "loss": 0.3972, + "step": 1455 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 0.5637434287286831, + "learning_rate": 6.968392317920005e-05, + "loss": 0.3914, + "step": 1456 + }, + { + "epoch": 1.5533049040511728, + "grad_norm": 0.5758115244979767, + "learning_rate": 6.96639596667287e-05, + "loss": 0.3888, + "step": 1457 + }, + { + "epoch": 1.5543710021321961, + "grad_norm": 0.579669625426786, + "learning_rate": 6.96439797219654e-05, + "loss": 0.3964, + "step": 1458 + }, + { + "epoch": 1.5554371002132195, + "grad_norm": 0.5774000389217503, + "learning_rate": 6.962398335597798e-05, + "loss": 0.391, + "step": 1459 + }, + { + "epoch": 1.556503198294243, + "grad_norm": 0.5665645013725628, + "learning_rate": 6.960397057984336e-05, + "loss": 0.3925, + "step": 1460 + }, + { + "epoch": 1.5575692963752665, + "grad_norm": 0.5243134790028094, + "learning_rate": 6.958394140464761e-05, + "loss": 0.3931, + "step": 1461 + }, + { + "epoch": 1.55863539445629, + "grad_norm": 0.5072252071927075, + "learning_rate": 6.956389584148586e-05, + "loss": 0.3874, + "step": 1462 + }, + { + "epoch": 1.5597014925373134, + "grad_norm": 0.42649762130370344, + "learning_rate": 6.954383390146228e-05, + "loss": 0.3902, + "step": 1463 + }, + { + "epoch": 1.560767590618337, + "grad_norm": 0.33264595497577854, + "learning_rate": 6.952375559569016e-05, + "loss": 0.3863, + "step": 1464 + }, + { + "epoch": 1.5618336886993602, + "grad_norm": 0.3759252125909897, + "learning_rate": 6.950366093529184e-05, + "loss": 0.393, + "step": 1465 + }, + { + "epoch": 1.5628997867803838, + "grad_norm": 0.4265770921364556, + "learning_rate": 6.94835499313987e-05, + "loss": 0.3852, + "step": 1466 + }, + { + "epoch": 1.5639658848614073, + "grad_norm": 0.4579903850772894, + "learning_rate": 6.946342259515122e-05, + "loss": 0.3911, + "step": 1467 + }, + { + "epoch": 1.5650319829424308, + "grad_norm": 0.5479175031792544, + "learning_rate": 6.944327893769887e-05, + "loss": 0.3962, + "step": 1468 + }, + { + "epoch": 1.5660980810234542, + "grad_norm": 0.6055176521095669, + "learning_rate": 6.94231189702002e-05, + "loss": 0.3864, + "step": 1469 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.6727380474108113, + "learning_rate": 6.940294270382278e-05, + "loss": 0.3921, + "step": 1470 + }, + { + "epoch": 1.568230277185501, + "grad_norm": 0.6356621826960686, + "learning_rate": 6.938275014974323e-05, + "loss": 0.3947, + "step": 1471 + }, + { + "epoch": 1.5692963752665245, + "grad_norm": 0.5243747711823524, + "learning_rate": 6.936254131914717e-05, + "loss": 0.3959, + "step": 1472 + }, + { + "epoch": 1.570362473347548, + "grad_norm": 0.46811482341260074, + "learning_rate": 6.934231622322923e-05, + "loss": 0.3904, + "step": 1473 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.3465924585456931, + "learning_rate": 6.932207487319305e-05, + "loss": 0.393, + "step": 1474 + }, + { + "epoch": 1.572494669509595, + "grad_norm": 0.3025626410259389, + "learning_rate": 6.930181728025133e-05, + "loss": 0.3925, + "step": 1475 + }, + { + "epoch": 1.5735607675906182, + "grad_norm": 0.391223961099086, + "learning_rate": 6.928154345562569e-05, + "loss": 0.39, + "step": 1476 + }, + { + "epoch": 1.5746268656716418, + "grad_norm": 0.41487388189624425, + "learning_rate": 6.926125341054676e-05, + "loss": 0.3924, + "step": 1477 + }, + { + "epoch": 1.5756929637526653, + "grad_norm": 0.45859115746381557, + "learning_rate": 6.92409471562542e-05, + "loss": 0.3948, + "step": 1478 + }, + { + "epoch": 1.5767590618336889, + "grad_norm": 0.5136700870287039, + "learning_rate": 6.922062470399663e-05, + "loss": 0.3861, + "step": 1479 + }, + { + "epoch": 1.5778251599147122, + "grad_norm": 0.5017752722816414, + "learning_rate": 6.920028606503161e-05, + "loss": 0.3912, + "step": 1480 + }, + { + "epoch": 1.5788912579957355, + "grad_norm": 0.46815074536424645, + "learning_rate": 6.91799312506257e-05, + "loss": 0.3923, + "step": 1481 + }, + { + "epoch": 1.579957356076759, + "grad_norm": 0.4683376068962054, + "learning_rate": 6.915956027205438e-05, + "loss": 0.3891, + "step": 1482 + }, + { + "epoch": 1.5810234541577826, + "grad_norm": 0.3838678721557934, + "learning_rate": 6.913917314060215e-05, + "loss": 0.3884, + "step": 1483 + }, + { + "epoch": 1.582089552238806, + "grad_norm": 0.28512465563571143, + "learning_rate": 6.911876986756241e-05, + "loss": 0.3946, + "step": 1484 + }, + { + "epoch": 1.5831556503198294, + "grad_norm": 0.2798136165240047, + "learning_rate": 6.90983504642375e-05, + "loss": 0.3925, + "step": 1485 + }, + { + "epoch": 1.5842217484008527, + "grad_norm": 0.32595231025368043, + "learning_rate": 6.907791494193873e-05, + "loss": 0.3933, + "step": 1486 + }, + { + "epoch": 1.5852878464818763, + "grad_norm": 0.33263930837599737, + "learning_rate": 6.905746331198631e-05, + "loss": 0.3882, + "step": 1487 + }, + { + "epoch": 1.5863539445628998, + "grad_norm": 0.3001000433354499, + "learning_rate": 6.903699558570935e-05, + "loss": 0.3944, + "step": 1488 + }, + { + "epoch": 1.5874200426439233, + "grad_norm": 0.33691283404723554, + "learning_rate": 6.901651177444596e-05, + "loss": 0.3901, + "step": 1489 + }, + { + "epoch": 1.5884861407249466, + "grad_norm": 0.36842928704561506, + "learning_rate": 6.899601188954306e-05, + "loss": 0.3891, + "step": 1490 + }, + { + "epoch": 1.5895522388059702, + "grad_norm": 0.37394858126001956, + "learning_rate": 6.897549594235654e-05, + "loss": 0.3964, + "step": 1491 + }, + { + "epoch": 1.5906183368869935, + "grad_norm": 0.3288296151355204, + "learning_rate": 6.895496394425118e-05, + "loss": 0.3894, + "step": 1492 + }, + { + "epoch": 1.591684434968017, + "grad_norm": 0.33186383529359254, + "learning_rate": 6.893441590660064e-05, + "loss": 0.3964, + "step": 1493 + }, + { + "epoch": 1.5927505330490406, + "grad_norm": 0.32170536578140646, + "learning_rate": 6.891385184078744e-05, + "loss": 0.3892, + "step": 1494 + }, + { + "epoch": 1.593816631130064, + "grad_norm": 0.34823720165895466, + "learning_rate": 6.889327175820302e-05, + "loss": 0.3912, + "step": 1495 + }, + { + "epoch": 1.5948827292110874, + "grad_norm": 0.36827171478815857, + "learning_rate": 6.887267567024767e-05, + "loss": 0.3877, + "step": 1496 + }, + { + "epoch": 1.5959488272921107, + "grad_norm": 0.324329055251669, + "learning_rate": 6.885206358833056e-05, + "loss": 0.3907, + "step": 1497 + }, + { + "epoch": 1.5970149253731343, + "grad_norm": 0.3724585692258156, + "learning_rate": 6.883143552386971e-05, + "loss": 0.3882, + "step": 1498 + }, + { + "epoch": 1.5980810234541578, + "grad_norm": 0.3726849206910898, + "learning_rate": 6.881079148829198e-05, + "loss": 0.3905, + "step": 1499 + }, + { + "epoch": 1.5991471215351813, + "grad_norm": 0.37495710080763744, + "learning_rate": 6.879013149303312e-05, + "loss": 0.3935, + "step": 1500 + }, + { + "epoch": 1.6002132196162047, + "grad_norm": 0.5021479834874241, + "learning_rate": 6.876945554953767e-05, + "loss": 0.3925, + "step": 1501 + }, + { + "epoch": 1.6012793176972282, + "grad_norm": 0.6598112081849651, + "learning_rate": 6.874876366925904e-05, + "loss": 0.3917, + "step": 1502 + }, + { + "epoch": 1.6023454157782515, + "grad_norm": 0.7759163516332811, + "learning_rate": 6.872805586365947e-05, + "loss": 0.3885, + "step": 1503 + }, + { + "epoch": 1.603411513859275, + "grad_norm": 0.8164276907570536, + "learning_rate": 6.870733214420998e-05, + "loss": 0.3928, + "step": 1504 + }, + { + "epoch": 1.6044776119402986, + "grad_norm": 0.8744103848461371, + "learning_rate": 6.868659252239045e-05, + "loss": 0.394, + "step": 1505 + }, + { + "epoch": 1.6055437100213221, + "grad_norm": 0.8615414040135513, + "learning_rate": 6.866583700968954e-05, + "loss": 0.3838, + "step": 1506 + }, + { + "epoch": 1.6066098081023454, + "grad_norm": 0.7600225280740014, + "learning_rate": 6.864506561760474e-05, + "loss": 0.3963, + "step": 1507 + }, + { + "epoch": 1.6076759061833688, + "grad_norm": 0.5613832665703143, + "learning_rate": 6.862427835764231e-05, + "loss": 0.3995, + "step": 1508 + }, + { + "epoch": 1.6087420042643923, + "grad_norm": 0.3881251295902559, + "learning_rate": 6.860347524131733e-05, + "loss": 0.3852, + "step": 1509 + }, + { + "epoch": 1.6098081023454158, + "grad_norm": 0.3100627174569291, + "learning_rate": 6.858265628015362e-05, + "loss": 0.3914, + "step": 1510 + }, + { + "epoch": 1.6108742004264394, + "grad_norm": 0.3947566702558035, + "learning_rate": 6.856182148568382e-05, + "loss": 0.3961, + "step": 1511 + }, + { + "epoch": 1.6119402985074627, + "grad_norm": 0.49178390954875206, + "learning_rate": 6.854097086944932e-05, + "loss": 0.3941, + "step": 1512 + }, + { + "epoch": 1.613006396588486, + "grad_norm": 0.5414001966948181, + "learning_rate": 6.852010444300028e-05, + "loss": 0.3926, + "step": 1513 + }, + { + "epoch": 1.6140724946695095, + "grad_norm": 0.49531130512745414, + "learning_rate": 6.849922221789561e-05, + "loss": 0.3915, + "step": 1514 + }, + { + "epoch": 1.615138592750533, + "grad_norm": 0.41543603344069213, + "learning_rate": 6.847832420570298e-05, + "loss": 0.3883, + "step": 1515 + }, + { + "epoch": 1.6162046908315566, + "grad_norm": 0.33242146656919136, + "learning_rate": 6.84574104179988e-05, + "loss": 0.3948, + "step": 1516 + }, + { + "epoch": 1.61727078891258, + "grad_norm": 0.3892193202210671, + "learning_rate": 6.843648086636822e-05, + "loss": 0.3943, + "step": 1517 + }, + { + "epoch": 1.6183368869936035, + "grad_norm": 0.5073950555317511, + "learning_rate": 6.841553556240514e-05, + "loss": 0.3899, + "step": 1518 + }, + { + "epoch": 1.6194029850746268, + "grad_norm": 0.543311094919119, + "learning_rate": 6.839457451771214e-05, + "loss": 0.3915, + "step": 1519 + }, + { + "epoch": 1.6204690831556503, + "grad_norm": 0.5283833705504892, + "learning_rate": 6.837359774390058e-05, + "loss": 0.3874, + "step": 1520 + }, + { + "epoch": 1.6215351812366738, + "grad_norm": 0.5077032134792163, + "learning_rate": 6.835260525259048e-05, + "loss": 0.3904, + "step": 1521 + }, + { + "epoch": 1.6226012793176974, + "grad_norm": 0.5106764897990909, + "learning_rate": 6.83315970554106e-05, + "loss": 0.3898, + "step": 1522 + }, + { + "epoch": 1.6236673773987207, + "grad_norm": 0.5286416559892751, + "learning_rate": 6.831057316399839e-05, + "loss": 0.3886, + "step": 1523 + }, + { + "epoch": 1.624733475479744, + "grad_norm": 0.409097737243627, + "learning_rate": 6.828953358999998e-05, + "loss": 0.3899, + "step": 1524 + }, + { + "epoch": 1.6257995735607675, + "grad_norm": 0.3213269166849657, + "learning_rate": 6.826847834507024e-05, + "loss": 0.3892, + "step": 1525 + }, + { + "epoch": 1.626865671641791, + "grad_norm": 0.3126601387070219, + "learning_rate": 6.824740744087262e-05, + "loss": 0.3861, + "step": 1526 + }, + { + "epoch": 1.6279317697228146, + "grad_norm": 0.39308792247161223, + "learning_rate": 6.822632088907937e-05, + "loss": 0.3912, + "step": 1527 + }, + { + "epoch": 1.628997867803838, + "grad_norm": 0.48501314817610786, + "learning_rate": 6.820521870137129e-05, + "loss": 0.3901, + "step": 1528 + }, + { + "epoch": 1.6300639658848612, + "grad_norm": 0.5074473205522086, + "learning_rate": 6.818410088943791e-05, + "loss": 0.391, + "step": 1529 + }, + { + "epoch": 1.6311300639658848, + "grad_norm": 0.5821772106338331, + "learning_rate": 6.816296746497744e-05, + "loss": 0.3868, + "step": 1530 + }, + { + "epoch": 1.6321961620469083, + "grad_norm": 0.6719640615154807, + "learning_rate": 6.814181843969664e-05, + "loss": 0.3974, + "step": 1531 + }, + { + "epoch": 1.6332622601279319, + "grad_norm": 0.599159712940155, + "learning_rate": 6.812065382531101e-05, + "loss": 0.391, + "step": 1532 + }, + { + "epoch": 1.6343283582089554, + "grad_norm": 0.5406155214874598, + "learning_rate": 6.809947363354464e-05, + "loss": 0.3907, + "step": 1533 + }, + { + "epoch": 1.6353944562899787, + "grad_norm": 0.5404737249091326, + "learning_rate": 6.807827787613024e-05, + "loss": 0.3933, + "step": 1534 + }, + { + "epoch": 1.636460554371002, + "grad_norm": 0.5080707037084039, + "learning_rate": 6.805706656480917e-05, + "loss": 0.3802, + "step": 1535 + }, + { + "epoch": 1.6375266524520256, + "grad_norm": 0.45278920241628184, + "learning_rate": 6.803583971133139e-05, + "loss": 0.3965, + "step": 1536 + }, + { + "epoch": 1.638592750533049, + "grad_norm": 0.46161933055133486, + "learning_rate": 6.801459732745547e-05, + "loss": 0.4023, + "step": 1537 + }, + { + "epoch": 1.6396588486140726, + "grad_norm": 0.5047213330551431, + "learning_rate": 6.799333942494861e-05, + "loss": 0.386, + "step": 1538 + }, + { + "epoch": 1.640724946695096, + "grad_norm": 0.4450247054157199, + "learning_rate": 6.797206601558654e-05, + "loss": 0.3933, + "step": 1539 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.3955698230219666, + "learning_rate": 6.795077711115368e-05, + "loss": 0.3941, + "step": 1540 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.4975244131254161, + "learning_rate": 6.792947272344292e-05, + "loss": 0.3985, + "step": 1541 + }, + { + "epoch": 1.6439232409381663, + "grad_norm": 0.4201752241177451, + "learning_rate": 6.790815286425581e-05, + "loss": 0.3899, + "step": 1542 + }, + { + "epoch": 1.6449893390191899, + "grad_norm": 0.3070985227762259, + "learning_rate": 6.788681754540245e-05, + "loss": 0.3931, + "step": 1543 + }, + { + "epoch": 1.6460554371002132, + "grad_norm": 0.40901085056872827, + "learning_rate": 6.78654667787015e-05, + "loss": 0.3911, + "step": 1544 + }, + { + "epoch": 1.6471215351812367, + "grad_norm": 0.4407517158255465, + "learning_rate": 6.784410057598016e-05, + "loss": 0.3888, + "step": 1545 + }, + { + "epoch": 1.64818763326226, + "grad_norm": 0.3680848288610597, + "learning_rate": 6.782271894907419e-05, + "loss": 0.3867, + "step": 1546 + }, + { + "epoch": 1.6492537313432836, + "grad_norm": 0.38558182531740504, + "learning_rate": 6.78013219098279e-05, + "loss": 0.3873, + "step": 1547 + }, + { + "epoch": 1.650319829424307, + "grad_norm": 0.43293933049711597, + "learning_rate": 6.777990947009418e-05, + "loss": 0.3876, + "step": 1548 + }, + { + "epoch": 1.6513859275053306, + "grad_norm": 0.5130367741977939, + "learning_rate": 6.775848164173436e-05, + "loss": 0.3971, + "step": 1549 + }, + { + "epoch": 1.652452025586354, + "grad_norm": 0.5953290932568412, + "learning_rate": 6.773703843661837e-05, + "loss": 0.3919, + "step": 1550 + }, + { + "epoch": 1.6535181236673773, + "grad_norm": 0.603488942008407, + "learning_rate": 6.771557986662462e-05, + "loss": 0.39, + "step": 1551 + }, + { + "epoch": 1.6545842217484008, + "grad_norm": 0.5050491129848195, + "learning_rate": 6.769410594364004e-05, + "loss": 0.3875, + "step": 1552 + }, + { + "epoch": 1.6556503198294243, + "grad_norm": 0.4097987145826878, + "learning_rate": 6.767261667956009e-05, + "loss": 0.3923, + "step": 1553 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 0.38259958335783656, + "learning_rate": 6.765111208628866e-05, + "loss": 0.394, + "step": 1554 + }, + { + "epoch": 1.6577825159914712, + "grad_norm": 0.36285793649423737, + "learning_rate": 6.762959217573823e-05, + "loss": 0.3902, + "step": 1555 + }, + { + "epoch": 1.6588486140724945, + "grad_norm": 0.38527741858086, + "learning_rate": 6.760805695982967e-05, + "loss": 0.3934, + "step": 1556 + }, + { + "epoch": 1.659914712153518, + "grad_norm": 0.43180049398669657, + "learning_rate": 6.75865064504924e-05, + "loss": 0.3948, + "step": 1557 + }, + { + "epoch": 1.6609808102345416, + "grad_norm": 0.4605936531276027, + "learning_rate": 6.756494065966426e-05, + "loss": 0.3853, + "step": 1558 + }, + { + "epoch": 1.6620469083155651, + "grad_norm": 0.5637754400106197, + "learning_rate": 6.754335959929159e-05, + "loss": 0.3906, + "step": 1559 + }, + { + "epoch": 1.6631130063965884, + "grad_norm": 0.6706100285105229, + "learning_rate": 6.752176328132918e-05, + "loss": 0.3896, + "step": 1560 + }, + { + "epoch": 1.664179104477612, + "grad_norm": 0.6676286093604591, + "learning_rate": 6.750015171774025e-05, + "loss": 0.3916, + "step": 1561 + }, + { + "epoch": 1.6652452025586353, + "grad_norm": 0.5819487463144472, + "learning_rate": 6.747852492049648e-05, + "loss": 0.3865, + "step": 1562 + }, + { + "epoch": 1.6663113006396588, + "grad_norm": 0.4906481414544558, + "learning_rate": 6.745688290157803e-05, + "loss": 0.3901, + "step": 1563 + }, + { + "epoch": 1.6673773987206824, + "grad_norm": 0.43576919495858, + "learning_rate": 6.743522567297341e-05, + "loss": 0.3946, + "step": 1564 + }, + { + "epoch": 1.668443496801706, + "grad_norm": 0.3626934309190754, + "learning_rate": 6.741355324667963e-05, + "loss": 0.3859, + "step": 1565 + }, + { + "epoch": 1.6695095948827292, + "grad_norm": 0.30739074025899915, + "learning_rate": 6.739186563470208e-05, + "loss": 0.3855, + "step": 1566 + }, + { + "epoch": 1.6705756929637525, + "grad_norm": 0.39101458085234175, + "learning_rate": 6.737016284905455e-05, + "loss": 0.3931, + "step": 1567 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 0.5051971466609865, + "learning_rate": 6.734844490175929e-05, + "loss": 0.3881, + "step": 1568 + }, + { + "epoch": 1.6727078891257996, + "grad_norm": 0.5097545702694593, + "learning_rate": 6.732671180484687e-05, + "loss": 0.385, + "step": 1569 + }, + { + "epoch": 1.6737739872068231, + "grad_norm": 0.39153153015185177, + "learning_rate": 6.730496357035634e-05, + "loss": 0.3876, + "step": 1570 + }, + { + "epoch": 1.6748400852878464, + "grad_norm": 0.3456157078554192, + "learning_rate": 6.728320021033509e-05, + "loss": 0.3913, + "step": 1571 + }, + { + "epoch": 1.67590618336887, + "grad_norm": 0.38289300413638977, + "learning_rate": 6.726142173683884e-05, + "loss": 0.3889, + "step": 1572 + }, + { + "epoch": 1.6769722814498933, + "grad_norm": 0.36549754916811916, + "learning_rate": 6.723962816193178e-05, + "loss": 0.389, + "step": 1573 + }, + { + "epoch": 1.6780383795309168, + "grad_norm": 0.33051094759676547, + "learning_rate": 6.721781949768639e-05, + "loss": 0.3919, + "step": 1574 + }, + { + "epoch": 1.6791044776119404, + "grad_norm": 0.36416872005257267, + "learning_rate": 6.719599575618357e-05, + "loss": 0.3912, + "step": 1575 + }, + { + "epoch": 1.680170575692964, + "grad_norm": 0.34559502107142376, + "learning_rate": 6.717415694951251e-05, + "loss": 0.3851, + "step": 1576 + }, + { + "epoch": 1.6812366737739872, + "grad_norm": 0.3116613026962773, + "learning_rate": 6.715230308977078e-05, + "loss": 0.3916, + "step": 1577 + }, + { + "epoch": 1.6823027718550105, + "grad_norm": 0.30050046517095325, + "learning_rate": 6.713043418906428e-05, + "loss": 0.3962, + "step": 1578 + }, + { + "epoch": 1.683368869936034, + "grad_norm": 0.3577516859630259, + "learning_rate": 6.710855025950727e-05, + "loss": 0.3888, + "step": 1579 + }, + { + "epoch": 1.6844349680170576, + "grad_norm": 0.4364133412686821, + "learning_rate": 6.708665131322227e-05, + "loss": 0.3863, + "step": 1580 + }, + { + "epoch": 1.6855010660980811, + "grad_norm": 0.4687325703384506, + "learning_rate": 6.706473736234018e-05, + "loss": 0.3915, + "step": 1581 + }, + { + "epoch": 1.6865671641791045, + "grad_norm": 0.5090267219412645, + "learning_rate": 6.704280841900019e-05, + "loss": 0.3839, + "step": 1582 + }, + { + "epoch": 1.6876332622601278, + "grad_norm": 0.5457964796096376, + "learning_rate": 6.70208644953498e-05, + "loss": 0.3956, + "step": 1583 + }, + { + "epoch": 1.6886993603411513, + "grad_norm": 0.6368745088599855, + "learning_rate": 6.699890560354478e-05, + "loss": 0.3902, + "step": 1584 + }, + { + "epoch": 1.6897654584221748, + "grad_norm": 0.7026042269970657, + "learning_rate": 6.697693175574923e-05, + "loss": 0.3898, + "step": 1585 + }, + { + "epoch": 1.6908315565031984, + "grad_norm": 0.6804973492768571, + "learning_rate": 6.695494296413554e-05, + "loss": 0.3835, + "step": 1586 + }, + { + "epoch": 1.6918976545842217, + "grad_norm": 0.6670494458267645, + "learning_rate": 6.693293924088432e-05, + "loss": 0.3951, + "step": 1587 + }, + { + "epoch": 1.6929637526652452, + "grad_norm": 0.6433701055713933, + "learning_rate": 6.691092059818451e-05, + "loss": 0.3884, + "step": 1588 + }, + { + "epoch": 1.6940298507462686, + "grad_norm": 0.6565301872411362, + "learning_rate": 6.688888704823329e-05, + "loss": 0.3951, + "step": 1589 + }, + { + "epoch": 1.695095948827292, + "grad_norm": 0.6960842723329089, + "learning_rate": 6.686683860323611e-05, + "loss": 0.3946, + "step": 1590 + }, + { + "epoch": 1.6961620469083156, + "grad_norm": 0.6818586763465625, + "learning_rate": 6.684477527540664e-05, + "loss": 0.3946, + "step": 1591 + }, + { + "epoch": 1.6972281449893392, + "grad_norm": 0.5572883382416285, + "learning_rate": 6.682269707696685e-05, + "loss": 0.3867, + "step": 1592 + }, + { + "epoch": 1.6982942430703625, + "grad_norm": 0.3990760403818603, + "learning_rate": 6.680060402014689e-05, + "loss": 0.3854, + "step": 1593 + }, + { + "epoch": 1.6993603411513858, + "grad_norm": 0.4118819183924536, + "learning_rate": 6.677849611718515e-05, + "loss": 0.3864, + "step": 1594 + }, + { + "epoch": 1.7004264392324093, + "grad_norm": 0.49548805198701124, + "learning_rate": 6.67563733803283e-05, + "loss": 0.3974, + "step": 1595 + }, + { + "epoch": 1.7014925373134329, + "grad_norm": 0.5484992450512396, + "learning_rate": 6.673423582183117e-05, + "loss": 0.3895, + "step": 1596 + }, + { + "epoch": 1.7025586353944564, + "grad_norm": 0.5575698301270008, + "learning_rate": 6.67120834539568e-05, + "loss": 0.3877, + "step": 1597 + }, + { + "epoch": 1.7036247334754797, + "grad_norm": 0.5151550662716535, + "learning_rate": 6.668991628897648e-05, + "loss": 0.3877, + "step": 1598 + }, + { + "epoch": 1.7046908315565032, + "grad_norm": 0.4728544267085605, + "learning_rate": 6.666773433916965e-05, + "loss": 0.3869, + "step": 1599 + }, + { + "epoch": 1.7057569296375266, + "grad_norm": 0.47045760280939164, + "learning_rate": 6.664553761682395e-05, + "loss": 0.3851, + "step": 1600 + }, + { + "epoch": 1.70682302771855, + "grad_norm": 0.45385159010082227, + "learning_rate": 6.662332613423522e-05, + "loss": 0.3957, + "step": 1601 + }, + { + "epoch": 1.7078891257995736, + "grad_norm": 0.39340233247394074, + "learning_rate": 6.660109990370747e-05, + "loss": 0.3831, + "step": 1602 + }, + { + "epoch": 1.7089552238805972, + "grad_norm": 0.4438034696481542, + "learning_rate": 6.657885893755288e-05, + "loss": 0.3941, + "step": 1603 + }, + { + "epoch": 1.7100213219616205, + "grad_norm": 0.5254817458294889, + "learning_rate": 6.655660324809177e-05, + "loss": 0.387, + "step": 1604 + }, + { + "epoch": 1.7110874200426438, + "grad_norm": 0.4913954053153753, + "learning_rate": 6.653433284765266e-05, + "loss": 0.3826, + "step": 1605 + }, + { + "epoch": 1.7121535181236673, + "grad_norm": 0.36756522486239, + "learning_rate": 6.651204774857218e-05, + "loss": 0.4007, + "step": 1606 + }, + { + "epoch": 1.7132196162046909, + "grad_norm": 0.34100380243193384, + "learning_rate": 6.648974796319512e-05, + "loss": 0.3932, + "step": 1607 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.38180887373133354, + "learning_rate": 6.646743350387438e-05, + "loss": 0.3857, + "step": 1608 + }, + { + "epoch": 1.7153518123667377, + "grad_norm": 0.4155049597785398, + "learning_rate": 6.644510438297105e-05, + "loss": 0.3925, + "step": 1609 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.4422223445035819, + "learning_rate": 6.642276061285428e-05, + "loss": 0.387, + "step": 1610 + }, + { + "epoch": 1.7174840085287846, + "grad_norm": 0.5794607050629841, + "learning_rate": 6.640040220590136e-05, + "loss": 0.3887, + "step": 1611 + }, + { + "epoch": 1.7185501066098081, + "grad_norm": 0.6678832046144233, + "learning_rate": 6.63780291744977e-05, + "loss": 0.3899, + "step": 1612 + }, + { + "epoch": 1.7196162046908317, + "grad_norm": 0.6122776429863264, + "learning_rate": 6.635564153103677e-05, + "loss": 0.3929, + "step": 1613 + }, + { + "epoch": 1.720682302771855, + "grad_norm": 0.5059533853234256, + "learning_rate": 6.633323928792018e-05, + "loss": 0.3915, + "step": 1614 + }, + { + "epoch": 1.7217484008528785, + "grad_norm": 0.4164548494103394, + "learning_rate": 6.631082245755762e-05, + "loss": 0.3962, + "step": 1615 + }, + { + "epoch": 1.7228144989339018, + "grad_norm": 0.39276463156642827, + "learning_rate": 6.628839105236681e-05, + "loss": 0.3904, + "step": 1616 + }, + { + "epoch": 1.7238805970149254, + "grad_norm": 0.41559718909247517, + "learning_rate": 6.626594508477361e-05, + "loss": 0.3917, + "step": 1617 + }, + { + "epoch": 1.724946695095949, + "grad_norm": 0.44665059431355064, + "learning_rate": 6.624348456721191e-05, + "loss": 0.3904, + "step": 1618 + }, + { + "epoch": 1.7260127931769724, + "grad_norm": 0.4665709219109089, + "learning_rate": 6.622100951212368e-05, + "loss": 0.3901, + "step": 1619 + }, + { + "epoch": 1.7270788912579957, + "grad_norm": 0.5150232707357006, + "learning_rate": 6.619851993195893e-05, + "loss": 0.3922, + "step": 1620 + }, + { + "epoch": 1.728144989339019, + "grad_norm": 0.6026588085567588, + "learning_rate": 6.61760158391757e-05, + "loss": 0.3972, + "step": 1621 + }, + { + "epoch": 1.7292110874200426, + "grad_norm": 0.6385069316475197, + "learning_rate": 6.615349724624012e-05, + "loss": 0.3963, + "step": 1622 + }, + { + "epoch": 1.7302771855010661, + "grad_norm": 0.6066727609006709, + "learning_rate": 6.61309641656263e-05, + "loss": 0.3916, + "step": 1623 + }, + { + "epoch": 1.7313432835820897, + "grad_norm": 0.5241084773133352, + "learning_rate": 6.610841660981639e-05, + "loss": 0.3897, + "step": 1624 + }, + { + "epoch": 1.732409381663113, + "grad_norm": 0.4853459956518966, + "learning_rate": 6.608585459130057e-05, + "loss": 0.3943, + "step": 1625 + }, + { + "epoch": 1.7334754797441365, + "grad_norm": 0.43877664392065524, + "learning_rate": 6.606327812257705e-05, + "loss": 0.3889, + "step": 1626 + }, + { + "epoch": 1.7345415778251598, + "grad_norm": 0.39226116124002375, + "learning_rate": 6.604068721615198e-05, + "loss": 0.394, + "step": 1627 + }, + { + "epoch": 1.7356076759061834, + "grad_norm": 0.34110686996523315, + "learning_rate": 6.601808188453957e-05, + "loss": 0.3912, + "step": 1628 + }, + { + "epoch": 1.736673773987207, + "grad_norm": 0.3299899773295145, + "learning_rate": 6.599546214026199e-05, + "loss": 0.396, + "step": 1629 + }, + { + "epoch": 1.7377398720682304, + "grad_norm": 0.376767657465232, + "learning_rate": 6.597282799584941e-05, + "loss": 0.3878, + "step": 1630 + }, + { + "epoch": 1.7388059701492538, + "grad_norm": 0.3870812218773304, + "learning_rate": 6.595017946383998e-05, + "loss": 0.3924, + "step": 1631 + }, + { + "epoch": 1.739872068230277, + "grad_norm": 0.32816223635729713, + "learning_rate": 6.59275165567798e-05, + "loss": 0.3891, + "step": 1632 + }, + { + "epoch": 1.7409381663113006, + "grad_norm": 0.3368154332955513, + "learning_rate": 6.590483928722293e-05, + "loss": 0.3867, + "step": 1633 + }, + { + "epoch": 1.7420042643923241, + "grad_norm": 0.32846108321184647, + "learning_rate": 6.58821476677314e-05, + "loss": 0.3899, + "step": 1634 + }, + { + "epoch": 1.7430703624733477, + "grad_norm": 0.30960636706164546, + "learning_rate": 6.585944171087521e-05, + "loss": 0.3967, + "step": 1635 + }, + { + "epoch": 1.744136460554371, + "grad_norm": 0.3431044349932443, + "learning_rate": 6.583672142923226e-05, + "loss": 0.3957, + "step": 1636 + }, + { + "epoch": 1.7452025586353943, + "grad_norm": 0.3817314249874563, + "learning_rate": 6.581398683538842e-05, + "loss": 0.3938, + "step": 1637 + }, + { + "epoch": 1.7462686567164178, + "grad_norm": 0.3846143335468199, + "learning_rate": 6.579123794193746e-05, + "loss": 0.3875, + "step": 1638 + }, + { + "epoch": 1.7473347547974414, + "grad_norm": 0.40698506124086886, + "learning_rate": 6.576847476148109e-05, + "loss": 0.3878, + "step": 1639 + }, + { + "epoch": 1.748400852878465, + "grad_norm": 0.5160395538152548, + "learning_rate": 6.574569730662893e-05, + "loss": 0.3935, + "step": 1640 + }, + { + "epoch": 1.7494669509594882, + "grad_norm": 0.6377912942089725, + "learning_rate": 6.57229055899985e-05, + "loss": 0.392, + "step": 1641 + }, + { + "epoch": 1.7505330490405118, + "grad_norm": 0.6901659496461329, + "learning_rate": 6.570009962421523e-05, + "loss": 0.3912, + "step": 1642 + }, + { + "epoch": 1.751599147121535, + "grad_norm": 0.6169852874235238, + "learning_rate": 6.567727942191246e-05, + "loss": 0.391, + "step": 1643 + }, + { + "epoch": 1.7526652452025586, + "grad_norm": 0.4948279659725487, + "learning_rate": 6.565444499573136e-05, + "loss": 0.3893, + "step": 1644 + }, + { + "epoch": 1.7537313432835822, + "grad_norm": 0.5319346837051117, + "learning_rate": 6.563159635832105e-05, + "loss": 0.396, + "step": 1645 + }, + { + "epoch": 1.7547974413646057, + "grad_norm": 0.6070963028163101, + "learning_rate": 6.560873352233846e-05, + "loss": 0.3879, + "step": 1646 + }, + { + "epoch": 1.755863539445629, + "grad_norm": 0.5500219234930699, + "learning_rate": 6.558585650044842e-05, + "loss": 0.3962, + "step": 1647 + }, + { + "epoch": 1.7569296375266523, + "grad_norm": 0.3774689595977208, + "learning_rate": 6.556296530532364e-05, + "loss": 0.3853, + "step": 1648 + }, + { + "epoch": 1.7579957356076759, + "grad_norm": 0.3097496369115005, + "learning_rate": 6.554005994964459e-05, + "loss": 0.3889, + "step": 1649 + }, + { + "epoch": 1.7590618336886994, + "grad_norm": 0.3192713728666256, + "learning_rate": 6.55171404460997e-05, + "loss": 0.3904, + "step": 1650 + }, + { + "epoch": 1.760127931769723, + "grad_norm": 0.32864817312795563, + "learning_rate": 6.549420680738516e-05, + "loss": 0.3944, + "step": 1651 + }, + { + "epoch": 1.7611940298507462, + "grad_norm": 0.32211875803439644, + "learning_rate": 6.547125904620504e-05, + "loss": 0.385, + "step": 1652 + }, + { + "epoch": 1.7622601279317696, + "grad_norm": 0.30017127275221656, + "learning_rate": 6.544829717527118e-05, + "loss": 0.3885, + "step": 1653 + }, + { + "epoch": 1.763326226012793, + "grad_norm": 0.3646601960483686, + "learning_rate": 6.542532120730327e-05, + "loss": 0.3905, + "step": 1654 + }, + { + "epoch": 1.7643923240938166, + "grad_norm": 0.4908471802914999, + "learning_rate": 6.540233115502881e-05, + "loss": 0.3889, + "step": 1655 + }, + { + "epoch": 1.7654584221748402, + "grad_norm": 0.5574952396992828, + "learning_rate": 6.537932703118308e-05, + "loss": 0.3881, + "step": 1656 + }, + { + "epoch": 1.7665245202558635, + "grad_norm": 0.5663694328254277, + "learning_rate": 6.535630884850917e-05, + "loss": 0.3862, + "step": 1657 + }, + { + "epoch": 1.767590618336887, + "grad_norm": 0.5503251345750754, + "learning_rate": 6.533327661975799e-05, + "loss": 0.3826, + "step": 1658 + }, + { + "epoch": 1.7686567164179103, + "grad_norm": 0.4691123534761767, + "learning_rate": 6.531023035768815e-05, + "loss": 0.3905, + "step": 1659 + }, + { + "epoch": 1.7697228144989339, + "grad_norm": 0.41955605601856755, + "learning_rate": 6.528717007506612e-05, + "loss": 0.396, + "step": 1660 + }, + { + "epoch": 1.7707889125799574, + "grad_norm": 0.4274791936147579, + "learning_rate": 6.526409578466606e-05, + "loss": 0.3874, + "step": 1661 + }, + { + "epoch": 1.771855010660981, + "grad_norm": 0.4609745881429261, + "learning_rate": 6.524100749926997e-05, + "loss": 0.3965, + "step": 1662 + }, + { + "epoch": 1.7729211087420043, + "grad_norm": 0.5004816396707016, + "learning_rate": 6.521790523166752e-05, + "loss": 0.391, + "step": 1663 + }, + { + "epoch": 1.7739872068230276, + "grad_norm": 0.5323254145339794, + "learning_rate": 6.51947889946562e-05, + "loss": 0.3855, + "step": 1664 + }, + { + "epoch": 1.775053304904051, + "grad_norm": 0.5367601341161515, + "learning_rate": 6.517165880104119e-05, + "loss": 0.395, + "step": 1665 + }, + { + "epoch": 1.7761194029850746, + "grad_norm": 0.5721670713464904, + "learning_rate": 6.51485146636354e-05, + "loss": 0.3831, + "step": 1666 + }, + { + "epoch": 1.7771855010660982, + "grad_norm": 0.6751411525618288, + "learning_rate": 6.51253565952595e-05, + "loss": 0.3897, + "step": 1667 + }, + { + "epoch": 1.7782515991471215, + "grad_norm": 0.7475500173594719, + "learning_rate": 6.510218460874186e-05, + "loss": 0.3918, + "step": 1668 + }, + { + "epoch": 1.779317697228145, + "grad_norm": 0.7174243154443929, + "learning_rate": 6.507899871691852e-05, + "loss": 0.3872, + "step": 1669 + }, + { + "epoch": 1.7803837953091683, + "grad_norm": 0.5785749621394422, + "learning_rate": 6.50557989326333e-05, + "loss": 0.3898, + "step": 1670 + }, + { + "epoch": 1.7814498933901919, + "grad_norm": 0.4450886636964885, + "learning_rate": 6.503258526873767e-05, + "loss": 0.3824, + "step": 1671 + }, + { + "epoch": 1.7825159914712154, + "grad_norm": 0.4254053253981925, + "learning_rate": 6.500935773809076e-05, + "loss": 0.3911, + "step": 1672 + }, + { + "epoch": 1.783582089552239, + "grad_norm": 0.4474966076928897, + "learning_rate": 6.498611635355947e-05, + "loss": 0.39, + "step": 1673 + }, + { + "epoch": 1.7846481876332623, + "grad_norm": 0.45486240848719184, + "learning_rate": 6.496286112801826e-05, + "loss": 0.3914, + "step": 1674 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.514667280468136, + "learning_rate": 6.493959207434934e-05, + "loss": 0.3861, + "step": 1675 + }, + { + "epoch": 1.7867803837953091, + "grad_norm": 0.5337184119841549, + "learning_rate": 6.491630920544257e-05, + "loss": 0.3871, + "step": 1676 + }, + { + "epoch": 1.7878464818763327, + "grad_norm": 0.4780735793732188, + "learning_rate": 6.489301253419545e-05, + "loss": 0.3821, + "step": 1677 + }, + { + "epoch": 1.7889125799573562, + "grad_norm": 0.4231898830746279, + "learning_rate": 6.48697020735131e-05, + "loss": 0.386, + "step": 1678 + }, + { + "epoch": 1.7899786780383795, + "grad_norm": 0.3663697733311359, + "learning_rate": 6.484637783630832e-05, + "loss": 0.385, + "step": 1679 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.30984371824344287, + "learning_rate": 6.482303983550151e-05, + "loss": 0.391, + "step": 1680 + }, + { + "epoch": 1.7921108742004264, + "grad_norm": 0.35751913701235005, + "learning_rate": 6.479968808402075e-05, + "loss": 0.3957, + "step": 1681 + }, + { + "epoch": 1.79317697228145, + "grad_norm": 0.4093914369810105, + "learning_rate": 6.477632259480165e-05, + "loss": 0.3935, + "step": 1682 + }, + { + "epoch": 1.7942430703624734, + "grad_norm": 0.41377357296377376, + "learning_rate": 6.475294338078752e-05, + "loss": 0.3844, + "step": 1683 + }, + { + "epoch": 1.7953091684434968, + "grad_norm": 0.39868902718714505, + "learning_rate": 6.472955045492918e-05, + "loss": 0.392, + "step": 1684 + }, + { + "epoch": 1.7963752665245203, + "grad_norm": 0.42005660101933967, + "learning_rate": 6.470614383018512e-05, + "loss": 0.3932, + "step": 1685 + }, + { + "epoch": 1.7974413646055436, + "grad_norm": 0.4135157735982948, + "learning_rate": 6.468272351952141e-05, + "loss": 0.3938, + "step": 1686 + }, + { + "epoch": 1.7985074626865671, + "grad_norm": 0.42801661015742615, + "learning_rate": 6.465928953591165e-05, + "loss": 0.3879, + "step": 1687 + }, + { + "epoch": 1.7995735607675907, + "grad_norm": 0.4816981612374122, + "learning_rate": 6.463584189233709e-05, + "loss": 0.3951, + "step": 1688 + }, + { + "epoch": 1.8006396588486142, + "grad_norm": 0.46743748656875295, + "learning_rate": 6.461238060178647e-05, + "loss": 0.3939, + "step": 1689 + }, + { + "epoch": 1.8017057569296375, + "grad_norm": 0.4222158155941226, + "learning_rate": 6.458890567725614e-05, + "loss": 0.3906, + "step": 1690 + }, + { + "epoch": 1.8027718550106608, + "grad_norm": 0.3658953583582503, + "learning_rate": 6.456541713174999e-05, + "loss": 0.3962, + "step": 1691 + }, + { + "epoch": 1.8038379530916844, + "grad_norm": 0.33668688496020444, + "learning_rate": 6.454191497827945e-05, + "loss": 0.3921, + "step": 1692 + }, + { + "epoch": 1.804904051172708, + "grad_norm": 0.3206971308888266, + "learning_rate": 6.451839922986349e-05, + "loss": 0.3883, + "step": 1693 + }, + { + "epoch": 1.8059701492537314, + "grad_norm": 0.3143057384612725, + "learning_rate": 6.449486989952863e-05, + "loss": 0.3896, + "step": 1694 + }, + { + "epoch": 1.8070362473347548, + "grad_norm": 0.34510492572302714, + "learning_rate": 6.447132700030887e-05, + "loss": 0.3826, + "step": 1695 + }, + { + "epoch": 1.8081023454157783, + "grad_norm": 0.4547140804511741, + "learning_rate": 6.444777054524576e-05, + "loss": 0.3884, + "step": 1696 + }, + { + "epoch": 1.8091684434968016, + "grad_norm": 0.47384268369658744, + "learning_rate": 6.442420054738837e-05, + "loss": 0.3834, + "step": 1697 + }, + { + "epoch": 1.8102345415778252, + "grad_norm": 0.47381168033622467, + "learning_rate": 6.440061701979323e-05, + "loss": 0.3812, + "step": 1698 + }, + { + "epoch": 1.8113006396588487, + "grad_norm": 0.4548557810738229, + "learning_rate": 6.43770199755244e-05, + "loss": 0.3879, + "step": 1699 + }, + { + "epoch": 1.8123667377398722, + "grad_norm": 0.4563303678565612, + "learning_rate": 6.435340942765341e-05, + "loss": 0.3881, + "step": 1700 + }, + { + "epoch": 1.8134328358208955, + "grad_norm": 0.4345832462667786, + "learning_rate": 6.432978538925928e-05, + "loss": 0.3875, + "step": 1701 + }, + { + "epoch": 1.8144989339019189, + "grad_norm": 0.36369291668844994, + "learning_rate": 6.430614787342853e-05, + "loss": 0.3891, + "step": 1702 + }, + { + "epoch": 1.8155650319829424, + "grad_norm": 0.351336330190162, + "learning_rate": 6.428249689325505e-05, + "loss": 0.3884, + "step": 1703 + }, + { + "epoch": 1.816631130063966, + "grad_norm": 0.3720689107883433, + "learning_rate": 6.425883246184031e-05, + "loss": 0.3917, + "step": 1704 + }, + { + "epoch": 1.8176972281449895, + "grad_norm": 0.3277156951268358, + "learning_rate": 6.423515459229313e-05, + "loss": 0.3857, + "step": 1705 + }, + { + "epoch": 1.8187633262260128, + "grad_norm": 0.3146191623979247, + "learning_rate": 6.421146329772988e-05, + "loss": 0.3881, + "step": 1706 + }, + { + "epoch": 1.819829424307036, + "grad_norm": 0.39372224035911607, + "learning_rate": 6.418775859127424e-05, + "loss": 0.3847, + "step": 1707 + }, + { + "epoch": 1.8208955223880596, + "grad_norm": 0.4723072413424627, + "learning_rate": 6.416404048605744e-05, + "loss": 0.3946, + "step": 1708 + }, + { + "epoch": 1.8219616204690832, + "grad_norm": 0.5461469639906784, + "learning_rate": 6.414030899521802e-05, + "loss": 0.3977, + "step": 1709 + }, + { + "epoch": 1.8230277185501067, + "grad_norm": 0.5644867900194028, + "learning_rate": 6.411656413190205e-05, + "loss": 0.3854, + "step": 1710 + }, + { + "epoch": 1.82409381663113, + "grad_norm": 0.48007304376329935, + "learning_rate": 6.409280590926292e-05, + "loss": 0.3947, + "step": 1711 + }, + { + "epoch": 1.8251599147121536, + "grad_norm": 0.41131804519163606, + "learning_rate": 6.406903434046146e-05, + "loss": 0.3913, + "step": 1712 + }, + { + "epoch": 1.8262260127931769, + "grad_norm": 0.37379945962827266, + "learning_rate": 6.404524943866588e-05, + "loss": 0.3807, + "step": 1713 + }, + { + "epoch": 1.8272921108742004, + "grad_norm": 0.42604642417053085, + "learning_rate": 6.402145121705178e-05, + "loss": 0.3953, + "step": 1714 + }, + { + "epoch": 1.828358208955224, + "grad_norm": 0.541443650037536, + "learning_rate": 6.399763968880214e-05, + "loss": 0.3915, + "step": 1715 + }, + { + "epoch": 1.8294243070362475, + "grad_norm": 0.5461544504898415, + "learning_rate": 6.397381486710728e-05, + "loss": 0.3917, + "step": 1716 + }, + { + "epoch": 1.8304904051172708, + "grad_norm": 0.4620349232311522, + "learning_rate": 6.394997676516497e-05, + "loss": 0.3951, + "step": 1717 + }, + { + "epoch": 1.831556503198294, + "grad_norm": 0.3606206070971171, + "learning_rate": 6.392612539618024e-05, + "loss": 0.3877, + "step": 1718 + }, + { + "epoch": 1.8326226012793176, + "grad_norm": 0.3665152949807116, + "learning_rate": 6.39022607733655e-05, + "loss": 0.3911, + "step": 1719 + }, + { + "epoch": 1.8336886993603412, + "grad_norm": 0.3617479779123929, + "learning_rate": 6.387838290994056e-05, + "loss": 0.3864, + "step": 1720 + }, + { + "epoch": 1.8347547974413647, + "grad_norm": 0.35982095746420845, + "learning_rate": 6.385449181913246e-05, + "loss": 0.387, + "step": 1721 + }, + { + "epoch": 1.835820895522388, + "grad_norm": 0.42991157325695156, + "learning_rate": 6.383058751417566e-05, + "loss": 0.3872, + "step": 1722 + }, + { + "epoch": 1.8368869936034116, + "grad_norm": 0.4025624229827069, + "learning_rate": 6.380667000831188e-05, + "loss": 0.3879, + "step": 1723 + }, + { + "epoch": 1.8379530916844349, + "grad_norm": 0.37576823662421394, + "learning_rate": 6.37827393147902e-05, + "loss": 0.3934, + "step": 1724 + }, + { + "epoch": 1.8390191897654584, + "grad_norm": 0.4724397154377103, + "learning_rate": 6.375879544686695e-05, + "loss": 0.3907, + "step": 1725 + }, + { + "epoch": 1.840085287846482, + "grad_norm": 0.49890032962764724, + "learning_rate": 6.37348384178058e-05, + "loss": 0.3905, + "step": 1726 + }, + { + "epoch": 1.8411513859275055, + "grad_norm": 0.5271786209536122, + "learning_rate": 6.371086824087772e-05, + "loss": 0.3879, + "step": 1727 + }, + { + "epoch": 1.8422174840085288, + "grad_norm": 0.5789984364979213, + "learning_rate": 6.368688492936091e-05, + "loss": 0.3968, + "step": 1728 + }, + { + "epoch": 1.8432835820895521, + "grad_norm": 0.5914257408271724, + "learning_rate": 6.366288849654091e-05, + "loss": 0.3917, + "step": 1729 + }, + { + "epoch": 1.8443496801705757, + "grad_norm": 0.4741221375005763, + "learning_rate": 6.363887895571045e-05, + "loss": 0.3845, + "step": 1730 + }, + { + "epoch": 1.8454157782515992, + "grad_norm": 0.3731893195453408, + "learning_rate": 6.361485632016963e-05, + "loss": 0.3927, + "step": 1731 + }, + { + "epoch": 1.8464818763326227, + "grad_norm": 0.36291074847148286, + "learning_rate": 6.359082060322569e-05, + "loss": 0.3834, + "step": 1732 + }, + { + "epoch": 1.847547974413646, + "grad_norm": 0.33824345650854754, + "learning_rate": 6.356677181819319e-05, + "loss": 0.3923, + "step": 1733 + }, + { + "epoch": 1.8486140724946694, + "grad_norm": 0.3590581537534159, + "learning_rate": 6.35427099783939e-05, + "loss": 0.3885, + "step": 1734 + }, + { + "epoch": 1.849680170575693, + "grad_norm": 0.393824937605615, + "learning_rate": 6.351863509715684e-05, + "loss": 0.3808, + "step": 1735 + }, + { + "epoch": 1.8507462686567164, + "grad_norm": 0.38639667752223744, + "learning_rate": 6.349454718781822e-05, + "loss": 0.3893, + "step": 1736 + }, + { + "epoch": 1.85181236673774, + "grad_norm": 0.3933756538854629, + "learning_rate": 6.347044626372153e-05, + "loss": 0.3866, + "step": 1737 + }, + { + "epoch": 1.8528784648187633, + "grad_norm": 0.4385122841126179, + "learning_rate": 6.34463323382174e-05, + "loss": 0.388, + "step": 1738 + }, + { + "epoch": 1.8539445628997868, + "grad_norm": 0.4219487489482543, + "learning_rate": 6.342220542466368e-05, + "loss": 0.3902, + "step": 1739 + }, + { + "epoch": 1.8550106609808101, + "grad_norm": 0.42067268031548977, + "learning_rate": 6.339806553642545e-05, + "loss": 0.3873, + "step": 1740 + }, + { + "epoch": 1.8560767590618337, + "grad_norm": 0.4563965192041482, + "learning_rate": 6.337391268687495e-05, + "loss": 0.3865, + "step": 1741 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.4746787282948449, + "learning_rate": 6.334974688939161e-05, + "loss": 0.3832, + "step": 1742 + }, + { + "epoch": 1.8582089552238807, + "grad_norm": 0.4616609792809343, + "learning_rate": 6.3325568157362e-05, + "loss": 0.3878, + "step": 1743 + }, + { + "epoch": 1.859275053304904, + "grad_norm": 0.4503741534784094, + "learning_rate": 6.33013765041799e-05, + "loss": 0.3857, + "step": 1744 + }, + { + "epoch": 1.8603411513859274, + "grad_norm": 0.3431922287079442, + "learning_rate": 6.327717194324622e-05, + "loss": 0.3903, + "step": 1745 + }, + { + "epoch": 1.861407249466951, + "grad_norm": 0.2609150587662325, + "learning_rate": 6.325295448796903e-05, + "loss": 0.391, + "step": 1746 + }, + { + "epoch": 1.8624733475479744, + "grad_norm": 0.33270455925916104, + "learning_rate": 6.322872415176356e-05, + "loss": 0.3934, + "step": 1747 + }, + { + "epoch": 1.863539445628998, + "grad_norm": 0.43954183331919916, + "learning_rate": 6.320448094805214e-05, + "loss": 0.3873, + "step": 1748 + }, + { + "epoch": 1.8646055437100213, + "grad_norm": 0.5101457999652256, + "learning_rate": 6.318022489026425e-05, + "loss": 0.3906, + "step": 1749 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.5395181016945636, + "learning_rate": 6.315595599183646e-05, + "loss": 0.3858, + "step": 1750 + }, + { + "epoch": 1.8667377398720681, + "grad_norm": 0.6031812678730777, + "learning_rate": 6.313167426621253e-05, + "loss": 0.3868, + "step": 1751 + }, + { + "epoch": 1.8678038379530917, + "grad_norm": 0.6041907845639172, + "learning_rate": 6.310737972684322e-05, + "loss": 0.3902, + "step": 1752 + }, + { + "epoch": 1.8688699360341152, + "grad_norm": 0.5089623629692637, + "learning_rate": 6.308307238718649e-05, + "loss": 0.385, + "step": 1753 + }, + { + "epoch": 1.8699360341151388, + "grad_norm": 0.40212189395827613, + "learning_rate": 6.305875226070729e-05, + "loss": 0.3914, + "step": 1754 + }, + { + "epoch": 1.871002132196162, + "grad_norm": 0.35938255962719357, + "learning_rate": 6.303441936087776e-05, + "loss": 0.3867, + "step": 1755 + }, + { + "epoch": 1.8720682302771854, + "grad_norm": 0.42202040294266446, + "learning_rate": 6.301007370117703e-05, + "loss": 0.3902, + "step": 1756 + }, + { + "epoch": 1.873134328358209, + "grad_norm": 0.5647279892704614, + "learning_rate": 6.298571529509135e-05, + "loss": 0.3845, + "step": 1757 + }, + { + "epoch": 1.8742004264392325, + "grad_norm": 0.6637603262956403, + "learning_rate": 6.296134415611399e-05, + "loss": 0.3854, + "step": 1758 + }, + { + "epoch": 1.875266524520256, + "grad_norm": 0.7031726822916566, + "learning_rate": 6.29369602977453e-05, + "loss": 0.3855, + "step": 1759 + }, + { + "epoch": 1.8763326226012793, + "grad_norm": 0.7834542862723698, + "learning_rate": 6.291256373349269e-05, + "loss": 0.3945, + "step": 1760 + }, + { + "epoch": 1.8773987206823026, + "grad_norm": 0.8226206202632336, + "learning_rate": 6.288815447687056e-05, + "loss": 0.3901, + "step": 1761 + }, + { + "epoch": 1.8784648187633262, + "grad_norm": 0.7994895608418134, + "learning_rate": 6.286373254140038e-05, + "loss": 0.3893, + "step": 1762 + }, + { + "epoch": 1.8795309168443497, + "grad_norm": 0.7178750160169209, + "learning_rate": 6.283929794061065e-05, + "loss": 0.3917, + "step": 1763 + }, + { + "epoch": 1.8805970149253732, + "grad_norm": 0.5231999499518026, + "learning_rate": 6.281485068803683e-05, + "loss": 0.3857, + "step": 1764 + }, + { + "epoch": 1.8816631130063965, + "grad_norm": 0.39458981834934215, + "learning_rate": 6.279039079722147e-05, + "loss": 0.3873, + "step": 1765 + }, + { + "epoch": 1.88272921108742, + "grad_norm": 0.5291211636623102, + "learning_rate": 6.276591828171406e-05, + "loss": 0.386, + "step": 1766 + }, + { + "epoch": 1.8837953091684434, + "grad_norm": 0.6451705117892833, + "learning_rate": 6.274143315507108e-05, + "loss": 0.3861, + "step": 1767 + }, + { + "epoch": 1.884861407249467, + "grad_norm": 0.6150320216897379, + "learning_rate": 6.271693543085607e-05, + "loss": 0.3891, + "step": 1768 + }, + { + "epoch": 1.8859275053304905, + "grad_norm": 0.4359432097990482, + "learning_rate": 6.269242512263945e-05, + "loss": 0.3929, + "step": 1769 + }, + { + "epoch": 1.886993603411514, + "grad_norm": 0.31560057559310783, + "learning_rate": 6.266790224399867e-05, + "loss": 0.3887, + "step": 1770 + }, + { + "epoch": 1.8880597014925373, + "grad_norm": 0.4422794491574624, + "learning_rate": 6.264336680851813e-05, + "loss": 0.3902, + "step": 1771 + }, + { + "epoch": 1.8891257995735606, + "grad_norm": 0.5106376023504305, + "learning_rate": 6.26188188297892e-05, + "loss": 0.3891, + "step": 1772 + }, + { + "epoch": 1.8901918976545842, + "grad_norm": 0.4415611667008926, + "learning_rate": 6.259425832141017e-05, + "loss": 0.3859, + "step": 1773 + }, + { + "epoch": 1.8912579957356077, + "grad_norm": 0.40641816501262484, + "learning_rate": 6.256968529698628e-05, + "loss": 0.3937, + "step": 1774 + }, + { + "epoch": 1.8923240938166312, + "grad_norm": 0.448521267716046, + "learning_rate": 6.254509977012972e-05, + "loss": 0.3793, + "step": 1775 + }, + { + "epoch": 1.8933901918976546, + "grad_norm": 0.5095261552719494, + "learning_rate": 6.252050175445959e-05, + "loss": 0.3887, + "step": 1776 + }, + { + "epoch": 1.8944562899786779, + "grad_norm": 0.5730353742123191, + "learning_rate": 6.249589126360192e-05, + "loss": 0.3876, + "step": 1777 + }, + { + "epoch": 1.8955223880597014, + "grad_norm": 0.6087785843423836, + "learning_rate": 6.247126831118962e-05, + "loss": 0.391, + "step": 1778 + }, + { + "epoch": 1.896588486140725, + "grad_norm": 0.5935074468788941, + "learning_rate": 6.244663291086256e-05, + "loss": 0.392, + "step": 1779 + }, + { + "epoch": 1.8976545842217485, + "grad_norm": 0.5590142718060106, + "learning_rate": 6.242198507626746e-05, + "loss": 0.3893, + "step": 1780 + }, + { + "epoch": 1.8987206823027718, + "grad_norm": 0.5336039934734914, + "learning_rate": 6.23973248210579e-05, + "loss": 0.3855, + "step": 1781 + }, + { + "epoch": 1.8997867803837953, + "grad_norm": 0.445389003613856, + "learning_rate": 6.237265215889444e-05, + "loss": 0.3886, + "step": 1782 + }, + { + "epoch": 1.9008528784648187, + "grad_norm": 0.3716042168347448, + "learning_rate": 6.234796710344441e-05, + "loss": 0.3798, + "step": 1783 + }, + { + "epoch": 1.9019189765458422, + "grad_norm": 0.4696131039929026, + "learning_rate": 6.232326966838207e-05, + "loss": 0.3924, + "step": 1784 + }, + { + "epoch": 1.9029850746268657, + "grad_norm": 0.5401242190818367, + "learning_rate": 6.229855986738851e-05, + "loss": 0.3868, + "step": 1785 + }, + { + "epoch": 1.9040511727078893, + "grad_norm": 0.4997589943313716, + "learning_rate": 6.227383771415166e-05, + "loss": 0.3889, + "step": 1786 + }, + { + "epoch": 1.9051172707889126, + "grad_norm": 0.4625159920961848, + "learning_rate": 6.224910322236634e-05, + "loss": 0.3902, + "step": 1787 + }, + { + "epoch": 1.906183368869936, + "grad_norm": 0.4473030136236895, + "learning_rate": 6.222435640573414e-05, + "loss": 0.3922, + "step": 1788 + }, + { + "epoch": 1.9072494669509594, + "grad_norm": 0.3688121331850802, + "learning_rate": 6.219959727796354e-05, + "loss": 0.3819, + "step": 1789 + }, + { + "epoch": 1.908315565031983, + "grad_norm": 0.27997416856472385, + "learning_rate": 6.217482585276979e-05, + "loss": 0.3935, + "step": 1790 + }, + { + "epoch": 1.9093816631130065, + "grad_norm": 0.2996081067229179, + "learning_rate": 6.215004214387497e-05, + "loss": 0.3889, + "step": 1791 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 0.4281784065305154, + "learning_rate": 6.212524616500798e-05, + "loss": 0.3865, + "step": 1792 + }, + { + "epoch": 1.9115138592750534, + "grad_norm": 0.44659189904413066, + "learning_rate": 6.210043792990449e-05, + "loss": 0.3868, + "step": 1793 + }, + { + "epoch": 1.9125799573560767, + "grad_norm": 0.3623900185596153, + "learning_rate": 6.2075617452307e-05, + "loss": 0.3823, + "step": 1794 + }, + { + "epoch": 1.9136460554371002, + "grad_norm": 0.37626885568568225, + "learning_rate": 6.205078474596473e-05, + "loss": 0.3873, + "step": 1795 + }, + { + "epoch": 1.9147121535181237, + "grad_norm": 0.3768400015936373, + "learning_rate": 6.202593982463373e-05, + "loss": 0.39, + "step": 1796 + }, + { + "epoch": 1.9157782515991473, + "grad_norm": 0.4025791301348382, + "learning_rate": 6.200108270207679e-05, + "loss": 0.3791, + "step": 1797 + }, + { + "epoch": 1.9168443496801706, + "grad_norm": 0.5071478998115798, + "learning_rate": 6.197621339206345e-05, + "loss": 0.3819, + "step": 1798 + }, + { + "epoch": 1.917910447761194, + "grad_norm": 0.5571967213799047, + "learning_rate": 6.195133190837004e-05, + "loss": 0.3892, + "step": 1799 + }, + { + "epoch": 1.9189765458422174, + "grad_norm": 0.5126160369273954, + "learning_rate": 6.192643826477959e-05, + "loss": 0.3835, + "step": 1800 + }, + { + "epoch": 1.920042643923241, + "grad_norm": 0.44604407636296517, + "learning_rate": 6.190153247508189e-05, + "loss": 0.3953, + "step": 1801 + }, + { + "epoch": 1.9211087420042645, + "grad_norm": 0.39601219897116957, + "learning_rate": 6.187661455307346e-05, + "loss": 0.3916, + "step": 1802 + }, + { + "epoch": 1.9221748400852878, + "grad_norm": 0.38498451897362057, + "learning_rate": 6.185168451255752e-05, + "loss": 0.393, + "step": 1803 + }, + { + "epoch": 1.9232409381663111, + "grad_norm": 0.36258385258732584, + "learning_rate": 6.182674236734404e-05, + "loss": 0.3858, + "step": 1804 + }, + { + "epoch": 1.9243070362473347, + "grad_norm": 0.29544150204233544, + "learning_rate": 6.180178813124965e-05, + "loss": 0.385, + "step": 1805 + }, + { + "epoch": 1.9253731343283582, + "grad_norm": 0.3264189147080742, + "learning_rate": 6.177682181809772e-05, + "loss": 0.3851, + "step": 1806 + }, + { + "epoch": 1.9264392324093818, + "grad_norm": 0.3541649704837544, + "learning_rate": 6.175184344171827e-05, + "loss": 0.3885, + "step": 1807 + }, + { + "epoch": 1.927505330490405, + "grad_norm": 0.30666928053826004, + "learning_rate": 6.172685301594802e-05, + "loss": 0.3873, + "step": 1808 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.2639778431840215, + "learning_rate": 6.170185055463039e-05, + "loss": 0.3854, + "step": 1809 + }, + { + "epoch": 1.929637526652452, + "grad_norm": 0.277287436322927, + "learning_rate": 6.167683607161542e-05, + "loss": 0.3865, + "step": 1810 + }, + { + "epoch": 1.9307036247334755, + "grad_norm": 0.3219135209443072, + "learning_rate": 6.165180958075985e-05, + "loss": 0.3848, + "step": 1811 + }, + { + "epoch": 1.931769722814499, + "grad_norm": 0.3303760398066151, + "learning_rate": 6.162677109592704e-05, + "loss": 0.386, + "step": 1812 + }, + { + "epoch": 1.9328358208955225, + "grad_norm": 0.3084131821261726, + "learning_rate": 6.160172063098703e-05, + "loss": 0.3884, + "step": 1813 + }, + { + "epoch": 1.9339019189765458, + "grad_norm": 0.31138352082161325, + "learning_rate": 6.157665819981646e-05, + "loss": 0.3832, + "step": 1814 + }, + { + "epoch": 1.9349680170575692, + "grad_norm": 0.3417574287817514, + "learning_rate": 6.155158381629863e-05, + "loss": 0.39, + "step": 1815 + }, + { + "epoch": 1.9360341151385927, + "grad_norm": 0.29411408973125125, + "learning_rate": 6.152649749432343e-05, + "loss": 0.3857, + "step": 1816 + }, + { + "epoch": 1.9371002132196162, + "grad_norm": 0.28770100823418326, + "learning_rate": 6.150139924778738e-05, + "loss": 0.3868, + "step": 1817 + }, + { + "epoch": 1.9381663113006398, + "grad_norm": 0.3600800861434762, + "learning_rate": 6.14762890905936e-05, + "loss": 0.3907, + "step": 1818 + }, + { + "epoch": 1.939232409381663, + "grad_norm": 0.40220665801015487, + "learning_rate": 6.145116703665184e-05, + "loss": 0.3839, + "step": 1819 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.4133033373544916, + "learning_rate": 6.142603309987838e-05, + "loss": 0.387, + "step": 1820 + }, + { + "epoch": 1.94136460554371, + "grad_norm": 0.47788470275197026, + "learning_rate": 6.140088729419613e-05, + "loss": 0.3903, + "step": 1821 + }, + { + "epoch": 1.9424307036247335, + "grad_norm": 0.4896797272268825, + "learning_rate": 6.137572963353455e-05, + "loss": 0.385, + "step": 1822 + }, + { + "epoch": 1.943496801705757, + "grad_norm": 0.45525617704254706, + "learning_rate": 6.135056013182969e-05, + "loss": 0.3949, + "step": 1823 + }, + { + "epoch": 1.9445628997867805, + "grad_norm": 0.41754979578711426, + "learning_rate": 6.132537880302412e-05, + "loss": 0.3901, + "step": 1824 + }, + { + "epoch": 1.9456289978678039, + "grad_norm": 0.36895879752123967, + "learning_rate": 6.130018566106702e-05, + "loss": 0.3845, + "step": 1825 + }, + { + "epoch": 1.9466950959488272, + "grad_norm": 0.2966243721883261, + "learning_rate": 6.127498071991406e-05, + "loss": 0.3894, + "step": 1826 + }, + { + "epoch": 1.9477611940298507, + "grad_norm": 0.29871248582481646, + "learning_rate": 6.12497639935275e-05, + "loss": 0.3859, + "step": 1827 + }, + { + "epoch": 1.9488272921108742, + "grad_norm": 0.3298826130551037, + "learning_rate": 6.122453549587603e-05, + "loss": 0.3857, + "step": 1828 + }, + { + "epoch": 1.9498933901918978, + "grad_norm": 0.3676840915758954, + "learning_rate": 6.119929524093499e-05, + "loss": 0.3894, + "step": 1829 + }, + { + "epoch": 1.950959488272921, + "grad_norm": 0.3611638397068088, + "learning_rate": 6.117404324268615e-05, + "loss": 0.3917, + "step": 1830 + }, + { + "epoch": 1.9520255863539444, + "grad_norm": 0.36356753037609535, + "learning_rate": 6.11487795151178e-05, + "loss": 0.3885, + "step": 1831 + }, + { + "epoch": 1.953091684434968, + "grad_norm": 0.4256434759309199, + "learning_rate": 6.112350407222472e-05, + "loss": 0.3871, + "step": 1832 + }, + { + "epoch": 1.9541577825159915, + "grad_norm": 0.5131488919037342, + "learning_rate": 6.109821692800822e-05, + "loss": 0.3925, + "step": 1833 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.5200430235069933, + "learning_rate": 6.107291809647603e-05, + "loss": 0.3898, + "step": 1834 + }, + { + "epoch": 1.9562899786780383, + "grad_norm": 0.44124171195877854, + "learning_rate": 6.104760759164242e-05, + "loss": 0.3887, + "step": 1835 + }, + { + "epoch": 1.9573560767590619, + "grad_norm": 0.3699718554474287, + "learning_rate": 6.102228542752809e-05, + "loss": 0.3901, + "step": 1836 + }, + { + "epoch": 1.9584221748400852, + "grad_norm": 0.3748188674691452, + "learning_rate": 6.0996951618160164e-05, + "loss": 0.3948, + "step": 1837 + }, + { + "epoch": 1.9594882729211087, + "grad_norm": 0.3898699056134908, + "learning_rate": 6.097160617757231e-05, + "loss": 0.3906, + "step": 1838 + }, + { + "epoch": 1.9605543710021323, + "grad_norm": 0.3988044604464649, + "learning_rate": 6.094624911980455e-05, + "loss": 0.3859, + "step": 1839 + }, + { + "epoch": 1.9616204690831558, + "grad_norm": 0.3913844382375237, + "learning_rate": 6.0920880458903396e-05, + "loss": 0.3861, + "step": 1840 + }, + { + "epoch": 1.962686567164179, + "grad_norm": 0.32899811131426077, + "learning_rate": 6.089550020892175e-05, + "loss": 0.3866, + "step": 1841 + }, + { + "epoch": 1.9637526652452024, + "grad_norm": 0.31976863842060194, + "learning_rate": 6.0870108383918964e-05, + "loss": 0.3827, + "step": 1842 + }, + { + "epoch": 1.964818763326226, + "grad_norm": 0.4262488192151024, + "learning_rate": 6.084470499796077e-05, + "loss": 0.389, + "step": 1843 + }, + { + "epoch": 1.9658848614072495, + "grad_norm": 0.4077721083204829, + "learning_rate": 6.081929006511935e-05, + "loss": 0.3839, + "step": 1844 + }, + { + "epoch": 1.966950959488273, + "grad_norm": 0.3347619429742381, + "learning_rate": 6.079386359947325e-05, + "loss": 0.3831, + "step": 1845 + }, + { + "epoch": 1.9680170575692963, + "grad_norm": 0.24957267266383398, + "learning_rate": 6.07684256151074e-05, + "loss": 0.3821, + "step": 1846 + }, + { + "epoch": 1.9690831556503199, + "grad_norm": 0.2976405468436129, + "learning_rate": 6.074297612611312e-05, + "loss": 0.3833, + "step": 1847 + }, + { + "epoch": 1.9701492537313432, + "grad_norm": 0.4249489763296379, + "learning_rate": 6.071751514658811e-05, + "loss": 0.39, + "step": 1848 + }, + { + "epoch": 1.9712153518123667, + "grad_norm": 0.45599309991133324, + "learning_rate": 6.069204269063644e-05, + "loss": 0.3922, + "step": 1849 + }, + { + "epoch": 1.9722814498933903, + "grad_norm": 0.38232128582738506, + "learning_rate": 6.066655877236851e-05, + "loss": 0.3889, + "step": 1850 + }, + { + "epoch": 1.9733475479744138, + "grad_norm": 0.34695673920538606, + "learning_rate": 6.06410634059011e-05, + "loss": 0.3894, + "step": 1851 + }, + { + "epoch": 1.9744136460554371, + "grad_norm": 0.4535787896409531, + "learning_rate": 6.06155566053573e-05, + "loss": 0.3886, + "step": 1852 + }, + { + "epoch": 1.9754797441364604, + "grad_norm": 0.5028521097525349, + "learning_rate": 6.059003838486657e-05, + "loss": 0.3914, + "step": 1853 + }, + { + "epoch": 1.976545842217484, + "grad_norm": 0.43918619216494054, + "learning_rate": 6.056450875856467e-05, + "loss": 0.3849, + "step": 1854 + }, + { + "epoch": 1.9776119402985075, + "grad_norm": 0.45035177429943046, + "learning_rate": 6.053896774059368e-05, + "loss": 0.391, + "step": 1855 + }, + { + "epoch": 1.978678038379531, + "grad_norm": 0.5612538235109835, + "learning_rate": 6.051341534510201e-05, + "loss": 0.3929, + "step": 1856 + }, + { + "epoch": 1.9797441364605544, + "grad_norm": 0.6454231135981549, + "learning_rate": 6.048785158624436e-05, + "loss": 0.3871, + "step": 1857 + }, + { + "epoch": 1.9808102345415777, + "grad_norm": 0.7284809133654999, + "learning_rate": 6.0462276478181696e-05, + "loss": 0.391, + "step": 1858 + }, + { + "epoch": 1.9818763326226012, + "grad_norm": 0.7042942526408297, + "learning_rate": 6.043669003508134e-05, + "loss": 0.3827, + "step": 1859 + }, + { + "epoch": 1.9829424307036247, + "grad_norm": 0.6203970586107856, + "learning_rate": 6.0411092271116815e-05, + "loss": 0.393, + "step": 1860 + }, + { + "epoch": 1.9840085287846483, + "grad_norm": 0.5381848565811816, + "learning_rate": 6.038548320046797e-05, + "loss": 0.3884, + "step": 1861 + }, + { + "epoch": 1.9850746268656716, + "grad_norm": 0.3673908310108203, + "learning_rate": 6.035986283732091e-05, + "loss": 0.3858, + "step": 1862 + }, + { + "epoch": 1.9861407249466951, + "grad_norm": 0.2992259271172484, + "learning_rate": 6.0334231195867954e-05, + "loss": 0.3876, + "step": 1863 + }, + { + "epoch": 1.9872068230277184, + "grad_norm": 0.5048589299680658, + "learning_rate": 6.030858829030773e-05, + "loss": 0.3915, + "step": 1864 + }, + { + "epoch": 1.988272921108742, + "grad_norm": 0.6398688231174923, + "learning_rate": 6.0282934134845055e-05, + "loss": 0.3871, + "step": 1865 + }, + { + "epoch": 1.9893390191897655, + "grad_norm": 0.6041356427710289, + "learning_rate": 6.025726874369101e-05, + "loss": 0.3906, + "step": 1866 + }, + { + "epoch": 1.990405117270789, + "grad_norm": 0.436971182640899, + "learning_rate": 6.023159213106288e-05, + "loss": 0.3859, + "step": 1867 + }, + { + "epoch": 1.9914712153518124, + "grad_norm": 0.3014364362613937, + "learning_rate": 6.020590431118417e-05, + "loss": 0.3871, + "step": 1868 + }, + { + "epoch": 1.9925373134328357, + "grad_norm": 0.34001157186356745, + "learning_rate": 6.018020529828461e-05, + "loss": 0.3852, + "step": 1869 + }, + { + "epoch": 1.9936034115138592, + "grad_norm": 0.4404981881926425, + "learning_rate": 6.0154495106600094e-05, + "loss": 0.3844, + "step": 1870 + }, + { + "epoch": 1.9946695095948828, + "grad_norm": 0.48771270351289164, + "learning_rate": 6.012877375037278e-05, + "loss": 0.3866, + "step": 1871 + }, + { + "epoch": 1.9957356076759063, + "grad_norm": 0.4990494207144115, + "learning_rate": 6.01030412438509e-05, + "loss": 0.3858, + "step": 1872 + }, + { + "epoch": 1.9968017057569296, + "grad_norm": 0.468499732746224, + "learning_rate": 6.007729760128898e-05, + "loss": 0.3843, + "step": 1873 + }, + { + "epoch": 1.997867803837953, + "grad_norm": 0.41065101398677445, + "learning_rate": 6.0051542836947625e-05, + "loss": 0.3886, + "step": 1874 + }, + { + "epoch": 1.9989339019189765, + "grad_norm": 0.444948728644674, + "learning_rate": 6.002577696509365e-05, + "loss": 0.3855, + "step": 1875 + }, + { + "epoch": 2.0, + "grad_norm": 0.5080421482170958, + "learning_rate": 6.000000000000001e-05, + "loss": 0.3761, + "step": 1876 + }, + { + "epoch": 2.0010660980810235, + "grad_norm": 0.5368826782068931, + "learning_rate": 5.9974211955945795e-05, + "loss": 0.3645, + "step": 1877 + }, + { + "epoch": 2.002132196162047, + "grad_norm": 0.5340141540515194, + "learning_rate": 5.9948412847216255e-05, + "loss": 0.3729, + "step": 1878 + }, + { + "epoch": 2.00319829424307, + "grad_norm": 0.48248902079325257, + "learning_rate": 5.992260268810273e-05, + "loss": 0.3696, + "step": 1879 + }, + { + "epoch": 2.0042643923240937, + "grad_norm": 0.4867113475226342, + "learning_rate": 5.989678149290274e-05, + "loss": 0.3736, + "step": 1880 + }, + { + "epoch": 2.0053304904051172, + "grad_norm": 0.5127638734663647, + "learning_rate": 5.987094927591987e-05, + "loss": 0.3655, + "step": 1881 + }, + { + "epoch": 2.0063965884861408, + "grad_norm": 0.5333958878222103, + "learning_rate": 5.9845106051463815e-05, + "loss": 0.3731, + "step": 1882 + }, + { + "epoch": 2.0074626865671643, + "grad_norm": 0.496391847282843, + "learning_rate": 5.9819251833850395e-05, + "loss": 0.3634, + "step": 1883 + }, + { + "epoch": 2.008528784648188, + "grad_norm": 0.3918376522955584, + "learning_rate": 5.979338663740149e-05, + "loss": 0.3748, + "step": 1884 + }, + { + "epoch": 2.009594882729211, + "grad_norm": 0.3947052793986647, + "learning_rate": 5.9767510476445097e-05, + "loss": 0.3721, + "step": 1885 + }, + { + "epoch": 2.0106609808102345, + "grad_norm": 0.41802301612396614, + "learning_rate": 5.974162336531522e-05, + "loss": 0.368, + "step": 1886 + }, + { + "epoch": 2.011727078891258, + "grad_norm": 0.4284040217771835, + "learning_rate": 5.9715725318352024e-05, + "loss": 0.366, + "step": 1887 + }, + { + "epoch": 2.0127931769722816, + "grad_norm": 0.4723022945666149, + "learning_rate": 5.968981634990164e-05, + "loss": 0.3688, + "step": 1888 + }, + { + "epoch": 2.013859275053305, + "grad_norm": 0.44534446364431024, + "learning_rate": 5.9663896474316325e-05, + "loss": 0.3685, + "step": 1889 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4512122720314313, + "learning_rate": 5.9637965705954316e-05, + "loss": 0.3675, + "step": 1890 + }, + { + "epoch": 2.0159914712153517, + "grad_norm": 0.41496594298021805, + "learning_rate": 5.961202405917993e-05, + "loss": 0.3708, + "step": 1891 + }, + { + "epoch": 2.0170575692963753, + "grad_norm": 0.4351254782206402, + "learning_rate": 5.9586071548363475e-05, + "loss": 0.3649, + "step": 1892 + }, + { + "epoch": 2.018123667377399, + "grad_norm": 0.42476919414349634, + "learning_rate": 5.95601081878813e-05, + "loss": 0.3663, + "step": 1893 + }, + { + "epoch": 2.0191897654584223, + "grad_norm": 0.3868685718287423, + "learning_rate": 5.9534133992115766e-05, + "loss": 0.3696, + "step": 1894 + }, + { + "epoch": 2.0202558635394454, + "grad_norm": 0.3249684583348523, + "learning_rate": 5.9508148975455216e-05, + "loss": 0.368, + "step": 1895 + }, + { + "epoch": 2.021321961620469, + "grad_norm": 0.35899462535259774, + "learning_rate": 5.9482153152294e-05, + "loss": 0.3673, + "step": 1896 + }, + { + "epoch": 2.0223880597014925, + "grad_norm": 0.3843529775756281, + "learning_rate": 5.945614653703245e-05, + "loss": 0.3671, + "step": 1897 + }, + { + "epoch": 2.023454157782516, + "grad_norm": 0.4180113835206048, + "learning_rate": 5.9430129144076894e-05, + "loss": 0.3731, + "step": 1898 + }, + { + "epoch": 2.0245202558635396, + "grad_norm": 0.37475722594758115, + "learning_rate": 5.9404100987839594e-05, + "loss": 0.3649, + "step": 1899 + }, + { + "epoch": 2.025586353944563, + "grad_norm": 0.28629598775419896, + "learning_rate": 5.937806208273881e-05, + "loss": 0.3679, + "step": 1900 + }, + { + "epoch": 2.026652452025586, + "grad_norm": 0.300199898611435, + "learning_rate": 5.9352012443198744e-05, + "loss": 0.3751, + "step": 1901 + }, + { + "epoch": 2.0277185501066097, + "grad_norm": 0.33065361440169816, + "learning_rate": 5.932595208364954e-05, + "loss": 0.3674, + "step": 1902 + }, + { + "epoch": 2.0287846481876333, + "grad_norm": 0.45055044275702016, + "learning_rate": 5.9299881018527286e-05, + "loss": 0.3669, + "step": 1903 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 0.5359016977011789, + "learning_rate": 5.927379926227398e-05, + "loss": 0.3736, + "step": 1904 + }, + { + "epoch": 2.0309168443496803, + "grad_norm": 0.4728072949818313, + "learning_rate": 5.924770682933758e-05, + "loss": 0.3673, + "step": 1905 + }, + { + "epoch": 2.0319829424307034, + "grad_norm": 0.35779722355951155, + "learning_rate": 5.9221603734171916e-05, + "loss": 0.3712, + "step": 1906 + }, + { + "epoch": 2.033049040511727, + "grad_norm": 0.25778926704533023, + "learning_rate": 5.919548999123677e-05, + "loss": 0.3605, + "step": 1907 + }, + { + "epoch": 2.0341151385927505, + "grad_norm": 0.30958936271010357, + "learning_rate": 5.9169365614997786e-05, + "loss": 0.3633, + "step": 1908 + }, + { + "epoch": 2.035181236673774, + "grad_norm": 0.3966071294002005, + "learning_rate": 5.914323061992651e-05, + "loss": 0.3712, + "step": 1909 + }, + { + "epoch": 2.0362473347547976, + "grad_norm": 0.418075833743344, + "learning_rate": 5.9117085020500375e-05, + "loss": 0.367, + "step": 1910 + }, + { + "epoch": 2.0373134328358207, + "grad_norm": 0.4289422566733278, + "learning_rate": 5.909092883120269e-05, + "loss": 0.3677, + "step": 1911 + }, + { + "epoch": 2.038379530916844, + "grad_norm": 0.38407217216684475, + "learning_rate": 5.9064762066522614e-05, + "loss": 0.3657, + "step": 1912 + }, + { + "epoch": 2.0394456289978677, + "grad_norm": 0.3152326073948108, + "learning_rate": 5.9038584740955166e-05, + "loss": 0.3775, + "step": 1913 + }, + { + "epoch": 2.0405117270788913, + "grad_norm": 0.2902867416768938, + "learning_rate": 5.9012396869001255e-05, + "loss": 0.3629, + "step": 1914 + }, + { + "epoch": 2.041577825159915, + "grad_norm": 0.33254014516271413, + "learning_rate": 5.8986198465167566e-05, + "loss": 0.3652, + "step": 1915 + }, + { + "epoch": 2.0426439232409384, + "grad_norm": 0.3355861259716711, + "learning_rate": 5.895998954396669e-05, + "loss": 0.3739, + "step": 1916 + }, + { + "epoch": 2.0437100213219614, + "grad_norm": 0.33027480184371727, + "learning_rate": 5.893377011991696e-05, + "loss": 0.3642, + "step": 1917 + }, + { + "epoch": 2.044776119402985, + "grad_norm": 0.36437673592398223, + "learning_rate": 5.8907540207542616e-05, + "loss": 0.3673, + "step": 1918 + }, + { + "epoch": 2.0458422174840085, + "grad_norm": 0.37919989065780685, + "learning_rate": 5.888129982137364e-05, + "loss": 0.3697, + "step": 1919 + }, + { + "epoch": 2.046908315565032, + "grad_norm": 0.39804386255773294, + "learning_rate": 5.885504897594587e-05, + "loss": 0.3706, + "step": 1920 + }, + { + "epoch": 2.0479744136460556, + "grad_norm": 0.4048265506757637, + "learning_rate": 5.882878768580089e-05, + "loss": 0.368, + "step": 1921 + }, + { + "epoch": 2.0490405117270787, + "grad_norm": 0.39214048772357035, + "learning_rate": 5.880251596548608e-05, + "loss": 0.3729, + "step": 1922 + }, + { + "epoch": 2.050106609808102, + "grad_norm": 0.3768034830806379, + "learning_rate": 5.877623382955463e-05, + "loss": 0.366, + "step": 1923 + }, + { + "epoch": 2.0511727078891258, + "grad_norm": 0.36912589068028795, + "learning_rate": 5.874994129256546e-05, + "loss": 0.3704, + "step": 1924 + }, + { + "epoch": 2.0522388059701493, + "grad_norm": 0.34472606046121007, + "learning_rate": 5.872363836908328e-05, + "loss": 0.3605, + "step": 1925 + }, + { + "epoch": 2.053304904051173, + "grad_norm": 0.3447390942221336, + "learning_rate": 5.869732507367854e-05, + "loss": 0.3632, + "step": 1926 + }, + { + "epoch": 2.0543710021321964, + "grad_norm": 0.39703909632410184, + "learning_rate": 5.867100142092745e-05, + "loss": 0.3685, + "step": 1927 + }, + { + "epoch": 2.0554371002132195, + "grad_norm": 0.44058895200097586, + "learning_rate": 5.864466742541192e-05, + "loss": 0.3698, + "step": 1928 + }, + { + "epoch": 2.056503198294243, + "grad_norm": 0.4721233594093033, + "learning_rate": 5.861832310171963e-05, + "loss": 0.3764, + "step": 1929 + }, + { + "epoch": 2.0575692963752665, + "grad_norm": 0.4144618882599998, + "learning_rate": 5.8591968464443964e-05, + "loss": 0.3667, + "step": 1930 + }, + { + "epoch": 2.05863539445629, + "grad_norm": 0.302798084281156, + "learning_rate": 5.856560352818403e-05, + "loss": 0.3753, + "step": 1931 + }, + { + "epoch": 2.0597014925373136, + "grad_norm": 0.35481671123309483, + "learning_rate": 5.853922830754462e-05, + "loss": 0.3725, + "step": 1932 + }, + { + "epoch": 2.0607675906183367, + "grad_norm": 0.3547278123247793, + "learning_rate": 5.851284281713623e-05, + "loss": 0.3718, + "step": 1933 + }, + { + "epoch": 2.0618336886993602, + "grad_norm": 0.3404365623104542, + "learning_rate": 5.848644707157508e-05, + "loss": 0.3768, + "step": 1934 + }, + { + "epoch": 2.0628997867803838, + "grad_norm": 0.3550698273564781, + "learning_rate": 5.8460041085483004e-05, + "loss": 0.3727, + "step": 1935 + }, + { + "epoch": 2.0639658848614073, + "grad_norm": 0.35208070245068634, + "learning_rate": 5.8433624873487577e-05, + "loss": 0.37, + "step": 1936 + }, + { + "epoch": 2.065031982942431, + "grad_norm": 0.34247134066073665, + "learning_rate": 5.840719845022198e-05, + "loss": 0.3687, + "step": 1937 + }, + { + "epoch": 2.066098081023454, + "grad_norm": 0.35093264721237094, + "learning_rate": 5.8380761830325095e-05, + "loss": 0.3736, + "step": 1938 + }, + { + "epoch": 2.0671641791044775, + "grad_norm": 0.3638250203992304, + "learning_rate": 5.8354315028441434e-05, + "loss": 0.368, + "step": 1939 + }, + { + "epoch": 2.068230277185501, + "grad_norm": 0.31391019565989364, + "learning_rate": 5.832785805922115e-05, + "loss": 0.3755, + "step": 1940 + }, + { + "epoch": 2.0692963752665245, + "grad_norm": 0.32954228352966447, + "learning_rate": 5.830139093732003e-05, + "loss": 0.3695, + "step": 1941 + }, + { + "epoch": 2.070362473347548, + "grad_norm": 0.39242675952607037, + "learning_rate": 5.827491367739948e-05, + "loss": 0.3732, + "step": 1942 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.45718271088467405, + "learning_rate": 5.824842629412653e-05, + "loss": 0.3711, + "step": 1943 + }, + { + "epoch": 2.0724946695095947, + "grad_norm": 0.449089038740122, + "learning_rate": 5.822192880217381e-05, + "loss": 0.3682, + "step": 1944 + }, + { + "epoch": 2.0735607675906182, + "grad_norm": 0.37218601001021756, + "learning_rate": 5.819542121621955e-05, + "loss": 0.3672, + "step": 1945 + }, + { + "epoch": 2.074626865671642, + "grad_norm": 0.3365200917538621, + "learning_rate": 5.8168903550947586e-05, + "loss": 0.3667, + "step": 1946 + }, + { + "epoch": 2.0756929637526653, + "grad_norm": 0.3776897290391663, + "learning_rate": 5.814237582104732e-05, + "loss": 0.369, + "step": 1947 + }, + { + "epoch": 2.076759061833689, + "grad_norm": 0.4241260819869886, + "learning_rate": 5.811583804121373e-05, + "loss": 0.3721, + "step": 1948 + }, + { + "epoch": 2.077825159914712, + "grad_norm": 0.43159827522631167, + "learning_rate": 5.808929022614738e-05, + "loss": 0.3707, + "step": 1949 + }, + { + "epoch": 2.0788912579957355, + "grad_norm": 0.46068845826576565, + "learning_rate": 5.806273239055437e-05, + "loss": 0.3665, + "step": 1950 + }, + { + "epoch": 2.079957356076759, + "grad_norm": 0.4492734276793744, + "learning_rate": 5.803616454914636e-05, + "loss": 0.3687, + "step": 1951 + }, + { + "epoch": 2.0810234541577826, + "grad_norm": 0.4521877821462862, + "learning_rate": 5.800958671664057e-05, + "loss": 0.3638, + "step": 1952 + }, + { + "epoch": 2.082089552238806, + "grad_norm": 0.41545909536895476, + "learning_rate": 5.798299890775971e-05, + "loss": 0.3738, + "step": 1953 + }, + { + "epoch": 2.0831556503198296, + "grad_norm": 0.2972587597086915, + "learning_rate": 5.795640113723207e-05, + "loss": 0.367, + "step": 1954 + }, + { + "epoch": 2.0842217484008527, + "grad_norm": 0.27641022547500294, + "learning_rate": 5.7929793419791416e-05, + "loss": 0.3711, + "step": 1955 + }, + { + "epoch": 2.0852878464818763, + "grad_norm": 0.39396716547715316, + "learning_rate": 5.790317577017705e-05, + "loss": 0.3698, + "step": 1956 + }, + { + "epoch": 2.0863539445629, + "grad_norm": 0.38769077302687954, + "learning_rate": 5.787654820313376e-05, + "loss": 0.371, + "step": 1957 + }, + { + "epoch": 2.0874200426439233, + "grad_norm": 0.31630048451704645, + "learning_rate": 5.784991073341184e-05, + "loss": 0.3697, + "step": 1958 + }, + { + "epoch": 2.088486140724947, + "grad_norm": 0.23632112461394705, + "learning_rate": 5.782326337576705e-05, + "loss": 0.3674, + "step": 1959 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.2503975390702189, + "learning_rate": 5.779660614496067e-05, + "loss": 0.367, + "step": 1960 + }, + { + "epoch": 2.0906183368869935, + "grad_norm": 0.31776105349081096, + "learning_rate": 5.776993905575939e-05, + "loss": 0.3656, + "step": 1961 + }, + { + "epoch": 2.091684434968017, + "grad_norm": 0.29292989917695805, + "learning_rate": 5.77432621229354e-05, + "loss": 0.3712, + "step": 1962 + }, + { + "epoch": 2.0927505330490406, + "grad_norm": 0.2736016829783113, + "learning_rate": 5.771657536126634e-05, + "loss": 0.3682, + "step": 1963 + }, + { + "epoch": 2.093816631130064, + "grad_norm": 0.301999599571911, + "learning_rate": 5.768987878553531e-05, + "loss": 0.3683, + "step": 1964 + }, + { + "epoch": 2.094882729211087, + "grad_norm": 0.3234173819094893, + "learning_rate": 5.766317241053077e-05, + "loss": 0.369, + "step": 1965 + }, + { + "epoch": 2.0959488272921107, + "grad_norm": 0.324842749445397, + "learning_rate": 5.763645625104673e-05, + "loss": 0.3634, + "step": 1966 + }, + { + "epoch": 2.0970149253731343, + "grad_norm": 0.29363848696991995, + "learning_rate": 5.7609730321882495e-05, + "loss": 0.364, + "step": 1967 + }, + { + "epoch": 2.098081023454158, + "grad_norm": 0.2645265350384704, + "learning_rate": 5.758299463784287e-05, + "loss": 0.365, + "step": 1968 + }, + { + "epoch": 2.0991471215351813, + "grad_norm": 0.2474828784122483, + "learning_rate": 5.755624921373805e-05, + "loss": 0.3743, + "step": 1969 + }, + { + "epoch": 2.100213219616205, + "grad_norm": 0.2310435326688506, + "learning_rate": 5.752949406438357e-05, + "loss": 0.372, + "step": 1970 + }, + { + "epoch": 2.101279317697228, + "grad_norm": 0.22201440839590442, + "learning_rate": 5.7502729204600416e-05, + "loss": 0.3721, + "step": 1971 + }, + { + "epoch": 2.1023454157782515, + "grad_norm": 0.2541491754650197, + "learning_rate": 5.747595464921493e-05, + "loss": 0.3715, + "step": 1972 + }, + { + "epoch": 2.103411513859275, + "grad_norm": 0.31165144411877344, + "learning_rate": 5.744917041305882e-05, + "loss": 0.3654, + "step": 1973 + }, + { + "epoch": 2.1044776119402986, + "grad_norm": 0.28922154074403444, + "learning_rate": 5.7422376510969165e-05, + "loss": 0.3728, + "step": 1974 + }, + { + "epoch": 2.105543710021322, + "grad_norm": 0.27560948128394847, + "learning_rate": 5.739557295778838e-05, + "loss": 0.3695, + "step": 1975 + }, + { + "epoch": 2.106609808102345, + "grad_norm": 0.2625778705797159, + "learning_rate": 5.736875976836426e-05, + "loss": 0.3701, + "step": 1976 + }, + { + "epoch": 2.1076759061833688, + "grad_norm": 0.3217515539116153, + "learning_rate": 5.73419369575499e-05, + "loss": 0.3724, + "step": 1977 + }, + { + "epoch": 2.1087420042643923, + "grad_norm": 0.3509916580574595, + "learning_rate": 5.731510454020377e-05, + "loss": 0.3658, + "step": 1978 + }, + { + "epoch": 2.109808102345416, + "grad_norm": 0.3227845858529989, + "learning_rate": 5.728826253118961e-05, + "loss": 0.3709, + "step": 1979 + }, + { + "epoch": 2.1108742004264394, + "grad_norm": 0.3134700723493593, + "learning_rate": 5.7261410945376496e-05, + "loss": 0.3759, + "step": 1980 + }, + { + "epoch": 2.111940298507463, + "grad_norm": 0.31039720803106174, + "learning_rate": 5.723454979763882e-05, + "loss": 0.3724, + "step": 1981 + }, + { + "epoch": 2.113006396588486, + "grad_norm": 0.32706272995692337, + "learning_rate": 5.720767910285626e-05, + "loss": 0.3722, + "step": 1982 + }, + { + "epoch": 2.1140724946695095, + "grad_norm": 0.30080459490071365, + "learning_rate": 5.718079887591381e-05, + "loss": 0.3733, + "step": 1983 + }, + { + "epoch": 2.115138592750533, + "grad_norm": 0.3043060216756326, + "learning_rate": 5.715390913170167e-05, + "loss": 0.375, + "step": 1984 + }, + { + "epoch": 2.1162046908315566, + "grad_norm": 0.28712340226310096, + "learning_rate": 5.7127009885115394e-05, + "loss": 0.3733, + "step": 1985 + }, + { + "epoch": 2.11727078891258, + "grad_norm": 0.31229581889122415, + "learning_rate": 5.710010115105576e-05, + "loss": 0.3717, + "step": 1986 + }, + { + "epoch": 2.1183368869936032, + "grad_norm": 0.3112982453246555, + "learning_rate": 5.707318294442881e-05, + "loss": 0.375, + "step": 1987 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 0.2935803415699457, + "learning_rate": 5.704625528014582e-05, + "loss": 0.3688, + "step": 1988 + }, + { + "epoch": 2.1204690831556503, + "grad_norm": 0.34557760313152336, + "learning_rate": 5.701931817312334e-05, + "loss": 0.3654, + "step": 1989 + }, + { + "epoch": 2.121535181236674, + "grad_norm": 0.32408207030896125, + "learning_rate": 5.6992371638283094e-05, + "loss": 0.3681, + "step": 1990 + }, + { + "epoch": 2.1226012793176974, + "grad_norm": 0.3209402684629185, + "learning_rate": 5.6965415690552083e-05, + "loss": 0.3691, + "step": 1991 + }, + { + "epoch": 2.1236673773987205, + "grad_norm": 0.37804296374542823, + "learning_rate": 5.693845034486251e-05, + "loss": 0.3717, + "step": 1992 + }, + { + "epoch": 2.124733475479744, + "grad_norm": 0.402239674935612, + "learning_rate": 5.691147561615175e-05, + "loss": 0.3676, + "step": 1993 + }, + { + "epoch": 2.1257995735607675, + "grad_norm": 0.40844484750895815, + "learning_rate": 5.688449151936243e-05, + "loss": 0.3724, + "step": 1994 + }, + { + "epoch": 2.126865671641791, + "grad_norm": 0.42766169546473953, + "learning_rate": 5.6857498069442306e-05, + "loss": 0.3709, + "step": 1995 + }, + { + "epoch": 2.1279317697228146, + "grad_norm": 0.37265857219610005, + "learning_rate": 5.683049528134437e-05, + "loss": 0.3737, + "step": 1996 + }, + { + "epoch": 2.128997867803838, + "grad_norm": 0.3122405362957184, + "learning_rate": 5.680348317002676e-05, + "loss": 0.3677, + "step": 1997 + }, + { + "epoch": 2.1300639658848612, + "grad_norm": 0.38579576284804146, + "learning_rate": 5.677646175045276e-05, + "loss": 0.3716, + "step": 1998 + }, + { + "epoch": 2.131130063965885, + "grad_norm": 0.49480827662022464, + "learning_rate": 5.674943103759086e-05, + "loss": 0.3778, + "step": 1999 + }, + { + "epoch": 2.1321961620469083, + "grad_norm": 0.5608752285357496, + "learning_rate": 5.672239104641466e-05, + "loss": 0.3734, + "step": 2000 + }, + { + "epoch": 2.133262260127932, + "grad_norm": 0.4928793043710862, + "learning_rate": 5.669534179190289e-05, + "loss": 0.3682, + "step": 2001 + }, + { + "epoch": 2.1343283582089554, + "grad_norm": 0.38504633757152384, + "learning_rate": 5.666828328903947e-05, + "loss": 0.3745, + "step": 2002 + }, + { + "epoch": 2.1353944562899785, + "grad_norm": 0.2618645941857424, + "learning_rate": 5.664121555281339e-05, + "loss": 0.3747, + "step": 2003 + }, + { + "epoch": 2.136460554371002, + "grad_norm": 0.2755495972773531, + "learning_rate": 5.661413859821874e-05, + "loss": 0.3791, + "step": 2004 + }, + { + "epoch": 2.1375266524520256, + "grad_norm": 0.34215532694497924, + "learning_rate": 5.658705244025479e-05, + "loss": 0.3773, + "step": 2005 + }, + { + "epoch": 2.138592750533049, + "grad_norm": 0.30880880152023477, + "learning_rate": 5.6559957093925826e-05, + "loss": 0.3748, + "step": 2006 + }, + { + "epoch": 2.1396588486140726, + "grad_norm": 0.3224170651717293, + "learning_rate": 5.653285257424129e-05, + "loss": 0.3693, + "step": 2007 + }, + { + "epoch": 2.140724946695096, + "grad_norm": 0.35605595038557464, + "learning_rate": 5.650573889621566e-05, + "loss": 0.3713, + "step": 2008 + }, + { + "epoch": 2.1417910447761193, + "grad_norm": 0.3377860560809023, + "learning_rate": 5.6478616074868506e-05, + "loss": 0.376, + "step": 2009 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.2591813760532827, + "learning_rate": 5.645148412522447e-05, + "loss": 0.3745, + "step": 2010 + }, + { + "epoch": 2.1439232409381663, + "grad_norm": 0.28268845158045547, + "learning_rate": 5.642434306231323e-05, + "loss": 0.3708, + "step": 2011 + }, + { + "epoch": 2.14498933901919, + "grad_norm": 0.35060571169428506, + "learning_rate": 5.639719290116954e-05, + "loss": 0.3654, + "step": 2012 + }, + { + "epoch": 2.1460554371002134, + "grad_norm": 0.3694429698768061, + "learning_rate": 5.637003365683317e-05, + "loss": 0.3671, + "step": 2013 + }, + { + "epoch": 2.1471215351812365, + "grad_norm": 0.34971403364074133, + "learning_rate": 5.6342865344348935e-05, + "loss": 0.3677, + "step": 2014 + }, + { + "epoch": 2.14818763326226, + "grad_norm": 0.27907178596426185, + "learning_rate": 5.631568797876665e-05, + "loss": 0.3741, + "step": 2015 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 0.27207794051401735, + "learning_rate": 5.628850157514118e-05, + "loss": 0.3658, + "step": 2016 + }, + { + "epoch": 2.150319829424307, + "grad_norm": 0.28041793573401386, + "learning_rate": 5.6261306148532377e-05, + "loss": 0.3673, + "step": 2017 + }, + { + "epoch": 2.1513859275053306, + "grad_norm": 0.24028826632892578, + "learning_rate": 5.62341017140051e-05, + "loss": 0.3708, + "step": 2018 + }, + { + "epoch": 2.1524520255863537, + "grad_norm": 0.3066348756647207, + "learning_rate": 5.6206888286629186e-05, + "loss": 0.3722, + "step": 2019 + }, + { + "epoch": 2.1535181236673773, + "grad_norm": 0.2776104929301282, + "learning_rate": 5.6179665881479444e-05, + "loss": 0.371, + "step": 2020 + }, + { + "epoch": 2.154584221748401, + "grad_norm": 0.2932002920422834, + "learning_rate": 5.61524345136357e-05, + "loss": 0.3765, + "step": 2021 + }, + { + "epoch": 2.1556503198294243, + "grad_norm": 0.42204445414488995, + "learning_rate": 5.6125194198182683e-05, + "loss": 0.3747, + "step": 2022 + }, + { + "epoch": 2.156716417910448, + "grad_norm": 0.4892242798710215, + "learning_rate": 5.609794495021016e-05, + "loss": 0.3751, + "step": 2023 + }, + { + "epoch": 2.1577825159914714, + "grad_norm": 0.43720511006835944, + "learning_rate": 5.607068678481274e-05, + "loss": 0.368, + "step": 2024 + }, + { + "epoch": 2.1588486140724945, + "grad_norm": 0.33287359123355437, + "learning_rate": 5.6043419717090075e-05, + "loss": 0.3694, + "step": 2025 + }, + { + "epoch": 2.159914712153518, + "grad_norm": 0.28896027090363063, + "learning_rate": 5.6016143762146685e-05, + "loss": 0.3688, + "step": 2026 + }, + { + "epoch": 2.1609808102345416, + "grad_norm": 0.2887140479083496, + "learning_rate": 5.598885893509203e-05, + "loss": 0.3737, + "step": 2027 + }, + { + "epoch": 2.162046908315565, + "grad_norm": 0.2695168201652278, + "learning_rate": 5.59615652510405e-05, + "loss": 0.3705, + "step": 2028 + }, + { + "epoch": 2.1631130063965887, + "grad_norm": 0.2407806636654491, + "learning_rate": 5.593426272511136e-05, + "loss": 0.3698, + "step": 2029 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.32911747478553777, + "learning_rate": 5.590695137242881e-05, + "loss": 0.3752, + "step": 2030 + }, + { + "epoch": 2.1652452025586353, + "grad_norm": 0.3868451813947213, + "learning_rate": 5.587963120812189e-05, + "loss": 0.3794, + "step": 2031 + }, + { + "epoch": 2.166311300639659, + "grad_norm": 0.34786974694646033, + "learning_rate": 5.585230224732458e-05, + "loss": 0.3652, + "step": 2032 + }, + { + "epoch": 2.1673773987206824, + "grad_norm": 0.35478109819952885, + "learning_rate": 5.582496450517569e-05, + "loss": 0.3795, + "step": 2033 + }, + { + "epoch": 2.168443496801706, + "grad_norm": 0.3508750515755295, + "learning_rate": 5.5797617996818915e-05, + "loss": 0.3646, + "step": 2034 + }, + { + "epoch": 2.1695095948827294, + "grad_norm": 0.3447886192528013, + "learning_rate": 5.57702627374028e-05, + "loss": 0.3676, + "step": 2035 + }, + { + "epoch": 2.1705756929637525, + "grad_norm": 0.41611174037016146, + "learning_rate": 5.5742898742080726e-05, + "loss": 0.3703, + "step": 2036 + }, + { + "epoch": 2.171641791044776, + "grad_norm": 0.4442311781729241, + "learning_rate": 5.5715526026010935e-05, + "loss": 0.3659, + "step": 2037 + }, + { + "epoch": 2.1727078891257996, + "grad_norm": 0.3670273857196127, + "learning_rate": 5.568814460435649e-05, + "loss": 0.3768, + "step": 2038 + }, + { + "epoch": 2.173773987206823, + "grad_norm": 0.2593513887640986, + "learning_rate": 5.5660754492285264e-05, + "loss": 0.3669, + "step": 2039 + }, + { + "epoch": 2.1748400852878467, + "grad_norm": 0.28282579166446825, + "learning_rate": 5.563335570496996e-05, + "loss": 0.3739, + "step": 2040 + }, + { + "epoch": 2.1759061833688698, + "grad_norm": 0.36977014027652605, + "learning_rate": 5.560594825758809e-05, + "loss": 0.3615, + "step": 2041 + }, + { + "epoch": 2.1769722814498933, + "grad_norm": 0.37906406180450586, + "learning_rate": 5.557853216532194e-05, + "loss": 0.374, + "step": 2042 + }, + { + "epoch": 2.178038379530917, + "grad_norm": 0.3003073120798734, + "learning_rate": 5.555110744335863e-05, + "loss": 0.3688, + "step": 2043 + }, + { + "epoch": 2.1791044776119404, + "grad_norm": 0.30581675422030474, + "learning_rate": 5.552367410688999e-05, + "loss": 0.3709, + "step": 2044 + }, + { + "epoch": 2.180170575692964, + "grad_norm": 0.4243247655082495, + "learning_rate": 5.5496232171112703e-05, + "loss": 0.3725, + "step": 2045 + }, + { + "epoch": 2.181236673773987, + "grad_norm": 0.5418944865918137, + "learning_rate": 5.546878165122815e-05, + "loss": 0.3747, + "step": 2046 + }, + { + "epoch": 2.1823027718550105, + "grad_norm": 0.5717964571682613, + "learning_rate": 5.544132256244249e-05, + "loss": 0.3701, + "step": 2047 + }, + { + "epoch": 2.183368869936034, + "grad_norm": 0.5656819581120287, + "learning_rate": 5.5413854919966654e-05, + "loss": 0.3721, + "step": 2048 + }, + { + "epoch": 2.1844349680170576, + "grad_norm": 0.48959229163286955, + "learning_rate": 5.538637873901626e-05, + "loss": 0.3697, + "step": 2049 + }, + { + "epoch": 2.185501066098081, + "grad_norm": 0.39474515330398696, + "learning_rate": 5.5358894034811705e-05, + "loss": 0.3723, + "step": 2050 + }, + { + "epoch": 2.1865671641791047, + "grad_norm": 0.29811160693240174, + "learning_rate": 5.533140082257808e-05, + "loss": 0.3765, + "step": 2051 + }, + { + "epoch": 2.1876332622601278, + "grad_norm": 0.2398629529634624, + "learning_rate": 5.530389911754519e-05, + "loss": 0.3702, + "step": 2052 + }, + { + "epoch": 2.1886993603411513, + "grad_norm": 0.2641014816643812, + "learning_rate": 5.527638893494755e-05, + "loss": 0.375, + "step": 2053 + }, + { + "epoch": 2.189765458422175, + "grad_norm": 0.26988134759785193, + "learning_rate": 5.5248870290024396e-05, + "loss": 0.3702, + "step": 2054 + }, + { + "epoch": 2.1908315565031984, + "grad_norm": 0.26917332673147915, + "learning_rate": 5.5221343198019596e-05, + "loss": 0.3695, + "step": 2055 + }, + { + "epoch": 2.191897654584222, + "grad_norm": 0.32231195092404136, + "learning_rate": 5.519380767418176e-05, + "loss": 0.3685, + "step": 2056 + }, + { + "epoch": 2.192963752665245, + "grad_norm": 0.3242363910277114, + "learning_rate": 5.5166263733764096e-05, + "loss": 0.3695, + "step": 2057 + }, + { + "epoch": 2.1940298507462686, + "grad_norm": 0.30073683294689246, + "learning_rate": 5.5138711392024545e-05, + "loss": 0.3704, + "step": 2058 + }, + { + "epoch": 2.195095948827292, + "grad_norm": 0.26491871391522587, + "learning_rate": 5.5111150664225665e-05, + "loss": 0.3715, + "step": 2059 + }, + { + "epoch": 2.1961620469083156, + "grad_norm": 0.294788631674408, + "learning_rate": 5.508358156563466e-05, + "loss": 0.3695, + "step": 2060 + }, + { + "epoch": 2.197228144989339, + "grad_norm": 0.3148068642489263, + "learning_rate": 5.505600411152341e-05, + "loss": 0.3723, + "step": 2061 + }, + { + "epoch": 2.1982942430703627, + "grad_norm": 0.2787016615117562, + "learning_rate": 5.502841831716833e-05, + "loss": 0.3739, + "step": 2062 + }, + { + "epoch": 2.199360341151386, + "grad_norm": 0.2643411187566, + "learning_rate": 5.5000824197850575e-05, + "loss": 0.3739, + "step": 2063 + }, + { + "epoch": 2.2004264392324093, + "grad_norm": 0.2991815067944646, + "learning_rate": 5.497322176885582e-05, + "loss": 0.3734, + "step": 2064 + }, + { + "epoch": 2.201492537313433, + "grad_norm": 0.31374929510374666, + "learning_rate": 5.494561104547439e-05, + "loss": 0.3668, + "step": 2065 + }, + { + "epoch": 2.2025586353944564, + "grad_norm": 0.2983982343106373, + "learning_rate": 5.491799204300119e-05, + "loss": 0.3756, + "step": 2066 + }, + { + "epoch": 2.20362473347548, + "grad_norm": 0.3055575842818981, + "learning_rate": 5.489036477673571e-05, + "loss": 0.3739, + "step": 2067 + }, + { + "epoch": 2.204690831556503, + "grad_norm": 0.36274620979363453, + "learning_rate": 5.486272926198202e-05, + "loss": 0.3701, + "step": 2068 + }, + { + "epoch": 2.2057569296375266, + "grad_norm": 0.38535842211347343, + "learning_rate": 5.483508551404875e-05, + "loss": 0.3683, + "step": 2069 + }, + { + "epoch": 2.20682302771855, + "grad_norm": 0.36592619429217743, + "learning_rate": 5.4807433548249106e-05, + "loss": 0.3719, + "step": 2070 + }, + { + "epoch": 2.2078891257995736, + "grad_norm": 0.320538300126649, + "learning_rate": 5.4779773379900856e-05, + "loss": 0.3681, + "step": 2071 + }, + { + "epoch": 2.208955223880597, + "grad_norm": 0.3284949841600627, + "learning_rate": 5.4752105024326265e-05, + "loss": 0.3722, + "step": 2072 + }, + { + "epoch": 2.2100213219616203, + "grad_norm": 0.36352567522233187, + "learning_rate": 5.4724428496852184e-05, + "loss": 0.3742, + "step": 2073 + }, + { + "epoch": 2.211087420042644, + "grad_norm": 0.3708212783274699, + "learning_rate": 5.469674381280997e-05, + "loss": 0.3723, + "step": 2074 + }, + { + "epoch": 2.2121535181236673, + "grad_norm": 0.34144099793040533, + "learning_rate": 5.4669050987535504e-05, + "loss": 0.3716, + "step": 2075 + }, + { + "epoch": 2.213219616204691, + "grad_norm": 0.31816989140951235, + "learning_rate": 5.464135003636914e-05, + "loss": 0.3651, + "step": 2076 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.29852456569861313, + "learning_rate": 5.461364097465581e-05, + "loss": 0.3711, + "step": 2077 + }, + { + "epoch": 2.2153518123667375, + "grad_norm": 0.2455931272133471, + "learning_rate": 5.4585923817744864e-05, + "loss": 0.3718, + "step": 2078 + }, + { + "epoch": 2.216417910447761, + "grad_norm": 0.26413290094993985, + "learning_rate": 5.455819858099018e-05, + "loss": 0.365, + "step": 2079 + }, + { + "epoch": 2.2174840085287846, + "grad_norm": 0.24066490453483533, + "learning_rate": 5.4530465279750087e-05, + "loss": 0.3705, + "step": 2080 + }, + { + "epoch": 2.218550106609808, + "grad_norm": 0.2618435559346739, + "learning_rate": 5.450272392938742e-05, + "loss": 0.3735, + "step": 2081 + }, + { + "epoch": 2.2196162046908317, + "grad_norm": 0.24293872764002158, + "learning_rate": 5.4474974545269394e-05, + "loss": 0.3717, + "step": 2082 + }, + { + "epoch": 2.220682302771855, + "grad_norm": 0.2368196730572011, + "learning_rate": 5.444721714276778e-05, + "loss": 0.3639, + "step": 2083 + }, + { + "epoch": 2.2217484008528783, + "grad_norm": 0.27260123271017317, + "learning_rate": 5.44194517372587e-05, + "loss": 0.3685, + "step": 2084 + }, + { + "epoch": 2.222814498933902, + "grad_norm": 0.270272968775527, + "learning_rate": 5.439167834412277e-05, + "loss": 0.3672, + "step": 2085 + }, + { + "epoch": 2.2238805970149254, + "grad_norm": 0.29536457376389963, + "learning_rate": 5.436389697874499e-05, + "loss": 0.3656, + "step": 2086 + }, + { + "epoch": 2.224946695095949, + "grad_norm": 0.38141850154306833, + "learning_rate": 5.4336107656514796e-05, + "loss": 0.3718, + "step": 2087 + }, + { + "epoch": 2.2260127931769724, + "grad_norm": 0.38555373110953656, + "learning_rate": 5.430831039282603e-05, + "loss": 0.3653, + "step": 2088 + }, + { + "epoch": 2.227078891257996, + "grad_norm": 0.33806578284967886, + "learning_rate": 5.428050520307693e-05, + "loss": 0.3651, + "step": 2089 + }, + { + "epoch": 2.228144989339019, + "grad_norm": 0.3782268091971875, + "learning_rate": 5.425269210267013e-05, + "loss": 0.3711, + "step": 2090 + }, + { + "epoch": 2.2292110874200426, + "grad_norm": 0.4292891226917417, + "learning_rate": 5.422487110701263e-05, + "loss": 0.3702, + "step": 2091 + }, + { + "epoch": 2.230277185501066, + "grad_norm": 0.41142332636108486, + "learning_rate": 5.419704223151584e-05, + "loss": 0.3661, + "step": 2092 + }, + { + "epoch": 2.2313432835820897, + "grad_norm": 0.3326023012950207, + "learning_rate": 5.416920549159549e-05, + "loss": 0.3721, + "step": 2093 + }, + { + "epoch": 2.232409381663113, + "grad_norm": 0.26521116383654986, + "learning_rate": 5.4141360902671696e-05, + "loss": 0.3723, + "step": 2094 + }, + { + "epoch": 2.2334754797441363, + "grad_norm": 0.2612433990558302, + "learning_rate": 5.411350848016891e-05, + "loss": 0.3601, + "step": 2095 + }, + { + "epoch": 2.23454157782516, + "grad_norm": 0.3434339511954728, + "learning_rate": 5.4085648239515914e-05, + "loss": 0.3692, + "step": 2096 + }, + { + "epoch": 2.2356076759061834, + "grad_norm": 0.3918166163519152, + "learning_rate": 5.4057780196145856e-05, + "loss": 0.3601, + "step": 2097 + }, + { + "epoch": 2.236673773987207, + "grad_norm": 0.3818529019674707, + "learning_rate": 5.402990436549617e-05, + "loss": 0.3703, + "step": 2098 + }, + { + "epoch": 2.2377398720682304, + "grad_norm": 0.36098975401837946, + "learning_rate": 5.4002020763008624e-05, + "loss": 0.3711, + "step": 2099 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.31093262243992925, + "learning_rate": 5.397412940412927e-05, + "loss": 0.3698, + "step": 2100 + }, + { + "epoch": 2.239872068230277, + "grad_norm": 0.3234247785754041, + "learning_rate": 5.39462303043085e-05, + "loss": 0.3699, + "step": 2101 + }, + { + "epoch": 2.2409381663113006, + "grad_norm": 0.3388924115506885, + "learning_rate": 5.391832347900095e-05, + "loss": 0.3714, + "step": 2102 + }, + { + "epoch": 2.242004264392324, + "grad_norm": 0.2867861763859992, + "learning_rate": 5.389040894366554e-05, + "loss": 0.3681, + "step": 2103 + }, + { + "epoch": 2.2430703624733477, + "grad_norm": 0.27923094302877083, + "learning_rate": 5.386248671376549e-05, + "loss": 0.3711, + "step": 2104 + }, + { + "epoch": 2.2441364605543708, + "grad_norm": 0.31103472441375013, + "learning_rate": 5.383455680476824e-05, + "loss": 0.3647, + "step": 2105 + }, + { + "epoch": 2.2452025586353943, + "grad_norm": 0.2851322771459375, + "learning_rate": 5.380661923214553e-05, + "loss": 0.3692, + "step": 2106 + }, + { + "epoch": 2.246268656716418, + "grad_norm": 0.32537762914509355, + "learning_rate": 5.377867401137332e-05, + "loss": 0.372, + "step": 2107 + }, + { + "epoch": 2.2473347547974414, + "grad_norm": 0.36286224273560747, + "learning_rate": 5.375072115793181e-05, + "loss": 0.3736, + "step": 2108 + }, + { + "epoch": 2.248400852878465, + "grad_norm": 0.39983878699233394, + "learning_rate": 5.3722760687305414e-05, + "loss": 0.3726, + "step": 2109 + }, + { + "epoch": 2.2494669509594885, + "grad_norm": 0.3468587900831904, + "learning_rate": 5.3694792614982794e-05, + "loss": 0.3701, + "step": 2110 + }, + { + "epoch": 2.2505330490405115, + "grad_norm": 0.29431394759332513, + "learning_rate": 5.366681695645681e-05, + "loss": 0.3741, + "step": 2111 + }, + { + "epoch": 2.251599147121535, + "grad_norm": 0.292448439837048, + "learning_rate": 5.363883372722452e-05, + "loss": 0.3705, + "step": 2112 + }, + { + "epoch": 2.2526652452025586, + "grad_norm": 0.3477064399327982, + "learning_rate": 5.3610842942787156e-05, + "loss": 0.3698, + "step": 2113 + }, + { + "epoch": 2.253731343283582, + "grad_norm": 0.30311348425697426, + "learning_rate": 5.3582844618650196e-05, + "loss": 0.3734, + "step": 2114 + }, + { + "epoch": 2.2547974413646057, + "grad_norm": 0.29982583065786783, + "learning_rate": 5.355483877032324e-05, + "loss": 0.376, + "step": 2115 + }, + { + "epoch": 2.2558635394456292, + "grad_norm": 0.2708619876359533, + "learning_rate": 5.352682541332006e-05, + "loss": 0.3743, + "step": 2116 + }, + { + "epoch": 2.2569296375266523, + "grad_norm": 0.28890106497206663, + "learning_rate": 5.349880456315862e-05, + "loss": 0.3715, + "step": 2117 + }, + { + "epoch": 2.257995735607676, + "grad_norm": 0.2663781257854233, + "learning_rate": 5.347077623536099e-05, + "loss": 0.3679, + "step": 2118 + }, + { + "epoch": 2.2590618336886994, + "grad_norm": 0.2674833564078522, + "learning_rate": 5.344274044545344e-05, + "loss": 0.3669, + "step": 2119 + }, + { + "epoch": 2.260127931769723, + "grad_norm": 0.28497773715596586, + "learning_rate": 5.3414697208966315e-05, + "loss": 0.3669, + "step": 2120 + }, + { + "epoch": 2.2611940298507465, + "grad_norm": 0.26215123696658, + "learning_rate": 5.3386646541434126e-05, + "loss": 0.3706, + "step": 2121 + }, + { + "epoch": 2.2622601279317696, + "grad_norm": 0.32005074927132454, + "learning_rate": 5.335858845839546e-05, + "loss": 0.3702, + "step": 2122 + }, + { + "epoch": 2.263326226012793, + "grad_norm": 0.32411250381064693, + "learning_rate": 5.333052297539308e-05, + "loss": 0.3699, + "step": 2123 + }, + { + "epoch": 2.2643923240938166, + "grad_norm": 0.29000255006330194, + "learning_rate": 5.3302450107973766e-05, + "loss": 0.368, + "step": 2124 + }, + { + "epoch": 2.26545842217484, + "grad_norm": 0.27431618415808595, + "learning_rate": 5.327436987168844e-05, + "loss": 0.371, + "step": 2125 + }, + { + "epoch": 2.2665245202558637, + "grad_norm": 0.378024359549344, + "learning_rate": 5.324628228209209e-05, + "loss": 0.3721, + "step": 2126 + }, + { + "epoch": 2.267590618336887, + "grad_norm": 0.43512140087622997, + "learning_rate": 5.321818735474379e-05, + "loss": 0.3715, + "step": 2127 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 0.3750669758497868, + "learning_rate": 5.3190085105206666e-05, + "loss": 0.3764, + "step": 2128 + }, + { + "epoch": 2.269722814498934, + "grad_norm": 0.28073557975371866, + "learning_rate": 5.31619755490479e-05, + "loss": 0.3748, + "step": 2129 + }, + { + "epoch": 2.2707889125799574, + "grad_norm": 0.2949560743757881, + "learning_rate": 5.3133858701838735e-05, + "loss": 0.3709, + "step": 2130 + }, + { + "epoch": 2.271855010660981, + "grad_norm": 0.3204542728152737, + "learning_rate": 5.310573457915443e-05, + "loss": 0.3703, + "step": 2131 + }, + { + "epoch": 2.272921108742004, + "grad_norm": 0.2868376825764523, + "learning_rate": 5.307760319657429e-05, + "loss": 0.3708, + "step": 2132 + }, + { + "epoch": 2.2739872068230276, + "grad_norm": 0.251294097973084, + "learning_rate": 5.3049464569681654e-05, + "loss": 0.3689, + "step": 2133 + }, + { + "epoch": 2.275053304904051, + "grad_norm": 0.27463227847743765, + "learning_rate": 5.3021318714063834e-05, + "loss": 0.3761, + "step": 2134 + }, + { + "epoch": 2.2761194029850746, + "grad_norm": 0.33592667040636226, + "learning_rate": 5.299316564531219e-05, + "loss": 0.3731, + "step": 2135 + }, + { + "epoch": 2.277185501066098, + "grad_norm": 0.38903214147148135, + "learning_rate": 5.296500537902205e-05, + "loss": 0.37, + "step": 2136 + }, + { + "epoch": 2.2782515991471217, + "grad_norm": 0.34915279703933755, + "learning_rate": 5.293683793079274e-05, + "loss": 0.3649, + "step": 2137 + }, + { + "epoch": 2.279317697228145, + "grad_norm": 0.3506016673900483, + "learning_rate": 5.2908663316227577e-05, + "loss": 0.3734, + "step": 2138 + }, + { + "epoch": 2.2803837953091683, + "grad_norm": 0.3878071683023028, + "learning_rate": 5.28804815509338e-05, + "loss": 0.3707, + "step": 2139 + }, + { + "epoch": 2.281449893390192, + "grad_norm": 0.35568233275956773, + "learning_rate": 5.285229265052268e-05, + "loss": 0.3758, + "step": 2140 + }, + { + "epoch": 2.2825159914712154, + "grad_norm": 0.3418784225462311, + "learning_rate": 5.2824096630609385e-05, + "loss": 0.37, + "step": 2141 + }, + { + "epoch": 2.283582089552239, + "grad_norm": 0.3607422406160099, + "learning_rate": 5.2795893506813024e-05, + "loss": 0.3728, + "step": 2142 + }, + { + "epoch": 2.2846481876332625, + "grad_norm": 0.3721645564569364, + "learning_rate": 5.276768329475671e-05, + "loss": 0.3717, + "step": 2143 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.3220957092565807, + "learning_rate": 5.2739466010067385e-05, + "loss": 0.3739, + "step": 2144 + }, + { + "epoch": 2.286780383795309, + "grad_norm": 0.2920281939799502, + "learning_rate": 5.271124166837599e-05, + "loss": 0.366, + "step": 2145 + }, + { + "epoch": 2.2878464818763327, + "grad_norm": 0.3080101489292651, + "learning_rate": 5.2683010285317333e-05, + "loss": 0.3675, + "step": 2146 + }, + { + "epoch": 2.288912579957356, + "grad_norm": 0.3042289029809852, + "learning_rate": 5.265477187653012e-05, + "loss": 0.369, + "step": 2147 + }, + { + "epoch": 2.2899786780383797, + "grad_norm": 0.3475082339985931, + "learning_rate": 5.262652645765699e-05, + "loss": 0.3702, + "step": 2148 + }, + { + "epoch": 2.291044776119403, + "grad_norm": 0.3115045075355101, + "learning_rate": 5.2598274044344414e-05, + "loss": 0.3687, + "step": 2149 + }, + { + "epoch": 2.2921108742004264, + "grad_norm": 0.32610593675887717, + "learning_rate": 5.257001465224278e-05, + "loss": 0.3692, + "step": 2150 + }, + { + "epoch": 2.29317697228145, + "grad_norm": 0.37456808053238067, + "learning_rate": 5.2541748297006306e-05, + "loss": 0.3721, + "step": 2151 + }, + { + "epoch": 2.2942430703624734, + "grad_norm": 0.3350461025624831, + "learning_rate": 5.251347499429309e-05, + "loss": 0.3732, + "step": 2152 + }, + { + "epoch": 2.295309168443497, + "grad_norm": 0.2568059568097965, + "learning_rate": 5.2485194759765074e-05, + "loss": 0.3691, + "step": 2153 + }, + { + "epoch": 2.29637526652452, + "grad_norm": 0.27831384466881337, + "learning_rate": 5.245690760908803e-05, + "loss": 0.3686, + "step": 2154 + }, + { + "epoch": 2.2974413646055436, + "grad_norm": 0.26602965147812224, + "learning_rate": 5.242861355793157e-05, + "loss": 0.3645, + "step": 2155 + }, + { + "epoch": 2.298507462686567, + "grad_norm": 0.2587039755436819, + "learning_rate": 5.240031262196914e-05, + "loss": 0.3757, + "step": 2156 + }, + { + "epoch": 2.2995735607675907, + "grad_norm": 0.27151417310136094, + "learning_rate": 5.237200481687798e-05, + "loss": 0.368, + "step": 2157 + }, + { + "epoch": 2.300639658848614, + "grad_norm": 0.26058762590380463, + "learning_rate": 5.234369015833914e-05, + "loss": 0.3697, + "step": 2158 + }, + { + "epoch": 2.3017057569296373, + "grad_norm": 0.33421242107141547, + "learning_rate": 5.2315368662037485e-05, + "loss": 0.3675, + "step": 2159 + }, + { + "epoch": 2.302771855010661, + "grad_norm": 0.3659615591367041, + "learning_rate": 5.228704034366162e-05, + "loss": 0.3703, + "step": 2160 + }, + { + "epoch": 2.3038379530916844, + "grad_norm": 0.27978795201644363, + "learning_rate": 5.2258705218904005e-05, + "loss": 0.371, + "step": 2161 + }, + { + "epoch": 2.304904051172708, + "grad_norm": 0.21852529768913895, + "learning_rate": 5.2230363303460794e-05, + "loss": 0.3683, + "step": 2162 + }, + { + "epoch": 2.3059701492537314, + "grad_norm": 0.3118783856447842, + "learning_rate": 5.220201461303193e-05, + "loss": 0.3714, + "step": 2163 + }, + { + "epoch": 2.307036247334755, + "grad_norm": 0.3392350656900747, + "learning_rate": 5.2173659163321145e-05, + "loss": 0.3682, + "step": 2164 + }, + { + "epoch": 2.308102345415778, + "grad_norm": 0.32734701181884485, + "learning_rate": 5.2145296970035846e-05, + "loss": 0.3744, + "step": 2165 + }, + { + "epoch": 2.3091684434968016, + "grad_norm": 0.303313761017607, + "learning_rate": 5.211692804888726e-05, + "loss": 0.3744, + "step": 2166 + }, + { + "epoch": 2.310234541577825, + "grad_norm": 0.2905681835769858, + "learning_rate": 5.2088552415590254e-05, + "loss": 0.3713, + "step": 2167 + }, + { + "epoch": 2.3113006396588487, + "grad_norm": 0.3534515955282182, + "learning_rate": 5.2060170085863484e-05, + "loss": 0.3703, + "step": 2168 + }, + { + "epoch": 2.3123667377398722, + "grad_norm": 0.4264619477891961, + "learning_rate": 5.203178107542925e-05, + "loss": 0.3715, + "step": 2169 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.47866741810652247, + "learning_rate": 5.200338540001364e-05, + "loss": 0.3706, + "step": 2170 + }, + { + "epoch": 2.314498933901919, + "grad_norm": 0.4733916257306672, + "learning_rate": 5.1974983075346335e-05, + "loss": 0.3639, + "step": 2171 + }, + { + "epoch": 2.3155650319829424, + "grad_norm": 0.42832051016045336, + "learning_rate": 5.194657411716076e-05, + "loss": 0.3698, + "step": 2172 + }, + { + "epoch": 2.316631130063966, + "grad_norm": 0.43789085333356936, + "learning_rate": 5.1918158541194014e-05, + "loss": 0.3735, + "step": 2173 + }, + { + "epoch": 2.3176972281449895, + "grad_norm": 0.4573284771159797, + "learning_rate": 5.188973636318684e-05, + "loss": 0.3713, + "step": 2174 + }, + { + "epoch": 2.318763326226013, + "grad_norm": 0.3916370694042795, + "learning_rate": 5.1861307598883644e-05, + "loss": 0.3723, + "step": 2175 + }, + { + "epoch": 2.319829424307036, + "grad_norm": 0.2585801987726249, + "learning_rate": 5.1832872264032495e-05, + "loss": 0.37, + "step": 2176 + }, + { + "epoch": 2.3208955223880596, + "grad_norm": 0.2602560441928257, + "learning_rate": 5.180443037438508e-05, + "loss": 0.3711, + "step": 2177 + }, + { + "epoch": 2.321961620469083, + "grad_norm": 0.36519338651936206, + "learning_rate": 5.1775981945696736e-05, + "loss": 0.3701, + "step": 2178 + }, + { + "epoch": 2.3230277185501067, + "grad_norm": 0.34363559674089617, + "learning_rate": 5.1747526993726406e-05, + "loss": 0.3701, + "step": 2179 + }, + { + "epoch": 2.3240938166311302, + "grad_norm": 0.34656810990367415, + "learning_rate": 5.1719065534236665e-05, + "loss": 0.3784, + "step": 2180 + }, + { + "epoch": 2.3251599147121533, + "grad_norm": 0.3632003951397555, + "learning_rate": 5.169059758299367e-05, + "loss": 0.3667, + "step": 2181 + }, + { + "epoch": 2.326226012793177, + "grad_norm": 0.33113190305897466, + "learning_rate": 5.1662123155767195e-05, + "loss": 0.3672, + "step": 2182 + }, + { + "epoch": 2.3272921108742004, + "grad_norm": 0.3490341083520184, + "learning_rate": 5.163364226833058e-05, + "loss": 0.3715, + "step": 2183 + }, + { + "epoch": 2.328358208955224, + "grad_norm": 0.30252710599573834, + "learning_rate": 5.1605154936460774e-05, + "loss": 0.3699, + "step": 2184 + }, + { + "epoch": 2.3294243070362475, + "grad_norm": 0.23971322338603304, + "learning_rate": 5.1576661175938274e-05, + "loss": 0.3684, + "step": 2185 + }, + { + "epoch": 2.3304904051172706, + "grad_norm": 0.2935195353785945, + "learning_rate": 5.154816100254714e-05, + "loss": 0.3728, + "step": 2186 + }, + { + "epoch": 2.331556503198294, + "grad_norm": 0.32527117670349187, + "learning_rate": 5.151965443207498e-05, + "loss": 0.3672, + "step": 2187 + }, + { + "epoch": 2.3326226012793176, + "grad_norm": 0.29601539551210887, + "learning_rate": 5.149114148031296e-05, + "loss": 0.3722, + "step": 2188 + }, + { + "epoch": 2.333688699360341, + "grad_norm": 0.29110059563391355, + "learning_rate": 5.1462622163055764e-05, + "loss": 0.3711, + "step": 2189 + }, + { + "epoch": 2.3347547974413647, + "grad_norm": 0.2777132993013124, + "learning_rate": 5.143409649610163e-05, + "loss": 0.3733, + "step": 2190 + }, + { + "epoch": 2.3358208955223883, + "grad_norm": 0.2838065564769729, + "learning_rate": 5.14055644952523e-05, + "loss": 0.3677, + "step": 2191 + }, + { + "epoch": 2.3368869936034113, + "grad_norm": 0.26546607714553994, + "learning_rate": 5.137702617631299e-05, + "loss": 0.3705, + "step": 2192 + }, + { + "epoch": 2.337953091684435, + "grad_norm": 0.2691865136150305, + "learning_rate": 5.134848155509245e-05, + "loss": 0.3693, + "step": 2193 + }, + { + "epoch": 2.3390191897654584, + "grad_norm": 0.3248422617546964, + "learning_rate": 5.131993064740293e-05, + "loss": 0.373, + "step": 2194 + }, + { + "epoch": 2.340085287846482, + "grad_norm": 0.44219209612841087, + "learning_rate": 5.1291373469060156e-05, + "loss": 0.3746, + "step": 2195 + }, + { + "epoch": 2.3411513859275055, + "grad_norm": 0.4487435007125376, + "learning_rate": 5.12628100358833e-05, + "loss": 0.3693, + "step": 2196 + }, + { + "epoch": 2.342217484008529, + "grad_norm": 0.39660870409128357, + "learning_rate": 5.123424036369504e-05, + "loss": 0.3748, + "step": 2197 + }, + { + "epoch": 2.343283582089552, + "grad_norm": 0.35851143039701705, + "learning_rate": 5.120566446832146e-05, + "loss": 0.3691, + "step": 2198 + }, + { + "epoch": 2.3443496801705757, + "grad_norm": 0.334759825605026, + "learning_rate": 5.117708236559216e-05, + "loss": 0.3673, + "step": 2199 + }, + { + "epoch": 2.345415778251599, + "grad_norm": 0.2941272239375261, + "learning_rate": 5.114849407134012e-05, + "loss": 0.3643, + "step": 2200 + }, + { + "epoch": 2.3464818763326227, + "grad_norm": 0.2789608400358062, + "learning_rate": 5.111989960140175e-05, + "loss": 0.3671, + "step": 2201 + }, + { + "epoch": 2.3475479744136463, + "grad_norm": 0.3040283299295704, + "learning_rate": 5.109129897161694e-05, + "loss": 0.3772, + "step": 2202 + }, + { + "epoch": 2.3486140724946694, + "grad_norm": 0.319463979431584, + "learning_rate": 5.106269219782891e-05, + "loss": 0.3749, + "step": 2203 + }, + { + "epoch": 2.349680170575693, + "grad_norm": 0.2870715537863343, + "learning_rate": 5.1034079295884366e-05, + "loss": 0.3702, + "step": 2204 + }, + { + "epoch": 2.3507462686567164, + "grad_norm": 0.23446913387159526, + "learning_rate": 5.100546028163334e-05, + "loss": 0.3753, + "step": 2205 + }, + { + "epoch": 2.35181236673774, + "grad_norm": 0.25437833515945757, + "learning_rate": 5.0976835170929296e-05, + "loss": 0.3666, + "step": 2206 + }, + { + "epoch": 2.3528784648187635, + "grad_norm": 0.30198542630658826, + "learning_rate": 5.0948203979629046e-05, + "loss": 0.3673, + "step": 2207 + }, + { + "epoch": 2.3539445628997866, + "grad_norm": 0.3141893057528808, + "learning_rate": 5.091956672359279e-05, + "loss": 0.3627, + "step": 2208 + }, + { + "epoch": 2.35501066098081, + "grad_norm": 0.2805349073413673, + "learning_rate": 5.089092341868407e-05, + "loss": 0.3676, + "step": 2209 + }, + { + "epoch": 2.3560767590618337, + "grad_norm": 0.2464031232660798, + "learning_rate": 5.08622740807698e-05, + "loss": 0.3719, + "step": 2210 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.23438323370618938, + "learning_rate": 5.0833618725720214e-05, + "loss": 0.3615, + "step": 2211 + }, + { + "epoch": 2.3582089552238807, + "grad_norm": 0.2997661424210569, + "learning_rate": 5.080495736940889e-05, + "loss": 0.3705, + "step": 2212 + }, + { + "epoch": 2.359275053304904, + "grad_norm": 0.28620092721857604, + "learning_rate": 5.077629002771274e-05, + "loss": 0.3713, + "step": 2213 + }, + { + "epoch": 2.3603411513859274, + "grad_norm": 0.2468965983835025, + "learning_rate": 5.074761671651198e-05, + "loss": 0.3696, + "step": 2214 + }, + { + "epoch": 2.361407249466951, + "grad_norm": 0.25371334633765574, + "learning_rate": 5.071893745169012e-05, + "loss": 0.3656, + "step": 2215 + }, + { + "epoch": 2.3624733475479744, + "grad_norm": 0.24096589331073096, + "learning_rate": 5.0690252249133986e-05, + "loss": 0.3709, + "step": 2216 + }, + { + "epoch": 2.363539445628998, + "grad_norm": 0.2152100387619128, + "learning_rate": 5.066156112473371e-05, + "loss": 0.3727, + "step": 2217 + }, + { + "epoch": 2.364605543710021, + "grad_norm": 0.19650715541656136, + "learning_rate": 5.063286409438265e-05, + "loss": 0.3678, + "step": 2218 + }, + { + "epoch": 2.3656716417910446, + "grad_norm": 0.21642059764416482, + "learning_rate": 5.0604161173977504e-05, + "loss": 0.3701, + "step": 2219 + }, + { + "epoch": 2.366737739872068, + "grad_norm": 0.27848392769400904, + "learning_rate": 5.057545237941818e-05, + "loss": 0.3697, + "step": 2220 + }, + { + "epoch": 2.3678038379530917, + "grad_norm": 0.3063826642109227, + "learning_rate": 5.054673772660785e-05, + "loss": 0.3669, + "step": 2221 + }, + { + "epoch": 2.368869936034115, + "grad_norm": 0.32561764160202256, + "learning_rate": 5.0518017231452965e-05, + "loss": 0.369, + "step": 2222 + }, + { + "epoch": 2.3699360341151388, + "grad_norm": 0.2857238132037038, + "learning_rate": 5.048929090986315e-05, + "loss": 0.3658, + "step": 2223 + }, + { + "epoch": 2.3710021321961623, + "grad_norm": 0.24657414607956593, + "learning_rate": 5.046055877775134e-05, + "loss": 0.3689, + "step": 2224 + }, + { + "epoch": 2.3720682302771854, + "grad_norm": 0.25182509640136513, + "learning_rate": 5.04318208510336e-05, + "loss": 0.3737, + "step": 2225 + }, + { + "epoch": 2.373134328358209, + "grad_norm": 0.2663592462420767, + "learning_rate": 5.040307714562928e-05, + "loss": 0.3654, + "step": 2226 + }, + { + "epoch": 2.3742004264392325, + "grad_norm": 0.2879071667851132, + "learning_rate": 5.0374327677460865e-05, + "loss": 0.362, + "step": 2227 + }, + { + "epoch": 2.375266524520256, + "grad_norm": 0.31268904527265867, + "learning_rate": 5.034557246245411e-05, + "loss": 0.3716, + "step": 2228 + }, + { + "epoch": 2.3763326226012795, + "grad_norm": 0.33284965204702893, + "learning_rate": 5.031681151653788e-05, + "loss": 0.3736, + "step": 2229 + }, + { + "epoch": 2.3773987206823026, + "grad_norm": 0.33195392118584366, + "learning_rate": 5.028804485564424e-05, + "loss": 0.3677, + "step": 2230 + }, + { + "epoch": 2.378464818763326, + "grad_norm": 0.3258278442178994, + "learning_rate": 5.025927249570844e-05, + "loss": 0.3693, + "step": 2231 + }, + { + "epoch": 2.3795309168443497, + "grad_norm": 0.27343346069158153, + "learning_rate": 5.0230494452668864e-05, + "loss": 0.3733, + "step": 2232 + }, + { + "epoch": 2.3805970149253732, + "grad_norm": 0.25449187697759107, + "learning_rate": 5.020171074246707e-05, + "loss": 0.3668, + "step": 2233 + }, + { + "epoch": 2.3816631130063968, + "grad_norm": 0.30392088734456846, + "learning_rate": 5.01729213810477e-05, + "loss": 0.3728, + "step": 2234 + }, + { + "epoch": 2.38272921108742, + "grad_norm": 0.3421952140379261, + "learning_rate": 5.014412638435861e-05, + "loss": 0.3684, + "step": 2235 + }, + { + "epoch": 2.3837953091684434, + "grad_norm": 0.2857424583185023, + "learning_rate": 5.011532576835069e-05, + "loss": 0.3629, + "step": 2236 + }, + { + "epoch": 2.384861407249467, + "grad_norm": 0.2704981763316653, + "learning_rate": 5.008651954897802e-05, + "loss": 0.3686, + "step": 2237 + }, + { + "epoch": 2.3859275053304905, + "grad_norm": 0.31308011375671224, + "learning_rate": 5.005770774219771e-05, + "loss": 0.3668, + "step": 2238 + }, + { + "epoch": 2.386993603411514, + "grad_norm": 0.3213528795416127, + "learning_rate": 5.002889036397005e-05, + "loss": 0.3732, + "step": 2239 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.34520492416317516, + "learning_rate": 5.000006743025834e-05, + "loss": 0.3745, + "step": 2240 + }, + { + "epoch": 2.3891257995735606, + "grad_norm": 0.30507013156130014, + "learning_rate": 4.997123895702898e-05, + "loss": 0.3705, + "step": 2241 + }, + { + "epoch": 2.390191897654584, + "grad_norm": 0.2626181582745989, + "learning_rate": 4.994240496025147e-05, + "loss": 0.3727, + "step": 2242 + }, + { + "epoch": 2.3912579957356077, + "grad_norm": 0.28512789584756165, + "learning_rate": 4.9913565455898327e-05, + "loss": 0.3756, + "step": 2243 + }, + { + "epoch": 2.3923240938166312, + "grad_norm": 0.34534631086868595, + "learning_rate": 4.988472045994515e-05, + "loss": 0.3713, + "step": 2244 + }, + { + "epoch": 2.3933901918976543, + "grad_norm": 0.3429792538710119, + "learning_rate": 4.9855869988370566e-05, + "loss": 0.3703, + "step": 2245 + }, + { + "epoch": 2.394456289978678, + "grad_norm": 0.25240157643598227, + "learning_rate": 4.982701405715622e-05, + "loss": 0.3677, + "step": 2246 + }, + { + "epoch": 2.3955223880597014, + "grad_norm": 0.2549810372821131, + "learning_rate": 4.9798152682286824e-05, + "loss": 0.3711, + "step": 2247 + }, + { + "epoch": 2.396588486140725, + "grad_norm": 0.351895077255566, + "learning_rate": 4.976928587975006e-05, + "loss": 0.3686, + "step": 2248 + }, + { + "epoch": 2.3976545842217485, + "grad_norm": 0.30952942264717365, + "learning_rate": 4.974041366553665e-05, + "loss": 0.372, + "step": 2249 + }, + { + "epoch": 2.398720682302772, + "grad_norm": 0.2747608348019261, + "learning_rate": 4.9711536055640285e-05, + "loss": 0.3728, + "step": 2250 + }, + { + "epoch": 2.399786780383795, + "grad_norm": 0.29969340676529904, + "learning_rate": 4.9682653066057676e-05, + "loss": 0.3696, + "step": 2251 + }, + { + "epoch": 2.4008528784648187, + "grad_norm": 0.32127749277577844, + "learning_rate": 4.965376471278848e-05, + "loss": 0.3733, + "step": 2252 + }, + { + "epoch": 2.401918976545842, + "grad_norm": 0.3048300891627042, + "learning_rate": 4.962487101183536e-05, + "loss": 0.369, + "step": 2253 + }, + { + "epoch": 2.4029850746268657, + "grad_norm": 0.3219753451808507, + "learning_rate": 4.959597197920392e-05, + "loss": 0.3681, + "step": 2254 + }, + { + "epoch": 2.4040511727078893, + "grad_norm": 0.2965082879251037, + "learning_rate": 4.956706763090272e-05, + "loss": 0.3724, + "step": 2255 + }, + { + "epoch": 2.405117270788913, + "grad_norm": 0.275785450919491, + "learning_rate": 4.953815798294327e-05, + "loss": 0.3675, + "step": 2256 + }, + { + "epoch": 2.406183368869936, + "grad_norm": 0.8759881578646651, + "learning_rate": 4.950924305134001e-05, + "loss": 0.3698, + "step": 2257 + }, + { + "epoch": 2.4072494669509594, + "grad_norm": 0.2354192544695265, + "learning_rate": 4.948032285211031e-05, + "loss": 0.3693, + "step": 2258 + }, + { + "epoch": 2.408315565031983, + "grad_norm": 0.28320526847754757, + "learning_rate": 4.945139740127444e-05, + "loss": 0.3685, + "step": 2259 + }, + { + "epoch": 2.4093816631130065, + "grad_norm": 0.27994748944288567, + "learning_rate": 4.9422466714855635e-05, + "loss": 0.3716, + "step": 2260 + }, + { + "epoch": 2.41044776119403, + "grad_norm": 0.2577423187481169, + "learning_rate": 4.939353080887996e-05, + "loss": 0.3736, + "step": 2261 + }, + { + "epoch": 2.411513859275053, + "grad_norm": 0.20189014886728646, + "learning_rate": 4.936458969937642e-05, + "loss": 0.3634, + "step": 2262 + }, + { + "epoch": 2.4125799573560767, + "grad_norm": 0.27831002089150636, + "learning_rate": 4.933564340237687e-05, + "loss": 0.3698, + "step": 2263 + }, + { + "epoch": 2.4136460554371, + "grad_norm": 0.33503367593001443, + "learning_rate": 4.930669193391607e-05, + "loss": 0.3759, + "step": 2264 + }, + { + "epoch": 2.4147121535181237, + "grad_norm": 0.2557075329669632, + "learning_rate": 4.927773531003161e-05, + "loss": 0.3719, + "step": 2265 + }, + { + "epoch": 2.4157782515991473, + "grad_norm": 0.1959575254371725, + "learning_rate": 4.9248773546763984e-05, + "loss": 0.3666, + "step": 2266 + }, + { + "epoch": 2.4168443496801704, + "grad_norm": 0.2566851026971127, + "learning_rate": 4.921980666015647e-05, + "loss": 0.3722, + "step": 2267 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 0.3373730577896526, + "learning_rate": 4.919083466625524e-05, + "loss": 0.3704, + "step": 2268 + }, + { + "epoch": 2.4189765458422174, + "grad_norm": 0.29147620272337893, + "learning_rate": 4.916185758110928e-05, + "loss": 0.3764, + "step": 2269 + }, + { + "epoch": 2.420042643923241, + "grad_norm": 0.23015006905725324, + "learning_rate": 4.913287542077035e-05, + "loss": 0.3765, + "step": 2270 + }, + { + "epoch": 2.4211087420042645, + "grad_norm": 0.2666917022092777, + "learning_rate": 4.91038882012931e-05, + "loss": 0.3685, + "step": 2271 + }, + { + "epoch": 2.4221748400852876, + "grad_norm": 0.2807196321927748, + "learning_rate": 4.907489593873493e-05, + "loss": 0.3673, + "step": 2272 + }, + { + "epoch": 2.423240938166311, + "grad_norm": 0.23362271125449918, + "learning_rate": 4.904589864915605e-05, + "loss": 0.367, + "step": 2273 + }, + { + "epoch": 2.4243070362473347, + "grad_norm": 0.2813715337244537, + "learning_rate": 4.901689634861943e-05, + "loss": 0.3712, + "step": 2274 + }, + { + "epoch": 2.425373134328358, + "grad_norm": 0.29016723056538085, + "learning_rate": 4.898788905319087e-05, + "loss": 0.373, + "step": 2275 + }, + { + "epoch": 2.4264392324093818, + "grad_norm": 0.302081673689402, + "learning_rate": 4.895887677893889e-05, + "loss": 0.3654, + "step": 2276 + }, + { + "epoch": 2.4275053304904053, + "grad_norm": 0.28261113280230443, + "learning_rate": 4.892985954193478e-05, + "loss": 0.3748, + "step": 2277 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.32074482716991354, + "learning_rate": 4.890083735825258e-05, + "loss": 0.3649, + "step": 2278 + }, + { + "epoch": 2.429637526652452, + "grad_norm": 0.3198080528899319, + "learning_rate": 4.887181024396907e-05, + "loss": 0.3593, + "step": 2279 + }, + { + "epoch": 2.4307036247334755, + "grad_norm": 0.25493105066167904, + "learning_rate": 4.884277821516377e-05, + "loss": 0.3661, + "step": 2280 + }, + { + "epoch": 2.431769722814499, + "grad_norm": 0.21652193711164933, + "learning_rate": 4.881374128791892e-05, + "loss": 0.3712, + "step": 2281 + }, + { + "epoch": 2.4328358208955225, + "grad_norm": 0.25095432049789496, + "learning_rate": 4.878469947831945e-05, + "loss": 0.3704, + "step": 2282 + }, + { + "epoch": 2.433901918976546, + "grad_norm": 0.26679049451296216, + "learning_rate": 4.875565280245303e-05, + "loss": 0.3741, + "step": 2283 + }, + { + "epoch": 2.434968017057569, + "grad_norm": 0.2999335227622482, + "learning_rate": 4.872660127640999e-05, + "loss": 0.3713, + "step": 2284 + }, + { + "epoch": 2.4360341151385927, + "grad_norm": 0.2695301789461439, + "learning_rate": 4.8697544916283386e-05, + "loss": 0.3659, + "step": 2285 + }, + { + "epoch": 2.4371002132196162, + "grad_norm": 0.21471471033527165, + "learning_rate": 4.866848373816893e-05, + "loss": 0.3682, + "step": 2286 + }, + { + "epoch": 2.4381663113006398, + "grad_norm": 0.23947760320903563, + "learning_rate": 4.863941775816498e-05, + "loss": 0.372, + "step": 2287 + }, + { + "epoch": 2.4392324093816633, + "grad_norm": 0.267331462460001, + "learning_rate": 4.8610346992372603e-05, + "loss": 0.3654, + "step": 2288 + }, + { + "epoch": 2.4402985074626864, + "grad_norm": 0.20555064622823938, + "learning_rate": 4.85812714568955e-05, + "loss": 0.3693, + "step": 2289 + }, + { + "epoch": 2.44136460554371, + "grad_norm": 0.21294997398754437, + "learning_rate": 4.855219116783997e-05, + "loss": 0.3728, + "step": 2290 + }, + { + "epoch": 2.4424307036247335, + "grad_norm": 0.21976358289050846, + "learning_rate": 4.8523106141315005e-05, + "loss": 0.3792, + "step": 2291 + }, + { + "epoch": 2.443496801705757, + "grad_norm": 0.23884227282875203, + "learning_rate": 4.8494016393432205e-05, + "loss": 0.3607, + "step": 2292 + }, + { + "epoch": 2.4445628997867805, + "grad_norm": 0.2624658049626465, + "learning_rate": 4.846492194030577e-05, + "loss": 0.3707, + "step": 2293 + }, + { + "epoch": 2.4456289978678036, + "grad_norm": 0.28918877258810133, + "learning_rate": 4.843582279805251e-05, + "loss": 0.3737, + "step": 2294 + }, + { + "epoch": 2.446695095948827, + "grad_norm": 0.31404526252223114, + "learning_rate": 4.840671898279185e-05, + "loss": 0.3659, + "step": 2295 + }, + { + "epoch": 2.4477611940298507, + "grad_norm": 0.29838133141624607, + "learning_rate": 4.837761051064579e-05, + "loss": 0.3687, + "step": 2296 + }, + { + "epoch": 2.4488272921108742, + "grad_norm": 0.22887592628169673, + "learning_rate": 4.834849739773889e-05, + "loss": 0.3722, + "step": 2297 + }, + { + "epoch": 2.449893390191898, + "grad_norm": 0.23115344121100373, + "learning_rate": 4.8319379660198316e-05, + "loss": 0.3737, + "step": 2298 + }, + { + "epoch": 2.450959488272921, + "grad_norm": 0.23213200641703416, + "learning_rate": 4.829025731415378e-05, + "loss": 0.364, + "step": 2299 + }, + { + "epoch": 2.4520255863539444, + "grad_norm": 0.2116924844746429, + "learning_rate": 4.826113037573756e-05, + "loss": 0.3674, + "step": 2300 + }, + { + "epoch": 2.453091684434968, + "grad_norm": 0.2559342759140697, + "learning_rate": 4.823199886108445e-05, + "loss": 0.3752, + "step": 2301 + }, + { + "epoch": 2.4541577825159915, + "grad_norm": 0.293845262944136, + "learning_rate": 4.82028627863318e-05, + "loss": 0.3709, + "step": 2302 + }, + { + "epoch": 2.455223880597015, + "grad_norm": 0.29454888718180283, + "learning_rate": 4.817372216761948e-05, + "loss": 0.3722, + "step": 2303 + }, + { + "epoch": 2.4562899786780386, + "grad_norm": 0.24511640910718516, + "learning_rate": 4.8144577021089884e-05, + "loss": 0.3747, + "step": 2304 + }, + { + "epoch": 2.4573560767590616, + "grad_norm": 0.2763149476728892, + "learning_rate": 4.81154273628879e-05, + "loss": 0.363, + "step": 2305 + }, + { + "epoch": 2.458422174840085, + "grad_norm": 0.3243976443000684, + "learning_rate": 4.8086273209160936e-05, + "loss": 0.3693, + "step": 2306 + }, + { + "epoch": 2.4594882729211087, + "grad_norm": 0.28035443109857583, + "learning_rate": 4.8057114576058863e-05, + "loss": 0.3709, + "step": 2307 + }, + { + "epoch": 2.4605543710021323, + "grad_norm": 0.20090541292771277, + "learning_rate": 4.802795147973406e-05, + "loss": 0.3666, + "step": 2308 + }, + { + "epoch": 2.461620469083156, + "grad_norm": 0.2580300489910844, + "learning_rate": 4.799878393634136e-05, + "loss": 0.3668, + "step": 2309 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.25079828700699003, + "learning_rate": 4.796961196203806e-05, + "loss": 0.3763, + "step": 2310 + }, + { + "epoch": 2.4637526652452024, + "grad_norm": 0.24137932132689202, + "learning_rate": 4.7940435572983936e-05, + "loss": 0.3705, + "step": 2311 + }, + { + "epoch": 2.464818763326226, + "grad_norm": 0.24074250093617247, + "learning_rate": 4.791125478534118e-05, + "loss": 0.3701, + "step": 2312 + }, + { + "epoch": 2.4658848614072495, + "grad_norm": 0.21896631390072002, + "learning_rate": 4.7882069615274435e-05, + "loss": 0.3683, + "step": 2313 + }, + { + "epoch": 2.466950959488273, + "grad_norm": 0.22445307173012358, + "learning_rate": 4.7852880078950764e-05, + "loss": 0.3721, + "step": 2314 + }, + { + "epoch": 2.4680170575692966, + "grad_norm": 0.27306888281171515, + "learning_rate": 4.782368619253965e-05, + "loss": 0.3687, + "step": 2315 + }, + { + "epoch": 2.4690831556503197, + "grad_norm": 0.36336555042461727, + "learning_rate": 4.7794487972213e-05, + "loss": 0.3639, + "step": 2316 + }, + { + "epoch": 2.470149253731343, + "grad_norm": 0.3601488690657708, + "learning_rate": 4.776528543414511e-05, + "loss": 0.3738, + "step": 2317 + }, + { + "epoch": 2.4712153518123667, + "grad_norm": 0.35339799743518896, + "learning_rate": 4.773607859451265e-05, + "loss": 0.3713, + "step": 2318 + }, + { + "epoch": 2.4722814498933903, + "grad_norm": 0.3693531153358677, + "learning_rate": 4.770686746949472e-05, + "loss": 0.3681, + "step": 2319 + }, + { + "epoch": 2.473347547974414, + "grad_norm": 0.39311860926438036, + "learning_rate": 4.767765207527275e-05, + "loss": 0.3696, + "step": 2320 + }, + { + "epoch": 2.474413646055437, + "grad_norm": 0.37612079543054977, + "learning_rate": 4.764843242803053e-05, + "loss": 0.3661, + "step": 2321 + }, + { + "epoch": 2.4754797441364604, + "grad_norm": 0.3521543006142998, + "learning_rate": 4.761920854395426e-05, + "loss": 0.3716, + "step": 2322 + }, + { + "epoch": 2.476545842217484, + "grad_norm": 0.26305773466563936, + "learning_rate": 4.7589980439232433e-05, + "loss": 0.3707, + "step": 2323 + }, + { + "epoch": 2.4776119402985075, + "grad_norm": 0.2559553568605443, + "learning_rate": 4.756074813005591e-05, + "loss": 0.3704, + "step": 2324 + }, + { + "epoch": 2.478678038379531, + "grad_norm": 0.2890296801712053, + "learning_rate": 4.753151163261787e-05, + "loss": 0.3669, + "step": 2325 + }, + { + "epoch": 2.479744136460554, + "grad_norm": 0.268964216758793, + "learning_rate": 4.75022709631138e-05, + "loss": 0.3704, + "step": 2326 + }, + { + "epoch": 2.4808102345415777, + "grad_norm": 0.29305303643593816, + "learning_rate": 4.747302613774153e-05, + "loss": 0.3667, + "step": 2327 + }, + { + "epoch": 2.481876332622601, + "grad_norm": 0.3343530498108033, + "learning_rate": 4.7443777172701146e-05, + "loss": 0.3703, + "step": 2328 + }, + { + "epoch": 2.4829424307036247, + "grad_norm": 0.3429097239396157, + "learning_rate": 4.74145240841951e-05, + "loss": 0.374, + "step": 2329 + }, + { + "epoch": 2.4840085287846483, + "grad_norm": 0.27000380770235705, + "learning_rate": 4.738526688842803e-05, + "loss": 0.371, + "step": 2330 + }, + { + "epoch": 2.485074626865672, + "grad_norm": 0.24578044590662604, + "learning_rate": 4.735600560160695e-05, + "loss": 0.3713, + "step": 2331 + }, + { + "epoch": 2.486140724946695, + "grad_norm": 0.2595315485397087, + "learning_rate": 4.7326740239941054e-05, + "loss": 0.3748, + "step": 2332 + }, + { + "epoch": 2.4872068230277184, + "grad_norm": 0.2207117203267181, + "learning_rate": 4.729747081964185e-05, + "loss": 0.3635, + "step": 2333 + }, + { + "epoch": 2.488272921108742, + "grad_norm": 0.22525118617182993, + "learning_rate": 4.7268197356923076e-05, + "loss": 0.3693, + "step": 2334 + }, + { + "epoch": 2.4893390191897655, + "grad_norm": 0.26088710793376507, + "learning_rate": 4.7238919868000704e-05, + "loss": 0.3609, + "step": 2335 + }, + { + "epoch": 2.490405117270789, + "grad_norm": 0.2885428062747875, + "learning_rate": 4.720963836909295e-05, + "loss": 0.3762, + "step": 2336 + }, + { + "epoch": 2.4914712153518126, + "grad_norm": 0.25259361954597753, + "learning_rate": 4.718035287642022e-05, + "loss": 0.3714, + "step": 2337 + }, + { + "epoch": 2.4925373134328357, + "grad_norm": 0.24137669671809825, + "learning_rate": 4.715106340620518e-05, + "loss": 0.3677, + "step": 2338 + }, + { + "epoch": 2.4936034115138592, + "grad_norm": 0.2686347142176362, + "learning_rate": 4.712176997467266e-05, + "loss": 0.3706, + "step": 2339 + }, + { + "epoch": 2.4946695095948828, + "grad_norm": 0.2812137990571536, + "learning_rate": 4.709247259804971e-05, + "loss": 0.371, + "step": 2340 + }, + { + "epoch": 2.4957356076759063, + "grad_norm": 0.24346310160667034, + "learning_rate": 4.706317129256554e-05, + "loss": 0.3716, + "step": 2341 + }, + { + "epoch": 2.49680170575693, + "grad_norm": 0.23214815216215345, + "learning_rate": 4.703386607445157e-05, + "loss": 0.3712, + "step": 2342 + }, + { + "epoch": 2.497867803837953, + "grad_norm": 0.2739545324648638, + "learning_rate": 4.7004556959941335e-05, + "loss": 0.3651, + "step": 2343 + }, + { + "epoch": 2.4989339019189765, + "grad_norm": 0.25915247784626605, + "learning_rate": 4.69752439652706e-05, + "loss": 0.3663, + "step": 2344 + }, + { + "epoch": 2.5, + "grad_norm": 0.25618378366217026, + "learning_rate": 4.694592710667723e-05, + "loss": 0.374, + "step": 2345 + }, + { + "epoch": 2.5010660980810235, + "grad_norm": 0.30631526116242996, + "learning_rate": 4.69166064004012e-05, + "loss": 0.3716, + "step": 2346 + }, + { + "epoch": 2.502132196162047, + "grad_norm": 0.3366426624946492, + "learning_rate": 4.688728186268472e-05, + "loss": 0.3714, + "step": 2347 + }, + { + "epoch": 2.50319829424307, + "grad_norm": 0.25931098502063027, + "learning_rate": 4.685795350977202e-05, + "loss": 0.3715, + "step": 2348 + }, + { + "epoch": 2.5042643923240937, + "grad_norm": 0.2123807260445706, + "learning_rate": 4.6828621357909494e-05, + "loss": 0.3649, + "step": 2349 + }, + { + "epoch": 2.5053304904051172, + "grad_norm": 0.19519486858966945, + "learning_rate": 4.679928542334564e-05, + "loss": 0.3756, + "step": 2350 + }, + { + "epoch": 2.5063965884861408, + "grad_norm": 0.24863053693547293, + "learning_rate": 4.676994572233101e-05, + "loss": 0.3676, + "step": 2351 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 0.25591846224110576, + "learning_rate": 4.674060227111831e-05, + "loss": 0.3746, + "step": 2352 + }, + { + "epoch": 2.5085287846481874, + "grad_norm": 0.2281497146449445, + "learning_rate": 4.6711255085962275e-05, + "loss": 0.3636, + "step": 2353 + }, + { + "epoch": 2.509594882729211, + "grad_norm": 0.2994568082565574, + "learning_rate": 4.66819041831197e-05, + "loss": 0.373, + "step": 2354 + }, + { + "epoch": 2.5106609808102345, + "grad_norm": 0.29113727854820887, + "learning_rate": 4.66525495788495e-05, + "loss": 0.3723, + "step": 2355 + }, + { + "epoch": 2.511727078891258, + "grad_norm": 0.2845367853179552, + "learning_rate": 4.662319128941256e-05, + "loss": 0.3659, + "step": 2356 + }, + { + "epoch": 2.5127931769722816, + "grad_norm": 0.24591581925857822, + "learning_rate": 4.6593829331071854e-05, + "loss": 0.3777, + "step": 2357 + }, + { + "epoch": 2.5138592750533046, + "grad_norm": 0.2322117630084555, + "learning_rate": 4.6564463720092405e-05, + "loss": 0.3679, + "step": 2358 + }, + { + "epoch": 2.5149253731343286, + "grad_norm": 0.22223391656686764, + "learning_rate": 4.653509447274121e-05, + "loss": 0.3685, + "step": 2359 + }, + { + "epoch": 2.5159914712153517, + "grad_norm": 0.2403114798612083, + "learning_rate": 4.650572160528733e-05, + "loss": 0.3661, + "step": 2360 + }, + { + "epoch": 2.5170575692963753, + "grad_norm": 0.2414630524740656, + "learning_rate": 4.647634513400178e-05, + "loss": 0.3682, + "step": 2361 + }, + { + "epoch": 2.518123667377399, + "grad_norm": 0.2006519271395349, + "learning_rate": 4.644696507515762e-05, + "loss": 0.3654, + "step": 2362 + }, + { + "epoch": 2.5191897654584223, + "grad_norm": 0.24007590073272467, + "learning_rate": 4.641758144502985e-05, + "loss": 0.3659, + "step": 2363 + }, + { + "epoch": 2.520255863539446, + "grad_norm": 0.26804872309010386, + "learning_rate": 4.638819425989551e-05, + "loss": 0.3709, + "step": 2364 + }, + { + "epoch": 2.521321961620469, + "grad_norm": 0.24000354250919373, + "learning_rate": 4.635880353603356e-05, + "loss": 0.3712, + "step": 2365 + }, + { + "epoch": 2.5223880597014925, + "grad_norm": 0.2652528639205553, + "learning_rate": 4.632940928972491e-05, + "loss": 0.3688, + "step": 2366 + }, + { + "epoch": 2.523454157782516, + "grad_norm": 0.2580420852617307, + "learning_rate": 4.630001153725247e-05, + "loss": 0.3703, + "step": 2367 + }, + { + "epoch": 2.5245202558635396, + "grad_norm": 0.22089649535392641, + "learning_rate": 4.627061029490105e-05, + "loss": 0.3711, + "step": 2368 + }, + { + "epoch": 2.525586353944563, + "grad_norm": 0.2545584727108516, + "learning_rate": 4.6241205578957435e-05, + "loss": 0.3737, + "step": 2369 + }, + { + "epoch": 2.526652452025586, + "grad_norm": 0.2967974395396264, + "learning_rate": 4.6211797405710285e-05, + "loss": 0.3712, + "step": 2370 + }, + { + "epoch": 2.5277185501066097, + "grad_norm": 0.2791964105337959, + "learning_rate": 4.618238579145022e-05, + "loss": 0.3732, + "step": 2371 + }, + { + "epoch": 2.5287846481876333, + "grad_norm": 0.26473846775672055, + "learning_rate": 4.6152970752469716e-05, + "loss": 0.3672, + "step": 2372 + }, + { + "epoch": 2.529850746268657, + "grad_norm": 0.2049402609403739, + "learning_rate": 4.612355230506321e-05, + "loss": 0.3691, + "step": 2373 + }, + { + "epoch": 2.5309168443496803, + "grad_norm": 0.1856152488020746, + "learning_rate": 4.609413046552697e-05, + "loss": 0.3741, + "step": 2374 + }, + { + "epoch": 2.5319829424307034, + "grad_norm": 0.2258681946492881, + "learning_rate": 4.606470525015917e-05, + "loss": 0.369, + "step": 2375 + }, + { + "epoch": 2.533049040511727, + "grad_norm": 0.25975437520850114, + "learning_rate": 4.603527667525987e-05, + "loss": 0.3712, + "step": 2376 + }, + { + "epoch": 2.5341151385927505, + "grad_norm": 0.2306018312278267, + "learning_rate": 4.6005844757130937e-05, + "loss": 0.3644, + "step": 2377 + }, + { + "epoch": 2.535181236673774, + "grad_norm": 0.24072038440875435, + "learning_rate": 4.597640951207615e-05, + "loss": 0.3737, + "step": 2378 + }, + { + "epoch": 2.5362473347547976, + "grad_norm": 0.2686312117022459, + "learning_rate": 4.5946970956401086e-05, + "loss": 0.3646, + "step": 2379 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.2940976442002358, + "learning_rate": 4.59175291064132e-05, + "loss": 0.3668, + "step": 2380 + }, + { + "epoch": 2.538379530916844, + "grad_norm": 0.2926304524554624, + "learning_rate": 4.588808397842172e-05, + "loss": 0.3662, + "step": 2381 + }, + { + "epoch": 2.5394456289978677, + "grad_norm": 0.24031871433591176, + "learning_rate": 4.585863558873774e-05, + "loss": 0.365, + "step": 2382 + }, + { + "epoch": 2.5405117270788913, + "grad_norm": 0.23386304551259074, + "learning_rate": 4.582918395367412e-05, + "loss": 0.3708, + "step": 2383 + }, + { + "epoch": 2.541577825159915, + "grad_norm": 0.23138470902625616, + "learning_rate": 4.5799729089545546e-05, + "loss": 0.3666, + "step": 2384 + }, + { + "epoch": 2.542643923240938, + "grad_norm": 0.20493257086499, + "learning_rate": 4.577027101266847e-05, + "loss": 0.3641, + "step": 2385 + }, + { + "epoch": 2.543710021321962, + "grad_norm": 0.2742256917065172, + "learning_rate": 4.574080973936115e-05, + "loss": 0.3656, + "step": 2386 + }, + { + "epoch": 2.544776119402985, + "grad_norm": 0.2573784786511887, + "learning_rate": 4.5711345285943585e-05, + "loss": 0.3722, + "step": 2387 + }, + { + "epoch": 2.5458422174840085, + "grad_norm": 0.220635079626304, + "learning_rate": 4.568187766873757e-05, + "loss": 0.3707, + "step": 2388 + }, + { + "epoch": 2.546908315565032, + "grad_norm": 0.24947774232184444, + "learning_rate": 4.565240690406661e-05, + "loss": 0.3685, + "step": 2389 + }, + { + "epoch": 2.5479744136460556, + "grad_norm": 0.22455688901803345, + "learning_rate": 4.5622933008256e-05, + "loss": 0.3663, + "step": 2390 + }, + { + "epoch": 2.549040511727079, + "grad_norm": 0.24210657316421766, + "learning_rate": 4.559345599763273e-05, + "loss": 0.3715, + "step": 2391 + }, + { + "epoch": 2.550106609808102, + "grad_norm": 0.23303940791069144, + "learning_rate": 4.556397588852553e-05, + "loss": 0.3748, + "step": 2392 + }, + { + "epoch": 2.5511727078891258, + "grad_norm": 0.20346922824676458, + "learning_rate": 4.553449269726487e-05, + "loss": 0.378, + "step": 2393 + }, + { + "epoch": 2.5522388059701493, + "grad_norm": 0.25118366195456854, + "learning_rate": 4.550500644018289e-05, + "loss": 0.3722, + "step": 2394 + }, + { + "epoch": 2.553304904051173, + "grad_norm": 0.23799851735663155, + "learning_rate": 4.547551713361344e-05, + "loss": 0.3654, + "step": 2395 + }, + { + "epoch": 2.5543710021321964, + "grad_norm": 0.24008761159127065, + "learning_rate": 4.544602479389207e-05, + "loss": 0.3715, + "step": 2396 + }, + { + "epoch": 2.5554371002132195, + "grad_norm": 0.3224169072622209, + "learning_rate": 4.5416529437355996e-05, + "loss": 0.3772, + "step": 2397 + }, + { + "epoch": 2.556503198294243, + "grad_norm": 0.28620149997931676, + "learning_rate": 4.538703108034414e-05, + "loss": 0.3678, + "step": 2398 + }, + { + "epoch": 2.5575692963752665, + "grad_norm": 0.2192934282403724, + "learning_rate": 4.535752973919701e-05, + "loss": 0.3703, + "step": 2399 + }, + { + "epoch": 2.55863539445629, + "grad_norm": 0.1709676106553397, + "learning_rate": 4.532802543025686e-05, + "loss": 0.3699, + "step": 2400 + }, + { + "epoch": 2.5597014925373136, + "grad_norm": 0.24392777534293011, + "learning_rate": 4.529851816986752e-05, + "loss": 0.3687, + "step": 2401 + }, + { + "epoch": 2.5607675906183367, + "grad_norm": 0.24891562920270666, + "learning_rate": 4.5269007974374494e-05, + "loss": 0.3697, + "step": 2402 + }, + { + "epoch": 2.5618336886993602, + "grad_norm": 0.2384548720836165, + "learning_rate": 4.5239494860124895e-05, + "loss": 0.3685, + "step": 2403 + }, + { + "epoch": 2.5628997867803838, + "grad_norm": 0.27098692831127463, + "learning_rate": 4.5209978843467436e-05, + "loss": 0.3678, + "step": 2404 + }, + { + "epoch": 2.5639658848614073, + "grad_norm": 0.20778244660818992, + "learning_rate": 4.5180459940752474e-05, + "loss": 0.3667, + "step": 2405 + }, + { + "epoch": 2.565031982942431, + "grad_norm": 0.22120855767567077, + "learning_rate": 4.515093816833193e-05, + "loss": 0.3682, + "step": 2406 + }, + { + "epoch": 2.566098081023454, + "grad_norm": 0.2634670562341265, + "learning_rate": 4.512141354255935e-05, + "loss": 0.3689, + "step": 2407 + }, + { + "epoch": 2.5671641791044775, + "grad_norm": 0.2583204612978333, + "learning_rate": 4.509188607978983e-05, + "loss": 0.365, + "step": 2408 + }, + { + "epoch": 2.568230277185501, + "grad_norm": 0.279446147717918, + "learning_rate": 4.5062355796380066e-05, + "loss": 0.3685, + "step": 2409 + }, + { + "epoch": 2.5692963752665245, + "grad_norm": 0.2489838679141536, + "learning_rate": 4.503282270868829e-05, + "loss": 0.3699, + "step": 2410 + }, + { + "epoch": 2.570362473347548, + "grad_norm": 0.2818230446222446, + "learning_rate": 4.500328683307428e-05, + "loss": 0.3735, + "step": 2411 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.32345371657550076, + "learning_rate": 4.4973748185899416e-05, + "loss": 0.3729, + "step": 2412 + }, + { + "epoch": 2.572494669509595, + "grad_norm": 0.27902407224490827, + "learning_rate": 4.4944206783526536e-05, + "loss": 0.3723, + "step": 2413 + }, + { + "epoch": 2.5735607675906182, + "grad_norm": 0.2424292615142393, + "learning_rate": 4.4914662642320054e-05, + "loss": 0.3687, + "step": 2414 + }, + { + "epoch": 2.574626865671642, + "grad_norm": 0.251352116384431, + "learning_rate": 4.4885115778645895e-05, + "loss": 0.3727, + "step": 2415 + }, + { + "epoch": 2.5756929637526653, + "grad_norm": 0.24325740030245527, + "learning_rate": 4.485556620887148e-05, + "loss": 0.3674, + "step": 2416 + }, + { + "epoch": 2.576759061833689, + "grad_norm": 0.30803045624118025, + "learning_rate": 4.482601394936573e-05, + "loss": 0.3707, + "step": 2417 + }, + { + "epoch": 2.5778251599147124, + "grad_norm": 0.3292401151984711, + "learning_rate": 4.479645901649908e-05, + "loss": 0.3732, + "step": 2418 + }, + { + "epoch": 2.5788912579957355, + "grad_norm": 0.29582277126522766, + "learning_rate": 4.476690142664342e-05, + "loss": 0.3744, + "step": 2419 + }, + { + "epoch": 2.579957356076759, + "grad_norm": 0.26686256357070676, + "learning_rate": 4.47373411961721e-05, + "loss": 0.3687, + "step": 2420 + }, + { + "epoch": 2.5810234541577826, + "grad_norm": 0.24943332540729626, + "learning_rate": 4.470777834145997e-05, + "loss": 0.3658, + "step": 2421 + }, + { + "epoch": 2.582089552238806, + "grad_norm": 0.2121258978338484, + "learning_rate": 4.467821287888331e-05, + "loss": 0.3668, + "step": 2422 + }, + { + "epoch": 2.5831556503198296, + "grad_norm": 0.2156156797629481, + "learning_rate": 4.464864482481984e-05, + "loss": 0.3642, + "step": 2423 + }, + { + "epoch": 2.5842217484008527, + "grad_norm": 0.2006814924668892, + "learning_rate": 4.461907419564874e-05, + "loss": 0.3697, + "step": 2424 + }, + { + "epoch": 2.5852878464818763, + "grad_norm": 0.210853772833808, + "learning_rate": 4.4589501007750595e-05, + "loss": 0.3749, + "step": 2425 + }, + { + "epoch": 2.5863539445629, + "grad_norm": 0.25717175869949604, + "learning_rate": 4.4559925277507416e-05, + "loss": 0.3698, + "step": 2426 + }, + { + "epoch": 2.5874200426439233, + "grad_norm": 0.25073349786641386, + "learning_rate": 4.4530347021302626e-05, + "loss": 0.3713, + "step": 2427 + }, + { + "epoch": 2.588486140724947, + "grad_norm": 0.2643644113163801, + "learning_rate": 4.450076625552102e-05, + "loss": 0.3747, + "step": 2428 + }, + { + "epoch": 2.58955223880597, + "grad_norm": 0.24060221681359292, + "learning_rate": 4.447118299654883e-05, + "loss": 0.3599, + "step": 2429 + }, + { + "epoch": 2.5906183368869935, + "grad_norm": 0.2720300708647726, + "learning_rate": 4.444159726077363e-05, + "loss": 0.3718, + "step": 2430 + }, + { + "epoch": 2.591684434968017, + "grad_norm": 0.3229511582480334, + "learning_rate": 4.4412009064584384e-05, + "loss": 0.3666, + "step": 2431 + }, + { + "epoch": 2.5927505330490406, + "grad_norm": 0.3888728224230281, + "learning_rate": 4.438241842437142e-05, + "loss": 0.3654, + "step": 2432 + }, + { + "epoch": 2.593816631130064, + "grad_norm": 0.38553138291681316, + "learning_rate": 4.435282535652641e-05, + "loss": 0.3724, + "step": 2433 + }, + { + "epoch": 2.594882729211087, + "grad_norm": 0.31969838656564453, + "learning_rate": 4.4323229877442374e-05, + "loss": 0.3694, + "step": 2434 + }, + { + "epoch": 2.5959488272921107, + "grad_norm": 0.40489827746268775, + "learning_rate": 4.429363200351366e-05, + "loss": 0.3701, + "step": 2435 + }, + { + "epoch": 2.5970149253731343, + "grad_norm": 0.4247590203835174, + "learning_rate": 4.426403175113598e-05, + "loss": 0.3751, + "step": 2436 + }, + { + "epoch": 2.598081023454158, + "grad_norm": 0.3951246174491148, + "learning_rate": 4.423442913670631e-05, + "loss": 0.3693, + "step": 2437 + }, + { + "epoch": 2.5991471215351813, + "grad_norm": 0.3111985733932929, + "learning_rate": 4.420482417662297e-05, + "loss": 0.3704, + "step": 2438 + }, + { + "epoch": 2.6002132196162044, + "grad_norm": 0.18873972891323956, + "learning_rate": 4.417521688728556e-05, + "loss": 0.3661, + "step": 2439 + }, + { + "epoch": 2.6012793176972284, + "grad_norm": 0.2495555063976354, + "learning_rate": 4.4145607285095e-05, + "loss": 0.3755, + "step": 2440 + }, + { + "epoch": 2.6023454157782515, + "grad_norm": 0.36376650288976037, + "learning_rate": 4.411599538645345e-05, + "loss": 0.3681, + "step": 2441 + }, + { + "epoch": 2.603411513859275, + "grad_norm": 0.4693799411527801, + "learning_rate": 4.408638120776436e-05, + "loss": 0.3705, + "step": 2442 + }, + { + "epoch": 2.6044776119402986, + "grad_norm": 0.46887250402317515, + "learning_rate": 4.405676476543247e-05, + "loss": 0.3671, + "step": 2443 + }, + { + "epoch": 2.605543710021322, + "grad_norm": 0.36949597097867015, + "learning_rate": 4.402714607586373e-05, + "loss": 0.3738, + "step": 2444 + }, + { + "epoch": 2.6066098081023457, + "grad_norm": 0.3009704252138755, + "learning_rate": 4.399752515546538e-05, + "loss": 0.3681, + "step": 2445 + }, + { + "epoch": 2.6076759061833688, + "grad_norm": 0.237310962341784, + "learning_rate": 4.396790202064583e-05, + "loss": 0.3686, + "step": 2446 + }, + { + "epoch": 2.6087420042643923, + "grad_norm": 0.23276180946430972, + "learning_rate": 4.393827668781478e-05, + "loss": 0.3683, + "step": 2447 + }, + { + "epoch": 2.609808102345416, + "grad_norm": 0.32501790451361395, + "learning_rate": 4.390864917338313e-05, + "loss": 0.3708, + "step": 2448 + }, + { + "epoch": 2.6108742004264394, + "grad_norm": 0.3613827214092068, + "learning_rate": 4.387901949376297e-05, + "loss": 0.3651, + "step": 2449 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.3553110668616482, + "learning_rate": 4.3849387665367614e-05, + "loss": 0.3728, + "step": 2450 + }, + { + "epoch": 2.613006396588486, + "grad_norm": 0.292471368689879, + "learning_rate": 4.381975370461155e-05, + "loss": 0.3745, + "step": 2451 + }, + { + "epoch": 2.6140724946695095, + "grad_norm": 0.2016201008306948, + "learning_rate": 4.379011762791045e-05, + "loss": 0.3671, + "step": 2452 + }, + { + "epoch": 2.615138592750533, + "grad_norm": 0.2616397181578619, + "learning_rate": 4.3760479451681164e-05, + "loss": 0.3705, + "step": 2453 + }, + { + "epoch": 2.6162046908315566, + "grad_norm": 0.3997537994958033, + "learning_rate": 4.3730839192341705e-05, + "loss": 0.3699, + "step": 2454 + }, + { + "epoch": 2.61727078891258, + "grad_norm": 0.4296512845219555, + "learning_rate": 4.370119686631123e-05, + "loss": 0.3654, + "step": 2455 + }, + { + "epoch": 2.6183368869936032, + "grad_norm": 0.327116358825709, + "learning_rate": 4.3671552490010036e-05, + "loss": 0.372, + "step": 2456 + }, + { + "epoch": 2.6194029850746268, + "grad_norm": 0.27235595479261326, + "learning_rate": 4.3641906079859584e-05, + "loss": 0.3698, + "step": 2457 + }, + { + "epoch": 2.6204690831556503, + "grad_norm": 0.35732218737027005, + "learning_rate": 4.3612257652282446e-05, + "loss": 0.3667, + "step": 2458 + }, + { + "epoch": 2.621535181236674, + "grad_norm": 0.37424865085620856, + "learning_rate": 4.358260722370229e-05, + "loss": 0.364, + "step": 2459 + }, + { + "epoch": 2.6226012793176974, + "grad_norm": 0.42605572507707096, + "learning_rate": 4.355295481054394e-05, + "loss": 0.3735, + "step": 2460 + }, + { + "epoch": 2.6236673773987205, + "grad_norm": 0.45443453638837905, + "learning_rate": 4.352330042923328e-05, + "loss": 0.372, + "step": 2461 + }, + { + "epoch": 2.624733475479744, + "grad_norm": 0.41383949227823347, + "learning_rate": 4.3493644096197274e-05, + "loss": 0.3717, + "step": 2462 + }, + { + "epoch": 2.6257995735607675, + "grad_norm": 0.2860714841801369, + "learning_rate": 4.3463985827864024e-05, + "loss": 0.3646, + "step": 2463 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 0.23137129814316412, + "learning_rate": 4.343432564066264e-05, + "loss": 0.3701, + "step": 2464 + }, + { + "epoch": 2.6279317697228146, + "grad_norm": 0.2512842592763747, + "learning_rate": 4.340466355102336e-05, + "loss": 0.3672, + "step": 2465 + }, + { + "epoch": 2.6289978678038377, + "grad_norm": 0.28901573296850475, + "learning_rate": 4.3374999575377393e-05, + "loss": 0.3651, + "step": 2466 + }, + { + "epoch": 2.6300639658848612, + "grad_norm": 0.2993536460359868, + "learning_rate": 4.334533373015709e-05, + "loss": 0.3688, + "step": 2467 + }, + { + "epoch": 2.631130063965885, + "grad_norm": 0.2614300970595663, + "learning_rate": 4.3315666031795736e-05, + "loss": 0.3678, + "step": 2468 + }, + { + "epoch": 2.6321961620469083, + "grad_norm": 0.25189646001058735, + "learning_rate": 4.328599649672774e-05, + "loss": 0.3712, + "step": 2469 + }, + { + "epoch": 2.633262260127932, + "grad_norm": 0.24766325430481256, + "learning_rate": 4.3256325141388464e-05, + "loss": 0.3641, + "step": 2470 + }, + { + "epoch": 2.6343283582089554, + "grad_norm": 0.3035417706772448, + "learning_rate": 4.3226651982214286e-05, + "loss": 0.3684, + "step": 2471 + }, + { + "epoch": 2.635394456289979, + "grad_norm": 0.2636223605711239, + "learning_rate": 4.319697703564261e-05, + "loss": 0.3688, + "step": 2472 + }, + { + "epoch": 2.636460554371002, + "grad_norm": 0.20666359945758703, + "learning_rate": 4.3167300318111805e-05, + "loss": 0.3636, + "step": 2473 + }, + { + "epoch": 2.6375266524520256, + "grad_norm": 0.24577933602515428, + "learning_rate": 4.313762184606124e-05, + "loss": 0.3681, + "step": 2474 + }, + { + "epoch": 2.638592750533049, + "grad_norm": 0.23261579434937743, + "learning_rate": 4.310794163593123e-05, + "loss": 0.3665, + "step": 2475 + }, + { + "epoch": 2.6396588486140726, + "grad_norm": 0.21528938082866045, + "learning_rate": 4.307825970416308e-05, + "loss": 0.3712, + "step": 2476 + }, + { + "epoch": 2.640724946695096, + "grad_norm": 0.2560143422062812, + "learning_rate": 4.3048576067199034e-05, + "loss": 0.3668, + "step": 2477 + }, + { + "epoch": 2.6417910447761193, + "grad_norm": 0.24544814002376064, + "learning_rate": 4.3018890741482296e-05, + "loss": 0.3705, + "step": 2478 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.2439487587446761, + "learning_rate": 4.298920374345698e-05, + "loss": 0.3656, + "step": 2479 + }, + { + "epoch": 2.6439232409381663, + "grad_norm": 0.25308388826969747, + "learning_rate": 4.295951508956814e-05, + "loss": 0.3638, + "step": 2480 + }, + { + "epoch": 2.64498933901919, + "grad_norm": 0.30616158821118233, + "learning_rate": 4.292982479626175e-05, + "loss": 0.3628, + "step": 2481 + }, + { + "epoch": 2.6460554371002134, + "grad_norm": 0.2635196635922222, + "learning_rate": 4.290013287998469e-05, + "loss": 0.369, + "step": 2482 + }, + { + "epoch": 2.6471215351812365, + "grad_norm": 0.2737120868957713, + "learning_rate": 4.287043935718474e-05, + "loss": 0.3729, + "step": 2483 + }, + { + "epoch": 2.64818763326226, + "grad_norm": 0.31387047069745105, + "learning_rate": 4.2840744244310565e-05, + "loss": 0.3749, + "step": 2484 + }, + { + "epoch": 2.6492537313432836, + "grad_norm": 0.29584704047191807, + "learning_rate": 4.281104755781172e-05, + "loss": 0.371, + "step": 2485 + }, + { + "epoch": 2.650319829424307, + "grad_norm": 0.31359007568593433, + "learning_rate": 4.278134931413862e-05, + "loss": 0.3698, + "step": 2486 + }, + { + "epoch": 2.6513859275053306, + "grad_norm": 0.2606911686791426, + "learning_rate": 4.275164952974256e-05, + "loss": 0.3712, + "step": 2487 + }, + { + "epoch": 2.6524520255863537, + "grad_norm": 0.2110093354675097, + "learning_rate": 4.272194822107566e-05, + "loss": 0.3699, + "step": 2488 + }, + { + "epoch": 2.6535181236673773, + "grad_norm": 0.1961521426965646, + "learning_rate": 4.2692245404590906e-05, + "loss": 0.366, + "step": 2489 + }, + { + "epoch": 2.654584221748401, + "grad_norm": 0.2515365045262743, + "learning_rate": 4.266254109674213e-05, + "loss": 0.3683, + "step": 2490 + }, + { + "epoch": 2.6556503198294243, + "grad_norm": 0.3249680113246906, + "learning_rate": 4.263283531398395e-05, + "loss": 0.3637, + "step": 2491 + }, + { + "epoch": 2.656716417910448, + "grad_norm": 0.3118378628242833, + "learning_rate": 4.260312807277185e-05, + "loss": 0.3719, + "step": 2492 + }, + { + "epoch": 2.657782515991471, + "grad_norm": 0.22886489658313464, + "learning_rate": 4.2573419389562074e-05, + "loss": 0.3715, + "step": 2493 + }, + { + "epoch": 2.6588486140724945, + "grad_norm": 0.1662131591027839, + "learning_rate": 4.254370928081171e-05, + "loss": 0.3772, + "step": 2494 + }, + { + "epoch": 2.659914712153518, + "grad_norm": 0.1969499093984148, + "learning_rate": 4.25139977629786e-05, + "loss": 0.3699, + "step": 2495 + }, + { + "epoch": 2.6609808102345416, + "grad_norm": 0.25930037314010335, + "learning_rate": 4.248428485252139e-05, + "loss": 0.3655, + "step": 2496 + }, + { + "epoch": 2.662046908315565, + "grad_norm": 0.26415096213448547, + "learning_rate": 4.2454570565899476e-05, + "loss": 0.3734, + "step": 2497 + }, + { + "epoch": 2.663113006396588, + "grad_norm": 0.27116001496004927, + "learning_rate": 4.242485491957305e-05, + "loss": 0.3657, + "step": 2498 + }, + { + "epoch": 2.664179104477612, + "grad_norm": 0.25863792879198805, + "learning_rate": 4.239513793000301e-05, + "loss": 0.3753, + "step": 2499 + }, + { + "epoch": 2.6652452025586353, + "grad_norm": 0.23133617297550452, + "learning_rate": 4.2365419613651035e-05, + "loss": 0.3737, + "step": 2500 + }, + { + "epoch": 2.666311300639659, + "grad_norm": 0.20980481301021206, + "learning_rate": 4.233569998697954e-05, + "loss": 0.3701, + "step": 2501 + }, + { + "epoch": 2.6673773987206824, + "grad_norm": 0.24297415494208804, + "learning_rate": 4.2305979066451626e-05, + "loss": 0.3677, + "step": 2502 + }, + { + "epoch": 2.668443496801706, + "grad_norm": 0.2493426212110305, + "learning_rate": 4.2276256868531165e-05, + "loss": 0.3652, + "step": 2503 + }, + { + "epoch": 2.6695095948827294, + "grad_norm": 0.2303457530378945, + "learning_rate": 4.22465334096827e-05, + "loss": 0.3771, + "step": 2504 + }, + { + "epoch": 2.6705756929637525, + "grad_norm": 0.223769372564616, + "learning_rate": 4.221680870637148e-05, + "loss": 0.3673, + "step": 2505 + }, + { + "epoch": 2.671641791044776, + "grad_norm": 0.2590177839797346, + "learning_rate": 4.2187082775063436e-05, + "loss": 0.3685, + "step": 2506 + }, + { + "epoch": 2.6727078891257996, + "grad_norm": 0.24760592724404568, + "learning_rate": 4.2157355632225216e-05, + "loss": 0.368, + "step": 2507 + }, + { + "epoch": 2.673773987206823, + "grad_norm": 0.22621973085952393, + "learning_rate": 4.2127627294324095e-05, + "loss": 0.3676, + "step": 2508 + }, + { + "epoch": 2.6748400852878467, + "grad_norm": 0.2335594327305558, + "learning_rate": 4.2097897777828025e-05, + "loss": 0.3743, + "step": 2509 + }, + { + "epoch": 2.6759061833688698, + "grad_norm": 0.2281132024099097, + "learning_rate": 4.2068167099205625e-05, + "loss": 0.3692, + "step": 2510 + }, + { + "epoch": 2.6769722814498933, + "grad_norm": 0.22341724033801894, + "learning_rate": 4.203843527492613e-05, + "loss": 0.3702, + "step": 2511 + }, + { + "epoch": 2.678038379530917, + "grad_norm": 0.2424014361387363, + "learning_rate": 4.200870232145943e-05, + "loss": 0.3706, + "step": 2512 + }, + { + "epoch": 2.6791044776119404, + "grad_norm": 0.1953899351329716, + "learning_rate": 4.1978968255276043e-05, + "loss": 0.3652, + "step": 2513 + }, + { + "epoch": 2.680170575692964, + "grad_norm": 0.22170371983312567, + "learning_rate": 4.1949233092847095e-05, + "loss": 0.3684, + "step": 2514 + }, + { + "epoch": 2.681236673773987, + "grad_norm": 0.23765221374061374, + "learning_rate": 4.1919496850644316e-05, + "loss": 0.3637, + "step": 2515 + }, + { + "epoch": 2.6823027718550105, + "grad_norm": 0.23687777883894273, + "learning_rate": 4.1889759545140045e-05, + "loss": 0.3655, + "step": 2516 + }, + { + "epoch": 2.683368869936034, + "grad_norm": 0.22338536825632072, + "learning_rate": 4.186002119280718e-05, + "loss": 0.3707, + "step": 2517 + }, + { + "epoch": 2.6844349680170576, + "grad_norm": 0.22005314669076753, + "learning_rate": 4.183028181011927e-05, + "loss": 0.374, + "step": 2518 + }, + { + "epoch": 2.685501066098081, + "grad_norm": 0.2303045665765263, + "learning_rate": 4.180054141355035e-05, + "loss": 0.3747, + "step": 2519 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.18534817689847655, + "learning_rate": 4.177080001957506e-05, + "loss": 0.3667, + "step": 2520 + }, + { + "epoch": 2.6876332622601278, + "grad_norm": 0.2629553843000235, + "learning_rate": 4.174105764466859e-05, + "loss": 0.3705, + "step": 2521 + }, + { + "epoch": 2.6886993603411513, + "grad_norm": 0.29148782597094064, + "learning_rate": 4.1711314305306676e-05, + "loss": 0.3704, + "step": 2522 + }, + { + "epoch": 2.689765458422175, + "grad_norm": 0.2811624835341636, + "learning_rate": 4.168157001796557e-05, + "loss": 0.3723, + "step": 2523 + }, + { + "epoch": 2.6908315565031984, + "grad_norm": 0.24880403947380741, + "learning_rate": 4.165182479912208e-05, + "loss": 0.367, + "step": 2524 + }, + { + "epoch": 2.6918976545842215, + "grad_norm": 0.2008980746882621, + "learning_rate": 4.1622078665253486e-05, + "loss": 0.3661, + "step": 2525 + }, + { + "epoch": 2.6929637526652455, + "grad_norm": 0.23350481252873526, + "learning_rate": 4.159233163283762e-05, + "loss": 0.3762, + "step": 2526 + }, + { + "epoch": 2.6940298507462686, + "grad_norm": 0.20261861943939447, + "learning_rate": 4.156258371835279e-05, + "loss": 0.3701, + "step": 2527 + }, + { + "epoch": 2.695095948827292, + "grad_norm": 0.25996452870016273, + "learning_rate": 4.153283493827777e-05, + "loss": 0.3776, + "step": 2528 + }, + { + "epoch": 2.6961620469083156, + "grad_norm": 0.19687929153495226, + "learning_rate": 4.150308530909187e-05, + "loss": 0.3636, + "step": 2529 + }, + { + "epoch": 2.697228144989339, + "grad_norm": 0.19556973866759192, + "learning_rate": 4.147333484727484e-05, + "loss": 0.3586, + "step": 2530 + }, + { + "epoch": 2.6982942430703627, + "grad_norm": 0.21721440727421978, + "learning_rate": 4.144358356930686e-05, + "loss": 0.3685, + "step": 2531 + }, + { + "epoch": 2.699360341151386, + "grad_norm": 0.2289561477297396, + "learning_rate": 4.141383149166861e-05, + "loss": 0.3693, + "step": 2532 + }, + { + "epoch": 2.7004264392324093, + "grad_norm": 0.21622654938339614, + "learning_rate": 4.138407863084119e-05, + "loss": 0.3676, + "step": 2533 + }, + { + "epoch": 2.701492537313433, + "grad_norm": 0.20523535800109563, + "learning_rate": 4.1354325003306146e-05, + "loss": 0.369, + "step": 2534 + }, + { + "epoch": 2.7025586353944564, + "grad_norm": 0.24463058005414473, + "learning_rate": 4.132457062554543e-05, + "loss": 0.3677, + "step": 2535 + }, + { + "epoch": 2.70362473347548, + "grad_norm": 0.1851731295878307, + "learning_rate": 4.129481551404143e-05, + "loss": 0.374, + "step": 2536 + }, + { + "epoch": 2.704690831556503, + "grad_norm": 0.19922292111977952, + "learning_rate": 4.1265059685276936e-05, + "loss": 0.3654, + "step": 2537 + }, + { + "epoch": 2.7057569296375266, + "grad_norm": 0.2752693144696535, + "learning_rate": 4.123530315573512e-05, + "loss": 0.3694, + "step": 2538 + }, + { + "epoch": 2.70682302771855, + "grad_norm": 0.23479171085838138, + "learning_rate": 4.120554594189955e-05, + "loss": 0.3734, + "step": 2539 + }, + { + "epoch": 2.7078891257995736, + "grad_norm": 0.1999301041207985, + "learning_rate": 4.117578806025419e-05, + "loss": 0.3674, + "step": 2540 + }, + { + "epoch": 2.708955223880597, + "grad_norm": 0.21844202367376767, + "learning_rate": 4.114602952728335e-05, + "loss": 0.3695, + "step": 2541 + }, + { + "epoch": 2.7100213219616203, + "grad_norm": 0.22666372255765088, + "learning_rate": 4.111627035947171e-05, + "loss": 0.3675, + "step": 2542 + }, + { + "epoch": 2.711087420042644, + "grad_norm": 0.20627199369884083, + "learning_rate": 4.108651057330432e-05, + "loss": 0.375, + "step": 2543 + }, + { + "epoch": 2.7121535181236673, + "grad_norm": 0.17843299855865702, + "learning_rate": 4.1056750185266515e-05, + "loss": 0.3685, + "step": 2544 + }, + { + "epoch": 2.713219616204691, + "grad_norm": 0.2435265879503595, + "learning_rate": 4.102698921184405e-05, + "loss": 0.37, + "step": 2545 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.2682269696770955, + "learning_rate": 4.0997227669522924e-05, + "loss": 0.3607, + "step": 2546 + }, + { + "epoch": 2.7153518123667375, + "grad_norm": 0.23667515131488986, + "learning_rate": 4.096746557478949e-05, + "loss": 0.372, + "step": 2547 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 0.20880475425387418, + "learning_rate": 4.0937702944130426e-05, + "loss": 0.368, + "step": 2548 + }, + { + "epoch": 2.7174840085287846, + "grad_norm": 0.18392143745348416, + "learning_rate": 4.0907939794032654e-05, + "loss": 0.3733, + "step": 2549 + }, + { + "epoch": 2.718550106609808, + "grad_norm": 0.2165652019434693, + "learning_rate": 4.087817614098343e-05, + "loss": 0.3751, + "step": 2550 + }, + { + "epoch": 2.7196162046908317, + "grad_norm": 0.24037086751445425, + "learning_rate": 4.084841200147025e-05, + "loss": 0.3739, + "step": 2551 + }, + { + "epoch": 2.7206823027718547, + "grad_norm": 0.2501714427485925, + "learning_rate": 4.0818647391980926e-05, + "loss": 0.3737, + "step": 2552 + }, + { + "epoch": 2.7217484008528787, + "grad_norm": 0.2344141067398134, + "learning_rate": 4.078888232900349e-05, + "loss": 0.3698, + "step": 2553 + }, + { + "epoch": 2.722814498933902, + "grad_norm": 0.2349463019216503, + "learning_rate": 4.075911682902625e-05, + "loss": 0.3736, + "step": 2554 + }, + { + "epoch": 2.7238805970149254, + "grad_norm": 0.20029237944534903, + "learning_rate": 4.0729350908537724e-05, + "loss": 0.3757, + "step": 2555 + }, + { + "epoch": 2.724946695095949, + "grad_norm": 0.27878488904776033, + "learning_rate": 4.069958458402671e-05, + "loss": 0.3683, + "step": 2556 + }, + { + "epoch": 2.7260127931769724, + "grad_norm": 0.29774631195776696, + "learning_rate": 4.06698178719822e-05, + "loss": 0.3685, + "step": 2557 + }, + { + "epoch": 2.727078891257996, + "grad_norm": 0.2104346445232383, + "learning_rate": 4.0640050788893386e-05, + "loss": 0.3652, + "step": 2558 + }, + { + "epoch": 2.728144989339019, + "grad_norm": 0.2317958367423688, + "learning_rate": 4.0610283351249716e-05, + "loss": 0.3692, + "step": 2559 + }, + { + "epoch": 2.7292110874200426, + "grad_norm": 0.24510244686355384, + "learning_rate": 4.058051557554078e-05, + "loss": 0.3665, + "step": 2560 + }, + { + "epoch": 2.730277185501066, + "grad_norm": 0.19542448243466262, + "learning_rate": 4.0550747478256384e-05, + "loss": 0.3648, + "step": 2561 + }, + { + "epoch": 2.7313432835820897, + "grad_norm": 0.2389772106956452, + "learning_rate": 4.052097907588652e-05, + "loss": 0.3703, + "step": 2562 + }, + { + "epoch": 2.732409381663113, + "grad_norm": 0.3107936261643165, + "learning_rate": 4.049121038492131e-05, + "loss": 0.3788, + "step": 2563 + }, + { + "epoch": 2.7334754797441363, + "grad_norm": 0.2919215370644085, + "learning_rate": 4.0461441421851075e-05, + "loss": 0.3606, + "step": 2564 + }, + { + "epoch": 2.73454157782516, + "grad_norm": 0.24187272238294624, + "learning_rate": 4.043167220316628e-05, + "loss": 0.3758, + "step": 2565 + }, + { + "epoch": 2.7356076759061834, + "grad_norm": 0.24598827183680877, + "learning_rate": 4.040190274535752e-05, + "loss": 0.3715, + "step": 2566 + }, + { + "epoch": 2.736673773987207, + "grad_norm": 0.21275237895976437, + "learning_rate": 4.037213306491552e-05, + "loss": 0.3739, + "step": 2567 + }, + { + "epoch": 2.7377398720682304, + "grad_norm": 0.25491281948796235, + "learning_rate": 4.0342363178331146e-05, + "loss": 0.3671, + "step": 2568 + }, + { + "epoch": 2.7388059701492535, + "grad_norm": 0.2833091097168945, + "learning_rate": 4.031259310209536e-05, + "loss": 0.371, + "step": 2569 + }, + { + "epoch": 2.739872068230277, + "grad_norm": 0.18872311871080574, + "learning_rate": 4.028282285269925e-05, + "loss": 0.3759, + "step": 2570 + }, + { + "epoch": 2.7409381663113006, + "grad_norm": 0.19447861987862541, + "learning_rate": 4.0253052446633966e-05, + "loss": 0.3713, + "step": 2571 + }, + { + "epoch": 2.742004264392324, + "grad_norm": 0.20648269121909482, + "learning_rate": 4.022328190039079e-05, + "loss": 0.3647, + "step": 2572 + }, + { + "epoch": 2.7430703624733477, + "grad_norm": 0.1961509547861183, + "learning_rate": 4.019351123046104e-05, + "loss": 0.3594, + "step": 2573 + }, + { + "epoch": 2.7441364605543708, + "grad_norm": 0.21126880262710795, + "learning_rate": 4.0163740453336125e-05, + "loss": 0.3675, + "step": 2574 + }, + { + "epoch": 2.7452025586353943, + "grad_norm": 0.20602815602021118, + "learning_rate": 4.0133969585507514e-05, + "loss": 0.3674, + "step": 2575 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 0.21819379544686981, + "learning_rate": 4.010419864346671e-05, + "loss": 0.3759, + "step": 2576 + }, + { + "epoch": 2.7473347547974414, + "grad_norm": 0.2076650890871053, + "learning_rate": 4.0074427643705274e-05, + "loss": 0.3676, + "step": 2577 + }, + { + "epoch": 2.748400852878465, + "grad_norm": 0.2017797586755664, + "learning_rate": 4.004465660271479e-05, + "loss": 0.3685, + "step": 2578 + }, + { + "epoch": 2.749466950959488, + "grad_norm": 0.2744626448821499, + "learning_rate": 4.001488553698687e-05, + "loss": 0.3691, + "step": 2579 + }, + { + "epoch": 2.750533049040512, + "grad_norm": 0.22575769767487924, + "learning_rate": 3.998511446301315e-05, + "loss": 0.3713, + "step": 2580 + }, + { + "epoch": 2.751599147121535, + "grad_norm": 0.18090826243386235, + "learning_rate": 3.995534339728522e-05, + "loss": 0.3643, + "step": 2581 + }, + { + "epoch": 2.7526652452025586, + "grad_norm": 0.1810203385011346, + "learning_rate": 3.992557235629473e-05, + "loss": 0.3679, + "step": 2582 + }, + { + "epoch": 2.753731343283582, + "grad_norm": 0.2004645021897271, + "learning_rate": 3.989580135653329e-05, + "loss": 0.3623, + "step": 2583 + }, + { + "epoch": 2.7547974413646057, + "grad_norm": 0.20665259172252912, + "learning_rate": 3.98660304144925e-05, + "loss": 0.3709, + "step": 2584 + }, + { + "epoch": 2.7558635394456292, + "grad_norm": 0.21652327508884206, + "learning_rate": 3.983625954666389e-05, + "loss": 0.3638, + "step": 2585 + }, + { + "epoch": 2.7569296375266523, + "grad_norm": 0.1905584143867772, + "learning_rate": 3.9806488769538966e-05, + "loss": 0.3698, + "step": 2586 + }, + { + "epoch": 2.757995735607676, + "grad_norm": 0.2122311206240018, + "learning_rate": 3.977671809960923e-05, + "loss": 0.3671, + "step": 2587 + }, + { + "epoch": 2.7590618336886994, + "grad_norm": 0.29171836988191924, + "learning_rate": 3.974694755336604e-05, + "loss": 0.3693, + "step": 2588 + }, + { + "epoch": 2.760127931769723, + "grad_norm": 0.27181002702035517, + "learning_rate": 3.971717714730076e-05, + "loss": 0.3665, + "step": 2589 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.21014364337275343, + "learning_rate": 3.968740689790464e-05, + "loss": 0.3729, + "step": 2590 + }, + { + "epoch": 2.7622601279317696, + "grad_norm": 0.2526245805801581, + "learning_rate": 3.9657636821668874e-05, + "loss": 0.3713, + "step": 2591 + }, + { + "epoch": 2.763326226012793, + "grad_norm": 0.25768688989060495, + "learning_rate": 3.9627866935084496e-05, + "loss": 0.3689, + "step": 2592 + }, + { + "epoch": 2.7643923240938166, + "grad_norm": 0.2526926794684304, + "learning_rate": 3.959809725464249e-05, + "loss": 0.369, + "step": 2593 + }, + { + "epoch": 2.76545842217484, + "grad_norm": 0.28824876900515534, + "learning_rate": 3.956832779683374e-05, + "loss": 0.3716, + "step": 2594 + }, + { + "epoch": 2.7665245202558637, + "grad_norm": 0.26584418015578326, + "learning_rate": 3.953855857814894e-05, + "loss": 0.3703, + "step": 2595 + }, + { + "epoch": 2.767590618336887, + "grad_norm": 0.23043100400918048, + "learning_rate": 3.950878961507871e-05, + "loss": 0.3652, + "step": 2596 + }, + { + "epoch": 2.7686567164179103, + "grad_norm": 0.2502510778112517, + "learning_rate": 3.94790209241135e-05, + "loss": 0.3672, + "step": 2597 + }, + { + "epoch": 2.769722814498934, + "grad_norm": 0.2639439073598543, + "learning_rate": 3.944925252174363e-05, + "loss": 0.3788, + "step": 2598 + }, + { + "epoch": 2.7707889125799574, + "grad_norm": 0.2841116000042142, + "learning_rate": 3.9419484424459235e-05, + "loss": 0.3727, + "step": 2599 + }, + { + "epoch": 2.771855010660981, + "grad_norm": 0.2735556859075221, + "learning_rate": 3.938971664875029e-05, + "loss": 0.3717, + "step": 2600 + }, + { + "epoch": 2.772921108742004, + "grad_norm": 0.28637597882164606, + "learning_rate": 3.935994921110661e-05, + "loss": 0.3688, + "step": 2601 + }, + { + "epoch": 2.7739872068230276, + "grad_norm": 0.265199180250517, + "learning_rate": 3.933018212801782e-05, + "loss": 0.3756, + "step": 2602 + }, + { + "epoch": 2.775053304904051, + "grad_norm": 0.24641376266091428, + "learning_rate": 3.9300415415973295e-05, + "loss": 0.3694, + "step": 2603 + }, + { + "epoch": 2.7761194029850746, + "grad_norm": 0.2029121854284716, + "learning_rate": 3.9270649091462276e-05, + "loss": 0.3619, + "step": 2604 + }, + { + "epoch": 2.777185501066098, + "grad_norm": 0.31971921589949576, + "learning_rate": 3.9240883170973776e-05, + "loss": 0.3664, + "step": 2605 + }, + { + "epoch": 2.7782515991471213, + "grad_norm": 0.34026995782754504, + "learning_rate": 3.9211117670996524e-05, + "loss": 0.3709, + "step": 2606 + }, + { + "epoch": 2.7793176972281453, + "grad_norm": 0.31436146070519905, + "learning_rate": 3.918135260801908e-05, + "loss": 0.3697, + "step": 2607 + }, + { + "epoch": 2.7803837953091683, + "grad_norm": 0.30334671667064705, + "learning_rate": 3.9151587998529754e-05, + "loss": 0.3694, + "step": 2608 + }, + { + "epoch": 2.781449893390192, + "grad_norm": 0.26731009437526165, + "learning_rate": 3.912182385901659e-05, + "loss": 0.368, + "step": 2609 + }, + { + "epoch": 2.7825159914712154, + "grad_norm": 0.27147115708275976, + "learning_rate": 3.909206020596736e-05, + "loss": 0.3626, + "step": 2610 + }, + { + "epoch": 2.783582089552239, + "grad_norm": 0.32889706955527953, + "learning_rate": 3.906229705586959e-05, + "loss": 0.3696, + "step": 2611 + }, + { + "epoch": 2.7846481876332625, + "grad_norm": 0.3499202779561899, + "learning_rate": 3.903253442521051e-05, + "loss": 0.368, + "step": 2612 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.2870452604692784, + "learning_rate": 3.9002772330477096e-05, + "loss": 0.3678, + "step": 2613 + }, + { + "epoch": 2.786780383795309, + "grad_norm": 0.2959372810749118, + "learning_rate": 3.897301078815597e-05, + "loss": 0.3666, + "step": 2614 + }, + { + "epoch": 2.7878464818763327, + "grad_norm": 0.251783830117275, + "learning_rate": 3.894324981473349e-05, + "loss": 0.369, + "step": 2615 + }, + { + "epoch": 2.788912579957356, + "grad_norm": 0.20368390077877208, + "learning_rate": 3.891348942669571e-05, + "loss": 0.3659, + "step": 2616 + }, + { + "epoch": 2.7899786780383797, + "grad_norm": 0.3014830515949927, + "learning_rate": 3.88837296405283e-05, + "loss": 0.3645, + "step": 2617 + }, + { + "epoch": 2.791044776119403, + "grad_norm": 0.26718545932498405, + "learning_rate": 3.8853970472716656e-05, + "loss": 0.365, + "step": 2618 + }, + { + "epoch": 2.7921108742004264, + "grad_norm": 0.24899810548394205, + "learning_rate": 3.882421193974581e-05, + "loss": 0.369, + "step": 2619 + }, + { + "epoch": 2.79317697228145, + "grad_norm": 0.2285678956121162, + "learning_rate": 3.879445405810047e-05, + "loss": 0.3654, + "step": 2620 + }, + { + "epoch": 2.7942430703624734, + "grad_norm": 0.25841393123219936, + "learning_rate": 3.876469684426489e-05, + "loss": 0.3705, + "step": 2621 + }, + { + "epoch": 2.795309168443497, + "grad_norm": 0.21957675621749626, + "learning_rate": 3.873494031472307e-05, + "loss": 0.3706, + "step": 2622 + }, + { + "epoch": 2.79637526652452, + "grad_norm": 0.20277378161854126, + "learning_rate": 3.870518448595858e-05, + "loss": 0.3708, + "step": 2623 + }, + { + "epoch": 2.7974413646055436, + "grad_norm": 0.19507078530612013, + "learning_rate": 3.867542937445458e-05, + "loss": 0.3695, + "step": 2624 + }, + { + "epoch": 2.798507462686567, + "grad_norm": 0.22778058725192096, + "learning_rate": 3.864567499669387e-05, + "loss": 0.3704, + "step": 2625 + }, + { + "epoch": 2.7995735607675907, + "grad_norm": 0.2139823499648018, + "learning_rate": 3.861592136915881e-05, + "loss": 0.368, + "step": 2626 + }, + { + "epoch": 2.800639658848614, + "grad_norm": 0.2755772492578195, + "learning_rate": 3.858616850833141e-05, + "loss": 0.3744, + "step": 2627 + }, + { + "epoch": 2.8017057569296373, + "grad_norm": 0.22789410651650585, + "learning_rate": 3.855641643069316e-05, + "loss": 0.3691, + "step": 2628 + }, + { + "epoch": 2.802771855010661, + "grad_norm": 0.19420102545723644, + "learning_rate": 3.852666515272517e-05, + "loss": 0.3701, + "step": 2629 + }, + { + "epoch": 2.8038379530916844, + "grad_norm": 0.22416479381704973, + "learning_rate": 3.849691469090814e-05, + "loss": 0.3657, + "step": 2630 + }, + { + "epoch": 2.804904051172708, + "grad_norm": 0.2226881113118721, + "learning_rate": 3.846716506172224e-05, + "loss": 0.3712, + "step": 2631 + }, + { + "epoch": 2.8059701492537314, + "grad_norm": 0.1957187061751904, + "learning_rate": 3.8437416281647226e-05, + "loss": 0.3686, + "step": 2632 + }, + { + "epoch": 2.8070362473347545, + "grad_norm": 0.196407100670983, + "learning_rate": 3.8407668367162397e-05, + "loss": 0.3638, + "step": 2633 + }, + { + "epoch": 2.8081023454157785, + "grad_norm": 0.1860094576496804, + "learning_rate": 3.837792133474653e-05, + "loss": 0.364, + "step": 2634 + }, + { + "epoch": 2.8091684434968016, + "grad_norm": 0.1954370294920811, + "learning_rate": 3.8348175200877934e-05, + "loss": 0.3699, + "step": 2635 + }, + { + "epoch": 2.810234541577825, + "grad_norm": 0.2202007081382702, + "learning_rate": 3.831842998203444e-05, + "loss": 0.367, + "step": 2636 + }, + { + "epoch": 2.8113006396588487, + "grad_norm": 0.18691426984371892, + "learning_rate": 3.828868569469333e-05, + "loss": 0.3675, + "step": 2637 + }, + { + "epoch": 2.8123667377398722, + "grad_norm": 0.16936594094488316, + "learning_rate": 3.825894235533143e-05, + "loss": 0.3606, + "step": 2638 + }, + { + "epoch": 2.8134328358208958, + "grad_norm": 0.2188119066765755, + "learning_rate": 3.8229199980424957e-05, + "loss": 0.371, + "step": 2639 + }, + { + "epoch": 2.814498933901919, + "grad_norm": 0.19720285663307738, + "learning_rate": 3.819945858644966e-05, + "loss": 0.3647, + "step": 2640 + }, + { + "epoch": 2.8155650319829424, + "grad_norm": 0.2081234890857164, + "learning_rate": 3.8169718189880735e-05, + "loss": 0.3663, + "step": 2641 + }, + { + "epoch": 2.816631130063966, + "grad_norm": 0.1684341668760121, + "learning_rate": 3.8139978807192824e-05, + "loss": 0.367, + "step": 2642 + }, + { + "epoch": 2.8176972281449895, + "grad_norm": 0.1883701323467958, + "learning_rate": 3.8110240454859975e-05, + "loss": 0.3665, + "step": 2643 + }, + { + "epoch": 2.818763326226013, + "grad_norm": 0.18509131366924267, + "learning_rate": 3.808050314935569e-05, + "loss": 0.3658, + "step": 2644 + }, + { + "epoch": 2.819829424307036, + "grad_norm": 0.18175683445064622, + "learning_rate": 3.8050766907152925e-05, + "loss": 0.3685, + "step": 2645 + }, + { + "epoch": 2.8208955223880596, + "grad_norm": 0.1749386564896647, + "learning_rate": 3.802103174472397e-05, + "loss": 0.364, + "step": 2646 + }, + { + "epoch": 2.821961620469083, + "grad_norm": 0.1877228031849092, + "learning_rate": 3.799129767854058e-05, + "loss": 0.3659, + "step": 2647 + }, + { + "epoch": 2.8230277185501067, + "grad_norm": 0.23830061584479004, + "learning_rate": 3.796156472507388e-05, + "loss": 0.3681, + "step": 2648 + }, + { + "epoch": 2.8240938166311302, + "grad_norm": 0.20524827433645737, + "learning_rate": 3.79318329007944e-05, + "loss": 0.3688, + "step": 2649 + }, + { + "epoch": 2.8251599147121533, + "grad_norm": 0.20637998521159778, + "learning_rate": 3.790210222217199e-05, + "loss": 0.3704, + "step": 2650 + }, + { + "epoch": 2.826226012793177, + "grad_norm": 0.18452122451122424, + "learning_rate": 3.787237270567591e-05, + "loss": 0.3687, + "step": 2651 + }, + { + "epoch": 2.8272921108742004, + "grad_norm": 0.2102244859465425, + "learning_rate": 3.7842644367774804e-05, + "loss": 0.3719, + "step": 2652 + }, + { + "epoch": 2.828358208955224, + "grad_norm": 0.17836976761573398, + "learning_rate": 3.781291722493657e-05, + "loss": 0.3701, + "step": 2653 + }, + { + "epoch": 2.8294243070362475, + "grad_norm": 0.20742348576542263, + "learning_rate": 3.7783191293628535e-05, + "loss": 0.3703, + "step": 2654 + }, + { + "epoch": 2.8304904051172706, + "grad_norm": 0.19693977146930702, + "learning_rate": 3.775346659031731e-05, + "loss": 0.3684, + "step": 2655 + }, + { + "epoch": 2.831556503198294, + "grad_norm": 0.2143540021763547, + "learning_rate": 3.7723743131468855e-05, + "loss": 0.3677, + "step": 2656 + }, + { + "epoch": 2.8326226012793176, + "grad_norm": 0.2456330609899814, + "learning_rate": 3.769402093354838e-05, + "loss": 0.3673, + "step": 2657 + }, + { + "epoch": 2.833688699360341, + "grad_norm": 0.2504165053266609, + "learning_rate": 3.766430001302047e-05, + "loss": 0.3684, + "step": 2658 + }, + { + "epoch": 2.8347547974413647, + "grad_norm": 0.20402042897097772, + "learning_rate": 3.7634580386348965e-05, + "loss": 0.3686, + "step": 2659 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.2261020174040382, + "learning_rate": 3.7604862069997006e-05, + "loss": 0.3646, + "step": 2660 + }, + { + "epoch": 2.836886993603412, + "grad_norm": 0.18453288873034465, + "learning_rate": 3.7575145080426966e-05, + "loss": 0.3645, + "step": 2661 + }, + { + "epoch": 2.837953091684435, + "grad_norm": 0.22623379991968529, + "learning_rate": 3.7545429434100524e-05, + "loss": 0.3717, + "step": 2662 + }, + { + "epoch": 2.8390191897654584, + "grad_norm": 0.2781308273184132, + "learning_rate": 3.751571514747863e-05, + "loss": 0.3727, + "step": 2663 + }, + { + "epoch": 2.840085287846482, + "grad_norm": 0.2095156006824654, + "learning_rate": 3.748600223702141e-05, + "loss": 0.3709, + "step": 2664 + }, + { + "epoch": 2.8411513859275055, + "grad_norm": 0.19821375447623568, + "learning_rate": 3.7456290719188294e-05, + "loss": 0.3716, + "step": 2665 + }, + { + "epoch": 2.842217484008529, + "grad_norm": 0.19589404239503924, + "learning_rate": 3.742658061043793e-05, + "loss": 0.3686, + "step": 2666 + }, + { + "epoch": 2.843283582089552, + "grad_norm": 0.2663508610327882, + "learning_rate": 3.7396871927228165e-05, + "loss": 0.3699, + "step": 2667 + }, + { + "epoch": 2.8443496801705757, + "grad_norm": 0.22076482818199494, + "learning_rate": 3.7367164686016055e-05, + "loss": 0.3738, + "step": 2668 + }, + { + "epoch": 2.845415778251599, + "grad_norm": 0.15927170037082947, + "learning_rate": 3.733745890325788e-05, + "loss": 0.3658, + "step": 2669 + }, + { + "epoch": 2.8464818763326227, + "grad_norm": 0.20652944107978283, + "learning_rate": 3.7307754595409094e-05, + "loss": 0.3637, + "step": 2670 + }, + { + "epoch": 2.8475479744136463, + "grad_norm": 0.24357837304067567, + "learning_rate": 3.727805177892435e-05, + "loss": 0.3693, + "step": 2671 + }, + { + "epoch": 2.8486140724946694, + "grad_norm": 0.245995883074642, + "learning_rate": 3.7248350470257456e-05, + "loss": 0.3672, + "step": 2672 + }, + { + "epoch": 2.849680170575693, + "grad_norm": 0.19478885206841665, + "learning_rate": 3.721865068586138e-05, + "loss": 0.37, + "step": 2673 + }, + { + "epoch": 2.8507462686567164, + "grad_norm": 0.2530400593412467, + "learning_rate": 3.71889524421883e-05, + "loss": 0.3649, + "step": 2674 + }, + { + "epoch": 2.85181236673774, + "grad_norm": 0.2694796185289994, + "learning_rate": 3.715925575568945e-05, + "loss": 0.3674, + "step": 2675 + }, + { + "epoch": 2.8528784648187635, + "grad_norm": 0.2246054293784152, + "learning_rate": 3.712956064281527e-05, + "loss": 0.3628, + "step": 2676 + }, + { + "epoch": 2.8539445628997866, + "grad_norm": 0.204940141551566, + "learning_rate": 3.7099867120015316e-05, + "loss": 0.3711, + "step": 2677 + }, + { + "epoch": 2.85501066098081, + "grad_norm": 0.2634467161792027, + "learning_rate": 3.707017520373827e-05, + "loss": 0.3711, + "step": 2678 + }, + { + "epoch": 2.8560767590618337, + "grad_norm": 0.2790273311361743, + "learning_rate": 3.7040484910431874e-05, + "loss": 0.3692, + "step": 2679 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.1836831295255749, + "learning_rate": 3.7010796256543034e-05, + "loss": 0.3698, + "step": 2680 + }, + { + "epoch": 2.8582089552238807, + "grad_norm": 0.2364870469211339, + "learning_rate": 3.6981109258517724e-05, + "loss": 0.3703, + "step": 2681 + }, + { + "epoch": 2.859275053304904, + "grad_norm": 0.27041513703480374, + "learning_rate": 3.695142393280098e-05, + "loss": 0.3665, + "step": 2682 + }, + { + "epoch": 2.8603411513859274, + "grad_norm": 0.20251757763705422, + "learning_rate": 3.692174029583693e-05, + "loss": 0.3744, + "step": 2683 + }, + { + "epoch": 2.861407249466951, + "grad_norm": 0.23004902486067985, + "learning_rate": 3.689205836406878e-05, + "loss": 0.3666, + "step": 2684 + }, + { + "epoch": 2.8624733475479744, + "grad_norm": 0.210050309914908, + "learning_rate": 3.686237815393878e-05, + "loss": 0.3726, + "step": 2685 + }, + { + "epoch": 2.863539445628998, + "grad_norm": 0.2447284603799976, + "learning_rate": 3.683269968188821e-05, + "loss": 0.3665, + "step": 2686 + }, + { + "epoch": 2.864605543710021, + "grad_norm": 0.1878574669176508, + "learning_rate": 3.68030229643574e-05, + "loss": 0.3739, + "step": 2687 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 0.20854764527126085, + "learning_rate": 3.6773348017785714e-05, + "loss": 0.3671, + "step": 2688 + }, + { + "epoch": 2.866737739872068, + "grad_norm": 0.20789517420801748, + "learning_rate": 3.6743674858611556e-05, + "loss": 0.3767, + "step": 2689 + }, + { + "epoch": 2.8678038379530917, + "grad_norm": 0.17804786727873217, + "learning_rate": 3.6714003503272265e-05, + "loss": 0.3627, + "step": 2690 + }, + { + "epoch": 2.868869936034115, + "grad_norm": 0.1808758602651246, + "learning_rate": 3.668433396820426e-05, + "loss": 0.3702, + "step": 2691 + }, + { + "epoch": 2.8699360341151388, + "grad_norm": 0.23463363469678927, + "learning_rate": 3.665466626984294e-05, + "loss": 0.3703, + "step": 2692 + }, + { + "epoch": 2.8710021321961623, + "grad_norm": 0.246472560729465, + "learning_rate": 3.662500042462262e-05, + "loss": 0.361, + "step": 2693 + }, + { + "epoch": 2.8720682302771854, + "grad_norm": 0.20564287257852504, + "learning_rate": 3.6595336448976655e-05, + "loss": 0.3701, + "step": 2694 + }, + { + "epoch": 2.873134328358209, + "grad_norm": 0.2106812492583941, + "learning_rate": 3.656567435933736e-05, + "loss": 0.3628, + "step": 2695 + }, + { + "epoch": 2.8742004264392325, + "grad_norm": 0.20453409365105865, + "learning_rate": 3.6536014172135996e-05, + "loss": 0.3649, + "step": 2696 + }, + { + "epoch": 2.875266524520256, + "grad_norm": 0.20646291714378256, + "learning_rate": 3.650635590380274e-05, + "loss": 0.3666, + "step": 2697 + }, + { + "epoch": 2.8763326226012795, + "grad_norm": 0.1836576189111925, + "learning_rate": 3.647669957076674e-05, + "loss": 0.3686, + "step": 2698 + }, + { + "epoch": 2.8773987206823026, + "grad_norm": 0.21929175594887504, + "learning_rate": 3.644704518945607e-05, + "loss": 0.3747, + "step": 2699 + }, + { + "epoch": 2.878464818763326, + "grad_norm": 0.18711665960997673, + "learning_rate": 3.641739277629772e-05, + "loss": 0.3641, + "step": 2700 + }, + { + "epoch": 2.8795309168443497, + "grad_norm": 0.17522013442035012, + "learning_rate": 3.638774234771757e-05, + "loss": 0.3799, + "step": 2701 + }, + { + "epoch": 2.8805970149253732, + "grad_norm": 0.17988902594802456, + "learning_rate": 3.635809392014042e-05, + "loss": 0.3703, + "step": 2702 + }, + { + "epoch": 2.8816631130063968, + "grad_norm": 0.17377049259647168, + "learning_rate": 3.632844750998998e-05, + "loss": 0.3755, + "step": 2703 + }, + { + "epoch": 2.88272921108742, + "grad_norm": 0.19092248556619265, + "learning_rate": 3.629880313368879e-05, + "loss": 0.3632, + "step": 2704 + }, + { + "epoch": 2.8837953091684434, + "grad_norm": 0.18451802560202984, + "learning_rate": 3.6269160807658315e-05, + "loss": 0.3685, + "step": 2705 + }, + { + "epoch": 2.884861407249467, + "grad_norm": 0.16498456390879312, + "learning_rate": 3.6239520548318836e-05, + "loss": 0.3645, + "step": 2706 + }, + { + "epoch": 2.8859275053304905, + "grad_norm": 0.19518509163645606, + "learning_rate": 3.620988237208956e-05, + "loss": 0.3779, + "step": 2707 + }, + { + "epoch": 2.886993603411514, + "grad_norm": 0.2093799236223249, + "learning_rate": 3.6180246295388465e-05, + "loss": 0.3683, + "step": 2708 + }, + { + "epoch": 2.888059701492537, + "grad_norm": 0.18513792614540311, + "learning_rate": 3.615061233463239e-05, + "loss": 0.3673, + "step": 2709 + }, + { + "epoch": 2.8891257995735606, + "grad_norm": 0.18716040338636258, + "learning_rate": 3.612098050623705e-05, + "loss": 0.3648, + "step": 2710 + }, + { + "epoch": 2.890191897654584, + "grad_norm": 0.1570679754787949, + "learning_rate": 3.6091350826616886e-05, + "loss": 0.3602, + "step": 2711 + }, + { + "epoch": 2.8912579957356077, + "grad_norm": 0.1646466111643218, + "learning_rate": 3.606172331218523e-05, + "loss": 0.3715, + "step": 2712 + }, + { + "epoch": 2.8923240938166312, + "grad_norm": 0.19255818967368557, + "learning_rate": 3.603209797935418e-05, + "loss": 0.3643, + "step": 2713 + }, + { + "epoch": 2.8933901918976543, + "grad_norm": 0.23514975868653648, + "learning_rate": 3.600247484453465e-05, + "loss": 0.3767, + "step": 2714 + }, + { + "epoch": 2.894456289978678, + "grad_norm": 0.1715556714292468, + "learning_rate": 3.597285392413628e-05, + "loss": 0.3653, + "step": 2715 + }, + { + "epoch": 2.8955223880597014, + "grad_norm": 0.19278886011586072, + "learning_rate": 3.5943235234567534e-05, + "loss": 0.3688, + "step": 2716 + }, + { + "epoch": 2.896588486140725, + "grad_norm": 0.20799089269786109, + "learning_rate": 3.591361879223564e-05, + "loss": 0.3684, + "step": 2717 + }, + { + "epoch": 2.8976545842217485, + "grad_norm": 0.18186928796074078, + "learning_rate": 3.588400461354657e-05, + "loss": 0.3641, + "step": 2718 + }, + { + "epoch": 2.8987206823027716, + "grad_norm": 0.2508899499795832, + "learning_rate": 3.5854392714905015e-05, + "loss": 0.3701, + "step": 2719 + }, + { + "epoch": 2.8997867803837956, + "grad_norm": 0.24536404303538248, + "learning_rate": 3.582478311271445e-05, + "loss": 0.3684, + "step": 2720 + }, + { + "epoch": 2.9008528784648187, + "grad_norm": 0.21642088139256604, + "learning_rate": 3.579517582337705e-05, + "loss": 0.3717, + "step": 2721 + }, + { + "epoch": 2.901918976545842, + "grad_norm": 0.24159904518479744, + "learning_rate": 3.57655708632937e-05, + "loss": 0.3698, + "step": 2722 + }, + { + "epoch": 2.9029850746268657, + "grad_norm": 0.20752353712283125, + "learning_rate": 3.573596824886403e-05, + "loss": 0.3687, + "step": 2723 + }, + { + "epoch": 2.9040511727078893, + "grad_norm": 0.23067889172808448, + "learning_rate": 3.570636799648634e-05, + "loss": 0.3699, + "step": 2724 + }, + { + "epoch": 2.905117270788913, + "grad_norm": 0.20818039787632228, + "learning_rate": 3.5676770122557646e-05, + "loss": 0.3684, + "step": 2725 + }, + { + "epoch": 2.906183368869936, + "grad_norm": 0.21491825417478544, + "learning_rate": 3.5647174643473605e-05, + "loss": 0.369, + "step": 2726 + }, + { + "epoch": 2.9072494669509594, + "grad_norm": 0.19045430642939612, + "learning_rate": 3.5617581575628586e-05, + "loss": 0.3727, + "step": 2727 + }, + { + "epoch": 2.908315565031983, + "grad_norm": 0.2022812002014471, + "learning_rate": 3.5587990935415616e-05, + "loss": 0.3632, + "step": 2728 + }, + { + "epoch": 2.9093816631130065, + "grad_norm": 0.23520827934411384, + "learning_rate": 3.555840273922638e-05, + "loss": 0.3698, + "step": 2729 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.22589615939607646, + "learning_rate": 3.552881700345118e-05, + "loss": 0.3674, + "step": 2730 + }, + { + "epoch": 2.911513859275053, + "grad_norm": 0.17879660376241868, + "learning_rate": 3.5499233744478986e-05, + "loss": 0.3711, + "step": 2731 + }, + { + "epoch": 2.9125799573560767, + "grad_norm": 0.1701981830839262, + "learning_rate": 3.54696529786974e-05, + "loss": 0.373, + "step": 2732 + }, + { + "epoch": 2.9136460554371, + "grad_norm": 0.17388147057917147, + "learning_rate": 3.54400747224926e-05, + "loss": 0.3694, + "step": 2733 + }, + { + "epoch": 2.9147121535181237, + "grad_norm": 0.198912284935368, + "learning_rate": 3.541049899224941e-05, + "loss": 0.3638, + "step": 2734 + }, + { + "epoch": 2.9157782515991473, + "grad_norm": 0.1991247673513127, + "learning_rate": 3.538092580435127e-05, + "loss": 0.3689, + "step": 2735 + }, + { + "epoch": 2.9168443496801704, + "grad_norm": 0.21753207227603405, + "learning_rate": 3.5351355175180175e-05, + "loss": 0.3677, + "step": 2736 + }, + { + "epoch": 2.917910447761194, + "grad_norm": 0.19550372191277038, + "learning_rate": 3.53217871211167e-05, + "loss": 0.3635, + "step": 2737 + }, + { + "epoch": 2.9189765458422174, + "grad_norm": 0.18563885663326216, + "learning_rate": 3.529222165854005e-05, + "loss": 0.3649, + "step": 2738 + }, + { + "epoch": 2.920042643923241, + "grad_norm": 0.22917352839550473, + "learning_rate": 3.526265880382791e-05, + "loss": 0.3781, + "step": 2739 + }, + { + "epoch": 2.9211087420042645, + "grad_norm": 0.1757204517843413, + "learning_rate": 3.523309857335659e-05, + "loss": 0.3714, + "step": 2740 + }, + { + "epoch": 2.9221748400852876, + "grad_norm": 0.19335871653526074, + "learning_rate": 3.5203540983500925e-05, + "loss": 0.364, + "step": 2741 + }, + { + "epoch": 2.923240938166311, + "grad_norm": 0.2516292353731809, + "learning_rate": 3.517398605063426e-05, + "loss": 0.3645, + "step": 2742 + }, + { + "epoch": 2.9243070362473347, + "grad_norm": 0.20904716360457082, + "learning_rate": 3.514443379112853e-05, + "loss": 0.3732, + "step": 2743 + }, + { + "epoch": 2.925373134328358, + "grad_norm": 0.22653010522456504, + "learning_rate": 3.511488422135412e-05, + "loss": 0.3732, + "step": 2744 + }, + { + "epoch": 2.9264392324093818, + "grad_norm": 0.21041114421861964, + "learning_rate": 3.508533735767995e-05, + "loss": 0.3677, + "step": 2745 + }, + { + "epoch": 2.927505330490405, + "grad_norm": 0.19966036046378136, + "learning_rate": 3.505579321647347e-05, + "loss": 0.3655, + "step": 2746 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.1739262049297657, + "learning_rate": 3.5026251814100604e-05, + "loss": 0.3688, + "step": 2747 + }, + { + "epoch": 2.929637526652452, + "grad_norm": 0.18367778908948013, + "learning_rate": 3.4996713166925724e-05, + "loss": 0.3673, + "step": 2748 + }, + { + "epoch": 2.9307036247334755, + "grad_norm": 0.2188532499554241, + "learning_rate": 3.496717729131172e-05, + "loss": 0.3669, + "step": 2749 + }, + { + "epoch": 2.931769722814499, + "grad_norm": 0.20187002981612032, + "learning_rate": 3.493764420361995e-05, + "loss": 0.3705, + "step": 2750 + }, + { + "epoch": 2.9328358208955225, + "grad_norm": 0.16322026861126518, + "learning_rate": 3.490811392021018e-05, + "loss": 0.3611, + "step": 2751 + }, + { + "epoch": 2.933901918976546, + "grad_norm": 0.19108916099257472, + "learning_rate": 3.4878586457440655e-05, + "loss": 0.3699, + "step": 2752 + }, + { + "epoch": 2.934968017057569, + "grad_norm": 0.21458163159618987, + "learning_rate": 3.484906183166807e-05, + "loss": 0.3687, + "step": 2753 + }, + { + "epoch": 2.9360341151385927, + "grad_norm": 0.1904908632261195, + "learning_rate": 3.481954005924755e-05, + "loss": 0.3657, + "step": 2754 + }, + { + "epoch": 2.9371002132196162, + "grad_norm": 0.18685652839997016, + "learning_rate": 3.4790021156532585e-05, + "loss": 0.3688, + "step": 2755 + }, + { + "epoch": 2.9381663113006398, + "grad_norm": 0.19812866208833532, + "learning_rate": 3.476050513987512e-05, + "loss": 0.3665, + "step": 2756 + }, + { + "epoch": 2.9392324093816633, + "grad_norm": 0.19248170405628037, + "learning_rate": 3.4730992025625506e-05, + "loss": 0.3705, + "step": 2757 + }, + { + "epoch": 2.9402985074626864, + "grad_norm": 0.17211944249351815, + "learning_rate": 3.4701481830132486e-05, + "loss": 0.3644, + "step": 2758 + }, + { + "epoch": 2.94136460554371, + "grad_norm": 0.1964548683909922, + "learning_rate": 3.467197456974315e-05, + "loss": 0.3652, + "step": 2759 + }, + { + "epoch": 2.9424307036247335, + "grad_norm": 0.20371355156065427, + "learning_rate": 3.4642470260802986e-05, + "loss": 0.3749, + "step": 2760 + }, + { + "epoch": 2.943496801705757, + "grad_norm": 0.1469689938993747, + "learning_rate": 3.4612968919655886e-05, + "loss": 0.3682, + "step": 2761 + }, + { + "epoch": 2.9445628997867805, + "grad_norm": 0.1964620626188304, + "learning_rate": 3.458347056264401e-05, + "loss": 0.3726, + "step": 2762 + }, + { + "epoch": 2.9456289978678036, + "grad_norm": 0.18358666019195752, + "learning_rate": 3.4553975206107944e-05, + "loss": 0.3668, + "step": 2763 + }, + { + "epoch": 2.946695095948827, + "grad_norm": 0.16170424306429584, + "learning_rate": 3.452448286638657e-05, + "loss": 0.3735, + "step": 2764 + }, + { + "epoch": 2.9477611940298507, + "grad_norm": 0.18591540468001805, + "learning_rate": 3.4494993559817134e-05, + "loss": 0.37, + "step": 2765 + }, + { + "epoch": 2.9488272921108742, + "grad_norm": 0.22260097019813457, + "learning_rate": 3.446550730273514e-05, + "loss": 0.3656, + "step": 2766 + }, + { + "epoch": 2.949893390191898, + "grad_norm": 0.2482136891258535, + "learning_rate": 3.4436024111474475e-05, + "loss": 0.3683, + "step": 2767 + }, + { + "epoch": 2.950959488272921, + "grad_norm": 0.19730939109015577, + "learning_rate": 3.440654400236729e-05, + "loss": 0.378, + "step": 2768 + }, + { + "epoch": 2.9520255863539444, + "grad_norm": 0.20731684739497086, + "learning_rate": 3.437706699174402e-05, + "loss": 0.364, + "step": 2769 + }, + { + "epoch": 2.953091684434968, + "grad_norm": 0.22896154197636323, + "learning_rate": 3.43475930959334e-05, + "loss": 0.3685, + "step": 2770 + }, + { + "epoch": 2.9541577825159915, + "grad_norm": 0.20500126289635404, + "learning_rate": 3.431812233126245e-05, + "loss": 0.3748, + "step": 2771 + }, + { + "epoch": 2.955223880597015, + "grad_norm": 0.1633714645059073, + "learning_rate": 3.428865471405643e-05, + "loss": 0.3591, + "step": 2772 + }, + { + "epoch": 2.956289978678038, + "grad_norm": 0.21578628128041608, + "learning_rate": 3.425919026063886e-05, + "loss": 0.3745, + "step": 2773 + }, + { + "epoch": 2.957356076759062, + "grad_norm": 0.16947850465245867, + "learning_rate": 3.422972898733154e-05, + "loss": 0.3695, + "step": 2774 + }, + { + "epoch": 2.958422174840085, + "grad_norm": 0.15700808274896483, + "learning_rate": 3.420027091045446e-05, + "loss": 0.3627, + "step": 2775 + }, + { + "epoch": 2.9594882729211087, + "grad_norm": 0.1897865970091229, + "learning_rate": 3.417081604632589e-05, + "loss": 0.3624, + "step": 2776 + }, + { + "epoch": 2.9605543710021323, + "grad_norm": 0.19673063754579187, + "learning_rate": 3.414136441126227e-05, + "loss": 0.3692, + "step": 2777 + }, + { + "epoch": 2.961620469083156, + "grad_norm": 0.17553015896004215, + "learning_rate": 3.4111916021578285e-05, + "loss": 0.3678, + "step": 2778 + }, + { + "epoch": 2.9626865671641793, + "grad_norm": 0.2183249284173145, + "learning_rate": 3.408247089358681e-05, + "loss": 0.3747, + "step": 2779 + }, + { + "epoch": 2.9637526652452024, + "grad_norm": 0.21927885211369533, + "learning_rate": 3.405302904359893e-05, + "loss": 0.3673, + "step": 2780 + }, + { + "epoch": 2.964818763326226, + "grad_norm": 0.18731814539514857, + "learning_rate": 3.402359048792386e-05, + "loss": 0.3716, + "step": 2781 + }, + { + "epoch": 2.9658848614072495, + "grad_norm": 0.2338787632638593, + "learning_rate": 3.399415524286907e-05, + "loss": 0.3732, + "step": 2782 + }, + { + "epoch": 2.966950959488273, + "grad_norm": 0.21152997804401275, + "learning_rate": 3.396472332474015e-05, + "loss": 0.3669, + "step": 2783 + }, + { + "epoch": 2.9680170575692966, + "grad_norm": 0.17079876375720443, + "learning_rate": 3.393529474984083e-05, + "loss": 0.3714, + "step": 2784 + }, + { + "epoch": 2.9690831556503197, + "grad_norm": 0.20326343887457965, + "learning_rate": 3.390586953447304e-05, + "loss": 0.3685, + "step": 2785 + }, + { + "epoch": 2.970149253731343, + "grad_norm": 0.265596968296806, + "learning_rate": 3.38764476949368e-05, + "loss": 0.3701, + "step": 2786 + }, + { + "epoch": 2.9712153518123667, + "grad_norm": 0.20949193099124558, + "learning_rate": 3.38470292475303e-05, + "loss": 0.3634, + "step": 2787 + }, + { + "epoch": 2.9722814498933903, + "grad_norm": 0.19275153065422299, + "learning_rate": 3.3817614208549796e-05, + "loss": 0.3652, + "step": 2788 + }, + { + "epoch": 2.973347547974414, + "grad_norm": 0.25513016177139153, + "learning_rate": 3.378820259428972e-05, + "loss": 0.3678, + "step": 2789 + }, + { + "epoch": 2.974413646055437, + "grad_norm": 0.20641698835987704, + "learning_rate": 3.3758794421042585e-05, + "loss": 0.3686, + "step": 2790 + }, + { + "epoch": 2.9754797441364604, + "grad_norm": 0.21686027546060258, + "learning_rate": 3.3729389705098956e-05, + "loss": 0.3721, + "step": 2791 + }, + { + "epoch": 2.976545842217484, + "grad_norm": 0.21228392649559347, + "learning_rate": 3.3699988462747536e-05, + "loss": 0.3619, + "step": 2792 + }, + { + "epoch": 2.9776119402985075, + "grad_norm": 0.15906755651360552, + "learning_rate": 3.3670590710275095e-05, + "loss": 0.3634, + "step": 2793 + }, + { + "epoch": 2.978678038379531, + "grad_norm": 0.23673718965759863, + "learning_rate": 3.3641196463966466e-05, + "loss": 0.3682, + "step": 2794 + }, + { + "epoch": 2.979744136460554, + "grad_norm": 0.2179359974386122, + "learning_rate": 3.36118057401045e-05, + "loss": 0.3661, + "step": 2795 + }, + { + "epoch": 2.9808102345415777, + "grad_norm": 0.18870552087602346, + "learning_rate": 3.358241855497015e-05, + "loss": 0.3692, + "step": 2796 + }, + { + "epoch": 2.981876332622601, + "grad_norm": 0.21000830418856412, + "learning_rate": 3.35530349248424e-05, + "loss": 0.3723, + "step": 2797 + }, + { + "epoch": 2.9829424307036247, + "grad_norm": 0.1834568664565684, + "learning_rate": 3.352365486599823e-05, + "loss": 0.3699, + "step": 2798 + }, + { + "epoch": 2.9840085287846483, + "grad_norm": 0.21955054980022862, + "learning_rate": 3.349427839471268e-05, + "loss": 0.3693, + "step": 2799 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.19131385619838284, + "learning_rate": 3.346490552725879e-05, + "loss": 0.3633, + "step": 2800 + }, + { + "epoch": 2.9861407249466954, + "grad_norm": 0.22039397671168487, + "learning_rate": 3.3435536279907615e-05, + "loss": 0.3668, + "step": 2801 + }, + { + "epoch": 2.9872068230277184, + "grad_norm": 0.25817755908994, + "learning_rate": 3.340617066892815e-05, + "loss": 0.3694, + "step": 2802 + }, + { + "epoch": 2.988272921108742, + "grad_norm": 0.17992068437591005, + "learning_rate": 3.3376808710587456e-05, + "loss": 0.3608, + "step": 2803 + }, + { + "epoch": 2.9893390191897655, + "grad_norm": 0.19725409347741943, + "learning_rate": 3.334745042115052e-05, + "loss": 0.3674, + "step": 2804 + }, + { + "epoch": 2.990405117270789, + "grad_norm": 0.2199411430669973, + "learning_rate": 3.331809581688031e-05, + "loss": 0.3685, + "step": 2805 + }, + { + "epoch": 2.9914712153518126, + "grad_norm": 0.186450392495569, + "learning_rate": 3.328874491403774e-05, + "loss": 0.3652, + "step": 2806 + }, + { + "epoch": 2.9925373134328357, + "grad_norm": 0.2010767437452293, + "learning_rate": 3.32593977288817e-05, + "loss": 0.3795, + "step": 2807 + }, + { + "epoch": 2.9936034115138592, + "grad_norm": 0.19580642217271252, + "learning_rate": 3.3230054277668994e-05, + "loss": 0.3688, + "step": 2808 + }, + { + "epoch": 2.9946695095948828, + "grad_norm": 0.17801010293169015, + "learning_rate": 3.320071457665437e-05, + "loss": 0.3694, + "step": 2809 + }, + { + "epoch": 2.9957356076759063, + "grad_norm": 0.183255241372143, + "learning_rate": 3.317137864209051e-05, + "loss": 0.3711, + "step": 2810 + }, + { + "epoch": 2.99680170575693, + "grad_norm": 0.1656711458529668, + "learning_rate": 3.3142046490227984e-05, + "loss": 0.3668, + "step": 2811 + }, + { + "epoch": 2.997867803837953, + "grad_norm": 0.15999468329558042, + "learning_rate": 3.311271813731529e-05, + "loss": 0.3625, + "step": 2812 + }, + { + "epoch": 2.9989339019189765, + "grad_norm": 0.2031732214049462, + "learning_rate": 3.3083393599598804e-05, + "loss": 0.3667, + "step": 2813 + }, + { + "epoch": 3.0, + "grad_norm": 0.1833937765340318, + "learning_rate": 3.305407289332279e-05, + "loss": 0.356, + "step": 2814 + }, + { + "epoch": 3.0010660980810235, + "grad_norm": 0.16676086865014514, + "learning_rate": 3.3024756034729403e-05, + "loss": 0.3446, + "step": 2815 + }, + { + "epoch": 3.002132196162047, + "grad_norm": 0.1944616035010321, + "learning_rate": 3.299544304005867e-05, + "loss": 0.3499, + "step": 2816 + }, + { + "epoch": 3.00319829424307, + "grad_norm": 0.21726317154600533, + "learning_rate": 3.296613392554845e-05, + "loss": 0.3447, + "step": 2817 + }, + { + "epoch": 3.0042643923240937, + "grad_norm": 0.18162073687276598, + "learning_rate": 3.293682870743446e-05, + "loss": 0.3449, + "step": 2818 + }, + { + "epoch": 3.0053304904051172, + "grad_norm": 0.19337743208072578, + "learning_rate": 3.2907527401950314e-05, + "loss": 0.3438, + "step": 2819 + }, + { + "epoch": 3.0063965884861408, + "grad_norm": 0.20006928027604456, + "learning_rate": 3.287823002532735e-05, + "loss": 0.348, + "step": 2820 + }, + { + "epoch": 3.0074626865671643, + "grad_norm": 0.21921022479851462, + "learning_rate": 3.284893659379483e-05, + "loss": 0.355, + "step": 2821 + }, + { + "epoch": 3.008528784648188, + "grad_norm": 0.19159498596282584, + "learning_rate": 3.2819647123579785e-05, + "loss": 0.3471, + "step": 2822 + }, + { + "epoch": 3.009594882729211, + "grad_norm": 0.1975964755116819, + "learning_rate": 3.2790361630907073e-05, + "loss": 0.351, + "step": 2823 + }, + { + "epoch": 3.0106609808102345, + "grad_norm": 0.19162779591776102, + "learning_rate": 3.276108013199931e-05, + "loss": 0.3471, + "step": 2824 + }, + { + "epoch": 3.011727078891258, + "grad_norm": 0.20069049454309051, + "learning_rate": 3.273180264307693e-05, + "loss": 0.3473, + "step": 2825 + }, + { + "epoch": 3.0127931769722816, + "grad_norm": 0.18162811798879747, + "learning_rate": 3.270252918035817e-05, + "loss": 0.3497, + "step": 2826 + }, + { + "epoch": 3.013859275053305, + "grad_norm": 0.2190938039478538, + "learning_rate": 3.2673259760058966e-05, + "loss": 0.342, + "step": 2827 + }, + { + "epoch": 3.014925373134328, + "grad_norm": 0.20368209134299048, + "learning_rate": 3.264399439839307e-05, + "loss": 0.3565, + "step": 2828 + }, + { + "epoch": 3.0159914712153517, + "grad_norm": 0.21564518091097337, + "learning_rate": 3.261473311157197e-05, + "loss": 0.3477, + "step": 2829 + }, + { + "epoch": 3.0170575692963753, + "grad_norm": 0.22960493863723194, + "learning_rate": 3.258547591580493e-05, + "loss": 0.355, + "step": 2830 + }, + { + "epoch": 3.018123667377399, + "grad_norm": 0.20630353684777758, + "learning_rate": 3.255622282729886e-05, + "loss": 0.3491, + "step": 2831 + }, + { + "epoch": 3.0191897654584223, + "grad_norm": 0.19432289775660633, + "learning_rate": 3.252697386225848e-05, + "loss": 0.3463, + "step": 2832 + }, + { + "epoch": 3.0202558635394454, + "grad_norm": 0.22226714326699387, + "learning_rate": 3.24977290368862e-05, + "loss": 0.3474, + "step": 2833 + }, + { + "epoch": 3.021321961620469, + "grad_norm": 0.20919728574830324, + "learning_rate": 3.2468488367382146e-05, + "loss": 0.3543, + "step": 2834 + }, + { + "epoch": 3.0223880597014925, + "grad_norm": 0.2097539980157466, + "learning_rate": 3.2439251869944096e-05, + "loss": 0.3546, + "step": 2835 + }, + { + "epoch": 3.023454157782516, + "grad_norm": 0.2378340506907076, + "learning_rate": 3.2410019560767566e-05, + "loss": 0.3507, + "step": 2836 + }, + { + "epoch": 3.0245202558635396, + "grad_norm": 0.19806299173295153, + "learning_rate": 3.238079145604576e-05, + "loss": 0.3497, + "step": 2837 + }, + { + "epoch": 3.025586353944563, + "grad_norm": 0.21295221351192548, + "learning_rate": 3.235156757196948e-05, + "loss": 0.3427, + "step": 2838 + }, + { + "epoch": 3.026652452025586, + "grad_norm": 0.22762110787574957, + "learning_rate": 3.2322347924727264e-05, + "loss": 0.352, + "step": 2839 + }, + { + "epoch": 3.0277185501066097, + "grad_norm": 0.20297127463286518, + "learning_rate": 3.229313253050529e-05, + "loss": 0.3481, + "step": 2840 + }, + { + "epoch": 3.0287846481876333, + "grad_norm": 0.1801189363866266, + "learning_rate": 3.2263921405487356e-05, + "loss": 0.3491, + "step": 2841 + }, + { + "epoch": 3.029850746268657, + "grad_norm": 0.2148709514009852, + "learning_rate": 3.2234714565854895e-05, + "loss": 0.3562, + "step": 2842 + }, + { + "epoch": 3.0309168443496803, + "grad_norm": 0.1905295271571919, + "learning_rate": 3.2205512027787005e-05, + "loss": 0.3468, + "step": 2843 + }, + { + "epoch": 3.0319829424307034, + "grad_norm": 0.2416529548256475, + "learning_rate": 3.2176313807460355e-05, + "loss": 0.3521, + "step": 2844 + }, + { + "epoch": 3.033049040511727, + "grad_norm": 0.19945666420393923, + "learning_rate": 3.214711992104925e-05, + "loss": 0.3519, + "step": 2845 + }, + { + "epoch": 3.0341151385927505, + "grad_norm": 0.1969420298529883, + "learning_rate": 3.211793038472558e-05, + "loss": 0.3489, + "step": 2846 + }, + { + "epoch": 3.035181236673774, + "grad_norm": 0.24393952090774143, + "learning_rate": 3.208874521465882e-05, + "loss": 0.3555, + "step": 2847 + }, + { + "epoch": 3.0362473347547976, + "grad_norm": 0.17068118729565857, + "learning_rate": 3.205956442701607e-05, + "loss": 0.3492, + "step": 2848 + }, + { + "epoch": 3.0373134328358207, + "grad_norm": 0.18265684276116836, + "learning_rate": 3.203038803796195e-05, + "loss": 0.3536, + "step": 2849 + }, + { + "epoch": 3.038379530916844, + "grad_norm": 0.1750878986387194, + "learning_rate": 3.200121606365865e-05, + "loss": 0.3467, + "step": 2850 + }, + { + "epoch": 3.0394456289978677, + "grad_norm": 0.1501654395365917, + "learning_rate": 3.197204852026595e-05, + "loss": 0.3463, + "step": 2851 + }, + { + "epoch": 3.0405117270788913, + "grad_norm": 0.1881586833141222, + "learning_rate": 3.194288542394116e-05, + "loss": 0.3485, + "step": 2852 + }, + { + "epoch": 3.041577825159915, + "grad_norm": 0.17391665540804127, + "learning_rate": 3.191372679083908e-05, + "loss": 0.349, + "step": 2853 + }, + { + "epoch": 3.0426439232409384, + "grad_norm": 0.1855229052361205, + "learning_rate": 3.18845726371121e-05, + "loss": 0.3522, + "step": 2854 + }, + { + "epoch": 3.0437100213219614, + "grad_norm": 0.19940822544482417, + "learning_rate": 3.1855422978910136e-05, + "loss": 0.3482, + "step": 2855 + }, + { + "epoch": 3.044776119402985, + "grad_norm": 0.16112705274220399, + "learning_rate": 3.182627783238053e-05, + "loss": 0.347, + "step": 2856 + }, + { + "epoch": 3.0458422174840085, + "grad_norm": 0.20167791211166475, + "learning_rate": 3.179713721366821e-05, + "loss": 0.3534, + "step": 2857 + }, + { + "epoch": 3.046908315565032, + "grad_norm": 0.18479202502339434, + "learning_rate": 3.176800113891556e-05, + "loss": 0.351, + "step": 2858 + }, + { + "epoch": 3.0479744136460556, + "grad_norm": 0.16272058166990602, + "learning_rate": 3.173886962426246e-05, + "loss": 0.3494, + "step": 2859 + }, + { + "epoch": 3.0490405117270787, + "grad_norm": 0.16165195964515638, + "learning_rate": 3.1709742685846224e-05, + "loss": 0.3473, + "step": 2860 + }, + { + "epoch": 3.050106609808102, + "grad_norm": 0.17386330305907433, + "learning_rate": 3.168062033980169e-05, + "loss": 0.3497, + "step": 2861 + }, + { + "epoch": 3.0511727078891258, + "grad_norm": 0.1458661691624279, + "learning_rate": 3.165150260226112e-05, + "loss": 0.35, + "step": 2862 + }, + { + "epoch": 3.0522388059701493, + "grad_norm": 0.20435764871354212, + "learning_rate": 3.162238948935423e-05, + "loss": 0.3486, + "step": 2863 + }, + { + "epoch": 3.053304904051173, + "grad_norm": 0.19071168515585496, + "learning_rate": 3.159328101720816e-05, + "loss": 0.3471, + "step": 2864 + }, + { + "epoch": 3.0543710021321964, + "grad_norm": 0.18693245756126411, + "learning_rate": 3.156417720194749e-05, + "loss": 0.3408, + "step": 2865 + }, + { + "epoch": 3.0554371002132195, + "grad_norm": 0.19376415825853904, + "learning_rate": 3.153507805969425e-05, + "loss": 0.3462, + "step": 2866 + }, + { + "epoch": 3.056503198294243, + "grad_norm": 0.23970364168448857, + "learning_rate": 3.150598360656781e-05, + "loss": 0.3585, + "step": 2867 + }, + { + "epoch": 3.0575692963752665, + "grad_norm": 0.24686273765529104, + "learning_rate": 3.1476893858685e-05, + "loss": 0.3522, + "step": 2868 + }, + { + "epoch": 3.05863539445629, + "grad_norm": 0.20134496461618384, + "learning_rate": 3.1447808832160034e-05, + "loss": 0.3534, + "step": 2869 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.20811412001280552, + "learning_rate": 3.141872854310452e-05, + "loss": 0.3529, + "step": 2870 + }, + { + "epoch": 3.0607675906183367, + "grad_norm": 0.2252811945149839, + "learning_rate": 3.13896530076274e-05, + "loss": 0.3507, + "step": 2871 + }, + { + "epoch": 3.0618336886993602, + "grad_norm": 0.27899336334092645, + "learning_rate": 3.1360582241835025e-05, + "loss": 0.3445, + "step": 2872 + }, + { + "epoch": 3.0628997867803838, + "grad_norm": 0.18208464259977086, + "learning_rate": 3.13315162618311e-05, + "loss": 0.3512, + "step": 2873 + }, + { + "epoch": 3.0639658848614073, + "grad_norm": 0.19082089290473603, + "learning_rate": 3.130245508371663e-05, + "loss": 0.3528, + "step": 2874 + }, + { + "epoch": 3.065031982942431, + "grad_norm": 0.2131583906254748, + "learning_rate": 3.127339872359002e-05, + "loss": 0.3521, + "step": 2875 + }, + { + "epoch": 3.066098081023454, + "grad_norm": 0.17934067948301102, + "learning_rate": 3.1244347197546986e-05, + "loss": 0.3484, + "step": 2876 + }, + { + "epoch": 3.0671641791044775, + "grad_norm": 0.21329220073762686, + "learning_rate": 3.1215300521680564e-05, + "loss": 0.3564, + "step": 2877 + }, + { + "epoch": 3.068230277185501, + "grad_norm": 0.21979298064989125, + "learning_rate": 3.118625871208109e-05, + "loss": 0.3441, + "step": 2878 + }, + { + "epoch": 3.0692963752665245, + "grad_norm": 0.1899827348463154, + "learning_rate": 3.115722178483624e-05, + "loss": 0.3548, + "step": 2879 + }, + { + "epoch": 3.070362473347548, + "grad_norm": 0.19208909798825324, + "learning_rate": 3.1128189756030934e-05, + "loss": 0.3472, + "step": 2880 + }, + { + "epoch": 3.0714285714285716, + "grad_norm": 0.15715982773841544, + "learning_rate": 3.109916264174743e-05, + "loss": 0.3447, + "step": 2881 + }, + { + "epoch": 3.0724946695095947, + "grad_norm": 0.19051789498649285, + "learning_rate": 3.1070140458065235e-05, + "loss": 0.3478, + "step": 2882 + }, + { + "epoch": 3.0735607675906182, + "grad_norm": 0.17970300278684168, + "learning_rate": 3.104112322106112e-05, + "loss": 0.3468, + "step": 2883 + }, + { + "epoch": 3.074626865671642, + "grad_norm": 0.1733355341791082, + "learning_rate": 3.1012110946809134e-05, + "loss": 0.355, + "step": 2884 + }, + { + "epoch": 3.0756929637526653, + "grad_norm": 0.1997157707879784, + "learning_rate": 3.0983103651380574e-05, + "loss": 0.3472, + "step": 2885 + }, + { + "epoch": 3.076759061833689, + "grad_norm": 0.1683478624827942, + "learning_rate": 3.0954101350843966e-05, + "loss": 0.3472, + "step": 2886 + }, + { + "epoch": 3.077825159914712, + "grad_norm": 0.17517827837363226, + "learning_rate": 3.092510406126508e-05, + "loss": 0.3466, + "step": 2887 + }, + { + "epoch": 3.0788912579957355, + "grad_norm": 0.18556306694129251, + "learning_rate": 3.089611179870691e-05, + "loss": 0.3498, + "step": 2888 + }, + { + "epoch": 3.079957356076759, + "grad_norm": 0.17804006539183986, + "learning_rate": 3.086712457922966e-05, + "loss": 0.3459, + "step": 2889 + }, + { + "epoch": 3.0810234541577826, + "grad_norm": 0.17926971743225267, + "learning_rate": 3.083814241889074e-05, + "loss": 0.3485, + "step": 2890 + }, + { + "epoch": 3.082089552238806, + "grad_norm": 0.18765772737419348, + "learning_rate": 3.0809165333744765e-05, + "loss": 0.3555, + "step": 2891 + }, + { + "epoch": 3.0831556503198296, + "grad_norm": 0.1665179916020946, + "learning_rate": 3.0780193339843545e-05, + "loss": 0.346, + "step": 2892 + }, + { + "epoch": 3.0842217484008527, + "grad_norm": 0.16253422719498084, + "learning_rate": 3.075122645323603e-05, + "loss": 0.347, + "step": 2893 + }, + { + "epoch": 3.0852878464818763, + "grad_norm": 0.1705949117676651, + "learning_rate": 3.072226468996839e-05, + "loss": 0.3531, + "step": 2894 + }, + { + "epoch": 3.0863539445629, + "grad_norm": 0.16669859994578132, + "learning_rate": 3.0693308066083954e-05, + "loss": 0.3489, + "step": 2895 + }, + { + "epoch": 3.0874200426439233, + "grad_norm": 0.15555031037234748, + "learning_rate": 3.0664356597623144e-05, + "loss": 0.3505, + "step": 2896 + }, + { + "epoch": 3.088486140724947, + "grad_norm": 0.18596041745219372, + "learning_rate": 3.0635410300623596e-05, + "loss": 0.354, + "step": 2897 + }, + { + "epoch": 3.08955223880597, + "grad_norm": 0.1905420596562174, + "learning_rate": 3.060646919112004e-05, + "loss": 0.3481, + "step": 2898 + }, + { + "epoch": 3.0906183368869935, + "grad_norm": 0.18015092645672678, + "learning_rate": 3.057753328514438e-05, + "loss": 0.3492, + "step": 2899 + }, + { + "epoch": 3.091684434968017, + "grad_norm": 0.1773289970330081, + "learning_rate": 3.0548602598725564e-05, + "loss": 0.3523, + "step": 2900 + }, + { + "epoch": 3.0927505330490406, + "grad_norm": 0.19120781604022777, + "learning_rate": 3.0519677147889705e-05, + "loss": 0.3533, + "step": 2901 + }, + { + "epoch": 3.093816631130064, + "grad_norm": 0.18705169216756135, + "learning_rate": 3.0490756948660017e-05, + "loss": 0.3486, + "step": 2902 + }, + { + "epoch": 3.094882729211087, + "grad_norm": 0.21344733721957102, + "learning_rate": 3.046184201705675e-05, + "loss": 0.3546, + "step": 2903 + }, + { + "epoch": 3.0959488272921107, + "grad_norm": 0.16522989881515734, + "learning_rate": 3.043293236909729e-05, + "loss": 0.3486, + "step": 2904 + }, + { + "epoch": 3.0970149253731343, + "grad_norm": 0.1819572191490791, + "learning_rate": 3.0404028020796087e-05, + "loss": 0.3496, + "step": 2905 + }, + { + "epoch": 3.098081023454158, + "grad_norm": 0.15871225159309935, + "learning_rate": 3.0375128988164655e-05, + "loss": 0.3476, + "step": 2906 + }, + { + "epoch": 3.0991471215351813, + "grad_norm": 0.17994107490800862, + "learning_rate": 3.0346235287211532e-05, + "loss": 0.3509, + "step": 2907 + }, + { + "epoch": 3.100213219616205, + "grad_norm": 0.16605077988407455, + "learning_rate": 3.0317346933942337e-05, + "loss": 0.3563, + "step": 2908 + }, + { + "epoch": 3.101279317697228, + "grad_norm": 0.16665287698505823, + "learning_rate": 3.028846394435973e-05, + "loss": 0.3468, + "step": 2909 + }, + { + "epoch": 3.1023454157782515, + "grad_norm": 0.19464014834448382, + "learning_rate": 3.0259586334463366e-05, + "loss": 0.3461, + "step": 2910 + }, + { + "epoch": 3.103411513859275, + "grad_norm": 0.1387449385010934, + "learning_rate": 3.0230714120249947e-05, + "loss": 0.351, + "step": 2911 + }, + { + "epoch": 3.1044776119402986, + "grad_norm": 0.17872611874146532, + "learning_rate": 3.020184731771319e-05, + "loss": 0.3488, + "step": 2912 + }, + { + "epoch": 3.105543710021322, + "grad_norm": 0.18416826482656345, + "learning_rate": 3.017298594284379e-05, + "loss": 0.3487, + "step": 2913 + }, + { + "epoch": 3.106609808102345, + "grad_norm": 0.17275054090026867, + "learning_rate": 3.0144130011629448e-05, + "loss": 0.354, + "step": 2914 + }, + { + "epoch": 3.1076759061833688, + "grad_norm": 0.17763788392460803, + "learning_rate": 3.011527954005486e-05, + "loss": 0.3485, + "step": 2915 + }, + { + "epoch": 3.1087420042643923, + "grad_norm": 0.1945341807582054, + "learning_rate": 3.0086434544101677e-05, + "loss": 0.3489, + "step": 2916 + }, + { + "epoch": 3.109808102345416, + "grad_norm": 0.14591971732114, + "learning_rate": 3.005759503974854e-05, + "loss": 0.3499, + "step": 2917 + }, + { + "epoch": 3.1108742004264394, + "grad_norm": 0.16096483078908078, + "learning_rate": 3.0028761042971028e-05, + "loss": 0.3446, + "step": 2918 + }, + { + "epoch": 3.111940298507463, + "grad_norm": 0.1877410826166536, + "learning_rate": 2.9999932569741673e-05, + "loss": 0.3543, + "step": 2919 + }, + { + "epoch": 3.113006396588486, + "grad_norm": 0.1583922934703547, + "learning_rate": 2.9971109636029952e-05, + "loss": 0.3454, + "step": 2920 + }, + { + "epoch": 3.1140724946695095, + "grad_norm": 0.16306779868577526, + "learning_rate": 2.994229225780229e-05, + "loss": 0.3419, + "step": 2921 + }, + { + "epoch": 3.115138592750533, + "grad_norm": 0.1919727737485944, + "learning_rate": 2.991348045102199e-05, + "loss": 0.3479, + "step": 2922 + }, + { + "epoch": 3.1162046908315566, + "grad_norm": 0.14431740367185825, + "learning_rate": 2.988467423164931e-05, + "loss": 0.3519, + "step": 2923 + }, + { + "epoch": 3.11727078891258, + "grad_norm": 0.19602916545574073, + "learning_rate": 2.9855873615641414e-05, + "loss": 0.3489, + "step": 2924 + }, + { + "epoch": 3.1183368869936032, + "grad_norm": 0.15941172314892962, + "learning_rate": 2.982707861895231e-05, + "loss": 0.3473, + "step": 2925 + }, + { + "epoch": 3.1194029850746268, + "grad_norm": 0.15400208783886424, + "learning_rate": 2.9798289257532946e-05, + "loss": 0.347, + "step": 2926 + }, + { + "epoch": 3.1204690831556503, + "grad_norm": 0.16733091547516227, + "learning_rate": 2.976950554733114e-05, + "loss": 0.3487, + "step": 2927 + }, + { + "epoch": 3.121535181236674, + "grad_norm": 0.16131815693424373, + "learning_rate": 2.9740727504291577e-05, + "loss": 0.3515, + "step": 2928 + }, + { + "epoch": 3.1226012793176974, + "grad_norm": 0.18988500484746434, + "learning_rate": 2.9711955144355776e-05, + "loss": 0.3475, + "step": 2929 + }, + { + "epoch": 3.1236673773987205, + "grad_norm": 0.16873039013955726, + "learning_rate": 2.9683188483462135e-05, + "loss": 0.3506, + "step": 2930 + }, + { + "epoch": 3.124733475479744, + "grad_norm": 0.18997760424236068, + "learning_rate": 2.9654427537545915e-05, + "loss": 0.3527, + "step": 2931 + }, + { + "epoch": 3.1257995735607675, + "grad_norm": 0.16015025466863764, + "learning_rate": 2.962567232253914e-05, + "loss": 0.349, + "step": 2932 + }, + { + "epoch": 3.126865671641791, + "grad_norm": 0.14231292340170754, + "learning_rate": 2.9596922854370737e-05, + "loss": 0.3487, + "step": 2933 + }, + { + "epoch": 3.1279317697228146, + "grad_norm": 0.16799598139417324, + "learning_rate": 2.9568179148966406e-05, + "loss": 0.3504, + "step": 2934 + }, + { + "epoch": 3.128997867803838, + "grad_norm": 0.13092435516555, + "learning_rate": 2.9539441222248685e-05, + "loss": 0.3502, + "step": 2935 + }, + { + "epoch": 3.1300639658848612, + "grad_norm": 0.16201697102720602, + "learning_rate": 2.9510709090136855e-05, + "loss": 0.3507, + "step": 2936 + }, + { + "epoch": 3.131130063965885, + "grad_norm": 0.13920375866502643, + "learning_rate": 2.9481982768547048e-05, + "loss": 0.3484, + "step": 2937 + }, + { + "epoch": 3.1321961620469083, + "grad_norm": 0.1608670042161187, + "learning_rate": 2.945326227339215e-05, + "loss": 0.3505, + "step": 2938 + }, + { + "epoch": 3.133262260127932, + "grad_norm": 0.15274868977241857, + "learning_rate": 2.942454762058184e-05, + "loss": 0.353, + "step": 2939 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.1728035181021134, + "learning_rate": 2.939583882602251e-05, + "loss": 0.3537, + "step": 2940 + }, + { + "epoch": 3.1353944562899785, + "grad_norm": 0.20061129935035046, + "learning_rate": 2.936713590561735e-05, + "loss": 0.3537, + "step": 2941 + }, + { + "epoch": 3.136460554371002, + "grad_norm": 0.16121457578954346, + "learning_rate": 2.9338438875266315e-05, + "loss": 0.3493, + "step": 2942 + }, + { + "epoch": 3.1375266524520256, + "grad_norm": 0.1985681459680183, + "learning_rate": 2.930974775086602e-05, + "loss": 0.3494, + "step": 2943 + }, + { + "epoch": 3.138592750533049, + "grad_norm": 0.13586545126970054, + "learning_rate": 2.928106254830989e-05, + "loss": 0.3485, + "step": 2944 + }, + { + "epoch": 3.1396588486140726, + "grad_norm": 0.193935638265073, + "learning_rate": 2.9252383283488038e-05, + "loss": 0.3506, + "step": 2945 + }, + { + "epoch": 3.140724946695096, + "grad_norm": 0.15997507985832404, + "learning_rate": 2.9223709972287274e-05, + "loss": 0.3501, + "step": 2946 + }, + { + "epoch": 3.1417910447761193, + "grad_norm": 0.16394293111831423, + "learning_rate": 2.9195042630591115e-05, + "loss": 0.3428, + "step": 2947 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 0.17865359760016075, + "learning_rate": 2.9166381274279803e-05, + "loss": 0.3519, + "step": 2948 + }, + { + "epoch": 3.1439232409381663, + "grad_norm": 0.17991378591531254, + "learning_rate": 2.913772591923021e-05, + "loss": 0.3485, + "step": 2949 + }, + { + "epoch": 3.14498933901919, + "grad_norm": 0.1693801011865618, + "learning_rate": 2.9109076581315937e-05, + "loss": 0.3559, + "step": 2950 + }, + { + "epoch": 3.1460554371002134, + "grad_norm": 0.14463952221236143, + "learning_rate": 2.908043327640723e-05, + "loss": 0.3579, + "step": 2951 + }, + { + "epoch": 3.1471215351812365, + "grad_norm": 0.1573143821262481, + "learning_rate": 2.9051796020370964e-05, + "loss": 0.3488, + "step": 2952 + }, + { + "epoch": 3.14818763326226, + "grad_norm": 0.16678149027123818, + "learning_rate": 2.9023164829070718e-05, + "loss": 0.3559, + "step": 2953 + }, + { + "epoch": 3.1492537313432836, + "grad_norm": 0.140262490078489, + "learning_rate": 2.8994539718366672e-05, + "loss": 0.3535, + "step": 2954 + }, + { + "epoch": 3.150319829424307, + "grad_norm": 0.2046721039374505, + "learning_rate": 2.8965920704115644e-05, + "loss": 0.3511, + "step": 2955 + }, + { + "epoch": 3.1513859275053306, + "grad_norm": 0.1438158341906599, + "learning_rate": 2.8937307802171085e-05, + "loss": 0.3443, + "step": 2956 + }, + { + "epoch": 3.1524520255863537, + "grad_norm": 0.14253271019539046, + "learning_rate": 2.8908701028383084e-05, + "loss": 0.3475, + "step": 2957 + }, + { + "epoch": 3.1535181236673773, + "grad_norm": 0.14190886636267147, + "learning_rate": 2.888010039859826e-05, + "loss": 0.3465, + "step": 2958 + }, + { + "epoch": 3.154584221748401, + "grad_norm": 0.1635398501114063, + "learning_rate": 2.8851505928659896e-05, + "loss": 0.3521, + "step": 2959 + }, + { + "epoch": 3.1556503198294243, + "grad_norm": 0.1520685576715207, + "learning_rate": 2.8822917634407858e-05, + "loss": 0.3463, + "step": 2960 + }, + { + "epoch": 3.156716417910448, + "grad_norm": 0.1723966595861352, + "learning_rate": 2.8794335531678545e-05, + "loss": 0.3481, + "step": 2961 + }, + { + "epoch": 3.1577825159914714, + "grad_norm": 0.16208068603676945, + "learning_rate": 2.8765759636304973e-05, + "loss": 0.3575, + "step": 2962 + }, + { + "epoch": 3.1588486140724945, + "grad_norm": 0.1453800336738485, + "learning_rate": 2.8737189964116705e-05, + "loss": 0.3471, + "step": 2963 + }, + { + "epoch": 3.159914712153518, + "grad_norm": 0.1858091083588778, + "learning_rate": 2.8708626530939865e-05, + "loss": 0.3506, + "step": 2964 + }, + { + "epoch": 3.1609808102345416, + "grad_norm": 0.18692761383630035, + "learning_rate": 2.868006935259708e-05, + "loss": 0.3467, + "step": 2965 + }, + { + "epoch": 3.162046908315565, + "grad_norm": 0.14846566192818872, + "learning_rate": 2.8651518444907556e-05, + "loss": 0.3471, + "step": 2966 + }, + { + "epoch": 3.1631130063965887, + "grad_norm": 0.16153996741706544, + "learning_rate": 2.862297382368702e-05, + "loss": 0.3542, + "step": 2967 + }, + { + "epoch": 3.1641791044776117, + "grad_norm": 0.16957351241623203, + "learning_rate": 2.8594435504747724e-05, + "loss": 0.3525, + "step": 2968 + }, + { + "epoch": 3.1652452025586353, + "grad_norm": 0.13030665419204615, + "learning_rate": 2.856590350389837e-05, + "loss": 0.3463, + "step": 2969 + }, + { + "epoch": 3.166311300639659, + "grad_norm": 0.13338420990478728, + "learning_rate": 2.8537377836944232e-05, + "loss": 0.3566, + "step": 2970 + }, + { + "epoch": 3.1673773987206824, + "grad_norm": 0.15453541002399912, + "learning_rate": 2.850885851968706e-05, + "loss": 0.3455, + "step": 2971 + }, + { + "epoch": 3.168443496801706, + "grad_norm": 0.1539275910876287, + "learning_rate": 2.8480345567925036e-05, + "loss": 0.3505, + "step": 2972 + }, + { + "epoch": 3.1695095948827294, + "grad_norm": 0.1601347292242495, + "learning_rate": 2.8451838997452875e-05, + "loss": 0.355, + "step": 2973 + }, + { + "epoch": 3.1705756929637525, + "grad_norm": 0.16109585673660984, + "learning_rate": 2.8423338824061732e-05, + "loss": 0.3494, + "step": 2974 + }, + { + "epoch": 3.171641791044776, + "grad_norm": 0.15883551225228731, + "learning_rate": 2.839484506353924e-05, + "loss": 0.3461, + "step": 2975 + }, + { + "epoch": 3.1727078891257996, + "grad_norm": 0.18127651611679274, + "learning_rate": 2.836635773166943e-05, + "loss": 0.3533, + "step": 2976 + }, + { + "epoch": 3.173773987206823, + "grad_norm": 0.14832179685684777, + "learning_rate": 2.833787684423282e-05, + "loss": 0.3512, + "step": 2977 + }, + { + "epoch": 3.1748400852878467, + "grad_norm": 0.1833917506129557, + "learning_rate": 2.8309402417006344e-05, + "loss": 0.3488, + "step": 2978 + }, + { + "epoch": 3.1759061833688698, + "grad_norm": 0.16898776691695933, + "learning_rate": 2.8280934465763352e-05, + "loss": 0.3451, + "step": 2979 + }, + { + "epoch": 3.1769722814498933, + "grad_norm": 0.14677271644886078, + "learning_rate": 2.82524730062736e-05, + "loss": 0.3509, + "step": 2980 + }, + { + "epoch": 3.178038379530917, + "grad_norm": 0.19057648965710483, + "learning_rate": 2.8224018054303278e-05, + "loss": 0.3551, + "step": 2981 + }, + { + "epoch": 3.1791044776119404, + "grad_norm": 0.13538971093481636, + "learning_rate": 2.8195569625614933e-05, + "loss": 0.3456, + "step": 2982 + }, + { + "epoch": 3.180170575692964, + "grad_norm": 0.1390167814695544, + "learning_rate": 2.816712773596751e-05, + "loss": 0.3531, + "step": 2983 + }, + { + "epoch": 3.181236673773987, + "grad_norm": 0.15503459765163632, + "learning_rate": 2.8138692401116366e-05, + "loss": 0.3496, + "step": 2984 + }, + { + "epoch": 3.1823027718550105, + "grad_norm": 0.14462829278663678, + "learning_rate": 2.811026363681317e-05, + "loss": 0.3553, + "step": 2985 + }, + { + "epoch": 3.183368869936034, + "grad_norm": 0.17695463207505402, + "learning_rate": 2.8081841458806002e-05, + "loss": 0.3533, + "step": 2986 + }, + { + "epoch": 3.1844349680170576, + "grad_norm": 0.1591547097918123, + "learning_rate": 2.8053425882839252e-05, + "loss": 0.3471, + "step": 2987 + }, + { + "epoch": 3.185501066098081, + "grad_norm": 0.23045461569297743, + "learning_rate": 2.802501692465368e-05, + "loss": 0.3558, + "step": 2988 + }, + { + "epoch": 3.1865671641791047, + "grad_norm": 0.17997948094924288, + "learning_rate": 2.799661459998638e-05, + "loss": 0.3451, + "step": 2989 + }, + { + "epoch": 3.1876332622601278, + "grad_norm": 0.15073757914110497, + "learning_rate": 2.7968218924570757e-05, + "loss": 0.3499, + "step": 2990 + }, + { + "epoch": 3.1886993603411513, + "grad_norm": 0.149109855986733, + "learning_rate": 2.7939829914136533e-05, + "loss": 0.3505, + "step": 2991 + }, + { + "epoch": 3.189765458422175, + "grad_norm": 0.1828583325585577, + "learning_rate": 2.791144758440975e-05, + "loss": 0.3534, + "step": 2992 + }, + { + "epoch": 3.1908315565031984, + "grad_norm": 0.1835467829513033, + "learning_rate": 2.788307195111276e-05, + "loss": 0.3571, + "step": 2993 + }, + { + "epoch": 3.191897654584222, + "grad_norm": 0.14112165071165333, + "learning_rate": 2.7854703029964157e-05, + "loss": 0.349, + "step": 2994 + }, + { + "epoch": 3.192963752665245, + "grad_norm": 0.14994590904845737, + "learning_rate": 2.7826340836678868e-05, + "loss": 0.3512, + "step": 2995 + }, + { + "epoch": 3.1940298507462686, + "grad_norm": 0.15001218329008587, + "learning_rate": 2.779798538696807e-05, + "loss": 0.3508, + "step": 2996 + }, + { + "epoch": 3.195095948827292, + "grad_norm": 0.14167484348076181, + "learning_rate": 2.776963669653923e-05, + "loss": 0.3531, + "step": 2997 + }, + { + "epoch": 3.1961620469083156, + "grad_norm": 0.16415685787588938, + "learning_rate": 2.7741294781096008e-05, + "loss": 0.3475, + "step": 2998 + }, + { + "epoch": 3.197228144989339, + "grad_norm": 0.18767514637702254, + "learning_rate": 2.7712959656338375e-05, + "loss": 0.3503, + "step": 2999 + }, + { + "epoch": 3.1982942430703627, + "grad_norm": 0.1318270842245805, + "learning_rate": 2.7684631337962535e-05, + "loss": 0.3483, + "step": 3000 + }, + { + "epoch": 3.199360341151386, + "grad_norm": 0.17743086770734862, + "learning_rate": 2.7656309841660864e-05, + "loss": 0.3497, + "step": 3001 + }, + { + "epoch": 3.2004264392324093, + "grad_norm": 0.20226460565691426, + "learning_rate": 2.7627995183122025e-05, + "loss": 0.3548, + "step": 3002 + }, + { + "epoch": 3.201492537313433, + "grad_norm": 0.1404795955564027, + "learning_rate": 2.7599687378030862e-05, + "loss": 0.3456, + "step": 3003 + }, + { + "epoch": 3.2025586353944564, + "grad_norm": 0.1664198795587612, + "learning_rate": 2.7571386442068443e-05, + "loss": 0.348, + "step": 3004 + }, + { + "epoch": 3.20362473347548, + "grad_norm": 0.1871561566919217, + "learning_rate": 2.754309239091199e-05, + "loss": 0.3487, + "step": 3005 + }, + { + "epoch": 3.204690831556503, + "grad_norm": 0.1762158383532131, + "learning_rate": 2.7514805240234942e-05, + "loss": 0.3489, + "step": 3006 + }, + { + "epoch": 3.2057569296375266, + "grad_norm": 0.1409821970880933, + "learning_rate": 2.7486525005706915e-05, + "loss": 0.3462, + "step": 3007 + }, + { + "epoch": 3.20682302771855, + "grad_norm": 0.1919928381276622, + "learning_rate": 2.745825170299371e-05, + "loss": 0.3431, + "step": 3008 + }, + { + "epoch": 3.2078891257995736, + "grad_norm": 0.14051119559763037, + "learning_rate": 2.7429985347757232e-05, + "loss": 0.3448, + "step": 3009 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.1581183828916611, + "learning_rate": 2.7401725955655582e-05, + "loss": 0.3494, + "step": 3010 + }, + { + "epoch": 3.2100213219616203, + "grad_norm": 0.16965574848535128, + "learning_rate": 2.7373473542343023e-05, + "loss": 0.3447, + "step": 3011 + }, + { + "epoch": 3.211087420042644, + "grad_norm": 0.1596889434072171, + "learning_rate": 2.7345228123469886e-05, + "loss": 0.3483, + "step": 3012 + }, + { + "epoch": 3.2121535181236673, + "grad_norm": 0.17408804946301867, + "learning_rate": 2.731698971468268e-05, + "loss": 0.3567, + "step": 3013 + }, + { + "epoch": 3.213219616204691, + "grad_norm": 0.16081014646576602, + "learning_rate": 2.7288758331624025e-05, + "loss": 0.3447, + "step": 3014 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 0.1387088635896051, + "learning_rate": 2.7260533989932628e-05, + "loss": 0.3512, + "step": 3015 + }, + { + "epoch": 3.2153518123667375, + "grad_norm": 0.15123466339681416, + "learning_rate": 2.7232316705243305e-05, + "loss": 0.3481, + "step": 3016 + }, + { + "epoch": 3.216417910447761, + "grad_norm": 0.15290558325073092, + "learning_rate": 2.720410649318698e-05, + "loss": 0.3488, + "step": 3017 + }, + { + "epoch": 3.2174840085287846, + "grad_norm": 0.17149168701297954, + "learning_rate": 2.7175903369390638e-05, + "loss": 0.3517, + "step": 3018 + }, + { + "epoch": 3.218550106609808, + "grad_norm": 0.16114167670939453, + "learning_rate": 2.7147707349477327e-05, + "loss": 0.352, + "step": 3019 + }, + { + "epoch": 3.2196162046908317, + "grad_norm": 0.14097634690323843, + "learning_rate": 2.7119518449066205e-05, + "loss": 0.349, + "step": 3020 + }, + { + "epoch": 3.220682302771855, + "grad_norm": 0.16246129452859187, + "learning_rate": 2.7091336683772437e-05, + "loss": 0.3496, + "step": 3021 + }, + { + "epoch": 3.2217484008528783, + "grad_norm": 0.16336429392891585, + "learning_rate": 2.7063162069207262e-05, + "loss": 0.3474, + "step": 3022 + }, + { + "epoch": 3.222814498933902, + "grad_norm": 0.14988643488196962, + "learning_rate": 2.7034994620977965e-05, + "loss": 0.3477, + "step": 3023 + }, + { + "epoch": 3.2238805970149254, + "grad_norm": 0.16792178240326763, + "learning_rate": 2.700683435468782e-05, + "loss": 0.3515, + "step": 3024 + }, + { + "epoch": 3.224946695095949, + "grad_norm": 0.14865516690552993, + "learning_rate": 2.6978681285936176e-05, + "loss": 0.3535, + "step": 3025 + }, + { + "epoch": 3.2260127931769724, + "grad_norm": 0.15929449524500977, + "learning_rate": 2.6950535430318373e-05, + "loss": 0.3519, + "step": 3026 + }, + { + "epoch": 3.227078891257996, + "grad_norm": 0.1599091744019697, + "learning_rate": 2.692239680342572e-05, + "loss": 0.3529, + "step": 3027 + }, + { + "epoch": 3.228144989339019, + "grad_norm": 0.1484966105614425, + "learning_rate": 2.689426542084558e-05, + "loss": 0.3579, + "step": 3028 + }, + { + "epoch": 3.2292110874200426, + "grad_norm": 0.1385274858632669, + "learning_rate": 2.686614129816129e-05, + "loss": 0.3497, + "step": 3029 + }, + { + "epoch": 3.230277185501066, + "grad_norm": 0.14991864182995415, + "learning_rate": 2.683802445095211e-05, + "loss": 0.3525, + "step": 3030 + }, + { + "epoch": 3.2313432835820897, + "grad_norm": 0.15476961485342514, + "learning_rate": 2.6809914894793344e-05, + "loss": 0.3424, + "step": 3031 + }, + { + "epoch": 3.232409381663113, + "grad_norm": 0.12397588075589204, + "learning_rate": 2.6781812645256216e-05, + "loss": 0.3426, + "step": 3032 + }, + { + "epoch": 3.2334754797441363, + "grad_norm": 0.15055980357741625, + "learning_rate": 2.6753717717907925e-05, + "loss": 0.3515, + "step": 3033 + }, + { + "epoch": 3.23454157782516, + "grad_norm": 0.14872467882760348, + "learning_rate": 2.672563012831158e-05, + "loss": 0.3456, + "step": 3034 + }, + { + "epoch": 3.2356076759061834, + "grad_norm": 0.14261242873484498, + "learning_rate": 2.6697549892026247e-05, + "loss": 0.3544, + "step": 3035 + }, + { + "epoch": 3.236673773987207, + "grad_norm": 0.15857980357826498, + "learning_rate": 2.666947702460693e-05, + "loss": 0.3546, + "step": 3036 + }, + { + "epoch": 3.2377398720682304, + "grad_norm": 0.16110840435766394, + "learning_rate": 2.6641411541604544e-05, + "loss": 0.3536, + "step": 3037 + }, + { + "epoch": 3.2388059701492535, + "grad_norm": 0.16212693241700432, + "learning_rate": 2.6613353458565887e-05, + "loss": 0.3511, + "step": 3038 + }, + { + "epoch": 3.239872068230277, + "grad_norm": 0.14587623221049775, + "learning_rate": 2.6585302791033688e-05, + "loss": 0.3522, + "step": 3039 + }, + { + "epoch": 3.2409381663113006, + "grad_norm": 0.13597680071298357, + "learning_rate": 2.6557259554546577e-05, + "loss": 0.3447, + "step": 3040 + }, + { + "epoch": 3.242004264392324, + "grad_norm": 0.16519790924407182, + "learning_rate": 2.6529223764639013e-05, + "loss": 0.3531, + "step": 3041 + }, + { + "epoch": 3.2430703624733477, + "grad_norm": 0.12865148005181068, + "learning_rate": 2.650119543684139e-05, + "loss": 0.3453, + "step": 3042 + }, + { + "epoch": 3.2441364605543708, + "grad_norm": 0.16931532502182053, + "learning_rate": 2.6473174586679947e-05, + "loss": 0.3481, + "step": 3043 + }, + { + "epoch": 3.2452025586353943, + "grad_norm": 0.20153378061586566, + "learning_rate": 2.644516122967678e-05, + "loss": 0.3474, + "step": 3044 + }, + { + "epoch": 3.246268656716418, + "grad_norm": 0.15391567159601363, + "learning_rate": 2.6417155381349814e-05, + "loss": 0.3555, + "step": 3045 + }, + { + "epoch": 3.2473347547974414, + "grad_norm": 0.18485138313641797, + "learning_rate": 2.638915705721284e-05, + "loss": 0.348, + "step": 3046 + }, + { + "epoch": 3.248400852878465, + "grad_norm": 0.19279732548240924, + "learning_rate": 2.6361166272775503e-05, + "loss": 0.3473, + "step": 3047 + }, + { + "epoch": 3.2494669509594885, + "grad_norm": 0.14914227409372177, + "learning_rate": 2.6333183043543207e-05, + "loss": 0.3491, + "step": 3048 + }, + { + "epoch": 3.2505330490405115, + "grad_norm": 0.198163678392842, + "learning_rate": 2.630520738501721e-05, + "loss": 0.3537, + "step": 3049 + }, + { + "epoch": 3.251599147121535, + "grad_norm": 0.1495697517853574, + "learning_rate": 2.6277239312694596e-05, + "loss": 0.3521, + "step": 3050 + }, + { + "epoch": 3.2526652452025586, + "grad_norm": 0.18049308488761015, + "learning_rate": 2.624927884206821e-05, + "loss": 0.3559, + "step": 3051 + }, + { + "epoch": 3.253731343283582, + "grad_norm": 0.1554880781007249, + "learning_rate": 2.6221325988626686e-05, + "loss": 0.3474, + "step": 3052 + }, + { + "epoch": 3.2547974413646057, + "grad_norm": 0.17014877400454104, + "learning_rate": 2.619338076785448e-05, + "loss": 0.3471, + "step": 3053 + }, + { + "epoch": 3.2558635394456292, + "grad_norm": 0.2175697746157974, + "learning_rate": 2.6165443195231763e-05, + "loss": 0.3526, + "step": 3054 + }, + { + "epoch": 3.2569296375266523, + "grad_norm": 0.1533228757448852, + "learning_rate": 2.6137513286234528e-05, + "loss": 0.355, + "step": 3055 + }, + { + "epoch": 3.257995735607676, + "grad_norm": 0.170158319003947, + "learning_rate": 2.6109591056334474e-05, + "loss": 0.3497, + "step": 3056 + }, + { + "epoch": 3.2590618336886994, + "grad_norm": 0.2110756154467986, + "learning_rate": 2.608167652099906e-05, + "loss": 0.3439, + "step": 3057 + }, + { + "epoch": 3.260127931769723, + "grad_norm": 0.14837278713007337, + "learning_rate": 2.6053769695691507e-05, + "loss": 0.3423, + "step": 3058 + }, + { + "epoch": 3.2611940298507465, + "grad_norm": 0.1809824101682613, + "learning_rate": 2.6025870595870733e-05, + "loss": 0.3487, + "step": 3059 + }, + { + "epoch": 3.2622601279317696, + "grad_norm": 0.21033652261955751, + "learning_rate": 2.5997979236991386e-05, + "loss": 0.3527, + "step": 3060 + }, + { + "epoch": 3.263326226012793, + "grad_norm": 0.21776809754829038, + "learning_rate": 2.5970095634503833e-05, + "loss": 0.3472, + "step": 3061 + }, + { + "epoch": 3.2643923240938166, + "grad_norm": 0.15831570347972593, + "learning_rate": 2.5942219803854168e-05, + "loss": 0.3437, + "step": 3062 + }, + { + "epoch": 3.26545842217484, + "grad_norm": 0.17838265493780678, + "learning_rate": 2.59143517604841e-05, + "loss": 0.3476, + "step": 3063 + }, + { + "epoch": 3.2665245202558637, + "grad_norm": 0.20607722523788694, + "learning_rate": 2.588649151983111e-05, + "loss": 0.3472, + "step": 3064 + }, + { + "epoch": 3.267590618336887, + "grad_norm": 0.13745662304832107, + "learning_rate": 2.5858639097328314e-05, + "loss": 0.351, + "step": 3065 + }, + { + "epoch": 3.2686567164179103, + "grad_norm": 0.1941806894488011, + "learning_rate": 2.583079450840453e-05, + "loss": 0.3537, + "step": 3066 + }, + { + "epoch": 3.269722814498934, + "grad_norm": 0.21824169614328015, + "learning_rate": 2.5802957768484173e-05, + "loss": 0.3461, + "step": 3067 + }, + { + "epoch": 3.2707889125799574, + "grad_norm": 0.16071832337632452, + "learning_rate": 2.5775128892987368e-05, + "loss": 0.3505, + "step": 3068 + }, + { + "epoch": 3.271855010660981, + "grad_norm": 0.163987167314734, + "learning_rate": 2.574730789732989e-05, + "loss": 0.345, + "step": 3069 + }, + { + "epoch": 3.272921108742004, + "grad_norm": 0.19905464540458992, + "learning_rate": 2.5719494796923085e-05, + "loss": 0.3574, + "step": 3070 + }, + { + "epoch": 3.2739872068230276, + "grad_norm": 0.16383647964999232, + "learning_rate": 2.569168960717398e-05, + "loss": 0.3495, + "step": 3071 + }, + { + "epoch": 3.275053304904051, + "grad_norm": 0.17693978324786094, + "learning_rate": 2.5663892343485214e-05, + "loss": 0.3499, + "step": 3072 + }, + { + "epoch": 3.2761194029850746, + "grad_norm": 0.18500426824572122, + "learning_rate": 2.5636103021255026e-05, + "loss": 0.3549, + "step": 3073 + }, + { + "epoch": 3.277185501066098, + "grad_norm": 0.19504056601794661, + "learning_rate": 2.5608321655877243e-05, + "loss": 0.3525, + "step": 3074 + }, + { + "epoch": 3.2782515991471217, + "grad_norm": 0.16037712681498448, + "learning_rate": 2.5580548262741304e-05, + "loss": 0.3474, + "step": 3075 + }, + { + "epoch": 3.279317697228145, + "grad_norm": 0.17838055168396916, + "learning_rate": 2.5552782857232238e-05, + "loss": 0.3519, + "step": 3076 + }, + { + "epoch": 3.2803837953091683, + "grad_norm": 0.18837820237068092, + "learning_rate": 2.5525025454730612e-05, + "loss": 0.3497, + "step": 3077 + }, + { + "epoch": 3.281449893390192, + "grad_norm": 0.1403381013776277, + "learning_rate": 2.54972760706126e-05, + "loss": 0.3505, + "step": 3078 + }, + { + "epoch": 3.2825159914712154, + "grad_norm": 0.234296775090172, + "learning_rate": 2.546953472024991e-05, + "loss": 0.3502, + "step": 3079 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.17074806676345755, + "learning_rate": 2.5441801419009835e-05, + "loss": 0.3449, + "step": 3080 + }, + { + "epoch": 3.2846481876332625, + "grad_norm": 0.17367375722672437, + "learning_rate": 2.541407618225515e-05, + "loss": 0.3503, + "step": 3081 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 0.22550229833239124, + "learning_rate": 2.53863590253442e-05, + "loss": 0.3477, + "step": 3082 + }, + { + "epoch": 3.286780383795309, + "grad_norm": 0.13550221130573722, + "learning_rate": 2.5358649963630867e-05, + "loss": 0.3449, + "step": 3083 + }, + { + "epoch": 3.2878464818763327, + "grad_norm": 0.2501969239886366, + "learning_rate": 2.533094901246452e-05, + "loss": 0.3528, + "step": 3084 + }, + { + "epoch": 3.288912579957356, + "grad_norm": 0.20047206624632516, + "learning_rate": 2.5303256187190038e-05, + "loss": 0.3466, + "step": 3085 + }, + { + "epoch": 3.2899786780383797, + "grad_norm": 0.14385706915661878, + "learning_rate": 2.527557150314783e-05, + "loss": 0.348, + "step": 3086 + }, + { + "epoch": 3.291044776119403, + "grad_norm": 0.23455244794599786, + "learning_rate": 2.524789497567375e-05, + "loss": 0.3575, + "step": 3087 + }, + { + "epoch": 3.2921108742004264, + "grad_norm": 0.19720110623267217, + "learning_rate": 2.522022662009916e-05, + "loss": 0.355, + "step": 3088 + }, + { + "epoch": 3.29317697228145, + "grad_norm": 0.16009116202356855, + "learning_rate": 2.5192566451750904e-05, + "loss": 0.3483, + "step": 3089 + }, + { + "epoch": 3.2942430703624734, + "grad_norm": 0.19300246066961, + "learning_rate": 2.516491448595126e-05, + "loss": 0.3537, + "step": 3090 + }, + { + "epoch": 3.295309168443497, + "grad_norm": 0.17653040799060407, + "learning_rate": 2.5137270738018e-05, + "loss": 0.3463, + "step": 3091 + }, + { + "epoch": 3.29637526652452, + "grad_norm": 0.19537049403171847, + "learning_rate": 2.5109635223264305e-05, + "loss": 0.3424, + "step": 3092 + }, + { + "epoch": 3.2974413646055436, + "grad_norm": 0.18015777011327852, + "learning_rate": 2.5082007956998817e-05, + "loss": 0.3476, + "step": 3093 + }, + { + "epoch": 3.298507462686567, + "grad_norm": 0.20459512311591024, + "learning_rate": 2.505438895452562e-05, + "loss": 0.351, + "step": 3094 + }, + { + "epoch": 3.2995735607675907, + "grad_norm": 0.12366732649317048, + "learning_rate": 2.5026778231144194e-05, + "loss": 0.354, + "step": 3095 + }, + { + "epoch": 3.300639658848614, + "grad_norm": 0.18890110077159672, + "learning_rate": 2.4999175802149438e-05, + "loss": 0.3523, + "step": 3096 + }, + { + "epoch": 3.3017057569296373, + "grad_norm": 0.1626946147400098, + "learning_rate": 2.4971581682831668e-05, + "loss": 0.3527, + "step": 3097 + }, + { + "epoch": 3.302771855010661, + "grad_norm": 0.14801464468188533, + "learning_rate": 2.494399588847662e-05, + "loss": 0.3533, + "step": 3098 + }, + { + "epoch": 3.3038379530916844, + "grad_norm": 0.1593500938211409, + "learning_rate": 2.4916418434365346e-05, + "loss": 0.3528, + "step": 3099 + }, + { + "epoch": 3.304904051172708, + "grad_norm": 0.19644211588346217, + "learning_rate": 2.488884933577434e-05, + "loss": 0.3454, + "step": 3100 + }, + { + "epoch": 3.3059701492537314, + "grad_norm": 0.1353438588547366, + "learning_rate": 2.4861288607975458e-05, + "loss": 0.3516, + "step": 3101 + }, + { + "epoch": 3.307036247334755, + "grad_norm": 0.17358657320991785, + "learning_rate": 2.4833736266235917e-05, + "loss": 0.3499, + "step": 3102 + }, + { + "epoch": 3.308102345415778, + "grad_norm": 0.15831348237018122, + "learning_rate": 2.4806192325818258e-05, + "loss": 0.3531, + "step": 3103 + }, + { + "epoch": 3.3091684434968016, + "grad_norm": 0.13584971050526615, + "learning_rate": 2.47786568019804e-05, + "loss": 0.3593, + "step": 3104 + }, + { + "epoch": 3.310234541577825, + "grad_norm": 0.15418755306420462, + "learning_rate": 2.475112970997562e-05, + "loss": 0.341, + "step": 3105 + }, + { + "epoch": 3.3113006396588487, + "grad_norm": 0.1615129100696353, + "learning_rate": 2.472361106505245e-05, + "loss": 0.3514, + "step": 3106 + }, + { + "epoch": 3.3123667377398722, + "grad_norm": 0.15930398883446692, + "learning_rate": 2.4696100882454817e-05, + "loss": 0.351, + "step": 3107 + }, + { + "epoch": 3.3134328358208958, + "grad_norm": 0.1403230514001666, + "learning_rate": 2.466859917742193e-05, + "loss": 0.3505, + "step": 3108 + }, + { + "epoch": 3.314498933901919, + "grad_norm": 0.17920374641541106, + "learning_rate": 2.464110596518831e-05, + "loss": 0.3449, + "step": 3109 + }, + { + "epoch": 3.3155650319829424, + "grad_norm": 0.1254186924644907, + "learning_rate": 2.4613621260983755e-05, + "loss": 0.3461, + "step": 3110 + }, + { + "epoch": 3.316631130063966, + "grad_norm": 0.13780163296736875, + "learning_rate": 2.458614508003336e-05, + "loss": 0.3511, + "step": 3111 + }, + { + "epoch": 3.3176972281449895, + "grad_norm": 0.143814949009123, + "learning_rate": 2.455867743755751e-05, + "loss": 0.3505, + "step": 3112 + }, + { + "epoch": 3.318763326226013, + "grad_norm": 0.12209116221359133, + "learning_rate": 2.4531218348771866e-05, + "loss": 0.3443, + "step": 3113 + }, + { + "epoch": 3.319829424307036, + "grad_norm": 0.14799939510773816, + "learning_rate": 2.450376782888731e-05, + "loss": 0.3471, + "step": 3114 + }, + { + "epoch": 3.3208955223880596, + "grad_norm": 0.14123259883639513, + "learning_rate": 2.4476325893110008e-05, + "loss": 0.3502, + "step": 3115 + }, + { + "epoch": 3.321961620469083, + "grad_norm": 0.14375390587624387, + "learning_rate": 2.4448892556641393e-05, + "loss": 0.3463, + "step": 3116 + }, + { + "epoch": 3.3230277185501067, + "grad_norm": 0.17421629802465335, + "learning_rate": 2.4421467834678067e-05, + "loss": 0.3475, + "step": 3117 + }, + { + "epoch": 3.3240938166311302, + "grad_norm": 0.14674808581087775, + "learning_rate": 2.439405174241192e-05, + "loss": 0.3451, + "step": 3118 + }, + { + "epoch": 3.3251599147121533, + "grad_norm": 0.21187149354636753, + "learning_rate": 2.4366644295030054e-05, + "loss": 0.3483, + "step": 3119 + }, + { + "epoch": 3.326226012793177, + "grad_norm": 0.2012599089170034, + "learning_rate": 2.433924550771476e-05, + "loss": 0.3483, + "step": 3120 + }, + { + "epoch": 3.3272921108742004, + "grad_norm": 0.16780980496521763, + "learning_rate": 2.4311855395643527e-05, + "loss": 0.3462, + "step": 3121 + }, + { + "epoch": 3.328358208955224, + "grad_norm": 0.16005417142229794, + "learning_rate": 2.428447397398908e-05, + "loss": 0.3533, + "step": 3122 + }, + { + "epoch": 3.3294243070362475, + "grad_norm": 0.15099935350620822, + "learning_rate": 2.425710125791929e-05, + "loss": 0.35, + "step": 3123 + }, + { + "epoch": 3.3304904051172706, + "grad_norm": 0.15768848577018113, + "learning_rate": 2.4229737262597216e-05, + "loss": 0.3498, + "step": 3124 + }, + { + "epoch": 3.331556503198294, + "grad_norm": 0.155851400636398, + "learning_rate": 2.4202382003181098e-05, + "loss": 0.3557, + "step": 3125 + }, + { + "epoch": 3.3326226012793176, + "grad_norm": 0.14267053890933395, + "learning_rate": 2.4175035494824316e-05, + "loss": 0.3486, + "step": 3126 + }, + { + "epoch": 3.333688699360341, + "grad_norm": 0.15843384932692273, + "learning_rate": 2.4147697752675433e-05, + "loss": 0.3515, + "step": 3127 + }, + { + "epoch": 3.3347547974413647, + "grad_norm": 0.17371867125289828, + "learning_rate": 2.4120368791878122e-05, + "loss": 0.3491, + "step": 3128 + }, + { + "epoch": 3.3358208955223883, + "grad_norm": 0.16084952359302623, + "learning_rate": 2.4093048627571206e-05, + "loss": 0.3529, + "step": 3129 + }, + { + "epoch": 3.3368869936034113, + "grad_norm": 0.14408159186617614, + "learning_rate": 2.4065737274888646e-05, + "loss": 0.3525, + "step": 3130 + }, + { + "epoch": 3.337953091684435, + "grad_norm": 0.1656284179555205, + "learning_rate": 2.403843474895952e-05, + "loss": 0.3486, + "step": 3131 + }, + { + "epoch": 3.3390191897654584, + "grad_norm": 0.14146974935511217, + "learning_rate": 2.401114106490798e-05, + "loss": 0.3482, + "step": 3132 + }, + { + "epoch": 3.340085287846482, + "grad_norm": 0.1573564750602216, + "learning_rate": 2.3983856237853322e-05, + "loss": 0.3543, + "step": 3133 + }, + { + "epoch": 3.3411513859275055, + "grad_norm": 0.1757062911156589, + "learning_rate": 2.395658028290995e-05, + "loss": 0.3538, + "step": 3134 + }, + { + "epoch": 3.342217484008529, + "grad_norm": 0.13319696217453428, + "learning_rate": 2.3929313215187274e-05, + "loss": 0.3488, + "step": 3135 + }, + { + "epoch": 3.343283582089552, + "grad_norm": 0.15216598233478518, + "learning_rate": 2.390205504978986e-05, + "loss": 0.3574, + "step": 3136 + }, + { + "epoch": 3.3443496801705757, + "grad_norm": 0.14909080713095296, + "learning_rate": 2.3874805801817313e-05, + "loss": 0.3488, + "step": 3137 + }, + { + "epoch": 3.345415778251599, + "grad_norm": 0.12886400979675366, + "learning_rate": 2.384756548636432e-05, + "loss": 0.3474, + "step": 3138 + }, + { + "epoch": 3.3464818763326227, + "grad_norm": 0.17178937827304538, + "learning_rate": 2.3820334118520566e-05, + "loss": 0.3487, + "step": 3139 + }, + { + "epoch": 3.3475479744136463, + "grad_norm": 0.14792535442811752, + "learning_rate": 2.3793111713370824e-05, + "loss": 0.3493, + "step": 3140 + }, + { + "epoch": 3.3486140724946694, + "grad_norm": 0.13063840167660581, + "learning_rate": 2.3765898285994898e-05, + "loss": 0.3436, + "step": 3141 + }, + { + "epoch": 3.349680170575693, + "grad_norm": 0.1415902890105865, + "learning_rate": 2.3738693851467627e-05, + "loss": 0.3449, + "step": 3142 + }, + { + "epoch": 3.3507462686567164, + "grad_norm": 0.14468858858404718, + "learning_rate": 2.371149842485882e-05, + "loss": 0.3515, + "step": 3143 + }, + { + "epoch": 3.35181236673774, + "grad_norm": 0.12569229542636323, + "learning_rate": 2.3684312021233353e-05, + "loss": 0.3531, + "step": 3144 + }, + { + "epoch": 3.3528784648187635, + "grad_norm": 0.13410217015876344, + "learning_rate": 2.3657134655651085e-05, + "loss": 0.3493, + "step": 3145 + }, + { + "epoch": 3.3539445628997866, + "grad_norm": 0.12050748503992254, + "learning_rate": 2.3629966343166836e-05, + "loss": 0.3475, + "step": 3146 + }, + { + "epoch": 3.35501066098081, + "grad_norm": 0.13625653337833818, + "learning_rate": 2.3602807098830462e-05, + "loss": 0.3527, + "step": 3147 + }, + { + "epoch": 3.3560767590618337, + "grad_norm": 0.1270892484979118, + "learning_rate": 2.3575656937686765e-05, + "loss": 0.3451, + "step": 3148 + }, + { + "epoch": 3.357142857142857, + "grad_norm": 0.1292034055414621, + "learning_rate": 2.3548515874775547e-05, + "loss": 0.3468, + "step": 3149 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.13474465507291278, + "learning_rate": 2.3521383925131508e-05, + "loss": 0.3475, + "step": 3150 + }, + { + "epoch": 3.359275053304904, + "grad_norm": 0.14535330585186346, + "learning_rate": 2.349426110378435e-05, + "loss": 0.349, + "step": 3151 + }, + { + "epoch": 3.3603411513859274, + "grad_norm": 0.14304977788811093, + "learning_rate": 2.3467147425758735e-05, + "loss": 0.3497, + "step": 3152 + }, + { + "epoch": 3.361407249466951, + "grad_norm": 0.13088921289643768, + "learning_rate": 2.3440042906074187e-05, + "loss": 0.3476, + "step": 3153 + }, + { + "epoch": 3.3624733475479744, + "grad_norm": 0.13796584925331404, + "learning_rate": 2.3412947559745226e-05, + "loss": 0.3463, + "step": 3154 + }, + { + "epoch": 3.363539445628998, + "grad_norm": 0.1388788022895029, + "learning_rate": 2.338586140178127e-05, + "loss": 0.3485, + "step": 3155 + }, + { + "epoch": 3.364605543710021, + "grad_norm": 0.12618907252498665, + "learning_rate": 2.335878444718663e-05, + "loss": 0.3547, + "step": 3156 + }, + { + "epoch": 3.3656716417910446, + "grad_norm": 0.14196252167717818, + "learning_rate": 2.3331716710960536e-05, + "loss": 0.3495, + "step": 3157 + }, + { + "epoch": 3.366737739872068, + "grad_norm": 0.13807258992426674, + "learning_rate": 2.3304658208097105e-05, + "loss": 0.3498, + "step": 3158 + }, + { + "epoch": 3.3678038379530917, + "grad_norm": 0.12981419614673304, + "learning_rate": 2.3277608953585346e-05, + "loss": 0.3441, + "step": 3159 + }, + { + "epoch": 3.368869936034115, + "grad_norm": 0.16657533516126402, + "learning_rate": 2.3250568962409155e-05, + "loss": 0.353, + "step": 3160 + }, + { + "epoch": 3.3699360341151388, + "grad_norm": 0.12148830285530396, + "learning_rate": 2.322353824954725e-05, + "loss": 0.3523, + "step": 3161 + }, + { + "epoch": 3.3710021321961623, + "grad_norm": 0.1706380629428814, + "learning_rate": 2.319651682997325e-05, + "loss": 0.3487, + "step": 3162 + }, + { + "epoch": 3.3720682302771854, + "grad_norm": 0.14429216152158644, + "learning_rate": 2.316950471865564e-05, + "loss": 0.3511, + "step": 3163 + }, + { + "epoch": 3.373134328358209, + "grad_norm": 0.13448516843067299, + "learning_rate": 2.31425019305577e-05, + "loss": 0.3542, + "step": 3164 + }, + { + "epoch": 3.3742004264392325, + "grad_norm": 0.16508680435233514, + "learning_rate": 2.3115508480637575e-05, + "loss": 0.3477, + "step": 3165 + }, + { + "epoch": 3.375266524520256, + "grad_norm": 0.1326808111603962, + "learning_rate": 2.308852438384824e-05, + "loss": 0.349, + "step": 3166 + }, + { + "epoch": 3.3763326226012795, + "grad_norm": 0.14047757472242012, + "learning_rate": 2.3061549655137498e-05, + "loss": 0.3513, + "step": 3167 + }, + { + "epoch": 3.3773987206823026, + "grad_norm": 0.1441083727611898, + "learning_rate": 2.3034584309447913e-05, + "loss": 0.3479, + "step": 3168 + }, + { + "epoch": 3.378464818763326, + "grad_norm": 0.13591931890052086, + "learning_rate": 2.3007628361716902e-05, + "loss": 0.3515, + "step": 3169 + }, + { + "epoch": 3.3795309168443497, + "grad_norm": 0.1255576121479874, + "learning_rate": 2.298068182687666e-05, + "loss": 0.3519, + "step": 3170 + }, + { + "epoch": 3.3805970149253732, + "grad_norm": 0.14967290050741613, + "learning_rate": 2.295374471985418e-05, + "loss": 0.3482, + "step": 3171 + }, + { + "epoch": 3.3816631130063968, + "grad_norm": 0.12421430145394535, + "learning_rate": 2.2926817055571194e-05, + "loss": 0.3526, + "step": 3172 + }, + { + "epoch": 3.38272921108742, + "grad_norm": 0.17262897437577504, + "learning_rate": 2.289989884894425e-05, + "loss": 0.3442, + "step": 3173 + }, + { + "epoch": 3.3837953091684434, + "grad_norm": 0.14546213806847885, + "learning_rate": 2.287299011488461e-05, + "loss": 0.3531, + "step": 3174 + }, + { + "epoch": 3.384861407249467, + "grad_norm": 0.1574355823375362, + "learning_rate": 2.2846090868298333e-05, + "loss": 0.3474, + "step": 3175 + }, + { + "epoch": 3.3859275053304905, + "grad_norm": 0.15336705073042523, + "learning_rate": 2.2819201124086216e-05, + "loss": 0.3503, + "step": 3176 + }, + { + "epoch": 3.386993603411514, + "grad_norm": 0.13640452357535515, + "learning_rate": 2.279232089714374e-05, + "loss": 0.3505, + "step": 3177 + }, + { + "epoch": 3.388059701492537, + "grad_norm": 0.15004013469173, + "learning_rate": 2.2765450202361186e-05, + "loss": 0.3519, + "step": 3178 + }, + { + "epoch": 3.3891257995735606, + "grad_norm": 0.14151494212433288, + "learning_rate": 2.273858905462353e-05, + "loss": 0.3518, + "step": 3179 + }, + { + "epoch": 3.390191897654584, + "grad_norm": 0.1345161110814909, + "learning_rate": 2.2711737468810418e-05, + "loss": 0.3481, + "step": 3180 + }, + { + "epoch": 3.3912579957356077, + "grad_norm": 0.12660333007474672, + "learning_rate": 2.268489545979625e-05, + "loss": 0.3484, + "step": 3181 + }, + { + "epoch": 3.3923240938166312, + "grad_norm": 0.13060878781996574, + "learning_rate": 2.265806304245012e-05, + "loss": 0.349, + "step": 3182 + }, + { + "epoch": 3.3933901918976543, + "grad_norm": 0.13348764262187832, + "learning_rate": 2.263124023163576e-05, + "loss": 0.3471, + "step": 3183 + }, + { + "epoch": 3.394456289978678, + "grad_norm": 0.12898484952700903, + "learning_rate": 2.2604427042211633e-05, + "loss": 0.3521, + "step": 3184 + }, + { + "epoch": 3.3955223880597014, + "grad_norm": 0.14350540973892006, + "learning_rate": 2.2577623489030865e-05, + "loss": 0.3485, + "step": 3185 + }, + { + "epoch": 3.396588486140725, + "grad_norm": 0.11886459992083558, + "learning_rate": 2.25508295869412e-05, + "loss": 0.3518, + "step": 3186 + }, + { + "epoch": 3.3976545842217485, + "grad_norm": 0.12711816829481948, + "learning_rate": 2.2524045350785088e-05, + "loss": 0.3508, + "step": 3187 + }, + { + "epoch": 3.398720682302772, + "grad_norm": 0.12078532866593578, + "learning_rate": 2.2497270795399598e-05, + "loss": 0.3519, + "step": 3188 + }, + { + "epoch": 3.399786780383795, + "grad_norm": 0.14360930786740936, + "learning_rate": 2.2470505935616457e-05, + "loss": 0.3496, + "step": 3189 + }, + { + "epoch": 3.4008528784648187, + "grad_norm": 0.12963425574071996, + "learning_rate": 2.2443750786261976e-05, + "loss": 0.3553, + "step": 3190 + }, + { + "epoch": 3.401918976545842, + "grad_norm": 0.13049268581835108, + "learning_rate": 2.2417005362157135e-05, + "loss": 0.3517, + "step": 3191 + }, + { + "epoch": 3.4029850746268657, + "grad_norm": 0.15413582639564263, + "learning_rate": 2.2390269678117525e-05, + "loss": 0.3517, + "step": 3192 + }, + { + "epoch": 3.4040511727078893, + "grad_norm": 0.12663844292929444, + "learning_rate": 2.2363543748953296e-05, + "loss": 0.3466, + "step": 3193 + }, + { + "epoch": 3.405117270788913, + "grad_norm": 0.13821512264352942, + "learning_rate": 2.2336827589469232e-05, + "loss": 0.3523, + "step": 3194 + }, + { + "epoch": 3.406183368869936, + "grad_norm": 0.11822096798559724, + "learning_rate": 2.2310121214464706e-05, + "loss": 0.3439, + "step": 3195 + }, + { + "epoch": 3.4072494669509594, + "grad_norm": 0.12664000688547125, + "learning_rate": 2.228342463873367e-05, + "loss": 0.3566, + "step": 3196 + }, + { + "epoch": 3.408315565031983, + "grad_norm": 0.13892075241413948, + "learning_rate": 2.2256737877064607e-05, + "loss": 0.3459, + "step": 3197 + }, + { + "epoch": 3.4093816631130065, + "grad_norm": 0.12083777967142596, + "learning_rate": 2.2230060944240623e-05, + "loss": 0.3519, + "step": 3198 + }, + { + "epoch": 3.41044776119403, + "grad_norm": 0.14125853339995179, + "learning_rate": 2.220339385503934e-05, + "loss": 0.351, + "step": 3199 + }, + { + "epoch": 3.411513859275053, + "grad_norm": 0.13762034609216564, + "learning_rate": 2.2176736624232964e-05, + "loss": 0.3427, + "step": 3200 + }, + { + "epoch": 3.4125799573560767, + "grad_norm": 0.17537017660770718, + "learning_rate": 2.2150089266588173e-05, + "loss": 0.356, + "step": 3201 + }, + { + "epoch": 3.4136460554371, + "grad_norm": 0.13074766627409623, + "learning_rate": 2.2123451796866247e-05, + "loss": 0.3531, + "step": 3202 + }, + { + "epoch": 3.4147121535181237, + "grad_norm": 0.15980968926250733, + "learning_rate": 2.2096824229822973e-05, + "loss": 0.348, + "step": 3203 + }, + { + "epoch": 3.4157782515991473, + "grad_norm": 0.1180753762858572, + "learning_rate": 2.2070206580208598e-05, + "loss": 0.3505, + "step": 3204 + }, + { + "epoch": 3.4168443496801704, + "grad_norm": 0.14397710716700302, + "learning_rate": 2.2043598862767937e-05, + "loss": 0.3539, + "step": 3205 + }, + { + "epoch": 3.417910447761194, + "grad_norm": 0.15418073714942476, + "learning_rate": 2.2017001092240288e-05, + "loss": 0.349, + "step": 3206 + }, + { + "epoch": 3.4189765458422174, + "grad_norm": 0.15081085999678343, + "learning_rate": 2.1990413283359447e-05, + "loss": 0.3483, + "step": 3207 + }, + { + "epoch": 3.420042643923241, + "grad_norm": 0.11082213481326116, + "learning_rate": 2.1963835450853646e-05, + "loss": 0.3508, + "step": 3208 + }, + { + "epoch": 3.4211087420042645, + "grad_norm": 0.15412690956286193, + "learning_rate": 2.1937267609445634e-05, + "loss": 0.3537, + "step": 3209 + }, + { + "epoch": 3.4221748400852876, + "grad_norm": 0.13940387381283997, + "learning_rate": 2.191070977385264e-05, + "loss": 0.3509, + "step": 3210 + }, + { + "epoch": 3.423240938166311, + "grad_norm": 0.12890974324334237, + "learning_rate": 2.1884161958786283e-05, + "loss": 0.3516, + "step": 3211 + }, + { + "epoch": 3.4243070362473347, + "grad_norm": 0.14545024164766443, + "learning_rate": 2.1857624178952693e-05, + "loss": 0.3447, + "step": 3212 + }, + { + "epoch": 3.425373134328358, + "grad_norm": 0.1330648345375272, + "learning_rate": 2.1831096449052424e-05, + "loss": 0.3448, + "step": 3213 + }, + { + "epoch": 3.4264392324093818, + "grad_norm": 0.12671343057546725, + "learning_rate": 2.1804578783780465e-05, + "loss": 0.3516, + "step": 3214 + }, + { + "epoch": 3.4275053304904053, + "grad_norm": 0.14329897961042323, + "learning_rate": 2.177807119782621e-05, + "loss": 0.3501, + "step": 3215 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.1337530920580984, + "learning_rate": 2.175157370587348e-05, + "loss": 0.3466, + "step": 3216 + }, + { + "epoch": 3.429637526652452, + "grad_norm": 0.16912612472386718, + "learning_rate": 2.1725086322600526e-05, + "loss": 0.3507, + "step": 3217 + }, + { + "epoch": 3.4307036247334755, + "grad_norm": 0.14266480292217754, + "learning_rate": 2.1698609062679985e-05, + "loss": 0.3545, + "step": 3218 + }, + { + "epoch": 3.431769722814499, + "grad_norm": 0.14472729674524853, + "learning_rate": 2.167214194077886e-05, + "loss": 0.3496, + "step": 3219 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.16298798619576543, + "learning_rate": 2.1645684971558572e-05, + "loss": 0.3474, + "step": 3220 + }, + { + "epoch": 3.433901918976546, + "grad_norm": 0.1468219073239334, + "learning_rate": 2.1619238169674918e-05, + "loss": 0.353, + "step": 3221 + }, + { + "epoch": 3.434968017057569, + "grad_norm": 0.19187830974162423, + "learning_rate": 2.1592801549778034e-05, + "loss": 0.3518, + "step": 3222 + }, + { + "epoch": 3.4360341151385927, + "grad_norm": 0.17741138849918361, + "learning_rate": 2.1566375126512437e-05, + "loss": 0.3464, + "step": 3223 + }, + { + "epoch": 3.4371002132196162, + "grad_norm": 0.1603692083176825, + "learning_rate": 2.1539958914517e-05, + "loss": 0.3476, + "step": 3224 + }, + { + "epoch": 3.4381663113006398, + "grad_norm": 0.1663268863315761, + "learning_rate": 2.151355292842494e-05, + "loss": 0.3496, + "step": 3225 + }, + { + "epoch": 3.4392324093816633, + "grad_norm": 0.1767760891428736, + "learning_rate": 2.1487157182863773e-05, + "loss": 0.3465, + "step": 3226 + }, + { + "epoch": 3.4402985074626864, + "grad_norm": 0.14846390418886782, + "learning_rate": 2.1460771692455388e-05, + "loss": 0.3572, + "step": 3227 + }, + { + "epoch": 3.44136460554371, + "grad_norm": 0.1805328018495273, + "learning_rate": 2.143439647181597e-05, + "loss": 0.3521, + "step": 3228 + }, + { + "epoch": 3.4424307036247335, + "grad_norm": 0.15018285087651392, + "learning_rate": 2.140803153555604e-05, + "loss": 0.356, + "step": 3229 + }, + { + "epoch": 3.443496801705757, + "grad_norm": 0.16606433943154644, + "learning_rate": 2.1381676898280372e-05, + "loss": 0.3473, + "step": 3230 + }, + { + "epoch": 3.4445628997867805, + "grad_norm": 0.16838391423331878, + "learning_rate": 2.135533257458808e-05, + "loss": 0.3482, + "step": 3231 + }, + { + "epoch": 3.4456289978678036, + "grad_norm": 0.1327187010033925, + "learning_rate": 2.1328998579072566e-05, + "loss": 0.3562, + "step": 3232 + }, + { + "epoch": 3.446695095948827, + "grad_norm": 0.15065705283311742, + "learning_rate": 2.130267492632146e-05, + "loss": 0.3471, + "step": 3233 + }, + { + "epoch": 3.4477611940298507, + "grad_norm": 0.11852053817686456, + "learning_rate": 2.1276361630916718e-05, + "loss": 0.3542, + "step": 3234 + }, + { + "epoch": 3.4488272921108742, + "grad_norm": 0.13152597541802122, + "learning_rate": 2.125005870743453e-05, + "loss": 0.3539, + "step": 3235 + }, + { + "epoch": 3.449893390191898, + "grad_norm": 0.14103478135014386, + "learning_rate": 2.1223766170445383e-05, + "loss": 0.3508, + "step": 3236 + }, + { + "epoch": 3.450959488272921, + "grad_norm": 0.11147740547754476, + "learning_rate": 2.1197484034513927e-05, + "loss": 0.3453, + "step": 3237 + }, + { + "epoch": 3.4520255863539444, + "grad_norm": 0.12515850742634446, + "learning_rate": 2.1171212314199117e-05, + "loss": 0.3512, + "step": 3238 + }, + { + "epoch": 3.453091684434968, + "grad_norm": 0.12854446210826828, + "learning_rate": 2.1144951024054144e-05, + "loss": 0.3471, + "step": 3239 + }, + { + "epoch": 3.4541577825159915, + "grad_norm": 0.12150324184398577, + "learning_rate": 2.111870017862636e-05, + "loss": 0.3472, + "step": 3240 + }, + { + "epoch": 3.455223880597015, + "grad_norm": 0.1402826179243876, + "learning_rate": 2.1092459792457384e-05, + "loss": 0.355, + "step": 3241 + }, + { + "epoch": 3.4562899786780386, + "grad_norm": 0.12712753288646708, + "learning_rate": 2.1066229880083035e-05, + "loss": 0.35, + "step": 3242 + }, + { + "epoch": 3.4573560767590616, + "grad_norm": 0.13839576676511672, + "learning_rate": 2.104001045603333e-05, + "loss": 0.3527, + "step": 3243 + }, + { + "epoch": 3.458422174840085, + "grad_norm": 0.13644673504349322, + "learning_rate": 2.1013801534832434e-05, + "loss": 0.3509, + "step": 3244 + }, + { + "epoch": 3.4594882729211087, + "grad_norm": 0.14306136604732483, + "learning_rate": 2.0987603130998745e-05, + "loss": 0.3507, + "step": 3245 + }, + { + "epoch": 3.4605543710021323, + "grad_norm": 0.15950680626735908, + "learning_rate": 2.096141525904484e-05, + "loss": 0.3485, + "step": 3246 + }, + { + "epoch": 3.461620469083156, + "grad_norm": 0.11275460669446143, + "learning_rate": 2.09352379334774e-05, + "loss": 0.3529, + "step": 3247 + }, + { + "epoch": 3.4626865671641793, + "grad_norm": 0.1745871935155396, + "learning_rate": 2.0909071168797332e-05, + "loss": 0.3509, + "step": 3248 + }, + { + "epoch": 3.4637526652452024, + "grad_norm": 0.12473410126901693, + "learning_rate": 2.0882914979499635e-05, + "loss": 0.353, + "step": 3249 + }, + { + "epoch": 3.464818763326226, + "grad_norm": 0.15474894939169337, + "learning_rate": 2.0856769380073497e-05, + "loss": 0.3544, + "step": 3250 + }, + { + "epoch": 3.4658848614072495, + "grad_norm": 0.14452008614896242, + "learning_rate": 2.0830634385002234e-05, + "loss": 0.3528, + "step": 3251 + }, + { + "epoch": 3.466950959488273, + "grad_norm": 0.1418673442984818, + "learning_rate": 2.080451000876325e-05, + "loss": 0.3476, + "step": 3252 + }, + { + "epoch": 3.4680170575692966, + "grad_norm": 0.16943424994947526, + "learning_rate": 2.0778396265828097e-05, + "loss": 0.356, + "step": 3253 + }, + { + "epoch": 3.4690831556503197, + "grad_norm": 0.14164905105719902, + "learning_rate": 2.0752293170662453e-05, + "loss": 0.3492, + "step": 3254 + }, + { + "epoch": 3.470149253731343, + "grad_norm": 0.17297216086140343, + "learning_rate": 2.0726200737726044e-05, + "loss": 0.3456, + "step": 3255 + }, + { + "epoch": 3.4712153518123667, + "grad_norm": 0.13358545880627057, + "learning_rate": 2.0700118981472737e-05, + "loss": 0.35, + "step": 3256 + }, + { + "epoch": 3.4722814498933903, + "grad_norm": 0.130949084809917, + "learning_rate": 2.0674047916350472e-05, + "loss": 0.3485, + "step": 3257 + }, + { + "epoch": 3.473347547974414, + "grad_norm": 0.15140183416957245, + "learning_rate": 2.0647987556801276e-05, + "loss": 0.3488, + "step": 3258 + }, + { + "epoch": 3.474413646055437, + "grad_norm": 0.13537402309349822, + "learning_rate": 2.0621937917261202e-05, + "loss": 0.3526, + "step": 3259 + }, + { + "epoch": 3.4754797441364604, + "grad_norm": 0.12927192577901372, + "learning_rate": 2.059589901216042e-05, + "loss": 0.3578, + "step": 3260 + }, + { + "epoch": 3.476545842217484, + "grad_norm": 0.13927688677691658, + "learning_rate": 2.0569870855923133e-05, + "loss": 0.3514, + "step": 3261 + }, + { + "epoch": 3.4776119402985075, + "grad_norm": 0.1446653742467127, + "learning_rate": 2.0543853462967568e-05, + "loss": 0.3476, + "step": 3262 + }, + { + "epoch": 3.478678038379531, + "grad_norm": 0.14655142804794563, + "learning_rate": 2.0517846847706018e-05, + "loss": 0.3521, + "step": 3263 + }, + { + "epoch": 3.479744136460554, + "grad_norm": 0.15775326994652664, + "learning_rate": 2.0491851024544798e-05, + "loss": 0.3515, + "step": 3264 + }, + { + "epoch": 3.4808102345415777, + "grad_norm": 0.14124595672889267, + "learning_rate": 2.0465866007884254e-05, + "loss": 0.3519, + "step": 3265 + }, + { + "epoch": 3.481876332622601, + "grad_norm": 0.14159196279556782, + "learning_rate": 2.0439891812118713e-05, + "loss": 0.3467, + "step": 3266 + }, + { + "epoch": 3.4829424307036247, + "grad_norm": 0.15538385639162236, + "learning_rate": 2.0413928451636532e-05, + "loss": 0.3478, + "step": 3267 + }, + { + "epoch": 3.4840085287846483, + "grad_norm": 0.12201107022654291, + "learning_rate": 2.038797594082009e-05, + "loss": 0.3468, + "step": 3268 + }, + { + "epoch": 3.485074626865672, + "grad_norm": 0.13632684112183946, + "learning_rate": 2.0362034294045694e-05, + "loss": 0.3461, + "step": 3269 + }, + { + "epoch": 3.486140724946695, + "grad_norm": 0.14100032656692765, + "learning_rate": 2.0336103525683685e-05, + "loss": 0.3519, + "step": 3270 + }, + { + "epoch": 3.4872068230277184, + "grad_norm": 0.13444787195465405, + "learning_rate": 2.0310183650098357e-05, + "loss": 0.3495, + "step": 3271 + }, + { + "epoch": 3.488272921108742, + "grad_norm": 0.12769235515457522, + "learning_rate": 2.0284274681647993e-05, + "loss": 0.3453, + "step": 3272 + }, + { + "epoch": 3.4893390191897655, + "grad_norm": 0.12220035736690575, + "learning_rate": 2.0258376634684786e-05, + "loss": 0.3496, + "step": 3273 + }, + { + "epoch": 3.490405117270789, + "grad_norm": 0.1568430272842312, + "learning_rate": 2.023248952355492e-05, + "loss": 0.354, + "step": 3274 + }, + { + "epoch": 3.4914712153518126, + "grad_norm": 0.11426477449378872, + "learning_rate": 2.0206613362598507e-05, + "loss": 0.3476, + "step": 3275 + }, + { + "epoch": 3.4925373134328357, + "grad_norm": 0.15450922714432697, + "learning_rate": 2.018074816614962e-05, + "loss": 0.351, + "step": 3276 + }, + { + "epoch": 3.4936034115138592, + "grad_norm": 0.13239244497688343, + "learning_rate": 2.0154893948536195e-05, + "loss": 0.3516, + "step": 3277 + }, + { + "epoch": 3.4946695095948828, + "grad_norm": 0.1591598013421958, + "learning_rate": 2.0129050724080138e-05, + "loss": 0.3499, + "step": 3278 + }, + { + "epoch": 3.4957356076759063, + "grad_norm": 0.15772096557717954, + "learning_rate": 2.0103218507097274e-05, + "loss": 0.3497, + "step": 3279 + }, + { + "epoch": 3.49680170575693, + "grad_norm": 0.12425309706820227, + "learning_rate": 2.0077397311897274e-05, + "loss": 0.3474, + "step": 3280 + }, + { + "epoch": 3.497867803837953, + "grad_norm": 0.1506011178175042, + "learning_rate": 2.005158715278376e-05, + "loss": 0.3472, + "step": 3281 + }, + { + "epoch": 3.4989339019189765, + "grad_norm": 0.11149966822962049, + "learning_rate": 2.0025788044054212e-05, + "loss": 0.3504, + "step": 3282 + }, + { + "epoch": 3.5, + "grad_norm": 0.15273982349883355, + "learning_rate": 2.0000000000000012e-05, + "loss": 0.3485, + "step": 3283 + }, + { + "epoch": 3.5010660980810235, + "grad_norm": 0.12108827495133537, + "learning_rate": 1.9974223034906362e-05, + "loss": 0.3521, + "step": 3284 + }, + { + "epoch": 3.502132196162047, + "grad_norm": 0.14237030818834184, + "learning_rate": 1.9948457163052385e-05, + "loss": 0.3519, + "step": 3285 + }, + { + "epoch": 3.50319829424307, + "grad_norm": 0.11471865230712094, + "learning_rate": 1.9922702398711026e-05, + "loss": 0.3536, + "step": 3286 + }, + { + "epoch": 3.5042643923240937, + "grad_norm": 0.13762079947986805, + "learning_rate": 1.989695875614911e-05, + "loss": 0.3496, + "step": 3287 + }, + { + "epoch": 3.5053304904051172, + "grad_norm": 0.12762020300370305, + "learning_rate": 1.987122624962724e-05, + "loss": 0.3483, + "step": 3288 + }, + { + "epoch": 3.5063965884861408, + "grad_norm": 0.13891339336483755, + "learning_rate": 1.9845504893399906e-05, + "loss": 0.3494, + "step": 3289 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.12926973335164987, + "learning_rate": 1.9819794701715412e-05, + "loss": 0.3489, + "step": 3290 + }, + { + "epoch": 3.5085287846481874, + "grad_norm": 0.11926976710182068, + "learning_rate": 1.9794095688815846e-05, + "loss": 0.3496, + "step": 3291 + }, + { + "epoch": 3.509594882729211, + "grad_norm": 0.1169531192619173, + "learning_rate": 1.9768407868937136e-05, + "loss": 0.3536, + "step": 3292 + }, + { + "epoch": 3.5106609808102345, + "grad_norm": 0.11709738090603788, + "learning_rate": 1.9742731256308997e-05, + "loss": 0.3505, + "step": 3293 + }, + { + "epoch": 3.511727078891258, + "grad_norm": 0.11720868076698791, + "learning_rate": 1.9717065865154962e-05, + "loss": 0.3555, + "step": 3294 + }, + { + "epoch": 3.5127931769722816, + "grad_norm": 0.11598807611455429, + "learning_rate": 1.969141170969228e-05, + "loss": 0.3492, + "step": 3295 + }, + { + "epoch": 3.5138592750533046, + "grad_norm": 0.11677301458081207, + "learning_rate": 1.9665768804132046e-05, + "loss": 0.3513, + "step": 3296 + }, + { + "epoch": 3.5149253731343286, + "grad_norm": 0.11769361312971446, + "learning_rate": 1.9640137162679108e-05, + "loss": 0.358, + "step": 3297 + }, + { + "epoch": 3.5159914712153517, + "grad_norm": 0.11945244360571523, + "learning_rate": 1.9614516799532035e-05, + "loss": 0.3508, + "step": 3298 + }, + { + "epoch": 3.5170575692963753, + "grad_norm": 0.10553148957707399, + "learning_rate": 1.9588907728883192e-05, + "loss": 0.3527, + "step": 3299 + }, + { + "epoch": 3.518123667377399, + "grad_norm": 0.12506068947753565, + "learning_rate": 1.9563309964918664e-05, + "loss": 0.3525, + "step": 3300 + }, + { + "epoch": 3.5191897654584223, + "grad_norm": 0.1274298809654222, + "learning_rate": 1.9537723521818307e-05, + "loss": 0.3586, + "step": 3301 + }, + { + "epoch": 3.520255863539446, + "grad_norm": 0.11963014257012117, + "learning_rate": 1.9512148413755653e-05, + "loss": 0.3554, + "step": 3302 + }, + { + "epoch": 3.521321961620469, + "grad_norm": 0.1426495501126215, + "learning_rate": 1.9486584654897987e-05, + "loss": 0.3544, + "step": 3303 + }, + { + "epoch": 3.5223880597014925, + "grad_norm": 0.11378394285714087, + "learning_rate": 1.9461032259406317e-05, + "loss": 0.357, + "step": 3304 + }, + { + "epoch": 3.523454157782516, + "grad_norm": 0.13151158677780098, + "learning_rate": 1.9435491241435343e-05, + "loss": 0.3504, + "step": 3305 + }, + { + "epoch": 3.5245202558635396, + "grad_norm": 0.11047177913070025, + "learning_rate": 1.9409961615133435e-05, + "loss": 0.3449, + "step": 3306 + }, + { + "epoch": 3.525586353944563, + "grad_norm": 0.14518194022909464, + "learning_rate": 1.9384443394642697e-05, + "loss": 0.3537, + "step": 3307 + }, + { + "epoch": 3.526652452025586, + "grad_norm": 0.13288265424403722, + "learning_rate": 1.9358936594098915e-05, + "loss": 0.3472, + "step": 3308 + }, + { + "epoch": 3.5277185501066097, + "grad_norm": 0.13481285136461793, + "learning_rate": 1.9333441227631494e-05, + "loss": 0.3522, + "step": 3309 + }, + { + "epoch": 3.5287846481876333, + "grad_norm": 0.12955597370536356, + "learning_rate": 1.9307957309363562e-05, + "loss": 0.3591, + "step": 3310 + }, + { + "epoch": 3.529850746268657, + "grad_norm": 0.13070491932515202, + "learning_rate": 1.928248485341188e-05, + "loss": 0.3523, + "step": 3311 + }, + { + "epoch": 3.5309168443496803, + "grad_norm": 0.12150544834528254, + "learning_rate": 1.9257023873886885e-05, + "loss": 0.3548, + "step": 3312 + }, + { + "epoch": 3.5319829424307034, + "grad_norm": 0.12389814641248816, + "learning_rate": 1.9231574384892608e-05, + "loss": 0.3491, + "step": 3313 + }, + { + "epoch": 3.533049040511727, + "grad_norm": 0.11812191547579137, + "learning_rate": 1.9206136400526753e-05, + "loss": 0.3552, + "step": 3314 + }, + { + "epoch": 3.5341151385927505, + "grad_norm": 0.11705609586266169, + "learning_rate": 1.9180709934880657e-05, + "loss": 0.3567, + "step": 3315 + }, + { + "epoch": 3.535181236673774, + "grad_norm": 0.12705382222301062, + "learning_rate": 1.915529500203923e-05, + "loss": 0.3522, + "step": 3316 + }, + { + "epoch": 3.5362473347547976, + "grad_norm": 0.1195951170844615, + "learning_rate": 1.9129891616081045e-05, + "loss": 0.3459, + "step": 3317 + }, + { + "epoch": 3.5373134328358207, + "grad_norm": 0.11156152984847112, + "learning_rate": 1.910449979107827e-05, + "loss": 0.3524, + "step": 3318 + }, + { + "epoch": 3.538379530916844, + "grad_norm": 0.12607688725487942, + "learning_rate": 1.907911954109662e-05, + "loss": 0.3505, + "step": 3319 + }, + { + "epoch": 3.5394456289978677, + "grad_norm": 0.12385238816738517, + "learning_rate": 1.9053750880195453e-05, + "loss": 0.353, + "step": 3320 + }, + { + "epoch": 3.5405117270788913, + "grad_norm": 0.11986136600785428, + "learning_rate": 1.9028393822427707e-05, + "loss": 0.3534, + "step": 3321 + }, + { + "epoch": 3.541577825159915, + "grad_norm": 0.1306944687194926, + "learning_rate": 1.900304838183984e-05, + "loss": 0.3451, + "step": 3322 + }, + { + "epoch": 3.542643923240938, + "grad_norm": 0.12521720177645257, + "learning_rate": 1.8977714572471942e-05, + "loss": 0.3463, + "step": 3323 + }, + { + "epoch": 3.543710021321962, + "grad_norm": 0.14676298985824088, + "learning_rate": 1.8952392408357596e-05, + "loss": 0.348, + "step": 3324 + }, + { + "epoch": 3.544776119402985, + "grad_norm": 0.1270324568921544, + "learning_rate": 1.892708190352398e-05, + "loss": 0.3469, + "step": 3325 + }, + { + "epoch": 3.5458422174840085, + "grad_norm": 0.13384984086554236, + "learning_rate": 1.890178307199181e-05, + "loss": 0.3586, + "step": 3326 + }, + { + "epoch": 3.546908315565032, + "grad_norm": 0.1403291759377033, + "learning_rate": 1.88764959277753e-05, + "loss": 0.3544, + "step": 3327 + }, + { + "epoch": 3.5479744136460556, + "grad_norm": 0.1578563606515396, + "learning_rate": 1.8851220484882223e-05, + "loss": 0.3484, + "step": 3328 + }, + { + "epoch": 3.549040511727079, + "grad_norm": 0.1332388726289066, + "learning_rate": 1.8825956757313864e-05, + "loss": 0.3546, + "step": 3329 + }, + { + "epoch": 3.550106609808102, + "grad_norm": 0.14974892308226678, + "learning_rate": 1.8800704759065027e-05, + "loss": 0.3465, + "step": 3330 + }, + { + "epoch": 3.5511727078891258, + "grad_norm": 0.13311408301746736, + "learning_rate": 1.877546450412398e-05, + "loss": 0.3425, + "step": 3331 + }, + { + "epoch": 3.5522388059701493, + "grad_norm": 0.1486101241297523, + "learning_rate": 1.8750236006472525e-05, + "loss": 0.3448, + "step": 3332 + }, + { + "epoch": 3.553304904051173, + "grad_norm": 0.126555084429222, + "learning_rate": 1.872501928008594e-05, + "loss": 0.3476, + "step": 3333 + }, + { + "epoch": 3.5543710021321964, + "grad_norm": 0.1288483891538286, + "learning_rate": 1.8699814338933e-05, + "loss": 0.3517, + "step": 3334 + }, + { + "epoch": 3.5554371002132195, + "grad_norm": 0.13278424016806864, + "learning_rate": 1.8674621196975892e-05, + "loss": 0.352, + "step": 3335 + }, + { + "epoch": 3.556503198294243, + "grad_norm": 0.13632340052187167, + "learning_rate": 1.864943986817033e-05, + "loss": 0.3486, + "step": 3336 + }, + { + "epoch": 3.5575692963752665, + "grad_norm": 0.12666253550466774, + "learning_rate": 1.8624270366465476e-05, + "loss": 0.351, + "step": 3337 + }, + { + "epoch": 3.55863539445629, + "grad_norm": 0.13872865389172592, + "learning_rate": 1.8599112705803894e-05, + "loss": 0.3546, + "step": 3338 + }, + { + "epoch": 3.5597014925373136, + "grad_norm": 0.11819669860101821, + "learning_rate": 1.857396690012163e-05, + "loss": 0.3496, + "step": 3339 + }, + { + "epoch": 3.5607675906183367, + "grad_norm": 0.1324098142656928, + "learning_rate": 1.8548832963348167e-05, + "loss": 0.3512, + "step": 3340 + }, + { + "epoch": 3.5618336886993602, + "grad_norm": 0.1213227275635731, + "learning_rate": 1.8523710909406408e-05, + "loss": 0.3529, + "step": 3341 + }, + { + "epoch": 3.5628997867803838, + "grad_norm": 0.13139162851354952, + "learning_rate": 1.8498600752212633e-05, + "loss": 0.3463, + "step": 3342 + }, + { + "epoch": 3.5639658848614073, + "grad_norm": 0.12022593351799091, + "learning_rate": 1.847350250567658e-05, + "loss": 0.349, + "step": 3343 + }, + { + "epoch": 3.565031982942431, + "grad_norm": 0.14848931387258837, + "learning_rate": 1.8448416183701387e-05, + "loss": 0.3523, + "step": 3344 + }, + { + "epoch": 3.566098081023454, + "grad_norm": 0.13293877643754679, + "learning_rate": 1.8423341800183547e-05, + "loss": 0.3548, + "step": 3345 + }, + { + "epoch": 3.5671641791044775, + "grad_norm": 0.13819112943650344, + "learning_rate": 1.8398279369012975e-05, + "loss": 0.3586, + "step": 3346 + }, + { + "epoch": 3.568230277185501, + "grad_norm": 0.15221572285365978, + "learning_rate": 1.8373228904072958e-05, + "loss": 0.3487, + "step": 3347 + }, + { + "epoch": 3.5692963752665245, + "grad_norm": 0.12335264810660222, + "learning_rate": 1.8348190419240168e-05, + "loss": 0.3576, + "step": 3348 + }, + { + "epoch": 3.570362473347548, + "grad_norm": 0.14402755325400285, + "learning_rate": 1.8323163928384597e-05, + "loss": 0.3557, + "step": 3349 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.121086447817541, + "learning_rate": 1.829814944536963e-05, + "loss": 0.3545, + "step": 3350 + }, + { + "epoch": 3.572494669509595, + "grad_norm": 0.1456232992893829, + "learning_rate": 1.8273146984051987e-05, + "loss": 0.3508, + "step": 3351 + }, + { + "epoch": 3.5735607675906182, + "grad_norm": 0.13621927041516765, + "learning_rate": 1.8248156558281756e-05, + "loss": 0.3479, + "step": 3352 + }, + { + "epoch": 3.574626865671642, + "grad_norm": 0.12188450870001802, + "learning_rate": 1.8223178181902296e-05, + "loss": 0.3514, + "step": 3353 + }, + { + "epoch": 3.5756929637526653, + "grad_norm": 0.13329743005861297, + "learning_rate": 1.8198211868750352e-05, + "loss": 0.3495, + "step": 3354 + }, + { + "epoch": 3.576759061833689, + "grad_norm": 0.118410852419534, + "learning_rate": 1.8173257632655973e-05, + "loss": 0.3482, + "step": 3355 + }, + { + "epoch": 3.5778251599147124, + "grad_norm": 0.12392097571524742, + "learning_rate": 1.8148315487442482e-05, + "loss": 0.3509, + "step": 3356 + }, + { + "epoch": 3.5788912579957355, + "grad_norm": 0.149797120649777, + "learning_rate": 1.8123385446926546e-05, + "loss": 0.351, + "step": 3357 + }, + { + "epoch": 3.579957356076759, + "grad_norm": 0.11637490638409546, + "learning_rate": 1.8098467524918114e-05, + "loss": 0.348, + "step": 3358 + }, + { + "epoch": 3.5810234541577826, + "grad_norm": 0.1302506928123163, + "learning_rate": 1.807356173522043e-05, + "loss": 0.3473, + "step": 3359 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.11734314593339212, + "learning_rate": 1.8048668091629976e-05, + "loss": 0.3556, + "step": 3360 + }, + { + "epoch": 3.5831556503198296, + "grad_norm": 0.13699195492761537, + "learning_rate": 1.802378660793656e-05, + "loss": 0.3509, + "step": 3361 + }, + { + "epoch": 3.5842217484008527, + "grad_norm": 0.1192014977364567, + "learning_rate": 1.7998917297923224e-05, + "loss": 0.3471, + "step": 3362 + }, + { + "epoch": 3.5852878464818763, + "grad_norm": 0.1296315171549434, + "learning_rate": 1.7974060175366288e-05, + "loss": 0.3465, + "step": 3363 + }, + { + "epoch": 3.5863539445629, + "grad_norm": 0.13858112246619184, + "learning_rate": 1.794921525403528e-05, + "loss": 0.3497, + "step": 3364 + }, + { + "epoch": 3.5874200426439233, + "grad_norm": 0.1119373071168273, + "learning_rate": 1.7924382547693006e-05, + "loss": 0.3545, + "step": 3365 + }, + { + "epoch": 3.588486140724947, + "grad_norm": 0.13479994066024156, + "learning_rate": 1.7899562070095517e-05, + "loss": 0.3459, + "step": 3366 + }, + { + "epoch": 3.58955223880597, + "grad_norm": 0.11961904622503035, + "learning_rate": 1.7874753834992023e-05, + "loss": 0.3486, + "step": 3367 + }, + { + "epoch": 3.5906183368869935, + "grad_norm": 0.1295577113450289, + "learning_rate": 1.7849957856125032e-05, + "loss": 0.3437, + "step": 3368 + }, + { + "epoch": 3.591684434968017, + "grad_norm": 0.13384088967192623, + "learning_rate": 1.7825174147230212e-05, + "loss": 0.3508, + "step": 3369 + }, + { + "epoch": 3.5927505330490406, + "grad_norm": 0.14860888147610143, + "learning_rate": 1.780040272203647e-05, + "loss": 0.3456, + "step": 3370 + }, + { + "epoch": 3.593816631130064, + "grad_norm": 0.12817477422964021, + "learning_rate": 1.7775643594265858e-05, + "loss": 0.3502, + "step": 3371 + }, + { + "epoch": 3.594882729211087, + "grad_norm": 0.15577728782871172, + "learning_rate": 1.775089677763366e-05, + "loss": 0.3537, + "step": 3372 + }, + { + "epoch": 3.5959488272921107, + "grad_norm": 0.12786761103356425, + "learning_rate": 1.7726162285848343e-05, + "loss": 0.3447, + "step": 3373 + }, + { + "epoch": 3.5970149253731343, + "grad_norm": 0.14028709617612561, + "learning_rate": 1.77014401326115e-05, + "loss": 0.3463, + "step": 3374 + }, + { + "epoch": 3.598081023454158, + "grad_norm": 0.12954870341900906, + "learning_rate": 1.7676730331617934e-05, + "loss": 0.3519, + "step": 3375 + }, + { + "epoch": 3.5991471215351813, + "grad_norm": 0.12691468222674968, + "learning_rate": 1.765203289655559e-05, + "loss": 0.3491, + "step": 3376 + }, + { + "epoch": 3.6002132196162044, + "grad_norm": 0.15452666879821078, + "learning_rate": 1.7627347841105575e-05, + "loss": 0.3469, + "step": 3377 + }, + { + "epoch": 3.6012793176972284, + "grad_norm": 0.12502978476826676, + "learning_rate": 1.7602675178942102e-05, + "loss": 0.3441, + "step": 3378 + }, + { + "epoch": 3.6023454157782515, + "grad_norm": 0.13006787492356886, + "learning_rate": 1.7578014923732558e-05, + "loss": 0.3501, + "step": 3379 + }, + { + "epoch": 3.603411513859275, + "grad_norm": 0.10724477831469756, + "learning_rate": 1.7553367089137438e-05, + "loss": 0.3501, + "step": 3380 + }, + { + "epoch": 3.6044776119402986, + "grad_norm": 0.13170260076841805, + "learning_rate": 1.7528731688810383e-05, + "loss": 0.3505, + "step": 3381 + }, + { + "epoch": 3.605543710021322, + "grad_norm": 0.11121721943362509, + "learning_rate": 1.7504108736398087e-05, + "loss": 0.3523, + "step": 3382 + }, + { + "epoch": 3.6066098081023457, + "grad_norm": 0.14589447941044803, + "learning_rate": 1.747949824554041e-05, + "loss": 0.3546, + "step": 3383 + }, + { + "epoch": 3.6076759061833688, + "grad_norm": 0.13319282485716943, + "learning_rate": 1.745490022987029e-05, + "loss": 0.3496, + "step": 3384 + }, + { + "epoch": 3.6087420042643923, + "grad_norm": 0.12337788148881837, + "learning_rate": 1.7430314703013727e-05, + "loss": 0.3504, + "step": 3385 + }, + { + "epoch": 3.609808102345416, + "grad_norm": 0.14541264966288076, + "learning_rate": 1.7405741678589838e-05, + "loss": 0.3507, + "step": 3386 + }, + { + "epoch": 3.6108742004264394, + "grad_norm": 0.11959380106376388, + "learning_rate": 1.7381181170210814e-05, + "loss": 0.3511, + "step": 3387 + }, + { + "epoch": 3.611940298507463, + "grad_norm": 0.1245152479458827, + "learning_rate": 1.7356633191481877e-05, + "loss": 0.3469, + "step": 3388 + }, + { + "epoch": 3.613006396588486, + "grad_norm": 0.10647154581436648, + "learning_rate": 1.7332097756001335e-05, + "loss": 0.3512, + "step": 3389 + }, + { + "epoch": 3.6140724946695095, + "grad_norm": 0.11205355107663482, + "learning_rate": 1.730757487736057e-05, + "loss": 0.3449, + "step": 3390 + }, + { + "epoch": 3.615138592750533, + "grad_norm": 0.1071566104296763, + "learning_rate": 1.7283064569143947e-05, + "loss": 0.349, + "step": 3391 + }, + { + "epoch": 3.6162046908315566, + "grad_norm": 0.10713350170160939, + "learning_rate": 1.7258566844928915e-05, + "loss": 0.3495, + "step": 3392 + }, + { + "epoch": 3.61727078891258, + "grad_norm": 0.11331625037340512, + "learning_rate": 1.7234081718285965e-05, + "loss": 0.351, + "step": 3393 + }, + { + "epoch": 3.6183368869936032, + "grad_norm": 0.115641631530902, + "learning_rate": 1.7209609202778542e-05, + "loss": 0.3487, + "step": 3394 + }, + { + "epoch": 3.6194029850746268, + "grad_norm": 0.12437375198658421, + "learning_rate": 1.7185149311963186e-05, + "loss": 0.3513, + "step": 3395 + }, + { + "epoch": 3.6204690831556503, + "grad_norm": 0.11954602193026391, + "learning_rate": 1.716070205938938e-05, + "loss": 0.3523, + "step": 3396 + }, + { + "epoch": 3.621535181236674, + "grad_norm": 0.1310194394078668, + "learning_rate": 1.7136267458599633e-05, + "loss": 0.3495, + "step": 3397 + }, + { + "epoch": 3.6226012793176974, + "grad_norm": 0.11894966733116948, + "learning_rate": 1.711184552312945e-05, + "loss": 0.3491, + "step": 3398 + }, + { + "epoch": 3.6236673773987205, + "grad_norm": 0.1260933584074009, + "learning_rate": 1.7087436266507333e-05, + "loss": 0.3516, + "step": 3399 + }, + { + "epoch": 3.624733475479744, + "grad_norm": 0.11495853607822465, + "learning_rate": 1.706303970225471e-05, + "loss": 0.3507, + "step": 3400 + }, + { + "epoch": 3.6257995735607675, + "grad_norm": 0.1580722155106881, + "learning_rate": 1.703865584388602e-05, + "loss": 0.349, + "step": 3401 + }, + { + "epoch": 3.626865671641791, + "grad_norm": 0.1166417464300119, + "learning_rate": 1.7014284704908673e-05, + "loss": 0.3504, + "step": 3402 + }, + { + "epoch": 3.6279317697228146, + "grad_norm": 0.13094944084926075, + "learning_rate": 1.6989926298822977e-05, + "loss": 0.3517, + "step": 3403 + }, + { + "epoch": 3.6289978678038377, + "grad_norm": 0.1091018787371315, + "learning_rate": 1.6965580639122247e-05, + "loss": 0.347, + "step": 3404 + }, + { + "epoch": 3.6300639658848612, + "grad_norm": 0.13573951697931824, + "learning_rate": 1.694124773929271e-05, + "loss": 0.3506, + "step": 3405 + }, + { + "epoch": 3.631130063965885, + "grad_norm": 0.11997387496896626, + "learning_rate": 1.691692761281354e-05, + "loss": 0.3505, + "step": 3406 + }, + { + "epoch": 3.6321961620469083, + "grad_norm": 0.1581248499900699, + "learning_rate": 1.6892620273156795e-05, + "loss": 0.351, + "step": 3407 + }, + { + "epoch": 3.633262260127932, + "grad_norm": 0.1253330836795441, + "learning_rate": 1.686832573378749e-05, + "loss": 0.3527, + "step": 3408 + }, + { + "epoch": 3.6343283582089554, + "grad_norm": 0.12308261327495958, + "learning_rate": 1.684404400816354e-05, + "loss": 0.3485, + "step": 3409 + }, + { + "epoch": 3.635394456289979, + "grad_norm": 0.1094772181271851, + "learning_rate": 1.6819775109735777e-05, + "loss": 0.3526, + "step": 3410 + }, + { + "epoch": 3.636460554371002, + "grad_norm": 0.14783525680446527, + "learning_rate": 1.6795519051947877e-05, + "loss": 0.3493, + "step": 3411 + }, + { + "epoch": 3.6375266524520256, + "grad_norm": 0.11897120110784674, + "learning_rate": 1.6771275848236447e-05, + "loss": 0.3482, + "step": 3412 + }, + { + "epoch": 3.638592750533049, + "grad_norm": 0.1346934227512216, + "learning_rate": 1.674704551203098e-05, + "loss": 0.3498, + "step": 3413 + }, + { + "epoch": 3.6396588486140726, + "grad_norm": 0.11012404823074266, + "learning_rate": 1.6722828056753794e-05, + "loss": 0.35, + "step": 3414 + }, + { + "epoch": 3.640724946695096, + "grad_norm": 0.13021214801697548, + "learning_rate": 1.6698623495820117e-05, + "loss": 0.3483, + "step": 3415 + }, + { + "epoch": 3.6417910447761193, + "grad_norm": 0.10956855386418592, + "learning_rate": 1.6674431842638014e-05, + "loss": 0.3492, + "step": 3416 + }, + { + "epoch": 3.642857142857143, + "grad_norm": 0.11548747621938685, + "learning_rate": 1.6650253110608415e-05, + "loss": 0.345, + "step": 3417 + }, + { + "epoch": 3.6439232409381663, + "grad_norm": 0.10744538789719912, + "learning_rate": 1.662608731312506e-05, + "loss": 0.3513, + "step": 3418 + }, + { + "epoch": 3.64498933901919, + "grad_norm": 0.12523119814259084, + "learning_rate": 1.6601934463574553e-05, + "loss": 0.349, + "step": 3419 + }, + { + "epoch": 3.6460554371002134, + "grad_norm": 0.1183786159156495, + "learning_rate": 1.657779457533632e-05, + "loss": 0.3497, + "step": 3420 + }, + { + "epoch": 3.6471215351812365, + "grad_norm": 0.1258499101414295, + "learning_rate": 1.6553667661782624e-05, + "loss": 0.3471, + "step": 3421 + }, + { + "epoch": 3.64818763326226, + "grad_norm": 0.11610530838109377, + "learning_rate": 1.652955373627848e-05, + "loss": 0.3533, + "step": 3422 + }, + { + "epoch": 3.6492537313432836, + "grad_norm": 0.12286306836120377, + "learning_rate": 1.6505452812181775e-05, + "loss": 0.3483, + "step": 3423 + }, + { + "epoch": 3.650319829424307, + "grad_norm": 0.09734440175920124, + "learning_rate": 1.648136490284318e-05, + "loss": 0.344, + "step": 3424 + }, + { + "epoch": 3.6513859275053306, + "grad_norm": 0.13317697498320796, + "learning_rate": 1.645729002160611e-05, + "loss": 0.3485, + "step": 3425 + }, + { + "epoch": 3.6524520255863537, + "grad_norm": 0.11968047549008518, + "learning_rate": 1.643322818180682e-05, + "loss": 0.3504, + "step": 3426 + }, + { + "epoch": 3.6535181236673773, + "grad_norm": 0.12970016078696814, + "learning_rate": 1.6409179396774317e-05, + "loss": 0.3494, + "step": 3427 + }, + { + "epoch": 3.654584221748401, + "grad_norm": 0.10979013596405256, + "learning_rate": 1.638514367983039e-05, + "loss": 0.3475, + "step": 3428 + }, + { + "epoch": 3.6556503198294243, + "grad_norm": 0.11958509569515018, + "learning_rate": 1.6361121044289553e-05, + "loss": 0.3424, + "step": 3429 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.1210201578994878, + "learning_rate": 1.6337111503459104e-05, + "loss": 0.3443, + "step": 3430 + }, + { + "epoch": 3.657782515991471, + "grad_norm": 0.11828921119105222, + "learning_rate": 1.63131150706391e-05, + "loss": 0.3442, + "step": 3431 + }, + { + "epoch": 3.6588486140724945, + "grad_norm": 0.12968866909782567, + "learning_rate": 1.6289131759122292e-05, + "loss": 0.3488, + "step": 3432 + }, + { + "epoch": 3.659914712153518, + "grad_norm": 0.11513145282740704, + "learning_rate": 1.62651615821942e-05, + "loss": 0.3515, + "step": 3433 + }, + { + "epoch": 3.6609808102345416, + "grad_norm": 0.10887008435723226, + "learning_rate": 1.6241204553133054e-05, + "loss": 0.3422, + "step": 3434 + }, + { + "epoch": 3.662046908315565, + "grad_norm": 0.10848962774276776, + "learning_rate": 1.6217260685209815e-05, + "loss": 0.3508, + "step": 3435 + }, + { + "epoch": 3.663113006396588, + "grad_norm": 0.10984253318601138, + "learning_rate": 1.619332999168812e-05, + "loss": 0.3473, + "step": 3436 + }, + { + "epoch": 3.664179104477612, + "grad_norm": 0.10354031929218735, + "learning_rate": 1.6169412485824343e-05, + "loss": 0.3511, + "step": 3437 + }, + { + "epoch": 3.6652452025586353, + "grad_norm": 0.11266732298785025, + "learning_rate": 1.6145508180867537e-05, + "loss": 0.3438, + "step": 3438 + }, + { + "epoch": 3.666311300639659, + "grad_norm": 0.11007091054264355, + "learning_rate": 1.6121617090059455e-05, + "loss": 0.3519, + "step": 3439 + }, + { + "epoch": 3.6673773987206824, + "grad_norm": 0.11010507294324125, + "learning_rate": 1.6097739226634494e-05, + "loss": 0.3503, + "step": 3440 + }, + { + "epoch": 3.668443496801706, + "grad_norm": 0.1107608574897804, + "learning_rate": 1.6073874603819767e-05, + "loss": 0.3444, + "step": 3441 + }, + { + "epoch": 3.6695095948827294, + "grad_norm": 0.09996660829470029, + "learning_rate": 1.605002323483505e-05, + "loss": 0.3508, + "step": 3442 + }, + { + "epoch": 3.6705756929637525, + "grad_norm": 0.10541897341054957, + "learning_rate": 1.6026185132892722e-05, + "loss": 0.3473, + "step": 3443 + }, + { + "epoch": 3.671641791044776, + "grad_norm": 0.09364677982992888, + "learning_rate": 1.6002360311197874e-05, + "loss": 0.345, + "step": 3444 + }, + { + "epoch": 3.6727078891257996, + "grad_norm": 0.11633515138576979, + "learning_rate": 1.5978548782948228e-05, + "loss": 0.3537, + "step": 3445 + }, + { + "epoch": 3.673773987206823, + "grad_norm": 0.10455339919916513, + "learning_rate": 1.595475056133413e-05, + "loss": 0.3441, + "step": 3446 + }, + { + "epoch": 3.6748400852878467, + "grad_norm": 0.10817600283132285, + "learning_rate": 1.5930965659538547e-05, + "loss": 0.3513, + "step": 3447 + }, + { + "epoch": 3.6759061833688698, + "grad_norm": 0.11724298481759776, + "learning_rate": 1.590719409073708e-05, + "loss": 0.3471, + "step": 3448 + }, + { + "epoch": 3.6769722814498933, + "grad_norm": 0.11034160049080355, + "learning_rate": 1.5883435868097942e-05, + "loss": 0.3494, + "step": 3449 + }, + { + "epoch": 3.678038379530917, + "grad_norm": 0.10935682929664699, + "learning_rate": 1.5859691004781977e-05, + "loss": 0.3531, + "step": 3450 + }, + { + "epoch": 3.6791044776119404, + "grad_norm": 0.10669839160222076, + "learning_rate": 1.5835959513942577e-05, + "loss": 0.3464, + "step": 3451 + }, + { + "epoch": 3.680170575692964, + "grad_norm": 0.10499647433795345, + "learning_rate": 1.5812241408725757e-05, + "loss": 0.3484, + "step": 3452 + }, + { + "epoch": 3.681236673773987, + "grad_norm": 0.10722258869475547, + "learning_rate": 1.5788536702270136e-05, + "loss": 0.3498, + "step": 3453 + }, + { + "epoch": 3.6823027718550105, + "grad_norm": 0.1120874379953065, + "learning_rate": 1.5764845407706863e-05, + "loss": 0.3561, + "step": 3454 + }, + { + "epoch": 3.683368869936034, + "grad_norm": 0.11848268184587527, + "learning_rate": 1.5741167538159697e-05, + "loss": 0.3464, + "step": 3455 + }, + { + "epoch": 3.6844349680170576, + "grad_norm": 0.12047369839392386, + "learning_rate": 1.5717503106744957e-05, + "loss": 0.3572, + "step": 3456 + }, + { + "epoch": 3.685501066098081, + "grad_norm": 0.12166580342753797, + "learning_rate": 1.569385212657149e-05, + "loss": 0.3489, + "step": 3457 + }, + { + "epoch": 3.6865671641791042, + "grad_norm": 0.12106373222633703, + "learning_rate": 1.567021461074071e-05, + "loss": 0.3516, + "step": 3458 + }, + { + "epoch": 3.6876332622601278, + "grad_norm": 0.11484973241212469, + "learning_rate": 1.5646590572346596e-05, + "loss": 0.3524, + "step": 3459 + }, + { + "epoch": 3.6886993603411513, + "grad_norm": 0.10937051777873967, + "learning_rate": 1.5622980024475608e-05, + "loss": 0.3439, + "step": 3460 + }, + { + "epoch": 3.689765458422175, + "grad_norm": 0.12137658902823265, + "learning_rate": 1.5599382980206773e-05, + "loss": 0.3516, + "step": 3461 + }, + { + "epoch": 3.6908315565031984, + "grad_norm": 0.12974179145102452, + "learning_rate": 1.5575799452611647e-05, + "loss": 0.3545, + "step": 3462 + }, + { + "epoch": 3.6918976545842215, + "grad_norm": 0.13163480419478382, + "learning_rate": 1.5552229454754245e-05, + "loss": 0.3462, + "step": 3463 + }, + { + "epoch": 3.6929637526652455, + "grad_norm": 0.09995629474690315, + "learning_rate": 1.5528672999691137e-05, + "loss": 0.3463, + "step": 3464 + }, + { + "epoch": 3.6940298507462686, + "grad_norm": 0.12284728073753405, + "learning_rate": 1.550513010047139e-05, + "loss": 0.3478, + "step": 3465 + }, + { + "epoch": 3.695095948827292, + "grad_norm": 0.11912635744633349, + "learning_rate": 1.5481600770136512e-05, + "loss": 0.3469, + "step": 3466 + }, + { + "epoch": 3.6961620469083156, + "grad_norm": 0.1046940554664763, + "learning_rate": 1.5458085021720557e-05, + "loss": 0.349, + "step": 3467 + }, + { + "epoch": 3.697228144989339, + "grad_norm": 0.10784839095846516, + "learning_rate": 1.543458286825003e-05, + "loss": 0.3533, + "step": 3468 + }, + { + "epoch": 3.6982942430703627, + "grad_norm": 0.1195343323249887, + "learning_rate": 1.5411094322743875e-05, + "loss": 0.3508, + "step": 3469 + }, + { + "epoch": 3.699360341151386, + "grad_norm": 0.10962783595704657, + "learning_rate": 1.5387619398213543e-05, + "loss": 0.3471, + "step": 3470 + }, + { + "epoch": 3.7004264392324093, + "grad_norm": 0.1150816905348978, + "learning_rate": 1.5364158107662935e-05, + "loss": 0.3518, + "step": 3471 + }, + { + "epoch": 3.701492537313433, + "grad_norm": 0.13829883854757222, + "learning_rate": 1.534071046408836e-05, + "loss": 0.3521, + "step": 3472 + }, + { + "epoch": 3.7025586353944564, + "grad_norm": 0.12445105825556152, + "learning_rate": 1.531727648047861e-05, + "loss": 0.3554, + "step": 3473 + }, + { + "epoch": 3.70362473347548, + "grad_norm": 0.1157531505696079, + "learning_rate": 1.5293856169814885e-05, + "loss": 0.3489, + "step": 3474 + }, + { + "epoch": 3.704690831556503, + "grad_norm": 0.1330840415067106, + "learning_rate": 1.527044954507084e-05, + "loss": 0.35, + "step": 3475 + }, + { + "epoch": 3.7057569296375266, + "grad_norm": 0.10784494484991011, + "learning_rate": 1.5247056619212507e-05, + "loss": 0.3488, + "step": 3476 + }, + { + "epoch": 3.70682302771855, + "grad_norm": 0.12127485364434097, + "learning_rate": 1.5223677405198354e-05, + "loss": 0.3499, + "step": 3477 + }, + { + "epoch": 3.7078891257995736, + "grad_norm": 0.11013802869572574, + "learning_rate": 1.5200311915979255e-05, + "loss": 0.3421, + "step": 3478 + }, + { + "epoch": 3.708955223880597, + "grad_norm": 0.12178309778215979, + "learning_rate": 1.517696016449849e-05, + "loss": 0.3527, + "step": 3479 + }, + { + "epoch": 3.7100213219616203, + "grad_norm": 0.11504029087431382, + "learning_rate": 1.515362216369169e-05, + "loss": 0.3483, + "step": 3480 + }, + { + "epoch": 3.711087420042644, + "grad_norm": 0.12440538915523314, + "learning_rate": 1.5130297926486908e-05, + "loss": 0.3481, + "step": 3481 + }, + { + "epoch": 3.7121535181236673, + "grad_norm": 0.1157756581499601, + "learning_rate": 1.5106987465804572e-05, + "loss": 0.3472, + "step": 3482 + }, + { + "epoch": 3.713219616204691, + "grad_norm": 0.13884491519651773, + "learning_rate": 1.5083690794557435e-05, + "loss": 0.3491, + "step": 3483 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.11160024061874728, + "learning_rate": 1.5060407925650662e-05, + "loss": 0.3442, + "step": 3484 + }, + { + "epoch": 3.7153518123667375, + "grad_norm": 0.1388304685706655, + "learning_rate": 1.503713887198175e-05, + "loss": 0.3524, + "step": 3485 + }, + { + "epoch": 3.716417910447761, + "grad_norm": 0.10291882827022071, + "learning_rate": 1.5013883646440555e-05, + "loss": 0.3469, + "step": 3486 + }, + { + "epoch": 3.7174840085287846, + "grad_norm": 0.11581595497958068, + "learning_rate": 1.499064226190924e-05, + "loss": 0.3478, + "step": 3487 + }, + { + "epoch": 3.718550106609808, + "grad_norm": 0.12039433257965831, + "learning_rate": 1.4967414731262339e-05, + "loss": 0.3476, + "step": 3488 + }, + { + "epoch": 3.7196162046908317, + "grad_norm": 0.11994551705543056, + "learning_rate": 1.494420106736671e-05, + "loss": 0.349, + "step": 3489 + }, + { + "epoch": 3.7206823027718547, + "grad_norm": 0.10943678193661488, + "learning_rate": 1.4921001283081488e-05, + "loss": 0.3443, + "step": 3490 + }, + { + "epoch": 3.7217484008528787, + "grad_norm": 0.12365465123838881, + "learning_rate": 1.489781539125816e-05, + "loss": 0.3428, + "step": 3491 + }, + { + "epoch": 3.722814498933902, + "grad_norm": 0.12996918119430528, + "learning_rate": 1.4874643404740505e-05, + "loss": 0.3492, + "step": 3492 + }, + { + "epoch": 3.7238805970149254, + "grad_norm": 0.12320823229427783, + "learning_rate": 1.4851485336364616e-05, + "loss": 0.3555, + "step": 3493 + }, + { + "epoch": 3.724946695095949, + "grad_norm": 0.11983024758002496, + "learning_rate": 1.4828341198958827e-05, + "loss": 0.3529, + "step": 3494 + }, + { + "epoch": 3.7260127931769724, + "grad_norm": 0.11090723612747737, + "learning_rate": 1.4805211005343804e-05, + "loss": 0.3497, + "step": 3495 + }, + { + "epoch": 3.727078891257996, + "grad_norm": 0.11889325003301451, + "learning_rate": 1.4782094768332477e-05, + "loss": 0.3475, + "step": 3496 + }, + { + "epoch": 3.728144989339019, + "grad_norm": 0.11039401592347903, + "learning_rate": 1.4758992500730047e-05, + "loss": 0.3506, + "step": 3497 + }, + { + "epoch": 3.7292110874200426, + "grad_norm": 0.11723348316498812, + "learning_rate": 1.4735904215333942e-05, + "loss": 0.3491, + "step": 3498 + }, + { + "epoch": 3.730277185501066, + "grad_norm": 0.1096667948027839, + "learning_rate": 1.4712829924933888e-05, + "loss": 0.35, + "step": 3499 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.12827667700436324, + "learning_rate": 1.4689769642311862e-05, + "loss": 0.3488, + "step": 3500 + }, + { + "epoch": 3.732409381663113, + "grad_norm": 0.11597291937955526, + "learning_rate": 1.466672338024202e-05, + "loss": 0.3529, + "step": 3501 + }, + { + "epoch": 3.7334754797441363, + "grad_norm": 0.12098185152644861, + "learning_rate": 1.4643691151490825e-05, + "loss": 0.3543, + "step": 3502 + }, + { + "epoch": 3.73454157782516, + "grad_norm": 0.1264853588754496, + "learning_rate": 1.462067296881692e-05, + "loss": 0.3494, + "step": 3503 + }, + { + "epoch": 3.7356076759061834, + "grad_norm": 0.11554640924569769, + "learning_rate": 1.4597668844971203e-05, + "loss": 0.3477, + "step": 3504 + }, + { + "epoch": 3.736673773987207, + "grad_norm": 0.1263291190867985, + "learning_rate": 1.4574678792696735e-05, + "loss": 0.3464, + "step": 3505 + }, + { + "epoch": 3.7377398720682304, + "grad_norm": 0.1140310154067713, + "learning_rate": 1.4551702824728824e-05, + "loss": 0.3534, + "step": 3506 + }, + { + "epoch": 3.7388059701492535, + "grad_norm": 0.1374330707269001, + "learning_rate": 1.452874095379496e-05, + "loss": 0.3525, + "step": 3507 + }, + { + "epoch": 3.739872068230277, + "grad_norm": 0.10373341206088472, + "learning_rate": 1.4505793192614838e-05, + "loss": 0.3487, + "step": 3508 + }, + { + "epoch": 3.7409381663113006, + "grad_norm": 0.12296501711153075, + "learning_rate": 1.4482859553900302e-05, + "loss": 0.3523, + "step": 3509 + }, + { + "epoch": 3.742004264392324, + "grad_norm": 0.09959644973450633, + "learning_rate": 1.4459940050355412e-05, + "loss": 0.3459, + "step": 3510 + }, + { + "epoch": 3.7430703624733477, + "grad_norm": 0.12381271538841246, + "learning_rate": 1.4437034694676388e-05, + "loss": 0.3442, + "step": 3511 + }, + { + "epoch": 3.7441364605543708, + "grad_norm": 0.11420172023496247, + "learning_rate": 1.4414143499551583e-05, + "loss": 0.3435, + "step": 3512 + }, + { + "epoch": 3.7452025586353943, + "grad_norm": 0.12761045293800952, + "learning_rate": 1.4391266477661545e-05, + "loss": 0.3502, + "step": 3513 + }, + { + "epoch": 3.746268656716418, + "grad_norm": 0.12938266287837766, + "learning_rate": 1.4368403641678951e-05, + "loss": 0.353, + "step": 3514 + }, + { + "epoch": 3.7473347547974414, + "grad_norm": 0.12147529927692077, + "learning_rate": 1.434555500426864e-05, + "loss": 0.3529, + "step": 3515 + }, + { + "epoch": 3.748400852878465, + "grad_norm": 0.12991621986150606, + "learning_rate": 1.4322720578087546e-05, + "loss": 0.3508, + "step": 3516 + }, + { + "epoch": 3.749466950959488, + "grad_norm": 0.11210702326795871, + "learning_rate": 1.4299900375784761e-05, + "loss": 0.3491, + "step": 3517 + }, + { + "epoch": 3.750533049040512, + "grad_norm": 0.12221491366601378, + "learning_rate": 1.4277094410001508e-05, + "loss": 0.3486, + "step": 3518 + }, + { + "epoch": 3.751599147121535, + "grad_norm": 0.11758736893333697, + "learning_rate": 1.4254302693371083e-05, + "loss": 0.3525, + "step": 3519 + }, + { + "epoch": 3.7526652452025586, + "grad_norm": 0.11720998722860178, + "learning_rate": 1.4231525238518917e-05, + "loss": 0.3526, + "step": 3520 + }, + { + "epoch": 3.753731343283582, + "grad_norm": 0.11702149986171113, + "learning_rate": 1.4208762058062546e-05, + "loss": 0.3567, + "step": 3521 + }, + { + "epoch": 3.7547974413646057, + "grad_norm": 0.11781947922119294, + "learning_rate": 1.4186013164611593e-05, + "loss": 0.3476, + "step": 3522 + }, + { + "epoch": 3.7558635394456292, + "grad_norm": 0.11028285471866768, + "learning_rate": 1.4163278570767744e-05, + "loss": 0.3472, + "step": 3523 + }, + { + "epoch": 3.7569296375266523, + "grad_norm": 0.11480800401974497, + "learning_rate": 1.4140558289124795e-05, + "loss": 0.3498, + "step": 3524 + }, + { + "epoch": 3.757995735607676, + "grad_norm": 0.10511476764554756, + "learning_rate": 1.411785233226861e-05, + "loss": 0.346, + "step": 3525 + }, + { + "epoch": 3.7590618336886994, + "grad_norm": 0.11246301899459499, + "learning_rate": 1.4095160712777087e-05, + "loss": 0.3463, + "step": 3526 + }, + { + "epoch": 3.760127931769723, + "grad_norm": 0.11075751087710273, + "learning_rate": 1.4072483443220213e-05, + "loss": 0.351, + "step": 3527 + }, + { + "epoch": 3.7611940298507465, + "grad_norm": 0.11457686956190802, + "learning_rate": 1.4049820536160033e-05, + "loss": 0.3507, + "step": 3528 + }, + { + "epoch": 3.7622601279317696, + "grad_norm": 0.13257820543611734, + "learning_rate": 1.4027172004150594e-05, + "loss": 0.3483, + "step": 3529 + }, + { + "epoch": 3.763326226012793, + "grad_norm": 0.09934268832616149, + "learning_rate": 1.400453785973801e-05, + "loss": 0.3471, + "step": 3530 + }, + { + "epoch": 3.7643923240938166, + "grad_norm": 0.1217605278748467, + "learning_rate": 1.3981918115460448e-05, + "loss": 0.3452, + "step": 3531 + }, + { + "epoch": 3.76545842217484, + "grad_norm": 0.1025857492577431, + "learning_rate": 1.3959312783848033e-05, + "loss": 0.3496, + "step": 3532 + }, + { + "epoch": 3.7665245202558637, + "grad_norm": 0.12375183362556455, + "learning_rate": 1.3936721877422965e-05, + "loss": 0.3532, + "step": 3533 + }, + { + "epoch": 3.767590618336887, + "grad_norm": 0.10188334610518712, + "learning_rate": 1.3914145408699437e-05, + "loss": 0.3436, + "step": 3534 + }, + { + "epoch": 3.7686567164179103, + "grad_norm": 0.12657161432596883, + "learning_rate": 1.3891583390183621e-05, + "loss": 0.3517, + "step": 3535 + }, + { + "epoch": 3.769722814498934, + "grad_norm": 0.11261285266481309, + "learning_rate": 1.3869035834373712e-05, + "loss": 0.3533, + "step": 3536 + }, + { + "epoch": 3.7707889125799574, + "grad_norm": 0.11006622035677963, + "learning_rate": 1.3846502753759899e-05, + "loss": 0.3475, + "step": 3537 + }, + { + "epoch": 3.771855010660981, + "grad_norm": 0.10803610024542715, + "learning_rate": 1.3823984160824306e-05, + "loss": 0.3521, + "step": 3538 + }, + { + "epoch": 3.772921108742004, + "grad_norm": 0.10308686673663463, + "learning_rate": 1.3801480068041083e-05, + "loss": 0.3486, + "step": 3539 + }, + { + "epoch": 3.7739872068230276, + "grad_norm": 0.11418577872966433, + "learning_rate": 1.3778990487876338e-05, + "loss": 0.3527, + "step": 3540 + }, + { + "epoch": 3.775053304904051, + "grad_norm": 0.115083278681719, + "learning_rate": 1.3756515432788105e-05, + "loss": 0.3547, + "step": 3541 + }, + { + "epoch": 3.7761194029850746, + "grad_norm": 0.105339094797037, + "learning_rate": 1.3734054915226405e-05, + "loss": 0.3509, + "step": 3542 + }, + { + "epoch": 3.777185501066098, + "grad_norm": 0.11586176556566104, + "learning_rate": 1.3711608947633202e-05, + "loss": 0.3508, + "step": 3543 + }, + { + "epoch": 3.7782515991471213, + "grad_norm": 0.1066837228197777, + "learning_rate": 1.3689177542442406e-05, + "loss": 0.3477, + "step": 3544 + }, + { + "epoch": 3.7793176972281453, + "grad_norm": 0.11175162407532467, + "learning_rate": 1.3666760712079828e-05, + "loss": 0.3486, + "step": 3545 + }, + { + "epoch": 3.7803837953091683, + "grad_norm": 0.10538041603411623, + "learning_rate": 1.3644358468963233e-05, + "loss": 0.3477, + "step": 3546 + }, + { + "epoch": 3.781449893390192, + "grad_norm": 0.12393679042543049, + "learning_rate": 1.3621970825502317e-05, + "loss": 0.3535, + "step": 3547 + }, + { + "epoch": 3.7825159914712154, + "grad_norm": 0.10491158418048802, + "learning_rate": 1.3599597794098648e-05, + "loss": 0.3562, + "step": 3548 + }, + { + "epoch": 3.783582089552239, + "grad_norm": 0.12508143946693426, + "learning_rate": 1.3577239387145729e-05, + "loss": 0.3513, + "step": 3549 + }, + { + "epoch": 3.7846481876332625, + "grad_norm": 0.11761432608648041, + "learning_rate": 1.3554895617028958e-05, + "loss": 0.3502, + "step": 3550 + }, + { + "epoch": 3.7857142857142856, + "grad_norm": 0.12468188783060895, + "learning_rate": 1.3532566496125634e-05, + "loss": 0.349, + "step": 3551 + }, + { + "epoch": 3.786780383795309, + "grad_norm": 0.11776026667363994, + "learning_rate": 1.3510252036804907e-05, + "loss": 0.3496, + "step": 3552 + }, + { + "epoch": 3.7878464818763327, + "grad_norm": 0.10743422269996004, + "learning_rate": 1.348795225142784e-05, + "loss": 0.3512, + "step": 3553 + }, + { + "epoch": 3.788912579957356, + "grad_norm": 0.10720339132099257, + "learning_rate": 1.3465667152347353e-05, + "loss": 0.346, + "step": 3554 + }, + { + "epoch": 3.7899786780383797, + "grad_norm": 0.10732063941636186, + "learning_rate": 1.3443396751908243e-05, + "loss": 0.3459, + "step": 3555 + }, + { + "epoch": 3.791044776119403, + "grad_norm": 0.1113815053455615, + "learning_rate": 1.3421141062447136e-05, + "loss": 0.3491, + "step": 3556 + }, + { + "epoch": 3.7921108742004264, + "grad_norm": 0.11139797758194125, + "learning_rate": 1.3398900096292535e-05, + "loss": 0.3513, + "step": 3557 + }, + { + "epoch": 3.79317697228145, + "grad_norm": 0.11752612657415666, + "learning_rate": 1.3376673865764796e-05, + "loss": 0.354, + "step": 3558 + }, + { + "epoch": 3.7942430703624734, + "grad_norm": 0.11709263878140849, + "learning_rate": 1.3354462383176064e-05, + "loss": 0.3547, + "step": 3559 + }, + { + "epoch": 3.795309168443497, + "grad_norm": 0.12982606895742937, + "learning_rate": 1.3332265660830364e-05, + "loss": 0.3467, + "step": 3560 + }, + { + "epoch": 3.79637526652452, + "grad_norm": 0.11565736542146775, + "learning_rate": 1.3310083711023527e-05, + "loss": 0.3516, + "step": 3561 + }, + { + "epoch": 3.7974413646055436, + "grad_norm": 0.09828691895649047, + "learning_rate": 1.3287916546043209e-05, + "loss": 0.3487, + "step": 3562 + }, + { + "epoch": 3.798507462686567, + "grad_norm": 0.1155849088844352, + "learning_rate": 1.3265764178168841e-05, + "loss": 0.3512, + "step": 3563 + }, + { + "epoch": 3.7995735607675907, + "grad_norm": 0.10134447160803774, + "learning_rate": 1.3243626619671704e-05, + "loss": 0.3541, + "step": 3564 + }, + { + "epoch": 3.800639658848614, + "grad_norm": 0.10380565104455901, + "learning_rate": 1.3221503882814846e-05, + "loss": 0.3466, + "step": 3565 + }, + { + "epoch": 3.8017057569296373, + "grad_norm": 0.10299695475323739, + "learning_rate": 1.3199395979853132e-05, + "loss": 0.3503, + "step": 3566 + }, + { + "epoch": 3.802771855010661, + "grad_norm": 0.131615477948487, + "learning_rate": 1.3177302923033164e-05, + "loss": 0.3515, + "step": 3567 + }, + { + "epoch": 3.8038379530916844, + "grad_norm": 0.09761956730459642, + "learning_rate": 1.3155224724593364e-05, + "loss": 0.3516, + "step": 3568 + }, + { + "epoch": 3.804904051172708, + "grad_norm": 0.10306669274819454, + "learning_rate": 1.3133161396763909e-05, + "loss": 0.3456, + "step": 3569 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.10145722545444867, + "learning_rate": 1.311111295176672e-05, + "loss": 0.3439, + "step": 3570 + }, + { + "epoch": 3.8070362473347545, + "grad_norm": 0.10770148837143485, + "learning_rate": 1.3089079401815497e-05, + "loss": 0.3496, + "step": 3571 + }, + { + "epoch": 3.8081023454157785, + "grad_norm": 0.11320765417229854, + "learning_rate": 1.3067060759115684e-05, + "loss": 0.3542, + "step": 3572 + }, + { + "epoch": 3.8091684434968016, + "grad_norm": 0.1163579650557563, + "learning_rate": 1.3045057035864477e-05, + "loss": 0.3554, + "step": 3573 + }, + { + "epoch": 3.810234541577825, + "grad_norm": 0.09905809774744204, + "learning_rate": 1.302306824425077e-05, + "loss": 0.3486, + "step": 3574 + }, + { + "epoch": 3.8113006396588487, + "grad_norm": 0.12608559744995004, + "learning_rate": 1.3001094396455223e-05, + "loss": 0.3514, + "step": 3575 + }, + { + "epoch": 3.8123667377398722, + "grad_norm": 0.09595835889524401, + "learning_rate": 1.297913550465022e-05, + "loss": 0.3462, + "step": 3576 + }, + { + "epoch": 3.8134328358208958, + "grad_norm": 0.13266355171728472, + "learning_rate": 1.2957191580999821e-05, + "loss": 0.3542, + "step": 3577 + }, + { + "epoch": 3.814498933901919, + "grad_norm": 0.09688667994875337, + "learning_rate": 1.2935262637659824e-05, + "loss": 0.3512, + "step": 3578 + }, + { + "epoch": 3.8155650319829424, + "grad_norm": 0.13077081224692166, + "learning_rate": 1.2913348686777734e-05, + "loss": 0.3463, + "step": 3579 + }, + { + "epoch": 3.816631130063966, + "grad_norm": 0.1000818706820891, + "learning_rate": 1.2891449740492749e-05, + "loss": 0.3499, + "step": 3580 + }, + { + "epoch": 3.8176972281449895, + "grad_norm": 0.12244332857777668, + "learning_rate": 1.2869565810935724e-05, + "loss": 0.3483, + "step": 3581 + }, + { + "epoch": 3.818763326226013, + "grad_norm": 0.10413603824918938, + "learning_rate": 1.2847696910229228e-05, + "loss": 0.3513, + "step": 3582 + }, + { + "epoch": 3.819829424307036, + "grad_norm": 0.10738770551253633, + "learning_rate": 1.2825843050487495e-05, + "loss": 0.3474, + "step": 3583 + }, + { + "epoch": 3.8208955223880596, + "grad_norm": 0.1181068563058338, + "learning_rate": 1.2804004243816444e-05, + "loss": 0.3541, + "step": 3584 + }, + { + "epoch": 3.821961620469083, + "grad_norm": 0.10846504895224321, + "learning_rate": 1.2782180502313609e-05, + "loss": 0.3499, + "step": 3585 + }, + { + "epoch": 3.8230277185501067, + "grad_norm": 0.10970803247483393, + "learning_rate": 1.2760371838068228e-05, + "loss": 0.3463, + "step": 3586 + }, + { + "epoch": 3.8240938166311302, + "grad_norm": 0.11300737915406556, + "learning_rate": 1.2738578263161175e-05, + "loss": 0.3497, + "step": 3587 + }, + { + "epoch": 3.8251599147121533, + "grad_norm": 0.10850041595269576, + "learning_rate": 1.2716799789664931e-05, + "loss": 0.3503, + "step": 3588 + }, + { + "epoch": 3.826226012793177, + "grad_norm": 0.1242287822226434, + "learning_rate": 1.2695036429643657e-05, + "loss": 0.3464, + "step": 3589 + }, + { + "epoch": 3.8272921108742004, + "grad_norm": 0.11234434571174157, + "learning_rate": 1.2673288195153118e-05, + "loss": 0.349, + "step": 3590 + }, + { + "epoch": 3.828358208955224, + "grad_norm": 0.11607729586786202, + "learning_rate": 1.2651555098240724e-05, + "loss": 0.3502, + "step": 3591 + }, + { + "epoch": 3.8294243070362475, + "grad_norm": 0.11570224496079395, + "learning_rate": 1.2629837150945447e-05, + "loss": 0.3499, + "step": 3592 + }, + { + "epoch": 3.8304904051172706, + "grad_norm": 0.12136791006711244, + "learning_rate": 1.2608134365297922e-05, + "loss": 0.354, + "step": 3593 + }, + { + "epoch": 3.831556503198294, + "grad_norm": 0.10914056771119587, + "learning_rate": 1.2586446753320374e-05, + "loss": 0.3538, + "step": 3594 + }, + { + "epoch": 3.8326226012793176, + "grad_norm": 0.12699655682552763, + "learning_rate": 1.256477432702659e-05, + "loss": 0.3469, + "step": 3595 + }, + { + "epoch": 3.833688699360341, + "grad_norm": 0.11727167103613097, + "learning_rate": 1.2543117098421976e-05, + "loss": 0.345, + "step": 3596 + }, + { + "epoch": 3.8347547974413647, + "grad_norm": 0.1207939527624419, + "learning_rate": 1.2521475079503524e-05, + "loss": 0.3477, + "step": 3597 + }, + { + "epoch": 3.835820895522388, + "grad_norm": 0.11622965193657621, + "learning_rate": 1.2499848282259767e-05, + "loss": 0.3489, + "step": 3598 + }, + { + "epoch": 3.836886993603412, + "grad_norm": 0.13564634838395118, + "learning_rate": 1.2478236718670834e-05, + "loss": 0.3498, + "step": 3599 + }, + { + "epoch": 3.837953091684435, + "grad_norm": 0.11550765632689088, + "learning_rate": 1.2456640400708424e-05, + "loss": 0.3515, + "step": 3600 + }, + { + "epoch": 3.8390191897654584, + "grad_norm": 0.10993793619669771, + "learning_rate": 1.2435059340335753e-05, + "loss": 0.3435, + "step": 3601 + }, + { + "epoch": 3.840085287846482, + "grad_norm": 0.11694097576572215, + "learning_rate": 1.241349354950761e-05, + "loss": 0.3487, + "step": 3602 + }, + { + "epoch": 3.8411513859275055, + "grad_norm": 0.10341436996831099, + "learning_rate": 1.2391943040170343e-05, + "loss": 0.348, + "step": 3603 + }, + { + "epoch": 3.842217484008529, + "grad_norm": 0.12769414945414873, + "learning_rate": 1.2370407824261785e-05, + "loss": 0.3489, + "step": 3604 + }, + { + "epoch": 3.843283582089552, + "grad_norm": 0.09401881452558605, + "learning_rate": 1.2348887913711343e-05, + "loss": 0.3522, + "step": 3605 + }, + { + "epoch": 3.8443496801705757, + "grad_norm": 0.11795172219684597, + "learning_rate": 1.2327383320439937e-05, + "loss": 0.3449, + "step": 3606 + }, + { + "epoch": 3.845415778251599, + "grad_norm": 0.1051555043312947, + "learning_rate": 1.2305894056359967e-05, + "loss": 0.3499, + "step": 3607 + }, + { + "epoch": 3.8464818763326227, + "grad_norm": 0.12236877442049936, + "learning_rate": 1.2284420133375385e-05, + "loss": 0.3522, + "step": 3608 + }, + { + "epoch": 3.8475479744136463, + "grad_norm": 0.10450987023264936, + "learning_rate": 1.2262961563381643e-05, + "loss": 0.35, + "step": 3609 + }, + { + "epoch": 3.8486140724946694, + "grad_norm": 0.11632851457098174, + "learning_rate": 1.224151835826565e-05, + "loss": 0.3425, + "step": 3610 + }, + { + "epoch": 3.849680170575693, + "grad_norm": 0.1313585390733969, + "learning_rate": 1.222009052990583e-05, + "loss": 0.3501, + "step": 3611 + }, + { + "epoch": 3.8507462686567164, + "grad_norm": 0.09885089090066136, + "learning_rate": 1.2198678090172096e-05, + "loss": 0.3476, + "step": 3612 + }, + { + "epoch": 3.85181236673774, + "grad_norm": 0.12386048454800534, + "learning_rate": 1.2177281050925829e-05, + "loss": 0.3533, + "step": 3613 + }, + { + "epoch": 3.8528784648187635, + "grad_norm": 0.10109383676324435, + "learning_rate": 1.2155899424019864e-05, + "loss": 0.3479, + "step": 3614 + }, + { + "epoch": 3.8539445628997866, + "grad_norm": 0.11463340028361436, + "learning_rate": 1.2134533221298517e-05, + "loss": 0.3547, + "step": 3615 + }, + { + "epoch": 3.85501066098081, + "grad_norm": 0.10969960215345126, + "learning_rate": 1.2113182454597565e-05, + "loss": 0.3514, + "step": 3616 + }, + { + "epoch": 3.8560767590618337, + "grad_norm": 0.11112827116259213, + "learning_rate": 1.2091847135744198e-05, + "loss": 0.349, + "step": 3617 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.11843720838928255, + "learning_rate": 1.2070527276557092e-05, + "loss": 0.3441, + "step": 3618 + }, + { + "epoch": 3.8582089552238807, + "grad_norm": 0.10458805732550369, + "learning_rate": 1.2049222888846334e-05, + "loss": 0.3561, + "step": 3619 + }, + { + "epoch": 3.859275053304904, + "grad_norm": 0.11054412675997474, + "learning_rate": 1.2027933984413469e-05, + "loss": 0.3527, + "step": 3620 + }, + { + "epoch": 3.8603411513859274, + "grad_norm": 0.10285700468758723, + "learning_rate": 1.2006660575051407e-05, + "loss": 0.3471, + "step": 3621 + }, + { + "epoch": 3.861407249466951, + "grad_norm": 0.1136506392149834, + "learning_rate": 1.1985402672544532e-05, + "loss": 0.3453, + "step": 3622 + }, + { + "epoch": 3.8624733475479744, + "grad_norm": 0.09831280484139478, + "learning_rate": 1.1964160288668629e-05, + "loss": 0.3506, + "step": 3623 + }, + { + "epoch": 3.863539445628998, + "grad_norm": 0.11187723045529688, + "learning_rate": 1.1942933435190845e-05, + "loss": 0.3499, + "step": 3624 + }, + { + "epoch": 3.864605543710021, + "grad_norm": 0.09266608981403165, + "learning_rate": 1.1921722123869773e-05, + "loss": 0.351, + "step": 3625 + }, + { + "epoch": 3.8656716417910446, + "grad_norm": 0.11511057624858669, + "learning_rate": 1.1900526366455369e-05, + "loss": 0.3468, + "step": 3626 + }, + { + "epoch": 3.866737739872068, + "grad_norm": 0.10956860731954124, + "learning_rate": 1.1879346174689e-05, + "loss": 0.3492, + "step": 3627 + }, + { + "epoch": 3.8678038379530917, + "grad_norm": 0.10708959205937109, + "learning_rate": 1.1858181560303366e-05, + "loss": 0.3469, + "step": 3628 + }, + { + "epoch": 3.868869936034115, + "grad_norm": 0.11831696552244149, + "learning_rate": 1.183703253502257e-05, + "loss": 0.3462, + "step": 3629 + }, + { + "epoch": 3.8699360341151388, + "grad_norm": 0.11785222740074164, + "learning_rate": 1.1815899110562081e-05, + "loss": 0.3504, + "step": 3630 + }, + { + "epoch": 3.8710021321961623, + "grad_norm": 0.11362853709274134, + "learning_rate": 1.1794781298628725e-05, + "loss": 0.349, + "step": 3631 + }, + { + "epoch": 3.8720682302771854, + "grad_norm": 0.11822864612991446, + "learning_rate": 1.1773679110920648e-05, + "loss": 0.3477, + "step": 3632 + }, + { + "epoch": 3.873134328358209, + "grad_norm": 0.10399345332909624, + "learning_rate": 1.1752592559127378e-05, + "loss": 0.3473, + "step": 3633 + }, + { + "epoch": 3.8742004264392325, + "grad_norm": 0.10956934208464444, + "learning_rate": 1.1731521654929785e-05, + "loss": 0.3497, + "step": 3634 + }, + { + "epoch": 3.875266524520256, + "grad_norm": 0.12301783671574228, + "learning_rate": 1.1710466410000021e-05, + "loss": 0.3505, + "step": 3635 + }, + { + "epoch": 3.8763326226012795, + "grad_norm": 0.11590042052538112, + "learning_rate": 1.1689426836001618e-05, + "loss": 0.3478, + "step": 3636 + }, + { + "epoch": 3.8773987206823026, + "grad_norm": 0.10524997397293638, + "learning_rate": 1.1668402944589405e-05, + "loss": 0.3476, + "step": 3637 + }, + { + "epoch": 3.878464818763326, + "grad_norm": 0.13955869940094448, + "learning_rate": 1.1647394747409538e-05, + "loss": 0.3486, + "step": 3638 + }, + { + "epoch": 3.8795309168443497, + "grad_norm": 0.09733253071530201, + "learning_rate": 1.1626402256099439e-05, + "loss": 0.3501, + "step": 3639 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.13743368067281408, + "learning_rate": 1.1605425482287869e-05, + "loss": 0.3448, + "step": 3640 + }, + { + "epoch": 3.8816631130063968, + "grad_norm": 0.13018722986242173, + "learning_rate": 1.1584464437594875e-05, + "loss": 0.3507, + "step": 3641 + }, + { + "epoch": 3.88272921108742, + "grad_norm": 0.11788165392769949, + "learning_rate": 1.1563519133631793e-05, + "loss": 0.3506, + "step": 3642 + }, + { + "epoch": 3.8837953091684434, + "grad_norm": 0.13701749120457338, + "learning_rate": 1.1542589582001215e-05, + "loss": 0.3456, + "step": 3643 + }, + { + "epoch": 3.884861407249467, + "grad_norm": 0.10813589381609363, + "learning_rate": 1.1521675794297028e-05, + "loss": 0.3483, + "step": 3644 + }, + { + "epoch": 3.8859275053304905, + "grad_norm": 0.1224953861353445, + "learning_rate": 1.1500777782104406e-05, + "loss": 0.3417, + "step": 3645 + }, + { + "epoch": 3.886993603411514, + "grad_norm": 0.11152538352587757, + "learning_rate": 1.1479895556999731e-05, + "loss": 0.3514, + "step": 3646 + }, + { + "epoch": 3.888059701492537, + "grad_norm": 0.11316497383817867, + "learning_rate": 1.145902913055068e-05, + "loss": 0.3523, + "step": 3647 + }, + { + "epoch": 3.8891257995735606, + "grad_norm": 0.11019891396827555, + "learning_rate": 1.1438178514316181e-05, + "loss": 0.3498, + "step": 3648 + }, + { + "epoch": 3.890191897654584, + "grad_norm": 0.10842084319722904, + "learning_rate": 1.1417343719846387e-05, + "loss": 0.354, + "step": 3649 + }, + { + "epoch": 3.8912579957356077, + "grad_norm": 0.1056924090606256, + "learning_rate": 1.1396524758682678e-05, + "loss": 0.3456, + "step": 3650 + }, + { + "epoch": 3.8923240938166312, + "grad_norm": 0.10490460770794618, + "learning_rate": 1.137572164235769e-05, + "loss": 0.3492, + "step": 3651 + }, + { + "epoch": 3.8933901918976543, + "grad_norm": 0.09975927894418671, + "learning_rate": 1.1354934382395272e-05, + "loss": 0.3509, + "step": 3652 + }, + { + "epoch": 3.894456289978678, + "grad_norm": 0.10145902183398905, + "learning_rate": 1.1334162990310471e-05, + "loss": 0.3447, + "step": 3653 + }, + { + "epoch": 3.8955223880597014, + "grad_norm": 0.10481224542330943, + "learning_rate": 1.1313407477609561e-05, + "loss": 0.347, + "step": 3654 + }, + { + "epoch": 3.896588486140725, + "grad_norm": 0.10001648400892982, + "learning_rate": 1.1292667855790027e-05, + "loss": 0.3513, + "step": 3655 + }, + { + "epoch": 3.8976545842217485, + "grad_norm": 0.09792997968349304, + "learning_rate": 1.1271944136340544e-05, + "loss": 0.3423, + "step": 3656 + }, + { + "epoch": 3.8987206823027716, + "grad_norm": 0.1087579485635863, + "learning_rate": 1.1251236330740962e-05, + "loss": 0.3518, + "step": 3657 + }, + { + "epoch": 3.8997867803837956, + "grad_norm": 0.10273076928541128, + "learning_rate": 1.123054445046233e-05, + "loss": 0.3403, + "step": 3658 + }, + { + "epoch": 3.9008528784648187, + "grad_norm": 0.11360397048458296, + "learning_rate": 1.1209868506966881e-05, + "loss": 0.3479, + "step": 3659 + }, + { + "epoch": 3.901918976545842, + "grad_norm": 0.09676431212644183, + "learning_rate": 1.118920851170803e-05, + "loss": 0.3412, + "step": 3660 + }, + { + "epoch": 3.9029850746268657, + "grad_norm": 0.09307333554900554, + "learning_rate": 1.1168564476130301e-05, + "loss": 0.3482, + "step": 3661 + }, + { + "epoch": 3.9040511727078893, + "grad_norm": 0.10397754347232523, + "learning_rate": 1.1147936411669446e-05, + "loss": 0.3481, + "step": 3662 + }, + { + "epoch": 3.905117270788913, + "grad_norm": 0.08937777918666276, + "learning_rate": 1.1127324329752342e-05, + "loss": 0.3483, + "step": 3663 + }, + { + "epoch": 3.906183368869936, + "grad_norm": 0.09042966313131054, + "learning_rate": 1.110672824179699e-05, + "loss": 0.3487, + "step": 3664 + }, + { + "epoch": 3.9072494669509594, + "grad_norm": 0.09557462612752123, + "learning_rate": 1.1086148159212562e-05, + "loss": 0.3536, + "step": 3665 + }, + { + "epoch": 3.908315565031983, + "grad_norm": 0.09035818870121247, + "learning_rate": 1.1065584093399373e-05, + "loss": 0.3519, + "step": 3666 + }, + { + "epoch": 3.9093816631130065, + "grad_norm": 0.0942584757184624, + "learning_rate": 1.1045036055748817e-05, + "loss": 0.3448, + "step": 3667 + }, + { + "epoch": 3.91044776119403, + "grad_norm": 0.09364257142264377, + "learning_rate": 1.102450405764345e-05, + "loss": 0.3504, + "step": 3668 + }, + { + "epoch": 3.911513859275053, + "grad_norm": 0.09007339517858659, + "learning_rate": 1.100398811045695e-05, + "loss": 0.3501, + "step": 3669 + }, + { + "epoch": 3.9125799573560767, + "grad_norm": 0.08911386586011126, + "learning_rate": 1.0983488225554053e-05, + "loss": 0.3493, + "step": 3670 + }, + { + "epoch": 3.9136460554371, + "grad_norm": 0.09675034869903551, + "learning_rate": 1.0963004414290653e-05, + "loss": 0.3572, + "step": 3671 + }, + { + "epoch": 3.9147121535181237, + "grad_norm": 0.09188043129655729, + "learning_rate": 1.0942536688013713e-05, + "loss": 0.3541, + "step": 3672 + }, + { + "epoch": 3.9157782515991473, + "grad_norm": 0.09937464367903363, + "learning_rate": 1.0922085058061285e-05, + "loss": 0.3482, + "step": 3673 + }, + { + "epoch": 3.9168443496801704, + "grad_norm": 0.09926679761687571, + "learning_rate": 1.0901649535762506e-05, + "loss": 0.3567, + "step": 3674 + }, + { + "epoch": 3.917910447761194, + "grad_norm": 0.09965290213224968, + "learning_rate": 1.0881230132437608e-05, + "loss": 0.3462, + "step": 3675 + }, + { + "epoch": 3.9189765458422174, + "grad_norm": 0.1003530954835924, + "learning_rate": 1.0860826859397862e-05, + "loss": 0.3472, + "step": 3676 + }, + { + "epoch": 3.920042643923241, + "grad_norm": 0.09596106891446698, + "learning_rate": 1.0840439727945626e-05, + "loss": 0.3479, + "step": 3677 + }, + { + "epoch": 3.9211087420042645, + "grad_norm": 0.09919666795826218, + "learning_rate": 1.0820068749374327e-05, + "loss": 0.3449, + "step": 3678 + }, + { + "epoch": 3.9221748400852876, + "grad_norm": 0.1012132428938292, + "learning_rate": 1.0799713934968406e-05, + "loss": 0.3512, + "step": 3679 + }, + { + "epoch": 3.923240938166311, + "grad_norm": 0.09755542244495895, + "learning_rate": 1.0779375296003374e-05, + "loss": 0.352, + "step": 3680 + }, + { + "epoch": 3.9243070362473347, + "grad_norm": 0.09000645950013296, + "learning_rate": 1.0759052843745806e-05, + "loss": 0.3468, + "step": 3681 + }, + { + "epoch": 3.925373134328358, + "grad_norm": 0.09876937193470242, + "learning_rate": 1.073874658945325e-05, + "loss": 0.3504, + "step": 3682 + }, + { + "epoch": 3.9264392324093818, + "grad_norm": 0.09239651298132856, + "learning_rate": 1.0718456544374333e-05, + "loss": 0.3491, + "step": 3683 + }, + { + "epoch": 3.927505330490405, + "grad_norm": 0.10058467408628505, + "learning_rate": 1.0698182719748682e-05, + "loss": 0.3511, + "step": 3684 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 0.08600735325668393, + "learning_rate": 1.0677925126806956e-05, + "loss": 0.3421, + "step": 3685 + }, + { + "epoch": 3.929637526652452, + "grad_norm": 0.09083257850771187, + "learning_rate": 1.0657683776770788e-05, + "loss": 0.3493, + "step": 3686 + }, + { + "epoch": 3.9307036247334755, + "grad_norm": 0.09314333955394852, + "learning_rate": 1.0637458680852841e-05, + "loss": 0.3533, + "step": 3687 + }, + { + "epoch": 3.931769722814499, + "grad_norm": 0.09360495013685566, + "learning_rate": 1.0617249850256766e-05, + "loss": 0.349, + "step": 3688 + }, + { + "epoch": 3.9328358208955225, + "grad_norm": 0.09627416992471283, + "learning_rate": 1.0597057296177225e-05, + "loss": 0.3516, + "step": 3689 + }, + { + "epoch": 3.933901918976546, + "grad_norm": 0.09541087809563052, + "learning_rate": 1.0576881029799808e-05, + "loss": 0.3543, + "step": 3690 + }, + { + "epoch": 3.934968017057569, + "grad_norm": 0.10348458965898262, + "learning_rate": 1.0556721062301141e-05, + "loss": 0.3567, + "step": 3691 + }, + { + "epoch": 3.9360341151385927, + "grad_norm": 0.09872750858387926, + "learning_rate": 1.05365774048488e-05, + "loss": 0.3491, + "step": 3692 + }, + { + "epoch": 3.9371002132196162, + "grad_norm": 0.10350018663429181, + "learning_rate": 1.051645006860131e-05, + "loss": 0.3498, + "step": 3693 + }, + { + "epoch": 3.9381663113006398, + "grad_norm": 0.10594157752622746, + "learning_rate": 1.0496339064708172e-05, + "loss": 0.3506, + "step": 3694 + }, + { + "epoch": 3.9392324093816633, + "grad_norm": 0.10372816420096216, + "learning_rate": 1.0476244404309846e-05, + "loss": 0.3514, + "step": 3695 + }, + { + "epoch": 3.9402985074626864, + "grad_norm": 0.10601590445244279, + "learning_rate": 1.0456166098537737e-05, + "loss": 0.3444, + "step": 3696 + }, + { + "epoch": 3.94136460554371, + "grad_norm": 0.11008051797371131, + "learning_rate": 1.0436104158514158e-05, + "loss": 0.3473, + "step": 3697 + }, + { + "epoch": 3.9424307036247335, + "grad_norm": 0.09606351168418183, + "learning_rate": 1.0416058595352391e-05, + "loss": 0.3484, + "step": 3698 + }, + { + "epoch": 3.943496801705757, + "grad_norm": 0.1071996803306346, + "learning_rate": 1.039602942015664e-05, + "loss": 0.3476, + "step": 3699 + }, + { + "epoch": 3.9445628997867805, + "grad_norm": 0.09623667142559411, + "learning_rate": 1.0376016644022044e-05, + "loss": 0.3447, + "step": 3700 + }, + { + "epoch": 3.9456289978678036, + "grad_norm": 0.10721079752870356, + "learning_rate": 1.035602027803461e-05, + "loss": 0.3469, + "step": 3701 + }, + { + "epoch": 3.946695095948827, + "grad_norm": 0.1025843055534788, + "learning_rate": 1.0336040333271295e-05, + "loss": 0.3447, + "step": 3702 + }, + { + "epoch": 3.9477611940298507, + "grad_norm": 0.09871032111134458, + "learning_rate": 1.0316076820799968e-05, + "loss": 0.3529, + "step": 3703 + }, + { + "epoch": 3.9488272921108742, + "grad_norm": 0.10311671354974446, + "learning_rate": 1.029612975167935e-05, + "loss": 0.3474, + "step": 3704 + }, + { + "epoch": 3.949893390191898, + "grad_norm": 0.10336281514714324, + "learning_rate": 1.0276199136959097e-05, + "loss": 0.3512, + "step": 3705 + }, + { + "epoch": 3.950959488272921, + "grad_norm": 0.10601850200687606, + "learning_rate": 1.025628498767973e-05, + "loss": 0.3514, + "step": 3706 + }, + { + "epoch": 3.9520255863539444, + "grad_norm": 0.08908113002105139, + "learning_rate": 1.0236387314872664e-05, + "loss": 0.3435, + "step": 3707 + }, + { + "epoch": 3.953091684434968, + "grad_norm": 0.10671007341009846, + "learning_rate": 1.0216506129560155e-05, + "loss": 0.3551, + "step": 3708 + }, + { + "epoch": 3.9541577825159915, + "grad_norm": 0.09784872373144313, + "learning_rate": 1.0196641442755354e-05, + "loss": 0.3466, + "step": 3709 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.11088241430701273, + "learning_rate": 1.0176793265462282e-05, + "loss": 0.3455, + "step": 3710 + }, + { + "epoch": 3.956289978678038, + "grad_norm": 0.0910427685127364, + "learning_rate": 1.0156961608675768e-05, + "loss": 0.3505, + "step": 3711 + }, + { + "epoch": 3.957356076759062, + "grad_norm": 0.10461641359651551, + "learning_rate": 1.0137146483381538e-05, + "loss": 0.354, + "step": 3712 + }, + { + "epoch": 3.958422174840085, + "grad_norm": 0.09627545711712145, + "learning_rate": 1.0117347900556137e-05, + "loss": 0.3513, + "step": 3713 + }, + { + "epoch": 3.9594882729211087, + "grad_norm": 0.1073622509994735, + "learning_rate": 1.0097565871166961e-05, + "loss": 0.3546, + "step": 3714 + }, + { + "epoch": 3.9605543710021323, + "grad_norm": 0.09484625771277952, + "learning_rate": 1.0077800406172207e-05, + "loss": 0.3524, + "step": 3715 + }, + { + "epoch": 3.961620469083156, + "grad_norm": 0.08894481152860242, + "learning_rate": 1.0058051516520929e-05, + "loss": 0.3474, + "step": 3716 + }, + { + "epoch": 3.9626865671641793, + "grad_norm": 0.09733312978906855, + "learning_rate": 1.0038319213152979e-05, + "loss": 0.3495, + "step": 3717 + }, + { + "epoch": 3.9637526652452024, + "grad_norm": 0.09864930949926093, + "learning_rate": 1.001860350699904e-05, + "loss": 0.3496, + "step": 3718 + }, + { + "epoch": 3.964818763326226, + "grad_norm": 0.0981894103889243, + "learning_rate": 9.99890440898057e-06, + "loss": 0.3529, + "step": 3719 + }, + { + "epoch": 3.9658848614072495, + "grad_norm": 0.10482896841184723, + "learning_rate": 9.97922193000985e-06, + "loss": 0.3519, + "step": 3720 + }, + { + "epoch": 3.966950959488273, + "grad_norm": 0.10498811386677907, + "learning_rate": 9.959556080989973e-06, + "loss": 0.3488, + "step": 3721 + }, + { + "epoch": 3.9680170575692966, + "grad_norm": 0.11366999497073144, + "learning_rate": 9.939906872814764e-06, + "loss": 0.3535, + "step": 3722 + }, + { + "epoch": 3.9690831556503197, + "grad_norm": 0.09665112411107446, + "learning_rate": 9.920274316368879e-06, + "loss": 0.3511, + "step": 3723 + }, + { + "epoch": 3.970149253731343, + "grad_norm": 0.10146451503365203, + "learning_rate": 9.900658422527734e-06, + "loss": 0.346, + "step": 3724 + }, + { + "epoch": 3.9712153518123667, + "grad_norm": 0.10925095180031567, + "learning_rate": 9.881059202157525e-06, + "loss": 0.3507, + "step": 3725 + }, + { + "epoch": 3.9722814498933903, + "grad_norm": 0.09225433257638278, + "learning_rate": 9.86147666611518e-06, + "loss": 0.3482, + "step": 3726 + }, + { + "epoch": 3.973347547974414, + "grad_norm": 0.0943825856008904, + "learning_rate": 9.841910825248412e-06, + "loss": 0.3464, + "step": 3727 + }, + { + "epoch": 3.974413646055437, + "grad_norm": 0.09290035671178411, + "learning_rate": 9.82236169039569e-06, + "loss": 0.3449, + "step": 3728 + }, + { + "epoch": 3.9754797441364604, + "grad_norm": 0.0921926282044607, + "learning_rate": 9.802829272386227e-06, + "loss": 0.3554, + "step": 3729 + }, + { + "epoch": 3.976545842217484, + "grad_norm": 0.0960555525918967, + "learning_rate": 9.783313582039935e-06, + "loss": 0.3483, + "step": 3730 + }, + { + "epoch": 3.9776119402985075, + "grad_norm": 0.0917415114727513, + "learning_rate": 9.763814630167516e-06, + "loss": 0.3458, + "step": 3731 + }, + { + "epoch": 3.978678038379531, + "grad_norm": 0.08801795546770015, + "learning_rate": 9.744332427570384e-06, + "loss": 0.3436, + "step": 3732 + }, + { + "epoch": 3.979744136460554, + "grad_norm": 0.09754716872379321, + "learning_rate": 9.72486698504064e-06, + "loss": 0.3406, + "step": 3733 + }, + { + "epoch": 3.9808102345415777, + "grad_norm": 0.09042401583271688, + "learning_rate": 9.705418313361141e-06, + "loss": 0.3486, + "step": 3734 + }, + { + "epoch": 3.981876332622601, + "grad_norm": 0.08028758020380314, + "learning_rate": 9.685986423305449e-06, + "loss": 0.3494, + "step": 3735 + }, + { + "epoch": 3.9829424307036247, + "grad_norm": 0.09007532159921652, + "learning_rate": 9.666571325637806e-06, + "loss": 0.3482, + "step": 3736 + }, + { + "epoch": 3.9840085287846483, + "grad_norm": 0.09260516061644863, + "learning_rate": 9.647173031113173e-06, + "loss": 0.3452, + "step": 3737 + }, + { + "epoch": 3.9850746268656714, + "grad_norm": 0.09089810495372147, + "learning_rate": 9.627791550477209e-06, + "loss": 0.3462, + "step": 3738 + }, + { + "epoch": 3.9861407249466954, + "grad_norm": 0.09771736284518276, + "learning_rate": 9.608426894466225e-06, + "loss": 0.3532, + "step": 3739 + }, + { + "epoch": 3.9872068230277184, + "grad_norm": 0.1259794070436244, + "learning_rate": 9.589079073807244e-06, + "loss": 0.349, + "step": 3740 + }, + { + "epoch": 3.988272921108742, + "grad_norm": 0.09140483649945423, + "learning_rate": 9.569748099217962e-06, + "loss": 0.3433, + "step": 3741 + }, + { + "epoch": 3.9893390191897655, + "grad_norm": 0.12248710790523741, + "learning_rate": 9.55043398140672e-06, + "loss": 0.3522, + "step": 3742 + }, + { + "epoch": 3.990405117270789, + "grad_norm": 0.09833501236467464, + "learning_rate": 9.53113673107254e-06, + "loss": 0.3531, + "step": 3743 + }, + { + "epoch": 3.9914712153518126, + "grad_norm": 0.10722069561647879, + "learning_rate": 9.511856358905108e-06, + "loss": 0.3513, + "step": 3744 + }, + { + "epoch": 3.9925373134328357, + "grad_norm": 0.12373429125821357, + "learning_rate": 9.492592875584728e-06, + "loss": 0.3498, + "step": 3745 + }, + { + "epoch": 3.9936034115138592, + "grad_norm": 0.10792510560456024, + "learning_rate": 9.473346291782376e-06, + "loss": 0.3532, + "step": 3746 + }, + { + "epoch": 3.9946695095948828, + "grad_norm": 0.10969296551778704, + "learning_rate": 9.454116618159675e-06, + "loss": 0.3481, + "step": 3747 + }, + { + "epoch": 3.9957356076759063, + "grad_norm": 0.10095085119982077, + "learning_rate": 9.434903865368837e-06, + "loss": 0.3484, + "step": 3748 + }, + { + "epoch": 3.99680170575693, + "grad_norm": 0.1008779164840716, + "learning_rate": 9.415708044052744e-06, + "loss": 0.3515, + "step": 3749 + }, + { + "epoch": 3.997867803837953, + "grad_norm": 0.09334484031468043, + "learning_rate": 9.396529164844893e-06, + "loss": 0.3536, + "step": 3750 + }, + { + "epoch": 3.9989339019189765, + "grad_norm": 0.08642194713513292, + "learning_rate": 9.377367238369368e-06, + "loss": 0.3451, + "step": 3751 + }, + { + "epoch": 4.0, + "grad_norm": 0.1134408652616496, + "learning_rate": 9.358222275240884e-06, + "loss": 0.336, + "step": 3752 + }, + { + "epoch": 4.001066098081023, + "grad_norm": 0.104623946486961, + "learning_rate": 9.33909428606476e-06, + "loss": 0.3372, + "step": 3753 + }, + { + "epoch": 4.002132196162047, + "grad_norm": 0.10983574887630808, + "learning_rate": 9.31998328143692e-06, + "loss": 0.3316, + "step": 3754 + }, + { + "epoch": 4.00319829424307, + "grad_norm": 0.10886704749727195, + "learning_rate": 9.30088927194384e-06, + "loss": 0.3346, + "step": 3755 + }, + { + "epoch": 4.004264392324094, + "grad_norm": 0.11057981475076427, + "learning_rate": 9.281812268162626e-06, + "loss": 0.3377, + "step": 3756 + }, + { + "epoch": 4.005330490405117, + "grad_norm": 0.12400384882704346, + "learning_rate": 9.262752280660944e-06, + "loss": 0.3342, + "step": 3757 + }, + { + "epoch": 4.00639658848614, + "grad_norm": 0.11298478774918766, + "learning_rate": 9.243709319997047e-06, + "loss": 0.338, + "step": 3758 + }, + { + "epoch": 4.007462686567164, + "grad_norm": 0.13596408021241022, + "learning_rate": 9.224683396719728e-06, + "loss": 0.335, + "step": 3759 + }, + { + "epoch": 4.008528784648187, + "grad_norm": 0.11672755460945555, + "learning_rate": 9.205674521368362e-06, + "loss": 0.3316, + "step": 3760 + }, + { + "epoch": 4.009594882729211, + "grad_norm": 0.11331525635027023, + "learning_rate": 9.186682704472898e-06, + "loss": 0.3294, + "step": 3761 + }, + { + "epoch": 4.0106609808102345, + "grad_norm": 0.1195438195692172, + "learning_rate": 9.167707956553787e-06, + "loss": 0.3343, + "step": 3762 + }, + { + "epoch": 4.011727078891258, + "grad_norm": 0.1091957644805815, + "learning_rate": 9.148750288122063e-06, + "loss": 0.3348, + "step": 3763 + }, + { + "epoch": 4.0127931769722816, + "grad_norm": 0.11204673273341918, + "learning_rate": 9.129809709679297e-06, + "loss": 0.3321, + "step": 3764 + }, + { + "epoch": 4.013859275053305, + "grad_norm": 0.10282221122253991, + "learning_rate": 9.110886231717595e-06, + "loss": 0.3339, + "step": 3765 + }, + { + "epoch": 4.014925373134329, + "grad_norm": 0.10864476435323264, + "learning_rate": 9.09197986471955e-06, + "loss": 0.3355, + "step": 3766 + }, + { + "epoch": 4.015991471215352, + "grad_norm": 0.10772893714735123, + "learning_rate": 9.073090619158322e-06, + "loss": 0.3306, + "step": 3767 + }, + { + "epoch": 4.017057569296376, + "grad_norm": 0.11249780372035993, + "learning_rate": 9.054218505497587e-06, + "loss": 0.3343, + "step": 3768 + }, + { + "epoch": 4.018123667377399, + "grad_norm": 0.1126010464588701, + "learning_rate": 9.035363534191486e-06, + "loss": 0.3312, + "step": 3769 + }, + { + "epoch": 4.019189765458422, + "grad_norm": 0.12907028042017912, + "learning_rate": 9.016525715684711e-06, + "loss": 0.3327, + "step": 3770 + }, + { + "epoch": 4.020255863539446, + "grad_norm": 0.11807831450158258, + "learning_rate": 8.99770506041243e-06, + "loss": 0.3386, + "step": 3771 + }, + { + "epoch": 4.021321961620469, + "grad_norm": 0.10963014735265454, + "learning_rate": 8.978901578800316e-06, + "loss": 0.3309, + "step": 3772 + }, + { + "epoch": 4.022388059701493, + "grad_norm": 0.11867805956803665, + "learning_rate": 8.960115281264507e-06, + "loss": 0.3423, + "step": 3773 + }, + { + "epoch": 4.023454157782516, + "grad_norm": 0.11598119651875034, + "learning_rate": 8.941346178211639e-06, + "loss": 0.3281, + "step": 3774 + }, + { + "epoch": 4.024520255863539, + "grad_norm": 0.10424229151793267, + "learning_rate": 8.922594280038823e-06, + "loss": 0.323, + "step": 3775 + }, + { + "epoch": 4.025586353944563, + "grad_norm": 0.10762211731941046, + "learning_rate": 8.903859597133646e-06, + "loss": 0.3344, + "step": 3776 + }, + { + "epoch": 4.026652452025586, + "grad_norm": 0.11394090140716011, + "learning_rate": 8.88514213987413e-06, + "loss": 0.3331, + "step": 3777 + }, + { + "epoch": 4.02771855010661, + "grad_norm": 0.09343973744188745, + "learning_rate": 8.866441918628777e-06, + "loss": 0.333, + "step": 3778 + }, + { + "epoch": 4.028784648187633, + "grad_norm": 0.11583112915467723, + "learning_rate": 8.847758943756556e-06, + "loss": 0.3385, + "step": 3779 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.10675906671526333, + "learning_rate": 8.829093225606842e-06, + "loss": 0.3338, + "step": 3780 + }, + { + "epoch": 4.03091684434968, + "grad_norm": 0.10418803649968214, + "learning_rate": 8.810444774519475e-06, + "loss": 0.3383, + "step": 3781 + }, + { + "epoch": 4.031982942430703, + "grad_norm": 0.12106960416467054, + "learning_rate": 8.791813600824742e-06, + "loss": 0.3378, + "step": 3782 + }, + { + "epoch": 4.033049040511727, + "grad_norm": 0.11769429634565491, + "learning_rate": 8.773199714843339e-06, + "loss": 0.3326, + "step": 3783 + }, + { + "epoch": 4.0341151385927505, + "grad_norm": 0.11355409773845418, + "learning_rate": 8.754603126886385e-06, + "loss": 0.3351, + "step": 3784 + }, + { + "epoch": 4.035181236673774, + "grad_norm": 0.09880842808847773, + "learning_rate": 8.736023847255426e-06, + "loss": 0.3356, + "step": 3785 + }, + { + "epoch": 4.036247334754798, + "grad_norm": 0.11158321630050068, + "learning_rate": 8.71746188624242e-06, + "loss": 0.3363, + "step": 3786 + }, + { + "epoch": 4.037313432835821, + "grad_norm": 0.1161769963643131, + "learning_rate": 8.698917254129732e-06, + "loss": 0.3328, + "step": 3787 + }, + { + "epoch": 4.038379530916845, + "grad_norm": 0.11351757271831203, + "learning_rate": 8.680389961190116e-06, + "loss": 0.3311, + "step": 3788 + }, + { + "epoch": 4.039445628997868, + "grad_norm": 0.10521775128019155, + "learning_rate": 8.66188001768673e-06, + "loss": 0.3346, + "step": 3789 + }, + { + "epoch": 4.040511727078891, + "grad_norm": 0.10112600207752875, + "learning_rate": 8.64338743387314e-06, + "loss": 0.3365, + "step": 3790 + }, + { + "epoch": 4.041577825159915, + "grad_norm": 0.10617915170837067, + "learning_rate": 8.624912219993248e-06, + "loss": 0.3332, + "step": 3791 + }, + { + "epoch": 4.042643923240938, + "grad_norm": 0.10169862043274906, + "learning_rate": 8.606454386281368e-06, + "loss": 0.3276, + "step": 3792 + }, + { + "epoch": 4.043710021321962, + "grad_norm": 0.10543707108073325, + "learning_rate": 8.588013942962195e-06, + "loss": 0.337, + "step": 3793 + }, + { + "epoch": 4.044776119402985, + "grad_norm": 0.10597483996500817, + "learning_rate": 8.569590900250775e-06, + "loss": 0.3389, + "step": 3794 + }, + { + "epoch": 4.045842217484009, + "grad_norm": 0.10326713662353049, + "learning_rate": 8.551185268352502e-06, + "loss": 0.3337, + "step": 3795 + }, + { + "epoch": 4.046908315565032, + "grad_norm": 0.0966216388278116, + "learning_rate": 8.532797057463145e-06, + "loss": 0.3398, + "step": 3796 + }, + { + "epoch": 4.047974413646055, + "grad_norm": 0.09422934050818582, + "learning_rate": 8.51442627776883e-06, + "loss": 0.3373, + "step": 3797 + }, + { + "epoch": 4.049040511727079, + "grad_norm": 0.1036405776308619, + "learning_rate": 8.496072939445997e-06, + "loss": 0.3336, + "step": 3798 + }, + { + "epoch": 4.050106609808102, + "grad_norm": 0.10297234333425688, + "learning_rate": 8.477737052661444e-06, + "loss": 0.3357, + "step": 3799 + }, + { + "epoch": 4.051172707889126, + "grad_norm": 0.09051843440055769, + "learning_rate": 8.459418627572304e-06, + "loss": 0.3356, + "step": 3800 + }, + { + "epoch": 4.052238805970149, + "grad_norm": 0.09979444399952614, + "learning_rate": 8.44111767432604e-06, + "loss": 0.3383, + "step": 3801 + }, + { + "epoch": 4.053304904051172, + "grad_norm": 0.09217956240397418, + "learning_rate": 8.422834203060418e-06, + "loss": 0.3324, + "step": 3802 + }, + { + "epoch": 4.054371002132196, + "grad_norm": 0.09053551898130377, + "learning_rate": 8.404568223903529e-06, + "loss": 0.3336, + "step": 3803 + }, + { + "epoch": 4.0554371002132195, + "grad_norm": 0.09177723542384852, + "learning_rate": 8.386319746973787e-06, + "loss": 0.3339, + "step": 3804 + }, + { + "epoch": 4.056503198294243, + "grad_norm": 0.09189669268340435, + "learning_rate": 8.36808878237989e-06, + "loss": 0.3382, + "step": 3805 + }, + { + "epoch": 4.0575692963752665, + "grad_norm": 0.0942192210220161, + "learning_rate": 8.349875340220847e-06, + "loss": 0.327, + "step": 3806 + }, + { + "epoch": 4.05863539445629, + "grad_norm": 0.09403630781253433, + "learning_rate": 8.331679430585971e-06, + "loss": 0.3406, + "step": 3807 + }, + { + "epoch": 4.059701492537314, + "grad_norm": 0.10368304373550687, + "learning_rate": 8.313501063554827e-06, + "loss": 0.3329, + "step": 3808 + }, + { + "epoch": 4.060767590618337, + "grad_norm": 0.09734440431718942, + "learning_rate": 8.295340249197301e-06, + "loss": 0.3339, + "step": 3809 + }, + { + "epoch": 4.061833688699361, + "grad_norm": 0.09789072226393697, + "learning_rate": 8.277196997573545e-06, + "loss": 0.3392, + "step": 3810 + }, + { + "epoch": 4.062899786780384, + "grad_norm": 0.11027941919044942, + "learning_rate": 8.259071318733962e-06, + "loss": 0.3326, + "step": 3811 + }, + { + "epoch": 4.063965884861407, + "grad_norm": 0.09341818511371246, + "learning_rate": 8.240963222719243e-06, + "loss": 0.3304, + "step": 3812 + }, + { + "epoch": 4.065031982942431, + "grad_norm": 0.0985386753548714, + "learning_rate": 8.222872719560339e-06, + "loss": 0.3373, + "step": 3813 + }, + { + "epoch": 4.066098081023454, + "grad_norm": 0.09601340900666522, + "learning_rate": 8.204799819278438e-06, + "loss": 0.3352, + "step": 3814 + }, + { + "epoch": 4.067164179104478, + "grad_norm": 0.11111998488141665, + "learning_rate": 8.186744531884989e-06, + "loss": 0.3399, + "step": 3815 + }, + { + "epoch": 4.068230277185501, + "grad_norm": 0.09039086409906084, + "learning_rate": 8.168706867381692e-06, + "loss": 0.3316, + "step": 3816 + }, + { + "epoch": 4.069296375266524, + "grad_norm": 0.09819313076741153, + "learning_rate": 8.150686835760467e-06, + "loss": 0.3389, + "step": 3817 + }, + { + "epoch": 4.070362473347548, + "grad_norm": 0.10154340371524484, + "learning_rate": 8.132684447003471e-06, + "loss": 0.3348, + "step": 3818 + }, + { + "epoch": 4.071428571428571, + "grad_norm": 0.08699449860021097, + "learning_rate": 8.114699711083113e-06, + "loss": 0.3377, + "step": 3819 + }, + { + "epoch": 4.072494669509595, + "grad_norm": 0.10459986672211814, + "learning_rate": 8.096732637961974e-06, + "loss": 0.3366, + "step": 3820 + }, + { + "epoch": 4.073560767590618, + "grad_norm": 0.09394697895679324, + "learning_rate": 8.078783237592894e-06, + "loss": 0.3309, + "step": 3821 + }, + { + "epoch": 4.074626865671641, + "grad_norm": 0.0914538941685346, + "learning_rate": 8.060851519918901e-06, + "loss": 0.3361, + "step": 3822 + }, + { + "epoch": 4.075692963752665, + "grad_norm": 0.08804168096705205, + "learning_rate": 8.042937494873255e-06, + "loss": 0.3342, + "step": 3823 + }, + { + "epoch": 4.076759061833688, + "grad_norm": 0.08507323171593308, + "learning_rate": 8.025041172379366e-06, + "loss": 0.3338, + "step": 3824 + }, + { + "epoch": 4.077825159914712, + "grad_norm": 0.08903348958715734, + "learning_rate": 8.007162562350882e-06, + "loss": 0.3372, + "step": 3825 + }, + { + "epoch": 4.0788912579957355, + "grad_norm": 0.08728752400888612, + "learning_rate": 7.989301674691634e-06, + "loss": 0.3299, + "step": 3826 + }, + { + "epoch": 4.0799573560767595, + "grad_norm": 0.10425110765874443, + "learning_rate": 7.971458519295598e-06, + "loss": 0.3363, + "step": 3827 + }, + { + "epoch": 4.081023454157783, + "grad_norm": 0.08654110423017557, + "learning_rate": 7.953633106046971e-06, + "loss": 0.3362, + "step": 3828 + }, + { + "epoch": 4.082089552238806, + "grad_norm": 0.09648199398820004, + "learning_rate": 7.935825444820109e-06, + "loss": 0.3336, + "step": 3829 + }, + { + "epoch": 4.08315565031983, + "grad_norm": 0.09601930808087997, + "learning_rate": 7.918035545479532e-06, + "loss": 0.3331, + "step": 3830 + }, + { + "epoch": 4.084221748400853, + "grad_norm": 0.08913546055906947, + "learning_rate": 7.900263417879905e-06, + "loss": 0.3333, + "step": 3831 + }, + { + "epoch": 4.085287846481877, + "grad_norm": 0.08746186482392278, + "learning_rate": 7.882509071866074e-06, + "loss": 0.3351, + "step": 3832 + }, + { + "epoch": 4.0863539445629, + "grad_norm": 0.09436654519402421, + "learning_rate": 7.864772517273019e-06, + "loss": 0.3287, + "step": 3833 + }, + { + "epoch": 4.087420042643923, + "grad_norm": 0.09443565575400105, + "learning_rate": 7.847053763925884e-06, + "loss": 0.3306, + "step": 3834 + }, + { + "epoch": 4.088486140724947, + "grad_norm": 0.09012184838426746, + "learning_rate": 7.829352821639915e-06, + "loss": 0.3345, + "step": 3835 + }, + { + "epoch": 4.08955223880597, + "grad_norm": 0.08459214899851109, + "learning_rate": 7.811669700220523e-06, + "loss": 0.3338, + "step": 3836 + }, + { + "epoch": 4.090618336886994, + "grad_norm": 0.0940823716359374, + "learning_rate": 7.794004409463256e-06, + "loss": 0.3333, + "step": 3837 + }, + { + "epoch": 4.091684434968017, + "grad_norm": 0.09658670184543283, + "learning_rate": 7.77635695915374e-06, + "loss": 0.332, + "step": 3838 + }, + { + "epoch": 4.09275053304904, + "grad_norm": 0.09760335361225825, + "learning_rate": 7.758727359067752e-06, + "loss": 0.334, + "step": 3839 + }, + { + "epoch": 4.093816631130064, + "grad_norm": 0.09269720585969814, + "learning_rate": 7.741115618971182e-06, + "loss": 0.3381, + "step": 3840 + }, + { + "epoch": 4.094882729211087, + "grad_norm": 0.08955928469567048, + "learning_rate": 7.723521748620023e-06, + "loss": 0.334, + "step": 3841 + }, + { + "epoch": 4.095948827292111, + "grad_norm": 0.09989251564772907, + "learning_rate": 7.705945757760349e-06, + "loss": 0.3324, + "step": 3842 + }, + { + "epoch": 4.097014925373134, + "grad_norm": 0.08940858807005556, + "learning_rate": 7.688387656128355e-06, + "loss": 0.3462, + "step": 3843 + }, + { + "epoch": 4.098081023454157, + "grad_norm": 0.09359897320243363, + "learning_rate": 7.670847453450325e-06, + "loss": 0.3359, + "step": 3844 + }, + { + "epoch": 4.099147121535181, + "grad_norm": 0.09401701464605473, + "learning_rate": 7.653325159442597e-06, + "loss": 0.3352, + "step": 3845 + }, + { + "epoch": 4.100213219616204, + "grad_norm": 0.08829874201235319, + "learning_rate": 7.63582078381163e-06, + "loss": 0.3375, + "step": 3846 + }, + { + "epoch": 4.101279317697228, + "grad_norm": 0.10865348890518493, + "learning_rate": 7.618334336253927e-06, + "loss": 0.3344, + "step": 3847 + }, + { + "epoch": 4.1023454157782515, + "grad_norm": 0.09122759099076949, + "learning_rate": 7.60086582645609e-06, + "loss": 0.3311, + "step": 3848 + }, + { + "epoch": 4.103411513859275, + "grad_norm": 0.08531789969850555, + "learning_rate": 7.5834152640947444e-06, + "loss": 0.3326, + "step": 3849 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.108511169138579, + "learning_rate": 7.565982658836599e-06, + "loss": 0.3299, + "step": 3850 + }, + { + "epoch": 4.105543710021322, + "grad_norm": 0.07907053738511821, + "learning_rate": 7.548568020338418e-06, + "loss": 0.3335, + "step": 3851 + }, + { + "epoch": 4.106609808102346, + "grad_norm": 0.09975341694550234, + "learning_rate": 7.531171358247009e-06, + "loss": 0.3352, + "step": 3852 + }, + { + "epoch": 4.107675906183369, + "grad_norm": 0.08769932023249331, + "learning_rate": 7.5137926821992055e-06, + "loss": 0.334, + "step": 3853 + }, + { + "epoch": 4.108742004264393, + "grad_norm": 0.10031133415920226, + "learning_rate": 7.496432001821898e-06, + "loss": 0.3311, + "step": 3854 + }, + { + "epoch": 4.109808102345416, + "grad_norm": 0.09300616655005942, + "learning_rate": 7.4790893267320115e-06, + "loss": 0.3347, + "step": 3855 + }, + { + "epoch": 4.110874200426439, + "grad_norm": 0.09170488621474711, + "learning_rate": 7.461764666536471e-06, + "loss": 0.3334, + "step": 3856 + }, + { + "epoch": 4.111940298507463, + "grad_norm": 0.10055859853923502, + "learning_rate": 7.444458030832238e-06, + "loss": 0.3395, + "step": 3857 + }, + { + "epoch": 4.113006396588486, + "grad_norm": 0.08680033935993048, + "learning_rate": 7.427169429206294e-06, + "loss": 0.3411, + "step": 3858 + }, + { + "epoch": 4.11407249466951, + "grad_norm": 0.09485145644525105, + "learning_rate": 7.409898871235639e-06, + "loss": 0.333, + "step": 3859 + }, + { + "epoch": 4.115138592750533, + "grad_norm": 0.09123152248422452, + "learning_rate": 7.39264636648724e-06, + "loss": 0.3331, + "step": 3860 + }, + { + "epoch": 4.116204690831556, + "grad_norm": 0.10403836531483734, + "learning_rate": 7.375411924518099e-06, + "loss": 0.328, + "step": 3861 + }, + { + "epoch": 4.11727078891258, + "grad_norm": 0.08367336466007777, + "learning_rate": 7.358195554875203e-06, + "loss": 0.3337, + "step": 3862 + }, + { + "epoch": 4.118336886993603, + "grad_norm": 0.10846732273711228, + "learning_rate": 7.340997267095535e-06, + "loss": 0.3333, + "step": 3863 + }, + { + "epoch": 4.119402985074627, + "grad_norm": 0.12164518439766003, + "learning_rate": 7.323817070706036e-06, + "loss": 0.3341, + "step": 3864 + }, + { + "epoch": 4.12046908315565, + "grad_norm": 0.09560739668306972, + "learning_rate": 7.3066549752236435e-06, + "loss": 0.3364, + "step": 3865 + }, + { + "epoch": 4.121535181236673, + "grad_norm": 0.09724336285167177, + "learning_rate": 7.289510990155286e-06, + "loss": 0.3264, + "step": 3866 + }, + { + "epoch": 4.122601279317697, + "grad_norm": 0.1017353520878921, + "learning_rate": 7.2723851249978114e-06, + "loss": 0.3305, + "step": 3867 + }, + { + "epoch": 4.1236673773987205, + "grad_norm": 0.1069625874611212, + "learning_rate": 7.255277389238075e-06, + "loss": 0.335, + "step": 3868 + }, + { + "epoch": 4.1247334754797444, + "grad_norm": 0.09669306285547304, + "learning_rate": 7.238187792352871e-06, + "loss": 0.328, + "step": 3869 + }, + { + "epoch": 4.1257995735607675, + "grad_norm": 0.10282239370897285, + "learning_rate": 7.221116343808963e-06, + "loss": 0.3311, + "step": 3870 + }, + { + "epoch": 4.126865671641791, + "grad_norm": 0.10105373041143043, + "learning_rate": 7.2040630530630175e-06, + "loss": 0.334, + "step": 3871 + }, + { + "epoch": 4.127931769722815, + "grad_norm": 0.0854802255803901, + "learning_rate": 7.187027929561683e-06, + "loss": 0.3374, + "step": 3872 + }, + { + "epoch": 4.128997867803838, + "grad_norm": 0.10520223879341609, + "learning_rate": 7.170010982741549e-06, + "loss": 0.3332, + "step": 3873 + }, + { + "epoch": 4.130063965884862, + "grad_norm": 0.09932971506774815, + "learning_rate": 7.153012222029097e-06, + "loss": 0.3313, + "step": 3874 + }, + { + "epoch": 4.131130063965885, + "grad_norm": 0.09719494548254369, + "learning_rate": 7.136031656840763e-06, + "loss": 0.3308, + "step": 3875 + }, + { + "epoch": 4.132196162046908, + "grad_norm": 0.08740775740958265, + "learning_rate": 7.1190692965829126e-06, + "loss": 0.3353, + "step": 3876 + }, + { + "epoch": 4.133262260127932, + "grad_norm": 0.08603123144210989, + "learning_rate": 7.102125150651784e-06, + "loss": 0.3288, + "step": 3877 + }, + { + "epoch": 4.134328358208955, + "grad_norm": 0.09256501786020642, + "learning_rate": 7.085199228433577e-06, + "loss": 0.3333, + "step": 3878 + }, + { + "epoch": 4.135394456289979, + "grad_norm": 0.09108945673753906, + "learning_rate": 7.068291539304368e-06, + "loss": 0.3381, + "step": 3879 + }, + { + "epoch": 4.136460554371002, + "grad_norm": 0.0796397200521715, + "learning_rate": 7.0514020926301285e-06, + "loss": 0.3314, + "step": 3880 + }, + { + "epoch": 4.137526652452026, + "grad_norm": 0.09620207070657663, + "learning_rate": 7.034530897766738e-06, + "loss": 0.3348, + "step": 3881 + }, + { + "epoch": 4.138592750533049, + "grad_norm": 0.08902821892098343, + "learning_rate": 7.017677964059979e-06, + "loss": 0.336, + "step": 3882 + }, + { + "epoch": 4.139658848614072, + "grad_norm": 0.08377254099774022, + "learning_rate": 7.000843300845473e-06, + "loss": 0.3312, + "step": 3883 + }, + { + "epoch": 4.140724946695096, + "grad_norm": 0.09387188110362456, + "learning_rate": 6.984026917448763e-06, + "loss": 0.3355, + "step": 3884 + }, + { + "epoch": 4.141791044776119, + "grad_norm": 0.08819576291353799, + "learning_rate": 6.967228823185257e-06, + "loss": 0.3306, + "step": 3885 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.09756341988481407, + "learning_rate": 6.950449027360213e-06, + "loss": 0.3359, + "step": 3886 + }, + { + "epoch": 4.143923240938166, + "grad_norm": 0.08464278902185567, + "learning_rate": 6.9336875392687695e-06, + "loss": 0.3336, + "step": 3887 + }, + { + "epoch": 4.144989339019189, + "grad_norm": 0.08869325843091481, + "learning_rate": 6.91694436819593e-06, + "loss": 0.3325, + "step": 3888 + }, + { + "epoch": 4.146055437100213, + "grad_norm": 0.08580192830087008, + "learning_rate": 6.9002195234165295e-06, + "loss": 0.329, + "step": 3889 + }, + { + "epoch": 4.1471215351812365, + "grad_norm": 0.09091187799203043, + "learning_rate": 6.8835130141952625e-06, + "loss": 0.3319, + "step": 3890 + }, + { + "epoch": 4.1481876332622605, + "grad_norm": 0.08949013100276429, + "learning_rate": 6.866824849786673e-06, + "loss": 0.3348, + "step": 3891 + }, + { + "epoch": 4.149253731343284, + "grad_norm": 0.07727474516040857, + "learning_rate": 6.850155039435145e-06, + "loss": 0.3353, + "step": 3892 + }, + { + "epoch": 4.150319829424307, + "grad_norm": 0.09632177794333178, + "learning_rate": 6.833503592374864e-06, + "loss": 0.3387, + "step": 3893 + }, + { + "epoch": 4.151385927505331, + "grad_norm": 0.08865373218038916, + "learning_rate": 6.8168705178298835e-06, + "loss": 0.3309, + "step": 3894 + }, + { + "epoch": 4.152452025586354, + "grad_norm": 0.09466620554710381, + "learning_rate": 6.800255825014063e-06, + "loss": 0.333, + "step": 3895 + }, + { + "epoch": 4.153518123667378, + "grad_norm": 0.08200431933116618, + "learning_rate": 6.78365952313107e-06, + "loss": 0.3304, + "step": 3896 + }, + { + "epoch": 4.154584221748401, + "grad_norm": 0.0857938915091229, + "learning_rate": 6.767081621374392e-06, + "loss": 0.3335, + "step": 3897 + }, + { + "epoch": 4.155650319829424, + "grad_norm": 0.09483772251293643, + "learning_rate": 6.750522128927332e-06, + "loss": 0.3366, + "step": 3898 + }, + { + "epoch": 4.156716417910448, + "grad_norm": 0.0963326853227681, + "learning_rate": 6.733981054962995e-06, + "loss": 0.3287, + "step": 3899 + }, + { + "epoch": 4.157782515991471, + "grad_norm": 0.10620078154702665, + "learning_rate": 6.717458408644262e-06, + "loss": 0.3337, + "step": 3900 + }, + { + "epoch": 4.158848614072495, + "grad_norm": 0.0807624229086923, + "learning_rate": 6.700954199123821e-06, + "loss": 0.3351, + "step": 3901 + }, + { + "epoch": 4.159914712153518, + "grad_norm": 0.08529115263300027, + "learning_rate": 6.68446843554416e-06, + "loss": 0.3324, + "step": 3902 + }, + { + "epoch": 4.160980810234541, + "grad_norm": 0.09055299262053584, + "learning_rate": 6.66800112703752e-06, + "loss": 0.3337, + "step": 3903 + }, + { + "epoch": 4.162046908315565, + "grad_norm": 0.10436288204758602, + "learning_rate": 6.6515522827259414e-06, + "loss": 0.3353, + "step": 3904 + }, + { + "epoch": 4.163113006396588, + "grad_norm": 0.09038774397915303, + "learning_rate": 6.63512191172123e-06, + "loss": 0.3332, + "step": 3905 + }, + { + "epoch": 4.164179104477612, + "grad_norm": 0.10833203568207923, + "learning_rate": 6.618710023124961e-06, + "loss": 0.3335, + "step": 3906 + }, + { + "epoch": 4.165245202558635, + "grad_norm": 0.08665653296892867, + "learning_rate": 6.6023166260284555e-06, + "loss": 0.3384, + "step": 3907 + }, + { + "epoch": 4.166311300639659, + "grad_norm": 0.08387664029237912, + "learning_rate": 6.585941729512808e-06, + "loss": 0.3385, + "step": 3908 + }, + { + "epoch": 4.167377398720682, + "grad_norm": 0.08361794082456976, + "learning_rate": 6.569585342648861e-06, + "loss": 0.3333, + "step": 3909 + }, + { + "epoch": 4.1684434968017055, + "grad_norm": 0.08647483778408228, + "learning_rate": 6.55324747449722e-06, + "loss": 0.3383, + "step": 3910 + }, + { + "epoch": 4.169509594882729, + "grad_norm": 0.08605099292944113, + "learning_rate": 6.536928134108183e-06, + "loss": 0.3436, + "step": 3911 + }, + { + "epoch": 4.1705756929637525, + "grad_norm": 0.1009274470047588, + "learning_rate": 6.520627330521838e-06, + "loss": 0.337, + "step": 3912 + }, + { + "epoch": 4.1716417910447765, + "grad_norm": 0.08627645906743135, + "learning_rate": 6.504345072767986e-06, + "loss": 0.3391, + "step": 3913 + }, + { + "epoch": 4.1727078891258, + "grad_norm": 0.08829399505823976, + "learning_rate": 6.48808136986613e-06, + "loss": 0.3321, + "step": 3914 + }, + { + "epoch": 4.173773987206823, + "grad_norm": 0.08315647833828722, + "learning_rate": 6.471836230825533e-06, + "loss": 0.3386, + "step": 3915 + }, + { + "epoch": 4.174840085287847, + "grad_norm": 0.08482870520076757, + "learning_rate": 6.455609664645153e-06, + "loss": 0.3345, + "step": 3916 + }, + { + "epoch": 4.17590618336887, + "grad_norm": 0.08464445899920194, + "learning_rate": 6.439401680313677e-06, + "loss": 0.3321, + "step": 3917 + }, + { + "epoch": 4.176972281449894, + "grad_norm": 0.10394390037142433, + "learning_rate": 6.423212286809462e-06, + "loss": 0.3308, + "step": 3918 + }, + { + "epoch": 4.178038379530917, + "grad_norm": 0.09043061918479862, + "learning_rate": 6.407041493100603e-06, + "loss": 0.3375, + "step": 3919 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.08930388093603546, + "learning_rate": 6.390889308144879e-06, + "loss": 0.3412, + "step": 3920 + }, + { + "epoch": 4.180170575692964, + "grad_norm": 0.09541571773419251, + "learning_rate": 6.374755740889775e-06, + "loss": 0.3381, + "step": 3921 + }, + { + "epoch": 4.181236673773987, + "grad_norm": 0.09036041961820482, + "learning_rate": 6.3586408002724195e-06, + "loss": 0.3319, + "step": 3922 + }, + { + "epoch": 4.182302771855011, + "grad_norm": 0.09106038987637816, + "learning_rate": 6.342544495219671e-06, + "loss": 0.3308, + "step": 3923 + }, + { + "epoch": 4.183368869936034, + "grad_norm": 0.08457513382844575, + "learning_rate": 6.326466834648055e-06, + "loss": 0.3318, + "step": 3924 + }, + { + "epoch": 4.184434968017057, + "grad_norm": 0.08404396446642012, + "learning_rate": 6.310407827463736e-06, + "loss": 0.3295, + "step": 3925 + }, + { + "epoch": 4.185501066098081, + "grad_norm": 0.09617345450918036, + "learning_rate": 6.29436748256258e-06, + "loss": 0.331, + "step": 3926 + }, + { + "epoch": 4.186567164179104, + "grad_norm": 0.09146742715237753, + "learning_rate": 6.278345808830102e-06, + "loss": 0.3346, + "step": 3927 + }, + { + "epoch": 4.187633262260128, + "grad_norm": 0.08405415079436931, + "learning_rate": 6.262342815141495e-06, + "loss": 0.3346, + "step": 3928 + }, + { + "epoch": 4.188699360341151, + "grad_norm": 0.09298813400757024, + "learning_rate": 6.246358510361559e-06, + "loss": 0.3318, + "step": 3929 + }, + { + "epoch": 4.189765458422174, + "grad_norm": 0.08261784827703691, + "learning_rate": 6.230392903344777e-06, + "loss": 0.3314, + "step": 3930 + }, + { + "epoch": 4.190831556503198, + "grad_norm": 0.09664506287358952, + "learning_rate": 6.214446002935282e-06, + "loss": 0.3275, + "step": 3931 + }, + { + "epoch": 4.1918976545842215, + "grad_norm": 0.10130324537487397, + "learning_rate": 6.198517817966805e-06, + "loss": 0.3392, + "step": 3932 + }, + { + "epoch": 4.1929637526652455, + "grad_norm": 0.0885774081244104, + "learning_rate": 6.182608357262738e-06, + "loss": 0.332, + "step": 3933 + }, + { + "epoch": 4.1940298507462686, + "grad_norm": 0.09028658637288987, + "learning_rate": 6.166717629636103e-06, + "loss": 0.3273, + "step": 3934 + }, + { + "epoch": 4.1950959488272925, + "grad_norm": 0.09059894957809664, + "learning_rate": 6.150845643889542e-06, + "loss": 0.3391, + "step": 3935 + }, + { + "epoch": 4.196162046908316, + "grad_norm": 0.0947415382692891, + "learning_rate": 6.1349924088152905e-06, + "loss": 0.3329, + "step": 3936 + }, + { + "epoch": 4.197228144989339, + "grad_norm": 0.07775785182191923, + "learning_rate": 6.119157933195232e-06, + "loss": 0.3337, + "step": 3937 + }, + { + "epoch": 4.198294243070363, + "grad_norm": 0.09230865898141519, + "learning_rate": 6.1033422258008364e-06, + "loss": 0.3356, + "step": 3938 + }, + { + "epoch": 4.199360341151386, + "grad_norm": 0.09075042427461232, + "learning_rate": 6.087545295393198e-06, + "loss": 0.3366, + "step": 3939 + }, + { + "epoch": 4.20042643923241, + "grad_norm": 0.08591434886921874, + "learning_rate": 6.071767150722974e-06, + "loss": 0.3329, + "step": 3940 + }, + { + "epoch": 4.201492537313433, + "grad_norm": 0.08354052632216036, + "learning_rate": 6.056007800530444e-06, + "loss": 0.337, + "step": 3941 + }, + { + "epoch": 4.202558635394456, + "grad_norm": 0.09416369800009326, + "learning_rate": 6.040267253545482e-06, + "loss": 0.3336, + "step": 3942 + }, + { + "epoch": 4.20362473347548, + "grad_norm": 0.08465467278990185, + "learning_rate": 6.024545518487515e-06, + "loss": 0.3357, + "step": 3943 + }, + { + "epoch": 4.204690831556503, + "grad_norm": 0.08504324724039129, + "learning_rate": 6.0088426040655704e-06, + "loss": 0.3328, + "step": 3944 + }, + { + "epoch": 4.205756929637527, + "grad_norm": 0.07972041873768856, + "learning_rate": 5.993158518978255e-06, + "loss": 0.3296, + "step": 3945 + }, + { + "epoch": 4.20682302771855, + "grad_norm": 0.08077528320595509, + "learning_rate": 5.977493271913739e-06, + "loss": 0.331, + "step": 3946 + }, + { + "epoch": 4.207889125799573, + "grad_norm": 0.07617274164835192, + "learning_rate": 5.961846871549739e-06, + "loss": 0.331, + "step": 3947 + }, + { + "epoch": 4.208955223880597, + "grad_norm": 0.09002104870840356, + "learning_rate": 5.946219326553557e-06, + "loss": 0.3404, + "step": 3948 + }, + { + "epoch": 4.21002132196162, + "grad_norm": 0.09718872414198558, + "learning_rate": 5.930610645582051e-06, + "loss": 0.3367, + "step": 3949 + }, + { + "epoch": 4.211087420042644, + "grad_norm": 0.08759177753235749, + "learning_rate": 5.915020837281602e-06, + "loss": 0.3389, + "step": 3950 + }, + { + "epoch": 4.212153518123667, + "grad_norm": 0.08550266499801752, + "learning_rate": 5.899449910288169e-06, + "loss": 0.3314, + "step": 3951 + }, + { + "epoch": 4.21321961620469, + "grad_norm": 0.08610889895227287, + "learning_rate": 5.883897873227216e-06, + "loss": 0.3321, + "step": 3952 + }, + { + "epoch": 4.214285714285714, + "grad_norm": 0.07857770022593799, + "learning_rate": 5.868364734713776e-06, + "loss": 0.3407, + "step": 3953 + }, + { + "epoch": 4.2153518123667375, + "grad_norm": 0.10475981108457424, + "learning_rate": 5.852850503352407e-06, + "loss": 0.3384, + "step": 3954 + }, + { + "epoch": 4.2164179104477615, + "grad_norm": 0.07804224672207336, + "learning_rate": 5.837355187737172e-06, + "loss": 0.3368, + "step": 3955 + }, + { + "epoch": 4.217484008528785, + "grad_norm": 0.08058373335851238, + "learning_rate": 5.821878796451681e-06, + "loss": 0.3322, + "step": 3956 + }, + { + "epoch": 4.218550106609808, + "grad_norm": 0.08404369349869568, + "learning_rate": 5.806421338069053e-06, + "loss": 0.337, + "step": 3957 + }, + { + "epoch": 4.219616204690832, + "grad_norm": 0.08080561168777425, + "learning_rate": 5.790982821151905e-06, + "loss": 0.3339, + "step": 3958 + }, + { + "epoch": 4.220682302771855, + "grad_norm": 0.0828839824572233, + "learning_rate": 5.7755632542523744e-06, + "loss": 0.3308, + "step": 3959 + }, + { + "epoch": 4.221748400852879, + "grad_norm": 0.09417702718528696, + "learning_rate": 5.7601626459121175e-06, + "loss": 0.3337, + "step": 3960 + }, + { + "epoch": 4.222814498933902, + "grad_norm": 0.08229090459967245, + "learning_rate": 5.744781004662247e-06, + "loss": 0.3349, + "step": 3961 + }, + { + "epoch": 4.223880597014926, + "grad_norm": 0.08396218859737672, + "learning_rate": 5.729418339023407e-06, + "loss": 0.3415, + "step": 3962 + }, + { + "epoch": 4.224946695095949, + "grad_norm": 0.08423179577148057, + "learning_rate": 5.714074657505708e-06, + "loss": 0.3349, + "step": 3963 + }, + { + "epoch": 4.226012793176972, + "grad_norm": 0.0747196908529032, + "learning_rate": 5.6987499686087695e-06, + "loss": 0.3361, + "step": 3964 + }, + { + "epoch": 4.227078891257996, + "grad_norm": 0.09430188688349524, + "learning_rate": 5.683444280821651e-06, + "loss": 0.3381, + "step": 3965 + }, + { + "epoch": 4.228144989339019, + "grad_norm": 0.07893437331383663, + "learning_rate": 5.668157602622914e-06, + "loss": 0.333, + "step": 3966 + }, + { + "epoch": 4.229211087420043, + "grad_norm": 0.07954371770687908, + "learning_rate": 5.6528899424805886e-06, + "loss": 0.3354, + "step": 3967 + }, + { + "epoch": 4.230277185501066, + "grad_norm": 0.07863279825537235, + "learning_rate": 5.637641308852169e-06, + "loss": 0.3242, + "step": 3968 + }, + { + "epoch": 4.231343283582089, + "grad_norm": 0.08110711920649173, + "learning_rate": 5.622411710184592e-06, + "loss": 0.3359, + "step": 3969 + }, + { + "epoch": 4.232409381663113, + "grad_norm": 0.08076149758379426, + "learning_rate": 5.607201154914275e-06, + "loss": 0.3315, + "step": 3970 + }, + { + "epoch": 4.233475479744136, + "grad_norm": 0.08084146195418195, + "learning_rate": 5.592009651467081e-06, + "loss": 0.3333, + "step": 3971 + }, + { + "epoch": 4.23454157782516, + "grad_norm": 0.07585877249863301, + "learning_rate": 5.5768372082582925e-06, + "loss": 0.3276, + "step": 3972 + }, + { + "epoch": 4.235607675906183, + "grad_norm": 0.07846952562528792, + "learning_rate": 5.561683833692666e-06, + "loss": 0.335, + "step": 3973 + }, + { + "epoch": 4.2366737739872065, + "grad_norm": 0.08010327334926544, + "learning_rate": 5.546549536164381e-06, + "loss": 0.3341, + "step": 3974 + }, + { + "epoch": 4.23773987206823, + "grad_norm": 0.0799875870943478, + "learning_rate": 5.531434324057068e-06, + "loss": 0.336, + "step": 3975 + }, + { + "epoch": 4.2388059701492535, + "grad_norm": 0.08230353675191754, + "learning_rate": 5.516338205743745e-06, + "loss": 0.3299, + "step": 3976 + }, + { + "epoch": 4.2398720682302775, + "grad_norm": 0.08053168122604346, + "learning_rate": 5.501261189586889e-06, + "loss": 0.3339, + "step": 3977 + }, + { + "epoch": 4.240938166311301, + "grad_norm": 0.07665080765192847, + "learning_rate": 5.486203283938376e-06, + "loss": 0.3365, + "step": 3978 + }, + { + "epoch": 4.242004264392324, + "grad_norm": 0.08493277690607401, + "learning_rate": 5.471164497139523e-06, + "loss": 0.3352, + "step": 3979 + }, + { + "epoch": 4.243070362473348, + "grad_norm": 0.08757131176321435, + "learning_rate": 5.456144837521012e-06, + "loss": 0.336, + "step": 3980 + }, + { + "epoch": 4.244136460554371, + "grad_norm": 0.08088307895934461, + "learning_rate": 5.441144313402964e-06, + "loss": 0.3352, + "step": 3981 + }, + { + "epoch": 4.245202558635395, + "grad_norm": 0.10303950476433886, + "learning_rate": 5.426162933094898e-06, + "loss": 0.3363, + "step": 3982 + }, + { + "epoch": 4.246268656716418, + "grad_norm": 0.08551651606703374, + "learning_rate": 5.411200704895705e-06, + "loss": 0.3419, + "step": 3983 + }, + { + "epoch": 4.247334754797441, + "grad_norm": 0.08310317399531018, + "learning_rate": 5.396257637093687e-06, + "loss": 0.334, + "step": 3984 + }, + { + "epoch": 4.248400852878465, + "grad_norm": 0.08656256571909457, + "learning_rate": 5.381333737966525e-06, + "loss": 0.3345, + "step": 3985 + }, + { + "epoch": 4.249466950959488, + "grad_norm": 0.08701071532379277, + "learning_rate": 5.3664290157813005e-06, + "loss": 0.3326, + "step": 3986 + }, + { + "epoch": 4.250533049040512, + "grad_norm": 0.08260458610600334, + "learning_rate": 5.3515434787944295e-06, + "loss": 0.3357, + "step": 3987 + }, + { + "epoch": 4.251599147121535, + "grad_norm": 0.09118839401494544, + "learning_rate": 5.336677135251744e-06, + "loss": 0.3309, + "step": 3988 + }, + { + "epoch": 4.252665245202559, + "grad_norm": 0.0833732616912321, + "learning_rate": 5.321829993388421e-06, + "loss": 0.3332, + "step": 3989 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.0770950611344399, + "learning_rate": 5.307002061429001e-06, + "loss": 0.3295, + "step": 3990 + }, + { + "epoch": 4.254797441364605, + "grad_norm": 0.08654137149689174, + "learning_rate": 5.292193347587389e-06, + "loss": 0.3335, + "step": 3991 + }, + { + "epoch": 4.255863539445629, + "grad_norm": 0.08023871846633238, + "learning_rate": 5.277403860066841e-06, + "loss": 0.3318, + "step": 3992 + }, + { + "epoch": 4.256929637526652, + "grad_norm": 0.08681903564804878, + "learning_rate": 5.262633607059982e-06, + "loss": 0.3386, + "step": 3993 + }, + { + "epoch": 4.257995735607676, + "grad_norm": 0.08369350417988806, + "learning_rate": 5.247882596748737e-06, + "loss": 0.3356, + "step": 3994 + }, + { + "epoch": 4.259061833688699, + "grad_norm": 0.0911097900892705, + "learning_rate": 5.233150837304415e-06, + "loss": 0.3389, + "step": 3995 + }, + { + "epoch": 4.2601279317697225, + "grad_norm": 0.08091682403105288, + "learning_rate": 5.218438336887643e-06, + "loss": 0.3418, + "step": 3996 + }, + { + "epoch": 4.2611940298507465, + "grad_norm": 0.07957719556291395, + "learning_rate": 5.203745103648392e-06, + "loss": 0.3353, + "step": 3997 + }, + { + "epoch": 4.26226012793177, + "grad_norm": 0.077328833322233, + "learning_rate": 5.189071145725928e-06, + "loss": 0.3383, + "step": 3998 + }, + { + "epoch": 4.2633262260127935, + "grad_norm": 0.08081742747511964, + "learning_rate": 5.174416471248873e-06, + "loss": 0.3325, + "step": 3999 + }, + { + "epoch": 4.264392324093817, + "grad_norm": 0.07861878616801701, + "learning_rate": 5.159781088335161e-06, + "loss": 0.3342, + "step": 4000 + }, + { + "epoch": 4.26545842217484, + "grad_norm": 0.08400538210654203, + "learning_rate": 5.145165005092017e-06, + "loss": 0.3327, + "step": 4001 + }, + { + "epoch": 4.266524520255864, + "grad_norm": 0.07589313698458251, + "learning_rate": 5.130568229616004e-06, + "loss": 0.3332, + "step": 4002 + }, + { + "epoch": 4.267590618336887, + "grad_norm": 0.08147340980126598, + "learning_rate": 5.115990769992971e-06, + "loss": 0.3281, + "step": 4003 + }, + { + "epoch": 4.268656716417911, + "grad_norm": 0.07835055312388887, + "learning_rate": 5.101432634298089e-06, + "loss": 0.3339, + "step": 4004 + }, + { + "epoch": 4.269722814498934, + "grad_norm": 0.07947896584006585, + "learning_rate": 5.086893830595783e-06, + "loss": 0.3382, + "step": 4005 + }, + { + "epoch": 4.270788912579957, + "grad_norm": 0.07841316295017421, + "learning_rate": 5.07237436693981e-06, + "loss": 0.333, + "step": 4006 + }, + { + "epoch": 4.271855010660981, + "grad_norm": 0.09374263052005652, + "learning_rate": 5.057874251373194e-06, + "loss": 0.3332, + "step": 4007 + }, + { + "epoch": 4.272921108742004, + "grad_norm": 0.07579864101983415, + "learning_rate": 5.0433934919282525e-06, + "loss": 0.3324, + "step": 4008 + }, + { + "epoch": 4.273987206823028, + "grad_norm": 0.0804083559161328, + "learning_rate": 5.0289320966265645e-06, + "loss": 0.3325, + "step": 4009 + }, + { + "epoch": 4.275053304904051, + "grad_norm": 0.08512014086820764, + "learning_rate": 5.014490073478993e-06, + "loss": 0.342, + "step": 4010 + }, + { + "epoch": 4.276119402985074, + "grad_norm": 0.08530022661651851, + "learning_rate": 5.00006743048568e-06, + "loss": 0.3348, + "step": 4011 + }, + { + "epoch": 4.277185501066098, + "grad_norm": 0.07751500270346133, + "learning_rate": 4.985664175636e-06, + "loss": 0.3377, + "step": 4012 + }, + { + "epoch": 4.278251599147121, + "grad_norm": 0.075817452063534, + "learning_rate": 4.97128031690862e-06, + "loss": 0.3324, + "step": 4013 + }, + { + "epoch": 4.279317697228145, + "grad_norm": 0.07989341231264155, + "learning_rate": 4.956915862271445e-06, + "loss": 0.3371, + "step": 4014 + }, + { + "epoch": 4.280383795309168, + "grad_norm": 0.08374182318320397, + "learning_rate": 4.942570819681649e-06, + "loss": 0.3349, + "step": 4015 + }, + { + "epoch": 4.281449893390192, + "grad_norm": 0.08077261991520684, + "learning_rate": 4.928245197085626e-06, + "loss": 0.3418, + "step": 4016 + }, + { + "epoch": 4.282515991471215, + "grad_norm": 0.09142010449739497, + "learning_rate": 4.913939002419028e-06, + "loss": 0.3403, + "step": 4017 + }, + { + "epoch": 4.2835820895522385, + "grad_norm": 0.08261620442873233, + "learning_rate": 4.899652243606752e-06, + "loss": 0.3342, + "step": 4018 + }, + { + "epoch": 4.2846481876332625, + "grad_norm": 0.07949162404713021, + "learning_rate": 4.88538492856291e-06, + "loss": 0.3342, + "step": 4019 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.08029027369166614, + "learning_rate": 4.871137065190854e-06, + "loss": 0.3347, + "step": 4020 + }, + { + "epoch": 4.28678038379531, + "grad_norm": 0.0792316332451311, + "learning_rate": 4.856908661383175e-06, + "loss": 0.339, + "step": 4021 + }, + { + "epoch": 4.287846481876333, + "grad_norm": 0.0801437973547572, + "learning_rate": 4.842699725021649e-06, + "loss": 0.331, + "step": 4022 + }, + { + "epoch": 4.288912579957356, + "grad_norm": 0.08488879536646476, + "learning_rate": 4.828510263977295e-06, + "loss": 0.3306, + "step": 4023 + }, + { + "epoch": 4.28997867803838, + "grad_norm": 0.07756702190689371, + "learning_rate": 4.814340286110346e-06, + "loss": 0.3321, + "step": 4024 + }, + { + "epoch": 4.291044776119403, + "grad_norm": 0.09257841600912252, + "learning_rate": 4.800189799270221e-06, + "loss": 0.3341, + "step": 4025 + }, + { + "epoch": 4.292110874200427, + "grad_norm": 0.08241075259625737, + "learning_rate": 4.786058811295564e-06, + "loss": 0.3379, + "step": 4026 + }, + { + "epoch": 4.29317697228145, + "grad_norm": 0.08900515508524041, + "learning_rate": 4.771947330014195e-06, + "loss": 0.3335, + "step": 4027 + }, + { + "epoch": 4.294243070362473, + "grad_norm": 0.08609235279832582, + "learning_rate": 4.757855363243149e-06, + "loss": 0.3395, + "step": 4028 + }, + { + "epoch": 4.295309168443497, + "grad_norm": 0.08284456057342021, + "learning_rate": 4.743782918788653e-06, + "loss": 0.3291, + "step": 4029 + }, + { + "epoch": 4.29637526652452, + "grad_norm": 0.0713526073589266, + "learning_rate": 4.729730004446094e-06, + "loss": 0.3265, + "step": 4030 + }, + { + "epoch": 4.297441364605544, + "grad_norm": 0.08416229502175335, + "learning_rate": 4.715696628000057e-06, + "loss": 0.3343, + "step": 4031 + }, + { + "epoch": 4.298507462686567, + "grad_norm": 0.07860706489090766, + "learning_rate": 4.701682797224316e-06, + "loss": 0.3342, + "step": 4032 + }, + { + "epoch": 4.29957356076759, + "grad_norm": 0.07625660551514413, + "learning_rate": 4.687688519881799e-06, + "loss": 0.3352, + "step": 4033 + }, + { + "epoch": 4.300639658848614, + "grad_norm": 0.08922308300609538, + "learning_rate": 4.673713803724602e-06, + "loss": 0.3264, + "step": 4034 + }, + { + "epoch": 4.301705756929637, + "grad_norm": 0.07964888287437034, + "learning_rate": 4.659758656494e-06, + "loss": 0.3347, + "step": 4035 + }, + { + "epoch": 4.302771855010661, + "grad_norm": 0.08135734469936476, + "learning_rate": 4.645823085920409e-06, + "loss": 0.3411, + "step": 4036 + }, + { + "epoch": 4.303837953091684, + "grad_norm": 0.07699749391475862, + "learning_rate": 4.6319070997234315e-06, + "loss": 0.3366, + "step": 4037 + }, + { + "epoch": 4.3049040511727075, + "grad_norm": 0.079215440831635, + "learning_rate": 4.618010705611777e-06, + "loss": 0.3404, + "step": 4038 + }, + { + "epoch": 4.3059701492537314, + "grad_norm": 0.08102338622902823, + "learning_rate": 4.604133911283333e-06, + "loss": 0.3379, + "step": 4039 + }, + { + "epoch": 4.3070362473347545, + "grad_norm": 0.08630172945404184, + "learning_rate": 4.590276724425136e-06, + "loss": 0.3319, + "step": 4040 + }, + { + "epoch": 4.3081023454157785, + "grad_norm": 0.09511994379660102, + "learning_rate": 4.576439152713326e-06, + "loss": 0.3315, + "step": 4041 + }, + { + "epoch": 4.309168443496802, + "grad_norm": 0.08335455005241084, + "learning_rate": 4.562621203813211e-06, + "loss": 0.3316, + "step": 4042 + }, + { + "epoch": 4.310234541577826, + "grad_norm": 0.08307933355980296, + "learning_rate": 4.548822885379212e-06, + "loss": 0.336, + "step": 4043 + }, + { + "epoch": 4.311300639658849, + "grad_norm": 0.0843309027586609, + "learning_rate": 4.535044205054893e-06, + "loss": 0.3295, + "step": 4044 + }, + { + "epoch": 4.312366737739872, + "grad_norm": 0.08460136640969265, + "learning_rate": 4.521285170472904e-06, + "loss": 0.336, + "step": 4045 + }, + { + "epoch": 4.313432835820896, + "grad_norm": 0.07814369280046486, + "learning_rate": 4.507545789255052e-06, + "loss": 0.3332, + "step": 4046 + }, + { + "epoch": 4.314498933901919, + "grad_norm": 0.0855030516487757, + "learning_rate": 4.4938260690122435e-06, + "loss": 0.3342, + "step": 4047 + }, + { + "epoch": 4.315565031982943, + "grad_norm": 0.08812581908559658, + "learning_rate": 4.480126017344471e-06, + "loss": 0.3329, + "step": 4048 + }, + { + "epoch": 4.316631130063966, + "grad_norm": 0.07749089456051617, + "learning_rate": 4.466445641840862e-06, + "loss": 0.3333, + "step": 4049 + }, + { + "epoch": 4.317697228144989, + "grad_norm": 0.07815807671811865, + "learning_rate": 4.45278495007964e-06, + "loss": 0.3363, + "step": 4050 + }, + { + "epoch": 4.318763326226013, + "grad_norm": 0.07358888097958845, + "learning_rate": 4.439143949628118e-06, + "loss": 0.3295, + "step": 4051 + }, + { + "epoch": 4.319829424307036, + "grad_norm": 0.08431613734886675, + "learning_rate": 4.425522648042684e-06, + "loss": 0.3275, + "step": 4052 + }, + { + "epoch": 4.32089552238806, + "grad_norm": 0.07450901993676554, + "learning_rate": 4.411921052868846e-06, + "loss": 0.33, + "step": 4053 + }, + { + "epoch": 4.321961620469083, + "grad_norm": 0.08108642538320732, + "learning_rate": 4.3983391716411775e-06, + "loss": 0.3374, + "step": 4054 + }, + { + "epoch": 4.323027718550106, + "grad_norm": 0.08057675359517547, + "learning_rate": 4.384777011883343e-06, + "loss": 0.3327, + "step": 4055 + }, + { + "epoch": 4.32409381663113, + "grad_norm": 0.07748909745389584, + "learning_rate": 4.371234581108059e-06, + "loss": 0.3264, + "step": 4056 + }, + { + "epoch": 4.325159914712153, + "grad_norm": 0.0920305873286274, + "learning_rate": 4.3577118868171335e-06, + "loss": 0.3326, + "step": 4057 + }, + { + "epoch": 4.326226012793177, + "grad_norm": 0.07953455564884193, + "learning_rate": 4.344208936501449e-06, + "loss": 0.3327, + "step": 4058 + }, + { + "epoch": 4.3272921108742, + "grad_norm": 0.08324725165522431, + "learning_rate": 4.3307257376409155e-06, + "loss": 0.3329, + "step": 4059 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.08278692213524609, + "learning_rate": 4.317262297704541e-06, + "loss": 0.3374, + "step": 4060 + }, + { + "epoch": 4.3294243070362475, + "grad_norm": 0.07643673415451332, + "learning_rate": 4.3038186241503644e-06, + "loss": 0.338, + "step": 4061 + }, + { + "epoch": 4.330490405117271, + "grad_norm": 0.07581981080944176, + "learning_rate": 4.290394724425495e-06, + "loss": 0.3384, + "step": 4062 + }, + { + "epoch": 4.3315565031982945, + "grad_norm": 0.07771968698621509, + "learning_rate": 4.276990605966056e-06, + "loss": 0.3337, + "step": 4063 + }, + { + "epoch": 4.332622601279318, + "grad_norm": 0.07895244133624792, + "learning_rate": 4.2636062761972406e-06, + "loss": 0.3409, + "step": 4064 + }, + { + "epoch": 4.333688699360341, + "grad_norm": 0.07797323195669634, + "learning_rate": 4.2502417425332746e-06, + "loss": 0.3376, + "step": 4065 + }, + { + "epoch": 4.334754797441365, + "grad_norm": 0.07997960877114406, + "learning_rate": 4.236897012377421e-06, + "loss": 0.3371, + "step": 4066 + }, + { + "epoch": 4.335820895522388, + "grad_norm": 0.07600199349205194, + "learning_rate": 4.223572093121951e-06, + "loss": 0.3364, + "step": 4067 + }, + { + "epoch": 4.336886993603412, + "grad_norm": 0.08268817910100379, + "learning_rate": 4.210266992148188e-06, + "loss": 0.336, + "step": 4068 + }, + { + "epoch": 4.337953091684435, + "grad_norm": 0.0745603318232261, + "learning_rate": 4.196981716826471e-06, + "loss": 0.3323, + "step": 4069 + }, + { + "epoch": 4.339019189765459, + "grad_norm": 0.09208585615342714, + "learning_rate": 4.183716274516134e-06, + "loss": 0.3314, + "step": 4070 + }, + { + "epoch": 4.340085287846482, + "grad_norm": 0.07711887841230759, + "learning_rate": 4.170470672565557e-06, + "loss": 0.3331, + "step": 4071 + }, + { + "epoch": 4.341151385927505, + "grad_norm": 0.08621935992949319, + "learning_rate": 4.157244918312113e-06, + "loss": 0.3341, + "step": 4072 + }, + { + "epoch": 4.342217484008529, + "grad_norm": 0.08226222595272614, + "learning_rate": 4.144039019082184e-06, + "loss": 0.333, + "step": 4073 + }, + { + "epoch": 4.343283582089552, + "grad_norm": 0.07808395675180577, + "learning_rate": 4.1308529821911495e-06, + "loss": 0.3303, + "step": 4074 + }, + { + "epoch": 4.344349680170576, + "grad_norm": 0.0769818042274672, + "learning_rate": 4.117686814943382e-06, + "loss": 0.3417, + "step": 4075 + }, + { + "epoch": 4.345415778251599, + "grad_norm": 0.07879847363496587, + "learning_rate": 4.104540524632268e-06, + "loss": 0.3382, + "step": 4076 + }, + { + "epoch": 4.346481876332622, + "grad_norm": 0.08293035530571763, + "learning_rate": 4.091414118540158e-06, + "loss": 0.3331, + "step": 4077 + }, + { + "epoch": 4.347547974413646, + "grad_norm": 0.07727360738321705, + "learning_rate": 4.078307603938397e-06, + "loss": 0.337, + "step": 4078 + }, + { + "epoch": 4.348614072494669, + "grad_norm": 0.07551576922057891, + "learning_rate": 4.0652209880873214e-06, + "loss": 0.328, + "step": 4079 + }, + { + "epoch": 4.349680170575693, + "grad_norm": 0.07917186089359039, + "learning_rate": 4.052154278236242e-06, + "loss": 0.334, + "step": 4080 + }, + { + "epoch": 4.350746268656716, + "grad_norm": 0.07567017656306375, + "learning_rate": 4.039107481623417e-06, + "loss": 0.3345, + "step": 4081 + }, + { + "epoch": 4.3518123667377395, + "grad_norm": 0.07605848497463297, + "learning_rate": 4.026080605476104e-06, + "loss": 0.3311, + "step": 4082 + }, + { + "epoch": 4.3528784648187635, + "grad_norm": 0.08119621016465227, + "learning_rate": 4.013073657010518e-06, + "loss": 0.334, + "step": 4083 + }, + { + "epoch": 4.353944562899787, + "grad_norm": 0.07763766653584449, + "learning_rate": 4.000086643431838e-06, + "loss": 0.3346, + "step": 4084 + }, + { + "epoch": 4.355010660980811, + "grad_norm": 0.08680938596509069, + "learning_rate": 3.987119571934179e-06, + "loss": 0.3379, + "step": 4085 + }, + { + "epoch": 4.356076759061834, + "grad_norm": 0.07539433250460861, + "learning_rate": 3.974172449700633e-06, + "loss": 0.3312, + "step": 4086 + }, + { + "epoch": 4.357142857142857, + "grad_norm": 0.08358711815421738, + "learning_rate": 3.961245283903239e-06, + "loss": 0.3346, + "step": 4087 + }, + { + "epoch": 4.358208955223881, + "grad_norm": 0.07975462632947826, + "learning_rate": 3.948338081702958e-06, + "loss": 0.3278, + "step": 4088 + }, + { + "epoch": 4.359275053304904, + "grad_norm": 0.07783915298575807, + "learning_rate": 3.935450850249725e-06, + "loss": 0.3356, + "step": 4089 + }, + { + "epoch": 4.360341151385928, + "grad_norm": 0.07816454670420965, + "learning_rate": 3.9225835966823966e-06, + "loss": 0.3361, + "step": 4090 + }, + { + "epoch": 4.361407249466951, + "grad_norm": 0.08148461720782253, + "learning_rate": 3.909736328128748e-06, + "loss": 0.3333, + "step": 4091 + }, + { + "epoch": 4.362473347547974, + "grad_norm": 0.08042384846739183, + "learning_rate": 3.896909051705509e-06, + "loss": 0.3321, + "step": 4092 + }, + { + "epoch": 4.363539445628998, + "grad_norm": 0.08319011697671452, + "learning_rate": 3.884101774518327e-06, + "loss": 0.3354, + "step": 4093 + }, + { + "epoch": 4.364605543710021, + "grad_norm": 0.0782252777997426, + "learning_rate": 3.871314503661761e-06, + "loss": 0.3361, + "step": 4094 + }, + { + "epoch": 4.365671641791045, + "grad_norm": 0.07065927300340862, + "learning_rate": 3.858547246219293e-06, + "loss": 0.3312, + "step": 4095 + }, + { + "epoch": 4.366737739872068, + "grad_norm": 0.08468956849885999, + "learning_rate": 3.845800009263334e-06, + "loss": 0.3307, + "step": 4096 + }, + { + "epoch": 4.367803837953092, + "grad_norm": 0.08901806534396679, + "learning_rate": 3.833072799855173e-06, + "loss": 0.3315, + "step": 4097 + }, + { + "epoch": 4.368869936034115, + "grad_norm": 0.07301214410208626, + "learning_rate": 3.820365625045037e-06, + "loss": 0.3331, + "step": 4098 + }, + { + "epoch": 4.369936034115138, + "grad_norm": 0.08198685107248344, + "learning_rate": 3.8076784918720242e-06, + "loss": 0.3311, + "step": 4099 + }, + { + "epoch": 4.371002132196162, + "grad_norm": 0.08355479249429133, + "learning_rate": 3.7950114073641573e-06, + "loss": 0.3355, + "step": 4100 + }, + { + "epoch": 4.372068230277185, + "grad_norm": 0.07726712718675953, + "learning_rate": 3.7823643785383434e-06, + "loss": 0.333, + "step": 4101 + }, + { + "epoch": 4.373134328358209, + "grad_norm": 0.07279799524384821, + "learning_rate": 3.7697374124003872e-06, + "loss": 0.3303, + "step": 4102 + }, + { + "epoch": 4.3742004264392325, + "grad_norm": 0.08528796671091622, + "learning_rate": 3.757130515944951e-06, + "loss": 0.3342, + "step": 4103 + }, + { + "epoch": 4.3752665245202556, + "grad_norm": 0.08572585263620255, + "learning_rate": 3.7445436961556135e-06, + "loss": 0.3369, + "step": 4104 + }, + { + "epoch": 4.3763326226012795, + "grad_norm": 0.0748149784520539, + "learning_rate": 3.7319769600048237e-06, + "loss": 0.3387, + "step": 4105 + }, + { + "epoch": 4.377398720682303, + "grad_norm": 0.07657381538769502, + "learning_rate": 3.7194303144538847e-06, + "loss": 0.3327, + "step": 4106 + }, + { + "epoch": 4.378464818763327, + "grad_norm": 0.0807481452009072, + "learning_rate": 3.706903766452996e-06, + "loss": 0.3343, + "step": 4107 + }, + { + "epoch": 4.37953091684435, + "grad_norm": 0.08393196828122008, + "learning_rate": 3.6943973229412124e-06, + "loss": 0.3374, + "step": 4108 + }, + { + "epoch": 4.380597014925373, + "grad_norm": 0.0782549467161215, + "learning_rate": 3.681910990846462e-06, + "loss": 0.3287, + "step": 4109 + }, + { + "epoch": 4.381663113006397, + "grad_norm": 0.08070850418302111, + "learning_rate": 3.669444777085507e-06, + "loss": 0.337, + "step": 4110 + }, + { + "epoch": 4.38272921108742, + "grad_norm": 0.07534023101241813, + "learning_rate": 3.6569986885639954e-06, + "loss": 0.3341, + "step": 4111 + }, + { + "epoch": 4.383795309168444, + "grad_norm": 0.07838807058793515, + "learning_rate": 3.6445727321764035e-06, + "loss": 0.3353, + "step": 4112 + }, + { + "epoch": 4.384861407249467, + "grad_norm": 0.07822860155918042, + "learning_rate": 3.6321669148060833e-06, + "loss": 0.3342, + "step": 4113 + }, + { + "epoch": 4.38592750533049, + "grad_norm": 0.07732616158756847, + "learning_rate": 3.619781243325187e-06, + "loss": 0.3382, + "step": 4114 + }, + { + "epoch": 4.386993603411514, + "grad_norm": 0.0767992597048576, + "learning_rate": 3.6074157245947495e-06, + "loss": 0.3392, + "step": 4115 + }, + { + "epoch": 4.388059701492537, + "grad_norm": 0.08304386197535715, + "learning_rate": 3.5950703654646303e-06, + "loss": 0.3368, + "step": 4116 + }, + { + "epoch": 4.389125799573561, + "grad_norm": 0.07467146074464837, + "learning_rate": 3.5827451727735007e-06, + "loss": 0.333, + "step": 4117 + }, + { + "epoch": 4.390191897654584, + "grad_norm": 0.07659772726124718, + "learning_rate": 3.5704401533488865e-06, + "loss": 0.3332, + "step": 4118 + }, + { + "epoch": 4.391257995735607, + "grad_norm": 0.0779901058287516, + "learning_rate": 3.5581553140071256e-06, + "loss": 0.3353, + "step": 4119 + }, + { + "epoch": 4.392324093816631, + "grad_norm": 0.08271167127027722, + "learning_rate": 3.5458906615533883e-06, + "loss": 0.3325, + "step": 4120 + }, + { + "epoch": 4.393390191897654, + "grad_norm": 0.07640335606088489, + "learning_rate": 3.53364620278164e-06, + "loss": 0.3336, + "step": 4121 + }, + { + "epoch": 4.394456289978678, + "grad_norm": 0.08042193883025106, + "learning_rate": 3.5214219444746856e-06, + "loss": 0.3384, + "step": 4122 + }, + { + "epoch": 4.395522388059701, + "grad_norm": 0.07840556030820514, + "learning_rate": 3.5092178934041353e-06, + "loss": 0.3386, + "step": 4123 + }, + { + "epoch": 4.396588486140725, + "grad_norm": 0.08151804289922143, + "learning_rate": 3.497034056330382e-06, + "loss": 0.3317, + "step": 4124 + }, + { + "epoch": 4.3976545842217485, + "grad_norm": 0.08168021320582781, + "learning_rate": 3.4848704400026434e-06, + "loss": 0.3321, + "step": 4125 + }, + { + "epoch": 4.398720682302772, + "grad_norm": 0.07609160414759338, + "learning_rate": 3.4727270511589396e-06, + "loss": 0.3363, + "step": 4126 + }, + { + "epoch": 4.399786780383796, + "grad_norm": 0.07842449906210139, + "learning_rate": 3.4606038965260715e-06, + "loss": 0.3281, + "step": 4127 + }, + { + "epoch": 4.400852878464819, + "grad_norm": 0.07972055102539237, + "learning_rate": 3.4485009828196357e-06, + "loss": 0.3348, + "step": 4128 + }, + { + "epoch": 4.401918976545843, + "grad_norm": 0.07239605085558472, + "learning_rate": 3.4364183167440123e-06, + "loss": 0.3314, + "step": 4129 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.07272119283107233, + "learning_rate": 3.4243559049923803e-06, + "loss": 0.3345, + "step": 4130 + }, + { + "epoch": 4.404051172707889, + "grad_norm": 0.07698965215086702, + "learning_rate": 3.412313754246688e-06, + "loss": 0.3364, + "step": 4131 + }, + { + "epoch": 4.405117270788913, + "grad_norm": 0.07835880219239048, + "learning_rate": 3.400291871177652e-06, + "loss": 0.3325, + "step": 4132 + }, + { + "epoch": 4.406183368869936, + "grad_norm": 0.07883352671429816, + "learning_rate": 3.3882902624447777e-06, + "loss": 0.3302, + "step": 4133 + }, + { + "epoch": 4.40724946695096, + "grad_norm": 0.07598330265933242, + "learning_rate": 3.3763089346963417e-06, + "loss": 0.3315, + "step": 4134 + }, + { + "epoch": 4.408315565031983, + "grad_norm": 0.07233756048331239, + "learning_rate": 3.3643478945693552e-06, + "loss": 0.3344, + "step": 4135 + }, + { + "epoch": 4.409381663113006, + "grad_norm": 0.07815412688938289, + "learning_rate": 3.352407148689625e-06, + "loss": 0.3355, + "step": 4136 + }, + { + "epoch": 4.41044776119403, + "grad_norm": 0.08194740814774278, + "learning_rate": 3.3404867036716994e-06, + "loss": 0.3353, + "step": 4137 + }, + { + "epoch": 4.411513859275053, + "grad_norm": 0.07605509113317131, + "learning_rate": 3.328586566118901e-06, + "loss": 0.3352, + "step": 4138 + }, + { + "epoch": 4.412579957356077, + "grad_norm": 0.07758256960053231, + "learning_rate": 3.316706742623268e-06, + "loss": 0.3391, + "step": 4139 + }, + { + "epoch": 4.4136460554371, + "grad_norm": 0.07526062814521468, + "learning_rate": 3.3048472397656115e-06, + "loss": 0.3331, + "step": 4140 + }, + { + "epoch": 4.414712153518123, + "grad_norm": 0.08486437341532607, + "learning_rate": 3.2930080641154816e-06, + "loss": 0.3323, + "step": 4141 + }, + { + "epoch": 4.415778251599147, + "grad_norm": 0.0830672341181274, + "learning_rate": 3.2811892222311694e-06, + "loss": 0.3319, + "step": 4142 + }, + { + "epoch": 4.41684434968017, + "grad_norm": 0.07212703483795091, + "learning_rate": 3.269390720659691e-06, + "loss": 0.3332, + "step": 4143 + }, + { + "epoch": 4.417910447761194, + "grad_norm": 0.0790520570244998, + "learning_rate": 3.257612565936805e-06, + "loss": 0.3359, + "step": 4144 + }, + { + "epoch": 4.418976545842217, + "grad_norm": 0.07333741287228444, + "learning_rate": 3.2458547645870086e-06, + "loss": 0.3285, + "step": 4145 + }, + { + "epoch": 4.4200426439232405, + "grad_norm": 0.07631456166587812, + "learning_rate": 3.2341173231234956e-06, + "loss": 0.3362, + "step": 4146 + }, + { + "epoch": 4.4211087420042645, + "grad_norm": 0.07268449281095922, + "learning_rate": 3.2224002480482075e-06, + "loss": 0.335, + "step": 4147 + }, + { + "epoch": 4.422174840085288, + "grad_norm": 0.07265611750504483, + "learning_rate": 3.210703545851792e-06, + "loss": 0.3323, + "step": 4148 + }, + { + "epoch": 4.423240938166312, + "grad_norm": 0.07887619669716188, + "learning_rate": 3.1990272230136266e-06, + "loss": 0.3305, + "step": 4149 + }, + { + "epoch": 4.424307036247335, + "grad_norm": 0.07492581724533613, + "learning_rate": 3.187371286001768e-06, + "loss": 0.3387, + "step": 4150 + }, + { + "epoch": 4.425373134328359, + "grad_norm": 0.07375810082278332, + "learning_rate": 3.175735741273007e-06, + "loss": 0.3323, + "step": 4151 + }, + { + "epoch": 4.426439232409382, + "grad_norm": 0.0756484197290268, + "learning_rate": 3.164120595272837e-06, + "loss": 0.3339, + "step": 4152 + }, + { + "epoch": 4.427505330490405, + "grad_norm": 0.07829165152170597, + "learning_rate": 3.1525258544354354e-06, + "loss": 0.3358, + "step": 4153 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.07587711508081742, + "learning_rate": 3.140951525183691e-06, + "loss": 0.3389, + "step": 4154 + }, + { + "epoch": 4.429637526652452, + "grad_norm": 0.07159911504321198, + "learning_rate": 3.1293976139291814e-06, + "loss": 0.3293, + "step": 4155 + }, + { + "epoch": 4.430703624733475, + "grad_norm": 0.0699462370372574, + "learning_rate": 3.117864127072179e-06, + "loss": 0.3384, + "step": 4156 + }, + { + "epoch": 4.431769722814499, + "grad_norm": 0.07194403996523205, + "learning_rate": 3.106351071001621e-06, + "loss": 0.3298, + "step": 4157 + }, + { + "epoch": 4.432835820895522, + "grad_norm": 0.07348866543415371, + "learning_rate": 3.0948584520951488e-06, + "loss": 0.3327, + "step": 4158 + }, + { + "epoch": 4.433901918976546, + "grad_norm": 0.07901403560235466, + "learning_rate": 3.083386276719087e-06, + "loss": 0.3354, + "step": 4159 + }, + { + "epoch": 4.434968017057569, + "grad_norm": 0.07614682029865204, + "learning_rate": 3.071934551228406e-06, + "loss": 0.3399, + "step": 4160 + }, + { + "epoch": 4.436034115138593, + "grad_norm": 0.07095662489839566, + "learning_rate": 3.060503281966778e-06, + "loss": 0.3364, + "step": 4161 + }, + { + "epoch": 4.437100213219616, + "grad_norm": 0.07661517083301671, + "learning_rate": 3.049092475266533e-06, + "loss": 0.3407, + "step": 4162 + }, + { + "epoch": 4.438166311300639, + "grad_norm": 0.07612860870888066, + "learning_rate": 3.037702137448659e-06, + "loss": 0.336, + "step": 4163 + }, + { + "epoch": 4.439232409381663, + "grad_norm": 0.07257317111602125, + "learning_rate": 3.0263322748228117e-06, + "loss": 0.3347, + "step": 4164 + }, + { + "epoch": 4.440298507462686, + "grad_norm": 0.06942974184000834, + "learning_rate": 3.0149828936873084e-06, + "loss": 0.3379, + "step": 4165 + }, + { + "epoch": 4.44136460554371, + "grad_norm": 0.07367479114255061, + "learning_rate": 3.003654000329115e-06, + "loss": 0.3325, + "step": 4166 + }, + { + "epoch": 4.4424307036247335, + "grad_norm": 0.07405338625538695, + "learning_rate": 2.9923456010238426e-06, + "loss": 0.3377, + "step": 4167 + }, + { + "epoch": 4.443496801705757, + "grad_norm": 0.07391480199022413, + "learning_rate": 2.98105770203577e-06, + "loss": 0.3336, + "step": 4168 + }, + { + "epoch": 4.4445628997867805, + "grad_norm": 0.0697155481700767, + "learning_rate": 2.9697903096177973e-06, + "loss": 0.3323, + "step": 4169 + }, + { + "epoch": 4.445628997867804, + "grad_norm": 0.07479820610427203, + "learning_rate": 2.9585434300114734e-06, + "loss": 0.332, + "step": 4170 + }, + { + "epoch": 4.446695095948828, + "grad_norm": 0.07917174679867747, + "learning_rate": 2.9473170694469934e-06, + "loss": 0.3346, + "step": 4171 + }, + { + "epoch": 4.447761194029851, + "grad_norm": 0.07299094494689012, + "learning_rate": 2.9361112341431643e-06, + "loss": 0.3303, + "step": 4172 + }, + { + "epoch": 4.448827292110874, + "grad_norm": 0.07367637209674983, + "learning_rate": 2.924925930307447e-06, + "loss": 0.3346, + "step": 4173 + }, + { + "epoch": 4.449893390191898, + "grad_norm": 0.07495993556247976, + "learning_rate": 2.9137611641359222e-06, + "loss": 0.3321, + "step": 4174 + }, + { + "epoch": 4.450959488272921, + "grad_norm": 0.07267443806340775, + "learning_rate": 2.902616941813281e-06, + "loss": 0.3299, + "step": 4175 + }, + { + "epoch": 4.452025586353945, + "grad_norm": 0.0708925866597283, + "learning_rate": 2.8914932695128393e-06, + "loss": 0.3332, + "step": 4176 + }, + { + "epoch": 4.453091684434968, + "grad_norm": 0.07269636700113985, + "learning_rate": 2.880390153396544e-06, + "loss": 0.3303, + "step": 4177 + }, + { + "epoch": 4.454157782515992, + "grad_norm": 0.07332256346717313, + "learning_rate": 2.86930759961495e-06, + "loss": 0.3294, + "step": 4178 + }, + { + "epoch": 4.455223880597015, + "grad_norm": 0.07309550795550446, + "learning_rate": 2.8582456143071956e-06, + "loss": 0.3318, + "step": 4179 + }, + { + "epoch": 4.456289978678038, + "grad_norm": 0.07177315229711953, + "learning_rate": 2.8472042036010594e-06, + "loss": 0.3333, + "step": 4180 + }, + { + "epoch": 4.457356076759062, + "grad_norm": 0.0747294310777951, + "learning_rate": 2.8361833736129107e-06, + "loss": 0.3239, + "step": 4181 + }, + { + "epoch": 4.458422174840085, + "grad_norm": 0.07554108527569321, + "learning_rate": 2.8251831304477108e-06, + "loss": 0.3381, + "step": 4182 + }, + { + "epoch": 4.459488272921108, + "grad_norm": 0.07278816202889332, + "learning_rate": 2.81420348019902e-06, + "loss": 0.3347, + "step": 4183 + }, + { + "epoch": 4.460554371002132, + "grad_norm": 0.07323691646275544, + "learning_rate": 2.8032444289490012e-06, + "loss": 0.3337, + "step": 4184 + }, + { + "epoch": 4.461620469083155, + "grad_norm": 0.07761545782310925, + "learning_rate": 2.792305982768402e-06, + "loss": 0.3331, + "step": 4185 + }, + { + "epoch": 4.462686567164179, + "grad_norm": 0.0739374400916545, + "learning_rate": 2.7813881477165395e-06, + "loss": 0.3279, + "step": 4186 + }, + { + "epoch": 4.463752665245202, + "grad_norm": 0.07242003332809858, + "learning_rate": 2.7704909298413362e-06, + "loss": 0.3363, + "step": 4187 + }, + { + "epoch": 4.464818763326226, + "grad_norm": 0.07187298705284119, + "learning_rate": 2.7596143351792837e-06, + "loss": 0.3386, + "step": 4188 + }, + { + "epoch": 4.4658848614072495, + "grad_norm": 0.07239042923705526, + "learning_rate": 2.7487583697554555e-06, + "loss": 0.3323, + "step": 4189 + }, + { + "epoch": 4.466950959488273, + "grad_norm": 0.07678589531267026, + "learning_rate": 2.7379230395834764e-06, + "loss": 0.3329, + "step": 4190 + }, + { + "epoch": 4.468017057569297, + "grad_norm": 0.07189456346112492, + "learning_rate": 2.7271083506655728e-06, + "loss": 0.3382, + "step": 4191 + }, + { + "epoch": 4.46908315565032, + "grad_norm": 0.07515170520654751, + "learning_rate": 2.716314308992516e-06, + "loss": 0.3385, + "step": 4192 + }, + { + "epoch": 4.470149253731344, + "grad_norm": 0.07313542483059168, + "learning_rate": 2.7055409205436346e-06, + "loss": 0.3316, + "step": 4193 + }, + { + "epoch": 4.471215351812367, + "grad_norm": 0.07066165546162541, + "learning_rate": 2.6947881912868346e-06, + "loss": 0.3315, + "step": 4194 + }, + { + "epoch": 4.47228144989339, + "grad_norm": 0.07272630195981485, + "learning_rate": 2.6840561271785694e-06, + "loss": 0.3354, + "step": 4195 + }, + { + "epoch": 4.473347547974414, + "grad_norm": 0.07161497516187117, + "learning_rate": 2.6733447341638472e-06, + "loss": 0.3388, + "step": 4196 + }, + { + "epoch": 4.474413646055437, + "grad_norm": 0.07454202797819427, + "learning_rate": 2.662654018176212e-06, + "loss": 0.3333, + "step": 4197 + }, + { + "epoch": 4.475479744136461, + "grad_norm": 0.07508088009677, + "learning_rate": 2.6519839851377737e-06, + "loss": 0.3387, + "step": 4198 + }, + { + "epoch": 4.476545842217484, + "grad_norm": 0.07804787962517462, + "learning_rate": 2.6413346409591745e-06, + "loss": 0.3317, + "step": 4199 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.07128589665687961, + "learning_rate": 2.630705991539602e-06, + "loss": 0.3251, + "step": 4200 + }, + { + "epoch": 4.478678038379531, + "grad_norm": 0.07355996310666547, + "learning_rate": 2.6200980427667635e-06, + "loss": 0.3297, + "step": 4201 + }, + { + "epoch": 4.479744136460554, + "grad_norm": 0.07500471190524303, + "learning_rate": 2.6095108005169188e-06, + "loss": 0.3292, + "step": 4202 + }, + { + "epoch": 4.480810234541578, + "grad_norm": 0.07946453180732672, + "learning_rate": 2.5989442706548574e-06, + "loss": 0.3352, + "step": 4203 + }, + { + "epoch": 4.481876332622601, + "grad_norm": 0.06970897348276822, + "learning_rate": 2.5883984590338738e-06, + "loss": 0.3279, + "step": 4204 + }, + { + "epoch": 4.482942430703625, + "grad_norm": 0.08238106944138114, + "learning_rate": 2.5778733714958027e-06, + "loss": 0.3324, + "step": 4205 + }, + { + "epoch": 4.484008528784648, + "grad_norm": 0.07235582940293811, + "learning_rate": 2.5673690138710018e-06, + "loss": 0.3365, + "step": 4206 + }, + { + "epoch": 4.485074626865671, + "grad_norm": 0.07346252026606727, + "learning_rate": 2.556885391978341e-06, + "loss": 0.3349, + "step": 4207 + }, + { + "epoch": 4.486140724946695, + "grad_norm": 0.07558966474067527, + "learning_rate": 2.5464225116251886e-06, + "loss": 0.3358, + "step": 4208 + }, + { + "epoch": 4.4872068230277184, + "grad_norm": 0.07040408366742829, + "learning_rate": 2.53598037860745e-06, + "loss": 0.331, + "step": 4209 + }, + { + "epoch": 4.4882729211087415, + "grad_norm": 0.07488263115221298, + "learning_rate": 2.5255589987095207e-06, + "loss": 0.3342, + "step": 4210 + }, + { + "epoch": 4.4893390191897655, + "grad_norm": 0.07051801023243459, + "learning_rate": 2.5151583777042988e-06, + "loss": 0.3306, + "step": 4211 + }, + { + "epoch": 4.490405117270789, + "grad_norm": 0.07182136451708207, + "learning_rate": 2.5047785213531882e-06, + "loss": 0.3272, + "step": 4212 + }, + { + "epoch": 4.491471215351813, + "grad_norm": 0.07727816037885224, + "learning_rate": 2.494419435406097e-06, + "loss": 0.3354, + "step": 4213 + }, + { + "epoch": 4.492537313432836, + "grad_norm": 0.07344914255236583, + "learning_rate": 2.4840811256014164e-06, + "loss": 0.3293, + "step": 4214 + }, + { + "epoch": 4.49360341151386, + "grad_norm": 0.07192568760462288, + "learning_rate": 2.4737635976660325e-06, + "loss": 0.3287, + "step": 4215 + }, + { + "epoch": 4.494669509594883, + "grad_norm": 0.0701357026382714, + "learning_rate": 2.4634668573153154e-06, + "loss": 0.3386, + "step": 4216 + }, + { + "epoch": 4.495735607675906, + "grad_norm": 0.06637179767672738, + "learning_rate": 2.4531909102531294e-06, + "loss": 0.3325, + "step": 4217 + }, + { + "epoch": 4.49680170575693, + "grad_norm": 0.07656271024622706, + "learning_rate": 2.442935762171819e-06, + "loss": 0.3394, + "step": 4218 + }, + { + "epoch": 4.497867803837953, + "grad_norm": 0.0714666290950529, + "learning_rate": 2.4327014187521948e-06, + "loss": 0.3367, + "step": 4219 + }, + { + "epoch": 4.498933901918977, + "grad_norm": 0.07606369364800027, + "learning_rate": 2.422487885663554e-06, + "loss": 0.3386, + "step": 4220 + }, + { + "epoch": 4.5, + "grad_norm": 0.06844926088782544, + "learning_rate": 2.4122951685636674e-06, + "loss": 0.3326, + "step": 4221 + }, + { + "epoch": 4.501066098081023, + "grad_norm": 0.07046377328122992, + "learning_rate": 2.4021232730987622e-06, + "loss": 0.3352, + "step": 4222 + }, + { + "epoch": 4.502132196162047, + "grad_norm": 0.07467074933049231, + "learning_rate": 2.3919722049035433e-06, + "loss": 0.3334, + "step": 4223 + }, + { + "epoch": 4.50319829424307, + "grad_norm": 0.07060359053074924, + "learning_rate": 2.38184196960118e-06, + "loss": 0.3362, + "step": 4224 + }, + { + "epoch": 4.504264392324094, + "grad_norm": 0.07379325677186419, + "learning_rate": 2.3717325728032935e-06, + "loss": 0.3429, + "step": 4225 + }, + { + "epoch": 4.505330490405117, + "grad_norm": 0.07310277967696212, + "learning_rate": 2.3616440201099567e-06, + "loss": 0.3388, + "step": 4226 + }, + { + "epoch": 4.50639658848614, + "grad_norm": 0.07006559210490895, + "learning_rate": 2.3515763171097115e-06, + "loss": 0.3355, + "step": 4227 + }, + { + "epoch": 4.507462686567164, + "grad_norm": 0.07133173745971073, + "learning_rate": 2.341529469379551e-06, + "loss": 0.3318, + "step": 4228 + }, + { + "epoch": 4.508528784648187, + "grad_norm": 0.07098735264122637, + "learning_rate": 2.3315034824848846e-06, + "loss": 0.3346, + "step": 4229 + }, + { + "epoch": 4.509594882729211, + "grad_norm": 0.0777511243612946, + "learning_rate": 2.3214983619795995e-06, + "loss": 0.3382, + "step": 4230 + }, + { + "epoch": 4.5106609808102345, + "grad_norm": 0.06908524988843785, + "learning_rate": 2.3115141134060215e-06, + "loss": 0.3351, + "step": 4231 + }, + { + "epoch": 4.5117270788912585, + "grad_norm": 0.07463758297713084, + "learning_rate": 2.301550742294887e-06, + "loss": 0.3385, + "step": 4232 + }, + { + "epoch": 4.5127931769722816, + "grad_norm": 0.07271121038748618, + "learning_rate": 2.2916082541653983e-06, + "loss": 0.3349, + "step": 4233 + }, + { + "epoch": 4.513859275053305, + "grad_norm": 0.07475883982198514, + "learning_rate": 2.281686654525177e-06, + "loss": 0.333, + "step": 4234 + }, + { + "epoch": 4.514925373134329, + "grad_norm": 0.07660331206011552, + "learning_rate": 2.2717859488702665e-06, + "loss": 0.3378, + "step": 4235 + }, + { + "epoch": 4.515991471215352, + "grad_norm": 0.07040590430776542, + "learning_rate": 2.2619061426851463e-06, + "loss": 0.3353, + "step": 4236 + }, + { + "epoch": 4.517057569296375, + "grad_norm": 0.07585806379146305, + "learning_rate": 2.252047241442723e-06, + "loss": 0.3395, + "step": 4237 + }, + { + "epoch": 4.518123667377399, + "grad_norm": 0.07561687413038395, + "learning_rate": 2.2422092506043036e-06, + "loss": 0.3367, + "step": 4238 + }, + { + "epoch": 4.519189765458422, + "grad_norm": 0.06884029998313776, + "learning_rate": 2.2323921756196263e-06, + "loss": 0.336, + "step": 4239 + }, + { + "epoch": 4.520255863539446, + "grad_norm": 0.0699098475783151, + "learning_rate": 2.2225960219268526e-06, + "loss": 0.3348, + "step": 4240 + }, + { + "epoch": 4.521321961620469, + "grad_norm": 0.07422236573639096, + "learning_rate": 2.212820794952526e-06, + "loss": 0.3304, + "step": 4241 + }, + { + "epoch": 4.522388059701493, + "grad_norm": 0.07002733474761201, + "learning_rate": 2.2030665001116213e-06, + "loss": 0.3297, + "step": 4242 + }, + { + "epoch": 4.523454157782516, + "grad_norm": 0.07060390805754775, + "learning_rate": 2.1933331428075146e-06, + "loss": 0.3305, + "step": 4243 + }, + { + "epoch": 4.524520255863539, + "grad_norm": 0.07131829668068262, + "learning_rate": 2.1836207284319724e-06, + "loss": 0.331, + "step": 4244 + }, + { + "epoch": 4.525586353944563, + "grad_norm": 0.07092568911405243, + "learning_rate": 2.1739292623651755e-06, + "loss": 0.3324, + "step": 4245 + }, + { + "epoch": 4.526652452025586, + "grad_norm": 0.07054221449199592, + "learning_rate": 2.164258749975683e-06, + "loss": 0.3366, + "step": 4246 + }, + { + "epoch": 4.52771855010661, + "grad_norm": 0.07706442180790107, + "learning_rate": 2.154609196620472e-06, + "loss": 0.3371, + "step": 4247 + }, + { + "epoch": 4.528784648187633, + "grad_norm": 0.07508166585212031, + "learning_rate": 2.144980607644871e-06, + "loss": 0.3321, + "step": 4248 + }, + { + "epoch": 4.529850746268656, + "grad_norm": 0.07179015508589877, + "learning_rate": 2.135372988382636e-06, + "loss": 0.3326, + "step": 4249 + }, + { + "epoch": 4.53091684434968, + "grad_norm": 0.07298872176356, + "learning_rate": 2.1257863441558867e-06, + "loss": 0.3336, + "step": 4250 + }, + { + "epoch": 4.531982942430703, + "grad_norm": 0.07322352264632, + "learning_rate": 2.116220680275114e-06, + "loss": 0.335, + "step": 4251 + }, + { + "epoch": 4.533049040511727, + "grad_norm": 0.07412360547888225, + "learning_rate": 2.1066760020392075e-06, + "loss": 0.333, + "step": 4252 + }, + { + "epoch": 4.5341151385927505, + "grad_norm": 0.07037038219362778, + "learning_rate": 2.0971523147354224e-06, + "loss": 0.3345, + "step": 4253 + }, + { + "epoch": 4.535181236673774, + "grad_norm": 0.07385081135383645, + "learning_rate": 2.0876496236393915e-06, + "loss": 0.3361, + "step": 4254 + }, + { + "epoch": 4.536247334754798, + "grad_norm": 0.07977224842487596, + "learning_rate": 2.0781679340151007e-06, + "loss": 0.3322, + "step": 4255 + }, + { + "epoch": 4.537313432835821, + "grad_norm": 0.07150280522896328, + "learning_rate": 2.0687072511149207e-06, + "loss": 0.3343, + "step": 4256 + }, + { + "epoch": 4.538379530916845, + "grad_norm": 0.07550230379695838, + "learning_rate": 2.0592675801795715e-06, + "loss": 0.3289, + "step": 4257 + }, + { + "epoch": 4.539445628997868, + "grad_norm": 0.07468614314295811, + "learning_rate": 2.0498489264381537e-06, + "loss": 0.3364, + "step": 4258 + }, + { + "epoch": 4.540511727078892, + "grad_norm": 0.07766324270385316, + "learning_rate": 2.040451295108099e-06, + "loss": 0.3332, + "step": 4259 + }, + { + "epoch": 4.541577825159915, + "grad_norm": 0.0736473092823455, + "learning_rate": 2.0310746913952075e-06, + "loss": 0.3307, + "step": 4260 + }, + { + "epoch": 4.542643923240938, + "grad_norm": 0.0734896770686822, + "learning_rate": 2.0217191204936393e-06, + "loss": 0.3396, + "step": 4261 + }, + { + "epoch": 4.543710021321962, + "grad_norm": 0.06952100029741023, + "learning_rate": 2.012384587585885e-06, + "loss": 0.3346, + "step": 4262 + }, + { + "epoch": 4.544776119402985, + "grad_norm": 0.07462393218580446, + "learning_rate": 2.003071097842795e-06, + "loss": 0.3356, + "step": 4263 + }, + { + "epoch": 4.545842217484008, + "grad_norm": 0.07095210368849357, + "learning_rate": 1.993778656423557e-06, + "loss": 0.3299, + "step": 4264 + }, + { + "epoch": 4.546908315565032, + "grad_norm": 0.07173596162673962, + "learning_rate": 1.9845072684757084e-06, + "loss": 0.3331, + "step": 4265 + }, + { + "epoch": 4.547974413646055, + "grad_norm": 0.07462211744625402, + "learning_rate": 1.975256939135104e-06, + "loss": 0.3331, + "step": 4266 + }, + { + "epoch": 4.549040511727079, + "grad_norm": 0.06797661761770143, + "learning_rate": 1.966027673525952e-06, + "loss": 0.3349, + "step": 4267 + }, + { + "epoch": 4.550106609808102, + "grad_norm": 0.06909061390055764, + "learning_rate": 1.9568194767607897e-06, + "loss": 0.333, + "step": 4268 + }, + { + "epoch": 4.551172707889126, + "grad_norm": 0.07230285207266503, + "learning_rate": 1.9476323539404697e-06, + "loss": 0.336, + "step": 4269 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.06795386640951454, + "learning_rate": 1.9384663101541834e-06, + "loss": 0.3306, + "step": 4270 + }, + { + "epoch": 4.553304904051172, + "grad_norm": 0.07185284208858062, + "learning_rate": 1.9293213504794474e-06, + "loss": 0.3329, + "step": 4271 + }, + { + "epoch": 4.554371002132196, + "grad_norm": 0.06964066381645194, + "learning_rate": 1.9201974799820976e-06, + "loss": 0.3336, + "step": 4272 + }, + { + "epoch": 4.5554371002132195, + "grad_norm": 0.06841885093169003, + "learning_rate": 1.911094703716274e-06, + "loss": 0.3373, + "step": 4273 + }, + { + "epoch": 4.556503198294243, + "grad_norm": 0.07829671714309364, + "learning_rate": 1.9020130267244408e-06, + "loss": 0.3337, + "step": 4274 + }, + { + "epoch": 4.5575692963752665, + "grad_norm": 0.07118332494914492, + "learning_rate": 1.8929524540373868e-06, + "loss": 0.3337, + "step": 4275 + }, + { + "epoch": 4.55863539445629, + "grad_norm": 0.07251578667557201, + "learning_rate": 1.8839129906741903e-06, + "loss": 0.3374, + "step": 4276 + }, + { + "epoch": 4.559701492537314, + "grad_norm": 0.06723960489550845, + "learning_rate": 1.8748946416422464e-06, + "loss": 0.3306, + "step": 4277 + }, + { + "epoch": 4.560767590618337, + "grad_norm": 0.07111432053727844, + "learning_rate": 1.8658974119372475e-06, + "loss": 0.3356, + "step": 4278 + }, + { + "epoch": 4.561833688699361, + "grad_norm": 0.06575145639594351, + "learning_rate": 1.856921306543198e-06, + "loss": 0.3326, + "step": 4279 + }, + { + "epoch": 4.562899786780384, + "grad_norm": 0.07466010683175027, + "learning_rate": 1.847966330432387e-06, + "loss": 0.3331, + "step": 4280 + }, + { + "epoch": 4.563965884861407, + "grad_norm": 0.07631299648143247, + "learning_rate": 1.839032488565411e-06, + "loss": 0.3351, + "step": 4281 + }, + { + "epoch": 4.565031982942431, + "grad_norm": 0.06992148350886665, + "learning_rate": 1.8301197858911512e-06, + "loss": 0.3341, + "step": 4282 + }, + { + "epoch": 4.566098081023454, + "grad_norm": 0.06950406008223693, + "learning_rate": 1.8212282273467874e-06, + "loss": 0.3326, + "step": 4283 + }, + { + "epoch": 4.567164179104478, + "grad_norm": 0.0715513717486181, + "learning_rate": 1.8123578178577706e-06, + "loss": 0.3314, + "step": 4284 + }, + { + "epoch": 4.568230277185501, + "grad_norm": 0.07759734713431235, + "learning_rate": 1.8035085623378544e-06, + "loss": 0.3284, + "step": 4285 + }, + { + "epoch": 4.569296375266525, + "grad_norm": 0.07289404960905593, + "learning_rate": 1.7946804656890648e-06, + "loss": 0.3339, + "step": 4286 + }, + { + "epoch": 4.570362473347548, + "grad_norm": 0.07031085270085344, + "learning_rate": 1.7858735328017119e-06, + "loss": 0.3352, + "step": 4287 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.0683840513250094, + "learning_rate": 1.7770877685543687e-06, + "loss": 0.3319, + "step": 4288 + }, + { + "epoch": 4.572494669509595, + "grad_norm": 0.07276081186481145, + "learning_rate": 1.768323177813902e-06, + "loss": 0.3314, + "step": 4289 + }, + { + "epoch": 4.573560767590618, + "grad_norm": 0.07091066982025185, + "learning_rate": 1.7595797654354374e-06, + "loss": 0.3327, + "step": 4290 + }, + { + "epoch": 4.574626865671641, + "grad_norm": 0.07166306063065245, + "learning_rate": 1.750857536262367e-06, + "loss": 0.3349, + "step": 4291 + }, + { + "epoch": 4.575692963752665, + "grad_norm": 0.07054961140916155, + "learning_rate": 1.7421564951263547e-06, + "loss": 0.332, + "step": 4292 + }, + { + "epoch": 4.576759061833688, + "grad_norm": 0.07361301895145772, + "learning_rate": 1.7334766468473275e-06, + "loss": 0.3337, + "step": 4293 + }, + { + "epoch": 4.577825159914712, + "grad_norm": 0.07045798280358322, + "learning_rate": 1.7248179962334699e-06, + "loss": 0.3359, + "step": 4294 + }, + { + "epoch": 4.5788912579957355, + "grad_norm": 0.06774482441192077, + "learning_rate": 1.7161805480812166e-06, + "loss": 0.3327, + "step": 4295 + }, + { + "epoch": 4.5799573560767595, + "grad_norm": 0.0718262080789473, + "learning_rate": 1.7075643071752735e-06, + "loss": 0.3345, + "step": 4296 + }, + { + "epoch": 4.581023454157783, + "grad_norm": 0.0691615331261111, + "learning_rate": 1.6989692782885914e-06, + "loss": 0.332, + "step": 4297 + }, + { + "epoch": 4.582089552238806, + "grad_norm": 0.0731179918926363, + "learning_rate": 1.6903954661823618e-06, + "loss": 0.3332, + "step": 4298 + }, + { + "epoch": 4.58315565031983, + "grad_norm": 0.07078605872262074, + "learning_rate": 1.6818428756060346e-06, + "loss": 0.3331, + "step": 4299 + }, + { + "epoch": 4.584221748400853, + "grad_norm": 0.06734701136745615, + "learning_rate": 1.6733115112973042e-06, + "loss": 0.3328, + "step": 4300 + }, + { + "epoch": 4.585287846481877, + "grad_norm": 0.06790749484835054, + "learning_rate": 1.6648013779820972e-06, + "loss": 0.3385, + "step": 4301 + }, + { + "epoch": 4.5863539445629, + "grad_norm": 0.07187466083950145, + "learning_rate": 1.656312480374589e-06, + "loss": 0.3356, + "step": 4302 + }, + { + "epoch": 4.587420042643923, + "grad_norm": 0.07459252483448003, + "learning_rate": 1.6478448231771914e-06, + "loss": 0.333, + "step": 4303 + }, + { + "epoch": 4.588486140724947, + "grad_norm": 0.0717675220591279, + "learning_rate": 1.639398411080535e-06, + "loss": 0.3319, + "step": 4304 + }, + { + "epoch": 4.58955223880597, + "grad_norm": 0.07118074349730492, + "learning_rate": 1.6309732487634989e-06, + "loss": 0.3318, + "step": 4305 + }, + { + "epoch": 4.590618336886994, + "grad_norm": 0.06815517038121584, + "learning_rate": 1.6225693408931898e-06, + "loss": 0.3395, + "step": 4306 + }, + { + "epoch": 4.591684434968017, + "grad_norm": 0.06997889027085387, + "learning_rate": 1.6141866921249282e-06, + "loss": 0.331, + "step": 4307 + }, + { + "epoch": 4.59275053304904, + "grad_norm": 0.07279040293236479, + "learning_rate": 1.6058253071022711e-06, + "loss": 0.3322, + "step": 4308 + }, + { + "epoch": 4.593816631130064, + "grad_norm": 0.0745884036380402, + "learning_rate": 1.5974851904569931e-06, + "loss": 0.3353, + "step": 4309 + }, + { + "epoch": 4.594882729211087, + "grad_norm": 0.06892857071091296, + "learning_rate": 1.589166346809079e-06, + "loss": 0.3336, + "step": 4310 + }, + { + "epoch": 4.595948827292111, + "grad_norm": 0.06646283141120592, + "learning_rate": 1.58086878076674e-06, + "loss": 0.3355, + "step": 4311 + }, + { + "epoch": 4.597014925373134, + "grad_norm": 0.06892657692261639, + "learning_rate": 1.5725924969263973e-06, + "loss": 0.3341, + "step": 4312 + }, + { + "epoch": 4.598081023454158, + "grad_norm": 0.07247620559120913, + "learning_rate": 1.5643374998726768e-06, + "loss": 0.3371, + "step": 4313 + }, + { + "epoch": 4.599147121535181, + "grad_norm": 0.0682871385404355, + "learning_rate": 1.5561037941784184e-06, + "loss": 0.337, + "step": 4314 + }, + { + "epoch": 4.600213219616204, + "grad_norm": 0.07060756058234041, + "learning_rate": 1.5478913844046716e-06, + "loss": 0.3356, + "step": 4315 + }, + { + "epoch": 4.601279317697228, + "grad_norm": 0.06829225466485105, + "learning_rate": 1.5397002751006863e-06, + "loss": 0.337, + "step": 4316 + }, + { + "epoch": 4.6023454157782515, + "grad_norm": 0.06766530449323094, + "learning_rate": 1.531530470803908e-06, + "loss": 0.3321, + "step": 4317 + }, + { + "epoch": 4.603411513859275, + "grad_norm": 0.0670931989643599, + "learning_rate": 1.5233819760399793e-06, + "loss": 0.335, + "step": 4318 + }, + { + "epoch": 4.604477611940299, + "grad_norm": 0.06938288654468625, + "learning_rate": 1.5152547953227515e-06, + "loss": 0.3318, + "step": 4319 + }, + { + "epoch": 4.605543710021322, + "grad_norm": 0.06867733948754896, + "learning_rate": 1.5071489331542543e-06, + "loss": 0.3314, + "step": 4320 + }, + { + "epoch": 4.606609808102346, + "grad_norm": 0.07777991942380198, + "learning_rate": 1.499064394024714e-06, + "loss": 0.3385, + "step": 4321 + }, + { + "epoch": 4.607675906183369, + "grad_norm": 0.06967592936560713, + "learning_rate": 1.4910011824125436e-06, + "loss": 0.3323, + "step": 4322 + }, + { + "epoch": 4.608742004264393, + "grad_norm": 0.06751157296924212, + "learning_rate": 1.482959302784357e-06, + "loss": 0.3393, + "step": 4323 + }, + { + "epoch": 4.609808102345416, + "grad_norm": 0.06979920693263406, + "learning_rate": 1.4749387595949195e-06, + "loss": 0.336, + "step": 4324 + }, + { + "epoch": 4.610874200426439, + "grad_norm": 0.07053350912123188, + "learning_rate": 1.4669395572872015e-06, + "loss": 0.3298, + "step": 4325 + }, + { + "epoch": 4.611940298507463, + "grad_norm": 0.07371928778646851, + "learning_rate": 1.4589617002923516e-06, + "loss": 0.3342, + "step": 4326 + }, + { + "epoch": 4.613006396588486, + "grad_norm": 0.07223172335830214, + "learning_rate": 1.4510051930296799e-06, + "loss": 0.3321, + "step": 4327 + }, + { + "epoch": 4.61407249466951, + "grad_norm": 0.066958488669665, + "learning_rate": 1.4430700399066821e-06, + "loss": 0.3312, + "step": 4328 + }, + { + "epoch": 4.615138592750533, + "grad_norm": 0.07053436931086877, + "learning_rate": 1.435156245319016e-06, + "loss": 0.3372, + "step": 4329 + }, + { + "epoch": 4.616204690831556, + "grad_norm": 0.06968744357208878, + "learning_rate": 1.4272638136505257e-06, + "loss": 0.3368, + "step": 4330 + }, + { + "epoch": 4.61727078891258, + "grad_norm": 0.06859881353886961, + "learning_rate": 1.4193927492731897e-06, + "loss": 0.3277, + "step": 4331 + }, + { + "epoch": 4.618336886993603, + "grad_norm": 0.07231488171540013, + "learning_rate": 1.4115430565471776e-06, + "loss": 0.3321, + "step": 4332 + }, + { + "epoch": 4.619402985074627, + "grad_norm": 0.0666791642191909, + "learning_rate": 1.4037147398208118e-06, + "loss": 0.3371, + "step": 4333 + }, + { + "epoch": 4.62046908315565, + "grad_norm": 0.0776009579849058, + "learning_rate": 1.3959078034305785e-06, + "loss": 0.3347, + "step": 4334 + }, + { + "epoch": 4.621535181236673, + "grad_norm": 0.06614100139929324, + "learning_rate": 1.3881222517010983e-06, + "loss": 0.3376, + "step": 4335 + }, + { + "epoch": 4.622601279317697, + "grad_norm": 0.06757259397296564, + "learning_rate": 1.3803580889451795e-06, + "loss": 0.3353, + "step": 4336 + }, + { + "epoch": 4.6236673773987205, + "grad_norm": 0.07131647416320316, + "learning_rate": 1.3726153194637548e-06, + "loss": 0.3441, + "step": 4337 + }, + { + "epoch": 4.6247334754797444, + "grad_norm": 0.06932530118685151, + "learning_rate": 1.3648939475459178e-06, + "loss": 0.329, + "step": 4338 + }, + { + "epoch": 4.6257995735607675, + "grad_norm": 0.07255409329631657, + "learning_rate": 1.3571939774689091e-06, + "loss": 0.3288, + "step": 4339 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.06673195482515673, + "learning_rate": 1.3495154134981126e-06, + "loss": 0.3354, + "step": 4340 + }, + { + "epoch": 4.627931769722815, + "grad_norm": 0.06572556100183624, + "learning_rate": 1.34185825988705e-06, + "loss": 0.3329, + "step": 4341 + }, + { + "epoch": 4.628997867803838, + "grad_norm": 0.06490987870262467, + "learning_rate": 1.3342225208773906e-06, + "loss": 0.3354, + "step": 4342 + }, + { + "epoch": 4.630063965884862, + "grad_norm": 0.07309588285106411, + "learning_rate": 1.3266082006989333e-06, + "loss": 0.331, + "step": 4343 + }, + { + "epoch": 4.631130063965885, + "grad_norm": 0.07038414592289387, + "learning_rate": 1.3190153035696196e-06, + "loss": 0.3435, + "step": 4344 + }, + { + "epoch": 4.632196162046908, + "grad_norm": 0.06928635687457206, + "learning_rate": 1.311443833695516e-06, + "loss": 0.3364, + "step": 4345 + }, + { + "epoch": 4.633262260127932, + "grad_norm": 0.06537020208079272, + "learning_rate": 1.303893795270823e-06, + "loss": 0.3316, + "step": 4346 + }, + { + "epoch": 4.634328358208955, + "grad_norm": 0.06896350457838996, + "learning_rate": 1.296365192477871e-06, + "loss": 0.3367, + "step": 4347 + }, + { + "epoch": 4.635394456289979, + "grad_norm": 0.06599969292032785, + "learning_rate": 1.2888580294871233e-06, + "loss": 0.3346, + "step": 4348 + }, + { + "epoch": 4.636460554371002, + "grad_norm": 0.06925949481192152, + "learning_rate": 1.281372310457143e-06, + "loss": 0.3349, + "step": 4349 + }, + { + "epoch": 4.637526652452026, + "grad_norm": 0.07032567037244174, + "learning_rate": 1.2739080395346347e-06, + "loss": 0.3324, + "step": 4350 + }, + { + "epoch": 4.638592750533049, + "grad_norm": 0.06808103931489896, + "learning_rate": 1.2664652208544205e-06, + "loss": 0.3364, + "step": 4351 + }, + { + "epoch": 4.639658848614072, + "grad_norm": 0.07415952469477234, + "learning_rate": 1.2590438585394372e-06, + "loss": 0.3368, + "step": 4352 + }, + { + "epoch": 4.640724946695096, + "grad_norm": 0.07472723302057231, + "learning_rate": 1.2516439567007254e-06, + "loss": 0.3312, + "step": 4353 + }, + { + "epoch": 4.641791044776119, + "grad_norm": 0.06723931963836592, + "learning_rate": 1.2442655194374464e-06, + "loss": 0.3363, + "step": 4354 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 0.06731787952306181, + "learning_rate": 1.2369085508368862e-06, + "loss": 0.3384, + "step": 4355 + }, + { + "epoch": 4.643923240938166, + "grad_norm": 0.06995325462163068, + "learning_rate": 1.2295730549744023e-06, + "loss": 0.3331, + "step": 4356 + }, + { + "epoch": 4.644989339019189, + "grad_norm": 0.07264991945604682, + "learning_rate": 1.2222590359134868e-06, + "loss": 0.3335, + "step": 4357 + }, + { + "epoch": 4.646055437100213, + "grad_norm": 0.0664979493499962, + "learning_rate": 1.2149664977057296e-06, + "loss": 0.331, + "step": 4358 + }, + { + "epoch": 4.6471215351812365, + "grad_norm": 0.06852799522357257, + "learning_rate": 1.2076954443908152e-06, + "loss": 0.3342, + "step": 4359 + }, + { + "epoch": 4.6481876332622605, + "grad_norm": 0.0700385586316551, + "learning_rate": 1.20044587999653e-06, + "loss": 0.3324, + "step": 4360 + }, + { + "epoch": 4.649253731343284, + "grad_norm": 0.06744640423267867, + "learning_rate": 1.1932178085387514e-06, + "loss": 0.3336, + "step": 4361 + }, + { + "epoch": 4.650319829424307, + "grad_norm": 0.07133474365028102, + "learning_rate": 1.1860112340214624e-06, + "loss": 0.3357, + "step": 4362 + }, + { + "epoch": 4.651385927505331, + "grad_norm": 0.06780843638739921, + "learning_rate": 1.178826160436728e-06, + "loss": 0.3339, + "step": 4363 + }, + { + "epoch": 4.652452025586354, + "grad_norm": 0.06642970597324976, + "learning_rate": 1.1716625917647018e-06, + "loss": 0.3341, + "step": 4364 + }, + { + "epoch": 4.653518123667378, + "grad_norm": 0.07245838112455406, + "learning_rate": 1.1645205319736318e-06, + "loss": 0.3345, + "step": 4365 + }, + { + "epoch": 4.654584221748401, + "grad_norm": 0.07452837382246935, + "learning_rate": 1.1573999850198515e-06, + "loss": 0.3311, + "step": 4366 + }, + { + "epoch": 4.655650319829425, + "grad_norm": 0.07068223858842627, + "learning_rate": 1.1503009548477695e-06, + "loss": 0.3382, + "step": 4367 + }, + { + "epoch": 4.656716417910448, + "grad_norm": 0.06557927350005839, + "learning_rate": 1.143223445389876e-06, + "loss": 0.3304, + "step": 4368 + }, + { + "epoch": 4.657782515991471, + "grad_norm": 0.0663596931630506, + "learning_rate": 1.1361674605667505e-06, + "loss": 0.3306, + "step": 4369 + }, + { + "epoch": 4.658848614072495, + "grad_norm": 0.06703039652535904, + "learning_rate": 1.1291330042870396e-06, + "loss": 0.3375, + "step": 4370 + }, + { + "epoch": 4.659914712153518, + "grad_norm": 0.06939523423465807, + "learning_rate": 1.122120080447462e-06, + "loss": 0.3369, + "step": 4371 + }, + { + "epoch": 4.660980810234541, + "grad_norm": 0.07088459747200242, + "learning_rate": 1.115128692932821e-06, + "loss": 0.3397, + "step": 4372 + }, + { + "epoch": 4.662046908315565, + "grad_norm": 0.07022817101518151, + "learning_rate": 1.1081588456159786e-06, + "loss": 0.3359, + "step": 4373 + }, + { + "epoch": 4.663113006396588, + "grad_norm": 0.07123058169602929, + "learning_rate": 1.10121054235786e-06, + "loss": 0.3333, + "step": 4374 + }, + { + "epoch": 4.664179104477612, + "grad_norm": 0.0675246173798063, + "learning_rate": 1.0942837870074795e-06, + "loss": 0.3333, + "step": 4375 + }, + { + "epoch": 4.665245202558635, + "grad_norm": 0.06471932900262051, + "learning_rate": 1.087378583401888e-06, + "loss": 0.3321, + "step": 4376 + }, + { + "epoch": 4.666311300639659, + "grad_norm": 0.06686992898419163, + "learning_rate": 1.080494935366212e-06, + "loss": 0.3331, + "step": 4377 + }, + { + "epoch": 4.667377398720682, + "grad_norm": 0.06937875917999614, + "learning_rate": 1.073632846713637e-06, + "loss": 0.3362, + "step": 4378 + }, + { + "epoch": 4.6684434968017055, + "grad_norm": 0.06987137809858991, + "learning_rate": 1.0667923212454023e-06, + "loss": 0.3368, + "step": 4379 + }, + { + "epoch": 4.669509594882729, + "grad_norm": 0.06867240275257108, + "learning_rate": 1.0599733627508013e-06, + "loss": 0.3346, + "step": 4380 + }, + { + "epoch": 4.6705756929637525, + "grad_norm": 0.06826231476314719, + "learning_rate": 1.0531759750071857e-06, + "loss": 0.3313, + "step": 4381 + }, + { + "epoch": 4.6716417910447765, + "grad_norm": 0.06977322414302481, + "learning_rate": 1.0464001617799525e-06, + "loss": 0.3349, + "step": 4382 + }, + { + "epoch": 4.6727078891258, + "grad_norm": 0.07065917384047074, + "learning_rate": 1.0396459268225523e-06, + "loss": 0.3303, + "step": 4383 + }, + { + "epoch": 4.673773987206823, + "grad_norm": 0.06723899201810622, + "learning_rate": 1.0329132738764814e-06, + "loss": 0.3376, + "step": 4384 + }, + { + "epoch": 4.674840085287847, + "grad_norm": 0.06708033630230664, + "learning_rate": 1.0262022066712763e-06, + "loss": 0.3368, + "step": 4385 + }, + { + "epoch": 4.67590618336887, + "grad_norm": 0.06703109426648023, + "learning_rate": 1.0195127289245188e-06, + "loss": 0.3305, + "step": 4386 + }, + { + "epoch": 4.676972281449894, + "grad_norm": 0.07014172973051326, + "learning_rate": 1.0128448443418315e-06, + "loss": 0.3354, + "step": 4387 + }, + { + "epoch": 4.678038379530917, + "grad_norm": 0.07076031888009388, + "learning_rate": 1.0061985566168863e-06, + "loss": 0.3393, + "step": 4388 + }, + { + "epoch": 4.67910447761194, + "grad_norm": 0.06971401068803788, + "learning_rate": 9.99573869431365e-07, + "loss": 0.3405, + "step": 4389 + }, + { + "epoch": 4.680170575692964, + "grad_norm": 0.06595206438851949, + "learning_rate": 9.92970786455012e-07, + "loss": 0.3291, + "step": 4390 + }, + { + "epoch": 4.681236673773987, + "grad_norm": 0.07143631976414835, + "learning_rate": 9.863893113455857e-07, + "loss": 0.3294, + "step": 4391 + }, + { + "epoch": 4.682302771855011, + "grad_norm": 0.06704484678485922, + "learning_rate": 9.798294477488901e-07, + "loss": 0.3355, + "step": 4392 + }, + { + "epoch": 4.683368869936034, + "grad_norm": 0.06626390001420994, + "learning_rate": 9.732911992987382e-07, + "loss": 0.3352, + "step": 4393 + }, + { + "epoch": 4.684434968017058, + "grad_norm": 0.06399841711925291, + "learning_rate": 9.667745696169839e-07, + "loss": 0.3365, + "step": 4394 + }, + { + "epoch": 4.685501066098081, + "grad_norm": 0.06442430133602112, + "learning_rate": 9.602795623135042e-07, + "loss": 0.3306, + "step": 4395 + }, + { + "epoch": 4.686567164179104, + "grad_norm": 0.06647357574669362, + "learning_rate": 9.5380618098619e-07, + "loss": 0.3311, + "step": 4396 + }, + { + "epoch": 4.687633262260128, + "grad_norm": 0.06493583228185477, + "learning_rate": 9.473544292209591e-07, + "loss": 0.327, + "step": 4397 + }, + { + "epoch": 4.688699360341151, + "grad_norm": 0.06666580308482166, + "learning_rate": 9.409243105917532e-07, + "loss": 0.334, + "step": 4398 + }, + { + "epoch": 4.689765458422174, + "grad_norm": 0.06921334075814284, + "learning_rate": 9.345158286605182e-07, + "loss": 0.3363, + "step": 4399 + }, + { + "epoch": 4.690831556503198, + "grad_norm": 0.06673171229849625, + "learning_rate": 9.281289869772192e-07, + "loss": 0.328, + "step": 4400 + }, + { + "epoch": 4.6918976545842215, + "grad_norm": 0.06584271320243143, + "learning_rate": 9.217637890798348e-07, + "loss": 0.3394, + "step": 4401 + }, + { + "epoch": 4.6929637526652455, + "grad_norm": 0.0671007285287849, + "learning_rate": 9.154202384943622e-07, + "loss": 0.3318, + "step": 4402 + }, + { + "epoch": 4.6940298507462686, + "grad_norm": 0.06331709951463382, + "learning_rate": 9.090983387347863e-07, + "loss": 0.3347, + "step": 4403 + }, + { + "epoch": 4.6950959488272925, + "grad_norm": 0.06898127420033802, + "learning_rate": 9.027980933031188e-07, + "loss": 0.3374, + "step": 4404 + }, + { + "epoch": 4.696162046908316, + "grad_norm": 0.07051509740595827, + "learning_rate": 8.965195056893638e-07, + "loss": 0.3351, + "step": 4405 + }, + { + "epoch": 4.697228144989339, + "grad_norm": 0.06642503187647329, + "learning_rate": 8.902625793715391e-07, + "loss": 0.3289, + "step": 4406 + }, + { + "epoch": 4.698294243070363, + "grad_norm": 0.062149770848458644, + "learning_rate": 8.8402731781565e-07, + "loss": 0.3321, + "step": 4407 + }, + { + "epoch": 4.699360341151386, + "grad_norm": 0.06584174207599507, + "learning_rate": 8.778137244757068e-07, + "loss": 0.3303, + "step": 4408 + }, + { + "epoch": 4.70042643923241, + "grad_norm": 0.06691401972973397, + "learning_rate": 8.716218027937251e-07, + "loss": 0.3305, + "step": 4409 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.0690126967878217, + "learning_rate": 8.654515561997034e-07, + "loss": 0.3354, + "step": 4410 + }, + { + "epoch": 4.702558635394456, + "grad_norm": 0.06740746680029383, + "learning_rate": 8.593029881116322e-07, + "loss": 0.3242, + "step": 4411 + }, + { + "epoch": 4.70362473347548, + "grad_norm": 0.06898874376741002, + "learning_rate": 8.531761019355067e-07, + "loss": 0.3387, + "step": 4412 + }, + { + "epoch": 4.704690831556503, + "grad_norm": 0.06501918561728087, + "learning_rate": 8.470709010653011e-07, + "loss": 0.3329, + "step": 4413 + }, + { + "epoch": 4.705756929637527, + "grad_norm": 0.06601690952715261, + "learning_rate": 8.409873888829768e-07, + "loss": 0.3378, + "step": 4414 + }, + { + "epoch": 4.70682302771855, + "grad_norm": 0.06584452917029096, + "learning_rate": 8.349255687584867e-07, + "loss": 0.3418, + "step": 4415 + }, + { + "epoch": 4.707889125799573, + "grad_norm": 0.07132046643473576, + "learning_rate": 8.288854440497629e-07, + "loss": 0.3356, + "step": 4416 + }, + { + "epoch": 4.708955223880597, + "grad_norm": 0.06688057560968289, + "learning_rate": 8.228670181027199e-07, + "loss": 0.3406, + "step": 4417 + }, + { + "epoch": 4.71002132196162, + "grad_norm": 0.06467424384256552, + "learning_rate": 8.168702942512507e-07, + "loss": 0.3404, + "step": 4418 + }, + { + "epoch": 4.711087420042644, + "grad_norm": 0.06658144857820635, + "learning_rate": 8.108952758172317e-07, + "loss": 0.3373, + "step": 4419 + }, + { + "epoch": 4.712153518123667, + "grad_norm": 0.06579984809073838, + "learning_rate": 8.049419661105129e-07, + "loss": 0.3327, + "step": 4420 + }, + { + "epoch": 4.713219616204691, + "grad_norm": 0.06783834549966072, + "learning_rate": 7.99010368428923e-07, + "loss": 0.3363, + "step": 4421 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.065981531581556, + "learning_rate": 7.93100486058247e-07, + "loss": 0.3362, + "step": 4422 + }, + { + "epoch": 4.7153518123667375, + "grad_norm": 0.066232413118734, + "learning_rate": 7.872123222722617e-07, + "loss": 0.3285, + "step": 4423 + }, + { + "epoch": 4.7164179104477615, + "grad_norm": 0.06404021502472225, + "learning_rate": 7.813458803327001e-07, + "loss": 0.333, + "step": 4424 + }, + { + "epoch": 4.717484008528785, + "grad_norm": 0.06489096407537417, + "learning_rate": 7.755011634892695e-07, + "loss": 0.3336, + "step": 4425 + }, + { + "epoch": 4.718550106609808, + "grad_norm": 0.06614640728593563, + "learning_rate": 7.696781749796333e-07, + "loss": 0.3326, + "step": 4426 + }, + { + "epoch": 4.719616204690832, + "grad_norm": 0.06662313517511247, + "learning_rate": 7.638769180294292e-07, + "loss": 0.3354, + "step": 4427 + }, + { + "epoch": 4.720682302771855, + "grad_norm": 0.06630182982279928, + "learning_rate": 7.580973958522553e-07, + "loss": 0.3274, + "step": 4428 + }, + { + "epoch": 4.721748400852879, + "grad_norm": 0.06231304613251542, + "learning_rate": 7.523396116496573e-07, + "loss": 0.3351, + "step": 4429 + }, + { + "epoch": 4.722814498933902, + "grad_norm": 0.06561797780372054, + "learning_rate": 7.46603568611155e-07, + "loss": 0.3303, + "step": 4430 + }, + { + "epoch": 4.723880597014926, + "grad_norm": 0.06606066948383217, + "learning_rate": 7.408892699142156e-07, + "loss": 0.3337, + "step": 4431 + }, + { + "epoch": 4.724946695095949, + "grad_norm": 0.06532372136865669, + "learning_rate": 7.35196718724267e-07, + "loss": 0.3361, + "step": 4432 + }, + { + "epoch": 4.726012793176972, + "grad_norm": 0.07451746009716578, + "learning_rate": 7.295259181946801e-07, + "loss": 0.3353, + "step": 4433 + }, + { + "epoch": 4.727078891257996, + "grad_norm": 0.06275057701061912, + "learning_rate": 7.23876871466791e-07, + "loss": 0.3384, + "step": 4434 + }, + { + "epoch": 4.728144989339019, + "grad_norm": 0.06554740340061337, + "learning_rate": 7.182495816698787e-07, + "loss": 0.3302, + "step": 4435 + }, + { + "epoch": 4.729211087420042, + "grad_norm": 0.0690906174501306, + "learning_rate": 7.126440519211608e-07, + "loss": 0.3378, + "step": 4436 + }, + { + "epoch": 4.730277185501066, + "grad_norm": 0.06708686099386059, + "learning_rate": 7.070602853258112e-07, + "loss": 0.3358, + "step": 4437 + }, + { + "epoch": 4.731343283582089, + "grad_norm": 0.06891228927845029, + "learning_rate": 7.014982849769558e-07, + "loss": 0.3354, + "step": 4438 + }, + { + "epoch": 4.732409381663113, + "grad_norm": 0.06537602757352802, + "learning_rate": 6.95958053955641e-07, + "loss": 0.3371, + "step": 4439 + }, + { + "epoch": 4.733475479744136, + "grad_norm": 0.06604287763417778, + "learning_rate": 6.904395953308784e-07, + "loss": 0.328, + "step": 4440 + }, + { + "epoch": 4.73454157782516, + "grad_norm": 0.06405394013935223, + "learning_rate": 6.849429121596007e-07, + "loss": 0.3346, + "step": 4441 + }, + { + "epoch": 4.735607675906183, + "grad_norm": 0.06528070970380216, + "learning_rate": 6.794680074866833e-07, + "loss": 0.3419, + "step": 4442 + }, + { + "epoch": 4.7366737739872065, + "grad_norm": 0.06512815748529073, + "learning_rate": 6.740148843449401e-07, + "loss": 0.3331, + "step": 4443 + }, + { + "epoch": 4.73773987206823, + "grad_norm": 0.06646695491242108, + "learning_rate": 6.685835457551281e-07, + "loss": 0.3346, + "step": 4444 + }, + { + "epoch": 4.7388059701492535, + "grad_norm": 0.0672561173174234, + "learning_rate": 6.631739947259075e-07, + "loss": 0.3368, + "step": 4445 + }, + { + "epoch": 4.7398720682302775, + "grad_norm": 0.06724341190669111, + "learning_rate": 6.577862342539032e-07, + "loss": 0.3291, + "step": 4446 + }, + { + "epoch": 4.740938166311301, + "grad_norm": 0.06595018864181441, + "learning_rate": 6.524202673236524e-07, + "loss": 0.3335, + "step": 4447 + }, + { + "epoch": 4.742004264392325, + "grad_norm": 0.06439992810416004, + "learning_rate": 6.470760969076173e-07, + "loss": 0.3371, + "step": 4448 + }, + { + "epoch": 4.743070362473348, + "grad_norm": 0.06515103944575167, + "learning_rate": 6.417537259661899e-07, + "loss": 0.3392, + "step": 4449 + }, + { + "epoch": 4.744136460554371, + "grad_norm": 0.06518098968521178, + "learning_rate": 6.364531574476962e-07, + "loss": 0.3263, + "step": 4450 + }, + { + "epoch": 4.745202558635395, + "grad_norm": 0.06553602384394039, + "learning_rate": 6.311743942883652e-07, + "loss": 0.3319, + "step": 4451 + }, + { + "epoch": 4.746268656716418, + "grad_norm": 0.06646799447298098, + "learning_rate": 6.259174394123602e-07, + "loss": 0.3287, + "step": 4452 + }, + { + "epoch": 4.747334754797441, + "grad_norm": 0.06614233529232587, + "learning_rate": 6.206822957317693e-07, + "loss": 0.3309, + "step": 4453 + }, + { + "epoch": 4.748400852878465, + "grad_norm": 0.06540305031790555, + "learning_rate": 6.154689661465752e-07, + "loss": 0.3389, + "step": 4454 + }, + { + "epoch": 4.749466950959488, + "grad_norm": 0.06512815324295504, + "learning_rate": 6.102774535447031e-07, + "loss": 0.3341, + "step": 4455 + }, + { + "epoch": 4.750533049040512, + "grad_norm": 0.06377242318608925, + "learning_rate": 6.051077608019773e-07, + "loss": 0.3335, + "step": 4456 + }, + { + "epoch": 4.751599147121535, + "grad_norm": 0.06507854548241519, + "learning_rate": 5.99959890782138e-07, + "loss": 0.3292, + "step": 4457 + }, + { + "epoch": 4.752665245202559, + "grad_norm": 0.06569932677501229, + "learning_rate": 5.948338463368419e-07, + "loss": 0.3407, + "step": 4458 + }, + { + "epoch": 4.753731343283582, + "grad_norm": 0.06613270042766406, + "learning_rate": 5.897296303056444e-07, + "loss": 0.3418, + "step": 4459 + }, + { + "epoch": 4.754797441364605, + "grad_norm": 0.06517187632853641, + "learning_rate": 5.846472455160213e-07, + "loss": 0.3361, + "step": 4460 + }, + { + "epoch": 4.755863539445629, + "grad_norm": 0.06689709594911152, + "learning_rate": 5.795866947833472e-07, + "loss": 0.3365, + "step": 4461 + }, + { + "epoch": 4.756929637526652, + "grad_norm": 0.06666505765328634, + "learning_rate": 5.745479809109045e-07, + "loss": 0.3306, + "step": 4462 + }, + { + "epoch": 4.757995735607675, + "grad_norm": 0.06646312328818217, + "learning_rate": 5.695311066898779e-07, + "loss": 0.3335, + "step": 4463 + }, + { + "epoch": 4.759061833688699, + "grad_norm": 0.06785871586907363, + "learning_rate": 5.6453607489936e-07, + "loss": 0.3364, + "step": 4464 + }, + { + "epoch": 4.7601279317697225, + "grad_norm": 0.06542533321108031, + "learning_rate": 5.595628883063331e-07, + "loss": 0.3355, + "step": 4465 + }, + { + "epoch": 4.7611940298507465, + "grad_norm": 0.06297758066267968, + "learning_rate": 5.546115496656867e-07, + "loss": 0.3383, + "step": 4466 + }, + { + "epoch": 4.76226012793177, + "grad_norm": 0.065278353186685, + "learning_rate": 5.496820617202047e-07, + "loss": 0.3349, + "step": 4467 + }, + { + "epoch": 4.7633262260127935, + "grad_norm": 0.06538720202179056, + "learning_rate": 5.447744272005695e-07, + "loss": 0.3331, + "step": 4468 + }, + { + "epoch": 4.764392324093817, + "grad_norm": 0.06718790884542368, + "learning_rate": 5.398886488253485e-07, + "loss": 0.3379, + "step": 4469 + }, + { + "epoch": 4.76545842217484, + "grad_norm": 0.06449471524894189, + "learning_rate": 5.350247293010169e-07, + "loss": 0.3349, + "step": 4470 + }, + { + "epoch": 4.766524520255864, + "grad_norm": 0.06433921241405373, + "learning_rate": 5.301826713219305e-07, + "loss": 0.3319, + "step": 4471 + }, + { + "epoch": 4.767590618336887, + "grad_norm": 0.06817594418030926, + "learning_rate": 5.253624775703347e-07, + "loss": 0.3361, + "step": 4472 + }, + { + "epoch": 4.768656716417911, + "grad_norm": 0.06600839287904284, + "learning_rate": 5.205641507163694e-07, + "loss": 0.3343, + "step": 4473 + }, + { + "epoch": 4.769722814498934, + "grad_norm": 0.06616379620806581, + "learning_rate": 5.157876934180551e-07, + "loss": 0.3318, + "step": 4474 + }, + { + "epoch": 4.770788912579958, + "grad_norm": 0.06606131200753886, + "learning_rate": 5.110331083213105e-07, + "loss": 0.3376, + "step": 4475 + }, + { + "epoch": 4.771855010660981, + "grad_norm": 0.06329917860131302, + "learning_rate": 5.063003980599179e-07, + "loss": 0.3366, + "step": 4476 + }, + { + "epoch": 4.772921108742004, + "grad_norm": 0.06481792763454347, + "learning_rate": 5.01589565255558e-07, + "loss": 0.3394, + "step": 4477 + }, + { + "epoch": 4.773987206823028, + "grad_norm": 0.0687947416522106, + "learning_rate": 4.969006125177834e-07, + "loss": 0.3368, + "step": 4478 + }, + { + "epoch": 4.775053304904051, + "grad_norm": 0.06404270869284907, + "learning_rate": 4.922335424440361e-07, + "loss": 0.3323, + "step": 4479 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.06757244081482576, + "learning_rate": 4.875883576196261e-07, + "loss": 0.3367, + "step": 4480 + }, + { + "epoch": 4.777185501066098, + "grad_norm": 0.06305226854333544, + "learning_rate": 4.829650606177438e-07, + "loss": 0.3345, + "step": 4481 + }, + { + "epoch": 4.778251599147121, + "grad_norm": 0.06783144251481625, + "learning_rate": 4.783636539994607e-07, + "loss": 0.3381, + "step": 4482 + }, + { + "epoch": 4.779317697228145, + "grad_norm": 0.07112258236166988, + "learning_rate": 4.737841403137067e-07, + "loss": 0.3319, + "step": 4483 + }, + { + "epoch": 4.780383795309168, + "grad_norm": 0.06778618377309222, + "learning_rate": 4.692265220973058e-07, + "loss": 0.3368, + "step": 4484 + }, + { + "epoch": 4.781449893390192, + "grad_norm": 0.061819307755361846, + "learning_rate": 4.6469080187493634e-07, + "loss": 0.3305, + "step": 4485 + }, + { + "epoch": 4.782515991471215, + "grad_norm": 0.06472343334221725, + "learning_rate": 4.601769821591529e-07, + "loss": 0.3301, + "step": 4486 + }, + { + "epoch": 4.7835820895522385, + "grad_norm": 0.06769820854581794, + "learning_rate": 4.5568506545037305e-07, + "loss": 0.3363, + "step": 4487 + }, + { + "epoch": 4.7846481876332625, + "grad_norm": 0.06599930627838296, + "learning_rate": 4.512150542368909e-07, + "loss": 0.3344, + "step": 4488 + }, + { + "epoch": 4.785714285714286, + "grad_norm": 0.06669160582080108, + "learning_rate": 4.467669509948591e-07, + "loss": 0.335, + "step": 4489 + }, + { + "epoch": 4.786780383795309, + "grad_norm": 0.06657029869145847, + "learning_rate": 4.423407581882932e-07, + "loss": 0.3318, + "step": 4490 + }, + { + "epoch": 4.787846481876333, + "grad_norm": 0.06875440174953255, + "learning_rate": 4.3793647826907203e-07, + "loss": 0.3301, + "step": 4491 + }, + { + "epoch": 4.788912579957356, + "grad_norm": 0.06843506786311358, + "learning_rate": 4.3355411367694164e-07, + "loss": 0.3377, + "step": 4492 + }, + { + "epoch": 4.78997867803838, + "grad_norm": 0.06459340857709406, + "learning_rate": 4.2919366683951135e-07, + "loss": 0.3336, + "step": 4493 + }, + { + "epoch": 4.791044776119403, + "grad_norm": 0.06386692441739224, + "learning_rate": 4.2485514017222674e-07, + "loss": 0.333, + "step": 4494 + }, + { + "epoch": 4.792110874200427, + "grad_norm": 0.06649809538469802, + "learning_rate": 4.205385360784142e-07, + "loss": 0.337, + "step": 4495 + }, + { + "epoch": 4.79317697228145, + "grad_norm": 0.06494463953807365, + "learning_rate": 4.162438569492455e-07, + "loss": 0.3381, + "step": 4496 + }, + { + "epoch": 4.794243070362473, + "grad_norm": 0.06443760229121664, + "learning_rate": 4.119711051637554e-07, + "loss": 0.331, + "step": 4497 + }, + { + "epoch": 4.795309168443497, + "grad_norm": 0.06694814400064329, + "learning_rate": 4.077202830888238e-07, + "loss": 0.3366, + "step": 4498 + }, + { + "epoch": 4.79637526652452, + "grad_norm": 0.061826241061163335, + "learning_rate": 4.0349139307918063e-07, + "loss": 0.3229, + "step": 4499 + }, + { + "epoch": 4.797441364605544, + "grad_norm": 0.06413189534977501, + "learning_rate": 3.992844374774141e-07, + "loss": 0.3357, + "step": 4500 + }, + { + "epoch": 4.798507462686567, + "grad_norm": 0.06301453498466782, + "learning_rate": 3.950994186139623e-07, + "loss": 0.3262, + "step": 4501 + }, + { + "epoch": 4.79957356076759, + "grad_norm": 0.0669025986464228, + "learning_rate": 3.909363388071041e-07, + "loss": 0.3333, + "step": 4502 + }, + { + "epoch": 4.800639658848614, + "grad_norm": 0.06727116449771486, + "learning_rate": 3.8679520036296823e-07, + "loss": 0.3285, + "step": 4503 + }, + { + "epoch": 4.801705756929637, + "grad_norm": 0.06423952902920181, + "learning_rate": 3.826760055755374e-07, + "loss": 0.3298, + "step": 4504 + }, + { + "epoch": 4.802771855010661, + "grad_norm": 0.06287554539652727, + "learning_rate": 3.785787567266219e-07, + "loss": 0.3314, + "step": 4505 + }, + { + "epoch": 4.803837953091684, + "grad_norm": 0.06428325628977336, + "learning_rate": 3.745034560858907e-07, + "loss": 0.3374, + "step": 4506 + }, + { + "epoch": 4.8049040511727075, + "grad_norm": 0.0615240827531194, + "learning_rate": 3.7045010591084893e-07, + "loss": 0.3346, + "step": 4507 + }, + { + "epoch": 4.8059701492537314, + "grad_norm": 0.06350727088776514, + "learning_rate": 3.66418708446834e-07, + "loss": 0.331, + "step": 4508 + }, + { + "epoch": 4.8070362473347545, + "grad_norm": 0.06394487503770653, + "learning_rate": 3.6240926592704173e-07, + "loss": 0.3281, + "step": 4509 + }, + { + "epoch": 4.8081023454157785, + "grad_norm": 0.06562664075180806, + "learning_rate": 3.58421780572491e-07, + "loss": 0.3364, + "step": 4510 + }, + { + "epoch": 4.809168443496802, + "grad_norm": 0.0641531508096308, + "learning_rate": 3.5445625459203715e-07, + "loss": 0.3311, + "step": 4511 + }, + { + "epoch": 4.810234541577826, + "grad_norm": 0.06317633180319338, + "learning_rate": 3.5051269018238075e-07, + "loss": 0.3327, + "step": 4512 + }, + { + "epoch": 4.811300639658849, + "grad_norm": 0.06640788788183018, + "learning_rate": 3.4659108952805e-07, + "loss": 0.3339, + "step": 4513 + }, + { + "epoch": 4.812366737739872, + "grad_norm": 0.06476401703214942, + "learning_rate": 3.4269145480140486e-07, + "loss": 0.3283, + "step": 4514 + }, + { + "epoch": 4.813432835820896, + "grad_norm": 0.06632322608724953, + "learning_rate": 3.388137881626419e-07, + "loss": 0.3333, + "step": 4515 + }, + { + "epoch": 4.814498933901919, + "grad_norm": 0.06584746139018403, + "learning_rate": 3.3495809175978944e-07, + "loss": 0.3341, + "step": 4516 + }, + { + "epoch": 4.815565031982942, + "grad_norm": 0.06225839649171285, + "learning_rate": 3.311243677287035e-07, + "loss": 0.339, + "step": 4517 + }, + { + "epoch": 4.816631130063966, + "grad_norm": 0.06266507769908893, + "learning_rate": 3.2731261819306306e-07, + "loss": 0.3338, + "step": 4518 + }, + { + "epoch": 4.817697228144989, + "grad_norm": 0.06361144124581149, + "learning_rate": 3.2352284526438347e-07, + "loss": 0.3262, + "step": 4519 + }, + { + "epoch": 4.818763326226013, + "grad_norm": 0.06657050940016936, + "learning_rate": 3.1975505104199446e-07, + "loss": 0.3347, + "step": 4520 + }, + { + "epoch": 4.819829424307036, + "grad_norm": 0.06336689637535396, + "learning_rate": 3.1600923761307077e-07, + "loss": 0.3314, + "step": 4521 + }, + { + "epoch": 4.82089552238806, + "grad_norm": 0.06697036655954806, + "learning_rate": 3.1228540705258827e-07, + "loss": 0.3373, + "step": 4522 + }, + { + "epoch": 4.821961620469083, + "grad_norm": 0.06669058010931536, + "learning_rate": 3.08583561423359e-07, + "loss": 0.3333, + "step": 4523 + }, + { + "epoch": 4.823027718550106, + "grad_norm": 0.06204857673664051, + "learning_rate": 3.0490370277601375e-07, + "loss": 0.3305, + "step": 4524 + }, + { + "epoch": 4.82409381663113, + "grad_norm": 0.06414424868604003, + "learning_rate": 3.0124583314900204e-07, + "loss": 0.3285, + "step": 4525 + }, + { + "epoch": 4.825159914712153, + "grad_norm": 0.06489342987918388, + "learning_rate": 2.976099545685962e-07, + "loss": 0.3389, + "step": 4526 + }, + { + "epoch": 4.826226012793177, + "grad_norm": 0.06392477993982196, + "learning_rate": 2.9399606904887856e-07, + "loss": 0.3301, + "step": 4527 + }, + { + "epoch": 4.8272921108742, + "grad_norm": 0.06216530530535471, + "learning_rate": 2.9040417859175884e-07, + "loss": 0.3278, + "step": 4528 + }, + { + "epoch": 4.8283582089552235, + "grad_norm": 0.06907666472180728, + "learning_rate": 2.8683428518695654e-07, + "loss": 0.3462, + "step": 4529 + }, + { + "epoch": 4.8294243070362475, + "grad_norm": 0.06389830168648562, + "learning_rate": 2.832863908120009e-07, + "loss": 0.3316, + "step": 4530 + }, + { + "epoch": 4.830490405117271, + "grad_norm": 0.0659969650902936, + "learning_rate": 2.7976049743224876e-07, + "loss": 0.334, + "step": 4531 + }, + { + "epoch": 4.8315565031982945, + "grad_norm": 0.06337199459457302, + "learning_rate": 2.762566070008621e-07, + "loss": 0.3379, + "step": 4532 + }, + { + "epoch": 4.832622601279318, + "grad_norm": 0.06123098557327691, + "learning_rate": 2.7277472145880837e-07, + "loss": 0.3337, + "step": 4533 + }, + { + "epoch": 4.833688699360341, + "grad_norm": 0.06421391981205138, + "learning_rate": 2.6931484273487796e-07, + "loss": 0.3351, + "step": 4534 + }, + { + "epoch": 4.834754797441365, + "grad_norm": 0.06631306221864534, + "learning_rate": 2.658769727456534e-07, + "loss": 0.3395, + "step": 4535 + }, + { + "epoch": 4.835820895522388, + "grad_norm": 0.06390409569316938, + "learning_rate": 2.6246111339554903e-07, + "loss": 0.335, + "step": 4536 + }, + { + "epoch": 4.836886993603412, + "grad_norm": 0.06255999806490187, + "learning_rate": 2.5906726657676685e-07, + "loss": 0.3371, + "step": 4537 + }, + { + "epoch": 4.837953091684435, + "grad_norm": 0.0638021517251054, + "learning_rate": 2.556954341693185e-07, + "loss": 0.3324, + "step": 4538 + }, + { + "epoch": 4.839019189765459, + "grad_norm": 0.06516423398474619, + "learning_rate": 2.5234561804102556e-07, + "loss": 0.3363, + "step": 4539 + }, + { + "epoch": 4.840085287846482, + "grad_norm": 0.06196390275858242, + "learning_rate": 2.4901782004751905e-07, + "loss": 0.3265, + "step": 4540 + }, + { + "epoch": 4.841151385927505, + "grad_norm": 0.06281847969805605, + "learning_rate": 2.457120420322179e-07, + "loss": 0.3373, + "step": 4541 + }, + { + "epoch": 4.842217484008529, + "grad_norm": 0.06560695196791491, + "learning_rate": 2.424282858263549e-07, + "loss": 0.3375, + "step": 4542 + }, + { + "epoch": 4.843283582089552, + "grad_norm": 0.06533330528579777, + "learning_rate": 2.3916655324895953e-07, + "loss": 0.3293, + "step": 4543 + }, + { + "epoch": 4.844349680170575, + "grad_norm": 0.06480021279499126, + "learning_rate": 2.35926846106862e-07, + "loss": 0.3355, + "step": 4544 + }, + { + "epoch": 4.845415778251599, + "grad_norm": 0.06287240019527855, + "learning_rate": 2.3270916619469342e-07, + "loss": 0.3364, + "step": 4545 + }, + { + "epoch": 4.846481876332622, + "grad_norm": 0.06700125191000252, + "learning_rate": 2.2951351529487685e-07, + "loss": 0.338, + "step": 4546 + }, + { + "epoch": 4.847547974413646, + "grad_norm": 0.06356697255958557, + "learning_rate": 2.2633989517764522e-07, + "loss": 0.335, + "step": 4547 + }, + { + "epoch": 4.848614072494669, + "grad_norm": 0.06622567910030012, + "learning_rate": 2.2318830760100995e-07, + "loss": 0.3398, + "step": 4548 + }, + { + "epoch": 4.849680170575693, + "grad_norm": 0.06340820773787822, + "learning_rate": 2.2005875431078794e-07, + "loss": 0.3361, + "step": 4549 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.06467319645032904, + "learning_rate": 2.169512370405924e-07, + "loss": 0.3362, + "step": 4550 + }, + { + "epoch": 4.8518123667377395, + "grad_norm": 0.06494652144642511, + "learning_rate": 2.138657575118286e-07, + "loss": 0.334, + "step": 4551 + }, + { + "epoch": 4.8528784648187635, + "grad_norm": 0.06344916494239451, + "learning_rate": 2.1080231743368928e-07, + "loss": 0.3261, + "step": 4552 + }, + { + "epoch": 4.853944562899787, + "grad_norm": 0.06552581044666182, + "learning_rate": 2.0776091850315483e-07, + "loss": 0.3338, + "step": 4553 + }, + { + "epoch": 4.855010660980811, + "grad_norm": 0.06182298175401099, + "learning_rate": 2.0474156240501086e-07, + "loss": 0.3335, + "step": 4554 + }, + { + "epoch": 4.856076759061834, + "grad_norm": 0.06542556498649621, + "learning_rate": 2.0174425081182615e-07, + "loss": 0.3364, + "step": 4555 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 0.06371796771439006, + "learning_rate": 1.9876898538394362e-07, + "loss": 0.3353, + "step": 4556 + }, + { + "epoch": 4.858208955223881, + "grad_norm": 0.06396421709422397, + "learning_rate": 1.9581576776951605e-07, + "loss": 0.3295, + "step": 4557 + }, + { + "epoch": 4.859275053304904, + "grad_norm": 0.06288608503660843, + "learning_rate": 1.9288459960446592e-07, + "loss": 0.3292, + "step": 4558 + }, + { + "epoch": 4.860341151385928, + "grad_norm": 0.06363937784876181, + "learning_rate": 1.899754825125122e-07, + "loss": 0.3366, + "step": 4559 + }, + { + "epoch": 4.861407249466951, + "grad_norm": 0.06336696226394795, + "learning_rate": 1.8708841810515244e-07, + "loss": 0.3274, + "step": 4560 + }, + { + "epoch": 4.862473347547974, + "grad_norm": 0.0635413388603882, + "learning_rate": 1.8422340798167181e-07, + "loss": 0.3359, + "step": 4561 + }, + { + "epoch": 4.863539445628998, + "grad_norm": 0.06482344768083516, + "learning_rate": 1.8138045372913414e-07, + "loss": 0.3387, + "step": 4562 + }, + { + "epoch": 4.864605543710021, + "grad_norm": 0.06543147651304172, + "learning_rate": 1.7855955692239525e-07, + "loss": 0.3369, + "step": 4563 + }, + { + "epoch": 4.865671641791045, + "grad_norm": 0.0634061615569733, + "learning_rate": 1.757607191240762e-07, + "loss": 0.3337, + "step": 4564 + }, + { + "epoch": 4.866737739872068, + "grad_norm": 0.06497196758674298, + "learning_rate": 1.7298394188459466e-07, + "loss": 0.3337, + "step": 4565 + }, + { + "epoch": 4.867803837953092, + "grad_norm": 0.0643026256592065, + "learning_rate": 1.7022922674213793e-07, + "loss": 0.3358, + "step": 4566 + }, + { + "epoch": 4.868869936034115, + "grad_norm": 0.06380088154716912, + "learning_rate": 1.6749657522267205e-07, + "loss": 0.3323, + "step": 4567 + }, + { + "epoch": 4.869936034115138, + "grad_norm": 0.062292059451068596, + "learning_rate": 1.6478598883995057e-07, + "loss": 0.3297, + "step": 4568 + }, + { + "epoch": 4.871002132196162, + "grad_norm": 0.06429186614170503, + "learning_rate": 1.620974690954924e-07, + "loss": 0.3318, + "step": 4569 + }, + { + "epoch": 4.872068230277185, + "grad_norm": 0.061982887675499386, + "learning_rate": 1.59431017478604e-07, + "loss": 0.3376, + "step": 4570 + }, + { + "epoch": 4.8731343283582085, + "grad_norm": 0.0636117943991852, + "learning_rate": 1.5678663546634832e-07, + "loss": 0.3342, + "step": 4571 + }, + { + "epoch": 4.8742004264392325, + "grad_norm": 0.06390306822281343, + "learning_rate": 1.541643245235891e-07, + "loss": 0.3332, + "step": 4572 + }, + { + "epoch": 4.8752665245202556, + "grad_norm": 0.06414487391507678, + "learning_rate": 1.515640861029466e-07, + "loss": 0.3319, + "step": 4573 + }, + { + "epoch": 4.8763326226012795, + "grad_norm": 0.06234839482693829, + "learning_rate": 1.4898592164481528e-07, + "loss": 0.3345, + "step": 4574 + }, + { + "epoch": 4.877398720682303, + "grad_norm": 0.06282579096563232, + "learning_rate": 1.4642983257736388e-07, + "loss": 0.3318, + "step": 4575 + }, + { + "epoch": 4.878464818763327, + "grad_norm": 0.06413128319015131, + "learning_rate": 1.4389582031653525e-07, + "loss": 0.3288, + "step": 4576 + }, + { + "epoch": 4.87953091684435, + "grad_norm": 0.061604350840344914, + "learning_rate": 1.4138388626603772e-07, + "loss": 0.3293, + "step": 4577 + }, + { + "epoch": 4.880597014925373, + "grad_norm": 0.06492239286709452, + "learning_rate": 1.388940318173537e-07, + "loss": 0.3359, + "step": 4578 + }, + { + "epoch": 4.881663113006397, + "grad_norm": 0.06750216444127496, + "learning_rate": 1.3642625834973555e-07, + "loss": 0.3319, + "step": 4579 + }, + { + "epoch": 4.88272921108742, + "grad_norm": 0.06551862088137983, + "learning_rate": 1.3398056723019638e-07, + "loss": 0.3371, + "step": 4580 + }, + { + "epoch": 4.883795309168444, + "grad_norm": 0.06303973420478017, + "learning_rate": 1.3155695981352356e-07, + "loss": 0.3339, + "step": 4581 + }, + { + "epoch": 4.884861407249467, + "grad_norm": 0.06431600254532914, + "learning_rate": 1.2915543744227433e-07, + "loss": 0.3387, + "step": 4582 + }, + { + "epoch": 4.88592750533049, + "grad_norm": 0.06224741808886467, + "learning_rate": 1.267760014467667e-07, + "loss": 0.3303, + "step": 4583 + }, + { + "epoch": 4.886993603411514, + "grad_norm": 0.0686255077912375, + "learning_rate": 1.2441865314507529e-07, + "loss": 0.3315, + "step": 4584 + }, + { + "epoch": 4.888059701492537, + "grad_norm": 0.06375747762463076, + "learning_rate": 1.220833938430621e-07, + "loss": 0.3329, + "step": 4585 + }, + { + "epoch": 4.889125799573561, + "grad_norm": 0.06360467146598325, + "learning_rate": 1.1977022483432355e-07, + "loss": 0.3304, + "step": 4586 + }, + { + "epoch": 4.890191897654584, + "grad_norm": 0.06371846108482049, + "learning_rate": 1.1747914740025235e-07, + "loss": 0.3316, + "step": 4587 + }, + { + "epoch": 4.891257995735607, + "grad_norm": 0.06123471557161623, + "learning_rate": 1.152101628099711e-07, + "loss": 0.333, + "step": 4588 + }, + { + "epoch": 4.892324093816631, + "grad_norm": 0.0653623572630271, + "learning_rate": 1.1296327232038995e-07, + "loss": 0.3383, + "step": 4589 + }, + { + "epoch": 4.893390191897654, + "grad_norm": 0.06317084300404086, + "learning_rate": 1.1073847717616659e-07, + "loss": 0.3355, + "step": 4590 + }, + { + "epoch": 4.894456289978678, + "grad_norm": 0.06436959516579645, + "learning_rate": 1.0853577860971965e-07, + "loss": 0.3375, + "step": 4591 + }, + { + "epoch": 4.895522388059701, + "grad_norm": 0.06436654116700972, + "learning_rate": 1.0635517784122862e-07, + "loss": 0.3361, + "step": 4592 + }, + { + "epoch": 4.896588486140725, + "grad_norm": 0.0638870216071855, + "learning_rate": 1.0419667607863393e-07, + "loss": 0.3359, + "step": 4593 + }, + { + "epoch": 4.8976545842217485, + "grad_norm": 0.06402271945130204, + "learning_rate": 1.0206027451764133e-07, + "loss": 0.3365, + "step": 4594 + }, + { + "epoch": 4.898720682302772, + "grad_norm": 0.06360664197572094, + "learning_rate": 9.994597434169529e-08, + "loss": 0.3333, + "step": 4595 + }, + { + "epoch": 4.899786780383796, + "grad_norm": 0.06298163310834395, + "learning_rate": 9.785377672201002e-08, + "loss": 0.3304, + "step": 4596 + }, + { + "epoch": 4.900852878464819, + "grad_norm": 0.06222784082540263, + "learning_rate": 9.578368281756068e-08, + "loss": 0.3307, + "step": 4597 + }, + { + "epoch": 4.901918976545842, + "grad_norm": 0.06603440394113538, + "learning_rate": 9.373569377506553e-08, + "loss": 0.3351, + "step": 4598 + }, + { + "epoch": 4.902985074626866, + "grad_norm": 0.061308075618675476, + "learning_rate": 9.17098107290082e-08, + "loss": 0.3373, + "step": 4599 + }, + { + "epoch": 4.904051172707889, + "grad_norm": 0.06278471458473436, + "learning_rate": 8.970603480161988e-08, + "loss": 0.3301, + "step": 4600 + }, + { + "epoch": 4.905117270788913, + "grad_norm": 0.0626450228202153, + "learning_rate": 8.772436710288822e-08, + "loss": 0.3327, + "step": 4601 + }, + { + "epoch": 4.906183368869936, + "grad_norm": 0.06235194061204951, + "learning_rate": 8.576480873055737e-08, + "loss": 0.3337, + "step": 4602 + }, + { + "epoch": 4.90724946695096, + "grad_norm": 0.06548793549407254, + "learning_rate": 8.382736077011899e-08, + "loss": 0.3319, + "step": 4603 + }, + { + "epoch": 4.908315565031983, + "grad_norm": 0.06161345936082664, + "learning_rate": 8.191202429481681e-08, + "loss": 0.3343, + "step": 4604 + }, + { + "epoch": 4.909381663113006, + "grad_norm": 0.06227151592088014, + "learning_rate": 8.001880036565102e-08, + "loss": 0.3299, + "step": 4605 + }, + { + "epoch": 4.91044776119403, + "grad_norm": 0.0628595755152068, + "learning_rate": 7.814769003136491e-08, + "loss": 0.335, + "step": 4606 + }, + { + "epoch": 4.911513859275053, + "grad_norm": 0.06133289029888643, + "learning_rate": 7.629869432845827e-08, + "loss": 0.3327, + "step": 4607 + }, + { + "epoch": 4.912579957356077, + "grad_norm": 0.06399713009061905, + "learning_rate": 7.447181428117844e-08, + "loss": 0.3368, + "step": 4608 + }, + { + "epoch": 4.9136460554371, + "grad_norm": 0.06471533837047432, + "learning_rate": 7.266705090152482e-08, + "loss": 0.3371, + "step": 4609 + }, + { + "epoch": 4.914712153518123, + "grad_norm": 0.0631935092408263, + "learning_rate": 7.08844051892399e-08, + "loss": 0.3346, + "step": 4610 + }, + { + "epoch": 4.915778251599147, + "grad_norm": 0.06449225156722846, + "learning_rate": 6.912387813181376e-08, + "loss": 0.3322, + "step": 4611 + }, + { + "epoch": 4.91684434968017, + "grad_norm": 0.06188700970750854, + "learning_rate": 6.738547070449297e-08, + "loss": 0.3325, + "step": 4612 + }, + { + "epoch": 4.917910447761194, + "grad_norm": 0.06443747535001423, + "learning_rate": 6.566918387026278e-08, + "loss": 0.335, + "step": 4613 + }, + { + "epoch": 4.918976545842217, + "grad_norm": 0.06527583600959296, + "learning_rate": 6.397501857985599e-08, + "loss": 0.3299, + "step": 4614 + }, + { + "epoch": 4.9200426439232405, + "grad_norm": 0.06254955888214284, + "learning_rate": 6.230297577175304e-08, + "loss": 0.3318, + "step": 4615 + }, + { + "epoch": 4.9211087420042645, + "grad_norm": 0.06654203572641401, + "learning_rate": 6.065305637218188e-08, + "loss": 0.3377, + "step": 4616 + }, + { + "epoch": 4.922174840085288, + "grad_norm": 0.06615365835912221, + "learning_rate": 5.902526129510477e-08, + "loss": 0.3324, + "step": 4617 + }, + { + "epoch": 4.923240938166312, + "grad_norm": 0.06358574578582833, + "learning_rate": 5.741959144223597e-08, + "loss": 0.3405, + "step": 4618 + }, + { + "epoch": 4.924307036247335, + "grad_norm": 0.06592424042080675, + "learning_rate": 5.583604770304174e-08, + "loss": 0.3329, + "step": 4619 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.06489809675701481, + "learning_rate": 5.427463095471375e-08, + "loss": 0.3331, + "step": 4620 + }, + { + "epoch": 4.926439232409382, + "grad_norm": 0.06346784271705076, + "learning_rate": 5.27353420622001e-08, + "loss": 0.3378, + "step": 4621 + }, + { + "epoch": 4.927505330490405, + "grad_norm": 0.06492417082655748, + "learning_rate": 5.121818187818761e-08, + "loss": 0.3352, + "step": 4622 + }, + { + "epoch": 4.928571428571429, + "grad_norm": 0.06263743897374244, + "learning_rate": 4.9723151243106225e-08, + "loss": 0.3375, + "step": 4623 + }, + { + "epoch": 4.929637526652452, + "grad_norm": 0.06117815118487387, + "learning_rate": 4.825025098512015e-08, + "loss": 0.3407, + "step": 4624 + }, + { + "epoch": 4.930703624733475, + "grad_norm": 0.06245193201459552, + "learning_rate": 4.679948192013673e-08, + "loss": 0.3279, + "step": 4625 + }, + { + "epoch": 4.931769722814499, + "grad_norm": 0.06712207626671726, + "learning_rate": 4.537084485181531e-08, + "loss": 0.3347, + "step": 4626 + }, + { + "epoch": 4.932835820895522, + "grad_norm": 0.0637857168923382, + "learning_rate": 4.396434057154508e-08, + "loss": 0.3303, + "step": 4627 + }, + { + "epoch": 4.933901918976546, + "grad_norm": 0.06368341666107143, + "learning_rate": 4.257996985844948e-08, + "loss": 0.3293, + "step": 4628 + }, + { + "epoch": 4.934968017057569, + "grad_norm": 0.06745977801668324, + "learning_rate": 4.121773347940394e-08, + "loss": 0.3329, + "step": 4629 + }, + { + "epoch": 4.936034115138593, + "grad_norm": 0.06316814202138803, + "learning_rate": 3.987763218901375e-08, + "loss": 0.3323, + "step": 4630 + }, + { + "epoch": 4.937100213219616, + "grad_norm": 0.06238635312813817, + "learning_rate": 3.85596667296273e-08, + "loss": 0.3319, + "step": 4631 + }, + { + "epoch": 4.938166311300639, + "grad_norm": 0.06453862518488947, + "learning_rate": 3.7263837831327255e-08, + "loss": 0.3316, + "step": 4632 + }, + { + "epoch": 4.939232409381663, + "grad_norm": 0.06561075891909876, + "learning_rate": 3.5990146211939416e-08, + "loss": 0.3357, + "step": 4633 + }, + { + "epoch": 4.940298507462686, + "grad_norm": 0.06243805915307683, + "learning_rate": 3.473859257701495e-08, + "loss": 0.3357, + "step": 4634 + }, + { + "epoch": 4.94136460554371, + "grad_norm": 0.06580397289868427, + "learning_rate": 3.3509177619857056e-08, + "loss": 0.3354, + "step": 4635 + }, + { + "epoch": 4.9424307036247335, + "grad_norm": 0.06077030430444124, + "learning_rate": 3.2301902021494304e-08, + "loss": 0.3247, + "step": 4636 + }, + { + "epoch": 4.943496801705757, + "grad_norm": 0.06466626231200338, + "learning_rate": 3.111676645069839e-08, + "loss": 0.3321, + "step": 4637 + }, + { + "epoch": 4.9445628997867805, + "grad_norm": 0.06068940940138413, + "learning_rate": 2.9953771563966396e-08, + "loss": 0.3288, + "step": 4638 + }, + { + "epoch": 4.945628997867804, + "grad_norm": 0.06278652639592003, + "learning_rate": 2.881291800554298e-08, + "loss": 0.3372, + "step": 4639 + }, + { + "epoch": 4.946695095948828, + "grad_norm": 0.061309406774551704, + "learning_rate": 2.769420640739817e-08, + "loss": 0.3353, + "step": 4640 + }, + { + "epoch": 4.947761194029851, + "grad_norm": 0.06281507165537796, + "learning_rate": 2.6597637389240704e-08, + "loss": 0.3321, + "step": 4641 + }, + { + "epoch": 4.948827292110874, + "grad_norm": 0.061550308085559016, + "learning_rate": 2.552321155851356e-08, + "loss": 0.3325, + "step": 4642 + }, + { + "epoch": 4.949893390191898, + "grad_norm": 0.06375020106010763, + "learning_rate": 2.447092951038954e-08, + "loss": 0.3315, + "step": 4643 + }, + { + "epoch": 4.950959488272921, + "grad_norm": 0.06552775876706644, + "learning_rate": 2.344079182778458e-08, + "loss": 0.3445, + "step": 4644 + }, + { + "epoch": 4.952025586353945, + "grad_norm": 0.06809372509479077, + "learning_rate": 2.2432799081339997e-08, + "loss": 0.3325, + "step": 4645 + }, + { + "epoch": 4.953091684434968, + "grad_norm": 0.06352213483924132, + "learning_rate": 2.1446951829422468e-08, + "loss": 0.3363, + "step": 4646 + }, + { + "epoch": 4.954157782515992, + "grad_norm": 0.0628161589783622, + "learning_rate": 2.0483250618150708e-08, + "loss": 0.334, + "step": 4647 + }, + { + "epoch": 4.955223880597015, + "grad_norm": 0.06170202655763391, + "learning_rate": 1.954169598136435e-08, + "loss": 0.3379, + "step": 4648 + }, + { + "epoch": 4.956289978678038, + "grad_norm": 0.06284704035908611, + "learning_rate": 1.862228844062841e-08, + "loss": 0.334, + "step": 4649 + }, + { + "epoch": 4.957356076759062, + "grad_norm": 0.06343922078643444, + "learning_rate": 1.772502850525548e-08, + "loss": 0.3358, + "step": 4650 + }, + { + "epoch": 4.958422174840085, + "grad_norm": 0.06367103465728541, + "learning_rate": 1.6849916672270206e-08, + "loss": 0.3362, + "step": 4651 + }, + { + "epoch": 4.959488272921108, + "grad_norm": 0.06339160396502194, + "learning_rate": 1.5996953426449245e-08, + "loss": 0.3362, + "step": 4652 + }, + { + "epoch": 4.960554371002132, + "grad_norm": 0.06146054352349314, + "learning_rate": 1.51661392402902e-08, + "loss": 0.3327, + "step": 4653 + }, + { + "epoch": 4.961620469083155, + "grad_norm": 0.0644259680452347, + "learning_rate": 1.4357474574011598e-08, + "loss": 0.3275, + "step": 4654 + }, + { + "epoch": 4.962686567164179, + "grad_norm": 0.06489614895198179, + "learning_rate": 1.3570959875579547e-08, + "loss": 0.3354, + "step": 4655 + }, + { + "epoch": 4.963752665245202, + "grad_norm": 0.061701893851838184, + "learning_rate": 1.2806595580676651e-08, + "loss": 0.3296, + "step": 4656 + }, + { + "epoch": 4.964818763326226, + "grad_norm": 0.06305988708818967, + "learning_rate": 1.2064382112728646e-08, + "loss": 0.3271, + "step": 4657 + }, + { + "epoch": 4.9658848614072495, + "grad_norm": 0.06467949485812308, + "learning_rate": 1.1344319882873323e-08, + "loss": 0.3373, + "step": 4658 + }, + { + "epoch": 4.966950959488273, + "grad_norm": 0.0616143550364817, + "learning_rate": 1.064640928999605e-08, + "loss": 0.3352, + "step": 4659 + }, + { + "epoch": 4.968017057569297, + "grad_norm": 0.06426893511659539, + "learning_rate": 9.970650720703135e-09, + "loss": 0.3364, + "step": 4660 + }, + { + "epoch": 4.96908315565032, + "grad_norm": 0.06068623678209842, + "learning_rate": 9.317044549321808e-09, + "loss": 0.3317, + "step": 4661 + }, + { + "epoch": 4.970149253731344, + "grad_norm": 0.06197114220318593, + "learning_rate": 8.685591137922444e-09, + "loss": 0.3286, + "step": 4662 + }, + { + "epoch": 4.971215351812367, + "grad_norm": 0.06386456846237887, + "learning_rate": 8.076290836296352e-09, + "loss": 0.3352, + "step": 4663 + }, + { + "epoch": 4.97228144989339, + "grad_norm": 0.06241553173429086, + "learning_rate": 7.489143981960211e-09, + "loss": 0.3375, + "step": 4664 + }, + { + "epoch": 4.973347547974414, + "grad_norm": 0.06569632030334341, + "learning_rate": 6.924150900169402e-09, + "loss": 0.334, + "step": 4665 + }, + { + "epoch": 4.974413646055437, + "grad_norm": 0.06246704081644333, + "learning_rate": 6.381311903900234e-09, + "loss": 0.3328, + "step": 4666 + }, + { + "epoch": 4.975479744136461, + "grad_norm": 0.06349233758317888, + "learning_rate": 5.860627293849952e-09, + "loss": 0.3371, + "step": 4667 + }, + { + "epoch": 4.976545842217484, + "grad_norm": 0.061921416439339974, + "learning_rate": 5.36209735845894e-09, + "loss": 0.3386, + "step": 4668 + }, + { + "epoch": 4.977611940298507, + "grad_norm": 0.062197087906872645, + "learning_rate": 4.885722373879631e-09, + "loss": 0.3402, + "step": 4669 + }, + { + "epoch": 4.978678038379531, + "grad_norm": 0.06266534129824461, + "learning_rate": 4.431502604003157e-09, + "loss": 0.3249, + "step": 4670 + }, + { + "epoch": 4.979744136460554, + "grad_norm": 0.062785232042293, + "learning_rate": 3.999438300446024e-09, + "loss": 0.3328, + "step": 4671 + }, + { + "epoch": 4.980810234541578, + "grad_norm": 0.06336115371955366, + "learning_rate": 3.5895297025456686e-09, + "loss": 0.3298, + "step": 4672 + }, + { + "epoch": 4.981876332622601, + "grad_norm": 0.06207536666619536, + "learning_rate": 3.2017770373737877e-09, + "loss": 0.3376, + "step": 4673 + }, + { + "epoch": 4.982942430703625, + "grad_norm": 0.06040648920327252, + "learning_rate": 2.8361805197185678e-09, + "loss": 0.3313, + "step": 4674 + }, + { + "epoch": 4.984008528784648, + "grad_norm": 0.06070989404951965, + "learning_rate": 2.4927403521068926e-09, + "loss": 0.3359, + "step": 4675 + }, + { + "epoch": 4.985074626865671, + "grad_norm": 0.061482753937327835, + "learning_rate": 2.1714567247821393e-09, + "loss": 0.3363, + "step": 4676 + }, + { + "epoch": 4.986140724946695, + "grad_norm": 0.06142287227143837, + "learning_rate": 1.872329815726381e-09, + "loss": 0.3397, + "step": 4677 + }, + { + "epoch": 4.9872068230277184, + "grad_norm": 0.06141544242318959, + "learning_rate": 1.5953597906337437e-09, + "loss": 0.3335, + "step": 4678 + }, + { + "epoch": 4.9882729211087415, + "grad_norm": 0.06367883708761624, + "learning_rate": 1.3405468029370484e-09, + "loss": 0.3371, + "step": 4679 + }, + { + "epoch": 4.9893390191897655, + "grad_norm": 0.06116208249352169, + "learning_rate": 1.1078909937811688e-09, + "loss": 0.3295, + "step": 4680 + }, + { + "epoch": 4.990405117270789, + "grad_norm": 0.06315685975189177, + "learning_rate": 8.973924920541166e-10, + "loss": 0.3355, + "step": 4681 + }, + { + "epoch": 4.991471215351813, + "grad_norm": 0.06339091373459602, + "learning_rate": 7.090514143515137e-10, + "loss": 0.3304, + "step": 4682 + }, + { + "epoch": 4.992537313432836, + "grad_norm": 0.06281856911763233, + "learning_rate": 5.428678650165609e-10, + "loss": 0.3336, + "step": 4683 + }, + { + "epoch": 4.99360341151386, + "grad_norm": 0.062428937737431266, + "learning_rate": 3.988419360956286e-10, + "loss": 0.3368, + "step": 4684 + }, + { + "epoch": 4.994669509594883, + "grad_norm": 0.063210979241412, + "learning_rate": 2.769737073737844e-10, + "loss": 0.3338, + "step": 4685 + }, + { + "epoch": 4.995735607675906, + "grad_norm": 0.06278318354034783, + "learning_rate": 1.7726324636591075e-10, + "loss": 0.3374, + "step": 4686 + }, + { + "epoch": 4.99680170575693, + "grad_norm": 0.06485031870184303, + "learning_rate": 9.971060830338275e-11, + "loss": 0.3327, + "step": 4687 + }, + { + "epoch": 4.997867803837953, + "grad_norm": 0.062487731895542534, + "learning_rate": 4.431583613850876e-11, + "loss": 0.3312, + "step": 4688 + }, + { + "epoch": 4.998933901918977, + "grad_norm": 0.06242272501094243, + "learning_rate": 1.1078960571175856e-11, + "loss": 0.3316, + "step": 4689 + }, + { + "epoch": 5.0, + "grad_norm": 0.08020564798786287, + "learning_rate": 0.0, + "loss": 0.3297, + "step": 4690 + }, + { + "epoch": 5.0, + "step": 4690, + "total_flos": 7.868010719084544e+16, + "train_loss": 0.37999461384724453, + "train_runtime": 78929.6999, + "train_samples_per_second": 30.397, + "train_steps_per_second": 0.059 + } + ], + "logging_steps": 1, + "max_steps": 4690, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.868010719084544e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}