diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22639 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.997677659080353, + "eval_steps": 500, + "global_step": 3228, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009289363678588017, + "grad_norm": 54.7475656620091, + "learning_rate": 0.0, + "loss": 11.0245, + "step": 1 + }, + { + "epoch": 0.0018578727357176034, + "grad_norm": 56.51890067533405, + "learning_rate": 1.5479876160990715e-07, + "loss": 10.9563, + "step": 2 + }, + { + "epoch": 0.002786809103576405, + "grad_norm": 52.84327216336746, + "learning_rate": 3.095975232198143e-07, + "loss": 11.1232, + "step": 3 + }, + { + "epoch": 0.0037157454714352067, + "grad_norm": 54.47240324631095, + "learning_rate": 4.6439628482972136e-07, + "loss": 11.0395, + "step": 4 + }, + { + "epoch": 0.004644681839294009, + "grad_norm": 56.06603126343985, + "learning_rate": 6.191950464396286e-07, + "loss": 10.9521, + "step": 5 + }, + { + "epoch": 0.00557361820715281, + "grad_norm": 56.653051495890196, + "learning_rate": 7.739938080495357e-07, + "loss": 10.9283, + "step": 6 + }, + { + "epoch": 0.006502554575011612, + "grad_norm": 57.19065708163452, + "learning_rate": 9.287925696594427e-07, + "loss": 10.9048, + "step": 7 + }, + { + "epoch": 0.0074314909428704135, + "grad_norm": 57.13309141244447, + "learning_rate": 1.08359133126935e-06, + "loss": 10.8692, + "step": 8 + }, + { + "epoch": 0.008360427310729215, + "grad_norm": 61.70021371119148, + "learning_rate": 1.2383900928792572e-06, + "loss": 10.7026, + "step": 9 + }, + { + "epoch": 0.009289363678588018, + "grad_norm": 58.64471121402323, + "learning_rate": 1.3931888544891641e-06, + "loss": 10.7762, + "step": 10 + }, + { + "epoch": 0.010218300046446818, + "grad_norm": 61.95985964008096, + "learning_rate": 1.5479876160990713e-06, + "loss": 10.6442, + "step": 11 + }, + { + "epoch": 0.01114723641430562, + "grad_norm": 81.74822353424084, + "learning_rate": 1.7027863777089783e-06, + "loss": 9.3569, + "step": 12 + }, + { + "epoch": 0.012076172782164421, + "grad_norm": 87.15614377730954, + "learning_rate": 1.8575851393188855e-06, + "loss": 9.3139, + "step": 13 + }, + { + "epoch": 0.013005109150023224, + "grad_norm": 90.69218149138536, + "learning_rate": 2.012383900928793e-06, + "loss": 8.9405, + "step": 14 + }, + { + "epoch": 0.013934045517882025, + "grad_norm": 96.84729927584988, + "learning_rate": 2.1671826625387e-06, + "loss": 8.6999, + "step": 15 + }, + { + "epoch": 0.014862981885740827, + "grad_norm": 65.07865556439408, + "learning_rate": 2.321981424148607e-06, + "loss": 3.7051, + "step": 16 + }, + { + "epoch": 0.01579191825359963, + "grad_norm": 62.52820085696381, + "learning_rate": 2.4767801857585144e-06, + "loss": 3.6981, + "step": 17 + }, + { + "epoch": 0.01672085462145843, + "grad_norm": 53.649988186848155, + "learning_rate": 2.631578947368421e-06, + "loss": 3.2385, + "step": 18 + }, + { + "epoch": 0.01764979098931723, + "grad_norm": 39.157669241392924, + "learning_rate": 2.7863777089783283e-06, + "loss": 2.6875, + "step": 19 + }, + { + "epoch": 0.018578727357176035, + "grad_norm": 34.53162931711357, + "learning_rate": 2.9411764705882355e-06, + "loss": 2.4641, + "step": 20 + }, + { + "epoch": 0.019507663725034836, + "grad_norm": 14.044732265304198, + "learning_rate": 3.0959752321981426e-06, + "loss": 1.7491, + "step": 21 + }, + { + "epoch": 0.020436600092893636, + "grad_norm": 6.143568036385673, + "learning_rate": 3.25077399380805e-06, + "loss": 1.2988, + "step": 22 + }, + { + "epoch": 0.021365536460752437, + "grad_norm": 5.2826561116659745, + "learning_rate": 3.4055727554179566e-06, + "loss": 1.3204, + "step": 23 + }, + { + "epoch": 0.02229447282861124, + "grad_norm": 4.030771783214909, + "learning_rate": 3.560371517027864e-06, + "loss": 1.2404, + "step": 24 + }, + { + "epoch": 0.023223409196470042, + "grad_norm": 3.426990839601739, + "learning_rate": 3.715170278637771e-06, + "loss": 1.1722, + "step": 25 + }, + { + "epoch": 0.024152345564328843, + "grad_norm": 2.663561421971141, + "learning_rate": 3.869969040247678e-06, + "loss": 1.132, + "step": 26 + }, + { + "epoch": 0.025081281932187643, + "grad_norm": 2.2095185737240914, + "learning_rate": 4.024767801857586e-06, + "loss": 1.0879, + "step": 27 + }, + { + "epoch": 0.026010218300046448, + "grad_norm": 1.8901365750210222, + "learning_rate": 4.1795665634674924e-06, + "loss": 1.0258, + "step": 28 + }, + { + "epoch": 0.02693915466790525, + "grad_norm": 1.3861813570839094, + "learning_rate": 4.3343653250774e-06, + "loss": 0.9735, + "step": 29 + }, + { + "epoch": 0.02786809103576405, + "grad_norm": 1.5525651218677432, + "learning_rate": 4.489164086687307e-06, + "loss": 0.9165, + "step": 30 + }, + { + "epoch": 0.028797027403622853, + "grad_norm": 4.777669808853715, + "learning_rate": 4.643962848297214e-06, + "loss": 0.8589, + "step": 31 + }, + { + "epoch": 0.029725963771481654, + "grad_norm": 1.9514011751823228, + "learning_rate": 4.798761609907121e-06, + "loss": 0.8463, + "step": 32 + }, + { + "epoch": 0.030654900139340455, + "grad_norm": 1.0398033405311462, + "learning_rate": 4.953560371517029e-06, + "loss": 0.8531, + "step": 33 + }, + { + "epoch": 0.03158383650719926, + "grad_norm": 0.9239326446187678, + "learning_rate": 5.1083591331269355e-06, + "loss": 0.8295, + "step": 34 + }, + { + "epoch": 0.032512772875058056, + "grad_norm": 0.8239281374239108, + "learning_rate": 5.263157894736842e-06, + "loss": 0.7953, + "step": 35 + }, + { + "epoch": 0.03344170924291686, + "grad_norm": 0.7456489091955973, + "learning_rate": 5.41795665634675e-06, + "loss": 0.8011, + "step": 36 + }, + { + "epoch": 0.034370645610775664, + "grad_norm": 0.7322576669859537, + "learning_rate": 5.5727554179566566e-06, + "loss": 0.7743, + "step": 37 + }, + { + "epoch": 0.03529958197863446, + "grad_norm": 0.6841211213339884, + "learning_rate": 5.727554179566564e-06, + "loss": 0.7502, + "step": 38 + }, + { + "epoch": 0.036228518346493266, + "grad_norm": 0.6392360133178303, + "learning_rate": 5.882352941176471e-06, + "loss": 0.7588, + "step": 39 + }, + { + "epoch": 0.03715745471435207, + "grad_norm": 0.5816712224571956, + "learning_rate": 6.0371517027863785e-06, + "loss": 0.763, + "step": 40 + }, + { + "epoch": 0.03808639108221087, + "grad_norm": 0.5551930154686257, + "learning_rate": 6.191950464396285e-06, + "loss": 0.7203, + "step": 41 + }, + { + "epoch": 0.03901532745006967, + "grad_norm": 0.627393609542681, + "learning_rate": 6.346749226006192e-06, + "loss": 0.7372, + "step": 42 + }, + { + "epoch": 0.03994426381792847, + "grad_norm": 0.5209393663178439, + "learning_rate": 6.5015479876161e-06, + "loss": 0.7142, + "step": 43 + }, + { + "epoch": 0.04087320018578727, + "grad_norm": 0.4311011906767075, + "learning_rate": 6.656346749226007e-06, + "loss": 0.6497, + "step": 44 + }, + { + "epoch": 0.04180213655364608, + "grad_norm": 0.44948642321517024, + "learning_rate": 6.811145510835913e-06, + "loss": 0.694, + "step": 45 + }, + { + "epoch": 0.042731072921504874, + "grad_norm": 0.4845127976887666, + "learning_rate": 6.965944272445821e-06, + "loss": 0.7111, + "step": 46 + }, + { + "epoch": 0.04366000928936368, + "grad_norm": 0.4680884482351343, + "learning_rate": 7.120743034055728e-06, + "loss": 0.6841, + "step": 47 + }, + { + "epoch": 0.04458894565722248, + "grad_norm": 0.4268577390410502, + "learning_rate": 7.275541795665634e-06, + "loss": 0.6861, + "step": 48 + }, + { + "epoch": 0.04551788202508128, + "grad_norm": 0.4221226832933112, + "learning_rate": 7.430340557275542e-06, + "loss": 0.6741, + "step": 49 + }, + { + "epoch": 0.046446818392940084, + "grad_norm": 0.5220141311400619, + "learning_rate": 7.585139318885449e-06, + "loss": 0.669, + "step": 50 + }, + { + "epoch": 0.04737575476079889, + "grad_norm": 0.42881334524580667, + "learning_rate": 7.739938080495356e-06, + "loss": 0.676, + "step": 51 + }, + { + "epoch": 0.048304691128657685, + "grad_norm": 0.3524885307145976, + "learning_rate": 7.894736842105263e-06, + "loss": 0.6381, + "step": 52 + }, + { + "epoch": 0.04923362749651649, + "grad_norm": 0.4114675079627211, + "learning_rate": 8.049535603715171e-06, + "loss": 0.6049, + "step": 53 + }, + { + "epoch": 0.05016256386437529, + "grad_norm": 0.40656103184868714, + "learning_rate": 8.204334365325078e-06, + "loss": 0.6202, + "step": 54 + }, + { + "epoch": 0.05109150023223409, + "grad_norm": 0.3732755562975793, + "learning_rate": 8.359133126934985e-06, + "loss": 0.6231, + "step": 55 + }, + { + "epoch": 0.052020436600092895, + "grad_norm": 0.3799659025653529, + "learning_rate": 8.513931888544892e-06, + "loss": 0.6167, + "step": 56 + }, + { + "epoch": 0.05294937296795169, + "grad_norm": 0.3548794527966943, + "learning_rate": 8.6687306501548e-06, + "loss": 0.6235, + "step": 57 + }, + { + "epoch": 0.0538783093358105, + "grad_norm": 0.3366392962881869, + "learning_rate": 8.823529411764707e-06, + "loss": 0.6037, + "step": 58 + }, + { + "epoch": 0.0548072457036693, + "grad_norm": 0.35622745848432813, + "learning_rate": 8.978328173374614e-06, + "loss": 0.6505, + "step": 59 + }, + { + "epoch": 0.0557361820715281, + "grad_norm": 0.33139910543786816, + "learning_rate": 9.13312693498452e-06, + "loss": 0.5907, + "step": 60 + }, + { + "epoch": 0.0566651184393869, + "grad_norm": 0.26984628140930006, + "learning_rate": 9.287925696594429e-06, + "loss": 0.6075, + "step": 61 + }, + { + "epoch": 0.057594054807245706, + "grad_norm": 0.30168457074265415, + "learning_rate": 9.442724458204334e-06, + "loss": 0.5929, + "step": 62 + }, + { + "epoch": 0.058522991175104504, + "grad_norm": 0.3514265855140124, + "learning_rate": 9.597523219814242e-06, + "loss": 0.5962, + "step": 63 + }, + { + "epoch": 0.05945192754296331, + "grad_norm": 0.3551734532081626, + "learning_rate": 9.752321981424149e-06, + "loss": 0.6197, + "step": 64 + }, + { + "epoch": 0.06038086391082211, + "grad_norm": 0.29947361957296, + "learning_rate": 9.907120743034057e-06, + "loss": 0.5796, + "step": 65 + }, + { + "epoch": 0.06130980027868091, + "grad_norm": 0.31337803543321063, + "learning_rate": 1.0061919504643963e-05, + "loss": 0.5948, + "step": 66 + }, + { + "epoch": 0.062238736646539713, + "grad_norm": 0.3215034540150923, + "learning_rate": 1.0216718266253871e-05, + "loss": 0.5481, + "step": 67 + }, + { + "epoch": 0.06316767301439852, + "grad_norm": 0.3102955136307414, + "learning_rate": 1.0371517027863778e-05, + "loss": 0.5789, + "step": 68 + }, + { + "epoch": 0.06409660938225732, + "grad_norm": 0.2573332075607664, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.5656, + "step": 69 + }, + { + "epoch": 0.06502554575011611, + "grad_norm": 0.2537807753587459, + "learning_rate": 1.0681114551083591e-05, + "loss": 0.5678, + "step": 70 + }, + { + "epoch": 0.06595448211797492, + "grad_norm": 0.30023463727934896, + "learning_rate": 1.08359133126935e-05, + "loss": 0.572, + "step": 71 + }, + { + "epoch": 0.06688341848583372, + "grad_norm": 0.2705492110612055, + "learning_rate": 1.0990712074303406e-05, + "loss": 0.5956, + "step": 72 + }, + { + "epoch": 0.06781235485369252, + "grad_norm": 0.27304258159582684, + "learning_rate": 1.1145510835913313e-05, + "loss": 0.5943, + "step": 73 + }, + { + "epoch": 0.06874129122155133, + "grad_norm": 0.27460235847783515, + "learning_rate": 1.130030959752322e-05, + "loss": 0.5738, + "step": 74 + }, + { + "epoch": 0.06967022758941012, + "grad_norm": 0.24083051338698006, + "learning_rate": 1.1455108359133128e-05, + "loss": 0.5543, + "step": 75 + }, + { + "epoch": 0.07059916395726892, + "grad_norm": 0.2862680829656088, + "learning_rate": 1.1609907120743033e-05, + "loss": 0.5574, + "step": 76 + }, + { + "epoch": 0.07152810032512773, + "grad_norm": 0.2522441867822406, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.5561, + "step": 77 + }, + { + "epoch": 0.07245703669298653, + "grad_norm": 0.2731655705611119, + "learning_rate": 1.1919504643962849e-05, + "loss": 0.5736, + "step": 78 + }, + { + "epoch": 0.07338597306084534, + "grad_norm": 0.2520420481239673, + "learning_rate": 1.2074303405572757e-05, + "loss": 0.5418, + "step": 79 + }, + { + "epoch": 0.07431490942870414, + "grad_norm": 0.22930537041421725, + "learning_rate": 1.2229102167182662e-05, + "loss": 0.5171, + "step": 80 + }, + { + "epoch": 0.07524384579656293, + "grad_norm": 0.273072858512794, + "learning_rate": 1.238390092879257e-05, + "loss": 0.5433, + "step": 81 + }, + { + "epoch": 0.07617278216442173, + "grad_norm": 0.2891845927667174, + "learning_rate": 1.2538699690402477e-05, + "loss": 0.5779, + "step": 82 + }, + { + "epoch": 0.07710171853228054, + "grad_norm": 0.2523018814544343, + "learning_rate": 1.2693498452012384e-05, + "loss": 0.5461, + "step": 83 + }, + { + "epoch": 0.07803065490013934, + "grad_norm": 0.3166725101519938, + "learning_rate": 1.2848297213622292e-05, + "loss": 0.5643, + "step": 84 + }, + { + "epoch": 0.07895959126799815, + "grad_norm": 0.2631824444127893, + "learning_rate": 1.30030959752322e-05, + "loss": 0.5138, + "step": 85 + }, + { + "epoch": 0.07988852763585694, + "grad_norm": 0.3004458758805224, + "learning_rate": 1.3157894736842106e-05, + "loss": 0.551, + "step": 86 + }, + { + "epoch": 0.08081746400371574, + "grad_norm": 0.27824957348028967, + "learning_rate": 1.3312693498452014e-05, + "loss": 0.5774, + "step": 87 + }, + { + "epoch": 0.08174640037157455, + "grad_norm": 0.30038729524349966, + "learning_rate": 1.346749226006192e-05, + "loss": 0.5255, + "step": 88 + }, + { + "epoch": 0.08267533673943335, + "grad_norm": 0.2942525266347019, + "learning_rate": 1.3622291021671826e-05, + "loss": 0.5569, + "step": 89 + }, + { + "epoch": 0.08360427310729215, + "grad_norm": 0.29120858799373767, + "learning_rate": 1.3777089783281735e-05, + "loss": 0.5224, + "step": 90 + }, + { + "epoch": 0.08453320947515096, + "grad_norm": 0.3596968247905661, + "learning_rate": 1.3931888544891641e-05, + "loss": 0.5199, + "step": 91 + }, + { + "epoch": 0.08546214584300975, + "grad_norm": 0.2812166695733365, + "learning_rate": 1.4086687306501548e-05, + "loss": 0.5363, + "step": 92 + }, + { + "epoch": 0.08639108221086855, + "grad_norm": 0.2992496470672875, + "learning_rate": 1.4241486068111457e-05, + "loss": 0.5634, + "step": 93 + }, + { + "epoch": 0.08732001857872736, + "grad_norm": 0.29257796169998596, + "learning_rate": 1.4396284829721363e-05, + "loss": 0.5527, + "step": 94 + }, + { + "epoch": 0.08824895494658616, + "grad_norm": 0.2732189071664008, + "learning_rate": 1.4551083591331268e-05, + "loss": 0.5026, + "step": 95 + }, + { + "epoch": 0.08917789131444497, + "grad_norm": 0.27378573341841284, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.575, + "step": 96 + }, + { + "epoch": 0.09010682768230376, + "grad_norm": 0.2917134046408098, + "learning_rate": 1.4860681114551084e-05, + "loss": 0.5561, + "step": 97 + }, + { + "epoch": 0.09103576405016256, + "grad_norm": 0.26829458430398617, + "learning_rate": 1.5015479876160992e-05, + "loss": 0.5082, + "step": 98 + }, + { + "epoch": 0.09196470041802136, + "grad_norm": 0.34740553302312405, + "learning_rate": 1.5170278637770899e-05, + "loss": 0.5794, + "step": 99 + }, + { + "epoch": 0.09289363678588017, + "grad_norm": 0.2936917575237726, + "learning_rate": 1.5325077399380806e-05, + "loss": 0.548, + "step": 100 + }, + { + "epoch": 0.09382257315373897, + "grad_norm": 0.3117382936574086, + "learning_rate": 1.5479876160990712e-05, + "loss": 0.5368, + "step": 101 + }, + { + "epoch": 0.09475150952159778, + "grad_norm": 0.22828777606096362, + "learning_rate": 1.563467492260062e-05, + "loss": 0.5314, + "step": 102 + }, + { + "epoch": 0.09568044588945657, + "grad_norm": 0.2771765662389807, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.5482, + "step": 103 + }, + { + "epoch": 0.09660938225731537, + "grad_norm": 0.2789355900210165, + "learning_rate": 1.5944272445820436e-05, + "loss": 0.5214, + "step": 104 + }, + { + "epoch": 0.09753831862517418, + "grad_norm": 0.29645303240245474, + "learning_rate": 1.6099071207430343e-05, + "loss": 0.5403, + "step": 105 + }, + { + "epoch": 0.09846725499303298, + "grad_norm": 0.2501658792361872, + "learning_rate": 1.6253869969040246e-05, + "loss": 0.5125, + "step": 106 + }, + { + "epoch": 0.09939619136089178, + "grad_norm": 0.27343053126969535, + "learning_rate": 1.6408668730650156e-05, + "loss": 0.4957, + "step": 107 + }, + { + "epoch": 0.10032512772875057, + "grad_norm": 0.2578817321890831, + "learning_rate": 1.6563467492260063e-05, + "loss": 0.5118, + "step": 108 + }, + { + "epoch": 0.10125406409660938, + "grad_norm": 0.26242875789808284, + "learning_rate": 1.671826625386997e-05, + "loss": 0.5009, + "step": 109 + }, + { + "epoch": 0.10218300046446818, + "grad_norm": 0.27788258841659896, + "learning_rate": 1.6873065015479876e-05, + "loss": 0.5152, + "step": 110 + }, + { + "epoch": 0.10311193683232699, + "grad_norm": 0.32503303320367943, + "learning_rate": 1.7027863777089783e-05, + "loss": 0.5675, + "step": 111 + }, + { + "epoch": 0.10404087320018579, + "grad_norm": 0.25485198676459764, + "learning_rate": 1.7182662538699693e-05, + "loss": 0.4957, + "step": 112 + }, + { + "epoch": 0.1049698095680446, + "grad_norm": 0.33134645178022537, + "learning_rate": 1.73374613003096e-05, + "loss": 0.5523, + "step": 113 + }, + { + "epoch": 0.10589874593590339, + "grad_norm": 0.29136775834871204, + "learning_rate": 1.7492260061919503e-05, + "loss": 0.5432, + "step": 114 + }, + { + "epoch": 0.10682768230376219, + "grad_norm": 0.29657828249266954, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.5074, + "step": 115 + }, + { + "epoch": 0.107756618671621, + "grad_norm": 0.2694020146187824, + "learning_rate": 1.780185758513932e-05, + "loss": 0.5159, + "step": 116 + }, + { + "epoch": 0.1086855550394798, + "grad_norm": 0.32008332295676645, + "learning_rate": 1.7956656346749227e-05, + "loss": 0.5078, + "step": 117 + }, + { + "epoch": 0.1096144914073386, + "grad_norm": 0.30370374026876107, + "learning_rate": 1.8111455108359134e-05, + "loss": 0.5251, + "step": 118 + }, + { + "epoch": 0.11054342777519739, + "grad_norm": 0.2540384388820426, + "learning_rate": 1.826625386996904e-05, + "loss": 0.4889, + "step": 119 + }, + { + "epoch": 0.1114723641430562, + "grad_norm": 0.2817516173937527, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.5096, + "step": 120 + }, + { + "epoch": 0.112401300510915, + "grad_norm": 0.28087162622672707, + "learning_rate": 1.8575851393188857e-05, + "loss": 0.5184, + "step": 121 + }, + { + "epoch": 0.1133302368787738, + "grad_norm": 0.2638926817448589, + "learning_rate": 1.873065015479876e-05, + "loss": 0.5073, + "step": 122 + }, + { + "epoch": 0.11425917324663261, + "grad_norm": 0.27924441224774316, + "learning_rate": 1.8885448916408668e-05, + "loss": 0.5045, + "step": 123 + }, + { + "epoch": 0.11518810961449141, + "grad_norm": 0.3038546535105753, + "learning_rate": 1.9040247678018578e-05, + "loss": 0.5647, + "step": 124 + }, + { + "epoch": 0.1161170459823502, + "grad_norm": 0.28072761267618945, + "learning_rate": 1.9195046439628485e-05, + "loss": 0.5041, + "step": 125 + }, + { + "epoch": 0.11704598235020901, + "grad_norm": 0.30934280983091644, + "learning_rate": 1.934984520123839e-05, + "loss": 0.5189, + "step": 126 + }, + { + "epoch": 0.11797491871806781, + "grad_norm": 0.3017588490881342, + "learning_rate": 1.9504643962848298e-05, + "loss": 0.5113, + "step": 127 + }, + { + "epoch": 0.11890385508592662, + "grad_norm": 0.2595245492733277, + "learning_rate": 1.9659442724458205e-05, + "loss": 0.5396, + "step": 128 + }, + { + "epoch": 0.11983279145378542, + "grad_norm": 0.3188023407278314, + "learning_rate": 1.9814241486068115e-05, + "loss": 0.4933, + "step": 129 + }, + { + "epoch": 0.12076172782164422, + "grad_norm": 0.2709603354646054, + "learning_rate": 1.9969040247678018e-05, + "loss": 0.5159, + "step": 130 + }, + { + "epoch": 0.12169066418950301, + "grad_norm": 0.28324770317715753, + "learning_rate": 2.0123839009287925e-05, + "loss": 0.4932, + "step": 131 + }, + { + "epoch": 0.12261960055736182, + "grad_norm": 0.27619503735171647, + "learning_rate": 2.0278637770897835e-05, + "loss": 0.5303, + "step": 132 + }, + { + "epoch": 0.12354853692522062, + "grad_norm": 0.26401711089649377, + "learning_rate": 2.0433436532507742e-05, + "loss": 0.4868, + "step": 133 + }, + { + "epoch": 0.12447747329307943, + "grad_norm": 0.2593772391982529, + "learning_rate": 2.058823529411765e-05, + "loss": 0.4929, + "step": 134 + }, + { + "epoch": 0.12540640966093822, + "grad_norm": 0.28933660519338583, + "learning_rate": 2.0743034055727555e-05, + "loss": 0.476, + "step": 135 + }, + { + "epoch": 0.12633534602879704, + "grad_norm": 0.23316118822530046, + "learning_rate": 2.0897832817337462e-05, + "loss": 0.4568, + "step": 136 + }, + { + "epoch": 0.12726428239665583, + "grad_norm": 0.31249950051428027, + "learning_rate": 2.105263157894737e-05, + "loss": 0.5187, + "step": 137 + }, + { + "epoch": 0.12819321876451464, + "grad_norm": 0.29165292379136437, + "learning_rate": 2.1207430340557276e-05, + "loss": 0.5117, + "step": 138 + }, + { + "epoch": 0.12912215513237343, + "grad_norm": 0.26649297211443057, + "learning_rate": 2.1362229102167182e-05, + "loss": 0.479, + "step": 139 + }, + { + "epoch": 0.13005109150023222, + "grad_norm": 0.3142784146098132, + "learning_rate": 2.151702786377709e-05, + "loss": 0.5216, + "step": 140 + }, + { + "epoch": 0.13098002786809104, + "grad_norm": 0.29423375787358585, + "learning_rate": 2.1671826625387e-05, + "loss": 0.4994, + "step": 141 + }, + { + "epoch": 0.13190896423594983, + "grad_norm": 0.28110784066712485, + "learning_rate": 2.1826625386996906e-05, + "loss": 0.4734, + "step": 142 + }, + { + "epoch": 0.13283790060380865, + "grad_norm": 0.29325368926120227, + "learning_rate": 2.1981424148606813e-05, + "loss": 0.4963, + "step": 143 + }, + { + "epoch": 0.13376683697166744, + "grad_norm": 0.29510748124918773, + "learning_rate": 2.213622291021672e-05, + "loss": 0.4982, + "step": 144 + }, + { + "epoch": 0.13469577333952623, + "grad_norm": 0.26461232115849287, + "learning_rate": 2.2291021671826626e-05, + "loss": 0.5166, + "step": 145 + }, + { + "epoch": 0.13562470970738505, + "grad_norm": 0.31236587334144944, + "learning_rate": 2.2445820433436533e-05, + "loss": 0.5312, + "step": 146 + }, + { + "epoch": 0.13655364607524384, + "grad_norm": 0.300081395956833, + "learning_rate": 2.260061919504644e-05, + "loss": 0.5129, + "step": 147 + }, + { + "epoch": 0.13748258244310266, + "grad_norm": 0.31610036230407396, + "learning_rate": 2.2755417956656347e-05, + "loss": 0.4776, + "step": 148 + }, + { + "epoch": 0.13841151881096145, + "grad_norm": 0.30145949638272806, + "learning_rate": 2.2910216718266257e-05, + "loss": 0.4902, + "step": 149 + }, + { + "epoch": 0.13934045517882024, + "grad_norm": 0.27339946306277607, + "learning_rate": 2.3065015479876163e-05, + "loss": 0.464, + "step": 150 + }, + { + "epoch": 0.14026939154667906, + "grad_norm": 0.3156892182016014, + "learning_rate": 2.3219814241486067e-05, + "loss": 0.495, + "step": 151 + }, + { + "epoch": 0.14119832791453785, + "grad_norm": 0.3687764130938456, + "learning_rate": 2.3374613003095977e-05, + "loss": 0.4868, + "step": 152 + }, + { + "epoch": 0.14212726428239666, + "grad_norm": 0.2800596635623465, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.4739, + "step": 153 + }, + { + "epoch": 0.14305620065025546, + "grad_norm": 0.3992825928546232, + "learning_rate": 2.368421052631579e-05, + "loss": 0.4913, + "step": 154 + }, + { + "epoch": 0.14398513701811425, + "grad_norm": 0.3239223108616911, + "learning_rate": 2.3839009287925697e-05, + "loss": 0.4734, + "step": 155 + }, + { + "epoch": 0.14491407338597306, + "grad_norm": 0.3963740287758293, + "learning_rate": 2.3993808049535604e-05, + "loss": 0.5013, + "step": 156 + }, + { + "epoch": 0.14584300975383185, + "grad_norm": 0.36982949622797573, + "learning_rate": 2.4148606811145514e-05, + "loss": 0.5329, + "step": 157 + }, + { + "epoch": 0.14677194612169067, + "grad_norm": 0.3446342173260943, + "learning_rate": 2.430340557275542e-05, + "loss": 0.4617, + "step": 158 + }, + { + "epoch": 0.14770088248954946, + "grad_norm": 0.3726891488879446, + "learning_rate": 2.4458204334365324e-05, + "loss": 0.5158, + "step": 159 + }, + { + "epoch": 0.14862981885740828, + "grad_norm": 0.34306897903348194, + "learning_rate": 2.4613003095975234e-05, + "loss": 0.5011, + "step": 160 + }, + { + "epoch": 0.14955875522526707, + "grad_norm": 0.3509672474944589, + "learning_rate": 2.476780185758514e-05, + "loss": 0.5179, + "step": 161 + }, + { + "epoch": 0.15048769159312586, + "grad_norm": 0.36888921482695486, + "learning_rate": 2.4922600619195048e-05, + "loss": 0.488, + "step": 162 + }, + { + "epoch": 0.15141662796098468, + "grad_norm": 0.2879360129205803, + "learning_rate": 2.5077399380804955e-05, + "loss": 0.4818, + "step": 163 + }, + { + "epoch": 0.15234556432884347, + "grad_norm": 0.3541612368911695, + "learning_rate": 2.5232198142414865e-05, + "loss": 0.4645, + "step": 164 + }, + { + "epoch": 0.1532745006967023, + "grad_norm": 0.35885056781188374, + "learning_rate": 2.5386996904024768e-05, + "loss": 0.5074, + "step": 165 + }, + { + "epoch": 0.15420343706456108, + "grad_norm": 0.34753901489218775, + "learning_rate": 2.5541795665634678e-05, + "loss": 0.5209, + "step": 166 + }, + { + "epoch": 0.15513237343241987, + "grad_norm": 0.3353853516137484, + "learning_rate": 2.5696594427244585e-05, + "loss": 0.5021, + "step": 167 + }, + { + "epoch": 0.15606130980027869, + "grad_norm": 0.3298140566724545, + "learning_rate": 2.585139318885449e-05, + "loss": 0.5159, + "step": 168 + }, + { + "epoch": 0.15699024616813748, + "grad_norm": 0.3391997674833228, + "learning_rate": 2.60061919504644e-05, + "loss": 0.4754, + "step": 169 + }, + { + "epoch": 0.1579191825359963, + "grad_norm": 0.28704771216792646, + "learning_rate": 2.616099071207431e-05, + "loss": 0.4703, + "step": 170 + }, + { + "epoch": 0.15884811890385508, + "grad_norm": 0.37267201290204965, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.5214, + "step": 171 + }, + { + "epoch": 0.15977705527171387, + "grad_norm": 0.27902165277762125, + "learning_rate": 2.647058823529412e-05, + "loss": 0.4699, + "step": 172 + }, + { + "epoch": 0.1607059916395727, + "grad_norm": 0.3398458423351264, + "learning_rate": 2.662538699690403e-05, + "loss": 0.5197, + "step": 173 + }, + { + "epoch": 0.16163492800743148, + "grad_norm": 0.3435324805081321, + "learning_rate": 2.6780185758513932e-05, + "loss": 0.4789, + "step": 174 + }, + { + "epoch": 0.1625638643752903, + "grad_norm": 0.4351648566726122, + "learning_rate": 2.693498452012384e-05, + "loss": 0.4987, + "step": 175 + }, + { + "epoch": 0.1634928007431491, + "grad_norm": 0.3198149348956976, + "learning_rate": 2.708978328173375e-05, + "loss": 0.475, + "step": 176 + }, + { + "epoch": 0.1644217371110079, + "grad_norm": 0.3950400431692381, + "learning_rate": 2.7244582043343652e-05, + "loss": 0.4744, + "step": 177 + }, + { + "epoch": 0.1653506734788667, + "grad_norm": 0.3760190425753378, + "learning_rate": 2.7399380804953563e-05, + "loss": 0.4938, + "step": 178 + }, + { + "epoch": 0.1662796098467255, + "grad_norm": 0.417176002219102, + "learning_rate": 2.755417956656347e-05, + "loss": 0.513, + "step": 179 + }, + { + "epoch": 0.1672085462145843, + "grad_norm": 0.3474000705492875, + "learning_rate": 2.7708978328173373e-05, + "loss": 0.464, + "step": 180 + }, + { + "epoch": 0.1681374825824431, + "grad_norm": 0.35375534821068394, + "learning_rate": 2.7863777089783283e-05, + "loss": 0.5123, + "step": 181 + }, + { + "epoch": 0.16906641895030192, + "grad_norm": 0.4153734241752173, + "learning_rate": 2.8018575851393193e-05, + "loss": 0.4786, + "step": 182 + }, + { + "epoch": 0.1699953553181607, + "grad_norm": 0.3633545185465792, + "learning_rate": 2.8173374613003096e-05, + "loss": 0.4881, + "step": 183 + }, + { + "epoch": 0.1709242916860195, + "grad_norm": 0.4177673269252791, + "learning_rate": 2.8328173374613003e-05, + "loss": 0.4851, + "step": 184 + }, + { + "epoch": 0.17185322805387832, + "grad_norm": 0.34872663223370376, + "learning_rate": 2.8482972136222913e-05, + "loss": 0.4562, + "step": 185 + }, + { + "epoch": 0.1727821644217371, + "grad_norm": 0.3826892295449655, + "learning_rate": 2.8637770897832817e-05, + "loss": 0.4704, + "step": 186 + }, + { + "epoch": 0.17371110078959592, + "grad_norm": 0.2911667819931114, + "learning_rate": 2.8792569659442727e-05, + "loss": 0.4647, + "step": 187 + }, + { + "epoch": 0.17464003715745471, + "grad_norm": 0.3577750615715225, + "learning_rate": 2.8947368421052634e-05, + "loss": 0.4758, + "step": 188 + }, + { + "epoch": 0.1755689735253135, + "grad_norm": 0.28796027674445784, + "learning_rate": 2.9102167182662537e-05, + "loss": 0.4934, + "step": 189 + }, + { + "epoch": 0.17649790989317232, + "grad_norm": 0.328868526631627, + "learning_rate": 2.9256965944272447e-05, + "loss": 0.4989, + "step": 190 + }, + { + "epoch": 0.1774268462610311, + "grad_norm": 0.30926168846748986, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.4641, + "step": 191 + }, + { + "epoch": 0.17835578262888993, + "grad_norm": 0.3990926605295306, + "learning_rate": 2.9566563467492264e-05, + "loss": 0.5012, + "step": 192 + }, + { + "epoch": 0.17928471899674872, + "grad_norm": 0.2967167603656067, + "learning_rate": 2.9721362229102167e-05, + "loss": 0.4804, + "step": 193 + }, + { + "epoch": 0.1802136553646075, + "grad_norm": 0.334297436056706, + "learning_rate": 2.9876160990712077e-05, + "loss": 0.4589, + "step": 194 + }, + { + "epoch": 0.18114259173246633, + "grad_norm": 0.39015351822564853, + "learning_rate": 3.0030959752321984e-05, + "loss": 0.5029, + "step": 195 + }, + { + "epoch": 0.18207152810032512, + "grad_norm": 0.3166981562612517, + "learning_rate": 3.0185758513931888e-05, + "loss": 0.4826, + "step": 196 + }, + { + "epoch": 0.18300046446818394, + "grad_norm": 0.3697612896094445, + "learning_rate": 3.0340557275541798e-05, + "loss": 0.4834, + "step": 197 + }, + { + "epoch": 0.18392940083604273, + "grad_norm": 0.3931362669456308, + "learning_rate": 3.0495356037151708e-05, + "loss": 0.4827, + "step": 198 + }, + { + "epoch": 0.18485833720390155, + "grad_norm": 0.37542137369737677, + "learning_rate": 3.065015479876161e-05, + "loss": 0.5137, + "step": 199 + }, + { + "epoch": 0.18578727357176034, + "grad_norm": 0.40365155511717965, + "learning_rate": 3.080495356037152e-05, + "loss": 0.4616, + "step": 200 + }, + { + "epoch": 0.18671620993961913, + "grad_norm": 0.3462177469197346, + "learning_rate": 3.0959752321981425e-05, + "loss": 0.5022, + "step": 201 + }, + { + "epoch": 0.18764514630747794, + "grad_norm": 0.4146922057088302, + "learning_rate": 3.111455108359133e-05, + "loss": 0.463, + "step": 202 + }, + { + "epoch": 0.18857408267533673, + "grad_norm": 0.34021579524785434, + "learning_rate": 3.126934984520124e-05, + "loss": 0.4588, + "step": 203 + }, + { + "epoch": 0.18950301904319555, + "grad_norm": 0.37786067260822703, + "learning_rate": 3.142414860681115e-05, + "loss": 0.4759, + "step": 204 + }, + { + "epoch": 0.19043195541105434, + "grad_norm": 0.3871068334106478, + "learning_rate": 3.157894736842105e-05, + "loss": 0.4702, + "step": 205 + }, + { + "epoch": 0.19136089177891313, + "grad_norm": 0.30844377564643344, + "learning_rate": 3.173374613003096e-05, + "loss": 0.4506, + "step": 206 + }, + { + "epoch": 0.19228982814677195, + "grad_norm": 0.387790224919225, + "learning_rate": 3.188854489164087e-05, + "loss": 0.4554, + "step": 207 + }, + { + "epoch": 0.19321876451463074, + "grad_norm": 0.3486750036808893, + "learning_rate": 3.204334365325077e-05, + "loss": 0.4735, + "step": 208 + }, + { + "epoch": 0.19414770088248956, + "grad_norm": 0.36108244308720244, + "learning_rate": 3.2198142414860685e-05, + "loss": 0.4516, + "step": 209 + }, + { + "epoch": 0.19507663725034835, + "grad_norm": 0.34168388969435387, + "learning_rate": 3.235294117647059e-05, + "loss": 0.4716, + "step": 210 + }, + { + "epoch": 0.19600557361820714, + "grad_norm": 0.3336301263066555, + "learning_rate": 3.250773993808049e-05, + "loss": 0.462, + "step": 211 + }, + { + "epoch": 0.19693450998606596, + "grad_norm": 0.30575256557626257, + "learning_rate": 3.2662538699690406e-05, + "loss": 0.433, + "step": 212 + }, + { + "epoch": 0.19786344635392475, + "grad_norm": 0.31175854650330226, + "learning_rate": 3.281733746130031e-05, + "loss": 0.4831, + "step": 213 + }, + { + "epoch": 0.19879238272178357, + "grad_norm": 0.3195577431584941, + "learning_rate": 3.297213622291022e-05, + "loss": 0.4939, + "step": 214 + }, + { + "epoch": 0.19972131908964236, + "grad_norm": 0.324334575666152, + "learning_rate": 3.3126934984520126e-05, + "loss": 0.4674, + "step": 215 + }, + { + "epoch": 0.20065025545750115, + "grad_norm": 0.30930275701374427, + "learning_rate": 3.328173374613003e-05, + "loss": 0.4861, + "step": 216 + }, + { + "epoch": 0.20157919182535997, + "grad_norm": 0.3196175745480417, + "learning_rate": 3.343653250773994e-05, + "loss": 0.4785, + "step": 217 + }, + { + "epoch": 0.20250812819321876, + "grad_norm": 0.3460990192307274, + "learning_rate": 3.3591331269349846e-05, + "loss": 0.4843, + "step": 218 + }, + { + "epoch": 0.20343706456107757, + "grad_norm": 0.3189777288334882, + "learning_rate": 3.374613003095975e-05, + "loss": 0.5066, + "step": 219 + }, + { + "epoch": 0.20436600092893636, + "grad_norm": 0.36563447762228074, + "learning_rate": 3.390092879256966e-05, + "loss": 0.4544, + "step": 220 + }, + { + "epoch": 0.20529493729679518, + "grad_norm": 0.3437365831635904, + "learning_rate": 3.4055727554179566e-05, + "loss": 0.4644, + "step": 221 + }, + { + "epoch": 0.20622387366465397, + "grad_norm": 0.3568648135029391, + "learning_rate": 3.421052631578947e-05, + "loss": 0.4934, + "step": 222 + }, + { + "epoch": 0.20715281003251276, + "grad_norm": 0.40598865357747305, + "learning_rate": 3.436532507739939e-05, + "loss": 0.4913, + "step": 223 + }, + { + "epoch": 0.20808174640037158, + "grad_norm": 0.4241132879182254, + "learning_rate": 3.452012383900929e-05, + "loss": 0.4535, + "step": 224 + }, + { + "epoch": 0.20901068276823037, + "grad_norm": 0.35305194926225575, + "learning_rate": 3.46749226006192e-05, + "loss": 0.4682, + "step": 225 + }, + { + "epoch": 0.2099396191360892, + "grad_norm": 0.618623679289102, + "learning_rate": 3.482972136222911e-05, + "loss": 0.4818, + "step": 226 + }, + { + "epoch": 0.21086855550394798, + "grad_norm": 0.5920146651422737, + "learning_rate": 3.498452012383901e-05, + "loss": 0.4841, + "step": 227 + }, + { + "epoch": 0.21179749187180677, + "grad_norm": 0.527405882028656, + "learning_rate": 3.513931888544892e-05, + "loss": 0.4834, + "step": 228 + }, + { + "epoch": 0.2127264282396656, + "grad_norm": 0.8374512884655507, + "learning_rate": 3.529411764705883e-05, + "loss": 0.4914, + "step": 229 + }, + { + "epoch": 0.21365536460752438, + "grad_norm": 0.7597460219900155, + "learning_rate": 3.5448916408668734e-05, + "loss": 0.4609, + "step": 230 + }, + { + "epoch": 0.2145843009753832, + "grad_norm": 0.5139866484545614, + "learning_rate": 3.560371517027864e-05, + "loss": 0.457, + "step": 231 + }, + { + "epoch": 0.215513237343242, + "grad_norm": 1.235134692869283, + "learning_rate": 3.575851393188855e-05, + "loss": 0.5001, + "step": 232 + }, + { + "epoch": 0.21644217371110078, + "grad_norm": 0.41145239593782507, + "learning_rate": 3.5913312693498454e-05, + "loss": 0.4375, + "step": 233 + }, + { + "epoch": 0.2173711100789596, + "grad_norm": 0.6816112033947701, + "learning_rate": 3.606811145510836e-05, + "loss": 0.4771, + "step": 234 + }, + { + "epoch": 0.21830004644681839, + "grad_norm": 0.4343968108259159, + "learning_rate": 3.622291021671827e-05, + "loss": 0.4565, + "step": 235 + }, + { + "epoch": 0.2192289828146772, + "grad_norm": 0.6469045830088757, + "learning_rate": 3.6377708978328174e-05, + "loss": 0.4705, + "step": 236 + }, + { + "epoch": 0.220157919182536, + "grad_norm": 0.43542141035499127, + "learning_rate": 3.653250773993808e-05, + "loss": 0.4497, + "step": 237 + }, + { + "epoch": 0.22108685555039478, + "grad_norm": 0.5546407234766495, + "learning_rate": 3.668730650154799e-05, + "loss": 0.4678, + "step": 238 + }, + { + "epoch": 0.2220157919182536, + "grad_norm": 0.4764192191948784, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.475, + "step": 239 + }, + { + "epoch": 0.2229447282861124, + "grad_norm": 0.43770272881976247, + "learning_rate": 3.69969040247678e-05, + "loss": 0.4427, + "step": 240 + }, + { + "epoch": 0.2238736646539712, + "grad_norm": 0.4550187245086204, + "learning_rate": 3.7151702786377715e-05, + "loss": 0.4414, + "step": 241 + }, + { + "epoch": 0.22480260102183, + "grad_norm": 0.5330169483676697, + "learning_rate": 3.7306501547987615e-05, + "loss": 0.4819, + "step": 242 + }, + { + "epoch": 0.22573153738968882, + "grad_norm": 0.4355552273324838, + "learning_rate": 3.746130030959752e-05, + "loss": 0.473, + "step": 243 + }, + { + "epoch": 0.2266604737575476, + "grad_norm": 0.529146441551152, + "learning_rate": 3.7616099071207435e-05, + "loss": 0.5233, + "step": 244 + }, + { + "epoch": 0.2275894101254064, + "grad_norm": 0.42768230047362027, + "learning_rate": 3.7770897832817335e-05, + "loss": 0.4652, + "step": 245 + }, + { + "epoch": 0.22851834649326522, + "grad_norm": 0.498811282728108, + "learning_rate": 3.792569659442725e-05, + "loss": 0.4983, + "step": 246 + }, + { + "epoch": 0.229447282861124, + "grad_norm": 0.4438510879340567, + "learning_rate": 3.8080495356037155e-05, + "loss": 0.4366, + "step": 247 + }, + { + "epoch": 0.23037621922898283, + "grad_norm": 0.455953819278577, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.4506, + "step": 248 + }, + { + "epoch": 0.23130515559684162, + "grad_norm": 0.40125931844226015, + "learning_rate": 3.839009287925697e-05, + "loss": 0.4626, + "step": 249 + }, + { + "epoch": 0.2322340919647004, + "grad_norm": 0.3539141975061322, + "learning_rate": 3.8544891640866876e-05, + "loss": 0.4499, + "step": 250 + }, + { + "epoch": 0.23316302833255922, + "grad_norm": 0.5042429216492247, + "learning_rate": 3.869969040247678e-05, + "loss": 0.4647, + "step": 251 + }, + { + "epoch": 0.23409196470041801, + "grad_norm": 0.3535166606707465, + "learning_rate": 3.885448916408669e-05, + "loss": 0.4746, + "step": 252 + }, + { + "epoch": 0.23502090106827683, + "grad_norm": 0.5056187976580123, + "learning_rate": 3.9009287925696596e-05, + "loss": 0.4405, + "step": 253 + }, + { + "epoch": 0.23594983743613562, + "grad_norm": 0.33101182681558045, + "learning_rate": 3.91640866873065e-05, + "loss": 0.4589, + "step": 254 + }, + { + "epoch": 0.2368787738039944, + "grad_norm": 0.4759625175194635, + "learning_rate": 3.931888544891641e-05, + "loss": 0.4532, + "step": 255 + }, + { + "epoch": 0.23780771017185323, + "grad_norm": 0.3870086650346993, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.4468, + "step": 256 + }, + { + "epoch": 0.23873664653971202, + "grad_norm": 0.408444973925042, + "learning_rate": 3.962848297213623e-05, + "loss": 0.4646, + "step": 257 + }, + { + "epoch": 0.23966558290757084, + "grad_norm": 0.44744012598105387, + "learning_rate": 3.978328173374613e-05, + "loss": 0.455, + "step": 258 + }, + { + "epoch": 0.24059451927542963, + "grad_norm": 0.37785914884066873, + "learning_rate": 3.9938080495356037e-05, + "loss": 0.4874, + "step": 259 + }, + { + "epoch": 0.24152345564328845, + "grad_norm": 0.476511379545392, + "learning_rate": 4.009287925696595e-05, + "loss": 0.4546, + "step": 260 + }, + { + "epoch": 0.24245239201114724, + "grad_norm": 0.31290275808884926, + "learning_rate": 4.024767801857585e-05, + "loss": 0.4567, + "step": 261 + }, + { + "epoch": 0.24338132837900603, + "grad_norm": 0.46969828490127746, + "learning_rate": 4.0402476780185764e-05, + "loss": 0.4921, + "step": 262 + }, + { + "epoch": 0.24431026474686485, + "grad_norm": 0.29314739835009784, + "learning_rate": 4.055727554179567e-05, + "loss": 0.4263, + "step": 263 + }, + { + "epoch": 0.24523920111472364, + "grad_norm": 0.4260667269541755, + "learning_rate": 4.071207430340557e-05, + "loss": 0.4473, + "step": 264 + }, + { + "epoch": 0.24616813748258246, + "grad_norm": 0.29731013199401535, + "learning_rate": 4.0866873065015484e-05, + "loss": 0.4657, + "step": 265 + }, + { + "epoch": 0.24709707385044125, + "grad_norm": 0.38120279832509496, + "learning_rate": 4.102167182662539e-05, + "loss": 0.4368, + "step": 266 + }, + { + "epoch": 0.24802601021830004, + "grad_norm": 0.3167100904137455, + "learning_rate": 4.11764705882353e-05, + "loss": 0.4562, + "step": 267 + }, + { + "epoch": 0.24895494658615885, + "grad_norm": 0.4814669900513275, + "learning_rate": 4.1331269349845204e-05, + "loss": 0.4461, + "step": 268 + }, + { + "epoch": 0.24988388295401764, + "grad_norm": 0.32074498184201283, + "learning_rate": 4.148606811145511e-05, + "loss": 0.4613, + "step": 269 + }, + { + "epoch": 0.25081281932187643, + "grad_norm": 0.4084599440557474, + "learning_rate": 4.164086687306502e-05, + "loss": 0.47, + "step": 270 + }, + { + "epoch": 0.2517417556897353, + "grad_norm": 0.32165189918387554, + "learning_rate": 4.1795665634674924e-05, + "loss": 0.4814, + "step": 271 + }, + { + "epoch": 0.25267069205759407, + "grad_norm": 0.32816460839997996, + "learning_rate": 4.195046439628483e-05, + "loss": 0.4256, + "step": 272 + }, + { + "epoch": 0.25359962842545286, + "grad_norm": 0.3008939493300022, + "learning_rate": 4.210526315789474e-05, + "loss": 0.473, + "step": 273 + }, + { + "epoch": 0.25452856479331165, + "grad_norm": 0.3529450610360616, + "learning_rate": 4.2260061919504645e-05, + "loss": 0.4547, + "step": 274 + }, + { + "epoch": 0.25545750116117044, + "grad_norm": 0.2972895680132811, + "learning_rate": 4.241486068111455e-05, + "loss": 0.4539, + "step": 275 + }, + { + "epoch": 0.2563864375290293, + "grad_norm": 0.30425905871603104, + "learning_rate": 4.256965944272446e-05, + "loss": 0.4468, + "step": 276 + }, + { + "epoch": 0.2573153738968881, + "grad_norm": 0.314584382041487, + "learning_rate": 4.2724458204334365e-05, + "loss": 0.4952, + "step": 277 + }, + { + "epoch": 0.25824431026474687, + "grad_norm": 0.3160101237954399, + "learning_rate": 4.287925696594428e-05, + "loss": 0.4654, + "step": 278 + }, + { + "epoch": 0.25917324663260566, + "grad_norm": 0.31709523022756986, + "learning_rate": 4.303405572755418e-05, + "loss": 0.4673, + "step": 279 + }, + { + "epoch": 0.26010218300046445, + "grad_norm": 0.2980363449021666, + "learning_rate": 4.3188854489164085e-05, + "loss": 0.4187, + "step": 280 + }, + { + "epoch": 0.2610311193683233, + "grad_norm": 0.33880341972476546, + "learning_rate": 4.3343653250774e-05, + "loss": 0.477, + "step": 281 + }, + { + "epoch": 0.2619600557361821, + "grad_norm": 0.35521193406469326, + "learning_rate": 4.3498452012383905e-05, + "loss": 0.4762, + "step": 282 + }, + { + "epoch": 0.2628889921040409, + "grad_norm": 0.39850142060231275, + "learning_rate": 4.365325077399381e-05, + "loss": 0.4634, + "step": 283 + }, + { + "epoch": 0.26381792847189967, + "grad_norm": 0.36023726265847905, + "learning_rate": 4.380804953560372e-05, + "loss": 0.4757, + "step": 284 + }, + { + "epoch": 0.26474686483975846, + "grad_norm": 0.32295320874219785, + "learning_rate": 4.3962848297213626e-05, + "loss": 0.4523, + "step": 285 + }, + { + "epoch": 0.2656758012076173, + "grad_norm": 0.42060056160022957, + "learning_rate": 4.411764705882353e-05, + "loss": 0.4928, + "step": 286 + }, + { + "epoch": 0.2666047375754761, + "grad_norm": 0.38366257834055983, + "learning_rate": 4.427244582043344e-05, + "loss": 0.4773, + "step": 287 + }, + { + "epoch": 0.2675336739433349, + "grad_norm": 0.34821572318085275, + "learning_rate": 4.4427244582043346e-05, + "loss": 0.4495, + "step": 288 + }, + { + "epoch": 0.2684626103111937, + "grad_norm": 0.35080988887460157, + "learning_rate": 4.458204334365325e-05, + "loss": 0.4619, + "step": 289 + }, + { + "epoch": 0.26939154667905246, + "grad_norm": 0.3120875797937547, + "learning_rate": 4.473684210526316e-05, + "loss": 0.4354, + "step": 290 + }, + { + "epoch": 0.2703204830469113, + "grad_norm": 0.36916544806752755, + "learning_rate": 4.4891640866873066e-05, + "loss": 0.4428, + "step": 291 + }, + { + "epoch": 0.2712494194147701, + "grad_norm": 0.3120938357971595, + "learning_rate": 4.504643962848297e-05, + "loss": 0.479, + "step": 292 + }, + { + "epoch": 0.2721783557826289, + "grad_norm": 0.33009772967549367, + "learning_rate": 4.520123839009288e-05, + "loss": 0.44, + "step": 293 + }, + { + "epoch": 0.2731072921504877, + "grad_norm": 0.3147604795938479, + "learning_rate": 4.535603715170279e-05, + "loss": 0.4873, + "step": 294 + }, + { + "epoch": 0.27403622851834647, + "grad_norm": 0.3024028343733452, + "learning_rate": 4.551083591331269e-05, + "loss": 0.4684, + "step": 295 + }, + { + "epoch": 0.2749651648862053, + "grad_norm": 0.31188391724375253, + "learning_rate": 4.56656346749226e-05, + "loss": 0.447, + "step": 296 + }, + { + "epoch": 0.2758941012540641, + "grad_norm": 0.28892910625279733, + "learning_rate": 4.582043343653251e-05, + "loss": 0.4665, + "step": 297 + }, + { + "epoch": 0.2768230376219229, + "grad_norm": 0.3736874839548374, + "learning_rate": 4.597523219814241e-05, + "loss": 0.4526, + "step": 298 + }, + { + "epoch": 0.2777519739897817, + "grad_norm": 0.28993119120770083, + "learning_rate": 4.613003095975233e-05, + "loss": 0.4526, + "step": 299 + }, + { + "epoch": 0.2786809103576405, + "grad_norm": 0.3489816006908061, + "learning_rate": 4.6284829721362234e-05, + "loss": 0.4389, + "step": 300 + }, + { + "epoch": 0.2796098467254993, + "grad_norm": 0.32821341978908847, + "learning_rate": 4.6439628482972134e-05, + "loss": 0.47, + "step": 301 + }, + { + "epoch": 0.2805387830933581, + "grad_norm": 0.3101425751832829, + "learning_rate": 4.659442724458205e-05, + "loss": 0.4602, + "step": 302 + }, + { + "epoch": 0.2814677194612169, + "grad_norm": 0.3009900967986436, + "learning_rate": 4.6749226006191954e-05, + "loss": 0.4604, + "step": 303 + }, + { + "epoch": 0.2823966558290757, + "grad_norm": 0.3103354270645873, + "learning_rate": 4.690402476780186e-05, + "loss": 0.4737, + "step": 304 + }, + { + "epoch": 0.2833255921969345, + "grad_norm": 0.30614215441269343, + "learning_rate": 4.705882352941177e-05, + "loss": 0.4707, + "step": 305 + }, + { + "epoch": 0.28425452856479333, + "grad_norm": 0.31908370093108424, + "learning_rate": 4.7213622291021674e-05, + "loss": 0.4473, + "step": 306 + }, + { + "epoch": 0.2851834649326521, + "grad_norm": 0.2959072198444788, + "learning_rate": 4.736842105263158e-05, + "loss": 0.4579, + "step": 307 + }, + { + "epoch": 0.2861124013005109, + "grad_norm": 0.3057457489708315, + "learning_rate": 4.752321981424149e-05, + "loss": 0.4199, + "step": 308 + }, + { + "epoch": 0.2870413376683697, + "grad_norm": 0.30818993057548966, + "learning_rate": 4.7678018575851394e-05, + "loss": 0.4627, + "step": 309 + }, + { + "epoch": 0.2879702740362285, + "grad_norm": 0.33780222748958855, + "learning_rate": 4.783281733746131e-05, + "loss": 0.4955, + "step": 310 + }, + { + "epoch": 0.28889921040408734, + "grad_norm": 0.3538296738454953, + "learning_rate": 4.798761609907121e-05, + "loss": 0.4528, + "step": 311 + }, + { + "epoch": 0.2898281467719461, + "grad_norm": 0.35013935319870626, + "learning_rate": 4.8142414860681115e-05, + "loss": 0.4896, + "step": 312 + }, + { + "epoch": 0.2907570831398049, + "grad_norm": 0.3919444872765707, + "learning_rate": 4.829721362229103e-05, + "loss": 0.434, + "step": 313 + }, + { + "epoch": 0.2916860195076637, + "grad_norm": 0.2873775450384677, + "learning_rate": 4.845201238390093e-05, + "loss": 0.43, + "step": 314 + }, + { + "epoch": 0.29261495587552255, + "grad_norm": 0.43278899104997853, + "learning_rate": 4.860681114551084e-05, + "loss": 0.459, + "step": 315 + }, + { + "epoch": 0.29354389224338134, + "grad_norm": 0.29762624151628303, + "learning_rate": 4.876160990712075e-05, + "loss": 0.4381, + "step": 316 + }, + { + "epoch": 0.29447282861124013, + "grad_norm": 0.38552936527851245, + "learning_rate": 4.891640866873065e-05, + "loss": 0.4488, + "step": 317 + }, + { + "epoch": 0.2954017649790989, + "grad_norm": 0.3262364914608491, + "learning_rate": 4.907120743034056e-05, + "loss": 0.4745, + "step": 318 + }, + { + "epoch": 0.2963307013469577, + "grad_norm": 0.3368476064425339, + "learning_rate": 4.922600619195047e-05, + "loss": 0.4588, + "step": 319 + }, + { + "epoch": 0.29725963771481656, + "grad_norm": 0.3312500891089113, + "learning_rate": 4.9380804953560375e-05, + "loss": 0.4794, + "step": 320 + }, + { + "epoch": 0.29818857408267535, + "grad_norm": 0.3390444459850631, + "learning_rate": 4.953560371517028e-05, + "loss": 0.4604, + "step": 321 + }, + { + "epoch": 0.29911751045053414, + "grad_norm": 0.3387119384494132, + "learning_rate": 4.969040247678019e-05, + "loss": 0.4602, + "step": 322 + }, + { + "epoch": 0.30004644681839293, + "grad_norm": 0.37541797983485836, + "learning_rate": 4.9845201238390096e-05, + "loss": 0.4857, + "step": 323 + }, + { + "epoch": 0.3009753831862517, + "grad_norm": 0.36204268391898264, + "learning_rate": 5e-05, + "loss": 0.4487, + "step": 324 + }, + { + "epoch": 0.30190431955411057, + "grad_norm": 0.5069674260137537, + "learning_rate": 4.998278829604131e-05, + "loss": 0.4795, + "step": 325 + }, + { + "epoch": 0.30283325592196936, + "grad_norm": 0.31607540585629734, + "learning_rate": 4.996557659208262e-05, + "loss": 0.459, + "step": 326 + }, + { + "epoch": 0.30376219228982815, + "grad_norm": 0.5375774496764807, + "learning_rate": 4.9948364888123924e-05, + "loss": 0.465, + "step": 327 + }, + { + "epoch": 0.30469112865768694, + "grad_norm": 0.34299271466845627, + "learning_rate": 4.9931153184165236e-05, + "loss": 0.4693, + "step": 328 + }, + { + "epoch": 0.30562006502554573, + "grad_norm": 0.4143539281353358, + "learning_rate": 4.991394148020654e-05, + "loss": 0.4606, + "step": 329 + }, + { + "epoch": 0.3065490013934046, + "grad_norm": 0.4855004584252198, + "learning_rate": 4.989672977624785e-05, + "loss": 0.4783, + "step": 330 + }, + { + "epoch": 0.30747793776126336, + "grad_norm": 0.3423755850607668, + "learning_rate": 4.987951807228916e-05, + "loss": 0.4444, + "step": 331 + }, + { + "epoch": 0.30840687412912215, + "grad_norm": 0.3583846261043207, + "learning_rate": 4.986230636833047e-05, + "loss": 0.473, + "step": 332 + }, + { + "epoch": 0.30933581049698095, + "grad_norm": 0.3138663780309369, + "learning_rate": 4.9845094664371775e-05, + "loss": 0.4353, + "step": 333 + }, + { + "epoch": 0.31026474686483974, + "grad_norm": 0.3232503486528393, + "learning_rate": 4.9827882960413086e-05, + "loss": 0.4662, + "step": 334 + }, + { + "epoch": 0.3111936832326986, + "grad_norm": 0.33077321915284624, + "learning_rate": 4.981067125645439e-05, + "loss": 0.4389, + "step": 335 + }, + { + "epoch": 0.31212261960055737, + "grad_norm": 0.32940453207532444, + "learning_rate": 4.9793459552495696e-05, + "loss": 0.4549, + "step": 336 + }, + { + "epoch": 0.31305155596841616, + "grad_norm": 0.3489785623343884, + "learning_rate": 4.977624784853701e-05, + "loss": 0.4519, + "step": 337 + }, + { + "epoch": 0.31398049233627495, + "grad_norm": 0.3380056249650716, + "learning_rate": 4.975903614457831e-05, + "loss": 0.4501, + "step": 338 + }, + { + "epoch": 0.31490942870413374, + "grad_norm": 0.39621724081434734, + "learning_rate": 4.9741824440619625e-05, + "loss": 0.4886, + "step": 339 + }, + { + "epoch": 0.3158383650719926, + "grad_norm": 0.34090797197414546, + "learning_rate": 4.972461273666093e-05, + "loss": 0.4738, + "step": 340 + }, + { + "epoch": 0.3167673014398514, + "grad_norm": 0.35008163528496755, + "learning_rate": 4.970740103270224e-05, + "loss": 0.4533, + "step": 341 + }, + { + "epoch": 0.31769623780771017, + "grad_norm": 0.35189206816774354, + "learning_rate": 4.969018932874355e-05, + "loss": 0.4286, + "step": 342 + }, + { + "epoch": 0.31862517417556896, + "grad_norm": 0.3640074292494907, + "learning_rate": 4.967297762478486e-05, + "loss": 0.4412, + "step": 343 + }, + { + "epoch": 0.31955411054342775, + "grad_norm": 0.3532873296836122, + "learning_rate": 4.9655765920826164e-05, + "loss": 0.4686, + "step": 344 + }, + { + "epoch": 0.3204830469112866, + "grad_norm": 0.37025094985949847, + "learning_rate": 4.9638554216867475e-05, + "loss": 0.4338, + "step": 345 + }, + { + "epoch": 0.3214119832791454, + "grad_norm": 0.33462244732386653, + "learning_rate": 4.962134251290878e-05, + "loss": 0.4392, + "step": 346 + }, + { + "epoch": 0.3223409196470042, + "grad_norm": 0.3820555101933481, + "learning_rate": 4.960413080895009e-05, + "loss": 0.4898, + "step": 347 + }, + { + "epoch": 0.32326985601486297, + "grad_norm": 0.35860399216197664, + "learning_rate": 4.95869191049914e-05, + "loss": 0.4876, + "step": 348 + }, + { + "epoch": 0.32419879238272176, + "grad_norm": 0.3419630528737662, + "learning_rate": 4.95697074010327e-05, + "loss": 0.4698, + "step": 349 + }, + { + "epoch": 0.3251277287505806, + "grad_norm": 0.35528988609754913, + "learning_rate": 4.9552495697074014e-05, + "loss": 0.4275, + "step": 350 + }, + { + "epoch": 0.3260566651184394, + "grad_norm": 0.38379201396008805, + "learning_rate": 4.953528399311532e-05, + "loss": 0.4445, + "step": 351 + }, + { + "epoch": 0.3269856014862982, + "grad_norm": 0.3398302347977464, + "learning_rate": 4.951807228915663e-05, + "loss": 0.4554, + "step": 352 + }, + { + "epoch": 0.327914537854157, + "grad_norm": 0.3897934147080159, + "learning_rate": 4.9500860585197936e-05, + "loss": 0.4396, + "step": 353 + }, + { + "epoch": 0.3288434742220158, + "grad_norm": 0.3704282311872543, + "learning_rate": 4.948364888123925e-05, + "loss": 0.4476, + "step": 354 + }, + { + "epoch": 0.3297724105898746, + "grad_norm": 0.3792694514662743, + "learning_rate": 4.946643717728055e-05, + "loss": 0.4153, + "step": 355 + }, + { + "epoch": 0.3307013469577334, + "grad_norm": 0.3900734100570611, + "learning_rate": 4.944922547332186e-05, + "loss": 0.4347, + "step": 356 + }, + { + "epoch": 0.3316302833255922, + "grad_norm": 0.3409495630328528, + "learning_rate": 4.943201376936317e-05, + "loss": 0.4242, + "step": 357 + }, + { + "epoch": 0.332559219693451, + "grad_norm": 0.3661404637247587, + "learning_rate": 4.9414802065404474e-05, + "loss": 0.4426, + "step": 358 + }, + { + "epoch": 0.3334881560613098, + "grad_norm": 0.4272666657937548, + "learning_rate": 4.9397590361445786e-05, + "loss": 0.4468, + "step": 359 + }, + { + "epoch": 0.3344170924291686, + "grad_norm": 0.2754985787418611, + "learning_rate": 4.938037865748709e-05, + "loss": 0.4187, + "step": 360 + }, + { + "epoch": 0.3353460287970274, + "grad_norm": 0.38303057883878117, + "learning_rate": 4.93631669535284e-05, + "loss": 0.4594, + "step": 361 + }, + { + "epoch": 0.3362749651648862, + "grad_norm": 0.3015558469101507, + "learning_rate": 4.934595524956971e-05, + "loss": 0.4337, + "step": 362 + }, + { + "epoch": 0.337203901532745, + "grad_norm": 0.36850129972653306, + "learning_rate": 4.932874354561102e-05, + "loss": 0.4666, + "step": 363 + }, + { + "epoch": 0.33813283790060383, + "grad_norm": 0.2884585481636936, + "learning_rate": 4.9311531841652325e-05, + "loss": 0.4437, + "step": 364 + }, + { + "epoch": 0.3390617742684626, + "grad_norm": 0.3163751089768913, + "learning_rate": 4.929432013769364e-05, + "loss": 0.4591, + "step": 365 + }, + { + "epoch": 0.3399907106363214, + "grad_norm": 0.28834325339704836, + "learning_rate": 4.927710843373494e-05, + "loss": 0.4339, + "step": 366 + }, + { + "epoch": 0.3409196470041802, + "grad_norm": 0.3089186351649892, + "learning_rate": 4.9259896729776253e-05, + "loss": 0.4413, + "step": 367 + }, + { + "epoch": 0.341848583372039, + "grad_norm": 0.35241634143696166, + "learning_rate": 4.924268502581756e-05, + "loss": 0.4571, + "step": 368 + }, + { + "epoch": 0.34277751973989784, + "grad_norm": 0.3101976948447532, + "learning_rate": 4.922547332185887e-05, + "loss": 0.4558, + "step": 369 + }, + { + "epoch": 0.34370645610775663, + "grad_norm": 0.35319592477754186, + "learning_rate": 4.9208261617900175e-05, + "loss": 0.4429, + "step": 370 + }, + { + "epoch": 0.3446353924756154, + "grad_norm": 0.3322284047796149, + "learning_rate": 4.919104991394149e-05, + "loss": 0.4524, + "step": 371 + }, + { + "epoch": 0.3455643288434742, + "grad_norm": 0.37262088448309033, + "learning_rate": 4.917383820998279e-05, + "loss": 0.4325, + "step": 372 + }, + { + "epoch": 0.346493265211333, + "grad_norm": 0.336028619967502, + "learning_rate": 4.9156626506024104e-05, + "loss": 0.4488, + "step": 373 + }, + { + "epoch": 0.34742220157919185, + "grad_norm": 0.37466782782498514, + "learning_rate": 4.913941480206541e-05, + "loss": 0.458, + "step": 374 + }, + { + "epoch": 0.34835113794705064, + "grad_norm": 0.3083157732259951, + "learning_rate": 4.9122203098106714e-05, + "loss": 0.4522, + "step": 375 + }, + { + "epoch": 0.34928007431490943, + "grad_norm": 0.3696409621615052, + "learning_rate": 4.9104991394148026e-05, + "loss": 0.4769, + "step": 376 + }, + { + "epoch": 0.3502090106827682, + "grad_norm": 0.37589325200592977, + "learning_rate": 4.908777969018933e-05, + "loss": 0.4535, + "step": 377 + }, + { + "epoch": 0.351137947050627, + "grad_norm": 0.290205612168557, + "learning_rate": 4.9070567986230636e-05, + "loss": 0.4669, + "step": 378 + }, + { + "epoch": 0.35206688341848585, + "grad_norm": 0.37740647897617646, + "learning_rate": 4.905335628227195e-05, + "loss": 0.4384, + "step": 379 + }, + { + "epoch": 0.35299581978634464, + "grad_norm": 0.28374091712444127, + "learning_rate": 4.903614457831325e-05, + "loss": 0.4437, + "step": 380 + }, + { + "epoch": 0.35392475615420343, + "grad_norm": 0.3555291712519551, + "learning_rate": 4.9018932874354564e-05, + "loss": 0.4437, + "step": 381 + }, + { + "epoch": 0.3548536925220622, + "grad_norm": 0.3021908819889071, + "learning_rate": 4.900172117039587e-05, + "loss": 0.4529, + "step": 382 + }, + { + "epoch": 0.355782628889921, + "grad_norm": 0.28070022204204287, + "learning_rate": 4.898450946643718e-05, + "loss": 0.4445, + "step": 383 + }, + { + "epoch": 0.35671156525777986, + "grad_norm": 0.34406702126509014, + "learning_rate": 4.8967297762478486e-05, + "loss": 0.4532, + "step": 384 + }, + { + "epoch": 0.35764050162563865, + "grad_norm": 0.2888201625795591, + "learning_rate": 4.89500860585198e-05, + "loss": 0.4636, + "step": 385 + }, + { + "epoch": 0.35856943799349744, + "grad_norm": 0.3598719298462701, + "learning_rate": 4.89328743545611e-05, + "loss": 0.4402, + "step": 386 + }, + { + "epoch": 0.35949837436135623, + "grad_norm": 0.2895073326465502, + "learning_rate": 4.8915662650602415e-05, + "loss": 0.4625, + "step": 387 + }, + { + "epoch": 0.360427310729215, + "grad_norm": 0.323679285493869, + "learning_rate": 4.889845094664372e-05, + "loss": 0.4253, + "step": 388 + }, + { + "epoch": 0.36135624709707387, + "grad_norm": 0.32852184199735984, + "learning_rate": 4.888123924268503e-05, + "loss": 0.4404, + "step": 389 + }, + { + "epoch": 0.36228518346493266, + "grad_norm": 0.3825310829770502, + "learning_rate": 4.8864027538726336e-05, + "loss": 0.4324, + "step": 390 + }, + { + "epoch": 0.36321411983279145, + "grad_norm": 0.3147280199593259, + "learning_rate": 4.884681583476765e-05, + "loss": 0.4576, + "step": 391 + }, + { + "epoch": 0.36414305620065024, + "grad_norm": 0.33189632998575647, + "learning_rate": 4.882960413080895e-05, + "loss": 0.4508, + "step": 392 + }, + { + "epoch": 0.36507199256850903, + "grad_norm": 0.31161211897471663, + "learning_rate": 4.8812392426850265e-05, + "loss": 0.433, + "step": 393 + }, + { + "epoch": 0.3660009289363679, + "grad_norm": 0.29323440533256345, + "learning_rate": 4.879518072289157e-05, + "loss": 0.4426, + "step": 394 + }, + { + "epoch": 0.36692986530422667, + "grad_norm": 0.3044769147297849, + "learning_rate": 4.877796901893288e-05, + "loss": 0.4383, + "step": 395 + }, + { + "epoch": 0.36785880167208546, + "grad_norm": 0.33105623962975367, + "learning_rate": 4.876075731497419e-05, + "loss": 0.4511, + "step": 396 + }, + { + "epoch": 0.36878773803994425, + "grad_norm": 0.2926145124782726, + "learning_rate": 4.87435456110155e-05, + "loss": 0.4494, + "step": 397 + }, + { + "epoch": 0.3697166744078031, + "grad_norm": 0.2785219519909291, + "learning_rate": 4.8726333907056804e-05, + "loss": 0.4193, + "step": 398 + }, + { + "epoch": 0.3706456107756619, + "grad_norm": 0.3190152822347888, + "learning_rate": 4.870912220309811e-05, + "loss": 0.4348, + "step": 399 + }, + { + "epoch": 0.3715745471435207, + "grad_norm": 0.2944642128721377, + "learning_rate": 4.8691910499139414e-05, + "loss": 0.4413, + "step": 400 + }, + { + "epoch": 0.37250348351137946, + "grad_norm": 0.2874818565004639, + "learning_rate": 4.8674698795180725e-05, + "loss": 0.4379, + "step": 401 + }, + { + "epoch": 0.37343241987923825, + "grad_norm": 0.27777508008099533, + "learning_rate": 4.865748709122203e-05, + "loss": 0.4241, + "step": 402 + }, + { + "epoch": 0.3743613562470971, + "grad_norm": 0.32978399945466425, + "learning_rate": 4.864027538726334e-05, + "loss": 0.447, + "step": 403 + }, + { + "epoch": 0.3752902926149559, + "grad_norm": 0.3094967262792066, + "learning_rate": 4.862306368330465e-05, + "loss": 0.4466, + "step": 404 + }, + { + "epoch": 0.3762192289828147, + "grad_norm": 0.29574208650459116, + "learning_rate": 4.860585197934596e-05, + "loss": 0.436, + "step": 405 + }, + { + "epoch": 0.37714816535067347, + "grad_norm": 0.288270087489997, + "learning_rate": 4.8588640275387264e-05, + "loss": 0.4441, + "step": 406 + }, + { + "epoch": 0.37807710171853226, + "grad_norm": 0.30162172795880676, + "learning_rate": 4.8571428571428576e-05, + "loss": 0.4145, + "step": 407 + }, + { + "epoch": 0.3790060380863911, + "grad_norm": 0.2618604647255097, + "learning_rate": 4.855421686746988e-05, + "loss": 0.4162, + "step": 408 + }, + { + "epoch": 0.3799349744542499, + "grad_norm": 0.30851370446236615, + "learning_rate": 4.853700516351119e-05, + "loss": 0.4224, + "step": 409 + }, + { + "epoch": 0.3808639108221087, + "grad_norm": 0.3006905343379606, + "learning_rate": 4.85197934595525e-05, + "loss": 0.4879, + "step": 410 + }, + { + "epoch": 0.3817928471899675, + "grad_norm": 0.28307585696243226, + "learning_rate": 4.850258175559381e-05, + "loss": 0.4068, + "step": 411 + }, + { + "epoch": 0.38272178355782627, + "grad_norm": 0.26102123895109014, + "learning_rate": 4.8485370051635114e-05, + "loss": 0.428, + "step": 412 + }, + { + "epoch": 0.3836507199256851, + "grad_norm": 0.26532852818511954, + "learning_rate": 4.8468158347676426e-05, + "loss": 0.4356, + "step": 413 + }, + { + "epoch": 0.3845796562935439, + "grad_norm": 0.27713412833380047, + "learning_rate": 4.845094664371773e-05, + "loss": 0.4166, + "step": 414 + }, + { + "epoch": 0.3855085926614027, + "grad_norm": 0.2721011127848934, + "learning_rate": 4.843373493975904e-05, + "loss": 0.4588, + "step": 415 + }, + { + "epoch": 0.3864375290292615, + "grad_norm": 0.270987247231762, + "learning_rate": 4.841652323580035e-05, + "loss": 0.4418, + "step": 416 + }, + { + "epoch": 0.3873664653971203, + "grad_norm": 0.2815915571880338, + "learning_rate": 4.839931153184166e-05, + "loss": 0.438, + "step": 417 + }, + { + "epoch": 0.3882954017649791, + "grad_norm": 0.2900554851874775, + "learning_rate": 4.8382099827882965e-05, + "loss": 0.4495, + "step": 418 + }, + { + "epoch": 0.3892243381328379, + "grad_norm": 0.3334842549517787, + "learning_rate": 4.836488812392428e-05, + "loss": 0.4646, + "step": 419 + }, + { + "epoch": 0.3901532745006967, + "grad_norm": 0.24007212579976095, + "learning_rate": 4.8347676419965575e-05, + "loss": 0.4238, + "step": 420 + }, + { + "epoch": 0.3910822108685555, + "grad_norm": 0.3077325041506196, + "learning_rate": 4.833046471600689e-05, + "loss": 0.4447, + "step": 421 + }, + { + "epoch": 0.3920111472364143, + "grad_norm": 0.2863122225447303, + "learning_rate": 4.831325301204819e-05, + "loss": 0.4365, + "step": 422 + }, + { + "epoch": 0.3929400836042731, + "grad_norm": 0.30580235165952646, + "learning_rate": 4.8296041308089504e-05, + "loss": 0.4453, + "step": 423 + }, + { + "epoch": 0.3938690199721319, + "grad_norm": 0.26472608629055344, + "learning_rate": 4.827882960413081e-05, + "loss": 0.445, + "step": 424 + }, + { + "epoch": 0.3947979563399907, + "grad_norm": 0.30850387581287186, + "learning_rate": 4.826161790017212e-05, + "loss": 0.4459, + "step": 425 + }, + { + "epoch": 0.3957268927078495, + "grad_norm": 0.32129564773476693, + "learning_rate": 4.8244406196213425e-05, + "loss": 0.4302, + "step": 426 + }, + { + "epoch": 0.3966558290757083, + "grad_norm": 0.2856821353622809, + "learning_rate": 4.822719449225474e-05, + "loss": 0.4366, + "step": 427 + }, + { + "epoch": 0.39758476544356713, + "grad_norm": 0.27875816280973154, + "learning_rate": 4.820998278829604e-05, + "loss": 0.4177, + "step": 428 + }, + { + "epoch": 0.3985137018114259, + "grad_norm": 0.375391127760771, + "learning_rate": 4.8192771084337354e-05, + "loss": 0.4549, + "step": 429 + }, + { + "epoch": 0.3994426381792847, + "grad_norm": 0.2919547832552509, + "learning_rate": 4.817555938037866e-05, + "loss": 0.4377, + "step": 430 + }, + { + "epoch": 0.4003715745471435, + "grad_norm": 0.35769067629140655, + "learning_rate": 4.815834767641997e-05, + "loss": 0.4106, + "step": 431 + }, + { + "epoch": 0.4013005109150023, + "grad_norm": 0.3169078733636773, + "learning_rate": 4.8141135972461276e-05, + "loss": 0.4344, + "step": 432 + }, + { + "epoch": 0.40222944728286114, + "grad_norm": 0.38647441373213326, + "learning_rate": 4.812392426850259e-05, + "loss": 0.4662, + "step": 433 + }, + { + "epoch": 0.40315838365071993, + "grad_norm": 0.3093808502182461, + "learning_rate": 4.810671256454389e-05, + "loss": 0.4268, + "step": 434 + }, + { + "epoch": 0.4040873200185787, + "grad_norm": 0.3428360410015756, + "learning_rate": 4.8089500860585204e-05, + "loss": 0.4248, + "step": 435 + }, + { + "epoch": 0.4050162563864375, + "grad_norm": 0.39226349195208254, + "learning_rate": 4.807228915662651e-05, + "loss": 0.4828, + "step": 436 + }, + { + "epoch": 0.40594519275429636, + "grad_norm": 0.3124911712167451, + "learning_rate": 4.805507745266782e-05, + "loss": 0.42, + "step": 437 + }, + { + "epoch": 0.40687412912215515, + "grad_norm": 0.3325357418649828, + "learning_rate": 4.8037865748709126e-05, + "loss": 0.4183, + "step": 438 + }, + { + "epoch": 0.40780306549001394, + "grad_norm": 0.2967793995799997, + "learning_rate": 4.802065404475044e-05, + "loss": 0.4522, + "step": 439 + }, + { + "epoch": 0.40873200185787273, + "grad_norm": 0.3542440897290239, + "learning_rate": 4.800344234079174e-05, + "loss": 0.4463, + "step": 440 + }, + { + "epoch": 0.4096609382257315, + "grad_norm": 0.307305266388483, + "learning_rate": 4.798623063683305e-05, + "loss": 0.4444, + "step": 441 + }, + { + "epoch": 0.41058987459359036, + "grad_norm": 0.3009613722666484, + "learning_rate": 4.796901893287435e-05, + "loss": 0.42, + "step": 442 + }, + { + "epoch": 0.41151881096144916, + "grad_norm": 0.2942089986687925, + "learning_rate": 4.7951807228915665e-05, + "loss": 0.4408, + "step": 443 + }, + { + "epoch": 0.41244774732930795, + "grad_norm": 0.25994860698808103, + "learning_rate": 4.793459552495697e-05, + "loss": 0.422, + "step": 444 + }, + { + "epoch": 0.41337668369716674, + "grad_norm": 0.30165826806753704, + "learning_rate": 4.791738382099828e-05, + "loss": 0.4433, + "step": 445 + }, + { + "epoch": 0.4143056200650255, + "grad_norm": 0.2591476595221208, + "learning_rate": 4.7900172117039587e-05, + "loss": 0.4035, + "step": 446 + }, + { + "epoch": 0.41523455643288437, + "grad_norm": 0.31443538659749376, + "learning_rate": 4.78829604130809e-05, + "loss": 0.4451, + "step": 447 + }, + { + "epoch": 0.41616349280074316, + "grad_norm": 0.2845675404201316, + "learning_rate": 4.78657487091222e-05, + "loss": 0.4499, + "step": 448 + }, + { + "epoch": 0.41709242916860195, + "grad_norm": 0.3082724543796442, + "learning_rate": 4.7848537005163515e-05, + "loss": 0.4567, + "step": 449 + }, + { + "epoch": 0.41802136553646074, + "grad_norm": 0.2886233884056253, + "learning_rate": 4.783132530120482e-05, + "loss": 0.4351, + "step": 450 + }, + { + "epoch": 0.41895030190431953, + "grad_norm": 0.32824185907530123, + "learning_rate": 4.781411359724613e-05, + "loss": 0.4343, + "step": 451 + }, + { + "epoch": 0.4198792382721784, + "grad_norm": 0.3045951917878721, + "learning_rate": 4.779690189328744e-05, + "loss": 0.4233, + "step": 452 + }, + { + "epoch": 0.42080817464003717, + "grad_norm": 0.30065030364940504, + "learning_rate": 4.777969018932875e-05, + "loss": 0.4449, + "step": 453 + }, + { + "epoch": 0.42173711100789596, + "grad_norm": 0.28194914295465456, + "learning_rate": 4.7762478485370054e-05, + "loss": 0.4216, + "step": 454 + }, + { + "epoch": 0.42266604737575475, + "grad_norm": 0.358326292324705, + "learning_rate": 4.7745266781411366e-05, + "loss": 0.4545, + "step": 455 + }, + { + "epoch": 0.42359498374361354, + "grad_norm": 0.32509662830836533, + "learning_rate": 4.772805507745267e-05, + "loss": 0.4173, + "step": 456 + }, + { + "epoch": 0.4245239201114724, + "grad_norm": 0.33738335206220016, + "learning_rate": 4.771084337349398e-05, + "loss": 0.4433, + "step": 457 + }, + { + "epoch": 0.4254528564793312, + "grad_norm": 0.3607433985711699, + "learning_rate": 4.769363166953529e-05, + "loss": 0.4317, + "step": 458 + }, + { + "epoch": 0.42638179284718997, + "grad_norm": 0.32470986956497677, + "learning_rate": 4.76764199655766e-05, + "loss": 0.4278, + "step": 459 + }, + { + "epoch": 0.42731072921504876, + "grad_norm": 0.31596561843145204, + "learning_rate": 4.7659208261617904e-05, + "loss": 0.4507, + "step": 460 + }, + { + "epoch": 0.42823966558290755, + "grad_norm": 0.3145646642534761, + "learning_rate": 4.7641996557659216e-05, + "loss": 0.4353, + "step": 461 + }, + { + "epoch": 0.4291686019507664, + "grad_norm": 0.28777196903769586, + "learning_rate": 4.7624784853700514e-05, + "loss": 0.4606, + "step": 462 + }, + { + "epoch": 0.4300975383186252, + "grad_norm": 0.34643745463301656, + "learning_rate": 4.7607573149741826e-05, + "loss": 0.4264, + "step": 463 + }, + { + "epoch": 0.431026474686484, + "grad_norm": 0.3022418092258536, + "learning_rate": 4.759036144578313e-05, + "loss": 0.4046, + "step": 464 + }, + { + "epoch": 0.43195541105434276, + "grad_norm": 0.27983510155632235, + "learning_rate": 4.757314974182444e-05, + "loss": 0.4122, + "step": 465 + }, + { + "epoch": 0.43288434742220155, + "grad_norm": 0.28454933014088335, + "learning_rate": 4.755593803786575e-05, + "loss": 0.4558, + "step": 466 + }, + { + "epoch": 0.4338132837900604, + "grad_norm": 0.32254214458961294, + "learning_rate": 4.753872633390706e-05, + "loss": 0.4272, + "step": 467 + }, + { + "epoch": 0.4347422201579192, + "grad_norm": 0.2613178418721715, + "learning_rate": 4.7521514629948365e-05, + "loss": 0.4418, + "step": 468 + }, + { + "epoch": 0.435671156525778, + "grad_norm": 0.3056014331601247, + "learning_rate": 4.7504302925989676e-05, + "loss": 0.421, + "step": 469 + }, + { + "epoch": 0.43660009289363677, + "grad_norm": 0.2695626668562775, + "learning_rate": 4.748709122203098e-05, + "loss": 0.4321, + "step": 470 + }, + { + "epoch": 0.43752902926149556, + "grad_norm": 0.27446080453869376, + "learning_rate": 4.746987951807229e-05, + "loss": 0.4375, + "step": 471 + }, + { + "epoch": 0.4384579656293544, + "grad_norm": 0.29987194356039903, + "learning_rate": 4.74526678141136e-05, + "loss": 0.4436, + "step": 472 + }, + { + "epoch": 0.4393869019972132, + "grad_norm": 0.2973778889764395, + "learning_rate": 4.743545611015491e-05, + "loss": 0.4508, + "step": 473 + }, + { + "epoch": 0.440315838365072, + "grad_norm": 0.28858718688747587, + "learning_rate": 4.7418244406196215e-05, + "loss": 0.4088, + "step": 474 + }, + { + "epoch": 0.4412447747329308, + "grad_norm": 0.2776236267106209, + "learning_rate": 4.740103270223753e-05, + "loss": 0.4472, + "step": 475 + }, + { + "epoch": 0.44217371110078957, + "grad_norm": 0.2673174324156048, + "learning_rate": 4.738382099827883e-05, + "loss": 0.4318, + "step": 476 + }, + { + "epoch": 0.4431026474686484, + "grad_norm": 0.2789675691022897, + "learning_rate": 4.7366609294320144e-05, + "loss": 0.4256, + "step": 477 + }, + { + "epoch": 0.4440315838365072, + "grad_norm": 0.3242561622863543, + "learning_rate": 4.734939759036145e-05, + "loss": 0.4314, + "step": 478 + }, + { + "epoch": 0.444960520204366, + "grad_norm": 0.26245305270546254, + "learning_rate": 4.733218588640276e-05, + "loss": 0.432, + "step": 479 + }, + { + "epoch": 0.4458894565722248, + "grad_norm": 0.2908962068774102, + "learning_rate": 4.7314974182444065e-05, + "loss": 0.4262, + "step": 480 + }, + { + "epoch": 0.44681839294008363, + "grad_norm": 0.27171256702754587, + "learning_rate": 4.729776247848538e-05, + "loss": 0.4352, + "step": 481 + }, + { + "epoch": 0.4477473293079424, + "grad_norm": 0.3640009107002527, + "learning_rate": 4.728055077452668e-05, + "loss": 0.4337, + "step": 482 + }, + { + "epoch": 0.4486762656758012, + "grad_norm": 0.3035811404578588, + "learning_rate": 4.7263339070567994e-05, + "loss": 0.453, + "step": 483 + }, + { + "epoch": 0.44960520204366, + "grad_norm": 0.3635508538266957, + "learning_rate": 4.724612736660929e-05, + "loss": 0.4296, + "step": 484 + }, + { + "epoch": 0.4505341384115188, + "grad_norm": 0.3170039262392314, + "learning_rate": 4.7228915662650604e-05, + "loss": 0.4207, + "step": 485 + }, + { + "epoch": 0.45146307477937764, + "grad_norm": 0.31774438891510837, + "learning_rate": 4.721170395869191e-05, + "loss": 0.4406, + "step": 486 + }, + { + "epoch": 0.45239201114723643, + "grad_norm": 0.34019517987590486, + "learning_rate": 4.719449225473322e-05, + "loss": 0.4902, + "step": 487 + }, + { + "epoch": 0.4533209475150952, + "grad_norm": 0.2793264697380186, + "learning_rate": 4.7177280550774526e-05, + "loss": 0.4454, + "step": 488 + }, + { + "epoch": 0.454249883882954, + "grad_norm": 0.28144894598721126, + "learning_rate": 4.716006884681584e-05, + "loss": 0.4187, + "step": 489 + }, + { + "epoch": 0.4551788202508128, + "grad_norm": 0.30680476206969465, + "learning_rate": 4.714285714285714e-05, + "loss": 0.4458, + "step": 490 + }, + { + "epoch": 0.45610775661867164, + "grad_norm": 0.2761255587228729, + "learning_rate": 4.7125645438898454e-05, + "loss": 0.4141, + "step": 491 + }, + { + "epoch": 0.45703669298653044, + "grad_norm": 0.2747933627980402, + "learning_rate": 4.710843373493976e-05, + "loss": 0.4558, + "step": 492 + }, + { + "epoch": 0.4579656293543892, + "grad_norm": 0.2914395069490274, + "learning_rate": 4.709122203098107e-05, + "loss": 0.4353, + "step": 493 + }, + { + "epoch": 0.458894565722248, + "grad_norm": 0.26839160122030176, + "learning_rate": 4.7074010327022376e-05, + "loss": 0.4131, + "step": 494 + }, + { + "epoch": 0.4598235020901068, + "grad_norm": 0.2822993839047011, + "learning_rate": 4.705679862306369e-05, + "loss": 0.4265, + "step": 495 + }, + { + "epoch": 0.46075243845796565, + "grad_norm": 0.2657294003991585, + "learning_rate": 4.703958691910499e-05, + "loss": 0.4408, + "step": 496 + }, + { + "epoch": 0.46168137482582444, + "grad_norm": 0.29482740291064663, + "learning_rate": 4.7022375215146305e-05, + "loss": 0.4368, + "step": 497 + }, + { + "epoch": 0.46261031119368323, + "grad_norm": 0.26269631652127934, + "learning_rate": 4.700516351118761e-05, + "loss": 0.4313, + "step": 498 + }, + { + "epoch": 0.463539247561542, + "grad_norm": 0.33705615306936615, + "learning_rate": 4.698795180722892e-05, + "loss": 0.4529, + "step": 499 + }, + { + "epoch": 0.4644681839294008, + "grad_norm": 0.24499796930149098, + "learning_rate": 4.6970740103270227e-05, + "loss": 0.4385, + "step": 500 + }, + { + "epoch": 0.46539712029725966, + "grad_norm": 0.30948537181680397, + "learning_rate": 4.695352839931154e-05, + "loss": 0.4449, + "step": 501 + }, + { + "epoch": 0.46632605666511845, + "grad_norm": 0.29180609911144206, + "learning_rate": 4.6936316695352843e-05, + "loss": 0.4352, + "step": 502 + }, + { + "epoch": 0.46725499303297724, + "grad_norm": 0.2891708931616031, + "learning_rate": 4.6919104991394155e-05, + "loss": 0.439, + "step": 503 + }, + { + "epoch": 0.46818392940083603, + "grad_norm": 0.2686671320742356, + "learning_rate": 4.690189328743546e-05, + "loss": 0.4306, + "step": 504 + }, + { + "epoch": 0.4691128657686948, + "grad_norm": 0.30780838224951595, + "learning_rate": 4.6884681583476765e-05, + "loss": 0.431, + "step": 505 + }, + { + "epoch": 0.47004180213655367, + "grad_norm": 0.25568604631287883, + "learning_rate": 4.686746987951807e-05, + "loss": 0.458, + "step": 506 + }, + { + "epoch": 0.47097073850441246, + "grad_norm": 0.37480261887391536, + "learning_rate": 4.685025817555938e-05, + "loss": 0.4343, + "step": 507 + }, + { + "epoch": 0.47189967487227125, + "grad_norm": 0.2552649038864197, + "learning_rate": 4.683304647160069e-05, + "loss": 0.3908, + "step": 508 + }, + { + "epoch": 0.47282861124013004, + "grad_norm": 0.35376195689446877, + "learning_rate": 4.6815834767642e-05, + "loss": 0.4491, + "step": 509 + }, + { + "epoch": 0.4737575476079888, + "grad_norm": 0.3016203905265542, + "learning_rate": 4.6798623063683304e-05, + "loss": 0.4291, + "step": 510 + }, + { + "epoch": 0.4746864839758477, + "grad_norm": 0.29055972391783796, + "learning_rate": 4.6781411359724616e-05, + "loss": 0.4195, + "step": 511 + }, + { + "epoch": 0.47561542034370646, + "grad_norm": 0.3190347010255864, + "learning_rate": 4.676419965576592e-05, + "loss": 0.4108, + "step": 512 + }, + { + "epoch": 0.47654435671156525, + "grad_norm": 0.3294349955740729, + "learning_rate": 4.674698795180723e-05, + "loss": 0.4693, + "step": 513 + }, + { + "epoch": 0.47747329307942404, + "grad_norm": 0.31781079968735304, + "learning_rate": 4.672977624784854e-05, + "loss": 0.4435, + "step": 514 + }, + { + "epoch": 0.47840222944728283, + "grad_norm": 0.3226175038410287, + "learning_rate": 4.671256454388985e-05, + "loss": 0.4335, + "step": 515 + }, + { + "epoch": 0.4793311658151417, + "grad_norm": 0.2637433153066965, + "learning_rate": 4.6695352839931154e-05, + "loss": 0.4415, + "step": 516 + }, + { + "epoch": 0.48026010218300047, + "grad_norm": 0.3003332421321039, + "learning_rate": 4.6678141135972466e-05, + "loss": 0.4202, + "step": 517 + }, + { + "epoch": 0.48118903855085926, + "grad_norm": 0.37384718084964086, + "learning_rate": 4.666092943201377e-05, + "loss": 0.4603, + "step": 518 + }, + { + "epoch": 0.48211797491871805, + "grad_norm": 0.2715471956417477, + "learning_rate": 4.664371772805508e-05, + "loss": 0.4507, + "step": 519 + }, + { + "epoch": 0.4830469112865769, + "grad_norm": 0.262436801625057, + "learning_rate": 4.662650602409639e-05, + "loss": 0.4209, + "step": 520 + }, + { + "epoch": 0.4839758476544357, + "grad_norm": 0.3007149616543295, + "learning_rate": 4.66092943201377e-05, + "loss": 0.4493, + "step": 521 + }, + { + "epoch": 0.4849047840222945, + "grad_norm": 0.2804336311294239, + "learning_rate": 4.6592082616179005e-05, + "loss": 0.4286, + "step": 522 + }, + { + "epoch": 0.48583372039015327, + "grad_norm": 0.26330683566772206, + "learning_rate": 4.6574870912220316e-05, + "loss": 0.4202, + "step": 523 + }, + { + "epoch": 0.48676265675801206, + "grad_norm": 0.286309727815638, + "learning_rate": 4.655765920826162e-05, + "loss": 0.4301, + "step": 524 + }, + { + "epoch": 0.4876915931258709, + "grad_norm": 0.2529965819385294, + "learning_rate": 4.654044750430293e-05, + "loss": 0.4325, + "step": 525 + }, + { + "epoch": 0.4886205294937297, + "grad_norm": 0.3253005644243001, + "learning_rate": 4.652323580034423e-05, + "loss": 0.42, + "step": 526 + }, + { + "epoch": 0.4895494658615885, + "grad_norm": 0.2658924013835058, + "learning_rate": 4.650602409638554e-05, + "loss": 0.4193, + "step": 527 + }, + { + "epoch": 0.4904784022294473, + "grad_norm": 0.27995881407155443, + "learning_rate": 4.648881239242685e-05, + "loss": 0.4216, + "step": 528 + }, + { + "epoch": 0.49140733859730606, + "grad_norm": 0.3025444675822039, + "learning_rate": 4.647160068846816e-05, + "loss": 0.4621, + "step": 529 + }, + { + "epoch": 0.4923362749651649, + "grad_norm": 0.2737570425581585, + "learning_rate": 4.6454388984509465e-05, + "loss": 0.4324, + "step": 530 + }, + { + "epoch": 0.4932652113330237, + "grad_norm": 0.2582930487746826, + "learning_rate": 4.643717728055078e-05, + "loss": 0.4423, + "step": 531 + }, + { + "epoch": 0.4941941477008825, + "grad_norm": 0.28559852630894345, + "learning_rate": 4.641996557659208e-05, + "loss": 0.4477, + "step": 532 + }, + { + "epoch": 0.4951230840687413, + "grad_norm": 0.29483695154838563, + "learning_rate": 4.6402753872633394e-05, + "loss": 0.4156, + "step": 533 + }, + { + "epoch": 0.49605202043660007, + "grad_norm": 0.2613379393803091, + "learning_rate": 4.63855421686747e-05, + "loss": 0.432, + "step": 534 + }, + { + "epoch": 0.4969809568044589, + "grad_norm": 0.22622325752066913, + "learning_rate": 4.636833046471601e-05, + "loss": 0.4297, + "step": 535 + }, + { + "epoch": 0.4979098931723177, + "grad_norm": 0.28353494710194255, + "learning_rate": 4.6351118760757315e-05, + "loss": 0.4533, + "step": 536 + }, + { + "epoch": 0.4988388295401765, + "grad_norm": 0.31712551450474613, + "learning_rate": 4.633390705679863e-05, + "loss": 0.4667, + "step": 537 + }, + { + "epoch": 0.4997677659080353, + "grad_norm": 0.25690414290083796, + "learning_rate": 4.631669535283993e-05, + "loss": 0.434, + "step": 538 + }, + { + "epoch": 0.5006967022758941, + "grad_norm": 0.26005133208971615, + "learning_rate": 4.6299483648881244e-05, + "loss": 0.4385, + "step": 539 + }, + { + "epoch": 0.5016256386437529, + "grad_norm": 0.34410172574131104, + "learning_rate": 4.628227194492255e-05, + "loss": 0.4508, + "step": 540 + }, + { + "epoch": 0.5025545750116117, + "grad_norm": 0.25043115978998853, + "learning_rate": 4.626506024096386e-05, + "loss": 0.4098, + "step": 541 + }, + { + "epoch": 0.5034835113794706, + "grad_norm": 0.3104609566884535, + "learning_rate": 4.6247848537005166e-05, + "loss": 0.4305, + "step": 542 + }, + { + "epoch": 0.5044124477473293, + "grad_norm": 0.293951951581098, + "learning_rate": 4.623063683304648e-05, + "loss": 0.4461, + "step": 543 + }, + { + "epoch": 0.5053413841151881, + "grad_norm": 0.2560108568907451, + "learning_rate": 4.621342512908778e-05, + "loss": 0.4355, + "step": 544 + }, + { + "epoch": 0.5062703204830469, + "grad_norm": 0.358412433789302, + "learning_rate": 4.6196213425129094e-05, + "loss": 0.4425, + "step": 545 + }, + { + "epoch": 0.5071992568509057, + "grad_norm": 0.3056935351314926, + "learning_rate": 4.61790017211704e-05, + "loss": 0.4428, + "step": 546 + }, + { + "epoch": 0.5081281932187646, + "grad_norm": 0.24667559189967134, + "learning_rate": 4.6161790017211704e-05, + "loss": 0.3957, + "step": 547 + }, + { + "epoch": 0.5090571295866233, + "grad_norm": 0.33367259063635524, + "learning_rate": 4.614457831325301e-05, + "loss": 0.4355, + "step": 548 + }, + { + "epoch": 0.5099860659544821, + "grad_norm": 0.27853496333383976, + "learning_rate": 4.612736660929432e-05, + "loss": 0.4126, + "step": 549 + }, + { + "epoch": 0.5109150023223409, + "grad_norm": 0.268125448502846, + "learning_rate": 4.6110154905335626e-05, + "loss": 0.4303, + "step": 550 + }, + { + "epoch": 0.5118439386901997, + "grad_norm": 0.3441204665083962, + "learning_rate": 4.609294320137694e-05, + "loss": 0.449, + "step": 551 + }, + { + "epoch": 0.5127728750580586, + "grad_norm": 0.24236240929630035, + "learning_rate": 4.607573149741824e-05, + "loss": 0.414, + "step": 552 + }, + { + "epoch": 0.5137018114259173, + "grad_norm": 0.3015664010797303, + "learning_rate": 4.6058519793459555e-05, + "loss": 0.4204, + "step": 553 + }, + { + "epoch": 0.5146307477937762, + "grad_norm": 0.25242151768392607, + "learning_rate": 4.604130808950086e-05, + "loss": 0.4273, + "step": 554 + }, + { + "epoch": 0.5155596841616349, + "grad_norm": 0.30744068298559074, + "learning_rate": 4.602409638554217e-05, + "loss": 0.4508, + "step": 555 + }, + { + "epoch": 0.5164886205294937, + "grad_norm": 0.30718731865640203, + "learning_rate": 4.600688468158348e-05, + "loss": 0.4474, + "step": 556 + }, + { + "epoch": 0.5174175568973526, + "grad_norm": 0.29846939639927766, + "learning_rate": 4.598967297762479e-05, + "loss": 0.4302, + "step": 557 + }, + { + "epoch": 0.5183464932652113, + "grad_norm": 0.29336388381608647, + "learning_rate": 4.5972461273666093e-05, + "loss": 0.4227, + "step": 558 + }, + { + "epoch": 0.5192754296330702, + "grad_norm": 0.28098300576052854, + "learning_rate": 4.5955249569707405e-05, + "loss": 0.4225, + "step": 559 + }, + { + "epoch": 0.5202043660009289, + "grad_norm": 0.27826523322374, + "learning_rate": 4.593803786574871e-05, + "loss": 0.4395, + "step": 560 + }, + { + "epoch": 0.5211333023687877, + "grad_norm": 0.2857326733803591, + "learning_rate": 4.592082616179002e-05, + "loss": 0.4251, + "step": 561 + }, + { + "epoch": 0.5220622387366466, + "grad_norm": 0.24460900653251463, + "learning_rate": 4.590361445783133e-05, + "loss": 0.3983, + "step": 562 + }, + { + "epoch": 0.5229911751045053, + "grad_norm": 0.31606867032450164, + "learning_rate": 4.588640275387264e-05, + "loss": 0.3987, + "step": 563 + }, + { + "epoch": 0.5239201114723642, + "grad_norm": 0.2720837508546668, + "learning_rate": 4.5869191049913944e-05, + "loss": 0.4187, + "step": 564 + }, + { + "epoch": 0.5248490478402229, + "grad_norm": 0.3692339108368577, + "learning_rate": 4.5851979345955256e-05, + "loss": 0.4027, + "step": 565 + }, + { + "epoch": 0.5257779842080818, + "grad_norm": 0.2621237616477022, + "learning_rate": 4.583476764199656e-05, + "loss": 0.4069, + "step": 566 + }, + { + "epoch": 0.5267069205759406, + "grad_norm": 0.28576357238482947, + "learning_rate": 4.581755593803787e-05, + "loss": 0.4083, + "step": 567 + }, + { + "epoch": 0.5276358569437993, + "grad_norm": 0.2876324843394776, + "learning_rate": 4.580034423407917e-05, + "loss": 0.4053, + "step": 568 + }, + { + "epoch": 0.5285647933116582, + "grad_norm": 0.2952514491419477, + "learning_rate": 4.578313253012048e-05, + "loss": 0.4372, + "step": 569 + }, + { + "epoch": 0.5294937296795169, + "grad_norm": 0.2413668924475377, + "learning_rate": 4.576592082616179e-05, + "loss": 0.3872, + "step": 570 + }, + { + "epoch": 0.5304226660473758, + "grad_norm": 0.31322703800100615, + "learning_rate": 4.57487091222031e-05, + "loss": 0.415, + "step": 571 + }, + { + "epoch": 0.5313516024152346, + "grad_norm": 0.29838477602444063, + "learning_rate": 4.5731497418244404e-05, + "loss": 0.4227, + "step": 572 + }, + { + "epoch": 0.5322805387830933, + "grad_norm": 0.2743433087149783, + "learning_rate": 4.5714285714285716e-05, + "loss": 0.4069, + "step": 573 + }, + { + "epoch": 0.5332094751509522, + "grad_norm": 0.2681736163650413, + "learning_rate": 4.569707401032702e-05, + "loss": 0.4124, + "step": 574 + }, + { + "epoch": 0.5341384115188109, + "grad_norm": 0.27958710415501165, + "learning_rate": 4.567986230636833e-05, + "loss": 0.4479, + "step": 575 + }, + { + "epoch": 0.5350673478866698, + "grad_norm": 0.3107133186979922, + "learning_rate": 4.566265060240964e-05, + "loss": 0.4309, + "step": 576 + }, + { + "epoch": 0.5359962842545286, + "grad_norm": 0.3008304998300067, + "learning_rate": 4.564543889845095e-05, + "loss": 0.4215, + "step": 577 + }, + { + "epoch": 0.5369252206223873, + "grad_norm": 0.3077480260868722, + "learning_rate": 4.5628227194492255e-05, + "loss": 0.4177, + "step": 578 + }, + { + "epoch": 0.5378541569902462, + "grad_norm": 0.2856706754554965, + "learning_rate": 4.5611015490533566e-05, + "loss": 0.4298, + "step": 579 + }, + { + "epoch": 0.5387830933581049, + "grad_norm": 0.3062434202958225, + "learning_rate": 4.559380378657487e-05, + "loss": 0.4489, + "step": 580 + }, + { + "epoch": 0.5397120297259638, + "grad_norm": 0.26880576068838663, + "learning_rate": 4.557659208261618e-05, + "loss": 0.4378, + "step": 581 + }, + { + "epoch": 0.5406409660938226, + "grad_norm": 0.2504523646966336, + "learning_rate": 4.555938037865749e-05, + "loss": 0.4308, + "step": 582 + }, + { + "epoch": 0.5415699024616814, + "grad_norm": 0.26076967760757586, + "learning_rate": 4.55421686746988e-05, + "loss": 0.422, + "step": 583 + }, + { + "epoch": 0.5424988388295402, + "grad_norm": 0.235898250335184, + "learning_rate": 4.5524956970740105e-05, + "loss": 0.4317, + "step": 584 + }, + { + "epoch": 0.5434277751973989, + "grad_norm": 0.24591979617927925, + "learning_rate": 4.550774526678142e-05, + "loss": 0.4576, + "step": 585 + }, + { + "epoch": 0.5443567115652578, + "grad_norm": 0.2336427626086619, + "learning_rate": 4.549053356282272e-05, + "loss": 0.4229, + "step": 586 + }, + { + "epoch": 0.5452856479331166, + "grad_norm": 0.24258696081299802, + "learning_rate": 4.5473321858864034e-05, + "loss": 0.4314, + "step": 587 + }, + { + "epoch": 0.5462145843009754, + "grad_norm": 0.2448812822768551, + "learning_rate": 4.545611015490534e-05, + "loss": 0.4148, + "step": 588 + }, + { + "epoch": 0.5471435206688342, + "grad_norm": 0.23590842458952918, + "learning_rate": 4.543889845094665e-05, + "loss": 0.42, + "step": 589 + }, + { + "epoch": 0.5480724570366929, + "grad_norm": 0.2522688595104704, + "learning_rate": 4.542168674698795e-05, + "loss": 0.4314, + "step": 590 + }, + { + "epoch": 0.5490013934045518, + "grad_norm": 0.26737150464975795, + "learning_rate": 4.540447504302926e-05, + "loss": 0.4086, + "step": 591 + }, + { + "epoch": 0.5499303297724106, + "grad_norm": 0.2559167030387441, + "learning_rate": 4.5387263339070566e-05, + "loss": 0.4365, + "step": 592 + }, + { + "epoch": 0.5508592661402694, + "grad_norm": 0.2767220404229008, + "learning_rate": 4.537005163511188e-05, + "loss": 0.4148, + "step": 593 + }, + { + "epoch": 0.5517882025081282, + "grad_norm": 0.27937423564051256, + "learning_rate": 4.535283993115318e-05, + "loss": 0.4253, + "step": 594 + }, + { + "epoch": 0.552717138875987, + "grad_norm": 0.2900121711086677, + "learning_rate": 4.5335628227194494e-05, + "loss": 0.4325, + "step": 595 + }, + { + "epoch": 0.5536460752438458, + "grad_norm": 0.27657569986368546, + "learning_rate": 4.53184165232358e-05, + "loss": 0.4505, + "step": 596 + }, + { + "epoch": 0.5545750116117046, + "grad_norm": 0.3126165251767002, + "learning_rate": 4.530120481927711e-05, + "loss": 0.4358, + "step": 597 + }, + { + "epoch": 0.5555039479795634, + "grad_norm": 0.27088886464887985, + "learning_rate": 4.5283993115318416e-05, + "loss": 0.4393, + "step": 598 + }, + { + "epoch": 0.5564328843474222, + "grad_norm": 0.2504180754270926, + "learning_rate": 4.526678141135973e-05, + "loss": 0.4188, + "step": 599 + }, + { + "epoch": 0.557361820715281, + "grad_norm": 0.30044985875880187, + "learning_rate": 4.524956970740103e-05, + "loss": 0.4272, + "step": 600 + }, + { + "epoch": 0.5582907570831398, + "grad_norm": 0.26657267822610964, + "learning_rate": 4.5232358003442345e-05, + "loss": 0.4351, + "step": 601 + }, + { + "epoch": 0.5592196934509986, + "grad_norm": 0.2534161771476428, + "learning_rate": 4.521514629948365e-05, + "loss": 0.4171, + "step": 602 + }, + { + "epoch": 0.5601486298188574, + "grad_norm": 0.2856683409630207, + "learning_rate": 4.519793459552496e-05, + "loss": 0.4229, + "step": 603 + }, + { + "epoch": 0.5610775661867162, + "grad_norm": 0.2232216928555761, + "learning_rate": 4.5180722891566266e-05, + "loss": 0.3916, + "step": 604 + }, + { + "epoch": 0.562006502554575, + "grad_norm": 0.3018832675679175, + "learning_rate": 4.516351118760758e-05, + "loss": 0.4179, + "step": 605 + }, + { + "epoch": 0.5629354389224338, + "grad_norm": 0.2569380913397442, + "learning_rate": 4.514629948364888e-05, + "loss": 0.4311, + "step": 606 + }, + { + "epoch": 0.5638643752902927, + "grad_norm": 0.2722108299381111, + "learning_rate": 4.5129087779690195e-05, + "loss": 0.4227, + "step": 607 + }, + { + "epoch": 0.5647933116581514, + "grad_norm": 0.24245246643994783, + "learning_rate": 4.51118760757315e-05, + "loss": 0.4058, + "step": 608 + }, + { + "epoch": 0.5657222480260102, + "grad_norm": 0.2895449101358798, + "learning_rate": 4.509466437177281e-05, + "loss": 0.4239, + "step": 609 + }, + { + "epoch": 0.566651184393869, + "grad_norm": 0.22872993936692906, + "learning_rate": 4.507745266781412e-05, + "loss": 0.4194, + "step": 610 + }, + { + "epoch": 0.5675801207617278, + "grad_norm": 0.2951044959391205, + "learning_rate": 4.506024096385542e-05, + "loss": 0.4159, + "step": 611 + }, + { + "epoch": 0.5685090571295867, + "grad_norm": 0.2509311292631651, + "learning_rate": 4.504302925989673e-05, + "loss": 0.4237, + "step": 612 + }, + { + "epoch": 0.5694379934974454, + "grad_norm": 0.32512599427391997, + "learning_rate": 4.502581755593804e-05, + "loss": 0.416, + "step": 613 + }, + { + "epoch": 0.5703669298653042, + "grad_norm": 0.2819827511805377, + "learning_rate": 4.5008605851979344e-05, + "loss": 0.3945, + "step": 614 + }, + { + "epoch": 0.571295866233163, + "grad_norm": 0.26137765186363743, + "learning_rate": 4.4991394148020655e-05, + "loss": 0.4364, + "step": 615 + }, + { + "epoch": 0.5722248026010218, + "grad_norm": 0.24986429511060695, + "learning_rate": 4.497418244406196e-05, + "loss": 0.4192, + "step": 616 + }, + { + "epoch": 0.5731537389688807, + "grad_norm": 0.32510562396239795, + "learning_rate": 4.495697074010327e-05, + "loss": 0.4564, + "step": 617 + }, + { + "epoch": 0.5740826753367394, + "grad_norm": 0.2437246846388565, + "learning_rate": 4.493975903614458e-05, + "loss": 0.4491, + "step": 618 + }, + { + "epoch": 0.5750116117045982, + "grad_norm": 0.28270839310005524, + "learning_rate": 4.492254733218589e-05, + "loss": 0.4165, + "step": 619 + }, + { + "epoch": 0.575940548072457, + "grad_norm": 0.2978639151525471, + "learning_rate": 4.4905335628227194e-05, + "loss": 0.4181, + "step": 620 + }, + { + "epoch": 0.5768694844403158, + "grad_norm": 0.283277054225362, + "learning_rate": 4.4888123924268506e-05, + "loss": 0.4506, + "step": 621 + }, + { + "epoch": 0.5777984208081747, + "grad_norm": 0.24187266585240402, + "learning_rate": 4.487091222030981e-05, + "loss": 0.401, + "step": 622 + }, + { + "epoch": 0.5787273571760334, + "grad_norm": 0.2863505365930887, + "learning_rate": 4.485370051635112e-05, + "loss": 0.4117, + "step": 623 + }, + { + "epoch": 0.5796562935438923, + "grad_norm": 0.2773113061348817, + "learning_rate": 4.483648881239243e-05, + "loss": 0.4244, + "step": 624 + }, + { + "epoch": 0.5805852299117511, + "grad_norm": 0.23781370664986884, + "learning_rate": 4.481927710843374e-05, + "loss": 0.4172, + "step": 625 + }, + { + "epoch": 0.5815141662796098, + "grad_norm": 0.2899327516074531, + "learning_rate": 4.4802065404475044e-05, + "loss": 0.414, + "step": 626 + }, + { + "epoch": 0.5824431026474687, + "grad_norm": 0.21917991809985152, + "learning_rate": 4.4784853700516356e-05, + "loss": 0.3858, + "step": 627 + }, + { + "epoch": 0.5833720390153274, + "grad_norm": 0.29585581964621416, + "learning_rate": 4.476764199655766e-05, + "loss": 0.4262, + "step": 628 + }, + { + "epoch": 0.5843009753831863, + "grad_norm": 0.24103700840664655, + "learning_rate": 4.475043029259897e-05, + "loss": 0.4061, + "step": 629 + }, + { + "epoch": 0.5852299117510451, + "grad_norm": 0.30538358771552415, + "learning_rate": 4.473321858864028e-05, + "loss": 0.4068, + "step": 630 + }, + { + "epoch": 0.5861588481189038, + "grad_norm": 0.28929577954162256, + "learning_rate": 4.471600688468159e-05, + "loss": 0.4307, + "step": 631 + }, + { + "epoch": 0.5870877844867627, + "grad_norm": 0.28414615427121404, + "learning_rate": 4.469879518072289e-05, + "loss": 0.4201, + "step": 632 + }, + { + "epoch": 0.5880167208546214, + "grad_norm": 0.26330534375194914, + "learning_rate": 4.46815834767642e-05, + "loss": 0.4346, + "step": 633 + }, + { + "epoch": 0.5889456572224803, + "grad_norm": 0.24452133530493197, + "learning_rate": 4.4664371772805505e-05, + "loss": 0.4283, + "step": 634 + }, + { + "epoch": 0.5898745935903391, + "grad_norm": 0.275064839540402, + "learning_rate": 4.4647160068846817e-05, + "loss": 0.4103, + "step": 635 + }, + { + "epoch": 0.5908035299581978, + "grad_norm": 0.23295221964562113, + "learning_rate": 4.462994836488812e-05, + "loss": 0.4265, + "step": 636 + }, + { + "epoch": 0.5917324663260567, + "grad_norm": 0.29348555430997814, + "learning_rate": 4.461273666092943e-05, + "loss": 0.4498, + "step": 637 + }, + { + "epoch": 0.5926614026939154, + "grad_norm": 0.2387454374843579, + "learning_rate": 4.459552495697074e-05, + "loss": 0.4267, + "step": 638 + }, + { + "epoch": 0.5935903390617743, + "grad_norm": 0.24669696423956647, + "learning_rate": 4.457831325301205e-05, + "loss": 0.4109, + "step": 639 + }, + { + "epoch": 0.5945192754296331, + "grad_norm": 0.2569187304772452, + "learning_rate": 4.4561101549053355e-05, + "loss": 0.4142, + "step": 640 + }, + { + "epoch": 0.5954482117974919, + "grad_norm": 0.23582837954343758, + "learning_rate": 4.454388984509467e-05, + "loss": 0.4361, + "step": 641 + }, + { + "epoch": 0.5963771481653507, + "grad_norm": 0.2577453345455404, + "learning_rate": 4.452667814113597e-05, + "loss": 0.4208, + "step": 642 + }, + { + "epoch": 0.5973060845332094, + "grad_norm": 0.2575075232080959, + "learning_rate": 4.4509466437177284e-05, + "loss": 0.4175, + "step": 643 + }, + { + "epoch": 0.5982350209010683, + "grad_norm": 0.2592496028490341, + "learning_rate": 4.449225473321859e-05, + "loss": 0.4243, + "step": 644 + }, + { + "epoch": 0.5991639572689271, + "grad_norm": 0.2518897193108137, + "learning_rate": 4.44750430292599e-05, + "loss": 0.4238, + "step": 645 + }, + { + "epoch": 0.6000928936367859, + "grad_norm": 0.27689384611411444, + "learning_rate": 4.4457831325301206e-05, + "loss": 0.4145, + "step": 646 + }, + { + "epoch": 0.6010218300046447, + "grad_norm": 0.25485645505304355, + "learning_rate": 4.444061962134252e-05, + "loss": 0.4201, + "step": 647 + }, + { + "epoch": 0.6019507663725034, + "grad_norm": 0.27128573378890863, + "learning_rate": 4.442340791738382e-05, + "loss": 0.4316, + "step": 648 + }, + { + "epoch": 0.6028797027403623, + "grad_norm": 0.2578065387198813, + "learning_rate": 4.4406196213425134e-05, + "loss": 0.4193, + "step": 649 + }, + { + "epoch": 0.6038086391082211, + "grad_norm": 0.2667247647777713, + "learning_rate": 4.438898450946644e-05, + "loss": 0.3986, + "step": 650 + }, + { + "epoch": 0.6047375754760799, + "grad_norm": 0.26585574043938304, + "learning_rate": 4.437177280550775e-05, + "loss": 0.4433, + "step": 651 + }, + { + "epoch": 0.6056665118439387, + "grad_norm": 0.29551030914401943, + "learning_rate": 4.4354561101549056e-05, + "loss": 0.4461, + "step": 652 + }, + { + "epoch": 0.6065954482117974, + "grad_norm": 0.2841049021245725, + "learning_rate": 4.433734939759036e-05, + "loss": 0.4071, + "step": 653 + }, + { + "epoch": 0.6075243845796563, + "grad_norm": 0.30886770292110666, + "learning_rate": 4.4320137693631666e-05, + "loss": 0.4544, + "step": 654 + }, + { + "epoch": 0.6084533209475151, + "grad_norm": 0.2686093425730839, + "learning_rate": 4.430292598967298e-05, + "loss": 0.407, + "step": 655 + }, + { + "epoch": 0.6093822573153739, + "grad_norm": 0.30412706468771006, + "learning_rate": 4.428571428571428e-05, + "loss": 0.3915, + "step": 656 + }, + { + "epoch": 0.6103111936832327, + "grad_norm": 0.35364500741842214, + "learning_rate": 4.4268502581755595e-05, + "loss": 0.453, + "step": 657 + }, + { + "epoch": 0.6112401300510915, + "grad_norm": 0.26460657385536474, + "learning_rate": 4.42512908777969e-05, + "loss": 0.3909, + "step": 658 + }, + { + "epoch": 0.6121690664189503, + "grad_norm": 0.4428486110797581, + "learning_rate": 4.423407917383821e-05, + "loss": 0.4269, + "step": 659 + }, + { + "epoch": 0.6130980027868091, + "grad_norm": 0.2866328626559702, + "learning_rate": 4.4216867469879516e-05, + "loss": 0.4204, + "step": 660 + }, + { + "epoch": 0.6140269391546679, + "grad_norm": 0.36462567528091633, + "learning_rate": 4.419965576592083e-05, + "loss": 0.4353, + "step": 661 + }, + { + "epoch": 0.6149558755225267, + "grad_norm": 0.3087527803770854, + "learning_rate": 4.418244406196213e-05, + "loss": 0.4204, + "step": 662 + }, + { + "epoch": 0.6158848118903855, + "grad_norm": 0.3103535653539977, + "learning_rate": 4.4165232358003445e-05, + "loss": 0.4344, + "step": 663 + }, + { + "epoch": 0.6168137482582443, + "grad_norm": 0.28523746975783004, + "learning_rate": 4.414802065404475e-05, + "loss": 0.4163, + "step": 664 + }, + { + "epoch": 0.6177426846261032, + "grad_norm": 0.2842390326622571, + "learning_rate": 4.413080895008606e-05, + "loss": 0.408, + "step": 665 + }, + { + "epoch": 0.6186716209939619, + "grad_norm": 0.23234413274784502, + "learning_rate": 4.411359724612737e-05, + "loss": 0.4194, + "step": 666 + }, + { + "epoch": 0.6196005573618207, + "grad_norm": 0.275004251162571, + "learning_rate": 4.409638554216868e-05, + "loss": 0.4323, + "step": 667 + }, + { + "epoch": 0.6205294937296795, + "grad_norm": 0.31847768258839976, + "learning_rate": 4.4079173838209984e-05, + "loss": 0.4268, + "step": 668 + }, + { + "epoch": 0.6214584300975383, + "grad_norm": 0.23406432898908414, + "learning_rate": 4.4061962134251295e-05, + "loss": 0.4137, + "step": 669 + }, + { + "epoch": 0.6223873664653972, + "grad_norm": 0.3093054020461385, + "learning_rate": 4.40447504302926e-05, + "loss": 0.3857, + "step": 670 + }, + { + "epoch": 0.6233163028332559, + "grad_norm": 0.25338402995679804, + "learning_rate": 4.402753872633391e-05, + "loss": 0.4007, + "step": 671 + }, + { + "epoch": 0.6242452392011147, + "grad_norm": 0.2935153845414147, + "learning_rate": 4.401032702237522e-05, + "loss": 0.4253, + "step": 672 + }, + { + "epoch": 0.6251741755689735, + "grad_norm": 0.3257007027536486, + "learning_rate": 4.399311531841653e-05, + "loss": 0.4573, + "step": 673 + }, + { + "epoch": 0.6261031119368323, + "grad_norm": 0.29349741747875213, + "learning_rate": 4.3975903614457834e-05, + "loss": 0.4016, + "step": 674 + }, + { + "epoch": 0.6270320483046912, + "grad_norm": 0.3229849235540013, + "learning_rate": 4.395869191049914e-05, + "loss": 0.4272, + "step": 675 + }, + { + "epoch": 0.6279609846725499, + "grad_norm": 0.36116533623931474, + "learning_rate": 4.3941480206540444e-05, + "loss": 0.4144, + "step": 676 + }, + { + "epoch": 0.6288899210404088, + "grad_norm": 0.2563978464326882, + "learning_rate": 4.3924268502581756e-05, + "loss": 0.4194, + "step": 677 + }, + { + "epoch": 0.6298188574082675, + "grad_norm": 0.3196063755723246, + "learning_rate": 4.390705679862306e-05, + "loss": 0.393, + "step": 678 + }, + { + "epoch": 0.6307477937761263, + "grad_norm": 0.35436425630405965, + "learning_rate": 4.388984509466437e-05, + "loss": 0.4171, + "step": 679 + }, + { + "epoch": 0.6316767301439852, + "grad_norm": 0.30135253821167374, + "learning_rate": 4.387263339070568e-05, + "loss": 0.4256, + "step": 680 + }, + { + "epoch": 0.6326056665118439, + "grad_norm": 0.30674680549750183, + "learning_rate": 4.385542168674699e-05, + "loss": 0.3959, + "step": 681 + }, + { + "epoch": 0.6335346028797028, + "grad_norm": 0.32187516875469857, + "learning_rate": 4.3838209982788294e-05, + "loss": 0.419, + "step": 682 + }, + { + "epoch": 0.6344635392475615, + "grad_norm": 0.2896684222699531, + "learning_rate": 4.3820998278829606e-05, + "loss": 0.4324, + "step": 683 + }, + { + "epoch": 0.6353924756154203, + "grad_norm": 0.2799876389868912, + "learning_rate": 4.380378657487091e-05, + "loss": 0.4238, + "step": 684 + }, + { + "epoch": 0.6363214119832792, + "grad_norm": 0.26251094611425313, + "learning_rate": 4.378657487091222e-05, + "loss": 0.4537, + "step": 685 + }, + { + "epoch": 0.6372503483511379, + "grad_norm": 0.24365180702477157, + "learning_rate": 4.376936316695353e-05, + "loss": 0.4201, + "step": 686 + }, + { + "epoch": 0.6381792847189968, + "grad_norm": 0.34911025101281234, + "learning_rate": 4.375215146299484e-05, + "loss": 0.4591, + "step": 687 + }, + { + "epoch": 0.6391082210868555, + "grad_norm": 0.25889134972689726, + "learning_rate": 4.3734939759036145e-05, + "loss": 0.4272, + "step": 688 + }, + { + "epoch": 0.6400371574547143, + "grad_norm": 0.28475140108094354, + "learning_rate": 4.371772805507746e-05, + "loss": 0.4267, + "step": 689 + }, + { + "epoch": 0.6409660938225732, + "grad_norm": 0.3161029094501537, + "learning_rate": 4.370051635111876e-05, + "loss": 0.4418, + "step": 690 + }, + { + "epoch": 0.6418950301904319, + "grad_norm": 0.2396023845575197, + "learning_rate": 4.3683304647160073e-05, + "loss": 0.4421, + "step": 691 + }, + { + "epoch": 0.6428239665582908, + "grad_norm": 0.2832054932462056, + "learning_rate": 4.366609294320138e-05, + "loss": 0.4091, + "step": 692 + }, + { + "epoch": 0.6437529029261495, + "grad_norm": 0.28376936431369904, + "learning_rate": 4.364888123924269e-05, + "loss": 0.3974, + "step": 693 + }, + { + "epoch": 0.6446818392940084, + "grad_norm": 0.24743406449667463, + "learning_rate": 4.3631669535283995e-05, + "loss": 0.411, + "step": 694 + }, + { + "epoch": 0.6456107756618672, + "grad_norm": 0.2730246000748479, + "learning_rate": 4.361445783132531e-05, + "loss": 0.4277, + "step": 695 + }, + { + "epoch": 0.6465397120297259, + "grad_norm": 0.2650695341327053, + "learning_rate": 4.359724612736661e-05, + "loss": 0.3944, + "step": 696 + }, + { + "epoch": 0.6474686483975848, + "grad_norm": 0.321320466947324, + "learning_rate": 4.358003442340792e-05, + "loss": 0.4464, + "step": 697 + }, + { + "epoch": 0.6483975847654435, + "grad_norm": 0.3178561485682785, + "learning_rate": 4.356282271944923e-05, + "loss": 0.4503, + "step": 698 + }, + { + "epoch": 0.6493265211333024, + "grad_norm": 0.25967628951798444, + "learning_rate": 4.3545611015490534e-05, + "loss": 0.4225, + "step": 699 + }, + { + "epoch": 0.6502554575011612, + "grad_norm": 0.2895967588454303, + "learning_rate": 4.3528399311531846e-05, + "loss": 0.4181, + "step": 700 + }, + { + "epoch": 0.6511843938690199, + "grad_norm": 0.278239598528544, + "learning_rate": 4.351118760757315e-05, + "loss": 0.4196, + "step": 701 + }, + { + "epoch": 0.6521133302368788, + "grad_norm": 0.24741676508319585, + "learning_rate": 4.3493975903614456e-05, + "loss": 0.4183, + "step": 702 + }, + { + "epoch": 0.6530422666047375, + "grad_norm": 0.23960910227402535, + "learning_rate": 4.347676419965577e-05, + "loss": 0.4154, + "step": 703 + }, + { + "epoch": 0.6539712029725964, + "grad_norm": 0.30614533128558064, + "learning_rate": 4.345955249569707e-05, + "loss": 0.4132, + "step": 704 + }, + { + "epoch": 0.6549001393404552, + "grad_norm": 0.2690126725576619, + "learning_rate": 4.3442340791738384e-05, + "loss": 0.4252, + "step": 705 + }, + { + "epoch": 0.655829075708314, + "grad_norm": 0.2849912129415421, + "learning_rate": 4.342512908777969e-05, + "loss": 0.4179, + "step": 706 + }, + { + "epoch": 0.6567580120761728, + "grad_norm": 0.2480212482763167, + "learning_rate": 4.3407917383821e-05, + "loss": 0.429, + "step": 707 + }, + { + "epoch": 0.6576869484440316, + "grad_norm": 0.2804131416188235, + "learning_rate": 4.3390705679862306e-05, + "loss": 0.4068, + "step": 708 + }, + { + "epoch": 0.6586158848118904, + "grad_norm": 0.2768683400232619, + "learning_rate": 4.337349397590362e-05, + "loss": 0.423, + "step": 709 + }, + { + "epoch": 0.6595448211797492, + "grad_norm": 0.24148977148375103, + "learning_rate": 4.335628227194492e-05, + "loss": 0.4322, + "step": 710 + }, + { + "epoch": 0.660473757547608, + "grad_norm": 0.2405167807911801, + "learning_rate": 4.3339070567986235e-05, + "loss": 0.4046, + "step": 711 + }, + { + "epoch": 0.6614026939154668, + "grad_norm": 0.22245312753439328, + "learning_rate": 4.332185886402754e-05, + "loss": 0.4126, + "step": 712 + }, + { + "epoch": 0.6623316302833256, + "grad_norm": 0.2669023790429907, + "learning_rate": 4.330464716006885e-05, + "loss": 0.4338, + "step": 713 + }, + { + "epoch": 0.6632605666511844, + "grad_norm": 0.2521812276262092, + "learning_rate": 4.3287435456110156e-05, + "loss": 0.4448, + "step": 714 + }, + { + "epoch": 0.6641895030190432, + "grad_norm": 0.2625757212246767, + "learning_rate": 4.327022375215147e-05, + "loss": 0.414, + "step": 715 + }, + { + "epoch": 0.665118439386902, + "grad_norm": 0.25033293810292023, + "learning_rate": 4.325301204819277e-05, + "loss": 0.423, + "step": 716 + }, + { + "epoch": 0.6660473757547608, + "grad_norm": 0.22334251988732523, + "learning_rate": 4.323580034423408e-05, + "loss": 0.4237, + "step": 717 + }, + { + "epoch": 0.6669763121226197, + "grad_norm": 0.2397245440218477, + "learning_rate": 4.321858864027539e-05, + "loss": 0.4137, + "step": 718 + }, + { + "epoch": 0.6679052484904784, + "grad_norm": 0.30143292571229535, + "learning_rate": 4.3201376936316695e-05, + "loss": 0.3724, + "step": 719 + }, + { + "epoch": 0.6688341848583372, + "grad_norm": 0.23079068591492524, + "learning_rate": 4.318416523235801e-05, + "loss": 0.4007, + "step": 720 + }, + { + "epoch": 0.669763121226196, + "grad_norm": 0.30620216556806684, + "learning_rate": 4.316695352839931e-05, + "loss": 0.4384, + "step": 721 + }, + { + "epoch": 0.6706920575940548, + "grad_norm": 0.29486394422042084, + "learning_rate": 4.3149741824440624e-05, + "loss": 0.4046, + "step": 722 + }, + { + "epoch": 0.6716209939619137, + "grad_norm": 0.3305581175194426, + "learning_rate": 4.313253012048193e-05, + "loss": 0.4242, + "step": 723 + }, + { + "epoch": 0.6725499303297724, + "grad_norm": 0.2679330466442912, + "learning_rate": 4.311531841652324e-05, + "loss": 0.4257, + "step": 724 + }, + { + "epoch": 0.6734788666976312, + "grad_norm": 0.35811562362234445, + "learning_rate": 4.3098106712564545e-05, + "loss": 0.4384, + "step": 725 + }, + { + "epoch": 0.67440780306549, + "grad_norm": 0.29334272663787064, + "learning_rate": 4.308089500860585e-05, + "loss": 0.4444, + "step": 726 + }, + { + "epoch": 0.6753367394333488, + "grad_norm": 0.24401338629880093, + "learning_rate": 4.306368330464716e-05, + "loss": 0.4193, + "step": 727 + }, + { + "epoch": 0.6762656758012077, + "grad_norm": 0.28052429245661187, + "learning_rate": 4.304647160068847e-05, + "loss": 0.4088, + "step": 728 + }, + { + "epoch": 0.6771946121690664, + "grad_norm": 0.25925872113831155, + "learning_rate": 4.302925989672978e-05, + "loss": 0.4073, + "step": 729 + }, + { + "epoch": 0.6781235485369252, + "grad_norm": 0.2736489940435994, + "learning_rate": 4.3012048192771084e-05, + "loss": 0.4237, + "step": 730 + }, + { + "epoch": 0.679052484904784, + "grad_norm": 0.25905656215957673, + "learning_rate": 4.2994836488812396e-05, + "loss": 0.4162, + "step": 731 + }, + { + "epoch": 0.6799814212726428, + "grad_norm": 0.24390493228872448, + "learning_rate": 4.29776247848537e-05, + "loss": 0.4307, + "step": 732 + }, + { + "epoch": 0.6809103576405017, + "grad_norm": 0.24108332455669904, + "learning_rate": 4.296041308089501e-05, + "loss": 0.4408, + "step": 733 + }, + { + "epoch": 0.6818392940083604, + "grad_norm": 0.23861930125640354, + "learning_rate": 4.294320137693632e-05, + "loss": 0.4097, + "step": 734 + }, + { + "epoch": 0.6827682303762193, + "grad_norm": 0.2342264218537004, + "learning_rate": 4.292598967297763e-05, + "loss": 0.4006, + "step": 735 + }, + { + "epoch": 0.683697166744078, + "grad_norm": 0.2534426133410622, + "learning_rate": 4.2908777969018934e-05, + "loss": 0.4127, + "step": 736 + }, + { + "epoch": 0.6846261031119368, + "grad_norm": 0.2711574107583699, + "learning_rate": 4.2891566265060246e-05, + "loss": 0.4334, + "step": 737 + }, + { + "epoch": 0.6855550394797957, + "grad_norm": 0.31772185693408067, + "learning_rate": 4.287435456110155e-05, + "loss": 0.4187, + "step": 738 + }, + { + "epoch": 0.6864839758476544, + "grad_norm": 0.30536491271045085, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.4215, + "step": 739 + }, + { + "epoch": 0.6874129122155133, + "grad_norm": 0.33006635156656183, + "learning_rate": 4.283993115318417e-05, + "loss": 0.405, + "step": 740 + }, + { + "epoch": 0.688341848583372, + "grad_norm": 0.2964099136976024, + "learning_rate": 4.282271944922547e-05, + "loss": 0.419, + "step": 741 + }, + { + "epoch": 0.6892707849512308, + "grad_norm": 0.2931220669174559, + "learning_rate": 4.2805507745266785e-05, + "loss": 0.4422, + "step": 742 + }, + { + "epoch": 0.6901997213190897, + "grad_norm": 0.2983381792229504, + "learning_rate": 4.278829604130809e-05, + "loss": 0.4411, + "step": 743 + }, + { + "epoch": 0.6911286576869484, + "grad_norm": 0.2648675526907866, + "learning_rate": 4.27710843373494e-05, + "loss": 0.4429, + "step": 744 + }, + { + "epoch": 0.6920575940548073, + "grad_norm": 0.2602456162921885, + "learning_rate": 4.275387263339071e-05, + "loss": 0.4198, + "step": 745 + }, + { + "epoch": 0.692986530422666, + "grad_norm": 0.23353204937477282, + "learning_rate": 4.273666092943202e-05, + "loss": 0.4028, + "step": 746 + }, + { + "epoch": 0.6939154667905248, + "grad_norm": 0.2991360217695013, + "learning_rate": 4.2719449225473323e-05, + "loss": 0.4143, + "step": 747 + }, + { + "epoch": 0.6948444031583837, + "grad_norm": 0.25702159816645936, + "learning_rate": 4.2702237521514635e-05, + "loss": 0.4213, + "step": 748 + }, + { + "epoch": 0.6957733395262424, + "grad_norm": 0.2838721908179337, + "learning_rate": 4.268502581755594e-05, + "loss": 0.4387, + "step": 749 + }, + { + "epoch": 0.6967022758941013, + "grad_norm": 0.24149559875694418, + "learning_rate": 4.2667814113597245e-05, + "loss": 0.4184, + "step": 750 + }, + { + "epoch": 0.69763121226196, + "grad_norm": 0.2952086958075859, + "learning_rate": 4.265060240963856e-05, + "loss": 0.4167, + "step": 751 + }, + { + "epoch": 0.6985601486298189, + "grad_norm": 0.23972843797863336, + "learning_rate": 4.263339070567986e-05, + "loss": 0.4141, + "step": 752 + }, + { + "epoch": 0.6994890849976777, + "grad_norm": 0.2746023756298842, + "learning_rate": 4.2616179001721174e-05, + "loss": 0.4364, + "step": 753 + }, + { + "epoch": 0.7004180213655364, + "grad_norm": 0.29765586671383076, + "learning_rate": 4.259896729776248e-05, + "loss": 0.4089, + "step": 754 + }, + { + "epoch": 0.7013469577333953, + "grad_norm": 0.21800684863628034, + "learning_rate": 4.258175559380379e-05, + "loss": 0.4143, + "step": 755 + }, + { + "epoch": 0.702275894101254, + "grad_norm": 0.28610650003068566, + "learning_rate": 4.2564543889845096e-05, + "loss": 0.4138, + "step": 756 + }, + { + "epoch": 0.7032048304691129, + "grad_norm": 0.2640665130452579, + "learning_rate": 4.254733218588641e-05, + "loss": 0.4549, + "step": 757 + }, + { + "epoch": 0.7041337668369717, + "grad_norm": 0.25026626980377065, + "learning_rate": 4.253012048192771e-05, + "loss": 0.4038, + "step": 758 + }, + { + "epoch": 0.7050627032048304, + "grad_norm": 0.275452109464895, + "learning_rate": 4.251290877796902e-05, + "loss": 0.4222, + "step": 759 + }, + { + "epoch": 0.7059916395726893, + "grad_norm": 0.25082813720469543, + "learning_rate": 4.249569707401033e-05, + "loss": 0.414, + "step": 760 + }, + { + "epoch": 0.706920575940548, + "grad_norm": 0.2607187023425719, + "learning_rate": 4.2478485370051634e-05, + "loss": 0.4186, + "step": 761 + }, + { + "epoch": 0.7078495123084069, + "grad_norm": 0.2628577880876619, + "learning_rate": 4.2461273666092946e-05, + "loss": 0.4313, + "step": 762 + }, + { + "epoch": 0.7087784486762657, + "grad_norm": 0.256368283425763, + "learning_rate": 4.244406196213425e-05, + "loss": 0.4371, + "step": 763 + }, + { + "epoch": 0.7097073850441245, + "grad_norm": 0.27295950833164523, + "learning_rate": 4.242685025817556e-05, + "loss": 0.4224, + "step": 764 + }, + { + "epoch": 0.7106363214119833, + "grad_norm": 0.2788051820721469, + "learning_rate": 4.240963855421687e-05, + "loss": 0.4261, + "step": 765 + }, + { + "epoch": 0.711565257779842, + "grad_norm": 0.2966812172764707, + "learning_rate": 4.239242685025818e-05, + "loss": 0.4622, + "step": 766 + }, + { + "epoch": 0.7124941941477009, + "grad_norm": 0.26841840271189216, + "learning_rate": 4.2375215146299485e-05, + "loss": 0.4065, + "step": 767 + }, + { + "epoch": 0.7134231305155597, + "grad_norm": 0.262959590522867, + "learning_rate": 4.2358003442340797e-05, + "loss": 0.3935, + "step": 768 + }, + { + "epoch": 0.7143520668834185, + "grad_norm": 0.24938352992988017, + "learning_rate": 4.23407917383821e-05, + "loss": 0.4231, + "step": 769 + }, + { + "epoch": 0.7152810032512773, + "grad_norm": 0.29239937087989554, + "learning_rate": 4.232358003442341e-05, + "loss": 0.4064, + "step": 770 + }, + { + "epoch": 0.716209939619136, + "grad_norm": 0.23536282870494213, + "learning_rate": 4.230636833046472e-05, + "loss": 0.402, + "step": 771 + }, + { + "epoch": 0.7171388759869949, + "grad_norm": 0.2714220152937965, + "learning_rate": 4.228915662650603e-05, + "loss": 0.3936, + "step": 772 + }, + { + "epoch": 0.7180678123548537, + "grad_norm": 0.2747257504638425, + "learning_rate": 4.2271944922547335e-05, + "loss": 0.4163, + "step": 773 + }, + { + "epoch": 0.7189967487227125, + "grad_norm": 0.2723713156748802, + "learning_rate": 4.225473321858865e-05, + "loss": 0.4066, + "step": 774 + }, + { + "epoch": 0.7199256850905713, + "grad_norm": 0.2557791781141218, + "learning_rate": 4.223752151462995e-05, + "loss": 0.4087, + "step": 775 + }, + { + "epoch": 0.72085462145843, + "grad_norm": 0.23079182269430176, + "learning_rate": 4.222030981067126e-05, + "loss": 0.3846, + "step": 776 + }, + { + "epoch": 0.7217835578262889, + "grad_norm": 0.28776961647603927, + "learning_rate": 4.220309810671257e-05, + "loss": 0.4035, + "step": 777 + }, + { + "epoch": 0.7227124941941477, + "grad_norm": 0.25625135388254744, + "learning_rate": 4.2185886402753874e-05, + "loss": 0.4196, + "step": 778 + }, + { + "epoch": 0.7236414305620065, + "grad_norm": 0.29595393487275606, + "learning_rate": 4.2168674698795186e-05, + "loss": 0.4448, + "step": 779 + }, + { + "epoch": 0.7245703669298653, + "grad_norm": 0.24841926756950633, + "learning_rate": 4.215146299483649e-05, + "loss": 0.4153, + "step": 780 + }, + { + "epoch": 0.725499303297724, + "grad_norm": 0.28459325293580745, + "learning_rate": 4.2134251290877796e-05, + "loss": 0.444, + "step": 781 + }, + { + "epoch": 0.7264282396655829, + "grad_norm": 0.26193828352352555, + "learning_rate": 4.211703958691911e-05, + "loss": 0.4302, + "step": 782 + }, + { + "epoch": 0.7273571760334417, + "grad_norm": 0.2479500695456553, + "learning_rate": 4.209982788296041e-05, + "loss": 0.4073, + "step": 783 + }, + { + "epoch": 0.7282861124013005, + "grad_norm": 0.28792928032141457, + "learning_rate": 4.2082616179001724e-05, + "loss": 0.4006, + "step": 784 + }, + { + "epoch": 0.7292150487691593, + "grad_norm": 0.22643696354988968, + "learning_rate": 4.206540447504303e-05, + "loss": 0.4186, + "step": 785 + }, + { + "epoch": 0.7301439851370181, + "grad_norm": 0.30992055940652685, + "learning_rate": 4.204819277108434e-05, + "loss": 0.4013, + "step": 786 + }, + { + "epoch": 0.7310729215048769, + "grad_norm": 0.22983459619899754, + "learning_rate": 4.2030981067125646e-05, + "loss": 0.4181, + "step": 787 + }, + { + "epoch": 0.7320018578727358, + "grad_norm": 0.25635099005397793, + "learning_rate": 4.201376936316696e-05, + "loss": 0.4099, + "step": 788 + }, + { + "epoch": 0.7329307942405945, + "grad_norm": 0.2562041424665725, + "learning_rate": 4.199655765920826e-05, + "loss": 0.4108, + "step": 789 + }, + { + "epoch": 0.7338597306084533, + "grad_norm": 0.25150699905041096, + "learning_rate": 4.1979345955249575e-05, + "loss": 0.4085, + "step": 790 + }, + { + "epoch": 0.7347886669763122, + "grad_norm": 0.2714063520253769, + "learning_rate": 4.196213425129088e-05, + "loss": 0.4214, + "step": 791 + }, + { + "epoch": 0.7357176033441709, + "grad_norm": 0.2665920594786692, + "learning_rate": 4.194492254733219e-05, + "loss": 0.4199, + "step": 792 + }, + { + "epoch": 0.7366465397120298, + "grad_norm": 0.2671300207460533, + "learning_rate": 4.1927710843373496e-05, + "loss": 0.4161, + "step": 793 + }, + { + "epoch": 0.7375754760798885, + "grad_norm": 0.27034859657353083, + "learning_rate": 4.191049913941481e-05, + "loss": 0.4271, + "step": 794 + }, + { + "epoch": 0.7385044124477473, + "grad_norm": 0.26690699071818325, + "learning_rate": 4.189328743545611e-05, + "loss": 0.4192, + "step": 795 + }, + { + "epoch": 0.7394333488156062, + "grad_norm": 0.3051440328850412, + "learning_rate": 4.1876075731497425e-05, + "loss": 0.4132, + "step": 796 + }, + { + "epoch": 0.7403622851834649, + "grad_norm": 0.21974143916170535, + "learning_rate": 4.185886402753873e-05, + "loss": 0.3979, + "step": 797 + }, + { + "epoch": 0.7412912215513238, + "grad_norm": 0.24983461807683474, + "learning_rate": 4.184165232358004e-05, + "loss": 0.4079, + "step": 798 + }, + { + "epoch": 0.7422201579191825, + "grad_norm": 0.24224059591231967, + "learning_rate": 4.182444061962135e-05, + "loss": 0.4032, + "step": 799 + }, + { + "epoch": 0.7431490942870413, + "grad_norm": 0.2502956003038763, + "learning_rate": 4.180722891566265e-05, + "loss": 0.409, + "step": 800 + }, + { + "epoch": 0.7440780306549002, + "grad_norm": 0.24589196452215084, + "learning_rate": 4.1790017211703964e-05, + "loss": 0.4111, + "step": 801 + }, + { + "epoch": 0.7450069670227589, + "grad_norm": 0.22442063047390312, + "learning_rate": 4.177280550774527e-05, + "loss": 0.4212, + "step": 802 + }, + { + "epoch": 0.7459359033906178, + "grad_norm": 0.2360026228130957, + "learning_rate": 4.1755593803786574e-05, + "loss": 0.3978, + "step": 803 + }, + { + "epoch": 0.7468648397584765, + "grad_norm": 0.26711637554464907, + "learning_rate": 4.1738382099827885e-05, + "loss": 0.4028, + "step": 804 + }, + { + "epoch": 0.7477937761263354, + "grad_norm": 0.2956104037100123, + "learning_rate": 4.172117039586919e-05, + "loss": 0.417, + "step": 805 + }, + { + "epoch": 0.7487227124941942, + "grad_norm": 0.2764502588713264, + "learning_rate": 4.17039586919105e-05, + "loss": 0.4075, + "step": 806 + }, + { + "epoch": 0.7496516488620529, + "grad_norm": 0.26416939396871475, + "learning_rate": 4.168674698795181e-05, + "loss": 0.4306, + "step": 807 + }, + { + "epoch": 0.7505805852299118, + "grad_norm": 0.29657741806318, + "learning_rate": 4.166953528399312e-05, + "loss": 0.4213, + "step": 808 + }, + { + "epoch": 0.7515095215977705, + "grad_norm": 0.262465514495409, + "learning_rate": 4.1652323580034424e-05, + "loss": 0.4211, + "step": 809 + }, + { + "epoch": 0.7524384579656294, + "grad_norm": 0.29476951737672735, + "learning_rate": 4.1635111876075736e-05, + "loss": 0.4163, + "step": 810 + }, + { + "epoch": 0.7533673943334882, + "grad_norm": 0.25271095485048567, + "learning_rate": 4.161790017211704e-05, + "loss": 0.4064, + "step": 811 + }, + { + "epoch": 0.7542963307013469, + "grad_norm": 0.30865411128822723, + "learning_rate": 4.160068846815835e-05, + "loss": 0.4151, + "step": 812 + }, + { + "epoch": 0.7552252670692058, + "grad_norm": 0.24802250876527565, + "learning_rate": 4.158347676419966e-05, + "loss": 0.4445, + "step": 813 + }, + { + "epoch": 0.7561542034370645, + "grad_norm": 0.3701424649307489, + "learning_rate": 4.156626506024097e-05, + "loss": 0.4183, + "step": 814 + }, + { + "epoch": 0.7570831398049234, + "grad_norm": 0.2337360342099123, + "learning_rate": 4.1549053356282274e-05, + "loss": 0.3858, + "step": 815 + }, + { + "epoch": 0.7580120761727822, + "grad_norm": 0.2910522135380579, + "learning_rate": 4.1531841652323586e-05, + "loss": 0.4122, + "step": 816 + }, + { + "epoch": 0.758941012540641, + "grad_norm": 0.236795985808718, + "learning_rate": 4.151462994836489e-05, + "loss": 0.4322, + "step": 817 + }, + { + "epoch": 0.7598699489084998, + "grad_norm": 0.28675347251155037, + "learning_rate": 4.14974182444062e-05, + "loss": 0.411, + "step": 818 + }, + { + "epoch": 0.7607988852763585, + "grad_norm": 0.24135506233109558, + "learning_rate": 4.148020654044751e-05, + "loss": 0.4033, + "step": 819 + }, + { + "epoch": 0.7617278216442174, + "grad_norm": 0.29903357931708696, + "learning_rate": 4.146299483648882e-05, + "loss": 0.4028, + "step": 820 + }, + { + "epoch": 0.7626567580120762, + "grad_norm": 0.2666535342900728, + "learning_rate": 4.1445783132530125e-05, + "loss": 0.4064, + "step": 821 + }, + { + "epoch": 0.763585694379935, + "grad_norm": 0.33097109067206043, + "learning_rate": 4.1428571428571437e-05, + "loss": 0.4167, + "step": 822 + }, + { + "epoch": 0.7645146307477938, + "grad_norm": 0.27705874830384963, + "learning_rate": 4.1411359724612735e-05, + "loss": 0.4317, + "step": 823 + }, + { + "epoch": 0.7654435671156525, + "grad_norm": 0.27006115913006945, + "learning_rate": 4.1394148020654047e-05, + "loss": 0.4011, + "step": 824 + }, + { + "epoch": 0.7663725034835114, + "grad_norm": 0.2191843080256796, + "learning_rate": 4.137693631669535e-05, + "loss": 0.3877, + "step": 825 + }, + { + "epoch": 0.7673014398513702, + "grad_norm": 0.26765533335243286, + "learning_rate": 4.135972461273666e-05, + "loss": 0.416, + "step": 826 + }, + { + "epoch": 0.768230376219229, + "grad_norm": 0.28444860627784857, + "learning_rate": 4.134251290877797e-05, + "loss": 0.4142, + "step": 827 + }, + { + "epoch": 0.7691593125870878, + "grad_norm": 0.2590581112218012, + "learning_rate": 4.132530120481928e-05, + "loss": 0.455, + "step": 828 + }, + { + "epoch": 0.7700882489549465, + "grad_norm": 0.28511612916261636, + "learning_rate": 4.1308089500860585e-05, + "loss": 0.4439, + "step": 829 + }, + { + "epoch": 0.7710171853228054, + "grad_norm": 0.2724311438434457, + "learning_rate": 4.12908777969019e-05, + "loss": 0.4105, + "step": 830 + }, + { + "epoch": 0.7719461216906642, + "grad_norm": 0.23702064984429358, + "learning_rate": 4.12736660929432e-05, + "loss": 0.4136, + "step": 831 + }, + { + "epoch": 0.772875058058523, + "grad_norm": 0.2892255007964407, + "learning_rate": 4.1256454388984514e-05, + "loss": 0.4089, + "step": 832 + }, + { + "epoch": 0.7738039944263818, + "grad_norm": 0.24990575367096712, + "learning_rate": 4.123924268502582e-05, + "loss": 0.443, + "step": 833 + }, + { + "epoch": 0.7747329307942405, + "grad_norm": 0.3184046525164543, + "learning_rate": 4.122203098106713e-05, + "loss": 0.4318, + "step": 834 + }, + { + "epoch": 0.7756618671620994, + "grad_norm": 0.2673099414060095, + "learning_rate": 4.1204819277108436e-05, + "loss": 0.4013, + "step": 835 + }, + { + "epoch": 0.7765908035299582, + "grad_norm": 0.2925594820844854, + "learning_rate": 4.118760757314975e-05, + "loss": 0.4271, + "step": 836 + }, + { + "epoch": 0.777519739897817, + "grad_norm": 0.22953985012166775, + "learning_rate": 4.117039586919105e-05, + "loss": 0.3745, + "step": 837 + }, + { + "epoch": 0.7784486762656758, + "grad_norm": 0.33609789574689314, + "learning_rate": 4.1153184165232364e-05, + "loss": 0.4037, + "step": 838 + }, + { + "epoch": 0.7793776126335346, + "grad_norm": 0.24182783755842846, + "learning_rate": 4.113597246127367e-05, + "loss": 0.3996, + "step": 839 + }, + { + "epoch": 0.7803065490013934, + "grad_norm": 0.24661825820680278, + "learning_rate": 4.111876075731498e-05, + "loss": 0.4, + "step": 840 + }, + { + "epoch": 0.7812354853692522, + "grad_norm": 0.2676051683765745, + "learning_rate": 4.1101549053356286e-05, + "loss": 0.4341, + "step": 841 + }, + { + "epoch": 0.782164421737111, + "grad_norm": 0.2302592197123923, + "learning_rate": 4.10843373493976e-05, + "loss": 0.3956, + "step": 842 + }, + { + "epoch": 0.7830933581049698, + "grad_norm": 0.26234103881814524, + "learning_rate": 4.10671256454389e-05, + "loss": 0.4011, + "step": 843 + }, + { + "epoch": 0.7840222944728286, + "grad_norm": 0.2801794293079212, + "learning_rate": 4.104991394148021e-05, + "loss": 0.4329, + "step": 844 + }, + { + "epoch": 0.7849512308406874, + "grad_norm": 0.23397262641748529, + "learning_rate": 4.103270223752151e-05, + "loss": 0.3709, + "step": 845 + }, + { + "epoch": 0.7858801672085463, + "grad_norm": 0.3544703249382208, + "learning_rate": 4.1015490533562825e-05, + "loss": 0.4171, + "step": 846 + }, + { + "epoch": 0.786809103576405, + "grad_norm": 0.2722275854852537, + "learning_rate": 4.099827882960413e-05, + "loss": 0.4337, + "step": 847 + }, + { + "epoch": 0.7877380399442638, + "grad_norm": 0.26553648559179016, + "learning_rate": 4.098106712564544e-05, + "loss": 0.4173, + "step": 848 + }, + { + "epoch": 0.7886669763121226, + "grad_norm": 0.2678696587952733, + "learning_rate": 4.0963855421686746e-05, + "loss": 0.4111, + "step": 849 + }, + { + "epoch": 0.7895959126799814, + "grad_norm": 0.2657158376139536, + "learning_rate": 4.094664371772806e-05, + "loss": 0.4126, + "step": 850 + }, + { + "epoch": 0.7905248490478403, + "grad_norm": 0.287284238795583, + "learning_rate": 4.092943201376936e-05, + "loss": 0.4207, + "step": 851 + }, + { + "epoch": 0.791453785415699, + "grad_norm": 0.2754176953959211, + "learning_rate": 4.0912220309810675e-05, + "loss": 0.4259, + "step": 852 + }, + { + "epoch": 0.7923827217835578, + "grad_norm": 0.262107512607669, + "learning_rate": 4.089500860585198e-05, + "loss": 0.4127, + "step": 853 + }, + { + "epoch": 0.7933116581514166, + "grad_norm": 0.24162465041652548, + "learning_rate": 4.087779690189329e-05, + "loss": 0.4009, + "step": 854 + }, + { + "epoch": 0.7942405945192754, + "grad_norm": 0.266448063011791, + "learning_rate": 4.08605851979346e-05, + "loss": 0.4271, + "step": 855 + }, + { + "epoch": 0.7951695308871343, + "grad_norm": 0.24051279422477578, + "learning_rate": 4.084337349397591e-05, + "loss": 0.4089, + "step": 856 + }, + { + "epoch": 0.796098467254993, + "grad_norm": 0.2729628116438206, + "learning_rate": 4.0826161790017214e-05, + "loss": 0.4167, + "step": 857 + }, + { + "epoch": 0.7970274036228518, + "grad_norm": 0.23410719385943532, + "learning_rate": 4.0808950086058525e-05, + "loss": 0.4179, + "step": 858 + }, + { + "epoch": 0.7979563399907106, + "grad_norm": 0.30219591202715035, + "learning_rate": 4.079173838209983e-05, + "loss": 0.4138, + "step": 859 + }, + { + "epoch": 0.7988852763585694, + "grad_norm": 0.23985576065241604, + "learning_rate": 4.077452667814114e-05, + "loss": 0.4075, + "step": 860 + }, + { + "epoch": 0.7998142127264283, + "grad_norm": 0.25029357115071, + "learning_rate": 4.075731497418245e-05, + "loss": 0.4118, + "step": 861 + }, + { + "epoch": 0.800743149094287, + "grad_norm": 0.29345741652304597, + "learning_rate": 4.074010327022376e-05, + "loss": 0.397, + "step": 862 + }, + { + "epoch": 0.8016720854621459, + "grad_norm": 0.24385532620239805, + "learning_rate": 4.0722891566265064e-05, + "loss": 0.4, + "step": 863 + }, + { + "epoch": 0.8026010218300046, + "grad_norm": 0.2637648833416592, + "learning_rate": 4.0705679862306376e-05, + "loss": 0.4219, + "step": 864 + }, + { + "epoch": 0.8035299581978634, + "grad_norm": 0.25527572185866787, + "learning_rate": 4.0688468158347674e-05, + "loss": 0.4007, + "step": 865 + }, + { + "epoch": 0.8044588945657223, + "grad_norm": 0.2513201236957386, + "learning_rate": 4.0671256454388986e-05, + "loss": 0.3955, + "step": 866 + }, + { + "epoch": 0.805387830933581, + "grad_norm": 0.25660774924232826, + "learning_rate": 4.065404475043029e-05, + "loss": 0.3787, + "step": 867 + }, + { + "epoch": 0.8063167673014399, + "grad_norm": 0.26206437729581916, + "learning_rate": 4.06368330464716e-05, + "loss": 0.4196, + "step": 868 + }, + { + "epoch": 0.8072457036692986, + "grad_norm": 0.24835895191817717, + "learning_rate": 4.061962134251291e-05, + "loss": 0.3994, + "step": 869 + }, + { + "epoch": 0.8081746400371574, + "grad_norm": 0.26426141442192896, + "learning_rate": 4.060240963855422e-05, + "loss": 0.4254, + "step": 870 + }, + { + "epoch": 0.8091035764050163, + "grad_norm": 0.24819792871771582, + "learning_rate": 4.0585197934595524e-05, + "loss": 0.4124, + "step": 871 + }, + { + "epoch": 0.810032512772875, + "grad_norm": 0.2444630039170481, + "learning_rate": 4.0567986230636836e-05, + "loss": 0.4374, + "step": 872 + }, + { + "epoch": 0.8109614491407339, + "grad_norm": 0.25882143197020263, + "learning_rate": 4.055077452667814e-05, + "loss": 0.4224, + "step": 873 + }, + { + "epoch": 0.8118903855085927, + "grad_norm": 0.22956628354423742, + "learning_rate": 4.053356282271945e-05, + "loss": 0.4014, + "step": 874 + }, + { + "epoch": 0.8128193218764515, + "grad_norm": 0.264956195080769, + "learning_rate": 4.051635111876076e-05, + "loss": 0.3938, + "step": 875 + }, + { + "epoch": 0.8137482582443103, + "grad_norm": 0.24985343436278015, + "learning_rate": 4.049913941480207e-05, + "loss": 0.4223, + "step": 876 + }, + { + "epoch": 0.814677194612169, + "grad_norm": 0.2864398731906139, + "learning_rate": 4.0481927710843375e-05, + "loss": 0.3934, + "step": 877 + }, + { + "epoch": 0.8156061309800279, + "grad_norm": 0.24292187852748645, + "learning_rate": 4.046471600688469e-05, + "loss": 0.3944, + "step": 878 + }, + { + "epoch": 0.8165350673478867, + "grad_norm": 0.2860359553745768, + "learning_rate": 4.044750430292599e-05, + "loss": 0.4127, + "step": 879 + }, + { + "epoch": 0.8174640037157455, + "grad_norm": 0.29668643281017715, + "learning_rate": 4.0430292598967303e-05, + "loss": 0.3998, + "step": 880 + }, + { + "epoch": 0.8183929400836043, + "grad_norm": 0.278439860888243, + "learning_rate": 4.041308089500861e-05, + "loss": 0.3744, + "step": 881 + }, + { + "epoch": 0.819321876451463, + "grad_norm": 0.2763340216330821, + "learning_rate": 4.039586919104992e-05, + "loss": 0.4046, + "step": 882 + }, + { + "epoch": 0.8202508128193219, + "grad_norm": 0.2578084168559668, + "learning_rate": 4.0378657487091225e-05, + "loss": 0.4144, + "step": 883 + }, + { + "epoch": 0.8211797491871807, + "grad_norm": 0.2704411317528433, + "learning_rate": 4.036144578313254e-05, + "loss": 0.3938, + "step": 884 + }, + { + "epoch": 0.8221086855550395, + "grad_norm": 0.2216930759924396, + "learning_rate": 4.034423407917384e-05, + "loss": 0.3976, + "step": 885 + }, + { + "epoch": 0.8230376219228983, + "grad_norm": 0.23736907014407435, + "learning_rate": 4.032702237521515e-05, + "loss": 0.4032, + "step": 886 + }, + { + "epoch": 0.823966558290757, + "grad_norm": 0.2751484069936401, + "learning_rate": 4.030981067125645e-05, + "loss": 0.4142, + "step": 887 + }, + { + "epoch": 0.8248954946586159, + "grad_norm": 0.22573606403051907, + "learning_rate": 4.0292598967297764e-05, + "loss": 0.3922, + "step": 888 + }, + { + "epoch": 0.8258244310264747, + "grad_norm": 0.2602795860148468, + "learning_rate": 4.027538726333907e-05, + "loss": 0.4139, + "step": 889 + }, + { + "epoch": 0.8267533673943335, + "grad_norm": 0.24581668806400517, + "learning_rate": 4.025817555938038e-05, + "loss": 0.4173, + "step": 890 + }, + { + "epoch": 0.8276823037621923, + "grad_norm": 0.23586598943899656, + "learning_rate": 4.0240963855421686e-05, + "loss": 0.397, + "step": 891 + }, + { + "epoch": 0.828611240130051, + "grad_norm": 0.22698815324965912, + "learning_rate": 4.0223752151463e-05, + "loss": 0.397, + "step": 892 + }, + { + "epoch": 0.8295401764979099, + "grad_norm": 0.251756454856816, + "learning_rate": 4.02065404475043e-05, + "loss": 0.43, + "step": 893 + }, + { + "epoch": 0.8304691128657687, + "grad_norm": 0.23474799972973084, + "learning_rate": 4.0189328743545614e-05, + "loss": 0.4232, + "step": 894 + }, + { + "epoch": 0.8313980492336275, + "grad_norm": 0.2284304060585405, + "learning_rate": 4.017211703958692e-05, + "loss": 0.4363, + "step": 895 + }, + { + "epoch": 0.8323269856014863, + "grad_norm": 0.22189967592933565, + "learning_rate": 4.015490533562823e-05, + "loss": 0.3996, + "step": 896 + }, + { + "epoch": 0.8332559219693451, + "grad_norm": 0.24727551944573103, + "learning_rate": 4.0137693631669536e-05, + "loss": 0.3881, + "step": 897 + }, + { + "epoch": 0.8341848583372039, + "grad_norm": 0.26502357788364, + "learning_rate": 4.012048192771085e-05, + "loss": 0.4345, + "step": 898 + }, + { + "epoch": 0.8351137947050628, + "grad_norm": 0.24506267621395414, + "learning_rate": 4.010327022375215e-05, + "loss": 0.408, + "step": 899 + }, + { + "epoch": 0.8360427310729215, + "grad_norm": 0.2686607976921608, + "learning_rate": 4.0086058519793465e-05, + "loss": 0.4284, + "step": 900 + }, + { + "epoch": 0.8369716674407803, + "grad_norm": 0.22658254041478237, + "learning_rate": 4.006884681583477e-05, + "loss": 0.4036, + "step": 901 + }, + { + "epoch": 0.8379006038086391, + "grad_norm": 0.24269121548237235, + "learning_rate": 4.005163511187608e-05, + "loss": 0.4158, + "step": 902 + }, + { + "epoch": 0.8388295401764979, + "grad_norm": 0.27624389649170866, + "learning_rate": 4.0034423407917386e-05, + "loss": 0.4067, + "step": 903 + }, + { + "epoch": 0.8397584765443568, + "grad_norm": 0.24834115738955598, + "learning_rate": 4.00172117039587e-05, + "loss": 0.4068, + "step": 904 + }, + { + "epoch": 0.8406874129122155, + "grad_norm": 0.25359081776709796, + "learning_rate": 4e-05, + "loss": 0.4013, + "step": 905 + }, + { + "epoch": 0.8416163492800743, + "grad_norm": 0.2621593127454798, + "learning_rate": 3.9982788296041315e-05, + "loss": 0.4019, + "step": 906 + }, + { + "epoch": 0.8425452856479331, + "grad_norm": 0.22773835872678272, + "learning_rate": 3.996557659208262e-05, + "loss": 0.3898, + "step": 907 + }, + { + "epoch": 0.8434742220157919, + "grad_norm": 0.25454356735572764, + "learning_rate": 3.9948364888123925e-05, + "loss": 0.4041, + "step": 908 + }, + { + "epoch": 0.8444031583836508, + "grad_norm": 0.24027047666859136, + "learning_rate": 3.993115318416523e-05, + "loss": 0.4158, + "step": 909 + }, + { + "epoch": 0.8453320947515095, + "grad_norm": 0.2485921595211011, + "learning_rate": 3.991394148020654e-05, + "loss": 0.4101, + "step": 910 + }, + { + "epoch": 0.8462610311193683, + "grad_norm": 0.22285216317499973, + "learning_rate": 3.989672977624785e-05, + "loss": 0.3969, + "step": 911 + }, + { + "epoch": 0.8471899674872271, + "grad_norm": 0.2596638733050711, + "learning_rate": 3.987951807228916e-05, + "loss": 0.42, + "step": 912 + }, + { + "epoch": 0.8481189038550859, + "grad_norm": 0.23550833352781333, + "learning_rate": 3.9862306368330464e-05, + "loss": 0.4136, + "step": 913 + }, + { + "epoch": 0.8490478402229448, + "grad_norm": 0.24273175946950082, + "learning_rate": 3.9845094664371775e-05, + "loss": 0.4347, + "step": 914 + }, + { + "epoch": 0.8499767765908035, + "grad_norm": 0.2571365495317302, + "learning_rate": 3.982788296041308e-05, + "loss": 0.4322, + "step": 915 + }, + { + "epoch": 0.8509057129586624, + "grad_norm": 0.22272682206410346, + "learning_rate": 3.981067125645439e-05, + "loss": 0.382, + "step": 916 + }, + { + "epoch": 0.8518346493265211, + "grad_norm": 0.22974977779029027, + "learning_rate": 3.97934595524957e-05, + "loss": 0.3874, + "step": 917 + }, + { + "epoch": 0.8527635856943799, + "grad_norm": 0.23626085060684773, + "learning_rate": 3.977624784853701e-05, + "loss": 0.4205, + "step": 918 + }, + { + "epoch": 0.8536925220622388, + "grad_norm": 0.2711925943505794, + "learning_rate": 3.9759036144578314e-05, + "loss": 0.4048, + "step": 919 + }, + { + "epoch": 0.8546214584300975, + "grad_norm": 0.23373429510160257, + "learning_rate": 3.9741824440619626e-05, + "loss": 0.4183, + "step": 920 + }, + { + "epoch": 0.8555503947979564, + "grad_norm": 0.269906941097679, + "learning_rate": 3.972461273666093e-05, + "loss": 0.4187, + "step": 921 + }, + { + "epoch": 0.8564793311658151, + "grad_norm": 0.21569823380266548, + "learning_rate": 3.970740103270224e-05, + "loss": 0.3984, + "step": 922 + }, + { + "epoch": 0.8574082675336739, + "grad_norm": 0.23805027630937817, + "learning_rate": 3.969018932874355e-05, + "loss": 0.4166, + "step": 923 + }, + { + "epoch": 0.8583372039015328, + "grad_norm": 0.23069245503461422, + "learning_rate": 3.967297762478486e-05, + "loss": 0.4262, + "step": 924 + }, + { + "epoch": 0.8592661402693915, + "grad_norm": 0.2428482973970677, + "learning_rate": 3.9655765920826164e-05, + "loss": 0.4168, + "step": 925 + }, + { + "epoch": 0.8601950766372504, + "grad_norm": 0.2183870165831691, + "learning_rate": 3.9638554216867476e-05, + "loss": 0.4051, + "step": 926 + }, + { + "epoch": 0.8611240130051091, + "grad_norm": 0.25786863909064817, + "learning_rate": 3.962134251290878e-05, + "loss": 0.4024, + "step": 927 + }, + { + "epoch": 0.862052949372968, + "grad_norm": 0.21274681020345573, + "learning_rate": 3.960413080895009e-05, + "loss": 0.4056, + "step": 928 + }, + { + "epoch": 0.8629818857408268, + "grad_norm": 0.2805951596096335, + "learning_rate": 3.958691910499139e-05, + "loss": 0.4251, + "step": 929 + }, + { + "epoch": 0.8639108221086855, + "grad_norm": 0.22683405969287646, + "learning_rate": 3.95697074010327e-05, + "loss": 0.4217, + "step": 930 + }, + { + "epoch": 0.8648397584765444, + "grad_norm": 0.26021009912177695, + "learning_rate": 3.955249569707401e-05, + "loss": 0.4219, + "step": 931 + }, + { + "epoch": 0.8657686948444031, + "grad_norm": 0.24224641967846924, + "learning_rate": 3.953528399311532e-05, + "loss": 0.424, + "step": 932 + }, + { + "epoch": 0.866697631212262, + "grad_norm": 0.22247989096036871, + "learning_rate": 3.9518072289156625e-05, + "loss": 0.3968, + "step": 933 + }, + { + "epoch": 0.8676265675801208, + "grad_norm": 0.22730388207221008, + "learning_rate": 3.950086058519794e-05, + "loss": 0.4027, + "step": 934 + }, + { + "epoch": 0.8685555039479795, + "grad_norm": 0.22984491491807468, + "learning_rate": 3.948364888123924e-05, + "loss": 0.4015, + "step": 935 + }, + { + "epoch": 0.8694844403158384, + "grad_norm": 0.2706739171118266, + "learning_rate": 3.9466437177280554e-05, + "loss": 0.4057, + "step": 936 + }, + { + "epoch": 0.8704133766836971, + "grad_norm": 0.2869836202377347, + "learning_rate": 3.944922547332186e-05, + "loss": 0.4169, + "step": 937 + }, + { + "epoch": 0.871342313051556, + "grad_norm": 0.24144907861227421, + "learning_rate": 3.943201376936317e-05, + "loss": 0.3895, + "step": 938 + }, + { + "epoch": 0.8722712494194148, + "grad_norm": 0.2318365608955253, + "learning_rate": 3.9414802065404475e-05, + "loss": 0.3887, + "step": 939 + }, + { + "epoch": 0.8732001857872735, + "grad_norm": 0.24391204560861726, + "learning_rate": 3.939759036144579e-05, + "loss": 0.4144, + "step": 940 + }, + { + "epoch": 0.8741291221551324, + "grad_norm": 0.24279276074096975, + "learning_rate": 3.938037865748709e-05, + "loss": 0.4105, + "step": 941 + }, + { + "epoch": 0.8750580585229911, + "grad_norm": 0.2206737590128525, + "learning_rate": 3.9363166953528404e-05, + "loss": 0.4055, + "step": 942 + }, + { + "epoch": 0.87598699489085, + "grad_norm": 0.2501829527519811, + "learning_rate": 3.934595524956971e-05, + "loss": 0.419, + "step": 943 + }, + { + "epoch": 0.8769159312587088, + "grad_norm": 0.20113559459810643, + "learning_rate": 3.932874354561102e-05, + "loss": 0.3995, + "step": 944 + }, + { + "epoch": 0.8778448676265675, + "grad_norm": 0.2590015788880731, + "learning_rate": 3.9311531841652326e-05, + "loss": 0.4106, + "step": 945 + }, + { + "epoch": 0.8787738039944264, + "grad_norm": 0.21875502199045596, + "learning_rate": 3.929432013769364e-05, + "loss": 0.4137, + "step": 946 + }, + { + "epoch": 0.8797027403622851, + "grad_norm": 0.22619403593853593, + "learning_rate": 3.927710843373494e-05, + "loss": 0.3897, + "step": 947 + }, + { + "epoch": 0.880631676730144, + "grad_norm": 0.28274876745197, + "learning_rate": 3.9259896729776254e-05, + "loss": 0.4243, + "step": 948 + }, + { + "epoch": 0.8815606130980028, + "grad_norm": 0.2550591340833801, + "learning_rate": 3.924268502581756e-05, + "loss": 0.4262, + "step": 949 + }, + { + "epoch": 0.8824895494658616, + "grad_norm": 0.28300258572852405, + "learning_rate": 3.9225473321858864e-05, + "loss": 0.3981, + "step": 950 + }, + { + "epoch": 0.8834184858337204, + "grad_norm": 0.2651505216641192, + "learning_rate": 3.920826161790017e-05, + "loss": 0.3987, + "step": 951 + }, + { + "epoch": 0.8843474222015791, + "grad_norm": 0.2699525095302782, + "learning_rate": 3.919104991394148e-05, + "loss": 0.3963, + "step": 952 + }, + { + "epoch": 0.885276358569438, + "grad_norm": 0.24632775459420023, + "learning_rate": 3.9173838209982786e-05, + "loss": 0.3793, + "step": 953 + }, + { + "epoch": 0.8862052949372968, + "grad_norm": 0.2729975548782007, + "learning_rate": 3.91566265060241e-05, + "loss": 0.4259, + "step": 954 + }, + { + "epoch": 0.8871342313051556, + "grad_norm": 0.23113626254645106, + "learning_rate": 3.91394148020654e-05, + "loss": 0.4118, + "step": 955 + }, + { + "epoch": 0.8880631676730144, + "grad_norm": 0.2493442523902469, + "learning_rate": 3.9122203098106715e-05, + "loss": 0.4277, + "step": 956 + }, + { + "epoch": 0.8889921040408733, + "grad_norm": 0.24785873510336145, + "learning_rate": 3.910499139414802e-05, + "loss": 0.4065, + "step": 957 + }, + { + "epoch": 0.889921040408732, + "grad_norm": 0.24341153477797958, + "learning_rate": 3.908777969018933e-05, + "loss": 0.4037, + "step": 958 + }, + { + "epoch": 0.8908499767765908, + "grad_norm": 0.23299336958401565, + "learning_rate": 3.9070567986230637e-05, + "loss": 0.4018, + "step": 959 + }, + { + "epoch": 0.8917789131444496, + "grad_norm": 0.27717768327110737, + "learning_rate": 3.905335628227195e-05, + "loss": 0.4136, + "step": 960 + }, + { + "epoch": 0.8927078495123084, + "grad_norm": 0.2634600076134501, + "learning_rate": 3.903614457831325e-05, + "loss": 0.4151, + "step": 961 + }, + { + "epoch": 0.8936367858801673, + "grad_norm": 0.2431673757388936, + "learning_rate": 3.9018932874354565e-05, + "loss": 0.4046, + "step": 962 + }, + { + "epoch": 0.894565722248026, + "grad_norm": 0.2873899288522361, + "learning_rate": 3.900172117039587e-05, + "loss": 0.4029, + "step": 963 + }, + { + "epoch": 0.8954946586158848, + "grad_norm": 0.2068486702586044, + "learning_rate": 3.898450946643718e-05, + "loss": 0.3862, + "step": 964 + }, + { + "epoch": 0.8964235949837436, + "grad_norm": 0.21678196583034384, + "learning_rate": 3.896729776247849e-05, + "loss": 0.3926, + "step": 965 + }, + { + "epoch": 0.8973525313516024, + "grad_norm": 0.23384577673120308, + "learning_rate": 3.89500860585198e-05, + "loss": 0.3986, + "step": 966 + }, + { + "epoch": 0.8982814677194613, + "grad_norm": 0.2168835780899871, + "learning_rate": 3.8932874354561104e-05, + "loss": 0.4117, + "step": 967 + }, + { + "epoch": 0.89921040408732, + "grad_norm": 0.2182530040951482, + "learning_rate": 3.8915662650602416e-05, + "loss": 0.3993, + "step": 968 + }, + { + "epoch": 0.9001393404551788, + "grad_norm": 0.2279085259514714, + "learning_rate": 3.889845094664372e-05, + "loss": 0.4179, + "step": 969 + }, + { + "epoch": 0.9010682768230376, + "grad_norm": 0.2161021312161298, + "learning_rate": 3.888123924268503e-05, + "loss": 0.4196, + "step": 970 + }, + { + "epoch": 0.9019972131908964, + "grad_norm": 0.23665046914396706, + "learning_rate": 3.886402753872633e-05, + "loss": 0.4151, + "step": 971 + }, + { + "epoch": 0.9029261495587553, + "grad_norm": 0.2153174695085264, + "learning_rate": 3.884681583476764e-05, + "loss": 0.392, + "step": 972 + }, + { + "epoch": 0.903855085926614, + "grad_norm": 0.22865056532137162, + "learning_rate": 3.882960413080895e-05, + "loss": 0.4285, + "step": 973 + }, + { + "epoch": 0.9047840222944729, + "grad_norm": 0.24540292705433664, + "learning_rate": 3.881239242685026e-05, + "loss": 0.4085, + "step": 974 + }, + { + "epoch": 0.9057129586623316, + "grad_norm": 0.21512013945160077, + "learning_rate": 3.8795180722891564e-05, + "loss": 0.4001, + "step": 975 + }, + { + "epoch": 0.9066418950301904, + "grad_norm": 0.2543665399130075, + "learning_rate": 3.8777969018932876e-05, + "loss": 0.4233, + "step": 976 + }, + { + "epoch": 0.9075708313980493, + "grad_norm": 0.2604590648729307, + "learning_rate": 3.876075731497418e-05, + "loss": 0.4315, + "step": 977 + }, + { + "epoch": 0.908499767765908, + "grad_norm": 0.2426498446435362, + "learning_rate": 3.874354561101549e-05, + "loss": 0.4194, + "step": 978 + }, + { + "epoch": 0.9094287041337669, + "grad_norm": 0.21692216609161416, + "learning_rate": 3.87263339070568e-05, + "loss": 0.392, + "step": 979 + }, + { + "epoch": 0.9103576405016256, + "grad_norm": 0.24908430824930783, + "learning_rate": 3.870912220309811e-05, + "loss": 0.3719, + "step": 980 + }, + { + "epoch": 0.9112865768694844, + "grad_norm": 0.2159682412651961, + "learning_rate": 3.8691910499139415e-05, + "loss": 0.3898, + "step": 981 + }, + { + "epoch": 0.9122155132373433, + "grad_norm": 0.21127012379911123, + "learning_rate": 3.8674698795180726e-05, + "loss": 0.4248, + "step": 982 + }, + { + "epoch": 0.913144449605202, + "grad_norm": 0.22813635269217872, + "learning_rate": 3.865748709122203e-05, + "loss": 0.4244, + "step": 983 + }, + { + "epoch": 0.9140733859730609, + "grad_norm": 0.23743928654284238, + "learning_rate": 3.864027538726334e-05, + "loss": 0.4214, + "step": 984 + }, + { + "epoch": 0.9150023223409196, + "grad_norm": 0.2186026650640482, + "learning_rate": 3.862306368330465e-05, + "loss": 0.405, + "step": 985 + }, + { + "epoch": 0.9159312587087785, + "grad_norm": 0.257356309199639, + "learning_rate": 3.860585197934596e-05, + "loss": 0.3917, + "step": 986 + }, + { + "epoch": 0.9168601950766373, + "grad_norm": 0.19464718995992264, + "learning_rate": 3.8588640275387265e-05, + "loss": 0.4049, + "step": 987 + }, + { + "epoch": 0.917789131444496, + "grad_norm": 0.25657951632861403, + "learning_rate": 3.857142857142858e-05, + "loss": 0.4186, + "step": 988 + }, + { + "epoch": 0.9187180678123549, + "grad_norm": 0.22465676748227606, + "learning_rate": 3.855421686746988e-05, + "loss": 0.4345, + "step": 989 + }, + { + "epoch": 0.9196470041802136, + "grad_norm": 0.2506754334740293, + "learning_rate": 3.8537005163511194e-05, + "loss": 0.4025, + "step": 990 + }, + { + "epoch": 0.9205759405480725, + "grad_norm": 0.2340373514090616, + "learning_rate": 3.85197934595525e-05, + "loss": 0.4256, + "step": 991 + }, + { + "epoch": 0.9215048769159313, + "grad_norm": 0.23036227594793113, + "learning_rate": 3.8502581755593804e-05, + "loss": 0.4176, + "step": 992 + }, + { + "epoch": 0.92243381328379, + "grad_norm": 0.23093971462681168, + "learning_rate": 3.848537005163511e-05, + "loss": 0.4007, + "step": 993 + }, + { + "epoch": 0.9233627496516489, + "grad_norm": 0.2317551394646806, + "learning_rate": 3.846815834767642e-05, + "loss": 0.405, + "step": 994 + }, + { + "epoch": 0.9242916860195076, + "grad_norm": 0.21158502861181783, + "learning_rate": 3.8450946643717725e-05, + "loss": 0.3942, + "step": 995 + }, + { + "epoch": 0.9252206223873665, + "grad_norm": 0.2537032563068634, + "learning_rate": 3.843373493975904e-05, + "loss": 0.3791, + "step": 996 + }, + { + "epoch": 0.9261495587552253, + "grad_norm": 0.2239536974059551, + "learning_rate": 3.841652323580034e-05, + "loss": 0.4064, + "step": 997 + }, + { + "epoch": 0.927078495123084, + "grad_norm": 0.25068704163807093, + "learning_rate": 3.8399311531841654e-05, + "loss": 0.4273, + "step": 998 + }, + { + "epoch": 0.9280074314909429, + "grad_norm": 0.2457256835693048, + "learning_rate": 3.838209982788296e-05, + "loss": 0.4096, + "step": 999 + }, + { + "epoch": 0.9289363678588016, + "grad_norm": 0.23667198447703158, + "learning_rate": 3.836488812392427e-05, + "loss": 0.3934, + "step": 1000 + }, + { + "epoch": 0.9298653042266605, + "grad_norm": 0.24449737642939945, + "learning_rate": 3.8347676419965576e-05, + "loss": 0.409, + "step": 1001 + }, + { + "epoch": 0.9307942405945193, + "grad_norm": 0.24650595873011708, + "learning_rate": 3.833046471600689e-05, + "loss": 0.4053, + "step": 1002 + }, + { + "epoch": 0.931723176962378, + "grad_norm": 0.26151738161916, + "learning_rate": 3.831325301204819e-05, + "loss": 0.4104, + "step": 1003 + }, + { + "epoch": 0.9326521133302369, + "grad_norm": 0.2310178657482112, + "learning_rate": 3.8296041308089504e-05, + "loss": 0.3937, + "step": 1004 + }, + { + "epoch": 0.9335810496980956, + "grad_norm": 0.23546622207591916, + "learning_rate": 3.827882960413081e-05, + "loss": 0.4041, + "step": 1005 + }, + { + "epoch": 0.9345099860659545, + "grad_norm": 0.2607486009527593, + "learning_rate": 3.826161790017212e-05, + "loss": 0.4353, + "step": 1006 + }, + { + "epoch": 0.9354389224338133, + "grad_norm": 0.2392967181255042, + "learning_rate": 3.8244406196213426e-05, + "loss": 0.3939, + "step": 1007 + }, + { + "epoch": 0.9363678588016721, + "grad_norm": 0.21674134867792574, + "learning_rate": 3.822719449225474e-05, + "loss": 0.388, + "step": 1008 + }, + { + "epoch": 0.9372967951695309, + "grad_norm": 0.24940419360087127, + "learning_rate": 3.820998278829604e-05, + "loss": 0.4239, + "step": 1009 + }, + { + "epoch": 0.9382257315373896, + "grad_norm": 0.2596709039538196, + "learning_rate": 3.8192771084337355e-05, + "loss": 0.4005, + "step": 1010 + }, + { + "epoch": 0.9391546679052485, + "grad_norm": 0.23606867754298808, + "learning_rate": 3.817555938037866e-05, + "loss": 0.4374, + "step": 1011 + }, + { + "epoch": 0.9400836042731073, + "grad_norm": 0.27125016523755546, + "learning_rate": 3.815834767641997e-05, + "loss": 0.4426, + "step": 1012 + }, + { + "epoch": 0.9410125406409661, + "grad_norm": 0.24501601889271793, + "learning_rate": 3.8141135972461277e-05, + "loss": 0.3874, + "step": 1013 + }, + { + "epoch": 0.9419414770088249, + "grad_norm": 0.2442683405582566, + "learning_rate": 3.812392426850258e-05, + "loss": 0.4047, + "step": 1014 + }, + { + "epoch": 0.9428704133766836, + "grad_norm": 0.26685613518926404, + "learning_rate": 3.810671256454389e-05, + "loss": 0.436, + "step": 1015 + }, + { + "epoch": 0.9437993497445425, + "grad_norm": 0.23983650732436032, + "learning_rate": 3.80895008605852e-05, + "loss": 0.4119, + "step": 1016 + }, + { + "epoch": 0.9447282861124013, + "grad_norm": 0.2570741433796242, + "learning_rate": 3.8072289156626503e-05, + "loss": 0.4236, + "step": 1017 + }, + { + "epoch": 0.9456572224802601, + "grad_norm": 0.2544931404350317, + "learning_rate": 3.8055077452667815e-05, + "loss": 0.3768, + "step": 1018 + }, + { + "epoch": 0.9465861588481189, + "grad_norm": 0.2345446843587865, + "learning_rate": 3.803786574870912e-05, + "loss": 0.4143, + "step": 1019 + }, + { + "epoch": 0.9475150952159777, + "grad_norm": 0.24384024290773332, + "learning_rate": 3.802065404475043e-05, + "loss": 0.4044, + "step": 1020 + }, + { + "epoch": 0.9484440315838365, + "grad_norm": 0.21208048637232893, + "learning_rate": 3.800344234079174e-05, + "loss": 0.4217, + "step": 1021 + }, + { + "epoch": 0.9493729679516953, + "grad_norm": 0.2599635780830963, + "learning_rate": 3.798623063683305e-05, + "loss": 0.4122, + "step": 1022 + }, + { + "epoch": 0.9503019043195541, + "grad_norm": 0.2391907225652313, + "learning_rate": 3.7969018932874354e-05, + "loss": 0.3861, + "step": 1023 + }, + { + "epoch": 0.9512308406874129, + "grad_norm": 0.21784120153158443, + "learning_rate": 3.7951807228915666e-05, + "loss": 0.4017, + "step": 1024 + }, + { + "epoch": 0.9521597770552717, + "grad_norm": 0.2868793994136997, + "learning_rate": 3.793459552495697e-05, + "loss": 0.4078, + "step": 1025 + }, + { + "epoch": 0.9530887134231305, + "grad_norm": 0.2303322060460401, + "learning_rate": 3.791738382099828e-05, + "loss": 0.3959, + "step": 1026 + }, + { + "epoch": 0.9540176497909894, + "grad_norm": 0.21929777651344276, + "learning_rate": 3.790017211703959e-05, + "loss": 0.4378, + "step": 1027 + }, + { + "epoch": 0.9549465861588481, + "grad_norm": 0.29921819323800025, + "learning_rate": 3.78829604130809e-05, + "loss": 0.4114, + "step": 1028 + }, + { + "epoch": 0.9558755225267069, + "grad_norm": 0.24792104039539412, + "learning_rate": 3.7865748709122204e-05, + "loss": 0.3957, + "step": 1029 + }, + { + "epoch": 0.9568044588945657, + "grad_norm": 0.2547675598430285, + "learning_rate": 3.7848537005163516e-05, + "loss": 0.4139, + "step": 1030 + }, + { + "epoch": 0.9577333952624245, + "grad_norm": 0.23635621173173713, + "learning_rate": 3.783132530120482e-05, + "loss": 0.4302, + "step": 1031 + }, + { + "epoch": 0.9586623316302834, + "grad_norm": 0.23539058473689725, + "learning_rate": 3.781411359724613e-05, + "loss": 0.4106, + "step": 1032 + }, + { + "epoch": 0.9595912679981421, + "grad_norm": 0.2252626765187598, + "learning_rate": 3.779690189328744e-05, + "loss": 0.4201, + "step": 1033 + }, + { + "epoch": 0.9605202043660009, + "grad_norm": 0.23605984311517597, + "learning_rate": 3.777969018932875e-05, + "loss": 0.4014, + "step": 1034 + }, + { + "epoch": 0.9614491407338597, + "grad_norm": 0.22411023295920124, + "learning_rate": 3.776247848537005e-05, + "loss": 0.3995, + "step": 1035 + }, + { + "epoch": 0.9623780771017185, + "grad_norm": 0.22444324386702214, + "learning_rate": 3.774526678141136e-05, + "loss": 0.396, + "step": 1036 + }, + { + "epoch": 0.9633070134695774, + "grad_norm": 0.2392457838225579, + "learning_rate": 3.7728055077452665e-05, + "loss": 0.3868, + "step": 1037 + }, + { + "epoch": 0.9642359498374361, + "grad_norm": 0.24259642738031353, + "learning_rate": 3.7710843373493976e-05, + "loss": 0.4059, + "step": 1038 + }, + { + "epoch": 0.965164886205295, + "grad_norm": 0.22122843588430882, + "learning_rate": 3.769363166953528e-05, + "loss": 0.3932, + "step": 1039 + }, + { + "epoch": 0.9660938225731538, + "grad_norm": 0.39575069207466423, + "learning_rate": 3.767641996557659e-05, + "loss": 0.4106, + "step": 1040 + }, + { + "epoch": 0.9670227589410125, + "grad_norm": 0.20970275754321213, + "learning_rate": 3.76592082616179e-05, + "loss": 0.3826, + "step": 1041 + }, + { + "epoch": 0.9679516953088714, + "grad_norm": 0.2689513010745634, + "learning_rate": 3.764199655765921e-05, + "loss": 0.4246, + "step": 1042 + }, + { + "epoch": 0.9688806316767301, + "grad_norm": 0.21716008885897425, + "learning_rate": 3.7624784853700515e-05, + "loss": 0.372, + "step": 1043 + }, + { + "epoch": 0.969809568044589, + "grad_norm": 0.19032663763219618, + "learning_rate": 3.760757314974183e-05, + "loss": 0.3871, + "step": 1044 + }, + { + "epoch": 0.9707385044124478, + "grad_norm": 0.24689207170253144, + "learning_rate": 3.759036144578313e-05, + "loss": 0.3887, + "step": 1045 + }, + { + "epoch": 0.9716674407803065, + "grad_norm": 0.23664100739316893, + "learning_rate": 3.7573149741824444e-05, + "loss": 0.403, + "step": 1046 + }, + { + "epoch": 0.9725963771481654, + "grad_norm": 0.22874968861174907, + "learning_rate": 3.755593803786575e-05, + "loss": 0.429, + "step": 1047 + }, + { + "epoch": 0.9735253135160241, + "grad_norm": 0.213785249209633, + "learning_rate": 3.753872633390706e-05, + "loss": 0.3864, + "step": 1048 + }, + { + "epoch": 0.974454249883883, + "grad_norm": 0.24065036290868722, + "learning_rate": 3.7521514629948365e-05, + "loss": 0.433, + "step": 1049 + }, + { + "epoch": 0.9753831862517418, + "grad_norm": 0.22801923121906967, + "learning_rate": 3.750430292598968e-05, + "loss": 0.3929, + "step": 1050 + }, + { + "epoch": 0.9763121226196005, + "grad_norm": 0.24919400482036166, + "learning_rate": 3.748709122203098e-05, + "loss": 0.4002, + "step": 1051 + }, + { + "epoch": 0.9772410589874594, + "grad_norm": 0.24145964608043208, + "learning_rate": 3.7469879518072294e-05, + "loss": 0.4318, + "step": 1052 + }, + { + "epoch": 0.9781699953553181, + "grad_norm": 0.20174232619370144, + "learning_rate": 3.74526678141136e-05, + "loss": 0.3912, + "step": 1053 + }, + { + "epoch": 0.979098931723177, + "grad_norm": 0.21629692614591498, + "learning_rate": 3.743545611015491e-05, + "loss": 0.3904, + "step": 1054 + }, + { + "epoch": 0.9800278680910358, + "grad_norm": 0.22440046201731037, + "learning_rate": 3.7418244406196216e-05, + "loss": 0.4137, + "step": 1055 + }, + { + "epoch": 0.9809568044588945, + "grad_norm": 0.22672281532519148, + "learning_rate": 3.740103270223752e-05, + "loss": 0.3976, + "step": 1056 + }, + { + "epoch": 0.9818857408267534, + "grad_norm": 0.20668391208886464, + "learning_rate": 3.7383820998278826e-05, + "loss": 0.3951, + "step": 1057 + }, + { + "epoch": 0.9828146771946121, + "grad_norm": 0.2142818046995827, + "learning_rate": 3.736660929432014e-05, + "loss": 0.393, + "step": 1058 + }, + { + "epoch": 0.983743613562471, + "grad_norm": 0.21718324928035726, + "learning_rate": 3.734939759036144e-05, + "loss": 0.398, + "step": 1059 + }, + { + "epoch": 0.9846725499303298, + "grad_norm": 0.23550777849566212, + "learning_rate": 3.7332185886402754e-05, + "loss": 0.4065, + "step": 1060 + }, + { + "epoch": 0.9856014862981886, + "grad_norm": 0.2167543874679464, + "learning_rate": 3.731497418244406e-05, + "loss": 0.3927, + "step": 1061 + }, + { + "epoch": 0.9865304226660474, + "grad_norm": 0.2530841425618813, + "learning_rate": 3.729776247848537e-05, + "loss": 0.4099, + "step": 1062 + }, + { + "epoch": 0.9874593590339061, + "grad_norm": 0.2376035990853862, + "learning_rate": 3.7280550774526676e-05, + "loss": 0.4056, + "step": 1063 + }, + { + "epoch": 0.988388295401765, + "grad_norm": 0.22485376239690827, + "learning_rate": 3.726333907056799e-05, + "loss": 0.411, + "step": 1064 + }, + { + "epoch": 0.9893172317696238, + "grad_norm": 0.2364195030509185, + "learning_rate": 3.724612736660929e-05, + "loss": 0.4024, + "step": 1065 + }, + { + "epoch": 0.9902461681374826, + "grad_norm": 0.2741768987248631, + "learning_rate": 3.7228915662650605e-05, + "loss": 0.4172, + "step": 1066 + }, + { + "epoch": 0.9911751045053414, + "grad_norm": 0.2224572039465709, + "learning_rate": 3.721170395869191e-05, + "loss": 0.4063, + "step": 1067 + }, + { + "epoch": 0.9921040408732001, + "grad_norm": 0.25384861733232483, + "learning_rate": 3.719449225473322e-05, + "loss": 0.3874, + "step": 1068 + }, + { + "epoch": 0.993032977241059, + "grad_norm": 0.21782354887827174, + "learning_rate": 3.717728055077453e-05, + "loss": 0.38, + "step": 1069 + }, + { + "epoch": 0.9939619136089178, + "grad_norm": 0.23692026091455595, + "learning_rate": 3.716006884681584e-05, + "loss": 0.4025, + "step": 1070 + }, + { + "epoch": 0.9948908499767766, + "grad_norm": 0.2628360369682366, + "learning_rate": 3.7142857142857143e-05, + "loss": 0.4011, + "step": 1071 + }, + { + "epoch": 0.9958197863446354, + "grad_norm": 0.23586526214391343, + "learning_rate": 3.7125645438898455e-05, + "loss": 0.4004, + "step": 1072 + }, + { + "epoch": 0.9967487227124942, + "grad_norm": 0.2699525810140143, + "learning_rate": 3.710843373493976e-05, + "loss": 0.4211, + "step": 1073 + }, + { + "epoch": 0.997677659080353, + "grad_norm": 0.24858789264268025, + "learning_rate": 3.709122203098107e-05, + "loss": 0.4055, + "step": 1074 + }, + { + "epoch": 0.9986065954482118, + "grad_norm": 0.21173668049329192, + "learning_rate": 3.707401032702238e-05, + "loss": 0.3769, + "step": 1075 + }, + { + "epoch": 0.9995355318160706, + "grad_norm": 0.24262016662999178, + "learning_rate": 3.705679862306369e-05, + "loss": 0.4221, + "step": 1076 + }, + { + "epoch": 1.0, + "grad_norm": 0.24262016662999178, + "learning_rate": 3.703958691910499e-05, + "loss": 0.4056, + "step": 1077 + }, + { + "epoch": 1.0009289363678588, + "grad_norm": 0.42400479184527856, + "learning_rate": 3.70223752151463e-05, + "loss": 0.359, + "step": 1078 + }, + { + "epoch": 1.0018578727357177, + "grad_norm": 0.25465365326859013, + "learning_rate": 3.7005163511187604e-05, + "loss": 0.3744, + "step": 1079 + }, + { + "epoch": 1.0027868091035763, + "grad_norm": 0.2749579483536481, + "learning_rate": 3.6987951807228916e-05, + "loss": 0.3215, + "step": 1080 + }, + { + "epoch": 1.0037157454714352, + "grad_norm": 0.25882510922444807, + "learning_rate": 3.697074010327022e-05, + "loss": 0.3664, + "step": 1081 + }, + { + "epoch": 1.004644681839294, + "grad_norm": 0.25257387474158305, + "learning_rate": 3.695352839931153e-05, + "loss": 0.3439, + "step": 1082 + }, + { + "epoch": 1.0055736182071529, + "grad_norm": 0.2818136562808609, + "learning_rate": 3.693631669535284e-05, + "loss": 0.3495, + "step": 1083 + }, + { + "epoch": 1.0065025545750117, + "grad_norm": 0.2796090387992452, + "learning_rate": 3.691910499139415e-05, + "loss": 0.3577, + "step": 1084 + }, + { + "epoch": 1.0074314909428703, + "grad_norm": 0.25921344914520555, + "learning_rate": 3.6901893287435454e-05, + "loss": 0.3474, + "step": 1085 + }, + { + "epoch": 1.0083604273107292, + "grad_norm": 0.26684815525436895, + "learning_rate": 3.6884681583476766e-05, + "loss": 0.3476, + "step": 1086 + }, + { + "epoch": 1.009289363678588, + "grad_norm": 0.27357439621922414, + "learning_rate": 3.686746987951807e-05, + "loss": 0.3437, + "step": 1087 + }, + { + "epoch": 1.0102183000464469, + "grad_norm": 0.24447102717487307, + "learning_rate": 3.685025817555938e-05, + "loss": 0.3221, + "step": 1088 + }, + { + "epoch": 1.0111472364143057, + "grad_norm": 0.258464222591644, + "learning_rate": 3.683304647160069e-05, + "loss": 0.356, + "step": 1089 + }, + { + "epoch": 1.0120761727821643, + "grad_norm": 0.27925086544400457, + "learning_rate": 3.6815834767642e-05, + "loss": 0.3701, + "step": 1090 + }, + { + "epoch": 1.0130051091500232, + "grad_norm": 0.2457044337838858, + "learning_rate": 3.6798623063683305e-05, + "loss": 0.3458, + "step": 1091 + }, + { + "epoch": 1.013934045517882, + "grad_norm": 0.2973360259118452, + "learning_rate": 3.6781411359724616e-05, + "loss": 0.3503, + "step": 1092 + }, + { + "epoch": 1.0148629818857409, + "grad_norm": 0.24684519541371244, + "learning_rate": 3.676419965576592e-05, + "loss": 0.3555, + "step": 1093 + }, + { + "epoch": 1.0157919182535997, + "grad_norm": 0.2553488145462288, + "learning_rate": 3.674698795180723e-05, + "loss": 0.339, + "step": 1094 + }, + { + "epoch": 1.0167208546214583, + "grad_norm": 0.27698275528497907, + "learning_rate": 3.672977624784854e-05, + "loss": 0.3555, + "step": 1095 + }, + { + "epoch": 1.0176497909893172, + "grad_norm": 0.25512853436869026, + "learning_rate": 3.671256454388985e-05, + "loss": 0.3508, + "step": 1096 + }, + { + "epoch": 1.018578727357176, + "grad_norm": 0.23687230499469353, + "learning_rate": 3.6695352839931155e-05, + "loss": 0.3541, + "step": 1097 + }, + { + "epoch": 1.0195076637250349, + "grad_norm": 0.2363717396988948, + "learning_rate": 3.667814113597246e-05, + "loss": 0.3533, + "step": 1098 + }, + { + "epoch": 1.0204366000928937, + "grad_norm": 0.2303201035801806, + "learning_rate": 3.666092943201377e-05, + "loss": 0.3632, + "step": 1099 + }, + { + "epoch": 1.0213655364607523, + "grad_norm": 0.2459782206416615, + "learning_rate": 3.664371772805508e-05, + "loss": 0.3618, + "step": 1100 + }, + { + "epoch": 1.0222944728286112, + "grad_norm": 0.296616176626298, + "learning_rate": 3.662650602409639e-05, + "loss": 0.3922, + "step": 1101 + }, + { + "epoch": 1.02322340919647, + "grad_norm": 0.2147341891639701, + "learning_rate": 3.6609294320137694e-05, + "loss": 0.3519, + "step": 1102 + }, + { + "epoch": 1.0241523455643289, + "grad_norm": 0.266276617171416, + "learning_rate": 3.6592082616179e-05, + "loss": 0.3377, + "step": 1103 + }, + { + "epoch": 1.0250812819321877, + "grad_norm": 0.23338087166757537, + "learning_rate": 3.657487091222031e-05, + "loss": 0.3436, + "step": 1104 + }, + { + "epoch": 1.0260102183000464, + "grad_norm": 0.26374694027225837, + "learning_rate": 3.6557659208261616e-05, + "loss": 0.3478, + "step": 1105 + }, + { + "epoch": 1.0269391546679052, + "grad_norm": 0.2337037433490987, + "learning_rate": 3.654044750430293e-05, + "loss": 0.3517, + "step": 1106 + }, + { + "epoch": 1.027868091035764, + "grad_norm": 0.2596231255028893, + "learning_rate": 3.652323580034423e-05, + "loss": 0.365, + "step": 1107 + }, + { + "epoch": 1.0287970274036229, + "grad_norm": 0.240620940997802, + "learning_rate": 3.6506024096385544e-05, + "loss": 0.3628, + "step": 1108 + }, + { + "epoch": 1.0297259637714817, + "grad_norm": 0.21426737173957083, + "learning_rate": 3.648881239242685e-05, + "loss": 0.3335, + "step": 1109 + }, + { + "epoch": 1.0306549001393404, + "grad_norm": 0.2229960824044303, + "learning_rate": 3.647160068846816e-05, + "loss": 0.3549, + "step": 1110 + }, + { + "epoch": 1.0315838365071992, + "grad_norm": 0.27079181149762227, + "learning_rate": 3.6454388984509466e-05, + "loss": 0.3767, + "step": 1111 + }, + { + "epoch": 1.032512772875058, + "grad_norm": 0.22945807604481355, + "learning_rate": 3.643717728055078e-05, + "loss": 0.3442, + "step": 1112 + }, + { + "epoch": 1.033441709242917, + "grad_norm": 0.20955895984599512, + "learning_rate": 3.641996557659208e-05, + "loss": 0.3248, + "step": 1113 + }, + { + "epoch": 1.0343706456107757, + "grad_norm": 0.23194107498456232, + "learning_rate": 3.6402753872633395e-05, + "loss": 0.3502, + "step": 1114 + }, + { + "epoch": 1.0352995819786344, + "grad_norm": 0.2564535374989993, + "learning_rate": 3.63855421686747e-05, + "loss": 0.3694, + "step": 1115 + }, + { + "epoch": 1.0362285183464932, + "grad_norm": 0.2280355951773234, + "learning_rate": 3.636833046471601e-05, + "loss": 0.3373, + "step": 1116 + }, + { + "epoch": 1.037157454714352, + "grad_norm": 0.23682133916062154, + "learning_rate": 3.6351118760757316e-05, + "loss": 0.3355, + "step": 1117 + }, + { + "epoch": 1.038086391082211, + "grad_norm": 0.216976014046176, + "learning_rate": 3.633390705679863e-05, + "loss": 0.3476, + "step": 1118 + }, + { + "epoch": 1.0390153274500697, + "grad_norm": 0.2067060037353485, + "learning_rate": 3.631669535283993e-05, + "loss": 0.3515, + "step": 1119 + }, + { + "epoch": 1.0399442638179284, + "grad_norm": 0.26852478563809756, + "learning_rate": 3.629948364888124e-05, + "loss": 0.3687, + "step": 1120 + }, + { + "epoch": 1.0408732001857872, + "grad_norm": 0.23440938063759156, + "learning_rate": 3.628227194492255e-05, + "loss": 0.3676, + "step": 1121 + }, + { + "epoch": 1.041802136553646, + "grad_norm": 0.22839083712400235, + "learning_rate": 3.6265060240963855e-05, + "loss": 0.3483, + "step": 1122 + }, + { + "epoch": 1.042731072921505, + "grad_norm": 0.2868994905545559, + "learning_rate": 3.624784853700517e-05, + "loss": 0.3495, + "step": 1123 + }, + { + "epoch": 1.0436600092893638, + "grad_norm": 0.21384049452616608, + "learning_rate": 3.623063683304647e-05, + "loss": 0.3419, + "step": 1124 + }, + { + "epoch": 1.0445889456572224, + "grad_norm": 0.22545699928123575, + "learning_rate": 3.6213425129087784e-05, + "loss": 0.3507, + "step": 1125 + }, + { + "epoch": 1.0455178820250812, + "grad_norm": 0.2616098875142772, + "learning_rate": 3.619621342512909e-05, + "loss": 0.3684, + "step": 1126 + }, + { + "epoch": 1.04644681839294, + "grad_norm": 0.20808430540664596, + "learning_rate": 3.6179001721170394e-05, + "loss": 0.331, + "step": 1127 + }, + { + "epoch": 1.047375754760799, + "grad_norm": 0.2447603473682486, + "learning_rate": 3.6161790017211705e-05, + "loss": 0.3455, + "step": 1128 + }, + { + "epoch": 1.0483046911286578, + "grad_norm": 0.2500131614712927, + "learning_rate": 3.614457831325301e-05, + "loss": 0.3361, + "step": 1129 + }, + { + "epoch": 1.0492336274965164, + "grad_norm": 0.2325076708728905, + "learning_rate": 3.612736660929432e-05, + "loss": 0.3345, + "step": 1130 + }, + { + "epoch": 1.0501625638643752, + "grad_norm": 0.3372982804928654, + "learning_rate": 3.611015490533563e-05, + "loss": 0.3442, + "step": 1131 + }, + { + "epoch": 1.051091500232234, + "grad_norm": 0.26364292805811546, + "learning_rate": 3.609294320137694e-05, + "loss": 0.355, + "step": 1132 + }, + { + "epoch": 1.052020436600093, + "grad_norm": 0.2527413633654713, + "learning_rate": 3.6075731497418244e-05, + "loss": 0.3629, + "step": 1133 + }, + { + "epoch": 1.0529493729679518, + "grad_norm": 0.2554529746289192, + "learning_rate": 3.6058519793459556e-05, + "loss": 0.3383, + "step": 1134 + }, + { + "epoch": 1.0538783093358104, + "grad_norm": 0.27047127903067114, + "learning_rate": 3.604130808950086e-05, + "loss": 0.3282, + "step": 1135 + }, + { + "epoch": 1.0548072457036692, + "grad_norm": 0.19736989479380834, + "learning_rate": 3.602409638554217e-05, + "loss": 0.3713, + "step": 1136 + }, + { + "epoch": 1.055736182071528, + "grad_norm": 0.2945278332205401, + "learning_rate": 3.600688468158348e-05, + "loss": 0.3537, + "step": 1137 + }, + { + "epoch": 1.056665118439387, + "grad_norm": 0.2583946369570546, + "learning_rate": 3.598967297762479e-05, + "loss": 0.3963, + "step": 1138 + }, + { + "epoch": 1.0575940548072458, + "grad_norm": 0.50245090499772, + "learning_rate": 3.5972461273666094e-05, + "loss": 0.3514, + "step": 1139 + }, + { + "epoch": 1.0585229911751044, + "grad_norm": 0.21531395236728787, + "learning_rate": 3.5955249569707406e-05, + "loss": 0.3562, + "step": 1140 + }, + { + "epoch": 1.0594519275429632, + "grad_norm": 0.24061017735313153, + "learning_rate": 3.593803786574871e-05, + "loss": 0.3694, + "step": 1141 + }, + { + "epoch": 1.060380863910822, + "grad_norm": 0.2268314841653852, + "learning_rate": 3.5920826161790016e-05, + "loss": 0.3422, + "step": 1142 + }, + { + "epoch": 1.061309800278681, + "grad_norm": 0.22078553445878826, + "learning_rate": 3.590361445783133e-05, + "loss": 0.3606, + "step": 1143 + }, + { + "epoch": 1.0622387366465398, + "grad_norm": 0.22090392465288902, + "learning_rate": 3.588640275387263e-05, + "loss": 0.3448, + "step": 1144 + }, + { + "epoch": 1.0631676730143984, + "grad_norm": 0.23293244274551053, + "learning_rate": 3.5869191049913945e-05, + "loss": 0.3415, + "step": 1145 + }, + { + "epoch": 1.0640966093822573, + "grad_norm": 0.21096983770192937, + "learning_rate": 3.585197934595525e-05, + "loss": 0.3563, + "step": 1146 + }, + { + "epoch": 1.065025545750116, + "grad_norm": 0.22212524809608433, + "learning_rate": 3.583476764199656e-05, + "loss": 0.3316, + "step": 1147 + }, + { + "epoch": 1.065954482117975, + "grad_norm": 0.20899680745058694, + "learning_rate": 3.5817555938037867e-05, + "loss": 0.3649, + "step": 1148 + }, + { + "epoch": 1.0668834184858338, + "grad_norm": 0.2224898644473964, + "learning_rate": 3.580034423407918e-05, + "loss": 0.3485, + "step": 1149 + }, + { + "epoch": 1.0678123548536926, + "grad_norm": 0.2487544062552923, + "learning_rate": 3.578313253012048e-05, + "loss": 0.3522, + "step": 1150 + }, + { + "epoch": 1.0687412912215513, + "grad_norm": 0.23055813456250082, + "learning_rate": 3.576592082616179e-05, + "loss": 0.3618, + "step": 1151 + }, + { + "epoch": 1.06967022758941, + "grad_norm": 0.20028327166610577, + "learning_rate": 3.57487091222031e-05, + "loss": 0.3358, + "step": 1152 + }, + { + "epoch": 1.070599163957269, + "grad_norm": 0.22311914856614992, + "learning_rate": 3.5731497418244405e-05, + "loss": 0.3549, + "step": 1153 + }, + { + "epoch": 1.0715281003251278, + "grad_norm": 0.23560718370813885, + "learning_rate": 3.571428571428572e-05, + "loss": 0.374, + "step": 1154 + }, + { + "epoch": 1.0724570366929864, + "grad_norm": 0.4148928972512194, + "learning_rate": 3.569707401032702e-05, + "loss": 0.3528, + "step": 1155 + }, + { + "epoch": 1.0733859730608453, + "grad_norm": 0.25885861158094975, + "learning_rate": 3.5679862306368334e-05, + "loss": 0.3412, + "step": 1156 + }, + { + "epoch": 1.0743149094287041, + "grad_norm": 0.21300196539669833, + "learning_rate": 3.566265060240964e-05, + "loss": 0.3355, + "step": 1157 + }, + { + "epoch": 1.075243845796563, + "grad_norm": 0.235345413141929, + "learning_rate": 3.564543889845095e-05, + "loss": 0.3629, + "step": 1158 + }, + { + "epoch": 1.0761727821644218, + "grad_norm": 0.23367347817298498, + "learning_rate": 3.5628227194492256e-05, + "loss": 0.3613, + "step": 1159 + }, + { + "epoch": 1.0771017185322806, + "grad_norm": 0.2326105412483254, + "learning_rate": 3.561101549053357e-05, + "loss": 0.3461, + "step": 1160 + }, + { + "epoch": 1.0780306549001393, + "grad_norm": 0.23402340603110036, + "learning_rate": 3.559380378657487e-05, + "loss": 0.3383, + "step": 1161 + }, + { + "epoch": 1.0789595912679981, + "grad_norm": 0.2392078922693959, + "learning_rate": 3.557659208261618e-05, + "loss": 0.3481, + "step": 1162 + }, + { + "epoch": 1.079888527635857, + "grad_norm": 0.21367450129437765, + "learning_rate": 3.555938037865749e-05, + "loss": 0.3552, + "step": 1163 + }, + { + "epoch": 1.0808174640037158, + "grad_norm": 0.2715676998763513, + "learning_rate": 3.5542168674698794e-05, + "loss": 0.3379, + "step": 1164 + }, + { + "epoch": 1.0817464003715744, + "grad_norm": 0.2386244987904004, + "learning_rate": 3.5524956970740106e-05, + "loss": 0.3681, + "step": 1165 + }, + { + "epoch": 1.0826753367394333, + "grad_norm": 0.22559148972950677, + "learning_rate": 3.550774526678141e-05, + "loss": 0.3784, + "step": 1166 + }, + { + "epoch": 1.0836042731072921, + "grad_norm": 0.2525819984114936, + "learning_rate": 3.549053356282272e-05, + "loss": 0.3547, + "step": 1167 + }, + { + "epoch": 1.084533209475151, + "grad_norm": 0.21367915648446467, + "learning_rate": 3.547332185886403e-05, + "loss": 0.3396, + "step": 1168 + }, + { + "epoch": 1.0854621458430098, + "grad_norm": 0.2521020982590234, + "learning_rate": 3.545611015490534e-05, + "loss": 0.3592, + "step": 1169 + }, + { + "epoch": 1.0863910822108687, + "grad_norm": 0.21484751786346046, + "learning_rate": 3.5438898450946645e-05, + "loss": 0.3401, + "step": 1170 + }, + { + "epoch": 1.0873200185787273, + "grad_norm": 0.22930787394886787, + "learning_rate": 3.5421686746987956e-05, + "loss": 0.3503, + "step": 1171 + }, + { + "epoch": 1.0882489549465861, + "grad_norm": 0.21234758307608245, + "learning_rate": 3.540447504302926e-05, + "loss": 0.342, + "step": 1172 + }, + { + "epoch": 1.089177891314445, + "grad_norm": 0.1870818694435957, + "learning_rate": 3.538726333907057e-05, + "loss": 0.3388, + "step": 1173 + }, + { + "epoch": 1.0901068276823038, + "grad_norm": 0.2075435312021659, + "learning_rate": 3.537005163511188e-05, + "loss": 0.3534, + "step": 1174 + }, + { + "epoch": 1.0910357640501624, + "grad_norm": 0.2080276884082451, + "learning_rate": 3.535283993115319e-05, + "loss": 0.3308, + "step": 1175 + }, + { + "epoch": 1.0919647004180213, + "grad_norm": 0.21516330313139653, + "learning_rate": 3.5335628227194495e-05, + "loss": 0.3632, + "step": 1176 + }, + { + "epoch": 1.0928936367858801, + "grad_norm": 0.24747236372423687, + "learning_rate": 3.53184165232358e-05, + "loss": 0.3819, + "step": 1177 + }, + { + "epoch": 1.093822573153739, + "grad_norm": 0.2084513425036716, + "learning_rate": 3.530120481927711e-05, + "loss": 0.3284, + "step": 1178 + }, + { + "epoch": 1.0947515095215978, + "grad_norm": 0.2206018613707994, + "learning_rate": 3.528399311531842e-05, + "loss": 0.3736, + "step": 1179 + }, + { + "epoch": 1.0956804458894567, + "grad_norm": 0.26865785499633366, + "learning_rate": 3.526678141135973e-05, + "loss": 0.3811, + "step": 1180 + }, + { + "epoch": 1.0966093822573153, + "grad_norm": 0.2208182545530636, + "learning_rate": 3.5249569707401034e-05, + "loss": 0.3613, + "step": 1181 + }, + { + "epoch": 1.0975383186251741, + "grad_norm": 0.2654033751873286, + "learning_rate": 3.5232358003442345e-05, + "loss": 0.3426, + "step": 1182 + }, + { + "epoch": 1.098467254993033, + "grad_norm": 0.22186733925795926, + "learning_rate": 3.521514629948365e-05, + "loss": 0.3542, + "step": 1183 + }, + { + "epoch": 1.0993961913608918, + "grad_norm": 0.24207554296990225, + "learning_rate": 3.5197934595524955e-05, + "loss": 0.3839, + "step": 1184 + }, + { + "epoch": 1.1003251277287505, + "grad_norm": 0.2720135985816513, + "learning_rate": 3.518072289156627e-05, + "loss": 0.3818, + "step": 1185 + }, + { + "epoch": 1.1012540640966093, + "grad_norm": 0.23520239556784836, + "learning_rate": 3.516351118760757e-05, + "loss": 0.3573, + "step": 1186 + }, + { + "epoch": 1.1021830004644682, + "grad_norm": 0.20611704267650519, + "learning_rate": 3.5146299483648884e-05, + "loss": 0.3636, + "step": 1187 + }, + { + "epoch": 1.103111936832327, + "grad_norm": 0.23271916548723978, + "learning_rate": 3.512908777969019e-05, + "loss": 0.3563, + "step": 1188 + }, + { + "epoch": 1.1040408732001858, + "grad_norm": 0.22320870615937669, + "learning_rate": 3.51118760757315e-05, + "loss": 0.353, + "step": 1189 + }, + { + "epoch": 1.1049698095680447, + "grad_norm": 0.24142802201840471, + "learning_rate": 3.5094664371772806e-05, + "loss": 0.3674, + "step": 1190 + }, + { + "epoch": 1.1058987459359033, + "grad_norm": 0.24032397770102173, + "learning_rate": 3.507745266781412e-05, + "loss": 0.3396, + "step": 1191 + }, + { + "epoch": 1.1068276823037622, + "grad_norm": 0.23859301700052904, + "learning_rate": 3.506024096385542e-05, + "loss": 0.3709, + "step": 1192 + }, + { + "epoch": 1.107756618671621, + "grad_norm": 0.20993929975570846, + "learning_rate": 3.5043029259896734e-05, + "loss": 0.3532, + "step": 1193 + }, + { + "epoch": 1.1086855550394799, + "grad_norm": 0.21525862778600577, + "learning_rate": 3.502581755593804e-05, + "loss": 0.3414, + "step": 1194 + }, + { + "epoch": 1.1096144914073387, + "grad_norm": 0.2153151742361438, + "learning_rate": 3.500860585197935e-05, + "loss": 0.3479, + "step": 1195 + }, + { + "epoch": 1.1105434277751973, + "grad_norm": 0.2341045687466886, + "learning_rate": 3.4991394148020656e-05, + "loss": 0.3514, + "step": 1196 + }, + { + "epoch": 1.1114723641430562, + "grad_norm": 0.22041252024232932, + "learning_rate": 3.497418244406197e-05, + "loss": 0.3517, + "step": 1197 + }, + { + "epoch": 1.112401300510915, + "grad_norm": 0.2421034130602699, + "learning_rate": 3.495697074010327e-05, + "loss": 0.355, + "step": 1198 + }, + { + "epoch": 1.1133302368787739, + "grad_norm": 0.2148087803076203, + "learning_rate": 3.4939759036144585e-05, + "loss": 0.3433, + "step": 1199 + }, + { + "epoch": 1.1142591732466327, + "grad_norm": 0.23749094706036994, + "learning_rate": 3.492254733218589e-05, + "loss": 0.3566, + "step": 1200 + }, + { + "epoch": 1.1151881096144913, + "grad_norm": 0.24636426776494177, + "learning_rate": 3.4905335628227195e-05, + "loss": 0.3415, + "step": 1201 + }, + { + "epoch": 1.1161170459823502, + "grad_norm": 0.21408882618052283, + "learning_rate": 3.488812392426851e-05, + "loss": 0.348, + "step": 1202 + }, + { + "epoch": 1.117045982350209, + "grad_norm": 0.20922550693268657, + "learning_rate": 3.487091222030981e-05, + "loss": 0.3385, + "step": 1203 + }, + { + "epoch": 1.1179749187180679, + "grad_norm": 0.20482695943117954, + "learning_rate": 3.485370051635112e-05, + "loss": 0.3714, + "step": 1204 + }, + { + "epoch": 1.1189038550859267, + "grad_norm": 0.21259189935478204, + "learning_rate": 3.483648881239243e-05, + "loss": 0.3583, + "step": 1205 + }, + { + "epoch": 1.1198327914537853, + "grad_norm": 0.23993781373108236, + "learning_rate": 3.4819277108433733e-05, + "loss": 0.3549, + "step": 1206 + }, + { + "epoch": 1.1207617278216442, + "grad_norm": 0.21359898911362096, + "learning_rate": 3.4802065404475045e-05, + "loss": 0.3915, + "step": 1207 + }, + { + "epoch": 1.121690664189503, + "grad_norm": 0.24339391904887306, + "learning_rate": 3.478485370051635e-05, + "loss": 0.3481, + "step": 1208 + }, + { + "epoch": 1.1226196005573619, + "grad_norm": 0.18696533553937125, + "learning_rate": 3.476764199655766e-05, + "loss": 0.3489, + "step": 1209 + }, + { + "epoch": 1.1235485369252207, + "grad_norm": 0.22213428030739601, + "learning_rate": 3.475043029259897e-05, + "loss": 0.3442, + "step": 1210 + }, + { + "epoch": 1.1244774732930793, + "grad_norm": 0.22004231403214247, + "learning_rate": 3.473321858864028e-05, + "loss": 0.3567, + "step": 1211 + }, + { + "epoch": 1.1254064096609382, + "grad_norm": 0.24987319354407675, + "learning_rate": 3.4716006884681584e-05, + "loss": 0.3608, + "step": 1212 + }, + { + "epoch": 1.126335346028797, + "grad_norm": 0.22593282852581212, + "learning_rate": 3.4698795180722896e-05, + "loss": 0.3604, + "step": 1213 + }, + { + "epoch": 1.1272642823966559, + "grad_norm": 0.22913929079537423, + "learning_rate": 3.46815834767642e-05, + "loss": 0.3308, + "step": 1214 + }, + { + "epoch": 1.1281932187645147, + "grad_norm": 0.22512936563970043, + "learning_rate": 3.466437177280551e-05, + "loss": 0.3539, + "step": 1215 + }, + { + "epoch": 1.1291221551323734, + "grad_norm": 0.2051404174738686, + "learning_rate": 3.464716006884682e-05, + "loss": 0.363, + "step": 1216 + }, + { + "epoch": 1.1300510915002322, + "grad_norm": 0.21490274319372904, + "learning_rate": 3.462994836488813e-05, + "loss": 0.3354, + "step": 1217 + }, + { + "epoch": 1.130980027868091, + "grad_norm": 0.2174571977823224, + "learning_rate": 3.4612736660929434e-05, + "loss": 0.3513, + "step": 1218 + }, + { + "epoch": 1.1319089642359499, + "grad_norm": 0.19767325158530422, + "learning_rate": 3.4595524956970746e-05, + "loss": 0.3435, + "step": 1219 + }, + { + "epoch": 1.1328379006038087, + "grad_norm": 0.21719464735692653, + "learning_rate": 3.457831325301205e-05, + "loss": 0.3574, + "step": 1220 + }, + { + "epoch": 1.1337668369716674, + "grad_norm": 0.20581929044171807, + "learning_rate": 3.456110154905336e-05, + "loss": 0.3341, + "step": 1221 + }, + { + "epoch": 1.1346957733395262, + "grad_norm": 0.19844253892744268, + "learning_rate": 3.454388984509467e-05, + "loss": 0.3362, + "step": 1222 + }, + { + "epoch": 1.135624709707385, + "grad_norm": 0.2120233032755566, + "learning_rate": 3.452667814113598e-05, + "loss": 0.3521, + "step": 1223 + }, + { + "epoch": 1.136553646075244, + "grad_norm": 0.2038168421319054, + "learning_rate": 3.4509466437177285e-05, + "loss": 0.3831, + "step": 1224 + }, + { + "epoch": 1.1374825824431027, + "grad_norm": 0.25162161041366465, + "learning_rate": 3.449225473321859e-05, + "loss": 0.3629, + "step": 1225 + }, + { + "epoch": 1.1384115188109614, + "grad_norm": 0.23246017940495503, + "learning_rate": 3.4475043029259895e-05, + "loss": 0.3361, + "step": 1226 + }, + { + "epoch": 1.1393404551788202, + "grad_norm": 0.20778768722697064, + "learning_rate": 3.4457831325301206e-05, + "loss": 0.3555, + "step": 1227 + }, + { + "epoch": 1.140269391546679, + "grad_norm": 0.2153164058775418, + "learning_rate": 3.444061962134251e-05, + "loss": 0.3351, + "step": 1228 + }, + { + "epoch": 1.141198327914538, + "grad_norm": 0.20123624716901134, + "learning_rate": 3.442340791738382e-05, + "loss": 0.3443, + "step": 1229 + }, + { + "epoch": 1.1421272642823967, + "grad_norm": 0.21058058376175867, + "learning_rate": 3.440619621342513e-05, + "loss": 0.3587, + "step": 1230 + }, + { + "epoch": 1.1430562006502554, + "grad_norm": 0.2263383068441551, + "learning_rate": 3.438898450946644e-05, + "loss": 0.3305, + "step": 1231 + }, + { + "epoch": 1.1439851370181142, + "grad_norm": 0.21243006539833154, + "learning_rate": 3.4371772805507745e-05, + "loss": 0.3554, + "step": 1232 + }, + { + "epoch": 1.144914073385973, + "grad_norm": 0.2102541546923477, + "learning_rate": 3.435456110154906e-05, + "loss": 0.3567, + "step": 1233 + }, + { + "epoch": 1.145843009753832, + "grad_norm": 0.23046254004079486, + "learning_rate": 3.433734939759036e-05, + "loss": 0.3617, + "step": 1234 + }, + { + "epoch": 1.1467719461216908, + "grad_norm": 0.2930500467524406, + "learning_rate": 3.4320137693631674e-05, + "loss": 0.3555, + "step": 1235 + }, + { + "epoch": 1.1477008824895494, + "grad_norm": 0.23152478794927303, + "learning_rate": 3.430292598967298e-05, + "loss": 0.3734, + "step": 1236 + }, + { + "epoch": 1.1486298188574082, + "grad_norm": 0.2549389816652615, + "learning_rate": 3.428571428571429e-05, + "loss": 0.3293, + "step": 1237 + }, + { + "epoch": 1.149558755225267, + "grad_norm": 0.2167026826205843, + "learning_rate": 3.4268502581755595e-05, + "loss": 0.3411, + "step": 1238 + }, + { + "epoch": 1.150487691593126, + "grad_norm": 0.2485030988375794, + "learning_rate": 3.425129087779691e-05, + "loss": 0.3559, + "step": 1239 + }, + { + "epoch": 1.1514166279609848, + "grad_norm": 0.24163510390409695, + "learning_rate": 3.423407917383821e-05, + "loss": 0.3448, + "step": 1240 + }, + { + "epoch": 1.1523455643288434, + "grad_norm": 0.23020837321077686, + "learning_rate": 3.4216867469879524e-05, + "loss": 0.3346, + "step": 1241 + }, + { + "epoch": 1.1532745006967022, + "grad_norm": 0.21827258140007752, + "learning_rate": 3.419965576592083e-05, + "loss": 0.3594, + "step": 1242 + }, + { + "epoch": 1.154203437064561, + "grad_norm": 0.2259959955019267, + "learning_rate": 3.418244406196214e-05, + "loss": 0.3263, + "step": 1243 + }, + { + "epoch": 1.15513237343242, + "grad_norm": 0.2235516711875459, + "learning_rate": 3.4165232358003446e-05, + "loss": 0.3315, + "step": 1244 + }, + { + "epoch": 1.1560613098002788, + "grad_norm": 0.20721762489645518, + "learning_rate": 3.414802065404476e-05, + "loss": 0.3604, + "step": 1245 + }, + { + "epoch": 1.1569902461681374, + "grad_norm": 0.2231484555937936, + "learning_rate": 3.413080895008606e-05, + "loss": 0.3476, + "step": 1246 + }, + { + "epoch": 1.1579191825359962, + "grad_norm": 0.22679340464244963, + "learning_rate": 3.411359724612737e-05, + "loss": 0.3788, + "step": 1247 + }, + { + "epoch": 1.158848118903855, + "grad_norm": 0.2183757050580824, + "learning_rate": 3.409638554216867e-05, + "loss": 0.3308, + "step": 1248 + }, + { + "epoch": 1.159777055271714, + "grad_norm": 0.23012791532153595, + "learning_rate": 3.4079173838209984e-05, + "loss": 0.3475, + "step": 1249 + }, + { + "epoch": 1.1607059916395728, + "grad_norm": 0.2151078725178325, + "learning_rate": 3.406196213425129e-05, + "loss": 0.3542, + "step": 1250 + }, + { + "epoch": 1.1616349280074314, + "grad_norm": 0.20530486932614164, + "learning_rate": 3.40447504302926e-05, + "loss": 0.3362, + "step": 1251 + }, + { + "epoch": 1.1625638643752902, + "grad_norm": 0.21039678917229546, + "learning_rate": 3.4027538726333906e-05, + "loss": 0.3602, + "step": 1252 + }, + { + "epoch": 1.163492800743149, + "grad_norm": 0.2292399032989059, + "learning_rate": 3.401032702237522e-05, + "loss": 0.3678, + "step": 1253 + }, + { + "epoch": 1.164421737111008, + "grad_norm": 0.23493270267966537, + "learning_rate": 3.399311531841652e-05, + "loss": 0.3531, + "step": 1254 + }, + { + "epoch": 1.1653506734788668, + "grad_norm": 0.20806480245707265, + "learning_rate": 3.3975903614457835e-05, + "loss": 0.3474, + "step": 1255 + }, + { + "epoch": 1.1662796098467254, + "grad_norm": 0.23970461641766203, + "learning_rate": 3.395869191049914e-05, + "loss": 0.3555, + "step": 1256 + }, + { + "epoch": 1.1672085462145843, + "grad_norm": 0.23969672692118035, + "learning_rate": 3.394148020654045e-05, + "loss": 0.3634, + "step": 1257 + }, + { + "epoch": 1.168137482582443, + "grad_norm": 0.2285160457892427, + "learning_rate": 3.392426850258176e-05, + "loss": 0.3442, + "step": 1258 + }, + { + "epoch": 1.169066418950302, + "grad_norm": 0.22369375804224814, + "learning_rate": 3.390705679862307e-05, + "loss": 0.3311, + "step": 1259 + }, + { + "epoch": 1.1699953553181608, + "grad_norm": 0.21415743915884217, + "learning_rate": 3.3889845094664373e-05, + "loss": 0.3469, + "step": 1260 + }, + { + "epoch": 1.1709242916860194, + "grad_norm": 0.19956161096816913, + "learning_rate": 3.3872633390705685e-05, + "loss": 0.3665, + "step": 1261 + }, + { + "epoch": 1.1718532280538783, + "grad_norm": 0.22850970980578056, + "learning_rate": 3.385542168674699e-05, + "loss": 0.3311, + "step": 1262 + }, + { + "epoch": 1.172782164421737, + "grad_norm": 0.22798171419609636, + "learning_rate": 3.38382099827883e-05, + "loss": 0.3487, + "step": 1263 + }, + { + "epoch": 1.173711100789596, + "grad_norm": 0.23282279392195235, + "learning_rate": 3.382099827882961e-05, + "loss": 0.3698, + "step": 1264 + }, + { + "epoch": 1.1746400371574548, + "grad_norm": 0.2703092707739479, + "learning_rate": 3.380378657487092e-05, + "loss": 0.3394, + "step": 1265 + }, + { + "epoch": 1.1755689735253134, + "grad_norm": 0.22016376237672852, + "learning_rate": 3.3786574870912224e-05, + "loss": 0.3546, + "step": 1266 + }, + { + "epoch": 1.1764979098931723, + "grad_norm": 0.20478813537848256, + "learning_rate": 3.3769363166953536e-05, + "loss": 0.3528, + "step": 1267 + }, + { + "epoch": 1.1774268462610311, + "grad_norm": 0.2561543579105953, + "learning_rate": 3.3752151462994834e-05, + "loss": 0.383, + "step": 1268 + }, + { + "epoch": 1.17835578262889, + "grad_norm": 0.2192358895821466, + "learning_rate": 3.3734939759036146e-05, + "loss": 0.3548, + "step": 1269 + }, + { + "epoch": 1.1792847189967488, + "grad_norm": 0.21639896112955978, + "learning_rate": 3.371772805507745e-05, + "loss": 0.3438, + "step": 1270 + }, + { + "epoch": 1.1802136553646074, + "grad_norm": 0.23897229606319104, + "learning_rate": 3.370051635111876e-05, + "loss": 0.3408, + "step": 1271 + }, + { + "epoch": 1.1811425917324663, + "grad_norm": 0.21230871449016278, + "learning_rate": 3.368330464716007e-05, + "loss": 0.3529, + "step": 1272 + }, + { + "epoch": 1.1820715281003251, + "grad_norm": 0.2225131278174084, + "learning_rate": 3.366609294320138e-05, + "loss": 0.3443, + "step": 1273 + }, + { + "epoch": 1.183000464468184, + "grad_norm": 0.22179982347649585, + "learning_rate": 3.3648881239242684e-05, + "loss": 0.3667, + "step": 1274 + }, + { + "epoch": 1.1839294008360428, + "grad_norm": 0.2363874317205004, + "learning_rate": 3.3631669535283996e-05, + "loss": 0.3786, + "step": 1275 + }, + { + "epoch": 1.1848583372039014, + "grad_norm": 0.2193585078333849, + "learning_rate": 3.36144578313253e-05, + "loss": 0.3297, + "step": 1276 + }, + { + "epoch": 1.1857872735717603, + "grad_norm": 0.2284696248734332, + "learning_rate": 3.359724612736661e-05, + "loss": 0.3437, + "step": 1277 + }, + { + "epoch": 1.1867162099396191, + "grad_norm": 0.20793086153649565, + "learning_rate": 3.358003442340792e-05, + "loss": 0.3531, + "step": 1278 + }, + { + "epoch": 1.187645146307478, + "grad_norm": 0.21567184317250074, + "learning_rate": 3.356282271944923e-05, + "loss": 0.3406, + "step": 1279 + }, + { + "epoch": 1.1885740826753368, + "grad_norm": 0.24433871916587774, + "learning_rate": 3.3545611015490535e-05, + "loss": 0.3716, + "step": 1280 + }, + { + "epoch": 1.1895030190431957, + "grad_norm": 0.22152344071069677, + "learning_rate": 3.3528399311531847e-05, + "loss": 0.3356, + "step": 1281 + }, + { + "epoch": 1.1904319554110543, + "grad_norm": 0.21020957516671623, + "learning_rate": 3.351118760757315e-05, + "loss": 0.374, + "step": 1282 + }, + { + "epoch": 1.1913608917789131, + "grad_norm": 0.5970058115054317, + "learning_rate": 3.349397590361446e-05, + "loss": 0.3366, + "step": 1283 + }, + { + "epoch": 1.192289828146772, + "grad_norm": 0.22658123670184568, + "learning_rate": 3.347676419965577e-05, + "loss": 0.345, + "step": 1284 + }, + { + "epoch": 1.1932187645146308, + "grad_norm": 0.20134140896250374, + "learning_rate": 3.345955249569708e-05, + "loss": 0.3369, + "step": 1285 + }, + { + "epoch": 1.1941477008824894, + "grad_norm": 0.2203877272353749, + "learning_rate": 3.3442340791738385e-05, + "loss": 0.343, + "step": 1286 + }, + { + "epoch": 1.1950766372503483, + "grad_norm": 0.20330351316025363, + "learning_rate": 3.34251290877797e-05, + "loss": 0.3546, + "step": 1287 + }, + { + "epoch": 1.1960055736182071, + "grad_norm": 0.20833906150996628, + "learning_rate": 3.3407917383821e-05, + "loss": 0.3653, + "step": 1288 + }, + { + "epoch": 1.196934509986066, + "grad_norm": 0.21852984142409648, + "learning_rate": 3.339070567986231e-05, + "loss": 0.3576, + "step": 1289 + }, + { + "epoch": 1.1978634463539248, + "grad_norm": 0.23067032362394563, + "learning_rate": 3.337349397590361e-05, + "loss": 0.3237, + "step": 1290 + }, + { + "epoch": 1.1987923827217837, + "grad_norm": 0.21203185556352686, + "learning_rate": 3.3356282271944924e-05, + "loss": 0.3413, + "step": 1291 + }, + { + "epoch": 1.1997213190896423, + "grad_norm": 0.19814905094151894, + "learning_rate": 3.333907056798623e-05, + "loss": 0.3187, + "step": 1292 + }, + { + "epoch": 1.2006502554575011, + "grad_norm": 0.23683735633528624, + "learning_rate": 3.332185886402754e-05, + "loss": 0.3692, + "step": 1293 + }, + { + "epoch": 1.20157919182536, + "grad_norm": 0.23935032392727087, + "learning_rate": 3.3304647160068846e-05, + "loss": 0.341, + "step": 1294 + }, + { + "epoch": 1.2025081281932188, + "grad_norm": 0.21718405839661278, + "learning_rate": 3.328743545611016e-05, + "loss": 0.339, + "step": 1295 + }, + { + "epoch": 1.2034370645610775, + "grad_norm": 0.2333890322775293, + "learning_rate": 3.327022375215146e-05, + "loss": 0.3522, + "step": 1296 + }, + { + "epoch": 1.2043660009289363, + "grad_norm": 0.23203001135682452, + "learning_rate": 3.3253012048192774e-05, + "loss": 0.3328, + "step": 1297 + }, + { + "epoch": 1.2052949372967952, + "grad_norm": 0.20690983108544067, + "learning_rate": 3.323580034423408e-05, + "loss": 0.3527, + "step": 1298 + }, + { + "epoch": 1.206223873664654, + "grad_norm": 0.2466034408066261, + "learning_rate": 3.321858864027539e-05, + "loss": 0.3629, + "step": 1299 + }, + { + "epoch": 1.2071528100325128, + "grad_norm": 0.21156982100561744, + "learning_rate": 3.3201376936316696e-05, + "loss": 0.3669, + "step": 1300 + }, + { + "epoch": 1.2080817464003717, + "grad_norm": 0.2160203305108374, + "learning_rate": 3.318416523235801e-05, + "loss": 0.3528, + "step": 1301 + }, + { + "epoch": 1.2090106827682303, + "grad_norm": 0.24371827812631483, + "learning_rate": 3.316695352839931e-05, + "loss": 0.3665, + "step": 1302 + }, + { + "epoch": 1.2099396191360892, + "grad_norm": 0.22457004262344515, + "learning_rate": 3.3149741824440625e-05, + "loss": 0.3458, + "step": 1303 + }, + { + "epoch": 1.210868555503948, + "grad_norm": 0.22055226828004412, + "learning_rate": 3.313253012048193e-05, + "loss": 0.3738, + "step": 1304 + }, + { + "epoch": 1.2117974918718069, + "grad_norm": 0.22285967677243848, + "learning_rate": 3.311531841652324e-05, + "loss": 0.3616, + "step": 1305 + }, + { + "epoch": 1.2127264282396655, + "grad_norm": 0.2161533855354827, + "learning_rate": 3.3098106712564546e-05, + "loss": 0.3699, + "step": 1306 + }, + { + "epoch": 1.2136553646075243, + "grad_norm": 0.22228225680627903, + "learning_rate": 3.308089500860586e-05, + "loss": 0.3363, + "step": 1307 + }, + { + "epoch": 1.2145843009753832, + "grad_norm": 0.20607924665127353, + "learning_rate": 3.306368330464716e-05, + "loss": 0.3303, + "step": 1308 + }, + { + "epoch": 1.215513237343242, + "grad_norm": 0.18066272382062054, + "learning_rate": 3.3046471600688475e-05, + "loss": 0.3569, + "step": 1309 + }, + { + "epoch": 1.2164421737111009, + "grad_norm": 0.21118001457268404, + "learning_rate": 3.302925989672978e-05, + "loss": 0.347, + "step": 1310 + }, + { + "epoch": 1.2173711100789597, + "grad_norm": 0.20561719917149285, + "learning_rate": 3.3012048192771085e-05, + "loss": 0.3546, + "step": 1311 + }, + { + "epoch": 1.2183000464468183, + "grad_norm": 0.22731459097307283, + "learning_rate": 3.299483648881239e-05, + "loss": 0.3745, + "step": 1312 + }, + { + "epoch": 1.2192289828146772, + "grad_norm": 0.2424041406528847, + "learning_rate": 3.29776247848537e-05, + "loss": 0.3486, + "step": 1313 + }, + { + "epoch": 1.220157919182536, + "grad_norm": 0.20069413390660085, + "learning_rate": 3.296041308089501e-05, + "loss": 0.3383, + "step": 1314 + }, + { + "epoch": 1.2210868555503949, + "grad_norm": 0.2281772093622241, + "learning_rate": 3.294320137693632e-05, + "loss": 0.3434, + "step": 1315 + }, + { + "epoch": 1.2220157919182535, + "grad_norm": 0.19211768424635667, + "learning_rate": 3.2925989672977624e-05, + "loss": 0.3352, + "step": 1316 + }, + { + "epoch": 1.2229447282861123, + "grad_norm": 0.2513611309761323, + "learning_rate": 3.2908777969018935e-05, + "loss": 0.345, + "step": 1317 + }, + { + "epoch": 1.2238736646539712, + "grad_norm": 0.24961481880569977, + "learning_rate": 3.289156626506024e-05, + "loss": 0.3231, + "step": 1318 + }, + { + "epoch": 1.22480260102183, + "grad_norm": 0.20092458352098808, + "learning_rate": 3.287435456110155e-05, + "loss": 0.3563, + "step": 1319 + }, + { + "epoch": 1.2257315373896889, + "grad_norm": 0.2436154621559124, + "learning_rate": 3.285714285714286e-05, + "loss": 0.3831, + "step": 1320 + }, + { + "epoch": 1.2266604737575477, + "grad_norm": 0.24743764188275466, + "learning_rate": 3.283993115318417e-05, + "loss": 0.3574, + "step": 1321 + }, + { + "epoch": 1.2275894101254063, + "grad_norm": 0.19895641348642534, + "learning_rate": 3.2822719449225474e-05, + "loss": 0.3482, + "step": 1322 + }, + { + "epoch": 1.2285183464932652, + "grad_norm": 0.22955056964532125, + "learning_rate": 3.2805507745266786e-05, + "loss": 0.3532, + "step": 1323 + }, + { + "epoch": 1.229447282861124, + "grad_norm": 0.2100380867776916, + "learning_rate": 3.278829604130809e-05, + "loss": 0.3348, + "step": 1324 + }, + { + "epoch": 1.2303762192289829, + "grad_norm": 0.18910582670755707, + "learning_rate": 3.27710843373494e-05, + "loss": 0.343, + "step": 1325 + }, + { + "epoch": 1.2313051555968415, + "grad_norm": 0.23613291094866282, + "learning_rate": 3.275387263339071e-05, + "loss": 0.3567, + "step": 1326 + }, + { + "epoch": 1.2322340919647004, + "grad_norm": 0.19368690581896822, + "learning_rate": 3.273666092943202e-05, + "loss": 0.3561, + "step": 1327 + }, + { + "epoch": 1.2331630283325592, + "grad_norm": 0.20487266897523065, + "learning_rate": 3.2719449225473324e-05, + "loss": 0.3477, + "step": 1328 + }, + { + "epoch": 1.234091964700418, + "grad_norm": 0.19205460972365668, + "learning_rate": 3.2702237521514636e-05, + "loss": 0.353, + "step": 1329 + }, + { + "epoch": 1.2350209010682769, + "grad_norm": 0.22085132737761864, + "learning_rate": 3.268502581755594e-05, + "loss": 0.3499, + "step": 1330 + }, + { + "epoch": 1.2359498374361357, + "grad_norm": 0.19737628387896256, + "learning_rate": 3.266781411359725e-05, + "loss": 0.3396, + "step": 1331 + }, + { + "epoch": 1.2368787738039944, + "grad_norm": 0.19772707162756117, + "learning_rate": 3.265060240963855e-05, + "loss": 0.3364, + "step": 1332 + }, + { + "epoch": 1.2378077101718532, + "grad_norm": 0.1990220518002121, + "learning_rate": 3.263339070567986e-05, + "loss": 0.3543, + "step": 1333 + }, + { + "epoch": 1.238736646539712, + "grad_norm": 0.2088922466620167, + "learning_rate": 3.261617900172117e-05, + "loss": 0.3423, + "step": 1334 + }, + { + "epoch": 1.239665582907571, + "grad_norm": 0.23503440075942228, + "learning_rate": 3.259896729776248e-05, + "loss": 0.378, + "step": 1335 + }, + { + "epoch": 1.2405945192754295, + "grad_norm": 0.20957487854697568, + "learning_rate": 3.2581755593803785e-05, + "loss": 0.3497, + "step": 1336 + }, + { + "epoch": 1.2415234556432884, + "grad_norm": 0.20536395635402577, + "learning_rate": 3.2564543889845097e-05, + "loss": 0.3621, + "step": 1337 + }, + { + "epoch": 1.2424523920111472, + "grad_norm": 0.19566124028745266, + "learning_rate": 3.25473321858864e-05, + "loss": 0.3392, + "step": 1338 + }, + { + "epoch": 1.243381328379006, + "grad_norm": 0.19859624787750643, + "learning_rate": 3.253012048192771e-05, + "loss": 0.3305, + "step": 1339 + }, + { + "epoch": 1.244310264746865, + "grad_norm": 0.20220312062723214, + "learning_rate": 3.251290877796902e-05, + "loss": 0.3429, + "step": 1340 + }, + { + "epoch": 1.2452392011147237, + "grad_norm": 0.20306062927254576, + "learning_rate": 3.249569707401033e-05, + "loss": 0.3515, + "step": 1341 + }, + { + "epoch": 1.2461681374825824, + "grad_norm": 0.22899674919736335, + "learning_rate": 3.2478485370051635e-05, + "loss": 0.3512, + "step": 1342 + }, + { + "epoch": 1.2470970738504412, + "grad_norm": 0.20719067431783017, + "learning_rate": 3.246127366609295e-05, + "loss": 0.3687, + "step": 1343 + }, + { + "epoch": 1.2480260102183, + "grad_norm": 0.19102206124283935, + "learning_rate": 3.244406196213425e-05, + "loss": 0.3562, + "step": 1344 + }, + { + "epoch": 1.248954946586159, + "grad_norm": 0.1986375316054701, + "learning_rate": 3.2426850258175564e-05, + "loss": 0.3549, + "step": 1345 + }, + { + "epoch": 1.2498838829540175, + "grad_norm": 0.20320312735004203, + "learning_rate": 3.240963855421687e-05, + "loss": 0.344, + "step": 1346 + }, + { + "epoch": 1.2508128193218764, + "grad_norm": 0.20833632211864914, + "learning_rate": 3.239242685025818e-05, + "loss": 0.3419, + "step": 1347 + }, + { + "epoch": 1.2517417556897352, + "grad_norm": 0.20209089611428194, + "learning_rate": 3.2375215146299486e-05, + "loss": 0.378, + "step": 1348 + }, + { + "epoch": 1.252670692057594, + "grad_norm": 0.23317589217939863, + "learning_rate": 3.23580034423408e-05, + "loss": 0.3472, + "step": 1349 + }, + { + "epoch": 1.253599628425453, + "grad_norm": 0.23889292842454002, + "learning_rate": 3.23407917383821e-05, + "loss": 0.3546, + "step": 1350 + }, + { + "epoch": 1.2545285647933118, + "grad_norm": 0.23185784512316487, + "learning_rate": 3.2323580034423414e-05, + "loss": 0.353, + "step": 1351 + }, + { + "epoch": 1.2554575011611704, + "grad_norm": 0.25338080904879534, + "learning_rate": 3.230636833046472e-05, + "loss": 0.3369, + "step": 1352 + }, + { + "epoch": 1.2563864375290292, + "grad_norm": 0.2099328571771037, + "learning_rate": 3.2289156626506024e-05, + "loss": 0.3393, + "step": 1353 + }, + { + "epoch": 1.257315373896888, + "grad_norm": 0.24756023680513764, + "learning_rate": 3.227194492254733e-05, + "loss": 0.3448, + "step": 1354 + }, + { + "epoch": 1.258244310264747, + "grad_norm": 0.23036929610075849, + "learning_rate": 3.225473321858864e-05, + "loss": 0.3428, + "step": 1355 + }, + { + "epoch": 1.2591732466326055, + "grad_norm": 0.26373689904363057, + "learning_rate": 3.2237521514629946e-05, + "loss": 0.3525, + "step": 1356 + }, + { + "epoch": 1.2601021830004644, + "grad_norm": 0.20482417525178498, + "learning_rate": 3.222030981067126e-05, + "loss": 0.3463, + "step": 1357 + }, + { + "epoch": 1.2610311193683232, + "grad_norm": 0.24978474205561277, + "learning_rate": 3.220309810671256e-05, + "loss": 0.3616, + "step": 1358 + }, + { + "epoch": 1.261960055736182, + "grad_norm": 0.2200819349873659, + "learning_rate": 3.2185886402753875e-05, + "loss": 0.3438, + "step": 1359 + }, + { + "epoch": 1.262888992104041, + "grad_norm": 0.23973808094498264, + "learning_rate": 3.216867469879518e-05, + "loss": 0.3552, + "step": 1360 + }, + { + "epoch": 1.2638179284718998, + "grad_norm": 0.22114441784933195, + "learning_rate": 3.215146299483649e-05, + "loss": 0.3752, + "step": 1361 + }, + { + "epoch": 1.2647468648397584, + "grad_norm": 0.27751924776177905, + "learning_rate": 3.2134251290877796e-05, + "loss": 0.3515, + "step": 1362 + }, + { + "epoch": 1.2656758012076172, + "grad_norm": 0.21042269563943283, + "learning_rate": 3.211703958691911e-05, + "loss": 0.3465, + "step": 1363 + }, + { + "epoch": 1.266604737575476, + "grad_norm": 0.28456491769206277, + "learning_rate": 3.209982788296041e-05, + "loss": 0.3694, + "step": 1364 + }, + { + "epoch": 1.267533673943335, + "grad_norm": 0.2307337278461796, + "learning_rate": 3.2082616179001725e-05, + "loss": 0.3686, + "step": 1365 + }, + { + "epoch": 1.2684626103111936, + "grad_norm": 0.2295732638687118, + "learning_rate": 3.206540447504303e-05, + "loss": 0.3595, + "step": 1366 + }, + { + "epoch": 1.2693915466790524, + "grad_norm": 0.25011196671438235, + "learning_rate": 3.204819277108434e-05, + "loss": 0.3676, + "step": 1367 + }, + { + "epoch": 1.2703204830469113, + "grad_norm": 0.21793396400635984, + "learning_rate": 3.203098106712565e-05, + "loss": 0.3505, + "step": 1368 + }, + { + "epoch": 1.27124941941477, + "grad_norm": 0.19814582335576053, + "learning_rate": 3.201376936316696e-05, + "loss": 0.333, + "step": 1369 + }, + { + "epoch": 1.272178355782629, + "grad_norm": 0.22394942492694084, + "learning_rate": 3.1996557659208264e-05, + "loss": 0.358, + "step": 1370 + }, + { + "epoch": 1.2731072921504878, + "grad_norm": 0.2021914187434177, + "learning_rate": 3.1979345955249575e-05, + "loss": 0.3385, + "step": 1371 + }, + { + "epoch": 1.2740362285183464, + "grad_norm": 0.18777768433600583, + "learning_rate": 3.196213425129088e-05, + "loss": 0.3599, + "step": 1372 + }, + { + "epoch": 1.2749651648862053, + "grad_norm": 0.20249518397349203, + "learning_rate": 3.194492254733219e-05, + "loss": 0.3548, + "step": 1373 + }, + { + "epoch": 1.275894101254064, + "grad_norm": 0.2269731964103628, + "learning_rate": 3.192771084337349e-05, + "loss": 0.3537, + "step": 1374 + }, + { + "epoch": 1.276823037621923, + "grad_norm": 0.19146870047473466, + "learning_rate": 3.19104991394148e-05, + "loss": 0.3555, + "step": 1375 + }, + { + "epoch": 1.2777519739897816, + "grad_norm": 0.2073759588674991, + "learning_rate": 3.189328743545611e-05, + "loss": 0.351, + "step": 1376 + }, + { + "epoch": 1.2786809103576404, + "grad_norm": 0.2157939027150171, + "learning_rate": 3.187607573149742e-05, + "loss": 0.368, + "step": 1377 + }, + { + "epoch": 1.2796098467254993, + "grad_norm": 0.21802601041303335, + "learning_rate": 3.1858864027538724e-05, + "loss": 0.3697, + "step": 1378 + }, + { + "epoch": 1.2805387830933581, + "grad_norm": 0.20338664825260214, + "learning_rate": 3.1841652323580036e-05, + "loss": 0.3553, + "step": 1379 + }, + { + "epoch": 1.281467719461217, + "grad_norm": 0.21747341979747276, + "learning_rate": 3.182444061962134e-05, + "loss": 0.3434, + "step": 1380 + }, + { + "epoch": 1.2823966558290758, + "grad_norm": 0.19657412183584266, + "learning_rate": 3.180722891566265e-05, + "loss": 0.354, + "step": 1381 + }, + { + "epoch": 1.2833255921969344, + "grad_norm": 0.203905731577841, + "learning_rate": 3.179001721170396e-05, + "loss": 0.3639, + "step": 1382 + }, + { + "epoch": 1.2842545285647933, + "grad_norm": 0.19642280259503783, + "learning_rate": 3.177280550774527e-05, + "loss": 0.3429, + "step": 1383 + }, + { + "epoch": 1.2851834649326521, + "grad_norm": 0.20991675969465276, + "learning_rate": 3.1755593803786574e-05, + "loss": 0.3521, + "step": 1384 + }, + { + "epoch": 1.286112401300511, + "grad_norm": 0.20253615601371597, + "learning_rate": 3.1738382099827886e-05, + "loss": 0.3473, + "step": 1385 + }, + { + "epoch": 1.2870413376683696, + "grad_norm": 0.198688892854731, + "learning_rate": 3.172117039586919e-05, + "loss": 0.3436, + "step": 1386 + }, + { + "epoch": 1.2879702740362284, + "grad_norm": 2.4661548054153672, + "learning_rate": 3.17039586919105e-05, + "loss": 0.3541, + "step": 1387 + }, + { + "epoch": 1.2888992104040873, + "grad_norm": 0.20742528996459592, + "learning_rate": 3.168674698795181e-05, + "loss": 0.3517, + "step": 1388 + }, + { + "epoch": 1.2898281467719461, + "grad_norm": 0.20329525205803542, + "learning_rate": 3.166953528399312e-05, + "loss": 0.3424, + "step": 1389 + }, + { + "epoch": 1.290757083139805, + "grad_norm": 0.2112927190742448, + "learning_rate": 3.1652323580034425e-05, + "loss": 0.3556, + "step": 1390 + }, + { + "epoch": 1.2916860195076638, + "grad_norm": 0.23636619370443224, + "learning_rate": 3.163511187607574e-05, + "loss": 0.3735, + "step": 1391 + }, + { + "epoch": 1.2926149558755227, + "grad_norm": 0.22276923652505054, + "learning_rate": 3.161790017211704e-05, + "loss": 0.3579, + "step": 1392 + }, + { + "epoch": 1.2935438922433813, + "grad_norm": 0.2005541094030619, + "learning_rate": 3.1600688468158353e-05, + "loss": 0.3554, + "step": 1393 + }, + { + "epoch": 1.2944728286112401, + "grad_norm": 0.22155762592559106, + "learning_rate": 3.158347676419966e-05, + "loss": 0.3546, + "step": 1394 + }, + { + "epoch": 1.295401764979099, + "grad_norm": 0.20946083857237227, + "learning_rate": 3.1566265060240963e-05, + "loss": 0.349, + "step": 1395 + }, + { + "epoch": 1.2963307013469576, + "grad_norm": 0.20702763325289733, + "learning_rate": 3.154905335628227e-05, + "loss": 0.3649, + "step": 1396 + }, + { + "epoch": 1.2972596377148164, + "grad_norm": 0.2229383054425248, + "learning_rate": 3.153184165232358e-05, + "loss": 0.349, + "step": 1397 + }, + { + "epoch": 1.2981885740826753, + "grad_norm": 0.23124550374061495, + "learning_rate": 3.1514629948364885e-05, + "loss": 0.336, + "step": 1398 + }, + { + "epoch": 1.2991175104505341, + "grad_norm": 0.19990916764328137, + "learning_rate": 3.14974182444062e-05, + "loss": 0.3619, + "step": 1399 + }, + { + "epoch": 1.300046446818393, + "grad_norm": 0.21799120394800936, + "learning_rate": 3.14802065404475e-05, + "loss": 0.3492, + "step": 1400 + }, + { + "epoch": 1.3009753831862518, + "grad_norm": 0.3118082548093683, + "learning_rate": 3.1462994836488814e-05, + "loss": 0.3486, + "step": 1401 + }, + { + "epoch": 1.3019043195541107, + "grad_norm": 0.21875180339324876, + "learning_rate": 3.144578313253012e-05, + "loss": 0.345, + "step": 1402 + }, + { + "epoch": 1.3028332559219693, + "grad_norm": 0.20622030938134586, + "learning_rate": 3.142857142857143e-05, + "loss": 0.3395, + "step": 1403 + }, + { + "epoch": 1.3037621922898281, + "grad_norm": 0.22507581862454673, + "learning_rate": 3.1411359724612736e-05, + "loss": 0.3462, + "step": 1404 + }, + { + "epoch": 1.304691128657687, + "grad_norm": 0.20997485299055227, + "learning_rate": 3.139414802065405e-05, + "loss": 0.3248, + "step": 1405 + }, + { + "epoch": 1.3056200650255456, + "grad_norm": 0.20777883901322605, + "learning_rate": 3.137693631669535e-05, + "loss": 0.3632, + "step": 1406 + }, + { + "epoch": 1.3065490013934045, + "grad_norm": 0.22237908122375719, + "learning_rate": 3.1359724612736664e-05, + "loss": 0.3547, + "step": 1407 + }, + { + "epoch": 1.3074779377612633, + "grad_norm": 0.22989418042498605, + "learning_rate": 3.134251290877797e-05, + "loss": 0.3386, + "step": 1408 + }, + { + "epoch": 1.3084068741291222, + "grad_norm": 0.18232306340600674, + "learning_rate": 3.132530120481928e-05, + "loss": 0.3387, + "step": 1409 + }, + { + "epoch": 1.309335810496981, + "grad_norm": 0.22379285894618078, + "learning_rate": 3.1308089500860586e-05, + "loss": 0.351, + "step": 1410 + }, + { + "epoch": 1.3102647468648398, + "grad_norm": 0.22957367785503446, + "learning_rate": 3.12908777969019e-05, + "loss": 0.3538, + "step": 1411 + }, + { + "epoch": 1.3111936832326987, + "grad_norm": 0.2213926772964687, + "learning_rate": 3.12736660929432e-05, + "loss": 0.3501, + "step": 1412 + }, + { + "epoch": 1.3121226196005573, + "grad_norm": 0.2199668333695165, + "learning_rate": 3.1256454388984515e-05, + "loss": 0.3357, + "step": 1413 + }, + { + "epoch": 1.3130515559684162, + "grad_norm": 0.22045412078084195, + "learning_rate": 3.123924268502582e-05, + "loss": 0.3665, + "step": 1414 + }, + { + "epoch": 1.313980492336275, + "grad_norm": 0.23937083341287288, + "learning_rate": 3.122203098106713e-05, + "loss": 0.3541, + "step": 1415 + }, + { + "epoch": 1.3149094287041336, + "grad_norm": 0.22142560842558442, + "learning_rate": 3.1204819277108436e-05, + "loss": 0.3589, + "step": 1416 + }, + { + "epoch": 1.3158383650719925, + "grad_norm": 0.25259554318810806, + "learning_rate": 3.118760757314974e-05, + "loss": 0.3558, + "step": 1417 + }, + { + "epoch": 1.3167673014398513, + "grad_norm": 0.23469823002390106, + "learning_rate": 3.1170395869191046e-05, + "loss": 0.3604, + "step": 1418 + }, + { + "epoch": 1.3176962378077102, + "grad_norm": 0.2318292616166613, + "learning_rate": 3.115318416523236e-05, + "loss": 0.3231, + "step": 1419 + }, + { + "epoch": 1.318625174175569, + "grad_norm": 0.23736119043390674, + "learning_rate": 3.113597246127366e-05, + "loss": 0.3644, + "step": 1420 + }, + { + "epoch": 1.3195541105434279, + "grad_norm": 0.2164485499905474, + "learning_rate": 3.1118760757314975e-05, + "loss": 0.3662, + "step": 1421 + }, + { + "epoch": 1.3204830469112867, + "grad_norm": 0.21673871274658302, + "learning_rate": 3.110154905335628e-05, + "loss": 0.3451, + "step": 1422 + }, + { + "epoch": 1.3214119832791453, + "grad_norm": 0.2472895979807864, + "learning_rate": 3.108433734939759e-05, + "loss": 0.3543, + "step": 1423 + }, + { + "epoch": 1.3223409196470042, + "grad_norm": 0.2245120667209746, + "learning_rate": 3.10671256454389e-05, + "loss": 0.3624, + "step": 1424 + }, + { + "epoch": 1.323269856014863, + "grad_norm": 0.24985123445714202, + "learning_rate": 3.104991394148021e-05, + "loss": 0.3387, + "step": 1425 + }, + { + "epoch": 1.3241987923827216, + "grad_norm": 0.2142027736341417, + "learning_rate": 3.1032702237521514e-05, + "loss": 0.3576, + "step": 1426 + }, + { + "epoch": 1.3251277287505805, + "grad_norm": 0.21501285847890808, + "learning_rate": 3.1015490533562825e-05, + "loss": 0.3259, + "step": 1427 + }, + { + "epoch": 1.3260566651184393, + "grad_norm": 0.22544501880772352, + "learning_rate": 3.099827882960413e-05, + "loss": 0.3363, + "step": 1428 + }, + { + "epoch": 1.3269856014862982, + "grad_norm": 0.23996880906387041, + "learning_rate": 3.098106712564544e-05, + "loss": 0.3681, + "step": 1429 + }, + { + "epoch": 1.327914537854157, + "grad_norm": 0.22942067334804397, + "learning_rate": 3.096385542168675e-05, + "loss": 0.3505, + "step": 1430 + }, + { + "epoch": 1.3288434742220159, + "grad_norm": 0.23432507600874153, + "learning_rate": 3.094664371772806e-05, + "loss": 0.3703, + "step": 1431 + }, + { + "epoch": 1.3297724105898747, + "grad_norm": 0.19874580942818373, + "learning_rate": 3.0929432013769364e-05, + "loss": 0.3629, + "step": 1432 + }, + { + "epoch": 1.3307013469577333, + "grad_norm": 0.2312124529538691, + "learning_rate": 3.0912220309810676e-05, + "loss": 0.3382, + "step": 1433 + }, + { + "epoch": 1.3316302833255922, + "grad_norm": 0.20766547973250402, + "learning_rate": 3.089500860585198e-05, + "loss": 0.3572, + "step": 1434 + }, + { + "epoch": 1.332559219693451, + "grad_norm": 0.20786382857619148, + "learning_rate": 3.087779690189329e-05, + "loss": 0.358, + "step": 1435 + }, + { + "epoch": 1.3334881560613099, + "grad_norm": 0.20447235064175212, + "learning_rate": 3.08605851979346e-05, + "loss": 0.3364, + "step": 1436 + }, + { + "epoch": 1.3344170924291685, + "grad_norm": 0.1996680493576843, + "learning_rate": 3.084337349397591e-05, + "loss": 0.3557, + "step": 1437 + }, + { + "epoch": 1.3353460287970274, + "grad_norm": 0.19501800099094735, + "learning_rate": 3.082616179001721e-05, + "loss": 0.3515, + "step": 1438 + }, + { + "epoch": 1.3362749651648862, + "grad_norm": 0.23158157472952676, + "learning_rate": 3.080895008605852e-05, + "loss": 0.3582, + "step": 1439 + }, + { + "epoch": 1.337203901532745, + "grad_norm": 0.6667951671559779, + "learning_rate": 3.0791738382099825e-05, + "loss": 0.3487, + "step": 1440 + }, + { + "epoch": 1.3381328379006039, + "grad_norm": 0.20918867974295047, + "learning_rate": 3.0774526678141136e-05, + "loss": 0.3528, + "step": 1441 + }, + { + "epoch": 1.3390617742684627, + "grad_norm": 0.22293293277353693, + "learning_rate": 3.075731497418244e-05, + "loss": 0.3409, + "step": 1442 + }, + { + "epoch": 1.3399907106363214, + "grad_norm": 0.1829660308906723, + "learning_rate": 3.074010327022375e-05, + "loss": 0.3455, + "step": 1443 + }, + { + "epoch": 1.3409196470041802, + "grad_norm": 0.2000567256104809, + "learning_rate": 3.072289156626506e-05, + "loss": 0.3643, + "step": 1444 + }, + { + "epoch": 1.341848583372039, + "grad_norm": 0.2115519953166791, + "learning_rate": 3.070567986230637e-05, + "loss": 0.3661, + "step": 1445 + }, + { + "epoch": 1.342777519739898, + "grad_norm": 0.21732387886026813, + "learning_rate": 3.0688468158347675e-05, + "loss": 0.3316, + "step": 1446 + }, + { + "epoch": 1.3437064561077565, + "grad_norm": 0.20075318769521555, + "learning_rate": 3.067125645438899e-05, + "loss": 0.3554, + "step": 1447 + }, + { + "epoch": 1.3446353924756154, + "grad_norm": 0.20861534949300697, + "learning_rate": 3.065404475043029e-05, + "loss": 0.3505, + "step": 1448 + }, + { + "epoch": 1.3455643288434742, + "grad_norm": 0.254657487659329, + "learning_rate": 3.0636833046471604e-05, + "loss": 0.3625, + "step": 1449 + }, + { + "epoch": 1.346493265211333, + "grad_norm": 0.2070643343223196, + "learning_rate": 3.061962134251291e-05, + "loss": 0.3239, + "step": 1450 + }, + { + "epoch": 1.347422201579192, + "grad_norm": 0.1928014077888377, + "learning_rate": 3.060240963855422e-05, + "loss": 0.3558, + "step": 1451 + }, + { + "epoch": 1.3483511379470507, + "grad_norm": 0.2294446621025945, + "learning_rate": 3.0585197934595525e-05, + "loss": 0.3674, + "step": 1452 + }, + { + "epoch": 1.3492800743149094, + "grad_norm": 0.18874531273124526, + "learning_rate": 3.056798623063684e-05, + "loss": 0.3561, + "step": 1453 + }, + { + "epoch": 1.3502090106827682, + "grad_norm": 0.22251661860503666, + "learning_rate": 3.055077452667814e-05, + "loss": 0.3603, + "step": 1454 + }, + { + "epoch": 1.351137947050627, + "grad_norm": 0.2235668510889753, + "learning_rate": 3.0533562822719454e-05, + "loss": 0.3517, + "step": 1455 + }, + { + "epoch": 1.352066883418486, + "grad_norm": 0.21432492708979067, + "learning_rate": 3.0516351118760762e-05, + "loss": 0.3496, + "step": 1456 + }, + { + "epoch": 1.3529958197863445, + "grad_norm": 0.20494031548874958, + "learning_rate": 3.049913941480207e-05, + "loss": 0.3342, + "step": 1457 + }, + { + "epoch": 1.3539247561542034, + "grad_norm": 0.20672906781500622, + "learning_rate": 3.048192771084338e-05, + "loss": 0.366, + "step": 1458 + }, + { + "epoch": 1.3548536925220622, + "grad_norm": 0.2042325486051598, + "learning_rate": 3.046471600688468e-05, + "loss": 0.3293, + "step": 1459 + }, + { + "epoch": 1.355782628889921, + "grad_norm": 0.20799195423693348, + "learning_rate": 3.044750430292599e-05, + "loss": 0.3273, + "step": 1460 + }, + { + "epoch": 1.35671156525778, + "grad_norm": 0.1915937763700306, + "learning_rate": 3.0430292598967298e-05, + "loss": 0.3195, + "step": 1461 + }, + { + "epoch": 1.3576405016256388, + "grad_norm": 0.18941206561090246, + "learning_rate": 3.0413080895008606e-05, + "loss": 0.3603, + "step": 1462 + }, + { + "epoch": 1.3585694379934974, + "grad_norm": 0.22011411157308233, + "learning_rate": 3.0395869191049914e-05, + "loss": 0.3389, + "step": 1463 + }, + { + "epoch": 1.3594983743613562, + "grad_norm": 0.20454140375565522, + "learning_rate": 3.0378657487091223e-05, + "loss": 0.3412, + "step": 1464 + }, + { + "epoch": 1.360427310729215, + "grad_norm": 0.204807974919683, + "learning_rate": 3.036144578313253e-05, + "loss": 0.3688, + "step": 1465 + }, + { + "epoch": 1.361356247097074, + "grad_norm": 0.2500844338812536, + "learning_rate": 3.034423407917384e-05, + "loss": 0.3528, + "step": 1466 + }, + { + "epoch": 1.3622851834649325, + "grad_norm": 0.19177649858240708, + "learning_rate": 3.0327022375215148e-05, + "loss": 0.3371, + "step": 1467 + }, + { + "epoch": 1.3632141198327914, + "grad_norm": 0.23935175027569758, + "learning_rate": 3.0309810671256456e-05, + "loss": 0.3659, + "step": 1468 + }, + { + "epoch": 1.3641430562006502, + "grad_norm": 0.2132668493989873, + "learning_rate": 3.0292598967297765e-05, + "loss": 0.3447, + "step": 1469 + }, + { + "epoch": 1.365071992568509, + "grad_norm": 0.2060605613886273, + "learning_rate": 3.0275387263339073e-05, + "loss": 0.3522, + "step": 1470 + }, + { + "epoch": 1.366000928936368, + "grad_norm": 0.19961249600782785, + "learning_rate": 3.025817555938038e-05, + "loss": 0.3369, + "step": 1471 + }, + { + "epoch": 1.3669298653042268, + "grad_norm": 0.20880345374087075, + "learning_rate": 3.024096385542169e-05, + "loss": 0.3371, + "step": 1472 + }, + { + "epoch": 1.3678588016720854, + "grad_norm": 0.18622587429732448, + "learning_rate": 3.0223752151463e-05, + "loss": 0.3546, + "step": 1473 + }, + { + "epoch": 1.3687877380399442, + "grad_norm": 0.21866694803984443, + "learning_rate": 3.0206540447504307e-05, + "loss": 0.3582, + "step": 1474 + }, + { + "epoch": 1.369716674407803, + "grad_norm": 0.1952217882301225, + "learning_rate": 3.0189328743545615e-05, + "loss": 0.338, + "step": 1475 + }, + { + "epoch": 1.370645610775662, + "grad_norm": 0.1935932378498297, + "learning_rate": 3.0172117039586924e-05, + "loss": 0.3311, + "step": 1476 + }, + { + "epoch": 1.3715745471435206, + "grad_norm": 0.21516220551975052, + "learning_rate": 3.0154905335628232e-05, + "loss": 0.3438, + "step": 1477 + }, + { + "epoch": 1.3725034835113794, + "grad_norm": 0.18251550631463553, + "learning_rate": 3.013769363166954e-05, + "loss": 0.355, + "step": 1478 + }, + { + "epoch": 1.3734324198792383, + "grad_norm": 0.21335773806259328, + "learning_rate": 3.012048192771085e-05, + "loss": 0.3555, + "step": 1479 + }, + { + "epoch": 1.374361356247097, + "grad_norm": 0.19458561617263528, + "learning_rate": 3.010327022375215e-05, + "loss": 0.3334, + "step": 1480 + }, + { + "epoch": 1.375290292614956, + "grad_norm": 0.2006684368240207, + "learning_rate": 3.008605851979346e-05, + "loss": 0.3541, + "step": 1481 + }, + { + "epoch": 1.3762192289828148, + "grad_norm": 0.20772428446640925, + "learning_rate": 3.0068846815834767e-05, + "loss": 0.3289, + "step": 1482 + }, + { + "epoch": 1.3771481653506734, + "grad_norm": 0.1940082452881049, + "learning_rate": 3.0051635111876076e-05, + "loss": 0.3464, + "step": 1483 + }, + { + "epoch": 1.3780771017185323, + "grad_norm": 0.21437646933932283, + "learning_rate": 3.0034423407917384e-05, + "loss": 0.3589, + "step": 1484 + }, + { + "epoch": 1.379006038086391, + "grad_norm": 0.20429035849628274, + "learning_rate": 3.0017211703958692e-05, + "loss": 0.3367, + "step": 1485 + }, + { + "epoch": 1.37993497445425, + "grad_norm": 0.19153881560052258, + "learning_rate": 3e-05, + "loss": 0.3595, + "step": 1486 + }, + { + "epoch": 1.3808639108221086, + "grad_norm": 0.21368509231169802, + "learning_rate": 2.998278829604131e-05, + "loss": 0.338, + "step": 1487 + }, + { + "epoch": 1.3817928471899674, + "grad_norm": 0.19137787166733314, + "learning_rate": 2.9965576592082618e-05, + "loss": 0.3409, + "step": 1488 + }, + { + "epoch": 1.3827217835578263, + "grad_norm": 0.20518740175365516, + "learning_rate": 2.9948364888123926e-05, + "loss": 0.3458, + "step": 1489 + }, + { + "epoch": 1.3836507199256851, + "grad_norm": 0.21123625494807322, + "learning_rate": 2.9931153184165234e-05, + "loss": 0.3577, + "step": 1490 + }, + { + "epoch": 1.384579656293544, + "grad_norm": 0.20634934940948052, + "learning_rate": 2.9913941480206543e-05, + "loss": 0.3472, + "step": 1491 + }, + { + "epoch": 1.3855085926614028, + "grad_norm": 0.20463549838354728, + "learning_rate": 2.989672977624785e-05, + "loss": 0.3457, + "step": 1492 + }, + { + "epoch": 1.3864375290292614, + "grad_norm": 0.24704845466896294, + "learning_rate": 2.987951807228916e-05, + "loss": 0.3598, + "step": 1493 + }, + { + "epoch": 1.3873664653971203, + "grad_norm": 0.1988515336691101, + "learning_rate": 2.9862306368330468e-05, + "loss": 0.3372, + "step": 1494 + }, + { + "epoch": 1.3882954017649791, + "grad_norm": 0.20847564536416982, + "learning_rate": 2.9845094664371776e-05, + "loss": 0.3428, + "step": 1495 + }, + { + "epoch": 1.389224338132838, + "grad_norm": 0.21413822989945702, + "learning_rate": 2.9827882960413085e-05, + "loss": 0.3787, + "step": 1496 + }, + { + "epoch": 1.3901532745006966, + "grad_norm": 0.22368254215474134, + "learning_rate": 2.9810671256454393e-05, + "loss": 0.3139, + "step": 1497 + }, + { + "epoch": 1.3910822108685554, + "grad_norm": 0.19657862058957046, + "learning_rate": 2.97934595524957e-05, + "loss": 0.3386, + "step": 1498 + }, + { + "epoch": 1.3920111472364143, + "grad_norm": 0.20285755404055592, + "learning_rate": 2.977624784853701e-05, + "loss": 0.3551, + "step": 1499 + }, + { + "epoch": 1.3929400836042731, + "grad_norm": 0.24237808422101634, + "learning_rate": 2.975903614457832e-05, + "loss": 0.3325, + "step": 1500 + }, + { + "epoch": 1.393869019972132, + "grad_norm": 0.1939612180735908, + "learning_rate": 2.974182444061962e-05, + "loss": 0.3643, + "step": 1501 + }, + { + "epoch": 1.3947979563399908, + "grad_norm": 0.25538025094523903, + "learning_rate": 2.972461273666093e-05, + "loss": 0.3472, + "step": 1502 + }, + { + "epoch": 1.3957268927078494, + "grad_norm": 0.2411165671533172, + "learning_rate": 2.9707401032702237e-05, + "loss": 0.3618, + "step": 1503 + }, + { + "epoch": 1.3966558290757083, + "grad_norm": 0.18225449502294644, + "learning_rate": 2.9690189328743545e-05, + "loss": 0.3378, + "step": 1504 + }, + { + "epoch": 1.3975847654435671, + "grad_norm": 0.2222434673585961, + "learning_rate": 2.9672977624784854e-05, + "loss": 0.3501, + "step": 1505 + }, + { + "epoch": 1.398513701811426, + "grad_norm": 0.22332973721087676, + "learning_rate": 2.9655765920826162e-05, + "loss": 0.3361, + "step": 1506 + }, + { + "epoch": 1.3994426381792846, + "grad_norm": 0.204654731810479, + "learning_rate": 2.963855421686747e-05, + "loss": 0.3594, + "step": 1507 + }, + { + "epoch": 1.4003715745471434, + "grad_norm": 0.18130868120983315, + "learning_rate": 2.962134251290878e-05, + "loss": 0.3504, + "step": 1508 + }, + { + "epoch": 1.4013005109150023, + "grad_norm": 0.22439741167196856, + "learning_rate": 2.9604130808950087e-05, + "loss": 0.3291, + "step": 1509 + }, + { + "epoch": 1.4022294472828611, + "grad_norm": 0.2513401337007612, + "learning_rate": 2.9586919104991396e-05, + "loss": 0.3703, + "step": 1510 + }, + { + "epoch": 1.40315838365072, + "grad_norm": 0.20610567460377152, + "learning_rate": 2.9569707401032704e-05, + "loss": 0.3347, + "step": 1511 + }, + { + "epoch": 1.4040873200185788, + "grad_norm": 0.2134355620033519, + "learning_rate": 2.9552495697074012e-05, + "loss": 0.3328, + "step": 1512 + }, + { + "epoch": 1.4050162563864375, + "grad_norm": 0.26437614395333625, + "learning_rate": 2.953528399311532e-05, + "loss": 0.3456, + "step": 1513 + }, + { + "epoch": 1.4059451927542963, + "grad_norm": 0.20273923027295387, + "learning_rate": 2.951807228915663e-05, + "loss": 0.3522, + "step": 1514 + }, + { + "epoch": 1.4068741291221551, + "grad_norm": 0.23385915941470425, + "learning_rate": 2.9500860585197938e-05, + "loss": 0.3654, + "step": 1515 + }, + { + "epoch": 1.407803065490014, + "grad_norm": 0.24126554734693004, + "learning_rate": 2.9483648881239246e-05, + "loss": 0.3486, + "step": 1516 + }, + { + "epoch": 1.4087320018578726, + "grad_norm": 0.23539838223695975, + "learning_rate": 2.9466437177280554e-05, + "loss": 0.3526, + "step": 1517 + }, + { + "epoch": 1.4096609382257315, + "grad_norm": 0.202514576386571, + "learning_rate": 2.9449225473321863e-05, + "loss": 0.347, + "step": 1518 + }, + { + "epoch": 1.4105898745935903, + "grad_norm": 0.21679264136440565, + "learning_rate": 2.943201376936317e-05, + "loss": 0.3446, + "step": 1519 + }, + { + "epoch": 1.4115188109614492, + "grad_norm": 0.20985791433208453, + "learning_rate": 2.941480206540448e-05, + "loss": 0.3336, + "step": 1520 + }, + { + "epoch": 1.412447747329308, + "grad_norm": 0.20082124256632042, + "learning_rate": 2.9397590361445788e-05, + "loss": 0.3627, + "step": 1521 + }, + { + "epoch": 1.4133766836971668, + "grad_norm": 0.21260803696246838, + "learning_rate": 2.9380378657487096e-05, + "loss": 0.3279, + "step": 1522 + }, + { + "epoch": 1.4143056200650255, + "grad_norm": 0.19228212989651475, + "learning_rate": 2.9363166953528398e-05, + "loss": 0.3536, + "step": 1523 + }, + { + "epoch": 1.4152345564328843, + "grad_norm": 0.1964531621998223, + "learning_rate": 2.9345955249569706e-05, + "loss": 0.3244, + "step": 1524 + }, + { + "epoch": 1.4161634928007432, + "grad_norm": 0.20227705650558617, + "learning_rate": 2.9328743545611015e-05, + "loss": 0.3637, + "step": 1525 + }, + { + "epoch": 1.417092429168602, + "grad_norm": 0.19644430453193695, + "learning_rate": 2.9311531841652323e-05, + "loss": 0.3379, + "step": 1526 + }, + { + "epoch": 1.4180213655364606, + "grad_norm": 0.19015970604275795, + "learning_rate": 2.929432013769363e-05, + "loss": 0.3505, + "step": 1527 + }, + { + "epoch": 1.4189503019043195, + "grad_norm": 0.19835877082909076, + "learning_rate": 2.927710843373494e-05, + "loss": 0.3479, + "step": 1528 + }, + { + "epoch": 1.4198792382721783, + "grad_norm": 0.20490877773794255, + "learning_rate": 2.925989672977625e-05, + "loss": 0.3656, + "step": 1529 + }, + { + "epoch": 1.4208081746400372, + "grad_norm": 0.2700572256039223, + "learning_rate": 2.9242685025817557e-05, + "loss": 0.35, + "step": 1530 + }, + { + "epoch": 1.421737111007896, + "grad_norm": 0.19596937388189786, + "learning_rate": 2.9225473321858865e-05, + "loss": 0.3539, + "step": 1531 + }, + { + "epoch": 1.4226660473757549, + "grad_norm": 0.21738021003203584, + "learning_rate": 2.9208261617900174e-05, + "loss": 0.3581, + "step": 1532 + }, + { + "epoch": 1.4235949837436135, + "grad_norm": 0.20702370533181025, + "learning_rate": 2.9191049913941482e-05, + "loss": 0.3408, + "step": 1533 + }, + { + "epoch": 1.4245239201114723, + "grad_norm": 0.18957085537630466, + "learning_rate": 2.917383820998279e-05, + "loss": 0.3734, + "step": 1534 + }, + { + "epoch": 1.4254528564793312, + "grad_norm": 0.22813119163304144, + "learning_rate": 2.91566265060241e-05, + "loss": 0.3694, + "step": 1535 + }, + { + "epoch": 1.42638179284719, + "grad_norm": 0.20400390978454386, + "learning_rate": 2.9139414802065407e-05, + "loss": 0.3435, + "step": 1536 + }, + { + "epoch": 1.4273107292150486, + "grad_norm": 0.18107256509762557, + "learning_rate": 2.9122203098106716e-05, + "loss": 0.3558, + "step": 1537 + }, + { + "epoch": 1.4282396655829075, + "grad_norm": 0.23378269273098054, + "learning_rate": 2.9104991394148024e-05, + "loss": 0.3488, + "step": 1538 + }, + { + "epoch": 1.4291686019507663, + "grad_norm": 0.1881671167013014, + "learning_rate": 2.9087779690189332e-05, + "loss": 0.3399, + "step": 1539 + }, + { + "epoch": 1.4300975383186252, + "grad_norm": 0.19700102593332322, + "learning_rate": 2.907056798623064e-05, + "loss": 0.3562, + "step": 1540 + }, + { + "epoch": 1.431026474686484, + "grad_norm": 0.2013437891388278, + "learning_rate": 2.905335628227195e-05, + "loss": 0.342, + "step": 1541 + }, + { + "epoch": 1.4319554110543429, + "grad_norm": 0.19061395252656796, + "learning_rate": 2.9036144578313258e-05, + "loss": 0.3286, + "step": 1542 + }, + { + "epoch": 1.4328843474222015, + "grad_norm": 0.2133955681118407, + "learning_rate": 2.9018932874354566e-05, + "loss": 0.355, + "step": 1543 + }, + { + "epoch": 1.4338132837900603, + "grad_norm": 0.21653616877161203, + "learning_rate": 2.9001721170395868e-05, + "loss": 0.3637, + "step": 1544 + }, + { + "epoch": 1.4347422201579192, + "grad_norm": 0.2241202614347583, + "learning_rate": 2.8984509466437176e-05, + "loss": 0.3392, + "step": 1545 + }, + { + "epoch": 1.435671156525778, + "grad_norm": 0.2024615955265462, + "learning_rate": 2.8967297762478484e-05, + "loss": 0.3543, + "step": 1546 + }, + { + "epoch": 1.4366000928936367, + "grad_norm": 0.1955822427239985, + "learning_rate": 2.8950086058519793e-05, + "loss": 0.3364, + "step": 1547 + }, + { + "epoch": 1.4375290292614955, + "grad_norm": 0.20873100814874504, + "learning_rate": 2.89328743545611e-05, + "loss": 0.3517, + "step": 1548 + }, + { + "epoch": 1.4384579656293544, + "grad_norm": 0.2112000677453288, + "learning_rate": 2.891566265060241e-05, + "loss": 0.3469, + "step": 1549 + }, + { + "epoch": 1.4393869019972132, + "grad_norm": 0.20592979867590985, + "learning_rate": 2.8898450946643718e-05, + "loss": 0.3601, + "step": 1550 + }, + { + "epoch": 1.440315838365072, + "grad_norm": 0.21192866699319832, + "learning_rate": 2.8881239242685026e-05, + "loss": 0.3469, + "step": 1551 + }, + { + "epoch": 1.4412447747329309, + "grad_norm": 0.18398546591558737, + "learning_rate": 2.8864027538726335e-05, + "loss": 0.3441, + "step": 1552 + }, + { + "epoch": 1.4421737111007895, + "grad_norm": 0.2017578725992054, + "learning_rate": 2.8846815834767643e-05, + "loss": 0.3617, + "step": 1553 + }, + { + "epoch": 1.4431026474686484, + "grad_norm": 0.20148355147165561, + "learning_rate": 2.882960413080895e-05, + "loss": 0.3574, + "step": 1554 + }, + { + "epoch": 1.4440315838365072, + "grad_norm": 0.18503169901772, + "learning_rate": 2.881239242685026e-05, + "loss": 0.3602, + "step": 1555 + }, + { + "epoch": 1.444960520204366, + "grad_norm": 0.19444893478794364, + "learning_rate": 2.879518072289157e-05, + "loss": 0.3423, + "step": 1556 + }, + { + "epoch": 1.4458894565722247, + "grad_norm": 0.17839868562302205, + "learning_rate": 2.8777969018932877e-05, + "loss": 0.3228, + "step": 1557 + }, + { + "epoch": 1.4468183929400835, + "grad_norm": 0.17929509233834304, + "learning_rate": 2.8760757314974185e-05, + "loss": 0.3096, + "step": 1558 + }, + { + "epoch": 1.4477473293079424, + "grad_norm": 0.18214616689218266, + "learning_rate": 2.8743545611015494e-05, + "loss": 0.3522, + "step": 1559 + }, + { + "epoch": 1.4486762656758012, + "grad_norm": 0.2012672967397137, + "learning_rate": 2.8726333907056802e-05, + "loss": 0.358, + "step": 1560 + }, + { + "epoch": 1.44960520204366, + "grad_norm": 0.19930879626694126, + "learning_rate": 2.870912220309811e-05, + "loss": 0.3431, + "step": 1561 + }, + { + "epoch": 1.450534138411519, + "grad_norm": 0.17903764713191608, + "learning_rate": 2.869191049913942e-05, + "loss": 0.3396, + "step": 1562 + }, + { + "epoch": 1.4514630747793777, + "grad_norm": 0.19529230647348247, + "learning_rate": 2.8674698795180727e-05, + "loss": 0.3592, + "step": 1563 + }, + { + "epoch": 1.4523920111472364, + "grad_norm": 0.19975868842982136, + "learning_rate": 2.8657487091222036e-05, + "loss": 0.355, + "step": 1564 + }, + { + "epoch": 1.4533209475150952, + "grad_norm": 0.20358611197286738, + "learning_rate": 2.8640275387263337e-05, + "loss": 0.3527, + "step": 1565 + }, + { + "epoch": 1.454249883882954, + "grad_norm": 0.19856276668969844, + "learning_rate": 2.8623063683304646e-05, + "loss": 0.3266, + "step": 1566 + }, + { + "epoch": 1.4551788202508127, + "grad_norm": 0.19124828137152422, + "learning_rate": 2.8605851979345954e-05, + "loss": 0.3324, + "step": 1567 + }, + { + "epoch": 1.4561077566186715, + "grad_norm": 0.20725907417756995, + "learning_rate": 2.8588640275387262e-05, + "loss": 0.3357, + "step": 1568 + }, + { + "epoch": 1.4570366929865304, + "grad_norm": 0.2176938171580828, + "learning_rate": 2.857142857142857e-05, + "loss": 0.3435, + "step": 1569 + }, + { + "epoch": 1.4579656293543892, + "grad_norm": 0.21108437489228465, + "learning_rate": 2.855421686746988e-05, + "loss": 0.3466, + "step": 1570 + }, + { + "epoch": 1.458894565722248, + "grad_norm": 0.22480057073204013, + "learning_rate": 2.8537005163511188e-05, + "loss": 0.3606, + "step": 1571 + }, + { + "epoch": 1.459823502090107, + "grad_norm": 0.20864206196181312, + "learning_rate": 2.8519793459552496e-05, + "loss": 0.339, + "step": 1572 + }, + { + "epoch": 1.4607524384579658, + "grad_norm": 0.21922099540081025, + "learning_rate": 2.8502581755593804e-05, + "loss": 0.3444, + "step": 1573 + }, + { + "epoch": 1.4616813748258244, + "grad_norm": 0.2276494800159974, + "learning_rate": 2.8485370051635113e-05, + "loss": 0.3599, + "step": 1574 + }, + { + "epoch": 1.4626103111936832, + "grad_norm": 0.2288239429440535, + "learning_rate": 2.846815834767642e-05, + "loss": 0.3491, + "step": 1575 + }, + { + "epoch": 1.463539247561542, + "grad_norm": 0.17841881990882816, + "learning_rate": 2.845094664371773e-05, + "loss": 0.343, + "step": 1576 + }, + { + "epoch": 1.4644681839294007, + "grad_norm": 0.21286928712739886, + "learning_rate": 2.8433734939759038e-05, + "loss": 0.3521, + "step": 1577 + }, + { + "epoch": 1.4653971202972595, + "grad_norm": 0.21833463235565898, + "learning_rate": 2.8416523235800346e-05, + "loss": 0.3438, + "step": 1578 + }, + { + "epoch": 1.4663260566651184, + "grad_norm": 0.18733866679034886, + "learning_rate": 2.8399311531841655e-05, + "loss": 0.3382, + "step": 1579 + }, + { + "epoch": 1.4672549930329772, + "grad_norm": 0.20885255362383456, + "learning_rate": 2.8382099827882963e-05, + "loss": 0.3352, + "step": 1580 + }, + { + "epoch": 1.468183929400836, + "grad_norm": 0.2110857036563971, + "learning_rate": 2.836488812392427e-05, + "loss": 0.3451, + "step": 1581 + }, + { + "epoch": 1.469112865768695, + "grad_norm": 0.21499377814488893, + "learning_rate": 2.834767641996558e-05, + "loss": 0.344, + "step": 1582 + }, + { + "epoch": 1.4700418021365538, + "grad_norm": 0.2241849573695223, + "learning_rate": 2.833046471600689e-05, + "loss": 0.3292, + "step": 1583 + }, + { + "epoch": 1.4709707385044124, + "grad_norm": 0.21699607258286505, + "learning_rate": 2.8313253012048197e-05, + "loss": 0.334, + "step": 1584 + }, + { + "epoch": 1.4718996748722712, + "grad_norm": 0.22542594584645712, + "learning_rate": 2.8296041308089505e-05, + "loss": 0.3477, + "step": 1585 + }, + { + "epoch": 1.47282861124013, + "grad_norm": 0.1970265391341249, + "learning_rate": 2.8278829604130807e-05, + "loss": 0.3338, + "step": 1586 + }, + { + "epoch": 1.4737575476079887, + "grad_norm": 0.21144181981047877, + "learning_rate": 2.8261617900172115e-05, + "loss": 0.3657, + "step": 1587 + }, + { + "epoch": 1.4746864839758476, + "grad_norm": 0.22030587436035384, + "learning_rate": 2.8244406196213424e-05, + "loss": 0.3263, + "step": 1588 + }, + { + "epoch": 1.4756154203437064, + "grad_norm": 0.21592987334276875, + "learning_rate": 2.8227194492254732e-05, + "loss": 0.3691, + "step": 1589 + }, + { + "epoch": 1.4765443567115653, + "grad_norm": 0.20142452673876848, + "learning_rate": 2.820998278829604e-05, + "loss": 0.3485, + "step": 1590 + }, + { + "epoch": 1.477473293079424, + "grad_norm": 0.21246785539377971, + "learning_rate": 2.819277108433735e-05, + "loss": 0.3379, + "step": 1591 + }, + { + "epoch": 1.478402229447283, + "grad_norm": 0.20419932770516422, + "learning_rate": 2.8175559380378657e-05, + "loss": 0.3664, + "step": 1592 + }, + { + "epoch": 1.4793311658151418, + "grad_norm": 0.23868328892673205, + "learning_rate": 2.8158347676419966e-05, + "loss": 0.3579, + "step": 1593 + }, + { + "epoch": 1.4802601021830004, + "grad_norm": 0.18142148540515865, + "learning_rate": 2.8141135972461274e-05, + "loss": 0.3293, + "step": 1594 + }, + { + "epoch": 1.4811890385508593, + "grad_norm": 0.190821964829027, + "learning_rate": 2.8123924268502582e-05, + "loss": 0.3336, + "step": 1595 + }, + { + "epoch": 1.482117974918718, + "grad_norm": 0.1990392036313888, + "learning_rate": 2.810671256454389e-05, + "loss": 0.3639, + "step": 1596 + }, + { + "epoch": 1.483046911286577, + "grad_norm": 0.20621738698691292, + "learning_rate": 2.80895008605852e-05, + "loss": 0.3524, + "step": 1597 + }, + { + "epoch": 1.4839758476544356, + "grad_norm": 0.21378160126323356, + "learning_rate": 2.8072289156626508e-05, + "loss": 0.3507, + "step": 1598 + }, + { + "epoch": 1.4849047840222944, + "grad_norm": 0.2159950509848571, + "learning_rate": 2.8055077452667816e-05, + "loss": 0.3549, + "step": 1599 + }, + { + "epoch": 1.4858337203901533, + "grad_norm": 0.18702549069355595, + "learning_rate": 2.8037865748709124e-05, + "loss": 0.3366, + "step": 1600 + }, + { + "epoch": 1.4867626567580121, + "grad_norm": 0.1994967299413536, + "learning_rate": 2.8020654044750433e-05, + "loss": 0.3256, + "step": 1601 + }, + { + "epoch": 1.487691593125871, + "grad_norm": 0.21942189823000416, + "learning_rate": 2.800344234079174e-05, + "loss": 0.3581, + "step": 1602 + }, + { + "epoch": 1.4886205294937298, + "grad_norm": 0.1766879531892212, + "learning_rate": 2.798623063683305e-05, + "loss": 0.3457, + "step": 1603 + }, + { + "epoch": 1.4895494658615884, + "grad_norm": 0.19258517625663005, + "learning_rate": 2.7969018932874358e-05, + "loss": 0.3425, + "step": 1604 + }, + { + "epoch": 1.4904784022294473, + "grad_norm": 0.2101457920763198, + "learning_rate": 2.7951807228915666e-05, + "loss": 0.3442, + "step": 1605 + }, + { + "epoch": 1.4914073385973061, + "grad_norm": 0.17432791499600817, + "learning_rate": 2.7934595524956975e-05, + "loss": 0.3537, + "step": 1606 + }, + { + "epoch": 1.492336274965165, + "grad_norm": 0.21787927461103915, + "learning_rate": 2.7917383820998277e-05, + "loss": 0.3375, + "step": 1607 + }, + { + "epoch": 1.4932652113330236, + "grad_norm": 0.2135791752018399, + "learning_rate": 2.7900172117039585e-05, + "loss": 0.3642, + "step": 1608 + }, + { + "epoch": 1.4941941477008824, + "grad_norm": 0.19691607764966712, + "learning_rate": 2.7882960413080893e-05, + "loss": 0.3492, + "step": 1609 + }, + { + "epoch": 1.4951230840687413, + "grad_norm": 0.19492765300345521, + "learning_rate": 2.7865748709122202e-05, + "loss": 0.3657, + "step": 1610 + }, + { + "epoch": 1.4960520204366001, + "grad_norm": 0.20105927888888867, + "learning_rate": 2.784853700516351e-05, + "loss": 0.3522, + "step": 1611 + }, + { + "epoch": 1.496980956804459, + "grad_norm": 0.2022257310559514, + "learning_rate": 2.783132530120482e-05, + "loss": 0.3352, + "step": 1612 + }, + { + "epoch": 1.4979098931723178, + "grad_norm": 0.18412786976848391, + "learning_rate": 2.7814113597246127e-05, + "loss": 0.3569, + "step": 1613 + }, + { + "epoch": 1.4988388295401764, + "grad_norm": 0.18521152850671202, + "learning_rate": 2.7796901893287435e-05, + "loss": 0.3291, + "step": 1614 + }, + { + "epoch": 1.4997677659080353, + "grad_norm": 0.18484257371830262, + "learning_rate": 2.7779690189328744e-05, + "loss": 0.3471, + "step": 1615 + }, + { + "epoch": 1.5006967022758941, + "grad_norm": 0.20613823228990855, + "learning_rate": 2.7762478485370052e-05, + "loss": 0.3523, + "step": 1616 + }, + { + "epoch": 1.5016256386437528, + "grad_norm": 0.21700406423639299, + "learning_rate": 2.774526678141136e-05, + "loss": 0.3402, + "step": 1617 + }, + { + "epoch": 1.5025545750116116, + "grad_norm": 0.1825892710960065, + "learning_rate": 2.772805507745267e-05, + "loss": 0.3463, + "step": 1618 + }, + { + "epoch": 1.5034835113794704, + "grad_norm": 0.21767334282347994, + "learning_rate": 2.7710843373493977e-05, + "loss": 0.3591, + "step": 1619 + }, + { + "epoch": 1.5044124477473293, + "grad_norm": 0.2240701912543056, + "learning_rate": 2.7693631669535286e-05, + "loss": 0.3491, + "step": 1620 + }, + { + "epoch": 1.5053413841151881, + "grad_norm": 0.20794265732288209, + "learning_rate": 2.7676419965576594e-05, + "loss": 0.3597, + "step": 1621 + }, + { + "epoch": 1.506270320483047, + "grad_norm": 0.1942386308673407, + "learning_rate": 2.7659208261617903e-05, + "loss": 0.3553, + "step": 1622 + }, + { + "epoch": 1.5071992568509058, + "grad_norm": 0.23953241896174912, + "learning_rate": 2.764199655765921e-05, + "loss": 0.3353, + "step": 1623 + }, + { + "epoch": 1.5081281932187647, + "grad_norm": 0.19064283645632496, + "learning_rate": 2.762478485370052e-05, + "loss": 0.3309, + "step": 1624 + }, + { + "epoch": 1.5090571295866233, + "grad_norm": 0.19455048912942696, + "learning_rate": 2.7607573149741828e-05, + "loss": 0.3512, + "step": 1625 + }, + { + "epoch": 1.5099860659544821, + "grad_norm": 0.24734707148619361, + "learning_rate": 2.7590361445783136e-05, + "loss": 0.3369, + "step": 1626 + }, + { + "epoch": 1.5109150023223408, + "grad_norm": 0.19596045941652765, + "learning_rate": 2.7573149741824445e-05, + "loss": 0.3447, + "step": 1627 + }, + { + "epoch": 1.5118439386901996, + "grad_norm": 0.20593155094983895, + "learning_rate": 2.7555938037865753e-05, + "loss": 0.3281, + "step": 1628 + }, + { + "epoch": 1.5127728750580585, + "grad_norm": 0.18985986735657961, + "learning_rate": 2.7538726333907055e-05, + "loss": 0.3444, + "step": 1629 + }, + { + "epoch": 1.5137018114259173, + "grad_norm": 0.20048398993083572, + "learning_rate": 2.7521514629948363e-05, + "loss": 0.3352, + "step": 1630 + }, + { + "epoch": 1.5146307477937762, + "grad_norm": 0.2340324824637472, + "learning_rate": 2.750430292598967e-05, + "loss": 0.3547, + "step": 1631 + }, + { + "epoch": 1.515559684161635, + "grad_norm": 0.1979649285811828, + "learning_rate": 2.748709122203098e-05, + "loss": 0.3282, + "step": 1632 + }, + { + "epoch": 1.5164886205294938, + "grad_norm": 0.22728561652019158, + "learning_rate": 2.7469879518072288e-05, + "loss": 0.3433, + "step": 1633 + }, + { + "epoch": 1.5174175568973527, + "grad_norm": 0.1909605625979193, + "learning_rate": 2.7452667814113597e-05, + "loss": 0.3481, + "step": 1634 + }, + { + "epoch": 1.5183464932652113, + "grad_norm": 0.21650749973633887, + "learning_rate": 2.7435456110154905e-05, + "loss": 0.3316, + "step": 1635 + }, + { + "epoch": 1.5192754296330702, + "grad_norm": 0.20766956553484378, + "learning_rate": 2.7418244406196213e-05, + "loss": 0.3553, + "step": 1636 + }, + { + "epoch": 1.5202043660009288, + "grad_norm": 0.19184440178799098, + "learning_rate": 2.7401032702237522e-05, + "loss": 0.3479, + "step": 1637 + }, + { + "epoch": 1.5211333023687876, + "grad_norm": 0.19041440103788876, + "learning_rate": 2.738382099827883e-05, + "loss": 0.3229, + "step": 1638 + }, + { + "epoch": 1.5220622387366465, + "grad_norm": 0.19777552122771166, + "learning_rate": 2.736660929432014e-05, + "loss": 0.3535, + "step": 1639 + }, + { + "epoch": 1.5229911751045053, + "grad_norm": 0.2004208302269382, + "learning_rate": 2.7349397590361447e-05, + "loss": 0.34, + "step": 1640 + }, + { + "epoch": 1.5239201114723642, + "grad_norm": 0.20336163065057344, + "learning_rate": 2.7332185886402755e-05, + "loss": 0.3521, + "step": 1641 + }, + { + "epoch": 1.524849047840223, + "grad_norm": 0.19703431204687524, + "learning_rate": 2.7314974182444064e-05, + "loss": 0.3414, + "step": 1642 + }, + { + "epoch": 1.5257779842080819, + "grad_norm": 0.19845951237979154, + "learning_rate": 2.7297762478485372e-05, + "loss": 0.3677, + "step": 1643 + }, + { + "epoch": 1.5267069205759407, + "grad_norm": 0.19034132547856192, + "learning_rate": 2.728055077452668e-05, + "loss": 0.3406, + "step": 1644 + }, + { + "epoch": 1.5276358569437993, + "grad_norm": 0.18091025641587147, + "learning_rate": 2.726333907056799e-05, + "loss": 0.3313, + "step": 1645 + }, + { + "epoch": 1.5285647933116582, + "grad_norm": 0.18588920730588274, + "learning_rate": 2.7246127366609297e-05, + "loss": 0.3379, + "step": 1646 + }, + { + "epoch": 1.5294937296795168, + "grad_norm": 0.183194983638107, + "learning_rate": 2.7228915662650606e-05, + "loss": 0.3514, + "step": 1647 + }, + { + "epoch": 1.5304226660473756, + "grad_norm": 0.20313603959857246, + "learning_rate": 2.7211703958691914e-05, + "loss": 0.3148, + "step": 1648 + }, + { + "epoch": 1.5313516024152345, + "grad_norm": 0.18413859078792266, + "learning_rate": 2.7194492254733223e-05, + "loss": 0.3406, + "step": 1649 + }, + { + "epoch": 1.5322805387830933, + "grad_norm": 0.2218660778677224, + "learning_rate": 2.7177280550774524e-05, + "loss": 0.3473, + "step": 1650 + }, + { + "epoch": 1.5332094751509522, + "grad_norm": 0.20256666218542452, + "learning_rate": 2.7160068846815833e-05, + "loss": 0.327, + "step": 1651 + }, + { + "epoch": 1.534138411518811, + "grad_norm": 0.20973362475741678, + "learning_rate": 2.714285714285714e-05, + "loss": 0.3382, + "step": 1652 + }, + { + "epoch": 1.5350673478866699, + "grad_norm": 0.21420987345417866, + "learning_rate": 2.712564543889845e-05, + "loss": 0.3549, + "step": 1653 + }, + { + "epoch": 1.5359962842545287, + "grad_norm": 0.20547490967457238, + "learning_rate": 2.7108433734939758e-05, + "loss": 0.3692, + "step": 1654 + }, + { + "epoch": 1.5369252206223873, + "grad_norm": 0.20579644597140412, + "learning_rate": 2.7091222030981066e-05, + "loss": 0.3603, + "step": 1655 + }, + { + "epoch": 1.5378541569902462, + "grad_norm": 0.17767519418921987, + "learning_rate": 2.7074010327022375e-05, + "loss": 0.3207, + "step": 1656 + }, + { + "epoch": 1.5387830933581048, + "grad_norm": 0.21067954281906232, + "learning_rate": 2.7056798623063683e-05, + "loss": 0.3668, + "step": 1657 + }, + { + "epoch": 1.5397120297259637, + "grad_norm": 0.1962228244666741, + "learning_rate": 2.703958691910499e-05, + "loss": 0.3492, + "step": 1658 + }, + { + "epoch": 1.5406409660938225, + "grad_norm": 0.19557765329976629, + "learning_rate": 2.70223752151463e-05, + "loss": 0.3474, + "step": 1659 + }, + { + "epoch": 1.5415699024616814, + "grad_norm": 0.18500343001861966, + "learning_rate": 2.7005163511187608e-05, + "loss": 0.3416, + "step": 1660 + }, + { + "epoch": 1.5424988388295402, + "grad_norm": 0.2023975455255552, + "learning_rate": 2.6987951807228917e-05, + "loss": 0.3406, + "step": 1661 + }, + { + "epoch": 1.543427775197399, + "grad_norm": 0.19197373382978503, + "learning_rate": 2.6970740103270225e-05, + "loss": 0.3494, + "step": 1662 + }, + { + "epoch": 1.544356711565258, + "grad_norm": 0.200189895799683, + "learning_rate": 2.6953528399311533e-05, + "loss": 0.3437, + "step": 1663 + }, + { + "epoch": 1.5452856479331167, + "grad_norm": 0.2224630313261313, + "learning_rate": 2.6936316695352842e-05, + "loss": 0.362, + "step": 1664 + }, + { + "epoch": 1.5462145843009754, + "grad_norm": 0.22940474152039902, + "learning_rate": 2.691910499139415e-05, + "loss": 0.3469, + "step": 1665 + }, + { + "epoch": 1.5471435206688342, + "grad_norm": 0.2206713442498346, + "learning_rate": 2.690189328743546e-05, + "loss": 0.3565, + "step": 1666 + }, + { + "epoch": 1.5480724570366928, + "grad_norm": 0.19182785647681846, + "learning_rate": 2.6884681583476767e-05, + "loss": 0.3279, + "step": 1667 + }, + { + "epoch": 1.5490013934045517, + "grad_norm": 0.23133211363227132, + "learning_rate": 2.6867469879518075e-05, + "loss": 0.3571, + "step": 1668 + }, + { + "epoch": 1.5499303297724105, + "grad_norm": 0.19494744045107942, + "learning_rate": 2.6850258175559384e-05, + "loss": 0.325, + "step": 1669 + }, + { + "epoch": 1.5508592661402694, + "grad_norm": 0.19903204750297201, + "learning_rate": 2.6833046471600692e-05, + "loss": 0.3385, + "step": 1670 + }, + { + "epoch": 1.5517882025081282, + "grad_norm": 0.1941975771860589, + "learning_rate": 2.6815834767641994e-05, + "loss": 0.3472, + "step": 1671 + }, + { + "epoch": 1.552717138875987, + "grad_norm": 0.20247277746107317, + "learning_rate": 2.6798623063683302e-05, + "loss": 0.3473, + "step": 1672 + }, + { + "epoch": 1.553646075243846, + "grad_norm": 0.21609956845102268, + "learning_rate": 2.678141135972461e-05, + "loss": 0.3259, + "step": 1673 + }, + { + "epoch": 1.5545750116117047, + "grad_norm": 0.19593104013837537, + "learning_rate": 2.676419965576592e-05, + "loss": 0.354, + "step": 1674 + }, + { + "epoch": 1.5555039479795634, + "grad_norm": 0.18796258655354342, + "learning_rate": 2.6746987951807227e-05, + "loss": 0.3506, + "step": 1675 + }, + { + "epoch": 1.5564328843474222, + "grad_norm": 0.2059985043490576, + "learning_rate": 2.6729776247848536e-05, + "loss": 0.3533, + "step": 1676 + }, + { + "epoch": 1.5573618207152808, + "grad_norm": 0.2268856053908675, + "learning_rate": 2.6712564543889844e-05, + "loss": 0.3617, + "step": 1677 + }, + { + "epoch": 1.5582907570831397, + "grad_norm": 0.19081957209925454, + "learning_rate": 2.6695352839931153e-05, + "loss": 0.3425, + "step": 1678 + }, + { + "epoch": 1.5592196934509985, + "grad_norm": 0.21370626444738328, + "learning_rate": 2.667814113597246e-05, + "loss": 0.3435, + "step": 1679 + }, + { + "epoch": 1.5601486298188574, + "grad_norm": 0.20241828803392062, + "learning_rate": 2.666092943201377e-05, + "loss": 0.3471, + "step": 1680 + }, + { + "epoch": 1.5610775661867162, + "grad_norm": 0.2126563580465179, + "learning_rate": 2.6643717728055078e-05, + "loss": 0.3345, + "step": 1681 + }, + { + "epoch": 1.562006502554575, + "grad_norm": 0.19313249003359623, + "learning_rate": 2.6626506024096386e-05, + "loss": 0.3367, + "step": 1682 + }, + { + "epoch": 1.562935438922434, + "grad_norm": 0.20080971884495982, + "learning_rate": 2.6609294320137695e-05, + "loss": 0.3512, + "step": 1683 + }, + { + "epoch": 1.5638643752902928, + "grad_norm": 0.2326488096574912, + "learning_rate": 2.6592082616179003e-05, + "loss": 0.3507, + "step": 1684 + }, + { + "epoch": 1.5647933116581514, + "grad_norm": 0.20822523814151891, + "learning_rate": 2.657487091222031e-05, + "loss": 0.3483, + "step": 1685 + }, + { + "epoch": 1.5657222480260102, + "grad_norm": 0.21602169294812015, + "learning_rate": 2.655765920826162e-05, + "loss": 0.3451, + "step": 1686 + }, + { + "epoch": 1.5666511843938689, + "grad_norm": 0.2026495811587196, + "learning_rate": 2.6540447504302928e-05, + "loss": 0.3401, + "step": 1687 + }, + { + "epoch": 1.5675801207617277, + "grad_norm": 0.20373496856303852, + "learning_rate": 2.6523235800344237e-05, + "loss": 0.3472, + "step": 1688 + }, + { + "epoch": 1.5685090571295865, + "grad_norm": 0.202815214563734, + "learning_rate": 2.6506024096385545e-05, + "loss": 0.3443, + "step": 1689 + }, + { + "epoch": 1.5694379934974454, + "grad_norm": 0.20807192948042533, + "learning_rate": 2.6488812392426853e-05, + "loss": 0.3244, + "step": 1690 + }, + { + "epoch": 1.5703669298653042, + "grad_norm": 0.19319517687258816, + "learning_rate": 2.6471600688468162e-05, + "loss": 0.3382, + "step": 1691 + }, + { + "epoch": 1.571295866233163, + "grad_norm": 0.22053885061161396, + "learning_rate": 2.6454388984509463e-05, + "loss": 0.3666, + "step": 1692 + }, + { + "epoch": 1.572224802601022, + "grad_norm": 0.207890820090718, + "learning_rate": 2.6437177280550772e-05, + "loss": 0.3436, + "step": 1693 + }, + { + "epoch": 1.5731537389688808, + "grad_norm": 0.19245242691550665, + "learning_rate": 2.641996557659208e-05, + "loss": 0.3539, + "step": 1694 + }, + { + "epoch": 1.5740826753367394, + "grad_norm": 0.21843726413189252, + "learning_rate": 2.640275387263339e-05, + "loss": 0.3421, + "step": 1695 + }, + { + "epoch": 1.5750116117045982, + "grad_norm": 0.220646507516627, + "learning_rate": 2.6385542168674697e-05, + "loss": 0.3541, + "step": 1696 + }, + { + "epoch": 1.5759405480724569, + "grad_norm": 0.22537203693214305, + "learning_rate": 2.6368330464716005e-05, + "loss": 0.3277, + "step": 1697 + }, + { + "epoch": 1.5768694844403157, + "grad_norm": 0.20461650445631827, + "learning_rate": 2.6351118760757314e-05, + "loss": 0.3387, + "step": 1698 + }, + { + "epoch": 1.5777984208081746, + "grad_norm": 0.2136483563667655, + "learning_rate": 2.6333907056798622e-05, + "loss": 0.3426, + "step": 1699 + }, + { + "epoch": 1.5787273571760334, + "grad_norm": 0.19981232379812758, + "learning_rate": 2.631669535283993e-05, + "loss": 0.3454, + "step": 1700 + }, + { + "epoch": 1.5796562935438923, + "grad_norm": 0.23165196856655018, + "learning_rate": 2.629948364888124e-05, + "loss": 0.3775, + "step": 1701 + }, + { + "epoch": 1.580585229911751, + "grad_norm": 0.2055708819872716, + "learning_rate": 2.6282271944922547e-05, + "loss": 0.3398, + "step": 1702 + }, + { + "epoch": 1.58151416627961, + "grad_norm": 0.20549199744598382, + "learning_rate": 2.6265060240963856e-05, + "loss": 0.3303, + "step": 1703 + }, + { + "epoch": 1.5824431026474688, + "grad_norm": 0.19735611925338659, + "learning_rate": 2.6247848537005164e-05, + "loss": 0.328, + "step": 1704 + }, + { + "epoch": 1.5833720390153274, + "grad_norm": 0.18310842492165016, + "learning_rate": 2.6230636833046473e-05, + "loss": 0.343, + "step": 1705 + }, + { + "epoch": 1.5843009753831863, + "grad_norm": 0.1873807736634079, + "learning_rate": 2.621342512908778e-05, + "loss": 0.3481, + "step": 1706 + }, + { + "epoch": 1.585229911751045, + "grad_norm": 0.20000318958063903, + "learning_rate": 2.619621342512909e-05, + "loss": 0.3356, + "step": 1707 + }, + { + "epoch": 1.5861588481189037, + "grad_norm": 0.2015136507544033, + "learning_rate": 2.6179001721170398e-05, + "loss": 0.3324, + "step": 1708 + }, + { + "epoch": 1.5870877844867626, + "grad_norm": 0.18262820920953574, + "learning_rate": 2.6161790017211706e-05, + "loss": 0.3532, + "step": 1709 + }, + { + "epoch": 1.5880167208546214, + "grad_norm": 0.23344245143312825, + "learning_rate": 2.6144578313253015e-05, + "loss": 0.3775, + "step": 1710 + }, + { + "epoch": 1.5889456572224803, + "grad_norm": 0.2199511607804858, + "learning_rate": 2.6127366609294323e-05, + "loss": 0.3168, + "step": 1711 + }, + { + "epoch": 1.5898745935903391, + "grad_norm": 0.212983265606066, + "learning_rate": 2.611015490533563e-05, + "loss": 0.3437, + "step": 1712 + }, + { + "epoch": 1.590803529958198, + "grad_norm": 0.213128136916506, + "learning_rate": 2.6092943201376936e-05, + "loss": 0.3754, + "step": 1713 + }, + { + "epoch": 1.5917324663260568, + "grad_norm": 0.2448986605848652, + "learning_rate": 2.6075731497418245e-05, + "loss": 0.352, + "step": 1714 + }, + { + "epoch": 1.5926614026939154, + "grad_norm": 0.20674274942405582, + "learning_rate": 2.605851979345955e-05, + "loss": 0.3496, + "step": 1715 + }, + { + "epoch": 1.5935903390617743, + "grad_norm": 0.18776586917192614, + "learning_rate": 2.6041308089500858e-05, + "loss": 0.3322, + "step": 1716 + }, + { + "epoch": 1.5945192754296331, + "grad_norm": 0.20953790690241902, + "learning_rate": 2.6024096385542167e-05, + "loss": 0.331, + "step": 1717 + }, + { + "epoch": 1.5954482117974917, + "grad_norm": 0.20314003333327893, + "learning_rate": 2.6006884681583475e-05, + "loss": 0.3357, + "step": 1718 + }, + { + "epoch": 1.5963771481653506, + "grad_norm": 0.20922503716379712, + "learning_rate": 2.5989672977624783e-05, + "loss": 0.3572, + "step": 1719 + }, + { + "epoch": 1.5973060845332094, + "grad_norm": 0.20988177401915958, + "learning_rate": 2.5972461273666092e-05, + "loss": 0.352, + "step": 1720 + }, + { + "epoch": 1.5982350209010683, + "grad_norm": 0.21315542074548124, + "learning_rate": 2.59552495697074e-05, + "loss": 0.3626, + "step": 1721 + }, + { + "epoch": 1.5991639572689271, + "grad_norm": 0.20813871660567065, + "learning_rate": 2.593803786574871e-05, + "loss": 0.3474, + "step": 1722 + }, + { + "epoch": 1.600092893636786, + "grad_norm": 0.1872599355028666, + "learning_rate": 2.5920826161790017e-05, + "loss": 0.3445, + "step": 1723 + }, + { + "epoch": 1.6010218300046448, + "grad_norm": 0.1911940128780578, + "learning_rate": 2.5903614457831325e-05, + "loss": 0.3391, + "step": 1724 + }, + { + "epoch": 1.6019507663725034, + "grad_norm": 0.2092461298477511, + "learning_rate": 2.5886402753872634e-05, + "loss": 0.3539, + "step": 1725 + }, + { + "epoch": 1.6028797027403623, + "grad_norm": 0.20596276428098503, + "learning_rate": 2.5869191049913942e-05, + "loss": 0.3539, + "step": 1726 + }, + { + "epoch": 1.6038086391082211, + "grad_norm": 0.1932818475108648, + "learning_rate": 2.585197934595525e-05, + "loss": 0.3526, + "step": 1727 + }, + { + "epoch": 1.6047375754760798, + "grad_norm": 0.21283715898106823, + "learning_rate": 2.583476764199656e-05, + "loss": 0.3468, + "step": 1728 + }, + { + "epoch": 1.6056665118439386, + "grad_norm": 0.20794295159851667, + "learning_rate": 2.5817555938037867e-05, + "loss": 0.3496, + "step": 1729 + }, + { + "epoch": 1.6065954482117974, + "grad_norm": 0.20156095064197782, + "learning_rate": 2.5800344234079176e-05, + "loss": 0.3396, + "step": 1730 + }, + { + "epoch": 1.6075243845796563, + "grad_norm": 0.18701079703686946, + "learning_rate": 2.5783132530120484e-05, + "loss": 0.3491, + "step": 1731 + }, + { + "epoch": 1.6084533209475151, + "grad_norm": 0.19536223716914838, + "learning_rate": 2.5765920826161793e-05, + "loss": 0.3377, + "step": 1732 + }, + { + "epoch": 1.609382257315374, + "grad_norm": 0.17791846865411473, + "learning_rate": 2.57487091222031e-05, + "loss": 0.3472, + "step": 1733 + }, + { + "epoch": 1.6103111936832328, + "grad_norm": 0.2129004966083023, + "learning_rate": 2.573149741824441e-05, + "loss": 0.329, + "step": 1734 + }, + { + "epoch": 1.6112401300510915, + "grad_norm": 0.18814499058025813, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.3429, + "step": 1735 + }, + { + "epoch": 1.6121690664189503, + "grad_norm": 0.1809854014670112, + "learning_rate": 2.5697074010327023e-05, + "loss": 0.3237, + "step": 1736 + }, + { + "epoch": 1.6130980027868091, + "grad_norm": 0.18538402717591057, + "learning_rate": 2.567986230636833e-05, + "loss": 0.3497, + "step": 1737 + }, + { + "epoch": 1.6140269391546678, + "grad_norm": 0.18559136167726656, + "learning_rate": 2.566265060240964e-05, + "loss": 0.354, + "step": 1738 + }, + { + "epoch": 1.6149558755225266, + "grad_norm": 0.20263301316144824, + "learning_rate": 2.5645438898450948e-05, + "loss": 0.3672, + "step": 1739 + }, + { + "epoch": 1.6158848118903855, + "grad_norm": 0.196423315030862, + "learning_rate": 2.5628227194492253e-05, + "loss": 0.3408, + "step": 1740 + }, + { + "epoch": 1.6168137482582443, + "grad_norm": 0.19368812454813836, + "learning_rate": 2.561101549053356e-05, + "loss": 0.348, + "step": 1741 + }, + { + "epoch": 1.6177426846261032, + "grad_norm": 0.19885581508283579, + "learning_rate": 2.559380378657487e-05, + "loss": 0.3371, + "step": 1742 + }, + { + "epoch": 1.618671620993962, + "grad_norm": 0.21769073543185363, + "learning_rate": 2.5576592082616178e-05, + "loss": 0.3652, + "step": 1743 + }, + { + "epoch": 1.6196005573618208, + "grad_norm": 0.215179619394168, + "learning_rate": 2.5559380378657487e-05, + "loss": 0.3412, + "step": 1744 + }, + { + "epoch": 1.6205294937296795, + "grad_norm": 0.1898709834286627, + "learning_rate": 2.5542168674698795e-05, + "loss": 0.3352, + "step": 1745 + }, + { + "epoch": 1.6214584300975383, + "grad_norm": 0.24343732302612947, + "learning_rate": 2.5524956970740103e-05, + "loss": 0.3271, + "step": 1746 + }, + { + "epoch": 1.6223873664653972, + "grad_norm": 0.19406338350847918, + "learning_rate": 2.5507745266781412e-05, + "loss": 0.3207, + "step": 1747 + }, + { + "epoch": 1.6233163028332558, + "grad_norm": 0.18606967526005735, + "learning_rate": 2.549053356282272e-05, + "loss": 0.3402, + "step": 1748 + }, + { + "epoch": 1.6242452392011146, + "grad_norm": 0.24944407718719433, + "learning_rate": 2.547332185886403e-05, + "loss": 0.3327, + "step": 1749 + }, + { + "epoch": 1.6251741755689735, + "grad_norm": 0.20574480952491447, + "learning_rate": 2.5456110154905337e-05, + "loss": 0.3353, + "step": 1750 + }, + { + "epoch": 1.6261031119368323, + "grad_norm": 0.1899167111957658, + "learning_rate": 2.5438898450946645e-05, + "loss": 0.3533, + "step": 1751 + }, + { + "epoch": 1.6270320483046912, + "grad_norm": 0.2355371100922719, + "learning_rate": 2.5421686746987954e-05, + "loss": 0.3442, + "step": 1752 + }, + { + "epoch": 1.62796098467255, + "grad_norm": 0.22707474270252473, + "learning_rate": 2.5404475043029262e-05, + "loss": 0.3485, + "step": 1753 + }, + { + "epoch": 1.6288899210404089, + "grad_norm": 0.23297359604796236, + "learning_rate": 2.538726333907057e-05, + "loss": 0.3492, + "step": 1754 + }, + { + "epoch": 1.6298188574082675, + "grad_norm": 0.18949532826143803, + "learning_rate": 2.537005163511188e-05, + "loss": 0.3479, + "step": 1755 + }, + { + "epoch": 1.6307477937761263, + "grad_norm": 0.18362249976581027, + "learning_rate": 2.5352839931153184e-05, + "loss": 0.3581, + "step": 1756 + }, + { + "epoch": 1.6316767301439852, + "grad_norm": 0.2554058616714679, + "learning_rate": 2.5335628227194492e-05, + "loss": 0.3546, + "step": 1757 + }, + { + "epoch": 1.6326056665118438, + "grad_norm": 0.22540032891893913, + "learning_rate": 2.53184165232358e-05, + "loss": 0.3619, + "step": 1758 + }, + { + "epoch": 1.6335346028797026, + "grad_norm": 0.20418915234656554, + "learning_rate": 2.530120481927711e-05, + "loss": 0.361, + "step": 1759 + }, + { + "epoch": 1.6344635392475615, + "grad_norm": 0.19977889622190362, + "learning_rate": 2.5283993115318418e-05, + "loss": 0.3377, + "step": 1760 + }, + { + "epoch": 1.6353924756154203, + "grad_norm": 0.22689586847036763, + "learning_rate": 2.5266781411359726e-05, + "loss": 0.3282, + "step": 1761 + }, + { + "epoch": 1.6363214119832792, + "grad_norm": 0.20137363650236248, + "learning_rate": 2.5249569707401034e-05, + "loss": 0.3444, + "step": 1762 + }, + { + "epoch": 1.637250348351138, + "grad_norm": 0.20785537438471768, + "learning_rate": 2.5232358003442343e-05, + "loss": 0.3476, + "step": 1763 + }, + { + "epoch": 1.6381792847189969, + "grad_norm": 0.21247019504395587, + "learning_rate": 2.521514629948365e-05, + "loss": 0.3579, + "step": 1764 + }, + { + "epoch": 1.6391082210868555, + "grad_norm": 0.20923349346429376, + "learning_rate": 2.5197934595524956e-05, + "loss": 0.3547, + "step": 1765 + }, + { + "epoch": 1.6400371574547143, + "grad_norm": 0.19648308507644077, + "learning_rate": 2.5180722891566265e-05, + "loss": 0.3606, + "step": 1766 + }, + { + "epoch": 1.6409660938225732, + "grad_norm": 0.2302898022257594, + "learning_rate": 2.5163511187607573e-05, + "loss": 0.3521, + "step": 1767 + }, + { + "epoch": 1.6418950301904318, + "grad_norm": 0.2324588872518842, + "learning_rate": 2.514629948364888e-05, + "loss": 0.3551, + "step": 1768 + }, + { + "epoch": 1.6428239665582907, + "grad_norm": 0.22587569032785934, + "learning_rate": 2.512908777969019e-05, + "loss": 0.3874, + "step": 1769 + }, + { + "epoch": 1.6437529029261495, + "grad_norm": 0.2281758443165584, + "learning_rate": 2.5111876075731498e-05, + "loss": 0.3501, + "step": 1770 + }, + { + "epoch": 1.6446818392940084, + "grad_norm": 0.20674500113194846, + "learning_rate": 2.5094664371772807e-05, + "loss": 0.3321, + "step": 1771 + }, + { + "epoch": 1.6456107756618672, + "grad_norm": 0.18216341746652498, + "learning_rate": 2.5077452667814115e-05, + "loss": 0.3451, + "step": 1772 + }, + { + "epoch": 1.646539712029726, + "grad_norm": 0.24083907570929008, + "learning_rate": 2.5060240963855423e-05, + "loss": 0.3571, + "step": 1773 + }, + { + "epoch": 1.647468648397585, + "grad_norm": 0.21053850263699328, + "learning_rate": 2.5043029259896732e-05, + "loss": 0.3393, + "step": 1774 + }, + { + "epoch": 1.6483975847654435, + "grad_norm": 0.21190369434405645, + "learning_rate": 2.502581755593804e-05, + "loss": 0.3562, + "step": 1775 + }, + { + "epoch": 1.6493265211333024, + "grad_norm": 0.22135502539564378, + "learning_rate": 2.500860585197935e-05, + "loss": 0.3525, + "step": 1776 + }, + { + "epoch": 1.6502554575011612, + "grad_norm": 0.22367773820582154, + "learning_rate": 2.4991394148020654e-05, + "loss": 0.3466, + "step": 1777 + }, + { + "epoch": 1.6511843938690198, + "grad_norm": 0.21451556990315485, + "learning_rate": 2.4974182444061962e-05, + "loss": 0.3338, + "step": 1778 + }, + { + "epoch": 1.6521133302368787, + "grad_norm": 0.2465298762993094, + "learning_rate": 2.495697074010327e-05, + "loss": 0.333, + "step": 1779 + }, + { + "epoch": 1.6530422666047375, + "grad_norm": 0.2547044770751165, + "learning_rate": 2.493975903614458e-05, + "loss": 0.4001, + "step": 1780 + }, + { + "epoch": 1.6539712029725964, + "grad_norm": 0.22774781080068754, + "learning_rate": 2.4922547332185887e-05, + "loss": 0.3461, + "step": 1781 + }, + { + "epoch": 1.6549001393404552, + "grad_norm": 0.21204449913798434, + "learning_rate": 2.4905335628227196e-05, + "loss": 0.3274, + "step": 1782 + }, + { + "epoch": 1.655829075708314, + "grad_norm": 0.21580956854064373, + "learning_rate": 2.4888123924268504e-05, + "loss": 0.3359, + "step": 1783 + }, + { + "epoch": 1.656758012076173, + "grad_norm": 0.24076908609466494, + "learning_rate": 2.4870912220309813e-05, + "loss": 0.3588, + "step": 1784 + }, + { + "epoch": 1.6576869484440317, + "grad_norm": 0.2125733983681432, + "learning_rate": 2.485370051635112e-05, + "loss": 0.3295, + "step": 1785 + }, + { + "epoch": 1.6586158848118904, + "grad_norm": 0.1890708531709907, + "learning_rate": 2.483648881239243e-05, + "loss": 0.3418, + "step": 1786 + }, + { + "epoch": 1.6595448211797492, + "grad_norm": 0.19675942962384732, + "learning_rate": 2.4819277108433738e-05, + "loss": 0.3472, + "step": 1787 + }, + { + "epoch": 1.6604737575476078, + "grad_norm": 0.2170796522409692, + "learning_rate": 2.4802065404475046e-05, + "loss": 0.3457, + "step": 1788 + }, + { + "epoch": 1.6614026939154667, + "grad_norm": 0.21149308045148213, + "learning_rate": 2.478485370051635e-05, + "loss": 0.3759, + "step": 1789 + }, + { + "epoch": 1.6623316302833255, + "grad_norm": 0.20813548096362552, + "learning_rate": 2.476764199655766e-05, + "loss": 0.3582, + "step": 1790 + }, + { + "epoch": 1.6632605666511844, + "grad_norm": 0.19034774277541827, + "learning_rate": 2.4750430292598968e-05, + "loss": 0.3489, + "step": 1791 + }, + { + "epoch": 1.6641895030190432, + "grad_norm": 0.19380501073483952, + "learning_rate": 2.4733218588640276e-05, + "loss": 0.3339, + "step": 1792 + }, + { + "epoch": 1.665118439386902, + "grad_norm": 0.1834320423794678, + "learning_rate": 2.4716006884681585e-05, + "loss": 0.3617, + "step": 1793 + }, + { + "epoch": 1.666047375754761, + "grad_norm": 0.24296412127242056, + "learning_rate": 2.4698795180722893e-05, + "loss": 0.3596, + "step": 1794 + }, + { + "epoch": 1.6669763121226198, + "grad_norm": 0.2280578339681279, + "learning_rate": 2.46815834767642e-05, + "loss": 0.3584, + "step": 1795 + }, + { + "epoch": 1.6679052484904784, + "grad_norm": 0.19120766695593186, + "learning_rate": 2.466437177280551e-05, + "loss": 0.3479, + "step": 1796 + }, + { + "epoch": 1.6688341848583372, + "grad_norm": 0.2310771090381013, + "learning_rate": 2.464716006884682e-05, + "loss": 0.3555, + "step": 1797 + }, + { + "epoch": 1.6697631212261959, + "grad_norm": 0.18985210461541444, + "learning_rate": 2.4629948364888127e-05, + "loss": 0.3356, + "step": 1798 + }, + { + "epoch": 1.6706920575940547, + "grad_norm": 0.19786178896000547, + "learning_rate": 2.4612736660929435e-05, + "loss": 0.3374, + "step": 1799 + }, + { + "epoch": 1.6716209939619135, + "grad_norm": 0.23092029223763727, + "learning_rate": 2.4595524956970744e-05, + "loss": 0.3447, + "step": 1800 + }, + { + "epoch": 1.6725499303297724, + "grad_norm": 0.1922509738346118, + "learning_rate": 2.4578313253012052e-05, + "loss": 0.3292, + "step": 1801 + }, + { + "epoch": 1.6734788666976312, + "grad_norm": 0.21074032243268462, + "learning_rate": 2.4561101549053357e-05, + "loss": 0.3591, + "step": 1802 + }, + { + "epoch": 1.67440780306549, + "grad_norm": 0.2210145887813637, + "learning_rate": 2.4543889845094665e-05, + "loss": 0.3408, + "step": 1803 + }, + { + "epoch": 1.675336739433349, + "grad_norm": 0.1955049882758258, + "learning_rate": 2.4526678141135974e-05, + "loss": 0.3252, + "step": 1804 + }, + { + "epoch": 1.6762656758012078, + "grad_norm": 0.20630880882991626, + "learning_rate": 2.4509466437177282e-05, + "loss": 0.3343, + "step": 1805 + }, + { + "epoch": 1.6771946121690664, + "grad_norm": 0.2068467693733224, + "learning_rate": 2.449225473321859e-05, + "loss": 0.3718, + "step": 1806 + }, + { + "epoch": 1.6781235485369252, + "grad_norm": 0.21746922099285282, + "learning_rate": 2.44750430292599e-05, + "loss": 0.3473, + "step": 1807 + }, + { + "epoch": 1.6790524849047839, + "grad_norm": 0.22531439643757564, + "learning_rate": 2.4457831325301207e-05, + "loss": 0.3378, + "step": 1808 + }, + { + "epoch": 1.6799814212726427, + "grad_norm": 0.16961930850382043, + "learning_rate": 2.4440619621342516e-05, + "loss": 0.3307, + "step": 1809 + }, + { + "epoch": 1.6809103576405016, + "grad_norm": 0.2184425585470951, + "learning_rate": 2.4423407917383824e-05, + "loss": 0.3444, + "step": 1810 + }, + { + "epoch": 1.6818392940083604, + "grad_norm": 0.20341565894023106, + "learning_rate": 2.4406196213425133e-05, + "loss": 0.341, + "step": 1811 + }, + { + "epoch": 1.6827682303762193, + "grad_norm": 0.20083870829005598, + "learning_rate": 2.438898450946644e-05, + "loss": 0.3561, + "step": 1812 + }, + { + "epoch": 1.683697166744078, + "grad_norm": 0.22620645826766164, + "learning_rate": 2.437177280550775e-05, + "loss": 0.3585, + "step": 1813 + }, + { + "epoch": 1.684626103111937, + "grad_norm": 0.1941948141031554, + "learning_rate": 2.4354561101549054e-05, + "loss": 0.3443, + "step": 1814 + }, + { + "epoch": 1.6855550394797958, + "grad_norm": 0.2277987840149738, + "learning_rate": 2.4337349397590363e-05, + "loss": 0.3595, + "step": 1815 + }, + { + "epoch": 1.6864839758476544, + "grad_norm": 0.19485898183092634, + "learning_rate": 2.432013769363167e-05, + "loss": 0.322, + "step": 1816 + }, + { + "epoch": 1.6874129122155133, + "grad_norm": 0.18603940637738384, + "learning_rate": 2.430292598967298e-05, + "loss": 0.3582, + "step": 1817 + }, + { + "epoch": 1.6883418485833719, + "grad_norm": 0.23071978533124235, + "learning_rate": 2.4285714285714288e-05, + "loss": 0.3644, + "step": 1818 + }, + { + "epoch": 1.6892707849512307, + "grad_norm": 0.20747311082414033, + "learning_rate": 2.4268502581755596e-05, + "loss": 0.3373, + "step": 1819 + }, + { + "epoch": 1.6901997213190896, + "grad_norm": 0.19539815282798248, + "learning_rate": 2.4251290877796905e-05, + "loss": 0.3348, + "step": 1820 + }, + { + "epoch": 1.6911286576869484, + "grad_norm": 0.21365984840479363, + "learning_rate": 2.4234079173838213e-05, + "loss": 0.344, + "step": 1821 + }, + { + "epoch": 1.6920575940548073, + "grad_norm": 0.21504209447331304, + "learning_rate": 2.421686746987952e-05, + "loss": 0.3439, + "step": 1822 + }, + { + "epoch": 1.6929865304226661, + "grad_norm": 0.1982570782906239, + "learning_rate": 2.419965576592083e-05, + "loss": 0.3507, + "step": 1823 + }, + { + "epoch": 1.693915466790525, + "grad_norm": 0.19337549611458893, + "learning_rate": 2.418244406196214e-05, + "loss": 0.3591, + "step": 1824 + }, + { + "epoch": 1.6948444031583838, + "grad_norm": 0.20348014807541867, + "learning_rate": 2.4165232358003443e-05, + "loss": 0.325, + "step": 1825 + }, + { + "epoch": 1.6957733395262424, + "grad_norm": 0.20540644281014037, + "learning_rate": 2.4148020654044752e-05, + "loss": 0.3411, + "step": 1826 + }, + { + "epoch": 1.6967022758941013, + "grad_norm": 0.20236522736446938, + "learning_rate": 2.413080895008606e-05, + "loss": 0.3551, + "step": 1827 + }, + { + "epoch": 1.69763121226196, + "grad_norm": 0.20198047155898852, + "learning_rate": 2.411359724612737e-05, + "loss": 0.3477, + "step": 1828 + }, + { + "epoch": 1.6985601486298187, + "grad_norm": 0.21306903457439758, + "learning_rate": 2.4096385542168677e-05, + "loss": 0.3423, + "step": 1829 + }, + { + "epoch": 1.6994890849976776, + "grad_norm": 0.19666890014745034, + "learning_rate": 2.4079173838209985e-05, + "loss": 0.3444, + "step": 1830 + }, + { + "epoch": 1.7004180213655364, + "grad_norm": 0.19584912929425907, + "learning_rate": 2.4061962134251294e-05, + "loss": 0.3541, + "step": 1831 + }, + { + "epoch": 1.7013469577333953, + "grad_norm": 0.20466324373230668, + "learning_rate": 2.4044750430292602e-05, + "loss": 0.3537, + "step": 1832 + }, + { + "epoch": 1.7022758941012541, + "grad_norm": 0.20414327533805723, + "learning_rate": 2.402753872633391e-05, + "loss": 0.3491, + "step": 1833 + }, + { + "epoch": 1.703204830469113, + "grad_norm": 0.17877641915198147, + "learning_rate": 2.401032702237522e-05, + "loss": 0.3306, + "step": 1834 + }, + { + "epoch": 1.7041337668369718, + "grad_norm": 0.19756753438210706, + "learning_rate": 2.3993115318416524e-05, + "loss": 0.3522, + "step": 1835 + }, + { + "epoch": 1.7050627032048304, + "grad_norm": 0.20624963074350838, + "learning_rate": 2.3975903614457832e-05, + "loss": 0.3271, + "step": 1836 + }, + { + "epoch": 1.7059916395726893, + "grad_norm": 0.18278177540349813, + "learning_rate": 2.395869191049914e-05, + "loss": 0.3285, + "step": 1837 + }, + { + "epoch": 1.706920575940548, + "grad_norm": 0.18016020662628543, + "learning_rate": 2.394148020654045e-05, + "loss": 0.3512, + "step": 1838 + }, + { + "epoch": 1.7078495123084068, + "grad_norm": 0.19957513588627965, + "learning_rate": 2.3924268502581758e-05, + "loss": 0.342, + "step": 1839 + }, + { + "epoch": 1.7087784486762656, + "grad_norm": 0.19966706741908602, + "learning_rate": 2.3907056798623066e-05, + "loss": 0.346, + "step": 1840 + }, + { + "epoch": 1.7097073850441245, + "grad_norm": 0.18844002311194907, + "learning_rate": 2.3889845094664374e-05, + "loss": 0.3448, + "step": 1841 + }, + { + "epoch": 1.7106363214119833, + "grad_norm": 0.19040969107600308, + "learning_rate": 2.3872633390705683e-05, + "loss": 0.3606, + "step": 1842 + }, + { + "epoch": 1.7115652577798421, + "grad_norm": 0.19757417243583347, + "learning_rate": 2.385542168674699e-05, + "loss": 0.3299, + "step": 1843 + }, + { + "epoch": 1.712494194147701, + "grad_norm": 0.19669847871271992, + "learning_rate": 2.38382099827883e-05, + "loss": 0.3628, + "step": 1844 + }, + { + "epoch": 1.7134231305155598, + "grad_norm": 0.1817638016455849, + "learning_rate": 2.3820998278829608e-05, + "loss": 0.3723, + "step": 1845 + }, + { + "epoch": 1.7143520668834185, + "grad_norm": 0.19044959623520683, + "learning_rate": 2.3803786574870913e-05, + "loss": 0.336, + "step": 1846 + }, + { + "epoch": 1.7152810032512773, + "grad_norm": 0.20454387824331371, + "learning_rate": 2.378657487091222e-05, + "loss": 0.3561, + "step": 1847 + }, + { + "epoch": 1.716209939619136, + "grad_norm": 0.2101073375770777, + "learning_rate": 2.376936316695353e-05, + "loss": 0.3373, + "step": 1848 + }, + { + "epoch": 1.7171388759869948, + "grad_norm": 0.200320939170345, + "learning_rate": 2.3752151462994838e-05, + "loss": 0.3404, + "step": 1849 + }, + { + "epoch": 1.7180678123548536, + "grad_norm": 0.1952985352033432, + "learning_rate": 2.3734939759036147e-05, + "loss": 0.3518, + "step": 1850 + }, + { + "epoch": 1.7189967487227125, + "grad_norm": 0.19447811004540178, + "learning_rate": 2.3717728055077455e-05, + "loss": 0.3428, + "step": 1851 + }, + { + "epoch": 1.7199256850905713, + "grad_norm": 0.23780369028264597, + "learning_rate": 2.3700516351118763e-05, + "loss": 0.3454, + "step": 1852 + }, + { + "epoch": 1.7208546214584302, + "grad_norm": 0.21663178518511156, + "learning_rate": 2.3683304647160072e-05, + "loss": 0.3585, + "step": 1853 + }, + { + "epoch": 1.721783557826289, + "grad_norm": 0.1920341932268259, + "learning_rate": 2.366609294320138e-05, + "loss": 0.3373, + "step": 1854 + }, + { + "epoch": 1.7227124941941478, + "grad_norm": 0.20088003096202445, + "learning_rate": 2.364888123924269e-05, + "loss": 0.3408, + "step": 1855 + }, + { + "epoch": 1.7236414305620065, + "grad_norm": 0.19365614048712398, + "learning_rate": 2.3631669535283997e-05, + "loss": 0.3297, + "step": 1856 + }, + { + "epoch": 1.7245703669298653, + "grad_norm": 0.1906200226199188, + "learning_rate": 2.3614457831325302e-05, + "loss": 0.352, + "step": 1857 + }, + { + "epoch": 1.725499303297724, + "grad_norm": 0.20155735822067727, + "learning_rate": 2.359724612736661e-05, + "loss": 0.34, + "step": 1858 + }, + { + "epoch": 1.7264282396655828, + "grad_norm": 0.19158534386586176, + "learning_rate": 2.358003442340792e-05, + "loss": 0.3243, + "step": 1859 + }, + { + "epoch": 1.7273571760334416, + "grad_norm": 0.20500769628661117, + "learning_rate": 2.3562822719449227e-05, + "loss": 0.3318, + "step": 1860 + }, + { + "epoch": 1.7282861124013005, + "grad_norm": 0.211428858561031, + "learning_rate": 2.3545611015490536e-05, + "loss": 0.3332, + "step": 1861 + }, + { + "epoch": 1.7292150487691593, + "grad_norm": 0.18958780180075827, + "learning_rate": 2.3528399311531844e-05, + "loss": 0.3391, + "step": 1862 + }, + { + "epoch": 1.7301439851370182, + "grad_norm": 0.19932551881534102, + "learning_rate": 2.3511187607573152e-05, + "loss": 0.3354, + "step": 1863 + }, + { + "epoch": 1.731072921504877, + "grad_norm": 0.19686069428098435, + "learning_rate": 2.349397590361446e-05, + "loss": 0.3343, + "step": 1864 + }, + { + "epoch": 1.7320018578727359, + "grad_norm": 0.1874048143051562, + "learning_rate": 2.347676419965577e-05, + "loss": 0.3331, + "step": 1865 + }, + { + "epoch": 1.7329307942405945, + "grad_norm": 0.22051015667350352, + "learning_rate": 2.3459552495697078e-05, + "loss": 0.3406, + "step": 1866 + }, + { + "epoch": 1.7338597306084533, + "grad_norm": 0.19652476043071881, + "learning_rate": 2.3442340791738383e-05, + "loss": 0.3413, + "step": 1867 + }, + { + "epoch": 1.7347886669763122, + "grad_norm": 0.18429305065219817, + "learning_rate": 2.342512908777969e-05, + "loss": 0.3392, + "step": 1868 + }, + { + "epoch": 1.7357176033441708, + "grad_norm": 0.21153170060670934, + "learning_rate": 2.3407917383821e-05, + "loss": 0.3537, + "step": 1869 + }, + { + "epoch": 1.7366465397120296, + "grad_norm": 0.19456879357822598, + "learning_rate": 2.3390705679862308e-05, + "loss": 0.3405, + "step": 1870 + }, + { + "epoch": 1.7375754760798885, + "grad_norm": 0.19705924651045284, + "learning_rate": 2.3373493975903616e-05, + "loss": 0.3518, + "step": 1871 + }, + { + "epoch": 1.7385044124477473, + "grad_norm": 0.23371156033466298, + "learning_rate": 2.3356282271944925e-05, + "loss": 0.348, + "step": 1872 + }, + { + "epoch": 1.7394333488156062, + "grad_norm": 0.17949864900216111, + "learning_rate": 2.3339070567986233e-05, + "loss": 0.3411, + "step": 1873 + }, + { + "epoch": 1.740362285183465, + "grad_norm": 0.19886700512246908, + "learning_rate": 2.332185886402754e-05, + "loss": 0.3447, + "step": 1874 + }, + { + "epoch": 1.7412912215513239, + "grad_norm": 0.2084202011104834, + "learning_rate": 2.330464716006885e-05, + "loss": 0.3571, + "step": 1875 + }, + { + "epoch": 1.7422201579191825, + "grad_norm": 0.18145128888963527, + "learning_rate": 2.3287435456110158e-05, + "loss": 0.339, + "step": 1876 + }, + { + "epoch": 1.7431490942870413, + "grad_norm": 0.19420189258092047, + "learning_rate": 2.3270223752151467e-05, + "loss": 0.3496, + "step": 1877 + }, + { + "epoch": 1.7440780306549002, + "grad_norm": 0.19829094715505624, + "learning_rate": 2.325301204819277e-05, + "loss": 0.3137, + "step": 1878 + }, + { + "epoch": 1.7450069670227588, + "grad_norm": 0.20101000694540003, + "learning_rate": 2.323580034423408e-05, + "loss": 0.3544, + "step": 1879 + }, + { + "epoch": 1.7459359033906177, + "grad_norm": 0.1905563892060915, + "learning_rate": 2.321858864027539e-05, + "loss": 0.3345, + "step": 1880 + }, + { + "epoch": 1.7468648397584765, + "grad_norm": 0.19097935275198497, + "learning_rate": 2.3201376936316697e-05, + "loss": 0.3483, + "step": 1881 + }, + { + "epoch": 1.7477937761263354, + "grad_norm": 0.1991523378048746, + "learning_rate": 2.3184165232358005e-05, + "loss": 0.3345, + "step": 1882 + }, + { + "epoch": 1.7487227124941942, + "grad_norm": 0.1960091282122507, + "learning_rate": 2.3166953528399314e-05, + "loss": 0.3309, + "step": 1883 + }, + { + "epoch": 1.749651648862053, + "grad_norm": 0.18402497230694953, + "learning_rate": 2.3149741824440622e-05, + "loss": 0.3414, + "step": 1884 + }, + { + "epoch": 1.750580585229912, + "grad_norm": 0.18159596100294748, + "learning_rate": 2.313253012048193e-05, + "loss": 0.3571, + "step": 1885 + }, + { + "epoch": 1.7515095215977705, + "grad_norm": 0.20590477472666366, + "learning_rate": 2.311531841652324e-05, + "loss": 0.3403, + "step": 1886 + }, + { + "epoch": 1.7524384579656294, + "grad_norm": 0.18739986818056353, + "learning_rate": 2.3098106712564547e-05, + "loss": 0.368, + "step": 1887 + }, + { + "epoch": 1.7533673943334882, + "grad_norm": 0.1922357485644034, + "learning_rate": 2.3080895008605852e-05, + "loss": 0.3622, + "step": 1888 + }, + { + "epoch": 1.7542963307013468, + "grad_norm": 0.1907104113911057, + "learning_rate": 2.306368330464716e-05, + "loss": 0.337, + "step": 1889 + }, + { + "epoch": 1.7552252670692057, + "grad_norm": 0.18926167592617857, + "learning_rate": 2.304647160068847e-05, + "loss": 0.3637, + "step": 1890 + }, + { + "epoch": 1.7561542034370645, + "grad_norm": 0.19061055691996645, + "learning_rate": 2.3029259896729777e-05, + "loss": 0.3605, + "step": 1891 + }, + { + "epoch": 1.7570831398049234, + "grad_norm": 0.301161005754076, + "learning_rate": 2.3012048192771086e-05, + "loss": 0.3274, + "step": 1892 + }, + { + "epoch": 1.7580120761727822, + "grad_norm": 0.17079160860066211, + "learning_rate": 2.2994836488812394e-05, + "loss": 0.3499, + "step": 1893 + }, + { + "epoch": 1.758941012540641, + "grad_norm": 0.20510016168695175, + "learning_rate": 2.2977624784853703e-05, + "loss": 0.3718, + "step": 1894 + }, + { + "epoch": 1.7598699489085, + "grad_norm": 0.1860961861877371, + "learning_rate": 2.296041308089501e-05, + "loss": 0.3446, + "step": 1895 + }, + { + "epoch": 1.7607988852763585, + "grad_norm": 0.1823482266004319, + "learning_rate": 2.294320137693632e-05, + "loss": 0.3444, + "step": 1896 + }, + { + "epoch": 1.7617278216442174, + "grad_norm": 0.181834177614641, + "learning_rate": 2.2925989672977628e-05, + "loss": 0.352, + "step": 1897 + }, + { + "epoch": 1.7626567580120762, + "grad_norm": 0.189848159943081, + "learning_rate": 2.2908777969018936e-05, + "loss": 0.3502, + "step": 1898 + }, + { + "epoch": 1.7635856943799348, + "grad_norm": 0.19580044122071402, + "learning_rate": 2.289156626506024e-05, + "loss": 0.343, + "step": 1899 + }, + { + "epoch": 1.7645146307477937, + "grad_norm": 0.1958254403858763, + "learning_rate": 2.287435456110155e-05, + "loss": 0.3301, + "step": 1900 + }, + { + "epoch": 1.7654435671156525, + "grad_norm": 0.1714082079009884, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.3136, + "step": 1901 + }, + { + "epoch": 1.7663725034835114, + "grad_norm": 0.188207144023739, + "learning_rate": 2.2839931153184166e-05, + "loss": 0.3496, + "step": 1902 + }, + { + "epoch": 1.7673014398513702, + "grad_norm": 0.19149325753512772, + "learning_rate": 2.2822719449225475e-05, + "loss": 0.3471, + "step": 1903 + }, + { + "epoch": 1.768230376219229, + "grad_norm": 0.20844243247605607, + "learning_rate": 2.2805507745266783e-05, + "loss": 0.3635, + "step": 1904 + }, + { + "epoch": 1.769159312587088, + "grad_norm": 0.21302605597258154, + "learning_rate": 2.278829604130809e-05, + "loss": 0.3586, + "step": 1905 + }, + { + "epoch": 1.7700882489549465, + "grad_norm": 0.18803392186519952, + "learning_rate": 2.27710843373494e-05, + "loss": 0.3236, + "step": 1906 + }, + { + "epoch": 1.7710171853228054, + "grad_norm": 0.19461739487306387, + "learning_rate": 2.275387263339071e-05, + "loss": 0.3336, + "step": 1907 + }, + { + "epoch": 1.7719461216906642, + "grad_norm": 0.19855123181901438, + "learning_rate": 2.2736660929432017e-05, + "loss": 0.328, + "step": 1908 + }, + { + "epoch": 1.7728750580585229, + "grad_norm": 0.193296807754737, + "learning_rate": 2.2719449225473325e-05, + "loss": 0.3429, + "step": 1909 + }, + { + "epoch": 1.7738039944263817, + "grad_norm": 0.18250714695301445, + "learning_rate": 2.270223752151463e-05, + "loss": 0.3402, + "step": 1910 + }, + { + "epoch": 1.7747329307942405, + "grad_norm": 0.2207728048782478, + "learning_rate": 2.268502581755594e-05, + "loss": 0.3648, + "step": 1911 + }, + { + "epoch": 1.7756618671620994, + "grad_norm": 0.20763889726318494, + "learning_rate": 2.2667814113597247e-05, + "loss": 0.335, + "step": 1912 + }, + { + "epoch": 1.7765908035299582, + "grad_norm": 0.1802847803327561, + "learning_rate": 2.2650602409638555e-05, + "loss": 0.3659, + "step": 1913 + }, + { + "epoch": 1.777519739897817, + "grad_norm": 0.2017058059062457, + "learning_rate": 2.2633390705679864e-05, + "loss": 0.3229, + "step": 1914 + }, + { + "epoch": 1.778448676265676, + "grad_norm": 0.18380005279157755, + "learning_rate": 2.2616179001721172e-05, + "loss": 0.3297, + "step": 1915 + }, + { + "epoch": 1.7793776126335346, + "grad_norm": 0.20980299060848648, + "learning_rate": 2.259896729776248e-05, + "loss": 0.3603, + "step": 1916 + }, + { + "epoch": 1.7803065490013934, + "grad_norm": 0.21845300686706248, + "learning_rate": 2.258175559380379e-05, + "loss": 0.3526, + "step": 1917 + }, + { + "epoch": 1.7812354853692522, + "grad_norm": 0.18306805762943487, + "learning_rate": 2.2564543889845097e-05, + "loss": 0.3427, + "step": 1918 + }, + { + "epoch": 1.7821644217371109, + "grad_norm": 0.22267478907178537, + "learning_rate": 2.2547332185886406e-05, + "loss": 0.3502, + "step": 1919 + }, + { + "epoch": 1.7830933581049697, + "grad_norm": 0.2114169092592862, + "learning_rate": 2.253012048192771e-05, + "loss": 0.337, + "step": 1920 + }, + { + "epoch": 1.7840222944728286, + "grad_norm": 0.2114698629778862, + "learning_rate": 2.251290877796902e-05, + "loss": 0.3331, + "step": 1921 + }, + { + "epoch": 1.7849512308406874, + "grad_norm": 0.1737401942597242, + "learning_rate": 2.2495697074010328e-05, + "loss": 0.3402, + "step": 1922 + }, + { + "epoch": 1.7858801672085463, + "grad_norm": 0.21860276556477812, + "learning_rate": 2.2478485370051636e-05, + "loss": 0.3785, + "step": 1923 + }, + { + "epoch": 1.786809103576405, + "grad_norm": 0.21990594737894525, + "learning_rate": 2.2461273666092944e-05, + "loss": 0.333, + "step": 1924 + }, + { + "epoch": 1.787738039944264, + "grad_norm": 0.17965595351731087, + "learning_rate": 2.2444061962134253e-05, + "loss": 0.3161, + "step": 1925 + }, + { + "epoch": 1.7886669763121226, + "grad_norm": 0.18037444567864072, + "learning_rate": 2.242685025817556e-05, + "loss": 0.3218, + "step": 1926 + }, + { + "epoch": 1.7895959126799814, + "grad_norm": 0.19458012919376982, + "learning_rate": 2.240963855421687e-05, + "loss": 0.3445, + "step": 1927 + }, + { + "epoch": 1.7905248490478403, + "grad_norm": 0.1870071334265635, + "learning_rate": 2.2392426850258178e-05, + "loss": 0.3467, + "step": 1928 + }, + { + "epoch": 1.7914537854156989, + "grad_norm": 0.20143024738216037, + "learning_rate": 2.2375215146299486e-05, + "loss": 0.3667, + "step": 1929 + }, + { + "epoch": 1.7923827217835577, + "grad_norm": 0.18517571958598458, + "learning_rate": 2.2358003442340795e-05, + "loss": 0.347, + "step": 1930 + }, + { + "epoch": 1.7933116581514166, + "grad_norm": 0.18972740321401171, + "learning_rate": 2.23407917383821e-05, + "loss": 0.3561, + "step": 1931 + }, + { + "epoch": 1.7942405945192754, + "grad_norm": 0.18681340130076599, + "learning_rate": 2.2323580034423408e-05, + "loss": 0.3526, + "step": 1932 + }, + { + "epoch": 1.7951695308871343, + "grad_norm": 0.17839062435491831, + "learning_rate": 2.2306368330464717e-05, + "loss": 0.3492, + "step": 1933 + }, + { + "epoch": 1.7960984672549931, + "grad_norm": 0.19307814521598352, + "learning_rate": 2.2289156626506025e-05, + "loss": 0.3543, + "step": 1934 + }, + { + "epoch": 1.797027403622852, + "grad_norm": 0.18708013233718124, + "learning_rate": 2.2271944922547333e-05, + "loss": 0.3459, + "step": 1935 + }, + { + "epoch": 1.7979563399907106, + "grad_norm": 0.17553651305256152, + "learning_rate": 2.2254733218588642e-05, + "loss": 0.3598, + "step": 1936 + }, + { + "epoch": 1.7988852763585694, + "grad_norm": 0.18618042918495897, + "learning_rate": 2.223752151462995e-05, + "loss": 0.3248, + "step": 1937 + }, + { + "epoch": 1.7998142127264283, + "grad_norm": 0.17742356291112643, + "learning_rate": 2.222030981067126e-05, + "loss": 0.3367, + "step": 1938 + }, + { + "epoch": 1.800743149094287, + "grad_norm": 0.1783779824099535, + "learning_rate": 2.2203098106712567e-05, + "loss": 0.3459, + "step": 1939 + }, + { + "epoch": 1.8016720854621457, + "grad_norm": 0.1873864409342349, + "learning_rate": 2.2185886402753875e-05, + "loss": 0.3411, + "step": 1940 + }, + { + "epoch": 1.8026010218300046, + "grad_norm": 0.19820459741704952, + "learning_rate": 2.216867469879518e-05, + "loss": 0.3622, + "step": 1941 + }, + { + "epoch": 1.8035299581978634, + "grad_norm": 0.19579829256800121, + "learning_rate": 2.215146299483649e-05, + "loss": 0.3488, + "step": 1942 + }, + { + "epoch": 1.8044588945657223, + "grad_norm": 0.18994419202784169, + "learning_rate": 2.2134251290877797e-05, + "loss": 0.3274, + "step": 1943 + }, + { + "epoch": 1.8053878309335811, + "grad_norm": 0.17560731190213005, + "learning_rate": 2.2117039586919106e-05, + "loss": 0.3249, + "step": 1944 + }, + { + "epoch": 1.80631676730144, + "grad_norm": 0.18332672388432894, + "learning_rate": 2.2099827882960414e-05, + "loss": 0.3287, + "step": 1945 + }, + { + "epoch": 1.8072457036692986, + "grad_norm": 0.17542344527664755, + "learning_rate": 2.2082616179001722e-05, + "loss": 0.3438, + "step": 1946 + }, + { + "epoch": 1.8081746400371574, + "grad_norm": 0.19087927519310427, + "learning_rate": 2.206540447504303e-05, + "loss": 0.3422, + "step": 1947 + }, + { + "epoch": 1.8091035764050163, + "grad_norm": 0.17849449852961385, + "learning_rate": 2.204819277108434e-05, + "loss": 0.3461, + "step": 1948 + }, + { + "epoch": 1.810032512772875, + "grad_norm": 0.1994999151338678, + "learning_rate": 2.2030981067125648e-05, + "loss": 0.3394, + "step": 1949 + }, + { + "epoch": 1.8109614491407338, + "grad_norm": 0.1907734719321139, + "learning_rate": 2.2013769363166956e-05, + "loss": 0.3503, + "step": 1950 + }, + { + "epoch": 1.8118903855085926, + "grad_norm": 0.19648653560232324, + "learning_rate": 2.1996557659208264e-05, + "loss": 0.3316, + "step": 1951 + }, + { + "epoch": 1.8128193218764515, + "grad_norm": 0.1896427151890389, + "learning_rate": 2.197934595524957e-05, + "loss": 0.3392, + "step": 1952 + }, + { + "epoch": 1.8137482582443103, + "grad_norm": 0.19027376064820928, + "learning_rate": 2.1962134251290878e-05, + "loss": 0.3537, + "step": 1953 + }, + { + "epoch": 1.8146771946121691, + "grad_norm": 0.1855808710590735, + "learning_rate": 2.1944922547332186e-05, + "loss": 0.3187, + "step": 1954 + }, + { + "epoch": 1.815606130980028, + "grad_norm": 0.1984498093903945, + "learning_rate": 2.1927710843373495e-05, + "loss": 0.3613, + "step": 1955 + }, + { + "epoch": 1.8165350673478868, + "grad_norm": 0.2242505101068666, + "learning_rate": 2.1910499139414803e-05, + "loss": 0.3266, + "step": 1956 + }, + { + "epoch": 1.8174640037157455, + "grad_norm": 0.18137823080536702, + "learning_rate": 2.189328743545611e-05, + "loss": 0.3399, + "step": 1957 + }, + { + "epoch": 1.8183929400836043, + "grad_norm": 0.22546513335231178, + "learning_rate": 2.187607573149742e-05, + "loss": 0.3301, + "step": 1958 + }, + { + "epoch": 1.819321876451463, + "grad_norm": 0.20216140374779656, + "learning_rate": 2.185886402753873e-05, + "loss": 0.3401, + "step": 1959 + }, + { + "epoch": 1.8202508128193218, + "grad_norm": 0.19247489072112667, + "learning_rate": 2.1841652323580037e-05, + "loss": 0.3451, + "step": 1960 + }, + { + "epoch": 1.8211797491871806, + "grad_norm": 0.217041904392123, + "learning_rate": 2.1824440619621345e-05, + "loss": 0.3609, + "step": 1961 + }, + { + "epoch": 1.8221086855550395, + "grad_norm": 0.20709511532226713, + "learning_rate": 2.1807228915662654e-05, + "loss": 0.3389, + "step": 1962 + }, + { + "epoch": 1.8230376219228983, + "grad_norm": 0.20249280813186266, + "learning_rate": 2.179001721170396e-05, + "loss": 0.3367, + "step": 1963 + }, + { + "epoch": 1.8239665582907572, + "grad_norm": 0.21759029061763033, + "learning_rate": 2.1772805507745267e-05, + "loss": 0.3532, + "step": 1964 + }, + { + "epoch": 1.824895494658616, + "grad_norm": 0.19698131495510754, + "learning_rate": 2.1755593803786575e-05, + "loss": 0.3453, + "step": 1965 + }, + { + "epoch": 1.8258244310264748, + "grad_norm": 0.222032209900618, + "learning_rate": 2.1738382099827884e-05, + "loss": 0.3358, + "step": 1966 + }, + { + "epoch": 1.8267533673943335, + "grad_norm": 0.2098001700888135, + "learning_rate": 2.1721170395869192e-05, + "loss": 0.342, + "step": 1967 + }, + { + "epoch": 1.8276823037621923, + "grad_norm": 0.20422523735242673, + "learning_rate": 2.17039586919105e-05, + "loss": 0.321, + "step": 1968 + }, + { + "epoch": 1.828611240130051, + "grad_norm": 0.2012686527729587, + "learning_rate": 2.168674698795181e-05, + "loss": 0.3442, + "step": 1969 + }, + { + "epoch": 1.8295401764979098, + "grad_norm": 0.21283216710918032, + "learning_rate": 2.1669535283993117e-05, + "loss": 0.3588, + "step": 1970 + }, + { + "epoch": 1.8304691128657686, + "grad_norm": 0.19417470172069212, + "learning_rate": 2.1652323580034426e-05, + "loss": 0.3283, + "step": 1971 + }, + { + "epoch": 1.8313980492336275, + "grad_norm": 0.20490789352440664, + "learning_rate": 2.1635111876075734e-05, + "loss": 0.3637, + "step": 1972 + }, + { + "epoch": 1.8323269856014863, + "grad_norm": 0.2069872797582916, + "learning_rate": 2.161790017211704e-05, + "loss": 0.3346, + "step": 1973 + }, + { + "epoch": 1.8332559219693452, + "grad_norm": 0.18465378730661033, + "learning_rate": 2.1600688468158348e-05, + "loss": 0.3514, + "step": 1974 + }, + { + "epoch": 1.834184858337204, + "grad_norm": 0.1772459699016406, + "learning_rate": 2.1583476764199656e-05, + "loss": 0.3446, + "step": 1975 + }, + { + "epoch": 1.8351137947050629, + "grad_norm": 0.2246086295913219, + "learning_rate": 2.1566265060240964e-05, + "loss": 0.3426, + "step": 1976 + }, + { + "epoch": 1.8360427310729215, + "grad_norm": 0.19999809607634841, + "learning_rate": 2.1549053356282273e-05, + "loss": 0.3266, + "step": 1977 + }, + { + "epoch": 1.8369716674407803, + "grad_norm": 0.18937275360959321, + "learning_rate": 2.153184165232358e-05, + "loss": 0.3362, + "step": 1978 + }, + { + "epoch": 1.837900603808639, + "grad_norm": 0.1894381875679942, + "learning_rate": 2.151462994836489e-05, + "loss": 0.3555, + "step": 1979 + }, + { + "epoch": 1.8388295401764978, + "grad_norm": 0.2163255652366772, + "learning_rate": 2.1497418244406198e-05, + "loss": 0.3509, + "step": 1980 + }, + { + "epoch": 1.8397584765443566, + "grad_norm": 0.21158701402876343, + "learning_rate": 2.1480206540447506e-05, + "loss": 0.3419, + "step": 1981 + }, + { + "epoch": 1.8406874129122155, + "grad_norm": 0.1832944886654045, + "learning_rate": 2.1462994836488815e-05, + "loss": 0.3519, + "step": 1982 + }, + { + "epoch": 1.8416163492800743, + "grad_norm": 0.20724791335919457, + "learning_rate": 2.1445783132530123e-05, + "loss": 0.3332, + "step": 1983 + }, + { + "epoch": 1.8425452856479332, + "grad_norm": 0.2026632632941415, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.3535, + "step": 1984 + }, + { + "epoch": 1.843474222015792, + "grad_norm": 0.18716762184495547, + "learning_rate": 2.1411359724612737e-05, + "loss": 0.3388, + "step": 1985 + }, + { + "epoch": 1.8444031583836509, + "grad_norm": 0.19811432373555643, + "learning_rate": 2.1394148020654045e-05, + "loss": 0.338, + "step": 1986 + }, + { + "epoch": 1.8453320947515095, + "grad_norm": 0.2078115035058908, + "learning_rate": 2.1376936316695353e-05, + "loss": 0.3175, + "step": 1987 + }, + { + "epoch": 1.8462610311193683, + "grad_norm": 0.19475185295752664, + "learning_rate": 2.1359724612736662e-05, + "loss": 0.3284, + "step": 1988 + }, + { + "epoch": 1.847189967487227, + "grad_norm": 0.19643704859931127, + "learning_rate": 2.134251290877797e-05, + "loss": 0.3579, + "step": 1989 + }, + { + "epoch": 1.8481189038550858, + "grad_norm": 0.19642321023564513, + "learning_rate": 2.132530120481928e-05, + "loss": 0.3215, + "step": 1990 + }, + { + "epoch": 1.8490478402229447, + "grad_norm": 0.18968037438997556, + "learning_rate": 2.1308089500860587e-05, + "loss": 0.3206, + "step": 1991 + }, + { + "epoch": 1.8499767765908035, + "grad_norm": 0.19787881644954247, + "learning_rate": 2.1290877796901895e-05, + "loss": 0.3362, + "step": 1992 + }, + { + "epoch": 1.8509057129586624, + "grad_norm": 0.19024488827058927, + "learning_rate": 2.1273666092943204e-05, + "loss": 0.3398, + "step": 1993 + }, + { + "epoch": 1.8518346493265212, + "grad_norm": 0.19182808398964474, + "learning_rate": 2.125645438898451e-05, + "loss": 0.3428, + "step": 1994 + }, + { + "epoch": 1.85276358569438, + "grad_norm": 0.21009107426495674, + "learning_rate": 2.1239242685025817e-05, + "loss": 0.3434, + "step": 1995 + }, + { + "epoch": 1.853692522062239, + "grad_norm": 0.1914880152985732, + "learning_rate": 2.1222030981067126e-05, + "loss": 0.327, + "step": 1996 + }, + { + "epoch": 1.8546214584300975, + "grad_norm": 0.1913970478509932, + "learning_rate": 2.1204819277108434e-05, + "loss": 0.34, + "step": 1997 + }, + { + "epoch": 1.8555503947979564, + "grad_norm": 0.2155756840317901, + "learning_rate": 2.1187607573149742e-05, + "loss": 0.3245, + "step": 1998 + }, + { + "epoch": 1.856479331165815, + "grad_norm": 0.18445074398921807, + "learning_rate": 2.117039586919105e-05, + "loss": 0.3429, + "step": 1999 + }, + { + "epoch": 1.8574082675336738, + "grad_norm": 0.18560311928741094, + "learning_rate": 2.115318416523236e-05, + "loss": 0.328, + "step": 2000 + }, + { + "epoch": 1.8583372039015327, + "grad_norm": 0.17532760828844168, + "learning_rate": 2.1135972461273668e-05, + "loss": 0.3499, + "step": 2001 + }, + { + "epoch": 1.8592661402693915, + "grad_norm": 0.18320848398237594, + "learning_rate": 2.1118760757314976e-05, + "loss": 0.3371, + "step": 2002 + }, + { + "epoch": 1.8601950766372504, + "grad_norm": 0.18645021050922914, + "learning_rate": 2.1101549053356284e-05, + "loss": 0.3712, + "step": 2003 + }, + { + "epoch": 1.8611240130051092, + "grad_norm": 0.1930981725638318, + "learning_rate": 2.1084337349397593e-05, + "loss": 0.3631, + "step": 2004 + }, + { + "epoch": 1.862052949372968, + "grad_norm": 0.23294279233206924, + "learning_rate": 2.1067125645438898e-05, + "loss": 0.3627, + "step": 2005 + }, + { + "epoch": 1.862981885740827, + "grad_norm": 0.22252132694320956, + "learning_rate": 2.1049913941480206e-05, + "loss": 0.3251, + "step": 2006 + }, + { + "epoch": 1.8639108221086855, + "grad_norm": 0.18325558308090584, + "learning_rate": 2.1032702237521515e-05, + "loss": 0.3516, + "step": 2007 + }, + { + "epoch": 1.8648397584765444, + "grad_norm": 0.18803938364991366, + "learning_rate": 2.1015490533562823e-05, + "loss": 0.335, + "step": 2008 + }, + { + "epoch": 1.865768694844403, + "grad_norm": 0.21560136159819376, + "learning_rate": 2.099827882960413e-05, + "loss": 0.3502, + "step": 2009 + }, + { + "epoch": 1.8666976312122618, + "grad_norm": 0.2075112616048167, + "learning_rate": 2.098106712564544e-05, + "loss": 0.3448, + "step": 2010 + }, + { + "epoch": 1.8676265675801207, + "grad_norm": 0.1794063031438016, + "learning_rate": 2.0963855421686748e-05, + "loss": 0.3589, + "step": 2011 + }, + { + "epoch": 1.8685555039479795, + "grad_norm": 0.2088734523300138, + "learning_rate": 2.0946643717728057e-05, + "loss": 0.3401, + "step": 2012 + }, + { + "epoch": 1.8694844403158384, + "grad_norm": 0.20514199980300488, + "learning_rate": 2.0929432013769365e-05, + "loss": 0.3562, + "step": 2013 + }, + { + "epoch": 1.8704133766836972, + "grad_norm": 0.1888327050014192, + "learning_rate": 2.0912220309810673e-05, + "loss": 0.3104, + "step": 2014 + }, + { + "epoch": 1.871342313051556, + "grad_norm": 0.18729822949477454, + "learning_rate": 2.0895008605851982e-05, + "loss": 0.3547, + "step": 2015 + }, + { + "epoch": 1.872271249419415, + "grad_norm": 0.22612417730319762, + "learning_rate": 2.0877796901893287e-05, + "loss": 0.3499, + "step": 2016 + }, + { + "epoch": 1.8732001857872735, + "grad_norm": 0.19668595579111908, + "learning_rate": 2.0860585197934595e-05, + "loss": 0.3296, + "step": 2017 + }, + { + "epoch": 1.8741291221551324, + "grad_norm": 0.18571506463595566, + "learning_rate": 2.0843373493975904e-05, + "loss": 0.3312, + "step": 2018 + }, + { + "epoch": 1.875058058522991, + "grad_norm": 0.18777769470240502, + "learning_rate": 2.0826161790017212e-05, + "loss": 0.3234, + "step": 2019 + }, + { + "epoch": 1.8759869948908499, + "grad_norm": 0.18583762914371052, + "learning_rate": 2.080895008605852e-05, + "loss": 0.3271, + "step": 2020 + }, + { + "epoch": 1.8769159312587087, + "grad_norm": 0.1960771159495725, + "learning_rate": 2.079173838209983e-05, + "loss": 0.3581, + "step": 2021 + }, + { + "epoch": 1.8778448676265675, + "grad_norm": 0.19599306930659696, + "learning_rate": 2.0774526678141137e-05, + "loss": 0.357, + "step": 2022 + }, + { + "epoch": 1.8787738039944264, + "grad_norm": 0.1951564097933878, + "learning_rate": 2.0757314974182446e-05, + "loss": 0.3591, + "step": 2023 + }, + { + "epoch": 1.8797027403622852, + "grad_norm": 0.19235765747681716, + "learning_rate": 2.0740103270223754e-05, + "loss": 0.3609, + "step": 2024 + }, + { + "epoch": 1.880631676730144, + "grad_norm": 0.19639988332184477, + "learning_rate": 2.0722891566265062e-05, + "loss": 0.3397, + "step": 2025 + }, + { + "epoch": 1.881560613098003, + "grad_norm": 0.19618499182075172, + "learning_rate": 2.0705679862306367e-05, + "loss": 0.3319, + "step": 2026 + }, + { + "epoch": 1.8824895494658616, + "grad_norm": 0.18391264325568676, + "learning_rate": 2.0688468158347676e-05, + "loss": 0.3668, + "step": 2027 + }, + { + "epoch": 1.8834184858337204, + "grad_norm": 0.1865315427514266, + "learning_rate": 2.0671256454388984e-05, + "loss": 0.357, + "step": 2028 + }, + { + "epoch": 1.884347422201579, + "grad_norm": 0.1965123947388705, + "learning_rate": 2.0654044750430293e-05, + "loss": 0.3317, + "step": 2029 + }, + { + "epoch": 1.8852763585694379, + "grad_norm": 0.19090795755847184, + "learning_rate": 2.06368330464716e-05, + "loss": 0.3409, + "step": 2030 + }, + { + "epoch": 1.8862052949372967, + "grad_norm": 0.2011381053988122, + "learning_rate": 2.061962134251291e-05, + "loss": 0.3579, + "step": 2031 + }, + { + "epoch": 1.8871342313051556, + "grad_norm": 0.17714274475948777, + "learning_rate": 2.0602409638554218e-05, + "loss": 0.3486, + "step": 2032 + }, + { + "epoch": 1.8880631676730144, + "grad_norm": 0.20684528947484848, + "learning_rate": 2.0585197934595526e-05, + "loss": 0.3315, + "step": 2033 + }, + { + "epoch": 1.8889921040408733, + "grad_norm": 0.20594746537045297, + "learning_rate": 2.0567986230636835e-05, + "loss": 0.3518, + "step": 2034 + }, + { + "epoch": 1.889921040408732, + "grad_norm": 0.20843330387434464, + "learning_rate": 2.0550774526678143e-05, + "loss": 0.3377, + "step": 2035 + }, + { + "epoch": 1.890849976776591, + "grad_norm": 0.19119948120789587, + "learning_rate": 2.053356282271945e-05, + "loss": 0.3508, + "step": 2036 + }, + { + "epoch": 1.8917789131444496, + "grad_norm": 0.20499362964321996, + "learning_rate": 2.0516351118760756e-05, + "loss": 0.3386, + "step": 2037 + }, + { + "epoch": 1.8927078495123084, + "grad_norm": 0.20951246709031993, + "learning_rate": 2.0499139414802065e-05, + "loss": 0.3454, + "step": 2038 + }, + { + "epoch": 1.8936367858801673, + "grad_norm": 0.1933469877666157, + "learning_rate": 2.0481927710843373e-05, + "loss": 0.3498, + "step": 2039 + }, + { + "epoch": 1.8945657222480259, + "grad_norm": 0.2033030466421948, + "learning_rate": 2.046471600688468e-05, + "loss": 0.3496, + "step": 2040 + }, + { + "epoch": 1.8954946586158847, + "grad_norm": 0.21435793489097013, + "learning_rate": 2.044750430292599e-05, + "loss": 0.35, + "step": 2041 + }, + { + "epoch": 1.8964235949837436, + "grad_norm": 0.21062698065040608, + "learning_rate": 2.04302925989673e-05, + "loss": 0.3598, + "step": 2042 + }, + { + "epoch": 1.8973525313516024, + "grad_norm": 0.19925766321798463, + "learning_rate": 2.0413080895008607e-05, + "loss": 0.3301, + "step": 2043 + }, + { + "epoch": 1.8982814677194613, + "grad_norm": 0.19489181490424323, + "learning_rate": 2.0395869191049915e-05, + "loss": 0.325, + "step": 2044 + }, + { + "epoch": 1.8992104040873201, + "grad_norm": 0.20174771996042817, + "learning_rate": 2.0378657487091224e-05, + "loss": 0.3308, + "step": 2045 + }, + { + "epoch": 1.900139340455179, + "grad_norm": 0.20408788253809196, + "learning_rate": 2.0361445783132532e-05, + "loss": 0.3211, + "step": 2046 + }, + { + "epoch": 1.9010682768230376, + "grad_norm": 0.1878516530540329, + "learning_rate": 2.0344234079173837e-05, + "loss": 0.3414, + "step": 2047 + }, + { + "epoch": 1.9019972131908964, + "grad_norm": 0.21257975400270182, + "learning_rate": 2.0327022375215145e-05, + "loss": 0.3781, + "step": 2048 + }, + { + "epoch": 1.9029261495587553, + "grad_norm": 0.20595091660025142, + "learning_rate": 2.0309810671256454e-05, + "loss": 0.3373, + "step": 2049 + }, + { + "epoch": 1.903855085926614, + "grad_norm": 0.19877200066200787, + "learning_rate": 2.0292598967297762e-05, + "loss": 0.3417, + "step": 2050 + }, + { + "epoch": 1.9047840222944727, + "grad_norm": 0.20104969915961177, + "learning_rate": 2.027538726333907e-05, + "loss": 0.3467, + "step": 2051 + }, + { + "epoch": 1.9057129586623316, + "grad_norm": 0.2327660603720691, + "learning_rate": 2.025817555938038e-05, + "loss": 0.3568, + "step": 2052 + }, + { + "epoch": 1.9066418950301904, + "grad_norm": 0.18750059187569587, + "learning_rate": 2.0240963855421687e-05, + "loss": 0.3427, + "step": 2053 + }, + { + "epoch": 1.9075708313980493, + "grad_norm": 0.18529718656829258, + "learning_rate": 2.0223752151462996e-05, + "loss": 0.3657, + "step": 2054 + }, + { + "epoch": 1.9084997677659081, + "grad_norm": 0.21378367120455413, + "learning_rate": 2.0206540447504304e-05, + "loss": 0.3448, + "step": 2055 + }, + { + "epoch": 1.909428704133767, + "grad_norm": 0.19816996642108167, + "learning_rate": 2.0189328743545613e-05, + "loss": 0.3379, + "step": 2056 + }, + { + "epoch": 1.9103576405016256, + "grad_norm": 0.17952569303828486, + "learning_rate": 2.017211703958692e-05, + "loss": 0.3495, + "step": 2057 + }, + { + "epoch": 1.9112865768694844, + "grad_norm": 0.1887115752212257, + "learning_rate": 2.0154905335628226e-05, + "loss": 0.3528, + "step": 2058 + }, + { + "epoch": 1.9122155132373433, + "grad_norm": 0.19795487285379446, + "learning_rate": 2.0137693631669534e-05, + "loss": 0.312, + "step": 2059 + }, + { + "epoch": 1.913144449605202, + "grad_norm": 0.1913042487899833, + "learning_rate": 2.0120481927710843e-05, + "loss": 0.3474, + "step": 2060 + }, + { + "epoch": 1.9140733859730608, + "grad_norm": 0.18702333471919602, + "learning_rate": 2.010327022375215e-05, + "loss": 0.3313, + "step": 2061 + }, + { + "epoch": 1.9150023223409196, + "grad_norm": 0.18216724495400727, + "learning_rate": 2.008605851979346e-05, + "loss": 0.3273, + "step": 2062 + }, + { + "epoch": 1.9159312587087785, + "grad_norm": 0.1870733559834162, + "learning_rate": 2.0068846815834768e-05, + "loss": 0.328, + "step": 2063 + }, + { + "epoch": 1.9168601950766373, + "grad_norm": 0.20069109494783774, + "learning_rate": 2.0051635111876076e-05, + "loss": 0.3468, + "step": 2064 + }, + { + "epoch": 1.9177891314444961, + "grad_norm": 0.1788738115806139, + "learning_rate": 2.0034423407917385e-05, + "loss": 0.3186, + "step": 2065 + }, + { + "epoch": 1.918718067812355, + "grad_norm": 0.18264576767730586, + "learning_rate": 2.0017211703958693e-05, + "loss": 0.3399, + "step": 2066 + }, + { + "epoch": 1.9196470041802136, + "grad_norm": 0.1798033926304608, + "learning_rate": 2e-05, + "loss": 0.3475, + "step": 2067 + }, + { + "epoch": 1.9205759405480725, + "grad_norm": 0.20622934405855825, + "learning_rate": 1.998278829604131e-05, + "loss": 0.3448, + "step": 2068 + }, + { + "epoch": 1.9215048769159313, + "grad_norm": 0.20482612442847245, + "learning_rate": 1.9965576592082615e-05, + "loss": 0.3479, + "step": 2069 + }, + { + "epoch": 1.92243381328379, + "grad_norm": 0.19506902393707024, + "learning_rate": 1.9948364888123923e-05, + "loss": 0.3428, + "step": 2070 + }, + { + "epoch": 1.9233627496516488, + "grad_norm": 0.20966049670336326, + "learning_rate": 1.9931153184165232e-05, + "loss": 0.344, + "step": 2071 + }, + { + "epoch": 1.9242916860195076, + "grad_norm": 0.1865501634784815, + "learning_rate": 1.991394148020654e-05, + "loss": 0.3616, + "step": 2072 + }, + { + "epoch": 1.9252206223873665, + "grad_norm": 0.196751064921281, + "learning_rate": 1.989672977624785e-05, + "loss": 0.3318, + "step": 2073 + }, + { + "epoch": 1.9261495587552253, + "grad_norm": 0.1804349938052275, + "learning_rate": 1.9879518072289157e-05, + "loss": 0.3416, + "step": 2074 + }, + { + "epoch": 1.9270784951230842, + "grad_norm": 0.18106072316653846, + "learning_rate": 1.9862306368330465e-05, + "loss": 0.3336, + "step": 2075 + }, + { + "epoch": 1.928007431490943, + "grad_norm": 0.18984032415869853, + "learning_rate": 1.9845094664371774e-05, + "loss": 0.35, + "step": 2076 + }, + { + "epoch": 1.9289363678588016, + "grad_norm": 0.21925027722818818, + "learning_rate": 1.9827882960413082e-05, + "loss": 0.3428, + "step": 2077 + }, + { + "epoch": 1.9298653042266605, + "grad_norm": 0.17538934019640326, + "learning_rate": 1.981067125645439e-05, + "loss": 0.3489, + "step": 2078 + }, + { + "epoch": 1.9307942405945193, + "grad_norm": 0.21247914753683844, + "learning_rate": 1.9793459552495696e-05, + "loss": 0.3436, + "step": 2079 + }, + { + "epoch": 1.931723176962378, + "grad_norm": 0.20170277926352845, + "learning_rate": 1.9776247848537004e-05, + "loss": 0.3665, + "step": 2080 + }, + { + "epoch": 1.9326521133302368, + "grad_norm": 0.21489372804535314, + "learning_rate": 1.9759036144578312e-05, + "loss": 0.3271, + "step": 2081 + }, + { + "epoch": 1.9335810496980956, + "grad_norm": 0.19404730364157935, + "learning_rate": 1.974182444061962e-05, + "loss": 0.3395, + "step": 2082 + }, + { + "epoch": 1.9345099860659545, + "grad_norm": 0.1768311883365791, + "learning_rate": 1.972461273666093e-05, + "loss": 0.3304, + "step": 2083 + }, + { + "epoch": 1.9354389224338133, + "grad_norm": 0.19513776903048077, + "learning_rate": 1.9707401032702238e-05, + "loss": 0.3355, + "step": 2084 + }, + { + "epoch": 1.9363678588016722, + "grad_norm": 0.20552603770095754, + "learning_rate": 1.9690189328743546e-05, + "loss": 0.3586, + "step": 2085 + }, + { + "epoch": 1.937296795169531, + "grad_norm": 0.20606350494140052, + "learning_rate": 1.9672977624784854e-05, + "loss": 0.341, + "step": 2086 + }, + { + "epoch": 1.9382257315373896, + "grad_norm": 0.21117517653181553, + "learning_rate": 1.9655765920826163e-05, + "loss": 0.3556, + "step": 2087 + }, + { + "epoch": 1.9391546679052485, + "grad_norm": 0.19889606660583597, + "learning_rate": 1.963855421686747e-05, + "loss": 0.3754, + "step": 2088 + }, + { + "epoch": 1.9400836042731073, + "grad_norm": 0.2315272588969166, + "learning_rate": 1.962134251290878e-05, + "loss": 0.3625, + "step": 2089 + }, + { + "epoch": 1.941012540640966, + "grad_norm": 0.1801392600297905, + "learning_rate": 1.9604130808950085e-05, + "loss": 0.3678, + "step": 2090 + }, + { + "epoch": 1.9419414770088248, + "grad_norm": 0.20819332360704426, + "learning_rate": 1.9586919104991393e-05, + "loss": 0.3413, + "step": 2091 + }, + { + "epoch": 1.9428704133766836, + "grad_norm": 0.20855084760144948, + "learning_rate": 1.95697074010327e-05, + "loss": 0.3479, + "step": 2092 + }, + { + "epoch": 1.9437993497445425, + "grad_norm": 0.1934591269154208, + "learning_rate": 1.955249569707401e-05, + "loss": 0.3292, + "step": 2093 + }, + { + "epoch": 1.9447282861124013, + "grad_norm": 0.19888631185262237, + "learning_rate": 1.9535283993115318e-05, + "loss": 0.3517, + "step": 2094 + }, + { + "epoch": 1.9456572224802602, + "grad_norm": 0.23696777928124235, + "learning_rate": 1.9518072289156627e-05, + "loss": 0.3603, + "step": 2095 + }, + { + "epoch": 1.946586158848119, + "grad_norm": 0.1930501835027152, + "learning_rate": 1.9500860585197935e-05, + "loss": 0.3511, + "step": 2096 + }, + { + "epoch": 1.9475150952159777, + "grad_norm": 0.1847048438548015, + "learning_rate": 1.9483648881239243e-05, + "loss": 0.3256, + "step": 2097 + }, + { + "epoch": 1.9484440315838365, + "grad_norm": 0.17904191429627417, + "learning_rate": 1.9466437177280552e-05, + "loss": 0.3526, + "step": 2098 + }, + { + "epoch": 1.9493729679516953, + "grad_norm": 0.21564625082297997, + "learning_rate": 1.944922547332186e-05, + "loss": 0.3336, + "step": 2099 + }, + { + "epoch": 1.950301904319554, + "grad_norm": 0.18093745627216662, + "learning_rate": 1.9432013769363165e-05, + "loss": 0.337, + "step": 2100 + }, + { + "epoch": 1.9512308406874128, + "grad_norm": 0.20861506817234635, + "learning_rate": 1.9414802065404474e-05, + "loss": 0.3545, + "step": 2101 + }, + { + "epoch": 1.9521597770552717, + "grad_norm": 0.18651110392988504, + "learning_rate": 1.9397590361445782e-05, + "loss": 0.3382, + "step": 2102 + }, + { + "epoch": 1.9530887134231305, + "grad_norm": 0.19498741795933872, + "learning_rate": 1.938037865748709e-05, + "loss": 0.3196, + "step": 2103 + }, + { + "epoch": 1.9540176497909894, + "grad_norm": 0.19938426190865807, + "learning_rate": 1.93631669535284e-05, + "loss": 0.3439, + "step": 2104 + }, + { + "epoch": 1.9549465861588482, + "grad_norm": 0.20167566769499962, + "learning_rate": 1.9345955249569707e-05, + "loss": 0.3422, + "step": 2105 + }, + { + "epoch": 1.955875522526707, + "grad_norm": 0.1858484130472836, + "learning_rate": 1.9328743545611016e-05, + "loss": 0.3334, + "step": 2106 + }, + { + "epoch": 1.9568044588945657, + "grad_norm": 0.19150012147084414, + "learning_rate": 1.9311531841652324e-05, + "loss": 0.3199, + "step": 2107 + }, + { + "epoch": 1.9577333952624245, + "grad_norm": 0.20387383733567602, + "learning_rate": 1.9294320137693632e-05, + "loss": 0.3531, + "step": 2108 + }, + { + "epoch": 1.9586623316302834, + "grad_norm": 0.18764227954582366, + "learning_rate": 1.927710843373494e-05, + "loss": 0.3485, + "step": 2109 + }, + { + "epoch": 1.959591267998142, + "grad_norm": 0.19184067561980478, + "learning_rate": 1.925989672977625e-05, + "loss": 0.3375, + "step": 2110 + }, + { + "epoch": 1.9605202043660008, + "grad_norm": 0.18361672644385202, + "learning_rate": 1.9242685025817554e-05, + "loss": 0.3411, + "step": 2111 + }, + { + "epoch": 1.9614491407338597, + "grad_norm": 0.19465647573588937, + "learning_rate": 1.9225473321858863e-05, + "loss": 0.3273, + "step": 2112 + }, + { + "epoch": 1.9623780771017185, + "grad_norm": 0.18345387692416829, + "learning_rate": 1.920826161790017e-05, + "loss": 0.3534, + "step": 2113 + }, + { + "epoch": 1.9633070134695774, + "grad_norm": 0.18440352667913198, + "learning_rate": 1.919104991394148e-05, + "loss": 0.3338, + "step": 2114 + }, + { + "epoch": 1.9642359498374362, + "grad_norm": 0.1905629308788521, + "learning_rate": 1.9173838209982788e-05, + "loss": 0.348, + "step": 2115 + }, + { + "epoch": 1.965164886205295, + "grad_norm": 0.17738331253996667, + "learning_rate": 1.9156626506024096e-05, + "loss": 0.338, + "step": 2116 + }, + { + "epoch": 1.966093822573154, + "grad_norm": 0.19317990178767253, + "learning_rate": 1.9139414802065405e-05, + "loss": 0.3463, + "step": 2117 + }, + { + "epoch": 1.9670227589410125, + "grad_norm": 0.19118447547256318, + "learning_rate": 1.9122203098106713e-05, + "loss": 0.3205, + "step": 2118 + }, + { + "epoch": 1.9679516953088714, + "grad_norm": 0.1941495864820665, + "learning_rate": 1.910499139414802e-05, + "loss": 0.3539, + "step": 2119 + }, + { + "epoch": 1.96888063167673, + "grad_norm": 0.19760267671496076, + "learning_rate": 1.908777969018933e-05, + "loss": 0.3464, + "step": 2120 + }, + { + "epoch": 1.9698095680445888, + "grad_norm": 0.20254146860119207, + "learning_rate": 1.9070567986230638e-05, + "loss": 0.3499, + "step": 2121 + }, + { + "epoch": 1.9707385044124477, + "grad_norm": 0.1843345665414209, + "learning_rate": 1.9053356282271943e-05, + "loss": 0.3208, + "step": 2122 + }, + { + "epoch": 1.9716674407803065, + "grad_norm": 0.18931583414329914, + "learning_rate": 1.9036144578313252e-05, + "loss": 0.3314, + "step": 2123 + }, + { + "epoch": 1.9725963771481654, + "grad_norm": 0.17544270577499563, + "learning_rate": 1.901893287435456e-05, + "loss": 0.3287, + "step": 2124 + }, + { + "epoch": 1.9735253135160242, + "grad_norm": 0.19688051277717727, + "learning_rate": 1.900172117039587e-05, + "loss": 0.3522, + "step": 2125 + }, + { + "epoch": 1.974454249883883, + "grad_norm": 0.19873447267182606, + "learning_rate": 1.8984509466437177e-05, + "loss": 0.3312, + "step": 2126 + }, + { + "epoch": 1.975383186251742, + "grad_norm": 0.19087379498613669, + "learning_rate": 1.8967297762478485e-05, + "loss": 0.3676, + "step": 2127 + }, + { + "epoch": 1.9763121226196005, + "grad_norm": 0.19614887292658442, + "learning_rate": 1.8950086058519794e-05, + "loss": 0.3592, + "step": 2128 + }, + { + "epoch": 1.9772410589874594, + "grad_norm": 0.20900078157895577, + "learning_rate": 1.8932874354561102e-05, + "loss": 0.3431, + "step": 2129 + }, + { + "epoch": 1.978169995355318, + "grad_norm": 0.19572251469376092, + "learning_rate": 1.891566265060241e-05, + "loss": 0.3266, + "step": 2130 + }, + { + "epoch": 1.9790989317231769, + "grad_norm": 0.17443311056477603, + "learning_rate": 1.889845094664372e-05, + "loss": 0.3338, + "step": 2131 + }, + { + "epoch": 1.9800278680910357, + "grad_norm": 0.20884670381056047, + "learning_rate": 1.8881239242685024e-05, + "loss": 0.3434, + "step": 2132 + }, + { + "epoch": 1.9809568044588945, + "grad_norm": 0.19322917648542368, + "learning_rate": 1.8864027538726332e-05, + "loss": 0.3201, + "step": 2133 + }, + { + "epoch": 1.9818857408267534, + "grad_norm": 0.18161054306051905, + "learning_rate": 1.884681583476764e-05, + "loss": 0.3369, + "step": 2134 + }, + { + "epoch": 1.9828146771946122, + "grad_norm": 0.19995407241341367, + "learning_rate": 1.882960413080895e-05, + "loss": 0.3286, + "step": 2135 + }, + { + "epoch": 1.983743613562471, + "grad_norm": 0.1989383783308988, + "learning_rate": 1.8812392426850258e-05, + "loss": 0.3627, + "step": 2136 + }, + { + "epoch": 1.98467254993033, + "grad_norm": 0.20119453420571734, + "learning_rate": 1.8795180722891566e-05, + "loss": 0.3256, + "step": 2137 + }, + { + "epoch": 1.9856014862981886, + "grad_norm": 0.2182021666836405, + "learning_rate": 1.8777969018932874e-05, + "loss": 0.3461, + "step": 2138 + }, + { + "epoch": 1.9865304226660474, + "grad_norm": 0.2102127576334096, + "learning_rate": 1.8760757314974183e-05, + "loss": 0.3416, + "step": 2139 + }, + { + "epoch": 1.987459359033906, + "grad_norm": 0.18281161210727, + "learning_rate": 1.874354561101549e-05, + "loss": 0.3288, + "step": 2140 + }, + { + "epoch": 1.9883882954017649, + "grad_norm": 0.21447362464818037, + "learning_rate": 1.87263339070568e-05, + "loss": 0.3259, + "step": 2141 + }, + { + "epoch": 1.9893172317696237, + "grad_norm": 0.2115018838882039, + "learning_rate": 1.8709122203098108e-05, + "loss": 0.3453, + "step": 2142 + }, + { + "epoch": 1.9902461681374826, + "grad_norm": 0.1960898048792348, + "learning_rate": 1.8691910499139413e-05, + "loss": 0.3445, + "step": 2143 + }, + { + "epoch": 1.9911751045053414, + "grad_norm": 0.1930005268145748, + "learning_rate": 1.867469879518072e-05, + "loss": 0.3257, + "step": 2144 + }, + { + "epoch": 1.9921040408732003, + "grad_norm": 0.22568106040346528, + "learning_rate": 1.865748709122203e-05, + "loss": 0.3308, + "step": 2145 + }, + { + "epoch": 1.993032977241059, + "grad_norm": 0.1984125411749985, + "learning_rate": 1.8640275387263338e-05, + "loss": 0.3465, + "step": 2146 + }, + { + "epoch": 1.993961913608918, + "grad_norm": 0.19270028218557986, + "learning_rate": 1.8623063683304647e-05, + "loss": 0.3766, + "step": 2147 + }, + { + "epoch": 1.9948908499767766, + "grad_norm": 0.21961399982931906, + "learning_rate": 1.8605851979345955e-05, + "loss": 0.364, + "step": 2148 + }, + { + "epoch": 1.9958197863446354, + "grad_norm": 0.19683946052220597, + "learning_rate": 1.8588640275387263e-05, + "loss": 0.3494, + "step": 2149 + }, + { + "epoch": 1.996748722712494, + "grad_norm": 0.2216547899837589, + "learning_rate": 1.8571428571428572e-05, + "loss": 0.3343, + "step": 2150 + }, + { + "epoch": 1.9976776590803529, + "grad_norm": 0.18513919900783316, + "learning_rate": 1.855421686746988e-05, + "loss": 0.3333, + "step": 2151 + }, + { + "epoch": 1.9986065954482117, + "grad_norm": 0.19337964070214114, + "learning_rate": 1.853700516351119e-05, + "loss": 0.349, + "step": 2152 + }, + { + "epoch": 1.9995355318160706, + "grad_norm": 0.22054539434718806, + "learning_rate": 1.8519793459552494e-05, + "loss": 0.3425, + "step": 2153 + }, + { + "epoch": 2.0, + "grad_norm": 0.3469287314164476, + "learning_rate": 1.8502581755593802e-05, + "loss": 0.3178, + "step": 2154 + }, + { + "epoch": 2.000928936367859, + "grad_norm": 0.2157058376159686, + "learning_rate": 1.848537005163511e-05, + "loss": 0.2774, + "step": 2155 + }, + { + "epoch": 2.0018578727357177, + "grad_norm": 0.23148826903740533, + "learning_rate": 1.846815834767642e-05, + "loss": 0.2857, + "step": 2156 + }, + { + "epoch": 2.0027868091035765, + "grad_norm": 0.2528270886829438, + "learning_rate": 1.8450946643717727e-05, + "loss": 0.2855, + "step": 2157 + }, + { + "epoch": 2.0037157454714354, + "grad_norm": 0.24255204959272156, + "learning_rate": 1.8433734939759036e-05, + "loss": 0.2678, + "step": 2158 + }, + { + "epoch": 2.0046446818392942, + "grad_norm": 0.21988723244876518, + "learning_rate": 1.8416523235800344e-05, + "loss": 0.2666, + "step": 2159 + }, + { + "epoch": 2.0055736182071526, + "grad_norm": 0.230258825927274, + "learning_rate": 1.8399311531841652e-05, + "loss": 0.2855, + "step": 2160 + }, + { + "epoch": 2.0065025545750115, + "grad_norm": 0.24750969996912872, + "learning_rate": 1.838209982788296e-05, + "loss": 0.274, + "step": 2161 + }, + { + "epoch": 2.0074314909428703, + "grad_norm": 0.21415269672959325, + "learning_rate": 1.836488812392427e-05, + "loss": 0.2559, + "step": 2162 + }, + { + "epoch": 2.008360427310729, + "grad_norm": 0.24277353543089242, + "learning_rate": 1.8347676419965578e-05, + "loss": 0.2881, + "step": 2163 + }, + { + "epoch": 2.009289363678588, + "grad_norm": 0.22462647600480273, + "learning_rate": 1.8330464716006886e-05, + "loss": 0.273, + "step": 2164 + }, + { + "epoch": 2.010218300046447, + "grad_norm": 0.23734616553826293, + "learning_rate": 1.8313253012048194e-05, + "loss": 0.2876, + "step": 2165 + }, + { + "epoch": 2.0111472364143057, + "grad_norm": 0.24635100347524322, + "learning_rate": 1.82960413080895e-05, + "loss": 0.2943, + "step": 2166 + }, + { + "epoch": 2.0120761727821646, + "grad_norm": 0.23031332092273404, + "learning_rate": 1.8278829604130808e-05, + "loss": 0.2724, + "step": 2167 + }, + { + "epoch": 2.0130051091500234, + "grad_norm": 0.20356233261897416, + "learning_rate": 1.8261617900172116e-05, + "loss": 0.2768, + "step": 2168 + }, + { + "epoch": 2.0139340455178822, + "grad_norm": 0.24232305321170966, + "learning_rate": 1.8244406196213425e-05, + "loss": 0.278, + "step": 2169 + }, + { + "epoch": 2.0148629818857406, + "grad_norm": 0.20514711352879814, + "learning_rate": 1.8227194492254733e-05, + "loss": 0.2889, + "step": 2170 + }, + { + "epoch": 2.0157919182535995, + "grad_norm": 0.21172830872100393, + "learning_rate": 1.820998278829604e-05, + "loss": 0.2795, + "step": 2171 + }, + { + "epoch": 2.0167208546214583, + "grad_norm": 0.19343812296165566, + "learning_rate": 1.819277108433735e-05, + "loss": 0.27, + "step": 2172 + }, + { + "epoch": 2.017649790989317, + "grad_norm": 0.22135939544753655, + "learning_rate": 1.8175559380378658e-05, + "loss": 0.2653, + "step": 2173 + }, + { + "epoch": 2.018578727357176, + "grad_norm": 0.21690056177642822, + "learning_rate": 1.8158347676419967e-05, + "loss": 0.2739, + "step": 2174 + }, + { + "epoch": 2.019507663725035, + "grad_norm": 0.21806288455919925, + "learning_rate": 1.8141135972461275e-05, + "loss": 0.2973, + "step": 2175 + }, + { + "epoch": 2.0204366000928937, + "grad_norm": 0.22552713294809715, + "learning_rate": 1.8123924268502583e-05, + "loss": 0.2854, + "step": 2176 + }, + { + "epoch": 2.0213655364607526, + "grad_norm": 0.2055813988009532, + "learning_rate": 1.8106712564543892e-05, + "loss": 0.2806, + "step": 2177 + }, + { + "epoch": 2.0222944728286114, + "grad_norm": 0.20183039247919923, + "learning_rate": 1.8089500860585197e-05, + "loss": 0.2812, + "step": 2178 + }, + { + "epoch": 2.0232234091964703, + "grad_norm": 0.19722763146647482, + "learning_rate": 1.8072289156626505e-05, + "loss": 0.2677, + "step": 2179 + }, + { + "epoch": 2.0241523455643287, + "grad_norm": 0.21974854118511616, + "learning_rate": 1.8055077452667814e-05, + "loss": 0.2869, + "step": 2180 + }, + { + "epoch": 2.0250812819321875, + "grad_norm": 0.20162885066764683, + "learning_rate": 1.8037865748709122e-05, + "loss": 0.2793, + "step": 2181 + }, + { + "epoch": 2.0260102183000464, + "grad_norm": 0.18375708438721025, + "learning_rate": 1.802065404475043e-05, + "loss": 0.2693, + "step": 2182 + }, + { + "epoch": 2.026939154667905, + "grad_norm": 0.2180582338151327, + "learning_rate": 1.800344234079174e-05, + "loss": 0.2778, + "step": 2183 + }, + { + "epoch": 2.027868091035764, + "grad_norm": 0.1904035088310135, + "learning_rate": 1.7986230636833047e-05, + "loss": 0.267, + "step": 2184 + }, + { + "epoch": 2.028797027403623, + "grad_norm": 0.21014267773827153, + "learning_rate": 1.7969018932874356e-05, + "loss": 0.2888, + "step": 2185 + }, + { + "epoch": 2.0297259637714817, + "grad_norm": 0.19891687553756965, + "learning_rate": 1.7951807228915664e-05, + "loss": 0.2874, + "step": 2186 + }, + { + "epoch": 2.0306549001393406, + "grad_norm": 0.20789519384380667, + "learning_rate": 1.7934595524956972e-05, + "loss": 0.2623, + "step": 2187 + }, + { + "epoch": 2.0315838365071994, + "grad_norm": 0.20429364046527473, + "learning_rate": 1.791738382099828e-05, + "loss": 0.2784, + "step": 2188 + }, + { + "epoch": 2.0325127728750583, + "grad_norm": 0.19034811630615903, + "learning_rate": 1.790017211703959e-05, + "loss": 0.292, + "step": 2189 + }, + { + "epoch": 2.0334417092429167, + "grad_norm": 0.20841543986466363, + "learning_rate": 1.7882960413080894e-05, + "loss": 0.2819, + "step": 2190 + }, + { + "epoch": 2.0343706456107755, + "grad_norm": 0.1955048400721127, + "learning_rate": 1.7865748709122203e-05, + "loss": 0.2641, + "step": 2191 + }, + { + "epoch": 2.0352995819786344, + "grad_norm": 0.1830392111751671, + "learning_rate": 1.784853700516351e-05, + "loss": 0.2671, + "step": 2192 + }, + { + "epoch": 2.036228518346493, + "grad_norm": 0.1902652316296126, + "learning_rate": 1.783132530120482e-05, + "loss": 0.2767, + "step": 2193 + }, + { + "epoch": 2.037157454714352, + "grad_norm": 0.19416968904801624, + "learning_rate": 1.7814113597246128e-05, + "loss": 0.2768, + "step": 2194 + }, + { + "epoch": 2.038086391082211, + "grad_norm": 0.22500831983539799, + "learning_rate": 1.7796901893287436e-05, + "loss": 0.2738, + "step": 2195 + }, + { + "epoch": 2.0390153274500697, + "grad_norm": 0.1928120950604119, + "learning_rate": 1.7779690189328745e-05, + "loss": 0.2811, + "step": 2196 + }, + { + "epoch": 2.0399442638179286, + "grad_norm": 0.17208435269872213, + "learning_rate": 1.7762478485370053e-05, + "loss": 0.2755, + "step": 2197 + }, + { + "epoch": 2.0408732001857874, + "grad_norm": 0.18090731260063056, + "learning_rate": 1.774526678141136e-05, + "loss": 0.2544, + "step": 2198 + }, + { + "epoch": 2.0418021365536463, + "grad_norm": 0.183776818234785, + "learning_rate": 1.772805507745267e-05, + "loss": 0.2866, + "step": 2199 + }, + { + "epoch": 2.0427310729215047, + "grad_norm": 0.18371201328625772, + "learning_rate": 1.7710843373493978e-05, + "loss": 0.2704, + "step": 2200 + }, + { + "epoch": 2.0436600092893635, + "grad_norm": 0.18123129067098215, + "learning_rate": 1.7693631669535287e-05, + "loss": 0.3001, + "step": 2201 + }, + { + "epoch": 2.0445889456572224, + "grad_norm": 0.1811746843377487, + "learning_rate": 1.7676419965576595e-05, + "loss": 0.2676, + "step": 2202 + }, + { + "epoch": 2.0455178820250812, + "grad_norm": 0.18554403639433137, + "learning_rate": 1.76592082616179e-05, + "loss": 0.2738, + "step": 2203 + }, + { + "epoch": 2.04644681839294, + "grad_norm": 0.19747699767180618, + "learning_rate": 1.764199655765921e-05, + "loss": 0.2763, + "step": 2204 + }, + { + "epoch": 2.047375754760799, + "grad_norm": 0.1935650970013363, + "learning_rate": 1.7624784853700517e-05, + "loss": 0.2715, + "step": 2205 + }, + { + "epoch": 2.0483046911286578, + "grad_norm": 0.1854401984756031, + "learning_rate": 1.7607573149741825e-05, + "loss": 0.2789, + "step": 2206 + }, + { + "epoch": 2.0492336274965166, + "grad_norm": 0.1878981380085049, + "learning_rate": 1.7590361445783134e-05, + "loss": 0.2747, + "step": 2207 + }, + { + "epoch": 2.0501625638643755, + "grad_norm": 0.18824729243879953, + "learning_rate": 1.7573149741824442e-05, + "loss": 0.2766, + "step": 2208 + }, + { + "epoch": 2.0510915002322343, + "grad_norm": 0.18511176681612157, + "learning_rate": 1.755593803786575e-05, + "loss": 0.2808, + "step": 2209 + }, + { + "epoch": 2.0520204366000927, + "grad_norm": 0.17356200032688698, + "learning_rate": 1.753872633390706e-05, + "loss": 0.2739, + "step": 2210 + }, + { + "epoch": 2.0529493729679515, + "grad_norm": 0.1751736860021714, + "learning_rate": 1.7521514629948367e-05, + "loss": 0.2685, + "step": 2211 + }, + { + "epoch": 2.0538783093358104, + "grad_norm": 0.18256208438125998, + "learning_rate": 1.7504302925989676e-05, + "loss": 0.2604, + "step": 2212 + }, + { + "epoch": 2.0548072457036692, + "grad_norm": 0.1767328307937035, + "learning_rate": 1.7487091222030984e-05, + "loss": 0.2673, + "step": 2213 + }, + { + "epoch": 2.055736182071528, + "grad_norm": 0.18036991264224542, + "learning_rate": 1.7469879518072292e-05, + "loss": 0.2735, + "step": 2214 + }, + { + "epoch": 2.056665118439387, + "grad_norm": 0.17618684914959876, + "learning_rate": 1.7452667814113597e-05, + "loss": 0.2671, + "step": 2215 + }, + { + "epoch": 2.0575940548072458, + "grad_norm": 0.22278489819230726, + "learning_rate": 1.7435456110154906e-05, + "loss": 0.2953, + "step": 2216 + }, + { + "epoch": 2.0585229911751046, + "grad_norm": 0.1957544879322214, + "learning_rate": 1.7418244406196214e-05, + "loss": 0.29, + "step": 2217 + }, + { + "epoch": 2.0594519275429635, + "grad_norm": 0.20251022082815323, + "learning_rate": 1.7401032702237523e-05, + "loss": 0.2655, + "step": 2218 + }, + { + "epoch": 2.0603808639108223, + "grad_norm": 0.18210373540439084, + "learning_rate": 1.738382099827883e-05, + "loss": 0.2719, + "step": 2219 + }, + { + "epoch": 2.0613098002786807, + "grad_norm": 0.19081176507689804, + "learning_rate": 1.736660929432014e-05, + "loss": 0.289, + "step": 2220 + }, + { + "epoch": 2.0622387366465396, + "grad_norm": 0.17312113263605755, + "learning_rate": 1.7349397590361448e-05, + "loss": 0.2707, + "step": 2221 + }, + { + "epoch": 2.0631676730143984, + "grad_norm": 0.2042563838051068, + "learning_rate": 1.7332185886402756e-05, + "loss": 0.2734, + "step": 2222 + }, + { + "epoch": 2.0640966093822573, + "grad_norm": 0.19562693428663167, + "learning_rate": 1.7314974182444065e-05, + "loss": 0.2735, + "step": 2223 + }, + { + "epoch": 2.065025545750116, + "grad_norm": 0.207464339658486, + "learning_rate": 1.7297762478485373e-05, + "loss": 0.2771, + "step": 2224 + }, + { + "epoch": 2.065954482117975, + "grad_norm": 0.20297657867438057, + "learning_rate": 1.728055077452668e-05, + "loss": 0.2945, + "step": 2225 + }, + { + "epoch": 2.066883418485834, + "grad_norm": 0.19308466101824381, + "learning_rate": 1.726333907056799e-05, + "loss": 0.2761, + "step": 2226 + }, + { + "epoch": 2.0678123548536926, + "grad_norm": 0.1940231498846366, + "learning_rate": 1.7246127366609295e-05, + "loss": 0.2781, + "step": 2227 + }, + { + "epoch": 2.0687412912215515, + "grad_norm": 0.19959094783677078, + "learning_rate": 1.7228915662650603e-05, + "loss": 0.283, + "step": 2228 + }, + { + "epoch": 2.0696702275894103, + "grad_norm": 0.17718713745827044, + "learning_rate": 1.721170395869191e-05, + "loss": 0.2731, + "step": 2229 + }, + { + "epoch": 2.0705991639572687, + "grad_norm": 0.20114808299866702, + "learning_rate": 1.719449225473322e-05, + "loss": 0.2682, + "step": 2230 + }, + { + "epoch": 2.0715281003251276, + "grad_norm": 0.18891743595084226, + "learning_rate": 1.717728055077453e-05, + "loss": 0.268, + "step": 2231 + }, + { + "epoch": 2.0724570366929864, + "grad_norm": 0.19305694440730528, + "learning_rate": 1.7160068846815837e-05, + "loss": 0.2864, + "step": 2232 + }, + { + "epoch": 2.0733859730608453, + "grad_norm": 0.18612754148970131, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.2804, + "step": 2233 + }, + { + "epoch": 2.074314909428704, + "grad_norm": 0.21870942336091884, + "learning_rate": 1.7125645438898454e-05, + "loss": 0.2803, + "step": 2234 + }, + { + "epoch": 2.075243845796563, + "grad_norm": 0.1905749129757181, + "learning_rate": 1.7108433734939762e-05, + "loss": 0.2769, + "step": 2235 + }, + { + "epoch": 2.076172782164422, + "grad_norm": 0.17958345546132004, + "learning_rate": 1.709122203098107e-05, + "loss": 0.2676, + "step": 2236 + }, + { + "epoch": 2.0771017185322806, + "grad_norm": 0.17978922180228651, + "learning_rate": 1.707401032702238e-05, + "loss": 0.2566, + "step": 2237 + }, + { + "epoch": 2.0780306549001395, + "grad_norm": 0.19619208069729835, + "learning_rate": 1.7056798623063684e-05, + "loss": 0.2847, + "step": 2238 + }, + { + "epoch": 2.0789595912679983, + "grad_norm": 0.2028901050066484, + "learning_rate": 1.7039586919104992e-05, + "loss": 0.2839, + "step": 2239 + }, + { + "epoch": 2.0798885276358567, + "grad_norm": 0.20553977912065274, + "learning_rate": 1.70223752151463e-05, + "loss": 0.27, + "step": 2240 + }, + { + "epoch": 2.0808174640037156, + "grad_norm": 0.2175363885394602, + "learning_rate": 1.700516351118761e-05, + "loss": 0.2827, + "step": 2241 + }, + { + "epoch": 2.0817464003715744, + "grad_norm": 0.17374801708666573, + "learning_rate": 1.6987951807228917e-05, + "loss": 0.2514, + "step": 2242 + }, + { + "epoch": 2.0826753367394333, + "grad_norm": 0.2009093330228981, + "learning_rate": 1.6970740103270226e-05, + "loss": 0.2695, + "step": 2243 + }, + { + "epoch": 2.083604273107292, + "grad_norm": 0.18927752447294688, + "learning_rate": 1.6953528399311534e-05, + "loss": 0.2576, + "step": 2244 + }, + { + "epoch": 2.084533209475151, + "grad_norm": 0.23624065141040287, + "learning_rate": 1.6936316695352843e-05, + "loss": 0.2998, + "step": 2245 + }, + { + "epoch": 2.08546214584301, + "grad_norm": 0.18922983252395634, + "learning_rate": 1.691910499139415e-05, + "loss": 0.2805, + "step": 2246 + }, + { + "epoch": 2.0863910822108687, + "grad_norm": 0.19589618442669648, + "learning_rate": 1.690189328743546e-05, + "loss": 0.2683, + "step": 2247 + }, + { + "epoch": 2.0873200185787275, + "grad_norm": 0.19640479008012807, + "learning_rate": 1.6884681583476768e-05, + "loss": 0.2744, + "step": 2248 + }, + { + "epoch": 2.0882489549465864, + "grad_norm": 0.19901030382264118, + "learning_rate": 1.6867469879518073e-05, + "loss": 0.264, + "step": 2249 + }, + { + "epoch": 2.0891778913144448, + "grad_norm": 0.1952682814125198, + "learning_rate": 1.685025817555938e-05, + "loss": 0.2958, + "step": 2250 + }, + { + "epoch": 2.0901068276823036, + "grad_norm": 0.19766258576899515, + "learning_rate": 1.683304647160069e-05, + "loss": 0.2786, + "step": 2251 + }, + { + "epoch": 2.0910357640501624, + "grad_norm": 0.19455969769783835, + "learning_rate": 1.6815834767641998e-05, + "loss": 0.2798, + "step": 2252 + }, + { + "epoch": 2.0919647004180213, + "grad_norm": 0.18818819171389964, + "learning_rate": 1.6798623063683306e-05, + "loss": 0.2781, + "step": 2253 + }, + { + "epoch": 2.09289363678588, + "grad_norm": 0.1915501131624007, + "learning_rate": 1.6781411359724615e-05, + "loss": 0.2744, + "step": 2254 + }, + { + "epoch": 2.093822573153739, + "grad_norm": 0.18831604411563369, + "learning_rate": 1.6764199655765923e-05, + "loss": 0.2619, + "step": 2255 + }, + { + "epoch": 2.094751509521598, + "grad_norm": 0.19348282994201058, + "learning_rate": 1.674698795180723e-05, + "loss": 0.2751, + "step": 2256 + }, + { + "epoch": 2.0956804458894567, + "grad_norm": 0.19615082676185192, + "learning_rate": 1.672977624784854e-05, + "loss": 0.2888, + "step": 2257 + }, + { + "epoch": 2.0966093822573155, + "grad_norm": 0.1847380252200025, + "learning_rate": 1.671256454388985e-05, + "loss": 0.2854, + "step": 2258 + }, + { + "epoch": 2.0975383186251744, + "grad_norm": 0.20348389567918942, + "learning_rate": 1.6695352839931153e-05, + "loss": 0.2873, + "step": 2259 + }, + { + "epoch": 2.0984672549930328, + "grad_norm": 0.19400279600413042, + "learning_rate": 1.6678141135972462e-05, + "loss": 0.2813, + "step": 2260 + }, + { + "epoch": 2.0993961913608916, + "grad_norm": 0.18111282515696545, + "learning_rate": 1.666092943201377e-05, + "loss": 0.2983, + "step": 2261 + }, + { + "epoch": 2.1003251277287505, + "grad_norm": 0.18813347341361206, + "learning_rate": 1.664371772805508e-05, + "loss": 0.269, + "step": 2262 + }, + { + "epoch": 2.1012540640966093, + "grad_norm": 0.17424998589403623, + "learning_rate": 1.6626506024096387e-05, + "loss": 0.2607, + "step": 2263 + }, + { + "epoch": 2.102183000464468, + "grad_norm": 0.17051368168715583, + "learning_rate": 1.6609294320137695e-05, + "loss": 0.2632, + "step": 2264 + }, + { + "epoch": 2.103111936832327, + "grad_norm": 0.19333768098843307, + "learning_rate": 1.6592082616179004e-05, + "loss": 0.2739, + "step": 2265 + }, + { + "epoch": 2.104040873200186, + "grad_norm": 0.17969196315731728, + "learning_rate": 1.6574870912220312e-05, + "loss": 0.2642, + "step": 2266 + }, + { + "epoch": 2.1049698095680447, + "grad_norm": 0.17290109863089814, + "learning_rate": 1.655765920826162e-05, + "loss": 0.274, + "step": 2267 + }, + { + "epoch": 2.1058987459359035, + "grad_norm": 0.17856097137064214, + "learning_rate": 1.654044750430293e-05, + "loss": 0.2735, + "step": 2268 + }, + { + "epoch": 2.1068276823037624, + "grad_norm": 0.18795624301701083, + "learning_rate": 1.6523235800344237e-05, + "loss": 0.2811, + "step": 2269 + }, + { + "epoch": 2.107756618671621, + "grad_norm": 0.17878323498135645, + "learning_rate": 1.6506024096385542e-05, + "loss": 0.2757, + "step": 2270 + }, + { + "epoch": 2.1086855550394796, + "grad_norm": 0.1754157334553124, + "learning_rate": 1.648881239242685e-05, + "loss": 0.2719, + "step": 2271 + }, + { + "epoch": 2.1096144914073385, + "grad_norm": 0.18969010978993706, + "learning_rate": 1.647160068846816e-05, + "loss": 0.2805, + "step": 2272 + }, + { + "epoch": 2.1105434277751973, + "grad_norm": 0.2042293911687927, + "learning_rate": 1.6454388984509468e-05, + "loss": 0.2869, + "step": 2273 + }, + { + "epoch": 2.111472364143056, + "grad_norm": 0.1733527665317661, + "learning_rate": 1.6437177280550776e-05, + "loss": 0.2684, + "step": 2274 + }, + { + "epoch": 2.112401300510915, + "grad_norm": 0.18133085027013407, + "learning_rate": 1.6419965576592084e-05, + "loss": 0.2783, + "step": 2275 + }, + { + "epoch": 2.113330236878774, + "grad_norm": 0.19680333317378604, + "learning_rate": 1.6402753872633393e-05, + "loss": 0.2804, + "step": 2276 + }, + { + "epoch": 2.1142591732466327, + "grad_norm": 0.18081135463697318, + "learning_rate": 1.63855421686747e-05, + "loss": 0.2605, + "step": 2277 + }, + { + "epoch": 2.1151881096144916, + "grad_norm": 0.19740057774259165, + "learning_rate": 1.636833046471601e-05, + "loss": 0.2789, + "step": 2278 + }, + { + "epoch": 2.1161170459823504, + "grad_norm": 0.18568459190301742, + "learning_rate": 1.6351118760757318e-05, + "loss": 0.2828, + "step": 2279 + }, + { + "epoch": 2.117045982350209, + "grad_norm": 0.19365683468329514, + "learning_rate": 1.6333907056798626e-05, + "loss": 0.2688, + "step": 2280 + }, + { + "epoch": 2.1179749187180676, + "grad_norm": 0.20252419703650162, + "learning_rate": 1.631669535283993e-05, + "loss": 0.2713, + "step": 2281 + }, + { + "epoch": 2.1189038550859265, + "grad_norm": 0.1792890569752184, + "learning_rate": 1.629948364888124e-05, + "loss": 0.262, + "step": 2282 + }, + { + "epoch": 2.1198327914537853, + "grad_norm": 0.20524389028390555, + "learning_rate": 1.6282271944922548e-05, + "loss": 0.2843, + "step": 2283 + }, + { + "epoch": 2.120761727821644, + "grad_norm": 0.28341598447437294, + "learning_rate": 1.6265060240963857e-05, + "loss": 0.3053, + "step": 2284 + }, + { + "epoch": 2.121690664189503, + "grad_norm": 0.20203034410437481, + "learning_rate": 1.6247848537005165e-05, + "loss": 0.2854, + "step": 2285 + }, + { + "epoch": 2.122619600557362, + "grad_norm": 0.1953939465453228, + "learning_rate": 1.6230636833046473e-05, + "loss": 0.2899, + "step": 2286 + }, + { + "epoch": 2.1235485369252207, + "grad_norm": 0.1793459966677846, + "learning_rate": 1.6213425129087782e-05, + "loss": 0.2796, + "step": 2287 + }, + { + "epoch": 2.1244774732930796, + "grad_norm": 0.17783901924314915, + "learning_rate": 1.619621342512909e-05, + "loss": 0.2626, + "step": 2288 + }, + { + "epoch": 2.1254064096609384, + "grad_norm": 0.22343298968404166, + "learning_rate": 1.61790017211704e-05, + "loss": 0.2811, + "step": 2289 + }, + { + "epoch": 2.126335346028797, + "grad_norm": 0.1814389246560736, + "learning_rate": 1.6161790017211707e-05, + "loss": 0.2598, + "step": 2290 + }, + { + "epoch": 2.1272642823966557, + "grad_norm": 0.18055709417523266, + "learning_rate": 1.6144578313253012e-05, + "loss": 0.2798, + "step": 2291 + }, + { + "epoch": 2.1281932187645145, + "grad_norm": 0.17888962889126264, + "learning_rate": 1.612736660929432e-05, + "loss": 0.2682, + "step": 2292 + }, + { + "epoch": 2.1291221551323734, + "grad_norm": 0.19721301430366406, + "learning_rate": 1.611015490533563e-05, + "loss": 0.2719, + "step": 2293 + }, + { + "epoch": 2.130051091500232, + "grad_norm": 0.19444402714079692, + "learning_rate": 1.6092943201376937e-05, + "loss": 0.2783, + "step": 2294 + }, + { + "epoch": 2.130980027868091, + "grad_norm": 0.1838632683011167, + "learning_rate": 1.6075731497418246e-05, + "loss": 0.2768, + "step": 2295 + }, + { + "epoch": 2.13190896423595, + "grad_norm": 0.1884248262044888, + "learning_rate": 1.6058519793459554e-05, + "loss": 0.2707, + "step": 2296 + }, + { + "epoch": 2.1328379006038087, + "grad_norm": 0.19204342794555168, + "learning_rate": 1.6041308089500863e-05, + "loss": 0.2775, + "step": 2297 + }, + { + "epoch": 2.1337668369716676, + "grad_norm": 0.18608945592808687, + "learning_rate": 1.602409638554217e-05, + "loss": 0.2685, + "step": 2298 + }, + { + "epoch": 2.1346957733395264, + "grad_norm": 0.18587485562812903, + "learning_rate": 1.600688468158348e-05, + "loss": 0.2786, + "step": 2299 + }, + { + "epoch": 2.1356247097073853, + "grad_norm": 0.19587986603415075, + "learning_rate": 1.5989672977624788e-05, + "loss": 0.2856, + "step": 2300 + }, + { + "epoch": 2.1365536460752437, + "grad_norm": 0.1942871240310058, + "learning_rate": 1.5972461273666096e-05, + "loss": 0.2803, + "step": 2301 + }, + { + "epoch": 2.1374825824431025, + "grad_norm": 0.18749941680014653, + "learning_rate": 1.59552495697074e-05, + "loss": 0.2917, + "step": 2302 + }, + { + "epoch": 2.1384115188109614, + "grad_norm": 0.1887106230362805, + "learning_rate": 1.593803786574871e-05, + "loss": 0.2872, + "step": 2303 + }, + { + "epoch": 2.13934045517882, + "grad_norm": 0.18424120413217845, + "learning_rate": 1.5920826161790018e-05, + "loss": 0.2744, + "step": 2304 + }, + { + "epoch": 2.140269391546679, + "grad_norm": 0.2091266962240203, + "learning_rate": 1.5903614457831326e-05, + "loss": 0.2872, + "step": 2305 + }, + { + "epoch": 2.141198327914538, + "grad_norm": 0.167622736533787, + "learning_rate": 1.5886402753872635e-05, + "loss": 0.2579, + "step": 2306 + }, + { + "epoch": 2.1421272642823967, + "grad_norm": 0.18300097585692288, + "learning_rate": 1.5869191049913943e-05, + "loss": 0.2979, + "step": 2307 + }, + { + "epoch": 2.1430562006502556, + "grad_norm": 0.18348416995061526, + "learning_rate": 1.585197934595525e-05, + "loss": 0.261, + "step": 2308 + }, + { + "epoch": 2.1439851370181144, + "grad_norm": 0.1890149322297578, + "learning_rate": 1.583476764199656e-05, + "loss": 0.2918, + "step": 2309 + }, + { + "epoch": 2.144914073385973, + "grad_norm": 0.1925329558848419, + "learning_rate": 1.581755593803787e-05, + "loss": 0.2823, + "step": 2310 + }, + { + "epoch": 2.1458430097538317, + "grad_norm": 0.1805093088723906, + "learning_rate": 1.5800344234079177e-05, + "loss": 0.279, + "step": 2311 + }, + { + "epoch": 2.1467719461216905, + "grad_norm": 0.1896794503755551, + "learning_rate": 1.5783132530120482e-05, + "loss": 0.2791, + "step": 2312 + }, + { + "epoch": 2.1477008824895494, + "grad_norm": 0.19321251388893304, + "learning_rate": 1.576592082616179e-05, + "loss": 0.2947, + "step": 2313 + }, + { + "epoch": 2.1486298188574082, + "grad_norm": 0.17234766420937184, + "learning_rate": 1.57487091222031e-05, + "loss": 0.2516, + "step": 2314 + }, + { + "epoch": 2.149558755225267, + "grad_norm": 0.1822533993953394, + "learning_rate": 1.5731497418244407e-05, + "loss": 0.2823, + "step": 2315 + }, + { + "epoch": 2.150487691593126, + "grad_norm": 0.17706007655705194, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.2874, + "step": 2316 + }, + { + "epoch": 2.1514166279609848, + "grad_norm": 0.19007988270275075, + "learning_rate": 1.5697074010327024e-05, + "loss": 0.2964, + "step": 2317 + }, + { + "epoch": 2.1523455643288436, + "grad_norm": 0.18529312530513586, + "learning_rate": 1.5679862306368332e-05, + "loss": 0.2622, + "step": 2318 + }, + { + "epoch": 2.1532745006967025, + "grad_norm": 0.17979646607166183, + "learning_rate": 1.566265060240964e-05, + "loss": 0.2873, + "step": 2319 + }, + { + "epoch": 2.1542034370645613, + "grad_norm": 0.17790060931246865, + "learning_rate": 1.564543889845095e-05, + "loss": 0.2684, + "step": 2320 + }, + { + "epoch": 2.1551323734324197, + "grad_norm": 0.18215704599038443, + "learning_rate": 1.5628227194492257e-05, + "loss": 0.2541, + "step": 2321 + }, + { + "epoch": 2.1560613098002785, + "grad_norm": 0.1892490048553774, + "learning_rate": 1.5611015490533566e-05, + "loss": 0.2669, + "step": 2322 + }, + { + "epoch": 2.1569902461681374, + "grad_norm": 0.19249753393355534, + "learning_rate": 1.559380378657487e-05, + "loss": 0.2761, + "step": 2323 + }, + { + "epoch": 2.1579191825359962, + "grad_norm": 0.1839816309966501, + "learning_rate": 1.557659208261618e-05, + "loss": 0.2677, + "step": 2324 + }, + { + "epoch": 2.158848118903855, + "grad_norm": 0.1810375798407296, + "learning_rate": 1.5559380378657488e-05, + "loss": 0.2773, + "step": 2325 + }, + { + "epoch": 2.159777055271714, + "grad_norm": 0.18831165733890587, + "learning_rate": 1.5542168674698796e-05, + "loss": 0.2848, + "step": 2326 + }, + { + "epoch": 2.1607059916395728, + "grad_norm": 0.19377240510871852, + "learning_rate": 1.5524956970740104e-05, + "loss": 0.2941, + "step": 2327 + }, + { + "epoch": 2.1616349280074316, + "grad_norm": 0.1761716521658807, + "learning_rate": 1.5507745266781413e-05, + "loss": 0.2829, + "step": 2328 + }, + { + "epoch": 2.1625638643752905, + "grad_norm": 0.19274957074290722, + "learning_rate": 1.549053356282272e-05, + "loss": 0.2732, + "step": 2329 + }, + { + "epoch": 2.163492800743149, + "grad_norm": 0.18470851789842668, + "learning_rate": 1.547332185886403e-05, + "loss": 0.2829, + "step": 2330 + }, + { + "epoch": 2.1644217371110077, + "grad_norm": 0.17644911709437527, + "learning_rate": 1.5456110154905338e-05, + "loss": 0.2639, + "step": 2331 + }, + { + "epoch": 2.1653506734788666, + "grad_norm": 0.1860663550583413, + "learning_rate": 1.5438898450946646e-05, + "loss": 0.265, + "step": 2332 + }, + { + "epoch": 2.1662796098467254, + "grad_norm": 0.1732712738965505, + "learning_rate": 1.5421686746987955e-05, + "loss": 0.275, + "step": 2333 + }, + { + "epoch": 2.1672085462145843, + "grad_norm": 0.20738176414089443, + "learning_rate": 1.540447504302926e-05, + "loss": 0.2922, + "step": 2334 + }, + { + "epoch": 2.168137482582443, + "grad_norm": 0.18615130026642834, + "learning_rate": 1.5387263339070568e-05, + "loss": 0.2749, + "step": 2335 + }, + { + "epoch": 2.169066418950302, + "grad_norm": 0.18330688472974185, + "learning_rate": 1.5370051635111877e-05, + "loss": 0.2829, + "step": 2336 + }, + { + "epoch": 2.169995355318161, + "grad_norm": 0.19173214451709955, + "learning_rate": 1.5352839931153185e-05, + "loss": 0.2915, + "step": 2337 + }, + { + "epoch": 2.1709242916860196, + "grad_norm": 0.186766371199974, + "learning_rate": 1.5335628227194493e-05, + "loss": 0.2658, + "step": 2338 + }, + { + "epoch": 2.1718532280538785, + "grad_norm": 0.18396142682701574, + "learning_rate": 1.5318416523235802e-05, + "loss": 0.2899, + "step": 2339 + }, + { + "epoch": 2.1727821644217373, + "grad_norm": 0.1780638489393759, + "learning_rate": 1.530120481927711e-05, + "loss": 0.2787, + "step": 2340 + }, + { + "epoch": 2.1737111007895957, + "grad_norm": 0.1752883172604511, + "learning_rate": 1.528399311531842e-05, + "loss": 0.2748, + "step": 2341 + }, + { + "epoch": 2.1746400371574546, + "grad_norm": 0.177303228639095, + "learning_rate": 1.5266781411359727e-05, + "loss": 0.2783, + "step": 2342 + }, + { + "epoch": 2.1755689735253134, + "grad_norm": 0.18826552255300416, + "learning_rate": 1.5249569707401035e-05, + "loss": 0.2684, + "step": 2343 + }, + { + "epoch": 2.1764979098931723, + "grad_norm": 0.17442496731786208, + "learning_rate": 1.523235800344234e-05, + "loss": 0.2749, + "step": 2344 + }, + { + "epoch": 2.177426846261031, + "grad_norm": 0.1737699610108496, + "learning_rate": 1.5215146299483649e-05, + "loss": 0.2703, + "step": 2345 + }, + { + "epoch": 2.17835578262889, + "grad_norm": 0.17176047456146679, + "learning_rate": 1.5197934595524957e-05, + "loss": 0.2683, + "step": 2346 + }, + { + "epoch": 2.179284718996749, + "grad_norm": 0.17869107629479228, + "learning_rate": 1.5180722891566266e-05, + "loss": 0.2689, + "step": 2347 + }, + { + "epoch": 2.1802136553646077, + "grad_norm": 0.188008817213917, + "learning_rate": 1.5163511187607574e-05, + "loss": 0.2735, + "step": 2348 + }, + { + "epoch": 2.1811425917324665, + "grad_norm": 0.17523593662500742, + "learning_rate": 1.5146299483648882e-05, + "loss": 0.2763, + "step": 2349 + }, + { + "epoch": 2.182071528100325, + "grad_norm": 0.18284465023530216, + "learning_rate": 1.512908777969019e-05, + "loss": 0.2792, + "step": 2350 + }, + { + "epoch": 2.1830004644681837, + "grad_norm": 0.1868107881910502, + "learning_rate": 1.51118760757315e-05, + "loss": 0.2841, + "step": 2351 + }, + { + "epoch": 2.1839294008360426, + "grad_norm": 0.1814772554278054, + "learning_rate": 1.5094664371772808e-05, + "loss": 0.2818, + "step": 2352 + }, + { + "epoch": 2.1848583372039014, + "grad_norm": 0.1982916282455446, + "learning_rate": 1.5077452667814116e-05, + "loss": 0.2909, + "step": 2353 + }, + { + "epoch": 2.1857872735717603, + "grad_norm": 0.18547018214356015, + "learning_rate": 1.5060240963855424e-05, + "loss": 0.2633, + "step": 2354 + }, + { + "epoch": 2.186716209939619, + "grad_norm": 0.18629229106963163, + "learning_rate": 1.504302925989673e-05, + "loss": 0.2714, + "step": 2355 + }, + { + "epoch": 2.187645146307478, + "grad_norm": 0.17792953090605365, + "learning_rate": 1.5025817555938038e-05, + "loss": 0.2804, + "step": 2356 + }, + { + "epoch": 2.188574082675337, + "grad_norm": 0.19411643483321742, + "learning_rate": 1.5008605851979346e-05, + "loss": 0.2834, + "step": 2357 + }, + { + "epoch": 2.1895030190431957, + "grad_norm": 0.17838088091084456, + "learning_rate": 1.4991394148020655e-05, + "loss": 0.2894, + "step": 2358 + }, + { + "epoch": 2.1904319554110545, + "grad_norm": 0.18798746896736152, + "learning_rate": 1.4974182444061963e-05, + "loss": 0.2825, + "step": 2359 + }, + { + "epoch": 2.1913608917789134, + "grad_norm": 0.18713834475410582, + "learning_rate": 1.4956970740103271e-05, + "loss": 0.2753, + "step": 2360 + }, + { + "epoch": 2.1922898281467718, + "grad_norm": 0.21091756592860836, + "learning_rate": 1.493975903614458e-05, + "loss": 0.2841, + "step": 2361 + }, + { + "epoch": 2.1932187645146306, + "grad_norm": 0.18950367949638064, + "learning_rate": 1.4922547332185888e-05, + "loss": 0.2786, + "step": 2362 + }, + { + "epoch": 2.1941477008824894, + "grad_norm": 0.18906056192602488, + "learning_rate": 1.4905335628227197e-05, + "loss": 0.2792, + "step": 2363 + }, + { + "epoch": 2.1950766372503483, + "grad_norm": 0.18613054799021314, + "learning_rate": 1.4888123924268505e-05, + "loss": 0.2681, + "step": 2364 + }, + { + "epoch": 2.196005573618207, + "grad_norm": 0.18070972999393983, + "learning_rate": 1.487091222030981e-05, + "loss": 0.2689, + "step": 2365 + }, + { + "epoch": 2.196934509986066, + "grad_norm": 0.17512758804049097, + "learning_rate": 1.4853700516351118e-05, + "loss": 0.2702, + "step": 2366 + }, + { + "epoch": 2.197863446353925, + "grad_norm": 0.1977826508419132, + "learning_rate": 1.4836488812392427e-05, + "loss": 0.2714, + "step": 2367 + }, + { + "epoch": 2.1987923827217837, + "grad_norm": 0.19045916194894494, + "learning_rate": 1.4819277108433735e-05, + "loss": 0.2898, + "step": 2368 + }, + { + "epoch": 2.1997213190896425, + "grad_norm": 0.1894952240845188, + "learning_rate": 1.4802065404475044e-05, + "loss": 0.2793, + "step": 2369 + }, + { + "epoch": 2.200650255457501, + "grad_norm": 0.18712603681731005, + "learning_rate": 1.4784853700516352e-05, + "loss": 0.2875, + "step": 2370 + }, + { + "epoch": 2.2015791918253598, + "grad_norm": 0.18017707880699574, + "learning_rate": 1.476764199655766e-05, + "loss": 0.2676, + "step": 2371 + }, + { + "epoch": 2.2025081281932186, + "grad_norm": 0.1872086868065948, + "learning_rate": 1.4750430292598969e-05, + "loss": 0.2812, + "step": 2372 + }, + { + "epoch": 2.2034370645610775, + "grad_norm": 0.18279582654005014, + "learning_rate": 1.4733218588640277e-05, + "loss": 0.2775, + "step": 2373 + }, + { + "epoch": 2.2043660009289363, + "grad_norm": 0.1742546303606481, + "learning_rate": 1.4716006884681586e-05, + "loss": 0.2672, + "step": 2374 + }, + { + "epoch": 2.205294937296795, + "grad_norm": 0.19829823980108882, + "learning_rate": 1.4698795180722894e-05, + "loss": 0.2932, + "step": 2375 + }, + { + "epoch": 2.206223873664654, + "grad_norm": 0.1907823494693345, + "learning_rate": 1.4681583476764199e-05, + "loss": 0.2759, + "step": 2376 + }, + { + "epoch": 2.207152810032513, + "grad_norm": 0.1760244498190223, + "learning_rate": 1.4664371772805507e-05, + "loss": 0.2675, + "step": 2377 + }, + { + "epoch": 2.2080817464003717, + "grad_norm": 0.17827743105039992, + "learning_rate": 1.4647160068846816e-05, + "loss": 0.2737, + "step": 2378 + }, + { + "epoch": 2.2090106827682305, + "grad_norm": 0.19304042505289626, + "learning_rate": 1.4629948364888124e-05, + "loss": 0.2798, + "step": 2379 + }, + { + "epoch": 2.2099396191360894, + "grad_norm": 0.16940952191421077, + "learning_rate": 1.4612736660929433e-05, + "loss": 0.2736, + "step": 2380 + }, + { + "epoch": 2.210868555503948, + "grad_norm": 2.077281485156273, + "learning_rate": 1.4595524956970741e-05, + "loss": 0.3002, + "step": 2381 + }, + { + "epoch": 2.2117974918718066, + "grad_norm": 0.19341272255759445, + "learning_rate": 1.457831325301205e-05, + "loss": 0.2777, + "step": 2382 + }, + { + "epoch": 2.2127264282396655, + "grad_norm": 0.20112600841398107, + "learning_rate": 1.4561101549053358e-05, + "loss": 0.2887, + "step": 2383 + }, + { + "epoch": 2.2136553646075243, + "grad_norm": 0.19413557575902854, + "learning_rate": 1.4543889845094666e-05, + "loss": 0.2947, + "step": 2384 + }, + { + "epoch": 2.214584300975383, + "grad_norm": 0.19780119460533946, + "learning_rate": 1.4526678141135975e-05, + "loss": 0.2839, + "step": 2385 + }, + { + "epoch": 2.215513237343242, + "grad_norm": 0.17913396155866435, + "learning_rate": 1.4509466437177283e-05, + "loss": 0.2733, + "step": 2386 + }, + { + "epoch": 2.216442173711101, + "grad_norm": 0.18886643483257676, + "learning_rate": 1.4492254733218588e-05, + "loss": 0.2864, + "step": 2387 + }, + { + "epoch": 2.2173711100789597, + "grad_norm": 0.1995227456484706, + "learning_rate": 1.4475043029259896e-05, + "loss": 0.2838, + "step": 2388 + }, + { + "epoch": 2.2183000464468186, + "grad_norm": 0.18895265517020549, + "learning_rate": 1.4457831325301205e-05, + "loss": 0.2706, + "step": 2389 + }, + { + "epoch": 2.2192289828146774, + "grad_norm": 0.19334993843744272, + "learning_rate": 1.4440619621342513e-05, + "loss": 0.2787, + "step": 2390 + }, + { + "epoch": 2.220157919182536, + "grad_norm": 0.20236064176192772, + "learning_rate": 1.4423407917383822e-05, + "loss": 0.2788, + "step": 2391 + }, + { + "epoch": 2.2210868555503946, + "grad_norm": 0.3655805333065711, + "learning_rate": 1.440619621342513e-05, + "loss": 0.2708, + "step": 2392 + }, + { + "epoch": 2.2220157919182535, + "grad_norm": 0.18758244632384213, + "learning_rate": 1.4388984509466438e-05, + "loss": 0.2825, + "step": 2393 + }, + { + "epoch": 2.2229447282861123, + "grad_norm": 0.17757060628418905, + "learning_rate": 1.4371772805507747e-05, + "loss": 0.2779, + "step": 2394 + }, + { + "epoch": 2.223873664653971, + "grad_norm": 0.19104858972751676, + "learning_rate": 1.4354561101549055e-05, + "loss": 0.272, + "step": 2395 + }, + { + "epoch": 2.22480260102183, + "grad_norm": 0.19793684191220914, + "learning_rate": 1.4337349397590364e-05, + "loss": 0.2728, + "step": 2396 + }, + { + "epoch": 2.225731537389689, + "grad_norm": 0.18934791385839767, + "learning_rate": 1.4320137693631669e-05, + "loss": 0.2796, + "step": 2397 + }, + { + "epoch": 2.2266604737575477, + "grad_norm": 0.17614906901226796, + "learning_rate": 1.4302925989672977e-05, + "loss": 0.2812, + "step": 2398 + }, + { + "epoch": 2.2275894101254066, + "grad_norm": 0.17586759106445218, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.2677, + "step": 2399 + }, + { + "epoch": 2.2285183464932654, + "grad_norm": 0.1872402948258275, + "learning_rate": 1.4268502581755594e-05, + "loss": 0.2555, + "step": 2400 + }, + { + "epoch": 2.229447282861124, + "grad_norm": 0.19771821410158671, + "learning_rate": 1.4251290877796902e-05, + "loss": 0.2677, + "step": 2401 + }, + { + "epoch": 2.2303762192289827, + "grad_norm": 0.17818595832024386, + "learning_rate": 1.423407917383821e-05, + "loss": 0.2607, + "step": 2402 + }, + { + "epoch": 2.2313051555968415, + "grad_norm": 0.1776547560819404, + "learning_rate": 1.4216867469879519e-05, + "loss": 0.2749, + "step": 2403 + }, + { + "epoch": 2.2322340919647004, + "grad_norm": 0.170957875139368, + "learning_rate": 1.4199655765920827e-05, + "loss": 0.2591, + "step": 2404 + }, + { + "epoch": 2.233163028332559, + "grad_norm": 0.1742647844913809, + "learning_rate": 1.4182444061962136e-05, + "loss": 0.2738, + "step": 2405 + }, + { + "epoch": 2.234091964700418, + "grad_norm": 0.1925283348131071, + "learning_rate": 1.4165232358003444e-05, + "loss": 0.2812, + "step": 2406 + }, + { + "epoch": 2.235020901068277, + "grad_norm": 0.19555384576301638, + "learning_rate": 1.4148020654044753e-05, + "loss": 0.2945, + "step": 2407 + }, + { + "epoch": 2.2359498374361357, + "grad_norm": 0.17682561155125856, + "learning_rate": 1.4130808950086058e-05, + "loss": 0.2614, + "step": 2408 + }, + { + "epoch": 2.2368787738039946, + "grad_norm": 0.21751007119184168, + "learning_rate": 1.4113597246127366e-05, + "loss": 0.2985, + "step": 2409 + }, + { + "epoch": 2.2378077101718534, + "grad_norm": 0.18932523578727964, + "learning_rate": 1.4096385542168674e-05, + "loss": 0.2706, + "step": 2410 + }, + { + "epoch": 2.238736646539712, + "grad_norm": 0.1741531414035046, + "learning_rate": 1.4079173838209983e-05, + "loss": 0.2775, + "step": 2411 + }, + { + "epoch": 2.2396655829075707, + "grad_norm": 0.1790079893951182, + "learning_rate": 1.4061962134251291e-05, + "loss": 0.2716, + "step": 2412 + }, + { + "epoch": 2.2405945192754295, + "grad_norm": 0.1897657068346121, + "learning_rate": 1.40447504302926e-05, + "loss": 0.2887, + "step": 2413 + }, + { + "epoch": 2.2415234556432884, + "grad_norm": 0.17563878014722972, + "learning_rate": 1.4027538726333908e-05, + "loss": 0.2678, + "step": 2414 + }, + { + "epoch": 2.242452392011147, + "grad_norm": 0.1734405490085664, + "learning_rate": 1.4010327022375216e-05, + "loss": 0.2863, + "step": 2415 + }, + { + "epoch": 2.243381328379006, + "grad_norm": 0.18332196872544093, + "learning_rate": 1.3993115318416525e-05, + "loss": 0.2734, + "step": 2416 + }, + { + "epoch": 2.244310264746865, + "grad_norm": 0.17947567690760682, + "learning_rate": 1.3975903614457833e-05, + "loss": 0.27, + "step": 2417 + }, + { + "epoch": 2.2452392011147237, + "grad_norm": 0.18735527285985934, + "learning_rate": 1.3958691910499138e-05, + "loss": 0.2851, + "step": 2418 + }, + { + "epoch": 2.2461681374825826, + "grad_norm": 0.19049022365781365, + "learning_rate": 1.3941480206540447e-05, + "loss": 0.2795, + "step": 2419 + }, + { + "epoch": 2.2470970738504414, + "grad_norm": 0.17554046191220224, + "learning_rate": 1.3924268502581755e-05, + "loss": 0.2703, + "step": 2420 + }, + { + "epoch": 2.2480260102183, + "grad_norm": 0.18762826003233593, + "learning_rate": 1.3907056798623063e-05, + "loss": 0.2928, + "step": 2421 + }, + { + "epoch": 2.2489549465861587, + "grad_norm": 0.18234373655440445, + "learning_rate": 1.3889845094664372e-05, + "loss": 0.2817, + "step": 2422 + }, + { + "epoch": 2.2498838829540175, + "grad_norm": 0.18049105683824865, + "learning_rate": 1.387263339070568e-05, + "loss": 0.2744, + "step": 2423 + }, + { + "epoch": 2.2508128193218764, + "grad_norm": 0.1770550838421612, + "learning_rate": 1.3855421686746989e-05, + "loss": 0.2694, + "step": 2424 + }, + { + "epoch": 2.2517417556897352, + "grad_norm": 0.1756612487036032, + "learning_rate": 1.3838209982788297e-05, + "loss": 0.2647, + "step": 2425 + }, + { + "epoch": 2.252670692057594, + "grad_norm": 0.1811227177376827, + "learning_rate": 1.3820998278829605e-05, + "loss": 0.2732, + "step": 2426 + }, + { + "epoch": 2.253599628425453, + "grad_norm": 0.1895660967425297, + "learning_rate": 1.3803786574870914e-05, + "loss": 0.2883, + "step": 2427 + }, + { + "epoch": 2.2545285647933118, + "grad_norm": 0.1759719292934546, + "learning_rate": 1.3786574870912222e-05, + "loss": 0.2704, + "step": 2428 + }, + { + "epoch": 2.2554575011611706, + "grad_norm": 0.1852248917014425, + "learning_rate": 1.3769363166953527e-05, + "loss": 0.2714, + "step": 2429 + }, + { + "epoch": 2.2563864375290295, + "grad_norm": 0.1872463960739118, + "learning_rate": 1.3752151462994836e-05, + "loss": 0.2884, + "step": 2430 + }, + { + "epoch": 2.257315373896888, + "grad_norm": 0.19160635539373605, + "learning_rate": 1.3734939759036144e-05, + "loss": 0.2653, + "step": 2431 + }, + { + "epoch": 2.2582443102647467, + "grad_norm": 0.17691605824841325, + "learning_rate": 1.3717728055077452e-05, + "loss": 0.2587, + "step": 2432 + }, + { + "epoch": 2.2591732466326055, + "grad_norm": 0.1840601443984862, + "learning_rate": 1.3700516351118761e-05, + "loss": 0.2863, + "step": 2433 + }, + { + "epoch": 2.2601021830004644, + "grad_norm": 0.17288469873242204, + "learning_rate": 1.368330464716007e-05, + "loss": 0.2547, + "step": 2434 + }, + { + "epoch": 2.2610311193683232, + "grad_norm": 0.20393520871661627, + "learning_rate": 1.3666092943201378e-05, + "loss": 0.2918, + "step": 2435 + }, + { + "epoch": 2.261960055736182, + "grad_norm": 0.185024404089295, + "learning_rate": 1.3648881239242686e-05, + "loss": 0.2779, + "step": 2436 + }, + { + "epoch": 2.262888992104041, + "grad_norm": 0.1901769547372566, + "learning_rate": 1.3631669535283994e-05, + "loss": 0.2931, + "step": 2437 + }, + { + "epoch": 2.2638179284718998, + "grad_norm": 0.2214837074293375, + "learning_rate": 1.3614457831325303e-05, + "loss": 0.3066, + "step": 2438 + }, + { + "epoch": 2.2647468648397586, + "grad_norm": 0.18890674277222674, + "learning_rate": 1.3597246127366611e-05, + "loss": 0.273, + "step": 2439 + }, + { + "epoch": 2.2656758012076175, + "grad_norm": 0.17542645415927108, + "learning_rate": 1.3580034423407916e-05, + "loss": 0.2746, + "step": 2440 + }, + { + "epoch": 2.2666047375754763, + "grad_norm": 0.19723610546547288, + "learning_rate": 1.3562822719449225e-05, + "loss": 0.2862, + "step": 2441 + }, + { + "epoch": 2.2675336739433347, + "grad_norm": 0.18409020452903432, + "learning_rate": 1.3545611015490533e-05, + "loss": 0.2756, + "step": 2442 + }, + { + "epoch": 2.2684626103111936, + "grad_norm": 0.17662475275167705, + "learning_rate": 1.3528399311531841e-05, + "loss": 0.2678, + "step": 2443 + }, + { + "epoch": 2.2693915466790524, + "grad_norm": 0.1791475766551058, + "learning_rate": 1.351118760757315e-05, + "loss": 0.275, + "step": 2444 + }, + { + "epoch": 2.2703204830469113, + "grad_norm": 0.1854099470575039, + "learning_rate": 1.3493975903614458e-05, + "loss": 0.2728, + "step": 2445 + }, + { + "epoch": 2.27124941941477, + "grad_norm": 0.1852589646237944, + "learning_rate": 1.3476764199655767e-05, + "loss": 0.277, + "step": 2446 + }, + { + "epoch": 2.272178355782629, + "grad_norm": 0.18383715705433082, + "learning_rate": 1.3459552495697075e-05, + "loss": 0.2889, + "step": 2447 + }, + { + "epoch": 2.273107292150488, + "grad_norm": 0.1951503760520118, + "learning_rate": 1.3442340791738383e-05, + "loss": 0.2805, + "step": 2448 + }, + { + "epoch": 2.2740362285183466, + "grad_norm": 0.20073372805491246, + "learning_rate": 1.3425129087779692e-05, + "loss": 0.2731, + "step": 2449 + }, + { + "epoch": 2.2749651648862055, + "grad_norm": 0.16310800213885338, + "learning_rate": 1.3407917383820997e-05, + "loss": 0.2639, + "step": 2450 + }, + { + "epoch": 2.275894101254064, + "grad_norm": 0.17606453823569151, + "learning_rate": 1.3390705679862305e-05, + "loss": 0.2694, + "step": 2451 + }, + { + "epoch": 2.2768230376219227, + "grad_norm": 0.21268495266034193, + "learning_rate": 1.3373493975903614e-05, + "loss": 0.273, + "step": 2452 + }, + { + "epoch": 2.2777519739897816, + "grad_norm": 0.17975510622715113, + "learning_rate": 1.3356282271944922e-05, + "loss": 0.2721, + "step": 2453 + }, + { + "epoch": 2.2786809103576404, + "grad_norm": 0.17495866372601926, + "learning_rate": 1.333907056798623e-05, + "loss": 0.2608, + "step": 2454 + }, + { + "epoch": 2.2796098467254993, + "grad_norm": 0.1799744906204507, + "learning_rate": 1.3321858864027539e-05, + "loss": 0.2773, + "step": 2455 + }, + { + "epoch": 2.280538783093358, + "grad_norm": 0.20386977048097904, + "learning_rate": 1.3304647160068847e-05, + "loss": 0.298, + "step": 2456 + }, + { + "epoch": 2.281467719461217, + "grad_norm": 0.17543050916264705, + "learning_rate": 1.3287435456110156e-05, + "loss": 0.2682, + "step": 2457 + }, + { + "epoch": 2.282396655829076, + "grad_norm": 0.18115557846011818, + "learning_rate": 1.3270223752151464e-05, + "loss": 0.2963, + "step": 2458 + }, + { + "epoch": 2.2833255921969347, + "grad_norm": 0.1825233749262127, + "learning_rate": 1.3253012048192772e-05, + "loss": 0.2759, + "step": 2459 + }, + { + "epoch": 2.2842545285647935, + "grad_norm": 0.18810580131761556, + "learning_rate": 1.3235800344234081e-05, + "loss": 0.2736, + "step": 2460 + }, + { + "epoch": 2.2851834649326523, + "grad_norm": 0.20406387820144806, + "learning_rate": 1.3218588640275386e-05, + "loss": 0.2885, + "step": 2461 + }, + { + "epoch": 2.2861124013005107, + "grad_norm": 0.17866786419081768, + "learning_rate": 1.3201376936316694e-05, + "loss": 0.27, + "step": 2462 + }, + { + "epoch": 2.2870413376683696, + "grad_norm": 0.19558916886417338, + "learning_rate": 1.3184165232358003e-05, + "loss": 0.2745, + "step": 2463 + }, + { + "epoch": 2.2879702740362284, + "grad_norm": 0.17178727146349201, + "learning_rate": 1.3166953528399311e-05, + "loss": 0.2695, + "step": 2464 + }, + { + "epoch": 2.2888992104040873, + "grad_norm": 0.18186780650265671, + "learning_rate": 1.314974182444062e-05, + "loss": 0.2768, + "step": 2465 + }, + { + "epoch": 2.289828146771946, + "grad_norm": 0.1851530367223113, + "learning_rate": 1.3132530120481928e-05, + "loss": 0.2867, + "step": 2466 + }, + { + "epoch": 2.290757083139805, + "grad_norm": 0.16670097051368127, + "learning_rate": 1.3115318416523236e-05, + "loss": 0.2793, + "step": 2467 + }, + { + "epoch": 2.291686019507664, + "grad_norm": 0.1856617082893609, + "learning_rate": 1.3098106712564545e-05, + "loss": 0.2852, + "step": 2468 + }, + { + "epoch": 2.2926149558755227, + "grad_norm": 0.16657158904737376, + "learning_rate": 1.3080895008605853e-05, + "loss": 0.2648, + "step": 2469 + }, + { + "epoch": 2.2935438922433815, + "grad_norm": 0.1817338062264846, + "learning_rate": 1.3063683304647162e-05, + "loss": 0.2888, + "step": 2470 + }, + { + "epoch": 2.29447282861124, + "grad_norm": 0.16841896976761195, + "learning_rate": 1.3046471600688468e-05, + "loss": 0.2765, + "step": 2471 + }, + { + "epoch": 2.2954017649790988, + "grad_norm": 0.16883158219004724, + "learning_rate": 1.3029259896729775e-05, + "loss": 0.2684, + "step": 2472 + }, + { + "epoch": 2.2963307013469576, + "grad_norm": 0.17717975998990523, + "learning_rate": 1.3012048192771083e-05, + "loss": 0.268, + "step": 2473 + }, + { + "epoch": 2.2972596377148164, + "grad_norm": 0.18560225793278562, + "learning_rate": 1.2994836488812392e-05, + "loss": 0.3009, + "step": 2474 + }, + { + "epoch": 2.2981885740826753, + "grad_norm": 0.17167104424094876, + "learning_rate": 1.29776247848537e-05, + "loss": 0.2759, + "step": 2475 + }, + { + "epoch": 2.299117510450534, + "grad_norm": 0.18419703054460113, + "learning_rate": 1.2960413080895009e-05, + "loss": 0.2842, + "step": 2476 + }, + { + "epoch": 2.300046446818393, + "grad_norm": 0.18156266735689566, + "learning_rate": 1.2943201376936317e-05, + "loss": 0.2676, + "step": 2477 + }, + { + "epoch": 2.300975383186252, + "grad_norm": 0.18273226393990982, + "learning_rate": 1.2925989672977625e-05, + "loss": 0.2852, + "step": 2478 + }, + { + "epoch": 2.3019043195541107, + "grad_norm": 0.18012479146698604, + "learning_rate": 1.2908777969018934e-05, + "loss": 0.2817, + "step": 2479 + }, + { + "epoch": 2.3028332559219695, + "grad_norm": 0.18611898476583089, + "learning_rate": 1.2891566265060242e-05, + "loss": 0.2734, + "step": 2480 + }, + { + "epoch": 2.3037621922898284, + "grad_norm": 0.2045432229041082, + "learning_rate": 1.287435456110155e-05, + "loss": 0.2868, + "step": 2481 + }, + { + "epoch": 2.3046911286576868, + "grad_norm": 0.18166021704998359, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.2742, + "step": 2482 + }, + { + "epoch": 2.3056200650255456, + "grad_norm": 0.18583776084377682, + "learning_rate": 1.2839931153184166e-05, + "loss": 0.2878, + "step": 2483 + }, + { + "epoch": 2.3065490013934045, + "grad_norm": 0.18056875244820153, + "learning_rate": 1.2822719449225474e-05, + "loss": 0.2722, + "step": 2484 + }, + { + "epoch": 2.3074779377612633, + "grad_norm": 0.1775475098336737, + "learning_rate": 1.280550774526678e-05, + "loss": 0.2621, + "step": 2485 + }, + { + "epoch": 2.308406874129122, + "grad_norm": 0.19416763735743434, + "learning_rate": 1.2788296041308089e-05, + "loss": 0.2777, + "step": 2486 + }, + { + "epoch": 2.309335810496981, + "grad_norm": 0.17930364275371594, + "learning_rate": 1.2771084337349398e-05, + "loss": 0.2686, + "step": 2487 + }, + { + "epoch": 2.31026474686484, + "grad_norm": 0.19195008866900118, + "learning_rate": 1.2753872633390706e-05, + "loss": 0.2767, + "step": 2488 + }, + { + "epoch": 2.3111936832326987, + "grad_norm": 0.2102981226089247, + "learning_rate": 1.2736660929432014e-05, + "loss": 0.2975, + "step": 2489 + }, + { + "epoch": 2.3121226196005575, + "grad_norm": 0.1838111249569394, + "learning_rate": 1.2719449225473323e-05, + "loss": 0.2701, + "step": 2490 + }, + { + "epoch": 2.313051555968416, + "grad_norm": 0.1834891735582498, + "learning_rate": 1.2702237521514631e-05, + "loss": 0.2668, + "step": 2491 + }, + { + "epoch": 2.313980492336275, + "grad_norm": 0.1748837067912934, + "learning_rate": 1.268502581755594e-05, + "loss": 0.2536, + "step": 2492 + }, + { + "epoch": 2.3149094287041336, + "grad_norm": 0.18170866744646788, + "learning_rate": 1.2667814113597246e-05, + "loss": 0.2791, + "step": 2493 + }, + { + "epoch": 2.3158383650719925, + "grad_norm": 0.17517417710104063, + "learning_rate": 1.2650602409638555e-05, + "loss": 0.2697, + "step": 2494 + }, + { + "epoch": 2.3167673014398513, + "grad_norm": 0.1696007724114442, + "learning_rate": 1.2633390705679863e-05, + "loss": 0.2701, + "step": 2495 + }, + { + "epoch": 2.31769623780771, + "grad_norm": 0.19983708306954123, + "learning_rate": 1.2616179001721171e-05, + "loss": 0.2759, + "step": 2496 + }, + { + "epoch": 2.318625174175569, + "grad_norm": 0.18007852898360852, + "learning_rate": 1.2598967297762478e-05, + "loss": 0.2738, + "step": 2497 + }, + { + "epoch": 2.319554110543428, + "grad_norm": 0.17640021787489446, + "learning_rate": 1.2581755593803787e-05, + "loss": 0.2568, + "step": 2498 + }, + { + "epoch": 2.3204830469112867, + "grad_norm": 0.17426432406739611, + "learning_rate": 1.2564543889845095e-05, + "loss": 0.2813, + "step": 2499 + }, + { + "epoch": 2.3214119832791456, + "grad_norm": 0.1819597764792421, + "learning_rate": 1.2547332185886403e-05, + "loss": 0.2843, + "step": 2500 + }, + { + "epoch": 2.3223409196470044, + "grad_norm": 0.6807628441338488, + "learning_rate": 1.2530120481927712e-05, + "loss": 0.2916, + "step": 2501 + }, + { + "epoch": 2.323269856014863, + "grad_norm": 0.18961215732510447, + "learning_rate": 1.251290877796902e-05, + "loss": 0.2885, + "step": 2502 + }, + { + "epoch": 2.3241987923827216, + "grad_norm": 0.185281322046881, + "learning_rate": 1.2495697074010327e-05, + "loss": 0.261, + "step": 2503 + }, + { + "epoch": 2.3251277287505805, + "grad_norm": 0.18212697020003069, + "learning_rate": 1.2478485370051635e-05, + "loss": 0.2824, + "step": 2504 + }, + { + "epoch": 2.3260566651184393, + "grad_norm": 0.19341133551329, + "learning_rate": 1.2461273666092944e-05, + "loss": 0.2861, + "step": 2505 + }, + { + "epoch": 2.326985601486298, + "grad_norm": 0.17022345986335427, + "learning_rate": 1.2444061962134252e-05, + "loss": 0.2693, + "step": 2506 + }, + { + "epoch": 2.327914537854157, + "grad_norm": 0.18536850713483183, + "learning_rate": 1.242685025817556e-05, + "loss": 0.2746, + "step": 2507 + }, + { + "epoch": 2.328843474222016, + "grad_norm": 0.16506173986908923, + "learning_rate": 1.2409638554216869e-05, + "loss": 0.265, + "step": 2508 + }, + { + "epoch": 2.3297724105898747, + "grad_norm": 0.17182803727776838, + "learning_rate": 1.2392426850258176e-05, + "loss": 0.2736, + "step": 2509 + }, + { + "epoch": 2.3307013469577336, + "grad_norm": 0.1742820002887796, + "learning_rate": 1.2375215146299484e-05, + "loss": 0.2618, + "step": 2510 + }, + { + "epoch": 2.331630283325592, + "grad_norm": 0.19642680627052156, + "learning_rate": 1.2358003442340792e-05, + "loss": 0.274, + "step": 2511 + }, + { + "epoch": 2.332559219693451, + "grad_norm": 0.18642669549275, + "learning_rate": 1.23407917383821e-05, + "loss": 0.284, + "step": 2512 + }, + { + "epoch": 2.3334881560613097, + "grad_norm": 0.17054316401763836, + "learning_rate": 1.232358003442341e-05, + "loss": 0.2869, + "step": 2513 + }, + { + "epoch": 2.3344170924291685, + "grad_norm": 0.18369081567830525, + "learning_rate": 1.2306368330464718e-05, + "loss": 0.2767, + "step": 2514 + }, + { + "epoch": 2.3353460287970274, + "grad_norm": 0.20114946960190172, + "learning_rate": 1.2289156626506026e-05, + "loss": 0.2814, + "step": 2515 + }, + { + "epoch": 2.336274965164886, + "grad_norm": 0.18437735158767538, + "learning_rate": 1.2271944922547333e-05, + "loss": 0.2919, + "step": 2516 + }, + { + "epoch": 2.337203901532745, + "grad_norm": 0.18161588561862224, + "learning_rate": 1.2254733218588641e-05, + "loss": 0.287, + "step": 2517 + }, + { + "epoch": 2.338132837900604, + "grad_norm": 0.18870854980187435, + "learning_rate": 1.223752151462995e-05, + "loss": 0.2805, + "step": 2518 + }, + { + "epoch": 2.3390617742684627, + "grad_norm": 0.1768654024578103, + "learning_rate": 1.2220309810671258e-05, + "loss": 0.269, + "step": 2519 + }, + { + "epoch": 2.3399907106363216, + "grad_norm": 0.17490366879789768, + "learning_rate": 1.2203098106712566e-05, + "loss": 0.2783, + "step": 2520 + }, + { + "epoch": 2.3409196470041804, + "grad_norm": 0.18896883370592954, + "learning_rate": 1.2185886402753875e-05, + "loss": 0.268, + "step": 2521 + }, + { + "epoch": 2.341848583372039, + "grad_norm": 0.1801247883571675, + "learning_rate": 1.2168674698795181e-05, + "loss": 0.2719, + "step": 2522 + }, + { + "epoch": 2.3427775197398977, + "grad_norm": 0.18694993552175548, + "learning_rate": 1.215146299483649e-05, + "loss": 0.2757, + "step": 2523 + }, + { + "epoch": 2.3437064561077565, + "grad_norm": 0.18275519642880572, + "learning_rate": 1.2134251290877798e-05, + "loss": 0.263, + "step": 2524 + }, + { + "epoch": 2.3446353924756154, + "grad_norm": 0.17351016085246834, + "learning_rate": 1.2117039586919107e-05, + "loss": 0.2565, + "step": 2525 + }, + { + "epoch": 2.345564328843474, + "grad_norm": 0.18379515482313682, + "learning_rate": 1.2099827882960415e-05, + "loss": 0.2696, + "step": 2526 + }, + { + "epoch": 2.346493265211333, + "grad_norm": 0.1720315048361915, + "learning_rate": 1.2082616179001722e-05, + "loss": 0.2641, + "step": 2527 + }, + { + "epoch": 2.347422201579192, + "grad_norm": 0.18590275177538543, + "learning_rate": 1.206540447504303e-05, + "loss": 0.2733, + "step": 2528 + }, + { + "epoch": 2.3483511379470507, + "grad_norm": 0.169610311172933, + "learning_rate": 1.2048192771084338e-05, + "loss": 0.2702, + "step": 2529 + }, + { + "epoch": 2.3492800743149096, + "grad_norm": 0.1963508763656982, + "learning_rate": 1.2030981067125647e-05, + "loss": 0.2652, + "step": 2530 + }, + { + "epoch": 2.350209010682768, + "grad_norm": 0.18587076519396542, + "learning_rate": 1.2013769363166955e-05, + "loss": 0.2789, + "step": 2531 + }, + { + "epoch": 2.351137947050627, + "grad_norm": 0.1854453938930749, + "learning_rate": 1.1996557659208262e-05, + "loss": 0.2732, + "step": 2532 + }, + { + "epoch": 2.3520668834184857, + "grad_norm": 0.1942839700749339, + "learning_rate": 1.197934595524957e-05, + "loss": 0.2803, + "step": 2533 + }, + { + "epoch": 2.3529958197863445, + "grad_norm": 0.19659561231939945, + "learning_rate": 1.1962134251290879e-05, + "loss": 0.2913, + "step": 2534 + }, + { + "epoch": 2.3539247561542034, + "grad_norm": 0.17480801654404732, + "learning_rate": 1.1944922547332187e-05, + "loss": 0.271, + "step": 2535 + }, + { + "epoch": 2.3548536925220622, + "grad_norm": 0.1944663657157406, + "learning_rate": 1.1927710843373496e-05, + "loss": 0.2641, + "step": 2536 + }, + { + "epoch": 2.355782628889921, + "grad_norm": 0.18816619868751366, + "learning_rate": 1.1910499139414804e-05, + "loss": 0.2833, + "step": 2537 + }, + { + "epoch": 2.35671156525778, + "grad_norm": 0.20018181591126397, + "learning_rate": 1.189328743545611e-05, + "loss": 0.2852, + "step": 2538 + }, + { + "epoch": 2.3576405016256388, + "grad_norm": 0.1756427462956202, + "learning_rate": 1.1876075731497419e-05, + "loss": 0.2778, + "step": 2539 + }, + { + "epoch": 2.3585694379934976, + "grad_norm": 0.18983605510608587, + "learning_rate": 1.1858864027538727e-05, + "loss": 0.2769, + "step": 2540 + }, + { + "epoch": 2.3594983743613565, + "grad_norm": 0.20928473702526423, + "learning_rate": 1.1841652323580036e-05, + "loss": 0.2856, + "step": 2541 + }, + { + "epoch": 2.360427310729215, + "grad_norm": 0.18099417160933323, + "learning_rate": 1.1824440619621344e-05, + "loss": 0.2758, + "step": 2542 + }, + { + "epoch": 2.3613562470970737, + "grad_norm": 0.1788276756913518, + "learning_rate": 1.1807228915662651e-05, + "loss": 0.2809, + "step": 2543 + }, + { + "epoch": 2.3622851834649325, + "grad_norm": 0.18196101771025752, + "learning_rate": 1.179001721170396e-05, + "loss": 0.2714, + "step": 2544 + }, + { + "epoch": 2.3632141198327914, + "grad_norm": 0.16952857307776525, + "learning_rate": 1.1772805507745268e-05, + "loss": 0.2562, + "step": 2545 + }, + { + "epoch": 2.3641430562006502, + "grad_norm": 0.18206933762294697, + "learning_rate": 1.1755593803786576e-05, + "loss": 0.2761, + "step": 2546 + }, + { + "epoch": 2.365071992568509, + "grad_norm": 0.19380265765568364, + "learning_rate": 1.1738382099827885e-05, + "loss": 0.2869, + "step": 2547 + }, + { + "epoch": 2.366000928936368, + "grad_norm": 0.16983950092360575, + "learning_rate": 1.1721170395869191e-05, + "loss": 0.2745, + "step": 2548 + }, + { + "epoch": 2.3669298653042268, + "grad_norm": 0.19191163491346017, + "learning_rate": 1.17039586919105e-05, + "loss": 0.2946, + "step": 2549 + }, + { + "epoch": 2.3678588016720856, + "grad_norm": 0.17125497389397198, + "learning_rate": 1.1686746987951808e-05, + "loss": 0.2735, + "step": 2550 + }, + { + "epoch": 2.368787738039944, + "grad_norm": 0.19949567009374117, + "learning_rate": 1.1669535283993117e-05, + "loss": 0.2946, + "step": 2551 + }, + { + "epoch": 2.369716674407803, + "grad_norm": 0.18799349450204403, + "learning_rate": 1.1652323580034425e-05, + "loss": 0.2838, + "step": 2552 + }, + { + "epoch": 2.3706456107756617, + "grad_norm": 0.17431131561492344, + "learning_rate": 1.1635111876075733e-05, + "loss": 0.2662, + "step": 2553 + }, + { + "epoch": 2.3715745471435206, + "grad_norm": 0.17729505501913617, + "learning_rate": 1.161790017211704e-05, + "loss": 0.2845, + "step": 2554 + }, + { + "epoch": 2.3725034835113794, + "grad_norm": 0.17567637231257308, + "learning_rate": 1.1600688468158348e-05, + "loss": 0.2828, + "step": 2555 + }, + { + "epoch": 2.3734324198792383, + "grad_norm": 0.18325562355641456, + "learning_rate": 1.1583476764199657e-05, + "loss": 0.2881, + "step": 2556 + }, + { + "epoch": 2.374361356247097, + "grad_norm": 0.20307829278142425, + "learning_rate": 1.1566265060240965e-05, + "loss": 0.2797, + "step": 2557 + }, + { + "epoch": 2.375290292614956, + "grad_norm": 0.1760215816447019, + "learning_rate": 1.1549053356282274e-05, + "loss": 0.2866, + "step": 2558 + }, + { + "epoch": 2.376219228982815, + "grad_norm": 0.16869374727160663, + "learning_rate": 1.153184165232358e-05, + "loss": 0.2677, + "step": 2559 + }, + { + "epoch": 2.3771481653506736, + "grad_norm": 0.19634176148081195, + "learning_rate": 1.1514629948364889e-05, + "loss": 0.2826, + "step": 2560 + }, + { + "epoch": 2.3780771017185325, + "grad_norm": 0.1901365638963679, + "learning_rate": 1.1497418244406197e-05, + "loss": 0.2717, + "step": 2561 + }, + { + "epoch": 2.3790060380863913, + "grad_norm": 0.17289531409457187, + "learning_rate": 1.1480206540447506e-05, + "loss": 0.2745, + "step": 2562 + }, + { + "epoch": 2.3799349744542497, + "grad_norm": 0.1863744077538756, + "learning_rate": 1.1462994836488814e-05, + "loss": 0.2861, + "step": 2563 + }, + { + "epoch": 2.3808639108221086, + "grad_norm": 0.19647354430195266, + "learning_rate": 1.144578313253012e-05, + "loss": 0.2953, + "step": 2564 + }, + { + "epoch": 2.3817928471899674, + "grad_norm": 0.16997392062698125, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.2662, + "step": 2565 + }, + { + "epoch": 2.3827217835578263, + "grad_norm": 0.17741241720060805, + "learning_rate": 1.1411359724612737e-05, + "loss": 0.2801, + "step": 2566 + }, + { + "epoch": 2.383650719925685, + "grad_norm": 0.1734889437651768, + "learning_rate": 1.1394148020654046e-05, + "loss": 0.2715, + "step": 2567 + }, + { + "epoch": 2.384579656293544, + "grad_norm": 0.18052716242882721, + "learning_rate": 1.1376936316695354e-05, + "loss": 0.2861, + "step": 2568 + }, + { + "epoch": 2.385508592661403, + "grad_norm": 0.17565238572476038, + "learning_rate": 1.1359724612736663e-05, + "loss": 0.2714, + "step": 2569 + }, + { + "epoch": 2.3864375290292617, + "grad_norm": 0.1706278981440593, + "learning_rate": 1.134251290877797e-05, + "loss": 0.2762, + "step": 2570 + }, + { + "epoch": 2.38736646539712, + "grad_norm": 0.17926154917329037, + "learning_rate": 1.1325301204819278e-05, + "loss": 0.2854, + "step": 2571 + }, + { + "epoch": 2.388295401764979, + "grad_norm": 0.1802656856974113, + "learning_rate": 1.1308089500860586e-05, + "loss": 0.2807, + "step": 2572 + }, + { + "epoch": 2.3892243381328377, + "grad_norm": 0.17506633593361373, + "learning_rate": 1.1290877796901895e-05, + "loss": 0.2779, + "step": 2573 + }, + { + "epoch": 2.3901532745006966, + "grad_norm": 0.1728471092867977, + "learning_rate": 1.1273666092943203e-05, + "loss": 0.2765, + "step": 2574 + }, + { + "epoch": 2.3910822108685554, + "grad_norm": 0.19565702280776281, + "learning_rate": 1.125645438898451e-05, + "loss": 0.2635, + "step": 2575 + }, + { + "epoch": 2.3920111472364143, + "grad_norm": 0.18314447255310315, + "learning_rate": 1.1239242685025818e-05, + "loss": 0.2737, + "step": 2576 + }, + { + "epoch": 2.392940083604273, + "grad_norm": 0.1750548586458759, + "learning_rate": 1.1222030981067126e-05, + "loss": 0.2642, + "step": 2577 + }, + { + "epoch": 2.393869019972132, + "grad_norm": 0.1832451160171968, + "learning_rate": 1.1204819277108435e-05, + "loss": 0.2745, + "step": 2578 + }, + { + "epoch": 2.394797956339991, + "grad_norm": 0.18031801543289583, + "learning_rate": 1.1187607573149743e-05, + "loss": 0.2684, + "step": 2579 + }, + { + "epoch": 2.3957268927078497, + "grad_norm": 0.1799011721479443, + "learning_rate": 1.117039586919105e-05, + "loss": 0.2807, + "step": 2580 + }, + { + "epoch": 2.3966558290757085, + "grad_norm": 0.17467422843455963, + "learning_rate": 1.1153184165232358e-05, + "loss": 0.2661, + "step": 2581 + }, + { + "epoch": 2.3975847654435674, + "grad_norm": 0.182510619647452, + "learning_rate": 1.1135972461273667e-05, + "loss": 0.2661, + "step": 2582 + }, + { + "epoch": 2.3985137018114258, + "grad_norm": 0.19000712669188646, + "learning_rate": 1.1118760757314975e-05, + "loss": 0.2785, + "step": 2583 + }, + { + "epoch": 2.3994426381792846, + "grad_norm": 0.16903359729376602, + "learning_rate": 1.1101549053356284e-05, + "loss": 0.2704, + "step": 2584 + }, + { + "epoch": 2.4003715745471434, + "grad_norm": 0.18217095684501244, + "learning_rate": 1.108433734939759e-05, + "loss": 0.2634, + "step": 2585 + }, + { + "epoch": 2.4013005109150023, + "grad_norm": 0.17717495586427512, + "learning_rate": 1.1067125645438899e-05, + "loss": 0.2551, + "step": 2586 + }, + { + "epoch": 2.402229447282861, + "grad_norm": 0.1838135913410062, + "learning_rate": 1.1049913941480207e-05, + "loss": 0.2876, + "step": 2587 + }, + { + "epoch": 2.40315838365072, + "grad_norm": 0.17031644253992032, + "learning_rate": 1.1032702237521515e-05, + "loss": 0.2624, + "step": 2588 + }, + { + "epoch": 2.404087320018579, + "grad_norm": 0.17556071203202087, + "learning_rate": 1.1015490533562824e-05, + "loss": 0.2753, + "step": 2589 + }, + { + "epoch": 2.4050162563864377, + "grad_norm": 0.1834407267830562, + "learning_rate": 1.0998278829604132e-05, + "loss": 0.2746, + "step": 2590 + }, + { + "epoch": 2.4059451927542965, + "grad_norm": 0.19926877874180607, + "learning_rate": 1.0981067125645439e-05, + "loss": 0.29, + "step": 2591 + }, + { + "epoch": 2.406874129122155, + "grad_norm": 0.17906776663032192, + "learning_rate": 1.0963855421686747e-05, + "loss": 0.2844, + "step": 2592 + }, + { + "epoch": 2.4078030654900138, + "grad_norm": 0.16699051055445974, + "learning_rate": 1.0946643717728056e-05, + "loss": 0.2588, + "step": 2593 + }, + { + "epoch": 2.4087320018578726, + "grad_norm": 0.1751714906923023, + "learning_rate": 1.0929432013769364e-05, + "loss": 0.2717, + "step": 2594 + }, + { + "epoch": 2.4096609382257315, + "grad_norm": 0.17331285320106854, + "learning_rate": 1.0912220309810673e-05, + "loss": 0.251, + "step": 2595 + }, + { + "epoch": 2.4105898745935903, + "grad_norm": 0.1739226946282106, + "learning_rate": 1.089500860585198e-05, + "loss": 0.2883, + "step": 2596 + }, + { + "epoch": 2.411518810961449, + "grad_norm": 0.19012644739523568, + "learning_rate": 1.0877796901893288e-05, + "loss": 0.2806, + "step": 2597 + }, + { + "epoch": 2.412447747329308, + "grad_norm": 0.18129586130288267, + "learning_rate": 1.0860585197934596e-05, + "loss": 0.2769, + "step": 2598 + }, + { + "epoch": 2.413376683697167, + "grad_norm": 0.17128766688292568, + "learning_rate": 1.0843373493975904e-05, + "loss": 0.2725, + "step": 2599 + }, + { + "epoch": 2.4143056200650257, + "grad_norm": 0.1723024173039281, + "learning_rate": 1.0826161790017213e-05, + "loss": 0.2712, + "step": 2600 + }, + { + "epoch": 2.4152345564328845, + "grad_norm": 0.18914686577326056, + "learning_rate": 1.080895008605852e-05, + "loss": 0.2823, + "step": 2601 + }, + { + "epoch": 2.4161634928007434, + "grad_norm": 0.17962076856198506, + "learning_rate": 1.0791738382099828e-05, + "loss": 0.2738, + "step": 2602 + }, + { + "epoch": 2.417092429168602, + "grad_norm": 0.18223450453434567, + "learning_rate": 1.0774526678141136e-05, + "loss": 0.2817, + "step": 2603 + }, + { + "epoch": 2.4180213655364606, + "grad_norm": 0.17204461955432634, + "learning_rate": 1.0757314974182445e-05, + "loss": 0.2784, + "step": 2604 + }, + { + "epoch": 2.4189503019043195, + "grad_norm": 0.18734269004984067, + "learning_rate": 1.0740103270223753e-05, + "loss": 0.2798, + "step": 2605 + }, + { + "epoch": 2.4198792382721783, + "grad_norm": 0.18627841159310524, + "learning_rate": 1.0722891566265062e-05, + "loss": 0.2736, + "step": 2606 + }, + { + "epoch": 2.420808174640037, + "grad_norm": 0.18288440859506122, + "learning_rate": 1.0705679862306368e-05, + "loss": 0.2762, + "step": 2607 + }, + { + "epoch": 2.421737111007896, + "grad_norm": 0.16671344490112985, + "learning_rate": 1.0688468158347677e-05, + "loss": 0.2583, + "step": 2608 + }, + { + "epoch": 2.422666047375755, + "grad_norm": 0.16774244865276433, + "learning_rate": 1.0671256454388985e-05, + "loss": 0.2667, + "step": 2609 + }, + { + "epoch": 2.4235949837436137, + "grad_norm": 0.1906502737894114, + "learning_rate": 1.0654044750430293e-05, + "loss": 0.2729, + "step": 2610 + }, + { + "epoch": 2.4245239201114726, + "grad_norm": 0.19139371036181707, + "learning_rate": 1.0636833046471602e-05, + "loss": 0.2917, + "step": 2611 + }, + { + "epoch": 2.425452856479331, + "grad_norm": 0.1830525571294439, + "learning_rate": 1.0619621342512909e-05, + "loss": 0.273, + "step": 2612 + }, + { + "epoch": 2.42638179284719, + "grad_norm": 0.17846541802522353, + "learning_rate": 1.0602409638554217e-05, + "loss": 0.2697, + "step": 2613 + }, + { + "epoch": 2.4273107292150486, + "grad_norm": 0.1912546863200559, + "learning_rate": 1.0585197934595525e-05, + "loss": 0.2614, + "step": 2614 + }, + { + "epoch": 2.4282396655829075, + "grad_norm": 0.18087505849060975, + "learning_rate": 1.0567986230636834e-05, + "loss": 0.2615, + "step": 2615 + }, + { + "epoch": 2.4291686019507663, + "grad_norm": 0.18672321455235916, + "learning_rate": 1.0550774526678142e-05, + "loss": 0.2829, + "step": 2616 + }, + { + "epoch": 2.430097538318625, + "grad_norm": 0.17819321647845873, + "learning_rate": 1.0533562822719449e-05, + "loss": 0.2781, + "step": 2617 + }, + { + "epoch": 2.431026474686484, + "grad_norm": 0.1802526099580208, + "learning_rate": 1.0516351118760757e-05, + "loss": 0.2799, + "step": 2618 + }, + { + "epoch": 2.431955411054343, + "grad_norm": 0.1732325664289292, + "learning_rate": 1.0499139414802066e-05, + "loss": 0.267, + "step": 2619 + }, + { + "epoch": 2.4328843474222017, + "grad_norm": 0.1862818608754717, + "learning_rate": 1.0481927710843374e-05, + "loss": 0.257, + "step": 2620 + }, + { + "epoch": 2.4338132837900606, + "grad_norm": 0.19978393037967157, + "learning_rate": 1.0464716006884682e-05, + "loss": 0.2982, + "step": 2621 + }, + { + "epoch": 2.4347422201579194, + "grad_norm": 0.16756039896886504, + "learning_rate": 1.0447504302925991e-05, + "loss": 0.2713, + "step": 2622 + }, + { + "epoch": 2.435671156525778, + "grad_norm": 0.1812223551466504, + "learning_rate": 1.0430292598967298e-05, + "loss": 0.271, + "step": 2623 + }, + { + "epoch": 2.4366000928936367, + "grad_norm": 0.18945960502714276, + "learning_rate": 1.0413080895008606e-05, + "loss": 0.2799, + "step": 2624 + }, + { + "epoch": 2.4375290292614955, + "grad_norm": 0.17519311685029668, + "learning_rate": 1.0395869191049914e-05, + "loss": 0.2765, + "step": 2625 + }, + { + "epoch": 2.4384579656293544, + "grad_norm": 0.18324447068348415, + "learning_rate": 1.0378657487091223e-05, + "loss": 0.2762, + "step": 2626 + }, + { + "epoch": 2.439386901997213, + "grad_norm": 0.17760314979609604, + "learning_rate": 1.0361445783132531e-05, + "loss": 0.2686, + "step": 2627 + }, + { + "epoch": 2.440315838365072, + "grad_norm": 0.17054771582356815, + "learning_rate": 1.0344234079173838e-05, + "loss": 0.2615, + "step": 2628 + }, + { + "epoch": 2.441244774732931, + "grad_norm": 0.1826470589868783, + "learning_rate": 1.0327022375215146e-05, + "loss": 0.2942, + "step": 2629 + }, + { + "epoch": 2.4421737111007897, + "grad_norm": 0.1949199675365578, + "learning_rate": 1.0309810671256455e-05, + "loss": 0.2861, + "step": 2630 + }, + { + "epoch": 2.4431026474686486, + "grad_norm": 0.1960583056723856, + "learning_rate": 1.0292598967297763e-05, + "loss": 0.272, + "step": 2631 + }, + { + "epoch": 2.444031583836507, + "grad_norm": 0.17233454553942215, + "learning_rate": 1.0275387263339071e-05, + "loss": 0.2778, + "step": 2632 + }, + { + "epoch": 2.444960520204366, + "grad_norm": 0.1954500467677132, + "learning_rate": 1.0258175559380378e-05, + "loss": 0.2769, + "step": 2633 + }, + { + "epoch": 2.4458894565722247, + "grad_norm": 0.1833072097746944, + "learning_rate": 1.0240963855421687e-05, + "loss": 0.2931, + "step": 2634 + }, + { + "epoch": 2.4468183929400835, + "grad_norm": 0.18965157114105274, + "learning_rate": 1.0223752151462995e-05, + "loss": 0.2877, + "step": 2635 + }, + { + "epoch": 2.4477473293079424, + "grad_norm": 0.20978733790727866, + "learning_rate": 1.0206540447504303e-05, + "loss": 0.2854, + "step": 2636 + }, + { + "epoch": 2.448676265675801, + "grad_norm": 0.17423882357451637, + "learning_rate": 1.0189328743545612e-05, + "loss": 0.2801, + "step": 2637 + }, + { + "epoch": 2.44960520204366, + "grad_norm": 0.17370390492230156, + "learning_rate": 1.0172117039586919e-05, + "loss": 0.2756, + "step": 2638 + }, + { + "epoch": 2.450534138411519, + "grad_norm": 0.16986015376541683, + "learning_rate": 1.0154905335628227e-05, + "loss": 0.2684, + "step": 2639 + }, + { + "epoch": 2.4514630747793777, + "grad_norm": 0.18055842985320075, + "learning_rate": 1.0137693631669535e-05, + "loss": 0.2691, + "step": 2640 + }, + { + "epoch": 2.4523920111472366, + "grad_norm": 0.1879144310139025, + "learning_rate": 1.0120481927710844e-05, + "loss": 0.2829, + "step": 2641 + }, + { + "epoch": 2.4533209475150954, + "grad_norm": 0.16956291193537515, + "learning_rate": 1.0103270223752152e-05, + "loss": 0.2836, + "step": 2642 + }, + { + "epoch": 2.454249883882954, + "grad_norm": 0.17981281771166743, + "learning_rate": 1.008605851979346e-05, + "loss": 0.2885, + "step": 2643 + }, + { + "epoch": 2.4551788202508127, + "grad_norm": 0.17947003101922124, + "learning_rate": 1.0068846815834767e-05, + "loss": 0.2693, + "step": 2644 + }, + { + "epoch": 2.4561077566186715, + "grad_norm": 0.1940400400563871, + "learning_rate": 1.0051635111876076e-05, + "loss": 0.2846, + "step": 2645 + }, + { + "epoch": 2.4570366929865304, + "grad_norm": 0.18487882046134174, + "learning_rate": 1.0034423407917384e-05, + "loss": 0.2577, + "step": 2646 + }, + { + "epoch": 2.4579656293543892, + "grad_norm": 0.17290264600688812, + "learning_rate": 1.0017211703958692e-05, + "loss": 0.2787, + "step": 2647 + }, + { + "epoch": 2.458894565722248, + "grad_norm": 0.17023515664022007, + "learning_rate": 1e-05, + "loss": 0.2673, + "step": 2648 + }, + { + "epoch": 2.459823502090107, + "grad_norm": 0.1776155129313746, + "learning_rate": 9.982788296041308e-06, + "loss": 0.2759, + "step": 2649 + }, + { + "epoch": 2.4607524384579658, + "grad_norm": 0.1922956087310196, + "learning_rate": 9.965576592082616e-06, + "loss": 0.2784, + "step": 2650 + }, + { + "epoch": 2.4616813748258246, + "grad_norm": 0.19175819536969194, + "learning_rate": 9.948364888123924e-06, + "loss": 0.271, + "step": 2651 + }, + { + "epoch": 2.462610311193683, + "grad_norm": 0.17557011388898683, + "learning_rate": 9.931153184165233e-06, + "loss": 0.2803, + "step": 2652 + }, + { + "epoch": 2.463539247561542, + "grad_norm": 0.17863208746544765, + "learning_rate": 9.913941480206541e-06, + "loss": 0.2717, + "step": 2653 + }, + { + "epoch": 2.4644681839294007, + "grad_norm": 0.18402079757460876, + "learning_rate": 9.896729776247848e-06, + "loss": 0.2705, + "step": 2654 + }, + { + "epoch": 2.4653971202972595, + "grad_norm": 0.18480632945427394, + "learning_rate": 9.879518072289156e-06, + "loss": 0.2939, + "step": 2655 + }, + { + "epoch": 2.4663260566651184, + "grad_norm": 0.1699533541431954, + "learning_rate": 9.862306368330465e-06, + "loss": 0.2773, + "step": 2656 + }, + { + "epoch": 2.4672549930329772, + "grad_norm": 0.17129023030549725, + "learning_rate": 9.845094664371773e-06, + "loss": 0.2748, + "step": 2657 + }, + { + "epoch": 2.468183929400836, + "grad_norm": 0.17801465299435518, + "learning_rate": 9.827882960413081e-06, + "loss": 0.2862, + "step": 2658 + }, + { + "epoch": 2.469112865768695, + "grad_norm": 0.16935169755624885, + "learning_rate": 9.81067125645439e-06, + "loss": 0.2682, + "step": 2659 + }, + { + "epoch": 2.4700418021365538, + "grad_norm": 0.2000237770209295, + "learning_rate": 9.793459552495697e-06, + "loss": 0.2813, + "step": 2660 + }, + { + "epoch": 2.4709707385044126, + "grad_norm": 0.18784791548400018, + "learning_rate": 9.776247848537005e-06, + "loss": 0.2829, + "step": 2661 + }, + { + "epoch": 2.4718996748722715, + "grad_norm": 0.17733443093063525, + "learning_rate": 9.759036144578313e-06, + "loss": 0.278, + "step": 2662 + }, + { + "epoch": 2.47282861124013, + "grad_norm": 0.17325580348112374, + "learning_rate": 9.741824440619622e-06, + "loss": 0.2773, + "step": 2663 + }, + { + "epoch": 2.4737575476079887, + "grad_norm": 0.17758599669012587, + "learning_rate": 9.72461273666093e-06, + "loss": 0.2691, + "step": 2664 + }, + { + "epoch": 2.4746864839758476, + "grad_norm": 0.2084107357635697, + "learning_rate": 9.707401032702237e-06, + "loss": 0.2772, + "step": 2665 + }, + { + "epoch": 2.4756154203437064, + "grad_norm": 0.18325821857011707, + "learning_rate": 9.690189328743545e-06, + "loss": 0.2731, + "step": 2666 + }, + { + "epoch": 2.4765443567115653, + "grad_norm": 0.18181226539042997, + "learning_rate": 9.672977624784854e-06, + "loss": 0.2745, + "step": 2667 + }, + { + "epoch": 2.477473293079424, + "grad_norm": 0.18724076787159039, + "learning_rate": 9.655765920826162e-06, + "loss": 0.2882, + "step": 2668 + }, + { + "epoch": 2.478402229447283, + "grad_norm": 0.19008040074552002, + "learning_rate": 9.63855421686747e-06, + "loss": 0.296, + "step": 2669 + }, + { + "epoch": 2.479331165815142, + "grad_norm": 0.1683701319992699, + "learning_rate": 9.621342512908777e-06, + "loss": 0.2805, + "step": 2670 + }, + { + "epoch": 2.4802601021830006, + "grad_norm": 0.17949764929549564, + "learning_rate": 9.604130808950086e-06, + "loss": 0.2715, + "step": 2671 + }, + { + "epoch": 2.481189038550859, + "grad_norm": 0.18184976004626205, + "learning_rate": 9.586919104991394e-06, + "loss": 0.2773, + "step": 2672 + }, + { + "epoch": 2.482117974918718, + "grad_norm": 0.17775807892598655, + "learning_rate": 9.569707401032702e-06, + "loss": 0.283, + "step": 2673 + }, + { + "epoch": 2.4830469112865767, + "grad_norm": 0.18923652551120143, + "learning_rate": 9.55249569707401e-06, + "loss": 0.2845, + "step": 2674 + }, + { + "epoch": 2.4839758476544356, + "grad_norm": 0.17906213434493118, + "learning_rate": 9.535283993115319e-06, + "loss": 0.2729, + "step": 2675 + }, + { + "epoch": 2.4849047840222944, + "grad_norm": 0.18853045947261501, + "learning_rate": 9.518072289156626e-06, + "loss": 0.2749, + "step": 2676 + }, + { + "epoch": 2.4858337203901533, + "grad_norm": 0.1725220515656567, + "learning_rate": 9.500860585197934e-06, + "loss": 0.2556, + "step": 2677 + }, + { + "epoch": 2.486762656758012, + "grad_norm": 0.18222882747239028, + "learning_rate": 9.483648881239243e-06, + "loss": 0.2761, + "step": 2678 + }, + { + "epoch": 2.487691593125871, + "grad_norm": 0.18770105080124452, + "learning_rate": 9.466437177280551e-06, + "loss": 0.2759, + "step": 2679 + }, + { + "epoch": 2.48862052949373, + "grad_norm": 0.1797766621607397, + "learning_rate": 9.44922547332186e-06, + "loss": 0.2931, + "step": 2680 + }, + { + "epoch": 2.4895494658615887, + "grad_norm": 0.1842354032637381, + "learning_rate": 9.432013769363166e-06, + "loss": 0.2826, + "step": 2681 + }, + { + "epoch": 2.4904784022294475, + "grad_norm": 0.164114716412252, + "learning_rate": 9.414802065404475e-06, + "loss": 0.2659, + "step": 2682 + }, + { + "epoch": 2.491407338597306, + "grad_norm": 0.16405362051068478, + "learning_rate": 9.397590361445783e-06, + "loss": 0.2658, + "step": 2683 + }, + { + "epoch": 2.4923362749651647, + "grad_norm": 0.17956863474404955, + "learning_rate": 9.380378657487091e-06, + "loss": 0.2781, + "step": 2684 + }, + { + "epoch": 2.4932652113330236, + "grad_norm": 0.17737289818146046, + "learning_rate": 9.3631669535284e-06, + "loss": 0.296, + "step": 2685 + }, + { + "epoch": 2.4941941477008824, + "grad_norm": 0.18585541574048775, + "learning_rate": 9.345955249569706e-06, + "loss": 0.3013, + "step": 2686 + }, + { + "epoch": 2.4951230840687413, + "grad_norm": 0.1643243554438021, + "learning_rate": 9.328743545611015e-06, + "loss": 0.2623, + "step": 2687 + }, + { + "epoch": 2.4960520204366, + "grad_norm": 0.17270213602505902, + "learning_rate": 9.311531841652323e-06, + "loss": 0.2766, + "step": 2688 + }, + { + "epoch": 2.496980956804459, + "grad_norm": 0.16782827955554608, + "learning_rate": 9.294320137693632e-06, + "loss": 0.2665, + "step": 2689 + }, + { + "epoch": 2.497909893172318, + "grad_norm": 0.17122748158137155, + "learning_rate": 9.27710843373494e-06, + "loss": 0.2705, + "step": 2690 + }, + { + "epoch": 2.4988388295401767, + "grad_norm": 0.17869194306118438, + "learning_rate": 9.259896729776247e-06, + "loss": 0.281, + "step": 2691 + }, + { + "epoch": 2.499767765908035, + "grad_norm": 0.1695879134783124, + "learning_rate": 9.242685025817555e-06, + "loss": 0.2537, + "step": 2692 + }, + { + "epoch": 2.500696702275894, + "grad_norm": 0.1662584266248051, + "learning_rate": 9.225473321858864e-06, + "loss": 0.2632, + "step": 2693 + }, + { + "epoch": 2.5016256386437528, + "grad_norm": 0.17093570405435482, + "learning_rate": 9.208261617900172e-06, + "loss": 0.2792, + "step": 2694 + }, + { + "epoch": 2.5025545750116116, + "grad_norm": 0.17307510808065985, + "learning_rate": 9.19104991394148e-06, + "loss": 0.2769, + "step": 2695 + }, + { + "epoch": 2.5034835113794704, + "grad_norm": 0.17692551001812182, + "learning_rate": 9.173838209982789e-06, + "loss": 0.2648, + "step": 2696 + }, + { + "epoch": 2.5044124477473293, + "grad_norm": 0.17879082159343984, + "learning_rate": 9.156626506024097e-06, + "loss": 0.2763, + "step": 2697 + }, + { + "epoch": 2.505341384115188, + "grad_norm": 0.19597822806679713, + "learning_rate": 9.139414802065404e-06, + "loss": 0.2869, + "step": 2698 + }, + { + "epoch": 2.506270320483047, + "grad_norm": 0.17477995496206758, + "learning_rate": 9.122203098106712e-06, + "loss": 0.2787, + "step": 2699 + }, + { + "epoch": 2.507199256850906, + "grad_norm": 0.19752698933537774, + "learning_rate": 9.10499139414802e-06, + "loss": 0.2796, + "step": 2700 + }, + { + "epoch": 2.5081281932187647, + "grad_norm": 0.19121547602167308, + "learning_rate": 9.087779690189329e-06, + "loss": 0.2763, + "step": 2701 + }, + { + "epoch": 2.5090571295866235, + "grad_norm": 0.19101906802675103, + "learning_rate": 9.070567986230637e-06, + "loss": 0.3006, + "step": 2702 + }, + { + "epoch": 2.5099860659544824, + "grad_norm": 0.17615835392415316, + "learning_rate": 9.053356282271946e-06, + "loss": 0.2846, + "step": 2703 + }, + { + "epoch": 2.5109150023223408, + "grad_norm": 0.1762519374159477, + "learning_rate": 9.036144578313253e-06, + "loss": 0.2693, + "step": 2704 + }, + { + "epoch": 2.5118439386901996, + "grad_norm": 0.20903449277468536, + "learning_rate": 9.018932874354561e-06, + "loss": 0.2829, + "step": 2705 + }, + { + "epoch": 2.5127728750580585, + "grad_norm": 0.17834560785229894, + "learning_rate": 9.00172117039587e-06, + "loss": 0.2674, + "step": 2706 + }, + { + "epoch": 2.5137018114259173, + "grad_norm": 0.1793333142413588, + "learning_rate": 8.984509466437178e-06, + "loss": 0.2592, + "step": 2707 + }, + { + "epoch": 2.514630747793776, + "grad_norm": 0.18220038218018145, + "learning_rate": 8.967297762478486e-06, + "loss": 0.2738, + "step": 2708 + }, + { + "epoch": 2.515559684161635, + "grad_norm": 0.1767274685035216, + "learning_rate": 8.950086058519795e-06, + "loss": 0.2777, + "step": 2709 + }, + { + "epoch": 2.516488620529494, + "grad_norm": 0.1838314403491137, + "learning_rate": 8.932874354561101e-06, + "loss": 0.2811, + "step": 2710 + }, + { + "epoch": 2.5174175568973527, + "grad_norm": 0.2000434374752791, + "learning_rate": 8.91566265060241e-06, + "loss": 0.2744, + "step": 2711 + }, + { + "epoch": 2.518346493265211, + "grad_norm": 0.1821868758548907, + "learning_rate": 8.898450946643718e-06, + "loss": 0.2768, + "step": 2712 + }, + { + "epoch": 2.51927542963307, + "grad_norm": 0.29230691057904484, + "learning_rate": 8.881239242685026e-06, + "loss": 0.2734, + "step": 2713 + }, + { + "epoch": 2.520204366000929, + "grad_norm": 0.167813313251146, + "learning_rate": 8.864027538726335e-06, + "loss": 0.2601, + "step": 2714 + }, + { + "epoch": 2.5211333023687876, + "grad_norm": 0.1769085034080416, + "learning_rate": 8.846815834767643e-06, + "loss": 0.2726, + "step": 2715 + }, + { + "epoch": 2.5220622387366465, + "grad_norm": 0.20220842083808388, + "learning_rate": 8.82960413080895e-06, + "loss": 0.2817, + "step": 2716 + }, + { + "epoch": 2.5229911751045053, + "grad_norm": 0.18444616770204614, + "learning_rate": 8.812392426850258e-06, + "loss": 0.2773, + "step": 2717 + }, + { + "epoch": 2.523920111472364, + "grad_norm": 0.18296071897423907, + "learning_rate": 8.795180722891567e-06, + "loss": 0.2797, + "step": 2718 + }, + { + "epoch": 2.524849047840223, + "grad_norm": 0.1817067050377542, + "learning_rate": 8.777969018932875e-06, + "loss": 0.2641, + "step": 2719 + }, + { + "epoch": 2.525777984208082, + "grad_norm": 0.1794572761384637, + "learning_rate": 8.760757314974184e-06, + "loss": 0.2813, + "step": 2720 + }, + { + "epoch": 2.5267069205759407, + "grad_norm": 0.1676740839883334, + "learning_rate": 8.743545611015492e-06, + "loss": 0.2808, + "step": 2721 + }, + { + "epoch": 2.5276358569437996, + "grad_norm": 0.17116798658772298, + "learning_rate": 8.726333907056799e-06, + "loss": 0.2778, + "step": 2722 + }, + { + "epoch": 2.5285647933116584, + "grad_norm": 0.17943680035895654, + "learning_rate": 8.709122203098107e-06, + "loss": 0.2713, + "step": 2723 + }, + { + "epoch": 2.529493729679517, + "grad_norm": 0.18500653140503157, + "learning_rate": 8.691910499139416e-06, + "loss": 0.2748, + "step": 2724 + }, + { + "epoch": 2.5304226660473756, + "grad_norm": 0.18656247536015494, + "learning_rate": 8.674698795180724e-06, + "loss": 0.2931, + "step": 2725 + }, + { + "epoch": 2.5313516024152345, + "grad_norm": 0.17519545206291068, + "learning_rate": 8.657487091222032e-06, + "loss": 0.2636, + "step": 2726 + }, + { + "epoch": 2.5322805387830933, + "grad_norm": 0.18579678715189885, + "learning_rate": 8.64027538726334e-06, + "loss": 0.2609, + "step": 2727 + }, + { + "epoch": 2.533209475150952, + "grad_norm": 0.17685544278047252, + "learning_rate": 8.623063683304647e-06, + "loss": 0.269, + "step": 2728 + }, + { + "epoch": 2.534138411518811, + "grad_norm": 0.1716684760379682, + "learning_rate": 8.605851979345956e-06, + "loss": 0.273, + "step": 2729 + }, + { + "epoch": 2.53506734788667, + "grad_norm": 0.18997722560178743, + "learning_rate": 8.588640275387264e-06, + "loss": 0.2735, + "step": 2730 + }, + { + "epoch": 2.5359962842545287, + "grad_norm": 0.1725056612650081, + "learning_rate": 8.571428571428573e-06, + "loss": 0.2782, + "step": 2731 + }, + { + "epoch": 2.536925220622387, + "grad_norm": 0.1846867810643451, + "learning_rate": 8.554216867469881e-06, + "loss": 0.281, + "step": 2732 + }, + { + "epoch": 2.537854156990246, + "grad_norm": 0.1746259889225677, + "learning_rate": 8.53700516351119e-06, + "loss": 0.2802, + "step": 2733 + }, + { + "epoch": 2.538783093358105, + "grad_norm": 0.18034001099175637, + "learning_rate": 8.519793459552496e-06, + "loss": 0.2983, + "step": 2734 + }, + { + "epoch": 2.5397120297259637, + "grad_norm": 0.17285283661416923, + "learning_rate": 8.502581755593805e-06, + "loss": 0.267, + "step": 2735 + }, + { + "epoch": 2.5406409660938225, + "grad_norm": 0.16623526260482924, + "learning_rate": 8.485370051635113e-06, + "loss": 0.2714, + "step": 2736 + }, + { + "epoch": 2.5415699024616814, + "grad_norm": 0.17140669097240416, + "learning_rate": 8.468158347676421e-06, + "loss": 0.2764, + "step": 2737 + }, + { + "epoch": 2.54249883882954, + "grad_norm": 0.17006832454927703, + "learning_rate": 8.45094664371773e-06, + "loss": 0.2911, + "step": 2738 + }, + { + "epoch": 2.543427775197399, + "grad_norm": 0.16385080260835502, + "learning_rate": 8.433734939759036e-06, + "loss": 0.2677, + "step": 2739 + }, + { + "epoch": 2.544356711565258, + "grad_norm": 0.1798411378111758, + "learning_rate": 8.416523235800345e-06, + "loss": 0.2876, + "step": 2740 + }, + { + "epoch": 2.5452856479331167, + "grad_norm": 0.17749376274284356, + "learning_rate": 8.399311531841653e-06, + "loss": 0.2809, + "step": 2741 + }, + { + "epoch": 2.5462145843009756, + "grad_norm": 0.17811015681571632, + "learning_rate": 8.382099827882962e-06, + "loss": 0.2758, + "step": 2742 + }, + { + "epoch": 2.5471435206688344, + "grad_norm": 0.1791864260525822, + "learning_rate": 8.36488812392427e-06, + "loss": 0.2868, + "step": 2743 + }, + { + "epoch": 2.548072457036693, + "grad_norm": 0.18193816987523823, + "learning_rate": 8.347676419965577e-06, + "loss": 0.2759, + "step": 2744 + }, + { + "epoch": 2.5490013934045517, + "grad_norm": 0.18066533942488144, + "learning_rate": 8.330464716006885e-06, + "loss": 0.2671, + "step": 2745 + }, + { + "epoch": 2.5499303297724105, + "grad_norm": 0.17448722669503738, + "learning_rate": 8.313253012048194e-06, + "loss": 0.2747, + "step": 2746 + }, + { + "epoch": 2.5508592661402694, + "grad_norm": 0.1818779797983094, + "learning_rate": 8.296041308089502e-06, + "loss": 0.2836, + "step": 2747 + }, + { + "epoch": 2.551788202508128, + "grad_norm": 0.1802872411024581, + "learning_rate": 8.27882960413081e-06, + "loss": 0.2762, + "step": 2748 + }, + { + "epoch": 2.552717138875987, + "grad_norm": 0.17919426431481447, + "learning_rate": 8.261617900172119e-06, + "loss": 0.2844, + "step": 2749 + }, + { + "epoch": 2.553646075243846, + "grad_norm": 0.17435336893217318, + "learning_rate": 8.244406196213425e-06, + "loss": 0.2803, + "step": 2750 + }, + { + "epoch": 2.5545750116117047, + "grad_norm": 0.19650206355264208, + "learning_rate": 8.227194492254734e-06, + "loss": 0.2995, + "step": 2751 + }, + { + "epoch": 2.555503947979563, + "grad_norm": 0.1751970358659085, + "learning_rate": 8.209982788296042e-06, + "loss": 0.272, + "step": 2752 + }, + { + "epoch": 2.556432884347422, + "grad_norm": 0.1690503222258136, + "learning_rate": 8.19277108433735e-06, + "loss": 0.2739, + "step": 2753 + }, + { + "epoch": 2.557361820715281, + "grad_norm": 0.17620039647154925, + "learning_rate": 8.175559380378659e-06, + "loss": 0.2805, + "step": 2754 + }, + { + "epoch": 2.5582907570831397, + "grad_norm": 0.17388899053880985, + "learning_rate": 8.158347676419966e-06, + "loss": 0.2704, + "step": 2755 + }, + { + "epoch": 2.5592196934509985, + "grad_norm": 0.17685736263136762, + "learning_rate": 8.141135972461274e-06, + "loss": 0.2649, + "step": 2756 + }, + { + "epoch": 2.5601486298188574, + "grad_norm": 0.17315801191584784, + "learning_rate": 8.123924268502583e-06, + "loss": 0.2783, + "step": 2757 + }, + { + "epoch": 2.5610775661867162, + "grad_norm": 0.17710989290259285, + "learning_rate": 8.106712564543891e-06, + "loss": 0.2833, + "step": 2758 + }, + { + "epoch": 2.562006502554575, + "grad_norm": 0.17693674550109328, + "learning_rate": 8.0895008605852e-06, + "loss": 0.28, + "step": 2759 + }, + { + "epoch": 2.562935438922434, + "grad_norm": 0.18006819440338806, + "learning_rate": 8.072289156626506e-06, + "loss": 0.2793, + "step": 2760 + }, + { + "epoch": 2.5638643752902928, + "grad_norm": 0.17402028938476224, + "learning_rate": 8.055077452667814e-06, + "loss": 0.2806, + "step": 2761 + }, + { + "epoch": 2.5647933116581516, + "grad_norm": 0.1756389111979832, + "learning_rate": 8.037865748709123e-06, + "loss": 0.2797, + "step": 2762 + }, + { + "epoch": 2.5657222480260105, + "grad_norm": 0.1798104235898651, + "learning_rate": 8.020654044750431e-06, + "loss": 0.2801, + "step": 2763 + }, + { + "epoch": 2.566651184393869, + "grad_norm": 0.17595147246541074, + "learning_rate": 8.00344234079174e-06, + "loss": 0.2796, + "step": 2764 + }, + { + "epoch": 2.5675801207617277, + "grad_norm": 0.1866139051974165, + "learning_rate": 7.986230636833048e-06, + "loss": 0.2792, + "step": 2765 + }, + { + "epoch": 2.5685090571295865, + "grad_norm": 0.17242021849415873, + "learning_rate": 7.969018932874355e-06, + "loss": 0.2799, + "step": 2766 + }, + { + "epoch": 2.5694379934974454, + "grad_norm": 0.1722563276561945, + "learning_rate": 7.951807228915663e-06, + "loss": 0.2658, + "step": 2767 + }, + { + "epoch": 2.5703669298653042, + "grad_norm": 0.18138158515978361, + "learning_rate": 7.934595524956972e-06, + "loss": 0.2937, + "step": 2768 + }, + { + "epoch": 2.571295866233163, + "grad_norm": 0.17876412039688724, + "learning_rate": 7.91738382099828e-06, + "loss": 0.2749, + "step": 2769 + }, + { + "epoch": 2.572224802601022, + "grad_norm": 0.17551798053405512, + "learning_rate": 7.900172117039588e-06, + "loss": 0.2755, + "step": 2770 + }, + { + "epoch": 2.5731537389688808, + "grad_norm": 0.17206297732377063, + "learning_rate": 7.882960413080895e-06, + "loss": 0.2729, + "step": 2771 + }, + { + "epoch": 2.574082675336739, + "grad_norm": 0.17868585578978832, + "learning_rate": 7.865748709122203e-06, + "loss": 0.2958, + "step": 2772 + }, + { + "epoch": 2.575011611704598, + "grad_norm": 0.17935135793930987, + "learning_rate": 7.848537005163512e-06, + "loss": 0.2679, + "step": 2773 + }, + { + "epoch": 2.575940548072457, + "grad_norm": 0.16765676035780458, + "learning_rate": 7.83132530120482e-06, + "loss": 0.2648, + "step": 2774 + }, + { + "epoch": 2.5768694844403157, + "grad_norm": 0.18548752107627353, + "learning_rate": 7.814113597246129e-06, + "loss": 0.2953, + "step": 2775 + }, + { + "epoch": 2.5777984208081746, + "grad_norm": 0.18625401386712584, + "learning_rate": 7.796901893287435e-06, + "loss": 0.2903, + "step": 2776 + }, + { + "epoch": 2.5787273571760334, + "grad_norm": 0.17797790691377569, + "learning_rate": 7.779690189328744e-06, + "loss": 0.2587, + "step": 2777 + }, + { + "epoch": 2.5796562935438923, + "grad_norm": 0.17169551543009043, + "learning_rate": 7.762478485370052e-06, + "loss": 0.2591, + "step": 2778 + }, + { + "epoch": 2.580585229911751, + "grad_norm": 0.17885288708688196, + "learning_rate": 7.74526678141136e-06, + "loss": 0.2644, + "step": 2779 + }, + { + "epoch": 2.58151416627961, + "grad_norm": 0.17569046883577338, + "learning_rate": 7.728055077452669e-06, + "loss": 0.2802, + "step": 2780 + }, + { + "epoch": 2.582443102647469, + "grad_norm": 0.18426157444373517, + "learning_rate": 7.710843373493977e-06, + "loss": 0.2734, + "step": 2781 + }, + { + "epoch": 2.5833720390153276, + "grad_norm": 0.18868415686038092, + "learning_rate": 7.693631669535284e-06, + "loss": 0.2862, + "step": 2782 + }, + { + "epoch": 2.5843009753831865, + "grad_norm": 0.1826628963868718, + "learning_rate": 7.676419965576592e-06, + "loss": 0.275, + "step": 2783 + }, + { + "epoch": 2.5852299117510453, + "grad_norm": 0.18305810292468144, + "learning_rate": 7.659208261617901e-06, + "loss": 0.2783, + "step": 2784 + }, + { + "epoch": 2.5861588481189037, + "grad_norm": 0.19042265177468026, + "learning_rate": 7.64199655765921e-06, + "loss": 0.2836, + "step": 2785 + }, + { + "epoch": 2.5870877844867626, + "grad_norm": 0.17182388584361694, + "learning_rate": 7.624784853700518e-06, + "loss": 0.2758, + "step": 2786 + }, + { + "epoch": 2.5880167208546214, + "grad_norm": 0.16171186896342973, + "learning_rate": 7.607573149741824e-06, + "loss": 0.27, + "step": 2787 + }, + { + "epoch": 2.5889456572224803, + "grad_norm": 0.16522516692804265, + "learning_rate": 7.590361445783133e-06, + "loss": 0.2518, + "step": 2788 + }, + { + "epoch": 2.589874593590339, + "grad_norm": 0.18331019909753513, + "learning_rate": 7.573149741824441e-06, + "loss": 0.2866, + "step": 2789 + }, + { + "epoch": 2.590803529958198, + "grad_norm": 0.18035348067205553, + "learning_rate": 7.55593803786575e-06, + "loss": 0.2613, + "step": 2790 + }, + { + "epoch": 2.591732466326057, + "grad_norm": 0.18295229566848326, + "learning_rate": 7.538726333907058e-06, + "loss": 0.272, + "step": 2791 + }, + { + "epoch": 2.592661402693915, + "grad_norm": 0.16902236952248076, + "learning_rate": 7.521514629948365e-06, + "loss": 0.2676, + "step": 2792 + }, + { + "epoch": 2.593590339061774, + "grad_norm": 0.18044289901142607, + "learning_rate": 7.504302925989673e-06, + "loss": 0.2907, + "step": 2793 + }, + { + "epoch": 2.594519275429633, + "grad_norm": 0.18658630020919675, + "learning_rate": 7.4870912220309815e-06, + "loss": 0.2898, + "step": 2794 + }, + { + "epoch": 2.5954482117974917, + "grad_norm": 0.1659203986129596, + "learning_rate": 7.46987951807229e-06, + "loss": 0.2767, + "step": 2795 + }, + { + "epoch": 2.5963771481653506, + "grad_norm": 0.18224464598969245, + "learning_rate": 7.452667814113598e-06, + "loss": 0.2744, + "step": 2796 + }, + { + "epoch": 2.5973060845332094, + "grad_norm": 0.18798345715683007, + "learning_rate": 7.435456110154905e-06, + "loss": 0.2803, + "step": 2797 + }, + { + "epoch": 2.5982350209010683, + "grad_norm": 0.17498384353384416, + "learning_rate": 7.418244406196213e-06, + "loss": 0.2752, + "step": 2798 + }, + { + "epoch": 2.599163957268927, + "grad_norm": 0.17390827410315354, + "learning_rate": 7.401032702237522e-06, + "loss": 0.2713, + "step": 2799 + }, + { + "epoch": 2.600092893636786, + "grad_norm": 0.17905924555277583, + "learning_rate": 7.38382099827883e-06, + "loss": 0.272, + "step": 2800 + }, + { + "epoch": 2.601021830004645, + "grad_norm": 0.19237045567028638, + "learning_rate": 7.366609294320139e-06, + "loss": 0.2754, + "step": 2801 + }, + { + "epoch": 2.6019507663725037, + "grad_norm": 0.19081730075922945, + "learning_rate": 7.349397590361447e-06, + "loss": 0.2784, + "step": 2802 + }, + { + "epoch": 2.6028797027403625, + "grad_norm": 0.1773815781690385, + "learning_rate": 7.332185886402754e-06, + "loss": 0.2812, + "step": 2803 + }, + { + "epoch": 2.6038086391082214, + "grad_norm": 0.18905163088336832, + "learning_rate": 7.314974182444062e-06, + "loss": 0.2798, + "step": 2804 + }, + { + "epoch": 2.6047375754760798, + "grad_norm": 0.17873348755643859, + "learning_rate": 7.2977624784853705e-06, + "loss": 0.2716, + "step": 2805 + }, + { + "epoch": 2.6056665118439386, + "grad_norm": 0.17030522579544236, + "learning_rate": 7.280550774526679e-06, + "loss": 0.2685, + "step": 2806 + }, + { + "epoch": 2.6065954482117974, + "grad_norm": 0.1862035284607427, + "learning_rate": 7.263339070567987e-06, + "loss": 0.2788, + "step": 2807 + }, + { + "epoch": 2.6075243845796563, + "grad_norm": 0.17337568922147115, + "learning_rate": 7.246127366609294e-06, + "loss": 0.2642, + "step": 2808 + }, + { + "epoch": 2.608453320947515, + "grad_norm": 0.19450967596793517, + "learning_rate": 7.228915662650602e-06, + "loss": 0.2727, + "step": 2809 + }, + { + "epoch": 2.609382257315374, + "grad_norm": 0.1893548613084817, + "learning_rate": 7.211703958691911e-06, + "loss": 0.2791, + "step": 2810 + }, + { + "epoch": 2.610311193683233, + "grad_norm": 0.18755670041525413, + "learning_rate": 7.194492254733219e-06, + "loss": 0.2893, + "step": 2811 + }, + { + "epoch": 2.6112401300510912, + "grad_norm": 0.1737293113823649, + "learning_rate": 7.177280550774528e-06, + "loss": 0.2654, + "step": 2812 + }, + { + "epoch": 2.61216906641895, + "grad_norm": 0.17980094681636927, + "learning_rate": 7.160068846815834e-06, + "loss": 0.2801, + "step": 2813 + }, + { + "epoch": 2.613098002786809, + "grad_norm": 0.17904359523084443, + "learning_rate": 7.142857142857143e-06, + "loss": 0.2767, + "step": 2814 + }, + { + "epoch": 2.6140269391546678, + "grad_norm": 0.17364837747203463, + "learning_rate": 7.125645438898451e-06, + "loss": 0.2702, + "step": 2815 + }, + { + "epoch": 2.6149558755225266, + "grad_norm": 0.17617804961029349, + "learning_rate": 7.1084337349397595e-06, + "loss": 0.2642, + "step": 2816 + }, + { + "epoch": 2.6158848118903855, + "grad_norm": 0.1730303401260583, + "learning_rate": 7.091222030981068e-06, + "loss": 0.2778, + "step": 2817 + }, + { + "epoch": 2.6168137482582443, + "grad_norm": 0.1831938804451245, + "learning_rate": 7.074010327022376e-06, + "loss": 0.2811, + "step": 2818 + }, + { + "epoch": 2.617742684626103, + "grad_norm": 0.1753374582449086, + "learning_rate": 7.056798623063683e-06, + "loss": 0.2717, + "step": 2819 + }, + { + "epoch": 2.618671620993962, + "grad_norm": 0.18343820977770026, + "learning_rate": 7.039586919104991e-06, + "loss": 0.3012, + "step": 2820 + }, + { + "epoch": 2.619600557361821, + "grad_norm": 0.18184849598170855, + "learning_rate": 7.0223752151463e-06, + "loss": 0.2697, + "step": 2821 + }, + { + "epoch": 2.6205294937296797, + "grad_norm": 0.167860177827926, + "learning_rate": 7.005163511187608e-06, + "loss": 0.2784, + "step": 2822 + }, + { + "epoch": 2.6214584300975385, + "grad_norm": 0.17080098742603222, + "learning_rate": 6.987951807228917e-06, + "loss": 0.2577, + "step": 2823 + }, + { + "epoch": 2.6223873664653974, + "grad_norm": 0.1883622438215358, + "learning_rate": 6.970740103270223e-06, + "loss": 0.2879, + "step": 2824 + }, + { + "epoch": 2.623316302833256, + "grad_norm": 0.1744115106332342, + "learning_rate": 6.953528399311532e-06, + "loss": 0.2739, + "step": 2825 + }, + { + "epoch": 2.6242452392011146, + "grad_norm": 0.16819532461761638, + "learning_rate": 6.93631669535284e-06, + "loss": 0.262, + "step": 2826 + }, + { + "epoch": 2.6251741755689735, + "grad_norm": 0.1760702927558675, + "learning_rate": 6.9191049913941485e-06, + "loss": 0.2698, + "step": 2827 + }, + { + "epoch": 2.6261031119368323, + "grad_norm": 0.17497265930471442, + "learning_rate": 6.901893287435457e-06, + "loss": 0.268, + "step": 2828 + }, + { + "epoch": 2.627032048304691, + "grad_norm": 0.23004773700222614, + "learning_rate": 6.884681583476764e-06, + "loss": 0.2662, + "step": 2829 + }, + { + "epoch": 2.62796098467255, + "grad_norm": 0.1757075749648394, + "learning_rate": 6.867469879518072e-06, + "loss": 0.2788, + "step": 2830 + }, + { + "epoch": 2.628889921040409, + "grad_norm": 0.1817582870520524, + "learning_rate": 6.8502581755593804e-06, + "loss": 0.2864, + "step": 2831 + }, + { + "epoch": 2.6298188574082673, + "grad_norm": 0.17943345206651734, + "learning_rate": 6.833046471600689e-06, + "loss": 0.2709, + "step": 2832 + }, + { + "epoch": 2.630747793776126, + "grad_norm": 0.18193170849342033, + "learning_rate": 6.815834767641997e-06, + "loss": 0.2776, + "step": 2833 + }, + { + "epoch": 2.631676730143985, + "grad_norm": 0.17203280602668908, + "learning_rate": 6.798623063683306e-06, + "loss": 0.2619, + "step": 2834 + }, + { + "epoch": 2.632605666511844, + "grad_norm": 0.17778486134836455, + "learning_rate": 6.781411359724612e-06, + "loss": 0.2825, + "step": 2835 + }, + { + "epoch": 2.6335346028797026, + "grad_norm": 0.17844868569560363, + "learning_rate": 6.764199655765921e-06, + "loss": 0.2858, + "step": 2836 + }, + { + "epoch": 2.6344635392475615, + "grad_norm": 0.17726568802995427, + "learning_rate": 6.746987951807229e-06, + "loss": 0.2811, + "step": 2837 + }, + { + "epoch": 2.6353924756154203, + "grad_norm": 0.16952025898385292, + "learning_rate": 6.7297762478485375e-06, + "loss": 0.271, + "step": 2838 + }, + { + "epoch": 2.636321411983279, + "grad_norm": 0.16547292535807162, + "learning_rate": 6.712564543889846e-06, + "loss": 0.2685, + "step": 2839 + }, + { + "epoch": 2.637250348351138, + "grad_norm": 0.17302245425655283, + "learning_rate": 6.695352839931153e-06, + "loss": 0.271, + "step": 2840 + }, + { + "epoch": 2.638179284718997, + "grad_norm": 0.16762567096195974, + "learning_rate": 6.678141135972461e-06, + "loss": 0.266, + "step": 2841 + }, + { + "epoch": 2.6391082210868557, + "grad_norm": 0.1699246368225437, + "learning_rate": 6.6609294320137694e-06, + "loss": 0.2696, + "step": 2842 + }, + { + "epoch": 2.6400371574547146, + "grad_norm": 0.16340294143468365, + "learning_rate": 6.643717728055078e-06, + "loss": 0.2732, + "step": 2843 + }, + { + "epoch": 2.6409660938225734, + "grad_norm": 0.17885057916542138, + "learning_rate": 6.626506024096386e-06, + "loss": 0.2896, + "step": 2844 + }, + { + "epoch": 2.641895030190432, + "grad_norm": 0.17028443868648646, + "learning_rate": 6.609294320137693e-06, + "loss": 0.2771, + "step": 2845 + }, + { + "epoch": 2.6428239665582907, + "grad_norm": 0.17146793678084205, + "learning_rate": 6.592082616179001e-06, + "loss": 0.277, + "step": 2846 + }, + { + "epoch": 2.6437529029261495, + "grad_norm": 0.18046897103380846, + "learning_rate": 6.57487091222031e-06, + "loss": 0.2712, + "step": 2847 + }, + { + "epoch": 2.6446818392940084, + "grad_norm": 0.17765626048887875, + "learning_rate": 6.557659208261618e-06, + "loss": 0.2755, + "step": 2848 + }, + { + "epoch": 2.645610775661867, + "grad_norm": 0.16752452222262293, + "learning_rate": 6.5404475043029266e-06, + "loss": 0.2676, + "step": 2849 + }, + { + "epoch": 2.646539712029726, + "grad_norm": 0.17019441380452832, + "learning_rate": 6.523235800344234e-06, + "loss": 0.27, + "step": 2850 + }, + { + "epoch": 2.647468648397585, + "grad_norm": 0.20022193692643978, + "learning_rate": 6.506024096385542e-06, + "loss": 0.2719, + "step": 2851 + }, + { + "epoch": 2.6483975847654433, + "grad_norm": 0.18034574748663765, + "learning_rate": 6.48881239242685e-06, + "loss": 0.2815, + "step": 2852 + }, + { + "epoch": 2.649326521133302, + "grad_norm": 0.17256922959908166, + "learning_rate": 6.4716006884681585e-06, + "loss": 0.2822, + "step": 2853 + }, + { + "epoch": 2.650255457501161, + "grad_norm": 0.17557669549784571, + "learning_rate": 6.454388984509467e-06, + "loss": 0.2819, + "step": 2854 + }, + { + "epoch": 2.65118439386902, + "grad_norm": 0.18357507050027355, + "learning_rate": 6.437177280550775e-06, + "loss": 0.2856, + "step": 2855 + }, + { + "epoch": 2.6521133302368787, + "grad_norm": 0.17883677795624237, + "learning_rate": 6.419965576592083e-06, + "loss": 0.278, + "step": 2856 + }, + { + "epoch": 2.6530422666047375, + "grad_norm": 0.1626862574701004, + "learning_rate": 6.40275387263339e-06, + "loss": 0.2574, + "step": 2857 + }, + { + "epoch": 2.6539712029725964, + "grad_norm": 0.16383951586471682, + "learning_rate": 6.385542168674699e-06, + "loss": 0.2752, + "step": 2858 + }, + { + "epoch": 2.654900139340455, + "grad_norm": 0.16774157905432333, + "learning_rate": 6.368330464716007e-06, + "loss": 0.2788, + "step": 2859 + }, + { + "epoch": 2.655829075708314, + "grad_norm": 0.17064891524490505, + "learning_rate": 6.3511187607573156e-06, + "loss": 0.2744, + "step": 2860 + }, + { + "epoch": 2.656758012076173, + "grad_norm": 0.18019181299443282, + "learning_rate": 6.333907056798623e-06, + "loss": 0.2749, + "step": 2861 + }, + { + "epoch": 2.6576869484440317, + "grad_norm": 0.18065522881499407, + "learning_rate": 6.3166953528399315e-06, + "loss": 0.2781, + "step": 2862 + }, + { + "epoch": 2.6586158848118906, + "grad_norm": 0.16699071545783015, + "learning_rate": 6.299483648881239e-06, + "loss": 0.2699, + "step": 2863 + }, + { + "epoch": 2.6595448211797494, + "grad_norm": 0.17960486746690257, + "learning_rate": 6.2822719449225475e-06, + "loss": 0.2701, + "step": 2864 + }, + { + "epoch": 2.660473757547608, + "grad_norm": 0.16751383073112, + "learning_rate": 6.265060240963856e-06, + "loss": 0.2706, + "step": 2865 + }, + { + "epoch": 2.6614026939154667, + "grad_norm": 0.17012354333185065, + "learning_rate": 6.2478485370051634e-06, + "loss": 0.2603, + "step": 2866 + }, + { + "epoch": 2.6623316302833255, + "grad_norm": 0.16323188266871058, + "learning_rate": 6.230636833046472e-06, + "loss": 0.2587, + "step": 2867 + }, + { + "epoch": 2.6632605666511844, + "grad_norm": 0.16800338995254763, + "learning_rate": 6.21342512908778e-06, + "loss": 0.2844, + "step": 2868 + }, + { + "epoch": 2.6641895030190432, + "grad_norm": 0.17523825857623324, + "learning_rate": 6.196213425129088e-06, + "loss": 0.2664, + "step": 2869 + }, + { + "epoch": 2.665118439386902, + "grad_norm": 0.17259113929735792, + "learning_rate": 6.179001721170396e-06, + "loss": 0.2682, + "step": 2870 + }, + { + "epoch": 2.666047375754761, + "grad_norm": 0.15843773315912996, + "learning_rate": 6.161790017211705e-06, + "loss": 0.2526, + "step": 2871 + }, + { + "epoch": 2.6669763121226198, + "grad_norm": 0.1667368341714137, + "learning_rate": 6.144578313253013e-06, + "loss": 0.2836, + "step": 2872 + }, + { + "epoch": 2.667905248490478, + "grad_norm": 0.18356977684527812, + "learning_rate": 6.1273666092943205e-06, + "loss": 0.2879, + "step": 2873 + }, + { + "epoch": 2.668834184858337, + "grad_norm": 0.1718844280644391, + "learning_rate": 6.110154905335629e-06, + "loss": 0.2691, + "step": 2874 + }, + { + "epoch": 2.669763121226196, + "grad_norm": 0.17542563761145286, + "learning_rate": 6.092943201376937e-06, + "loss": 0.2835, + "step": 2875 + }, + { + "epoch": 2.6706920575940547, + "grad_norm": 0.18112601734776523, + "learning_rate": 6.075731497418245e-06, + "loss": 0.2813, + "step": 2876 + }, + { + "epoch": 2.6716209939619135, + "grad_norm": 0.17517423987429506, + "learning_rate": 6.058519793459553e-06, + "loss": 0.2769, + "step": 2877 + }, + { + "epoch": 2.6725499303297724, + "grad_norm": 0.17260026662133704, + "learning_rate": 6.041308089500861e-06, + "loss": 0.2764, + "step": 2878 + }, + { + "epoch": 2.6734788666976312, + "grad_norm": 0.17274079216676472, + "learning_rate": 6.024096385542169e-06, + "loss": 0.2633, + "step": 2879 + }, + { + "epoch": 2.67440780306549, + "grad_norm": 0.1749655577705151, + "learning_rate": 6.006884681583478e-06, + "loss": 0.2801, + "step": 2880 + }, + { + "epoch": 2.675336739433349, + "grad_norm": 0.18151574461910705, + "learning_rate": 5.989672977624785e-06, + "loss": 0.2866, + "step": 2881 + }, + { + "epoch": 2.6762656758012078, + "grad_norm": 0.1603999496686991, + "learning_rate": 5.972461273666094e-06, + "loss": 0.2582, + "step": 2882 + }, + { + "epoch": 2.6771946121690666, + "grad_norm": 0.1757361166235524, + "learning_rate": 5.955249569707402e-06, + "loss": 0.2844, + "step": 2883 + }, + { + "epoch": 2.6781235485369255, + "grad_norm": 0.17402452532430263, + "learning_rate": 5.9380378657487095e-06, + "loss": 0.2765, + "step": 2884 + }, + { + "epoch": 2.679052484904784, + "grad_norm": 0.1709408589895161, + "learning_rate": 5.920826161790018e-06, + "loss": 0.2838, + "step": 2885 + }, + { + "epoch": 2.6799814212726427, + "grad_norm": 0.17361142927299925, + "learning_rate": 5.9036144578313255e-06, + "loss": 0.2868, + "step": 2886 + }, + { + "epoch": 2.6809103576405016, + "grad_norm": 0.1597414246640021, + "learning_rate": 5.886402753872634e-06, + "loss": 0.2611, + "step": 2887 + }, + { + "epoch": 2.6818392940083604, + "grad_norm": 0.16013784375486953, + "learning_rate": 5.869191049913942e-06, + "loss": 0.2557, + "step": 2888 + }, + { + "epoch": 2.6827682303762193, + "grad_norm": 0.15920257619823422, + "learning_rate": 5.85197934595525e-06, + "loss": 0.2537, + "step": 2889 + }, + { + "epoch": 2.683697166744078, + "grad_norm": 0.19602228120008208, + "learning_rate": 5.834767641996558e-06, + "loss": 0.2767, + "step": 2890 + }, + { + "epoch": 2.684626103111937, + "grad_norm": 0.16026967401774392, + "learning_rate": 5.817555938037867e-06, + "loss": 0.2652, + "step": 2891 + }, + { + "epoch": 2.685555039479796, + "grad_norm": 0.16763626760535624, + "learning_rate": 5.800344234079174e-06, + "loss": 0.2797, + "step": 2892 + }, + { + "epoch": 2.686483975847654, + "grad_norm": 0.1706032665023045, + "learning_rate": 5.783132530120483e-06, + "loss": 0.2744, + "step": 2893 + }, + { + "epoch": 2.687412912215513, + "grad_norm": 0.18732977118836014, + "learning_rate": 5.76592082616179e-06, + "loss": 0.2924, + "step": 2894 + }, + { + "epoch": 2.688341848583372, + "grad_norm": 0.17185758153715494, + "learning_rate": 5.7487091222030986e-06, + "loss": 0.2655, + "step": 2895 + }, + { + "epoch": 2.6892707849512307, + "grad_norm": 0.17530485016367162, + "learning_rate": 5.731497418244407e-06, + "loss": 0.2847, + "step": 2896 + }, + { + "epoch": 2.6901997213190896, + "grad_norm": 0.15863803214886732, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.2704, + "step": 2897 + }, + { + "epoch": 2.6911286576869484, + "grad_norm": 0.1691861266842899, + "learning_rate": 5.697074010327023e-06, + "loss": 0.29, + "step": 2898 + }, + { + "epoch": 2.6920575940548073, + "grad_norm": 0.16970518454179528, + "learning_rate": 5.679862306368331e-06, + "loss": 0.2614, + "step": 2899 + }, + { + "epoch": 2.692986530422666, + "grad_norm": 0.1741414021917801, + "learning_rate": 5.662650602409639e-06, + "loss": 0.2822, + "step": 2900 + }, + { + "epoch": 2.693915466790525, + "grad_norm": 0.1681316865512381, + "learning_rate": 5.645438898450947e-06, + "loss": 0.2577, + "step": 2901 + }, + { + "epoch": 2.694844403158384, + "grad_norm": 0.16184757987994441, + "learning_rate": 5.628227194492255e-06, + "loss": 0.2583, + "step": 2902 + }, + { + "epoch": 2.6957733395262427, + "grad_norm": 0.16281128048095134, + "learning_rate": 5.611015490533563e-06, + "loss": 0.2641, + "step": 2903 + }, + { + "epoch": 2.6967022758941015, + "grad_norm": 0.16822832216234318, + "learning_rate": 5.593803786574872e-06, + "loss": 0.2674, + "step": 2904 + }, + { + "epoch": 2.69763121226196, + "grad_norm": 0.1660704396611372, + "learning_rate": 5.576592082616179e-06, + "loss": 0.2693, + "step": 2905 + }, + { + "epoch": 2.6985601486298187, + "grad_norm": 0.17603534664952974, + "learning_rate": 5.5593803786574876e-06, + "loss": 0.2913, + "step": 2906 + }, + { + "epoch": 2.6994890849976776, + "grad_norm": 0.16173314477849363, + "learning_rate": 5.542168674698795e-06, + "loss": 0.2565, + "step": 2907 + }, + { + "epoch": 2.7004180213655364, + "grad_norm": 0.1663575560714465, + "learning_rate": 5.5249569707401035e-06, + "loss": 0.2748, + "step": 2908 + }, + { + "epoch": 2.7013469577333953, + "grad_norm": 0.1721550238413437, + "learning_rate": 5.507745266781412e-06, + "loss": 0.2912, + "step": 2909 + }, + { + "epoch": 2.702275894101254, + "grad_norm": 0.16768308926561104, + "learning_rate": 5.4905335628227195e-06, + "loss": 0.2666, + "step": 2910 + }, + { + "epoch": 2.703204830469113, + "grad_norm": 0.18026118938082336, + "learning_rate": 5.473321858864028e-06, + "loss": 0.2742, + "step": 2911 + }, + { + "epoch": 2.704133766836972, + "grad_norm": 0.17114811293445578, + "learning_rate": 5.456110154905336e-06, + "loss": 0.2869, + "step": 2912 + }, + { + "epoch": 2.70506270320483, + "grad_norm": 0.1897353218307368, + "learning_rate": 5.438898450946644e-06, + "loss": 0.2891, + "step": 2913 + }, + { + "epoch": 2.705991639572689, + "grad_norm": 0.17270248612365974, + "learning_rate": 5.421686746987952e-06, + "loss": 0.262, + "step": 2914 + }, + { + "epoch": 2.706920575940548, + "grad_norm": 0.16787775314760653, + "learning_rate": 5.40447504302926e-06, + "loss": 0.2698, + "step": 2915 + }, + { + "epoch": 2.7078495123084068, + "grad_norm": 0.1969016770528086, + "learning_rate": 5.387263339070568e-06, + "loss": 0.2822, + "step": 2916 + }, + { + "epoch": 2.7087784486762656, + "grad_norm": 0.16996615370604629, + "learning_rate": 5.370051635111877e-06, + "loss": 0.2695, + "step": 2917 + }, + { + "epoch": 2.7097073850441245, + "grad_norm": 0.16671749967465915, + "learning_rate": 5.352839931153184e-06, + "loss": 0.2673, + "step": 2918 + }, + { + "epoch": 2.7106363214119833, + "grad_norm": 0.17148870032680638, + "learning_rate": 5.3356282271944925e-06, + "loss": 0.2736, + "step": 2919 + }, + { + "epoch": 2.711565257779842, + "grad_norm": 0.16182282721435845, + "learning_rate": 5.318416523235801e-06, + "loss": 0.255, + "step": 2920 + }, + { + "epoch": 2.712494194147701, + "grad_norm": 0.18004793360917773, + "learning_rate": 5.3012048192771085e-06, + "loss": 0.3005, + "step": 2921 + }, + { + "epoch": 2.71342313051556, + "grad_norm": 0.16893534756516349, + "learning_rate": 5.283993115318417e-06, + "loss": 0.2614, + "step": 2922 + }, + { + "epoch": 2.7143520668834187, + "grad_norm": 0.17993627923927366, + "learning_rate": 5.2667814113597244e-06, + "loss": 0.2829, + "step": 2923 + }, + { + "epoch": 2.7152810032512775, + "grad_norm": 0.16985404866566564, + "learning_rate": 5.249569707401033e-06, + "loss": 0.2714, + "step": 2924 + }, + { + "epoch": 2.716209939619136, + "grad_norm": 0.16997003285554874, + "learning_rate": 5.232358003442341e-06, + "loss": 0.2723, + "step": 2925 + }, + { + "epoch": 2.7171388759869948, + "grad_norm": 0.17280395561719686, + "learning_rate": 5.215146299483649e-06, + "loss": 0.2806, + "step": 2926 + }, + { + "epoch": 2.7180678123548536, + "grad_norm": 0.18591179771276334, + "learning_rate": 5.197934595524957e-06, + "loss": 0.2839, + "step": 2927 + }, + { + "epoch": 2.7189967487227125, + "grad_norm": 0.16848714989466346, + "learning_rate": 5.180722891566266e-06, + "loss": 0.27, + "step": 2928 + }, + { + "epoch": 2.7199256850905713, + "grad_norm": 0.17396159174476428, + "learning_rate": 5.163511187607573e-06, + "loss": 0.2887, + "step": 2929 + }, + { + "epoch": 2.72085462145843, + "grad_norm": 0.16362910998344926, + "learning_rate": 5.1462994836488815e-06, + "loss": 0.2677, + "step": 2930 + }, + { + "epoch": 2.721783557826289, + "grad_norm": 0.17616701253337705, + "learning_rate": 5.129087779690189e-06, + "loss": 0.2805, + "step": 2931 + }, + { + "epoch": 2.722712494194148, + "grad_norm": 0.16554867783146326, + "learning_rate": 5.1118760757314975e-06, + "loss": 0.2604, + "step": 2932 + }, + { + "epoch": 2.7236414305620062, + "grad_norm": 0.1830057396941038, + "learning_rate": 5.094664371772806e-06, + "loss": 0.2661, + "step": 2933 + }, + { + "epoch": 2.724570366929865, + "grad_norm": 0.16804873136561682, + "learning_rate": 5.0774526678141135e-06, + "loss": 0.2761, + "step": 2934 + }, + { + "epoch": 2.725499303297724, + "grad_norm": 0.17881514628477865, + "learning_rate": 5.060240963855422e-06, + "loss": 0.2833, + "step": 2935 + }, + { + "epoch": 2.726428239665583, + "grad_norm": 0.18028085375147532, + "learning_rate": 5.04302925989673e-06, + "loss": 0.28, + "step": 2936 + }, + { + "epoch": 2.7273571760334416, + "grad_norm": 0.17128865792041942, + "learning_rate": 5.025817555938038e-06, + "loss": 0.2827, + "step": 2937 + }, + { + "epoch": 2.7282861124013005, + "grad_norm": 0.17508069891904077, + "learning_rate": 5.008605851979346e-06, + "loss": 0.2846, + "step": 2938 + }, + { + "epoch": 2.7292150487691593, + "grad_norm": 0.1624442446983855, + "learning_rate": 4.991394148020654e-06, + "loss": 0.2605, + "step": 2939 + }, + { + "epoch": 2.730143985137018, + "grad_norm": 0.16717663468832003, + "learning_rate": 4.974182444061962e-06, + "loss": 0.2562, + "step": 2940 + }, + { + "epoch": 2.731072921504877, + "grad_norm": 0.17435792481065424, + "learning_rate": 4.9569707401032706e-06, + "loss": 0.2641, + "step": 2941 + }, + { + "epoch": 2.732001857872736, + "grad_norm": 0.18206523972349564, + "learning_rate": 4.939759036144578e-06, + "loss": 0.2906, + "step": 2942 + }, + { + "epoch": 2.7329307942405947, + "grad_norm": 0.1660518040509173, + "learning_rate": 4.9225473321858865e-06, + "loss": 0.2616, + "step": 2943 + }, + { + "epoch": 2.7338597306084536, + "grad_norm": 0.17641521165468432, + "learning_rate": 4.905335628227195e-06, + "loss": 0.279, + "step": 2944 + }, + { + "epoch": 2.7347886669763124, + "grad_norm": 0.17021194005960247, + "learning_rate": 4.8881239242685025e-06, + "loss": 0.2662, + "step": 2945 + }, + { + "epoch": 2.735717603344171, + "grad_norm": 0.18680530424480912, + "learning_rate": 4.870912220309811e-06, + "loss": 0.279, + "step": 2946 + }, + { + "epoch": 2.7366465397120296, + "grad_norm": 0.16624112372533945, + "learning_rate": 4.853700516351118e-06, + "loss": 0.2603, + "step": 2947 + }, + { + "epoch": 2.7375754760798885, + "grad_norm": 0.1658532363601099, + "learning_rate": 4.836488812392427e-06, + "loss": 0.2573, + "step": 2948 + }, + { + "epoch": 2.7385044124477473, + "grad_norm": 0.17508939946629776, + "learning_rate": 4.819277108433735e-06, + "loss": 0.2748, + "step": 2949 + }, + { + "epoch": 2.739433348815606, + "grad_norm": 0.1697082547434485, + "learning_rate": 4.802065404475043e-06, + "loss": 0.277, + "step": 2950 + }, + { + "epoch": 2.740362285183465, + "grad_norm": 0.17265474187461444, + "learning_rate": 4.784853700516351e-06, + "loss": 0.2766, + "step": 2951 + }, + { + "epoch": 2.741291221551324, + "grad_norm": 0.1801581954287595, + "learning_rate": 4.7676419965576596e-06, + "loss": 0.2865, + "step": 2952 + }, + { + "epoch": 2.7422201579191823, + "grad_norm": 0.1671113061115391, + "learning_rate": 4.750430292598967e-06, + "loss": 0.2789, + "step": 2953 + }, + { + "epoch": 2.743149094287041, + "grad_norm": 0.1810191012298334, + "learning_rate": 4.7332185886402755e-06, + "loss": 0.2829, + "step": 2954 + }, + { + "epoch": 2.7440780306549, + "grad_norm": 0.1873382578762812, + "learning_rate": 4.716006884681583e-06, + "loss": 0.3014, + "step": 2955 + }, + { + "epoch": 2.745006967022759, + "grad_norm": 0.16836258371111407, + "learning_rate": 4.6987951807228915e-06, + "loss": 0.2777, + "step": 2956 + }, + { + "epoch": 2.7459359033906177, + "grad_norm": 0.17397396442719273, + "learning_rate": 4.6815834767642e-06, + "loss": 0.2794, + "step": 2957 + }, + { + "epoch": 2.7468648397584765, + "grad_norm": 0.17012803273497898, + "learning_rate": 4.6643717728055074e-06, + "loss": 0.2735, + "step": 2958 + }, + { + "epoch": 2.7477937761263354, + "grad_norm": 0.1726453541385025, + "learning_rate": 4.647160068846816e-06, + "loss": 0.2728, + "step": 2959 + }, + { + "epoch": 2.748722712494194, + "grad_norm": 0.1607579980714856, + "learning_rate": 4.629948364888123e-06, + "loss": 0.2652, + "step": 2960 + }, + { + "epoch": 2.749651648862053, + "grad_norm": 0.19173078813688318, + "learning_rate": 4.612736660929432e-06, + "loss": 0.2889, + "step": 2961 + }, + { + "epoch": 2.750580585229912, + "grad_norm": 0.16695194838007188, + "learning_rate": 4.59552495697074e-06, + "loss": 0.2711, + "step": 2962 + }, + { + "epoch": 2.7515095215977707, + "grad_norm": 0.1644552191700174, + "learning_rate": 4.578313253012049e-06, + "loss": 0.2612, + "step": 2963 + }, + { + "epoch": 2.7524384579656296, + "grad_norm": 0.16861408449465942, + "learning_rate": 4.561101549053356e-06, + "loss": 0.2672, + "step": 2964 + }, + { + "epoch": 2.7533673943334884, + "grad_norm": 0.16556411207676844, + "learning_rate": 4.5438898450946645e-06, + "loss": 0.2843, + "step": 2965 + }, + { + "epoch": 2.754296330701347, + "grad_norm": 0.1653424279361049, + "learning_rate": 4.526678141135973e-06, + "loss": 0.2698, + "step": 2966 + }, + { + "epoch": 2.7552252670692057, + "grad_norm": 0.17268687936725427, + "learning_rate": 4.5094664371772805e-06, + "loss": 0.2753, + "step": 2967 + }, + { + "epoch": 2.7561542034370645, + "grad_norm": 0.16679010719063728, + "learning_rate": 4.492254733218589e-06, + "loss": 0.2658, + "step": 2968 + }, + { + "epoch": 2.7570831398049234, + "grad_norm": 0.1731384049435442, + "learning_rate": 4.475043029259897e-06, + "loss": 0.2776, + "step": 2969 + }, + { + "epoch": 2.758012076172782, + "grad_norm": 0.17817981617485937, + "learning_rate": 4.457831325301205e-06, + "loss": 0.2723, + "step": 2970 + }, + { + "epoch": 2.758941012540641, + "grad_norm": 0.17631736307329407, + "learning_rate": 4.440619621342513e-06, + "loss": 0.29, + "step": 2971 + }, + { + "epoch": 2.7598699489085, + "grad_norm": 0.16557688303929388, + "learning_rate": 4.423407917383822e-06, + "loss": 0.2715, + "step": 2972 + }, + { + "epoch": 2.7607988852763583, + "grad_norm": 0.16830413475646644, + "learning_rate": 4.406196213425129e-06, + "loss": 0.2755, + "step": 2973 + }, + { + "epoch": 2.761727821644217, + "grad_norm": 0.16634439441042223, + "learning_rate": 4.388984509466438e-06, + "loss": 0.2821, + "step": 2974 + }, + { + "epoch": 2.762656758012076, + "grad_norm": 0.17593720968564058, + "learning_rate": 4.371772805507746e-06, + "loss": 0.2661, + "step": 2975 + }, + { + "epoch": 2.763585694379935, + "grad_norm": 0.16971061649067173, + "learning_rate": 4.3545611015490536e-06, + "loss": 0.2611, + "step": 2976 + }, + { + "epoch": 2.7645146307477937, + "grad_norm": 0.18290133340663603, + "learning_rate": 4.337349397590362e-06, + "loss": 0.296, + "step": 2977 + }, + { + "epoch": 2.7654435671156525, + "grad_norm": 0.16094186761731016, + "learning_rate": 4.32013769363167e-06, + "loss": 0.2599, + "step": 2978 + }, + { + "epoch": 2.7663725034835114, + "grad_norm": 0.1700225427982041, + "learning_rate": 4.302925989672978e-06, + "loss": 0.2851, + "step": 2979 + }, + { + "epoch": 2.7673014398513702, + "grad_norm": 0.169245573986097, + "learning_rate": 4.285714285714286e-06, + "loss": 0.2543, + "step": 2980 + }, + { + "epoch": 2.768230376219229, + "grad_norm": 0.16461588181504275, + "learning_rate": 4.268502581755595e-06, + "loss": 0.2671, + "step": 2981 + }, + { + "epoch": 2.769159312587088, + "grad_norm": 0.16325022819287804, + "learning_rate": 4.251290877796902e-06, + "loss": 0.2917, + "step": 2982 + }, + { + "epoch": 2.7700882489549468, + "grad_norm": 0.17634053929655194, + "learning_rate": 4.234079173838211e-06, + "loss": 0.289, + "step": 2983 + }, + { + "epoch": 2.7710171853228056, + "grad_norm": 0.17119702526297131, + "learning_rate": 4.216867469879518e-06, + "loss": 0.283, + "step": 2984 + }, + { + "epoch": 2.7719461216906645, + "grad_norm": 0.16474730190343942, + "learning_rate": 4.199655765920827e-06, + "loss": 0.2639, + "step": 2985 + }, + { + "epoch": 2.772875058058523, + "grad_norm": 0.33285250375326086, + "learning_rate": 4.182444061962135e-06, + "loss": 0.267, + "step": 2986 + }, + { + "epoch": 2.7738039944263817, + "grad_norm": 0.16364044631808572, + "learning_rate": 4.1652323580034426e-06, + "loss": 0.2717, + "step": 2987 + }, + { + "epoch": 2.7747329307942405, + "grad_norm": 0.16681061485901677, + "learning_rate": 4.148020654044751e-06, + "loss": 0.2683, + "step": 2988 + }, + { + "epoch": 2.7756618671620994, + "grad_norm": 0.1651724380142913, + "learning_rate": 4.130808950086059e-06, + "loss": 0.2704, + "step": 2989 + }, + { + "epoch": 2.7765908035299582, + "grad_norm": 0.1709084288247249, + "learning_rate": 4.113597246127367e-06, + "loss": 0.2819, + "step": 2990 + }, + { + "epoch": 2.777519739897817, + "grad_norm": 0.16357948699205382, + "learning_rate": 4.096385542168675e-06, + "loss": 0.2875, + "step": 2991 + }, + { + "epoch": 2.778448676265676, + "grad_norm": 0.17167993006882395, + "learning_rate": 4.079173838209983e-06, + "loss": 0.2745, + "step": 2992 + }, + { + "epoch": 2.7793776126335343, + "grad_norm": 0.1675771078403747, + "learning_rate": 4.061962134251291e-06, + "loss": 0.2823, + "step": 2993 + }, + { + "epoch": 2.780306549001393, + "grad_norm": 0.16584264394434609, + "learning_rate": 4.0447504302926e-06, + "loss": 0.2744, + "step": 2994 + }, + { + "epoch": 2.781235485369252, + "grad_norm": 0.16662698036780293, + "learning_rate": 4.027538726333907e-06, + "loss": 0.2688, + "step": 2995 + }, + { + "epoch": 2.782164421737111, + "grad_norm": 0.16794738348660668, + "learning_rate": 4.010327022375216e-06, + "loss": 0.2888, + "step": 2996 + }, + { + "epoch": 2.7830933581049697, + "grad_norm": 0.17789218429995182, + "learning_rate": 3.993115318416524e-06, + "loss": 0.2859, + "step": 2997 + }, + { + "epoch": 2.7840222944728286, + "grad_norm": 0.16091287332048385, + "learning_rate": 3.975903614457832e-06, + "loss": 0.2746, + "step": 2998 + }, + { + "epoch": 2.7849512308406874, + "grad_norm": 0.18944455859801637, + "learning_rate": 3.95869191049914e-06, + "loss": 0.2716, + "step": 2999 + }, + { + "epoch": 2.7858801672085463, + "grad_norm": 0.2954117399236272, + "learning_rate": 3.9414802065404475e-06, + "loss": 0.2902, + "step": 3000 + }, + { + "epoch": 2.786809103576405, + "grad_norm": 0.16573385628794401, + "learning_rate": 3.924268502581756e-06, + "loss": 0.2726, + "step": 3001 + }, + { + "epoch": 2.787738039944264, + "grad_norm": 0.17004238681555403, + "learning_rate": 3.907056798623064e-06, + "loss": 0.2831, + "step": 3002 + }, + { + "epoch": 2.788666976312123, + "grad_norm": 0.17663477423381163, + "learning_rate": 3.889845094664372e-06, + "loss": 0.2651, + "step": 3003 + }, + { + "epoch": 2.7895959126799816, + "grad_norm": 0.17378992478360958, + "learning_rate": 3.87263339070568e-06, + "loss": 0.2743, + "step": 3004 + }, + { + "epoch": 2.7905248490478405, + "grad_norm": 0.16494900421379727, + "learning_rate": 3.855421686746989e-06, + "loss": 0.2689, + "step": 3005 + }, + { + "epoch": 2.791453785415699, + "grad_norm": 0.1603907236344023, + "learning_rate": 3.838209982788296e-06, + "loss": 0.2657, + "step": 3006 + }, + { + "epoch": 2.7923827217835577, + "grad_norm": 0.1567572624521738, + "learning_rate": 3.820998278829605e-06, + "loss": 0.2637, + "step": 3007 + }, + { + "epoch": 2.7933116581514166, + "grad_norm": 0.18404937236372684, + "learning_rate": 3.803786574870912e-06, + "loss": 0.2755, + "step": 3008 + }, + { + "epoch": 2.7942405945192754, + "grad_norm": 0.16868307932572335, + "learning_rate": 3.7865748709122206e-06, + "loss": 0.2778, + "step": 3009 + }, + { + "epoch": 2.7951695308871343, + "grad_norm": 0.17073649004440103, + "learning_rate": 3.769363166953529e-06, + "loss": 0.264, + "step": 3010 + }, + { + "epoch": 2.796098467254993, + "grad_norm": 0.16505500534365, + "learning_rate": 3.7521514629948365e-06, + "loss": 0.2672, + "step": 3011 + }, + { + "epoch": 2.797027403622852, + "grad_norm": 0.16381885442693822, + "learning_rate": 3.734939759036145e-06, + "loss": 0.272, + "step": 3012 + }, + { + "epoch": 2.7979563399907104, + "grad_norm": 0.16292544827439895, + "learning_rate": 3.7177280550774525e-06, + "loss": 0.2737, + "step": 3013 + }, + { + "epoch": 2.798885276358569, + "grad_norm": 0.17228455569690837, + "learning_rate": 3.700516351118761e-06, + "loss": 0.2811, + "step": 3014 + }, + { + "epoch": 2.799814212726428, + "grad_norm": 0.19524306180865297, + "learning_rate": 3.6833046471600693e-06, + "loss": 0.2713, + "step": 3015 + }, + { + "epoch": 2.800743149094287, + "grad_norm": 0.16070401680293828, + "learning_rate": 3.666092943201377e-06, + "loss": 0.2692, + "step": 3016 + }, + { + "epoch": 2.8016720854621457, + "grad_norm": 0.16705220705216192, + "learning_rate": 3.6488812392426853e-06, + "loss": 0.2744, + "step": 3017 + }, + { + "epoch": 2.8026010218300046, + "grad_norm": 0.16539813777059287, + "learning_rate": 3.6316695352839937e-06, + "loss": 0.2747, + "step": 3018 + }, + { + "epoch": 2.8035299581978634, + "grad_norm": 0.16555633870974384, + "learning_rate": 3.614457831325301e-06, + "loss": 0.2899, + "step": 3019 + }, + { + "epoch": 2.8044588945657223, + "grad_norm": 0.16101648042383737, + "learning_rate": 3.5972461273666096e-06, + "loss": 0.268, + "step": 3020 + }, + { + "epoch": 2.805387830933581, + "grad_norm": 0.16506138181326488, + "learning_rate": 3.580034423407917e-06, + "loss": 0.2653, + "step": 3021 + }, + { + "epoch": 2.80631676730144, + "grad_norm": 0.1741809446855172, + "learning_rate": 3.5628227194492256e-06, + "loss": 0.2626, + "step": 3022 + }, + { + "epoch": 2.807245703669299, + "grad_norm": 0.16483290521015864, + "learning_rate": 3.545611015490534e-06, + "loss": 0.286, + "step": 3023 + }, + { + "epoch": 2.8081746400371577, + "grad_norm": 0.1571851560823419, + "learning_rate": 3.5283993115318415e-06, + "loss": 0.2495, + "step": 3024 + }, + { + "epoch": 2.8091035764050165, + "grad_norm": 0.16235471506982116, + "learning_rate": 3.51118760757315e-06, + "loss": 0.2778, + "step": 3025 + }, + { + "epoch": 2.810032512772875, + "grad_norm": 0.15756406208488002, + "learning_rate": 3.4939759036144583e-06, + "loss": 0.2643, + "step": 3026 + }, + { + "epoch": 2.8109614491407338, + "grad_norm": 0.17055705625316472, + "learning_rate": 3.476764199655766e-06, + "loss": 0.2996, + "step": 3027 + }, + { + "epoch": 2.8118903855085926, + "grad_norm": 0.1796767292475761, + "learning_rate": 3.4595524956970743e-06, + "loss": 0.276, + "step": 3028 + }, + { + "epoch": 2.8128193218764515, + "grad_norm": 0.1717885163183135, + "learning_rate": 3.442340791738382e-06, + "loss": 0.2933, + "step": 3029 + }, + { + "epoch": 2.8137482582443103, + "grad_norm": 0.17849394515997952, + "learning_rate": 3.4251290877796902e-06, + "loss": 0.2835, + "step": 3030 + }, + { + "epoch": 2.814677194612169, + "grad_norm": 0.1724053731347566, + "learning_rate": 3.4079173838209986e-06, + "loss": 0.2829, + "step": 3031 + }, + { + "epoch": 2.815606130980028, + "grad_norm": 0.17700506216444933, + "learning_rate": 3.390705679862306e-06, + "loss": 0.2836, + "step": 3032 + }, + { + "epoch": 2.816535067347887, + "grad_norm": 0.1711437698846605, + "learning_rate": 3.3734939759036146e-06, + "loss": 0.2848, + "step": 3033 + }, + { + "epoch": 2.8174640037157452, + "grad_norm": 0.1681253963815512, + "learning_rate": 3.356282271944923e-06, + "loss": 0.2792, + "step": 3034 + }, + { + "epoch": 2.818392940083604, + "grad_norm": 0.15981392784911405, + "learning_rate": 3.3390705679862305e-06, + "loss": 0.2699, + "step": 3035 + }, + { + "epoch": 2.819321876451463, + "grad_norm": 0.1740927276763424, + "learning_rate": 3.321858864027539e-06, + "loss": 0.2841, + "step": 3036 + }, + { + "epoch": 2.8202508128193218, + "grad_norm": 0.17717015170291478, + "learning_rate": 3.3046471600688465e-06, + "loss": 0.2819, + "step": 3037 + }, + { + "epoch": 2.8211797491871806, + "grad_norm": 0.1677501092435031, + "learning_rate": 3.287435456110155e-06, + "loss": 0.273, + "step": 3038 + }, + { + "epoch": 2.8221086855550395, + "grad_norm": 0.16673210395588484, + "learning_rate": 3.2702237521514633e-06, + "loss": 0.2693, + "step": 3039 + }, + { + "epoch": 2.8230376219228983, + "grad_norm": 0.16028997391776514, + "learning_rate": 3.253012048192771e-06, + "loss": 0.2624, + "step": 3040 + }, + { + "epoch": 2.823966558290757, + "grad_norm": 0.15950671745325568, + "learning_rate": 3.2358003442340792e-06, + "loss": 0.266, + "step": 3041 + }, + { + "epoch": 2.824895494658616, + "grad_norm": 0.18954997486322914, + "learning_rate": 3.2185886402753876e-06, + "loss": 0.2705, + "step": 3042 + }, + { + "epoch": 2.825824431026475, + "grad_norm": 0.18313245510676568, + "learning_rate": 3.201376936316695e-06, + "loss": 0.2903, + "step": 3043 + }, + { + "epoch": 2.8267533673943337, + "grad_norm": 0.17647582446439805, + "learning_rate": 3.1841652323580036e-06, + "loss": 0.299, + "step": 3044 + }, + { + "epoch": 2.8276823037621925, + "grad_norm": 0.18666596433252766, + "learning_rate": 3.1669535283993116e-06, + "loss": 0.2691, + "step": 3045 + }, + { + "epoch": 2.828611240130051, + "grad_norm": 0.1621297164024695, + "learning_rate": 3.1497418244406195e-06, + "loss": 0.2581, + "step": 3046 + }, + { + "epoch": 2.82954017649791, + "grad_norm": 0.16497540494920462, + "learning_rate": 3.132530120481928e-06, + "loss": 0.2674, + "step": 3047 + }, + { + "epoch": 2.8304691128657686, + "grad_norm": 0.18187531575636953, + "learning_rate": 3.115318416523236e-06, + "loss": 0.3102, + "step": 3048 + }, + { + "epoch": 2.8313980492336275, + "grad_norm": 0.19013570305586183, + "learning_rate": 3.098106712564544e-06, + "loss": 0.2816, + "step": 3049 + }, + { + "epoch": 2.8323269856014863, + "grad_norm": 0.16717134338818446, + "learning_rate": 3.0808950086058523e-06, + "loss": 0.2771, + "step": 3050 + }, + { + "epoch": 2.833255921969345, + "grad_norm": 0.16409077988979776, + "learning_rate": 3.0636833046471603e-06, + "loss": 0.2907, + "step": 3051 + }, + { + "epoch": 2.834184858337204, + "grad_norm": 0.16156022525948335, + "learning_rate": 3.0464716006884687e-06, + "loss": 0.2707, + "step": 3052 + }, + { + "epoch": 2.835113794705063, + "grad_norm": 0.1628946794644337, + "learning_rate": 3.0292598967297766e-06, + "loss": 0.2525, + "step": 3053 + }, + { + "epoch": 2.8360427310729213, + "grad_norm": 0.17817137797577384, + "learning_rate": 3.0120481927710846e-06, + "loss": 0.287, + "step": 3054 + }, + { + "epoch": 2.83697166744078, + "grad_norm": 0.1574612277457272, + "learning_rate": 2.9948364888123926e-06, + "loss": 0.2691, + "step": 3055 + }, + { + "epoch": 2.837900603808639, + "grad_norm": 0.1596146649010269, + "learning_rate": 2.977624784853701e-06, + "loss": 0.2618, + "step": 3056 + }, + { + "epoch": 2.838829540176498, + "grad_norm": 0.1666069390299578, + "learning_rate": 2.960413080895009e-06, + "loss": 0.2788, + "step": 3057 + }, + { + "epoch": 2.8397584765443566, + "grad_norm": 0.16942939086250466, + "learning_rate": 2.943201376936317e-06, + "loss": 0.278, + "step": 3058 + }, + { + "epoch": 2.8406874129122155, + "grad_norm": 0.16780692378544415, + "learning_rate": 2.925989672977625e-06, + "loss": 0.2864, + "step": 3059 + }, + { + "epoch": 2.8416163492800743, + "grad_norm": 0.16519686217403176, + "learning_rate": 2.9087779690189333e-06, + "loss": 0.2743, + "step": 3060 + }, + { + "epoch": 2.842545285647933, + "grad_norm": 0.18887259058754785, + "learning_rate": 2.8915662650602413e-06, + "loss": 0.28, + "step": 3061 + }, + { + "epoch": 2.843474222015792, + "grad_norm": 0.16104392704343692, + "learning_rate": 2.8743545611015493e-06, + "loss": 0.2651, + "step": 3062 + }, + { + "epoch": 2.844403158383651, + "grad_norm": 0.16267194525224002, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2743, + "step": 3063 + }, + { + "epoch": 2.8453320947515097, + "grad_norm": 0.16408592399590138, + "learning_rate": 2.8399311531841657e-06, + "loss": 0.2703, + "step": 3064 + }, + { + "epoch": 2.8462610311193686, + "grad_norm": 0.17434242637401323, + "learning_rate": 2.8227194492254736e-06, + "loss": 0.2797, + "step": 3065 + }, + { + "epoch": 2.847189967487227, + "grad_norm": 0.16647472284974027, + "learning_rate": 2.8055077452667816e-06, + "loss": 0.2623, + "step": 3066 + }, + { + "epoch": 2.848118903855086, + "grad_norm": 0.16968808669769664, + "learning_rate": 2.7882960413080896e-06, + "loss": 0.278, + "step": 3067 + }, + { + "epoch": 2.8490478402229447, + "grad_norm": 0.17519157938858387, + "learning_rate": 2.7710843373493976e-06, + "loss": 0.2864, + "step": 3068 + }, + { + "epoch": 2.8499767765908035, + "grad_norm": 0.15968806347904377, + "learning_rate": 2.753872633390706e-06, + "loss": 0.2781, + "step": 3069 + }, + { + "epoch": 2.8509057129586624, + "grad_norm": 0.1690915914984628, + "learning_rate": 2.736660929432014e-06, + "loss": 0.2761, + "step": 3070 + }, + { + "epoch": 2.851834649326521, + "grad_norm": 0.16230516310874854, + "learning_rate": 2.719449225473322e-06, + "loss": 0.268, + "step": 3071 + }, + { + "epoch": 2.85276358569438, + "grad_norm": 0.17552261229791508, + "learning_rate": 2.70223752151463e-06, + "loss": 0.2862, + "step": 3072 + }, + { + "epoch": 2.853692522062239, + "grad_norm": 0.16542085819719532, + "learning_rate": 2.6850258175559383e-06, + "loss": 0.2707, + "step": 3073 + }, + { + "epoch": 2.8546214584300973, + "grad_norm": 0.17209167653272706, + "learning_rate": 2.6678141135972463e-06, + "loss": 0.2745, + "step": 3074 + }, + { + "epoch": 2.855550394797956, + "grad_norm": 0.1801208337324367, + "learning_rate": 2.6506024096385542e-06, + "loss": 0.275, + "step": 3075 + }, + { + "epoch": 2.856479331165815, + "grad_norm": 0.1658175112453202, + "learning_rate": 2.6333907056798622e-06, + "loss": 0.28, + "step": 3076 + }, + { + "epoch": 2.857408267533674, + "grad_norm": 0.1682675080594553, + "learning_rate": 2.6161790017211706e-06, + "loss": 0.272, + "step": 3077 + }, + { + "epoch": 2.8583372039015327, + "grad_norm": 0.16424665159704155, + "learning_rate": 2.5989672977624786e-06, + "loss": 0.2722, + "step": 3078 + }, + { + "epoch": 2.8592661402693915, + "grad_norm": 0.17646757447622446, + "learning_rate": 2.5817555938037866e-06, + "loss": 0.2743, + "step": 3079 + }, + { + "epoch": 2.8601950766372504, + "grad_norm": 0.16214982779968065, + "learning_rate": 2.5645438898450946e-06, + "loss": 0.2636, + "step": 3080 + }, + { + "epoch": 2.861124013005109, + "grad_norm": 0.16427321944645545, + "learning_rate": 2.547332185886403e-06, + "loss": 0.2687, + "step": 3081 + }, + { + "epoch": 2.862052949372968, + "grad_norm": 0.1651934825129748, + "learning_rate": 2.530120481927711e-06, + "loss": 0.2716, + "step": 3082 + }, + { + "epoch": 2.862981885740827, + "grad_norm": 0.1801065723272109, + "learning_rate": 2.512908777969019e-06, + "loss": 0.2907, + "step": 3083 + }, + { + "epoch": 2.8639108221086857, + "grad_norm": 0.1625252701117974, + "learning_rate": 2.495697074010327e-06, + "loss": 0.2694, + "step": 3084 + }, + { + "epoch": 2.8648397584765446, + "grad_norm": 0.1693264127024217, + "learning_rate": 2.4784853700516353e-06, + "loss": 0.2719, + "step": 3085 + }, + { + "epoch": 2.865768694844403, + "grad_norm": 0.18597914567062376, + "learning_rate": 2.4612736660929433e-06, + "loss": 0.2759, + "step": 3086 + }, + { + "epoch": 2.866697631212262, + "grad_norm": 0.1680321028316844, + "learning_rate": 2.4440619621342512e-06, + "loss": 0.2582, + "step": 3087 + }, + { + "epoch": 2.8676265675801207, + "grad_norm": 0.17018851398423457, + "learning_rate": 2.426850258175559e-06, + "loss": 0.2741, + "step": 3088 + }, + { + "epoch": 2.8685555039479795, + "grad_norm": 0.17493183425794315, + "learning_rate": 2.4096385542168676e-06, + "loss": 0.2824, + "step": 3089 + }, + { + "epoch": 2.8694844403158384, + "grad_norm": 0.17392176050130073, + "learning_rate": 2.3924268502581756e-06, + "loss": 0.2861, + "step": 3090 + }, + { + "epoch": 2.8704133766836972, + "grad_norm": 0.16101372989259485, + "learning_rate": 2.3752151462994836e-06, + "loss": 0.2729, + "step": 3091 + }, + { + "epoch": 2.871342313051556, + "grad_norm": 0.1811947560081589, + "learning_rate": 2.3580034423407915e-06, + "loss": 0.29, + "step": 3092 + }, + { + "epoch": 2.872271249419415, + "grad_norm": 0.16656066225329413, + "learning_rate": 2.3407917383821e-06, + "loss": 0.2582, + "step": 3093 + }, + { + "epoch": 2.8732001857872733, + "grad_norm": 0.15996052741989927, + "learning_rate": 2.323580034423408e-06, + "loss": 0.2597, + "step": 3094 + }, + { + "epoch": 2.874129122155132, + "grad_norm": 0.16380652320388664, + "learning_rate": 2.306368330464716e-06, + "loss": 0.2705, + "step": 3095 + }, + { + "epoch": 2.875058058522991, + "grad_norm": 0.1628946736235343, + "learning_rate": 2.2891566265060243e-06, + "loss": 0.2723, + "step": 3096 + }, + { + "epoch": 2.87598699489085, + "grad_norm": 0.17560963055242468, + "learning_rate": 2.2719449225473323e-06, + "loss": 0.2798, + "step": 3097 + }, + { + "epoch": 2.8769159312587087, + "grad_norm": 0.16780655023233185, + "learning_rate": 2.2547332185886402e-06, + "loss": 0.2657, + "step": 3098 + }, + { + "epoch": 2.8778448676265675, + "grad_norm": 0.17403931947708165, + "learning_rate": 2.2375215146299486e-06, + "loss": 0.2785, + "step": 3099 + }, + { + "epoch": 2.8787738039944264, + "grad_norm": 0.17010520777366847, + "learning_rate": 2.2203098106712566e-06, + "loss": 0.2614, + "step": 3100 + }, + { + "epoch": 2.8797027403622852, + "grad_norm": 0.16812351916262241, + "learning_rate": 2.2030981067125646e-06, + "loss": 0.281, + "step": 3101 + }, + { + "epoch": 2.880631676730144, + "grad_norm": 0.16691194577486648, + "learning_rate": 2.185886402753873e-06, + "loss": 0.2662, + "step": 3102 + }, + { + "epoch": 2.881560613098003, + "grad_norm": 0.1759493066311429, + "learning_rate": 2.168674698795181e-06, + "loss": 0.2747, + "step": 3103 + }, + { + "epoch": 2.8824895494658618, + "grad_norm": 0.1707537478590751, + "learning_rate": 2.151462994836489e-06, + "loss": 0.2707, + "step": 3104 + }, + { + "epoch": 2.8834184858337206, + "grad_norm": 0.18135021613947166, + "learning_rate": 2.1342512908777974e-06, + "loss": 0.2905, + "step": 3105 + }, + { + "epoch": 2.884347422201579, + "grad_norm": 0.16500668342734362, + "learning_rate": 2.1170395869191053e-06, + "loss": 0.287, + "step": 3106 + }, + { + "epoch": 2.885276358569438, + "grad_norm": 0.1719432107365479, + "learning_rate": 2.0998278829604133e-06, + "loss": 0.2813, + "step": 3107 + }, + { + "epoch": 2.8862052949372967, + "grad_norm": 0.15985021847497927, + "learning_rate": 2.0826161790017213e-06, + "loss": 0.263, + "step": 3108 + }, + { + "epoch": 2.8871342313051556, + "grad_norm": 0.16354165474456403, + "learning_rate": 2.0654044750430297e-06, + "loss": 0.2836, + "step": 3109 + }, + { + "epoch": 2.8880631676730144, + "grad_norm": 0.16878198938643335, + "learning_rate": 2.0481927710843377e-06, + "loss": 0.2785, + "step": 3110 + }, + { + "epoch": 2.8889921040408733, + "grad_norm": 0.16589172961440266, + "learning_rate": 2.0309810671256456e-06, + "loss": 0.2784, + "step": 3111 + }, + { + "epoch": 2.889921040408732, + "grad_norm": 0.17438950890163252, + "learning_rate": 2.0137693631669536e-06, + "loss": 0.2714, + "step": 3112 + }, + { + "epoch": 2.890849976776591, + "grad_norm": 0.16242581403999212, + "learning_rate": 1.996557659208262e-06, + "loss": 0.2629, + "step": 3113 + }, + { + "epoch": 2.8917789131444493, + "grad_norm": 0.1744527543075875, + "learning_rate": 1.97934595524957e-06, + "loss": 0.2707, + "step": 3114 + }, + { + "epoch": 2.892707849512308, + "grad_norm": 0.16426414428304387, + "learning_rate": 1.962134251290878e-06, + "loss": 0.2816, + "step": 3115 + }, + { + "epoch": 2.893636785880167, + "grad_norm": 0.1621871796090516, + "learning_rate": 1.944922547332186e-06, + "loss": 0.2751, + "step": 3116 + }, + { + "epoch": 2.894565722248026, + "grad_norm": 0.1775919652596439, + "learning_rate": 1.9277108433734943e-06, + "loss": 0.2895, + "step": 3117 + }, + { + "epoch": 2.8954946586158847, + "grad_norm": 0.17006770205203484, + "learning_rate": 1.9104991394148023e-06, + "loss": 0.2688, + "step": 3118 + }, + { + "epoch": 2.8964235949837436, + "grad_norm": 0.16894126178473537, + "learning_rate": 1.8932874354561103e-06, + "loss": 0.2609, + "step": 3119 + }, + { + "epoch": 2.8973525313516024, + "grad_norm": 0.16582301619354106, + "learning_rate": 1.8760757314974183e-06, + "loss": 0.2732, + "step": 3120 + }, + { + "epoch": 2.8982814677194613, + "grad_norm": 0.17231531336949352, + "learning_rate": 1.8588640275387262e-06, + "loss": 0.2734, + "step": 3121 + }, + { + "epoch": 2.89921040408732, + "grad_norm": 0.16228298439571703, + "learning_rate": 1.8416523235800346e-06, + "loss": 0.2662, + "step": 3122 + }, + { + "epoch": 2.900139340455179, + "grad_norm": 0.1655198446070943, + "learning_rate": 1.8244406196213426e-06, + "loss": 0.2678, + "step": 3123 + }, + { + "epoch": 2.901068276823038, + "grad_norm": 0.17406281157074208, + "learning_rate": 1.8072289156626506e-06, + "loss": 0.2802, + "step": 3124 + }, + { + "epoch": 2.9019972131908967, + "grad_norm": 0.16674227317723664, + "learning_rate": 1.7900172117039586e-06, + "loss": 0.266, + "step": 3125 + }, + { + "epoch": 2.9029261495587555, + "grad_norm": 0.1706960267209148, + "learning_rate": 1.772805507745267e-06, + "loss": 0.2974, + "step": 3126 + }, + { + "epoch": 2.903855085926614, + "grad_norm": 0.1658872040790518, + "learning_rate": 1.755593803786575e-06, + "loss": 0.2694, + "step": 3127 + }, + { + "epoch": 2.9047840222944727, + "grad_norm": 0.16658354730048636, + "learning_rate": 1.738382099827883e-06, + "loss": 0.2587, + "step": 3128 + }, + { + "epoch": 2.9057129586623316, + "grad_norm": 0.16444858436297213, + "learning_rate": 1.721170395869191e-06, + "loss": 0.2813, + "step": 3129 + }, + { + "epoch": 2.9066418950301904, + "grad_norm": 0.16336194076854949, + "learning_rate": 1.7039586919104993e-06, + "loss": 0.2732, + "step": 3130 + }, + { + "epoch": 2.9075708313980493, + "grad_norm": 0.171145553329007, + "learning_rate": 1.6867469879518073e-06, + "loss": 0.2723, + "step": 3131 + }, + { + "epoch": 2.908499767765908, + "grad_norm": 0.16347568718682626, + "learning_rate": 1.6695352839931153e-06, + "loss": 0.2762, + "step": 3132 + }, + { + "epoch": 2.909428704133767, + "grad_norm": 0.16509176449938368, + "learning_rate": 1.6523235800344232e-06, + "loss": 0.2684, + "step": 3133 + }, + { + "epoch": 2.9103576405016254, + "grad_norm": 0.16259817795344064, + "learning_rate": 1.6351118760757316e-06, + "loss": 0.2672, + "step": 3134 + }, + { + "epoch": 2.911286576869484, + "grad_norm": 0.16536212949320228, + "learning_rate": 1.6179001721170396e-06, + "loss": 0.2712, + "step": 3135 + }, + { + "epoch": 2.912215513237343, + "grad_norm": 0.17142664756129727, + "learning_rate": 1.6006884681583476e-06, + "loss": 0.2781, + "step": 3136 + }, + { + "epoch": 2.913144449605202, + "grad_norm": 0.16836163730587553, + "learning_rate": 1.5834767641996558e-06, + "loss": 0.2808, + "step": 3137 + }, + { + "epoch": 2.9140733859730608, + "grad_norm": 0.16607118988603203, + "learning_rate": 1.566265060240964e-06, + "loss": 0.2766, + "step": 3138 + }, + { + "epoch": 2.9150023223409196, + "grad_norm": 0.1672718943537726, + "learning_rate": 1.549053356282272e-06, + "loss": 0.2704, + "step": 3139 + }, + { + "epoch": 2.9159312587087785, + "grad_norm": 0.16835726638255913, + "learning_rate": 1.5318416523235801e-06, + "loss": 0.2769, + "step": 3140 + }, + { + "epoch": 2.9168601950766373, + "grad_norm": 0.161048165231964, + "learning_rate": 1.5146299483648883e-06, + "loss": 0.2837, + "step": 3141 + }, + { + "epoch": 2.917789131444496, + "grad_norm": 0.16282977150245906, + "learning_rate": 1.4974182444061963e-06, + "loss": 0.2638, + "step": 3142 + }, + { + "epoch": 2.918718067812355, + "grad_norm": 0.17323001324895482, + "learning_rate": 1.4802065404475045e-06, + "loss": 0.2815, + "step": 3143 + }, + { + "epoch": 2.919647004180214, + "grad_norm": 0.16388686409084324, + "learning_rate": 1.4629948364888125e-06, + "loss": 0.2827, + "step": 3144 + }, + { + "epoch": 2.9205759405480727, + "grad_norm": 0.16384234361312092, + "learning_rate": 1.4457831325301207e-06, + "loss": 0.282, + "step": 3145 + }, + { + "epoch": 2.9215048769159315, + "grad_norm": 0.16568266746026178, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.2704, + "step": 3146 + }, + { + "epoch": 2.92243381328379, + "grad_norm": 0.16915537105295747, + "learning_rate": 1.4113597246127368e-06, + "loss": 0.2796, + "step": 3147 + }, + { + "epoch": 2.9233627496516488, + "grad_norm": 0.15803350350704365, + "learning_rate": 1.3941480206540448e-06, + "loss": 0.2632, + "step": 3148 + }, + { + "epoch": 2.9242916860195076, + "grad_norm": 0.1724406036070289, + "learning_rate": 1.376936316695353e-06, + "loss": 0.2811, + "step": 3149 + }, + { + "epoch": 2.9252206223873665, + "grad_norm": 0.16469469642126497, + "learning_rate": 1.359724612736661e-06, + "loss": 0.255, + "step": 3150 + }, + { + "epoch": 2.9261495587552253, + "grad_norm": 0.18760969517301657, + "learning_rate": 1.3425129087779691e-06, + "loss": 0.2682, + "step": 3151 + }, + { + "epoch": 2.927078495123084, + "grad_norm": 0.16665139220788266, + "learning_rate": 1.3253012048192771e-06, + "loss": 0.268, + "step": 3152 + }, + { + "epoch": 2.928007431490943, + "grad_norm": 0.17530978095431596, + "learning_rate": 1.3080895008605853e-06, + "loss": 0.2624, + "step": 3153 + }, + { + "epoch": 2.9289363678588014, + "grad_norm": 0.1631271421954177, + "learning_rate": 1.2908777969018933e-06, + "loss": 0.2727, + "step": 3154 + }, + { + "epoch": 2.9298653042266602, + "grad_norm": 0.15966257157616312, + "learning_rate": 1.2736660929432015e-06, + "loss": 0.2688, + "step": 3155 + }, + { + "epoch": 2.930794240594519, + "grad_norm": 0.15325566348484762, + "learning_rate": 1.2564543889845095e-06, + "loss": 0.2559, + "step": 3156 + }, + { + "epoch": 2.931723176962378, + "grad_norm": 0.16790370946361619, + "learning_rate": 1.2392426850258176e-06, + "loss": 0.286, + "step": 3157 + }, + { + "epoch": 2.932652113330237, + "grad_norm": 0.16679108887381452, + "learning_rate": 1.2220309810671256e-06, + "loss": 0.2863, + "step": 3158 + }, + { + "epoch": 2.9335810496980956, + "grad_norm": 0.17093563722555055, + "learning_rate": 1.2048192771084338e-06, + "loss": 0.283, + "step": 3159 + }, + { + "epoch": 2.9345099860659545, + "grad_norm": 0.1671496062211597, + "learning_rate": 1.1876075731497418e-06, + "loss": 0.2792, + "step": 3160 + }, + { + "epoch": 2.9354389224338133, + "grad_norm": 0.1599449296648143, + "learning_rate": 1.17039586919105e-06, + "loss": 0.2448, + "step": 3161 + }, + { + "epoch": 2.936367858801672, + "grad_norm": 0.15847396108522563, + "learning_rate": 1.153184165232358e-06, + "loss": 0.2704, + "step": 3162 + }, + { + "epoch": 2.937296795169531, + "grad_norm": 0.16199214200992662, + "learning_rate": 1.1359724612736661e-06, + "loss": 0.2516, + "step": 3163 + }, + { + "epoch": 2.93822573153739, + "grad_norm": 0.1626770624143151, + "learning_rate": 1.1187607573149743e-06, + "loss": 0.2682, + "step": 3164 + }, + { + "epoch": 2.9391546679052487, + "grad_norm": 0.16395321212716582, + "learning_rate": 1.1015490533562823e-06, + "loss": 0.2722, + "step": 3165 + }, + { + "epoch": 2.9400836042731076, + "grad_norm": 0.15821372199935677, + "learning_rate": 1.0843373493975905e-06, + "loss": 0.2687, + "step": 3166 + }, + { + "epoch": 2.941012540640966, + "grad_norm": 0.15843662175473597, + "learning_rate": 1.0671256454388987e-06, + "loss": 0.2681, + "step": 3167 + }, + { + "epoch": 2.941941477008825, + "grad_norm": 0.16333393084594955, + "learning_rate": 1.0499139414802067e-06, + "loss": 0.2856, + "step": 3168 + }, + { + "epoch": 2.9428704133766836, + "grad_norm": 0.17542548208560677, + "learning_rate": 1.0327022375215148e-06, + "loss": 0.2798, + "step": 3169 + }, + { + "epoch": 2.9437993497445425, + "grad_norm": 0.17024423281462964, + "learning_rate": 1.0154905335628228e-06, + "loss": 0.2742, + "step": 3170 + }, + { + "epoch": 2.9447282861124013, + "grad_norm": 0.1576061324066094, + "learning_rate": 9.98278829604131e-07, + "loss": 0.2781, + "step": 3171 + }, + { + "epoch": 2.94565722248026, + "grad_norm": 0.1601221693884007, + "learning_rate": 9.81067125645439e-07, + "loss": 0.2693, + "step": 3172 + }, + { + "epoch": 2.946586158848119, + "grad_norm": 0.16786872565481428, + "learning_rate": 9.638554216867472e-07, + "loss": 0.2753, + "step": 3173 + }, + { + "epoch": 2.9475150952159774, + "grad_norm": 0.16660337771357872, + "learning_rate": 9.466437177280551e-07, + "loss": 0.2758, + "step": 3174 + }, + { + "epoch": 2.9484440315838363, + "grad_norm": 0.1691333863317382, + "learning_rate": 9.294320137693631e-07, + "loss": 0.2795, + "step": 3175 + }, + { + "epoch": 2.949372967951695, + "grad_norm": 0.1722484989104376, + "learning_rate": 9.122203098106713e-07, + "loss": 0.2802, + "step": 3176 + }, + { + "epoch": 2.950301904319554, + "grad_norm": 0.16958831531505558, + "learning_rate": 8.950086058519793e-07, + "loss": 0.2901, + "step": 3177 + }, + { + "epoch": 2.951230840687413, + "grad_norm": 0.17023309066648035, + "learning_rate": 8.777969018932875e-07, + "loss": 0.2717, + "step": 3178 + }, + { + "epoch": 2.9521597770552717, + "grad_norm": 0.15809523592675392, + "learning_rate": 8.605851979345955e-07, + "loss": 0.265, + "step": 3179 + }, + { + "epoch": 2.9530887134231305, + "grad_norm": 0.1675472901902543, + "learning_rate": 8.433734939759036e-07, + "loss": 0.2849, + "step": 3180 + }, + { + "epoch": 2.9540176497909894, + "grad_norm": 0.1654082507246669, + "learning_rate": 8.261617900172116e-07, + "loss": 0.2791, + "step": 3181 + }, + { + "epoch": 2.954946586158848, + "grad_norm": 0.162707952994883, + "learning_rate": 8.089500860585198e-07, + "loss": 0.2813, + "step": 3182 + }, + { + "epoch": 2.955875522526707, + "grad_norm": 0.16446726694121308, + "learning_rate": 7.917383820998279e-07, + "loss": 0.2872, + "step": 3183 + }, + { + "epoch": 2.956804458894566, + "grad_norm": 0.1637824540494551, + "learning_rate": 7.74526678141136e-07, + "loss": 0.2795, + "step": 3184 + }, + { + "epoch": 2.9577333952624247, + "grad_norm": 0.18638420051088697, + "learning_rate": 7.573149741824442e-07, + "loss": 0.2709, + "step": 3185 + }, + { + "epoch": 2.9586623316302836, + "grad_norm": 0.15707600467510602, + "learning_rate": 7.401032702237522e-07, + "loss": 0.2628, + "step": 3186 + }, + { + "epoch": 2.959591267998142, + "grad_norm": 0.16140242751307413, + "learning_rate": 7.228915662650603e-07, + "loss": 0.2735, + "step": 3187 + }, + { + "epoch": 2.960520204366001, + "grad_norm": 0.16654689577483472, + "learning_rate": 7.056798623063684e-07, + "loss": 0.2831, + "step": 3188 + }, + { + "epoch": 2.9614491407338597, + "grad_norm": 0.16057309604016964, + "learning_rate": 6.884681583476765e-07, + "loss": 0.2635, + "step": 3189 + }, + { + "epoch": 2.9623780771017185, + "grad_norm": 0.16490464189295742, + "learning_rate": 6.712564543889846e-07, + "loss": 0.2752, + "step": 3190 + }, + { + "epoch": 2.9633070134695774, + "grad_norm": 0.1671405024647208, + "learning_rate": 6.540447504302927e-07, + "loss": 0.269, + "step": 3191 + }, + { + "epoch": 2.964235949837436, + "grad_norm": 0.1710576565622639, + "learning_rate": 6.368330464716007e-07, + "loss": 0.2843, + "step": 3192 + }, + { + "epoch": 2.965164886205295, + "grad_norm": 0.16276635918619017, + "learning_rate": 6.196213425129088e-07, + "loss": 0.2813, + "step": 3193 + }, + { + "epoch": 2.966093822573154, + "grad_norm": 0.15281133331696603, + "learning_rate": 6.024096385542169e-07, + "loss": 0.2639, + "step": 3194 + }, + { + "epoch": 2.9670227589410123, + "grad_norm": 0.16323794995058483, + "learning_rate": 5.85197934595525e-07, + "loss": 0.2663, + "step": 3195 + }, + { + "epoch": 2.967951695308871, + "grad_norm": 0.1624282394962894, + "learning_rate": 5.679862306368331e-07, + "loss": 0.2752, + "step": 3196 + }, + { + "epoch": 2.96888063167673, + "grad_norm": 0.16134159864744876, + "learning_rate": 5.507745266781412e-07, + "loss": 0.2702, + "step": 3197 + }, + { + "epoch": 2.969809568044589, + "grad_norm": 0.16589417651717428, + "learning_rate": 5.335628227194493e-07, + "loss": 0.2828, + "step": 3198 + }, + { + "epoch": 2.9707385044124477, + "grad_norm": 0.16183155315903366, + "learning_rate": 5.163511187607574e-07, + "loss": 0.2852, + "step": 3199 + }, + { + "epoch": 2.9716674407803065, + "grad_norm": 0.16678141742547176, + "learning_rate": 4.991394148020655e-07, + "loss": 0.2813, + "step": 3200 + }, + { + "epoch": 2.9725963771481654, + "grad_norm": 0.1619658189257669, + "learning_rate": 4.819277108433736e-07, + "loss": 0.2649, + "step": 3201 + }, + { + "epoch": 2.9735253135160242, + "grad_norm": 0.15993238852155908, + "learning_rate": 4.6471600688468156e-07, + "loss": 0.2764, + "step": 3202 + }, + { + "epoch": 2.974454249883883, + "grad_norm": 0.16228626601674104, + "learning_rate": 4.4750430292598964e-07, + "loss": 0.2738, + "step": 3203 + }, + { + "epoch": 2.975383186251742, + "grad_norm": 0.16945807739892618, + "learning_rate": 4.3029259896729773e-07, + "loss": 0.2875, + "step": 3204 + }, + { + "epoch": 2.9763121226196008, + "grad_norm": 0.16673802928277734, + "learning_rate": 4.130808950086058e-07, + "loss": 0.2751, + "step": 3205 + }, + { + "epoch": 2.9772410589874596, + "grad_norm": 0.16066420702808745, + "learning_rate": 3.9586919104991394e-07, + "loss": 0.2691, + "step": 3206 + }, + { + "epoch": 2.978169995355318, + "grad_norm": 0.167644174523662, + "learning_rate": 3.786574870912221e-07, + "loss": 0.276, + "step": 3207 + }, + { + "epoch": 2.979098931723177, + "grad_norm": 0.16405491071346914, + "learning_rate": 3.6144578313253016e-07, + "loss": 0.27, + "step": 3208 + }, + { + "epoch": 2.9800278680910357, + "grad_norm": 0.15776111203718893, + "learning_rate": 3.4423407917383825e-07, + "loss": 0.2754, + "step": 3209 + }, + { + "epoch": 2.9809568044588945, + "grad_norm": 0.16291385160189412, + "learning_rate": 3.2702237521514633e-07, + "loss": 0.2801, + "step": 3210 + }, + { + "epoch": 2.9818857408267534, + "grad_norm": 0.15798932483422015, + "learning_rate": 3.098106712564544e-07, + "loss": 0.2655, + "step": 3211 + }, + { + "epoch": 2.9828146771946122, + "grad_norm": 0.16945630512529875, + "learning_rate": 2.925989672977625e-07, + "loss": 0.274, + "step": 3212 + }, + { + "epoch": 2.983743613562471, + "grad_norm": 0.1708554921170849, + "learning_rate": 2.753872633390706e-07, + "loss": 0.272, + "step": 3213 + }, + { + "epoch": 2.98467254993033, + "grad_norm": 0.15872652564709347, + "learning_rate": 2.581755593803787e-07, + "loss": 0.258, + "step": 3214 + }, + { + "epoch": 2.9856014862981883, + "grad_norm": 0.1830290046487256, + "learning_rate": 2.409638554216868e-07, + "loss": 0.2794, + "step": 3215 + }, + { + "epoch": 2.986530422666047, + "grad_norm": 0.15745598014130835, + "learning_rate": 2.2375215146299482e-07, + "loss": 0.2645, + "step": 3216 + }, + { + "epoch": 2.987459359033906, + "grad_norm": 0.16220353029392465, + "learning_rate": 2.065404475043029e-07, + "loss": 0.2789, + "step": 3217 + }, + { + "epoch": 2.988388295401765, + "grad_norm": 0.16537567907705764, + "learning_rate": 1.8932874354561104e-07, + "loss": 0.2724, + "step": 3218 + }, + { + "epoch": 2.9893172317696237, + "grad_norm": 0.15966135311906623, + "learning_rate": 1.7211703958691912e-07, + "loss": 0.2603, + "step": 3219 + }, + { + "epoch": 2.9902461681374826, + "grad_norm": 0.16670677322460914, + "learning_rate": 1.549053356282272e-07, + "loss": 0.277, + "step": 3220 + }, + { + "epoch": 2.9911751045053414, + "grad_norm": 0.16236390105210605, + "learning_rate": 1.376936316695353e-07, + "loss": 0.2778, + "step": 3221 + }, + { + "epoch": 2.9921040408732003, + "grad_norm": 0.15740425470314215, + "learning_rate": 1.204819277108434e-07, + "loss": 0.263, + "step": 3222 + }, + { + "epoch": 2.993032977241059, + "grad_norm": 0.16389828021365552, + "learning_rate": 1.0327022375215145e-07, + "loss": 0.276, + "step": 3223 + }, + { + "epoch": 2.993961913608918, + "grad_norm": 0.1578402137230674, + "learning_rate": 8.605851979345956e-08, + "loss": 0.26, + "step": 3224 + }, + { + "epoch": 2.994890849976777, + "grad_norm": 0.1705942119258943, + "learning_rate": 6.884681583476764e-08, + "loss": 0.2776, + "step": 3225 + }, + { + "epoch": 2.9958197863446356, + "grad_norm": 0.16036831167171559, + "learning_rate": 5.1635111876075726e-08, + "loss": 0.2761, + "step": 3226 + }, + { + "epoch": 2.996748722712494, + "grad_norm": 0.17533866907265988, + "learning_rate": 3.442340791738382e-08, + "loss": 0.2913, + "step": 3227 + }, + { + "epoch": 2.997677659080353, + "grad_norm": 0.1660516196575767, + "learning_rate": 1.721170395869191e-08, + "loss": 0.2709, + "step": 3228 + }, + { + "epoch": 2.997677659080353, + "step": 3228, + "total_flos": 3.5889551683046343e+19, + "train_loss": 0.40990160300847617, + "train_runtime": 91578.7681, + "train_samples_per_second": 0.564, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 1, + "max_steps": 3228, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.5889551683046343e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}