{ "best_global_step": 68500, "best_metric": 0.3663762955426973, "best_model_checkpoint": "/workspace/output/linear_probe/checkpoint-68500", "epoch": 20.0, "eval_steps": 500, "global_step": 70460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002838489923360772, "grad_norm": 7.76682186126709, "learning_rate": 9.998722679534488e-05, "loss": 7.2466278076171875, "step": 10 }, { "epoch": 0.005676979846721544, "grad_norm": 7.594738483428955, "learning_rate": 9.997303434572808e-05, "loss": 6.797149658203125, "step": 20 }, { "epoch": 0.008515469770082317, "grad_norm": 7.652416706085205, "learning_rate": 9.995884189611128e-05, "loss": 6.383975219726563, "step": 30 }, { "epoch": 0.011353959693443088, "grad_norm": 7.619444370269775, "learning_rate": 9.994464944649447e-05, "loss": 6.25147705078125, "step": 40 }, { "epoch": 0.01419244961680386, "grad_norm": 7.724758148193359, "learning_rate": 9.993045699687766e-05, "loss": 6.0975341796875, "step": 50 }, { "epoch": 0.017030939540164634, "grad_norm": 7.203300476074219, "learning_rate": 9.991626454726086e-05, "loss": 5.999920654296875, "step": 60 }, { "epoch": 0.019869429463525403, "grad_norm": 7.607264995574951, "learning_rate": 9.990207209764406e-05, "loss": 5.880581665039062, "step": 70 }, { "epoch": 0.022707919386886176, "grad_norm": 8.079435348510742, "learning_rate": 9.988787964802724e-05, "loss": 5.840594482421875, "step": 80 }, { "epoch": 0.02554640931024695, "grad_norm": 7.458168983459473, "learning_rate": 9.987368719841045e-05, "loss": 5.775900268554688, "step": 90 }, { "epoch": 0.02838489923360772, "grad_norm": 7.3460235595703125, "learning_rate": 9.985949474879364e-05, "loss": 5.66275634765625, "step": 100 }, { "epoch": 0.031223389156968494, "grad_norm": 7.358061790466309, "learning_rate": 9.984530229917685e-05, "loss": 5.594174194335937, "step": 110 }, { "epoch": 0.03406187908032927, "grad_norm": 7.612614631652832, "learning_rate": 
9.983110984956003e-05, "loss": 5.608694458007813, "step": 120 }, { "epoch": 0.03690036900369004, "grad_norm": 7.309238433837891, "learning_rate": 9.981691739994324e-05, "loss": 5.541342163085938, "step": 130 }, { "epoch": 0.039738858927050806, "grad_norm": 7.377403736114502, "learning_rate": 9.980272495032643e-05, "loss": 5.459539794921875, "step": 140 }, { "epoch": 0.04257734885041158, "grad_norm": 7.193301200866699, "learning_rate": 9.978853250070962e-05, "loss": 5.4269287109375, "step": 150 }, { "epoch": 0.04541583877377235, "grad_norm": 7.7525129318237305, "learning_rate": 9.977434005109282e-05, "loss": 5.439059448242188, "step": 160 }, { "epoch": 0.04825432869713313, "grad_norm": 7.614742755889893, "learning_rate": 9.976014760147601e-05, "loss": 5.3873138427734375, "step": 170 }, { "epoch": 0.0510928186204939, "grad_norm": 7.591485977172852, "learning_rate": 9.974595515185922e-05, "loss": 5.33026123046875, "step": 180 }, { "epoch": 0.05393130854385467, "grad_norm": 7.427134037017822, "learning_rate": 9.973176270224241e-05, "loss": 5.246792602539062, "step": 190 }, { "epoch": 0.05676979846721544, "grad_norm": 7.474941253662109, "learning_rate": 9.97175702526256e-05, "loss": 5.257366943359375, "step": 200 }, { "epoch": 0.05960828839057621, "grad_norm": 7.841590881347656, "learning_rate": 9.97033778030088e-05, "loss": 5.2572174072265625, "step": 210 }, { "epoch": 0.06244677831393699, "grad_norm": 7.8390703201293945, "learning_rate": 9.9689185353392e-05, "loss": 5.13358154296875, "step": 220 }, { "epoch": 0.06528526823729776, "grad_norm": 7.310807228088379, "learning_rate": 9.96749929037752e-05, "loss": 5.132040405273438, "step": 230 }, { "epoch": 0.06812375816065853, "grad_norm": 7.760238170623779, "learning_rate": 9.966080045415839e-05, "loss": 5.17379150390625, "step": 240 }, { "epoch": 0.0709622480840193, "grad_norm": 7.937736988067627, "learning_rate": 9.964660800454158e-05, "loss": 5.136419677734375, "step": 250 }, { "epoch": 0.07380073800738007, 
"grad_norm": 8.008391380310059, "learning_rate": 9.963241555492479e-05, "loss": 5.0685546875, "step": 260 }, { "epoch": 0.07663922793074085, "grad_norm": 7.778807163238525, "learning_rate": 9.961822310530799e-05, "loss": 5.008302307128906, "step": 270 }, { "epoch": 0.07947771785410161, "grad_norm": 7.225378513336182, "learning_rate": 9.960403065569118e-05, "loss": 5.050762939453125, "step": 280 }, { "epoch": 0.08231620777746239, "grad_norm": 7.45269250869751, "learning_rate": 9.958983820607437e-05, "loss": 4.998600769042969, "step": 290 }, { "epoch": 0.08515469770082316, "grad_norm": 7.411711692810059, "learning_rate": 9.957564575645757e-05, "loss": 5.01221923828125, "step": 300 }, { "epoch": 0.08799318762418393, "grad_norm": 7.513561248779297, "learning_rate": 9.956145330684077e-05, "loss": 4.900582885742187, "step": 310 }, { "epoch": 0.0908316775475447, "grad_norm": 7.64567232131958, "learning_rate": 9.954726085722395e-05, "loss": 4.944937133789063, "step": 320 }, { "epoch": 0.09367016747090548, "grad_norm": 7.637795448303223, "learning_rate": 9.953306840760716e-05, "loss": 4.958914184570313, "step": 330 }, { "epoch": 0.09650865739426626, "grad_norm": 7.40865421295166, "learning_rate": 9.951887595799035e-05, "loss": 4.893185424804687, "step": 340 }, { "epoch": 0.09934714731762702, "grad_norm": 7.527649879455566, "learning_rate": 9.950468350837356e-05, "loss": 4.824237060546875, "step": 350 }, { "epoch": 0.1021856372409878, "grad_norm": 7.51497220993042, "learning_rate": 9.949049105875674e-05, "loss": 4.823028564453125, "step": 360 }, { "epoch": 0.10502412716434857, "grad_norm": 7.939122200012207, "learning_rate": 9.947629860913995e-05, "loss": 4.820355224609375, "step": 370 }, { "epoch": 0.10786261708770933, "grad_norm": 7.173007965087891, "learning_rate": 9.946210615952314e-05, "loss": 4.785848999023438, "step": 380 }, { "epoch": 0.11070110701107011, "grad_norm": 7.373956203460693, "learning_rate": 9.944791370990633e-05, "loss": 4.8253662109375, "step": 390 }, { 
"epoch": 0.11353959693443089, "grad_norm": 7.905656814575195, "learning_rate": 9.943372126028954e-05, "loss": 4.769357299804687, "step": 400 }, { "epoch": 0.11637808685779165, "grad_norm": 7.4880499839782715, "learning_rate": 9.941952881067272e-05, "loss": 4.85400390625, "step": 410 }, { "epoch": 0.11921657678115243, "grad_norm": 8.068448066711426, "learning_rate": 9.940533636105593e-05, "loss": 4.7669189453125, "step": 420 }, { "epoch": 0.1220550667045132, "grad_norm": 7.440807342529297, "learning_rate": 9.939114391143912e-05, "loss": 4.75029296875, "step": 430 }, { "epoch": 0.12489355662787398, "grad_norm": 7.373204708099365, "learning_rate": 9.937695146182233e-05, "loss": 4.720832824707031, "step": 440 }, { "epoch": 0.12773204655123474, "grad_norm": 7.416317939758301, "learning_rate": 9.93627590122055e-05, "loss": 4.649078369140625, "step": 450 }, { "epoch": 0.13057053647459552, "grad_norm": 7.169602394104004, "learning_rate": 9.934856656258871e-05, "loss": 4.64569091796875, "step": 460 }, { "epoch": 0.1334090263979563, "grad_norm": 7.526989936828613, "learning_rate": 9.93343741129719e-05, "loss": 4.636640930175782, "step": 470 }, { "epoch": 0.13624751632131707, "grad_norm": 7.723114013671875, "learning_rate": 9.93201816633551e-05, "loss": 4.65252685546875, "step": 480 }, { "epoch": 0.13908600624467782, "grad_norm": 7.627930641174316, "learning_rate": 9.930598921373829e-05, "loss": 4.617521667480469, "step": 490 }, { "epoch": 0.1419244961680386, "grad_norm": 7.407610893249512, "learning_rate": 9.929179676412149e-05, "loss": 4.624217224121094, "step": 500 }, { "epoch": 0.1419244961680386, "eval_accuracy": 0.04826095250206651, "eval_loss": 4.775277614593506, "eval_runtime": 48.7597, "eval_samples_per_second": 322.541, "eval_steps_per_second": 5.045, "step": 500 }, { "epoch": 0.14476298609139937, "grad_norm": 7.1845855712890625, "learning_rate": 9.927760431450469e-05, "loss": 4.587855529785156, "step": 510 }, { "epoch": 0.14760147601476015, "grad_norm": 
7.368954181671143, "learning_rate": 9.926341186488789e-05, "loss": 4.5818634033203125, "step": 520 }, { "epoch": 0.15043996593812092, "grad_norm": 7.547170639038086, "learning_rate": 9.924921941527108e-05, "loss": 4.569480895996094, "step": 530 }, { "epoch": 0.1532784558614817, "grad_norm": 7.475296497344971, "learning_rate": 9.923502696565427e-05, "loss": 4.5938232421875, "step": 540 }, { "epoch": 0.15611694578484248, "grad_norm": 7.583178997039795, "learning_rate": 9.922083451603748e-05, "loss": 4.555241394042969, "step": 550 }, { "epoch": 0.15895543570820322, "grad_norm": 7.413067817687988, "learning_rate": 9.920664206642067e-05, "loss": 4.552943420410156, "step": 560 }, { "epoch": 0.161793925631564, "grad_norm": 8.377163887023926, "learning_rate": 9.919244961680387e-05, "loss": 4.5195762634277346, "step": 570 }, { "epoch": 0.16463241555492478, "grad_norm": 7.393665790557861, "learning_rate": 9.917825716718706e-05, "loss": 4.5089599609375, "step": 580 }, { "epoch": 0.16747090547828555, "grad_norm": 7.318619251251221, "learning_rate": 9.916406471757027e-05, "loss": 4.537031555175782, "step": 590 }, { "epoch": 0.17030939540164633, "grad_norm": 7.593929290771484, "learning_rate": 9.914987226795346e-05, "loss": 4.506903076171875, "step": 600 }, { "epoch": 0.1731478853250071, "grad_norm": 7.540308475494385, "learning_rate": 9.913567981833665e-05, "loss": 4.479457092285156, "step": 610 }, { "epoch": 0.17598637524836785, "grad_norm": 7.28300142288208, "learning_rate": 9.912148736871985e-05, "loss": 4.4705047607421875, "step": 620 }, { "epoch": 0.17882486517172863, "grad_norm": 7.444228172302246, "learning_rate": 9.910729491910304e-05, "loss": 4.477705383300782, "step": 630 }, { "epoch": 0.1816633550950894, "grad_norm": 7.416791915893555, "learning_rate": 9.909310246948625e-05, "loss": 4.504969787597656, "step": 640 }, { "epoch": 0.18450184501845018, "grad_norm": 7.280526161193848, "learning_rate": 9.907891001986943e-05, "loss": 4.4890281677246096, "step": 650 }, { 
"epoch": 0.18734033494181096, "grad_norm": 7.356183052062988, "learning_rate": 9.906471757025263e-05, "loss": 4.433049011230469, "step": 660 }, { "epoch": 0.19017882486517174, "grad_norm": 7.694573879241943, "learning_rate": 9.905052512063583e-05, "loss": 4.41678466796875, "step": 670 }, { "epoch": 0.1930173147885325, "grad_norm": 7.402281284332275, "learning_rate": 9.903633267101903e-05, "loss": 4.338172912597656, "step": 680 }, { "epoch": 0.19585580471189326, "grad_norm": 7.582495212554932, "learning_rate": 9.902214022140221e-05, "loss": 4.325717163085938, "step": 690 }, { "epoch": 0.19869429463525404, "grad_norm": 7.557459354400635, "learning_rate": 9.900794777178542e-05, "loss": 4.461737060546875, "step": 700 }, { "epoch": 0.2015327845586148, "grad_norm": 7.385646343231201, "learning_rate": 9.899375532216861e-05, "loss": 4.358491516113281, "step": 710 }, { "epoch": 0.2043712744819756, "grad_norm": 7.5637407302856445, "learning_rate": 9.89795628725518e-05, "loss": 4.352311706542968, "step": 720 }, { "epoch": 0.20720976440533637, "grad_norm": 7.401700019836426, "learning_rate": 9.8965370422935e-05, "loss": 4.315826416015625, "step": 730 }, { "epoch": 0.21004825432869714, "grad_norm": 7.531904220581055, "learning_rate": 9.895117797331819e-05, "loss": 4.323371887207031, "step": 740 }, { "epoch": 0.21288674425205792, "grad_norm": 7.4545183181762695, "learning_rate": 9.89369855237014e-05, "loss": 4.33734359741211, "step": 750 }, { "epoch": 0.21572523417541867, "grad_norm": 7.374246120452881, "learning_rate": 9.89227930740846e-05, "loss": 4.358880615234375, "step": 760 }, { "epoch": 0.21856372409877944, "grad_norm": 7.573185443878174, "learning_rate": 9.890860062446779e-05, "loss": 4.341133117675781, "step": 770 }, { "epoch": 0.22140221402214022, "grad_norm": 7.136955261230469, "learning_rate": 9.889440817485098e-05, "loss": 4.351332092285157, "step": 780 }, { "epoch": 0.224240703945501, "grad_norm": 6.988339900970459, "learning_rate": 9.888021572523419e-05, "loss": 
4.263563537597657, "step": 790 }, { "epoch": 0.22707919386886177, "grad_norm": 7.39056921005249, "learning_rate": 9.886602327561738e-05, "loss": 4.288069152832032, "step": 800 }, { "epoch": 0.22991768379222255, "grad_norm": 7.416216850280762, "learning_rate": 9.885183082600057e-05, "loss": 4.312583923339844, "step": 810 }, { "epoch": 0.2327561737155833, "grad_norm": 7.864441871643066, "learning_rate": 9.883763837638377e-05, "loss": 4.340090942382813, "step": 820 }, { "epoch": 0.23559466363894407, "grad_norm": 7.529303550720215, "learning_rate": 9.882344592676696e-05, "loss": 4.329707336425781, "step": 830 }, { "epoch": 0.23843315356230485, "grad_norm": 7.442269325256348, "learning_rate": 9.880925347715017e-05, "loss": 4.334190368652344, "step": 840 }, { "epoch": 0.24127164348566563, "grad_norm": 7.484404563903809, "learning_rate": 9.879506102753335e-05, "loss": 4.207070922851562, "step": 850 }, { "epoch": 0.2441101334090264, "grad_norm": 7.453701972961426, "learning_rate": 9.878086857791655e-05, "loss": 4.178807067871094, "step": 860 }, { "epoch": 0.24694862333238718, "grad_norm": 7.495302200317383, "learning_rate": 9.876667612829975e-05, "loss": 4.275102233886718, "step": 870 }, { "epoch": 0.24978711325574796, "grad_norm": 7.248611927032471, "learning_rate": 9.875248367868295e-05, "loss": 4.207390594482422, "step": 880 }, { "epoch": 0.2526256031791087, "grad_norm": 7.357669830322266, "learning_rate": 9.873829122906613e-05, "loss": 4.195413208007812, "step": 890 }, { "epoch": 0.2554640931024695, "grad_norm": 7.408799171447754, "learning_rate": 9.872409877944934e-05, "loss": 4.2572990417480465, "step": 900 }, { "epoch": 0.25830258302583026, "grad_norm": 7.3809123039245605, "learning_rate": 9.870990632983253e-05, "loss": 4.2829345703125, "step": 910 }, { "epoch": 0.26114107294919103, "grad_norm": 7.2500319480896, "learning_rate": 9.869571388021574e-05, "loss": 4.217479705810547, "step": 920 }, { "epoch": 0.2639795628725518, "grad_norm": 7.559372901916504, 
"learning_rate": 9.868152143059892e-05, "loss": 4.166035461425781, "step": 930 }, { "epoch": 0.2668180527959126, "grad_norm": 7.493195533752441, "learning_rate": 9.866732898098213e-05, "loss": 4.176700592041016, "step": 940 }, { "epoch": 0.26965654271927336, "grad_norm": 7.439601898193359, "learning_rate": 9.865313653136532e-05, "loss": 4.186924743652344, "step": 950 }, { "epoch": 0.27249503264263414, "grad_norm": 7.466905117034912, "learning_rate": 9.863894408174851e-05, "loss": 4.113320922851562, "step": 960 }, { "epoch": 0.2753335225659949, "grad_norm": 7.331973075866699, "learning_rate": 9.862475163213171e-05, "loss": 4.195168304443359, "step": 970 }, { "epoch": 0.27817201248935564, "grad_norm": 7.543071746826172, "learning_rate": 9.86105591825149e-05, "loss": 4.169551086425781, "step": 980 }, { "epoch": 0.2810105024127164, "grad_norm": 7.530240058898926, "learning_rate": 9.859636673289811e-05, "loss": 4.154579162597656, "step": 990 }, { "epoch": 0.2838489923360772, "grad_norm": 7.512021541595459, "learning_rate": 9.85821742832813e-05, "loss": 4.100720977783203, "step": 1000 }, { "epoch": 0.2838489923360772, "eval_accuracy": 0.0749666179182298, "eval_loss": 4.339223384857178, "eval_runtime": 46.5708, "eval_samples_per_second": 337.701, "eval_steps_per_second": 5.282, "step": 1000 }, { "epoch": 0.28668748225943796, "grad_norm": 7.3781046867370605, "learning_rate": 9.85679818336645e-05, "loss": 4.118574523925782, "step": 1010 }, { "epoch": 0.28952597218279874, "grad_norm": 7.494996070861816, "learning_rate": 9.855378938404769e-05, "loss": 4.2174072265625, "step": 1020 }, { "epoch": 0.2923644621061595, "grad_norm": 7.530285358428955, "learning_rate": 9.85395969344309e-05, "loss": 4.0882415771484375, "step": 1030 }, { "epoch": 0.2952029520295203, "grad_norm": 7.388326644897461, "learning_rate": 9.852540448481409e-05, "loss": 4.195710754394531, "step": 1040 }, { "epoch": 0.29804144195288107, "grad_norm": 7.545435428619385, "learning_rate": 9.851121203519728e-05, 
"loss": 4.10050048828125, "step": 1050 }, { "epoch": 0.30087993187624185, "grad_norm": 7.309998512268066, "learning_rate": 9.849701958558047e-05, "loss": 4.103385925292969, "step": 1060 }, { "epoch": 0.3037184217996026, "grad_norm": 7.32162618637085, "learning_rate": 9.848282713596367e-05, "loss": 4.052483367919922, "step": 1070 }, { "epoch": 0.3065569117229634, "grad_norm": 7.409285545349121, "learning_rate": 9.846863468634687e-05, "loss": 4.1409355163574215, "step": 1080 }, { "epoch": 0.3093954016463242, "grad_norm": 7.533459663391113, "learning_rate": 9.845444223673005e-05, "loss": 4.0912315368652346, "step": 1090 }, { "epoch": 0.31223389156968495, "grad_norm": 7.330305576324463, "learning_rate": 9.844024978711326e-05, "loss": 4.093975830078125, "step": 1100 }, { "epoch": 0.3150723814930457, "grad_norm": 7.608935832977295, "learning_rate": 9.842605733749645e-05, "loss": 4.112785339355469, "step": 1110 }, { "epoch": 0.31791087141640645, "grad_norm": 7.343015670776367, "learning_rate": 9.841186488787966e-05, "loss": 4.060424041748047, "step": 1120 }, { "epoch": 0.3207493613397672, "grad_norm": 7.392816543579102, "learning_rate": 9.839767243826284e-05, "loss": 4.102029418945312, "step": 1130 }, { "epoch": 0.323587851263128, "grad_norm": 7.59547758102417, "learning_rate": 9.838347998864605e-05, "loss": 4.055174255371094, "step": 1140 }, { "epoch": 0.3264263411864888, "grad_norm": 7.77843713760376, "learning_rate": 9.836928753902924e-05, "loss": 4.04984130859375, "step": 1150 }, { "epoch": 0.32926483110984955, "grad_norm": 7.300534725189209, "learning_rate": 9.835509508941243e-05, "loss": 4.075349426269531, "step": 1160 }, { "epoch": 0.33210332103321033, "grad_norm": 7.074621200561523, "learning_rate": 9.834090263979563e-05, "loss": 4.096788024902343, "step": 1170 }, { "epoch": 0.3349418109565711, "grad_norm": 7.0375776290893555, "learning_rate": 9.832671019017882e-05, "loss": 3.981670379638672, "step": 1180 }, { "epoch": 0.3377803008799319, "grad_norm": 
7.532200813293457, "learning_rate": 9.831251774056203e-05, "loss": 4.134120178222656, "step": 1190 }, { "epoch": 0.34061879080329266, "grad_norm": 7.49403190612793, "learning_rate": 9.829832529094522e-05, "loss": 4.025303649902344, "step": 1200 }, { "epoch": 0.34345728072665344, "grad_norm": 7.981142520904541, "learning_rate": 9.828413284132841e-05, "loss": 4.074159240722656, "step": 1210 }, { "epoch": 0.3462957706500142, "grad_norm": 7.059720039367676, "learning_rate": 9.826994039171161e-05, "loss": 3.9903884887695313, "step": 1220 }, { "epoch": 0.349134260573375, "grad_norm": 7.4872894287109375, "learning_rate": 9.825574794209481e-05, "loss": 4.013993835449218, "step": 1230 }, { "epoch": 0.3519727504967357, "grad_norm": 7.589225769042969, "learning_rate": 9.824155549247801e-05, "loss": 4.116233825683594, "step": 1240 }, { "epoch": 0.3548112404200965, "grad_norm": 7.458975315093994, "learning_rate": 9.82273630428612e-05, "loss": 4.010140991210937, "step": 1250 }, { "epoch": 0.35764973034345726, "grad_norm": 7.350898742675781, "learning_rate": 9.82131705932444e-05, "loss": 4.049894714355469, "step": 1260 }, { "epoch": 0.36048822026681804, "grad_norm": 7.416518211364746, "learning_rate": 9.81989781436276e-05, "loss": 3.974617767333984, "step": 1270 }, { "epoch": 0.3633267101901788, "grad_norm": 7.816158294677734, "learning_rate": 9.81847856940108e-05, "loss": 3.978753662109375, "step": 1280 }, { "epoch": 0.3661652001135396, "grad_norm": 7.631082534790039, "learning_rate": 9.817059324439399e-05, "loss": 4.002184295654297, "step": 1290 }, { "epoch": 0.36900369003690037, "grad_norm": 7.88928747177124, "learning_rate": 9.815640079477718e-05, "loss": 3.947435760498047, "step": 1300 }, { "epoch": 0.37184217996026114, "grad_norm": 7.25336217880249, "learning_rate": 9.814220834516038e-05, "loss": 4.004731750488281, "step": 1310 }, { "epoch": 0.3746806698836219, "grad_norm": 7.2683424949646, "learning_rate": 9.812801589554358e-05, "loss": 3.9502548217773437, "step": 1320 }, 
{ "epoch": 0.3775191598069827, "grad_norm": 7.0883684158325195, "learning_rate": 9.811382344592676e-05, "loss": 3.9726882934570313, "step": 1330 }, { "epoch": 0.3803576497303435, "grad_norm": 7.190035820007324, "learning_rate": 9.809963099630997e-05, "loss": 3.9505935668945313, "step": 1340 }, { "epoch": 0.38319613965370425, "grad_norm": 7.727528095245361, "learning_rate": 9.808543854669316e-05, "loss": 3.9511199951171876, "step": 1350 }, { "epoch": 0.386034629577065, "grad_norm": 7.00855016708374, "learning_rate": 9.807124609707637e-05, "loss": 3.975506591796875, "step": 1360 }, { "epoch": 0.3888731195004258, "grad_norm": 7.4674763679504395, "learning_rate": 9.805705364745955e-05, "loss": 3.9500396728515623, "step": 1370 }, { "epoch": 0.3917116094237865, "grad_norm": 7.539651393890381, "learning_rate": 9.804286119784276e-05, "loss": 3.9499832153320313, "step": 1380 }, { "epoch": 0.3945500993471473, "grad_norm": 7.42949914932251, "learning_rate": 9.802866874822595e-05, "loss": 3.950425720214844, "step": 1390 }, { "epoch": 0.3973885892705081, "grad_norm": 7.866581916809082, "learning_rate": 9.801447629860914e-05, "loss": 3.8999092102050783, "step": 1400 }, { "epoch": 0.40022707919386885, "grad_norm": 8.170848846435547, "learning_rate": 9.800028384899234e-05, "loss": 3.888724517822266, "step": 1410 }, { "epoch": 0.4030655691172296, "grad_norm": 7.490846157073975, "learning_rate": 9.798609139937553e-05, "loss": 3.9405487060546873, "step": 1420 }, { "epoch": 0.4059040590405904, "grad_norm": 7.587957382202148, "learning_rate": 9.797189894975874e-05, "loss": 4.001874542236328, "step": 1430 }, { "epoch": 0.4087425489639512, "grad_norm": 7.573546409606934, "learning_rate": 9.795770650014193e-05, "loss": 3.9135177612304686, "step": 1440 }, { "epoch": 0.41158103888731196, "grad_norm": 7.1717023849487305, "learning_rate": 9.794351405052512e-05, "loss": 3.9038543701171875, "step": 1450 }, { "epoch": 0.41441952881067273, "grad_norm": 7.459227085113525, "learning_rate": 
9.792932160090832e-05, "loss": 3.9926803588867186, "step": 1460 }, { "epoch": 0.4172580187340335, "grad_norm": 7.641129016876221, "learning_rate": 9.791512915129152e-05, "loss": 3.8880523681640624, "step": 1470 }, { "epoch": 0.4200965086573943, "grad_norm": 7.6166863441467285, "learning_rate": 9.790093670167472e-05, "loss": 3.9282272338867186, "step": 1480 }, { "epoch": 0.42293499858075506, "grad_norm": 7.763879299163818, "learning_rate": 9.788674425205791e-05, "loss": 3.897177505493164, "step": 1490 }, { "epoch": 0.42577348850411584, "grad_norm": 7.537065029144287, "learning_rate": 9.78725518024411e-05, "loss": 3.9462852478027344, "step": 1500 }, { "epoch": 0.42577348850411584, "eval_accuracy": 0.09156228142684555, "eval_loss": 4.094232559204102, "eval_runtime": 47.2443, "eval_samples_per_second": 332.887, "eval_steps_per_second": 5.207, "step": 1500 }, { "epoch": 0.42861197842747656, "grad_norm": 7.482300758361816, "learning_rate": 9.785835935282431e-05, "loss": 3.8631237030029295, "step": 1510 }, { "epoch": 0.43145046835083734, "grad_norm": 7.203685760498047, "learning_rate": 9.78441669032075e-05, "loss": 3.881513214111328, "step": 1520 }, { "epoch": 0.4342889582741981, "grad_norm": 7.295687675476074, "learning_rate": 9.78299744535907e-05, "loss": 3.915008544921875, "step": 1530 }, { "epoch": 0.4371274481975589, "grad_norm": 7.128635883331299, "learning_rate": 9.781578200397389e-05, "loss": 3.8935218811035157, "step": 1540 }, { "epoch": 0.43996593812091966, "grad_norm": 7.163247585296631, "learning_rate": 9.780158955435708e-05, "loss": 3.8974098205566405, "step": 1550 }, { "epoch": 0.44280442804428044, "grad_norm": 7.198397636413574, "learning_rate": 9.778739710474029e-05, "loss": 3.921527862548828, "step": 1560 }, { "epoch": 0.4456429179676412, "grad_norm": 7.140389919281006, "learning_rate": 9.777320465512347e-05, "loss": 3.9081375122070314, "step": 1570 }, { "epoch": 0.448481407891002, "grad_norm": 7.363821029663086, "learning_rate": 9.775901220550668e-05, 
"loss": 3.9475006103515624, "step": 1580 }, { "epoch": 0.45131989781436277, "grad_norm": 7.8528032302856445, "learning_rate": 9.774481975588987e-05, "loss": 3.888922119140625, "step": 1590 }, { "epoch": 0.45415838773772355, "grad_norm": 7.489673614501953, "learning_rate": 9.773062730627308e-05, "loss": 3.8283409118652343, "step": 1600 }, { "epoch": 0.4569968776610843, "grad_norm": 7.343048095703125, "learning_rate": 9.771643485665626e-05, "loss": 3.8091278076171875, "step": 1610 }, { "epoch": 0.4598353675844451, "grad_norm": 7.540294647216797, "learning_rate": 9.770224240703946e-05, "loss": 3.836109161376953, "step": 1620 }, { "epoch": 0.4626738575078059, "grad_norm": 7.126834392547607, "learning_rate": 9.768804995742266e-05, "loss": 3.890799713134766, "step": 1630 }, { "epoch": 0.4655123474311666, "grad_norm": 7.19163703918457, "learning_rate": 9.767385750780585e-05, "loss": 3.825541687011719, "step": 1640 }, { "epoch": 0.46835083735452737, "grad_norm": 7.563293933868408, "learning_rate": 9.765966505818904e-05, "loss": 3.8770778656005858, "step": 1650 }, { "epoch": 0.47118932727788815, "grad_norm": 7.4351911544799805, "learning_rate": 9.764547260857224e-05, "loss": 3.8535110473632814, "step": 1660 }, { "epoch": 0.4740278172012489, "grad_norm": 7.410805702209473, "learning_rate": 9.763128015895544e-05, "loss": 3.844521331787109, "step": 1670 }, { "epoch": 0.4768663071246097, "grad_norm": 7.007425308227539, "learning_rate": 9.761708770933864e-05, "loss": 3.847270965576172, "step": 1680 }, { "epoch": 0.4797047970479705, "grad_norm": 7.266729354858398, "learning_rate": 9.760289525972183e-05, "loss": 3.8391189575195312, "step": 1690 }, { "epoch": 0.48254328697133125, "grad_norm": 7.043731212615967, "learning_rate": 9.758870281010502e-05, "loss": 3.8580551147460938, "step": 1700 }, { "epoch": 0.48538177689469203, "grad_norm": 7.80348539352417, "learning_rate": 9.757451036048823e-05, "loss": 3.8612030029296873, "step": 1710 }, { "epoch": 0.4882202668180528, "grad_norm": 
7.110872268676758, "learning_rate": 9.756031791087142e-05, "loss": 3.8344985961914064, "step": 1720 }, { "epoch": 0.4910587567414136, "grad_norm": 7.537649631500244, "learning_rate": 9.754612546125462e-05, "loss": 3.82003173828125, "step": 1730 }, { "epoch": 0.49389724666477436, "grad_norm": 7.452249050140381, "learning_rate": 9.753193301163781e-05, "loss": 3.835468292236328, "step": 1740 }, { "epoch": 0.49673573658813513, "grad_norm": 7.655674934387207, "learning_rate": 9.7517740562021e-05, "loss": 3.796575927734375, "step": 1750 }, { "epoch": 0.4995742265114959, "grad_norm": 7.385485649108887, "learning_rate": 9.750354811240421e-05, "loss": 3.808521270751953, "step": 1760 }, { "epoch": 0.5024127164348566, "grad_norm": 7.229243755340576, "learning_rate": 9.74893556627874e-05, "loss": 3.773734283447266, "step": 1770 }, { "epoch": 0.5052512063582174, "grad_norm": 7.363079071044922, "learning_rate": 9.74751632131706e-05, "loss": 3.810292053222656, "step": 1780 }, { "epoch": 0.5080896962815782, "grad_norm": 7.342677116394043, "learning_rate": 9.746097076355379e-05, "loss": 3.8419174194335937, "step": 1790 }, { "epoch": 0.510928186204939, "grad_norm": 7.295414447784424, "learning_rate": 9.7446778313937e-05, "loss": 3.835002899169922, "step": 1800 }, { "epoch": 0.5137666761282997, "grad_norm": 7.010468482971191, "learning_rate": 9.743258586432019e-05, "loss": 3.842693328857422, "step": 1810 }, { "epoch": 0.5166051660516605, "grad_norm": 7.429100513458252, "learning_rate": 9.741839341470338e-05, "loss": 3.808984375, "step": 1820 }, { "epoch": 0.5194436559750213, "grad_norm": 7.120047092437744, "learning_rate": 9.740420096508658e-05, "loss": 3.8370285034179688, "step": 1830 }, { "epoch": 0.5222821458983821, "grad_norm": 7.252555847167969, "learning_rate": 9.739000851546978e-05, "loss": 3.720958709716797, "step": 1840 }, { "epoch": 0.5251206358217428, "grad_norm": 7.269909858703613, "learning_rate": 9.737581606585298e-05, "loss": 3.757588195800781, "step": 1850 }, { 
"epoch": 0.5279591257451036, "grad_norm": 7.816965103149414, "learning_rate": 9.736162361623617e-05, "loss": 3.788361358642578, "step": 1860 }, { "epoch": 0.5307976156684644, "grad_norm": 7.376947402954102, "learning_rate": 9.734743116661936e-05, "loss": 3.834918975830078, "step": 1870 }, { "epoch": 0.5336361055918252, "grad_norm": 7.221868515014648, "learning_rate": 9.733323871700256e-05, "loss": 3.785225677490234, "step": 1880 }, { "epoch": 0.536474595515186, "grad_norm": 7.156528949737549, "learning_rate": 9.731904626738576e-05, "loss": 3.7371444702148438, "step": 1890 }, { "epoch": 0.5393130854385467, "grad_norm": 7.221426963806152, "learning_rate": 9.730485381776894e-05, "loss": 3.8065452575683594, "step": 1900 }, { "epoch": 0.5421515753619075, "grad_norm": 7.087551116943359, "learning_rate": 9.729066136815215e-05, "loss": 3.7958106994628906, "step": 1910 }, { "epoch": 0.5449900652852683, "grad_norm": 7.675655841827393, "learning_rate": 9.727646891853534e-05, "loss": 3.7744598388671875, "step": 1920 }, { "epoch": 0.547828555208629, "grad_norm": 7.386330604553223, "learning_rate": 9.726227646891855e-05, "loss": 3.7709419250488283, "step": 1930 }, { "epoch": 0.5506670451319898, "grad_norm": 7.170223712921143, "learning_rate": 9.724808401930173e-05, "loss": 3.7384742736816405, "step": 1940 }, { "epoch": 0.5535055350553506, "grad_norm": 7.229175567626953, "learning_rate": 9.723389156968494e-05, "loss": 3.761402130126953, "step": 1950 }, { "epoch": 0.5563440249787113, "grad_norm": 7.357839107513428, "learning_rate": 9.721969912006813e-05, "loss": 3.766895294189453, "step": 1960 }, { "epoch": 0.559182514902072, "grad_norm": 7.547580242156982, "learning_rate": 9.720550667045132e-05, "loss": 3.77874755859375, "step": 1970 }, { "epoch": 0.5620210048254328, "grad_norm": 7.912546634674072, "learning_rate": 9.719131422083452e-05, "loss": 3.7376174926757812, "step": 1980 }, { "epoch": 0.5648594947487936, "grad_norm": 7.137049674987793, "learning_rate": 
9.717712177121771e-05, "loss": 3.7662296295166016, "step": 1990 }, { "epoch": 0.5676979846721544, "grad_norm": 7.490048408508301, "learning_rate": 9.716292932160092e-05, "loss": 3.7667610168457033, "step": 2000 }, { "epoch": 0.5676979846721544, "eval_accuracy": 0.10637756724105042, "eval_loss": 3.9384877681732178, "eval_runtime": 49.7564, "eval_samples_per_second": 316.08, "eval_steps_per_second": 4.944, "step": 2000 }, { "epoch": 0.5705364745955152, "grad_norm": 7.462189197540283, "learning_rate": 9.714873687198411e-05, "loss": 3.7647079467773437, "step": 2010 }, { "epoch": 0.5733749645188759, "grad_norm": 7.215369701385498, "learning_rate": 9.71345444223673e-05, "loss": 3.7710647583007812, "step": 2020 }, { "epoch": 0.5762134544422367, "grad_norm": 7.437196254730225, "learning_rate": 9.71203519727505e-05, "loss": 3.658802032470703, "step": 2030 }, { "epoch": 0.5790519443655975, "grad_norm": 7.301856517791748, "learning_rate": 9.71061595231337e-05, "loss": 3.7459312438964845, "step": 2040 }, { "epoch": 0.5818904342889583, "grad_norm": 7.765122890472412, "learning_rate": 9.70919670735169e-05, "loss": 3.743536376953125, "step": 2050 }, { "epoch": 0.584728924212319, "grad_norm": 6.925072193145752, "learning_rate": 9.707777462390009e-05, "loss": 3.710199737548828, "step": 2060 }, { "epoch": 0.5875674141356798, "grad_norm": 7.253894805908203, "learning_rate": 9.706358217428328e-05, "loss": 3.7687889099121095, "step": 2070 }, { "epoch": 0.5904059040590406, "grad_norm": 7.424910545349121, "learning_rate": 9.704938972466648e-05, "loss": 3.7045562744140623, "step": 2080 }, { "epoch": 0.5932443939824014, "grad_norm": 7.380148410797119, "learning_rate": 9.703519727504968e-05, "loss": 3.6738853454589844, "step": 2090 }, { "epoch": 0.5960828839057621, "grad_norm": 7.1335248947143555, "learning_rate": 9.702100482543286e-05, "loss": 3.720709228515625, "step": 2100 }, { "epoch": 0.5989213738291229, "grad_norm": 7.660240173339844, "learning_rate": 9.700681237581607e-05, "loss": 
3.7432182312011717, "step": 2110 }, { "epoch": 0.6017598637524837, "grad_norm": 7.4602861404418945, "learning_rate": 9.699261992619926e-05, "loss": 3.775579071044922, "step": 2120 }, { "epoch": 0.6045983536758445, "grad_norm": 7.762479782104492, "learning_rate": 9.697842747658247e-05, "loss": 3.638703155517578, "step": 2130 }, { "epoch": 0.6074368435992052, "grad_norm": 7.361124515533447, "learning_rate": 9.696423502696565e-05, "loss": 3.6977561950683593, "step": 2140 }, { "epoch": 0.610275333522566, "grad_norm": 7.593191623687744, "learning_rate": 9.695004257734886e-05, "loss": 3.6737380981445313, "step": 2150 }, { "epoch": 0.6131138234459268, "grad_norm": 7.438437461853027, "learning_rate": 9.693585012773205e-05, "loss": 3.6838111877441406, "step": 2160 }, { "epoch": 0.6159523133692876, "grad_norm": 7.179313659667969, "learning_rate": 9.692165767811526e-05, "loss": 3.691519927978516, "step": 2170 }, { "epoch": 0.6187908032926484, "grad_norm": 6.957818984985352, "learning_rate": 9.690746522849844e-05, "loss": 3.717293548583984, "step": 2180 }, { "epoch": 0.6216292932160091, "grad_norm": 7.232036590576172, "learning_rate": 9.689327277888164e-05, "loss": 3.680225372314453, "step": 2190 }, { "epoch": 0.6244677831393699, "grad_norm": 7.262876987457275, "learning_rate": 9.687908032926484e-05, "loss": 3.7111167907714844, "step": 2200 }, { "epoch": 0.6273062730627307, "grad_norm": 8.157391548156738, "learning_rate": 9.686488787964803e-05, "loss": 3.7050239562988283, "step": 2210 }, { "epoch": 0.6301447629860913, "grad_norm": 7.367589473724365, "learning_rate": 9.685069543003122e-05, "loss": 3.6679656982421873, "step": 2220 }, { "epoch": 0.6329832529094521, "grad_norm": 7.720321178436279, "learning_rate": 9.683650298041442e-05, "loss": 3.659359359741211, "step": 2230 }, { "epoch": 0.6358217428328129, "grad_norm": 7.420895576477051, "learning_rate": 9.682231053079763e-05, "loss": 3.667042922973633, "step": 2240 }, { "epoch": 0.6386602327561737, "grad_norm": 
7.23732852935791, "learning_rate": 9.680811808118082e-05, "loss": 3.689688491821289, "step": 2250 }, { "epoch": 0.6414987226795344, "grad_norm": 7.261723041534424, "learning_rate": 9.679392563156401e-05, "loss": 3.763965606689453, "step": 2260 }, { "epoch": 0.6443372126028952, "grad_norm": 7.487043380737305, "learning_rate": 9.67797331819472e-05, "loss": 3.6036285400390624, "step": 2270 }, { "epoch": 0.647175702526256, "grad_norm": 7.626828670501709, "learning_rate": 9.676554073233041e-05, "loss": 3.7397308349609375, "step": 2280 }, { "epoch": 0.6500141924496168, "grad_norm": 7.565433979034424, "learning_rate": 9.67513482827136e-05, "loss": 3.709708404541016, "step": 2290 }, { "epoch": 0.6528526823729776, "grad_norm": 7.4478912353515625, "learning_rate": 9.67371558330968e-05, "loss": 3.6449459075927733, "step": 2300 }, { "epoch": 0.6556911722963383, "grad_norm": 7.31154727935791, "learning_rate": 9.672296338347999e-05, "loss": 3.6980731964111326, "step": 2310 }, { "epoch": 0.6585296622196991, "grad_norm": 7.294930458068848, "learning_rate": 9.670877093386319e-05, "loss": 3.6901737213134767, "step": 2320 }, { "epoch": 0.6613681521430599, "grad_norm": 7.380934238433838, "learning_rate": 9.669457848424639e-05, "loss": 3.7028076171875, "step": 2330 }, { "epoch": 0.6642066420664207, "grad_norm": 7.143312454223633, "learning_rate": 9.668038603462957e-05, "loss": 3.740081787109375, "step": 2340 }, { "epoch": 0.6670451319897814, "grad_norm": 7.557438850402832, "learning_rate": 9.666619358501278e-05, "loss": 3.6650466918945312, "step": 2350 }, { "epoch": 0.6698836219131422, "grad_norm": 7.303873062133789, "learning_rate": 9.665200113539597e-05, "loss": 3.6579383850097655, "step": 2360 }, { "epoch": 0.672722111836503, "grad_norm": 7.311411380767822, "learning_rate": 9.663780868577918e-05, "loss": 3.5823558807373046, "step": 2370 }, { "epoch": 0.6755606017598638, "grad_norm": 7.140070915222168, "learning_rate": 9.662361623616236e-05, "loss": 3.586924362182617, "step": 2380 }, 
{ "epoch": 0.6783990916832245, "grad_norm": 7.323861598968506, "learning_rate": 9.660942378654557e-05, "loss": 3.6682365417480467, "step": 2390 }, { "epoch": 0.6812375816065853, "grad_norm": 6.9112396240234375, "learning_rate": 9.659523133692876e-05, "loss": 3.6532161712646483, "step": 2400 }, { "epoch": 0.6840760715299461, "grad_norm": 7.156010627746582, "learning_rate": 9.658103888731195e-05, "loss": 3.689322662353516, "step": 2410 }, { "epoch": 0.6869145614533069, "grad_norm": 7.530256271362305, "learning_rate": 9.656684643769515e-05, "loss": 3.5621185302734375, "step": 2420 }, { "epoch": 0.6897530513766676, "grad_norm": 7.8801589012146, "learning_rate": 9.655265398807834e-05, "loss": 3.6548622131347654, "step": 2430 }, { "epoch": 0.6925915413000284, "grad_norm": 6.943436622619629, "learning_rate": 9.653846153846155e-05, "loss": 3.674822235107422, "step": 2440 }, { "epoch": 0.6954300312233892, "grad_norm": 7.101441860198975, "learning_rate": 9.652426908884474e-05, "loss": 3.622018814086914, "step": 2450 }, { "epoch": 0.69826852114675, "grad_norm": 7.4701924324035645, "learning_rate": 9.651007663922793e-05, "loss": 3.6729667663574217, "step": 2460 }, { "epoch": 0.7011070110701108, "grad_norm": 7.105383396148682, "learning_rate": 9.649588418961113e-05, "loss": 3.5898292541503904, "step": 2470 }, { "epoch": 0.7039455009934714, "grad_norm": 7.484338760375977, "learning_rate": 9.648169173999433e-05, "loss": 3.6049171447753907, "step": 2480 }, { "epoch": 0.7067839909168322, "grad_norm": 7.060992240905762, "learning_rate": 9.646749929037753e-05, "loss": 3.6324234008789062, "step": 2490 }, { "epoch": 0.709622480840193, "grad_norm": 7.2826457023620605, "learning_rate": 9.645330684076072e-05, "loss": 3.6472145080566407, "step": 2500 }, { "epoch": 0.709622480840193, "eval_accuracy": 0.11973039994913207, "eval_loss": 3.80971360206604, "eval_runtime": 49.6219, "eval_samples_per_second": 316.936, "eval_steps_per_second": 4.957, "step": 2500 }, { "epoch": 0.7124609707635537, 
"grad_norm": 7.482487201690674, "learning_rate": 9.643911439114391e-05, "loss": 3.610051727294922, "step": 2510 }, { "epoch": 0.7152994606869145, "grad_norm": 7.180037021636963, "learning_rate": 9.642492194152712e-05, "loss": 3.688520050048828, "step": 2520 }, { "epoch": 0.7181379506102753, "grad_norm": 7.227767467498779, "learning_rate": 9.641072949191031e-05, "loss": 3.645109939575195, "step": 2530 }, { "epoch": 0.7209764405336361, "grad_norm": 7.288755893707275, "learning_rate": 9.63965370422935e-05, "loss": 3.6191864013671875, "step": 2540 }, { "epoch": 0.7238149304569969, "grad_norm": 7.296230316162109, "learning_rate": 9.63823445926767e-05, "loss": 3.6331405639648438, "step": 2550 }, { "epoch": 0.7266534203803576, "grad_norm": 7.676813125610352, "learning_rate": 9.636815214305989e-05, "loss": 3.6324691772460938, "step": 2560 }, { "epoch": 0.7294919103037184, "grad_norm": 7.43597936630249, "learning_rate": 9.63539596934431e-05, "loss": 3.61390380859375, "step": 2570 }, { "epoch": 0.7323304002270792, "grad_norm": 7.204632759094238, "learning_rate": 9.633976724382628e-05, "loss": 3.639260482788086, "step": 2580 }, { "epoch": 0.73516889015044, "grad_norm": 7.291706085205078, "learning_rate": 9.632557479420949e-05, "loss": 3.5786514282226562, "step": 2590 }, { "epoch": 0.7380073800738007, "grad_norm": 7.207672119140625, "learning_rate": 9.631138234459268e-05, "loss": 3.5225643157958983, "step": 2600 }, { "epoch": 0.7408458699971615, "grad_norm": 7.254996299743652, "learning_rate": 9.629718989497589e-05, "loss": 3.6426658630371094, "step": 2610 }, { "epoch": 0.7436843599205223, "grad_norm": 7.12778377532959, "learning_rate": 9.628299744535907e-05, "loss": 3.532062530517578, "step": 2620 }, { "epoch": 0.7465228498438831, "grad_norm": 7.583011627197266, "learning_rate": 9.626880499574227e-05, "loss": 3.6407665252685546, "step": 2630 }, { "epoch": 0.7493613397672438, "grad_norm": 7.308427810668945, "learning_rate": 9.625461254612547e-05, "loss": 3.549312973022461, 
"step": 2640 }, { "epoch": 0.7521998296906046, "grad_norm": 7.261886119842529, "learning_rate": 9.624042009650866e-05, "loss": 3.574083709716797, "step": 2650 }, { "epoch": 0.7550383196139654, "grad_norm": 7.2608642578125, "learning_rate": 9.622622764689185e-05, "loss": 3.5501007080078124, "step": 2660 }, { "epoch": 0.7578768095373262, "grad_norm": 7.297380447387695, "learning_rate": 9.621203519727505e-05, "loss": 3.601484680175781, "step": 2670 }, { "epoch": 0.760715299460687, "grad_norm": 7.433940887451172, "learning_rate": 9.619784274765825e-05, "loss": 3.6204559326171877, "step": 2680 }, { "epoch": 0.7635537893840477, "grad_norm": 7.104815483093262, "learning_rate": 9.618365029804145e-05, "loss": 3.5624732971191406, "step": 2690 }, { "epoch": 0.7663922793074085, "grad_norm": 7.194116115570068, "learning_rate": 9.616945784842464e-05, "loss": 3.550040435791016, "step": 2700 }, { "epoch": 0.7692307692307693, "grad_norm": 7.035511016845703, "learning_rate": 9.615526539880783e-05, "loss": 3.553791809082031, "step": 2710 }, { "epoch": 0.77206925915413, "grad_norm": 7.355343341827393, "learning_rate": 9.614107294919104e-05, "loss": 3.4965911865234376, "step": 2720 }, { "epoch": 0.7749077490774908, "grad_norm": 7.1193366050720215, "learning_rate": 9.612688049957423e-05, "loss": 3.580805206298828, "step": 2730 }, { "epoch": 0.7777462390008516, "grad_norm": 7.123780727386475, "learning_rate": 9.611268804995743e-05, "loss": 3.5761260986328125, "step": 2740 }, { "epoch": 0.7805847289242123, "grad_norm": 7.398573398590088, "learning_rate": 9.609849560034062e-05, "loss": 3.509032440185547, "step": 2750 }, { "epoch": 0.783423218847573, "grad_norm": 7.086894512176514, "learning_rate": 9.608430315072383e-05, "loss": 3.540152359008789, "step": 2760 }, { "epoch": 0.7862617087709338, "grad_norm": 7.653377532958984, "learning_rate": 9.607011070110702e-05, "loss": 3.5385326385498046, "step": 2770 }, { "epoch": 0.7891001986942946, "grad_norm": 7.275185585021973, "learning_rate": 
9.605733749645189e-05, "loss": 3.542436218261719, "step": 2780 }, { "epoch": 0.7919386886176554, "grad_norm": 7.434903144836426, "learning_rate": 9.604314504683509e-05, "loss": 3.5810466766357423, "step": 2790 }, { "epoch": 0.7947771785410161, "grad_norm": 7.3837714195251465, "learning_rate": 9.602895259721828e-05, "loss": 3.4981517791748047, "step": 2800 }, { "epoch": 0.7976156684643769, "grad_norm": 7.432369709014893, "learning_rate": 9.601476014760147e-05, "loss": 3.565850830078125, "step": 2810 }, { "epoch": 0.8004541583877377, "grad_norm": 7.555720806121826, "learning_rate": 9.600056769798468e-05, "loss": 3.568635940551758, "step": 2820 }, { "epoch": 0.8032926483110985, "grad_norm": 7.406350135803223, "learning_rate": 9.598637524836787e-05, "loss": 3.57777099609375, "step": 2830 }, { "epoch": 0.8061311382344593, "grad_norm": 7.433150768280029, "learning_rate": 9.597218279875107e-05, "loss": 3.570902633666992, "step": 2840 }, { "epoch": 0.80896962815782, "grad_norm": 7.303187370300293, "learning_rate": 9.595799034913426e-05, "loss": 3.5548660278320314, "step": 2850 }, { "epoch": 0.8118081180811808, "grad_norm": 7.327703952789307, "learning_rate": 9.594379789951747e-05, "loss": 3.5038543701171876, "step": 2860 }, { "epoch": 0.8146466080045416, "grad_norm": 7.204839706420898, "learning_rate": 9.592960544990066e-05, "loss": 3.5796218872070313, "step": 2870 }, { "epoch": 0.8174850979279024, "grad_norm": 7.403165817260742, "learning_rate": 9.591541300028385e-05, "loss": 3.572002410888672, "step": 2880 }, { "epoch": 0.8203235878512631, "grad_norm": 7.163651466369629, "learning_rate": 9.590122055066705e-05, "loss": 3.5343738555908204, "step": 2890 }, { "epoch": 0.8231620777746239, "grad_norm": 7.3628644943237305, "learning_rate": 9.588702810105025e-05, "loss": 3.5443962097167967, "step": 2900 }, { "epoch": 0.8260005676979847, "grad_norm": 7.071038722991943, "learning_rate": 9.587283565143345e-05, "loss": 3.5356773376464843, "step": 2910 }, { "epoch": 
0.8288390576213455, "grad_norm": 7.461095809936523, "learning_rate": 9.585864320181664e-05, "loss": 3.492567443847656, "step": 2920 }, { "epoch": 0.8316775475447062, "grad_norm": 7.566678047180176, "learning_rate": 9.584445075219983e-05, "loss": 3.530070495605469, "step": 2930 }, { "epoch": 0.834516037468067, "grad_norm": 7.0731201171875, "learning_rate": 9.583025830258303e-05, "loss": 3.5129356384277344, "step": 2940 }, { "epoch": 0.8373545273914278, "grad_norm": 7.395421028137207, "learning_rate": 9.581606585296623e-05, "loss": 3.5494197845458983, "step": 2950 }, { "epoch": 0.8401930173147886, "grad_norm": 7.107268333435059, "learning_rate": 9.580187340334941e-05, "loss": 3.54199104309082, "step": 2960 }, { "epoch": 0.8430315072381493, "grad_norm": 7.520330905914307, "learning_rate": 9.578768095373262e-05, "loss": 3.5169609069824217, "step": 2970 }, { "epoch": 0.8458699971615101, "grad_norm": 7.476873397827148, "learning_rate": 9.577348850411581e-05, "loss": 3.458138275146484, "step": 2980 }, { "epoch": 0.8487084870848709, "grad_norm": 7.781281471252441, "learning_rate": 9.575929605449902e-05, "loss": 3.518218994140625, "step": 2990 }, { "epoch": 0.8515469770082317, "grad_norm": 7.301414489746094, "learning_rate": 9.57451036048822e-05, "loss": 3.4562393188476563, "step": 3000 }, { "epoch": 0.8515469770082317, "eval_accuracy": 0.12774209957398105, "eval_loss": 3.7206008434295654, "eval_runtime": 48.3186, "eval_samples_per_second": 325.486, "eval_steps_per_second": 5.091, "step": 3000 }, { "epoch": 0.8543854669315923, "grad_norm": 7.186266899108887, "learning_rate": 9.573091115526541e-05, "loss": 3.4925193786621094, "step": 3010 }, { "epoch": 0.8572239568549531, "grad_norm": 7.168111324310303, "learning_rate": 9.57167187056486e-05, "loss": 3.4770233154296877, "step": 3020 }, { "epoch": 0.8600624467783139, "grad_norm": 7.3371710777282715, "learning_rate": 9.57025262560318e-05, "loss": 3.527056884765625, "step": 3030 }, { "epoch": 0.8629009367016747, "grad_norm": 
7.28606653213501, "learning_rate": 9.568833380641499e-05, "loss": 3.506011199951172, "step": 3040 }, { "epoch": 0.8657394266250354, "grad_norm": 7.301010608673096, "learning_rate": 9.567414135679818e-05, "loss": 3.5705257415771485, "step": 3050 }, { "epoch": 0.8685779165483962, "grad_norm": 7.810239315032959, "learning_rate": 9.565994890718139e-05, "loss": 3.5152103424072267, "step": 3060 }, { "epoch": 0.871416406471757, "grad_norm": 7.248566627502441, "learning_rate": 9.564575645756458e-05, "loss": 3.5466377258300783, "step": 3070 }, { "epoch": 0.8742548963951178, "grad_norm": 7.049148082733154, "learning_rate": 9.563156400794777e-05, "loss": 3.5013992309570314, "step": 3080 }, { "epoch": 0.8770933863184786, "grad_norm": 7.321220397949219, "learning_rate": 9.561737155833097e-05, "loss": 3.4358802795410157, "step": 3090 }, { "epoch": 0.8799318762418393, "grad_norm": 7.252366542816162, "learning_rate": 9.560317910871417e-05, "loss": 3.4336090087890625, "step": 3100 }, { "epoch": 0.8827703661652001, "grad_norm": 7.176443576812744, "learning_rate": 9.558898665909737e-05, "loss": 3.4392013549804688, "step": 3110 }, { "epoch": 0.8856088560885609, "grad_norm": 7.287851810455322, "learning_rate": 9.557479420948056e-05, "loss": 3.45914306640625, "step": 3120 }, { "epoch": 0.8884473460119217, "grad_norm": 7.333312511444092, "learning_rate": 9.556060175986375e-05, "loss": 3.4894023895263673, "step": 3130 }, { "epoch": 0.8912858359352824, "grad_norm": 7.352703094482422, "learning_rate": 9.554640931024695e-05, "loss": 3.5318389892578126, "step": 3140 }, { "epoch": 0.8941243258586432, "grad_norm": 7.3453545570373535, "learning_rate": 9.553221686063015e-05, "loss": 3.489518737792969, "step": 3150 }, { "epoch": 0.896962815782004, "grad_norm": 7.323802471160889, "learning_rate": 9.551802441101333e-05, "loss": 3.537572479248047, "step": 3160 }, { "epoch": 0.8998013057053648, "grad_norm": 7.320858478546143, "learning_rate": 9.550383196139654e-05, "loss": 3.454131317138672, "step": 
3170 }, { "epoch": 0.9026397956287255, "grad_norm": 7.0663557052612305, "learning_rate": 9.548963951177973e-05, "loss": 3.3973625183105467, "step": 3180 }, { "epoch": 0.9054782855520863, "grad_norm": 7.05025053024292, "learning_rate": 9.547544706216294e-05, "loss": 3.479450988769531, "step": 3190 }, { "epoch": 0.9083167754754471, "grad_norm": 7.340791702270508, "learning_rate": 9.546125461254612e-05, "loss": 3.4749183654785156, "step": 3200 }, { "epoch": 0.9111552653988079, "grad_norm": 7.227952480316162, "learning_rate": 9.544706216292933e-05, "loss": 3.509593963623047, "step": 3210 }, { "epoch": 0.9139937553221686, "grad_norm": 7.240564823150635, "learning_rate": 9.543286971331252e-05, "loss": 3.47366943359375, "step": 3220 }, { "epoch": 0.9168322452455294, "grad_norm": 7.625861644744873, "learning_rate": 9.541867726369573e-05, "loss": 3.4899932861328127, "step": 3230 }, { "epoch": 0.9196707351688902, "grad_norm": 7.5059709548950195, "learning_rate": 9.540448481407891e-05, "loss": 3.4519630432128907, "step": 3240 }, { "epoch": 0.922509225092251, "grad_norm": 7.126120090484619, "learning_rate": 9.539029236446211e-05, "loss": 3.456466293334961, "step": 3250 }, { "epoch": 0.9253477150156117, "grad_norm": 7.1017022132873535, "learning_rate": 9.537609991484531e-05, "loss": 3.514039993286133, "step": 3260 }, { "epoch": 0.9281862049389724, "grad_norm": 7.508925437927246, "learning_rate": 9.53619074652285e-05, "loss": 3.452556610107422, "step": 3270 }, { "epoch": 0.9310246948623332, "grad_norm": 7.301242828369141, "learning_rate": 9.53477150156117e-05, "loss": 3.4460525512695312, "step": 3280 }, { "epoch": 0.933863184785694, "grad_norm": 7.041483402252197, "learning_rate": 9.533352256599489e-05, "loss": 3.4189437866210937, "step": 3290 }, { "epoch": 0.9367016747090547, "grad_norm": 7.8884053230285645, "learning_rate": 9.53193301163781e-05, "loss": 3.519847869873047, "step": 3300 }, { "epoch": 0.9395401646324155, "grad_norm": 7.292904853820801, "learning_rate": 
9.530513766676129e-05, "loss": 3.495802307128906, "step": 3310 }, { "epoch": 0.9423786545557763, "grad_norm": 7.393863677978516, "learning_rate": 9.529094521714448e-05, "loss": 3.4945892333984374, "step": 3320 }, { "epoch": 0.9452171444791371, "grad_norm": 7.209682464599609, "learning_rate": 9.527675276752767e-05, "loss": 3.485137939453125, "step": 3330 }, { "epoch": 0.9480556344024978, "grad_norm": 7.190655708312988, "learning_rate": 9.526256031791088e-05, "loss": 3.461884689331055, "step": 3340 }, { "epoch": 0.9508941243258586, "grad_norm": 7.329770088195801, "learning_rate": 9.524836786829407e-05, "loss": 3.4564563751220705, "step": 3350 }, { "epoch": 0.9537326142492194, "grad_norm": 7.14788293838501, "learning_rate": 9.523417541867727e-05, "loss": 3.4477493286132814, "step": 3360 }, { "epoch": 0.9565711041725802, "grad_norm": 7.215503215789795, "learning_rate": 9.521998296906046e-05, "loss": 3.4847389221191407, "step": 3370 }, { "epoch": 0.959409594095941, "grad_norm": 6.936811447143555, "learning_rate": 9.520579051944365e-05, "loss": 3.506412124633789, "step": 3380 }, { "epoch": 0.9622480840193017, "grad_norm": 7.888884544372559, "learning_rate": 9.519159806982686e-05, "loss": 3.465282440185547, "step": 3390 }, { "epoch": 0.9650865739426625, "grad_norm": 7.094465255737305, "learning_rate": 9.517740562021005e-05, "loss": 3.4656211853027346, "step": 3400 }, { "epoch": 0.9679250638660233, "grad_norm": 7.254475116729736, "learning_rate": 9.516321317059325e-05, "loss": 3.4536678314208986, "step": 3410 }, { "epoch": 0.9707635537893841, "grad_norm": 6.871583461761475, "learning_rate": 9.514902072097644e-05, "loss": 3.495751953125, "step": 3420 }, { "epoch": 0.9736020437127448, "grad_norm": 7.021137714385986, "learning_rate": 9.513482827135965e-05, "loss": 3.4352497100830077, "step": 3430 }, { "epoch": 0.9764405336361056, "grad_norm": 7.264500617980957, "learning_rate": 9.512063582174284e-05, "loss": 3.4637676239013673, "step": 3440 }, { "epoch": 0.9792790235594664, 
"grad_norm": 7.2182793617248535, "learning_rate": 9.510644337212603e-05, "loss": 3.4034854888916017, "step": 3450 }, { "epoch": 0.9821175134828272, "grad_norm": 7.172633647918701, "learning_rate": 9.509225092250923e-05, "loss": 3.404012680053711, "step": 3460 }, { "epoch": 0.9849560034061879, "grad_norm": 7.095450401306152, "learning_rate": 9.507805847289242e-05, "loss": 3.454293060302734, "step": 3470 }, { "epoch": 0.9877944933295487, "grad_norm": 7.579015731811523, "learning_rate": 9.506386602327563e-05, "loss": 3.4485897064208983, "step": 3480 }, { "epoch": 0.9906329832529095, "grad_norm": 7.109419345855713, "learning_rate": 9.504967357365882e-05, "loss": 3.425096130371094, "step": 3490 }, { "epoch": 0.9934714731762703, "grad_norm": 7.1194610595703125, "learning_rate": 9.503548112404202e-05, "loss": 3.3326652526855467, "step": 3500 }, { "epoch": 0.9934714731762703, "eval_accuracy": 0.13925096966999428, "eval_loss": 3.6279399394989014, "eval_runtime": 46.1363, "eval_samples_per_second": 340.882, "eval_steps_per_second": 5.332, "step": 3500 }, { "epoch": 0.996309963099631, "grad_norm": 8.108049392700195, "learning_rate": 9.502128867442521e-05, "loss": 3.3762611389160155, "step": 3510 }, { "epoch": 0.9991484530229918, "grad_norm": 7.766464710235596, "learning_rate": 9.500709622480842e-05, "loss": 3.4609107971191406, "step": 3520 }, { "epoch": 1.0019869429463526, "grad_norm": 7.104329586029053, "learning_rate": 9.49929037751916e-05, "loss": 3.439278411865234, "step": 3530 }, { "epoch": 1.0048254328697133, "grad_norm": 7.065787315368652, "learning_rate": 9.49787113255748e-05, "loss": 3.3602855682373045, "step": 3540 }, { "epoch": 1.0076639227930742, "grad_norm": 7.622304439544678, "learning_rate": 9.4964518875958e-05, "loss": 3.3708641052246096, "step": 3550 }, { "epoch": 1.0105024127164348, "grad_norm": 6.99111270904541, "learning_rate": 9.49503264263412e-05, "loss": 3.342230224609375, "step": 3560 }, { "epoch": 1.0133409026397957, "grad_norm": 7.472219944000244, 
"learning_rate": 9.493613397672438e-05, "loss": 3.4143531799316404, "step": 3570 }, { "epoch": 1.0161793925631564, "grad_norm": 7.127761363983154, "learning_rate": 9.492194152710759e-05, "loss": 3.341227722167969, "step": 3580 }, { "epoch": 1.0190178824865173, "grad_norm": 7.218255043029785, "learning_rate": 9.490774907749078e-05, "loss": 3.3834506988525392, "step": 3590 }, { "epoch": 1.021856372409878, "grad_norm": 7.0136213302612305, "learning_rate": 9.489355662787398e-05, "loss": 3.328974151611328, "step": 3600 }, { "epoch": 1.0246948623332388, "grad_norm": 7.233702659606934, "learning_rate": 9.487936417825717e-05, "loss": 3.4117610931396483, "step": 3610 }, { "epoch": 1.0275333522565995, "grad_norm": 7.186321258544922, "learning_rate": 9.486517172864036e-05, "loss": 3.395829772949219, "step": 3620 }, { "epoch": 1.0303718421799604, "grad_norm": 7.53147029876709, "learning_rate": 9.485097927902357e-05, "loss": 3.3556568145751955, "step": 3630 }, { "epoch": 1.033210332103321, "grad_norm": 7.109918594360352, "learning_rate": 9.483678682940676e-05, "loss": 3.3442901611328124, "step": 3640 }, { "epoch": 1.036048822026682, "grad_norm": 7.425334930419922, "learning_rate": 9.482259437978996e-05, "loss": 3.3542556762695312, "step": 3650 }, { "epoch": 1.0388873119500426, "grad_norm": 7.147945880889893, "learning_rate": 9.480840193017315e-05, "loss": 3.406561279296875, "step": 3660 }, { "epoch": 1.0417258018734032, "grad_norm": 7.6601433753967285, "learning_rate": 9.479420948055636e-05, "loss": 3.392993927001953, "step": 3670 }, { "epoch": 1.0445642917967641, "grad_norm": 7.315784931182861, "learning_rate": 9.478001703093955e-05, "loss": 3.41961784362793, "step": 3680 }, { "epoch": 1.0474027817201248, "grad_norm": 7.471482753753662, "learning_rate": 9.476582458132274e-05, "loss": 3.412517547607422, "step": 3690 }, { "epoch": 1.0502412716434857, "grad_norm": 7.401261329650879, "learning_rate": 9.475163213170594e-05, "loss": 3.397623825073242, "step": 3700 }, { "epoch": 
1.0530797615668464, "grad_norm": 7.364709377288818, "learning_rate": 9.473743968208913e-05, "loss": 3.3277698516845704, "step": 3710 }, { "epoch": 1.0559182514902072, "grad_norm": 7.126255989074707, "learning_rate": 9.472324723247234e-05, "loss": 3.411181640625, "step": 3720 }, { "epoch": 1.058756741413568, "grad_norm": 7.366354942321777, "learning_rate": 9.470905478285552e-05, "loss": 3.4318737030029296, "step": 3730 }, { "epoch": 1.0615952313369288, "grad_norm": 7.1839165687561035, "learning_rate": 9.469486233323872e-05, "loss": 3.3635009765625, "step": 3740 }, { "epoch": 1.0644337212602895, "grad_norm": 7.364368438720703, "learning_rate": 9.468066988362192e-05, "loss": 3.373552703857422, "step": 3750 }, { "epoch": 1.0672722111836503, "grad_norm": 7.370746612548828, "learning_rate": 9.466647743400512e-05, "loss": 3.428628158569336, "step": 3760 }, { "epoch": 1.070110701107011, "grad_norm": 7.029669284820557, "learning_rate": 9.46522849843883e-05, "loss": 3.348693084716797, "step": 3770 }, { "epoch": 1.072949191030372, "grad_norm": 6.948419570922852, "learning_rate": 9.463809253477151e-05, "loss": 3.2941844940185545, "step": 3780 }, { "epoch": 1.0757876809537326, "grad_norm": 6.941299915313721, "learning_rate": 9.46239000851547e-05, "loss": 3.3203140258789063, "step": 3790 }, { "epoch": 1.0786261708770934, "grad_norm": 7.364920139312744, "learning_rate": 9.460970763553791e-05, "loss": 3.314009094238281, "step": 3800 }, { "epoch": 1.0814646608004541, "grad_norm": 7.6898980140686035, "learning_rate": 9.459551518592109e-05, "loss": 3.432494354248047, "step": 3810 }, { "epoch": 1.084303150723815, "grad_norm": 6.97192907333374, "learning_rate": 9.45813227363043e-05, "loss": 3.373524856567383, "step": 3820 }, { "epoch": 1.0871416406471757, "grad_norm": 7.121440887451172, "learning_rate": 9.456713028668749e-05, "loss": 3.36458740234375, "step": 3830 }, { "epoch": 1.0899801305705366, "grad_norm": 7.221580505371094, "learning_rate": 9.455293783707068e-05, "loss": 
3.3123931884765625, "step": 3840 }, { "epoch": 1.0928186204938972, "grad_norm": 7.292924880981445, "learning_rate": 9.453874538745388e-05, "loss": 3.34912223815918, "step": 3850 }, { "epoch": 1.095657110417258, "grad_norm": 7.157872676849365, "learning_rate": 9.452455293783707e-05, "loss": 3.352229690551758, "step": 3860 }, { "epoch": 1.0984956003406188, "grad_norm": 7.3831071853637695, "learning_rate": 9.451036048822028e-05, "loss": 3.269633483886719, "step": 3870 }, { "epoch": 1.1013340902639797, "grad_norm": 7.143009185791016, "learning_rate": 9.449616803860347e-05, "loss": 3.3933265686035154, "step": 3880 }, { "epoch": 1.1041725801873403, "grad_norm": 7.303972244262695, "learning_rate": 9.448197558898666e-05, "loss": 3.264752960205078, "step": 3890 }, { "epoch": 1.1070110701107012, "grad_norm": 7.286591529846191, "learning_rate": 9.446778313936986e-05, "loss": 3.3574974060058596, "step": 3900 }, { "epoch": 1.1098495600340619, "grad_norm": 7.070778846740723, "learning_rate": 9.445359068975306e-05, "loss": 3.30953369140625, "step": 3910 }, { "epoch": 1.1126880499574225, "grad_norm": 7.649458885192871, "learning_rate": 9.443939824013626e-05, "loss": 3.4041366577148438, "step": 3920 }, { "epoch": 1.1155265398807834, "grad_norm": 7.335157871246338, "learning_rate": 9.442520579051945e-05, "loss": 3.390773391723633, "step": 3930 }, { "epoch": 1.118365029804144, "grad_norm": 6.90259313583374, "learning_rate": 9.441101334090264e-05, "loss": 3.427594757080078, "step": 3940 }, { "epoch": 1.121203519727505, "grad_norm": 7.111352920532227, "learning_rate": 9.439682089128584e-05, "loss": 3.3014858245849608, "step": 3950 }, { "epoch": 1.1240420096508656, "grad_norm": 7.482234954833984, "learning_rate": 9.438262844166904e-05, "loss": 3.376997375488281, "step": 3960 }, { "epoch": 1.1268804995742265, "grad_norm": 7.400355339050293, "learning_rate": 9.436843599205222e-05, "loss": 3.3445358276367188, "step": 3970 }, { "epoch": 1.1297189894975872, "grad_norm": 7.384416580200195, 
"learning_rate": 9.435424354243543e-05, "loss": 3.240322494506836, "step": 3980 }, { "epoch": 1.132557479420948, "grad_norm": 7.537659645080566, "learning_rate": 9.434005109281862e-05, "loss": 3.3364639282226562, "step": 3990 }, { "epoch": 1.1353959693443088, "grad_norm": 7.418066024780273, "learning_rate": 9.432585864320183e-05, "loss": 3.34169921875, "step": 4000 }, { "epoch": 1.1353959693443088, "eval_accuracy": 0.14999682075411713, "eval_loss": 3.5549230575561523, "eval_runtime": 48.9797, "eval_samples_per_second": 321.092, "eval_steps_per_second": 5.022, "step": 4000 }, { "epoch": 1.1382344592676696, "grad_norm": 7.341390609741211, "learning_rate": 9.431166619358501e-05, "loss": 3.2886451721191405, "step": 4010 }, { "epoch": 1.1410729491910303, "grad_norm": 7.2296061515808105, "learning_rate": 9.429747374396822e-05, "loss": 3.382833480834961, "step": 4020 }, { "epoch": 1.1439114391143912, "grad_norm": 7.278980255126953, "learning_rate": 9.428328129435141e-05, "loss": 3.3879600524902345, "step": 4030 }, { "epoch": 1.1467499290377519, "grad_norm": 7.378370761871338, "learning_rate": 9.42690888447346e-05, "loss": 3.362626647949219, "step": 4040 }, { "epoch": 1.1495884189611127, "grad_norm": 7.051491737365723, "learning_rate": 9.42548963951178e-05, "loss": 3.326863098144531, "step": 4050 }, { "epoch": 1.1524269088844734, "grad_norm": 7.0447211265563965, "learning_rate": 9.424070394550099e-05, "loss": 3.3227081298828125, "step": 4060 }, { "epoch": 1.1552653988078343, "grad_norm": 7.52214241027832, "learning_rate": 9.42265114958842e-05, "loss": 3.252707672119141, "step": 4070 }, { "epoch": 1.158103888731195, "grad_norm": 7.271135330200195, "learning_rate": 9.421231904626739e-05, "loss": 3.2926795959472654, "step": 4080 }, { "epoch": 1.1609423786545559, "grad_norm": 7.099951267242432, "learning_rate": 9.419812659665058e-05, "loss": 3.3218547821044924, "step": 4090 }, { "epoch": 1.1637808685779165, "grad_norm": 7.275931358337402, "learning_rate": 
9.418393414703378e-05, "loss": 3.280129623413086, "step": 4100 }, { "epoch": 1.1666193585012774, "grad_norm": 7.335946559906006, "learning_rate": 9.416974169741698e-05, "loss": 3.3001636505126952, "step": 4110 }, { "epoch": 1.169457848424638, "grad_norm": 7.290144920349121, "learning_rate": 9.415554924780018e-05, "loss": 3.3305782318115233, "step": 4120 }, { "epoch": 1.172296338347999, "grad_norm": 7.157697677612305, "learning_rate": 9.414135679818337e-05, "loss": 3.2884254455566406, "step": 4130 }, { "epoch": 1.1751348282713596, "grad_norm": 7.199974536895752, "learning_rate": 9.412716434856656e-05, "loss": 3.3420024871826173, "step": 4140 }, { "epoch": 1.1779733181947205, "grad_norm": 7.213240146636963, "learning_rate": 9.411297189894977e-05, "loss": 3.2515792846679688, "step": 4150 }, { "epoch": 1.1808118081180812, "grad_norm": 7.344404697418213, "learning_rate": 9.409877944933296e-05, "loss": 3.3045196533203125, "step": 4160 }, { "epoch": 1.1836502980414418, "grad_norm": 7.321131706237793, "learning_rate": 9.408458699971616e-05, "loss": 3.314865493774414, "step": 4170 }, { "epoch": 1.1864887879648027, "grad_norm": 7.048640251159668, "learning_rate": 9.407039455009935e-05, "loss": 3.2927295684814455, "step": 4180 }, { "epoch": 1.1893272778881636, "grad_norm": 7.080460071563721, "learning_rate": 9.405620210048254e-05, "loss": 3.3725242614746094, "step": 4190 }, { "epoch": 1.1921657678115243, "grad_norm": 7.425757884979248, "learning_rate": 9.404200965086575e-05, "loss": 3.337075424194336, "step": 4200 }, { "epoch": 1.195004257734885, "grad_norm": 7.289700984954834, "learning_rate": 9.402781720124893e-05, "loss": 3.2470550537109375, "step": 4210 }, { "epoch": 1.1978427476582458, "grad_norm": 7.10521125793457, "learning_rate": 9.401362475163214e-05, "loss": 3.301532745361328, "step": 4220 }, { "epoch": 1.2006812375816065, "grad_norm": 7.216850757598877, "learning_rate": 9.399943230201533e-05, "loss": 3.3322959899902345, "step": 4230 }, { "epoch": 
1.2035197275049674, "grad_norm": 7.4538140296936035, "learning_rate": 9.398523985239854e-05, "loss": 3.3171485900878905, "step": 4240 }, { "epoch": 1.206358217428328, "grad_norm": 7.334599018096924, "learning_rate": 9.397104740278172e-05, "loss": 3.2595905303955077, "step": 4250 }, { "epoch": 1.209196707351689, "grad_norm": 7.587103843688965, "learning_rate": 9.395685495316492e-05, "loss": 3.34097900390625, "step": 4260 }, { "epoch": 1.2120351972750496, "grad_norm": 7.0766472816467285, "learning_rate": 9.394266250354812e-05, "loss": 3.327468490600586, "step": 4270 }, { "epoch": 1.2148736871984105, "grad_norm": 7.047877311706543, "learning_rate": 9.392847005393131e-05, "loss": 3.3046295166015627, "step": 4280 }, { "epoch": 1.2177121771217712, "grad_norm": 7.594003200531006, "learning_rate": 9.39142776043145e-05, "loss": 3.29381103515625, "step": 4290 }, { "epoch": 1.220550667045132, "grad_norm": 7.244882106781006, "learning_rate": 9.39000851546977e-05, "loss": 3.2825294494628907, "step": 4300 }, { "epoch": 1.2233891569684927, "grad_norm": 7.3350043296813965, "learning_rate": 9.38858927050809e-05, "loss": 3.3299888610839843, "step": 4310 }, { "epoch": 1.2262276468918536, "grad_norm": 7.453231334686279, "learning_rate": 9.38717002554641e-05, "loss": 3.224446105957031, "step": 4320 }, { "epoch": 1.2290661368152143, "grad_norm": 6.790136814117432, "learning_rate": 9.385750780584729e-05, "loss": 3.2744186401367186, "step": 4330 }, { "epoch": 1.2319046267385751, "grad_norm": 6.952952861785889, "learning_rate": 9.384331535623048e-05, "loss": 3.264435577392578, "step": 4340 }, { "epoch": 1.2347431166619358, "grad_norm": 7.255730152130127, "learning_rate": 9.382912290661369e-05, "loss": 3.1986141204833984, "step": 4350 }, { "epoch": 1.2375816065852967, "grad_norm": 7.482794761657715, "learning_rate": 9.381493045699688e-05, "loss": 3.396790313720703, "step": 4360 }, { "epoch": 1.2404200965086574, "grad_norm": 7.925017356872559, "learning_rate": 9.380073800738008e-05, "loss": 
3.2445045471191407, "step": 4370 }, { "epoch": 1.2432585864320183, "grad_norm": 7.262943744659424, "learning_rate": 9.378654555776327e-05, "loss": 3.303883361816406, "step": 4380 }, { "epoch": 1.246097076355379, "grad_norm": 7.222881317138672, "learning_rate": 9.377235310814646e-05, "loss": 3.2613006591796876, "step": 4390 }, { "epoch": 1.2489355662787398, "grad_norm": 7.367786407470703, "learning_rate": 9.375816065852967e-05, "loss": 3.3100345611572264, "step": 4400 }, { "epoch": 1.2517740562021005, "grad_norm": 7.068490028381348, "learning_rate": 9.374396820891285e-05, "loss": 3.311224365234375, "step": 4410 }, { "epoch": 1.2546125461254611, "grad_norm": 7.1714863777160645, "learning_rate": 9.372977575929606e-05, "loss": 3.311724853515625, "step": 4420 }, { "epoch": 1.257451036048822, "grad_norm": 6.9548468589782715, "learning_rate": 9.371558330967925e-05, "loss": 3.2392837524414064, "step": 4430 }, { "epoch": 1.260289525972183, "grad_norm": 7.238521099090576, "learning_rate": 9.370139086006246e-05, "loss": 3.3035820007324217, "step": 4440 }, { "epoch": 1.2631280158955436, "grad_norm": 7.413900852203369, "learning_rate": 9.368719841044564e-05, "loss": 3.2317596435546876, "step": 4450 }, { "epoch": 1.2659665058189042, "grad_norm": 7.557633399963379, "learning_rate": 9.367300596082885e-05, "loss": 3.332170867919922, "step": 4460 }, { "epoch": 1.2688049957422651, "grad_norm": 7.234616279602051, "learning_rate": 9.365881351121204e-05, "loss": 3.297655487060547, "step": 4470 }, { "epoch": 1.271643485665626, "grad_norm": 6.897222995758057, "learning_rate": 9.364462106159525e-05, "loss": 3.285963439941406, "step": 4480 }, { "epoch": 1.2744819755889867, "grad_norm": 7.269834041595459, "learning_rate": 9.363042861197842e-05, "loss": 3.283355712890625, "step": 4490 }, { "epoch": 1.2773204655123473, "grad_norm": 7.431650638580322, "learning_rate": 9.361623616236163e-05, "loss": 3.248888397216797, "step": 4500 }, { "epoch": 1.2773204655123473, "eval_accuracy": 
0.15368474597825396, "eval_loss": 3.5073113441467285, "eval_runtime": 46.1249, "eval_samples_per_second": 340.965, "eval_steps_per_second": 5.333, "step": 4500 }, { "epoch": 1.2801589554357082, "grad_norm": 7.114413738250732, "learning_rate": 9.360204371274483e-05, "loss": 3.2910877227783204, "step": 4510 }, { "epoch": 1.282997445359069, "grad_norm": 7.386227130889893, "learning_rate": 9.358785126312802e-05, "loss": 3.239373779296875, "step": 4520 }, { "epoch": 1.2858359352824298, "grad_norm": 7.28155517578125, "learning_rate": 9.357365881351121e-05, "loss": 3.224109649658203, "step": 4530 }, { "epoch": 1.2886744252057905, "grad_norm": 7.23994255065918, "learning_rate": 9.35594663638944e-05, "loss": 3.329684829711914, "step": 4540 }, { "epoch": 1.2915129151291513, "grad_norm": 6.898649215698242, "learning_rate": 9.354527391427761e-05, "loss": 3.191995620727539, "step": 4550 }, { "epoch": 1.294351405052512, "grad_norm": 7.527935028076172, "learning_rate": 9.35310814646608e-05, "loss": 3.32872314453125, "step": 4560 }, { "epoch": 1.297189894975873, "grad_norm": 6.850247383117676, "learning_rate": 9.3516889015044e-05, "loss": 3.1340450286865233, "step": 4570 }, { "epoch": 1.3000283848992336, "grad_norm": 7.3845062255859375, "learning_rate": 9.350269656542719e-05, "loss": 3.346833038330078, "step": 4580 }, { "epoch": 1.3028668748225944, "grad_norm": 7.377581596374512, "learning_rate": 9.34885041158104e-05, "loss": 3.223681640625, "step": 4590 }, { "epoch": 1.305705364745955, "grad_norm": 7.375144004821777, "learning_rate": 9.347431166619359e-05, "loss": 3.3208389282226562, "step": 4600 }, { "epoch": 1.308543854669316, "grad_norm": 7.060912132263184, "learning_rate": 9.346011921657679e-05, "loss": 3.2943771362304686, "step": 4610 }, { "epoch": 1.3113823445926767, "grad_norm": 7.400177955627441, "learning_rate": 9.344592676695998e-05, "loss": 3.252069091796875, "step": 4620 }, { "epoch": 1.3142208345160376, "grad_norm": 7.318269729614258, "learning_rate": 
9.343173431734317e-05, "loss": 3.3188220977783205, "step": 4630 }, { "epoch": 1.3170593244393982, "grad_norm": 6.954713821411133, "learning_rate": 9.341754186772638e-05, "loss": 3.255667877197266, "step": 4640 }, { "epoch": 1.319897814362759, "grad_norm": 7.133588790893555, "learning_rate": 9.340334941810956e-05, "loss": 3.2589595794677733, "step": 4650 }, { "epoch": 1.3227363042861198, "grad_norm": 7.215946674346924, "learning_rate": 9.338915696849277e-05, "loss": 3.2439811706542967, "step": 4660 }, { "epoch": 1.3255747942094804, "grad_norm": 7.091145992279053, "learning_rate": 9.337496451887596e-05, "loss": 3.2282867431640625, "step": 4670 }, { "epoch": 1.3284132841328413, "grad_norm": 7.0551533699035645, "learning_rate": 9.336077206925917e-05, "loss": 3.232868194580078, "step": 4680 }, { "epoch": 1.3312517740562022, "grad_norm": 7.281014442443848, "learning_rate": 9.334657961964235e-05, "loss": 3.254621887207031, "step": 4690 }, { "epoch": 1.3340902639795629, "grad_norm": 7.4372639656066895, "learning_rate": 9.333238717002555e-05, "loss": 3.2474105834960936, "step": 4700 }, { "epoch": 1.3369287539029235, "grad_norm": 7.605950832366943, "learning_rate": 9.331819472040875e-05, "loss": 3.2187015533447267, "step": 4710 }, { "epoch": 1.3397672438262844, "grad_norm": 7.0749993324279785, "learning_rate": 9.330400227079194e-05, "loss": 3.281220245361328, "step": 4720 }, { "epoch": 1.3426057337496453, "grad_norm": 7.26392126083374, "learning_rate": 9.328980982117515e-05, "loss": 3.2343074798583986, "step": 4730 }, { "epoch": 1.345444223673006, "grad_norm": 6.9073967933654785, "learning_rate": 9.327561737155834e-05, "loss": 3.223860168457031, "step": 4740 }, { "epoch": 1.3482827135963666, "grad_norm": 7.540451526641846, "learning_rate": 9.326142492194153e-05, "loss": 3.267478179931641, "step": 4750 }, { "epoch": 1.3511212035197275, "grad_norm": 7.152810573577881, "learning_rate": 9.324723247232473e-05, "loss": 3.2855567932128906, "step": 4760 }, { "epoch": 
1.3539596934430882, "grad_norm": 7.058323860168457, "learning_rate": 9.323304002270793e-05, "loss": 3.2484668731689452, "step": 4770 }, { "epoch": 1.356798183366449, "grad_norm": 7.244044303894043, "learning_rate": 9.321884757309111e-05, "loss": 3.1976093292236327, "step": 4780 }, { "epoch": 1.3596366732898097, "grad_norm": 7.281748294830322, "learning_rate": 9.320465512347432e-05, "loss": 3.229585647583008, "step": 4790 }, { "epoch": 1.3624751632131706, "grad_norm": 7.178518295288086, "learning_rate": 9.319046267385751e-05, "loss": 3.196010208129883, "step": 4800 }, { "epoch": 1.3653136531365313, "grad_norm": 7.4299774169921875, "learning_rate": 9.317627022424072e-05, "loss": 3.2255107879638674, "step": 4810 }, { "epoch": 1.3681521430598922, "grad_norm": 7.159573554992676, "learning_rate": 9.31620777746239e-05, "loss": 3.15787353515625, "step": 4820 }, { "epoch": 1.3709906329832529, "grad_norm": 7.223909854888916, "learning_rate": 9.31478853250071e-05, "loss": 3.189915084838867, "step": 4830 }, { "epoch": 1.3738291229066137, "grad_norm": 7.036489009857178, "learning_rate": 9.31336928753903e-05, "loss": 3.2516876220703126, "step": 4840 }, { "epoch": 1.3766676128299744, "grad_norm": 7.230433464050293, "learning_rate": 9.311950042577349e-05, "loss": 3.2341278076171873, "step": 4850 }, { "epoch": 1.3795061027533353, "grad_norm": 7.222001552581787, "learning_rate": 9.310530797615669e-05, "loss": 3.215932846069336, "step": 4860 }, { "epoch": 1.382344592676696, "grad_norm": 7.490117073059082, "learning_rate": 9.309111552653988e-05, "loss": 3.262723922729492, "step": 4870 }, { "epoch": 1.3851830826000568, "grad_norm": 7.418575763702393, "learning_rate": 9.307692307692309e-05, "loss": 3.2761322021484376, "step": 4880 }, { "epoch": 1.3880215725234175, "grad_norm": 7.358985900878906, "learning_rate": 9.306273062730628e-05, "loss": 3.2874183654785156, "step": 4890 }, { "epoch": 1.3908600624467784, "grad_norm": 7.266669750213623, "learning_rate": 9.304853817768947e-05, "loss": 
3.191728973388672, "step": 4900 }, { "epoch": 1.393698552370139, "grad_norm": 6.994680881500244, "learning_rate": 9.303434572807267e-05, "loss": 3.1996747970581056, "step": 4910 }, { "epoch": 1.3965370422934997, "grad_norm": 7.113898277282715, "learning_rate": 9.302015327845587e-05, "loss": 3.180240821838379, "step": 4920 }, { "epoch": 1.3993755322168606, "grad_norm": 7.2651777267456055, "learning_rate": 9.300596082883907e-05, "loss": 3.283345413208008, "step": 4930 }, { "epoch": 1.4022140221402215, "grad_norm": 7.486123561859131, "learning_rate": 9.299176837922226e-05, "loss": 3.3377925872802736, "step": 4940 }, { "epoch": 1.4050525120635822, "grad_norm": 7.23104190826416, "learning_rate": 9.297757592960545e-05, "loss": 3.1779571533203126, "step": 4950 }, { "epoch": 1.4078910019869428, "grad_norm": 7.087817192077637, "learning_rate": 9.296338347998865e-05, "loss": 3.1918216705322267, "step": 4960 }, { "epoch": 1.4107294919103037, "grad_norm": 6.830405235290527, "learning_rate": 9.294919103037185e-05, "loss": 3.256993865966797, "step": 4970 }, { "epoch": 1.4135679818336646, "grad_norm": 7.265485763549805, "learning_rate": 9.293499858075503e-05, "loss": 3.254475402832031, "step": 4980 }, { "epoch": 1.4164064717570253, "grad_norm": 6.98724889755249, "learning_rate": 9.292080613113824e-05, "loss": 3.3058307647705076, "step": 4990 }, { "epoch": 1.419244961680386, "grad_norm": 7.368050575256348, "learning_rate": 9.290661368152143e-05, "loss": 3.225707244873047, "step": 5000 }, { "epoch": 1.419244961680386, "eval_accuracy": 0.16067908692058244, "eval_loss": 3.4523489475250244, "eval_runtime": 46.4202, "eval_samples_per_second": 338.796, "eval_steps_per_second": 5.299, "step": 5000 }, { "epoch": 1.4220834516037468, "grad_norm": 7.144771099090576, "learning_rate": 9.289242123190464e-05, "loss": 3.2532699584960936, "step": 5010 }, { "epoch": 1.4249219415271077, "grad_norm": 7.227400779724121, "learning_rate": 9.287822878228782e-05, "loss": 3.192732238769531, "step": 5020 }, 
{ "epoch": 1.4277604314504684, "grad_norm": 7.084562301635742, "learning_rate": 9.286403633267103e-05, "loss": 3.18719596862793, "step": 5030 }, { "epoch": 1.430598921373829, "grad_norm": 6.924837589263916, "learning_rate": 9.284984388305422e-05, "loss": 3.262770080566406, "step": 5040 }, { "epoch": 1.43343741129719, "grad_norm": 7.179226398468018, "learning_rate": 9.283565143343743e-05, "loss": 3.2581466674804687, "step": 5050 }, { "epoch": 1.4362759012205506, "grad_norm": 7.36436653137207, "learning_rate": 9.282145898382061e-05, "loss": 3.2570877075195312, "step": 5060 }, { "epoch": 1.4391143911439115, "grad_norm": 7.653949737548828, "learning_rate": 9.280726653420381e-05, "loss": 3.2612266540527344, "step": 5070 }, { "epoch": 1.4419528810672722, "grad_norm": 7.07012939453125, "learning_rate": 9.279307408458701e-05, "loss": 3.29212646484375, "step": 5080 }, { "epoch": 1.444791370990633, "grad_norm": 7.390495777130127, "learning_rate": 9.27788816349702e-05, "loss": 3.2210464477539062, "step": 5090 }, { "epoch": 1.4476298609139937, "grad_norm": 7.719757080078125, "learning_rate": 9.27646891853534e-05, "loss": 3.134431266784668, "step": 5100 }, { "epoch": 1.4504683508373546, "grad_norm": 7.131259441375732, "learning_rate": 9.275049673573659e-05, "loss": 3.1971675872802736, "step": 5110 }, { "epoch": 1.4533068407607153, "grad_norm": 7.040030479431152, "learning_rate": 9.27363042861198e-05, "loss": 3.189625549316406, "step": 5120 }, { "epoch": 1.4561453306840761, "grad_norm": 7.066779136657715, "learning_rate": 9.272211183650299e-05, "loss": 3.1389875411987305, "step": 5130 }, { "epoch": 1.4589838206074368, "grad_norm": 6.775429725646973, "learning_rate": 9.270791938688618e-05, "loss": 3.1779560089111327, "step": 5140 }, { "epoch": 1.4618223105307977, "grad_norm": 6.951879978179932, "learning_rate": 9.269372693726937e-05, "loss": 3.173996353149414, "step": 5150 }, { "epoch": 1.4646608004541584, "grad_norm": 7.007215976715088, "learning_rate": 9.267953448765258e-05, 
"loss": 3.2046859741210936, "step": 5160 }, { "epoch": 1.467499290377519, "grad_norm": 7.073834419250488, "learning_rate": 9.266534203803577e-05, "loss": 3.2057849884033205, "step": 5170 }, { "epoch": 1.47033778030088, "grad_norm": 7.186058044433594, "learning_rate": 9.265114958841897e-05, "loss": 3.179424285888672, "step": 5180 }, { "epoch": 1.4731762702242408, "grad_norm": 7.235440731048584, "learning_rate": 9.263695713880216e-05, "loss": 3.261613464355469, "step": 5190 }, { "epoch": 1.4760147601476015, "grad_norm": 7.579680919647217, "learning_rate": 9.262276468918535e-05, "loss": 3.2251110076904297, "step": 5200 }, { "epoch": 1.4788532500709621, "grad_norm": 7.5220723152160645, "learning_rate": 9.260857223956856e-05, "loss": 3.2767261505126952, "step": 5210 }, { "epoch": 1.481691739994323, "grad_norm": 7.0771331787109375, "learning_rate": 9.259437978995174e-05, "loss": 3.141231155395508, "step": 5220 }, { "epoch": 1.484530229917684, "grad_norm": 7.41853141784668, "learning_rate": 9.258018734033495e-05, "loss": 3.229845428466797, "step": 5230 }, { "epoch": 1.4873687198410446, "grad_norm": 7.233590126037598, "learning_rate": 9.256599489071814e-05, "loss": 3.1608207702636717, "step": 5240 }, { "epoch": 1.4902072097644052, "grad_norm": 7.336684226989746, "learning_rate": 9.255180244110135e-05, "loss": 3.2212688446044924, "step": 5250 }, { "epoch": 1.4930456996877661, "grad_norm": 7.386810302734375, "learning_rate": 9.253760999148453e-05, "loss": 3.193793296813965, "step": 5260 }, { "epoch": 1.495884189611127, "grad_norm": 7.065281391143799, "learning_rate": 9.252341754186773e-05, "loss": 3.223069763183594, "step": 5270 }, { "epoch": 1.4987226795344877, "grad_norm": 7.333076000213623, "learning_rate": 9.250922509225093e-05, "loss": 3.2296676635742188, "step": 5280 }, { "epoch": 1.5015611694578483, "grad_norm": 7.250054359436035, "learning_rate": 9.249503264263412e-05, "loss": 3.2687000274658202, "step": 5290 }, { "epoch": 1.5043996593812092, "grad_norm": 
7.119361877441406, "learning_rate": 9.248084019301731e-05, "loss": 3.161273193359375, "step": 5300 }, { "epoch": 1.5072381493045701, "grad_norm": 7.030625820159912, "learning_rate": 9.246664774340051e-05, "loss": 3.210436248779297, "step": 5310 }, { "epoch": 1.5100766392279308, "grad_norm": 7.782882213592529, "learning_rate": 9.245245529378371e-05, "loss": 3.1225540161132814, "step": 5320 }, { "epoch": 1.5129151291512914, "grad_norm": 7.52493143081665, "learning_rate": 9.243826284416691e-05, "loss": 3.2268863677978517, "step": 5330 }, { "epoch": 1.5157536190746523, "grad_norm": 6.991419792175293, "learning_rate": 9.24240703945501e-05, "loss": 3.1397605895996095, "step": 5340 }, { "epoch": 1.518592108998013, "grad_norm": 7.338247299194336, "learning_rate": 9.24098779449333e-05, "loss": 3.200835418701172, "step": 5350 }, { "epoch": 1.521430598921374, "grad_norm": 7.047677516937256, "learning_rate": 9.23956854953165e-05, "loss": 3.1793678283691404, "step": 5360 }, { "epoch": 1.5242690888447346, "grad_norm": 7.306790351867676, "learning_rate": 9.23814930456997e-05, "loss": 3.1712764739990233, "step": 5370 }, { "epoch": 1.5271075787680952, "grad_norm": 7.425307750701904, "learning_rate": 9.236730059608289e-05, "loss": 3.170822334289551, "step": 5380 }, { "epoch": 1.529946068691456, "grad_norm": 6.8926191329956055, "learning_rate": 9.235310814646608e-05, "loss": 3.157527542114258, "step": 5390 }, { "epoch": 1.532784558614817, "grad_norm": 7.314680576324463, "learning_rate": 9.233891569684929e-05, "loss": 3.2451419830322266, "step": 5400 }, { "epoch": 1.5356230485381777, "grad_norm": 7.656919956207275, "learning_rate": 9.232472324723248e-05, "loss": 3.1457244873046877, "step": 5410 }, { "epoch": 1.5384615384615383, "grad_norm": 7.232032299041748, "learning_rate": 9.231053079761567e-05, "loss": 3.113418960571289, "step": 5420 }, { "epoch": 1.5413000283848992, "grad_norm": 6.67373514175415, "learning_rate": 9.229633834799887e-05, "loss": 3.181204986572266, "step": 5430 }, { 
"epoch": 1.54413851830826, "grad_norm": 7.322835922241211, "learning_rate": 9.228214589838206e-05, "loss": 3.157475471496582, "step": 5440 }, { "epoch": 1.5469770082316208, "grad_norm": 7.130016803741455, "learning_rate": 9.226795344876527e-05, "loss": 3.1366180419921874, "step": 5450 }, { "epoch": 1.5498154981549814, "grad_norm": 7.198541641235352, "learning_rate": 9.225376099914845e-05, "loss": 3.163106155395508, "step": 5460 }, { "epoch": 1.5526539880783423, "grad_norm": 7.105282306671143, "learning_rate": 9.223956854953166e-05, "loss": 3.2160858154296874, "step": 5470 }, { "epoch": 1.5554924780017032, "grad_norm": 6.897276401519775, "learning_rate": 9.222537609991485e-05, "loss": 3.208335113525391, "step": 5480 }, { "epoch": 1.5583309679250639, "grad_norm": 7.211322784423828, "learning_rate": 9.221118365029806e-05, "loss": 3.1508649826049804, "step": 5490 }, { "epoch": 1.5611694578484245, "grad_norm": 7.5442423820495605, "learning_rate": 9.219699120068124e-05, "loss": 3.189815330505371, "step": 5500 }, { "epoch": 1.5611694578484245, "eval_accuracy": 0.16900871113371907, "eval_loss": 3.3975045680999756, "eval_runtime": 55.1624, "eval_samples_per_second": 285.104, "eval_steps_per_second": 4.46, "step": 5500 }, { "epoch": 1.5640079477717854, "grad_norm": 7.285554885864258, "learning_rate": 9.218279875106444e-05, "loss": 3.1128177642822266, "step": 5510 }, { "epoch": 1.5668464376951463, "grad_norm": 7.125631809234619, "learning_rate": 9.216860630144764e-05, "loss": 3.2529502868652345, "step": 5520 }, { "epoch": 1.569684927618507, "grad_norm": 6.9921040534973145, "learning_rate": 9.215441385183083e-05, "loss": 3.168400192260742, "step": 5530 }, { "epoch": 1.5725234175418676, "grad_norm": 7.110685348510742, "learning_rate": 9.214022140221402e-05, "loss": 3.078858184814453, "step": 5540 }, { "epoch": 1.5753619074652285, "grad_norm": 7.3972554206848145, "learning_rate": 9.212602895259722e-05, "loss": 3.1411602020263674, "step": 5550 }, { "epoch": 1.5782003973885894, 
"grad_norm": 6.688753128051758, "learning_rate": 9.211183650298042e-05, "loss": 3.137198257446289, "step": 5560 }, { "epoch": 1.58103888731195, "grad_norm": 6.876489162445068, "learning_rate": 9.209764405336362e-05, "loss": 3.1382915496826174, "step": 5570 }, { "epoch": 1.5838773772353107, "grad_norm": 7.417903900146484, "learning_rate": 9.208345160374681e-05, "loss": 3.1872196197509766, "step": 5580 }, { "epoch": 1.5867158671586716, "grad_norm": 7.161935806274414, "learning_rate": 9.206925915413e-05, "loss": 3.1930747985839845, "step": 5590 }, { "epoch": 1.5895543570820325, "grad_norm": 7.382147312164307, "learning_rate": 9.205506670451321e-05, "loss": 3.1615686416625977, "step": 5600 }, { "epoch": 1.5923928470053932, "grad_norm": 7.2882184982299805, "learning_rate": 9.20408742548964e-05, "loss": 3.203538513183594, "step": 5610 }, { "epoch": 1.5952313369287539, "grad_norm": 6.775840759277344, "learning_rate": 9.20266818052796e-05, "loss": 3.2089218139648437, "step": 5620 }, { "epoch": 1.5980698268521145, "grad_norm": 6.9309916496276855, "learning_rate": 9.201248935566279e-05, "loss": 3.088114929199219, "step": 5630 }, { "epoch": 1.6009083167754754, "grad_norm": 7.038385391235352, "learning_rate": 9.199829690604598e-05, "loss": 3.1084808349609374, "step": 5640 }, { "epoch": 1.6037468066988363, "grad_norm": 7.439398288726807, "learning_rate": 9.198410445642919e-05, "loss": 3.1674209594726563, "step": 5650 }, { "epoch": 1.606585296622197, "grad_norm": 7.139480113983154, "learning_rate": 9.196991200681237e-05, "loss": 3.1199363708496093, "step": 5660 }, { "epoch": 1.6094237865455576, "grad_norm": 6.843716621398926, "learning_rate": 9.195571955719558e-05, "loss": 3.1905256271362306, "step": 5670 }, { "epoch": 1.6122622764689185, "grad_norm": 7.337143898010254, "learning_rate": 9.194152710757877e-05, "loss": 3.1593145370483398, "step": 5680 }, { "epoch": 1.6151007663922794, "grad_norm": 7.0564775466918945, "learning_rate": 9.192733465796198e-05, "loss": 
3.132944869995117, "step": 5690 }, { "epoch": 1.61793925631564, "grad_norm": 6.970536708831787, "learning_rate": 9.191314220834516e-05, "loss": 3.148952102661133, "step": 5700 }, { "epoch": 1.6207777462390007, "grad_norm": 7.229760646820068, "learning_rate": 9.189894975872836e-05, "loss": 3.122034454345703, "step": 5710 }, { "epoch": 1.6236162361623616, "grad_norm": 7.585146903991699, "learning_rate": 9.188475730911156e-05, "loss": 3.1076847076416017, "step": 5720 }, { "epoch": 1.6264547260857225, "grad_norm": 6.81208610534668, "learning_rate": 9.187056485949476e-05, "loss": 3.1699615478515626, "step": 5730 }, { "epoch": 1.6292932160090832, "grad_norm": 7.1756672859191895, "learning_rate": 9.185637240987794e-05, "loss": 3.2082454681396486, "step": 5740 }, { "epoch": 1.6321317059324438, "grad_norm": 7.52459716796875, "learning_rate": 9.184217996026115e-05, "loss": 3.126272964477539, "step": 5750 }, { "epoch": 1.6349701958558047, "grad_norm": 6.994636535644531, "learning_rate": 9.182798751064434e-05, "loss": 3.098810577392578, "step": 5760 }, { "epoch": 1.6378086857791656, "grad_norm": 7.5030131340026855, "learning_rate": 9.181379506102754e-05, "loss": 3.1747045516967773, "step": 5770 }, { "epoch": 1.6406471757025263, "grad_norm": 7.237790107727051, "learning_rate": 9.179960261141073e-05, "loss": 3.160980987548828, "step": 5780 }, { "epoch": 1.643485665625887, "grad_norm": 7.071313858032227, "learning_rate": 9.178541016179392e-05, "loss": 3.0869491577148436, "step": 5790 }, { "epoch": 1.6463241555492478, "grad_norm": 7.074745178222656, "learning_rate": 9.177121771217713e-05, "loss": 3.118192672729492, "step": 5800 }, { "epoch": 1.6491626454726087, "grad_norm": 7.258425712585449, "learning_rate": 9.175702526256032e-05, "loss": 3.2412723541259765, "step": 5810 }, { "epoch": 1.6520011353959694, "grad_norm": 7.228935241699219, "learning_rate": 9.174283281294352e-05, "loss": 3.1119361877441407, "step": 5820 }, { "epoch": 1.65483962531933, "grad_norm": 7.258237838745117, 
"learning_rate": 9.172864036332671e-05, "loss": 3.0715408325195312, "step": 5830 }, { "epoch": 1.657678115242691, "grad_norm": 7.388478755950928, "learning_rate": 9.171444791370992e-05, "loss": 3.1476696014404295, "step": 5840 }, { "epoch": 1.6605166051660518, "grad_norm": 7.203771114349365, "learning_rate": 9.170025546409311e-05, "loss": 3.172256088256836, "step": 5850 }, { "epoch": 1.6633550950894125, "grad_norm": 6.906182765960693, "learning_rate": 9.16860630144763e-05, "loss": 3.12984619140625, "step": 5860 }, { "epoch": 1.6661935850127731, "grad_norm": 7.378785133361816, "learning_rate": 9.16718705648595e-05, "loss": 3.151509475708008, "step": 5870 }, { "epoch": 1.6690320749361338, "grad_norm": 7.287806034088135, "learning_rate": 9.165767811524269e-05, "loss": 3.135848045349121, "step": 5880 }, { "epoch": 1.6718705648594947, "grad_norm": 6.928171157836914, "learning_rate": 9.16434856656259e-05, "loss": 3.1486383438110352, "step": 5890 }, { "epoch": 1.6747090547828556, "grad_norm": 6.992494583129883, "learning_rate": 9.162929321600908e-05, "loss": 3.1492950439453127, "step": 5900 }, { "epoch": 1.6775475447062163, "grad_norm": 7.486504077911377, "learning_rate": 9.161510076639228e-05, "loss": 3.112318420410156, "step": 5910 }, { "epoch": 1.680386034629577, "grad_norm": 7.546731948852539, "learning_rate": 9.160090831677548e-05, "loss": 3.0992712020874023, "step": 5920 }, { "epoch": 1.6832245245529378, "grad_norm": 7.064881801605225, "learning_rate": 9.158671586715868e-05, "loss": 3.16375732421875, "step": 5930 }, { "epoch": 1.6860630144762987, "grad_norm": 7.273037433624268, "learning_rate": 9.157252341754186e-05, "loss": 3.0974388122558594, "step": 5940 }, { "epoch": 1.6889015043996594, "grad_norm": 7.133945941925049, "learning_rate": 9.155833096792507e-05, "loss": 3.2099559783935545, "step": 5950 }, { "epoch": 1.69173999432302, "grad_norm": 7.203307628631592, "learning_rate": 9.154413851830826e-05, "loss": 3.155659484863281, "step": 5960 }, { "epoch": 
1.694578484246381, "grad_norm": 6.748016834259033, "learning_rate": 9.152994606869146e-05, "loss": 3.1310297012329102, "step": 5970 }, { "epoch": 1.6974169741697418, "grad_norm": 7.2437520027160645, "learning_rate": 9.151575361907465e-05, "loss": 3.214795684814453, "step": 5980 }, { "epoch": 1.7002554640931025, "grad_norm": 7.309248447418213, "learning_rate": 9.150156116945784e-05, "loss": 3.0795413970947267, "step": 5990 }, { "epoch": 1.7030939540164631, "grad_norm": 7.051397323608398, "learning_rate": 9.148736871984105e-05, "loss": 3.1205345153808595, "step": 6000 }, { "epoch": 1.7030939540164631, "eval_accuracy": 0.16455776689769186, "eval_loss": 3.359905958175659, "eval_runtime": 49.4642, "eval_samples_per_second": 317.947, "eval_steps_per_second": 4.973, "step": 6000 }, { "epoch": 1.705932443939824, "grad_norm": 7.265932083129883, "learning_rate": 9.147317627022424e-05, "loss": 3.1718673706054688, "step": 6010 }, { "epoch": 1.708770933863185, "grad_norm": 7.045156478881836, "learning_rate": 9.145898382060745e-05, "loss": 3.14837532043457, "step": 6020 }, { "epoch": 1.7116094237865456, "grad_norm": 6.883181571960449, "learning_rate": 9.144479137099063e-05, "loss": 3.128076934814453, "step": 6030 }, { "epoch": 1.7144479137099062, "grad_norm": 7.434040069580078, "learning_rate": 9.143059892137384e-05, "loss": 3.1323272705078127, "step": 6040 }, { "epoch": 1.7172864036332671, "grad_norm": 6.957037448883057, "learning_rate": 9.141640647175703e-05, "loss": 3.091903305053711, "step": 6050 }, { "epoch": 1.720124893556628, "grad_norm": 7.4107561111450195, "learning_rate": 9.140221402214024e-05, "loss": 3.1311073303222656, "step": 6060 }, { "epoch": 1.7229633834799887, "grad_norm": 7.51769495010376, "learning_rate": 9.138802157252342e-05, "loss": 3.159127426147461, "step": 6070 }, { "epoch": 1.7258018734033493, "grad_norm": 7.149380683898926, "learning_rate": 9.137382912290662e-05, "loss": 3.070414161682129, "step": 6080 }, { "epoch": 1.7286403633267102, "grad_norm": 
6.875611782073975, "learning_rate": 9.135963667328982e-05, "loss": 3.0872310638427733, "step": 6090 }, { "epoch": 1.7314788532500711, "grad_norm": 7.161673069000244, "learning_rate": 9.134544422367301e-05, "loss": 3.0810394287109375, "step": 6100 }, { "epoch": 1.7343173431734318, "grad_norm": 7.742437839508057, "learning_rate": 9.13312517740562e-05, "loss": 3.149779510498047, "step": 6110 }, { "epoch": 1.7371558330967924, "grad_norm": 6.913571834564209, "learning_rate": 9.13170593244394e-05, "loss": 3.18798942565918, "step": 6120 }, { "epoch": 1.7399943230201533, "grad_norm": 7.010511875152588, "learning_rate": 9.13028668748226e-05, "loss": 3.177118492126465, "step": 6130 }, { "epoch": 1.742832812943514, "grad_norm": 6.823977947235107, "learning_rate": 9.12886744252058e-05, "loss": 3.1103050231933596, "step": 6140 }, { "epoch": 1.7456713028668749, "grad_norm": 7.345937728881836, "learning_rate": 9.127448197558899e-05, "loss": 3.136082649230957, "step": 6150 }, { "epoch": 1.7485097927902356, "grad_norm": 7.181262016296387, "learning_rate": 9.126028952597218e-05, "loss": 3.115665817260742, "step": 6160 }, { "epoch": 1.7513482827135962, "grad_norm": 6.846573352813721, "learning_rate": 9.124609707635539e-05, "loss": 3.0637435913085938, "step": 6170 }, { "epoch": 1.754186772636957, "grad_norm": 7.564462184906006, "learning_rate": 9.123190462673858e-05, "loss": 3.168865203857422, "step": 6180 }, { "epoch": 1.757025262560318, "grad_norm": 7.124812602996826, "learning_rate": 9.121771217712178e-05, "loss": 3.185659408569336, "step": 6190 }, { "epoch": 1.7598637524836787, "grad_norm": 6.985522270202637, "learning_rate": 9.120351972750497e-05, "loss": 3.097420883178711, "step": 6200 }, { "epoch": 1.7627022424070393, "grad_norm": 6.971820831298828, "learning_rate": 9.118932727788816e-05, "loss": 3.0733257293701173, "step": 6210 }, { "epoch": 1.7655407323304002, "grad_norm": 6.932805061340332, "learning_rate": 9.117513482827137e-05, "loss": 3.0486093521118165, "step": 6220 }, { 
"epoch": 1.768379222253761, "grad_norm": 7.102753639221191, "learning_rate": 9.116094237865455e-05, "loss": 3.116290283203125, "step": 6230 }, { "epoch": 1.7712177121771218, "grad_norm": 7.330874919891357, "learning_rate": 9.114674992903776e-05, "loss": 3.0788433074951174, "step": 6240 }, { "epoch": 1.7740562021004824, "grad_norm": 7.017622947692871, "learning_rate": 9.113255747942095e-05, "loss": 3.0669355392456055, "step": 6250 }, { "epoch": 1.7768946920238433, "grad_norm": 7.642595291137695, "learning_rate": 9.111836502980416e-05, "loss": 3.177346038818359, "step": 6260 }, { "epoch": 1.7797331819472042, "grad_norm": 7.0256171226501465, "learning_rate": 9.110417258018734e-05, "loss": 3.153858757019043, "step": 6270 }, { "epoch": 1.7825716718705649, "grad_norm": 7.4350972175598145, "learning_rate": 9.108998013057054e-05, "loss": 3.094461441040039, "step": 6280 }, { "epoch": 1.7854101617939255, "grad_norm": 7.516396522521973, "learning_rate": 9.107578768095374e-05, "loss": 3.1002126693725587, "step": 6290 }, { "epoch": 1.7882486517172864, "grad_norm": 6.813597679138184, "learning_rate": 9.106159523133694e-05, "loss": 3.2005592346191407, "step": 6300 }, { "epoch": 1.7910871416406473, "grad_norm": 6.7827653884887695, "learning_rate": 9.104740278172012e-05, "loss": 3.130660629272461, "step": 6310 }, { "epoch": 1.793925631564008, "grad_norm": 7.027134895324707, "learning_rate": 9.103321033210333e-05, "loss": 3.127294158935547, "step": 6320 }, { "epoch": 1.7967641214873686, "grad_norm": 7.464398384094238, "learning_rate": 9.101901788248652e-05, "loss": 3.0698070526123047, "step": 6330 }, { "epoch": 1.7996026114107295, "grad_norm": 6.882223129272461, "learning_rate": 9.100482543286972e-05, "loss": 3.0357269287109374, "step": 6340 }, { "epoch": 1.8024411013340904, "grad_norm": 7.074294090270996, "learning_rate": 9.099063298325291e-05, "loss": 3.1358715057373048, "step": 6350 }, { "epoch": 1.805279591257451, "grad_norm": 7.3020734786987305, "learning_rate": 
9.09764405336361e-05, "loss": 3.0794723510742186, "step": 6360 }, { "epoch": 1.8081180811808117, "grad_norm": 7.841075420379639, "learning_rate": 9.096224808401931e-05, "loss": 3.1631757736206056, "step": 6370 }, { "epoch": 1.8109565711041726, "grad_norm": 7.577653408050537, "learning_rate": 9.09480556344025e-05, "loss": 3.1034149169921874, "step": 6380 }, { "epoch": 1.8137950610275335, "grad_norm": 6.905479907989502, "learning_rate": 9.09338631847857e-05, "loss": 3.1052978515625, "step": 6390 }, { "epoch": 1.8166335509508942, "grad_norm": 6.563653945922852, "learning_rate": 9.091967073516889e-05, "loss": 3.056715393066406, "step": 6400 }, { "epoch": 1.8194720408742548, "grad_norm": 6.989198684692383, "learning_rate": 9.09054782855521e-05, "loss": 3.134942626953125, "step": 6410 }, { "epoch": 1.8223105307976155, "grad_norm": 7.600033283233643, "learning_rate": 9.089128583593529e-05, "loss": 3.1876243591308593, "step": 6420 }, { "epoch": 1.8251490207209764, "grad_norm": 7.300682544708252, "learning_rate": 9.087709338631848e-05, "loss": 3.105923652648926, "step": 6430 }, { "epoch": 1.8279875106443373, "grad_norm": 7.330543518066406, "learning_rate": 9.086290093670168e-05, "loss": 3.0584564208984375, "step": 6440 }, { "epoch": 1.830826000567698, "grad_norm": 7.241456031799316, "learning_rate": 9.084870848708487e-05, "loss": 3.0900482177734374, "step": 6450 }, { "epoch": 1.8336644904910586, "grad_norm": 7.649754047393799, "learning_rate": 9.083451603746808e-05, "loss": 3.149706268310547, "step": 6460 }, { "epoch": 1.8365029804144195, "grad_norm": 7.1990966796875, "learning_rate": 9.082032358785126e-05, "loss": 3.1107961654663088, "step": 6470 }, { "epoch": 1.8393414703377804, "grad_norm": 7.180905342102051, "learning_rate": 9.080613113823447e-05, "loss": 2.9968276977539063, "step": 6480 }, { "epoch": 1.842179960261141, "grad_norm": 7.120096683502197, "learning_rate": 9.079193868861766e-05, "loss": 3.158291244506836, "step": 6490 }, { "epoch": 1.8450184501845017, 
"grad_norm": 7.124682903289795, "learning_rate": 9.077774623900087e-05, "loss": 3.0507122039794923, "step": 6500 }, { "epoch": 1.8450184501845017, "eval_accuracy": 0.1796909773001844, "eval_loss": 3.32062029838562, "eval_runtime": 46.179, "eval_samples_per_second": 340.566, "eval_steps_per_second": 5.327, "step": 6500 }, { "epoch": 1.8478569401078626, "grad_norm": 7.418855667114258, "learning_rate": 9.076355378938405e-05, "loss": 3.132175064086914, "step": 6510 }, { "epoch": 1.8506954300312235, "grad_norm": 7.267030239105225, "learning_rate": 9.074936133976725e-05, "loss": 3.068056106567383, "step": 6520 }, { "epoch": 1.8535339199545842, "grad_norm": 6.870095729827881, "learning_rate": 9.073516889015045e-05, "loss": 2.9891359329223635, "step": 6530 }, { "epoch": 1.8563724098779448, "grad_norm": 7.2687554359436035, "learning_rate": 9.072097644053364e-05, "loss": 3.1243703842163084, "step": 6540 }, { "epoch": 1.8592108998013057, "grad_norm": 7.233234405517578, "learning_rate": 9.070678399091683e-05, "loss": 3.2096431732177733, "step": 6550 }, { "epoch": 1.8620493897246666, "grad_norm": 6.932727336883545, "learning_rate": 9.069259154130003e-05, "loss": 3.0531225204467773, "step": 6560 }, { "epoch": 1.8648878796480273, "grad_norm": 7.36217737197876, "learning_rate": 9.067839909168323e-05, "loss": 3.0075233459472654, "step": 6570 }, { "epoch": 1.867726369571388, "grad_norm": 7.329558372497559, "learning_rate": 9.066420664206643e-05, "loss": 3.1232620239257813, "step": 6580 }, { "epoch": 1.8705648594947488, "grad_norm": 7.101428985595703, "learning_rate": 9.065001419244962e-05, "loss": 3.1878583908081053, "step": 6590 }, { "epoch": 1.8734033494181097, "grad_norm": 7.354240417480469, "learning_rate": 9.063582174283281e-05, "loss": 3.1040374755859377, "step": 6600 }, { "epoch": 1.8762418393414704, "grad_norm": 6.854389190673828, "learning_rate": 9.062162929321602e-05, "loss": 3.0504123687744142, "step": 6610 }, { "epoch": 1.879080329264831, "grad_norm": 7.839287281036377, 
"learning_rate": 9.060743684359921e-05, "loss": 3.0400516510009767, "step": 6620 }, { "epoch": 1.881918819188192, "grad_norm": 7.03000545501709, "learning_rate": 9.05932443939824e-05, "loss": 3.0506364822387697, "step": 6630 }, { "epoch": 1.8847573091115528, "grad_norm": 6.938281059265137, "learning_rate": 9.05790519443656e-05, "loss": 3.0934690475463866, "step": 6640 }, { "epoch": 1.8875957990349135, "grad_norm": 7.197543144226074, "learning_rate": 9.05648594947488e-05, "loss": 3.1102413177490233, "step": 6650 }, { "epoch": 1.8904342889582741, "grad_norm": 7.698197364807129, "learning_rate": 9.0550667045132e-05, "loss": 3.1511466979980467, "step": 6660 }, { "epoch": 1.8932727788816348, "grad_norm": 7.027536869049072, "learning_rate": 9.053647459551519e-05, "loss": 3.1108928680419923, "step": 6670 }, { "epoch": 1.8961112688049957, "grad_norm": 7.466037273406982, "learning_rate": 9.052228214589839e-05, "loss": 3.099617767333984, "step": 6680 }, { "epoch": 1.8989497587283566, "grad_norm": 7.3546295166015625, "learning_rate": 9.050808969628158e-05, "loss": 3.015228271484375, "step": 6690 }, { "epoch": 1.9017882486517173, "grad_norm": 7.027085781097412, "learning_rate": 9.049389724666479e-05, "loss": 3.0263950347900392, "step": 6700 }, { "epoch": 1.904626738575078, "grad_norm": 7.261797904968262, "learning_rate": 9.047970479704797e-05, "loss": 3.0751758575439454, "step": 6710 }, { "epoch": 1.9074652284984388, "grad_norm": 7.319139003753662, "learning_rate": 9.046551234743117e-05, "loss": 3.0575067520141603, "step": 6720 }, { "epoch": 1.9103037184217997, "grad_norm": 6.99714469909668, "learning_rate": 9.045131989781437e-05, "loss": 3.0946077346801757, "step": 6730 }, { "epoch": 1.9131422083451604, "grad_norm": 7.450718402862549, "learning_rate": 9.043712744819757e-05, "loss": 3.090783882141113, "step": 6740 }, { "epoch": 1.915980698268521, "grad_norm": 6.9283366203308105, "learning_rate": 9.042293499858075e-05, "loss": 3.0973028182983398, "step": 6750 }, { "epoch": 
1.918819188191882, "grad_norm": 7.012002944946289, "learning_rate": 9.040874254896396e-05, "loss": 3.1117820739746094, "step": 6760 }, { "epoch": 1.9216576781152428, "grad_norm": 7.584601402282715, "learning_rate": 9.039455009934715e-05, "loss": 3.0271873474121094, "step": 6770 }, { "epoch": 1.9244961680386035, "grad_norm": 6.956268310546875, "learning_rate": 9.038035764973035e-05, "loss": 3.094856834411621, "step": 6780 }, { "epoch": 1.9273346579619641, "grad_norm": 7.197462558746338, "learning_rate": 9.036758444507523e-05, "loss": 3.056380271911621, "step": 6790 }, { "epoch": 1.930173147885325, "grad_norm": 7.315066814422607, "learning_rate": 9.035339199545843e-05, "loss": 3.0565967559814453, "step": 6800 }, { "epoch": 1.933011637808686, "grad_norm": 7.4915995597839355, "learning_rate": 9.033919954584162e-05, "loss": 3.077047920227051, "step": 6810 }, { "epoch": 1.9358501277320466, "grad_norm": 6.8488078117370605, "learning_rate": 9.032500709622481e-05, "loss": 3.086366653442383, "step": 6820 }, { "epoch": 1.9386886176554072, "grad_norm": 6.963672161102295, "learning_rate": 9.0310814646608e-05, "loss": 3.0693288803100587, "step": 6830 }, { "epoch": 1.9415271075787681, "grad_norm": 7.125454902648926, "learning_rate": 9.029662219699121e-05, "loss": 3.004816246032715, "step": 6840 }, { "epoch": 1.944365597502129, "grad_norm": 6.970752239227295, "learning_rate": 9.028242974737439e-05, "loss": 3.005447769165039, "step": 6850 }, { "epoch": 1.9472040874254897, "grad_norm": 7.229462146759033, "learning_rate": 9.02682372977576e-05, "loss": 3.0055103302001953, "step": 6860 }, { "epoch": 1.9500425773488503, "grad_norm": 6.916894912719727, "learning_rate": 9.025404484814079e-05, "loss": 3.0426855087280273, "step": 6870 }, { "epoch": 1.9528810672722112, "grad_norm": 7.3634185791015625, "learning_rate": 9.0239852398524e-05, "loss": 3.0429637908935545, "step": 6880 }, { "epoch": 1.9557195571955721, "grad_norm": 7.260014057159424, "learning_rate": 9.022565994890718e-05, "loss": 
3.0763599395751955, "step": 6890 }, { "epoch": 1.9585580471189328, "grad_norm": 7.484272480010986, "learning_rate": 9.021146749929039e-05, "loss": 3.006413459777832, "step": 6900 }, { "epoch": 1.9613965370422934, "grad_norm": 7.407604217529297, "learning_rate": 9.019727504967358e-05, "loss": 3.0897136688232423, "step": 6910 }, { "epoch": 1.964235026965654, "grad_norm": 6.982259750366211, "learning_rate": 9.018308260005677e-05, "loss": 3.112310791015625, "step": 6920 }, { "epoch": 1.967073516889015, "grad_norm": 7.1636762619018555, "learning_rate": 9.016889015043997e-05, "loss": 3.0192333221435548, "step": 6930 }, { "epoch": 1.9699120068123759, "grad_norm": 7.2769036293029785, "learning_rate": 9.015469770082316e-05, "loss": 3.0604976654052733, "step": 6940 }, { "epoch": 1.9727504967357365, "grad_norm": 7.406731605529785, "learning_rate": 9.014050525120637e-05, "loss": 3.0251796722412108, "step": 6950 }, { "epoch": 1.9755889866590972, "grad_norm": 7.0674943923950195, "learning_rate": 9.012631280158956e-05, "loss": 3.011102294921875, "step": 6960 }, { "epoch": 1.978427476582458, "grad_norm": 7.2743706703186035, "learning_rate": 9.011212035197275e-05, "loss": 3.0371013641357423, "step": 6970 }, { "epoch": 1.981265966505819, "grad_norm": 7.500882148742676, "learning_rate": 9.009792790235595e-05, "loss": 3.0980018615722655, "step": 6980 }, { "epoch": 1.9841044564291797, "grad_norm": 7.372799396514893, "learning_rate": 9.008373545273915e-05, "loss": 3.017116355895996, "step": 6990 }, { "epoch": 1.9869429463525403, "grad_norm": 6.875178337097168, "learning_rate": 9.006954300312235e-05, "loss": 3.0776180267333983, "step": 7000 }, { "epoch": 1.9869429463525403, "eval_accuracy": 0.1838875818655815, "eval_loss": 3.2836720943450928, "eval_runtime": 48.2708, "eval_samples_per_second": 325.808, "eval_steps_per_second": 5.096, "step": 7000 }, { "epoch": 1.9897814362759012, "grad_norm": 7.214849472045898, "learning_rate": 9.005535055350554e-05, "loss": 3.0473934173583985, "step": 
7010 }, { "epoch": 1.992619926199262, "grad_norm": 7.253635883331299, "learning_rate": 9.004115810388873e-05, "loss": 3.100991058349609, "step": 7020 }, { "epoch": 1.9954584161226228, "grad_norm": 6.890476226806641, "learning_rate": 9.002696565427194e-05, "loss": 3.062240219116211, "step": 7030 }, { "epoch": 1.9982969060459834, "grad_norm": 7.0843071937561035, "learning_rate": 9.001277320465513e-05, "loss": 3.0320831298828126, "step": 7040 }, { "epoch": 2.001135395969344, "grad_norm": 7.020041465759277, "learning_rate": 9e-05, "loss": 2.9865982055664064, "step": 7050 }, { "epoch": 2.003973885892705, "grad_norm": 7.337575435638428, "learning_rate": 8.99858075503832e-05, "loss": 3.0478511810302735, "step": 7060 }, { "epoch": 2.006812375816066, "grad_norm": 6.634625434875488, "learning_rate": 8.997161510076639e-05, "loss": 3.002874565124512, "step": 7070 }, { "epoch": 2.0096508657394265, "grad_norm": 7.193136692047119, "learning_rate": 8.995742265114959e-05, "loss": 2.993805694580078, "step": 7080 }, { "epoch": 2.012489355662787, "grad_norm": 7.1726484298706055, "learning_rate": 8.994323020153279e-05, "loss": 2.955626106262207, "step": 7090 }, { "epoch": 2.0153278455861483, "grad_norm": 7.465689659118652, "learning_rate": 8.992903775191599e-05, "loss": 3.0385114669799806, "step": 7100 }, { "epoch": 2.018166335509509, "grad_norm": 7.2667670249938965, "learning_rate": 8.991484530229918e-05, "loss": 3.011845016479492, "step": 7110 }, { "epoch": 2.0210048254328696, "grad_norm": 6.933682441711426, "learning_rate": 8.990065285268237e-05, "loss": 3.0108573913574217, "step": 7120 }, { "epoch": 2.0238433153562303, "grad_norm": 7.140575408935547, "learning_rate": 8.988646040306558e-05, "loss": 3.0276166915893556, "step": 7130 }, { "epoch": 2.0266818052795914, "grad_norm": 7.31045389175415, "learning_rate": 8.987226795344877e-05, "loss": 3.011644744873047, "step": 7140 }, { "epoch": 2.029520295202952, "grad_norm": 7.340124130249023, "learning_rate": 8.985807550383197e-05, 
"loss": 2.9920093536376955, "step": 7150 }, { "epoch": 2.0323587851263127, "grad_norm": 6.799068927764893, "learning_rate": 8.984388305421516e-05, "loss": 2.9929824829101563, "step": 7160 }, { "epoch": 2.0351972750496734, "grad_norm": 6.816397666931152, "learning_rate": 8.982969060459837e-05, "loss": 3.026209259033203, "step": 7170 }, { "epoch": 2.0380357649730345, "grad_norm": 6.848625183105469, "learning_rate": 8.981549815498156e-05, "loss": 2.923965072631836, "step": 7180 }, { "epoch": 2.040874254896395, "grad_norm": 7.213102340698242, "learning_rate": 8.980130570536475e-05, "loss": 2.958165740966797, "step": 7190 }, { "epoch": 2.043712744819756, "grad_norm": 7.22298526763916, "learning_rate": 8.978711325574795e-05, "loss": 2.997864532470703, "step": 7200 }, { "epoch": 2.0465512347431165, "grad_norm": 7.2114715576171875, "learning_rate": 8.977292080613114e-05, "loss": 3.102789306640625, "step": 7210 }, { "epoch": 2.0493897246664776, "grad_norm": 6.915980339050293, "learning_rate": 8.975872835651435e-05, "loss": 2.9992111206054686, "step": 7220 }, { "epoch": 2.0522282145898383, "grad_norm": 6.80872917175293, "learning_rate": 8.974453590689753e-05, "loss": 2.9949310302734373, "step": 7230 }, { "epoch": 2.055066704513199, "grad_norm": 6.854336738586426, "learning_rate": 8.973034345728073e-05, "loss": 3.006937599182129, "step": 7240 }, { "epoch": 2.0579051944365596, "grad_norm": 7.122447967529297, "learning_rate": 8.971615100766393e-05, "loss": 2.9268562316894533, "step": 7250 }, { "epoch": 2.0607436843599207, "grad_norm": 7.403303623199463, "learning_rate": 8.970195855804713e-05, "loss": 3.038021469116211, "step": 7260 }, { "epoch": 2.0635821742832814, "grad_norm": 6.829321384429932, "learning_rate": 8.968776610843031e-05, "loss": 2.9471763610839843, "step": 7270 }, { "epoch": 2.066420664206642, "grad_norm": 7.2231364250183105, "learning_rate": 8.967357365881352e-05, "loss": 2.9844751358032227, "step": 7280 }, { "epoch": 2.0692591541300027, "grad_norm": 
7.275439739227295, "learning_rate": 8.965938120919671e-05, "loss": 2.974617767333984, "step": 7290 }, { "epoch": 2.072097644053364, "grad_norm": 7.04807186126709, "learning_rate": 8.96451887595799e-05, "loss": 3.00186824798584, "step": 7300 }, { "epoch": 2.0749361339767245, "grad_norm": 7.410334587097168, "learning_rate": 8.96309963099631e-05, "loss": 3.0554100036621095, "step": 7310 }, { "epoch": 2.077774623900085, "grad_norm": 6.9013214111328125, "learning_rate": 8.961680386034629e-05, "loss": 3.001266670227051, "step": 7320 }, { "epoch": 2.080613113823446, "grad_norm": 6.877133846282959, "learning_rate": 8.96026114107295e-05, "loss": 3.0021188735961912, "step": 7330 }, { "epoch": 2.0834516037468065, "grad_norm": 7.444881439208984, "learning_rate": 8.95884189611127e-05, "loss": 3.0845413208007812, "step": 7340 }, { "epoch": 2.0862900936701676, "grad_norm": 6.977440357208252, "learning_rate": 8.957422651149589e-05, "loss": 2.999803161621094, "step": 7350 }, { "epoch": 2.0891285835935283, "grad_norm": 7.0117716789245605, "learning_rate": 8.956003406187908e-05, "loss": 2.9508079528808593, "step": 7360 }, { "epoch": 2.091967073516889, "grad_norm": 6.8857502937316895, "learning_rate": 8.954584161226229e-05, "loss": 2.966469383239746, "step": 7370 }, { "epoch": 2.0948055634402496, "grad_norm": 7.439071178436279, "learning_rate": 8.953164916264548e-05, "loss": 3.0064109802246093, "step": 7380 }, { "epoch": 2.0976440533636107, "grad_norm": 7.105075836181641, "learning_rate": 8.951745671302867e-05, "loss": 2.9738485336303713, "step": 7390 }, { "epoch": 2.1004825432869714, "grad_norm": 7.012788772583008, "learning_rate": 8.950326426341187e-05, "loss": 2.955512046813965, "step": 7400 }, { "epoch": 2.103321033210332, "grad_norm": 6.730929374694824, "learning_rate": 8.948907181379506e-05, "loss": 3.0361576080322266, "step": 7410 }, { "epoch": 2.1061595231336927, "grad_norm": 7.4957499504089355, "learning_rate": 8.947487936417827e-05, "loss": 3.003215217590332, "step": 7420 }, 
{ "epoch": 2.108998013057054, "grad_norm": 7.167891025543213, "learning_rate": 8.946068691456145e-05, "loss": 2.944964790344238, "step": 7430 }, { "epoch": 2.1118365029804145, "grad_norm": 7.20109748840332, "learning_rate": 8.944649446494465e-05, "loss": 3.0146488189697265, "step": 7440 }, { "epoch": 2.114674992903775, "grad_norm": 7.363357067108154, "learning_rate": 8.943230201532785e-05, "loss": 3.0049734115600586, "step": 7450 }, { "epoch": 2.117513482827136, "grad_norm": 7.020476818084717, "learning_rate": 8.941810956571105e-05, "loss": 3.008789825439453, "step": 7460 }, { "epoch": 2.120351972750497, "grad_norm": 7.480217933654785, "learning_rate": 8.940391711609423e-05, "loss": 2.9790884017944337, "step": 7470 }, { "epoch": 2.1231904626738576, "grad_norm": 7.118488311767578, "learning_rate": 8.938972466647744e-05, "loss": 2.961781120300293, "step": 7480 }, { "epoch": 2.1260289525972182, "grad_norm": 7.233987331390381, "learning_rate": 8.937553221686063e-05, "loss": 3.0179004669189453, "step": 7490 }, { "epoch": 2.128867442520579, "grad_norm": 7.0895562171936035, "learning_rate": 8.936133976724384e-05, "loss": 3.033833122253418, "step": 7500 }, { "epoch": 2.128867442520579, "eval_accuracy": 0.18477777071278692, "eval_loss": 3.2561779022216797, "eval_runtime": 44.552, "eval_samples_per_second": 353.003, "eval_steps_per_second": 5.522, "step": 7500 }, { "epoch": 2.13170593244394, "grad_norm": 7.158928871154785, "learning_rate": 8.934714731762702e-05, "loss": 2.9850347518920897, "step": 7510 }, { "epoch": 2.1345444223673007, "grad_norm": 7.235931396484375, "learning_rate": 8.933295486801023e-05, "loss": 2.940884017944336, "step": 7520 }, { "epoch": 2.1373829122906614, "grad_norm": 6.909207820892334, "learning_rate": 8.931876241839342e-05, "loss": 3.050968551635742, "step": 7530 }, { "epoch": 2.140221402214022, "grad_norm": 7.26664400100708, "learning_rate": 8.930456996877661e-05, "loss": 2.896742248535156, "step": 7540 }, { "epoch": 2.1430598921373827, 
"grad_norm": 6.895328521728516, "learning_rate": 8.929037751915981e-05, "loss": 3.0134418487548826, "step": 7550 }, { "epoch": 2.145898382060744, "grad_norm": 7.195440769195557, "learning_rate": 8.9276185069543e-05, "loss": 2.97463321685791, "step": 7560 }, { "epoch": 2.1487368719841045, "grad_norm": 6.864922523498535, "learning_rate": 8.926199261992621e-05, "loss": 3.0006351470947266, "step": 7570 }, { "epoch": 2.151575361907465, "grad_norm": 6.906733512878418, "learning_rate": 8.92478001703094e-05, "loss": 2.9121646881103516, "step": 7580 }, { "epoch": 2.1544138518308262, "grad_norm": 7.0508317947387695, "learning_rate": 8.92336077206926e-05, "loss": 2.9423547744750977, "step": 7590 }, { "epoch": 2.157252341754187, "grad_norm": 6.944141864776611, "learning_rate": 8.921941527107579e-05, "loss": 3.002177047729492, "step": 7600 }, { "epoch": 2.1600908316775476, "grad_norm": 6.861388206481934, "learning_rate": 8.9205222821459e-05, "loss": 3.023697090148926, "step": 7610 }, { "epoch": 2.1629293216009082, "grad_norm": 7.501952648162842, "learning_rate": 8.919103037184219e-05, "loss": 2.9673084259033202, "step": 7620 }, { "epoch": 2.165767811524269, "grad_norm": 7.208130359649658, "learning_rate": 8.917683792222538e-05, "loss": 2.9726791381835938, "step": 7630 }, { "epoch": 2.16860630144763, "grad_norm": 6.967475891113281, "learning_rate": 8.916264547260857e-05, "loss": 3.052836608886719, "step": 7640 }, { "epoch": 2.1714447913709907, "grad_norm": 6.831790447235107, "learning_rate": 8.914845302299177e-05, "loss": 2.9636695861816404, "step": 7650 }, { "epoch": 2.1742832812943513, "grad_norm": 6.943994998931885, "learning_rate": 8.913426057337497e-05, "loss": 2.9653341293334963, "step": 7660 }, { "epoch": 2.177121771217712, "grad_norm": 6.855183124542236, "learning_rate": 8.912006812375815e-05, "loss": 2.9928884506225586, "step": 7670 }, { "epoch": 2.179960261141073, "grad_norm": 7.079280853271484, "learning_rate": 8.910587567414136e-05, "loss": 2.956267547607422, "step": 
7680 }, { "epoch": 2.1827987510644338, "grad_norm": 7.41858434677124, "learning_rate": 8.909168322452455e-05, "loss": 3.005336380004883, "step": 7690 }, { "epoch": 2.1856372409877944, "grad_norm": 6.8690314292907715, "learning_rate": 8.907749077490776e-05, "loss": 2.9264204025268556, "step": 7700 }, { "epoch": 2.188475730911155, "grad_norm": 7.094261169433594, "learning_rate": 8.906329832529094e-05, "loss": 3.0204660415649416, "step": 7710 }, { "epoch": 2.191314220834516, "grad_norm": 7.09037446975708, "learning_rate": 8.904910587567415e-05, "loss": 2.884510040283203, "step": 7720 }, { "epoch": 2.194152710757877, "grad_norm": 7.329184532165527, "learning_rate": 8.903491342605734e-05, "loss": 2.946273422241211, "step": 7730 }, { "epoch": 2.1969912006812375, "grad_norm": 6.8371453285217285, "learning_rate": 8.902072097644055e-05, "loss": 2.996639442443848, "step": 7740 }, { "epoch": 2.199829690604598, "grad_norm": 7.345523357391357, "learning_rate": 8.900652852682373e-05, "loss": 2.925536346435547, "step": 7750 }, { "epoch": 2.2026681805279593, "grad_norm": 7.15377950668335, "learning_rate": 8.899233607720693e-05, "loss": 2.921312713623047, "step": 7760 }, { "epoch": 2.20550667045132, "grad_norm": 7.075297832489014, "learning_rate": 8.897814362759013e-05, "loss": 3.032107925415039, "step": 7770 }, { "epoch": 2.2083451603746806, "grad_norm": 7.433184623718262, "learning_rate": 8.896395117797332e-05, "loss": 2.9729080200195312, "step": 7780 }, { "epoch": 2.2111836502980413, "grad_norm": 7.298872947692871, "learning_rate": 8.894975872835651e-05, "loss": 2.9828184127807615, "step": 7790 }, { "epoch": 2.2140221402214024, "grad_norm": 6.794515609741211, "learning_rate": 8.893556627873971e-05, "loss": 2.959442901611328, "step": 7800 }, { "epoch": 2.216860630144763, "grad_norm": 7.307714462280273, "learning_rate": 8.892137382912291e-05, "loss": 2.9701629638671876, "step": 7810 }, { "epoch": 2.2196991200681238, "grad_norm": 7.365050315856934, "learning_rate": 
8.890718137950611e-05, "loss": 2.932099533081055, "step": 7820 }, { "epoch": 2.2225376099914844, "grad_norm": 6.848733425140381, "learning_rate": 8.88929889298893e-05, "loss": 3.0102256774902343, "step": 7830 }, { "epoch": 2.225376099914845, "grad_norm": 6.932369709014893, "learning_rate": 8.88787964802725e-05, "loss": 2.9806324005126954, "step": 7840 }, { "epoch": 2.228214589838206, "grad_norm": 6.907382011413574, "learning_rate": 8.88646040306557e-05, "loss": 2.898751068115234, "step": 7850 }, { "epoch": 2.231053079761567, "grad_norm": 7.3757243156433105, "learning_rate": 8.88504115810389e-05, "loss": 3.018966865539551, "step": 7860 }, { "epoch": 2.2338915696849275, "grad_norm": 6.8580474853515625, "learning_rate": 8.883621913142209e-05, "loss": 2.9562408447265627, "step": 7870 }, { "epoch": 2.236730059608288, "grad_norm": 7.170674800872803, "learning_rate": 8.882202668180528e-05, "loss": 2.9601058959960938, "step": 7880 }, { "epoch": 2.2395685495316493, "grad_norm": 7.538591384887695, "learning_rate": 8.880783423218848e-05, "loss": 3.0050739288330077, "step": 7890 }, { "epoch": 2.24240703945501, "grad_norm": 6.964306354522705, "learning_rate": 8.879364178257168e-05, "loss": 2.9637704849243165, "step": 7900 }, { "epoch": 2.2452455293783706, "grad_norm": 7.015460968017578, "learning_rate": 8.877944933295486e-05, "loss": 2.9406723022460937, "step": 7910 }, { "epoch": 2.2480840193017313, "grad_norm": 7.140761852264404, "learning_rate": 8.876525688333807e-05, "loss": 2.9569629669189452, "step": 7920 }, { "epoch": 2.2509225092250924, "grad_norm": 6.992654800415039, "learning_rate": 8.875106443372126e-05, "loss": 2.9742013931274416, "step": 7930 }, { "epoch": 2.253760999148453, "grad_norm": 7.389883041381836, "learning_rate": 8.873687198410447e-05, "loss": 2.9747982025146484, "step": 7940 }, { "epoch": 2.2565994890718137, "grad_norm": 7.27595853805542, "learning_rate": 8.872267953448765e-05, "loss": 2.9884355545043944, "step": 7950 }, { "epoch": 2.2594379789951744, 
"grad_norm": 7.192576885223389, "learning_rate": 8.870848708487086e-05, "loss": 3.006010818481445, "step": 7960 }, { "epoch": 2.2622764689185355, "grad_norm": 7.14976167678833, "learning_rate": 8.869429463525405e-05, "loss": 2.9540781021118163, "step": 7970 }, { "epoch": 2.265114958841896, "grad_norm": 7.214508056640625, "learning_rate": 8.868010218563724e-05, "loss": 2.984724235534668, "step": 7980 }, { "epoch": 2.267953448765257, "grad_norm": 7.010662078857422, "learning_rate": 8.866590973602044e-05, "loss": 2.9162067413330077, "step": 7990 }, { "epoch": 2.2707919386886175, "grad_norm": 7.103058338165283, "learning_rate": 8.865171728640363e-05, "loss": 2.969054412841797, "step": 8000 }, { "epoch": 2.2707919386886175, "eval_accuracy": 0.19666815031474535, "eval_loss": 3.2188949584960938, "eval_runtime": 45.1994, "eval_samples_per_second": 347.947, "eval_steps_per_second": 5.443, "step": 8000 }, { "epoch": 2.2736304286119786, "grad_norm": 6.998394012451172, "learning_rate": 8.863752483678684e-05, "loss": 2.9513620376586913, "step": 8010 }, { "epoch": 2.2764689185353393, "grad_norm": 7.2774763107299805, "learning_rate": 8.862333238717003e-05, "loss": 3.0010002136230467, "step": 8020 }, { "epoch": 2.2793074084587, "grad_norm": 7.241909503936768, "learning_rate": 8.860913993755322e-05, "loss": 2.944485092163086, "step": 8030 }, { "epoch": 2.2821458983820606, "grad_norm": 7.239370822906494, "learning_rate": 8.859494748793642e-05, "loss": 2.9679054260253905, "step": 8040 }, { "epoch": 2.2849843883054213, "grad_norm": 6.860888957977295, "learning_rate": 8.858075503831962e-05, "loss": 2.9858963012695314, "step": 8050 }, { "epoch": 2.2878228782287824, "grad_norm": 6.866098403930664, "learning_rate": 8.856656258870282e-05, "loss": 3.016581726074219, "step": 8060 }, { "epoch": 2.290661368152143, "grad_norm": 7.1915411949157715, "learning_rate": 8.855237013908601e-05, "loss": 2.889341926574707, "step": 8070 }, { "epoch": 2.2934998580755037, "grad_norm": 7.065985202789307, 
"learning_rate": 8.85381776894692e-05, "loss": 2.8584890365600586, "step": 8080 }, { "epoch": 2.296338347998865, "grad_norm": 6.95184326171875, "learning_rate": 8.852398523985241e-05, "loss": 2.9160017013549804, "step": 8090 }, { "epoch": 2.2991768379222255, "grad_norm": 6.942594528198242, "learning_rate": 8.85097927902356e-05, "loss": 3.0736419677734377, "step": 8100 }, { "epoch": 2.302015327845586, "grad_norm": 7.1761088371276855, "learning_rate": 8.84956003406188e-05, "loss": 3.018292999267578, "step": 8110 }, { "epoch": 2.304853817768947, "grad_norm": 7.297174453735352, "learning_rate": 8.848140789100199e-05, "loss": 2.927659797668457, "step": 8120 }, { "epoch": 2.3076923076923075, "grad_norm": 7.233119487762451, "learning_rate": 8.846721544138518e-05, "loss": 2.9284170150756834, "step": 8130 }, { "epoch": 2.3105307976156686, "grad_norm": 6.977563381195068, "learning_rate": 8.845302299176839e-05, "loss": 2.9881343841552734, "step": 8140 }, { "epoch": 2.3133692875390293, "grad_norm": 7.381855010986328, "learning_rate": 8.843883054215158e-05, "loss": 2.9343181610107423, "step": 8150 }, { "epoch": 2.31620777746239, "grad_norm": 7.002601146697998, "learning_rate": 8.842463809253478e-05, "loss": 2.9738813400268556, "step": 8160 }, { "epoch": 2.3190462673857506, "grad_norm": 6.818892955780029, "learning_rate": 8.841044564291797e-05, "loss": 2.9726383209228517, "step": 8170 }, { "epoch": 2.3218847573091117, "grad_norm": 6.93902063369751, "learning_rate": 8.839625319330118e-05, "loss": 2.932486343383789, "step": 8180 }, { "epoch": 2.3247232472324724, "grad_norm": 6.869699954986572, "learning_rate": 8.838206074368437e-05, "loss": 2.9810461044311523, "step": 8190 }, { "epoch": 2.327561737155833, "grad_norm": 7.040811061859131, "learning_rate": 8.836786829406756e-05, "loss": 3.0223529815673826, "step": 8200 }, { "epoch": 2.3304002270791937, "grad_norm": 7.318644046783447, "learning_rate": 8.835367584445076e-05, "loss": 2.97913818359375, "step": 8210 }, { "epoch": 
2.333238717002555, "grad_norm": 7.084681034088135, "learning_rate": 8.833948339483395e-05, "loss": 2.990447235107422, "step": 8220 }, { "epoch": 2.3360772069259155, "grad_norm": 7.0512518882751465, "learning_rate": 8.832529094521716e-05, "loss": 2.925343322753906, "step": 8230 }, { "epoch": 2.338915696849276, "grad_norm": 6.8431806564331055, "learning_rate": 8.831109849560034e-05, "loss": 3.0136962890625, "step": 8240 }, { "epoch": 2.341754186772637, "grad_norm": 7.306234359741211, "learning_rate": 8.829690604598354e-05, "loss": 2.9823299407958985, "step": 8250 }, { "epoch": 2.344592676695998, "grad_norm": 7.096662521362305, "learning_rate": 8.828271359636674e-05, "loss": 2.9963764190673827, "step": 8260 }, { "epoch": 2.3474311666193586, "grad_norm": 6.933344841003418, "learning_rate": 8.826852114674994e-05, "loss": 2.945130157470703, "step": 8270 }, { "epoch": 2.3502696565427192, "grad_norm": 7.218621730804443, "learning_rate": 8.825432869713312e-05, "loss": 2.9685407638549806, "step": 8280 }, { "epoch": 2.35310814646608, "grad_norm": 7.187698841094971, "learning_rate": 8.824013624751633e-05, "loss": 2.8783145904541017, "step": 8290 }, { "epoch": 2.355946636389441, "grad_norm": 6.906722068786621, "learning_rate": 8.822594379789952e-05, "loss": 2.9712802886962892, "step": 8300 }, { "epoch": 2.3587851263128017, "grad_norm": 7.323396682739258, "learning_rate": 8.821175134828272e-05, "loss": 2.9350963592529298, "step": 8310 }, { "epoch": 2.3616236162361623, "grad_norm": 7.0457563400268555, "learning_rate": 8.819755889866591e-05, "loss": 3.0265869140625, "step": 8320 }, { "epoch": 2.364462106159523, "grad_norm": 7.1170759201049805, "learning_rate": 8.81833664490491e-05, "loss": 2.94677791595459, "step": 8330 }, { "epoch": 2.3673005960828837, "grad_norm": 6.928656578063965, "learning_rate": 8.816917399943231e-05, "loss": 2.9650028228759764, "step": 8340 }, { "epoch": 2.370139086006245, "grad_norm": 6.818179607391357, "learning_rate": 8.81549815498155e-05, "loss": 
2.967848205566406, "step": 8350 }, { "epoch": 2.3729775759296055, "grad_norm": 6.818332195281982, "learning_rate": 8.81407891001987e-05, "loss": 3.027314376831055, "step": 8360 }, { "epoch": 2.375816065852966, "grad_norm": 6.946022033691406, "learning_rate": 8.812659665058189e-05, "loss": 2.9380189895629885, "step": 8370 }, { "epoch": 2.3786545557763272, "grad_norm": 7.160337924957275, "learning_rate": 8.81124042009651e-05, "loss": 2.886252021789551, "step": 8380 }, { "epoch": 2.381493045699688, "grad_norm": 6.824338436126709, "learning_rate": 8.809821175134829e-05, "loss": 2.9398828506469727, "step": 8390 }, { "epoch": 2.3843315356230486, "grad_norm": 6.988341331481934, "learning_rate": 8.808401930173148e-05, "loss": 2.9014394760131834, "step": 8400 }, { "epoch": 2.3871700255464092, "grad_norm": 6.887318134307861, "learning_rate": 8.806982685211468e-05, "loss": 2.925220489501953, "step": 8410 }, { "epoch": 2.39000851546977, "grad_norm": 7.130527496337891, "learning_rate": 8.805563440249788e-05, "loss": 2.915163040161133, "step": 8420 }, { "epoch": 2.392847005393131, "grad_norm": 7.068617343902588, "learning_rate": 8.804144195288108e-05, "loss": 2.990524673461914, "step": 8430 }, { "epoch": 2.3956854953164917, "grad_norm": 7.3207221031188965, "learning_rate": 8.802724950326427e-05, "loss": 2.9729509353637695, "step": 8440 }, { "epoch": 2.3985239852398523, "grad_norm": 7.431980609893799, "learning_rate": 8.801305705364746e-05, "loss": 2.984950637817383, "step": 8450 }, { "epoch": 2.401362475163213, "grad_norm": 7.138377666473389, "learning_rate": 8.799886460403066e-05, "loss": 2.910663604736328, "step": 8460 }, { "epoch": 2.404200965086574, "grad_norm": 7.368851661682129, "learning_rate": 8.798467215441386e-05, "loss": 2.928873825073242, "step": 8470 }, { "epoch": 2.4070394550099348, "grad_norm": 7.015850067138672, "learning_rate": 8.797047970479704e-05, "loss": 2.9680854797363283, "step": 8480 }, { "epoch": 2.4098779449332954, "grad_norm": 7.081567764282227, 
"learning_rate": 8.795628725518025e-05, "loss": 2.966905212402344, "step": 8490 }, { "epoch": 2.412716434856656, "grad_norm": 7.037362098693848, "learning_rate": 8.794209480556344e-05, "loss": 2.8691570281982424, "step": 8500 }, { "epoch": 2.412716434856656, "eval_accuracy": 0.19552362179690977, "eval_loss": 3.1851179599761963, "eval_runtime": 48.1131, "eval_samples_per_second": 326.875, "eval_steps_per_second": 5.113, "step": 8500 }, { "epoch": 2.415554924780017, "grad_norm": 7.438180446624756, "learning_rate": 8.792790235594665e-05, "loss": 2.9282684326171875, "step": 8510 }, { "epoch": 2.418393414703378, "grad_norm": 7.114441871643066, "learning_rate": 8.791370990632983e-05, "loss": 3.0190719604492187, "step": 8520 }, { "epoch": 2.4212319046267385, "grad_norm": 6.840938091278076, "learning_rate": 8.789951745671304e-05, "loss": 2.919853401184082, "step": 8530 }, { "epoch": 2.424070394550099, "grad_norm": 7.619636535644531, "learning_rate": 8.788532500709623e-05, "loss": 2.943578338623047, "step": 8540 }, { "epoch": 2.42690888447346, "grad_norm": 6.722525596618652, "learning_rate": 8.787113255747942e-05, "loss": 2.9094802856445314, "step": 8550 }, { "epoch": 2.429747374396821, "grad_norm": 6.7119975090026855, "learning_rate": 8.785694010786262e-05, "loss": 2.9845453262329102, "step": 8560 }, { "epoch": 2.4325858643201816, "grad_norm": 6.9675774574279785, "learning_rate": 8.784274765824581e-05, "loss": 2.919679069519043, "step": 8570 }, { "epoch": 2.4354243542435423, "grad_norm": 7.04935884475708, "learning_rate": 8.782855520862902e-05, "loss": 2.9998090744018553, "step": 8580 }, { "epoch": 2.4382628441669034, "grad_norm": 6.9028167724609375, "learning_rate": 8.781436275901221e-05, "loss": 2.9296749114990233, "step": 8590 }, { "epoch": 2.441101334090264, "grad_norm": 7.281465530395508, "learning_rate": 8.78001703093954e-05, "loss": 2.9030799865722656, "step": 8600 }, { "epoch": 2.4439398240136248, "grad_norm": 6.893548011779785, "learning_rate": 
8.77859778597786e-05, "loss": 2.916340637207031, "step": 8610 }, { "epoch": 2.4467783139369854, "grad_norm": 6.831697463989258, "learning_rate": 8.77717854101618e-05, "loss": 2.9174118041992188, "step": 8620 }, { "epoch": 2.449616803860346, "grad_norm": 7.210524559020996, "learning_rate": 8.7757592960545e-05, "loss": 2.955144500732422, "step": 8630 }, { "epoch": 2.452455293783707, "grad_norm": 6.921781539916992, "learning_rate": 8.774340051092819e-05, "loss": 2.94144287109375, "step": 8640 }, { "epoch": 2.455293783707068, "grad_norm": 6.723336219787598, "learning_rate": 8.772920806131138e-05, "loss": 2.966686248779297, "step": 8650 }, { "epoch": 2.4581322736304285, "grad_norm": 6.924388885498047, "learning_rate": 8.771501561169458e-05, "loss": 2.9000694274902346, "step": 8660 }, { "epoch": 2.4609707635537896, "grad_norm": 7.592317581176758, "learning_rate": 8.770082316207778e-05, "loss": 2.9506412506103517, "step": 8670 }, { "epoch": 2.4638092534771503, "grad_norm": 6.894314765930176, "learning_rate": 8.768663071246096e-05, "loss": 2.9427852630615234, "step": 8680 }, { "epoch": 2.466647743400511, "grad_norm": 6.761929035186768, "learning_rate": 8.767243826284417e-05, "loss": 2.8824424743652344, "step": 8690 }, { "epoch": 2.4694862333238716, "grad_norm": 7.359562873840332, "learning_rate": 8.765824581322736e-05, "loss": 2.89410343170166, "step": 8700 }, { "epoch": 2.4723247232472323, "grad_norm": 6.97009801864624, "learning_rate": 8.764405336361057e-05, "loss": 2.9914566040039063, "step": 8710 }, { "epoch": 2.4751632131705934, "grad_norm": 7.341858863830566, "learning_rate": 8.762986091399375e-05, "loss": 3.012974739074707, "step": 8720 }, { "epoch": 2.478001703093954, "grad_norm": 7.577096462249756, "learning_rate": 8.761566846437696e-05, "loss": 2.9189479827880858, "step": 8730 }, { "epoch": 2.4808401930173147, "grad_norm": 7.3713202476501465, "learning_rate": 8.760147601476015e-05, "loss": 2.973664474487305, "step": 8740 }, { "epoch": 2.4836786829406754, 
"grad_norm": 6.977940082550049, "learning_rate": 8.758728356514336e-05, "loss": 2.9420749664306642, "step": 8750 }, { "epoch": 2.4865171728640365, "grad_norm": 7.015196323394775, "learning_rate": 8.757309111552654e-05, "loss": 2.9606887817382814, "step": 8760 }, { "epoch": 2.489355662787397, "grad_norm": 6.993571758270264, "learning_rate": 8.755889866590974e-05, "loss": 2.9707393646240234, "step": 8770 }, { "epoch": 2.492194152710758, "grad_norm": 6.839693546295166, "learning_rate": 8.754470621629294e-05, "loss": 2.8748878479003905, "step": 8780 }, { "epoch": 2.4950326426341185, "grad_norm": 7.084383010864258, "learning_rate": 8.753051376667613e-05, "loss": 2.843754768371582, "step": 8790 }, { "epoch": 2.4978711325574796, "grad_norm": 6.754295825958252, "learning_rate": 8.751632131705932e-05, "loss": 2.887722396850586, "step": 8800 }, { "epoch": 2.5007096224808403, "grad_norm": 7.128180503845215, "learning_rate": 8.750212886744252e-05, "loss": 3.0059329986572267, "step": 8810 }, { "epoch": 2.503548112404201, "grad_norm": 6.737642765045166, "learning_rate": 8.748793641782572e-05, "loss": 2.9341175079345705, "step": 8820 }, { "epoch": 2.5063866023275616, "grad_norm": 6.963868141174316, "learning_rate": 8.747374396820892e-05, "loss": 2.9516048431396484, "step": 8830 }, { "epoch": 2.5092250922509223, "grad_norm": 7.3788042068481445, "learning_rate": 8.745955151859211e-05, "loss": 2.903459930419922, "step": 8840 }, { "epoch": 2.5120635821742834, "grad_norm": 6.559618949890137, "learning_rate": 8.74453590689753e-05, "loss": 2.972796630859375, "step": 8850 }, { "epoch": 2.514902072097644, "grad_norm": 7.302895545959473, "learning_rate": 8.743116661935851e-05, "loss": 2.9096120834350585, "step": 8860 }, { "epoch": 2.5177405620210047, "grad_norm": 7.245938777923584, "learning_rate": 8.74169741697417e-05, "loss": 2.9015254974365234, "step": 8870 }, { "epoch": 2.520579051944366, "grad_norm": 7.009023189544678, "learning_rate": 8.74027817201249e-05, "loss": 2.8774452209472656, 
"step": 8880 }, { "epoch": 2.5234175418677265, "grad_norm": 7.269435882568359, "learning_rate": 8.738858927050809e-05, "loss": 2.994422721862793, "step": 8890 }, { "epoch": 2.526256031791087, "grad_norm": 7.351464748382568, "learning_rate": 8.737439682089129e-05, "loss": 2.955552101135254, "step": 8900 }, { "epoch": 2.529094521714448, "grad_norm": 7.17045259475708, "learning_rate": 8.736020437127449e-05, "loss": 2.9020107269287108, "step": 8910 }, { "epoch": 2.5319330116378085, "grad_norm": 7.309791088104248, "learning_rate": 8.734601192165767e-05, "loss": 2.9108814239501952, "step": 8920 }, { "epoch": 2.5347715015611696, "grad_norm": 7.156074047088623, "learning_rate": 8.733181947204088e-05, "loss": 2.9068252563476564, "step": 8930 }, { "epoch": 2.5376099914845303, "grad_norm": 6.827686309814453, "learning_rate": 8.731762702242407e-05, "loss": 2.9461618423461915, "step": 8940 }, { "epoch": 2.540448481407891, "grad_norm": 6.650753974914551, "learning_rate": 8.730343457280728e-05, "loss": 2.893905258178711, "step": 8950 }, { "epoch": 2.543286971331252, "grad_norm": 7.197177886962891, "learning_rate": 8.728924212319046e-05, "loss": 2.8601760864257812, "step": 8960 }, { "epoch": 2.5461254612546127, "grad_norm": 6.991879940032959, "learning_rate": 8.727504967357367e-05, "loss": 2.8951688766479493, "step": 8970 }, { "epoch": 2.5489639511779734, "grad_norm": 6.984959602355957, "learning_rate": 8.726085722395686e-05, "loss": 2.9046159744262696, "step": 8980 }, { "epoch": 2.551802441101334, "grad_norm": 7.013354778289795, "learning_rate": 8.724666477434007e-05, "loss": 2.870209503173828, "step": 8990 }, { "epoch": 2.5546409310246947, "grad_norm": 7.137901306152344, "learning_rate": 8.723247232472325e-05, "loss": 2.8926342010498045, "step": 9000 }, { "epoch": 2.5546409310246947, "eval_accuracy": 0.2108475869523749, "eval_loss": 3.160102605819702, "eval_runtime": 48.3258, "eval_samples_per_second": 325.437, "eval_steps_per_second": 5.09, "step": 9000 }, { "epoch": 
2.557479420948056, "grad_norm": 6.831748008728027, "learning_rate": 8.721827987510645e-05, "loss": 2.897312545776367, "step": 9010 }, { "epoch": 2.5603179108714165, "grad_norm": 7.344705581665039, "learning_rate": 8.720408742548965e-05, "loss": 2.9509370803833006, "step": 9020 }, { "epoch": 2.563156400794777, "grad_norm": 6.786020755767822, "learning_rate": 8.718989497587284e-05, "loss": 2.8743799209594725, "step": 9030 }, { "epoch": 2.565994890718138, "grad_norm": 6.784529685974121, "learning_rate": 8.717570252625603e-05, "loss": 2.849883270263672, "step": 9040 }, { "epoch": 2.5688333806414985, "grad_norm": 7.218177795410156, "learning_rate": 8.716151007663923e-05, "loss": 2.904035186767578, "step": 9050 }, { "epoch": 2.5716718705648596, "grad_norm": 7.197402477264404, "learning_rate": 8.714731762702243e-05, "loss": 2.895640563964844, "step": 9060 }, { "epoch": 2.5745103604882202, "grad_norm": 6.864266872406006, "learning_rate": 8.713312517740563e-05, "loss": 2.8653724670410154, "step": 9070 }, { "epoch": 2.577348850411581, "grad_norm": 7.139507293701172, "learning_rate": 8.711893272778882e-05, "loss": 2.9529796600341798, "step": 9080 }, { "epoch": 2.580187340334942, "grad_norm": 7.524747371673584, "learning_rate": 8.710474027817201e-05, "loss": 2.83605842590332, "step": 9090 }, { "epoch": 2.5830258302583027, "grad_norm": 6.888723850250244, "learning_rate": 8.709054782855522e-05, "loss": 2.935830497741699, "step": 9100 }, { "epoch": 2.5858643201816633, "grad_norm": 6.815177917480469, "learning_rate": 8.707635537893841e-05, "loss": 2.8460739135742186, "step": 9110 }, { "epoch": 2.588702810105024, "grad_norm": 7.490551471710205, "learning_rate": 8.70621629293216e-05, "loss": 2.862649345397949, "step": 9120 }, { "epoch": 2.5915413000283847, "grad_norm": 7.218399524688721, "learning_rate": 8.70479704797048e-05, "loss": 2.892140769958496, "step": 9130 }, { "epoch": 2.594379789951746, "grad_norm": 7.234127998352051, "learning_rate": 8.703377803008799e-05, "loss": 
2.8471298217773438, "step": 9140 }, { "epoch": 2.5972182798751065, "grad_norm": 7.016869068145752, "learning_rate": 8.70195855804712e-05, "loss": 2.8783924102783205, "step": 9150 }, { "epoch": 2.600056769798467, "grad_norm": 7.126256465911865, "learning_rate": 8.700539313085438e-05, "loss": 2.924124908447266, "step": 9160 }, { "epoch": 2.6028952597218282, "grad_norm": 7.3610429763793945, "learning_rate": 8.699120068123759e-05, "loss": 2.9791770935058595, "step": 9170 }, { "epoch": 2.605733749645189, "grad_norm": 6.958209991455078, "learning_rate": 8.697700823162078e-05, "loss": 2.908641052246094, "step": 9180 }, { "epoch": 2.6085722395685496, "grad_norm": 7.283463478088379, "learning_rate": 8.696281578200399e-05, "loss": 2.9162174224853517, "step": 9190 }, { "epoch": 2.61141072949191, "grad_norm": 6.896111488342285, "learning_rate": 8.694862333238717e-05, "loss": 2.9343677520751954, "step": 9200 }, { "epoch": 2.614249219415271, "grad_norm": 7.100388526916504, "learning_rate": 8.693443088277037e-05, "loss": 2.9352352142333986, "step": 9210 }, { "epoch": 2.617087709338632, "grad_norm": 7.228831768035889, "learning_rate": 8.692023843315357e-05, "loss": 2.939975357055664, "step": 9220 }, { "epoch": 2.6199261992619927, "grad_norm": 7.114589691162109, "learning_rate": 8.690604598353676e-05, "loss": 2.8778923034667967, "step": 9230 }, { "epoch": 2.6227646891853533, "grad_norm": 7.116147041320801, "learning_rate": 8.689185353391995e-05, "loss": 2.8571855545043947, "step": 9240 }, { "epoch": 2.6256031791087144, "grad_norm": 7.488713264465332, "learning_rate": 8.687766108430315e-05, "loss": 2.867233657836914, "step": 9250 }, { "epoch": 2.628441669032075, "grad_norm": 7.486736297607422, "learning_rate": 8.686346863468635e-05, "loss": 2.889133071899414, "step": 9260 }, { "epoch": 2.6312801589554358, "grad_norm": 6.9945855140686035, "learning_rate": 8.684927618506955e-05, "loss": 2.8896276473999025, "step": 9270 }, { "epoch": 2.6341186488787964, "grad_norm": 6.76193380355835, 
"learning_rate": 8.683508373545274e-05, "loss": 2.9356460571289062, "step": 9280 }, { "epoch": 2.636957138802157, "grad_norm": 6.949671745300293, "learning_rate": 8.682089128583593e-05, "loss": 2.926127624511719, "step": 9290 }, { "epoch": 2.639795628725518, "grad_norm": 6.9622368812561035, "learning_rate": 8.680669883621914e-05, "loss": 2.9115106582641603, "step": 9300 }, { "epoch": 2.642634118648879, "grad_norm": 7.22276496887207, "learning_rate": 8.679250638660233e-05, "loss": 2.8512535095214844, "step": 9310 }, { "epoch": 2.6454726085722395, "grad_norm": 6.935904026031494, "learning_rate": 8.677831393698553e-05, "loss": 2.94833984375, "step": 9320 }, { "epoch": 2.6483110984956, "grad_norm": 7.178992748260498, "learning_rate": 8.676412148736872e-05, "loss": 2.9506540298461914, "step": 9330 }, { "epoch": 2.651149588418961, "grad_norm": 7.61398458480835, "learning_rate": 8.674992903775193e-05, "loss": 2.934183692932129, "step": 9340 }, { "epoch": 2.653988078342322, "grad_norm": 7.312222480773926, "learning_rate": 8.673573658813512e-05, "loss": 2.855960464477539, "step": 9350 }, { "epoch": 2.6568265682656826, "grad_norm": 7.047881603240967, "learning_rate": 8.672154413851831e-05, "loss": 2.938027763366699, "step": 9360 }, { "epoch": 2.6596650581890433, "grad_norm": 6.873831748962402, "learning_rate": 8.67073516889015e-05, "loss": 2.8943050384521483, "step": 9370 }, { "epoch": 2.6625035481124044, "grad_norm": 6.829535484313965, "learning_rate": 8.66931592392847e-05, "loss": 2.8915191650390626, "step": 9380 }, { "epoch": 2.665342038035765, "grad_norm": 6.799126625061035, "learning_rate": 8.667896678966791e-05, "loss": 2.9031381607055664, "step": 9390 }, { "epoch": 2.6681805279591257, "grad_norm": 6.815029144287109, "learning_rate": 8.66647743400511e-05, "loss": 2.8789329528808594, "step": 9400 }, { "epoch": 2.6710190178824864, "grad_norm": 7.525160312652588, "learning_rate": 8.66505818904343e-05, "loss": 2.845674514770508, "step": 9410 }, { "epoch": 
2.673857507805847, "grad_norm": 7.041551113128662, "learning_rate": 8.663638944081749e-05, "loss": 2.848939323425293, "step": 9420 }, { "epoch": 2.676695997729208, "grad_norm": 6.919038772583008, "learning_rate": 8.66221969912007e-05, "loss": 3.001534843444824, "step": 9430 }, { "epoch": 2.679534487652569, "grad_norm": 7.323022365570068, "learning_rate": 8.660800454158389e-05, "loss": 2.953573226928711, "step": 9440 }, { "epoch": 2.6823729775759295, "grad_norm": 7.095569610595703, "learning_rate": 8.659381209196708e-05, "loss": 2.917302703857422, "step": 9450 }, { "epoch": 2.6852114674992906, "grad_norm": 7.159992218017578, "learning_rate": 8.657961964235027e-05, "loss": 2.8070098876953127, "step": 9460 }, { "epoch": 2.6880499574226513, "grad_norm": 7.048990726470947, "learning_rate": 8.656542719273347e-05, "loss": 2.9342361450195313, "step": 9470 }, { "epoch": 2.690888447346012, "grad_norm": 7.556376934051514, "learning_rate": 8.655123474311667e-05, "loss": 2.979058837890625, "step": 9480 }, { "epoch": 2.6937269372693726, "grad_norm": 7.003850936889648, "learning_rate": 8.653704229349985e-05, "loss": 2.859912300109863, "step": 9490 }, { "epoch": 2.6965654271927333, "grad_norm": 6.925973892211914, "learning_rate": 8.652284984388306e-05, "loss": 2.8788585662841797, "step": 9500 }, { "epoch": 2.6965654271927333, "eval_accuracy": 0.20690532205760795, "eval_loss": 3.1358635425567627, "eval_runtime": 45.7271, "eval_samples_per_second": 343.932, "eval_steps_per_second": 5.38, "step": 9500 }, { "epoch": 2.6994039171160944, "grad_norm": 6.93771505355835, "learning_rate": 8.650865739426625e-05, "loss": 2.8474544525146483, "step": 9510 }, { "epoch": 2.702242407039455, "grad_norm": 7.039864540100098, "learning_rate": 8.649446494464946e-05, "loss": 2.848015785217285, "step": 9520 }, { "epoch": 2.7050808969628157, "grad_norm": 6.904827117919922, "learning_rate": 8.648027249503264e-05, "loss": 2.908060836791992, "step": 9530 }, { "epoch": 2.7079193868861764, "grad_norm": 
7.158736705780029, "learning_rate": 8.646608004541585e-05, "loss": 2.887478065490723, "step": 9540 }, { "epoch": 2.710757876809537, "grad_norm": 7.328029155731201, "learning_rate": 8.645188759579904e-05, "loss": 2.9839637756347654, "step": 9550 }, { "epoch": 2.713596366732898, "grad_norm": 6.906599998474121, "learning_rate": 8.643769514618223e-05, "loss": 2.864795684814453, "step": 9560 }, { "epoch": 2.716434856656259, "grad_norm": 6.760080337524414, "learning_rate": 8.642350269656543e-05, "loss": 2.816053771972656, "step": 9570 }, { "epoch": 2.7192733465796195, "grad_norm": 7.231280326843262, "learning_rate": 8.640931024694862e-05, "loss": 2.77109317779541, "step": 9580 }, { "epoch": 2.7221118365029806, "grad_norm": 6.573685169219971, "learning_rate": 8.639511779733183e-05, "loss": 2.9457962036132814, "step": 9590 }, { "epoch": 2.7249503264263413, "grad_norm": 7.0865983963012695, "learning_rate": 8.638092534771502e-05, "loss": 2.8884151458740233, "step": 9600 }, { "epoch": 2.727788816349702, "grad_norm": 7.895025253295898, "learning_rate": 8.636673289809821e-05, "loss": 2.910095977783203, "step": 9610 }, { "epoch": 2.7306273062730626, "grad_norm": 6.737241268157959, "learning_rate": 8.635254044848141e-05, "loss": 2.867777633666992, "step": 9620 }, { "epoch": 2.7334657961964233, "grad_norm": 7.49452018737793, "learning_rate": 8.633834799886461e-05, "loss": 2.8566513061523438, "step": 9630 }, { "epoch": 2.7363042861197844, "grad_norm": 7.091218948364258, "learning_rate": 8.632415554924781e-05, "loss": 2.877689743041992, "step": 9640 }, { "epoch": 2.739142776043145, "grad_norm": 7.033830165863037, "learning_rate": 8.6309963099631e-05, "loss": 2.7954593658447267, "step": 9650 }, { "epoch": 2.7419812659665057, "grad_norm": 7.058128833770752, "learning_rate": 8.62957706500142e-05, "loss": 2.861256790161133, "step": 9660 }, { "epoch": 2.744819755889867, "grad_norm": 7.073831081390381, "learning_rate": 8.62815782003974e-05, "loss": 2.94769401550293, "step": 9670 }, { 
"epoch": 2.7476582458132275, "grad_norm": 7.236126899719238, "learning_rate": 8.62673857507806e-05, "loss": 2.8941619873046873, "step": 9680 }, { "epoch": 2.750496735736588, "grad_norm": 7.175024509429932, "learning_rate": 8.625319330116379e-05, "loss": 2.925462341308594, "step": 9690 }, { "epoch": 2.753335225659949, "grad_norm": 7.076928615570068, "learning_rate": 8.623900085154698e-05, "loss": 2.876262092590332, "step": 9700 }, { "epoch": 2.7561737155833095, "grad_norm": 6.855789661407471, "learning_rate": 8.622480840193017e-05, "loss": 2.8927669525146484, "step": 9710 }, { "epoch": 2.7590122055066706, "grad_norm": 7.376901626586914, "learning_rate": 8.621061595231338e-05, "loss": 2.885452651977539, "step": 9720 }, { "epoch": 2.7618506954300313, "grad_norm": 7.013381481170654, "learning_rate": 8.619642350269656e-05, "loss": 2.8407966613769533, "step": 9730 }, { "epoch": 2.764689185353392, "grad_norm": 7.429962635040283, "learning_rate": 8.618223105307977e-05, "loss": 2.900274085998535, "step": 9740 }, { "epoch": 2.767527675276753, "grad_norm": 6.91085147857666, "learning_rate": 8.616803860346296e-05, "loss": 2.829970932006836, "step": 9750 }, { "epoch": 2.7703661652001137, "grad_norm": 7.175634384155273, "learning_rate": 8.615384615384617e-05, "loss": 2.8495235443115234, "step": 9760 }, { "epoch": 2.7732046551234744, "grad_norm": 7.375421524047852, "learning_rate": 8.613965370422935e-05, "loss": 2.8198928833007812, "step": 9770 }, { "epoch": 2.776043145046835, "grad_norm": 7.16146183013916, "learning_rate": 8.612546125461255e-05, "loss": 2.835564613342285, "step": 9780 }, { "epoch": 2.7788816349701957, "grad_norm": 7.1489105224609375, "learning_rate": 8.611126880499575e-05, "loss": 2.9559700012207033, "step": 9790 }, { "epoch": 2.781720124893557, "grad_norm": 6.870307445526123, "learning_rate": 8.609707635537894e-05, "loss": 2.8756765365600585, "step": 9800 }, { "epoch": 2.7845586148169175, "grad_norm": 7.23966121673584, "learning_rate": 8.608288390576213e-05, 
"loss": 2.837254524230957, "step": 9810 }, { "epoch": 2.787397104740278, "grad_norm": 6.955865859985352, "learning_rate": 8.606869145614533e-05, "loss": 2.8185672760009766, "step": 9820 }, { "epoch": 2.790235594663639, "grad_norm": 7.111307144165039, "learning_rate": 8.605449900652853e-05, "loss": 2.8747316360473634, "step": 9830 }, { "epoch": 2.7930740845869995, "grad_norm": 7.152837753295898, "learning_rate": 8.604030655691173e-05, "loss": 2.9623764038085936, "step": 9840 }, { "epoch": 2.7959125745103606, "grad_norm": 6.97931432723999, "learning_rate": 8.602611410729492e-05, "loss": 2.8863882064819335, "step": 9850 }, { "epoch": 2.7987510644337212, "grad_norm": 7.255564212799072, "learning_rate": 8.601192165767811e-05, "loss": 2.7981197357177736, "step": 9860 }, { "epoch": 2.801589554357082, "grad_norm": 7.143636226654053, "learning_rate": 8.599772920806132e-05, "loss": 2.8694631576538088, "step": 9870 }, { "epoch": 2.804428044280443, "grad_norm": 6.85203742980957, "learning_rate": 8.598353675844452e-05, "loss": 2.862356758117676, "step": 9880 }, { "epoch": 2.8072665342038037, "grad_norm": 7.046689510345459, "learning_rate": 8.596934430882771e-05, "loss": 2.893017387390137, "step": 9890 }, { "epoch": 2.8101050241271643, "grad_norm": 7.167710304260254, "learning_rate": 8.59551518592109e-05, "loss": 2.7994245529174804, "step": 9900 }, { "epoch": 2.812943514050525, "grad_norm": 7.149892807006836, "learning_rate": 8.59409594095941e-05, "loss": 2.8452384948730467, "step": 9910 }, { "epoch": 2.8157820039738857, "grad_norm": 6.980096817016602, "learning_rate": 8.59267669599773e-05, "loss": 2.83143310546875, "step": 9920 }, { "epoch": 2.818620493897247, "grad_norm": 7.3806352615356445, "learning_rate": 8.591257451036048e-05, "loss": 2.9198427200317383, "step": 9930 }, { "epoch": 2.8214589838206074, "grad_norm": 7.446921348571777, "learning_rate": 8.589838206074369e-05, "loss": 2.950574493408203, "step": 9940 }, { "epoch": 2.824297473743968, "grad_norm": 
7.166808128356934, "learning_rate": 8.588418961112688e-05, "loss": 2.9893808364868164, "step": 9950 }, { "epoch": 2.827135963667329, "grad_norm": 6.900987148284912, "learning_rate": 8.586999716151009e-05, "loss": 2.921774673461914, "step": 9960 }, { "epoch": 2.82997445359069, "grad_norm": 7.172135353088379, "learning_rate": 8.585580471189327e-05, "loss": 2.9098478317260743, "step": 9970 }, { "epoch": 2.8328129435140506, "grad_norm": 7.091579437255859, "learning_rate": 8.584161226227648e-05, "loss": 2.806473159790039, "step": 9980 }, { "epoch": 2.835651433437411, "grad_norm": 6.734903812408447, "learning_rate": 8.582741981265967e-05, "loss": 2.8626941680908202, "step": 9990 }, { "epoch": 2.838489923360772, "grad_norm": 7.106771469116211, "learning_rate": 8.581322736304288e-05, "loss": 2.8666942596435545, "step": 10000 }, { "epoch": 2.838489923360772, "eval_accuracy": 0.2076683410694983, "eval_loss": 3.1104366779327393, "eval_runtime": 44.5759, "eval_samples_per_second": 352.814, "eval_steps_per_second": 5.519, "step": 10000 }, { "epoch": 2.841328413284133, "grad_norm": 6.764501571655273, "learning_rate": 8.579903491342606e-05, "loss": 2.841362953186035, "step": 10010 }, { "epoch": 2.8441669032074937, "grad_norm": 6.7665300369262695, "learning_rate": 8.578484246380926e-05, "loss": 2.8426652908325196, "step": 10020 }, { "epoch": 2.8470053931308543, "grad_norm": 7.004128932952881, "learning_rate": 8.577065001419246e-05, "loss": 2.8619688034057615, "step": 10030 }, { "epoch": 2.8498438830542154, "grad_norm": 6.910050392150879, "learning_rate": 8.575645756457565e-05, "loss": 2.9141164779663087, "step": 10040 }, { "epoch": 2.8526823729775757, "grad_norm": 6.9633378982543945, "learning_rate": 8.574226511495884e-05, "loss": 2.7853696823120115, "step": 10050 }, { "epoch": 2.8555208629009368, "grad_norm": 6.939952850341797, "learning_rate": 8.572807266534204e-05, "loss": 2.9263797760009767, "step": 10060 }, { "epoch": 2.8583593528242974, "grad_norm": 7.061680316925049, 
"learning_rate": 8.571388021572524e-05, "loss": 2.8349582672119142, "step": 10070 }, { "epoch": 2.861197842747658, "grad_norm": 7.2281575202941895, "learning_rate": 8.569968776610844e-05, "loss": 2.942378044128418, "step": 10080 }, { "epoch": 2.864036332671019, "grad_norm": 7.197792053222656, "learning_rate": 8.568549531649163e-05, "loss": 2.8293584823608398, "step": 10090 }, { "epoch": 2.86687482259438, "grad_norm": 6.5912957191467285, "learning_rate": 8.567130286687482e-05, "loss": 2.786138916015625, "step": 10100 }, { "epoch": 2.8697133125177405, "grad_norm": 7.033221244812012, "learning_rate": 8.565711041725803e-05, "loss": 2.850101089477539, "step": 10110 }, { "epoch": 2.872551802441101, "grad_norm": 7.132056713104248, "learning_rate": 8.564291796764122e-05, "loss": 2.9090105056762696, "step": 10120 }, { "epoch": 2.875390292364462, "grad_norm": 6.955399036407471, "learning_rate": 8.562872551802442e-05, "loss": 2.8867107391357423, "step": 10130 }, { "epoch": 2.878228782287823, "grad_norm": 6.928585529327393, "learning_rate": 8.561453306840761e-05, "loss": 2.8467851638793946, "step": 10140 }, { "epoch": 2.8810672722111836, "grad_norm": 6.864264488220215, "learning_rate": 8.56003406187908e-05, "loss": 2.9055124282836915, "step": 10150 }, { "epoch": 2.8839057621345443, "grad_norm": 6.780471324920654, "learning_rate": 8.558614816917401e-05, "loss": 2.827414131164551, "step": 10160 }, { "epoch": 2.8867442520579054, "grad_norm": 7.047520637512207, "learning_rate": 8.557195571955719e-05, "loss": 2.8527093887329102, "step": 10170 }, { "epoch": 2.889582741981266, "grad_norm": 7.350693225860596, "learning_rate": 8.55577632699404e-05, "loss": 2.878399658203125, "step": 10180 }, { "epoch": 2.8924212319046267, "grad_norm": 6.993347644805908, "learning_rate": 8.554357082032359e-05, "loss": 2.936553192138672, "step": 10190 }, { "epoch": 2.8952597218279874, "grad_norm": 6.856053829193115, "learning_rate": 8.55293783707068e-05, "loss": 2.8574771881103516, "step": 10200 }, { 
"epoch": 2.898098211751348, "grad_norm": 7.319886684417725, "learning_rate": 8.551518592108998e-05, "loss": 2.881344223022461, "step": 10210 }, { "epoch": 2.900936701674709, "grad_norm": 7.032646179199219, "learning_rate": 8.550099347147318e-05, "loss": 2.883824920654297, "step": 10220 }, { "epoch": 2.90377519159807, "grad_norm": 6.8368306159973145, "learning_rate": 8.548680102185638e-05, "loss": 2.8810070037841795, "step": 10230 }, { "epoch": 2.9066136815214305, "grad_norm": 7.700311660766602, "learning_rate": 8.547260857223958e-05, "loss": 2.9360305786132814, "step": 10240 }, { "epoch": 2.9094521714447916, "grad_norm": 7.541463851928711, "learning_rate": 8.545841612262276e-05, "loss": 2.8761768341064453, "step": 10250 }, { "epoch": 2.9122906613681523, "grad_norm": 7.697683811187744, "learning_rate": 8.544422367300597e-05, "loss": 2.863423538208008, "step": 10260 }, { "epoch": 2.915129151291513, "grad_norm": 6.803672790527344, "learning_rate": 8.543003122338916e-05, "loss": 2.8698253631591797, "step": 10270 }, { "epoch": 2.9179676412148736, "grad_norm": 6.899600505828857, "learning_rate": 8.541583877377236e-05, "loss": 2.798289489746094, "step": 10280 }, { "epoch": 2.9208061311382343, "grad_norm": 7.047751426696777, "learning_rate": 8.540164632415555e-05, "loss": 2.8430124282836915, "step": 10290 }, { "epoch": 2.9236446210615954, "grad_norm": 6.646446704864502, "learning_rate": 8.538745387453874e-05, "loss": 2.8090890884399413, "step": 10300 }, { "epoch": 2.926483110984956, "grad_norm": 6.883050918579102, "learning_rate": 8.537326142492195e-05, "loss": 2.8664308547973634, "step": 10310 }, { "epoch": 2.9293216009083167, "grad_norm": 7.221859931945801, "learning_rate": 8.535906897530514e-05, "loss": 2.8056163787841797, "step": 10320 }, { "epoch": 2.9321600908316774, "grad_norm": 6.957625389099121, "learning_rate": 8.534487652568834e-05, "loss": 2.796965217590332, "step": 10330 }, { "epoch": 2.934998580755038, "grad_norm": 6.612262725830078, "learning_rate": 
8.533068407607153e-05, "loss": 2.8361000061035155, "step": 10340 }, { "epoch": 2.937837070678399, "grad_norm": 7.006862640380859, "learning_rate": 8.531649162645474e-05, "loss": 2.876750373840332, "step": 10350 }, { "epoch": 2.94067556060176, "grad_norm": 6.804136753082275, "learning_rate": 8.530229917683793e-05, "loss": 2.8702503204345704, "step": 10360 }, { "epoch": 2.9435140505251205, "grad_norm": 6.790032863616943, "learning_rate": 8.528810672722112e-05, "loss": 2.8216320037841798, "step": 10370 }, { "epoch": 2.9463525404484816, "grad_norm": 6.738595485687256, "learning_rate": 8.527391427760432e-05, "loss": 2.887453079223633, "step": 10380 }, { "epoch": 2.9491910303718423, "grad_norm": 6.932900428771973, "learning_rate": 8.525972182798751e-05, "loss": 2.786690521240234, "step": 10390 }, { "epoch": 2.952029520295203, "grad_norm": 6.831051349639893, "learning_rate": 8.524552937837072e-05, "loss": 2.79431037902832, "step": 10400 }, { "epoch": 2.9548680102185636, "grad_norm": 6.9048357009887695, "learning_rate": 8.52313369287539e-05, "loss": 2.853125, "step": 10410 }, { "epoch": 2.9577065001419243, "grad_norm": 7.270448684692383, "learning_rate": 8.52171444791371e-05, "loss": 2.9099435806274414, "step": 10420 }, { "epoch": 2.9605449900652854, "grad_norm": 7.253884792327881, "learning_rate": 8.52029520295203e-05, "loss": 2.9027328491210938, "step": 10430 }, { "epoch": 2.963383479988646, "grad_norm": 7.41327428817749, "learning_rate": 8.51887595799035e-05, "loss": 2.8264232635498048, "step": 10440 }, { "epoch": 2.9662219699120067, "grad_norm": 6.854894638061523, "learning_rate": 8.517456713028668e-05, "loss": 2.8627897262573243, "step": 10450 }, { "epoch": 2.969060459835368, "grad_norm": 7.053505897521973, "learning_rate": 8.516037468066989e-05, "loss": 2.8478330612182616, "step": 10460 }, { "epoch": 2.9718989497587285, "grad_norm": 7.170361518859863, "learning_rate": 8.514618223105308e-05, "loss": 2.822709083557129, "step": 10470 }, { "epoch": 2.974737439682089, 
"grad_norm": 6.8715643882751465, "learning_rate": 8.513198978143628e-05, "loss": 2.8753557205200195, "step": 10480 }, { "epoch": 2.97757592960545, "grad_norm": 7.04720401763916, "learning_rate": 8.511779733181947e-05, "loss": 2.87115592956543, "step": 10490 }, { "epoch": 2.9804144195288105, "grad_norm": 6.888540744781494, "learning_rate": 8.510360488220266e-05, "loss": 2.877322769165039, "step": 10500 }, { "epoch": 2.9804144195288105, "eval_accuracy": 0.2142811725058816, "eval_loss": 3.0873522758483887, "eval_runtime": 49.6993, "eval_samples_per_second": 316.443, "eval_steps_per_second": 4.95, "step": 10500 }, { "epoch": 2.9832529094521716, "grad_norm": 6.973071098327637, "learning_rate": 8.508941243258587e-05, "loss": 2.9233282089233397, "step": 10510 }, { "epoch": 2.9860913993755323, "grad_norm": 6.967947006225586, "learning_rate": 8.507521998296906e-05, "loss": 2.8396493911743166, "step": 10520 }, { "epoch": 2.988929889298893, "grad_norm": 7.247684955596924, "learning_rate": 8.506102753335226e-05, "loss": 2.871199035644531, "step": 10530 }, { "epoch": 2.991768379222254, "grad_norm": 6.800557613372803, "learning_rate": 8.504683508373545e-05, "loss": 2.751311111450195, "step": 10540 }, { "epoch": 2.9946068691456147, "grad_norm": 7.253666877746582, "learning_rate": 8.503264263411866e-05, "loss": 2.894973564147949, "step": 10550 }, { "epoch": 2.9974453590689754, "grad_norm": 6.983079433441162, "learning_rate": 8.501845018450185e-05, "loss": 2.775593948364258, "step": 10560 }, { "epoch": 3.000283848992336, "grad_norm": 6.772644996643066, "learning_rate": 8.500567697984672e-05, "loss": 2.831568145751953, "step": 10570 }, { "epoch": 3.0031223389156967, "grad_norm": 7.122303009033203, "learning_rate": 8.499148453022993e-05, "loss": 2.7465160369873045, "step": 10580 }, { "epoch": 3.005960828839058, "grad_norm": 7.318820476531982, "learning_rate": 8.497729208061311e-05, "loss": 2.8629743576049806, "step": 10590 }, { "epoch": 3.0087993187624185, "grad_norm": 
6.791108131408691, "learning_rate": 8.496309963099632e-05, "loss": 2.7517465591430663, "step": 10600 }, { "epoch": 3.011637808685779, "grad_norm": 7.027871608734131, "learning_rate": 8.494890718137951e-05, "loss": 2.7652315139770507, "step": 10610 }, { "epoch": 3.01447629860914, "grad_norm": 6.8808112144470215, "learning_rate": 8.49347147317627e-05, "loss": 2.843840408325195, "step": 10620 }, { "epoch": 3.017314788532501, "grad_norm": 6.824723243713379, "learning_rate": 8.49205222821459e-05, "loss": 2.842269515991211, "step": 10630 }, { "epoch": 3.0201532784558616, "grad_norm": 7.320542335510254, "learning_rate": 8.490632983252909e-05, "loss": 2.821343994140625, "step": 10640 }, { "epoch": 3.0229917683792222, "grad_norm": 6.939826965332031, "learning_rate": 8.48921373829123e-05, "loss": 2.8674362182617186, "step": 10650 }, { "epoch": 3.025830258302583, "grad_norm": 7.066749095916748, "learning_rate": 8.487794493329549e-05, "loss": 2.891581344604492, "step": 10660 }, { "epoch": 3.028668748225944, "grad_norm": 7.0592780113220215, "learning_rate": 8.486375248367868e-05, "loss": 2.767078399658203, "step": 10670 }, { "epoch": 3.0315072381493047, "grad_norm": 7.067778587341309, "learning_rate": 8.484956003406188e-05, "loss": 2.7676313400268553, "step": 10680 }, { "epoch": 3.0343457280726653, "grad_norm": 6.924839496612549, "learning_rate": 8.483536758444508e-05, "loss": 2.827027130126953, "step": 10690 }, { "epoch": 3.037184217996026, "grad_norm": 7.184043884277344, "learning_rate": 8.482117513482828e-05, "loss": 2.777304267883301, "step": 10700 }, { "epoch": 3.0400227079193867, "grad_norm": 7.07753324508667, "learning_rate": 8.480698268521147e-05, "loss": 2.818754959106445, "step": 10710 }, { "epoch": 3.042861197842748, "grad_norm": 7.1668877601623535, "learning_rate": 8.479279023559466e-05, "loss": 2.715240478515625, "step": 10720 }, { "epoch": 3.0456996877661084, "grad_norm": 6.969992637634277, "learning_rate": 8.477859778597787e-05, "loss": 2.746232604980469, "step": 
10730 }, { "epoch": 3.048538177689469, "grad_norm": 7.245048999786377, "learning_rate": 8.476440533636106e-05, "loss": 2.8203369140625, "step": 10740 }, { "epoch": 3.0513766676128298, "grad_norm": 6.854947090148926, "learning_rate": 8.475021288674426e-05, "loss": 2.773375129699707, "step": 10750 }, { "epoch": 3.054215157536191, "grad_norm": 7.0660810470581055, "learning_rate": 8.473602043712745e-05, "loss": 2.798933982849121, "step": 10760 }, { "epoch": 3.0570536474595515, "grad_norm": 6.767135143280029, "learning_rate": 8.472182798751064e-05, "loss": 2.7924659729003904, "step": 10770 }, { "epoch": 3.059892137382912, "grad_norm": 6.793426513671875, "learning_rate": 8.470763553789385e-05, "loss": 2.78439884185791, "step": 10780 }, { "epoch": 3.062730627306273, "grad_norm": 6.880562782287598, "learning_rate": 8.469344308827703e-05, "loss": 2.841190147399902, "step": 10790 }, { "epoch": 3.065569117229634, "grad_norm": 6.753645896911621, "learning_rate": 8.467925063866024e-05, "loss": 2.7650094985961915, "step": 10800 }, { "epoch": 3.0684076071529947, "grad_norm": 7.089629173278809, "learning_rate": 8.466505818904343e-05, "loss": 2.7504156112670897, "step": 10810 }, { "epoch": 3.0712460970763553, "grad_norm": 7.456932544708252, "learning_rate": 8.465086573942664e-05, "loss": 2.7688919067382813, "step": 10820 }, { "epoch": 3.074084586999716, "grad_norm": 7.421774864196777, "learning_rate": 8.463667328980982e-05, "loss": 2.770243453979492, "step": 10830 }, { "epoch": 3.076923076923077, "grad_norm": 7.581831455230713, "learning_rate": 8.462248084019302e-05, "loss": 2.8255695343017577, "step": 10840 }, { "epoch": 3.0797615668464378, "grad_norm": 6.929940700531006, "learning_rate": 8.460828839057622e-05, "loss": 2.7949668884277346, "step": 10850 }, { "epoch": 3.0826000567697984, "grad_norm": 7.388454437255859, "learning_rate": 8.459409594095941e-05, "loss": 2.814369964599609, "step": 10860 }, { "epoch": 3.085438546693159, "grad_norm": 6.962167263031006, "learning_rate": 
8.45799034913426e-05, "loss": 2.780870056152344, "step": 10870 }, { "epoch": 3.08827703661652, "grad_norm": 7.14901065826416, "learning_rate": 8.45657110417258e-05, "loss": 2.853990936279297, "step": 10880 }, { "epoch": 3.091115526539881, "grad_norm": 6.995620250701904, "learning_rate": 8.4551518592109e-05, "loss": 2.7621265411376954, "step": 10890 }, { "epoch": 3.0939540164632415, "grad_norm": 7.089853286743164, "learning_rate": 8.45373261424922e-05, "loss": 2.8225914001464845, "step": 10900 }, { "epoch": 3.096792506386602, "grad_norm": 6.987091541290283, "learning_rate": 8.452313369287539e-05, "loss": 2.7637529373168945, "step": 10910 }, { "epoch": 3.0996309963099633, "grad_norm": 7.129463195800781, "learning_rate": 8.450894124325858e-05, "loss": 2.845949172973633, "step": 10920 }, { "epoch": 3.102469486233324, "grad_norm": 7.104438781738281, "learning_rate": 8.449474879364179e-05, "loss": 2.7549158096313477, "step": 10930 }, { "epoch": 3.1053079761566846, "grad_norm": 7.00447416305542, "learning_rate": 8.448055634402498e-05, "loss": 2.8085187911987304, "step": 10940 }, { "epoch": 3.1081464660800453, "grad_norm": 6.8701395988464355, "learning_rate": 8.446636389440818e-05, "loss": 2.807529830932617, "step": 10950 }, { "epoch": 3.1109849560034064, "grad_norm": 7.13831901550293, "learning_rate": 8.445217144479137e-05, "loss": 2.7578079223632814, "step": 10960 }, { "epoch": 3.113823445926767, "grad_norm": 6.812189102172852, "learning_rate": 8.443797899517458e-05, "loss": 2.810542106628418, "step": 10970 }, { "epoch": 3.1166619358501277, "grad_norm": 6.721221446990967, "learning_rate": 8.442378654555777e-05, "loss": 2.8150506973266602, "step": 10980 }, { "epoch": 3.1195004257734884, "grad_norm": 7.1222920417785645, "learning_rate": 8.440959409594096e-05, "loss": 2.7996749877929688, "step": 10990 }, { "epoch": 3.122338915696849, "grad_norm": 6.760138988494873, "learning_rate": 8.439540164632416e-05, "loss": 2.7571067810058594, "step": 11000 }, { "epoch": 
3.122338915696849, "eval_accuracy": 0.21478985184714186, "eval_loss": 3.0722718238830566, "eval_runtime": 46.5879, "eval_samples_per_second": 337.577, "eval_steps_per_second": 5.28, "step": 11000 }, { "epoch": 3.12517740562021, "grad_norm": 6.744966506958008, "learning_rate": 8.438120919670735e-05, "loss": 2.822458267211914, "step": 11010 }, { "epoch": 3.128015895543571, "grad_norm": 7.115482807159424, "learning_rate": 8.436701674709056e-05, "loss": 2.7198795318603515, "step": 11020 }, { "epoch": 3.1308543854669315, "grad_norm": 7.389223098754883, "learning_rate": 8.435282429747374e-05, "loss": 2.754181671142578, "step": 11030 }, { "epoch": 3.133692875390292, "grad_norm": 6.6696248054504395, "learning_rate": 8.433863184785694e-05, "loss": 2.7903167724609377, "step": 11040 }, { "epoch": 3.1365313653136533, "grad_norm": 7.005199909210205, "learning_rate": 8.432443939824014e-05, "loss": 2.7624794006347657, "step": 11050 }, { "epoch": 3.139369855237014, "grad_norm": 6.938072681427002, "learning_rate": 8.431024694862335e-05, "loss": 2.8063650131225586, "step": 11060 }, { "epoch": 3.1422083451603746, "grad_norm": 6.905735969543457, "learning_rate": 8.429605449900652e-05, "loss": 2.7585063934326173, "step": 11070 }, { "epoch": 3.1450468350837353, "grad_norm": 6.582033634185791, "learning_rate": 8.428186204938973e-05, "loss": 2.775684928894043, "step": 11080 }, { "epoch": 3.1478853250070964, "grad_norm": 6.919564723968506, "learning_rate": 8.426766959977293e-05, "loss": 2.7784929275512695, "step": 11090 }, { "epoch": 3.150723814930457, "grad_norm": 6.822707176208496, "learning_rate": 8.425347715015612e-05, "loss": 2.8397758483886717, "step": 11100 }, { "epoch": 3.1535623048538177, "grad_norm": 6.635997295379639, "learning_rate": 8.423928470053933e-05, "loss": 2.81705322265625, "step": 11110 }, { "epoch": 3.1564007947771784, "grad_norm": 6.84978723526001, "learning_rate": 8.42250922509225e-05, "loss": 2.776369094848633, "step": 11120 }, { "epoch": 3.1592392847005395, 
"grad_norm": 7.042022228240967, "learning_rate": 8.421089980130571e-05, "loss": 2.7193683624267577, "step": 11130 }, { "epoch": 3.1620777746239, "grad_norm": 7.093931674957275, "learning_rate": 8.41967073516889e-05, "loss": 2.7741912841796874, "step": 11140 }, { "epoch": 3.164916264547261, "grad_norm": 6.943727016448975, "learning_rate": 8.418251490207211e-05, "loss": 2.8009265899658202, "step": 11150 }, { "epoch": 3.1677547544706215, "grad_norm": 7.053706645965576, "learning_rate": 8.416832245245529e-05, "loss": 2.8332359313964846, "step": 11160 }, { "epoch": 3.1705932443939826, "grad_norm": 6.849763870239258, "learning_rate": 8.41541300028385e-05, "loss": 2.723653793334961, "step": 11170 }, { "epoch": 3.1734317343173433, "grad_norm": 6.618183135986328, "learning_rate": 8.413993755322169e-05, "loss": 2.754151153564453, "step": 11180 }, { "epoch": 3.176270224240704, "grad_norm": 6.904426574707031, "learning_rate": 8.412574510360489e-05, "loss": 2.7821802139282226, "step": 11190 }, { "epoch": 3.1791087141640646, "grad_norm": 7.2688727378845215, "learning_rate": 8.411155265398808e-05, "loss": 2.823019027709961, "step": 11200 }, { "epoch": 3.1819472040874253, "grad_norm": 6.931407451629639, "learning_rate": 8.409736020437127e-05, "loss": 2.7604547500610352, "step": 11210 }, { "epoch": 3.1847856940107864, "grad_norm": 6.914058208465576, "learning_rate": 8.408316775475448e-05, "loss": 2.730825424194336, "step": 11220 }, { "epoch": 3.187624183934147, "grad_norm": 7.190228462219238, "learning_rate": 8.406897530513767e-05, "loss": 2.7311302185058595, "step": 11230 }, { "epoch": 3.1904626738575077, "grad_norm": 7.619965076446533, "learning_rate": 8.405478285552087e-05, "loss": 2.8194660186767577, "step": 11240 }, { "epoch": 3.193301163780869, "grad_norm": 7.0912981033325195, "learning_rate": 8.404059040590406e-05, "loss": 2.824405288696289, "step": 11250 }, { "epoch": 3.1961396537042295, "grad_norm": 6.907508373260498, "learning_rate": 8.402639795628727e-05, "loss": 
2.7016532897949217, "step": 11260 }, { "epoch": 3.19897814362759, "grad_norm": 6.77822208404541, "learning_rate": 8.401220550667046e-05, "loss": 2.7670217514038087, "step": 11270 }, { "epoch": 3.201816633550951, "grad_norm": 6.693291664123535, "learning_rate": 8.399801305705365e-05, "loss": 2.74645881652832, "step": 11280 }, { "epoch": 3.2046551234743115, "grad_norm": 6.664721965789795, "learning_rate": 8.398382060743685e-05, "loss": 2.8169036865234376, "step": 11290 }, { "epoch": 3.2074936133976726, "grad_norm": 7.099626541137695, "learning_rate": 8.396962815782005e-05, "loss": 2.765176010131836, "step": 11300 }, { "epoch": 3.2103321033210332, "grad_norm": 7.0209455490112305, "learning_rate": 8.395543570820325e-05, "loss": 2.801409149169922, "step": 11310 }, { "epoch": 3.213170593244394, "grad_norm": 6.78694486618042, "learning_rate": 8.394124325858644e-05, "loss": 2.7908584594726564, "step": 11320 }, { "epoch": 3.2160090831677546, "grad_norm": 6.918478488922119, "learning_rate": 8.392705080896963e-05, "loss": 2.798152542114258, "step": 11330 }, { "epoch": 3.2188475730911157, "grad_norm": 7.061831474304199, "learning_rate": 8.391285835935283e-05, "loss": 2.8682159423828124, "step": 11340 }, { "epoch": 3.2216860630144764, "grad_norm": 7.264504432678223, "learning_rate": 8.389866590973603e-05, "loss": 2.755052375793457, "step": 11350 }, { "epoch": 3.224524552937837, "grad_norm": 7.110912799835205, "learning_rate": 8.388447346011921e-05, "loss": 2.7773921966552733, "step": 11360 }, { "epoch": 3.2273630428611977, "grad_norm": 6.6613688468933105, "learning_rate": 8.387028101050242e-05, "loss": 2.763319969177246, "step": 11370 }, { "epoch": 3.230201532784559, "grad_norm": 6.87830924987793, "learning_rate": 8.385608856088561e-05, "loss": 2.837119483947754, "step": 11380 }, { "epoch": 3.2330400227079195, "grad_norm": 7.0218729972839355, "learning_rate": 8.384189611126882e-05, "loss": 2.8410430908203126, "step": 11390 }, { "epoch": 3.23587851263128, "grad_norm": 
7.24088191986084, "learning_rate": 8.3827703661652e-05, "loss": 2.7886484146118162, "step": 11400 }, { "epoch": 3.238717002554641, "grad_norm": 6.999053478240967, "learning_rate": 8.38135112120352e-05, "loss": 2.7968536376953126, "step": 11410 }, { "epoch": 3.241555492478002, "grad_norm": 6.619139194488525, "learning_rate": 8.37993187624184e-05, "loss": 2.7486188888549803, "step": 11420 }, { "epoch": 3.2443939824013626, "grad_norm": 6.928242206573486, "learning_rate": 8.378512631280159e-05, "loss": 2.778615188598633, "step": 11430 }, { "epoch": 3.2472324723247232, "grad_norm": 6.795560836791992, "learning_rate": 8.377093386318479e-05, "loss": 2.801285743713379, "step": 11440 }, { "epoch": 3.250070962248084, "grad_norm": 6.9416704177856445, "learning_rate": 8.375674141356798e-05, "loss": 2.807668113708496, "step": 11450 }, { "epoch": 3.252909452171445, "grad_norm": 7.305564880371094, "learning_rate": 8.374254896395119e-05, "loss": 2.6993560791015625, "step": 11460 }, { "epoch": 3.2557479420948057, "grad_norm": 6.866343975067139, "learning_rate": 8.372835651433438e-05, "loss": 2.764289665222168, "step": 11470 }, { "epoch": 3.2585864320181663, "grad_norm": 6.912966728210449, "learning_rate": 8.371416406471757e-05, "loss": 2.6866456985473635, "step": 11480 }, { "epoch": 3.261424921941527, "grad_norm": 6.733415603637695, "learning_rate": 8.369997161510077e-05, "loss": 2.848439598083496, "step": 11490 }, { "epoch": 3.2642634118648877, "grad_norm": 7.396632671356201, "learning_rate": 8.368577916548397e-05, "loss": 2.808356285095215, "step": 11500 }, { "epoch": 3.2642634118648877, "eval_accuracy": 0.21917721116551153, "eval_loss": 3.049179792404175, "eval_runtime": 46.8992, "eval_samples_per_second": 335.336, "eval_steps_per_second": 5.245, "step": 11500 }, { "epoch": 3.2671019017882488, "grad_norm": 6.75921106338501, "learning_rate": 8.367158671586717e-05, "loss": 2.727665138244629, "step": 11510 }, { "epoch": 3.2699403917116094, "grad_norm": 7.153102874755859, 
"learning_rate": 8.365739426625036e-05, "loss": 2.7463119506835936, "step": 11520 }, { "epoch": 3.27277888163497, "grad_norm": 7.023458003997803, "learning_rate": 8.364320181663355e-05, "loss": 2.7448944091796874, "step": 11530 }, { "epoch": 3.275617371558331, "grad_norm": 7.303589820861816, "learning_rate": 8.362900936701675e-05, "loss": 2.844456100463867, "step": 11540 }, { "epoch": 3.278455861481692, "grad_norm": 6.98989200592041, "learning_rate": 8.361481691739995e-05, "loss": 2.8043535232543944, "step": 11550 }, { "epoch": 3.2812943514050525, "grad_norm": 6.751587390899658, "learning_rate": 8.360062446778313e-05, "loss": 2.7028825759887694, "step": 11560 }, { "epoch": 3.284132841328413, "grad_norm": 6.80467414855957, "learning_rate": 8.358643201816634e-05, "loss": 2.752949905395508, "step": 11570 }, { "epoch": 3.286971331251774, "grad_norm": 6.699588775634766, "learning_rate": 8.357223956854953e-05, "loss": 2.761197090148926, "step": 11580 }, { "epoch": 3.289809821175135, "grad_norm": 6.79286527633667, "learning_rate": 8.355804711893274e-05, "loss": 2.7110111236572267, "step": 11590 }, { "epoch": 3.2926483110984957, "grad_norm": 7.131494522094727, "learning_rate": 8.354385466931592e-05, "loss": 2.780889129638672, "step": 11600 }, { "epoch": 3.2954868010218563, "grad_norm": 6.996281623840332, "learning_rate": 8.352966221969913e-05, "loss": 2.757512664794922, "step": 11610 }, { "epoch": 3.298325290945217, "grad_norm": 6.931567192077637, "learning_rate": 8.351546977008232e-05, "loss": 2.7232488632202148, "step": 11620 }, { "epoch": 3.301163780868578, "grad_norm": 6.990844249725342, "learning_rate": 8.350127732046553e-05, "loss": 2.8365428924560545, "step": 11630 }, { "epoch": 3.3040022707919388, "grad_norm": 6.768869400024414, "learning_rate": 8.348708487084871e-05, "loss": 2.7290882110595702, "step": 11640 }, { "epoch": 3.3068407607152994, "grad_norm": 7.167342662811279, "learning_rate": 8.347289242123191e-05, "loss": 2.901301956176758, "step": 11650 }, { 
"epoch": 3.30967925063866, "grad_norm": 7.075326442718506, "learning_rate": 8.345869997161511e-05, "loss": 2.906041145324707, "step": 11660 }, { "epoch": 3.312517740562021, "grad_norm": 7.018557548522949, "learning_rate": 8.34445075219983e-05, "loss": 2.8144065856933596, "step": 11670 }, { "epoch": 3.315356230485382, "grad_norm": 7.0980544090271, "learning_rate": 8.34303150723815e-05, "loss": 2.7238588333129883, "step": 11680 }, { "epoch": 3.3181947204087425, "grad_norm": 7.235695838928223, "learning_rate": 8.341612262276469e-05, "loss": 2.7723947525024415, "step": 11690 }, { "epoch": 3.321033210332103, "grad_norm": 6.746390342712402, "learning_rate": 8.34019301731479e-05, "loss": 2.790948486328125, "step": 11700 }, { "epoch": 3.323871700255464, "grad_norm": 6.953052520751953, "learning_rate": 8.338773772353109e-05, "loss": 2.7512386322021483, "step": 11710 }, { "epoch": 3.326710190178825, "grad_norm": 7.0883564949035645, "learning_rate": 8.337354527391428e-05, "loss": 2.757684326171875, "step": 11720 }, { "epoch": 3.3295486801021856, "grad_norm": 6.725985050201416, "learning_rate": 8.335935282429747e-05, "loss": 2.868444633483887, "step": 11730 }, { "epoch": 3.3323871700255463, "grad_norm": 6.917257785797119, "learning_rate": 8.334516037468068e-05, "loss": 2.7769790649414063, "step": 11740 }, { "epoch": 3.3352256599489074, "grad_norm": 7.176654815673828, "learning_rate": 8.333096792506387e-05, "loss": 2.7946144104003907, "step": 11750 }, { "epoch": 3.338064149872268, "grad_norm": 6.939189434051514, "learning_rate": 8.331677547544707e-05, "loss": 2.7855592727661134, "step": 11760 }, { "epoch": 3.3409026397956287, "grad_norm": 7.193537712097168, "learning_rate": 8.330258302583026e-05, "loss": 2.743514823913574, "step": 11770 }, { "epoch": 3.3437411297189894, "grad_norm": 6.709106922149658, "learning_rate": 8.328839057621345e-05, "loss": 2.804730033874512, "step": 11780 }, { "epoch": 3.34657961964235, "grad_norm": 6.963635444641113, "learning_rate": 
8.327419812659666e-05, "loss": 2.8540023803710937, "step": 11790 }, { "epoch": 3.349418109565711, "grad_norm": 6.90917444229126, "learning_rate": 8.326000567697984e-05, "loss": 2.7415294647216797, "step": 11800 }, { "epoch": 3.352256599489072, "grad_norm": 6.926270484924316, "learning_rate": 8.324581322736305e-05, "loss": 2.7912929534912108, "step": 11810 }, { "epoch": 3.3550950894124325, "grad_norm": 6.752747058868408, "learning_rate": 8.323162077774624e-05, "loss": 2.6860097885131835, "step": 11820 }, { "epoch": 3.357933579335793, "grad_norm": 6.977079391479492, "learning_rate": 8.321742832812945e-05, "loss": 2.841211700439453, "step": 11830 }, { "epoch": 3.3607720692591543, "grad_norm": 7.515000820159912, "learning_rate": 8.320323587851263e-05, "loss": 2.7423837661743162, "step": 11840 }, { "epoch": 3.363610559182515, "grad_norm": 6.830822467803955, "learning_rate": 8.318904342889583e-05, "loss": 2.7807540893554688, "step": 11850 }, { "epoch": 3.3664490491058756, "grad_norm": 6.918494701385498, "learning_rate": 8.317485097927903e-05, "loss": 2.7312740325927733, "step": 11860 }, { "epoch": 3.3692875390292363, "grad_norm": 6.918797016143799, "learning_rate": 8.316065852966222e-05, "loss": 2.819452667236328, "step": 11870 }, { "epoch": 3.3721260289525974, "grad_norm": 6.958960056304932, "learning_rate": 8.314646608004541e-05, "loss": 2.747102737426758, "step": 11880 }, { "epoch": 3.374964518875958, "grad_norm": 6.790680885314941, "learning_rate": 8.313227363042861e-05, "loss": 2.803788185119629, "step": 11890 }, { "epoch": 3.3778030087993187, "grad_norm": 6.899458408355713, "learning_rate": 8.311808118081181e-05, "loss": 2.717177963256836, "step": 11900 }, { "epoch": 3.3806414987226794, "grad_norm": 7.097056865692139, "learning_rate": 8.310388873119501e-05, "loss": 2.780593681335449, "step": 11910 }, { "epoch": 3.3834799886460405, "grad_norm": 6.7558207511901855, "learning_rate": 8.30896962815782e-05, "loss": 2.7263391494750975, "step": 11920 }, { "epoch": 
3.386318478569401, "grad_norm": 7.0034356117248535, "learning_rate": 8.30755038319614e-05, "loss": 2.781089019775391, "step": 11930 }, { "epoch": 3.389156968492762, "grad_norm": 6.842842102050781, "learning_rate": 8.30613113823446e-05, "loss": 2.837868309020996, "step": 11940 }, { "epoch": 3.3919954584161225, "grad_norm": 7.377260208129883, "learning_rate": 8.30471189327278e-05, "loss": 2.793296241760254, "step": 11950 }, { "epoch": 3.3948339483394836, "grad_norm": 6.763322353363037, "learning_rate": 8.303292648311099e-05, "loss": 2.8103275299072266, "step": 11960 }, { "epoch": 3.3976724382628443, "grad_norm": 7.647582530975342, "learning_rate": 8.301873403349418e-05, "loss": 2.760980796813965, "step": 11970 }, { "epoch": 3.400510928186205, "grad_norm": 7.423211097717285, "learning_rate": 8.300454158387739e-05, "loss": 2.7060354232788084, "step": 11980 }, { "epoch": 3.4033494181095656, "grad_norm": 7.08057165145874, "learning_rate": 8.299034913426058e-05, "loss": 2.725861930847168, "step": 11990 }, { "epoch": 3.4061879080329263, "grad_norm": 7.083882808685303, "learning_rate": 8.297615668464377e-05, "loss": 2.729741668701172, "step": 12000 }, { "epoch": 3.4061879080329263, "eval_accuracy": 0.22261079671901826, "eval_loss": 3.034818410873413, "eval_runtime": 45.4189, "eval_samples_per_second": 346.265, "eval_steps_per_second": 5.416, "step": 12000 }, { "epoch": 3.4090263979562874, "grad_norm": 7.112361431121826, "learning_rate": 8.296196423502697e-05, "loss": 2.76119384765625, "step": 12010 }, { "epoch": 3.411864887879648, "grad_norm": 7.327687740325928, "learning_rate": 8.294777178541016e-05, "loss": 2.83326416015625, "step": 12020 }, { "epoch": 3.4147033778030087, "grad_norm": 7.0584797859191895, "learning_rate": 8.293357933579337e-05, "loss": 2.7779108047485352, "step": 12030 }, { "epoch": 3.41754186772637, "grad_norm": 7.015773773193359, "learning_rate": 8.291938688617655e-05, "loss": 2.793351745605469, "step": 12040 }, { "epoch": 3.4203803576497305, 
"grad_norm": 7.1158366203308105, "learning_rate": 8.290519443655976e-05, "loss": 2.8289968490600588, "step": 12050 }, { "epoch": 3.423218847573091, "grad_norm": 6.756847858428955, "learning_rate": 8.289100198694295e-05, "loss": 2.7247730255126954, "step": 12060 }, { "epoch": 3.426057337496452, "grad_norm": 7.256504535675049, "learning_rate": 8.287680953732616e-05, "loss": 2.7904773712158204, "step": 12070 }, { "epoch": 3.4288958274198125, "grad_norm": 7.212540626525879, "learning_rate": 8.286261708770933e-05, "loss": 2.7561840057373046, "step": 12080 }, { "epoch": 3.4317343173431736, "grad_norm": 6.916162014007568, "learning_rate": 8.284842463809254e-05, "loss": 2.760806083679199, "step": 12090 }, { "epoch": 3.4345728072665342, "grad_norm": 7.044960975646973, "learning_rate": 8.283423218847574e-05, "loss": 2.6966548919677735, "step": 12100 }, { "epoch": 3.437411297189895, "grad_norm": 7.033681869506836, "learning_rate": 8.282003973885893e-05, "loss": 2.7625356674194337, "step": 12110 }, { "epoch": 3.4402497871132556, "grad_norm": 6.930113792419434, "learning_rate": 8.280584728924212e-05, "loss": 2.716951370239258, "step": 12120 }, { "epoch": 3.4430882770366167, "grad_norm": 7.167266368865967, "learning_rate": 8.279165483962532e-05, "loss": 2.752712440490723, "step": 12130 }, { "epoch": 3.4459267669599773, "grad_norm": 6.873993873596191, "learning_rate": 8.277746239000852e-05, "loss": 2.8328222274780273, "step": 12140 }, { "epoch": 3.448765256883338, "grad_norm": 6.534854888916016, "learning_rate": 8.276326994039172e-05, "loss": 2.7250158309936525, "step": 12150 }, { "epoch": 3.4516037468066987, "grad_norm": 7.011796951293945, "learning_rate": 8.274907749077491e-05, "loss": 2.69448299407959, "step": 12160 }, { "epoch": 3.45444223673006, "grad_norm": 7.047638416290283, "learning_rate": 8.27348850411581e-05, "loss": 2.7085029602050783, "step": 12170 }, { "epoch": 3.4572807266534205, "grad_norm": 7.590725898742676, "learning_rate": 8.272069259154131e-05, "loss": 
2.790851593017578, "step": 12180 }, { "epoch": 3.460119216576781, "grad_norm": 6.914699554443359, "learning_rate": 8.27065001419245e-05, "loss": 2.7901777267456054, "step": 12190 }, { "epoch": 3.462957706500142, "grad_norm": 7.143256664276123, "learning_rate": 8.26923076923077e-05, "loss": 2.7290950775146485, "step": 12200 }, { "epoch": 3.4657961964235025, "grad_norm": 6.66080904006958, "learning_rate": 8.267811524269089e-05, "loss": 2.796368408203125, "step": 12210 }, { "epoch": 3.4686346863468636, "grad_norm": 6.684469699859619, "learning_rate": 8.26639227930741e-05, "loss": 2.7383928298950195, "step": 12220 }, { "epoch": 3.4714731762702242, "grad_norm": 6.572812080383301, "learning_rate": 8.264973034345729e-05, "loss": 2.7180273056030275, "step": 12230 }, { "epoch": 3.474311666193585, "grad_norm": 7.46055793762207, "learning_rate": 8.263553789384048e-05, "loss": 2.7581317901611326, "step": 12240 }, { "epoch": 3.477150156116946, "grad_norm": 6.952744960784912, "learning_rate": 8.262134544422368e-05, "loss": 2.7537641525268555, "step": 12250 }, { "epoch": 3.4799886460403067, "grad_norm": 7.00386381149292, "learning_rate": 8.260715299460687e-05, "loss": 2.6819942474365233, "step": 12260 }, { "epoch": 3.4828271359636673, "grad_norm": 7.139547824859619, "learning_rate": 8.259296054499008e-05, "loss": 2.817461395263672, "step": 12270 }, { "epoch": 3.485665625887028, "grad_norm": 6.904083251953125, "learning_rate": 8.257876809537326e-05, "loss": 2.696492004394531, "step": 12280 }, { "epoch": 3.4885041158103887, "grad_norm": 6.9233717918396, "learning_rate": 8.256457564575646e-05, "loss": 2.786868095397949, "step": 12290 }, { "epoch": 3.4913426057337498, "grad_norm": 7.130278587341309, "learning_rate": 8.255038319613966e-05, "loss": 2.780845069885254, "step": 12300 }, { "epoch": 3.4941810956571104, "grad_norm": 6.675077438354492, "learning_rate": 8.253619074652286e-05, "loss": 2.7943302154541017, "step": 12310 }, { "epoch": 3.497019585580471, "grad_norm": 
6.762091159820557, "learning_rate": 8.252199829690604e-05, "loss": 2.654861831665039, "step": 12320 }, { "epoch": 3.499858075503832, "grad_norm": 7.00540828704834, "learning_rate": 8.250780584728925e-05, "loss": 2.7209407806396486, "step": 12330 }, { "epoch": 3.502696565427193, "grad_norm": 6.971551418304443, "learning_rate": 8.249361339767244e-05, "loss": 2.745055389404297, "step": 12340 }, { "epoch": 3.5055350553505535, "grad_norm": 6.555863380432129, "learning_rate": 8.247942094805564e-05, "loss": 2.7291595458984377, "step": 12350 }, { "epoch": 3.508373545273914, "grad_norm": 6.670979022979736, "learning_rate": 8.246522849843883e-05, "loss": 2.7304813385009767, "step": 12360 }, { "epoch": 3.511212035197275, "grad_norm": 6.904090404510498, "learning_rate": 8.245103604882202e-05, "loss": 2.767725372314453, "step": 12370 }, { "epoch": 3.514050525120636, "grad_norm": 6.976011753082275, "learning_rate": 8.243684359920523e-05, "loss": 2.7127153396606447, "step": 12380 }, { "epoch": 3.5168890150439966, "grad_norm": 6.937860488891602, "learning_rate": 8.242265114958842e-05, "loss": 2.789493942260742, "step": 12390 }, { "epoch": 3.5197275049673573, "grad_norm": 7.21697998046875, "learning_rate": 8.240845869997162e-05, "loss": 2.7325981140136717, "step": 12400 }, { "epoch": 3.522565994890718, "grad_norm": 6.863409519195557, "learning_rate": 8.239426625035481e-05, "loss": 2.7434268951416017, "step": 12410 }, { "epoch": 3.5254044848140786, "grad_norm": 7.172671794891357, "learning_rate": 8.238007380073802e-05, "loss": 2.764494705200195, "step": 12420 }, { "epoch": 3.5282429747374398, "grad_norm": 6.832785606384277, "learning_rate": 8.236588135112121e-05, "loss": 2.7074821472167967, "step": 12430 }, { "epoch": 3.5310814646608004, "grad_norm": 7.03511905670166, "learning_rate": 8.23516889015044e-05, "loss": 2.77734489440918, "step": 12440 }, { "epoch": 3.533919954584161, "grad_norm": 6.863156318664551, "learning_rate": 8.23374964518876e-05, "loss": 2.716941070556641, "step": 
12450 }, { "epoch": 3.536758444507522, "grad_norm": 6.806926727294922, "learning_rate": 8.232330400227079e-05, "loss": 2.7084428787231447, "step": 12460 }, { "epoch": 3.539596934430883, "grad_norm": 7.154281139373779, "learning_rate": 8.2309111552654e-05, "loss": 2.7624689102172852, "step": 12470 }, { "epoch": 3.5424354243542435, "grad_norm": 7.3155293464660645, "learning_rate": 8.229491910303719e-05, "loss": 2.8118408203125, "step": 12480 }, { "epoch": 3.545273914277604, "grad_norm": 6.769201755523682, "learning_rate": 8.228072665342038e-05, "loss": 2.7109052658081056, "step": 12490 }, { "epoch": 3.548112404200965, "grad_norm": 7.296099662780762, "learning_rate": 8.226653420380358e-05, "loss": 2.786777114868164, "step": 12500 }, { "epoch": 3.548112404200965, "eval_accuracy": 0.22903287340242895, "eval_loss": 3.0100653171539307, "eval_runtime": 48.8785, "eval_samples_per_second": 321.757, "eval_steps_per_second": 5.033, "step": 12500 }, { "epoch": 3.550950894124326, "grad_norm": 7.105191230773926, "learning_rate": 8.225234175418678e-05, "loss": 2.780500602722168, "step": 12510 }, { "epoch": 3.5537893840476866, "grad_norm": 6.956323623657227, "learning_rate": 8.223814930456998e-05, "loss": 2.7951032638549806, "step": 12520 }, { "epoch": 3.5566278739710473, "grad_norm": 7.071200847625732, "learning_rate": 8.222395685495317e-05, "loss": 2.777771759033203, "step": 12530 }, { "epoch": 3.5594663638944084, "grad_norm": 7.048680782318115, "learning_rate": 8.220976440533636e-05, "loss": 2.749407958984375, "step": 12540 }, { "epoch": 3.562304853817769, "grad_norm": 6.701185703277588, "learning_rate": 8.219557195571957e-05, "loss": 2.6933944702148436, "step": 12550 }, { "epoch": 3.5651433437411297, "grad_norm": 7.2173991203308105, "learning_rate": 8.218137950610276e-05, "loss": 2.8818872451782225, "step": 12560 }, { "epoch": 3.5679818336644904, "grad_norm": 6.759720325469971, "learning_rate": 8.216718705648596e-05, "loss": 2.717042350769043, "step": 12570 }, { "epoch": 
3.570820323587851, "grad_norm": 7.122035026550293, "learning_rate": 8.215299460686915e-05, "loss": 2.7695215225219725, "step": 12580 }, { "epoch": 3.573658813511212, "grad_norm": 6.725613117218018, "learning_rate": 8.213880215725234e-05, "loss": 2.751892852783203, "step": 12590 }, { "epoch": 3.576497303434573, "grad_norm": 6.9806084632873535, "learning_rate": 8.212460970763555e-05, "loss": 2.7877649307250976, "step": 12600 }, { "epoch": 3.5793357933579335, "grad_norm": 6.780242443084717, "learning_rate": 8.211041725801873e-05, "loss": 2.694253158569336, "step": 12610 }, { "epoch": 3.5821742832812946, "grad_norm": 7.119869709014893, "learning_rate": 8.209622480840194e-05, "loss": 2.7589038848876952, "step": 12620 }, { "epoch": 3.5850127732046553, "grad_norm": 7.0696940422058105, "learning_rate": 8.208203235878513e-05, "loss": 2.766234016418457, "step": 12630 }, { "epoch": 3.587851263128016, "grad_norm": 6.998388290405273, "learning_rate": 8.206783990916834e-05, "loss": 2.7031421661376953, "step": 12640 }, { "epoch": 3.5906897530513766, "grad_norm": 7.339100360870361, "learning_rate": 8.205364745955152e-05, "loss": 2.716139221191406, "step": 12650 }, { "epoch": 3.5935282429747373, "grad_norm": 6.890745639801025, "learning_rate": 8.203945500993472e-05, "loss": 2.7367366790771483, "step": 12660 }, { "epoch": 3.5963667328980984, "grad_norm": 6.967824935913086, "learning_rate": 8.202526256031792e-05, "loss": 2.8187677383422853, "step": 12670 }, { "epoch": 3.599205222821459, "grad_norm": 7.1262688636779785, "learning_rate": 8.201107011070111e-05, "loss": 2.741409683227539, "step": 12680 }, { "epoch": 3.6020437127448197, "grad_norm": 7.080568313598633, "learning_rate": 8.19968776610843e-05, "loss": 2.7558547973632814, "step": 12690 }, { "epoch": 3.6048822026681804, "grad_norm": 7.090328693389893, "learning_rate": 8.19826852114675e-05, "loss": 2.825864791870117, "step": 12700 }, { "epoch": 3.607720692591541, "grad_norm": 6.891569137573242, "learning_rate": 
8.19684927618507e-05, "loss": 2.73282413482666, "step": 12710 }, { "epoch": 3.610559182514902, "grad_norm": 6.898216247558594, "learning_rate": 8.19543003122339e-05, "loss": 2.7842227935791017, "step": 12720 }, { "epoch": 3.613397672438263, "grad_norm": 7.223921298980713, "learning_rate": 8.194010786261709e-05, "loss": 2.7109588623046874, "step": 12730 }, { "epoch": 3.6162361623616235, "grad_norm": 6.831971168518066, "learning_rate": 8.192591541300028e-05, "loss": 2.765598678588867, "step": 12740 }, { "epoch": 3.6190746522849846, "grad_norm": 6.5385332107543945, "learning_rate": 8.191172296338349e-05, "loss": 2.720417022705078, "step": 12750 }, { "epoch": 3.6219131422083453, "grad_norm": 7.032243728637695, "learning_rate": 8.189753051376668e-05, "loss": 2.8082794189453124, "step": 12760 }, { "epoch": 3.624751632131706, "grad_norm": 7.019765853881836, "learning_rate": 8.188333806414988e-05, "loss": 2.6453670501708983, "step": 12770 }, { "epoch": 3.6275901220550666, "grad_norm": 6.872027397155762, "learning_rate": 8.186914561453307e-05, "loss": 2.7488636016845702, "step": 12780 }, { "epoch": 3.6304286119784273, "grad_norm": 7.084522724151611, "learning_rate": 8.185495316491626e-05, "loss": 2.6784448623657227, "step": 12790 }, { "epoch": 3.6332671019017884, "grad_norm": 6.954119682312012, "learning_rate": 8.184076071529947e-05, "loss": 2.8422550201416015, "step": 12800 }, { "epoch": 3.636105591825149, "grad_norm": 6.868659019470215, "learning_rate": 8.182656826568265e-05, "loss": 2.67303466796875, "step": 12810 }, { "epoch": 3.6389440817485097, "grad_norm": 7.175516128540039, "learning_rate": 8.181237581606586e-05, "loss": 2.749601364135742, "step": 12820 }, { "epoch": 3.641782571671871, "grad_norm": 6.414980411529541, "learning_rate": 8.179818336644905e-05, "loss": 2.688302993774414, "step": 12830 }, { "epoch": 3.6446210615952315, "grad_norm": 7.03912878036499, "learning_rate": 8.178399091683226e-05, "loss": 2.773314666748047, "step": 12840 }, { "epoch": 
3.647459551518592, "grad_norm": 6.903531074523926, "learning_rate": 8.176979846721544e-05, "loss": 2.695773696899414, "step": 12850 }, { "epoch": 3.650298041441953, "grad_norm": 7.066432952880859, "learning_rate": 8.175560601759864e-05, "loss": 2.8337507247924805, "step": 12860 }, { "epoch": 3.6531365313653135, "grad_norm": 6.9591898918151855, "learning_rate": 8.174141356798184e-05, "loss": 2.799860382080078, "step": 12870 }, { "epoch": 3.6559750212886746, "grad_norm": 6.88856840133667, "learning_rate": 8.172722111836504e-05, "loss": 2.707087516784668, "step": 12880 }, { "epoch": 3.6588135112120352, "grad_norm": 6.8011603355407715, "learning_rate": 8.171302866874822e-05, "loss": 2.7042701721191404, "step": 12890 }, { "epoch": 3.661652001135396, "grad_norm": 6.969066143035889, "learning_rate": 8.169883621913143e-05, "loss": 2.652304840087891, "step": 12900 }, { "epoch": 3.664490491058757, "grad_norm": 6.679305553436279, "learning_rate": 8.168464376951462e-05, "loss": 2.8038455963134767, "step": 12910 }, { "epoch": 3.6673289809821172, "grad_norm": 6.709200382232666, "learning_rate": 8.167045131989782e-05, "loss": 2.77474422454834, "step": 12920 }, { "epoch": 3.6701674709054783, "grad_norm": 7.309234619140625, "learning_rate": 8.165625887028101e-05, "loss": 2.6217456817626954, "step": 12930 }, { "epoch": 3.673005960828839, "grad_norm": 7.2731757164001465, "learning_rate": 8.16420664206642e-05, "loss": 2.752857971191406, "step": 12940 }, { "epoch": 3.6758444507521997, "grad_norm": 6.920065402984619, "learning_rate": 8.162787397104741e-05, "loss": 2.6749887466430664, "step": 12950 }, { "epoch": 3.678682940675561, "grad_norm": 7.112610816955566, "learning_rate": 8.16136815214306e-05, "loss": 2.749250602722168, "step": 12960 }, { "epoch": 3.6815214305989215, "grad_norm": 6.863938808441162, "learning_rate": 8.15994890718138e-05, "loss": 2.7456077575683593, "step": 12970 }, { "epoch": 3.684359920522282, "grad_norm": 6.975311756134033, "learning_rate": 8.158529662219699e-05, 
"loss": 2.7085735321044924, "step": 12980 }, { "epoch": 3.687198410445643, "grad_norm": 7.261634349822998, "learning_rate": 8.15711041725802e-05, "loss": 2.7585140228271485, "step": 12990 }, { "epoch": 3.6900369003690034, "grad_norm": 7.032149791717529, "learning_rate": 8.155691172296339e-05, "loss": 2.72965145111084, "step": 13000 }, { "epoch": 3.6900369003690034, "eval_accuracy": 0.2277611750492783, "eval_loss": 2.9936161041259766, "eval_runtime": 50.4867, "eval_samples_per_second": 311.507, "eval_steps_per_second": 4.873, "step": 13000 }, { "epoch": 3.6928753902923646, "grad_norm": 7.039312839508057, "learning_rate": 8.154271927334658e-05, "loss": 2.7754188537597657, "step": 13010 }, { "epoch": 3.6957138802157252, "grad_norm": 6.913600921630859, "learning_rate": 8.152852682372978e-05, "loss": 2.7534223556518556, "step": 13020 }, { "epoch": 3.698552370139086, "grad_norm": 6.759521484375, "learning_rate": 8.151433437411297e-05, "loss": 2.787588119506836, "step": 13030 }, { "epoch": 3.701390860062447, "grad_norm": 7.287664413452148, "learning_rate": 8.150014192449618e-05, "loss": 2.7900363922119142, "step": 13040 }, { "epoch": 3.7042293499858077, "grad_norm": 7.015428066253662, "learning_rate": 8.148594947487936e-05, "loss": 2.7631336212158204, "step": 13050 }, { "epoch": 3.7070678399091683, "grad_norm": 6.736515522003174, "learning_rate": 8.147175702526257e-05, "loss": 2.698942947387695, "step": 13060 }, { "epoch": 3.709906329832529, "grad_norm": 6.545927047729492, "learning_rate": 8.145756457564576e-05, "loss": 2.7388296127319336, "step": 13070 }, { "epoch": 3.7127448197558897, "grad_norm": 6.865320682525635, "learning_rate": 8.144337212602897e-05, "loss": 2.6972402572631835, "step": 13080 }, { "epoch": 3.7155833096792508, "grad_norm": 7.011258125305176, "learning_rate": 8.142917967641215e-05, "loss": 2.659718322753906, "step": 13090 }, { "epoch": 3.7184217996026114, "grad_norm": 7.578081130981445, "learning_rate": 8.141498722679535e-05, "loss": 2.68774528503418, 
"step": 13100 }, { "epoch": 3.721260289525972, "grad_norm": 6.964227676391602, "learning_rate": 8.140079477717855e-05, "loss": 2.752146530151367, "step": 13110 }, { "epoch": 3.724098779449333, "grad_norm": 6.694744110107422, "learning_rate": 8.138660232756174e-05, "loss": 2.7084278106689452, "step": 13120 }, { "epoch": 3.726937269372694, "grad_norm": 6.915827751159668, "learning_rate": 8.137240987794493e-05, "loss": 2.688674545288086, "step": 13130 }, { "epoch": 3.7297757592960545, "grad_norm": 6.973954677581787, "learning_rate": 8.135821742832813e-05, "loss": 2.6932037353515623, "step": 13140 }, { "epoch": 3.732614249219415, "grad_norm": 6.81820821762085, "learning_rate": 8.134402497871133e-05, "loss": 2.733527183532715, "step": 13150 }, { "epoch": 3.735452739142776, "grad_norm": 7.021303653717041, "learning_rate": 8.132983252909453e-05, "loss": 2.798992919921875, "step": 13160 }, { "epoch": 3.738291229066137, "grad_norm": 6.845896244049072, "learning_rate": 8.131564007947772e-05, "loss": 2.722305488586426, "step": 13170 }, { "epoch": 3.7411297189894976, "grad_norm": 7.0987091064453125, "learning_rate": 8.130144762986091e-05, "loss": 2.724735069274902, "step": 13180 }, { "epoch": 3.7439682089128583, "grad_norm": 6.54233455657959, "learning_rate": 8.128725518024412e-05, "loss": 2.747038650512695, "step": 13190 }, { "epoch": 3.746806698836219, "grad_norm": 7.1827616691589355, "learning_rate": 8.127306273062731e-05, "loss": 2.588668441772461, "step": 13200 }, { "epoch": 3.7496451887595796, "grad_norm": 6.8617024421691895, "learning_rate": 8.12588702810105e-05, "loss": 2.749438667297363, "step": 13210 }, { "epoch": 3.7524836786829407, "grad_norm": 7.138688087463379, "learning_rate": 8.12446778313937e-05, "loss": 2.6994504928588867, "step": 13220 }, { "epoch": 3.7553221686063014, "grad_norm": 6.494115352630615, "learning_rate": 8.12304853817769e-05, "loss": 2.6825456619262695, "step": 13230 }, { "epoch": 3.758160658529662, "grad_norm": 6.965185165405273, 
"learning_rate": 8.12162929321601e-05, "loss": 2.678518295288086, "step": 13240 }, { "epoch": 3.760999148453023, "grad_norm": 7.011989116668701, "learning_rate": 8.120210048254329e-05, "loss": 2.7145023345947266, "step": 13250 }, { "epoch": 3.763837638376384, "grad_norm": 6.7253899574279785, "learning_rate": 8.118790803292649e-05, "loss": 2.674075126647949, "step": 13260 }, { "epoch": 3.7666761282997445, "grad_norm": 6.883346080780029, "learning_rate": 8.117371558330968e-05, "loss": 2.7375640869140625, "step": 13270 }, { "epoch": 3.769514618223105, "grad_norm": 6.974597454071045, "learning_rate": 8.115952313369289e-05, "loss": 2.7511606216430664, "step": 13280 }, { "epoch": 3.772353108146466, "grad_norm": 6.906052112579346, "learning_rate": 8.114533068407607e-05, "loss": 2.681057929992676, "step": 13290 }, { "epoch": 3.775191598069827, "grad_norm": 7.331531047821045, "learning_rate": 8.113113823445927e-05, "loss": 2.6609859466552734, "step": 13300 }, { "epoch": 3.7780300879931876, "grad_norm": 6.8312859535217285, "learning_rate": 8.111694578484247e-05, "loss": 2.8009336471557615, "step": 13310 }, { "epoch": 3.7808685779165483, "grad_norm": 7.075687885284424, "learning_rate": 8.110275333522567e-05, "loss": 2.7623340606689455, "step": 13320 }, { "epoch": 3.7837070678399094, "grad_norm": 6.636828422546387, "learning_rate": 8.108856088560885e-05, "loss": 2.697867584228516, "step": 13330 }, { "epoch": 3.78654555776327, "grad_norm": 7.128901958465576, "learning_rate": 8.107436843599206e-05, "loss": 2.7258943557739257, "step": 13340 }, { "epoch": 3.7893840476866307, "grad_norm": 7.136878967285156, "learning_rate": 8.106017598637525e-05, "loss": 2.7622707366943358, "step": 13350 }, { "epoch": 3.7922225376099914, "grad_norm": 7.178433895111084, "learning_rate": 8.104598353675845e-05, "loss": 2.720966339111328, "step": 13360 }, { "epoch": 3.795061027533352, "grad_norm": 7.031364440917969, "learning_rate": 8.103179108714164e-05, "loss": 2.753662872314453, "step": 13370 }, { 
"epoch": 3.797899517456713, "grad_norm": 6.869141101837158, "learning_rate": 8.101759863752483e-05, "loss": 2.731683349609375, "step": 13380 }, { "epoch": 3.800738007380074, "grad_norm": 7.026280879974365, "learning_rate": 8.100340618790804e-05, "loss": 2.726068878173828, "step": 13390 }, { "epoch": 3.8035764973034345, "grad_norm": 7.033087730407715, "learning_rate": 8.098921373829123e-05, "loss": 2.6911182403564453, "step": 13400 }, { "epoch": 3.8064149872267956, "grad_norm": 7.243199348449707, "learning_rate": 8.097502128867443e-05, "loss": 2.7108606338500976, "step": 13410 }, { "epoch": 3.8092534771501563, "grad_norm": 7.007996559143066, "learning_rate": 8.096082883905762e-05, "loss": 2.7514711380004884, "step": 13420 }, { "epoch": 3.812091967073517, "grad_norm": 7.206003189086914, "learning_rate": 8.094663638944083e-05, "loss": 2.737026405334473, "step": 13430 }, { "epoch": 3.8149304569968776, "grad_norm": 6.644754886627197, "learning_rate": 8.093244393982402e-05, "loss": 2.6966209411621094, "step": 13440 }, { "epoch": 3.8177689469202383, "grad_norm": 7.208826541900635, "learning_rate": 8.091825149020721e-05, "loss": 2.766019821166992, "step": 13450 }, { "epoch": 3.8206074368435994, "grad_norm": 7.031318187713623, "learning_rate": 8.09040590405904e-05, "loss": 2.736743927001953, "step": 13460 }, { "epoch": 3.82344592676696, "grad_norm": 6.976154804229736, "learning_rate": 8.088986659097361e-05, "loss": 2.7398408889770507, "step": 13470 }, { "epoch": 3.8262844166903207, "grad_norm": 6.734996318817139, "learning_rate": 8.08756741413568e-05, "loss": 2.779471015930176, "step": 13480 }, { "epoch": 3.8291229066136814, "grad_norm": 6.994950771331787, "learning_rate": 8.086148169174e-05, "loss": 2.732699966430664, "step": 13490 }, { "epoch": 3.831961396537042, "grad_norm": 6.893470287322998, "learning_rate": 8.084728924212319e-05, "loss": 2.8168169021606446, "step": 13500 }, { "epoch": 3.831961396537042, "eval_accuracy": 0.23895212055700388, "eval_loss": 
2.9732937812805176, "eval_runtime": 51.951, "eval_samples_per_second": 302.727, "eval_steps_per_second": 4.735, "step": 13500 }, { "epoch": 3.834799886460403, "grad_norm": 7.370798587799072, "learning_rate": 8.083309679250639e-05, "loss": 2.7657180786132813, "step": 13510 }, { "epoch": 3.837638376383764, "grad_norm": 7.015325546264648, "learning_rate": 8.08189043428896e-05, "loss": 2.645700454711914, "step": 13520 }, { "epoch": 3.8404768663071245, "grad_norm": 6.820696830749512, "learning_rate": 8.080471189327277e-05, "loss": 2.7655689239501955, "step": 13530 }, { "epoch": 3.8433153562304856, "grad_norm": 6.79034423828125, "learning_rate": 8.079051944365598e-05, "loss": 2.7810543060302733, "step": 13540 }, { "epoch": 3.8461538461538463, "grad_norm": 6.7210869789123535, "learning_rate": 8.077632699403917e-05, "loss": 2.683797073364258, "step": 13550 }, { "epoch": 3.848992336077207, "grad_norm": 6.895586967468262, "learning_rate": 8.076213454442238e-05, "loss": 2.7336639404296874, "step": 13560 }, { "epoch": 3.8518308260005676, "grad_norm": 7.503326892852783, "learning_rate": 8.074794209480556e-05, "loss": 2.6867385864257813, "step": 13570 }, { "epoch": 3.8546693159239283, "grad_norm": 6.682934284210205, "learning_rate": 8.073374964518877e-05, "loss": 2.6530900955200196, "step": 13580 }, { "epoch": 3.8575078058472894, "grad_norm": 7.188476085662842, "learning_rate": 8.071955719557196e-05, "loss": 2.7397438049316407, "step": 13590 }, { "epoch": 3.86034629577065, "grad_norm": 7.213035583496094, "learning_rate": 8.070536474595515e-05, "loss": 2.6779165267944336, "step": 13600 }, { "epoch": 3.8631847856940107, "grad_norm": 6.868101596832275, "learning_rate": 8.069117229633835e-05, "loss": 2.7336524963378905, "step": 13610 }, { "epoch": 3.866023275617372, "grad_norm": 6.938918113708496, "learning_rate": 8.067697984672154e-05, "loss": 2.7016592025756836, "step": 13620 }, { "epoch": 3.8688617655407325, "grad_norm": 7.005397796630859, "learning_rate": 8.066278739710475e-05, 
"loss": 2.673484039306641, "step": 13630 }, { "epoch": 3.871700255464093, "grad_norm": 7.3531174659729, "learning_rate": 8.064859494748794e-05, "loss": 2.743886947631836, "step": 13640 }, { "epoch": 3.874538745387454, "grad_norm": 6.623589992523193, "learning_rate": 8.063440249787113e-05, "loss": 2.6897308349609377, "step": 13650 }, { "epoch": 3.8773772353108145, "grad_norm": 7.047057151794434, "learning_rate": 8.062021004825433e-05, "loss": 2.767506980895996, "step": 13660 }, { "epoch": 3.8802157252341756, "grad_norm": 7.069852828979492, "learning_rate": 8.060601759863753e-05, "loss": 2.7273792266845702, "step": 13670 }, { "epoch": 3.8830542151575362, "grad_norm": 6.888359069824219, "learning_rate": 8.059182514902073e-05, "loss": 2.6936050415039063, "step": 13680 }, { "epoch": 3.885892705080897, "grad_norm": 6.700680732727051, "learning_rate": 8.057763269940392e-05, "loss": 2.7128313064575194, "step": 13690 }, { "epoch": 3.888731195004258, "grad_norm": 6.965536117553711, "learning_rate": 8.056344024978711e-05, "loss": 2.704664421081543, "step": 13700 }, { "epoch": 3.8915696849276182, "grad_norm": 6.867786407470703, "learning_rate": 8.054924780017031e-05, "loss": 2.668539047241211, "step": 13710 }, { "epoch": 3.8944081748509793, "grad_norm": 6.6537041664123535, "learning_rate": 8.053505535055351e-05, "loss": 2.711451530456543, "step": 13720 }, { "epoch": 3.89724666477434, "grad_norm": 6.705828666687012, "learning_rate": 8.052086290093671e-05, "loss": 2.6248260498046876, "step": 13730 }, { "epoch": 3.9000851546977007, "grad_norm": 6.721962928771973, "learning_rate": 8.05066704513199e-05, "loss": 2.7893558502197267, "step": 13740 }, { "epoch": 3.902923644621062, "grad_norm": 6.854557037353516, "learning_rate": 8.04924780017031e-05, "loss": 2.673928642272949, "step": 13750 }, { "epoch": 3.9057621345444224, "grad_norm": 6.369113445281982, "learning_rate": 8.04782855520863e-05, "loss": 2.7098499298095704, "step": 13760 }, { "epoch": 3.908600624467783, "grad_norm": 
7.246514797210693, "learning_rate": 8.04640931024695e-05, "loss": 2.703859329223633, "step": 13770 }, { "epoch": 3.911439114391144, "grad_norm": 6.883810043334961, "learning_rate": 8.044990065285269e-05, "loss": 2.687179183959961, "step": 13780 }, { "epoch": 3.9142776043145044, "grad_norm": 6.913470268249512, "learning_rate": 8.043570820323588e-05, "loss": 2.7117366790771484, "step": 13790 }, { "epoch": 3.9171160942378656, "grad_norm": 7.1357421875, "learning_rate": 8.042151575361909e-05, "loss": 2.7782243728637694, "step": 13800 }, { "epoch": 3.919954584161226, "grad_norm": 7.001255989074707, "learning_rate": 8.040732330400228e-05, "loss": 2.6748641967773437, "step": 13810 }, { "epoch": 3.922793074084587, "grad_norm": 6.586246967315674, "learning_rate": 8.039313085438547e-05, "loss": 2.678822326660156, "step": 13820 }, { "epoch": 3.925631564007948, "grad_norm": 7.359455585479736, "learning_rate": 8.037893840476867e-05, "loss": 2.7562971115112305, "step": 13830 }, { "epoch": 3.9284700539313087, "grad_norm": 6.908917427062988, "learning_rate": 8.036474595515186e-05, "loss": 2.689517021179199, "step": 13840 }, { "epoch": 3.9313085438546693, "grad_norm": 6.615794658660889, "learning_rate": 8.035055350553507e-05, "loss": 2.601657485961914, "step": 13850 }, { "epoch": 3.93414703377803, "grad_norm": 6.80448579788208, "learning_rate": 8.033636105591825e-05, "loss": 2.7845748901367187, "step": 13860 }, { "epoch": 3.9369855237013907, "grad_norm": 7.167981147766113, "learning_rate": 8.032216860630145e-05, "loss": 2.75408878326416, "step": 13870 }, { "epoch": 3.9398240136247518, "grad_norm": 7.056396484375, "learning_rate": 8.030797615668465e-05, "loss": 2.673040199279785, "step": 13880 }, { "epoch": 3.9426625035481124, "grad_norm": 7.167428016662598, "learning_rate": 8.029378370706785e-05, "loss": 2.7986404418945314, "step": 13890 }, { "epoch": 3.945500993471473, "grad_norm": 6.818063735961914, "learning_rate": 8.027959125745103e-05, "loss": 2.7252803802490235, "step": 13900 
}, { "epoch": 3.948339483394834, "grad_norm": 7.023499488830566, "learning_rate": 8.026539880783424e-05, "loss": 2.7479198455810545, "step": 13910 }, { "epoch": 3.951177973318195, "grad_norm": 6.944635391235352, "learning_rate": 8.025120635821743e-05, "loss": 2.6630821228027344, "step": 13920 }, { "epoch": 3.9540164632415555, "grad_norm": 7.399410724639893, "learning_rate": 8.023701390860063e-05, "loss": 2.7929826736450196, "step": 13930 }, { "epoch": 3.956854953164916, "grad_norm": 6.954176902770996, "learning_rate": 8.022282145898382e-05, "loss": 2.678030586242676, "step": 13940 }, { "epoch": 3.959693443088277, "grad_norm": 7.47503662109375, "learning_rate": 8.020862900936701e-05, "loss": 2.75439453125, "step": 13950 }, { "epoch": 3.962531933011638, "grad_norm": 6.911853790283203, "learning_rate": 8.019443655975022e-05, "loss": 2.6681962966918946, "step": 13960 }, { "epoch": 3.9653704229349986, "grad_norm": 6.91921329498291, "learning_rate": 8.018024411013341e-05, "loss": 2.698663520812988, "step": 13970 }, { "epoch": 3.9682089128583593, "grad_norm": 7.017735481262207, "learning_rate": 8.016605166051661e-05, "loss": 2.6837711334228516, "step": 13980 }, { "epoch": 3.97104740278172, "grad_norm": 6.853330135345459, "learning_rate": 8.01518592108998e-05, "loss": 2.7480125427246094, "step": 13990 }, { "epoch": 3.9738858927050806, "grad_norm": 7.158824920654297, "learning_rate": 8.013766676128301e-05, "loss": 2.812285804748535, "step": 14000 }, { "epoch": 3.9738858927050806, "eval_accuracy": 0.23558211992115471, "eval_loss": 2.9548113346099854, "eval_runtime": 50.1928, "eval_samples_per_second": 313.332, "eval_steps_per_second": 4.901, "step": 14000 }, { "epoch": 3.9767243826284417, "grad_norm": 6.9024248123168945, "learning_rate": 8.01234743116662e-05, "loss": 2.7402130126953126, "step": 14010 }, { "epoch": 3.9795628725518024, "grad_norm": 6.9997944831848145, "learning_rate": 8.01092818620494e-05, "loss": 2.6428668975830076, "step": 14020 }, { "epoch": 
3.982401362475163, "grad_norm": 6.694930076599121, "learning_rate": 8.009508941243259e-05, "loss": 2.6594259262084963, "step": 14030 }, { "epoch": 3.985239852398524, "grad_norm": 7.059966564178467, "learning_rate": 8.008089696281578e-05, "loss": 2.667495346069336, "step": 14040 }, { "epoch": 3.988078342321885, "grad_norm": 7.045225143432617, "learning_rate": 8.006670451319899e-05, "loss": 2.6486795425415037, "step": 14050 }, { "epoch": 3.9909168322452455, "grad_norm": 6.4714741706848145, "learning_rate": 8.005251206358217e-05, "loss": 2.7454809188842773, "step": 14060 }, { "epoch": 3.993755322168606, "grad_norm": 7.582186222076416, "learning_rate": 8.003831961396538e-05, "loss": 2.7162342071533203, "step": 14070 }, { "epoch": 3.996593812091967, "grad_norm": 6.932522296905518, "learning_rate": 8.002412716434857e-05, "loss": 2.6489679336547853, "step": 14080 }, { "epoch": 3.999432302015328, "grad_norm": 6.955849647521973, "learning_rate": 8.000993471473178e-05, "loss": 2.603902244567871, "step": 14090 }, { "epoch": 4.002270791938688, "grad_norm": 6.987401008605957, "learning_rate": 7.999574226511496e-05, "loss": 2.6499805450439453, "step": 14100 }, { "epoch": 4.005109281862049, "grad_norm": 6.9220709800720215, "learning_rate": 7.998154981549816e-05, "loss": 2.668747901916504, "step": 14110 }, { "epoch": 4.00794777178541, "grad_norm": 6.806492805480957, "learning_rate": 7.996735736588136e-05, "loss": 2.547375297546387, "step": 14120 }, { "epoch": 4.010786261708771, "grad_norm": 6.796830177307129, "learning_rate": 7.995316491626456e-05, "loss": 2.5838342666625977, "step": 14130 }, { "epoch": 4.013624751632132, "grad_norm": 7.202995300292969, "learning_rate": 7.993897246664774e-05, "loss": 2.664211463928223, "step": 14140 }, { "epoch": 4.016463241555493, "grad_norm": 6.905510425567627, "learning_rate": 7.992478001703095e-05, "loss": 2.7373893737792967, "step": 14150 }, { "epoch": 4.019301731478853, "grad_norm": 7.144388675689697, "learning_rate": 7.991058756741414e-05, 
"loss": 2.587263298034668, "step": 14160 }, { "epoch": 4.022140221402214, "grad_norm": 6.997523784637451, "learning_rate": 7.989639511779734e-05, "loss": 2.693491744995117, "step": 14170 }, { "epoch": 4.024978711325574, "grad_norm": 6.732237339019775, "learning_rate": 7.988220266818053e-05, "loss": 2.6744991302490235, "step": 14180 }, { "epoch": 4.0278172012489355, "grad_norm": 7.160460948944092, "learning_rate": 7.986801021856372e-05, "loss": 2.6662216186523438, "step": 14190 }, { "epoch": 4.030655691172297, "grad_norm": 6.929082870483398, "learning_rate": 7.985381776894693e-05, "loss": 2.663201904296875, "step": 14200 }, { "epoch": 4.033494181095657, "grad_norm": 6.986283779144287, "learning_rate": 7.983962531933012e-05, "loss": 2.6507726669311524, "step": 14210 }, { "epoch": 4.036332671019018, "grad_norm": 7.432482719421387, "learning_rate": 7.982543286971332e-05, "loss": 2.6616130828857423, "step": 14220 }, { "epoch": 4.039171160942379, "grad_norm": 6.7894511222839355, "learning_rate": 7.981124042009651e-05, "loss": 2.640717697143555, "step": 14230 }, { "epoch": 4.042009650865739, "grad_norm": 7.272835731506348, "learning_rate": 7.979704797047972e-05, "loss": 2.67056884765625, "step": 14240 }, { "epoch": 4.0448481407891, "grad_norm": 6.905271530151367, "learning_rate": 7.978285552086291e-05, "loss": 2.6289506912231446, "step": 14250 }, { "epoch": 4.047686630712461, "grad_norm": 7.3863677978515625, "learning_rate": 7.97686630712461e-05, "loss": 2.6254053115844727, "step": 14260 }, { "epoch": 4.050525120635822, "grad_norm": 6.807178974151611, "learning_rate": 7.97544706216293e-05, "loss": 2.5989910125732423, "step": 14270 }, { "epoch": 4.053363610559183, "grad_norm": 6.947817325592041, "learning_rate": 7.974027817201249e-05, "loss": 2.6750659942626953, "step": 14280 }, { "epoch": 4.056202100482543, "grad_norm": 6.968000411987305, "learning_rate": 7.97260857223957e-05, "loss": 2.6995107650756838, "step": 14290 }, { "epoch": 4.059040590405904, "grad_norm": 
6.507817268371582, "learning_rate": 7.971189327277888e-05, "loss": 2.6443674087524416, "step": 14300 }, { "epoch": 4.061879080329265, "grad_norm": 7.09650182723999, "learning_rate": 7.969770082316208e-05, "loss": 2.6855548858642577, "step": 14310 }, { "epoch": 4.0647175702526255, "grad_norm": 7.0664849281311035, "learning_rate": 7.968350837354528e-05, "loss": 2.724420738220215, "step": 14320 }, { "epoch": 4.067556060175987, "grad_norm": 7.017363548278809, "learning_rate": 7.966931592392848e-05, "loss": 2.6140010833740233, "step": 14330 }, { "epoch": 4.070394550099347, "grad_norm": 6.681944370269775, "learning_rate": 7.965512347431166e-05, "loss": 2.645719528198242, "step": 14340 }, { "epoch": 4.073233040022708, "grad_norm": 6.711551666259766, "learning_rate": 7.964093102469487e-05, "loss": 2.6117904663085936, "step": 14350 }, { "epoch": 4.076071529946069, "grad_norm": 6.569639205932617, "learning_rate": 7.962673857507806e-05, "loss": 2.6111703872680665, "step": 14360 }, { "epoch": 4.078910019869429, "grad_norm": 6.561342716217041, "learning_rate": 7.961254612546126e-05, "loss": 2.636811065673828, "step": 14370 }, { "epoch": 4.08174850979279, "grad_norm": 7.116057872772217, "learning_rate": 7.959835367584445e-05, "loss": 2.670784759521484, "step": 14380 }, { "epoch": 4.084586999716151, "grad_norm": 7.280125141143799, "learning_rate": 7.958416122622764e-05, "loss": 2.6434843063354494, "step": 14390 }, { "epoch": 4.087425489639512, "grad_norm": 6.844452381134033, "learning_rate": 7.956996877661085e-05, "loss": 2.600128173828125, "step": 14400 }, { "epoch": 4.090263979562873, "grad_norm": 6.790346145629883, "learning_rate": 7.955577632699404e-05, "loss": 2.6498954772949217, "step": 14410 }, { "epoch": 4.093102469486233, "grad_norm": 7.240684509277344, "learning_rate": 7.954158387737724e-05, "loss": 2.633651351928711, "step": 14420 }, { "epoch": 4.095940959409594, "grad_norm": 7.188358783721924, "learning_rate": 7.952739142776043e-05, "loss": 2.62222843170166, "step": 
14430 }, { "epoch": 4.098779449332955, "grad_norm": 7.182208061218262, "learning_rate": 7.951319897814364e-05, "loss": 2.5918407440185547, "step": 14440 }, { "epoch": 4.1016179392563155, "grad_norm": 6.500191688537598, "learning_rate": 7.949900652852683e-05, "loss": 2.627142143249512, "step": 14450 }, { "epoch": 4.104456429179677, "grad_norm": 6.866003036499023, "learning_rate": 7.948481407891002e-05, "loss": 2.670310211181641, "step": 14460 }, { "epoch": 4.107294919103037, "grad_norm": 6.727506160736084, "learning_rate": 7.947062162929322e-05, "loss": 2.5709577560424806, "step": 14470 }, { "epoch": 4.110133409026398, "grad_norm": 6.955217361450195, "learning_rate": 7.945642917967642e-05, "loss": 2.606099319458008, "step": 14480 }, { "epoch": 4.112971898949759, "grad_norm": 6.9472270011901855, "learning_rate": 7.944223673005962e-05, "loss": 2.682626724243164, "step": 14490 }, { "epoch": 4.115810388873119, "grad_norm": 7.080307960510254, "learning_rate": 7.942804428044281e-05, "loss": 2.668178176879883, "step": 14500 }, { "epoch": 4.115810388873119, "eval_accuracy": 0.242194951357538, "eval_loss": 2.937851667404175, "eval_runtime": 49.3366, "eval_samples_per_second": 318.769, "eval_steps_per_second": 4.986, "step": 14500 }, { "epoch": 4.11864887879648, "grad_norm": 6.567597389221191, "learning_rate": 7.9413851830826e-05, "loss": 2.6147697448730467, "step": 14510 }, { "epoch": 4.1214873687198414, "grad_norm": 6.604759693145752, "learning_rate": 7.93996593812092e-05, "loss": 2.5812339782714844, "step": 14520 }, { "epoch": 4.124325858643202, "grad_norm": 7.2874555587768555, "learning_rate": 7.93854669315924e-05, "loss": 2.5965471267700195, "step": 14530 }, { "epoch": 4.127164348566563, "grad_norm": 7.055933475494385, "learning_rate": 7.937127448197558e-05, "loss": 2.6618608474731444, "step": 14540 }, { "epoch": 4.130002838489923, "grad_norm": 6.793093204498291, "learning_rate": 7.935708203235879e-05, "loss": 2.614879035949707, "step": 14550 }, { "epoch": 
4.132841328413284, "grad_norm": 6.687119960784912, "learning_rate": 7.934288958274198e-05, "loss": 2.70507926940918, "step": 14560 }, { "epoch": 4.135679818336645, "grad_norm": 6.885851860046387, "learning_rate": 7.932869713312519e-05, "loss": 2.6980615615844727, "step": 14570 }, { "epoch": 4.138518308260005, "grad_norm": 6.857464790344238, "learning_rate": 7.931592392847006e-05, "loss": 2.6115478515625, "step": 14580 }, { "epoch": 4.1413567981833665, "grad_norm": 6.854785442352295, "learning_rate": 7.930173147885326e-05, "loss": 2.6196739196777346, "step": 14590 }, { "epoch": 4.144195288106728, "grad_norm": 6.622756481170654, "learning_rate": 7.928753902923645e-05, "loss": 2.6829763412475587, "step": 14600 }, { "epoch": 4.147033778030088, "grad_norm": 7.105341911315918, "learning_rate": 7.927334657961964e-05, "loss": 2.727160835266113, "step": 14610 }, { "epoch": 4.149872267953449, "grad_norm": 7.149639129638672, "learning_rate": 7.925915413000285e-05, "loss": 2.666883087158203, "step": 14620 }, { "epoch": 4.152710757876809, "grad_norm": 7.353567123413086, "learning_rate": 7.924496168038604e-05, "loss": 2.7369190216064454, "step": 14630 }, { "epoch": 4.15554924780017, "grad_norm": 6.848292827606201, "learning_rate": 7.923076923076924e-05, "loss": 2.6623153686523438, "step": 14640 }, { "epoch": 4.158387737723531, "grad_norm": 6.862359523773193, "learning_rate": 7.921657678115243e-05, "loss": 2.704364013671875, "step": 14650 }, { "epoch": 4.161226227646892, "grad_norm": 6.842617988586426, "learning_rate": 7.920238433153562e-05, "loss": 2.673654556274414, "step": 14660 }, { "epoch": 4.164064717570253, "grad_norm": 6.625783443450928, "learning_rate": 7.918819188191883e-05, "loss": 2.7222227096557616, "step": 14670 }, { "epoch": 4.166903207493613, "grad_norm": 6.869696617126465, "learning_rate": 7.917399943230201e-05, "loss": 2.576323890686035, "step": 14680 }, { "epoch": 4.169741697416974, "grad_norm": 6.96436071395874, "learning_rate": 7.915980698268522e-05, "loss": 
2.609970474243164, "step": 14690 }, { "epoch": 4.172580187340335, "grad_norm": 6.799201488494873, "learning_rate": 7.914561453306841e-05, "loss": 2.648122787475586, "step": 14700 }, { "epoch": 4.175418677263695, "grad_norm": 6.766674518585205, "learning_rate": 7.913142208345162e-05, "loss": 2.6162527084350584, "step": 14710 }, { "epoch": 4.1782571671870565, "grad_norm": 7.355909824371338, "learning_rate": 7.91172296338348e-05, "loss": 2.6956968307495117, "step": 14720 }, { "epoch": 4.181095657110418, "grad_norm": 6.899404525756836, "learning_rate": 7.9103037184218e-05, "loss": 2.6123165130615233, "step": 14730 }, { "epoch": 4.183934147033778, "grad_norm": 7.259029388427734, "learning_rate": 7.90888447346012e-05, "loss": 2.674127960205078, "step": 14740 }, { "epoch": 4.186772636957139, "grad_norm": 7.006997108459473, "learning_rate": 7.907465228498439e-05, "loss": 2.722941017150879, "step": 14750 }, { "epoch": 4.189611126880499, "grad_norm": 7.059523105621338, "learning_rate": 7.906045983536758e-05, "loss": 2.621030807495117, "step": 14760 }, { "epoch": 4.19244961680386, "grad_norm": 6.66042423248291, "learning_rate": 7.904626738575078e-05, "loss": 2.6525272369384765, "step": 14770 }, { "epoch": 4.195288106727221, "grad_norm": 6.767333030700684, "learning_rate": 7.903207493613398e-05, "loss": 2.6463363647460936, "step": 14780 }, { "epoch": 4.198126596650582, "grad_norm": 7.105794429779053, "learning_rate": 7.901788248651718e-05, "loss": 2.6584232330322264, "step": 14790 }, { "epoch": 4.200965086573943, "grad_norm": 6.768805027008057, "learning_rate": 7.900369003690037e-05, "loss": 2.610873794555664, "step": 14800 }, { "epoch": 4.203803576497304, "grad_norm": 7.1084160804748535, "learning_rate": 7.898949758728356e-05, "loss": 2.6880971908569338, "step": 14810 }, { "epoch": 4.206642066420664, "grad_norm": 6.766432285308838, "learning_rate": 7.897530513766677e-05, "loss": 2.7581947326660154, "step": 14820 }, { "epoch": 4.209480556344025, "grad_norm": 6.967782974243164, 
"learning_rate": 7.896111268804996e-05, "loss": 2.6211898803710936, "step": 14830 }, { "epoch": 4.212319046267385, "grad_norm": 6.784961223602295, "learning_rate": 7.894692023843316e-05, "loss": 2.7212379455566404, "step": 14840 }, { "epoch": 4.2151575361907465, "grad_norm": 7.166590690612793, "learning_rate": 7.893272778881635e-05, "loss": 2.642142105102539, "step": 14850 }, { "epoch": 4.217996026114108, "grad_norm": 6.925579071044922, "learning_rate": 7.891853533919956e-05, "loss": 2.653962516784668, "step": 14860 }, { "epoch": 4.220834516037468, "grad_norm": 7.191493988037109, "learning_rate": 7.890434288958275e-05, "loss": 2.646852493286133, "step": 14870 }, { "epoch": 4.223673005960829, "grad_norm": 6.598237991333008, "learning_rate": 7.889015043996594e-05, "loss": 2.656111145019531, "step": 14880 }, { "epoch": 4.226511495884189, "grad_norm": 6.875591278076172, "learning_rate": 7.887595799034914e-05, "loss": 2.6334720611572267, "step": 14890 }, { "epoch": 4.22934998580755, "grad_norm": 6.644607067108154, "learning_rate": 7.886176554073233e-05, "loss": 2.638916778564453, "step": 14900 }, { "epoch": 4.232188475730911, "grad_norm": 6.837506294250488, "learning_rate": 7.884757309111554e-05, "loss": 2.6507139205932617, "step": 14910 }, { "epoch": 4.235026965654272, "grad_norm": 6.998847484588623, "learning_rate": 7.883338064149872e-05, "loss": 2.6269588470458984, "step": 14920 }, { "epoch": 4.237865455577633, "grad_norm": 7.174222946166992, "learning_rate": 7.881918819188192e-05, "loss": 2.6714876174926756, "step": 14930 }, { "epoch": 4.240703945500994, "grad_norm": 6.766038417816162, "learning_rate": 7.880499574226512e-05, "loss": 2.687269592285156, "step": 14940 }, { "epoch": 4.243542435424354, "grad_norm": 6.804380893707275, "learning_rate": 7.879080329264832e-05, "loss": 2.6176946640014647, "step": 14950 }, { "epoch": 4.246380925347715, "grad_norm": 7.377494812011719, "learning_rate": 7.87766108430315e-05, "loss": 2.6876338958740233, "step": 14960 }, { "epoch": 
4.249219415271075, "grad_norm": 6.767765998840332, "learning_rate": 7.876241839341471e-05, "loss": 2.6366477966308595, "step": 14970 }, { "epoch": 4.2520579051944365, "grad_norm": 6.924180507659912, "learning_rate": 7.87482259437979e-05, "loss": 2.627736282348633, "step": 14980 }, { "epoch": 4.254896395117798, "grad_norm": 7.007787227630615, "learning_rate": 7.87340334941811e-05, "loss": 2.614666557312012, "step": 14990 }, { "epoch": 4.257734885041158, "grad_norm": 6.672750949859619, "learning_rate": 7.871984104456429e-05, "loss": 2.6791812896728517, "step": 15000 }, { "epoch": 4.257734885041158, "eval_accuracy": 0.24448400839320913, "eval_loss": 2.927212715148926, "eval_runtime": 50.2479, "eval_samples_per_second": 312.988, "eval_steps_per_second": 4.896, "step": 15000 }, { "epoch": 4.260573374964519, "grad_norm": 7.091114044189453, "learning_rate": 7.870564859494748e-05, "loss": 2.6885454177856447, "step": 15010 }, { "epoch": 4.26341186488788, "grad_norm": 6.870449542999268, "learning_rate": 7.869145614533069e-05, "loss": 2.6682241439819334, "step": 15020 }, { "epoch": 4.26625035481124, "grad_norm": 6.719829082489014, "learning_rate": 7.867726369571388e-05, "loss": 2.7020511627197266, "step": 15030 }, { "epoch": 4.269088844734601, "grad_norm": 6.681188106536865, "learning_rate": 7.866307124609708e-05, "loss": 2.5308254241943358, "step": 15040 }, { "epoch": 4.271927334657962, "grad_norm": 7.108987808227539, "learning_rate": 7.864887879648027e-05, "loss": 2.6974096298217773, "step": 15050 }, { "epoch": 4.274765824581323, "grad_norm": 7.0082502365112305, "learning_rate": 7.863468634686348e-05, "loss": 2.6381378173828125, "step": 15060 }, { "epoch": 4.277604314504684, "grad_norm": 6.7517900466918945, "learning_rate": 7.862049389724667e-05, "loss": 2.56747989654541, "step": 15070 }, { "epoch": 4.280442804428044, "grad_norm": 7.475721836090088, "learning_rate": 7.860630144762986e-05, "loss": 2.636369323730469, "step": 15080 }, { "epoch": 4.283281294351405, "grad_norm": 
7.0368804931640625, "learning_rate": 7.859210899801306e-05, "loss": 2.6568300247192385, "step": 15090 }, { "epoch": 4.286119784274765, "grad_norm": 6.6697235107421875, "learning_rate": 7.857791654839625e-05, "loss": 2.5766727447509767, "step": 15100 }, { "epoch": 4.2889582741981265, "grad_norm": 6.839993953704834, "learning_rate": 7.856372409877946e-05, "loss": 2.6894351959228517, "step": 15110 }, { "epoch": 4.291796764121488, "grad_norm": 7.082107067108154, "learning_rate": 7.854953164916264e-05, "loss": 2.6409162521362304, "step": 15120 }, { "epoch": 4.294635254044848, "grad_norm": 6.676496982574463, "learning_rate": 7.853533919954584e-05, "loss": 2.687212371826172, "step": 15130 }, { "epoch": 4.297473743968209, "grad_norm": 7.576447486877441, "learning_rate": 7.852114674992904e-05, "loss": 2.609493446350098, "step": 15140 }, { "epoch": 4.30031223389157, "grad_norm": 6.809836387634277, "learning_rate": 7.850695430031224e-05, "loss": 2.5717010498046875, "step": 15150 }, { "epoch": 4.30315072381493, "grad_norm": 7.076140403747559, "learning_rate": 7.849276185069542e-05, "loss": 2.606633758544922, "step": 15160 }, { "epoch": 4.305989213738291, "grad_norm": 6.648681163787842, "learning_rate": 7.847856940107863e-05, "loss": 2.5957242965698244, "step": 15170 }, { "epoch": 4.3088277036616525, "grad_norm": 6.81707763671875, "learning_rate": 7.846437695146182e-05, "loss": 2.67376823425293, "step": 15180 }, { "epoch": 4.311666193585013, "grad_norm": 6.84800386428833, "learning_rate": 7.845018450184503e-05, "loss": 2.6457918167114256, "step": 15190 }, { "epoch": 4.314504683508374, "grad_norm": 7.00718879699707, "learning_rate": 7.843599205222821e-05, "loss": 2.672066879272461, "step": 15200 }, { "epoch": 4.317343173431734, "grad_norm": 6.848431587219238, "learning_rate": 7.842179960261142e-05, "loss": 2.6562694549560546, "step": 15210 }, { "epoch": 4.320181663355095, "grad_norm": 7.087226390838623, "learning_rate": 7.840760715299461e-05, "loss": 2.586883544921875, "step": 
15220 }, { "epoch": 4.323020153278456, "grad_norm": 7.0707688331604, "learning_rate": 7.83934147033778e-05, "loss": 2.6047283172607423, "step": 15230 }, { "epoch": 4.3258586432018165, "grad_norm": 6.799524784088135, "learning_rate": 7.8379222253761e-05, "loss": 2.694774627685547, "step": 15240 }, { "epoch": 4.328697133125178, "grad_norm": 6.670790672302246, "learning_rate": 7.836502980414419e-05, "loss": 2.6052947998046876, "step": 15250 }, { "epoch": 4.331535623048538, "grad_norm": 6.822315216064453, "learning_rate": 7.83508373545274e-05, "loss": 2.619489288330078, "step": 15260 }, { "epoch": 4.334374112971899, "grad_norm": 7.104290962219238, "learning_rate": 7.833664490491059e-05, "loss": 2.69164924621582, "step": 15270 }, { "epoch": 4.33721260289526, "grad_norm": 6.92384672164917, "learning_rate": 7.832245245529379e-05, "loss": 2.685563659667969, "step": 15280 }, { "epoch": 4.34005109281862, "grad_norm": 7.305699825286865, "learning_rate": 7.830826000567698e-05, "loss": 2.6143171310424806, "step": 15290 }, { "epoch": 4.342889582741981, "grad_norm": 7.0483317375183105, "learning_rate": 7.829406755606019e-05, "loss": 2.6781970977783205, "step": 15300 }, { "epoch": 4.3457280726653424, "grad_norm": 6.978504657745361, "learning_rate": 7.827987510644338e-05, "loss": 2.668305587768555, "step": 15310 }, { "epoch": 4.348566562588703, "grad_norm": 7.0510573387146, "learning_rate": 7.826568265682657e-05, "loss": 2.544773292541504, "step": 15320 }, { "epoch": 4.351405052512064, "grad_norm": 6.836282730102539, "learning_rate": 7.825149020720977e-05, "loss": 2.658111572265625, "step": 15330 }, { "epoch": 4.354243542435424, "grad_norm": 6.895027160644531, "learning_rate": 7.823729775759296e-05, "loss": 2.6693233489990233, "step": 15340 }, { "epoch": 4.357082032358785, "grad_norm": 6.944843292236328, "learning_rate": 7.822310530797617e-05, "loss": 2.5709932327270506, "step": 15350 }, { "epoch": 4.359920522282146, "grad_norm": 6.776124954223633, "learning_rate": 
7.820891285835935e-05, "loss": 2.649019241333008, "step": 15360 }, { "epoch": 4.362759012205506, "grad_norm": 6.911236763000488, "learning_rate": 7.819472040874255e-05, "loss": 2.643080711364746, "step": 15370 }, { "epoch": 4.3655975021288675, "grad_norm": 6.624811172485352, "learning_rate": 7.818052795912575e-05, "loss": 2.6214900970458985, "step": 15380 }, { "epoch": 4.368435992052229, "grad_norm": 6.900386333465576, "learning_rate": 7.816633550950895e-05, "loss": 2.622981834411621, "step": 15390 }, { "epoch": 4.371274481975589, "grad_norm": 6.762317180633545, "learning_rate": 7.815214305989213e-05, "loss": 2.669266128540039, "step": 15400 }, { "epoch": 4.37411297189895, "grad_norm": 6.560751438140869, "learning_rate": 7.813795061027534e-05, "loss": 2.6852291107177733, "step": 15410 }, { "epoch": 4.37695146182231, "grad_norm": 6.675786972045898, "learning_rate": 7.812375816065853e-05, "loss": 2.5590728759765624, "step": 15420 }, { "epoch": 4.379789951745671, "grad_norm": 6.59379243850708, "learning_rate": 7.810956571104173e-05, "loss": 2.6278234481811524, "step": 15430 }, { "epoch": 4.382628441669032, "grad_norm": 7.150851726531982, "learning_rate": 7.809537326142493e-05, "loss": 2.6708400726318358, "step": 15440 }, { "epoch": 4.385466931592393, "grad_norm": 7.173604965209961, "learning_rate": 7.808118081180811e-05, "loss": 2.6503969192504884, "step": 15450 }, { "epoch": 4.388305421515754, "grad_norm": 7.019467830657959, "learning_rate": 7.806698836219132e-05, "loss": 2.662550354003906, "step": 15460 }, { "epoch": 4.391143911439114, "grad_norm": 6.640613079071045, "learning_rate": 7.805279591257451e-05, "loss": 2.6818981170654297, "step": 15470 }, { "epoch": 4.393982401362475, "grad_norm": 7.2305731773376465, "learning_rate": 7.803860346295772e-05, "loss": 2.671356964111328, "step": 15480 }, { "epoch": 4.396820891285836, "grad_norm": 7.291423320770264, "learning_rate": 7.80244110133409e-05, "loss": 2.6010391235351564, "step": 15490 }, { "epoch": 
4.399659381209196, "grad_norm": 6.598076343536377, "learning_rate": 7.80102185637241e-05, "loss": 2.6116588592529295, "step": 15500 }, { "epoch": 4.399659381209196, "eval_accuracy": 0.25198702867679784, "eval_loss": 2.90396785736084, "eval_runtime": 55.6115, "eval_samples_per_second": 282.801, "eval_steps_per_second": 4.424, "step": 15500 }, { "epoch": 4.4024978711325575, "grad_norm": 6.998288631439209, "learning_rate": 7.79960261141073e-05, "loss": 2.665302276611328, "step": 15510 }, { "epoch": 4.405336361055919, "grad_norm": 6.918210506439209, "learning_rate": 7.79818336644905e-05, "loss": 2.635811614990234, "step": 15520 }, { "epoch": 4.408174850979279, "grad_norm": 6.712052345275879, "learning_rate": 7.796764121487369e-05, "loss": 2.633788299560547, "step": 15530 }, { "epoch": 4.41101334090264, "grad_norm": 6.698588848114014, "learning_rate": 7.795344876525689e-05, "loss": 2.607202911376953, "step": 15540 }, { "epoch": 4.413851830826, "grad_norm": 7.286678314208984, "learning_rate": 7.793925631564009e-05, "loss": 2.640147399902344, "step": 15550 }, { "epoch": 4.416690320749361, "grad_norm": 6.696683406829834, "learning_rate": 7.792506386602328e-05, "loss": 2.6500600814819335, "step": 15560 }, { "epoch": 4.419528810672722, "grad_norm": 6.912712097167969, "learning_rate": 7.791087141640647e-05, "loss": 2.663838005065918, "step": 15570 }, { "epoch": 4.422367300596083, "grad_norm": 6.42915153503418, "learning_rate": 7.789667896678967e-05, "loss": 2.5654701232910155, "step": 15580 }, { "epoch": 4.425205790519444, "grad_norm": 6.982687473297119, "learning_rate": 7.788248651717287e-05, "loss": 2.6424535751342773, "step": 15590 }, { "epoch": 4.428044280442805, "grad_norm": 6.983855247497559, "learning_rate": 7.786829406755607e-05, "loss": 2.630616569519043, "step": 15600 }, { "epoch": 4.430882770366165, "grad_norm": 7.113921165466309, "learning_rate": 7.785410161793926e-05, "loss": 2.645630645751953, "step": 15610 }, { "epoch": 4.433721260289526, "grad_norm": 
6.78411865234375, "learning_rate": 7.783990916832245e-05, "loss": 2.7324861526489257, "step": 15620 }, { "epoch": 4.436559750212886, "grad_norm": 6.984187126159668, "learning_rate": 7.782571671870566e-05, "loss": 2.587297821044922, "step": 15630 }, { "epoch": 4.4393982401362475, "grad_norm": 7.026543140411377, "learning_rate": 7.781152426908885e-05, "loss": 2.721133804321289, "step": 15640 }, { "epoch": 4.442236730059609, "grad_norm": 6.9104719161987305, "learning_rate": 7.779733181947205e-05, "loss": 2.651314926147461, "step": 15650 }, { "epoch": 4.445075219982969, "grad_norm": 6.573668003082275, "learning_rate": 7.778313936985524e-05, "loss": 2.605866813659668, "step": 15660 }, { "epoch": 4.44791370990633, "grad_norm": 6.798287391662598, "learning_rate": 7.776894692023843e-05, "loss": 2.5987874984741213, "step": 15670 }, { "epoch": 4.45075219982969, "grad_norm": 6.799751281738281, "learning_rate": 7.775475447062164e-05, "loss": 2.644118309020996, "step": 15680 }, { "epoch": 4.453590689753051, "grad_norm": 6.374435901641846, "learning_rate": 7.774056202100482e-05, "loss": 2.625863265991211, "step": 15690 }, { "epoch": 4.456429179676412, "grad_norm": 7.176713943481445, "learning_rate": 7.772636957138803e-05, "loss": 2.6248186111450194, "step": 15700 }, { "epoch": 4.459267669599773, "grad_norm": 6.926121711730957, "learning_rate": 7.771217712177122e-05, "loss": 2.6480644226074217, "step": 15710 }, { "epoch": 4.462106159523134, "grad_norm": 6.71670389175415, "learning_rate": 7.769798467215443e-05, "loss": 2.581893539428711, "step": 15720 }, { "epoch": 4.464944649446495, "grad_norm": 6.851128578186035, "learning_rate": 7.76837922225376e-05, "loss": 2.615426445007324, "step": 15730 }, { "epoch": 4.467783139369855, "grad_norm": 6.783841609954834, "learning_rate": 7.766959977292081e-05, "loss": 2.6571065902709963, "step": 15740 }, { "epoch": 4.470621629293216, "grad_norm": 6.739433288574219, "learning_rate": 7.7655407323304e-05, "loss": 2.5907644271850585, "step": 15750 
}, { "epoch": 4.473460119216576, "grad_norm": 6.689683437347412, "learning_rate": 7.764121487368721e-05, "loss": 2.593977355957031, "step": 15760 }, { "epoch": 4.4762986091399375, "grad_norm": 6.723315715789795, "learning_rate": 7.76270224240704e-05, "loss": 2.539154815673828, "step": 15770 }, { "epoch": 4.479137099063299, "grad_norm": 6.569394111633301, "learning_rate": 7.76128299744536e-05, "loss": 2.6554807662963866, "step": 15780 }, { "epoch": 4.481975588986659, "grad_norm": 6.73414945602417, "learning_rate": 7.75986375248368e-05, "loss": 2.630603790283203, "step": 15790 }, { "epoch": 4.48481407891002, "grad_norm": 7.381402492523193, "learning_rate": 7.758444507521999e-05, "loss": 2.683376693725586, "step": 15800 }, { "epoch": 4.487652568833381, "grad_norm": 7.062220096588135, "learning_rate": 7.757025262560318e-05, "loss": 2.574174499511719, "step": 15810 }, { "epoch": 4.490491058756741, "grad_norm": 6.7239298820495605, "learning_rate": 7.755606017598637e-05, "loss": 2.6817771911621096, "step": 15820 }, { "epoch": 4.493329548680102, "grad_norm": 7.127533435821533, "learning_rate": 7.754186772636958e-05, "loss": 2.606420707702637, "step": 15830 }, { "epoch": 4.496168038603463, "grad_norm": 7.243835926055908, "learning_rate": 7.752767527675277e-05, "loss": 2.6214626312255858, "step": 15840 }, { "epoch": 4.499006528526824, "grad_norm": 6.963974952697754, "learning_rate": 7.751348282713597e-05, "loss": 2.6045738220214845, "step": 15850 }, { "epoch": 4.501845018450185, "grad_norm": 6.899819850921631, "learning_rate": 7.749929037751916e-05, "loss": 2.5700334548950194, "step": 15860 }, { "epoch": 4.504683508373545, "grad_norm": 6.915860652923584, "learning_rate": 7.748509792790237e-05, "loss": 2.5663633346557617, "step": 15870 }, { "epoch": 4.507521998296906, "grad_norm": 6.750008583068848, "learning_rate": 7.747090547828556e-05, "loss": 2.697314643859863, "step": 15880 }, { "epoch": 4.510360488220266, "grad_norm": 6.898182392120361, "learning_rate": 
7.745671302866875e-05, "loss": 2.605071258544922, "step": 15890 }, { "epoch": 4.5131989781436275, "grad_norm": 6.760778427124023, "learning_rate": 7.744252057905195e-05, "loss": 2.6446649551391603, "step": 15900 }, { "epoch": 4.516037468066989, "grad_norm": 6.845993518829346, "learning_rate": 7.742832812943514e-05, "loss": 2.6456085205078126, "step": 15910 }, { "epoch": 4.518875957990349, "grad_norm": 7.144005298614502, "learning_rate": 7.741413567981835e-05, "loss": 2.5970169067382813, "step": 15920 }, { "epoch": 4.52171444791371, "grad_norm": 6.994803428649902, "learning_rate": 7.739994323020153e-05, "loss": 2.648772430419922, "step": 15930 }, { "epoch": 4.524552937837071, "grad_norm": 7.110809803009033, "learning_rate": 7.738575078058473e-05, "loss": 2.591930961608887, "step": 15940 }, { "epoch": 4.527391427760431, "grad_norm": 7.221759796142578, "learning_rate": 7.737155833096793e-05, "loss": 2.6301158905029296, "step": 15950 }, { "epoch": 4.530229917683792, "grad_norm": 6.785404682159424, "learning_rate": 7.735736588135113e-05, "loss": 2.7242782592773436, "step": 15960 }, { "epoch": 4.5330684076071535, "grad_norm": 6.768318176269531, "learning_rate": 7.734317343173431e-05, "loss": 2.690639305114746, "step": 15970 }, { "epoch": 4.535906897530514, "grad_norm": 6.681690216064453, "learning_rate": 7.732898098211752e-05, "loss": 2.6861175537109374, "step": 15980 }, { "epoch": 4.538745387453875, "grad_norm": 6.815548419952393, "learning_rate": 7.731478853250071e-05, "loss": 2.641097640991211, "step": 15990 }, { "epoch": 4.541583877377235, "grad_norm": 6.8047332763671875, "learning_rate": 7.730059608288391e-05, "loss": 2.5919988632202147, "step": 16000 }, { "epoch": 4.541583877377235, "eval_accuracy": 0.24995231131175685, "eval_loss": 2.893839120864868, "eval_runtime": 51.4844, "eval_samples_per_second": 305.471, "eval_steps_per_second": 4.778, "step": 16000 }, { "epoch": 4.544422367300596, "grad_norm": 7.209490776062012, "learning_rate": 7.72864036332671e-05, 
"loss": 2.5944210052490235, "step": 16010 }, { "epoch": 4.547260857223957, "grad_norm": 6.918612480163574, "learning_rate": 7.72722111836503e-05, "loss": 2.623556900024414, "step": 16020 }, { "epoch": 4.5500993471473175, "grad_norm": 7.042250156402588, "learning_rate": 7.72580187340335e-05, "loss": 2.6380916595458985, "step": 16030 }, { "epoch": 4.552937837070679, "grad_norm": 7.028982639312744, "learning_rate": 7.72438262844167e-05, "loss": 2.6503778457641602, "step": 16040 }, { "epoch": 4.555776326994039, "grad_norm": 6.950092792510986, "learning_rate": 7.722963383479989e-05, "loss": 2.619226837158203, "step": 16050 }, { "epoch": 4.5586148169174, "grad_norm": 6.915463924407959, "learning_rate": 7.721544138518308e-05, "loss": 2.614703369140625, "step": 16060 }, { "epoch": 4.561453306840761, "grad_norm": 7.063460350036621, "learning_rate": 7.720124893556629e-05, "loss": 2.6254892349243164, "step": 16070 }, { "epoch": 4.564291796764121, "grad_norm": 7.04991340637207, "learning_rate": 7.718705648594948e-05, "loss": 2.682369041442871, "step": 16080 }, { "epoch": 4.567130286687482, "grad_norm": 6.955934047698975, "learning_rate": 7.717286403633267e-05, "loss": 2.5984962463378904, "step": 16090 }, { "epoch": 4.5699687766108426, "grad_norm": 6.612507343292236, "learning_rate": 7.715867158671587e-05, "loss": 2.679393005371094, "step": 16100 }, { "epoch": 4.572807266534204, "grad_norm": 7.213808059692383, "learning_rate": 7.714447913709907e-05, "loss": 2.6224061965942385, "step": 16110 }, { "epoch": 4.575645756457565, "grad_norm": 6.640975475311279, "learning_rate": 7.713028668748227e-05, "loss": 2.6339132308959963, "step": 16120 }, { "epoch": 4.578484246380925, "grad_norm": 6.620357513427734, "learning_rate": 7.711609423786546e-05, "loss": 2.541848564147949, "step": 16130 }, { "epoch": 4.581322736304286, "grad_norm": 7.480855941772461, "learning_rate": 7.710190178824865e-05, "loss": 2.6795446395874025, "step": 16140 }, { "epoch": 4.584161226227647, "grad_norm": 
6.799022674560547, "learning_rate": 7.708770933863185e-05, "loss": 2.6354997634887694, "step": 16150 }, { "epoch": 4.586999716151007, "grad_norm": 6.642535209655762, "learning_rate": 7.707351688901505e-05, "loss": 2.630751609802246, "step": 16160 }, { "epoch": 4.5898382060743685, "grad_norm": 6.929077625274658, "learning_rate": 7.705932443939823e-05, "loss": 2.621616554260254, "step": 16170 }, { "epoch": 4.59267669599773, "grad_norm": 7.079736232757568, "learning_rate": 7.704513198978144e-05, "loss": 2.601421356201172, "step": 16180 }, { "epoch": 4.59551518592109, "grad_norm": 6.805049419403076, "learning_rate": 7.703093954016463e-05, "loss": 2.596060371398926, "step": 16190 }, { "epoch": 4.598353675844451, "grad_norm": 6.799778461456299, "learning_rate": 7.701674709054784e-05, "loss": 2.685467529296875, "step": 16200 }, { "epoch": 4.601192165767811, "grad_norm": 6.643914699554443, "learning_rate": 7.700255464093102e-05, "loss": 2.478069877624512, "step": 16210 }, { "epoch": 4.604030655691172, "grad_norm": 6.689704418182373, "learning_rate": 7.698836219131423e-05, "loss": 2.601495552062988, "step": 16220 }, { "epoch": 4.606869145614533, "grad_norm": 7.3060832023620605, "learning_rate": 7.697416974169742e-05, "loss": 2.6457588195800783, "step": 16230 }, { "epoch": 4.609707635537894, "grad_norm": 6.97845458984375, "learning_rate": 7.695997729208061e-05, "loss": 2.601262092590332, "step": 16240 }, { "epoch": 4.612546125461255, "grad_norm": 6.508108615875244, "learning_rate": 7.694578484246381e-05, "loss": 2.605082893371582, "step": 16250 }, { "epoch": 4.615384615384615, "grad_norm": 6.765830993652344, "learning_rate": 7.6931592392847e-05, "loss": 2.5954191207885744, "step": 16260 }, { "epoch": 4.618223105307976, "grad_norm": 6.6309075355529785, "learning_rate": 7.691739994323021e-05, "loss": 2.581851005554199, "step": 16270 }, { "epoch": 4.621061595231337, "grad_norm": 7.091796398162842, "learning_rate": 7.69032074936134e-05, "loss": 2.7174333572387694, "step": 16280 
}, { "epoch": 4.623900085154697, "grad_norm": 6.718441963195801, "learning_rate": 7.68890150439966e-05, "loss": 2.5730335235595705, "step": 16290 }, { "epoch": 4.6267385750780585, "grad_norm": 6.823093891143799, "learning_rate": 7.687482259437979e-05, "loss": 2.662433624267578, "step": 16300 }, { "epoch": 4.62957706500142, "grad_norm": 6.764198303222656, "learning_rate": 7.6860630144763e-05, "loss": 2.6177703857421877, "step": 16310 }, { "epoch": 4.63241555492478, "grad_norm": 6.802258491516113, "learning_rate": 7.684643769514619e-05, "loss": 2.578572463989258, "step": 16320 }, { "epoch": 4.635254044848141, "grad_norm": 7.005410194396973, "learning_rate": 7.683224524552938e-05, "loss": 2.650709533691406, "step": 16330 }, { "epoch": 4.638092534771501, "grad_norm": 7.065427303314209, "learning_rate": 7.681805279591258e-05, "loss": 2.576825714111328, "step": 16340 }, { "epoch": 4.640931024694862, "grad_norm": 6.740931987762451, "learning_rate": 7.680386034629577e-05, "loss": 2.6459772109985353, "step": 16350 }, { "epoch": 4.643769514618223, "grad_norm": 6.6725568771362305, "learning_rate": 7.678966789667898e-05, "loss": 2.6227550506591797, "step": 16360 }, { "epoch": 4.646608004541584, "grad_norm": 6.734790325164795, "learning_rate": 7.677547544706216e-05, "loss": 2.632155418395996, "step": 16370 }, { "epoch": 4.649446494464945, "grad_norm": 6.708893775939941, "learning_rate": 7.676128299744536e-05, "loss": 2.618202972412109, "step": 16380 }, { "epoch": 4.652284984388306, "grad_norm": 6.9259185791015625, "learning_rate": 7.674709054782856e-05, "loss": 2.675699234008789, "step": 16390 }, { "epoch": 4.655123474311666, "grad_norm": 6.81010103225708, "learning_rate": 7.673289809821176e-05, "loss": 2.555813789367676, "step": 16400 }, { "epoch": 4.657961964235027, "grad_norm": 6.812775611877441, "learning_rate": 7.671870564859494e-05, "loss": 2.7085859298706056, "step": 16410 }, { "epoch": 4.660800454158387, "grad_norm": 6.27679967880249, "learning_rate": 
7.670451319897815e-05, "loss": 2.567369270324707, "step": 16420 }, { "epoch": 4.6636389440817485, "grad_norm": 6.968793869018555, "learning_rate": 7.669032074936134e-05, "loss": 2.568222999572754, "step": 16430 }, { "epoch": 4.66647743400511, "grad_norm": 7.075534820556641, "learning_rate": 7.667612829974455e-05, "loss": 2.6075647354125975, "step": 16440 }, { "epoch": 4.66931592392847, "grad_norm": 7.166909694671631, "learning_rate": 7.666193585012773e-05, "loss": 2.6449832916259766, "step": 16450 }, { "epoch": 4.672154413851831, "grad_norm": 7.408174514770508, "learning_rate": 7.664774340051094e-05, "loss": 2.7311279296875, "step": 16460 }, { "epoch": 4.674992903775191, "grad_norm": 7.023828983306885, "learning_rate": 7.663355095089413e-05, "loss": 2.5867395401000977, "step": 16470 }, { "epoch": 4.677831393698552, "grad_norm": 6.82619571685791, "learning_rate": 7.661935850127732e-05, "loss": 2.6158056259155273, "step": 16480 }, { "epoch": 4.680669883621913, "grad_norm": 6.981490135192871, "learning_rate": 7.660516605166052e-05, "loss": 2.7438583374023438, "step": 16490 }, { "epoch": 4.683508373545274, "grad_norm": 6.999218940734863, "learning_rate": 7.659097360204371e-05, "loss": 2.6262191772460937, "step": 16500 }, { "epoch": 4.683508373545274, "eval_accuracy": 0.2560564634068799, "eval_loss": 2.8753433227539062, "eval_runtime": 53.7172, "eval_samples_per_second": 292.774, "eval_steps_per_second": 4.58, "step": 16500 }, { "epoch": 4.686346863468635, "grad_norm": 6.824333190917969, "learning_rate": 7.657678115242692e-05, "loss": 2.648883819580078, "step": 16510 }, { "epoch": 4.689185353391996, "grad_norm": 6.711226463317871, "learning_rate": 7.656258870281011e-05, "loss": 2.6191579818725588, "step": 16520 }, { "epoch": 4.692023843315356, "grad_norm": 7.311325550079346, "learning_rate": 7.65483962531933e-05, "loss": 2.69639892578125, "step": 16530 }, { "epoch": 4.694862333238717, "grad_norm": 6.8277058601379395, "learning_rate": 7.65342038035765e-05, "loss": 
2.5991886138916014, "step": 16540 }, { "epoch": 4.697700823162078, "grad_norm": 7.013248920440674, "learning_rate": 7.65200113539597e-05, "loss": 2.5676748275756838, "step": 16550 }, { "epoch": 4.7005393130854385, "grad_norm": 6.839751243591309, "learning_rate": 7.65058189043429e-05, "loss": 2.656671142578125, "step": 16560 }, { "epoch": 4.7033778030088, "grad_norm": 6.6564106941223145, "learning_rate": 7.649162645472609e-05, "loss": 2.6586122512817383, "step": 16570 }, { "epoch": 4.70621629293216, "grad_norm": 6.822135925292969, "learning_rate": 7.647885325007098e-05, "loss": 2.5700305938720702, "step": 16580 }, { "epoch": 4.709054782855521, "grad_norm": 6.951724529266357, "learning_rate": 7.646466080045416e-05, "loss": 2.677645683288574, "step": 16590 }, { "epoch": 4.711893272778882, "grad_norm": 6.924982070922852, "learning_rate": 7.645046835083736e-05, "loss": 2.6440189361572264, "step": 16600 }, { "epoch": 4.714731762702242, "grad_norm": 7.017765045166016, "learning_rate": 7.643627590122056e-05, "loss": 2.6123325347900392, "step": 16610 }, { "epoch": 4.717570252625603, "grad_norm": 6.892746448516846, "learning_rate": 7.642208345160375e-05, "loss": 2.626358222961426, "step": 16620 }, { "epoch": 4.720408742548964, "grad_norm": 7.2010674476623535, "learning_rate": 7.640789100198694e-05, "loss": 2.705276298522949, "step": 16630 }, { "epoch": 4.723247232472325, "grad_norm": 6.443416595458984, "learning_rate": 7.639369855237014e-05, "loss": 2.601634216308594, "step": 16640 }, { "epoch": 4.726085722395686, "grad_norm": 7.094481468200684, "learning_rate": 7.637950610275334e-05, "loss": 2.615681457519531, "step": 16650 }, { "epoch": 4.728924212319046, "grad_norm": 6.753576755523682, "learning_rate": 7.636531365313654e-05, "loss": 2.609560966491699, "step": 16660 }, { "epoch": 4.731762702242407, "grad_norm": 6.931554794311523, "learning_rate": 7.635112120351973e-05, "loss": 2.6096033096313476, "step": 16670 }, { "epoch": 4.734601192165767, "grad_norm": 
7.346916675567627, "learning_rate": 7.633692875390292e-05, "loss": 2.6277856826782227, "step": 16680 }, { "epoch": 4.7374396820891285, "grad_norm": 6.79700231552124, "learning_rate": 7.632273630428613e-05, "loss": 2.7265802383422852, "step": 16690 }, { "epoch": 4.74027817201249, "grad_norm": 7.242894172668457, "learning_rate": 7.630854385466932e-05, "loss": 2.6579729080200196, "step": 16700 }, { "epoch": 4.74311666193585, "grad_norm": 6.99674654006958, "learning_rate": 7.629435140505252e-05, "loss": 2.577122116088867, "step": 16710 }, { "epoch": 4.745955151859211, "grad_norm": 6.815381050109863, "learning_rate": 7.628015895543571e-05, "loss": 2.590708923339844, "step": 16720 }, { "epoch": 4.748793641782572, "grad_norm": 6.9038801193237305, "learning_rate": 7.62659665058189e-05, "loss": 2.5303371429443358, "step": 16730 }, { "epoch": 4.751632131705932, "grad_norm": 6.933548927307129, "learning_rate": 7.625177405620211e-05, "loss": 2.645640754699707, "step": 16740 }, { "epoch": 4.754470621629293, "grad_norm": 7.140876770019531, "learning_rate": 7.623758160658529e-05, "loss": 2.654996871948242, "step": 16750 }, { "epoch": 4.7573091115526545, "grad_norm": 6.617042541503906, "learning_rate": 7.62233891569685e-05, "loss": 2.666314888000488, "step": 16760 }, { "epoch": 4.760147601476015, "grad_norm": 6.690307140350342, "learning_rate": 7.620919670735169e-05, "loss": 2.6487525939941405, "step": 16770 }, { "epoch": 4.762986091399376, "grad_norm": 6.744871139526367, "learning_rate": 7.61950042577349e-05, "loss": 2.5803972244262696, "step": 16780 }, { "epoch": 4.765824581322736, "grad_norm": 6.887570858001709, "learning_rate": 7.618081180811808e-05, "loss": 2.597574234008789, "step": 16790 }, { "epoch": 4.768663071246097, "grad_norm": 7.147075653076172, "learning_rate": 7.616661935850128e-05, "loss": 2.6937067031860353, "step": 16800 }, { "epoch": 4.771501561169458, "grad_norm": 6.642997741699219, "learning_rate": 7.615242690888448e-05, "loss": 2.6075191497802734, "step": 
16810 }, { "epoch": 4.7743400510928184, "grad_norm": 7.151413917541504, "learning_rate": 7.613823445926768e-05, "loss": 2.6210090637207033, "step": 16820 }, { "epoch": 4.77717854101618, "grad_norm": 6.741271018981934, "learning_rate": 7.612404200965086e-05, "loss": 2.6382909774780274, "step": 16830 }, { "epoch": 4.78001703093954, "grad_norm": 6.811403751373291, "learning_rate": 7.610984956003407e-05, "loss": 2.670903778076172, "step": 16840 }, { "epoch": 4.782855520862901, "grad_norm": 7.376879692077637, "learning_rate": 7.609565711041726e-05, "loss": 2.615406799316406, "step": 16850 }, { "epoch": 4.785694010786262, "grad_norm": 6.826070785522461, "learning_rate": 7.608146466080046e-05, "loss": 2.613887977600098, "step": 16860 }, { "epoch": 4.788532500709622, "grad_norm": 7.120720863342285, "learning_rate": 7.606727221118365e-05, "loss": 2.5640649795532227, "step": 16870 }, { "epoch": 4.791370990632983, "grad_norm": 7.473501205444336, "learning_rate": 7.605307976156684e-05, "loss": 2.6612897872924806, "step": 16880 }, { "epoch": 4.7942094805563436, "grad_norm": 6.610374927520752, "learning_rate": 7.603888731195005e-05, "loss": 2.5789011001586912, "step": 16890 }, { "epoch": 4.797047970479705, "grad_norm": 6.680816650390625, "learning_rate": 7.602469486233324e-05, "loss": 2.619894790649414, "step": 16900 }, { "epoch": 4.799886460403066, "grad_norm": 6.908688545227051, "learning_rate": 7.601050241271644e-05, "loss": 2.5911909103393556, "step": 16910 }, { "epoch": 4.802724950326426, "grad_norm": 6.569080829620361, "learning_rate": 7.599630996309963e-05, "loss": 2.5889816284179688, "step": 16920 }, { "epoch": 4.805563440249787, "grad_norm": 6.985697269439697, "learning_rate": 7.598211751348284e-05, "loss": 2.5179763793945313, "step": 16930 }, { "epoch": 4.808401930173148, "grad_norm": 6.6273274421691895, "learning_rate": 7.596792506386603e-05, "loss": 2.606695365905762, "step": 16940 }, { "epoch": 4.811240420096508, "grad_norm": 6.744569301605225, "learning_rate": 
7.595373261424922e-05, "loss": 2.570073127746582, "step": 16950 }, { "epoch": 4.8140789100198695, "grad_norm": 6.712997913360596, "learning_rate": 7.593954016463242e-05, "loss": 2.6106014251708984, "step": 16960 }, { "epoch": 4.816917399943231, "grad_norm": 6.505614280700684, "learning_rate": 7.592534771501561e-05, "loss": 2.620905876159668, "step": 16970 }, { "epoch": 4.819755889866591, "grad_norm": 7.082197666168213, "learning_rate": 7.591115526539882e-05, "loss": 2.593044090270996, "step": 16980 }, { "epoch": 4.822594379789952, "grad_norm": 7.090095520019531, "learning_rate": 7.5896962815782e-05, "loss": 2.661026382446289, "step": 16990 }, { "epoch": 4.825432869713312, "grad_norm": 6.789365291595459, "learning_rate": 7.58827703661652e-05, "loss": 2.5912477493286135, "step": 17000 }, { "epoch": 4.825432869713312, "eval_accuracy": 0.25319514211229094, "eval_loss": 2.871666193008423, "eval_runtime": 49.5733, "eval_samples_per_second": 317.247, "eval_steps_per_second": 4.962, "step": 17000 }, { "epoch": 4.828271359636673, "grad_norm": 7.268899917602539, "learning_rate": 7.58685779165484e-05, "loss": 2.6185653686523436, "step": 17010 }, { "epoch": 4.831109849560034, "grad_norm": 7.384907245635986, "learning_rate": 7.58543854669316e-05, "loss": 2.5620975494384766, "step": 17020 }, { "epoch": 4.833948339483395, "grad_norm": 6.8038649559021, "learning_rate": 7.584019301731478e-05, "loss": 2.6537765502929687, "step": 17030 }, { "epoch": 4.836786829406756, "grad_norm": 6.924934387207031, "learning_rate": 7.582600056769799e-05, "loss": 2.666581726074219, "step": 17040 }, { "epoch": 4.839625319330116, "grad_norm": 6.743548393249512, "learning_rate": 7.581180811808118e-05, "loss": 2.6490039825439453, "step": 17050 }, { "epoch": 4.842463809253477, "grad_norm": 6.857654571533203, "learning_rate": 7.579761566846438e-05, "loss": 2.586700439453125, "step": 17060 }, { "epoch": 4.845302299176838, "grad_norm": 7.12238883972168, "learning_rate": 7.578342321884757e-05, "loss": 
2.6324691772460938, "step": 17070 }, { "epoch": 4.848140789100198, "grad_norm": 7.304258346557617, "learning_rate": 7.576923076923076e-05, "loss": 2.573086166381836, "step": 17080 }, { "epoch": 4.8509792790235595, "grad_norm": 7.217774391174316, "learning_rate": 7.575503831961397e-05, "loss": 2.575849151611328, "step": 17090 }, { "epoch": 4.85381776894692, "grad_norm": 7.112896919250488, "learning_rate": 7.574084586999716e-05, "loss": 2.612152099609375, "step": 17100 }, { "epoch": 4.856656258870281, "grad_norm": 6.682685852050781, "learning_rate": 7.572665342038037e-05, "loss": 2.6339426040649414, "step": 17110 }, { "epoch": 4.859494748793642, "grad_norm": 7.2198309898376465, "learning_rate": 7.571246097076355e-05, "loss": 2.658795166015625, "step": 17120 }, { "epoch": 4.862333238717002, "grad_norm": 6.721205234527588, "learning_rate": 7.569826852114676e-05, "loss": 2.5589433670043946, "step": 17130 }, { "epoch": 4.865171728640363, "grad_norm": 6.773538112640381, "learning_rate": 7.568407607152995e-05, "loss": 2.6582923889160157, "step": 17140 }, { "epoch": 4.868010218563724, "grad_norm": 6.851817607879639, "learning_rate": 7.566988362191316e-05, "loss": 2.6621883392333983, "step": 17150 }, { "epoch": 4.870848708487085, "grad_norm": 6.753723621368408, "learning_rate": 7.565569117229634e-05, "loss": 2.6451269149780274, "step": 17160 }, { "epoch": 4.873687198410446, "grad_norm": 7.363733291625977, "learning_rate": 7.564149872267954e-05, "loss": 2.582354927062988, "step": 17170 }, { "epoch": 4.876525688333807, "grad_norm": 7.344315528869629, "learning_rate": 7.562730627306274e-05, "loss": 2.6010276794433596, "step": 17180 }, { "epoch": 4.879364178257167, "grad_norm": 6.6998443603515625, "learning_rate": 7.561311382344593e-05, "loss": 2.598748970031738, "step": 17190 }, { "epoch": 4.882202668180528, "grad_norm": 6.92685604095459, "learning_rate": 7.559892137382912e-05, "loss": 2.55700798034668, "step": 17200 }, { "epoch": 4.885041158103888, "grad_norm": 
7.069101810455322, "learning_rate": 7.558472892421232e-05, "loss": 2.6522151947021486, "step": 17210 }, { "epoch": 4.8878796480272495, "grad_norm": 7.231575012207031, "learning_rate": 7.557053647459552e-05, "loss": 2.626856231689453, "step": 17220 }, { "epoch": 4.890718137950611, "grad_norm": 6.548419952392578, "learning_rate": 7.555634402497872e-05, "loss": 2.5825271606445312, "step": 17230 }, { "epoch": 4.893556627873971, "grad_norm": 6.543859481811523, "learning_rate": 7.554215157536191e-05, "loss": 2.649094009399414, "step": 17240 }, { "epoch": 4.896395117797332, "grad_norm": 6.313950061798096, "learning_rate": 7.55279591257451e-05, "loss": 2.562266731262207, "step": 17250 }, { "epoch": 4.899233607720692, "grad_norm": 7.0116071701049805, "learning_rate": 7.551376667612831e-05, "loss": 2.611105728149414, "step": 17260 }, { "epoch": 4.902072097644053, "grad_norm": 6.685194969177246, "learning_rate": 7.54995742265115e-05, "loss": 2.587358093261719, "step": 17270 }, { "epoch": 4.904910587567414, "grad_norm": 6.954743385314941, "learning_rate": 7.54853817768947e-05, "loss": 2.6241275787353517, "step": 17280 }, { "epoch": 4.907749077490775, "grad_norm": 6.6854047775268555, "learning_rate": 7.547118932727789e-05, "loss": 2.6417287826538085, "step": 17290 }, { "epoch": 4.910587567414136, "grad_norm": 6.823311805725098, "learning_rate": 7.545699687766108e-05, "loss": 2.558196258544922, "step": 17300 }, { "epoch": 4.913426057337497, "grad_norm": 6.766549587249756, "learning_rate": 7.544280442804429e-05, "loss": 2.514570617675781, "step": 17310 }, { "epoch": 4.916264547260857, "grad_norm": 7.018388748168945, "learning_rate": 7.542861197842747e-05, "loss": 2.595265769958496, "step": 17320 }, { "epoch": 4.919103037184218, "grad_norm": 6.846147537231445, "learning_rate": 7.541441952881068e-05, "loss": 2.600380516052246, "step": 17330 }, { "epoch": 4.921941527107579, "grad_norm": 7.020329475402832, "learning_rate": 7.540022707919387e-05, "loss": 2.5940582275390627, "step": 
17340 }, { "epoch": 4.9247800170309395, "grad_norm": 7.0865278244018555, "learning_rate": 7.538603462957708e-05, "loss": 2.5640586853027343, "step": 17350 }, { "epoch": 4.927618506954301, "grad_norm": 6.910956382751465, "learning_rate": 7.537184217996026e-05, "loss": 2.594169044494629, "step": 17360 }, { "epoch": 4.930456996877661, "grad_norm": 6.953744411468506, "learning_rate": 7.535764973034346e-05, "loss": 2.5920419692993164, "step": 17370 }, { "epoch": 4.933295486801022, "grad_norm": 6.830177307128906, "learning_rate": 7.534345728072666e-05, "loss": 2.5507299423217775, "step": 17380 }, { "epoch": 4.936133976724383, "grad_norm": 6.70985746383667, "learning_rate": 7.532926483110985e-05, "loss": 2.614916229248047, "step": 17390 }, { "epoch": 4.938972466647743, "grad_norm": 7.090786457061768, "learning_rate": 7.531507238149304e-05, "loss": 2.616387939453125, "step": 17400 }, { "epoch": 4.941810956571104, "grad_norm": 7.0897016525268555, "learning_rate": 7.530087993187624e-05, "loss": 2.5669733047485352, "step": 17410 }, { "epoch": 4.944649446494465, "grad_norm": 6.465585231781006, "learning_rate": 7.528668748225944e-05, "loss": 2.560367393493652, "step": 17420 }, { "epoch": 4.947487936417826, "grad_norm": 6.941190719604492, "learning_rate": 7.527249503264264e-05, "loss": 2.573381233215332, "step": 17430 }, { "epoch": 4.950326426341187, "grad_norm": 6.690730094909668, "learning_rate": 7.525830258302583e-05, "loss": 2.59008903503418, "step": 17440 }, { "epoch": 4.953164916264547, "grad_norm": 7.150376796722412, "learning_rate": 7.524411013340902e-05, "loss": 2.6092605590820312, "step": 17450 }, { "epoch": 4.956003406187908, "grad_norm": 6.619582176208496, "learning_rate": 7.522991768379223e-05, "loss": 2.6659011840820312, "step": 17460 }, { "epoch": 4.958841896111268, "grad_norm": 6.902466297149658, "learning_rate": 7.521572523417543e-05, "loss": 2.533241844177246, "step": 17470 }, { "epoch": 4.9616803860346295, "grad_norm": 7.136173725128174, "learning_rate": 
7.520153278455862e-05, "loss": 2.5778434753417967, "step": 17480 }, { "epoch": 4.964518875957991, "grad_norm": 6.641966819763184, "learning_rate": 7.518734033494181e-05, "loss": 2.5519554138183596, "step": 17490 }, { "epoch": 4.967357365881351, "grad_norm": 6.997091293334961, "learning_rate": 7.517314788532502e-05, "loss": 2.543931007385254, "step": 17500 }, { "epoch": 4.967357365881351, "eval_accuracy": 0.25535702931264703, "eval_loss": 2.860442638397217, "eval_runtime": 48.3046, "eval_samples_per_second": 325.58, "eval_steps_per_second": 5.093, "step": 17500 }, { "epoch": 4.970195855804712, "grad_norm": 6.974333763122559, "learning_rate": 7.515895543570821e-05, "loss": 2.5823909759521486, "step": 17510 }, { "epoch": 4.973034345728073, "grad_norm": 6.652368068695068, "learning_rate": 7.51447629860914e-05, "loss": 2.6228736877441405, "step": 17520 }, { "epoch": 4.975872835651433, "grad_norm": 7.270105838775635, "learning_rate": 7.51305705364746e-05, "loss": 2.525347900390625, "step": 17530 }, { "epoch": 4.978711325574794, "grad_norm": 6.474912166595459, "learning_rate": 7.511637808685779e-05, "loss": 2.6022689819335936, "step": 17540 }, { "epoch": 4.9815498154981555, "grad_norm": 7.098737716674805, "learning_rate": 7.5102185637241e-05, "loss": 2.6127586364746094, "step": 17550 }, { "epoch": 4.984388305421516, "grad_norm": 6.948169708251953, "learning_rate": 7.508799318762418e-05, "loss": 2.5822534561157227, "step": 17560 }, { "epoch": 4.987226795344877, "grad_norm": 6.720169544219971, "learning_rate": 7.507380073800739e-05, "loss": 2.5860443115234375, "step": 17570 }, { "epoch": 4.990065285268237, "grad_norm": 7.428252696990967, "learning_rate": 7.505960828839058e-05, "loss": 2.5026777267456053, "step": 17580 }, { "epoch": 4.992903775191598, "grad_norm": 6.803229331970215, "learning_rate": 7.504541583877379e-05, "loss": 2.615283203125, "step": 17590 }, { "epoch": 4.995742265114959, "grad_norm": 6.905067443847656, "learning_rate": 7.503122338915697e-05, "loss": 
2.5458553314208983, "step": 17600 }, { "epoch": 4.9985807550383194, "grad_norm": 6.835422515869141, "learning_rate": 7.501703093954017e-05, "loss": 2.644064712524414, "step": 17610 }, { "epoch": 5.0014192449616806, "grad_norm": 6.703253746032715, "learning_rate": 7.500425773488504e-05, "loss": 2.568987274169922, "step": 17620 }, { "epoch": 5.004257734885041, "grad_norm": 6.692494869232178, "learning_rate": 7.499006528526824e-05, "loss": 2.530202293395996, "step": 17630 }, { "epoch": 5.007096224808402, "grad_norm": 6.835359573364258, "learning_rate": 7.497587283565145e-05, "loss": 2.546214485168457, "step": 17640 }, { "epoch": 5.009934714731763, "grad_norm": 6.910696506500244, "learning_rate": 7.496168038603464e-05, "loss": 2.544657516479492, "step": 17650 }, { "epoch": 5.012773204655123, "grad_norm": 6.748648643493652, "learning_rate": 7.494748793641783e-05, "loss": 2.6019575119018556, "step": 17660 }, { "epoch": 5.015611694578484, "grad_norm": 6.699620723724365, "learning_rate": 7.493329548680103e-05, "loss": 2.557318115234375, "step": 17670 }, { "epoch": 5.018450184501845, "grad_norm": 6.706846237182617, "learning_rate": 7.491910303718422e-05, "loss": 2.5594627380371096, "step": 17680 }, { "epoch": 5.021288674425206, "grad_norm": 6.496762752532959, "learning_rate": 7.490491058756743e-05, "loss": 2.5279666900634767, "step": 17690 }, { "epoch": 5.024127164348567, "grad_norm": 6.727117538452148, "learning_rate": 7.48907181379506e-05, "loss": 2.477491569519043, "step": 17700 }, { "epoch": 5.026965654271927, "grad_norm": 7.07592248916626, "learning_rate": 7.487652568833381e-05, "loss": 2.6036954879760743, "step": 17710 }, { "epoch": 5.029804144195288, "grad_norm": 6.9625444412231445, "learning_rate": 7.4862333238717e-05, "loss": 2.5150936126708983, "step": 17720 }, { "epoch": 5.032642634118649, "grad_norm": 7.0287580490112305, "learning_rate": 7.484814078910021e-05, "loss": 2.5647300720214843, "step": 17730 }, { "epoch": 5.035481124042009, "grad_norm": 
6.728711128234863, "learning_rate": 7.483394833948339e-05, "loss": 2.5211580276489256, "step": 17740 }, { "epoch": 5.0383196139653705, "grad_norm": 7.121988296508789, "learning_rate": 7.48197558898666e-05, "loss": 2.516639518737793, "step": 17750 }, { "epoch": 5.041158103888731, "grad_norm": 7.078081130981445, "learning_rate": 7.480556344024979e-05, "loss": 2.559312438964844, "step": 17760 }, { "epoch": 5.043996593812092, "grad_norm": 6.989551067352295, "learning_rate": 7.479137099063299e-05, "loss": 2.5053056716918944, "step": 17770 }, { "epoch": 5.046835083735453, "grad_norm": 6.881659984588623, "learning_rate": 7.477717854101618e-05, "loss": 2.490918731689453, "step": 17780 }, { "epoch": 5.049673573658813, "grad_norm": 6.65317964553833, "learning_rate": 7.476298609139937e-05, "loss": 2.5648536682128906, "step": 17790 }, { "epoch": 5.052512063582174, "grad_norm": 7.048379898071289, "learning_rate": 7.474879364178258e-05, "loss": 2.502269744873047, "step": 17800 }, { "epoch": 5.055350553505535, "grad_norm": 6.993536949157715, "learning_rate": 7.473460119216577e-05, "loss": 2.6283452987670897, "step": 17810 }, { "epoch": 5.058189043428896, "grad_norm": 6.443151950836182, "learning_rate": 7.472040874254897e-05, "loss": 2.58115177154541, "step": 17820 }, { "epoch": 5.061027533352257, "grad_norm": 6.855377674102783, "learning_rate": 7.470621629293216e-05, "loss": 2.5672521591186523, "step": 17830 }, { "epoch": 5.063866023275617, "grad_norm": 7.031713485717773, "learning_rate": 7.469202384331537e-05, "loss": 2.772542953491211, "step": 17840 }, { "epoch": 5.066704513198978, "grad_norm": 6.591639518737793, "learning_rate": 7.467783139369856e-05, "loss": 2.488310623168945, "step": 17850 }, { "epoch": 5.069543003122339, "grad_norm": 7.103044033050537, "learning_rate": 7.466363894408175e-05, "loss": 2.4970844268798826, "step": 17860 }, { "epoch": 5.072381493045699, "grad_norm": 6.554096221923828, "learning_rate": 7.464944649446495e-05, "loss": 2.5477685928344727, "step": 
17870 }, { "epoch": 5.0752199829690605, "grad_norm": 6.922588348388672, "learning_rate": 7.463525404484815e-05, "loss": 2.530625343322754, "step": 17880 }, { "epoch": 5.078058472892422, "grad_norm": 6.703078269958496, "learning_rate": 7.462106159523135e-05, "loss": 2.517251968383789, "step": 17890 }, { "epoch": 5.080896962815782, "grad_norm": 7.226039409637451, "learning_rate": 7.460686914561454e-05, "loss": 2.5784772872924804, "step": 17900 }, { "epoch": 5.083735452739143, "grad_norm": 7.049926280975342, "learning_rate": 7.459267669599773e-05, "loss": 2.5805103302001955, "step": 17910 }, { "epoch": 5.086573942662503, "grad_norm": 6.456727504730225, "learning_rate": 7.457848424638093e-05, "loss": 2.4922887802124025, "step": 17920 }, { "epoch": 5.089412432585864, "grad_norm": 6.646394729614258, "learning_rate": 7.456429179676413e-05, "loss": 2.5574295043945314, "step": 17930 }, { "epoch": 5.092250922509225, "grad_norm": 6.637722015380859, "learning_rate": 7.455009934714731e-05, "loss": 2.540232467651367, "step": 17940 }, { "epoch": 5.095089412432586, "grad_norm": 6.888846397399902, "learning_rate": 7.453590689753052e-05, "loss": 2.5678634643554688, "step": 17950 }, { "epoch": 5.097927902355947, "grad_norm": 7.046519756317139, "learning_rate": 7.452171444791371e-05, "loss": 2.5861818313598635, "step": 17960 }, { "epoch": 5.100766392279308, "grad_norm": 6.922731876373291, "learning_rate": 7.450752199829692e-05, "loss": 2.5829015731811524, "step": 17970 }, { "epoch": 5.103604882202668, "grad_norm": 6.7957611083984375, "learning_rate": 7.44933295486801e-05, "loss": 2.5129348754882814, "step": 17980 }, { "epoch": 5.106443372126029, "grad_norm": 6.676427841186523, "learning_rate": 7.44791370990633e-05, "loss": 2.4945095062255858, "step": 17990 }, { "epoch": 5.109281862049389, "grad_norm": 6.9649128913879395, "learning_rate": 7.44649446494465e-05, "loss": 2.548863410949707, "step": 18000 }, { "epoch": 5.109281862049389, "eval_accuracy": 0.26158835124308516, "eval_loss": 
2.8447556495666504, "eval_runtime": 49.2265, "eval_samples_per_second": 319.482, "eval_steps_per_second": 4.997, "step": 18000 }, { "epoch": 5.1121203519727505, "grad_norm": 6.6768975257873535, "learning_rate": 7.445075219982969e-05, "loss": 2.450541877746582, "step": 18010 }, { "epoch": 5.114958841896112, "grad_norm": 6.878286838531494, "learning_rate": 7.443655975021289e-05, "loss": 2.518428611755371, "step": 18020 }, { "epoch": 5.117797331819472, "grad_norm": 6.846975326538086, "learning_rate": 7.442236730059608e-05, "loss": 2.5536436080932616, "step": 18030 }, { "epoch": 5.120635821742833, "grad_norm": 6.616571426391602, "learning_rate": 7.440817485097929e-05, "loss": 2.558480453491211, "step": 18040 }, { "epoch": 5.123474311666193, "grad_norm": 6.9521284103393555, "learning_rate": 7.439398240136248e-05, "loss": 2.5667701721191407, "step": 18050 }, { "epoch": 5.126312801589554, "grad_norm": 6.80915641784668, "learning_rate": 7.437978995174567e-05, "loss": 2.5295198440551756, "step": 18060 }, { "epoch": 5.129151291512915, "grad_norm": 7.03594970703125, "learning_rate": 7.436559750212887e-05, "loss": 2.5742984771728517, "step": 18070 }, { "epoch": 5.131989781436276, "grad_norm": 6.837252616882324, "learning_rate": 7.435140505251207e-05, "loss": 2.502463722229004, "step": 18080 }, { "epoch": 5.134828271359637, "grad_norm": 6.948042869567871, "learning_rate": 7.433721260289527e-05, "loss": 2.5125221252441405, "step": 18090 }, { "epoch": 5.137666761282998, "grad_norm": 6.925464153289795, "learning_rate": 7.432302015327846e-05, "loss": 2.607441520690918, "step": 18100 }, { "epoch": 5.140505251206358, "grad_norm": 6.682419300079346, "learning_rate": 7.430882770366165e-05, "loss": 2.6120632171630858, "step": 18110 }, { "epoch": 5.143343741129719, "grad_norm": 6.691654205322266, "learning_rate": 7.429463525404485e-05, "loss": 2.507078742980957, "step": 18120 }, { "epoch": 5.146182231053079, "grad_norm": 6.634687900543213, "learning_rate": 7.428044280442805e-05, "loss": 
2.556760787963867, "step": 18130 }, { "epoch": 5.1490207209764405, "grad_norm": 7.051270484924316, "learning_rate": 7.426625035481123e-05, "loss": 2.5939153671264648, "step": 18140 }, { "epoch": 5.151859210899802, "grad_norm": 6.940392017364502, "learning_rate": 7.425205790519444e-05, "loss": 2.5601869583129884, "step": 18150 }, { "epoch": 5.154697700823162, "grad_norm": 7.088762283325195, "learning_rate": 7.423786545557763e-05, "loss": 2.5460254669189455, "step": 18160 }, { "epoch": 5.157536190746523, "grad_norm": 6.889113903045654, "learning_rate": 7.422367300596084e-05, "loss": 2.6205896377563476, "step": 18170 }, { "epoch": 5.160374680669884, "grad_norm": 6.621574401855469, "learning_rate": 7.420948055634402e-05, "loss": 2.56610164642334, "step": 18180 }, { "epoch": 5.163213170593244, "grad_norm": 6.803660869598389, "learning_rate": 7.419528810672723e-05, "loss": 2.5778339385986326, "step": 18190 }, { "epoch": 5.166051660516605, "grad_norm": 6.8745551109313965, "learning_rate": 7.418109565711042e-05, "loss": 2.5250726699829102, "step": 18200 }, { "epoch": 5.168890150439966, "grad_norm": 6.732845783233643, "learning_rate": 7.416690320749363e-05, "loss": 2.5219179153442384, "step": 18210 }, { "epoch": 5.171728640363327, "grad_norm": 6.829438209533691, "learning_rate": 7.415271075787681e-05, "loss": 2.557341766357422, "step": 18220 }, { "epoch": 5.174567130286688, "grad_norm": 6.819887638092041, "learning_rate": 7.413851830826001e-05, "loss": 2.5154226303100584, "step": 18230 }, { "epoch": 5.177405620210048, "grad_norm": 6.924181938171387, "learning_rate": 7.412432585864321e-05, "loss": 2.59810733795166, "step": 18240 }, { "epoch": 5.180244110133409, "grad_norm": 6.918481349945068, "learning_rate": 7.41101334090264e-05, "loss": 2.555336570739746, "step": 18250 }, { "epoch": 5.183082600056769, "grad_norm": 6.655999660491943, "learning_rate": 7.40959409594096e-05, "loss": 2.549691390991211, "step": 18260 }, { "epoch": 5.1859210899801305, "grad_norm": 
7.025461196899414, "learning_rate": 7.408174850979279e-05, "loss": 2.5653186798095704, "step": 18270 }, { "epoch": 5.188759579903492, "grad_norm": 6.870781421661377, "learning_rate": 7.4067556060176e-05, "loss": 2.6360052108764647, "step": 18280 }, { "epoch": 5.191598069826852, "grad_norm": 6.969069004058838, "learning_rate": 7.405336361055919e-05, "loss": 2.570384216308594, "step": 18290 }, { "epoch": 5.194436559750213, "grad_norm": 6.575835704803467, "learning_rate": 7.403917116094238e-05, "loss": 2.5494720458984377, "step": 18300 }, { "epoch": 5.197275049673574, "grad_norm": 6.7682366371154785, "learning_rate": 7.402497871132557e-05, "loss": 2.4690120697021483, "step": 18310 }, { "epoch": 5.200113539596934, "grad_norm": 6.8496623039245605, "learning_rate": 7.401078626170878e-05, "loss": 2.468115043640137, "step": 18320 }, { "epoch": 5.202952029520295, "grad_norm": 6.846420764923096, "learning_rate": 7.399659381209197e-05, "loss": 2.514501190185547, "step": 18330 }, { "epoch": 5.205790519443656, "grad_norm": 6.833467483520508, "learning_rate": 7.398240136247517e-05, "loss": 2.5283206939697265, "step": 18340 }, { "epoch": 5.208629009367017, "grad_norm": 6.779111385345459, "learning_rate": 7.396820891285836e-05, "loss": 2.535419464111328, "step": 18350 }, { "epoch": 5.211467499290378, "grad_norm": 6.944159030914307, "learning_rate": 7.395401646324155e-05, "loss": 2.549949073791504, "step": 18360 }, { "epoch": 5.214305989213738, "grad_norm": 6.697237014770508, "learning_rate": 7.393982401362476e-05, "loss": 2.5271453857421875, "step": 18370 }, { "epoch": 5.217144479137099, "grad_norm": 7.076419353485107, "learning_rate": 7.392563156400794e-05, "loss": 2.6129343032836916, "step": 18380 }, { "epoch": 5.21998296906046, "grad_norm": 7.14669132232666, "learning_rate": 7.391143911439115e-05, "loss": 2.5628705978393556, "step": 18390 }, { "epoch": 5.22282145898382, "grad_norm": 6.477357387542725, "learning_rate": 7.389724666477434e-05, "loss": 2.5604366302490233, "step": 
18400 }, { "epoch": 5.2256599489071816, "grad_norm": 6.861240386962891, "learning_rate": 7.388305421515755e-05, "loss": 2.5068431854248048, "step": 18410 }, { "epoch": 5.228498438830542, "grad_norm": 6.717290878295898, "learning_rate": 7.386886176554073e-05, "loss": 2.464394378662109, "step": 18420 }, { "epoch": 5.231336928753903, "grad_norm": 6.41367769241333, "learning_rate": 7.385466931592393e-05, "loss": 2.473832702636719, "step": 18430 }, { "epoch": 5.234175418677264, "grad_norm": 6.629590034484863, "learning_rate": 7.384047686630713e-05, "loss": 2.565260887145996, "step": 18440 }, { "epoch": 5.237013908600624, "grad_norm": 6.952000141143799, "learning_rate": 7.382628441669033e-05, "loss": 2.5463096618652346, "step": 18450 }, { "epoch": 5.239852398523985, "grad_norm": 7.256922721862793, "learning_rate": 7.381209196707351e-05, "loss": 2.5942829132080076, "step": 18460 }, { "epoch": 5.242690888447346, "grad_norm": 6.93370246887207, "learning_rate": 7.379789951745672e-05, "loss": 2.603807830810547, "step": 18470 }, { "epoch": 5.245529378370707, "grad_norm": 6.779187202453613, "learning_rate": 7.378370706783991e-05, "loss": 2.59134521484375, "step": 18480 }, { "epoch": 5.248367868294068, "grad_norm": 6.73549747467041, "learning_rate": 7.376951461822311e-05, "loss": 2.5757604598999024, "step": 18490 }, { "epoch": 5.251206358217428, "grad_norm": 7.295973777770996, "learning_rate": 7.37553221686063e-05, "loss": 2.5218246459960936, "step": 18500 }, { "epoch": 5.251206358217428, "eval_accuracy": 0.25726457684237297, "eval_loss": 2.8377976417541504, "eval_runtime": 49.6422, "eval_samples_per_second": 316.807, "eval_steps_per_second": 4.955, "step": 18500 }, { "epoch": 5.254044848140789, "grad_norm": 7.048372268676758, "learning_rate": 7.37411297189895e-05, "loss": 2.49593448638916, "step": 18510 }, { "epoch": 5.25688333806415, "grad_norm": 6.592235565185547, "learning_rate": 7.37269372693727e-05, "loss": 2.5772666931152344, "step": 18520 }, { "epoch": 5.25972182798751, 
"grad_norm": 6.638607025146484, "learning_rate": 7.37127448197559e-05, "loss": 2.57529296875, "step": 18530 }, { "epoch": 5.2625603179108715, "grad_norm": 6.672317028045654, "learning_rate": 7.369855237013909e-05, "loss": 2.4812906265258787, "step": 18540 }, { "epoch": 5.265398807834233, "grad_norm": 6.347843647003174, "learning_rate": 7.368435992052228e-05, "loss": 2.532635498046875, "step": 18550 }, { "epoch": 5.268237297757593, "grad_norm": 6.691095352172852, "learning_rate": 7.367016747090549e-05, "loss": 2.566342353820801, "step": 18560 }, { "epoch": 5.271075787680954, "grad_norm": 6.718478202819824, "learning_rate": 7.365597502128868e-05, "loss": 2.544178009033203, "step": 18570 }, { "epoch": 5.273914277604314, "grad_norm": 7.101949214935303, "learning_rate": 7.364178257167187e-05, "loss": 2.581862449645996, "step": 18580 }, { "epoch": 5.276752767527675, "grad_norm": 6.638518333435059, "learning_rate": 7.362759012205507e-05, "loss": 2.5161664962768553, "step": 18590 }, { "epoch": 5.279591257451036, "grad_norm": 6.653689861297607, "learning_rate": 7.361339767243826e-05, "loss": 2.5140287399291994, "step": 18600 }, { "epoch": 5.282429747374397, "grad_norm": 7.149965763092041, "learning_rate": 7.359920522282147e-05, "loss": 2.5179895401000976, "step": 18610 }, { "epoch": 5.285268237297758, "grad_norm": 6.836979389190674, "learning_rate": 7.358501277320465e-05, "loss": 2.5139556884765626, "step": 18620 }, { "epoch": 5.288106727221118, "grad_norm": 7.068029403686523, "learning_rate": 7.357082032358785e-05, "loss": 2.5299728393554686, "step": 18630 }, { "epoch": 5.290945217144479, "grad_norm": 7.273621559143066, "learning_rate": 7.355662787397105e-05, "loss": 2.542753219604492, "step": 18640 }, { "epoch": 5.29378370706784, "grad_norm": 6.594620704650879, "learning_rate": 7.354243542435426e-05, "loss": 2.51368408203125, "step": 18650 }, { "epoch": 5.2966221969912, "grad_norm": 6.921955108642578, "learning_rate": 7.352824297473743e-05, "loss": 2.5352672576904296, 
"step": 18660 }, { "epoch": 5.2994606869145615, "grad_norm": 6.9432759284973145, "learning_rate": 7.351405052512064e-05, "loss": 2.5984882354736327, "step": 18670 }, { "epoch": 5.302299176837923, "grad_norm": 6.830785274505615, "learning_rate": 7.349985807550384e-05, "loss": 2.603244972229004, "step": 18680 }, { "epoch": 5.305137666761283, "grad_norm": 6.774467945098877, "learning_rate": 7.348566562588703e-05, "loss": 2.568950653076172, "step": 18690 }, { "epoch": 5.307976156684644, "grad_norm": 6.6527910232543945, "learning_rate": 7.347147317627022e-05, "loss": 2.566614532470703, "step": 18700 }, { "epoch": 5.310814646608004, "grad_norm": 6.797240734100342, "learning_rate": 7.345728072665342e-05, "loss": 2.5881631851196287, "step": 18710 }, { "epoch": 5.313653136531365, "grad_norm": 6.921928405761719, "learning_rate": 7.344308827703662e-05, "loss": 2.5911605834960936, "step": 18720 }, { "epoch": 5.316491626454726, "grad_norm": 6.8471527099609375, "learning_rate": 7.342889582741982e-05, "loss": 2.553318977355957, "step": 18730 }, { "epoch": 5.319330116378087, "grad_norm": 6.865833759307861, "learning_rate": 7.341470337780301e-05, "loss": 2.5715354919433593, "step": 18740 }, { "epoch": 5.322168606301448, "grad_norm": 6.752242565155029, "learning_rate": 7.34005109281862e-05, "loss": 2.555495834350586, "step": 18750 }, { "epoch": 5.325007096224809, "grad_norm": 6.772656440734863, "learning_rate": 7.338631847856941e-05, "loss": 2.453829765319824, "step": 18760 }, { "epoch": 5.327845586148169, "grad_norm": 7.1725335121154785, "learning_rate": 7.33721260289526e-05, "loss": 2.5988441467285157, "step": 18770 }, { "epoch": 5.33068407607153, "grad_norm": 6.631807327270508, "learning_rate": 7.33579335793358e-05, "loss": 2.532840538024902, "step": 18780 }, { "epoch": 5.33352256599489, "grad_norm": 6.750876426696777, "learning_rate": 7.334374112971899e-05, "loss": 2.4725271224975587, "step": 18790 }, { "epoch": 5.3363610559182515, "grad_norm": 6.737439155578613, 
"learning_rate": 7.33295486801022e-05, "loss": 2.576161193847656, "step": 18800 }, { "epoch": 5.339199545841613, "grad_norm": 6.995692729949951, "learning_rate": 7.331535623048539e-05, "loss": 2.52645263671875, "step": 18810 }, { "epoch": 5.342038035764973, "grad_norm": 6.855576515197754, "learning_rate": 7.330116378086858e-05, "loss": 2.6000255584716796, "step": 18820 }, { "epoch": 5.344876525688334, "grad_norm": 7.014626979827881, "learning_rate": 7.328697133125178e-05, "loss": 2.5763803482055665, "step": 18830 }, { "epoch": 5.347715015611694, "grad_norm": 6.667849063873291, "learning_rate": 7.327277888163497e-05, "loss": 2.5500938415527346, "step": 18840 }, { "epoch": 5.350553505535055, "grad_norm": 7.081477642059326, "learning_rate": 7.325858643201818e-05, "loss": 2.5833318710327147, "step": 18850 }, { "epoch": 5.353391995458416, "grad_norm": 6.7574462890625, "learning_rate": 7.324439398240137e-05, "loss": 2.517710876464844, "step": 18860 }, { "epoch": 5.356230485381777, "grad_norm": 6.887168884277344, "learning_rate": 7.323020153278456e-05, "loss": 2.55814151763916, "step": 18870 }, { "epoch": 5.359068975305138, "grad_norm": 6.431946754455566, "learning_rate": 7.321600908316776e-05, "loss": 2.501334571838379, "step": 18880 }, { "epoch": 5.361907465228499, "grad_norm": 7.043050765991211, "learning_rate": 7.320181663355096e-05, "loss": 2.5736387252807615, "step": 18890 }, { "epoch": 5.364745955151859, "grad_norm": 6.630824089050293, "learning_rate": 7.318762418393416e-05, "loss": 2.487956237792969, "step": 18900 }, { "epoch": 5.36758444507522, "grad_norm": 7.07933235168457, "learning_rate": 7.317343173431735e-05, "loss": 2.542824935913086, "step": 18910 }, { "epoch": 5.37042293499858, "grad_norm": 7.062706470489502, "learning_rate": 7.315923928470054e-05, "loss": 2.4884063720703127, "step": 18920 }, { "epoch": 5.3732614249219415, "grad_norm": 6.833053112030029, "learning_rate": 7.314504683508374e-05, "loss": 2.5530284881591796, "step": 18930 }, { "epoch": 
5.376099914845303, "grad_norm": 6.737825393676758, "learning_rate": 7.313085438546694e-05, "loss": 2.5032865524291994, "step": 18940 }, { "epoch": 5.378938404768663, "grad_norm": 7.1329569816589355, "learning_rate": 7.311666193585012e-05, "loss": 2.5559158325195312, "step": 18950 }, { "epoch": 5.381776894692024, "grad_norm": 6.859339714050293, "learning_rate": 7.310246948623333e-05, "loss": 2.606877899169922, "step": 18960 }, { "epoch": 5.384615384615385, "grad_norm": 6.772965431213379, "learning_rate": 7.308827703661652e-05, "loss": 2.5268327713012697, "step": 18970 }, { "epoch": 5.387453874538745, "grad_norm": 6.819723606109619, "learning_rate": 7.307408458699973e-05, "loss": 2.5261056900024412, "step": 18980 }, { "epoch": 5.390292364462106, "grad_norm": 7.130303859710693, "learning_rate": 7.305989213738291e-05, "loss": 2.4839879989624025, "step": 18990 }, { "epoch": 5.393130854385467, "grad_norm": 6.6679911613464355, "learning_rate": 7.304569968776612e-05, "loss": 2.5281118392944335, "step": 19000 }, { "epoch": 5.393130854385467, "eval_accuracy": 0.25942646404272907, "eval_loss": 2.83211350440979, "eval_runtime": 49.3282, "eval_samples_per_second": 318.824, "eval_steps_per_second": 4.987, "step": 19000 }, { "epoch": 5.395969344308828, "grad_norm": 7.138184547424316, "learning_rate": 7.303150723814931e-05, "loss": 2.597249984741211, "step": 19010 }, { "epoch": 5.398807834232189, "grad_norm": 6.902784824371338, "learning_rate": 7.30173147885325e-05, "loss": 2.5184810638427733, "step": 19020 }, { "epoch": 5.401646324155549, "grad_norm": 6.849015712738037, "learning_rate": 7.30031223389157e-05, "loss": 2.5408878326416016, "step": 19030 }, { "epoch": 5.40448481407891, "grad_norm": 6.9193644523620605, "learning_rate": 7.298892988929889e-05, "loss": 2.558734130859375, "step": 19040 }, { "epoch": 5.40732330400227, "grad_norm": 6.646902084350586, "learning_rate": 7.29747374396821e-05, "loss": 2.567116355895996, "step": 19050 }, { "epoch": 5.4101617939256315, "grad_norm": 
6.7023606300354, "learning_rate": 7.296054499006529e-05, "loss": 2.565078353881836, "step": 19060 }, { "epoch": 5.413000283848993, "grad_norm": 6.625940799713135, "learning_rate": 7.294635254044848e-05, "loss": 2.5189817428588865, "step": 19070 }, { "epoch": 5.415838773772353, "grad_norm": 6.934445858001709, "learning_rate": 7.293216009083168e-05, "loss": 2.535547065734863, "step": 19080 }, { "epoch": 5.418677263695714, "grad_norm": 7.052305221557617, "learning_rate": 7.291796764121488e-05, "loss": 2.519478607177734, "step": 19090 }, { "epoch": 5.421515753619075, "grad_norm": 6.747097492218018, "learning_rate": 7.290377519159808e-05, "loss": 2.479838180541992, "step": 19100 }, { "epoch": 5.424354243542435, "grad_norm": 6.570593357086182, "learning_rate": 7.288958274198127e-05, "loss": 2.5551736831665037, "step": 19110 }, { "epoch": 5.427192733465796, "grad_norm": 6.763521671295166, "learning_rate": 7.287539029236446e-05, "loss": 2.5020227432250977, "step": 19120 }, { "epoch": 5.430031223389157, "grad_norm": 6.621832370758057, "learning_rate": 7.286119784274767e-05, "loss": 2.4768703460693358, "step": 19130 }, { "epoch": 5.432869713312518, "grad_norm": 6.980132102966309, "learning_rate": 7.284700539313086e-05, "loss": 2.5648818969726563, "step": 19140 }, { "epoch": 5.435708203235879, "grad_norm": 6.699887275695801, "learning_rate": 7.283281294351406e-05, "loss": 2.541429328918457, "step": 19150 }, { "epoch": 5.438546693159239, "grad_norm": 6.772872447967529, "learning_rate": 7.281862049389725e-05, "loss": 2.4852115631103517, "step": 19160 }, { "epoch": 5.4413851830826, "grad_norm": 6.66257905960083, "learning_rate": 7.280442804428044e-05, "loss": 2.5716556549072265, "step": 19170 }, { "epoch": 5.444223673005961, "grad_norm": 6.8739776611328125, "learning_rate": 7.279023559466365e-05, "loss": 2.549476432800293, "step": 19180 }, { "epoch": 5.447062162929321, "grad_norm": 7.106246471405029, "learning_rate": 7.277604314504683e-05, "loss": 2.560186004638672, "step": 
19190 }, { "epoch": 5.4499006528526825, "grad_norm": 6.694366455078125, "learning_rate": 7.276185069543004e-05, "loss": 2.5587568283081055, "step": 19200 }, { "epoch": 5.452739142776043, "grad_norm": 6.757915019989014, "learning_rate": 7.274765824581323e-05, "loss": 2.547775650024414, "step": 19210 }, { "epoch": 5.455577632699404, "grad_norm": 6.937742710113525, "learning_rate": 7.273346579619644e-05, "loss": 2.572384834289551, "step": 19220 }, { "epoch": 5.458416122622765, "grad_norm": 7.340616703033447, "learning_rate": 7.271927334657962e-05, "loss": 2.5598072052001952, "step": 19230 }, { "epoch": 5.461254612546125, "grad_norm": 6.993860721588135, "learning_rate": 7.270508089696282e-05, "loss": 2.530800628662109, "step": 19240 }, { "epoch": 5.464093102469486, "grad_norm": 6.5969157218933105, "learning_rate": 7.269088844734602e-05, "loss": 2.54754581451416, "step": 19250 }, { "epoch": 5.4669315923928465, "grad_norm": 6.53775691986084, "learning_rate": 7.267669599772921e-05, "loss": 2.534989356994629, "step": 19260 }, { "epoch": 5.469770082316208, "grad_norm": 6.468379497528076, "learning_rate": 7.26625035481124e-05, "loss": 2.486766815185547, "step": 19270 }, { "epoch": 5.472608572239569, "grad_norm": 7.247913837432861, "learning_rate": 7.26483110984956e-05, "loss": 2.5138158798217773, "step": 19280 }, { "epoch": 5.475447062162929, "grad_norm": 6.806623935699463, "learning_rate": 7.26341186488788e-05, "loss": 2.531719207763672, "step": 19290 }, { "epoch": 5.47828555208629, "grad_norm": 6.732532978057861, "learning_rate": 7.2619926199262e-05, "loss": 2.5333492279052736, "step": 19300 }, { "epoch": 5.481124042009651, "grad_norm": 7.529943466186523, "learning_rate": 7.260573374964519e-05, "loss": 2.538269805908203, "step": 19310 }, { "epoch": 5.483962531933011, "grad_norm": 6.6532883644104, "learning_rate": 7.259154130002838e-05, "loss": 2.507336235046387, "step": 19320 }, { "epoch": 5.4868010218563725, "grad_norm": 6.485095500946045, "learning_rate": 
7.257734885041159e-05, "loss": 2.5816976547241213, "step": 19330 }, { "epoch": 5.489639511779734, "grad_norm": 6.792401313781738, "learning_rate": 7.256315640079478e-05, "loss": 2.5133590698242188, "step": 19340 }, { "epoch": 5.492478001703094, "grad_norm": 6.879651069641113, "learning_rate": 7.254896395117798e-05, "loss": 2.566980743408203, "step": 19350 }, { "epoch": 5.495316491626455, "grad_norm": 6.8385162353515625, "learning_rate": 7.253477150156117e-05, "loss": 2.535706329345703, "step": 19360 }, { "epoch": 5.498154981549815, "grad_norm": 6.8929338455200195, "learning_rate": 7.252057905194436e-05, "loss": 2.5024648666381837, "step": 19370 }, { "epoch": 5.500993471473176, "grad_norm": 6.6974968910217285, "learning_rate": 7.250638660232757e-05, "loss": 2.5102712631225588, "step": 19380 }, { "epoch": 5.503831961396537, "grad_norm": 7.0171661376953125, "learning_rate": 7.249219415271075e-05, "loss": 2.566487693786621, "step": 19390 }, { "epoch": 5.506670451319898, "grad_norm": 7.125636100769043, "learning_rate": 7.247800170309396e-05, "loss": 2.5611270904541015, "step": 19400 }, { "epoch": 5.509508941243259, "grad_norm": 6.797091960906982, "learning_rate": 7.246380925347715e-05, "loss": 2.607290267944336, "step": 19410 }, { "epoch": 5.512347431166619, "grad_norm": 6.953138828277588, "learning_rate": 7.244961680386036e-05, "loss": 2.521720123291016, "step": 19420 }, { "epoch": 5.51518592108998, "grad_norm": 6.819103240966797, "learning_rate": 7.243542435424354e-05, "loss": 2.538585662841797, "step": 19430 }, { "epoch": 5.518024411013341, "grad_norm": 6.829318523406982, "learning_rate": 7.242123190462674e-05, "loss": 2.5431730270385744, "step": 19440 }, { "epoch": 5.520862900936701, "grad_norm": 6.692455291748047, "learning_rate": 7.240703945500994e-05, "loss": 2.504252052307129, "step": 19450 }, { "epoch": 5.5237013908600625, "grad_norm": 6.8009114265441895, "learning_rate": 7.239284700539314e-05, "loss": 2.587982177734375, "step": 19460 }, { "epoch": 
5.526539880783423, "grad_norm": 7.031310081481934, "learning_rate": 7.237865455577632e-05, "loss": 2.51947135925293, "step": 19470 }, { "epoch": 5.529378370706784, "grad_norm": 7.116189002990723, "learning_rate": 7.236446210615953e-05, "loss": 2.561794662475586, "step": 19480 }, { "epoch": 5.532216860630145, "grad_norm": 6.521947860717773, "learning_rate": 7.235026965654272e-05, "loss": 2.4908679962158202, "step": 19490 }, { "epoch": 5.535055350553505, "grad_norm": 6.5694756507873535, "learning_rate": 7.233607720692592e-05, "loss": 2.619573974609375, "step": 19500 }, { "epoch": 5.535055350553505, "eval_accuracy": 0.2617155210784002, "eval_loss": 2.8184914588928223, "eval_runtime": 49.9009, "eval_samples_per_second": 315.165, "eval_steps_per_second": 4.93, "step": 19500 }, { "epoch": 5.537893840476866, "grad_norm": 6.95618200302124, "learning_rate": 7.232188475730911e-05, "loss": 2.5338531494140626, "step": 19510 }, { "epoch": 5.540732330400227, "grad_norm": 6.875465393066406, "learning_rate": 7.23076923076923e-05, "loss": 2.48843936920166, "step": 19520 }, { "epoch": 5.543570820323588, "grad_norm": 6.858250141143799, "learning_rate": 7.229349985807551e-05, "loss": 2.520702934265137, "step": 19530 }, { "epoch": 5.546409310246949, "grad_norm": 6.546832084655762, "learning_rate": 7.22793074084587e-05, "loss": 2.427105712890625, "step": 19540 }, { "epoch": 5.54924780017031, "grad_norm": 6.668045997619629, "learning_rate": 7.22651149588419e-05, "loss": 2.5358224868774415, "step": 19550 }, { "epoch": 5.55208629009367, "grad_norm": 6.718686103820801, "learning_rate": 7.225092250922509e-05, "loss": 2.5447790145874025, "step": 19560 }, { "epoch": 5.554924780017031, "grad_norm": 6.512726783752441, "learning_rate": 7.22367300596083e-05, "loss": 2.517647933959961, "step": 19570 }, { "epoch": 5.557763269940391, "grad_norm": 6.903363227844238, "learning_rate": 7.222253760999149e-05, "loss": 2.578038787841797, "step": 19580 }, { "epoch": 5.5606017598637525, "grad_norm": 
6.820156097412109, "learning_rate": 7.220834516037468e-05, "loss": 2.556728744506836, "step": 19590 }, { "epoch": 5.563440249787114, "grad_norm": 6.822120189666748, "learning_rate": 7.219415271075788e-05, "loss": 2.553522300720215, "step": 19600 }, { "epoch": 5.566278739710474, "grad_norm": 6.766408920288086, "learning_rate": 7.217996026114107e-05, "loss": 2.498373603820801, "step": 19610 }, { "epoch": 5.569117229633835, "grad_norm": 6.507923126220703, "learning_rate": 7.216576781152428e-05, "loss": 2.489767837524414, "step": 19620 }, { "epoch": 5.571955719557195, "grad_norm": 6.8064470291137695, "learning_rate": 7.215157536190746e-05, "loss": 2.447887420654297, "step": 19630 }, { "epoch": 5.574794209480556, "grad_norm": 7.176397323608398, "learning_rate": 7.213738291229066e-05, "loss": 2.527029800415039, "step": 19640 }, { "epoch": 5.577632699403917, "grad_norm": 6.757180690765381, "learning_rate": 7.212319046267386e-05, "loss": 2.56481819152832, "step": 19650 }, { "epoch": 5.580471189327278, "grad_norm": 6.689793586730957, "learning_rate": 7.210899801305707e-05, "loss": 2.547587585449219, "step": 19660 }, { "epoch": 5.583309679250639, "grad_norm": 6.629839897155762, "learning_rate": 7.209480556344024e-05, "loss": 2.574169731140137, "step": 19670 }, { "epoch": 5.586148169174, "grad_norm": 6.835451126098633, "learning_rate": 7.208061311382345e-05, "loss": 2.4824146270751952, "step": 19680 }, { "epoch": 5.58898665909736, "grad_norm": 6.6336870193481445, "learning_rate": 7.206642066420665e-05, "loss": 2.53753547668457, "step": 19690 }, { "epoch": 5.591825149020721, "grad_norm": 6.87119197845459, "learning_rate": 7.205222821458985e-05, "loss": 2.6050891876220703, "step": 19700 }, { "epoch": 5.594663638944081, "grad_norm": 7.317224979400635, "learning_rate": 7.203803576497303e-05, "loss": 2.5334381103515624, "step": 19710 }, { "epoch": 5.5975021288674425, "grad_norm": 6.8242645263671875, "learning_rate": 7.202384331535624e-05, "loss": 2.469470405578613, "step": 19720 
}, { "epoch": 5.600340618790804, "grad_norm": 6.627498149871826, "learning_rate": 7.200965086573943e-05, "loss": 2.574750518798828, "step": 19730 }, { "epoch": 5.603179108714164, "grad_norm": 6.971779823303223, "learning_rate": 7.199545841612263e-05, "loss": 2.532364082336426, "step": 19740 }, { "epoch": 5.606017598637525, "grad_norm": 6.676385879516602, "learning_rate": 7.198126596650582e-05, "loss": 2.528499794006348, "step": 19750 }, { "epoch": 5.608856088560886, "grad_norm": 6.9522600173950195, "learning_rate": 7.196707351688901e-05, "loss": 2.5297107696533203, "step": 19760 }, { "epoch": 5.611694578484246, "grad_norm": 6.532268047332764, "learning_rate": 7.195288106727222e-05, "loss": 2.4818136215209963, "step": 19770 }, { "epoch": 5.614533068407607, "grad_norm": 6.802280426025391, "learning_rate": 7.193868861765541e-05, "loss": 2.503515625, "step": 19780 }, { "epoch": 5.617371558330968, "grad_norm": 6.303707122802734, "learning_rate": 7.19244961680386e-05, "loss": 2.465984344482422, "step": 19790 }, { "epoch": 5.620210048254329, "grad_norm": 6.709959506988525, "learning_rate": 7.19103037184218e-05, "loss": 2.5424245834350585, "step": 19800 }, { "epoch": 5.62304853817769, "grad_norm": 6.832473278045654, "learning_rate": 7.1896111268805e-05, "loss": 2.560934066772461, "step": 19810 }, { "epoch": 5.62588702810105, "grad_norm": 6.583501815795898, "learning_rate": 7.18819188191882e-05, "loss": 2.456637382507324, "step": 19820 }, { "epoch": 5.628725518024411, "grad_norm": 6.908890724182129, "learning_rate": 7.186772636957139e-05, "loss": 2.489703559875488, "step": 19830 }, { "epoch": 5.631564007947771, "grad_norm": 7.152108669281006, "learning_rate": 7.185353391995459e-05, "loss": 2.404699516296387, "step": 19840 }, { "epoch": 5.6344024978711325, "grad_norm": 6.6141676902771, "learning_rate": 7.183934147033778e-05, "loss": 2.521250915527344, "step": 19850 }, { "epoch": 5.637240987794494, "grad_norm": 7.072674751281738, "learning_rate": 7.182514902072099e-05, 
"loss": 2.6069984436035156, "step": 19860 }, { "epoch": 5.640079477717854, "grad_norm": 6.7916412353515625, "learning_rate": 7.181095657110417e-05, "loss": 2.416520118713379, "step": 19870 }, { "epoch": 5.642917967641215, "grad_norm": 6.82216215133667, "learning_rate": 7.179676412148737e-05, "loss": 2.6216094970703123, "step": 19880 }, { "epoch": 5.645756457564576, "grad_norm": 7.2306952476501465, "learning_rate": 7.178257167187057e-05, "loss": 2.5388952255249024, "step": 19890 }, { "epoch": 5.648594947487936, "grad_norm": 7.368190288543701, "learning_rate": 7.176837922225377e-05, "loss": 2.6218284606933593, "step": 19900 }, { "epoch": 5.651433437411297, "grad_norm": 7.094532489776611, "learning_rate": 7.175418677263695e-05, "loss": 2.5568668365478517, "step": 19910 }, { "epoch": 5.654271927334658, "grad_norm": 6.755252838134766, "learning_rate": 7.173999432302016e-05, "loss": 2.550445556640625, "step": 19920 }, { "epoch": 5.657110417258019, "grad_norm": 6.87607479095459, "learning_rate": 7.172580187340335e-05, "loss": 2.5933811187744142, "step": 19930 }, { "epoch": 5.65994890718138, "grad_norm": 7.120889186859131, "learning_rate": 7.171160942378655e-05, "loss": 2.525315856933594, "step": 19940 }, { "epoch": 5.66278739710474, "grad_norm": 6.575016975402832, "learning_rate": 7.169741697416974e-05, "loss": 2.528484916687012, "step": 19950 }, { "epoch": 5.665625887028101, "grad_norm": 6.962579727172852, "learning_rate": 7.168322452455293e-05, "loss": 2.538674545288086, "step": 19960 }, { "epoch": 5.668464376951462, "grad_norm": 6.5920796394348145, "learning_rate": 7.166903207493614e-05, "loss": 2.5624441146850585, "step": 19970 }, { "epoch": 5.671302866874822, "grad_norm": 6.733766555786133, "learning_rate": 7.165483962531933e-05, "loss": 2.5596424102783204, "step": 19980 }, { "epoch": 5.6741413567981835, "grad_norm": 6.6391282081604, "learning_rate": 7.164064717570253e-05, "loss": 2.5003742218017577, "step": 19990 }, { "epoch": 5.676979846721544, "grad_norm": 
7.191015243530273, "learning_rate": 7.162645472608572e-05, "loss": 2.537186622619629, "step": 20000 }, { "epoch": 5.676979846721544, "eval_accuracy": 0.2647040122083042, "eval_loss": 2.813124418258667, "eval_runtime": 47.7074, "eval_samples_per_second": 329.655, "eval_steps_per_second": 5.156, "step": 20000 }, { "epoch": 5.679818336644905, "grad_norm": 7.006134986877441, "learning_rate": 7.161226227646893e-05, "loss": 2.463343048095703, "step": 20010 }, { "epoch": 5.682656826568266, "grad_norm": 6.969027519226074, "learning_rate": 7.159806982685212e-05, "loss": 2.509832763671875, "step": 20020 }, { "epoch": 5.685495316491626, "grad_norm": 6.608606338500977, "learning_rate": 7.158387737723531e-05, "loss": 2.4888885498046873, "step": 20030 }, { "epoch": 5.688333806414987, "grad_norm": 6.399748802185059, "learning_rate": 7.15696849276185e-05, "loss": 2.538615417480469, "step": 20040 }, { "epoch": 5.6911722963383475, "grad_norm": 6.228291988372803, "learning_rate": 7.155549247800171e-05, "loss": 2.5225868225097656, "step": 20050 }, { "epoch": 5.694010786261709, "grad_norm": 6.845673084259033, "learning_rate": 7.15413000283849e-05, "loss": 2.6034759521484374, "step": 20060 }, { "epoch": 5.69684927618507, "grad_norm": 6.501115798950195, "learning_rate": 7.15271075787681e-05, "loss": 2.5472707748413086, "step": 20070 }, { "epoch": 5.69968776610843, "grad_norm": 6.842172145843506, "learning_rate": 7.151291512915129e-05, "loss": 2.5390657424926757, "step": 20080 }, { "epoch": 5.702526256031791, "grad_norm": 7.036561965942383, "learning_rate": 7.149872267953449e-05, "loss": 2.5825851440429686, "step": 20090 }, { "epoch": 5.705364745955152, "grad_norm": 6.713914394378662, "learning_rate": 7.14845302299177e-05, "loss": 2.527890205383301, "step": 20100 }, { "epoch": 5.708203235878512, "grad_norm": 7.178467273712158, "learning_rate": 7.147033778030089e-05, "loss": 2.551700782775879, "step": 20110 }, { "epoch": 5.7110417258018735, "grad_norm": 6.845144271850586, "learning_rate": 
7.145614533068408e-05, "loss": 2.598997688293457, "step": 20120 }, { "epoch": 5.713880215725235, "grad_norm": 6.924452781677246, "learning_rate": 7.144195288106727e-05, "loss": 2.5517066955566405, "step": 20130 }, { "epoch": 5.716718705648595, "grad_norm": 6.9148993492126465, "learning_rate": 7.142776043145048e-05, "loss": 2.5608295440673827, "step": 20140 }, { "epoch": 5.719557195571956, "grad_norm": 6.86804723739624, "learning_rate": 7.141356798183367e-05, "loss": 2.4715065002441405, "step": 20150 }, { "epoch": 5.722395685495316, "grad_norm": 6.783485412597656, "learning_rate": 7.139937553221687e-05, "loss": 2.538326644897461, "step": 20160 }, { "epoch": 5.725234175418677, "grad_norm": 6.744363307952881, "learning_rate": 7.138518308260006e-05, "loss": 2.5318475723266602, "step": 20170 }, { "epoch": 5.728072665342038, "grad_norm": 6.965782642364502, "learning_rate": 7.137099063298325e-05, "loss": 2.5939661026000977, "step": 20180 }, { "epoch": 5.730911155265399, "grad_norm": 6.6788010597229, "learning_rate": 7.135679818336646e-05, "loss": 2.5367008209228517, "step": 20190 }, { "epoch": 5.73374964518876, "grad_norm": 7.502912998199463, "learning_rate": 7.134260573374964e-05, "loss": 2.524629020690918, "step": 20200 }, { "epoch": 5.73658813511212, "grad_norm": 6.871400833129883, "learning_rate": 7.132841328413285e-05, "loss": 2.4907482147216795, "step": 20210 }, { "epoch": 5.739426625035481, "grad_norm": 7.301566123962402, "learning_rate": 7.131422083451604e-05, "loss": 2.486821746826172, "step": 20220 }, { "epoch": 5.742265114958842, "grad_norm": 6.522151947021484, "learning_rate": 7.130002838489925e-05, "loss": 2.524544334411621, "step": 20230 }, { "epoch": 5.745103604882202, "grad_norm": 6.757544994354248, "learning_rate": 7.128583593528243e-05, "loss": 2.4787668228149413, "step": 20240 }, { "epoch": 5.7479420948055635, "grad_norm": 6.700396537780762, "learning_rate": 7.127164348566563e-05, "loss": 2.5944311141967775, "step": 20250 }, { "epoch": 
5.750780584728924, "grad_norm": 6.753966331481934, "learning_rate": 7.125745103604883e-05, "loss": 2.4589111328125, "step": 20260 }, { "epoch": 5.753619074652285, "grad_norm": 7.006554126739502, "learning_rate": 7.124325858643202e-05, "loss": 2.555069923400879, "step": 20270 }, { "epoch": 5.756457564575646, "grad_norm": 6.3132829666137695, "learning_rate": 7.122906613681521e-05, "loss": 2.5059650421142576, "step": 20280 }, { "epoch": 5.759296054499006, "grad_norm": 6.8315863609313965, "learning_rate": 7.121487368719841e-05, "loss": 2.536189651489258, "step": 20290 }, { "epoch": 5.762134544422367, "grad_norm": 6.80426549911499, "learning_rate": 7.120068123758161e-05, "loss": 2.5431081771850588, "step": 20300 }, { "epoch": 5.764973034345728, "grad_norm": 6.650258541107178, "learning_rate": 7.118648878796481e-05, "loss": 2.4582359313964846, "step": 20310 }, { "epoch": 5.767811524269089, "grad_norm": 7.024355411529541, "learning_rate": 7.1172296338348e-05, "loss": 2.475737380981445, "step": 20320 }, { "epoch": 5.77065001419245, "grad_norm": 6.482913970947266, "learning_rate": 7.11581038887312e-05, "loss": 2.507969093322754, "step": 20330 }, { "epoch": 5.773488504115811, "grad_norm": 6.874989032745361, "learning_rate": 7.11439114391144e-05, "loss": 2.4990991592407226, "step": 20340 }, { "epoch": 5.776326994039171, "grad_norm": 7.24768590927124, "learning_rate": 7.11297189894976e-05, "loss": 2.5982549667358397, "step": 20350 }, { "epoch": 5.779165483962532, "grad_norm": 6.725311279296875, "learning_rate": 7.111552653988079e-05, "loss": 2.5373085021972654, "step": 20360 }, { "epoch": 5.782003973885892, "grad_norm": 6.780360221862793, "learning_rate": 7.110133409026398e-05, "loss": 2.5024124145507813, "step": 20370 }, { "epoch": 5.7848424638092535, "grad_norm": 6.922391414642334, "learning_rate": 7.108714164064719e-05, "loss": 2.5970891952514648, "step": 20380 }, { "epoch": 5.787680953732615, "grad_norm": 7.149880409240723, "learning_rate": 7.107294919103038e-05, "loss": 
2.5583213806152343, "step": 20390 }, { "epoch": 5.790519443655975, "grad_norm": 6.7433552742004395, "learning_rate": 7.105875674141357e-05, "loss": 2.4724056243896486, "step": 20400 }, { "epoch": 5.793357933579336, "grad_norm": 6.691697120666504, "learning_rate": 7.104456429179677e-05, "loss": 2.5170589447021485, "step": 20410 }, { "epoch": 5.796196423502696, "grad_norm": 6.751977920532227, "learning_rate": 7.103037184217996e-05, "loss": 2.4706077575683594, "step": 20420 }, { "epoch": 5.799034913426057, "grad_norm": 6.828762054443359, "learning_rate": 7.101617939256317e-05, "loss": 2.506112480163574, "step": 20430 }, { "epoch": 5.801873403349418, "grad_norm": 7.130664825439453, "learning_rate": 7.100198694294635e-05, "loss": 2.4255701065063477, "step": 20440 }, { "epoch": 5.804711893272779, "grad_norm": 6.889125347137451, "learning_rate": 7.098779449332955e-05, "loss": 2.5740604400634766, "step": 20450 }, { "epoch": 5.80755038319614, "grad_norm": 6.84147310256958, "learning_rate": 7.097360204371275e-05, "loss": 2.568767547607422, "step": 20460 }, { "epoch": 5.810388873119501, "grad_norm": 6.726868629455566, "learning_rate": 7.095940959409595e-05, "loss": 2.592555809020996, "step": 20470 }, { "epoch": 5.813227363042861, "grad_norm": 6.95125675201416, "learning_rate": 7.094521714447913e-05, "loss": 2.473855972290039, "step": 20480 }, { "epoch": 5.816065852966222, "grad_norm": 7.0998029708862305, "learning_rate": 7.093102469486234e-05, "loss": 2.432787322998047, "step": 20490 }, { "epoch": 5.818904342889582, "grad_norm": 6.59541130065918, "learning_rate": 7.091683224524553e-05, "loss": 2.4540586471557617, "step": 20500 }, { "epoch": 5.818904342889582, "eval_accuracy": 0.2689642016913588, "eval_loss": 2.7972848415374756, "eval_runtime": 47.843, "eval_samples_per_second": 328.721, "eval_steps_per_second": 5.142, "step": 20500 }, { "epoch": 5.8217428328129435, "grad_norm": 6.7478790283203125, "learning_rate": 7.090263979562873e-05, "loss": 2.4919456481933593, "step": 
20510 }, { "epoch": 5.824581322736305, "grad_norm": 6.660475730895996, "learning_rate": 7.088844734601192e-05, "loss": 2.5060956954956053, "step": 20520 }, { "epoch": 5.827419812659665, "grad_norm": 6.8873114585876465, "learning_rate": 7.087425489639511e-05, "loss": 2.5599834442138674, "step": 20530 }, { "epoch": 5.830258302583026, "grad_norm": 6.492130279541016, "learning_rate": 7.086006244677832e-05, "loss": 2.530929756164551, "step": 20540 }, { "epoch": 5.833096792506387, "grad_norm": 6.4779815673828125, "learning_rate": 7.084586999716151e-05, "loss": 2.538738250732422, "step": 20550 }, { "epoch": 5.835935282429747, "grad_norm": 7.057785987854004, "learning_rate": 7.083167754754471e-05, "loss": 2.6110065460205076, "step": 20560 }, { "epoch": 5.838773772353108, "grad_norm": 6.8963189125061035, "learning_rate": 7.08174850979279e-05, "loss": 2.537694549560547, "step": 20570 }, { "epoch": 5.841612262276469, "grad_norm": 6.653097152709961, "learning_rate": 7.080329264831111e-05, "loss": 2.500651550292969, "step": 20580 }, { "epoch": 5.84445075219983, "grad_norm": 6.909927845001221, "learning_rate": 7.07891001986943e-05, "loss": 2.5188337326049806, "step": 20590 }, { "epoch": 5.847289242123191, "grad_norm": 6.520979404449463, "learning_rate": 7.07749077490775e-05, "loss": 2.523187828063965, "step": 20600 }, { "epoch": 5.850127732046551, "grad_norm": 6.843981742858887, "learning_rate": 7.076071529946069e-05, "loss": 2.570191764831543, "step": 20610 }, { "epoch": 5.852966221969912, "grad_norm": 6.502979278564453, "learning_rate": 7.074652284984388e-05, "loss": 2.469241905212402, "step": 20620 }, { "epoch": 5.855804711893272, "grad_norm": 7.121973991394043, "learning_rate": 7.073233040022709e-05, "loss": 2.4979753494262695, "step": 20630 }, { "epoch": 5.8586432018166335, "grad_norm": 6.928045272827148, "learning_rate": 7.071813795061027e-05, "loss": 2.4878477096557616, "step": 20640 }, { "epoch": 5.861481691739995, "grad_norm": 7.194851398468018, "learning_rate": 
7.070394550099348e-05, "loss": 2.494437026977539, "step": 20650 }, { "epoch": 5.864320181663355, "grad_norm": 6.7079176902771, "learning_rate": 7.068975305137667e-05, "loss": 2.5201053619384766, "step": 20660 }, { "epoch": 5.867158671586716, "grad_norm": 6.38192081451416, "learning_rate": 7.067556060175988e-05, "loss": 2.5126012802124023, "step": 20670 }, { "epoch": 5.869997161510077, "grad_norm": 6.951531410217285, "learning_rate": 7.066136815214305e-05, "loss": 2.5405000686645507, "step": 20680 }, { "epoch": 5.872835651433437, "grad_norm": 7.027202129364014, "learning_rate": 7.064717570252626e-05, "loss": 2.570561981201172, "step": 20690 }, { "epoch": 5.875674141356798, "grad_norm": 6.898168087005615, "learning_rate": 7.063298325290946e-05, "loss": 2.535934638977051, "step": 20700 }, { "epoch": 5.878512631280159, "grad_norm": 6.606186866760254, "learning_rate": 7.061879080329266e-05, "loss": 2.5048244476318358, "step": 20710 }, { "epoch": 5.88135112120352, "grad_norm": 6.843324184417725, "learning_rate": 7.060459835367584e-05, "loss": 2.532893180847168, "step": 20720 }, { "epoch": 5.884189611126881, "grad_norm": 6.669724941253662, "learning_rate": 7.059040590405905e-05, "loss": 2.5600088119506834, "step": 20730 }, { "epoch": 5.887028101050241, "grad_norm": 6.970189094543457, "learning_rate": 7.057621345444224e-05, "loss": 2.5705385208129883, "step": 20740 }, { "epoch": 5.889866590973602, "grad_norm": 6.688689231872559, "learning_rate": 7.056202100482544e-05, "loss": 2.613357734680176, "step": 20750 }, { "epoch": 5.892705080896963, "grad_norm": 7.085455894470215, "learning_rate": 7.054782855520863e-05, "loss": 2.5376073837280275, "step": 20760 }, { "epoch": 5.895543570820323, "grad_norm": 6.648007392883301, "learning_rate": 7.053363610559182e-05, "loss": 2.46313591003418, "step": 20770 }, { "epoch": 5.8983820607436845, "grad_norm": 6.808741569519043, "learning_rate": 7.051944365597503e-05, "loss": 2.4952651977539064, "step": 20780 }, { "epoch": 5.901220550667045, 
"grad_norm": 6.750223636627197, "learning_rate": 7.050525120635822e-05, "loss": 2.4958494186401365, "step": 20790 }, { "epoch": 5.904059040590406, "grad_norm": 6.691183567047119, "learning_rate": 7.049105875674142e-05, "loss": 2.537097358703613, "step": 20800 }, { "epoch": 5.906897530513767, "grad_norm": 7.069681167602539, "learning_rate": 7.047686630712461e-05, "loss": 2.501120758056641, "step": 20810 }, { "epoch": 5.909736020437127, "grad_norm": 6.918638706207275, "learning_rate": 7.046267385750782e-05, "loss": 2.552318572998047, "step": 20820 }, { "epoch": 5.912574510360488, "grad_norm": 6.852471351623535, "learning_rate": 7.044848140789101e-05, "loss": 2.4872804641723634, "step": 20830 }, { "epoch": 5.9154130002838485, "grad_norm": 7.248600959777832, "learning_rate": 7.04342889582742e-05, "loss": 2.4940589904785155, "step": 20840 }, { "epoch": 5.91825149020721, "grad_norm": 7.272073268890381, "learning_rate": 7.04200965086574e-05, "loss": 2.508287048339844, "step": 20850 }, { "epoch": 5.921089980130571, "grad_norm": 6.771476745605469, "learning_rate": 7.040590405904059e-05, "loss": 2.5061481475830076, "step": 20860 }, { "epoch": 5.923928470053931, "grad_norm": 6.60063362121582, "learning_rate": 7.03917116094238e-05, "loss": 2.5167753219604494, "step": 20870 }, { "epoch": 5.926766959977292, "grad_norm": 6.561298847198486, "learning_rate": 7.037751915980698e-05, "loss": 2.562516784667969, "step": 20880 }, { "epoch": 5.929605449900653, "grad_norm": 6.763286113739014, "learning_rate": 7.036332671019018e-05, "loss": 2.4988426208496093, "step": 20890 }, { "epoch": 5.932443939824013, "grad_norm": 7.327254772186279, "learning_rate": 7.034913426057338e-05, "loss": 2.526174545288086, "step": 20900 }, { "epoch": 5.9352824297473745, "grad_norm": 6.821061611175537, "learning_rate": 7.033494181095658e-05, "loss": 2.465433883666992, "step": 20910 }, { "epoch": 5.938120919670736, "grad_norm": 6.904401779174805, "learning_rate": 7.032074936133976e-05, "loss": 2.498979187011719, 
"step": 20920 }, { "epoch": 5.940959409594096, "grad_norm": 7.060030937194824, "learning_rate": 7.030655691172297e-05, "loss": 2.4567630767822264, "step": 20930 }, { "epoch": 5.943797899517457, "grad_norm": 6.621822357177734, "learning_rate": 7.029236446210616e-05, "loss": 2.498345947265625, "step": 20940 }, { "epoch": 5.946636389440817, "grad_norm": 6.775069236755371, "learning_rate": 7.027817201248937e-05, "loss": 2.539948654174805, "step": 20950 }, { "epoch": 5.949474879364178, "grad_norm": 6.859158992767334, "learning_rate": 7.026397956287255e-05, "loss": 2.586857223510742, "step": 20960 }, { "epoch": 5.952313369287539, "grad_norm": 6.65178108215332, "learning_rate": 7.024978711325576e-05, "loss": 2.523001861572266, "step": 20970 }, { "epoch": 5.9551518592109, "grad_norm": 6.558193206787109, "learning_rate": 7.023559466363895e-05, "loss": 2.592412567138672, "step": 20980 }, { "epoch": 5.957990349134261, "grad_norm": 6.529144763946533, "learning_rate": 7.022140221402214e-05, "loss": 2.5465497970581055, "step": 20990 }, { "epoch": 5.960828839057621, "grad_norm": 6.310122966766357, "learning_rate": 7.020720976440534e-05, "loss": 2.503729248046875, "step": 21000 }, { "epoch": 5.960828839057621, "eval_accuracy": 0.2750047688688243, "eval_loss": 2.7828288078308105, "eval_runtime": 53.5747, "eval_samples_per_second": 293.553, "eval_steps_per_second": 4.592, "step": 21000 }, { "epoch": 5.963667328980982, "grad_norm": 6.967732906341553, "learning_rate": 7.019301731478853e-05, "loss": 2.552545166015625, "step": 21010 }, { "epoch": 5.966505818904343, "grad_norm": 6.625184059143066, "learning_rate": 7.017882486517174e-05, "loss": 2.4201854705810546, "step": 21020 }, { "epoch": 5.969344308827703, "grad_norm": 6.654976844787598, "learning_rate": 7.016463241555493e-05, "loss": 2.519451904296875, "step": 21030 }, { "epoch": 5.9721827987510645, "grad_norm": 6.570440769195557, "learning_rate": 7.015043996593812e-05, "loss": 2.459340476989746, "step": 21040 }, { "epoch": 
5.975021288674425, "grad_norm": 6.779837131500244, "learning_rate": 7.013624751632132e-05, "loss": 2.394080924987793, "step": 21050 }, { "epoch": 5.977859778597786, "grad_norm": 7.0436787605285645, "learning_rate": 7.012205506670452e-05, "loss": 2.537100601196289, "step": 21060 }, { "epoch": 5.980698268521147, "grad_norm": 6.714550495147705, "learning_rate": 7.010786261708772e-05, "loss": 2.528063201904297, "step": 21070 }, { "epoch": 5.983536758444507, "grad_norm": 6.747709274291992, "learning_rate": 7.009367016747091e-05, "loss": 2.4953945159912108, "step": 21080 }, { "epoch": 5.986375248367868, "grad_norm": 6.760087966918945, "learning_rate": 7.00794777178541e-05, "loss": 2.5443626403808595, "step": 21090 }, { "epoch": 5.989213738291229, "grad_norm": 7.111458778381348, "learning_rate": 7.00652852682373e-05, "loss": 2.5695425033569337, "step": 21100 }, { "epoch": 5.99205222821459, "grad_norm": 7.101138591766357, "learning_rate": 7.00510928186205e-05, "loss": 2.44791202545166, "step": 21110 }, { "epoch": 5.994890718137951, "grad_norm": 6.953344345092773, "learning_rate": 7.003690036900368e-05, "loss": 2.4756874084472655, "step": 21120 }, { "epoch": 5.997729208061312, "grad_norm": 6.81596565246582, "learning_rate": 7.002270791938689e-05, "loss": 2.4637969970703124, "step": 21130 }, { "epoch": 6.000567697984672, "grad_norm": 6.982290267944336, "learning_rate": 7.000851546977008e-05, "loss": 2.483654022216797, "step": 21140 }, { "epoch": 6.003406187908033, "grad_norm": 7.088365077972412, "learning_rate": 6.999432302015329e-05, "loss": 2.4067989349365235, "step": 21150 }, { "epoch": 6.006244677831393, "grad_norm": 6.739979267120361, "learning_rate": 6.998013057053647e-05, "loss": 2.438406753540039, "step": 21160 }, { "epoch": 6.0090831677547545, "grad_norm": 7.108844757080078, "learning_rate": 6.996593812091968e-05, "loss": 2.5760658264160154, "step": 21170 }, { "epoch": 6.011921657678116, "grad_norm": 6.890646934509277, "learning_rate": 6.995174567130287e-05, "loss": 
2.4253656387329103, "step": 21180 }, { "epoch": 6.014760147601476, "grad_norm": 6.5961995124816895, "learning_rate": 6.993755322168606e-05, "loss": 2.4518707275390623, "step": 21190 }, { "epoch": 6.017598637524837, "grad_norm": 6.96980619430542, "learning_rate": 6.992336077206926e-05, "loss": 2.491604804992676, "step": 21200 }, { "epoch": 6.020437127448197, "grad_norm": 6.331051826477051, "learning_rate": 6.990916832245245e-05, "loss": 2.469913101196289, "step": 21210 }, { "epoch": 6.023275617371558, "grad_norm": 6.839109420776367, "learning_rate": 6.989497587283566e-05, "loss": 2.5031511306762697, "step": 21220 }, { "epoch": 6.026114107294919, "grad_norm": 6.476175785064697, "learning_rate": 6.988078342321885e-05, "loss": 2.3927692413330077, "step": 21230 }, { "epoch": 6.02895259721828, "grad_norm": 6.818885326385498, "learning_rate": 6.986659097360204e-05, "loss": 2.4873600006103516, "step": 21240 }, { "epoch": 6.031791087141641, "grad_norm": 7.17777681350708, "learning_rate": 6.985239852398524e-05, "loss": 2.5899978637695313, "step": 21250 }, { "epoch": 6.034629577065002, "grad_norm": 6.353330135345459, "learning_rate": 6.983820607436844e-05, "loss": 2.5307075500488283, "step": 21260 }, { "epoch": 6.037468066988362, "grad_norm": 6.424325466156006, "learning_rate": 6.982401362475164e-05, "loss": 2.497431182861328, "step": 21270 }, { "epoch": 6.040306556911723, "grad_norm": 6.705902576446533, "learning_rate": 6.980982117513483e-05, "loss": 2.5162792205810547, "step": 21280 }, { "epoch": 6.043145046835083, "grad_norm": 6.405076503753662, "learning_rate": 6.979562872551802e-05, "loss": 2.4813253402709963, "step": 21290 }, { "epoch": 6.0459835367584445, "grad_norm": 6.855953216552734, "learning_rate": 6.978143627590123e-05, "loss": 2.4956783294677733, "step": 21300 }, { "epoch": 6.048822026681806, "grad_norm": 6.9795122146606445, "learning_rate": 6.976724382628442e-05, "loss": 2.4666702270507814, "step": 21310 }, { "epoch": 6.051660516605166, "grad_norm": 
6.918363094329834, "learning_rate": 6.975305137666762e-05, "loss": 2.4914283752441406, "step": 21320 }, { "epoch": 6.054499006528527, "grad_norm": 6.878627777099609, "learning_rate": 6.973885892705081e-05, "loss": 2.564632797241211, "step": 21330 }, { "epoch": 6.057337496451888, "grad_norm": 6.725796222686768, "learning_rate": 6.9724666477434e-05, "loss": 2.417620849609375, "step": 21340 }, { "epoch": 6.060175986375248, "grad_norm": 6.457788944244385, "learning_rate": 6.971047402781721e-05, "loss": 2.4324045181274414, "step": 21350 }, { "epoch": 6.063014476298609, "grad_norm": 6.750539302825928, "learning_rate": 6.969628157820039e-05, "loss": 2.440740203857422, "step": 21360 }, { "epoch": 6.06585296622197, "grad_norm": 6.572129249572754, "learning_rate": 6.96820891285836e-05, "loss": 2.4947858810424806, "step": 21370 }, { "epoch": 6.068691456145331, "grad_norm": 6.766547679901123, "learning_rate": 6.966789667896679e-05, "loss": 2.5084468841552736, "step": 21380 }, { "epoch": 6.071529946068692, "grad_norm": 6.873720169067383, "learning_rate": 6.965370422935e-05, "loss": 2.3982391357421875, "step": 21390 }, { "epoch": 6.074368435992052, "grad_norm": 6.966355800628662, "learning_rate": 6.963951177973318e-05, "loss": 2.523390197753906, "step": 21400 }, { "epoch": 6.077206925915413, "grad_norm": 6.93231201171875, "learning_rate": 6.962531933011638e-05, "loss": 2.4610429763793946, "step": 21410 }, { "epoch": 6.080045415838773, "grad_norm": 6.459434509277344, "learning_rate": 6.961112688049958e-05, "loss": 2.4413177490234377, "step": 21420 }, { "epoch": 6.0828839057621344, "grad_norm": 6.672303199768066, "learning_rate": 6.959693443088277e-05, "loss": 2.47552547454834, "step": 21430 }, { "epoch": 6.085722395685496, "grad_norm": 6.707478046417236, "learning_rate": 6.958274198126598e-05, "loss": 2.475879096984863, "step": 21440 }, { "epoch": 6.088560885608856, "grad_norm": 6.275786876678467, "learning_rate": 6.956854953164916e-05, "loss": 2.3244625091552735, "step": 21450 
}, { "epoch": 6.091399375532217, "grad_norm": 6.784721851348877, "learning_rate": 6.955435708203236e-05, "loss": 2.5037551879882813, "step": 21460 }, { "epoch": 6.094237865455578, "grad_norm": 6.917561054229736, "learning_rate": 6.954016463241556e-05, "loss": 2.4509925842285156, "step": 21470 }, { "epoch": 6.097076355378938, "grad_norm": 6.613241672515869, "learning_rate": 6.952597218279876e-05, "loss": 2.4812143325805662, "step": 21480 }, { "epoch": 6.099914845302299, "grad_norm": 6.888500690460205, "learning_rate": 6.951177973318194e-05, "loss": 2.556718635559082, "step": 21490 }, { "epoch": 6.1027533352256595, "grad_norm": 6.536261081695557, "learning_rate": 6.949758728356515e-05, "loss": 2.4298770904541014, "step": 21500 }, { "epoch": 6.1027533352256595, "eval_accuracy": 0.27068099446811217, "eval_loss": 2.771641731262207, "eval_runtime": 53.2816, "eval_samples_per_second": 295.167, "eval_steps_per_second": 4.617, "step": 21500 }, { "epoch": 6.105591825149021, "grad_norm": 6.773435592651367, "learning_rate": 6.948339483394834e-05, "loss": 2.4030799865722656, "step": 21510 }, { "epoch": 6.108430315072382, "grad_norm": 7.21008825302124, "learning_rate": 6.946920238433154e-05, "loss": 2.501536560058594, "step": 21520 }, { "epoch": 6.111268804995742, "grad_norm": 7.214860439300537, "learning_rate": 6.945500993471473e-05, "loss": 2.5215057373046874, "step": 21530 }, { "epoch": 6.114107294919103, "grad_norm": 6.542391300201416, "learning_rate": 6.944081748509792e-05, "loss": 2.4818981170654295, "step": 21540 }, { "epoch": 6.116945784842464, "grad_norm": 6.854196071624756, "learning_rate": 6.942662503548113e-05, "loss": 2.4802417755126953, "step": 21550 }, { "epoch": 6.119784274765824, "grad_norm": 6.696768283843994, "learning_rate": 6.941243258586432e-05, "loss": 2.453189468383789, "step": 21560 }, { "epoch": 6.1226227646891855, "grad_norm": 6.844273567199707, "learning_rate": 6.939824013624752e-05, "loss": 2.509267807006836, "step": 21570 }, { "epoch": 
6.125461254612546, "grad_norm": 6.762083053588867, "learning_rate": 6.938404768663071e-05, "loss": 2.5149490356445314, "step": 21580 }, { "epoch": 6.128299744535907, "grad_norm": 6.48878288269043, "learning_rate": 6.936985523701392e-05, "loss": 2.4924747467041017, "step": 21590 }, { "epoch": 6.131138234459268, "grad_norm": 6.626217842102051, "learning_rate": 6.935566278739711e-05, "loss": 2.4050878524780273, "step": 21600 }, { "epoch": 6.133976724382628, "grad_norm": 6.679590225219727, "learning_rate": 6.93414703377803e-05, "loss": 2.472342681884766, "step": 21610 }, { "epoch": 6.136815214305989, "grad_norm": 7.072216510772705, "learning_rate": 6.932869713312518e-05, "loss": 2.4329978942871096, "step": 21620 }, { "epoch": 6.1396537042293495, "grad_norm": 6.795796871185303, "learning_rate": 6.931450468350837e-05, "loss": 2.531970977783203, "step": 21630 }, { "epoch": 6.142492194152711, "grad_norm": 6.627623558044434, "learning_rate": 6.930031223389158e-05, "loss": 2.4272232055664062, "step": 21640 }, { "epoch": 6.145330684076072, "grad_norm": 6.626079559326172, "learning_rate": 6.928611978427477e-05, "loss": 2.3928909301757812, "step": 21650 }, { "epoch": 6.148169173999432, "grad_norm": 6.7965474128723145, "learning_rate": 6.927192733465796e-05, "loss": 2.4081193923950197, "step": 21660 }, { "epoch": 6.151007663922793, "grad_norm": 7.56017541885376, "learning_rate": 6.925773488504116e-05, "loss": 2.4058046340942383, "step": 21670 }, { "epoch": 6.153846153846154, "grad_norm": 7.068434238433838, "learning_rate": 6.924354243542436e-05, "loss": 2.4231540679931642, "step": 21680 }, { "epoch": 6.156684643769514, "grad_norm": 7.017087459564209, "learning_rate": 6.922934998580756e-05, "loss": 2.5494884490966796, "step": 21690 }, { "epoch": 6.1595231336928755, "grad_norm": 6.557997226715088, "learning_rate": 6.921515753619075e-05, "loss": 2.440201759338379, "step": 21700 }, { "epoch": 6.162361623616236, "grad_norm": 6.966424465179443, "learning_rate": 6.920096508657394e-05, 
"loss": 2.4962520599365234, "step": 21710 }, { "epoch": 6.165200113539597, "grad_norm": 6.754067897796631, "learning_rate": 6.918677263695714e-05, "loss": 2.4957296371459963, "step": 21720 }, { "epoch": 6.168038603462958, "grad_norm": 6.67304801940918, "learning_rate": 6.917258018734034e-05, "loss": 2.442167282104492, "step": 21730 }, { "epoch": 6.170877093386318, "grad_norm": 6.938436508178711, "learning_rate": 6.915838773772352e-05, "loss": 2.5134654998779298, "step": 21740 }, { "epoch": 6.173715583309679, "grad_norm": 6.964658260345459, "learning_rate": 6.914419528810673e-05, "loss": 2.486761283874512, "step": 21750 }, { "epoch": 6.17655407323304, "grad_norm": 6.879712104797363, "learning_rate": 6.913000283848992e-05, "loss": 2.4410274505615233, "step": 21760 }, { "epoch": 6.179392563156401, "grad_norm": 7.075400352478027, "learning_rate": 6.911581038887313e-05, "loss": 2.371192741394043, "step": 21770 }, { "epoch": 6.182231053079762, "grad_norm": 6.986252784729004, "learning_rate": 6.910161793925631e-05, "loss": 2.4933536529541014, "step": 21780 }, { "epoch": 6.185069543003122, "grad_norm": 7.138460159301758, "learning_rate": 6.908742548963952e-05, "loss": 2.444549560546875, "step": 21790 }, { "epoch": 6.187908032926483, "grad_norm": 6.4659833908081055, "learning_rate": 6.907323304002271e-05, "loss": 2.4584514617919924, "step": 21800 }, { "epoch": 6.190746522849844, "grad_norm": 6.898294925689697, "learning_rate": 6.90590405904059e-05, "loss": 2.52573299407959, "step": 21810 }, { "epoch": 6.193585012773204, "grad_norm": 7.286567687988281, "learning_rate": 6.904484814078911e-05, "loss": 2.4958169937133787, "step": 21820 }, { "epoch": 6.1964235026965655, "grad_norm": 6.795723915100098, "learning_rate": 6.903065569117229e-05, "loss": 2.478511428833008, "step": 21830 }, { "epoch": 6.199261992619927, "grad_norm": 6.796793460845947, "learning_rate": 6.90164632415555e-05, "loss": 2.52390193939209, "step": 21840 }, { "epoch": 6.202100482543287, "grad_norm": 
7.070167064666748, "learning_rate": 6.900227079193869e-05, "loss": 2.4326921463012696, "step": 21850 }, { "epoch": 6.204938972466648, "grad_norm": 7.075150489807129, "learning_rate": 6.89880783423219e-05, "loss": 2.4930404663085937, "step": 21860 }, { "epoch": 6.207777462390008, "grad_norm": 6.877079010009766, "learning_rate": 6.897388589270508e-05, "loss": 2.5468109130859373, "step": 21870 }, { "epoch": 6.210615952313369, "grad_norm": 6.989907264709473, "learning_rate": 6.895969344308829e-05, "loss": 2.4364599227905273, "step": 21880 }, { "epoch": 6.21345444223673, "grad_norm": 7.10391902923584, "learning_rate": 6.894550099347148e-05, "loss": 2.493499183654785, "step": 21890 }, { "epoch": 6.216292932160091, "grad_norm": 6.73354959487915, "learning_rate": 6.893130854385467e-05, "loss": 2.3951698303222657, "step": 21900 }, { "epoch": 6.219131422083452, "grad_norm": 7.086217880249023, "learning_rate": 6.891711609423787e-05, "loss": 2.5055580139160156, "step": 21910 }, { "epoch": 6.221969912006813, "grad_norm": 6.3814377784729, "learning_rate": 6.890292364462106e-05, "loss": 2.431400680541992, "step": 21920 }, { "epoch": 6.224808401930173, "grad_norm": 6.624530792236328, "learning_rate": 6.888873119500427e-05, "loss": 2.526032638549805, "step": 21930 }, { "epoch": 6.227646891853534, "grad_norm": 6.780786514282227, "learning_rate": 6.887453874538746e-05, "loss": 2.454499053955078, "step": 21940 }, { "epoch": 6.230485381776894, "grad_norm": 7.200390338897705, "learning_rate": 6.886034629577065e-05, "loss": 2.4522653579711915, "step": 21950 }, { "epoch": 6.2333238717002555, "grad_norm": 6.752831935882568, "learning_rate": 6.884615384615385e-05, "loss": 2.504162406921387, "step": 21960 }, { "epoch": 6.236162361623617, "grad_norm": 6.543815612792969, "learning_rate": 6.883196139653705e-05, "loss": 2.442982482910156, "step": 21970 }, { "epoch": 6.239000851546977, "grad_norm": 7.20047664642334, "learning_rate": 6.881776894692025e-05, "loss": 2.379866027832031, "step": 21980 
}, { "epoch": 6.241839341470338, "grad_norm": 6.535821437835693, "learning_rate": 6.880357649730344e-05, "loss": 2.4815048217773437, "step": 21990 }, { "epoch": 6.244677831393698, "grad_norm": 6.49028205871582, "learning_rate": 6.878938404768663e-05, "loss": 2.399440956115723, "step": 22000 }, { "epoch": 6.244677831393698, "eval_accuracy": 0.27468684428053664, "eval_loss": 2.772005558013916, "eval_runtime": 52.7501, "eval_samples_per_second": 298.142, "eval_steps_per_second": 4.663, "step": 22000 }, { "epoch": 6.247516321317059, "grad_norm": 7.154001235961914, "learning_rate": 6.877519159806984e-05, "loss": 2.5429290771484374, "step": 22010 }, { "epoch": 6.25035481124042, "grad_norm": 6.951819896697998, "learning_rate": 6.876099914845303e-05, "loss": 2.5087032318115234, "step": 22020 }, { "epoch": 6.253193301163781, "grad_norm": 6.733520984649658, "learning_rate": 6.874680669883623e-05, "loss": 2.519819450378418, "step": 22030 }, { "epoch": 6.256031791087142, "grad_norm": 6.443841934204102, "learning_rate": 6.873261424921942e-05, "loss": 2.493098831176758, "step": 22040 }, { "epoch": 6.258870281010503, "grad_norm": 7.11667013168335, "learning_rate": 6.871842179960261e-05, "loss": 2.4757625579833986, "step": 22050 }, { "epoch": 6.261708770933863, "grad_norm": 6.523332595825195, "learning_rate": 6.870422934998582e-05, "loss": 2.527018165588379, "step": 22060 }, { "epoch": 6.264547260857224, "grad_norm": 6.682645320892334, "learning_rate": 6.8690036900369e-05, "loss": 2.4318206787109373, "step": 22070 }, { "epoch": 6.267385750780584, "grad_norm": 6.435915946960449, "learning_rate": 6.86758444507522e-05, "loss": 2.4250419616699217, "step": 22080 }, { "epoch": 6.2702242407039455, "grad_norm": 6.802117347717285, "learning_rate": 6.86616520011354e-05, "loss": 2.4253219604492187, "step": 22090 }, { "epoch": 6.273062730627307, "grad_norm": 7.222358226776123, "learning_rate": 6.86474595515186e-05, "loss": 2.5143484115600585, "step": 22100 }, { "epoch": 6.275901220550667, 
"grad_norm": 6.790596961975098, "learning_rate": 6.863326710190179e-05, "loss": 2.483427047729492, "step": 22110 }, { "epoch": 6.278739710474028, "grad_norm": 6.593836307525635, "learning_rate": 6.861907465228499e-05, "loss": 2.4668949127197264, "step": 22120 }, { "epoch": 6.281578200397389, "grad_norm": 6.522164344787598, "learning_rate": 6.860488220266819e-05, "loss": 2.393271064758301, "step": 22130 }, { "epoch": 6.284416690320749, "grad_norm": 6.894057750701904, "learning_rate": 6.859068975305138e-05, "loss": 2.401931953430176, "step": 22140 }, { "epoch": 6.28725518024411, "grad_norm": 6.8038129806518555, "learning_rate": 6.857649730343457e-05, "loss": 2.4055465698242187, "step": 22150 }, { "epoch": 6.290093670167471, "grad_norm": 6.464775085449219, "learning_rate": 6.856230485381777e-05, "loss": 2.475183868408203, "step": 22160 }, { "epoch": 6.292932160090832, "grad_norm": 7.235611915588379, "learning_rate": 6.854811240420097e-05, "loss": 2.4482147216796877, "step": 22170 }, { "epoch": 6.295770650014193, "grad_norm": 7.017197132110596, "learning_rate": 6.853391995458417e-05, "loss": 2.484567070007324, "step": 22180 }, { "epoch": 6.298609139937553, "grad_norm": 7.157113552093506, "learning_rate": 6.851972750496736e-05, "loss": 2.402068328857422, "step": 22190 }, { "epoch": 6.301447629860914, "grad_norm": 7.117146968841553, "learning_rate": 6.850553505535055e-05, "loss": 2.44063720703125, "step": 22200 }, { "epoch": 6.304286119784274, "grad_norm": 6.761784553527832, "learning_rate": 6.849134260573376e-05, "loss": 2.5042428970336914, "step": 22210 }, { "epoch": 6.307124609707635, "grad_norm": 6.724444389343262, "learning_rate": 6.847715015611695e-05, "loss": 2.4434877395629884, "step": 22220 }, { "epoch": 6.3099630996309966, "grad_norm": 6.847675323486328, "learning_rate": 6.846295770650015e-05, "loss": 2.4163887023925783, "step": 22230 }, { "epoch": 6.312801589554357, "grad_norm": 6.925311088562012, "learning_rate": 6.844876525688334e-05, "loss": 
2.486941337585449, "step": 22240 }, { "epoch": 6.315640079477718, "grad_norm": 6.760059833526611, "learning_rate": 6.843457280726653e-05, "loss": 2.460480308532715, "step": 22250 }, { "epoch": 6.318478569401079, "grad_norm": 6.781618595123291, "learning_rate": 6.842038035764974e-05, "loss": 2.4176666259765627, "step": 22260 }, { "epoch": 6.321317059324439, "grad_norm": 7.170528888702393, "learning_rate": 6.840618790803292e-05, "loss": 2.500167655944824, "step": 22270 }, { "epoch": 6.3241555492478, "grad_norm": 6.739192485809326, "learning_rate": 6.839199545841613e-05, "loss": 2.472794532775879, "step": 22280 }, { "epoch": 6.3269940391711605, "grad_norm": 6.618492603302002, "learning_rate": 6.837780300879932e-05, "loss": 2.5432598114013674, "step": 22290 }, { "epoch": 6.329832529094522, "grad_norm": 6.899853706359863, "learning_rate": 6.836361055918253e-05, "loss": 2.4281972885131835, "step": 22300 }, { "epoch": 6.332671019017883, "grad_norm": 7.564615249633789, "learning_rate": 6.83494181095657e-05, "loss": 2.5277191162109376, "step": 22310 }, { "epoch": 6.335509508941243, "grad_norm": 7.1228556632995605, "learning_rate": 6.833522565994891e-05, "loss": 2.5173492431640625, "step": 22320 }, { "epoch": 6.338347998864604, "grad_norm": 7.001149654388428, "learning_rate": 6.83210332103321e-05, "loss": 2.476327133178711, "step": 22330 }, { "epoch": 6.341186488787965, "grad_norm": 7.696782112121582, "learning_rate": 6.830684076071531e-05, "loss": 2.4945562362670897, "step": 22340 }, { "epoch": 6.344024978711325, "grad_norm": 6.839585304260254, "learning_rate": 6.82926483110985e-05, "loss": 2.467020606994629, "step": 22350 }, { "epoch": 6.3468634686346865, "grad_norm": 6.767400741577148, "learning_rate": 6.82784558614817e-05, "loss": 2.4794639587402343, "step": 22360 }, { "epoch": 6.349701958558047, "grad_norm": 6.821475028991699, "learning_rate": 6.82642634118649e-05, "loss": 2.474845123291016, "step": 22370 }, { "epoch": 6.352540448481408, "grad_norm": 6.497913837432861, 
"learning_rate": 6.825007096224809e-05, "loss": 2.5111164093017577, "step": 22380 }, { "epoch": 6.355378938404769, "grad_norm": 6.635011196136475, "learning_rate": 6.823587851263128e-05, "loss": 2.4713626861572267, "step": 22390 }, { "epoch": 6.358217428328129, "grad_norm": 6.556007385253906, "learning_rate": 6.822168606301447e-05, "loss": 2.4392017364501952, "step": 22400 }, { "epoch": 6.36105591825149, "grad_norm": 7.030292987823486, "learning_rate": 6.820749361339768e-05, "loss": 2.475643539428711, "step": 22410 }, { "epoch": 6.3638944081748505, "grad_norm": 6.615301609039307, "learning_rate": 6.819330116378087e-05, "loss": 2.4588504791259767, "step": 22420 }, { "epoch": 6.366732898098212, "grad_norm": 6.906377792358398, "learning_rate": 6.817910871416407e-05, "loss": 2.476108169555664, "step": 22430 }, { "epoch": 6.369571388021573, "grad_norm": 6.917459964752197, "learning_rate": 6.816491626454726e-05, "loss": 2.4653083801269533, "step": 22440 }, { "epoch": 6.372409877944933, "grad_norm": 6.925414085388184, "learning_rate": 6.815072381493047e-05, "loss": 2.4959545135498047, "step": 22450 }, { "epoch": 6.375248367868294, "grad_norm": 7.184194087982178, "learning_rate": 6.813653136531366e-05, "loss": 2.508264350891113, "step": 22460 }, { "epoch": 6.378086857791655, "grad_norm": 6.797396659851074, "learning_rate": 6.812233891569685e-05, "loss": 2.48192138671875, "step": 22470 }, { "epoch": 6.380925347715015, "grad_norm": 6.432242393493652, "learning_rate": 6.810814646608005e-05, "loss": 2.4192028045654297, "step": 22480 }, { "epoch": 6.3837638376383765, "grad_norm": 6.562394618988037, "learning_rate": 6.809395401646324e-05, "loss": 2.446536636352539, "step": 22490 }, { "epoch": 6.386602327561738, "grad_norm": 7.394036293029785, "learning_rate": 6.807976156684645e-05, "loss": 2.562285804748535, "step": 22500 }, { "epoch": 6.386602327561738, "eval_accuracy": 0.2795828829401666, "eval_loss": 2.7644972801208496, "eval_runtime": 50.5232, "eval_samples_per_second": 
311.283, "eval_steps_per_second": 4.869, "step": 22500 }, { "epoch": 6.389440817485098, "grad_norm": 7.005687713623047, "learning_rate": 6.806556911722963e-05, "loss": 2.420686149597168, "step": 22510 }, { "epoch": 6.392279307408459, "grad_norm": 6.828493595123291, "learning_rate": 6.805137666761283e-05, "loss": 2.484848403930664, "step": 22520 }, { "epoch": 6.395117797331819, "grad_norm": 6.824126720428467, "learning_rate": 6.803718421799603e-05, "loss": 2.4206510543823243, "step": 22530 }, { "epoch": 6.39795628725518, "grad_norm": 6.647915840148926, "learning_rate": 6.802299176837923e-05, "loss": 2.4361629486083984, "step": 22540 }, { "epoch": 6.400794777178541, "grad_norm": 6.642844200134277, "learning_rate": 6.800879931876241e-05, "loss": 2.483118438720703, "step": 22550 }, { "epoch": 6.403633267101902, "grad_norm": 6.981826305389404, "learning_rate": 6.799460686914562e-05, "loss": 2.4153493881225585, "step": 22560 }, { "epoch": 6.406471757025263, "grad_norm": 6.47277307510376, "learning_rate": 6.798041441952881e-05, "loss": 2.352016830444336, "step": 22570 }, { "epoch": 6.409310246948623, "grad_norm": 6.772089004516602, "learning_rate": 6.796622196991201e-05, "loss": 2.5059768676757814, "step": 22580 }, { "epoch": 6.412148736871984, "grad_norm": 6.769845008850098, "learning_rate": 6.79520295202952e-05, "loss": 2.478219985961914, "step": 22590 }, { "epoch": 6.414987226795345, "grad_norm": 7.075986862182617, "learning_rate": 6.79378370706784e-05, "loss": 2.413740348815918, "step": 22600 }, { "epoch": 6.417825716718705, "grad_norm": 6.497706890106201, "learning_rate": 6.79236446210616e-05, "loss": 2.4269927978515624, "step": 22610 }, { "epoch": 6.4206642066420665, "grad_norm": 6.629652500152588, "learning_rate": 6.79094521714448e-05, "loss": 2.4219133377075197, "step": 22620 }, { "epoch": 6.423502696565428, "grad_norm": 6.82293701171875, "learning_rate": 6.789525972182799e-05, "loss": 2.4705963134765625, "step": 22630 }, { "epoch": 6.426341186488788, "grad_norm": 
6.927430629730225, "learning_rate": 6.788106727221118e-05, "loss": 2.4850013732910154, "step": 22640 }, { "epoch": 6.429179676412149, "grad_norm": 7.024291038513184, "learning_rate": 6.786687482259439e-05, "loss": 2.4202537536621094, "step": 22650 }, { "epoch": 6.432018166335509, "grad_norm": 6.4613037109375, "learning_rate": 6.785268237297758e-05, "loss": 2.421929359436035, "step": 22660 }, { "epoch": 6.43485665625887, "grad_norm": 6.583272933959961, "learning_rate": 6.783848992336077e-05, "loss": 2.384776306152344, "step": 22670 }, { "epoch": 6.437695146182231, "grad_norm": 6.750959396362305, "learning_rate": 6.782429747374397e-05, "loss": 2.5481849670410157, "step": 22680 }, { "epoch": 6.440533636105592, "grad_norm": 6.268980026245117, "learning_rate": 6.781010502412717e-05, "loss": 2.3847578048706053, "step": 22690 }, { "epoch": 6.443372126028953, "grad_norm": 6.942320346832275, "learning_rate": 6.779591257451037e-05, "loss": 2.452330207824707, "step": 22700 }, { "epoch": 6.446210615952314, "grad_norm": 6.772799015045166, "learning_rate": 6.778172012489356e-05, "loss": 2.519463539123535, "step": 22710 }, { "epoch": 6.449049105875674, "grad_norm": 6.787113189697266, "learning_rate": 6.776752767527675e-05, "loss": 2.4602970123291015, "step": 22720 }, { "epoch": 6.451887595799035, "grad_norm": 6.983421802520752, "learning_rate": 6.775333522565995e-05, "loss": 2.5227998733520507, "step": 22730 }, { "epoch": 6.454726085722395, "grad_norm": 6.855210781097412, "learning_rate": 6.773914277604315e-05, "loss": 2.3890342712402344, "step": 22740 }, { "epoch": 6.4575645756457565, "grad_norm": 6.7545599937438965, "learning_rate": 6.772495032642633e-05, "loss": 2.4037172317504885, "step": 22750 }, { "epoch": 6.460403065569118, "grad_norm": 6.559694766998291, "learning_rate": 6.771075787680954e-05, "loss": 2.4410287857055666, "step": 22760 }, { "epoch": 6.463241555492478, "grad_norm": 6.646772861480713, "learning_rate": 6.769656542719273e-05, "loss": 2.5448837280273438, 
"step": 22770 }, { "epoch": 6.466080045415839, "grad_norm": 6.994681358337402, "learning_rate": 6.768237297757594e-05, "loss": 2.439699172973633, "step": 22780 }, { "epoch": 6.468918535339199, "grad_norm": 6.626687049865723, "learning_rate": 6.766818052795912e-05, "loss": 2.3973941802978516, "step": 22790 }, { "epoch": 6.47175702526256, "grad_norm": 6.454825401306152, "learning_rate": 6.765398807834233e-05, "loss": 2.394098091125488, "step": 22800 }, { "epoch": 6.474595515185921, "grad_norm": 6.815852642059326, "learning_rate": 6.763979562872552e-05, "loss": 2.5832965850830076, "step": 22810 }, { "epoch": 6.477434005109282, "grad_norm": 6.798934459686279, "learning_rate": 6.762560317910871e-05, "loss": 2.423013687133789, "step": 22820 }, { "epoch": 6.480272495032643, "grad_norm": 6.389133930206299, "learning_rate": 6.761141072949191e-05, "loss": 2.438092041015625, "step": 22830 }, { "epoch": 6.483110984956004, "grad_norm": 6.676428318023682, "learning_rate": 6.75972182798751e-05, "loss": 2.522466468811035, "step": 22840 }, { "epoch": 6.485949474879364, "grad_norm": 6.88333797454834, "learning_rate": 6.758302583025831e-05, "loss": 2.460535430908203, "step": 22850 }, { "epoch": 6.488787964802725, "grad_norm": 6.502301216125488, "learning_rate": 6.75688333806415e-05, "loss": 2.448042106628418, "step": 22860 }, { "epoch": 6.491626454726085, "grad_norm": 6.874541759490967, "learning_rate": 6.75546409310247e-05, "loss": 2.4608335494995117, "step": 22870 }, { "epoch": 6.4944649446494465, "grad_norm": 6.615033149719238, "learning_rate": 6.754044848140789e-05, "loss": 2.479871368408203, "step": 22880 }, { "epoch": 6.497303434572808, "grad_norm": 6.567814826965332, "learning_rate": 6.75262560317911e-05, "loss": 2.386138916015625, "step": 22890 }, { "epoch": 6.500141924496168, "grad_norm": 7.159061431884766, "learning_rate": 6.751206358217429e-05, "loss": 2.4154994964599608, "step": 22900 }, { "epoch": 6.502980414419529, "grad_norm": 7.001035213470459, "learning_rate": 
6.749787113255748e-05, "loss": 2.434021759033203, "step": 22910 }, { "epoch": 6.50581890434289, "grad_norm": 6.895256519317627, "learning_rate": 6.748367868294068e-05, "loss": 2.4769756317138674, "step": 22920 }, { "epoch": 6.50865739426625, "grad_norm": 6.551761150360107, "learning_rate": 6.746948623332387e-05, "loss": 2.4363346099853516, "step": 22930 }, { "epoch": 6.511495884189611, "grad_norm": 6.909209728240967, "learning_rate": 6.745529378370708e-05, "loss": 2.3739343643188477, "step": 22940 }, { "epoch": 6.514334374112972, "grad_norm": 6.716180801391602, "learning_rate": 6.744252057905195e-05, "loss": 2.443856430053711, "step": 22950 }, { "epoch": 6.517172864036333, "grad_norm": 7.228034973144531, "learning_rate": 6.742832812943514e-05, "loss": 2.471078872680664, "step": 22960 }, { "epoch": 6.520011353959694, "grad_norm": 6.921933174133301, "learning_rate": 6.741413567981833e-05, "loss": 2.4163436889648438, "step": 22970 }, { "epoch": 6.522849843883054, "grad_norm": 6.729578971862793, "learning_rate": 6.739994323020153e-05, "loss": 2.40721435546875, "step": 22980 }, { "epoch": 6.525688333806415, "grad_norm": 6.9103007316589355, "learning_rate": 6.738575078058473e-05, "loss": 2.545248603820801, "step": 22990 }, { "epoch": 6.528526823729775, "grad_norm": 6.452322006225586, "learning_rate": 6.737155833096793e-05, "loss": 2.454445648193359, "step": 23000 }, { "epoch": 6.528526823729775, "eval_accuracy": 0.27888344884593375, "eval_loss": 2.7444710731506348, "eval_runtime": 50.1505, "eval_samples_per_second": 313.596, "eval_steps_per_second": 4.905, "step": 23000 }, { "epoch": 6.531365313653136, "grad_norm": 6.832770824432373, "learning_rate": 6.735736588135112e-05, "loss": 2.4212818145751953, "step": 23010 }, { "epoch": 6.5342038035764975, "grad_norm": 6.894812107086182, "learning_rate": 6.734317343173431e-05, "loss": 2.5479913711547852, "step": 23020 }, { "epoch": 6.537042293499858, "grad_norm": 6.7544846534729, "learning_rate": 6.732898098211752e-05, "loss": 
2.3811260223388673, "step": 23030 }, { "epoch": 6.539880783423219, "grad_norm": 6.299253463745117, "learning_rate": 6.731478853250072e-05, "loss": 2.422995758056641, "step": 23040 }, { "epoch": 6.54271927334658, "grad_norm": 6.8589768409729, "learning_rate": 6.730059608288391e-05, "loss": 2.381074142456055, "step": 23050 }, { "epoch": 6.54555776326994, "grad_norm": 7.280567169189453, "learning_rate": 6.72864036332671e-05, "loss": 2.5286705017089846, "step": 23060 }, { "epoch": 6.548396253193301, "grad_norm": 6.691847324371338, "learning_rate": 6.727221118365031e-05, "loss": 2.4938982009887694, "step": 23070 }, { "epoch": 6.551234743116662, "grad_norm": 7.175299644470215, "learning_rate": 6.72580187340335e-05, "loss": 2.4631677627563477, "step": 23080 }, { "epoch": 6.554073233040023, "grad_norm": 7.037140846252441, "learning_rate": 6.72438262844167e-05, "loss": 2.469672966003418, "step": 23090 }, { "epoch": 6.556911722963384, "grad_norm": 6.800581455230713, "learning_rate": 6.722963383479989e-05, "loss": 2.497952461242676, "step": 23100 }, { "epoch": 6.559750212886744, "grad_norm": 6.442983627319336, "learning_rate": 6.721544138518308e-05, "loss": 2.427751159667969, "step": 23110 }, { "epoch": 6.562588702810105, "grad_norm": 6.849058151245117, "learning_rate": 6.720124893556629e-05, "loss": 2.509329986572266, "step": 23120 }, { "epoch": 6.565427192733466, "grad_norm": 7.305539608001709, "learning_rate": 6.718705648594947e-05, "loss": 2.6719028472900392, "step": 23130 }, { "epoch": 6.568265682656826, "grad_norm": 6.706221103668213, "learning_rate": 6.717286403633268e-05, "loss": 2.4136510848999024, "step": 23140 }, { "epoch": 6.5711041725801875, "grad_norm": 6.718142509460449, "learning_rate": 6.715867158671587e-05, "loss": 2.4663902282714845, "step": 23150 }, { "epoch": 6.573942662503548, "grad_norm": 7.062465190887451, "learning_rate": 6.714447913709908e-05, "loss": 2.531009483337402, "step": 23160 }, { "epoch": 6.576781152426909, "grad_norm": 6.8421630859375, 
"learning_rate": 6.713028668748226e-05, "loss": 2.42767333984375, "step": 23170 }, { "epoch": 6.57961964235027, "grad_norm": 6.7111687660217285, "learning_rate": 6.711609423786546e-05, "loss": 2.431218910217285, "step": 23180 }, { "epoch": 6.58245813227363, "grad_norm": 6.6736907958984375, "learning_rate": 6.710190178824866e-05, "loss": 2.4794904708862306, "step": 23190 }, { "epoch": 6.585296622196991, "grad_norm": 6.844507217407227, "learning_rate": 6.708770933863185e-05, "loss": 2.4721567153930666, "step": 23200 }, { "epoch": 6.5881351121203515, "grad_norm": 7.225371360778809, "learning_rate": 6.707351688901504e-05, "loss": 2.4389202117919924, "step": 23210 }, { "epoch": 6.590973602043713, "grad_norm": 7.038679122924805, "learning_rate": 6.705932443939824e-05, "loss": 2.47146053314209, "step": 23220 }, { "epoch": 6.593812091967074, "grad_norm": 6.776356220245361, "learning_rate": 6.704513198978144e-05, "loss": 2.432694435119629, "step": 23230 }, { "epoch": 6.596650581890434, "grad_norm": 6.7376322746276855, "learning_rate": 6.703093954016464e-05, "loss": 2.474317169189453, "step": 23240 }, { "epoch": 6.599489071813795, "grad_norm": 6.727077484130859, "learning_rate": 6.701674709054783e-05, "loss": 2.4125381469726563, "step": 23250 }, { "epoch": 6.602327561737156, "grad_norm": 6.839275360107422, "learning_rate": 6.700255464093102e-05, "loss": 2.4648054122924803, "step": 23260 }, { "epoch": 6.605166051660516, "grad_norm": 6.2537922859191895, "learning_rate": 6.698836219131423e-05, "loss": 2.5043935775756836, "step": 23270 }, { "epoch": 6.6080045415838775, "grad_norm": 6.561368465423584, "learning_rate": 6.697416974169742e-05, "loss": 2.4744985580444334, "step": 23280 }, { "epoch": 6.610843031507239, "grad_norm": 7.006391525268555, "learning_rate": 6.695997729208062e-05, "loss": 2.3987142562866213, "step": 23290 }, { "epoch": 6.613681521430599, "grad_norm": 6.726463317871094, "learning_rate": 6.694578484246381e-05, "loss": 2.48101806640625, "step": 23300 }, { 
"epoch": 6.61652001135396, "grad_norm": 6.70090389251709, "learning_rate": 6.6931592392847e-05, "loss": 2.403232765197754, "step": 23310 }, { "epoch": 6.61935850127732, "grad_norm": 6.653123378753662, "learning_rate": 6.691739994323021e-05, "loss": 2.4260961532592775, "step": 23320 }, { "epoch": 6.622196991200681, "grad_norm": 6.734582424163818, "learning_rate": 6.690320749361339e-05, "loss": 2.362662124633789, "step": 23330 }, { "epoch": 6.625035481124042, "grad_norm": 7.031915187835693, "learning_rate": 6.68890150439966e-05, "loss": 2.4635448455810547, "step": 23340 }, { "epoch": 6.627873971047403, "grad_norm": 6.880476474761963, "learning_rate": 6.687482259437979e-05, "loss": 2.4804594039916994, "step": 23350 }, { "epoch": 6.630712460970764, "grad_norm": 6.69880485534668, "learning_rate": 6.6860630144763e-05, "loss": 2.504436492919922, "step": 23360 }, { "epoch": 6.633550950894124, "grad_norm": 6.615448474884033, "learning_rate": 6.684643769514618e-05, "loss": 2.466110610961914, "step": 23370 }, { "epoch": 6.636389440817485, "grad_norm": 6.927416801452637, "learning_rate": 6.683224524552938e-05, "loss": 2.479190635681152, "step": 23380 }, { "epoch": 6.639227930740846, "grad_norm": 6.943418502807617, "learning_rate": 6.681805279591258e-05, "loss": 2.487298011779785, "step": 23390 }, { "epoch": 6.642066420664206, "grad_norm": 6.650552272796631, "learning_rate": 6.680386034629578e-05, "loss": 2.474765968322754, "step": 23400 }, { "epoch": 6.6449049105875675, "grad_norm": 6.897334575653076, "learning_rate": 6.678966789667896e-05, "loss": 2.496908187866211, "step": 23410 }, { "epoch": 6.647743400510928, "grad_norm": 6.67884635925293, "learning_rate": 6.677547544706217e-05, "loss": 2.4346664428710936, "step": 23420 }, { "epoch": 6.650581890434289, "grad_norm": 6.809999465942383, "learning_rate": 6.676128299744536e-05, "loss": 2.360765838623047, "step": 23430 }, { "epoch": 6.65342038035765, "grad_norm": 6.462069511413574, "learning_rate": 6.674709054782856e-05, "loss": 
2.407318687438965, "step": 23440 }, { "epoch": 6.65625887028101, "grad_norm": 6.407384395599365, "learning_rate": 6.673289809821175e-05, "loss": 2.4736337661743164, "step": 23450 }, { "epoch": 6.659097360204371, "grad_norm": 6.451752185821533, "learning_rate": 6.671870564859494e-05, "loss": 2.348096466064453, "step": 23460 }, { "epoch": 6.661935850127732, "grad_norm": 6.894118309020996, "learning_rate": 6.670451319897815e-05, "loss": 2.5406436920166016, "step": 23470 }, { "epoch": 6.664774340051093, "grad_norm": 6.859557151794434, "learning_rate": 6.669032074936134e-05, "loss": 2.556243324279785, "step": 23480 }, { "epoch": 6.667612829974454, "grad_norm": 7.026027202606201, "learning_rate": 6.667612829974454e-05, "loss": 2.5305763244628907, "step": 23490 }, { "epoch": 6.670451319897815, "grad_norm": 6.589422225952148, "learning_rate": 6.666193585012773e-05, "loss": 2.493460845947266, "step": 23500 }, { "epoch": 6.670451319897815, "eval_accuracy": 0.2837794875055637, "eval_loss": 2.737304925918579, "eval_runtime": 52.6264, "eval_samples_per_second": 298.843, "eval_steps_per_second": 4.674, "step": 23500 }, { "epoch": 6.673289809821175, "grad_norm": 6.8856730461120605, "learning_rate": 6.664774340051094e-05, "loss": 2.493457794189453, "step": 23510 }, { "epoch": 6.676128299744536, "grad_norm": 6.648621082305908, "learning_rate": 6.663355095089413e-05, "loss": 2.541718292236328, "step": 23520 }, { "epoch": 6.678966789667896, "grad_norm": 6.692605495452881, "learning_rate": 6.661935850127732e-05, "loss": 2.4998741149902344, "step": 23530 }, { "epoch": 6.6818052795912575, "grad_norm": 7.099975109100342, "learning_rate": 6.660516605166052e-05, "loss": 2.415885162353516, "step": 23540 }, { "epoch": 6.684643769514619, "grad_norm": 6.683704376220703, "learning_rate": 6.659097360204371e-05, "loss": 2.4275035858154297, "step": 23550 }, { "epoch": 6.687482259437979, "grad_norm": 6.726040363311768, "learning_rate": 6.657678115242692e-05, "loss": 2.4073278427124025, "step": 
23560 }, { "epoch": 6.69032074936134, "grad_norm": 6.988174915313721, "learning_rate": 6.656258870281011e-05, "loss": 2.4096118927001955, "step": 23570 }, { "epoch": 6.6931592392847, "grad_norm": 6.56510591506958, "learning_rate": 6.65483962531933e-05, "loss": 2.4673343658447267, "step": 23580 }, { "epoch": 6.695997729208061, "grad_norm": 6.781762599945068, "learning_rate": 6.65342038035765e-05, "loss": 2.326837348937988, "step": 23590 }, { "epoch": 6.698836219131422, "grad_norm": 6.936979293823242, "learning_rate": 6.65200113539597e-05, "loss": 2.4798843383789064, "step": 23600 }, { "epoch": 6.701674709054783, "grad_norm": 7.191762924194336, "learning_rate": 6.65058189043429e-05, "loss": 2.499898147583008, "step": 23610 }, { "epoch": 6.704513198978144, "grad_norm": 6.40588903427124, "learning_rate": 6.649162645472609e-05, "loss": 2.455776405334473, "step": 23620 }, { "epoch": 6.707351688901504, "grad_norm": 6.497918128967285, "learning_rate": 6.647743400510928e-05, "loss": 2.4307378768920898, "step": 23630 }, { "epoch": 6.710190178824865, "grad_norm": 6.708412170410156, "learning_rate": 6.646324155549248e-05, "loss": 2.381624221801758, "step": 23640 }, { "epoch": 6.713028668748226, "grad_norm": 7.187062740325928, "learning_rate": 6.644904910587568e-05, "loss": 2.5068288803100587, "step": 23650 }, { "epoch": 6.715867158671586, "grad_norm": 6.648885726928711, "learning_rate": 6.643485665625886e-05, "loss": 2.4334127426147463, "step": 23660 }, { "epoch": 6.7187056485949475, "grad_norm": 6.547712326049805, "learning_rate": 6.642066420664207e-05, "loss": 2.4100936889648437, "step": 23670 }, { "epoch": 6.721544138518309, "grad_norm": 6.512052536010742, "learning_rate": 6.640647175702526e-05, "loss": 2.427205276489258, "step": 23680 }, { "epoch": 6.724382628441669, "grad_norm": 6.822438716888428, "learning_rate": 6.639227930740847e-05, "loss": 2.4069156646728516, "step": 23690 }, { "epoch": 6.72722111836503, "grad_norm": 6.492216110229492, "learning_rate": 
6.637808685779165e-05, "loss": 2.454742431640625, "step": 23700 }, { "epoch": 6.730059608288391, "grad_norm": 6.7577900886535645, "learning_rate": 6.636389440817486e-05, "loss": 2.4499773025512694, "step": 23710 }, { "epoch": 6.732898098211751, "grad_norm": 6.8079609870910645, "learning_rate": 6.634970195855805e-05, "loss": 2.3679771423339844, "step": 23720 }, { "epoch": 6.735736588135112, "grad_norm": 6.997687816619873, "learning_rate": 6.633550950894126e-05, "loss": 2.5040973663330077, "step": 23730 }, { "epoch": 6.738575078058473, "grad_norm": 6.821223735809326, "learning_rate": 6.632131705932444e-05, "loss": 2.4537620544433594, "step": 23740 }, { "epoch": 6.741413567981834, "grad_norm": 6.721340179443359, "learning_rate": 6.630712460970764e-05, "loss": 2.4310882568359373, "step": 23750 }, { "epoch": 6.744252057905195, "grad_norm": 6.953032493591309, "learning_rate": 6.629293216009084e-05, "loss": 2.473946952819824, "step": 23760 }, { "epoch": 6.747090547828555, "grad_norm": 7.010735988616943, "learning_rate": 6.627873971047403e-05, "loss": 2.399638366699219, "step": 23770 }, { "epoch": 6.749929037751916, "grad_norm": 6.388823986053467, "learning_rate": 6.626454726085722e-05, "loss": 2.445231628417969, "step": 23780 }, { "epoch": 6.752767527675276, "grad_norm": 6.631792068481445, "learning_rate": 6.625035481124042e-05, "loss": 2.517989158630371, "step": 23790 }, { "epoch": 6.755606017598637, "grad_norm": 6.6903581619262695, "learning_rate": 6.623616236162362e-05, "loss": 2.477436065673828, "step": 23800 }, { "epoch": 6.7584445075219985, "grad_norm": 6.648143768310547, "learning_rate": 6.622196991200682e-05, "loss": 2.5089019775390624, "step": 23810 }, { "epoch": 6.761282997445359, "grad_norm": 7.136392593383789, "learning_rate": 6.620777746239001e-05, "loss": 2.405748748779297, "step": 23820 }, { "epoch": 6.76412148736872, "grad_norm": 6.936954975128174, "learning_rate": 6.61935850127732e-05, "loss": 2.4645864486694338, "step": 23830 }, { "epoch": 
6.766959977292081, "grad_norm": 7.24948263168335, "learning_rate": 6.617939256315641e-05, "loss": 2.502256965637207, "step": 23840 }, { "epoch": 6.769798467215441, "grad_norm": 6.988910675048828, "learning_rate": 6.61652001135396e-05, "loss": 2.4421823501586912, "step": 23850 }, { "epoch": 6.772636957138802, "grad_norm": 6.696432113647461, "learning_rate": 6.61510076639228e-05, "loss": 2.4145524978637694, "step": 23860 }, { "epoch": 6.775475447062163, "grad_norm": 6.225535869598389, "learning_rate": 6.613681521430599e-05, "loss": 2.391206741333008, "step": 23870 }, { "epoch": 6.778313936985524, "grad_norm": 6.832905292510986, "learning_rate": 6.612262276468918e-05, "loss": 2.513165283203125, "step": 23880 }, { "epoch": 6.781152426908885, "grad_norm": 6.724606037139893, "learning_rate": 6.610843031507239e-05, "loss": 2.4619766235351563, "step": 23890 }, { "epoch": 6.783990916832245, "grad_norm": 6.881455421447754, "learning_rate": 6.609423786545557e-05, "loss": 2.4062564849853514, "step": 23900 }, { "epoch": 6.786829406755606, "grad_norm": 6.198401927947998, "learning_rate": 6.608004541583878e-05, "loss": 2.513726997375488, "step": 23910 }, { "epoch": 6.789667896678967, "grad_norm": 6.69783878326416, "learning_rate": 6.606585296622197e-05, "loss": 2.4656442642211913, "step": 23920 }, { "epoch": 6.792506386602327, "grad_norm": 6.8291144371032715, "learning_rate": 6.605166051660518e-05, "loss": 2.3694433212280273, "step": 23930 }, { "epoch": 6.7953448765256885, "grad_norm": 6.498046398162842, "learning_rate": 6.603746806698836e-05, "loss": 2.4634815216064454, "step": 23940 }, { "epoch": 6.798183366449049, "grad_norm": 6.5471086502075195, "learning_rate": 6.602327561737156e-05, "loss": 2.4062454223632814, "step": 23950 }, { "epoch": 6.80102185637241, "grad_norm": 6.903742790222168, "learning_rate": 6.600908316775476e-05, "loss": 2.5536128997802736, "step": 23960 }, { "epoch": 6.803860346295771, "grad_norm": 7.264212131500244, "learning_rate": 6.599489071813796e-05, 
"loss": 2.4788234710693358, "step": 23970 }, { "epoch": 6.806698836219131, "grad_norm": 6.60231876373291, "learning_rate": 6.598069826852114e-05, "loss": 2.408544158935547, "step": 23980 }, { "epoch": 6.809537326142492, "grad_norm": 6.720109462738037, "learning_rate": 6.596650581890435e-05, "loss": 2.4876787185668947, "step": 23990 }, { "epoch": 6.8123758160658525, "grad_norm": 6.7754740715026855, "learning_rate": 6.595231336928754e-05, "loss": 2.4751224517822266, "step": 24000 }, { "epoch": 6.8123758160658525, "eval_accuracy": 0.2819991098111528, "eval_loss": 2.7317564487457275, "eval_runtime": 49.97, "eval_samples_per_second": 314.729, "eval_steps_per_second": 4.923, "step": 24000 }, { "epoch": 6.815214305989214, "grad_norm": 6.5507731437683105, "learning_rate": 6.593812091967074e-05, "loss": 2.455486869812012, "step": 24010 }, { "epoch": 6.818052795912575, "grad_norm": 7.210355758666992, "learning_rate": 6.592392847005393e-05, "loss": 2.391244125366211, "step": 24020 }, { "epoch": 6.820891285835935, "grad_norm": 6.4821953773498535, "learning_rate": 6.590973602043712e-05, "loss": 2.4531030654907227, "step": 24030 }, { "epoch": 6.823729775759296, "grad_norm": 6.988666534423828, "learning_rate": 6.589554357082033e-05, "loss": 2.44390983581543, "step": 24040 }, { "epoch": 6.826568265682657, "grad_norm": 6.782492637634277, "learning_rate": 6.588135112120353e-05, "loss": 2.417178916931152, "step": 24050 }, { "epoch": 6.829406755606017, "grad_norm": 6.816283702850342, "learning_rate": 6.586715867158672e-05, "loss": 2.4325733184814453, "step": 24060 }, { "epoch": 6.8322452455293785, "grad_norm": 6.933839321136475, "learning_rate": 6.585296622196991e-05, "loss": 2.511156463623047, "step": 24070 }, { "epoch": 6.83508373545274, "grad_norm": 6.858307838439941, "learning_rate": 6.583877377235312e-05, "loss": 2.3751115798950195, "step": 24080 }, { "epoch": 6.8379222253761, "grad_norm": 6.752106666564941, "learning_rate": 6.582458132273631e-05, "loss": 2.493433952331543, 
"step": 24090 }, { "epoch": 6.840760715299461, "grad_norm": 6.5091657638549805, "learning_rate": 6.58103888731195e-05, "loss": 2.3756841659545898, "step": 24100 }, { "epoch": 6.843599205222821, "grad_norm": 6.667442798614502, "learning_rate": 6.57961964235027e-05, "loss": 2.335476112365723, "step": 24110 }, { "epoch": 6.846437695146182, "grad_norm": 6.610334873199463, "learning_rate": 6.578200397388589e-05, "loss": 2.491949462890625, "step": 24120 }, { "epoch": 6.849276185069543, "grad_norm": 6.892573356628418, "learning_rate": 6.57678115242691e-05, "loss": 2.510312080383301, "step": 24130 }, { "epoch": 6.852114674992904, "grad_norm": 6.615846633911133, "learning_rate": 6.575361907465228e-05, "loss": 2.445916175842285, "step": 24140 }, { "epoch": 6.854953164916265, "grad_norm": 7.038809299468994, "learning_rate": 6.573942662503549e-05, "loss": 2.4399181365966798, "step": 24150 }, { "epoch": 6.857791654839625, "grad_norm": 6.802517890930176, "learning_rate": 6.572523417541868e-05, "loss": 2.4772003173828123, "step": 24160 }, { "epoch": 6.860630144762986, "grad_norm": 6.605574131011963, "learning_rate": 6.571104172580189e-05, "loss": 2.4880741119384764, "step": 24170 }, { "epoch": 6.863468634686347, "grad_norm": 7.113959789276123, "learning_rate": 6.569684927618507e-05, "loss": 2.4425140380859376, "step": 24180 }, { "epoch": 6.866307124609707, "grad_norm": 7.108166694641113, "learning_rate": 6.568265682656827e-05, "loss": 2.424282455444336, "step": 24190 }, { "epoch": 6.8691456145330685, "grad_norm": 6.995443820953369, "learning_rate": 6.566846437695147e-05, "loss": 2.49324951171875, "step": 24200 }, { "epoch": 6.871984104456429, "grad_norm": 6.868298530578613, "learning_rate": 6.565427192733466e-05, "loss": 2.480892372131348, "step": 24210 }, { "epoch": 6.87482259437979, "grad_norm": 6.289827823638916, "learning_rate": 6.564007947771785e-05, "loss": 2.4463436126708986, "step": 24220 }, { "epoch": 6.877661084303151, "grad_norm": 6.93380069732666, "learning_rate": 
6.562588702810105e-05, "loss": 2.4354461669921874, "step": 24230 }, { "epoch": 6.880499574226511, "grad_norm": 6.985596656799316, "learning_rate": 6.561169457848425e-05, "loss": 2.4229253768920898, "step": 24240 }, { "epoch": 6.883338064149872, "grad_norm": 6.979676723480225, "learning_rate": 6.559750212886745e-05, "loss": 2.4423381805419924, "step": 24250 }, { "epoch": 6.886176554073233, "grad_norm": 6.839282989501953, "learning_rate": 6.558330967925064e-05, "loss": 2.4962535858154298, "step": 24260 }, { "epoch": 6.889015043996594, "grad_norm": 6.8118391036987305, "learning_rate": 6.556911722963383e-05, "loss": 2.4937265396118162, "step": 24270 }, { "epoch": 6.891853533919955, "grad_norm": 7.277923107147217, "learning_rate": 6.555492478001704e-05, "loss": 2.500895309448242, "step": 24280 }, { "epoch": 6.894692023843316, "grad_norm": 6.625880718231201, "learning_rate": 6.554073233040023e-05, "loss": 2.509530258178711, "step": 24290 }, { "epoch": 6.897530513766676, "grad_norm": 7.108129024505615, "learning_rate": 6.552653988078343e-05, "loss": 2.4390369415283204, "step": 24300 }, { "epoch": 6.900369003690037, "grad_norm": 6.679862976074219, "learning_rate": 6.551234743116662e-05, "loss": 2.472818946838379, "step": 24310 }, { "epoch": 6.903207493613397, "grad_norm": 6.916492938995361, "learning_rate": 6.549815498154983e-05, "loss": 2.453359031677246, "step": 24320 }, { "epoch": 6.9060459835367585, "grad_norm": 6.86735725402832, "learning_rate": 6.548396253193302e-05, "loss": 2.5145029067993163, "step": 24330 }, { "epoch": 6.90888447346012, "grad_norm": 6.49570369720459, "learning_rate": 6.546977008231621e-05, "loss": 2.381800651550293, "step": 24340 }, { "epoch": 6.91172296338348, "grad_norm": 6.690262317657471, "learning_rate": 6.54555776326994e-05, "loss": 2.3913665771484376, "step": 24350 }, { "epoch": 6.914561453306841, "grad_norm": 6.895578861236572, "learning_rate": 6.54413851830826e-05, "loss": 2.50070858001709, "step": 24360 }, { "epoch": 6.917399943230201, 
"grad_norm": 6.798637866973877, "learning_rate": 6.54271927334658e-05, "loss": 2.554624557495117, "step": 24370 }, { "epoch": 6.920238433153562, "grad_norm": 6.790895462036133, "learning_rate": 6.541300028384899e-05, "loss": 2.3588998794555662, "step": 24380 }, { "epoch": 6.923076923076923, "grad_norm": 6.691434860229492, "learning_rate": 6.539880783423219e-05, "loss": 2.438275146484375, "step": 24390 }, { "epoch": 6.925915413000284, "grad_norm": 6.512357711791992, "learning_rate": 6.538461538461539e-05, "loss": 2.424893379211426, "step": 24400 }, { "epoch": 6.928753902923645, "grad_norm": 6.775526523590088, "learning_rate": 6.537042293499859e-05, "loss": 2.4213218688964844, "step": 24410 }, { "epoch": 6.931592392847005, "grad_norm": 6.50932502746582, "learning_rate": 6.535623048538177e-05, "loss": 2.424142265319824, "step": 24420 }, { "epoch": 6.934430882770366, "grad_norm": 6.901050567626953, "learning_rate": 6.534203803576498e-05, "loss": 2.4723777770996094, "step": 24430 }, { "epoch": 6.937269372693727, "grad_norm": 7.065399169921875, "learning_rate": 6.532784558614817e-05, "loss": 2.386628532409668, "step": 24440 }, { "epoch": 6.940107862617087, "grad_norm": 7.496346950531006, "learning_rate": 6.531365313653137e-05, "loss": 2.4543724060058594, "step": 24450 }, { "epoch": 6.9429463525404485, "grad_norm": 7.080015659332275, "learning_rate": 6.529946068691456e-05, "loss": 2.4295675277709963, "step": 24460 }, { "epoch": 6.94578484246381, "grad_norm": 6.783768177032471, "learning_rate": 6.528526823729775e-05, "loss": 2.4104169845581054, "step": 24470 }, { "epoch": 6.94862333238717, "grad_norm": 6.503687381744385, "learning_rate": 6.527107578768096e-05, "loss": 2.403973388671875, "step": 24480 }, { "epoch": 6.951461822310531, "grad_norm": 6.769944667816162, "learning_rate": 6.525688333806415e-05, "loss": 2.5098892211914063, "step": 24490 }, { "epoch": 6.954300312233892, "grad_norm": 6.495813846588135, "learning_rate": 6.524269088844735e-05, "loss": 
2.4044727325439452, "step": 24500 }, { "epoch": 6.954300312233892, "eval_accuracy": 0.28015514719908435, "eval_loss": 2.7246413230895996, "eval_runtime": 52.7928, "eval_samples_per_second": 297.9, "eval_steps_per_second": 4.66, "step": 24500 }, { "epoch": 6.957138802157252, "grad_norm": 6.8499274253845215, "learning_rate": 6.522849843883054e-05, "loss": 2.498736763000488, "step": 24510 }, { "epoch": 6.959977292080613, "grad_norm": 6.660099983215332, "learning_rate": 6.521430598921375e-05, "loss": 2.4208047866821287, "step": 24520 }, { "epoch": 6.9628157820039736, "grad_norm": 6.602716445922852, "learning_rate": 6.520011353959694e-05, "loss": 2.4188514709472657, "step": 24530 }, { "epoch": 6.965654271927335, "grad_norm": 6.964450359344482, "learning_rate": 6.518592108998013e-05, "loss": 2.3961484909057615, "step": 24540 }, { "epoch": 6.968492761850696, "grad_norm": 6.997904300689697, "learning_rate": 6.517172864036333e-05, "loss": 2.4970443725585936, "step": 24550 }, { "epoch": 6.971331251774056, "grad_norm": 6.790659427642822, "learning_rate": 6.515753619074652e-05, "loss": 2.3995267868041994, "step": 24560 }, { "epoch": 6.974169741697417, "grad_norm": 7.607274532318115, "learning_rate": 6.514334374112973e-05, "loss": 2.44451904296875, "step": 24570 }, { "epoch": 6.977008231620777, "grad_norm": 7.114135265350342, "learning_rate": 6.51291512915129e-05, "loss": 2.50982780456543, "step": 24580 }, { "epoch": 6.979846721544138, "grad_norm": 7.06069803237915, "learning_rate": 6.511495884189611e-05, "loss": 2.5106508255004885, "step": 24590 }, { "epoch": 6.9826852114674995, "grad_norm": 6.780190467834473, "learning_rate": 6.510076639227931e-05, "loss": 2.4485027313232424, "step": 24600 }, { "epoch": 6.98552370139086, "grad_norm": 7.116711139678955, "learning_rate": 6.508657394266251e-05, "loss": 2.438605308532715, "step": 24610 }, { "epoch": 6.988362191314221, "grad_norm": 7.044675350189209, "learning_rate": 6.50723814930457e-05, "loss": 2.4571840286254885, "step": 24620 
}, { "epoch": 6.991200681237582, "grad_norm": 7.117720127105713, "learning_rate": 6.50581890434289e-05, "loss": 2.4416778564453123, "step": 24630 }, { "epoch": 6.994039171160942, "grad_norm": 6.784974575042725, "learning_rate": 6.50439965938121e-05, "loss": 2.437415885925293, "step": 24640 }, { "epoch": 6.996877661084303, "grad_norm": 7.2035040855407715, "learning_rate": 6.50298041441953e-05, "loss": 2.4428127288818358, "step": 24650 }, { "epoch": 6.999716151007664, "grad_norm": 6.9749860763549805, "learning_rate": 6.501561169457848e-05, "loss": 2.4315671920776367, "step": 24660 }, { "epoch": 7.002554640931025, "grad_norm": 6.722234725952148, "learning_rate": 6.500141924496169e-05, "loss": 2.3758686065673826, "step": 24670 }, { "epoch": 7.005393130854386, "grad_norm": 6.8533101081848145, "learning_rate": 6.498722679534488e-05, "loss": 2.4156700134277345, "step": 24680 }, { "epoch": 7.008231620777746, "grad_norm": 6.436008930206299, "learning_rate": 6.497303434572807e-05, "loss": 2.443888282775879, "step": 24690 }, { "epoch": 7.011070110701107, "grad_norm": 7.294393062591553, "learning_rate": 6.495884189611127e-05, "loss": 2.3559932708740234, "step": 24700 }, { "epoch": 7.013908600624468, "grad_norm": 6.759249687194824, "learning_rate": 6.494464944649446e-05, "loss": 2.3883316040039064, "step": 24710 }, { "epoch": 7.016747090547828, "grad_norm": 6.723632335662842, "learning_rate": 6.493045699687767e-05, "loss": 2.3222354888916015, "step": 24720 }, { "epoch": 7.0195855804711895, "grad_norm": 6.940752029418945, "learning_rate": 6.491626454726086e-05, "loss": 2.415761947631836, "step": 24730 }, { "epoch": 7.02242407039455, "grad_norm": 6.399590015411377, "learning_rate": 6.490207209764405e-05, "loss": 2.4074996948242187, "step": 24740 }, { "epoch": 7.025262560317911, "grad_norm": 6.385077953338623, "learning_rate": 6.488787964802725e-05, "loss": 2.3895183563232423, "step": 24750 }, { "epoch": 7.028101050241272, "grad_norm": 6.520626068115234, "learning_rate": 
6.487368719841045e-05, "loss": 2.46475830078125, "step": 24760 }, { "epoch": 7.030939540164632, "grad_norm": 6.988461017608643, "learning_rate": 6.485949474879365e-05, "loss": 2.382170867919922, "step": 24770 }, { "epoch": 7.033778030087993, "grad_norm": 6.896969795227051, "learning_rate": 6.484530229917684e-05, "loss": 2.436529541015625, "step": 24780 }, { "epoch": 7.0366165200113535, "grad_norm": 6.752686500549316, "learning_rate": 6.483110984956003e-05, "loss": 2.4161907196044923, "step": 24790 }, { "epoch": 7.039455009934715, "grad_norm": 6.834910869598389, "learning_rate": 6.481691739994323e-05, "loss": 2.444271469116211, "step": 24800 }, { "epoch": 7.042293499858076, "grad_norm": 7.074514389038086, "learning_rate": 6.480272495032643e-05, "loss": 2.4325056076049805, "step": 24810 }, { "epoch": 7.045131989781436, "grad_norm": 6.778689861297607, "learning_rate": 6.478853250070963e-05, "loss": 2.481636810302734, "step": 24820 }, { "epoch": 7.047970479704797, "grad_norm": 6.642706394195557, "learning_rate": 6.477434005109282e-05, "loss": 2.370331382751465, "step": 24830 }, { "epoch": 7.050808969628158, "grad_norm": 7.060964584350586, "learning_rate": 6.476014760147601e-05, "loss": 2.38138370513916, "step": 24840 }, { "epoch": 7.053647459551518, "grad_norm": 6.85936975479126, "learning_rate": 6.474595515185922e-05, "loss": 2.405693244934082, "step": 24850 }, { "epoch": 7.0564859494748795, "grad_norm": 6.753654956817627, "learning_rate": 6.473176270224241e-05, "loss": 2.4695289611816404, "step": 24860 }, { "epoch": 7.05932443939824, "grad_norm": 6.774409294128418, "learning_rate": 6.471757025262561e-05, "loss": 2.3597883224487304, "step": 24870 }, { "epoch": 7.062162929321601, "grad_norm": 6.845193386077881, "learning_rate": 6.47033778030088e-05, "loss": 2.4191156387329102, "step": 24880 }, { "epoch": 7.065001419244962, "grad_norm": 6.654033184051514, "learning_rate": 6.4689185353392e-05, "loss": 2.3696319580078127, "step": 24890 }, { "epoch": 7.067839909168322, 
"grad_norm": 6.5017805099487305, "learning_rate": 6.46749929037752e-05, "loss": 2.389557456970215, "step": 24900 }, { "epoch": 7.070678399091683, "grad_norm": 6.950746059417725, "learning_rate": 6.466080045415838e-05, "loss": 2.430397605895996, "step": 24910 }, { "epoch": 7.073516889015044, "grad_norm": 6.533839225769043, "learning_rate": 6.464660800454159e-05, "loss": 2.427303695678711, "step": 24920 }, { "epoch": 7.076355378938405, "grad_norm": 6.523582935333252, "learning_rate": 6.463241555492478e-05, "loss": 2.4010000228881836, "step": 24930 }, { "epoch": 7.079193868861766, "grad_norm": 6.392292022705078, "learning_rate": 6.461822310530799e-05, "loss": 2.371573638916016, "step": 24940 }, { "epoch": 7.082032358785126, "grad_norm": 6.685912132263184, "learning_rate": 6.460403065569117e-05, "loss": 2.416305732727051, "step": 24950 }, { "epoch": 7.084870848708487, "grad_norm": 6.912413597106934, "learning_rate": 6.458983820607437e-05, "loss": 2.400714874267578, "step": 24960 }, { "epoch": 7.087709338631848, "grad_norm": 6.830107688903809, "learning_rate": 6.457564575645757e-05, "loss": 2.429644012451172, "step": 24970 }, { "epoch": 7.090547828555208, "grad_norm": 6.7503228187561035, "learning_rate": 6.456145330684077e-05, "loss": 2.470657730102539, "step": 24980 }, { "epoch": 7.0933863184785695, "grad_norm": 6.452540397644043, "learning_rate": 6.454726085722395e-05, "loss": 2.379440689086914, "step": 24990 }, { "epoch": 7.096224808401931, "grad_norm": 6.871942520141602, "learning_rate": 6.453306840760716e-05, "loss": 2.372479248046875, "step": 25000 }, { "epoch": 7.096224808401931, "eval_accuracy": 0.2846060914351116, "eval_loss": 2.716517686843872, "eval_runtime": 51.9679, "eval_samples_per_second": 302.629, "eval_steps_per_second": 4.734, "step": 25000 }, { "epoch": 7.099063298325291, "grad_norm": 6.940043926239014, "learning_rate": 6.451887595799035e-05, "loss": 2.5255081176757814, "step": 25010 }, { "epoch": 7.101901788248652, "grad_norm": 6.861844062805176, 
"learning_rate": 6.450468350837355e-05, "loss": 2.3800228118896483, "step": 25020 }, { "epoch": 7.104740278172012, "grad_norm": 6.932516574859619, "learning_rate": 6.449049105875674e-05, "loss": 2.389796257019043, "step": 25030 }, { "epoch": 7.107578768095373, "grad_norm": 6.621789455413818, "learning_rate": 6.447629860913993e-05, "loss": 2.347757339477539, "step": 25040 }, { "epoch": 7.110417258018734, "grad_norm": 6.240633487701416, "learning_rate": 6.446210615952314e-05, "loss": 2.3920888900756836, "step": 25050 }, { "epoch": 7.113255747942095, "grad_norm": 6.714853286743164, "learning_rate": 6.444791370990634e-05, "loss": 2.363590049743652, "step": 25060 }, { "epoch": 7.116094237865456, "grad_norm": 7.3844475746154785, "learning_rate": 6.443372126028953e-05, "loss": 2.5127822875976564, "step": 25070 }, { "epoch": 7.118932727788816, "grad_norm": 6.558947563171387, "learning_rate": 6.441952881067272e-05, "loss": 2.4449350357055666, "step": 25080 }, { "epoch": 7.121771217712177, "grad_norm": 6.919437408447266, "learning_rate": 6.440533636105593e-05, "loss": 2.418709373474121, "step": 25090 }, { "epoch": 7.124609707635538, "grad_norm": 6.7241716384887695, "learning_rate": 6.439114391143912e-05, "loss": 2.412990760803223, "step": 25100 }, { "epoch": 7.127448197558898, "grad_norm": 6.871582508087158, "learning_rate": 6.437695146182232e-05, "loss": 2.4764888763427733, "step": 25110 }, { "epoch": 7.1302866874822595, "grad_norm": 6.577028751373291, "learning_rate": 6.436275901220551e-05, "loss": 2.4310035705566406, "step": 25120 }, { "epoch": 7.133125177405621, "grad_norm": 6.546619892120361, "learning_rate": 6.43485665625887e-05, "loss": 2.3620676040649413, "step": 25130 }, { "epoch": 7.135963667328981, "grad_norm": 7.138613224029541, "learning_rate": 6.433437411297191e-05, "loss": 2.4458219528198244, "step": 25140 }, { "epoch": 7.138802157252342, "grad_norm": 6.4693474769592285, "learning_rate": 6.432018166335509e-05, "loss": 2.369105911254883, "step": 25150 }, { 
"epoch": 7.141640647175702, "grad_norm": 6.871633529663086, "learning_rate": 6.43059892137383e-05, "loss": 2.4469606399536135, "step": 25160 }, { "epoch": 7.144479137099063, "grad_norm": 6.845037937164307, "learning_rate": 6.429179676412149e-05, "loss": 2.4680501937866213, "step": 25170 }, { "epoch": 7.147317627022424, "grad_norm": 6.601794719696045, "learning_rate": 6.42776043145047e-05, "loss": 2.3604848861694334, "step": 25180 }, { "epoch": 7.150156116945785, "grad_norm": 6.321208477020264, "learning_rate": 6.426341186488788e-05, "loss": 2.4083883285522463, "step": 25190 }, { "epoch": 7.152994606869146, "grad_norm": 6.808413982391357, "learning_rate": 6.424921941527108e-05, "loss": 2.33880729675293, "step": 25200 }, { "epoch": 7.155833096792507, "grad_norm": 6.492012023925781, "learning_rate": 6.423502696565428e-05, "loss": 2.3919281005859374, "step": 25210 }, { "epoch": 7.158671586715867, "grad_norm": 6.816654682159424, "learning_rate": 6.422083451603748e-05, "loss": 2.4942325592041015, "step": 25220 }, { "epoch": 7.161510076639228, "grad_norm": 6.662685871124268, "learning_rate": 6.420664206642066e-05, "loss": 2.4021675109863283, "step": 25230 }, { "epoch": 7.164348566562588, "grad_norm": 6.550830841064453, "learning_rate": 6.419244961680387e-05, "loss": 2.4154277801513673, "step": 25240 }, { "epoch": 7.1671870564859494, "grad_norm": 6.722223281860352, "learning_rate": 6.417825716718706e-05, "loss": 2.3824489593505858, "step": 25250 }, { "epoch": 7.170025546409311, "grad_norm": 6.794999122619629, "learning_rate": 6.416406471757026e-05, "loss": 2.463863754272461, "step": 25260 }, { "epoch": 7.172864036332671, "grad_norm": 6.823795795440674, "learning_rate": 6.414987226795345e-05, "loss": 2.4400537490844725, "step": 25270 }, { "epoch": 7.175702526256032, "grad_norm": 6.460846900939941, "learning_rate": 6.413567981833664e-05, "loss": 2.329344367980957, "step": 25280 }, { "epoch": 7.178541016179393, "grad_norm": 6.8871169090271, "learning_rate": 
6.412148736871985e-05, "loss": 2.392549514770508, "step": 25290 }, { "epoch": 7.181379506102753, "grad_norm": 7.1966705322265625, "learning_rate": 6.410729491910304e-05, "loss": 2.369533920288086, "step": 25300 }, { "epoch": 7.184217996026114, "grad_norm": 6.953819751739502, "learning_rate": 6.409310246948624e-05, "loss": 2.4385461807250977, "step": 25310 }, { "epoch": 7.1870564859494745, "grad_norm": 6.585728168487549, "learning_rate": 6.407891001986943e-05, "loss": 2.3530838012695314, "step": 25320 }, { "epoch": 7.189894975872836, "grad_norm": 6.598480224609375, "learning_rate": 6.406471757025264e-05, "loss": 2.3993553161621093, "step": 25330 }, { "epoch": 7.192733465796197, "grad_norm": 6.7202019691467285, "learning_rate": 6.405052512063583e-05, "loss": 2.401923179626465, "step": 25340 }, { "epoch": 7.195571955719557, "grad_norm": 6.788612365722656, "learning_rate": 6.403633267101902e-05, "loss": 2.4660770416259767, "step": 25350 }, { "epoch": 7.198410445642918, "grad_norm": 6.910785675048828, "learning_rate": 6.402214022140222e-05, "loss": 2.332903289794922, "step": 25360 }, { "epoch": 7.201248935566278, "grad_norm": 6.883638858795166, "learning_rate": 6.400794777178541e-05, "loss": 2.365425872802734, "step": 25370 }, { "epoch": 7.204087425489639, "grad_norm": 6.7219462394714355, "learning_rate": 6.399375532216862e-05, "loss": 2.446874237060547, "step": 25380 }, { "epoch": 7.2069259154130005, "grad_norm": 6.68604040145874, "learning_rate": 6.39795628725518e-05, "loss": 2.3195247650146484, "step": 25390 }, { "epoch": 7.209764405336361, "grad_norm": 6.5479960441589355, "learning_rate": 6.3965370422935e-05, "loss": 2.3870285034179686, "step": 25400 }, { "epoch": 7.212602895259722, "grad_norm": 6.651524066925049, "learning_rate": 6.39511779733182e-05, "loss": 2.4767858505249025, "step": 25410 }, { "epoch": 7.215441385183083, "grad_norm": 6.538511276245117, "learning_rate": 6.39369855237014e-05, "loss": 2.3863527297973635, "step": 25420 }, { "epoch": 
7.218279875106443, "grad_norm": 7.171009540557861, "learning_rate": 6.392279307408458e-05, "loss": 2.4695669174194337, "step": 25430 }, { "epoch": 7.221118365029804, "grad_norm": 6.619693279266357, "learning_rate": 6.390860062446779e-05, "loss": 2.3614158630371094, "step": 25440 }, { "epoch": 7.2239568549531645, "grad_norm": 6.4533843994140625, "learning_rate": 6.389440817485098e-05, "loss": 2.3751073837280274, "step": 25450 }, { "epoch": 7.226795344876526, "grad_norm": 6.776688575744629, "learning_rate": 6.388021572523418e-05, "loss": 2.399395751953125, "step": 25460 }, { "epoch": 7.229633834799887, "grad_norm": 6.598304271697998, "learning_rate": 6.386602327561737e-05, "loss": 2.3570127487182617, "step": 25470 }, { "epoch": 7.232472324723247, "grad_norm": 6.5811238288879395, "learning_rate": 6.385183082600056e-05, "loss": 2.3981611251831056, "step": 25480 }, { "epoch": 7.235310814646608, "grad_norm": 6.634573459625244, "learning_rate": 6.383763837638377e-05, "loss": 2.3170909881591797, "step": 25490 }, { "epoch": 7.238149304569969, "grad_norm": 6.353449821472168, "learning_rate": 6.382344592676696e-05, "loss": 2.3935699462890625, "step": 25500 }, { "epoch": 7.238149304569969, "eval_accuracy": 0.2886119412475361, "eval_loss": 2.71157169342041, "eval_runtime": 53.094, "eval_samples_per_second": 296.211, "eval_steps_per_second": 4.633, "step": 25500 }, { "epoch": 7.240987794493329, "grad_norm": 6.814818859100342, "learning_rate": 6.380925347715016e-05, "loss": 2.369755744934082, "step": 25510 }, { "epoch": 7.2438262844166905, "grad_norm": 6.97282075881958, "learning_rate": 6.379506102753335e-05, "loss": 2.470378875732422, "step": 25520 }, { "epoch": 7.246664774340051, "grad_norm": 6.52481746673584, "learning_rate": 6.378086857791656e-05, "loss": 2.4409414291381837, "step": 25530 }, { "epoch": 7.249503264263412, "grad_norm": 6.703507900238037, "learning_rate": 6.376667612829975e-05, "loss": 2.4033653259277346, "step": 25540 }, { "epoch": 7.252341754186773, 
"grad_norm": 6.752054691314697, "learning_rate": 6.375248367868294e-05, "loss": 2.3985755920410154, "step": 25550 }, { "epoch": 7.255180244110133, "grad_norm": 6.946252346038818, "learning_rate": 6.373829122906614e-05, "loss": 2.400307464599609, "step": 25560 }, { "epoch": 7.258018734033494, "grad_norm": 6.7540106773376465, "learning_rate": 6.372409877944934e-05, "loss": 2.3972450256347657, "step": 25570 }, { "epoch": 7.2608572239568545, "grad_norm": 6.632865905761719, "learning_rate": 6.370990632983254e-05, "loss": 2.393133354187012, "step": 25580 }, { "epoch": 7.263695713880216, "grad_norm": 6.70805025100708, "learning_rate": 6.369571388021573e-05, "loss": 2.3821887969970703, "step": 25590 }, { "epoch": 7.266534203803577, "grad_norm": 6.861043453216553, "learning_rate": 6.368152143059892e-05, "loss": 2.4173622131347656, "step": 25600 }, { "epoch": 7.269372693726937, "grad_norm": 7.181243419647217, "learning_rate": 6.366732898098212e-05, "loss": 2.3798095703125, "step": 25610 }, { "epoch": 7.272211183650298, "grad_norm": 6.550010681152344, "learning_rate": 6.365313653136532e-05, "loss": 2.4473424911499024, "step": 25620 }, { "epoch": 7.275049673573659, "grad_norm": 6.855264663696289, "learning_rate": 6.36389440817485e-05, "loss": 2.3100868225097657, "step": 25630 }, { "epoch": 7.277888163497019, "grad_norm": 6.643893241882324, "learning_rate": 6.362475163213171e-05, "loss": 2.3760553359985352, "step": 25640 }, { "epoch": 7.2807266534203805, "grad_norm": 6.256556510925293, "learning_rate": 6.36105591825149e-05, "loss": 2.389259910583496, "step": 25650 }, { "epoch": 7.283565143343741, "grad_norm": 6.662134647369385, "learning_rate": 6.359636673289811e-05, "loss": 2.449393081665039, "step": 25660 }, { "epoch": 7.286403633267102, "grad_norm": 6.88886022567749, "learning_rate": 6.358217428328129e-05, "loss": 2.3785888671875, "step": 25670 }, { "epoch": 7.289242123190463, "grad_norm": 6.907346248626709, "learning_rate": 6.35679818336645e-05, "loss": 2.4362699508666994, 
"step": 25680 }, { "epoch": 7.292080613113823, "grad_norm": 6.691803932189941, "learning_rate": 6.355378938404769e-05, "loss": 2.3673051834106444, "step": 25690 }, { "epoch": 7.294919103037184, "grad_norm": 6.688382148742676, "learning_rate": 6.353959693443088e-05, "loss": 2.360006332397461, "step": 25700 }, { "epoch": 7.297757592960545, "grad_norm": 6.703055381774902, "learning_rate": 6.352540448481408e-05, "loss": 2.4437772750854494, "step": 25710 }, { "epoch": 7.300596082883906, "grad_norm": 6.9012908935546875, "learning_rate": 6.351121203519727e-05, "loss": 2.4069543838500977, "step": 25720 }, { "epoch": 7.303434572807267, "grad_norm": 6.760648727416992, "learning_rate": 6.349701958558048e-05, "loss": 2.4462905883789063, "step": 25730 }, { "epoch": 7.306273062730627, "grad_norm": 6.711932182312012, "learning_rate": 6.348282713596367e-05, "loss": 2.4276941299438475, "step": 25740 }, { "epoch": 7.309111552653988, "grad_norm": 6.5036468505859375, "learning_rate": 6.346863468634686e-05, "loss": 2.3854297637939452, "step": 25750 }, { "epoch": 7.311950042577349, "grad_norm": 6.627821445465088, "learning_rate": 6.345444223673006e-05, "loss": 2.3989099502563476, "step": 25760 }, { "epoch": 7.314788532500709, "grad_norm": 7.032029151916504, "learning_rate": 6.344024978711326e-05, "loss": 2.317095947265625, "step": 25770 }, { "epoch": 7.3176270224240705, "grad_norm": 6.69108772277832, "learning_rate": 6.342605733749646e-05, "loss": 2.351945495605469, "step": 25780 }, { "epoch": 7.320465512347431, "grad_norm": 6.34332799911499, "learning_rate": 6.341186488787965e-05, "loss": 2.4044971466064453, "step": 25790 }, { "epoch": 7.323304002270792, "grad_norm": 6.757865905761719, "learning_rate": 6.339767243826284e-05, "loss": 2.4328197479248046, "step": 25800 }, { "epoch": 7.326142492194153, "grad_norm": 7.060494422912598, "learning_rate": 6.338347998864604e-05, "loss": 2.454773712158203, "step": 25810 }, { "epoch": 7.328980982117513, "grad_norm": 6.621903419494629, 
"learning_rate": 6.336928753902924e-05, "loss": 2.442915916442871, "step": 25820 }, { "epoch": 7.331819472040874, "grad_norm": 7.123822212219238, "learning_rate": 6.335509508941242e-05, "loss": 2.422414207458496, "step": 25830 }, { "epoch": 7.334657961964235, "grad_norm": 6.501167297363281, "learning_rate": 6.334090263979563e-05, "loss": 2.395231819152832, "step": 25840 }, { "epoch": 7.337496451887596, "grad_norm": 6.653839588165283, "learning_rate": 6.332671019017882e-05, "loss": 2.435892868041992, "step": 25850 }, { "epoch": 7.340334941810957, "grad_norm": 6.817480564117432, "learning_rate": 6.331251774056203e-05, "loss": 2.3297374725341795, "step": 25860 }, { "epoch": 7.343173431734318, "grad_norm": 7.16447639465332, "learning_rate": 6.329832529094521e-05, "loss": 2.346445655822754, "step": 25870 }, { "epoch": 7.346011921657678, "grad_norm": 7.086986541748047, "learning_rate": 6.328413284132842e-05, "loss": 2.4841545104980467, "step": 25880 }, { "epoch": 7.348850411581039, "grad_norm": 6.766867637634277, "learning_rate": 6.326994039171161e-05, "loss": 2.4146390914916993, "step": 25890 }, { "epoch": 7.351688901504399, "grad_norm": 6.68381929397583, "learning_rate": 6.325574794209482e-05, "loss": 2.3876575469970702, "step": 25900 }, { "epoch": 7.3545273914277605, "grad_norm": 6.929403781890869, "learning_rate": 6.3241555492478e-05, "loss": 2.439004135131836, "step": 25910 }, { "epoch": 7.357365881351122, "grad_norm": 7.107536315917969, "learning_rate": 6.32273630428612e-05, "loss": 2.3784807205200194, "step": 25920 }, { "epoch": 7.360204371274482, "grad_norm": 6.244600772857666, "learning_rate": 6.32131705932444e-05, "loss": 2.405898857116699, "step": 25930 }, { "epoch": 7.363042861197843, "grad_norm": 6.851384162902832, "learning_rate": 6.319897814362759e-05, "loss": 2.428889846801758, "step": 25940 }, { "epoch": 7.365881351121203, "grad_norm": 6.319164752960205, "learning_rate": 6.318478569401078e-05, "loss": 2.364712142944336, "step": 25950 }, { "epoch": 
7.368719841044564, "grad_norm": 6.4256978034973145, "learning_rate": 6.317059324439398e-05, "loss": 2.3582178115844727, "step": 25960 }, { "epoch": 7.371558330967925, "grad_norm": 6.865230560302734, "learning_rate": 6.315640079477718e-05, "loss": 2.383022117614746, "step": 25970 }, { "epoch": 7.374396820891286, "grad_norm": 6.71386194229126, "learning_rate": 6.314220834516038e-05, "loss": 2.3841405868530274, "step": 25980 }, { "epoch": 7.377235310814647, "grad_norm": 7.000174045562744, "learning_rate": 6.312801589554357e-05, "loss": 2.3758846282958985, "step": 25990 }, { "epoch": 7.380073800738008, "grad_norm": 7.139054298400879, "learning_rate": 6.311382344592676e-05, "loss": 2.4506731033325195, "step": 26000 }, { "epoch": 7.380073800738008, "eval_accuracy": 0.29013797927131685, "eval_loss": 2.704927682876587, "eval_runtime": 53.1131, "eval_samples_per_second": 296.104, "eval_steps_per_second": 4.632, "step": 26000 }, { "epoch": 7.382912290661368, "grad_norm": 7.039389133453369, "learning_rate": 6.309963099630997e-05, "loss": 2.441193771362305, "step": 26010 }, { "epoch": 7.385750780584729, "grad_norm": 6.761870384216309, "learning_rate": 6.308543854669316e-05, "loss": 2.4554960250854494, "step": 26020 }, { "epoch": 7.388589270508089, "grad_norm": 6.922343730926514, "learning_rate": 6.307124609707636e-05, "loss": 2.349243927001953, "step": 26030 }, { "epoch": 7.3914277604314504, "grad_norm": 6.586727142333984, "learning_rate": 6.305705364745955e-05, "loss": 2.4463062286376953, "step": 26040 }, { "epoch": 7.3942662503548116, "grad_norm": 6.9889140129089355, "learning_rate": 6.304286119784274e-05, "loss": 2.4096166610717775, "step": 26050 }, { "epoch": 7.397104740278172, "grad_norm": 7.192827224731445, "learning_rate": 6.302866874822595e-05, "loss": 2.48742561340332, "step": 26060 }, { "epoch": 7.399943230201533, "grad_norm": 6.814613342285156, "learning_rate": 6.301447629860913e-05, "loss": 2.4959869384765625, "step": 26070 }, { "epoch": 7.402781720124894, 
"grad_norm": 6.917481899261475, "learning_rate": 6.300028384899234e-05, "loss": 2.4464244842529297, "step": 26080 }, { "epoch": 7.405620210048254, "grad_norm": 6.729472637176514, "learning_rate": 6.298609139937553e-05, "loss": 2.4658241271972656, "step": 26090 }, { "epoch": 7.408458699971615, "grad_norm": 6.759723663330078, "learning_rate": 6.297189894975874e-05, "loss": 2.4088766098022463, "step": 26100 }, { "epoch": 7.4112971898949755, "grad_norm": 6.968299388885498, "learning_rate": 6.295770650014193e-05, "loss": 2.436565971374512, "step": 26110 }, { "epoch": 7.414135679818337, "grad_norm": 6.556213855743408, "learning_rate": 6.294351405052513e-05, "loss": 2.3345598220825194, "step": 26120 }, { "epoch": 7.416974169741698, "grad_norm": 6.614538669586182, "learning_rate": 6.292932160090832e-05, "loss": 2.34884090423584, "step": 26130 }, { "epoch": 7.419812659665058, "grad_norm": 6.509438991546631, "learning_rate": 6.291512915129151e-05, "loss": 2.3233909606933594, "step": 26140 }, { "epoch": 7.422651149588419, "grad_norm": 6.5199384689331055, "learning_rate": 6.290093670167472e-05, "loss": 2.376646614074707, "step": 26150 }, { "epoch": 7.425489639511779, "grad_norm": 7.198007583618164, "learning_rate": 6.28867442520579e-05, "loss": 2.3899763107299803, "step": 26160 }, { "epoch": 7.42832812943514, "grad_norm": 6.398108959197998, "learning_rate": 6.28725518024411e-05, "loss": 2.39815616607666, "step": 26170 }, { "epoch": 7.4311666193585015, "grad_norm": 6.7416510581970215, "learning_rate": 6.28583593528243e-05, "loss": 2.405443572998047, "step": 26180 }, { "epoch": 7.434005109281862, "grad_norm": 6.833760738372803, "learning_rate": 6.28441669032075e-05, "loss": 2.4137964248657227, "step": 26190 }, { "epoch": 7.436843599205223, "grad_norm": 6.459751605987549, "learning_rate": 6.282997445359069e-05, "loss": 2.3792362213134766, "step": 26200 }, { "epoch": 7.439682089128584, "grad_norm": 6.833566188812256, "learning_rate": 6.281578200397389e-05, "loss": 
2.4150518417358398, "step": 26210 }, { "epoch": 7.442520579051944, "grad_norm": 7.083742141723633, "learning_rate": 6.280158955435709e-05, "loss": 2.335506629943848, "step": 26220 }, { "epoch": 7.445359068975305, "grad_norm": 6.997290134429932, "learning_rate": 6.278739710474029e-05, "loss": 2.405082130432129, "step": 26230 }, { "epoch": 7.4481975588986655, "grad_norm": 6.887082576751709, "learning_rate": 6.277320465512347e-05, "loss": 2.419173812866211, "step": 26240 }, { "epoch": 7.451036048822027, "grad_norm": 6.914144515991211, "learning_rate": 6.275901220550668e-05, "loss": 2.4063114166259765, "step": 26250 }, { "epoch": 7.453874538745388, "grad_norm": 6.377435684204102, "learning_rate": 6.274481975588987e-05, "loss": 2.326699066162109, "step": 26260 }, { "epoch": 7.456713028668748, "grad_norm": 6.759915828704834, "learning_rate": 6.273062730627307e-05, "loss": 2.45582275390625, "step": 26270 }, { "epoch": 7.459551518592109, "grad_norm": 6.1969685554504395, "learning_rate": 6.271643485665626e-05, "loss": 2.4615236282348634, "step": 26280 }, { "epoch": 7.46239000851547, "grad_norm": 6.946773529052734, "learning_rate": 6.270224240703945e-05, "loss": 2.423421096801758, "step": 26290 }, { "epoch": 7.46522849843883, "grad_norm": 6.653613090515137, "learning_rate": 6.268804995742266e-05, "loss": 2.452910614013672, "step": 26300 }, { "epoch": 7.4680669883621915, "grad_norm": 6.440325736999512, "learning_rate": 6.267385750780585e-05, "loss": 2.3579450607299806, "step": 26310 }, { "epoch": 7.470905478285552, "grad_norm": 6.871437072753906, "learning_rate": 6.265966505818905e-05, "loss": 2.396692657470703, "step": 26320 }, { "epoch": 7.473743968208913, "grad_norm": 6.172481536865234, "learning_rate": 6.264547260857224e-05, "loss": 2.398271942138672, "step": 26330 }, { "epoch": 7.476582458132274, "grad_norm": 6.565873622894287, "learning_rate": 6.263128015895545e-05, "loss": 2.3475236892700195, "step": 26340 }, { "epoch": 7.479420948055634, "grad_norm": 
6.466111183166504, "learning_rate": 6.261708770933864e-05, "loss": 2.485027313232422, "step": 26350 }, { "epoch": 7.482259437978995, "grad_norm": 6.916967391967773, "learning_rate": 6.260289525972183e-05, "loss": 2.3680841445922853, "step": 26360 }, { "epoch": 7.4850979279023555, "grad_norm": 6.6014180183410645, "learning_rate": 6.258870281010503e-05, "loss": 2.3714099884033204, "step": 26370 }, { "epoch": 7.487936417825717, "grad_norm": 6.468209266662598, "learning_rate": 6.257451036048822e-05, "loss": 2.3858489990234375, "step": 26380 }, { "epoch": 7.490774907749078, "grad_norm": 6.698957443237305, "learning_rate": 6.256031791087143e-05, "loss": 2.3918136596679687, "step": 26390 }, { "epoch": 7.493613397672438, "grad_norm": 6.725753307342529, "learning_rate": 6.25461254612546e-05, "loss": 2.3866256713867187, "step": 26400 }, { "epoch": 7.496451887595799, "grad_norm": 6.892436504364014, "learning_rate": 6.253193301163781e-05, "loss": 2.4455028533935548, "step": 26410 }, { "epoch": 7.49929037751916, "grad_norm": 6.432968616485596, "learning_rate": 6.2517740562021e-05, "loss": 2.402680206298828, "step": 26420 }, { "epoch": 7.50212886744252, "grad_norm": 6.760826110839844, "learning_rate": 6.250354811240421e-05, "loss": 2.4207042694091796, "step": 26430 }, { "epoch": 7.5049673573658815, "grad_norm": 6.575415134429932, "learning_rate": 6.248935566278739e-05, "loss": 2.4166950225830077, "step": 26440 }, { "epoch": 7.507805847289243, "grad_norm": 6.806710243225098, "learning_rate": 6.24751632131706e-05, "loss": 2.4030006408691404, "step": 26450 }, { "epoch": 7.510644337212603, "grad_norm": 6.708930492401123, "learning_rate": 6.246097076355379e-05, "loss": 2.3943653106689453, "step": 26460 }, { "epoch": 7.513482827135964, "grad_norm": 7.117057800292969, "learning_rate": 6.2446778313937e-05, "loss": 2.4041984558105467, "step": 26470 }, { "epoch": 7.516321317059324, "grad_norm": 7.031278133392334, "learning_rate": 6.243258586432018e-05, "loss": 2.385260009765625, "step": 
26480 }, { "epoch": 7.519159806982685, "grad_norm": 6.49534273147583, "learning_rate": 6.241839341470339e-05, "loss": 2.358173370361328, "step": 26490 }, { "epoch": 7.521998296906046, "grad_norm": 6.6359758377075195, "learning_rate": 6.240420096508658e-05, "loss": 2.4104686737060548, "step": 26500 }, { "epoch": 7.521998296906046, "eval_accuracy": 0.28657722388249507, "eval_loss": 2.6990387439727783, "eval_runtime": 56.658, "eval_samples_per_second": 277.578, "eval_steps_per_second": 4.342, "step": 26500 }, { "epoch": 7.524836786829407, "grad_norm": 6.397943019866943, "learning_rate": 6.239000851546977e-05, "loss": 2.4065519332885743, "step": 26510 }, { "epoch": 7.527675276752768, "grad_norm": 6.919848918914795, "learning_rate": 6.237581606585297e-05, "loss": 2.4186471939086913, "step": 26520 }, { "epoch": 7.530513766676128, "grad_norm": 6.767887115478516, "learning_rate": 6.236162361623616e-05, "loss": 2.4608768463134765, "step": 26530 }, { "epoch": 7.533352256599489, "grad_norm": 6.605247497558594, "learning_rate": 6.234743116661937e-05, "loss": 2.39440860748291, "step": 26540 }, { "epoch": 7.53619074652285, "grad_norm": 6.976144313812256, "learning_rate": 6.233323871700256e-05, "loss": 2.4006235122680666, "step": 26550 }, { "epoch": 7.53902923644621, "grad_norm": 7.204406261444092, "learning_rate": 6.231904626738575e-05, "loss": 2.392486000061035, "step": 26560 }, { "epoch": 7.5418677263695715, "grad_norm": 6.8084635734558105, "learning_rate": 6.230485381776895e-05, "loss": 2.3572366714477537, "step": 26570 }, { "epoch": 7.544706216292932, "grad_norm": 6.681204319000244, "learning_rate": 6.229066136815215e-05, "loss": 2.4145402908325195, "step": 26580 }, { "epoch": 7.547544706216293, "grad_norm": 6.693953037261963, "learning_rate": 6.227646891853535e-05, "loss": 2.4053266525268553, "step": 26590 }, { "epoch": 7.550383196139654, "grad_norm": 6.963315010070801, "learning_rate": 6.226227646891854e-05, "loss": 2.357467842102051, "step": 26600 }, { "epoch": 
7.553221686063014, "grad_norm": 6.657169342041016, "learning_rate": 6.224808401930173e-05, "loss": 2.472349166870117, "step": 26610 }, { "epoch": 7.556060175986375, "grad_norm": 6.557709217071533, "learning_rate": 6.223389156968493e-05, "loss": 2.381886100769043, "step": 26620 }, { "epoch": 7.558898665909736, "grad_norm": 6.904953956604004, "learning_rate": 6.221969912006813e-05, "loss": 2.453321647644043, "step": 26630 }, { "epoch": 7.561737155833097, "grad_norm": 6.829317092895508, "learning_rate": 6.220550667045131e-05, "loss": 2.3751571655273436, "step": 26640 }, { "epoch": 7.564575645756458, "grad_norm": 6.801843166351318, "learning_rate": 6.219131422083452e-05, "loss": 2.351247787475586, "step": 26650 }, { "epoch": 7.567414135679819, "grad_norm": 6.9131951332092285, "learning_rate": 6.217712177121771e-05, "loss": 2.3679386138916017, "step": 26660 }, { "epoch": 7.570252625603179, "grad_norm": 6.901749610900879, "learning_rate": 6.216292932160092e-05, "loss": 2.3734371185302736, "step": 26670 }, { "epoch": 7.57309111552654, "grad_norm": 6.667954921722412, "learning_rate": 6.21487368719841e-05, "loss": 2.446079063415527, "step": 26680 }, { "epoch": 7.5759296054499, "grad_norm": 6.993813991546631, "learning_rate": 6.213454442236731e-05, "loss": 2.337972068786621, "step": 26690 }, { "epoch": 7.5787680953732615, "grad_norm": 6.42673921585083, "learning_rate": 6.21203519727505e-05, "loss": 2.419547271728516, "step": 26700 }, { "epoch": 7.581606585296623, "grad_norm": 6.560650825500488, "learning_rate": 6.21061595231337e-05, "loss": 2.4357295989990235, "step": 26710 }, { "epoch": 7.584445075219983, "grad_norm": 7.295538902282715, "learning_rate": 6.209196707351689e-05, "loss": 2.3246528625488283, "step": 26720 }, { "epoch": 7.587283565143344, "grad_norm": 7.029907703399658, "learning_rate": 6.207777462390008e-05, "loss": 2.3898983001708984, "step": 26730 }, { "epoch": 7.590122055066704, "grad_norm": 6.512372970581055, "learning_rate": 6.206358217428329e-05, "loss": 
2.351784515380859, "step": 26740 }, { "epoch": 7.592960544990065, "grad_norm": 6.568445682525635, "learning_rate": 6.204938972466648e-05, "loss": 2.4010868072509766, "step": 26750 }, { "epoch": 7.595799034913426, "grad_norm": 6.828911781311035, "learning_rate": 6.203519727504967e-05, "loss": 2.31143741607666, "step": 26760 }, { "epoch": 7.598637524836787, "grad_norm": 6.705779075622559, "learning_rate": 6.202100482543287e-05, "loss": 2.449458122253418, "step": 26770 }, { "epoch": 7.601476014760148, "grad_norm": 6.423527240753174, "learning_rate": 6.200681237581607e-05, "loss": 2.45245418548584, "step": 26780 }, { "epoch": 7.604314504683508, "grad_norm": 6.693976402282715, "learning_rate": 6.199261992619927e-05, "loss": 2.432330513000488, "step": 26790 }, { "epoch": 7.607152994606869, "grad_norm": 6.374234199523926, "learning_rate": 6.197842747658246e-05, "loss": 2.351073455810547, "step": 26800 }, { "epoch": 7.60999148453023, "grad_norm": 6.676353454589844, "learning_rate": 6.196423502696565e-05, "loss": 2.359508514404297, "step": 26810 }, { "epoch": 7.61282997445359, "grad_norm": 6.580334186553955, "learning_rate": 6.195004257734886e-05, "loss": 2.419461250305176, "step": 26820 }, { "epoch": 7.615668464376951, "grad_norm": 6.721037864685059, "learning_rate": 6.193585012773205e-05, "loss": 2.332809066772461, "step": 26830 }, { "epoch": 7.6185069543003126, "grad_norm": 6.640226364135742, "learning_rate": 6.192165767811525e-05, "loss": 2.4591192245483398, "step": 26840 }, { "epoch": 7.621345444223673, "grad_norm": 6.662311553955078, "learning_rate": 6.190746522849844e-05, "loss": 2.4247684478759766, "step": 26850 }, { "epoch": 7.624183934147034, "grad_norm": 7.127460479736328, "learning_rate": 6.189327277888163e-05, "loss": 2.3930782318115233, "step": 26860 }, { "epoch": 7.627022424070395, "grad_norm": 6.948635101318359, "learning_rate": 6.187908032926484e-05, "loss": 2.3824302673339846, "step": 26870 }, { "epoch": 7.629860913993755, "grad_norm": 6.47599983215332, 
"learning_rate": 6.186488787964802e-05, "loss": 2.4302539825439453, "step": 26880 }, { "epoch": 7.632699403917116, "grad_norm": 6.730532646179199, "learning_rate": 6.185069543003123e-05, "loss": 2.4035110473632812, "step": 26890 }, { "epoch": 7.6355378938404765, "grad_norm": 6.699854373931885, "learning_rate": 6.183650298041442e-05, "loss": 2.4405256271362306, "step": 26900 }, { "epoch": 7.638376383763838, "grad_norm": 6.8603315353393555, "learning_rate": 6.182231053079763e-05, "loss": 2.358321762084961, "step": 26910 }, { "epoch": 7.641214873687199, "grad_norm": 7.01021671295166, "learning_rate": 6.180811808118081e-05, "loss": 2.2968770980834963, "step": 26920 }, { "epoch": 7.644053363610559, "grad_norm": 6.6976704597473145, "learning_rate": 6.179392563156401e-05, "loss": 2.4281091690063477, "step": 26930 }, { "epoch": 7.64689185353392, "grad_norm": 6.971150875091553, "learning_rate": 6.177973318194721e-05, "loss": 2.405438232421875, "step": 26940 }, { "epoch": 7.64973034345728, "grad_norm": 6.703743934631348, "learning_rate": 6.17655407323304e-05, "loss": 2.438441276550293, "step": 26950 }, { "epoch": 7.652568833380641, "grad_norm": 6.716554164886475, "learning_rate": 6.175276752767529e-05, "loss": 2.385936737060547, "step": 26960 }, { "epoch": 7.6554073233040025, "grad_norm": 6.688619136810303, "learning_rate": 6.173857507805848e-05, "loss": 2.4115665435791014, "step": 26970 }, { "epoch": 7.658245813227363, "grad_norm": 6.970142841339111, "learning_rate": 6.172438262844167e-05, "loss": 2.3440835952758787, "step": 26980 }, { "epoch": 7.661084303150724, "grad_norm": 6.759530067443848, "learning_rate": 6.171019017882487e-05, "loss": 2.29937629699707, "step": 26990 }, { "epoch": 7.663922793074085, "grad_norm": 6.947240829467773, "learning_rate": 6.169599772920806e-05, "loss": 2.429606246948242, "step": 27000 }, { "epoch": 7.663922793074085, "eval_accuracy": 0.2942074140013989, "eval_loss": 2.68684720993042, "eval_runtime": 53.1112, "eval_samples_per_second": 
296.115, "eval_steps_per_second": 4.632, "step": 27000 }, { "epoch": 7.666761282997445, "grad_norm": 6.685393810272217, "learning_rate": 6.168180527959127e-05, "loss": 2.4189252853393555, "step": 27010 }, { "epoch": 7.669599772920806, "grad_norm": 6.556713104248047, "learning_rate": 6.166761282997445e-05, "loss": 2.3776792526245116, "step": 27020 }, { "epoch": 7.6724382628441665, "grad_norm": 6.7449235916137695, "learning_rate": 6.165342038035765e-05, "loss": 2.4051977157592774, "step": 27030 }, { "epoch": 7.675276752767528, "grad_norm": 6.642181396484375, "learning_rate": 6.163922793074085e-05, "loss": 2.375467300415039, "step": 27040 }, { "epoch": 7.678115242690889, "grad_norm": 7.363474369049072, "learning_rate": 6.162503548112405e-05, "loss": 2.3928625106811525, "step": 27050 }, { "epoch": 7.680953732614249, "grad_norm": 6.702805042266846, "learning_rate": 6.161084303150723e-05, "loss": 2.4834537506103516, "step": 27060 }, { "epoch": 7.68379222253761, "grad_norm": 6.840671062469482, "learning_rate": 6.159665058189044e-05, "loss": 2.3725242614746094, "step": 27070 }, { "epoch": 7.686630712460971, "grad_norm": 6.7849345207214355, "learning_rate": 6.158245813227363e-05, "loss": 2.365949821472168, "step": 27080 }, { "epoch": 7.689469202384331, "grad_norm": 7.07089376449585, "learning_rate": 6.156826568265683e-05, "loss": 2.434286880493164, "step": 27090 }, { "epoch": 7.6923076923076925, "grad_norm": 6.457353591918945, "learning_rate": 6.155407323304002e-05, "loss": 2.3357372283935547, "step": 27100 }, { "epoch": 7.695146182231053, "grad_norm": 6.811819553375244, "learning_rate": 6.153988078342321e-05, "loss": 2.3694347381591796, "step": 27110 }, { "epoch": 7.697984672154414, "grad_norm": 6.859020233154297, "learning_rate": 6.152568833380642e-05, "loss": 2.385867691040039, "step": 27120 }, { "epoch": 7.700823162077775, "grad_norm": 6.978448867797852, "learning_rate": 6.151149588418961e-05, "loss": 2.4387245178222656, "step": 27130 }, { "epoch": 7.703661652001135, 
"grad_norm": 6.71302604675293, "learning_rate": 6.149730343457281e-05, "loss": 2.500988006591797, "step": 27140 }, { "epoch": 7.706500141924496, "grad_norm": 6.790470600128174, "learning_rate": 6.1483110984956e-05, "loss": 2.33365421295166, "step": 27150 }, { "epoch": 7.7093386318478565, "grad_norm": 6.648815631866455, "learning_rate": 6.146891853533921e-05, "loss": 2.430598831176758, "step": 27160 }, { "epoch": 7.712177121771218, "grad_norm": 6.56483793258667, "learning_rate": 6.14547260857224e-05, "loss": 2.3255233764648438, "step": 27170 }, { "epoch": 7.715015611694579, "grad_norm": 6.580753326416016, "learning_rate": 6.14405336361056e-05, "loss": 2.335091972351074, "step": 27180 }, { "epoch": 7.717854101617939, "grad_norm": 6.79150390625, "learning_rate": 6.142634118648879e-05, "loss": 2.420218086242676, "step": 27190 }, { "epoch": 7.7206925915413, "grad_norm": 6.339142799377441, "learning_rate": 6.1412148736872e-05, "loss": 2.438707733154297, "step": 27200 }, { "epoch": 7.723531081464661, "grad_norm": 6.818111419677734, "learning_rate": 6.139795628725519e-05, "loss": 2.35299072265625, "step": 27210 }, { "epoch": 7.726369571388021, "grad_norm": 6.883272647857666, "learning_rate": 6.138376383763838e-05, "loss": 2.3458080291748047, "step": 27220 }, { "epoch": 7.7292080613113825, "grad_norm": 7.001041412353516, "learning_rate": 6.136957138802157e-05, "loss": 2.373095703125, "step": 27230 }, { "epoch": 7.732046551234744, "grad_norm": 6.6436262130737305, "learning_rate": 6.135537893840477e-05, "loss": 2.3940507888793947, "step": 27240 }, { "epoch": 7.734885041158104, "grad_norm": 6.837100982666016, "learning_rate": 6.134118648878798e-05, "loss": 2.4370927810668945, "step": 27250 }, { "epoch": 7.737723531081465, "grad_norm": 6.760686874389648, "learning_rate": 6.132699403917115e-05, "loss": 2.421274948120117, "step": 27260 }, { "epoch": 7.740562021004825, "grad_norm": 6.488892555236816, "learning_rate": 6.131280158955436e-05, "loss": 2.3738943099975587, "step": 27270 
}, { "epoch": 7.743400510928186, "grad_norm": 6.398158550262451, "learning_rate": 6.129860913993756e-05, "loss": 2.4252838134765624, "step": 27280 }, { "epoch": 7.746239000851547, "grad_norm": 6.614654541015625, "learning_rate": 6.128441669032076e-05, "loss": 2.4014358520507812, "step": 27290 }, { "epoch": 7.749077490774908, "grad_norm": 6.708235263824463, "learning_rate": 6.127022424070394e-05, "loss": 2.3153451919555663, "step": 27300 }, { "epoch": 7.751915980698269, "grad_norm": 6.71376895904541, "learning_rate": 6.125603179108715e-05, "loss": 2.3904136657714843, "step": 27310 }, { "epoch": 7.754754470621629, "grad_norm": 6.524368762969971, "learning_rate": 6.124183934147034e-05, "loss": 2.300881004333496, "step": 27320 }, { "epoch": 7.75759296054499, "grad_norm": 6.716808319091797, "learning_rate": 6.122764689185354e-05, "loss": 2.434156799316406, "step": 27330 }, { "epoch": 7.760431450468351, "grad_norm": 6.483133792877197, "learning_rate": 6.121345444223673e-05, "loss": 2.419179916381836, "step": 27340 }, { "epoch": 7.763269940391711, "grad_norm": 6.393593788146973, "learning_rate": 6.119926199261992e-05, "loss": 2.3668760299682616, "step": 27350 }, { "epoch": 7.7661084303150725, "grad_norm": 6.597276210784912, "learning_rate": 6.118506954300313e-05, "loss": 2.3962285995483397, "step": 27360 }, { "epoch": 7.768946920238433, "grad_norm": 6.795810699462891, "learning_rate": 6.117087709338632e-05, "loss": 2.2701353073120116, "step": 27370 }, { "epoch": 7.771785410161794, "grad_norm": 6.326918601989746, "learning_rate": 6.115668464376952e-05, "loss": 2.3920015335083007, "step": 27380 }, { "epoch": 7.774623900085155, "grad_norm": 6.93362283706665, "learning_rate": 6.114249219415271e-05, "loss": 2.4107501983642576, "step": 27390 }, { "epoch": 7.777462390008515, "grad_norm": 6.461177825927734, "learning_rate": 6.112829974453592e-05, "loss": 2.4088748931884765, "step": 27400 }, { "epoch": 7.780300879931876, "grad_norm": 6.813750743865967, "learning_rate": 
6.111410729491911e-05, "loss": 2.458831024169922, "step": 27410 }, { "epoch": 7.783139369855237, "grad_norm": 6.840523719787598, "learning_rate": 6.10999148453023e-05, "loss": 2.501734733581543, "step": 27420 }, { "epoch": 7.785977859778598, "grad_norm": 6.719699382781982, "learning_rate": 6.10857223956855e-05, "loss": 2.4020286560058595, "step": 27430 }, { "epoch": 7.788816349701959, "grad_norm": 6.918241024017334, "learning_rate": 6.107152994606869e-05, "loss": 2.3948337554931642, "step": 27440 }, { "epoch": 7.79165483962532, "grad_norm": 6.549995422363281, "learning_rate": 6.10573374964519e-05, "loss": 2.4098848342895507, "step": 27450 }, { "epoch": 7.79449332954868, "grad_norm": 6.895074844360352, "learning_rate": 6.104314504683508e-05, "loss": 2.4143093109130858, "step": 27460 }, { "epoch": 7.797331819472041, "grad_norm": 7.0353922843933105, "learning_rate": 6.102895259721828e-05, "loss": 2.416548156738281, "step": 27470 }, { "epoch": 7.800170309395401, "grad_norm": 6.954307556152344, "learning_rate": 6.101476014760148e-05, "loss": 2.456694984436035, "step": 27480 }, { "epoch": 7.8030087993187625, "grad_norm": 7.071238040924072, "learning_rate": 6.1000567697984676e-05, "loss": 2.4366619110107424, "step": 27490 }, { "epoch": 7.805847289242124, "grad_norm": 6.789133071899414, "learning_rate": 6.098637524836787e-05, "loss": 2.3843368530273437, "step": 27500 }, { "epoch": 7.805847289242124, "eval_accuracy": 0.2898836396006867, "eval_loss": 2.68178391456604, "eval_runtime": 56.1978, "eval_samples_per_second": 279.851, "eval_steps_per_second": 4.377, "step": 27500 }, { "epoch": 7.808685779165484, "grad_norm": 6.4439520835876465, "learning_rate": 6.097218279875106e-05, "loss": 2.422525978088379, "step": 27510 }, { "epoch": 7.811524269088845, "grad_norm": 6.56974458694458, "learning_rate": 6.095799034913426e-05, "loss": 2.4039173126220703, "step": 27520 }, { "epoch": 7.814362759012205, "grad_norm": 6.6858906745910645, "learning_rate": 6.094379789951746e-05, "loss": 
2.3968647003173826, "step": 27530 }, { "epoch": 7.817201248935566, "grad_norm": 6.718699932098389, "learning_rate": 6.092960544990065e-05, "loss": 2.3267921447753905, "step": 27540 }, { "epoch": 7.820039738858927, "grad_norm": 6.554198265075684, "learning_rate": 6.091541300028385e-05, "loss": 2.4275882720947264, "step": 27550 }, { "epoch": 7.822878228782288, "grad_norm": 6.786749839782715, "learning_rate": 6.090122055066705e-05, "loss": 2.362214279174805, "step": 27560 }, { "epoch": 7.825716718705649, "grad_norm": 7.008045673370361, "learning_rate": 6.088702810105025e-05, "loss": 2.366486358642578, "step": 27570 }, { "epoch": 7.828555208629009, "grad_norm": 6.647365093231201, "learning_rate": 6.0872835651433436e-05, "loss": 2.3656856536865236, "step": 27580 }, { "epoch": 7.83139369855237, "grad_norm": 6.240646839141846, "learning_rate": 6.0858643201816636e-05, "loss": 2.366420555114746, "step": 27590 }, { "epoch": 7.834232188475731, "grad_norm": 6.919047832489014, "learning_rate": 6.0844450752199836e-05, "loss": 2.423602485656738, "step": 27600 }, { "epoch": 7.837070678399091, "grad_norm": 7.005289554595947, "learning_rate": 6.083025830258303e-05, "loss": 2.402277946472168, "step": 27610 }, { "epoch": 7.839909168322452, "grad_norm": 6.841098785400391, "learning_rate": 6.081606585296622e-05, "loss": 2.385087585449219, "step": 27620 }, { "epoch": 7.8427476582458135, "grad_norm": 6.570130825042725, "learning_rate": 6.0801873403349416e-05, "loss": 2.3755163192749023, "step": 27630 }, { "epoch": 7.845586148169174, "grad_norm": 6.948818206787109, "learning_rate": 6.0787680953732616e-05, "loss": 2.4714527130126953, "step": 27640 }, { "epoch": 7.848424638092535, "grad_norm": 6.3699517250061035, "learning_rate": 6.0773488504115816e-05, "loss": 2.4038543701171875, "step": 27650 }, { "epoch": 7.851263128015896, "grad_norm": 6.284588813781738, "learning_rate": 6.0759296054499e-05, "loss": 2.3781871795654297, "step": 27660 }, { "epoch": 7.854101617939256, "grad_norm": 
6.613318920135498, "learning_rate": 6.07451036048822e-05, "loss": 2.401602363586426, "step": 27670 }, { "epoch": 7.856940107862617, "grad_norm": 6.574204444885254, "learning_rate": 6.07309111552654e-05, "loss": 2.455954742431641, "step": 27680 }, { "epoch": 7.8597785977859775, "grad_norm": 6.781551361083984, "learning_rate": 6.07167187056486e-05, "loss": 2.409098434448242, "step": 27690 }, { "epoch": 7.862617087709339, "grad_norm": 7.066442489624023, "learning_rate": 6.070252625603179e-05, "loss": 2.36193962097168, "step": 27700 }, { "epoch": 7.8654555776327, "grad_norm": 6.673850059509277, "learning_rate": 6.068833380641499e-05, "loss": 2.3557659149169923, "step": 27710 }, { "epoch": 7.86829406755606, "grad_norm": 6.835977077484131, "learning_rate": 6.067414135679818e-05, "loss": 2.368724250793457, "step": 27720 }, { "epoch": 7.871132557479421, "grad_norm": 6.583373546600342, "learning_rate": 6.065994890718138e-05, "loss": 2.345169448852539, "step": 27730 }, { "epoch": 7.873971047402781, "grad_norm": 6.427889347076416, "learning_rate": 6.064575645756457e-05, "loss": 2.437419891357422, "step": 27740 }, { "epoch": 7.876809537326142, "grad_norm": 6.613348007202148, "learning_rate": 6.063156400794777e-05, "loss": 2.459239196777344, "step": 27750 }, { "epoch": 7.8796480272495035, "grad_norm": 5.882919788360596, "learning_rate": 6.061737155833097e-05, "loss": 2.3443635940551757, "step": 27760 }, { "epoch": 7.882486517172864, "grad_norm": 7.030297756195068, "learning_rate": 6.060317910871417e-05, "loss": 2.3763330459594725, "step": 27770 }, { "epoch": 7.885325007096225, "grad_norm": 6.587264060974121, "learning_rate": 6.058898665909736e-05, "loss": 2.4304149627685545, "step": 27780 }, { "epoch": 7.888163497019586, "grad_norm": 6.697344779968262, "learning_rate": 6.057479420948056e-05, "loss": 2.3554319381713866, "step": 27790 }, { "epoch": 7.891001986942946, "grad_norm": 6.979598045349121, "learning_rate": 6.056060175986376e-05, "loss": 2.384914016723633, "step": 27800 
}, { "epoch": 7.893840476866307, "grad_norm": 6.497492790222168, "learning_rate": 6.054640931024696e-05, "loss": 2.3554433822631835, "step": 27810 }, { "epoch": 7.8966789667896675, "grad_norm": 6.8351545333862305, "learning_rate": 6.053221686063015e-05, "loss": 2.320460891723633, "step": 27820 }, { "epoch": 7.899517456713029, "grad_norm": 7.0521321296691895, "learning_rate": 6.0518024411013344e-05, "loss": 2.3567800521850586, "step": 27830 }, { "epoch": 7.90235594663639, "grad_norm": 6.553372383117676, "learning_rate": 6.050383196139654e-05, "loss": 2.3493955612182615, "step": 27840 }, { "epoch": 7.90519443655975, "grad_norm": 6.6514458656311035, "learning_rate": 6.048963951177974e-05, "loss": 2.420359230041504, "step": 27850 }, { "epoch": 7.908032926483111, "grad_norm": 6.57050895690918, "learning_rate": 6.047544706216294e-05, "loss": 2.316244888305664, "step": 27860 }, { "epoch": 7.910871416406472, "grad_norm": 6.719569683074951, "learning_rate": 6.0461254612546124e-05, "loss": 2.440070152282715, "step": 27870 }, { "epoch": 7.913709906329832, "grad_norm": 6.993510723114014, "learning_rate": 6.0447062162929324e-05, "loss": 2.372110939025879, "step": 27880 }, { "epoch": 7.9165483962531935, "grad_norm": 6.756947040557861, "learning_rate": 6.0432869713312524e-05, "loss": 2.3529531478881838, "step": 27890 }, { "epoch": 7.919386886176554, "grad_norm": 6.694365978240967, "learning_rate": 6.0418677263695724e-05, "loss": 2.4008934020996096, "step": 27900 }, { "epoch": 7.922225376099915, "grad_norm": 6.595396518707275, "learning_rate": 6.040448481407891e-05, "loss": 2.3771324157714844, "step": 27910 }, { "epoch": 7.925063866023276, "grad_norm": 6.661069393157959, "learning_rate": 6.039029236446211e-05, "loss": 2.362055778503418, "step": 27920 }, { "epoch": 7.927902355946636, "grad_norm": 6.660787582397461, "learning_rate": 6.037609991484531e-05, "loss": 2.3373062133789064, "step": 27930 }, { "epoch": 7.930740845869997, "grad_norm": 6.354430198669434, "learning_rate": 
6.0361907465228504e-05, "loss": 2.3859350204467775, "step": 27940 }, { "epoch": 7.9335793357933575, "grad_norm": 6.8354644775390625, "learning_rate": 6.03477150156117e-05, "loss": 2.3151453018188475, "step": 27950 }, { "epoch": 7.936417825716719, "grad_norm": 6.374092102050781, "learning_rate": 6.033352256599489e-05, "loss": 2.33992862701416, "step": 27960 }, { "epoch": 7.93925631564008, "grad_norm": 6.527353763580322, "learning_rate": 6.031933011637809e-05, "loss": 2.4147777557373047, "step": 27970 }, { "epoch": 7.94209480556344, "grad_norm": 6.330588340759277, "learning_rate": 6.030513766676129e-05, "loss": 2.3549991607666017, "step": 27980 }, { "epoch": 7.944933295486801, "grad_norm": 6.759909629821777, "learning_rate": 6.029094521714448e-05, "loss": 2.449125289916992, "step": 27990 }, { "epoch": 7.947771785410162, "grad_norm": 6.7896904945373535, "learning_rate": 6.027675276752768e-05, "loss": 2.3530956268310548, "step": 28000 }, { "epoch": 7.947771785410162, "eval_accuracy": 0.28975646976537167, "eval_loss": 2.6691195964813232, "eval_runtime": 51.6171, "eval_samples_per_second": 304.686, "eval_steps_per_second": 4.766, "step": 28000 }, { "epoch": 7.950610275333522, "grad_norm": 6.977718353271484, "learning_rate": 6.026256031791088e-05, "loss": 2.3528411865234373, "step": 28010 }, { "epoch": 7.9534487652568835, "grad_norm": 6.584957599639893, "learning_rate": 6.024836786829408e-05, "loss": 2.4038976669311523, "step": 28020 }, { "epoch": 7.956287255180245, "grad_norm": 6.5932793617248535, "learning_rate": 6.0234175418677264e-05, "loss": 2.3816322326660155, "step": 28030 }, { "epoch": 7.959125745103605, "grad_norm": 6.833106517791748, "learning_rate": 6.0219982969060464e-05, "loss": 2.3637516021728517, "step": 28040 }, { "epoch": 7.961964235026966, "grad_norm": 6.760029315948486, "learning_rate": 6.0205790519443664e-05, "loss": 2.4224170684814452, "step": 28050 }, { "epoch": 7.964802724950326, "grad_norm": 6.712746620178223, "learning_rate": 
6.019159806982686e-05, "loss": 2.3210529327392577, "step": 28060 }, { "epoch": 7.967641214873687, "grad_norm": 6.727018356323242, "learning_rate": 6.017740562021005e-05, "loss": 2.354473114013672, "step": 28070 }, { "epoch": 7.970479704797048, "grad_norm": 6.650294780731201, "learning_rate": 6.0163213170593244e-05, "loss": 2.3510368347167967, "step": 28080 }, { "epoch": 7.973318194720409, "grad_norm": 6.875160217285156, "learning_rate": 6.0149020720976445e-05, "loss": 2.344834327697754, "step": 28090 }, { "epoch": 7.97615668464377, "grad_norm": 6.593456268310547, "learning_rate": 6.0134828271359645e-05, "loss": 2.3808834075927736, "step": 28100 }, { "epoch": 7.97899517456713, "grad_norm": 6.742185592651367, "learning_rate": 6.012063582174283e-05, "loss": 2.3926773071289062, "step": 28110 }, { "epoch": 7.981833664490491, "grad_norm": 7.101158142089844, "learning_rate": 6.010644337212603e-05, "loss": 2.4004745483398438, "step": 28120 }, { "epoch": 7.984672154413852, "grad_norm": 6.914261341094971, "learning_rate": 6.009225092250923e-05, "loss": 2.42657470703125, "step": 28130 }, { "epoch": 7.987510644337212, "grad_norm": 6.4855451583862305, "learning_rate": 6.007805847289243e-05, "loss": 2.3708314895629883, "step": 28140 }, { "epoch": 7.9903491342605735, "grad_norm": 6.401483058929443, "learning_rate": 6.006386602327562e-05, "loss": 2.366872787475586, "step": 28150 }, { "epoch": 7.993187624183934, "grad_norm": 6.956151485443115, "learning_rate": 6.004967357365882e-05, "loss": 2.411237335205078, "step": 28160 }, { "epoch": 7.996026114107295, "grad_norm": 6.526834011077881, "learning_rate": 6.003548112404201e-05, "loss": 2.365838623046875, "step": 28170 }, { "epoch": 7.998864604030656, "grad_norm": 6.746609210968018, "learning_rate": 6.002128867442521e-05, "loss": 2.361313056945801, "step": 28180 }, { "epoch": 8.001703093954017, "grad_norm": 7.109193801879883, "learning_rate": 6.000851546977009e-05, "loss": 2.454269790649414, "step": 28190 }, { "epoch": 
8.004541583877376, "grad_norm": 6.715788841247559, "learning_rate": 5.9994323020153284e-05, "loss": 2.3892452239990236, "step": 28200 }, { "epoch": 8.007380073800737, "grad_norm": 6.623186111450195, "learning_rate": 5.998013057053648e-05, "loss": 2.3281633377075197, "step": 28210 }, { "epoch": 8.010218563724099, "grad_norm": 6.788899898529053, "learning_rate": 5.996593812091967e-05, "loss": 2.282880973815918, "step": 28220 }, { "epoch": 8.01305705364746, "grad_norm": 6.75747013092041, "learning_rate": 5.995174567130287e-05, "loss": 2.2964744567871094, "step": 28230 }, { "epoch": 8.01589554357082, "grad_norm": 6.871132850646973, "learning_rate": 5.993755322168607e-05, "loss": 2.3728532791137695, "step": 28240 }, { "epoch": 8.018734033494182, "grad_norm": 6.4863691329956055, "learning_rate": 5.992336077206926e-05, "loss": 2.355050468444824, "step": 28250 }, { "epoch": 8.021572523417541, "grad_norm": 6.803843975067139, "learning_rate": 5.990916832245246e-05, "loss": 2.3356111526489256, "step": 28260 }, { "epoch": 8.024411013340902, "grad_norm": 6.60715389251709, "learning_rate": 5.989497587283566e-05, "loss": 2.324787712097168, "step": 28270 }, { "epoch": 8.027249503264263, "grad_norm": 6.657890319824219, "learning_rate": 5.988078342321886e-05, "loss": 2.2925342559814452, "step": 28280 }, { "epoch": 8.030087993187625, "grad_norm": 6.435883522033691, "learning_rate": 5.9866590973602044e-05, "loss": 2.3841856002807615, "step": 28290 }, { "epoch": 8.032926483110986, "grad_norm": 6.705682277679443, "learning_rate": 5.9852398523985245e-05, "loss": 2.3886890411376953, "step": 28300 }, { "epoch": 8.035764973034345, "grad_norm": 6.67676305770874, "learning_rate": 5.9838206074368445e-05, "loss": 2.3527381896972654, "step": 28310 }, { "epoch": 8.038603462957706, "grad_norm": 6.897206783294678, "learning_rate": 5.982401362475164e-05, "loss": 2.32474308013916, "step": 28320 }, { "epoch": 8.041441952881067, "grad_norm": 6.718782901763916, "learning_rate": 5.980982117513483e-05, 
"loss": 2.4147281646728516, "step": 28330 }, { "epoch": 8.044280442804428, "grad_norm": 6.590494632720947, "learning_rate": 5.9795628725518025e-05, "loss": 2.362512969970703, "step": 28340 }, { "epoch": 8.04711893272779, "grad_norm": 7.0769219398498535, "learning_rate": 5.9781436275901225e-05, "loss": 2.46521110534668, "step": 28350 }, { "epoch": 8.049957422651149, "grad_norm": 6.495823383331299, "learning_rate": 5.9767243826284425e-05, "loss": 2.38504581451416, "step": 28360 }, { "epoch": 8.05279591257451, "grad_norm": 6.448268890380859, "learning_rate": 5.975305137666761e-05, "loss": 2.2599185943603515, "step": 28370 }, { "epoch": 8.055634402497871, "grad_norm": 6.439194679260254, "learning_rate": 5.973885892705081e-05, "loss": 2.315146636962891, "step": 28380 }, { "epoch": 8.058472892421232, "grad_norm": 6.773876667022705, "learning_rate": 5.972466647743401e-05, "loss": 2.3559829711914064, "step": 28390 }, { "epoch": 8.061311382344593, "grad_norm": 6.948919773101807, "learning_rate": 5.971047402781721e-05, "loss": 2.348996162414551, "step": 28400 }, { "epoch": 8.064149872267954, "grad_norm": 6.892505168914795, "learning_rate": 5.96962815782004e-05, "loss": 2.27396240234375, "step": 28410 }, { "epoch": 8.066988362191314, "grad_norm": 6.6147027015686035, "learning_rate": 5.96820891285836e-05, "loss": 2.3620105743408204, "step": 28420 }, { "epoch": 8.069826852114675, "grad_norm": 6.820864677429199, "learning_rate": 5.966789667896679e-05, "loss": 2.3899154663085938, "step": 28430 }, { "epoch": 8.072665342038036, "grad_norm": 6.496273994445801, "learning_rate": 5.965370422934999e-05, "loss": 2.403620147705078, "step": 28440 }, { "epoch": 8.075503831961397, "grad_norm": 6.560006618499756, "learning_rate": 5.963951177973318e-05, "loss": 2.3375511169433594, "step": 28450 }, { "epoch": 8.078342321884758, "grad_norm": 6.96863317489624, "learning_rate": 5.962531933011638e-05, "loss": 2.3460920333862303, "step": 28460 }, { "epoch": 8.081180811808117, "grad_norm": 
6.506337642669678, "learning_rate": 5.961112688049958e-05, "loss": 2.3483734130859375, "step": 28470 }, { "epoch": 8.084019301731479, "grad_norm": 6.556943416595459, "learning_rate": 5.959693443088278e-05, "loss": 2.2395471572875976, "step": 28480 }, { "epoch": 8.08685779165484, "grad_norm": 6.360141277313232, "learning_rate": 5.9582741981265965e-05, "loss": 2.327769088745117, "step": 28490 }, { "epoch": 8.0896962815782, "grad_norm": 7.055751800537109, "learning_rate": 5.9568549531649165e-05, "loss": 2.365883636474609, "step": 28500 }, { "epoch": 8.0896962815782, "eval_accuracy": 0.29229986647167294, "eval_loss": 2.671118974685669, "eval_runtime": 52.4264, "eval_samples_per_second": 299.982, "eval_steps_per_second": 4.692, "step": 28500 }, { "epoch": 8.092534771501562, "grad_norm": 6.859527587890625, "learning_rate": 5.9554357082032365e-05, "loss": 2.370010185241699, "step": 28510 }, { "epoch": 8.095373261424921, "grad_norm": 6.698768138885498, "learning_rate": 5.9540164632415565e-05, "loss": 2.285432052612305, "step": 28520 }, { "epoch": 8.098211751348282, "grad_norm": 6.459585666656494, "learning_rate": 5.952597218279875e-05, "loss": 2.384771728515625, "step": 28530 }, { "epoch": 8.101050241271643, "grad_norm": 6.548915386199951, "learning_rate": 5.951177973318195e-05, "loss": 2.3297298431396483, "step": 28540 }, { "epoch": 8.103888731195005, "grad_norm": 6.736359119415283, "learning_rate": 5.9497587283565145e-05, "loss": 2.3983108520507814, "step": 28550 }, { "epoch": 8.106727221118366, "grad_norm": 6.911204814910889, "learning_rate": 5.9483394833948345e-05, "loss": 2.3585136413574217, "step": 28560 }, { "epoch": 8.109565711041725, "grad_norm": 6.655022144317627, "learning_rate": 5.946920238433153e-05, "loss": 2.3063201904296875, "step": 28570 }, { "epoch": 8.112404200965086, "grad_norm": 6.65778112411499, "learning_rate": 5.945500993471473e-05, "loss": 2.363662338256836, "step": 28580 }, { "epoch": 8.115242690888447, "grad_norm": 6.591128826141357, 
"learning_rate": 5.944081748509793e-05, "loss": 2.3472126007080076, "step": 28590 }, { "epoch": 8.118081180811808, "grad_norm": 6.823568820953369, "learning_rate": 5.942662503548113e-05, "loss": 2.347162437438965, "step": 28600 }, { "epoch": 8.12091967073517, "grad_norm": 6.982626438140869, "learning_rate": 5.941243258586432e-05, "loss": 2.394875144958496, "step": 28610 }, { "epoch": 8.12375816065853, "grad_norm": 6.725565433502197, "learning_rate": 5.939824013624752e-05, "loss": 2.3281002044677734, "step": 28620 }, { "epoch": 8.12659665058189, "grad_norm": 6.50210428237915, "learning_rate": 5.938404768663072e-05, "loss": 2.367918014526367, "step": 28630 }, { "epoch": 8.129435140505251, "grad_norm": 6.811273097991943, "learning_rate": 5.936985523701392e-05, "loss": 2.365338897705078, "step": 28640 }, { "epoch": 8.132273630428612, "grad_norm": 6.778713226318359, "learning_rate": 5.9355662787397106e-05, "loss": 2.387818145751953, "step": 28650 }, { "epoch": 8.135112120351973, "grad_norm": 6.522827625274658, "learning_rate": 5.9341470337780306e-05, "loss": 2.3398595809936524, "step": 28660 }, { "epoch": 8.137950610275334, "grad_norm": 6.361061096191406, "learning_rate": 5.93272778881635e-05, "loss": 2.352717399597168, "step": 28670 }, { "epoch": 8.140789100198694, "grad_norm": 6.743928909301758, "learning_rate": 5.93130854385467e-05, "loss": 2.3255191802978517, "step": 28680 }, { "epoch": 8.143627590122055, "grad_norm": 6.7778096199035645, "learning_rate": 5.9298892988929886e-05, "loss": 2.336793899536133, "step": 28690 }, { "epoch": 8.146466080045416, "grad_norm": 7.198493480682373, "learning_rate": 5.9284700539313086e-05, "loss": 2.3914567947387697, "step": 28700 }, { "epoch": 8.149304569968777, "grad_norm": 6.799160480499268, "learning_rate": 5.9270508089696286e-05, "loss": 2.2851701736450196, "step": 28710 }, { "epoch": 8.152143059892138, "grad_norm": 6.760130882263184, "learning_rate": 5.9256315640079486e-05, "loss": 2.3549837112426757, "step": 28720 }, { 
"epoch": 8.154981549815497, "grad_norm": 6.725278854370117, "learning_rate": 5.924212319046267e-05, "loss": 2.3661788940429687, "step": 28730 }, { "epoch": 8.157820039738858, "grad_norm": 7.010422706604004, "learning_rate": 5.922793074084587e-05, "loss": 2.409524345397949, "step": 28740 }, { "epoch": 8.16065852966222, "grad_norm": 6.664275646209717, "learning_rate": 5.921373829122907e-05, "loss": 2.322122001647949, "step": 28750 }, { "epoch": 8.16349701958558, "grad_norm": 6.6571149826049805, "learning_rate": 5.9199545841612266e-05, "loss": 2.2992116928100588, "step": 28760 }, { "epoch": 8.166335509508942, "grad_norm": 6.735662937164307, "learning_rate": 5.918535339199546e-05, "loss": 2.3149930953979494, "step": 28770 }, { "epoch": 8.169173999432301, "grad_norm": 6.796168804168701, "learning_rate": 5.917116094237866e-05, "loss": 2.4192935943603517, "step": 28780 }, { "epoch": 8.172012489355662, "grad_norm": 6.71599006652832, "learning_rate": 5.915696849276185e-05, "loss": 2.3876903533935545, "step": 28790 }, { "epoch": 8.174850979279023, "grad_norm": 7.105084419250488, "learning_rate": 5.914277604314505e-05, "loss": 2.365924072265625, "step": 28800 }, { "epoch": 8.177689469202384, "grad_norm": 6.336195468902588, "learning_rate": 5.912858359352824e-05, "loss": 2.321063232421875, "step": 28810 }, { "epoch": 8.180527959125746, "grad_norm": 6.769916534423828, "learning_rate": 5.911439114391144e-05, "loss": 2.3523113250732424, "step": 28820 }, { "epoch": 8.183366449049107, "grad_norm": 6.892121315002441, "learning_rate": 5.910019869429464e-05, "loss": 2.362961769104004, "step": 28830 }, { "epoch": 8.186204938972466, "grad_norm": 6.708437919616699, "learning_rate": 5.908600624467784e-05, "loss": 2.3758169174194337, "step": 28840 }, { "epoch": 8.189043428895827, "grad_norm": 6.7446136474609375, "learning_rate": 5.9071813795061026e-05, "loss": 2.3370071411132813, "step": 28850 }, { "epoch": 8.191881918819188, "grad_norm": 6.955051422119141, "learning_rate": 
5.9057621345444226e-05, "loss": 2.3432680130004884, "step": 28860 }, { "epoch": 8.19472040874255, "grad_norm": 7.009851455688477, "learning_rate": 5.9043428895827427e-05, "loss": 2.389603614807129, "step": 28870 }, { "epoch": 8.19755889866591, "grad_norm": 6.756693363189697, "learning_rate": 5.902923644621062e-05, "loss": 2.2432588577270507, "step": 28880 }, { "epoch": 8.20039738858927, "grad_norm": 6.578105449676514, "learning_rate": 5.901504399659381e-05, "loss": 2.3455432891845702, "step": 28890 }, { "epoch": 8.203235878512631, "grad_norm": 6.604701995849609, "learning_rate": 5.9000851546977006e-05, "loss": 2.375021553039551, "step": 28900 }, { "epoch": 8.206074368435992, "grad_norm": 6.87200927734375, "learning_rate": 5.8986659097360207e-05, "loss": 2.4284317016601564, "step": 28910 }, { "epoch": 8.208912858359353, "grad_norm": 6.483757972717285, "learning_rate": 5.897246664774341e-05, "loss": 2.33446044921875, "step": 28920 }, { "epoch": 8.211751348282714, "grad_norm": 6.982252597808838, "learning_rate": 5.895827419812659e-05, "loss": 2.324961853027344, "step": 28930 }, { "epoch": 8.214589838206074, "grad_norm": 6.47934103012085, "learning_rate": 5.894408174850979e-05, "loss": 2.2825727462768555, "step": 28940 }, { "epoch": 8.217428328129435, "grad_norm": 6.915575981140137, "learning_rate": 5.8929889298892993e-05, "loss": 2.4005523681640626, "step": 28950 }, { "epoch": 8.220266818052796, "grad_norm": 6.467010021209717, "learning_rate": 5.8915696849276194e-05, "loss": 2.316960906982422, "step": 28960 }, { "epoch": 8.223105307976157, "grad_norm": 6.640051364898682, "learning_rate": 5.890150439965938e-05, "loss": 2.3631290435791015, "step": 28970 }, { "epoch": 8.225943797899518, "grad_norm": 6.58030891418457, "learning_rate": 5.888731195004258e-05, "loss": 2.464146041870117, "step": 28980 }, { "epoch": 8.228782287822877, "grad_norm": 6.969743251800537, "learning_rate": 5.887311950042578e-05, "loss": 2.3512971878051756, "step": 28990 }, { "epoch": 
8.231620777746238, "grad_norm": 6.337347507476807, "learning_rate": 5.8858927050808974e-05, "loss": 2.342195510864258, "step": 29000 }, { "epoch": 8.231620777746238, "eval_accuracy": 0.29961213200228903, "eval_loss": 2.6569204330444336, "eval_runtime": 51.5224, "eval_samples_per_second": 305.246, "eval_steps_per_second": 4.775, "step": 29000 }, { "epoch": 8.2344592676696, "grad_norm": 6.958029747009277, "learning_rate": 5.884473460119217e-05, "loss": 2.369139862060547, "step": 29010 }, { "epoch": 8.23729775759296, "grad_norm": 6.660041332244873, "learning_rate": 5.883054215157536e-05, "loss": 2.3411890029907227, "step": 29020 }, { "epoch": 8.240136247516322, "grad_norm": 6.863073348999023, "learning_rate": 5.881634970195856e-05, "loss": 2.3330663681030273, "step": 29030 }, { "epoch": 8.242974737439683, "grad_norm": 6.675815105438232, "learning_rate": 5.880215725234176e-05, "loss": 2.3427270889282226, "step": 29040 }, { "epoch": 8.245813227363042, "grad_norm": 6.952666282653809, "learning_rate": 5.878796480272495e-05, "loss": 2.378923797607422, "step": 29050 }, { "epoch": 8.248651717286403, "grad_norm": 6.5739006996154785, "learning_rate": 5.877377235310815e-05, "loss": 2.3243255615234375, "step": 29060 }, { "epoch": 8.251490207209764, "grad_norm": 6.662243843078613, "learning_rate": 5.875957990349135e-05, "loss": 2.3040035247802733, "step": 29070 }, { "epoch": 8.254328697133126, "grad_norm": 6.62453556060791, "learning_rate": 5.874538745387455e-05, "loss": 2.355974006652832, "step": 29080 }, { "epoch": 8.257167187056487, "grad_norm": 6.660816669464111, "learning_rate": 5.8731195004257734e-05, "loss": 2.33551139831543, "step": 29090 }, { "epoch": 8.260005676979846, "grad_norm": 6.843783855438232, "learning_rate": 5.8717002554640934e-05, "loss": 2.302178955078125, "step": 29100 }, { "epoch": 8.262844166903207, "grad_norm": 6.349062442779541, "learning_rate": 5.8702810105024134e-05, "loss": 2.2766691207885743, "step": 29110 }, { "epoch": 8.265682656826568, 
"grad_norm": 6.681468963623047, "learning_rate": 5.868861765540733e-05, "loss": 2.3973770141601562, "step": 29120 }, { "epoch": 8.26852114674993, "grad_norm": 6.2638044357299805, "learning_rate": 5.867442520579052e-05, "loss": 2.323769950866699, "step": 29130 }, { "epoch": 8.27135963667329, "grad_norm": 6.527162075042725, "learning_rate": 5.8660232756173714e-05, "loss": 2.3622291564941404, "step": 29140 }, { "epoch": 8.27419812659665, "grad_norm": 6.227894306182861, "learning_rate": 5.8646040306556914e-05, "loss": 2.2893398284912108, "step": 29150 }, { "epoch": 8.27703661652001, "grad_norm": 6.866387367248535, "learning_rate": 5.8631847856940114e-05, "loss": 2.3133867263793944, "step": 29160 }, { "epoch": 8.279875106443372, "grad_norm": 6.5019307136535645, "learning_rate": 5.86176554073233e-05, "loss": 2.367503356933594, "step": 29170 }, { "epoch": 8.282713596366733, "grad_norm": 6.73724365234375, "learning_rate": 5.86034629577065e-05, "loss": 2.4110414505004885, "step": 29180 }, { "epoch": 8.285552086290094, "grad_norm": 6.687900543212891, "learning_rate": 5.85892705080897e-05, "loss": 2.3525463104248048, "step": 29190 }, { "epoch": 8.288390576213455, "grad_norm": 6.334206581115723, "learning_rate": 5.85750780584729e-05, "loss": 2.381901168823242, "step": 29200 }, { "epoch": 8.291229066136815, "grad_norm": 6.560166358947754, "learning_rate": 5.856088560885609e-05, "loss": 2.3545511245727537, "step": 29210 }, { "epoch": 8.294067556060176, "grad_norm": 6.913517475128174, "learning_rate": 5.854669315923929e-05, "loss": 2.3479997634887697, "step": 29220 }, { "epoch": 8.296906045983537, "grad_norm": 6.518318176269531, "learning_rate": 5.853250070962248e-05, "loss": 2.356728935241699, "step": 29230 }, { "epoch": 8.299744535906898, "grad_norm": 6.324429035186768, "learning_rate": 5.851830826000568e-05, "loss": 2.342626953125, "step": 29240 }, { "epoch": 8.302583025830259, "grad_norm": 6.898766994476318, "learning_rate": 5.850411581038887e-05, "loss": 2.3550819396972655, 
"step": 29250 }, { "epoch": 8.305421515753618, "grad_norm": 6.982869625091553, "learning_rate": 5.848992336077207e-05, "loss": 2.42128963470459, "step": 29260 }, { "epoch": 8.30826000567698, "grad_norm": 7.010739326477051, "learning_rate": 5.847573091115527e-05, "loss": 2.3867530822753906, "step": 29270 }, { "epoch": 8.31109849560034, "grad_norm": 6.5985212326049805, "learning_rate": 5.846153846153847e-05, "loss": 2.4364906311035157, "step": 29280 }, { "epoch": 8.313936985523702, "grad_norm": 6.510429382324219, "learning_rate": 5.8447346011921654e-05, "loss": 2.3434961318969725, "step": 29290 }, { "epoch": 8.316775475447063, "grad_norm": 6.454311370849609, "learning_rate": 5.8433153562304855e-05, "loss": 2.387272834777832, "step": 29300 }, { "epoch": 8.319613965370422, "grad_norm": 6.743826389312744, "learning_rate": 5.8418961112688055e-05, "loss": 2.34752197265625, "step": 29310 }, { "epoch": 8.322452455293783, "grad_norm": 6.635402202606201, "learning_rate": 5.8404768663071255e-05, "loss": 2.395268440246582, "step": 29320 }, { "epoch": 8.325290945217144, "grad_norm": 6.885883331298828, "learning_rate": 5.839057621345444e-05, "loss": 2.3988214492797852, "step": 29330 }, { "epoch": 8.328129435140506, "grad_norm": 6.877366542816162, "learning_rate": 5.837638376383764e-05, "loss": 2.377612495422363, "step": 29340 }, { "epoch": 8.330967925063867, "grad_norm": 6.57864236831665, "learning_rate": 5.8362191314220835e-05, "loss": 2.3944271087646483, "step": 29350 }, { "epoch": 8.333806414987226, "grad_norm": 6.792334079742432, "learning_rate": 5.8347998864604035e-05, "loss": 2.407621383666992, "step": 29360 }, { "epoch": 8.336644904910587, "grad_norm": 6.943974018096924, "learning_rate": 5.833380641498722e-05, "loss": 2.3656436920166017, "step": 29370 }, { "epoch": 8.339483394833948, "grad_norm": 6.8627471923828125, "learning_rate": 5.831961396537042e-05, "loss": 2.3458812713623045, "step": 29380 }, { "epoch": 8.34232188475731, "grad_norm": 6.6561737060546875, 
"learning_rate": 5.830542151575362e-05, "loss": 2.3518856048583983, "step": 29390 }, { "epoch": 8.34516037468067, "grad_norm": 6.86549186706543, "learning_rate": 5.829122906613682e-05, "loss": 2.420240783691406, "step": 29400 }, { "epoch": 8.347998864604032, "grad_norm": 6.896262168884277, "learning_rate": 5.827703661652001e-05, "loss": 2.338709259033203, "step": 29410 }, { "epoch": 8.35083735452739, "grad_norm": 6.633720397949219, "learning_rate": 5.826284416690321e-05, "loss": 2.3009258270263673, "step": 29420 }, { "epoch": 8.353675844450752, "grad_norm": 6.581032752990723, "learning_rate": 5.824865171728641e-05, "loss": 2.2909912109375, "step": 29430 }, { "epoch": 8.356514334374113, "grad_norm": 6.605586051940918, "learning_rate": 5.823445926766961e-05, "loss": 2.3002790451049804, "step": 29440 }, { "epoch": 8.359352824297474, "grad_norm": 6.570489406585693, "learning_rate": 5.8220266818052795e-05, "loss": 2.3851797103881838, "step": 29450 }, { "epoch": 8.362191314220835, "grad_norm": 7.04016637802124, "learning_rate": 5.8206074368435995e-05, "loss": 2.4089120864868163, "step": 29460 }, { "epoch": 8.365029804144195, "grad_norm": 6.288635730743408, "learning_rate": 5.819188191881919e-05, "loss": 2.3565025329589844, "step": 29470 }, { "epoch": 8.367868294067556, "grad_norm": 7.062806129455566, "learning_rate": 5.817768946920239e-05, "loss": 2.354460906982422, "step": 29480 }, { "epoch": 8.370706783990917, "grad_norm": 6.709425926208496, "learning_rate": 5.8163497019585575e-05, "loss": 2.397325897216797, "step": 29490 }, { "epoch": 8.373545273914278, "grad_norm": 6.827889919281006, "learning_rate": 5.8149304569968775e-05, "loss": 2.2874128341674806, "step": 29500 }, { "epoch": 8.373545273914278, "eval_accuracy": 0.2934443949895085, "eval_loss": 2.656182050704956, "eval_runtime": 49.1411, "eval_samples_per_second": 320.038, "eval_steps_per_second": 5.006, "step": 29500 }, { "epoch": 8.376383763837639, "grad_norm": 6.497047424316406, "learning_rate": 
5.8135112120351975e-05, "loss": 2.381071853637695, "step": 29510 }, { "epoch": 8.379222253760998, "grad_norm": 6.779560565948486, "learning_rate": 5.8120919670735175e-05, "loss": 2.350160598754883, "step": 29520 }, { "epoch": 8.38206074368436, "grad_norm": 6.637528419494629, "learning_rate": 5.8106727221118375e-05, "loss": 2.303739547729492, "step": 29530 }, { "epoch": 8.38489923360772, "grad_norm": 6.524947643280029, "learning_rate": 5.809253477150156e-05, "loss": 2.358320426940918, "step": 29540 }, { "epoch": 8.387737723531082, "grad_norm": 6.5897698402404785, "learning_rate": 5.807834232188476e-05, "loss": 2.3120323181152345, "step": 29550 }, { "epoch": 8.390576213454443, "grad_norm": 6.467411041259766, "learning_rate": 5.806414987226796e-05, "loss": 2.331195831298828, "step": 29560 }, { "epoch": 8.393414703377802, "grad_norm": 6.584548473358154, "learning_rate": 5.8049957422651156e-05, "loss": 2.358737754821777, "step": 29570 }, { "epoch": 8.396253193301163, "grad_norm": 6.866097450256348, "learning_rate": 5.803576497303435e-05, "loss": 2.2808162689208986, "step": 29580 }, { "epoch": 8.399091683224524, "grad_norm": 6.448230743408203, "learning_rate": 5.802157252341754e-05, "loss": 2.3435773849487305, "step": 29590 }, { "epoch": 8.401930173147885, "grad_norm": 6.619566917419434, "learning_rate": 5.800738007380074e-05, "loss": 2.292142868041992, "step": 29600 }, { "epoch": 8.404768663071247, "grad_norm": 6.738437652587891, "learning_rate": 5.799318762418394e-05, "loss": 2.410378837585449, "step": 29610 }, { "epoch": 8.407607152994608, "grad_norm": 6.405421733856201, "learning_rate": 5.797899517456713e-05, "loss": 2.333327865600586, "step": 29620 }, { "epoch": 8.410445642917967, "grad_norm": 6.762335300445557, "learning_rate": 5.796480272495033e-05, "loss": 2.4074316024780273, "step": 29630 }, { "epoch": 8.413284132841328, "grad_norm": 6.495542526245117, "learning_rate": 5.795061027533353e-05, "loss": 2.3797174453735352, "step": 29640 }, { "epoch": 
8.41612262276469, "grad_norm": 6.328438758850098, "learning_rate": 5.793641782571673e-05, "loss": 2.438658332824707, "step": 29650 }, { "epoch": 8.41896111268805, "grad_norm": 6.619978427886963, "learning_rate": 5.7922225376099916e-05, "loss": 2.424101638793945, "step": 29660 }, { "epoch": 8.421799602611411, "grad_norm": 6.972545146942139, "learning_rate": 5.7908032926483116e-05, "loss": 2.3070465087890626, "step": 29670 }, { "epoch": 8.42463809253477, "grad_norm": 6.678661823272705, "learning_rate": 5.789384047686631e-05, "loss": 2.3465803146362303, "step": 29680 }, { "epoch": 8.427476582458132, "grad_norm": 6.874884128570557, "learning_rate": 5.787964802724951e-05, "loss": 2.250725746154785, "step": 29690 }, { "epoch": 8.430315072381493, "grad_norm": 6.402337551116943, "learning_rate": 5.7865455577632696e-05, "loss": 2.3882219314575197, "step": 29700 }, { "epoch": 8.433153562304854, "grad_norm": 7.161057472229004, "learning_rate": 5.7851263128015896e-05, "loss": 2.3978351593017577, "step": 29710 }, { "epoch": 8.435992052228215, "grad_norm": 6.634573936462402, "learning_rate": 5.7837070678399096e-05, "loss": 2.3212921142578127, "step": 29720 }, { "epoch": 8.438830542151575, "grad_norm": 6.453999042510986, "learning_rate": 5.7822878228782296e-05, "loss": 2.289244842529297, "step": 29730 }, { "epoch": 8.441669032074936, "grad_norm": 6.584774971008301, "learning_rate": 5.780868577916548e-05, "loss": 2.313109588623047, "step": 29740 }, { "epoch": 8.444507521998297, "grad_norm": 7.016343593597412, "learning_rate": 5.779449332954868e-05, "loss": 2.3041688919067385, "step": 29750 }, { "epoch": 8.447346011921658, "grad_norm": 6.312516689300537, "learning_rate": 5.778030087993188e-05, "loss": 2.353688430786133, "step": 29760 }, { "epoch": 8.450184501845019, "grad_norm": 6.90769100189209, "learning_rate": 5.776610843031508e-05, "loss": 2.3473262786865234, "step": 29770 }, { "epoch": 8.453022991768378, "grad_norm": 6.6273064613342285, "learning_rate": 5.775191598069827e-05, 
"loss": 2.257670021057129, "step": 29780 }, { "epoch": 8.45586148169174, "grad_norm": 6.364598274230957, "learning_rate": 5.773772353108147e-05, "loss": 2.287656784057617, "step": 29790 }, { "epoch": 8.4586999716151, "grad_norm": 6.30982780456543, "learning_rate": 5.772353108146466e-05, "loss": 2.364837646484375, "step": 29800 }, { "epoch": 8.461538461538462, "grad_norm": 6.864008903503418, "learning_rate": 5.770933863184786e-05, "loss": 2.3420059204101564, "step": 29810 }, { "epoch": 8.464376951461823, "grad_norm": 6.658168315887451, "learning_rate": 5.769514618223105e-05, "loss": 2.369700813293457, "step": 29820 }, { "epoch": 8.467215441385184, "grad_norm": 6.961121082305908, "learning_rate": 5.768095373261425e-05, "loss": 2.338939666748047, "step": 29830 }, { "epoch": 8.470053931308543, "grad_norm": 6.189107894897461, "learning_rate": 5.766676128299745e-05, "loss": 2.297046661376953, "step": 29840 }, { "epoch": 8.472892421231904, "grad_norm": 6.827104091644287, "learning_rate": 5.765256883338065e-05, "loss": 2.396251106262207, "step": 29850 }, { "epoch": 8.475730911155265, "grad_norm": 6.8917460441589355, "learning_rate": 5.7638376383763836e-05, "loss": 2.4613601684570314, "step": 29860 }, { "epoch": 8.478569401078627, "grad_norm": 6.651663780212402, "learning_rate": 5.7624183934147036e-05, "loss": 2.3069719314575194, "step": 29870 }, { "epoch": 8.481407891001988, "grad_norm": 6.919112205505371, "learning_rate": 5.7609991484530237e-05, "loss": 2.3711727142333983, "step": 29880 }, { "epoch": 8.484246380925347, "grad_norm": 6.7759690284729, "learning_rate": 5.759579903491344e-05, "loss": 2.3313247680664064, "step": 29890 }, { "epoch": 8.487084870848708, "grad_norm": 6.641360282897949, "learning_rate": 5.758160658529662e-05, "loss": 2.3663105010986327, "step": 29900 }, { "epoch": 8.48992336077207, "grad_norm": 6.660885810852051, "learning_rate": 5.756741413567982e-05, "loss": 2.3384265899658203, "step": 29910 }, { "epoch": 8.49276185069543, "grad_norm": 
6.698131084442139, "learning_rate": 5.755322168606302e-05, "loss": 2.2936965942382814, "step": 29920 }, { "epoch": 8.495600340618791, "grad_norm": 6.624886989593506, "learning_rate": 5.753902923644622e-05, "loss": 2.3795587539672853, "step": 29930 }, { "epoch": 8.49843883054215, "grad_norm": 6.761506080627441, "learning_rate": 5.75248367868294e-05, "loss": 2.350925064086914, "step": 29940 }, { "epoch": 8.501277320465512, "grad_norm": 6.394760608673096, "learning_rate": 5.75106443372126e-05, "loss": 2.3021915435791014, "step": 29950 }, { "epoch": 8.504115810388873, "grad_norm": 6.607608795166016, "learning_rate": 5.7496451887595803e-05, "loss": 2.3281972885131834, "step": 29960 }, { "epoch": 8.506954300312234, "grad_norm": 6.837239742279053, "learning_rate": 5.7482259437979004e-05, "loss": 2.334798812866211, "step": 29970 }, { "epoch": 8.509792790235595, "grad_norm": 6.392955780029297, "learning_rate": 5.746806698836219e-05, "loss": 2.3289745330810545, "step": 29980 }, { "epoch": 8.512631280158956, "grad_norm": 6.576843738555908, "learning_rate": 5.745387453874539e-05, "loss": 2.3430532455444335, "step": 29990 }, { "epoch": 8.515469770082316, "grad_norm": 6.492761135101318, "learning_rate": 5.743968208912859e-05, "loss": 2.275990104675293, "step": 30000 }, { "epoch": 8.515469770082316, "eval_accuracy": 0.30240986837922046, "eval_loss": 2.6445236206054688, "eval_runtime": 51.935, "eval_samples_per_second": 302.821, "eval_steps_per_second": 4.737, "step": 30000 }, { "epoch": 8.518308260005677, "grad_norm": 6.523593425750732, "learning_rate": 5.7425489639511784e-05, "loss": 2.3468542098999023, "step": 30010 }, { "epoch": 8.521146749929038, "grad_norm": 6.660505771636963, "learning_rate": 5.741129718989498e-05, "loss": 2.3535518646240234, "step": 30020 }, { "epoch": 8.523985239852399, "grad_norm": 6.455664157867432, "learning_rate": 5.739710474027817e-05, "loss": 2.314545249938965, "step": 30030 }, { "epoch": 8.52682372977576, "grad_norm": 7.028764724731445, 
"learning_rate": 5.738291229066137e-05, "loss": 2.3724720001220705, "step": 30040 }, { "epoch": 8.52966221969912, "grad_norm": 6.708054065704346, "learning_rate": 5.736871984104457e-05, "loss": 2.302078628540039, "step": 30050 }, { "epoch": 8.53250070962248, "grad_norm": 6.851576328277588, "learning_rate": 5.735452739142776e-05, "loss": 2.3798686981201174, "step": 30060 }, { "epoch": 8.535339199545842, "grad_norm": 6.9770612716674805, "learning_rate": 5.734033494181096e-05, "loss": 2.351972961425781, "step": 30070 }, { "epoch": 8.538177689469203, "grad_norm": 6.855116844177246, "learning_rate": 5.732614249219416e-05, "loss": 2.3487571716308593, "step": 30080 }, { "epoch": 8.541016179392564, "grad_norm": 6.872882843017578, "learning_rate": 5.731195004257736e-05, "loss": 2.3978914260864257, "step": 30090 }, { "epoch": 8.543854669315923, "grad_norm": 6.825599193572998, "learning_rate": 5.7297757592960544e-05, "loss": 2.418214225769043, "step": 30100 }, { "epoch": 8.546693159239284, "grad_norm": 7.007328987121582, "learning_rate": 5.7283565143343744e-05, "loss": 2.3749889373779296, "step": 30110 }, { "epoch": 8.549531649162645, "grad_norm": 6.912518501281738, "learning_rate": 5.7269372693726944e-05, "loss": 2.4081363677978516, "step": 30120 }, { "epoch": 8.552370139086007, "grad_norm": 6.40632438659668, "learning_rate": 5.725518024411014e-05, "loss": 2.3998743057250977, "step": 30130 }, { "epoch": 8.555208629009368, "grad_norm": 6.720208168029785, "learning_rate": 5.724098779449333e-05, "loss": 2.424447250366211, "step": 30140 }, { "epoch": 8.558047118932727, "grad_norm": 6.888978481292725, "learning_rate": 5.7226795344876524e-05, "loss": 2.312891387939453, "step": 30150 }, { "epoch": 8.560885608856088, "grad_norm": 6.99831485748291, "learning_rate": 5.7212602895259724e-05, "loss": 2.3691444396972656, "step": 30160 }, { "epoch": 8.56372409877945, "grad_norm": 6.507413864135742, "learning_rate": 5.7198410445642924e-05, "loss": 2.3272581100463867, "step": 30170 }, { 
"epoch": 8.56656258870281, "grad_norm": 6.748056411743164, "learning_rate": 5.718421799602611e-05, "loss": 2.3102283477783203, "step": 30180 }, { "epoch": 8.569401078626171, "grad_norm": 6.476143836975098, "learning_rate": 5.717002554640931e-05, "loss": 2.3407094955444334, "step": 30190 }, { "epoch": 8.57223956854953, "grad_norm": 6.756187438964844, "learning_rate": 5.715583309679251e-05, "loss": 2.4327823638916017, "step": 30200 }, { "epoch": 8.575078058472892, "grad_norm": 6.779639720916748, "learning_rate": 5.714164064717571e-05, "loss": 2.361658477783203, "step": 30210 }, { "epoch": 8.577916548396253, "grad_norm": 7.032485485076904, "learning_rate": 5.71274481975589e-05, "loss": 2.3950353622436524, "step": 30220 }, { "epoch": 8.580755038319614, "grad_norm": 6.582090377807617, "learning_rate": 5.71132557479421e-05, "loss": 2.458545684814453, "step": 30230 }, { "epoch": 8.583593528242975, "grad_norm": 6.4530839920043945, "learning_rate": 5.70990632983253e-05, "loss": 2.3391181945800783, "step": 30240 }, { "epoch": 8.586432018166336, "grad_norm": 6.516348838806152, "learning_rate": 5.708487084870849e-05, "loss": 2.3513965606689453, "step": 30250 }, { "epoch": 8.589270508089696, "grad_norm": 6.154189586639404, "learning_rate": 5.7070678399091684e-05, "loss": 2.2569040298461913, "step": 30260 }, { "epoch": 8.592108998013057, "grad_norm": 6.284759521484375, "learning_rate": 5.705648594947488e-05, "loss": 2.323533821105957, "step": 30270 }, { "epoch": 8.594947487936418, "grad_norm": 6.647985458374023, "learning_rate": 5.704229349985808e-05, "loss": 2.3643621444702148, "step": 30280 }, { "epoch": 8.597785977859779, "grad_norm": 6.680912017822266, "learning_rate": 5.702810105024128e-05, "loss": 2.3470426559448243, "step": 30290 }, { "epoch": 8.60062446778314, "grad_norm": 6.843850612640381, "learning_rate": 5.7013908600624464e-05, "loss": 2.3571109771728516, "step": 30300 }, { "epoch": 8.6034629577065, "grad_norm": 6.7519049644470215, "learning_rate": 
5.6999716151007665e-05, "loss": 2.364684295654297, "step": 30310 }, { "epoch": 8.60630144762986, "grad_norm": 6.891415119171143, "learning_rate": 5.6985523701390865e-05, "loss": 2.3995447158813477, "step": 30320 }, { "epoch": 8.609139937553222, "grad_norm": 6.7349066734313965, "learning_rate": 5.6971331251774065e-05, "loss": 2.3931911468505858, "step": 30330 }, { "epoch": 8.611978427476583, "grad_norm": 7.158847332000732, "learning_rate": 5.695713880215725e-05, "loss": 2.3627878189086915, "step": 30340 }, { "epoch": 8.614816917399944, "grad_norm": 6.449487209320068, "learning_rate": 5.694294635254045e-05, "loss": 2.3012626647949217, "step": 30350 }, { "epoch": 8.617655407323305, "grad_norm": 6.930278778076172, "learning_rate": 5.692875390292365e-05, "loss": 2.3790573120117187, "step": 30360 }, { "epoch": 8.620493897246664, "grad_norm": 6.597395420074463, "learning_rate": 5.6914561453306845e-05, "loss": 2.349531364440918, "step": 30370 }, { "epoch": 8.623332387170025, "grad_norm": 6.473667621612549, "learning_rate": 5.690036900369004e-05, "loss": 2.3172943115234377, "step": 30380 }, { "epoch": 8.626170877093386, "grad_norm": 6.731377124786377, "learning_rate": 5.688617655407323e-05, "loss": 2.274510955810547, "step": 30390 }, { "epoch": 8.629009367016748, "grad_norm": 6.841826915740967, "learning_rate": 5.687198410445643e-05, "loss": 2.414444160461426, "step": 30400 }, { "epoch": 8.631847856940109, "grad_norm": 6.768213748931885, "learning_rate": 5.685779165483963e-05, "loss": 2.3007043838500976, "step": 30410 }, { "epoch": 8.634686346863468, "grad_norm": 6.720106601715088, "learning_rate": 5.684359920522282e-05, "loss": 2.3765331268310548, "step": 30420 }, { "epoch": 8.63752483678683, "grad_norm": 6.853618144989014, "learning_rate": 5.682940675560602e-05, "loss": 2.327243423461914, "step": 30430 }, { "epoch": 8.64036332671019, "grad_norm": 6.891674518585205, "learning_rate": 5.681521430598922e-05, "loss": 2.3596736907958986, "step": 30440 }, { "epoch": 
8.643201816633551, "grad_norm": 6.917714595794678, "learning_rate": 5.680102185637242e-05, "loss": 2.3488256454467775, "step": 30450 }, { "epoch": 8.646040306556912, "grad_norm": 6.412086486816406, "learning_rate": 5.6786829406755605e-05, "loss": 2.372235107421875, "step": 30460 }, { "epoch": 8.648878796480272, "grad_norm": 6.6410298347473145, "learning_rate": 5.6772636957138805e-05, "loss": 2.3328615188598634, "step": 30470 }, { "epoch": 8.651717286403633, "grad_norm": 6.970602989196777, "learning_rate": 5.6758444507522e-05, "loss": 2.2509389877319337, "step": 30480 }, { "epoch": 8.654555776326994, "grad_norm": 7.139188766479492, "learning_rate": 5.67442520579052e-05, "loss": 2.350941467285156, "step": 30490 }, { "epoch": 8.657394266250355, "grad_norm": 7.04730224609375, "learning_rate": 5.6730059608288385e-05, "loss": 2.3490440368652346, "step": 30500 }, { "epoch": 8.657394266250355, "eval_accuracy": 0.3009474152730972, "eval_loss": 2.642097234725952, "eval_runtime": 56.3003, "eval_samples_per_second": 279.341, "eval_steps_per_second": 4.369, "step": 30500 }, { "epoch": 8.660232756173716, "grad_norm": 6.464805603027344, "learning_rate": 5.6715867158671585e-05, "loss": 2.3765399932861326, "step": 30510 }, { "epoch": 8.663071246097076, "grad_norm": 6.5033040046691895, "learning_rate": 5.6701674709054785e-05, "loss": 2.3050004959106447, "step": 30520 }, { "epoch": 8.665909736020437, "grad_norm": 6.998825550079346, "learning_rate": 5.6687482259437985e-05, "loss": 2.3661203384399414, "step": 30530 }, { "epoch": 8.668748225943798, "grad_norm": 6.693108558654785, "learning_rate": 5.667328980982117e-05, "loss": 2.3489664077758787, "step": 30540 }, { "epoch": 8.671586715867159, "grad_norm": 6.5957841873168945, "learning_rate": 5.665909736020437e-05, "loss": 2.324652099609375, "step": 30550 }, { "epoch": 8.67442520579052, "grad_norm": 6.527032375335693, "learning_rate": 5.664490491058757e-05, "loss": 2.4093692779541014, "step": 30560 }, { "epoch": 8.67726369571388, 
"grad_norm": 6.507122993469238, "learning_rate": 5.663071246097077e-05, "loss": 2.3240612030029295, "step": 30570 }, { "epoch": 8.68010218563724, "grad_norm": 6.741912364959717, "learning_rate": 5.661652001135396e-05, "loss": 2.3029003143310547, "step": 30580 }, { "epoch": 8.682940675560602, "grad_norm": 6.403688907623291, "learning_rate": 5.660232756173716e-05, "loss": 2.334452247619629, "step": 30590 }, { "epoch": 8.685779165483963, "grad_norm": 6.525060653686523, "learning_rate": 5.658813511212035e-05, "loss": 2.3488079071044923, "step": 30600 }, { "epoch": 8.688617655407324, "grad_norm": 6.394023418426514, "learning_rate": 5.657394266250355e-05, "loss": 2.35885066986084, "step": 30610 }, { "epoch": 8.691456145330685, "grad_norm": 6.978537082672119, "learning_rate": 5.655975021288674e-05, "loss": 2.3553844451904298, "step": 30620 }, { "epoch": 8.694294635254044, "grad_norm": 6.718393802642822, "learning_rate": 5.654555776326994e-05, "loss": 2.346018600463867, "step": 30630 }, { "epoch": 8.697133125177405, "grad_norm": 7.025012493133545, "learning_rate": 5.653136531365314e-05, "loss": 2.374502754211426, "step": 30640 }, { "epoch": 8.699971615100766, "grad_norm": 6.652482509613037, "learning_rate": 5.651717286403634e-05, "loss": 2.3121288299560545, "step": 30650 }, { "epoch": 8.702810105024128, "grad_norm": 6.858762264251709, "learning_rate": 5.6502980414419526e-05, "loss": 2.436146926879883, "step": 30660 }, { "epoch": 8.705648594947489, "grad_norm": 6.960207939147949, "learning_rate": 5.6488787964802726e-05, "loss": 2.279247283935547, "step": 30670 }, { "epoch": 8.708487084870848, "grad_norm": 6.251781940460205, "learning_rate": 5.6474595515185926e-05, "loss": 2.3313369750976562, "step": 30680 }, { "epoch": 8.711325574794209, "grad_norm": 6.7744460105896, "learning_rate": 5.6460403065569126e-05, "loss": 2.2950933456420897, "step": 30690 }, { "epoch": 8.71416406471757, "grad_norm": 6.732235908508301, "learning_rate": 5.644621061595231e-05, "loss": 
2.354676055908203, "step": 30700 }, { "epoch": 8.717002554640931, "grad_norm": 6.839755535125732, "learning_rate": 5.643201816633551e-05, "loss": 2.367806816101074, "step": 30710 }, { "epoch": 8.719841044564292, "grad_norm": 6.665410041809082, "learning_rate": 5.6417825716718706e-05, "loss": 2.3885000228881834, "step": 30720 }, { "epoch": 8.722679534487652, "grad_norm": 6.87644100189209, "learning_rate": 5.6403633267101906e-05, "loss": 2.3244325637817385, "step": 30730 }, { "epoch": 8.725518024411013, "grad_norm": 6.64861536026001, "learning_rate": 5.638944081748509e-05, "loss": 2.312308502197266, "step": 30740 }, { "epoch": 8.728356514334374, "grad_norm": 6.969360828399658, "learning_rate": 5.637524836786829e-05, "loss": 2.357899475097656, "step": 30750 }, { "epoch": 8.731195004257735, "grad_norm": 6.023425102233887, "learning_rate": 5.636105591825149e-05, "loss": 2.2911041259765623, "step": 30760 }, { "epoch": 8.734033494181096, "grad_norm": 6.947286605834961, "learning_rate": 5.634686346863469e-05, "loss": 2.2944847106933595, "step": 30770 }, { "epoch": 8.736871984104457, "grad_norm": 6.543194770812988, "learning_rate": 5.633267101901788e-05, "loss": 2.4162853240966795, "step": 30780 }, { "epoch": 8.739710474027817, "grad_norm": 7.0901408195495605, "learning_rate": 5.631847856940108e-05, "loss": 2.338445472717285, "step": 30790 }, { "epoch": 8.742548963951178, "grad_norm": 6.893723964691162, "learning_rate": 5.630428611978428e-05, "loss": 2.3693893432617186, "step": 30800 }, { "epoch": 8.745387453874539, "grad_norm": 6.653756141662598, "learning_rate": 5.629009367016748e-05, "loss": 2.3871181488037108, "step": 30810 }, { "epoch": 8.7482259437979, "grad_norm": 6.525247573852539, "learning_rate": 5.627590122055067e-05, "loss": 2.299001693725586, "step": 30820 }, { "epoch": 8.751064433721261, "grad_norm": 6.101434230804443, "learning_rate": 5.6261708770933866e-05, "loss": 2.3030349731445314, "step": 30830 }, { "epoch": 8.75390292364462, "grad_norm": 
6.544282913208008, "learning_rate": 5.624751632131706e-05, "loss": 2.399400520324707, "step": 30840 }, { "epoch": 8.756741413567982, "grad_norm": 6.748208999633789, "learning_rate": 5.623332387170026e-05, "loss": 2.3726673126220703, "step": 30850 }, { "epoch": 8.759579903491343, "grad_norm": 6.80366325378418, "learning_rate": 5.621913142208346e-05, "loss": 2.277427864074707, "step": 30860 }, { "epoch": 8.762418393414704, "grad_norm": 6.526517391204834, "learning_rate": 5.6204938972466646e-05, "loss": 2.323711967468262, "step": 30870 }, { "epoch": 8.765256883338065, "grad_norm": 6.815699577331543, "learning_rate": 5.6190746522849847e-05, "loss": 2.329864501953125, "step": 30880 }, { "epoch": 8.768095373261424, "grad_norm": 6.732678413391113, "learning_rate": 5.6176554073233047e-05, "loss": 2.3635879516601563, "step": 30890 }, { "epoch": 8.770933863184785, "grad_norm": 6.5850019454956055, "learning_rate": 5.616236162361625e-05, "loss": 2.2852481842041015, "step": 30900 }, { "epoch": 8.773772353108146, "grad_norm": 6.373252868652344, "learning_rate": 5.614816917399943e-05, "loss": 2.3619386672973635, "step": 30910 }, { "epoch": 8.776610843031508, "grad_norm": 6.434662818908691, "learning_rate": 5.613397672438263e-05, "loss": 2.322026824951172, "step": 30920 }, { "epoch": 8.779449332954869, "grad_norm": 7.184020519256592, "learning_rate": 5.611978427476583e-05, "loss": 2.3434829711914062, "step": 30930 }, { "epoch": 8.782287822878228, "grad_norm": 6.687378883361816, "learning_rate": 5.610559182514903e-05, "loss": 2.325210189819336, "step": 30940 }, { "epoch": 8.785126312801589, "grad_norm": 6.75003719329834, "learning_rate": 5.609139937553221e-05, "loss": 2.3436012268066406, "step": 30950 }, { "epoch": 8.78796480272495, "grad_norm": 6.899168491363525, "learning_rate": 5.6077206925915413e-05, "loss": 2.3414785385131838, "step": 30960 }, { "epoch": 8.790803292648311, "grad_norm": 6.6160454750061035, "learning_rate": 5.6063014476298613e-05, "loss": 2.3137847900390627, 
"step": 30970 }, { "epoch": 8.793641782571672, "grad_norm": 6.322574138641357, "learning_rate": 5.6048822026681814e-05, "loss": 2.312350845336914, "step": 30980 }, { "epoch": 8.796480272495032, "grad_norm": 6.7344770431518555, "learning_rate": 5.6034629577065e-05, "loss": 2.3828359603881837, "step": 30990 }, { "epoch": 8.799318762418393, "grad_norm": 6.507381439208984, "learning_rate": 5.60204371274482e-05, "loss": 2.317638397216797, "step": 31000 }, { "epoch": 8.799318762418393, "eval_accuracy": 0.30597062376804224, "eval_loss": 2.6361258029937744, "eval_runtime": 51.5687, "eval_samples_per_second": 304.972, "eval_steps_per_second": 4.77, "step": 31000 }, { "epoch": 8.802157252341754, "grad_norm": 6.7523112297058105, "learning_rate": 5.60062446778314e-05, "loss": 2.276523399353027, "step": 31010 }, { "epoch": 8.804995742265115, "grad_norm": 6.797302722930908, "learning_rate": 5.59920522282146e-05, "loss": 2.3712026596069338, "step": 31020 }, { "epoch": 8.807834232188476, "grad_norm": 6.47542667388916, "learning_rate": 5.597785977859779e-05, "loss": 2.2899118423461915, "step": 31030 }, { "epoch": 8.810672722111837, "grad_norm": 6.839663505554199, "learning_rate": 5.596366732898099e-05, "loss": 2.3528135299682615, "step": 31040 }, { "epoch": 8.813511212035197, "grad_norm": 6.538341522216797, "learning_rate": 5.594947487936418e-05, "loss": 2.3219841003417967, "step": 31050 }, { "epoch": 8.816349701958558, "grad_norm": 7.180476665496826, "learning_rate": 5.593528242974738e-05, "loss": 2.3802343368530274, "step": 31060 }, { "epoch": 8.819188191881919, "grad_norm": 6.7103657722473145, "learning_rate": 5.592108998013057e-05, "loss": 2.327041816711426, "step": 31070 }, { "epoch": 8.82202668180528, "grad_norm": 6.840463638305664, "learning_rate": 5.590689753051377e-05, "loss": 2.4303260803222657, "step": 31080 }, { "epoch": 8.824865171728641, "grad_norm": 6.60705041885376, "learning_rate": 5.589270508089697e-05, "loss": 2.3043338775634767, "step": 31090 }, { "epoch": 
8.827703661652, "grad_norm": 6.683025360107422, "learning_rate": 5.587851263128017e-05, "loss": 2.3419378280639647, "step": 31100 }, { "epoch": 8.830542151575361, "grad_norm": 6.902899742126465, "learning_rate": 5.5864320181663354e-05, "loss": 2.3886791229248048, "step": 31110 }, { "epoch": 8.833380641498723, "grad_norm": 7.074864864349365, "learning_rate": 5.5850127732046554e-05, "loss": 2.3420286178588867, "step": 31120 }, { "epoch": 8.836219131422084, "grad_norm": 6.720144748687744, "learning_rate": 5.5835935282429754e-05, "loss": 2.305368423461914, "step": 31130 }, { "epoch": 8.839057621345445, "grad_norm": 6.511201858520508, "learning_rate": 5.5821742832812954e-05, "loss": 2.3302759170532226, "step": 31140 }, { "epoch": 8.841896111268806, "grad_norm": 6.671599864959717, "learning_rate": 5.580755038319614e-05, "loss": 2.251811218261719, "step": 31150 }, { "epoch": 8.844734601192165, "grad_norm": 6.707543849945068, "learning_rate": 5.579335793357934e-05, "loss": 2.383859062194824, "step": 31160 }, { "epoch": 8.847573091115526, "grad_norm": 6.510891437530518, "learning_rate": 5.5779165483962534e-05, "loss": 2.375128173828125, "step": 31170 }, { "epoch": 8.850411581038887, "grad_norm": 6.562889099121094, "learning_rate": 5.5764973034345734e-05, "loss": 2.3993148803710938, "step": 31180 }, { "epoch": 8.853250070962249, "grad_norm": 6.654133319854736, "learning_rate": 5.575078058472892e-05, "loss": 2.2418401718139647, "step": 31190 }, { "epoch": 8.85608856088561, "grad_norm": 6.393650531768799, "learning_rate": 5.573658813511212e-05, "loss": 2.288331985473633, "step": 31200 }, { "epoch": 8.858927050808969, "grad_norm": 6.612664699554443, "learning_rate": 5.572239568549532e-05, "loss": 2.2993907928466797, "step": 31210 }, { "epoch": 8.86176554073233, "grad_norm": 6.603342533111572, "learning_rate": 5.570820323587852e-05, "loss": 2.2885061264038087, "step": 31220 }, { "epoch": 8.864604030655691, "grad_norm": 6.547292709350586, "learning_rate": 5.569401078626171e-05, 
"loss": 2.4030439376831056, "step": 31230 }, { "epoch": 8.867442520579052, "grad_norm": 6.431107521057129, "learning_rate": 5.567981833664491e-05, "loss": 2.346238136291504, "step": 31240 }, { "epoch": 8.870281010502413, "grad_norm": 6.371731281280518, "learning_rate": 5.566562588702811e-05, "loss": 2.349106025695801, "step": 31250 }, { "epoch": 8.873119500425773, "grad_norm": 6.326968193054199, "learning_rate": 5.56514334374113e-05, "loss": 2.2663105010986326, "step": 31260 }, { "epoch": 8.875957990349134, "grad_norm": 6.539234161376953, "learning_rate": 5.5637240987794494e-05, "loss": 2.3141481399536135, "step": 31270 }, { "epoch": 8.878796480272495, "grad_norm": 6.715444087982178, "learning_rate": 5.562304853817769e-05, "loss": 2.330814743041992, "step": 31280 }, { "epoch": 8.881634970195856, "grad_norm": 7.215051174163818, "learning_rate": 5.560885608856089e-05, "loss": 2.4080669403076174, "step": 31290 }, { "epoch": 8.884473460119217, "grad_norm": 6.426395416259766, "learning_rate": 5.559466363894409e-05, "loss": 2.3852256774902343, "step": 31300 }, { "epoch": 8.887311950042577, "grad_norm": 6.644552707672119, "learning_rate": 5.5580471189327275e-05, "loss": 2.3609914779663086, "step": 31310 }, { "epoch": 8.890150439965938, "grad_norm": 6.7222514152526855, "learning_rate": 5.5566278739710475e-05, "loss": 2.2729866027832033, "step": 31320 }, { "epoch": 8.892988929889299, "grad_norm": 6.630100250244141, "learning_rate": 5.5552086290093675e-05, "loss": 2.3408037185668946, "step": 31330 }, { "epoch": 8.89582741981266, "grad_norm": 6.372684001922607, "learning_rate": 5.5537893840476875e-05, "loss": 2.382050132751465, "step": 31340 }, { "epoch": 8.898665909736021, "grad_norm": 6.388316631317139, "learning_rate": 5.552370139086006e-05, "loss": 2.3613962173461913, "step": 31350 }, { "epoch": 8.90150439965938, "grad_norm": 6.781100749969482, "learning_rate": 5.550950894124326e-05, "loss": 2.3199777603149414, "step": 31360 }, { "epoch": 8.904342889582741, "grad_norm": 
6.309999942779541, "learning_rate": 5.549531649162646e-05, "loss": 2.309238243103027, "step": 31370 }, { "epoch": 8.907181379506103, "grad_norm": 6.580389976501465, "learning_rate": 5.5481124042009655e-05, "loss": 2.272125816345215, "step": 31380 }, { "epoch": 8.910019869429464, "grad_norm": 6.509474754333496, "learning_rate": 5.546693159239285e-05, "loss": 2.345244598388672, "step": 31390 }, { "epoch": 8.912858359352825, "grad_norm": 6.504703998565674, "learning_rate": 5.545273914277604e-05, "loss": 2.3108552932739257, "step": 31400 }, { "epoch": 8.915696849276186, "grad_norm": 6.772998809814453, "learning_rate": 5.543854669315924e-05, "loss": 2.352544975280762, "step": 31410 }, { "epoch": 8.918535339199545, "grad_norm": 6.405367374420166, "learning_rate": 5.542435424354244e-05, "loss": 2.4099918365478517, "step": 31420 }, { "epoch": 8.921373829122906, "grad_norm": 6.797799110412598, "learning_rate": 5.541016179392563e-05, "loss": 2.291650581359863, "step": 31430 }, { "epoch": 8.924212319046267, "grad_norm": 6.4284281730651855, "learning_rate": 5.539596934430883e-05, "loss": 2.2286666870117187, "step": 31440 }, { "epoch": 8.927050808969629, "grad_norm": 6.925333499908447, "learning_rate": 5.538177689469203e-05, "loss": 2.28583984375, "step": 31450 }, { "epoch": 8.92988929889299, "grad_norm": 6.742944717407227, "learning_rate": 5.536758444507523e-05, "loss": 2.344883918762207, "step": 31460 }, { "epoch": 8.932727788816349, "grad_norm": 6.658897876739502, "learning_rate": 5.5353391995458415e-05, "loss": 2.345302391052246, "step": 31470 }, { "epoch": 8.93556627873971, "grad_norm": 6.585961818695068, "learning_rate": 5.5339199545841615e-05, "loss": 2.364165496826172, "step": 31480 }, { "epoch": 8.938404768663071, "grad_norm": 7.301980972290039, "learning_rate": 5.5325007096224815e-05, "loss": 2.2948867797851564, "step": 31490 }, { "epoch": 8.941243258586432, "grad_norm": 6.7982683181762695, "learning_rate": 5.531081464660801e-05, "loss": 2.364105987548828, "step": 
31500 }, { "epoch": 8.941243258586432, "eval_accuracy": 0.3022826985439054, "eval_loss": 2.6303014755249023, "eval_runtime": 50.9068, "eval_samples_per_second": 308.937, "eval_steps_per_second": 4.832, "step": 31500 }, { "epoch": 8.944081748509793, "grad_norm": 6.818788528442383, "learning_rate": 5.52966221969912e-05, "loss": 2.315338897705078, "step": 31510 }, { "epoch": 8.946920238433153, "grad_norm": 6.727417469024658, "learning_rate": 5.5282429747374395e-05, "loss": 2.407105827331543, "step": 31520 }, { "epoch": 8.949758728356514, "grad_norm": 6.76615571975708, "learning_rate": 5.5268237297757595e-05, "loss": 2.326645278930664, "step": 31530 }, { "epoch": 8.952597218279875, "grad_norm": 6.80183744430542, "learning_rate": 5.5254044848140795e-05, "loss": 2.278002166748047, "step": 31540 }, { "epoch": 8.955435708203236, "grad_norm": 6.476210594177246, "learning_rate": 5.523985239852398e-05, "loss": 2.2816186904907227, "step": 31550 }, { "epoch": 8.958274198126597, "grad_norm": 6.907827854156494, "learning_rate": 5.522565994890718e-05, "loss": 2.3826200485229494, "step": 31560 }, { "epoch": 8.961112688049958, "grad_norm": 6.595425128936768, "learning_rate": 5.521146749929038e-05, "loss": 2.393757629394531, "step": 31570 }, { "epoch": 8.963951177973318, "grad_norm": 6.474245548248291, "learning_rate": 5.519727504967358e-05, "loss": 2.3231342315673826, "step": 31580 }, { "epoch": 8.966789667896679, "grad_norm": 6.724374294281006, "learning_rate": 5.518308260005677e-05, "loss": 2.330351638793945, "step": 31590 }, { "epoch": 8.96962815782004, "grad_norm": 6.6324543952941895, "learning_rate": 5.516889015043997e-05, "loss": 2.3047557830810548, "step": 31600 }, { "epoch": 8.972466647743401, "grad_norm": 6.827235221862793, "learning_rate": 5.515469770082317e-05, "loss": 2.2842792510986327, "step": 31610 }, { "epoch": 8.975305137666762, "grad_norm": 6.358659267425537, "learning_rate": 5.514050525120636e-05, "loss": 2.3431976318359373, "step": 31620 }, { "epoch": 
8.978143627590121, "grad_norm": 6.642461776733398, "learning_rate": 5.5126312801589556e-05, "loss": 2.306795120239258, "step": 31630 }, { "epoch": 8.980982117513483, "grad_norm": 6.614814758300781, "learning_rate": 5.511212035197275e-05, "loss": 2.330255126953125, "step": 31640 }, { "epoch": 8.983820607436844, "grad_norm": 6.620420932769775, "learning_rate": 5.509792790235595e-05, "loss": 2.2819889068603514, "step": 31650 }, { "epoch": 8.986659097360205, "grad_norm": 6.43955135345459, "learning_rate": 5.508373545273915e-05, "loss": 2.2925251007080076, "step": 31660 }, { "epoch": 8.989497587283566, "grad_norm": 6.721035480499268, "learning_rate": 5.5069543003122336e-05, "loss": 2.2899913787841797, "step": 31670 }, { "epoch": 8.992336077206925, "grad_norm": 6.798611640930176, "learning_rate": 5.5055350553505536e-05, "loss": 2.2937883377075194, "step": 31680 }, { "epoch": 8.995174567130286, "grad_norm": 6.556188106536865, "learning_rate": 5.5041158103888736e-05, "loss": 2.2785072326660156, "step": 31690 }, { "epoch": 8.998013057053647, "grad_norm": 6.7941765785217285, "learning_rate": 5.5026965654271936e-05, "loss": 2.3686407089233397, "step": 31700 }, { "epoch": 9.000851546977009, "grad_norm": 6.77374267578125, "learning_rate": 5.501277320465512e-05, "loss": 2.3417522430419924, "step": 31710 }, { "epoch": 9.00369003690037, "grad_norm": 7.023648262023926, "learning_rate": 5.499858075503832e-05, "loss": 2.309418296813965, "step": 31720 }, { "epoch": 9.006528526823729, "grad_norm": 6.666685581207275, "learning_rate": 5.4984388305421516e-05, "loss": 2.3117755889892577, "step": 31730 }, { "epoch": 9.00936701674709, "grad_norm": 6.5145111083984375, "learning_rate": 5.4970195855804716e-05, "loss": 2.3567483901977537, "step": 31740 }, { "epoch": 9.012205506670451, "grad_norm": 6.62932825088501, "learning_rate": 5.49560034061879e-05, "loss": 2.2788555145263674, "step": 31750 }, { "epoch": 9.015043996593812, "grad_norm": 7.104057788848877, "learning_rate": 
5.49418109565711e-05, "loss": 2.362657928466797, "step": 31760 }, { "epoch": 9.017882486517173, "grad_norm": 6.546594619750977, "learning_rate": 5.49276185069543e-05, "loss": 2.364878463745117, "step": 31770 }, { "epoch": 9.020720976440534, "grad_norm": 6.981142520904541, "learning_rate": 5.49134260573375e-05, "loss": 2.2620168685913087, "step": 31780 }, { "epoch": 9.023559466363894, "grad_norm": 6.966663837432861, "learning_rate": 5.489923360772069e-05, "loss": 2.2282926559448244, "step": 31790 }, { "epoch": 9.026397956287255, "grad_norm": 7.012066841125488, "learning_rate": 5.488504115810389e-05, "loss": 2.338044548034668, "step": 31800 }, { "epoch": 9.029236446210616, "grad_norm": 6.422311782836914, "learning_rate": 5.487084870848709e-05, "loss": 2.334280776977539, "step": 31810 }, { "epoch": 9.032074936133977, "grad_norm": 6.666186809539795, "learning_rate": 5.485665625887029e-05, "loss": 2.291547393798828, "step": 31820 }, { "epoch": 9.034913426057338, "grad_norm": 6.749265193939209, "learning_rate": 5.4842463809253476e-05, "loss": 2.3643867492675783, "step": 31830 }, { "epoch": 9.037751915980698, "grad_norm": 6.381532669067383, "learning_rate": 5.4828271359636676e-05, "loss": 2.2378950119018555, "step": 31840 }, { "epoch": 9.040590405904059, "grad_norm": 6.787676811218262, "learning_rate": 5.481407891001987e-05, "loss": 2.3373409271240235, "step": 31850 }, { "epoch": 9.04342889582742, "grad_norm": 6.614657402038574, "learning_rate": 5.479988646040307e-05, "loss": 2.2292484283447265, "step": 31860 }, { "epoch": 9.046267385750781, "grad_norm": 6.772342681884766, "learning_rate": 5.4785694010786256e-05, "loss": 2.2283864974975587, "step": 31870 }, { "epoch": 9.049105875674142, "grad_norm": 6.722009181976318, "learning_rate": 5.4771501561169456e-05, "loss": 2.256494140625, "step": 31880 }, { "epoch": 9.051944365597501, "grad_norm": 6.649664878845215, "learning_rate": 5.4757309111552657e-05, "loss": 2.3221275329589846, "step": 31890 }, { "epoch": 
9.054782855520862, "grad_norm": 6.627770900726318, "learning_rate": 5.474311666193586e-05, "loss": 2.329659271240234, "step": 31900 }, { "epoch": 9.057621345444224, "grad_norm": 6.3063764572143555, "learning_rate": 5.472892421231904e-05, "loss": 2.3433206558227537, "step": 31910 }, { "epoch": 9.060459835367585, "grad_norm": 6.330167770385742, "learning_rate": 5.471473176270224e-05, "loss": 2.23612117767334, "step": 31920 }, { "epoch": 9.063298325290946, "grad_norm": 6.612358570098877, "learning_rate": 5.470053931308544e-05, "loss": 2.265130615234375, "step": 31930 }, { "epoch": 9.066136815214305, "grad_norm": 6.712152481079102, "learning_rate": 5.4686346863468643e-05, "loss": 2.398112106323242, "step": 31940 }, { "epoch": 9.068975305137666, "grad_norm": 6.295958518981934, "learning_rate": 5.467215441385183e-05, "loss": 2.268534469604492, "step": 31950 }, { "epoch": 9.071813795061027, "grad_norm": 6.854920387268066, "learning_rate": 5.465796196423503e-05, "loss": 2.27524471282959, "step": 31960 }, { "epoch": 9.074652284984388, "grad_norm": 6.537172794342041, "learning_rate": 5.4643769514618223e-05, "loss": 2.276318168640137, "step": 31970 }, { "epoch": 9.07749077490775, "grad_norm": 6.922662734985352, "learning_rate": 5.4629577065001424e-05, "loss": 2.3172119140625, "step": 31980 }, { "epoch": 9.08032926483111, "grad_norm": 6.535214900970459, "learning_rate": 5.461538461538461e-05, "loss": 2.340689849853516, "step": 31990 }, { "epoch": 9.08316775475447, "grad_norm": 6.550475120544434, "learning_rate": 5.460119216576781e-05, "loss": 2.302538299560547, "step": 32000 }, { "epoch": 9.08316775475447, "eval_accuracy": 0.3055255293444395, "eval_loss": 2.62182879447937, "eval_runtime": 48.3821, "eval_samples_per_second": 325.058, "eval_steps_per_second": 5.085, "step": 32000 }, { "epoch": 9.086006244677831, "grad_norm": 6.774585247039795, "learning_rate": 5.458699971615101e-05, "loss": 2.3122232437133787, "step": 32010 }, { "epoch": 9.088844734601192, "grad_norm": 
6.645608425140381, "learning_rate": 5.457280726653421e-05, "loss": 2.3218029022216795, "step": 32020 }, { "epoch": 9.091683224524553, "grad_norm": 6.42581844329834, "learning_rate": 5.45586148169174e-05, "loss": 2.2186317443847656, "step": 32030 }, { "epoch": 9.094521714447914, "grad_norm": 6.696235656738281, "learning_rate": 5.45444223673006e-05, "loss": 2.294163703918457, "step": 32040 }, { "epoch": 9.097360204371274, "grad_norm": 6.695120334625244, "learning_rate": 5.45302299176838e-05, "loss": 2.33133544921875, "step": 32050 }, { "epoch": 9.100198694294635, "grad_norm": 6.355082988739014, "learning_rate": 5.4516037468067e-05, "loss": 2.2989471435546873, "step": 32060 }, { "epoch": 9.103037184217996, "grad_norm": 6.533618450164795, "learning_rate": 5.4501845018450184e-05, "loss": 2.3655176162719727, "step": 32070 }, { "epoch": 9.105875674141357, "grad_norm": 6.68558931350708, "learning_rate": 5.4487652568833384e-05, "loss": 2.345503807067871, "step": 32080 }, { "epoch": 9.108714164064718, "grad_norm": 6.311290264129639, "learning_rate": 5.447346011921658e-05, "loss": 2.2254833221435546, "step": 32090 }, { "epoch": 9.111552653988078, "grad_norm": 5.852826118469238, "learning_rate": 5.445926766959978e-05, "loss": 2.2545663833618166, "step": 32100 }, { "epoch": 9.114391143911439, "grad_norm": 6.437420845031738, "learning_rate": 5.4445075219982964e-05, "loss": 2.2864986419677735, "step": 32110 }, { "epoch": 9.1172296338348, "grad_norm": 6.690932273864746, "learning_rate": 5.4430882770366164e-05, "loss": 2.309493637084961, "step": 32120 }, { "epoch": 9.120068123758161, "grad_norm": 6.739814281463623, "learning_rate": 5.4416690320749364e-05, "loss": 2.284662628173828, "step": 32130 }, { "epoch": 9.122906613681522, "grad_norm": 6.612237930297852, "learning_rate": 5.4402497871132564e-05, "loss": 2.371356201171875, "step": 32140 }, { "epoch": 9.125745103604881, "grad_norm": 6.742522716522217, "learning_rate": 5.4388305421515764e-05, "loss": 2.3289281845092775, "step": 
32150 }, { "epoch": 9.128583593528242, "grad_norm": 6.590194225311279, "learning_rate": 5.437411297189895e-05, "loss": 2.3243314743041994, "step": 32160 }, { "epoch": 9.131422083451604, "grad_norm": 6.647777080535889, "learning_rate": 5.435992052228215e-05, "loss": 2.2687950134277344, "step": 32170 }, { "epoch": 9.134260573374965, "grad_norm": 6.719007968902588, "learning_rate": 5.4345728072665344e-05, "loss": 2.2888866424560548, "step": 32180 }, { "epoch": 9.137099063298326, "grad_norm": 6.660299301147461, "learning_rate": 5.4332954868010224e-05, "loss": 2.356073570251465, "step": 32190 }, { "epoch": 9.139937553221687, "grad_norm": 6.402402400970459, "learning_rate": 5.4318762418393424e-05, "loss": 2.394923973083496, "step": 32200 }, { "epoch": 9.142776043145046, "grad_norm": 6.726291179656982, "learning_rate": 5.430456996877661e-05, "loss": 2.3530956268310548, "step": 32210 }, { "epoch": 9.145614533068407, "grad_norm": 6.6922807693481445, "learning_rate": 5.429037751915981e-05, "loss": 2.2992179870605467, "step": 32220 }, { "epoch": 9.148453022991768, "grad_norm": 6.721292972564697, "learning_rate": 5.4276185069543004e-05, "loss": 2.348832893371582, "step": 32230 }, { "epoch": 9.15129151291513, "grad_norm": 6.521751403808594, "learning_rate": 5.4261992619926204e-05, "loss": 2.3227184295654295, "step": 32240 }, { "epoch": 9.15413000283849, "grad_norm": 6.67730712890625, "learning_rate": 5.424780017030939e-05, "loss": 2.256382179260254, "step": 32250 }, { "epoch": 9.15696849276185, "grad_norm": 6.455477237701416, "learning_rate": 5.423360772069259e-05, "loss": 2.25159912109375, "step": 32260 }, { "epoch": 9.159806982685211, "grad_norm": 7.129822731018066, "learning_rate": 5.421941527107579e-05, "loss": 2.266666603088379, "step": 32270 }, { "epoch": 9.162645472608572, "grad_norm": 6.591302871704102, "learning_rate": 5.420522282145899e-05, "loss": 2.200771522521973, "step": 32280 }, { "epoch": 9.165483962531933, "grad_norm": 6.878000736236572, "learning_rate": 
5.419103037184218e-05, "loss": 2.30284423828125, "step": 32290 }, { "epoch": 9.168322452455294, "grad_norm": 6.635138988494873, "learning_rate": 5.417683792222538e-05, "loss": 2.3056447982788084, "step": 32300 }, { "epoch": 9.171160942378654, "grad_norm": 7.042394161224365, "learning_rate": 5.416264547260858e-05, "loss": 2.297426223754883, "step": 32310 }, { "epoch": 9.173999432302015, "grad_norm": 6.837728023529053, "learning_rate": 5.414845302299178e-05, "loss": 2.288163185119629, "step": 32320 }, { "epoch": 9.176837922225376, "grad_norm": 6.6272382736206055, "learning_rate": 5.4134260573374964e-05, "loss": 2.3046913146972656, "step": 32330 }, { "epoch": 9.179676412148737, "grad_norm": 6.580864906311035, "learning_rate": 5.4120068123758164e-05, "loss": 2.3031240463256837, "step": 32340 }, { "epoch": 9.182514902072098, "grad_norm": 6.347316265106201, "learning_rate": 5.410587567414136e-05, "loss": 2.2995317459106444, "step": 32350 }, { "epoch": 9.18535339199546, "grad_norm": 6.650463581085205, "learning_rate": 5.409168322452456e-05, "loss": 2.2802803039550783, "step": 32360 }, { "epoch": 9.188191881918819, "grad_norm": 6.994548797607422, "learning_rate": 5.4077490774907744e-05, "loss": 2.3677013397216795, "step": 32370 }, { "epoch": 9.19103037184218, "grad_norm": 6.766984939575195, "learning_rate": 5.4063298325290944e-05, "loss": 2.204318046569824, "step": 32380 }, { "epoch": 9.19386886176554, "grad_norm": 6.825021743774414, "learning_rate": 5.4049105875674144e-05, "loss": 2.249759483337402, "step": 32390 }, { "epoch": 9.196707351688902, "grad_norm": 6.576784610748291, "learning_rate": 5.4034913426057344e-05, "loss": 2.379463768005371, "step": 32400 }, { "epoch": 9.199545841612263, "grad_norm": 7.026026248931885, "learning_rate": 5.402072097644053e-05, "loss": 2.3790878295898437, "step": 32410 }, { "epoch": 9.202384331535622, "grad_norm": 6.7634100914001465, "learning_rate": 5.400652852682373e-05, "loss": 2.3411617279052734, "step": 32420 }, { "epoch": 
9.205222821458984, "grad_norm": 6.59788179397583, "learning_rate": 5.399233607720693e-05, "loss": 2.2366567611694337, "step": 32430 }, { "epoch": 9.208061311382345, "grad_norm": 6.743095874786377, "learning_rate": 5.3978143627590124e-05, "loss": 2.3120718002319336, "step": 32440 }, { "epoch": 9.210899801305706, "grad_norm": 6.728921890258789, "learning_rate": 5.396395117797332e-05, "loss": 2.286648750305176, "step": 32450 }, { "epoch": 9.213738291229067, "grad_norm": 6.423162937164307, "learning_rate": 5.394975872835651e-05, "loss": 2.2751440048217773, "step": 32460 }, { "epoch": 9.216576781152426, "grad_norm": 6.363700866699219, "learning_rate": 5.393556627873971e-05, "loss": 2.308590126037598, "step": 32470 }, { "epoch": 9.219415271075787, "grad_norm": 6.806884765625, "learning_rate": 5.392137382912291e-05, "loss": 2.3383554458618163, "step": 32480 }, { "epoch": 9.222253760999148, "grad_norm": 6.7835493087768555, "learning_rate": 5.39071813795061e-05, "loss": 2.278965377807617, "step": 32490 }, { "epoch": 9.22509225092251, "grad_norm": 6.497165679931641, "learning_rate": 5.38929889298893e-05, "loss": 2.297120475769043, "step": 32500 }, { "epoch": 9.22509225092251, "eval_accuracy": 0.30679722769759016, "eval_loss": 2.618140697479248, "eval_runtime": 53.2839, "eval_samples_per_second": 295.155, "eval_steps_per_second": 4.617, "step": 32500 }, { "epoch": 9.22793074084587, "grad_norm": 6.556301593780518, "learning_rate": 5.38787964802725e-05, "loss": 2.34541015625, "step": 32510 }, { "epoch": 9.23076923076923, "grad_norm": 6.5062456130981445, "learning_rate": 5.38646040306557e-05, "loss": 2.285518455505371, "step": 32520 }, { "epoch": 9.233607720692591, "grad_norm": 6.225841999053955, "learning_rate": 5.38504115810389e-05, "loss": 2.3032760620117188, "step": 32530 }, { "epoch": 9.236446210615952, "grad_norm": 6.81283712387085, "learning_rate": 5.3836219131422085e-05, "loss": 2.277065086364746, "step": 32540 }, { "epoch": 9.239284700539313, "grad_norm": 
6.532005310058594, "learning_rate": 5.3822026681805285e-05, "loss": 2.3383493423461914, "step": 32550 }, { "epoch": 9.242123190462674, "grad_norm": 6.827577590942383, "learning_rate": 5.380783423218848e-05, "loss": 2.308034133911133, "step": 32560 }, { "epoch": 9.244961680386035, "grad_norm": 6.735457420349121, "learning_rate": 5.379364178257168e-05, "loss": 2.3510873794555662, "step": 32570 }, { "epoch": 9.247800170309395, "grad_norm": 6.584004878997803, "learning_rate": 5.3779449332954865e-05, "loss": 2.239666748046875, "step": 32580 }, { "epoch": 9.250638660232756, "grad_norm": 6.674546718597412, "learning_rate": 5.3765256883338065e-05, "loss": 2.3344711303710937, "step": 32590 }, { "epoch": 9.253477150156117, "grad_norm": 6.487312316894531, "learning_rate": 5.3751064433721265e-05, "loss": 2.2636608123779296, "step": 32600 }, { "epoch": 9.256315640079478, "grad_norm": 6.863683223724365, "learning_rate": 5.3736871984104465e-05, "loss": 2.359160614013672, "step": 32610 }, { "epoch": 9.25915413000284, "grad_norm": 6.612986087799072, "learning_rate": 5.372267953448765e-05, "loss": 2.327238845825195, "step": 32620 }, { "epoch": 9.261992619926199, "grad_norm": 6.776516914367676, "learning_rate": 5.370848708487085e-05, "loss": 2.3494436264038088, "step": 32630 }, { "epoch": 9.26483110984956, "grad_norm": 6.914587020874023, "learning_rate": 5.369429463525405e-05, "loss": 2.329121780395508, "step": 32640 }, { "epoch": 9.26766959977292, "grad_norm": 6.432284355163574, "learning_rate": 5.368010218563725e-05, "loss": 2.308768081665039, "step": 32650 }, { "epoch": 9.270508089696282, "grad_norm": 6.488089084625244, "learning_rate": 5.366590973602044e-05, "loss": 2.3288957595825197, "step": 32660 }, { "epoch": 9.273346579619643, "grad_norm": 6.62701940536499, "learning_rate": 5.365171728640364e-05, "loss": 2.320966339111328, "step": 32670 }, { "epoch": 9.276185069543002, "grad_norm": 6.74114990234375, "learning_rate": 5.363752483678683e-05, "loss": 2.354597473144531, "step": 
32680 }, { "epoch": 9.279023559466363, "grad_norm": 6.612016677856445, "learning_rate": 5.362333238717003e-05, "loss": 2.2967592239379884, "step": 32690 }, { "epoch": 9.281862049389725, "grad_norm": 6.027867317199707, "learning_rate": 5.360913993755322e-05, "loss": 2.3274362564086912, "step": 32700 }, { "epoch": 9.284700539313086, "grad_norm": 6.566064834594727, "learning_rate": 5.359494748793642e-05, "loss": 2.388564682006836, "step": 32710 }, { "epoch": 9.287539029236447, "grad_norm": 6.58699893951416, "learning_rate": 5.358075503831962e-05, "loss": 2.289028549194336, "step": 32720 }, { "epoch": 9.290377519159806, "grad_norm": 6.539759159088135, "learning_rate": 5.356656258870282e-05, "loss": 2.320009231567383, "step": 32730 }, { "epoch": 9.293216009083167, "grad_norm": 6.706819534301758, "learning_rate": 5.3552370139086005e-05, "loss": 2.3244501113891602, "step": 32740 }, { "epoch": 9.296054499006528, "grad_norm": 6.529994487762451, "learning_rate": 5.3538177689469205e-05, "loss": 2.315158462524414, "step": 32750 }, { "epoch": 9.29889298892989, "grad_norm": 6.348599910736084, "learning_rate": 5.3523985239852406e-05, "loss": 2.2545909881591797, "step": 32760 }, { "epoch": 9.30173147885325, "grad_norm": 6.316298484802246, "learning_rate": 5.35097927902356e-05, "loss": 2.235585021972656, "step": 32770 }, { "epoch": 9.304569968776612, "grad_norm": 6.414445400238037, "learning_rate": 5.349560034061879e-05, "loss": 2.2735965728759764, "step": 32780 }, { "epoch": 9.307408458699971, "grad_norm": 7.155065536499023, "learning_rate": 5.348140789100199e-05, "loss": 2.2799428939819335, "step": 32790 }, { "epoch": 9.310246948623332, "grad_norm": 6.576754093170166, "learning_rate": 5.3467215441385186e-05, "loss": 2.324036407470703, "step": 32800 }, { "epoch": 9.313085438546693, "grad_norm": 6.18630838394165, "learning_rate": 5.3453022991768386e-05, "loss": 2.354183006286621, "step": 32810 }, { "epoch": 9.315923928470054, "grad_norm": 6.794250011444092, "learning_rate": 
5.343883054215157e-05, "loss": 2.311314582824707, "step": 32820 }, { "epoch": 9.318762418393415, "grad_norm": 6.469041347503662, "learning_rate": 5.342463809253477e-05, "loss": 2.3042808532714845, "step": 32830 }, { "epoch": 9.321600908316775, "grad_norm": 6.833839416503906, "learning_rate": 5.341044564291797e-05, "loss": 2.331252670288086, "step": 32840 }, { "epoch": 9.324439398240136, "grad_norm": 6.605815410614014, "learning_rate": 5.339625319330117e-05, "loss": 2.2726213455200197, "step": 32850 }, { "epoch": 9.327277888163497, "grad_norm": 6.5434770584106445, "learning_rate": 5.338206074368436e-05, "loss": 2.3052343368530273, "step": 32860 }, { "epoch": 9.330116378086858, "grad_norm": 6.7870330810546875, "learning_rate": 5.336786829406756e-05, "loss": 2.317494773864746, "step": 32870 }, { "epoch": 9.33295486801022, "grad_norm": 6.860762596130371, "learning_rate": 5.335367584445076e-05, "loss": 2.3044824600219727, "step": 32880 }, { "epoch": 9.335793357933579, "grad_norm": 6.836589813232422, "learning_rate": 5.333948339483395e-05, "loss": 2.341184616088867, "step": 32890 }, { "epoch": 9.33863184785694, "grad_norm": 6.558943271636963, "learning_rate": 5.3325290945217146e-05, "loss": 2.38778076171875, "step": 32900 }, { "epoch": 9.3414703377803, "grad_norm": 6.882741451263428, "learning_rate": 5.331109849560034e-05, "loss": 2.315058135986328, "step": 32910 }, { "epoch": 9.344308827703662, "grad_norm": 6.194183826446533, "learning_rate": 5.329690604598354e-05, "loss": 2.2829376220703126, "step": 32920 }, { "epoch": 9.347147317627023, "grad_norm": 6.747465133666992, "learning_rate": 5.328271359636674e-05, "loss": 2.258367729187012, "step": 32930 }, { "epoch": 9.349985807550382, "grad_norm": 6.340539932250977, "learning_rate": 5.3268521146749926e-05, "loss": 2.341826629638672, "step": 32940 }, { "epoch": 9.352824297473743, "grad_norm": 6.366337776184082, "learning_rate": 5.3254328697133126e-05, "loss": 2.2427745819091798, "step": 32950 }, { "epoch": 
9.355662787397105, "grad_norm": 6.420746803283691, "learning_rate": 5.3240136247516326e-05, "loss": 2.344890594482422, "step": 32960 }, { "epoch": 9.358501277320466, "grad_norm": 6.855352878570557, "learning_rate": 5.3225943797899526e-05, "loss": 2.3024160385131838, "step": 32970 }, { "epoch": 9.361339767243827, "grad_norm": 7.179250240325928, "learning_rate": 5.321175134828271e-05, "loss": 2.2419149398803713, "step": 32980 }, { "epoch": 9.364178257167188, "grad_norm": 6.373600006103516, "learning_rate": 5.319755889866591e-05, "loss": 2.3203561782836912, "step": 32990 }, { "epoch": 9.367016747090547, "grad_norm": 6.570921421051025, "learning_rate": 5.318336644904911e-05, "loss": 2.2477256774902346, "step": 33000 }, { "epoch": 9.367016747090547, "eval_accuracy": 0.3046989254148916, "eval_loss": 2.6143832206726074, "eval_runtime": 50.7196, "eval_samples_per_second": 310.078, "eval_steps_per_second": 4.85, "step": 33000 }, { "epoch": 9.369855237013908, "grad_norm": 6.194941520690918, "learning_rate": 5.3169173999432306e-05, "loss": 2.2453380584716798, "step": 33010 }, { "epoch": 9.37269372693727, "grad_norm": 6.597099304199219, "learning_rate": 5.31549815498155e-05, "loss": 2.380506896972656, "step": 33020 }, { "epoch": 9.37553221686063, "grad_norm": 6.661083698272705, "learning_rate": 5.314078910019869e-05, "loss": 2.2547473907470703, "step": 33030 }, { "epoch": 9.378370706783992, "grad_norm": 6.623694896697998, "learning_rate": 5.312659665058189e-05, "loss": 2.2345762252807617, "step": 33040 }, { "epoch": 9.381209196707351, "grad_norm": 7.367551803588867, "learning_rate": 5.311240420096509e-05, "loss": 2.340035247802734, "step": 33050 }, { "epoch": 9.384047686630712, "grad_norm": 6.873191833496094, "learning_rate": 5.309821175134828e-05, "loss": 2.300846290588379, "step": 33060 }, { "epoch": 9.386886176554073, "grad_norm": 6.491840362548828, "learning_rate": 5.308401930173148e-05, "loss": 2.3061370849609375, "step": 33070 }, { "epoch": 9.389724666477434, 
"grad_norm": 6.6239471435546875, "learning_rate": 5.306982685211468e-05, "loss": 2.3192428588867187, "step": 33080 }, { "epoch": 9.392563156400795, "grad_norm": 6.480617523193359, "learning_rate": 5.305563440249788e-05, "loss": 2.3875789642333984, "step": 33090 }, { "epoch": 9.395401646324155, "grad_norm": 6.104969024658203, "learning_rate": 5.3041441952881067e-05, "loss": 2.2913761138916016, "step": 33100 }, { "epoch": 9.398240136247516, "grad_norm": 6.293223857879639, "learning_rate": 5.302724950326427e-05, "loss": 2.252407455444336, "step": 33110 }, { "epoch": 9.401078626170877, "grad_norm": 6.501431465148926, "learning_rate": 5.301305705364747e-05, "loss": 2.312226676940918, "step": 33120 }, { "epoch": 9.403917116094238, "grad_norm": 6.791201114654541, "learning_rate": 5.299886460403066e-05, "loss": 2.2725332260131834, "step": 33130 }, { "epoch": 9.4067556060176, "grad_norm": 6.582462310791016, "learning_rate": 5.298467215441385e-05, "loss": 2.3633853912353517, "step": 33140 }, { "epoch": 9.40959409594096, "grad_norm": 6.525666236877441, "learning_rate": 5.297047970479705e-05, "loss": 2.311086082458496, "step": 33150 }, { "epoch": 9.41243258586432, "grad_norm": 6.546609878540039, "learning_rate": 5.295628725518025e-05, "loss": 2.3016534805297852, "step": 33160 }, { "epoch": 9.41527107578768, "grad_norm": 6.5930986404418945, "learning_rate": 5.294209480556345e-05, "loss": 2.3168876647949217, "step": 33170 }, { "epoch": 9.418109565711042, "grad_norm": 6.564723491668701, "learning_rate": 5.2927902355946633e-05, "loss": 2.3684463500976562, "step": 33180 }, { "epoch": 9.420948055634403, "grad_norm": 6.481032371520996, "learning_rate": 5.2913709906329834e-05, "loss": 2.263235664367676, "step": 33190 }, { "epoch": 9.423786545557764, "grad_norm": 6.760184288024902, "learning_rate": 5.2899517456713034e-05, "loss": 2.2778961181640627, "step": 33200 }, { "epoch": 9.426625035481123, "grad_norm": 7.089434623718262, "learning_rate": 5.2885325007096234e-05, "loss": 
2.306510162353516, "step": 33210 }, { "epoch": 9.429463525404485, "grad_norm": 6.879021644592285, "learning_rate": 5.287113255747942e-05, "loss": 2.166962242126465, "step": 33220 }, { "epoch": 9.432302015327846, "grad_norm": 6.235321998596191, "learning_rate": 5.285694010786262e-05, "loss": 2.233186721801758, "step": 33230 }, { "epoch": 9.435140505251207, "grad_norm": 7.062513828277588, "learning_rate": 5.2842747658245814e-05, "loss": 2.269602584838867, "step": 33240 }, { "epoch": 9.437978995174568, "grad_norm": 6.590318202972412, "learning_rate": 5.2828555208629014e-05, "loss": 2.3419189453125, "step": 33250 }, { "epoch": 9.440817485097927, "grad_norm": 6.5108561515808105, "learning_rate": 5.28143627590122e-05, "loss": 2.2469802856445313, "step": 33260 }, { "epoch": 9.443655975021288, "grad_norm": 6.724944591522217, "learning_rate": 5.28001703093954e-05, "loss": 2.2815460205078124, "step": 33270 }, { "epoch": 9.44649446494465, "grad_norm": 6.9560675621032715, "learning_rate": 5.27859778597786e-05, "loss": 2.365683364868164, "step": 33280 }, { "epoch": 9.44933295486801, "grad_norm": 7.029690265655518, "learning_rate": 5.27717854101618e-05, "loss": 2.325170135498047, "step": 33290 }, { "epoch": 9.452171444791372, "grad_norm": 6.426738739013672, "learning_rate": 5.275759296054499e-05, "loss": 2.338051605224609, "step": 33300 }, { "epoch": 9.455009934714731, "grad_norm": 7.1550469398498535, "learning_rate": 5.274340051092819e-05, "loss": 2.346786880493164, "step": 33310 }, { "epoch": 9.457848424638092, "grad_norm": 6.771137714385986, "learning_rate": 5.272920806131139e-05, "loss": 2.2711812973022463, "step": 33320 }, { "epoch": 9.460686914561453, "grad_norm": 6.609459400177002, "learning_rate": 5.271501561169459e-05, "loss": 2.303146743774414, "step": 33330 }, { "epoch": 9.463525404484814, "grad_norm": 6.752808094024658, "learning_rate": 5.2700823162077774e-05, "loss": 2.345793342590332, "step": 33340 }, { "epoch": 9.466363894408175, "grad_norm": 6.9947638511657715, 
"learning_rate": 5.2686630712460974e-05, "loss": 2.3192537307739256, "step": 33350 }, { "epoch": 9.469202384331535, "grad_norm": 7.279099941253662, "learning_rate": 5.267243826284417e-05, "loss": 2.2516252517700197, "step": 33360 }, { "epoch": 9.472040874254896, "grad_norm": 6.785923004150391, "learning_rate": 5.265824581322737e-05, "loss": 2.3400619506835936, "step": 33370 }, { "epoch": 9.474879364178257, "grad_norm": 6.568871974945068, "learning_rate": 5.2644053363610554e-05, "loss": 2.341466522216797, "step": 33380 }, { "epoch": 9.477717854101618, "grad_norm": 6.598638534545898, "learning_rate": 5.2629860913993754e-05, "loss": 2.2863922119140625, "step": 33390 }, { "epoch": 9.48055634402498, "grad_norm": 6.772311210632324, "learning_rate": 5.2615668464376954e-05, "loss": 2.330013656616211, "step": 33400 }, { "epoch": 9.48339483394834, "grad_norm": 6.492908954620361, "learning_rate": 5.2601476014760154e-05, "loss": 2.3602836608886717, "step": 33410 }, { "epoch": 9.4862333238717, "grad_norm": 6.732814788818359, "learning_rate": 5.258728356514334e-05, "loss": 2.3295883178710937, "step": 33420 }, { "epoch": 9.48907181379506, "grad_norm": 6.595634937286377, "learning_rate": 5.257309111552654e-05, "loss": 2.2975467681884765, "step": 33430 }, { "epoch": 9.491910303718422, "grad_norm": 6.776249885559082, "learning_rate": 5.255889866590974e-05, "loss": 2.2988353729248048, "step": 33440 }, { "epoch": 9.494748793641783, "grad_norm": 6.957032203674316, "learning_rate": 5.254470621629294e-05, "loss": 2.2704116821289064, "step": 33450 }, { "epoch": 9.497587283565144, "grad_norm": 6.798323631286621, "learning_rate": 5.253051376667613e-05, "loss": 2.3769205093383787, "step": 33460 }, { "epoch": 9.500425773488503, "grad_norm": 6.4832611083984375, "learning_rate": 5.251632131705933e-05, "loss": 2.2823991775512695, "step": 33470 }, { "epoch": 9.503264263411864, "grad_norm": 6.6639299392700195, "learning_rate": 5.250212886744252e-05, "loss": 2.3649839401245116, "step": 33480 }, { 
"epoch": 9.506102753335226, "grad_norm": 6.39483118057251, "learning_rate": 5.248793641782572e-05, "loss": 2.309320640563965, "step": 33490 }, { "epoch": 9.508941243258587, "grad_norm": 6.712348937988281, "learning_rate": 5.247374396820891e-05, "loss": 2.2509660720825195, "step": 33500 }, { "epoch": 9.508941243258587, "eval_accuracy": 0.31150251160424747, "eval_loss": 2.6050918102264404, "eval_runtime": 50.7331, "eval_samples_per_second": 309.995, "eval_steps_per_second": 4.849, "step": 33500 }, { "epoch": 9.511779733181948, "grad_norm": 6.789028167724609, "learning_rate": 5.245955151859211e-05, "loss": 2.2478065490722656, "step": 33510 }, { "epoch": 9.514618223105309, "grad_norm": 6.734299182891846, "learning_rate": 5.244535906897531e-05, "loss": 2.271519660949707, "step": 33520 }, { "epoch": 9.517456713028668, "grad_norm": 6.638622283935547, "learning_rate": 5.243116661935851e-05, "loss": 2.3153329849243165, "step": 33530 }, { "epoch": 9.52029520295203, "grad_norm": 6.622377395629883, "learning_rate": 5.2416974169741695e-05, "loss": 2.2869590759277343, "step": 33540 }, { "epoch": 9.52313369287539, "grad_norm": 6.796307563781738, "learning_rate": 5.2402781720124895e-05, "loss": 2.306618309020996, "step": 33550 }, { "epoch": 9.525972182798752, "grad_norm": 6.19141149520874, "learning_rate": 5.2388589270508095e-05, "loss": 2.2710214614868165, "step": 33560 }, { "epoch": 9.528810672722113, "grad_norm": 6.915771961212158, "learning_rate": 5.2374396820891295e-05, "loss": 2.3828821182250977, "step": 33570 }, { "epoch": 9.531649162645472, "grad_norm": 6.685100078582764, "learning_rate": 5.236020437127448e-05, "loss": 2.2466773986816406, "step": 33580 }, { "epoch": 9.534487652568833, "grad_norm": 6.269875526428223, "learning_rate": 5.234601192165768e-05, "loss": 2.222789764404297, "step": 33590 }, { "epoch": 9.537326142492194, "grad_norm": 7.202755451202393, "learning_rate": 5.2331819472040875e-05, "loss": 2.3623117446899413, "step": 33600 }, { "epoch": 9.540164632415555, 
"grad_norm": 6.915825843811035, "learning_rate": 5.2317627022424075e-05, "loss": 2.366685485839844, "step": 33610 }, { "epoch": 9.543003122338916, "grad_norm": 6.925063610076904, "learning_rate": 5.230343457280726e-05, "loss": 2.309331703186035, "step": 33620 }, { "epoch": 9.545841612262276, "grad_norm": 6.750931739807129, "learning_rate": 5.228924212319046e-05, "loss": 2.3269433975219727, "step": 33630 }, { "epoch": 9.548680102185637, "grad_norm": 6.6139631271362305, "learning_rate": 5.227504967357366e-05, "loss": 2.3360569000244142, "step": 33640 }, { "epoch": 9.551518592108998, "grad_norm": 6.628205299377441, "learning_rate": 5.226085722395686e-05, "loss": 2.2847900390625, "step": 33650 }, { "epoch": 9.55435708203236, "grad_norm": 6.631691932678223, "learning_rate": 5.224666477434005e-05, "loss": 2.2707204818725586, "step": 33660 }, { "epoch": 9.55719557195572, "grad_norm": 6.34300422668457, "learning_rate": 5.223247232472325e-05, "loss": 2.2319259643554688, "step": 33670 }, { "epoch": 9.56003406187908, "grad_norm": 6.610063076019287, "learning_rate": 5.221827987510645e-05, "loss": 2.318767547607422, "step": 33680 }, { "epoch": 9.56287255180244, "grad_norm": 6.641622543334961, "learning_rate": 5.220408742548964e-05, "loss": 2.3098249435424805, "step": 33690 }, { "epoch": 9.565711041725802, "grad_norm": 6.338221073150635, "learning_rate": 5.2189894975872835e-05, "loss": 2.233795166015625, "step": 33700 }, { "epoch": 9.568549531649163, "grad_norm": 7.127760887145996, "learning_rate": 5.217570252625603e-05, "loss": 2.3302831649780273, "step": 33710 }, { "epoch": 9.571388021572524, "grad_norm": 6.379977226257324, "learning_rate": 5.216151007663923e-05, "loss": 2.284628486633301, "step": 33720 }, { "epoch": 9.574226511495883, "grad_norm": 6.8734331130981445, "learning_rate": 5.214731762702243e-05, "loss": 2.3752601623535154, "step": 33730 }, { "epoch": 9.577065001419244, "grad_norm": 7.199255466461182, "learning_rate": 5.2133125177405615e-05, "loss": 
2.2776145935058594, "step": 33740 }, { "epoch": 9.579903491342606, "grad_norm": 6.47513484954834, "learning_rate": 5.2118932727788815e-05, "loss": 2.2686925888061524, "step": 33750 }, { "epoch": 9.582741981265967, "grad_norm": 6.44219446182251, "learning_rate": 5.2104740278172015e-05, "loss": 2.2395835876464845, "step": 33760 }, { "epoch": 9.585580471189328, "grad_norm": 6.50620174407959, "learning_rate": 5.2090547828555216e-05, "loss": 2.3740034103393555, "step": 33770 }, { "epoch": 9.588418961112689, "grad_norm": 6.619510650634766, "learning_rate": 5.20763553789384e-05, "loss": 2.2753772735595703, "step": 33780 }, { "epoch": 9.591257451036048, "grad_norm": 6.4819159507751465, "learning_rate": 5.20621629293216e-05, "loss": 2.3322017669677733, "step": 33790 }, { "epoch": 9.59409594095941, "grad_norm": 6.639336109161377, "learning_rate": 5.20479704797048e-05, "loss": 2.3003311157226562, "step": 33800 }, { "epoch": 9.59693443088277, "grad_norm": 6.424644470214844, "learning_rate": 5.2033778030087996e-05, "loss": 2.2851980209350584, "step": 33810 }, { "epoch": 9.599772920806132, "grad_norm": 6.458781719207764, "learning_rate": 5.2019585580471196e-05, "loss": 2.331221008300781, "step": 33820 }, { "epoch": 9.602611410729493, "grad_norm": 6.5243635177612305, "learning_rate": 5.200539313085438e-05, "loss": 2.3000629425048826, "step": 33830 }, { "epoch": 9.605449900652852, "grad_norm": 6.740006923675537, "learning_rate": 5.199120068123758e-05, "loss": 2.319666290283203, "step": 33840 }, { "epoch": 9.608288390576213, "grad_norm": 6.580132484436035, "learning_rate": 5.197700823162078e-05, "loss": 2.3366134643554686, "step": 33850 }, { "epoch": 9.611126880499574, "grad_norm": 6.309546947479248, "learning_rate": 5.196281578200398e-05, "loss": 2.2484193801879884, "step": 33860 }, { "epoch": 9.613965370422935, "grad_norm": 6.972612380981445, "learning_rate": 5.194862333238717e-05, "loss": 2.3525955200195314, "step": 33870 }, { "epoch": 9.616803860346296, "grad_norm": 
6.878298282623291, "learning_rate": 5.193443088277037e-05, "loss": 2.274906349182129, "step": 33880 }, { "epoch": 9.619642350269656, "grad_norm": 6.2220869064331055, "learning_rate": 5.192023843315357e-05, "loss": 2.374161148071289, "step": 33890 }, { "epoch": 9.622480840193017, "grad_norm": 6.364703178405762, "learning_rate": 5.190604598353677e-05, "loss": 2.244461250305176, "step": 33900 }, { "epoch": 9.625319330116378, "grad_norm": 6.499929428100586, "learning_rate": 5.1891853533919956e-05, "loss": 2.2265783309936524, "step": 33910 }, { "epoch": 9.628157820039739, "grad_norm": 6.680418968200684, "learning_rate": 5.1877661084303156e-05, "loss": 2.298781967163086, "step": 33920 }, { "epoch": 9.6309963099631, "grad_norm": 6.608136177062988, "learning_rate": 5.186346863468635e-05, "loss": 2.24312686920166, "step": 33930 }, { "epoch": 9.633834799886461, "grad_norm": 6.9229207038879395, "learning_rate": 5.184927618506955e-05, "loss": 2.2778377532958984, "step": 33940 }, { "epoch": 9.63667328980982, "grad_norm": 6.70973014831543, "learning_rate": 5.1835083735452736e-05, "loss": 2.2788908004760744, "step": 33950 }, { "epoch": 9.639511779733182, "grad_norm": 6.959266185760498, "learning_rate": 5.1820891285835936e-05, "loss": 2.4063953399658202, "step": 33960 }, { "epoch": 9.642350269656543, "grad_norm": 6.8199639320373535, "learning_rate": 5.1806698836219136e-05, "loss": 2.2584909439086913, "step": 33970 }, { "epoch": 9.645188759579904, "grad_norm": 6.276656627655029, "learning_rate": 5.1792506386602336e-05, "loss": 2.3167484283447264, "step": 33980 }, { "epoch": 9.648027249503265, "grad_norm": 6.672160625457764, "learning_rate": 5.177831393698552e-05, "loss": 2.3266574859619142, "step": 33990 }, { "epoch": 9.650865739426624, "grad_norm": 6.811497688293457, "learning_rate": 5.176412148736872e-05, "loss": 2.2715164184570313, "step": 34000 }, { "epoch": 9.650865739426624, "eval_accuracy": 0.30800534113308325, "eval_loss": 2.6035807132720947, "eval_runtime": 51.218, 
"eval_samples_per_second": 307.06, "eval_steps_per_second": 4.803, "step": 34000 }, { "epoch": 9.653704229349986, "grad_norm": 6.5368146896362305, "learning_rate": 5.174992903775192e-05, "loss": 2.351611328125, "step": 34010 }, { "epoch": 9.656542719273347, "grad_norm": 7.122772693634033, "learning_rate": 5.1735736588135116e-05, "loss": 2.310957908630371, "step": 34020 }, { "epoch": 9.659381209196708, "grad_norm": 6.501473426818848, "learning_rate": 5.172154413851831e-05, "loss": 2.359380531311035, "step": 34030 }, { "epoch": 9.662219699120069, "grad_norm": 6.607273101806641, "learning_rate": 5.170735168890151e-05, "loss": 2.209385108947754, "step": 34040 }, { "epoch": 9.665058189043428, "grad_norm": 7.218961238861084, "learning_rate": 5.16931592392847e-05, "loss": 2.231406402587891, "step": 34050 }, { "epoch": 9.66789667896679, "grad_norm": 6.611103057861328, "learning_rate": 5.16789667896679e-05, "loss": 2.256853485107422, "step": 34060 }, { "epoch": 9.67073516889015, "grad_norm": 6.607428073883057, "learning_rate": 5.166477434005109e-05, "loss": 2.2715694427490236, "step": 34070 }, { "epoch": 9.673573658813511, "grad_norm": 6.814079761505127, "learning_rate": 5.165058189043429e-05, "loss": 2.312468910217285, "step": 34080 }, { "epoch": 9.676412148736873, "grad_norm": 6.381890773773193, "learning_rate": 5.163638944081749e-05, "loss": 2.304341697692871, "step": 34090 }, { "epoch": 9.679250638660232, "grad_norm": 6.62519645690918, "learning_rate": 5.162219699120069e-05, "loss": 2.276098442077637, "step": 34100 }, { "epoch": 9.682089128583593, "grad_norm": 6.333555698394775, "learning_rate": 5.1608004541583877e-05, "loss": 2.264008140563965, "step": 34110 }, { "epoch": 9.684927618506954, "grad_norm": 6.563682556152344, "learning_rate": 5.159381209196708e-05, "loss": 2.3150894165039064, "step": 34120 }, { "epoch": 9.687766108430315, "grad_norm": 6.761855602264404, "learning_rate": 5.157961964235028e-05, "loss": 2.2525094985961913, "step": 34130 }, { "epoch": 
9.690604598353676, "grad_norm": 6.520663738250732, "learning_rate": 5.156542719273347e-05, "loss": 2.2531457901000977, "step": 34140 }, { "epoch": 9.693443088277036, "grad_norm": 6.735116958618164, "learning_rate": 5.1551234743116663e-05, "loss": 2.2929176330566405, "step": 34150 }, { "epoch": 9.696281578200397, "grad_norm": 6.386178016662598, "learning_rate": 5.153704229349986e-05, "loss": 2.3080070495605467, "step": 34160 }, { "epoch": 9.699120068123758, "grad_norm": 6.608628749847412, "learning_rate": 5.152284984388306e-05, "loss": 2.3257280349731446, "step": 34170 }, { "epoch": 9.701958558047119, "grad_norm": 6.793539047241211, "learning_rate": 5.150865739426626e-05, "loss": 2.3601783752441405, "step": 34180 }, { "epoch": 9.70479704797048, "grad_norm": 6.465395927429199, "learning_rate": 5.1494464944649443e-05, "loss": 2.2515190124511717, "step": 34190 }, { "epoch": 9.707635537893841, "grad_norm": 6.913885116577148, "learning_rate": 5.148169173999432e-05, "loss": 2.2913818359375, "step": 34200 }, { "epoch": 9.7104740278172, "grad_norm": 6.557673454284668, "learning_rate": 5.1467499290377516e-05, "loss": 2.2385812759399415, "step": 34210 }, { "epoch": 9.713312517740562, "grad_norm": 6.485337734222412, "learning_rate": 5.1453306840760716e-05, "loss": 2.3179956436157227, "step": 34220 }, { "epoch": 9.716151007663923, "grad_norm": 6.411746978759766, "learning_rate": 5.1439114391143916e-05, "loss": 2.2721630096435548, "step": 34230 }, { "epoch": 9.718989497587284, "grad_norm": 6.769303798675537, "learning_rate": 5.1424921941527116e-05, "loss": 2.3301155090332033, "step": 34240 }, { "epoch": 9.721827987510645, "grad_norm": 6.317365646362305, "learning_rate": 5.14107294919103e-05, "loss": 2.320301055908203, "step": 34250 }, { "epoch": 9.724666477434004, "grad_norm": 7.005182266235352, "learning_rate": 5.13965370422935e-05, "loss": 2.2959873199462892, "step": 34260 }, { "epoch": 9.727504967357365, "grad_norm": 6.500229358673096, "learning_rate": 5.13823445926767e-05, 
"loss": 2.287823295593262, "step": 34270 }, { "epoch": 9.730343457280727, "grad_norm": 6.727303981781006, "learning_rate": 5.13681521430599e-05, "loss": 2.3264066696166994, "step": 34280 }, { "epoch": 9.733181947204088, "grad_norm": 7.141427040100098, "learning_rate": 5.135395969344309e-05, "loss": 2.2942010879516603, "step": 34290 }, { "epoch": 9.736020437127449, "grad_norm": 6.820417881011963, "learning_rate": 5.133976724382629e-05, "loss": 2.3775516510009767, "step": 34300 }, { "epoch": 9.73885892705081, "grad_norm": 6.178140640258789, "learning_rate": 5.132557479420948e-05, "loss": 2.25179386138916, "step": 34310 }, { "epoch": 9.74169741697417, "grad_norm": 6.728222846984863, "learning_rate": 5.131138234459268e-05, "loss": 2.2936206817626954, "step": 34320 }, { "epoch": 9.74453590689753, "grad_norm": 6.756014823913574, "learning_rate": 5.129718989497587e-05, "loss": 2.3539459228515627, "step": 34330 }, { "epoch": 9.747374396820891, "grad_norm": 6.782238960266113, "learning_rate": 5.128299744535907e-05, "loss": 2.2587724685668946, "step": 34340 }, { "epoch": 9.750212886744253, "grad_norm": 6.192161560058594, "learning_rate": 5.126880499574227e-05, "loss": 2.336833381652832, "step": 34350 }, { "epoch": 9.753051376667614, "grad_norm": 6.9512248039245605, "learning_rate": 5.125461254612547e-05, "loss": 2.237404632568359, "step": 34360 }, { "epoch": 9.755889866590973, "grad_norm": 7.256182670593262, "learning_rate": 5.124042009650866e-05, "loss": 2.2804691314697267, "step": 34370 }, { "epoch": 9.758728356514334, "grad_norm": 6.920694828033447, "learning_rate": 5.122622764689186e-05, "loss": 2.303287696838379, "step": 34380 }, { "epoch": 9.761566846437695, "grad_norm": 6.763657093048096, "learning_rate": 5.121203519727506e-05, "loss": 2.3956212997436523, "step": 34390 }, { "epoch": 9.764405336361056, "grad_norm": 6.7898759841918945, "learning_rate": 5.119784274765825e-05, "loss": 2.2624227523803713, "step": 34400 }, { "epoch": 9.767243826284417, "grad_norm": 
6.468940734863281, "learning_rate": 5.1183650298041444e-05, "loss": 2.2386615753173826, "step": 34410 }, { "epoch": 9.770082316207777, "grad_norm": 6.9335761070251465, "learning_rate": 5.116945784842464e-05, "loss": 2.237003517150879, "step": 34420 }, { "epoch": 9.772920806131138, "grad_norm": 6.578887462615967, "learning_rate": 5.115526539880784e-05, "loss": 2.346457862854004, "step": 34430 }, { "epoch": 9.775759296054499, "grad_norm": 6.96647834777832, "learning_rate": 5.114107294919104e-05, "loss": 2.3199836730957033, "step": 34440 }, { "epoch": 9.77859778597786, "grad_norm": 6.745153427124023, "learning_rate": 5.1126880499574224e-05, "loss": 2.292905807495117, "step": 34450 }, { "epoch": 9.781436275901221, "grad_norm": 6.580673694610596, "learning_rate": 5.1112688049957424e-05, "loss": 2.300610160827637, "step": 34460 }, { "epoch": 9.78427476582458, "grad_norm": 6.729666709899902, "learning_rate": 5.1098495600340624e-05, "loss": 2.314024543762207, "step": 34470 }, { "epoch": 9.787113255747942, "grad_norm": 6.459171295166016, "learning_rate": 5.1084303150723824e-05, "loss": 2.251728057861328, "step": 34480 }, { "epoch": 9.789951745671303, "grad_norm": 6.766805648803711, "learning_rate": 5.107011070110701e-05, "loss": 2.309101867675781, "step": 34490 }, { "epoch": 9.792790235594664, "grad_norm": 6.527801513671875, "learning_rate": 5.105591825149021e-05, "loss": 2.272828483581543, "step": 34500 }, { "epoch": 9.792790235594664, "eval_accuracy": 0.31410949322820625, "eval_loss": 2.598240613937378, "eval_runtime": 52.3955, "eval_samples_per_second": 300.16, "eval_steps_per_second": 4.695, "step": 34500 }, { "epoch": 9.795628725518025, "grad_norm": 6.415210247039795, "learning_rate": 5.104172580187341e-05, "loss": 2.264708137512207, "step": 34510 }, { "epoch": 9.798467215441384, "grad_norm": 6.718161106109619, "learning_rate": 5.1027533352256604e-05, "loss": 2.2217803955078126, "step": 34520 }, { "epoch": 9.801305705364745, "grad_norm": 6.889768123626709, 
"learning_rate": 5.10133409026398e-05, "loss": 2.284589385986328, "step": 34530 }, { "epoch": 9.804144195288107, "grad_norm": 6.297563076019287, "learning_rate": 5.099914845302299e-05, "loss": 2.2812274932861327, "step": 34540 }, { "epoch": 9.806982685211468, "grad_norm": 6.9552693367004395, "learning_rate": 5.098495600340619e-05, "loss": 2.290567970275879, "step": 34550 }, { "epoch": 9.809821175134829, "grad_norm": 6.981598377227783, "learning_rate": 5.097076355378939e-05, "loss": 2.325593376159668, "step": 34560 }, { "epoch": 9.81265966505819, "grad_norm": 6.759881019592285, "learning_rate": 5.095657110417258e-05, "loss": 2.3464365005493164, "step": 34570 }, { "epoch": 9.81549815498155, "grad_norm": 6.349972724914551, "learning_rate": 5.094237865455578e-05, "loss": 2.3265108108520507, "step": 34580 }, { "epoch": 9.81833664490491, "grad_norm": 6.5381598472595215, "learning_rate": 5.092818620493898e-05, "loss": 2.3830820083618165, "step": 34590 }, { "epoch": 9.821175134828271, "grad_norm": 6.479859352111816, "learning_rate": 5.091399375532218e-05, "loss": 2.327689552307129, "step": 34600 }, { "epoch": 9.824013624751633, "grad_norm": 6.5352020263671875, "learning_rate": 5.0899801305705364e-05, "loss": 2.2673397064208984, "step": 34610 }, { "epoch": 9.826852114674994, "grad_norm": 6.968899250030518, "learning_rate": 5.0885608856088564e-05, "loss": 2.2647239685058596, "step": 34620 }, { "epoch": 9.829690604598353, "grad_norm": 6.587314128875732, "learning_rate": 5.0871416406471764e-05, "loss": 2.3042552947998045, "step": 34630 }, { "epoch": 9.832529094521714, "grad_norm": 6.487895488739014, "learning_rate": 5.085722395685496e-05, "loss": 2.309684944152832, "step": 34640 }, { "epoch": 9.835367584445075, "grad_norm": 6.413208961486816, "learning_rate": 5.084303150723815e-05, "loss": 2.3328298568725585, "step": 34650 }, { "epoch": 9.838206074368436, "grad_norm": 6.774788856506348, "learning_rate": 5.0828839057621344e-05, "loss": 2.397614097595215, "step": 34660 }, { 
"epoch": 9.841044564291797, "grad_norm": 6.521530628204346, "learning_rate": 5.0814646608004544e-05, "loss": 2.3231555938720705, "step": 34670 }, { "epoch": 9.843883054215157, "grad_norm": 7.113224506378174, "learning_rate": 5.0800454158387745e-05, "loss": 2.330312728881836, "step": 34680 }, { "epoch": 9.846721544138518, "grad_norm": 6.74880313873291, "learning_rate": 5.078626170877093e-05, "loss": 2.2707618713378905, "step": 34690 }, { "epoch": 9.849560034061879, "grad_norm": 6.451910495758057, "learning_rate": 5.077206925915413e-05, "loss": 2.2532417297363283, "step": 34700 }, { "epoch": 9.85239852398524, "grad_norm": 6.334444999694824, "learning_rate": 5.075787680953733e-05, "loss": 2.2907058715820314, "step": 34710 }, { "epoch": 9.855237013908601, "grad_norm": 6.605666160583496, "learning_rate": 5.074368435992053e-05, "loss": 2.283860778808594, "step": 34720 }, { "epoch": 9.858075503831962, "grad_norm": 6.605907440185547, "learning_rate": 5.072949191030372e-05, "loss": 2.2753467559814453, "step": 34730 }, { "epoch": 9.860913993755322, "grad_norm": 6.162079334259033, "learning_rate": 5.071529946068692e-05, "loss": 2.256519889831543, "step": 34740 }, { "epoch": 9.863752483678683, "grad_norm": 6.394400119781494, "learning_rate": 5.070110701107011e-05, "loss": 2.3175537109375, "step": 34750 }, { "epoch": 9.866590973602044, "grad_norm": 6.707743167877197, "learning_rate": 5.068691456145331e-05, "loss": 2.2979021072387695, "step": 34760 }, { "epoch": 9.869429463525405, "grad_norm": 6.302591800689697, "learning_rate": 5.0672722111836505e-05, "loss": 2.2774374008178713, "step": 34770 }, { "epoch": 9.872267953448766, "grad_norm": 6.5454182624816895, "learning_rate": 5.06585296622197e-05, "loss": 2.2585002899169924, "step": 34780 }, { "epoch": 9.875106443372125, "grad_norm": 6.911382675170898, "learning_rate": 5.06443372126029e-05, "loss": 2.3199947357177733, "step": 34790 }, { "epoch": 9.877944933295487, "grad_norm": 6.721408843994141, "learning_rate": 
5.06301447629861e-05, "loss": 2.2988452911376953, "step": 34800 }, { "epoch": 9.880783423218848, "grad_norm": 6.722107887268066, "learning_rate": 5.0615952313369285e-05, "loss": 2.362059211730957, "step": 34810 }, { "epoch": 9.883621913142209, "grad_norm": 6.914200782775879, "learning_rate": 5.0601759863752485e-05, "loss": 2.2878129959106444, "step": 34820 }, { "epoch": 9.88646040306557, "grad_norm": 6.599009037017822, "learning_rate": 5.0587567414135685e-05, "loss": 2.3313161849975588, "step": 34830 }, { "epoch": 9.88929889298893, "grad_norm": 6.504178524017334, "learning_rate": 5.0573374964518885e-05, "loss": 2.3301856994628904, "step": 34840 }, { "epoch": 9.89213738291229, "grad_norm": 6.52891731262207, "learning_rate": 5.055918251490207e-05, "loss": 2.290486717224121, "step": 34850 }, { "epoch": 9.894975872835651, "grad_norm": 6.672214984893799, "learning_rate": 5.054499006528527e-05, "loss": 2.283523750305176, "step": 34860 }, { "epoch": 9.897814362759012, "grad_norm": 6.47728967666626, "learning_rate": 5.0530797615668465e-05, "loss": 2.2737913131713867, "step": 34870 }, { "epoch": 9.900652852682374, "grad_norm": 6.453149318695068, "learning_rate": 5.0516605166051665e-05, "loss": 2.3540401458740234, "step": 34880 }, { "epoch": 9.903491342605733, "grad_norm": 6.653935432434082, "learning_rate": 5.050241271643485e-05, "loss": 2.2531112670898437, "step": 34890 }, { "epoch": 9.906329832529094, "grad_norm": 6.565589427947998, "learning_rate": 5.048822026681805e-05, "loss": 2.2764184951782225, "step": 34900 }, { "epoch": 9.909168322452455, "grad_norm": 6.851049900054932, "learning_rate": 5.047402781720125e-05, "loss": 2.321470260620117, "step": 34910 }, { "epoch": 9.912006812375816, "grad_norm": 6.497792720794678, "learning_rate": 5.045983536758445e-05, "loss": 2.221872329711914, "step": 34920 }, { "epoch": 9.914845302299177, "grad_norm": 6.557523727416992, "learning_rate": 5.044564291796764e-05, "loss": 2.293354797363281, "step": 34930 }, { "epoch": 
9.917683792222537, "grad_norm": 7.084096431732178, "learning_rate": 5.043145046835084e-05, "loss": 2.240324783325195, "step": 34940 }, { "epoch": 9.920522282145898, "grad_norm": 6.400211334228516, "learning_rate": 5.041725801873404e-05, "loss": 2.2914865493774412, "step": 34950 }, { "epoch": 9.923360772069259, "grad_norm": 6.662888526916504, "learning_rate": 5.040306556911724e-05, "loss": 2.3441442489624023, "step": 34960 }, { "epoch": 9.92619926199262, "grad_norm": 6.550973892211914, "learning_rate": 5.0388873119500425e-05, "loss": 2.2519378662109375, "step": 34970 }, { "epoch": 9.929037751915981, "grad_norm": 6.148077964782715, "learning_rate": 5.0374680669883626e-05, "loss": 2.280731773376465, "step": 34980 }, { "epoch": 9.931876241839342, "grad_norm": 6.422458171844482, "learning_rate": 5.036048822026682e-05, "loss": 2.3068185806274415, "step": 34990 }, { "epoch": 9.934714731762702, "grad_norm": 6.66442346572876, "learning_rate": 5.034629577065002e-05, "loss": 2.2833980560302733, "step": 35000 }, { "epoch": 9.934714731762702, "eval_accuracy": 0.3110574171806447, "eval_loss": 2.594289779663086, "eval_runtime": 51.0055, "eval_samples_per_second": 308.339, "eval_steps_per_second": 4.823, "step": 35000 }, { "epoch": 9.937553221686063, "grad_norm": 6.920291423797607, "learning_rate": 5.0332103321033205e-05, "loss": 2.3207950592041016, "step": 35010 }, { "epoch": 9.940391711609424, "grad_norm": 6.795268535614014, "learning_rate": 5.0317910871416406e-05, "loss": 2.278725433349609, "step": 35020 }, { "epoch": 9.943230201532785, "grad_norm": 6.696831226348877, "learning_rate": 5.0303718421799606e-05, "loss": 2.3092586517333986, "step": 35030 }, { "epoch": 9.946068691456146, "grad_norm": 6.9255805015563965, "learning_rate": 5.0289525972182806e-05, "loss": 2.2790252685546877, "step": 35040 }, { "epoch": 9.948907181379505, "grad_norm": 6.7697296142578125, "learning_rate": 5.027533352256599e-05, "loss": 2.328065872192383, "step": 35050 }, { "epoch": 9.951745671302866, 
"grad_norm": 7.258373737335205, "learning_rate": 5.026114107294919e-05, "loss": 2.322962188720703, "step": 35060 }, { "epoch": 9.954584161226228, "grad_norm": 6.797359943389893, "learning_rate": 5.024694862333239e-05, "loss": 2.3303525924682615, "step": 35070 }, { "epoch": 9.957422651149589, "grad_norm": 6.703403472900391, "learning_rate": 5.023275617371559e-05, "loss": 2.3755699157714845, "step": 35080 }, { "epoch": 9.96026114107295, "grad_norm": 6.512598037719727, "learning_rate": 5.021856372409878e-05, "loss": 2.3163890838623047, "step": 35090 }, { "epoch": 9.96309963099631, "grad_norm": 6.305145740509033, "learning_rate": 5.020437127448198e-05, "loss": 2.2334514617919923, "step": 35100 }, { "epoch": 9.96593812091967, "grad_norm": 7.097137928009033, "learning_rate": 5.019017882486517e-05, "loss": 2.314224624633789, "step": 35110 }, { "epoch": 9.968776610843031, "grad_norm": 6.602217674255371, "learning_rate": 5.017598637524837e-05, "loss": 2.3810827255249025, "step": 35120 }, { "epoch": 9.971615100766392, "grad_norm": 6.408008098602295, "learning_rate": 5.016179392563156e-05, "loss": 2.2389739990234374, "step": 35130 }, { "epoch": 9.974453590689754, "grad_norm": 6.6579203605651855, "learning_rate": 5.014760147601476e-05, "loss": 2.2234371185302733, "step": 35140 }, { "epoch": 9.977292080613115, "grad_norm": 6.733994483947754, "learning_rate": 5.013340902639796e-05, "loss": 2.296366310119629, "step": 35150 }, { "epoch": 9.980130570536474, "grad_norm": 6.868304252624512, "learning_rate": 5.011921657678116e-05, "loss": 2.335504722595215, "step": 35160 }, { "epoch": 9.982969060459835, "grad_norm": 6.556670665740967, "learning_rate": 5.0105024127164346e-05, "loss": 2.2822059631347655, "step": 35170 }, { "epoch": 9.985807550383196, "grad_norm": 7.073872089385986, "learning_rate": 5.0090831677547546e-05, "loss": 2.294108772277832, "step": 35180 }, { "epoch": 9.988646040306557, "grad_norm": 6.778182506561279, "learning_rate": 5.0076639227930746e-05, "loss": 
2.2856990814208986, "step": 35190 }, { "epoch": 9.991484530229918, "grad_norm": 6.542712211608887, "learning_rate": 5.006244677831394e-05, "loss": 2.3076305389404297, "step": 35200 }, { "epoch": 9.994323020153278, "grad_norm": 6.727949142456055, "learning_rate": 5.004825432869713e-05, "loss": 2.273734283447266, "step": 35210 }, { "epoch": 9.997161510076639, "grad_norm": 6.610326290130615, "learning_rate": 5.0034061879080326e-05, "loss": 2.3227800369262694, "step": 35220 }, { "epoch": 10.0, "grad_norm": null, "learning_rate": 5.0019869429463526e-05, "loss": 2.356099319458008, "step": 35230 }, { "epoch": 10.002838489923361, "grad_norm": 6.838511943817139, "learning_rate": 5.0007096224808406e-05, "loss": 2.2574745178222657, "step": 35240 }, { "epoch": 10.005676979846722, "grad_norm": 6.548144340515137, "learning_rate": 4.99929037751916e-05, "loss": 2.271523666381836, "step": 35250 }, { "epoch": 10.008515469770082, "grad_norm": 6.466483116149902, "learning_rate": 4.997871132557479e-05, "loss": 2.2540704727172853, "step": 35260 }, { "epoch": 10.011353959693443, "grad_norm": 6.592482089996338, "learning_rate": 4.996451887595799e-05, "loss": 2.2626670837402343, "step": 35270 }, { "epoch": 10.014192449616804, "grad_norm": 6.504171848297119, "learning_rate": 4.9950326426341186e-05, "loss": 2.3122304916381835, "step": 35280 }, { "epoch": 10.017030939540165, "grad_norm": 6.710816860198975, "learning_rate": 4.9936133976724386e-05, "loss": 2.24060001373291, "step": 35290 }, { "epoch": 10.019869429463526, "grad_norm": 6.731314659118652, "learning_rate": 4.992194152710758e-05, "loss": 2.3321813583374023, "step": 35300 }, { "epoch": 10.022707919386885, "grad_norm": 6.447091579437256, "learning_rate": 4.990774907749078e-05, "loss": 2.189549446105957, "step": 35310 }, { "epoch": 10.025546409310246, "grad_norm": 6.632152080535889, "learning_rate": 4.989355662787397e-05, "loss": 2.2474763870239256, "step": 35320 }, { "epoch": 10.028384899233608, "grad_norm": 6.859679222106934, 
"learning_rate": 4.987936417825717e-05, "loss": 2.328726387023926, "step": 35330 }, { "epoch": 10.031223389156969, "grad_norm": 6.780179500579834, "learning_rate": 4.9865171728640366e-05, "loss": 2.3124008178710938, "step": 35340 }, { "epoch": 10.03406187908033, "grad_norm": 6.657329082489014, "learning_rate": 4.9850979279023566e-05, "loss": 2.289463424682617, "step": 35350 }, { "epoch": 10.03690036900369, "grad_norm": 6.429683208465576, "learning_rate": 4.983678682940676e-05, "loss": 2.2781494140625, "step": 35360 }, { "epoch": 10.03973885892705, "grad_norm": 6.487689018249512, "learning_rate": 4.982259437978995e-05, "loss": 2.1849786758422853, "step": 35370 }, { "epoch": 10.042577348850411, "grad_norm": 6.7076263427734375, "learning_rate": 4.9808401930173146e-05, "loss": 2.278023529052734, "step": 35380 }, { "epoch": 10.045415838773772, "grad_norm": 6.625844478607178, "learning_rate": 4.9794209480556346e-05, "loss": 2.239242172241211, "step": 35390 }, { "epoch": 10.048254328697134, "grad_norm": 6.94658899307251, "learning_rate": 4.978001703093954e-05, "loss": 2.3141033172607424, "step": 35400 }, { "epoch": 10.051092818620495, "grad_norm": 6.799901962280273, "learning_rate": 4.976582458132274e-05, "loss": 2.2592645645141602, "step": 35410 }, { "epoch": 10.053931308543854, "grad_norm": 6.553558826446533, "learning_rate": 4.975163213170593e-05, "loss": 2.2669805526733398, "step": 35420 }, { "epoch": 10.056769798467215, "grad_norm": 6.434407711029053, "learning_rate": 4.973743968208913e-05, "loss": 2.275459861755371, "step": 35430 }, { "epoch": 10.059608288390576, "grad_norm": 6.365653038024902, "learning_rate": 4.9723247232472326e-05, "loss": 2.248605728149414, "step": 35440 }, { "epoch": 10.062446778313937, "grad_norm": 6.629933834075928, "learning_rate": 4.9709054782855526e-05, "loss": 2.2825716018676756, "step": 35450 }, { "epoch": 10.065285268237298, "grad_norm": 6.266193389892578, "learning_rate": 4.969486233323872e-05, "loss": 2.28454647064209, "step": 35460 
}, { "epoch": 10.068123758160658, "grad_norm": 7.050620079040527, "learning_rate": 4.968066988362191e-05, "loss": 2.325722503662109, "step": 35470 }, { "epoch": 10.070962248084019, "grad_norm": 6.841732978820801, "learning_rate": 4.9666477434005106e-05, "loss": 2.261321258544922, "step": 35480 }, { "epoch": 10.07380073800738, "grad_norm": 6.410193920135498, "learning_rate": 4.9652284984388307e-05, "loss": 2.27252311706543, "step": 35490 }, { "epoch": 10.076639227930741, "grad_norm": 6.4743428230285645, "learning_rate": 4.96380925347715e-05, "loss": 2.247066688537598, "step": 35500 }, { "epoch": 10.076639227930741, "eval_accuracy": 0.3179245882876582, "eval_loss": 2.5860135555267334, "eval_runtime": 50.7586, "eval_samples_per_second": 309.839, "eval_steps_per_second": 4.846, "step": 35500 }, { "epoch": 10.079477717854102, "grad_norm": 6.996546268463135, "learning_rate": 4.96239000851547e-05, "loss": 2.280501937866211, "step": 35510 }, { "epoch": 10.082316207777462, "grad_norm": 6.858736038208008, "learning_rate": 4.960970763553789e-05, "loss": 2.412196731567383, "step": 35520 }, { "epoch": 10.085154697700823, "grad_norm": 6.308370590209961, "learning_rate": 4.959551518592109e-05, "loss": 2.2609222412109373, "step": 35530 }, { "epoch": 10.087993187624184, "grad_norm": 6.423556327819824, "learning_rate": 4.958132273630429e-05, "loss": 2.2591726303100588, "step": 35540 }, { "epoch": 10.090831677547545, "grad_norm": 7.027958869934082, "learning_rate": 4.956713028668749e-05, "loss": 2.2614486694335936, "step": 35550 }, { "epoch": 10.093670167470906, "grad_norm": 7.156031131744385, "learning_rate": 4.955293783707068e-05, "loss": 2.332539749145508, "step": 35560 }, { "epoch": 10.096508657394267, "grad_norm": 6.5071563720703125, "learning_rate": 4.953874538745388e-05, "loss": 2.1865657806396483, "step": 35570 }, { "epoch": 10.099347147317626, "grad_norm": 6.4133992195129395, "learning_rate": 4.9524552937837074e-05, "loss": 2.230076217651367, "step": 35580 }, { "epoch": 
10.102185637240988, "grad_norm": 6.89766263961792, "learning_rate": 4.951036048822027e-05, "loss": 2.2758678436279296, "step": 35590 }, { "epoch": 10.105024127164349, "grad_norm": 7.2801313400268555, "learning_rate": 4.949616803860346e-05, "loss": 2.168141746520996, "step": 35600 }, { "epoch": 10.10786261708771, "grad_norm": 6.626921653747559, "learning_rate": 4.948197558898666e-05, "loss": 2.2402549743652345, "step": 35610 }, { "epoch": 10.11070110701107, "grad_norm": 6.94560432434082, "learning_rate": 4.9467783139369854e-05, "loss": 2.226170539855957, "step": 35620 }, { "epoch": 10.11353959693443, "grad_norm": 6.964014530181885, "learning_rate": 4.9453590689753054e-05, "loss": 2.2853954315185545, "step": 35630 }, { "epoch": 10.116378086857791, "grad_norm": 6.372677326202393, "learning_rate": 4.943939824013625e-05, "loss": 2.2126977920532225, "step": 35640 }, { "epoch": 10.119216576781152, "grad_norm": 6.535754203796387, "learning_rate": 4.942520579051945e-05, "loss": 2.25018367767334, "step": 35650 }, { "epoch": 10.122055066704513, "grad_norm": 6.454680919647217, "learning_rate": 4.941101334090264e-05, "loss": 2.3151618957519533, "step": 35660 }, { "epoch": 10.124893556627875, "grad_norm": 6.320524215698242, "learning_rate": 4.939682089128584e-05, "loss": 2.2165195465087892, "step": 35670 }, { "epoch": 10.127732046551234, "grad_norm": 6.4838762283325195, "learning_rate": 4.9382628441669034e-05, "loss": 2.305851173400879, "step": 35680 }, { "epoch": 10.130570536474595, "grad_norm": 6.685133457183838, "learning_rate": 4.9368435992052234e-05, "loss": 2.3338993072509764, "step": 35690 }, { "epoch": 10.133409026397956, "grad_norm": 6.491164207458496, "learning_rate": 4.935424354243543e-05, "loss": 2.1974620819091797, "step": 35700 }, { "epoch": 10.136247516321317, "grad_norm": 6.947259426116943, "learning_rate": 4.934005109281862e-05, "loss": 2.240152359008789, "step": 35710 }, { "epoch": 10.139086006244678, "grad_norm": 6.509232521057129, "learning_rate": 
4.9325858643201814e-05, "loss": 2.2679925918579102, "step": 35720 }, { "epoch": 10.141924496168038, "grad_norm": 6.874953269958496, "learning_rate": 4.9311666193585014e-05, "loss": 2.295589637756348, "step": 35730 }, { "epoch": 10.144762986091399, "grad_norm": 6.565492153167725, "learning_rate": 4.929747374396821e-05, "loss": 2.308304786682129, "step": 35740 }, { "epoch": 10.14760147601476, "grad_norm": 6.434581756591797, "learning_rate": 4.928328129435141e-05, "loss": 2.3384109497070313, "step": 35750 }, { "epoch": 10.150439965938121, "grad_norm": 6.453946590423584, "learning_rate": 4.92690888447346e-05, "loss": 2.3248226165771486, "step": 35760 }, { "epoch": 10.153278455861482, "grad_norm": 6.555843830108643, "learning_rate": 4.92548963951178e-05, "loss": 2.219286346435547, "step": 35770 }, { "epoch": 10.156116945784843, "grad_norm": 6.653076171875, "learning_rate": 4.9240703945500994e-05, "loss": 2.2339719772338866, "step": 35780 }, { "epoch": 10.158955435708203, "grad_norm": 6.784511089324951, "learning_rate": 4.9226511495884194e-05, "loss": 2.2753042221069335, "step": 35790 }, { "epoch": 10.161793925631564, "grad_norm": 6.643929481506348, "learning_rate": 4.921231904626739e-05, "loss": 2.2379119873046873, "step": 35800 }, { "epoch": 10.164632415554925, "grad_norm": 6.400142192840576, "learning_rate": 4.919812659665059e-05, "loss": 2.2002077102661133, "step": 35810 }, { "epoch": 10.167470905478286, "grad_norm": 6.576535701751709, "learning_rate": 4.918393414703378e-05, "loss": 2.325971984863281, "step": 35820 }, { "epoch": 10.170309395401647, "grad_norm": 7.1523308753967285, "learning_rate": 4.9169741697416974e-05, "loss": 2.2334405899047853, "step": 35830 }, { "epoch": 10.173147885325006, "grad_norm": 6.343196392059326, "learning_rate": 4.915554924780017e-05, "loss": 2.2573585510253906, "step": 35840 }, { "epoch": 10.175986375248367, "grad_norm": 6.781225204467773, "learning_rate": 4.914135679818337e-05, "loss": 2.2959548950195314, "step": 35850 }, { "epoch": 
10.178824865171729, "grad_norm": 7.0926103591918945, "learning_rate": 4.912716434856656e-05, "loss": 2.2239269256591796, "step": 35860 }, { "epoch": 10.18166335509509, "grad_norm": 6.7843194007873535, "learning_rate": 4.911297189894976e-05, "loss": 2.226934623718262, "step": 35870 }, { "epoch": 10.18450184501845, "grad_norm": 6.6055474281311035, "learning_rate": 4.9098779449332954e-05, "loss": 2.2484195709228514, "step": 35880 }, { "epoch": 10.18734033494181, "grad_norm": 6.523634433746338, "learning_rate": 4.9084586999716155e-05, "loss": 2.2981740951538088, "step": 35890 }, { "epoch": 10.190178824865171, "grad_norm": 6.854654788970947, "learning_rate": 4.9070394550099355e-05, "loss": 2.23681755065918, "step": 35900 }, { "epoch": 10.193017314788532, "grad_norm": 6.436176300048828, "learning_rate": 4.905620210048255e-05, "loss": 2.282923698425293, "step": 35910 }, { "epoch": 10.195855804711893, "grad_norm": 6.85226583480835, "learning_rate": 4.904200965086574e-05, "loss": 2.2317686080932617, "step": 35920 }, { "epoch": 10.198694294635255, "grad_norm": 6.6786603927612305, "learning_rate": 4.9027817201248935e-05, "loss": 2.2967662811279297, "step": 35930 }, { "epoch": 10.201532784558616, "grad_norm": 6.5782694816589355, "learning_rate": 4.9013624751632135e-05, "loss": 2.268139457702637, "step": 35940 }, { "epoch": 10.204371274481975, "grad_norm": 6.849796295166016, "learning_rate": 4.899943230201533e-05, "loss": 2.3242889404296876, "step": 35950 }, { "epoch": 10.207209764405336, "grad_norm": 6.511605262756348, "learning_rate": 4.898523985239853e-05, "loss": 2.293117332458496, "step": 35960 }, { "epoch": 10.210048254328697, "grad_norm": 6.386241436004639, "learning_rate": 4.897104740278172e-05, "loss": 2.261976623535156, "step": 35970 }, { "epoch": 10.212886744252058, "grad_norm": 6.583485126495361, "learning_rate": 4.895685495316492e-05, "loss": 2.310089683532715, "step": 35980 }, { "epoch": 10.21572523417542, "grad_norm": 6.4829936027526855, "learning_rate": 
4.8942662503548115e-05, "loss": 2.25794677734375, "step": 35990 }, { "epoch": 10.218563724098779, "grad_norm": 6.569428443908691, "learning_rate": 4.8928470053931315e-05, "loss": 2.264516830444336, "step": 36000 }, { "epoch": 10.218563724098779, "eval_accuracy": 0.31143892668658996, "eval_loss": 2.588238000869751, "eval_runtime": 62.2122, "eval_samples_per_second": 252.796, "eval_steps_per_second": 3.954, "step": 36000 }, { "epoch": 10.22140221402214, "grad_norm": 6.882806777954102, "learning_rate": 4.891427760431451e-05, "loss": 2.294978713989258, "step": 36010 }, { "epoch": 10.224240703945501, "grad_norm": 6.40712833404541, "learning_rate": 4.890008515469771e-05, "loss": 2.257463073730469, "step": 36020 }, { "epoch": 10.227079193868862, "grad_norm": 6.606792449951172, "learning_rate": 4.88858927050809e-05, "loss": 2.242086410522461, "step": 36030 }, { "epoch": 10.229917683792223, "grad_norm": 6.814462184906006, "learning_rate": 4.8871700255464095e-05, "loss": 2.218131256103516, "step": 36040 }, { "epoch": 10.232756173715583, "grad_norm": 6.48512077331543, "learning_rate": 4.885750780584729e-05, "loss": 2.2368162155151365, "step": 36050 }, { "epoch": 10.235594663638944, "grad_norm": 6.321904182434082, "learning_rate": 4.884331535623049e-05, "loss": 2.2532058715820313, "step": 36060 }, { "epoch": 10.238433153562305, "grad_norm": 6.404046058654785, "learning_rate": 4.882912290661368e-05, "loss": 2.173914337158203, "step": 36070 }, { "epoch": 10.241271643485666, "grad_norm": 6.561601638793945, "learning_rate": 4.881493045699688e-05, "loss": 2.22094669342041, "step": 36080 }, { "epoch": 10.244110133409027, "grad_norm": 6.137237548828125, "learning_rate": 4.8800738007380075e-05, "loss": 2.309991645812988, "step": 36090 }, { "epoch": 10.246948623332386, "grad_norm": 6.6940741539001465, "learning_rate": 4.8786545557763275e-05, "loss": 2.225143241882324, "step": 36100 }, { "epoch": 10.249787113255747, "grad_norm": 6.68137264251709, "learning_rate": 4.877235310814647e-05, 
"loss": 2.2075061798095703, "step": 36110 }, { "epoch": 10.252625603179109, "grad_norm": 6.59395694732666, "learning_rate": 4.875816065852967e-05, "loss": 2.2665945053100587, "step": 36120 }, { "epoch": 10.25546409310247, "grad_norm": 6.3791890144348145, "learning_rate": 4.874396820891286e-05, "loss": 2.214837646484375, "step": 36130 }, { "epoch": 10.25830258302583, "grad_norm": 6.380196571350098, "learning_rate": 4.872977575929606e-05, "loss": 2.2282482147216798, "step": 36140 }, { "epoch": 10.261141072949192, "grad_norm": 6.418828964233398, "learning_rate": 4.8715583309679255e-05, "loss": 2.315639877319336, "step": 36150 }, { "epoch": 10.263979562872551, "grad_norm": 6.262418746948242, "learning_rate": 4.870139086006245e-05, "loss": 2.19925537109375, "step": 36160 }, { "epoch": 10.266818052795912, "grad_norm": 6.546813011169434, "learning_rate": 4.868719841044564e-05, "loss": 2.2232028961181642, "step": 36170 }, { "epoch": 10.269656542719273, "grad_norm": 6.4865641593933105, "learning_rate": 4.867300596082884e-05, "loss": 2.308414840698242, "step": 36180 }, { "epoch": 10.272495032642635, "grad_norm": 6.406373977661133, "learning_rate": 4.8658813511212036e-05, "loss": 2.204045295715332, "step": 36190 }, { "epoch": 10.275333522565996, "grad_norm": 6.773521900177002, "learning_rate": 4.8644621061595236e-05, "loss": 2.266647720336914, "step": 36200 }, { "epoch": 10.278172012489355, "grad_norm": 6.948946475982666, "learning_rate": 4.863042861197843e-05, "loss": 2.2906585693359376, "step": 36210 }, { "epoch": 10.281010502412716, "grad_norm": 6.601156711578369, "learning_rate": 4.861623616236163e-05, "loss": 2.323743438720703, "step": 36220 }, { "epoch": 10.283848992336077, "grad_norm": 6.718098163604736, "learning_rate": 4.860204371274482e-05, "loss": 2.2890317916870115, "step": 36230 }, { "epoch": 10.286687482259438, "grad_norm": 6.513415336608887, "learning_rate": 4.858785126312802e-05, "loss": 2.270050621032715, "step": 36240 }, { "epoch": 10.2895259721828, 
"grad_norm": 6.560976505279541, "learning_rate": 4.8573658813511216e-05, "loss": 2.1584714889526366, "step": 36250 }, { "epoch": 10.292364462106159, "grad_norm": 6.845945358276367, "learning_rate": 4.8559466363894416e-05, "loss": 2.269166946411133, "step": 36260 }, { "epoch": 10.29520295202952, "grad_norm": 6.186464786529541, "learning_rate": 4.854527391427761e-05, "loss": 2.2748348236083986, "step": 36270 }, { "epoch": 10.298041441952881, "grad_norm": 6.367295742034912, "learning_rate": 4.85310814646608e-05, "loss": 2.27453498840332, "step": 36280 }, { "epoch": 10.300879931876242, "grad_norm": 6.736718654632568, "learning_rate": 4.8516889015043996e-05, "loss": 2.273807334899902, "step": 36290 }, { "epoch": 10.303718421799603, "grad_norm": 6.614945888519287, "learning_rate": 4.8502696565427196e-05, "loss": 2.1998992919921876, "step": 36300 }, { "epoch": 10.306556911722964, "grad_norm": 6.364208698272705, "learning_rate": 4.848850411581039e-05, "loss": 2.2133451461791993, "step": 36310 }, { "epoch": 10.309395401646324, "grad_norm": 6.8537116050720215, "learning_rate": 4.847431166619359e-05, "loss": 2.2842338562011717, "step": 36320 }, { "epoch": 10.312233891569685, "grad_norm": 6.511342525482178, "learning_rate": 4.846011921657678e-05, "loss": 2.235326385498047, "step": 36330 }, { "epoch": 10.315072381493046, "grad_norm": 6.302047252655029, "learning_rate": 4.844592676695998e-05, "loss": 2.24227352142334, "step": 36340 }, { "epoch": 10.317910871416407, "grad_norm": 6.4075927734375, "learning_rate": 4.8431734317343176e-05, "loss": 2.1906347274780273, "step": 36350 }, { "epoch": 10.320749361339768, "grad_norm": 7.063528537750244, "learning_rate": 4.8417541867726376e-05, "loss": 2.3249650955200196, "step": 36360 }, { "epoch": 10.323587851263127, "grad_norm": 6.470815658569336, "learning_rate": 4.840334941810957e-05, "loss": 2.2170639038085938, "step": 36370 }, { "epoch": 10.326426341186489, "grad_norm": 6.399944305419922, "learning_rate": 4.838915696849276e-05, "loss": 
2.228102684020996, "step": 36380 }, { "epoch": 10.32926483110985, "grad_norm": 7.123491287231445, "learning_rate": 4.8374964518875956e-05, "loss": 2.2556718826293944, "step": 36390 }, { "epoch": 10.33210332103321, "grad_norm": 6.709014415740967, "learning_rate": 4.8360772069259156e-05, "loss": 2.2803531646728517, "step": 36400 }, { "epoch": 10.334941810956572, "grad_norm": 6.588078498840332, "learning_rate": 4.834657961964235e-05, "loss": 2.2522699356079103, "step": 36410 }, { "epoch": 10.337780300879931, "grad_norm": 6.85468864440918, "learning_rate": 4.833238717002555e-05, "loss": 2.241667556762695, "step": 36420 }, { "epoch": 10.340618790803292, "grad_norm": 6.636046886444092, "learning_rate": 4.831819472040874e-05, "loss": 2.2749565124511717, "step": 36430 }, { "epoch": 10.343457280726653, "grad_norm": 6.539511203765869, "learning_rate": 4.830400227079194e-05, "loss": 2.2985292434692384, "step": 36440 }, { "epoch": 10.346295770650014, "grad_norm": 6.288649082183838, "learning_rate": 4.8289809821175136e-05, "loss": 2.2245874404907227, "step": 36450 }, { "epoch": 10.349134260573376, "grad_norm": 6.341139316558838, "learning_rate": 4.8275617371558337e-05, "loss": 2.308079719543457, "step": 36460 }, { "epoch": 10.351972750496735, "grad_norm": 6.688266277313232, "learning_rate": 4.826142492194153e-05, "loss": 2.2063419342041017, "step": 36470 }, { "epoch": 10.354811240420096, "grad_norm": 6.3997602462768555, "learning_rate": 4.824723247232473e-05, "loss": 2.3096975326538085, "step": 36480 }, { "epoch": 10.357649730343457, "grad_norm": 6.431365013122559, "learning_rate": 4.823304002270792e-05, "loss": 2.2523626327514648, "step": 36490 }, { "epoch": 10.360488220266818, "grad_norm": 6.210048675537109, "learning_rate": 4.8218847573091117e-05, "loss": 2.245700454711914, "step": 36500 }, { "epoch": 10.360488220266818, "eval_accuracy": 0.3123291155337954, "eval_loss": 2.5837631225585938, "eval_runtime": 48.9714, "eval_samples_per_second": 321.147, "eval_steps_per_second": 
5.023, "step": 36500 }, { "epoch": 10.36332671019018, "grad_norm": 6.529856204986572, "learning_rate": 4.820465512347431e-05, "loss": 2.2988780975341796, "step": 36510 }, { "epoch": 10.366165200113539, "grad_norm": 6.604257106781006, "learning_rate": 4.819046267385751e-05, "loss": 2.2730297088623046, "step": 36520 }, { "epoch": 10.3690036900369, "grad_norm": 6.040860176086426, "learning_rate": 4.81762702242407e-05, "loss": 2.226831817626953, "step": 36530 }, { "epoch": 10.371842179960261, "grad_norm": 6.915767192840576, "learning_rate": 4.8162077774623903e-05, "loss": 2.2630916595458985, "step": 36540 }, { "epoch": 10.374680669883622, "grad_norm": 6.874558448791504, "learning_rate": 4.81478853250071e-05, "loss": 2.2532949447631836, "step": 36550 }, { "epoch": 10.377519159806983, "grad_norm": 6.645575523376465, "learning_rate": 4.81336928753903e-05, "loss": 2.286677932739258, "step": 36560 }, { "epoch": 10.380357649730344, "grad_norm": 6.539143085479736, "learning_rate": 4.811950042577349e-05, "loss": 2.2493450164794924, "step": 36570 }, { "epoch": 10.383196139653704, "grad_norm": 6.762217044830322, "learning_rate": 4.810530797615669e-05, "loss": 2.3086275100708007, "step": 36580 }, { "epoch": 10.386034629577065, "grad_norm": 6.7601637840271, "learning_rate": 4.8091115526539884e-05, "loss": 2.282481575012207, "step": 36590 }, { "epoch": 10.388873119500426, "grad_norm": 6.087039947509766, "learning_rate": 4.8076923076923084e-05, "loss": 2.2312707901000977, "step": 36600 }, { "epoch": 10.391711609423787, "grad_norm": 6.370086193084717, "learning_rate": 4.806273062730628e-05, "loss": 2.2766393661499023, "step": 36610 }, { "epoch": 10.394550099347148, "grad_norm": 6.445125102996826, "learning_rate": 4.804853817768947e-05, "loss": 2.2312393188476562, "step": 36620 }, { "epoch": 10.397388589270507, "grad_norm": 6.692849636077881, "learning_rate": 4.8034345728072664e-05, "loss": 2.189029312133789, "step": 36630 }, { "epoch": 10.400227079193868, "grad_norm": 
7.1079277992248535, "learning_rate": 4.8020153278455864e-05, "loss": 2.3261541366577148, "step": 36640 }, { "epoch": 10.40306556911723, "grad_norm": 6.361904144287109, "learning_rate": 4.800596082883906e-05, "loss": 2.200244140625, "step": 36650 }, { "epoch": 10.40590405904059, "grad_norm": 6.588407516479492, "learning_rate": 4.799176837922226e-05, "loss": 2.32657470703125, "step": 36660 }, { "epoch": 10.408742548963952, "grad_norm": 6.48606014251709, "learning_rate": 4.797757592960545e-05, "loss": 2.2656953811645506, "step": 36670 }, { "epoch": 10.411581038887311, "grad_norm": 6.665980339050293, "learning_rate": 4.796338347998865e-05, "loss": 2.2419708251953123, "step": 36680 }, { "epoch": 10.414419528810672, "grad_norm": 6.551421642303467, "learning_rate": 4.7949191030371844e-05, "loss": 2.3276782989501954, "step": 36690 }, { "epoch": 10.417258018734033, "grad_norm": 6.840142726898193, "learning_rate": 4.7934998580755044e-05, "loss": 2.273194694519043, "step": 36700 }, { "epoch": 10.420096508657394, "grad_norm": 6.961146354675293, "learning_rate": 4.792080613113824e-05, "loss": 2.2284950256347655, "step": 36710 }, { "epoch": 10.422934998580756, "grad_norm": 6.9753828048706055, "learning_rate": 4.790661368152143e-05, "loss": 2.283543014526367, "step": 36720 }, { "epoch": 10.425773488504117, "grad_norm": 6.631237506866455, "learning_rate": 4.7892421231904624e-05, "loss": 2.2160564422607423, "step": 36730 }, { "epoch": 10.428611978427476, "grad_norm": 6.826803207397461, "learning_rate": 4.7878228782287824e-05, "loss": 2.246479034423828, "step": 36740 }, { "epoch": 10.431450468350837, "grad_norm": 6.833429336547852, "learning_rate": 4.786403633267102e-05, "loss": 2.2039939880371096, "step": 36750 }, { "epoch": 10.434288958274198, "grad_norm": 6.445227146148682, "learning_rate": 4.784984388305422e-05, "loss": 2.232635498046875, "step": 36760 }, { "epoch": 10.43712744819756, "grad_norm": 6.893360614776611, "learning_rate": 4.783565143343741e-05, "loss": 
2.2612682342529298, "step": 36770 }, { "epoch": 10.43996593812092, "grad_norm": 6.605032444000244, "learning_rate": 4.782145898382061e-05, "loss": 2.2863399505615236, "step": 36780 }, { "epoch": 10.44280442804428, "grad_norm": 6.841236591339111, "learning_rate": 4.7807266534203804e-05, "loss": 2.345805549621582, "step": 36790 }, { "epoch": 10.44564291796764, "grad_norm": 6.827117919921875, "learning_rate": 4.7793074084587004e-05, "loss": 2.256534957885742, "step": 36800 }, { "epoch": 10.448481407891002, "grad_norm": 6.498997211456299, "learning_rate": 4.77788816349702e-05, "loss": 2.3004728317260743, "step": 36810 }, { "epoch": 10.451319897814363, "grad_norm": 6.922576904296875, "learning_rate": 4.77646891853534e-05, "loss": 2.246285629272461, "step": 36820 }, { "epoch": 10.454158387737724, "grad_norm": 6.5312299728393555, "learning_rate": 4.775049673573659e-05, "loss": 2.2274045944213867, "step": 36830 }, { "epoch": 10.456996877661084, "grad_norm": 6.880944728851318, "learning_rate": 4.7736304286119784e-05, "loss": 2.246807861328125, "step": 36840 }, { "epoch": 10.459835367584445, "grad_norm": 6.491103172302246, "learning_rate": 4.772211183650298e-05, "loss": 2.209951400756836, "step": 36850 }, { "epoch": 10.462673857507806, "grad_norm": 6.811209678649902, "learning_rate": 4.770791938688618e-05, "loss": 2.278722381591797, "step": 36860 }, { "epoch": 10.465512347431167, "grad_norm": 6.395569801330566, "learning_rate": 4.769372693726937e-05, "loss": 2.282604789733887, "step": 36870 }, { "epoch": 10.468350837354528, "grad_norm": 6.709869861602783, "learning_rate": 4.767953448765257e-05, "loss": 2.3161653518676757, "step": 36880 }, { "epoch": 10.471189327277887, "grad_norm": 6.411334037780762, "learning_rate": 4.7665342038035765e-05, "loss": 2.1767866134643556, "step": 36890 }, { "epoch": 10.474027817201248, "grad_norm": 6.671359062194824, "learning_rate": 4.7651149588418965e-05, "loss": 2.236512565612793, "step": 36900 }, { "epoch": 10.47686630712461, "grad_norm": 
6.893029689788818, "learning_rate": 4.763695713880216e-05, "loss": 2.2679271697998047, "step": 36910 }, { "epoch": 10.47970479704797, "grad_norm": 6.65577507019043, "learning_rate": 4.762276468918536e-05, "loss": 2.278162956237793, "step": 36920 }, { "epoch": 10.482543286971332, "grad_norm": 6.7010297775268555, "learning_rate": 4.760857223956855e-05, "loss": 2.232887077331543, "step": 36930 }, { "epoch": 10.485381776894693, "grad_norm": 6.991005897521973, "learning_rate": 4.759437978995175e-05, "loss": 2.303346061706543, "step": 36940 }, { "epoch": 10.488220266818052, "grad_norm": 6.815080165863037, "learning_rate": 4.7580187340334945e-05, "loss": 2.3004451751708985, "step": 36950 }, { "epoch": 10.491058756741413, "grad_norm": 6.346030235290527, "learning_rate": 4.756599489071814e-05, "loss": 2.285488319396973, "step": 36960 }, { "epoch": 10.493897246664774, "grad_norm": 6.532553672790527, "learning_rate": 4.755180244110133e-05, "loss": 2.170652389526367, "step": 36970 }, { "epoch": 10.496735736588136, "grad_norm": 6.42758321762085, "learning_rate": 4.753760999148453e-05, "loss": 2.251478576660156, "step": 36980 }, { "epoch": 10.499574226511497, "grad_norm": 6.578829765319824, "learning_rate": 4.7523417541867725e-05, "loss": 2.2897729873657227, "step": 36990 }, { "epoch": 10.502412716434856, "grad_norm": 6.485733509063721, "learning_rate": 4.7509225092250925e-05, "loss": 2.217381477355957, "step": 37000 }, { "epoch": 10.502412716434856, "eval_accuracy": 0.31824251287594585, "eval_loss": 2.5739872455596924, "eval_runtime": 50.5908, "eval_samples_per_second": 310.867, "eval_steps_per_second": 4.863, "step": 37000 }, { "epoch": 10.505251206358217, "grad_norm": 6.589915752410889, "learning_rate": 4.749503264263412e-05, "loss": 2.253593635559082, "step": 37010 }, { "epoch": 10.508089696281578, "grad_norm": 6.591215133666992, "learning_rate": 4.748084019301732e-05, "loss": 2.224382781982422, "step": 37020 }, { "epoch": 10.51092818620494, "grad_norm": 6.465699195861816, 
"learning_rate": 4.746664774340051e-05, "loss": 2.236003112792969, "step": 37030 }, { "epoch": 10.5137666761283, "grad_norm": 6.4379143714904785, "learning_rate": 4.745245529378371e-05, "loss": 2.2242998123168944, "step": 37040 }, { "epoch": 10.51660516605166, "grad_norm": 6.375771999359131, "learning_rate": 4.7438262844166905e-05, "loss": 2.262337303161621, "step": 37050 }, { "epoch": 10.51944365597502, "grad_norm": 6.5593671798706055, "learning_rate": 4.7424070394550105e-05, "loss": 2.2485219955444338, "step": 37060 }, { "epoch": 10.522282145898382, "grad_norm": 6.648055553436279, "learning_rate": 4.74098779449333e-05, "loss": 2.2322980880737306, "step": 37070 }, { "epoch": 10.525120635821743, "grad_norm": 6.530377388000488, "learning_rate": 4.739568549531649e-05, "loss": 2.3430261611938477, "step": 37080 }, { "epoch": 10.527959125745104, "grad_norm": 6.212730407714844, "learning_rate": 4.7381493045699685e-05, "loss": 2.1995254516601563, "step": 37090 }, { "epoch": 10.530797615668465, "grad_norm": 6.693747043609619, "learning_rate": 4.7367300596082885e-05, "loss": 2.3311651229858397, "step": 37100 }, { "epoch": 10.533636105591825, "grad_norm": 6.2905097007751465, "learning_rate": 4.735310814646608e-05, "loss": 2.3473081588745117, "step": 37110 }, { "epoch": 10.536474595515186, "grad_norm": 6.94363260269165, "learning_rate": 4.733891569684928e-05, "loss": 2.376312255859375, "step": 37120 }, { "epoch": 10.539313085438547, "grad_norm": 6.782027721405029, "learning_rate": 4.732472324723247e-05, "loss": 2.2250553131103517, "step": 37130 }, { "epoch": 10.542151575361908, "grad_norm": 7.235441207885742, "learning_rate": 4.731053079761567e-05, "loss": 2.2524694442749023, "step": 37140 }, { "epoch": 10.544990065285269, "grad_norm": 6.55571174621582, "learning_rate": 4.7296338347998865e-05, "loss": 2.2461692810058596, "step": 37150 }, { "epoch": 10.547828555208628, "grad_norm": 6.72238302230835, "learning_rate": 4.7282145898382065e-05, "loss": 2.25311164855957, "step": 
37160 }, { "epoch": 10.55066704513199, "grad_norm": 6.786449432373047, "learning_rate": 4.726795344876526e-05, "loss": 2.2858039855957033, "step": 37170 }, { "epoch": 10.55350553505535, "grad_norm": 6.308724880218506, "learning_rate": 4.725376099914845e-05, "loss": 2.18367977142334, "step": 37180 }, { "epoch": 10.556344024978712, "grad_norm": 6.572742938995361, "learning_rate": 4.7239568549531645e-05, "loss": 2.3120784759521484, "step": 37190 }, { "epoch": 10.559182514902073, "grad_norm": 6.566356182098389, "learning_rate": 4.7225376099914846e-05, "loss": 2.2693641662597654, "step": 37200 }, { "epoch": 10.562021004825432, "grad_norm": 6.987247943878174, "learning_rate": 4.7211183650298046e-05, "loss": 2.209634017944336, "step": 37210 }, { "epoch": 10.564859494748793, "grad_norm": 6.4124016761779785, "learning_rate": 4.719699120068124e-05, "loss": 2.3316268920898438, "step": 37220 }, { "epoch": 10.567697984672154, "grad_norm": 6.7104291915893555, "learning_rate": 4.718279875106444e-05, "loss": 2.2250740051269533, "step": 37230 }, { "epoch": 10.570536474595515, "grad_norm": 6.581799030303955, "learning_rate": 4.716860630144763e-05, "loss": 2.349669647216797, "step": 37240 }, { "epoch": 10.573374964518877, "grad_norm": 6.556941032409668, "learning_rate": 4.715441385183083e-05, "loss": 2.2397907257080076, "step": 37250 }, { "epoch": 10.576213454442236, "grad_norm": 6.588397026062012, "learning_rate": 4.7140221402214026e-05, "loss": 2.3161855697631837, "step": 37260 }, { "epoch": 10.579051944365597, "grad_norm": 6.553083896636963, "learning_rate": 4.7126028952597226e-05, "loss": 2.2695924758911135, "step": 37270 }, { "epoch": 10.581890434288958, "grad_norm": 6.301183700561523, "learning_rate": 4.711183650298042e-05, "loss": 2.2764490127563475, "step": 37280 }, { "epoch": 10.58472892421232, "grad_norm": 6.345434665679932, "learning_rate": 4.709764405336361e-05, "loss": 2.314488983154297, "step": 37290 }, { "epoch": 10.58756741413568, "grad_norm": 6.85676908493042, 
"learning_rate": 4.7083451603746806e-05, "loss": 2.3595094680786133, "step": 37300 }, { "epoch": 10.59040590405904, "grad_norm": 6.836460113525391, "learning_rate": 4.7069259154130006e-05, "loss": 2.2488323211669923, "step": 37310 }, { "epoch": 10.5932443939824, "grad_norm": 6.530576705932617, "learning_rate": 4.70550667045132e-05, "loss": 2.215122413635254, "step": 37320 }, { "epoch": 10.596082883905762, "grad_norm": 6.668957710266113, "learning_rate": 4.70408742548964e-05, "loss": 2.2100383758544924, "step": 37330 }, { "epoch": 10.598921373829123, "grad_norm": 6.634660243988037, "learning_rate": 4.702668180527959e-05, "loss": 2.2580745697021483, "step": 37340 }, { "epoch": 10.601759863752484, "grad_norm": 6.800546646118164, "learning_rate": 4.701248935566279e-05, "loss": 2.302572250366211, "step": 37350 }, { "epoch": 10.604598353675845, "grad_norm": 6.159020900726318, "learning_rate": 4.6998296906045986e-05, "loss": 2.2413213729858397, "step": 37360 }, { "epoch": 10.607436843599205, "grad_norm": 6.311195373535156, "learning_rate": 4.6984104456429186e-05, "loss": 2.317561721801758, "step": 37370 }, { "epoch": 10.610275333522566, "grad_norm": 6.618466377258301, "learning_rate": 4.696991200681238e-05, "loss": 2.303819465637207, "step": 37380 }, { "epoch": 10.613113823445927, "grad_norm": 6.713260650634766, "learning_rate": 4.695571955719558e-05, "loss": 2.3096569061279295, "step": 37390 }, { "epoch": 10.615952313369288, "grad_norm": 6.538779258728027, "learning_rate": 4.694152710757877e-05, "loss": 2.320255661010742, "step": 37400 }, { "epoch": 10.618790803292649, "grad_norm": 6.896006107330322, "learning_rate": 4.6927334657961966e-05, "loss": 2.2311075210571287, "step": 37410 }, { "epoch": 10.621629293216008, "grad_norm": 6.6718010902404785, "learning_rate": 4.691314220834516e-05, "loss": 2.193651008605957, "step": 37420 }, { "epoch": 10.62446778313937, "grad_norm": 6.773739814758301, "learning_rate": 4.689894975872836e-05, "loss": 2.3066169738769533, "step": 37430 
}, { "epoch": 10.62730627306273, "grad_norm": 6.736380100250244, "learning_rate": 4.688475730911155e-05, "loss": 2.382419204711914, "step": 37440 }, { "epoch": 10.630144762986092, "grad_norm": 6.443817138671875, "learning_rate": 4.687056485949475e-05, "loss": 2.2565832138061523, "step": 37450 }, { "epoch": 10.632983252909453, "grad_norm": 6.076959609985352, "learning_rate": 4.6856372409877946e-05, "loss": 2.2301334381103515, "step": 37460 }, { "epoch": 10.635821742832812, "grad_norm": 6.573960304260254, "learning_rate": 4.6842179960261147e-05, "loss": 2.328387641906738, "step": 37470 }, { "epoch": 10.638660232756173, "grad_norm": 6.566829681396484, "learning_rate": 4.682798751064434e-05, "loss": 2.3131891250610352, "step": 37480 }, { "epoch": 10.641498722679534, "grad_norm": 6.8341450691223145, "learning_rate": 4.681379506102754e-05, "loss": 2.247077751159668, "step": 37490 }, { "epoch": 10.644337212602895, "grad_norm": 7.0673909187316895, "learning_rate": 4.679960261141073e-05, "loss": 2.2358341217041016, "step": 37500 }, { "epoch": 10.644337212602895, "eval_accuracy": 0.31970496598206904, "eval_loss": 2.5721046924591064, "eval_runtime": 56.4231, "eval_samples_per_second": 278.733, "eval_steps_per_second": 4.36, "step": 37500 }, { "epoch": 10.647175702526257, "grad_norm": 6.695178508758545, "learning_rate": 4.678541016179393e-05, "loss": 2.25499267578125, "step": 37510 }, { "epoch": 10.650014192449618, "grad_norm": 6.606156826019287, "learning_rate": 4.677121771217713e-05, "loss": 2.2520891189575196, "step": 37520 }, { "epoch": 10.652852682372977, "grad_norm": 6.559587478637695, "learning_rate": 4.675702526256032e-05, "loss": 2.246622848510742, "step": 37530 }, { "epoch": 10.655691172296338, "grad_norm": 7.053726673126221, "learning_rate": 4.674283281294351e-05, "loss": 2.261794853210449, "step": 37540 }, { "epoch": 10.6585296622197, "grad_norm": 7.177770137786865, "learning_rate": 4.6728640363326713e-05, "loss": 2.2876865386962892, "step": 37550 }, { "epoch": 
10.66136815214306, "grad_norm": 6.491934776306152, "learning_rate": 4.671444791370991e-05, "loss": 2.22989501953125, "step": 37560 }, { "epoch": 10.664206642066421, "grad_norm": 7.0843071937561035, "learning_rate": 4.670025546409311e-05, "loss": 2.2895397186279296, "step": 37570 }, { "epoch": 10.66704513198978, "grad_norm": 6.16774845123291, "learning_rate": 4.66860630144763e-05, "loss": 2.194806671142578, "step": 37580 }, { "epoch": 10.669883621913142, "grad_norm": 6.772490501403809, "learning_rate": 4.66718705648595e-05, "loss": 2.2422458648681642, "step": 37590 }, { "epoch": 10.672722111836503, "grad_norm": 6.415410041809082, "learning_rate": 4.6657678115242694e-05, "loss": 2.2216060638427733, "step": 37600 }, { "epoch": 10.675560601759864, "grad_norm": 6.24780797958374, "learning_rate": 4.6643485665625894e-05, "loss": 2.2653079986572267, "step": 37610 }, { "epoch": 10.678399091683225, "grad_norm": 6.5262298583984375, "learning_rate": 4.662929321600909e-05, "loss": 2.3011383056640624, "step": 37620 }, { "epoch": 10.681237581606585, "grad_norm": 6.96612548828125, "learning_rate": 4.661510076639228e-05, "loss": 2.1769840240478517, "step": 37630 }, { "epoch": 10.684076071529946, "grad_norm": 6.8101606369018555, "learning_rate": 4.6600908316775474e-05, "loss": 2.318344497680664, "step": 37640 }, { "epoch": 10.686914561453307, "grad_norm": 6.924535751342773, "learning_rate": 4.6586715867158674e-05, "loss": 2.2310789108276365, "step": 37650 }, { "epoch": 10.689753051376668, "grad_norm": 6.873551845550537, "learning_rate": 4.657252341754187e-05, "loss": 2.2247190475463867, "step": 37660 }, { "epoch": 10.692591541300029, "grad_norm": 6.810730457305908, "learning_rate": 4.655833096792507e-05, "loss": 2.2248674392700196, "step": 37670 }, { "epoch": 10.695430031223388, "grad_norm": 6.631580352783203, "learning_rate": 4.654413851830826e-05, "loss": 2.260992431640625, "step": 37680 }, { "epoch": 10.69826852114675, "grad_norm": 6.963045597076416, "learning_rate": 
4.652994606869146e-05, "loss": 2.2841585159301756, "step": 37690 }, { "epoch": 10.70110701107011, "grad_norm": 6.687633991241455, "learning_rate": 4.6515753619074654e-05, "loss": 2.3574291229248048, "step": 37700 }, { "epoch": 10.703945500993472, "grad_norm": 6.36392879486084, "learning_rate": 4.6501561169457854e-05, "loss": 2.154512405395508, "step": 37710 }, { "epoch": 10.706783990916833, "grad_norm": 6.342623710632324, "learning_rate": 4.648736871984105e-05, "loss": 2.2142011642456056, "step": 37720 }, { "epoch": 10.709622480840192, "grad_norm": 6.883625030517578, "learning_rate": 4.647317627022425e-05, "loss": 2.2168977737426756, "step": 37730 }, { "epoch": 10.712460970763553, "grad_norm": 6.612582206726074, "learning_rate": 4.645898382060744e-05, "loss": 2.2444114685058594, "step": 37740 }, { "epoch": 10.715299460686914, "grad_norm": 6.491725444793701, "learning_rate": 4.6444791370990634e-05, "loss": 2.216256523132324, "step": 37750 }, { "epoch": 10.718137950610275, "grad_norm": 6.930942058563232, "learning_rate": 4.643059892137383e-05, "loss": 2.2181867599487304, "step": 37760 }, { "epoch": 10.720976440533637, "grad_norm": 6.500256538391113, "learning_rate": 4.641640647175703e-05, "loss": 2.2994964599609373, "step": 37770 }, { "epoch": 10.723814930456998, "grad_norm": 6.591561794281006, "learning_rate": 4.640221402214022e-05, "loss": 2.3063987731933593, "step": 37780 }, { "epoch": 10.726653420380357, "grad_norm": 6.505067348480225, "learning_rate": 4.638802157252342e-05, "loss": 2.226812553405762, "step": 37790 }, { "epoch": 10.729491910303718, "grad_norm": 6.409343719482422, "learning_rate": 4.6373829122906614e-05, "loss": 2.3424449920654298, "step": 37800 }, { "epoch": 10.73233040022708, "grad_norm": 6.1234846115112305, "learning_rate": 4.6359636673289814e-05, "loss": 2.262360763549805, "step": 37810 }, { "epoch": 10.73516889015044, "grad_norm": 6.519476413726807, "learning_rate": 4.634544422367301e-05, "loss": 2.2609588623046877, "step": 37820 }, { 
"epoch": 10.738007380073801, "grad_norm": 6.291996002197266, "learning_rate": 4.633125177405621e-05, "loss": 2.302825164794922, "step": 37830 }, { "epoch": 10.74084586999716, "grad_norm": 6.723465442657471, "learning_rate": 4.63170593244394e-05, "loss": 2.3086524963378907, "step": 37840 }, { "epoch": 10.743684359920522, "grad_norm": 6.339003086090088, "learning_rate": 4.63028668748226e-05, "loss": 2.3427907943725588, "step": 37850 }, { "epoch": 10.746522849843883, "grad_norm": 6.4836835861206055, "learning_rate": 4.6288674425205794e-05, "loss": 2.2230710983276367, "step": 37860 }, { "epoch": 10.749361339767244, "grad_norm": 6.377742290496826, "learning_rate": 4.627448197558899e-05, "loss": 2.294789123535156, "step": 37870 }, { "epoch": 10.752199829690605, "grad_norm": 6.6342854499816895, "learning_rate": 4.626028952597218e-05, "loss": 2.232669639587402, "step": 37880 }, { "epoch": 10.755038319613966, "grad_norm": 6.516536712646484, "learning_rate": 4.624609707635538e-05, "loss": 2.22767333984375, "step": 37890 }, { "epoch": 10.757876809537326, "grad_norm": 6.4461870193481445, "learning_rate": 4.6231904626738575e-05, "loss": 2.3022043228149416, "step": 37900 }, { "epoch": 10.760715299460687, "grad_norm": 6.620900630950928, "learning_rate": 4.6217712177121775e-05, "loss": 2.2604793548583983, "step": 37910 }, { "epoch": 10.763553789384048, "grad_norm": 6.466830730438232, "learning_rate": 4.620351972750497e-05, "loss": 2.271818161010742, "step": 37920 }, { "epoch": 10.766392279307409, "grad_norm": 6.495964050292969, "learning_rate": 4.618932727788817e-05, "loss": 2.193471145629883, "step": 37930 }, { "epoch": 10.76923076923077, "grad_norm": 6.529226303100586, "learning_rate": 4.617513482827136e-05, "loss": 2.2529747009277346, "step": 37940 }, { "epoch": 10.77206925915413, "grad_norm": 6.906757831573486, "learning_rate": 4.616094237865456e-05, "loss": 2.2681386947631834, "step": 37950 }, { "epoch": 10.77490774907749, "grad_norm": 6.456428050994873, "learning_rate": 
4.6146749929037755e-05, "loss": 2.2303050994873046, "step": 37960 }, { "epoch": 10.777746239000852, "grad_norm": 6.527261734008789, "learning_rate": 4.613255747942095e-05, "loss": 2.196172904968262, "step": 37970 }, { "epoch": 10.780584728924213, "grad_norm": 7.137967109680176, "learning_rate": 4.611836502980414e-05, "loss": 2.331686782836914, "step": 37980 }, { "epoch": 10.783423218847574, "grad_norm": 6.792810916900635, "learning_rate": 4.610417258018734e-05, "loss": 2.3039199829101564, "step": 37990 }, { "epoch": 10.786261708770933, "grad_norm": 6.735734462738037, "learning_rate": 4.6089980130570535e-05, "loss": 2.2471900939941407, "step": 38000 }, { "epoch": 10.786261708770933, "eval_accuracy": 0.3181789279582883, "eval_loss": 2.5643839836120605, "eval_runtime": 51.3475, "eval_samples_per_second": 306.286, "eval_steps_per_second": 4.791, "step": 38000 }, { "epoch": 10.789100198694294, "grad_norm": 6.656044006347656, "learning_rate": 4.6075787680953735e-05, "loss": 2.2681858062744142, "step": 38010 }, { "epoch": 10.791938688617655, "grad_norm": 6.8716559410095215, "learning_rate": 4.606159523133693e-05, "loss": 2.1927650451660154, "step": 38020 }, { "epoch": 10.794777178541016, "grad_norm": 6.77720308303833, "learning_rate": 4.604740278172013e-05, "loss": 2.3071929931640627, "step": 38030 }, { "epoch": 10.797615668464378, "grad_norm": 6.726988315582275, "learning_rate": 4.603321033210332e-05, "loss": 2.3128528594970703, "step": 38040 }, { "epoch": 10.800454158387737, "grad_norm": 6.240533828735352, "learning_rate": 4.601901788248652e-05, "loss": 2.213070106506348, "step": 38050 }, { "epoch": 10.803292648311098, "grad_norm": 6.560245037078857, "learning_rate": 4.6004825432869715e-05, "loss": 2.191526985168457, "step": 38060 }, { "epoch": 10.80613113823446, "grad_norm": 6.392862796783447, "learning_rate": 4.5990632983252915e-05, "loss": 2.2637279510498045, "step": 38070 }, { "epoch": 10.80896962815782, "grad_norm": 6.800666332244873, "learning_rate": 
4.597644053363611e-05, "loss": 2.3011022567749024, "step": 38080 }, { "epoch": 10.811808118081181, "grad_norm": 6.903975963592529, "learning_rate": 4.59622480840193e-05, "loss": 2.3005035400390623, "step": 38090 }, { "epoch": 10.81464660800454, "grad_norm": 6.6504950523376465, "learning_rate": 4.5948055634402495e-05, "loss": 2.153132438659668, "step": 38100 }, { "epoch": 10.817485097927902, "grad_norm": 6.743723392486572, "learning_rate": 4.5933863184785695e-05, "loss": 2.306337356567383, "step": 38110 }, { "epoch": 10.820323587851263, "grad_norm": 6.713272571563721, "learning_rate": 4.591967073516889e-05, "loss": 2.3063636779785157, "step": 38120 }, { "epoch": 10.823162077774624, "grad_norm": 6.437305450439453, "learning_rate": 4.590547828555209e-05, "loss": 2.25931453704834, "step": 38130 }, { "epoch": 10.826000567697985, "grad_norm": 6.802011489868164, "learning_rate": 4.589128583593528e-05, "loss": 2.232932281494141, "step": 38140 }, { "epoch": 10.828839057621346, "grad_norm": 6.731400012969971, "learning_rate": 4.587709338631848e-05, "loss": 2.231718826293945, "step": 38150 }, { "epoch": 10.831677547544706, "grad_norm": 6.678519248962402, "learning_rate": 4.5862900936701675e-05, "loss": 2.2121561050415037, "step": 38160 }, { "epoch": 10.834516037468067, "grad_norm": 6.780175685882568, "learning_rate": 4.5848708487084876e-05, "loss": 2.3060129165649412, "step": 38170 }, { "epoch": 10.837354527391428, "grad_norm": 6.78185510635376, "learning_rate": 4.583451603746807e-05, "loss": 2.308066177368164, "step": 38180 }, { "epoch": 10.840193017314789, "grad_norm": 6.586338996887207, "learning_rate": 4.582032358785127e-05, "loss": 2.2749296188354493, "step": 38190 }, { "epoch": 10.84303150723815, "grad_norm": 6.948178291320801, "learning_rate": 4.580613113823446e-05, "loss": 2.267566680908203, "step": 38200 }, { "epoch": 10.84586999716151, "grad_norm": 6.969356536865234, "learning_rate": 4.5791938688617656e-05, "loss": 2.2906352996826174, "step": 38210 }, { "epoch": 
10.84870848708487, "grad_norm": 6.67414665222168, "learning_rate": 4.577774623900085e-05, "loss": 2.278836250305176, "step": 38220 }, { "epoch": 10.851546977008232, "grad_norm": 6.743663311004639, "learning_rate": 4.576355378938405e-05, "loss": 2.1733419418334963, "step": 38230 }, { "epoch": 10.854385466931593, "grad_norm": 6.856447696685791, "learning_rate": 4.574936133976724e-05, "loss": 2.3058677673339845, "step": 38240 }, { "epoch": 10.857223956854954, "grad_norm": 6.371180534362793, "learning_rate": 4.573516889015044e-05, "loss": 2.218657875061035, "step": 38250 }, { "epoch": 10.860062446778313, "grad_norm": 6.284598350524902, "learning_rate": 4.5720976440533636e-05, "loss": 2.2771427154541017, "step": 38260 }, { "epoch": 10.862900936701674, "grad_norm": 6.174470901489258, "learning_rate": 4.5706783990916836e-05, "loss": 2.2593379974365235, "step": 38270 }, { "epoch": 10.865739426625035, "grad_norm": 7.001747131347656, "learning_rate": 4.569259154130003e-05, "loss": 2.36575927734375, "step": 38280 }, { "epoch": 10.868577916548396, "grad_norm": 6.210315704345703, "learning_rate": 4.567839909168323e-05, "loss": 2.253421592712402, "step": 38290 }, { "epoch": 10.871416406471758, "grad_norm": 6.433093070983887, "learning_rate": 4.566420664206642e-05, "loss": 2.3559471130371095, "step": 38300 }, { "epoch": 10.874254896395119, "grad_norm": 6.442171573638916, "learning_rate": 4.565001419244962e-05, "loss": 2.238406753540039, "step": 38310 }, { "epoch": 10.877093386318478, "grad_norm": 6.804506778717041, "learning_rate": 4.5635821742832816e-05, "loss": 2.2628414154052736, "step": 38320 }, { "epoch": 10.879931876241839, "grad_norm": 6.788893222808838, "learning_rate": 4.562162929321601e-05, "loss": 2.2733314514160154, "step": 38330 }, { "epoch": 10.8827703661652, "grad_norm": 6.549049377441406, "learning_rate": 4.56074368435992e-05, "loss": 2.277058410644531, "step": 38340 }, { "epoch": 10.885608856088561, "grad_norm": 6.5909271240234375, "learning_rate": 
4.55932443939824e-05, "loss": 2.252369499206543, "step": 38350 }, { "epoch": 10.888447346011922, "grad_norm": 6.678609848022461, "learning_rate": 4.5579051944365596e-05, "loss": 2.266452980041504, "step": 38360 }, { "epoch": 10.891285835935282, "grad_norm": 6.798041820526123, "learning_rate": 4.5564859494748796e-05, "loss": 2.2540964126586913, "step": 38370 }, { "epoch": 10.894124325858643, "grad_norm": 6.400651454925537, "learning_rate": 4.555066704513199e-05, "loss": 2.1920495986938477, "step": 38380 }, { "epoch": 10.896962815782004, "grad_norm": 6.937674045562744, "learning_rate": 4.553647459551519e-05, "loss": 2.3090620040893555, "step": 38390 }, { "epoch": 10.899801305705365, "grad_norm": 6.9180192947387695, "learning_rate": 4.552228214589838e-05, "loss": 2.3073280334472654, "step": 38400 }, { "epoch": 10.902639795628726, "grad_norm": 7.086023807525635, "learning_rate": 4.550808969628158e-05, "loss": 2.2975610733032226, "step": 38410 }, { "epoch": 10.905478285552086, "grad_norm": 6.509183883666992, "learning_rate": 4.5493897246664776e-05, "loss": 2.2501594543457033, "step": 38420 }, { "epoch": 10.908316775475447, "grad_norm": 6.433135509490967, "learning_rate": 4.547970479704797e-05, "loss": 2.190632629394531, "step": 38430 }, { "epoch": 10.911155265398808, "grad_norm": 6.544419765472412, "learning_rate": 4.546551234743116e-05, "loss": 2.2997600555419924, "step": 38440 }, { "epoch": 10.913993755322169, "grad_norm": 6.82355260848999, "learning_rate": 4.545131989781436e-05, "loss": 2.3085067749023436, "step": 38450 }, { "epoch": 10.91683224524553, "grad_norm": 6.104782581329346, "learning_rate": 4.5437127448197556e-05, "loss": 2.2728321075439455, "step": 38460 }, { "epoch": 10.91967073516889, "grad_norm": 7.078842639923096, "learning_rate": 4.5422934998580756e-05, "loss": 2.1702621459960936, "step": 38470 }, { "epoch": 10.92250922509225, "grad_norm": 6.643609523773193, "learning_rate": 4.540874254896395e-05, "loss": 2.2775671005249025, "step": 38480 }, { 
"epoch": 10.925347715015612, "grad_norm": 6.531813621520996, "learning_rate": 4.539455009934715e-05, "loss": 2.287578010559082, "step": 38490 }, { "epoch": 10.928186204938973, "grad_norm": 6.588253021240234, "learning_rate": 4.538035764973034e-05, "loss": 2.2176828384399414, "step": 38500 }, { "epoch": 10.928186204938973, "eval_accuracy": 0.3259998728301647, "eval_loss": 2.558863639831543, "eval_runtime": 55.5601, "eval_samples_per_second": 283.063, "eval_steps_per_second": 4.428, "step": 38500 }, { "epoch": 10.931024694862334, "grad_norm": 7.000339984893799, "learning_rate": 4.536616520011354e-05, "loss": 2.2688098907470704, "step": 38510 }, { "epoch": 10.933863184785693, "grad_norm": 6.555028915405273, "learning_rate": 4.5351972750496743e-05, "loss": 2.311161422729492, "step": 38520 }, { "epoch": 10.936701674709054, "grad_norm": 6.681646347045898, "learning_rate": 4.533778030087994e-05, "loss": 2.2416162490844727, "step": 38530 }, { "epoch": 10.939540164632415, "grad_norm": 6.476521015167236, "learning_rate": 4.532358785126313e-05, "loss": 2.263160514831543, "step": 38540 }, { "epoch": 10.942378654555776, "grad_norm": 6.673933982849121, "learning_rate": 4.530939540164632e-05, "loss": 2.264727020263672, "step": 38550 }, { "epoch": 10.945217144479138, "grad_norm": 6.5690460205078125, "learning_rate": 4.5295202952029523e-05, "loss": 2.2879762649536133, "step": 38560 }, { "epoch": 10.948055634402499, "grad_norm": 6.74867582321167, "learning_rate": 4.528101050241272e-05, "loss": 2.1993452072143556, "step": 38570 }, { "epoch": 10.950894124325858, "grad_norm": 6.665872097015381, "learning_rate": 4.526681805279592e-05, "loss": 2.299881935119629, "step": 38580 }, { "epoch": 10.953732614249219, "grad_norm": 6.412293434143066, "learning_rate": 4.525262560317911e-05, "loss": 2.264771270751953, "step": 38590 }, { "epoch": 10.95657110417258, "grad_norm": 7.5223236083984375, "learning_rate": 4.523843315356231e-05, "loss": 2.312994384765625, "step": 38600 }, { "epoch": 
10.959409594095941, "grad_norm": 6.316067218780518, "learning_rate": 4.5224240703945504e-05, "loss": 2.227376174926758, "step": 38610 }, { "epoch": 10.962248084019302, "grad_norm": 6.555646896362305, "learning_rate": 4.5210048254328704e-05, "loss": 2.196084403991699, "step": 38620 }, { "epoch": 10.965086573942662, "grad_norm": 6.821770668029785, "learning_rate": 4.51958558047119e-05, "loss": 2.286555290222168, "step": 38630 }, { "epoch": 10.967925063866023, "grad_norm": 6.599404335021973, "learning_rate": 4.51816633550951e-05, "loss": 2.286939239501953, "step": 38640 }, { "epoch": 10.970763553789384, "grad_norm": 6.411046504974365, "learning_rate": 4.516747090547829e-05, "loss": 2.186842155456543, "step": 38650 }, { "epoch": 10.973602043712745, "grad_norm": 6.560597896575928, "learning_rate": 4.5153278455861484e-05, "loss": 2.2939708709716795, "step": 38660 }, { "epoch": 10.976440533636106, "grad_norm": 6.447816371917725, "learning_rate": 4.513908600624468e-05, "loss": 2.2849897384643554, "step": 38670 }, { "epoch": 10.979279023559467, "grad_norm": 6.461740970611572, "learning_rate": 4.512489355662788e-05, "loss": 2.2550369262695313, "step": 38680 }, { "epoch": 10.982117513482827, "grad_norm": 6.522761821746826, "learning_rate": 4.511070110701107e-05, "loss": 2.2346845626831056, "step": 38690 }, { "epoch": 10.984956003406188, "grad_norm": 6.975781440734863, "learning_rate": 4.509650865739427e-05, "loss": 2.363984298706055, "step": 38700 }, { "epoch": 10.987794493329549, "grad_norm": 7.0069122314453125, "learning_rate": 4.5082316207777464e-05, "loss": 2.3241296768188477, "step": 38710 }, { "epoch": 10.99063298325291, "grad_norm": 6.562824726104736, "learning_rate": 4.5068123758160664e-05, "loss": 2.258711814880371, "step": 38720 }, { "epoch": 10.993471473176271, "grad_norm": 6.437473297119141, "learning_rate": 4.505393130854386e-05, "loss": 2.2191608428955076, "step": 38730 }, { "epoch": 10.99630996309963, "grad_norm": 6.4667863845825195, "learning_rate": 
4.503973885892706e-05, "loss": 2.235788917541504, "step": 38740 }, { "epoch": 10.999148453022991, "grad_norm": 6.678766250610352, "learning_rate": 4.502554640931025e-05, "loss": 2.2344207763671875, "step": 38750 }, { "epoch": 11.001986942946353, "grad_norm": 7.061533451080322, "learning_rate": 4.5011353959693444e-05, "loss": 2.299582099914551, "step": 38760 }, { "epoch": 11.004825432869714, "grad_norm": 6.683467864990234, "learning_rate": 4.4997161510076644e-05, "loss": 2.250048828125, "step": 38770 }, { "epoch": 11.007663922793075, "grad_norm": 7.078280448913574, "learning_rate": 4.498296906045984e-05, "loss": 2.2753787994384767, "step": 38780 }, { "epoch": 11.010502412716434, "grad_norm": 7.044031143188477, "learning_rate": 4.496877661084303e-05, "loss": 2.1751590728759767, "step": 38790 }, { "epoch": 11.013340902639795, "grad_norm": 6.6223273277282715, "learning_rate": 4.495458416122623e-05, "loss": 2.3278812408447265, "step": 38800 }, { "epoch": 11.016179392563156, "grad_norm": 6.674236297607422, "learning_rate": 4.4940391711609424e-05, "loss": 2.1424543380737306, "step": 38810 }, { "epoch": 11.019017882486517, "grad_norm": 6.582226753234863, "learning_rate": 4.4926199261992624e-05, "loss": 2.2377058029174806, "step": 38820 }, { "epoch": 11.021856372409879, "grad_norm": 6.602753639221191, "learning_rate": 4.491200681237582e-05, "loss": 2.310238075256348, "step": 38830 }, { "epoch": 11.024694862333238, "grad_norm": 6.465996265411377, "learning_rate": 4.489781436275902e-05, "loss": 2.2736789703369142, "step": 38840 }, { "epoch": 11.027533352256599, "grad_norm": 7.131217002868652, "learning_rate": 4.488362191314221e-05, "loss": 2.288980484008789, "step": 38850 }, { "epoch": 11.03037184217996, "grad_norm": 6.97343111038208, "learning_rate": 4.486942946352541e-05, "loss": 2.1652198791503907, "step": 38860 }, { "epoch": 11.033210332103321, "grad_norm": 7.056755542755127, "learning_rate": 4.4855237013908605e-05, "loss": 2.1882350921630858, "step": 38870 }, { "epoch": 
11.036048822026682, "grad_norm": 6.600419044494629, "learning_rate": 4.48410445642918e-05, "loss": 2.2200212478637695, "step": 38880 }, { "epoch": 11.038887311950042, "grad_norm": 6.773429870605469, "learning_rate": 4.482685211467499e-05, "loss": 2.217127799987793, "step": 38890 }, { "epoch": 11.041725801873403, "grad_norm": 6.6570048332214355, "learning_rate": 4.481265966505819e-05, "loss": 2.1752670288085936, "step": 38900 }, { "epoch": 11.044564291796764, "grad_norm": 6.429916858673096, "learning_rate": 4.4798467215441385e-05, "loss": 2.304402732849121, "step": 38910 }, { "epoch": 11.047402781720125, "grad_norm": 6.834599494934082, "learning_rate": 4.4784274765824585e-05, "loss": 2.290526580810547, "step": 38920 }, { "epoch": 11.050241271643486, "grad_norm": 7.289380073547363, "learning_rate": 4.477008231620778e-05, "loss": 2.1809642791748045, "step": 38930 }, { "epoch": 11.053079761566847, "grad_norm": 6.609288692474365, "learning_rate": 4.475588986659098e-05, "loss": 2.2392513275146486, "step": 38940 }, { "epoch": 11.055918251490207, "grad_norm": 6.620327949523926, "learning_rate": 4.474169741697417e-05, "loss": 2.224932861328125, "step": 38950 }, { "epoch": 11.058756741413568, "grad_norm": 6.625460147857666, "learning_rate": 4.472750496735737e-05, "loss": 2.190238189697266, "step": 38960 }, { "epoch": 11.061595231336929, "grad_norm": 6.3142828941345215, "learning_rate": 4.4713312517740565e-05, "loss": 2.174961471557617, "step": 38970 }, { "epoch": 11.06443372126029, "grad_norm": 6.931216716766357, "learning_rate": 4.4699120068123765e-05, "loss": 2.259124183654785, "step": 38980 }, { "epoch": 11.067272211183651, "grad_norm": 6.302497863769531, "learning_rate": 4.468492761850696e-05, "loss": 2.1648120880126953, "step": 38990 }, { "epoch": 11.07011070110701, "grad_norm": 6.5793938636779785, "learning_rate": 4.467073516889015e-05, "loss": 2.2617076873779296, "step": 39000 }, { "epoch": 11.07011070110701, "eval_accuracy": 0.32396515546512367, "eval_loss": 
2.560734987258911, "eval_runtime": 53.8916, "eval_samples_per_second": 291.827, "eval_steps_per_second": 4.565, "step": 39000 }, { "epoch": 11.072949191030371, "grad_norm": 7.038576602935791, "learning_rate": 4.4656542719273345e-05, "loss": 2.235458564758301, "step": 39010 }, { "epoch": 11.075787680953733, "grad_norm": 6.3430681228637695, "learning_rate": 4.4642350269656545e-05, "loss": 2.224265480041504, "step": 39020 }, { "epoch": 11.078626170877094, "grad_norm": 6.384763240814209, "learning_rate": 4.462815782003974e-05, "loss": 2.2680727005004884, "step": 39030 }, { "epoch": 11.081464660800455, "grad_norm": 5.928189754486084, "learning_rate": 4.461396537042294e-05, "loss": 2.1679719924926757, "step": 39040 }, { "epoch": 11.084303150723814, "grad_norm": 7.053704738616943, "learning_rate": 4.459977292080613e-05, "loss": 2.2507883071899415, "step": 39050 }, { "epoch": 11.087141640647175, "grad_norm": 6.79683256149292, "learning_rate": 4.458558047118933e-05, "loss": 2.278091812133789, "step": 39060 }, { "epoch": 11.089980130570536, "grad_norm": 6.610622882843018, "learning_rate": 4.4571388021572525e-05, "loss": 2.2418869018554686, "step": 39070 }, { "epoch": 11.092818620493897, "grad_norm": 6.640398979187012, "learning_rate": 4.4557195571955725e-05, "loss": 2.2310710906982423, "step": 39080 }, { "epoch": 11.095657110417259, "grad_norm": 6.519617080688477, "learning_rate": 4.454300312233892e-05, "loss": 2.2582244873046875, "step": 39090 }, { "epoch": 11.09849560034062, "grad_norm": 6.543783187866211, "learning_rate": 4.452881067272212e-05, "loss": 2.225838851928711, "step": 39100 }, { "epoch": 11.101334090263979, "grad_norm": 7.067678928375244, "learning_rate": 4.451461822310531e-05, "loss": 2.176270294189453, "step": 39110 }, { "epoch": 11.10417258018734, "grad_norm": 6.49882698059082, "learning_rate": 4.4500425773488505e-05, "loss": 2.2501493453979493, "step": 39120 }, { "epoch": 11.107011070110701, "grad_norm": 6.3708319664001465, "learning_rate": 
4.44862333238717e-05, "loss": 2.263948440551758, "step": 39130 }, { "epoch": 11.109849560034062, "grad_norm": 6.313395023345947, "learning_rate": 4.44720408742549e-05, "loss": 2.2302135467529296, "step": 39140 }, { "epoch": 11.112688049957423, "grad_norm": 6.69950532913208, "learning_rate": 4.445784842463809e-05, "loss": 2.179648590087891, "step": 39150 }, { "epoch": 11.115526539880783, "grad_norm": 6.890036106109619, "learning_rate": 4.444365597502129e-05, "loss": 2.249321937561035, "step": 39160 }, { "epoch": 11.118365029804144, "grad_norm": 6.665580749511719, "learning_rate": 4.4429463525404485e-05, "loss": 2.2238994598388673, "step": 39170 }, { "epoch": 11.121203519727505, "grad_norm": 6.566637992858887, "learning_rate": 4.4415271075787686e-05, "loss": 2.2698785781860353, "step": 39180 }, { "epoch": 11.124042009650866, "grad_norm": 6.315970420837402, "learning_rate": 4.440107862617088e-05, "loss": 2.2351760864257812, "step": 39190 }, { "epoch": 11.126880499574227, "grad_norm": 6.459198474884033, "learning_rate": 4.438688617655408e-05, "loss": 2.223558044433594, "step": 39200 }, { "epoch": 11.129718989497587, "grad_norm": 6.975285053253174, "learning_rate": 4.437269372693727e-05, "loss": 2.2031734466552733, "step": 39210 }, { "epoch": 11.132557479420948, "grad_norm": 6.442172527313232, "learning_rate": 4.4358501277320466e-05, "loss": 2.1931026458740233, "step": 39220 }, { "epoch": 11.135395969344309, "grad_norm": 6.8834333419799805, "learning_rate": 4.434430882770366e-05, "loss": 2.2965282440185546, "step": 39230 }, { "epoch": 11.13823445926767, "grad_norm": 6.401905059814453, "learning_rate": 4.4331535623048545e-05, "loss": 2.2516639709472654, "step": 39240 }, { "epoch": 11.141072949191031, "grad_norm": 6.55702018737793, "learning_rate": 4.431734317343174e-05, "loss": 2.2227569580078126, "step": 39250 }, { "epoch": 11.14391143911439, "grad_norm": 6.117162704467773, "learning_rate": 4.430315072381493e-05, "loss": 2.2028486251831056, "step": 39260 }, { "epoch": 
11.146749929037751, "grad_norm": 6.855867862701416, "learning_rate": 4.4288958274198125e-05, "loss": 2.28690128326416, "step": 39270 }, { "epoch": 11.149588418961113, "grad_norm": 6.710844039916992, "learning_rate": 4.4274765824581325e-05, "loss": 2.236069679260254, "step": 39280 }, { "epoch": 11.152426908884474, "grad_norm": 6.338308334350586, "learning_rate": 4.426057337496452e-05, "loss": 2.180896759033203, "step": 39290 }, { "epoch": 11.155265398807835, "grad_norm": 6.303602695465088, "learning_rate": 4.424638092534772e-05, "loss": 2.3126476287841795, "step": 39300 }, { "epoch": 11.158103888731196, "grad_norm": 6.513157844543457, "learning_rate": 4.423218847573091e-05, "loss": 2.215932846069336, "step": 39310 }, { "epoch": 11.160942378654555, "grad_norm": 6.759611129760742, "learning_rate": 4.421799602611411e-05, "loss": 2.243421936035156, "step": 39320 }, { "epoch": 11.163780868577916, "grad_norm": 6.5502543449401855, "learning_rate": 4.4203803576497305e-05, "loss": 2.2783031463623047, "step": 39330 }, { "epoch": 11.166619358501277, "grad_norm": 6.231395244598389, "learning_rate": 4.4189611126880505e-05, "loss": 2.2357192993164063, "step": 39340 }, { "epoch": 11.169457848424639, "grad_norm": 7.03433084487915, "learning_rate": 4.41754186772637e-05, "loss": 2.2934795379638673, "step": 39350 }, { "epoch": 11.172296338348, "grad_norm": 6.64677095413208, "learning_rate": 4.41612262276469e-05, "loss": 2.246834564208984, "step": 39360 }, { "epoch": 11.175134828271359, "grad_norm": 6.755077362060547, "learning_rate": 4.414703377803009e-05, "loss": 2.220779609680176, "step": 39370 }, { "epoch": 11.17797331819472, "grad_norm": 6.498974323272705, "learning_rate": 4.4132841328413286e-05, "loss": 2.1326005935668944, "step": 39380 }, { "epoch": 11.180811808118081, "grad_norm": 6.404267311096191, "learning_rate": 4.411864887879648e-05, "loss": 2.2522438049316404, "step": 39390 }, { "epoch": 11.183650298041442, "grad_norm": 6.53411865234375, "learning_rate": 
4.410445642917968e-05, "loss": 2.2740856170654298, "step": 39400 }, { "epoch": 11.186488787964803, "grad_norm": 6.835712432861328, "learning_rate": 4.409026397956287e-05, "loss": 2.165110969543457, "step": 39410 }, { "epoch": 11.189327277888163, "grad_norm": 6.679351329803467, "learning_rate": 4.407607152994607e-05, "loss": 2.1954320907592773, "step": 39420 }, { "epoch": 11.192165767811524, "grad_norm": 6.838817596435547, "learning_rate": 4.4061879080329266e-05, "loss": 2.232930374145508, "step": 39430 }, { "epoch": 11.195004257734885, "grad_norm": 6.353964328765869, "learning_rate": 4.4047686630712466e-05, "loss": 2.2569292068481444, "step": 39440 }, { "epoch": 11.197842747658246, "grad_norm": 6.4286065101623535, "learning_rate": 4.403349418109566e-05, "loss": 2.235107421875, "step": 39450 }, { "epoch": 11.200681237581607, "grad_norm": 6.470211505889893, "learning_rate": 4.401930173147886e-05, "loss": 2.2722745895385743, "step": 39460 }, { "epoch": 11.203519727504966, "grad_norm": 6.2943925857543945, "learning_rate": 4.400510928186205e-05, "loss": 2.1537954330444338, "step": 39470 }, { "epoch": 11.206358217428328, "grad_norm": 6.6805500984191895, "learning_rate": 4.3990916832245246e-05, "loss": 2.2333200454711912, "step": 39480 }, { "epoch": 11.209196707351689, "grad_norm": 6.630650043487549, "learning_rate": 4.397672438262844e-05, "loss": 2.271909713745117, "step": 39490 }, { "epoch": 11.21203519727505, "grad_norm": 6.514676094055176, "learning_rate": 4.396253193301164e-05, "loss": 2.1995925903320312, "step": 39500 }, { "epoch": 11.21203519727505, "eval_accuracy": 0.32173968334711006, "eval_loss": 2.556018829345703, "eval_runtime": 49.8686, "eval_samples_per_second": 315.369, "eval_steps_per_second": 4.933, "step": 39500 }, { "epoch": 11.214873687198411, "grad_norm": 6.608972549438477, "learning_rate": 4.394833948339483e-05, "loss": 2.22558479309082, "step": 39510 }, { "epoch": 11.217712177121772, "grad_norm": 6.777914047241211, "learning_rate": 
4.393414703377803e-05, "loss": 2.2958450317382812, "step": 39520 }, { "epoch": 11.220550667045131, "grad_norm": 6.621232986450195, "learning_rate": 4.3919954584161226e-05, "loss": 2.2404008865356446, "step": 39530 }, { "epoch": 11.223389156968492, "grad_norm": 6.6696343421936035, "learning_rate": 4.3905762134544426e-05, "loss": 2.181357192993164, "step": 39540 }, { "epoch": 11.226227646891854, "grad_norm": 6.660664081573486, "learning_rate": 4.389156968492762e-05, "loss": 2.2126285552978517, "step": 39550 }, { "epoch": 11.229066136815215, "grad_norm": 6.549800395965576, "learning_rate": 4.387737723531082e-05, "loss": 2.256009101867676, "step": 39560 }, { "epoch": 11.231904626738576, "grad_norm": 6.806201457977295, "learning_rate": 4.386318478569401e-05, "loss": 2.2209707260131837, "step": 39570 }, { "epoch": 11.234743116661935, "grad_norm": 6.7374267578125, "learning_rate": 4.384899233607721e-05, "loss": 2.1968456268310548, "step": 39580 }, { "epoch": 11.237581606585296, "grad_norm": 6.527606010437012, "learning_rate": 4.3834799886460406e-05, "loss": 2.227440071105957, "step": 39590 }, { "epoch": 11.240420096508657, "grad_norm": 6.686519145965576, "learning_rate": 4.38206074368436e-05, "loss": 2.197513961791992, "step": 39600 }, { "epoch": 11.243258586432018, "grad_norm": 7.077059268951416, "learning_rate": 4.380641498722679e-05, "loss": 2.1396900177001954, "step": 39610 }, { "epoch": 11.24609707635538, "grad_norm": 6.808145999908447, "learning_rate": 4.379222253760999e-05, "loss": 2.22576789855957, "step": 39620 }, { "epoch": 11.248935566278739, "grad_norm": 6.418161392211914, "learning_rate": 4.3778030087993186e-05, "loss": 2.2470775604248048, "step": 39630 }, { "epoch": 11.2517740562021, "grad_norm": 6.604094505310059, "learning_rate": 4.3763837638376386e-05, "loss": 2.288692855834961, "step": 39640 }, { "epoch": 11.254612546125461, "grad_norm": 6.853221416473389, "learning_rate": 4.374964518875958e-05, "loss": 2.2426578521728517, "step": 39650 }, { "epoch": 
11.257451036048822, "grad_norm": 6.675647735595703, "learning_rate": 4.373545273914278e-05, "loss": 2.2279891967773438, "step": 39660 }, { "epoch": 11.260289525972183, "grad_norm": 6.980295658111572, "learning_rate": 4.372126028952597e-05, "loss": 2.2051525115966797, "step": 39670 }, { "epoch": 11.263128015895543, "grad_norm": 6.585376739501953, "learning_rate": 4.370706783990917e-05, "loss": 2.173893356323242, "step": 39680 }, { "epoch": 11.265966505818904, "grad_norm": 6.491901397705078, "learning_rate": 4.3692875390292367e-05, "loss": 2.181882858276367, "step": 39690 }, { "epoch": 11.268804995742265, "grad_norm": 6.591220378875732, "learning_rate": 4.367868294067557e-05, "loss": 2.2584211349487306, "step": 39700 }, { "epoch": 11.271643485665626, "grad_norm": 6.763430118560791, "learning_rate": 4.366449049105876e-05, "loss": 2.1681385040283203, "step": 39710 }, { "epoch": 11.274481975588987, "grad_norm": 6.687436580657959, "learning_rate": 4.365029804144195e-05, "loss": 2.2304386138916015, "step": 39720 }, { "epoch": 11.277320465512348, "grad_norm": 6.169283390045166, "learning_rate": 4.363610559182515e-05, "loss": 2.250208854675293, "step": 39730 }, { "epoch": 11.280158955435708, "grad_norm": 6.569575786590576, "learning_rate": 4.362191314220835e-05, "loss": 2.2186874389648437, "step": 39740 }, { "epoch": 11.282997445359069, "grad_norm": 6.525807857513428, "learning_rate": 4.360772069259154e-05, "loss": 2.1648605346679686, "step": 39750 }, { "epoch": 11.28583593528243, "grad_norm": 6.463259220123291, "learning_rate": 4.359352824297474e-05, "loss": 2.1341787338256837, "step": 39760 }, { "epoch": 11.288674425205791, "grad_norm": 6.462189674377441, "learning_rate": 4.3579335793357933e-05, "loss": 2.1029481887817383, "step": 39770 }, { "epoch": 11.291512915129152, "grad_norm": 6.1765875816345215, "learning_rate": 4.3565143343741134e-05, "loss": 2.2176794052124023, "step": 39780 }, { "epoch": 11.294351405052511, "grad_norm": 6.580938339233398, "learning_rate": 
4.355095089412433e-05, "loss": 2.2809797286987306, "step": 39790 }, { "epoch": 11.297189894975872, "grad_norm": 6.284786701202393, "learning_rate": 4.353675844450753e-05, "loss": 2.303865432739258, "step": 39800 }, { "epoch": 11.300028384899234, "grad_norm": 6.383693695068359, "learning_rate": 4.352256599489072e-05, "loss": 2.2028980255126953, "step": 39810 }, { "epoch": 11.302866874822595, "grad_norm": 6.198566436767578, "learning_rate": 4.350837354527392e-05, "loss": 2.206624412536621, "step": 39820 }, { "epoch": 11.305705364745956, "grad_norm": 7.0155158042907715, "learning_rate": 4.3494181095657114e-05, "loss": 2.2223136901855467, "step": 39830 }, { "epoch": 11.308543854669315, "grad_norm": 7.239226341247559, "learning_rate": 4.347998864604031e-05, "loss": 2.2340988159179687, "step": 39840 }, { "epoch": 11.311382344592676, "grad_norm": 6.586090564727783, "learning_rate": 4.34657961964235e-05, "loss": 2.187094497680664, "step": 39850 }, { "epoch": 11.314220834516037, "grad_norm": 6.514294147491455, "learning_rate": 4.34516037468067e-05, "loss": 2.300893783569336, "step": 39860 }, { "epoch": 11.317059324439398, "grad_norm": 6.306215763092041, "learning_rate": 4.3437411297189894e-05, "loss": 2.195754814147949, "step": 39870 }, { "epoch": 11.31989781436276, "grad_norm": 6.74602746963501, "learning_rate": 4.3423218847573094e-05, "loss": 2.2644458770751954, "step": 39880 }, { "epoch": 11.32273630428612, "grad_norm": 6.6337995529174805, "learning_rate": 4.340902639795629e-05, "loss": 2.2430973052978516, "step": 39890 }, { "epoch": 11.32557479420948, "grad_norm": 6.528453826904297, "learning_rate": 4.339483394833949e-05, "loss": 2.2343154907226563, "step": 39900 }, { "epoch": 11.328413284132841, "grad_norm": 6.956051826477051, "learning_rate": 4.338064149872268e-05, "loss": 2.2577899932861327, "step": 39910 }, { "epoch": 11.331251774056202, "grad_norm": 6.494673252105713, "learning_rate": 4.336644904910588e-05, "loss": 2.1861072540283204, "step": 39920 }, { "epoch": 
11.334090263979563, "grad_norm": 7.103372573852539, "learning_rate": 4.3352256599489074e-05, "loss": 2.185784339904785, "step": 39930 }, { "epoch": 11.336928753902924, "grad_norm": 6.39697265625, "learning_rate": 4.333806414987227e-05, "loss": 2.298936653137207, "step": 39940 }, { "epoch": 11.339767243826284, "grad_norm": 6.601680755615234, "learning_rate": 4.332387170025546e-05, "loss": 2.243168067932129, "step": 39950 }, { "epoch": 11.342605733749645, "grad_norm": 7.068339824676514, "learning_rate": 4.330967925063866e-05, "loss": 2.235337257385254, "step": 39960 }, { "epoch": 11.345444223673006, "grad_norm": 6.331199645996094, "learning_rate": 4.3295486801021854e-05, "loss": 2.282023620605469, "step": 39970 }, { "epoch": 11.348282713596367, "grad_norm": 6.820745468139648, "learning_rate": 4.3281294351405054e-05, "loss": 2.2336950302124023, "step": 39980 }, { "epoch": 11.351121203519728, "grad_norm": 6.329095363616943, "learning_rate": 4.326710190178825e-05, "loss": 2.262409210205078, "step": 39990 }, { "epoch": 11.353959693443088, "grad_norm": 6.586746692657471, "learning_rate": 4.325290945217145e-05, "loss": 2.2862611770629884, "step": 40000 }, { "epoch": 11.353959693443088, "eval_accuracy": 0.32072232466458955, "eval_loss": 2.549705982208252, "eval_runtime": 48.4856, "eval_samples_per_second": 324.364, "eval_steps_per_second": 5.074, "step": 40000 }, { "epoch": 11.356798183366449, "grad_norm": 6.6429572105407715, "learning_rate": 4.323871700255464e-05, "loss": 2.283007049560547, "step": 40010 }, { "epoch": 11.35963667328981, "grad_norm": 6.703679084777832, "learning_rate": 4.322452455293784e-05, "loss": 2.2136232376098635, "step": 40020 }, { "epoch": 11.36247516321317, "grad_norm": 6.8389387130737305, "learning_rate": 4.3210332103321034e-05, "loss": 2.1887874603271484, "step": 40030 }, { "epoch": 11.365313653136532, "grad_norm": 6.470792293548584, "learning_rate": 4.3196139653704234e-05, "loss": 2.289596939086914, "step": 40040 }, { "epoch": 11.368152143059891, 
"grad_norm": 6.772748947143555, "learning_rate": 4.318194720408743e-05, "loss": 2.2401748657226563, "step": 40050 }, { "epoch": 11.370990632983252, "grad_norm": 6.551070213317871, "learning_rate": 4.316775475447062e-05, "loss": 2.1816240310668946, "step": 40060 }, { "epoch": 11.373829122906614, "grad_norm": 6.532901763916016, "learning_rate": 4.3153562304853814e-05, "loss": 2.231823539733887, "step": 40070 }, { "epoch": 11.376667612829975, "grad_norm": 6.599860191345215, "learning_rate": 4.3139369855237015e-05, "loss": 2.2479888916015627, "step": 40080 }, { "epoch": 11.379506102753336, "grad_norm": 6.542527675628662, "learning_rate": 4.312517740562021e-05, "loss": 2.288043212890625, "step": 40090 }, { "epoch": 11.382344592676697, "grad_norm": 6.64663028717041, "learning_rate": 4.311098495600341e-05, "loss": 2.232287788391113, "step": 40100 }, { "epoch": 11.385183082600056, "grad_norm": 6.571500778198242, "learning_rate": 4.30967925063866e-05, "loss": 2.2775373458862305, "step": 40110 }, { "epoch": 11.388021572523417, "grad_norm": 6.488737106323242, "learning_rate": 4.30826000567698e-05, "loss": 2.2361740112304687, "step": 40120 }, { "epoch": 11.390860062446778, "grad_norm": 6.585003852844238, "learning_rate": 4.3068407607152995e-05, "loss": 2.2932058334350587, "step": 40130 }, { "epoch": 11.39369855237014, "grad_norm": 6.434825897216797, "learning_rate": 4.3054215157536195e-05, "loss": 2.2263156890869142, "step": 40140 }, { "epoch": 11.3965370422935, "grad_norm": 6.683370113372803, "learning_rate": 4.304002270791939e-05, "loss": 2.191069793701172, "step": 40150 }, { "epoch": 11.39937553221686, "grad_norm": 6.668469429016113, "learning_rate": 4.302583025830259e-05, "loss": 2.2911056518554687, "step": 40160 }, { "epoch": 11.402214022140221, "grad_norm": 6.744688034057617, "learning_rate": 4.301163780868578e-05, "loss": 2.2819135665893553, "step": 40170 }, { "epoch": 11.405052512063582, "grad_norm": 6.65330696105957, "learning_rate": 4.2997445359068975e-05, "loss": 
2.1891841888427734, "step": 40180 }, { "epoch": 11.407891001986943, "grad_norm": 6.434731960296631, "learning_rate": 4.298325290945217e-05, "loss": 2.2525302886962892, "step": 40190 }, { "epoch": 11.410729491910304, "grad_norm": 6.766043186187744, "learning_rate": 4.296906045983537e-05, "loss": 2.2370458602905274, "step": 40200 }, { "epoch": 11.413567981833664, "grad_norm": 6.099665641784668, "learning_rate": 4.295486801021857e-05, "loss": 2.189963722229004, "step": 40210 }, { "epoch": 11.416406471757025, "grad_norm": 6.610828876495361, "learning_rate": 4.294067556060176e-05, "loss": 2.2233280181884765, "step": 40220 }, { "epoch": 11.419244961680386, "grad_norm": 6.790840148925781, "learning_rate": 4.292648311098496e-05, "loss": 2.207540512084961, "step": 40230 }, { "epoch": 11.422083451603747, "grad_norm": 6.307731628417969, "learning_rate": 4.2912290661368155e-05, "loss": 2.2453136444091797, "step": 40240 }, { "epoch": 11.424921941527108, "grad_norm": 6.664694786071777, "learning_rate": 4.2898098211751355e-05, "loss": 2.170016288757324, "step": 40250 }, { "epoch": 11.427760431450467, "grad_norm": 6.5963873863220215, "learning_rate": 4.288390576213455e-05, "loss": 2.195365333557129, "step": 40260 }, { "epoch": 11.430598921373829, "grad_norm": 6.208470821380615, "learning_rate": 4.286971331251775e-05, "loss": 2.23134765625, "step": 40270 }, { "epoch": 11.43343741129719, "grad_norm": 6.5243239402771, "learning_rate": 4.285552086290094e-05, "loss": 2.2271554946899412, "step": 40280 }, { "epoch": 11.43627590122055, "grad_norm": 6.689733028411865, "learning_rate": 4.2841328413284135e-05, "loss": 2.257297325134277, "step": 40290 }, { "epoch": 11.439114391143912, "grad_norm": 6.182467937469482, "learning_rate": 4.282713596366733e-05, "loss": 2.2351139068603514, "step": 40300 }, { "epoch": 11.441952881067273, "grad_norm": 6.299415111541748, "learning_rate": 4.281294351405053e-05, "loss": 2.261507034301758, "step": 40310 }, { "epoch": 11.444791370990632, "grad_norm": 
6.578554630279541, "learning_rate": 4.279875106443372e-05, "loss": 2.2276823043823244, "step": 40320 }, { "epoch": 11.447629860913993, "grad_norm": 6.424256801605225, "learning_rate": 4.278455861481692e-05, "loss": 2.1773128509521484, "step": 40330 }, { "epoch": 11.450468350837355, "grad_norm": 6.480170726776123, "learning_rate": 4.2770366165200115e-05, "loss": 2.1929550170898438, "step": 40340 }, { "epoch": 11.453306840760716, "grad_norm": 6.155201435089111, "learning_rate": 4.2756173715583315e-05, "loss": 2.2164133071899412, "step": 40350 }, { "epoch": 11.456145330684077, "grad_norm": 6.611728191375732, "learning_rate": 4.274198126596651e-05, "loss": 2.220168876647949, "step": 40360 }, { "epoch": 11.458983820607436, "grad_norm": 6.298475742340088, "learning_rate": 4.272778881634971e-05, "loss": 2.1455034255981444, "step": 40370 }, { "epoch": 11.461822310530797, "grad_norm": 6.750114917755127, "learning_rate": 4.27135963667329e-05, "loss": 2.2094453811645507, "step": 40380 }, { "epoch": 11.464660800454158, "grad_norm": 6.712581634521484, "learning_rate": 4.2699403917116096e-05, "loss": 2.2027212142944337, "step": 40390 }, { "epoch": 11.46749929037752, "grad_norm": 6.557215690612793, "learning_rate": 4.268521146749929e-05, "loss": 2.2690240859985353, "step": 40400 }, { "epoch": 11.47033778030088, "grad_norm": 6.6380510330200195, "learning_rate": 4.267101901788249e-05, "loss": 2.2877756118774415, "step": 40410 }, { "epoch": 11.47317627022424, "grad_norm": 6.517956256866455, "learning_rate": 4.265682656826568e-05, "loss": 2.244923400878906, "step": 40420 }, { "epoch": 11.476014760147601, "grad_norm": 6.709367275238037, "learning_rate": 4.264263411864888e-05, "loss": 2.2769454956054687, "step": 40430 }, { "epoch": 11.478853250070962, "grad_norm": 6.974355220794678, "learning_rate": 4.2628441669032076e-05, "loss": 2.264217567443848, "step": 40440 }, { "epoch": 11.481691739994323, "grad_norm": 6.7811994552612305, "learning_rate": 4.2614249219415276e-05, "loss": 
2.1795101165771484, "step": 40450 }, { "epoch": 11.484530229917684, "grad_norm": 6.802762985229492, "learning_rate": 4.260005676979847e-05, "loss": 2.2084184646606446, "step": 40460 }, { "epoch": 11.487368719841044, "grad_norm": 6.547715187072754, "learning_rate": 4.258586432018167e-05, "loss": 2.2366588592529295, "step": 40470 }, { "epoch": 11.490207209764405, "grad_norm": 6.613384246826172, "learning_rate": 4.257167187056486e-05, "loss": 2.24920597076416, "step": 40480 }, { "epoch": 11.493045699687766, "grad_norm": 6.780065059661865, "learning_rate": 4.255747942094806e-05, "loss": 2.2130805969238283, "step": 40490 }, { "epoch": 11.495884189611127, "grad_norm": 6.3780999183654785, "learning_rate": 4.2543286971331256e-05, "loss": 2.203504943847656, "step": 40500 }, { "epoch": 11.495884189611127, "eval_accuracy": 0.3219940230177402, "eval_loss": 2.5468757152557373, "eval_runtime": 51.8404, "eval_samples_per_second": 303.373, "eval_steps_per_second": 4.745, "step": 40500 }, { "epoch": 11.498722679534488, "grad_norm": 6.572833061218262, "learning_rate": 4.252909452171445e-05, "loss": 2.2369192123413084, "step": 40510 }, { "epoch": 11.50156116945785, "grad_norm": 6.412508487701416, "learning_rate": 4.251490207209764e-05, "loss": 2.147659492492676, "step": 40520 }, { "epoch": 11.504399659381209, "grad_norm": 6.619680404663086, "learning_rate": 4.250070962248084e-05, "loss": 2.3135194778442383, "step": 40530 }, { "epoch": 11.50723814930457, "grad_norm": 6.80336332321167, "learning_rate": 4.2486517172864036e-05, "loss": 2.2461759567260744, "step": 40540 }, { "epoch": 11.51007663922793, "grad_norm": 6.512372970581055, "learning_rate": 4.2472324723247236e-05, "loss": 2.303878974914551, "step": 40550 }, { "epoch": 11.512915129151292, "grad_norm": 6.61305046081543, "learning_rate": 4.245813227363043e-05, "loss": 2.2010332107543946, "step": 40560 }, { "epoch": 11.515753619074653, "grad_norm": 6.86196756362915, "learning_rate": 4.244393982401363e-05, "loss": 2.2009395599365233, 
"step": 40570 }, { "epoch": 11.518592108998012, "grad_norm": 6.487507343292236, "learning_rate": 4.242974737439682e-05, "loss": 2.2643354415893553, "step": 40580 }, { "epoch": 11.521430598921373, "grad_norm": 6.656041622161865, "learning_rate": 4.241555492478002e-05, "loss": 2.2337331771850586, "step": 40590 }, { "epoch": 11.524269088844735, "grad_norm": 6.647971153259277, "learning_rate": 4.2401362475163216e-05, "loss": 2.2454898834228514, "step": 40600 }, { "epoch": 11.527107578768096, "grad_norm": 6.7347493171691895, "learning_rate": 4.2387170025546416e-05, "loss": 2.2892152786254885, "step": 40610 }, { "epoch": 11.529946068691457, "grad_norm": 6.582687854766846, "learning_rate": 4.237297757592961e-05, "loss": 2.274672508239746, "step": 40620 }, { "epoch": 11.532784558614816, "grad_norm": 6.439152240753174, "learning_rate": 4.23587851263128e-05, "loss": 2.260310173034668, "step": 40630 }, { "epoch": 11.535623048538177, "grad_norm": 6.46557092666626, "learning_rate": 4.2344592676695996e-05, "loss": 2.1818958282470704, "step": 40640 }, { "epoch": 11.538461538461538, "grad_norm": 6.706607341766357, "learning_rate": 4.2330400227079196e-05, "loss": 2.247980499267578, "step": 40650 }, { "epoch": 11.5413000283849, "grad_norm": 6.545158863067627, "learning_rate": 4.231620777746239e-05, "loss": 2.269272804260254, "step": 40660 }, { "epoch": 11.54413851830826, "grad_norm": 6.831079006195068, "learning_rate": 4.230201532784559e-05, "loss": 2.3158491134643553, "step": 40670 }, { "epoch": 11.546977008231622, "grad_norm": 6.875585556030273, "learning_rate": 4.228782287822878e-05, "loss": 2.2601858139038087, "step": 40680 }, { "epoch": 11.549815498154981, "grad_norm": 6.300083160400391, "learning_rate": 4.227363042861198e-05, "loss": 2.1239320755004885, "step": 40690 }, { "epoch": 11.552653988078342, "grad_norm": 6.440435409545898, "learning_rate": 4.2259437978995177e-05, "loss": 2.190760040283203, "step": 40700 }, { "epoch": 11.555492478001703, "grad_norm": 
6.6633782386779785, "learning_rate": 4.224524552937838e-05, "loss": 2.3096698760986327, "step": 40710 }, { "epoch": 11.558330967925064, "grad_norm": 6.744316577911377, "learning_rate": 4.223105307976157e-05, "loss": 2.2432971954345704, "step": 40720 }, { "epoch": 11.561169457848425, "grad_norm": 6.807679176330566, "learning_rate": 4.221686063014476e-05, "loss": 2.2532413482666014, "step": 40730 }, { "epoch": 11.564007947771785, "grad_norm": 6.695373058319092, "learning_rate": 4.220266818052796e-05, "loss": 2.2335914611816405, "step": 40740 }, { "epoch": 11.566846437695146, "grad_norm": 6.314513683319092, "learning_rate": 4.218847573091116e-05, "loss": 2.2059610366821287, "step": 40750 }, { "epoch": 11.569684927618507, "grad_norm": 6.504467010498047, "learning_rate": 4.217428328129435e-05, "loss": 2.286749076843262, "step": 40760 }, { "epoch": 11.572523417541868, "grad_norm": 6.239585876464844, "learning_rate": 4.216009083167755e-05, "loss": 2.195003318786621, "step": 40770 }, { "epoch": 11.57536190746523, "grad_norm": 6.790648460388184, "learning_rate": 4.2145898382060744e-05, "loss": 2.190377426147461, "step": 40780 }, { "epoch": 11.578200397388589, "grad_norm": 6.297459125518799, "learning_rate": 4.2131705932443944e-05, "loss": 2.225462532043457, "step": 40790 }, { "epoch": 11.58103888731195, "grad_norm": 6.325811386108398, "learning_rate": 4.211751348282714e-05, "loss": 2.1904340744018556, "step": 40800 }, { "epoch": 11.58387737723531, "grad_norm": 6.541266918182373, "learning_rate": 4.210332103321034e-05, "loss": 2.2312084197998048, "step": 40810 }, { "epoch": 11.586715867158672, "grad_norm": 6.252431392669678, "learning_rate": 4.208912858359353e-05, "loss": 2.20465145111084, "step": 40820 }, { "epoch": 11.589554357082033, "grad_norm": 6.457735061645508, "learning_rate": 4.207493613397673e-05, "loss": 2.2437931060791017, "step": 40830 }, { "epoch": 11.592392847005392, "grad_norm": 6.822039604187012, "learning_rate": 4.2060743684359924e-05, "loss": 
2.2540729522705076, "step": 40840 }, { "epoch": 11.595231336928753, "grad_norm": 6.501948833465576, "learning_rate": 4.204655123474312e-05, "loss": 2.2398540496826174, "step": 40850 }, { "epoch": 11.598069826852115, "grad_norm": 6.844522953033447, "learning_rate": 4.203235878512631e-05, "loss": 2.1869129180908202, "step": 40860 }, { "epoch": 11.600908316775476, "grad_norm": 6.59773063659668, "learning_rate": 4.201816633550951e-05, "loss": 2.1851455688476564, "step": 40870 }, { "epoch": 11.603746806698837, "grad_norm": 6.409840106964111, "learning_rate": 4.2003973885892704e-05, "loss": 2.2758798599243164, "step": 40880 }, { "epoch": 11.606585296622196, "grad_norm": 6.887763023376465, "learning_rate": 4.1989781436275904e-05, "loss": 2.2016101837158204, "step": 40890 }, { "epoch": 11.609423786545557, "grad_norm": 6.493885040283203, "learning_rate": 4.19755889866591e-05, "loss": 2.2005558013916016, "step": 40900 }, { "epoch": 11.612262276468918, "grad_norm": 6.591405868530273, "learning_rate": 4.19613965370423e-05, "loss": 2.249492645263672, "step": 40910 }, { "epoch": 11.61510076639228, "grad_norm": 6.693352699279785, "learning_rate": 4.194720408742549e-05, "loss": 2.2056432723999024, "step": 40920 }, { "epoch": 11.61793925631564, "grad_norm": 6.866115093231201, "learning_rate": 4.193301163780869e-05, "loss": 2.232978630065918, "step": 40930 }, { "epoch": 11.620777746239002, "grad_norm": 6.616400718688965, "learning_rate": 4.1918819188191884e-05, "loss": 2.183195686340332, "step": 40940 }, { "epoch": 11.623616236162361, "grad_norm": 6.766117095947266, "learning_rate": 4.1904626738575084e-05, "loss": 2.268684196472168, "step": 40950 }, { "epoch": 11.626454726085722, "grad_norm": 6.6970438957214355, "learning_rate": 4.189043428895828e-05, "loss": 2.230202865600586, "step": 40960 }, { "epoch": 11.629293216009083, "grad_norm": 6.571651935577393, "learning_rate": 4.187624183934147e-05, "loss": 2.3072463989257814, "step": 40970 }, { "epoch": 11.632131705932444, "grad_norm": 
6.7047319412231445, "learning_rate": 4.1862049389724664e-05, "loss": 2.2606132507324217, "step": 40980 }, { "epoch": 11.634970195855805, "grad_norm": 6.562608242034912, "learning_rate": 4.1847856940107864e-05, "loss": 2.204140281677246, "step": 40990 }, { "epoch": 11.637808685779165, "grad_norm": 6.424014091491699, "learning_rate": 4.183366449049106e-05, "loss": 2.3173662185668946, "step": 41000 }, { "epoch": 11.637808685779165, "eval_accuracy": 0.32867043937178103, "eval_loss": 2.5423848628997803, "eval_runtime": 52.3665, "eval_samples_per_second": 300.326, "eval_steps_per_second": 4.698, "step": 41000 }, { "epoch": 11.640647175702526, "grad_norm": 6.6931023597717285, "learning_rate": 4.181947204087426e-05, "loss": 2.3299760818481445, "step": 41010 }, { "epoch": 11.643485665625887, "grad_norm": 6.4891815185546875, "learning_rate": 4.180527959125745e-05, "loss": 2.215774154663086, "step": 41020 }, { "epoch": 11.646324155549248, "grad_norm": 6.0400824546813965, "learning_rate": 4.179108714164065e-05, "loss": 2.212535095214844, "step": 41030 }, { "epoch": 11.64916264547261, "grad_norm": 6.531355381011963, "learning_rate": 4.1776894692023844e-05, "loss": 2.254108428955078, "step": 41040 }, { "epoch": 11.65200113539597, "grad_norm": 6.5644426345825195, "learning_rate": 4.1762702242407044e-05, "loss": 2.1881385803222657, "step": 41050 }, { "epoch": 11.65483962531933, "grad_norm": 6.341228485107422, "learning_rate": 4.174850979279024e-05, "loss": 2.2094287872314453, "step": 41060 }, { "epoch": 11.65767811524269, "grad_norm": 6.483936786651611, "learning_rate": 4.173431734317344e-05, "loss": 2.1846141815185547, "step": 41070 }, { "epoch": 11.660516605166052, "grad_norm": 6.40903377532959, "learning_rate": 4.172012489355663e-05, "loss": 2.2776348114013674, "step": 41080 }, { "epoch": 11.663355095089413, "grad_norm": 6.35237455368042, "learning_rate": 4.1705932443939825e-05, "loss": 2.169688606262207, "step": 41090 }, { "epoch": 11.666193585012774, "grad_norm": 
6.640474796295166, "learning_rate": 4.169173999432302e-05, "loss": 2.228030967712402, "step": 41100 }, { "epoch": 11.669032074936133, "grad_norm": 6.526794910430908, "learning_rate": 4.167754754470622e-05, "loss": 2.1865213394165037, "step": 41110 }, { "epoch": 11.671870564859494, "grad_norm": 6.332681179046631, "learning_rate": 4.166335509508941e-05, "loss": 2.1872831344604493, "step": 41120 }, { "epoch": 11.674709054782856, "grad_norm": 6.544070720672607, "learning_rate": 4.164916264547261e-05, "loss": 2.2398456573486327, "step": 41130 }, { "epoch": 11.677547544706217, "grad_norm": 6.248546123504639, "learning_rate": 4.1634970195855805e-05, "loss": 2.149380683898926, "step": 41140 }, { "epoch": 11.680386034629578, "grad_norm": 6.6549530029296875, "learning_rate": 4.1620777746239005e-05, "loss": 2.206945037841797, "step": 41150 }, { "epoch": 11.683224524552937, "grad_norm": 6.134997844696045, "learning_rate": 4.16065852966222e-05, "loss": 2.2116031646728516, "step": 41160 }, { "epoch": 11.686063014476298, "grad_norm": 6.4990034103393555, "learning_rate": 4.15923928470054e-05, "loss": 2.2375917434692383, "step": 41170 }, { "epoch": 11.68890150439966, "grad_norm": 6.4579901695251465, "learning_rate": 4.157820039738859e-05, "loss": 2.2804161071777345, "step": 41180 }, { "epoch": 11.69173999432302, "grad_norm": 6.614495754241943, "learning_rate": 4.1564007947771785e-05, "loss": 2.1967853546142577, "step": 41190 }, { "epoch": 11.694578484246382, "grad_norm": 6.242429733276367, "learning_rate": 4.154981549815498e-05, "loss": 2.2050933837890625, "step": 41200 }, { "epoch": 11.697416974169741, "grad_norm": 6.5854716300964355, "learning_rate": 4.153562304853818e-05, "loss": 2.2257471084594727, "step": 41210 }, { "epoch": 11.700255464093102, "grad_norm": 6.834752082824707, "learning_rate": 4.152143059892137e-05, "loss": 2.1966167449951173, "step": 41220 }, { "epoch": 11.703093954016463, "grad_norm": 6.682590961456299, "learning_rate": 4.150723814930457e-05, "loss": 
2.2158660888671875, "step": 41230 }, { "epoch": 11.705932443939824, "grad_norm": 6.3064422607421875, "learning_rate": 4.1494464944649444e-05, "loss": 2.150706100463867, "step": 41240 }, { "epoch": 11.708770933863185, "grad_norm": 6.526375770568848, "learning_rate": 4.1480272495032644e-05, "loss": 2.21997013092041, "step": 41250 }, { "epoch": 11.711609423786545, "grad_norm": 6.002439975738525, "learning_rate": 4.146608004541584e-05, "loss": 2.192977714538574, "step": 41260 }, { "epoch": 11.714447913709906, "grad_norm": 6.962424278259277, "learning_rate": 4.145188759579904e-05, "loss": 2.225108337402344, "step": 41270 }, { "epoch": 11.717286403633267, "grad_norm": 6.436768531799316, "learning_rate": 4.143769514618223e-05, "loss": 2.195530319213867, "step": 41280 }, { "epoch": 11.720124893556628, "grad_norm": 6.776752948760986, "learning_rate": 4.142350269656543e-05, "loss": 2.297582244873047, "step": 41290 }, { "epoch": 11.72296338347999, "grad_norm": 6.4422712326049805, "learning_rate": 4.1409310246948625e-05, "loss": 2.304841423034668, "step": 41300 }, { "epoch": 11.72580187340335, "grad_norm": 6.704490661621094, "learning_rate": 4.1395117797331825e-05, "loss": 2.2194282531738283, "step": 41310 }, { "epoch": 11.72864036332671, "grad_norm": 6.685679912567139, "learning_rate": 4.138092534771502e-05, "loss": 2.2413482666015625, "step": 41320 }, { "epoch": 11.73147885325007, "grad_norm": 6.219050884246826, "learning_rate": 4.136673289809822e-05, "loss": 2.222345733642578, "step": 41330 }, { "epoch": 11.734317343173432, "grad_norm": 6.711267471313477, "learning_rate": 4.135254044848141e-05, "loss": 2.1745361328125, "step": 41340 }, { "epoch": 11.737155833096793, "grad_norm": 6.743958950042725, "learning_rate": 4.1338347998864605e-05, "loss": 2.2468496322631837, "step": 41350 }, { "epoch": 11.739994323020154, "grad_norm": 6.4969024658203125, "learning_rate": 4.13241555492478e-05, "loss": 2.2716564178466796, "step": 41360 }, { "epoch": 11.742832812943513, "grad_norm": 
6.399473190307617, "learning_rate": 4.1309963099631e-05, "loss": 2.262661933898926, "step": 41370 }, { "epoch": 11.745671302866874, "grad_norm": 6.683349609375, "learning_rate": 4.129577065001419e-05, "loss": 2.213835525512695, "step": 41380 }, { "epoch": 11.748509792790236, "grad_norm": 6.585613250732422, "learning_rate": 4.128157820039739e-05, "loss": 2.218077850341797, "step": 41390 }, { "epoch": 11.751348282713597, "grad_norm": 6.924930572509766, "learning_rate": 4.1267385750780585e-05, "loss": 2.148558235168457, "step": 41400 }, { "epoch": 11.754186772636958, "grad_norm": 6.741916656494141, "learning_rate": 4.1253193301163785e-05, "loss": 2.228403663635254, "step": 41410 }, { "epoch": 11.757025262560317, "grad_norm": 6.68734073638916, "learning_rate": 4.123900085154698e-05, "loss": 2.182813262939453, "step": 41420 }, { "epoch": 11.759863752483678, "grad_norm": 6.507657051086426, "learning_rate": 4.122480840193018e-05, "loss": 2.2248298645019533, "step": 41430 }, { "epoch": 11.76270224240704, "grad_norm": 7.027790069580078, "learning_rate": 4.121061595231337e-05, "loss": 2.1849061965942385, "step": 41440 }, { "epoch": 11.7655407323304, "grad_norm": 6.488811016082764, "learning_rate": 4.1196423502696565e-05, "loss": 2.2186958312988283, "step": 41450 }, { "epoch": 11.768379222253762, "grad_norm": 7.3432230949401855, "learning_rate": 4.118223105307976e-05, "loss": 2.3376392364501952, "step": 41460 }, { "epoch": 11.771217712177123, "grad_norm": 6.930030345916748, "learning_rate": 4.116803860346296e-05, "loss": 2.213908004760742, "step": 41470 }, { "epoch": 11.774056202100482, "grad_norm": 6.520373344421387, "learning_rate": 4.115384615384615e-05, "loss": 2.2439708709716797, "step": 41480 }, { "epoch": 11.776894692023843, "grad_norm": 6.374081611633301, "learning_rate": 4.113965370422935e-05, "loss": 2.2738006591796873, "step": 41490 }, { "epoch": 11.779733181947204, "grad_norm": 6.7322797775268555, "learning_rate": 4.1125461254612545e-05, "loss": 2.221029853820801, 
"step": 41500 }, { "epoch": 11.779733181947204, "eval_accuracy": 0.3259998728301647, "eval_loss": 2.541581153869629, "eval_runtime": 53.7682, "eval_samples_per_second": 292.497, "eval_steps_per_second": 4.575, "step": 41500 }, { "epoch": 11.782571671870565, "grad_norm": 6.14198637008667, "learning_rate": 4.1111268804995745e-05, "loss": 2.242015075683594, "step": 41510 }, { "epoch": 11.785410161793926, "grad_norm": 6.32426118850708, "learning_rate": 4.109707635537894e-05, "loss": 2.2493730545043946, "step": 41520 }, { "epoch": 11.788248651717286, "grad_norm": 6.500341892242432, "learning_rate": 4.108288390576214e-05, "loss": 2.313374328613281, "step": 41530 }, { "epoch": 11.791087141640647, "grad_norm": 6.857923984527588, "learning_rate": 4.106869145614533e-05, "loss": 2.219120216369629, "step": 41540 }, { "epoch": 11.793925631564008, "grad_norm": 6.466440200805664, "learning_rate": 4.105449900652853e-05, "loss": 2.2383359909057616, "step": 41550 }, { "epoch": 11.796764121487369, "grad_norm": 6.986555576324463, "learning_rate": 4.1040306556911725e-05, "loss": 2.194681930541992, "step": 41560 }, { "epoch": 11.79960261141073, "grad_norm": 6.769127368927002, "learning_rate": 4.102611410729492e-05, "loss": 2.2111873626708984, "step": 41570 }, { "epoch": 11.80244110133409, "grad_norm": 6.3475494384765625, "learning_rate": 4.101192165767811e-05, "loss": 2.1662857055664064, "step": 41580 }, { "epoch": 11.80527959125745, "grad_norm": 6.2668561935424805, "learning_rate": 4.099772920806131e-05, "loss": 2.245975875854492, "step": 41590 }, { "epoch": 11.808118081180812, "grad_norm": 6.4359893798828125, "learning_rate": 4.0983536758444506e-05, "loss": 2.1812095642089844, "step": 41600 }, { "epoch": 11.810956571104173, "grad_norm": 6.916361331939697, "learning_rate": 4.0969344308827706e-05, "loss": 2.220636177062988, "step": 41610 }, { "epoch": 11.813795061027534, "grad_norm": 6.522837162017822, "learning_rate": 4.09551518592109e-05, "loss": 2.156509590148926, "step": 41620 }, { 
"epoch": 11.816633550950893, "grad_norm": 6.718071937561035, "learning_rate": 4.09409594095941e-05, "loss": 2.317642593383789, "step": 41630 }, { "epoch": 11.819472040874254, "grad_norm": 6.603527069091797, "learning_rate": 4.092676695997729e-05, "loss": 2.2753974914550783, "step": 41640 }, { "epoch": 11.822310530797616, "grad_norm": 6.841879844665527, "learning_rate": 4.091257451036049e-05, "loss": 2.2177852630615233, "step": 41650 }, { "epoch": 11.825149020720977, "grad_norm": 6.536050319671631, "learning_rate": 4.0898382060743686e-05, "loss": 2.262161636352539, "step": 41660 }, { "epoch": 11.827987510644338, "grad_norm": 6.51875114440918, "learning_rate": 4.0884189611126886e-05, "loss": 2.2316471099853517, "step": 41670 }, { "epoch": 11.830826000567697, "grad_norm": 6.327782154083252, "learning_rate": 4.086999716151008e-05, "loss": 2.248503303527832, "step": 41680 }, { "epoch": 11.833664490491058, "grad_norm": 6.356901168823242, "learning_rate": 4.085580471189327e-05, "loss": 2.259126091003418, "step": 41690 }, { "epoch": 11.83650298041442, "grad_norm": 6.592813968658447, "learning_rate": 4.0841612262276466e-05, "loss": 2.1704973220825194, "step": 41700 }, { "epoch": 11.83934147033778, "grad_norm": 6.886476516723633, "learning_rate": 4.0827419812659666e-05, "loss": 2.227521514892578, "step": 41710 }, { "epoch": 11.842179960261142, "grad_norm": 6.471127510070801, "learning_rate": 4.081322736304286e-05, "loss": 2.1979738235473634, "step": 41720 }, { "epoch": 11.845018450184503, "grad_norm": 6.393932342529297, "learning_rate": 4.079903491342606e-05, "loss": 2.2552865982055663, "step": 41730 }, { "epoch": 11.847856940107862, "grad_norm": 6.396605491638184, "learning_rate": 4.078484246380925e-05, "loss": 2.166715621948242, "step": 41740 }, { "epoch": 11.850695430031223, "grad_norm": 6.390869617462158, "learning_rate": 4.077065001419245e-05, "loss": 2.1828868865966795, "step": 41750 }, { "epoch": 11.853533919954584, "grad_norm": 6.871070861816406, "learning_rate": 
4.0756457564575646e-05, "loss": 2.220122146606445, "step": 41760 }, { "epoch": 11.856372409877945, "grad_norm": 6.781159400939941, "learning_rate": 4.0742265114958846e-05, "loss": 2.240064239501953, "step": 41770 }, { "epoch": 11.859210899801306, "grad_norm": 6.628009796142578, "learning_rate": 4.072807266534204e-05, "loss": 2.2222217559814452, "step": 41780 }, { "epoch": 11.862049389724666, "grad_norm": 6.468578815460205, "learning_rate": 4.071388021572524e-05, "loss": 2.253836250305176, "step": 41790 }, { "epoch": 11.864887879648027, "grad_norm": 6.732112884521484, "learning_rate": 4.069968776610843e-05, "loss": 2.1946001052856445, "step": 41800 }, { "epoch": 11.867726369571388, "grad_norm": 6.819105625152588, "learning_rate": 4.0685495316491626e-05, "loss": 2.201565170288086, "step": 41810 }, { "epoch": 11.870564859494749, "grad_norm": 6.517123222351074, "learning_rate": 4.067130286687482e-05, "loss": 2.207921028137207, "step": 41820 }, { "epoch": 11.87340334941811, "grad_norm": 6.749603271484375, "learning_rate": 4.065711041725802e-05, "loss": 2.2423463821411134, "step": 41830 }, { "epoch": 11.876241839341471, "grad_norm": 6.797380447387695, "learning_rate": 4.064291796764121e-05, "loss": 2.239772605895996, "step": 41840 }, { "epoch": 11.87908032926483, "grad_norm": 6.446372032165527, "learning_rate": 4.062872551802441e-05, "loss": 2.1887485504150392, "step": 41850 }, { "epoch": 11.881918819188192, "grad_norm": 6.855652809143066, "learning_rate": 4.0614533068407606e-05, "loss": 2.2219181060791016, "step": 41860 }, { "epoch": 11.884757309111553, "grad_norm": 6.494744300842285, "learning_rate": 4.0600340618790807e-05, "loss": 2.213785934448242, "step": 41870 }, { "epoch": 11.887595799034914, "grad_norm": 6.4884467124938965, "learning_rate": 4.0586148169174e-05, "loss": 2.183175277709961, "step": 41880 }, { "epoch": 11.890434288958275, "grad_norm": 6.342612266540527, "learning_rate": 4.05719557195572e-05, "loss": 2.1914508819580076, "step": 41890 }, { "epoch": 
11.893272778881634, "grad_norm": 6.287228107452393, "learning_rate": 4.055776326994039e-05, "loss": 2.256176567077637, "step": 41900 }, { "epoch": 11.896111268804995, "grad_norm": 6.755331516265869, "learning_rate": 4.0543570820323587e-05, "loss": 2.1690330505371094, "step": 41910 }, { "epoch": 11.898949758728357, "grad_norm": 6.487025737762451, "learning_rate": 4.052937837070679e-05, "loss": 2.228455352783203, "step": 41920 }, { "epoch": 11.901788248651718, "grad_norm": 6.911709308624268, "learning_rate": 4.051518592108998e-05, "loss": 2.3088693618774414, "step": 41930 }, { "epoch": 11.904626738575079, "grad_norm": 6.650363445281982, "learning_rate": 4.050099347147318e-05, "loss": 2.2024871826171877, "step": 41940 }, { "epoch": 11.907465228498438, "grad_norm": 6.681570053100586, "learning_rate": 4.0486801021856373e-05, "loss": 2.273894119262695, "step": 41950 }, { "epoch": 11.9103037184218, "grad_norm": 6.540626525878906, "learning_rate": 4.0472608572239574e-05, "loss": 2.3062721252441407, "step": 41960 }, { "epoch": 11.91314220834516, "grad_norm": 6.298464298248291, "learning_rate": 4.045841612262277e-05, "loss": 2.1739984512329102, "step": 41970 }, { "epoch": 11.915980698268521, "grad_norm": 6.506964206695557, "learning_rate": 4.044422367300597e-05, "loss": 2.1901456832885744, "step": 41980 }, { "epoch": 11.918819188191883, "grad_norm": 6.526866436004639, "learning_rate": 4.043003122338916e-05, "loss": 2.200817108154297, "step": 41990 }, { "epoch": 11.921657678115242, "grad_norm": 6.136394500732422, "learning_rate": 4.041583877377236e-05, "loss": 2.201508903503418, "step": 42000 }, { "epoch": 11.921657678115242, "eval_accuracy": 0.32771666560691803, "eval_loss": 2.535595178604126, "eval_runtime": 53.2679, "eval_samples_per_second": 295.243, "eval_steps_per_second": 4.618, "step": 42000 }, { "epoch": 11.924496168038603, "grad_norm": 6.293445587158203, "learning_rate": 4.0401646324155554e-05, "loss": 2.1644086837768555, "step": 42010 }, { "epoch": 
11.927334657961964, "grad_norm": 6.798417091369629, "learning_rate": 4.038745387453875e-05, "loss": 2.217880058288574, "step": 42020 }, { "epoch": 11.930173147885325, "grad_norm": 6.597682476043701, "learning_rate": 4.037326142492194e-05, "loss": 2.181562805175781, "step": 42030 }, { "epoch": 11.933011637808686, "grad_norm": 6.33215856552124, "learning_rate": 4.035906897530514e-05, "loss": 2.2327592849731444, "step": 42040 }, { "epoch": 11.935850127732046, "grad_norm": 6.97318696975708, "learning_rate": 4.0344876525688334e-05, "loss": 2.2493663787841798, "step": 42050 }, { "epoch": 11.938688617655407, "grad_norm": 6.973940372467041, "learning_rate": 4.0330684076071534e-05, "loss": 2.2335094451904296, "step": 42060 }, { "epoch": 11.941527107578768, "grad_norm": 6.4579758644104, "learning_rate": 4.031649162645473e-05, "loss": 2.183444023132324, "step": 42070 }, { "epoch": 11.944365597502129, "grad_norm": 6.403077602386475, "learning_rate": 4.030229917683793e-05, "loss": 2.239610290527344, "step": 42080 }, { "epoch": 11.94720408742549, "grad_norm": 6.365373134613037, "learning_rate": 4.028810672722112e-05, "loss": 2.210660934448242, "step": 42090 }, { "epoch": 11.950042577348851, "grad_norm": 6.47714900970459, "learning_rate": 4.027391427760432e-05, "loss": 2.1939504623413084, "step": 42100 }, { "epoch": 11.95288106727221, "grad_norm": 6.488454818725586, "learning_rate": 4.0259721827987514e-05, "loss": 2.234394073486328, "step": 42110 }, { "epoch": 11.955719557195572, "grad_norm": 6.763961315155029, "learning_rate": 4.0245529378370714e-05, "loss": 2.239279937744141, "step": 42120 }, { "epoch": 11.958558047118933, "grad_norm": 6.498063087463379, "learning_rate": 4.023133692875391e-05, "loss": 2.3037416458129885, "step": 42130 }, { "epoch": 11.961396537042294, "grad_norm": 6.894520282745361, "learning_rate": 4.02171444791371e-05, "loss": 2.2337247848510744, "step": 42140 }, { "epoch": 11.964235026965655, "grad_norm": 6.613296985626221, "learning_rate": 
4.0202952029520294e-05, "loss": 2.3158565521240235, "step": 42150 }, { "epoch": 11.967073516889014, "grad_norm": 6.746070861816406, "learning_rate": 4.0188759579903494e-05, "loss": 2.178254318237305, "step": 42160 }, { "epoch": 11.969912006812375, "grad_norm": 6.698480606079102, "learning_rate": 4.017456713028669e-05, "loss": 2.2486446380615233, "step": 42170 }, { "epoch": 11.972750496735737, "grad_norm": 6.23724365234375, "learning_rate": 4.016037468066989e-05, "loss": 2.2962135314941405, "step": 42180 }, { "epoch": 11.975588986659098, "grad_norm": 6.464065074920654, "learning_rate": 4.014618223105308e-05, "loss": 2.2710351943969727, "step": 42190 }, { "epoch": 11.978427476582459, "grad_norm": 6.642095565795898, "learning_rate": 4.013198978143628e-05, "loss": 2.248558235168457, "step": 42200 }, { "epoch": 11.981265966505818, "grad_norm": 6.576180934906006, "learning_rate": 4.0117797331819474e-05, "loss": 2.1762367248535157, "step": 42210 }, { "epoch": 11.98410445642918, "grad_norm": 6.862534046173096, "learning_rate": 4.0103604882202674e-05, "loss": 2.2643901824951174, "step": 42220 }, { "epoch": 11.98694294635254, "grad_norm": 6.52085542678833, "learning_rate": 4.008941243258587e-05, "loss": 2.2714372634887696, "step": 42230 }, { "epoch": 11.989781436275901, "grad_norm": 6.558797359466553, "learning_rate": 4.007521998296906e-05, "loss": 2.2729618072509767, "step": 42240 }, { "epoch": 11.992619926199263, "grad_norm": 6.789394855499268, "learning_rate": 4.006102753335226e-05, "loss": 2.2389583587646484, "step": 42250 }, { "epoch": 11.995458416122624, "grad_norm": 6.177535533905029, "learning_rate": 4.0046835083735454e-05, "loss": 2.2419614791870117, "step": 42260 }, { "epoch": 11.998296906045983, "grad_norm": 6.558593273162842, "learning_rate": 4.003264263411865e-05, "loss": 2.30443115234375, "step": 42270 }, { "epoch": 12.001135395969344, "grad_norm": 6.773513317108154, "learning_rate": 4.001845018450185e-05, "loss": 2.1817564010620116, "step": 42280 }, { "epoch": 
12.003973885892705, "grad_norm": 6.842118263244629, "learning_rate": 4.000425773488504e-05, "loss": 2.1203245162963866, "step": 42290 }, { "epoch": 12.006812375816066, "grad_norm": 6.499924182891846, "learning_rate": 3.999006528526824e-05, "loss": 2.1716983795166014, "step": 42300 }, { "epoch": 12.009650865739427, "grad_norm": 7.120082378387451, "learning_rate": 3.9975872835651435e-05, "loss": 2.215260887145996, "step": 42310 }, { "epoch": 12.012489355662787, "grad_norm": 6.4634175300598145, "learning_rate": 3.9961680386034635e-05, "loss": 2.273672676086426, "step": 42320 }, { "epoch": 12.015327845586148, "grad_norm": 6.232581615447998, "learning_rate": 3.994748793641783e-05, "loss": 2.185312843322754, "step": 42330 }, { "epoch": 12.018166335509509, "grad_norm": 6.556597709655762, "learning_rate": 3.993329548680103e-05, "loss": 2.162772369384766, "step": 42340 }, { "epoch": 12.02100482543287, "grad_norm": 6.677372932434082, "learning_rate": 3.991910303718422e-05, "loss": 2.276551628112793, "step": 42350 }, { "epoch": 12.023843315356231, "grad_norm": 6.573231220245361, "learning_rate": 3.9904910587567415e-05, "loss": 2.2064157485961915, "step": 42360 }, { "epoch": 12.02668180527959, "grad_norm": 6.161751747131348, "learning_rate": 3.989071813795061e-05, "loss": 2.186033248901367, "step": 42370 }, { "epoch": 12.029520295202952, "grad_norm": 6.401385307312012, "learning_rate": 3.987652568833381e-05, "loss": 2.139509582519531, "step": 42380 }, { "epoch": 12.032358785126313, "grad_norm": 6.354081153869629, "learning_rate": 3.9862333238717e-05, "loss": 2.229155731201172, "step": 42390 }, { "epoch": 12.035197275049674, "grad_norm": 6.647202491760254, "learning_rate": 3.98481407891002e-05, "loss": 2.2131978988647463, "step": 42400 }, { "epoch": 12.038035764973035, "grad_norm": 6.672450542449951, "learning_rate": 3.9833948339483395e-05, "loss": 2.1857749938964846, "step": 42410 }, { "epoch": 12.040874254896394, "grad_norm": 6.313722610473633, "learning_rate": 
3.9819755889866595e-05, "loss": 2.195242691040039, "step": 42420 }, { "epoch": 12.043712744819755, "grad_norm": 6.871487617492676, "learning_rate": 3.980556344024979e-05, "loss": 2.085275077819824, "step": 42430 }, { "epoch": 12.046551234743117, "grad_norm": 6.4989728927612305, "learning_rate": 3.979137099063299e-05, "loss": 2.225958251953125, "step": 42440 }, { "epoch": 12.049389724666478, "grad_norm": 6.888298511505127, "learning_rate": 3.977717854101618e-05, "loss": 2.201841354370117, "step": 42450 }, { "epoch": 12.052228214589839, "grad_norm": 6.559516429901123, "learning_rate": 3.976298609139938e-05, "loss": 2.211199951171875, "step": 42460 }, { "epoch": 12.0550667045132, "grad_norm": 6.639169692993164, "learning_rate": 3.9748793641782575e-05, "loss": 2.237660217285156, "step": 42470 }, { "epoch": 12.05790519443656, "grad_norm": 6.393737316131592, "learning_rate": 3.973460119216577e-05, "loss": 2.2180459976196287, "step": 42480 }, { "epoch": 12.06074368435992, "grad_norm": 6.4547343254089355, "learning_rate": 3.972040874254896e-05, "loss": 2.22570686340332, "step": 42490 }, { "epoch": 12.063582174283281, "grad_norm": 6.487658977508545, "learning_rate": 3.970621629293216e-05, "loss": 2.206152153015137, "step": 42500 }, { "epoch": 12.063582174283281, "eval_accuracy": 0.3266993069243975, "eval_loss": 2.5341546535491943, "eval_runtime": 50.6814, "eval_samples_per_second": 310.311, "eval_steps_per_second": 4.854, "step": 42500 }, { "epoch": 12.066420664206642, "grad_norm": 6.848276615142822, "learning_rate": 3.9692023843315355e-05, "loss": 2.1832260131835937, "step": 42510 }, { "epoch": 12.069259154130004, "grad_norm": 5.8736138343811035, "learning_rate": 3.9677831393698555e-05, "loss": 2.1462095260620115, "step": 42520 }, { "epoch": 12.072097644053363, "grad_norm": 6.445212364196777, "learning_rate": 3.966363894408175e-05, "loss": 2.1796171188354494, "step": 42530 }, { "epoch": 12.074936133976724, "grad_norm": 6.32413911819458, "learning_rate": 
3.964944649446495e-05, "loss": 2.1854778289794923, "step": 42540 }, { "epoch": 12.077774623900085, "grad_norm": 6.649606227874756, "learning_rate": 3.963525404484814e-05, "loss": 2.230194091796875, "step": 42550 }, { "epoch": 12.080613113823446, "grad_norm": 6.8081560134887695, "learning_rate": 3.962106159523134e-05, "loss": 2.221279525756836, "step": 42560 }, { "epoch": 12.083451603746807, "grad_norm": 6.655813694000244, "learning_rate": 3.9606869145614536e-05, "loss": 2.2774375915527343, "step": 42570 }, { "epoch": 12.086290093670167, "grad_norm": 6.714869976043701, "learning_rate": 3.9592676695997736e-05, "loss": 2.1686595916748046, "step": 42580 }, { "epoch": 12.089128583593528, "grad_norm": 6.57370138168335, "learning_rate": 3.957848424638093e-05, "loss": 2.1512115478515623, "step": 42590 }, { "epoch": 12.091967073516889, "grad_norm": 6.293887615203857, "learning_rate": 3.956429179676412e-05, "loss": 2.1881946563720702, "step": 42600 }, { "epoch": 12.09480556344025, "grad_norm": 6.648454666137695, "learning_rate": 3.9550099347147316e-05, "loss": 2.211591339111328, "step": 42610 }, { "epoch": 12.097644053363611, "grad_norm": 6.421779155731201, "learning_rate": 3.9535906897530516e-05, "loss": 2.164629364013672, "step": 42620 }, { "epoch": 12.10048254328697, "grad_norm": 6.513043403625488, "learning_rate": 3.952171444791371e-05, "loss": 2.168472671508789, "step": 42630 }, { "epoch": 12.103321033210332, "grad_norm": 6.922235012054443, "learning_rate": 3.950752199829691e-05, "loss": 2.229653167724609, "step": 42640 }, { "epoch": 12.106159523133693, "grad_norm": 6.498703479766846, "learning_rate": 3.94933295486801e-05, "loss": 2.0861286163330077, "step": 42650 }, { "epoch": 12.108998013057054, "grad_norm": 6.000240802764893, "learning_rate": 3.94791370990633e-05, "loss": 2.1704631805419923, "step": 42660 }, { "epoch": 12.111836502980415, "grad_norm": 6.29613733291626, "learning_rate": 3.9464944649446496e-05, "loss": 2.2019798278808596, "step": 42670 }, { "epoch": 
12.114674992903776, "grad_norm": 6.715030193328857, "learning_rate": 3.9450752199829696e-05, "loss": 2.231133460998535, "step": 42680 }, { "epoch": 12.117513482827135, "grad_norm": 6.456587314605713, "learning_rate": 3.943655975021289e-05, "loss": 2.225822639465332, "step": 42690 }, { "epoch": 12.120351972750496, "grad_norm": 6.663890361785889, "learning_rate": 3.942236730059608e-05, "loss": 2.211732292175293, "step": 42700 }, { "epoch": 12.123190462673858, "grad_norm": 6.285447120666504, "learning_rate": 3.9408174850979276e-05, "loss": 2.1929256439208986, "step": 42710 }, { "epoch": 12.126028952597219, "grad_norm": 6.527096748352051, "learning_rate": 3.9393982401362476e-05, "loss": 2.1320659637451174, "step": 42720 }, { "epoch": 12.12886744252058, "grad_norm": 6.510185718536377, "learning_rate": 3.937978995174567e-05, "loss": 2.1700429916381836, "step": 42730 }, { "epoch": 12.13170593244394, "grad_norm": 6.297971248626709, "learning_rate": 3.936559750212887e-05, "loss": 2.1565261840820313, "step": 42740 }, { "epoch": 12.1345444223673, "grad_norm": 6.214910507202148, "learning_rate": 3.935140505251206e-05, "loss": 2.12945442199707, "step": 42750 }, { "epoch": 12.137382912290661, "grad_norm": 6.6676530838012695, "learning_rate": 3.933721260289526e-05, "loss": 2.3042110443115233, "step": 42760 }, { "epoch": 12.140221402214022, "grad_norm": 6.846136569976807, "learning_rate": 3.9323020153278456e-05, "loss": 2.246298599243164, "step": 42770 }, { "epoch": 12.143059892137384, "grad_norm": 6.402432918548584, "learning_rate": 3.9308827703661656e-05, "loss": 2.289009666442871, "step": 42780 }, { "epoch": 12.145898382060743, "grad_norm": 6.694276332855225, "learning_rate": 3.929463525404485e-05, "loss": 2.2586477279663084, "step": 42790 }, { "epoch": 12.148736871984104, "grad_norm": 6.23063325881958, "learning_rate": 3.928044280442805e-05, "loss": 2.2541637420654297, "step": 42800 }, { "epoch": 12.151575361907465, "grad_norm": 6.705968856811523, "learning_rate": 
3.926625035481124e-05, "loss": 2.245838737487793, "step": 42810 }, { "epoch": 12.154413851830826, "grad_norm": 6.433023452758789, "learning_rate": 3.9252057905194436e-05, "loss": 2.1632638931274415, "step": 42820 }, { "epoch": 12.157252341754187, "grad_norm": 6.242365837097168, "learning_rate": 3.923786545557763e-05, "loss": 2.200824737548828, "step": 42830 }, { "epoch": 12.160090831677547, "grad_norm": 6.25177526473999, "learning_rate": 3.922367300596083e-05, "loss": 2.1778213500976564, "step": 42840 }, { "epoch": 12.162929321600908, "grad_norm": 6.776393413543701, "learning_rate": 3.920948055634402e-05, "loss": 2.2155027389526367, "step": 42850 }, { "epoch": 12.165767811524269, "grad_norm": 6.102976322174072, "learning_rate": 3.919528810672722e-05, "loss": 2.1406259536743164, "step": 42860 }, { "epoch": 12.16860630144763, "grad_norm": 6.791101932525635, "learning_rate": 3.9181095657110416e-05, "loss": 2.1641895294189455, "step": 42870 }, { "epoch": 12.171444791370991, "grad_norm": 6.644314765930176, "learning_rate": 3.9166903207493617e-05, "loss": 2.256964683532715, "step": 42880 }, { "epoch": 12.174283281294352, "grad_norm": 6.599416732788086, "learning_rate": 3.915271075787681e-05, "loss": 2.199799156188965, "step": 42890 }, { "epoch": 12.177121771217712, "grad_norm": 6.577578067779541, "learning_rate": 3.913851830826001e-05, "loss": 2.2331464767456053, "step": 42900 }, { "epoch": 12.179960261141073, "grad_norm": 6.555688858032227, "learning_rate": 3.91243258586432e-05, "loss": 2.2001651763916015, "step": 42910 }, { "epoch": 12.182798751064434, "grad_norm": 6.5939860343933105, "learning_rate": 3.9110133409026403e-05, "loss": 2.227234649658203, "step": 42920 }, { "epoch": 12.185637240987795, "grad_norm": 6.362725734710693, "learning_rate": 3.90959409594096e-05, "loss": 2.2300615310668945, "step": 42930 }, { "epoch": 12.188475730911156, "grad_norm": 6.300897598266602, "learning_rate": 3.908174850979279e-05, "loss": 2.1368215560913084, "step": 42940 }, { "epoch": 
12.191314220834515, "grad_norm": 6.4012131690979, "learning_rate": 3.906755606017598e-05, "loss": 2.265493392944336, "step": 42950 }, { "epoch": 12.194152710757876, "grad_norm": 6.541146755218506, "learning_rate": 3.9053363610559183e-05, "loss": 2.2202121734619142, "step": 42960 }, { "epoch": 12.196991200681238, "grad_norm": 6.447317123413086, "learning_rate": 3.903917116094238e-05, "loss": 2.2044231414794924, "step": 42970 }, { "epoch": 12.199829690604599, "grad_norm": 6.491549491882324, "learning_rate": 3.902497871132558e-05, "loss": 2.255487251281738, "step": 42980 }, { "epoch": 12.20266818052796, "grad_norm": 6.06021785736084, "learning_rate": 3.901078626170877e-05, "loss": 2.2025777816772463, "step": 42990 }, { "epoch": 12.205506670451319, "grad_norm": 6.783825397491455, "learning_rate": 3.899659381209197e-05, "loss": 2.2496551513671874, "step": 43000 }, { "epoch": 12.205506670451319, "eval_accuracy": 0.3280981751128632, "eval_loss": 2.5291428565979004, "eval_runtime": 50.749, "eval_samples_per_second": 309.898, "eval_steps_per_second": 4.847, "step": 43000 }, { "epoch": 12.20834516037468, "grad_norm": 6.294044494628906, "learning_rate": 3.8982401362475164e-05, "loss": 2.1973400115966797, "step": 43010 }, { "epoch": 12.211183650298041, "grad_norm": 6.633990287780762, "learning_rate": 3.8968208912858364e-05, "loss": 2.1625152587890626, "step": 43020 }, { "epoch": 12.214022140221402, "grad_norm": 6.3907976150512695, "learning_rate": 3.895401646324156e-05, "loss": 2.226384162902832, "step": 43030 }, { "epoch": 12.216860630144764, "grad_norm": 6.714071273803711, "learning_rate": 3.893982401362476e-05, "loss": 2.155461883544922, "step": 43040 }, { "epoch": 12.219699120068125, "grad_norm": 6.966052055358887, "learning_rate": 3.892563156400795e-05, "loss": 2.2000526428222655, "step": 43050 }, { "epoch": 12.222537609991484, "grad_norm": 6.15965461730957, "learning_rate": 3.8911439114391144e-05, "loss": 2.185479736328125, "step": 43060 }, { "epoch": 12.225376099914845, 
"grad_norm": 6.649775505065918, "learning_rate": 3.889724666477434e-05, "loss": 2.154791831970215, "step": 43070 }, { "epoch": 12.228214589838206, "grad_norm": 6.704789161682129, "learning_rate": 3.888305421515754e-05, "loss": 2.2421316146850585, "step": 43080 }, { "epoch": 12.231053079761567, "grad_norm": 6.3000264167785645, "learning_rate": 3.886886176554073e-05, "loss": 2.226511764526367, "step": 43090 }, { "epoch": 12.233891569684928, "grad_norm": 6.7015838623046875, "learning_rate": 3.885466931592393e-05, "loss": 2.213760757446289, "step": 43100 }, { "epoch": 12.236730059608288, "grad_norm": 6.371882915496826, "learning_rate": 3.8840476866307124e-05, "loss": 2.1915920257568358, "step": 43110 }, { "epoch": 12.239568549531649, "grad_norm": 6.28787088394165, "learning_rate": 3.8826284416690324e-05, "loss": 2.136012840270996, "step": 43120 }, { "epoch": 12.24240703945501, "grad_norm": 6.681203842163086, "learning_rate": 3.881209196707352e-05, "loss": 2.232249450683594, "step": 43130 }, { "epoch": 12.245245529378371, "grad_norm": 6.533029079437256, "learning_rate": 3.879789951745672e-05, "loss": 2.2640659332275392, "step": 43140 }, { "epoch": 12.248084019301732, "grad_norm": 6.599586009979248, "learning_rate": 3.878370706783991e-05, "loss": 2.242909240722656, "step": 43150 }, { "epoch": 12.250922509225092, "grad_norm": 6.291139602661133, "learning_rate": 3.8769514618223104e-05, "loss": 2.1973283767700194, "step": 43160 }, { "epoch": 12.253760999148453, "grad_norm": 6.514769554138184, "learning_rate": 3.87553221686063e-05, "loss": 2.2203037261962892, "step": 43170 }, { "epoch": 12.256599489071814, "grad_norm": 6.453829765319824, "learning_rate": 3.87411297189895e-05, "loss": 2.213557815551758, "step": 43180 }, { "epoch": 12.259437978995175, "grad_norm": 6.526888370513916, "learning_rate": 3.872693726937269e-05, "loss": 2.2372941970825195, "step": 43190 }, { "epoch": 12.262276468918536, "grad_norm": 6.373908042907715, "learning_rate": 3.871274481975589e-05, "loss": 
2.2221452713012697, "step": 43200 }, { "epoch": 12.265114958841895, "grad_norm": 6.446528434753418, "learning_rate": 3.869855237013909e-05, "loss": 2.2107091903686524, "step": 43210 }, { "epoch": 12.267953448765256, "grad_norm": 6.4989495277404785, "learning_rate": 3.8684359920522284e-05, "loss": 2.1541370391845702, "step": 43220 }, { "epoch": 12.270791938688618, "grad_norm": 6.6465606689453125, "learning_rate": 3.8670167470905484e-05, "loss": 2.18505973815918, "step": 43230 }, { "epoch": 12.273630428611979, "grad_norm": 6.23864221572876, "learning_rate": 3.865597502128868e-05, "loss": 2.1666807174682616, "step": 43240 }, { "epoch": 12.27646891853534, "grad_norm": 6.788903713226318, "learning_rate": 3.864178257167188e-05, "loss": 2.242300796508789, "step": 43250 }, { "epoch": 12.279307408458699, "grad_norm": 6.529318332672119, "learning_rate": 3.862900936701675e-05, "loss": 2.2062494277954103, "step": 43260 }, { "epoch": 12.28214589838206, "grad_norm": 6.358542442321777, "learning_rate": 3.8614816917399944e-05, "loss": 2.1929275512695314, "step": 43270 }, { "epoch": 12.284984388305421, "grad_norm": 6.839074611663818, "learning_rate": 3.8600624467783144e-05, "loss": 2.3014869689941406, "step": 43280 }, { "epoch": 12.287822878228782, "grad_norm": 6.411534309387207, "learning_rate": 3.858643201816634e-05, "loss": 2.2235910415649416, "step": 43290 }, { "epoch": 12.290661368152143, "grad_norm": 7.055890083312988, "learning_rate": 3.857223956854954e-05, "loss": 2.2483036041259767, "step": 43300 }, { "epoch": 12.293499858075505, "grad_norm": 6.49620246887207, "learning_rate": 3.855804711893273e-05, "loss": 2.2079463958740235, "step": 43310 }, { "epoch": 12.296338347998864, "grad_norm": 7.080138683319092, "learning_rate": 3.8543854669315924e-05, "loss": 2.1865360260009767, "step": 43320 }, { "epoch": 12.299176837922225, "grad_norm": 6.416501998901367, "learning_rate": 3.852966221969912e-05, "loss": 2.225001907348633, "step": 43330 }, { "epoch": 12.302015327845586, 
"grad_norm": 6.3853607177734375, "learning_rate": 3.851546977008232e-05, "loss": 2.195708465576172, "step": 43340 }, { "epoch": 12.304853817768947, "grad_norm": 6.305830001831055, "learning_rate": 3.850127732046551e-05, "loss": 2.1838191986083983, "step": 43350 }, { "epoch": 12.307692307692308, "grad_norm": 6.590061187744141, "learning_rate": 3.848708487084871e-05, "loss": 2.1609649658203125, "step": 43360 }, { "epoch": 12.310530797615668, "grad_norm": 7.025979995727539, "learning_rate": 3.8472892421231904e-05, "loss": 2.155064582824707, "step": 43370 }, { "epoch": 12.313369287539029, "grad_norm": 6.874801158905029, "learning_rate": 3.8458699971615104e-05, "loss": 2.23048095703125, "step": 43380 }, { "epoch": 12.31620777746239, "grad_norm": 6.540193557739258, "learning_rate": 3.84445075219983e-05, "loss": 2.1770088195800783, "step": 43390 }, { "epoch": 12.319046267385751, "grad_norm": 6.56069803237915, "learning_rate": 3.84303150723815e-05, "loss": 2.172199821472168, "step": 43400 }, { "epoch": 12.321884757309112, "grad_norm": 6.352041244506836, "learning_rate": 3.841612262276469e-05, "loss": 2.134967803955078, "step": 43410 }, { "epoch": 12.324723247232471, "grad_norm": 6.546145915985107, "learning_rate": 3.8401930173147884e-05, "loss": 2.166983413696289, "step": 43420 }, { "epoch": 12.327561737155833, "grad_norm": 6.5914082527160645, "learning_rate": 3.838773772353108e-05, "loss": 2.2315700531005858, "step": 43430 }, { "epoch": 12.330400227079194, "grad_norm": 6.712865352630615, "learning_rate": 3.837354527391428e-05, "loss": 2.2153778076171875, "step": 43440 }, { "epoch": 12.333238717002555, "grad_norm": 6.550899982452393, "learning_rate": 3.835935282429747e-05, "loss": 2.1799182891845703, "step": 43450 }, { "epoch": 12.336077206925916, "grad_norm": 6.581392765045166, "learning_rate": 3.834516037468067e-05, "loss": 2.2117198944091796, "step": 43460 }, { "epoch": 12.338915696849277, "grad_norm": 6.349491119384766, "learning_rate": 3.8330967925063864e-05, "loss": 
2.161723327636719, "step": 43470 }, { "epoch": 12.341754186772636, "grad_norm": 7.034175872802734, "learning_rate": 3.8316775475447065e-05, "loss": 2.236993408203125, "step": 43480 }, { "epoch": 12.344592676695997, "grad_norm": 6.795037746429443, "learning_rate": 3.830258302583026e-05, "loss": 2.1590333938598634, "step": 43490 }, { "epoch": 12.347431166619359, "grad_norm": 6.553544521331787, "learning_rate": 3.828839057621346e-05, "loss": 2.2665775299072264, "step": 43500 }, { "epoch": 12.347431166619359, "eval_accuracy": 0.3278438354422331, "eval_loss": 2.5276846885681152, "eval_runtime": 50.2544, "eval_samples_per_second": 312.948, "eval_steps_per_second": 4.895, "step": 43500 }, { "epoch": 12.35026965654272, "grad_norm": 6.504685878753662, "learning_rate": 3.827419812659665e-05, "loss": 2.2052379608154298, "step": 43510 }, { "epoch": 12.35310814646608, "grad_norm": 6.571105480194092, "learning_rate": 3.826000567697985e-05, "loss": 2.153226852416992, "step": 43520 }, { "epoch": 12.35594663638944, "grad_norm": 6.48638916015625, "learning_rate": 3.8245813227363045e-05, "loss": 2.167741394042969, "step": 43530 }, { "epoch": 12.358785126312801, "grad_norm": 6.687403202056885, "learning_rate": 3.823162077774624e-05, "loss": 2.2049163818359374, "step": 43540 }, { "epoch": 12.361623616236162, "grad_norm": 6.709374904632568, "learning_rate": 3.821742832812943e-05, "loss": 2.16210823059082, "step": 43550 }, { "epoch": 12.364462106159523, "grad_norm": 6.422370433807373, "learning_rate": 3.820323587851263e-05, "loss": 2.2430801391601562, "step": 43560 }, { "epoch": 12.367300596082885, "grad_norm": 6.225661754608154, "learning_rate": 3.8189043428895825e-05, "loss": 2.1859107971191407, "step": 43570 }, { "epoch": 12.370139086006244, "grad_norm": 6.617638111114502, "learning_rate": 3.8174850979279025e-05, "loss": 2.22799129486084, "step": 43580 }, { "epoch": 12.372977575929605, "grad_norm": 6.654460906982422, "learning_rate": 3.816065852966222e-05, "loss": 2.299444580078125, 
"step": 43590 }, { "epoch": 12.375816065852966, "grad_norm": 6.901068210601807, "learning_rate": 3.814646608004542e-05, "loss": 2.3272647857666016, "step": 43600 }, { "epoch": 12.378654555776327, "grad_norm": 6.590338230133057, "learning_rate": 3.813227363042862e-05, "loss": 2.2418590545654298, "step": 43610 }, { "epoch": 12.381493045699688, "grad_norm": 6.369829177856445, "learning_rate": 3.811808118081181e-05, "loss": 2.3282819747924806, "step": 43620 }, { "epoch": 12.384331535623048, "grad_norm": 6.4285125732421875, "learning_rate": 3.810388873119501e-05, "loss": 2.1735525131225586, "step": 43630 }, { "epoch": 12.387170025546409, "grad_norm": 6.473511219024658, "learning_rate": 3.8089696281578205e-05, "loss": 2.144166946411133, "step": 43640 }, { "epoch": 12.39000851546977, "grad_norm": 6.672433376312256, "learning_rate": 3.80755038319614e-05, "loss": 2.251668930053711, "step": 43650 }, { "epoch": 12.392847005393131, "grad_norm": 6.842720031738281, "learning_rate": 3.806131138234459e-05, "loss": 2.162253570556641, "step": 43660 }, { "epoch": 12.395685495316492, "grad_norm": 6.280371189117432, "learning_rate": 3.804711893272779e-05, "loss": 2.1826587677001954, "step": 43670 }, { "epoch": 12.398523985239853, "grad_norm": 6.467804431915283, "learning_rate": 3.8032926483110985e-05, "loss": 2.204120635986328, "step": 43680 }, { "epoch": 12.401362475163213, "grad_norm": 6.729528903961182, "learning_rate": 3.8018734033494185e-05, "loss": 2.150948715209961, "step": 43690 }, { "epoch": 12.404200965086574, "grad_norm": 6.430563449859619, "learning_rate": 3.800454158387738e-05, "loss": 2.2793695449829103, "step": 43700 }, { "epoch": 12.407039455009935, "grad_norm": 6.581144332885742, "learning_rate": 3.799034913426058e-05, "loss": 2.1357799530029298, "step": 43710 }, { "epoch": 12.409877944933296, "grad_norm": 6.407391548156738, "learning_rate": 3.797615668464377e-05, "loss": 2.209674263000488, "step": 43720 }, { "epoch": 12.412716434856657, "grad_norm": 6.170020580291748, 
"learning_rate": 3.796196423502697e-05, "loss": 2.2622713088989257, "step": 43730 }, { "epoch": 12.415554924780016, "grad_norm": 6.489981174468994, "learning_rate": 3.7947771785410165e-05, "loss": 2.1232673645019533, "step": 43740 }, { "epoch": 12.418393414703377, "grad_norm": 6.3360185623168945, "learning_rate": 3.7933579335793366e-05, "loss": 2.1780158996582033, "step": 43750 }, { "epoch": 12.421231904626739, "grad_norm": 6.503475189208984, "learning_rate": 3.791938688617656e-05, "loss": 2.2582347869873045, "step": 43760 }, { "epoch": 12.4240703945501, "grad_norm": 6.35359001159668, "learning_rate": 3.790519443655975e-05, "loss": 2.1845293045043945, "step": 43770 }, { "epoch": 12.42690888447346, "grad_norm": 6.565482139587402, "learning_rate": 3.7891001986942946e-05, "loss": 2.111155319213867, "step": 43780 }, { "epoch": 12.42974737439682, "grad_norm": 6.7727789878845215, "learning_rate": 3.7876809537326146e-05, "loss": 2.231142044067383, "step": 43790 }, { "epoch": 12.432585864320181, "grad_norm": 7.0157623291015625, "learning_rate": 3.786261708770934e-05, "loss": 2.1293498992919924, "step": 43800 }, { "epoch": 12.435424354243542, "grad_norm": 6.429587364196777, "learning_rate": 3.784842463809254e-05, "loss": 2.207128143310547, "step": 43810 }, { "epoch": 12.438262844166903, "grad_norm": 6.614090442657471, "learning_rate": 3.783423218847573e-05, "loss": 2.2988510131835938, "step": 43820 }, { "epoch": 12.441101334090265, "grad_norm": 6.230805397033691, "learning_rate": 3.782003973885893e-05, "loss": 2.1032669067382814, "step": 43830 }, { "epoch": 12.443939824013626, "grad_norm": 6.138076305389404, "learning_rate": 3.7805847289242126e-05, "loss": 2.2425554275512694, "step": 43840 }, { "epoch": 12.446778313936985, "grad_norm": 6.249390125274658, "learning_rate": 3.7791654839625326e-05, "loss": 2.147233581542969, "step": 43850 }, { "epoch": 12.449616803860346, "grad_norm": 6.395052433013916, "learning_rate": 3.777746239000852e-05, "loss": 2.1992496490478515, "step": 
43860 }, { "epoch": 12.452455293783707, "grad_norm": 6.5397419929504395, "learning_rate": 3.776326994039171e-05, "loss": 2.2293296813964845, "step": 43870 }, { "epoch": 12.455293783707068, "grad_norm": 6.7647857666015625, "learning_rate": 3.7749077490774906e-05, "loss": 2.2038854598999023, "step": 43880 }, { "epoch": 12.45813227363043, "grad_norm": 6.809720516204834, "learning_rate": 3.7734885041158106e-05, "loss": 2.272220420837402, "step": 43890 }, { "epoch": 12.460970763553789, "grad_norm": 6.682650566101074, "learning_rate": 3.77206925915413e-05, "loss": 2.194734001159668, "step": 43900 }, { "epoch": 12.46380925347715, "grad_norm": 6.488746643066406, "learning_rate": 3.77065001419245e-05, "loss": 2.3146533966064453, "step": 43910 }, { "epoch": 12.466647743400511, "grad_norm": 6.639720439910889, "learning_rate": 3.769230769230769e-05, "loss": 2.234988784790039, "step": 43920 }, { "epoch": 12.469486233323872, "grad_norm": 6.107995510101318, "learning_rate": 3.767811524269089e-05, "loss": 2.168583297729492, "step": 43930 }, { "epoch": 12.472324723247233, "grad_norm": 6.472070693969727, "learning_rate": 3.7663922793074086e-05, "loss": 2.147548484802246, "step": 43940 }, { "epoch": 12.475163213170593, "grad_norm": 6.174809455871582, "learning_rate": 3.7649730343457286e-05, "loss": 2.2461706161499024, "step": 43950 }, { "epoch": 12.478001703093954, "grad_norm": 6.441294193267822, "learning_rate": 3.763553789384048e-05, "loss": 2.2303789138793944, "step": 43960 }, { "epoch": 12.480840193017315, "grad_norm": 6.6599273681640625, "learning_rate": 3.762134544422368e-05, "loss": 2.2451080322265624, "step": 43970 }, { "epoch": 12.483678682940676, "grad_norm": 6.3926472663879395, "learning_rate": 3.760715299460687e-05, "loss": 2.204550552368164, "step": 43980 }, { "epoch": 12.486517172864037, "grad_norm": 6.657446384429932, "learning_rate": 3.7592960544990066e-05, "loss": 2.260387992858887, "step": 43990 }, { "epoch": 12.489355662787396, "grad_norm": 6.204729080200195, 
"learning_rate": 3.757876809537326e-05, "loss": 2.184238624572754, "step": 44000 }, { "epoch": 12.489355662787396, "eval_accuracy": 0.329624213136644, "eval_loss": 2.5241775512695312, "eval_runtime": 54.0216, "eval_samples_per_second": 291.124, "eval_steps_per_second": 4.554, "step": 44000 }, { "epoch": 12.492194152710757, "grad_norm": 6.628660202026367, "learning_rate": 3.756457564575646e-05, "loss": 2.120103645324707, "step": 44010 }, { "epoch": 12.495032642634119, "grad_norm": 6.658102989196777, "learning_rate": 3.755038319613965e-05, "loss": 2.2399415969848633, "step": 44020 }, { "epoch": 12.49787113255748, "grad_norm": 6.458024978637695, "learning_rate": 3.753619074652285e-05, "loss": 2.194095230102539, "step": 44030 }, { "epoch": 12.50070962248084, "grad_norm": 6.8571906089782715, "learning_rate": 3.7521998296906046e-05, "loss": 2.145267868041992, "step": 44040 }, { "epoch": 12.5035481124042, "grad_norm": 6.476306438446045, "learning_rate": 3.7507805847289246e-05, "loss": 2.274881935119629, "step": 44050 }, { "epoch": 12.506386602327561, "grad_norm": 6.4243879318237305, "learning_rate": 3.749361339767244e-05, "loss": 2.2862791061401366, "step": 44060 }, { "epoch": 12.509225092250922, "grad_norm": 6.893696308135986, "learning_rate": 3.747942094805564e-05, "loss": 2.201559066772461, "step": 44070 }, { "epoch": 12.512063582174283, "grad_norm": 6.456633567810059, "learning_rate": 3.746522849843883e-05, "loss": 2.161944580078125, "step": 44080 }, { "epoch": 12.514902072097644, "grad_norm": 6.41124963760376, "learning_rate": 3.745103604882203e-05, "loss": 2.2041946411132813, "step": 44090 }, { "epoch": 12.517740562021006, "grad_norm": 6.387965679168701, "learning_rate": 3.743684359920523e-05, "loss": 2.223743438720703, "step": 44100 }, { "epoch": 12.520579051944365, "grad_norm": 6.907984256744385, "learning_rate": 3.742265114958842e-05, "loss": 2.2611013412475587, "step": 44110 }, { "epoch": 12.523417541867726, "grad_norm": 6.454604625701904, "learning_rate": 
3.740845869997161e-05, "loss": 2.193874740600586, "step": 44120 }, { "epoch": 12.526256031791087, "grad_norm": 6.484277248382568, "learning_rate": 3.739426625035481e-05, "loss": 2.173996925354004, "step": 44130 }, { "epoch": 12.529094521714448, "grad_norm": 6.777308940887451, "learning_rate": 3.738007380073801e-05, "loss": 2.168749237060547, "step": 44140 }, { "epoch": 12.53193301163781, "grad_norm": 7.094387531280518, "learning_rate": 3.736588135112121e-05, "loss": 2.2135215759277345, "step": 44150 }, { "epoch": 12.534771501561169, "grad_norm": 6.641486644744873, "learning_rate": 3.73516889015044e-05, "loss": 2.1251773834228516, "step": 44160 }, { "epoch": 12.53760999148453, "grad_norm": 6.723843097686768, "learning_rate": 3.73374964518876e-05, "loss": 2.2166492462158205, "step": 44170 }, { "epoch": 12.540448481407891, "grad_norm": 6.8761186599731445, "learning_rate": 3.7323304002270794e-05, "loss": 2.1200658798217775, "step": 44180 }, { "epoch": 12.543286971331252, "grad_norm": 6.407012462615967, "learning_rate": 3.7309111552653994e-05, "loss": 2.210194778442383, "step": 44190 }, { "epoch": 12.546125461254613, "grad_norm": 6.727839469909668, "learning_rate": 3.729491910303719e-05, "loss": 2.1518821716308594, "step": 44200 }, { "epoch": 12.548963951177974, "grad_norm": 6.449295520782471, "learning_rate": 3.728072665342038e-05, "loss": 2.266157531738281, "step": 44210 }, { "epoch": 12.551802441101334, "grad_norm": 6.537622451782227, "learning_rate": 3.7266534203803574e-05, "loss": 2.204644203186035, "step": 44220 }, { "epoch": 12.554640931024695, "grad_norm": 6.802608966827393, "learning_rate": 3.7252341754186774e-05, "loss": 2.1958950042724608, "step": 44230 }, { "epoch": 12.557479420948056, "grad_norm": 6.5876898765563965, "learning_rate": 3.723814930456997e-05, "loss": 2.188079071044922, "step": 44240 }, { "epoch": 12.560317910871417, "grad_norm": 6.628813743591309, "learning_rate": 3.722395685495317e-05, "loss": 2.2397268295288084, "step": 44250 }, { "epoch": 
12.563156400794778, "grad_norm": 6.582003593444824, "learning_rate": 3.720976440533636e-05, "loss": 2.208628845214844, "step": 44260 }, { "epoch": 12.565994890718137, "grad_norm": 6.683676719665527, "learning_rate": 3.719557195571956e-05, "loss": 2.209869956970215, "step": 44270 }, { "epoch": 12.568833380641498, "grad_norm": 6.8729047775268555, "learning_rate": 3.7181379506102754e-05, "loss": 2.2153858184814452, "step": 44280 }, { "epoch": 12.57167187056486, "grad_norm": 6.494183540344238, "learning_rate": 3.7167187056485954e-05, "loss": 2.2144048690795897, "step": 44290 }, { "epoch": 12.57451036048822, "grad_norm": 6.683018684387207, "learning_rate": 3.715299460686915e-05, "loss": 2.2090852737426756, "step": 44300 }, { "epoch": 12.577348850411582, "grad_norm": 6.8075995445251465, "learning_rate": 3.713880215725235e-05, "loss": 2.2346004486083983, "step": 44310 }, { "epoch": 12.580187340334941, "grad_norm": 6.856479644775391, "learning_rate": 3.712460970763554e-05, "loss": 2.206538200378418, "step": 44320 }, { "epoch": 12.583025830258302, "grad_norm": 6.860814571380615, "learning_rate": 3.7110417258018734e-05, "loss": 2.1915510177612303, "step": 44330 }, { "epoch": 12.585864320181663, "grad_norm": 6.815207481384277, "learning_rate": 3.709622480840193e-05, "loss": 2.2192602157592773, "step": 44340 }, { "epoch": 12.588702810105024, "grad_norm": 6.67779016494751, "learning_rate": 3.708203235878513e-05, "loss": 2.260283279418945, "step": 44350 }, { "epoch": 12.591541300028386, "grad_norm": 6.3936333656311035, "learning_rate": 3.706783990916832e-05, "loss": 2.1887109756469725, "step": 44360 }, { "epoch": 12.594379789951745, "grad_norm": 6.349081039428711, "learning_rate": 3.705364745955152e-05, "loss": 2.2158573150634764, "step": 44370 }, { "epoch": 12.597218279875106, "grad_norm": 6.503024578094482, "learning_rate": 3.7039455009934714e-05, "loss": 2.1872879028320313, "step": 44380 }, { "epoch": 12.600056769798467, "grad_norm": 6.931358337402344, "learning_rate": 
3.7025262560317914e-05, "loss": 2.196101188659668, "step": 44390 }, { "epoch": 12.602895259721828, "grad_norm": 6.69896936416626, "learning_rate": 3.701107011070111e-05, "loss": 2.1900518417358397, "step": 44400 }, { "epoch": 12.60573374964519, "grad_norm": 6.785608291625977, "learning_rate": 3.699687766108431e-05, "loss": 2.22457218170166, "step": 44410 }, { "epoch": 12.608572239568549, "grad_norm": 7.124655723571777, "learning_rate": 3.69826852114675e-05, "loss": 2.220463180541992, "step": 44420 }, { "epoch": 12.61141072949191, "grad_norm": 6.77866268157959, "learning_rate": 3.69684927618507e-05, "loss": 2.2289875030517576, "step": 44430 }, { "epoch": 12.61424921941527, "grad_norm": 6.467998504638672, "learning_rate": 3.6954300312233894e-05, "loss": 2.136729621887207, "step": 44440 }, { "epoch": 12.617087709338632, "grad_norm": 6.345668315887451, "learning_rate": 3.694010786261709e-05, "loss": 2.1778112411499024, "step": 44450 }, { "epoch": 12.619926199261993, "grad_norm": 6.785579204559326, "learning_rate": 3.692591541300028e-05, "loss": 2.2255834579467773, "step": 44460 }, { "epoch": 12.622764689185354, "grad_norm": 6.461653709411621, "learning_rate": 3.691172296338348e-05, "loss": 2.2266122817993166, "step": 44470 }, { "epoch": 12.625603179108714, "grad_norm": 6.289578437805176, "learning_rate": 3.6897530513766674e-05, "loss": 2.179103469848633, "step": 44480 }, { "epoch": 12.628441669032075, "grad_norm": 6.399742126464844, "learning_rate": 3.6883338064149875e-05, "loss": 2.159556007385254, "step": 44490 }, { "epoch": 12.631280158955436, "grad_norm": 6.604116439819336, "learning_rate": 3.686914561453307e-05, "loss": 2.2140636444091797, "step": 44500 }, { "epoch": 12.631280158955436, "eval_accuracy": 0.33070515673682205, "eval_loss": 2.51948881149292, "eval_runtime": 52.8, "eval_samples_per_second": 297.86, "eval_steps_per_second": 4.659, "step": 44500 }, { "epoch": 12.634118648878797, "grad_norm": 6.273222923278809, "learning_rate": 3.685495316491627e-05, 
"loss": 2.2047426223754885, "step": 44510 }, { "epoch": 12.636957138802158, "grad_norm": 6.556110382080078, "learning_rate": 3.684076071529946e-05, "loss": 2.1706329345703126, "step": 44520 }, { "epoch": 12.639795628725517, "grad_norm": 6.670453071594238, "learning_rate": 3.682656826568266e-05, "loss": 2.2134817123413084, "step": 44530 }, { "epoch": 12.642634118648878, "grad_norm": 6.324950218200684, "learning_rate": 3.6812375816065855e-05, "loss": 2.2409988403320313, "step": 44540 }, { "epoch": 12.64547260857224, "grad_norm": 6.851151466369629, "learning_rate": 3.6798183366449055e-05, "loss": 2.2803810119628904, "step": 44550 }, { "epoch": 12.6483110984956, "grad_norm": 6.529575347900391, "learning_rate": 3.678399091683225e-05, "loss": 2.1818233489990235, "step": 44560 }, { "epoch": 12.651149588418962, "grad_norm": 6.727910041809082, "learning_rate": 3.676979846721544e-05, "loss": 2.2987302780151366, "step": 44570 }, { "epoch": 12.653988078342321, "grad_norm": 6.723139762878418, "learning_rate": 3.6755606017598635e-05, "loss": 2.1792102813720704, "step": 44580 }, { "epoch": 12.656826568265682, "grad_norm": 6.739110946655273, "learning_rate": 3.6741413567981835e-05, "loss": 2.1770959854125977, "step": 44590 }, { "epoch": 12.659665058189043, "grad_norm": 6.522835731506348, "learning_rate": 3.672722111836503e-05, "loss": 2.166490173339844, "step": 44600 }, { "epoch": 12.662503548112404, "grad_norm": 6.640269756317139, "learning_rate": 3.671302866874823e-05, "loss": 2.2358257293701174, "step": 44610 }, { "epoch": 12.665342038035766, "grad_norm": 6.444235324859619, "learning_rate": 3.669883621913142e-05, "loss": 2.1595001220703125, "step": 44620 }, { "epoch": 12.668180527959127, "grad_norm": 6.5386810302734375, "learning_rate": 3.668464376951462e-05, "loss": 2.1991729736328125, "step": 44630 }, { "epoch": 12.671019017882486, "grad_norm": 6.623223781585693, "learning_rate": 3.6670451319897815e-05, "loss": 2.2295934677124025, "step": 44640 }, { "epoch": 
12.673857507805847, "grad_norm": 6.908699989318848, "learning_rate": 3.6656258870281015e-05, "loss": 2.208292770385742, "step": 44650 }, { "epoch": 12.676695997729208, "grad_norm": 6.390843391418457, "learning_rate": 3.664206642066421e-05, "loss": 2.1462165832519533, "step": 44660 }, { "epoch": 12.67953448765257, "grad_norm": 6.560793876647949, "learning_rate": 3.66278739710474e-05, "loss": 2.2539445877075197, "step": 44670 }, { "epoch": 12.68237297757593, "grad_norm": 6.489957332611084, "learning_rate": 3.6613681521430595e-05, "loss": 2.2307621002197267, "step": 44680 }, { "epoch": 12.68521146749929, "grad_norm": 6.545163154602051, "learning_rate": 3.6599489071813795e-05, "loss": 2.198952484130859, "step": 44690 }, { "epoch": 12.68804995742265, "grad_norm": 6.8439507484436035, "learning_rate": 3.658529662219699e-05, "loss": 2.129464340209961, "step": 44700 }, { "epoch": 12.690888447346012, "grad_norm": 6.564327239990234, "learning_rate": 3.657110417258019e-05, "loss": 2.1522117614746095, "step": 44710 }, { "epoch": 12.693726937269373, "grad_norm": 7.142855644226074, "learning_rate": 3.655691172296338e-05, "loss": 2.203002166748047, "step": 44720 }, { "epoch": 12.696565427192734, "grad_norm": 6.651459693908691, "learning_rate": 3.654271927334658e-05, "loss": 2.198781967163086, "step": 44730 }, { "epoch": 12.699403917116094, "grad_norm": 6.161154270172119, "learning_rate": 3.6528526823729775e-05, "loss": 2.2037458419799805, "step": 44740 }, { "epoch": 12.702242407039455, "grad_norm": 6.422297954559326, "learning_rate": 3.6514334374112975e-05, "loss": 2.166704559326172, "step": 44750 }, { "epoch": 12.705080896962816, "grad_norm": 6.790238857269287, "learning_rate": 3.650014192449617e-05, "loss": 2.2517158508300783, "step": 44760 }, { "epoch": 12.707919386886177, "grad_norm": 6.459599018096924, "learning_rate": 3.648594947487937e-05, "loss": 2.217779541015625, "step": 44770 }, { "epoch": 12.710757876809538, "grad_norm": 6.51823616027832, "learning_rate": 
3.647175702526256e-05, "loss": 2.235859489440918, "step": 44780 }, { "epoch": 12.713596366732897, "grad_norm": 6.787055492401123, "learning_rate": 3.6457564575645756e-05, "loss": 2.1320472717285157, "step": 44790 }, { "epoch": 12.716434856656258, "grad_norm": 6.853501319885254, "learning_rate": 3.644337212602895e-05, "loss": 2.1853769302368162, "step": 44800 }, { "epoch": 12.71927334657962, "grad_norm": 6.575509548187256, "learning_rate": 3.642917967641215e-05, "loss": 2.1779762268066407, "step": 44810 }, { "epoch": 12.72211183650298, "grad_norm": 6.307346820831299, "learning_rate": 3.641498722679534e-05, "loss": 2.177897262573242, "step": 44820 }, { "epoch": 12.724950326426342, "grad_norm": 6.37318229675293, "learning_rate": 3.640079477717854e-05, "loss": 2.159554672241211, "step": 44830 }, { "epoch": 12.727788816349701, "grad_norm": 6.404716491699219, "learning_rate": 3.6386602327561736e-05, "loss": 2.224997329711914, "step": 44840 }, { "epoch": 12.730627306273062, "grad_norm": 6.43831205368042, "learning_rate": 3.6372409877944936e-05, "loss": 2.296098518371582, "step": 44850 }, { "epoch": 12.733465796196423, "grad_norm": 6.698168754577637, "learning_rate": 3.635821742832813e-05, "loss": 2.2300718307495115, "step": 44860 }, { "epoch": 12.736304286119784, "grad_norm": 6.289602279663086, "learning_rate": 3.634402497871133e-05, "loss": 2.2010433197021486, "step": 44870 }, { "epoch": 12.739142776043145, "grad_norm": 6.437556743621826, "learning_rate": 3.632983252909452e-05, "loss": 2.1755081176757813, "step": 44880 }, { "epoch": 12.741981265966507, "grad_norm": 6.386157989501953, "learning_rate": 3.631564007947772e-05, "loss": 2.2317167282104493, "step": 44890 }, { "epoch": 12.744819755889866, "grad_norm": 6.485376358032227, "learning_rate": 3.6301447629860916e-05, "loss": 2.106953239440918, "step": 44900 }, { "epoch": 12.747658245813227, "grad_norm": 7.089203357696533, "learning_rate": 3.628725518024411e-05, "loss": 2.1678977966308595, "step": 44910 }, { "epoch": 
12.750496735736588, "grad_norm": 6.63095235824585, "learning_rate": 3.627306273062731e-05, "loss": 2.1791707992553713, "step": 44920 }, { "epoch": 12.75333522565995, "grad_norm": 6.731418132781982, "learning_rate": 3.62588702810105e-05, "loss": 2.2235134124755858, "step": 44930 }, { "epoch": 12.75617371558331, "grad_norm": 6.599645614624023, "learning_rate": 3.62446778313937e-05, "loss": 2.2193655014038085, "step": 44940 }, { "epoch": 12.75901220550667, "grad_norm": 6.390628814697266, "learning_rate": 3.6230485381776896e-05, "loss": 2.1837167739868164, "step": 44950 }, { "epoch": 12.76185069543003, "grad_norm": 6.670828819274902, "learning_rate": 3.6216292932160096e-05, "loss": 2.131216812133789, "step": 44960 }, { "epoch": 12.764689185353392, "grad_norm": 6.552160263061523, "learning_rate": 3.620210048254329e-05, "loss": 2.193693161010742, "step": 44970 }, { "epoch": 12.767527675276753, "grad_norm": 6.060576438903809, "learning_rate": 3.618790803292649e-05, "loss": 2.164088821411133, "step": 44980 }, { "epoch": 12.770366165200114, "grad_norm": 6.541807174682617, "learning_rate": 3.617371558330968e-05, "loss": 2.2320701599121096, "step": 44990 }, { "epoch": 12.773204655123475, "grad_norm": 6.562819004058838, "learning_rate": 3.615952313369288e-05, "loss": 2.2486953735351562, "step": 45000 }, { "epoch": 12.773204655123475, "eval_accuracy": 0.3338844026196986, "eval_loss": 2.5146548748016357, "eval_runtime": 49.965, "eval_samples_per_second": 314.76, "eval_steps_per_second": 4.923, "step": 45000 }, { "epoch": 12.776043145046835, "grad_norm": 6.6095428466796875, "learning_rate": 3.6145330684076076e-05, "loss": 2.1246002197265623, "step": 45010 }, { "epoch": 12.778881634970196, "grad_norm": 6.076456069946289, "learning_rate": 3.613113823445927e-05, "loss": 2.146302032470703, "step": 45020 }, { "epoch": 12.781720124893557, "grad_norm": 6.58699369430542, "learning_rate": 3.611694578484246e-05, "loss": 2.326860809326172, "step": 45030 }, { "epoch": 12.784558614816918, 
"grad_norm": 6.444716453552246, "learning_rate": 3.610275333522566e-05, "loss": 2.166767120361328, "step": 45040 }, { "epoch": 12.787397104740279, "grad_norm": 6.322332859039307, "learning_rate": 3.6088560885608856e-05, "loss": 2.2011703491210937, "step": 45050 }, { "epoch": 12.790235594663638, "grad_norm": 6.174929618835449, "learning_rate": 3.6074368435992057e-05, "loss": 2.136324882507324, "step": 45060 }, { "epoch": 12.793074084587, "grad_norm": 6.545402526855469, "learning_rate": 3.606017598637525e-05, "loss": 2.2914987564086915, "step": 45070 }, { "epoch": 12.79591257451036, "grad_norm": 6.51641845703125, "learning_rate": 3.604598353675845e-05, "loss": 2.2570474624633787, "step": 45080 }, { "epoch": 12.798751064433722, "grad_norm": 7.085973739624023, "learning_rate": 3.603179108714164e-05, "loss": 2.1744205474853517, "step": 45090 }, { "epoch": 12.801589554357083, "grad_norm": 6.551303386688232, "learning_rate": 3.601759863752484e-05, "loss": 2.195351791381836, "step": 45100 }, { "epoch": 12.804428044280442, "grad_norm": 6.217483043670654, "learning_rate": 3.600340618790804e-05, "loss": 2.2287948608398436, "step": 45110 }, { "epoch": 12.807266534203803, "grad_norm": 6.502246379852295, "learning_rate": 3.598921373829123e-05, "loss": 2.23769474029541, "step": 45120 }, { "epoch": 12.810105024127164, "grad_norm": 6.611891746520996, "learning_rate": 3.597502128867442e-05, "loss": 2.2020542144775392, "step": 45130 }, { "epoch": 12.812943514050525, "grad_norm": 6.5636887550354, "learning_rate": 3.5960828839057623e-05, "loss": 2.212422752380371, "step": 45140 }, { "epoch": 12.815782003973887, "grad_norm": 6.56975793838501, "learning_rate": 3.594663638944082e-05, "loss": 2.232439422607422, "step": 45150 }, { "epoch": 12.818620493897246, "grad_norm": 6.383737087249756, "learning_rate": 3.593244393982402e-05, "loss": 2.2128475189208983, "step": 45160 }, { "epoch": 12.821458983820607, "grad_norm": 6.1288933753967285, "learning_rate": 3.591825149020721e-05, "loss": 
2.1742279052734377, "step": 45170 }, { "epoch": 12.824297473743968, "grad_norm": 6.274975299835205, "learning_rate": 3.590405904059041e-05, "loss": 2.2457319259643556, "step": 45180 }, { "epoch": 12.82713596366733, "grad_norm": 6.567293643951416, "learning_rate": 3.5889866590973604e-05, "loss": 2.2617725372314452, "step": 45190 }, { "epoch": 12.82997445359069, "grad_norm": 6.353239059448242, "learning_rate": 3.5875674141356804e-05, "loss": 2.054848861694336, "step": 45200 }, { "epoch": 12.83281294351405, "grad_norm": 6.292553424835205, "learning_rate": 3.586148169174e-05, "loss": 2.166786956787109, "step": 45210 }, { "epoch": 12.83565143343741, "grad_norm": 6.519330024719238, "learning_rate": 3.58472892421232e-05, "loss": 2.1284252166748048, "step": 45220 }, { "epoch": 12.838489923360772, "grad_norm": 6.697631359100342, "learning_rate": 3.583309679250639e-05, "loss": 2.2046892166137697, "step": 45230 }, { "epoch": 12.841328413284133, "grad_norm": 6.475213527679443, "learning_rate": 3.5818904342889584e-05, "loss": 2.1923311233520506, "step": 45240 }, { "epoch": 12.844166903207494, "grad_norm": 6.888168811798096, "learning_rate": 3.580471189327278e-05, "loss": 2.21466121673584, "step": 45250 }, { "epoch": 12.847005393130855, "grad_norm": 6.753630638122559, "learning_rate": 3.579051944365598e-05, "loss": 2.1357460021972656, "step": 45260 }, { "epoch": 12.849843883054215, "grad_norm": 6.520064830780029, "learning_rate": 3.5777746239000857e-05, "loss": 2.1357465744018556, "step": 45270 }, { "epoch": 12.852682372977576, "grad_norm": 6.379458904266357, "learning_rate": 3.576355378938405e-05, "loss": 2.168916702270508, "step": 45280 }, { "epoch": 12.855520862900937, "grad_norm": 6.79066276550293, "learning_rate": 3.574936133976724e-05, "loss": 2.197993278503418, "step": 45290 }, { "epoch": 12.858359352824298, "grad_norm": 6.343975067138672, "learning_rate": 3.573516889015044e-05, "loss": 2.1878284454345702, "step": 45300 }, { "epoch": 12.861197842747659, "grad_norm": 
6.635082721710205, "learning_rate": 3.572097644053364e-05, "loss": 2.275302696228027, "step": 45310 }, { "epoch": 12.864036332671018, "grad_norm": 6.645702838897705, "learning_rate": 3.570678399091684e-05, "loss": 2.2162546157836913, "step": 45320 }, { "epoch": 12.86687482259438, "grad_norm": 6.547659873962402, "learning_rate": 3.569259154130003e-05, "loss": 2.1873435974121094, "step": 45330 }, { "epoch": 12.86971331251774, "grad_norm": 6.715066432952881, "learning_rate": 3.567839909168323e-05, "loss": 2.221751403808594, "step": 45340 }, { "epoch": 12.872551802441102, "grad_norm": 6.354259014129639, "learning_rate": 3.5664206642066423e-05, "loss": 2.217380142211914, "step": 45350 }, { "epoch": 12.875390292364463, "grad_norm": 6.414977550506592, "learning_rate": 3.5650014192449624e-05, "loss": 2.1739971160888674, "step": 45360 }, { "epoch": 12.878228782287822, "grad_norm": 6.315077781677246, "learning_rate": 3.563582174283282e-05, "loss": 2.140269470214844, "step": 45370 }, { "epoch": 12.881067272211183, "grad_norm": 6.331647872924805, "learning_rate": 3.562162929321601e-05, "loss": 2.174962043762207, "step": 45380 }, { "epoch": 12.883905762134544, "grad_norm": 6.6448588371276855, "learning_rate": 3.5607436843599204e-05, "loss": 2.192513275146484, "step": 45390 }, { "epoch": 12.886744252057905, "grad_norm": 6.748722553253174, "learning_rate": 3.5593244393982404e-05, "loss": 2.2121608734130858, "step": 45400 }, { "epoch": 12.889582741981267, "grad_norm": 6.730984210968018, "learning_rate": 3.55790519443656e-05, "loss": 2.1549074172973635, "step": 45410 }, { "epoch": 12.892421231904628, "grad_norm": 6.6597161293029785, "learning_rate": 3.55648594947488e-05, "loss": 2.221090316772461, "step": 45420 }, { "epoch": 12.895259721827987, "grad_norm": 6.367534160614014, "learning_rate": 3.555066704513199e-05, "loss": 2.186677169799805, "step": 45430 }, { "epoch": 12.898098211751348, "grad_norm": 6.464701175689697, "learning_rate": 3.553647459551519e-05, "loss": 
2.190470886230469, "step": 45440 }, { "epoch": 12.90093670167471, "grad_norm": 6.249189376831055, "learning_rate": 3.5522282145898384e-05, "loss": 2.2433355331420897, "step": 45450 }, { "epoch": 12.90377519159807, "grad_norm": 7.161572456359863, "learning_rate": 3.5508089696281584e-05, "loss": 2.198392868041992, "step": 45460 }, { "epoch": 12.906613681521431, "grad_norm": 6.935003280639648, "learning_rate": 3.549389724666478e-05, "loss": 2.240024375915527, "step": 45470 }, { "epoch": 12.90945217144479, "grad_norm": 6.52086067199707, "learning_rate": 3.547970479704798e-05, "loss": 2.090396499633789, "step": 45480 }, { "epoch": 12.912290661368152, "grad_norm": 6.538590431213379, "learning_rate": 3.546551234743117e-05, "loss": 2.229671859741211, "step": 45490 }, { "epoch": 12.915129151291513, "grad_norm": 6.729950904846191, "learning_rate": 3.5451319897814364e-05, "loss": 2.1795358657836914, "step": 45500 }, { "epoch": 12.915129151291513, "eval_accuracy": 0.3354740255611369, "eval_loss": 2.5114974975585938, "eval_runtime": 48.8664, "eval_samples_per_second": 321.837, "eval_steps_per_second": 5.034, "step": 45500 }, { "epoch": 12.917967641214874, "grad_norm": 6.966878414154053, "learning_rate": 3.543712744819756e-05, "loss": 2.240615463256836, "step": 45510 }, { "epoch": 12.920806131138235, "grad_norm": 6.276447772979736, "learning_rate": 3.542293499858076e-05, "loss": 2.1945369720458983, "step": 45520 }, { "epoch": 12.923644621061595, "grad_norm": 6.4945454597473145, "learning_rate": 3.540874254896395e-05, "loss": 2.22955322265625, "step": 45530 }, { "epoch": 12.926483110984956, "grad_norm": 6.719387054443359, "learning_rate": 3.539455009934715e-05, "loss": 2.2405960083007814, "step": 45540 }, { "epoch": 12.929321600908317, "grad_norm": 6.4910149574279785, "learning_rate": 3.5380357649730344e-05, "loss": 2.2019012451171873, "step": 45550 }, { "epoch": 12.932160090831678, "grad_norm": 6.642837047576904, "learning_rate": 3.5366165200113544e-05, "loss": 
2.192341995239258, "step": 45560 }, { "epoch": 12.934998580755039, "grad_norm": 6.291227340698242, "learning_rate": 3.535197275049674e-05, "loss": 2.1935115814208985, "step": 45570 }, { "epoch": 12.937837070678398, "grad_norm": 6.827078819274902, "learning_rate": 3.533778030087994e-05, "loss": 2.239186096191406, "step": 45580 }, { "epoch": 12.94067556060176, "grad_norm": 6.53040885925293, "learning_rate": 3.532358785126313e-05, "loss": 2.1717084884643554, "step": 45590 }, { "epoch": 12.94351405052512, "grad_norm": 6.510026454925537, "learning_rate": 3.530939540164633e-05, "loss": 2.1676729202270506, "step": 45600 }, { "epoch": 12.946352540448482, "grad_norm": 6.586895942687988, "learning_rate": 3.5295202952029524e-05, "loss": 2.2329471588134764, "step": 45610 }, { "epoch": 12.949191030371843, "grad_norm": 6.6494574546813965, "learning_rate": 3.528101050241272e-05, "loss": 2.2086503982543944, "step": 45620 }, { "epoch": 12.952029520295202, "grad_norm": 6.458803176879883, "learning_rate": 3.526681805279591e-05, "loss": 2.1765300750732424, "step": 45630 }, { "epoch": 12.954868010218563, "grad_norm": 6.373988628387451, "learning_rate": 3.525262560317911e-05, "loss": 2.2320672988891603, "step": 45640 }, { "epoch": 12.957706500141924, "grad_norm": 6.971217155456543, "learning_rate": 3.5238433153562304e-05, "loss": 2.247076988220215, "step": 45650 }, { "epoch": 12.960544990065285, "grad_norm": 7.04914665222168, "learning_rate": 3.5224240703945505e-05, "loss": 2.245303726196289, "step": 45660 }, { "epoch": 12.963383479988646, "grad_norm": 6.413506507873535, "learning_rate": 3.52100482543287e-05, "loss": 2.1742088317871096, "step": 45670 }, { "epoch": 12.966221969912008, "grad_norm": 6.153796672821045, "learning_rate": 3.51958558047119e-05, "loss": 2.195536804199219, "step": 45680 }, { "epoch": 12.969060459835367, "grad_norm": 6.679362773895264, "learning_rate": 3.518166335509509e-05, "loss": 2.2710445404052733, "step": 45690 }, { "epoch": 12.971898949758728, "grad_norm": 
6.274446964263916, "learning_rate": 3.516747090547829e-05, "loss": 2.2287660598754884, "step": 45700 }, { "epoch": 12.97473743968209, "grad_norm": 6.983373641967773, "learning_rate": 3.5153278455861485e-05, "loss": 2.182663154602051, "step": 45710 }, { "epoch": 12.97757592960545, "grad_norm": 6.778936862945557, "learning_rate": 3.5139086006244685e-05, "loss": 2.2303062438964845, "step": 45720 }, { "epoch": 12.980414419528811, "grad_norm": 6.543891429901123, "learning_rate": 3.512489355662788e-05, "loss": 2.1612258911132813, "step": 45730 }, { "epoch": 12.98325290945217, "grad_norm": 5.958889007568359, "learning_rate": 3.511070110701107e-05, "loss": 2.108611297607422, "step": 45740 }, { "epoch": 12.986091399375532, "grad_norm": 6.7949018478393555, "learning_rate": 3.5096508657394265e-05, "loss": 2.168751525878906, "step": 45750 }, { "epoch": 12.988929889298893, "grad_norm": 6.356585502624512, "learning_rate": 3.5082316207777465e-05, "loss": 2.1924249649047853, "step": 45760 }, { "epoch": 12.991768379222254, "grad_norm": 6.397174835205078, "learning_rate": 3.506812375816066e-05, "loss": 2.1902515411376955, "step": 45770 }, { "epoch": 12.994606869145615, "grad_norm": 6.49183464050293, "learning_rate": 3.505393130854386e-05, "loss": 2.1179794311523437, "step": 45780 }, { "epoch": 12.997445359068976, "grad_norm": 6.693378448486328, "learning_rate": 3.503973885892705e-05, "loss": 2.280075454711914, "step": 45790 }, { "epoch": 13.000283848992336, "grad_norm": 6.526815414428711, "learning_rate": 3.502554640931025e-05, "loss": 2.1938119888305665, "step": 45800 }, { "epoch": 13.003122338915697, "grad_norm": 6.575026512145996, "learning_rate": 3.5011353959693445e-05, "loss": 2.2004146575927734, "step": 45810 }, { "epoch": 13.005960828839058, "grad_norm": 6.4064531326293945, "learning_rate": 3.4997161510076645e-05, "loss": 2.1969173431396483, "step": 45820 }, { "epoch": 13.008799318762419, "grad_norm": 6.831264019012451, "learning_rate": 3.498296906045984e-05, "loss": 
2.1902027130126953, "step": 45830 }, { "epoch": 13.01163780868578, "grad_norm": 6.533995628356934, "learning_rate": 3.496877661084303e-05, "loss": 2.2164894104003907, "step": 45840 }, { "epoch": 13.01447629860914, "grad_norm": 6.857189178466797, "learning_rate": 3.4954584161226225e-05, "loss": 2.240328788757324, "step": 45850 }, { "epoch": 13.0173147885325, "grad_norm": 6.804120063781738, "learning_rate": 3.4940391711609425e-05, "loss": 2.169051742553711, "step": 45860 }, { "epoch": 13.020153278455862, "grad_norm": 6.739519119262695, "learning_rate": 3.492619926199262e-05, "loss": 2.1779041290283203, "step": 45870 }, { "epoch": 13.022991768379223, "grad_norm": 6.576704502105713, "learning_rate": 3.491200681237582e-05, "loss": 2.1827823638916017, "step": 45880 }, { "epoch": 13.025830258302584, "grad_norm": 6.136642932891846, "learning_rate": 3.489781436275901e-05, "loss": 2.176346206665039, "step": 45890 }, { "epoch": 13.028668748225943, "grad_norm": 6.673829078674316, "learning_rate": 3.488362191314221e-05, "loss": 2.1980152130126953, "step": 45900 }, { "epoch": 13.031507238149304, "grad_norm": 6.539431095123291, "learning_rate": 3.4869429463525405e-05, "loss": 2.167091178894043, "step": 45910 }, { "epoch": 13.034345728072665, "grad_norm": 6.460318088531494, "learning_rate": 3.4855237013908605e-05, "loss": 2.1917888641357424, "step": 45920 }, { "epoch": 13.037184217996026, "grad_norm": 6.5593953132629395, "learning_rate": 3.48410445642918e-05, "loss": 2.2007802963256835, "step": 45930 }, { "epoch": 13.040022707919388, "grad_norm": 6.421267509460449, "learning_rate": 3.4826852114675e-05, "loss": 2.096022033691406, "step": 45940 }, { "epoch": 13.042861197842747, "grad_norm": 7.128078460693359, "learning_rate": 3.481265966505819e-05, "loss": 2.1872623443603514, "step": 45950 }, { "epoch": 13.045699687766108, "grad_norm": 6.4883198738098145, "learning_rate": 3.4798467215441385e-05, "loss": 2.1959266662597656, "step": 45960 }, { "epoch": 13.048538177689469, "grad_norm": 
6.611191749572754, "learning_rate": 3.478427476582458e-05, "loss": 2.159342384338379, "step": 45970 }, { "epoch": 13.05137666761283, "grad_norm": 6.353420257568359, "learning_rate": 3.477008231620778e-05, "loss": 2.2119165420532227, "step": 45980 }, { "epoch": 13.054215157536191, "grad_norm": 6.724706649780273, "learning_rate": 3.475588986659097e-05, "loss": 2.211736297607422, "step": 45990 }, { "epoch": 13.05705364745955, "grad_norm": 6.556138515472412, "learning_rate": 3.474169741697417e-05, "loss": 2.230623245239258, "step": 46000 }, { "epoch": 13.05705364745955, "eval_accuracy": 0.3350925160551917, "eval_loss": 2.507835865020752, "eval_runtime": 50.0531, "eval_samples_per_second": 314.206, "eval_steps_per_second": 4.915, "step": 46000 }, { "epoch": 13.059892137382912, "grad_norm": 6.8066887855529785, "learning_rate": 3.4727504967357366e-05, "loss": 2.1927318572998047, "step": 46010 }, { "epoch": 13.062730627306273, "grad_norm": 6.62333869934082, "learning_rate": 3.4713312517740566e-05, "loss": 2.248439979553223, "step": 46020 }, { "epoch": 13.065569117229634, "grad_norm": 6.79385232925415, "learning_rate": 3.469912006812376e-05, "loss": 2.2027585983276365, "step": 46030 }, { "epoch": 13.068407607152995, "grad_norm": 6.739339351654053, "learning_rate": 3.468492761850696e-05, "loss": 2.138899040222168, "step": 46040 }, { "epoch": 13.071246097076356, "grad_norm": 6.52704381942749, "learning_rate": 3.467073516889015e-05, "loss": 2.17966251373291, "step": 46050 }, { "epoch": 13.074084586999716, "grad_norm": 6.372169017791748, "learning_rate": 3.465654271927335e-05, "loss": 2.0949380874633787, "step": 46060 }, { "epoch": 13.076923076923077, "grad_norm": 6.465529441833496, "learning_rate": 3.4642350269656546e-05, "loss": 2.1697126388549806, "step": 46070 }, { "epoch": 13.079761566846438, "grad_norm": 6.933211803436279, "learning_rate": 3.462815782003974e-05, "loss": 2.1735706329345703, "step": 46080 }, { "epoch": 13.082600056769799, "grad_norm": 6.140178203582764, 
"learning_rate": 3.461396537042293e-05, "loss": 2.1889162063598633, "step": 46090 }, { "epoch": 13.08543854669316, "grad_norm": 6.5215559005737305, "learning_rate": 3.459977292080613e-05, "loss": 2.1896120071411134, "step": 46100 }, { "epoch": 13.08827703661652, "grad_norm": 6.586458206176758, "learning_rate": 3.4585580471189326e-05, "loss": 2.147872543334961, "step": 46110 }, { "epoch": 13.09111552653988, "grad_norm": 6.7544355392456055, "learning_rate": 3.4571388021572526e-05, "loss": 2.1854225158691407, "step": 46120 }, { "epoch": 13.093954016463242, "grad_norm": 6.468243598937988, "learning_rate": 3.455719557195572e-05, "loss": 2.1316091537475588, "step": 46130 }, { "epoch": 13.096792506386603, "grad_norm": 6.2932634353637695, "learning_rate": 3.454300312233892e-05, "loss": 2.169059944152832, "step": 46140 }, { "epoch": 13.099630996309964, "grad_norm": 6.268207550048828, "learning_rate": 3.452881067272211e-05, "loss": 2.1610929489135744, "step": 46150 }, { "epoch": 13.102469486233323, "grad_norm": 6.578137397766113, "learning_rate": 3.451461822310531e-05, "loss": 2.1634584426879884, "step": 46160 }, { "epoch": 13.105307976156684, "grad_norm": 6.476487159729004, "learning_rate": 3.4500425773488506e-05, "loss": 2.174164962768555, "step": 46170 }, { "epoch": 13.108146466080045, "grad_norm": 6.417506217956543, "learning_rate": 3.44862333238717e-05, "loss": 2.177842140197754, "step": 46180 }, { "epoch": 13.110984956003406, "grad_norm": 6.864022731781006, "learning_rate": 3.447204087425489e-05, "loss": 2.2244075775146483, "step": 46190 }, { "epoch": 13.113823445926768, "grad_norm": 6.664789199829102, "learning_rate": 3.445784842463809e-05, "loss": 2.174199676513672, "step": 46200 }, { "epoch": 13.116661935850127, "grad_norm": 6.9197998046875, "learning_rate": 3.4443655975021286e-05, "loss": 2.2464237213134766, "step": 46210 }, { "epoch": 13.119500425773488, "grad_norm": 6.700075626373291, "learning_rate": 3.4429463525404486e-05, "loss": 2.168397331237793, "step": 
46220 }, { "epoch": 13.122338915696849, "grad_norm": 6.363243103027344, "learning_rate": 3.441527107578768e-05, "loss": 2.2056276321411135, "step": 46230 }, { "epoch": 13.12517740562021, "grad_norm": 6.888374328613281, "learning_rate": 3.440107862617088e-05, "loss": 2.2258752822875976, "step": 46240 }, { "epoch": 13.128015895543571, "grad_norm": 6.668290615081787, "learning_rate": 3.438688617655407e-05, "loss": 2.1395034790039062, "step": 46250 }, { "epoch": 13.130854385466932, "grad_norm": 6.311073303222656, "learning_rate": 3.437269372693727e-05, "loss": 2.1916986465454102, "step": 46260 }, { "epoch": 13.133692875390292, "grad_norm": 6.560551643371582, "learning_rate": 3.4358501277320467e-05, "loss": 2.213251495361328, "step": 46270 }, { "epoch": 13.136531365313653, "grad_norm": 6.716089248657227, "learning_rate": 3.4344308827703667e-05, "loss": 2.2135942459106444, "step": 46280 }, { "epoch": 13.139369855237014, "grad_norm": 6.409336566925049, "learning_rate": 3.433011637808686e-05, "loss": 2.1520740509033205, "step": 46290 }, { "epoch": 13.142208345160375, "grad_norm": 6.648159980773926, "learning_rate": 3.431592392847005e-05, "loss": 2.187992286682129, "step": 46300 }, { "epoch": 13.145046835083736, "grad_norm": 6.630763053894043, "learning_rate": 3.4301731478853247e-05, "loss": 2.127132797241211, "step": 46310 }, { "epoch": 13.147885325007096, "grad_norm": 6.823381423950195, "learning_rate": 3.428753902923645e-05, "loss": 2.154850387573242, "step": 46320 }, { "epoch": 13.150723814930457, "grad_norm": 6.695974349975586, "learning_rate": 3.427334657961964e-05, "loss": 2.2438426971435548, "step": 46330 }, { "epoch": 13.153562304853818, "grad_norm": 6.439967155456543, "learning_rate": 3.425915413000284e-05, "loss": 2.1426151275634764, "step": 46340 }, { "epoch": 13.156400794777179, "grad_norm": 6.026881217956543, "learning_rate": 3.4244961680386033e-05, "loss": 2.1531877517700195, "step": 46350 }, { "epoch": 13.15923928470054, "grad_norm": 7.045583248138428, 
"learning_rate": 3.4230769230769234e-05, "loss": 2.113417053222656, "step": 46360 }, { "epoch": 13.1620777746239, "grad_norm": 6.548264503479004, "learning_rate": 3.421657678115243e-05, "loss": 2.1964555740356446, "step": 46370 }, { "epoch": 13.16491626454726, "grad_norm": 6.388326644897461, "learning_rate": 3.420238433153563e-05, "loss": 2.1409580230712892, "step": 46380 }, { "epoch": 13.167754754470621, "grad_norm": 6.691824436187744, "learning_rate": 3.418819188191882e-05, "loss": 2.188113784790039, "step": 46390 }, { "epoch": 13.170593244393983, "grad_norm": 6.7783026695251465, "learning_rate": 3.417399943230202e-05, "loss": 2.176466369628906, "step": 46400 }, { "epoch": 13.173431734317344, "grad_norm": 6.560814380645752, "learning_rate": 3.4159806982685214e-05, "loss": 2.2012960433959963, "step": 46410 }, { "epoch": 13.176270224240703, "grad_norm": 6.613675117492676, "learning_rate": 3.414561453306841e-05, "loss": 2.2199886322021483, "step": 46420 }, { "epoch": 13.179108714164064, "grad_norm": 6.285987854003906, "learning_rate": 3.41314220834516e-05, "loss": 2.1488004684448243, "step": 46430 }, { "epoch": 13.181947204087425, "grad_norm": 6.585273742675781, "learning_rate": 3.41172296338348e-05, "loss": 2.128682518005371, "step": 46440 }, { "epoch": 13.184785694010786, "grad_norm": 6.669427871704102, "learning_rate": 3.4103037184217994e-05, "loss": 2.1744056701660157, "step": 46450 }, { "epoch": 13.187624183934147, "grad_norm": 6.3247761726379395, "learning_rate": 3.4088844734601194e-05, "loss": 2.1635931015014647, "step": 46460 }, { "epoch": 13.190462673857509, "grad_norm": 6.684354305267334, "learning_rate": 3.407465228498439e-05, "loss": 2.1254638671875, "step": 46470 }, { "epoch": 13.193301163780868, "grad_norm": 6.991819381713867, "learning_rate": 3.406045983536759e-05, "loss": 2.1938385009765624, "step": 46480 }, { "epoch": 13.196139653704229, "grad_norm": 7.008077144622803, "learning_rate": 3.404626738575078e-05, "loss": 2.18090877532959, "step": 46490 
}, { "epoch": 13.19897814362759, "grad_norm": 6.689757823944092, "learning_rate": 3.403207493613398e-05, "loss": 2.153369331359863, "step": 46500 }, { "epoch": 13.19897814362759, "eval_accuracy": 0.3352832708081643, "eval_loss": 2.5061275959014893, "eval_runtime": 48.0219, "eval_samples_per_second": 327.496, "eval_steps_per_second": 5.123, "step": 46500 }, { "epoch": 13.201816633550951, "grad_norm": 6.383524417877197, "learning_rate": 3.401930173147885e-05, "loss": 2.231319236755371, "step": 46510 }, { "epoch": 13.204655123474312, "grad_norm": 6.799488544464111, "learning_rate": 3.4005109281862053e-05, "loss": 2.14512939453125, "step": 46520 }, { "epoch": 13.207493613397672, "grad_norm": 6.328771114349365, "learning_rate": 3.399091683224525e-05, "loss": 2.1839582443237306, "step": 46530 }, { "epoch": 13.210332103321033, "grad_norm": 6.590666770935059, "learning_rate": 3.397672438262845e-05, "loss": 2.206013488769531, "step": 46540 }, { "epoch": 13.213170593244394, "grad_norm": 6.562514305114746, "learning_rate": 3.396253193301164e-05, "loss": 2.199308395385742, "step": 46550 }, { "epoch": 13.216009083167755, "grad_norm": 6.392251491546631, "learning_rate": 3.3948339483394833e-05, "loss": 2.2065210342407227, "step": 46560 }, { "epoch": 13.218847573091116, "grad_norm": 6.3407111167907715, "learning_rate": 3.393414703377803e-05, "loss": 2.1385776519775392, "step": 46570 }, { "epoch": 13.221686063014475, "grad_norm": 7.1117634773254395, "learning_rate": 3.391995458416123e-05, "loss": 2.1744489669799805, "step": 46580 }, { "epoch": 13.224524552937837, "grad_norm": 6.263020992279053, "learning_rate": 3.390576213454442e-05, "loss": 2.119948959350586, "step": 46590 }, { "epoch": 13.227363042861198, "grad_norm": 6.605441093444824, "learning_rate": 3.389156968492762e-05, "loss": 2.176411819458008, "step": 46600 }, { "epoch": 13.230201532784559, "grad_norm": 6.333186626434326, "learning_rate": 3.3877377235310814e-05, "loss": 2.2051143646240234, "step": 46610 }, { "epoch": 
13.23304002270792, "grad_norm": 6.441667079925537, "learning_rate": 3.3863184785694014e-05, "loss": 2.13922119140625, "step": 46620 }, { "epoch": 13.235878512631281, "grad_norm": 6.7373046875, "learning_rate": 3.384899233607721e-05, "loss": 2.196026420593262, "step": 46630 }, { "epoch": 13.23871700255464, "grad_norm": 6.849289894104004, "learning_rate": 3.383479988646041e-05, "loss": 2.166085052490234, "step": 46640 }, { "epoch": 13.241555492478001, "grad_norm": 6.597332000732422, "learning_rate": 3.38206074368436e-05, "loss": 2.1625844955444338, "step": 46650 }, { "epoch": 13.244393982401363, "grad_norm": 6.652769088745117, "learning_rate": 3.38064149872268e-05, "loss": 2.1690095901489257, "step": 46660 }, { "epoch": 13.247232472324724, "grad_norm": 6.526604652404785, "learning_rate": 3.3792222537609994e-05, "loss": 2.28118782043457, "step": 46670 }, { "epoch": 13.250070962248085, "grad_norm": 6.897037506103516, "learning_rate": 3.377803008799319e-05, "loss": 2.2520286560058596, "step": 46680 }, { "epoch": 13.252909452171444, "grad_norm": 6.326217174530029, "learning_rate": 3.376383763837638e-05, "loss": 2.1807369232177733, "step": 46690 }, { "epoch": 13.255747942094805, "grad_norm": 6.411378383636475, "learning_rate": 3.374964518875958e-05, "loss": 2.229646682739258, "step": 46700 }, { "epoch": 13.258586432018166, "grad_norm": 6.465052604675293, "learning_rate": 3.3735452739142774e-05, "loss": 2.1699121475219725, "step": 46710 }, { "epoch": 13.261424921941527, "grad_norm": 7.039261817932129, "learning_rate": 3.3721260289525974e-05, "loss": 2.2286636352539064, "step": 46720 }, { "epoch": 13.264263411864889, "grad_norm": 6.443538665771484, "learning_rate": 3.370706783990917e-05, "loss": 2.20108585357666, "step": 46730 }, { "epoch": 13.267101901788248, "grad_norm": 6.347054481506348, "learning_rate": 3.369287539029237e-05, "loss": 2.1103755950927736, "step": 46740 }, { "epoch": 13.269940391711609, "grad_norm": 6.211337566375732, "learning_rate": 
3.367868294067556e-05, "loss": 2.215713882446289, "step": 46750 }, { "epoch": 13.27277888163497, "grad_norm": 6.636508941650391, "learning_rate": 3.366449049105876e-05, "loss": 2.2104963302612304, "step": 46760 }, { "epoch": 13.275617371558331, "grad_norm": 6.466491222381592, "learning_rate": 3.3650298041441954e-05, "loss": 2.242578125, "step": 46770 }, { "epoch": 13.278455861481692, "grad_norm": 6.513864517211914, "learning_rate": 3.3636105591825154e-05, "loss": 2.173918533325195, "step": 46780 }, { "epoch": 13.281294351405052, "grad_norm": 6.360776424407959, "learning_rate": 3.362191314220835e-05, "loss": 2.2295761108398438, "step": 46790 }, { "epoch": 13.284132841328413, "grad_norm": 6.561796188354492, "learning_rate": 3.360772069259154e-05, "loss": 2.2423465728759764, "step": 46800 }, { "epoch": 13.286971331251774, "grad_norm": 6.4960036277771, "learning_rate": 3.3593528242974734e-05, "loss": 2.188142204284668, "step": 46810 }, { "epoch": 13.289809821175135, "grad_norm": 6.638722896575928, "learning_rate": 3.3579335793357934e-05, "loss": 2.1747787475585936, "step": 46820 }, { "epoch": 13.292648311098496, "grad_norm": 6.427298545837402, "learning_rate": 3.356514334374113e-05, "loss": 2.1274972915649415, "step": 46830 }, { "epoch": 13.295486801021857, "grad_norm": 6.661983013153076, "learning_rate": 3.355095089412433e-05, "loss": 2.2291101455688476, "step": 46840 }, { "epoch": 13.298325290945217, "grad_norm": 6.571266174316406, "learning_rate": 3.353675844450752e-05, "loss": 2.115439605712891, "step": 46850 }, { "epoch": 13.301163780868578, "grad_norm": 6.6725335121154785, "learning_rate": 3.352256599489072e-05, "loss": 2.1409610748291015, "step": 46860 }, { "epoch": 13.304002270791939, "grad_norm": 6.6383771896362305, "learning_rate": 3.3508373545273914e-05, "loss": 2.194683074951172, "step": 46870 }, { "epoch": 13.3068407607153, "grad_norm": 6.330974102020264, "learning_rate": 3.3494181095657115e-05, "loss": 2.1786766052246094, "step": 46880 }, { "epoch": 
13.309679250638661, "grad_norm": 6.663891792297363, "learning_rate": 3.347998864604031e-05, "loss": 2.1306596755981446, "step": 46890 }, { "epoch": 13.31251774056202, "grad_norm": 6.330613613128662, "learning_rate": 3.34657961964235e-05, "loss": 2.1458139419555664, "step": 46900 }, { "epoch": 13.315356230485381, "grad_norm": 6.617608547210693, "learning_rate": 3.3451603746806695e-05, "loss": 2.170302963256836, "step": 46910 }, { "epoch": 13.318194720408743, "grad_norm": 6.611360549926758, "learning_rate": 3.3437411297189895e-05, "loss": 2.2006845474243164, "step": 46920 }, { "epoch": 13.321033210332104, "grad_norm": 6.67103910446167, "learning_rate": 3.342321884757309e-05, "loss": 2.1919870376586914, "step": 46930 }, { "epoch": 13.323871700255465, "grad_norm": 6.849035263061523, "learning_rate": 3.340902639795629e-05, "loss": 2.2134090423583985, "step": 46940 }, { "epoch": 13.326710190178824, "grad_norm": 6.737448215484619, "learning_rate": 3.339483394833948e-05, "loss": 2.2419851303100584, "step": 46950 }, { "epoch": 13.329548680102185, "grad_norm": 6.571868419647217, "learning_rate": 3.338064149872268e-05, "loss": 2.225188446044922, "step": 46960 }, { "epoch": 13.332387170025546, "grad_norm": 6.8283538818359375, "learning_rate": 3.3366449049105875e-05, "loss": 2.1560731887817384, "step": 46970 }, { "epoch": 13.335225659948907, "grad_norm": 6.581540584564209, "learning_rate": 3.3352256599489075e-05, "loss": 2.1803657531738283, "step": 46980 }, { "epoch": 13.338064149872269, "grad_norm": 6.455472946166992, "learning_rate": 3.333806414987227e-05, "loss": 2.198296546936035, "step": 46990 }, { "epoch": 13.34090263979563, "grad_norm": 6.261379241943359, "learning_rate": 3.332387170025547e-05, "loss": 2.1439725875854494, "step": 47000 }, { "epoch": 13.34090263979563, "eval_accuracy": 0.3358555350670821, "eval_loss": 2.5012993812561035, "eval_runtime": 49.5337, "eval_samples_per_second": 317.501, "eval_steps_per_second": 4.966, "step": 47000 }, { "epoch": 
13.343741129718989, "grad_norm": 6.442269802093506, "learning_rate": 3.330967925063866e-05, "loss": 2.1591495513916015, "step": 47010 }, { "epoch": 13.34657961964235, "grad_norm": 6.416388034820557, "learning_rate": 3.3295486801021855e-05, "loss": 2.157460594177246, "step": 47020 }, { "epoch": 13.349418109565711, "grad_norm": 6.801124095916748, "learning_rate": 3.3281294351405055e-05, "loss": 2.2315109252929686, "step": 47030 }, { "epoch": 13.352256599489072, "grad_norm": 6.543035507202148, "learning_rate": 3.326710190178825e-05, "loss": 2.1729547500610353, "step": 47040 }, { "epoch": 13.355095089412433, "grad_norm": 6.3281731605529785, "learning_rate": 3.325290945217145e-05, "loss": 2.2281509399414063, "step": 47050 }, { "epoch": 13.357933579335793, "grad_norm": 6.424380779266357, "learning_rate": 3.323871700255464e-05, "loss": 2.2176212310791015, "step": 47060 }, { "epoch": 13.360772069259154, "grad_norm": 6.633056640625, "learning_rate": 3.322452455293784e-05, "loss": 2.1811878204345705, "step": 47070 }, { "epoch": 13.363610559182515, "grad_norm": 6.757806301116943, "learning_rate": 3.3210332103321035e-05, "loss": 2.206276702880859, "step": 47080 }, { "epoch": 13.366449049105876, "grad_norm": 6.187034606933594, "learning_rate": 3.3196139653704235e-05, "loss": 2.142582130432129, "step": 47090 }, { "epoch": 13.369287539029237, "grad_norm": 6.463987350463867, "learning_rate": 3.318194720408743e-05, "loss": 2.1790887832641603, "step": 47100 }, { "epoch": 13.372126028952596, "grad_norm": 6.9053425788879395, "learning_rate": 3.316775475447063e-05, "loss": 2.142988395690918, "step": 47110 }, { "epoch": 13.374964518875958, "grad_norm": 6.686795711517334, "learning_rate": 3.315356230485382e-05, "loss": 2.1745044708251955, "step": 47120 }, { "epoch": 13.377803008799319, "grad_norm": 6.131415367126465, "learning_rate": 3.3139369855237015e-05, "loss": 2.1119668960571287, "step": 47130 }, { "epoch": 13.38064149872268, "grad_norm": 6.782320499420166, "learning_rate": 
3.312517740562021e-05, "loss": 2.0766429901123047, "step": 47140 }, { "epoch": 13.383479988646041, "grad_norm": 6.524796009063721, "learning_rate": 3.311098495600341e-05, "loss": 2.1490623474121096, "step": 47150 }, { "epoch": 13.3863184785694, "grad_norm": 6.905490875244141, "learning_rate": 3.30967925063866e-05, "loss": 2.2538288116455076, "step": 47160 }, { "epoch": 13.389156968492761, "grad_norm": 6.325107574462891, "learning_rate": 3.30826000567698e-05, "loss": 2.1536815643310545, "step": 47170 }, { "epoch": 13.391995458416122, "grad_norm": 6.637567043304443, "learning_rate": 3.3068407607152996e-05, "loss": 2.2484867095947267, "step": 47180 }, { "epoch": 13.394833948339484, "grad_norm": 6.487167835235596, "learning_rate": 3.3054215157536196e-05, "loss": 2.1166162490844727, "step": 47190 }, { "epoch": 13.397672438262845, "grad_norm": 6.605711936950684, "learning_rate": 3.304002270791939e-05, "loss": 2.1376855850219725, "step": 47200 }, { "epoch": 13.400510928186204, "grad_norm": 6.495489120483398, "learning_rate": 3.302583025830259e-05, "loss": 2.139403533935547, "step": 47210 }, { "epoch": 13.403349418109565, "grad_norm": 6.654447078704834, "learning_rate": 3.301163780868578e-05, "loss": 2.2074617385864257, "step": 47220 }, { "epoch": 13.406187908032926, "grad_norm": 6.477313995361328, "learning_rate": 3.299744535906898e-05, "loss": 2.1627008438110353, "step": 47230 }, { "epoch": 13.409026397956287, "grad_norm": 6.239865303039551, "learning_rate": 3.2983252909452176e-05, "loss": 2.177963066101074, "step": 47240 }, { "epoch": 13.411864887879648, "grad_norm": 6.5208306312561035, "learning_rate": 3.296906045983537e-05, "loss": 2.1899349212646486, "step": 47250 }, { "epoch": 13.41470337780301, "grad_norm": 6.47200345993042, "learning_rate": 3.295486801021856e-05, "loss": 2.1343849182128904, "step": 47260 }, { "epoch": 13.417541867726369, "grad_norm": 6.14863395690918, "learning_rate": 3.294067556060176e-05, "loss": 2.1559391021728516, "step": 47270 }, { "epoch": 
13.42038035764973, "grad_norm": 6.636410236358643, "learning_rate": 3.2926483110984956e-05, "loss": 2.16552848815918, "step": 47280 }, { "epoch": 13.423218847573091, "grad_norm": 6.198687553405762, "learning_rate": 3.2912290661368156e-05, "loss": 2.1755834579467774, "step": 47290 }, { "epoch": 13.426057337496452, "grad_norm": 6.495841026306152, "learning_rate": 3.289809821175135e-05, "loss": 2.190612030029297, "step": 47300 }, { "epoch": 13.428895827419813, "grad_norm": 6.710244178771973, "learning_rate": 3.288390576213455e-05, "loss": 2.1329341888427735, "step": 47310 }, { "epoch": 13.431734317343173, "grad_norm": 6.508487224578857, "learning_rate": 3.286971331251774e-05, "loss": 2.1722822189331055, "step": 47320 }, { "epoch": 13.434572807266534, "grad_norm": 6.163900375366211, "learning_rate": 3.285552086290094e-05, "loss": 2.132175636291504, "step": 47330 }, { "epoch": 13.437411297189895, "grad_norm": 6.212059497833252, "learning_rate": 3.2841328413284136e-05, "loss": 2.138621520996094, "step": 47340 }, { "epoch": 13.440249787113256, "grad_norm": 6.6300272941589355, "learning_rate": 3.282713596366733e-05, "loss": 2.1204292297363283, "step": 47350 }, { "epoch": 13.443088277036617, "grad_norm": 6.344503879547119, "learning_rate": 3.281294351405052e-05, "loss": 2.119084930419922, "step": 47360 }, { "epoch": 13.445926766959976, "grad_norm": 6.606950759887695, "learning_rate": 3.279875106443372e-05, "loss": 2.2081392288208006, "step": 47370 }, { "epoch": 13.448765256883338, "grad_norm": 6.4512553215026855, "learning_rate": 3.2784558614816916e-05, "loss": 2.12890510559082, "step": 47380 }, { "epoch": 13.451603746806699, "grad_norm": 6.79926872253418, "learning_rate": 3.2770366165200116e-05, "loss": 2.0670055389404296, "step": 47390 }, { "epoch": 13.45444223673006, "grad_norm": 6.291860103607178, "learning_rate": 3.275617371558331e-05, "loss": 2.1893186569213867, "step": 47400 }, { "epoch": 13.457280726653421, "grad_norm": 6.400170803070068, "learning_rate": 
3.274198126596651e-05, "loss": 2.177842712402344, "step": 47410 }, { "epoch": 13.460119216576782, "grad_norm": 6.447155475616455, "learning_rate": 3.27277888163497e-05, "loss": 2.1344415664672853, "step": 47420 }, { "epoch": 13.462957706500141, "grad_norm": 6.4447550773620605, "learning_rate": 3.27135963667329e-05, "loss": 2.213872718811035, "step": 47430 }, { "epoch": 13.465796196423502, "grad_norm": 6.515454292297363, "learning_rate": 3.2699403917116096e-05, "loss": 2.1717046737670898, "step": 47440 }, { "epoch": 13.468634686346864, "grad_norm": 6.395730018615723, "learning_rate": 3.2685211467499297e-05, "loss": 2.1082342147827147, "step": 47450 }, { "epoch": 13.471473176270225, "grad_norm": 6.415413856506348, "learning_rate": 3.267101901788249e-05, "loss": 2.1869998931884767, "step": 47460 }, { "epoch": 13.474311666193586, "grad_norm": 6.512421607971191, "learning_rate": 3.265682656826568e-05, "loss": 2.1085914611816405, "step": 47470 }, { "epoch": 13.477150156116945, "grad_norm": 6.417304992675781, "learning_rate": 3.2642634118648876e-05, "loss": 2.1252193450927734, "step": 47480 }, { "epoch": 13.479988646040306, "grad_norm": 6.85512638092041, "learning_rate": 3.2628441669032077e-05, "loss": 2.182598876953125, "step": 47490 }, { "epoch": 13.482827135963667, "grad_norm": 6.546575546264648, "learning_rate": 3.261424921941527e-05, "loss": 2.1819353103637695, "step": 47500 }, { "epoch": 13.482827135963667, "eval_accuracy": 0.33668213899663, "eval_loss": 2.5040371417999268, "eval_runtime": 59.2447, "eval_samples_per_second": 265.458, "eval_steps_per_second": 4.152, "step": 47500 }, { "epoch": 13.485665625887028, "grad_norm": 6.496048927307129, "learning_rate": 3.260005676979847e-05, "loss": 2.1808097839355467, "step": 47510 }, { "epoch": 13.48850411581039, "grad_norm": 6.657231330871582, "learning_rate": 3.258586432018166e-05, "loss": 2.168865203857422, "step": 47520 }, { "epoch": 13.491342605733749, "grad_norm": 6.075253486633301, "learning_rate": 
3.2571671870564863e-05, "loss": 2.11364631652832, "step": 47530 }, { "epoch": 13.49418109565711, "grad_norm": 6.5345282554626465, "learning_rate": 3.255747942094806e-05, "loss": 2.199725341796875, "step": 47540 }, { "epoch": 13.497019585580471, "grad_norm": 7.045191287994385, "learning_rate": 3.254328697133126e-05, "loss": 2.156606674194336, "step": 47550 }, { "epoch": 13.499858075503832, "grad_norm": 6.416289806365967, "learning_rate": 3.252909452171445e-05, "loss": 2.159865951538086, "step": 47560 }, { "epoch": 13.502696565427193, "grad_norm": 6.482550621032715, "learning_rate": 3.251490207209765e-05, "loss": 2.1177398681640627, "step": 47570 }, { "epoch": 13.505535055350553, "grad_norm": 6.574519157409668, "learning_rate": 3.2500709622480844e-05, "loss": 2.1769250869750976, "step": 47580 }, { "epoch": 13.508373545273914, "grad_norm": 6.765495777130127, "learning_rate": 3.248651717286404e-05, "loss": 2.148430824279785, "step": 47590 }, { "epoch": 13.511212035197275, "grad_norm": 6.636418342590332, "learning_rate": 3.247232472324723e-05, "loss": 2.2307437896728515, "step": 47600 }, { "epoch": 13.514050525120636, "grad_norm": 6.900279998779297, "learning_rate": 3.245813227363043e-05, "loss": 2.15603084564209, "step": 47610 }, { "epoch": 13.516889015043997, "grad_norm": 6.260702610015869, "learning_rate": 3.2443939824013624e-05, "loss": 2.1212257385253905, "step": 47620 }, { "epoch": 13.519727504967356, "grad_norm": 6.412293910980225, "learning_rate": 3.2429747374396824e-05, "loss": 2.222853660583496, "step": 47630 }, { "epoch": 13.522565994890718, "grad_norm": 6.613524913787842, "learning_rate": 3.241555492478002e-05, "loss": 2.180500793457031, "step": 47640 }, { "epoch": 13.525404484814079, "grad_norm": 6.74688720703125, "learning_rate": 3.240136247516322e-05, "loss": 2.1542049407958985, "step": 47650 }, { "epoch": 13.52824297473744, "grad_norm": 6.202657222747803, "learning_rate": 3.238717002554641e-05, "loss": 2.180510139465332, "step": 47660 }, { "epoch": 
13.5310814646608, "grad_norm": 6.269230842590332, "learning_rate": 3.237297757592961e-05, "loss": 2.1399612426757812, "step": 47670 }, { "epoch": 13.533919954584162, "grad_norm": 6.4674272537231445, "learning_rate": 3.2358785126312804e-05, "loss": 2.195985794067383, "step": 47680 }, { "epoch": 13.536758444507521, "grad_norm": 6.620067596435547, "learning_rate": 3.2344592676696e-05, "loss": 2.1479347229003904, "step": 47690 }, { "epoch": 13.539596934430882, "grad_norm": 6.657735824584961, "learning_rate": 3.233040022707919e-05, "loss": 2.248214340209961, "step": 47700 }, { "epoch": 13.542435424354244, "grad_norm": 6.5126261711120605, "learning_rate": 3.231620777746239e-05, "loss": 2.1431911468505858, "step": 47710 }, { "epoch": 13.545273914277605, "grad_norm": 6.877660274505615, "learning_rate": 3.2302015327845584e-05, "loss": 2.1231279373168945, "step": 47720 }, { "epoch": 13.548112404200966, "grad_norm": 6.548232555389404, "learning_rate": 3.2287822878228784e-05, "loss": 2.218143844604492, "step": 47730 }, { "epoch": 13.550950894124325, "grad_norm": 6.768500328063965, "learning_rate": 3.227363042861198e-05, "loss": 2.1738794326782225, "step": 47740 }, { "epoch": 13.553789384047686, "grad_norm": 6.416693210601807, "learning_rate": 3.225943797899518e-05, "loss": 2.1859725952148437, "step": 47750 }, { "epoch": 13.556627873971047, "grad_norm": 6.770720481872559, "learning_rate": 3.224524552937837e-05, "loss": 2.2333984375, "step": 47760 }, { "epoch": 13.559466363894408, "grad_norm": 6.5971903800964355, "learning_rate": 3.223105307976157e-05, "loss": 2.198861312866211, "step": 47770 }, { "epoch": 13.56230485381777, "grad_norm": 6.58443546295166, "learning_rate": 3.2216860630144764e-05, "loss": 2.246163749694824, "step": 47780 }, { "epoch": 13.56514334374113, "grad_norm": 6.472498416900635, "learning_rate": 3.2202668180527964e-05, "loss": 2.1812160491943358, "step": 47790 }, { "epoch": 13.56798183366449, "grad_norm": 6.360505104064941, "learning_rate": 
3.218847573091116e-05, "loss": 2.1720703125, "step": 47800 }, { "epoch": 13.570820323587851, "grad_norm": 6.866302013397217, "learning_rate": 3.217428328129435e-05, "loss": 2.172337532043457, "step": 47810 }, { "epoch": 13.573658813511212, "grad_norm": 6.8516645431518555, "learning_rate": 3.2160090831677544e-05, "loss": 2.161039352416992, "step": 47820 }, { "epoch": 13.576497303434573, "grad_norm": 6.598275184631348, "learning_rate": 3.2145898382060744e-05, "loss": 2.1278696060180664, "step": 47830 }, { "epoch": 13.579335793357934, "grad_norm": 6.8222198486328125, "learning_rate": 3.213170593244394e-05, "loss": 2.1610679626464844, "step": 47840 }, { "epoch": 13.582174283281294, "grad_norm": 6.610134124755859, "learning_rate": 3.211751348282714e-05, "loss": 2.20311279296875, "step": 47850 }, { "epoch": 13.585012773204655, "grad_norm": 6.7336812019348145, "learning_rate": 3.210332103321033e-05, "loss": 2.252109718322754, "step": 47860 }, { "epoch": 13.587851263128016, "grad_norm": 6.279324054718018, "learning_rate": 3.208912858359353e-05, "loss": 2.199675941467285, "step": 47870 }, { "epoch": 13.590689753051377, "grad_norm": 6.390153884887695, "learning_rate": 3.2074936133976725e-05, "loss": 2.1656490325927735, "step": 47880 }, { "epoch": 13.593528242974738, "grad_norm": 6.477832317352295, "learning_rate": 3.2060743684359925e-05, "loss": 2.2097496032714843, "step": 47890 }, { "epoch": 13.596366732898097, "grad_norm": 6.477111339569092, "learning_rate": 3.204655123474312e-05, "loss": 2.20400447845459, "step": 47900 }, { "epoch": 13.599205222821459, "grad_norm": 6.656484127044678, "learning_rate": 3.203235878512632e-05, "loss": 2.1845203399658204, "step": 47910 }, { "epoch": 13.60204371274482, "grad_norm": 6.319901943206787, "learning_rate": 3.201816633550951e-05, "loss": 2.211333084106445, "step": 47920 }, { "epoch": 13.60488220266818, "grad_norm": 6.447457313537598, "learning_rate": 3.2003973885892705e-05, "loss": 2.208597946166992, "step": 47930 }, { "epoch": 
13.607720692591542, "grad_norm": 6.798655986785889, "learning_rate": 3.19897814362759e-05, "loss": 2.223893165588379, "step": 47940 }, { "epoch": 13.610559182514901, "grad_norm": 6.609568119049072, "learning_rate": 3.19755889866591e-05, "loss": 2.209817886352539, "step": 47950 }, { "epoch": 13.613397672438262, "grad_norm": 6.632833480834961, "learning_rate": 3.196139653704229e-05, "loss": 2.259016227722168, "step": 47960 }, { "epoch": 13.616236162361623, "grad_norm": 6.353844165802002, "learning_rate": 3.194720408742549e-05, "loss": 2.2298412322998047, "step": 47970 }, { "epoch": 13.619074652284985, "grad_norm": 6.477324485778809, "learning_rate": 3.1933011637808685e-05, "loss": 2.1862209320068358, "step": 47980 }, { "epoch": 13.621913142208346, "grad_norm": 6.544798851013184, "learning_rate": 3.1918819188191885e-05, "loss": 2.184852409362793, "step": 47990 }, { "epoch": 13.624751632131705, "grad_norm": 6.267632007598877, "learning_rate": 3.190462673857508e-05, "loss": 2.196056938171387, "step": 48000 }, { "epoch": 13.624751632131705, "eval_accuracy": 0.3380810071850957, "eval_loss": 2.499730348587036, "eval_runtime": 53.1478, "eval_samples_per_second": 295.911, "eval_steps_per_second": 4.629, "step": 48000 }, { "epoch": 13.627590122055066, "grad_norm": 6.522346019744873, "learning_rate": 3.189043428895828e-05, "loss": 2.2251209259033202, "step": 48010 }, { "epoch": 13.630428611978427, "grad_norm": 7.319828510284424, "learning_rate": 3.187624183934147e-05, "loss": 2.138623809814453, "step": 48020 }, { "epoch": 13.633267101901788, "grad_norm": 6.687557220458984, "learning_rate": 3.186204938972467e-05, "loss": 2.16962890625, "step": 48030 }, { "epoch": 13.63610559182515, "grad_norm": 6.540821552276611, "learning_rate": 3.1847856940107865e-05, "loss": 2.2484516143798827, "step": 48040 }, { "epoch": 13.63894408174851, "grad_norm": 6.36393928527832, "learning_rate": 3.183366449049106e-05, "loss": 2.1634519577026365, "step": 48050 }, { "epoch": 13.64178257167187, 
"grad_norm": 6.694311141967773, "learning_rate": 3.181947204087425e-05, "loss": 2.1540458679199217, "step": 48060 }, { "epoch": 13.644621061595231, "grad_norm": 6.4355878829956055, "learning_rate": 3.180527959125745e-05, "loss": 2.129506301879883, "step": 48070 }, { "epoch": 13.647459551518592, "grad_norm": 6.054873466491699, "learning_rate": 3.1791087141640645e-05, "loss": 2.1751224517822267, "step": 48080 }, { "epoch": 13.650298041441953, "grad_norm": 6.505258560180664, "learning_rate": 3.1776894692023845e-05, "loss": 2.1812507629394533, "step": 48090 }, { "epoch": 13.653136531365314, "grad_norm": 6.463338375091553, "learning_rate": 3.176270224240704e-05, "loss": 2.1646072387695314, "step": 48100 }, { "epoch": 13.655975021288674, "grad_norm": 6.333786964416504, "learning_rate": 3.174850979279024e-05, "loss": 2.1858308792114256, "step": 48110 }, { "epoch": 13.658813511212035, "grad_norm": 6.654691696166992, "learning_rate": 3.173431734317343e-05, "loss": 2.241705322265625, "step": 48120 }, { "epoch": 13.661652001135396, "grad_norm": 6.73656702041626, "learning_rate": 3.172012489355663e-05, "loss": 2.2289262771606446, "step": 48130 }, { "epoch": 13.664490491058757, "grad_norm": 6.675867557525635, "learning_rate": 3.1705932443939825e-05, "loss": 2.1396434783935545, "step": 48140 }, { "epoch": 13.667328980982118, "grad_norm": 6.371293067932129, "learning_rate": 3.169173999432302e-05, "loss": 2.153974914550781, "step": 48150 }, { "epoch": 13.670167470905477, "grad_norm": 6.394639492034912, "learning_rate": 3.167754754470621e-05, "loss": 2.1006460189819336, "step": 48160 }, { "epoch": 13.673005960828839, "grad_norm": 6.7193989753723145, "learning_rate": 3.166335509508941e-05, "loss": 2.230514144897461, "step": 48170 }, { "epoch": 13.6758444507522, "grad_norm": 6.701028823852539, "learning_rate": 3.1649162645472605e-05, "loss": 2.2004329681396486, "step": 48180 }, { "epoch": 13.67868294067556, "grad_norm": 6.479400634765625, "learning_rate": 3.1634970195855806e-05, 
"loss": 2.197312355041504, "step": 48190 }, { "epoch": 13.681521430598922, "grad_norm": 6.4442524909973145, "learning_rate": 3.1620777746239e-05, "loss": 2.195107269287109, "step": 48200 }, { "epoch": 13.684359920522283, "grad_norm": 6.646457195281982, "learning_rate": 3.16065852966222e-05, "loss": 2.1869718551635744, "step": 48210 }, { "epoch": 13.687198410445642, "grad_norm": 6.363117694854736, "learning_rate": 3.159239284700539e-05, "loss": 2.1385826110839843, "step": 48220 }, { "epoch": 13.690036900369003, "grad_norm": 6.3032612800598145, "learning_rate": 3.157820039738859e-05, "loss": 2.106973075866699, "step": 48230 }, { "epoch": 13.692875390292365, "grad_norm": 6.7986555099487305, "learning_rate": 3.1564007947771786e-05, "loss": 2.1922977447509764, "step": 48240 }, { "epoch": 13.695713880215726, "grad_norm": 6.343910217285156, "learning_rate": 3.1549815498154986e-05, "loss": 2.1443111419677736, "step": 48250 }, { "epoch": 13.698552370139087, "grad_norm": 6.629879474639893, "learning_rate": 3.153562304853818e-05, "loss": 2.2348724365234376, "step": 48260 }, { "epoch": 13.701390860062446, "grad_norm": 6.711562633514404, "learning_rate": 3.152143059892137e-05, "loss": 2.1435531616210937, "step": 48270 }, { "epoch": 13.704229349985807, "grad_norm": 6.5732340812683105, "learning_rate": 3.1507238149304566e-05, "loss": 2.1468873977661134, "step": 48280 }, { "epoch": 13.707067839909168, "grad_norm": 6.702856540679932, "learning_rate": 3.1493045699687766e-05, "loss": 2.210529899597168, "step": 48290 }, { "epoch": 13.70990632983253, "grad_norm": 6.546940326690674, "learning_rate": 3.1478853250070966e-05, "loss": 2.2124301910400392, "step": 48300 }, { "epoch": 13.71274481975589, "grad_norm": 6.323910713195801, "learning_rate": 3.146466080045416e-05, "loss": 2.146609306335449, "step": 48310 }, { "epoch": 13.71558330967925, "grad_norm": 6.282607555389404, "learning_rate": 3.145046835083736e-05, "loss": 2.1676544189453124, "step": 48320 }, { "epoch": 13.718421799602611, 
"grad_norm": 6.954321384429932, "learning_rate": 3.143627590122055e-05, "loss": 2.10168399810791, "step": 48330 }, { "epoch": 13.721260289525972, "grad_norm": 6.708166599273682, "learning_rate": 3.142208345160375e-05, "loss": 2.152330207824707, "step": 48340 }, { "epoch": 13.724098779449333, "grad_norm": 6.527822971343994, "learning_rate": 3.1407891001986946e-05, "loss": 2.1587520599365235, "step": 48350 }, { "epoch": 13.726937269372694, "grad_norm": 6.141233444213867, "learning_rate": 3.1393698552370146e-05, "loss": 2.1081281661987306, "step": 48360 }, { "epoch": 13.729775759296054, "grad_norm": 6.618691444396973, "learning_rate": 3.137950610275334e-05, "loss": 2.1283220291137694, "step": 48370 }, { "epoch": 13.732614249219415, "grad_norm": 6.41591739654541, "learning_rate": 3.136531365313653e-05, "loss": 2.1471889495849608, "step": 48380 }, { "epoch": 13.735452739142776, "grad_norm": 6.145511150360107, "learning_rate": 3.1351121203519726e-05, "loss": 2.159589385986328, "step": 48390 }, { "epoch": 13.738291229066137, "grad_norm": 6.371829986572266, "learning_rate": 3.1336928753902926e-05, "loss": 2.115733528137207, "step": 48400 }, { "epoch": 13.741129718989498, "grad_norm": 6.304243564605713, "learning_rate": 3.132273630428612e-05, "loss": 2.114486503601074, "step": 48410 }, { "epoch": 13.743968208912857, "grad_norm": 6.4298810958862305, "learning_rate": 3.130854385466932e-05, "loss": 2.232365036010742, "step": 48420 }, { "epoch": 13.746806698836219, "grad_norm": 6.866431713104248, "learning_rate": 3.129435140505251e-05, "loss": 2.2291048049926756, "step": 48430 }, { "epoch": 13.74964518875958, "grad_norm": 6.384503364562988, "learning_rate": 3.128015895543571e-05, "loss": 2.244907760620117, "step": 48440 }, { "epoch": 13.75248367868294, "grad_norm": 6.391608238220215, "learning_rate": 3.1265966505818906e-05, "loss": 2.1889461517333983, "step": 48450 }, { "epoch": 13.755322168606302, "grad_norm": 6.839885711669922, "learning_rate": 3.1251774056202107e-05, "loss": 
2.197902297973633, "step": 48460 }, { "epoch": 13.758160658529663, "grad_norm": 7.028793811798096, "learning_rate": 3.12375816065853e-05, "loss": 2.2480363845825195, "step": 48470 }, { "epoch": 13.760999148453022, "grad_norm": 6.743922710418701, "learning_rate": 3.12233891569685e-05, "loss": 2.188379096984863, "step": 48480 }, { "epoch": 13.763837638376383, "grad_norm": 6.779515743255615, "learning_rate": 3.120919670735169e-05, "loss": 2.1383270263671874, "step": 48490 }, { "epoch": 13.766676128299745, "grad_norm": 6.353847026824951, "learning_rate": 3.119500425773489e-05, "loss": 2.2616329193115234, "step": 48500 }, { "epoch": 13.766676128299745, "eval_accuracy": 0.33769949767915053, "eval_loss": 2.4959161281585693, "eval_runtime": 53.7132, "eval_samples_per_second": 292.796, "eval_steps_per_second": 4.58, "step": 48500 }, { "epoch": 13.769514618223106, "grad_norm": 6.498905181884766, "learning_rate": 3.118081180811808e-05, "loss": 2.149883270263672, "step": 48510 }, { "epoch": 13.772353108146467, "grad_norm": 6.223232269287109, "learning_rate": 3.116661935850128e-05, "loss": 2.2114643096923827, "step": 48520 }, { "epoch": 13.775191598069826, "grad_norm": 6.743539333343506, "learning_rate": 3.115242690888447e-05, "loss": 2.135660171508789, "step": 48530 }, { "epoch": 13.778030087993187, "grad_norm": 6.457038402557373, "learning_rate": 3.1138234459267673e-05, "loss": 2.122100067138672, "step": 48540 }, { "epoch": 13.780868577916548, "grad_norm": 6.220773220062256, "learning_rate": 3.112404200965087e-05, "loss": 2.1749847412109373, "step": 48550 }, { "epoch": 13.78370706783991, "grad_norm": 6.763852119445801, "learning_rate": 3.110984956003407e-05, "loss": 2.170695495605469, "step": 48560 }, { "epoch": 13.78654555776327, "grad_norm": 6.308357238769531, "learning_rate": 3.109565711041726e-05, "loss": 2.095016670227051, "step": 48570 }, { "epoch": 13.789384047686632, "grad_norm": 6.959364414215088, "learning_rate": 3.108146466080046e-05, "loss": 2.1384117126464846, 
"step": 48580 }, { "epoch": 13.792222537609991, "grad_norm": 6.20652437210083, "learning_rate": 3.1067272211183654e-05, "loss": 2.140072250366211, "step": 48590 }, { "epoch": 13.795061027533352, "grad_norm": 6.395134449005127, "learning_rate": 3.105307976156685e-05, "loss": 2.155643844604492, "step": 48600 }, { "epoch": 13.797899517456713, "grad_norm": 6.93730354309082, "learning_rate": 3.103888731195004e-05, "loss": 2.1476573944091797, "step": 48610 }, { "epoch": 13.800738007380074, "grad_norm": 6.569911956787109, "learning_rate": 3.102469486233324e-05, "loss": 2.086914825439453, "step": 48620 }, { "epoch": 13.803576497303435, "grad_norm": 6.788222789764404, "learning_rate": 3.1010502412716434e-05, "loss": 2.23653507232666, "step": 48630 }, { "epoch": 13.806414987226795, "grad_norm": 6.784295082092285, "learning_rate": 3.0996309963099634e-05, "loss": 2.185405731201172, "step": 48640 }, { "epoch": 13.809253477150156, "grad_norm": 6.923871040344238, "learning_rate": 3.098211751348283e-05, "loss": 2.2130651473999023, "step": 48650 }, { "epoch": 13.812091967073517, "grad_norm": 6.35095739364624, "learning_rate": 3.096792506386603e-05, "loss": 2.1751741409301757, "step": 48660 }, { "epoch": 13.814930456996878, "grad_norm": 6.8019819259643555, "learning_rate": 3.095373261424922e-05, "loss": 2.1381881713867186, "step": 48670 }, { "epoch": 13.81776894692024, "grad_norm": 6.384081840515137, "learning_rate": 3.093954016463242e-05, "loss": 2.1659404754638674, "step": 48680 }, { "epoch": 13.820607436843598, "grad_norm": 6.381661415100098, "learning_rate": 3.0925347715015614e-05, "loss": 2.2818716049194334, "step": 48690 }, { "epoch": 13.82344592676696, "grad_norm": 6.894775390625, "learning_rate": 3.0911155265398814e-05, "loss": 2.130124092102051, "step": 48700 }, { "epoch": 13.82628441669032, "grad_norm": 6.751191139221191, "learning_rate": 3.089696281578201e-05, "loss": 2.234927940368652, "step": 48710 }, { "epoch": 13.829122906613682, "grad_norm": 6.577940940856934, 
"learning_rate": 3.08827703661652e-05, "loss": 2.219229888916016, "step": 48720 }, { "epoch": 13.831961396537043, "grad_norm": 6.959753513336182, "learning_rate": 3.0868577916548394e-05, "loss": 2.190095901489258, "step": 48730 }, { "epoch": 13.834799886460402, "grad_norm": 6.3694047927856445, "learning_rate": 3.0854385466931594e-05, "loss": 2.1908231735229493, "step": 48740 }, { "epoch": 13.837638376383763, "grad_norm": 6.772424697875977, "learning_rate": 3.084019301731479e-05, "loss": 2.1873472213745115, "step": 48750 }, { "epoch": 13.840476866307124, "grad_norm": 6.464141368865967, "learning_rate": 3.082600056769799e-05, "loss": 2.092848014831543, "step": 48760 }, { "epoch": 13.843315356230486, "grad_norm": 6.282130241394043, "learning_rate": 3.081180811808118e-05, "loss": 2.172450065612793, "step": 48770 }, { "epoch": 13.846153846153847, "grad_norm": 6.284754276275635, "learning_rate": 3.079761566846438e-05, "loss": 2.1042198181152343, "step": 48780 }, { "epoch": 13.848992336077206, "grad_norm": 6.7702131271362305, "learning_rate": 3.0783423218847574e-05, "loss": 2.1605892181396484, "step": 48790 }, { "epoch": 13.851830826000567, "grad_norm": 6.578610420227051, "learning_rate": 3.0769230769230774e-05, "loss": 2.1998928070068358, "step": 48800 }, { "epoch": 13.854669315923928, "grad_norm": 6.479125499725342, "learning_rate": 3.075503831961397e-05, "loss": 2.234031867980957, "step": 48810 }, { "epoch": 13.85750780584729, "grad_norm": 6.489680767059326, "learning_rate": 3.074084586999717e-05, "loss": 2.177166748046875, "step": 48820 }, { "epoch": 13.86034629577065, "grad_norm": 6.71108341217041, "learning_rate": 3.072665342038036e-05, "loss": 2.121153450012207, "step": 48830 }, { "epoch": 13.863184785694012, "grad_norm": 6.571735382080078, "learning_rate": 3.0712460970763554e-05, "loss": 2.188981628417969, "step": 48840 }, { "epoch": 13.866023275617371, "grad_norm": 6.921374797821045, "learning_rate": 3.069826852114675e-05, "loss": 2.1879951477050783, "step": 
48850 }, { "epoch": 13.868861765540732, "grad_norm": 6.564777851104736, "learning_rate": 3.068407607152995e-05, "loss": 2.1773582458496095, "step": 48860 }, { "epoch": 13.871700255464093, "grad_norm": 6.59065580368042, "learning_rate": 3.066988362191314e-05, "loss": 2.1711246490478517, "step": 48870 }, { "epoch": 13.874538745387454, "grad_norm": 6.8380351066589355, "learning_rate": 3.065569117229634e-05, "loss": 2.212139129638672, "step": 48880 }, { "epoch": 13.877377235310815, "grad_norm": 6.633449077606201, "learning_rate": 3.0641498722679535e-05, "loss": 2.110609245300293, "step": 48890 }, { "epoch": 13.880215725234175, "grad_norm": 6.552531719207764, "learning_rate": 3.0627306273062735e-05, "loss": 2.1542373657226563, "step": 48900 }, { "epoch": 13.883054215157536, "grad_norm": 6.410208225250244, "learning_rate": 3.061311382344593e-05, "loss": 2.2431222915649416, "step": 48910 }, { "epoch": 13.885892705080897, "grad_norm": 6.851353645324707, "learning_rate": 3.059892137382913e-05, "loss": 2.215172004699707, "step": 48920 }, { "epoch": 13.888731195004258, "grad_norm": 6.465672016143799, "learning_rate": 3.058472892421232e-05, "loss": 2.151013946533203, "step": 48930 }, { "epoch": 13.89156968492762, "grad_norm": 6.597815036773682, "learning_rate": 3.0570536474595515e-05, "loss": 2.2188940048217773, "step": 48940 }, { "epoch": 13.894408174850978, "grad_norm": 6.73621940612793, "learning_rate": 3.055634402497871e-05, "loss": 2.221741485595703, "step": 48950 }, { "epoch": 13.89724666477434, "grad_norm": 6.590400695800781, "learning_rate": 3.054215157536191e-05, "loss": 2.1517265319824217, "step": 48960 }, { "epoch": 13.9000851546977, "grad_norm": 6.593061447143555, "learning_rate": 3.05279591257451e-05, "loss": 2.247547149658203, "step": 48970 }, { "epoch": 13.902923644621062, "grad_norm": 6.995433330535889, "learning_rate": 3.05137666761283e-05, "loss": 2.2156656265258787, "step": 48980 }, { "epoch": 13.905762134544423, "grad_norm": 6.528536319732666, 
"learning_rate": 3.0499574226511495e-05, "loss": 2.171085166931152, "step": 48990 }, { "epoch": 13.908600624467784, "grad_norm": 6.68820858001709, "learning_rate": 3.0485381776894695e-05, "loss": 2.2174169540405275, "step": 49000 }, { "epoch": 13.908600624467784, "eval_accuracy": 0.3418961022445476, "eval_loss": 2.490493059158325, "eval_runtime": 55.7495, "eval_samples_per_second": 282.101, "eval_steps_per_second": 4.413, "step": 49000 }, { "epoch": 13.911439114391143, "grad_norm": 6.3713603019714355, "learning_rate": 3.047118932727789e-05, "loss": 2.1680479049682617, "step": 49010 }, { "epoch": 13.914277604314504, "grad_norm": 6.290866851806641, "learning_rate": 3.045699687766109e-05, "loss": 2.2090282440185547, "step": 49020 }, { "epoch": 13.917116094237866, "grad_norm": 6.6053924560546875, "learning_rate": 3.0442804428044282e-05, "loss": 2.2283340454101563, "step": 49030 }, { "epoch": 13.919954584161227, "grad_norm": 6.790106296539307, "learning_rate": 3.042861197842748e-05, "loss": 2.1831809997558596, "step": 49040 }, { "epoch": 13.922793074084588, "grad_norm": 6.525965213775635, "learning_rate": 3.0414419528810672e-05, "loss": 2.1530548095703126, "step": 49050 }, { "epoch": 13.925631564007947, "grad_norm": 6.174263954162598, "learning_rate": 3.0400227079193872e-05, "loss": 2.1896411895751955, "step": 49060 }, { "epoch": 13.928470053931308, "grad_norm": 6.096621513366699, "learning_rate": 3.0386034629577065e-05, "loss": 2.1560760498046876, "step": 49070 }, { "epoch": 13.93130854385467, "grad_norm": 6.8643035888671875, "learning_rate": 3.0371842179960265e-05, "loss": 2.1472944259643554, "step": 49080 }, { "epoch": 13.93414703377803, "grad_norm": 6.703065395355225, "learning_rate": 3.035764973034346e-05, "loss": 2.1852598190307617, "step": 49090 }, { "epoch": 13.936985523701392, "grad_norm": 6.4650559425354, "learning_rate": 3.0343457280726655e-05, "loss": 2.1716943740844727, "step": 49100 }, { "epoch": 13.93982401362475, "grad_norm": 6.6707048416137695, 
"learning_rate": 3.032926483110985e-05, "loss": 2.174869346618652, "step": 49110 }, { "epoch": 13.942662503548112, "grad_norm": 6.548351764678955, "learning_rate": 3.031507238149305e-05, "loss": 2.2237966537475584, "step": 49120 }, { "epoch": 13.945500993471473, "grad_norm": 6.515334129333496, "learning_rate": 3.0300879931876242e-05, "loss": 2.220727729797363, "step": 49130 }, { "epoch": 13.948339483394834, "grad_norm": 6.264738082885742, "learning_rate": 3.028668748225944e-05, "loss": 2.1283943176269533, "step": 49140 }, { "epoch": 13.951177973318195, "grad_norm": 6.652980327606201, "learning_rate": 3.0272495032642632e-05, "loss": 2.2517887115478517, "step": 49150 }, { "epoch": 13.954016463241555, "grad_norm": 6.702874660491943, "learning_rate": 3.0258302583025832e-05, "loss": 2.218549346923828, "step": 49160 }, { "epoch": 13.956854953164916, "grad_norm": 7.136145114898682, "learning_rate": 3.0244110133409025e-05, "loss": 2.1778488159179688, "step": 49170 }, { "epoch": 13.959693443088277, "grad_norm": 7.093662738800049, "learning_rate": 3.0229917683792226e-05, "loss": 2.161533737182617, "step": 49180 }, { "epoch": 13.962531933011638, "grad_norm": 5.902863502502441, "learning_rate": 3.021572523417542e-05, "loss": 2.168564224243164, "step": 49190 }, { "epoch": 13.965370422934999, "grad_norm": 6.215870380401611, "learning_rate": 3.0201532784558616e-05, "loss": 2.1812639236450195, "step": 49200 }, { "epoch": 13.968208912858358, "grad_norm": 6.257544040679932, "learning_rate": 3.018734033494181e-05, "loss": 2.172671318054199, "step": 49210 }, { "epoch": 13.97104740278172, "grad_norm": 6.308863639831543, "learning_rate": 3.017314788532501e-05, "loss": 2.1246305465698243, "step": 49220 }, { "epoch": 13.97388589270508, "grad_norm": 6.564654350280762, "learning_rate": 3.0158955435708202e-05, "loss": 2.1568634033203127, "step": 49230 }, { "epoch": 13.976724382628442, "grad_norm": 6.3888139724731445, "learning_rate": 3.0144762986091402e-05, "loss": 2.1856151580810548, 
"step": 49240 }, { "epoch": 13.979562872551803, "grad_norm": 6.7088165283203125, "learning_rate": 3.0130570536474596e-05, "loss": 2.1568864822387694, "step": 49250 }, { "epoch": 13.982401362475164, "grad_norm": 6.845228672027588, "learning_rate": 3.0116378086857792e-05, "loss": 2.1652692794799804, "step": 49260 }, { "epoch": 13.985239852398523, "grad_norm": 6.133147239685059, "learning_rate": 3.0102185637240986e-05, "loss": 2.173904609680176, "step": 49270 }, { "epoch": 13.988078342321884, "grad_norm": 6.86015510559082, "learning_rate": 3.0087993187624186e-05, "loss": 2.1795129776000977, "step": 49280 }, { "epoch": 13.990916832245246, "grad_norm": 6.452433109283447, "learning_rate": 3.007380073800738e-05, "loss": 2.157810020446777, "step": 49290 }, { "epoch": 13.993755322168607, "grad_norm": 6.43502950668335, "learning_rate": 3.005960828839058e-05, "loss": 2.1038402557373046, "step": 49300 }, { "epoch": 13.996593812091968, "grad_norm": 6.317543983459473, "learning_rate": 3.0045415838773773e-05, "loss": 2.149483489990234, "step": 49310 }, { "epoch": 13.999432302015327, "grad_norm": 6.272774696350098, "learning_rate": 3.003122338915697e-05, "loss": 2.1652353286743162, "step": 49320 }, { "epoch": 14.002270791938688, "grad_norm": 6.766143798828125, "learning_rate": 3.0017030939540163e-05, "loss": 2.0943557739257814, "step": 49330 }, { "epoch": 14.00510928186205, "grad_norm": 6.151926040649414, "learning_rate": 3.0002838489923363e-05, "loss": 2.1784784317016603, "step": 49340 }, { "epoch": 14.00794777178541, "grad_norm": 6.655762672424316, "learning_rate": 2.9988646040306556e-05, "loss": 2.1586431503295898, "step": 49350 }, { "epoch": 14.010786261708772, "grad_norm": 6.527685165405273, "learning_rate": 2.9974453590689756e-05, "loss": 2.139314651489258, "step": 49360 }, { "epoch": 14.01362475163213, "grad_norm": 6.230309009552002, "learning_rate": 2.996026114107295e-05, "loss": 2.1311580657958986, "step": 49370 }, { "epoch": 14.016463241555492, "grad_norm": 
6.6175689697265625, "learning_rate": 2.9946068691456146e-05, "loss": 2.1620426177978516, "step": 49380 }, { "epoch": 14.019301731478853, "grad_norm": 6.666400909423828, "learning_rate": 2.993187624183934e-05, "loss": 2.1292144775390627, "step": 49390 }, { "epoch": 14.022140221402214, "grad_norm": 6.396266460418701, "learning_rate": 2.991768379222254e-05, "loss": 2.1636960983276365, "step": 49400 }, { "epoch": 14.024978711325575, "grad_norm": 6.52863073348999, "learning_rate": 2.9903491342605733e-05, "loss": 2.190277099609375, "step": 49410 }, { "epoch": 14.027817201248936, "grad_norm": 6.489435195922852, "learning_rate": 2.9889298892988933e-05, "loss": 2.1565166473388673, "step": 49420 }, { "epoch": 14.030655691172296, "grad_norm": 7.095069885253906, "learning_rate": 2.9875106443372126e-05, "loss": 2.144746780395508, "step": 49430 }, { "epoch": 14.033494181095657, "grad_norm": 6.174325942993164, "learning_rate": 2.9860913993755323e-05, "loss": 2.1741432189941405, "step": 49440 }, { "epoch": 14.036332671019018, "grad_norm": 6.06071138381958, "learning_rate": 2.9846721544138516e-05, "loss": 2.072831726074219, "step": 49450 }, { "epoch": 14.039171160942379, "grad_norm": 6.903478145599365, "learning_rate": 2.9832529094521717e-05, "loss": 2.0985836029052733, "step": 49460 }, { "epoch": 14.04200965086574, "grad_norm": 6.736930847167969, "learning_rate": 2.981833664490491e-05, "loss": 2.239886474609375, "step": 49470 }, { "epoch": 14.0448481407891, "grad_norm": 6.37468147277832, "learning_rate": 2.980414419528811e-05, "loss": 2.1640388488769533, "step": 49480 }, { "epoch": 14.04768663071246, "grad_norm": 6.671175003051758, "learning_rate": 2.9789951745671303e-05, "loss": 2.198875617980957, "step": 49490 }, { "epoch": 14.050525120635822, "grad_norm": 6.288443565368652, "learning_rate": 2.97757592960545e-05, "loss": 2.0816654205322265, "step": 49500 }, { "epoch": 14.050525120635822, "eval_accuracy": 0.33757232784383545, "eval_loss": 2.4915268421173096, "eval_runtime": 
48.5579, "eval_samples_per_second": 323.882, "eval_steps_per_second": 5.066, "step": 49500 }, { "epoch": 14.053363610559183, "grad_norm": 6.517474174499512, "learning_rate": 2.9761566846437693e-05, "loss": 2.132839584350586, "step": 49510 }, { "epoch": 14.056202100482544, "grad_norm": 6.673788547515869, "learning_rate": 2.9747374396820893e-05, "loss": 2.1127796173095703, "step": 49520 }, { "epoch": 14.059040590405903, "grad_norm": 6.5107293128967285, "learning_rate": 2.9733181947204087e-05, "loss": 2.1630184173583986, "step": 49530 }, { "epoch": 14.061879080329264, "grad_norm": 6.718975067138672, "learning_rate": 2.9718989497587287e-05, "loss": 2.154999542236328, "step": 49540 }, { "epoch": 14.064717570252625, "grad_norm": 6.299687385559082, "learning_rate": 2.970479704797048e-05, "loss": 2.1313898086547853, "step": 49550 }, { "epoch": 14.067556060175987, "grad_norm": 6.758580207824707, "learning_rate": 2.9690604598353677e-05, "loss": 2.2343442916870115, "step": 49560 }, { "epoch": 14.070394550099348, "grad_norm": 7.026632785797119, "learning_rate": 2.967641214873687e-05, "loss": 2.2020015716552734, "step": 49570 }, { "epoch": 14.073233040022707, "grad_norm": 6.351383209228516, "learning_rate": 2.966221969912007e-05, "loss": 2.160951614379883, "step": 49580 }, { "epoch": 14.076071529946068, "grad_norm": 6.308315277099609, "learning_rate": 2.9648027249503264e-05, "loss": 2.151520538330078, "step": 49590 }, { "epoch": 14.07891001986943, "grad_norm": 6.680037975311279, "learning_rate": 2.963383479988646e-05, "loss": 2.188091850280762, "step": 49600 }, { "epoch": 14.08174850979279, "grad_norm": 6.745242595672607, "learning_rate": 2.961964235026966e-05, "loss": 2.1490150451660157, "step": 49610 }, { "epoch": 14.084586999716151, "grad_norm": 6.306706428527832, "learning_rate": 2.9605449900652854e-05, "loss": 2.145468330383301, "step": 49620 }, { "epoch": 14.087425489639513, "grad_norm": 6.316686630249023, "learning_rate": 2.9591257451036054e-05, "loss": 
2.1132484436035157, "step": 49630 }, { "epoch": 14.090263979562872, "grad_norm": 6.6239542961120605, "learning_rate": 2.9577065001419247e-05, "loss": 2.114768218994141, "step": 49640 }, { "epoch": 14.093102469486233, "grad_norm": 6.6891188621521, "learning_rate": 2.9562872551802444e-05, "loss": 2.145659828186035, "step": 49650 }, { "epoch": 14.095940959409594, "grad_norm": 6.2364888191223145, "learning_rate": 2.9548680102185637e-05, "loss": 2.144001770019531, "step": 49660 }, { "epoch": 14.098779449332955, "grad_norm": 6.3776326179504395, "learning_rate": 2.9534487652568837e-05, "loss": 2.2101131439208985, "step": 49670 }, { "epoch": 14.101617939256316, "grad_norm": 6.486617565155029, "learning_rate": 2.952029520295203e-05, "loss": 2.1327003479003905, "step": 49680 }, { "epoch": 14.104456429179676, "grad_norm": 6.595492362976074, "learning_rate": 2.950610275333523e-05, "loss": 2.1300111770629884, "step": 49690 }, { "epoch": 14.107294919103037, "grad_norm": 6.493927001953125, "learning_rate": 2.9491910303718424e-05, "loss": 2.1651546478271486, "step": 49700 }, { "epoch": 14.110133409026398, "grad_norm": 6.755426406860352, "learning_rate": 2.947771785410162e-05, "loss": 2.14638614654541, "step": 49710 }, { "epoch": 14.112971898949759, "grad_norm": 7.278680801391602, "learning_rate": 2.9463525404484814e-05, "loss": 2.2108455657958985, "step": 49720 }, { "epoch": 14.11581038887312, "grad_norm": 6.462455749511719, "learning_rate": 2.9449332954868014e-05, "loss": 2.1376771926879883, "step": 49730 }, { "epoch": 14.11864887879648, "grad_norm": 6.513915538787842, "learning_rate": 2.9435140505251207e-05, "loss": 2.1542570114135744, "step": 49740 }, { "epoch": 14.12148736871984, "grad_norm": 6.381139278411865, "learning_rate": 2.9420948055634408e-05, "loss": 2.1551950454711912, "step": 49750 }, { "epoch": 14.124325858643202, "grad_norm": 6.578488349914551, "learning_rate": 2.94067556060176e-05, "loss": 2.096298408508301, "step": 49760 }, { "epoch": 14.127164348566563, 
"grad_norm": 6.692105293273926, "learning_rate": 2.9392563156400798e-05, "loss": 2.106465721130371, "step": 49770 }, { "epoch": 14.130002838489924, "grad_norm": 6.663797855377197, "learning_rate": 2.937837070678399e-05, "loss": 2.1875696182250977, "step": 49780 }, { "epoch": 14.132841328413285, "grad_norm": 6.710667133331299, "learning_rate": 2.936417825716719e-05, "loss": 2.119576644897461, "step": 49790 }, { "epoch": 14.135679818336644, "grad_norm": 6.704268932342529, "learning_rate": 2.9349985807550384e-05, "loss": 2.1191781997680663, "step": 49800 }, { "epoch": 14.138518308260005, "grad_norm": 6.412662506103516, "learning_rate": 2.9335793357933584e-05, "loss": 2.166848373413086, "step": 49810 }, { "epoch": 14.141356798183367, "grad_norm": 7.178735256195068, "learning_rate": 2.9321600908316778e-05, "loss": 2.1181907653808594, "step": 49820 }, { "epoch": 14.144195288106728, "grad_norm": 6.737724781036377, "learning_rate": 2.9307408458699974e-05, "loss": 2.192335510253906, "step": 49830 }, { "epoch": 14.147033778030089, "grad_norm": 6.3069868087768555, "learning_rate": 2.9293216009083168e-05, "loss": 2.226486015319824, "step": 49840 }, { "epoch": 14.149872267953448, "grad_norm": 6.547250747680664, "learning_rate": 2.9279023559466368e-05, "loss": 2.185256576538086, "step": 49850 }, { "epoch": 14.15271075787681, "grad_norm": 6.2369279861450195, "learning_rate": 2.926483110984956e-05, "loss": 2.133827781677246, "step": 49860 }, { "epoch": 14.15554924780017, "grad_norm": 6.395570278167725, "learning_rate": 2.925063866023276e-05, "loss": 2.1326698303222655, "step": 49870 }, { "epoch": 14.158387737723531, "grad_norm": 6.4066853523254395, "learning_rate": 2.9236446210615955e-05, "loss": 2.146313285827637, "step": 49880 }, { "epoch": 14.161226227646893, "grad_norm": 6.3357343673706055, "learning_rate": 2.922225376099915e-05, "loss": 2.1064918518066404, "step": 49890 }, { "epoch": 14.164064717570252, "grad_norm": 7.151516437530518, "learning_rate": 2.9208061311382345e-05, 
"loss": 2.1741466522216797, "step": 49900 }, { "epoch": 14.166903207493613, "grad_norm": 6.683149337768555, "learning_rate": 2.9193868861765545e-05, "loss": 2.160301399230957, "step": 49910 }, { "epoch": 14.169741697416974, "grad_norm": 6.67438268661499, "learning_rate": 2.9179676412148738e-05, "loss": 2.135519027709961, "step": 49920 }, { "epoch": 14.172580187340335, "grad_norm": 6.684703350067139, "learning_rate": 2.9165483962531938e-05, "loss": 2.237898254394531, "step": 49930 }, { "epoch": 14.175418677263696, "grad_norm": 6.0845208168029785, "learning_rate": 2.915129151291513e-05, "loss": 2.0719676971435548, "step": 49940 }, { "epoch": 14.178257167187056, "grad_norm": 6.629350185394287, "learning_rate": 2.9137099063298328e-05, "loss": 2.1318471908569334, "step": 49950 }, { "epoch": 14.181095657110417, "grad_norm": 6.411121368408203, "learning_rate": 2.912290661368152e-05, "loss": 2.148434066772461, "step": 49960 }, { "epoch": 14.183934147033778, "grad_norm": 6.774742603302002, "learning_rate": 2.910871416406472e-05, "loss": 2.1539871215820314, "step": 49970 }, { "epoch": 14.186772636957139, "grad_norm": 6.944456577301025, "learning_rate": 2.9094521714447915e-05, "loss": 2.143825912475586, "step": 49980 }, { "epoch": 14.1896111268805, "grad_norm": 6.674496173858643, "learning_rate": 2.908032926483111e-05, "loss": 2.142850875854492, "step": 49990 }, { "epoch": 14.192449616803861, "grad_norm": 6.670775890350342, "learning_rate": 2.9066136815214305e-05, "loss": 2.2119060516357423, "step": 50000 }, { "epoch": 14.192449616803861, "eval_accuracy": 0.3392891206205888, "eval_loss": 2.49064040184021, "eval_runtime": 48.7515, "eval_samples_per_second": 322.595, "eval_steps_per_second": 5.046, "step": 50000 }, { "epoch": 14.19528810672722, "grad_norm": 6.41035270690918, "learning_rate": 2.9051944365597505e-05, "loss": 2.122951698303223, "step": 50010 }, { "epoch": 14.198126596650582, "grad_norm": 6.731324195861816, "learning_rate": 2.90377519159807e-05, "loss": 
2.1483222961425783, "step": 50020 }, { "epoch": 14.200965086573943, "grad_norm": 6.736081600189209, "learning_rate": 2.90235594663639e-05, "loss": 2.147163963317871, "step": 50030 }, { "epoch": 14.203803576497304, "grad_norm": 6.572803020477295, "learning_rate": 2.9009367016747092e-05, "loss": 2.1540035247802733, "step": 50040 }, { "epoch": 14.206642066420665, "grad_norm": 6.392016410827637, "learning_rate": 2.899517456713029e-05, "loss": 2.126939392089844, "step": 50050 }, { "epoch": 14.209480556344024, "grad_norm": 7.088597774505615, "learning_rate": 2.8980982117513482e-05, "loss": 2.1409204483032225, "step": 50060 }, { "epoch": 14.212319046267385, "grad_norm": 6.554594993591309, "learning_rate": 2.8966789667896682e-05, "loss": 2.1544622421264648, "step": 50070 }, { "epoch": 14.215157536190747, "grad_norm": 6.207372665405273, "learning_rate": 2.8952597218279875e-05, "loss": 2.060622787475586, "step": 50080 }, { "epoch": 14.217996026114108, "grad_norm": 6.347860336303711, "learning_rate": 2.8938404768663075e-05, "loss": 2.1314659118652344, "step": 50090 }, { "epoch": 14.220834516037469, "grad_norm": 6.463359832763672, "learning_rate": 2.892421231904627e-05, "loss": 2.060759353637695, "step": 50100 }, { "epoch": 14.223673005960828, "grad_norm": 6.2331414222717285, "learning_rate": 2.8910019869429465e-05, "loss": 2.1493289947509764, "step": 50110 }, { "epoch": 14.22651149588419, "grad_norm": 6.331341743469238, "learning_rate": 2.889582741981266e-05, "loss": 2.210454559326172, "step": 50120 }, { "epoch": 14.22934998580755, "grad_norm": 6.708117961883545, "learning_rate": 2.888163497019586e-05, "loss": 2.160935974121094, "step": 50130 }, { "epoch": 14.232188475730911, "grad_norm": 6.717487335205078, "learning_rate": 2.8867442520579052e-05, "loss": 2.061229705810547, "step": 50140 }, { "epoch": 14.235026965654273, "grad_norm": 6.178483963012695, "learning_rate": 2.8853250070962252e-05, "loss": 2.0693445205688477, "step": 50150 }, { "epoch": 14.237865455577632, 
"grad_norm": 6.71461820602417, "learning_rate": 2.8839057621345445e-05, "loss": 2.1693130493164063, "step": 50160 }, { "epoch": 14.240703945500993, "grad_norm": 6.717408657073975, "learning_rate": 2.8824865171728642e-05, "loss": 2.171620559692383, "step": 50170 }, { "epoch": 14.243542435424354, "grad_norm": 6.561066627502441, "learning_rate": 2.8810672722111836e-05, "loss": 2.227181243896484, "step": 50180 }, { "epoch": 14.246380925347715, "grad_norm": 6.486503601074219, "learning_rate": 2.8796480272495036e-05, "loss": 2.1727819442749023, "step": 50190 }, { "epoch": 14.249219415271076, "grad_norm": 6.5097126960754395, "learning_rate": 2.878228782287823e-05, "loss": 2.1861852645874023, "step": 50200 }, { "epoch": 14.252057905194437, "grad_norm": 6.200997829437256, "learning_rate": 2.876809537326143e-05, "loss": 2.2017375946044924, "step": 50210 }, { "epoch": 14.254896395117797, "grad_norm": 6.330826759338379, "learning_rate": 2.8753902923644622e-05, "loss": 2.2102012634277344, "step": 50220 }, { "epoch": 14.257734885041158, "grad_norm": 6.337263584136963, "learning_rate": 2.873971047402782e-05, "loss": 2.1872255325317385, "step": 50230 }, { "epoch": 14.260573374964519, "grad_norm": 6.685473918914795, "learning_rate": 2.8725518024411012e-05, "loss": 2.1850055694580077, "step": 50240 }, { "epoch": 14.26341186488788, "grad_norm": 6.31723165512085, "learning_rate": 2.8711325574794212e-05, "loss": 2.182633399963379, "step": 50250 }, { "epoch": 14.266250354811241, "grad_norm": 6.700789928436279, "learning_rate": 2.8697133125177406e-05, "loss": 2.1781482696533203, "step": 50260 }, { "epoch": 14.2690888447346, "grad_norm": 6.3923211097717285, "learning_rate": 2.8682940675560606e-05, "loss": 2.1452861785888673, "step": 50270 }, { "epoch": 14.271927334657962, "grad_norm": 6.326618194580078, "learning_rate": 2.86687482259438e-05, "loss": 2.1221900939941407, "step": 50280 }, { "epoch": 14.274765824581323, "grad_norm": 6.37226676940918, "learning_rate": 2.8654555776326996e-05, 
"loss": 2.1907758712768555, "step": 50290 }, { "epoch": 14.277604314504684, "grad_norm": 6.441317558288574, "learning_rate": 2.864036332671019e-05, "loss": 2.1427276611328123, "step": 50300 }, { "epoch": 14.280442804428045, "grad_norm": 6.733927249908447, "learning_rate": 2.862617087709339e-05, "loss": 2.18890323638916, "step": 50310 }, { "epoch": 14.283281294351404, "grad_norm": 6.432321548461914, "learning_rate": 2.8611978427476583e-05, "loss": 2.1872621536254884, "step": 50320 }, { "epoch": 14.286119784274765, "grad_norm": 6.454476833343506, "learning_rate": 2.8597785977859783e-05, "loss": 2.087635612487793, "step": 50330 }, { "epoch": 14.288958274198126, "grad_norm": 6.065932750701904, "learning_rate": 2.8583593528242976e-05, "loss": 2.1208948135375976, "step": 50340 }, { "epoch": 14.291796764121488, "grad_norm": 6.57275915145874, "learning_rate": 2.8569401078626173e-05, "loss": 2.1650360107421873, "step": 50350 }, { "epoch": 14.294635254044849, "grad_norm": 6.676151275634766, "learning_rate": 2.8555208629009366e-05, "loss": 2.115275764465332, "step": 50360 }, { "epoch": 14.297473743968208, "grad_norm": 6.820719242095947, "learning_rate": 2.8541016179392566e-05, "loss": 2.166851043701172, "step": 50370 }, { "epoch": 14.30031223389157, "grad_norm": 6.360339641571045, "learning_rate": 2.852682372977576e-05, "loss": 2.208696746826172, "step": 50380 }, { "epoch": 14.30315072381493, "grad_norm": 7.084065914154053, "learning_rate": 2.8512631280158956e-05, "loss": 2.1964412689208985, "step": 50390 }, { "epoch": 14.305989213738291, "grad_norm": 6.514272212982178, "learning_rate": 2.849843883054215e-05, "loss": 2.1838104248046877, "step": 50400 }, { "epoch": 14.308827703661652, "grad_norm": 6.522924423217773, "learning_rate": 2.848424638092535e-05, "loss": 2.234448051452637, "step": 50410 }, { "epoch": 14.311666193585014, "grad_norm": 6.4696221351623535, "learning_rate": 2.8470053931308543e-05, "loss": 2.134520149230957, "step": 50420 }, { "epoch": 14.314504683508373, 
"grad_norm": 6.774727821350098, "learning_rate": 2.8455861481691743e-05, "loss": 2.132607078552246, "step": 50430 }, { "epoch": 14.317343173431734, "grad_norm": 6.319742202758789, "learning_rate": 2.8441669032074936e-05, "loss": 2.248378562927246, "step": 50440 }, { "epoch": 14.320181663355095, "grad_norm": 6.53349494934082, "learning_rate": 2.8427476582458133e-05, "loss": 2.064142417907715, "step": 50450 }, { "epoch": 14.323020153278456, "grad_norm": 6.339288234710693, "learning_rate": 2.8413284132841326e-05, "loss": 2.133876419067383, "step": 50460 }, { "epoch": 14.325858643201817, "grad_norm": 6.531294822692871, "learning_rate": 2.8399091683224527e-05, "loss": 2.2200380325317384, "step": 50470 }, { "epoch": 14.328697133125177, "grad_norm": 6.332139015197754, "learning_rate": 2.838489923360772e-05, "loss": 2.2072004318237304, "step": 50480 }, { "epoch": 14.331535623048538, "grad_norm": 6.4100871086120605, "learning_rate": 2.837070678399092e-05, "loss": 2.1827909469604494, "step": 50490 }, { "epoch": 14.334374112971899, "grad_norm": 6.715420246124268, "learning_rate": 2.8356514334374113e-05, "loss": 2.1811275482177734, "step": 50500 }, { "epoch": 14.334374112971899, "eval_accuracy": 0.34024289438545174, "eval_loss": 2.4887008666992188, "eval_runtime": 49.8003, "eval_samples_per_second": 315.801, "eval_steps_per_second": 4.94, "step": 50500 }, { "epoch": 14.33721260289526, "grad_norm": 6.501711845397949, "learning_rate": 2.8343741129718993e-05, "loss": 2.150197219848633, "step": 50510 }, { "epoch": 14.340051092818621, "grad_norm": 6.4807963371276855, "learning_rate": 2.8329548680102186e-05, "loss": 2.1662776947021483, "step": 50520 }, { "epoch": 14.34288958274198, "grad_norm": 6.431113243103027, "learning_rate": 2.8315356230485386e-05, "loss": 2.2058027267456053, "step": 50530 }, { "epoch": 14.345728072665342, "grad_norm": 6.715238094329834, "learning_rate": 2.830116378086858e-05, "loss": 2.1184104919433593, "step": 50540 }, { "epoch": 14.348566562588703, 
"grad_norm": 6.179220676422119, "learning_rate": 2.8286971331251776e-05, "loss": 2.1397682189941407, "step": 50550 }, { "epoch": 14.351405052512064, "grad_norm": 6.35304069519043, "learning_rate": 2.827277888163497e-05, "loss": 2.1721775054931642, "step": 50560 }, { "epoch": 14.354243542435425, "grad_norm": 6.457651138305664, "learning_rate": 2.825858643201817e-05, "loss": 2.2530168533325194, "step": 50570 }, { "epoch": 14.357082032358786, "grad_norm": 6.413639545440674, "learning_rate": 2.8244393982401363e-05, "loss": 2.125766944885254, "step": 50580 }, { "epoch": 14.359920522282145, "grad_norm": 6.724222660064697, "learning_rate": 2.8230201532784563e-05, "loss": 2.171182060241699, "step": 50590 }, { "epoch": 14.362759012205506, "grad_norm": 6.587679386138916, "learning_rate": 2.8216009083167756e-05, "loss": 2.1542205810546875, "step": 50600 }, { "epoch": 14.365597502128868, "grad_norm": 6.760066509246826, "learning_rate": 2.8201816633550953e-05, "loss": 2.1251667022705076, "step": 50610 }, { "epoch": 14.368435992052229, "grad_norm": 6.426425457000732, "learning_rate": 2.8187624183934146e-05, "loss": 2.1284273147583006, "step": 50620 }, { "epoch": 14.37127448197559, "grad_norm": 6.034432888031006, "learning_rate": 2.8173431734317346e-05, "loss": 2.067336845397949, "step": 50630 }, { "epoch": 14.374112971898949, "grad_norm": 6.273949146270752, "learning_rate": 2.815923928470054e-05, "loss": 2.112919235229492, "step": 50640 }, { "epoch": 14.37695146182231, "grad_norm": 6.386902809143066, "learning_rate": 2.814504683508374e-05, "loss": 2.0832546234130858, "step": 50650 }, { "epoch": 14.379789951745671, "grad_norm": 6.567882537841797, "learning_rate": 2.8130854385466933e-05, "loss": 2.1648509979248045, "step": 50660 }, { "epoch": 14.382628441669032, "grad_norm": 6.554747104644775, "learning_rate": 2.811666193585013e-05, "loss": 2.1767881393432615, "step": 50670 }, { "epoch": 14.385466931592394, "grad_norm": 6.4774885177612305, "learning_rate": 2.8102469486233323e-05, 
"loss": 2.1645397186279296, "step": 50680 }, { "epoch": 14.388305421515753, "grad_norm": 6.191102504730225, "learning_rate": 2.8088277036616523e-05, "loss": 2.1855451583862306, "step": 50690 }, { "epoch": 14.391143911439114, "grad_norm": 6.145212650299072, "learning_rate": 2.8074084586999717e-05, "loss": 2.147823143005371, "step": 50700 }, { "epoch": 14.393982401362475, "grad_norm": 6.693686008453369, "learning_rate": 2.8059892137382913e-05, "loss": 2.112519073486328, "step": 50710 }, { "epoch": 14.396820891285836, "grad_norm": 6.725261688232422, "learning_rate": 2.8045699687766107e-05, "loss": 2.110723686218262, "step": 50720 }, { "epoch": 14.399659381209197, "grad_norm": 6.650083541870117, "learning_rate": 2.8031507238149307e-05, "loss": 2.1617095947265623, "step": 50730 }, { "epoch": 14.402497871132557, "grad_norm": 6.524197578430176, "learning_rate": 2.80173147885325e-05, "loss": 2.115464782714844, "step": 50740 }, { "epoch": 14.405336361055918, "grad_norm": 6.667537212371826, "learning_rate": 2.80031223389157e-05, "loss": 2.1598747253417967, "step": 50750 }, { "epoch": 14.408174850979279, "grad_norm": 6.4798150062561035, "learning_rate": 2.7988929889298893e-05, "loss": 2.172978973388672, "step": 50760 }, { "epoch": 14.41101334090264, "grad_norm": 6.468686580657959, "learning_rate": 2.797473743968209e-05, "loss": 2.2350112915039064, "step": 50770 }, { "epoch": 14.413851830826001, "grad_norm": 6.768537521362305, "learning_rate": 2.7960544990065284e-05, "loss": 2.1872697830200196, "step": 50780 }, { "epoch": 14.416690320749362, "grad_norm": 6.0862250328063965, "learning_rate": 2.7946352540448484e-05, "loss": 2.111405944824219, "step": 50790 }, { "epoch": 14.419528810672722, "grad_norm": 6.831700801849365, "learning_rate": 2.7932160090831677e-05, "loss": 2.1622241973876952, "step": 50800 }, { "epoch": 14.422367300596083, "grad_norm": 6.9676361083984375, "learning_rate": 2.7917967641214877e-05, "loss": 2.1530250549316405, "step": 50810 }, { "epoch": 
14.425205790519444, "grad_norm": 6.207257270812988, "learning_rate": 2.790377519159807e-05, "loss": 2.0738582611083984, "step": 50820 }, { "epoch": 14.428044280442805, "grad_norm": 6.315426826477051, "learning_rate": 2.7889582741981267e-05, "loss": 2.128038024902344, "step": 50830 }, { "epoch": 14.430882770366166, "grad_norm": 6.624908924102783, "learning_rate": 2.787539029236446e-05, "loss": 2.1621526718139648, "step": 50840 }, { "epoch": 14.433721260289525, "grad_norm": 6.681087493896484, "learning_rate": 2.786119784274766e-05, "loss": 2.2016881942749023, "step": 50850 }, { "epoch": 14.436559750212886, "grad_norm": 6.85474967956543, "learning_rate": 2.7847005393130854e-05, "loss": 2.2269313812255858, "step": 50860 }, { "epoch": 14.439398240136248, "grad_norm": 6.440207481384277, "learning_rate": 2.7832812943514054e-05, "loss": 2.168167304992676, "step": 50870 }, { "epoch": 14.442236730059609, "grad_norm": 6.61535120010376, "learning_rate": 2.7818620493897247e-05, "loss": 2.133441925048828, "step": 50880 }, { "epoch": 14.44507521998297, "grad_norm": 6.5209574699401855, "learning_rate": 2.7804428044280444e-05, "loss": 2.114146041870117, "step": 50890 }, { "epoch": 14.447913709906329, "grad_norm": 6.2939324378967285, "learning_rate": 2.7790235594663637e-05, "loss": 2.2141769409179686, "step": 50900 }, { "epoch": 14.45075219982969, "grad_norm": 6.505369663238525, "learning_rate": 2.7776043145046837e-05, "loss": 2.1430732727050783, "step": 50910 }, { "epoch": 14.453590689753051, "grad_norm": 6.409021854400635, "learning_rate": 2.776185069543003e-05, "loss": 2.1229833602905273, "step": 50920 }, { "epoch": 14.456429179676412, "grad_norm": 6.656196594238281, "learning_rate": 2.774765824581323e-05, "loss": 2.1866554260253905, "step": 50930 }, { "epoch": 14.459267669599773, "grad_norm": 6.580402374267578, "learning_rate": 2.7733465796196424e-05, "loss": 2.1863895416259767, "step": 50940 }, { "epoch": 14.462106159523135, "grad_norm": 6.523726463317871, "learning_rate": 
2.771927334657962e-05, "loss": 2.160990524291992, "step": 50950 }, { "epoch": 14.464944649446494, "grad_norm": 6.504967212677002, "learning_rate": 2.7705080896962814e-05, "loss": 2.1191038131713866, "step": 50960 }, { "epoch": 14.467783139369855, "grad_norm": 6.488245487213135, "learning_rate": 2.7690888447346014e-05, "loss": 2.1496793746948244, "step": 50970 }, { "epoch": 14.470621629293216, "grad_norm": 6.592358112335205, "learning_rate": 2.7676695997729208e-05, "loss": 2.1440948486328124, "step": 50980 }, { "epoch": 14.473460119216577, "grad_norm": 6.45151424407959, "learning_rate": 2.7662503548112408e-05, "loss": 2.1891925811767576, "step": 50990 }, { "epoch": 14.476298609139938, "grad_norm": 6.798641204833984, "learning_rate": 2.76483110984956e-05, "loss": 2.2543750762939454, "step": 51000 }, { "epoch": 14.476298609139938, "eval_accuracy": 0.3417689324092325, "eval_loss": 2.4843475818634033, "eval_runtime": 49.4236, "eval_samples_per_second": 318.209, "eval_steps_per_second": 4.977, "step": 51000 }, { "epoch": 14.479137099063298, "grad_norm": 6.317224025726318, "learning_rate": 2.7634118648878798e-05, "loss": 2.1225408554077148, "step": 51010 }, { "epoch": 14.481975588986659, "grad_norm": 6.769524097442627, "learning_rate": 2.761992619926199e-05, "loss": 2.1819831848144533, "step": 51020 }, { "epoch": 14.48481407891002, "grad_norm": 6.32417631149292, "learning_rate": 2.760573374964519e-05, "loss": 2.0914226531982423, "step": 51030 }, { "epoch": 14.487652568833381, "grad_norm": 6.9899091720581055, "learning_rate": 2.7591541300028384e-05, "loss": 2.167842674255371, "step": 51040 }, { "epoch": 14.490491058756742, "grad_norm": 6.630452632904053, "learning_rate": 2.7577348850411585e-05, "loss": 2.1952140808105467, "step": 51050 }, { "epoch": 14.493329548680101, "grad_norm": 6.6860857009887695, "learning_rate": 2.7563156400794778e-05, "loss": 2.1569400787353517, "step": 51060 }, { "epoch": 14.496168038603463, "grad_norm": 6.342560291290283, "learning_rate": 
2.7548963951177975e-05, "loss": 2.229557418823242, "step": 51070 }, { "epoch": 14.499006528526824, "grad_norm": 6.573225975036621, "learning_rate": 2.7534771501561168e-05, "loss": 2.14282341003418, "step": 51080 }, { "epoch": 14.501845018450185, "grad_norm": 6.1086907386779785, "learning_rate": 2.7520579051944368e-05, "loss": 2.1775402069091796, "step": 51090 }, { "epoch": 14.504683508373546, "grad_norm": 6.586647033691406, "learning_rate": 2.750638660232756e-05, "loss": 2.111284065246582, "step": 51100 }, { "epoch": 14.507521998296905, "grad_norm": 6.701176643371582, "learning_rate": 2.7492194152710758e-05, "loss": 2.2061794281005858, "step": 51110 }, { "epoch": 14.510360488220266, "grad_norm": 6.845107555389404, "learning_rate": 2.747800170309395e-05, "loss": 2.225076675415039, "step": 51120 }, { "epoch": 14.513198978143627, "grad_norm": 6.545674800872803, "learning_rate": 2.746380925347715e-05, "loss": 2.1299249649047853, "step": 51130 }, { "epoch": 14.516037468066989, "grad_norm": 6.659729480743408, "learning_rate": 2.7449616803860345e-05, "loss": 2.1598445892333986, "step": 51140 }, { "epoch": 14.51887595799035, "grad_norm": 6.202633380889893, "learning_rate": 2.7435424354243545e-05, "loss": 2.225986671447754, "step": 51150 }, { "epoch": 14.521714447913709, "grad_norm": 6.477132797241211, "learning_rate": 2.7421231904626738e-05, "loss": 2.137507438659668, "step": 51160 }, { "epoch": 14.52455293783707, "grad_norm": 6.4419941902160645, "learning_rate": 2.7407039455009935e-05, "loss": 2.1545249938964846, "step": 51170 }, { "epoch": 14.527391427760431, "grad_norm": 6.84859037399292, "learning_rate": 2.7392847005393128e-05, "loss": 2.1792123794555662, "step": 51180 }, { "epoch": 14.530229917683792, "grad_norm": 6.294368743896484, "learning_rate": 2.7378654555776328e-05, "loss": 2.2072736740112306, "step": 51190 }, { "epoch": 14.533068407607153, "grad_norm": 6.304792881011963, "learning_rate": 2.736446210615952e-05, "loss": 2.1201751708984373, "step": 51200 }, { 
"epoch": 14.535906897530515, "grad_norm": 6.108436107635498, "learning_rate": 2.735026965654272e-05, "loss": 2.14279842376709, "step": 51210 }, { "epoch": 14.538745387453874, "grad_norm": 6.1982927322387695, "learning_rate": 2.7336077206925915e-05, "loss": 2.146861457824707, "step": 51220 }, { "epoch": 14.541583877377235, "grad_norm": 6.452439785003662, "learning_rate": 2.7321884757309112e-05, "loss": 2.150737762451172, "step": 51230 }, { "epoch": 14.544422367300596, "grad_norm": 6.402815818786621, "learning_rate": 2.7307692307692305e-05, "loss": 2.1149324417114257, "step": 51240 }, { "epoch": 14.547260857223957, "grad_norm": 6.42545747756958, "learning_rate": 2.7293499858075505e-05, "loss": 2.153603744506836, "step": 51250 }, { "epoch": 14.550099347147318, "grad_norm": 6.025592803955078, "learning_rate": 2.72793074084587e-05, "loss": 2.1620046615600588, "step": 51260 }, { "epoch": 14.552937837070678, "grad_norm": 6.56567907333374, "learning_rate": 2.72651149588419e-05, "loss": 2.2038536071777344, "step": 51270 }, { "epoch": 14.555776326994039, "grad_norm": 6.263679504394531, "learning_rate": 2.7250922509225092e-05, "loss": 2.147072410583496, "step": 51280 }, { "epoch": 14.5586148169174, "grad_norm": 6.52733039855957, "learning_rate": 2.723673005960829e-05, "loss": 2.162394714355469, "step": 51290 }, { "epoch": 14.561453306840761, "grad_norm": 6.691681385040283, "learning_rate": 2.7222537609991482e-05, "loss": 2.1909440994262694, "step": 51300 }, { "epoch": 14.564291796764122, "grad_norm": 6.767354965209961, "learning_rate": 2.7208345160374682e-05, "loss": 2.201687240600586, "step": 51310 }, { "epoch": 14.567130286687481, "grad_norm": 6.634027004241943, "learning_rate": 2.7194152710757882e-05, "loss": 2.1666831970214844, "step": 51320 }, { "epoch": 14.569968776610843, "grad_norm": 6.310926914215088, "learning_rate": 2.7179960261141075e-05, "loss": 2.1761714935302736, "step": 51330 }, { "epoch": 14.572807266534204, "grad_norm": 6.3013410568237305, "learning_rate": 
2.7165767811524272e-05, "loss": 2.148580551147461, "step": 51340 }, { "epoch": 14.575645756457565, "grad_norm": 6.402513027191162, "learning_rate": 2.7151575361907465e-05, "loss": 2.1168903350830077, "step": 51350 }, { "epoch": 14.578484246380926, "grad_norm": 6.792891979217529, "learning_rate": 2.7137382912290666e-05, "loss": 2.1102806091308595, "step": 51360 }, { "epoch": 14.581322736304287, "grad_norm": 6.349704265594482, "learning_rate": 2.712319046267386e-05, "loss": 2.143623924255371, "step": 51370 }, { "epoch": 14.584161226227646, "grad_norm": 6.353003978729248, "learning_rate": 2.710899801305706e-05, "loss": 2.115347671508789, "step": 51380 }, { "epoch": 14.586999716151007, "grad_norm": 6.2230401039123535, "learning_rate": 2.7094805563440252e-05, "loss": 2.190634536743164, "step": 51390 }, { "epoch": 14.589838206074369, "grad_norm": 6.749535083770752, "learning_rate": 2.708061311382345e-05, "loss": 2.0894100189208986, "step": 51400 }, { "epoch": 14.59267669599773, "grad_norm": 6.611583232879639, "learning_rate": 2.7066420664206642e-05, "loss": 2.2179210662841795, "step": 51410 }, { "epoch": 14.59551518592109, "grad_norm": 6.3915886878967285, "learning_rate": 2.7052228214589842e-05, "loss": 2.1224597930908202, "step": 51420 }, { "epoch": 14.59835367584445, "grad_norm": 6.397669792175293, "learning_rate": 2.7038035764973036e-05, "loss": 2.1589418411254884, "step": 51430 }, { "epoch": 14.601192165767811, "grad_norm": 6.828464508056641, "learning_rate": 2.7023843315356236e-05, "loss": 2.183242416381836, "step": 51440 }, { "epoch": 14.604030655691172, "grad_norm": 6.7262372970581055, "learning_rate": 2.700965086573943e-05, "loss": 2.103154182434082, "step": 51450 }, { "epoch": 14.606869145614533, "grad_norm": 6.937077522277832, "learning_rate": 2.6995458416122626e-05, "loss": 2.115538215637207, "step": 51460 }, { "epoch": 14.609707635537895, "grad_norm": 6.404132843017578, "learning_rate": 2.698126596650582e-05, "loss": 2.0946632385253907, "step": 51470 }, { 
"epoch": 14.612546125461254, "grad_norm": 6.637913703918457, "learning_rate": 2.696707351688902e-05, "loss": 2.133811187744141, "step": 51480 }, { "epoch": 14.615384615384615, "grad_norm": 6.609971523284912, "learning_rate": 2.6952881067272213e-05, "loss": 2.2034812927246095, "step": 51490 }, { "epoch": 14.618223105307976, "grad_norm": 6.362635612487793, "learning_rate": 2.693868861765541e-05, "loss": 2.2079301834106446, "step": 51500 }, { "epoch": 14.618223105307976, "eval_accuracy": 0.3436764799389585, "eval_loss": 2.4811148643493652, "eval_runtime": 50.005, "eval_samples_per_second": 314.508, "eval_steps_per_second": 4.92, "step": 51500 }, { "epoch": 14.621061595231337, "grad_norm": 6.33699893951416, "learning_rate": 2.6924496168038603e-05, "loss": 2.1411365509033202, "step": 51510 }, { "epoch": 14.623900085154698, "grad_norm": 6.523435592651367, "learning_rate": 2.6910303718421803e-05, "loss": 2.1925590515136717, "step": 51520 }, { "epoch": 14.626738575078058, "grad_norm": 6.592581272125244, "learning_rate": 2.6896111268804996e-05, "loss": 2.2266162872314452, "step": 51530 }, { "epoch": 14.629577065001419, "grad_norm": 6.565569877624512, "learning_rate": 2.6881918819188196e-05, "loss": 2.1645414352416994, "step": 51540 }, { "epoch": 14.63241555492478, "grad_norm": 6.281209468841553, "learning_rate": 2.686772636957139e-05, "loss": 2.154530334472656, "step": 51550 }, { "epoch": 14.635254044848141, "grad_norm": 6.895339012145996, "learning_rate": 2.6853533919954586e-05, "loss": 2.2028446197509766, "step": 51560 }, { "epoch": 14.638092534771502, "grad_norm": 6.214515209197998, "learning_rate": 2.683934147033778e-05, "loss": 2.095122146606445, "step": 51570 }, { "epoch": 14.640931024694861, "grad_norm": 6.610279560089111, "learning_rate": 2.682514902072098e-05, "loss": 2.151142120361328, "step": 51580 }, { "epoch": 14.643769514618223, "grad_norm": 6.2783308029174805, "learning_rate": 2.6810956571104173e-05, "loss": 2.142813301086426, "step": 51590 }, { "epoch": 
14.646608004541584, "grad_norm": 6.817516803741455, "learning_rate": 2.6796764121487373e-05, "loss": 2.1327493667602537, "step": 51600 }, { "epoch": 14.649446494464945, "grad_norm": 6.579459190368652, "learning_rate": 2.6782571671870566e-05, "loss": 2.14814453125, "step": 51610 }, { "epoch": 14.652284984388306, "grad_norm": 6.451214790344238, "learning_rate": 2.6768379222253763e-05, "loss": 2.1766384124755858, "step": 51620 }, { "epoch": 14.655123474311667, "grad_norm": 6.391461372375488, "learning_rate": 2.6754186772636956e-05, "loss": 2.155415344238281, "step": 51630 }, { "epoch": 14.657961964235026, "grad_norm": 6.4180908203125, "learning_rate": 2.6739994323020156e-05, "loss": 2.1299627304077147, "step": 51640 }, { "epoch": 14.660800454158387, "grad_norm": 6.387005805969238, "learning_rate": 2.672580187340335e-05, "loss": 2.1250144958496096, "step": 51650 }, { "epoch": 14.663638944081749, "grad_norm": 6.749711036682129, "learning_rate": 2.671160942378655e-05, "loss": 2.1593242645263673, "step": 51660 }, { "epoch": 14.66647743400511, "grad_norm": 6.2961578369140625, "learning_rate": 2.6697416974169743e-05, "loss": 2.1379907608032225, "step": 51670 }, { "epoch": 14.66931592392847, "grad_norm": 6.426239490509033, "learning_rate": 2.668322452455294e-05, "loss": 2.1921054840087892, "step": 51680 }, { "epoch": 14.67215441385183, "grad_norm": 6.456857204437256, "learning_rate": 2.6669032074936133e-05, "loss": 2.0777645111083984, "step": 51690 }, { "epoch": 14.674992903775191, "grad_norm": 6.287974834442139, "learning_rate": 2.6654839625319333e-05, "loss": 2.0970552444458006, "step": 51700 }, { "epoch": 14.677831393698552, "grad_norm": 7.161823272705078, "learning_rate": 2.6640647175702527e-05, "loss": 2.176998329162598, "step": 51710 }, { "epoch": 14.680669883621913, "grad_norm": 6.8348002433776855, "learning_rate": 2.6626454726085727e-05, "loss": 2.14770450592041, "step": 51720 }, { "epoch": 14.683508373545274, "grad_norm": 6.3840813636779785, "learning_rate": 
2.661226227646892e-05, "loss": 2.072208595275879, "step": 51730 }, { "epoch": 14.686346863468636, "grad_norm": 6.531306266784668, "learning_rate": 2.6598069826852117e-05, "loss": 2.154979705810547, "step": 51740 }, { "epoch": 14.689185353391995, "grad_norm": 6.408574104309082, "learning_rate": 2.658387737723531e-05, "loss": 2.133731460571289, "step": 51750 }, { "epoch": 14.692023843315356, "grad_norm": 7.095334529876709, "learning_rate": 2.656968492761851e-05, "loss": 2.218230438232422, "step": 51760 }, { "epoch": 14.694862333238717, "grad_norm": 7.03800630569458, "learning_rate": 2.6555492478001704e-05, "loss": 2.147591209411621, "step": 51770 }, { "epoch": 14.697700823162078, "grad_norm": 6.760427474975586, "learning_rate": 2.6541300028384904e-05, "loss": 2.142368125915527, "step": 51780 }, { "epoch": 14.70053931308544, "grad_norm": 6.533491134643555, "learning_rate": 2.6527107578768097e-05, "loss": 2.1544658660888674, "step": 51790 }, { "epoch": 14.703377803008799, "grad_norm": 6.609166622161865, "learning_rate": 2.6512915129151294e-05, "loss": 2.1563388824462892, "step": 51800 }, { "epoch": 14.70621629293216, "grad_norm": 6.647651195526123, "learning_rate": 2.6498722679534487e-05, "loss": 2.1552967071533202, "step": 51810 }, { "epoch": 14.709054782855521, "grad_norm": 6.615218639373779, "learning_rate": 2.6484530229917687e-05, "loss": 2.159320259094238, "step": 51820 }, { "epoch": 14.711893272778882, "grad_norm": 6.457290172576904, "learning_rate": 2.647033778030088e-05, "loss": 2.133828353881836, "step": 51830 }, { "epoch": 14.714731762702243, "grad_norm": 6.400230884552002, "learning_rate": 2.645614533068408e-05, "loss": 2.154510498046875, "step": 51840 }, { "epoch": 14.717570252625602, "grad_norm": 6.679096221923828, "learning_rate": 2.6441952881067274e-05, "loss": 2.1114116668701173, "step": 51850 }, { "epoch": 14.720408742548964, "grad_norm": 6.4525299072265625, "learning_rate": 2.642776043145047e-05, "loss": 2.191804122924805, "step": 51860 }, { "epoch": 
14.723247232472325, "grad_norm": 6.636955261230469, "learning_rate": 2.6413567981833664e-05, "loss": 2.2263309478759767, "step": 51870 }, { "epoch": 14.726085722395686, "grad_norm": 6.4568986892700195, "learning_rate": 2.6399375532216864e-05, "loss": 2.2066934585571287, "step": 51880 }, { "epoch": 14.728924212319047, "grad_norm": 6.120373249053955, "learning_rate": 2.6385183082600057e-05, "loss": 2.120578956604004, "step": 51890 }, { "epoch": 14.731762702242406, "grad_norm": 6.534599781036377, "learning_rate": 2.6370990632983257e-05, "loss": 2.147431564331055, "step": 51900 }, { "epoch": 14.734601192165767, "grad_norm": 6.4307169914245605, "learning_rate": 2.635679818336645e-05, "loss": 2.139154624938965, "step": 51910 }, { "epoch": 14.737439682089128, "grad_norm": 6.191473484039307, "learning_rate": 2.6342605733749647e-05, "loss": 2.146489143371582, "step": 51920 }, { "epoch": 14.74027817201249, "grad_norm": 6.521615505218506, "learning_rate": 2.632841328413284e-05, "loss": 2.0849864959716795, "step": 51930 }, { "epoch": 14.74311666193585, "grad_norm": 6.182727336883545, "learning_rate": 2.631422083451604e-05, "loss": 2.140036201477051, "step": 51940 }, { "epoch": 14.74595515185921, "grad_norm": 6.55917501449585, "learning_rate": 2.6300028384899234e-05, "loss": 2.163673973083496, "step": 51950 }, { "epoch": 14.748793641782571, "grad_norm": 6.156399250030518, "learning_rate": 2.628583593528243e-05, "loss": 2.129225730895996, "step": 51960 }, { "epoch": 14.751632131705932, "grad_norm": 6.315659999847412, "learning_rate": 2.6271643485665624e-05, "loss": 2.2016931533813477, "step": 51970 }, { "epoch": 14.754470621629293, "grad_norm": 6.834527969360352, "learning_rate": 2.6257451036048824e-05, "loss": 2.195423126220703, "step": 51980 }, { "epoch": 14.757309111552654, "grad_norm": 6.196653842926025, "learning_rate": 2.6243258586432018e-05, "loss": 2.1250331878662108, "step": 51990 }, { "epoch": 14.760147601476016, "grad_norm": 6.325605869293213, "learning_rate": 
2.6229066136815218e-05, "loss": 2.106566047668457, "step": 52000 }, { "epoch": 14.760147601476016, "eval_accuracy": 0.34437591403319134, "eval_loss": 2.4782767295837402, "eval_runtime": 49.7411, "eval_samples_per_second": 316.177, "eval_steps_per_second": 4.946, "step": 52000 }, { "epoch": 14.762986091399375, "grad_norm": 6.402698516845703, "learning_rate": 2.621487368719841e-05, "loss": 2.2030361175537108, "step": 52010 }, { "epoch": 14.765824581322736, "grad_norm": 6.134268283843994, "learning_rate": 2.6200681237581608e-05, "loss": 2.174569320678711, "step": 52020 }, { "epoch": 14.768663071246097, "grad_norm": 6.545248985290527, "learning_rate": 2.61864887879648e-05, "loss": 2.1770694732666014, "step": 52030 }, { "epoch": 14.771501561169458, "grad_norm": 6.574280261993408, "learning_rate": 2.6172296338348e-05, "loss": 2.105604553222656, "step": 52040 }, { "epoch": 14.77434005109282, "grad_norm": 6.874595642089844, "learning_rate": 2.6158103888731194e-05, "loss": 2.143328094482422, "step": 52050 }, { "epoch": 14.777178541016179, "grad_norm": 6.745079517364502, "learning_rate": 2.6143911439114395e-05, "loss": 2.1397533416748047, "step": 52060 }, { "epoch": 14.78001703093954, "grad_norm": 7.4488043785095215, "learning_rate": 2.6129718989497588e-05, "loss": 2.1749122619628904, "step": 52070 }, { "epoch": 14.782855520862901, "grad_norm": 6.172996520996094, "learning_rate": 2.6115526539880785e-05, "loss": 2.223131561279297, "step": 52080 }, { "epoch": 14.785694010786262, "grad_norm": 6.3191609382629395, "learning_rate": 2.6101334090263978e-05, "loss": 2.1226634979248047, "step": 52090 }, { "epoch": 14.788532500709623, "grad_norm": 6.197514533996582, "learning_rate": 2.6087141640647178e-05, "loss": 2.1875637054443358, "step": 52100 }, { "epoch": 14.791370990632982, "grad_norm": 6.17463493347168, "learning_rate": 2.607294919103037e-05, "loss": 2.202804946899414, "step": 52110 }, { "epoch": 14.794209480556344, "grad_norm": 6.58581018447876, "learning_rate": 
2.605875674141357e-05, "loss": 2.1632997512817385, "step": 52120 }, { "epoch": 14.797047970479705, "grad_norm": 6.495224475860596, "learning_rate": 2.6044564291796765e-05, "loss": 2.1224464416503905, "step": 52130 }, { "epoch": 14.799886460403066, "grad_norm": 6.443127155303955, "learning_rate": 2.603037184217996e-05, "loss": 2.1315711975097655, "step": 52140 }, { "epoch": 14.802724950326427, "grad_norm": 6.654836177825928, "learning_rate": 2.6016179392563155e-05, "loss": 2.1881237030029297, "step": 52150 }, { "epoch": 14.805563440249788, "grad_norm": 6.35429048538208, "learning_rate": 2.6001986942946355e-05, "loss": 2.167977523803711, "step": 52160 }, { "epoch": 14.808401930173147, "grad_norm": 6.538930892944336, "learning_rate": 2.5987794493329548e-05, "loss": 2.162141036987305, "step": 52170 }, { "epoch": 14.811240420096508, "grad_norm": 6.833978652954102, "learning_rate": 2.5973602043712748e-05, "loss": 2.139962005615234, "step": 52180 }, { "epoch": 14.81407891001987, "grad_norm": 6.558335781097412, "learning_rate": 2.595940959409594e-05, "loss": 2.2221569061279296, "step": 52190 }, { "epoch": 14.81691739994323, "grad_norm": 6.568458557128906, "learning_rate": 2.594521714447914e-05, "loss": 2.1195186614990233, "step": 52200 }, { "epoch": 14.819755889866592, "grad_norm": 6.446073055267334, "learning_rate": 2.593102469486233e-05, "loss": 2.1400382995605467, "step": 52210 }, { "epoch": 14.822594379789951, "grad_norm": 6.373106956481934, "learning_rate": 2.5916832245245532e-05, "loss": 2.1656646728515625, "step": 52220 }, { "epoch": 14.825432869713312, "grad_norm": 6.230677604675293, "learning_rate": 2.5902639795628725e-05, "loss": 2.0880367279052736, "step": 52230 }, { "epoch": 14.828271359636673, "grad_norm": 6.66623592376709, "learning_rate": 2.5888447346011925e-05, "loss": 2.1974414825439452, "step": 52240 }, { "epoch": 14.831109849560034, "grad_norm": 6.3333587646484375, "learning_rate": 2.587425489639512e-05, "loss": 2.172192192077637, "step": 52250 }, { 
"epoch": 14.833948339483396, "grad_norm": 6.785139083862305, "learning_rate": 2.5860062446778315e-05, "loss": 2.1580020904541017, "step": 52260 }, { "epoch": 14.836786829406755, "grad_norm": 6.09185266494751, "learning_rate": 2.584586999716151e-05, "loss": 2.173833465576172, "step": 52270 }, { "epoch": 14.839625319330116, "grad_norm": 6.584026336669922, "learning_rate": 2.583167754754471e-05, "loss": 2.251263236999512, "step": 52280 }, { "epoch": 14.842463809253477, "grad_norm": 6.2776994705200195, "learning_rate": 2.5817485097927902e-05, "loss": 2.1520580291748046, "step": 52290 }, { "epoch": 14.845302299176838, "grad_norm": 7.341075420379639, "learning_rate": 2.5803292648311102e-05, "loss": 2.1309440612792967, "step": 52300 }, { "epoch": 14.8481407891002, "grad_norm": 6.5840163230896, "learning_rate": 2.5789100198694295e-05, "loss": 2.159071922302246, "step": 52310 }, { "epoch": 14.850979279023559, "grad_norm": 6.6433868408203125, "learning_rate": 2.5774907749077492e-05, "loss": 2.1375473022460936, "step": 52320 }, { "epoch": 14.85381776894692, "grad_norm": 6.476303577423096, "learning_rate": 2.5760715299460685e-05, "loss": 2.1724275588989257, "step": 52330 }, { "epoch": 14.85665625887028, "grad_norm": 6.792259693145752, "learning_rate": 2.5746522849843885e-05, "loss": 2.152323341369629, "step": 52340 }, { "epoch": 14.859494748793642, "grad_norm": 6.508646488189697, "learning_rate": 2.573233040022708e-05, "loss": 2.1625585556030273, "step": 52350 }, { "epoch": 14.862333238717003, "grad_norm": 6.53209924697876, "learning_rate": 2.5718137950610275e-05, "loss": 2.1787342071533202, "step": 52360 }, { "epoch": 14.865171728640362, "grad_norm": 6.115235328674316, "learning_rate": 2.570394550099347e-05, "loss": 2.223087120056152, "step": 52370 }, { "epoch": 14.868010218563724, "grad_norm": 6.464908599853516, "learning_rate": 2.568975305137667e-05, "loss": 2.175510215759277, "step": 52380 }, { "epoch": 14.870848708487085, "grad_norm": 6.384645938873291, "learning_rate": 
2.5675560601759862e-05, "loss": 2.1409093856811525, "step": 52390 }, { "epoch": 14.873687198410446, "grad_norm": 6.4138617515563965, "learning_rate": 2.5661368152143062e-05, "loss": 2.1409442901611326, "step": 52400 }, { "epoch": 14.876525688333807, "grad_norm": 6.39212703704834, "learning_rate": 2.5647175702526256e-05, "loss": 2.1335681915283202, "step": 52410 }, { "epoch": 14.879364178257168, "grad_norm": 6.490799903869629, "learning_rate": 2.5632983252909452e-05, "loss": 2.15484561920166, "step": 52420 }, { "epoch": 14.882202668180527, "grad_norm": 6.639063835144043, "learning_rate": 2.5618790803292646e-05, "loss": 2.1344459533691404, "step": 52430 }, { "epoch": 14.885041158103888, "grad_norm": 6.396474361419678, "learning_rate": 2.5604598353675846e-05, "loss": 2.2239797592163084, "step": 52440 }, { "epoch": 14.88787964802725, "grad_norm": 6.414002418518066, "learning_rate": 2.559040590405904e-05, "loss": 2.1655910491943358, "step": 52450 }, { "epoch": 14.89071813795061, "grad_norm": 6.543750762939453, "learning_rate": 2.557621345444224e-05, "loss": 2.181911849975586, "step": 52460 }, { "epoch": 14.893556627873972, "grad_norm": 6.6248602867126465, "learning_rate": 2.5562021004825433e-05, "loss": 2.2091259002685546, "step": 52470 }, { "epoch": 14.896395117797331, "grad_norm": 6.905755996704102, "learning_rate": 2.554782855520863e-05, "loss": 2.193409729003906, "step": 52480 }, { "epoch": 14.899233607720692, "grad_norm": 6.30825138092041, "learning_rate": 2.5533636105591823e-05, "loss": 2.1182792663574217, "step": 52490 }, { "epoch": 14.902072097644053, "grad_norm": 6.780544281005859, "learning_rate": 2.5519443655975023e-05, "loss": 2.146870994567871, "step": 52500 }, { "epoch": 14.902072097644053, "eval_accuracy": 0.3473644051630953, "eval_loss": 2.4746053218841553, "eval_runtime": 49.0617, "eval_samples_per_second": 320.556, "eval_steps_per_second": 5.014, "step": 52500 }, { "epoch": 14.904910587567414, "grad_norm": 6.1855292320251465, "learning_rate": 
2.5505251206358216e-05, "loss": 2.0212799072265626, "step": 52510 }, { "epoch": 14.907749077490775, "grad_norm": 6.718217372894287, "learning_rate": 2.5492478001703095e-05, "loss": 2.145752716064453, "step": 52520 }, { "epoch": 14.910587567414137, "grad_norm": 6.585477352142334, "learning_rate": 2.547828555208629e-05, "loss": 2.15454216003418, "step": 52530 }, { "epoch": 14.913426057337496, "grad_norm": 6.5534749031066895, "learning_rate": 2.546409310246949e-05, "loss": 2.1055809020996095, "step": 52540 }, { "epoch": 14.916264547260857, "grad_norm": 6.486547470092773, "learning_rate": 2.5449900652852682e-05, "loss": 2.059457778930664, "step": 52550 }, { "epoch": 14.919103037184218, "grad_norm": 6.5908522605896, "learning_rate": 2.5435708203235882e-05, "loss": 2.210110664367676, "step": 52560 }, { "epoch": 14.92194152710758, "grad_norm": 6.61740779876709, "learning_rate": 2.5421515753619076e-05, "loss": 2.2143957138061525, "step": 52570 }, { "epoch": 14.92478001703094, "grad_norm": 6.536825656890869, "learning_rate": 2.5407323304002272e-05, "loss": 2.140397644042969, "step": 52580 }, { "epoch": 14.9276185069543, "grad_norm": 6.379024028778076, "learning_rate": 2.5393130854385466e-05, "loss": 2.1397916793823244, "step": 52590 }, { "epoch": 14.93045699687766, "grad_norm": 6.788222312927246, "learning_rate": 2.5378938404768666e-05, "loss": 2.132095718383789, "step": 52600 }, { "epoch": 14.933295486801022, "grad_norm": 6.761805534362793, "learning_rate": 2.536474595515186e-05, "loss": 2.1166791915893555, "step": 52610 }, { "epoch": 14.936133976724383, "grad_norm": 6.458646774291992, "learning_rate": 2.5350553505535056e-05, "loss": 2.1415475845336913, "step": 52620 }, { "epoch": 14.938972466647744, "grad_norm": 6.302643775939941, "learning_rate": 2.5336361055918252e-05, "loss": 2.192586326599121, "step": 52630 }, { "epoch": 14.941810956571103, "grad_norm": 6.820559501647949, "learning_rate": 2.532216860630145e-05, "loss": 2.2397216796875, "step": 52640 }, { "epoch": 
14.944649446494465, "grad_norm": 6.514204025268555, "learning_rate": 2.5307976156684642e-05, "loss": 2.132811737060547, "step": 52650 }, { "epoch": 14.947487936417826, "grad_norm": 6.317018508911133, "learning_rate": 2.5293783707067843e-05, "loss": 2.116554832458496, "step": 52660 }, { "epoch": 14.950326426341187, "grad_norm": 7.040482044219971, "learning_rate": 2.5279591257451036e-05, "loss": 2.1716796875, "step": 52670 }, { "epoch": 14.953164916264548, "grad_norm": 6.715221881866455, "learning_rate": 2.5265398807834233e-05, "loss": 2.1472499847412108, "step": 52680 }, { "epoch": 14.956003406187907, "grad_norm": 6.410496711730957, "learning_rate": 2.5251206358217426e-05, "loss": 2.1312278747558593, "step": 52690 }, { "epoch": 14.958841896111268, "grad_norm": 6.457828044891357, "learning_rate": 2.5237013908600626e-05, "loss": 2.10549259185791, "step": 52700 }, { "epoch": 14.96168038603463, "grad_norm": 6.331070899963379, "learning_rate": 2.522282145898382e-05, "loss": 2.100213623046875, "step": 52710 }, { "epoch": 14.96451887595799, "grad_norm": 6.5535569190979, "learning_rate": 2.520862900936702e-05, "loss": 2.115045166015625, "step": 52720 }, { "epoch": 14.967357365881352, "grad_norm": 6.623978137969971, "learning_rate": 2.5194436559750213e-05, "loss": 2.1795068740844727, "step": 52730 }, { "epoch": 14.970195855804711, "grad_norm": 6.51175594329834, "learning_rate": 2.518024411013341e-05, "loss": 2.186904525756836, "step": 52740 }, { "epoch": 14.973034345728072, "grad_norm": 6.166961669921875, "learning_rate": 2.5166051660516603e-05, "loss": 2.1794233322143555, "step": 52750 }, { "epoch": 14.975872835651433, "grad_norm": 6.7807207107543945, "learning_rate": 2.5151859210899803e-05, "loss": 2.1726797103881834, "step": 52760 }, { "epoch": 14.978711325574794, "grad_norm": 6.4161763191223145, "learning_rate": 2.5137666761282996e-05, "loss": 2.2196271896362303, "step": 52770 }, { "epoch": 14.981549815498155, "grad_norm": 6.267117500305176, "learning_rate": 
2.5123474311666196e-05, "loss": 2.0858116149902344, "step": 52780 }, { "epoch": 14.984388305421517, "grad_norm": 6.567619800567627, "learning_rate": 2.510928186204939e-05, "loss": 2.1771800994873045, "step": 52790 }, { "epoch": 14.987226795344876, "grad_norm": 6.14469575881958, "learning_rate": 2.5095089412432586e-05, "loss": 2.2282691955566407, "step": 52800 }, { "epoch": 14.990065285268237, "grad_norm": 6.245480060577393, "learning_rate": 2.508089696281578e-05, "loss": 2.0946395874023436, "step": 52810 }, { "epoch": 14.992903775191598, "grad_norm": 6.374760627746582, "learning_rate": 2.506670451319898e-05, "loss": 2.174965476989746, "step": 52820 }, { "epoch": 14.99574226511496, "grad_norm": 6.384993076324463, "learning_rate": 2.5052512063582173e-05, "loss": 2.159699058532715, "step": 52830 }, { "epoch": 14.99858075503832, "grad_norm": 6.55537223815918, "learning_rate": 2.5038319613965373e-05, "loss": 2.1324089050292967, "step": 52840 }, { "epoch": 15.00141924496168, "grad_norm": 5.989511013031006, "learning_rate": 2.5024127164348566e-05, "loss": 2.066140365600586, "step": 52850 }, { "epoch": 15.00425773488504, "grad_norm": 6.8162312507629395, "learning_rate": 2.5009934714731763e-05, "loss": 2.2069065093994142, "step": 52860 }, { "epoch": 15.007096224808402, "grad_norm": 6.390866279602051, "learning_rate": 2.499574226511496e-05, "loss": 2.166504669189453, "step": 52870 }, { "epoch": 15.009934714731763, "grad_norm": 6.618041515350342, "learning_rate": 2.4981549815498157e-05, "loss": 2.1738889694213865, "step": 52880 }, { "epoch": 15.012773204655124, "grad_norm": 5.943508148193359, "learning_rate": 2.4967357365881353e-05, "loss": 2.1276668548583983, "step": 52890 }, { "epoch": 15.015611694578483, "grad_norm": 6.367565631866455, "learning_rate": 2.495316491626455e-05, "loss": 2.133428764343262, "step": 52900 }, { "epoch": 15.018450184501845, "grad_norm": 6.2424845695495605, "learning_rate": 2.4938972466647743e-05, "loss": 2.166765594482422, "step": 52910 }, { 
"epoch": 15.021288674425206, "grad_norm": 6.451653957366943, "learning_rate": 2.492478001703094e-05, "loss": 2.0814380645751953, "step": 52920 }, { "epoch": 15.024127164348567, "grad_norm": 6.603716850280762, "learning_rate": 2.4910587567414137e-05, "loss": 2.1639148712158205, "step": 52930 }, { "epoch": 15.026965654271928, "grad_norm": 6.519184112548828, "learning_rate": 2.4896395117797333e-05, "loss": 2.1594873428344727, "step": 52940 }, { "epoch": 15.029804144195289, "grad_norm": 6.443635940551758, "learning_rate": 2.488220266818053e-05, "loss": 2.1173273086547852, "step": 52950 }, { "epoch": 15.032642634118648, "grad_norm": 6.4254631996154785, "learning_rate": 2.4868010218563727e-05, "loss": 2.130908393859863, "step": 52960 }, { "epoch": 15.03548112404201, "grad_norm": 6.45412015914917, "learning_rate": 2.485381776894692e-05, "loss": 2.0992549896240233, "step": 52970 }, { "epoch": 15.03831961396537, "grad_norm": 6.858759880065918, "learning_rate": 2.4839625319330117e-05, "loss": 2.1381227493286135, "step": 52980 }, { "epoch": 15.041158103888732, "grad_norm": 6.604097843170166, "learning_rate": 2.4825432869713314e-05, "loss": 2.085332489013672, "step": 52990 }, { "epoch": 15.043996593812093, "grad_norm": 6.177943706512451, "learning_rate": 2.481124042009651e-05, "loss": 2.0926300048828126, "step": 53000 }, { "epoch": 15.043996593812093, "eval_accuracy": 0.3425319514211229, "eval_loss": 2.4736616611480713, "eval_runtime": 49.5335, "eval_samples_per_second": 317.502, "eval_steps_per_second": 4.966, "step": 53000 }, { "epoch": 15.046835083735452, "grad_norm": 6.3311638832092285, "learning_rate": 2.4797047970479707e-05, "loss": 2.136520767211914, "step": 53010 }, { "epoch": 15.049673573658813, "grad_norm": 6.527366638183594, "learning_rate": 2.4782855520862904e-05, "loss": 2.13500919342041, "step": 53020 }, { "epoch": 15.052512063582174, "grad_norm": 6.828665733337402, "learning_rate": 2.4768663071246097e-05, "loss": 2.126957893371582, "step": 53030 }, { "epoch": 
15.055350553505535, "grad_norm": 6.80460262298584, "learning_rate": 2.4754470621629294e-05, "loss": 2.1851362228393554, "step": 53040 }, { "epoch": 15.058189043428897, "grad_norm": 5.9483184814453125, "learning_rate": 2.474027817201249e-05, "loss": 2.0909902572631838, "step": 53050 }, { "epoch": 15.061027533352256, "grad_norm": 6.2031331062316895, "learning_rate": 2.4726085722395687e-05, "loss": 2.1609514236450194, "step": 53060 }, { "epoch": 15.063866023275617, "grad_norm": 6.334000110626221, "learning_rate": 2.4711893272778884e-05, "loss": 2.160488510131836, "step": 53070 }, { "epoch": 15.066704513198978, "grad_norm": 6.84523868560791, "learning_rate": 2.4697700823162077e-05, "loss": 2.1760480880737303, "step": 53080 }, { "epoch": 15.06954300312234, "grad_norm": 6.458366394042969, "learning_rate": 2.4683508373545274e-05, "loss": 2.15084228515625, "step": 53090 }, { "epoch": 15.0723814930457, "grad_norm": 6.522457122802734, "learning_rate": 2.466931592392847e-05, "loss": 2.14431209564209, "step": 53100 }, { "epoch": 15.07521998296906, "grad_norm": 6.537765026092529, "learning_rate": 2.4655123474311667e-05, "loss": 2.1196414947509767, "step": 53110 }, { "epoch": 15.07805847289242, "grad_norm": 6.320575714111328, "learning_rate": 2.4640931024694864e-05, "loss": 2.1552047729492188, "step": 53120 }, { "epoch": 15.080896962815782, "grad_norm": 6.069414138793945, "learning_rate": 2.462673857507806e-05, "loss": 2.0473211288452147, "step": 53130 }, { "epoch": 15.083735452739143, "grad_norm": 6.429774284362793, "learning_rate": 2.4612546125461254e-05, "loss": 2.133646011352539, "step": 53140 }, { "epoch": 15.086573942662504, "grad_norm": 6.299708843231201, "learning_rate": 2.459835367584445e-05, "loss": 2.1635181427001955, "step": 53150 }, { "epoch": 15.089412432585865, "grad_norm": 6.383599758148193, "learning_rate": 2.4584161226227648e-05, "loss": 2.17415714263916, "step": 53160 }, { "epoch": 15.092250922509225, "grad_norm": 6.555253028869629, "learning_rate": 
2.4569968776610844e-05, "loss": 2.1075885772705076, "step": 53170 }, { "epoch": 15.095089412432586, "grad_norm": 6.511601448059082, "learning_rate": 2.455577632699404e-05, "loss": 2.1428033828735353, "step": 53180 }, { "epoch": 15.097927902355947, "grad_norm": 6.539876461029053, "learning_rate": 2.4541583877377238e-05, "loss": 2.1421014785766603, "step": 53190 }, { "epoch": 15.100766392279308, "grad_norm": 6.252833366394043, "learning_rate": 2.452739142776043e-05, "loss": 2.0691417694091796, "step": 53200 }, { "epoch": 15.103604882202669, "grad_norm": 6.323064804077148, "learning_rate": 2.4513198978143628e-05, "loss": 2.1839569091796873, "step": 53210 }, { "epoch": 15.106443372126028, "grad_norm": 6.343355178833008, "learning_rate": 2.4499006528526824e-05, "loss": 2.1785358428955077, "step": 53220 }, { "epoch": 15.10928186204939, "grad_norm": 6.399342060089111, "learning_rate": 2.448481407891002e-05, "loss": 2.12392635345459, "step": 53230 }, { "epoch": 15.11212035197275, "grad_norm": 6.795849800109863, "learning_rate": 2.4470621629293218e-05, "loss": 2.1421518325805664, "step": 53240 }, { "epoch": 15.114958841896112, "grad_norm": 6.546746253967285, "learning_rate": 2.445642917967641e-05, "loss": 2.219986152648926, "step": 53250 }, { "epoch": 15.117797331819473, "grad_norm": 6.29577112197876, "learning_rate": 2.4442236730059608e-05, "loss": 2.086741256713867, "step": 53260 }, { "epoch": 15.120635821742832, "grad_norm": 6.265172481536865, "learning_rate": 2.4428044280442805e-05, "loss": 2.0032865524291994, "step": 53270 }, { "epoch": 15.123474311666193, "grad_norm": 6.474606990814209, "learning_rate": 2.4413851830826e-05, "loss": 2.1189889907836914, "step": 53280 }, { "epoch": 15.126312801589554, "grad_norm": 6.4445624351501465, "learning_rate": 2.4399659381209198e-05, "loss": 2.063644599914551, "step": 53290 }, { "epoch": 15.129151291512915, "grad_norm": 6.677933692932129, "learning_rate": 2.4385466931592395e-05, "loss": 2.163540267944336, "step": 53300 }, { 
"epoch": 15.131989781436276, "grad_norm": 6.625088214874268, "learning_rate": 2.4371274481975588e-05, "loss": 2.1740617752075195, "step": 53310 }, { "epoch": 15.134828271359636, "grad_norm": 6.436030387878418, "learning_rate": 2.4357082032358785e-05, "loss": 2.1459875106811523, "step": 53320 }, { "epoch": 15.137666761282997, "grad_norm": 6.445911884307861, "learning_rate": 2.434288958274198e-05, "loss": 2.0770259857177735, "step": 53330 }, { "epoch": 15.140505251206358, "grad_norm": 6.890684604644775, "learning_rate": 2.4328697133125178e-05, "loss": 2.1650415420532227, "step": 53340 }, { "epoch": 15.14334374112972, "grad_norm": 6.358134746551514, "learning_rate": 2.4314504683508375e-05, "loss": 2.2428955078125, "step": 53350 }, { "epoch": 15.14618223105308, "grad_norm": 6.61152458190918, "learning_rate": 2.430031223389157e-05, "loss": 2.146128273010254, "step": 53360 }, { "epoch": 15.149020720976441, "grad_norm": 6.6259565353393555, "learning_rate": 2.4286119784274765e-05, "loss": 2.2593557357788088, "step": 53370 }, { "epoch": 15.1518592108998, "grad_norm": 6.841925621032715, "learning_rate": 2.427192733465796e-05, "loss": 2.1109275817871094, "step": 53380 }, { "epoch": 15.154697700823162, "grad_norm": 6.812272548675537, "learning_rate": 2.4257734885041158e-05, "loss": 2.131489944458008, "step": 53390 }, { "epoch": 15.157536190746523, "grad_norm": 7.02168083190918, "learning_rate": 2.4243542435424355e-05, "loss": 2.158890724182129, "step": 53400 }, { "epoch": 15.160374680669884, "grad_norm": 6.808154582977295, "learning_rate": 2.422934998580755e-05, "loss": 2.161627769470215, "step": 53410 }, { "epoch": 15.163213170593245, "grad_norm": 6.438183307647705, "learning_rate": 2.421515753619075e-05, "loss": 2.1877168655395507, "step": 53420 }, { "epoch": 15.166051660516604, "grad_norm": 6.7278523445129395, "learning_rate": 2.4200965086573942e-05, "loss": 2.0828578948974608, "step": 53430 }, { "epoch": 15.168890150439966, "grad_norm": 6.086673259735107, "learning_rate": 
2.418677263695714e-05, "loss": 2.2404348373413088, "step": 53440 }, { "epoch": 15.171728640363327, "grad_norm": 6.129936218261719, "learning_rate": 2.4172580187340335e-05, "loss": 2.1418832778930663, "step": 53450 }, { "epoch": 15.174567130286688, "grad_norm": 6.53759765625, "learning_rate": 2.4158387737723532e-05, "loss": 2.082825469970703, "step": 53460 }, { "epoch": 15.177405620210049, "grad_norm": 6.381281852722168, "learning_rate": 2.414419528810673e-05, "loss": 2.121112823486328, "step": 53470 }, { "epoch": 15.180244110133408, "grad_norm": 6.4015631675720215, "learning_rate": 2.4130002838489922e-05, "loss": 2.1192867279052736, "step": 53480 }, { "epoch": 15.18308260005677, "grad_norm": 6.4805192947387695, "learning_rate": 2.411581038887312e-05, "loss": 2.1381322860717775, "step": 53490 }, { "epoch": 15.18592108998013, "grad_norm": 6.988881587982178, "learning_rate": 2.4101617939256315e-05, "loss": 2.1900100708007812, "step": 53500 }, { "epoch": 15.18592108998013, "eval_accuracy": 0.34768232975138297, "eval_loss": 2.472740411758423, "eval_runtime": 49.6355, "eval_samples_per_second": 316.85, "eval_steps_per_second": 4.956, "step": 53500 }, { "epoch": 15.188759579903492, "grad_norm": 6.72157621383667, "learning_rate": 2.4087425489639512e-05, "loss": 2.1035024642944338, "step": 53510 }, { "epoch": 15.191598069826853, "grad_norm": 6.33087158203125, "learning_rate": 2.407323304002271e-05, "loss": 2.0983543395996094, "step": 53520 }, { "epoch": 15.194436559750212, "grad_norm": 6.732876777648926, "learning_rate": 2.4059040590405905e-05, "loss": 2.146035385131836, "step": 53530 }, { "epoch": 15.197275049673573, "grad_norm": 6.324202537536621, "learning_rate": 2.40448481407891e-05, "loss": 2.098387336730957, "step": 53540 }, { "epoch": 15.200113539596934, "grad_norm": 6.124294281005859, "learning_rate": 2.4032074936133978e-05, "loss": 2.0868406295776367, "step": 53550 }, { "epoch": 15.202952029520295, "grad_norm": 6.6089301109313965, "learning_rate": 
2.4017882486517175e-05, "loss": 2.062884521484375, "step": 53560 }, { "epoch": 15.205790519443656, "grad_norm": 6.772044658660889, "learning_rate": 2.4003690036900368e-05, "loss": 2.138472557067871, "step": 53570 }, { "epoch": 15.208629009367018, "grad_norm": 6.258087635040283, "learning_rate": 2.3989497587283565e-05, "loss": 2.1143705368041994, "step": 53580 }, { "epoch": 15.211467499290377, "grad_norm": 6.68936824798584, "learning_rate": 2.397530513766676e-05, "loss": 2.1234134674072265, "step": 53590 }, { "epoch": 15.214305989213738, "grad_norm": 6.423226833343506, "learning_rate": 2.3961112688049958e-05, "loss": 2.1313196182250977, "step": 53600 }, { "epoch": 15.217144479137099, "grad_norm": 6.658445835113525, "learning_rate": 2.3946920238433155e-05, "loss": 2.164855194091797, "step": 53610 }, { "epoch": 15.21998296906046, "grad_norm": 6.541187286376953, "learning_rate": 2.3932727788816352e-05, "loss": 2.1785003662109377, "step": 53620 }, { "epoch": 15.222821458983821, "grad_norm": 6.220309257507324, "learning_rate": 2.3918535339199545e-05, "loss": 2.078019714355469, "step": 53630 }, { "epoch": 15.22565994890718, "grad_norm": 6.248966693878174, "learning_rate": 2.3904342889582742e-05, "loss": 2.1491947174072266, "step": 53640 }, { "epoch": 15.228498438830542, "grad_norm": 6.841236591339111, "learning_rate": 2.389015043996594e-05, "loss": 2.1221763610839846, "step": 53650 }, { "epoch": 15.231336928753903, "grad_norm": 6.487249851226807, "learning_rate": 2.3875957990349135e-05, "loss": 2.191390037536621, "step": 53660 }, { "epoch": 15.234175418677264, "grad_norm": 6.040241718292236, "learning_rate": 2.3861765540732332e-05, "loss": 2.1695024490356447, "step": 53670 }, { "epoch": 15.237013908600625, "grad_norm": 6.389219760894775, "learning_rate": 2.384757309111553e-05, "loss": 2.1424081802368162, "step": 53680 }, { "epoch": 15.239852398523984, "grad_norm": 6.31194543838501, "learning_rate": 2.3833380641498722e-05, "loss": 2.150219535827637, "step": 53690 }, { 
"epoch": 15.242690888447346, "grad_norm": 6.392569541931152, "learning_rate": 2.381918819188192e-05, "loss": 2.126376724243164, "step": 53700 }, { "epoch": 15.245529378370707, "grad_norm": 6.300080299377441, "learning_rate": 2.3804995742265115e-05, "loss": 2.147800254821777, "step": 53710 }, { "epoch": 15.248367868294068, "grad_norm": 6.206798076629639, "learning_rate": 2.3790803292648312e-05, "loss": 2.136124038696289, "step": 53720 }, { "epoch": 15.251206358217429, "grad_norm": 6.141240119934082, "learning_rate": 2.377661084303151e-05, "loss": 2.0857013702392577, "step": 53730 }, { "epoch": 15.25404484814079, "grad_norm": 6.811527729034424, "learning_rate": 2.3762418393414705e-05, "loss": 2.167082977294922, "step": 53740 }, { "epoch": 15.25688333806415, "grad_norm": 6.795415878295898, "learning_rate": 2.37482259437979e-05, "loss": 2.1486042022705076, "step": 53750 }, { "epoch": 15.25972182798751, "grad_norm": 6.464561939239502, "learning_rate": 2.3734033494181095e-05, "loss": 2.161417007446289, "step": 53760 }, { "epoch": 15.262560317910872, "grad_norm": 6.342434883117676, "learning_rate": 2.3719841044564292e-05, "loss": 2.132794952392578, "step": 53770 }, { "epoch": 15.265398807834233, "grad_norm": 6.486599922180176, "learning_rate": 2.370564859494749e-05, "loss": 2.069606399536133, "step": 53780 }, { "epoch": 15.268237297757594, "grad_norm": 6.3498735427856445, "learning_rate": 2.3691456145330686e-05, "loss": 2.1208024978637696, "step": 53790 }, { "epoch": 15.271075787680953, "grad_norm": 6.536123752593994, "learning_rate": 2.367726369571388e-05, "loss": 2.194383430480957, "step": 53800 }, { "epoch": 15.273914277604314, "grad_norm": 6.4565653800964355, "learning_rate": 2.3663071246097076e-05, "loss": 2.1596778869628905, "step": 53810 }, { "epoch": 15.276752767527675, "grad_norm": 6.270236968994141, "learning_rate": 2.3648878796480272e-05, "loss": 2.0944581985473634, "step": 53820 }, { "epoch": 15.279591257451036, "grad_norm": 6.640946865081787, "learning_rate": 
2.363468634686347e-05, "loss": 2.1873416900634766, "step": 53830 }, { "epoch": 15.282429747374398, "grad_norm": 6.382467269897461, "learning_rate": 2.3620493897246666e-05, "loss": 2.2086532592773436, "step": 53840 }, { "epoch": 15.285268237297757, "grad_norm": 6.363834381103516, "learning_rate": 2.3606301447629862e-05, "loss": 2.151993179321289, "step": 53850 }, { "epoch": 15.288106727221118, "grad_norm": 6.467418193817139, "learning_rate": 2.3592108998013056e-05, "loss": 2.1213348388671873, "step": 53860 }, { "epoch": 15.290945217144479, "grad_norm": 6.5376973152160645, "learning_rate": 2.3577916548396253e-05, "loss": 2.1332914352416994, "step": 53870 }, { "epoch": 15.29378370706784, "grad_norm": 6.497325897216797, "learning_rate": 2.356372409877945e-05, "loss": 2.186327362060547, "step": 53880 }, { "epoch": 15.296622196991201, "grad_norm": 6.341065406799316, "learning_rate": 2.3549531649162646e-05, "loss": 2.1216289520263674, "step": 53890 }, { "epoch": 15.29946068691456, "grad_norm": 6.515848159790039, "learning_rate": 2.3535339199545843e-05, "loss": 2.251807975769043, "step": 53900 }, { "epoch": 15.302299176837922, "grad_norm": 6.598287105560303, "learning_rate": 2.352114674992904e-05, "loss": 2.057073783874512, "step": 53910 }, { "epoch": 15.305137666761283, "grad_norm": 6.216014385223389, "learning_rate": 2.3506954300312233e-05, "loss": 2.136677360534668, "step": 53920 }, { "epoch": 15.307976156684644, "grad_norm": 6.345959663391113, "learning_rate": 2.349276185069543e-05, "loss": 2.1482215881347657, "step": 53930 }, { "epoch": 15.310814646608005, "grad_norm": 6.767091274261475, "learning_rate": 2.3478569401078626e-05, "loss": 2.2014610290527346, "step": 53940 }, { "epoch": 15.313653136531366, "grad_norm": 6.734010696411133, "learning_rate": 2.3464376951461823e-05, "loss": 2.1290990829467775, "step": 53950 }, { "epoch": 15.316491626454726, "grad_norm": 6.4903059005737305, "learning_rate": 2.345018450184502e-05, "loss": 2.1168041229248047, "step": 53960 }, { 
"epoch": 15.319330116378087, "grad_norm": 6.525570392608643, "learning_rate": 2.3435992052228213e-05, "loss": 2.1247940063476562, "step": 53970 }, { "epoch": 15.322168606301448, "grad_norm": 6.49920129776001, "learning_rate": 2.342179960261141e-05, "loss": 2.120111656188965, "step": 53980 }, { "epoch": 15.325007096224809, "grad_norm": 6.4179840087890625, "learning_rate": 2.3407607152994606e-05, "loss": 2.1584396362304688, "step": 53990 }, { "epoch": 15.32784558614817, "grad_norm": 6.104822158813477, "learning_rate": 2.3393414703377803e-05, "loss": 2.133905792236328, "step": 54000 }, { "epoch": 15.32784558614817, "eval_accuracy": 0.3459019520569721, "eval_loss": 2.4723052978515625, "eval_runtime": 48.6729, "eval_samples_per_second": 323.116, "eval_steps_per_second": 5.054, "step": 54000 }, { "epoch": 15.33068407607153, "grad_norm": 7.08802604675293, "learning_rate": 2.3379222253761e-05, "loss": 2.161177062988281, "step": 54010 }, { "epoch": 15.33352256599489, "grad_norm": 6.6432929039001465, "learning_rate": 2.3365029804144196e-05, "loss": 2.077311134338379, "step": 54020 }, { "epoch": 15.336361055918251, "grad_norm": 6.349873065948486, "learning_rate": 2.335083735452739e-05, "loss": 2.004250335693359, "step": 54030 }, { "epoch": 15.339199545841613, "grad_norm": 6.684207439422607, "learning_rate": 2.3336644904910586e-05, "loss": 2.11606502532959, "step": 54040 }, { "epoch": 15.342038035764974, "grad_norm": 6.192377090454102, "learning_rate": 2.3322452455293787e-05, "loss": 2.1678062438964845, "step": 54050 }, { "epoch": 15.344876525688333, "grad_norm": 6.655971527099609, "learning_rate": 2.3308260005676983e-05, "loss": 2.1605913162231447, "step": 54060 }, { "epoch": 15.347715015611694, "grad_norm": 6.410719871520996, "learning_rate": 2.329406755606018e-05, "loss": 2.0964250564575195, "step": 54070 }, { "epoch": 15.350553505535055, "grad_norm": 6.694612979888916, "learning_rate": 2.3279875106443373e-05, "loss": 2.0665283203125, "step": 54080 }, { "epoch": 
15.353391995458416, "grad_norm": 6.059391021728516, "learning_rate": 2.326568265682657e-05, "loss": 2.1616695404052733, "step": 54090 }, { "epoch": 15.356230485381777, "grad_norm": 6.658933162689209, "learning_rate": 2.3251490207209767e-05, "loss": 2.1590549468994142, "step": 54100 }, { "epoch": 15.359068975305137, "grad_norm": 6.797679901123047, "learning_rate": 2.3237297757592963e-05, "loss": 2.1571144104003905, "step": 54110 }, { "epoch": 15.361907465228498, "grad_norm": 6.297545433044434, "learning_rate": 2.322310530797616e-05, "loss": 2.1529010772705077, "step": 54120 }, { "epoch": 15.364745955151859, "grad_norm": 7.080237865447998, "learning_rate": 2.3208912858359357e-05, "loss": 2.168043327331543, "step": 54130 }, { "epoch": 15.36758444507522, "grad_norm": 6.7907209396362305, "learning_rate": 2.319472040874255e-05, "loss": 2.2044921875, "step": 54140 }, { "epoch": 15.370422934998581, "grad_norm": 6.168161869049072, "learning_rate": 2.3180527959125747e-05, "loss": 2.1420530319213866, "step": 54150 }, { "epoch": 15.373261424921942, "grad_norm": 6.599157333374023, "learning_rate": 2.3166335509508944e-05, "loss": 2.204114532470703, "step": 54160 }, { "epoch": 15.376099914845302, "grad_norm": 6.676085948944092, "learning_rate": 2.315214305989214e-05, "loss": 2.1748138427734376, "step": 54170 }, { "epoch": 15.378938404768663, "grad_norm": 6.7879509925842285, "learning_rate": 2.3137950610275337e-05, "loss": 2.173269844055176, "step": 54180 }, { "epoch": 15.381776894692024, "grad_norm": 6.186537742614746, "learning_rate": 2.312375816065853e-05, "loss": 2.1595428466796873, "step": 54190 }, { "epoch": 15.384615384615385, "grad_norm": 6.053598403930664, "learning_rate": 2.3109565711041727e-05, "loss": 2.136849021911621, "step": 54200 }, { "epoch": 15.387453874538746, "grad_norm": 6.237117290496826, "learning_rate": 2.3095373261424924e-05, "loss": 2.1499923706054687, "step": 54210 }, { "epoch": 15.390292364462105, "grad_norm": 6.253475189208984, "learning_rate": 
2.308118081180812e-05, "loss": 2.0710851669311525, "step": 54220 }, { "epoch": 15.393130854385467, "grad_norm": 6.579779624938965, "learning_rate": 2.3066988362191317e-05, "loss": 2.121834373474121, "step": 54230 }, { "epoch": 15.395969344308828, "grad_norm": 6.385104179382324, "learning_rate": 2.3052795912574514e-05, "loss": 2.1385345458984375, "step": 54240 }, { "epoch": 15.398807834232189, "grad_norm": 6.760473728179932, "learning_rate": 2.3038603462957707e-05, "loss": 2.172305870056152, "step": 54250 }, { "epoch": 15.40164632415555, "grad_norm": 6.606520652770996, "learning_rate": 2.3024411013340904e-05, "loss": 2.130588150024414, "step": 54260 }, { "epoch": 15.40448481407891, "grad_norm": 6.925575256347656, "learning_rate": 2.30102185637241e-05, "loss": 2.1122720718383787, "step": 54270 }, { "epoch": 15.40732330400227, "grad_norm": 6.772551536560059, "learning_rate": 2.2996026114107297e-05, "loss": 2.1324270248413084, "step": 54280 }, { "epoch": 15.410161793925631, "grad_norm": 6.218590259552002, "learning_rate": 2.2981833664490494e-05, "loss": 2.100429344177246, "step": 54290 }, { "epoch": 15.413000283848993, "grad_norm": 6.287024974822998, "learning_rate": 2.296764121487369e-05, "loss": 2.159824562072754, "step": 54300 }, { "epoch": 15.415838773772354, "grad_norm": 6.807006359100342, "learning_rate": 2.2953448765256884e-05, "loss": 2.2057373046875, "step": 54310 }, { "epoch": 15.418677263695713, "grad_norm": 6.435606002807617, "learning_rate": 2.293925631564008e-05, "loss": 2.116690444946289, "step": 54320 }, { "epoch": 15.421515753619074, "grad_norm": 6.559557914733887, "learning_rate": 2.2925063866023277e-05, "loss": 2.1528282165527344, "step": 54330 }, { "epoch": 15.424354243542435, "grad_norm": 6.66141939163208, "learning_rate": 2.2910871416406474e-05, "loss": 2.1115139007568358, "step": 54340 }, { "epoch": 15.427192733465796, "grad_norm": 6.365882396697998, "learning_rate": 2.289667896678967e-05, "loss": 2.1733154296875, "step": 54350 }, { "epoch": 
15.430031223389157, "grad_norm": 6.620124340057373, "learning_rate": 2.2882486517172864e-05, "loss": 2.1131103515625, "step": 54360 }, { "epoch": 15.432869713312519, "grad_norm": 6.429750442504883, "learning_rate": 2.286829406755606e-05, "loss": 2.0707109451293944, "step": 54370 }, { "epoch": 15.435708203235878, "grad_norm": 6.7499284744262695, "learning_rate": 2.2854101617939258e-05, "loss": 2.1208187103271485, "step": 54380 }, { "epoch": 15.438546693159239, "grad_norm": 6.823802947998047, "learning_rate": 2.2839909168322454e-05, "loss": 2.15653133392334, "step": 54390 }, { "epoch": 15.4413851830826, "grad_norm": 6.406754493713379, "learning_rate": 2.282571671870565e-05, "loss": 2.0977291107177733, "step": 54400 }, { "epoch": 15.444223673005961, "grad_norm": 6.452045440673828, "learning_rate": 2.2811524269088848e-05, "loss": 2.145039367675781, "step": 54410 }, { "epoch": 15.447062162929322, "grad_norm": 6.266096115112305, "learning_rate": 2.279733181947204e-05, "loss": 2.105270004272461, "step": 54420 }, { "epoch": 15.449900652852682, "grad_norm": 6.4556779861450195, "learning_rate": 2.2783139369855238e-05, "loss": 2.105578804016113, "step": 54430 }, { "epoch": 15.452739142776043, "grad_norm": 6.461822032928467, "learning_rate": 2.2768946920238434e-05, "loss": 2.1304977416992186, "step": 54440 }, { "epoch": 15.455577632699404, "grad_norm": 6.7905426025390625, "learning_rate": 2.275475447062163e-05, "loss": 2.096087646484375, "step": 54450 }, { "epoch": 15.458416122622765, "grad_norm": 6.499311447143555, "learning_rate": 2.2740562021004828e-05, "loss": 2.159975051879883, "step": 54460 }, { "epoch": 15.461254612546126, "grad_norm": 6.499456405639648, "learning_rate": 2.2726369571388025e-05, "loss": 2.1427927017211914, "step": 54470 }, { "epoch": 15.464093102469485, "grad_norm": 6.463589668273926, "learning_rate": 2.2712177121771218e-05, "loss": 2.0939796447753904, "step": 54480 }, { "epoch": 15.466931592392847, "grad_norm": 6.399744510650635, "learning_rate": 
2.2697984672154415e-05, "loss": 2.1472476959228515, "step": 54490 }, { "epoch": 15.469770082316208, "grad_norm": 6.285637378692627, "learning_rate": 2.268379222253761e-05, "loss": 2.083094024658203, "step": 54500 }, { "epoch": 15.469770082316208, "eval_accuracy": 0.3461562917276022, "eval_loss": 2.4694736003875732, "eval_runtime": 54.3022, "eval_samples_per_second": 289.62, "eval_steps_per_second": 4.53, "step": 54500 }, { "epoch": 15.472608572239569, "grad_norm": 6.471351146697998, "learning_rate": 2.2669599772920808e-05, "loss": 2.09869384765625, "step": 54510 }, { "epoch": 15.47544706216293, "grad_norm": 6.854947090148926, "learning_rate": 2.2655407323304005e-05, "loss": 2.163711929321289, "step": 54520 }, { "epoch": 15.478285552086291, "grad_norm": 6.303498268127441, "learning_rate": 2.26412148736872e-05, "loss": 2.1199666976928713, "step": 54530 }, { "epoch": 15.48112404200965, "grad_norm": 6.098452091217041, "learning_rate": 2.2627022424070395e-05, "loss": 2.027938652038574, "step": 54540 }, { "epoch": 15.483962531933011, "grad_norm": 6.359694480895996, "learning_rate": 2.261282997445359e-05, "loss": 2.1906801223754884, "step": 54550 }, { "epoch": 15.486801021856373, "grad_norm": 6.491250514984131, "learning_rate": 2.2598637524836788e-05, "loss": 2.123559761047363, "step": 54560 }, { "epoch": 15.489639511779734, "grad_norm": 6.463473796844482, "learning_rate": 2.2584445075219985e-05, "loss": 2.182482147216797, "step": 54570 }, { "epoch": 15.492478001703095, "grad_norm": 6.475100517272949, "learning_rate": 2.257025262560318e-05, "loss": 2.13449764251709, "step": 54580 }, { "epoch": 15.495316491626454, "grad_norm": 6.605320930480957, "learning_rate": 2.2556060175986375e-05, "loss": 2.1453872680664063, "step": 54590 }, { "epoch": 15.498154981549815, "grad_norm": 6.763043403625488, "learning_rate": 2.254186772636957e-05, "loss": 2.153734588623047, "step": 54600 }, { "epoch": 15.500993471473176, "grad_norm": 6.095156669616699, "learning_rate": 
2.252767527675277e-05, "loss": 2.1268760681152346, "step": 54610 }, { "epoch": 15.503831961396537, "grad_norm": 6.8226518630981445, "learning_rate": 2.2513482827135965e-05, "loss": 2.2218555450439452, "step": 54620 }, { "epoch": 15.506670451319899, "grad_norm": 6.574230194091797, "learning_rate": 2.2499290377519162e-05, "loss": 2.118303108215332, "step": 54630 }, { "epoch": 15.509508941243258, "grad_norm": 6.295968532562256, "learning_rate": 2.248509792790236e-05, "loss": 2.1655929565429686, "step": 54640 }, { "epoch": 15.512347431166619, "grad_norm": 6.360483646392822, "learning_rate": 2.2470905478285552e-05, "loss": 2.1849382400512694, "step": 54650 }, { "epoch": 15.51518592108998, "grad_norm": 6.169153213500977, "learning_rate": 2.245671302866875e-05, "loss": 2.1302791595458985, "step": 54660 }, { "epoch": 15.518024411013341, "grad_norm": 6.332473278045654, "learning_rate": 2.2442520579051945e-05, "loss": 2.1033355712890627, "step": 54670 }, { "epoch": 15.520862900936702, "grad_norm": 6.555882453918457, "learning_rate": 2.2428328129435142e-05, "loss": 2.1769454956054686, "step": 54680 }, { "epoch": 15.523701390860062, "grad_norm": 6.72318172454834, "learning_rate": 2.241413567981834e-05, "loss": 2.127999687194824, "step": 54690 }, { "epoch": 15.526539880783423, "grad_norm": 6.6428093910217285, "learning_rate": 2.2399943230201535e-05, "loss": 2.1569541931152343, "step": 54700 }, { "epoch": 15.529378370706784, "grad_norm": 6.757039546966553, "learning_rate": 2.238575078058473e-05, "loss": 2.083237648010254, "step": 54710 }, { "epoch": 15.532216860630145, "grad_norm": 6.444150447845459, "learning_rate": 2.2371558330967925e-05, "loss": 2.0610891342163087, "step": 54720 }, { "epoch": 15.535055350553506, "grad_norm": 6.612267971038818, "learning_rate": 2.2357365881351122e-05, "loss": 2.125376510620117, "step": 54730 }, { "epoch": 15.537893840476865, "grad_norm": 6.408292770385742, "learning_rate": 2.234317343173432e-05, "loss": 2.0938287734985352, "step": 54740 }, { 
"epoch": 15.540732330400227, "grad_norm": 6.2583746910095215, "learning_rate": 2.2328980982117515e-05, "loss": 2.0821407318115233, "step": 54750 }, { "epoch": 15.543570820323588, "grad_norm": 6.764197826385498, "learning_rate": 2.2314788532500712e-05, "loss": 2.1193450927734374, "step": 54760 }, { "epoch": 15.546409310246949, "grad_norm": 6.677443027496338, "learning_rate": 2.2300596082883906e-05, "loss": 2.0582931518554686, "step": 54770 }, { "epoch": 15.54924780017031, "grad_norm": 6.48002290725708, "learning_rate": 2.2286403633267102e-05, "loss": 2.1064790725708007, "step": 54780 }, { "epoch": 15.552086290093671, "grad_norm": 6.169969081878662, "learning_rate": 2.22722111836503e-05, "loss": 2.103662872314453, "step": 54790 }, { "epoch": 15.55492478001703, "grad_norm": 6.3790998458862305, "learning_rate": 2.2258018734033496e-05, "loss": 2.1215444564819337, "step": 54800 }, { "epoch": 15.557763269940391, "grad_norm": 6.403201103210449, "learning_rate": 2.2243826284416692e-05, "loss": 2.1729141235351563, "step": 54810 }, { "epoch": 15.560601759863752, "grad_norm": 6.440572738647461, "learning_rate": 2.2229633834799886e-05, "loss": 2.0647796630859374, "step": 54820 }, { "epoch": 15.563440249787114, "grad_norm": 6.689314365386963, "learning_rate": 2.2215441385183082e-05, "loss": 2.187534141540527, "step": 54830 }, { "epoch": 15.566278739710475, "grad_norm": 6.301349639892578, "learning_rate": 2.220124893556628e-05, "loss": 2.1775238037109377, "step": 54840 }, { "epoch": 15.569117229633834, "grad_norm": 6.686894416809082, "learning_rate": 2.2187056485949476e-05, "loss": 2.1217857360839845, "step": 54850 }, { "epoch": 15.571955719557195, "grad_norm": 6.372021198272705, "learning_rate": 2.2172864036332673e-05, "loss": 2.183876609802246, "step": 54860 }, { "epoch": 15.574794209480556, "grad_norm": 6.207261562347412, "learning_rate": 2.215867158671587e-05, "loss": 2.113250160217285, "step": 54870 }, { "epoch": 15.577632699403917, "grad_norm": 6.6745524406433105, 
"learning_rate": 2.2144479137099063e-05, "loss": 2.196917915344238, "step": 54880 }, { "epoch": 15.580471189327278, "grad_norm": 6.4283599853515625, "learning_rate": 2.213028668748226e-05, "loss": 2.121527671813965, "step": 54890 }, { "epoch": 15.58330967925064, "grad_norm": 6.460705280303955, "learning_rate": 2.2116094237865456e-05, "loss": 2.116798210144043, "step": 54900 }, { "epoch": 15.586148169173999, "grad_norm": 6.4172587394714355, "learning_rate": 2.2101901788248653e-05, "loss": 2.1102741241455076, "step": 54910 }, { "epoch": 15.58898665909736, "grad_norm": 6.574142932891846, "learning_rate": 2.208770933863185e-05, "loss": 2.171348762512207, "step": 54920 }, { "epoch": 15.591825149020721, "grad_norm": 6.683598518371582, "learning_rate": 2.2073516889015046e-05, "loss": 2.126019287109375, "step": 54930 }, { "epoch": 15.594663638944082, "grad_norm": 6.168909549713135, "learning_rate": 2.205932443939824e-05, "loss": 2.1621377944946287, "step": 54940 }, { "epoch": 15.597502128867443, "grad_norm": 6.186236381530762, "learning_rate": 2.2045131989781436e-05, "loss": 2.0523553848266602, "step": 54950 }, { "epoch": 15.600340618790803, "grad_norm": 6.5728607177734375, "learning_rate": 2.2030939540164633e-05, "loss": 2.114817810058594, "step": 54960 }, { "epoch": 15.603179108714164, "grad_norm": 6.353207111358643, "learning_rate": 2.201674709054783e-05, "loss": 2.174020767211914, "step": 54970 }, { "epoch": 15.606017598637525, "grad_norm": 6.461165904998779, "learning_rate": 2.2002554640931026e-05, "loss": 2.186916732788086, "step": 54980 }, { "epoch": 15.608856088560886, "grad_norm": 6.475900173187256, "learning_rate": 2.198836219131422e-05, "loss": 2.1315401077270506, "step": 54990 }, { "epoch": 15.611694578484247, "grad_norm": 6.360066890716553, "learning_rate": 2.1974169741697416e-05, "loss": 2.1306583404541017, "step": 55000 }, { "epoch": 15.611694578484247, "eval_accuracy": 0.34545685763336936, "eval_loss": 2.4668076038360596, "eval_runtime": 50.152, 
"eval_samples_per_second": 313.586, "eval_steps_per_second": 4.905, "step": 55000 }, { "epoch": 15.614533068407606, "grad_norm": 6.363924980163574, "learning_rate": 2.1959977292080613e-05, "loss": 2.215139389038086, "step": 55010 }, { "epoch": 15.617371558330968, "grad_norm": 6.276641845703125, "learning_rate": 2.194578484246381e-05, "loss": 2.1591423034667967, "step": 55020 }, { "epoch": 15.620210048254329, "grad_norm": 6.3573479652404785, "learning_rate": 2.1931592392847006e-05, "loss": 2.110849952697754, "step": 55030 }, { "epoch": 15.62304853817769, "grad_norm": 6.916094779968262, "learning_rate": 2.1917399943230203e-05, "loss": 2.193385887145996, "step": 55040 }, { "epoch": 15.625887028101051, "grad_norm": 6.39791202545166, "learning_rate": 2.1903207493613396e-05, "loss": 2.078820991516113, "step": 55050 }, { "epoch": 15.62872551802441, "grad_norm": 6.211337089538574, "learning_rate": 2.1889015043996593e-05, "loss": 2.200706672668457, "step": 55060 }, { "epoch": 15.631564007947771, "grad_norm": 6.54977560043335, "learning_rate": 2.187482259437979e-05, "loss": 2.03580322265625, "step": 55070 }, { "epoch": 15.634402497871132, "grad_norm": 6.561676979064941, "learning_rate": 2.1860630144762987e-05, "loss": 2.214691162109375, "step": 55080 }, { "epoch": 15.637240987794494, "grad_norm": 7.024967193603516, "learning_rate": 2.1846437695146183e-05, "loss": 2.144103240966797, "step": 55090 }, { "epoch": 15.640079477717855, "grad_norm": 6.058532238006592, "learning_rate": 2.183224524552938e-05, "loss": 2.126852798461914, "step": 55100 }, { "epoch": 15.642917967641214, "grad_norm": 6.041906356811523, "learning_rate": 2.1818052795912573e-05, "loss": 2.122625732421875, "step": 55110 }, { "epoch": 15.645756457564575, "grad_norm": 6.5440826416015625, "learning_rate": 2.180386034629577e-05, "loss": 2.26905517578125, "step": 55120 }, { "epoch": 15.648594947487936, "grad_norm": 6.828850269317627, "learning_rate": 2.1789667896678967e-05, "loss": 2.1589111328125, "step": 55130 }, 
{ "epoch": 15.651433437411297, "grad_norm": 6.480410575866699, "learning_rate": 2.1775475447062163e-05, "loss": 2.117763137817383, "step": 55140 }, { "epoch": 15.654271927334658, "grad_norm": 6.624996185302734, "learning_rate": 2.176128299744536e-05, "loss": 2.1165321350097654, "step": 55150 }, { "epoch": 15.65711041725802, "grad_norm": 6.682621002197266, "learning_rate": 2.1747090547828557e-05, "loss": 2.1419580459594725, "step": 55160 }, { "epoch": 15.659948907181379, "grad_norm": 6.636375427246094, "learning_rate": 2.173289809821175e-05, "loss": 2.1373401641845704, "step": 55170 }, { "epoch": 15.66278739710474, "grad_norm": 6.349449157714844, "learning_rate": 2.1718705648594947e-05, "loss": 2.1069923400878907, "step": 55180 }, { "epoch": 15.665625887028101, "grad_norm": 6.302611351013184, "learning_rate": 2.1704513198978144e-05, "loss": 2.1485193252563475, "step": 55190 }, { "epoch": 15.668464376951462, "grad_norm": 6.506890773773193, "learning_rate": 2.169032074936134e-05, "loss": 2.158328628540039, "step": 55200 }, { "epoch": 15.671302866874823, "grad_norm": 6.498637676239014, "learning_rate": 2.1676128299744537e-05, "loss": 2.130145454406738, "step": 55210 }, { "epoch": 15.674141356798183, "grad_norm": 6.544780731201172, "learning_rate": 2.166193585012773e-05, "loss": 2.096833419799805, "step": 55220 }, { "epoch": 15.676979846721544, "grad_norm": 6.225309371948242, "learning_rate": 2.1647743400510927e-05, "loss": 2.1532852172851564, "step": 55230 }, { "epoch": 15.679818336644905, "grad_norm": 6.365910530090332, "learning_rate": 2.1633550950894124e-05, "loss": 2.1517410278320312, "step": 55240 }, { "epoch": 15.682656826568266, "grad_norm": 6.25162935256958, "learning_rate": 2.161935850127732e-05, "loss": 2.1165611267089846, "step": 55250 }, { "epoch": 15.685495316491627, "grad_norm": 6.554846286773682, "learning_rate": 2.1605166051660517e-05, "loss": 2.1438552856445314, "step": 55260 }, { "epoch": 15.688333806414986, "grad_norm": 6.308656215667725, 
"learning_rate": 2.1590973602043714e-05, "loss": 2.2300033569335938, "step": 55270 }, { "epoch": 15.691172296338348, "grad_norm": 6.671712398529053, "learning_rate": 2.1576781152426907e-05, "loss": 2.168380355834961, "step": 55280 }, { "epoch": 15.694010786261709, "grad_norm": 6.655372619628906, "learning_rate": 2.1562588702810104e-05, "loss": 2.1330469131469725, "step": 55290 }, { "epoch": 15.69684927618507, "grad_norm": 6.439777851104736, "learning_rate": 2.15483962531933e-05, "loss": 2.2159374237060545, "step": 55300 }, { "epoch": 15.69968776610843, "grad_norm": 6.288915634155273, "learning_rate": 2.1534203803576497e-05, "loss": 2.1478351593017577, "step": 55310 }, { "epoch": 15.702526256031792, "grad_norm": 6.47551155090332, "learning_rate": 2.1520011353959694e-05, "loss": 2.1707180023193358, "step": 55320 }, { "epoch": 15.705364745955151, "grad_norm": 6.467959403991699, "learning_rate": 2.150581890434289e-05, "loss": 2.072372245788574, "step": 55330 }, { "epoch": 15.708203235878512, "grad_norm": 6.570208549499512, "learning_rate": 2.1491626454726084e-05, "loss": 2.1319026947021484, "step": 55340 }, { "epoch": 15.711041725801874, "grad_norm": 6.148925304412842, "learning_rate": 2.1477434005109284e-05, "loss": 2.116451644897461, "step": 55350 }, { "epoch": 15.713880215725235, "grad_norm": 6.300544261932373, "learning_rate": 2.146324155549248e-05, "loss": 2.1446483612060545, "step": 55360 }, { "epoch": 15.716718705648596, "grad_norm": 6.25795316696167, "learning_rate": 2.1449049105875678e-05, "loss": 2.140696334838867, "step": 55370 }, { "epoch": 15.719557195571955, "grad_norm": 6.203399658203125, "learning_rate": 2.1434856656258874e-05, "loss": 2.189131736755371, "step": 55380 }, { "epoch": 15.722395685495316, "grad_norm": 6.961124420166016, "learning_rate": 2.1420664206642068e-05, "loss": 2.1048988342285155, "step": 55390 }, { "epoch": 15.725234175418677, "grad_norm": 6.340852737426758, "learning_rate": 2.1406471757025264e-05, "loss": 2.082383155822754, "step": 
55400 }, { "epoch": 15.728072665342038, "grad_norm": 6.643930912017822, "learning_rate": 2.139227930740846e-05, "loss": 2.112348175048828, "step": 55410 }, { "epoch": 15.7309111552654, "grad_norm": 6.631180763244629, "learning_rate": 2.1378086857791658e-05, "loss": 2.1975482940673827, "step": 55420 }, { "epoch": 15.733749645188759, "grad_norm": 6.625923156738281, "learning_rate": 2.1363894408174854e-05, "loss": 2.1325687408447265, "step": 55430 }, { "epoch": 15.73658813511212, "grad_norm": 6.219590663909912, "learning_rate": 2.1349701958558048e-05, "loss": 2.1586170196533203, "step": 55440 }, { "epoch": 15.739426625035481, "grad_norm": 6.564734935760498, "learning_rate": 2.1335509508941244e-05, "loss": 2.0748498916625975, "step": 55450 }, { "epoch": 15.742265114958842, "grad_norm": 6.390951633453369, "learning_rate": 2.132131705932444e-05, "loss": 2.200987434387207, "step": 55460 }, { "epoch": 15.745103604882203, "grad_norm": 6.442089557647705, "learning_rate": 2.1307124609707638e-05, "loss": 2.147441864013672, "step": 55470 }, { "epoch": 15.747942094805563, "grad_norm": 6.311034679412842, "learning_rate": 2.1292932160090835e-05, "loss": 2.0843341827392576, "step": 55480 }, { "epoch": 15.750780584728924, "grad_norm": 6.104394435882568, "learning_rate": 2.127873971047403e-05, "loss": 2.1529178619384766, "step": 55490 }, { "epoch": 15.753619074652285, "grad_norm": 6.307803153991699, "learning_rate": 2.1264547260857225e-05, "loss": 2.087451362609863, "step": 55500 }, { "epoch": 15.753619074652285, "eval_accuracy": 0.34806383925732814, "eval_loss": 2.4627952575683594, "eval_runtime": 50.2273, "eval_samples_per_second": 313.117, "eval_steps_per_second": 4.898, "step": 55500 }, { "epoch": 15.756457564575646, "grad_norm": 6.358206272125244, "learning_rate": 2.125035481124042e-05, "loss": 2.0641357421875, "step": 55510 }, { "epoch": 15.759296054499007, "grad_norm": 6.636035442352295, "learning_rate": 2.1236162361623618e-05, "loss": 2.233322525024414, "step": 55520 }, { 
"epoch": 15.762134544422366, "grad_norm": 6.607859134674072, "learning_rate": 2.1221969912006815e-05, "loss": 2.1634321212768555, "step": 55530 }, { "epoch": 15.764973034345727, "grad_norm": 6.227398872375488, "learning_rate": 2.120777746239001e-05, "loss": 2.067828369140625, "step": 55540 }, { "epoch": 15.767811524269089, "grad_norm": 6.3411335945129395, "learning_rate": 2.1193585012773208e-05, "loss": 2.1784950256347657, "step": 55550 }, { "epoch": 15.77065001419245, "grad_norm": 6.399652004241943, "learning_rate": 2.11793925631564e-05, "loss": 2.0974452972412108, "step": 55560 }, { "epoch": 15.77348850411581, "grad_norm": 6.551273345947266, "learning_rate": 2.1165200113539598e-05, "loss": 2.1199750900268555, "step": 55570 }, { "epoch": 15.776326994039172, "grad_norm": 6.554952621459961, "learning_rate": 2.1151007663922795e-05, "loss": 2.159870147705078, "step": 55580 }, { "epoch": 15.779165483962531, "grad_norm": 6.668982982635498, "learning_rate": 2.113681521430599e-05, "loss": 2.16904354095459, "step": 55590 }, { "epoch": 15.782003973885892, "grad_norm": 6.245400428771973, "learning_rate": 2.112262276468919e-05, "loss": 2.082863426208496, "step": 55600 }, { "epoch": 15.784842463809253, "grad_norm": 6.618069648742676, "learning_rate": 2.110843031507238e-05, "loss": 2.165401268005371, "step": 55610 }, { "epoch": 15.787680953732615, "grad_norm": 6.100390434265137, "learning_rate": 2.109423786545558e-05, "loss": 2.1486244201660156, "step": 55620 }, { "epoch": 15.790519443655976, "grad_norm": 6.339571952819824, "learning_rate": 2.1080045415838775e-05, "loss": 2.0377527236938477, "step": 55630 }, { "epoch": 15.793357933579335, "grad_norm": 6.599186897277832, "learning_rate": 2.1065852966221972e-05, "loss": 2.2265886306762694, "step": 55640 }, { "epoch": 15.796196423502696, "grad_norm": 6.254552364349365, "learning_rate": 2.105166051660517e-05, "loss": 2.213062286376953, "step": 55650 }, { "epoch": 15.799034913426057, "grad_norm": 6.121212959289551, "learning_rate": 
2.1037468066988365e-05, "loss": 2.1187801361083984, "step": 55660 }, { "epoch": 15.801873403349418, "grad_norm": 6.2491374015808105, "learning_rate": 2.102327561737156e-05, "loss": 2.1640670776367186, "step": 55670 }, { "epoch": 15.80471189327278, "grad_norm": 6.643091678619385, "learning_rate": 2.1009083167754755e-05, "loss": 2.167956733703613, "step": 55680 }, { "epoch": 15.80755038319614, "grad_norm": 6.445571422576904, "learning_rate": 2.0994890718137952e-05, "loss": 2.131795310974121, "step": 55690 }, { "epoch": 15.8103888731195, "grad_norm": 6.524064540863037, "learning_rate": 2.098069826852115e-05, "loss": 2.081622505187988, "step": 55700 }, { "epoch": 15.813227363042861, "grad_norm": 6.670311450958252, "learning_rate": 2.0966505818904345e-05, "loss": 2.166672134399414, "step": 55710 }, { "epoch": 15.816065852966222, "grad_norm": 6.249622344970703, "learning_rate": 2.0952313369287542e-05, "loss": 2.1125408172607423, "step": 55720 }, { "epoch": 15.818904342889583, "grad_norm": 6.34185266494751, "learning_rate": 2.0938120919670735e-05, "loss": 2.137819290161133, "step": 55730 }, { "epoch": 15.821742832812944, "grad_norm": 6.632528305053711, "learning_rate": 2.0923928470053932e-05, "loss": 2.1109081268310548, "step": 55740 }, { "epoch": 15.824581322736304, "grad_norm": 6.718249320983887, "learning_rate": 2.090973602043713e-05, "loss": 2.13812255859375, "step": 55750 }, { "epoch": 15.827419812659665, "grad_norm": 6.345604419708252, "learning_rate": 2.0895543570820326e-05, "loss": 2.227246856689453, "step": 55760 }, { "epoch": 15.830258302583026, "grad_norm": 6.439296722412109, "learning_rate": 2.0881351121203522e-05, "loss": 2.1056583404541014, "step": 55770 }, { "epoch": 15.833096792506387, "grad_norm": 6.7050981521606445, "learning_rate": 2.086715867158672e-05, "loss": 2.1892200469970704, "step": 55780 }, { "epoch": 15.835935282429748, "grad_norm": 6.359012603759766, "learning_rate": 2.0852966221969912e-05, "loss": 2.099631500244141, "step": 55790 }, { 
"epoch": 15.838773772353107, "grad_norm": 6.551657199859619, "learning_rate": 2.083877377235311e-05, "loss": 2.1506584167480467, "step": 55800 }, { "epoch": 15.841612262276469, "grad_norm": 6.3321709632873535, "learning_rate": 2.0824581322736306e-05, "loss": 2.113478088378906, "step": 55810 }, { "epoch": 15.84445075219983, "grad_norm": 6.738295555114746, "learning_rate": 2.0810388873119502e-05, "loss": 2.1221994400024413, "step": 55820 }, { "epoch": 15.84728924212319, "grad_norm": 6.778073787689209, "learning_rate": 2.07961964235027e-05, "loss": 2.1685211181640627, "step": 55830 }, { "epoch": 15.850127732046552, "grad_norm": 6.534323692321777, "learning_rate": 2.0782003973885892e-05, "loss": 2.1723539352416994, "step": 55840 }, { "epoch": 15.852966221969911, "grad_norm": 6.642859935760498, "learning_rate": 2.076781152426909e-05, "loss": 2.1325967788696287, "step": 55850 }, { "epoch": 15.855804711893272, "grad_norm": 6.355303764343262, "learning_rate": 2.0753619074652286e-05, "loss": 2.105749320983887, "step": 55860 }, { "epoch": 15.858643201816633, "grad_norm": 6.741189002990723, "learning_rate": 2.0739426625035483e-05, "loss": 2.16808967590332, "step": 55870 }, { "epoch": 15.861481691739995, "grad_norm": 6.595548152923584, "learning_rate": 2.072523417541868e-05, "loss": 2.1404563903808596, "step": 55880 }, { "epoch": 15.864320181663356, "grad_norm": 6.112933158874512, "learning_rate": 2.0711041725801876e-05, "loss": 2.130727767944336, "step": 55890 }, { "epoch": 15.867158671586715, "grad_norm": 6.522345066070557, "learning_rate": 2.069684927618507e-05, "loss": 2.166324996948242, "step": 55900 }, { "epoch": 15.869997161510076, "grad_norm": 6.5565643310546875, "learning_rate": 2.0682656826568266e-05, "loss": 2.163987731933594, "step": 55910 }, { "epoch": 15.872835651433437, "grad_norm": 6.277466297149658, "learning_rate": 2.0668464376951463e-05, "loss": 2.1121139526367188, "step": 55920 }, { "epoch": 15.875674141356798, "grad_norm": 6.217223167419434, 
"learning_rate": 2.065427192733466e-05, "loss": 2.1693702697753907, "step": 55930 }, { "epoch": 15.87851263128016, "grad_norm": 6.230303764343262, "learning_rate": 2.0640079477717856e-05, "loss": 2.1166757583618163, "step": 55940 }, { "epoch": 15.88135112120352, "grad_norm": 6.665637493133545, "learning_rate": 2.0625887028101053e-05, "loss": 2.149790954589844, "step": 55950 }, { "epoch": 15.88418961112688, "grad_norm": 6.965818881988525, "learning_rate": 2.0611694578484246e-05, "loss": 2.208869743347168, "step": 55960 }, { "epoch": 15.887028101050241, "grad_norm": 6.512807846069336, "learning_rate": 2.0597502128867443e-05, "loss": 2.1416711807250977, "step": 55970 }, { "epoch": 15.889866590973602, "grad_norm": 6.502883434295654, "learning_rate": 2.058330967925064e-05, "loss": 2.1184106826782227, "step": 55980 }, { "epoch": 15.892705080896963, "grad_norm": 6.344593524932861, "learning_rate": 2.0569117229633836e-05, "loss": 2.0711669921875, "step": 55990 }, { "epoch": 15.895543570820324, "grad_norm": 6.530801296234131, "learning_rate": 2.0554924780017033e-05, "loss": 2.1880640029907226, "step": 56000 }, { "epoch": 15.895543570820324, "eval_accuracy": 0.3527691231639855, "eval_loss": 2.4630377292633057, "eval_runtime": 51.4046, "eval_samples_per_second": 305.945, "eval_steps_per_second": 4.786, "step": 56000 }, { "epoch": 15.898382060743684, "grad_norm": 6.574306011199951, "learning_rate": 2.054073233040023e-05, "loss": 2.130392646789551, "step": 56010 }, { "epoch": 15.901220550667045, "grad_norm": 6.515095233917236, "learning_rate": 2.0526539880783423e-05, "loss": 2.077656364440918, "step": 56020 }, { "epoch": 15.904059040590406, "grad_norm": 6.308082580566406, "learning_rate": 2.051234743116662e-05, "loss": 2.114459991455078, "step": 56030 }, { "epoch": 15.906897530513767, "grad_norm": 6.220919609069824, "learning_rate": 2.0498154981549816e-05, "loss": 2.165493392944336, "step": 56040 }, { "epoch": 15.909736020437128, "grad_norm": 6.016589641571045, "learning_rate": 
2.0483962531933013e-05, "loss": 2.118942451477051, "step": 56050 }, { "epoch": 15.912574510360487, "grad_norm": 6.698748588562012, "learning_rate": 2.046977008231621e-05, "loss": 2.1862680435180666, "step": 56060 }, { "epoch": 15.915413000283849, "grad_norm": 6.550107479095459, "learning_rate": 2.0455577632699403e-05, "loss": 2.1546600341796873, "step": 56070 }, { "epoch": 15.91825149020721, "grad_norm": 6.40856409072876, "learning_rate": 2.04413851830826e-05, "loss": 2.1057729721069336, "step": 56080 }, { "epoch": 15.92108998013057, "grad_norm": 6.047467231750488, "learning_rate": 2.0427192733465797e-05, "loss": 2.103183364868164, "step": 56090 }, { "epoch": 15.923928470053932, "grad_norm": 6.102746963500977, "learning_rate": 2.0413000283848993e-05, "loss": 2.1055389404296876, "step": 56100 }, { "epoch": 15.926766959977293, "grad_norm": 5.921529293060303, "learning_rate": 2.039880783423219e-05, "loss": 2.1324792861938477, "step": 56110 }, { "epoch": 15.929605449900652, "grad_norm": 6.230980396270752, "learning_rate": 2.0384615384615387e-05, "loss": 2.1263885498046875, "step": 56120 }, { "epoch": 15.932443939824013, "grad_norm": 6.597650051116943, "learning_rate": 2.037042293499858e-05, "loss": 2.128056526184082, "step": 56130 }, { "epoch": 15.935282429747375, "grad_norm": 5.9945387840271, "learning_rate": 2.0356230485381777e-05, "loss": 2.059731674194336, "step": 56140 }, { "epoch": 15.938120919670736, "grad_norm": 6.621414661407471, "learning_rate": 2.0342038035764973e-05, "loss": 2.1469268798828125, "step": 56150 }, { "epoch": 15.940959409594097, "grad_norm": 6.5647196769714355, "learning_rate": 2.032784558614817e-05, "loss": 2.146204948425293, "step": 56160 }, { "epoch": 15.943797899517456, "grad_norm": 6.3357672691345215, "learning_rate": 2.0313653136531367e-05, "loss": 2.1902744293212892, "step": 56170 }, { "epoch": 15.946636389440817, "grad_norm": 6.5044636726379395, "learning_rate": 2.0299460686914564e-05, "loss": 2.1783273696899412, "step": 56180 }, { 
"epoch": 15.949474879364178, "grad_norm": 6.456685543060303, "learning_rate": 2.0285268237297757e-05, "loss": 2.0287567138671876, "step": 56190 }, { "epoch": 15.95231336928754, "grad_norm": 6.66785192489624, "learning_rate": 2.0271075787680954e-05, "loss": 2.1432830810546877, "step": 56200 }, { "epoch": 15.9551518592109, "grad_norm": 6.384958744049072, "learning_rate": 2.025688333806415e-05, "loss": 2.0741952896118163, "step": 56210 }, { "epoch": 15.95799034913426, "grad_norm": 6.427942752838135, "learning_rate": 2.0242690888447347e-05, "loss": 2.1580623626708983, "step": 56220 }, { "epoch": 15.960828839057621, "grad_norm": 6.369714736938477, "learning_rate": 2.0228498438830544e-05, "loss": 2.1306509017944335, "step": 56230 }, { "epoch": 15.963667328980982, "grad_norm": 6.733680725097656, "learning_rate": 2.0214305989213737e-05, "loss": 2.1938926696777346, "step": 56240 }, { "epoch": 15.966505818904343, "grad_norm": 6.414203643798828, "learning_rate": 2.0200113539596934e-05, "loss": 2.140518379211426, "step": 56250 }, { "epoch": 15.969344308827704, "grad_norm": 6.623561859130859, "learning_rate": 2.018592108998013e-05, "loss": 2.163002777099609, "step": 56260 }, { "epoch": 15.972182798751064, "grad_norm": 6.011220455169678, "learning_rate": 2.0171728640363327e-05, "loss": 2.1063888549804686, "step": 56270 }, { "epoch": 15.975021288674425, "grad_norm": 6.387247085571289, "learning_rate": 2.0157536190746524e-05, "loss": 2.1294748306274416, "step": 56280 }, { "epoch": 15.977859778597786, "grad_norm": 6.485818862915039, "learning_rate": 2.014334374112972e-05, "loss": 2.14907169342041, "step": 56290 }, { "epoch": 15.980698268521147, "grad_norm": 6.237002372741699, "learning_rate": 2.0129151291512914e-05, "loss": 2.114191246032715, "step": 56300 }, { "epoch": 15.983536758444508, "grad_norm": 6.825778961181641, "learning_rate": 2.011495884189611e-05, "loss": 2.1926815032958986, "step": 56310 }, { "epoch": 15.986375248367867, "grad_norm": 6.577533721923828, 
"learning_rate": 2.0100766392279307e-05, "loss": 2.1264230728149416, "step": 56320 }, { "epoch": 15.989213738291228, "grad_norm": 6.290762901306152, "learning_rate": 2.0086573942662504e-05, "loss": 2.122489166259766, "step": 56330 }, { "epoch": 15.99205222821459, "grad_norm": 6.794461250305176, "learning_rate": 2.00723814930457e-05, "loss": 2.103113555908203, "step": 56340 }, { "epoch": 15.99489071813795, "grad_norm": 5.979503154754639, "learning_rate": 2.0058189043428897e-05, "loss": 2.145572853088379, "step": 56350 }, { "epoch": 15.997729208061312, "grad_norm": 6.52864408493042, "learning_rate": 2.004399659381209e-05, "loss": 2.022388458251953, "step": 56360 }, { "epoch": 16.00056769798467, "grad_norm": 6.453740119934082, "learning_rate": 2.0029804144195288e-05, "loss": 2.1277578353881834, "step": 56370 }, { "epoch": 16.003406187908034, "grad_norm": 6.480365753173828, "learning_rate": 2.0015611694578484e-05, "loss": 2.153166961669922, "step": 56380 }, { "epoch": 16.006244677831393, "grad_norm": 6.64892578125, "learning_rate": 2.000141924496168e-05, "loss": 2.1316577911376955, "step": 56390 }, { "epoch": 16.009083167754753, "grad_norm": 7.0249409675598145, "learning_rate": 1.9987226795344878e-05, "loss": 2.134756088256836, "step": 56400 }, { "epoch": 16.011921657678116, "grad_norm": 6.3692545890808105, "learning_rate": 1.9973034345728074e-05, "loss": 2.0666168212890623, "step": 56410 }, { "epoch": 16.014760147601475, "grad_norm": 6.479156494140625, "learning_rate": 1.9958841896111268e-05, "loss": 2.172831153869629, "step": 56420 }, { "epoch": 16.017598637524838, "grad_norm": 6.7918381690979, "learning_rate": 1.9944649446494464e-05, "loss": 2.148954963684082, "step": 56430 }, { "epoch": 16.020437127448197, "grad_norm": 6.364224910736084, "learning_rate": 1.993045699687766e-05, "loss": 2.112704849243164, "step": 56440 }, { "epoch": 16.02327561737156, "grad_norm": 6.7614426612854, "learning_rate": 1.9916264547260858e-05, "loss": 2.128007507324219, "step": 56450 }, { 
"epoch": 16.02611410729492, "grad_norm": 6.176269054412842, "learning_rate": 1.9902072097644055e-05, "loss": 2.1200773239135744, "step": 56460 }, { "epoch": 16.02895259721828, "grad_norm": 6.4892258644104, "learning_rate": 1.9887879648027248e-05, "loss": 2.1077106475830076, "step": 56470 }, { "epoch": 16.03179108714164, "grad_norm": 6.011456489562988, "learning_rate": 1.9873687198410445e-05, "loss": 2.1124271392822265, "step": 56480 }, { "epoch": 16.034629577065, "grad_norm": 6.82062292098999, "learning_rate": 1.985949474879364e-05, "loss": 2.172458839416504, "step": 56490 }, { "epoch": 16.037468066988364, "grad_norm": 6.502970218658447, "learning_rate": 1.9845302299176838e-05, "loss": 2.135758972167969, "step": 56500 }, { "epoch": 16.037468066988364, "eval_accuracy": 0.34780949958669805, "eval_loss": 2.45963454246521, "eval_runtime": 52.097, "eval_samples_per_second": 301.879, "eval_steps_per_second": 4.722, "step": 56500 }, { "epoch": 16.040306556911723, "grad_norm": 7.056969165802002, "learning_rate": 1.9831109849560035e-05, "loss": 2.0904315948486327, "step": 56510 }, { "epoch": 16.043145046835082, "grad_norm": 6.312551498413086, "learning_rate": 1.981691739994323e-05, "loss": 2.084178161621094, "step": 56520 }, { "epoch": 16.045983536758445, "grad_norm": 6.489106178283691, "learning_rate": 1.9802724950326425e-05, "loss": 2.19710750579834, "step": 56530 }, { "epoch": 16.048822026681805, "grad_norm": 6.790370941162109, "learning_rate": 1.978853250070962e-05, "loss": 2.120567512512207, "step": 56540 }, { "epoch": 16.051660516605168, "grad_norm": 6.476224899291992, "learning_rate": 1.9774340051092818e-05, "loss": 2.103171157836914, "step": 56550 }, { "epoch": 16.054499006528527, "grad_norm": 6.491943836212158, "learning_rate": 1.9760147601476015e-05, "loss": 2.0961736679077148, "step": 56560 }, { "epoch": 16.057337496451886, "grad_norm": 6.108053684234619, "learning_rate": 1.974595515185921e-05, "loss": 2.140785980224609, "step": 56570 }, { "epoch": 
16.06017598637525, "grad_norm": 6.469905853271484, "learning_rate": 1.9731762702242408e-05, "loss": 2.105434036254883, "step": 56580 }, { "epoch": 16.06301447629861, "grad_norm": 6.598701477050781, "learning_rate": 1.97175702526256e-05, "loss": 2.0928150177001954, "step": 56590 }, { "epoch": 16.06585296622197, "grad_norm": 6.534041404724121, "learning_rate": 1.9703377803008798e-05, "loss": 2.1176563262939454, "step": 56600 }, { "epoch": 16.06869145614533, "grad_norm": 6.203941822052002, "learning_rate": 1.9689185353391995e-05, "loss": 2.062545967102051, "step": 56610 }, { "epoch": 16.07152994606869, "grad_norm": 6.433261871337891, "learning_rate": 1.9674992903775192e-05, "loss": 2.0866775512695312, "step": 56620 }, { "epoch": 16.074368435992053, "grad_norm": 6.9069623947143555, "learning_rate": 1.966080045415839e-05, "loss": 2.178293991088867, "step": 56630 }, { "epoch": 16.077206925915412, "grad_norm": 6.427900314331055, "learning_rate": 1.9646608004541585e-05, "loss": 2.120858001708984, "step": 56640 }, { "epoch": 16.080045415838775, "grad_norm": 6.299102306365967, "learning_rate": 1.963241555492478e-05, "loss": 2.134337043762207, "step": 56650 }, { "epoch": 16.082883905762134, "grad_norm": 6.042558670043945, "learning_rate": 1.961822310530798e-05, "loss": 2.08172607421875, "step": 56660 }, { "epoch": 16.085722395685494, "grad_norm": 6.253006458282471, "learning_rate": 1.9604030655691175e-05, "loss": 2.0817684173583983, "step": 56670 }, { "epoch": 16.088560885608857, "grad_norm": 6.3398756980896, "learning_rate": 1.9589838206074372e-05, "loss": 2.1628055572509766, "step": 56680 }, { "epoch": 16.091399375532216, "grad_norm": 6.010324478149414, "learning_rate": 1.9575645756457565e-05, "loss": 2.0786664962768553, "step": 56690 }, { "epoch": 16.09423786545558, "grad_norm": 6.406030178070068, "learning_rate": 1.9561453306840762e-05, "loss": 2.032108497619629, "step": 56700 }, { "epoch": 16.097076355378938, "grad_norm": 6.346515655517578, "learning_rate": 
1.954726085722396e-05, "loss": 2.0969276428222656, "step": 56710 }, { "epoch": 16.099914845302298, "grad_norm": 6.517237663269043, "learning_rate": 1.9533068407607155e-05, "loss": 2.1330814361572266, "step": 56720 }, { "epoch": 16.10275333522566, "grad_norm": 6.335617542266846, "learning_rate": 1.9518875957990352e-05, "loss": 2.0560359954833984, "step": 56730 }, { "epoch": 16.10559182514902, "grad_norm": 6.685981750488281, "learning_rate": 1.950468350837355e-05, "loss": 2.088942527770996, "step": 56740 }, { "epoch": 16.108430315072383, "grad_norm": 6.18524169921875, "learning_rate": 1.9490491058756742e-05, "loss": 2.0701086044311525, "step": 56750 }, { "epoch": 16.111268804995742, "grad_norm": 6.345234394073486, "learning_rate": 1.947629860913994e-05, "loss": 2.0896060943603514, "step": 56760 }, { "epoch": 16.1141072949191, "grad_norm": 6.6163225173950195, "learning_rate": 1.9462106159523136e-05, "loss": 2.1309131622314452, "step": 56770 }, { "epoch": 16.116945784842464, "grad_norm": 6.124363422393799, "learning_rate": 1.9447913709906332e-05, "loss": 2.092904472351074, "step": 56780 }, { "epoch": 16.119784274765824, "grad_norm": 6.28560733795166, "learning_rate": 1.943372126028953e-05, "loss": 2.1170490264892576, "step": 56790 }, { "epoch": 16.122622764689186, "grad_norm": 6.41277551651001, "learning_rate": 1.9419528810672726e-05, "loss": 2.1479902267456055, "step": 56800 }, { "epoch": 16.125461254612546, "grad_norm": 6.565389156341553, "learning_rate": 1.940533636105592e-05, "loss": 2.175071144104004, "step": 56810 }, { "epoch": 16.12829974453591, "grad_norm": 6.597179412841797, "learning_rate": 1.9391143911439116e-05, "loss": 2.139188766479492, "step": 56820 }, { "epoch": 16.131138234459268, "grad_norm": 6.396845817565918, "learning_rate": 1.9376951461822312e-05, "loss": 2.2000711441040037, "step": 56830 }, { "epoch": 16.133976724382627, "grad_norm": 6.5333638191223145, "learning_rate": 1.936275901220551e-05, "loss": 2.088016319274902, "step": 56840 }, { "epoch": 
16.13681521430599, "grad_norm": 6.589280128479004, "learning_rate": 1.9348566562588706e-05, "loss": 2.0636247634887694, "step": 56850 }, { "epoch": 16.13965370422935, "grad_norm": 6.567751884460449, "learning_rate": 1.93343741129719e-05, "loss": 2.144084930419922, "step": 56860 }, { "epoch": 16.142492194152712, "grad_norm": 6.586024761199951, "learning_rate": 1.9320181663355096e-05, "loss": 2.1200420379638674, "step": 56870 }, { "epoch": 16.14533068407607, "grad_norm": 6.453419208526611, "learning_rate": 1.9305989213738293e-05, "loss": 2.1307771682739256, "step": 56880 }, { "epoch": 16.14816917399943, "grad_norm": 6.4965314865112305, "learning_rate": 1.929179676412149e-05, "loss": 2.1269094467163088, "step": 56890 }, { "epoch": 16.151007663922794, "grad_norm": 6.634128093719482, "learning_rate": 1.9277604314504686e-05, "loss": 2.1221424102783204, "step": 56900 }, { "epoch": 16.153846153846153, "grad_norm": 6.628468990325928, "learning_rate": 1.9263411864887883e-05, "loss": 2.1019618988037108, "step": 56910 }, { "epoch": 16.156684643769516, "grad_norm": 6.101174354553223, "learning_rate": 1.9249219415271076e-05, "loss": 2.107146072387695, "step": 56920 }, { "epoch": 16.159523133692876, "grad_norm": 6.479446887969971, "learning_rate": 1.9235026965654273e-05, "loss": 2.0552772521972655, "step": 56930 }, { "epoch": 16.162361623616235, "grad_norm": 6.237868309020996, "learning_rate": 1.922083451603747e-05, "loss": 2.13381290435791, "step": 56940 }, { "epoch": 16.165200113539598, "grad_norm": 6.360213279724121, "learning_rate": 1.9206642066420666e-05, "loss": 2.1201946258544924, "step": 56950 }, { "epoch": 16.168038603462957, "grad_norm": 6.597311496734619, "learning_rate": 1.9192449616803863e-05, "loss": 2.1618377685546877, "step": 56960 }, { "epoch": 16.17087709338632, "grad_norm": 6.357169151306152, "learning_rate": 1.917825716718706e-05, "loss": 2.120911979675293, "step": 56970 }, { "epoch": 16.17371558330968, "grad_norm": 6.347872734069824, "learning_rate": 
1.9164064717570253e-05, "loss": 2.142375946044922, "step": 56980 }, { "epoch": 16.17655407323304, "grad_norm": 6.332310676574707, "learning_rate": 1.914987226795345e-05, "loss": 2.1696809768676757, "step": 56990 }, { "epoch": 16.1793925631564, "grad_norm": 6.524888038635254, "learning_rate": 1.9135679818336646e-05, "loss": 2.1170799255371096, "step": 57000 }, { "epoch": 16.1793925631564, "eval_accuracy": 0.3499078018693966, "eval_loss": 2.459838628768921, "eval_runtime": 51.3953, "eval_samples_per_second": 306.001, "eval_steps_per_second": 4.786, "step": 57000 }, { "epoch": 16.18223105307976, "grad_norm": 6.5016303062438965, "learning_rate": 1.9121487368719843e-05, "loss": 2.066444206237793, "step": 57010 }, { "epoch": 16.185069543003124, "grad_norm": 6.263265132904053, "learning_rate": 1.910729491910304e-05, "loss": 2.110235023498535, "step": 57020 }, { "epoch": 16.187908032926483, "grad_norm": 6.261903285980225, "learning_rate": 1.9093102469486236e-05, "loss": 2.0782642364501953, "step": 57030 }, { "epoch": 16.190746522849842, "grad_norm": 6.6318817138671875, "learning_rate": 1.907891001986943e-05, "loss": 2.1028261184692383, "step": 57040 }, { "epoch": 16.193585012773205, "grad_norm": 6.418631076812744, "learning_rate": 1.9064717570252626e-05, "loss": 2.172225570678711, "step": 57050 }, { "epoch": 16.196423502696565, "grad_norm": 6.022671699523926, "learning_rate": 1.9050525120635823e-05, "loss": 2.176744270324707, "step": 57060 }, { "epoch": 16.199261992619927, "grad_norm": 6.940549850463867, "learning_rate": 1.903633267101902e-05, "loss": 2.0998376846313476, "step": 57070 }, { "epoch": 16.202100482543287, "grad_norm": 6.270472526550293, "learning_rate": 1.9022140221402217e-05, "loss": 2.085245704650879, "step": 57080 }, { "epoch": 16.204938972466646, "grad_norm": 6.3556365966796875, "learning_rate": 1.900794777178541e-05, "loss": 2.091543960571289, "step": 57090 }, { "epoch": 16.20777746239001, "grad_norm": 6.518282413482666, "learning_rate": 
1.8993755322168607e-05, "loss": 2.0746185302734377, "step": 57100 }, { "epoch": 16.21061595231337, "grad_norm": 6.388627052307129, "learning_rate": 1.8979562872551803e-05, "loss": 2.035645294189453, "step": 57110 }, { "epoch": 16.21345444223673, "grad_norm": 6.19718599319458, "learning_rate": 1.8965370422935e-05, "loss": 2.0994338989257812, "step": 57120 }, { "epoch": 16.21629293216009, "grad_norm": 6.754570007324219, "learning_rate": 1.8951177973318197e-05, "loss": 2.121324157714844, "step": 57130 }, { "epoch": 16.21913142208345, "grad_norm": 6.483152866363525, "learning_rate": 1.8936985523701393e-05, "loss": 2.0364526748657226, "step": 57140 }, { "epoch": 16.221969912006813, "grad_norm": 6.197349548339844, "learning_rate": 1.8922793074084587e-05, "loss": 2.1084577560424806, "step": 57150 }, { "epoch": 16.224808401930172, "grad_norm": 6.367859840393066, "learning_rate": 1.8908600624467784e-05, "loss": 2.1359548568725586, "step": 57160 }, { "epoch": 16.227646891853535, "grad_norm": 6.72208833694458, "learning_rate": 1.889440817485098e-05, "loss": 2.138518142700195, "step": 57170 }, { "epoch": 16.230485381776894, "grad_norm": 6.584402561187744, "learning_rate": 1.8880215725234177e-05, "loss": 2.1931156158447265, "step": 57180 }, { "epoch": 16.233323871700254, "grad_norm": 6.863824367523193, "learning_rate": 1.8866023275617374e-05, "loss": 2.0896369934082033, "step": 57190 }, { "epoch": 16.236162361623617, "grad_norm": 6.7979302406311035, "learning_rate": 1.885183082600057e-05, "loss": 2.1156829833984374, "step": 57200 }, { "epoch": 16.239000851546976, "grad_norm": 6.3479743003845215, "learning_rate": 1.8837638376383764e-05, "loss": 2.094942092895508, "step": 57210 }, { "epoch": 16.24183934147034, "grad_norm": 6.385201930999756, "learning_rate": 1.882344592676696e-05, "loss": 2.1464208602905273, "step": 57220 }, { "epoch": 16.244677831393698, "grad_norm": 6.469370365142822, "learning_rate": 1.8809253477150157e-05, "loss": 2.0865503311157227, "step": 57230 }, { 
"epoch": 16.24751632131706, "grad_norm": 6.096279621124268, "learning_rate": 1.8795061027533354e-05, "loss": 2.129465866088867, "step": 57240 }, { "epoch": 16.25035481124042, "grad_norm": 6.376060485839844, "learning_rate": 1.878086857791655e-05, "loss": 2.1481128692626954, "step": 57250 }, { "epoch": 16.25319330116378, "grad_norm": 6.183290004730225, "learning_rate": 1.8766676128299747e-05, "loss": 2.171940231323242, "step": 57260 }, { "epoch": 16.256031791087143, "grad_norm": 6.359033584594727, "learning_rate": 1.875248367868294e-05, "loss": 2.063446044921875, "step": 57270 }, { "epoch": 16.258870281010502, "grad_norm": 6.709403991699219, "learning_rate": 1.8738291229066137e-05, "loss": 2.139659881591797, "step": 57280 }, { "epoch": 16.261708770933865, "grad_norm": 6.535684108734131, "learning_rate": 1.8724098779449334e-05, "loss": 2.0904314041137697, "step": 57290 }, { "epoch": 16.264547260857224, "grad_norm": 6.169727802276611, "learning_rate": 1.870990632983253e-05, "loss": 2.097991371154785, "step": 57300 }, { "epoch": 16.267385750780583, "grad_norm": 6.07445764541626, "learning_rate": 1.8695713880215727e-05, "loss": 2.1109031677246093, "step": 57310 }, { "epoch": 16.270224240703946, "grad_norm": 6.484192848205566, "learning_rate": 1.868152143059892e-05, "loss": 2.090283966064453, "step": 57320 }, { "epoch": 16.273062730627306, "grad_norm": 6.344349384307861, "learning_rate": 1.8667328980982117e-05, "loss": 2.126021385192871, "step": 57330 }, { "epoch": 16.27590122055067, "grad_norm": 6.761787414550781, "learning_rate": 1.8653136531365314e-05, "loss": 2.101837158203125, "step": 57340 }, { "epoch": 16.278739710474028, "grad_norm": 6.561008930206299, "learning_rate": 1.863894408174851e-05, "loss": 2.0673479080200194, "step": 57350 }, { "epoch": 16.281578200397387, "grad_norm": 6.808931350708008, "learning_rate": 1.8624751632131708e-05, "loss": 2.1988779067993165, "step": 57360 }, { "epoch": 16.28441669032075, "grad_norm": 6.230459213256836, "learning_rate": 
1.8610559182514904e-05, "loss": 2.17215576171875, "step": 57370 }, { "epoch": 16.28725518024411, "grad_norm": 6.627284526824951, "learning_rate": 1.8596366732898098e-05, "loss": 2.128833770751953, "step": 57380 }, { "epoch": 16.290093670167472, "grad_norm": 6.387476921081543, "learning_rate": 1.8582174283281294e-05, "loss": 2.155180740356445, "step": 57390 }, { "epoch": 16.29293216009083, "grad_norm": 6.469947814941406, "learning_rate": 1.856798183366449e-05, "loss": 2.1063348770141603, "step": 57400 }, { "epoch": 16.29577065001419, "grad_norm": 6.546379566192627, "learning_rate": 1.8553789384047688e-05, "loss": 2.112262725830078, "step": 57410 }, { "epoch": 16.298609139937554, "grad_norm": 6.398911952972412, "learning_rate": 1.8539596934430884e-05, "loss": 2.150659942626953, "step": 57420 }, { "epoch": 16.301447629860913, "grad_norm": 6.155949592590332, "learning_rate": 1.852540448481408e-05, "loss": 2.039188766479492, "step": 57430 }, { "epoch": 16.304286119784276, "grad_norm": 6.907550811767578, "learning_rate": 1.8511212035197274e-05, "loss": 2.1468238830566406, "step": 57440 }, { "epoch": 16.307124609707635, "grad_norm": 6.157660007476807, "learning_rate": 1.849701958558047e-05, "loss": 2.1208988189697267, "step": 57450 }, { "epoch": 16.309963099630995, "grad_norm": 6.324675559997559, "learning_rate": 1.8482827135963668e-05, "loss": 2.123966407775879, "step": 57460 }, { "epoch": 16.312801589554358, "grad_norm": 6.737855911254883, "learning_rate": 1.8468634686346865e-05, "loss": 2.1093936920166017, "step": 57470 }, { "epoch": 16.315640079477717, "grad_norm": 6.678573131561279, "learning_rate": 1.845444223673006e-05, "loss": 2.185909080505371, "step": 57480 }, { "epoch": 16.31847856940108, "grad_norm": 6.631179332733154, "learning_rate": 1.8440249787113255e-05, "loss": 2.1075830459594727, "step": 57490 }, { "epoch": 16.32131705932444, "grad_norm": 6.547634124755859, "learning_rate": 1.842605733749645e-05, "loss": 2.0255537033081055, "step": 57500 }, { "epoch": 
16.32131705932444, "eval_accuracy": 0.35474025561136896, "eval_loss": 2.4580881595611572, "eval_runtime": 49.3963, "eval_samples_per_second": 318.384, "eval_steps_per_second": 4.98, "step": 57500 }, { "epoch": 16.3241555492478, "grad_norm": 6.4858503341674805, "learning_rate": 1.8411864887879648e-05, "loss": 2.1025976181030273, "step": 57510 }, { "epoch": 16.32699403917116, "grad_norm": 6.603931903839111, "learning_rate": 1.8397672438262845e-05, "loss": 2.1511867523193358, "step": 57520 }, { "epoch": 16.32983252909452, "grad_norm": 6.713150501251221, "learning_rate": 1.838347998864604e-05, "loss": 2.1042051315307617, "step": 57530 }, { "epoch": 16.332671019017884, "grad_norm": 6.492433071136475, "learning_rate": 1.8369287539029238e-05, "loss": 2.159398078918457, "step": 57540 }, { "epoch": 16.335509508941243, "grad_norm": 6.439465045928955, "learning_rate": 1.835509508941243e-05, "loss": 2.134530258178711, "step": 57550 }, { "epoch": 16.338347998864602, "grad_norm": 6.340213298797607, "learning_rate": 1.834232188475731e-05, "loss": 2.093168258666992, "step": 57560 }, { "epoch": 16.341186488787965, "grad_norm": 6.558653831481934, "learning_rate": 1.8328129435140508e-05, "loss": 2.208477020263672, "step": 57570 }, { "epoch": 16.344024978711325, "grad_norm": 6.305880069732666, "learning_rate": 1.83139369855237e-05, "loss": 2.0612857818603514, "step": 57580 }, { "epoch": 16.346863468634687, "grad_norm": 6.404212474822998, "learning_rate": 1.8299744535906898e-05, "loss": 2.0332786560058596, "step": 57590 }, { "epoch": 16.349701958558047, "grad_norm": 6.244786262512207, "learning_rate": 1.8285552086290094e-05, "loss": 2.0684955596923826, "step": 57600 }, { "epoch": 16.352540448481406, "grad_norm": 6.584094047546387, "learning_rate": 1.827135963667329e-05, "loss": 2.1986255645751953, "step": 57610 }, { "epoch": 16.35537893840477, "grad_norm": 6.688960552215576, "learning_rate": 1.8257167187056488e-05, "loss": 2.1237133026123045, "step": 57620 }, { "epoch": 
16.35821742832813, "grad_norm": 6.783132553100586, "learning_rate": 1.8242974737439684e-05, "loss": 2.1533775329589844, "step": 57630 }, { "epoch": 16.36105591825149, "grad_norm": 6.336055278778076, "learning_rate": 1.8228782287822878e-05, "loss": 2.1428268432617186, "step": 57640 }, { "epoch": 16.36389440817485, "grad_norm": 7.173360347747803, "learning_rate": 1.8214589838206074e-05, "loss": 2.1174604415893556, "step": 57650 }, { "epoch": 16.366732898098213, "grad_norm": 6.552349090576172, "learning_rate": 1.820039738858927e-05, "loss": 2.210141181945801, "step": 57660 }, { "epoch": 16.369571388021573, "grad_norm": 6.36847448348999, "learning_rate": 1.8186204938972468e-05, "loss": 2.1538974761962892, "step": 57670 }, { "epoch": 16.372409877944932, "grad_norm": 6.78908109664917, "learning_rate": 1.8172012489355665e-05, "loss": 2.1300058364868164, "step": 57680 }, { "epoch": 16.375248367868295, "grad_norm": 6.8837080001831055, "learning_rate": 1.815782003973886e-05, "loss": 2.159927177429199, "step": 57690 }, { "epoch": 16.378086857791654, "grad_norm": 6.552935600280762, "learning_rate": 1.8143627590122055e-05, "loss": 2.073088836669922, "step": 57700 }, { "epoch": 16.380925347715017, "grad_norm": 6.344912528991699, "learning_rate": 1.812943514050525e-05, "loss": 2.0571664810180663, "step": 57710 }, { "epoch": 16.383763837638377, "grad_norm": 6.450818061828613, "learning_rate": 1.8115242690888448e-05, "loss": 2.0767932891845704, "step": 57720 }, { "epoch": 16.386602327561736, "grad_norm": 6.791046619415283, "learning_rate": 1.8101050241271645e-05, "loss": 2.0502758026123047, "step": 57730 }, { "epoch": 16.3894408174851, "grad_norm": 6.258696556091309, "learning_rate": 1.808685779165484e-05, "loss": 2.142597198486328, "step": 57740 }, { "epoch": 16.392279307408458, "grad_norm": 6.256496429443359, "learning_rate": 1.8072665342038038e-05, "loss": 2.0491573333740236, "step": 57750 }, { "epoch": 16.39511779733182, "grad_norm": 6.478979110717773, "learning_rate": 
1.805847289242123e-05, "loss": 2.11676025390625, "step": 57760 }, { "epoch": 16.39795628725518, "grad_norm": 6.101078510284424, "learning_rate": 1.8044280442804428e-05, "loss": 2.1428857803344727, "step": 57770 }, { "epoch": 16.40079477717854, "grad_norm": 6.584621429443359, "learning_rate": 1.8030087993187625e-05, "loss": 2.0877138137817384, "step": 57780 }, { "epoch": 16.403633267101903, "grad_norm": 6.480956554412842, "learning_rate": 1.801589554357082e-05, "loss": 2.161904144287109, "step": 57790 }, { "epoch": 16.406471757025262, "grad_norm": 6.347400188446045, "learning_rate": 1.800170309395402e-05, "loss": 2.1080766677856446, "step": 57800 }, { "epoch": 16.409310246948625, "grad_norm": 6.393922328948975, "learning_rate": 1.798751064433721e-05, "loss": 2.0493635177612304, "step": 57810 }, { "epoch": 16.412148736871984, "grad_norm": 6.590887069702148, "learning_rate": 1.797331819472041e-05, "loss": 2.1578439712524413, "step": 57820 }, { "epoch": 16.414987226795343, "grad_norm": 6.551849842071533, "learning_rate": 1.7959125745103605e-05, "loss": 2.1130157470703126, "step": 57830 }, { "epoch": 16.417825716718706, "grad_norm": 6.479953765869141, "learning_rate": 1.7944933295486802e-05, "loss": 2.1311737060546876, "step": 57840 }, { "epoch": 16.420664206642066, "grad_norm": 6.528669834136963, "learning_rate": 1.793074084587e-05, "loss": 2.181615447998047, "step": 57850 }, { "epoch": 16.42350269656543, "grad_norm": 6.788939952850342, "learning_rate": 1.7916548396253195e-05, "loss": 2.2105361938476564, "step": 57860 }, { "epoch": 16.426341186488788, "grad_norm": 6.039098262786865, "learning_rate": 1.790235594663639e-05, "loss": 2.1998586654663086, "step": 57870 }, { "epoch": 16.429179676412147, "grad_norm": 6.222032070159912, "learning_rate": 1.7888163497019585e-05, "loss": 2.1457386016845703, "step": 57880 }, { "epoch": 16.43201816633551, "grad_norm": 6.652284622192383, "learning_rate": 1.7873971047402782e-05, "loss": 2.1798986434936523, "step": 57890 }, { "epoch": 
16.43485665625887, "grad_norm": 6.591973781585693, "learning_rate": 1.785977859778598e-05, "loss": 2.1281408309936523, "step": 57900 }, { "epoch": 16.437695146182232, "grad_norm": 6.111810684204102, "learning_rate": 1.7845586148169175e-05, "loss": 2.153522491455078, "step": 57910 }, { "epoch": 16.44053363610559, "grad_norm": 6.554922103881836, "learning_rate": 1.7831393698552372e-05, "loss": 2.1606317520141602, "step": 57920 }, { "epoch": 16.44337212602895, "grad_norm": 6.342309474945068, "learning_rate": 1.7817201248935565e-05, "loss": 2.0880197525024413, "step": 57930 }, { "epoch": 16.446210615952314, "grad_norm": 6.984057426452637, "learning_rate": 1.7803008799318762e-05, "loss": 2.0946718215942384, "step": 57940 }, { "epoch": 16.449049105875673, "grad_norm": 6.15356969833374, "learning_rate": 1.778881634970196e-05, "loss": 2.110753631591797, "step": 57950 }, { "epoch": 16.451887595799036, "grad_norm": 6.331192970275879, "learning_rate": 1.7774623900085156e-05, "loss": 2.096161460876465, "step": 57960 }, { "epoch": 16.454726085722395, "grad_norm": 6.129480838775635, "learning_rate": 1.7760431450468352e-05, "loss": 2.0702798843383787, "step": 57970 }, { "epoch": 16.457564575645755, "grad_norm": 6.386324882507324, "learning_rate": 1.7746239000851546e-05, "loss": 2.079339027404785, "step": 57980 }, { "epoch": 16.460403065569118, "grad_norm": 6.670182228088379, "learning_rate": 1.7732046551234742e-05, "loss": 2.1111309051513674, "step": 57990 }, { "epoch": 16.463241555492477, "grad_norm": 6.429001808166504, "learning_rate": 1.771785410161794e-05, "loss": 2.1320169448852537, "step": 58000 }, { "epoch": 16.463241555492477, "eval_accuracy": 0.34927195269282124, "eval_loss": 2.45554518699646, "eval_runtime": 53.2068, "eval_samples_per_second": 295.583, "eval_steps_per_second": 4.623, "step": 58000 }, { "epoch": 16.46608004541584, "grad_norm": 6.54331111907959, "learning_rate": 1.7703661652001136e-05, "loss": 2.1272960662841798, "step": 58010 }, { "epoch": 
16.4689185353392, "grad_norm": 6.293883323669434, "learning_rate": 1.7689469202384332e-05, "loss": 2.1780839920043946, "step": 58020 }, { "epoch": 16.471757025262562, "grad_norm": 6.376737117767334, "learning_rate": 1.767527675276753e-05, "loss": 2.174096870422363, "step": 58030 }, { "epoch": 16.47459551518592, "grad_norm": 6.243081092834473, "learning_rate": 1.7661084303150722e-05, "loss": 2.1950544357299804, "step": 58040 }, { "epoch": 16.47743400510928, "grad_norm": 6.314570903778076, "learning_rate": 1.764689185353392e-05, "loss": 2.1292064666748045, "step": 58050 }, { "epoch": 16.480272495032644, "grad_norm": 6.356591701507568, "learning_rate": 1.7632699403917116e-05, "loss": 2.0712873458862306, "step": 58060 }, { "epoch": 16.483110984956003, "grad_norm": 6.367936611175537, "learning_rate": 1.7618506954300313e-05, "loss": 2.169592094421387, "step": 58070 }, { "epoch": 16.485949474879366, "grad_norm": 6.263311862945557, "learning_rate": 1.760431450468351e-05, "loss": 2.2400096893310546, "step": 58080 }, { "epoch": 16.488787964802725, "grad_norm": 6.55941915512085, "learning_rate": 1.7590122055066706e-05, "loss": 2.0130647659301757, "step": 58090 }, { "epoch": 16.491626454726084, "grad_norm": 6.523941993713379, "learning_rate": 1.75759296054499e-05, "loss": 2.1166067123413086, "step": 58100 }, { "epoch": 16.494464944649447, "grad_norm": 5.792716979980469, "learning_rate": 1.7561737155833096e-05, "loss": 2.132173538208008, "step": 58110 }, { "epoch": 16.497303434572807, "grad_norm": 6.263317108154297, "learning_rate": 1.7547544706216293e-05, "loss": 2.0518142700195314, "step": 58120 }, { "epoch": 16.50014192449617, "grad_norm": 6.462588787078857, "learning_rate": 1.753335225659949e-05, "loss": 2.1587066650390625, "step": 58130 }, { "epoch": 16.50298041441953, "grad_norm": 6.70943546295166, "learning_rate": 1.7519159806982686e-05, "loss": 2.1494251251220704, "step": 58140 }, { "epoch": 16.50581890434289, "grad_norm": 6.571167469024658, "learning_rate": 
1.7504967357365883e-05, "loss": 2.0745264053344727, "step": 58150 }, { "epoch": 16.50865739426625, "grad_norm": 6.339486598968506, "learning_rate": 1.7490774907749076e-05, "loss": 2.1090139389038085, "step": 58160 }, { "epoch": 16.51149588418961, "grad_norm": 6.578514575958252, "learning_rate": 1.7476582458132273e-05, "loss": 2.1185335159301757, "step": 58170 }, { "epoch": 16.514334374112973, "grad_norm": 6.8175249099731445, "learning_rate": 1.746239000851547e-05, "loss": 2.203438377380371, "step": 58180 }, { "epoch": 16.517172864036333, "grad_norm": 6.340778827667236, "learning_rate": 1.7448197558898666e-05, "loss": 2.106861877441406, "step": 58190 }, { "epoch": 16.520011353959692, "grad_norm": 6.557157039642334, "learning_rate": 1.7434005109281863e-05, "loss": 2.087404251098633, "step": 58200 }, { "epoch": 16.522849843883055, "grad_norm": 6.379532814025879, "learning_rate": 1.7419812659665056e-05, "loss": 2.0376571655273437, "step": 58210 }, { "epoch": 16.525688333806414, "grad_norm": 6.7920002937316895, "learning_rate": 1.7405620210048253e-05, "loss": 2.146072006225586, "step": 58220 }, { "epoch": 16.528526823729777, "grad_norm": 6.908193588256836, "learning_rate": 1.739142776043145e-05, "loss": 2.1632461547851562, "step": 58230 }, { "epoch": 16.531365313653136, "grad_norm": 7.0148749351501465, "learning_rate": 1.7377235310814646e-05, "loss": 2.127926826477051, "step": 58240 }, { "epoch": 16.534203803576496, "grad_norm": 6.76417875289917, "learning_rate": 1.7363042861197843e-05, "loss": 2.1771398544311524, "step": 58250 }, { "epoch": 16.53704229349986, "grad_norm": 6.526082515716553, "learning_rate": 1.734885041158104e-05, "loss": 2.106030082702637, "step": 58260 }, { "epoch": 16.539880783423218, "grad_norm": 6.408634185791016, "learning_rate": 1.7334657961964233e-05, "loss": 2.0337427139282225, "step": 58270 }, { "epoch": 16.54271927334658, "grad_norm": 6.852200508117676, "learning_rate": 1.732046551234743e-05, "loss": 2.186812973022461, "step": 58280 }, { 
"epoch": 16.54555776326994, "grad_norm": 6.502713680267334, "learning_rate": 1.7306273062730627e-05, "loss": 2.1692955017089846, "step": 58290 }, { "epoch": 16.5483962531933, "grad_norm": 6.285790920257568, "learning_rate": 1.7292080613113823e-05, "loss": 2.0975833892822267, "step": 58300 }, { "epoch": 16.551234743116662, "grad_norm": 6.1851677894592285, "learning_rate": 1.727788816349702e-05, "loss": 2.1328414916992187, "step": 58310 }, { "epoch": 16.55407323304002, "grad_norm": 6.175334453582764, "learning_rate": 1.7263695713880217e-05, "loss": 2.129705047607422, "step": 58320 }, { "epoch": 16.556911722963385, "grad_norm": 6.591728687286377, "learning_rate": 1.724950326426341e-05, "loss": 2.1239118576049805, "step": 58330 }, { "epoch": 16.559750212886744, "grad_norm": 6.327157974243164, "learning_rate": 1.7235310814646607e-05, "loss": 2.0630205154418944, "step": 58340 }, { "epoch": 16.562588702810103, "grad_norm": 6.115103244781494, "learning_rate": 1.7221118365029807e-05, "loss": 2.096940803527832, "step": 58350 }, { "epoch": 16.565427192733466, "grad_norm": 6.288856029510498, "learning_rate": 1.7206925915413004e-05, "loss": 2.10669002532959, "step": 58360 }, { "epoch": 16.568265682656826, "grad_norm": 6.39185094833374, "learning_rate": 1.71927334657962e-05, "loss": 2.1474666595458984, "step": 58370 }, { "epoch": 16.57110417258019, "grad_norm": 6.558883190155029, "learning_rate": 1.7178541016179394e-05, "loss": 2.074970817565918, "step": 58380 }, { "epoch": 16.573942662503548, "grad_norm": 6.251704692840576, "learning_rate": 1.716434856656259e-05, "loss": 2.1288227081298827, "step": 58390 }, { "epoch": 16.57678115242691, "grad_norm": 6.483485221862793, "learning_rate": 1.7150156116945787e-05, "loss": 2.0427188873291016, "step": 58400 }, { "epoch": 16.57961964235027, "grad_norm": 6.952577114105225, "learning_rate": 1.7135963667328984e-05, "loss": 2.153371047973633, "step": 58410 }, { "epoch": 16.58245813227363, "grad_norm": 6.160638809204102, "learning_rate": 
1.712177121771218e-05, "loss": 2.0507104873657225, "step": 58420 }, { "epoch": 16.585296622196992, "grad_norm": 6.4898481369018555, "learning_rate": 1.7107578768095374e-05, "loss": 2.121957206726074, "step": 58430 }, { "epoch": 16.58813511212035, "grad_norm": 6.713178634643555, "learning_rate": 1.709338631847857e-05, "loss": 2.062621307373047, "step": 58440 }, { "epoch": 16.590973602043714, "grad_norm": 6.747702121734619, "learning_rate": 1.7079193868861767e-05, "loss": 2.105851173400879, "step": 58450 }, { "epoch": 16.593812091967074, "grad_norm": 6.191826820373535, "learning_rate": 1.7065001419244964e-05, "loss": 2.176871109008789, "step": 58460 }, { "epoch": 16.596650581890433, "grad_norm": 6.650170803070068, "learning_rate": 1.705080896962816e-05, "loss": 2.118031883239746, "step": 58470 }, { "epoch": 16.599489071813796, "grad_norm": 6.554111957550049, "learning_rate": 1.7036616520011357e-05, "loss": 2.1110599517822264, "step": 58480 }, { "epoch": 16.602327561737155, "grad_norm": 6.467939853668213, "learning_rate": 1.702242407039455e-05, "loss": 2.1361225128173826, "step": 58490 }, { "epoch": 16.605166051660518, "grad_norm": 6.666162490844727, "learning_rate": 1.7008231620777747e-05, "loss": 2.113541603088379, "step": 58500 }, { "epoch": 16.605166051660518, "eval_accuracy": 0.35143383989317734, "eval_loss": 2.4534289836883545, "eval_runtime": 48.5208, "eval_samples_per_second": 324.129, "eval_steps_per_second": 5.07, "step": 58500 }, { "epoch": 16.608004541583878, "grad_norm": 6.7383904457092285, "learning_rate": 1.6994039171160944e-05, "loss": 2.161474418640137, "step": 58510 }, { "epoch": 16.610843031507237, "grad_norm": 6.9398345947265625, "learning_rate": 1.697984672154414e-05, "loss": 2.1085184097290037, "step": 58520 }, { "epoch": 16.6136815214306, "grad_norm": 6.396667003631592, "learning_rate": 1.6965654271927337e-05, "loss": 2.0565052032470703, "step": 58530 }, { "epoch": 16.61652001135396, "grad_norm": 6.419022083282471, "learning_rate": 
1.6951461822310534e-05, "loss": 2.070595932006836, "step": 58540 }, { "epoch": 16.619358501277322, "grad_norm": 5.96672248840332, "learning_rate": 1.6937269372693727e-05, "loss": 2.1307682037353515, "step": 58550 }, { "epoch": 16.62219699120068, "grad_norm": 6.432441234588623, "learning_rate": 1.6923076923076924e-05, "loss": 2.123627853393555, "step": 58560 }, { "epoch": 16.62503548112404, "grad_norm": 6.347167491912842, "learning_rate": 1.690888447346012e-05, "loss": 2.1447469711303713, "step": 58570 }, { "epoch": 16.627873971047404, "grad_norm": 6.661600112915039, "learning_rate": 1.6894692023843318e-05, "loss": 2.015125846862793, "step": 58580 }, { "epoch": 16.630712460970763, "grad_norm": 6.90977144241333, "learning_rate": 1.6880499574226514e-05, "loss": 2.1416751861572267, "step": 58590 }, { "epoch": 16.633550950894126, "grad_norm": 6.346673488616943, "learning_rate": 1.6866307124609708e-05, "loss": 2.1208913803100584, "step": 58600 }, { "epoch": 16.636389440817485, "grad_norm": 6.2410101890563965, "learning_rate": 1.6852114674992904e-05, "loss": 2.1138607025146485, "step": 58610 }, { "epoch": 16.639227930740844, "grad_norm": 6.47459602355957, "learning_rate": 1.68379222253761e-05, "loss": 2.116667938232422, "step": 58620 }, { "epoch": 16.642066420664207, "grad_norm": 6.203646183013916, "learning_rate": 1.6823729775759298e-05, "loss": 2.0591804504394533, "step": 58630 }, { "epoch": 16.644904910587567, "grad_norm": 6.382458209991455, "learning_rate": 1.6809537326142494e-05, "loss": 2.1892786026000977, "step": 58640 }, { "epoch": 16.64774340051093, "grad_norm": 6.313211441040039, "learning_rate": 1.679534487652569e-05, "loss": 2.0920616149902345, "step": 58650 }, { "epoch": 16.65058189043429, "grad_norm": 6.347242832183838, "learning_rate": 1.6781152426908885e-05, "loss": 2.1337181091308595, "step": 58660 }, { "epoch": 16.653420380357648, "grad_norm": 6.245785236358643, "learning_rate": 1.676695997729208e-05, "loss": 2.1870166778564455, "step": 58670 }, { 
"epoch": 16.65625887028101, "grad_norm": 6.128716468811035, "learning_rate": 1.6752767527675278e-05, "loss": 2.0964651107788086, "step": 58680 }, { "epoch": 16.65909736020437, "grad_norm": 6.695804595947266, "learning_rate": 1.6738575078058475e-05, "loss": 2.155950164794922, "step": 58690 }, { "epoch": 16.661935850127733, "grad_norm": 6.409322261810303, "learning_rate": 1.672438262844167e-05, "loss": 2.135628890991211, "step": 58700 }, { "epoch": 16.664774340051093, "grad_norm": 6.578305721282959, "learning_rate": 1.6710190178824868e-05, "loss": 2.1678035736083983, "step": 58710 }, { "epoch": 16.667612829974452, "grad_norm": 6.619303226470947, "learning_rate": 1.669599772920806e-05, "loss": 2.130233573913574, "step": 58720 }, { "epoch": 16.670451319897815, "grad_norm": 6.688461780548096, "learning_rate": 1.6681805279591258e-05, "loss": 2.152533531188965, "step": 58730 }, { "epoch": 16.673289809821174, "grad_norm": 6.425325393676758, "learning_rate": 1.6667612829974455e-05, "loss": 2.0940277099609377, "step": 58740 }, { "epoch": 16.676128299744537, "grad_norm": 6.005398273468018, "learning_rate": 1.665342038035765e-05, "loss": 2.053813171386719, "step": 58750 }, { "epoch": 16.678966789667896, "grad_norm": 6.879661560058594, "learning_rate": 1.6639227930740848e-05, "loss": 2.1807220458984373, "step": 58760 }, { "epoch": 16.68180527959126, "grad_norm": 6.3822550773620605, "learning_rate": 1.6625035481124045e-05, "loss": 2.089982604980469, "step": 58770 }, { "epoch": 16.68464376951462, "grad_norm": 6.596411228179932, "learning_rate": 1.6610843031507238e-05, "loss": 2.082105827331543, "step": 58780 }, { "epoch": 16.687482259437978, "grad_norm": 6.3692097663879395, "learning_rate": 1.6596650581890435e-05, "loss": 2.1279699325561525, "step": 58790 }, { "epoch": 16.69032074936134, "grad_norm": 6.919433116912842, "learning_rate": 1.658245813227363e-05, "loss": 2.1285636901855467, "step": 58800 }, { "epoch": 16.6931592392847, "grad_norm": 6.490898609161377, "learning_rate": 
1.656826568265683e-05, "loss": 2.1649784088134765, "step": 58810 }, { "epoch": 16.695997729208063, "grad_norm": 6.492420673370361, "learning_rate": 1.6554073233040025e-05, "loss": 2.1612384796142576, "step": 58820 }, { "epoch": 16.698836219131422, "grad_norm": 6.647693634033203, "learning_rate": 1.653988078342322e-05, "loss": 2.170660972595215, "step": 58830 }, { "epoch": 16.70167470905478, "grad_norm": 6.751434326171875, "learning_rate": 1.6525688333806415e-05, "loss": 2.131575012207031, "step": 58840 }, { "epoch": 16.704513198978145, "grad_norm": 6.263935089111328, "learning_rate": 1.6511495884189612e-05, "loss": 2.027792739868164, "step": 58850 }, { "epoch": 16.707351688901504, "grad_norm": 6.86271858215332, "learning_rate": 1.649730343457281e-05, "loss": 2.125057029724121, "step": 58860 }, { "epoch": 16.710190178824867, "grad_norm": 6.464856147766113, "learning_rate": 1.6483110984956005e-05, "loss": 2.120099639892578, "step": 58870 }, { "epoch": 16.713028668748226, "grad_norm": 6.479288578033447, "learning_rate": 1.6468918535339202e-05, "loss": 2.1780014038085938, "step": 58880 }, { "epoch": 16.715867158671585, "grad_norm": 7.034056663513184, "learning_rate": 1.6454726085722395e-05, "loss": 2.1876495361328123, "step": 58890 }, { "epoch": 16.71870564859495, "grad_norm": 6.064935207366943, "learning_rate": 1.6440533636105592e-05, "loss": 2.131290626525879, "step": 58900 }, { "epoch": 16.721544138518308, "grad_norm": 6.29191780090332, "learning_rate": 1.642634118648879e-05, "loss": 2.035770034790039, "step": 58910 }, { "epoch": 16.72438262844167, "grad_norm": 6.862910747528076, "learning_rate": 1.6412148736871985e-05, "loss": 2.1142101287841797, "step": 58920 }, { "epoch": 16.72722111836503, "grad_norm": 6.446345806121826, "learning_rate": 1.6397956287255182e-05, "loss": 2.0205619812011717, "step": 58930 }, { "epoch": 16.73005960828839, "grad_norm": 6.354816913604736, "learning_rate": 1.638376383763838e-05, "loss": 2.0490854263305662, "step": 58940 }, { "epoch": 
16.732898098211752, "grad_norm": 6.213539123535156, "learning_rate": 1.6369571388021572e-05, "loss": 2.0770803451538087, "step": 58950 }, { "epoch": 16.73573658813511, "grad_norm": 6.273269176483154, "learning_rate": 1.635537893840477e-05, "loss": 2.2079803466796877, "step": 58960 }, { "epoch": 16.738575078058474, "grad_norm": 6.392449378967285, "learning_rate": 1.6341186488787966e-05, "loss": 2.1070207595825194, "step": 58970 }, { "epoch": 16.741413567981834, "grad_norm": 6.425469875335693, "learning_rate": 1.6326994039171162e-05, "loss": 2.0972137451171875, "step": 58980 }, { "epoch": 16.744252057905193, "grad_norm": 6.508802890777588, "learning_rate": 1.631280158955436e-05, "loss": 2.0492103576660154, "step": 58990 }, { "epoch": 16.747090547828556, "grad_norm": 6.155045032501221, "learning_rate": 1.6298609139937556e-05, "loss": 2.0463157653808595, "step": 59000 }, { "epoch": 16.747090547828556, "eval_accuracy": 0.35372289692884845, "eval_loss": 2.452171802520752, "eval_runtime": 49.4015, "eval_samples_per_second": 318.351, "eval_steps_per_second": 4.98, "step": 59000 }, { "epoch": 16.749929037751915, "grad_norm": 6.017492771148682, "learning_rate": 1.628441669032075e-05, "loss": 2.1532835006713866, "step": 59010 }, { "epoch": 16.752767527675278, "grad_norm": 7.26192569732666, "learning_rate": 1.6270224240703946e-05, "loss": 2.112754058837891, "step": 59020 }, { "epoch": 16.755606017598637, "grad_norm": 6.600637435913086, "learning_rate": 1.6256031791087142e-05, "loss": 2.1786828994750977, "step": 59030 }, { "epoch": 16.758444507521997, "grad_norm": 6.43130350112915, "learning_rate": 1.624183934147034e-05, "loss": 2.088344192504883, "step": 59040 }, { "epoch": 16.76128299744536, "grad_norm": 6.242670059204102, "learning_rate": 1.6227646891853536e-05, "loss": 2.134825897216797, "step": 59050 }, { "epoch": 16.76412148736872, "grad_norm": 6.356899261474609, "learning_rate": 1.621345444223673e-05, "loss": 2.1233600616455077, "step": 59060 }, { "epoch": 
16.766959977292082, "grad_norm": 6.462685585021973, "learning_rate": 1.6199261992619926e-05, "loss": 2.1080835342407225, "step": 59070 }, { "epoch": 16.76979846721544, "grad_norm": 6.802971839904785, "learning_rate": 1.6185069543003123e-05, "loss": 2.149094009399414, "step": 59080 }, { "epoch": 16.7726369571388, "grad_norm": 6.4752068519592285, "learning_rate": 1.617087709338632e-05, "loss": 2.172618865966797, "step": 59090 }, { "epoch": 16.775475447062163, "grad_norm": 6.4925761222839355, "learning_rate": 1.6156684643769516e-05, "loss": 2.1049848556518556, "step": 59100 }, { "epoch": 16.778313936985523, "grad_norm": 6.262664318084717, "learning_rate": 1.6142492194152713e-05, "loss": 2.166682243347168, "step": 59110 }, { "epoch": 16.781152426908886, "grad_norm": 6.468127727508545, "learning_rate": 1.6128299744535906e-05, "loss": 2.157582664489746, "step": 59120 }, { "epoch": 16.783990916832245, "grad_norm": 6.5440263748168945, "learning_rate": 1.6114107294919103e-05, "loss": 2.2057287216186525, "step": 59130 }, { "epoch": 16.786829406755604, "grad_norm": 6.474178791046143, "learning_rate": 1.60999148453023e-05, "loss": 2.244017791748047, "step": 59140 }, { "epoch": 16.789667896678967, "grad_norm": 7.022958755493164, "learning_rate": 1.6085722395685496e-05, "loss": 2.0799510955810545, "step": 59150 }, { "epoch": 16.792506386602327, "grad_norm": 6.537268161773682, "learning_rate": 1.6071529946068693e-05, "loss": 2.07237548828125, "step": 59160 }, { "epoch": 16.79534487652569, "grad_norm": 6.5635199546813965, "learning_rate": 1.605733749645189e-05, "loss": 2.1706302642822264, "step": 59170 }, { "epoch": 16.79818336644905, "grad_norm": 6.164402008056641, "learning_rate": 1.6043145046835083e-05, "loss": 2.1240327835083006, "step": 59180 }, { "epoch": 16.801021856372408, "grad_norm": 6.7295145988464355, "learning_rate": 1.602895259721828e-05, "loss": 2.156471061706543, "step": 59190 }, { "epoch": 16.80386034629577, "grad_norm": 6.258521556854248, "learning_rate": 
1.6014760147601476e-05, "loss": 2.089732360839844, "step": 59200 }, { "epoch": 16.80669883621913, "grad_norm": 6.791895866394043, "learning_rate": 1.6000567697984673e-05, "loss": 2.223611831665039, "step": 59210 }, { "epoch": 16.809537326142493, "grad_norm": 6.033153057098389, "learning_rate": 1.598637524836787e-05, "loss": 2.1122867584228517, "step": 59220 }, { "epoch": 16.812375816065853, "grad_norm": 6.366550922393799, "learning_rate": 1.5972182798751063e-05, "loss": 2.054530715942383, "step": 59230 }, { "epoch": 16.815214305989215, "grad_norm": 6.569028854370117, "learning_rate": 1.595799034913426e-05, "loss": 2.0619218826293944, "step": 59240 }, { "epoch": 16.818052795912575, "grad_norm": 6.549862384796143, "learning_rate": 1.5943797899517456e-05, "loss": 2.1998979568481447, "step": 59250 }, { "epoch": 16.820891285835934, "grad_norm": 6.331091403961182, "learning_rate": 1.5929605449900653e-05, "loss": 2.169635009765625, "step": 59260 }, { "epoch": 16.823729775759297, "grad_norm": 6.26522970199585, "learning_rate": 1.591541300028385e-05, "loss": 2.0860538482666016, "step": 59270 }, { "epoch": 16.826568265682656, "grad_norm": 6.403234958648682, "learning_rate": 1.5901220550667047e-05, "loss": 2.086394691467285, "step": 59280 }, { "epoch": 16.82940675560602, "grad_norm": 6.116052627563477, "learning_rate": 1.588702810105024e-05, "loss": 2.145246124267578, "step": 59290 }, { "epoch": 16.83224524552938, "grad_norm": 6.649975299835205, "learning_rate": 1.5872835651433437e-05, "loss": 2.059648132324219, "step": 59300 }, { "epoch": 16.835083735452738, "grad_norm": 6.267669200897217, "learning_rate": 1.5858643201816633e-05, "loss": 2.075215721130371, "step": 59310 }, { "epoch": 16.8379222253761, "grad_norm": 6.887203216552734, "learning_rate": 1.584445075219983e-05, "loss": 2.1365217208862304, "step": 59320 }, { "epoch": 16.84076071529946, "grad_norm": 6.362013816833496, "learning_rate": 1.5830258302583027e-05, "loss": 2.111671257019043, "step": 59330 }, { "epoch": 
16.843599205222823, "grad_norm": 6.535689830780029, "learning_rate": 1.5816065852966223e-05, "loss": 2.1880107879638673, "step": 59340 }, { "epoch": 16.846437695146182, "grad_norm": 6.373207092285156, "learning_rate": 1.5801873403349417e-05, "loss": 2.127791976928711, "step": 59350 }, { "epoch": 16.84927618506954, "grad_norm": 6.624334812164307, "learning_rate": 1.5787680953732614e-05, "loss": 2.1519405364990236, "step": 59360 }, { "epoch": 16.852114674992904, "grad_norm": 6.4244065284729, "learning_rate": 1.577348850411581e-05, "loss": 2.179579162597656, "step": 59370 }, { "epoch": 16.854953164916264, "grad_norm": 6.336109638214111, "learning_rate": 1.5759296054499007e-05, "loss": 2.0840755462646485, "step": 59380 }, { "epoch": 16.857791654839627, "grad_norm": 6.61503267288208, "learning_rate": 1.5745103604882204e-05, "loss": 2.129965400695801, "step": 59390 }, { "epoch": 16.860630144762986, "grad_norm": 6.492679119110107, "learning_rate": 1.57309111552654e-05, "loss": 2.0813228607177736, "step": 59400 }, { "epoch": 16.863468634686345, "grad_norm": 6.678558826446533, "learning_rate": 1.5716718705648594e-05, "loss": 2.1089170455932615, "step": 59410 }, { "epoch": 16.86630712460971, "grad_norm": 6.630539894104004, "learning_rate": 1.570252625603179e-05, "loss": 2.1963470458984373, "step": 59420 }, { "epoch": 16.869145614533068, "grad_norm": 6.424307823181152, "learning_rate": 1.5688333806414987e-05, "loss": 2.1044952392578127, "step": 59430 }, { "epoch": 16.87198410445643, "grad_norm": 6.475567817687988, "learning_rate": 1.5674141356798184e-05, "loss": 2.1052051544189454, "step": 59440 }, { "epoch": 16.87482259437979, "grad_norm": 6.62249755859375, "learning_rate": 1.565994890718138e-05, "loss": 2.1784873962402345, "step": 59450 }, { "epoch": 16.87766108430315, "grad_norm": 6.539790153503418, "learning_rate": 1.5645756457564574e-05, "loss": 2.1511693954467774, "step": 59460 }, { "epoch": 16.880499574226512, "grad_norm": 6.099449157714844, "learning_rate": 
1.563156400794777e-05, "loss": 2.1640987396240234, "step": 59470 }, { "epoch": 16.88333806414987, "grad_norm": 6.5003981590271, "learning_rate": 1.5617371558330967e-05, "loss": 2.154550552368164, "step": 59480 }, { "epoch": 16.886176554073234, "grad_norm": 6.4379730224609375, "learning_rate": 1.5603179108714164e-05, "loss": 2.188999366760254, "step": 59490 }, { "epoch": 16.889015043996594, "grad_norm": 6.685571670532227, "learning_rate": 1.558898665909736e-05, "loss": 2.0628826141357424, "step": 59500 }, { "epoch": 16.889015043996594, "eval_accuracy": 0.3551853500349717, "eval_loss": 2.4484872817993164, "eval_runtime": 51.4188, "eval_samples_per_second": 305.861, "eval_steps_per_second": 4.784, "step": 59500 }, { "epoch": 16.891853533919953, "grad_norm": 6.529905319213867, "learning_rate": 1.5574794209480557e-05, "loss": 2.091212272644043, "step": 59510 }, { "epoch": 16.894692023843316, "grad_norm": 7.0127716064453125, "learning_rate": 1.556060175986375e-05, "loss": 2.111943817138672, "step": 59520 }, { "epoch": 16.897530513766675, "grad_norm": 6.777892112731934, "learning_rate": 1.5546409310246947e-05, "loss": 2.156541442871094, "step": 59530 }, { "epoch": 16.900369003690038, "grad_norm": 6.349162578582764, "learning_rate": 1.5532216860630144e-05, "loss": 2.13936710357666, "step": 59540 }, { "epoch": 16.903207493613397, "grad_norm": 6.365996837615967, "learning_rate": 1.551802441101334e-05, "loss": 2.1583038330078126, "step": 59550 }, { "epoch": 16.906045983536757, "grad_norm": 6.364405632019043, "learning_rate": 1.5505251206358217e-05, "loss": 2.1429601669311524, "step": 59560 }, { "epoch": 16.90888447346012, "grad_norm": 6.222811222076416, "learning_rate": 1.5491058756741414e-05, "loss": 2.2190475463867188, "step": 59570 }, { "epoch": 16.91172296338348, "grad_norm": 6.534058094024658, "learning_rate": 1.547686630712461e-05, "loss": 2.128894233703613, "step": 59580 }, { "epoch": 16.914561453306842, "grad_norm": 6.060498237609863, "learning_rate": 
1.5462673857507807e-05, "loss": 2.072765350341797, "step": 59590 }, { "epoch": 16.9173999432302, "grad_norm": 6.29252815246582, "learning_rate": 1.5448481407891004e-05, "loss": 2.0832725524902345, "step": 59600 }, { "epoch": 16.920238433153564, "grad_norm": 6.742370128631592, "learning_rate": 1.5434288958274197e-05, "loss": 2.1244245529174806, "step": 59610 }, { "epoch": 16.923076923076923, "grad_norm": 6.299205303192139, "learning_rate": 1.5420096508657394e-05, "loss": 2.0978389739990235, "step": 59620 }, { "epoch": 16.925915413000283, "grad_norm": 6.114504814147949, "learning_rate": 1.540590405904059e-05, "loss": 2.1695734024047852, "step": 59630 }, { "epoch": 16.928753902923646, "grad_norm": 6.3788323402404785, "learning_rate": 1.5391711609423787e-05, "loss": 2.1165678024291994, "step": 59640 }, { "epoch": 16.931592392847005, "grad_norm": 6.456028938293457, "learning_rate": 1.5377519159806984e-05, "loss": 2.082106590270996, "step": 59650 }, { "epoch": 16.934430882770368, "grad_norm": 6.127455234527588, "learning_rate": 1.536332671019018e-05, "loss": 2.1799108505249025, "step": 59660 }, { "epoch": 16.937269372693727, "grad_norm": 6.004118919372559, "learning_rate": 1.5349134260573374e-05, "loss": 2.0877485275268555, "step": 59670 }, { "epoch": 16.940107862617086, "grad_norm": 6.285346984863281, "learning_rate": 1.533494181095657e-05, "loss": 2.1453407287597654, "step": 59680 }, { "epoch": 16.94294635254045, "grad_norm": 6.26959228515625, "learning_rate": 1.5320749361339767e-05, "loss": 2.105076217651367, "step": 59690 }, { "epoch": 16.94578484246381, "grad_norm": 6.608125686645508, "learning_rate": 1.5306556911722964e-05, "loss": 2.165878105163574, "step": 59700 }, { "epoch": 16.94862333238717, "grad_norm": 6.574593544006348, "learning_rate": 1.529236446210616e-05, "loss": 2.1892810821533204, "step": 59710 }, { "epoch": 16.95146182231053, "grad_norm": 6.556127071380615, "learning_rate": 1.5278172012489354e-05, "loss": 2.1473295211791994, "step": 59720 }, { 
"epoch": 16.95430031223389, "grad_norm": 6.431490898132324, "learning_rate": 1.526397956287255e-05, "loss": 2.0895435333251955, "step": 59730 }, { "epoch": 16.957138802157253, "grad_norm": 6.161937236785889, "learning_rate": 1.5249787113255747e-05, "loss": 2.120193672180176, "step": 59740 }, { "epoch": 16.959977292080612, "grad_norm": 6.271885395050049, "learning_rate": 1.5235594663638944e-05, "loss": 2.0213081359863283, "step": 59750 }, { "epoch": 16.962815782003975, "grad_norm": 6.278640270233154, "learning_rate": 1.5221402214022141e-05, "loss": 2.113099479675293, "step": 59760 }, { "epoch": 16.965654271927335, "grad_norm": 6.548995018005371, "learning_rate": 1.5207209764405336e-05, "loss": 2.1808923721313476, "step": 59770 }, { "epoch": 16.968492761850694, "grad_norm": 7.085607528686523, "learning_rate": 1.5193017314788533e-05, "loss": 2.156149482727051, "step": 59780 }, { "epoch": 16.971331251774057, "grad_norm": 6.452113151550293, "learning_rate": 1.517882486517173e-05, "loss": 2.1428955078125, "step": 59790 }, { "epoch": 16.974169741697416, "grad_norm": 6.218883037567139, "learning_rate": 1.5164632415554924e-05, "loss": 2.0681434631347657, "step": 59800 }, { "epoch": 16.97700823162078, "grad_norm": 6.64301872253418, "learning_rate": 1.5150439965938121e-05, "loss": 2.1293529510498046, "step": 59810 }, { "epoch": 16.97984672154414, "grad_norm": 6.414233684539795, "learning_rate": 1.5136247516321316e-05, "loss": 2.126271057128906, "step": 59820 }, { "epoch": 16.982685211467498, "grad_norm": 6.372224807739258, "learning_rate": 1.5122055066704513e-05, "loss": 2.0583324432373047, "step": 59830 }, { "epoch": 16.98552370139086, "grad_norm": 6.61337423324585, "learning_rate": 1.510786261708771e-05, "loss": 2.160342788696289, "step": 59840 }, { "epoch": 16.98836219131422, "grad_norm": 6.361216068267822, "learning_rate": 1.5093670167470904e-05, "loss": 2.063399887084961, "step": 59850 }, { "epoch": 16.991200681237583, "grad_norm": 6.4677653312683105, "learning_rate": 
1.5079477717854101e-05, "loss": 2.094265937805176, "step": 59860 }, { "epoch": 16.994039171160942, "grad_norm": 6.416655540466309, "learning_rate": 1.5065285268237298e-05, "loss": 2.108565330505371, "step": 59870 }, { "epoch": 16.9968776610843, "grad_norm": 6.40622615814209, "learning_rate": 1.5051092818620493e-05, "loss": 2.1313501358032227, "step": 59880 }, { "epoch": 16.999716151007664, "grad_norm": 6.352440357208252, "learning_rate": 1.503690036900369e-05, "loss": 2.1564453125, "step": 59890 }, { "epoch": 17.002554640931024, "grad_norm": 6.608471393585205, "learning_rate": 1.5024127164348567e-05, "loss": 2.1892812728881834, "step": 59900 }, { "epoch": 17.005393130854387, "grad_norm": 6.541316986083984, "learning_rate": 1.5009934714731762e-05, "loss": 2.083477592468262, "step": 59910 }, { "epoch": 17.008231620777746, "grad_norm": 6.7843241691589355, "learning_rate": 1.4995742265114959e-05, "loss": 2.131167984008789, "step": 59920 }, { "epoch": 17.011070110701105, "grad_norm": 6.54405403137207, "learning_rate": 1.4981549815498156e-05, "loss": 2.086264801025391, "step": 59930 }, { "epoch": 17.013908600624468, "grad_norm": 6.329928874969482, "learning_rate": 1.496735736588135e-05, "loss": 2.0797414779663086, "step": 59940 }, { "epoch": 17.016747090547828, "grad_norm": 6.430119037628174, "learning_rate": 1.4953164916264547e-05, "loss": 2.137734794616699, "step": 59950 }, { "epoch": 17.01958558047119, "grad_norm": 6.25243616104126, "learning_rate": 1.4938972466647744e-05, "loss": 2.1215578079223634, "step": 59960 }, { "epoch": 17.02242407039455, "grad_norm": 6.150838375091553, "learning_rate": 1.492478001703094e-05, "loss": 2.098955535888672, "step": 59970 }, { "epoch": 17.025262560317913, "grad_norm": 6.4149980545043945, "learning_rate": 1.4910587567414136e-05, "loss": 2.0118885040283203, "step": 59980 }, { "epoch": 17.028101050241272, "grad_norm": 6.160953521728516, "learning_rate": 1.4896395117797333e-05, "loss": 2.0940683364868162, "step": 59990 }, { "epoch": 
17.03093954016463, "grad_norm": 6.4580559730529785, "learning_rate": 1.4882202668180528e-05, "loss": 2.118644142150879, "step": 60000 }, { "epoch": 17.03093954016463, "eval_accuracy": 0.3541044064347937, "eval_loss": 2.448896646499634, "eval_runtime": 50.4625, "eval_samples_per_second": 311.657, "eval_steps_per_second": 4.875, "step": 60000 }, { "epoch": 17.033778030087994, "grad_norm": 5.847116947174072, "learning_rate": 1.4868010218563724e-05, "loss": 2.1179555892944335, "step": 60010 }, { "epoch": 17.036616520011354, "grad_norm": 6.3256144523620605, "learning_rate": 1.4853817768946921e-05, "loss": 2.109488296508789, "step": 60020 }, { "epoch": 17.039455009934716, "grad_norm": 6.579216957092285, "learning_rate": 1.4839625319330116e-05, "loss": 2.1722200393676756, "step": 60030 }, { "epoch": 17.042293499858076, "grad_norm": 6.499370098114014, "learning_rate": 1.4825432869713313e-05, "loss": 2.129041290283203, "step": 60040 }, { "epoch": 17.045131989781435, "grad_norm": 6.496866703033447, "learning_rate": 1.481124042009651e-05, "loss": 2.0731678009033203, "step": 60050 }, { "epoch": 17.047970479704798, "grad_norm": 6.649099349975586, "learning_rate": 1.4797047970479705e-05, "loss": 2.1257608413696287, "step": 60060 }, { "epoch": 17.050808969628157, "grad_norm": 6.378681182861328, "learning_rate": 1.4782855520862901e-05, "loss": 2.142395782470703, "step": 60070 }, { "epoch": 17.05364745955152, "grad_norm": 6.71944522857666, "learning_rate": 1.4768663071246098e-05, "loss": 2.133505630493164, "step": 60080 }, { "epoch": 17.05648594947488, "grad_norm": 6.542564392089844, "learning_rate": 1.4754470621629293e-05, "loss": 2.122756767272949, "step": 60090 }, { "epoch": 17.05932443939824, "grad_norm": 7.179574012756348, "learning_rate": 1.474027817201249e-05, "loss": 2.061177635192871, "step": 60100 }, { "epoch": 17.0621629293216, "grad_norm": 6.390422821044922, "learning_rate": 1.4726085722395685e-05, "loss": 2.105596160888672, "step": 60110 }, { "epoch": 
17.06500141924496, "grad_norm": 6.432822227478027, "learning_rate": 1.4711893272778881e-05, "loss": 2.1002422332763673, "step": 60120 }, { "epoch": 17.067839909168324, "grad_norm": 6.163914203643799, "learning_rate": 1.4697700823162078e-05, "loss": 2.0714229583740233, "step": 60130 }, { "epoch": 17.070678399091683, "grad_norm": 6.458765506744385, "learning_rate": 1.4683508373545273e-05, "loss": 2.0325984954833984, "step": 60140 }, { "epoch": 17.073516889015043, "grad_norm": 6.636162281036377, "learning_rate": 1.466931592392847e-05, "loss": 2.071198272705078, "step": 60150 }, { "epoch": 17.076355378938405, "grad_norm": 6.358109951019287, "learning_rate": 1.4655123474311667e-05, "loss": 2.10357551574707, "step": 60160 }, { "epoch": 17.079193868861765, "grad_norm": 6.6357574462890625, "learning_rate": 1.4640931024694862e-05, "loss": 2.0837352752685545, "step": 60170 }, { "epoch": 17.082032358785128, "grad_norm": 6.673110008239746, "learning_rate": 1.4626738575078058e-05, "loss": 2.1281919479370117, "step": 60180 }, { "epoch": 17.084870848708487, "grad_norm": 6.596370697021484, "learning_rate": 1.4612546125461255e-05, "loss": 2.0882312774658205, "step": 60190 }, { "epoch": 17.087709338631846, "grad_norm": 6.637057781219482, "learning_rate": 1.459835367584445e-05, "loss": 2.1822975158691404, "step": 60200 }, { "epoch": 17.09054782855521, "grad_norm": 6.2228684425354, "learning_rate": 1.4584161226227647e-05, "loss": 2.0292875289916994, "step": 60210 }, { "epoch": 17.09338631847857, "grad_norm": 6.299417495727539, "learning_rate": 1.4569968776610843e-05, "loss": 2.104446792602539, "step": 60220 }, { "epoch": 17.09622480840193, "grad_norm": 6.558992385864258, "learning_rate": 1.4555776326994038e-05, "loss": 2.182000923156738, "step": 60230 }, { "epoch": 17.09906329832529, "grad_norm": 6.36713171005249, "learning_rate": 1.4541583877377235e-05, "loss": 2.0834835052490233, "step": 60240 }, { "epoch": 17.10190178824865, "grad_norm": 6.6134843826293945, "learning_rate": 
1.4527391427760432e-05, "loss": 2.0853132247924804, "step": 60250 }, { "epoch": 17.104740278172013, "grad_norm": 6.518113136291504, "learning_rate": 1.4513198978143627e-05, "loss": 2.1477989196777343, "step": 60260 }, { "epoch": 17.107578768095372, "grad_norm": 6.439436912536621, "learning_rate": 1.4499006528526824e-05, "loss": 2.126470947265625, "step": 60270 }, { "epoch": 17.110417258018735, "grad_norm": 6.35228157043457, "learning_rate": 1.448481407891002e-05, "loss": 2.126068115234375, "step": 60280 }, { "epoch": 17.113255747942095, "grad_norm": 6.40854024887085, "learning_rate": 1.4470621629293215e-05, "loss": 2.1240158081054688, "step": 60290 }, { "epoch": 17.116094237865454, "grad_norm": 6.129135608673096, "learning_rate": 1.4456429179676412e-05, "loss": 2.1321043014526366, "step": 60300 }, { "epoch": 17.118932727788817, "grad_norm": 6.549203395843506, "learning_rate": 1.4442236730059609e-05, "loss": 2.125930404663086, "step": 60310 }, { "epoch": 17.121771217712176, "grad_norm": 6.196710109710693, "learning_rate": 1.4428044280442804e-05, "loss": 2.08373966217041, "step": 60320 }, { "epoch": 17.12460970763554, "grad_norm": 6.36288595199585, "learning_rate": 1.4413851830826e-05, "loss": 2.100708770751953, "step": 60330 }, { "epoch": 17.1274481975589, "grad_norm": 6.262726306915283, "learning_rate": 1.4399659381209195e-05, "loss": 2.10403995513916, "step": 60340 }, { "epoch": 17.130286687482258, "grad_norm": 6.405741214752197, "learning_rate": 1.4385466931592392e-05, "loss": 2.016373062133789, "step": 60350 }, { "epoch": 17.13312517740562, "grad_norm": 6.707220554351807, "learning_rate": 1.4371274481975589e-05, "loss": 2.103748893737793, "step": 60360 }, { "epoch": 17.13596366732898, "grad_norm": 6.262211799621582, "learning_rate": 1.4357082032358784e-05, "loss": 2.0841957092285157, "step": 60370 }, { "epoch": 17.138802157252343, "grad_norm": 6.684305667877197, "learning_rate": 1.434288958274198e-05, "loss": 2.029327392578125, "step": 60380 }, { "epoch": 
17.141640647175702, "grad_norm": 6.363909721374512, "learning_rate": 1.4328697133125177e-05, "loss": 2.142097282409668, "step": 60390 }, { "epoch": 17.144479137099065, "grad_norm": 6.803969383239746, "learning_rate": 1.4314504683508372e-05, "loss": 2.0959848403930663, "step": 60400 }, { "epoch": 17.147317627022424, "grad_norm": 6.615866184234619, "learning_rate": 1.4300312233891569e-05, "loss": 2.168914222717285, "step": 60410 }, { "epoch": 17.150156116945784, "grad_norm": 6.551262378692627, "learning_rate": 1.4286119784274766e-05, "loss": 2.043583297729492, "step": 60420 }, { "epoch": 17.152994606869147, "grad_norm": 6.372097492218018, "learning_rate": 1.427192733465796e-05, "loss": 2.06236572265625, "step": 60430 }, { "epoch": 17.155833096792506, "grad_norm": 6.622402191162109, "learning_rate": 1.425773488504116e-05, "loss": 2.126373291015625, "step": 60440 }, { "epoch": 17.15867158671587, "grad_norm": 6.73896598815918, "learning_rate": 1.4243542435424356e-05, "loss": 2.092626190185547, "step": 60450 }, { "epoch": 17.161510076639228, "grad_norm": 6.975832462310791, "learning_rate": 1.4229349985807553e-05, "loss": 2.041607475280762, "step": 60460 }, { "epoch": 17.164348566562587, "grad_norm": 6.977777004241943, "learning_rate": 1.421515753619075e-05, "loss": 2.09289436340332, "step": 60470 }, { "epoch": 17.16718705648595, "grad_norm": 5.906913757324219, "learning_rate": 1.4200965086573944e-05, "loss": 2.0782354354858397, "step": 60480 }, { "epoch": 17.17002554640931, "grad_norm": 6.5357666015625, "learning_rate": 1.4186772636957141e-05, "loss": 2.1709400177001954, "step": 60490 }, { "epoch": 17.172864036332673, "grad_norm": 7.024181365966797, "learning_rate": 1.4172580187340338e-05, "loss": 2.1288665771484374, "step": 60500 }, { "epoch": 17.172864036332673, "eval_accuracy": 0.3539772365994786, "eval_loss": 2.447812795639038, "eval_runtime": 48.5541, "eval_samples_per_second": 323.907, "eval_steps_per_second": 5.067, "step": 60500 }, { "epoch": 17.175702526256032, 
"grad_norm": 7.058117866516113, "learning_rate": 1.4158387737723533e-05, "loss": 2.1579742431640625, "step": 60510 }, { "epoch": 17.17854101617939, "grad_norm": 6.692976474761963, "learning_rate": 1.414419528810673e-05, "loss": 2.055519866943359, "step": 60520 }, { "epoch": 17.181379506102754, "grad_norm": 6.143016338348389, "learning_rate": 1.4130002838489924e-05, "loss": 2.089095878601074, "step": 60530 }, { "epoch": 17.184217996026113, "grad_norm": 7.108044624328613, "learning_rate": 1.4115810388873121e-05, "loss": 2.07230224609375, "step": 60540 }, { "epoch": 17.187056485949476, "grad_norm": 6.468245029449463, "learning_rate": 1.4101617939256318e-05, "loss": 2.113076019287109, "step": 60550 }, { "epoch": 17.189894975872836, "grad_norm": 6.627634048461914, "learning_rate": 1.4087425489639513e-05, "loss": 2.06494083404541, "step": 60560 }, { "epoch": 17.192733465796195, "grad_norm": 6.457289695739746, "learning_rate": 1.407323304002271e-05, "loss": 2.1179595947265626, "step": 60570 }, { "epoch": 17.195571955719558, "grad_norm": 6.719277858734131, "learning_rate": 1.4059040590405906e-05, "loss": 2.0763999938964846, "step": 60580 }, { "epoch": 17.198410445642917, "grad_norm": 6.582827091217041, "learning_rate": 1.4044848140789101e-05, "loss": 2.0665573120117187, "step": 60590 }, { "epoch": 17.20124893556628, "grad_norm": 6.702386856079102, "learning_rate": 1.4030655691172298e-05, "loss": 2.1452592849731444, "step": 60600 }, { "epoch": 17.20408742548964, "grad_norm": 6.610776901245117, "learning_rate": 1.4016463241555495e-05, "loss": 2.039366912841797, "step": 60610 }, { "epoch": 17.206925915413, "grad_norm": 6.048492908477783, "learning_rate": 1.400227079193869e-05, "loss": 2.0333709716796875, "step": 60620 }, { "epoch": 17.20976440533636, "grad_norm": 6.0263671875, "learning_rate": 1.3988078342321886e-05, "loss": 2.1357872009277346, "step": 60630 }, { "epoch": 17.21260289525972, "grad_norm": 6.1069254875183105, "learning_rate": 1.3973885892705083e-05, "loss": 
2.1393531799316405, "step": 60640 }, { "epoch": 17.215441385183084, "grad_norm": 6.355607986450195, "learning_rate": 1.3959693443088278e-05, "loss": 2.128170967102051, "step": 60650 }, { "epoch": 17.218279875106443, "grad_norm": 6.466501712799072, "learning_rate": 1.3945500993471475e-05, "loss": 2.161060905456543, "step": 60660 }, { "epoch": 17.221118365029803, "grad_norm": 6.407556533813477, "learning_rate": 1.3931308543854672e-05, "loss": 2.1536916732788085, "step": 60670 }, { "epoch": 17.223956854953165, "grad_norm": 6.366998195648193, "learning_rate": 1.3917116094237867e-05, "loss": 2.197832489013672, "step": 60680 }, { "epoch": 17.226795344876525, "grad_norm": 6.373558521270752, "learning_rate": 1.3902923644621063e-05, "loss": 2.087432861328125, "step": 60690 }, { "epoch": 17.229633834799888, "grad_norm": 6.061450481414795, "learning_rate": 1.388873119500426e-05, "loss": 1.990736961364746, "step": 60700 }, { "epoch": 17.232472324723247, "grad_norm": 6.107224941253662, "learning_rate": 1.3874538745387455e-05, "loss": 2.1028011322021483, "step": 60710 }, { "epoch": 17.235310814646606, "grad_norm": 6.393880844116211, "learning_rate": 1.3860346295770652e-05, "loss": 2.148082733154297, "step": 60720 }, { "epoch": 17.23814930456997, "grad_norm": 6.552264213562012, "learning_rate": 1.3846153846153847e-05, "loss": 2.118338394165039, "step": 60730 }, { "epoch": 17.24098779449333, "grad_norm": 6.647693157196045, "learning_rate": 1.3831961396537043e-05, "loss": 2.121087646484375, "step": 60740 }, { "epoch": 17.24382628441669, "grad_norm": 6.96408224105835, "learning_rate": 1.381776894692024e-05, "loss": 2.0924098968505858, "step": 60750 }, { "epoch": 17.24666477434005, "grad_norm": 6.490701675415039, "learning_rate": 1.3803576497303435e-05, "loss": 2.1090965270996094, "step": 60760 }, { "epoch": 17.249503264263414, "grad_norm": 6.279256343841553, "learning_rate": 1.3789384047686632e-05, "loss": 2.123834228515625, "step": 60770 }, { "epoch": 17.252341754186773, 
"grad_norm": 6.405404090881348, "learning_rate": 1.3775191598069829e-05, "loss": 2.106098747253418, "step": 60780 }, { "epoch": 17.255180244110132, "grad_norm": 6.367740631103516, "learning_rate": 1.3760999148453024e-05, "loss": 2.1352014541625977, "step": 60790 }, { "epoch": 17.258018734033495, "grad_norm": 6.178104400634766, "learning_rate": 1.374680669883622e-05, "loss": 2.0958484649658202, "step": 60800 }, { "epoch": 17.260857223956855, "grad_norm": 6.3831706047058105, "learning_rate": 1.3732614249219417e-05, "loss": 2.1298139572143553, "step": 60810 }, { "epoch": 17.263695713880217, "grad_norm": 6.671202182769775, "learning_rate": 1.3718421799602612e-05, "loss": 2.093104934692383, "step": 60820 }, { "epoch": 17.266534203803577, "grad_norm": 6.347376823425293, "learning_rate": 1.3704229349985809e-05, "loss": 2.105167579650879, "step": 60830 }, { "epoch": 17.269372693726936, "grad_norm": 6.33451509475708, "learning_rate": 1.3690036900369005e-05, "loss": 2.114225387573242, "step": 60840 }, { "epoch": 17.2722111836503, "grad_norm": 6.825623512268066, "learning_rate": 1.36758444507522e-05, "loss": 2.1007226943969726, "step": 60850 }, { "epoch": 17.27504967357366, "grad_norm": 6.5536394119262695, "learning_rate": 1.3661652001135397e-05, "loss": 2.1020984649658203, "step": 60860 }, { "epoch": 17.27788816349702, "grad_norm": 6.4439520835876465, "learning_rate": 1.3647459551518594e-05, "loss": 2.1481039047241213, "step": 60870 }, { "epoch": 17.28072665342038, "grad_norm": 5.897321701049805, "learning_rate": 1.3633267101901789e-05, "loss": 2.0733766555786133, "step": 60880 }, { "epoch": 17.28356514334374, "grad_norm": 6.269729137420654, "learning_rate": 1.3619074652284986e-05, "loss": 2.156168746948242, "step": 60890 }, { "epoch": 17.286403633267103, "grad_norm": 6.231815338134766, "learning_rate": 1.3604882202668182e-05, "loss": 2.044289207458496, "step": 60900 }, { "epoch": 17.289242123190462, "grad_norm": 6.269872188568115, "learning_rate": 1.3590689753051377e-05, 
"loss": 2.127248191833496, "step": 60910 }, { "epoch": 17.292080613113825, "grad_norm": 6.362732410430908, "learning_rate": 1.3576497303434574e-05, "loss": 2.08974552154541, "step": 60920 }, { "epoch": 17.294919103037184, "grad_norm": 6.988313674926758, "learning_rate": 1.356230485381777e-05, "loss": 2.1093505859375, "step": 60930 }, { "epoch": 17.297757592960544, "grad_norm": 6.969214916229248, "learning_rate": 1.3548112404200966e-05, "loss": 2.0897806167602537, "step": 60940 }, { "epoch": 17.300596082883906, "grad_norm": 6.606720447540283, "learning_rate": 1.3533919954584163e-05, "loss": 2.1750497817993164, "step": 60950 }, { "epoch": 17.303434572807266, "grad_norm": 6.32132625579834, "learning_rate": 1.3519727504967358e-05, "loss": 2.162131690979004, "step": 60960 }, { "epoch": 17.30627306273063, "grad_norm": 6.7284040451049805, "learning_rate": 1.3505535055350554e-05, "loss": 2.080636215209961, "step": 60970 }, { "epoch": 17.309111552653988, "grad_norm": 6.254078388214111, "learning_rate": 1.3491342605733751e-05, "loss": 2.1699485778808594, "step": 60980 }, { "epoch": 17.311950042577347, "grad_norm": 6.377214431762695, "learning_rate": 1.3477150156116946e-05, "loss": 2.1376211166381838, "step": 60990 }, { "epoch": 17.31478853250071, "grad_norm": 6.7831501960754395, "learning_rate": 1.3462957706500143e-05, "loss": 2.1906652450561523, "step": 61000 }, { "epoch": 17.31478853250071, "eval_accuracy": 0.35086157563425957, "eval_loss": 2.446355104446411, "eval_runtime": 51.2824, "eval_samples_per_second": 306.675, "eval_steps_per_second": 4.797, "step": 61000 }, { "epoch": 17.31762702242407, "grad_norm": 6.460679531097412, "learning_rate": 1.344876525688334e-05, "loss": 2.2208106994628904, "step": 61010 }, { "epoch": 17.320465512347432, "grad_norm": 6.234585285186768, "learning_rate": 1.3434572807266534e-05, "loss": 2.1450435638427736, "step": 61020 }, { "epoch": 17.323304002270792, "grad_norm": 6.6469244956970215, "learning_rate": 1.3420380357649731e-05, "loss": 
2.119343566894531, "step": 61030 }, { "epoch": 17.32614249219415, "grad_norm": 6.2639899253845215, "learning_rate": 1.3406187908032928e-05, "loss": 2.061864471435547, "step": 61040 }, { "epoch": 17.328980982117514, "grad_norm": 6.215234756469727, "learning_rate": 1.3391995458416123e-05, "loss": 2.155030632019043, "step": 61050 }, { "epoch": 17.331819472040873, "grad_norm": 6.3191633224487305, "learning_rate": 1.337780300879932e-05, "loss": 2.118383598327637, "step": 61060 }, { "epoch": 17.334657961964236, "grad_norm": 6.228386878967285, "learning_rate": 1.3363610559182516e-05, "loss": 2.0926246643066406, "step": 61070 }, { "epoch": 17.337496451887596, "grad_norm": 6.835360050201416, "learning_rate": 1.3349418109565711e-05, "loss": 2.201292610168457, "step": 61080 }, { "epoch": 17.340334941810955, "grad_norm": 6.207193851470947, "learning_rate": 1.3335225659948908e-05, "loss": 2.116934394836426, "step": 61090 }, { "epoch": 17.343173431734318, "grad_norm": 7.035484790802002, "learning_rate": 1.3321033210332105e-05, "loss": 2.095166015625, "step": 61100 }, { "epoch": 17.346011921657677, "grad_norm": 6.724499225616455, "learning_rate": 1.33068407607153e-05, "loss": 2.164213752746582, "step": 61110 }, { "epoch": 17.34885041158104, "grad_norm": 6.756214618682861, "learning_rate": 1.3292648311098496e-05, "loss": 2.1245471954345705, "step": 61120 }, { "epoch": 17.3516889015044, "grad_norm": 6.5177459716796875, "learning_rate": 1.3278455861481693e-05, "loss": 2.0486793518066406, "step": 61130 }, { "epoch": 17.35452739142776, "grad_norm": 6.362992763519287, "learning_rate": 1.3264263411864888e-05, "loss": 2.127602195739746, "step": 61140 }, { "epoch": 17.35736588135112, "grad_norm": 6.794569969177246, "learning_rate": 1.3250070962248085e-05, "loss": 2.0979408264160155, "step": 61150 }, { "epoch": 17.36020437127448, "grad_norm": 6.268159866333008, "learning_rate": 1.323587851263128e-05, "loss": 2.078872299194336, "step": 61160 }, { "epoch": 17.363042861197844, "grad_norm": 
6.162642002105713, "learning_rate": 1.3221686063014477e-05, "loss": 2.06579647064209, "step": 61170 }, { "epoch": 17.365881351121203, "grad_norm": 6.2365617752075195, "learning_rate": 1.3207493613397673e-05, "loss": 2.0863189697265625, "step": 61180 }, { "epoch": 17.368719841044566, "grad_norm": 6.186573505401611, "learning_rate": 1.3193301163780868e-05, "loss": 2.0394887924194336, "step": 61190 }, { "epoch": 17.371558330967925, "grad_norm": 6.561928749084473, "learning_rate": 1.3179108714164065e-05, "loss": 2.0659391403198244, "step": 61200 }, { "epoch": 17.374396820891285, "grad_norm": 7.029858112335205, "learning_rate": 1.3164916264547262e-05, "loss": 2.1780250549316404, "step": 61210 }, { "epoch": 17.377235310814648, "grad_norm": 6.693567276000977, "learning_rate": 1.3150723814930457e-05, "loss": 2.0770050048828126, "step": 61220 }, { "epoch": 17.380073800738007, "grad_norm": 6.104567050933838, "learning_rate": 1.3136531365313653e-05, "loss": 2.0147558212280274, "step": 61230 }, { "epoch": 17.38291229066137, "grad_norm": 6.356402397155762, "learning_rate": 1.312233891569685e-05, "loss": 2.0776609420776366, "step": 61240 }, { "epoch": 17.38575078058473, "grad_norm": 6.353640079498291, "learning_rate": 1.3108146466080045e-05, "loss": 1.9454832077026367, "step": 61250 }, { "epoch": 17.38858927050809, "grad_norm": 6.728814125061035, "learning_rate": 1.3093954016463242e-05, "loss": 2.1201669692993166, "step": 61260 }, { "epoch": 17.39142776043145, "grad_norm": 6.8033857345581055, "learning_rate": 1.3079761566846439e-05, "loss": 2.0539615631103514, "step": 61270 }, { "epoch": 17.39426625035481, "grad_norm": 6.4211201667785645, "learning_rate": 1.3065569117229634e-05, "loss": 2.1281673431396486, "step": 61280 }, { "epoch": 17.397104740278174, "grad_norm": 6.290921688079834, "learning_rate": 1.305137666761283e-05, "loss": 2.0717870712280275, "step": 61290 }, { "epoch": 17.399943230201533, "grad_norm": 6.558976650238037, "learning_rate": 1.3037184217996027e-05, "loss": 
2.0405961990356447, "step": 61300 }, { "epoch": 17.402781720124892, "grad_norm": 6.074294090270996, "learning_rate": 1.3022991768379222e-05, "loss": 2.1278173446655275, "step": 61310 }, { "epoch": 17.405620210048255, "grad_norm": 6.360847473144531, "learning_rate": 1.3008799318762419e-05, "loss": 2.137257194519043, "step": 61320 }, { "epoch": 17.408458699971614, "grad_norm": 6.633480548858643, "learning_rate": 1.2994606869145615e-05, "loss": 2.073769378662109, "step": 61330 }, { "epoch": 17.411297189894977, "grad_norm": 6.740935802459717, "learning_rate": 1.298041441952881e-05, "loss": 2.15179500579834, "step": 61340 }, { "epoch": 17.414135679818337, "grad_norm": 6.368393421173096, "learning_rate": 1.2966221969912007e-05, "loss": 2.141633415222168, "step": 61350 }, { "epoch": 17.416974169741696, "grad_norm": 6.679074287414551, "learning_rate": 1.2952029520295202e-05, "loss": 2.1114477157592773, "step": 61360 }, { "epoch": 17.41981265966506, "grad_norm": 6.258053779602051, "learning_rate": 1.2937837070678399e-05, "loss": 2.125336837768555, "step": 61370 }, { "epoch": 17.422651149588418, "grad_norm": 6.163348197937012, "learning_rate": 1.2923644621061596e-05, "loss": 2.1151006698608397, "step": 61380 }, { "epoch": 17.42548963951178, "grad_norm": 6.602224826812744, "learning_rate": 1.290945217144479e-05, "loss": 2.0928623199462892, "step": 61390 }, { "epoch": 17.42832812943514, "grad_norm": 6.3259358406066895, "learning_rate": 1.2895259721827987e-05, "loss": 2.1493810653686523, "step": 61400 }, { "epoch": 17.4311666193585, "grad_norm": 6.161622047424316, "learning_rate": 1.2881067272211184e-05, "loss": 2.055315399169922, "step": 61410 }, { "epoch": 17.434005109281863, "grad_norm": 6.612868309020996, "learning_rate": 1.2866874822594379e-05, "loss": 2.1078727722167967, "step": 61420 }, { "epoch": 17.436843599205222, "grad_norm": 6.595709323883057, "learning_rate": 1.2852682372977576e-05, "loss": 2.1174665451049806, "step": 61430 }, { "epoch": 17.439682089128585, 
"grad_norm": 6.540686130523682, "learning_rate": 1.2838489923360772e-05, "loss": 2.1256034851074217, "step": 61440 }, { "epoch": 17.442520579051944, "grad_norm": 6.092038631439209, "learning_rate": 1.2824297473743967e-05, "loss": 2.1595216751098634, "step": 61450 }, { "epoch": 17.445359068975304, "grad_norm": 6.666629791259766, "learning_rate": 1.2810105024127164e-05, "loss": 2.1439796447753907, "step": 61460 }, { "epoch": 17.448197558898666, "grad_norm": 6.020827770233154, "learning_rate": 1.2795912574510361e-05, "loss": 2.087324333190918, "step": 61470 }, { "epoch": 17.451036048822026, "grad_norm": 6.647865295410156, "learning_rate": 1.2781720124893556e-05, "loss": 2.0982561111450195, "step": 61480 }, { "epoch": 17.45387453874539, "grad_norm": 6.2991943359375, "learning_rate": 1.2767527675276753e-05, "loss": 2.0619611740112305, "step": 61490 }, { "epoch": 17.456713028668748, "grad_norm": 6.421746730804443, "learning_rate": 1.275333522565995e-05, "loss": 2.1015241622924803, "step": 61500 }, { "epoch": 17.456713028668748, "eval_accuracy": 0.3530870477522732, "eval_loss": 2.443981409072876, "eval_runtime": 49.6635, "eval_samples_per_second": 316.671, "eval_steps_per_second": 4.953, "step": 61500 }, { "epoch": 17.459551518592107, "grad_norm": 6.180727958679199, "learning_rate": 1.2739142776043144e-05, "loss": 2.146354103088379, "step": 61510 }, { "epoch": 17.46239000851547, "grad_norm": 6.752898216247559, "learning_rate": 1.2724950326426341e-05, "loss": 2.1565567016601563, "step": 61520 }, { "epoch": 17.46522849843883, "grad_norm": 6.618875980377197, "learning_rate": 1.2710757876809538e-05, "loss": 2.1504690170288088, "step": 61530 }, { "epoch": 17.468066988362192, "grad_norm": 6.477664947509766, "learning_rate": 1.2696565427192733e-05, "loss": 2.2004247665405274, "step": 61540 }, { "epoch": 17.47090547828555, "grad_norm": 6.4968581199646, "learning_rate": 1.268237297757593e-05, "loss": 2.143640899658203, "step": 61550 }, { "epoch": 17.473743968208915, "grad_norm": 
6.249914646148682, "learning_rate": 1.2668180527959126e-05, "loss": 1.9946779251098632, "step": 61560 }, { "epoch": 17.476582458132274, "grad_norm": 6.191343784332275, "learning_rate": 1.2653988078342321e-05, "loss": 2.1272525787353516, "step": 61570 }, { "epoch": 17.479420948055633, "grad_norm": 6.162976264953613, "learning_rate": 1.2639795628725518e-05, "loss": 2.11334285736084, "step": 61580 }, { "epoch": 17.482259437978996, "grad_norm": 6.352663993835449, "learning_rate": 1.2625603179108713e-05, "loss": 2.169632911682129, "step": 61590 }, { "epoch": 17.485097927902356, "grad_norm": 6.579433917999268, "learning_rate": 1.261141072949191e-05, "loss": 2.2013008117675783, "step": 61600 }, { "epoch": 17.48793641782572, "grad_norm": 6.994508743286133, "learning_rate": 1.2597218279875106e-05, "loss": 2.0067216873168947, "step": 61610 }, { "epoch": 17.490774907749078, "grad_norm": 6.495379447937012, "learning_rate": 1.2583025830258301e-05, "loss": 2.112092208862305, "step": 61620 }, { "epoch": 17.493613397672437, "grad_norm": 6.745548248291016, "learning_rate": 1.2568833380641498e-05, "loss": 2.1166038513183594, "step": 61630 }, { "epoch": 17.4964518875958, "grad_norm": 6.775463581085205, "learning_rate": 1.2554640931024695e-05, "loss": 2.0388816833496093, "step": 61640 }, { "epoch": 17.49929037751916, "grad_norm": 6.579274654388428, "learning_rate": 1.254044848140789e-05, "loss": 2.0825244903564455, "step": 61650 }, { "epoch": 17.502128867442522, "grad_norm": 6.741671085357666, "learning_rate": 1.2526256031791087e-05, "loss": 2.0741851806640623, "step": 61660 }, { "epoch": 17.50496735736588, "grad_norm": 6.418030738830566, "learning_rate": 1.2512063582174283e-05, "loss": 2.141576957702637, "step": 61670 }, { "epoch": 17.50780584728924, "grad_norm": 6.544433116912842, "learning_rate": 1.249787113255748e-05, "loss": 2.030382537841797, "step": 61680 }, { "epoch": 17.510644337212604, "grad_norm": 6.307040691375732, "learning_rate": 1.2483678682940677e-05, "loss": 
2.085826301574707, "step": 61690 }, { "epoch": 17.513482827135963, "grad_norm": 6.317930221557617, "learning_rate": 1.2469486233323872e-05, "loss": 2.0995952606201174, "step": 61700 }, { "epoch": 17.516321317059326, "grad_norm": 6.631149768829346, "learning_rate": 1.2455293783707068e-05, "loss": 2.132617378234863, "step": 61710 }, { "epoch": 17.519159806982685, "grad_norm": 6.309354305267334, "learning_rate": 1.2441101334090265e-05, "loss": 2.0678726196289063, "step": 61720 }, { "epoch": 17.521998296906045, "grad_norm": 6.610644817352295, "learning_rate": 1.242690888447346e-05, "loss": 2.139414978027344, "step": 61730 }, { "epoch": 17.524836786829407, "grad_norm": 6.83234977722168, "learning_rate": 1.2412716434856657e-05, "loss": 2.181488037109375, "step": 61740 }, { "epoch": 17.527675276752767, "grad_norm": 6.675538063049316, "learning_rate": 1.2398523985239854e-05, "loss": 2.099203109741211, "step": 61750 }, { "epoch": 17.53051376667613, "grad_norm": 6.335094928741455, "learning_rate": 1.2384331535623049e-05, "loss": 2.117589569091797, "step": 61760 }, { "epoch": 17.53335225659949, "grad_norm": 6.776953220367432, "learning_rate": 1.2370139086006245e-05, "loss": 2.101405715942383, "step": 61770 }, { "epoch": 17.53619074652285, "grad_norm": 6.293622970581055, "learning_rate": 1.2355946636389442e-05, "loss": 2.108960723876953, "step": 61780 }, { "epoch": 17.53902923644621, "grad_norm": 6.35422945022583, "learning_rate": 1.2341754186772637e-05, "loss": 2.121735954284668, "step": 61790 }, { "epoch": 17.54186772636957, "grad_norm": 6.7096099853515625, "learning_rate": 1.2327561737155834e-05, "loss": 2.179922103881836, "step": 61800 }, { "epoch": 17.544706216292933, "grad_norm": 6.516861438751221, "learning_rate": 1.231336928753903e-05, "loss": 2.1380558013916016, "step": 61810 }, { "epoch": 17.547544706216293, "grad_norm": 6.659820079803467, "learning_rate": 1.2299176837922225e-05, "loss": 2.062379837036133, "step": 61820 }, { "epoch": 17.550383196139652, "grad_norm": 
6.3985185623168945, "learning_rate": 1.2284984388305422e-05, "loss": 2.175676727294922, "step": 61830 }, { "epoch": 17.553221686063015, "grad_norm": 6.680093765258789, "learning_rate": 1.2270791938688619e-05, "loss": 2.1902677536010744, "step": 61840 }, { "epoch": 17.556060175986374, "grad_norm": 6.596042633056641, "learning_rate": 1.2256599489071814e-05, "loss": 2.081932258605957, "step": 61850 }, { "epoch": 17.558898665909737, "grad_norm": 6.490371227264404, "learning_rate": 1.224240703945501e-05, "loss": 2.1657846450805662, "step": 61860 }, { "epoch": 17.561737155833097, "grad_norm": 6.136172294616699, "learning_rate": 1.2228214589838206e-05, "loss": 2.0819787979125977, "step": 61870 }, { "epoch": 17.564575645756456, "grad_norm": 6.484204292297363, "learning_rate": 1.2214022140221402e-05, "loss": 2.1545181274414062, "step": 61880 }, { "epoch": 17.56741413567982, "grad_norm": 6.341609001159668, "learning_rate": 1.2199829690604599e-05, "loss": 2.103130912780762, "step": 61890 }, { "epoch": 17.570252625603178, "grad_norm": 5.998603820800781, "learning_rate": 1.2185637240987794e-05, "loss": 2.0368221282958983, "step": 61900 }, { "epoch": 17.57309111552654, "grad_norm": 6.596738815307617, "learning_rate": 1.217144479137099e-05, "loss": 2.05755558013916, "step": 61910 }, { "epoch": 17.5759296054499, "grad_norm": 6.403003215789795, "learning_rate": 1.2157252341754187e-05, "loss": 2.163652229309082, "step": 61920 }, { "epoch": 17.578768095373263, "grad_norm": 6.498582363128662, "learning_rate": 1.2143059892137382e-05, "loss": 2.1584091186523438, "step": 61930 }, { "epoch": 17.581606585296623, "grad_norm": 6.160919666290283, "learning_rate": 1.2128867442520579e-05, "loss": 2.1085332870483398, "step": 61940 }, { "epoch": 17.584445075219982, "grad_norm": 6.811058521270752, "learning_rate": 1.2114674992903776e-05, "loss": 2.140699005126953, "step": 61950 }, { "epoch": 17.587283565143345, "grad_norm": 6.731261253356934, "learning_rate": 1.2100482543286971e-05, "loss": 
2.1155426025390627, "step": 61960 }, { "epoch": 17.590122055066704, "grad_norm": 6.324090480804443, "learning_rate": 1.2086290093670168e-05, "loss": 2.1091424942016603, "step": 61970 }, { "epoch": 17.592960544990063, "grad_norm": 6.392658710479736, "learning_rate": 1.2072097644053364e-05, "loss": 2.134625053405762, "step": 61980 }, { "epoch": 17.595799034913426, "grad_norm": 6.98992919921875, "learning_rate": 1.205790519443656e-05, "loss": 2.066788101196289, "step": 61990 }, { "epoch": 17.598637524836786, "grad_norm": 6.749881267547607, "learning_rate": 1.2043712744819756e-05, "loss": 2.0956933975219725, "step": 62000 }, { "epoch": 17.598637524836786, "eval_accuracy": 0.35683855789406754, "eval_loss": 2.4428958892822266, "eval_runtime": 51.2011, "eval_samples_per_second": 307.161, "eval_steps_per_second": 4.805, "step": 62000 }, { "epoch": 17.60147601476015, "grad_norm": 6.376654148101807, "learning_rate": 1.2029520295202953e-05, "loss": 2.124970054626465, "step": 62010 }, { "epoch": 17.604314504683508, "grad_norm": 6.517435073852539, "learning_rate": 1.2015327845586148e-05, "loss": 2.1723556518554688, "step": 62020 }, { "epoch": 17.60715299460687, "grad_norm": 6.471639633178711, "learning_rate": 1.2001135395969344e-05, "loss": 2.0671665191650392, "step": 62030 }, { "epoch": 17.60999148453023, "grad_norm": 6.404410362243652, "learning_rate": 1.1986942946352541e-05, "loss": 2.108059120178223, "step": 62040 }, { "epoch": 17.61282997445359, "grad_norm": 6.819919586181641, "learning_rate": 1.1972750496735736e-05, "loss": 2.152800369262695, "step": 62050 }, { "epoch": 17.615668464376952, "grad_norm": 6.341378688812256, "learning_rate": 1.1958558047118933e-05, "loss": 2.092339515686035, "step": 62060 }, { "epoch": 17.61850695430031, "grad_norm": 6.652298450469971, "learning_rate": 1.194436559750213e-05, "loss": 2.132748603820801, "step": 62070 }, { "epoch": 17.621345444223675, "grad_norm": 6.681756973266602, "learning_rate": 1.1930173147885326e-05, "loss": 
2.071816635131836, "step": 62080 }, { "epoch": 17.624183934147034, "grad_norm": 6.749481678009033, "learning_rate": 1.1915980698268523e-05, "loss": 2.115176773071289, "step": 62090 }, { "epoch": 17.627022424070393, "grad_norm": 6.323782920837402, "learning_rate": 1.1901788248651718e-05, "loss": 2.120757484436035, "step": 62100 }, { "epoch": 17.629860913993756, "grad_norm": 6.215813636779785, "learning_rate": 1.1887595799034915e-05, "loss": 2.1561826705932616, "step": 62110 }, { "epoch": 17.632699403917115, "grad_norm": 6.348634719848633, "learning_rate": 1.1873403349418111e-05, "loss": 2.0899133682250977, "step": 62120 }, { "epoch": 17.63553789384048, "grad_norm": 6.888927936553955, "learning_rate": 1.1859210899801306e-05, "loss": 2.103727340698242, "step": 62130 }, { "epoch": 17.638376383763838, "grad_norm": 6.396149158477783, "learning_rate": 1.1845018450184503e-05, "loss": 2.098347282409668, "step": 62140 }, { "epoch": 17.641214873687197, "grad_norm": 6.206672191619873, "learning_rate": 1.18308260005677e-05, "loss": 2.1292720794677735, "step": 62150 }, { "epoch": 17.64405336361056, "grad_norm": 6.644042015075684, "learning_rate": 1.1816633550950895e-05, "loss": 2.1082473754882813, "step": 62160 }, { "epoch": 17.64689185353392, "grad_norm": 6.220208644866943, "learning_rate": 1.1802441101334092e-05, "loss": 2.101627540588379, "step": 62170 }, { "epoch": 17.649730343457282, "grad_norm": 6.345632076263428, "learning_rate": 1.1788248651717287e-05, "loss": 2.0995386123657225, "step": 62180 }, { "epoch": 17.65256883338064, "grad_norm": 6.877683162689209, "learning_rate": 1.1774056202100483e-05, "loss": 2.1000545501708983, "step": 62190 }, { "epoch": 17.655407323304, "grad_norm": 6.509185791015625, "learning_rate": 1.175986375248368e-05, "loss": 2.0747501373291017, "step": 62200 }, { "epoch": 17.658245813227364, "grad_norm": 6.614027976989746, "learning_rate": 1.1745671302866875e-05, "loss": 2.1508062362670897, "step": 62210 }, { "epoch": 17.661084303150723, 
"grad_norm": 6.249119281768799, "learning_rate": 1.1731478853250072e-05, "loss": 2.1205255508422853, "step": 62220 }, { "epoch": 17.663922793074086, "grad_norm": 6.1780476570129395, "learning_rate": 1.1717286403633268e-05, "loss": 2.134262466430664, "step": 62230 }, { "epoch": 17.666761282997445, "grad_norm": 6.495049953460693, "learning_rate": 1.1703093954016463e-05, "loss": 2.100452995300293, "step": 62240 }, { "epoch": 17.669599772920805, "grad_norm": 6.608557224273682, "learning_rate": 1.168890150439966e-05, "loss": 2.079188919067383, "step": 62250 }, { "epoch": 17.672438262844167, "grad_norm": 6.531807899475098, "learning_rate": 1.1674709054782857e-05, "loss": 2.0295089721679687, "step": 62260 }, { "epoch": 17.675276752767527, "grad_norm": 6.471685886383057, "learning_rate": 1.1660516605166052e-05, "loss": 2.1274532318115233, "step": 62270 }, { "epoch": 17.67811524269089, "grad_norm": 6.070362567901611, "learning_rate": 1.1646324155549249e-05, "loss": 2.0479015350341796, "step": 62280 }, { "epoch": 17.68095373261425, "grad_norm": 6.465672492980957, "learning_rate": 1.1632131705932445e-05, "loss": 2.073362922668457, "step": 62290 }, { "epoch": 17.68379222253761, "grad_norm": 6.38710880279541, "learning_rate": 1.161793925631564e-05, "loss": 2.1222991943359375, "step": 62300 }, { "epoch": 17.68663071246097, "grad_norm": 6.660451412200928, "learning_rate": 1.1603746806698837e-05, "loss": 2.137079429626465, "step": 62310 }, { "epoch": 17.68946920238433, "grad_norm": 6.383259296417236, "learning_rate": 1.1589554357082034e-05, "loss": 2.093511772155762, "step": 62320 }, { "epoch": 17.692307692307693, "grad_norm": 6.663867473602295, "learning_rate": 1.1575361907465229e-05, "loss": 2.101418304443359, "step": 62330 }, { "epoch": 17.695146182231053, "grad_norm": 6.22501277923584, "learning_rate": 1.1561169457848425e-05, "loss": 2.168321990966797, "step": 62340 }, { "epoch": 17.697984672154412, "grad_norm": 6.484969615936279, "learning_rate": 1.1546977008231622e-05, 
"loss": 2.1377212524414064, "step": 62350 }, { "epoch": 17.700823162077775, "grad_norm": 6.644406795501709, "learning_rate": 1.1532784558614817e-05, "loss": 2.095628356933594, "step": 62360 }, { "epoch": 17.703661652001134, "grad_norm": 6.339768409729004, "learning_rate": 1.1518592108998014e-05, "loss": 2.1387535095214845, "step": 62370 }, { "epoch": 17.706500141924497, "grad_norm": 6.6202921867370605, "learning_rate": 1.150439965938121e-05, "loss": 2.1307350158691407, "step": 62380 }, { "epoch": 17.709338631847857, "grad_norm": 6.732110977172852, "learning_rate": 1.1490207209764406e-05, "loss": 2.1773937225341795, "step": 62390 }, { "epoch": 17.71217712177122, "grad_norm": 6.425174713134766, "learning_rate": 1.1476014760147602e-05, "loss": 2.0811172485351563, "step": 62400 }, { "epoch": 17.71501561169458, "grad_norm": 6.498219013214111, "learning_rate": 1.1461822310530797e-05, "loss": 2.0715301513671873, "step": 62410 }, { "epoch": 17.717854101617938, "grad_norm": 6.433671474456787, "learning_rate": 1.1447629860913994e-05, "loss": 2.0483413696289063, "step": 62420 }, { "epoch": 17.7206925915413, "grad_norm": 6.634025573730469, "learning_rate": 1.143343741129719e-05, "loss": 2.1989118576049806, "step": 62430 }, { "epoch": 17.72353108146466, "grad_norm": 6.534836769104004, "learning_rate": 1.1419244961680386e-05, "loss": 2.151072311401367, "step": 62440 }, { "epoch": 17.726369571388023, "grad_norm": 6.673799991607666, "learning_rate": 1.1405052512063583e-05, "loss": 2.0957855224609374, "step": 62450 }, { "epoch": 17.729208061311382, "grad_norm": 6.244089603424072, "learning_rate": 1.139086006244678e-05, "loss": 2.207863998413086, "step": 62460 }, { "epoch": 17.732046551234742, "grad_norm": 6.703178405761719, "learning_rate": 1.1376667612829974e-05, "loss": 2.1555816650390627, "step": 62470 }, { "epoch": 17.734885041158105, "grad_norm": 6.189230442047119, "learning_rate": 1.1362475163213171e-05, "loss": 2.083062171936035, "step": 62480 }, { "epoch": 
17.737723531081464, "grad_norm": 6.227758884429932, "learning_rate": 1.1348282713596368e-05, "loss": 2.1889543533325195, "step": 62490 }, { "epoch": 17.740562021004827, "grad_norm": 6.527083873748779, "learning_rate": 1.1334090263979563e-05, "loss": 2.0995269775390626, "step": 62500 }, { "epoch": 17.740562021004827, "eval_accuracy": 0.35461308577605394, "eval_loss": 2.441162586212158, "eval_runtime": 51.778, "eval_samples_per_second": 303.739, "eval_steps_per_second": 4.751, "step": 62500 }, { "epoch": 17.743400510928186, "grad_norm": 6.392878532409668, "learning_rate": 1.131989781436276e-05, "loss": 2.047053909301758, "step": 62510 }, { "epoch": 17.746239000851546, "grad_norm": 6.368244647979736, "learning_rate": 1.1305705364745956e-05, "loss": 2.0685482025146484, "step": 62520 }, { "epoch": 17.74907749077491, "grad_norm": 6.285984992980957, "learning_rate": 1.1291512915129151e-05, "loss": 2.0608137130737303, "step": 62530 }, { "epoch": 17.751915980698268, "grad_norm": 6.674759864807129, "learning_rate": 1.1277320465512348e-05, "loss": 2.027962875366211, "step": 62540 }, { "epoch": 17.75475447062163, "grad_norm": 6.278096675872803, "learning_rate": 1.1263128015895545e-05, "loss": 2.1158767700195313, "step": 62550 }, { "epoch": 17.75759296054499, "grad_norm": 6.894362926483154, "learning_rate": 1.124893556627874e-05, "loss": 2.1987579345703123, "step": 62560 }, { "epoch": 17.76043145046835, "grad_norm": 6.238091468811035, "learning_rate": 1.1234743116661936e-05, "loss": 2.1110107421875, "step": 62570 }, { "epoch": 17.763269940391712, "grad_norm": 6.319118976593018, "learning_rate": 1.1220550667045133e-05, "loss": 2.1410470962524415, "step": 62580 }, { "epoch": 17.76610843031507, "grad_norm": 6.543454170227051, "learning_rate": 1.1206358217428328e-05, "loss": 2.1182235717773437, "step": 62590 }, { "epoch": 17.768946920238434, "grad_norm": 6.128547191619873, "learning_rate": 1.1192165767811525e-05, "loss": 2.074143981933594, "step": 62600 }, { "epoch": 
17.771785410161794, "grad_norm": 6.434366703033447, "learning_rate": 1.117797331819472e-05, "loss": 2.1339916229248046, "step": 62610 }, { "epoch": 17.774623900085153, "grad_norm": 6.233227729797363, "learning_rate": 1.1163780868577916e-05, "loss": 2.1954557418823244, "step": 62620 }, { "epoch": 17.777462390008516, "grad_norm": 6.920441627502441, "learning_rate": 1.1149588418961113e-05, "loss": 2.0378114700317385, "step": 62630 }, { "epoch": 17.780300879931875, "grad_norm": 6.545797348022461, "learning_rate": 1.1135395969344308e-05, "loss": 2.03729133605957, "step": 62640 }, { "epoch": 17.78313936985524, "grad_norm": 6.350860595703125, "learning_rate": 1.1121203519727505e-05, "loss": 2.0993072509765627, "step": 62650 }, { "epoch": 17.785977859778598, "grad_norm": 6.334438323974609, "learning_rate": 1.1107011070110702e-05, "loss": 2.039194107055664, "step": 62660 }, { "epoch": 17.788816349701957, "grad_norm": 6.010382652282715, "learning_rate": 1.1092818620493897e-05, "loss": 2.0882900238037108, "step": 62670 }, { "epoch": 17.79165483962532, "grad_norm": 6.4919047355651855, "learning_rate": 1.1078626170877093e-05, "loss": 2.1535520553588867, "step": 62680 }, { "epoch": 17.79449332954868, "grad_norm": 6.69216251373291, "learning_rate": 1.106443372126029e-05, "loss": 2.0225353240966797, "step": 62690 }, { "epoch": 17.797331819472042, "grad_norm": 6.81975793838501, "learning_rate": 1.1050241271643485e-05, "loss": 2.1525917053222656, "step": 62700 }, { "epoch": 17.8001703093954, "grad_norm": 6.275907039642334, "learning_rate": 1.1036048822026682e-05, "loss": 2.1992746353149415, "step": 62710 }, { "epoch": 17.80300879931876, "grad_norm": 6.238183498382568, "learning_rate": 1.1021856372409878e-05, "loss": 2.080526351928711, "step": 62720 }, { "epoch": 17.805847289242124, "grad_norm": 6.350930213928223, "learning_rate": 1.1007663922793075e-05, "loss": 2.0289411544799805, "step": 62730 }, { "epoch": 17.808685779165483, "grad_norm": 6.690194606781006, "learning_rate": 
1.0993471473176272e-05, "loss": 2.0856876373291016, "step": 62740 }, { "epoch": 17.811524269088846, "grad_norm": 5.957854270935059, "learning_rate": 1.0979279023559467e-05, "loss": 2.0787368774414063, "step": 62750 }, { "epoch": 17.814362759012205, "grad_norm": 6.482333183288574, "learning_rate": 1.0965086573942664e-05, "loss": 2.153191566467285, "step": 62760 }, { "epoch": 17.817201248935568, "grad_norm": 6.304389953613281, "learning_rate": 1.095089412432586e-05, "loss": 2.0626110076904296, "step": 62770 }, { "epoch": 17.820039738858927, "grad_norm": 6.347423553466797, "learning_rate": 1.0936701674709055e-05, "loss": 2.155375289916992, "step": 62780 }, { "epoch": 17.822878228782287, "grad_norm": 6.891338348388672, "learning_rate": 1.0922509225092252e-05, "loss": 2.140969657897949, "step": 62790 }, { "epoch": 17.82571671870565, "grad_norm": 6.408513069152832, "learning_rate": 1.0908316775475449e-05, "loss": 2.057811737060547, "step": 62800 }, { "epoch": 17.82855520862901, "grad_norm": 6.338256359100342, "learning_rate": 1.0894124325858644e-05, "loss": 2.1286409378051756, "step": 62810 }, { "epoch": 17.83139369855237, "grad_norm": 6.445232391357422, "learning_rate": 1.087993187624184e-05, "loss": 2.0191987991333007, "step": 62820 }, { "epoch": 17.83423218847573, "grad_norm": 6.347233772277832, "learning_rate": 1.0865739426625037e-05, "loss": 2.218991279602051, "step": 62830 }, { "epoch": 17.83707067839909, "grad_norm": 6.696002960205078, "learning_rate": 1.0851546977008232e-05, "loss": 2.1457145690917967, "step": 62840 }, { "epoch": 17.839909168322453, "grad_norm": 6.479933738708496, "learning_rate": 1.0837354527391429e-05, "loss": 2.067269515991211, "step": 62850 }, { "epoch": 17.842747658245813, "grad_norm": 5.991268634796143, "learning_rate": 1.0823162077774626e-05, "loss": 2.0729122161865234, "step": 62860 }, { "epoch": 17.845586148169176, "grad_norm": 6.131773471832275, "learning_rate": 1.080896962815782e-05, "loss": 2.1122714996337892, "step": 62870 }, { 
"epoch": 17.848424638092535, "grad_norm": 6.7979607582092285, "learning_rate": 1.0794777178541017e-05, "loss": 2.118095588684082, "step": 62880 }, { "epoch": 17.851263128015894, "grad_norm": 6.285269260406494, "learning_rate": 1.0780584728924214e-05, "loss": 2.1037553787231444, "step": 62890 }, { "epoch": 17.854101617939257, "grad_norm": 6.731152534484863, "learning_rate": 1.0766392279307409e-05, "loss": 2.036783790588379, "step": 62900 }, { "epoch": 17.856940107862616, "grad_norm": 6.280046463012695, "learning_rate": 1.0752199829690606e-05, "loss": 2.090911102294922, "step": 62910 }, { "epoch": 17.85977859778598, "grad_norm": 6.475110054016113, "learning_rate": 1.07380073800738e-05, "loss": 2.170290946960449, "step": 62920 }, { "epoch": 17.86261708770934, "grad_norm": 6.372335910797119, "learning_rate": 1.0723814930456997e-05, "loss": 2.121186447143555, "step": 62930 }, { "epoch": 17.865455577632698, "grad_norm": 6.595027446746826, "learning_rate": 1.0709622480840194e-05, "loss": 2.0770475387573244, "step": 62940 }, { "epoch": 17.86829406755606, "grad_norm": 6.465315341949463, "learning_rate": 1.069543003122339e-05, "loss": 2.1906665802001952, "step": 62950 }, { "epoch": 17.87113255747942, "grad_norm": 6.402523517608643, "learning_rate": 1.0681237581606586e-05, "loss": 2.1078609466552733, "step": 62960 }, { "epoch": 17.873971047402783, "grad_norm": 6.4219865798950195, "learning_rate": 1.0667045131989783e-05, "loss": 2.133266830444336, "step": 62970 }, { "epoch": 17.876809537326142, "grad_norm": 6.653850078582764, "learning_rate": 1.0652852682372978e-05, "loss": 2.124836730957031, "step": 62980 }, { "epoch": 17.8796480272495, "grad_norm": 6.39287805557251, "learning_rate": 1.0638660232756174e-05, "loss": 2.056980514526367, "step": 62990 }, { "epoch": 17.882486517172865, "grad_norm": 6.565313816070557, "learning_rate": 1.0624467783139371e-05, "loss": 2.130046844482422, "step": 63000 }, { "epoch": 17.882486517172865, "eval_accuracy": 0.3585553506708209, "eval_loss": 
2.4404516220092773, "eval_runtime": 52.5684, "eval_samples_per_second": 299.172, "eval_steps_per_second": 4.68, "step": 63000 }, { "epoch": 17.885325007096224, "grad_norm": 6.337362289428711, "learning_rate": 1.0610275333522566e-05, "loss": 2.091939926147461, "step": 63010 }, { "epoch": 17.888163497019587, "grad_norm": 6.520432472229004, "learning_rate": 1.0596082883905763e-05, "loss": 2.091647720336914, "step": 63020 }, { "epoch": 17.891001986942946, "grad_norm": 6.964673042297363, "learning_rate": 1.058189043428896e-05, "loss": 2.0611019134521484, "step": 63030 }, { "epoch": 17.893840476866306, "grad_norm": 6.2380828857421875, "learning_rate": 1.0567697984672154e-05, "loss": 2.038481521606445, "step": 63040 }, { "epoch": 17.89667896678967, "grad_norm": 6.404336452484131, "learning_rate": 1.0553505535055351e-05, "loss": 2.1166139602661134, "step": 63050 }, { "epoch": 17.899517456713028, "grad_norm": 6.386620044708252, "learning_rate": 1.0539313085438548e-05, "loss": 2.130527114868164, "step": 63060 }, { "epoch": 17.90235594663639, "grad_norm": 6.360386371612549, "learning_rate": 1.0525120635821743e-05, "loss": 2.0726099014282227, "step": 63070 }, { "epoch": 17.90519443655975, "grad_norm": 6.517320156097412, "learning_rate": 1.051092818620494e-05, "loss": 2.097260093688965, "step": 63080 }, { "epoch": 17.90803292648311, "grad_norm": 6.656134128570557, "learning_rate": 1.0496735736588136e-05, "loss": 2.137518310546875, "step": 63090 }, { "epoch": 17.910871416406472, "grad_norm": 6.614232540130615, "learning_rate": 1.0482543286971331e-05, "loss": 2.1531936645507814, "step": 63100 }, { "epoch": 17.91370990632983, "grad_norm": 6.01057243347168, "learning_rate": 1.0468350837354528e-05, "loss": 2.0314609527587892, "step": 63110 }, { "epoch": 17.916548396253194, "grad_norm": 6.994449138641357, "learning_rate": 1.0454158387737723e-05, "loss": 2.114409828186035, "step": 63120 }, { "epoch": 17.919386886176554, "grad_norm": 6.335590839385986, "learning_rate": 
1.043996593812092e-05, "loss": 2.088612747192383, "step": 63130 }, { "epoch": 17.922225376099917, "grad_norm": 6.356225490570068, "learning_rate": 1.0425773488504116e-05, "loss": 2.059055137634277, "step": 63140 }, { "epoch": 17.925063866023276, "grad_norm": 6.426771640777588, "learning_rate": 1.0411581038887311e-05, "loss": 2.138066864013672, "step": 63150 }, { "epoch": 17.927902355946635, "grad_norm": 6.959150791168213, "learning_rate": 1.0397388589270508e-05, "loss": 2.1032257080078125, "step": 63160 }, { "epoch": 17.930740845869998, "grad_norm": 6.650171756744385, "learning_rate": 1.0383196139653705e-05, "loss": 2.1010398864746094, "step": 63170 }, { "epoch": 17.933579335793358, "grad_norm": 6.3786163330078125, "learning_rate": 1.03690036900369e-05, "loss": 2.016551208496094, "step": 63180 }, { "epoch": 17.93641782571672, "grad_norm": 6.7219061851501465, "learning_rate": 1.0354811240420097e-05, "loss": 2.1433479309082033, "step": 63190 }, { "epoch": 17.93925631564008, "grad_norm": 6.410816669464111, "learning_rate": 1.0340618790803293e-05, "loss": 2.1368078231811523, "step": 63200 }, { "epoch": 17.94209480556344, "grad_norm": 6.285545349121094, "learning_rate": 1.0326426341186488e-05, "loss": 2.111435890197754, "step": 63210 }, { "epoch": 17.944933295486802, "grad_norm": 6.2650957107543945, "learning_rate": 1.0312233891569685e-05, "loss": 2.211384963989258, "step": 63220 }, { "epoch": 17.94777178541016, "grad_norm": 6.543778419494629, "learning_rate": 1.0298041441952882e-05, "loss": 2.162944030761719, "step": 63230 }, { "epoch": 17.950610275333524, "grad_norm": 6.092548370361328, "learning_rate": 1.0283848992336077e-05, "loss": 2.072039794921875, "step": 63240 }, { "epoch": 17.953448765256883, "grad_norm": 6.61891508102417, "learning_rate": 1.0269656542719274e-05, "loss": 2.1518577575683593, "step": 63250 }, { "epoch": 17.956287255180243, "grad_norm": 6.142733573913574, "learning_rate": 1.025546409310247e-05, "loss": 2.076502799987793, "step": 63260 }, { 
"epoch": 17.959125745103606, "grad_norm": 6.706806182861328, "learning_rate": 1.0241271643485665e-05, "loss": 2.070616340637207, "step": 63270 }, { "epoch": 17.961964235026965, "grad_norm": 6.728141784667969, "learning_rate": 1.0227079193868862e-05, "loss": 2.1129316329956054, "step": 63280 }, { "epoch": 17.964802724950328, "grad_norm": 6.458790302276611, "learning_rate": 1.0212886744252059e-05, "loss": 2.099974822998047, "step": 63290 }, { "epoch": 17.967641214873687, "grad_norm": 6.409369468688965, "learning_rate": 1.0198694294635254e-05, "loss": 2.108241653442383, "step": 63300 }, { "epoch": 17.970479704797047, "grad_norm": 6.173311233520508, "learning_rate": 1.018450184501845e-05, "loss": 2.094637870788574, "step": 63310 }, { "epoch": 17.97331819472041, "grad_norm": 6.641015529632568, "learning_rate": 1.0170309395401645e-05, "loss": 2.0785453796386717, "step": 63320 }, { "epoch": 17.97615668464377, "grad_norm": 6.626389980316162, "learning_rate": 1.0156116945784842e-05, "loss": 2.129155731201172, "step": 63330 }, { "epoch": 17.97899517456713, "grad_norm": 7.132336616516113, "learning_rate": 1.0141924496168039e-05, "loss": 2.0705841064453123, "step": 63340 }, { "epoch": 17.98183366449049, "grad_norm": 6.520066738128662, "learning_rate": 1.0127732046551234e-05, "loss": 2.0977432250976564, "step": 63350 }, { "epoch": 17.98467215441385, "grad_norm": 6.79781436920166, "learning_rate": 1.011353959693443e-05, "loss": 2.1410919189453126, "step": 63360 }, { "epoch": 17.987510644337213, "grad_norm": 6.531245231628418, "learning_rate": 1.0099347147317627e-05, "loss": 2.166928291320801, "step": 63370 }, { "epoch": 17.990349134260573, "grad_norm": 6.658928871154785, "learning_rate": 1.0085154697700824e-05, "loss": 2.0878250122070314, "step": 63380 }, { "epoch": 17.993187624183935, "grad_norm": 6.4615864753723145, "learning_rate": 1.007096224808402e-05, "loss": 2.1053340911865233, "step": 63390 }, { "epoch": 17.996026114107295, "grad_norm": 6.58988094329834, "learning_rate": 
1.0056769798467217e-05, "loss": 2.093967628479004, "step": 63400 }, { "epoch": 17.998864604030654, "grad_norm": 6.476963520050049, "learning_rate": 1.0042577348850412e-05, "loss": 2.0532146453857423, "step": 63410 }, { "epoch": 18.001703093954017, "grad_norm": 6.164493083953857, "learning_rate": 1.0029804144195288e-05, "loss": 2.0882843017578123, "step": 63420 }, { "epoch": 18.004541583877376, "grad_norm": 6.418278217315674, "learning_rate": 1.0015611694578485e-05, "loss": 2.0705690383911133, "step": 63430 }, { "epoch": 18.00738007380074, "grad_norm": 6.36760139465332, "learning_rate": 1.000141924496168e-05, "loss": 2.2015239715576174, "step": 63440 }, { "epoch": 18.0102185637241, "grad_norm": 6.614510536193848, "learning_rate": 9.987226795344877e-06, "loss": 2.162152862548828, "step": 63450 }, { "epoch": 18.013057053647458, "grad_norm": 6.39099645614624, "learning_rate": 9.973034345728074e-06, "loss": 2.1069665908813477, "step": 63460 }, { "epoch": 18.01589554357082, "grad_norm": 6.397565841674805, "learning_rate": 9.958841896111269e-06, "loss": 2.1168096542358397, "step": 63470 }, { "epoch": 18.01873403349418, "grad_norm": 6.508903980255127, "learning_rate": 9.944649446494465e-06, "loss": 2.033406448364258, "step": 63480 }, { "epoch": 18.021572523417543, "grad_norm": 6.090424060821533, "learning_rate": 9.930456996877662e-06, "loss": 2.044053649902344, "step": 63490 }, { "epoch": 18.024411013340902, "grad_norm": 6.239316463470459, "learning_rate": 9.916264547260857e-06, "loss": 2.0727422714233397, "step": 63500 }, { "epoch": 18.024411013340902, "eval_accuracy": 0.359763464106314, "eval_loss": 2.4389302730560303, "eval_runtime": 52.4518, "eval_samples_per_second": 299.837, "eval_steps_per_second": 4.69, "step": 63500 }, { "epoch": 18.02724950326426, "grad_norm": 6.599865436553955, "learning_rate": 9.902072097644054e-06, "loss": 2.0814117431640624, "step": 63510 }, { "epoch": 18.030087993187625, "grad_norm": 6.118131160736084, "learning_rate": 9.88787964802725e-06, 
"loss": 2.0428306579589846, "step": 63520 }, { "epoch": 18.032926483110984, "grad_norm": 6.398134708404541, "learning_rate": 9.873687198410445e-06, "loss": 2.126425552368164, "step": 63530 }, { "epoch": 18.035764973034347, "grad_norm": 6.469013214111328, "learning_rate": 9.859494748793642e-06, "loss": 2.0950273513793944, "step": 63540 }, { "epoch": 18.038603462957706, "grad_norm": 6.547028064727783, "learning_rate": 9.845302299176839e-06, "loss": 2.1130142211914062, "step": 63550 }, { "epoch": 18.04144195288107, "grad_norm": 6.339157581329346, "learning_rate": 9.831109849560034e-06, "loss": 2.1505338668823244, "step": 63560 }, { "epoch": 18.04428044280443, "grad_norm": 6.431417465209961, "learning_rate": 9.81691739994323e-06, "loss": 2.1418148040771485, "step": 63570 }, { "epoch": 18.047118932727788, "grad_norm": 6.210983753204346, "learning_rate": 9.802724950326427e-06, "loss": 2.102730560302734, "step": 63580 }, { "epoch": 18.04995742265115, "grad_norm": 6.8515424728393555, "learning_rate": 9.788532500709622e-06, "loss": 2.110096740722656, "step": 63590 }, { "epoch": 18.05279591257451, "grad_norm": 6.167492389678955, "learning_rate": 9.774340051092819e-06, "loss": 2.031091499328613, "step": 63600 }, { "epoch": 18.055634402497873, "grad_norm": 6.508444786071777, "learning_rate": 9.760147601476014e-06, "loss": 2.0987264633178713, "step": 63610 }, { "epoch": 18.058472892421232, "grad_norm": 7.035311698913574, "learning_rate": 9.74595515185921e-06, "loss": 2.168170928955078, "step": 63620 }, { "epoch": 18.06131138234459, "grad_norm": 6.4163079261779785, "learning_rate": 9.731762702242407e-06, "loss": 2.0439781188964843, "step": 63630 }, { "epoch": 18.064149872267954, "grad_norm": 5.934492588043213, "learning_rate": 9.717570252625602e-06, "loss": 2.0320690155029295, "step": 63640 }, { "epoch": 18.066988362191314, "grad_norm": 6.636424541473389, "learning_rate": 9.7033778030088e-06, "loss": 2.1580657958984375, "step": 63650 }, { "epoch": 18.069826852114677, 
"grad_norm": 6.393188953399658, "learning_rate": 9.689185353391996e-06, "loss": 2.153169059753418, "step": 63660 }, { "epoch": 18.072665342038036, "grad_norm": 6.121419429779053, "learning_rate": 9.674992903775191e-06, "loss": 2.0708520889282225, "step": 63670 }, { "epoch": 18.075503831961395, "grad_norm": 6.285571575164795, "learning_rate": 9.660800454158388e-06, "loss": 2.044774627685547, "step": 63680 }, { "epoch": 18.078342321884758, "grad_norm": 6.161478042602539, "learning_rate": 9.646608004541584e-06, "loss": 1.9754058837890625, "step": 63690 }, { "epoch": 18.081180811808117, "grad_norm": 6.430078506469727, "learning_rate": 9.63241555492478e-06, "loss": 2.2030828475952147, "step": 63700 }, { "epoch": 18.08401930173148, "grad_norm": 6.619796276092529, "learning_rate": 9.618223105307976e-06, "loss": 2.0872983932495117, "step": 63710 }, { "epoch": 18.08685779165484, "grad_norm": 6.63834810256958, "learning_rate": 9.604030655691173e-06, "loss": 2.0685806274414062, "step": 63720 }, { "epoch": 18.0896962815782, "grad_norm": 6.437267780303955, "learning_rate": 9.589838206074368e-06, "loss": 2.1572410583496096, "step": 63730 }, { "epoch": 18.092534771501562, "grad_norm": 6.28673791885376, "learning_rate": 9.575645756457564e-06, "loss": 2.0788612365722656, "step": 63740 }, { "epoch": 18.09537326142492, "grad_norm": 6.086430072784424, "learning_rate": 9.561453306840761e-06, "loss": 2.034844398498535, "step": 63750 }, { "epoch": 18.098211751348284, "grad_norm": 6.514573097229004, "learning_rate": 9.547260857223956e-06, "loss": 2.1472505569458007, "step": 63760 }, { "epoch": 18.101050241271643, "grad_norm": 6.2951507568359375, "learning_rate": 9.533068407607155e-06, "loss": 2.017036247253418, "step": 63770 }, { "epoch": 18.103888731195003, "grad_norm": 5.984198570251465, "learning_rate": 9.51887595799035e-06, "loss": 2.0198436737060548, "step": 63780 }, { "epoch": 18.106727221118366, "grad_norm": 6.395943641662598, "learning_rate": 9.504683508373546e-06, "loss": 
2.0169227600097654, "step": 63790 }, { "epoch": 18.109565711041725, "grad_norm": 6.28702974319458, "learning_rate": 9.490491058756743e-06, "loss": 2.069470977783203, "step": 63800 }, { "epoch": 18.112404200965088, "grad_norm": 6.2268195152282715, "learning_rate": 9.476298609139938e-06, "loss": 2.0981927871704102, "step": 63810 }, { "epoch": 18.115242690888447, "grad_norm": 6.582431316375732, "learning_rate": 9.462106159523135e-06, "loss": 2.0918197631835938, "step": 63820 }, { "epoch": 18.118081180811807, "grad_norm": 6.809426307678223, "learning_rate": 9.447913709906331e-06, "loss": 2.131814384460449, "step": 63830 }, { "epoch": 18.12091967073517, "grad_norm": 6.060179233551025, "learning_rate": 9.433721260289526e-06, "loss": 1.976340103149414, "step": 63840 }, { "epoch": 18.12375816065853, "grad_norm": 6.392913341522217, "learning_rate": 9.419528810672723e-06, "loss": 2.0465713500976563, "step": 63850 }, { "epoch": 18.12659665058189, "grad_norm": 6.645934104919434, "learning_rate": 9.40533636105592e-06, "loss": 2.1120187759399416, "step": 63860 }, { "epoch": 18.12943514050525, "grad_norm": 6.205315113067627, "learning_rate": 9.391143911439115e-06, "loss": 2.1010929107666017, "step": 63870 }, { "epoch": 18.13227363042861, "grad_norm": 6.191686630249023, "learning_rate": 9.376951461822312e-06, "loss": 2.114798355102539, "step": 63880 }, { "epoch": 18.135112120351973, "grad_norm": 6.364065170288086, "learning_rate": 9.362759012205508e-06, "loss": 2.1027284622192384, "step": 63890 }, { "epoch": 18.137950610275333, "grad_norm": 6.504979610443115, "learning_rate": 9.348566562588703e-06, "loss": 2.0952743530273437, "step": 63900 }, { "epoch": 18.140789100198695, "grad_norm": 6.273201942443848, "learning_rate": 9.3343741129719e-06, "loss": 2.0779260635375976, "step": 63910 }, { "epoch": 18.143627590122055, "grad_norm": 6.484992504119873, "learning_rate": 9.320181663355095e-06, "loss": 2.022547149658203, "step": 63920 }, { "epoch": 18.146466080045414, "grad_norm": 
6.402980804443359, "learning_rate": 9.305989213738292e-06, "loss": 2.1657886505126953, "step": 63930 }, { "epoch": 18.149304569968777, "grad_norm": 6.370507717132568, "learning_rate": 9.291796764121488e-06, "loss": 2.0940494537353516, "step": 63940 }, { "epoch": 18.152143059892136, "grad_norm": 6.193453788757324, "learning_rate": 9.277604314504684e-06, "loss": 2.024892234802246, "step": 63950 }, { "epoch": 18.1549815498155, "grad_norm": 6.6371169090271, "learning_rate": 9.26341186488788e-06, "loss": 2.1246358871459963, "step": 63960 }, { "epoch": 18.15782003973886, "grad_norm": 6.125627040863037, "learning_rate": 9.249219415271077e-06, "loss": 2.0992202758789062, "step": 63970 }, { "epoch": 18.16065852966222, "grad_norm": 6.215463161468506, "learning_rate": 9.235026965654272e-06, "loss": 2.077500915527344, "step": 63980 }, { "epoch": 18.16349701958558, "grad_norm": 6.50735330581665, "learning_rate": 9.220834516037469e-06, "loss": 2.0891895294189453, "step": 63990 }, { "epoch": 18.16633550950894, "grad_norm": 6.543880939483643, "learning_rate": 9.206642066420665e-06, "loss": 2.1059215545654295, "step": 64000 }, { "epoch": 18.16633550950894, "eval_accuracy": 0.3606536529535194, "eval_loss": 2.4384875297546387, "eval_runtime": 49.1624, "eval_samples_per_second": 319.899, "eval_steps_per_second": 5.004, "step": 64000 }, { "epoch": 18.169173999432303, "grad_norm": 6.338160514831543, "learning_rate": 9.19244961680386e-06, "loss": 2.1480350494384766, "step": 64010 }, { "epoch": 18.172012489355662, "grad_norm": 6.6628828048706055, "learning_rate": 9.178257167187057e-06, "loss": 2.1649801254272463, "step": 64020 }, { "epoch": 18.174850979279025, "grad_norm": 6.633697986602783, "learning_rate": 9.164064717570254e-06, "loss": 2.0272260665893556, "step": 64030 }, { "epoch": 18.177689469202384, "grad_norm": 6.202208518981934, "learning_rate": 9.149872267953449e-06, "loss": 2.0897375106811524, "step": 64040 }, { "epoch": 18.180527959125744, "grad_norm": 6.835944175720215, 
"learning_rate": 9.135679818336646e-06, "loss": 2.0272890090942384, "step": 64050 }, { "epoch": 18.183366449049107, "grad_norm": 6.188246726989746, "learning_rate": 9.121487368719842e-06, "loss": 2.1155670166015623, "step": 64060 }, { "epoch": 18.186204938972466, "grad_norm": 6.548593521118164, "learning_rate": 9.107294919103037e-06, "loss": 2.1051462173461912, "step": 64070 }, { "epoch": 18.18904342889583, "grad_norm": 6.168612003326416, "learning_rate": 9.093102469486234e-06, "loss": 2.0781270980834963, "step": 64080 }, { "epoch": 18.19188191881919, "grad_norm": 6.221258163452148, "learning_rate": 9.07891001986943e-06, "loss": 2.0354934692382813, "step": 64090 }, { "epoch": 18.194720408742548, "grad_norm": 6.0121893882751465, "learning_rate": 9.064717570252626e-06, "loss": 2.0815744400024414, "step": 64100 }, { "epoch": 18.19755889866591, "grad_norm": 6.3140435218811035, "learning_rate": 9.050525120635822e-06, "loss": 2.0615325927734376, "step": 64110 }, { "epoch": 18.20039738858927, "grad_norm": 6.585452079772949, "learning_rate": 9.036332671019019e-06, "loss": 2.0956295013427733, "step": 64120 }, { "epoch": 18.203235878512633, "grad_norm": 6.1329121589660645, "learning_rate": 9.022140221402214e-06, "loss": 2.137342071533203, "step": 64130 }, { "epoch": 18.206074368435992, "grad_norm": 6.295035362243652, "learning_rate": 9.00794777178541e-06, "loss": 2.0554477691650392, "step": 64140 }, { "epoch": 18.20891285835935, "grad_norm": 6.219274997711182, "learning_rate": 8.993755322168606e-06, "loss": 2.1262977600097654, "step": 64150 }, { "epoch": 18.211751348282714, "grad_norm": 6.520815372467041, "learning_rate": 8.979562872551803e-06, "loss": 2.0742679595947267, "step": 64160 }, { "epoch": 18.214589838206074, "grad_norm": 6.872211456298828, "learning_rate": 8.965370422935e-06, "loss": 2.071143913269043, "step": 64170 }, { "epoch": 18.217428328129436, "grad_norm": 6.066454887390137, "learning_rate": 8.951177973318194e-06, "loss": 2.0832542419433593, "step": 64180 }, 
{ "epoch": 18.220266818052796, "grad_norm": 6.376120090484619, "learning_rate": 8.936985523701391e-06, "loss": 2.109826850891113, "step": 64190 }, { "epoch": 18.223105307976155, "grad_norm": 6.779813766479492, "learning_rate": 8.922793074084588e-06, "loss": 2.0581398010253906, "step": 64200 }, { "epoch": 18.225943797899518, "grad_norm": 6.174276351928711, "learning_rate": 8.908600624467783e-06, "loss": 2.017460250854492, "step": 64210 }, { "epoch": 18.228782287822877, "grad_norm": 6.256805896759033, "learning_rate": 8.89440817485098e-06, "loss": 2.067185974121094, "step": 64220 }, { "epoch": 18.23162077774624, "grad_norm": 6.7016825675964355, "learning_rate": 8.880215725234176e-06, "loss": 2.0979183197021483, "step": 64230 }, { "epoch": 18.2344592676696, "grad_norm": 6.474767208099365, "learning_rate": 8.866023275617371e-06, "loss": 2.128011703491211, "step": 64240 }, { "epoch": 18.23729775759296, "grad_norm": 6.2146525382995605, "learning_rate": 8.851830826000568e-06, "loss": 2.0917430877685548, "step": 64250 }, { "epoch": 18.240136247516322, "grad_norm": 6.581207752227783, "learning_rate": 8.837638376383765e-06, "loss": 2.141128730773926, "step": 64260 }, { "epoch": 18.24297473743968, "grad_norm": 6.295041561126709, "learning_rate": 8.82344592676696e-06, "loss": 2.0590795516967773, "step": 64270 }, { "epoch": 18.245813227363044, "grad_norm": 6.391772270202637, "learning_rate": 8.809253477150156e-06, "loss": 2.1161872863769533, "step": 64280 }, { "epoch": 18.248651717286403, "grad_norm": 6.5177435874938965, "learning_rate": 8.795061027533353e-06, "loss": 2.104078483581543, "step": 64290 }, { "epoch": 18.251490207209763, "grad_norm": 6.321416854858398, "learning_rate": 8.780868577916548e-06, "loss": 2.1144891738891602, "step": 64300 }, { "epoch": 18.254328697133126, "grad_norm": 6.494858264923096, "learning_rate": 8.766676128299745e-06, "loss": 2.026073455810547, "step": 64310 }, { "epoch": 18.257167187056485, "grad_norm": 6.408641815185547, "learning_rate": 
8.752483678682941e-06, "loss": 2.151146125793457, "step": 64320 }, { "epoch": 18.260005676979848, "grad_norm": 5.929804801940918, "learning_rate": 8.738291229066136e-06, "loss": 2.073348045349121, "step": 64330 }, { "epoch": 18.262844166903207, "grad_norm": 6.131336688995361, "learning_rate": 8.724098779449333e-06, "loss": 2.082143211364746, "step": 64340 }, { "epoch": 18.26568265682657, "grad_norm": 6.617753505706787, "learning_rate": 8.709906329832528e-06, "loss": 2.1760244369506836, "step": 64350 }, { "epoch": 18.26852114674993, "grad_norm": 6.325116157531738, "learning_rate": 8.695713880215725e-06, "loss": 2.119493865966797, "step": 64360 }, { "epoch": 18.27135963667329, "grad_norm": 6.183959484100342, "learning_rate": 8.681521430598922e-06, "loss": 2.0323320388793946, "step": 64370 }, { "epoch": 18.27419812659665, "grad_norm": 6.505180358886719, "learning_rate": 8.667328980982117e-06, "loss": 2.0848276138305666, "step": 64380 }, { "epoch": 18.27703661652001, "grad_norm": 6.7425456047058105, "learning_rate": 8.653136531365313e-06, "loss": 2.159921073913574, "step": 64390 }, { "epoch": 18.279875106443374, "grad_norm": 6.5049662590026855, "learning_rate": 8.63894408174851e-06, "loss": 2.091504669189453, "step": 64400 }, { "epoch": 18.282713596366733, "grad_norm": 6.485337257385254, "learning_rate": 8.624751632131705e-06, "loss": 2.1021953582763673, "step": 64410 }, { "epoch": 18.285552086290092, "grad_norm": 6.216005802154541, "learning_rate": 8.610559182514903e-06, "loss": 2.065947723388672, "step": 64420 }, { "epoch": 18.288390576213455, "grad_norm": 6.730464935302734, "learning_rate": 8.5963667328981e-06, "loss": 2.1393966674804688, "step": 64430 }, { "epoch": 18.291229066136815, "grad_norm": 6.541171073913574, "learning_rate": 8.582174283281295e-06, "loss": 2.1743650436401367, "step": 64440 }, { "epoch": 18.294067556060178, "grad_norm": 6.340284824371338, "learning_rate": 8.567981833664492e-06, "loss": 2.0862733840942385, "step": 64450 }, { "epoch": 
18.296906045983537, "grad_norm": 6.582402229309082, "learning_rate": 8.553789384047687e-06, "loss": 2.1269062042236326, "step": 64460 }, { "epoch": 18.299744535906896, "grad_norm": 6.203645706176758, "learning_rate": 8.539596934430884e-06, "loss": 2.07861442565918, "step": 64470 }, { "epoch": 18.30258302583026, "grad_norm": 6.267248153686523, "learning_rate": 8.52540448481408e-06, "loss": 2.012795639038086, "step": 64480 }, { "epoch": 18.30542151575362, "grad_norm": 6.373089790344238, "learning_rate": 8.511212035197275e-06, "loss": 2.0764720916748045, "step": 64490 }, { "epoch": 18.30826000567698, "grad_norm": 6.863913059234619, "learning_rate": 8.497019585580472e-06, "loss": 2.099348449707031, "step": 64500 }, { "epoch": 18.30826000567698, "eval_accuracy": 0.3610351624594646, "eval_loss": 2.436873197555542, "eval_runtime": 48.9827, "eval_samples_per_second": 321.073, "eval_steps_per_second": 5.022, "step": 64500 }, { "epoch": 18.31109849560034, "grad_norm": 6.08783483505249, "learning_rate": 8.482827135963669e-06, "loss": 2.0887453079223635, "step": 64510 }, { "epoch": 18.3139369855237, "grad_norm": 6.473641872406006, "learning_rate": 8.468634686346864e-06, "loss": 2.056540298461914, "step": 64520 }, { "epoch": 18.316775475447063, "grad_norm": 6.191627025604248, "learning_rate": 8.45444223673006e-06, "loss": 2.080792999267578, "step": 64530 }, { "epoch": 18.319613965370422, "grad_norm": 6.631406307220459, "learning_rate": 8.440249787113257e-06, "loss": 2.1109149932861326, "step": 64540 }, { "epoch": 18.322452455293785, "grad_norm": 6.7042765617370605, "learning_rate": 8.426057337496452e-06, "loss": 2.1531999588012694, "step": 64550 }, { "epoch": 18.325290945217144, "grad_norm": 6.433002471923828, "learning_rate": 8.411864887879649e-06, "loss": 2.0896533966064452, "step": 64560 }, { "epoch": 18.328129435140504, "grad_norm": 6.160502910614014, "learning_rate": 8.397672438262846e-06, "loss": 2.045802879333496, "step": 64570 }, { "epoch": 18.330967925063867, 
"grad_norm": 6.51307487487793, "learning_rate": 8.38347998864604e-06, "loss": 2.060006523132324, "step": 64580 }, { "epoch": 18.333806414987226, "grad_norm": 6.116743087768555, "learning_rate": 8.369287539029237e-06, "loss": 2.0463191986083986, "step": 64590 }, { "epoch": 18.33664490491059, "grad_norm": 6.0057501792907715, "learning_rate": 8.355095089412434e-06, "loss": 2.0485355377197267, "step": 64600 }, { "epoch": 18.339483394833948, "grad_norm": 6.549429893493652, "learning_rate": 8.340902639795629e-06, "loss": 2.1397468566894533, "step": 64610 }, { "epoch": 18.342321884757308, "grad_norm": 6.298701763153076, "learning_rate": 8.326710190178826e-06, "loss": 2.0762151718139648, "step": 64620 }, { "epoch": 18.34516037468067, "grad_norm": 6.447253704071045, "learning_rate": 8.312517740562022e-06, "loss": 2.1392000198364256, "step": 64630 }, { "epoch": 18.34799886460403, "grad_norm": 6.3866801261901855, "learning_rate": 8.298325290945217e-06, "loss": 2.106633186340332, "step": 64640 }, { "epoch": 18.350837354527393, "grad_norm": 6.312609672546387, "learning_rate": 8.284132841328414e-06, "loss": 2.084321975708008, "step": 64650 }, { "epoch": 18.353675844450752, "grad_norm": 6.691795825958252, "learning_rate": 8.26994039171161e-06, "loss": 2.0864337921142577, "step": 64660 }, { "epoch": 18.35651433437411, "grad_norm": 6.2366108894348145, "learning_rate": 8.255747942094806e-06, "loss": 2.0596664428710936, "step": 64670 }, { "epoch": 18.359352824297474, "grad_norm": 6.206855297088623, "learning_rate": 8.241555492478003e-06, "loss": 2.0896583557128907, "step": 64680 }, { "epoch": 18.362191314220834, "grad_norm": 6.363806247711182, "learning_rate": 8.227363042861198e-06, "loss": 2.1671274185180662, "step": 64690 }, { "epoch": 18.365029804144196, "grad_norm": 6.553501129150391, "learning_rate": 8.213170593244394e-06, "loss": 2.1132427215576173, "step": 64700 }, { "epoch": 18.367868294067556, "grad_norm": 6.356370449066162, "learning_rate": 8.198978143627591e-06, "loss": 
2.1215200424194336, "step": 64710 }, { "epoch": 18.37070678399092, "grad_norm": 6.659488201141357, "learning_rate": 8.184785694010786e-06, "loss": 2.1209014892578124, "step": 64720 }, { "epoch": 18.373545273914278, "grad_norm": 6.701727867126465, "learning_rate": 8.170593244393983e-06, "loss": 2.1956317901611326, "step": 64730 }, { "epoch": 18.376383763837637, "grad_norm": 6.931812763214111, "learning_rate": 8.15640079477718e-06, "loss": 2.0694265365600586, "step": 64740 }, { "epoch": 18.379222253761, "grad_norm": 6.385660648345947, "learning_rate": 8.142208345160375e-06, "loss": 2.041141319274902, "step": 64750 }, { "epoch": 18.38206074368436, "grad_norm": 6.398952960968018, "learning_rate": 8.128015895543571e-06, "loss": 2.0657321929931642, "step": 64760 }, { "epoch": 18.384899233607722, "grad_norm": 6.350265026092529, "learning_rate": 8.113823445926768e-06, "loss": 2.0713163375854493, "step": 64770 }, { "epoch": 18.38773772353108, "grad_norm": 6.242082118988037, "learning_rate": 8.099630996309963e-06, "loss": 2.1364927291870117, "step": 64780 }, { "epoch": 18.39057621345444, "grad_norm": 6.371827125549316, "learning_rate": 8.08543854669316e-06, "loss": 2.0609468460083007, "step": 64790 }, { "epoch": 18.393414703377804, "grad_norm": 6.267342567443848, "learning_rate": 8.071246097076356e-06, "loss": 2.1169815063476562, "step": 64800 }, { "epoch": 18.396253193301163, "grad_norm": 6.399655818939209, "learning_rate": 8.057053647459551e-06, "loss": 2.1554258346557615, "step": 64810 }, { "epoch": 18.399091683224526, "grad_norm": 6.519018650054932, "learning_rate": 8.042861197842748e-06, "loss": 2.0538612365722657, "step": 64820 }, { "epoch": 18.401930173147885, "grad_norm": 6.541567325592041, "learning_rate": 8.028668748225945e-06, "loss": 2.052492141723633, "step": 64830 }, { "epoch": 18.404768663071245, "grad_norm": 6.710897922515869, "learning_rate": 8.01447629860914e-06, "loss": 2.1143844604492186, "step": 64840 }, { "epoch": 18.407607152994608, "grad_norm": 
6.3660173416137695, "learning_rate": 8.000283848992337e-06, "loss": 2.0869911193847654, "step": 64850 }, { "epoch": 18.410445642917967, "grad_norm": 6.436506271362305, "learning_rate": 7.986091399375532e-06, "loss": 2.121567153930664, "step": 64860 }, { "epoch": 18.41328413284133, "grad_norm": 6.1288042068481445, "learning_rate": 7.971898949758728e-06, "loss": 2.1151552200317383, "step": 64870 }, { "epoch": 18.41612262276469, "grad_norm": 6.359764099121094, "learning_rate": 7.957706500141925e-06, "loss": 2.031490516662598, "step": 64880 }, { "epoch": 18.41896111268805, "grad_norm": 6.8111395835876465, "learning_rate": 7.94351405052512e-06, "loss": 2.047151565551758, "step": 64890 }, { "epoch": 18.42179960261141, "grad_norm": 6.553795337677002, "learning_rate": 7.929321600908317e-06, "loss": 2.1331085205078124, "step": 64900 }, { "epoch": 18.42463809253477, "grad_norm": 6.808077335357666, "learning_rate": 7.915129151291513e-06, "loss": 2.103446197509766, "step": 64910 }, { "epoch": 18.427476582458134, "grad_norm": 6.311574459075928, "learning_rate": 7.900936701674708e-06, "loss": 2.118061065673828, "step": 64920 }, { "epoch": 18.430315072381493, "grad_norm": 6.465188026428223, "learning_rate": 7.886744252057905e-06, "loss": 2.073598861694336, "step": 64930 }, { "epoch": 18.433153562304852, "grad_norm": 6.657501697540283, "learning_rate": 7.872551802441102e-06, "loss": 2.1064525604248048, "step": 64940 }, { "epoch": 18.435992052228215, "grad_norm": 6.903112411499023, "learning_rate": 7.858359352824297e-06, "loss": 2.0963809967041014, "step": 64950 }, { "epoch": 18.438830542151575, "grad_norm": 6.806290626525879, "learning_rate": 7.844166903207494e-06, "loss": 2.114859962463379, "step": 64960 }, { "epoch": 18.441669032074937, "grad_norm": 6.550230979919434, "learning_rate": 7.82997445359069e-06, "loss": 2.118639373779297, "step": 64970 }, { "epoch": 18.444507521998297, "grad_norm": 6.349015712738037, "learning_rate": 7.815782003973885e-06, "loss": 2.0754220962524412, 
"step": 64980 }, { "epoch": 18.447346011921656, "grad_norm": 6.217617034912109, "learning_rate": 7.801589554357082e-06, "loss": 1.9928306579589843, "step": 64990 }, { "epoch": 18.45018450184502, "grad_norm": 6.542662143707275, "learning_rate": 7.787397104740279e-06, "loss": 2.115343475341797, "step": 65000 }, { "epoch": 18.45018450184502, "eval_accuracy": 0.36370572900108095, "eval_loss": 2.4354331493377686, "eval_runtime": 52.3253, "eval_samples_per_second": 300.562, "eval_steps_per_second": 4.701, "step": 65000 }, { "epoch": 18.45302299176838, "grad_norm": 6.532248497009277, "learning_rate": 7.773204655123474e-06, "loss": 2.0939151763916017, "step": 65010 }, { "epoch": 18.45586148169174, "grad_norm": 6.447332859039307, "learning_rate": 7.75901220550667e-06, "loss": 2.1651119232177733, "step": 65020 }, { "epoch": 18.4586999716151, "grad_norm": 6.072951793670654, "learning_rate": 7.744819755889867e-06, "loss": 2.042768669128418, "step": 65030 }, { "epoch": 18.46153846153846, "grad_norm": 6.852917671203613, "learning_rate": 7.730627306273062e-06, "loss": 2.0803050994873047, "step": 65040 }, { "epoch": 18.464376951461823, "grad_norm": 6.359960556030273, "learning_rate": 7.716434856656259e-06, "loss": 2.1249267578125, "step": 65050 }, { "epoch": 18.467215441385182, "grad_norm": 6.303300380706787, "learning_rate": 7.702242407039456e-06, "loss": 2.095119285583496, "step": 65060 }, { "epoch": 18.470053931308545, "grad_norm": 6.48914909362793, "learning_rate": 7.68804995742265e-06, "loss": 2.219911575317383, "step": 65070 }, { "epoch": 18.472892421231904, "grad_norm": 6.094334125518799, "learning_rate": 7.673857507805849e-06, "loss": 2.0424705505371095, "step": 65080 }, { "epoch": 18.475730911155264, "grad_norm": 6.509434700012207, "learning_rate": 7.659665058189044e-06, "loss": 2.066524124145508, "step": 65090 }, { "epoch": 18.478569401078627, "grad_norm": 6.362374305725098, "learning_rate": 7.64547260857224e-06, "loss": 2.054180717468262, "step": 65100 }, { "epoch": 
18.481407891001986, "grad_norm": 6.828793048858643, "learning_rate": 7.631280158955437e-06, "loss": 2.1041040420532227, "step": 65110 }, { "epoch": 18.48424638092535, "grad_norm": 6.239342212677002, "learning_rate": 7.617087709338632e-06, "loss": 2.0625432968139648, "step": 65120 }, { "epoch": 18.487084870848708, "grad_norm": 6.2430596351623535, "learning_rate": 7.602895259721829e-06, "loss": 2.086251640319824, "step": 65130 }, { "epoch": 18.48992336077207, "grad_norm": 6.358465671539307, "learning_rate": 7.588702810105025e-06, "loss": 2.1011667251586914, "step": 65140 }, { "epoch": 18.49276185069543, "grad_norm": 6.007746696472168, "learning_rate": 7.574510360488221e-06, "loss": 2.0003868103027345, "step": 65150 }, { "epoch": 18.49560034061879, "grad_norm": 6.615300178527832, "learning_rate": 7.5603179108714176e-06, "loss": 2.108740043640137, "step": 65160 }, { "epoch": 18.498438830542153, "grad_norm": 6.416965007781982, "learning_rate": 7.5461254612546134e-06, "loss": 2.1024410247802736, "step": 65170 }, { "epoch": 18.501277320465512, "grad_norm": 6.451404571533203, "learning_rate": 7.531933011637809e-06, "loss": 2.0642301559448244, "step": 65180 }, { "epoch": 18.504115810388875, "grad_norm": 6.508080005645752, "learning_rate": 7.517740562021006e-06, "loss": 2.071232223510742, "step": 65190 }, { "epoch": 18.506954300312234, "grad_norm": 6.7808942794799805, "learning_rate": 7.503548112404202e-06, "loss": 2.070822906494141, "step": 65200 }, { "epoch": 18.509792790235593, "grad_norm": 6.534443378448486, "learning_rate": 7.489355662787398e-06, "loss": 2.0017154693603514, "step": 65210 }, { "epoch": 18.512631280158956, "grad_norm": 6.625760555267334, "learning_rate": 7.475163213170594e-06, "loss": 2.1912403106689453, "step": 65220 }, { "epoch": 18.515469770082316, "grad_norm": 6.846066474914551, "learning_rate": 7.46097076355379e-06, "loss": 2.146574020385742, "step": 65230 }, { "epoch": 18.51830826000568, "grad_norm": 6.462479591369629, "learning_rate": 
7.446778313936986e-06, "loss": 2.1454294204711912, "step": 65240 }, { "epoch": 18.521146749929038, "grad_norm": 6.373232364654541, "learning_rate": 7.432585864320182e-06, "loss": 2.046456527709961, "step": 65250 }, { "epoch": 18.523985239852397, "grad_norm": 6.262520790100098, "learning_rate": 7.418393414703379e-06, "loss": 2.117007255554199, "step": 65260 }, { "epoch": 18.52682372977576, "grad_norm": 6.621224403381348, "learning_rate": 7.404200965086575e-06, "loss": 2.097883605957031, "step": 65270 }, { "epoch": 18.52966221969912, "grad_norm": 6.24954891204834, "learning_rate": 7.3900085154697705e-06, "loss": 2.064107322692871, "step": 65280 }, { "epoch": 18.532500709622482, "grad_norm": 6.070961952209473, "learning_rate": 7.375816065852967e-06, "loss": 2.1463531494140624, "step": 65290 }, { "epoch": 18.53533919954584, "grad_norm": 6.634556770324707, "learning_rate": 7.361623616236163e-06, "loss": 2.103110122680664, "step": 65300 }, { "epoch": 18.5381776894692, "grad_norm": 6.398317337036133, "learning_rate": 7.347431166619359e-06, "loss": 2.0919910430908204, "step": 65310 }, { "epoch": 18.541016179392564, "grad_norm": 6.326926231384277, "learning_rate": 7.333238717002555e-06, "loss": 2.1170434951782227, "step": 65320 }, { "epoch": 18.543854669315923, "grad_norm": 6.724093437194824, "learning_rate": 7.3190462673857515e-06, "loss": 2.0747196197509767, "step": 65330 }, { "epoch": 18.546693159239286, "grad_norm": 6.481235027313232, "learning_rate": 7.304853817768947e-06, "loss": 2.1179080963134767, "step": 65340 }, { "epoch": 18.549531649162645, "grad_norm": 6.6799798011779785, "learning_rate": 7.290661368152143e-06, "loss": 2.097890090942383, "step": 65350 }, { "epoch": 18.552370139086005, "grad_norm": 6.915953159332275, "learning_rate": 7.27646891853534e-06, "loss": 2.1501838684082033, "step": 65360 }, { "epoch": 18.555208629009368, "grad_norm": 6.209902286529541, "learning_rate": 7.262276468918536e-06, "loss": 2.175489044189453, "step": 65370 }, { "epoch": 
18.558047118932727, "grad_norm": 6.2600884437561035, "learning_rate": 7.248084019301732e-06, "loss": 2.086576461791992, "step": 65380 }, { "epoch": 18.56088560885609, "grad_norm": 6.248402118682861, "learning_rate": 7.233891569684928e-06, "loss": 2.1706993103027346, "step": 65390 }, { "epoch": 18.56372409877945, "grad_norm": 6.5293755531311035, "learning_rate": 7.219699120068124e-06, "loss": 2.0980274200439455, "step": 65400 }, { "epoch": 18.56656258870281, "grad_norm": 6.453487396240234, "learning_rate": 7.20550667045132e-06, "loss": 2.1162328720092773, "step": 65410 }, { "epoch": 18.56940107862617, "grad_norm": 6.246020793914795, "learning_rate": 7.191314220834517e-06, "loss": 2.1382381439208986, "step": 65420 }, { "epoch": 18.57223956854953, "grad_norm": 6.036157131195068, "learning_rate": 7.177121771217713e-06, "loss": 2.087345314025879, "step": 65430 }, { "epoch": 18.575078058472894, "grad_norm": 6.469088554382324, "learning_rate": 7.1629293216009085e-06, "loss": 2.104218864440918, "step": 65440 }, { "epoch": 18.577916548396253, "grad_norm": 6.384708404541016, "learning_rate": 7.148736871984104e-06, "loss": 2.096060371398926, "step": 65450 }, { "epoch": 18.580755038319612, "grad_norm": 6.067671298980713, "learning_rate": 7.134544422367301e-06, "loss": 2.09384708404541, "step": 65460 }, { "epoch": 18.583593528242975, "grad_norm": 6.467475891113281, "learning_rate": 7.120351972750497e-06, "loss": 2.056918716430664, "step": 65470 }, { "epoch": 18.586432018166335, "grad_norm": 6.315921783447266, "learning_rate": 7.106159523133693e-06, "loss": 2.099884033203125, "step": 65480 }, { "epoch": 18.589270508089697, "grad_norm": 6.21213436126709, "learning_rate": 7.0919670735168895e-06, "loss": 2.0986284255981444, "step": 65490 }, { "epoch": 18.592108998013057, "grad_norm": 6.227791786193848, "learning_rate": 7.077774623900085e-06, "loss": 2.168014717102051, "step": 65500 }, { "epoch": 18.592108998013057, "eval_accuracy": 0.3618617663890125, "eval_loss": 
2.4347426891326904, "eval_runtime": 52.8141, "eval_samples_per_second": 297.781, "eval_steps_per_second": 4.658, "step": 65500 }, { "epoch": 18.594947487936416, "grad_norm": 6.125910758972168, "learning_rate": 7.063582174283281e-06, "loss": 2.078364372253418, "step": 65510 }, { "epoch": 18.59778597785978, "grad_norm": 6.721903324127197, "learning_rate": 7.049389724666478e-06, "loss": 2.082302284240723, "step": 65520 }, { "epoch": 18.60062446778314, "grad_norm": 6.486910343170166, "learning_rate": 7.035197275049674e-06, "loss": 2.11962947845459, "step": 65530 }, { "epoch": 18.6034629577065, "grad_norm": 6.717968940734863, "learning_rate": 7.02100482543287e-06, "loss": 2.090346336364746, "step": 65540 }, { "epoch": 18.60630144762986, "grad_norm": 6.486374855041504, "learning_rate": 7.0068123758160655e-06, "loss": 2.097625732421875, "step": 65550 }, { "epoch": 18.609139937553223, "grad_norm": 6.151224613189697, "learning_rate": 6.992619926199262e-06, "loss": 2.027336311340332, "step": 65560 }, { "epoch": 18.611978427476583, "grad_norm": 6.468955993652344, "learning_rate": 6.978427476582458e-06, "loss": 2.062033843994141, "step": 65570 }, { "epoch": 18.614816917399942, "grad_norm": 6.793084621429443, "learning_rate": 6.964235026965654e-06, "loss": 2.0497304916381838, "step": 65580 }, { "epoch": 18.617655407323305, "grad_norm": 6.0018839836120605, "learning_rate": 6.950042577348851e-06, "loss": 2.1081342697143555, "step": 65590 }, { "epoch": 18.620493897246664, "grad_norm": 6.453134059906006, "learning_rate": 6.9358501277320465e-06, "loss": 2.041850280761719, "step": 65600 }, { "epoch": 18.623332387170027, "grad_norm": 5.972736358642578, "learning_rate": 6.921657678115242e-06, "loss": 2.0813623428344727, "step": 65610 }, { "epoch": 18.626170877093386, "grad_norm": 6.70637321472168, "learning_rate": 6.907465228498439e-06, "loss": 2.094507026672363, "step": 65620 }, { "epoch": 18.629009367016746, "grad_norm": 6.46790075302124, "learning_rate": 6.893272778881635e-06, 
"loss": 2.107981872558594, "step": 65630 }, { "epoch": 18.63184785694011, "grad_norm": 6.702522277832031, "learning_rate": 6.879080329264831e-06, "loss": 2.0832338333129883, "step": 65640 }, { "epoch": 18.634686346863468, "grad_norm": 6.4478254318237305, "learning_rate": 6.864887879648027e-06, "loss": 2.0461843490600584, "step": 65650 }, { "epoch": 18.63752483678683, "grad_norm": 6.741181373596191, "learning_rate": 6.850695430031223e-06, "loss": 2.075997734069824, "step": 65660 }, { "epoch": 18.64036332671019, "grad_norm": 6.094913005828857, "learning_rate": 6.836502980414419e-06, "loss": 2.080614471435547, "step": 65670 }, { "epoch": 18.64320181663355, "grad_norm": 6.199418067932129, "learning_rate": 6.822310530797615e-06, "loss": 2.1045040130615233, "step": 65680 }, { "epoch": 18.646040306556912, "grad_norm": 6.711644649505615, "learning_rate": 6.808118081180812e-06, "loss": 2.212717056274414, "step": 65690 }, { "epoch": 18.648878796480272, "grad_norm": 6.79026460647583, "learning_rate": 6.793925631564008e-06, "loss": 2.1265302658081056, "step": 65700 }, { "epoch": 18.651717286403635, "grad_norm": 6.694177627563477, "learning_rate": 6.7797331819472035e-06, "loss": 2.135468673706055, "step": 65710 }, { "epoch": 18.654555776326994, "grad_norm": 6.351978778839111, "learning_rate": 6.7655407323304e-06, "loss": 2.053749656677246, "step": 65720 }, { "epoch": 18.657394266250353, "grad_norm": 6.287079811096191, "learning_rate": 6.751348282713597e-06, "loss": 2.0670230865478514, "step": 65730 }, { "epoch": 18.660232756173716, "grad_norm": 6.886959075927734, "learning_rate": 6.737155833096794e-06, "loss": 2.1308183670043945, "step": 65740 }, { "epoch": 18.663071246097076, "grad_norm": 6.130692481994629, "learning_rate": 6.7229633834799895e-06, "loss": 2.100071334838867, "step": 65750 }, { "epoch": 18.66590973602044, "grad_norm": 6.530364036560059, "learning_rate": 6.708770933863185e-06, "loss": 2.115652084350586, "step": 65760 }, { "epoch": 18.668748225943798, "grad_norm": 
6.4802093505859375, "learning_rate": 6.694578484246382e-06, "loss": 2.119556427001953, "step": 65770 }, { "epoch": 18.671586715867157, "grad_norm": 6.3044915199279785, "learning_rate": 6.680386034629578e-06, "loss": 2.0686031341552735, "step": 65780 }, { "epoch": 18.67442520579052, "grad_norm": 6.388713836669922, "learning_rate": 6.666193585012774e-06, "loss": 2.0810611724853514, "step": 65790 }, { "epoch": 18.67726369571388, "grad_norm": 6.752663612365723, "learning_rate": 6.6520011353959705e-06, "loss": 2.1551774978637694, "step": 65800 }, { "epoch": 18.680102185637242, "grad_norm": 6.547235012054443, "learning_rate": 6.637808685779166e-06, "loss": 2.1522802352905273, "step": 65810 }, { "epoch": 18.6829406755606, "grad_norm": 6.200343132019043, "learning_rate": 6.623616236162362e-06, "loss": 2.0997528076171874, "step": 65820 }, { "epoch": 18.68577916548396, "grad_norm": 6.348862171173096, "learning_rate": 6.609423786545559e-06, "loss": 2.09462947845459, "step": 65830 }, { "epoch": 18.688617655407324, "grad_norm": 6.670496463775635, "learning_rate": 6.595231336928755e-06, "loss": 2.1336421966552734, "step": 65840 }, { "epoch": 18.691456145330683, "grad_norm": 6.109516620635986, "learning_rate": 6.581038887311951e-06, "loss": 2.030636215209961, "step": 65850 }, { "epoch": 18.694294635254046, "grad_norm": 6.652522087097168, "learning_rate": 6.5668464376951466e-06, "loss": 2.1982494354248048, "step": 65860 }, { "epoch": 18.697133125177405, "grad_norm": 6.4732818603515625, "learning_rate": 6.552653988078343e-06, "loss": 2.101965141296387, "step": 65870 }, { "epoch": 18.699971615100765, "grad_norm": 6.205769062042236, "learning_rate": 6.538461538461539e-06, "loss": 2.065937805175781, "step": 65880 }, { "epoch": 18.702810105024128, "grad_norm": 6.4180827140808105, "learning_rate": 6.524269088844735e-06, "loss": 2.1300731658935548, "step": 65890 }, { "epoch": 18.705648594947487, "grad_norm": 6.915193557739258, "learning_rate": 6.510076639227932e-06, "loss": 
2.144454765319824, "step": 65900 }, { "epoch": 18.70848708487085, "grad_norm": 6.756799221038818, "learning_rate": 6.4958841896111276e-06, "loss": 2.1008230209350587, "step": 65910 }, { "epoch": 18.71132557479421, "grad_norm": 6.462621688842773, "learning_rate": 6.481691739994323e-06, "loss": 2.128449821472168, "step": 65920 }, { "epoch": 18.714164064717572, "grad_norm": 6.341053009033203, "learning_rate": 6.46749929037752e-06, "loss": 2.119014549255371, "step": 65930 }, { "epoch": 18.71700255464093, "grad_norm": 6.66563081741333, "learning_rate": 6.453306840760716e-06, "loss": 2.0347333908081056, "step": 65940 }, { "epoch": 18.71984104456429, "grad_norm": 6.713007926940918, "learning_rate": 6.439114391143912e-06, "loss": 2.1470434188842775, "step": 65950 }, { "epoch": 18.722679534487654, "grad_norm": 6.298444747924805, "learning_rate": 6.424921941527108e-06, "loss": 2.1115705490112306, "step": 65960 }, { "epoch": 18.725518024411013, "grad_norm": 6.135115146636963, "learning_rate": 6.4107294919103044e-06, "loss": 2.129867362976074, "step": 65970 }, { "epoch": 18.728356514334376, "grad_norm": 6.459167957305908, "learning_rate": 6.3965370422935e-06, "loss": 2.1194080352783202, "step": 65980 }, { "epoch": 18.731195004257735, "grad_norm": 6.202145099639893, "learning_rate": 6.382344592676696e-06, "loss": 2.126788520812988, "step": 65990 }, { "epoch": 18.734033494181094, "grad_norm": 6.255209922790527, "learning_rate": 6.368152143059893e-06, "loss": 2.122597503662109, "step": 66000 }, { "epoch": 18.734033494181094, "eval_accuracy": 0.3652953519425192, "eval_loss": 2.43337345123291, "eval_runtime": 53.9143, "eval_samples_per_second": 291.703, "eval_steps_per_second": 4.563, "step": 66000 }, { "epoch": 18.736871984104457, "grad_norm": 6.060238361358643, "learning_rate": 6.353959693443089e-06, "loss": 2.148123931884766, "step": 66010 }, { "epoch": 18.739710474027817, "grad_norm": 6.372814655303955, "learning_rate": 6.339767243826285e-06, "loss": 2.0819658279418944, "step": 
66020 }, { "epoch": 18.74254896395118, "grad_norm": 6.321505546569824, "learning_rate": 6.325574794209481e-06, "loss": 2.0136272430419924, "step": 66030 }, { "epoch": 18.74538745387454, "grad_norm": 6.267404079437256, "learning_rate": 6.311382344592677e-06, "loss": 2.111591339111328, "step": 66040 }, { "epoch": 18.748225943797898, "grad_norm": 6.629931449890137, "learning_rate": 6.297189894975873e-06, "loss": 2.109416389465332, "step": 66050 }, { "epoch": 18.75106443372126, "grad_norm": 6.504267692565918, "learning_rate": 6.282997445359069e-06, "loss": 2.069186973571777, "step": 66060 }, { "epoch": 18.75390292364462, "grad_norm": 6.119106292724609, "learning_rate": 6.268804995742266e-06, "loss": 2.066851997375488, "step": 66070 }, { "epoch": 18.756741413567983, "grad_norm": 6.554816246032715, "learning_rate": 6.2546125461254615e-06, "loss": 2.1173423767089843, "step": 66080 }, { "epoch": 18.759579903491343, "grad_norm": 6.2366790771484375, "learning_rate": 6.240420096508657e-06, "loss": 2.0284467697143556, "step": 66090 }, { "epoch": 18.762418393414702, "grad_norm": 7.076211452484131, "learning_rate": 6.226227646891854e-06, "loss": 2.0600793838500975, "step": 66100 }, { "epoch": 18.765256883338065, "grad_norm": 6.108314514160156, "learning_rate": 6.21203519727505e-06, "loss": 2.0910552978515624, "step": 66110 }, { "epoch": 18.768095373261424, "grad_norm": 6.868163108825684, "learning_rate": 6.197842747658246e-06, "loss": 2.1623512268066407, "step": 66120 }, { "epoch": 18.770933863184787, "grad_norm": 6.050357341766357, "learning_rate": 6.1836502980414425e-06, "loss": 2.1133396148681642, "step": 66130 }, { "epoch": 18.773772353108146, "grad_norm": 6.943560600280762, "learning_rate": 6.169457848424638e-06, "loss": 2.116322708129883, "step": 66140 }, { "epoch": 18.776610843031506, "grad_norm": 6.8135480880737305, "learning_rate": 6.155265398807834e-06, "loss": 2.1582925796508787, "step": 66150 }, { "epoch": 18.77944933295487, "grad_norm": 6.480197429656982, 
"learning_rate": 6.14107294919103e-06, "loss": 2.112314796447754, "step": 66160 }, { "epoch": 18.782287822878228, "grad_norm": 6.135546684265137, "learning_rate": 6.126880499574227e-06, "loss": 2.068522834777832, "step": 66170 }, { "epoch": 18.78512631280159, "grad_norm": 6.315862655639648, "learning_rate": 6.112688049957423e-06, "loss": 2.186738967895508, "step": 66180 }, { "epoch": 18.78796480272495, "grad_norm": 6.633881568908691, "learning_rate": 6.0984956003406185e-06, "loss": 2.0773899078369142, "step": 66190 }, { "epoch": 18.79080329264831, "grad_norm": 6.467921257019043, "learning_rate": 6.084303150723815e-06, "loss": 2.0958444595336916, "step": 66200 }, { "epoch": 18.793641782571672, "grad_norm": 6.560643672943115, "learning_rate": 6.070110701107011e-06, "loss": 2.0947477340698244, "step": 66210 }, { "epoch": 18.79648027249503, "grad_norm": 6.852553844451904, "learning_rate": 6.055918251490208e-06, "loss": 2.104860305786133, "step": 66220 }, { "epoch": 18.799318762418395, "grad_norm": 6.224774360656738, "learning_rate": 6.041725801873404e-06, "loss": 2.1231002807617188, "step": 66230 }, { "epoch": 18.802157252341754, "grad_norm": 6.4370269775390625, "learning_rate": 6.0275333522566e-06, "loss": 2.078472137451172, "step": 66240 }, { "epoch": 18.804995742265113, "grad_norm": 6.54070520401001, "learning_rate": 6.013340902639796e-06, "loss": 2.1154924392700196, "step": 66250 }, { "epoch": 18.807834232188476, "grad_norm": 6.271167278289795, "learning_rate": 5.999148453022992e-06, "loss": 2.102960395812988, "step": 66260 }, { "epoch": 18.810672722111835, "grad_norm": 6.372697353363037, "learning_rate": 5.984956003406189e-06, "loss": 2.0510316848754884, "step": 66270 }, { "epoch": 18.8135112120352, "grad_norm": 6.372917652130127, "learning_rate": 5.970763553789385e-06, "loss": 2.1396720886230467, "step": 66280 }, { "epoch": 18.816349701958558, "grad_norm": 6.822504043579102, "learning_rate": 5.9565711041725805e-06, "loss": 2.1029743194580077, "step": 66290 }, { 
"epoch": 18.81918819188192, "grad_norm": 6.389019966125488, "learning_rate": 5.942378654555776e-06, "loss": 2.1234867095947267, "step": 66300 }, { "epoch": 18.82202668180528, "grad_norm": 6.11255407333374, "learning_rate": 5.928186204938973e-06, "loss": 2.05350341796875, "step": 66310 }, { "epoch": 18.82486517172864, "grad_norm": 6.6881303787231445, "learning_rate": 5.913993755322169e-06, "loss": 2.1480434417724608, "step": 66320 }, { "epoch": 18.827703661652002, "grad_norm": 6.526291370391846, "learning_rate": 5.899801305705365e-06, "loss": 2.159148597717285, "step": 66330 }, { "epoch": 18.83054215157536, "grad_norm": 6.325294017791748, "learning_rate": 5.8856088560885615e-06, "loss": 2.1228975296020507, "step": 66340 }, { "epoch": 18.833380641498724, "grad_norm": 6.329010963439941, "learning_rate": 5.871416406471757e-06, "loss": 2.1121347427368162, "step": 66350 }, { "epoch": 18.836219131422084, "grad_norm": 6.50437068939209, "learning_rate": 5.857223956854953e-06, "loss": 2.1582435607910155, "step": 66360 }, { "epoch": 18.839057621345443, "grad_norm": 6.796273231506348, "learning_rate": 5.84303150723815e-06, "loss": 2.146057891845703, "step": 66370 }, { "epoch": 18.841896111268806, "grad_norm": 6.192599296569824, "learning_rate": 5.828839057621346e-06, "loss": 2.0954158782958983, "step": 66380 }, { "epoch": 18.844734601192165, "grad_norm": 6.11782693862915, "learning_rate": 5.814646608004542e-06, "loss": 2.0877017974853516, "step": 66390 }, { "epoch": 18.847573091115528, "grad_norm": 6.323272228240967, "learning_rate": 5.800454158387738e-06, "loss": 2.0916702270507814, "step": 66400 }, { "epoch": 18.850411581038887, "grad_norm": 6.1704230308532715, "learning_rate": 5.786261708770934e-06, "loss": 2.1132329940795898, "step": 66410 }, { "epoch": 18.853250070962247, "grad_norm": 6.411996841430664, "learning_rate": 5.77206925915413e-06, "loss": 2.099520683288574, "step": 66420 }, { "epoch": 18.85608856088561, "grad_norm": 6.394660472869873, "learning_rate": 
5.757876809537326e-06, "loss": 2.060613822937012, "step": 66430 }, { "epoch": 18.85892705080897, "grad_norm": 6.675267219543457, "learning_rate": 5.743684359920523e-06, "loss": 2.103582191467285, "step": 66440 }, { "epoch": 18.861765540732332, "grad_norm": 6.240618705749512, "learning_rate": 5.7294919103037185e-06, "loss": 2.112631988525391, "step": 66450 }, { "epoch": 18.86460403065569, "grad_norm": 6.452848434448242, "learning_rate": 5.715299460686914e-06, "loss": 2.0452674865722655, "step": 66460 }, { "epoch": 18.86744252057905, "grad_norm": 6.185317039489746, "learning_rate": 5.701107011070111e-06, "loss": 2.0774303436279298, "step": 66470 }, { "epoch": 18.870281010502413, "grad_norm": 6.279379367828369, "learning_rate": 5.686914561453307e-06, "loss": 1.9999151229858398, "step": 66480 }, { "epoch": 18.873119500425773, "grad_norm": 6.266867637634277, "learning_rate": 5.672722111836503e-06, "loss": 2.1433582305908203, "step": 66490 }, { "epoch": 18.875957990349136, "grad_norm": 6.626073360443115, "learning_rate": 5.6585296622196995e-06, "loss": 2.1052309036254884, "step": 66500 }, { "epoch": 18.875957990349136, "eval_accuracy": 0.3631970496598207, "eval_loss": 2.4327402114868164, "eval_runtime": 49.5668, "eval_samples_per_second": 317.289, "eval_steps_per_second": 4.963, "step": 66500 }, { "epoch": 18.878796480272495, "grad_norm": 6.527654647827148, "learning_rate": 5.644337212602895e-06, "loss": 2.083056831359863, "step": 66510 }, { "epoch": 18.881634970195854, "grad_norm": 6.502869129180908, "learning_rate": 5.630144762986091e-06, "loss": 2.1233102798461916, "step": 66520 }, { "epoch": 18.884473460119217, "grad_norm": 6.633118152618408, "learning_rate": 5.615952313369287e-06, "loss": 2.1229114532470703, "step": 66530 }, { "epoch": 18.887311950042577, "grad_norm": 6.399628162384033, "learning_rate": 5.601759863752485e-06, "loss": 2.029714584350586, "step": 66540 }, { "epoch": 18.89015043996594, "grad_norm": 6.357913970947266, "learning_rate": 
5.5875674141356805e-06, "loss": 2.1252328872680666, "step": 66550 }, { "epoch": 18.8929889298893, "grad_norm": 6.352670669555664, "learning_rate": 5.573374964518876e-06, "loss": 2.0517856597900392, "step": 66560 }, { "epoch": 18.895827419812658, "grad_norm": 6.452559947967529, "learning_rate": 5.559182514902072e-06, "loss": 2.1307939529418944, "step": 66570 }, { "epoch": 18.89866590973602, "grad_norm": 6.813271522521973, "learning_rate": 5.544990065285269e-06, "loss": 2.1672319412231444, "step": 66580 }, { "epoch": 18.90150439965938, "grad_norm": 6.492202281951904, "learning_rate": 5.530797615668465e-06, "loss": 2.1631837844848634, "step": 66590 }, { "epoch": 18.904342889582743, "grad_norm": 6.503082275390625, "learning_rate": 5.516605166051661e-06, "loss": 2.0483179092407227, "step": 66600 }, { "epoch": 18.907181379506103, "grad_norm": 6.541421413421631, "learning_rate": 5.502412716434857e-06, "loss": 2.1117353439331055, "step": 66610 }, { "epoch": 18.910019869429462, "grad_norm": 6.803465843200684, "learning_rate": 5.488220266818053e-06, "loss": 2.142854118347168, "step": 66620 }, { "epoch": 18.912858359352825, "grad_norm": 6.2262067794799805, "learning_rate": 5.474027817201249e-06, "loss": 2.113760757446289, "step": 66630 }, { "epoch": 18.915696849276184, "grad_norm": 6.2585272789001465, "learning_rate": 5.459835367584446e-06, "loss": 2.099937629699707, "step": 66640 }, { "epoch": 18.918535339199547, "grad_norm": 6.281336784362793, "learning_rate": 5.445642917967642e-06, "loss": 1.9959651947021484, "step": 66650 }, { "epoch": 18.921373829122906, "grad_norm": 6.678567886352539, "learning_rate": 5.4314504683508376e-06, "loss": 2.0679048538208007, "step": 66660 }, { "epoch": 18.92421231904627, "grad_norm": 6.241562366485596, "learning_rate": 5.417258018734033e-06, "loss": 2.128246307373047, "step": 66670 }, { "epoch": 18.92705080896963, "grad_norm": 6.1897125244140625, "learning_rate": 5.40306556911723e-06, "loss": 2.043190574645996, "step": 66680 }, { "epoch": 
18.929889298892988, "grad_norm": 6.3719916343688965, "learning_rate": 5.388873119500426e-06, "loss": 2.067709732055664, "step": 66690 }, { "epoch": 18.93272778881635, "grad_norm": 6.347736358642578, "learning_rate": 5.374680669883622e-06, "loss": 2.1353805541992186, "step": 66700 }, { "epoch": 18.93556627873971, "grad_norm": 7.102441310882568, "learning_rate": 5.3604882202668186e-06, "loss": 2.158308410644531, "step": 66710 }, { "epoch": 18.93840476866307, "grad_norm": 6.83226203918457, "learning_rate": 5.3462957706500144e-06, "loss": 2.1186525344848635, "step": 66720 }, { "epoch": 18.941243258586432, "grad_norm": 6.493803024291992, "learning_rate": 5.33210332103321e-06, "loss": 2.0793405532836915, "step": 66730 }, { "epoch": 18.94408174850979, "grad_norm": 6.40255069732666, "learning_rate": 5.317910871416407e-06, "loss": 2.073501396179199, "step": 66740 }, { "epoch": 18.946920238433155, "grad_norm": 6.6030354499816895, "learning_rate": 5.303718421799603e-06, "loss": 2.0708423614501954, "step": 66750 }, { "epoch": 18.949758728356514, "grad_norm": 6.373889446258545, "learning_rate": 5.289525972182799e-06, "loss": 2.1023632049560548, "step": 66760 }, { "epoch": 18.952597218279877, "grad_norm": 6.693175792694092, "learning_rate": 5.275333522565995e-06, "loss": 2.199353790283203, "step": 66770 }, { "epoch": 18.955435708203236, "grad_norm": 6.759591102600098, "learning_rate": 5.261141072949191e-06, "loss": 2.089085006713867, "step": 66780 }, { "epoch": 18.958274198126595, "grad_norm": 6.786888122558594, "learning_rate": 5.246948623332387e-06, "loss": 2.1499608993530273, "step": 66790 }, { "epoch": 18.96111268804996, "grad_norm": 6.304567813873291, "learning_rate": 5.232756173715583e-06, "loss": 2.035688018798828, "step": 66800 }, { "epoch": 18.963951177973318, "grad_norm": 6.3857245445251465, "learning_rate": 5.21856372409878e-06, "loss": 2.0661731719970704, "step": 66810 }, { "epoch": 18.96678966789668, "grad_norm": 6.496391296386719, "learning_rate": 
5.204371274481976e-06, "loss": 2.133726692199707, "step": 66820 }, { "epoch": 18.96962815782004, "grad_norm": 6.347146987915039, "learning_rate": 5.1901788248651715e-06, "loss": 2.0912113189697266, "step": 66830 }, { "epoch": 18.9724666477434, "grad_norm": 7.023360252380371, "learning_rate": 5.175986375248368e-06, "loss": 2.0656538009643555, "step": 66840 }, { "epoch": 18.975305137666762, "grad_norm": 6.174143314361572, "learning_rate": 5.161793925631564e-06, "loss": 2.1380031585693358, "step": 66850 }, { "epoch": 18.97814362759012, "grad_norm": 6.448389053344727, "learning_rate": 5.14760147601476e-06, "loss": 2.0879657745361326, "step": 66860 }, { "epoch": 18.980982117513484, "grad_norm": 6.580738544464111, "learning_rate": 5.133409026397957e-06, "loss": 2.0720142364501952, "step": 66870 }, { "epoch": 18.983820607436844, "grad_norm": 6.151330947875977, "learning_rate": 5.119216576781153e-06, "loss": 2.082585906982422, "step": 66880 }, { "epoch": 18.986659097360203, "grad_norm": 6.074293613433838, "learning_rate": 5.105024127164349e-06, "loss": 2.1468950271606446, "step": 66890 }, { "epoch": 18.989497587283566, "grad_norm": 7.02142333984375, "learning_rate": 5.090831677547545e-06, "loss": 2.1284135818481444, "step": 66900 }, { "epoch": 18.992336077206925, "grad_norm": 6.497470855712891, "learning_rate": 5.076639227930742e-06, "loss": 2.1055139541625976, "step": 66910 }, { "epoch": 18.995174567130288, "grad_norm": 6.582899570465088, "learning_rate": 5.062446778313938e-06, "loss": 2.138006591796875, "step": 66920 }, { "epoch": 18.998013057053647, "grad_norm": 6.332634449005127, "learning_rate": 5.0482543286971335e-06, "loss": 2.0868186950683594, "step": 66930 }, { "epoch": 19.000851546977007, "grad_norm": 6.250434398651123, "learning_rate": 5.034061879080329e-06, "loss": 2.0675540924072267, "step": 66940 }, { "epoch": 19.00369003690037, "grad_norm": 6.382821083068848, "learning_rate": 5.019869429463526e-06, "loss": 2.0730169296264647, "step": 66950 }, { "epoch": 
19.00652852682373, "grad_norm": 6.407469749450684, "learning_rate": 5.005676979846722e-06, "loss": 2.141368865966797, "step": 66960 }, { "epoch": 19.009367016747092, "grad_norm": 6.951335906982422, "learning_rate": 4.991484530229918e-06, "loss": 2.147075653076172, "step": 66970 }, { "epoch": 19.01220550667045, "grad_norm": 6.612213611602783, "learning_rate": 4.9772920806131145e-06, "loss": 2.1595701217651366, "step": 66980 }, { "epoch": 19.01504399659381, "grad_norm": 6.29664945602417, "learning_rate": 4.96309963099631e-06, "loss": 2.0512727737426757, "step": 66990 }, { "epoch": 19.017882486517173, "grad_norm": 6.244273662567139, "learning_rate": 4.948907181379506e-06, "loss": 2.1098939895629885, "step": 67000 }, { "epoch": 19.017882486517173, "eval_accuracy": 0.3620525211419851, "eval_loss": 2.4319686889648438, "eval_runtime": 49.2976, "eval_samples_per_second": 319.022, "eval_steps_per_second": 4.99, "step": 67000 }, { "epoch": 19.020720976440533, "grad_norm": 6.086441516876221, "learning_rate": 4.934714731762703e-06, "loss": 2.1198974609375, "step": 67010 }, { "epoch": 19.023559466363896, "grad_norm": 6.316961765289307, "learning_rate": 4.920522282145899e-06, "loss": 2.083339309692383, "step": 67020 }, { "epoch": 19.026397956287255, "grad_norm": 6.508834362030029, "learning_rate": 4.906329832529095e-06, "loss": 2.132358360290527, "step": 67030 }, { "epoch": 19.029236446210614, "grad_norm": 6.318736553192139, "learning_rate": 4.8921373829122905e-06, "loss": 2.0788511276245116, "step": 67040 }, { "epoch": 19.032074936133977, "grad_norm": 6.292352676391602, "learning_rate": 4.877944933295487e-06, "loss": 2.0873512268066405, "step": 67050 }, { "epoch": 19.034913426057336, "grad_norm": 6.695289611816406, "learning_rate": 4.863752483678683e-06, "loss": 2.142196273803711, "step": 67060 }, { "epoch": 19.0377519159807, "grad_norm": 6.207818031311035, "learning_rate": 4.849560034061879e-06, "loss": 2.0837871551513674, "step": 67070 }, { "epoch": 19.04059040590406, 
"grad_norm": 6.8561224937438965, "learning_rate": 4.835367584445076e-06, "loss": 2.0398509979248045, "step": 67080 }, { "epoch": 19.043428895827418, "grad_norm": 6.542081832885742, "learning_rate": 4.8211751348282715e-06, "loss": 2.0955856323242186, "step": 67090 }, { "epoch": 19.04626738575078, "grad_norm": 6.3761515617370605, "learning_rate": 4.806982685211467e-06, "loss": 2.054019546508789, "step": 67100 }, { "epoch": 19.04910587567414, "grad_norm": 6.413814067840576, "learning_rate": 4.792790235594664e-06, "loss": 2.1492116928100584, "step": 67110 }, { "epoch": 19.051944365597503, "grad_norm": 6.240847110748291, "learning_rate": 4.77859778597786e-06, "loss": 2.1371826171875, "step": 67120 }, { "epoch": 19.054782855520862, "grad_norm": 6.057521820068359, "learning_rate": 4.764405336361056e-06, "loss": 2.1272699356079103, "step": 67130 }, { "epoch": 19.057621345444225, "grad_norm": 6.15475606918335, "learning_rate": 4.750212886744252e-06, "loss": 2.074911117553711, "step": 67140 }, { "epoch": 19.060459835367585, "grad_norm": 6.443698406219482, "learning_rate": 4.736020437127448e-06, "loss": 2.107526397705078, "step": 67150 }, { "epoch": 19.063298325290944, "grad_norm": 6.985461711883545, "learning_rate": 4.721827987510644e-06, "loss": 2.095809745788574, "step": 67160 }, { "epoch": 19.066136815214307, "grad_norm": 6.084860324859619, "learning_rate": 4.70763553789384e-06, "loss": 2.09850959777832, "step": 67170 }, { "epoch": 19.068975305137666, "grad_norm": 6.45111083984375, "learning_rate": 4.693443088277037e-06, "loss": 2.036625862121582, "step": 67180 }, { "epoch": 19.07181379506103, "grad_norm": 6.6113152503967285, "learning_rate": 4.679250638660233e-06, "loss": 2.0712963104248048, "step": 67190 }, { "epoch": 19.07465228498439, "grad_norm": 6.495716571807861, "learning_rate": 4.665058189043429e-06, "loss": 2.0762359619140627, "step": 67200 }, { "epoch": 19.077490774907748, "grad_norm": 6.676681041717529, "learning_rate": 4.650865739426625e-06, "loss": 
2.082073974609375, "step": 67210 }, { "epoch": 19.08032926483111, "grad_norm": 6.494283199310303, "learning_rate": 4.636673289809822e-06, "loss": 2.1639713287353515, "step": 67220 }, { "epoch": 19.08316775475447, "grad_norm": 6.253046035766602, "learning_rate": 4.622480840193018e-06, "loss": 2.0553775787353517, "step": 67230 }, { "epoch": 19.086006244677833, "grad_norm": 6.536293029785156, "learning_rate": 4.608288390576214e-06, "loss": 2.1125921249389648, "step": 67240 }, { "epoch": 19.088844734601192, "grad_norm": 5.932563304901123, "learning_rate": 4.59409594095941e-06, "loss": 2.0679742813110353, "step": 67250 }, { "epoch": 19.09168322452455, "grad_norm": 6.3105902671813965, "learning_rate": 4.579903491342606e-06, "loss": 2.036901664733887, "step": 67260 }, { "epoch": 19.094521714447914, "grad_norm": 6.845808506011963, "learning_rate": 4.565711041725802e-06, "loss": 2.1524208068847654, "step": 67270 }, { "epoch": 19.097360204371274, "grad_norm": 6.593080520629883, "learning_rate": 4.551518592108999e-06, "loss": 2.1450265884399413, "step": 67280 }, { "epoch": 19.100198694294637, "grad_norm": 6.4230427742004395, "learning_rate": 4.537326142492195e-06, "loss": 2.0990627288818358, "step": 67290 }, { "epoch": 19.103037184217996, "grad_norm": 6.259340763092041, "learning_rate": 4.5231336928753905e-06, "loss": 2.1285423278808593, "step": 67300 }, { "epoch": 19.105875674141355, "grad_norm": 6.603865146636963, "learning_rate": 4.508941243258586e-06, "loss": 2.0848546981811524, "step": 67310 }, { "epoch": 19.10871416406472, "grad_norm": 6.831305027008057, "learning_rate": 4.494748793641783e-06, "loss": 2.1165136337280273, "step": 67320 }, { "epoch": 19.111552653988078, "grad_norm": 6.297320365905762, "learning_rate": 4.480556344024979e-06, "loss": 2.0151609420776366, "step": 67330 }, { "epoch": 19.11439114391144, "grad_norm": 6.522212982177734, "learning_rate": 4.466363894408175e-06, "loss": 2.1344934463500977, "step": 67340 }, { "epoch": 19.1172296338348, "grad_norm": 
5.907701015472412, "learning_rate": 4.4521714447913715e-06, "loss": 2.1200750350952147, "step": 67350 }, { "epoch": 19.12006812375816, "grad_norm": 6.31487512588501, "learning_rate": 4.437978995174567e-06, "loss": 2.098598670959473, "step": 67360 }, { "epoch": 19.122906613681522, "grad_norm": 6.250391006469727, "learning_rate": 4.423786545557763e-06, "loss": 2.1314167022705077, "step": 67370 }, { "epoch": 19.12574510360488, "grad_norm": 6.051304817199707, "learning_rate": 4.40959409594096e-06, "loss": 2.113035202026367, "step": 67380 }, { "epoch": 19.128583593528244, "grad_norm": 6.474516868591309, "learning_rate": 4.395401646324156e-06, "loss": 2.107962799072266, "step": 67390 }, { "epoch": 19.131422083451604, "grad_norm": 6.1452813148498535, "learning_rate": 4.381209196707352e-06, "loss": 2.0844701766967773, "step": 67400 }, { "epoch": 19.134260573374963, "grad_norm": 6.313295364379883, "learning_rate": 4.3670167470905476e-06, "loss": 2.095779228210449, "step": 67410 }, { "epoch": 19.137099063298326, "grad_norm": 6.331062316894531, "learning_rate": 4.352824297473744e-06, "loss": 2.099691390991211, "step": 67420 }, { "epoch": 19.139937553221685, "grad_norm": 6.392172336578369, "learning_rate": 4.34005109281862e-06, "loss": 2.086508369445801, "step": 67430 }, { "epoch": 19.142776043145048, "grad_norm": 6.549079895019531, "learning_rate": 4.325858643201817e-06, "loss": 2.1336069107055664, "step": 67440 }, { "epoch": 19.145614533068407, "grad_norm": 6.550612449645996, "learning_rate": 4.311666193585013e-06, "loss": 2.1533985137939453, "step": 67450 }, { "epoch": 19.148453022991767, "grad_norm": 6.6946187019348145, "learning_rate": 4.297473743968209e-06, "loss": 2.1196590423583985, "step": 67460 }, { "epoch": 19.15129151291513, "grad_norm": 6.396333694458008, "learning_rate": 4.2832812943514054e-06, "loss": 2.068689155578613, "step": 67470 }, { "epoch": 19.15413000283849, "grad_norm": 6.670053005218506, "learning_rate": 4.269088844734601e-06, "loss": 
2.059081268310547, "step": 67480 }, { "epoch": 19.15696849276185, "grad_norm": 6.24868106842041, "learning_rate": 4.254896395117797e-06, "loss": 2.0591171264648436, "step": 67490 }, { "epoch": 19.15980698268521, "grad_norm": 6.5188469886779785, "learning_rate": 4.240703945500994e-06, "loss": 2.087921714782715, "step": 67500 }, { "epoch": 19.15980698268521, "eval_accuracy": 0.3650410122718891, "eval_loss": 2.431579351425171, "eval_runtime": 50.5971, "eval_samples_per_second": 310.828, "eval_steps_per_second": 4.862, "step": 67500 }, { "epoch": 19.162645472608574, "grad_norm": 6.620319843292236, "learning_rate": 4.22651149588419e-06, "loss": 2.118330192565918, "step": 67510 }, { "epoch": 19.165483962531933, "grad_norm": 6.206390380859375, "learning_rate": 4.212319046267386e-06, "loss": 2.0721864700317383, "step": 67520 }, { "epoch": 19.168322452455293, "grad_norm": 6.233907222747803, "learning_rate": 4.1981265966505814e-06, "loss": 2.0736772537231447, "step": 67530 }, { "epoch": 19.171160942378656, "grad_norm": 6.154344081878662, "learning_rate": 4.183934147033778e-06, "loss": 2.1800485610961915, "step": 67540 }, { "epoch": 19.173999432302015, "grad_norm": 6.853445053100586, "learning_rate": 4.169741697416974e-06, "loss": 2.039468002319336, "step": 67550 }, { "epoch": 19.176837922225378, "grad_norm": 6.786056995391846, "learning_rate": 4.15554924780017e-06, "loss": 2.1086858749389648, "step": 67560 }, { "epoch": 19.179676412148737, "grad_norm": 6.439430236816406, "learning_rate": 4.141356798183367e-06, "loss": 2.1754375457763673, "step": 67570 }, { "epoch": 19.182514902072096, "grad_norm": 6.104950428009033, "learning_rate": 4.1271643485665625e-06, "loss": 2.1044631958007813, "step": 67580 }, { "epoch": 19.18535339199546, "grad_norm": 6.515640735626221, "learning_rate": 4.112971898949759e-06, "loss": 2.059745979309082, "step": 67590 }, { "epoch": 19.18819188191882, "grad_norm": 6.210728168487549, "learning_rate": 4.098779449332955e-06, "loss": 2.111520195007324, 
"step": 67600 }, { "epoch": 19.19103037184218, "grad_norm": 6.353830814361572, "learning_rate": 4.084586999716152e-06, "loss": 2.1136795043945313, "step": 67610 }, { "epoch": 19.19386886176554, "grad_norm": 6.225428581237793, "learning_rate": 4.070394550099348e-06, "loss": 2.115445137023926, "step": 67620 }, { "epoch": 19.1967073516889, "grad_norm": 6.274924278259277, "learning_rate": 4.0562021004825435e-06, "loss": 2.02576904296875, "step": 67630 }, { "epoch": 19.199545841612263, "grad_norm": 6.488601207733154, "learning_rate": 4.04200965086574e-06, "loss": 2.057938003540039, "step": 67640 }, { "epoch": 19.202384331535622, "grad_norm": 6.500701427459717, "learning_rate": 4.027817201248936e-06, "loss": 2.1151050567626952, "step": 67650 }, { "epoch": 19.205222821458985, "grad_norm": 6.7889814376831055, "learning_rate": 4.013624751632132e-06, "loss": 2.1135974884033204, "step": 67660 }, { "epoch": 19.208061311382345, "grad_norm": 6.249377727508545, "learning_rate": 3.999432302015329e-06, "loss": 2.1258464813232423, "step": 67670 }, { "epoch": 19.210899801305704, "grad_norm": 6.833857536315918, "learning_rate": 3.9852398523985245e-06, "loss": 2.142241668701172, "step": 67680 }, { "epoch": 19.213738291229067, "grad_norm": 6.481902122497559, "learning_rate": 3.97104740278172e-06, "loss": 2.1111291885375976, "step": 67690 }, { "epoch": 19.216576781152426, "grad_norm": 6.591602802276611, "learning_rate": 3.956854953164916e-06, "loss": 2.1232831954956053, "step": 67700 }, { "epoch": 19.21941527107579, "grad_norm": 6.090618133544922, "learning_rate": 3.942662503548113e-06, "loss": 2.0820865631103516, "step": 67710 }, { "epoch": 19.22225376099915, "grad_norm": 6.579020023345947, "learning_rate": 3.928470053931309e-06, "loss": 2.097948455810547, "step": 67720 }, { "epoch": 19.225092250922508, "grad_norm": 6.47311544418335, "learning_rate": 3.914277604314505e-06, "loss": 2.084700012207031, "step": 67730 }, { "epoch": 19.22793074084587, "grad_norm": 6.267272472381592, 
"learning_rate": 3.900085154697701e-06, "loss": 2.052010917663574, "step": 67740 }, { "epoch": 19.23076923076923, "grad_norm": 6.84351110458374, "learning_rate": 3.885892705080897e-06, "loss": 2.100758361816406, "step": 67750 }, { "epoch": 19.233607720692593, "grad_norm": 6.385375499725342, "learning_rate": 3.871700255464093e-06, "loss": 2.106715774536133, "step": 67760 }, { "epoch": 19.236446210615952, "grad_norm": 6.436838626861572, "learning_rate": 3.85750780584729e-06, "loss": 2.1534347534179688, "step": 67770 }, { "epoch": 19.23928470053931, "grad_norm": 6.4915266036987305, "learning_rate": 3.843315356230486e-06, "loss": 2.0411409378051757, "step": 67780 }, { "epoch": 19.242123190462674, "grad_norm": 6.619574069976807, "learning_rate": 3.8291229066136815e-06, "loss": 2.080924224853516, "step": 67790 }, { "epoch": 19.244961680386034, "grad_norm": 6.41443395614624, "learning_rate": 3.814930456996877e-06, "loss": 2.1188499450683596, "step": 67800 }, { "epoch": 19.247800170309397, "grad_norm": 6.599486827850342, "learning_rate": 3.800738007380074e-06, "loss": 2.1512733459472657, "step": 67810 }, { "epoch": 19.250638660232756, "grad_norm": 6.34444522857666, "learning_rate": 3.78654555776327e-06, "loss": 2.1221818923950195, "step": 67820 }, { "epoch": 19.253477150156115, "grad_norm": 6.309469699859619, "learning_rate": 3.772353108146466e-06, "loss": 2.1254341125488283, "step": 67830 }, { "epoch": 19.256315640079478, "grad_norm": 6.409161567687988, "learning_rate": 3.758160658529662e-06, "loss": 2.0847213745117186, "step": 67840 }, { "epoch": 19.259154130002837, "grad_norm": 6.481197834014893, "learning_rate": 3.7439682089128584e-06, "loss": 2.129412078857422, "step": 67850 }, { "epoch": 19.2619926199262, "grad_norm": 6.42285680770874, "learning_rate": 3.7297757592960546e-06, "loss": 2.1039052963256837, "step": 67860 }, { "epoch": 19.26483110984956, "grad_norm": 6.617273330688477, "learning_rate": 3.7155833096792505e-06, "loss": 2.1772266387939454, "step": 67870 }, { 
"epoch": 19.267669599772923, "grad_norm": 6.670629024505615, "learning_rate": 3.701390860062447e-06, "loss": 2.11373291015625, "step": 67880 }, { "epoch": 19.270508089696282, "grad_norm": 6.543330192565918, "learning_rate": 3.6871984104456427e-06, "loss": 2.0739601135253904, "step": 67890 }, { "epoch": 19.27334657961964, "grad_norm": 6.003958225250244, "learning_rate": 3.673005960828839e-06, "loss": 2.076491928100586, "step": 67900 }, { "epoch": 19.276185069543004, "grad_norm": 6.386395454406738, "learning_rate": 3.6588135112120352e-06, "loss": 2.0682565689086916, "step": 67910 }, { "epoch": 19.279023559466363, "grad_norm": 6.555779933929443, "learning_rate": 3.644621061595232e-06, "loss": 2.1017219543457033, "step": 67920 }, { "epoch": 19.281862049389726, "grad_norm": 6.307458400726318, "learning_rate": 3.630428611978428e-06, "loss": 2.0498634338378907, "step": 67930 }, { "epoch": 19.284700539313086, "grad_norm": 6.174363613128662, "learning_rate": 3.616236162361624e-06, "loss": 2.0460586547851562, "step": 67940 }, { "epoch": 19.287539029236445, "grad_norm": 6.3906474113464355, "learning_rate": 3.6020437127448204e-06, "loss": 2.10177001953125, "step": 67950 }, { "epoch": 19.290377519159808, "grad_norm": 6.623344421386719, "learning_rate": 3.5878512631280162e-06, "loss": 2.1362373352050783, "step": 67960 }, { "epoch": 19.293216009083167, "grad_norm": 6.526345252990723, "learning_rate": 3.5736588135112125e-06, "loss": 2.048365592956543, "step": 67970 }, { "epoch": 19.29605449900653, "grad_norm": 6.324169158935547, "learning_rate": 3.5594663638944084e-06, "loss": 2.0744321823120115, "step": 67980 }, { "epoch": 19.29889298892989, "grad_norm": 6.3816142082214355, "learning_rate": 3.5452739142776047e-06, "loss": 2.110812187194824, "step": 67990 }, { "epoch": 19.30173147885325, "grad_norm": 6.322437763214111, "learning_rate": 3.531081464660801e-06, "loss": 2.1442949295043947, "step": 68000 }, { "epoch": 19.30173147885325, "eval_accuracy": 0.3641508234246837, "eval_loss": 
2.4306957721710205, "eval_runtime": 49.8155, "eval_samples_per_second": 315.705, "eval_steps_per_second": 4.938, "step": 68000 }, { "epoch": 19.30456996877661, "grad_norm": 6.433173179626465, "learning_rate": 3.516889015043997e-06, "loss": 2.055878257751465, "step": 68010 }, { "epoch": 19.30740845869997, "grad_norm": 6.7855119705200195, "learning_rate": 3.502696565427193e-06, "loss": 2.1172344207763674, "step": 68020 }, { "epoch": 19.310246948623334, "grad_norm": 6.798932075500488, "learning_rate": 3.488504115810389e-06, "loss": 2.0963972091674803, "step": 68030 }, { "epoch": 19.313085438546693, "grad_norm": 6.313915729522705, "learning_rate": 3.4743116661935852e-06, "loss": 2.186637115478516, "step": 68040 }, { "epoch": 19.315923928470053, "grad_norm": 6.561005115509033, "learning_rate": 3.4601192165767815e-06, "loss": 2.052413558959961, "step": 68050 }, { "epoch": 19.318762418393415, "grad_norm": 6.475256443023682, "learning_rate": 3.4459267669599774e-06, "loss": 2.088300895690918, "step": 68060 }, { "epoch": 19.321600908316775, "grad_norm": 6.183530807495117, "learning_rate": 3.4317343173431737e-06, "loss": 2.0234760284423827, "step": 68070 }, { "epoch": 19.324439398240138, "grad_norm": 6.353967666625977, "learning_rate": 3.4175418677263695e-06, "loss": 2.030813980102539, "step": 68080 }, { "epoch": 19.327277888163497, "grad_norm": 6.481558322906494, "learning_rate": 3.403349418109566e-06, "loss": 2.0772983551025392, "step": 68090 }, { "epoch": 19.330116378086856, "grad_norm": 6.477407455444336, "learning_rate": 3.389156968492762e-06, "loss": 2.0559816360473633, "step": 68100 }, { "epoch": 19.33295486801022, "grad_norm": 6.368035793304443, "learning_rate": 3.374964518875958e-06, "loss": 2.107138824462891, "step": 68110 }, { "epoch": 19.33579335793358, "grad_norm": 6.01248836517334, "learning_rate": 3.3607720692591543e-06, "loss": 2.0996753692626955, "step": 68120 }, { "epoch": 19.33863184785694, "grad_norm": 6.076104164123535, "learning_rate": 
3.34657961964235e-06, "loss": 2.007410430908203, "step": 68130 }, { "epoch": 19.3414703377803, "grad_norm": 6.55950927734375, "learning_rate": 3.3323871700255464e-06, "loss": 2.031051254272461, "step": 68140 }, { "epoch": 19.34430882770366, "grad_norm": 6.5766825675964355, "learning_rate": 3.3181947204087427e-06, "loss": 2.1053787231445313, "step": 68150 }, { "epoch": 19.347147317627023, "grad_norm": 6.504432678222656, "learning_rate": 3.3040022707919386e-06, "loss": 2.020735740661621, "step": 68160 }, { "epoch": 19.349985807550382, "grad_norm": 6.359589099884033, "learning_rate": 3.289809821175135e-06, "loss": 2.0964040756225586, "step": 68170 }, { "epoch": 19.352824297473745, "grad_norm": 6.424010753631592, "learning_rate": 3.2756173715583307e-06, "loss": 2.0562885284423826, "step": 68180 }, { "epoch": 19.355662787397105, "grad_norm": 6.51389217376709, "learning_rate": 3.261424921941527e-06, "loss": 2.0943450927734375, "step": 68190 }, { "epoch": 19.358501277320464, "grad_norm": 6.3142828941345215, "learning_rate": 3.2472324723247233e-06, "loss": 2.003646659851074, "step": 68200 }, { "epoch": 19.361339767243827, "grad_norm": 6.532552242279053, "learning_rate": 3.233040022707919e-06, "loss": 2.023122787475586, "step": 68210 }, { "epoch": 19.364178257167186, "grad_norm": 6.451848030090332, "learning_rate": 3.2188475730911154e-06, "loss": 2.013893890380859, "step": 68220 }, { "epoch": 19.36701674709055, "grad_norm": 6.399197578430176, "learning_rate": 3.2046551234743113e-06, "loss": 2.134144592285156, "step": 68230 }, { "epoch": 19.36985523701391, "grad_norm": 6.330262184143066, "learning_rate": 3.1904626738575084e-06, "loss": 2.0642988204956056, "step": 68240 }, { "epoch": 19.372693726937268, "grad_norm": 6.4088335037231445, "learning_rate": 3.1762702242407043e-06, "loss": 2.0868934631347655, "step": 68250 }, { "epoch": 19.37553221686063, "grad_norm": 6.564688205718994, "learning_rate": 3.1620777746239006e-06, "loss": 2.065348434448242, "step": 68260 }, { "epoch": 
19.37837070678399, "grad_norm": 6.757561683654785, "learning_rate": 3.147885325007097e-06, "loss": 2.0951576232910156, "step": 68270 }, { "epoch": 19.381209196707353, "grad_norm": 6.190958499908447, "learning_rate": 3.1336928753902927e-06, "loss": 2.0929521560668944, "step": 68280 }, { "epoch": 19.384047686630712, "grad_norm": 6.480405807495117, "learning_rate": 3.1195004257734886e-06, "loss": 2.119298553466797, "step": 68290 }, { "epoch": 19.386886176554075, "grad_norm": 6.733817100524902, "learning_rate": 3.105307976156685e-06, "loss": 2.1123773574829103, "step": 68300 }, { "epoch": 19.389724666477434, "grad_norm": 6.348940849304199, "learning_rate": 3.0911155265398807e-06, "loss": 2.077132225036621, "step": 68310 }, { "epoch": 19.392563156400794, "grad_norm": 6.392515659332275, "learning_rate": 3.0769230769230774e-06, "loss": 2.0769668579101563, "step": 68320 }, { "epoch": 19.395401646324157, "grad_norm": 6.598771572113037, "learning_rate": 3.0627306273062733e-06, "loss": 2.0756444931030273, "step": 68330 }, { "epoch": 19.398240136247516, "grad_norm": 6.36906623840332, "learning_rate": 3.0485381776894696e-06, "loss": 2.0876592636108398, "step": 68340 }, { "epoch": 19.40107862617088, "grad_norm": 6.3148274421691895, "learning_rate": 3.0343457280726654e-06, "loss": 2.027593994140625, "step": 68350 }, { "epoch": 19.403917116094238, "grad_norm": 6.534308433532715, "learning_rate": 3.0201532784558617e-06, "loss": 2.0963987350463866, "step": 68360 }, { "epoch": 19.406755606017597, "grad_norm": 6.628869533538818, "learning_rate": 3.005960828839058e-06, "loss": 2.0649541854858398, "step": 68370 }, { "epoch": 19.40959409594096, "grad_norm": 6.762785911560059, "learning_rate": 2.991768379222254e-06, "loss": 2.0839954376220704, "step": 68380 }, { "epoch": 19.41243258586432, "grad_norm": 6.476390838623047, "learning_rate": 2.97757592960545e-06, "loss": 2.124989318847656, "step": 68390 }, { "epoch": 19.415271075787683, "grad_norm": 6.133391380310059, "learning_rate": 
2.963383479988646e-06, "loss": 2.039175033569336, "step": 68400 }, { "epoch": 19.418109565711042, "grad_norm": 6.370323181152344, "learning_rate": 2.9491910303718423e-06, "loss": 2.0231199264526367, "step": 68410 }, { "epoch": 19.4209480556344, "grad_norm": 6.63827657699585, "learning_rate": 2.9349985807550386e-06, "loss": 2.191200065612793, "step": 68420 }, { "epoch": 19.423786545557764, "grad_norm": 6.157556533813477, "learning_rate": 2.9208061311382345e-06, "loss": 2.011971282958984, "step": 68430 }, { "epoch": 19.426625035481123, "grad_norm": 6.4703545570373535, "learning_rate": 2.9066136815214307e-06, "loss": 2.087154006958008, "step": 68440 }, { "epoch": 19.429463525404486, "grad_norm": 6.829237461090088, "learning_rate": 2.8924212319046266e-06, "loss": 2.1728994369506838, "step": 68450 }, { "epoch": 19.432302015327846, "grad_norm": 6.086109161376953, "learning_rate": 2.878228782287823e-06, "loss": 2.0808073043823243, "step": 68460 }, { "epoch": 19.435140505251205, "grad_norm": 6.7190985679626465, "learning_rate": 2.864036332671019e-06, "loss": 2.0816802978515625, "step": 68470 }, { "epoch": 19.437978995174568, "grad_norm": 6.311476230621338, "learning_rate": 2.849843883054215e-06, "loss": 1.972696304321289, "step": 68480 }, { "epoch": 19.440817485097927, "grad_norm": 6.698836803436279, "learning_rate": 2.8356514334374118e-06, "loss": 2.1288875579833983, "step": 68490 }, { "epoch": 19.44365597502129, "grad_norm": 6.6951704025268555, "learning_rate": 2.8214589838206076e-06, "loss": 2.172747802734375, "step": 68500 }, { "epoch": 19.44365597502129, "eval_accuracy": 0.3663762955426973, "eval_loss": 2.4298970699310303, "eval_runtime": 52.308, "eval_samples_per_second": 300.661, "eval_steps_per_second": 4.703, "step": 68500 }, { "epoch": 19.44649446494465, "grad_norm": 6.248802661895752, "learning_rate": 2.807266534203804e-06, "loss": 2.0958675384521483, "step": 68510 }, { "epoch": 19.44933295486801, "grad_norm": 6.19848108291626, "learning_rate": 
2.7930740845869998e-06, "loss": 2.053085708618164, "step": 68520 }, { "epoch": 19.45217144479137, "grad_norm": 6.741959571838379, "learning_rate": 2.778881634970196e-06, "loss": 2.078105354309082, "step": 68530 }, { "epoch": 19.45500993471473, "grad_norm": 6.489747524261475, "learning_rate": 2.7646891853533923e-06, "loss": 2.0964242935180666, "step": 68540 }, { "epoch": 19.457848424638094, "grad_norm": 6.529415607452393, "learning_rate": 2.750496735736588e-06, "loss": 2.124808120727539, "step": 68550 }, { "epoch": 19.460686914561453, "grad_norm": 6.2543463706970215, "learning_rate": 2.7363042861197845e-06, "loss": 2.1236902236938477, "step": 68560 }, { "epoch": 19.463525404484812, "grad_norm": 5.905772686004639, "learning_rate": 2.7221118365029803e-06, "loss": 2.098014259338379, "step": 68570 }, { "epoch": 19.466363894408175, "grad_norm": Infinity, "learning_rate": 2.7079193868861766e-06, "loss": 2.119043159484863, "step": 68580 }, { "epoch": 19.469202384331535, "grad_norm": 6.694770812988281, "learning_rate": 2.6951461822310535e-06, "loss": 1.9850883483886719, "step": 68590 }, { "epoch": 19.472040874254898, "grad_norm": 6.372528076171875, "learning_rate": 2.6809537326142494e-06, "loss": 2.091802787780762, "step": 68600 }, { "epoch": 19.474879364178257, "grad_norm": 6.120728015899658, "learning_rate": 2.6667612829974456e-06, "loss": 2.028517150878906, "step": 68610 }, { "epoch": 19.477717854101616, "grad_norm": 6.344459056854248, "learning_rate": 2.6525688333806415e-06, "loss": 1.950128173828125, "step": 68620 }, { "epoch": 19.48055634402498, "grad_norm": 6.216738700866699, "learning_rate": 2.638376383763838e-06, "loss": 2.106606674194336, "step": 68630 }, { "epoch": 19.48339483394834, "grad_norm": 6.477115631103516, "learning_rate": 2.624183934147034e-06, "loss": 2.1037315368652343, "step": 68640 }, { "epoch": 19.4862333238717, "grad_norm": 6.261562347412109, "learning_rate": 2.60999148453023e-06, "loss": 2.101034927368164, "step": 68650 }, { "epoch": 
19.48907181379506, "grad_norm": 6.869441509246826, "learning_rate": 2.5957990349134262e-06, "loss": 2.1076961517333985, "step": 68660 }, { "epoch": 19.49191030371842, "grad_norm": 6.2857794761657715, "learning_rate": 2.581606585296622e-06, "loss": 2.080287551879883, "step": 68670 }, { "epoch": 19.494748793641783, "grad_norm": 6.0823140144348145, "learning_rate": 2.5674141356798184e-06, "loss": 2.0599817276000976, "step": 68680 }, { "epoch": 19.497587283565142, "grad_norm": 6.834015369415283, "learning_rate": 2.5532216860630147e-06, "loss": 2.1260766983032227, "step": 68690 }, { "epoch": 19.500425773488505, "grad_norm": 6.121969699859619, "learning_rate": 2.5390292364462105e-06, "loss": 1.972315216064453, "step": 68700 }, { "epoch": 19.503264263411864, "grad_norm": 6.327489852905273, "learning_rate": 2.524836786829407e-06, "loss": 2.125595474243164, "step": 68710 }, { "epoch": 19.506102753335227, "grad_norm": 6.320100784301758, "learning_rate": 2.510644337212603e-06, "loss": 2.1015209197998046, "step": 68720 }, { "epoch": 19.508941243258587, "grad_norm": 6.575984954833984, "learning_rate": 2.4964518875957994e-06, "loss": 2.221806526184082, "step": 68730 }, { "epoch": 19.511779733181946, "grad_norm": 6.644559860229492, "learning_rate": 2.4822594379789952e-06, "loss": 2.0597557067871093, "step": 68740 }, { "epoch": 19.51461822310531, "grad_norm": 6.166727066040039, "learning_rate": 2.4680669883621915e-06, "loss": 2.048693084716797, "step": 68750 }, { "epoch": 19.51745671302867, "grad_norm": 6.366209983825684, "learning_rate": 2.453874538745388e-06, "loss": 2.0710613250732424, "step": 68760 }, { "epoch": 19.52029520295203, "grad_norm": 6.398375511169434, "learning_rate": 2.4396820891285837e-06, "loss": 2.1323663711547853, "step": 68770 }, { "epoch": 19.52313369287539, "grad_norm": 6.424984455108643, "learning_rate": 2.42548963951178e-06, "loss": 2.0863359451293944, "step": 68780 }, { "epoch": 19.52597218279875, "grad_norm": 6.497331142425537, "learning_rate": 
2.411297189894976e-06, "loss": 2.110247039794922, "step": 68790 }, { "epoch": 19.528810672722113, "grad_norm": 6.156383991241455, "learning_rate": 2.397104740278172e-06, "loss": 2.023802375793457, "step": 68800 }, { "epoch": 19.531649162645472, "grad_norm": 6.553114414215088, "learning_rate": 2.3829122906613684e-06, "loss": 2.058660316467285, "step": 68810 }, { "epoch": 19.534487652568835, "grad_norm": 6.401943683624268, "learning_rate": 2.3687198410445643e-06, "loss": 2.1156185150146483, "step": 68820 }, { "epoch": 19.537326142492194, "grad_norm": 6.368393898010254, "learning_rate": 2.3545273914277605e-06, "loss": 2.1067373275756838, "step": 68830 }, { "epoch": 19.540164632415554, "grad_norm": 6.9620184898376465, "learning_rate": 2.3403349418109564e-06, "loss": 2.1292692184448243, "step": 68840 }, { "epoch": 19.543003122338916, "grad_norm": 6.547654628753662, "learning_rate": 2.3261424921941527e-06, "loss": 2.1287246704101563, "step": 68850 }, { "epoch": 19.545841612262276, "grad_norm": 7.024515151977539, "learning_rate": 2.311950042577349e-06, "loss": 2.0981239318847655, "step": 68860 }, { "epoch": 19.54868010218564, "grad_norm": 6.355991840362549, "learning_rate": 2.297757592960545e-06, "loss": 2.093443489074707, "step": 68870 }, { "epoch": 19.551518592108998, "grad_norm": 6.35290002822876, "learning_rate": 2.2835651433437416e-06, "loss": 2.1018646240234373, "step": 68880 }, { "epoch": 19.554357082032357, "grad_norm": 6.427873611450195, "learning_rate": 2.2693726937269374e-06, "loss": 2.0272119522094725, "step": 68890 }, { "epoch": 19.55719557195572, "grad_norm": 6.554610252380371, "learning_rate": 2.2551802441101337e-06, "loss": 2.06140193939209, "step": 68900 }, { "epoch": 19.56003406187908, "grad_norm": 6.582852840423584, "learning_rate": 2.2409877944933296e-06, "loss": 2.0516437530517577, "step": 68910 }, { "epoch": 19.562872551802442, "grad_norm": 6.433305740356445, "learning_rate": 2.226795344876526e-06, "loss": 2.0444961547851563, "step": 68920 }, { 
"epoch": 19.565711041725802, "grad_norm": 6.0980424880981445, "learning_rate": 2.212602895259722e-06, "loss": 2.098438835144043, "step": 68930 }, { "epoch": 19.56854953164916, "grad_norm": 6.2662153244018555, "learning_rate": 2.198410445642918e-06, "loss": 2.0465307235717773, "step": 68940 }, { "epoch": 19.571388021572524, "grad_norm": 6.558994770050049, "learning_rate": 2.1842179960261143e-06, "loss": 2.0794052124023437, "step": 68950 }, { "epoch": 19.574226511495883, "grad_norm": 6.90064811706543, "learning_rate": 2.17002554640931e-06, "loss": 2.1465631484985352, "step": 68960 }, { "epoch": 19.577065001419246, "grad_norm": 5.997625350952148, "learning_rate": 2.1558330967925064e-06, "loss": 1.9484722137451171, "step": 68970 }, { "epoch": 19.579903491342606, "grad_norm": 6.3610758781433105, "learning_rate": 2.1416406471757027e-06, "loss": 2.14228515625, "step": 68980 }, { "epoch": 19.582741981265965, "grad_norm": 6.175620079040527, "learning_rate": 2.1274481975588986e-06, "loss": 2.1037158966064453, "step": 68990 }, { "epoch": 19.585580471189328, "grad_norm": 6.88193941116333, "learning_rate": 2.113255747942095e-06, "loss": 2.074156379699707, "step": 69000 }, { "epoch": 19.585580471189328, "eval_accuracy": 0.363832898836396, "eval_loss": 2.429527997970581, "eval_runtime": 49.5323, "eval_samples_per_second": 317.51, "eval_steps_per_second": 4.966, "step": 69000 }, { "epoch": 19.588418961112687, "grad_norm": 6.5178656578063965, "learning_rate": 2.0990632983252907e-06, "loss": 2.0957250595092773, "step": 69010 }, { "epoch": 19.59125745103605, "grad_norm": 6.651074409484863, "learning_rate": 2.084870848708487e-06, "loss": 2.0821769714355467, "step": 69020 }, { "epoch": 19.59409594095941, "grad_norm": 6.540046215057373, "learning_rate": 2.0706783990916833e-06, "loss": 2.047433853149414, "step": 69030 }, { "epoch": 19.59693443088277, "grad_norm": 6.409818649291992, "learning_rate": 2.0564859494748796e-06, "loss": 2.0924171447753905, "step": 69040 }, { "epoch": 
19.59977292080613, "grad_norm": 6.203290939331055, "learning_rate": 2.042293499858076e-06, "loss": 2.0303627014160157, "step": 69050 }, { "epoch": 19.60261141072949, "grad_norm": 6.8145575523376465, "learning_rate": 2.0281010502412717e-06, "loss": 2.0696079254150392, "step": 69060 }, { "epoch": 19.605449900652854, "grad_norm": 6.138033390045166, "learning_rate": 2.013908600624468e-06, "loss": 2.079982376098633, "step": 69070 }, { "epoch": 19.608288390576213, "grad_norm": 6.311410427093506, "learning_rate": 1.9997161510076643e-06, "loss": 1.9916648864746094, "step": 69080 }, { "epoch": 19.611126880499576, "grad_norm": 6.275455951690674, "learning_rate": 1.98552370139086e-06, "loss": 2.0353031158447266, "step": 69090 }, { "epoch": 19.613965370422935, "grad_norm": 7.152448654174805, "learning_rate": 1.9713312517740564e-06, "loss": 2.1159635543823243, "step": 69100 }, { "epoch": 19.616803860346295, "grad_norm": 6.406869411468506, "learning_rate": 1.9571388021572523e-06, "loss": 2.085752487182617, "step": 69110 }, { "epoch": 19.619642350269658, "grad_norm": 6.7403411865234375, "learning_rate": 1.9429463525404486e-06, "loss": 2.0758743286132812, "step": 69120 }, { "epoch": 19.622480840193017, "grad_norm": 6.564423561096191, "learning_rate": 1.928753902923645e-06, "loss": 2.1132293701171876, "step": 69130 }, { "epoch": 19.62531933011638, "grad_norm": 6.264313697814941, "learning_rate": 1.9145614533068407e-06, "loss": 2.109466552734375, "step": 69140 }, { "epoch": 19.62815782003974, "grad_norm": 6.447768211364746, "learning_rate": 1.900369003690037e-06, "loss": 2.0333276748657227, "step": 69150 }, { "epoch": 19.6309963099631, "grad_norm": 6.147251605987549, "learning_rate": 1.886176554073233e-06, "loss": 2.0807138442993165, "step": 69160 }, { "epoch": 19.63383479988646, "grad_norm": 6.1322126388549805, "learning_rate": 1.8719841044564292e-06, "loss": 2.025460433959961, "step": 69170 }, { "epoch": 19.63667328980982, "grad_norm": 6.406500816345215, "learning_rate": 
1.8577916548396253e-06, "loss": 2.1637714385986326, "step": 69180 }, { "epoch": 19.639511779733184, "grad_norm": 6.300953388214111, "learning_rate": 1.8435992052228213e-06, "loss": 2.084793281555176, "step": 69190 }, { "epoch": 19.642350269656543, "grad_norm": 5.932681560516357, "learning_rate": 1.8294067556060176e-06, "loss": 2.0200353622436524, "step": 69200 }, { "epoch": 19.645188759579902, "grad_norm": 6.336604595184326, "learning_rate": 1.815214305989214e-06, "loss": 2.084672546386719, "step": 69210 }, { "epoch": 19.648027249503265, "grad_norm": 6.576728820800781, "learning_rate": 1.8010218563724102e-06, "loss": 2.0848663330078123, "step": 69220 }, { "epoch": 19.650865739426624, "grad_norm": 6.242593288421631, "learning_rate": 1.7868294067556063e-06, "loss": 2.086861228942871, "step": 69230 }, { "epoch": 19.653704229349987, "grad_norm": 6.838253021240234, "learning_rate": 1.7726369571388023e-06, "loss": 2.03204345703125, "step": 69240 }, { "epoch": 19.656542719273347, "grad_norm": 6.431533336639404, "learning_rate": 1.7584445075219984e-06, "loss": 2.0890405654907225, "step": 69250 }, { "epoch": 19.659381209196706, "grad_norm": 6.467026710510254, "learning_rate": 1.7442520579051945e-06, "loss": 2.0950977325439455, "step": 69260 }, { "epoch": 19.66221969912007, "grad_norm": 6.527979373931885, "learning_rate": 1.7300596082883908e-06, "loss": 2.11132698059082, "step": 69270 }, { "epoch": 19.665058189043428, "grad_norm": 6.265219688415527, "learning_rate": 1.7158671586715868e-06, "loss": 2.1423484802246096, "step": 69280 }, { "epoch": 19.66789667896679, "grad_norm": 6.581800937652588, "learning_rate": 1.701674709054783e-06, "loss": 2.111594581604004, "step": 69290 }, { "epoch": 19.67073516889015, "grad_norm": 6.314266681671143, "learning_rate": 1.687482259437979e-06, "loss": 2.1023841857910157, "step": 69300 }, { "epoch": 19.67357365881351, "grad_norm": 6.072166442871094, "learning_rate": 1.673289809821175e-06, "loss": 2.0151603698730467, "step": 69310 }, { 
"epoch": 19.676412148736873, "grad_norm": 6.856130123138428, "learning_rate": 1.6590973602043713e-06, "loss": 2.074780082702637, "step": 69320 }, { "epoch": 19.679250638660232, "grad_norm": 6.315464496612549, "learning_rate": 1.6449049105875674e-06, "loss": 2.0206417083740233, "step": 69330 }, { "epoch": 19.682089128583595, "grad_norm": 6.425766468048096, "learning_rate": 1.6307124609707635e-06, "loss": 2.042862319946289, "step": 69340 }, { "epoch": 19.684927618506954, "grad_norm": 6.416319370269775, "learning_rate": 1.6165200113539596e-06, "loss": 2.1174427032470704, "step": 69350 }, { "epoch": 19.687766108430313, "grad_norm": 6.212642669677734, "learning_rate": 1.6023275617371556e-06, "loss": 2.05150146484375, "step": 69360 }, { "epoch": 19.690604598353676, "grad_norm": 6.530059337615967, "learning_rate": 1.5881351121203521e-06, "loss": 2.122101593017578, "step": 69370 }, { "epoch": 19.693443088277036, "grad_norm": 6.3336896896362305, "learning_rate": 1.5739426625035484e-06, "loss": 2.0802213668823244, "step": 69380 }, { "epoch": 19.6962815782004, "grad_norm": 6.64817476272583, "learning_rate": 1.5597502128867443e-06, "loss": 2.0026714324951174, "step": 69390 }, { "epoch": 19.699120068123758, "grad_norm": 6.228942394256592, "learning_rate": 1.5455577632699404e-06, "loss": 2.1517845153808595, "step": 69400 }, { "epoch": 19.701958558047117, "grad_norm": 6.513351917266846, "learning_rate": 1.5313653136531366e-06, "loss": 2.0465925216674803, "step": 69410 }, { "epoch": 19.70479704797048, "grad_norm": 6.679276943206787, "learning_rate": 1.5171728640363327e-06, "loss": 2.0939632415771485, "step": 69420 }, { "epoch": 19.70763553789384, "grad_norm": 6.732625961303711, "learning_rate": 1.502980414419529e-06, "loss": 2.153000831604004, "step": 69430 }, { "epoch": 19.710474027817202, "grad_norm": 6.59044885635376, "learning_rate": 1.488787964802725e-06, "loss": 2.080823516845703, "step": 69440 }, { "epoch": 19.71331251774056, "grad_norm": 6.357271671295166, "learning_rate": 
1.4745955151859212e-06, "loss": 2.1361320495605467, "step": 69450 }, { "epoch": 19.716151007663925, "grad_norm": 6.5613250732421875, "learning_rate": 1.4604030655691172e-06, "loss": 2.0299667358398437, "step": 69460 }, { "epoch": 19.718989497587284, "grad_norm": 6.785191535949707, "learning_rate": 1.4462106159523133e-06, "loss": 2.131132125854492, "step": 69470 }, { "epoch": 19.721827987510643, "grad_norm": 6.623903751373291, "learning_rate": 1.4320181663355096e-06, "loss": 2.064895439147949, "step": 69480 }, { "epoch": 19.724666477434006, "grad_norm": 6.878127574920654, "learning_rate": 1.4178257167187059e-06, "loss": 2.0331493377685548, "step": 69490 }, { "epoch": 19.727504967357365, "grad_norm": 6.783996105194092, "learning_rate": 1.403633267101902e-06, "loss": 2.1021900177001953, "step": 69500 }, { "epoch": 19.727504967357365, "eval_accuracy": 0.36389648375405353, "eval_loss": 2.428943395614624, "eval_runtime": 50.542, "eval_samples_per_second": 311.167, "eval_steps_per_second": 4.867, "step": 69500 }, { "epoch": 19.73034345728073, "grad_norm": 6.605818748474121, "learning_rate": 1.389440817485098e-06, "loss": 2.155445861816406, "step": 69510 }, { "epoch": 19.733181947204088, "grad_norm": 6.578017234802246, "learning_rate": 1.375248367868294e-06, "loss": 2.1013708114624023, "step": 69520 }, { "epoch": 19.736020437127447, "grad_norm": 6.575100898742676, "learning_rate": 1.3610559182514902e-06, "loss": 2.0244258880615233, "step": 69530 }, { "epoch": 19.73885892705081, "grad_norm": 6.490363121032715, "learning_rate": 1.3468634686346865e-06, "loss": 2.1456220626831053, "step": 69540 }, { "epoch": 19.74169741697417, "grad_norm": 6.180432319641113, "learning_rate": 1.3326710190178825e-06, "loss": 2.0188009262084963, "step": 69550 }, { "epoch": 19.744535906897532, "grad_norm": 6.669561386108398, "learning_rate": 1.3184785694010786e-06, "loss": 2.0336816787719725, "step": 69560 }, { "epoch": 19.74737439682089, "grad_norm": 6.477922439575195, "learning_rate": 
1.3042861197842749e-06, "loss": 2.068826675415039, "step": 69570 }, { "epoch": 19.75021288674425, "grad_norm": 6.638596057891846, "learning_rate": 1.290093670167471e-06, "loss": 2.140557861328125, "step": 69580 }, { "epoch": 19.753051376667614, "grad_norm": 6.732465744018555, "learning_rate": 1.275901220550667e-06, "loss": 2.096497344970703, "step": 69590 }, { "epoch": 19.755889866590973, "grad_norm": 6.50888204574585, "learning_rate": 1.2617087709338633e-06, "loss": 2.0711315155029295, "step": 69600 }, { "epoch": 19.758728356514336, "grad_norm": 6.373586654663086, "learning_rate": 1.2475163213170594e-06, "loss": 2.0337282180786134, "step": 69610 }, { "epoch": 19.761566846437695, "grad_norm": 6.293712139129639, "learning_rate": 1.2333238717002555e-06, "loss": 2.013810920715332, "step": 69620 }, { "epoch": 19.764405336361055, "grad_norm": 6.362240314483643, "learning_rate": 1.2191314220834515e-06, "loss": 2.0971038818359373, "step": 69630 }, { "epoch": 19.767243826284417, "grad_norm": 6.302797794342041, "learning_rate": 1.2049389724666478e-06, "loss": 2.0742010116577148, "step": 69640 }, { "epoch": 19.770082316207777, "grad_norm": 6.421935081481934, "learning_rate": 1.1907465228498441e-06, "loss": 2.1230838775634764, "step": 69650 }, { "epoch": 19.77292080613114, "grad_norm": 6.6694746017456055, "learning_rate": 1.1765540732330402e-06, "loss": 2.090347671508789, "step": 69660 }, { "epoch": 19.7757592960545, "grad_norm": 6.576367378234863, "learning_rate": 1.1623616236162363e-06, "loss": 2.0490203857421876, "step": 69670 }, { "epoch": 19.77859778597786, "grad_norm": 6.610611438751221, "learning_rate": 1.1481691739994323e-06, "loss": 2.0596195220947267, "step": 69680 }, { "epoch": 19.78143627590122, "grad_norm": 6.635383605957031, "learning_rate": 1.1339767243826284e-06, "loss": 2.0331985473632814, "step": 69690 }, { "epoch": 19.78427476582458, "grad_norm": 6.516880035400391, "learning_rate": 1.1197842747658247e-06, "loss": 2.1182861328125, "step": 69700 }, { "epoch": 
19.787113255747943, "grad_norm": 6.74734354019165, "learning_rate": 1.1055918251490208e-06, "loss": 2.1611026763916015, "step": 69710 }, { "epoch": 19.789951745671303, "grad_norm": 6.429272174835205, "learning_rate": 1.0913993755322168e-06, "loss": 2.0792654037475584, "step": 69720 }, { "epoch": 19.792790235594662, "grad_norm": 6.335063457489014, "learning_rate": 1.077206925915413e-06, "loss": 2.151951026916504, "step": 69730 }, { "epoch": 19.795628725518025, "grad_norm": 6.300475120544434, "learning_rate": 1.0630144762986092e-06, "loss": 2.155730438232422, "step": 69740 }, { "epoch": 19.798467215441384, "grad_norm": 6.741990089416504, "learning_rate": 1.0488220266818053e-06, "loss": 2.130934715270996, "step": 69750 }, { "epoch": 19.801305705364747, "grad_norm": 6.406464576721191, "learning_rate": 1.0346295770650016e-06, "loss": 2.031396675109863, "step": 69760 }, { "epoch": 19.804144195288107, "grad_norm": 6.2535624504089355, "learning_rate": 1.0204371274481976e-06, "loss": 2.024748992919922, "step": 69770 }, { "epoch": 19.806982685211466, "grad_norm": 6.507019996643066, "learning_rate": 1.0062446778313937e-06, "loss": 2.1001792907714845, "step": 69780 }, { "epoch": 19.80982117513483, "grad_norm": 6.536484718322754, "learning_rate": 9.920522282145898e-07, "loss": 2.15488224029541, "step": 69790 }, { "epoch": 19.812659665058188, "grad_norm": 6.497816562652588, "learning_rate": 9.778597785977859e-07, "loss": 2.0621753692626954, "step": 69800 }, { "epoch": 19.81549815498155, "grad_norm": 6.61769437789917, "learning_rate": 9.636673289809821e-07, "loss": 2.139110565185547, "step": 69810 }, { "epoch": 19.81833664490491, "grad_norm": 6.268617630004883, "learning_rate": 9.494748793641783e-07, "loss": 2.0984918594360353, "step": 69820 }, { "epoch": 19.82117513482827, "grad_norm": 6.077764511108398, "learning_rate": 9.352824297473745e-07, "loss": 2.175589179992676, "step": 69830 }, { "epoch": 19.824013624751633, "grad_norm": 6.65739631652832, "learning_rate": 
9.210899801305706e-07, "loss": 2.079314041137695, "step": 69840 }, { "epoch": 19.826852114674992, "grad_norm": 6.353156089782715, "learning_rate": 9.068975305137668e-07, "loss": 2.1594438552856445, "step": 69850 }, { "epoch": 19.829690604598355, "grad_norm": 6.472555160522461, "learning_rate": 8.927050808969628e-07, "loss": 2.122989463806152, "step": 69860 }, { "epoch": 19.832529094521714, "grad_norm": 6.416472434997559, "learning_rate": 8.785126312801589e-07, "loss": 2.0730276107788086, "step": 69870 }, { "epoch": 19.835367584445073, "grad_norm": 6.228769779205322, "learning_rate": 8.643201816633551e-07, "loss": 2.085074234008789, "step": 69880 }, { "epoch": 19.838206074368436, "grad_norm": 6.791378974914551, "learning_rate": 8.501277320465512e-07, "loss": 2.1471956253051756, "step": 69890 }, { "epoch": 19.841044564291796, "grad_norm": 6.221165657043457, "learning_rate": 8.359352824297475e-07, "loss": 2.0418134689331056, "step": 69900 }, { "epoch": 19.84388305421516, "grad_norm": 6.9033203125, "learning_rate": 8.217428328129436e-07, "loss": 2.139931297302246, "step": 69910 }, { "epoch": 19.846721544138518, "grad_norm": 6.291630744934082, "learning_rate": 8.075503831961397e-07, "loss": 2.060548782348633, "step": 69920 }, { "epoch": 19.84956003406188, "grad_norm": 6.360634803771973, "learning_rate": 7.933579335793359e-07, "loss": 2.0880220413208006, "step": 69930 }, { "epoch": 19.85239852398524, "grad_norm": 6.274386405944824, "learning_rate": 7.79165483962532e-07, "loss": 2.0465826034545898, "step": 69940 }, { "epoch": 19.8552370139086, "grad_norm": 6.493825912475586, "learning_rate": 7.64973034345728e-07, "loss": 2.099424934387207, "step": 69950 }, { "epoch": 19.858075503831962, "grad_norm": 6.265414714813232, "learning_rate": 7.507805847289243e-07, "loss": 2.052669715881348, "step": 69960 }, { "epoch": 19.86091399375532, "grad_norm": 6.82550048828125, "learning_rate": 7.365881351121204e-07, "loss": 2.124346923828125, "step": 69970 }, { "epoch": 
19.863752483678685, "grad_norm": 6.237912654876709, "learning_rate": 7.223956854953165e-07, "loss": 2.110402297973633, "step": 69980 }, { "epoch": 19.866590973602044, "grad_norm": 6.27518367767334, "learning_rate": 7.082032358785126e-07, "loss": 2.1032867431640625, "step": 69990 }, { "epoch": 19.869429463525403, "grad_norm": 6.4367265701293945, "learning_rate": 6.940107862617088e-07, "loss": 2.0360553741455076, "step": 70000 }, { "epoch": 19.869429463525403, "eval_accuracy": 0.36396006867171105, "eval_loss": 2.4284770488739014, "eval_runtime": 48.8934, "eval_samples_per_second": 321.659, "eval_steps_per_second": 5.031, "step": 70000 }, { "epoch": 19.872267953448766, "grad_norm": 6.189291000366211, "learning_rate": 6.79818336644905e-07, "loss": 2.032421875, "step": 70010 }, { "epoch": 19.875106443372125, "grad_norm": 6.291744709014893, "learning_rate": 6.656258870281011e-07, "loss": 2.0752363204956055, "step": 70020 }, { "epoch": 19.87794493329549, "grad_norm": 6.490698337554932, "learning_rate": 6.514334374112972e-07, "loss": 2.1882293701171873, "step": 70030 }, { "epoch": 19.880783423218848, "grad_norm": 6.523173809051514, "learning_rate": 6.372409877944934e-07, "loss": 2.201486587524414, "step": 70040 }, { "epoch": 19.883621913142207, "grad_norm": 6.674595832824707, "learning_rate": 6.230485381776895e-07, "loss": 2.125614547729492, "step": 70050 }, { "epoch": 19.88646040306557, "grad_norm": 6.454622268676758, "learning_rate": 6.088560885608856e-07, "loss": 2.0606828689575196, "step": 70060 }, { "epoch": 19.88929889298893, "grad_norm": 6.7107834815979, "learning_rate": 5.946636389440818e-07, "loss": 2.0904788970947266, "step": 70070 }, { "epoch": 19.892137382912292, "grad_norm": 6.537896156311035, "learning_rate": 5.80471189327278e-07, "loss": 2.0547550201416014, "step": 70080 }, { "epoch": 19.89497587283565, "grad_norm": 6.579782485961914, "learning_rate": 5.66278739710474e-07, "loss": 2.092763900756836, "step": 70090 }, { "epoch": 19.89781436275901, "grad_norm": 
6.321341514587402, "learning_rate": 5.520862900936702e-07, "loss": 2.000357437133789, "step": 70100 }, { "epoch": 19.900652852682374, "grad_norm": 6.4502739906311035, "learning_rate": 5.378938404768663e-07, "loss": 2.098824882507324, "step": 70110 }, { "epoch": 19.903491342605733, "grad_norm": 5.968274116516113, "learning_rate": 5.237013908600626e-07, "loss": 2.0149648666381834, "step": 70120 }, { "epoch": 19.906329832529096, "grad_norm": 6.710988998413086, "learning_rate": 5.095089412432586e-07, "loss": 2.095047378540039, "step": 70130 }, { "epoch": 19.909168322452455, "grad_norm": 6.498376369476318, "learning_rate": 4.953164916264547e-07, "loss": 2.0828248977661135, "step": 70140 }, { "epoch": 19.912006812375814, "grad_norm": 6.19550085067749, "learning_rate": 4.811240420096509e-07, "loss": 2.0620908737182617, "step": 70150 }, { "epoch": 19.914845302299177, "grad_norm": 6.073607921600342, "learning_rate": 4.6693159239284707e-07, "loss": 2.071014976501465, "step": 70160 }, { "epoch": 19.917683792222537, "grad_norm": 6.462817192077637, "learning_rate": 4.527391427760432e-07, "loss": 2.072454833984375, "step": 70170 }, { "epoch": 19.9205222821459, "grad_norm": 6.488985061645508, "learning_rate": 4.385466931592393e-07, "loss": 2.096533203125, "step": 70180 }, { "epoch": 19.92336077206926, "grad_norm": 6.295605182647705, "learning_rate": 4.243542435424354e-07, "loss": 2.1011539459228517, "step": 70190 }, { "epoch": 19.92619926199262, "grad_norm": 6.319330215454102, "learning_rate": 4.1016179392563163e-07, "loss": 2.1107526779174806, "step": 70200 }, { "epoch": 19.92903775191598, "grad_norm": 6.012654781341553, "learning_rate": 3.9596934430882776e-07, "loss": 2.042133331298828, "step": 70210 }, { "epoch": 19.93187624183934, "grad_norm": 6.192951202392578, "learning_rate": 3.817768946920239e-07, "loss": 2.0520441055297853, "step": 70220 }, { "epoch": 19.934714731762703, "grad_norm": 6.8538994789123535, "learning_rate": 3.6758444507521996e-07, "loss": 2.0941450119018556, 
"step": 70230 }, { "epoch": 19.937553221686063, "grad_norm": 6.317290782928467, "learning_rate": 3.5339199545841614e-07, "loss": 2.1179752349853516, "step": 70240 }, { "epoch": 19.940391711609422, "grad_norm": 6.60971212387085, "learning_rate": 3.3919954584161226e-07, "loss": 2.067679786682129, "step": 70250 }, { "epoch": 19.943230201532785, "grad_norm": 6.087317943572998, "learning_rate": 3.2500709622480844e-07, "loss": 2.013270378112793, "step": 70260 }, { "epoch": 19.946068691456144, "grad_norm": 6.370477199554443, "learning_rate": 3.108146466080045e-07, "loss": 2.102530860900879, "step": 70270 }, { "epoch": 19.948907181379507, "grad_norm": 6.351400852203369, "learning_rate": 2.966221969912007e-07, "loss": 2.1118730545043944, "step": 70280 }, { "epoch": 19.951745671302866, "grad_norm": 6.5126953125, "learning_rate": 2.824297473743968e-07, "loss": 2.1576152801513673, "step": 70290 }, { "epoch": 19.95458416122623, "grad_norm": 6.5545244216918945, "learning_rate": 2.6823729775759295e-07, "loss": 2.060870552062988, "step": 70300 }, { "epoch": 19.95742265114959, "grad_norm": 6.4074482917785645, "learning_rate": 2.540448481407891e-07, "loss": 2.0706377029418945, "step": 70310 }, { "epoch": 19.960261141072948, "grad_norm": 6.411105155944824, "learning_rate": 2.3985239852398526e-07, "loss": 2.0689136505126955, "step": 70320 }, { "epoch": 19.96309963099631, "grad_norm": 6.861893177032471, "learning_rate": 2.2565994890718139e-07, "loss": 2.1574092864990235, "step": 70330 }, { "epoch": 19.96593812091967, "grad_norm": 6.269639492034912, "learning_rate": 2.1146749929037754e-07, "loss": 2.0784053802490234, "step": 70340 }, { "epoch": 19.968776610843033, "grad_norm": 6.652785778045654, "learning_rate": 1.9727504967357364e-07, "loss": 2.077069854736328, "step": 70350 }, { "epoch": 19.971615100766392, "grad_norm": 6.455522060394287, "learning_rate": 1.8308260005676982e-07, "loss": 2.1296791076660155, "step": 70360 }, { "epoch": 19.974453590689752, "grad_norm": 6.158466339111328, 
"learning_rate": 1.6889015043996595e-07, "loss": 2.0750759124755858, "step": 70370 }, { "epoch": 19.977292080613115, "grad_norm": 6.586935520172119, "learning_rate": 1.546977008231621e-07, "loss": 2.0874744415283204, "step": 70380 }, { "epoch": 19.980130570536474, "grad_norm": 6.814191818237305, "learning_rate": 1.4050525120635823e-07, "loss": 2.0517749786376953, "step": 70390 }, { "epoch": 19.982969060459837, "grad_norm": 6.699864387512207, "learning_rate": 1.2631280158955438e-07, "loss": 2.073122024536133, "step": 70400 }, { "epoch": 19.985807550383196, "grad_norm": 6.475768089294434, "learning_rate": 1.1212035197275051e-07, "loss": 2.1750820159912108, "step": 70410 }, { "epoch": 19.988646040306556, "grad_norm": 6.801063537597656, "learning_rate": 9.792790235594665e-08, "loss": 2.0694402694702148, "step": 70420 }, { "epoch": 19.99148453022992, "grad_norm": 6.069818019866943, "learning_rate": 8.373545273914277e-08, "loss": 2.0290908813476562, "step": 70430 }, { "epoch": 19.994323020153278, "grad_norm": 6.337346076965332, "learning_rate": 6.954300312233891e-08, "loss": 2.0907007217407227, "step": 70440 }, { "epoch": 19.99716151007664, "grad_norm": 6.645975589752197, "learning_rate": 5.5350553505535055e-08, "loss": 2.0308506011962892, "step": 70450 }, { "epoch": 20.0, "grad_norm": 10.76362419128418, "learning_rate": 4.1158103888731195e-08, "loss": 2.120552825927734, "step": 70460 }, { "epoch": 20.0, "eval_accuracy": 0.36351497424810836, "eval_loss": 2.428387403488159, "eval_runtime": 50.1695, "eval_samples_per_second": 313.478, "eval_steps_per_second": 4.903, "step": 70460 } ], "logging_steps": 10, "max_steps": 70460, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.669251689678635e+20, "train_batch_size": 64, "trial_name": 
null, "trial_params": null }