{ "best_metric": 0.3424564003944397, "best_model_checkpoint": "./results/chembl.trans_only_T5Chem_dmodel512_layers12_batch120_lr0.0005_train0.80/checkpoint-25000", "epoch": 1.4214236979758925, "eval_steps": 5000, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002842847395951785, "grad_norm": 0.4646081328392029, "learning_rate": 0.0004999955580509438, "loss": 0.6585, "step": 50 }, { "epoch": 0.00568569479190357, "grad_norm": 0.19792014360427856, "learning_rate": 0.0004999911161018877, "loss": 0.53, "step": 100 }, { "epoch": 0.008528542187855356, "grad_norm": 0.16343972086906433, "learning_rate": 0.0004999866741528315, "loss": 0.5034, "step": 150 }, { "epoch": 0.01137138958380714, "grad_norm": 0.20607765018939972, "learning_rate": 0.0004999822322037754, "loss": 0.496, "step": 200 }, { "epoch": 0.014214236979758927, "grad_norm": 0.14076846837997437, "learning_rate": 0.0004999777902547192, "loss": 0.4841, "step": 250 }, { "epoch": 0.017057084375710713, "grad_norm": 0.12710894644260406, "learning_rate": 0.000499973348305663, "loss": 0.4816, "step": 300 }, { "epoch": 0.019899931771662497, "grad_norm": 0.1586833894252777, "learning_rate": 0.0004999689063566068, "loss": 0.477, "step": 350 }, { "epoch": 0.02274277916761428, "grad_norm": 0.18906599283218384, "learning_rate": 0.0004999644644075506, "loss": 0.4743, "step": 400 }, { "epoch": 0.02558562656356607, "grad_norm": 0.14747406542301178, "learning_rate": 0.0004999600224584944, "loss": 0.465, "step": 450 }, { "epoch": 0.028428473959517853, "grad_norm": 0.15983326733112335, "learning_rate": 0.0004999555805094383, "loss": 0.4647, "step": 500 }, { "epoch": 0.03127132135546964, "grad_norm": 0.14171165227890015, "learning_rate": 0.0004999511385603821, "loss": 0.4603, "step": 550 }, { "epoch": 0.034114168751421425, "grad_norm": 0.13967591524124146, "learning_rate": 0.0004999466966113259, "loss": 0.4574, "step": 600 }, { "epoch": 0.036957016147373206, "grad_norm": 0.2039215862751007, "learning_rate": 0.0004999422546622698, "loss": 0.4536, "step": 650 }, { "epoch": 0.039799863543324994, "grad_norm": 0.15800215303897858, "learning_rate": 0.0004999378127132136, "loss": 0.4544, "step": 700 }, { "epoch": 0.04264271093927678, "grad_norm": 0.16743983328342438, "learning_rate": 0.0004999333707641574, "loss": 0.4528, "step": 750 }, { "epoch": 0.04548555833522856, "grad_norm": 0.16410502791404724, "learning_rate": 0.0004999289288151012, "loss": 0.4498, "step": 800 }, { "epoch": 0.04832840573118035, "grad_norm": 0.17722439765930176, "learning_rate": 0.000499924486866045, "loss": 0.4461, "step": 850 }, { "epoch": 0.05117125312713214, "grad_norm": 0.13122214376926422, "learning_rate": 0.0004999200449169889, "loss": 0.4422, "step": 900 }, { "epoch": 0.05401410052308392, "grad_norm": 0.1328468769788742, "learning_rate": 0.0004999156029679327, "loss": 0.4438, "step": 950 }, { "epoch": 0.05685694791903571, "grad_norm": 0.1839675009250641, "learning_rate": 0.0004999111610188765, "loss": 0.4429, "step": 1000 }, { "epoch": 0.059699795314987494, "grad_norm": 0.14567476511001587, "learning_rate": 0.0004999067190698203, "loss": 0.4356, "step": 1050 }, { "epoch": 0.06254264271093928, "grad_norm": 0.1527237445116043, "learning_rate": 0.0004999022771207641, "loss": 0.438, "step": 1100 }, { "epoch": 0.06538549010689106, "grad_norm": 0.1287076622247696, "learning_rate": 0.000499897835171708, "loss": 0.4343, "step": 1150 }, { "epoch": 0.06822833750284285, "grad_norm": 0.11131107807159424, "learning_rate": 0.0004998933932226518, "loss": 0.4367, "step": 1200 }, { "epoch": 0.07107118489879463, "grad_norm": 0.13373662531375885, "learning_rate": 0.0004998889512735956, "loss": 0.4316, "step": 1250 }, { "epoch": 0.07391403229474641, "grad_norm": 0.13251946866512299, "learning_rate": 0.0004998845093245395, "loss": 0.4347, "step": 1300 }, { "epoch": 0.07675687969069821, "grad_norm": 0.14924846589565277, "learning_rate": 0.0004998800673754833, "loss": 0.4315, "step": 1350 }, { "epoch": 0.07959972708664999, "grad_norm": 0.12222905457019806, "learning_rate": 0.0004998756254264271, "loss": 0.4311, "step": 1400 }, { "epoch": 0.08244257448260177, "grad_norm": 0.10474367439746857, "learning_rate": 0.0004998711834773709, "loss": 0.4289, "step": 1450 }, { "epoch": 0.08528542187855356, "grad_norm": 0.11413396149873734, "learning_rate": 0.0004998667415283148, "loss": 0.4287, "step": 1500 }, { "epoch": 0.08812826927450534, "grad_norm": 0.1532815843820572, "learning_rate": 0.0004998622995792587, "loss": 0.4266, "step": 1550 }, { "epoch": 0.09097111667045713, "grad_norm": 0.16395249962806702, "learning_rate": 0.0004998578576302025, "loss": 0.4254, "step": 1600 }, { "epoch": 0.09381396406640892, "grad_norm": 0.13058966398239136, "learning_rate": 0.0004998534156811463, "loss": 0.4236, "step": 1650 }, { "epoch": 0.0966568114623607, "grad_norm": 0.15570798516273499, "learning_rate": 0.0004998489737320901, "loss": 0.4267, "step": 1700 }, { "epoch": 0.09949965885831248, "grad_norm": 0.1350376456975937, "learning_rate": 0.0004998445317830339, "loss": 0.421, "step": 1750 }, { "epoch": 0.10234250625426428, "grad_norm": 0.13523083925247192, "learning_rate": 0.0004998400898339777, "loss": 0.4212, "step": 1800 }, { "epoch": 0.10518535365021606, "grad_norm": 0.12544061243534088, "learning_rate": 0.0004998356478849215, "loss": 0.424, "step": 1850 }, { "epoch": 0.10802820104616784, "grad_norm": 0.12443090230226517, "learning_rate": 0.0004998312059358654, "loss": 0.4217, "step": 1900 }, { "epoch": 0.11087104844211963, "grad_norm": 0.11098171025514603, "learning_rate": 0.0004998267639868092, "loss": 0.4226, "step": 1950 }, { "epoch": 0.11371389583807141, "grad_norm": 0.129473477602005, "learning_rate": 0.000499822322037753, "loss": 0.4237, "step": 2000 }, { "epoch": 0.1165567432340232, "grad_norm": 0.1107649877667427, "learning_rate": 0.0004998178800886969, "loss": 0.4241, "step": 2050 }, { "epoch": 0.11939959062997499, "grad_norm": 0.12195456773042679, "learning_rate": 0.0004998134381396407, "loss": 0.4176, "step": 2100 }, { "epoch": 0.12224243802592677, "grad_norm": 0.12097357213497162, "learning_rate": 0.0004998089961905845, "loss": 0.4156, "step": 2150 }, { "epoch": 0.12508528542187855, "grad_norm": 0.11261444538831711, "learning_rate": 0.0004998045542415283, "loss": 0.4179, "step": 2200 }, { "epoch": 0.12792813281783033, "grad_norm": 0.13488833606243134, "learning_rate": 0.0004998001122924721, "loss": 0.4175, "step": 2250 }, { "epoch": 0.1307709802137821, "grad_norm": 0.11204372346401215, "learning_rate": 0.000499795670343416, "loss": 0.4169, "step": 2300 }, { "epoch": 0.13361382760973392, "grad_norm": 0.10848614573478699, "learning_rate": 0.0004997912283943598, "loss": 0.4156, "step": 2350 }, { "epoch": 0.1364566750056857, "grad_norm": 0.1451840102672577, "learning_rate": 0.0004997867864453036, "loss": 0.4154, "step": 2400 }, { "epoch": 0.13929952240163748, "grad_norm": 0.14736251533031464, "learning_rate": 0.0004997823444962474, "loss": 0.4155, "step": 2450 }, { "epoch": 0.14214236979758926, "grad_norm": 0.12735013663768768, "learning_rate": 0.0004997779025471912, "loss": 0.4089, "step": 2500 }, { "epoch": 0.14498521719354104, "grad_norm": 0.212030827999115, "learning_rate": 0.0004997734605981351, "loss": 0.415, "step": 2550 }, { "epoch": 0.14782806458949282, "grad_norm": 0.11738018691539764, "learning_rate": 0.0004997690186490789, "loss": 0.4102, "step": 2600 }, { "epoch": 0.15067091198544463, "grad_norm": 0.1060812696814537, "learning_rate": 0.0004997645767000228, "loss": 0.4136, "step": 2650 }, { "epoch": 0.15351375938139641, "grad_norm": 0.12909935414791107, "learning_rate": 0.0004997601347509666, "loss": 0.4102, "step": 2700 }, { "epoch": 0.1563566067773482, "grad_norm": 0.15052352845668793, "learning_rate": 0.0004997556928019104, "loss": 0.4092, "step": 2750 }, { "epoch": 0.15919945417329998, "grad_norm": 0.13438868522644043, "learning_rate": 0.0004997512508528542, "loss": 0.4102, "step": 2800 }, { "epoch": 0.16204230156925176, "grad_norm": 0.11692175269126892, "learning_rate": 0.000499746808903798, "loss": 0.4103, "step": 2850 }, { "epoch": 0.16488514896520354, "grad_norm": 0.11706849187612534, "learning_rate": 0.0004997423669547419, "loss": 0.4104, "step": 2900 }, { "epoch": 0.16772799636115535, "grad_norm": 0.14066801965236664, "learning_rate": 0.0004997379250056858, "loss": 0.4104, "step": 2950 }, { "epoch": 0.17057084375710713, "grad_norm": 0.14922069013118744, "learning_rate": 0.0004997334830566296, "loss": 0.4125, "step": 3000 }, { "epoch": 0.1734136911530589, "grad_norm": 0.14433512091636658, "learning_rate": 0.0004997290411075734, "loss": 0.404, "step": 3050 }, { "epoch": 0.1762565385490107, "grad_norm": 0.11257103830575943, "learning_rate": 0.0004997245991585172, "loss": 0.4089, "step": 3100 }, { "epoch": 0.17909938594496247, "grad_norm": 0.13209494948387146, "learning_rate": 0.000499720157209461, "loss": 0.4081, "step": 3150 }, { "epoch": 0.18194223334091425, "grad_norm": 0.13629132509231567, "learning_rate": 0.0004997157152604048, "loss": 0.4062, "step": 3200 }, { "epoch": 0.18478508073686606, "grad_norm": 0.11546222865581512, "learning_rate": 0.0004997112733113486, "loss": 0.4037, "step": 3250 }, { "epoch": 0.18762792813281784, "grad_norm": 0.12525054812431335, "learning_rate": 0.0004997068313622925, "loss": 0.4064, "step": 3300 }, { "epoch": 0.19047077552876962, "grad_norm": 0.14944538474082947, "learning_rate": 0.0004997023894132363, "loss": 0.4085, "step": 3350 }, { "epoch": 0.1933136229247214, "grad_norm": 0.15211829543113708, "learning_rate": 0.0004996979474641801, "loss": 0.4063, "step": 3400 }, { "epoch": 0.19615647032067318, "grad_norm": 0.12800821661949158, "learning_rate": 0.000499693505515124, "loss": 0.4033, "step": 3450 }, { "epoch": 0.19899931771662496, "grad_norm": 0.10901828110218048, "learning_rate": 0.0004996890635660678, "loss": 0.4041, "step": 3500 }, { "epoch": 0.20184216511257674, "grad_norm": 0.13710814714431763, "learning_rate": 0.0004996846216170116, "loss": 0.4059, "step": 3550 }, { "epoch": 0.20468501250852855, "grad_norm": 0.11688549071550369, "learning_rate": 0.0004996801796679554, "loss": 0.4019, "step": 3600 }, { "epoch": 0.20752785990448033, "grad_norm": 0.10226283967494965, "learning_rate": 0.0004996757377188992, "loss": 0.4023, "step": 3650 }, { "epoch": 0.2103707073004321, "grad_norm": 0.1010371595621109, "learning_rate": 0.0004996712957698431, "loss": 0.4031, "step": 3700 }, { "epoch": 0.2132135546963839, "grad_norm": 0.12441287934780121, "learning_rate": 0.0004996668538207869, "loss": 0.4031, "step": 3750 }, { "epoch": 0.21605640209233568, "grad_norm": 0.12629808485507965, "learning_rate": 0.0004996624118717307, "loss": 0.4046, "step": 3800 }, { "epoch": 0.21889924948828746, "grad_norm": 0.1441943198442459, "learning_rate": 0.0004996579699226745, "loss": 0.4065, "step": 3850 }, { "epoch": 0.22174209688423926, "grad_norm": 0.08717290312051773, "learning_rate": 0.0004996535279736184, "loss": 0.4043, "step": 3900 }, { "epoch": 0.22458494428019105, "grad_norm": 0.12982551753520966, "learning_rate": 0.0004996490860245623, "loss": 0.3997, "step": 3950 }, { "epoch": 0.22742779167614283, "grad_norm": 0.11208934336900711, "learning_rate": 0.0004996446440755061, "loss": 0.3966, "step": 4000 }, { "epoch": 0.2302706390720946, "grad_norm": 0.13264520466327667, "learning_rate": 0.0004996402021264499, "loss": 0.401, "step": 4050 }, { "epoch": 0.2331134864680464, "grad_norm": 0.1360943764448166, "learning_rate": 0.0004996357601773937, "loss": 0.3999, "step": 4100 }, { "epoch": 0.23595633386399817, "grad_norm": 0.1145951896905899, "learning_rate": 0.0004996313182283375, "loss": 0.3995, "step": 4150 }, { "epoch": 0.23879918125994998, "grad_norm": 0.12475068867206573, "learning_rate": 0.0004996268762792813, "loss": 0.4017, "step": 4200 }, { "epoch": 0.24164202865590176, "grad_norm": 0.09585878252983093, "learning_rate": 0.0004996224343302251, "loss": 0.4007, "step": 4250 }, { "epoch": 0.24448487605185354, "grad_norm": 0.11792745441198349, "learning_rate": 0.000499617992381169, "loss": 0.3991, "step": 4300 }, { "epoch": 0.24732772344780532, "grad_norm": 0.16740038990974426, "learning_rate": 0.0004996135504321129, "loss": 0.3984, "step": 4350 }, { "epoch": 0.2501705708437571, "grad_norm": 0.09640038013458252, "learning_rate": 0.0004996091084830567, "loss": 0.3976, "step": 4400 }, { "epoch": 0.2530134182397089, "grad_norm": 0.09332569688558578, "learning_rate": 0.0004996046665340005, "loss": 0.3998, "step": 4450 }, { "epoch": 0.25585626563566066, "grad_norm": 0.11328030377626419, "learning_rate": 0.0004996002245849443, "loss": 0.3979, "step": 4500 }, { "epoch": 0.25869911303161247, "grad_norm": 0.1477869302034378, "learning_rate": 0.0004995957826358881, "loss": 0.3973, "step": 4550 }, { "epoch": 0.2615419604275642, "grad_norm": 0.09623896330595016, "learning_rate": 0.0004995913406868319, "loss": 0.3962, "step": 4600 }, { "epoch": 0.26438480782351603, "grad_norm": 0.11626730114221573, "learning_rate": 0.0004995868987377757, "loss": 0.3958, "step": 4650 }, { "epoch": 0.26722765521946784, "grad_norm": 0.10988286882638931, "learning_rate": 0.0004995824567887196, "loss": 0.3944, "step": 4700 }, { "epoch": 0.2700705026154196, "grad_norm": 0.12107770144939423, "learning_rate": 0.0004995780148396634, "loss": 0.3931, "step": 4750 }, { "epoch": 0.2729133500113714, "grad_norm": 0.11602004617452621, "learning_rate": 0.0004995735728906072, "loss": 0.3963, "step": 4800 }, { "epoch": 0.27575619740732316, "grad_norm": 0.10920259356498718, "learning_rate": 0.0004995691309415511, "loss": 0.395, "step": 4850 }, { "epoch": 0.27859904480327496, "grad_norm": 0.1066461130976677, "learning_rate": 0.0004995646889924949, "loss": 0.3938, "step": 4900 }, { "epoch": 0.2814418921992268, "grad_norm": 0.1071213036775589, "learning_rate": 0.0004995602470434387, "loss": 0.3943, "step": 4950 }, { "epoch": 0.2842847395951785, "grad_norm": 0.08286375552415848, "learning_rate": 0.0004995558050943825, "loss": 0.3948, "step": 5000 }, { "epoch": 0.2842847395951785, "eval_loss": 0.38135138154029846, "eval_runtime": 583.2891, "eval_samples_per_second": 226.149, "eval_steps_per_second": 28.269, "step": 5000 }, { "epoch": 0.28712758699113033, "grad_norm": 0.11819513142108917, "learning_rate": 0.0004995513631453263, "loss": 0.3969, "step": 5050 }, { "epoch": 0.2899704343870821, "grad_norm": 0.10034069418907166, "learning_rate": 0.0004995469211962702, "loss": 0.3962, "step": 5100 }, { "epoch": 0.2928132817830339, "grad_norm": 0.09750229120254517, "learning_rate": 0.000499542479247214, "loss": 0.3938, "step": 5150 }, { "epoch": 0.29565612917898565, "grad_norm": 0.14667393267154694, "learning_rate": 0.0004995380372981578, "loss": 0.395, "step": 5200 }, { "epoch": 0.29849897657493746, "grad_norm": 0.1200270876288414, "learning_rate": 0.0004995335953491016, "loss": 0.3939, "step": 5250 }, { "epoch": 0.30134182397088927, "grad_norm": 0.12305665761232376, "learning_rate": 0.0004995291534000456, "loss": 0.3885, "step": 5300 }, { "epoch": 0.304184671366841, "grad_norm": 0.09782176464796066, "learning_rate": 0.0004995247114509894, "loss": 0.3907, "step": 5350 }, { "epoch": 0.30702751876279283, "grad_norm": 0.10045728832483292, "learning_rate": 0.0004995202695019332, "loss": 0.3921, "step": 5400 }, { "epoch": 0.3098703661587446, "grad_norm": 0.15331445634365082, "learning_rate": 0.000499515827552877, "loss": 0.3953, "step": 5450 }, { "epoch": 0.3127132135546964, "grad_norm": 0.10946525633335114, "learning_rate": 0.0004995113856038208, "loss": 0.3917, "step": 5500 }, { "epoch": 0.31555606095064814, "grad_norm": 0.12001294642686844, "learning_rate": 0.0004995069436547646, "loss": 0.3915, "step": 5550 }, { "epoch": 0.31839890834659995, "grad_norm": 0.11100505292415619, "learning_rate": 0.0004995025017057084, "loss": 0.3916, "step": 5600 }, { "epoch": 0.32124175574255176, "grad_norm": 0.11148565262556076, "learning_rate": 0.0004994980597566523, "loss": 0.3947, "step": 5650 }, { "epoch": 0.3240846031385035, "grad_norm": 0.10728944838047028, "learning_rate": 0.0004994936178075961, "loss": 0.3898, "step": 5700 }, { "epoch": 0.3269274505344553, "grad_norm": 0.09397928416728973, "learning_rate": 0.00049948917585854, "loss": 0.3877, "step": 5750 }, { "epoch": 0.3297702979304071, "grad_norm": 0.12607477605342865, "learning_rate": 0.0004994847339094838, "loss": 0.3919, "step": 5800 }, { "epoch": 0.3326131453263589, "grad_norm": 0.11440327763557434, "learning_rate": 0.0004994802919604276, "loss": 0.3928, "step": 5850 }, { "epoch": 0.3354559927223107, "grad_norm": 0.1275177150964737, "learning_rate": 0.0004994758500113714, "loss": 0.389, "step": 5900 }, { "epoch": 0.33829884011826244, "grad_norm": 0.11246796697378159, "learning_rate": 0.0004994714080623152, "loss": 0.3862, "step": 5950 }, { "epoch": 0.34114168751421425, "grad_norm": 0.11354568600654602, "learning_rate": 0.000499466966113259, "loss": 0.3908, "step": 6000 }, { "epoch": 0.343984534910166, "grad_norm": 0.10693259537220001, "learning_rate": 0.0004994625241642029, "loss": 0.389, "step": 6050 }, { "epoch": 0.3468273823061178, "grad_norm": 0.12362982332706451, "learning_rate": 0.0004994580822151467, "loss": 0.39, "step": 6100 }, { "epoch": 0.34967022970206957, "grad_norm": 0.1343277543783188, "learning_rate": 0.0004994536402660905, "loss": 0.3934, "step": 6150 }, { "epoch": 0.3525130770980214, "grad_norm": 0.15133637189865112, "learning_rate": 0.0004994491983170344, "loss": 0.382, "step": 6200 }, { "epoch": 0.3553559244939732, "grad_norm": 0.11031973361968994, "learning_rate": 0.0004994447563679782, "loss": 0.3908, "step": 6250 }, { "epoch": 0.35819877188992494, "grad_norm": 0.09201149642467499, "learning_rate": 0.000499440314418922, "loss": 0.3888, "step": 6300 }, { "epoch": 0.36104161928587675, "grad_norm": 0.10396040976047516, "learning_rate": 0.0004994358724698658, "loss": 0.3861, "step": 6350 }, { "epoch": 0.3638844666818285, "grad_norm": 0.11148199439048767, "learning_rate": 0.0004994314305208096, "loss": 0.3897, "step": 6400 }, { "epoch": 0.3667273140777803, "grad_norm": 0.11479378491640091, "learning_rate": 0.0004994269885717535, "loss": 0.3883, "step": 6450 }, { "epoch": 0.3695701614737321, "grad_norm": 0.09433585405349731, "learning_rate": 0.0004994225466226973, "loss": 0.3841, "step": 6500 }, { "epoch": 0.37241300886968387, "grad_norm": 0.1304759830236435, "learning_rate": 0.0004994181046736411, "loss": 0.3876, "step": 6550 }, { "epoch": 0.3752558562656357, "grad_norm": 0.11292573064565659, "learning_rate": 0.0004994136627245849, "loss": 0.3872, "step": 6600 }, { "epoch": 0.37809870366158743, "grad_norm": 0.0928904116153717, "learning_rate": 0.0004994092207755289, "loss": 0.3875, "step": 6650 }, { "epoch": 0.38094155105753924, "grad_norm": 0.10247938334941864, "learning_rate": 0.0004994047788264727, "loss": 0.3885, "step": 6700 }, { "epoch": 0.383784398453491, "grad_norm": 0.11605235934257507, "learning_rate": 0.0004994003368774165, "loss": 0.3847, "step": 6750 }, { "epoch": 0.3866272458494428, "grad_norm": 0.1496395617723465, "learning_rate": 0.0004993958949283603, "loss": 0.3861, "step": 6800 }, { "epoch": 0.3894700932453946, "grad_norm": 0.11461085826158524, "learning_rate": 0.0004993914529793041, "loss": 0.3862, "step": 6850 }, { "epoch": 0.39231294064134636, "grad_norm": 0.11239515244960785, "learning_rate": 0.0004993870110302479, "loss": 0.3865, "step": 6900 }, { "epoch": 0.3951557880372982, "grad_norm": 0.11601614952087402, "learning_rate": 0.0004993825690811917, "loss": 0.3837, "step": 6950 }, { "epoch": 0.3979986354332499, "grad_norm": 0.13389568030834198, "learning_rate": 0.0004993781271321355, "loss": 0.3866, "step": 7000 }, { "epoch": 0.40084148282920173, "grad_norm": 0.12181925028562546, "learning_rate": 0.0004993736851830794, "loss": 0.388, "step": 7050 }, { "epoch": 0.4036843302251535, "grad_norm": 0.10346251726150513, "learning_rate": 0.0004993692432340233, "loss": 0.3879, "step": 7100 }, { "epoch": 0.4065271776211053, "grad_norm": 0.11786284297704697, "learning_rate": 0.0004993648012849671, "loss": 0.3859, "step": 7150 }, { "epoch": 0.4093700250170571, "grad_norm": 0.10377379506826401, "learning_rate": 0.0004993603593359109, "loss": 0.3838, "step": 7200 }, { "epoch": 0.41221287241300886, "grad_norm": 0.12437117844820023, "learning_rate": 0.0004993559173868547, "loss": 0.385, "step": 7250 }, { "epoch": 0.41505571980896067, "grad_norm": 0.09871742129325867, "learning_rate": 0.0004993514754377985, "loss": 0.387, "step": 7300 }, { "epoch": 0.4178985672049124, "grad_norm": 0.09648123383522034, "learning_rate": 0.0004993470334887423, "loss": 0.3843, "step": 7350 }, { "epoch": 0.4207414146008642, "grad_norm": 0.14795717597007751, "learning_rate": 0.0004993425915396861, "loss": 0.3833, "step": 7400 }, { "epoch": 0.42358426199681604, "grad_norm": 0.10902711004018784, "learning_rate": 0.00049933814959063, "loss": 0.3823, "step": 7450 }, { "epoch": 0.4264271093927678, "grad_norm": 0.09183204919099808, "learning_rate": 0.0004993337076415738, "loss": 0.3871, "step": 7500 }, { "epoch": 0.4292699567887196, "grad_norm": 0.11348464339971542, "learning_rate": 0.0004993292656925176, "loss": 0.3828, "step": 7550 }, { "epoch": 0.43211280418467135, "grad_norm": 0.08863567560911179, "learning_rate": 0.0004993248237434615, "loss": 0.3817, "step": 7600 }, { "epoch": 0.43495565158062316, "grad_norm": 0.1327882558107376, "learning_rate": 0.0004993203817944053, "loss": 0.3838, "step": 7650 }, { "epoch": 0.4377984989765749, "grad_norm": 0.11749105900526047, "learning_rate": 0.0004993159398453491, "loss": 0.3865, "step": 7700 }, { "epoch": 0.4406413463725267, "grad_norm": 0.09009282290935516, "learning_rate": 0.0004993114978962929, "loss": 0.3823, "step": 7750 }, { "epoch": 0.44348419376847853, "grad_norm": 0.106484554708004, "learning_rate": 0.0004993070559472367, "loss": 0.3797, "step": 7800 }, { "epoch": 0.4463270411644303, "grad_norm": 0.12862497568130493, "learning_rate": 0.0004993026139981806, "loss": 0.3804, "step": 7850 }, { "epoch": 0.4491698885603821, "grad_norm": 0.1015087440609932, "learning_rate": 0.0004992981720491244, "loss": 0.3806, "step": 7900 }, { "epoch": 0.45201273595633384, "grad_norm": 0.10782980173826218, "learning_rate": 0.0004992937301000682, "loss": 0.3858, "step": 7950 }, { "epoch": 0.45485558335228565, "grad_norm": 0.09852251410484314, "learning_rate": 0.000499289288151012, "loss": 0.3824, "step": 8000 }, { "epoch": 0.45769843074823746, "grad_norm": 0.10274305194616318, "learning_rate": 0.000499284846201956, "loss": 0.381, "step": 8050 }, { "epoch": 0.4605412781441892, "grad_norm": 0.10695527493953705, "learning_rate": 0.0004992804042528998, "loss": 0.3839, "step": 8100 }, { "epoch": 0.463384125540141, "grad_norm": 0.09151016920804977, "learning_rate": 0.0004992759623038436, "loss": 0.3842, "step": 8150 }, { "epoch": 0.4662269729360928, "grad_norm": 0.15730440616607666, "learning_rate": 0.0004992715203547874, "loss": 0.3798, "step": 8200 }, { "epoch": 0.4690698203320446, "grad_norm": 0.10243304073810577, "learning_rate": 0.0004992670784057312, "loss": 0.38, "step": 8250 }, { "epoch": 0.47191266772799634, "grad_norm": 0.1174805536866188, "learning_rate": 0.000499262636456675, "loss": 0.3817, "step": 8300 }, { "epoch": 0.47475551512394815, "grad_norm": 0.10942196846008301, "learning_rate": 0.0004992581945076188, "loss": 0.3829, "step": 8350 }, { "epoch": 0.47759836251989995, "grad_norm": 0.10109516978263855, "learning_rate": 0.0004992537525585626, "loss": 0.3803, "step": 8400 }, { "epoch": 0.4804412099158517, "grad_norm": 0.09248687326908112, "learning_rate": 0.0004992493106095065, "loss": 0.3796, "step": 8450 }, { "epoch": 0.4832840573118035, "grad_norm": 0.11525557935237885, "learning_rate": 0.0004992448686604504, "loss": 0.3823, "step": 8500 }, { "epoch": 0.48612690470775527, "grad_norm": 0.10481023788452148, "learning_rate": 0.0004992404267113942, "loss": 0.3781, "step": 8550 }, { "epoch": 0.4889697521037071, "grad_norm": 0.09493274986743927, "learning_rate": 0.000499235984762338, "loss": 0.38, "step": 8600 }, { "epoch": 0.49181259949965883, "grad_norm": 0.08859369158744812, "learning_rate": 0.0004992315428132818, "loss": 0.3789, "step": 8650 }, { "epoch": 0.49465544689561064, "grad_norm": 0.11814738810062408, "learning_rate": 0.0004992271008642256, "loss": 0.3809, "step": 8700 }, { "epoch": 0.49749829429156245, "grad_norm": 0.1322488635778427, "learning_rate": 0.0004992226589151694, "loss": 0.3795, "step": 8750 }, { "epoch": 0.5003411416875142, "grad_norm": 0.1304454803466797, "learning_rate": 0.0004992182169661132, "loss": 0.3794, "step": 8800 }, { "epoch": 0.503183989083466, "grad_norm": 0.09153052419424057, "learning_rate": 0.0004992137750170571, "loss": 0.3769, "step": 8850 }, { "epoch": 0.5060268364794178, "grad_norm": 0.10608222335577011, "learning_rate": 0.0004992093330680009, "loss": 0.3748, "step": 8900 }, { "epoch": 0.5088696838753696, "grad_norm": 0.1054573506116867, "learning_rate": 0.0004992048911189447, "loss": 0.3817, "step": 8950 }, { "epoch": 0.5117125312713213, "grad_norm": 0.1694241166114807, "learning_rate": 0.0004992004491698886, "loss": 0.3774, "step": 9000 }, { "epoch": 0.5145553786672732, "grad_norm": 0.13877439498901367, "learning_rate": 0.0004991960072208324, "loss": 0.3769, "step": 9050 }, { "epoch": 0.5173982260632249, "grad_norm": 0.10788954794406891, "learning_rate": 0.0004991915652717762, "loss": 0.3819, "step": 9100 }, { "epoch": 0.5202410734591767, "grad_norm": 0.12326483428478241, "learning_rate": 0.00049918712332272, "loss": 0.3812, "step": 9150 }, { "epoch": 0.5230839208551284, "grad_norm": 0.07986534386873245, "learning_rate": 0.0004991826813736638, "loss": 0.3797, "step": 9200 }, { "epoch": 0.5259267682510803, "grad_norm": 0.11102836579084396, "learning_rate": 0.0004991782394246077, "loss": 0.3797, "step": 9250 }, { "epoch": 0.5287696156470321, "grad_norm": 0.10697384923696518, "learning_rate": 0.0004991737974755515, "loss": 0.3767, "step": 9300 }, { "epoch": 0.5316124630429838, "grad_norm": 0.11679094284772873, "learning_rate": 0.0004991693555264953, "loss": 0.3782, "step": 9350 }, { "epoch": 0.5344553104389357, "grad_norm": 0.10827042162418365, "learning_rate": 0.0004991649135774391, "loss": 0.376, "step": 9400 }, { "epoch": 0.5372981578348874, "grad_norm": 0.0934649109840393, "learning_rate": 0.0004991604716283831, "loss": 0.3775, "step": 9450 }, { "epoch": 0.5401410052308392, "grad_norm": 0.08999691903591156, "learning_rate": 0.0004991560296793269, "loss": 0.3783, "step": 9500 }, { "epoch": 0.5429838526267909, "grad_norm": 0.11615636944770813, "learning_rate": 0.0004991515877302707, "loss": 0.3779, "step": 9550 }, { "epoch": 0.5458267000227428, "grad_norm": 0.4005848467350006, "learning_rate": 0.0004991471457812145, "loss": 0.3745, "step": 9600 }, { "epoch": 0.5486695474186946, "grad_norm": 0.12695498764514923, "learning_rate": 0.0004991427038321583, "loss": 0.3916, "step": 9650 }, { "epoch": 0.5515123948146463, "grad_norm": 0.10009093582630157, "learning_rate": 0.0004991382618831021, "loss": 0.3798, "step": 9700 }, { "epoch": 0.5543552422105982, "grad_norm": 0.1480248123407364, "learning_rate": 0.0004991338199340459, "loss": 0.3752, "step": 9750 }, { "epoch": 0.5571980896065499, "grad_norm": 0.10428158193826675, "learning_rate": 0.0004991293779849897, "loss": 0.3787, "step": 9800 }, { "epoch": 0.5600409370025017, "grad_norm": 0.1932564377784729, "learning_rate": 0.0004991249360359336, "loss": 0.3812, "step": 9850 }, { "epoch": 0.5628837843984535, "grad_norm": 0.088712178170681, "learning_rate": 0.0004991204940868775, "loss": 0.3783, "step": 9900 }, { "epoch": 0.5657266317944053, "grad_norm": 0.10896250605583191, "learning_rate": 0.0004991160521378213, "loss": 0.3782, "step": 9950 }, { "epoch": 0.568569479190357, "grad_norm": 0.1077795922756195, "learning_rate": 0.0004991116101887651, "loss": 0.3773, "step": 10000 }, { "epoch": 0.568569479190357, "eval_loss": 0.36588892340660095, "eval_runtime": 584.439, "eval_samples_per_second": 225.704, "eval_steps_per_second": 28.213, "step": 10000 }, { "epoch": 0.5714123265863088, "grad_norm": 0.09190759807825089, "learning_rate": 0.0004991071682397089, "loss": 0.3785, "step": 10050 }, { "epoch": 0.5742551739822607, "grad_norm": 0.11082971841096878, "learning_rate": 0.0004991027262906527, "loss": 0.3775, "step": 10100 }, { "epoch": 0.5770980213782124, "grad_norm": 0.10099225491285324, "learning_rate": 0.0004990982843415965, "loss": 0.3775, "step": 10150 }, { "epoch": 0.5799408687741642, "grad_norm": 0.1112075224518776, "learning_rate": 0.0004990938423925403, "loss": 0.3764, "step": 10200 }, { "epoch": 0.582783716170116, "grad_norm": 0.09865789860486984, "learning_rate": 0.0004990894004434842, "loss": 0.3802, "step": 10250 }, { "epoch": 0.5856265635660678, "grad_norm": 0.10351789742708206, "learning_rate": 0.000499084958494428, "loss": 0.3795, "step": 10300 }, { "epoch": 0.5884694109620195, "grad_norm": 0.10376661270856857, "learning_rate": 0.0004990805165453719, "loss": 0.3753, "step": 10350 }, { "epoch": 0.5913122583579713, "grad_norm": 0.09663544595241547, "learning_rate": 0.0004990760745963157, "loss": 0.3737, "step": 10400 }, { "epoch": 0.5941551057539232, "grad_norm": 0.09701116383075714, "learning_rate": 0.0004990716326472595, "loss": 0.3737, "step": 10450 }, { "epoch": 0.5969979531498749, "grad_norm": 0.1021418496966362, "learning_rate": 0.0004990671906982033, "loss": 0.3728, "step": 10500 }, { "epoch": 0.5998408005458267, "grad_norm": 0.12170026451349258, "learning_rate": 0.0004990627487491472, "loss": 0.3755, "step": 10550 }, { "epoch": 0.6026836479417785, "grad_norm": 0.1142646074295044, "learning_rate": 0.000499058306800091, "loss": 0.378, "step": 10600 }, { "epoch": 0.6055264953377303, "grad_norm": 0.09568295627832413, "learning_rate": 0.0004990538648510348, "loss": 0.3742, "step": 10650 }, { "epoch": 0.608369342733682, "grad_norm": 0.11586258560419083, "learning_rate": 0.0004990494229019786, "loss": 0.3731, "step": 10700 }, { "epoch": 0.6112121901296338, "grad_norm": 0.0984320268034935, "learning_rate": 0.0004990449809529224, "loss": 0.3767, "step": 10750 }, { "epoch": 0.6140550375255857, "grad_norm": 0.10665503889322281, "learning_rate": 0.0004990405390038662, "loss": 0.3756, "step": 10800 }, { "epoch": 0.6168978849215374, "grad_norm": 0.10374121367931366, "learning_rate": 0.0004990360970548102, "loss": 0.376, "step": 10850 }, { "epoch": 0.6197407323174892, "grad_norm": 0.10508013516664505, "learning_rate": 0.000499031655105754, "loss": 0.3717, "step": 10900 }, { "epoch": 0.622583579713441, "grad_norm": 0.10919308662414551, "learning_rate": 0.0004990272131566978, "loss": 0.3746, "step": 10950 }, { "epoch": 0.6254264271093928, "grad_norm": 0.09990997612476349, "learning_rate": 0.0004990227712076416, "loss": 0.3738, "step": 11000 }, { "epoch": 0.6282692745053445, "grad_norm": 0.10817346721887589, "learning_rate": 0.0004990183292585854, "loss": 0.3688, "step": 11050 }, { "epoch": 0.6311121219012963, "grad_norm": 0.0922752320766449, "learning_rate": 0.0004990138873095292, "loss": 0.3751, "step": 11100 }, { "epoch": 0.6339549692972481, "grad_norm": 0.09761642664670944, "learning_rate": 0.000499009445360473, "loss": 0.3724, "step": 11150 }, { "epoch": 0.6367978166931999, "grad_norm": 0.09401151537895203, "learning_rate": 0.0004990050034114168, "loss": 0.3717, "step": 11200 }, { "epoch": 0.6396406640891517, "grad_norm": 0.10084612667560577, "learning_rate": 0.0004990005614623607, "loss": 0.3711, "step": 11250 }, { "epoch": 0.6424835114851035, "grad_norm": 0.11709214001893997, "learning_rate": 0.0004989961195133046, "loss": 0.3739, "step": 11300 }, { "epoch": 0.6453263588810553, "grad_norm": 0.1035287156701088, "learning_rate": 0.0004989916775642484, "loss": 0.3684, "step": 11350 }, { "epoch": 0.648169206277007, "grad_norm": 0.09968025982379913, "learning_rate": 0.0004989872356151922, "loss": 0.3749, "step": 11400 }, { "epoch": 0.6510120536729589, "grad_norm": 0.1003747433423996, "learning_rate": 0.000498982793666136, "loss": 0.3721, "step": 11450 }, { "epoch": 0.6538549010689106, "grad_norm": 0.09462849795818329, "learning_rate": 0.0004989783517170798, "loss": 0.3687, "step": 11500 }, { "epoch": 0.6566977484648624, "grad_norm": 0.10089970380067825, "learning_rate": 0.0004989739097680236, "loss": 0.3724, "step": 11550 }, { "epoch": 0.6595405958608141, "grad_norm": 0.11282893270254135, "learning_rate": 0.0004989694678189675, "loss": 0.3726, "step": 11600 }, { "epoch": 0.662383443256766, "grad_norm": 0.12321527302265167, "learning_rate": 0.0004989650258699113, "loss": 0.3704, "step": 11650 }, { "epoch": 0.6652262906527178, "grad_norm": 0.10225247591733932, "learning_rate": 0.0004989605839208551, "loss": 0.3717, "step": 11700 }, { "epoch": 0.6680691380486695, "grad_norm": 0.09634348750114441, "learning_rate": 0.000498956141971799, "loss": 0.3731, "step": 11750 }, { "epoch": 0.6709119854446214, "grad_norm": 0.09670031815767288, "learning_rate": 0.0004989517000227428, "loss": 0.3697, "step": 11800 }, { "epoch": 0.6737548328405731, "grad_norm": 0.11874838918447495, "learning_rate": 0.0004989472580736867, "loss": 0.3717, "step": 11850 }, { "epoch": 0.6765976802365249, "grad_norm": 0.1474309116601944, "learning_rate": 0.0004989428161246305, "loss": 0.3719, "step": 11900 }, { "epoch": 0.6794405276324766, "grad_norm": 0.0953158363699913, "learning_rate": 0.0004989383741755743, "loss": 0.3698, "step": 11950 }, { "epoch": 0.6822833750284285, "grad_norm": 0.07613354921340942, "learning_rate": 0.0004989339322265181, "loss": 0.372, "step": 12000 }, { "epoch": 0.6851262224243803, "grad_norm": 0.10099935531616211, "learning_rate": 0.0004989294902774619, "loss": 0.3715, "step": 12050 }, { "epoch": 0.687969069820332, "grad_norm": 0.1142035722732544, "learning_rate": 0.0004989250483284057, "loss": 0.371, "step": 12100 }, { "epoch": 0.6908119172162839, "grad_norm": 0.11868602782487869, "learning_rate": 0.0004989206063793495, "loss": 0.3718, "step": 12150 }, { "epoch": 0.6936547646122356, "grad_norm": 0.11403531581163406, "learning_rate": 0.0004989161644302935, "loss": 0.37, "step": 12200 }, { "epoch": 0.6964976120081874, "grad_norm": 0.10048527270555496, "learning_rate": 0.0004989117224812373, "loss": 0.3689, "step": 12250 }, { "epoch": 0.6993404594041391, "grad_norm": 0.0894421935081482, "learning_rate": 0.0004989072805321811, "loss": 0.3675, "step": 12300 }, { "epoch": 0.702183306800091, "grad_norm": 0.10465215891599655, "learning_rate": 0.0004989028385831249, "loss": 0.3707, "step": 12350 }, { "epoch": 0.7050261541960428, "grad_norm": 0.10983111709356308, "learning_rate": 0.0004988983966340687, "loss": 0.3713, "step": 12400 }, { "epoch": 0.7078690015919945, "grad_norm": 0.11033093929290771, "learning_rate": 0.0004988939546850125, "loss": 0.3723, "step": 12450 }, { "epoch": 0.7107118489879464, "grad_norm": 0.11053282022476196, "learning_rate": 0.0004988895127359563, "loss": 0.3701, "step": 12500 }, { "epoch": 0.7135546963838981, "grad_norm": 0.09599506109952927, "learning_rate": 0.0004988850707869001, "loss": 0.3696, "step": 12550 }, { "epoch": 0.7163975437798499, "grad_norm": 0.10971349477767944, "learning_rate": 0.000498880628837844, "loss": 0.3692, "step": 12600 }, { "epoch": 0.7192403911758016, "grad_norm": 0.09890740364789963, "learning_rate": 0.0004988761868887879, "loss": 0.3671, "step": 12650 }, { "epoch": 0.7220832385717535, "grad_norm": 0.09667664021253586, "learning_rate": 0.0004988717449397317, "loss": 0.3708, "step": 12700 }, { "epoch": 0.7249260859677052, "grad_norm": 0.08852635324001312, "learning_rate": 0.0004988673029906755, "loss": 0.3704, "step": 12750 }, { "epoch": 0.727768933363657, "grad_norm": 0.08930686861276627, "learning_rate": 0.0004988628610416193, "loss": 0.3676, "step": 12800 }, { "epoch": 0.7306117807596089, "grad_norm": 0.10307100415229797, "learning_rate": 0.0004988584190925631, "loss": 0.3702, "step": 12850 }, { "epoch": 0.7334546281555606, "grad_norm": 0.09995568543672562, "learning_rate": 0.0004988539771435069, "loss": 0.367, "step": 12900 }, { "epoch": 0.7362974755515124, "grad_norm": 0.09159702807664871, "learning_rate": 0.0004988495351944507, "loss": 0.3661, "step": 12950 }, { "epoch": 0.7391403229474642, "grad_norm": 0.0963875874876976, "learning_rate": 0.0004988450932453946, "loss": 0.3667, "step": 13000 }, { "epoch": 0.741983170343416, "grad_norm": 0.09584073722362518, "learning_rate": 0.0004988406512963384, "loss": 0.3697, "step": 13050 }, { "epoch": 0.7448260177393677, "grad_norm": 0.09499222785234451, "learning_rate": 0.0004988362093472822, "loss": 0.3692, "step": 13100 }, { "epoch": 0.7476688651353195, "grad_norm": 0.10076665878295898, "learning_rate": 0.0004988317673982261, "loss": 0.3672, "step": 13150 }, { "epoch": 0.7505117125312714, "grad_norm": 0.1046692505478859, "learning_rate": 0.00049882732544917, "loss": 0.3669, "step": 13200 }, { "epoch": 0.7533545599272231, "grad_norm": 0.11345544457435608, "learning_rate": 0.0004988228835001138, "loss": 0.3676, "step": 13250 }, { "epoch": 0.7561974073231749, "grad_norm": 0.09689969569444656, "learning_rate": 0.0004988184415510576, "loss": 0.367, "step": 13300 }, { "epoch": 0.7590402547191267, "grad_norm": 0.10489045083522797, "learning_rate": 0.0004988139996020014, "loss": 0.3681, "step": 13350 }, { "epoch": 0.7618831021150785, "grad_norm": 0.09123999625444412, "learning_rate": 0.0004988095576529452, "loss": 0.3687, "step": 13400 }, { "epoch": 0.7647259495110302, "grad_norm": 0.09405972063541412, "learning_rate": 0.000498805115703889, "loss": 0.3715, "step": 13450 }, { "epoch": 0.767568796906982, "grad_norm": 0.08794659376144409, "learning_rate": 0.0004988006737548328, "loss": 0.3653, "step": 13500 }, { "epoch": 0.7704116443029339, "grad_norm": 0.11547774076461792, "learning_rate": 0.0004987962318057766, "loss": 0.3677, "step": 13550 }, { "epoch": 0.7732544916988856, "grad_norm": 0.0955629050731659, "learning_rate": 0.0004987917898567206, "loss": 0.369, "step": 13600 }, { "epoch": 0.7760973390948374, "grad_norm": 0.10582061856985092, "learning_rate": 0.0004987873479076644, "loss": 0.3696, "step": 13650 }, { "epoch": 0.7789401864907892, "grad_norm": 0.11626145988702774, "learning_rate": 0.0004987829059586082, "loss": 0.369, "step": 13700 }, { "epoch": 0.781783033886741, "grad_norm": 0.08376283198595047, "learning_rate": 0.000498778464009552, "loss": 0.3686, "step": 13750 }, { "epoch": 0.7846258812826927, "grad_norm": 0.09740207344293594, "learning_rate": 0.0004987740220604958, "loss": 0.3649, "step": 13800 }, { "epoch": 0.7874687286786445, "grad_norm": 0.08660556375980377, "learning_rate": 0.0004987695801114396, "loss": 0.3654, "step": 13850 }, { "epoch": 0.7903115760745963, "grad_norm": 0.09063564240932465, "learning_rate": 0.0004987651381623834, "loss": 0.3671, "step": 13900 }, { "epoch": 0.7931544234705481, "grad_norm": 0.12205325812101364, "learning_rate": 0.0004987606962133272, "loss": 0.3667, "step": 13950 }, { "epoch": 0.7959972708664999, "grad_norm": 0.1259283423423767, "learning_rate": 0.0004987562542642711, "loss": 0.3611, "step": 14000 }, { "epoch": 0.7988401182624517, "grad_norm": 0.0938865914940834, "learning_rate": 0.000498751812315215, "loss": 0.3666, "step": 14050 }, { "epoch": 0.8016829656584035, "grad_norm": 0.10185267776250839, "learning_rate": 0.0004987473703661588, "loss": 0.3648, "step": 14100 }, { "epoch": 0.8045258130543552, "grad_norm": 0.11661996692419052, "learning_rate": 0.0004987429284171026, "loss": 0.3657, "step": 14150 }, { "epoch": 0.807368660450307, "grad_norm": 0.0994558110833168, "learning_rate": 0.0004987384864680464, "loss": 0.3639, "step": 14200 }, { "epoch": 0.8102115078462588, "grad_norm": 0.09819088876247406, "learning_rate": 0.0004987340445189902, "loss": 0.3642, "step": 14250 }, { "epoch": 0.8130543552422106, "grad_norm": 0.09436295926570892, "learning_rate": 0.000498729602569934, "loss": 0.3677, "step": 14300 }, { "epoch": 0.8158972026381623, "grad_norm": 0.12002996355295181, "learning_rate": 0.0004987251606208778, "loss": 0.366, "step": 14350 }, { "epoch": 0.8187400500341142, "grad_norm": 0.08650200068950653, "learning_rate": 0.0004987207186718217, "loss": 0.3645, "step": 14400 }, { "epoch": 0.821582897430066, "grad_norm": 0.1826743632555008, "learning_rate": 0.0004987162767227655, "loss": 0.3681, "step": 14450 }, { "epoch": 0.8244257448260177, "grad_norm": 0.10164210200309753, "learning_rate": 0.0004987118347737093, "loss": 0.3706, "step": 14500 }, { "epoch": 0.8272685922219696, "grad_norm": 0.09652096033096313, "learning_rate": 0.0004987073928246532, "loss": 0.364, "step": 14550 }, { "epoch": 0.8301114396179213, "grad_norm": 0.1318165808916092, "learning_rate": 0.0004987029508755971, "loss": 0.3651, "step": 14600 }, { "epoch": 0.8329542870138731, "grad_norm": 0.09845109283924103, "learning_rate": 0.0004986985089265409, "loss": 0.3636, "step": 14650 }, { "epoch": 0.8357971344098248, "grad_norm": 0.12955370545387268, "learning_rate": 0.0004986940669774847, "loss": 0.3641, "step": 14700 }, { "epoch": 0.8386399818057767, "grad_norm": 0.10223904252052307, "learning_rate": 0.0004986896250284285, "loss": 0.363, "step": 14750 }, { "epoch": 0.8414828292017285, "grad_norm": 0.09701918810606003, "learning_rate": 0.0004986851830793723, "loss": 0.3627, "step": 14800 }, { "epoch": 0.8443256765976802, "grad_norm": 0.0875435471534729, "learning_rate": 0.0004986807411303161, "loss": 0.3651, "step": 14850 }, { "epoch": 0.8471685239936321, "grad_norm": 0.09006401896476746, "learning_rate": 0.0004986762991812599, "loss": 0.3632, "step": 14900 }, { "epoch": 0.8500113713895838, "grad_norm": 0.09841691702604294, "learning_rate": 0.0004986718572322037, "loss": 0.3641, "step": 14950 }, { "epoch": 0.8528542187855356, "grad_norm": 0.11014077812433243, "learning_rate": 0.0004986674152831477, "loss": 0.3654, "step": 15000 }, { "epoch": 0.8528542187855356, "eval_loss": 0.3545059561729431, "eval_runtime": 572.3313, "eval_samples_per_second": 230.478, "eval_steps_per_second": 28.81, "step": 15000 }, { "epoch": 0.8556970661814873, "grad_norm": 0.11560770869255066, "learning_rate": 0.0004986629733340915, "loss": 0.3644, "step": 15050 }, { "epoch": 0.8585399135774392, "grad_norm": 0.08875507861375809, "learning_rate": 0.0004986585313850353, "loss": 0.3637, "step": 15100 }, { "epoch": 0.861382760973391, "grad_norm": 0.09458891302347183, "learning_rate": 0.0004986540894359791, "loss": 0.3638, "step": 15150 }, { "epoch": 0.8642256083693427, "grad_norm": 0.10495459288358688, "learning_rate": 0.0004986496474869229, "loss": 0.3647, "step": 15200 }, { "epoch": 0.8670684557652946, "grad_norm": 0.10532251745462418, "learning_rate": 0.0004986452055378667, "loss": 0.3664, "step": 15250 }, { "epoch": 0.8699113031612463, "grad_norm": 0.09231790155172348, "learning_rate": 0.0004986407635888105, "loss": 0.3643, "step": 15300 }, { "epoch": 0.8727541505571981, "grad_norm": 0.09325330704450607, "learning_rate": 0.0004986363216397543, "loss": 0.3659, "step": 15350 }, { "epoch": 0.8755969979531498, "grad_norm": 0.11519546806812286, "learning_rate": 0.0004986318796906982, "loss": 0.3654, "step": 15400 }, { "epoch": 0.8784398453491017, "grad_norm": 0.08858964592218399, "learning_rate": 0.0004986274377416421, "loss": 0.3647, "step": 15450 }, { "epoch": 0.8812826927450534, "grad_norm": 0.11211369931697845, "learning_rate": 0.0004986229957925859, "loss": 0.3646, "step": 15500 }, { "epoch": 0.8841255401410052, "grad_norm": 0.09691707044839859, "learning_rate": 0.0004986185538435297, "loss": 0.3633, "step": 15550 }, { "epoch": 0.8869683875369571, "grad_norm": 0.09831210970878601, "learning_rate": 0.0004986141118944735, "loss": 0.3644, "step": 15600 }, { "epoch": 0.8898112349329088, "grad_norm": 0.10148192197084427, "learning_rate": 0.0004986096699454173, "loss": 0.3613, "step": 15650 }, { "epoch": 0.8926540823288606, "grad_norm": 0.0974993035197258, "learning_rate": 0.0004986052279963611, "loss": 0.3648, "step": 15700 }, { "epoch": 0.8954969297248123, "grad_norm": 0.09062571078538895, "learning_rate": 0.0004986007860473049, "loss": 0.3622, "step": 15750 }, { "epoch": 0.8983397771207642, "grad_norm": 0.11505638062953949, "learning_rate": 0.0004985963440982488, "loss": 0.3647, "step": 15800 }, { "epoch": 0.9011826245167159, "grad_norm": 0.10409519821405411, "learning_rate": 0.0004985919021491926, "loss": 0.3684, "step": 15850 }, { "epoch": 0.9040254719126677, "grad_norm": 0.09695091098546982, "learning_rate": 0.0004985874602001365, "loss": 0.3627, "step": 15900 }, { "epoch": 0.9068683193086196, "grad_norm": 0.10802540183067322, "learning_rate": 0.0004985830182510803, "loss": 0.363, "step": 15950 }, { "epoch": 0.9097111667045713, "grad_norm": 0.08416867256164551, "learning_rate": 0.0004985785763020242, "loss": 0.3633, "step": 16000 }, { "epoch": 0.9125540141005231, "grad_norm": 0.12350185215473175, "learning_rate": 0.000498574134352968, "loss": 0.3645, "step": 16050 }, { "epoch": 0.9153968614964749, "grad_norm": 0.10537421703338623, "learning_rate": 0.0004985696924039118, "loss": 0.3635, "step": 16100 }, { "epoch": 0.9182397088924267, "grad_norm": 0.09744290262460709, "learning_rate": 0.0004985652504548556, "loss": 0.3623, "step": 16150 }, { "epoch": 0.9210825562883784, "grad_norm": 0.09753882884979248, "learning_rate": 0.0004985608085057994, "loss": 0.3617, "step": 16200 }, { "epoch": 0.9239254036843302, "grad_norm": 0.08662727475166321, "learning_rate": 0.0004985563665567432, "loss": 0.3624, "step": 16250 }, { "epoch": 0.926768251080282, "grad_norm": 0.09310087561607361, "learning_rate": 0.000498551924607687, "loss": 0.3631, "step": 16300 }, { "epoch": 0.9296110984762338, "grad_norm": 0.10131137073040009, "learning_rate": 0.0004985474826586308, "loss": 0.3593, "step": 16350 }, { "epoch": 0.9324539458721856, "grad_norm": 0.09597592055797577, "learning_rate": 0.0004985430407095748, "loss": 0.3628, "step": 16400 }, { "epoch": 0.9352967932681374, "grad_norm": 0.09163256734609604, "learning_rate": 0.0004985385987605186, "loss": 0.3596, "step": 16450 }, { "epoch": 0.9381396406640892, "grad_norm": 0.11283744126558304, "learning_rate": 0.0004985341568114624, "loss": 0.3629, "step": 16500 }, { "epoch": 0.9409824880600409, "grad_norm": 0.1027892455458641, "learning_rate": 0.0004985297148624062, "loss": 0.3624, "step": 16550 }, { "epoch": 0.9438253354559927, "grad_norm": 0.09853541105985641, "learning_rate": 0.00049852527291335, "loss": 0.3606, "step": 16600 }, { "epoch": 0.9466681828519445, "grad_norm": 0.09280608594417572, "learning_rate": 0.0004985208309642938, "loss": 0.3626, "step": 16650 }, { "epoch": 0.9495110302478963, "grad_norm": 0.09166574478149414, "learning_rate": 0.0004985163890152376, "loss": 0.3631, "step": 16700 }, { "epoch": 0.952353877643848, "grad_norm": 0.10630334913730621, "learning_rate": 0.0004985119470661814, "loss": 0.3611, "step": 16750 }, { "epoch": 0.9551967250397999, "grad_norm": 0.09024935215711594, "learning_rate": 0.0004985075051171253, "loss": 0.3627, "step": 16800 }, { "epoch": 0.9580395724357517, "grad_norm": 0.08191289007663727, "learning_rate": 0.0004985030631680692, "loss": 0.3586, "step": 16850 }, { "epoch": 0.9608824198317034, "grad_norm": 0.08423138409852982, "learning_rate": 0.000498498621219013, "loss": 0.3622, "step": 16900 }, { "epoch": 0.9637252672276552, "grad_norm": 0.09373466670513153, "learning_rate": 0.0004984941792699568, "loss": 0.3611, "step": 16950 }, { "epoch": 0.966568114623607, "grad_norm": 0.09812022745609283, "learning_rate": 0.0004984897373209006, "loss": 0.3627, "step": 17000 }, { "epoch": 0.9694109620195588, "grad_norm": 0.09764017909765244, "learning_rate": 0.0004984852953718444, "loss": 0.3589, "step": 17050 }, { "epoch": 0.9722538094155105, "grad_norm": 0.10176458954811096, "learning_rate": 0.0004984808534227883, "loss": 0.3594, "step": 17100 }, { "epoch": 0.9750966568114624, "grad_norm": 0.10278456658124924, "learning_rate": 0.0004984764114737321, "loss": 0.3581, "step": 17150 }, { "epoch": 0.9779395042074142, "grad_norm": 0.1218334510922432, "learning_rate": 0.0004984719695246759, "loss": 0.3606, "step": 17200 }, { "epoch": 0.9807823516033659, "grad_norm": 0.09451757371425629, "learning_rate": 0.0004984675275756197, "loss": 0.3614, "step": 17250 }, { "epoch": 0.9836251989993177, "grad_norm": 0.10088694840669632, "learning_rate": 0.0004984630856265636, "loss": 0.3604, "step": 17300 }, { "epoch": 0.9864680463952695, "grad_norm": 0.10026043653488159, "learning_rate": 0.0004984586436775075, "loss": 0.3604, "step": 17350 }, { "epoch": 0.9893108937912213, "grad_norm": 0.11412831395864487, "learning_rate": 0.0004984542017284513, "loss": 0.3608, "step": 17400 }, { "epoch": 0.992153741187173, "grad_norm": 0.10250277817249298, "learning_rate": 0.0004984497597793951, "loss": 0.3597, "step": 17450 }, { "epoch": 0.9949965885831249, "grad_norm": 0.10922574251890182, "learning_rate": 0.0004984453178303389, "loss": 0.3611, "step": 17500 }, { "epoch": 0.9978394359790766, "grad_norm": 0.09554197639226913, "learning_rate": 0.0004984408758812827, "loss": 0.3596, "step": 17550 }, { "epoch": 1.0006822833750284, "grad_norm": 0.11213108897209167, "learning_rate": 0.0004984364339322265, "loss": 0.3617, "step": 17600 }, { "epoch": 1.0035251307709803, "grad_norm": 0.13735197484493256, "learning_rate": 0.0004984319919831703, "loss": 0.3528, "step": 17650 }, { "epoch": 1.006367978166932, "grad_norm": 0.10477373749017715, "learning_rate": 0.0004984275500341141, "loss": 0.3561, "step": 17700 }, { "epoch": 1.0092108255628838, "grad_norm": 0.10232077538967133, "learning_rate": 0.0004984231080850581, "loss": 0.3563, "step": 17750 }, { "epoch": 1.0120536729588356, "grad_norm": 0.11949522793292999, "learning_rate": 0.0004984186661360019, "loss": 0.3534, "step": 17800 }, { "epoch": 1.0148965203547873, "grad_norm": 0.10367763787508011, "learning_rate": 0.0004984142241869457, "loss": 0.3564, "step": 17850 }, { "epoch": 1.0177393677507391, "grad_norm": 0.11785215884447098, "learning_rate": 0.0004984097822378895, "loss": 0.3584, "step": 17900 }, { "epoch": 1.020582215146691, "grad_norm": 0.09687703847885132, "learning_rate": 0.0004984053402888333, "loss": 0.356, "step": 17950 }, { "epoch": 1.0234250625426426, "grad_norm": 0.1126837283372879, "learning_rate": 0.0004984008983397771, "loss": 0.3545, "step": 18000 }, { "epoch": 1.0262679099385945, "grad_norm": 0.0994856208562851, "learning_rate": 0.0004983964563907209, "loss": 0.3556, "step": 18050 }, { "epoch": 1.0291107573345464, "grad_norm": 0.09857575595378876, "learning_rate": 0.0004983920144416647, "loss": 0.3596, "step": 18100 }, { "epoch": 1.031953604730498, "grad_norm": 0.10661829262971878, "learning_rate": 0.0004983875724926086, "loss": 0.3557, "step": 18150 }, { "epoch": 1.0347964521264499, "grad_norm": 0.08805033564567566, "learning_rate": 0.0004983831305435525, "loss": 0.3552, "step": 18200 }, { "epoch": 1.0376392995224015, "grad_norm": 0.09851615130901337, "learning_rate": 0.0004983786885944963, "loss": 0.3595, "step": 18250 }, { "epoch": 1.0404821469183534, "grad_norm": 0.10526396334171295, "learning_rate": 0.0004983742466454401, "loss": 0.3557, "step": 18300 }, { "epoch": 1.0433249943143053, "grad_norm": 0.10095933824777603, "learning_rate": 0.0004983698046963839, "loss": 0.3545, "step": 18350 }, { "epoch": 1.046167841710257, "grad_norm": 0.0906311646103859, "learning_rate": 0.0004983653627473278, "loss": 0.3539, "step": 18400 }, { "epoch": 1.0490106891062088, "grad_norm": 0.10459648072719574, "learning_rate": 0.0004983609207982716, "loss": 0.3551, "step": 18450 }, { "epoch": 1.0518535365021606, "grad_norm": 0.08786173164844513, "learning_rate": 0.0004983564788492154, "loss": 0.3534, "step": 18500 }, { "epoch": 1.0546963838981123, "grad_norm": 0.10580555349588394, "learning_rate": 0.0004983520369001592, "loss": 0.3552, "step": 18550 }, { "epoch": 1.0575392312940641, "grad_norm": 0.09845657646656036, "learning_rate": 0.000498347594951103, "loss": 0.3567, "step": 18600 }, { "epoch": 1.060382078690016, "grad_norm": 0.09070323407649994, "learning_rate": 0.0004983431530020468, "loss": 0.3572, "step": 18650 }, { "epoch": 1.0632249260859676, "grad_norm": 0.11504799127578735, "learning_rate": 0.0004983387110529907, "loss": 0.3546, "step": 18700 }, { "epoch": 1.0660677734819195, "grad_norm": 0.0939740464091301, "learning_rate": 0.0004983342691039346, "loss": 0.3572, "step": 18750 }, { "epoch": 1.0689106208778714, "grad_norm": 0.11082996428012848, "learning_rate": 0.0004983298271548784, "loss": 0.357, "step": 18800 }, { "epoch": 1.071753468273823, "grad_norm": 0.09426229447126389, "learning_rate": 0.0004983253852058222, "loss": 0.3552, "step": 18850 }, { "epoch": 1.0745963156697749, "grad_norm": 0.09430352598428726, "learning_rate": 0.000498320943256766, "loss": 0.3559, "step": 18900 }, { "epoch": 1.0774391630657267, "grad_norm": 0.11377741396427155, "learning_rate": 0.0004983165013077098, "loss": 0.3517, "step": 18950 }, { "epoch": 1.0802820104616784, "grad_norm": 0.0836120992898941, "learning_rate": 0.0004983120593586536, "loss": 0.3546, "step": 19000 }, { "epoch": 1.0831248578576302, "grad_norm": 0.09483205527067184, "learning_rate": 0.0004983076174095974, "loss": 0.354, "step": 19050 }, { "epoch": 1.0859677052535819, "grad_norm": 0.08619007468223572, "learning_rate": 0.0004983031754605412, "loss": 0.3526, "step": 19100 }, { "epoch": 1.0888105526495337, "grad_norm": 0.09521963447332382, "learning_rate": 0.0004982987335114852, "loss": 0.3542, "step": 19150 }, { "epoch": 1.0916534000454856, "grad_norm": 0.08765676617622375, "learning_rate": 0.000498294291562429, "loss": 0.3542, "step": 19200 }, { "epoch": 1.0944962474414373, "grad_norm": 0.10146308690309525, "learning_rate": 0.0004982898496133728, "loss": 0.3559, "step": 19250 }, { "epoch": 1.0973390948373891, "grad_norm": 0.09321428835391998, "learning_rate": 0.0004982854076643166, "loss": 0.3576, "step": 19300 }, { "epoch": 1.100181942233341, "grad_norm": 0.11401006579399109, "learning_rate": 0.0004982809657152604, "loss": 0.3549, "step": 19350 }, { "epoch": 1.1030247896292926, "grad_norm": 0.11573155969381332, "learning_rate": 0.0004982765237662042, "loss": 0.3569, "step": 19400 }, { "epoch": 1.1058676370252445, "grad_norm": 0.08966556191444397, "learning_rate": 0.000498272081817148, "loss": 0.3603, "step": 19450 }, { "epoch": 1.1087104844211964, "grad_norm": 0.12934494018554688, "learning_rate": 0.0004982676398680918, "loss": 0.3533, "step": 19500 }, { "epoch": 1.111553331817148, "grad_norm": 0.13741086423397064, "learning_rate": 0.0004982631979190357, "loss": 0.3542, "step": 19550 }, { "epoch": 1.1143961792130999, "grad_norm": 0.10928856581449509, "learning_rate": 0.0004982587559699796, "loss": 0.3559, "step": 19600 }, { "epoch": 1.1172390266090517, "grad_norm": 0.09697018563747406, "learning_rate": 0.0004982543140209234, "loss": 0.3559, "step": 19650 }, { "epoch": 1.1200818740050034, "grad_norm": 0.09221459925174713, "learning_rate": 0.0004982498720718672, "loss": 0.3554, "step": 19700 }, { "epoch": 1.1229247214009552, "grad_norm": 0.091732919216156, "learning_rate": 0.0004982454301228111, "loss": 0.352, "step": 19750 }, { "epoch": 1.125767568796907, "grad_norm": 0.10662009567022324, "learning_rate": 0.0004982409881737549, "loss": 0.3582, "step": 19800 }, { "epoch": 1.1286104161928587, "grad_norm": 0.08810966461896896, "learning_rate": 0.0004982365462246987, "loss": 0.3534, "step": 19850 }, { "epoch": 1.1314532635888106, "grad_norm": 0.10295290499925613, "learning_rate": 0.0004982321042756425, "loss": 0.3577, "step": 19900 }, { "epoch": 1.1342961109847622, "grad_norm": 0.08759327977895737, "learning_rate": 0.0004982276623265863, "loss": 0.3529, "step": 19950 }, { "epoch": 1.137138958380714, "grad_norm": 0.08622145652770996, "learning_rate": 0.0004982232203775301, "loss": 0.3541, "step": 20000 }, { "epoch": 1.137138958380714, "eval_loss": 0.3468220829963684, "eval_runtime": 575.5127, "eval_samples_per_second": 229.204, "eval_steps_per_second": 28.651, "step": 20000 }, { "epoch": 1.139981805776666, "grad_norm": 0.09593155980110168, "learning_rate": 0.000498218778428474, "loss": 0.3533, "step": 20050 }, { "epoch": 1.1428246531726176, "grad_norm": 0.1172272190451622, "learning_rate": 0.0004982143364794178, "loss": 0.3513, "step": 20100 }, { "epoch": 1.1456675005685695, "grad_norm": 0.10402148216962814, "learning_rate": 0.0004982098945303617, "loss": 0.3587, "step": 20150 }, { "epoch": 1.1485103479645213, "grad_norm": 0.14057371020317078, "learning_rate": 0.0004982054525813055, "loss": 0.3525, "step": 20200 }, { "epoch": 1.151353195360473, "grad_norm": 0.11363080888986588, "learning_rate": 0.0004982010106322493, "loss": 0.3527, "step": 20250 }, { "epoch": 1.1541960427564248, "grad_norm": 0.10476770997047424, "learning_rate": 0.0004981965686831931, "loss": 0.354, "step": 20300 }, { "epoch": 1.1570388901523767, "grad_norm": 0.11272389441728592, "learning_rate": 0.0004981921267341369, "loss": 0.3562, "step": 20350 }, { "epoch": 1.1598817375483284, "grad_norm": 0.0867941826581955, "learning_rate": 0.0004981876847850807, "loss": 0.3531, "step": 20400 }, { "epoch": 1.1627245849442802, "grad_norm": 0.12125921249389648, "learning_rate": 0.0004981832428360245, "loss": 0.3523, "step": 20450 }, { "epoch": 1.1655674323402319, "grad_norm": 0.09518919885158539, "learning_rate": 0.0004981788008869683, "loss": 0.3562, "step": 20500 }, { "epoch": 1.1684102797361837, "grad_norm": 0.09472860395908356, "learning_rate": 0.0004981743589379123, "loss": 0.3555, "step": 20550 }, { "epoch": 1.1712531271321356, "grad_norm": 0.09812992066144943, "learning_rate": 0.0004981699169888561, "loss": 0.3559, "step": 20600 }, { "epoch": 1.1740959745280874, "grad_norm": 0.09982824325561523, "learning_rate": 0.0004981654750397999, "loss": 0.3511, "step": 20650 }, { "epoch": 1.176938821924039, "grad_norm": 0.11572203040122986, "learning_rate": 0.0004981610330907437, "loss": 0.3524, "step": 20700 }, { "epoch": 1.179781669319991, "grad_norm": 0.113502636551857, "learning_rate": 0.0004981565911416875, "loss": 0.3526, "step": 20750 }, { "epoch": 1.1826245167159426, "grad_norm": 0.09475291520357132, "learning_rate": 0.0004981521491926313, "loss": 0.3536, "step": 20800 }, { "epoch": 1.1854673641118945, "grad_norm": 0.09980995953083038, "learning_rate": 0.0004981477072435751, "loss": 0.3525, "step": 20850 }, { "epoch": 1.1883102115078463, "grad_norm": 0.09389813244342804, "learning_rate": 0.0004981432652945189, "loss": 0.3536, "step": 20900 }, { "epoch": 1.191153058903798, "grad_norm": 0.09969864040613174, "learning_rate": 0.0004981388233454628, "loss": 0.3536, "step": 20950 }, { "epoch": 1.1939959062997498, "grad_norm": 0.08675362169742584, "learning_rate": 0.0004981343813964067, "loss": 0.3538, "step": 21000 }, { "epoch": 1.1968387536957017, "grad_norm": 0.13347360491752625, "learning_rate": 0.0004981299394473505, "loss": 0.3536, "step": 21050 }, { "epoch": 1.1996816010916533, "grad_norm": 0.1156553402543068, "learning_rate": 0.0004981254974982943, "loss": 0.3516, "step": 21100 }, { "epoch": 1.2025244484876052, "grad_norm": 0.08983340859413147, "learning_rate": 0.0004981210555492382, "loss": 0.3537, "step": 21150 }, { "epoch": 1.205367295883557, "grad_norm": 0.0790681466460228, "learning_rate": 0.000498116613600182, "loss": 0.3527, "step": 21200 }, { "epoch": 1.2082101432795087, "grad_norm": 0.12216062098741531, "learning_rate": 0.0004981121716511258, "loss": 0.3535, "step": 21250 }, { "epoch": 1.2110529906754606, "grad_norm": 0.08232084661722183, "learning_rate": 0.0004981077297020696, "loss": 0.3543, "step": 21300 }, { "epoch": 1.2138958380714122, "grad_norm": 0.09808045625686646, "learning_rate": 0.0004981032877530134, "loss": 0.3526, "step": 21350 }, { "epoch": 1.216738685467364, "grad_norm": 0.0974017009139061, "learning_rate": 0.0004980988458039572, "loss": 0.3518, "step": 21400 }, { "epoch": 1.219581532863316, "grad_norm": 0.11234429478645325, "learning_rate": 0.000498094403854901, "loss": 0.3549, "step": 21450 }, { "epoch": 1.2224243802592678, "grad_norm": 0.0913853719830513, "learning_rate": 0.0004980899619058449, "loss": 0.3572, "step": 21500 }, { "epoch": 1.2252672276552194, "grad_norm": 0.11108215898275375, "learning_rate": 0.0004980855199567888, "loss": 0.3532, "step": 21550 }, { "epoch": 1.2281100750511713, "grad_norm": 0.09476979076862335, "learning_rate": 0.0004980810780077326, "loss": 0.3557, "step": 21600 }, { "epoch": 1.230952922447123, "grad_norm": 0.0904000923037529, "learning_rate": 0.0004980766360586764, "loss": 0.3512, "step": 21650 }, { "epoch": 1.2337957698430748, "grad_norm": 0.09948475658893585, "learning_rate": 0.0004980721941096202, "loss": 0.3533, "step": 21700 }, { "epoch": 1.2366386172390267, "grad_norm": 0.0987270325422287, "learning_rate": 0.000498067752160564, "loss": 0.354, "step": 21750 }, { "epoch": 1.2394814646349783, "grad_norm": 0.10672775655984879, "learning_rate": 0.0004980633102115078, "loss": 0.3521, "step": 21800 }, { "epoch": 1.2423243120309302, "grad_norm": 0.11016605794429779, "learning_rate": 0.0004980588682624516, "loss": 0.3519, "step": 21850 }, { "epoch": 1.245167159426882, "grad_norm": 0.11307205259799957, "learning_rate": 0.0004980544263133954, "loss": 0.3545, "step": 21900 }, { "epoch": 1.2480100068228337, "grad_norm": 0.09647481143474579, "learning_rate": 0.0004980499843643394, "loss": 0.3497, "step": 21950 }, { "epoch": 1.2508528542187856, "grad_norm": 0.11781589686870575, "learning_rate": 0.0004980455424152832, "loss": 0.3527, "step": 22000 }, { "epoch": 1.2536957016147374, "grad_norm": 0.09155919402837753, "learning_rate": 0.000498041100466227, "loss": 0.3541, "step": 22050 }, { "epoch": 1.256538549010689, "grad_norm": 0.09770865738391876, "learning_rate": 0.0004980366585171708, "loss": 0.351, "step": 22100 }, { "epoch": 1.259381396406641, "grad_norm": 0.09057383239269257, "learning_rate": 0.0004980322165681146, "loss": 0.3526, "step": 22150 }, { "epoch": 1.2622242438025926, "grad_norm": 0.10171497613191605, "learning_rate": 0.0004980277746190584, "loss": 0.3516, "step": 22200 }, { "epoch": 1.2650670911985444, "grad_norm": 0.12109605222940445, "learning_rate": 0.0004980233326700022, "loss": 0.3479, "step": 22250 }, { "epoch": 1.2679099385944963, "grad_norm": 0.08494057506322861, "learning_rate": 0.000498018890720946, "loss": 0.3536, "step": 22300 }, { "epoch": 1.2707527859904482, "grad_norm": 0.10715505480766296, "learning_rate": 0.00049801444877189, "loss": 0.354, "step": 22350 }, { "epoch": 1.2735956333863998, "grad_norm": 0.09710002690553665, "learning_rate": 0.0004980100068228338, "loss": 0.3504, "step": 22400 }, { "epoch": 1.2764384807823517, "grad_norm": 0.10588357597589493, "learning_rate": 0.0004980055648737776, "loss": 0.3552, "step": 22450 }, { "epoch": 1.2792813281783033, "grad_norm": 0.09577326476573944, "learning_rate": 0.0004980011229247214, "loss": 0.3523, "step": 22500 }, { "epoch": 1.2821241755742552, "grad_norm": 0.0960511788725853, "learning_rate": 0.0004979966809756653, "loss": 0.3496, "step": 22550 }, { "epoch": 1.284967022970207, "grad_norm": 0.09112073481082916, "learning_rate": 0.0004979922390266091, "loss": 0.3516, "step": 22600 }, { "epoch": 1.2878098703661587, "grad_norm": 0.11254040151834488, "learning_rate": 0.0004979877970775529, "loss": 0.3508, "step": 22650 }, { "epoch": 1.2906527177621105, "grad_norm": 0.1305217444896698, "learning_rate": 0.0004979833551284967, "loss": 0.3484, "step": 22700 }, { "epoch": 1.2934955651580622, "grad_norm": 0.11914518475532532, "learning_rate": 0.0004979789131794405, "loss": 0.3492, "step": 22750 }, { "epoch": 1.296338412554014, "grad_norm": 0.10428331047296524, "learning_rate": 0.0004979744712303843, "loss": 0.3516, "step": 22800 }, { "epoch": 1.299181259949966, "grad_norm": 0.10348132997751236, "learning_rate": 0.0004979700292813282, "loss": 0.3497, "step": 22850 }, { "epoch": 1.3020241073459178, "grad_norm": 0.08855703473091125, "learning_rate": 0.0004979655873322721, "loss": 0.3516, "step": 22900 }, { "epoch": 1.3048669547418694, "grad_norm": 0.11277921497821808, "learning_rate": 0.0004979611453832159, "loss": 0.3473, "step": 22950 }, { "epoch": 1.3077098021378213, "grad_norm": 0.092954620718956, "learning_rate": 0.0004979567034341597, "loss": 0.3483, "step": 23000 }, { "epoch": 1.310552649533773, "grad_norm": 0.11179167777299881, "learning_rate": 0.0004979522614851035, "loss": 0.3512, "step": 23050 }, { "epoch": 1.3133954969297248, "grad_norm": 0.09125496447086334, "learning_rate": 0.0004979478195360473, "loss": 0.3508, "step": 23100 }, { "epoch": 1.3162383443256767, "grad_norm": 0.0916043147444725, "learning_rate": 0.0004979433775869911, "loss": 0.3523, "step": 23150 }, { "epoch": 1.3190811917216285, "grad_norm": 0.09685220569372177, "learning_rate": 0.0004979389356379349, "loss": 0.3546, "step": 23200 }, { "epoch": 1.3219240391175802, "grad_norm": 0.10811689496040344, "learning_rate": 0.0004979344936888787, "loss": 0.3498, "step": 23250 }, { "epoch": 1.324766886513532, "grad_norm": 0.08741540461778641, "learning_rate": 0.0004979300517398227, "loss": 0.3493, "step": 23300 }, { "epoch": 1.3276097339094837, "grad_norm": 0.08877366036176682, "learning_rate": 0.0004979256097907665, "loss": 0.3527, "step": 23350 }, { "epoch": 1.3304525813054355, "grad_norm": 0.1235891655087471, "learning_rate": 0.0004979211678417103, "loss": 0.3525, "step": 23400 }, { "epoch": 1.3332954287013874, "grad_norm": 0.0945284515619278, "learning_rate": 0.0004979167258926541, "loss": 0.354, "step": 23450 }, { "epoch": 1.336138276097339, "grad_norm": 0.08477913588285446, "learning_rate": 0.0004979122839435979, "loss": 0.3515, "step": 23500 }, { "epoch": 1.338981123493291, "grad_norm": 0.09075385332107544, "learning_rate": 0.0004979078419945417, "loss": 0.3515, "step": 23550 }, { "epoch": 1.3418239708892425, "grad_norm": 0.08231678605079651, "learning_rate": 0.0004979034000454855, "loss": 0.3506, "step": 23600 }, { "epoch": 1.3446668182851944, "grad_norm": 0.08512122184038162, "learning_rate": 0.0004978989580964293, "loss": 0.3506, "step": 23650 }, { "epoch": 1.3475096656811463, "grad_norm": 0.09195214509963989, "learning_rate": 0.0004978945161473732, "loss": 0.3493, "step": 23700 }, { "epoch": 1.3503525130770981, "grad_norm": 0.09307179600000381, "learning_rate": 0.000497890074198317, "loss": 0.3466, "step": 23750 }, { "epoch": 1.3531953604730498, "grad_norm": 0.08473914861679077, "learning_rate": 0.0004978856322492609, "loss": 0.3512, "step": 23800 }, { "epoch": 1.3560382078690016, "grad_norm": 0.09676692634820938, "learning_rate": 0.0004978811903002047, "loss": 0.3495, "step": 23850 }, { "epoch": 1.3588810552649533, "grad_norm": 0.08961619436740875, "learning_rate": 0.0004978767483511486, "loss": 0.3493, "step": 23900 }, { "epoch": 1.3617239026609051, "grad_norm": 0.10256984084844589, "learning_rate": 0.0004978723064020924, "loss": 0.3519, "step": 23950 }, { "epoch": 1.364566750056857, "grad_norm": 0.09426955133676529, "learning_rate": 0.0004978678644530362, "loss": 0.3496, "step": 24000 }, { "epoch": 1.3674095974528087, "grad_norm": 0.09407513588666916, "learning_rate": 0.00049786342250398, "loss": 0.35, "step": 24050 }, { "epoch": 1.3702524448487605, "grad_norm": 0.11077912896871567, "learning_rate": 0.0004978589805549238, "loss": 0.3499, "step": 24100 }, { "epoch": 1.3730952922447124, "grad_norm": 0.09664590656757355, "learning_rate": 0.0004978545386058676, "loss": 0.3509, "step": 24150 }, { "epoch": 1.375938139640664, "grad_norm": 0.08676467835903168, "learning_rate": 0.0004978500966568114, "loss": 0.3498, "step": 24200 }, { "epoch": 1.378780987036616, "grad_norm": 0.07996726781129837, "learning_rate": 0.0004978456547077553, "loss": 0.3525, "step": 24250 }, { "epoch": 1.3816238344325678, "grad_norm": 0.1071229949593544, "learning_rate": 0.0004978412127586992, "loss": 0.3529, "step": 24300 }, { "epoch": 1.3844666818285194, "grad_norm": 0.09521368145942688, "learning_rate": 0.000497836770809643, "loss": 0.3501, "step": 24350 }, { "epoch": 1.3873095292244713, "grad_norm": 0.10842622071504593, "learning_rate": 0.0004978323288605868, "loss": 0.3469, "step": 24400 }, { "epoch": 1.390152376620423, "grad_norm": 0.08254586160182953, "learning_rate": 0.0004978278869115306, "loss": 0.3481, "step": 24450 }, { "epoch": 1.3929952240163748, "grad_norm": 0.122976154088974, "learning_rate": 0.0004978234449624744, "loss": 0.3519, "step": 24500 }, { "epoch": 1.3958380714123266, "grad_norm": 0.10202399641275406, "learning_rate": 0.0004978190030134182, "loss": 0.3481, "step": 24550 }, { "epoch": 1.3986809188082785, "grad_norm": 0.09278547018766403, "learning_rate": 0.000497814561064362, "loss": 0.3477, "step": 24600 }, { "epoch": 1.4015237662042301, "grad_norm": 0.09204497933387756, "learning_rate": 0.0004978101191153058, "loss": 0.3487, "step": 24650 }, { "epoch": 1.404366613600182, "grad_norm": 0.09469152987003326, "learning_rate": 0.0004978056771662498, "loss": 0.352, "step": 24700 }, { "epoch": 1.4072094609961336, "grad_norm": 0.09524402767419815, "learning_rate": 0.0004978012352171936, "loss": 0.3502, "step": 24750 }, { "epoch": 1.4100523083920855, "grad_norm": 0.09983129799365997, "learning_rate": 0.0004977967932681374, "loss": 0.3491, "step": 24800 }, { "epoch": 1.4128951557880374, "grad_norm": 0.13477516174316406, "learning_rate": 0.0004977923513190812, "loss": 0.3505, "step": 24850 }, { "epoch": 1.415738003183989, "grad_norm": 0.09205208718776703, "learning_rate": 0.000497787909370025, "loss": 0.3505, "step": 24900 }, { "epoch": 1.4185808505799409, "grad_norm": 0.08766143023967743, "learning_rate": 0.0004977834674209688, "loss": 0.3508, "step": 24950 }, { "epoch": 1.4214236979758925, "grad_norm": 0.11106764525175095, "learning_rate": 0.0004977790254719127, "loss": 0.3501, "step": 25000 }, { "epoch": 1.4214236979758925, "eval_loss": 0.3424564003944397, "eval_runtime": 577.5935, "eval_samples_per_second": 228.379, "eval_steps_per_second": 28.548, "step": 25000 } ], "logging_steps": 50, "max_steps": 5628160, "num_input_tokens_seen": 0, "num_train_epochs": 320, "save_steps": 5000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.134362268821094e+17, "train_batch_size": 120, "trial_name": null, "trial_params": null }