{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.125, "eval_steps": 2000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025, "grad_norm": 31.5, "learning_rate": 0.0001, "loss": 7.633, "loss/crossentropy": 2.065455098450184, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.20220321230590343, "step": 10 }, { "epoch": 0.0005, "grad_norm": 35.0, "grad_norm_var": 2.6895182291666666, "learning_rate": 0.0001, "loss": 7.4618, "loss/crossentropy": 1.9399560801684856, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.19191570337861777, "step": 20 }, { "epoch": 0.00075, "grad_norm": 37.5, "grad_norm_var": 6.579622395833334, "learning_rate": 0.0001, "loss": 7.5972, "loss/crossentropy": 2.130601316690445, "loss/hidden": 3.38984375, "loss/jsd": 0.0, "loss/logits": 0.20188977513462306, "step": 30 }, { "epoch": 0.001, "grad_norm": 33.5, "grad_norm_var": 6.253125, "learning_rate": 0.0001, "loss": 7.5917, "loss/crossentropy": 2.2571407079696657, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.19847887996584176, "step": 40 }, { "epoch": 0.00125, "grad_norm": 32.25, "grad_norm_var": 2.1619140625, "learning_rate": 0.0001, "loss": 7.6054, "loss/crossentropy": 2.1717565625905992, "loss/hidden": 3.43359375, "loss/jsd": 0.0, "loss/logits": 0.20264342725276946, "step": 50 }, { "epoch": 0.0015, "grad_norm": 35.5, "grad_norm_var": 15.786393229166666, "learning_rate": 0.0001, "loss": 7.5513, "loss/crossentropy": 2.070718301087618, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.19855907820165158, "step": 60 }, { "epoch": 0.00175, "grad_norm": 31.0, "grad_norm_var": 12.4625, "learning_rate": 0.0001, "loss": 7.5447, "loss/crossentropy": 2.118075390160084, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.20283062420785428, "step": 70 }, { "epoch": 0.002, "grad_norm": 32.25, "grad_norm_var": 1.2643229166666667, "learning_rate": 0.0001, "loss": 7.468, "loss/crossentropy": 2.0006178975105287, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.18958428762853147, "step": 80 }, { "epoch": 0.00225, "grad_norm": 30.625, "grad_norm_var": 3.470572916666667, "learning_rate": 0.0001, "loss": 7.5061, "loss/crossentropy": 1.9605075903236866, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.20559987109154462, "step": 90 }, { "epoch": 0.0025, "grad_norm": 31.125, "grad_norm_var": 6.763541666666667, "learning_rate": 0.0001, "loss": 7.4928, "loss/crossentropy": 2.1205389350652695, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19496036488562823, "step": 100 }, { "epoch": 0.00275, "grad_norm": 31.0, "grad_norm_var": 6.1509765625, "learning_rate": 0.0001, "loss": 7.595, "loss/crossentropy": 2.1240097641944886, "loss/hidden": 3.43671875, "loss/jsd": 0.0, "loss/logits": 0.19564666803926228, "step": 110 }, { "epoch": 0.003, "grad_norm": 31.25, "grad_norm_var": 3.348893229166667, "learning_rate": 0.0001, "loss": 7.5329, "loss/crossentropy": 2.175096944719553, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.21303062327206135, "step": 120 }, { "epoch": 0.00325, "grad_norm": 32.0, "grad_norm_var": 2.8541666666666665, "learning_rate": 0.0001, "loss": 7.5536, "loss/crossentropy": 2.1472502022981645, "loss/hidden": 3.342578125, "loss/jsd": 0.0, "loss/logits": 0.18929538186639547, "step": 130 }, { "epoch": 0.0035, "grad_norm": 29.375, "grad_norm_var": 29.683268229166668, "learning_rate": 0.0001, "loss": 7.5191, "loss/crossentropy": 2.015011890232563, "loss/hidden": 3.44296875, "loss/jsd": 0.0, "loss/logits": 0.20328481420874595, "step": 140 }, { "epoch": 0.00375, "grad_norm": 28.75, "grad_norm_var": 28.74765625, "learning_rate": 0.0001, "loss": 7.4158, "loss/crossentropy": 1.9774167470633983, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19464388117194176, "step": 150 }, { "epoch": 0.004, "grad_norm": 30.875, "grad_norm_var": 1.3635416666666667, "learning_rate": 0.0001, "loss": 7.6354, "loss/crossentropy": 2.320629420876503, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.20745602920651435, "step": 160 }, { "epoch": 0.00425, "grad_norm": 31.5, "grad_norm_var": 1.0270182291666667, "learning_rate": 0.0001, "loss": 7.4137, "loss/crossentropy": 1.900385806709528, "loss/hidden": 3.345703125, "loss/jsd": 0.0, "loss/logits": 0.16769229620695114, "step": 170 }, { "epoch": 0.0045, "grad_norm": 31.25, "grad_norm_var": 0.9833333333333333, "learning_rate": 0.0001, "loss": 7.5763, "loss/crossentropy": 2.129625543951988, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.2102549459785223, "step": 180 }, { "epoch": 0.00475, "grad_norm": 32.25, "grad_norm_var": 3.05390625, "learning_rate": 0.0001, "loss": 7.6166, "loss/crossentropy": 2.1552532628178596, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.2250068686902523, "step": 190 }, { "epoch": 0.005, "grad_norm": 29.625, "grad_norm_var": 3.8375, "learning_rate": 0.0001, "loss": 7.5745, "loss/crossentropy": 1.9441482461988926, "loss/hidden": 3.387890625, "loss/jsd": 0.0, "loss/logits": 0.195942450594157, "step": 200 }, { "epoch": 0.00525, "grad_norm": 32.5, "grad_norm_var": 18.396875, "learning_rate": 0.0001, "loss": 7.5292, "loss/crossentropy": 1.9941987417638303, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.18264975901693106, "step": 210 }, { "epoch": 0.0055, "grad_norm": 31.75, "grad_norm_var": 20.736393229166666, "learning_rate": 0.0001, "loss": 7.4899, "loss/crossentropy": 2.0191620789468288, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.18100650198757648, "step": 220 }, { "epoch": 0.00575, "grad_norm": 30.375, "grad_norm_var": 2.342643229166667, "learning_rate": 0.0001, "loss": 7.5199, "loss/crossentropy": 2.001779730618, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.17959208656102418, "step": 230 }, { "epoch": 0.006, "grad_norm": 30.75, "grad_norm_var": 1.271875, "learning_rate": 0.0001, "loss": 7.6842, "loss/crossentropy": 2.1846971333026888, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.2059234745800495, "step": 240 }, { "epoch": 0.00625, "grad_norm": 29.5, "grad_norm_var": 5.688541666666667, "learning_rate": 0.0001, "loss": 7.5196, "loss/crossentropy": 2.174124576151371, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.20000722594559192, "step": 250 }, { "epoch": 0.0065, "grad_norm": 28.75, "grad_norm_var": 1.9572265625, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 1.9285166233778, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.18449910767376423, "step": 260 }, { "epoch": 0.00675, "grad_norm": 33.5, "grad_norm_var": 2.0999348958333335, "learning_rate": 0.0001, "loss": 7.5877, "loss/crossentropy": 2.0323276594281197, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.19395631980150937, "step": 270 }, { "epoch": 0.007, "grad_norm": 30.5, "grad_norm_var": 2.15390625, "learning_rate": 0.0001, "loss": 7.5791, "loss/crossentropy": 2.126656140387058, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.21661139875650406, "step": 280 }, { "epoch": 0.00725, "grad_norm": 29.5, "grad_norm_var": 3.193489583333333, "learning_rate": 0.0001, "loss": 7.5587, "loss/crossentropy": 2.200097793340683, "loss/hidden": 3.529296875, "loss/jsd": 0.0, "loss/logits": 0.21046234332025052, "step": 290 }, { "epoch": 0.0075, "grad_norm": 26.75, "grad_norm_var": 4.27265625, "learning_rate": 0.0001, "loss": 7.5404, "loss/crossentropy": 2.1184144005179406, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.20949590150266886, "step": 300 }, { "epoch": 0.00775, "grad_norm": 33.0, "grad_norm_var": 3.3643229166666666, "learning_rate": 0.0001, "loss": 7.5628, "loss/crossentropy": 1.9984030593186617, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.18789457948878407, "step": 310 }, { "epoch": 0.008, "grad_norm": 32.5, "grad_norm_var": 2.5645182291666666, "learning_rate": 0.0001, "loss": 7.5695, "loss/crossentropy": 2.143594169616699, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.19360470157116652, "step": 320 }, { "epoch": 0.00825, "grad_norm": 29.375, "grad_norm_var": 1.8749348958333334, "learning_rate": 0.0001, "loss": 7.3627, "loss/crossentropy": 2.1077703177928924, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.19771252572536469, "step": 330 }, { "epoch": 0.0085, "grad_norm": 29.75, "grad_norm_var": 1.5978515625, "learning_rate": 0.0001, "loss": 7.4192, "loss/crossentropy": 2.0583472289144993, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.20189273860305548, "step": 340 }, { "epoch": 0.00875, "grad_norm": 29.875, "grad_norm_var": 1.2872395833333334, "learning_rate": 0.0001, "loss": 7.5432, "loss/crossentropy": 2.0804511278867723, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19735569059848784, "step": 350 }, { "epoch": 0.009, "grad_norm": 30.5, "grad_norm_var": 18.731184895833334, "learning_rate": 0.0001, "loss": 7.4948, "loss/crossentropy": 2.0466629534959795, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.18366040643304588, "step": 360 }, { "epoch": 0.00925, "grad_norm": 30.875, "grad_norm_var": 25.9916015625, "learning_rate": 0.0001, "loss": 7.5081, "loss/crossentropy": 1.9005662694573402, "loss/hidden": 3.501171875, "loss/jsd": 0.0, "loss/logits": 0.1900689721107483, "step": 370 }, { "epoch": 0.0095, "grad_norm": 28.75, "grad_norm_var": 2.451041666666667, "learning_rate": 0.0001, "loss": 7.4305, "loss/crossentropy": 2.0674299761652946, "loss/hidden": 3.517578125, "loss/jsd": 0.0, "loss/logits": 0.21062961965799332, "step": 380 }, { "epoch": 0.00975, "grad_norm": 31.25, "grad_norm_var": 5.645247395833334, "learning_rate": 0.0001, "loss": 7.5168, "loss/crossentropy": 2.0279919117689134, "loss/hidden": 3.503125, "loss/jsd": 0.0, "loss/logits": 0.20519332773983479, "step": 390 }, { "epoch": 0.01, "grad_norm": 31.125, "grad_norm_var": 5.928125, "learning_rate": 0.0001, "loss": 7.4985, "loss/crossentropy": 2.0427632443606853, "loss/hidden": 3.53125, "loss/jsd": 0.0, "loss/logits": 0.20287631042301654, "step": 400 }, { "epoch": 0.01025, "grad_norm": 38.5, "grad_norm_var": 438.43515625, "learning_rate": 0.0001, "loss": 7.5633, "loss/crossentropy": 2.199043881893158, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.21130343191325665, "step": 410 }, { "epoch": 0.0105, "grad_norm": 30.875, "grad_norm_var": 43.14140625, "learning_rate": 0.0001, "loss": 7.4835, "loss/crossentropy": 1.9102243572473525, "loss/hidden": 3.42578125, "loss/jsd": 0.0, "loss/logits": 0.1895731385797262, "step": 420 }, { "epoch": 0.01075, "grad_norm": 31.75, "grad_norm_var": 5.658268229166667, "learning_rate": 0.0001, "loss": 7.3897, "loss/crossentropy": 2.159160128980875, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.20280379485338926, "step": 430 }, { "epoch": 0.011, "grad_norm": 28.375, "grad_norm_var": 16.3375, "learning_rate": 0.0001, "loss": 7.5463, "loss/crossentropy": 2.1217672407627104, "loss/hidden": 3.545703125, "loss/jsd": 0.0, "loss/logits": 0.23856931366026402, "step": 440 }, { "epoch": 0.01125, "grad_norm": 30.5, "grad_norm_var": 17.098372395833334, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 1.969854873791337, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.19548849146813155, "step": 450 }, { "epoch": 0.0115, "grad_norm": 29.875, "grad_norm_var": 2.5677083333333335, "learning_rate": 0.0001, "loss": 7.5046, "loss/crossentropy": 2.121321603655815, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.19364523217082025, "step": 460 }, { "epoch": 0.01175, "grad_norm": 32.25, "grad_norm_var": 8.585416666666667, "learning_rate": 0.0001, "loss": 7.4558, "loss/crossentropy": 1.9360710382461548, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.1893781816586852, "step": 470 }, { "epoch": 0.012, "grad_norm": 29.875, "grad_norm_var": 3.417122395833333, "learning_rate": 0.0001, "loss": 7.531, "loss/crossentropy": 2.082458943128586, "loss/hidden": 3.471875, "loss/jsd": 0.0, "loss/logits": 0.2220946006476879, "step": 480 }, { "epoch": 0.01225, "grad_norm": 31.0, "grad_norm_var": 48.96640625, "learning_rate": 0.0001, "loss": 7.5651, "loss/crossentropy": 2.1382531195878984, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.20847559962421655, "step": 490 }, { "epoch": 0.0125, "grad_norm": 29.875, "grad_norm_var": 49.2666015625, "learning_rate": 0.0001, "loss": 7.5679, "loss/crossentropy": 2.0875915244221686, "loss/hidden": 3.33125, "loss/jsd": 0.0, "loss/logits": 0.1850985599681735, "step": 500 }, { "epoch": 0.01275, "grad_norm": 31.875, "grad_norm_var": 1.45, "learning_rate": 0.0001, "loss": 7.5263, "loss/crossentropy": 2.182442346215248, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.19555890336632728, "step": 510 }, { "epoch": 0.013, "grad_norm": 34.0, "grad_norm_var": 1.6931640625, "learning_rate": 0.0001, "loss": 7.5209, "loss/crossentropy": 1.9812136888504028, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.1965757070109248, "step": 520 }, { "epoch": 0.01325, "grad_norm": 31.0, "grad_norm_var": 2.101822916666667, "learning_rate": 0.0001, "loss": 7.6059, "loss/crossentropy": 2.0372241511940956, "loss/hidden": 3.564453125, "loss/jsd": 0.0, "loss/logits": 0.204646560549736, "step": 530 }, { "epoch": 0.0135, "grad_norm": 29.125, "grad_norm_var": 20.071875, "learning_rate": 0.0001, "loss": 7.5725, "loss/crossentropy": 2.155761349201202, "loss/hidden": 3.4125, "loss/jsd": 0.0, "loss/logits": 0.19602423422038556, "step": 540 }, { "epoch": 0.01375, "grad_norm": 29.125, "grad_norm_var": 20.506705729166665, "learning_rate": 0.0001, "loss": 7.5842, "loss/crossentropy": 1.8869566857814788, "loss/hidden": 3.437890625, "loss/jsd": 0.0, "loss/logits": 0.20957522764801978, "step": 550 }, { "epoch": 0.014, "grad_norm": 30.625, "grad_norm_var": 10.025455729166667, "learning_rate": 0.0001, "loss": 7.4975, "loss/crossentropy": 2.0370677679777147, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.19026046600192786, "step": 560 }, { "epoch": 0.01425, "grad_norm": 33.0, "grad_norm_var": 2.2270833333333333, "learning_rate": 0.0001, "loss": 7.5688, "loss/crossentropy": 2.1931444257497787, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.2036376902833581, "step": 570 }, { "epoch": 0.0145, "grad_norm": 35.0, "grad_norm_var": 3.5681640625, "learning_rate": 0.0001, "loss": 7.478, "loss/crossentropy": 2.061052493005991, "loss/hidden": 3.478125, "loss/jsd": 0.0, "loss/logits": 0.2282864760607481, "step": 580 }, { "epoch": 0.01475, "grad_norm": 32.5, "grad_norm_var": 2.8705729166666667, "learning_rate": 0.0001, "loss": 7.5957, "loss/crossentropy": 2.0078392371535303, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.19647251404821872, "step": 590 }, { "epoch": 0.015, "grad_norm": 30.25, "grad_norm_var": 31.449934895833334, "learning_rate": 0.0001, "loss": 7.5096, "loss/crossentropy": 2.0417068414390087, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19782953998073935, "step": 600 }, { "epoch": 0.01525, "grad_norm": 30.5, "grad_norm_var": 26.253059895833335, "learning_rate": 0.0001, "loss": 7.5368, "loss/crossentropy": 2.1738049775362014, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.1996332859620452, "step": 610 }, { "epoch": 0.0155, "grad_norm": 30.125, "grad_norm_var": 2.334375, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 1.7587297886610032, "loss/hidden": 3.475390625, "loss/jsd": 0.0, "loss/logits": 0.18938990794122218, "step": 620 }, { "epoch": 0.01575, "grad_norm": 29.25, "grad_norm_var": 27.393684895833335, "learning_rate": 0.0001, "loss": 7.4833, "loss/crossentropy": 1.9551145888864994, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.20075901364907622, "step": 630 }, { "epoch": 0.016, "grad_norm": 29.75, "grad_norm_var": 29.6947265625, "learning_rate": 0.0001, "loss": 7.4608, "loss/crossentropy": 2.128718316555023, "loss/hidden": 3.3625, "loss/jsd": 0.0, "loss/logits": 0.19077460393309592, "step": 640 }, { "epoch": 0.01625, "grad_norm": 29.75, "grad_norm_var": 27.322330729166666, "learning_rate": 0.0001, "loss": 7.6033, "loss/crossentropy": 1.9678708665072917, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.18875791020691396, "step": 650 }, { "epoch": 0.0165, "grad_norm": 30.375, "grad_norm_var": 3.129622395833333, "learning_rate": 0.0001, "loss": 7.3873, "loss/crossentropy": 1.9582339562475681, "loss/hidden": 3.34765625, "loss/jsd": 0.0, "loss/logits": 0.18309127148240806, "step": 660 }, { "epoch": 0.01675, "grad_norm": 32.75, "grad_norm_var": 2.7009765625, "learning_rate": 0.0001, "loss": 7.4913, "loss/crossentropy": 2.0773802563548087, "loss/hidden": 3.505078125, "loss/jsd": 0.0, "loss/logits": 0.20910798981785775, "step": 670 }, { "epoch": 0.017, "grad_norm": 34.0, "grad_norm_var": 3.3854166666666665, "learning_rate": 0.0001, "loss": 7.4847, "loss/crossentropy": 2.12913373708725, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.201920267008245, "step": 680 }, { "epoch": 0.01725, "grad_norm": 30.75, "grad_norm_var": 1.7176432291666666, "learning_rate": 0.0001, "loss": 7.5065, "loss/crossentropy": 1.9141538538038732, "loss/hidden": 3.44921875, "loss/jsd": 0.0, "loss/logits": 0.1841401271522045, "step": 690 }, { "epoch": 0.0175, "grad_norm": 31.0, "grad_norm_var": 1.6374348958333333, "learning_rate": 0.0001, "loss": 7.5897, "loss/crossentropy": 2.207232800126076, "loss/hidden": 3.399609375, "loss/jsd": 0.0, "loss/logits": 0.21376523859798907, "step": 700 }, { "epoch": 0.01775, "grad_norm": 32.75, "grad_norm_var": 2.3655598958333335, "learning_rate": 0.0001, "loss": 7.5075, "loss/crossentropy": 2.03845998942852, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.1920805646572262, "step": 710 }, { "epoch": 0.018, "grad_norm": 32.5, "grad_norm_var": 1.3893229166666667, "learning_rate": 0.0001, "loss": 7.4669, "loss/crossentropy": 2.054341807588935, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.19716067584231495, "step": 720 }, { "epoch": 0.01825, "grad_norm": 31.625, "grad_norm_var": 3.54140625, "learning_rate": 0.0001, "loss": 7.517, "loss/crossentropy": 2.2111608639359472, "loss/hidden": 3.409765625, "loss/jsd": 0.0, "loss/logits": 0.20262118335813284, "step": 730 }, { "epoch": 0.0185, "grad_norm": 29.125, "grad_norm_var": 4.692122395833334, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.0551758617162705, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.20378697756677866, "step": 740 }, { "epoch": 0.01875, "grad_norm": 33.0, "grad_norm_var": 4.295572916666667, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 2.128055375814438, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19904747987166047, "step": 750 }, { "epoch": 0.019, "grad_norm": 6106906624.0, "grad_norm_var": 2.3308942582349476e+18, "learning_rate": 0.0001, "loss": 7.4633, "loss/crossentropy": 2.248567137122154, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.19723597317934036, "step": 760 }, { "epoch": 0.01925, "grad_norm": 28.5, "grad_norm_var": 2.330894258158611e+18, "learning_rate": 0.0001, "loss": 7.4542, "loss/crossentropy": 2.132212319970131, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18174959290772677, "step": 770 }, { "epoch": 0.0195, "grad_norm": 36.5, "grad_norm_var": 4.833333333333333, "learning_rate": 0.0001, "loss": 7.465, "loss/crossentropy": 2.046277052164078, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.21161840241402388, "step": 780 }, { "epoch": 0.01975, "grad_norm": 32.75, "grad_norm_var": 5.137434895833334, "learning_rate": 0.0001, "loss": 7.4171, "loss/crossentropy": 2.058088332414627, "loss/hidden": 3.315234375, "loss/jsd": 0.0, "loss/logits": 0.1815673651173711, "step": 790 }, { "epoch": 0.02, "grad_norm": 30.125, "grad_norm_var": 12.37265625, "learning_rate": 0.0001, "loss": 7.4153, "loss/crossentropy": 2.064726157486439, "loss/hidden": 3.515625, "loss/jsd": 0.0, "loss/logits": 0.19402222614735365, "step": 800 }, { "epoch": 0.02025, "grad_norm": 32.0, "grad_norm_var": 12.240625, "learning_rate": 0.0001, "loss": 7.3739, "loss/crossentropy": 2.0926051691174505, "loss/hidden": 3.476953125, "loss/jsd": 0.0, "loss/logits": 0.21017331834882497, "step": 810 }, { "epoch": 0.0205, "grad_norm": 31.875, "grad_norm_var": 3.6853515625, "learning_rate": 0.0001, "loss": 7.409, "loss/crossentropy": 2.016859006881714, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.20363395065069198, "step": 820 }, { "epoch": 0.02075, "grad_norm": 34.0, "grad_norm_var": 278.1108723958333, "learning_rate": 0.0001, "loss": 7.6725, "loss/crossentropy": 2.03957434669137, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19866096526384353, "step": 830 }, { "epoch": 0.021, "grad_norm": 35.75, "grad_norm_var": 281.2239583333333, "learning_rate": 0.0001, "loss": 7.4058, "loss/crossentropy": 2.1190530106425287, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19663097113370895, "step": 840 }, { "epoch": 0.02125, "grad_norm": 32.25, "grad_norm_var": 4.044791666666667, "learning_rate": 0.0001, "loss": 7.4687, "loss/crossentropy": 2.1552326917648315, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19604418501257898, "step": 850 }, { "epoch": 0.0215, "grad_norm": 37.25, "grad_norm_var": 2.7587362193217157e+18, "learning_rate": 0.0001, "loss": 7.5552, "loss/crossentropy": 2.1164004117250443, "loss/hidden": 3.375, "loss/jsd": 0.0, "loss/logits": 0.19724889248609542, "step": 860 }, { "epoch": 0.02175, "grad_norm": 35.25, "grad_norm_var": 2.758736219342478e+18, "learning_rate": 0.0001, "loss": 7.5021, "loss/crossentropy": 2.036998500674963, "loss/hidden": 3.298828125, "loss/jsd": 0.0, "loss/logits": 0.18320635841228067, "step": 870 }, { "epoch": 0.022, "grad_norm": 37.0, "grad_norm_var": 16.9541015625, "learning_rate": 0.0001, "loss": 7.5059, "loss/crossentropy": 1.9707016140222549, "loss/hidden": 3.36328125, "loss/jsd": 0.0, "loss/logits": 0.20436920877546072, "step": 880 }, { "epoch": 0.02225, "grad_norm": 31.375, "grad_norm_var": 30.538541666666667, "learning_rate": 0.0001, "loss": 7.4935, "loss/crossentropy": 2.206394499540329, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.20495780408382416, "step": 890 }, { "epoch": 0.0225, "grad_norm": 29.875, "grad_norm_var": 28.020833333333332, "learning_rate": 0.0001, "loss": 7.4823, "loss/crossentropy": 2.091763325035572, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20592593550682067, "step": 900 }, { "epoch": 0.02275, "grad_norm": 31.875, "grad_norm_var": 3.5645182291666666, "learning_rate": 0.0001, "loss": 7.422, "loss/crossentropy": 1.9740761511027813, "loss/hidden": 3.494921875, "loss/jsd": 0.0, "loss/logits": 0.2015986293554306, "step": 910 }, { "epoch": 0.023, "grad_norm": 32.0, "grad_norm_var": 56.256184895833336, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.030415116250515, "loss/hidden": 3.205078125, "loss/jsd": 0.0, "loss/logits": 0.1614784031175077, "step": 920 }, { "epoch": 0.02325, "grad_norm": 30.0, "grad_norm_var": 57.1619140625, "learning_rate": 0.0001, "loss": 7.3713, "loss/crossentropy": 2.0250086903572084, "loss/hidden": 3.455859375, "loss/jsd": 0.0, "loss/logits": 0.19023355115205048, "step": 930 }, { "epoch": 0.0235, "grad_norm": 30.625, "grad_norm_var": 1.3830729166666667, "learning_rate": 0.0001, "loss": 7.5277, "loss/crossentropy": 2.222324788570404, "loss/hidden": 3.366796875, "loss/jsd": 0.0, "loss/logits": 0.19078677501529456, "step": 940 }, { "epoch": 0.02375, "grad_norm": 31.0, "grad_norm_var": 3.1455729166666666, "learning_rate": 0.0001, "loss": 7.5086, "loss/crossentropy": 2.1299516543745995, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.21310927756130696, "step": 950 }, { "epoch": 0.024, "grad_norm": 29.875, "grad_norm_var": 8.883072916666666, "learning_rate": 0.0001, "loss": 7.5579, "loss/crossentropy": 2.0535727672278883, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.18507701791822911, "step": 960 }, { "epoch": 0.02425, "grad_norm": 32.75, "grad_norm_var": 2.5916015625, "learning_rate": 0.0001, "loss": 7.537, "loss/crossentropy": 2.1785535484552385, "loss/hidden": 3.309765625, "loss/jsd": 0.0, "loss/logits": 0.1955953363329172, "step": 970 }, { "epoch": 0.0245, "grad_norm": 36.5, "grad_norm_var": 6.852083333333334, "learning_rate": 0.0001, "loss": 7.5091, "loss/crossentropy": 2.0967498391866686, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.2146583067253232, "step": 980 }, { "epoch": 0.02475, "grad_norm": 29.625, "grad_norm_var": 4.325455729166666, "learning_rate": 0.0001, "loss": 7.5901, "loss/crossentropy": 2.1134474128484726, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19056662563234567, "step": 990 }, { "epoch": 0.025, "grad_norm": 42.0, "grad_norm_var": 4.1552039405313587e+18, "learning_rate": 0.0001, "loss": 7.6082, "loss/crossentropy": 2.0916516482830048, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19376826155930757, "step": 1000 }, { "epoch": 0.02525, "grad_norm": 29.625, "grad_norm_var": 4.1552039416015355e+18, "learning_rate": 0.0001, "loss": 7.4528, "loss/crossentropy": 2.003750593960285, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.18129821103066207, "step": 1010 }, { "epoch": 0.0255, "grad_norm": 35.25, "grad_norm_var": 24.095572916666665, "learning_rate": 0.0001, "loss": 7.5395, "loss/crossentropy": 2.0453194856643675, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.199107607267797, "step": 1020 }, { "epoch": 0.02575, "grad_norm": 32.25, "grad_norm_var": 19.5259765625, "learning_rate": 0.0001, "loss": 7.31, "loss/crossentropy": 2.1016619503498077, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.184703135676682, "step": 1030 }, { "epoch": 0.026, "grad_norm": 30.75, "grad_norm_var": 1.87890625, "learning_rate": 0.0001, "loss": 7.5425, "loss/crossentropy": 2.1467826470732687, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.20074132941663264, "step": 1040 }, { "epoch": 0.02625, "grad_norm": 30.625, "grad_norm_var": 0.7452473958333333, "learning_rate": 0.0001, "loss": 7.4114, "loss/crossentropy": 2.049474111199379, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.20267941821366547, "step": 1050 }, { "epoch": 0.0265, "grad_norm": 31.75, "grad_norm_var": 3.124739583333333, "learning_rate": 0.0001, "loss": 7.4845, "loss/crossentropy": 2.036583887040615, "loss/hidden": 3.391796875, "loss/jsd": 0.0, "loss/logits": 0.1893632340244949, "step": 1060 }, { "epoch": 0.02675, "grad_norm": 40.75, "grad_norm_var": 3.405847188209664e+18, "learning_rate": 0.0001, "loss": 7.3982, "loss/crossentropy": 2.124411530792713, "loss/hidden": 3.4484375, "loss/jsd": 0.0, "loss/logits": 0.19454579129815103, "step": 1070 }, { "epoch": 0.027, "grad_norm": 28.25, "grad_norm_var": 3.4058471885941417e+18, "learning_rate": 0.0001, "loss": 7.3928, "loss/crossentropy": 2.0034691862761975, "loss/hidden": 3.503515625, "loss/jsd": 0.0, "loss/logits": 0.21349683087319135, "step": 1080 }, { "epoch": 0.02725, "grad_norm": 29.875, "grad_norm_var": 4.88515625, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 1.9183670297265052, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.19249978363513948, "step": 1090 }, { "epoch": 0.0275, "grad_norm": 30.5, "grad_norm_var": 3.2728515625, "learning_rate": 0.0001, "loss": 7.37, "loss/crossentropy": 2.145428071916103, "loss/hidden": 3.35703125, "loss/jsd": 0.0, "loss/logits": 0.19729665387421846, "step": 1100 }, { "epoch": 0.02775, "grad_norm": 31.25, "grad_norm_var": 2.34765625, "learning_rate": 0.0001, "loss": 7.4772, "loss/crossentropy": 2.10652961358428, "loss/hidden": 3.398046875, "loss/jsd": 0.0, "loss/logits": 0.19585925145074726, "step": 1110 }, { "epoch": 0.028, "grad_norm": 31.25, "grad_norm_var": 2.434477049308093e+18, "learning_rate": 0.0001, "loss": 7.4016, "loss/crossentropy": 1.9645449101924897, "loss/hidden": 3.44453125, "loss/jsd": 0.0, "loss/logits": 0.19977953620254993, "step": 1120 }, { "epoch": 0.02825, "grad_norm": 32.0, "grad_norm_var": 2.4344770492950907e+18, "learning_rate": 0.0001, "loss": 7.4453, "loss/crossentropy": 2.131172102689743, "loss/hidden": 3.383984375, "loss/jsd": 0.0, "loss/logits": 0.2083016105927527, "step": 1130 }, { "epoch": 0.0285, "grad_norm": 32.75, "grad_norm_var": 3.7080729166666666, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.003016713261604, "loss/hidden": 3.34453125, "loss/jsd": 0.0, "loss/logits": 0.18665643623098732, "step": 1140 }, { "epoch": 0.02875, "grad_norm": 30.875, "grad_norm_var": 1.34765625, "learning_rate": 0.0001, "loss": 7.5648, "loss/crossentropy": 2.0709651306271555, "loss/hidden": 3.45703125, "loss/jsd": 0.0, "loss/logits": 0.18793081305921078, "step": 1150 }, { "epoch": 0.029, "grad_norm": 32.25, "grad_norm_var": 2.1582682291666666, "learning_rate": 0.0001, "loss": 7.4644, "loss/crossentropy": 2.06434089243412, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.2109043262898922, "step": 1160 }, { "epoch": 0.02925, "grad_norm": 31.375, "grad_norm_var": 2.4010416666666665, "learning_rate": 0.0001, "loss": 7.4403, "loss/crossentropy": 2.0107607185840606, "loss/hidden": 3.498046875, "loss/jsd": 0.0, "loss/logits": 0.20349722560495137, "step": 1170 }, { "epoch": 0.0295, "grad_norm": 33.25, "grad_norm_var": 1.2260416666666667, "learning_rate": 0.0001, "loss": 7.4412, "loss/crossentropy": 2.096436749398708, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.20087064132094384, "step": 1180 }, { "epoch": 0.02975, "grad_norm": 29.75, "grad_norm_var": 1.8046223958333334, "learning_rate": 0.0001, "loss": 7.4458, "loss/crossentropy": 1.972258360683918, "loss/hidden": 3.583984375, "loss/jsd": 0.0, "loss/logits": 0.20998958311975002, "step": 1190 }, { "epoch": 0.03, "grad_norm": 33.75, "grad_norm_var": 3.7395833333333335, "learning_rate": 0.0001, "loss": 7.3931, "loss/crossentropy": 1.8556599006056786, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.19810242671519518, "step": 1200 }, { "epoch": 0.03025, "grad_norm": 29.0, "grad_norm_var": 9.394791666666666, "learning_rate": 0.0001, "loss": 7.5849, "loss/crossentropy": 2.0611833460628985, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.19216072149574756, "step": 1210 }, { "epoch": 0.0305, "grad_norm": 31.75, "grad_norm_var": 3.26640625, "learning_rate": 0.0001, "loss": 7.4844, "loss/crossentropy": 2.0546294137835504, "loss/hidden": 3.58828125, "loss/jsd": 0.0, "loss/logits": 0.21588555499911308, "step": 1220 }, { "epoch": 0.03075, "grad_norm": 31.625, "grad_norm_var": 2.3968098958333335, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 2.0615282475948336, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.206529095210135, "step": 1230 }, { "epoch": 0.031, "grad_norm": 32.0, "grad_norm_var": 1.6124348958333334, "learning_rate": 0.0001, "loss": 7.4647, "loss/crossentropy": 1.9786661133170127, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.17899234425276517, "step": 1240 }, { "epoch": 0.03125, "grad_norm": 5838471168.0, "grad_norm_var": 2.1304840753447437e+18, "learning_rate": 0.0001, "loss": 7.4926, "loss/crossentropy": 2.04936410933733, "loss/hidden": 3.714453125, "loss/jsd": 0.0, "loss/logits": 0.1995564555749297, "step": 1250 }, { "epoch": 0.0315, "grad_norm": 31.25, "grad_norm_var": 2.1304840747304878e+18, "learning_rate": 0.0001, "loss": 7.5078, "loss/crossentropy": 2.1189576953649523, "loss/hidden": 3.43515625, "loss/jsd": 0.0, "loss/logits": 0.19967459067702292, "step": 1260 }, { "epoch": 0.03175, "grad_norm": 30.5, "grad_norm_var": 3.178580729166667, "learning_rate": 0.0001, "loss": 7.4255, "loss/crossentropy": 2.163596141338348, "loss/hidden": 3.4546875, "loss/jsd": 0.0, "loss/logits": 0.19321363251656293, "step": 1270 }, { "epoch": 0.032, "grad_norm": 33.25, "grad_norm_var": 2.1639973958333334, "learning_rate": 0.0001, "loss": 7.4609, "loss/crossentropy": 1.9938266813755035, "loss/hidden": 3.351953125, "loss/jsd": 0.0, "loss/logits": 0.18334759529680014, "step": 1280 }, { "epoch": 0.03225, "grad_norm": 29.375, "grad_norm_var": 1.67890625, "learning_rate": 0.0001, "loss": 7.4652, "loss/crossentropy": 2.161333967000246, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19740422032773494, "step": 1290 }, { "epoch": 0.0325, "grad_norm": 32.75, "grad_norm_var": 3.0385416666666667, "learning_rate": 0.0001, "loss": 7.3146, "loss/crossentropy": 2.0165325723588468, "loss/hidden": 3.49921875, "loss/jsd": 0.0, "loss/logits": 0.19117104820907116, "step": 1300 }, { "epoch": 0.03275, "grad_norm": 28.25, "grad_norm_var": 9.158072916666667, "learning_rate": 0.0001, "loss": 7.4955, "loss/crossentropy": 2.124955786764622, "loss/hidden": 3.491015625, "loss/jsd": 0.0, "loss/logits": 0.19802952595055104, "step": 1310 }, { "epoch": 0.033, "grad_norm": 30.75, "grad_norm_var": 2.4535807291666667, "learning_rate": 0.0001, "loss": 7.4311, "loss/crossentropy": 2.018800371140242, "loss/hidden": 3.542578125, "loss/jsd": 0.0, "loss/logits": 0.2196814114227891, "step": 1320 }, { "epoch": 0.03325, "grad_norm": 31.375, "grad_norm_var": 2.39375, "learning_rate": 0.0001, "loss": 7.5164, "loss/crossentropy": 2.0520452961325644, "loss/hidden": 3.454296875, "loss/jsd": 0.0, "loss/logits": 0.2013697015121579, "step": 1330 }, { "epoch": 0.0335, "grad_norm": 32.5, "grad_norm_var": 1.0431640625, "learning_rate": 0.0001, "loss": 7.5302, "loss/crossentropy": 2.12932348549366, "loss/hidden": 3.525, "loss/jsd": 0.0, "loss/logits": 0.20245677568018436, "step": 1340 }, { "epoch": 0.03375, "grad_norm": 30.625, "grad_norm_var": 3.3900390625, "learning_rate": 0.0001, "loss": 7.5292, "loss/crossentropy": 2.031618994474411, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.19062725063413383, "step": 1350 }, { "epoch": 0.034, "grad_norm": 32.0, "grad_norm_var": 3.3447265625, "learning_rate": 0.0001, "loss": 7.5755, "loss/crossentropy": 2.2257011234760284, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.1979327043518424, "step": 1360 }, { "epoch": 0.03425, "grad_norm": 30.625, "grad_norm_var": 3.3421223958333335, "learning_rate": 0.0001, "loss": 7.4219, "loss/crossentropy": 2.155778780579567, "loss/hidden": 3.31796875, "loss/jsd": 0.0, "loss/logits": 0.19018295016139747, "step": 1370 }, { "epoch": 0.0345, "grad_norm": 30.25, "grad_norm_var": 2.5872395833333335, "learning_rate": 0.0001, "loss": 7.4637, "loss/crossentropy": 2.058405503630638, "loss/hidden": 3.39296875, "loss/jsd": 0.0, "loss/logits": 0.2114524593576789, "step": 1380 }, { "epoch": 0.03475, "grad_norm": 32.5, "grad_norm_var": 3.2994140625, "learning_rate": 0.0001, "loss": 7.5834, "loss/crossentropy": 2.1654782712459566, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.2024593001231551, "step": 1390 }, { "epoch": 0.035, "grad_norm": 31.125, "grad_norm_var": 12.812239583333334, "learning_rate": 0.0001, "loss": 7.4442, "loss/crossentropy": 2.0921876966953277, "loss/hidden": 3.286328125, "loss/jsd": 0.0, "loss/logits": 0.19270132519304753, "step": 1400 }, { "epoch": 0.03525, "grad_norm": 29.25, "grad_norm_var": 1.5108723958333334, "learning_rate": 0.0001, "loss": 7.4779, "loss/crossentropy": 1.9434148371219635, "loss/hidden": 3.366015625, "loss/jsd": 0.0, "loss/logits": 0.17576389852911234, "step": 1410 }, { "epoch": 0.0355, "grad_norm": 30.125, "grad_norm_var": 2.154166666666667, "learning_rate": 0.0001, "loss": 7.508, "loss/crossentropy": 2.0766889482736586, "loss/hidden": 3.485546875, "loss/jsd": 0.0, "loss/logits": 0.20394362770020963, "step": 1420 }, { "epoch": 0.03575, "grad_norm": 30.125, "grad_norm_var": 17.580208333333335, "learning_rate": 0.0001, "loss": 7.4612, "loss/crossentropy": 2.00380075648427, "loss/hidden": 3.4453125, "loss/jsd": 0.0, "loss/logits": 0.18816210143268108, "step": 1430 }, { "epoch": 0.036, "grad_norm": 31.375, "grad_norm_var": 16.758268229166667, "learning_rate": 0.0001, "loss": 7.4602, "loss/crossentropy": 2.1938020154833793, "loss/hidden": 3.4234375, "loss/jsd": 0.0, "loss/logits": 0.2016971528530121, "step": 1440 }, { "epoch": 0.03625, "grad_norm": 30.875, "grad_norm_var": 1.2556640625, "learning_rate": 0.0001, "loss": 7.4245, "loss/crossentropy": 2.0232372283935547, "loss/hidden": 3.40234375, "loss/jsd": 0.0, "loss/logits": 0.19209201391786337, "step": 1450 }, { "epoch": 0.0365, "grad_norm": 31.0, "grad_norm_var": 1.4041015625, "learning_rate": 0.0001, "loss": 7.5518, "loss/crossentropy": 2.2000616788864136, "loss/hidden": 3.473046875, "loss/jsd": 0.0, "loss/logits": 0.22938326951116322, "step": 1460 }, { "epoch": 0.03675, "grad_norm": 28.375, "grad_norm_var": 2.0322916666666666, "learning_rate": 0.0001, "loss": 7.4397, "loss/crossentropy": 2.0838582158088683, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.20685861641541123, "step": 1470 }, { "epoch": 0.037, "grad_norm": 32.0, "grad_norm_var": 1.5020833333333334, "learning_rate": 0.0001, "loss": 7.4183, "loss/crossentropy": 2.149951633810997, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.1984950641170144, "step": 1480 }, { "epoch": 0.03725, "grad_norm": 33.75, "grad_norm_var": 34.10826822916667, "learning_rate": 0.0001, "loss": 7.453, "loss/crossentropy": 2.128306310623884, "loss/hidden": 3.33203125, "loss/jsd": 0.0, "loss/logits": 0.19783397912979125, "step": 1490 }, { "epoch": 0.0375, "grad_norm": 29.5, "grad_norm_var": 5.008072916666666, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.042660539597273, "loss/hidden": 3.365234375, "loss/jsd": 0.0, "loss/logits": 0.19274956732988358, "step": 1500 }, { "epoch": 0.03775, "grad_norm": 33.0, "grad_norm_var": 19.1775390625, "learning_rate": 0.0001, "loss": 7.4119, "loss/crossentropy": 2.043857058137655, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.18266947232186795, "step": 1510 }, { "epoch": 0.038, "grad_norm": 29.625, "grad_norm_var": 14.303580729166667, "learning_rate": 0.0001, "loss": 7.4362, "loss/crossentropy": 1.9492302805185318, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.1754497304558754, "step": 1520 }, { "epoch": 0.03825, "grad_norm": 29.75, "grad_norm_var": 23.764518229166665, "learning_rate": 0.0001, "loss": 7.4444, "loss/crossentropy": 2.0668226674199106, "loss/hidden": 3.473828125, "loss/jsd": 0.0, "loss/logits": 0.1921279976144433, "step": 1530 }, { "epoch": 0.0385, "grad_norm": 32.75, "grad_norm_var": 3.2226069790467994e+18, "learning_rate": 0.0001, "loss": 7.5077, "loss/crossentropy": 2.1122784771025183, "loss/hidden": 3.46953125, "loss/jsd": 0.0, "loss/logits": 0.22245875597000123, "step": 1540 }, { "epoch": 0.03875, "grad_norm": 30.25, "grad_norm_var": 5.382291666666666, "learning_rate": 0.0001, "loss": 7.4525, "loss/crossentropy": 2.264697426557541, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.2075907403603196, "step": 1550 }, { "epoch": 0.039, "grad_norm": 30.0, "grad_norm_var": 6.353580729166667, "learning_rate": 0.0001, "loss": 7.5064, "loss/crossentropy": 2.1150408178567885, "loss/hidden": 3.5203125, "loss/jsd": 0.0, "loss/logits": 0.23207673486322164, "step": 1560 }, { "epoch": 0.03925, "grad_norm": 34.25, "grad_norm_var": 6.72265625, "learning_rate": 0.0001, "loss": 7.4578, "loss/crossentropy": 2.188142140209675, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20429779235273599, "step": 1570 }, { "epoch": 0.0395, "grad_norm": 34.75, "grad_norm_var": 897.6666015625, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.0795677445828913, "loss/hidden": 3.3828125, "loss/jsd": 0.0, "loss/logits": 0.18706642352044583, "step": 1580 }, { "epoch": 0.03975, "grad_norm": 28.0, "grad_norm_var": 903.6327473958333, "learning_rate": 0.0001, "loss": 7.5655, "loss/crossentropy": 2.1025844663381577, "loss/hidden": 3.469140625, "loss/jsd": 0.0, "loss/logits": 0.1966788914054632, "step": 1590 }, { "epoch": 0.04, "grad_norm": 28.625, "grad_norm_var": 11.97890625, "learning_rate": 0.0001, "loss": 7.2578, "loss/crossentropy": 2.050418493151665, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.20104087069630622, "step": 1600 }, { "epoch": 0.04025, "grad_norm": 28.0, "grad_norm_var": 2.255989583333333, "learning_rate": 0.0001, "loss": 7.4393, "loss/crossentropy": 2.1767756581306457, "loss/hidden": 3.5140625, "loss/jsd": 0.0, "loss/logits": 0.2213939843699336, "step": 1610 }, { "epoch": 0.0405, "grad_norm": 29.75, "grad_norm_var": 3.80390625, "learning_rate": 0.0001, "loss": 7.5026, "loss/crossentropy": 2.126803469657898, "loss/hidden": 3.39375, "loss/jsd": 0.0, "loss/logits": 0.19106289148330688, "step": 1620 }, { "epoch": 0.04075, "grad_norm": 32.0, "grad_norm_var": 3.1249348958333334, "learning_rate": 0.0001, "loss": 7.4274, "loss/crossentropy": 2.144256164133549, "loss/hidden": 3.424609375, "loss/jsd": 0.0, "loss/logits": 0.21435861438512802, "step": 1630 }, { "epoch": 0.041, "grad_norm": 30.25, "grad_norm_var": 29.265559895833334, "learning_rate": 0.0001, "loss": 7.5728, "loss/crossentropy": 2.2575725719332693, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.20658138059079648, "step": 1640 }, { "epoch": 0.04125, "grad_norm": 30.5, "grad_norm_var": 48.35390625, "learning_rate": 0.0001, "loss": 7.5776, "loss/crossentropy": 2.096929042041302, "loss/hidden": 3.346875, "loss/jsd": 0.0, "loss/logits": 0.18803389491513373, "step": 1650 }, { "epoch": 0.0415, "grad_norm": 30.5, "grad_norm_var": 1.1010416666666667, "learning_rate": 0.0001, "loss": 7.3792, "loss/crossentropy": 2.0290944524109364, "loss/hidden": 3.313671875, "loss/jsd": 0.0, "loss/logits": 0.19023821037262678, "step": 1660 }, { "epoch": 0.04175, "grad_norm": 28.125, "grad_norm_var": 33.49270833333333, "learning_rate": 0.0001, "loss": 7.5018, "loss/crossentropy": 2.0678361281752586, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.18862500675022603, "step": 1670 }, { "epoch": 0.042, "grad_norm": 29.75, "grad_norm_var": 2.2955729166666665, "learning_rate": 0.0001, "loss": 7.4432, "loss/crossentropy": 2.0549797296524046, "loss/hidden": 3.441796875, "loss/jsd": 0.0, "loss/logits": 0.19089050237089394, "step": 1680 }, { "epoch": 0.04225, "grad_norm": 29.75, "grad_norm_var": 1.8791666666666667, "learning_rate": 0.0001, "loss": 7.3842, "loss/crossentropy": 2.0077505365014074, "loss/hidden": 3.382421875, "loss/jsd": 0.0, "loss/logits": 0.18722779098898173, "step": 1690 }, { "epoch": 0.0425, "grad_norm": 29.375, "grad_norm_var": 0.9434895833333333, "learning_rate": 0.0001, "loss": 7.4273, "loss/crossentropy": 2.071325332671404, "loss/hidden": 3.486328125, "loss/jsd": 0.0, "loss/logits": 0.20270166713744403, "step": 1700 }, { "epoch": 0.04275, "grad_norm": 38.25, "grad_norm_var": 7.669791666666667, "learning_rate": 0.0001, "loss": 7.4176, "loss/crossentropy": 2.1353142291307448, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.19663168713450432, "step": 1710 }, { "epoch": 0.043, "grad_norm": 28.25, "grad_norm_var": 7.75, "learning_rate": 0.0001, "loss": 7.3818, "loss/crossentropy": 1.9995346069335938, "loss/hidden": 3.41015625, "loss/jsd": 0.0, "loss/logits": 0.18310597026720643, "step": 1720 }, { "epoch": 0.04325, "grad_norm": 29.5, "grad_norm_var": 3.7619140625, "learning_rate": 0.0001, "loss": 7.4912, "loss/crossentropy": 2.1415088951587675, "loss/hidden": 3.55078125, "loss/jsd": 0.0, "loss/logits": 0.22313783299177886, "step": 1730 }, { "epoch": 0.0435, "grad_norm": 31.625, "grad_norm_var": 3.0416666666666665, "learning_rate": 0.0001, "loss": 7.4999, "loss/crossentropy": 2.1686330527067184, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.20409150077030064, "step": 1740 }, { "epoch": 0.04375, "grad_norm": 31.375, "grad_norm_var": 2.724739583333333, "learning_rate": 0.0001, "loss": 7.438, "loss/crossentropy": 1.9411263287067413, "loss/hidden": 3.304296875, "loss/jsd": 0.0, "loss/logits": 0.17631518254056572, "step": 1750 }, { "epoch": 0.044, "grad_norm": 32.0, "grad_norm_var": 1.9145833333333333, "learning_rate": 0.0001, "loss": 7.679, "loss/crossentropy": 2.1614590853452684, "loss/hidden": 3.36015625, "loss/jsd": 0.0, "loss/logits": 0.194198589771986, "step": 1760 }, { "epoch": 0.04425, "grad_norm": 28.5, "grad_norm_var": 2.039322916666667, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 2.282147654891014, "loss/hidden": 3.359765625, "loss/jsd": 0.0, "loss/logits": 0.19978236705064772, "step": 1770 }, { "epoch": 0.0445, "grad_norm": 29.625, "grad_norm_var": 2.34140625, "learning_rate": 0.0001, "loss": 7.5296, "loss/crossentropy": 2.2078514605760575, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.19668537452816964, "step": 1780 }, { "epoch": 0.04475, "grad_norm": 30.25, "grad_norm_var": 2.70390625, "learning_rate": 0.0001, "loss": 7.5779, "loss/crossentropy": 2.1053253799676894, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.20323336366564035, "step": 1790 }, { "epoch": 0.045, "grad_norm": 28.5, "grad_norm_var": 4.8712890625, "learning_rate": 0.0001, "loss": 7.4866, "loss/crossentropy": 2.060333488881588, "loss/hidden": 3.373828125, "loss/jsd": 0.0, "loss/logits": 0.18627767637372017, "step": 1800 }, { "epoch": 0.04525, "grad_norm": 28.0, "grad_norm_var": 14.480989583333333, "learning_rate": 0.0001, "loss": 7.5225, "loss/crossentropy": 1.9755317773669958, "loss/hidden": 3.54375, "loss/jsd": 0.0, "loss/logits": 0.20334282671101392, "step": 1810 }, { "epoch": 0.0455, "grad_norm": 29.875, "grad_norm_var": 12.935872395833334, "learning_rate": 0.0001, "loss": 7.4781, "loss/crossentropy": 2.1289859026670457, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.1973018018528819, "step": 1820 }, { "epoch": 0.04575, "grad_norm": 31.75, "grad_norm_var": 2.123893229166667, "learning_rate": 0.0001, "loss": 7.3915, "loss/crossentropy": 1.9609280914068221, "loss/hidden": 3.386328125, "loss/jsd": 0.0, "loss/logits": 0.1916458262130618, "step": 1830 }, { "epoch": 0.046, "grad_norm": 32.0, "grad_norm_var": 1.6332682291666667, "learning_rate": 0.0001, "loss": 7.5095, "loss/crossentropy": 2.0019985377788543, "loss/hidden": 3.384375, "loss/jsd": 0.0, "loss/logits": 0.19768325993791222, "step": 1840 }, { "epoch": 0.04625, "grad_norm": 29.875, "grad_norm_var": 2.225455729166667, "learning_rate": 0.0001, "loss": 7.623, "loss/crossentropy": 2.0607564479112623, "loss/hidden": 3.507421875, "loss/jsd": 0.0, "loss/logits": 0.20858939345926047, "step": 1850 }, { "epoch": 0.0465, "grad_norm": 29.5, "grad_norm_var": 1.9863932291666666, "learning_rate": 0.0001, "loss": 7.3836, "loss/crossentropy": 2.132562433928251, "loss/hidden": 3.40859375, "loss/jsd": 0.0, "loss/logits": 0.1956317812204361, "step": 1860 }, { "epoch": 0.04675, "grad_norm": 36.0, "grad_norm_var": 3.2171223958333335, "learning_rate": 0.0001, "loss": 7.4803, "loss/crossentropy": 2.0316790327429772, "loss/hidden": 3.396875, "loss/jsd": 0.0, "loss/logits": 0.20630075875669718, "step": 1870 }, { "epoch": 0.047, "grad_norm": 33.25, "grad_norm_var": 16.304622395833334, "learning_rate": 0.0001, "loss": 7.576, "loss/crossentropy": 2.161964085698128, "loss/hidden": 3.513671875, "loss/jsd": 0.0, "loss/logits": 0.21842746511101724, "step": 1880 }, { "epoch": 0.04725, "grad_norm": 29.75, "grad_norm_var": 2.3541666666666665, "learning_rate": 0.0001, "loss": 7.5036, "loss/crossentropy": 1.8695943117141725, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.18793469872325658, "step": 1890 }, { "epoch": 0.0475, "grad_norm": 34.25, "grad_norm_var": 2.1780598958333335, "learning_rate": 0.0001, "loss": 7.5623, "loss/crossentropy": 2.2376974314451217, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.21696731727570295, "step": 1900 }, { "epoch": 0.04775, "grad_norm": 30.75, "grad_norm_var": 14.924934895833333, "learning_rate": 0.0001, "loss": 7.388, "loss/crossentropy": 1.9403380863368511, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.18128401823341847, "step": 1910 }, { "epoch": 0.048, "grad_norm": 29.25, "grad_norm_var": 25.1916015625, "learning_rate": 0.0001, "loss": 7.4109, "loss/crossentropy": 2.1744547933340073, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.20097011709585785, "step": 1920 }, { "epoch": 0.04825, "grad_norm": 29.25, "grad_norm_var": 14.801822916666667, "learning_rate": 0.0001, "loss": 7.2893, "loss/crossentropy": 2.101319019496441, "loss/hidden": 3.44140625, "loss/jsd": 0.0, "loss/logits": 0.1921493023633957, "step": 1930 }, { "epoch": 0.0485, "grad_norm": 30.125, "grad_norm_var": 14.517708333333333, "learning_rate": 0.0001, "loss": 7.579, "loss/crossentropy": 2.057158224284649, "loss/hidden": 3.59140625, "loss/jsd": 0.0, "loss/logits": 0.21765361074358225, "step": 1940 }, { "epoch": 0.04875, "grad_norm": 29.625, "grad_norm_var": 15.790559895833333, "learning_rate": 0.0001, "loss": 7.3712, "loss/crossentropy": 1.9415803879499436, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18346730088815094, "step": 1950 }, { "epoch": 0.049, "grad_norm": 27.625, "grad_norm_var": 9.794791666666667, "learning_rate": 0.0001, "loss": 7.4902, "loss/crossentropy": 2.035348242521286, "loss/hidden": 3.439453125, "loss/jsd": 0.0, "loss/logits": 0.20268035624176264, "step": 1960 }, { "epoch": 0.04925, "grad_norm": 35.25, "grad_norm_var": 12.768684895833333, "learning_rate": 0.0001, "loss": 7.4627, "loss/crossentropy": 2.054542076587677, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.2003987120464444, "step": 1970 }, { "epoch": 0.0495, "grad_norm": 36.0, "grad_norm_var": 12.572916666666666, "learning_rate": 0.0001, "loss": 7.353, "loss/crossentropy": 1.9634785205125809, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.17985089337453247, "step": 1980 }, { "epoch": 0.04975, "grad_norm": 36.25, "grad_norm_var": 9.2353515625, "learning_rate": 0.0001, "loss": 7.4473, "loss/crossentropy": 2.059533824026585, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.19096513148397207, "step": 1990 }, { "epoch": 0.05, "grad_norm": 29.125, "grad_norm_var": 13.320572916666666, "learning_rate": 0.0001, "loss": 7.3914, "loss/crossentropy": 2.011685383319855, "loss/hidden": 3.4421875, "loss/jsd": 0.0, "loss/logits": 0.19188414234668016, "step": 2000 }, { "epoch": 0.05025, "grad_norm": 36.25, "grad_norm_var": 14.026822916666667, "learning_rate": 0.0001, "loss": 7.4213, "loss/crossentropy": 2.309766414761543, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.20372038893401623, "step": 2010 }, { "epoch": 0.0505, "grad_norm": 29.0, "grad_norm_var": 9.237239583333333, "learning_rate": 0.0001, "loss": 7.4145, "loss/crossentropy": 2.1240487143397333, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.20137840434908866, "step": 2020 }, { "epoch": 0.05075, "grad_norm": 38.5, "grad_norm_var": 89.21432291666666, "learning_rate": 0.0001, "loss": 7.3696, "loss/crossentropy": 2.112667274475098, "loss/hidden": 3.487109375, "loss/jsd": 0.0, "loss/logits": 0.19770587887614965, "step": 2030 }, { "epoch": 0.051, "grad_norm": 27.75, "grad_norm_var": 94.06015625, "learning_rate": 0.0001, "loss": 7.2471, "loss/crossentropy": 1.9955052442848682, "loss/hidden": 3.30546875, "loss/jsd": 0.0, "loss/logits": 0.1880181163549423, "step": 2040 }, { "epoch": 0.05125, "grad_norm": 35.25, "grad_norm_var": 3.67265625, "learning_rate": 0.0001, "loss": 7.458, "loss/crossentropy": 2.1320972844958304, "loss/hidden": 3.384765625, "loss/jsd": 0.0, "loss/logits": 0.18908526431769132, "step": 2050 }, { "epoch": 0.0515, "grad_norm": 38.75, "grad_norm_var": 10.776822916666667, "learning_rate": 0.0001, "loss": 7.3769, "loss/crossentropy": 2.171598494052887, "loss/hidden": 3.29765625, "loss/jsd": 0.0, "loss/logits": 0.18929236195981503, "step": 2060 }, { "epoch": 0.05175, "grad_norm": 32.75, "grad_norm_var": 10.53515625, "learning_rate": 0.0001, "loss": 7.5279, "loss/crossentropy": 2.0172302186489106, "loss/hidden": 3.4203125, "loss/jsd": 0.0, "loss/logits": 0.2013201082125306, "step": 2070 }, { "epoch": 0.052, "grad_norm": 32.0, "grad_norm_var": 7.678125, "learning_rate": 0.0001, "loss": 7.3619, "loss/crossentropy": 1.982726515084505, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.17850281894207, "step": 2080 }, { "epoch": 0.05225, "grad_norm": 29.75, "grad_norm_var": 63.6681640625, "learning_rate": 0.0001, "loss": 7.5109, "loss/crossentropy": 2.121504098176956, "loss/hidden": 3.50703125, "loss/jsd": 0.0, "loss/logits": 0.240205854550004, "step": 2090 }, { "epoch": 0.0525, "grad_norm": 34.5, "grad_norm_var": 7.506184895833333, "learning_rate": 0.0001, "loss": 7.4658, "loss/crossentropy": 2.110687591135502, "loss/hidden": 3.530078125, "loss/jsd": 0.0, "loss/logits": 0.2039638390764594, "step": 2100 }, { "epoch": 0.05275, "grad_norm": 32.5, "grad_norm_var": 19.075455729166666, "learning_rate": 0.0001, "loss": 7.5668, "loss/crossentropy": 1.9557841390371322, "loss/hidden": 3.462109375, "loss/jsd": 0.0, "loss/logits": 0.18774209143593906, "step": 2110 }, { "epoch": 0.053, "grad_norm": 31.125, "grad_norm_var": 3.85390625, "learning_rate": 0.0001, "loss": 7.5735, "loss/crossentropy": 2.0219520531594752, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.18533632289618254, "step": 2120 }, { "epoch": 0.05325, "grad_norm": 32.25, "grad_norm_var": 3.8910807291666667, "learning_rate": 0.0001, "loss": 7.4083, "loss/crossentropy": 2.1359280541539194, "loss/hidden": 3.412890625, "loss/jsd": 0.0, "loss/logits": 0.1897095028311014, "step": 2130 }, { "epoch": 0.0535, "grad_norm": 31.25, "grad_norm_var": 2.5957682291666666, "learning_rate": 0.0001, "loss": 7.446, "loss/crossentropy": 2.170258317142725, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.1826348526403308, "step": 2140 }, { "epoch": 0.05375, "grad_norm": 31.25, "grad_norm_var": 3.785416666666667, "learning_rate": 0.0001, "loss": 7.4014, "loss/crossentropy": 2.131239393353462, "loss/hidden": 3.303515625, "loss/jsd": 0.0, "loss/logits": 0.18656531646847724, "step": 2150 }, { "epoch": 0.054, "grad_norm": 31.0, "grad_norm_var": 4.8666015625, "learning_rate": 0.0001, "loss": 7.5478, "loss/crossentropy": 2.223896725475788, "loss/hidden": 3.383203125, "loss/jsd": 0.0, "loss/logits": 0.1951376979239285, "step": 2160 }, { "epoch": 0.05425, "grad_norm": 30.375, "grad_norm_var": 8.437955729166667, "learning_rate": 0.0001, "loss": 7.5562, "loss/crossentropy": 2.1203987300395966, "loss/hidden": 3.351171875, "loss/jsd": 0.0, "loss/logits": 0.1970507999882102, "step": 2170 }, { "epoch": 0.0545, "grad_norm": 32.0, "grad_norm_var": 2.9488932291666665, "learning_rate": 0.0001, "loss": 7.5532, "loss/crossentropy": 2.080265050381422, "loss/hidden": 3.544140625, "loss/jsd": 0.0, "loss/logits": 0.2216239819303155, "step": 2180 }, { "epoch": 0.05475, "grad_norm": 31.125, "grad_norm_var": 8.1728515625, "learning_rate": 0.0001, "loss": 7.382, "loss/crossentropy": 2.2114535331726075, "loss/hidden": 3.37734375, "loss/jsd": 0.0, "loss/logits": 0.20577374435961246, "step": 2190 }, { "epoch": 0.055, "grad_norm": 28.875, "grad_norm_var": 14.520833333333334, "learning_rate": 0.0001, "loss": 7.5766, "loss/crossentropy": 2.1003271512687207, "loss/hidden": 3.358984375, "loss/jsd": 0.0, "loss/logits": 0.18811229150742292, "step": 2200 }, { "epoch": 0.05525, "grad_norm": 33.5, "grad_norm_var": 16.099739583333335, "learning_rate": 0.0001, "loss": 7.5553, "loss/crossentropy": 2.1326127350330353, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.22006579730659723, "step": 2210 }, { "epoch": 0.0555, "grad_norm": 32.25, "grad_norm_var": 9.305143229166667, "learning_rate": 0.0001, "loss": 7.3766, "loss/crossentropy": 2.1496046826243402, "loss/hidden": 3.476171875, "loss/jsd": 0.0, "loss/logits": 0.1952402491122484, "step": 2220 }, { "epoch": 0.05575, "grad_norm": 29.125, "grad_norm_var": 6.805143229166666, "learning_rate": 0.0001, "loss": 7.3648, "loss/crossentropy": 2.13938904479146, "loss/hidden": 3.36640625, "loss/jsd": 0.0, "loss/logits": 0.19394674636423587, "step": 2230 }, { "epoch": 0.056, "grad_norm": 27.625, "grad_norm_var": 15.0712890625, "learning_rate": 0.0001, "loss": 7.4292, "loss/crossentropy": 2.0648645758628845, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.18520106598734856, "step": 2240 }, { "epoch": 0.05625, "grad_norm": 29.25, "grad_norm_var": 12.034309895833333, "learning_rate": 0.0001, "loss": 7.4469, "loss/crossentropy": 2.080448921024799, "loss/hidden": 3.3109375, "loss/jsd": 0.0, "loss/logits": 0.18507405128329993, "step": 2250 }, { "epoch": 0.0565, "grad_norm": 31.375, "grad_norm_var": 2.014518229166667, "learning_rate": 0.0001, "loss": 7.4325, "loss/crossentropy": 2.0871294140815735, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.20059894528239966, "step": 2260 }, { "epoch": 0.05675, "grad_norm": 28.75, "grad_norm_var": 1.8103515625, "learning_rate": 0.0001, "loss": 7.4268, "loss/crossentropy": 2.010594163835049, "loss/hidden": 3.39453125, "loss/jsd": 0.0, "loss/logits": 0.19413960948586464, "step": 2270 }, { "epoch": 0.057, "grad_norm": 32.5, "grad_norm_var": 4.0369140625, "learning_rate": 0.0001, "loss": 7.4346, "loss/crossentropy": 2.1129174560308455, "loss/hidden": 3.416015625, "loss/jsd": 0.0, "loss/logits": 0.1961110396310687, "step": 2280 }, { "epoch": 0.05725, "grad_norm": 39.0, "grad_norm_var": 30.42265625, "learning_rate": 0.0001, "loss": 7.4422, "loss/crossentropy": 2.002947611361742, "loss/hidden": 3.432421875, "loss/jsd": 0.0, "loss/logits": 0.2081361676566303, "step": 2290 }, { "epoch": 0.0575, "grad_norm": 37.25, "grad_norm_var": 25.699934895833334, "learning_rate": 0.0001, "loss": 7.4312, "loss/crossentropy": 2.06134437918663, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.18918452728539706, "step": 2300 }, { "epoch": 0.05775, "grad_norm": 28.875, "grad_norm_var": 9.115559895833334, "learning_rate": 0.0001, "loss": 7.4209, "loss/crossentropy": 2.041922479122877, "loss/hidden": 3.403515625, "loss/jsd": 0.0, "loss/logits": 0.20907302405685185, "step": 2310 }, { "epoch": 0.058, "grad_norm": 30.125, "grad_norm_var": 22.248372395833332, "learning_rate": 0.0001, "loss": 7.6844, "loss/crossentropy": 2.0152460247278214, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.1905667196959257, "step": 2320 }, { "epoch": 0.05825, "grad_norm": 38.25, "grad_norm_var": 31.398893229166667, "learning_rate": 0.0001, "loss": 7.4713, "loss/crossentropy": 2.105386929959059, "loss/hidden": 3.452734375, "loss/jsd": 0.0, "loss/logits": 0.1982942834496498, "step": 2330 }, { "epoch": 0.0585, "grad_norm": 28.375, "grad_norm_var": 54.94264322916667, "learning_rate": 0.0001, "loss": 7.4575, "loss/crossentropy": 2.2358868844807147, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19232469592243434, "step": 2340 }, { "epoch": 0.05875, "grad_norm": 33.5, "grad_norm_var": 165.74583333333334, "learning_rate": 0.0001, "loss": 7.2987, "loss/crossentropy": 1.9657546751201154, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.18062973748892547, "step": 2350 }, { "epoch": 0.059, "grad_norm": 41.0, "grad_norm_var": 15.376822916666667, "learning_rate": 0.0001, "loss": 7.4431, "loss/crossentropy": 2.191007924079895, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.2068317520432174, "step": 2360 }, { "epoch": 0.05925, "grad_norm": 30.625, "grad_norm_var": 12.109375, "learning_rate": 0.0001, "loss": 7.3325, "loss/crossentropy": 2.0140789330005644, "loss/hidden": 3.4109375, "loss/jsd": 0.0, "loss/logits": 0.18166892379522323, "step": 2370 }, { "epoch": 0.0595, "grad_norm": 31.875, "grad_norm_var": 6.941666666666666, "learning_rate": 0.0001, "loss": 7.4039, "loss/crossentropy": 2.0221361994743345, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.1934544663876295, "step": 2380 }, { "epoch": 0.05975, "grad_norm": 30.125, "grad_norm_var": 10.472330729166666, "learning_rate": 0.0001, "loss": 7.5862, "loss/crossentropy": 1.9840030640363693, "loss/hidden": 3.46640625, "loss/jsd": 0.0, "loss/logits": 0.19178631734102963, "step": 2390 }, { "epoch": 0.06, "grad_norm": 29.875, "grad_norm_var": 14.10625, "learning_rate": 0.0001, "loss": 7.4826, "loss/crossentropy": 2.1700179904699324, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.1915024297311902, "step": 2400 }, { "epoch": 0.06025, "grad_norm": 32.75, "grad_norm_var": 7.370768229166667, "learning_rate": 0.0001, "loss": 7.3889, "loss/crossentropy": 2.091843403875828, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.18695627991110086, "step": 2410 }, { "epoch": 0.0605, "grad_norm": 29.0, "grad_norm_var": 9.922330729166667, "learning_rate": 0.0001, "loss": 7.4655, "loss/crossentropy": 2.172381104528904, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.20078962799161673, "step": 2420 }, { "epoch": 0.06075, "grad_norm": 34.25, "grad_norm_var": 8.637239583333333, "learning_rate": 0.0001, "loss": 7.519, "loss/crossentropy": 1.995463601499796, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.1993358489125967, "step": 2430 }, { "epoch": 0.061, "grad_norm": 31.25, "grad_norm_var": 11.9431640625, "learning_rate": 0.0001, "loss": 7.5169, "loss/crossentropy": 2.296917426586151, "loss/hidden": 3.513671875, "loss/jsd": 0.0, "loss/logits": 0.23228074796497822, "step": 2440 }, { "epoch": 0.06125, "grad_norm": 30.25, "grad_norm_var": 3.4368798046573737e+18, "learning_rate": 0.0001, "loss": 7.5038, "loss/crossentropy": 2.1944432735443113, "loss/hidden": 3.3921875, "loss/jsd": 0.0, "loss/logits": 0.21073084995150565, "step": 2450 }, { "epoch": 0.0615, "grad_norm": 33.5, "grad_norm_var": 3.436879805205814e+18, "learning_rate": 0.0001, "loss": 7.4423, "loss/crossentropy": 2.152103579044342, "loss/hidden": 3.512109375, "loss/jsd": 0.0, "loss/logits": 0.20929353777319193, "step": 2460 }, { "epoch": 0.06175, "grad_norm": 39.0, "grad_norm_var": 2.2045823633093297e+18, "learning_rate": 0.0001, "loss": 7.4382, "loss/crossentropy": 2.017627691477537, "loss/hidden": 3.355078125, "loss/jsd": 0.0, "loss/logits": 0.19590776292607187, "step": 2470 }, { "epoch": 0.062, "grad_norm": 29.375, "grad_norm_var": 2.2045823636681523e+18, "learning_rate": 0.0001, "loss": 7.4072, "loss/crossentropy": 2.1076912328600885, "loss/hidden": 3.433203125, "loss/jsd": 0.0, "loss/logits": 0.1988623272627592, "step": 2480 }, { "epoch": 0.06225, "grad_norm": 30.125, "grad_norm_var": 3.2494140625, "learning_rate": 0.0001, "loss": 7.3192, "loss/crossentropy": 1.9777067750692368, "loss/hidden": 3.429296875, "loss/jsd": 0.0, "loss/logits": 0.20539684109389783, "step": 2490 }, { "epoch": 0.0625, "grad_norm": 29.125, "grad_norm_var": 5.580208333333333, "learning_rate": 0.0001, "loss": 7.3283, "loss/crossentropy": 2.061080713570118, "loss/hidden": 3.4953125, "loss/jsd": 0.0, "loss/logits": 0.20077812522649766, "step": 2500 }, { "epoch": 0.06275, "grad_norm": 28.375, "grad_norm_var": 5.618489583333333, "learning_rate": 0.0001, "loss": 7.4401, "loss/crossentropy": 2.2099071338772776, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.2055276283994317, "step": 2510 }, { "epoch": 0.063, "grad_norm": 28.125, "grad_norm_var": 7.118684895833334, "learning_rate": 0.0001, "loss": 7.3509, "loss/crossentropy": 1.962952435016632, "loss/hidden": 3.421484375, "loss/jsd": 0.0, "loss/logits": 0.19731322024017572, "step": 2520 }, { "epoch": 0.06325, "grad_norm": 31.375, "grad_norm_var": 1.9681640625, "learning_rate": 0.0001, "loss": 7.3695, "loss/crossentropy": 1.9843583509325982, "loss/hidden": 3.4921875, "loss/jsd": 0.0, "loss/logits": 0.2062232268974185, "step": 2530 }, { "epoch": 0.0635, "grad_norm": 31.5, "grad_norm_var": 3.7988932291666666, "learning_rate": 0.0001, "loss": 7.4485, "loss/crossentropy": 2.1427679538726805, "loss/hidden": 3.38125, "loss/jsd": 0.0, "loss/logits": 0.2011977185495198, "step": 2540 }, { "epoch": 0.06375, "grad_norm": 30.0, "grad_norm_var": 2.5885416666666665, "learning_rate": 0.0001, "loss": 7.4157, "loss/crossentropy": 1.9085583783686162, "loss/hidden": 3.325, "loss/jsd": 0.0, "loss/logits": 0.17416954301297666, "step": 2550 }, { "epoch": 0.064, "grad_norm": 31.25, "grad_norm_var": 1.21015625, "learning_rate": 0.0001, "loss": 7.5141, "loss/crossentropy": 1.9622327491641045, "loss/hidden": 3.361328125, "loss/jsd": 0.0, "loss/logits": 0.18756412472575903, "step": 2560 }, { "epoch": 0.06425, "grad_norm": 30.0, "grad_norm_var": 1.7143229166666667, "learning_rate": 0.0001, "loss": 7.4624, "loss/crossentropy": 2.192887546122074, "loss/hidden": 3.4296875, "loss/jsd": 0.0, "loss/logits": 0.1984951412305236, "step": 2570 }, { "epoch": 0.0645, "grad_norm": 30.125, "grad_norm_var": 1.9143229166666667, "learning_rate": 0.0001, "loss": 7.3947, "loss/crossentropy": 2.102549520134926, "loss/hidden": 3.463671875, "loss/jsd": 0.0, "loss/logits": 0.1989850653335452, "step": 2580 }, { "epoch": 0.06475, "grad_norm": 32.25, "grad_norm_var": 9.5322265625, "learning_rate": 0.0001, "loss": 7.5147, "loss/crossentropy": 2.213281115144491, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.2027151037938893, "step": 2590 }, { "epoch": 0.065, "grad_norm": 30.625, "grad_norm_var": 2.3427083333333334, "learning_rate": 0.0001, "loss": 7.4691, "loss/crossentropy": 2.1138279482722284, "loss/hidden": 3.379296875, "loss/jsd": 0.0, "loss/logits": 0.20825629755854608, "step": 2600 }, { "epoch": 0.06525, "grad_norm": 36.0, "grad_norm_var": 3.3395182291666665, "learning_rate": 0.0001, "loss": 7.4775, "loss/crossentropy": 2.107349547743797, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19337845854461194, "step": 2610 }, { "epoch": 0.0655, "grad_norm": 29.25, "grad_norm_var": 12.757291666666667, "learning_rate": 0.0001, "loss": 7.5438, "loss/crossentropy": 2.0628502368927, "loss/hidden": 3.4984375, "loss/jsd": 0.0, "loss/logits": 0.20967572089284658, "step": 2620 }, { "epoch": 0.06575, "grad_norm": 28.625, "grad_norm_var": 11.805208333333333, "learning_rate": 0.0001, "loss": 7.3354, "loss/crossentropy": 2.1009589530527593, "loss/hidden": 3.33828125, "loss/jsd": 0.0, "loss/logits": 0.18132725274190306, "step": 2630 }, { "epoch": 0.066, "grad_norm": 32.5, "grad_norm_var": 2.730208333333333, "learning_rate": 0.0001, "loss": 7.4257, "loss/crossentropy": 1.983342681080103, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.19340286049991845, "step": 2640 }, { "epoch": 0.06625, "grad_norm": 30.25, "grad_norm_var": 3.7549465282226944e+18, "learning_rate": 0.0001, "loss": 7.309, "loss/crossentropy": 2.0057250812649725, "loss/hidden": 3.418359375, "loss/jsd": 0.0, "loss/logits": 0.18936716187745334, "step": 2650 }, { "epoch": 0.0665, "grad_norm": 36.25, "grad_norm_var": 8.832747395833334, "learning_rate": 0.0001, "loss": 7.5442, "loss/crossentropy": 2.054753464460373, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.2035602940246463, "step": 2660 }, { "epoch": 0.06675, "grad_norm": 32.5, "grad_norm_var": 4.8900390625, "learning_rate": 0.0001, "loss": 7.4106, "loss/crossentropy": 2.0181221179664135, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.1878144398331642, "step": 2670 }, { "epoch": 0.067, "grad_norm": 30.125, "grad_norm_var": 4.280989583333334, "learning_rate": 0.0001, "loss": 7.4597, "loss/crossentropy": 2.200540581345558, "loss/hidden": 3.4046875, "loss/jsd": 0.0, "loss/logits": 0.20286752395331858, "step": 2680 }, { "epoch": 0.06725, "grad_norm": 31.75, "grad_norm_var": 3.8559895833333333, "learning_rate": 0.0001, "loss": 7.4643, "loss/crossentropy": 2.0630861818790436, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20401672925800085, "step": 2690 }, { "epoch": 0.0675, "grad_norm": 33.0, "grad_norm_var": 7.073958333333334, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 1.927167509496212, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.17901942003518342, "step": 2700 }, { "epoch": 0.06775, "grad_norm": 30.25, "grad_norm_var": 8.9009765625, "learning_rate": 0.0001, "loss": 7.3461, "loss/crossentropy": 2.0538916781544687, "loss/hidden": 3.35234375, "loss/jsd": 0.0, "loss/logits": 0.1864149821922183, "step": 2710 }, { "epoch": 0.068, "grad_norm": 29.5, "grad_norm_var": 2.218489583333333, "learning_rate": 0.0001, "loss": 7.526, "loss/crossentropy": 2.211588367819786, "loss/hidden": 3.487890625, "loss/jsd": 0.0, "loss/logits": 0.20801848396658898, "step": 2720 }, { "epoch": 0.06825, "grad_norm": 31.375, "grad_norm_var": 1.0768229166666667, "learning_rate": 0.0001, "loss": 7.5535, "loss/crossentropy": 2.268890543282032, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.21352684032171965, "step": 2730 }, { "epoch": 0.0685, "grad_norm": 33.25, "grad_norm_var": 5.663997395833333, "learning_rate": 0.0001, "loss": 7.411, "loss/crossentropy": 1.902898482978344, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.1794701736420393, "step": 2740 }, { "epoch": 0.06875, "grad_norm": 32.25, "grad_norm_var": 6.167708333333334, "learning_rate": 0.0001, "loss": 7.3718, "loss/crossentropy": 1.9450767874717712, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.18759301900863648, "step": 2750 }, { "epoch": 0.069, "grad_norm": 31.125, "grad_norm_var": 31.185872395833332, "learning_rate": 0.0001, "loss": 7.4359, "loss/crossentropy": 2.0783849939703942, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18503105416893958, "step": 2760 }, { "epoch": 0.06925, "grad_norm": 36.5, "grad_norm_var": 35.412434895833336, "learning_rate": 0.0001, "loss": 7.5806, "loss/crossentropy": 2.2374701410532, "loss/hidden": 3.378125, "loss/jsd": 0.0, "loss/logits": 0.19615829903632404, "step": 2770 }, { "epoch": 0.0695, "grad_norm": 30.25, "grad_norm_var": 19.787239583333335, "learning_rate": 0.0001, "loss": 7.3197, "loss/crossentropy": 1.8297001466155052, "loss/hidden": 3.3171875, "loss/jsd": 0.0, "loss/logits": 0.16481583826243879, "step": 2780 }, { "epoch": 0.06975, "grad_norm": 428.0, "grad_norm_var": 9873.31640625, "learning_rate": 0.0001, "loss": 7.5313, "loss/crossentropy": 2.249661484360695, "loss/hidden": 3.392578125, "loss/jsd": 0.0, "loss/logits": 0.2018596636131406, "step": 2790 }, { "epoch": 0.07, "grad_norm": 31.0, "grad_norm_var": 9755.6625, "learning_rate": 0.0001, "loss": 7.3957, "loss/crossentropy": 1.9368772380053998, "loss/hidden": 3.48203125, "loss/jsd": 0.0, "loss/logits": 0.18386599626392125, "step": 2800 }, { "epoch": 0.07025, "grad_norm": 30.75, "grad_norm_var": 1.8317057291666667, "learning_rate": 0.0001, "loss": 7.4372, "loss/crossentropy": 1.98307463824749, "loss/hidden": 3.464453125, "loss/jsd": 0.0, "loss/logits": 0.19818334747105837, "step": 2810 }, { "epoch": 0.0705, "grad_norm": 29.375, "grad_norm_var": 2.589583333333333, "learning_rate": 0.0001, "loss": 7.5014, "loss/crossentropy": 2.1463105253875256, "loss/hidden": 3.5046875, "loss/jsd": 0.0, "loss/logits": 0.20105676222592592, "step": 2820 }, { "epoch": 0.07075, "grad_norm": 60.5, "grad_norm_var": 178.2556640625, "learning_rate": 0.0001, "loss": 7.4527, "loss/crossentropy": 2.0776613369584083, "loss/hidden": 3.420703125, "loss/jsd": 0.0, "loss/logits": 0.19452448841184378, "step": 2830 }, { "epoch": 0.071, "grad_norm": 29.25, "grad_norm_var": 172.31451822916668, "learning_rate": 0.0001, "loss": 7.4802, "loss/crossentropy": 2.1200039610266685, "loss/hidden": 3.417578125, "loss/jsd": 0.0, "loss/logits": 0.19831879772245883, "step": 2840 }, { "epoch": 0.07125, "grad_norm": 69.0, "grad_norm_var": 117.23098958333334, "learning_rate": 0.0001, "loss": 7.434, "loss/crossentropy": 2.024143140017986, "loss/hidden": 3.348828125, "loss/jsd": 0.0, "loss/logits": 0.1836528332903981, "step": 2850 }, { "epoch": 0.0715, "grad_norm": 31.375, "grad_norm_var": 92.53723958333333, "learning_rate": 0.0001, "loss": 7.4934, "loss/crossentropy": 2.2765417456626893, "loss/hidden": 3.446484375, "loss/jsd": 0.0, "loss/logits": 0.20736196860671044, "step": 2860 }, { "epoch": 0.07175, "grad_norm": 31.625, "grad_norm_var": 7.986393229166667, "learning_rate": 0.0001, "loss": 7.4826, "loss/crossentropy": 2.269197002053261, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.19869209118187428, "step": 2870 }, { "epoch": 0.072, "grad_norm": 31.25, "grad_norm_var": 3.1806640625, "learning_rate": 0.0001, "loss": 7.4018, "loss/crossentropy": 2.2985214799642564, "loss/hidden": 3.390234375, "loss/jsd": 0.0, "loss/logits": 0.20524807646870613, "step": 2880 }, { "epoch": 0.07225, "grad_norm": 30.875, "grad_norm_var": 4.801822916666667, "learning_rate": 0.0001, "loss": 7.5148, "loss/crossentropy": 2.2387808740139006, "loss/hidden": 3.46015625, "loss/jsd": 0.0, "loss/logits": 0.19951685946434736, "step": 2890 }, { "epoch": 0.0725, "grad_norm": 28.875, "grad_norm_var": 13.836458333333333, "learning_rate": 0.0001, "loss": 7.5232, "loss/crossentropy": 2.049694790691137, "loss/hidden": 3.410546875, "loss/jsd": 0.0, "loss/logits": 0.19052465092390775, "step": 2900 }, { "epoch": 0.07275, "grad_norm": 29.625, "grad_norm_var": 17.91640625, "learning_rate": 0.0001, "loss": 7.3227, "loss/crossentropy": 2.0360258772969244, "loss/hidden": 3.40546875, "loss/jsd": 0.0, "loss/logits": 0.18495636582374572, "step": 2910 }, { "epoch": 0.073, "grad_norm": 32.0, "grad_norm_var": 1.8926377214767268e+18, "learning_rate": 0.0001, "loss": 7.4512, "loss/crossentropy": 2.13848315179348, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.18625867497175932, "step": 2920 }, { "epoch": 0.07325, "grad_norm": 29.875, "grad_norm_var": 1.8926377199175642e+18, "learning_rate": 0.0001, "loss": 7.5038, "loss/crossentropy": 2.166595605015755, "loss/hidden": 3.49375, "loss/jsd": 0.0, "loss/logits": 0.20948194600641729, "step": 2930 }, { "epoch": 0.0735, "grad_norm": 28.5, "grad_norm_var": 73.08020833333333, "learning_rate": 0.0001, "loss": 7.374, "loss/crossentropy": 1.9849643550813199, "loss/hidden": 3.301171875, "loss/jsd": 0.0, "loss/logits": 0.18302082028239966, "step": 2940 }, { "epoch": 0.07375, "grad_norm": 29.125, "grad_norm_var": 24.825, "learning_rate": 0.0001, "loss": 7.3651, "loss/crossentropy": 2.057874396443367, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.1866615541279316, "step": 2950 }, { "epoch": 0.074, "grad_norm": 30.625, "grad_norm_var": 883.6354166666666, "learning_rate": 0.0001, "loss": 7.5415, "loss/crossentropy": 2.1631729155778885, "loss/hidden": 3.388671875, "loss/jsd": 0.0, "loss/logits": 0.20762786027044058, "step": 2960 }, { "epoch": 0.07425, "grad_norm": 32.75, "grad_norm_var": 887.2705729166667, "learning_rate": 0.0001, "loss": 7.4471, "loss/crossentropy": 1.9493468508124352, "loss/hidden": 3.3578125, "loss/jsd": 0.0, "loss/logits": 0.1884406829252839, "step": 2970 }, { "epoch": 0.0745, "grad_norm": 28.875, "grad_norm_var": 5.070768229166666, "learning_rate": 0.0001, "loss": 7.605, "loss/crossentropy": 2.122344336658716, "loss/hidden": 3.460546875, "loss/jsd": 0.0, "loss/logits": 0.21057356838136912, "step": 2980 }, { "epoch": 0.07475, "grad_norm": 37.0, "grad_norm_var": 21.535416666666666, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.008989527821541, "loss/hidden": 3.54140625, "loss/jsd": 0.0, "loss/logits": 0.2172183733433485, "step": 2990 }, { "epoch": 0.075, "grad_norm": 29.375, "grad_norm_var": 18.198958333333334, "learning_rate": 0.0001, "loss": 7.3932, "loss/crossentropy": 2.1922819674015046, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.20425879992544652, "step": 3000 }, { "epoch": 0.07525, "grad_norm": 29.5, "grad_norm_var": 2.668684895833333, "learning_rate": 0.0001, "loss": 7.3505, "loss/crossentropy": 2.189265179634094, "loss/hidden": 3.34609375, "loss/jsd": 0.0, "loss/logits": 0.20808048862963915, "step": 3010 }, { "epoch": 0.0755, "grad_norm": 30.75, "grad_norm_var": 14.20625, "learning_rate": 0.0001, "loss": 7.5013, "loss/crossentropy": 2.0573098927736284, "loss/hidden": 3.3515625, "loss/jsd": 0.0, "loss/logits": 0.18116160985082388, "step": 3020 }, { "epoch": 0.07575, "grad_norm": 31.375, "grad_norm_var": 16.983333333333334, "learning_rate": 0.0001, "loss": 7.4455, "loss/crossentropy": 1.9735823571681976, "loss/hidden": 3.459765625, "loss/jsd": 0.0, "loss/logits": 0.19495000168681145, "step": 3030 }, { "epoch": 0.076, "grad_norm": 7247757312.0, "grad_norm_var": 3.2831240991582193e+18, "learning_rate": 0.0001, "loss": 7.4881, "loss/crossentropy": 1.971890377253294, "loss/hidden": 3.40625, "loss/jsd": 0.0, "loss/logits": 0.18015608433634042, "step": 3040 }, { "epoch": 0.07625, "grad_norm": 28.25, "grad_norm_var": 3.283124098780732e+18, "learning_rate": 0.0001, "loss": 7.3664, "loss/crossentropy": 1.8378953270614147, "loss/hidden": 3.3609375, "loss/jsd": 0.0, "loss/logits": 0.1741427879780531, "step": 3050 }, { "epoch": 0.0765, "grad_norm": 31.75, "grad_norm_var": 1.89140625, "learning_rate": 0.0001, "loss": 7.5137, "loss/crossentropy": 2.141886255145073, "loss/hidden": 3.443359375, "loss/jsd": 0.0, "loss/logits": 0.19584037065505983, "step": 3060 }, { "epoch": 0.07675, "grad_norm": 27.25, "grad_norm_var": 2.4244140625, "learning_rate": 0.0001, "loss": 7.4296, "loss/crossentropy": 2.0373554110527037, "loss/hidden": 3.5640625, "loss/jsd": 0.0, "loss/logits": 0.216986732929945, "step": 3070 }, { "epoch": 0.077, "grad_norm": 35.25, "grad_norm_var": 3.7322265625, "learning_rate": 0.0001, "loss": 7.5269, "loss/crossentropy": 1.975497831404209, "loss/hidden": 3.333984375, "loss/jsd": 0.0, "loss/logits": 0.1780722170136869, "step": 3080 }, { "epoch": 0.07725, "grad_norm": 32.75, "grad_norm_var": 3.6895182291666666, "learning_rate": 0.0001, "loss": 7.4938, "loss/crossentropy": 2.151789793372154, "loss/hidden": 3.502734375, "loss/jsd": 0.0, "loss/logits": 0.21854450944811105, "step": 3090 }, { "epoch": 0.0775, "grad_norm": 29.5, "grad_norm_var": 6.82265625, "learning_rate": 0.0001, "loss": 7.4321, "loss/crossentropy": 1.9484706297516823, "loss/hidden": 3.506640625, "loss/jsd": 0.0, "loss/logits": 0.19896488767117262, "step": 3100 }, { "epoch": 0.07775, "grad_norm": 29.75, "grad_norm_var": 3.0780598958333334, "learning_rate": 0.0001, "loss": 7.5471, "loss/crossentropy": 2.165594828128815, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.20095103643834591, "step": 3110 }, { "epoch": 0.078, "grad_norm": 29.0, "grad_norm_var": 2.2197916666666666, "learning_rate": 0.0001, "loss": 7.6334, "loss/crossentropy": 2.1854751259088516, "loss/hidden": 3.4640625, "loss/jsd": 0.0, "loss/logits": 0.21246263310313224, "step": 3120 }, { "epoch": 0.07825, "grad_norm": 29.0, "grad_norm_var": 3.71640625, "learning_rate": 0.0001, "loss": 7.4278, "loss/crossentropy": 1.914103902876377, "loss/hidden": 3.451953125, "loss/jsd": 0.0, "loss/logits": 0.18373754434287548, "step": 3130 }, { "epoch": 0.0785, "grad_norm": 29.0, "grad_norm_var": 1.2952473958333333, "learning_rate": 0.0001, "loss": 7.4487, "loss/crossentropy": 1.9421842776238918, "loss/hidden": 3.5296875, "loss/jsd": 0.0, "loss/logits": 0.19919300880283117, "step": 3140 }, { "epoch": 0.07875, "grad_norm": 29.375, "grad_norm_var": 1.8268229166666667, "learning_rate": 0.0001, "loss": 7.5818, "loss/crossentropy": 2.0765694811940194, "loss/hidden": 3.5171875, "loss/jsd": 0.0, "loss/logits": 0.19946561977267266, "step": 3150 }, { "epoch": 0.079, "grad_norm": 28.125, "grad_norm_var": 11.483268229166667, "learning_rate": 0.0001, "loss": 7.4372, "loss/crossentropy": 2.013955050334334, "loss/hidden": 3.4078125, "loss/jsd": 0.0, "loss/logits": 0.20109358858317136, "step": 3160 }, { "epoch": 0.07925, "grad_norm": 28.875, "grad_norm_var": 12.871809895833334, "learning_rate": 0.0001, "loss": 7.4606, "loss/crossentropy": 2.2802242666482924, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.21229397617280482, "step": 3170 }, { "epoch": 0.0795, "grad_norm": 28.375, "grad_norm_var": 1.6301432291666667, "learning_rate": 0.0001, "loss": 7.4691, "loss/crossentropy": 2.134338477253914, "loss/hidden": 3.404296875, "loss/jsd": 0.0, "loss/logits": 0.18632632456719875, "step": 3180 }, { "epoch": 0.07975, "grad_norm": 30.625, "grad_norm_var": 2.6113932291666666, "learning_rate": 0.0001, "loss": 7.4903, "loss/crossentropy": 2.192245528101921, "loss/hidden": 3.381640625, "loss/jsd": 0.0, "loss/logits": 0.19276445377618073, "step": 3190 }, { "epoch": 0.08, "grad_norm": 27.875, "grad_norm_var": 2.6830729166666667, "learning_rate": 0.0001, "loss": 7.4715, "loss/crossentropy": 2.1333388604223726, "loss/hidden": 3.376953125, "loss/jsd": 0.0, "loss/logits": 0.1902673264965415, "step": 3200 }, { "epoch": 0.08025, "grad_norm": 29.625, "grad_norm_var": 2.7072265625, "learning_rate": 0.0001, "loss": 7.4646, "loss/crossentropy": 2.1069626569747926, "loss/hidden": 3.374609375, "loss/jsd": 0.0, "loss/logits": 0.18933899328112602, "step": 3210 }, { "epoch": 0.0805, "grad_norm": 33.0, "grad_norm_var": 1.6457682291666667, "learning_rate": 0.0001, "loss": 7.3771, "loss/crossentropy": 2.143903985619545, "loss/hidden": 3.34921875, "loss/jsd": 0.0, "loss/logits": 0.19841080270707606, "step": 3220 }, { "epoch": 0.08075, "grad_norm": 29.5, "grad_norm_var": 2.405143229166667, "learning_rate": 0.0001, "loss": 7.4629, "loss/crossentropy": 1.9501185864210129, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.2003694986924529, "step": 3230 }, { "epoch": 0.081, "grad_norm": 35.0, "grad_norm_var": 3.4619140625, "learning_rate": 0.0001, "loss": 7.6085, "loss/crossentropy": 2.1099744185805323, "loss/hidden": 3.33515625, "loss/jsd": 0.0, "loss/logits": 0.1865939747542143, "step": 3240 }, { "epoch": 0.08125, "grad_norm": 38.0, "grad_norm_var": 15.54140625, "learning_rate": 0.0001, "loss": 7.4858, "loss/crossentropy": 1.8915734700858593, "loss/hidden": 3.550390625, "loss/jsd": 0.0, "loss/logits": 0.20414282865822314, "step": 3250 }, { "epoch": 0.0815, "grad_norm": 31.875, "grad_norm_var": 15.074934895833334, "learning_rate": 0.0001, "loss": 7.4995, "loss/crossentropy": 2.0746393710374833, "loss/hidden": 3.448046875, "loss/jsd": 0.0, "loss/logits": 0.19025763403624296, "step": 3260 }, { "epoch": 0.08175, "grad_norm": 29.625, "grad_norm_var": 4.532291666666667, "learning_rate": 0.0001, "loss": 7.4517, "loss/crossentropy": 2.201898355782032, "loss/hidden": 3.38515625, "loss/jsd": 0.0, "loss/logits": 0.1851862959563732, "step": 3270 }, { "epoch": 0.082, "grad_norm": 32.25, "grad_norm_var": 9.199739583333333, "learning_rate": 0.0001, "loss": 7.4085, "loss/crossentropy": 1.9774614453315735, "loss/hidden": 3.5078125, "loss/jsd": 0.0, "loss/logits": 0.1853517958894372, "step": 3280 }, { "epoch": 0.08225, "grad_norm": 31.0, "grad_norm_var": 13.801497395833334, "learning_rate": 0.0001, "loss": 7.4065, "loss/crossentropy": 2.1263367265462874, "loss/hidden": 3.412109375, "loss/jsd": 0.0, "loss/logits": 0.18529028967022895, "step": 3290 }, { "epoch": 0.0825, "grad_norm": 29.5, "grad_norm_var": 2.967643229166667, "learning_rate": 0.0001, "loss": 7.4165, "loss/crossentropy": 2.193544697761536, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19897244460880756, "step": 3300 }, { "epoch": 0.08275, "grad_norm": 33.75, "grad_norm_var": 9.687239583333334, "learning_rate": 0.0001, "loss": 7.5716, "loss/crossentropy": 2.0868531957268717, "loss/hidden": 3.616796875, "loss/jsd": 0.0, "loss/logits": 0.21278488002717494, "step": 3310 }, { "epoch": 0.083, "grad_norm": 31.0, "grad_norm_var": 7.9478515625, "learning_rate": 0.0001, "loss": 7.5543, "loss/crossentropy": 2.1392074063420297, "loss/hidden": 3.395703125, "loss/jsd": 0.0, "loss/logits": 0.21113577168434858, "step": 3320 }, { "epoch": 0.08325, "grad_norm": 30.0, "grad_norm_var": 2.0268229166666667, "learning_rate": 0.0001, "loss": 7.4454, "loss/crossentropy": 2.0691144198179243, "loss/hidden": 3.341796875, "loss/jsd": 0.0, "loss/logits": 0.20186964478343725, "step": 3330 }, { "epoch": 0.0835, "grad_norm": 31.5, "grad_norm_var": 2.6211653265769103e+18, "learning_rate": 0.0001, "loss": 7.4481, "loss/crossentropy": 2.0832756504416468, "loss/hidden": 3.423046875, "loss/jsd": 0.0, "loss/logits": 0.19915037509053946, "step": 3340 }, { "epoch": 0.08375, "grad_norm": 32.5, "grad_norm_var": 2.621165324337292e+18, "learning_rate": 0.0001, "loss": 7.3606, "loss/crossentropy": 2.102260760962963, "loss/hidden": 3.372265625, "loss/jsd": 0.0, "loss/logits": 0.19333885367959738, "step": 3350 }, { "epoch": 0.084, "grad_norm": 29.25, "grad_norm_var": 85.575, "learning_rate": 0.0001, "loss": 7.4073, "loss/crossentropy": 2.149528594315052, "loss/hidden": 3.491796875, "loss/jsd": 0.0, "loss/logits": 0.2177526842802763, "step": 3360 }, { "epoch": 0.08425, "grad_norm": 30.25, "grad_norm_var": 2.8645833333333335, "learning_rate": 0.0001, "loss": 7.4642, "loss/crossentropy": 2.085590344667435, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.17804578468203544, "step": 3370 }, { "epoch": 0.0845, "grad_norm": 33.25, "grad_norm_var": 2.996875, "learning_rate": 0.0001, "loss": 7.3953, "loss/crossentropy": 2.0975965946912765, "loss/hidden": 3.3, "loss/jsd": 0.0, "loss/logits": 0.1847201505675912, "step": 3380 }, { "epoch": 0.08475, "grad_norm": 32.0, "grad_norm_var": 2.470572916666667, "learning_rate": 0.0001, "loss": 7.4553, "loss/crossentropy": 2.1018140748143197, "loss/hidden": 3.36796875, "loss/jsd": 0.0, "loss/logits": 0.18250287007540464, "step": 3390 }, { "epoch": 0.085, "grad_norm": 30.25, "grad_norm_var": 2.887955729166667, "learning_rate": 0.0001, "loss": 7.5238, "loss/crossentropy": 2.1050665065646172, "loss/hidden": 3.501953125, "loss/jsd": 0.0, "loss/logits": 0.2124734738841653, "step": 3400 }, { "epoch": 0.08525, "grad_norm": 30.0, "grad_norm_var": 1.7143229166666667, "learning_rate": 0.0001, "loss": 7.2754, "loss/crossentropy": 2.0948296964168547, "loss/hidden": 3.37421875, "loss/jsd": 0.0, "loss/logits": 0.178215317055583, "step": 3410 }, { "epoch": 0.0855, "grad_norm": 30.875, "grad_norm_var": 5.354622395833333, "learning_rate": 0.0001, "loss": 7.4191, "loss/crossentropy": 2.0418393671512605, "loss/hidden": 3.4734375, "loss/jsd": 0.0, "loss/logits": 0.18740264605730772, "step": 3420 }, { "epoch": 0.08575, "grad_norm": 32.25, "grad_norm_var": 6.430989583333333, "learning_rate": 0.0001, "loss": 7.5642, "loss/crossentropy": 2.0279636546969413, "loss/hidden": 3.55859375, "loss/jsd": 0.0, "loss/logits": 0.20154636316001415, "step": 3430 }, { "epoch": 0.086, "grad_norm": 29.125, "grad_norm_var": 53.64791666666667, "learning_rate": 0.0001, "loss": 7.485, "loss/crossentropy": 2.0705729112029077, "loss/hidden": 3.456640625, "loss/jsd": 0.0, "loss/logits": 0.22035282999277114, "step": 3440 }, { "epoch": 0.08625, "grad_norm": 30.375, "grad_norm_var": 5.54765625, "learning_rate": 0.0001, "loss": 7.428, "loss/crossentropy": 1.9830067940056324, "loss/hidden": 3.376171875, "loss/jsd": 0.0, "loss/logits": 0.19354272997006775, "step": 3450 }, { "epoch": 0.0865, "grad_norm": 28.375, "grad_norm_var": 2.758736220726598e+18, "learning_rate": 0.0001, "loss": 7.4342, "loss/crossentropy": 2.1590976014733316, "loss/hidden": 3.489453125, "loss/jsd": 0.0, "loss/logits": 0.20231232214719058, "step": 3460 }, { "epoch": 0.08675, "grad_norm": 29.125, "grad_norm_var": 2.470572916666667, "learning_rate": 0.0001, "loss": 7.3376, "loss/crossentropy": 2.108407254517078, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.18425025548785925, "step": 3470 }, { "epoch": 0.087, "grad_norm": 32.5, "grad_norm_var": 19.315559895833335, "learning_rate": 0.0001, "loss": 7.391, "loss/crossentropy": 2.086346108466387, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.19492445401847364, "step": 3480 }, { "epoch": 0.08725, "grad_norm": 30.75, "grad_norm_var": 3.9009765625, "learning_rate": 0.0001, "loss": 7.454, "loss/crossentropy": 2.0728737086057665, "loss/hidden": 3.474609375, "loss/jsd": 0.0, "loss/logits": 0.21246139723807572, "step": 3490 }, { "epoch": 0.0875, "grad_norm": 53.25, "grad_norm_var": 34.962955729166666, "learning_rate": 0.0001, "loss": 7.4001, "loss/crossentropy": 1.9173476293683052, "loss/hidden": 3.330859375, "loss/jsd": 0.0, "loss/logits": 0.18263984741643072, "step": 3500 }, { "epoch": 0.08775, "grad_norm": 29.875, "grad_norm_var": 36.22389322916667, "learning_rate": 0.0001, "loss": 7.5855, "loss/crossentropy": 1.9761252515017986, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.20959299746900797, "step": 3510 }, { "epoch": 0.088, "grad_norm": 32.25, "grad_norm_var": 17.7337890625, "learning_rate": 0.0001, "loss": 7.4728, "loss/crossentropy": 2.0416554152965545, "loss/hidden": 3.4703125, "loss/jsd": 0.0, "loss/logits": 0.19014756735414268, "step": 3520 }, { "epoch": 0.08825, "grad_norm": 29.375, "grad_norm_var": 14.664322916666666, "learning_rate": 0.0001, "loss": 7.5608, "loss/crossentropy": 2.2834356099367143, "loss/hidden": 3.3953125, "loss/jsd": 0.0, "loss/logits": 0.19908843878656626, "step": 3530 }, { "epoch": 0.0885, "grad_norm": 31.875, "grad_norm_var": 2.6702473958333335, "learning_rate": 0.0001, "loss": 7.4804, "loss/crossentropy": 2.0417330890893934, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.20852382443845272, "step": 3540 }, { "epoch": 0.08875, "grad_norm": 31.625, "grad_norm_var": 2.460724589971584e+18, "learning_rate": 0.0001, "loss": 7.5559, "loss/crossentropy": 2.1676768481731417, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.1989177169278264, "step": 3550 }, { "epoch": 0.089, "grad_norm": 30.0, "grad_norm_var": 6.881705729166667, "learning_rate": 0.0001, "loss": 7.4678, "loss/crossentropy": 2.221273897588253, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.19402988757938147, "step": 3560 }, { "epoch": 0.08925, "grad_norm": 31.375, "grad_norm_var": 7.732747395833333, "learning_rate": 0.0001, "loss": 7.4508, "loss/crossentropy": 2.1802149415016174, "loss/hidden": 3.43828125, "loss/jsd": 0.0, "loss/logits": 0.20121808685362338, "step": 3570 }, { "epoch": 0.0895, "grad_norm": 52.5, "grad_norm_var": 30.9775390625, "learning_rate": 0.0001, "loss": 7.3982, "loss/crossentropy": 2.085124118626118, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.18448642026633025, "step": 3580 }, { "epoch": 0.08975, "grad_norm": 30.875, "grad_norm_var": 32.91295572916667, "learning_rate": 0.0001, "loss": 7.4381, "loss/crossentropy": 2.1467449337244036, "loss/hidden": 3.3734375, "loss/jsd": 0.0, "loss/logits": 0.19393185302615165, "step": 3590 }, { "epoch": 0.09, "grad_norm": 29.25, "grad_norm_var": 1.4708333333333334, "learning_rate": 0.0001, "loss": 7.415, "loss/crossentropy": 2.0135369554162024, "loss/hidden": 3.37109375, "loss/jsd": 0.0, "loss/logits": 0.18443848174065353, "step": 3600 }, { "epoch": 0.09025, "grad_norm": 31.375, "grad_norm_var": 6.1962890625, "learning_rate": 0.0001, "loss": 7.4028, "loss/crossentropy": 2.1443901300430297, "loss/hidden": 3.440234375, "loss/jsd": 0.0, "loss/logits": 0.2054579086601734, "step": 3610 }, { "epoch": 0.0905, "grad_norm": 26.5, "grad_norm_var": 3.562239583333333, "learning_rate": 0.0001, "loss": 7.3255, "loss/crossentropy": 1.799356396496296, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.17441922090947629, "step": 3620 }, { "epoch": 0.09075, "grad_norm": 31.375, "grad_norm_var": 2.2083333333333335, "learning_rate": 0.0001, "loss": 7.4272, "loss/crossentropy": 1.9925116747617722, "loss/hidden": 3.52578125, "loss/jsd": 0.0, "loss/logits": 0.21653544921427964, "step": 3630 }, { "epoch": 0.091, "grad_norm": 30.125, "grad_norm_var": 0.6125, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.135761073231697, "loss/hidden": 3.4140625, "loss/jsd": 0.0, "loss/logits": 0.18989351522177458, "step": 3640 }, { "epoch": 0.09125, "grad_norm": 31.375, "grad_norm_var": 1.4330729166666667, "learning_rate": 0.0001, "loss": 7.4505, "loss/crossentropy": 2.0986070543527604, "loss/hidden": 3.334375, "loss/jsd": 0.0, "loss/logits": 0.18352905213832854, "step": 3650 }, { "epoch": 0.0915, "grad_norm": 29.625, "grad_norm_var": 2.5869140625, "learning_rate": 0.0001, "loss": 7.4199, "loss/crossentropy": 2.1555575743317603, "loss/hidden": 3.402734375, "loss/jsd": 0.0, "loss/logits": 0.19261632524430752, "step": 3660 }, { "epoch": 0.09175, "grad_norm": 31.5, "grad_norm_var": 2.371875, "learning_rate": 0.0001, "loss": 7.5463, "loss/crossentropy": 2.1411691516637803, "loss/hidden": 3.446875, "loss/jsd": 0.0, "loss/logits": 0.2046652188524604, "step": 3670 }, { "epoch": 0.092, "grad_norm": 30.625, "grad_norm_var": 4.703580729166666, "learning_rate": 0.0001, "loss": 7.404, "loss/crossentropy": 2.142404294013977, "loss/hidden": 3.445703125, "loss/jsd": 0.0, "loss/logits": 0.20414466112852098, "step": 3680 }, { "epoch": 0.09225, "grad_norm": 30.375, "grad_norm_var": 3.25625, "learning_rate": 0.0001, "loss": 7.4774, "loss/crossentropy": 2.187901920080185, "loss/hidden": 3.480859375, "loss/jsd": 0.0, "loss/logits": 0.21911972090601922, "step": 3690 }, { "epoch": 0.0925, "grad_norm": 31.875, "grad_norm_var": 1.2166666666666666, "learning_rate": 0.0001, "loss": 7.5965, "loss/crossentropy": 2.086391404271126, "loss/hidden": 3.438671875, "loss/jsd": 0.0, "loss/logits": 0.2020766455680132, "step": 3700 }, { "epoch": 0.09275, "grad_norm": 30.625, "grad_norm_var": 2.147330729166667, "learning_rate": 0.0001, "loss": 7.4579, "loss/crossentropy": 2.09081457182765, "loss/hidden": 3.369140625, "loss/jsd": 0.0, "loss/logits": 0.1868050311692059, "step": 3710 }, { "epoch": 0.093, "grad_norm": 34.25, "grad_norm_var": 2.467643229166667, "learning_rate": 0.0001, "loss": 7.522, "loss/crossentropy": 2.12264247238636, "loss/hidden": 3.453125, "loss/jsd": 0.0, "loss/logits": 0.18927707765251398, "step": 3720 }, { "epoch": 0.09325, "grad_norm": 32.25, "grad_norm_var": 3.981184895833333, "learning_rate": 0.0001, "loss": 7.4155, "loss/crossentropy": 2.1118928104639054, "loss/hidden": 3.44765625, "loss/jsd": 0.0, "loss/logits": 0.19489197488874196, "step": 3730 }, { "epoch": 0.0935, "grad_norm": 34.0, "grad_norm_var": 5.312434895833333, "learning_rate": 0.0001, "loss": 7.5053, "loss/crossentropy": 2.1360882744193077, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.19313989579677582, "step": 3740 }, { "epoch": 0.09375, "grad_norm": 29.125, "grad_norm_var": 4.549739583333333, "learning_rate": 0.0001, "loss": 7.3275, "loss/crossentropy": 2.010613538324833, "loss/hidden": 3.355859375, "loss/jsd": 0.0, "loss/logits": 0.18421147018671036, "step": 3750 }, { "epoch": 0.094, "grad_norm": 31.625, "grad_norm_var": 1.5541666666666667, "learning_rate": 0.0001, "loss": 7.4784, "loss/crossentropy": 2.1465295113623144, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.18987073097378016, "step": 3760 }, { "epoch": 0.09425, "grad_norm": 32.75, "grad_norm_var": 1.9018229166666667, "learning_rate": 0.0001, "loss": 7.3495, "loss/crossentropy": 2.17747982442379, "loss/hidden": 3.48046875, "loss/jsd": 0.0, "loss/logits": 0.2016214355826378, "step": 3770 }, { "epoch": 0.0945, "grad_norm": 30.875, "grad_norm_var": 3.088997395833333, "learning_rate": 0.0001, "loss": 7.5384, "loss/crossentropy": 2.179350584745407, "loss/hidden": 3.350390625, "loss/jsd": 0.0, "loss/logits": 0.19142594784498215, "step": 3780 }, { "epoch": 0.09475, "grad_norm": 29.625, "grad_norm_var": 1.1559895833333333, "learning_rate": 0.0001, "loss": 7.4035, "loss/crossentropy": 2.155378046631813, "loss/hidden": 3.32109375, "loss/jsd": 0.0, "loss/logits": 0.19720839541405438, "step": 3790 }, { "epoch": 0.095, "grad_norm": 30.625, "grad_norm_var": 1.1999348958333333, "learning_rate": 0.0001, "loss": 7.4441, "loss/crossentropy": 2.0597486779093743, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.19279775265604257, "step": 3800 }, { "epoch": 0.09525, "grad_norm": 33.5, "grad_norm_var": 2.1666666666666665, "learning_rate": 0.0001, "loss": 7.5146, "loss/crossentropy": 2.1966816753149034, "loss/hidden": 3.44375, "loss/jsd": 0.0, "loss/logits": 0.20174810625612735, "step": 3810 }, { "epoch": 0.0955, "grad_norm": 31.5, "grad_norm_var": 1.9593098958333333, "learning_rate": 0.0001, "loss": 7.539, "loss/crossentropy": 2.165803623199463, "loss/hidden": 3.34375, "loss/jsd": 0.0, "loss/logits": 0.1953417781740427, "step": 3820 }, { "epoch": 0.09575, "grad_norm": 32.0, "grad_norm_var": 6.690625, "learning_rate": 0.0001, "loss": 7.514, "loss/crossentropy": 2.0817860513925552, "loss/hidden": 3.453515625, "loss/jsd": 0.0, "loss/logits": 0.20838446952402592, "step": 3830 }, { "epoch": 0.096, "grad_norm": 32.75, "grad_norm_var": 7.6431640625, "learning_rate": 0.0001, "loss": 7.5472, "loss/crossentropy": 2.231910442560911, "loss/hidden": 3.442578125, "loss/jsd": 0.0, "loss/logits": 0.21717903479002415, "step": 3840 }, { "epoch": 0.09625, "grad_norm": 32.0, "grad_norm_var": 16.134375, "learning_rate": 0.0001, "loss": 7.5807, "loss/crossentropy": 2.0746277555823327, "loss/hidden": 3.47578125, "loss/jsd": 0.0, "loss/logits": 0.20851925816386938, "step": 3850 }, { "epoch": 0.0965, "grad_norm": 30.625, "grad_norm_var": 16.132747395833334, "learning_rate": 0.0001, "loss": 7.3749, "loss/crossentropy": 2.1463438466191294, "loss/hidden": 3.356640625, "loss/jsd": 0.0, "loss/logits": 0.19305863380432128, "step": 3860 }, { "epoch": 0.09675, "grad_norm": 32.5, "grad_norm_var": 1.0895182291666667, "learning_rate": 0.0001, "loss": 7.5499, "loss/crossentropy": 2.2108413323760034, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.20310868676751853, "step": 3870 }, { "epoch": 0.097, "grad_norm": 30.75, "grad_norm_var": 1.4559895833333334, "learning_rate": 0.0001, "loss": 7.4788, "loss/crossentropy": 2.0900154620409013, "loss/hidden": 3.42734375, "loss/jsd": 0.0, "loss/logits": 0.18780422061681748, "step": 3880 }, { "epoch": 0.09725, "grad_norm": 30.625, "grad_norm_var": 13.917643229166666, "learning_rate": 0.0001, "loss": 7.4391, "loss/crossentropy": 2.0574848279356956, "loss/hidden": 3.3875, "loss/jsd": 0.0, "loss/logits": 0.19390027467161416, "step": 3890 }, { "epoch": 0.0975, "grad_norm": 27.375, "grad_norm_var": 13.55, "learning_rate": 0.0001, "loss": 7.4327, "loss/crossentropy": 2.2832688719034193, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.20608801003545524, "step": 3900 }, { "epoch": 0.09775, "grad_norm": 29.0, "grad_norm_var": 3.296875, "learning_rate": 0.0001, "loss": 7.3691, "loss/crossentropy": 1.9183307077735663, "loss/hidden": 3.37890625, "loss/jsd": 0.0, "loss/logits": 0.1917601386550814, "step": 3910 }, { "epoch": 0.098, "grad_norm": 34.0, "grad_norm_var": 3.24765625, "learning_rate": 0.0001, "loss": 7.4628, "loss/crossentropy": 2.0630046002566815, "loss/hidden": 3.338671875, "loss/jsd": 0.0, "loss/logits": 0.1871832549571991, "step": 3920 }, { "epoch": 0.09825, "grad_norm": 31.75, "grad_norm_var": 1.5384765625, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.061261148750782, "loss/hidden": 3.415234375, "loss/jsd": 0.0, "loss/logits": 0.18525551967322826, "step": 3930 }, { "epoch": 0.0985, "grad_norm": 29.75, "grad_norm_var": 1.584375, "learning_rate": 0.0001, "loss": 7.5498, "loss/crossentropy": 2.0895790114998816, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.1932330032810569, "step": 3940 }, { "epoch": 0.09875, "grad_norm": 30.625, "grad_norm_var": 25.79765625, "learning_rate": 0.0001, "loss": 7.6502, "loss/crossentropy": 2.1616804771125318, "loss/hidden": 3.365625, "loss/jsd": 0.0, "loss/logits": 0.18905209768563508, "step": 3950 }, { "epoch": 0.099, "grad_norm": 30.5, "grad_norm_var": 28.547916666666666, "learning_rate": 0.0001, "loss": 7.3334, "loss/crossentropy": 2.1435488507151605, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.1910943292081356, "step": 3960 }, { "epoch": 0.09925, "grad_norm": 32.75, "grad_norm_var": 6.3650390625, "learning_rate": 0.0001, "loss": 7.542, "loss/crossentropy": 2.176460310816765, "loss/hidden": 3.41328125, "loss/jsd": 0.0, "loss/logits": 0.18821860365569593, "step": 3970 }, { "epoch": 0.0995, "grad_norm": 31.625, "grad_norm_var": 3.9905598958333335, "learning_rate": 0.0001, "loss": 7.5231, "loss/crossentropy": 2.2077176332473756, "loss/hidden": 3.4515625, "loss/jsd": 0.0, "loss/logits": 0.21911400128155947, "step": 3980 }, { "epoch": 0.09975, "grad_norm": 31.125, "grad_norm_var": 1.75625, "learning_rate": 0.0001, "loss": 7.4868, "loss/crossentropy": 2.105836200714111, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.1997914554551244, "step": 3990 }, { "epoch": 0.1, "grad_norm": 38.0, "grad_norm_var": 4.710416666666666, "learning_rate": 0.0001, "loss": 7.5675, "loss/crossentropy": 2.233233967423439, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.20876242108643056, "step": 4000 }, { "epoch": 0.10025, "grad_norm": 28.625, "grad_norm_var": 7.56640625, "learning_rate": 0.0001, "loss": 7.4736, "loss/crossentropy": 2.103509198874235, "loss/hidden": 3.413671875, "loss/jsd": 0.0, "loss/logits": 0.1953927006572485, "step": 4010 }, { "epoch": 0.1005, "grad_norm": 28.875, "grad_norm_var": 4.119791666666667, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 1.9697775058448315, "loss/hidden": 3.308984375, "loss/jsd": 0.0, "loss/logits": 0.17186311883851885, "step": 4020 }, { "epoch": 0.10075, "grad_norm": 29.5, "grad_norm_var": 1.3177083333333333, "learning_rate": 0.0001, "loss": 7.333, "loss/crossentropy": 2.0519870311021804, "loss/hidden": 3.42421875, "loss/jsd": 0.0, "loss/logits": 0.1872571600601077, "step": 4030 }, { "epoch": 0.101, "grad_norm": 29.5, "grad_norm_var": 1.2785807291666667, "learning_rate": 0.0001, "loss": 7.3466, "loss/crossentropy": 2.0663713179528713, "loss/hidden": 3.39921875, "loss/jsd": 0.0, "loss/logits": 0.18582073990255593, "step": 4040 }, { "epoch": 0.10125, "grad_norm": 30.375, "grad_norm_var": 1.9577473958333333, "learning_rate": 0.0001, "loss": 7.3812, "loss/crossentropy": 2.1256399258971213, "loss/hidden": 3.30390625, "loss/jsd": 0.0, "loss/logits": 0.19628962082788348, "step": 4050 }, { "epoch": 0.1015, "grad_norm": 30.625, "grad_norm_var": 0.53125, "learning_rate": 0.0001, "loss": 7.3726, "loss/crossentropy": 2.1235328309237955, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18646292947232723, "step": 4060 }, { "epoch": 0.10175, "grad_norm": 29.0, "grad_norm_var": 3.19255952647709e+18, "learning_rate": 0.0001, "loss": 7.4564, "loss/crossentropy": 2.0213126331567763, "loss/hidden": 3.496875, "loss/jsd": 0.0, "loss/logits": 0.19607899691909553, "step": 4070 }, { "epoch": 0.102, "grad_norm": 28.75, "grad_norm_var": 3.48515625, "learning_rate": 0.0001, "loss": 7.3886, "loss/crossentropy": 2.0899658009409903, "loss/hidden": 3.340625, "loss/jsd": 0.0, "loss/logits": 0.1851665174588561, "step": 4080 }, { "epoch": 0.10225, "grad_norm": 29.5, "grad_norm_var": 1.8692057291666666, "learning_rate": 0.0001, "loss": 7.4838, "loss/crossentropy": 2.027493818849325, "loss/hidden": 3.49765625, "loss/jsd": 0.0, "loss/logits": 0.19640162959694862, "step": 4090 }, { "epoch": 0.1025, "grad_norm": 29.125, "grad_norm_var": 11.762434895833334, "learning_rate": 0.0001, "loss": 7.5099, "loss/crossentropy": 2.056584618985653, "loss/hidden": 3.32265625, "loss/jsd": 0.0, "loss/logits": 0.17638762388378382, "step": 4100 }, { "epoch": 0.10275, "grad_norm": 30.125, "grad_norm_var": 12.459375, "learning_rate": 0.0001, "loss": 7.5255, "loss/crossentropy": 2.0713445380330087, "loss/hidden": 3.36953125, "loss/jsd": 0.0, "loss/logits": 0.18587317056953906, "step": 4110 }, { "epoch": 0.103, "grad_norm": 33.25, "grad_norm_var": 1.9958333333333333, "learning_rate": 0.0001, "loss": 7.4437, "loss/crossentropy": 2.2338072419166566, "loss/hidden": 3.36171875, "loss/jsd": 0.0, "loss/logits": 0.18814200926572083, "step": 4120 }, { "epoch": 0.10325, "grad_norm": 32.75, "grad_norm_var": 3.1259765625, "learning_rate": 0.0001, "loss": 7.3184, "loss/crossentropy": 2.0210259817540646, "loss/hidden": 3.3671875, "loss/jsd": 0.0, "loss/logits": 0.18816483654081823, "step": 4130 }, { "epoch": 0.1035, "grad_norm": 29.5, "grad_norm_var": 2.870247395833333, "learning_rate": 0.0001, "loss": 7.5124, "loss/crossentropy": 2.0151045128703116, "loss/hidden": 3.371484375, "loss/jsd": 0.0, "loss/logits": 0.19255878478288652, "step": 4140 }, { "epoch": 0.10375, "grad_norm": 30.625, "grad_norm_var": 1.3926432291666666, "learning_rate": 0.0001, "loss": 7.5096, "loss/crossentropy": 1.9808883003890514, "loss/hidden": 3.449609375, "loss/jsd": 0.0, "loss/logits": 0.19115560222417116, "step": 4150 }, { "epoch": 0.104, "grad_norm": 30.75, "grad_norm_var": 1.6979166666666667, "learning_rate": 0.0001, "loss": 7.549, "loss/crossentropy": 2.1932784736156465, "loss/hidden": 3.39765625, "loss/jsd": 0.0, "loss/logits": 0.20479805655777455, "step": 4160 }, { "epoch": 0.10425, "grad_norm": 30.125, "grad_norm_var": 2.3333333333333335, "learning_rate": 0.0001, "loss": 7.3875, "loss/crossentropy": 1.8820222720503808, "loss/hidden": 3.337109375, "loss/jsd": 0.0, "loss/logits": 0.17310038600116967, "step": 4170 }, { "epoch": 0.1045, "grad_norm": 33.0, "grad_norm_var": 3.7728515625, "learning_rate": 0.0001, "loss": 7.4212, "loss/crossentropy": 2.082476270198822, "loss/hidden": 3.334765625, "loss/jsd": 0.0, "loss/logits": 0.19099258184432982, "step": 4180 }, { "epoch": 0.10475, "grad_norm": 30.875, "grad_norm_var": 11.408268229166667, "learning_rate": 0.0001, "loss": 7.4991, "loss/crossentropy": 2.287242355942726, "loss/hidden": 3.375390625, "loss/jsd": 0.0, "loss/logits": 0.1982285875827074, "step": 4190 }, { "epoch": 0.105, "grad_norm": 28.75, "grad_norm_var": 2.999739583333333, "learning_rate": 0.0001, "loss": 7.5959, "loss/crossentropy": 2.1783332407474516, "loss/hidden": 3.422265625, "loss/jsd": 0.0, "loss/logits": 0.2117959801107645, "step": 4200 }, { "epoch": 0.10525, "grad_norm": 30.0, "grad_norm_var": 4.708268229166666, "learning_rate": 0.0001, "loss": 7.3363, "loss/crossentropy": 1.955865352600813, "loss/hidden": 3.409375, "loss/jsd": 0.0, "loss/logits": 0.18177355360239744, "step": 4210 }, { "epoch": 0.1055, "grad_norm": 30.625, "grad_norm_var": 3.0254557291666666, "learning_rate": 0.0001, "loss": 7.4673, "loss/crossentropy": 1.833389012515545, "loss/hidden": 3.394921875, "loss/jsd": 0.0, "loss/logits": 0.1878132861107588, "step": 4220 }, { "epoch": 0.10575, "grad_norm": 32.0, "grad_norm_var": 3.05, "learning_rate": 0.0001, "loss": 7.3969, "loss/crossentropy": 1.9096243590116502, "loss/hidden": 3.35625, "loss/jsd": 0.0, "loss/logits": 0.17025592969730496, "step": 4230 }, { "epoch": 0.106, "grad_norm": 30.875, "grad_norm_var": 1.82265625, "learning_rate": 0.0001, "loss": 7.4638, "loss/crossentropy": 2.0454175233840943, "loss/hidden": 3.436328125, "loss/jsd": 0.0, "loss/logits": 0.20515710916370153, "step": 4240 }, { "epoch": 0.10625, "grad_norm": 30.125, "grad_norm_var": 3.1333333333333333, "learning_rate": 0.0001, "loss": 7.5126, "loss/crossentropy": 2.089062933623791, "loss/hidden": 3.4328125, "loss/jsd": 0.0, "loss/logits": 0.19156677946448325, "step": 4250 }, { "epoch": 0.1065, "grad_norm": 29.0, "grad_norm_var": 4.311393229166667, "learning_rate": 0.0001, "loss": 7.4468, "loss/crossentropy": 2.0564094200730323, "loss/hidden": 3.433984375, "loss/jsd": 0.0, "loss/logits": 0.19553639348596336, "step": 4260 }, { "epoch": 0.10675, "grad_norm": 32.0, "grad_norm_var": 3.2587890625, "learning_rate": 0.0001, "loss": 7.4186, "loss/crossentropy": 2.13806764036417, "loss/hidden": 3.3859375, "loss/jsd": 0.0, "loss/logits": 0.19822277761995793, "step": 4270 }, { "epoch": 0.107, "grad_norm": 28.0, "grad_norm_var": 1.6926432291666667, "learning_rate": 0.0001, "loss": 7.4595, "loss/crossentropy": 2.0767486467957497, "loss/hidden": 3.41953125, "loss/jsd": 0.0, "loss/logits": 0.1884168043732643, "step": 4280 }, { "epoch": 0.10725, "grad_norm": 33.0, "grad_norm_var": 2.3059895833333335, "learning_rate": 0.0001, "loss": 7.4481, "loss/crossentropy": 2.033916361629963, "loss/hidden": 3.45, "loss/jsd": 0.0, "loss/logits": 0.20558829829096795, "step": 4290 }, { "epoch": 0.1075, "grad_norm": 31.0, "grad_norm_var": 2.9375, "learning_rate": 0.0001, "loss": 7.4871, "loss/crossentropy": 2.078028707951307, "loss/hidden": 3.37578125, "loss/jsd": 0.0, "loss/logits": 0.188079852424562, "step": 4300 }, { "epoch": 0.10775, "grad_norm": 33.25, "grad_norm_var": 2.1020833333333333, "learning_rate": 0.0001, "loss": 7.5379, "loss/crossentropy": 2.003500834107399, "loss/hidden": 3.544921875, "loss/jsd": 0.0, "loss/logits": 0.20521650360897184, "step": 4310 }, { "epoch": 0.108, "grad_norm": 29.625, "grad_norm_var": 2.8447916666666666, "learning_rate": 0.0001, "loss": 7.3536, "loss/crossentropy": 2.043112625181675, "loss/hidden": 3.380859375, "loss/jsd": 0.0, "loss/logits": 0.19910661596804857, "step": 4320 }, { "epoch": 0.10825, "grad_norm": 28.5, "grad_norm_var": 4.000455729166666, "learning_rate": 0.0001, "loss": 7.3717, "loss/crossentropy": 2.1422011658549307, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.19375871792435645, "step": 4330 }, { "epoch": 0.1085, "grad_norm": 29.0, "grad_norm_var": 3.6259765625, "learning_rate": 0.0001, "loss": 7.5021, "loss/crossentropy": 2.131446525454521, "loss/hidden": 3.480078125, "loss/jsd": 0.0, "loss/logits": 0.2063008865341544, "step": 4340 }, { "epoch": 0.10875, "grad_norm": 32.0, "grad_norm_var": 5.9525390625, "learning_rate": 0.0001, "loss": 7.4749, "loss/crossentropy": 2.085691845417023, "loss/hidden": 3.359375, "loss/jsd": 0.0, "loss/logits": 0.1889802658930421, "step": 4350 }, { "epoch": 0.109, "grad_norm": 30.75, "grad_norm_var": 3.154166666666667, "learning_rate": 0.0001, "loss": 7.3816, "loss/crossentropy": 1.8972876839339734, "loss/hidden": 3.319140625, "loss/jsd": 0.0, "loss/logits": 0.17174729090183974, "step": 4360 }, { "epoch": 0.10925, "grad_norm": 29.875, "grad_norm_var": 1.7509765625, "learning_rate": 0.0001, "loss": 7.4444, "loss/crossentropy": 2.127763804793358, "loss/hidden": 3.401953125, "loss/jsd": 0.0, "loss/logits": 0.18679574280977249, "step": 4370 }, { "epoch": 0.1095, "grad_norm": 29.875, "grad_norm_var": 2.16015625, "learning_rate": 0.0001, "loss": 7.4682, "loss/crossentropy": 2.1872297644615175, "loss/hidden": 3.31328125, "loss/jsd": 0.0, "loss/logits": 0.183891461789608, "step": 4380 }, { "epoch": 0.10975, "grad_norm": 28.875, "grad_norm_var": 3.3692057291666666, "learning_rate": 0.0001, "loss": 7.429, "loss/crossentropy": 2.19267495572567, "loss/hidden": 3.394140625, "loss/jsd": 0.0, "loss/logits": 0.20111876968294382, "step": 4390 }, { "epoch": 0.11, "grad_norm": 29.375, "grad_norm_var": 1.6858723958333333, "learning_rate": 0.0001, "loss": 7.556, "loss/crossentropy": 2.1324411287903784, "loss/hidden": 3.4359375, "loss/jsd": 0.0, "loss/logits": 0.2090261412784457, "step": 4400 }, { "epoch": 0.11025, "grad_norm": 33.5, "grad_norm_var": 3.374739583333333, "learning_rate": 0.0001, "loss": 7.4081, "loss/crossentropy": 1.9800483137369156, "loss/hidden": 3.584375, "loss/jsd": 0.0, "loss/logits": 0.19881114605814218, "step": 4410 }, { "epoch": 0.1105, "grad_norm": 31.75, "grad_norm_var": 4.13515625, "learning_rate": 0.0001, "loss": 7.4904, "loss/crossentropy": 2.053773292154074, "loss/hidden": 3.323828125, "loss/jsd": 0.0, "loss/logits": 0.18270381446927786, "step": 4420 }, { "epoch": 0.11075, "grad_norm": 30.75, "grad_norm_var": 2.3499348958333335, "learning_rate": 0.0001, "loss": 7.4509, "loss/crossentropy": 2.0641689248383046, "loss/hidden": 3.40390625, "loss/jsd": 0.0, "loss/logits": 0.19245190378278493, "step": 4430 }, { "epoch": 0.111, "grad_norm": 33.25, "grad_norm_var": 3.158333333333333, "learning_rate": 0.0001, "loss": 7.3576, "loss/crossentropy": 2.073286408931017, "loss/hidden": 3.37265625, "loss/jsd": 0.0, "loss/logits": 0.1892416624352336, "step": 4440 }, { "epoch": 0.11125, "grad_norm": 35.75, "grad_norm_var": 6.167122395833333, "learning_rate": 0.0001, "loss": 7.456, "loss/crossentropy": 2.191167525947094, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.19327596500515937, "step": 4450 }, { "epoch": 0.1115, "grad_norm": 28.0, "grad_norm_var": 6.762239583333334, "learning_rate": 0.0001, "loss": 7.4254, "loss/crossentropy": 1.9917161837220192, "loss/hidden": 3.48828125, "loss/jsd": 0.0, "loss/logits": 0.18673346154391765, "step": 4460 }, { "epoch": 0.11175, "grad_norm": 31.0, "grad_norm_var": 2.763541666666667, "learning_rate": 0.0001, "loss": 7.4458, "loss/crossentropy": 2.0167058646678924, "loss/hidden": 3.477734375, "loss/jsd": 0.0, "loss/logits": 0.20151916183531285, "step": 4470 }, { "epoch": 0.112, "grad_norm": 30.5, "grad_norm_var": 7.175455729166667, "learning_rate": 0.0001, "loss": 7.4057, "loss/crossentropy": 2.013149876892567, "loss/hidden": 3.405859375, "loss/jsd": 0.0, "loss/logits": 0.1819242848083377, "step": 4480 }, { "epoch": 0.11225, "grad_norm": 43.25, "grad_norm_var": 13.478580729166667, "learning_rate": 0.0001, "loss": 7.4416, "loss/crossentropy": 2.111778366565704, "loss/hidden": 3.4, "loss/jsd": 0.0, "loss/logits": 0.20088088884949684, "step": 4490 }, { "epoch": 0.1125, "grad_norm": 30.125, "grad_norm_var": 11.905143229166667, "learning_rate": 0.0001, "loss": 7.4435, "loss/crossentropy": 2.0223396182060243, "loss/hidden": 3.408984375, "loss/jsd": 0.0, "loss/logits": 0.1967620700597763, "step": 4500 }, { "epoch": 0.11275, "grad_norm": 28.375, "grad_norm_var": 2.2978515625, "learning_rate": 0.0001, "loss": 7.3969, "loss/crossentropy": 1.9966137878596784, "loss/hidden": 3.416796875, "loss/jsd": 0.0, "loss/logits": 0.19062119219452142, "step": 4510 }, { "epoch": 0.113, "grad_norm": 29.75, "grad_norm_var": 3.1759765625, "learning_rate": 0.0001, "loss": 7.2845, "loss/crossentropy": 1.8878834903240205, "loss/hidden": 3.346484375, "loss/jsd": 0.0, "loss/logits": 0.16922880560159684, "step": 4520 }, { "epoch": 0.11325, "grad_norm": 33.5, "grad_norm_var": 3.78515625, "learning_rate": 0.0001, "loss": 7.5223, "loss/crossentropy": 2.0424712359905244, "loss/hidden": 3.42109375, "loss/jsd": 0.0, "loss/logits": 0.18261839263141155, "step": 4530 }, { "epoch": 0.1135, "grad_norm": 41.75, "grad_norm_var": 13.172330729166667, "learning_rate": 0.0001, "loss": 7.4917, "loss/crossentropy": 2.1800880253314974, "loss/hidden": 3.419140625, "loss/jsd": 0.0, "loss/logits": 0.1875661849975586, "step": 4540 }, { "epoch": 0.11375, "grad_norm": 29.5, "grad_norm_var": 13.737239583333333, "learning_rate": 0.0001, "loss": 7.4929, "loss/crossentropy": 2.1130245834589005, "loss/hidden": 3.514453125, "loss/jsd": 0.0, "loss/logits": 0.20742647554725407, "step": 4550 }, { "epoch": 0.114, "grad_norm": 31.875, "grad_norm_var": 3.1447265625, "learning_rate": 0.0001, "loss": 7.4885, "loss/crossentropy": 2.0878429099917413, "loss/hidden": 3.4625, "loss/jsd": 0.0, "loss/logits": 0.19807947240769863, "step": 4560 }, { "epoch": 0.11425, "grad_norm": 32.0, "grad_norm_var": 1.9080729166666666, "learning_rate": 0.0001, "loss": 7.412, "loss/crossentropy": 2.045598204433918, "loss/hidden": 3.43984375, "loss/jsd": 0.0, "loss/logits": 0.19935160782188177, "step": 4570 }, { "epoch": 0.1145, "grad_norm": 31.25, "grad_norm_var": 2.703285650940459e+18, "learning_rate": 0.0001, "loss": 7.4112, "loss/crossentropy": 1.9612677067518234, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.1939171139150858, "step": 4580 }, { "epoch": 0.11475, "grad_norm": 30.125, "grad_norm_var": 9.067708333333334, "learning_rate": 0.0001, "loss": 7.4109, "loss/crossentropy": 2.066862888634205, "loss/hidden": 3.440625, "loss/jsd": 0.0, "loss/logits": 0.20057452656328678, "step": 4590 }, { "epoch": 0.115, "grad_norm": 29.25, "grad_norm_var": 6.670833333333333, "learning_rate": 0.0001, "loss": 7.3857, "loss/crossentropy": 2.0378803849220275, "loss/hidden": 3.495703125, "loss/jsd": 0.0, "loss/logits": 0.19217969439923763, "step": 4600 }, { "epoch": 0.11525, "grad_norm": 32.0, "grad_norm_var": 8.108268229166667, "learning_rate": 0.0001, "loss": 7.4449, "loss/crossentropy": 1.9883966132998467, "loss/hidden": 3.378515625, "loss/jsd": 0.0, "loss/logits": 0.1796421378850937, "step": 4610 }, { "epoch": 0.1155, "grad_norm": 28.5, "grad_norm_var": 2.8853515625, "learning_rate": 0.0001, "loss": 7.43, "loss/crossentropy": 2.2122382700443266, "loss/hidden": 3.434765625, "loss/jsd": 0.0, "loss/logits": 0.20737907551229, "step": 4620 }, { "epoch": 0.11575, "grad_norm": 30.375, "grad_norm_var": 3.7968098958333334, "learning_rate": 0.0001, "loss": 7.3858, "loss/crossentropy": 2.0896764233708383, "loss/hidden": 3.540234375, "loss/jsd": 0.0, "loss/logits": 0.20905990786850454, "step": 4630 }, { "epoch": 0.116, "grad_norm": 27.5, "grad_norm_var": 3.6879557291666667, "learning_rate": 0.0001, "loss": 7.5145, "loss/crossentropy": 2.104724445939064, "loss/hidden": 3.3796875, "loss/jsd": 0.0, "loss/logits": 0.19548750538378953, "step": 4640 }, { "epoch": 0.11625, "grad_norm": 29.875, "grad_norm_var": 8.7056640625, "learning_rate": 0.0001, "loss": 7.4009, "loss/crossentropy": 2.155320603400469, "loss/hidden": 3.47578125, "loss/jsd": 0.0, "loss/logits": 0.2002986514940858, "step": 4650 }, { "epoch": 0.1165, "grad_norm": 27.0, "grad_norm_var": 5.1541015625, "learning_rate": 0.0001, "loss": 7.3193, "loss/crossentropy": 2.085461828112602, "loss/hidden": 3.38671875, "loss/jsd": 0.0, "loss/logits": 0.1905359473079443, "step": 4660 }, { "epoch": 0.11675, "grad_norm": 30.5, "grad_norm_var": 1.5926432291666666, "learning_rate": 0.0001, "loss": 7.3125, "loss/crossentropy": 1.9927285239100456, "loss/hidden": 3.411328125, "loss/jsd": 0.0, "loss/logits": 0.17640038076788186, "step": 4670 }, { "epoch": 0.117, "grad_norm": 33.75, "grad_norm_var": 4.747330729166666, "learning_rate": 0.0001, "loss": 7.469, "loss/crossentropy": 2.1633560836315153, "loss/hidden": 3.324609375, "loss/jsd": 0.0, "loss/logits": 0.1862495567649603, "step": 4680 }, { "epoch": 0.11725, "grad_norm": 28.25, "grad_norm_var": 7.198372395833333, "learning_rate": 0.0001, "loss": 7.4318, "loss/crossentropy": 2.2390024289488792, "loss/hidden": 3.430078125, "loss/jsd": 0.0, "loss/logits": 0.2097862558439374, "step": 4690 }, { "epoch": 0.1175, "grad_norm": 31.375, "grad_norm_var": 5.760872395833333, "learning_rate": 0.0001, "loss": 7.4669, "loss/crossentropy": 2.0608770951628683, "loss/hidden": 3.4390625, "loss/jsd": 0.0, "loss/logits": 0.19615320730954408, "step": 4700 }, { "epoch": 0.11775, "grad_norm": 34.25, "grad_norm_var": 4.1894735190686346e+18, "learning_rate": 0.0001, "loss": 7.4596, "loss/crossentropy": 2.0900899082422257, "loss/hidden": 3.360546875, "loss/jsd": 0.0, "loss/logits": 0.17933723451569678, "step": 4710 }, { "epoch": 0.118, "grad_norm": 29.625, "grad_norm_var": 58.10729166666667, "learning_rate": 0.0001, "loss": 7.3979, "loss/crossentropy": 2.094898019731045, "loss/hidden": 3.46875, "loss/jsd": 0.0, "loss/logits": 0.20720194689929486, "step": 4720 }, { "epoch": 0.11825, "grad_norm": 30.25, "grad_norm_var": 1.98515625, "learning_rate": 0.0001, "loss": 7.4519, "loss/crossentropy": 2.083225329220295, "loss/hidden": 3.426171875, "loss/jsd": 0.0, "loss/logits": 0.20777787994593383, "step": 4730 }, { "epoch": 0.1185, "grad_norm": 30.375, "grad_norm_var": 4.818684895833333, "learning_rate": 0.0001, "loss": 7.4795, "loss/crossentropy": 2.1974314540624618, "loss/hidden": 3.38046875, "loss/jsd": 0.0, "loss/logits": 0.19978385213762523, "step": 4740 }, { "epoch": 0.11875, "grad_norm": 32.5, "grad_norm_var": 3.439322916666667, "learning_rate": 0.0001, "loss": 7.3843, "loss/crossentropy": 1.9562335655093193, "loss/hidden": 3.39140625, "loss/jsd": 0.0, "loss/logits": 0.18924889974296094, "step": 4750 }, { "epoch": 0.119, "grad_norm": 30.625, "grad_norm_var": 1.3015402743274143e+18, "learning_rate": 0.0001, "loss": 7.5729, "loss/crossentropy": 2.0693807609379293, "loss/hidden": 3.339453125, "loss/jsd": 0.0, "loss/logits": 0.18801879994571208, "step": 4760 }, { "epoch": 0.11925, "grad_norm": 35.25, "grad_norm_var": 258.8791015625, "learning_rate": 0.0001, "loss": 7.3013, "loss/crossentropy": 2.0631250627338886, "loss/hidden": 3.3703125, "loss/jsd": 0.0, "loss/logits": 0.18974527437239885, "step": 4770 }, { "epoch": 0.1195, "grad_norm": 28.625, "grad_norm_var": 301.52233072916664, "learning_rate": 0.0001, "loss": 7.4639, "loss/crossentropy": 2.1473939388990404, "loss/hidden": 3.3984375, "loss/jsd": 0.0, "loss/logits": 0.19722200892865657, "step": 4780 }, { "epoch": 0.11975, "grad_norm": 31.125, "grad_norm_var": 25.472330729166668, "learning_rate": 0.0001, "loss": 7.3161, "loss/crossentropy": 2.1767601929605007, "loss/hidden": 3.380078125, "loss/jsd": 0.0, "loss/logits": 0.20041130091995002, "step": 4790 }, { "epoch": 0.12, "grad_norm": 29.5, "grad_norm_var": 2.8580729166666665, "learning_rate": 0.0001, "loss": 7.3077, "loss/crossentropy": 2.0214909121394156, "loss/hidden": 3.38828125, "loss/jsd": 0.0, "loss/logits": 0.19553480856120586, "step": 4800 }, { "epoch": 0.12025, "grad_norm": 34.25, "grad_norm_var": 2.3666015625, "learning_rate": 0.0001, "loss": 7.4537, "loss/crossentropy": 2.092876334488392, "loss/hidden": 3.276171875, "loss/jsd": 0.0, "loss/logits": 0.19079044535756112, "step": 4810 }, { "epoch": 0.1205, "grad_norm": 28.75, "grad_norm_var": 2.1494140625, "learning_rate": 0.0001, "loss": 7.3579, "loss/crossentropy": 2.159788618981838, "loss/hidden": 3.447265625, "loss/jsd": 0.0, "loss/logits": 0.20938555523753166, "step": 4820 }, { "epoch": 0.12075, "grad_norm": 31.625, "grad_norm_var": 1.2635411529466906e+18, "learning_rate": 0.0001, "loss": 7.3822, "loss/crossentropy": 2.221826246380806, "loss/hidden": 3.3140625, "loss/jsd": 0.0, "loss/logits": 0.18899439387023448, "step": 4830 }, { "epoch": 0.121, "grad_norm": 29.375, "grad_norm_var": 7.171875, "learning_rate": 0.0001, "loss": 7.3649, "loss/crossentropy": 2.2076950490474703, "loss/hidden": 3.321875, "loss/jsd": 0.0, "loss/logits": 0.1911212421953678, "step": 4840 }, { "epoch": 0.12125, "grad_norm": 28.875, "grad_norm_var": 5.397916666666666, "learning_rate": 0.0001, "loss": 7.2934, "loss/crossentropy": 2.1398009806871414, "loss/hidden": 3.276953125, "loss/jsd": 0.0, "loss/logits": 0.18104367554187775, "step": 4850 }, { "epoch": 0.1215, "grad_norm": 33.25, "grad_norm_var": 2.292122395833333, "learning_rate": 0.0001, "loss": 7.3944, "loss/crossentropy": 2.0568679124116898, "loss/hidden": 3.31953125, "loss/jsd": 0.0, "loss/logits": 0.19066975675523282, "step": 4860 }, { "epoch": 0.12175, "grad_norm": 31.75, "grad_norm_var": 1.5145182291666666, "learning_rate": 0.0001, "loss": 7.5365, "loss/crossentropy": 2.2600763499736787, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.20988074019551278, "step": 4870 }, { "epoch": 0.122, "grad_norm": 30.125, "grad_norm_var": 0.8442057291666667, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 2.087808459997177, "loss/hidden": 3.397265625, "loss/jsd": 0.0, "loss/logits": 0.20126468148082494, "step": 4880 }, { "epoch": 0.12225, "grad_norm": 29.25, "grad_norm_var": 1.9455729166666667, "learning_rate": 0.0001, "loss": 7.4649, "loss/crossentropy": 2.089573635160923, "loss/hidden": 3.3890625, "loss/jsd": 0.0, "loss/logits": 0.18984669484198094, "step": 4890 }, { "epoch": 0.1225, "grad_norm": 29.125, "grad_norm_var": 2.7552083333333335, "learning_rate": 0.0001, "loss": 7.4894, "loss/crossentropy": 2.1424145482480528, "loss/hidden": 3.47890625, "loss/jsd": 0.0, "loss/logits": 0.20886036530137062, "step": 4900 }, { "epoch": 0.12275, "grad_norm": 31.0, "grad_norm_var": 4.751497395833334, "learning_rate": 0.0001, "loss": 7.5033, "loss/crossentropy": 2.104494086652994, "loss/hidden": 3.41875, "loss/jsd": 0.0, "loss/logits": 0.1945918256416917, "step": 4910 }, { "epoch": 0.123, "grad_norm": 28.125, "grad_norm_var": 5.330989583333333, "learning_rate": 0.0001, "loss": 7.4954, "loss/crossentropy": 2.0843611776828768, "loss/hidden": 3.358203125, "loss/jsd": 0.0, "loss/logits": 0.1925347488373518, "step": 4920 }, { "epoch": 0.12325, "grad_norm": 28.625, "grad_norm_var": 3.8166015625, "learning_rate": 0.0001, "loss": 7.4404, "loss/crossentropy": 2.205425333976746, "loss/hidden": 3.3359375, "loss/jsd": 0.0, "loss/logits": 0.18580489940941333, "step": 4930 }, { "epoch": 0.1235, "grad_norm": 29.375, "grad_norm_var": 14.980208333333334, "learning_rate": 0.0001, "loss": 7.3481, "loss/crossentropy": 1.9896500617265702, "loss/hidden": 3.39609375, "loss/jsd": 0.0, "loss/logits": 0.1904701752588153, "step": 4940 }, { "epoch": 0.12375, "grad_norm": 32.75, "grad_norm_var": 19.178580729166665, "learning_rate": 0.0001, "loss": 7.5252, "loss/crossentropy": 2.1207278318703175, "loss/hidden": 3.484375, "loss/jsd": 0.0, "loss/logits": 0.19760717861354352, "step": 4950 }, { "epoch": 0.124, "grad_norm": 32.5, "grad_norm_var": 17.264583333333334, "learning_rate": 0.0001, "loss": 7.2678, "loss/crossentropy": 1.9271991185843944, "loss/hidden": 3.419921875, "loss/jsd": 0.0, "loss/logits": 0.19860625620931388, "step": 4960 }, { "epoch": 0.12425, "grad_norm": 28.625, "grad_norm_var": 11.196809895833333, "learning_rate": 0.0001, "loss": 7.3703, "loss/crossentropy": 2.0659097760915754, "loss/hidden": 3.287109375, "loss/jsd": 0.0, "loss/logits": 0.18224728610366583, "step": 4970 }, { "epoch": 0.1245, "grad_norm": 37.75, "grad_norm_var": 10.03515625, "learning_rate": 0.0001, "loss": 7.5041, "loss/crossentropy": 1.9809176340699195, "loss/hidden": 3.41796875, "loss/jsd": 0.0, "loss/logits": 0.19965030066668987, "step": 4980 }, { "epoch": 0.12475, "grad_norm": 27.125, "grad_norm_var": 11.567708333333334, "learning_rate": 0.0001, "loss": 7.327, "loss/crossentropy": 2.0197409205138683, "loss/hidden": 3.368359375, "loss/jsd": 0.0, "loss/logits": 0.18525638189166785, "step": 4990 }, { "epoch": 0.125, "grad_norm": 34.75, "grad_norm_var": 8.558268229166666, "learning_rate": 0.0001, "loss": 7.393, "loss/crossentropy": 2.100055608153343, "loss/hidden": 3.391015625, "loss/jsd": 0.0, "loss/logits": 0.19607669236138464, "step": 5000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4287550160044032e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }