diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6033 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.125, + "eval_steps": 2000, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025, + "grad_norm": 31.5, + "learning_rate": 0.0001, + "loss": 7.633, + "loss/crossentropy": 2.065455098450184, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.20220321230590343, + "step": 10 + }, + { + "epoch": 0.0005, + "grad_norm": 35.0, + "grad_norm_var": 2.6895182291666666, + "learning_rate": 0.0001, + "loss": 7.4618, + "loss/crossentropy": 1.9399560801684856, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.19191570337861777, + "step": 20 + }, + { + "epoch": 0.00075, + "grad_norm": 37.5, + "grad_norm_var": 6.579622395833334, + "learning_rate": 0.0001, + "loss": 7.5972, + "loss/crossentropy": 2.130601316690445, + "loss/hidden": 3.38984375, + "loss/jsd": 0.0, + "loss/logits": 0.20188977513462306, + "step": 30 + }, + { + "epoch": 0.001, + "grad_norm": 33.5, + "grad_norm_var": 6.253125, + "learning_rate": 0.0001, + "loss": 7.5917, + "loss/crossentropy": 2.2571407079696657, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.19847887996584176, + "step": 40 + }, + { + "epoch": 0.00125, + "grad_norm": 32.25, + "grad_norm_var": 2.1619140625, + "learning_rate": 0.0001, + "loss": 7.6054, + "loss/crossentropy": 2.1717565625905992, + "loss/hidden": 3.43359375, + "loss/jsd": 0.0, + "loss/logits": 0.20264342725276946, + "step": 50 + }, + { + "epoch": 0.0015, + "grad_norm": 35.5, + "grad_norm_var": 15.786393229166666, + "learning_rate": 0.0001, + "loss": 7.5513, + "loss/crossentropy": 2.070718301087618, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.19855907820165158, + "step": 60 + }, + { + "epoch": 0.00175, + "grad_norm": 31.0, + "grad_norm_var": 12.4625, + "learning_rate": 0.0001, + "loss": 7.5447, + "loss/crossentropy": 2.118075390160084, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.20283062420785428, + "step": 70 + }, + { + "epoch": 0.002, + "grad_norm": 32.25, + "grad_norm_var": 1.2643229166666667, + "learning_rate": 0.0001, + "loss": 7.468, + "loss/crossentropy": 2.0006178975105287, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.18958428762853147, + "step": 80 + }, + { + "epoch": 0.00225, + "grad_norm": 30.625, + "grad_norm_var": 3.470572916666667, + "learning_rate": 0.0001, + "loss": 7.5061, + "loss/crossentropy": 1.9605075903236866, + "loss/hidden": 3.54375, + "loss/jsd": 0.0, + "loss/logits": 0.20559987109154462, + "step": 90 + }, + { + "epoch": 0.0025, + "grad_norm": 31.125, + "grad_norm_var": 6.763541666666667, + "learning_rate": 0.0001, + "loss": 7.4928, + "loss/crossentropy": 2.1205389350652695, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19496036488562823, + "step": 100 + }, + { + "epoch": 0.00275, + "grad_norm": 31.0, + "grad_norm_var": 6.1509765625, + "learning_rate": 0.0001, + "loss": 7.595, + "loss/crossentropy": 2.1240097641944886, + "loss/hidden": 3.43671875, + "loss/jsd": 0.0, + "loss/logits": 0.19564666803926228, + "step": 110 + }, + { + "epoch": 0.003, + "grad_norm": 31.25, + "grad_norm_var": 3.348893229166667, + "learning_rate": 0.0001, + "loss": 7.5329, + "loss/crossentropy": 2.175096944719553, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.21303062327206135, + "step": 120 + }, + { + "epoch": 0.00325, + "grad_norm": 32.0, + "grad_norm_var": 2.8541666666666665, + "learning_rate": 0.0001, + "loss": 7.5536, + "loss/crossentropy": 2.1472502022981645, + "loss/hidden": 3.342578125, + "loss/jsd": 0.0, + "loss/logits": 0.18929538186639547, + "step": 130 + }, + { + "epoch": 0.0035, + "grad_norm": 29.375, + "grad_norm_var": 29.683268229166668, + "learning_rate": 0.0001, + "loss": 7.5191, + "loss/crossentropy": 2.015011890232563, + "loss/hidden": 3.44296875, + "loss/jsd": 0.0, + "loss/logits": 0.20328481420874595, + "step": 140 + }, + { + "epoch": 0.00375, + "grad_norm": 28.75, + "grad_norm_var": 28.74765625, + "learning_rate": 0.0001, + "loss": 7.4158, + "loss/crossentropy": 1.9774167470633983, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.19464388117194176, + "step": 150 + }, + { + "epoch": 0.004, + "grad_norm": 30.875, + "grad_norm_var": 1.3635416666666667, + "learning_rate": 0.0001, + "loss": 7.6354, + "loss/crossentropy": 2.320629420876503, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.20745602920651435, + "step": 160 + }, + { + "epoch": 0.00425, + "grad_norm": 31.5, + "grad_norm_var": 1.0270182291666667, + "learning_rate": 0.0001, + "loss": 7.4137, + "loss/crossentropy": 1.900385806709528, + "loss/hidden": 3.345703125, + "loss/jsd": 0.0, + "loss/logits": 0.16769229620695114, + "step": 170 + }, + { + "epoch": 0.0045, + "grad_norm": 31.25, + "grad_norm_var": 0.9833333333333333, + "learning_rate": 0.0001, + "loss": 7.5763, + "loss/crossentropy": 2.129625543951988, + "loss/hidden": 3.5171875, + "loss/jsd": 0.0, + "loss/logits": 0.2102549459785223, + "step": 180 + }, + { + "epoch": 0.00475, + "grad_norm": 32.25, + "grad_norm_var": 3.05390625, + "learning_rate": 0.0001, + "loss": 7.6166, + "loss/crossentropy": 2.1552532628178596, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.2250068686902523, + "step": 190 + }, + { + "epoch": 0.005, + "grad_norm": 29.625, + "grad_norm_var": 3.8375, + "learning_rate": 0.0001, + "loss": 7.5745, + "loss/crossentropy": 1.9441482461988926, + "loss/hidden": 3.387890625, + "loss/jsd": 0.0, + "loss/logits": 0.195942450594157, + "step": 200 + }, + { + "epoch": 0.00525, + "grad_norm": 32.5, + "grad_norm_var": 18.396875, + "learning_rate": 0.0001, + "loss": 7.5292, + "loss/crossentropy": 1.9941987417638303, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.18264975901693106, + "step": 210 + }, + { + "epoch": 0.0055, + "grad_norm": 31.75, + "grad_norm_var": 20.736393229166666, + "learning_rate": 0.0001, + "loss": 7.4899, + "loss/crossentropy": 2.0191620789468288, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.18100650198757648, + "step": 220 + }, + { + "epoch": 0.00575, + "grad_norm": 30.375, + "grad_norm_var": 2.342643229166667, + "learning_rate": 0.0001, + "loss": 7.5199, + "loss/crossentropy": 2.001779730618, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.17959208656102418, + "step": 230 + }, + { + "epoch": 0.006, + "grad_norm": 30.75, + "grad_norm_var": 1.271875, + "learning_rate": 0.0001, + "loss": 7.6842, + "loss/crossentropy": 2.1846971333026888, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.2059234745800495, + "step": 240 + }, + { + "epoch": 0.00625, + "grad_norm": 29.5, + "grad_norm_var": 5.688541666666667, + "learning_rate": 0.0001, + "loss": 7.5196, + "loss/crossentropy": 2.174124576151371, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.20000722594559192, + "step": 250 + }, + { + "epoch": 0.0065, + "grad_norm": 28.75, + "grad_norm_var": 1.9572265625, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 1.9285166233778, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.18449910767376423, + "step": 260 + }, + { + "epoch": 0.00675, + "grad_norm": 33.5, + "grad_norm_var": 2.0999348958333335, + "learning_rate": 0.0001, + "loss": 7.5877, + "loss/crossentropy": 2.0323276594281197, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.19395631980150937, + "step": 270 + }, + { + "epoch": 0.007, + "grad_norm": 30.5, + "grad_norm_var": 2.15390625, + "learning_rate": 0.0001, + "loss": 7.5791, + "loss/crossentropy": 2.126656140387058, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.21661139875650406, + "step": 280 + }, + { + "epoch": 0.00725, + "grad_norm": 29.5, + "grad_norm_var": 3.193489583333333, + "learning_rate": 0.0001, + "loss": 7.5587, + "loss/crossentropy": 2.200097793340683, + "loss/hidden": 3.529296875, + "loss/jsd": 0.0, + "loss/logits": 0.21046234332025052, + "step": 290 + }, + { + "epoch": 0.0075, + "grad_norm": 26.75, + "grad_norm_var": 4.27265625, + "learning_rate": 0.0001, + "loss": 7.5404, + "loss/crossentropy": 2.1184144005179406, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.20949590150266886, + "step": 300 + }, + { + "epoch": 0.00775, + "grad_norm": 33.0, + "grad_norm_var": 3.3643229166666666, + "learning_rate": 0.0001, + "loss": 7.5628, + "loss/crossentropy": 1.9984030593186617, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.18789457948878407, + "step": 310 + }, + { + "epoch": 0.008, + "grad_norm": 32.5, + "grad_norm_var": 2.5645182291666666, + "learning_rate": 0.0001, + "loss": 7.5695, + "loss/crossentropy": 2.143594169616699, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.19360470157116652, + "step": 320 + }, + { + "epoch": 0.00825, + "grad_norm": 29.375, + "grad_norm_var": 1.8749348958333334, + "learning_rate": 0.0001, + "loss": 7.3627, + "loss/crossentropy": 2.1077703177928924, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.19771252572536469, + "step": 330 + }, + { + "epoch": 0.0085, + "grad_norm": 29.75, + "grad_norm_var": 1.5978515625, + "learning_rate": 0.0001, + "loss": 7.4192, + "loss/crossentropy": 2.0583472289144993, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.20189273860305548, + "step": 340 + }, + { + "epoch": 0.00875, + "grad_norm": 29.875, + "grad_norm_var": 1.2872395833333334, + "learning_rate": 0.0001, + "loss": 7.5432, + "loss/crossentropy": 2.0804511278867723, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19735569059848784, + "step": 350 + }, + { + "epoch": 0.009, + "grad_norm": 30.5, + "grad_norm_var": 18.731184895833334, + "learning_rate": 0.0001, + "loss": 7.4948, + "loss/crossentropy": 2.0466629534959795, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.18366040643304588, + "step": 360 + }, + { + "epoch": 0.00925, + "grad_norm": 30.875, + "grad_norm_var": 25.9916015625, + "learning_rate": 0.0001, + "loss": 7.5081, + "loss/crossentropy": 1.9005662694573402, + "loss/hidden": 3.501171875, + "loss/jsd": 0.0, + "loss/logits": 0.1900689721107483, + "step": 370 + }, + { + "epoch": 0.0095, + "grad_norm": 28.75, + "grad_norm_var": 2.451041666666667, + "learning_rate": 0.0001, + "loss": 7.4305, + "loss/crossentropy": 2.0674299761652946, + "loss/hidden": 3.517578125, + "loss/jsd": 0.0, + "loss/logits": 0.21062961965799332, + "step": 380 + }, + { + "epoch": 0.00975, + "grad_norm": 31.25, + "grad_norm_var": 5.645247395833334, + "learning_rate": 0.0001, + "loss": 7.5168, + "loss/crossentropy": 2.0279919117689134, + "loss/hidden": 3.503125, + "loss/jsd": 0.0, + "loss/logits": 0.20519332773983479, + "step": 390 + }, + { + "epoch": 0.01, + "grad_norm": 31.125, + "grad_norm_var": 5.928125, + "learning_rate": 0.0001, + "loss": 7.4985, + "loss/crossentropy": 2.0427632443606853, + "loss/hidden": 3.53125, + "loss/jsd": 0.0, + "loss/logits": 0.20287631042301654, + "step": 400 + }, + { + "epoch": 0.01025, + "grad_norm": 38.5, + "grad_norm_var": 438.43515625, + "learning_rate": 0.0001, + "loss": 7.5633, + "loss/crossentropy": 2.199043881893158, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.21130343191325665, + "step": 410 + }, + { + "epoch": 0.0105, + "grad_norm": 30.875, + "grad_norm_var": 43.14140625, + "learning_rate": 0.0001, + "loss": 7.4835, + "loss/crossentropy": 1.9102243572473525, + "loss/hidden": 3.42578125, + "loss/jsd": 0.0, + "loss/logits": 0.1895731385797262, + "step": 420 + }, + { + "epoch": 0.01075, + "grad_norm": 31.75, + "grad_norm_var": 5.658268229166667, + "learning_rate": 0.0001, + "loss": 7.3897, + "loss/crossentropy": 2.159160128980875, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.20280379485338926, + "step": 430 + }, + { + "epoch": 0.011, + "grad_norm": 28.375, + "grad_norm_var": 16.3375, + "learning_rate": 0.0001, + "loss": 7.5463, + "loss/crossentropy": 2.1217672407627104, + "loss/hidden": 3.545703125, + "loss/jsd": 0.0, + "loss/logits": 0.23856931366026402, + "step": 440 + }, + { + "epoch": 0.01125, + "grad_norm": 30.5, + "grad_norm_var": 17.098372395833334, + "learning_rate": 0.0001, + "loss": 7.5225, + "loss/crossentropy": 1.969854873791337, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.19548849146813155, + "step": 450 + }, + { + "epoch": 0.0115, + "grad_norm": 29.875, + "grad_norm_var": 2.5677083333333335, + "learning_rate": 0.0001, + "loss": 7.5046, + "loss/crossentropy": 2.121321603655815, + "loss/hidden": 3.476171875, + "loss/jsd": 0.0, + "loss/logits": 0.19364523217082025, + "step": 460 + }, + { + "epoch": 0.01175, + "grad_norm": 32.25, + "grad_norm_var": 8.585416666666667, + "learning_rate": 0.0001, + "loss": 7.4558, + "loss/crossentropy": 1.9360710382461548, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.1893781816586852, + "step": 470 + }, + { + "epoch": 0.012, + "grad_norm": 29.875, + "grad_norm_var": 3.417122395833333, + "learning_rate": 0.0001, + "loss": 7.531, + "loss/crossentropy": 2.082458943128586, + "loss/hidden": 3.471875, + "loss/jsd": 0.0, + "loss/logits": 0.2220946006476879, + "step": 480 + }, + { + "epoch": 0.01225, + "grad_norm": 31.0, + "grad_norm_var": 48.96640625, + "learning_rate": 0.0001, + "loss": 7.5651, + "loss/crossentropy": 2.1382531195878984, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.20847559962421655, + "step": 490 + }, + { + "epoch": 0.0125, + "grad_norm": 29.875, + "grad_norm_var": 49.2666015625, + "learning_rate": 0.0001, + "loss": 7.5679, + "loss/crossentropy": 2.0875915244221686, + "loss/hidden": 3.33125, + "loss/jsd": 0.0, + "loss/logits": 0.1850985599681735, + "step": 500 + }, + { + "epoch": 0.01275, + "grad_norm": 31.875, + "grad_norm_var": 1.45, + "learning_rate": 0.0001, + "loss": 7.5263, + "loss/crossentropy": 2.182442346215248, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.19555890336632728, + "step": 510 + }, + { + "epoch": 0.013, + "grad_norm": 34.0, + "grad_norm_var": 1.6931640625, + "learning_rate": 0.0001, + "loss": 7.5209, + "loss/crossentropy": 1.9812136888504028, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.1965757070109248, + "step": 520 + }, + { + "epoch": 0.01325, + "grad_norm": 31.0, + "grad_norm_var": 2.101822916666667, + "learning_rate": 0.0001, + "loss": 7.6059, + "loss/crossentropy": 2.0372241511940956, + "loss/hidden": 3.564453125, + "loss/jsd": 0.0, + "loss/logits": 0.204646560549736, + "step": 530 + }, + { + "epoch": 0.0135, + "grad_norm": 29.125, + "grad_norm_var": 20.071875, + "learning_rate": 0.0001, + "loss": 7.5725, + "loss/crossentropy": 2.155761349201202, + "loss/hidden": 3.4125, + "loss/jsd": 0.0, + "loss/logits": 0.19602423422038556, + "step": 540 + }, + { + "epoch": 0.01375, + "grad_norm": 29.125, + "grad_norm_var": 20.506705729166665, + "learning_rate": 0.0001, + "loss": 7.5842, + "loss/crossentropy": 1.8869566857814788, + "loss/hidden": 3.437890625, + "loss/jsd": 0.0, + "loss/logits": 0.20957522764801978, + "step": 550 + }, + { + "epoch": 0.014, + "grad_norm": 30.625, + "grad_norm_var": 10.025455729166667, + "learning_rate": 0.0001, + "loss": 7.4975, + "loss/crossentropy": 2.0370677679777147, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.19026046600192786, + "step": 560 + }, + { + "epoch": 0.01425, + "grad_norm": 33.0, + "grad_norm_var": 2.2270833333333333, + "learning_rate": 0.0001, + "loss": 7.5688, + "loss/crossentropy": 2.1931444257497787, + "loss/hidden": 3.415234375, + "loss/jsd": 0.0, + "loss/logits": 0.2036376902833581, + "step": 570 + }, + { + "epoch": 0.0145, + "grad_norm": 35.0, + "grad_norm_var": 3.5681640625, + "learning_rate": 0.0001, + "loss": 7.478, + "loss/crossentropy": 2.061052493005991, + "loss/hidden": 3.478125, + "loss/jsd": 0.0, + "loss/logits": 0.2282864760607481, + "step": 580 + }, + { + "epoch": 0.01475, + "grad_norm": 32.5, + "grad_norm_var": 2.8705729166666667, + "learning_rate": 0.0001, + "loss": 7.5957, + "loss/crossentropy": 2.0078392371535303, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.19647251404821872, + "step": 590 + }, + { + "epoch": 0.015, + "grad_norm": 30.25, + "grad_norm_var": 31.449934895833334, + "learning_rate": 0.0001, + "loss": 7.5096, + "loss/crossentropy": 2.0417068414390087, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.19782953998073935, + "step": 600 + }, + { + "epoch": 0.01525, + "grad_norm": 30.5, + "grad_norm_var": 26.253059895833335, + "learning_rate": 0.0001, + "loss": 7.5368, + "loss/crossentropy": 2.1738049775362014, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.1996332859620452, + "step": 610 + }, + { + "epoch": 0.0155, + "grad_norm": 30.125, + "grad_norm_var": 2.334375, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 1.7587297886610032, + "loss/hidden": 3.475390625, + "loss/jsd": 0.0, + "loss/logits": 0.18938990794122218, + "step": 620 + }, + { + "epoch": 0.01575, + "grad_norm": 29.25, + "grad_norm_var": 27.393684895833335, + "learning_rate": 0.0001, + "loss": 7.4833, + "loss/crossentropy": 1.9551145888864994, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.20075901364907622, + "step": 630 + }, + { + "epoch": 0.016, + "grad_norm": 29.75, + "grad_norm_var": 29.6947265625, + "learning_rate": 0.0001, + "loss": 7.4608, + "loss/crossentropy": 2.128718316555023, + "loss/hidden": 3.3625, + "loss/jsd": 0.0, + "loss/logits": 0.19077460393309592, + "step": 640 + }, + { + "epoch": 0.01625, + "grad_norm": 29.75, + "grad_norm_var": 27.322330729166666, + "learning_rate": 0.0001, + "loss": 7.6033, + "loss/crossentropy": 1.9678708665072917, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.18875791020691396, + "step": 650 + }, + { + "epoch": 0.0165, + "grad_norm": 30.375, + "grad_norm_var": 3.129622395833333, + "learning_rate": 0.0001, + "loss": 7.3873, + "loss/crossentropy": 1.9582339562475681, + "loss/hidden": 3.34765625, + "loss/jsd": 0.0, + "loss/logits": 0.18309127148240806, + "step": 660 + }, + { + "epoch": 0.01675, + "grad_norm": 32.75, + "grad_norm_var": 2.7009765625, + "learning_rate": 0.0001, + "loss": 7.4913, + "loss/crossentropy": 2.0773802563548087, + "loss/hidden": 3.505078125, + "loss/jsd": 0.0, + "loss/logits": 0.20910798981785775, + "step": 670 + }, + { + "epoch": 0.017, + "grad_norm": 34.0, + "grad_norm_var": 3.3854166666666665, + "learning_rate": 0.0001, + "loss": 7.4847, + "loss/crossentropy": 2.12913373708725, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.201920267008245, + "step": 680 + }, + { + "epoch": 0.01725, + "grad_norm": 30.75, + "grad_norm_var": 1.7176432291666666, + "learning_rate": 0.0001, + "loss": 7.5065, + "loss/crossentropy": 1.9141538538038732, + "loss/hidden": 3.44921875, + "loss/jsd": 0.0, + "loss/logits": 0.1841401271522045, + "step": 690 + }, + { + "epoch": 0.0175, + "grad_norm": 31.0, + "grad_norm_var": 1.6374348958333333, + "learning_rate": 0.0001, + "loss": 7.5897, + "loss/crossentropy": 2.207232800126076, + "loss/hidden": 3.399609375, + "loss/jsd": 0.0, + "loss/logits": 0.21376523859798907, + "step": 700 + }, + { + "epoch": 0.01775, + "grad_norm": 32.75, + "grad_norm_var": 2.3655598958333335, + "learning_rate": 0.0001, + "loss": 7.5075, + "loss/crossentropy": 2.03845998942852, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1920805646572262, + "step": 710 + }, + { + "epoch": 0.018, + "grad_norm": 32.5, + "grad_norm_var": 1.3893229166666667, + "learning_rate": 0.0001, + "loss": 7.4669, + "loss/crossentropy": 2.054341807588935, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.19716067584231495, + "step": 720 + }, + { + "epoch": 0.01825, + "grad_norm": 31.625, + "grad_norm_var": 3.54140625, + "learning_rate": 0.0001, + "loss": 7.517, + "loss/crossentropy": 2.2111608639359472, + "loss/hidden": 3.409765625, + "loss/jsd": 0.0, + "loss/logits": 0.20262118335813284, + "step": 730 + }, + { + "epoch": 0.0185, + "grad_norm": 29.125, + "grad_norm_var": 4.692122395833334, + "learning_rate": 0.0001, + "loss": 7.4784, + "loss/crossentropy": 2.0551758617162705, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.20378697756677866, + "step": 740 + }, + { + "epoch": 0.01875, + "grad_norm": 33.0, + "grad_norm_var": 4.295572916666667, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 2.128055375814438, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19904747987166047, + "step": 750 + }, + { + "epoch": 0.019, + "grad_norm": 6106906624.0, + "grad_norm_var": 2.3308942582349476e+18, + "learning_rate": 0.0001, + "loss": 7.4633, + "loss/crossentropy": 2.248567137122154, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.19723597317934036, + "step": 760 + }, + { + "epoch": 0.01925, + "grad_norm": 28.5, + "grad_norm_var": 2.330894258158611e+18, + "learning_rate": 0.0001, + "loss": 7.4542, + "loss/crossentropy": 2.132212319970131, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18174959290772677, + "step": 770 + }, + { + "epoch": 0.0195, + "grad_norm": 36.5, + "grad_norm_var": 4.833333333333333, + "learning_rate": 0.0001, + "loss": 7.465, + "loss/crossentropy": 2.046277052164078, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.21161840241402388, + "step": 780 + }, + { + "epoch": 0.01975, + "grad_norm": 32.75, + "grad_norm_var": 5.137434895833334, + "learning_rate": 0.0001, + "loss": 7.4171, + "loss/crossentropy": 2.058088332414627, + "loss/hidden": 3.315234375, + "loss/jsd": 0.0, + "loss/logits": 0.1815673651173711, + "step": 790 + }, + { + "epoch": 0.02, + "grad_norm": 30.125, + "grad_norm_var": 12.37265625, + "learning_rate": 0.0001, + "loss": 7.4153, + "loss/crossentropy": 2.064726157486439, + "loss/hidden": 3.515625, + "loss/jsd": 0.0, + "loss/logits": 0.19402222614735365, + "step": 800 + }, + { + "epoch": 0.02025, + "grad_norm": 32.0, + "grad_norm_var": 12.240625, + "learning_rate": 0.0001, + "loss": 7.3739, + "loss/crossentropy": 2.0926051691174505, + "loss/hidden": 3.476953125, + "loss/jsd": 0.0, + "loss/logits": 0.21017331834882497, + "step": 810 + }, + { + "epoch": 0.0205, + "grad_norm": 31.875, + "grad_norm_var": 3.6853515625, + "learning_rate": 0.0001, + "loss": 7.409, + "loss/crossentropy": 2.016859006881714, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.20363395065069198, + "step": 820 + }, + { + "epoch": 0.02075, + "grad_norm": 34.0, + "grad_norm_var": 278.1108723958333, + "learning_rate": 0.0001, + "loss": 7.6725, + "loss/crossentropy": 2.03957434669137, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.19866096526384353, + "step": 830 + }, + { + "epoch": 0.021, + "grad_norm": 35.75, + "grad_norm_var": 281.2239583333333, + "learning_rate": 0.0001, + "loss": 7.4058, + "loss/crossentropy": 2.1190530106425287, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19663097113370895, + "step": 840 + }, + { + "epoch": 0.02125, + "grad_norm": 32.25, + "grad_norm_var": 4.044791666666667, + "learning_rate": 0.0001, + "loss": 7.4687, + "loss/crossentropy": 2.1552326917648315, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19604418501257898, + "step": 850 + }, + { + "epoch": 0.0215, + "grad_norm": 37.25, + "grad_norm_var": 2.7587362193217157e+18, + "learning_rate": 0.0001, + "loss": 7.5552, + "loss/crossentropy": 2.1164004117250443, + "loss/hidden": 3.375, + "loss/jsd": 0.0, + "loss/logits": 0.19724889248609542, + "step": 860 + }, + { + "epoch": 0.02175, + "grad_norm": 35.25, + "grad_norm_var": 2.758736219342478e+18, + "learning_rate": 0.0001, + "loss": 7.5021, + "loss/crossentropy": 2.036998500674963, + "loss/hidden": 3.298828125, + "loss/jsd": 0.0, + "loss/logits": 0.18320635841228067, + "step": 870 + }, + { + "epoch": 0.022, + "grad_norm": 37.0, + "grad_norm_var": 16.9541015625, + "learning_rate": 0.0001, + "loss": 7.5059, + "loss/crossentropy": 1.9707016140222549, + "loss/hidden": 3.36328125, + "loss/jsd": 0.0, + "loss/logits": 0.20436920877546072, + "step": 880 + }, + { + "epoch": 0.02225, + "grad_norm": 31.375, + "grad_norm_var": 30.538541666666667, + "learning_rate": 0.0001, + "loss": 7.4935, + "loss/crossentropy": 2.206394499540329, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.20495780408382416, + "step": 890 + }, + { + "epoch": 0.0225, + "grad_norm": 29.875, + "grad_norm_var": 28.020833333333332, + "learning_rate": 0.0001, + "loss": 7.4823, + "loss/crossentropy": 2.091763325035572, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20592593550682067, + "step": 900 + }, + { + "epoch": 0.02275, + "grad_norm": 31.875, + "grad_norm_var": 3.5645182291666666, + "learning_rate": 0.0001, + "loss": 7.422, + "loss/crossentropy": 1.9740761511027813, + "loss/hidden": 3.494921875, + "loss/jsd": 0.0, + "loss/logits": 0.2015986293554306, + "step": 910 + }, + { + "epoch": 0.023, + "grad_norm": 32.0, + "grad_norm_var": 56.256184895833336, + "learning_rate": 0.0001, + "loss": 7.4528, + "loss/crossentropy": 2.030415116250515, + "loss/hidden": 3.205078125, + "loss/jsd": 0.0, + "loss/logits": 0.1614784031175077, + "step": 920 + }, + { + "epoch": 0.02325, + "grad_norm": 30.0, + "grad_norm_var": 57.1619140625, + "learning_rate": 0.0001, + "loss": 7.3713, + "loss/crossentropy": 2.0250086903572084, + "loss/hidden": 3.455859375, + "loss/jsd": 0.0, + "loss/logits": 0.19023355115205048, + "step": 930 + }, + { + "epoch": 0.0235, + "grad_norm": 30.625, + "grad_norm_var": 1.3830729166666667, + "learning_rate": 0.0001, + "loss": 7.5277, + "loss/crossentropy": 2.222324788570404, + "loss/hidden": 3.366796875, + "loss/jsd": 0.0, + "loss/logits": 0.19078677501529456, + "step": 940 + }, + { + "epoch": 0.02375, + "grad_norm": 31.0, + "grad_norm_var": 3.1455729166666666, + "learning_rate": 0.0001, + "loss": 7.5086, + "loss/crossentropy": 2.1299516543745995, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.21310927756130696, + "step": 950 + }, + { + "epoch": 0.024, + "grad_norm": 29.875, + "grad_norm_var": 8.883072916666666, + "learning_rate": 0.0001, + "loss": 7.5579, + "loss/crossentropy": 2.0535727672278883, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.18507701791822911, + "step": 960 + }, + { + "epoch": 0.02425, + "grad_norm": 32.75, + "grad_norm_var": 2.5916015625, + "learning_rate": 0.0001, + "loss": 7.537, + "loss/crossentropy": 2.1785535484552385, + "loss/hidden": 3.309765625, + "loss/jsd": 0.0, + "loss/logits": 0.1955953363329172, + "step": 970 + }, + { + "epoch": 0.0245, + "grad_norm": 36.5, + "grad_norm_var": 6.852083333333334, + "learning_rate": 0.0001, + "loss": 7.5091, + "loss/crossentropy": 2.0967498391866686, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.2146583067253232, + "step": 980 + }, + { + "epoch": 0.02475, + "grad_norm": 29.625, + "grad_norm_var": 4.325455729166666, + "learning_rate": 0.0001, + "loss": 7.5901, + "loss/crossentropy": 2.1134474128484726, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19056662563234567, + "step": 990 + }, + { + "epoch": 0.025, + "grad_norm": 42.0, + "grad_norm_var": 4.1552039405313587e+18, + "learning_rate": 0.0001, + "loss": 7.6082, + "loss/crossentropy": 2.0916516482830048, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.19376826155930757, + "step": 1000 + }, + { + "epoch": 0.02525, + "grad_norm": 29.625, + "grad_norm_var": 4.1552039416015355e+18, + "learning_rate": 0.0001, + "loss": 7.4528, + "loss/crossentropy": 2.003750593960285, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.18129821103066207, + "step": 1010 + }, + { + "epoch": 0.0255, + "grad_norm": 35.25, + "grad_norm_var": 24.095572916666665, + "learning_rate": 0.0001, + "loss": 7.5395, + "loss/crossentropy": 2.0453194856643675, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.199107607267797, + "step": 1020 + }, + { + "epoch": 0.02575, + "grad_norm": 32.25, + "grad_norm_var": 19.5259765625, + "learning_rate": 0.0001, + "loss": 7.31, + "loss/crossentropy": 2.1016619503498077, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.184703135676682, + "step": 1030 + }, + { + "epoch": 0.026, + "grad_norm": 30.75, + "grad_norm_var": 1.87890625, + "learning_rate": 0.0001, + "loss": 7.5425, + "loss/crossentropy": 2.1467826470732687, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.20074132941663264, + "step": 1040 + }, + { + "epoch": 0.02625, + "grad_norm": 30.625, + "grad_norm_var": 0.7452473958333333, + "learning_rate": 0.0001, + "loss": 7.4114, + "loss/crossentropy": 2.049474111199379, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.20267941821366547, + "step": 1050 + }, + { + "epoch": 0.0265, + "grad_norm": 31.75, + "grad_norm_var": 3.124739583333333, + "learning_rate": 0.0001, + "loss": 7.4845, + "loss/crossentropy": 2.036583887040615, + "loss/hidden": 3.391796875, + "loss/jsd": 0.0, + "loss/logits": 0.1893632340244949, + "step": 1060 + }, + { + "epoch": 0.02675, + "grad_norm": 40.75, + "grad_norm_var": 3.405847188209664e+18, + "learning_rate": 0.0001, + "loss": 7.3982, + "loss/crossentropy": 2.124411530792713, + "loss/hidden": 3.4484375, + "loss/jsd": 0.0, + "loss/logits": 0.19454579129815103, + "step": 1070 + }, + { + "epoch": 0.027, + "grad_norm": 28.25, + "grad_norm_var": 3.4058471885941417e+18, + "learning_rate": 0.0001, + "loss": 7.3928, + "loss/crossentropy": 2.0034691862761975, + "loss/hidden": 3.503515625, + "loss/jsd": 0.0, + "loss/logits": 0.21349683087319135, + "step": 1080 + }, + { + "epoch": 0.02725, + "grad_norm": 29.875, + "grad_norm_var": 4.88515625, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 1.9183670297265052, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.19249978363513948, + "step": 1090 + }, + { + "epoch": 0.0275, + "grad_norm": 30.5, + "grad_norm_var": 3.2728515625, + "learning_rate": 0.0001, + "loss": 7.37, + "loss/crossentropy": 2.145428071916103, + "loss/hidden": 3.35703125, + "loss/jsd": 0.0, + "loss/logits": 0.19729665387421846, + "step": 1100 + }, + { + "epoch": 0.02775, + "grad_norm": 31.25, + "grad_norm_var": 2.34765625, + "learning_rate": 0.0001, + "loss": 7.4772, + "loss/crossentropy": 2.10652961358428, + "loss/hidden": 3.398046875, + "loss/jsd": 0.0, + "loss/logits": 0.19585925145074726, + "step": 1110 + }, + { + "epoch": 0.028, + "grad_norm": 31.25, + "grad_norm_var": 2.434477049308093e+18, + "learning_rate": 0.0001, + "loss": 7.4016, + "loss/crossentropy": 1.9645449101924897, + "loss/hidden": 3.44453125, + "loss/jsd": 0.0, + "loss/logits": 0.19977953620254993, + "step": 1120 + }, + { + "epoch": 0.02825, + "grad_norm": 32.0, + "grad_norm_var": 2.4344770492950907e+18, + "learning_rate": 0.0001, + "loss": 7.4453, + "loss/crossentropy": 2.131172102689743, + "loss/hidden": 3.383984375, + "loss/jsd": 0.0, + "loss/logits": 0.2083016105927527, + "step": 1130 + }, + { + "epoch": 0.0285, + "grad_norm": 32.75, + "grad_norm_var": 3.7080729166666666, + "learning_rate": 0.0001, + "loss": 7.4009, + "loss/crossentropy": 2.003016713261604, + "loss/hidden": 3.34453125, + "loss/jsd": 0.0, + "loss/logits": 0.18665643623098732, + "step": 1140 + }, + { + "epoch": 0.02875, + "grad_norm": 30.875, + "grad_norm_var": 1.34765625, + "learning_rate": 0.0001, + "loss": 7.5648, + "loss/crossentropy": 2.0709651306271555, + "loss/hidden": 3.45703125, + "loss/jsd": 0.0, + "loss/logits": 0.18793081305921078, + "step": 1150 + }, + { + "epoch": 0.029, + "grad_norm": 32.25, + "grad_norm_var": 2.1582682291666666, + "learning_rate": 0.0001, + "loss": 7.4644, + "loss/crossentropy": 2.06434089243412, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.2109043262898922, + "step": 1160 + }, + { + "epoch": 0.02925, + "grad_norm": 31.375, + "grad_norm_var": 2.4010416666666665, + "learning_rate": 0.0001, + "loss": 7.4403, + "loss/crossentropy": 2.0107607185840606, + "loss/hidden": 3.498046875, + "loss/jsd": 0.0, + "loss/logits": 0.20349722560495137, + "step": 1170 + }, + { + "epoch": 0.0295, + "grad_norm": 33.25, + "grad_norm_var": 1.2260416666666667, + "learning_rate": 0.0001, + "loss": 7.4412, + "loss/crossentropy": 2.096436749398708, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.20087064132094384, + "step": 1180 + }, + { + "epoch": 0.02975, + "grad_norm": 29.75, + "grad_norm_var": 1.8046223958333334, + "learning_rate": 0.0001, + "loss": 7.4458, + "loss/crossentropy": 1.972258360683918, + "loss/hidden": 3.583984375, + "loss/jsd": 0.0, + "loss/logits": 0.20998958311975002, + "step": 1190 + }, + { + "epoch": 0.03, + "grad_norm": 33.75, + "grad_norm_var": 3.7395833333333335, + "learning_rate": 0.0001, + "loss": 7.3931, + "loss/crossentropy": 1.8556599006056786, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.19810242671519518, + "step": 1200 + }, + { + "epoch": 0.03025, + "grad_norm": 29.0, + "grad_norm_var": 9.394791666666666, + "learning_rate": 0.0001, + "loss": 7.5849, + "loss/crossentropy": 2.0611833460628985, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.19216072149574756, + "step": 1210 + }, + { + "epoch": 0.0305, + "grad_norm": 31.75, + "grad_norm_var": 3.26640625, + "learning_rate": 0.0001, + "loss": 7.4844, + "loss/crossentropy": 2.0546294137835504, + "loss/hidden": 3.58828125, + "loss/jsd": 0.0, + "loss/logits": 0.21588555499911308, + "step": 1220 + }, + { + "epoch": 0.03075, + "grad_norm": 31.625, + "grad_norm_var": 2.3968098958333335, + "learning_rate": 0.0001, + "loss": 7.4858, + "loss/crossentropy": 2.0615282475948336, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.206529095210135, + "step": 1230 + }, + { + "epoch": 0.031, + "grad_norm": 32.0, + "grad_norm_var": 1.6124348958333334, + "learning_rate": 0.0001, + "loss": 7.4647, + "loss/crossentropy": 1.9786661133170127, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.17899234425276517, + "step": 1240 + }, + { + "epoch": 0.03125, + "grad_norm": 5838471168.0, + "grad_norm_var": 2.1304840753447437e+18, + "learning_rate": 0.0001, + "loss": 7.4926, + "loss/crossentropy": 2.04936410933733, + "loss/hidden": 3.714453125, + "loss/jsd": 0.0, + "loss/logits": 0.1995564555749297, + "step": 1250 + }, + { + "epoch": 0.0315, + "grad_norm": 31.25, + "grad_norm_var": 2.1304840747304878e+18, + "learning_rate": 0.0001, + "loss": 7.5078, + "loss/crossentropy": 2.1189576953649523, + "loss/hidden": 3.43515625, + "loss/jsd": 0.0, + "loss/logits": 0.19967459067702292, + "step": 1260 + }, + { + "epoch": 0.03175, + "grad_norm": 30.5, + "grad_norm_var": 3.178580729166667, + "learning_rate": 0.0001, + "loss": 7.4255, + "loss/crossentropy": 2.163596141338348, + "loss/hidden": 3.4546875, + "loss/jsd": 0.0, + "loss/logits": 0.19321363251656293, + "step": 1270 + }, + { + "epoch": 0.032, + "grad_norm": 33.25, + "grad_norm_var": 2.1639973958333334, + "learning_rate": 0.0001, + "loss": 7.4609, + "loss/crossentropy": 1.9938266813755035, + "loss/hidden": 3.351953125, + "loss/jsd": 0.0, + "loss/logits": 0.18334759529680014, + "step": 1280 + }, + { + "epoch": 0.03225, + "grad_norm": 29.375, + "grad_norm_var": 1.67890625, + "learning_rate": 0.0001, + "loss": 7.4652, + "loss/crossentropy": 2.161333967000246, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19740422032773494, + "step": 1290 + }, + { + "epoch": 0.0325, + "grad_norm": 32.75, + "grad_norm_var": 3.0385416666666667, + "learning_rate": 0.0001, + "loss": 7.3146, + "loss/crossentropy": 2.0165325723588468, + "loss/hidden": 3.49921875, + "loss/jsd": 0.0, + "loss/logits": 0.19117104820907116, + "step": 1300 + }, + { + "epoch": 0.03275, + "grad_norm": 28.25, + "grad_norm_var": 9.158072916666667, + "learning_rate": 0.0001, + "loss": 7.4955, + "loss/crossentropy": 2.124955786764622, + "loss/hidden": 3.491015625, + "loss/jsd": 0.0, + "loss/logits": 0.19802952595055104, + "step": 1310 + }, + { + "epoch": 0.033, + "grad_norm": 30.75, + "grad_norm_var": 2.4535807291666667, + "learning_rate": 0.0001, + "loss": 7.4311, + "loss/crossentropy": 2.018800371140242, + "loss/hidden": 3.542578125, + "loss/jsd": 0.0, + "loss/logits": 0.2196814114227891, + "step": 1320 + }, + { + "epoch": 0.03325, + "grad_norm": 31.375, + "grad_norm_var": 2.39375, + "learning_rate": 0.0001, + "loss": 7.5164, + "loss/crossentropy": 2.0520452961325644, + "loss/hidden": 3.454296875, + "loss/jsd": 0.0, + "loss/logits": 0.2013697015121579, + "step": 1330 + }, + { + "epoch": 0.0335, + "grad_norm": 32.5, + "grad_norm_var": 1.0431640625, + "learning_rate": 0.0001, + "loss": 7.5302, + "loss/crossentropy": 2.12932348549366, + "loss/hidden": 3.525, + "loss/jsd": 0.0, + "loss/logits": 0.20245677568018436, + "step": 1340 + }, + { + "epoch": 0.03375, + "grad_norm": 30.625, + "grad_norm_var": 3.3900390625, + "learning_rate": 0.0001, + "loss": 7.5292, + "loss/crossentropy": 2.031618994474411, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.19062725063413383, + "step": 1350 + }, + { + "epoch": 0.034, + "grad_norm": 32.0, + "grad_norm_var": 3.3447265625, + "learning_rate": 0.0001, + "loss": 7.5755, + "loss/crossentropy": 2.2257011234760284, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.1979327043518424, + "step": 1360 + }, + { + "epoch": 0.03425, + "grad_norm": 30.625, + "grad_norm_var": 3.3421223958333335, + "learning_rate": 0.0001, + "loss": 7.4219, + "loss/crossentropy": 2.155778780579567, + "loss/hidden": 3.31796875, + "loss/jsd": 0.0, + "loss/logits": 0.19018295016139747, + "step": 1370 + }, + { + "epoch": 0.0345, + "grad_norm": 30.25, + "grad_norm_var": 2.5872395833333335, + "learning_rate": 0.0001, + "loss": 7.4637, + "loss/crossentropy": 2.058405503630638, + "loss/hidden": 3.39296875, + "loss/jsd": 0.0, + "loss/logits": 0.2114524593576789, + "step": 1380 + }, + { + "epoch": 0.03475, + "grad_norm": 32.5, + "grad_norm_var": 3.2994140625, + "learning_rate": 0.0001, + "loss": 7.5834, + "loss/crossentropy": 2.1654782712459566, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.2024593001231551, + "step": 1390 + }, + { + "epoch": 0.035, + "grad_norm": 31.125, + "grad_norm_var": 12.812239583333334, + "learning_rate": 0.0001, + "loss": 7.4442, + "loss/crossentropy": 2.0921876966953277, + "loss/hidden": 3.286328125, + "loss/jsd": 0.0, + "loss/logits": 0.19270132519304753, + "step": 1400 + }, + { + "epoch": 0.03525, + "grad_norm": 29.25, + "grad_norm_var": 1.5108723958333334, + "learning_rate": 0.0001, + "loss": 7.4779, + "loss/crossentropy": 1.9434148371219635, + "loss/hidden": 3.366015625, + "loss/jsd": 0.0, + "loss/logits": 0.17576389852911234, + "step": 1410 + }, + { + "epoch": 0.0355, + "grad_norm": 30.125, + "grad_norm_var": 2.154166666666667, + "learning_rate": 0.0001, + "loss": 7.508, + "loss/crossentropy": 2.0766889482736586, + "loss/hidden": 3.485546875, + "loss/jsd": 0.0, + "loss/logits": 0.20394362770020963, + "step": 1420 + }, + { + "epoch": 0.03575, + "grad_norm": 30.125, + "grad_norm_var": 17.580208333333335, + "learning_rate": 0.0001, + "loss": 7.4612, + "loss/crossentropy": 2.00380075648427, + "loss/hidden": 3.4453125, + "loss/jsd": 0.0, + "loss/logits": 0.18816210143268108, + "step": 1430 + }, + { + "epoch": 0.036, + "grad_norm": 31.375, + "grad_norm_var": 16.758268229166667, + "learning_rate": 0.0001, + "loss": 7.4602, + "loss/crossentropy": 2.1938020154833793, + "loss/hidden": 3.4234375, + "loss/jsd": 0.0, + "loss/logits": 0.2016971528530121, + "step": 1440 + }, + { + "epoch": 0.03625, + "grad_norm": 30.875, + "grad_norm_var": 1.2556640625, + "learning_rate": 0.0001, + "loss": 7.4245, + "loss/crossentropy": 2.0232372283935547, + "loss/hidden": 3.40234375, + "loss/jsd": 0.0, + "loss/logits": 0.19209201391786337, + "step": 1450 + }, + { + "epoch": 0.0365, + "grad_norm": 31.0, + "grad_norm_var": 1.4041015625, + "learning_rate": 0.0001, + "loss": 7.5518, + "loss/crossentropy": 2.2000616788864136, + "loss/hidden": 3.473046875, + "loss/jsd": 0.0, + "loss/logits": 0.22938326951116322, + "step": 1460 + }, + { + "epoch": 0.03675, + "grad_norm": 28.375, + "grad_norm_var": 2.0322916666666666, + "learning_rate": 0.0001, + "loss": 7.4397, + "loss/crossentropy": 2.0838582158088683, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.20685861641541123, + "step": 1470 + }, + { + "epoch": 0.037, + "grad_norm": 32.0, + "grad_norm_var": 1.5020833333333334, + "learning_rate": 0.0001, + "loss": 7.4183, + "loss/crossentropy": 2.149951633810997, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1984950641170144, + "step": 1480 + }, + { + "epoch": 0.03725, + "grad_norm": 33.75, + "grad_norm_var": 34.10826822916667, + "learning_rate": 0.0001, + "loss": 7.453, + "loss/crossentropy": 2.128306310623884, + "loss/hidden": 3.33203125, + "loss/jsd": 0.0, + "loss/logits": 0.19783397912979125, + "step": 1490 + }, + { + "epoch": 0.0375, + "grad_norm": 29.5, + "grad_norm_var": 5.008072916666666, + "learning_rate": 0.0001, + "loss": 7.469, + "loss/crossentropy": 2.042660539597273, + "loss/hidden": 3.365234375, + "loss/jsd": 0.0, + "loss/logits": 0.19274956732988358, + "step": 1500 + }, + { + "epoch": 0.03775, + "grad_norm": 33.0, + "grad_norm_var": 19.1775390625, + "learning_rate": 0.0001, + "loss": 7.4119, + "loss/crossentropy": 2.043857058137655, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.18266947232186795, + "step": 1510 + }, + { + "epoch": 0.038, + "grad_norm": 29.625, + "grad_norm_var": 14.303580729166667, + "learning_rate": 0.0001, + "loss": 7.4362, + "loss/crossentropy": 1.9492302805185318, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.1754497304558754, + "step": 1520 + }, + { + "epoch": 0.03825, + "grad_norm": 29.75, + "grad_norm_var": 23.764518229166665, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 2.0668226674199106, + "loss/hidden": 3.473828125, + "loss/jsd": 0.0, + "loss/logits": 0.1921279976144433, + "step": 1530 + }, + { + "epoch": 0.0385, + "grad_norm": 32.75, + "grad_norm_var": 3.2226069790467994e+18, + "learning_rate": 0.0001, + "loss": 7.5077, + "loss/crossentropy": 2.1122784771025183, + "loss/hidden": 3.46953125, + "loss/jsd": 0.0, + "loss/logits": 0.22245875597000123, + "step": 1540 + }, + { + "epoch": 0.03875, + "grad_norm": 30.25, + "grad_norm_var": 5.382291666666666, + "learning_rate": 0.0001, + "loss": 7.4525, + "loss/crossentropy": 2.264697426557541, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.2075907403603196, + "step": 1550 + }, + { + "epoch": 0.039, + "grad_norm": 30.0, + "grad_norm_var": 6.353580729166667, + "learning_rate": 0.0001, + "loss": 7.5064, + "loss/crossentropy": 2.1150408178567885, + "loss/hidden": 3.5203125, + "loss/jsd": 0.0, + "loss/logits": 0.23207673486322164, + "step": 1560 + }, + { + "epoch": 0.03925, + "grad_norm": 34.25, + "grad_norm_var": 6.72265625, + "learning_rate": 0.0001, + "loss": 7.4578, + "loss/crossentropy": 2.188142140209675, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.20429779235273599, + "step": 1570 + }, + { + "epoch": 0.0395, + "grad_norm": 34.75, + "grad_norm_var": 897.6666015625, + "learning_rate": 0.0001, + "loss": 7.434, + "loss/crossentropy": 2.0795677445828913, + "loss/hidden": 3.3828125, + "loss/jsd": 0.0, + "loss/logits": 0.18706642352044583, + "step": 1580 + }, + { + "epoch": 0.03975, + "grad_norm": 28.0, + "grad_norm_var": 903.6327473958333, + "learning_rate": 0.0001, + "loss": 7.5655, + "loss/crossentropy": 2.1025844663381577, + "loss/hidden": 3.469140625, + "loss/jsd": 0.0, + "loss/logits": 0.1966788914054632, + "step": 1590 + }, + { + "epoch": 0.04, + "grad_norm": 28.625, + "grad_norm_var": 11.97890625, + "learning_rate": 0.0001, + "loss": 7.2578, + "loss/crossentropy": 2.050418493151665, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.20104087069630622, + "step": 1600 + }, + { + "epoch": 0.04025, + "grad_norm": 28.0, + "grad_norm_var": 2.255989583333333, + "learning_rate": 0.0001, + "loss": 7.4393, + "loss/crossentropy": 2.1767756581306457, + "loss/hidden": 3.5140625, + "loss/jsd": 0.0, + "loss/logits": 0.2213939843699336, + "step": 1610 + }, + { + "epoch": 0.0405, + "grad_norm": 29.75, + "grad_norm_var": 3.80390625, + "learning_rate": 0.0001, + "loss": 7.5026, + "loss/crossentropy": 2.126803469657898, + "loss/hidden": 3.39375, + "loss/jsd": 0.0, + "loss/logits": 0.19106289148330688, + "step": 1620 + }, + { + "epoch": 0.04075, + "grad_norm": 32.0, + "grad_norm_var": 3.1249348958333334, + "learning_rate": 0.0001, + "loss": 7.4274, + "loss/crossentropy": 2.144256164133549, + "loss/hidden": 3.424609375, + "loss/jsd": 0.0, + "loss/logits": 0.21435861438512802, + "step": 1630 + }, + { + "epoch": 0.041, + "grad_norm": 30.25, + "grad_norm_var": 29.265559895833334, + "learning_rate": 0.0001, + "loss": 7.5728, + "loss/crossentropy": 2.2575725719332693, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.20658138059079648, + "step": 1640 + }, + { + "epoch": 0.04125, + "grad_norm": 30.5, + "grad_norm_var": 48.35390625, + "learning_rate": 0.0001, + "loss": 7.5776, + "loss/crossentropy": 2.096929042041302, + "loss/hidden": 3.346875, + "loss/jsd": 0.0, + "loss/logits": 0.18803389491513373, + "step": 1650 + }, + { + "epoch": 0.0415, + "grad_norm": 30.5, + "grad_norm_var": 1.1010416666666667, + "learning_rate": 0.0001, + "loss": 7.3792, + "loss/crossentropy": 2.0290944524109364, + "loss/hidden": 3.313671875, + "loss/jsd": 0.0, + "loss/logits": 0.19023821037262678, + "step": 1660 + }, + { + "epoch": 0.04175, + "grad_norm": 28.125, + "grad_norm_var": 33.49270833333333, + "learning_rate": 0.0001, + "loss": 7.5018, + "loss/crossentropy": 2.0678361281752586, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.18862500675022603, + "step": 1670 + }, + { + "epoch": 0.042, + "grad_norm": 29.75, + "grad_norm_var": 2.2955729166666665, + "learning_rate": 0.0001, + "loss": 7.4432, + "loss/crossentropy": 2.0549797296524046, + "loss/hidden": 3.441796875, + "loss/jsd": 0.0, + "loss/logits": 0.19089050237089394, + "step": 1680 + }, + { + "epoch": 0.04225, + "grad_norm": 29.75, + "grad_norm_var": 1.8791666666666667, + "learning_rate": 0.0001, + "loss": 7.3842, + "loss/crossentropy": 2.0077505365014074, + "loss/hidden": 3.382421875, + "loss/jsd": 0.0, + "loss/logits": 0.18722779098898173, + "step": 1690 + }, + { + "epoch": 0.0425, + "grad_norm": 29.375, + "grad_norm_var": 0.9434895833333333, + "learning_rate": 0.0001, + "loss": 7.4273, + "loss/crossentropy": 2.071325332671404, + "loss/hidden": 3.486328125, + "loss/jsd": 0.0, + "loss/logits": 0.20270166713744403, + "step": 1700 + }, + { + "epoch": 0.04275, + "grad_norm": 38.25, + "grad_norm_var": 7.669791666666667, + "learning_rate": 0.0001, + "loss": 7.4176, + "loss/crossentropy": 2.1353142291307448, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.19663168713450432, + "step": 1710 + }, + { + "epoch": 0.043, + "grad_norm": 28.25, + "grad_norm_var": 7.75, + "learning_rate": 0.0001, + "loss": 7.3818, + "loss/crossentropy": 1.9995346069335938, + "loss/hidden": 3.41015625, + "loss/jsd": 0.0, + "loss/logits": 0.18310597026720643, + "step": 1720 + }, + { + "epoch": 0.04325, + "grad_norm": 29.5, + "grad_norm_var": 3.7619140625, + "learning_rate": 0.0001, + "loss": 7.4912, + "loss/crossentropy": 2.1415088951587675, + "loss/hidden": 3.55078125, + "loss/jsd": 0.0, + "loss/logits": 0.22313783299177886, + "step": 1730 + }, + { + "epoch": 0.0435, + "grad_norm": 31.625, + "grad_norm_var": 3.0416666666666665, + "learning_rate": 0.0001, + "loss": 7.4999, + "loss/crossentropy": 2.1686330527067184, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.20409150077030064, + "step": 1740 + }, + { + "epoch": 0.04375, + "grad_norm": 31.375, + "grad_norm_var": 2.724739583333333, + "learning_rate": 0.0001, + "loss": 7.438, + "loss/crossentropy": 1.9411263287067413, + "loss/hidden": 3.304296875, + "loss/jsd": 0.0, + "loss/logits": 0.17631518254056572, + "step": 1750 + }, + { + "epoch": 0.044, + "grad_norm": 32.0, + "grad_norm_var": 1.9145833333333333, + "learning_rate": 0.0001, + "loss": 7.679, + "loss/crossentropy": 2.1614590853452684, + "loss/hidden": 3.36015625, + "loss/jsd": 0.0, + "loss/logits": 0.194198589771986, + "step": 1760 + }, + { + "epoch": 0.04425, + "grad_norm": 28.5, + "grad_norm_var": 2.039322916666667, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 2.282147654891014, + "loss/hidden": 3.359765625, + "loss/jsd": 0.0, + "loss/logits": 0.19978236705064772, + "step": 1770 + }, + { + "epoch": 0.0445, + "grad_norm": 29.625, + "grad_norm_var": 2.34140625, + "learning_rate": 0.0001, + "loss": 7.5296, + "loss/crossentropy": 2.2078514605760575, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.19668537452816964, + "step": 1780 + }, + { + "epoch": 0.04475, + "grad_norm": 30.25, + "grad_norm_var": 2.70390625, + "learning_rate": 0.0001, + "loss": 7.5779, + "loss/crossentropy": 2.1053253799676894, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.20323336366564035, + "step": 1790 + }, + { + "epoch": 0.045, + "grad_norm": 28.5, + "grad_norm_var": 4.8712890625, + "learning_rate": 0.0001, + "loss": 7.4866, + "loss/crossentropy": 2.060333488881588, + "loss/hidden": 3.373828125, + "loss/jsd": 0.0, + "loss/logits": 0.18627767637372017, + "step": 1800 + }, + { + "epoch": 0.04525, + "grad_norm": 28.0, + "grad_norm_var": 14.480989583333333, + "learning_rate": 0.0001, + "loss": 7.5225, + "loss/crossentropy": 1.9755317773669958, + "loss/hidden": 3.54375, + "loss/jsd": 0.0, + "loss/logits": 0.20334282671101392, + "step": 1810 + }, + { + "epoch": 0.0455, + "grad_norm": 29.875, + "grad_norm_var": 12.935872395833334, + "learning_rate": 0.0001, + "loss": 7.4781, + "loss/crossentropy": 2.1289859026670457, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.1973018018528819, + "step": 1820 + }, + { + "epoch": 0.04575, + "grad_norm": 31.75, + "grad_norm_var": 2.123893229166667, + "learning_rate": 0.0001, + "loss": 7.3915, + "loss/crossentropy": 1.9609280914068221, + "loss/hidden": 3.386328125, + "loss/jsd": 0.0, + "loss/logits": 0.1916458262130618, + "step": 1830 + }, + { + "epoch": 0.046, + "grad_norm": 32.0, + "grad_norm_var": 1.6332682291666667, + "learning_rate": 0.0001, + "loss": 7.5095, + "loss/crossentropy": 2.0019985377788543, + "loss/hidden": 3.384375, + "loss/jsd": 0.0, + "loss/logits": 0.19768325993791222, + "step": 1840 + }, + { + "epoch": 0.04625, + "grad_norm": 29.875, + "grad_norm_var": 2.225455729166667, + "learning_rate": 0.0001, + "loss": 7.623, + "loss/crossentropy": 2.0607564479112623, + "loss/hidden": 3.507421875, + "loss/jsd": 0.0, + "loss/logits": 0.20858939345926047, + "step": 1850 + }, + { + "epoch": 0.0465, + "grad_norm": 29.5, + "grad_norm_var": 1.9863932291666666, + "learning_rate": 0.0001, + "loss": 7.3836, + "loss/crossentropy": 2.132562433928251, + "loss/hidden": 3.40859375, + "loss/jsd": 0.0, + "loss/logits": 0.1956317812204361, + "step": 1860 + }, + { + "epoch": 0.04675, + "grad_norm": 36.0, + "grad_norm_var": 3.2171223958333335, + "learning_rate": 0.0001, + "loss": 7.4803, + "loss/crossentropy": 2.0316790327429772, + "loss/hidden": 3.396875, + "loss/jsd": 0.0, + "loss/logits": 0.20630075875669718, + "step": 1870 + }, + { + "epoch": 0.047, + "grad_norm": 33.25, + "grad_norm_var": 16.304622395833334, + "learning_rate": 0.0001, + "loss": 7.576, + "loss/crossentropy": 2.161964085698128, + "loss/hidden": 3.513671875, + "loss/jsd": 0.0, + "loss/logits": 0.21842746511101724, + "step": 1880 + }, + { + "epoch": 0.04725, + "grad_norm": 29.75, + "grad_norm_var": 2.3541666666666665, + "learning_rate": 0.0001, + "loss": 7.5036, + "loss/crossentropy": 1.8695943117141725, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.18793469872325658, + "step": 1890 + }, + { + "epoch": 0.0475, + "grad_norm": 34.25, + "grad_norm_var": 2.1780598958333335, + "learning_rate": 0.0001, + "loss": 7.5623, + "loss/crossentropy": 2.2376974314451217, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.21696731727570295, + "step": 1900 + }, + { + "epoch": 0.04775, + "grad_norm": 30.75, + "grad_norm_var": 14.924934895833333, + "learning_rate": 0.0001, + "loss": 7.388, + "loss/crossentropy": 1.9403380863368511, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.18128401823341847, + "step": 1910 + }, + { + "epoch": 0.048, + "grad_norm": 29.25, + "grad_norm_var": 25.1916015625, + "learning_rate": 0.0001, + "loss": 7.4109, + "loss/crossentropy": 2.1744547933340073, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.20097011709585785, + "step": 1920 + }, + { + "epoch": 0.04825, + "grad_norm": 29.25, + "grad_norm_var": 14.801822916666667, + "learning_rate": 0.0001, + "loss": 7.2893, + "loss/crossentropy": 2.101319019496441, + "loss/hidden": 3.44140625, + "loss/jsd": 0.0, + "loss/logits": 0.1921493023633957, + "step": 1930 + }, + { + "epoch": 0.0485, + "grad_norm": 30.125, + "grad_norm_var": 14.517708333333333, + "learning_rate": 0.0001, + "loss": 7.579, + "loss/crossentropy": 2.057158224284649, + "loss/hidden": 3.59140625, + "loss/jsd": 0.0, + "loss/logits": 0.21765361074358225, + "step": 1940 + }, + { + "epoch": 0.04875, + "grad_norm": 29.625, + "grad_norm_var": 15.790559895833333, + "learning_rate": 0.0001, + "loss": 7.3712, + "loss/crossentropy": 1.9415803879499436, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18346730088815094, + "step": 1950 + }, + { + "epoch": 0.049, + "grad_norm": 27.625, + "grad_norm_var": 9.794791666666667, + "learning_rate": 0.0001, + "loss": 7.4902, + "loss/crossentropy": 2.035348242521286, + "loss/hidden": 3.439453125, + "loss/jsd": 0.0, + "loss/logits": 0.20268035624176264, + "step": 1960 + }, + { + "epoch": 0.04925, + "grad_norm": 35.25, + "grad_norm_var": 12.768684895833333, + "learning_rate": 0.0001, + "loss": 7.4627, + "loss/crossentropy": 2.054542076587677, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.2003987120464444, + "step": 1970 + }, + { + "epoch": 0.0495, + "grad_norm": 36.0, + "grad_norm_var": 12.572916666666666, + "learning_rate": 0.0001, + "loss": 7.353, + "loss/crossentropy": 1.9634785205125809, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.17985089337453247, + "step": 1980 + }, + { + "epoch": 0.04975, + "grad_norm": 36.25, + "grad_norm_var": 9.2353515625, + "learning_rate": 0.0001, + "loss": 7.4473, + "loss/crossentropy": 2.059533824026585, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.19096513148397207, + "step": 1990 + }, + { + "epoch": 0.05, + "grad_norm": 29.125, + "grad_norm_var": 13.320572916666666, + "learning_rate": 0.0001, + "loss": 7.3914, + "loss/crossentropy": 2.011685383319855, + "loss/hidden": 3.4421875, + "loss/jsd": 0.0, + "loss/logits": 0.19188414234668016, + "step": 2000 + }, + { + "epoch": 0.05025, + "grad_norm": 36.25, + "grad_norm_var": 14.026822916666667, + "learning_rate": 0.0001, + "loss": 7.4213, + "loss/crossentropy": 2.309766414761543, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.20372038893401623, + "step": 2010 + }, + { + "epoch": 0.0505, + "grad_norm": 29.0, + "grad_norm_var": 9.237239583333333, + "learning_rate": 0.0001, + "loss": 7.4145, + "loss/crossentropy": 2.1240487143397333, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.20137840434908866, + "step": 2020 + }, + { + "epoch": 0.05075, + "grad_norm": 38.5, + "grad_norm_var": 89.21432291666666, + "learning_rate": 0.0001, + "loss": 7.3696, + "loss/crossentropy": 2.112667274475098, + "loss/hidden": 3.487109375, + "loss/jsd": 0.0, + "loss/logits": 0.19770587887614965, + "step": 2030 + }, + { + "epoch": 0.051, + "grad_norm": 27.75, + "grad_norm_var": 94.06015625, + "learning_rate": 0.0001, + "loss": 7.2471, + "loss/crossentropy": 1.9955052442848682, + "loss/hidden": 3.30546875, + "loss/jsd": 0.0, + "loss/logits": 0.1880181163549423, + "step": 2040 + }, + { + "epoch": 0.05125, + "grad_norm": 35.25, + "grad_norm_var": 3.67265625, + "learning_rate": 0.0001, + "loss": 7.458, + "loss/crossentropy": 2.1320972844958304, + "loss/hidden": 3.384765625, + "loss/jsd": 0.0, + "loss/logits": 0.18908526431769132, + "step": 2050 + }, + { + "epoch": 0.0515, + "grad_norm": 38.75, + "grad_norm_var": 10.776822916666667, + "learning_rate": 0.0001, + "loss": 7.3769, + "loss/crossentropy": 2.171598494052887, + "loss/hidden": 3.29765625, + "loss/jsd": 0.0, + "loss/logits": 0.18929236195981503, + "step": 2060 + }, + { + "epoch": 0.05175, + "grad_norm": 32.75, + "grad_norm_var": 10.53515625, + "learning_rate": 0.0001, + "loss": 7.5279, + "loss/crossentropy": 2.0172302186489106, + "loss/hidden": 3.4203125, + "loss/jsd": 0.0, + "loss/logits": 0.2013201082125306, + "step": 2070 + }, + { + "epoch": 0.052, + "grad_norm": 32.0, + "grad_norm_var": 7.678125, + "learning_rate": 0.0001, + "loss": 7.3619, + "loss/crossentropy": 1.982726515084505, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.17850281894207, + "step": 2080 + }, + { + "epoch": 0.05225, + "grad_norm": 29.75, + "grad_norm_var": 63.6681640625, + "learning_rate": 0.0001, + "loss": 7.5109, + "loss/crossentropy": 2.121504098176956, + "loss/hidden": 3.50703125, + "loss/jsd": 0.0, + "loss/logits": 0.240205854550004, + "step": 2090 + }, + { + "epoch": 0.0525, + "grad_norm": 34.5, + "grad_norm_var": 7.506184895833333, + "learning_rate": 0.0001, + "loss": 7.4658, + "loss/crossentropy": 2.110687591135502, + "loss/hidden": 3.530078125, + "loss/jsd": 0.0, + "loss/logits": 0.2039638390764594, + "step": 2100 + }, + { + "epoch": 0.05275, + "grad_norm": 32.5, + "grad_norm_var": 19.075455729166666, + "learning_rate": 0.0001, + "loss": 7.5668, + "loss/crossentropy": 1.9557841390371322, + "loss/hidden": 3.462109375, + "loss/jsd": 0.0, + "loss/logits": 0.18774209143593906, + "step": 2110 + }, + { + "epoch": 0.053, + "grad_norm": 31.125, + "grad_norm_var": 3.85390625, + "learning_rate": 0.0001, + "loss": 7.5735, + "loss/crossentropy": 2.0219520531594752, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.18533632289618254, + "step": 2120 + }, + { + "epoch": 0.05325, + "grad_norm": 32.25, + "grad_norm_var": 3.8910807291666667, + "learning_rate": 0.0001, + "loss": 7.4083, + "loss/crossentropy": 2.1359280541539194, + "loss/hidden": 3.412890625, + "loss/jsd": 0.0, + "loss/logits": 0.1897095028311014, + "step": 2130 + }, + { + "epoch": 0.0535, + "grad_norm": 31.25, + "grad_norm_var": 2.5957682291666666, + "learning_rate": 0.0001, + "loss": 7.446, + "loss/crossentropy": 2.170258317142725, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.1826348526403308, + "step": 2140 + }, + { + "epoch": 0.05375, + "grad_norm": 31.25, + "grad_norm_var": 3.785416666666667, + "learning_rate": 0.0001, + "loss": 7.4014, + "loss/crossentropy": 2.131239393353462, + "loss/hidden": 3.303515625, + "loss/jsd": 0.0, + "loss/logits": 0.18656531646847724, + "step": 2150 + }, + { + "epoch": 0.054, + "grad_norm": 31.0, + "grad_norm_var": 4.8666015625, + "learning_rate": 0.0001, + "loss": 7.5478, + "loss/crossentropy": 2.223896725475788, + "loss/hidden": 3.383203125, + "loss/jsd": 0.0, + "loss/logits": 0.1951376979239285, + "step": 2160 + }, + { + "epoch": 0.05425, + "grad_norm": 30.375, + "grad_norm_var": 8.437955729166667, + "learning_rate": 0.0001, + "loss": 7.5562, + "loss/crossentropy": 2.1203987300395966, + "loss/hidden": 3.351171875, + "loss/jsd": 0.0, + "loss/logits": 0.1970507999882102, + "step": 2170 + }, + { + "epoch": 0.0545, + "grad_norm": 32.0, + "grad_norm_var": 2.9488932291666665, + "learning_rate": 0.0001, + "loss": 7.5532, + "loss/crossentropy": 2.080265050381422, + "loss/hidden": 3.544140625, + "loss/jsd": 0.0, + "loss/logits": 0.2216239819303155, + "step": 2180 + }, + { + "epoch": 0.05475, + "grad_norm": 31.125, + "grad_norm_var": 8.1728515625, + "learning_rate": 0.0001, + "loss": 7.382, + "loss/crossentropy": 2.2114535331726075, + "loss/hidden": 3.37734375, + "loss/jsd": 0.0, + "loss/logits": 0.20577374435961246, + "step": 2190 + }, + { + "epoch": 0.055, + "grad_norm": 28.875, + "grad_norm_var": 14.520833333333334, + "learning_rate": 0.0001, + "loss": 7.5766, + "loss/crossentropy": 2.1003271512687207, + "loss/hidden": 3.358984375, + "loss/jsd": 0.0, + "loss/logits": 0.18811229150742292, + "step": 2200 + }, + { + "epoch": 0.05525, + "grad_norm": 33.5, + "grad_norm_var": 16.099739583333335, + "learning_rate": 0.0001, + "loss": 7.5553, + "loss/crossentropy": 2.1326127350330353, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.22006579730659723, + "step": 2210 + }, + { + "epoch": 0.0555, + "grad_norm": 32.25, + "grad_norm_var": 9.305143229166667, + "learning_rate": 0.0001, + "loss": 7.3766, + "loss/crossentropy": 2.1496046826243402, + "loss/hidden": 3.476171875, + "loss/jsd": 0.0, + "loss/logits": 0.1952402491122484, + "step": 2220 + }, + { + "epoch": 0.05575, + "grad_norm": 29.125, + "grad_norm_var": 6.805143229166666, + "learning_rate": 0.0001, + "loss": 7.3648, + "loss/crossentropy": 2.13938904479146, + "loss/hidden": 3.36640625, + "loss/jsd": 0.0, + "loss/logits": 0.19394674636423587, + "step": 2230 + }, + { + "epoch": 0.056, + "grad_norm": 27.625, + "grad_norm_var": 15.0712890625, + "learning_rate": 0.0001, + "loss": 7.4292, + "loss/crossentropy": 2.0648645758628845, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.18520106598734856, + "step": 2240 + }, + { + "epoch": 0.05625, + "grad_norm": 29.25, + "grad_norm_var": 12.034309895833333, + "learning_rate": 0.0001, + "loss": 7.4469, + "loss/crossentropy": 2.080448921024799, + "loss/hidden": 3.3109375, + "loss/jsd": 0.0, + "loss/logits": 0.18507405128329993, + "step": 2250 + }, + { + "epoch": 0.0565, + "grad_norm": 31.375, + "grad_norm_var": 2.014518229166667, + "learning_rate": 0.0001, + "loss": 7.4325, + "loss/crossentropy": 2.0871294140815735, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.20059894528239966, + "step": 2260 + }, + { + "epoch": 0.05675, + "grad_norm": 28.75, + "grad_norm_var": 1.8103515625, + "learning_rate": 0.0001, + "loss": 7.4268, + "loss/crossentropy": 2.010594163835049, + "loss/hidden": 3.39453125, + "loss/jsd": 0.0, + "loss/logits": 0.19413960948586464, + "step": 2270 + }, + { + "epoch": 0.057, + "grad_norm": 32.5, + "grad_norm_var": 4.0369140625, + "learning_rate": 0.0001, + "loss": 7.4346, + "loss/crossentropy": 2.1129174560308455, + "loss/hidden": 3.416015625, + "loss/jsd": 0.0, + "loss/logits": 0.1961110396310687, + "step": 2280 + }, + { + "epoch": 0.05725, + "grad_norm": 39.0, + "grad_norm_var": 30.42265625, + "learning_rate": 0.0001, + "loss": 7.4422, + "loss/crossentropy": 2.002947611361742, + "loss/hidden": 3.432421875, + "loss/jsd": 0.0, + "loss/logits": 0.2081361676566303, + "step": 2290 + }, + { + "epoch": 0.0575, + "grad_norm": 37.25, + "grad_norm_var": 25.699934895833334, + "learning_rate": 0.0001, + "loss": 7.4312, + "loss/crossentropy": 2.06134437918663, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.18918452728539706, + "step": 2300 + }, + { + "epoch": 0.05775, + "grad_norm": 28.875, + "grad_norm_var": 9.115559895833334, + "learning_rate": 0.0001, + "loss": 7.4209, + "loss/crossentropy": 2.041922479122877, + "loss/hidden": 3.403515625, + "loss/jsd": 0.0, + "loss/logits": 0.20907302405685185, + "step": 2310 + }, + { + "epoch": 0.058, + "grad_norm": 30.125, + "grad_norm_var": 22.248372395833332, + "learning_rate": 0.0001, + "loss": 7.6844, + "loss/crossentropy": 2.0152460247278214, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.1905667196959257, + "step": 2320 + }, + { + "epoch": 0.05825, + "grad_norm": 38.25, + "grad_norm_var": 31.398893229166667, + "learning_rate": 0.0001, + "loss": 7.4713, + "loss/crossentropy": 2.105386929959059, + "loss/hidden": 3.452734375, + "loss/jsd": 0.0, + "loss/logits": 0.1982942834496498, + "step": 2330 + }, + { + "epoch": 0.0585, + "grad_norm": 28.375, + "grad_norm_var": 54.94264322916667, + "learning_rate": 0.0001, + "loss": 7.4575, + "loss/crossentropy": 2.2358868844807147, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19232469592243434, + "step": 2340 + }, + { + "epoch": 0.05875, + "grad_norm": 33.5, + "grad_norm_var": 165.74583333333334, + "learning_rate": 0.0001, + "loss": 7.2987, + "loss/crossentropy": 1.9657546751201154, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.18062973748892547, + "step": 2350 + }, + { + "epoch": 0.059, + "grad_norm": 41.0, + "grad_norm_var": 15.376822916666667, + "learning_rate": 0.0001, + "loss": 7.4431, + "loss/crossentropy": 2.191007924079895, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.2068317520432174, + "step": 2360 + }, + { + "epoch": 0.05925, + "grad_norm": 30.625, + "grad_norm_var": 12.109375, + "learning_rate": 0.0001, + "loss": 7.3325, + "loss/crossentropy": 2.0140789330005644, + "loss/hidden": 3.4109375, + "loss/jsd": 0.0, + "loss/logits": 0.18166892379522323, + "step": 2370 + }, + { + "epoch": 0.0595, + "grad_norm": 31.875, + "grad_norm_var": 6.941666666666666, + "learning_rate": 0.0001, + "loss": 7.4039, + "loss/crossentropy": 2.0221361994743345, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.1934544663876295, + "step": 2380 + }, + { + "epoch": 0.05975, + "grad_norm": 30.125, + "grad_norm_var": 10.472330729166666, + "learning_rate": 0.0001, + "loss": 7.5862, + "loss/crossentropy": 1.9840030640363693, + "loss/hidden": 3.46640625, + "loss/jsd": 0.0, + "loss/logits": 0.19178631734102963, + "step": 2390 + }, + { + "epoch": 0.06, + "grad_norm": 29.875, + "grad_norm_var": 14.10625, + "learning_rate": 0.0001, + "loss": 7.4826, + "loss/crossentropy": 2.1700179904699324, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1915024297311902, + "step": 2400 + }, + { + "epoch": 0.06025, + "grad_norm": 32.75, + "grad_norm_var": 7.370768229166667, + "learning_rate": 0.0001, + "loss": 7.3889, + "loss/crossentropy": 2.091843403875828, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.18695627991110086, + "step": 2410 + }, + { + "epoch": 0.0605, + "grad_norm": 29.0, + "grad_norm_var": 9.922330729166667, + "learning_rate": 0.0001, + "loss": 7.4655, + "loss/crossentropy": 2.172381104528904, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.20078962799161673, + "step": 2420 + }, + { + "epoch": 0.06075, + "grad_norm": 34.25, + "grad_norm_var": 8.637239583333333, + "learning_rate": 0.0001, + "loss": 7.519, + "loss/crossentropy": 1.995463601499796, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.1993358489125967, + "step": 2430 + }, + { + "epoch": 0.061, + "grad_norm": 31.25, + "grad_norm_var": 11.9431640625, + "learning_rate": 0.0001, + "loss": 7.5169, + "loss/crossentropy": 2.296917426586151, + "loss/hidden": 3.513671875, + "loss/jsd": 0.0, + "loss/logits": 0.23228074796497822, + "step": 2440 + }, + { + "epoch": 0.06125, + "grad_norm": 30.25, + "grad_norm_var": 3.4368798046573737e+18, + "learning_rate": 0.0001, + "loss": 7.5038, + "loss/crossentropy": 2.1944432735443113, + "loss/hidden": 3.3921875, + "loss/jsd": 0.0, + "loss/logits": 0.21073084995150565, + "step": 2450 + }, + { + "epoch": 0.0615, + "grad_norm": 33.5, + "grad_norm_var": 3.436879805205814e+18, + "learning_rate": 0.0001, + "loss": 7.4423, + "loss/crossentropy": 2.152103579044342, + "loss/hidden": 3.512109375, + "loss/jsd": 0.0, + "loss/logits": 0.20929353777319193, + "step": 2460 + }, + { + "epoch": 0.06175, + "grad_norm": 39.0, + "grad_norm_var": 2.2045823633093297e+18, + "learning_rate": 0.0001, + "loss": 7.4382, + "loss/crossentropy": 2.017627691477537, + "loss/hidden": 3.355078125, + "loss/jsd": 0.0, + "loss/logits": 0.19590776292607187, + "step": 2470 + }, + { + "epoch": 0.062, + "grad_norm": 29.375, + "grad_norm_var": 2.2045823636681523e+18, + "learning_rate": 0.0001, + "loss": 7.4072, + "loss/crossentropy": 2.1076912328600885, + "loss/hidden": 3.433203125, + "loss/jsd": 0.0, + "loss/logits": 0.1988623272627592, + "step": 2480 + }, + { + "epoch": 0.06225, + "grad_norm": 30.125, + "grad_norm_var": 3.2494140625, + "learning_rate": 0.0001, + "loss": 7.3192, + "loss/crossentropy": 1.9777067750692368, + "loss/hidden": 3.429296875, + "loss/jsd": 0.0, + "loss/logits": 0.20539684109389783, + "step": 2490 + }, + { + "epoch": 0.0625, + "grad_norm": 29.125, + "grad_norm_var": 5.580208333333333, + "learning_rate": 0.0001, + "loss": 7.3283, + "loss/crossentropy": 2.061080713570118, + "loss/hidden": 3.4953125, + "loss/jsd": 0.0, + "loss/logits": 0.20077812522649766, + "step": 2500 + }, + { + "epoch": 0.06275, + "grad_norm": 28.375, + "grad_norm_var": 5.618489583333333, + "learning_rate": 0.0001, + "loss": 7.4401, + "loss/crossentropy": 2.2099071338772776, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.2055276283994317, + "step": 2510 + }, + { + "epoch": 0.063, + "grad_norm": 28.125, + "grad_norm_var": 7.118684895833334, + "learning_rate": 0.0001, + "loss": 7.3509, + "loss/crossentropy": 1.962952435016632, + "loss/hidden": 3.421484375, + "loss/jsd": 0.0, + "loss/logits": 0.19731322024017572, + "step": 2520 + }, + { + "epoch": 0.06325, + "grad_norm": 31.375, + "grad_norm_var": 1.9681640625, + "learning_rate": 0.0001, + "loss": 7.3695, + "loss/crossentropy": 1.9843583509325982, + "loss/hidden": 3.4921875, + "loss/jsd": 0.0, + "loss/logits": 0.2062232268974185, + "step": 2530 + }, + { + "epoch": 0.0635, + "grad_norm": 31.5, + "grad_norm_var": 3.7988932291666666, + "learning_rate": 0.0001, + "loss": 7.4485, + "loss/crossentropy": 2.1427679538726805, + "loss/hidden": 3.38125, + "loss/jsd": 0.0, + "loss/logits": 0.2011977185495198, + "step": 2540 + }, + { + "epoch": 0.06375, + "grad_norm": 30.0, + "grad_norm_var": 2.5885416666666665, + "learning_rate": 0.0001, + "loss": 7.4157, + "loss/crossentropy": 1.9085583783686162, + "loss/hidden": 3.325, + "loss/jsd": 0.0, + "loss/logits": 0.17416954301297666, + "step": 2550 + }, + { + "epoch": 0.064, + "grad_norm": 31.25, + "grad_norm_var": 1.21015625, + "learning_rate": 0.0001, + "loss": 7.5141, + "loss/crossentropy": 1.9622327491641045, + "loss/hidden": 3.361328125, + "loss/jsd": 0.0, + "loss/logits": 0.18756412472575903, + "step": 2560 + }, + { + "epoch": 0.06425, + "grad_norm": 30.0, + "grad_norm_var": 1.7143229166666667, + "learning_rate": 0.0001, + "loss": 7.4624, + "loss/crossentropy": 2.192887546122074, + "loss/hidden": 3.4296875, + "loss/jsd": 0.0, + "loss/logits": 0.1984951412305236, + "step": 2570 + }, + { + "epoch": 0.0645, + "grad_norm": 30.125, + "grad_norm_var": 1.9143229166666667, + "learning_rate": 0.0001, + "loss": 7.3947, + "loss/crossentropy": 2.102549520134926, + "loss/hidden": 3.463671875, + "loss/jsd": 0.0, + "loss/logits": 0.1989850653335452, + "step": 2580 + }, + { + "epoch": 0.06475, + "grad_norm": 32.25, + "grad_norm_var": 9.5322265625, + "learning_rate": 0.0001, + "loss": 7.5147, + "loss/crossentropy": 2.213281115144491, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.2027151037938893, + "step": 2590 + }, + { + "epoch": 0.065, + "grad_norm": 30.625, + "grad_norm_var": 2.3427083333333334, + "learning_rate": 0.0001, + "loss": 7.4691, + "loss/crossentropy": 2.1138279482722284, + "loss/hidden": 3.379296875, + "loss/jsd": 0.0, + "loss/logits": 0.20825629755854608, + "step": 2600 + }, + { + "epoch": 0.06525, + "grad_norm": 36.0, + "grad_norm_var": 3.3395182291666665, + "learning_rate": 0.0001, + "loss": 7.4775, + "loss/crossentropy": 2.107349547743797, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19337845854461194, + "step": 2610 + }, + { + "epoch": 0.0655, + "grad_norm": 29.25, + "grad_norm_var": 12.757291666666667, + "learning_rate": 0.0001, + "loss": 7.5438, + "loss/crossentropy": 2.0628502368927, + "loss/hidden": 3.4984375, + "loss/jsd": 0.0, + "loss/logits": 0.20967572089284658, + "step": 2620 + }, + { + "epoch": 0.06575, + "grad_norm": 28.625, + "grad_norm_var": 11.805208333333333, + "learning_rate": 0.0001, + "loss": 7.3354, + "loss/crossentropy": 2.1009589530527593, + "loss/hidden": 3.33828125, + "loss/jsd": 0.0, + "loss/logits": 0.18132725274190306, + "step": 2630 + }, + { + "epoch": 0.066, + "grad_norm": 32.5, + "grad_norm_var": 2.730208333333333, + "learning_rate": 0.0001, + "loss": 7.4257, + "loss/crossentropy": 1.983342681080103, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.19340286049991845, + "step": 2640 + }, + { + "epoch": 0.06625, + "grad_norm": 30.25, + "grad_norm_var": 3.7549465282226944e+18, + "learning_rate": 0.0001, + "loss": 7.309, + "loss/crossentropy": 2.0057250812649725, + "loss/hidden": 3.418359375, + "loss/jsd": 0.0, + "loss/logits": 0.18936716187745334, + "step": 2650 + }, + { + "epoch": 0.0665, + "grad_norm": 36.25, + "grad_norm_var": 8.832747395833334, + "learning_rate": 0.0001, + "loss": 7.5442, + "loss/crossentropy": 2.054753464460373, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.2035602940246463, + "step": 2660 + }, + { + "epoch": 0.06675, + "grad_norm": 32.5, + "grad_norm_var": 4.8900390625, + "learning_rate": 0.0001, + "loss": 7.4106, + "loss/crossentropy": 2.0181221179664135, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.1878144398331642, + "step": 2670 + }, + { + "epoch": 0.067, + "grad_norm": 30.125, + "grad_norm_var": 4.280989583333334, + "learning_rate": 0.0001, + "loss": 7.4597, + "loss/crossentropy": 2.200540581345558, + "loss/hidden": 3.4046875, + "loss/jsd": 0.0, + "loss/logits": 0.20286752395331858, + "step": 2680 + }, + { + "epoch": 0.06725, + "grad_norm": 31.75, + "grad_norm_var": 3.8559895833333333, + "learning_rate": 0.0001, + "loss": 7.4643, + "loss/crossentropy": 2.0630861818790436, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.20401672925800085, + "step": 2690 + }, + { + "epoch": 0.0675, + "grad_norm": 33.0, + "grad_norm_var": 7.073958333333334, + "learning_rate": 0.0001, + "loss": 7.4001, + "loss/crossentropy": 1.927167509496212, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.17901942003518342, + "step": 2700 + }, + { + "epoch": 0.06775, + "grad_norm": 30.25, + "grad_norm_var": 8.9009765625, + "learning_rate": 0.0001, + "loss": 7.3461, + "loss/crossentropy": 2.0538916781544687, + "loss/hidden": 3.35234375, + "loss/jsd": 0.0, + "loss/logits": 0.1864149821922183, + "step": 2710 + }, + { + "epoch": 0.068, + "grad_norm": 29.5, + "grad_norm_var": 2.218489583333333, + "learning_rate": 0.0001, + "loss": 7.526, + "loss/crossentropy": 2.211588367819786, + "loss/hidden": 3.487890625, + "loss/jsd": 0.0, + "loss/logits": 0.20801848396658898, + "step": 2720 + }, + { + "epoch": 0.06825, + "grad_norm": 31.375, + "grad_norm_var": 1.0768229166666667, + "learning_rate": 0.0001, + "loss": 7.5535, + "loss/crossentropy": 2.268890543282032, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.21352684032171965, + "step": 2730 + }, + { + "epoch": 0.0685, + "grad_norm": 33.25, + "grad_norm_var": 5.663997395833333, + "learning_rate": 0.0001, + "loss": 7.411, + "loss/crossentropy": 1.902898482978344, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.1794701736420393, + "step": 2740 + }, + { + "epoch": 0.06875, + "grad_norm": 32.25, + "grad_norm_var": 6.167708333333334, + "learning_rate": 0.0001, + "loss": 7.3718, + "loss/crossentropy": 1.9450767874717712, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.18759301900863648, + "step": 2750 + }, + { + "epoch": 0.069, + "grad_norm": 31.125, + "grad_norm_var": 31.185872395833332, + "learning_rate": 0.0001, + "loss": 7.4359, + "loss/crossentropy": 2.0783849939703942, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18503105416893958, + "step": 2760 + }, + { + "epoch": 0.06925, + "grad_norm": 36.5, + "grad_norm_var": 35.412434895833336, + "learning_rate": 0.0001, + "loss": 7.5806, + "loss/crossentropy": 2.2374701410532, + "loss/hidden": 3.378125, + "loss/jsd": 0.0, + "loss/logits": 0.19615829903632404, + "step": 2770 + }, + { + "epoch": 0.0695, + "grad_norm": 30.25, + "grad_norm_var": 19.787239583333335, + "learning_rate": 0.0001, + "loss": 7.3197, + "loss/crossentropy": 1.8297001466155052, + "loss/hidden": 3.3171875, + "loss/jsd": 0.0, + "loss/logits": 0.16481583826243879, + "step": 2780 + }, + { + "epoch": 0.06975, + "grad_norm": 428.0, + "grad_norm_var": 9873.31640625, + "learning_rate": 0.0001, + "loss": 7.5313, + "loss/crossentropy": 2.249661484360695, + "loss/hidden": 3.392578125, + "loss/jsd": 0.0, + "loss/logits": 0.2018596636131406, + "step": 2790 + }, + { + "epoch": 0.07, + "grad_norm": 31.0, + "grad_norm_var": 9755.6625, + "learning_rate": 0.0001, + "loss": 7.3957, + "loss/crossentropy": 1.9368772380053998, + "loss/hidden": 3.48203125, + "loss/jsd": 0.0, + "loss/logits": 0.18386599626392125, + "step": 2800 + }, + { + "epoch": 0.07025, + "grad_norm": 30.75, + "grad_norm_var": 1.8317057291666667, + "learning_rate": 0.0001, + "loss": 7.4372, + "loss/crossentropy": 1.98307463824749, + "loss/hidden": 3.464453125, + "loss/jsd": 0.0, + "loss/logits": 0.19818334747105837, + "step": 2810 + }, + { + "epoch": 0.0705, + "grad_norm": 29.375, + "grad_norm_var": 2.589583333333333, + "learning_rate": 0.0001, + "loss": 7.5014, + "loss/crossentropy": 2.1463105253875256, + "loss/hidden": 3.5046875, + "loss/jsd": 0.0, + "loss/logits": 0.20105676222592592, + "step": 2820 + }, + { + "epoch": 0.07075, + "grad_norm": 60.5, + "grad_norm_var": 178.2556640625, + "learning_rate": 0.0001, + "loss": 7.4527, + "loss/crossentropy": 2.0776613369584083, + "loss/hidden": 3.420703125, + "loss/jsd": 0.0, + "loss/logits": 0.19452448841184378, + "step": 2830 + }, + { + "epoch": 0.071, + "grad_norm": 29.25, + "grad_norm_var": 172.31451822916668, + "learning_rate": 0.0001, + "loss": 7.4802, + "loss/crossentropy": 2.1200039610266685, + "loss/hidden": 3.417578125, + "loss/jsd": 0.0, + "loss/logits": 0.19831879772245883, + "step": 2840 + }, + { + "epoch": 0.07125, + "grad_norm": 69.0, + "grad_norm_var": 117.23098958333334, + "learning_rate": 0.0001, + "loss": 7.434, + "loss/crossentropy": 2.024143140017986, + "loss/hidden": 3.348828125, + "loss/jsd": 0.0, + "loss/logits": 0.1836528332903981, + "step": 2850 + }, + { + "epoch": 0.0715, + "grad_norm": 31.375, + "grad_norm_var": 92.53723958333333, + "learning_rate": 0.0001, + "loss": 7.4934, + "loss/crossentropy": 2.2765417456626893, + "loss/hidden": 3.446484375, + "loss/jsd": 0.0, + "loss/logits": 0.20736196860671044, + "step": 2860 + }, + { + "epoch": 0.07175, + "grad_norm": 31.625, + "grad_norm_var": 7.986393229166667, + "learning_rate": 0.0001, + "loss": 7.4826, + "loss/crossentropy": 2.269197002053261, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.19869209118187428, + "step": 2870 + }, + { + "epoch": 0.072, + "grad_norm": 31.25, + "grad_norm_var": 3.1806640625, + "learning_rate": 0.0001, + "loss": 7.4018, + "loss/crossentropy": 2.2985214799642564, + "loss/hidden": 3.390234375, + "loss/jsd": 0.0, + "loss/logits": 0.20524807646870613, + "step": 2880 + }, + { + "epoch": 0.07225, + "grad_norm": 30.875, + "grad_norm_var": 4.801822916666667, + "learning_rate": 0.0001, + "loss": 7.5148, + "loss/crossentropy": 2.2387808740139006, + "loss/hidden": 3.46015625, + "loss/jsd": 0.0, + "loss/logits": 0.19951685946434736, + "step": 2890 + }, + { + "epoch": 0.0725, + "grad_norm": 28.875, + "grad_norm_var": 13.836458333333333, + "learning_rate": 0.0001, + "loss": 7.5232, + "loss/crossentropy": 2.049694790691137, + "loss/hidden": 3.410546875, + "loss/jsd": 0.0, + "loss/logits": 0.19052465092390775, + "step": 2900 + }, + { + "epoch": 0.07275, + "grad_norm": 29.625, + "grad_norm_var": 17.91640625, + "learning_rate": 0.0001, + "loss": 7.3227, + "loss/crossentropy": 2.0360258772969244, + "loss/hidden": 3.40546875, + "loss/jsd": 0.0, + "loss/logits": 0.18495636582374572, + "step": 2910 + }, + { + "epoch": 0.073, + "grad_norm": 32.0, + "grad_norm_var": 1.8926377214767268e+18, + "learning_rate": 0.0001, + "loss": 7.4512, + "loss/crossentropy": 2.13848315179348, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.18625867497175932, + "step": 2920 + }, + { + "epoch": 0.07325, + "grad_norm": 29.875, + "grad_norm_var": 1.8926377199175642e+18, + "learning_rate": 0.0001, + "loss": 7.5038, + "loss/crossentropy": 2.166595605015755, + "loss/hidden": 3.49375, + "loss/jsd": 0.0, + "loss/logits": 0.20948194600641729, + "step": 2930 + }, + { + "epoch": 0.0735, + "grad_norm": 28.5, + "grad_norm_var": 73.08020833333333, + "learning_rate": 0.0001, + "loss": 7.374, + "loss/crossentropy": 1.9849643550813199, + "loss/hidden": 3.301171875, + "loss/jsd": 0.0, + "loss/logits": 0.18302082028239966, + "step": 2940 + }, + { + "epoch": 0.07375, + "grad_norm": 29.125, + "grad_norm_var": 24.825, + "learning_rate": 0.0001, + "loss": 7.3651, + "loss/crossentropy": 2.057874396443367, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.1866615541279316, + "step": 2950 + }, + { + "epoch": 0.074, + "grad_norm": 30.625, + "grad_norm_var": 883.6354166666666, + "learning_rate": 0.0001, + "loss": 7.5415, + "loss/crossentropy": 2.1631729155778885, + "loss/hidden": 3.388671875, + "loss/jsd": 0.0, + "loss/logits": 0.20762786027044058, + "step": 2960 + }, + { + "epoch": 0.07425, + "grad_norm": 32.75, + "grad_norm_var": 887.2705729166667, + "learning_rate": 0.0001, + "loss": 7.4471, + "loss/crossentropy": 1.9493468508124352, + "loss/hidden": 3.3578125, + "loss/jsd": 0.0, + "loss/logits": 0.1884406829252839, + "step": 2970 + }, + { + "epoch": 0.0745, + "grad_norm": 28.875, + "grad_norm_var": 5.070768229166666, + "learning_rate": 0.0001, + "loss": 7.605, + "loss/crossentropy": 2.122344336658716, + "loss/hidden": 3.460546875, + "loss/jsd": 0.0, + "loss/logits": 0.21057356838136912, + "step": 2980 + }, + { + "epoch": 0.07475, + "grad_norm": 37.0, + "grad_norm_var": 21.535416666666666, + "learning_rate": 0.0001, + "loss": 7.469, + "loss/crossentropy": 2.008989527821541, + "loss/hidden": 3.54140625, + "loss/jsd": 0.0, + "loss/logits": 0.2172183733433485, + "step": 2990 + }, + { + "epoch": 0.075, + "grad_norm": 29.375, + "grad_norm_var": 18.198958333333334, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.1922819674015046, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.20425879992544652, + "step": 3000 + }, + { + "epoch": 0.07525, + "grad_norm": 29.5, + "grad_norm_var": 2.668684895833333, + "learning_rate": 0.0001, + "loss": 7.3505, + "loss/crossentropy": 2.189265179634094, + "loss/hidden": 3.34609375, + "loss/jsd": 0.0, + "loss/logits": 0.20808048862963915, + "step": 3010 + }, + { + "epoch": 0.0755, + "grad_norm": 30.75, + "grad_norm_var": 14.20625, + "learning_rate": 0.0001, + "loss": 7.5013, + "loss/crossentropy": 2.0573098927736284, + "loss/hidden": 3.3515625, + "loss/jsd": 0.0, + "loss/logits": 0.18116160985082388, + "step": 3020 + }, + { + "epoch": 0.07575, + "grad_norm": 31.375, + "grad_norm_var": 16.983333333333334, + "learning_rate": 0.0001, + "loss": 7.4455, + "loss/crossentropy": 1.9735823571681976, + "loss/hidden": 3.459765625, + "loss/jsd": 0.0, + "loss/logits": 0.19495000168681145, + "step": 3030 + }, + { + "epoch": 0.076, + "grad_norm": 7247757312.0, + "grad_norm_var": 3.2831240991582193e+18, + "learning_rate": 0.0001, + "loss": 7.4881, + "loss/crossentropy": 1.971890377253294, + "loss/hidden": 3.40625, + "loss/jsd": 0.0, + "loss/logits": 0.18015608433634042, + "step": 3040 + }, + { + "epoch": 0.07625, + "grad_norm": 28.25, + "grad_norm_var": 3.283124098780732e+18, + "learning_rate": 0.0001, + "loss": 7.3664, + "loss/crossentropy": 1.8378953270614147, + "loss/hidden": 3.3609375, + "loss/jsd": 0.0, + "loss/logits": 0.1741427879780531, + "step": 3050 + }, + { + "epoch": 0.0765, + "grad_norm": 31.75, + "grad_norm_var": 1.89140625, + "learning_rate": 0.0001, + "loss": 7.5137, + "loss/crossentropy": 2.141886255145073, + "loss/hidden": 3.443359375, + "loss/jsd": 0.0, + "loss/logits": 0.19584037065505983, + "step": 3060 + }, + { + "epoch": 0.07675, + "grad_norm": 27.25, + "grad_norm_var": 2.4244140625, + "learning_rate": 0.0001, + "loss": 7.4296, + "loss/crossentropy": 2.0373554110527037, + "loss/hidden": 3.5640625, + "loss/jsd": 0.0, + "loss/logits": 0.216986732929945, + "step": 3070 + }, + { + "epoch": 0.077, + "grad_norm": 35.25, + "grad_norm_var": 3.7322265625, + "learning_rate": 0.0001, + "loss": 7.5269, + "loss/crossentropy": 1.975497831404209, + "loss/hidden": 3.333984375, + "loss/jsd": 0.0, + "loss/logits": 0.1780722170136869, + "step": 3080 + }, + { + "epoch": 0.07725, + "grad_norm": 32.75, + "grad_norm_var": 3.6895182291666666, + "learning_rate": 0.0001, + "loss": 7.4938, + "loss/crossentropy": 2.151789793372154, + "loss/hidden": 3.502734375, + "loss/jsd": 0.0, + "loss/logits": 0.21854450944811105, + "step": 3090 + }, + { + "epoch": 0.0775, + "grad_norm": 29.5, + "grad_norm_var": 6.82265625, + "learning_rate": 0.0001, + "loss": 7.4321, + "loss/crossentropy": 1.9484706297516823, + "loss/hidden": 3.506640625, + "loss/jsd": 0.0, + "loss/logits": 0.19896488767117262, + "step": 3100 + }, + { + "epoch": 0.07775, + "grad_norm": 29.75, + "grad_norm_var": 3.0780598958333334, + "learning_rate": 0.0001, + "loss": 7.5471, + "loss/crossentropy": 2.165594828128815, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.20095103643834591, + "step": 3110 + }, + { + "epoch": 0.078, + "grad_norm": 29.0, + "grad_norm_var": 2.2197916666666666, + "learning_rate": 0.0001, + "loss": 7.6334, + "loss/crossentropy": 2.1854751259088516, + "loss/hidden": 3.4640625, + "loss/jsd": 0.0, + "loss/logits": 0.21246263310313224, + "step": 3120 + }, + { + "epoch": 0.07825, + "grad_norm": 29.0, + "grad_norm_var": 3.71640625, + "learning_rate": 0.0001, + "loss": 7.4278, + "loss/crossentropy": 1.914103902876377, + "loss/hidden": 3.451953125, + "loss/jsd": 0.0, + "loss/logits": 0.18373754434287548, + "step": 3130 + }, + { + "epoch": 0.0785, + "grad_norm": 29.0, + "grad_norm_var": 1.2952473958333333, + "learning_rate": 0.0001, + "loss": 7.4487, + "loss/crossentropy": 1.9421842776238918, + "loss/hidden": 3.5296875, + "loss/jsd": 0.0, + "loss/logits": 0.19919300880283117, + "step": 3140 + }, + { + "epoch": 0.07875, + "grad_norm": 29.375, + "grad_norm_var": 1.8268229166666667, + "learning_rate": 0.0001, + "loss": 7.5818, + "loss/crossentropy": 2.0765694811940194, + "loss/hidden": 3.5171875, + "loss/jsd": 0.0, + "loss/logits": 0.19946561977267266, + "step": 3150 + }, + { + "epoch": 0.079, + "grad_norm": 28.125, + "grad_norm_var": 11.483268229166667, + "learning_rate": 0.0001, + "loss": 7.4372, + "loss/crossentropy": 2.013955050334334, + "loss/hidden": 3.4078125, + "loss/jsd": 0.0, + "loss/logits": 0.20109358858317136, + "step": 3160 + }, + { + "epoch": 0.07925, + "grad_norm": 28.875, + "grad_norm_var": 12.871809895833334, + "learning_rate": 0.0001, + "loss": 7.4606, + "loss/crossentropy": 2.2802242666482924, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.21229397617280482, + "step": 3170 + }, + { + "epoch": 0.0795, + "grad_norm": 28.375, + "grad_norm_var": 1.6301432291666667, + "learning_rate": 0.0001, + "loss": 7.4691, + "loss/crossentropy": 2.134338477253914, + "loss/hidden": 3.404296875, + "loss/jsd": 0.0, + "loss/logits": 0.18632632456719875, + "step": 3180 + }, + { + "epoch": 0.07975, + "grad_norm": 30.625, + "grad_norm_var": 2.6113932291666666, + "learning_rate": 0.0001, + "loss": 7.4903, + "loss/crossentropy": 2.192245528101921, + "loss/hidden": 3.381640625, + "loss/jsd": 0.0, + "loss/logits": 0.19276445377618073, + "step": 3190 + }, + { + "epoch": 0.08, + "grad_norm": 27.875, + "grad_norm_var": 2.6830729166666667, + "learning_rate": 0.0001, + "loss": 7.4715, + "loss/crossentropy": 2.1333388604223726, + "loss/hidden": 3.376953125, + "loss/jsd": 0.0, + "loss/logits": 0.1902673264965415, + "step": 3200 + }, + { + "epoch": 0.08025, + "grad_norm": 29.625, + "grad_norm_var": 2.7072265625, + "learning_rate": 0.0001, + "loss": 7.4646, + "loss/crossentropy": 2.1069626569747926, + "loss/hidden": 3.374609375, + "loss/jsd": 0.0, + "loss/logits": 0.18933899328112602, + "step": 3210 + }, + { + "epoch": 0.0805, + "grad_norm": 33.0, + "grad_norm_var": 1.6457682291666667, + "learning_rate": 0.0001, + "loss": 7.3771, + "loss/crossentropy": 2.143903985619545, + "loss/hidden": 3.34921875, + "loss/jsd": 0.0, + "loss/logits": 0.19841080270707606, + "step": 3220 + }, + { + "epoch": 0.08075, + "grad_norm": 29.5, + "grad_norm_var": 2.405143229166667, + "learning_rate": 0.0001, + "loss": 7.4629, + "loss/crossentropy": 1.9501185864210129, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.2003694986924529, + "step": 3230 + }, + { + "epoch": 0.081, + "grad_norm": 35.0, + "grad_norm_var": 3.4619140625, + "learning_rate": 0.0001, + "loss": 7.6085, + "loss/crossentropy": 2.1099744185805323, + "loss/hidden": 3.33515625, + "loss/jsd": 0.0, + "loss/logits": 0.1865939747542143, + "step": 3240 + }, + { + "epoch": 0.08125, + "grad_norm": 38.0, + "grad_norm_var": 15.54140625, + "learning_rate": 0.0001, + "loss": 7.4858, + "loss/crossentropy": 1.8915734700858593, + "loss/hidden": 3.550390625, + "loss/jsd": 0.0, + "loss/logits": 0.20414282865822314, + "step": 3250 + }, + { + "epoch": 0.0815, + "grad_norm": 31.875, + "grad_norm_var": 15.074934895833334, + "learning_rate": 0.0001, + "loss": 7.4995, + "loss/crossentropy": 2.0746393710374833, + "loss/hidden": 3.448046875, + "loss/jsd": 0.0, + "loss/logits": 0.19025763403624296, + "step": 3260 + }, + { + "epoch": 0.08175, + "grad_norm": 29.625, + "grad_norm_var": 4.532291666666667, + "learning_rate": 0.0001, + "loss": 7.4517, + "loss/crossentropy": 2.201898355782032, + "loss/hidden": 3.38515625, + "loss/jsd": 0.0, + "loss/logits": 0.1851862959563732, + "step": 3270 + }, + { + "epoch": 0.082, + "grad_norm": 32.25, + "grad_norm_var": 9.199739583333333, + "learning_rate": 0.0001, + "loss": 7.4085, + "loss/crossentropy": 1.9774614453315735, + "loss/hidden": 3.5078125, + "loss/jsd": 0.0, + "loss/logits": 0.1853517958894372, + "step": 3280 + }, + { + "epoch": 0.08225, + "grad_norm": 31.0, + "grad_norm_var": 13.801497395833334, + "learning_rate": 0.0001, + "loss": 7.4065, + "loss/crossentropy": 2.1263367265462874, + "loss/hidden": 3.412109375, + "loss/jsd": 0.0, + "loss/logits": 0.18529028967022895, + "step": 3290 + }, + { + "epoch": 0.0825, + "grad_norm": 29.5, + "grad_norm_var": 2.967643229166667, + "learning_rate": 0.0001, + "loss": 7.4165, + "loss/crossentropy": 2.193544697761536, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19897244460880756, + "step": 3300 + }, + { + "epoch": 0.08275, + "grad_norm": 33.75, + "grad_norm_var": 9.687239583333334, + "learning_rate": 0.0001, + "loss": 7.5716, + "loss/crossentropy": 2.0868531957268717, + "loss/hidden": 3.616796875, + "loss/jsd": 0.0, + "loss/logits": 0.21278488002717494, + "step": 3310 + }, + { + "epoch": 0.083, + "grad_norm": 31.0, + "grad_norm_var": 7.9478515625, + "learning_rate": 0.0001, + "loss": 7.5543, + "loss/crossentropy": 2.1392074063420297, + "loss/hidden": 3.395703125, + "loss/jsd": 0.0, + "loss/logits": 0.21113577168434858, + "step": 3320 + }, + { + "epoch": 0.08325, + "grad_norm": 30.0, + "grad_norm_var": 2.0268229166666667, + "learning_rate": 0.0001, + "loss": 7.4454, + "loss/crossentropy": 2.0691144198179243, + "loss/hidden": 3.341796875, + "loss/jsd": 0.0, + "loss/logits": 0.20186964478343725, + "step": 3330 + }, + { + "epoch": 0.0835, + "grad_norm": 31.5, + "grad_norm_var": 2.6211653265769103e+18, + "learning_rate": 0.0001, + "loss": 7.4481, + "loss/crossentropy": 2.0832756504416468, + "loss/hidden": 3.423046875, + "loss/jsd": 0.0, + "loss/logits": 0.19915037509053946, + "step": 3340 + }, + { + "epoch": 0.08375, + "grad_norm": 32.5, + "grad_norm_var": 2.621165324337292e+18, + "learning_rate": 0.0001, + "loss": 7.3606, + "loss/crossentropy": 2.102260760962963, + "loss/hidden": 3.372265625, + "loss/jsd": 0.0, + "loss/logits": 0.19333885367959738, + "step": 3350 + }, + { + "epoch": 0.084, + "grad_norm": 29.25, + "grad_norm_var": 85.575, + "learning_rate": 0.0001, + "loss": 7.4073, + "loss/crossentropy": 2.149528594315052, + "loss/hidden": 3.491796875, + "loss/jsd": 0.0, + "loss/logits": 0.2177526842802763, + "step": 3360 + }, + { + "epoch": 0.08425, + "grad_norm": 30.25, + "grad_norm_var": 2.8645833333333335, + "learning_rate": 0.0001, + "loss": 7.4642, + "loss/crossentropy": 2.085590344667435, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.17804578468203544, + "step": 3370 + }, + { + "epoch": 0.0845, + "grad_norm": 33.25, + "grad_norm_var": 2.996875, + "learning_rate": 0.0001, + "loss": 7.3953, + "loss/crossentropy": 2.0975965946912765, + "loss/hidden": 3.3, + "loss/jsd": 0.0, + "loss/logits": 0.1847201505675912, + "step": 3380 + }, + { + "epoch": 0.08475, + "grad_norm": 32.0, + "grad_norm_var": 2.470572916666667, + "learning_rate": 0.0001, + "loss": 7.4553, + "loss/crossentropy": 2.1018140748143197, + "loss/hidden": 3.36796875, + "loss/jsd": 0.0, + "loss/logits": 0.18250287007540464, + "step": 3390 + }, + { + "epoch": 0.085, + "grad_norm": 30.25, + "grad_norm_var": 2.887955729166667, + "learning_rate": 0.0001, + "loss": 7.5238, + "loss/crossentropy": 2.1050665065646172, + "loss/hidden": 3.501953125, + "loss/jsd": 0.0, + "loss/logits": 0.2124734738841653, + "step": 3400 + }, + { + "epoch": 0.08525, + "grad_norm": 30.0, + "grad_norm_var": 1.7143229166666667, + "learning_rate": 0.0001, + "loss": 7.2754, + "loss/crossentropy": 2.0948296964168547, + "loss/hidden": 3.37421875, + "loss/jsd": 0.0, + "loss/logits": 0.178215317055583, + "step": 3410 + }, + { + "epoch": 0.0855, + "grad_norm": 30.875, + "grad_norm_var": 5.354622395833333, + "learning_rate": 0.0001, + "loss": 7.4191, + "loss/crossentropy": 2.0418393671512605, + "loss/hidden": 3.4734375, + "loss/jsd": 0.0, + "loss/logits": 0.18740264605730772, + "step": 3420 + }, + { + "epoch": 0.08575, + "grad_norm": 32.25, + "grad_norm_var": 6.430989583333333, + "learning_rate": 0.0001, + "loss": 7.5642, + "loss/crossentropy": 2.0279636546969413, + "loss/hidden": 3.55859375, + "loss/jsd": 0.0, + "loss/logits": 0.20154636316001415, + "step": 3430 + }, + { + "epoch": 0.086, + "grad_norm": 29.125, + "grad_norm_var": 53.64791666666667, + "learning_rate": 0.0001, + "loss": 7.485, + "loss/crossentropy": 2.0705729112029077, + "loss/hidden": 3.456640625, + "loss/jsd": 0.0, + "loss/logits": 0.22035282999277114, + "step": 3440 + }, + { + "epoch": 0.08625, + "grad_norm": 30.375, + "grad_norm_var": 5.54765625, + "learning_rate": 0.0001, + "loss": 7.428, + "loss/crossentropy": 1.9830067940056324, + "loss/hidden": 3.376171875, + "loss/jsd": 0.0, + "loss/logits": 0.19354272997006775, + "step": 3450 + }, + { + "epoch": 0.0865, + "grad_norm": 28.375, + "grad_norm_var": 2.758736220726598e+18, + "learning_rate": 0.0001, + "loss": 7.4342, + "loss/crossentropy": 2.1590976014733316, + "loss/hidden": 3.489453125, + "loss/jsd": 0.0, + "loss/logits": 0.20231232214719058, + "step": 3460 + }, + { + "epoch": 0.08675, + "grad_norm": 29.125, + "grad_norm_var": 2.470572916666667, + "learning_rate": 0.0001, + "loss": 7.3376, + "loss/crossentropy": 2.108407254517078, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.18425025548785925, + "step": 3470 + }, + { + "epoch": 0.087, + "grad_norm": 32.5, + "grad_norm_var": 19.315559895833335, + "learning_rate": 0.0001, + "loss": 7.391, + "loss/crossentropy": 2.086346108466387, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.19492445401847364, + "step": 3480 + }, + { + "epoch": 0.08725, + "grad_norm": 30.75, + "grad_norm_var": 3.9009765625, + "learning_rate": 0.0001, + "loss": 7.454, + "loss/crossentropy": 2.0728737086057665, + "loss/hidden": 3.474609375, + "loss/jsd": 0.0, + "loss/logits": 0.21246139723807572, + "step": 3490 + }, + { + "epoch": 0.0875, + "grad_norm": 53.25, + "grad_norm_var": 34.962955729166666, + "learning_rate": 0.0001, + "loss": 7.4001, + "loss/crossentropy": 1.9173476293683052, + "loss/hidden": 3.330859375, + "loss/jsd": 0.0, + "loss/logits": 0.18263984741643072, + "step": 3500 + }, + { + "epoch": 0.08775, + "grad_norm": 29.875, + "grad_norm_var": 36.22389322916667, + "learning_rate": 0.0001, + "loss": 7.5855, + "loss/crossentropy": 1.9761252515017986, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.20959299746900797, + "step": 3510 + }, + { + "epoch": 0.088, + "grad_norm": 32.25, + "grad_norm_var": 17.7337890625, + "learning_rate": 0.0001, + "loss": 7.4728, + "loss/crossentropy": 2.0416554152965545, + "loss/hidden": 3.4703125, + "loss/jsd": 0.0, + "loss/logits": 0.19014756735414268, + "step": 3520 + }, + { + "epoch": 0.08825, + "grad_norm": 29.375, + "grad_norm_var": 14.664322916666666, + "learning_rate": 0.0001, + "loss": 7.5608, + "loss/crossentropy": 2.2834356099367143, + "loss/hidden": 3.3953125, + "loss/jsd": 0.0, + "loss/logits": 0.19908843878656626, + "step": 3530 + }, + { + "epoch": 0.0885, + "grad_norm": 31.875, + "grad_norm_var": 2.6702473958333335, + "learning_rate": 0.0001, + "loss": 7.4804, + "loss/crossentropy": 2.0417330890893934, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.20852382443845272, + "step": 3540 + }, + { + "epoch": 0.08875, + "grad_norm": 31.625, + "grad_norm_var": 2.460724589971584e+18, + "learning_rate": 0.0001, + "loss": 7.5559, + "loss/crossentropy": 2.1676768481731417, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.1989177169278264, + "step": 3550 + }, + { + "epoch": 0.089, + "grad_norm": 30.0, + "grad_norm_var": 6.881705729166667, + "learning_rate": 0.0001, + "loss": 7.4678, + "loss/crossentropy": 2.221273897588253, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.19402988757938147, + "step": 3560 + }, + { + "epoch": 0.08925, + "grad_norm": 31.375, + "grad_norm_var": 7.732747395833333, + "learning_rate": 0.0001, + "loss": 7.4508, + "loss/crossentropy": 2.1802149415016174, + "loss/hidden": 3.43828125, + "loss/jsd": 0.0, + "loss/logits": 0.20121808685362338, + "step": 3570 + }, + { + "epoch": 0.0895, + "grad_norm": 52.5, + "grad_norm_var": 30.9775390625, + "learning_rate": 0.0001, + "loss": 7.3982, + "loss/crossentropy": 2.085124118626118, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.18448642026633025, + "step": 3580 + }, + { + "epoch": 0.08975, + "grad_norm": 30.875, + "grad_norm_var": 32.91295572916667, + "learning_rate": 0.0001, + "loss": 7.4381, + "loss/crossentropy": 2.1467449337244036, + "loss/hidden": 3.3734375, + "loss/jsd": 0.0, + "loss/logits": 0.19393185302615165, + "step": 3590 + }, + { + "epoch": 0.09, + "grad_norm": 29.25, + "grad_norm_var": 1.4708333333333334, + "learning_rate": 0.0001, + "loss": 7.415, + "loss/crossentropy": 2.0135369554162024, + "loss/hidden": 3.37109375, + "loss/jsd": 0.0, + "loss/logits": 0.18443848174065353, + "step": 3600 + }, + { + "epoch": 0.09025, + "grad_norm": 31.375, + "grad_norm_var": 6.1962890625, + "learning_rate": 0.0001, + "loss": 7.4028, + "loss/crossentropy": 2.1443901300430297, + "loss/hidden": 3.440234375, + "loss/jsd": 0.0, + "loss/logits": 0.2054579086601734, + "step": 3610 + }, + { + "epoch": 0.0905, + "grad_norm": 26.5, + "grad_norm_var": 3.562239583333333, + "learning_rate": 0.0001, + "loss": 7.3255, + "loss/crossentropy": 1.799356396496296, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.17441922090947629, + "step": 3620 + }, + { + "epoch": 0.09075, + "grad_norm": 31.375, + "grad_norm_var": 2.2083333333333335, + "learning_rate": 0.0001, + "loss": 7.4272, + "loss/crossentropy": 1.9925116747617722, + "loss/hidden": 3.52578125, + "loss/jsd": 0.0, + "loss/logits": 0.21653544921427964, + "step": 3630 + }, + { + "epoch": 0.091, + "grad_norm": 30.125, + "grad_norm_var": 0.6125, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.135761073231697, + "loss/hidden": 3.4140625, + "loss/jsd": 0.0, + "loss/logits": 0.18989351522177458, + "step": 3640 + }, + { + "epoch": 0.09125, + "grad_norm": 31.375, + "grad_norm_var": 1.4330729166666667, + "learning_rate": 0.0001, + "loss": 7.4505, + "loss/crossentropy": 2.0986070543527604, + "loss/hidden": 3.334375, + "loss/jsd": 0.0, + "loss/logits": 0.18352905213832854, + "step": 3650 + }, + { + "epoch": 0.0915, + "grad_norm": 29.625, + "grad_norm_var": 2.5869140625, + "learning_rate": 0.0001, + "loss": 7.4199, + "loss/crossentropy": 2.1555575743317603, + "loss/hidden": 3.402734375, + "loss/jsd": 0.0, + "loss/logits": 0.19261632524430752, + "step": 3660 + }, + { + "epoch": 0.09175, + "grad_norm": 31.5, + "grad_norm_var": 2.371875, + "learning_rate": 0.0001, + "loss": 7.5463, + "loss/crossentropy": 2.1411691516637803, + "loss/hidden": 3.446875, + "loss/jsd": 0.0, + "loss/logits": 0.2046652188524604, + "step": 3670 + }, + { + "epoch": 0.092, + "grad_norm": 30.625, + "grad_norm_var": 4.703580729166666, + "learning_rate": 0.0001, + "loss": 7.404, + "loss/crossentropy": 2.142404294013977, + "loss/hidden": 3.445703125, + "loss/jsd": 0.0, + "loss/logits": 0.20414466112852098, + "step": 3680 + }, + { + "epoch": 0.09225, + "grad_norm": 30.375, + "grad_norm_var": 3.25625, + "learning_rate": 0.0001, + "loss": 7.4774, + "loss/crossentropy": 2.187901920080185, + "loss/hidden": 3.480859375, + "loss/jsd": 0.0, + "loss/logits": 0.21911972090601922, + "step": 3690 + }, + { + "epoch": 0.0925, + "grad_norm": 31.875, + "grad_norm_var": 1.2166666666666666, + "learning_rate": 0.0001, + "loss": 7.5965, + "loss/crossentropy": 2.086391404271126, + "loss/hidden": 3.438671875, + "loss/jsd": 0.0, + "loss/logits": 0.2020766455680132, + "step": 3700 + }, + { + "epoch": 0.09275, + "grad_norm": 30.625, + "grad_norm_var": 2.147330729166667, + "learning_rate": 0.0001, + "loss": 7.4579, + "loss/crossentropy": 2.09081457182765, + "loss/hidden": 3.369140625, + "loss/jsd": 0.0, + "loss/logits": 0.1868050311692059, + "step": 3710 + }, + { + "epoch": 0.093, + "grad_norm": 34.25, + "grad_norm_var": 2.467643229166667, + "learning_rate": 0.0001, + "loss": 7.522, + "loss/crossentropy": 2.12264247238636, + "loss/hidden": 3.453125, + "loss/jsd": 0.0, + "loss/logits": 0.18927707765251398, + "step": 3720 + }, + { + "epoch": 0.09325, + "grad_norm": 32.25, + "grad_norm_var": 3.981184895833333, + "learning_rate": 0.0001, + "loss": 7.4155, + "loss/crossentropy": 2.1118928104639054, + "loss/hidden": 3.44765625, + "loss/jsd": 0.0, + "loss/logits": 0.19489197488874196, + "step": 3730 + }, + { + "epoch": 0.0935, + "grad_norm": 34.0, + "grad_norm_var": 5.312434895833333, + "learning_rate": 0.0001, + "loss": 7.5053, + "loss/crossentropy": 2.1360882744193077, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.19313989579677582, + "step": 3740 + }, + { + "epoch": 0.09375, + "grad_norm": 29.125, + "grad_norm_var": 4.549739583333333, + "learning_rate": 0.0001, + "loss": 7.3275, + "loss/crossentropy": 2.010613538324833, + "loss/hidden": 3.355859375, + "loss/jsd": 0.0, + "loss/logits": 0.18421147018671036, + "step": 3750 + }, + { + "epoch": 0.094, + "grad_norm": 31.625, + "grad_norm_var": 1.5541666666666667, + "learning_rate": 0.0001, + "loss": 7.4784, + "loss/crossentropy": 2.1465295113623144, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.18987073097378016, + "step": 3760 + }, + { + "epoch": 0.09425, + "grad_norm": 32.75, + "grad_norm_var": 1.9018229166666667, + "learning_rate": 0.0001, + "loss": 7.3495, + "loss/crossentropy": 2.17747982442379, + "loss/hidden": 3.48046875, + "loss/jsd": 0.0, + "loss/logits": 0.2016214355826378, + "step": 3770 + }, + { + "epoch": 0.0945, + "grad_norm": 30.875, + "grad_norm_var": 3.088997395833333, + "learning_rate": 0.0001, + "loss": 7.5384, + "loss/crossentropy": 2.179350584745407, + "loss/hidden": 3.350390625, + "loss/jsd": 0.0, + "loss/logits": 0.19142594784498215, + "step": 3780 + }, + { + "epoch": 0.09475, + "grad_norm": 29.625, + "grad_norm_var": 1.1559895833333333, + "learning_rate": 0.0001, + "loss": 7.4035, + "loss/crossentropy": 2.155378046631813, + "loss/hidden": 3.32109375, + "loss/jsd": 0.0, + "loss/logits": 0.19720839541405438, + "step": 3790 + }, + { + "epoch": 0.095, + "grad_norm": 30.625, + "grad_norm_var": 1.1999348958333333, + "learning_rate": 0.0001, + "loss": 7.4441, + "loss/crossentropy": 2.0597486779093743, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.19279775265604257, + "step": 3800 + }, + { + "epoch": 0.09525, + "grad_norm": 33.5, + "grad_norm_var": 2.1666666666666665, + "learning_rate": 0.0001, + "loss": 7.5146, + "loss/crossentropy": 2.1966816753149034, + "loss/hidden": 3.44375, + "loss/jsd": 0.0, + "loss/logits": 0.20174810625612735, + "step": 3810 + }, + { + "epoch": 0.0955, + "grad_norm": 31.5, + "grad_norm_var": 1.9593098958333333, + "learning_rate": 0.0001, + "loss": 7.539, + "loss/crossentropy": 2.165803623199463, + "loss/hidden": 3.34375, + "loss/jsd": 0.0, + "loss/logits": 0.1953417781740427, + "step": 3820 + }, + { + "epoch": 0.09575, + "grad_norm": 32.0, + "grad_norm_var": 6.690625, + "learning_rate": 0.0001, + "loss": 7.514, + "loss/crossentropy": 2.0817860513925552, + "loss/hidden": 3.453515625, + "loss/jsd": 0.0, + "loss/logits": 0.20838446952402592, + "step": 3830 + }, + { + "epoch": 0.096, + "grad_norm": 32.75, + "grad_norm_var": 7.6431640625, + "learning_rate": 0.0001, + "loss": 7.5472, + "loss/crossentropy": 2.231910442560911, + "loss/hidden": 3.442578125, + "loss/jsd": 0.0, + "loss/logits": 0.21717903479002415, + "step": 3840 + }, + { + "epoch": 0.09625, + "grad_norm": 32.0, + "grad_norm_var": 16.134375, + "learning_rate": 0.0001, + "loss": 7.5807, + "loss/crossentropy": 2.0746277555823327, + "loss/hidden": 3.47578125, + "loss/jsd": 0.0, + "loss/logits": 0.20851925816386938, + "step": 3850 + }, + { + "epoch": 0.0965, + "grad_norm": 30.625, + "grad_norm_var": 16.132747395833334, + "learning_rate": 0.0001, + "loss": 7.3749, + "loss/crossentropy": 2.1463438466191294, + "loss/hidden": 3.356640625, + "loss/jsd": 0.0, + "loss/logits": 0.19305863380432128, + "step": 3860 + }, + { + "epoch": 0.09675, + "grad_norm": 32.5, + "grad_norm_var": 1.0895182291666667, + "learning_rate": 0.0001, + "loss": 7.5499, + "loss/crossentropy": 2.2108413323760034, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.20310868676751853, + "step": 3870 + }, + { + "epoch": 0.097, + "grad_norm": 30.75, + "grad_norm_var": 1.4559895833333334, + "learning_rate": 0.0001, + "loss": 7.4788, + "loss/crossentropy": 2.0900154620409013, + "loss/hidden": 3.42734375, + "loss/jsd": 0.0, + "loss/logits": 0.18780422061681748, + "step": 3880 + }, + { + "epoch": 0.09725, + "grad_norm": 30.625, + "grad_norm_var": 13.917643229166666, + "learning_rate": 0.0001, + "loss": 7.4391, + "loss/crossentropy": 2.0574848279356956, + "loss/hidden": 3.3875, + "loss/jsd": 0.0, + "loss/logits": 0.19390027467161416, + "step": 3890 + }, + { + "epoch": 0.0975, + "grad_norm": 27.375, + "grad_norm_var": 13.55, + "learning_rate": 0.0001, + "loss": 7.4327, + "loss/crossentropy": 2.2832688719034193, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.20608801003545524, + "step": 3900 + }, + { + "epoch": 0.09775, + "grad_norm": 29.0, + "grad_norm_var": 3.296875, + "learning_rate": 0.0001, + "loss": 7.3691, + "loss/crossentropy": 1.9183307077735663, + "loss/hidden": 3.37890625, + "loss/jsd": 0.0, + "loss/logits": 0.1917601386550814, + "step": 3910 + }, + { + "epoch": 0.098, + "grad_norm": 34.0, + "grad_norm_var": 3.24765625, + "learning_rate": 0.0001, + "loss": 7.4628, + "loss/crossentropy": 2.0630046002566815, + "loss/hidden": 3.338671875, + "loss/jsd": 0.0, + "loss/logits": 0.1871832549571991, + "step": 3920 + }, + { + "epoch": 0.09825, + "grad_norm": 31.75, + "grad_norm_var": 1.5384765625, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 2.061261148750782, + "loss/hidden": 3.415234375, + "loss/jsd": 0.0, + "loss/logits": 0.18525551967322826, + "step": 3930 + }, + { + "epoch": 0.0985, + "grad_norm": 29.75, + "grad_norm_var": 1.584375, + "learning_rate": 0.0001, + "loss": 7.5498, + "loss/crossentropy": 2.0895790114998816, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.1932330032810569, + "step": 3940 + }, + { + "epoch": 0.09875, + "grad_norm": 30.625, + "grad_norm_var": 25.79765625, + "learning_rate": 0.0001, + "loss": 7.6502, + "loss/crossentropy": 2.1616804771125318, + "loss/hidden": 3.365625, + "loss/jsd": 0.0, + "loss/logits": 0.18905209768563508, + "step": 3950 + }, + { + "epoch": 0.099, + "grad_norm": 30.5, + "grad_norm_var": 28.547916666666666, + "learning_rate": 0.0001, + "loss": 7.3334, + "loss/crossentropy": 2.1435488507151605, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.1910943292081356, + "step": 3960 + }, + { + "epoch": 0.09925, + "grad_norm": 32.75, + "grad_norm_var": 6.3650390625, + "learning_rate": 0.0001, + "loss": 7.542, + "loss/crossentropy": 2.176460310816765, + "loss/hidden": 3.41328125, + "loss/jsd": 0.0, + "loss/logits": 0.18821860365569593, + "step": 3970 + }, + { + "epoch": 0.0995, + "grad_norm": 31.625, + "grad_norm_var": 3.9905598958333335, + "learning_rate": 0.0001, + "loss": 7.5231, + "loss/crossentropy": 2.2077176332473756, + "loss/hidden": 3.4515625, + "loss/jsd": 0.0, + "loss/logits": 0.21911400128155947, + "step": 3980 + }, + { + "epoch": 0.09975, + "grad_norm": 31.125, + "grad_norm_var": 1.75625, + "learning_rate": 0.0001, + "loss": 7.4868, + "loss/crossentropy": 2.105836200714111, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.1997914554551244, + "step": 3990 + }, + { + "epoch": 0.1, + "grad_norm": 38.0, + "grad_norm_var": 4.710416666666666, + "learning_rate": 0.0001, + "loss": 7.5675, + "loss/crossentropy": 2.233233967423439, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.20876242108643056, + "step": 4000 + }, + { + "epoch": 0.10025, + "grad_norm": 28.625, + "grad_norm_var": 7.56640625, + "learning_rate": 0.0001, + "loss": 7.4736, + "loss/crossentropy": 2.103509198874235, + "loss/hidden": 3.413671875, + "loss/jsd": 0.0, + "loss/logits": 0.1953927006572485, + "step": 4010 + }, + { + "epoch": 0.1005, + "grad_norm": 28.875, + "grad_norm_var": 4.119791666666667, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 1.9697775058448315, + "loss/hidden": 3.308984375, + "loss/jsd": 0.0, + "loss/logits": 0.17186311883851885, + "step": 4020 + }, + { + "epoch": 0.10075, + "grad_norm": 29.5, + "grad_norm_var": 1.3177083333333333, + "learning_rate": 0.0001, + "loss": 7.333, + "loss/crossentropy": 2.0519870311021804, + "loss/hidden": 3.42421875, + "loss/jsd": 0.0, + "loss/logits": 0.1872571600601077, + "step": 4030 + }, + { + "epoch": 0.101, + "grad_norm": 29.5, + "grad_norm_var": 1.2785807291666667, + "learning_rate": 0.0001, + "loss": 7.3466, + "loss/crossentropy": 2.0663713179528713, + "loss/hidden": 3.39921875, + "loss/jsd": 0.0, + "loss/logits": 0.18582073990255593, + "step": 4040 + }, + { + "epoch": 0.10125, + "grad_norm": 30.375, + "grad_norm_var": 1.9577473958333333, + "learning_rate": 0.0001, + "loss": 7.3812, + "loss/crossentropy": 2.1256399258971213, + "loss/hidden": 3.30390625, + "loss/jsd": 0.0, + "loss/logits": 0.19628962082788348, + "step": 4050 + }, + { + "epoch": 0.1015, + "grad_norm": 30.625, + "grad_norm_var": 0.53125, + "learning_rate": 0.0001, + "loss": 7.3726, + "loss/crossentropy": 2.1235328309237955, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.18646292947232723, + "step": 4060 + }, + { + "epoch": 0.10175, + "grad_norm": 29.0, + "grad_norm_var": 3.19255952647709e+18, + "learning_rate": 0.0001, + "loss": 7.4564, + "loss/crossentropy": 2.0213126331567763, + "loss/hidden": 3.496875, + "loss/jsd": 0.0, + "loss/logits": 0.19607899691909553, + "step": 4070 + }, + { + "epoch": 0.102, + "grad_norm": 28.75, + "grad_norm_var": 3.48515625, + "learning_rate": 0.0001, + "loss": 7.3886, + "loss/crossentropy": 2.0899658009409903, + "loss/hidden": 3.340625, + "loss/jsd": 0.0, + "loss/logits": 0.1851665174588561, + "step": 4080 + }, + { + "epoch": 0.10225, + "grad_norm": 29.5, + "grad_norm_var": 1.8692057291666666, + "learning_rate": 0.0001, + "loss": 7.4838, + "loss/crossentropy": 2.027493818849325, + "loss/hidden": 3.49765625, + "loss/jsd": 0.0, + "loss/logits": 0.19640162959694862, + "step": 4090 + }, + { + "epoch": 0.1025, + "grad_norm": 29.125, + "grad_norm_var": 11.762434895833334, + "learning_rate": 0.0001, + "loss": 7.5099, + "loss/crossentropy": 2.056584618985653, + "loss/hidden": 3.32265625, + "loss/jsd": 0.0, + "loss/logits": 0.17638762388378382, + "step": 4100 + }, + { + "epoch": 0.10275, + "grad_norm": 30.125, + "grad_norm_var": 12.459375, + "learning_rate": 0.0001, + "loss": 7.5255, + "loss/crossentropy": 2.0713445380330087, + "loss/hidden": 3.36953125, + "loss/jsd": 0.0, + "loss/logits": 0.18587317056953906, + "step": 4110 + }, + { + "epoch": 0.103, + "grad_norm": 33.25, + "grad_norm_var": 1.9958333333333333, + "learning_rate": 0.0001, + "loss": 7.4437, + "loss/crossentropy": 2.2338072419166566, + "loss/hidden": 3.36171875, + "loss/jsd": 0.0, + "loss/logits": 0.18814200926572083, + "step": 4120 + }, + { + "epoch": 0.10325, + "grad_norm": 32.75, + "grad_norm_var": 3.1259765625, + "learning_rate": 0.0001, + "loss": 7.3184, + "loss/crossentropy": 2.0210259817540646, + "loss/hidden": 3.3671875, + "loss/jsd": 0.0, + "loss/logits": 0.18816483654081823, + "step": 4130 + }, + { + "epoch": 0.1035, + "grad_norm": 29.5, + "grad_norm_var": 2.870247395833333, + "learning_rate": 0.0001, + "loss": 7.5124, + "loss/crossentropy": 2.0151045128703116, + "loss/hidden": 3.371484375, + "loss/jsd": 0.0, + "loss/logits": 0.19255878478288652, + "step": 4140 + }, + { + "epoch": 0.10375, + "grad_norm": 30.625, + "grad_norm_var": 1.3926432291666666, + "learning_rate": 0.0001, + "loss": 7.5096, + "loss/crossentropy": 1.9808883003890514, + "loss/hidden": 3.449609375, + "loss/jsd": 0.0, + "loss/logits": 0.19115560222417116, + "step": 4150 + }, + { + "epoch": 0.104, + "grad_norm": 30.75, + "grad_norm_var": 1.6979166666666667, + "learning_rate": 0.0001, + "loss": 7.549, + "loss/crossentropy": 2.1932784736156465, + "loss/hidden": 3.39765625, + "loss/jsd": 0.0, + "loss/logits": 0.20479805655777455, + "step": 4160 + }, + { + "epoch": 0.10425, + "grad_norm": 30.125, + "grad_norm_var": 2.3333333333333335, + "learning_rate": 0.0001, + "loss": 7.3875, + "loss/crossentropy": 1.8820222720503808, + "loss/hidden": 3.337109375, + "loss/jsd": 0.0, + "loss/logits": 0.17310038600116967, + "step": 4170 + }, + { + "epoch": 0.1045, + "grad_norm": 33.0, + "grad_norm_var": 3.7728515625, + "learning_rate": 0.0001, + "loss": 7.4212, + "loss/crossentropy": 2.082476270198822, + "loss/hidden": 3.334765625, + "loss/jsd": 0.0, + "loss/logits": 0.19099258184432982, + "step": 4180 + }, + { + "epoch": 0.10475, + "grad_norm": 30.875, + "grad_norm_var": 11.408268229166667, + "learning_rate": 0.0001, + "loss": 7.4991, + "loss/crossentropy": 2.287242355942726, + "loss/hidden": 3.375390625, + "loss/jsd": 0.0, + "loss/logits": 0.1982285875827074, + "step": 4190 + }, + { + "epoch": 0.105, + "grad_norm": 28.75, + "grad_norm_var": 2.999739583333333, + "learning_rate": 0.0001, + "loss": 7.5959, + "loss/crossentropy": 2.1783332407474516, + "loss/hidden": 3.422265625, + "loss/jsd": 0.0, + "loss/logits": 0.2117959801107645, + "step": 4200 + }, + { + "epoch": 0.10525, + "grad_norm": 30.0, + "grad_norm_var": 4.708268229166666, + "learning_rate": 0.0001, + "loss": 7.3363, + "loss/crossentropy": 1.955865352600813, + "loss/hidden": 3.409375, + "loss/jsd": 0.0, + "loss/logits": 0.18177355360239744, + "step": 4210 + }, + { + "epoch": 0.1055, + "grad_norm": 30.625, + "grad_norm_var": 3.0254557291666666, + "learning_rate": 0.0001, + "loss": 7.4673, + "loss/crossentropy": 1.833389012515545, + "loss/hidden": 3.394921875, + "loss/jsd": 0.0, + "loss/logits": 0.1878132861107588, + "step": 4220 + }, + { + "epoch": 0.10575, + "grad_norm": 32.0, + "grad_norm_var": 3.05, + "learning_rate": 0.0001, + "loss": 7.3969, + "loss/crossentropy": 1.9096243590116502, + "loss/hidden": 3.35625, + "loss/jsd": 0.0, + "loss/logits": 0.17025592969730496, + "step": 4230 + }, + { + "epoch": 0.106, + "grad_norm": 30.875, + "grad_norm_var": 1.82265625, + "learning_rate": 0.0001, + "loss": 7.4638, + "loss/crossentropy": 2.0454175233840943, + "loss/hidden": 3.436328125, + "loss/jsd": 0.0, + "loss/logits": 0.20515710916370153, + "step": 4240 + }, + { + "epoch": 0.10625, + "grad_norm": 30.125, + "grad_norm_var": 3.1333333333333333, + "learning_rate": 0.0001, + "loss": 7.5126, + "loss/crossentropy": 2.089062933623791, + "loss/hidden": 3.4328125, + "loss/jsd": 0.0, + "loss/logits": 0.19156677946448325, + "step": 4250 + }, + { + "epoch": 0.1065, + "grad_norm": 29.0, + "grad_norm_var": 4.311393229166667, + "learning_rate": 0.0001, + "loss": 7.4468, + "loss/crossentropy": 2.0564094200730323, + "loss/hidden": 3.433984375, + "loss/jsd": 0.0, + "loss/logits": 0.19553639348596336, + "step": 4260 + }, + { + "epoch": 0.10675, + "grad_norm": 32.0, + "grad_norm_var": 3.2587890625, + "learning_rate": 0.0001, + "loss": 7.4186, + "loss/crossentropy": 2.13806764036417, + "loss/hidden": 3.3859375, + "loss/jsd": 0.0, + "loss/logits": 0.19822277761995793, + "step": 4270 + }, + { + "epoch": 0.107, + "grad_norm": 28.0, + "grad_norm_var": 1.6926432291666667, + "learning_rate": 0.0001, + "loss": 7.4595, + "loss/crossentropy": 2.0767486467957497, + "loss/hidden": 3.41953125, + "loss/jsd": 0.0, + "loss/logits": 0.1884168043732643, + "step": 4280 + }, + { + "epoch": 0.10725, + "grad_norm": 33.0, + "grad_norm_var": 2.3059895833333335, + "learning_rate": 0.0001, + "loss": 7.4481, + "loss/crossentropy": 2.033916361629963, + "loss/hidden": 3.45, + "loss/jsd": 0.0, + "loss/logits": 0.20558829829096795, + "step": 4290 + }, + { + "epoch": 0.1075, + "grad_norm": 31.0, + "grad_norm_var": 2.9375, + "learning_rate": 0.0001, + "loss": 7.4871, + "loss/crossentropy": 2.078028707951307, + "loss/hidden": 3.37578125, + "loss/jsd": 0.0, + "loss/logits": 0.188079852424562, + "step": 4300 + }, + { + "epoch": 0.10775, + "grad_norm": 33.25, + "grad_norm_var": 2.1020833333333333, + "learning_rate": 0.0001, + "loss": 7.5379, + "loss/crossentropy": 2.003500834107399, + "loss/hidden": 3.544921875, + "loss/jsd": 0.0, + "loss/logits": 0.20521650360897184, + "step": 4310 + }, + { + "epoch": 0.108, + "grad_norm": 29.625, + "grad_norm_var": 2.8447916666666666, + "learning_rate": 0.0001, + "loss": 7.3536, + "loss/crossentropy": 2.043112625181675, + "loss/hidden": 3.380859375, + "loss/jsd": 0.0, + "loss/logits": 0.19910661596804857, + "step": 4320 + }, + { + "epoch": 0.10825, + "grad_norm": 28.5, + "grad_norm_var": 4.000455729166666, + "learning_rate": 0.0001, + "loss": 7.3717, + "loss/crossentropy": 2.1422011658549307, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.19375871792435645, + "step": 4330 + }, + { + "epoch": 0.1085, + "grad_norm": 29.0, + "grad_norm_var": 3.6259765625, + "learning_rate": 0.0001, + "loss": 7.5021, + "loss/crossentropy": 2.131446525454521, + "loss/hidden": 3.480078125, + "loss/jsd": 0.0, + "loss/logits": 0.2063008865341544, + "step": 4340 + }, + { + "epoch": 0.10875, + "grad_norm": 32.0, + "grad_norm_var": 5.9525390625, + "learning_rate": 0.0001, + "loss": 7.4749, + "loss/crossentropy": 2.085691845417023, + "loss/hidden": 3.359375, + "loss/jsd": 0.0, + "loss/logits": 0.1889802658930421, + "step": 4350 + }, + { + "epoch": 0.109, + "grad_norm": 30.75, + "grad_norm_var": 3.154166666666667, + "learning_rate": 0.0001, + "loss": 7.3816, + "loss/crossentropy": 1.8972876839339734, + "loss/hidden": 3.319140625, + "loss/jsd": 0.0, + "loss/logits": 0.17174729090183974, + "step": 4360 + }, + { + "epoch": 0.10925, + "grad_norm": 29.875, + "grad_norm_var": 1.7509765625, + "learning_rate": 0.0001, + "loss": 7.4444, + "loss/crossentropy": 2.127763804793358, + "loss/hidden": 3.401953125, + "loss/jsd": 0.0, + "loss/logits": 0.18679574280977249, + "step": 4370 + }, + { + "epoch": 0.1095, + "grad_norm": 29.875, + "grad_norm_var": 2.16015625, + "learning_rate": 0.0001, + "loss": 7.4682, + "loss/crossentropy": 2.1872297644615175, + "loss/hidden": 3.31328125, + "loss/jsd": 0.0, + "loss/logits": 0.183891461789608, + "step": 4380 + }, + { + "epoch": 0.10975, + "grad_norm": 28.875, + "grad_norm_var": 3.3692057291666666, + "learning_rate": 0.0001, + "loss": 7.429, + "loss/crossentropy": 2.19267495572567, + "loss/hidden": 3.394140625, + "loss/jsd": 0.0, + "loss/logits": 0.20111876968294382, + "step": 4390 + }, + { + "epoch": 0.11, + "grad_norm": 29.375, + "grad_norm_var": 1.6858723958333333, + "learning_rate": 0.0001, + "loss": 7.556, + "loss/crossentropy": 2.1324411287903784, + "loss/hidden": 3.4359375, + "loss/jsd": 0.0, + "loss/logits": 0.2090261412784457, + "step": 4400 + }, + { + "epoch": 0.11025, + "grad_norm": 33.5, + "grad_norm_var": 3.374739583333333, + "learning_rate": 0.0001, + "loss": 7.4081, + "loss/crossentropy": 1.9800483137369156, + "loss/hidden": 3.584375, + "loss/jsd": 0.0, + "loss/logits": 0.19881114605814218, + "step": 4410 + }, + { + "epoch": 0.1105, + "grad_norm": 31.75, + "grad_norm_var": 4.13515625, + "learning_rate": 0.0001, + "loss": 7.4904, + "loss/crossentropy": 2.053773292154074, + "loss/hidden": 3.323828125, + "loss/jsd": 0.0, + "loss/logits": 0.18270381446927786, + "step": 4420 + }, + { + "epoch": 0.11075, + "grad_norm": 30.75, + "grad_norm_var": 2.3499348958333335, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.0641689248383046, + "loss/hidden": 3.40390625, + "loss/jsd": 0.0, + "loss/logits": 0.19245190378278493, + "step": 4430 + }, + { + "epoch": 0.111, + "grad_norm": 33.25, + "grad_norm_var": 3.158333333333333, + "learning_rate": 0.0001, + "loss": 7.3576, + "loss/crossentropy": 2.073286408931017, + "loss/hidden": 3.37265625, + "loss/jsd": 0.0, + "loss/logits": 0.1892416624352336, + "step": 4440 + }, + { + "epoch": 0.11125, + "grad_norm": 35.75, + "grad_norm_var": 6.167122395833333, + "learning_rate": 0.0001, + "loss": 7.456, + "loss/crossentropy": 2.191167525947094, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.19327596500515937, + "step": 4450 + }, + { + "epoch": 0.1115, + "grad_norm": 28.0, + "grad_norm_var": 6.762239583333334, + "learning_rate": 0.0001, + "loss": 7.4254, + "loss/crossentropy": 1.9917161837220192, + "loss/hidden": 3.48828125, + "loss/jsd": 0.0, + "loss/logits": 0.18673346154391765, + "step": 4460 + }, + { + "epoch": 0.11175, + "grad_norm": 31.0, + "grad_norm_var": 2.763541666666667, + "learning_rate": 0.0001, + "loss": 7.4458, + "loss/crossentropy": 2.0167058646678924, + "loss/hidden": 3.477734375, + "loss/jsd": 0.0, + "loss/logits": 0.20151916183531285, + "step": 4470 + }, + { + "epoch": 0.112, + "grad_norm": 30.5, + "grad_norm_var": 7.175455729166667, + "learning_rate": 0.0001, + "loss": 7.4057, + "loss/crossentropy": 2.013149876892567, + "loss/hidden": 3.405859375, + "loss/jsd": 0.0, + "loss/logits": 0.1819242848083377, + "step": 4480 + }, + { + "epoch": 0.11225, + "grad_norm": 43.25, + "grad_norm_var": 13.478580729166667, + "learning_rate": 0.0001, + "loss": 7.4416, + "loss/crossentropy": 2.111778366565704, + "loss/hidden": 3.4, + "loss/jsd": 0.0, + "loss/logits": 0.20088088884949684, + "step": 4490 + }, + { + "epoch": 0.1125, + "grad_norm": 30.125, + "grad_norm_var": 11.905143229166667, + "learning_rate": 0.0001, + "loss": 7.4435, + "loss/crossentropy": 2.0223396182060243, + "loss/hidden": 3.408984375, + "loss/jsd": 0.0, + "loss/logits": 0.1967620700597763, + "step": 4500 + }, + { + "epoch": 0.11275, + "grad_norm": 28.375, + "grad_norm_var": 2.2978515625, + "learning_rate": 0.0001, + "loss": 7.3969, + "loss/crossentropy": 1.9966137878596784, + "loss/hidden": 3.416796875, + "loss/jsd": 0.0, + "loss/logits": 0.19062119219452142, + "step": 4510 + }, + { + "epoch": 0.113, + "grad_norm": 29.75, + "grad_norm_var": 3.1759765625, + "learning_rate": 0.0001, + "loss": 7.2845, + "loss/crossentropy": 1.8878834903240205, + "loss/hidden": 3.346484375, + "loss/jsd": 0.0, + "loss/logits": 0.16922880560159684, + "step": 4520 + }, + { + "epoch": 0.11325, + "grad_norm": 33.5, + "grad_norm_var": 3.78515625, + "learning_rate": 0.0001, + "loss": 7.5223, + "loss/crossentropy": 2.0424712359905244, + "loss/hidden": 3.42109375, + "loss/jsd": 0.0, + "loss/logits": 0.18261839263141155, + "step": 4530 + }, + { + "epoch": 0.1135, + "grad_norm": 41.75, + "grad_norm_var": 13.172330729166667, + "learning_rate": 0.0001, + "loss": 7.4917, + "loss/crossentropy": 2.1800880253314974, + "loss/hidden": 3.419140625, + "loss/jsd": 0.0, + "loss/logits": 0.1875661849975586, + "step": 4540 + }, + { + "epoch": 0.11375, + "grad_norm": 29.5, + "grad_norm_var": 13.737239583333333, + "learning_rate": 0.0001, + "loss": 7.4929, + "loss/crossentropy": 2.1130245834589005, + "loss/hidden": 3.514453125, + "loss/jsd": 0.0, + "loss/logits": 0.20742647554725407, + "step": 4550 + }, + { + "epoch": 0.114, + "grad_norm": 31.875, + "grad_norm_var": 3.1447265625, + "learning_rate": 0.0001, + "loss": 7.4885, + "loss/crossentropy": 2.0878429099917413, + "loss/hidden": 3.4625, + "loss/jsd": 0.0, + "loss/logits": 0.19807947240769863, + "step": 4560 + }, + { + "epoch": 0.11425, + "grad_norm": 32.0, + "grad_norm_var": 1.9080729166666666, + "learning_rate": 0.0001, + "loss": 7.412, + "loss/crossentropy": 2.045598204433918, + "loss/hidden": 3.43984375, + "loss/jsd": 0.0, + "loss/logits": 0.19935160782188177, + "step": 4570 + }, + { + "epoch": 0.1145, + "grad_norm": 31.25, + "grad_norm_var": 2.703285650940459e+18, + "learning_rate": 0.0001, + "loss": 7.4112, + "loss/crossentropy": 1.9612677067518234, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.1939171139150858, + "step": 4580 + }, + { + "epoch": 0.11475, + "grad_norm": 30.125, + "grad_norm_var": 9.067708333333334, + "learning_rate": 0.0001, + "loss": 7.4109, + "loss/crossentropy": 2.066862888634205, + "loss/hidden": 3.440625, + "loss/jsd": 0.0, + "loss/logits": 0.20057452656328678, + "step": 4590 + }, + { + "epoch": 0.115, + "grad_norm": 29.25, + "grad_norm_var": 6.670833333333333, + "learning_rate": 0.0001, + "loss": 7.3857, + "loss/crossentropy": 2.0378803849220275, + "loss/hidden": 3.495703125, + "loss/jsd": 0.0, + "loss/logits": 0.19217969439923763, + "step": 4600 + }, + { + "epoch": 0.11525, + "grad_norm": 32.0, + "grad_norm_var": 8.108268229166667, + "learning_rate": 0.0001, + "loss": 7.4449, + "loss/crossentropy": 1.9883966132998467, + "loss/hidden": 3.378515625, + "loss/jsd": 0.0, + "loss/logits": 0.1796421378850937, + "step": 4610 + }, + { + "epoch": 0.1155, + "grad_norm": 28.5, + "grad_norm_var": 2.8853515625, + "learning_rate": 0.0001, + "loss": 7.43, + "loss/crossentropy": 2.2122382700443266, + "loss/hidden": 3.434765625, + "loss/jsd": 0.0, + "loss/logits": 0.20737907551229, + "step": 4620 + }, + { + "epoch": 0.11575, + "grad_norm": 30.375, + "grad_norm_var": 3.7968098958333334, + "learning_rate": 0.0001, + "loss": 7.3858, + "loss/crossentropy": 2.0896764233708383, + "loss/hidden": 3.540234375, + "loss/jsd": 0.0, + "loss/logits": 0.20905990786850454, + "step": 4630 + }, + { + "epoch": 0.116, + "grad_norm": 27.5, + "grad_norm_var": 3.6879557291666667, + "learning_rate": 0.0001, + "loss": 7.5145, + "loss/crossentropy": 2.104724445939064, + "loss/hidden": 3.3796875, + "loss/jsd": 0.0, + "loss/logits": 0.19548750538378953, + "step": 4640 + }, + { + "epoch": 0.11625, + "grad_norm": 29.875, + "grad_norm_var": 8.7056640625, + "learning_rate": 0.0001, + "loss": 7.4009, + "loss/crossentropy": 2.155320603400469, + "loss/hidden": 3.47578125, + "loss/jsd": 0.0, + "loss/logits": 0.2002986514940858, + "step": 4650 + }, + { + "epoch": 0.1165, + "grad_norm": 27.0, + "grad_norm_var": 5.1541015625, + "learning_rate": 0.0001, + "loss": 7.3193, + "loss/crossentropy": 2.085461828112602, + "loss/hidden": 3.38671875, + "loss/jsd": 0.0, + "loss/logits": 0.1905359473079443, + "step": 4660 + }, + { + "epoch": 0.11675, + "grad_norm": 30.5, + "grad_norm_var": 1.5926432291666666, + "learning_rate": 0.0001, + "loss": 7.3125, + "loss/crossentropy": 1.9927285239100456, + "loss/hidden": 3.411328125, + "loss/jsd": 0.0, + "loss/logits": 0.17640038076788186, + "step": 4670 + }, + { + "epoch": 0.117, + "grad_norm": 33.75, + "grad_norm_var": 4.747330729166666, + "learning_rate": 0.0001, + "loss": 7.469, + "loss/crossentropy": 2.1633560836315153, + "loss/hidden": 3.324609375, + "loss/jsd": 0.0, + "loss/logits": 0.1862495567649603, + "step": 4680 + }, + { + "epoch": 0.11725, + "grad_norm": 28.25, + "grad_norm_var": 7.198372395833333, + "learning_rate": 0.0001, + "loss": 7.4318, + "loss/crossentropy": 2.2390024289488792, + "loss/hidden": 3.430078125, + "loss/jsd": 0.0, + "loss/logits": 0.2097862558439374, + "step": 4690 + }, + { + "epoch": 0.1175, + "grad_norm": 31.375, + "grad_norm_var": 5.760872395833333, + "learning_rate": 0.0001, + "loss": 7.4669, + "loss/crossentropy": 2.0608770951628683, + "loss/hidden": 3.4390625, + "loss/jsd": 0.0, + "loss/logits": 0.19615320730954408, + "step": 4700 + }, + { + "epoch": 0.11775, + "grad_norm": 34.25, + "grad_norm_var": 4.1894735190686346e+18, + "learning_rate": 0.0001, + "loss": 7.4596, + "loss/crossentropy": 2.0900899082422257, + "loss/hidden": 3.360546875, + "loss/jsd": 0.0, + "loss/logits": 0.17933723451569678, + "step": 4710 + }, + { + "epoch": 0.118, + "grad_norm": 29.625, + "grad_norm_var": 58.10729166666667, + "learning_rate": 0.0001, + "loss": 7.3979, + "loss/crossentropy": 2.094898019731045, + "loss/hidden": 3.46875, + "loss/jsd": 0.0, + "loss/logits": 0.20720194689929486, + "step": 4720 + }, + { + "epoch": 0.11825, + "grad_norm": 30.25, + "grad_norm_var": 1.98515625, + "learning_rate": 0.0001, + "loss": 7.4519, + "loss/crossentropy": 2.083225329220295, + "loss/hidden": 3.426171875, + "loss/jsd": 0.0, + "loss/logits": 0.20777787994593383, + "step": 4730 + }, + { + "epoch": 0.1185, + "grad_norm": 30.375, + "grad_norm_var": 4.818684895833333, + "learning_rate": 0.0001, + "loss": 7.4795, + "loss/crossentropy": 2.1974314540624618, + "loss/hidden": 3.38046875, + "loss/jsd": 0.0, + "loss/logits": 0.19978385213762523, + "step": 4740 + }, + { + "epoch": 0.11875, + "grad_norm": 32.5, + "grad_norm_var": 3.439322916666667, + "learning_rate": 0.0001, + "loss": 7.3843, + "loss/crossentropy": 1.9562335655093193, + "loss/hidden": 3.39140625, + "loss/jsd": 0.0, + "loss/logits": 0.18924889974296094, + "step": 4750 + }, + { + "epoch": 0.119, + "grad_norm": 30.625, + "grad_norm_var": 1.3015402743274143e+18, + "learning_rate": 0.0001, + "loss": 7.5729, + "loss/crossentropy": 2.0693807609379293, + "loss/hidden": 3.339453125, + "loss/jsd": 0.0, + "loss/logits": 0.18801879994571208, + "step": 4760 + }, + { + "epoch": 0.11925, + "grad_norm": 35.25, + "grad_norm_var": 258.8791015625, + "learning_rate": 0.0001, + "loss": 7.3013, + "loss/crossentropy": 2.0631250627338886, + "loss/hidden": 3.3703125, + "loss/jsd": 0.0, + "loss/logits": 0.18974527437239885, + "step": 4770 + }, + { + "epoch": 0.1195, + "grad_norm": 28.625, + "grad_norm_var": 301.52233072916664, + "learning_rate": 0.0001, + "loss": 7.4639, + "loss/crossentropy": 2.1473939388990404, + "loss/hidden": 3.3984375, + "loss/jsd": 0.0, + "loss/logits": 0.19722200892865657, + "step": 4780 + }, + { + "epoch": 0.11975, + "grad_norm": 31.125, + "grad_norm_var": 25.472330729166668, + "learning_rate": 0.0001, + "loss": 7.3161, + "loss/crossentropy": 2.1767601929605007, + "loss/hidden": 3.380078125, + "loss/jsd": 0.0, + "loss/logits": 0.20041130091995002, + "step": 4790 + }, + { + "epoch": 0.12, + "grad_norm": 29.5, + "grad_norm_var": 2.8580729166666665, + "learning_rate": 0.0001, + "loss": 7.3077, + "loss/crossentropy": 2.0214909121394156, + "loss/hidden": 3.38828125, + "loss/jsd": 0.0, + "loss/logits": 0.19553480856120586, + "step": 4800 + }, + { + "epoch": 0.12025, + "grad_norm": 34.25, + "grad_norm_var": 2.3666015625, + "learning_rate": 0.0001, + "loss": 7.4537, + "loss/crossentropy": 2.092876334488392, + "loss/hidden": 3.276171875, + "loss/jsd": 0.0, + "loss/logits": 0.19079044535756112, + "step": 4810 + }, + { + "epoch": 0.1205, + "grad_norm": 28.75, + "grad_norm_var": 2.1494140625, + "learning_rate": 0.0001, + "loss": 7.3579, + "loss/crossentropy": 2.159788618981838, + "loss/hidden": 3.447265625, + "loss/jsd": 0.0, + "loss/logits": 0.20938555523753166, + "step": 4820 + }, + { + "epoch": 0.12075, + "grad_norm": 31.625, + "grad_norm_var": 1.2635411529466906e+18, + "learning_rate": 0.0001, + "loss": 7.3822, + "loss/crossentropy": 2.221826246380806, + "loss/hidden": 3.3140625, + "loss/jsd": 0.0, + "loss/logits": 0.18899439387023448, + "step": 4830 + }, + { + "epoch": 0.121, + "grad_norm": 29.375, + "grad_norm_var": 7.171875, + "learning_rate": 0.0001, + "loss": 7.3649, + "loss/crossentropy": 2.2076950490474703, + "loss/hidden": 3.321875, + "loss/jsd": 0.0, + "loss/logits": 0.1911212421953678, + "step": 4840 + }, + { + "epoch": 0.12125, + "grad_norm": 28.875, + "grad_norm_var": 5.397916666666666, + "learning_rate": 0.0001, + "loss": 7.2934, + "loss/crossentropy": 2.1398009806871414, + "loss/hidden": 3.276953125, + "loss/jsd": 0.0, + "loss/logits": 0.18104367554187775, + "step": 4850 + }, + { + "epoch": 0.1215, + "grad_norm": 33.25, + "grad_norm_var": 2.292122395833333, + "learning_rate": 0.0001, + "loss": 7.3944, + "loss/crossentropy": 2.0568679124116898, + "loss/hidden": 3.31953125, + "loss/jsd": 0.0, + "loss/logits": 0.19066975675523282, + "step": 4860 + }, + { + "epoch": 0.12175, + "grad_norm": 31.75, + "grad_norm_var": 1.5145182291666666, + "learning_rate": 0.0001, + "loss": 7.5365, + "loss/crossentropy": 2.2600763499736787, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.20988074019551278, + "step": 4870 + }, + { + "epoch": 0.122, + "grad_norm": 30.125, + "grad_norm_var": 0.8442057291666667, + "learning_rate": 0.0001, + "loss": 7.4425, + "loss/crossentropy": 2.087808459997177, + "loss/hidden": 3.397265625, + "loss/jsd": 0.0, + "loss/logits": 0.20126468148082494, + "step": 4880 + }, + { + "epoch": 0.12225, + "grad_norm": 29.25, + "grad_norm_var": 1.9455729166666667, + "learning_rate": 0.0001, + "loss": 7.4649, + "loss/crossentropy": 2.089573635160923, + "loss/hidden": 3.3890625, + "loss/jsd": 0.0, + "loss/logits": 0.18984669484198094, + "step": 4890 + }, + { + "epoch": 0.1225, + "grad_norm": 29.125, + "grad_norm_var": 2.7552083333333335, + "learning_rate": 0.0001, + "loss": 7.4894, + "loss/crossentropy": 2.1424145482480528, + "loss/hidden": 3.47890625, + "loss/jsd": 0.0, + "loss/logits": 0.20886036530137062, + "step": 4900 + }, + { + "epoch": 0.12275, + "grad_norm": 31.0, + "grad_norm_var": 4.751497395833334, + "learning_rate": 0.0001, + "loss": 7.5033, + "loss/crossentropy": 2.104494086652994, + "loss/hidden": 3.41875, + "loss/jsd": 0.0, + "loss/logits": 0.1945918256416917, + "step": 4910 + }, + { + "epoch": 0.123, + "grad_norm": 28.125, + "grad_norm_var": 5.330989583333333, + "learning_rate": 0.0001, + "loss": 7.4954, + "loss/crossentropy": 2.0843611776828768, + "loss/hidden": 3.358203125, + "loss/jsd": 0.0, + "loss/logits": 0.1925347488373518, + "step": 4920 + }, + { + "epoch": 0.12325, + "grad_norm": 28.625, + "grad_norm_var": 3.8166015625, + "learning_rate": 0.0001, + "loss": 7.4404, + "loss/crossentropy": 2.205425333976746, + "loss/hidden": 3.3359375, + "loss/jsd": 0.0, + "loss/logits": 0.18580489940941333, + "step": 4930 + }, + { + "epoch": 0.1235, + "grad_norm": 29.375, + "grad_norm_var": 14.980208333333334, + "learning_rate": 0.0001, + "loss": 7.3481, + "loss/crossentropy": 1.9896500617265702, + "loss/hidden": 3.39609375, + "loss/jsd": 0.0, + "loss/logits": 0.1904701752588153, + "step": 4940 + }, + { + "epoch": 0.12375, + "grad_norm": 32.75, + "grad_norm_var": 19.178580729166665, + "learning_rate": 0.0001, + "loss": 7.5252, + "loss/crossentropy": 2.1207278318703175, + "loss/hidden": 3.484375, + "loss/jsd": 0.0, + "loss/logits": 0.19760717861354352, + "step": 4950 + }, + { + "epoch": 0.124, + "grad_norm": 32.5, + "grad_norm_var": 17.264583333333334, + "learning_rate": 0.0001, + "loss": 7.2678, + "loss/crossentropy": 1.9271991185843944, + "loss/hidden": 3.419921875, + "loss/jsd": 0.0, + "loss/logits": 0.19860625620931388, + "step": 4960 + }, + { + "epoch": 0.12425, + "grad_norm": 28.625, + "grad_norm_var": 11.196809895833333, + "learning_rate": 0.0001, + "loss": 7.3703, + "loss/crossentropy": 2.0659097760915754, + "loss/hidden": 3.287109375, + "loss/jsd": 0.0, + "loss/logits": 0.18224728610366583, + "step": 4970 + }, + { + "epoch": 0.1245, + "grad_norm": 37.75, + "grad_norm_var": 10.03515625, + "learning_rate": 0.0001, + "loss": 7.5041, + "loss/crossentropy": 1.9809176340699195, + "loss/hidden": 3.41796875, + "loss/jsd": 0.0, + "loss/logits": 0.19965030066668987, + "step": 4980 + }, + { + "epoch": 0.12475, + "grad_norm": 27.125, + "grad_norm_var": 11.567708333333334, + "learning_rate": 0.0001, + "loss": 7.327, + "loss/crossentropy": 2.0197409205138683, + "loss/hidden": 3.368359375, + "loss/jsd": 0.0, + "loss/logits": 0.18525638189166785, + "step": 4990 + }, + { + "epoch": 0.125, + "grad_norm": 34.75, + "grad_norm_var": 8.558268229166666, + "learning_rate": 0.0001, + "loss": 7.393, + "loss/crossentropy": 2.100055608153343, + "loss/hidden": 3.391015625, + "loss/jsd": 0.0, + "loss/logits": 0.19607669236138464, + "step": 5000 + } + ], + "logging_steps": 10, + "max_steps": 40000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4287550160044032e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}