diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3705 @@ +{ + "best_metric": 0.04752533510327339, + "best_model_checkpoint": "results/checkpoint-35000", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 36070, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02772387025228722, + "grad_norm": 1.750556230545044, + "learning_rate": 9.999814117181637e-06, + "loss": 6.049, + "step": 100 + }, + { + "epoch": 0.05544774050457444, + "grad_norm": 0.824866533279419, + "learning_rate": 9.999248953493363e-06, + "loss": 3.0817, + "step": 200 + }, + { + "epoch": 0.08317161075686166, + "grad_norm": 0.4907461702823639, + "learning_rate": 9.998304532844263e-06, + "loss": 2.3969, + "step": 300 + }, + { + "epoch": 0.11089548100914888, + "grad_norm": 0.4534800946712494, + "learning_rate": 9.996980926880713e-06, + "loss": 2.0935, + "step": 400 + }, + { + "epoch": 0.1386193512614361, + "grad_norm": 0.47491493821144104, + "learning_rate": 9.995278236015153e-06, + "loss": 1.9245, + "step": 500 + }, + { + "epoch": 0.1386193512614361, + "eval_valid_loss": 1.7945984601974487, + "eval_valid_runtime": 6.4498, + "eval_valid_samples_per_second": 214.426, + "eval_valid_steps_per_second": 6.822, + "step": 500 + }, + { + "epoch": 0.1386193512614361, + "eval_valid_target_loss": 1.875697374343872, + "eval_valid_target_runtime": 6.5527, + "eval_valid_target_samples_per_second": 218.841, + "eval_valid_target_steps_per_second": 6.867, + "step": 500 + }, + { + "epoch": 0.16634322151372333, + "grad_norm": 0.5983259677886963, + "learning_rate": 9.99319658941846e-06, + "loss": 1.8294, + "step": 600 + }, + { + "epoch": 0.19406709176601053, + "grad_norm": 0.6906803846359253, + "learning_rate": 9.990736145010146e-06, + "loss": 1.7625, + "step": 700 + }, + { + "epoch": 0.22179096201829776, + "grad_norm": 1.4024661779403687, + "learning_rate": 9.987897089446381e-06, + "loss": 1.709, + "step": 800 + }, + { + "epoch": 0.24951483227058496, + "grad_norm": 1.073205590248108, + "learning_rate": 9.984679638105837e-06, + "loss": 1.6595, + "step": 900 + }, + { + "epoch": 0.2772387025228722, + "grad_norm": 1.280462384223938, + "learning_rate": 9.981084035073337e-06, + "loss": 1.6153, + "step": 1000 + }, + { + "epoch": 0.2772387025228722, + "eval_valid_loss": 1.5186923742294312, + "eval_valid_runtime": 6.4198, + "eval_valid_samples_per_second": 215.427, + "eval_valid_steps_per_second": 6.854, + "step": 1000 + }, + { + "epoch": 0.2772387025228722, + "eval_valid_target_loss": 1.5994268655776978, + "eval_valid_target_runtime": 6.5778, + "eval_valid_target_samples_per_second": 218.006, + "eval_valid_target_steps_per_second": 6.841, + "step": 1000 + }, + { + "epoch": 0.3049625727751594, + "grad_norm": 0.9407665133476257, + "learning_rate": 9.977110553121353e-06, + "loss": 1.567, + "step": 1100 + }, + { + "epoch": 0.33268644302744665, + "grad_norm": 1.5439337491989136, + "learning_rate": 9.972759493689301e-06, + "loss": 1.5275, + "step": 1200 + }, + { + "epoch": 0.36041031327973383, + "grad_norm": 2.2176036834716797, + "learning_rate": 9.968031186860677e-06, + "loss": 1.4833, + "step": 1300 + }, + { + "epoch": 0.38813418353202106, + "grad_norm": 1.6237233877182007, + "learning_rate": 9.962925991338018e-06, + "loss": 1.4457, + "step": 1400 + }, + { + "epoch": 0.4158580537843083, + "grad_norm": 1.3075989484786987, + "learning_rate": 9.957444294415685e-06, + "loss": 1.407, + "step": 1500 + }, + { + "epoch": 0.4158580537843083, + "eval_valid_loss": 1.326136589050293, + "eval_valid_runtime": 6.413, + "eval_valid_samples_per_second": 215.655, + "eval_valid_steps_per_second": 6.861, + "step": 1500 + }, + { + "epoch": 0.4158580537843083, + "eval_valid_target_loss": 1.3982958793640137, + "eval_valid_target_runtime": 6.5728, + "eval_valid_target_samples_per_second": 218.172, + "eval_valid_target_steps_per_second": 6.846, + "step": 1500 + }, + { + "epoch": 0.4435819240365955, + "grad_norm": 1.379807472229004, + "learning_rate": 9.951586511950491e-06, + "loss": 1.3768, + "step": 1600 + }, + { + "epoch": 0.47130579428888275, + "grad_norm": 0.737086832523346, + "learning_rate": 9.945353088330137e-06, + "loss": 1.347, + "step": 1700 + }, + { + "epoch": 0.4990296645411699, + "grad_norm": 0.6332296133041382, + "learning_rate": 9.93874449643952e-06, + "loss": 1.3188, + "step": 1800 + }, + { + "epoch": 0.5267535347934572, + "grad_norm": 0.6948099732398987, + "learning_rate": 9.931761237624833e-06, + "loss": 1.2903, + "step": 1900 + }, + { + "epoch": 0.5544774050457444, + "grad_norm": 0.9397527575492859, + "learning_rate": 9.924403841655565e-06, + "loss": 1.2671, + "step": 2000 + }, + { + "epoch": 0.5544774050457444, + "eval_valid_loss": 1.2014020681381226, + "eval_valid_runtime": 6.4367, + "eval_valid_samples_per_second": 214.861, + "eval_valid_steps_per_second": 6.836, + "step": 2000 + }, + { + "epoch": 0.5544774050457444, + "eval_valid_target_loss": 1.2820453643798828, + "eval_valid_target_runtime": 6.5614, + "eval_valid_target_samples_per_second": 218.55, + "eval_valid_target_steps_per_second": 6.858, + "step": 2000 + }, + { + "epoch": 0.5822012752980316, + "grad_norm": 0.5302172303199768, + "learning_rate": 9.916672866684275e-06, + "loss": 1.2439, + "step": 2100 + }, + { + "epoch": 0.6099251455503188, + "grad_norm": 0.5439279675483704, + "learning_rate": 9.908568899204281e-06, + "loss": 1.2231, + "step": 2200 + }, + { + "epoch": 0.637649015802606, + "grad_norm": 0.7026234865188599, + "learning_rate": 9.90009255400514e-06, + "loss": 1.2027, + "step": 2300 + }, + { + "epoch": 0.6653728860548933, + "grad_norm": 0.642803430557251, + "learning_rate": 9.89124447412603e-06, + "loss": 1.1864, + "step": 2400 + }, + { + "epoch": 0.6930967563071805, + "grad_norm": 1.3601601123809814, + "learning_rate": 9.882025330806952e-06, + "loss": 1.1654, + "step": 2500 + }, + { + "epoch": 0.6930967563071805, + "eval_valid_loss": 1.1063387393951416, + "eval_valid_runtime": 6.4314, + "eval_valid_samples_per_second": 215.037, + "eval_valid_steps_per_second": 6.841, + "step": 2500 + }, + { + "epoch": 0.6930967563071805, + "eval_valid_target_loss": 1.208246111869812, + "eval_valid_target_runtime": 6.5564, + "eval_valid_target_samples_per_second": 218.719, + "eval_valid_target_steps_per_second": 6.864, + "step": 2500 + }, + { + "epoch": 0.7208206265594677, + "grad_norm": 0.7053922414779663, + "learning_rate": 9.872435823437816e-06, + "loss": 1.1433, + "step": 2600 + }, + { + "epoch": 0.748544496811755, + "grad_norm": 0.6601741909980774, + "learning_rate": 9.862476679505384e-06, + "loss": 1.1193, + "step": 2700 + }, + { + "epoch": 0.7762683670640421, + "grad_norm": 0.7706498503684998, + "learning_rate": 9.852148654538072e-06, + "loss": 1.0954, + "step": 2800 + }, + { + "epoch": 0.8039922373163294, + "grad_norm": 0.8355486392974854, + "learning_rate": 9.841452532048648e-06, + "loss": 1.069, + "step": 2900 + }, + { + "epoch": 0.8317161075686166, + "grad_norm": 0.8369494676589966, + "learning_rate": 9.830389123474773e-06, + "loss": 1.0384, + "step": 3000 + }, + { + "epoch": 0.8317161075686166, + "eval_valid_loss": 0.9615023732185364, + "eval_valid_runtime": 6.4156, + "eval_valid_samples_per_second": 215.57, + "eval_valid_steps_per_second": 6.858, + "step": 3000 + }, + { + "epoch": 0.8317161075686166, + "eval_valid_target_loss": 1.0947415828704834, + "eval_valid_target_runtime": 6.5753, + "eval_valid_target_samples_per_second": 218.088, + "eval_valid_target_steps_per_second": 6.844, + "step": 3000 + }, + { + "epoch": 0.8594399778209038, + "grad_norm": 1.4864110946655273, + "learning_rate": 9.818959268117464e-06, + "loss": 1.0103, + "step": 3100 + }, + { + "epoch": 0.887163848073191, + "grad_norm": 0.7728907465934753, + "learning_rate": 9.807163833077407e-06, + "loss": 0.982, + "step": 3200 + }, + { + "epoch": 0.9148877183254782, + "grad_norm": 0.6881595253944397, + "learning_rate": 9.795003713189187e-06, + "loss": 0.9492, + "step": 3300 + }, + { + "epoch": 0.9426115885777655, + "grad_norm": 1.0222816467285156, + "learning_rate": 9.782479830953388e-06, + "loss": 0.9142, + "step": 3400 + }, + { + "epoch": 0.9703354588300527, + "grad_norm": 0.6671555042266846, + "learning_rate": 9.769593136466633e-06, + "loss": 0.8838, + "step": 3500 + }, + { + "epoch": 0.9703354588300527, + "eval_valid_loss": 0.8037808537483215, + "eval_valid_runtime": 6.4314, + "eval_valid_samples_per_second": 215.038, + "eval_valid_steps_per_second": 6.841, + "step": 3500 + }, + { + "epoch": 0.9703354588300527, + "eval_valid_target_loss": 0.9639121294021606, + "eval_valid_target_runtime": 6.6053, + "eval_valid_target_samples_per_second": 217.1, + "eval_valid_target_steps_per_second": 6.813, + "step": 3500 + }, + { + "epoch": 0.9980593290823399, + "grad_norm": 0.7793981432914734, + "learning_rate": 9.756344607349483e-06, + "loss": 0.8496, + "step": 3600 + }, + { + "epoch": 1.0257831993346271, + "grad_norm": 0.7545821070671082, + "learning_rate": 9.74273524867229e-06, + "loss": 0.8117, + "step": 3700 + }, + { + "epoch": 1.0535070695869144, + "grad_norm": 0.631118893623352, + "learning_rate": 9.728766092878934e-06, + "loss": 0.7749, + "step": 3800 + }, + { + "epoch": 1.0812309398392015, + "grad_norm": 0.7934292554855347, + "learning_rate": 9.714438199708516e-06, + "loss": 0.7321, + "step": 3900 + }, + { + "epoch": 1.1089548100914888, + "grad_norm": 0.6160613298416138, + "learning_rate": 9.699752656114947e-06, + "loss": 0.6891, + "step": 4000 + }, + { + "epoch": 1.1089548100914888, + "eval_valid_loss": 0.5853330492973328, + "eval_valid_runtime": 6.4069, + "eval_valid_samples_per_second": 215.861, + "eval_valid_steps_per_second": 6.868, + "step": 4000 + }, + { + "epoch": 1.1089548100914888, + "eval_valid_target_loss": 0.7543638944625854, + "eval_valid_target_runtime": 6.5591, + "eval_valid_target_samples_per_second": 218.627, + "eval_valid_target_steps_per_second": 6.861, + "step": 4000 + }, + { + "epoch": 1.136678680343776, + "grad_norm": 0.4765689969062805, + "learning_rate": 9.684710576184504e-06, + "loss": 0.6383, + "step": 4100 + }, + { + "epoch": 1.1644025505960631, + "grad_norm": 0.7610909938812256, + "learning_rate": 9.669313101051295e-06, + "loss": 0.5894, + "step": 4200 + }, + { + "epoch": 1.1921264208483504, + "grad_norm": 0.5010733008384705, + "learning_rate": 9.653561398810706e-06, + "loss": 0.5446, + "step": 4300 + }, + { + "epoch": 1.2198502911006377, + "grad_norm": 0.6305666565895081, + "learning_rate": 9.637456664430776e-06, + "loss": 0.5097, + "step": 4400 + }, + { + "epoch": 1.247574161352925, + "grad_norm": 0.8064519762992859, + "learning_rate": 9.621000119661545e-06, + "loss": 0.4678, + "step": 4500 + }, + { + "epoch": 1.247574161352925, + "eval_valid_loss": 0.38276800513267517, + "eval_valid_runtime": 6.4349, + "eval_valid_samples_per_second": 214.922, + "eval_valid_steps_per_second": 6.838, + "step": 4500 + }, + { + "epoch": 1.247574161352925, + "eval_valid_target_loss": 0.4976137578487396, + "eval_valid_target_runtime": 6.5738, + "eval_valid_target_samples_per_second": 218.139, + "eval_valid_target_steps_per_second": 6.845, + "step": 4500 + }, + { + "epoch": 1.275298031605212, + "grad_norm": 0.49154090881347656, + "learning_rate": 9.604193012942375e-06, + "loss": 0.4326, + "step": 4600 + }, + { + "epoch": 1.3030219018574993, + "grad_norm": 0.5592367053031921, + "learning_rate": 9.587036619307226e-06, + "loss": 0.4054, + "step": 4700 + }, + { + "epoch": 1.3307457721097866, + "grad_norm": 0.48195400834083557, + "learning_rate": 9.569532240287946e-06, + "loss": 0.3828, + "step": 4800 + }, + { + "epoch": 1.3584696423620737, + "grad_norm": 0.5364578366279602, + "learning_rate": 9.551681203815517e-06, + "loss": 0.3595, + "step": 4900 + }, + { + "epoch": 1.386193512614361, + "grad_norm": 0.5409713387489319, + "learning_rate": 9.533484864119327e-06, + "loss": 0.3405, + "step": 5000 + }, + { + "epoch": 1.386193512614361, + "eval_valid_loss": 0.2857649326324463, + "eval_valid_runtime": 6.4118, + "eval_valid_samples_per_second": 215.697, + "eval_valid_steps_per_second": 6.862, + "step": 5000 + }, + { + "epoch": 1.386193512614361, + "eval_valid_target_loss": 0.33146464824676514, + "eval_valid_target_runtime": 6.5717, + "eval_valid_target_samples_per_second": 218.209, + "eval_valid_target_steps_per_second": 6.848, + "step": 5000 + }, + { + "epoch": 1.4139173828666483, + "grad_norm": 0.7294422388076782, + "learning_rate": 9.514944601624427e-06, + "loss": 0.328, + "step": 5100 + }, + { + "epoch": 1.4416412531189353, + "grad_norm": 0.4695785343647003, + "learning_rate": 9.49606182284681e-06, + "loss": 0.3095, + "step": 5200 + }, + { + "epoch": 1.4693651233712226, + "grad_norm": 0.5484552979469299, + "learning_rate": 9.476837960286707e-06, + "loss": 0.3016, + "step": 5300 + }, + { + "epoch": 1.49708899362351, + "grad_norm": 0.38614729046821594, + "learning_rate": 9.457274472319919e-06, + "loss": 0.2875, + "step": 5400 + }, + { + "epoch": 1.524812863875797, + "grad_norm": 0.3303731381893158, + "learning_rate": 9.437372843087175e-06, + "loss": 0.2821, + "step": 5500 + }, + { + "epoch": 1.524812863875797, + "eval_valid_loss": 0.23669035732746124, + "eval_valid_runtime": 6.4303, + "eval_valid_samples_per_second": 215.074, + "eval_valid_steps_per_second": 6.843, + "step": 5500 + }, + { + "epoch": 1.524812863875797, + "eval_valid_target_loss": 0.2617432773113251, + "eval_valid_target_runtime": 6.5556, + "eval_valid_target_samples_per_second": 218.744, + "eval_valid_target_steps_per_second": 6.864, + "step": 5500 + }, + { + "epoch": 1.5525367341280842, + "grad_norm": 0.5144414305686951, + "learning_rate": 9.417134582381548e-06, + "loss": 0.2696, + "step": 5600 + }, + { + "epoch": 1.5802606043803715, + "grad_norm": 0.5522892475128174, + "learning_rate": 9.396561225533902e-06, + "loss": 0.2617, + "step": 5700 + }, + { + "epoch": 1.6079844746326586, + "grad_norm": 0.4152807295322418, + "learning_rate": 9.37565433329644e-06, + "loss": 0.2522, + "step": 5800 + }, + { + "epoch": 1.635708344884946, + "grad_norm": 0.3866608142852783, + "learning_rate": 9.35441549172428e-06, + "loss": 0.2469, + "step": 5900 + }, + { + "epoch": 1.6634322151372332, + "grad_norm": 0.3131564259529114, + "learning_rate": 9.33284631205515e-06, + "loss": 0.2425, + "step": 6000 + }, + { + "epoch": 1.6634322151372332, + "eval_valid_loss": 0.20471729338169098, + "eval_valid_runtime": 6.4284, + "eval_valid_samples_per_second": 215.138, + "eval_valid_steps_per_second": 6.845, + "step": 6000 + }, + { + "epoch": 1.6634322151372332, + "eval_valid_target_loss": 0.2232024222612381, + "eval_valid_target_runtime": 6.5873, + "eval_valid_target_samples_per_second": 217.69, + "eval_valid_target_steps_per_second": 6.831, + "step": 6000 + }, + { + "epoch": 1.6911560853895202, + "grad_norm": 0.4385012090206146, + "learning_rate": 9.31094843058714e-06, + "loss": 0.2346, + "step": 6100 + }, + { + "epoch": 1.7188799556418077, + "grad_norm": 0.3904290497303009, + "learning_rate": 9.28872350855458e-06, + "loss": 0.2279, + "step": 6200 + }, + { + "epoch": 1.7466038258940948, + "grad_norm": 0.4294661581516266, + "learning_rate": 9.266173232002005e-06, + "loss": 0.2218, + "step": 6300 + }, + { + "epoch": 1.774327696146382, + "grad_norm": 0.40256062150001526, + "learning_rate": 9.243299311656253e-06, + "loss": 0.2189, + "step": 6400 + }, + { + "epoch": 1.8020515663986694, + "grad_norm": 0.39798569679260254, + "learning_rate": 9.220103482796683e-06, + "loss": 0.2154, + "step": 6500 + }, + { + "epoch": 1.8020515663986694, + "eval_valid_loss": 0.18116505444049835, + "eval_valid_runtime": 6.4306, + "eval_valid_samples_per_second": 215.065, + "eval_valid_steps_per_second": 6.842, + "step": 6500 + }, + { + "epoch": 1.8020515663986694, + "eval_valid_target_loss": 0.19611063599586487, + "eval_valid_target_runtime": 6.5521, + "eval_valid_target_samples_per_second": 218.86, + "eval_valid_target_steps_per_second": 6.868, + "step": 6500 + }, + { + "epoch": 1.8297754366509564, + "grad_norm": 0.2555886507034302, + "learning_rate": 9.196587505123526e-06, + "loss": 0.2082, + "step": 6600 + }, + { + "epoch": 1.8574993069032437, + "grad_norm": 0.278145968914032, + "learning_rate": 9.172753162624401e-06, + "loss": 0.2025, + "step": 6700 + }, + { + "epoch": 1.885223177155531, + "grad_norm": 0.43592485785484314, + "learning_rate": 9.148602263438967e-06, + "loss": 0.2006, + "step": 6800 + }, + { + "epoch": 1.912947047407818, + "grad_norm": 0.3828723430633545, + "learning_rate": 9.124136639721757e-06, + "loss": 0.1963, + "step": 6900 + }, + { + "epoch": 1.9406709176601054, + "grad_norm": 0.3468044102191925, + "learning_rate": 9.09935814750318e-06, + "loss": 0.1928, + "step": 7000 + }, + { + "epoch": 1.9406709176601054, + "eval_valid_loss": 0.16255635023117065, + "eval_valid_runtime": 6.4262, + "eval_valid_samples_per_second": 215.213, + "eval_valid_steps_per_second": 6.847, + "step": 7000 + }, + { + "epoch": 1.9406709176601054, + "eval_valid_target_loss": 0.17588204145431519, + "eval_valid_target_runtime": 6.5759, + "eval_valid_target_samples_per_second": 218.07, + "eval_valid_target_steps_per_second": 6.843, + "step": 7000 + }, + { + "epoch": 1.9683947879123926, + "grad_norm": 0.28793609142303467, + "learning_rate": 9.074268666548728e-06, + "loss": 0.1868, + "step": 7100 + }, + { + "epoch": 1.9961186581646797, + "grad_norm": 0.4627343714237213, + "learning_rate": 9.04887010021636e-06, + "loss": 0.1857, + "step": 7200 + }, + { + "epoch": 2.023842528416967, + "grad_norm": 0.4490989148616791, + "learning_rate": 9.023164375312117e-06, + "loss": 0.1786, + "step": 7300 + }, + { + "epoch": 2.0515663986692543, + "grad_norm": 0.319859117269516, + "learning_rate": 8.997153441943944e-06, + "loss": 0.1779, + "step": 7400 + }, + { + "epoch": 2.0792902689215413, + "grad_norm": 0.3379845917224884, + "learning_rate": 8.970839273373748e-06, + "loss": 0.1717, + "step": 7500 + }, + { + "epoch": 2.0792902689215413, + "eval_valid_loss": 0.1455078125, + "eval_valid_runtime": 6.4396, + "eval_valid_samples_per_second": 214.766, + "eval_valid_steps_per_second": 6.833, + "step": 7500 + }, + { + "epoch": 2.0792902689215413, + "eval_valid_target_loss": 0.15758885443210602, + "eval_valid_target_runtime": 6.5627, + "eval_valid_target_samples_per_second": 218.508, + "eval_valid_target_steps_per_second": 6.857, + "step": 7500 + }, + { + "epoch": 2.107014139173829, + "grad_norm": 0.3079555928707123, + "learning_rate": 8.944223865867712e-06, + "loss": 0.1688, + "step": 7600 + }, + { + "epoch": 2.134738009426116, + "grad_norm": 0.346603125333786, + "learning_rate": 8.917309238544834e-06, + "loss": 0.1661, + "step": 7700 + }, + { + "epoch": 2.162461879678403, + "grad_norm": 0.3899448812007904, + "learning_rate": 8.890097433223766e-06, + "loss": 0.1653, + "step": 7800 + }, + { + "epoch": 2.1901857499306905, + "grad_norm": 0.31352731585502625, + "learning_rate": 8.862590514267915e-06, + "loss": 0.1609, + "step": 7900 + }, + { + "epoch": 2.2179096201829775, + "grad_norm": 0.29558128118515015, + "learning_rate": 8.834790568428827e-06, + "loss": 0.158, + "step": 8000 + }, + { + "epoch": 2.2179096201829775, + "eval_valid_loss": 0.1319538652896881, + "eval_valid_runtime": 6.417, + "eval_valid_samples_per_second": 215.521, + "eval_valid_steps_per_second": 6.857, + "step": 8000 + }, + { + "epoch": 2.2179096201829775, + "eval_valid_target_loss": 0.1427442878484726, + "eval_valid_target_runtime": 6.5854, + "eval_valid_target_samples_per_second": 217.754, + "eval_valid_target_steps_per_second": 6.833, + "step": 8000 + }, + { + "epoch": 2.2456334904352646, + "grad_norm": 0.29061177372932434, + "learning_rate": 8.80669970468788e-06, + "loss": 0.1545, + "step": 8100 + }, + { + "epoch": 2.273357360687552, + "grad_norm": 0.3253875970840454, + "learning_rate": 8.778320054096306e-06, + "loss": 0.1528, + "step": 8200 + }, + { + "epoch": 2.301081230939839, + "grad_norm": 0.2402360886335373, + "learning_rate": 8.749653769613502e-06, + "loss": 0.1511, + "step": 8300 + }, + { + "epoch": 2.3288051011921262, + "grad_norm": 0.31634458899497986, + "learning_rate": 8.720703025943717e-06, + "loss": 0.1461, + "step": 8400 + }, + { + "epoch": 2.3565289714444138, + "grad_norm": 0.21685920655727386, + "learning_rate": 8.691470019371065e-06, + "loss": 0.143, + "step": 8500 + }, + { + "epoch": 2.3565289714444138, + "eval_valid_loss": 0.12121625989675522, + "eval_valid_runtime": 6.4171, + "eval_valid_samples_per_second": 215.519, + "eval_valid_steps_per_second": 6.857, + "step": 8500 + }, + { + "epoch": 2.3565289714444138, + "eval_valid_target_loss": 0.1312141716480255, + "eval_valid_target_runtime": 6.57, + "eval_valid_target_samples_per_second": 218.266, + "eval_valid_target_steps_per_second": 6.849, + "step": 8500 + }, + { + "epoch": 2.384252841696701, + "grad_norm": 0.24635937809944153, + "learning_rate": 8.661956967592907e-06, + "loss": 0.1424, + "step": 8600 + }, + { + "epoch": 2.411976711948988, + "grad_norm": 0.21958141028881073, + "learning_rate": 8.632166109551623e-06, + "loss": 0.1388, + "step": 8700 + }, + { + "epoch": 2.4397005822012754, + "grad_norm": 0.2693657875061035, + "learning_rate": 8.60209970526474e-06, + "loss": 0.1392, + "step": 8800 + }, + { + "epoch": 2.4674244524535625, + "grad_norm": 0.22512082755565643, + "learning_rate": 8.5717600356535e-06, + "loss": 0.1356, + "step": 8900 + }, + { + "epoch": 2.49514832270585, + "grad_norm": 0.3446211516857147, + "learning_rate": 8.541149402369806e-06, + "loss": 0.1324, + "step": 9000 + }, + { + "epoch": 2.49514832270585, + "eval_valid_loss": 0.11042323708534241, + "eval_valid_runtime": 6.4273, + "eval_valid_samples_per_second": 215.176, + "eval_valid_steps_per_second": 6.846, + "step": 9000 + }, + { + "epoch": 2.49514832270585, + "eval_valid_target_loss": 0.11918216943740845, + "eval_valid_target_runtime": 6.5885, + "eval_valid_target_samples_per_second": 217.651, + "eval_valid_target_steps_per_second": 6.83, + "step": 9000 + }, + { + "epoch": 2.522872192958137, + "grad_norm": 0.21913643181324005, + "learning_rate": 8.51027012762163e-06, + "loss": 0.1303, + "step": 9100 + }, + { + "epoch": 2.550596063210424, + "grad_norm": 0.24243904650211334, + "learning_rate": 8.479124553996824e-06, + "loss": 0.1268, + "step": 9200 + }, + { + "epoch": 2.578319933462711, + "grad_norm": 0.22184187173843384, + "learning_rate": 8.447715044285425e-06, + "loss": 0.1251, + "step": 9300 + }, + { + "epoch": 2.6060438037149987, + "grad_norm": 0.22888724505901337, + "learning_rate": 8.41604398130039e-06, + "loss": 0.1221, + "step": 9400 + }, + { + "epoch": 2.6337676739672857, + "grad_norm": 0.24152572453022003, + "learning_rate": 8.384113767696838e-06, + "loss": 0.121, + "step": 9500 + }, + { + "epoch": 2.6337676739672857, + "eval_valid_loss": 0.10074004530906677, + "eval_valid_runtime": 6.4317, + "eval_valid_samples_per_second": 215.03, + "eval_valid_steps_per_second": 6.841, + "step": 9500 + }, + { + "epoch": 2.6337676739672857, + "eval_valid_target_loss": 0.10891123861074448, + "eval_valid_target_runtime": 6.5593, + "eval_valid_target_samples_per_second": 218.622, + "eval_valid_target_steps_per_second": 6.861, + "step": 9500 + }, + { + "epoch": 2.6614915442195732, + "grad_norm": 0.2756216526031494, + "learning_rate": 8.35192682578978e-06, + "loss": 0.1195, + "step": 9600 + }, + { + "epoch": 2.6892154144718603, + "grad_norm": 0.24438254535198212, + "learning_rate": 8.319485597370348e-06, + "loss": 0.1157, + "step": 9700 + }, + { + "epoch": 2.7169392847241474, + "grad_norm": 0.35991132259368896, + "learning_rate": 8.286792543520556e-06, + "loss": 0.115, + "step": 9800 + }, + { + "epoch": 2.744663154976435, + "grad_norm": 0.22763152420520782, + "learning_rate": 8.253850144426606e-06, + "loss": 0.1134, + "step": 9900 + }, + { + "epoch": 2.772387025228722, + "grad_norm": 0.24357567727565765, + "learning_rate": 8.220660899190712e-06, + "loss": 0.1106, + "step": 10000 + }, + { + "epoch": 2.772387025228722, + "eval_valid_loss": 0.092686228454113, + "eval_valid_runtime": 6.4287, + "eval_valid_samples_per_second": 215.129, + "eval_valid_steps_per_second": 6.844, + "step": 10000 + }, + { + "epoch": 2.772387025228722, + "eval_valid_target_loss": 0.1005280539393425, + "eval_valid_target_runtime": 6.5902, + "eval_valid_target_samples_per_second": 217.596, + "eval_valid_target_steps_per_second": 6.828, + "step": 10000 + }, + { + "epoch": 2.800110895481009, + "grad_norm": 0.20446299016475677, + "learning_rate": 8.187227325641534e-06, + "loss": 0.109, + "step": 10100 + }, + { + "epoch": 2.8278347657332965, + "grad_norm": 0.24309873580932617, + "learning_rate": 8.153551960143157e-06, + "loss": 0.1087, + "step": 10200 + }, + { + "epoch": 2.8555586359855836, + "grad_norm": 0.21243679523468018, + "learning_rate": 8.119637357402676e-06, + "loss": 0.1063, + "step": 10300 + }, + { + "epoch": 2.8832825062378706, + "grad_norm": 0.2227753847837448, + "learning_rate": 8.085486090276391e-06, + "loss": 0.1057, + "step": 10400 + }, + { + "epoch": 2.911006376490158, + "grad_norm": 0.1933346837759018, + "learning_rate": 8.05110074957462e-06, + "loss": 0.1037, + "step": 10500 + }, + { + "epoch": 2.911006376490158, + "eval_valid_loss": 0.08755628019571304, + "eval_valid_runtime": 6.4374, + "eval_valid_samples_per_second": 214.84, + "eval_valid_steps_per_second": 6.835, + "step": 10500 + }, + { + "epoch": 2.911006376490158, + "eval_valid_target_loss": 0.09479602426290512, + "eval_valid_target_runtime": 6.5624, + "eval_valid_target_samples_per_second": 218.517, + "eval_valid_target_steps_per_second": 6.857, + "step": 10500 + }, + { + "epoch": 2.938730246742445, + "grad_norm": 0.24507193267345428, + "learning_rate": 8.016483943865158e-06, + "loss": 0.1026, + "step": 10600 + }, + { + "epoch": 2.9664541169947327, + "grad_norm": 0.16903254389762878, + "learning_rate": 7.98163829927538e-06, + "loss": 0.1019, + "step": 10700 + }, + { + "epoch": 2.99417798724702, + "grad_norm": 0.21406187117099762, + "learning_rate": 7.946566459293014e-06, + "loss": 0.1016, + "step": 10800 + }, + { + "epoch": 3.021901857499307, + "grad_norm": 0.17749078571796417, + "learning_rate": 7.911271084565603e-06, + "loss": 0.0988, + "step": 10900 + }, + { + "epoch": 3.049625727751594, + "grad_norm": 0.2052767425775528, + "learning_rate": 7.875754852698658e-06, + "loss": 0.099, + "step": 11000 + }, + { + "epoch": 3.049625727751594, + "eval_valid_loss": 0.08359777182340622, + "eval_valid_runtime": 6.4134, + "eval_valid_samples_per_second": 215.643, + "eval_valid_steps_per_second": 6.861, + "step": 11000 + }, + { + "epoch": 3.049625727751594, + "eval_valid_target_loss": 0.09044167399406433, + "eval_valid_target_runtime": 6.5678, + "eval_valid_target_samples_per_second": 218.336, + "eval_valid_target_steps_per_second": 6.852, + "step": 11000 + }, + { + "epoch": 3.0773495980038814, + "grad_norm": 0.20621031522750854, + "learning_rate": 7.840020458052529e-06, + "loss": 0.0961, + "step": 11100 + }, + { + "epoch": 3.1050734682561685, + "grad_norm": 0.18608888983726501, + "learning_rate": 7.804070611538001e-06, + "loss": 0.0964, + "step": 11200 + }, + { + "epoch": 3.132797338508456, + "grad_norm": 0.14550629258155823, + "learning_rate": 7.767908040410642e-06, + "loss": 0.0957, + "step": 11300 + }, + { + "epoch": 3.160521208760743, + "grad_norm": 0.21664443612098694, + "learning_rate": 7.731535488063895e-06, + "loss": 0.0948, + "step": 11400 + }, + { + "epoch": 3.18824507901303, + "grad_norm": 0.17702756822109222, + "learning_rate": 7.694955713820974e-06, + "loss": 0.0935, + "step": 11500 + }, + { + "epoch": 3.18824507901303, + "eval_valid_loss": 0.07985392957925797, + "eval_valid_runtime": 6.4194, + "eval_valid_samples_per_second": 215.442, + "eval_valid_steps_per_second": 6.854, + "step": 11500 + }, + { + "epoch": 3.18824507901303, + "eval_valid_target_loss": 0.08640262484550476, + "eval_valid_target_runtime": 6.5608, + "eval_valid_target_samples_per_second": 218.572, + "eval_valid_target_steps_per_second": 6.859, + "step": 11500 + }, + { + "epoch": 3.2159689492653176, + "grad_norm": 0.19913919270038605, + "learning_rate": 7.658171492725513e-06, + "loss": 0.0936, + "step": 11600 + }, + { + "epoch": 3.2436928195176047, + "grad_norm": 0.18789726495742798, + "learning_rate": 7.621185615331061e-06, + "loss": 0.0924, + "step": 11700 + }, + { + "epoch": 3.2714166897698918, + "grad_norm": 0.18376338481903076, + "learning_rate": 7.584000887489373e-06, + "loss": 0.0911, + "step": 11800 + }, + { + "epoch": 3.2991405600221793, + "grad_norm": 0.19736219942569733, + "learning_rate": 7.546620130137557e-06, + "loss": 0.0912, + "step": 11900 + }, + { + "epoch": 3.3268644302744663, + "grad_norm": 0.19527922570705414, + "learning_rate": 7.509046179084061e-06, + "loss": 0.0912, + "step": 12000 + }, + { + "epoch": 3.3268644302744663, + "eval_valid_loss": 0.07622889429330826, + "eval_valid_runtime": 6.4437, + "eval_valid_samples_per_second": 214.627, + "eval_valid_steps_per_second": 6.828, + "step": 12000 + }, + { + "epoch": 3.3268644302744663, + "eval_valid_target_loss": 0.0823676660656929, + "eval_valid_target_runtime": 6.5589, + "eval_valid_target_samples_per_second": 218.635, + "eval_valid_target_steps_per_second": 6.861, + "step": 12000 + }, + { + "epoch": 3.3545883005267534, + "grad_norm": 0.18916228413581848, + "learning_rate": 7.471281884793544e-06, + "loss": 0.0896, + "step": 12100 + }, + { + "epoch": 3.382312170779041, + "grad_norm": 0.1649465262889862, + "learning_rate": 7.4333301121706445e-06, + "loss": 0.0881, + "step": 12200 + }, + { + "epoch": 3.410036041031328, + "grad_norm": 0.18362993001937866, + "learning_rate": 7.3951937403426186e-06, + "loss": 0.0892, + "step": 12300 + }, + { + "epoch": 3.437759911283615, + "grad_norm": 0.19268861413002014, + "learning_rate": 7.356875662440939e-06, + "loss": 0.0879, + "step": 12400 + }, + { + "epoch": 3.4654837815359025, + "grad_norm": 0.17124581336975098, + "learning_rate": 7.318378785381802e-06, + "loss": 0.086, + "step": 12500 + }, + { + "epoch": 3.4654837815359025, + "eval_valid_loss": 0.07317828387022018, + "eval_valid_runtime": 6.4273, + "eval_valid_samples_per_second": 215.177, + "eval_valid_steps_per_second": 6.846, + "step": 12500 + }, + { + "epoch": 3.4654837815359025, + "eval_valid_target_loss": 0.07900213450193405, + "eval_valid_target_runtime": 6.5852, + "eval_valid_target_samples_per_second": 217.76, + "eval_valid_target_steps_per_second": 6.833, + "step": 12500 + }, + { + "epoch": 3.4932076517881896, + "grad_norm": 0.23004941642284393, + "learning_rate": 7.279706029645615e-06, + "loss": 0.0855, + "step": 12600 + }, + { + "epoch": 3.5209315220404767, + "grad_norm": 0.16131635010242462, + "learning_rate": 7.240860329055422e-06, + "loss": 0.0848, + "step": 12700 + }, + { + "epoch": 3.548655392292764, + "grad_norm": 0.19867731630802155, + "learning_rate": 7.201844630554353e-06, + "loss": 0.0851, + "step": 12800 + }, + { + "epoch": 3.5763792625450512, + "grad_norm": 0.17405714094638824, + "learning_rate": 7.162661893982052e-06, + "loss": 0.0839, + "step": 12900 + }, + { + "epoch": 3.6041031327973387, + "grad_norm": 0.19404906034469604, + "learning_rate": 7.123315091850136e-06, + "loss": 0.0839, + "step": 13000 + }, + { + "epoch": 3.6041031327973387, + "eval_valid_loss": 0.07132507115602493, + "eval_valid_runtime": 6.4118, + "eval_valid_samples_per_second": 215.695, + "eval_valid_steps_per_second": 6.862, + "step": 13000 + }, + { + "epoch": 3.6041031327973387, + "eval_valid_target_loss": 0.0771123468875885, + "eval_valid_target_runtime": 6.5745, + "eval_valid_target_samples_per_second": 218.117, + "eval_valid_target_steps_per_second": 6.845, + "step": 13000 + }, + { + "epoch": 3.631827003049626, + "grad_norm": 0.15152141451835632, + "learning_rate": 7.083807209116689e-06, + "loss": 0.0836, + "step": 13100 + }, + { + "epoch": 3.659550873301913, + "grad_norm": 0.18368007242679596, + "learning_rate": 7.044141242959826e-06, + "loss": 0.0827, + "step": 13200 + }, + { + "epoch": 3.6872747435542, + "grad_norm": 0.18081355094909668, + "learning_rate": 7.004320202550303e-06, + "loss": 0.0823, + "step": 13300 + }, + { + "epoch": 3.7149986138064874, + "grad_norm": 0.15222586691379547, + "learning_rate": 6.9643471088232506e-06, + "loss": 0.0801, + "step": 13400 + }, + { + "epoch": 3.7427224840587745, + "grad_norm": 0.1571241021156311, + "learning_rate": 6.9242249942489755e-06, + "loss": 0.0807, + "step": 13500 + }, + { + "epoch": 3.7427224840587745, + "eval_valid_loss": 0.06911951303482056, + "eval_valid_runtime": 6.4701, + "eval_valid_samples_per_second": 213.752, + "eval_valid_steps_per_second": 6.8, + "step": 13500 + }, + { + "epoch": 3.7427224840587745, + "eval_valid_target_loss": 0.07482416182756424, + "eval_valid_target_runtime": 6.5611, + "eval_valid_target_samples_per_second": 218.56, + "eval_valid_target_steps_per_second": 6.859, + "step": 13500 + }, + { + "epoch": 3.770446354311062, + "grad_norm": 0.1546078324317932, + "learning_rate": 6.883956902602933e-06, + "loss": 0.0811, + "step": 13600 + }, + { + "epoch": 3.798170224563349, + "grad_norm": 0.1428447812795639, + "learning_rate": 6.843545888734801e-06, + "loss": 0.0795, + "step": 13700 + }, + { + "epoch": 3.825894094815636, + "grad_norm": 0.1369272619485855, + "learning_rate": 6.802995018336736e-06, + "loss": 0.0794, + "step": 13800 + }, + { + "epoch": 3.8536179650679236, + "grad_norm": 0.1972970962524414, + "learning_rate": 6.762307367710797e-06, + "loss": 0.0785, + "step": 13900 + }, + { + "epoch": 3.8813418353202107, + "grad_norm": 0.15961000323295593, + "learning_rate": 6.721486023535577e-06, + "loss": 0.0787, + "step": 14000 + }, + { + "epoch": 3.8813418353202107, + "eval_valid_loss": 0.06712613999843597, + "eval_valid_runtime": 6.4106, + "eval_valid_samples_per_second": 215.737, + "eval_valid_steps_per_second": 6.864, + "step": 14000 + }, + { + "epoch": 3.8813418353202107, + "eval_valid_target_loss": 0.07271508872509003, + "eval_valid_target_runtime": 6.5891, + "eval_valid_target_samples_per_second": 217.633, + "eval_valid_target_steps_per_second": 6.829, + "step": 14000 + }, + { + "epoch": 3.9090657055724978, + "grad_norm": 0.15836742520332336, + "learning_rate": 6.680534082632036e-06, + "loss": 0.0779, + "step": 14100 + }, + { + "epoch": 3.9367895758247853, + "grad_norm": 0.1906501203775406, + "learning_rate": 6.639454651728561e-06, + "loss": 0.0772, + "step": 14200 + }, + { + "epoch": 3.9645134460770723, + "grad_norm": 0.1872212439775467, + "learning_rate": 6.598250847225286e-06, + "loss": 0.0772, + "step": 14300 + }, + { + "epoch": 3.9922373163293594, + "grad_norm": 0.1689438670873642, + "learning_rate": 6.556925794957678e-06, + "loss": 0.0769, + "step": 14400 + }, + { + "epoch": 4.0199611865816465, + "grad_norm": 0.1830626279115677, + "learning_rate": 6.515482629959392e-06, + "loss": 0.0764, + "step": 14500 + }, + { + "epoch": 4.0199611865816465, + "eval_valid_loss": 0.0653899684548378, + "eval_valid_runtime": 6.4271, + "eval_valid_samples_per_second": 215.181, + "eval_valid_steps_per_second": 6.846, + "step": 14500 + }, + { + "epoch": 4.0199611865816465, + "eval_valid_target_loss": 0.0708317682147026, + "eval_valid_target_runtime": 6.5574, + "eval_valid_target_samples_per_second": 218.684, + "eval_valid_target_steps_per_second": 6.862, + "step": 14500 + }, + { + "epoch": 4.047685056833934, + "grad_norm": 0.1517285257577896, + "learning_rate": 6.473924496224447e-06, + "loss": 0.0757, + "step": 14600 + }, + { + "epoch": 4.0754089270862215, + "grad_norm": 0.15981799364089966, + "learning_rate": 6.432254546468708e-06, + "loss": 0.0751, + "step": 14700 + }, + { + "epoch": 4.1031327973385086, + "grad_norm": 0.14974670112133026, + "learning_rate": 6.3904759418907194e-06, + "loss": 0.0755, + "step": 14800 + }, + { + "epoch": 4.130856667590796, + "grad_norm": 0.15918827056884766, + "learning_rate": 6.348591851931879e-06, + "loss": 0.0743, + "step": 14900 + }, + { + "epoch": 4.158580537843083, + "grad_norm": 0.17248332500457764, + "learning_rate": 6.306605454036001e-06, + "loss": 0.0747, + "step": 15000 + }, + { + "epoch": 4.158580537843083, + "eval_valid_loss": 0.06470626592636108, + "eval_valid_runtime": 6.4429, + "eval_valid_samples_per_second": 214.654, + "eval_valid_steps_per_second": 6.829, + "step": 15000 + }, + { + "epoch": 4.158580537843083, + "eval_valid_target_loss": 0.07004554569721222, + "eval_valid_target_runtime": 6.5941, + "eval_valid_target_samples_per_second": 217.468, + "eval_valid_target_steps_per_second": 6.824, + "step": 15000 + }, + { + "epoch": 4.18630440809537, + "grad_norm": 0.18200209736824036, + "learning_rate": 6.2645199334082674e-06, + "loss": 0.0735, + "step": 15100 + }, + { + "epoch": 4.214028278347658, + "grad_norm": 0.12851852178573608, + "learning_rate": 6.222338482773584e-06, + "loss": 0.0736, + "step": 15200 + }, + { + "epoch": 4.241752148599945, + "grad_norm": 0.15132804214954376, + "learning_rate": 6.180064302134374e-06, + "loss": 0.0738, + "step": 15300 + }, + { + "epoch": 4.269476018852232, + "grad_norm": 0.15047667920589447, + "learning_rate": 6.1377005985278205e-06, + "loss": 0.073, + "step": 15400 + }, + { + "epoch": 4.297199889104519, + "grad_norm": 0.19985252618789673, + "learning_rate": 6.095250585782562e-06, + "loss": 0.0732, + "step": 15500 + }, + { + "epoch": 4.297199889104519, + "eval_valid_loss": 0.062382254749536514, + "eval_valid_runtime": 6.4347, + "eval_valid_samples_per_second": 214.927, + "eval_valid_steps_per_second": 6.838, + "step": 15500 + }, + { + "epoch": 4.297199889104519, + "eval_valid_target_loss": 0.06759324669837952, + "eval_valid_target_runtime": 6.5646, + "eval_valid_target_samples_per_second": 218.446, + "eval_valid_target_steps_per_second": 6.855, + "step": 15500 + }, + { + "epoch": 4.324923759356806, + "grad_norm": 0.16384641826152802, + "learning_rate": 6.0527174842748994e-06, + "loss": 0.0716, + "step": 15600 + }, + { + "epoch": 4.352647629609093, + "grad_norm": 0.14244656264781952, + "learning_rate": 6.0101045206844676e-06, + "loss": 0.0716, + "step": 15700 + }, + { + "epoch": 4.380371499861381, + "grad_norm": 0.16209416091442108, + "learning_rate": 5.9674149277494694e-06, + "loss": 0.0714, + "step": 15800 + }, + { + "epoch": 4.408095370113668, + "grad_norm": 0.17041273415088654, + "learning_rate": 5.92465194402142e-06, + "loss": 0.0715, + "step": 15900 + }, + { + "epoch": 4.435819240365955, + "grad_norm": 0.16730940341949463, + "learning_rate": 5.881818813619463e-06, + "loss": 0.0714, + "step": 16000 + }, + { + "epoch": 4.435819240365955, + "eval_valid_loss": 0.061134014278650284, + "eval_valid_runtime": 6.4104, + "eval_valid_samples_per_second": 215.742, + "eval_valid_steps_per_second": 6.864, + "step": 16000 + }, + { + "epoch": 4.435819240365955, + "eval_valid_target_loss": 0.06638547778129578, + "eval_valid_target_runtime": 6.5651, + "eval_valid_target_samples_per_second": 218.427, + "eval_valid_target_steps_per_second": 6.854, + "step": 16000 + }, + { + "epoch": 4.463543110618242, + "grad_norm": 0.13161396980285645, + "learning_rate": 5.8389187859842675e-06, + "loss": 0.0703, + "step": 16100 + }, + { + "epoch": 4.491266980870529, + "grad_norm": 0.13423210382461548, + "learning_rate": 5.7959551156315156e-06, + "loss": 0.0707, + "step": 16200 + }, + { + "epoch": 4.518990851122817, + "grad_norm": 0.20051045715808868, + "learning_rate": 5.752931061904994e-06, + "loss": 0.0699, + "step": 16300 + }, + { + "epoch": 4.546714721375104, + "grad_norm": 0.15945318341255188, + "learning_rate": 5.709849888729351e-06, + "loss": 0.0697, + "step": 16400 + }, + { + "epoch": 4.574438591627391, + "grad_norm": 0.13749030232429504, + "learning_rate": 5.666714864362468e-06, + "loss": 0.0704, + "step": 16500 + }, + { + "epoch": 4.574438591627391, + "eval_valid_loss": 0.06001834571361542, + "eval_valid_runtime": 6.4467, + "eval_valid_samples_per_second": 214.529, + "eval_valid_steps_per_second": 6.825, + "step": 16500 + }, + { + "epoch": 4.574438591627391, + "eval_valid_target_loss": 0.06535307317972183, + "eval_valid_target_runtime": 6.5686, + "eval_valid_target_samples_per_second": 218.311, + "eval_valid_target_steps_per_second": 6.851, + "step": 16500 + }, + { + "epoch": 4.602162461879678, + "grad_norm": 0.133077010512352, + "learning_rate": 5.6235292611475326e-06, + "loss": 0.0693, + "step": 16600 + }, + { + "epoch": 4.629886332131965, + "grad_norm": 0.1508035957813263, + "learning_rate": 5.580296355264783e-06, + "loss": 0.069, + "step": 16700 + }, + { + "epoch": 4.6576102023842525, + "grad_norm": 0.14195485413074493, + "learning_rate": 5.537019426482966e-06, + "loss": 0.0695, + "step": 16800 + }, + { + "epoch": 4.6853340726365404, + "grad_norm": 0.16586261987686157, + "learning_rate": 5.493701757910536e-06, + "loss": 0.0684, + "step": 16900 + }, + { + "epoch": 4.7130579428888275, + "grad_norm": 0.13865657150745392, + "learning_rate": 5.4503466357465765e-06, + "loss": 0.0682, + "step": 17000 + }, + { + "epoch": 4.7130579428888275, + "eval_valid_loss": 0.0584811232984066, + "eval_valid_runtime": 6.422, + "eval_valid_samples_per_second": 215.352, + "eval_valid_steps_per_second": 6.851, + "step": 17000 + }, + { + "epoch": 4.7130579428888275, + "eval_valid_target_loss": 0.06370435655117035, + "eval_valid_target_runtime": 6.5705, + "eval_valid_target_samples_per_second": 218.247, + "eval_valid_target_steps_per_second": 6.849, + "step": 17000 + }, + { + "epoch": 4.740781813141115, + "grad_norm": 0.1934811919927597, + "learning_rate": 5.406957349031504e-06, + "loss": 0.0686, + "step": 17100 + }, + { + "epoch": 4.768505683393402, + "grad_norm": 0.16662567853927612, + "learning_rate": 5.363537189397556e-06, + "loss": 0.0682, + "step": 17200 + }, + { + "epoch": 4.796229553645689, + "grad_norm": 0.15507076680660248, + "learning_rate": 5.320089450819075e-06, + "loss": 0.0673, + "step": 17300 + }, + { + "epoch": 4.823953423897976, + "grad_norm": 0.12763585150241852, + "learning_rate": 5.276617429362616e-06, + "loss": 0.0671, + "step": 17400 + }, + { + "epoch": 4.851677294150264, + "grad_norm": 0.15640078485012054, + "learning_rate": 5.233124422936906e-06, + "loss": 0.0669, + "step": 17500 + }, + { + "epoch": 4.851677294150264, + "eval_valid_loss": 0.05754322186112404, + "eval_valid_runtime": 6.4388, + "eval_valid_samples_per_second": 214.792, + "eval_valid_steps_per_second": 6.834, + "step": 17500 + }, + { + "epoch": 4.851677294150264, + "eval_valid_target_loss": 0.06262939423322678, + "eval_valid_target_runtime": 6.5536, + "eval_valid_target_samples_per_second": 218.81, + "eval_valid_target_steps_per_second": 6.866, + "step": 17500 + }, + { + "epoch": 4.879401164402551, + "grad_norm": 0.16545389592647552, + "learning_rate": 5.189613731042645e-06, + "loss": 0.0663, + "step": 17600 + }, + { + "epoch": 4.907125034654838, + "grad_norm": 0.17085812985897064, + "learning_rate": 5.146088654522208e-06, + "loss": 0.0657, + "step": 17700 + }, + { + "epoch": 4.934848904907125, + "grad_norm": 0.14638109505176544, + "learning_rate": 5.102552495309222e-06, + "loss": 0.0677, + "step": 17800 + }, + { + "epoch": 4.962572775159412, + "grad_norm": 0.15568013489246368, + "learning_rate": 5.059008556178079e-06, + "loss": 0.0657, + "step": 17900 + }, + { + "epoch": 4.9902966454117, + "grad_norm": 0.16898399591445923, + "learning_rate": 5.015460140493381e-06, + "loss": 0.0661, + "step": 18000 + }, + { + "epoch": 4.9902966454117, + "eval_valid_loss": 0.05648580938577652, + "eval_valid_runtime": 6.4207, + "eval_valid_samples_per_second": 215.397, + "eval_valid_steps_per_second": 6.853, + "step": 18000 + }, + { + "epoch": 4.9902966454117, + "eval_valid_target_loss": 0.06151015684008598, + "eval_valid_target_runtime": 6.5952, + "eval_valid_target_samples_per_second": 217.432, + "eval_valid_target_steps_per_second": 6.823, + "step": 18000 + }, + { + "epoch": 5.018020515663987, + "grad_norm": 0.13535688817501068, + "learning_rate": 4.971910551959332e-06, + "loss": 0.0654, + "step": 18100 + }, + { + "epoch": 5.045744385916274, + "grad_norm": 0.16001687943935394, + "learning_rate": 4.928363094369108e-06, + "loss": 0.0656, + "step": 18200 + }, + { + "epoch": 5.073468256168561, + "grad_norm": 0.1575719267129898, + "learning_rate": 4.88482107135423e-06, + "loss": 0.0641, + "step": 18300 + }, + { + "epoch": 5.101192126420848, + "grad_norm": 0.1607745736837387, + "learning_rate": 4.841287786133937e-06, + "loss": 0.0642, + "step": 18400 + }, + { + "epoch": 5.128915996673135, + "grad_norm": 0.13689269125461578, + "learning_rate": 4.797766541264592e-06, + "loss": 0.0646, + "step": 18500 + }, + { + "epoch": 5.128915996673135, + "eval_valid_loss": 0.05563423037528992, + "eval_valid_runtime": 6.4248, + "eval_valid_samples_per_second": 215.261, + "eval_valid_steps_per_second": 6.849, + "step": 18500 + }, + { + "epoch": 5.128915996673135, + "eval_valid_target_loss": 0.06068035215139389, + "eval_valid_target_runtime": 6.561, + "eval_valid_target_samples_per_second": 218.566, + "eval_valid_target_steps_per_second": 6.859, + "step": 18500 + }, + { + "epoch": 5.156639866925423, + "grad_norm": 0.13576319813728333, + "learning_rate": 4.754260638389145e-06, + "loss": 0.0641, + "step": 18600 + }, + { + "epoch": 5.18436373717771, + "grad_norm": 0.13574448227882385, + "learning_rate": 4.710773377986659e-06, + "loss": 0.0643, + "step": 18700 + }, + { + "epoch": 5.212087607429997, + "grad_norm": 0.11536768078804016, + "learning_rate": 4.667308059121928e-06, + "loss": 0.064, + "step": 18800 + }, + { + "epoch": 5.239811477682284, + "grad_norm": 0.1470881700515747, + "learning_rate": 4.623867979195196e-06, + "loss": 0.0637, + "step": 18900 + }, + { + "epoch": 5.2675353479345715, + "grad_norm": 0.13156047463417053, + "learning_rate": 4.580456433692017e-06, + "loss": 0.0635, + "step": 19000 + }, + { + "epoch": 5.2675353479345715, + "eval_valid_loss": 0.05473410338163376, + "eval_valid_runtime": 6.4623, + "eval_valid_samples_per_second": 214.012, + "eval_valid_steps_per_second": 6.809, + "step": 19000 + }, + { + "epoch": 5.2675353479345715, + "eval_valid_target_loss": 0.05973204970359802, + "eval_valid_target_runtime": 6.5636, + "eval_valid_target_samples_per_second": 218.477, + "eval_valid_target_steps_per_second": 6.856, + "step": 19000 + }, + { + "epoch": 5.2952592181868585, + "grad_norm": 0.132376030087471, + "learning_rate": 4.537076715933242e-06, + "loss": 0.0638, + "step": 19100 + }, + { + "epoch": 5.3229830884391465, + "grad_norm": 0.14191821217536926, + "learning_rate": 4.493732116825174e-06, + "loss": 0.064, + "step": 19200 + }, + { + "epoch": 5.3507069586914335, + "grad_norm": 0.1247839480638504, + "learning_rate": 4.45042592460993e-06, + "loss": 0.0627, + "step": 19300 + }, + { + "epoch": 5.378430828943721, + "grad_norm": 0.12980355322360992, + "learning_rate": 4.4071614246159596e-06, + "loss": 0.0632, + "step": 19400 + }, + { + "epoch": 5.406154699196008, + "grad_norm": 0.1391134262084961, + "learning_rate": 4.363941899008833e-06, + "loss": 0.0625, + "step": 19500 + }, + { + "epoch": 5.406154699196008, + "eval_valid_loss": 0.05415208637714386, + "eval_valid_runtime": 6.4065, + "eval_valid_samples_per_second": 215.873, + "eval_valid_steps_per_second": 6.868, + "step": 19500 + }, + { + "epoch": 5.406154699196008, + "eval_valid_target_loss": 0.05894719064235687, + "eval_valid_target_runtime": 6.569, + "eval_valid_target_samples_per_second": 218.299, + "eval_valid_target_steps_per_second": 6.85, + "step": 19500 + }, + { + "epoch": 5.433878569448295, + "grad_norm": 0.2045671045780182, + "learning_rate": 4.320770626542238e-06, + "loss": 0.0629, + "step": 19600 + }, + { + "epoch": 5.461602439700582, + "grad_norm": 0.1417771577835083, + "learning_rate": 4.277650882309238e-06, + "loss": 0.0625, + "step": 19700 + }, + { + "epoch": 5.48932630995287, + "grad_norm": 0.14284995198249817, + "learning_rate": 4.234585937493829e-06, + "loss": 0.0623, + "step": 19800 + }, + { + "epoch": 5.517050180205157, + "grad_norm": 0.1546027809381485, + "learning_rate": 4.1915790591227615e-06, + "loss": 0.0625, + "step": 19900 + }, + { + "epoch": 5.544774050457444, + "grad_norm": 0.1454819142818451, + "learning_rate": 4.148633509817715e-06, + "loss": 0.0613, + "step": 20000 + }, + { + "epoch": 5.544774050457444, + "eval_valid_loss": 0.05364985764026642, + "eval_valid_runtime": 6.436, + "eval_valid_samples_per_second": 214.885, + "eval_valid_steps_per_second": 6.837, + "step": 20000 + }, + { + "epoch": 5.544774050457444, + "eval_valid_target_loss": 0.05850011110305786, + "eval_valid_target_runtime": 6.5534, + "eval_valid_target_samples_per_second": 218.819, + "eval_valid_target_steps_per_second": 6.867, + "step": 20000 + }, + { + "epoch": 5.572497920709731, + "grad_norm": 0.12440012395381927, + "learning_rate": 4.105752547547764e-06, + "loss": 0.0613, + "step": 20100 + }, + { + "epoch": 5.600221790962018, + "grad_norm": 0.14089658856391907, + "learning_rate": 4.062939425382236e-06, + "loss": 0.0616, + "step": 20200 + }, + { + "epoch": 5.627945661214305, + "grad_norm": 0.24770374596118927, + "learning_rate": 4.020197391243922e-06, + "loss": 0.0621, + "step": 20300 + }, + { + "epoch": 5.655669531466593, + "grad_norm": 0.11835476011037827, + "learning_rate": 3.977529687662671e-06, + "loss": 0.0619, + "step": 20400 + }, + { + "epoch": 5.68339340171888, + "grad_norm": 0.12585273385047913, + "learning_rate": 3.93493955152941e-06, + "loss": 0.0612, + "step": 20500 + }, + { + "epoch": 5.68339340171888, + "eval_valid_loss": 0.05319705978035927, + "eval_valid_runtime": 6.4196, + "eval_valid_samples_per_second": 215.435, + "eval_valid_steps_per_second": 6.854, + "step": 20500 + }, + { + "epoch": 5.68339340171888, + "eval_valid_target_loss": 0.058061882853507996, + "eval_valid_target_runtime": 6.5894, + "eval_valid_target_samples_per_second": 217.622, + "eval_valid_target_steps_per_second": 6.829, + "step": 20500 + }, + { + "epoch": 5.711117271971167, + "grad_norm": 0.15103484690189362, + "learning_rate": 3.892430213850587e-06, + "loss": 0.0615, + "step": 20600 + }, + { + "epoch": 5.738841142223454, + "grad_norm": 0.1266421228647232, + "learning_rate": 3.850004899503051e-06, + "loss": 0.0613, + "step": 20700 + }, + { + "epoch": 5.766565012475741, + "grad_norm": 0.1100655049085617, + "learning_rate": 3.8076668269894045e-06, + "loss": 0.0606, + "step": 20800 + }, + { + "epoch": 5.794288882728029, + "grad_norm": 0.1395365446805954, + "learning_rate": 3.765419208193848e-06, + "loss": 0.0614, + "step": 20900 + }, + { + "epoch": 5.822012752980316, + "grad_norm": 0.12668344378471375, + "learning_rate": 3.723265248138506e-06, + "loss": 0.0614, + "step": 21000 + }, + { + "epoch": 5.822012752980316, + "eval_valid_loss": 0.052489351481199265, + "eval_valid_runtime": 6.4455, + "eval_valid_samples_per_second": 214.567, + "eval_valid_steps_per_second": 6.826, + "step": 21000 + }, + { + "epoch": 5.822012752980316, + "eval_valid_target_loss": 0.057213690131902695, + "eval_valid_target_runtime": 6.5546, + "eval_valid_target_samples_per_second": 218.777, + "eval_valid_target_steps_per_second": 6.865, + "step": 21000 + }, + { + "epoch": 5.849736623232603, + "grad_norm": 0.12728376686573029, + "learning_rate": 3.681208144740291e-06, + "loss": 0.0612, + "step": 21100 + }, + { + "epoch": 5.87746049348489, + "grad_norm": 0.14501620829105377, + "learning_rate": 3.6392510885682965e-06, + "loss": 0.0601, + "step": 21200 + }, + { + "epoch": 5.9051843637371775, + "grad_norm": 0.1082565188407898, + "learning_rate": 3.5973972626017594e-06, + "loss": 0.0608, + "step": 21300 + }, + { + "epoch": 5.9329082339894645, + "grad_norm": 0.14926603436470032, + "learning_rate": 3.5556498419885867e-06, + "loss": 0.0603, + "step": 21400 + }, + { + "epoch": 5.9606321042417525, + "grad_norm": 0.1263745278120041, + "learning_rate": 3.514011993804469e-06, + "loss": 0.0602, + "step": 21500 + }, + { + "epoch": 5.9606321042417525, + "eval_valid_loss": 0.05212084576487541, + "eval_valid_runtime": 6.439, + "eval_valid_samples_per_second": 214.785, + "eval_valid_steps_per_second": 6.833, + "step": 21500 + }, + { + "epoch": 5.9606321042417525, + "eval_valid_target_loss": 0.05688408389687538, + "eval_valid_target_runtime": 6.5822, + "eval_valid_target_samples_per_second": 217.862, + "eval_valid_target_steps_per_second": 6.837, + "step": 21500 + }, + { + "epoch": 5.98835597449404, + "grad_norm": 0.1368781179189682, + "learning_rate": 3.4724868768126384e-06, + "loss": 0.0604, + "step": 21600 + }, + { + "epoch": 6.016079844746327, + "grad_norm": 0.15087148547172546, + "learning_rate": 3.4310776412242195e-06, + "loss": 0.06, + "step": 21700 + }, + { + "epoch": 6.043803714998614, + "grad_norm": 0.11400382220745087, + "learning_rate": 3.3897874284592467e-06, + "loss": 0.0594, + "step": 21800 + }, + { + "epoch": 6.071527585250901, + "grad_norm": 0.1169167011976242, + "learning_rate": 3.348619370908361e-06, + "loss": 0.0598, + "step": 21900 + }, + { + "epoch": 6.099251455503188, + "grad_norm": 0.12172160297632217, + "learning_rate": 3.3075765916951576e-06, + "loss": 0.0599, + "step": 22000 + }, + { + "epoch": 6.099251455503188, + "eval_valid_loss": 0.05157113075256348, + "eval_valid_runtime": 6.4258, + "eval_valid_samples_per_second": 215.224, + "eval_valid_steps_per_second": 6.847, + "step": 22000 + }, + { + "epoch": 6.099251455503188, + "eval_valid_target_loss": 0.056347791105508804, + "eval_valid_target_runtime": 6.5915, + "eval_valid_target_samples_per_second": 217.554, + "eval_valid_target_steps_per_second": 6.827, + "step": 22000 + }, + { + "epoch": 6.126975325755476, + "grad_norm": 0.1324358880519867, + "learning_rate": 3.2666622044392765e-06, + "loss": 0.0591, + "step": 22100 + }, + { + "epoch": 6.154699196007763, + "grad_norm": 0.12708991765975952, + "learning_rate": 3.225879313020178e-06, + "loss": 0.0591, + "step": 22200 + }, + { + "epoch": 6.18242306626005, + "grad_norm": 0.11844506114721298, + "learning_rate": 3.18523101134169e-06, + "loss": 0.0592, + "step": 22300 + }, + { + "epoch": 6.210146936512337, + "grad_norm": 0.12888644635677338, + "learning_rate": 3.1447203830972827e-06, + "loss": 0.0597, + "step": 22400 + }, + { + "epoch": 6.237870806764624, + "grad_norm": 0.1485096514225006, + "learning_rate": 3.104350501536134e-06, + "loss": 0.0598, + "step": 22500 + }, + { + "epoch": 6.237870806764624, + "eval_valid_loss": 0.051265206187963486, + "eval_valid_runtime": 6.437, + "eval_valid_samples_per_second": 214.85, + "eval_valid_steps_per_second": 6.835, + "step": 22500 + }, + { + "epoch": 6.237870806764624, + "eval_valid_target_loss": 0.056084584444761276, + "eval_valid_target_runtime": 6.6, + "eval_valid_target_samples_per_second": 217.273, + "eval_valid_target_steps_per_second": 6.818, + "step": 22500 + }, + { + "epoch": 6.265594677016912, + "grad_norm": 0.11319620907306671, + "learning_rate": 3.064124429229992e-06, + "loss": 0.0581, + "step": 22600 + }, + { + "epoch": 6.293318547269199, + "grad_norm": 0.125896617770195, + "learning_rate": 3.0240452178408286e-06, + "loss": 0.0594, + "step": 22700 + }, + { + "epoch": 6.321042417521486, + "grad_norm": 0.13202796876430511, + "learning_rate": 2.9841159078893377e-06, + "loss": 0.0587, + "step": 22800 + }, + { + "epoch": 6.348766287773773, + "grad_norm": 0.12477891147136688, + "learning_rate": 2.944339528524278e-06, + "loss": 0.0582, + "step": 22900 + }, + { + "epoch": 6.37649015802606, + "grad_norm": 0.13174673914909363, + "learning_rate": 2.9047190972926597e-06, + "loss": 0.0585, + "step": 23000 + }, + { + "epoch": 6.37649015802606, + "eval_valid_loss": 0.05099370330572128, + "eval_valid_runtime": 6.4377, + "eval_valid_samples_per_second": 214.828, + "eval_valid_steps_per_second": 6.835, + "step": 23000 + }, + { + "epoch": 6.37649015802606, + "eval_valid_target_loss": 0.055660318583250046, + "eval_valid_target_runtime": 6.5668, + "eval_valid_target_samples_per_second": 218.37, + "eval_valid_target_steps_per_second": 6.853, + "step": 23000 + }, + { + "epoch": 6.404214028278347, + "grad_norm": 0.12851925194263458, + "learning_rate": 2.8652576199108395e-06, + "loss": 0.0586, + "step": 23100 + }, + { + "epoch": 6.431937898530635, + "grad_norm": 0.10676029324531555, + "learning_rate": 2.8259580900364825e-06, + "loss": 0.0584, + "step": 23200 + }, + { + "epoch": 6.459661768782922, + "grad_norm": 0.1461838185787201, + "learning_rate": 2.786823489041478e-06, + "loss": 0.0583, + "step": 23300 + }, + { + "epoch": 6.487385639035209, + "grad_norm": 0.12321025878190994, + "learning_rate": 2.747856785785743e-06, + "loss": 0.0579, + "step": 23400 + }, + { + "epoch": 6.515109509287496, + "grad_norm": 0.1209678128361702, + "learning_rate": 2.7090609363919986e-06, + "loss": 0.0581, + "step": 23500 + }, + { + "epoch": 6.515109509287496, + "eval_valid_loss": 0.050510190427303314, + "eval_valid_runtime": 6.447, + "eval_valid_samples_per_second": 214.517, + "eval_valid_steps_per_second": 6.825, + "step": 23500 + }, + { + "epoch": 6.515109509287496, + "eval_valid_target_loss": 0.0551883801817894, + "eval_valid_target_runtime": 6.5701, + "eval_valid_target_samples_per_second": 218.262, + "eval_valid_target_steps_per_second": 6.849, + "step": 23500 + }, + { + "epoch": 6.5428333795397835, + "grad_norm": 0.15566356480121613, + "learning_rate": 2.6704388840215277e-06, + "loss": 0.0578, + "step": 23600 + }, + { + "epoch": 6.570557249792071, + "grad_norm": 0.10754121840000153, + "learning_rate": 2.6319935586508814e-06, + "loss": 0.058, + "step": 23700 + }, + { + "epoch": 6.5982811200443585, + "grad_norm": 0.12134023010730743, + "learning_rate": 2.593727876849601e-06, + "loss": 0.0577, + "step": 23800 + }, + { + "epoch": 6.626004990296646, + "grad_norm": 0.12984460592269897, + "learning_rate": 2.555644741558979e-06, + "loss": 0.0575, + "step": 23900 + }, + { + "epoch": 6.653728860548933, + "grad_norm": 0.13557353615760803, + "learning_rate": 2.51774704187181e-06, + "loss": 0.0571, + "step": 24000 + }, + { + "epoch": 6.653728860548933, + "eval_valid_loss": 0.0503346286714077, + "eval_valid_runtime": 6.419, + "eval_valid_samples_per_second": 215.455, + "eval_valid_steps_per_second": 6.855, + "step": 24000 + }, + { + "epoch": 6.653728860548933, + "eval_valid_target_loss": 0.0548863522708416, + "eval_valid_target_runtime": 6.5823, + "eval_valid_target_samples_per_second": 217.857, + "eval_valid_target_steps_per_second": 6.837, + "step": 24000 + }, + { + "epoch": 6.68145273080122, + "grad_norm": 0.10979162156581879, + "learning_rate": 2.4800376528132297e-06, + "loss": 0.0576, + "step": 24100 + }, + { + "epoch": 6.709176601053507, + "grad_norm": 0.16127757728099823, + "learning_rate": 2.4425194351226082e-06, + "loss": 0.0579, + "step": 24200 + }, + { + "epoch": 6.736900471305795, + "grad_norm": 0.13306181132793427, + "learning_rate": 2.4051952350365194e-06, + "loss": 0.0572, + "step": 24300 + }, + { + "epoch": 6.764624341558082, + "grad_norm": 0.11353787779808044, + "learning_rate": 2.368067884072821e-06, + "loss": 0.0573, + "step": 24400 + }, + { + "epoch": 6.792348211810369, + "grad_norm": 0.10115820914506912, + "learning_rate": 2.331140198815849e-06, + "loss": 0.0574, + "step": 24500 + }, + { + "epoch": 6.792348211810369, + "eval_valid_loss": 0.049953412264585495, + "eval_valid_runtime": 6.4338, + "eval_valid_samples_per_second": 214.958, + "eval_valid_steps_per_second": 6.839, + "step": 24500 + }, + { + "epoch": 6.792348211810369, + "eval_valid_target_loss": 0.054579559713602066, + "eval_valid_target_runtime": 6.5694, + "eval_valid_target_samples_per_second": 218.283, + "eval_valid_target_steps_per_second": 6.85, + "step": 24500 + }, + { + "epoch": 6.820072082062656, + "grad_norm": 0.10899285972118378, + "learning_rate": 2.294414980702741e-06, + "loss": 0.0573, + "step": 24600 + }, + { + "epoch": 6.847795952314943, + "grad_norm": 0.1248159185051918, + "learning_rate": 2.257895015810913e-06, + "loss": 0.0568, + "step": 24700 + }, + { + "epoch": 6.87551982256723, + "grad_norm": 0.10761197656393051, + "learning_rate": 2.221583074646701e-06, + "loss": 0.0574, + "step": 24800 + }, + { + "epoch": 6.903243692819517, + "grad_norm": 0.13541601598262787, + "learning_rate": 2.1854819119351784e-06, + "loss": 0.0562, + "step": 24900 + }, + { + "epoch": 6.930967563071805, + "grad_norm": 0.10959000140428543, + "learning_rate": 2.1495942664111814e-06, + "loss": 0.0576, + "step": 25000 + }, + { + "epoch": 6.930967563071805, + "eval_valid_loss": 0.049802832305431366, + "eval_valid_runtime": 6.4091, + "eval_valid_samples_per_second": 215.786, + "eval_valid_steps_per_second": 6.865, + "step": 25000 + }, + { + "epoch": 6.930967563071805, + "eval_valid_target_loss": 0.05434631556272507, + "eval_valid_target_runtime": 6.5766, + "eval_valid_target_samples_per_second": 218.047, + "eval_valid_target_steps_per_second": 6.842, + "step": 25000 + }, + { + "epoch": 6.958691433324092, + "grad_norm": 0.11864270269870758, + "learning_rate": 2.113922860611532e-06, + "loss": 0.0571, + "step": 25100 + }, + { + "epoch": 6.986415303576379, + "grad_norm": 0.10493431985378265, + "learning_rate": 2.078470400668506e-06, + "loss": 0.0572, + "step": 25200 + }, + { + "epoch": 7.014139173828666, + "grad_norm": 0.10294145345687866, + "learning_rate": 2.0432395761045427e-06, + "loss": 0.0562, + "step": 25300 + }, + { + "epoch": 7.041863044080953, + "grad_norm": 0.11174608767032623, + "learning_rate": 2.008233059628193e-06, + "loss": 0.0562, + "step": 25400 + }, + { + "epoch": 7.069586914333241, + "grad_norm": 0.10171514004468918, + "learning_rate": 1.9734535069313753e-06, + "loss": 0.056, + "step": 25500 + }, + { + "epoch": 7.069586914333241, + "eval_valid_loss": 0.04948737472295761, + "eval_valid_runtime": 6.442, + "eval_valid_samples_per_second": 214.685, + "eval_valid_steps_per_second": 6.83, + "step": 25500 + }, + { + "epoch": 7.069586914333241, + "eval_valid_target_loss": 0.05410830304026604, + "eval_valid_target_runtime": 6.5896, + "eval_valid_target_samples_per_second": 217.617, + "eval_valid_target_steps_per_second": 6.829, + "step": 25500 + }, + { + "epoch": 7.097310784585528, + "grad_norm": 0.10731488466262817, + "learning_rate": 1.9389035564879104e-06, + "loss": 0.0569, + "step": 25600 + }, + { + "epoch": 7.125034654837815, + "grad_norm": 0.0954216718673706, + "learning_rate": 1.9045858293533399e-06, + "loss": 0.0566, + "step": 25700 + }, + { + "epoch": 7.1527585250901025, + "grad_norm": 0.11443454772233963, + "learning_rate": 1.8705029289661054e-06, + "loss": 0.057, + "step": 25800 + }, + { + "epoch": 7.1804823953423895, + "grad_norm": 0.10671606659889221, + "learning_rate": 1.8366574409500344e-06, + "loss": 0.0561, + "step": 25900 + }, + { + "epoch": 7.208206265594677, + "grad_norm": 0.1028604656457901, + "learning_rate": 1.8030519329181916e-06, + "loss": 0.0561, + "step": 26000 + }, + { + "epoch": 7.208206265594677, + "eval_valid_loss": 0.04931313917040825, + "eval_valid_runtime": 6.431, + "eval_valid_samples_per_second": 215.053, + "eval_valid_steps_per_second": 6.842, + "step": 26000 + }, + { + "epoch": 7.208206265594677, + "eval_valid_target_loss": 0.053888678550720215, + "eval_valid_target_runtime": 6.5712, + "eval_valid_target_samples_per_second": 218.225, + "eval_valid_target_steps_per_second": 6.848, + "step": 26000 + }, + { + "epoch": 7.2359301358469645, + "grad_norm": 0.11538730561733246, + "learning_rate": 1.7696889542780904e-06, + "loss": 0.0564, + "step": 26100 + }, + { + "epoch": 7.263654006099252, + "grad_norm": 0.10585539788007736, + "learning_rate": 1.7365710360382882e-06, + "loss": 0.0562, + "step": 26200 + }, + { + "epoch": 7.291377876351539, + "grad_norm": 0.09750411659479141, + "learning_rate": 1.7037006906163773e-06, + "loss": 0.0563, + "step": 26300 + }, + { + "epoch": 7.319101746603826, + "grad_norm": 0.10777630656957626, + "learning_rate": 1.6710804116483886e-06, + "loss": 0.0556, + "step": 26400 + }, + { + "epoch": 7.346825616856113, + "grad_norm": 0.13231071829795837, + "learning_rate": 1.6387126737996067e-06, + "loss": 0.0559, + "step": 26500 + }, + { + "epoch": 7.346825616856113, + "eval_valid_loss": 0.04909936338663101, + "eval_valid_runtime": 6.4292, + "eval_valid_samples_per_second": 215.112, + "eval_valid_steps_per_second": 6.844, + "step": 26500 + }, + { + "epoch": 7.346825616856113, + "eval_valid_target_loss": 0.05357712134718895, + "eval_valid_target_runtime": 6.5542, + "eval_valid_target_samples_per_second": 218.792, + "eval_valid_target_steps_per_second": 6.866, + "step": 26500 + }, + { + "epoch": 7.374549487108401, + "grad_norm": 0.10591776669025421, + "learning_rate": 1.6065999325768544e-06, + "loss": 0.0559, + "step": 26600 + }, + { + "epoch": 7.402273357360688, + "grad_norm": 0.11603645980358124, + "learning_rate": 1.5747446241421931e-06, + "loss": 0.0557, + "step": 26700 + }, + { + "epoch": 7.429997227612975, + "grad_norm": 0.09715123474597931, + "learning_rate": 1.5431491651281123e-06, + "loss": 0.0563, + "step": 26800 + }, + { + "epoch": 7.457721097865262, + "grad_norm": 0.10046205669641495, + "learning_rate": 1.511815952454208e-06, + "loss": 0.0556, + "step": 26900 + }, + { + "epoch": 7.485444968117549, + "grad_norm": 0.11805932968854904, + "learning_rate": 1.480747363145334e-06, + "loss": 0.0556, + "step": 27000 + }, + { + "epoch": 7.485444968117549, + "eval_valid_loss": 0.04887402430176735, + "eval_valid_runtime": 6.4098, + "eval_valid_samples_per_second": 215.763, + "eval_valid_steps_per_second": 6.864, + "step": 27000 + }, + { + "epoch": 7.485444968117549, + "eval_valid_target_loss": 0.05348382145166397, + "eval_valid_target_runtime": 6.5773, + "eval_valid_target_samples_per_second": 218.023, + "eval_valid_target_steps_per_second": 6.842, + "step": 27000 + }, + { + "epoch": 7.513168838369836, + "grad_norm": 0.1107444316148758, + "learning_rate": 1.4499457541512746e-06, + "loss": 0.0554, + "step": 27100 + }, + { + "epoch": 7.540892708622124, + "grad_norm": 0.10029349476099014, + "learning_rate": 1.4194134621679478e-06, + "loss": 0.0559, + "step": 27200 + }, + { + "epoch": 7.568616578874411, + "grad_norm": 0.09976372122764587, + "learning_rate": 1.3891528034601316e-06, + "loss": 0.0565, + "step": 27300 + }, + { + "epoch": 7.596340449126698, + "grad_norm": 0.10560230165719986, + "learning_rate": 1.3591660736857453e-06, + "loss": 0.0553, + "step": 27400 + }, + { + "epoch": 7.624064319378985, + "grad_norm": 0.09814602881669998, + "learning_rate": 1.329455547721697e-06, + "loss": 0.0552, + "step": 27500 + }, + { + "epoch": 7.624064319378985, + "eval_valid_loss": 0.04867083579301834, + "eval_valid_runtime": 6.4389, + "eval_valid_samples_per_second": 214.79, + "eval_valid_steps_per_second": 6.834, + "step": 27500 + }, + { + "epoch": 7.624064319378985, + "eval_valid_target_loss": 0.053231850266456604, + "eval_valid_target_runtime": 6.5692, + "eval_valid_target_samples_per_second": 218.292, + "eval_valid_target_steps_per_second": 6.85, + "step": 27500 + }, + { + "epoch": 7.651788189631272, + "grad_norm": 0.10253589600324631, + "learning_rate": 1.300023479491303e-06, + "loss": 0.0555, + "step": 27600 + }, + { + "epoch": 7.67951205988356, + "grad_norm": 0.10933282226324081, + "learning_rate": 1.2708721017933007e-06, + "loss": 0.0551, + "step": 27700 + }, + { + "epoch": 7.707235930135847, + "grad_norm": 0.11853484809398651, + "learning_rate": 1.2420036261324598e-06, + "loss": 0.056, + "step": 27800 + }, + { + "epoch": 7.734959800388134, + "grad_norm": 0.0992041826248169, + "learning_rate": 1.2134202425518139e-06, + "loss": 0.0547, + "step": 27900 + }, + { + "epoch": 7.762683670640421, + "grad_norm": 0.10824355483055115, + "learning_rate": 1.185124119466517e-06, + "loss": 0.0554, + "step": 28000 + }, + { + "epoch": 7.762683670640421, + "eval_valid_loss": 0.048471271991729736, + "eval_valid_runtime": 6.414, + "eval_valid_samples_per_second": 215.623, + "eval_valid_steps_per_second": 6.86, + "step": 28000 + }, + { + "epoch": 7.762683670640421, + "eval_valid_target_loss": 0.05302482470870018, + "eval_valid_target_runtime": 6.5682, + "eval_valid_target_samples_per_second": 218.326, + "eval_valid_target_steps_per_second": 6.851, + "step": 28000 + }, + { + "epoch": 7.7904075408927085, + "grad_norm": 0.09927680343389511, + "learning_rate": 1.1571174034993416e-06, + "loss": 0.0555, + "step": 28100 + }, + { + "epoch": 7.8181314111449955, + "grad_norm": 0.09600567072629929, + "learning_rate": 1.129402219317825e-06, + "loss": 0.0553, + "step": 28200 + }, + { + "epoch": 7.845855281397283, + "grad_norm": 0.11057105660438538, + "learning_rate": 1.1019806694730989e-06, + "loss": 0.0557, + "step": 28300 + }, + { + "epoch": 7.873579151649571, + "grad_norm": 0.10991726815700531, + "learning_rate": 1.074854834240368e-06, + "loss": 0.0553, + "step": 28400 + }, + { + "epoch": 7.901303021901858, + "grad_norm": 0.09168905019760132, + "learning_rate": 1.0480267714611048e-06, + "loss": 0.0551, + "step": 28500 + }, + { + "epoch": 7.901303021901858, + "eval_valid_loss": 0.04835043475031853, + "eval_valid_runtime": 6.4532, + "eval_valid_samples_per_second": 214.313, + "eval_valid_steps_per_second": 6.818, + "step": 28500 + }, + { + "epoch": 7.901303021901858, + "eval_valid_target_loss": 0.05293356999754906, + "eval_valid_target_runtime": 6.5812, + "eval_valid_target_samples_per_second": 217.894, + "eval_valid_target_steps_per_second": 6.838, + "step": 28500 + }, + { + "epoch": 7.929026892154145, + "grad_norm": 0.09465237706899643, + "learning_rate": 1.0214985163869378e-06, + "loss": 0.0556, + "step": 28600 + }, + { + "epoch": 7.956750762406432, + "grad_norm": 0.10842736065387726, + "learning_rate": 9.952720815252397e-07, + "loss": 0.0543, + "step": 28700 + }, + { + "epoch": 7.984474632658719, + "grad_norm": 0.09609558433294296, + "learning_rate": 9.693494564864648e-07, + "loss": 0.0554, + "step": 28800 + }, + { + "epoch": 8.012198502911007, + "grad_norm": 0.10819283127784729, + "learning_rate": 9.437326078332099e-07, + "loss": 0.0545, + "step": 28900 + }, + { + "epoch": 8.039922373163293, + "grad_norm": 0.09054001420736313, + "learning_rate": 9.18423478931016e-07, + "loss": 0.0554, + "step": 29000 + }, + { + "epoch": 8.039922373163293, + "eval_valid_loss": 0.04819526523351669, + "eval_valid_runtime": 6.4165, + "eval_valid_samples_per_second": 215.536, + "eval_valid_steps_per_second": 6.857, + "step": 29000 + }, + { + "epoch": 8.039922373163293, + "eval_valid_target_loss": 0.05275378376245499, + "eval_valid_target_runtime": 6.5635, + "eval_valid_target_samples_per_second": 218.482, + "eval_valid_target_steps_per_second": 6.856, + "step": 29000 + }, + { + "epoch": 8.067646243415581, + "grad_norm": 0.10373499244451523, + "learning_rate": 8.934239898009517e-07, + "loss": 0.0552, + "step": 29100 + }, + { + "epoch": 8.095370113667869, + "grad_norm": 0.09614498168230057, + "learning_rate": 8.687360369739473e-07, + "loss": 0.0545, + "step": 29200 + }, + { + "epoch": 8.123093983920155, + "grad_norm": 0.1014479324221611, + "learning_rate": 8.443614933469208e-07, + "loss": 0.0549, + "step": 29300 + }, + { + "epoch": 8.150817854172443, + "grad_norm": 0.08971751481294632, + "learning_rate": 8.203022080406952e-07, + "loss": 0.0546, + "step": 29400 + }, + { + "epoch": 8.17854172442473, + "grad_norm": 0.09659924358129501, + "learning_rate": 7.965600062597184e-07, + "loss": 0.0542, + "step": 29500 + }, + { + "epoch": 8.17854172442473, + "eval_valid_loss": 0.04812739044427872, + "eval_valid_runtime": 6.4674, + "eval_valid_samples_per_second": 213.843, + "eval_valid_steps_per_second": 6.803, + "step": 29500 + }, + { + "epoch": 8.17854172442473, + "eval_valid_target_loss": 0.05264822766184807, + "eval_valid_target_runtime": 6.5912, + "eval_valid_target_samples_per_second": 217.563, + "eval_valid_target_steps_per_second": 6.827, + "step": 29500 + }, + { + "epoch": 8.206265594677017, + "grad_norm": 0.1034499853849411, + "learning_rate": 7.731366891535969e-07, + "loss": 0.0548, + "step": 29600 + }, + { + "epoch": 8.233989464929303, + "grad_norm": 0.0934043675661087, + "learning_rate": 7.500340336804607e-07, + "loss": 0.0542, + "step": 29700 + }, + { + "epoch": 8.261713335181591, + "grad_norm": 0.09693789482116699, + "learning_rate": 7.272537924721467e-07, + "loss": 0.0553, + "step": 29800 + }, + { + "epoch": 8.28943720543388, + "grad_norm": 0.09552415460348129, + "learning_rate": 7.047976937012568e-07, + "loss": 0.0543, + "step": 29900 + }, + { + "epoch": 8.317161075686165, + "grad_norm": 0.0978178158402443, + "learning_rate": 6.826674409500389e-07, + "loss": 0.0548, + "step": 30000 + }, + { + "epoch": 8.317161075686165, + "eval_valid_loss": 0.04797354340553284, + "eval_valid_runtime": 6.442, + "eval_valid_samples_per_second": 214.683, + "eval_valid_steps_per_second": 6.83, + "step": 30000 + }, + { + "epoch": 8.317161075686165, + "eval_valid_target_loss": 0.052511684596538544, + "eval_valid_target_runtime": 6.5615, + "eval_valid_target_samples_per_second": 218.549, + "eval_valid_target_steps_per_second": 6.858, + "step": 30000 + }, + { + "epoch": 8.344884945938453, + "grad_norm": 0.09591928869485855, + "learning_rate": 6.608647130811502e-07, + "loss": 0.0543, + "step": 30100 + }, + { + "epoch": 8.37260881619074, + "grad_norm": 0.09678730368614197, + "learning_rate": 6.393911641103051e-07, + "loss": 0.0542, + "step": 30200 + }, + { + "epoch": 8.400332686443027, + "grad_norm": 0.10894029587507248, + "learning_rate": 6.182484230807845e-07, + "loss": 0.0542, + "step": 30300 + }, + { + "epoch": 8.428056556695315, + "grad_norm": 0.10065341740846634, + "learning_rate": 5.974380939398555e-07, + "loss": 0.0549, + "step": 30400 + }, + { + "epoch": 8.455780426947602, + "grad_norm": 0.11015477776527405, + "learning_rate": 5.769617554170959e-07, + "loss": 0.0544, + "step": 30500 + }, + { + "epoch": 8.455780426947602, + "eval_valid_loss": 0.04785359278321266, + "eval_valid_runtime": 6.4159, + "eval_valid_samples_per_second": 215.558, + "eval_valid_steps_per_second": 6.858, + "step": 30500 + }, + { + "epoch": 8.455780426947602, + "eval_valid_target_loss": 0.05238433927297592, + "eval_valid_target_runtime": 6.575, + "eval_valid_target_samples_per_second": 218.1, + "eval_valid_target_steps_per_second": 6.844, + "step": 30500 + }, + { + "epoch": 8.48350429719989, + "grad_norm": 0.10229642689228058, + "learning_rate": 5.568209609046238e-07, + "loss": 0.0542, + "step": 30600 + }, + { + "epoch": 8.511228167452176, + "grad_norm": 0.1019807681441307, + "learning_rate": 5.370172383392514e-07, + "loss": 0.0548, + "step": 30700 + }, + { + "epoch": 8.538952037704464, + "grad_norm": 0.1037830114364624, + "learning_rate": 5.175520900865754e-07, + "loss": 0.0538, + "step": 30800 + }, + { + "epoch": 8.56667590795675, + "grad_norm": 0.0952112227678299, + "learning_rate": 4.984269928270002e-07, + "loss": 0.0537, + "step": 30900 + }, + { + "epoch": 8.594399778209038, + "grad_norm": 0.09642232209444046, + "learning_rate": 4.796433974437148e-07, + "loss": 0.0533, + "step": 31000 + }, + { + "epoch": 8.594399778209038, + "eval_valid_loss": 0.04777803644537926, + "eval_valid_runtime": 6.4399, + "eval_valid_samples_per_second": 214.756, + "eval_valid_steps_per_second": 6.832, + "step": 31000 + }, + { + "epoch": 8.594399778209038, + "eval_valid_target_loss": 0.052354373037815094, + "eval_valid_target_runtime": 6.5668, + "eval_valid_target_samples_per_second": 218.371, + "eval_valid_target_steps_per_second": 6.853, + "step": 31000 + }, + { + "epoch": 8.622123648461326, + "grad_norm": 0.10211507230997086, + "learning_rate": 4.6120272891262365e-07, + "loss": 0.0544, + "step": 31100 + }, + { + "epoch": 8.649847518713612, + "grad_norm": 0.0912129357457161, + "learning_rate": 4.4310638619424363e-07, + "loss": 0.0536, + "step": 31200 + }, + { + "epoch": 8.6775713889659, + "grad_norm": 0.10558176785707474, + "learning_rate": 4.2535574212757667e-07, + "loss": 0.0542, + "step": 31300 + }, + { + "epoch": 8.705295259218186, + "grad_norm": 0.10381397604942322, + "learning_rate": 4.0795214332596145e-07, + "loss": 0.0547, + "step": 31400 + }, + { + "epoch": 8.733019129470474, + "grad_norm": 0.09383094310760498, + "learning_rate": 3.908969100749121e-07, + "loss": 0.055, + "step": 31500 + }, + { + "epoch": 8.733019129470474, + "eval_valid_loss": 0.047727905213832855, + "eval_valid_runtime": 6.4171, + "eval_valid_samples_per_second": 215.518, + "eval_valid_steps_per_second": 6.857, + "step": 31500 + }, + { + "epoch": 8.733019129470474, + "eval_valid_target_loss": 0.05224745720624924, + "eval_valid_target_runtime": 6.5727, + "eval_valid_target_samples_per_second": 218.174, + "eval_valid_target_steps_per_second": 6.846, + "step": 31500 + }, + { + "epoch": 8.760742999722762, + "grad_norm": 0.10438426584005356, + "learning_rate": 3.7419133623196825e-07, + "loss": 0.0541, + "step": 31600 + }, + { + "epoch": 8.788466869975048, + "grad_norm": 0.09324101358652115, + "learning_rate": 3.5783668912852453e-07, + "loss": 0.0537, + "step": 31700 + }, + { + "epoch": 8.816190740227336, + "grad_norm": 0.09235464036464691, + "learning_rate": 3.4183420947369873e-07, + "loss": 0.0544, + "step": 31800 + }, + { + "epoch": 8.843914610479622, + "grad_norm": 0.09870747476816177, + "learning_rate": 3.261851112602055e-07, + "loss": 0.0543, + "step": 31900 + }, + { + "epoch": 8.87163848073191, + "grad_norm": 0.10918495059013367, + "learning_rate": 3.108905816722546e-07, + "loss": 0.054, + "step": 32000 + }, + { + "epoch": 8.87163848073191, + "eval_valid_loss": 0.047707512974739075, + "eval_valid_runtime": 6.4362, + "eval_valid_samples_per_second": 214.879, + "eval_valid_steps_per_second": 6.836, + "step": 32000 + }, + { + "epoch": 8.87163848073191, + "eval_valid_target_loss": 0.05221306532621384, + "eval_valid_target_runtime": 6.5779, + "eval_valid_target_samples_per_second": 218.002, + "eval_valid_target_steps_per_second": 6.841, + "step": 32000 + }, + { + "epoch": 8.899362350984198, + "grad_norm": 0.09537260234355927, + "learning_rate": 2.9595178099549315e-07, + "loss": 0.054, + "step": 32100 + }, + { + "epoch": 8.927086221236484, + "grad_norm": 0.09188380092382431, + "learning_rate": 2.8136984252898515e-07, + "loss": 0.0542, + "step": 32200 + }, + { + "epoch": 8.954810091488772, + "grad_norm": 0.09919969737529755, + "learning_rate": 2.671458724992254e-07, + "loss": 0.0542, + "step": 32300 + }, + { + "epoch": 8.982533961741058, + "grad_norm": 0.09692647308111191, + "learning_rate": 2.532809499762312e-07, + "loss": 0.0544, + "step": 32400 + }, + { + "epoch": 9.010257831993346, + "grad_norm": 0.09277132153511047, + "learning_rate": 2.397761267916726e-07, + "loss": 0.0539, + "step": 32500 + }, + { + "epoch": 9.010257831993346, + "eval_valid_loss": 0.047637518495321274, + "eval_valid_runtime": 6.4471, + "eval_valid_samples_per_second": 214.516, + "eval_valid_steps_per_second": 6.825, + "step": 32500 + }, + { + "epoch": 9.010257831993346, + "eval_valid_target_loss": 0.052208978682756424, + "eval_valid_target_runtime": 6.5636, + "eval_valid_target_samples_per_second": 218.477, + "eval_valid_target_steps_per_second": 6.856, + "step": 32500 + }, + { + "epoch": 9.037981702245634, + "grad_norm": 0.09585940837860107, + "learning_rate": 2.2663242745908087e-07, + "loss": 0.0542, + "step": 32600 + }, + { + "epoch": 9.06570557249792, + "grad_norm": 0.09488432109355927, + "learning_rate": 2.138508490961244e-07, + "loss": 0.0533, + "step": 32700 + }, + { + "epoch": 9.093429442750208, + "grad_norm": 0.09499957412481308, + "learning_rate": 2.014323613489666e-07, + "loss": 0.0543, + "step": 32800 + }, + { + "epoch": 9.121153313002495, + "grad_norm": 0.09435317665338516, + "learning_rate": 1.8937790631870345e-07, + "loss": 0.0536, + "step": 32900 + }, + { + "epoch": 9.148877183254783, + "grad_norm": 0.10342779755592346, + "learning_rate": 1.7768839848989584e-07, + "loss": 0.0539, + "step": 33000 + }, + { + "epoch": 9.148877183254783, + "eval_valid_loss": 0.047598063945770264, + "eval_valid_runtime": 6.4315, + "eval_valid_samples_per_second": 215.037, + "eval_valid_steps_per_second": 6.841, + "step": 33000 + }, + { + "epoch": 9.148877183254783, + "eval_valid_target_loss": 0.05212317034602165, + "eval_valid_target_runtime": 6.5736, + "eval_valid_target_samples_per_second": 218.146, + "eval_valid_target_steps_per_second": 6.846, + "step": 33000 + }, + { + "epoch": 9.176601053507069, + "grad_norm": 0.09814909845590591, + "learning_rate": 1.6636472466118992e-07, + "loss": 0.0542, + "step": 33100 + }, + { + "epoch": 9.204324923759357, + "grad_norm": 0.09484022855758667, + "learning_rate": 1.5540774387804825e-07, + "loss": 0.0544, + "step": 33200 + }, + { + "epoch": 9.232048794011645, + "grad_norm": 0.07888332009315491, + "learning_rate": 1.448182873675752e-07, + "loss": 0.0539, + "step": 33300 + }, + { + "epoch": 9.25977266426393, + "grad_norm": 0.0964021384716034, + "learning_rate": 1.345971584754585e-07, + "loss": 0.0539, + "step": 33400 + }, + { + "epoch": 9.287496534516219, + "grad_norm": 0.10322096943855286, + "learning_rate": 1.2474513260502695e-07, + "loss": 0.0536, + "step": 33500 + }, + { + "epoch": 9.287496534516219, + "eval_valid_loss": 0.047564879059791565, + "eval_valid_runtime": 6.4358, + "eval_valid_samples_per_second": 214.89, + "eval_valid_steps_per_second": 6.837, + "step": 33500 + }, + { + "epoch": 9.287496534516219, + "eval_valid_target_loss": 0.05209695175290108, + "eval_valid_target_runtime": 6.5809, + "eval_valid_target_samples_per_second": 217.904, + "eval_valid_target_steps_per_second": 6.838, + "step": 33500 + }, + { + "epoch": 9.315220404768505, + "grad_norm": 0.10957927256822586, + "learning_rate": 1.1526295715842628e-07, + "loss": 0.0541, + "step": 33600 + }, + { + "epoch": 9.342944275020793, + "grad_norm": 0.09433583915233612, + "learning_rate": 1.0615135147991562e-07, + "loss": 0.0542, + "step": 33700 + }, + { + "epoch": 9.370668145273081, + "grad_norm": 0.09703412652015686, + "learning_rate": 9.741100680130122e-08, + "loss": 0.0535, + "step": 33800 + }, + { + "epoch": 9.398392015525367, + "grad_norm": 0.10180799663066864, + "learning_rate": 8.904258618949335e-08, + "loss": 0.054, + "step": 33900 + }, + { + "epoch": 9.426115885777655, + "grad_norm": 0.09336613118648529, + "learning_rate": 8.104672449620598e-08, + "loss": 0.0532, + "step": 34000 + }, + { + "epoch": 9.426115885777655, + "eval_valid_loss": 0.047556404024362564, + "eval_valid_runtime": 6.42, + "eval_valid_samples_per_second": 215.421, + "eval_valid_steps_per_second": 6.854, + "step": 34000 + }, + { + "epoch": 9.426115885777655, + "eval_valid_target_loss": 0.05208129063248634, + "eval_valid_target_runtime": 6.595, + "eval_valid_target_samples_per_second": 217.437, + "eval_valid_target_steps_per_second": 6.823, + "step": 34000 + }, + { + "epoch": 9.453839756029941, + "grad_norm": 0.0890408605337143, + "learning_rate": 7.342402830979589e-08, + "loss": 0.054, + "step": 34100 + }, + { + "epoch": 9.48156362628223, + "grad_norm": 0.09568461775779724, + "learning_rate": 6.617507590924332e-08, + "loss": 0.0535, + "step": 34200 + }, + { + "epoch": 9.509287496534515, + "grad_norm": 0.09256019443273544, + "learning_rate": 5.930041722028379e-08, + "loss": 0.054, + "step": 34300 + }, + { + "epoch": 9.537011366786803, + "grad_norm": 0.09314898401498795, + "learning_rate": 5.280057377368863e-08, + "loss": 0.0535, + "step": 34400 + }, + { + "epoch": 9.564735237039091, + "grad_norm": 0.10256827622652054, + "learning_rate": 4.667603866569892e-08, + "loss": 0.0537, + "step": 34500 + }, + { + "epoch": 9.564735237039091, + "eval_valid_loss": 0.047560639679431915, + "eval_valid_runtime": 6.4632, + "eval_valid_samples_per_second": 213.979, + "eval_valid_steps_per_second": 6.808, + "step": 34500 + }, + { + "epoch": 9.564735237039091, + "eval_valid_target_loss": 0.05206665024161339, + "eval_valid_target_runtime": 6.5886, + "eval_valid_target_samples_per_second": 217.649, + "eval_valid_target_steps_per_second": 6.83, + "step": 34500 + }, + { + "epoch": 9.592459107291377, + "grad_norm": 0.0861942321062088, + "learning_rate": 4.092727652062034e-08, + "loss": 0.0537, + "step": 34600 + }, + { + "epoch": 9.620182977543665, + "grad_norm": 0.09521106630563736, + "learning_rate": 3.555472345557365e-08, + "loss": 0.0535, + "step": 34700 + }, + { + "epoch": 9.647906847795952, + "grad_norm": 0.10885845869779587, + "learning_rate": 3.055878704741e-08, + "loss": 0.0542, + "step": 34800 + }, + { + "epoch": 9.67563071804824, + "grad_norm": 0.09145703911781311, + "learning_rate": 2.5939846301791804e-08, + "loss": 0.0541, + "step": 34900 + }, + { + "epoch": 9.703354588300527, + "grad_norm": 0.09051796793937683, + "learning_rate": 2.1698251624438503e-08, + "loss": 0.0544, + "step": 35000 + }, + { + "epoch": 9.703354588300527, + "eval_valid_loss": 0.04752533510327339, + "eval_valid_runtime": 6.4168, + "eval_valid_samples_per_second": 215.528, + "eval_valid_steps_per_second": 6.857, + "step": 35000 + }, + { + "epoch": 9.703354588300527, + "eval_valid_target_loss": 0.05207618325948715, + "eval_valid_target_runtime": 6.57, + "eval_valid_target_samples_per_second": 218.265, + "eval_valid_target_steps_per_second": 6.849, + "step": 35000 + }, + { + "epoch": 9.731078458552814, + "grad_norm": 0.0903056338429451, + "learning_rate": 1.7834324794546164e-08, + "loss": 0.0539, + "step": 35100 + }, + { + "epoch": 9.758802328805102, + "grad_norm": 0.0897304117679596, + "learning_rate": 1.434835894037423e-08, + "loss": 0.0539, + "step": 35200 + }, + { + "epoch": 9.786526199057388, + "grad_norm": 0.10058806836605072, + "learning_rate": 1.1240618517009416e-08, + "loss": 0.0542, + "step": 35300 + }, + { + "epoch": 9.814250069309676, + "grad_norm": 0.1056876927614212, + "learning_rate": 8.511339286303432e-09, + "loss": 0.0537, + "step": 35400 + }, + { + "epoch": 9.841973939561964, + "grad_norm": 0.08990786969661713, + "learning_rate": 6.1607282989856184e-09, + "loss": 0.0547, + "step": 35500 + }, + { + "epoch": 9.841973939561964, + "eval_valid_loss": 0.047528158873319626, + "eval_valid_runtime": 6.4412, + "eval_valid_samples_per_second": 214.712, + "eval_valid_steps_per_second": 6.831, + "step": 35500 + }, + { + "epoch": 9.841973939561964, + "eval_valid_target_loss": 0.05206017941236496, + "eval_valid_target_runtime": 6.5864, + "eval_valid_target_samples_per_second": 217.72, + "eval_valid_target_steps_per_second": 6.832, + "step": 35500 + }, + { + "epoch": 9.86969780981425, + "grad_norm": 0.08090436458587646, + "learning_rate": 4.188963878958841e-09, + "loss": 0.0536, + "step": 35600 + }, + { + "epoch": 9.897421680066538, + "grad_norm": 0.08319131284952164, + "learning_rate": 2.5961956097669827e-09, + "loss": 0.0541, + "step": 35700 + }, + { + "epoch": 9.925145550318824, + "grad_norm": 0.10666873306035995, + "learning_rate": 1.3825443232517999e-09, + "loss": 0.0541, + "step": 35800 + }, + { + "epoch": 9.952869420571112, + "grad_norm": 0.10748881101608276, + "learning_rate": 5.48102090381919e-10, + "loss": 0.0543, + "step": 35900 + }, + { + "epoch": 9.9805932908234, + "grad_norm": 0.10198221355676651, + "learning_rate": 9.293221427231214e-11, + "loss": 0.0533, + "step": 36000 + }, + { + "epoch": 9.9805932908234, + "eval_valid_loss": 0.04753027856349945, + "eval_valid_runtime": 6.4518, + "eval_valid_samples_per_second": 214.359, + "eval_valid_steps_per_second": 6.82, + "step": 36000 + }, + { + "epoch": 9.9805932908234, + "eval_valid_target_loss": 0.05205439031124115, + "eval_valid_target_runtime": 6.5698, + "eval_valid_target_samples_per_second": 218.272, + "eval_valid_target_steps_per_second": 6.85, + "step": 36000 + } + ], + "logging_steps": 100, + "max_steps": 36070, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.429394066302619e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}