diff --git "a/checkpoints/20260401_111253/train.log" "b/checkpoints/20260401_111253/train.log" new file mode 100644--- /dev/null +++ "b/checkpoints/20260401_111253/train.log" @@ -0,0 +1,8207 @@ +[setup] device=cuda:0 tokenized_dir=data/tokenized_full/data ddp=False world_size=1 +[data] token-cache train=miss val=miss test=miss +/root/smiles_decoding/training/train_autoregressive.py:513: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead. + scaler = torch.cuda.amp.GradScaler(enabled=use_grad_scaler) +[epoch 1/50] step=20 train_loss=2.6453 tok_s=27737.9 opt_steps=20 +[epoch 1/50] step=40 train_loss=2.1733 tok_s=37962.9 opt_steps=40 +[epoch 1/50] step=60 train_loss=1.9649 tok_s=43307.1 opt_steps=60 +[epoch 1/50] step=80 train_loss=1.8307 tok_s=46536.4 opt_steps=80 +[epoch 1/50] step=100 train_loss=1.7302 tok_s=48681.0 opt_steps=100 +[epoch 1/50] step=120 train_loss=1.6525 tok_s=50294.6 opt_steps=120 +[epoch 1/50] step=140 train_loss=1.5818 tok_s=51487.7 opt_steps=140 +[epoch 1/50] step=160 train_loss=1.5227 tok_s=52373.9 opt_steps=160 +[epoch 1/50] step=180 train_loss=1.4685 tok_s=53087.3 opt_steps=180 +[epoch 1/50] step=200 train_loss=1.4216 tok_s=53736.1 opt_steps=200 +[epoch 1/50] step=220 train_loss=1.3789 tok_s=54256.0 opt_steps=220 +[epoch 1/50] step=240 train_loss=1.3396 tok_s=54690.5 opt_steps=240 +[epoch 1/50] step=260 train_loss=1.3037 tok_s=55091.1 opt_steps=260 +[epoch 1/50] step=280 train_loss=1.2711 tok_s=55426.8 opt_steps=280 +[epoch 1/50] step=300 train_loss=1.2417 tok_s=55703.5 opt_steps=300 +[epoch 1/50] step=320 train_loss=1.2138 tok_s=55969.8 opt_steps=320 +[epoch 1/50] step=340 train_loss=1.1885 tok_s=56193.4 opt_steps=340 +[epoch 1/50] step=360 train_loss=1.1644 tok_s=56381.3 opt_steps=360 +[epoch 1/50] step=380 train_loss=1.1421 tok_s=56556.9 opt_steps=380 +[epoch 1/50] step=400 train_loss=1.1216 tok_s=56724.4 opt_steps=400 +[epoch 1/50] step=420 train_loss=1.1023 tok_s=56894.4 opt_steps=420 +[epoch 1/50] step=440 train_loss=1.0839 tok_s=57047.2 opt_steps=440 +[epoch 1/50] step=460 train_loss=1.0668 tok_s=57162.1 opt_steps=460 +[epoch 1/50] step=480 train_loss=1.0513 tok_s=57275.7 opt_steps=480 +[epoch 1/50] step=500 train_loss=1.0363 tok_s=57402.3 opt_steps=500 +[epoch 1/50] step=520 train_loss=1.0220 tok_s=57519.0 opt_steps=520 +[epoch 1/50] step=540 train_loss=1.0082 tok_s=57607.4 opt_steps=540 +[epoch 1/50] step=560 train_loss=0.9954 tok_s=57700.8 opt_steps=560 +[epoch 1/50] step=580 train_loss=0.9828 tok_s=57766.7 opt_steps=580 +[epoch 1/50] step=600 train_loss=0.9711 tok_s=57858.0 opt_steps=600 +[epoch 1/50] step=620 train_loss=0.9600 tok_s=57939.8 opt_steps=620 +[epoch 1/50] step=640 train_loss=0.9491 tok_s=57992.4 opt_steps=640 +[epoch 1/50] step=660 train_loss=0.9388 tok_s=58068.4 opt_steps=660 +[epoch 1/50] step=680 train_loss=0.9287 tok_s=58134.5 opt_steps=680 +[epoch 1/50] step=700 train_loss=0.9190 tok_s=58195.2 opt_steps=700 +[epoch 1/50] step=720 train_loss=0.9096 tok_s=58249.4 opt_steps=720 +[epoch 1/50] step=740 train_loss=0.9006 tok_s=58312.7 opt_steps=740 +[epoch 1/50] step=760 train_loss=0.8918 tok_s=58358.8 opt_steps=760 +[epoch 1/50] step=780 train_loss=0.8834 tok_s=58395.8 opt_steps=780 +[epoch 1/50] step=800 train_loss=0.8752 tok_s=58433.4 opt_steps=800 +[epoch 1/50] step=820 train_loss=0.8672 tok_s=58480.4 opt_steps=820 +[epoch 1/50] step=840 train_loss=0.8594 tok_s=58503.2 opt_steps=840 +[epoch 1/50] step=860 train_loss=0.8518 tok_s=58539.8 opt_steps=860 +[epoch 1/50] step=880 train_loss=0.8445 tok_s=58574.0 opt_steps=880 +[epoch 1/50] step=900 train_loss=0.8374 tok_s=58605.2 opt_steps=900 +[epoch 1/50] step=920 train_loss=0.8305 tok_s=58633.2 opt_steps=920 +[epoch 1/50] step=940 train_loss=0.8237 tok_s=58683.7 opt_steps=940 +[epoch 1/50] step=960 train_loss=0.8171 tok_s=58712.2 opt_steps=960 +[epoch 1/50] step=980 train_loss=0.8107 tok_s=58744.0 opt_steps=980 +[epoch 1/50] step=1000 train_loss=0.8044 tok_s=58768.5 opt_steps=1000 +[epoch 1/50] step=1020 train_loss=0.7986 tok_s=58793.4 opt_steps=1020 +[epoch 1/50] step=1040 train_loss=0.7928 tok_s=58823.9 opt_steps=1040 +[epoch 1/50] step=1060 train_loss=0.7871 tok_s=58851.8 opt_steps=1060 +[epoch 1/50] step=1080 train_loss=0.7815 tok_s=58883.8 opt_steps=1080 +[epoch 1/50] step=1100 train_loss=0.7761 tok_s=58899.9 opt_steps=1100 +[epoch 1/50] step=1120 train_loss=0.7708 tok_s=58924.3 opt_steps=1120 +[epoch 1/50] step=1140 train_loss=0.7656 tok_s=58959.5 opt_steps=1140 +[epoch 1/50] step=1160 train_loss=0.7605 tok_s=58977.8 opt_steps=1160 +[epoch 1/50] step=1180 train_loss=0.7555 tok_s=58992.0 opt_steps=1180 +[epoch 1/50] step=1200 train_loss=0.7507 tok_s=59009.6 opt_steps=1200 +[epoch 1/50] step=1220 train_loss=0.7459 tok_s=59025.9 opt_steps=1220 +[epoch 1/50] step=1240 train_loss=0.7413 tok_s=59037.6 opt_steps=1240 +[epoch 1/50] step=1260 train_loss=0.7367 tok_s=59049.3 opt_steps=1260 +[epoch 1/50] step=1280 train_loss=0.7323 tok_s=59073.4 opt_steps=1280 +[epoch 1/50] step=1300 train_loss=0.7279 tok_s=59078.7 opt_steps=1300 +[epoch 1/50] step=1320 train_loss=0.7237 tok_s=59088.7 opt_steps=1320 +[epoch 1/50] step=1340 train_loss=0.7195 tok_s=59099.9 opt_steps=1340 +[epoch 1/50] step=1360 train_loss=0.7154 tok_s=59116.6 opt_steps=1360 +[epoch 1/50] step=1380 train_loss=0.7114 tok_s=59133.1 opt_steps=1380 +[epoch 1/50] step=1400 train_loss=0.7075 tok_s=59146.3 opt_steps=1400 +[epoch 1/50] step=1420 train_loss=0.7037 tok_s=59162.1 opt_steps=1420 +[epoch 1/50] step=1440 train_loss=0.6999 tok_s=59175.6 opt_steps=1440 +[epoch 1/50] step=1460 train_loss=0.6961 tok_s=59193.1 opt_steps=1460 +[epoch 1/50] step=1480 train_loss=0.6924 tok_s=59209.3 opt_steps=1480 +[epoch 1/50] step=1500 train_loss=0.6888 tok_s=59227.4 opt_steps=1500 +[epoch 1/50] step=1520 train_loss=0.6852 tok_s=59242.3 opt_steps=1520 +[epoch 1/50] step=1540 train_loss=0.6818 tok_s=59255.0 opt_steps=1540 +[epoch 1/50] step=1560 train_loss=0.6783 tok_s=59271.9 opt_steps=1560 +[epoch 1/50] step=1580 train_loss=0.6750 tok_s=59287.5 opt_steps=1580 +[epoch 1/50] step=1600 train_loss=0.6717 tok_s=59305.5 opt_steps=1600 +[epoch 1/50] step=1620 train_loss=0.6684 tok_s=59317.5 opt_steps=1620 +[epoch 1/50] step=1640 train_loss=0.6651 tok_s=59325.9 opt_steps=1640 +[epoch 1/50] step=1660 train_loss=0.6619 tok_s=59333.5 opt_steps=1660 +[epoch 1/50] step=1680 train_loss=0.6588 tok_s=59339.7 opt_steps=1680 +[epoch 1/50] step=1700 train_loss=0.6558 tok_s=59342.7 opt_steps=1700 +[epoch 1/50] step=1720 train_loss=0.6528 tok_s=59359.1 opt_steps=1720 +[epoch 1/50] step=1740 train_loss=0.6499 tok_s=59368.4 opt_steps=1740 +[epoch 1/50] step=1760 train_loss=0.6470 tok_s=59376.9 opt_steps=1760 +[epoch 1/50] step=1780 train_loss=0.6441 tok_s=59392.0 opt_steps=1780 +[epoch 1/50] step=1800 train_loss=0.6413 tok_s=59397.0 opt_steps=1800 +[epoch 1/50] step=1820 train_loss=0.6385 tok_s=59411.1 opt_steps=1820 +[epoch 1/50] step=1840 train_loss=0.6358 tok_s=59418.9 opt_steps=1840 +[epoch 1/50] step=1860 train_loss=0.6330 tok_s=59429.8 opt_steps=1860 +[epoch 1/50] step=1880 train_loss=0.6304 tok_s=59436.8 opt_steps=1880 +[epoch 1/50] step=1900 train_loss=0.6278 tok_s=59444.7 opt_steps=1900 +[epoch 1/50] step=1920 train_loss=0.6252 tok_s=59449.2 opt_steps=1920 +[epoch 1/50] step=1940 train_loss=0.6226 tok_s=59459.8 opt_steps=1940 +[epoch 1/50] step=1960 train_loss=0.6201 tok_s=59467.2 opt_steps=1960 +[epoch 1/50] step=1980 train_loss=0.6176 tok_s=59477.6 opt_steps=1980 +[epoch 1/50] step=2000 train_loss=0.6151 tok_s=59486.0 opt_steps=2000 +[epoch 1/50] step=2020 train_loss=0.6126 tok_s=59487.9 opt_steps=2020 +[epoch 1/50] step=2040 train_loss=0.6102 tok_s=59495.9 opt_steps=2040 +[epoch 1/50] step=2060 train_loss=0.6078 tok_s=59502.8 opt_steps=2060 +[epoch 1/50] step=2080 train_loss=0.6054 tok_s=59502.3 opt_steps=2080 +[epoch 1/50] step=2100 train_loss=0.6032 tok_s=59511.1 opt_steps=2100 +[epoch 1/50] step=2120 train_loss=0.6009 tok_s=59514.5 opt_steps=2120 +[epoch 1/50] step=2140 train_loss=0.5986 tok_s=59519.3 opt_steps=2140 +[epoch 1/50] step=2160 train_loss=0.5964 tok_s=59521.5 opt_steps=2160 +[epoch 1/50] step=2180 train_loss=0.5942 tok_s=59528.3 opt_steps=2180 +[epoch 1/50] step=2200 train_loss=0.5920 tok_s=59531.4 opt_steps=2200 +[epoch 1/50] step=2220 train_loss=0.5899 tok_s=59534.6 opt_steps=2220 +[epoch 1/50] step=2240 train_loss=0.5878 tok_s=59536.4 opt_steps=2240 +[epoch 1/50] step=2260 train_loss=0.5857 tok_s=59541.2 opt_steps=2260 +[epoch 1/50] step=2280 train_loss=0.5836 tok_s=59542.4 opt_steps=2280 +[epoch 1/50] step=2300 train_loss=0.5815 tok_s=59545.1 opt_steps=2300 +[epoch 1/50] step=2320 train_loss=0.5796 tok_s=59552.8 opt_steps=2320 +[epoch 1/50] step=2340 train_loss=0.5776 tok_s=59557.5 opt_steps=2340 +[epoch 1/50] step=2360 train_loss=0.5756 tok_s=59560.2 opt_steps=2360 +[epoch 1/50] step=2380 train_loss=0.5737 tok_s=59565.2 opt_steps=2380 +[epoch 1/50] step=2400 train_loss=0.5718 tok_s=59568.5 opt_steps=2400 +[epoch 1/50] step=2420 train_loss=0.5699 tok_s=59572.8 opt_steps=2420 +[epoch 1/50] step=2440 train_loss=0.5680 tok_s=59577.8 opt_steps=2440 +[epoch 1/50] step=2460 train_loss=0.5661 tok_s=59580.8 opt_steps=2460 +[epoch 1/50] step=2480 train_loss=0.5643 tok_s=59586.7 opt_steps=2480 +[epoch 1/50] step=2500 train_loss=0.5625 tok_s=59591.8 opt_steps=2500 +[epoch 1/50] step=2520 train_loss=0.5607 tok_s=59596.5 opt_steps=2520 +[epoch 1/50] step=2540 train_loss=0.5589 tok_s=59602.2 opt_steps=2540 +[epoch 1/50] step=2560 train_loss=0.5572 tok_s=59604.7 opt_steps=2560 +[epoch 1/50] step=2580 train_loss=0.5554 tok_s=59608.2 opt_steps=2580 +[epoch 1/50] step=2600 train_loss=0.5536 tok_s=59615.2 opt_steps=2600 +[epoch 1/50] step=2620 train_loss=0.5519 tok_s=59625.1 opt_steps=2620 +[epoch 1/50] step=2640 train_loss=0.5502 tok_s=59630.5 opt_steps=2640 +[epoch 1/50] step=2660 train_loss=0.5486 tok_s=59634.0 opt_steps=2660 +[epoch 1/50] step=2680 train_loss=0.5469 tok_s=59638.3 opt_steps=2680 +[epoch 1/50] step=2700 train_loss=0.5453 tok_s=59642.8 opt_steps=2700 +[epoch 1/50] step=2720 train_loss=0.5437 tok_s=59648.8 opt_steps=2720 +[epoch 1/50] step=2740 train_loss=0.5420 tok_s=59656.5 opt_steps=2740 +[epoch 1/50] step=2760 train_loss=0.5405 tok_s=59664.2 opt_steps=2760 +[epoch 1/50] step=2780 train_loss=0.5389 tok_s=59671.5 opt_steps=2780 +[epoch 1/50] step=2800 train_loss=0.5373 tok_s=59678.0 opt_steps=2800 +[epoch 1/50] step=2820 train_loss=0.5358 tok_s=59679.2 opt_steps=2820 +[epoch 1/50] step=2840 train_loss=0.5343 tok_s=59684.2 opt_steps=2840 +[epoch 1/50] step=2860 train_loss=0.5328 tok_s=59690.2 opt_steps=2860 +[epoch 1/50] step=2880 train_loss=0.5313 tok_s=59694.4 opt_steps=2880 +[epoch 1/50] step=2900 train_loss=0.5298 tok_s=59697.5 opt_steps=2900 +[epoch 1/50] step=2920 train_loss=0.5283 tok_s=59699.1 opt_steps=2920 +[epoch 1/50] step=2940 train_loss=0.5268 tok_s=59702.4 opt_steps=2940 +[epoch 1/50] step=2960 train_loss=0.5254 tok_s=59706.0 opt_steps=2960 +[epoch 1/50] step=2980 train_loss=0.5239 tok_s=59708.7 opt_steps=2980 +[epoch 1/50] step=3000 train_loss=0.5225 tok_s=59714.6 opt_steps=3000 +[epoch 1/50] step=3020 train_loss=0.5211 tok_s=59716.2 opt_steps=3020 +[epoch 1/50] step=3040 train_loss=0.5198 tok_s=59719.3 opt_steps=3040 +[epoch 1/50] step=3060 train_loss=0.5184 tok_s=59725.0 opt_steps=3060 +[epoch 1/50] step=3080 train_loss=0.5170 tok_s=59730.7 opt_steps=3080 +[epoch 1/50] step=3100 train_loss=0.5157 tok_s=59733.0 opt_steps=3100 +[epoch 1/50] step=3120 train_loss=0.5143 tok_s=59734.5 opt_steps=3120 +[epoch 1/50] step=3140 train_loss=0.5130 tok_s=59734.7 opt_steps=3140 +[epoch 1/50] step=3160 train_loss=0.5117 tok_s=59738.4 opt_steps=3160 +[epoch 1/50] step=3180 train_loss=0.5104 tok_s=59740.4 opt_steps=3180 +[epoch 1/50] step=3200 train_loss=0.5091 tok_s=59742.8 opt_steps=3200 +[epoch 1/50] step=3220 train_loss=0.5078 tok_s=59741.7 opt_steps=3220 +[epoch 1/50] step=3240 train_loss=0.5065 tok_s=59745.9 opt_steps=3240 +[epoch 1/50] step=3260 train_loss=0.5053 tok_s=59744.9 opt_steps=3260 +[epoch 1/50] train_loss=0.5045 val_skipped tok_s=59747.4 opt_steps=3273 +[epoch 2/50] step=20 train_loss=0.2885 tok_s=58114.3 opt_steps=20 +[epoch 2/50] step=40 train_loss=0.2865 tok_s=59144.5 opt_steps=40 +[epoch 2/50] step=60 train_loss=0.2875 tok_s=59616.8 opt_steps=60 +[epoch 2/50] step=80 train_loss=0.2871 tok_s=59831.8 opt_steps=80 +[epoch 2/50] step=100 train_loss=0.2861 tok_s=59886.8 opt_steps=100 +[epoch 2/50] step=120 train_loss=0.2861 tok_s=60004.9 opt_steps=120 +[epoch 2/50] step=140 train_loss=0.2856 tok_s=60020.4 opt_steps=140 +[epoch 2/50] step=160 train_loss=0.2855 tok_s=60118.6 opt_steps=160 +[epoch 2/50] step=180 train_loss=0.2852 tok_s=60158.3 opt_steps=180 +[epoch 2/50] step=200 train_loss=0.2854 tok_s=60126.3 opt_steps=200 +[epoch 2/50] step=220 train_loss=0.2850 tok_s=60129.6 opt_steps=220 +[epoch 2/50] step=240 train_loss=0.2846 tok_s=60131.1 opt_steps=240 +[epoch 2/50] step=260 train_loss=0.2845 tok_s=60095.4 opt_steps=260 +[epoch 2/50] step=280 train_loss=0.2842 tok_s=60135.2 opt_steps=280 +[epoch 2/50] step=300 train_loss=0.2841 tok_s=60155.1 opt_steps=300 +[epoch 2/50] step=320 train_loss=0.2839 tok_s=60170.0 opt_steps=320 +[epoch 2/50] step=340 train_loss=0.2838 tok_s=60206.3 opt_steps=340 +[epoch 2/50] step=360 train_loss=0.2836 tok_s=60179.4 opt_steps=360 +[epoch 2/50] step=380 train_loss=0.2834 tok_s=60158.2 opt_steps=380 +[epoch 2/50] step=400 train_loss=0.2832 tok_s=60172.1 opt_steps=400 +[epoch 2/50] step=420 train_loss=0.2830 tok_s=60180.3 opt_steps=420 +[epoch 2/50] step=440 train_loss=0.2829 tok_s=60191.0 opt_steps=440 +[epoch 2/50] step=460 train_loss=0.2826 tok_s=60188.7 opt_steps=460 +[epoch 2/50] step=480 train_loss=0.2825 tok_s=60169.3 opt_steps=480 +[epoch 2/50] step=500 train_loss=0.2823 tok_s=60169.8 opt_steps=500 +[epoch 2/50] step=520 train_loss=0.2819 tok_s=60156.6 opt_steps=520 +[epoch 2/50] step=540 train_loss=0.2815 tok_s=60161.2 opt_steps=540 +[epoch 2/50] step=560 train_loss=0.2813 tok_s=60158.9 opt_steps=560 +[epoch 2/50] step=580 train_loss=0.2811 tok_s=60153.4 opt_steps=580 +[epoch 2/50] step=600 train_loss=0.2809 tok_s=60175.9 opt_steps=600 +[epoch 2/50] step=620 train_loss=0.2806 tok_s=60171.7 opt_steps=620 +[epoch 2/50] step=640 train_loss=0.2804 tok_s=60178.0 opt_steps=640 +[epoch 2/50] step=660 train_loss=0.2802 tok_s=60183.0 opt_steps=660 +[epoch 2/50] step=680 train_loss=0.2800 tok_s=60201.9 opt_steps=680 +[epoch 2/50] step=700 train_loss=0.2797 tok_s=60206.2 opt_steps=700 +[epoch 2/50] step=720 train_loss=0.2794 tok_s=60199.9 opt_steps=720 +[epoch 2/50] step=740 train_loss=0.2791 tok_s=60207.5 opt_steps=740 +[epoch 2/50] step=760 train_loss=0.2787 tok_s=60210.3 opt_steps=760 +[epoch 2/50] step=780 train_loss=0.2785 tok_s=60212.5 opt_steps=780 +[epoch 2/50] step=800 train_loss=0.2782 tok_s=60201.8 opt_steps=800 +[epoch 2/50] step=820 train_loss=0.2779 tok_s=60197.4 opt_steps=820 +[epoch 2/50] step=840 train_loss=0.2775 tok_s=60190.4 opt_steps=840 +[epoch 2/50] step=860 train_loss=0.2772 tok_s=60182.6 opt_steps=860 +[epoch 2/50] step=880 train_loss=0.2769 tok_s=60171.1 opt_steps=880 +[epoch 2/50] step=900 train_loss=0.2767 tok_s=60161.8 opt_steps=900 +[epoch 2/50] step=920 train_loss=0.2765 tok_s=60172.6 opt_steps=920 +[epoch 2/50] step=940 train_loss=0.2763 tok_s=60165.4 opt_steps=940 +[epoch 2/50] step=960 train_loss=0.2759 tok_s=60164.6 opt_steps=960 +[epoch 2/50] step=980 train_loss=0.2756 tok_s=60111.2 opt_steps=980 +[epoch 2/50] step=1000 train_loss=0.2753 tok_s=60106.3 opt_steps=1000 +[epoch 2/50] step=1020 train_loss=0.2751 tok_s=60101.7 opt_steps=1020 +[epoch 2/50] step=1040 train_loss=0.2748 tok_s=60095.3 opt_steps=1040 +[epoch 2/50] step=1060 train_loss=0.2746 tok_s=60093.9 opt_steps=1060 +[epoch 2/50] step=1080 train_loss=0.2742 tok_s=60088.9 opt_steps=1080 +[epoch 2/50] step=1100 train_loss=0.2740 tok_s=60095.7 opt_steps=1100 +[epoch 2/50] step=1120 train_loss=0.2737 tok_s=60096.8 opt_steps=1120 +[epoch 2/50] step=1140 train_loss=0.2734 tok_s=60099.8 opt_steps=1140 +[epoch 2/50] step=1160 train_loss=0.2732 tok_s=60106.0 opt_steps=1160 +[epoch 2/50] step=1180 train_loss=0.2730 tok_s=60109.1 opt_steps=1180 +[epoch 2/50] step=1200 train_loss=0.2728 tok_s=60111.7 opt_steps=1200 +[epoch 2/50] step=1220 train_loss=0.2726 tok_s=60116.8 opt_steps=1220 +[epoch 2/50] step=1240 train_loss=0.2723 tok_s=60116.9 opt_steps=1240 +[epoch 2/50] step=1260 train_loss=0.2720 tok_s=60114.5 opt_steps=1260 +[epoch 2/50] step=1280 train_loss=0.2717 tok_s=60109.5 opt_steps=1280 +[epoch 2/50] step=1300 train_loss=0.2715 tok_s=60114.5 opt_steps=1300 +[epoch 2/50] step=1320 train_loss=0.2713 tok_s=60120.9 opt_steps=1320 +[epoch 2/50] step=1340 train_loss=0.2710 tok_s=60121.4 opt_steps=1340 +[epoch 2/50] step=1360 train_loss=0.2707 tok_s=60123.6 opt_steps=1360 +[epoch 2/50] step=1380 train_loss=0.2705 tok_s=60116.9 opt_steps=1380 +[epoch 2/50] step=1400 train_loss=0.2702 tok_s=60123.9 opt_steps=1400 +[epoch 2/50] step=1420 train_loss=0.2700 tok_s=60123.1 opt_steps=1420 +[epoch 2/50] step=1440 train_loss=0.2697 tok_s=60122.6 opt_steps=1440 +[epoch 2/50] step=1460 train_loss=0.2695 tok_s=60120.8 opt_steps=1460 +[epoch 2/50] step=1480 train_loss=0.2693 tok_s=60126.1 opt_steps=1480 +[epoch 2/50] step=1500 train_loss=0.2690 tok_s=60126.8 opt_steps=1500 +[epoch 2/50] step=1520 train_loss=0.2688 tok_s=60125.0 opt_steps=1520 +[epoch 2/50] step=1540 train_loss=0.2685 tok_s=60128.7 opt_steps=1540 +[epoch 2/50] step=1560 train_loss=0.2683 tok_s=60129.9 opt_steps=1560 +[epoch 2/50] step=1580 train_loss=0.2681 tok_s=60131.9 opt_steps=1580 +[epoch 2/50] step=1600 train_loss=0.2678 tok_s=60131.6 opt_steps=1600 +[epoch 2/50] step=1620 train_loss=0.2676 tok_s=60135.0 opt_steps=1620 +[epoch 2/50] step=1640 train_loss=0.2673 tok_s=60137.9 opt_steps=1640 +[epoch 2/50] step=1660 train_loss=0.2671 tok_s=60137.2 opt_steps=1660 +[epoch 2/50] step=1680 train_loss=0.2668 tok_s=60131.0 opt_steps=1680 +[epoch 2/50] step=1700 train_loss=0.2665 tok_s=60133.2 opt_steps=1700 +[epoch 2/50] step=1720 train_loss=0.2663 tok_s=60136.9 opt_steps=1720 +[epoch 2/50] step=1740 train_loss=0.2660 tok_s=60133.1 opt_steps=1740 +[epoch 2/50] step=1760 train_loss=0.2657 tok_s=60131.2 opt_steps=1760 +[epoch 2/50] step=1780 train_loss=0.2655 tok_s=60133.7 opt_steps=1780 +[epoch 2/50] step=1800 train_loss=0.2652 tok_s=60136.8 opt_steps=1800 +[epoch 2/50] step=1820 train_loss=0.2650 tok_s=60137.2 opt_steps=1820 +[epoch 2/50] step=1840 train_loss=0.2647 tok_s=60138.4 opt_steps=1840 +[epoch 2/50] step=1860 train_loss=0.2645 tok_s=60134.8 opt_steps=1860 +[epoch 2/50] step=1880 train_loss=0.2642 tok_s=60135.8 opt_steps=1880 +[epoch 2/50] step=1900 train_loss=0.2640 tok_s=60136.1 opt_steps=1900 +[epoch 2/50] step=1920 train_loss=0.2637 tok_s=60137.5 opt_steps=1920 +[epoch 2/50] step=1940 train_loss=0.2635 tok_s=60137.4 opt_steps=1940 +[epoch 2/50] step=1960 train_loss=0.2633 tok_s=60136.2 opt_steps=1960 +[epoch 2/50] step=1980 train_loss=0.2630 tok_s=60134.7 opt_steps=1980 +[epoch 2/50] step=2000 train_loss=0.2628 tok_s=60134.0 opt_steps=2000 +[epoch 2/50] step=2020 train_loss=0.2625 tok_s=60137.4 opt_steps=2020 +[epoch 2/50] step=2040 train_loss=0.2623 tok_s=60139.4 opt_steps=2040 +[epoch 2/50] step=2060 train_loss=0.2621 tok_s=60138.0 opt_steps=2060 +[epoch 2/50] step=2080 train_loss=0.2618 tok_s=60134.7 opt_steps=2080 +[epoch 2/50] step=2100 train_loss=0.2616 tok_s=60131.3 opt_steps=2100 +[epoch 2/50] step=2120 train_loss=0.2613 tok_s=60127.3 opt_steps=2120 +[epoch 2/50] step=2140 train_loss=0.2611 tok_s=60126.9 opt_steps=2140 +[epoch 2/50] step=2160 train_loss=0.2609 tok_s=60127.1 opt_steps=2160 +[epoch 2/50] step=2180 train_loss=0.2606 tok_s=60125.3 opt_steps=2180 +[epoch 2/50] step=2200 train_loss=0.2604 tok_s=60125.9 opt_steps=2200 +[epoch 2/50] step=2220 train_loss=0.2602 tok_s=60126.7 opt_steps=2220 +[epoch 2/50] step=2240 train_loss=0.2599 tok_s=60130.5 opt_steps=2240 +[epoch 2/50] step=2260 train_loss=0.2597 tok_s=60128.9 opt_steps=2260 +[epoch 2/50] step=2280 train_loss=0.2594 tok_s=60129.7 opt_steps=2280 +[epoch 2/50] step=2300 train_loss=0.2592 tok_s=60125.8 opt_steps=2300 +[epoch 2/50] step=2320 train_loss=0.2590 tok_s=60122.9 opt_steps=2320 +[epoch 2/50] step=2340 train_loss=0.2588 tok_s=60128.0 opt_steps=2340 +[epoch 2/50] step=2360 train_loss=0.2585 tok_s=60129.8 opt_steps=2360 +[epoch 2/50] step=2380 train_loss=0.2583 tok_s=60131.4 opt_steps=2380 +[epoch 2/50] step=2400 train_loss=0.2581 tok_s=60128.3 opt_steps=2400 +[epoch 2/50] step=2420 train_loss=0.2578 tok_s=60130.0 opt_steps=2420 +[epoch 2/50] step=2440 train_loss=0.2576 tok_s=60134.5 opt_steps=2440 +[epoch 2/50] step=2460 train_loss=0.2574 tok_s=60137.4 opt_steps=2460 +[epoch 2/50] step=2480 train_loss=0.2571 tok_s=60135.4 opt_steps=2480 +[epoch 2/50] step=2500 train_loss=0.2569 tok_s=60136.5 opt_steps=2500 +[epoch 2/50] step=2520 train_loss=0.2567 tok_s=60135.0 opt_steps=2520 +[epoch 2/50] step=2540 train_loss=0.2565 tok_s=60130.6 opt_steps=2540 +[epoch 2/50] step=2560 train_loss=0.2563 tok_s=60137.4 opt_steps=2560 +[epoch 2/50] step=2580 train_loss=0.2561 tok_s=60136.5 opt_steps=2580 +[epoch 2/50] step=2600 train_loss=0.2559 tok_s=60136.2 opt_steps=2600 +[epoch 2/50] step=2620 train_loss=0.2557 tok_s=60133.5 opt_steps=2620 +[epoch 2/50] step=2640 train_loss=0.2555 tok_s=60133.3 opt_steps=2640 +[epoch 2/50] step=2660 train_loss=0.2553 tok_s=60133.4 opt_steps=2660 +[epoch 2/50] step=2680 train_loss=0.2550 tok_s=60136.4 opt_steps=2680 +[epoch 2/50] step=2700 train_loss=0.2548 tok_s=60137.1 opt_steps=2700 +[epoch 2/50] step=2720 train_loss=0.2546 tok_s=60137.4 opt_steps=2720 +[epoch 2/50] step=2740 train_loss=0.2544 tok_s=60136.4 opt_steps=2740 +[epoch 2/50] step=2760 train_loss=0.2541 tok_s=60136.8 opt_steps=2760 +[epoch 2/50] step=2780 train_loss=0.2539 tok_s=60133.1 opt_steps=2780 +[epoch 2/50] step=2800 train_loss=0.2537 tok_s=60135.3 opt_steps=2800 +[epoch 2/50] step=2820 train_loss=0.2535 tok_s=60136.6 opt_steps=2820 +[epoch 2/50] step=2840 train_loss=0.2533 tok_s=60136.0 opt_steps=2840 +[epoch 2/50] step=2860 train_loss=0.2530 tok_s=60136.5 opt_steps=2860 +[epoch 2/50] step=2880 train_loss=0.2528 tok_s=60132.2 opt_steps=2880 +[epoch 2/50] step=2900 train_loss=0.2526 tok_s=60131.5 opt_steps=2900 +[epoch 2/50] step=2920 train_loss=0.2524 tok_s=60133.1 opt_steps=2920 +[epoch 2/50] step=2940 train_loss=0.2522 tok_s=60131.6 opt_steps=2940 +[epoch 2/50] step=2960 train_loss=0.2520 tok_s=60131.2 opt_steps=2960 +[epoch 2/50] step=2980 train_loss=0.2518 tok_s=60131.4 opt_steps=2980 +[epoch 2/50] step=3000 train_loss=0.2516 tok_s=60130.9 opt_steps=3000 +[epoch 2/50] step=3020 train_loss=0.2514 tok_s=60132.4 opt_steps=3020 +[epoch 2/50] step=3040 train_loss=0.2512 tok_s=60132.4 opt_steps=3040 +[epoch 2/50] step=3060 train_loss=0.2510 tok_s=60133.5 opt_steps=3060 +[epoch 2/50] step=3080 train_loss=0.2508 tok_s=60135.6 opt_steps=3080 +[epoch 2/50] step=3100 train_loss=0.2505 tok_s=60133.1 opt_steps=3100 +[epoch 2/50] step=3120 train_loss=0.2503 tok_s=60132.9 opt_steps=3120 +[epoch 2/50] step=3140 train_loss=0.2501 tok_s=60134.7 opt_steps=3140 +[epoch 2/50] step=3160 train_loss=0.2499 tok_s=60133.5 opt_steps=3160 +[epoch 2/50] step=3180 train_loss=0.2497 tok_s=60136.0 opt_steps=3180 +[epoch 2/50] step=3200 train_loss=0.2495 tok_s=60137.0 opt_steps=3200 +[epoch 2/50] step=3220 train_loss=0.2493 tok_s=60139.9 opt_steps=3220 +[epoch 2/50] step=3240 train_loss=0.2491 tok_s=60139.8 opt_steps=3240 +[epoch 2/50] step=3260 train_loss=0.2489 tok_s=60132.6 opt_steps=3260 +[epoch 2/50] train_loss=0.2488 val_skipped tok_s=60131.4 opt_steps=3273 +[epoch 3/50] step=20 train_loss=0.1985 tok_s=58341.2 opt_steps=20 +[epoch 3/50] step=40 train_loss=0.1993 tok_s=59104.0 opt_steps=40 +[epoch 3/50] step=60 train_loss=0.1982 tok_s=59117.0 opt_steps=60 +[epoch 3/50] step=80 train_loss=0.1977 tok_s=59513.9 opt_steps=80 +[epoch 3/50] step=100 train_loss=0.1975 tok_s=59739.5 opt_steps=100 +[epoch 3/50] step=120 train_loss=0.1975 tok_s=59794.1 opt_steps=120 +[epoch 3/50] step=140 train_loss=0.1972 tok_s=59854.9 opt_steps=140 +[epoch 3/50] step=160 train_loss=0.1976 tok_s=59969.1 opt_steps=160 +[epoch 3/50] step=180 train_loss=0.1975 tok_s=59997.1 opt_steps=180 +[epoch 3/50] step=200 train_loss=0.1975 tok_s=60051.9 opt_steps=200 +[epoch 3/50] step=220 train_loss=0.1975 tok_s=60067.6 opt_steps=220 +[epoch 3/50] step=240 train_loss=0.1975 tok_s=60055.8 opt_steps=240 +[epoch 3/50] step=260 train_loss=0.1971 tok_s=60062.8 opt_steps=260 +[epoch 3/50] step=280 train_loss=0.1972 tok_s=60035.2 opt_steps=280 +[epoch 3/50] step=300 train_loss=0.1971 tok_s=60025.3 opt_steps=300 +[epoch 3/50] step=320 train_loss=0.1971 tok_s=60038.5 opt_steps=320 +[epoch 3/50] step=340 train_loss=0.1970 tok_s=60065.7 opt_steps=340 +[epoch 3/50] step=360 train_loss=0.1971 tok_s=60088.2 opt_steps=360 +[epoch 3/50] step=380 train_loss=0.1970 tok_s=60119.7 opt_steps=380 +[epoch 3/50] step=400 train_loss=0.1968 tok_s=60125.7 opt_steps=400 +[epoch 3/50] step=420 train_loss=0.1968 tok_s=60102.0 opt_steps=420 +[epoch 3/50] step=440 train_loss=0.1968 tok_s=60113.2 opt_steps=440 +[epoch 3/50] step=460 train_loss=0.1968 tok_s=60105.7 opt_steps=460 +[epoch 3/50] step=480 train_loss=0.1968 tok_s=60107.3 opt_steps=480 +[epoch 3/50] step=500 train_loss=0.1968 tok_s=60115.8 opt_steps=500 +[epoch 3/50] step=520 train_loss=0.1969 tok_s=60114.1 opt_steps=520 +[epoch 3/50] step=540 train_loss=0.1969 tok_s=60102.2 opt_steps=540 +[epoch 3/50] step=560 train_loss=0.1969 tok_s=60091.4 opt_steps=560 +[epoch 3/50] step=580 train_loss=0.1969 tok_s=60101.0 opt_steps=580 +[epoch 3/50] step=600 train_loss=0.1969 tok_s=60101.0 opt_steps=600 +[epoch 3/50] step=620 train_loss=0.1968 tok_s=60089.1 opt_steps=620 +[epoch 3/50] step=640 train_loss=0.1968 tok_s=60085.3 opt_steps=640 +[epoch 3/50] step=660 train_loss=0.1968 tok_s=60082.5 opt_steps=660 +[epoch 3/50] step=680 train_loss=0.1966 tok_s=60088.4 opt_steps=680 +[epoch 3/50] step=700 train_loss=0.1965 tok_s=60089.3 opt_steps=700 +[epoch 3/50] step=720 train_loss=0.1965 tok_s=60096.0 opt_steps=720 +[epoch 3/50] step=740 train_loss=0.1964 tok_s=60099.3 opt_steps=740 +[epoch 3/50] step=760 train_loss=0.1964 tok_s=60097.6 opt_steps=760 +[epoch 3/50] step=780 train_loss=0.1964 tok_s=60087.8 opt_steps=780 +[epoch 3/50] step=800 train_loss=0.1963 tok_s=60101.8 opt_steps=800 +[epoch 3/50] step=820 train_loss=0.1962 tok_s=60113.1 opt_steps=820 +[epoch 3/50] step=840 train_loss=0.1962 tok_s=60117.3 opt_steps=840 +[epoch 3/50] step=860 train_loss=0.1960 tok_s=60131.1 opt_steps=860 +[epoch 3/50] step=880 train_loss=0.1960 tok_s=60137.4 opt_steps=880 +[epoch 3/50] step=900 train_loss=0.1959 tok_s=60135.0 opt_steps=900 +[epoch 3/50] step=920 train_loss=0.1958 tok_s=60129.2 opt_steps=920 +[epoch 3/50] step=940 train_loss=0.1957 tok_s=60128.9 opt_steps=940 +[epoch 3/50] step=960 train_loss=0.1957 tok_s=60137.1 opt_steps=960 +[epoch 3/50] step=980 train_loss=0.1956 tok_s=60127.2 opt_steps=980 +[epoch 3/50] step=1000 train_loss=0.1956 tok_s=60126.6 opt_steps=1000 +[epoch 3/50] step=1020 train_loss=0.1955 tok_s=60136.6 opt_steps=1020 +[epoch 3/50] step=1040 train_loss=0.1954 tok_s=60127.7 opt_steps=1040 +[epoch 3/50] step=1060 train_loss=0.1953 tok_s=60117.6 opt_steps=1060 +[epoch 3/50] step=1080 train_loss=0.1953 tok_s=60113.0 opt_steps=1080 +[epoch 3/50] step=1100 train_loss=0.1952 tok_s=60097.9 opt_steps=1100 +[epoch 3/50] step=1120 train_loss=0.1951 tok_s=60093.4 opt_steps=1120 +[epoch 3/50] step=1140 train_loss=0.1949 tok_s=60086.8 opt_steps=1140 +[epoch 3/50] step=1160 train_loss=0.1949 tok_s=60082.7 opt_steps=1160 +[epoch 3/50] step=1180 train_loss=0.1948 tok_s=60078.6 opt_steps=1180 +[epoch 3/50] step=1200 train_loss=0.1947 tok_s=60079.5 opt_steps=1200 +[epoch 3/50] step=1220 train_loss=0.1947 tok_s=60090.8 opt_steps=1220 +[epoch 3/50] step=1240 train_loss=0.1946 tok_s=60095.4 opt_steps=1240 +[epoch 3/50] step=1260 train_loss=0.1945 tok_s=60093.4 opt_steps=1260 +[epoch 3/50] step=1280 train_loss=0.1945 tok_s=60097.4 opt_steps=1280 +[epoch 3/50] step=1300 train_loss=0.1944 tok_s=60102.0 opt_steps=1300 +[epoch 3/50] step=1320 train_loss=0.1943 tok_s=60097.4 opt_steps=1320 +[epoch 3/50] step=1340 train_loss=0.1942 tok_s=60092.3 opt_steps=1340 +[epoch 3/50] step=1360 train_loss=0.1942 tok_s=60086.4 opt_steps=1360 +[epoch 3/50] step=1380 train_loss=0.1941 tok_s=60088.1 opt_steps=1380 +[epoch 3/50] step=1400 train_loss=0.1940 tok_s=60098.5 opt_steps=1400 +[epoch 3/50] step=1420 train_loss=0.1939 tok_s=60100.9 opt_steps=1420 +[epoch 3/50] step=1440 train_loss=0.1938 tok_s=60108.2 opt_steps=1440 +[epoch 3/50] step=1460 train_loss=0.1938 tok_s=60117.5 opt_steps=1460 +[epoch 3/50] step=1480 train_loss=0.1936 tok_s=60118.7 opt_steps=1480 +[epoch 3/50] step=1500 train_loss=0.1936 tok_s=60117.7 opt_steps=1500 +[epoch 3/50] step=1520 train_loss=0.1935 tok_s=60123.2 opt_steps=1520 +[epoch 3/50] step=1540 train_loss=0.1934 tok_s=60129.1 opt_steps=1540 +[epoch 3/50] step=1560 train_loss=0.1933 tok_s=60133.7 opt_steps=1560 +[epoch 3/50] step=1580 train_loss=0.1932 tok_s=60135.2 opt_steps=1580 +[epoch 3/50] step=1600 train_loss=0.1932 tok_s=60139.6 opt_steps=1600 +[epoch 3/50] step=1620 train_loss=0.1931 tok_s=60139.2 opt_steps=1620 +[epoch 3/50] step=1640 train_loss=0.1930 tok_s=60145.5 opt_steps=1640 +[epoch 3/50] step=1660 train_loss=0.1928 tok_s=60142.1 opt_steps=1660 +[epoch 3/50] step=1680 train_loss=0.1927 tok_s=60138.4 opt_steps=1680 +[epoch 3/50] step=1700 train_loss=0.1927 tok_s=60142.9 opt_steps=1700 +[epoch 3/50] step=1720 train_loss=0.1926 tok_s=60145.2 opt_steps=1720 +[epoch 3/50] step=1740 train_loss=0.1925 tok_s=60145.8 opt_steps=1740 +[epoch 3/50] step=1760 train_loss=0.1924 tok_s=60155.3 opt_steps=1760 +[epoch 3/50] step=1780 train_loss=0.1923 tok_s=60152.7 opt_steps=1780 +[epoch 3/50] step=1800 train_loss=0.1922 tok_s=60153.2 opt_steps=1800 +[epoch 3/50] step=1820 train_loss=0.1921 tok_s=60153.2 opt_steps=1820 +[epoch 3/50] step=1840 train_loss=0.1920 tok_s=60149.4 opt_steps=1840 +[epoch 3/50] step=1860 train_loss=0.1918 tok_s=60144.1 opt_steps=1860 +[epoch 3/50] step=1880 train_loss=0.1918 tok_s=60144.8 opt_steps=1880 +[epoch 3/50] step=1900 train_loss=0.1917 tok_s=60144.3 opt_steps=1900 +[epoch 3/50] step=1920 train_loss=0.1916 tok_s=60145.1 opt_steps=1920 +[epoch 3/50] step=1940 train_loss=0.1915 tok_s=60148.6 opt_steps=1940 +[epoch 3/50] step=1960 train_loss=0.1914 tok_s=60151.2 opt_steps=1960 +[epoch 3/50] step=1980 train_loss=0.1913 tok_s=60151.5 opt_steps=1980 +[epoch 3/50] step=2000 train_loss=0.1912 tok_s=60147.0 opt_steps=2000 +[epoch 3/50] step=2020 train_loss=0.1911 tok_s=60141.8 opt_steps=2020 +[epoch 3/50] step=2040 train_loss=0.1910 tok_s=60141.2 opt_steps=2040 +[epoch 3/50] step=2060 train_loss=0.1909 tok_s=60137.8 opt_steps=2060 +[epoch 3/50] step=2080 train_loss=0.1909 tok_s=60138.2 opt_steps=2080 +[epoch 3/50] step=2100 train_loss=0.1908 tok_s=60133.2 opt_steps=2100 +[epoch 3/50] step=2120 train_loss=0.1907 tok_s=60126.4 opt_steps=2120 +[epoch 3/50] step=2140 train_loss=0.1907 tok_s=60126.6 opt_steps=2140 +[epoch 3/50] step=2160 train_loss=0.1906 tok_s=60127.7 opt_steps=2160 +[epoch 3/50] step=2180 train_loss=0.1905 tok_s=60130.1 opt_steps=2180 +[epoch 3/50] step=2200 train_loss=0.1904 tok_s=60133.3 opt_steps=2200 +[epoch 3/50] step=2220 train_loss=0.1903 tok_s=60134.7 opt_steps=2220 +[epoch 3/50] step=2240 train_loss=0.1902 tok_s=60135.0 opt_steps=2240 +[epoch 3/50] step=2260 train_loss=0.1901 tok_s=60139.0 opt_steps=2260 +[epoch 3/50] step=2280 train_loss=0.1900 tok_s=60138.9 opt_steps=2280 +[epoch 3/50] step=2300 train_loss=0.1899 tok_s=60138.8 opt_steps=2300 +[epoch 3/50] step=2320 train_loss=0.1898 tok_s=60137.0 opt_steps=2320 +[epoch 3/50] step=2340 train_loss=0.1897 tok_s=60139.1 opt_steps=2340 +[epoch 3/50] step=2360 train_loss=0.1897 tok_s=60137.5 opt_steps=2360 +[epoch 3/50] step=2380 train_loss=0.1896 tok_s=60135.9 opt_steps=2380 +[epoch 3/50] step=2400 train_loss=0.1895 tok_s=60132.8 opt_steps=2400 +[epoch 3/50] step=2420 train_loss=0.1894 tok_s=60132.7 opt_steps=2420 +[epoch 3/50] step=2440 train_loss=0.1894 tok_s=60133.7 opt_steps=2440 +[epoch 3/50] step=2460 train_loss=0.1893 tok_s=60137.4 opt_steps=2460 +[epoch 3/50] step=2480 train_loss=0.1892 tok_s=60138.8 opt_steps=2480 +[epoch 3/50] step=2500 train_loss=0.1891 tok_s=60140.8 opt_steps=2500 +[epoch 3/50] step=2520 train_loss=0.1890 tok_s=60139.0 opt_steps=2520 +[epoch 3/50] step=2540 train_loss=0.1889 tok_s=60132.0 opt_steps=2540 +[epoch 3/50] step=2560 train_loss=0.1888 tok_s=60136.4 opt_steps=2560 +[epoch 3/50] step=2580 train_loss=0.1888 tok_s=60139.9 opt_steps=2580 +[epoch 3/50] step=2600 train_loss=0.1887 tok_s=60135.7 opt_steps=2600 +[epoch 3/50] step=2620 train_loss=0.1886 tok_s=60133.3 opt_steps=2620 +[epoch 3/50] step=2640 train_loss=0.1885 tok_s=60131.5 opt_steps=2640 +[epoch 3/50] step=2660 train_loss=0.1884 tok_s=60127.8 opt_steps=2660 +[epoch 3/50] step=2680 train_loss=0.1884 tok_s=60123.9 opt_steps=2680 +[epoch 3/50] step=2700 train_loss=0.1882 tok_s=60123.4 opt_steps=2700 +[epoch 3/50] step=2720 train_loss=0.1882 tok_s=60121.5 opt_steps=2720 +[epoch 3/50] step=2740 train_loss=0.1881 tok_s=60121.4 opt_steps=2740 +[epoch 3/50] step=2760 train_loss=0.1880 tok_s=60120.7 opt_steps=2760 +[epoch 3/50] step=2780 train_loss=0.1879 tok_s=60121.3 opt_steps=2780 +[epoch 3/50] step=2800 train_loss=0.1878 tok_s=60119.2 opt_steps=2800 +[epoch 3/50] step=2820 train_loss=0.1877 tok_s=60120.6 opt_steps=2820 +[epoch 3/50] step=2840 train_loss=0.1877 tok_s=60117.8 opt_steps=2840 +[epoch 3/50] step=2860 train_loss=0.1876 tok_s=60117.5 opt_steps=2860 +[epoch 3/50] step=2880 train_loss=0.1875 tok_s=60120.6 opt_steps=2880 +[epoch 3/50] step=2900 train_loss=0.1874 tok_s=60119.7 opt_steps=2900 +[epoch 3/50] step=2920 train_loss=0.1873 tok_s=60120.6 opt_steps=2920 +[epoch 3/50] step=2940 train_loss=0.1872 tok_s=60119.4 opt_steps=2940 +[epoch 3/50] step=2960 train_loss=0.1871 tok_s=60118.6 opt_steps=2960 +[epoch 3/50] step=2980 train_loss=0.1871 tok_s=60121.6 opt_steps=2980 +[epoch 3/50] step=3000 train_loss=0.1870 tok_s=60124.5 opt_steps=3000 +[epoch 3/50] step=3020 train_loss=0.1869 tok_s=60125.6 opt_steps=3020 +[epoch 3/50] step=3040 train_loss=0.1867 tok_s=60128.0 opt_steps=3040 +[epoch 3/50] step=3060 train_loss=0.1867 tok_s=60127.8 opt_steps=3060 +[epoch 3/50] step=3080 train_loss=0.1866 tok_s=60128.7 opt_steps=3080 +[epoch 3/50] step=3100 train_loss=0.1865 tok_s=60122.2 opt_steps=3100 +[epoch 3/50] step=3120 train_loss=0.1864 tok_s=60121.3 opt_steps=3120 +[epoch 3/50] step=3140 train_loss=0.1863 tok_s=60120.2 opt_steps=3140 +[epoch 3/50] step=3160 train_loss=0.1862 tok_s=60121.8 opt_steps=3160 +[epoch 3/50] step=3180 train_loss=0.1861 tok_s=60120.9 opt_steps=3180 +[epoch 3/50] step=3200 train_loss=0.1860 tok_s=60117.2 opt_steps=3200 +[epoch 3/50] step=3220 train_loss=0.1859 tok_s=60116.7 opt_steps=3220 +[epoch 3/50] step=3240 train_loss=0.1858 tok_s=60116.4 opt_steps=3240 +[epoch 3/50] step=3260 train_loss=0.1857 tok_s=60109.4 opt_steps=3260 +[epoch 3/50] train_loss=0.1857 val_skipped tok_s=60110.1 opt_steps=3273 +[epoch 4/50] step=20 train_loss=0.1523 tok_s=58173.0 opt_steps=20 +[epoch 4/50] step=40 train_loss=0.1527 tok_s=59090.9 opt_steps=40 +[epoch 4/50] step=60 train_loss=0.1521 tok_s=59605.8 opt_steps=60 +[epoch 4/50] step=80 train_loss=0.1523 tok_s=59721.9 opt_steps=80 +[epoch 4/50] step=100 train_loss=0.1518 tok_s=59800.1 opt_steps=100 +[epoch 4/50] step=120 train_loss=0.1518 tok_s=59850.4 opt_steps=120 +[epoch 4/50] step=140 train_loss=0.1520 tok_s=59910.1 opt_steps=140 +[epoch 4/50] step=160 train_loss=0.1519 tok_s=59949.3 opt_steps=160 +[epoch 4/50] step=180 train_loss=0.1520 tok_s=59945.0 opt_steps=180 +[epoch 4/50] step=200 train_loss=0.1522 tok_s=59964.8 opt_steps=200 +[epoch 4/50] step=220 train_loss=0.1525 tok_s=59980.3 opt_steps=220 +[epoch 4/50] step=240 train_loss=0.1526 tok_s=59951.1 opt_steps=240 +[epoch 4/50] step=260 train_loss=0.1530 tok_s=59959.3 opt_steps=260 +[epoch 4/50] step=280 train_loss=0.1532 tok_s=60034.0 opt_steps=280 +[epoch 4/50] step=300 train_loss=0.1531 tok_s=60041.8 opt_steps=300 +[epoch 4/50] step=320 train_loss=0.1532 tok_s=60061.8 opt_steps=320 +[epoch 4/50] step=340 train_loss=0.1533 tok_s=60065.9 opt_steps=340 +[epoch 4/50] step=360 train_loss=0.1534 tok_s=60059.2 opt_steps=360 +[epoch 4/50] step=380 train_loss=0.1533 tok_s=60103.8 opt_steps=380 +[epoch 4/50] step=400 train_loss=0.1532 tok_s=60116.5 opt_steps=400 +[epoch 4/50] step=420 train_loss=0.1533 tok_s=60149.2 opt_steps=420 +[epoch 4/50] step=440 train_loss=0.1534 tok_s=60146.9 opt_steps=440 +[epoch 4/50] step=460 train_loss=0.1534 tok_s=60162.0 opt_steps=460 +[epoch 4/50] step=480 train_loss=0.1534 tok_s=60140.2 opt_steps=480 +[epoch 4/50] step=500 train_loss=0.1533 tok_s=60156.4 opt_steps=500 +[epoch 4/50] step=520 train_loss=0.1534 tok_s=60186.3 opt_steps=520 +[epoch 4/50] step=540 train_loss=0.1532 tok_s=60170.8 opt_steps=540 +[epoch 4/50] step=560 train_loss=0.1532 tok_s=60169.3 opt_steps=560 +[epoch 4/50] step=580 train_loss=0.1531 tok_s=60177.1 opt_steps=580 +[epoch 4/50] step=600 train_loss=0.1531 tok_s=60177.2 opt_steps=600 +[epoch 4/50] step=620 train_loss=0.1530 tok_s=60194.7 opt_steps=620 +[epoch 4/50] step=640 train_loss=0.1530 tok_s=60187.7 opt_steps=640 +[epoch 4/50] step=660 train_loss=0.1531 tok_s=60185.4 opt_steps=660 +[epoch 4/50] step=680 train_loss=0.1530 tok_s=60187.3 opt_steps=680 +[epoch 4/50] step=700 train_loss=0.1530 tok_s=60185.5 opt_steps=700 +[epoch 4/50] step=720 train_loss=0.1530 tok_s=60197.4 opt_steps=720 +[epoch 4/50] step=740 train_loss=0.1530 tok_s=60198.5 opt_steps=740 +[epoch 4/50] step=760 train_loss=0.1530 tok_s=60201.5 opt_steps=760 +[epoch 4/50] step=780 train_loss=0.1529 tok_s=60199.2 opt_steps=780 +[epoch 4/50] step=800 train_loss=0.1529 tok_s=60189.5 opt_steps=800 +[epoch 4/50] step=820 train_loss=0.1529 tok_s=60177.6 opt_steps=820 +[epoch 4/50] step=840 train_loss=0.1529 tok_s=60176.0 opt_steps=840 +[epoch 4/50] step=860 train_loss=0.1529 tok_s=60171.0 opt_steps=860 +[epoch 4/50] step=880 train_loss=0.1529 tok_s=60168.7 opt_steps=880 +[epoch 4/50] step=900 train_loss=0.1529 tok_s=60162.5 opt_steps=900 +[epoch 4/50] step=920 train_loss=0.1529 tok_s=60144.3 opt_steps=920 +[epoch 4/50] step=940 train_loss=0.1529 tok_s=60134.9 opt_steps=940 +[epoch 4/50] step=960 train_loss=0.1529 tok_s=60128.1 opt_steps=960 +[epoch 4/50] step=980 train_loss=0.1528 tok_s=60132.9 opt_steps=980 +[epoch 4/50] step=1000 train_loss=0.1528 tok_s=60136.6 opt_steps=1000 +[epoch 4/50] step=1020 train_loss=0.1528 tok_s=60131.3 opt_steps=1020 +[epoch 4/50] step=1040 train_loss=0.1528 tok_s=60123.3 opt_steps=1040 +[epoch 4/50] step=1060 train_loss=0.1528 tok_s=60129.0 opt_steps=1060 +[epoch 4/50] step=1080 train_loss=0.1527 tok_s=60132.1 opt_steps=1080 +[epoch 4/50] step=1100 train_loss=0.1527 tok_s=60132.4 opt_steps=1100 +[epoch 4/50] step=1120 train_loss=0.1527 tok_s=60135.2 opt_steps=1120 +[epoch 4/50] step=1140 train_loss=0.1527 tok_s=60134.1 opt_steps=1140 +[epoch 4/50] step=1160 train_loss=0.1526 tok_s=60135.5 opt_steps=1160 +[epoch 4/50] step=1180 train_loss=0.1526 tok_s=60138.7 opt_steps=1180 +[epoch 4/50] step=1200 train_loss=0.1525 tok_s=60144.7 opt_steps=1200 +[epoch 4/50] step=1220 train_loss=0.1525 tok_s=60145.3 opt_steps=1220 +[epoch 4/50] step=1240 train_loss=0.1525 tok_s=60155.3 opt_steps=1240 +[epoch 4/50] step=1260 train_loss=0.1525 tok_s=60163.2 opt_steps=1260 +[epoch 4/50] step=1280 train_loss=0.1524 tok_s=60163.5 opt_steps=1280 +[epoch 4/50] step=1300 train_loss=0.1524 tok_s=60165.9 opt_steps=1300 +[epoch 4/50] step=1320 train_loss=0.1523 tok_s=60162.6 opt_steps=1320 +[epoch 4/50] step=1340 train_loss=0.1523 tok_s=60159.8 opt_steps=1340 +[epoch 4/50] step=1360 train_loss=0.1523 tok_s=60158.6 opt_steps=1360 +[epoch 4/50] step=1380 train_loss=0.1523 tok_s=60163.1 opt_steps=1380 +[epoch 4/50] step=1400 train_loss=0.1523 tok_s=60153.6 opt_steps=1400 +[epoch 4/50] step=1420 train_loss=0.1522 tok_s=60156.1 opt_steps=1420 +[epoch 4/50] step=1440 train_loss=0.1522 tok_s=60159.3 opt_steps=1440 +[epoch 4/50] step=1460 train_loss=0.1522 tok_s=60163.3 opt_steps=1460 +[epoch 4/50] step=1480 train_loss=0.1521 tok_s=60162.7 opt_steps=1480 +[epoch 4/50] step=1500 train_loss=0.1521 tok_s=60153.4 opt_steps=1500 +[epoch 4/50] step=1520 train_loss=0.1521 tok_s=60148.0 opt_steps=1520 +[epoch 4/50] step=1540 train_loss=0.1520 tok_s=60143.2 opt_steps=1540 +[epoch 4/50] step=1560 train_loss=0.1520 tok_s=60140.6 opt_steps=1560 +[epoch 4/50] step=1580 train_loss=0.1520 tok_s=60141.2 opt_steps=1580 +[epoch 4/50] step=1600 train_loss=0.1520 tok_s=60141.8 opt_steps=1600 +[epoch 4/50] step=1620 train_loss=0.1520 tok_s=60146.6 opt_steps=1620 +[epoch 4/50] step=1640 train_loss=0.1519 tok_s=60147.2 opt_steps=1640 +[epoch 4/50] step=1660 train_loss=0.1519 tok_s=60147.6 opt_steps=1660 +[epoch 4/50] step=1680 train_loss=0.1518 tok_s=60142.5 opt_steps=1680 +[epoch 4/50] step=1700 train_loss=0.1518 tok_s=60145.8 opt_steps=1700 +[epoch 4/50] step=1720 train_loss=0.1518 tok_s=60145.7 opt_steps=1720 +[epoch 4/50] step=1740 train_loss=0.1518 tok_s=60145.8 opt_steps=1740 +[epoch 4/50] step=1760 train_loss=0.1517 tok_s=60146.6 opt_steps=1760 +[epoch 4/50] step=1780 train_loss=0.1517 tok_s=60147.7 opt_steps=1780 +[epoch 4/50] step=1800 train_loss=0.1517 tok_s=60143.5 opt_steps=1800 +[epoch 4/50] step=1820 train_loss=0.1516 tok_s=60141.4 opt_steps=1820 +[epoch 4/50] step=1840 train_loss=0.1516 tok_s=60146.1 opt_steps=1840 +[epoch 4/50] step=1860 train_loss=0.1515 tok_s=60151.0 opt_steps=1860 +[epoch 4/50] step=1880 train_loss=0.1515 tok_s=60147.1 opt_steps=1880 +[epoch 4/50] step=1900 train_loss=0.1515 tok_s=60148.3 opt_steps=1900 +[epoch 4/50] step=1920 train_loss=0.1514 tok_s=60148.3 opt_steps=1920 +[epoch 4/50] step=1940 train_loss=0.1514 tok_s=60150.8 opt_steps=1940 +[epoch 4/50] step=1960 train_loss=0.1514 tok_s=60152.9 opt_steps=1960 +[epoch 4/50] step=1980 train_loss=0.1514 tok_s=60157.9 opt_steps=1980 +[epoch 4/50] step=2000 train_loss=0.1514 tok_s=60157.3 opt_steps=2000 +[epoch 4/50] step=2020 train_loss=0.1514 tok_s=60154.9 opt_steps=2020 +[epoch 4/50] step=2040 train_loss=0.1514 tok_s=60159.9 opt_steps=2040 +[epoch 4/50] step=2060 train_loss=0.1513 tok_s=60165.7 opt_steps=2060 +[epoch 4/50] step=2080 train_loss=0.1513 tok_s=60166.5 opt_steps=2080 +[epoch 4/50] step=2100 train_loss=0.1513 tok_s=60165.1 opt_steps=2100 +[epoch 4/50] step=2120 train_loss=0.1512 tok_s=60158.0 opt_steps=2120 +[epoch 4/50] step=2140 train_loss=0.1512 tok_s=60150.8 opt_steps=2140 +[epoch 4/50] step=2160 train_loss=0.1511 tok_s=60150.8 opt_steps=2160 +[epoch 4/50] step=2180 train_loss=0.1511 tok_s=60150.7 opt_steps=2180 +[epoch 4/50] step=2200 train_loss=0.1510 tok_s=60147.9 opt_steps=2200 +[epoch 4/50] step=2220 train_loss=0.1510 tok_s=60147.9 opt_steps=2220 +[epoch 4/50] step=2240 train_loss=0.1510 tok_s=60132.4 opt_steps=2240 +[epoch 4/50] step=2260 train_loss=0.1509 tok_s=60122.7 opt_steps=2260 +[epoch 4/50] step=2280 train_loss=0.1509 tok_s=60112.0 opt_steps=2280 +[epoch 4/50] step=2300 train_loss=0.1508 tok_s=60098.1 opt_steps=2300 +[epoch 4/50] step=2320 train_loss=0.1508 tok_s=60088.8 opt_steps=2320 +[epoch 4/50] step=2340 train_loss=0.1508 tok_s=60082.0 opt_steps=2340 +[epoch 4/50] step=2360 train_loss=0.1508 tok_s=60080.8 opt_steps=2360 +[epoch 4/50] step=2380 train_loss=0.1507 tok_s=60079.7 opt_steps=2380 +[epoch 4/50] step=2400 train_loss=0.1507 tok_s=60078.8 opt_steps=2400 +[epoch 4/50] step=2420 train_loss=0.1506 tok_s=60077.4 opt_steps=2420 +[epoch 4/50] step=2440 train_loss=0.1506 tok_s=60079.5 opt_steps=2440 +[epoch 4/50] step=2460 train_loss=0.1505 tok_s=60082.6 opt_steps=2460 +[epoch 4/50] step=2480 train_loss=0.1505 tok_s=60078.1 opt_steps=2480 +[epoch 4/50] step=2500 train_loss=0.1504 tok_s=60078.4 opt_steps=2500 +[epoch 4/50] step=2520 train_loss=0.1504 tok_s=60074.0 opt_steps=2520 +[epoch 4/50] step=2540 train_loss=0.1504 tok_s=60074.7 opt_steps=2540 +[epoch 4/50] step=2560 train_loss=0.1504 tok_s=60078.3 opt_steps=2560 +[epoch 4/50] step=2580 train_loss=0.1504 tok_s=60080.3 opt_steps=2580 +[epoch 4/50] step=2600 train_loss=0.1503 tok_s=60078.9 opt_steps=2600 +[epoch 4/50] step=2620 train_loss=0.1502 tok_s=60078.2 opt_steps=2620 +[epoch 4/50] step=2640 train_loss=0.1502 tok_s=60078.1 opt_steps=2640 +[epoch 4/50] step=2660 train_loss=0.1502 tok_s=60080.3 opt_steps=2660 +[epoch 4/50] step=2680 train_loss=0.1501 tok_s=60080.1 opt_steps=2680 +[epoch 4/50] step=2700 train_loss=0.1501 tok_s=60082.4 opt_steps=2700 +[epoch 4/50] step=2720 train_loss=0.1500 tok_s=60084.6 opt_steps=2720 +[epoch 4/50] step=2740 train_loss=0.1500 tok_s=60087.2 opt_steps=2740 +[epoch 4/50] step=2760 train_loss=0.1499 tok_s=60087.8 opt_steps=2760 +[epoch 4/50] step=2780 train_loss=0.1499 tok_s=60085.5 opt_steps=2780 +[epoch 4/50] step=2800 train_loss=0.1499 tok_s=60089.2 opt_steps=2800 +[epoch 4/50] step=2820 train_loss=0.1498 tok_s=60092.3 opt_steps=2820 +[epoch 4/50] step=2840 train_loss=0.1498 tok_s=60093.3 opt_steps=2840 +[epoch 4/50] step=2860 train_loss=0.1498 tok_s=60096.4 opt_steps=2860 +[epoch 4/50] step=2880 train_loss=0.1498 tok_s=60096.2 opt_steps=2880 +[epoch 4/50] step=2900 train_loss=0.1497 tok_s=60094.8 opt_steps=2900 +[epoch 4/50] step=2920 train_loss=0.1497 tok_s=60091.9 opt_steps=2920 +[epoch 4/50] step=2940 train_loss=0.1496 tok_s=60094.7 opt_steps=2940 +[epoch 4/50] step=2960 train_loss=0.1496 tok_s=60096.2 opt_steps=2960 +[epoch 4/50] step=2980 train_loss=0.1496 tok_s=60095.0 opt_steps=2980 +[epoch 4/50] step=3000 train_loss=0.1495 tok_s=60097.9 opt_steps=3000 +[epoch 4/50] step=3020 train_loss=0.1495 tok_s=60099.9 opt_steps=3020 +[epoch 4/50] step=3040 train_loss=0.1495 tok_s=60101.0 opt_steps=3040 +[epoch 4/50] step=3060 train_loss=0.1495 tok_s=60099.9 opt_steps=3060 +[epoch 4/50] step=3080 train_loss=0.1494 tok_s=60097.7 opt_steps=3080 +[epoch 4/50] step=3100 train_loss=0.1494 tok_s=60097.3 opt_steps=3100 +[epoch 4/50] step=3120 train_loss=0.1494 tok_s=60099.1 opt_steps=3120 +[epoch 4/50] step=3140 train_loss=0.1493 tok_s=60097.5 opt_steps=3140 +[epoch 4/50] step=3160 train_loss=0.1493 tok_s=60096.0 opt_steps=3160 +[epoch 4/50] step=3180 train_loss=0.1493 tok_s=60099.6 opt_steps=3180 +[epoch 4/50] step=3200 train_loss=0.1492 tok_s=60095.8 opt_steps=3200 +[epoch 4/50] step=3220 train_loss=0.1492 tok_s=60086.3 opt_steps=3220 +[epoch 4/50] step=3240 train_loss=0.1491 tok_s=60082.9 opt_steps=3240 +[epoch 4/50] step=3260 train_loss=0.1491 tok_s=60075.2 opt_steps=3260 +[epoch 4/50] train_loss=0.1491 val_skipped tok_s=60077.7 opt_steps=3273 +[epoch 5/50] step=20 train_loss=0.1209 tok_s=58304.9 opt_steps=20 +[epoch 5/50] step=40 train_loss=0.1209 tok_s=59382.2 opt_steps=40 +[epoch 5/50] step=60 train_loss=0.1209 tok_s=59589.2 opt_steps=60 +[epoch 5/50] step=80 train_loss=0.1206 tok_s=59838.5 opt_steps=80 +[epoch 5/50] step=100 train_loss=0.1200 tok_s=59854.4 opt_steps=100 +[epoch 5/50] step=120 train_loss=0.1202 tok_s=59822.8 opt_steps=120 +[epoch 5/50] step=140 train_loss=0.1200 tok_s=59830.6 opt_steps=140 +[epoch 5/50] step=160 train_loss=0.1201 tok_s=59815.7 opt_steps=160 +[epoch 5/50] step=180 train_loss=0.1200 tok_s=59782.9 opt_steps=180 +[epoch 5/50] step=200 train_loss=0.1204 tok_s=59753.8 opt_steps=200 +[epoch 5/50] step=220 train_loss=0.1206 tok_s=59778.4 opt_steps=220 +[epoch 5/50] step=240 train_loss=0.1211 tok_s=59792.9 opt_steps=240 +[epoch 5/50] step=260 train_loss=0.1212 tok_s=59832.8 opt_steps=260 +[epoch 5/50] step=280 train_loss=0.1212 tok_s=59814.8 opt_steps=280 +[epoch 5/50] step=300 train_loss=0.1213 tok_s=59852.5 opt_steps=300 +[epoch 5/50] step=320 train_loss=0.1214 tok_s=59831.6 opt_steps=320 +[epoch 5/50] step=340 train_loss=0.1215 tok_s=59815.3 opt_steps=340 +[epoch 5/50] step=360 train_loss=0.1217 tok_s=59826.6 opt_steps=360 +[epoch 5/50] step=380 train_loss=0.1217 tok_s=59831.9 opt_steps=380 +[epoch 5/50] step=400 train_loss=0.1219 tok_s=59843.2 opt_steps=400 +[epoch 5/50] step=420 train_loss=0.1220 tok_s=59860.3 opt_steps=420 +[epoch 5/50] step=440 train_loss=0.1222 tok_s=59868.4 opt_steps=440 +[epoch 5/50] step=460 train_loss=0.1222 tok_s=59858.1 opt_steps=460 +[epoch 5/50] step=480 train_loss=0.1223 tok_s=59873.4 opt_steps=480 +[epoch 5/50] step=500 train_loss=0.1225 tok_s=59876.7 opt_steps=500 +[epoch 5/50] step=520 train_loss=0.1227 tok_s=59874.6 opt_steps=520 +[epoch 5/50] step=540 train_loss=0.1228 tok_s=59862.7 opt_steps=540 +[epoch 5/50] step=560 train_loss=0.1229 tok_s=59864.9 opt_steps=560 +[epoch 5/50] step=580 train_loss=0.1229 tok_s=59871.7 opt_steps=580 +[epoch 5/50] step=600 train_loss=0.1230 tok_s=59897.8 opt_steps=600 +[epoch 5/50] step=620 train_loss=0.1230 tok_s=59893.2 opt_steps=620 +[epoch 5/50] step=640 train_loss=0.1231 tok_s=59889.2 opt_steps=640 +[epoch 5/50] step=660 train_loss=0.1232 tok_s=59892.1 opt_steps=660 +[epoch 5/50] step=680 train_loss=0.1233 tok_s=59896.5 opt_steps=680 +[epoch 5/50] step=700 train_loss=0.1234 tok_s=59907.8 opt_steps=700 +[epoch 5/50] step=720 train_loss=0.1235 tok_s=59933.5 opt_steps=720 +[epoch 5/50] step=740 train_loss=0.1235 tok_s=59936.8 opt_steps=740 +[epoch 5/50] step=760 train_loss=0.1236 tok_s=59925.6 opt_steps=760 +[epoch 5/50] step=780 train_loss=0.1237 tok_s=59941.8 opt_steps=780 +[epoch 5/50] step=800 train_loss=0.1237 tok_s=59935.4 opt_steps=800 +[epoch 5/50] step=820 train_loss=0.1237 tok_s=59930.6 opt_steps=820 +[epoch 5/50] step=840 train_loss=0.1237 tok_s=59923.0 opt_steps=840 +[epoch 5/50] step=860 train_loss=0.1237 tok_s=59927.6 opt_steps=860 +[epoch 5/50] step=880 train_loss=0.1238 tok_s=59946.5 opt_steps=880 +[epoch 5/50] step=900 train_loss=0.1238 tok_s=59940.6 opt_steps=900 +[epoch 5/50] step=920 train_loss=0.1238 tok_s=59934.6 opt_steps=920 +[epoch 5/50] step=940 train_loss=0.1239 tok_s=59950.4 opt_steps=940 +[epoch 5/50] step=960 train_loss=0.1239 tok_s=59966.6 opt_steps=960 +[epoch 5/50] step=980 train_loss=0.1240 tok_s=59962.8 opt_steps=980 +[epoch 5/50] step=1000 train_loss=0.1240 tok_s=59964.4 opt_steps=1000 +[epoch 5/50] step=1020 train_loss=0.1240 tok_s=59966.0 opt_steps=1020 +[epoch 5/50] step=1040 train_loss=0.1240 tok_s=59962.8 opt_steps=1040 +[epoch 5/50] step=1060 train_loss=0.1241 tok_s=59967.5 opt_steps=1060 +[epoch 5/50] step=1080 train_loss=0.1241 tok_s=59975.5 opt_steps=1080 +[epoch 5/50] step=1100 train_loss=0.1241 tok_s=59974.8 opt_steps=1100 +[epoch 5/50] step=1120 train_loss=0.1242 tok_s=59969.7 opt_steps=1120 +[epoch 5/50] step=1140 train_loss=0.1242 tok_s=59955.6 opt_steps=1140 +[epoch 5/50] step=1160 train_loss=0.1242 tok_s=59945.1 opt_steps=1160 +[epoch 5/50] step=1180 train_loss=0.1242 tok_s=59927.5 opt_steps=1180 +[epoch 5/50] step=1200 train_loss=0.1242 tok_s=59916.6 opt_steps=1200 +[epoch 5/50] step=1220 train_loss=0.1242 tok_s=59897.7 opt_steps=1220 +[epoch 5/50] step=1240 train_loss=0.1243 tok_s=59885.5 opt_steps=1240 +[epoch 5/50] step=1260 train_loss=0.1243 tok_s=59885.1 opt_steps=1260 +[epoch 5/50] step=1280 train_loss=0.1242 tok_s=59889.9 opt_steps=1280 +[epoch 5/50] step=1300 train_loss=0.1243 tok_s=59890.4 opt_steps=1300 +[epoch 5/50] step=1320 train_loss=0.1243 tok_s=59892.2 opt_steps=1320 +[epoch 5/50] step=1340 train_loss=0.1244 tok_s=59895.5 opt_steps=1340 +[epoch 5/50] step=1360 train_loss=0.1244 tok_s=59899.9 opt_steps=1360 +[epoch 5/50] step=1380 train_loss=0.1244 tok_s=59902.7 opt_steps=1380 +[epoch 5/50] step=1400 train_loss=0.1244 tok_s=59904.0 opt_steps=1400 +[epoch 5/50] step=1420 train_loss=0.1244 tok_s=59911.6 opt_steps=1420 +[epoch 5/50] step=1440 train_loss=0.1243 tok_s=59917.5 opt_steps=1440 +[epoch 5/50] step=1460 train_loss=0.1243 tok_s=59918.7 opt_steps=1460 +[epoch 5/50] step=1480 train_loss=0.1243 tok_s=59921.3 opt_steps=1480 +[epoch 5/50] step=1500 train_loss=0.1243 tok_s=59928.5 opt_steps=1500 +[epoch 5/50] step=1520 train_loss=0.1242 tok_s=59933.0 opt_steps=1520 +[epoch 5/50] step=1540 train_loss=0.1243 tok_s=59935.4 opt_steps=1540 +[epoch 5/50] step=1560 train_loss=0.1243 tok_s=59934.8 opt_steps=1560 +[epoch 5/50] step=1580 train_loss=0.1243 tok_s=59935.3 opt_steps=1580 +[epoch 5/50] step=1600 train_loss=0.1243 tok_s=59935.8 opt_steps=1600 +[epoch 5/50] step=1620 train_loss=0.1243 tok_s=59940.4 opt_steps=1620 +[epoch 5/50] step=1640 train_loss=0.1243 tok_s=59947.4 opt_steps=1640 +[epoch 5/50] step=1660 train_loss=0.1243 tok_s=59949.9 opt_steps=1660 +[epoch 5/50] step=1680 train_loss=0.1243 tok_s=59950.8 opt_steps=1680 +[epoch 5/50] step=1700 train_loss=0.1243 tok_s=59957.8 opt_steps=1700 +[epoch 5/50] step=1720 train_loss=0.1243 tok_s=59965.3 opt_steps=1720 +[epoch 5/50] step=1740 train_loss=0.1243 tok_s=59968.2 opt_steps=1740 +[epoch 5/50] step=1760 train_loss=0.1243 tok_s=59974.5 opt_steps=1760 +[epoch 5/50] step=1780 train_loss=0.1243 tok_s=59980.1 opt_steps=1780 +[epoch 5/50] step=1800 train_loss=0.1243 tok_s=59983.2 opt_steps=1800 +[epoch 5/50] step=1820 train_loss=0.1243 tok_s=59988.0 opt_steps=1820 +[epoch 5/50] step=1840 train_loss=0.1244 tok_s=59991.0 opt_steps=1840 +[epoch 5/50] step=1860 train_loss=0.1244 tok_s=60000.5 opt_steps=1860 +[epoch 5/50] step=1880 train_loss=0.1244 tok_s=59997.3 opt_steps=1880 +[epoch 5/50] step=1900 train_loss=0.1244 tok_s=59999.7 opt_steps=1900 +[epoch 5/50] step=1920 train_loss=0.1244 tok_s=60004.7 opt_steps=1920 +[epoch 5/50] step=1940 train_loss=0.1244 tok_s=60005.3 opt_steps=1940 +[epoch 5/50] step=1960 train_loss=0.1244 tok_s=60012.8 opt_steps=1960 +[epoch 5/50] step=1980 train_loss=0.1244 tok_s=60011.0 opt_steps=1980 +[epoch 5/50] step=2000 train_loss=0.1244 tok_s=60010.2 opt_steps=2000 +[epoch 5/50] step=2020 train_loss=0.1244 tok_s=60012.8 opt_steps=2020 +[epoch 5/50] step=2040 train_loss=0.1244 tok_s=60015.7 opt_steps=2040 +[epoch 5/50] step=2060 train_loss=0.1244 tok_s=60016.0 opt_steps=2060 +[epoch 5/50] step=2080 train_loss=0.1244 tok_s=60018.5 opt_steps=2080 +[epoch 5/50] step=2100 train_loss=0.1244 tok_s=60022.0 opt_steps=2100 +[epoch 5/50] step=2120 train_loss=0.1244 tok_s=60026.1 opt_steps=2120 +[epoch 5/50] step=2140 train_loss=0.1244 tok_s=60025.6 opt_steps=2140 +[epoch 5/50] step=2160 train_loss=0.1243 tok_s=60026.5 opt_steps=2160 +[epoch 5/50] step=2180 train_loss=0.1243 tok_s=60029.2 opt_steps=2180 +[epoch 5/50] step=2200 train_loss=0.1243 tok_s=60030.5 opt_steps=2200 +[epoch 5/50] step=2220 train_loss=0.1243 tok_s=60030.3 opt_steps=2220 +[epoch 5/50] step=2240 train_loss=0.1243 tok_s=60031.9 opt_steps=2240 +[epoch 5/50] step=2260 train_loss=0.1243 tok_s=60036.2 opt_steps=2260 +[epoch 5/50] step=2280 train_loss=0.1243 tok_s=60038.8 opt_steps=2280 +[epoch 5/50] step=2300 train_loss=0.1243 tok_s=60041.3 opt_steps=2300 +[epoch 5/50] step=2320 train_loss=0.1243 tok_s=60046.5 opt_steps=2320 +[epoch 5/50] step=2340 train_loss=0.1243 tok_s=60049.4 opt_steps=2340 +[epoch 5/50] step=2360 train_loss=0.1243 tok_s=60052.1 opt_steps=2360 +[epoch 5/50] step=2380 train_loss=0.1243 tok_s=60054.5 opt_steps=2380 +[epoch 5/50] step=2400 train_loss=0.1243 tok_s=60053.2 opt_steps=2400 +[epoch 5/50] step=2420 train_loss=0.1243 tok_s=60048.9 opt_steps=2420 +[epoch 5/50] step=2440 train_loss=0.1243 tok_s=60047.5 opt_steps=2440 +[epoch 5/50] step=2460 train_loss=0.1243 tok_s=60048.3 opt_steps=2460 +[epoch 5/50] step=2480 train_loss=0.1243 tok_s=60045.6 opt_steps=2480 +[epoch 5/50] step=2500 train_loss=0.1243 tok_s=60047.0 opt_steps=2500 +[epoch 5/50] step=2520 train_loss=0.1243 tok_s=60049.6 opt_steps=2520 +[epoch 5/50] step=2540 train_loss=0.1243 tok_s=60047.7 opt_steps=2540 +[epoch 5/50] step=2560 train_loss=0.1243 tok_s=60052.2 opt_steps=2560 +[epoch 5/50] step=2580 train_loss=0.1243 tok_s=60052.1 opt_steps=2580 +[epoch 5/50] step=2600 train_loss=0.1242 tok_s=60048.1 opt_steps=2600 +[epoch 5/50] step=2620 train_loss=0.1242 tok_s=60049.0 opt_steps=2620 +[epoch 5/50] step=2640 train_loss=0.1242 tok_s=60036.7 opt_steps=2640 +[epoch 5/50] step=2660 train_loss=0.1242 tok_s=60038.1 opt_steps=2660 +[epoch 5/50] step=2680 train_loss=0.1242 tok_s=60038.4 opt_steps=2680 +[epoch 5/50] step=2700 train_loss=0.1242 tok_s=60037.2 opt_steps=2700 +[epoch 5/50] step=2720 train_loss=0.1242 tok_s=60037.5 opt_steps=2720 +[epoch 5/50] step=2740 train_loss=0.1242 tok_s=60039.4 opt_steps=2740 +[epoch 5/50] step=2760 train_loss=0.1242 tok_s=60037.7 opt_steps=2760 +[epoch 5/50] step=2780 train_loss=0.1242 tok_s=60038.4 opt_steps=2780 +[epoch 5/50] step=2800 train_loss=0.1242 tok_s=60037.1 opt_steps=2800 +[epoch 5/50] step=2820 train_loss=0.1241 tok_s=60041.5 opt_steps=2820 +[epoch 5/50] step=2840 train_loss=0.1241 tok_s=60043.2 opt_steps=2840 +[epoch 5/50] step=2860 train_loss=0.1241 tok_s=60044.5 opt_steps=2860 +[epoch 5/50] step=2880 train_loss=0.1241 tok_s=60047.7 opt_steps=2880 +[epoch 5/50] step=2900 train_loss=0.1241 tok_s=60048.1 opt_steps=2900 +[epoch 5/50] step=2920 train_loss=0.1241 tok_s=60052.4 opt_steps=2920 +[epoch 5/50] step=2940 train_loss=0.1241 tok_s=60051.9 opt_steps=2940 +[epoch 5/50] step=2960 train_loss=0.1241 tok_s=60050.9 opt_steps=2960 +[epoch 5/50] step=2980 train_loss=0.1241 tok_s=60049.4 opt_steps=2980 +[epoch 5/50] step=3000 train_loss=0.1241 tok_s=60053.6 opt_steps=3000 +[epoch 5/50] step=3020 train_loss=0.1241 tok_s=60056.6 opt_steps=3020 +[epoch 5/50] step=3040 train_loss=0.1240 tok_s=60059.8 opt_steps=3040 +[epoch 5/50] step=3060 train_loss=0.1240 tok_s=60060.6 opt_steps=3060 +[epoch 5/50] step=3080 train_loss=0.1240 tok_s=60050.6 opt_steps=3080 +[epoch 5/50] step=3100 train_loss=0.1240 tok_s=60041.8 opt_steps=3100 +[epoch 5/50] step=3120 train_loss=0.1240 tok_s=60036.4 opt_steps=3120 +[epoch 5/50] step=3140 train_loss=0.1240 tok_s=60032.3 opt_steps=3140 +[epoch 5/50] step=3160 train_loss=0.1240 tok_s=60028.1 opt_steps=3160 +[epoch 5/50] step=3180 train_loss=0.1240 tok_s=60029.4 opt_steps=3180 +[epoch 5/50] step=3200 train_loss=0.1240 tok_s=60031.4 opt_steps=3200 +[epoch 5/50] step=3220 train_loss=0.1240 tok_s=60029.4 opt_steps=3220 +[epoch 5/50] step=3240 train_loss=0.1240 tok_s=60028.9 opt_steps=3240 +[epoch 5/50] step=3260 train_loss=0.1240 tok_s=60023.8 opt_steps=3260 +[epoch 5/50] train_loss=0.1240 val_skipped tok_s=60022.4 opt_steps=3273 +[epoch 6/50] step=20 train_loss=0.1008 tok_s=57806.8 opt_steps=20 +[epoch 6/50] step=40 train_loss=0.1008 tok_s=59026.6 opt_steps=40 +[epoch 6/50] step=60 train_loss=0.1009 tok_s=59328.4 opt_steps=60 +[epoch 6/50] step=80 train_loss=0.1008 tok_s=59481.1 opt_steps=80 +[epoch 6/50] step=100 train_loss=0.1009 tok_s=59548.4 opt_steps=100 +[epoch 6/50] step=120 train_loss=0.1008 tok_s=59625.6 opt_steps=120 +[epoch 6/50] step=140 train_loss=0.1005 tok_s=59686.5 opt_steps=140 +[epoch 6/50] step=160 train_loss=0.1006 tok_s=59703.4 opt_steps=160 +[epoch 6/50] step=180 train_loss=0.1007 tok_s=59750.6 opt_steps=180 +[epoch 6/50] step=200 train_loss=0.1006 tok_s=59801.7 opt_steps=200 +[epoch 6/50] step=220 train_loss=0.1008 tok_s=59862.2 opt_steps=220 +[epoch 6/50] step=240 train_loss=0.1010 tok_s=59883.9 opt_steps=240 +[epoch 6/50] step=260 train_loss=0.1011 tok_s=59898.4 opt_steps=260 +[epoch 6/50] step=280 train_loss=0.1012 tok_s=59947.1 opt_steps=280 +[epoch 6/50] step=300 train_loss=0.1012 tok_s=59967.3 opt_steps=300 +[epoch 6/50] step=320 train_loss=0.1013 tok_s=59976.5 opt_steps=320 +[epoch 6/50] step=340 train_loss=0.1014 tok_s=59953.0 opt_steps=340 +[epoch 6/50] step=360 train_loss=0.1014 tok_s=59946.7 opt_steps=360 +[epoch 6/50] step=380 train_loss=0.1016 tok_s=59962.6 opt_steps=380 +[epoch 6/50] step=400 train_loss=0.1018 tok_s=59975.3 opt_steps=400 +[epoch 6/50] step=420 train_loss=0.1020 tok_s=59997.7 opt_steps=420 +[epoch 6/50] step=440 train_loss=0.1021 tok_s=60008.1 opt_steps=440 +[epoch 6/50] step=460 train_loss=0.1022 tok_s=60037.5 opt_steps=460 +[epoch 6/50] step=480 train_loss=0.1022 tok_s=60037.0 opt_steps=480 +[epoch 6/50] step=500 train_loss=0.1022 tok_s=60034.1 opt_steps=500 +[epoch 6/50] step=520 train_loss=0.1024 tok_s=60053.5 opt_steps=520 +[epoch 6/50] step=540 train_loss=0.1024 tok_s=60067.7 opt_steps=540 +[epoch 6/50] step=560 train_loss=0.1025 tok_s=60073.1 opt_steps=560 +[epoch 6/50] step=580 train_loss=0.1025 tok_s=60086.5 opt_steps=580 +[epoch 6/50] step=600 train_loss=0.1027 tok_s=60107.0 opt_steps=600 +[epoch 6/50] step=620 train_loss=0.1027 tok_s=60117.7 opt_steps=620 +[epoch 6/50] step=640 train_loss=0.1028 tok_s=60129.8 opt_steps=640 +[epoch 6/50] step=660 train_loss=0.1028 tok_s=60126.8 opt_steps=660 +[epoch 6/50] step=680 train_loss=0.1029 tok_s=60134.4 opt_steps=680 +[epoch 6/50] step=700 train_loss=0.1029 tok_s=60143.6 opt_steps=700 +[epoch 6/50] step=720 train_loss=0.1030 tok_s=60139.6 opt_steps=720 +[epoch 6/50] step=740 train_loss=0.1030 tok_s=60134.9 opt_steps=740 +[epoch 6/50] step=760 train_loss=0.1031 tok_s=60130.6 opt_steps=760 +[epoch 6/50] step=780 train_loss=0.1032 tok_s=60144.5 opt_steps=780 +[epoch 6/50] step=800 train_loss=0.1032 tok_s=60138.8 opt_steps=800 +[epoch 6/50] step=820 train_loss=0.1033 tok_s=60131.0 opt_steps=820 +[epoch 6/50] step=840 train_loss=0.1034 tok_s=60127.5 opt_steps=840 +[epoch 6/50] step=860 train_loss=0.1035 tok_s=60121.2 opt_steps=860 +[epoch 6/50] step=880 train_loss=0.1035 tok_s=60122.4 opt_steps=880 +[epoch 6/50] step=900 train_loss=0.1036 tok_s=60119.4 opt_steps=900 +[epoch 6/50] step=920 train_loss=0.1037 tok_s=60122.0 opt_steps=920 +[epoch 6/50] step=940 train_loss=0.1037 tok_s=60113.6 opt_steps=940 +[epoch 6/50] step=960 train_loss=0.1038 tok_s=60103.9 opt_steps=960 +[epoch 6/50] step=980 train_loss=0.1039 tok_s=60113.0 opt_steps=980 +[epoch 6/50] step=1000 train_loss=0.1039 tok_s=60123.8 opt_steps=1000 +[epoch 6/50] step=1020 train_loss=0.1039 tok_s=60123.6 opt_steps=1020 +[epoch 6/50] step=1040 train_loss=0.1040 tok_s=60125.5 opt_steps=1040 +[epoch 6/50] step=1060 train_loss=0.1040 tok_s=60119.7 opt_steps=1060 +[epoch 6/50] step=1080 train_loss=0.1040 tok_s=60126.7 opt_steps=1080 +[epoch 6/50] step=1100 train_loss=0.1040 tok_s=60130.6 opt_steps=1100 +[epoch 6/50] step=1120 train_loss=0.1040 tok_s=60127.8 opt_steps=1120 +[epoch 6/50] step=1140 train_loss=0.1040 tok_s=60130.7 opt_steps=1140 +[epoch 6/50] step=1160 train_loss=0.1040 tok_s=60130.7 opt_steps=1160 +[epoch 6/50] step=1180 train_loss=0.1041 tok_s=60132.0 opt_steps=1180 +[epoch 6/50] step=1200 train_loss=0.1040 tok_s=60123.3 opt_steps=1200 +[epoch 6/50] step=1220 train_loss=0.1041 tok_s=60119.1 opt_steps=1220 +[epoch 6/50] step=1240 train_loss=0.1042 tok_s=60116.6 opt_steps=1240 +[epoch 6/50] step=1260 train_loss=0.1042 tok_s=60116.1 opt_steps=1260 +[epoch 6/50] step=1280 train_loss=0.1042 tok_s=60128.3 opt_steps=1280 +[epoch 6/50] step=1300 train_loss=0.1043 tok_s=60133.5 opt_steps=1300 +[epoch 6/50] step=1320 train_loss=0.1043 tok_s=60128.9 opt_steps=1320 +[epoch 6/50] step=1340 train_loss=0.1043 tok_s=60127.8 opt_steps=1340 +[epoch 6/50] step=1360 train_loss=0.1043 tok_s=60127.0 opt_steps=1360 +[epoch 6/50] step=1380 train_loss=0.1043 tok_s=60128.7 opt_steps=1380 +[epoch 6/50] step=1400 train_loss=0.1043 tok_s=60130.0 opt_steps=1400 +[epoch 6/50] step=1420 train_loss=0.1044 tok_s=60126.4 opt_steps=1420 +[epoch 6/50] step=1440 train_loss=0.1044 tok_s=60129.2 opt_steps=1440 +[epoch 6/50] step=1460 train_loss=0.1044 tok_s=60133.6 opt_steps=1460 +[epoch 6/50] step=1480 train_loss=0.1044 tok_s=60133.1 opt_steps=1480 +[epoch 6/50] step=1500 train_loss=0.1044 tok_s=60135.2 opt_steps=1500 +[epoch 6/50] step=1520 train_loss=0.1045 tok_s=60133.9 opt_steps=1520 +[epoch 6/50] step=1540 train_loss=0.1045 tok_s=60132.7 opt_steps=1540 +[epoch 6/50] step=1560 train_loss=0.1046 tok_s=60135.0 opt_steps=1560 +[epoch 6/50] step=1580 train_loss=0.1046 tok_s=60138.1 opt_steps=1580 +[epoch 6/50] step=1600 train_loss=0.1046 tok_s=60132.7 opt_steps=1600 +[epoch 6/50] step=1620 train_loss=0.1046 tok_s=60129.6 opt_steps=1620 +[epoch 6/50] step=1640 train_loss=0.1046 tok_s=60136.5 opt_steps=1640 +[epoch 6/50] step=1660 train_loss=0.1047 tok_s=60137.4 opt_steps=1660 +[epoch 6/50] step=1680 train_loss=0.1047 tok_s=60133.9 opt_steps=1680 +[epoch 6/50] step=1700 train_loss=0.1047 tok_s=60127.6 opt_steps=1700 +[epoch 6/50] step=1720 train_loss=0.1047 tok_s=60128.3 opt_steps=1720 +[epoch 6/50] step=1740 train_loss=0.1048 tok_s=60130.7 opt_steps=1740 +[epoch 6/50] step=1760 train_loss=0.1048 tok_s=60133.8 opt_steps=1760 +[epoch 6/50] step=1780 train_loss=0.1048 tok_s=60133.1 opt_steps=1780 +[epoch 6/50] step=1800 train_loss=0.1049 tok_s=60130.2 opt_steps=1800 +[epoch 6/50] step=1820 train_loss=0.1049 tok_s=60133.1 opt_steps=1820 +[epoch 6/50] step=1840 train_loss=0.1049 tok_s=60129.4 opt_steps=1840 +[epoch 6/50] step=1860 train_loss=0.1050 tok_s=60130.3 opt_steps=1860 +[epoch 6/50] step=1880 train_loss=0.1050 tok_s=60134.0 opt_steps=1880 +[epoch 6/50] step=1900 train_loss=0.1050 tok_s=60138.8 opt_steps=1900 +[epoch 6/50] step=1920 train_loss=0.1050 tok_s=60138.2 opt_steps=1920 +[epoch 6/50] step=1940 train_loss=0.1051 tok_s=60142.6 opt_steps=1940 +[epoch 6/50] step=1960 train_loss=0.1052 tok_s=60139.9 opt_steps=1960 +[epoch 6/50] step=1980 train_loss=0.1052 tok_s=60140.8 opt_steps=1980 +[epoch 6/50] step=2000 train_loss=0.1052 tok_s=60135.2 opt_steps=2000 +[epoch 6/50] step=2020 train_loss=0.1052 tok_s=60132.3 opt_steps=2020 +[epoch 6/50] step=2040 train_loss=0.1053 tok_s=60134.8 opt_steps=2040 +[epoch 6/50] step=2060 train_loss=0.1053 tok_s=60135.5 opt_steps=2060 +[epoch 6/50] step=2080 train_loss=0.1053 tok_s=60134.5 opt_steps=2080 +[epoch 6/50] step=2100 train_loss=0.1053 tok_s=60136.5 opt_steps=2100 +[epoch 6/50] step=2120 train_loss=0.1053 tok_s=60138.0 opt_steps=2120 +[epoch 6/50] step=2140 train_loss=0.1053 tok_s=60142.6 opt_steps=2140 +[epoch 6/50] step=2160 train_loss=0.1053 tok_s=60143.9 opt_steps=2160 +[epoch 6/50] step=2180 train_loss=0.1053 tok_s=60147.9 opt_steps=2180 +[epoch 6/50] step=2200 train_loss=0.1053 tok_s=60150.7 opt_steps=2200 +[epoch 6/50] step=2220 train_loss=0.1053 tok_s=60150.5 opt_steps=2220 +[epoch 6/50] step=2240 train_loss=0.1053 tok_s=60140.4 opt_steps=2240 +[epoch 6/50] step=2260 train_loss=0.1053 tok_s=60136.6 opt_steps=2260 +[epoch 6/50] step=2280 train_loss=0.1053 tok_s=60128.9 opt_steps=2280 +[epoch 6/50] step=2300 train_loss=0.1053 tok_s=60118.9 opt_steps=2300 +[epoch 6/50] step=2320 train_loss=0.1054 tok_s=60112.3 opt_steps=2320 +[epoch 6/50] step=2340 train_loss=0.1054 tok_s=60109.1 opt_steps=2340 +[epoch 6/50] step=2360 train_loss=0.1054 tok_s=60100.3 opt_steps=2360 +[epoch 6/50] step=2380 train_loss=0.1054 tok_s=60091.6 opt_steps=2380 +[epoch 6/50] step=2400 train_loss=0.1054 tok_s=60090.8 opt_steps=2400 +[epoch 6/50] step=2420 train_loss=0.1054 tok_s=60086.7 opt_steps=2420 +[epoch 6/50] step=2440 train_loss=0.1054 tok_s=60085.9 opt_steps=2440 +[epoch 6/50] step=2460 train_loss=0.1054 tok_s=60082.0 opt_steps=2460 +[epoch 6/50] step=2480 train_loss=0.1054 tok_s=60078.2 opt_steps=2480 +[epoch 6/50] step=2500 train_loss=0.1054 tok_s=60076.9 opt_steps=2500 +[epoch 6/50] step=2520 train_loss=0.1054 tok_s=60072.4 opt_steps=2520 +[epoch 6/50] step=2540 train_loss=0.1054 tok_s=60073.7 opt_steps=2540 +[epoch 6/50] step=2560 train_loss=0.1054 tok_s=60070.8 opt_steps=2560 +[epoch 6/50] step=2580 train_loss=0.1055 tok_s=60073.6 opt_steps=2580 +[epoch 6/50] step=2600 train_loss=0.1055 tok_s=60075.4 opt_steps=2600 +[epoch 6/50] step=2620 train_loss=0.1054 tok_s=60077.1 opt_steps=2620 +[epoch 6/50] step=2640 train_loss=0.1054 tok_s=60075.4 opt_steps=2640 +[epoch 6/50] step=2660 train_loss=0.1055 tok_s=60075.0 opt_steps=2660 +[epoch 6/50] step=2680 train_loss=0.1055 tok_s=60074.3 opt_steps=2680 +[epoch 6/50] step=2700 train_loss=0.1054 tok_s=60072.8 opt_steps=2700 +[epoch 6/50] step=2720 train_loss=0.1055 tok_s=60073.2 opt_steps=2720 +[epoch 6/50] step=2740 train_loss=0.1055 tok_s=60073.2 opt_steps=2740 +[epoch 6/50] step=2760 train_loss=0.1055 tok_s=60070.0 opt_steps=2760 +[epoch 6/50] step=2780 train_loss=0.1055 tok_s=60068.7 opt_steps=2780 +[epoch 6/50] step=2800 train_loss=0.1055 tok_s=60069.2 opt_steps=2800 +[epoch 6/50] step=2820 train_loss=0.1055 tok_s=60069.8 opt_steps=2820 +[epoch 6/50] step=2840 train_loss=0.1055 tok_s=60070.3 opt_steps=2840 +[epoch 6/50] step=2860 train_loss=0.1055 tok_s=60071.1 opt_steps=2860 +[epoch 6/50] step=2880 train_loss=0.1055 tok_s=60069.1 opt_steps=2880 +[epoch 6/50] step=2900 train_loss=0.1055 tok_s=60070.7 opt_steps=2900 +[epoch 6/50] step=2920 train_loss=0.1055 tok_s=60071.5 opt_steps=2920 +[epoch 6/50] step=2940 train_loss=0.1055 tok_s=60074.2 opt_steps=2940 +[epoch 6/50] step=2960 train_loss=0.1056 tok_s=60077.4 opt_steps=2960 +[epoch 6/50] step=2980 train_loss=0.1056 tok_s=60077.8 opt_steps=2980 +[epoch 6/50] step=3000 train_loss=0.1056 tok_s=60078.6 opt_steps=3000 +[epoch 6/50] step=3020 train_loss=0.1056 tok_s=60077.0 opt_steps=3020 +[epoch 6/50] step=3040 train_loss=0.1056 tok_s=60076.4 opt_steps=3040 +[epoch 6/50] step=3060 train_loss=0.1056 tok_s=60076.0 opt_steps=3060 +[epoch 6/50] step=3080 train_loss=0.1056 tok_s=60080.3 opt_steps=3080 +[epoch 6/50] step=3100 train_loss=0.1056 tok_s=60074.8 opt_steps=3100 +[epoch 6/50] step=3120 train_loss=0.1056 tok_s=60072.6 opt_steps=3120 +[epoch 6/50] step=3140 train_loss=0.1055 tok_s=60075.7 opt_steps=3140 +[epoch 6/50] step=3160 train_loss=0.1055 tok_s=60078.3 opt_steps=3160 +[epoch 6/50] step=3180 train_loss=0.1055 tok_s=60077.2 opt_steps=3180 +[epoch 6/50] step=3200 train_loss=0.1055 tok_s=60075.0 opt_steps=3200 +[epoch 6/50] step=3220 train_loss=0.1055 tok_s=60074.1 opt_steps=3220 +[epoch 6/50] step=3240 train_loss=0.1055 tok_s=60074.2 opt_steps=3240 +[epoch 6/50] step=3260 train_loss=0.1055 tok_s=60068.8 opt_steps=3260 +[epoch 6/50] train_loss=0.1055 val_skipped tok_s=60069.7 opt_steps=3273 +[epoch 7/50] step=20 train_loss=0.0859 tok_s=58607.1 opt_steps=20 +[epoch 7/50] step=40 train_loss=0.0845 tok_s=59421.0 opt_steps=40 +[epoch 7/50] step=60 train_loss=0.0834 tok_s=59633.7 opt_steps=60 +[epoch 7/50] step=80 train_loss=0.0831 tok_s=59903.4 opt_steps=80 +[epoch 7/50] step=100 train_loss=0.0828 tok_s=60204.5 opt_steps=100 +[epoch 7/50] step=120 train_loss=0.0829 tok_s=60277.9 opt_steps=120 +[epoch 7/50] step=140 train_loss=0.0832 tok_s=60296.4 opt_steps=140 +[epoch 7/50] step=160 train_loss=0.0835 tok_s=60301.7 opt_steps=160 +[epoch 7/50] step=180 train_loss=0.0836 tok_s=60289.8 opt_steps=180 +[epoch 7/50] step=200 train_loss=0.0838 tok_s=60250.6 opt_steps=200 +[epoch 7/50] step=220 train_loss=0.0839 tok_s=60231.4 opt_steps=220 +[epoch 7/50] step=240 train_loss=0.0840 tok_s=60257.2 opt_steps=240 +[epoch 7/50] step=260 train_loss=0.0842 tok_s=60234.4 opt_steps=260 +[epoch 7/50] step=280 train_loss=0.0844 tok_s=60261.4 opt_steps=280 +[epoch 7/50] step=300 train_loss=0.0845 tok_s=60297.4 opt_steps=300 +[epoch 7/50] step=320 train_loss=0.0847 tok_s=60303.8 opt_steps=320 +[epoch 7/50] step=340 train_loss=0.0848 tok_s=60341.1 opt_steps=340 +[epoch 7/50] step=360 train_loss=0.0850 tok_s=60353.2 opt_steps=360 +[epoch 7/50] step=380 train_loss=0.0852 tok_s=60339.8 opt_steps=380 +[epoch 7/50] step=400 train_loss=0.0854 tok_s=60354.9 opt_steps=400 +[epoch 7/50] step=420 train_loss=0.0855 tok_s=60335.8 opt_steps=420 +[epoch 7/50] step=440 train_loss=0.0856 tok_s=60332.6 opt_steps=440 +[epoch 7/50] step=460 train_loss=0.0857 tok_s=60321.1 opt_steps=460 +[epoch 7/50] step=480 train_loss=0.0859 tok_s=60328.5 opt_steps=480 +[epoch 7/50] step=500 train_loss=0.0859 tok_s=60330.4 opt_steps=500 +[epoch 7/50] step=520 train_loss=0.0860 tok_s=60307.0 opt_steps=520 +[epoch 7/50] step=540 train_loss=0.0861 tok_s=60292.8 opt_steps=540 +[epoch 7/50] step=560 train_loss=0.0862 tok_s=60293.4 opt_steps=560 +[epoch 7/50] step=580 train_loss=0.0863 tok_s=60287.3 opt_steps=580 +[epoch 7/50] step=600 train_loss=0.0864 tok_s=60298.1 opt_steps=600 +[epoch 7/50] step=620 train_loss=0.0865 tok_s=60288.3 opt_steps=620 +[epoch 7/50] step=640 train_loss=0.0865 tok_s=60294.5 opt_steps=640 +[epoch 7/50] step=660 train_loss=0.0866 tok_s=60281.2 opt_steps=660 +[epoch 7/50] step=680 train_loss=0.0867 tok_s=60272.4 opt_steps=680 +[epoch 7/50] step=700 train_loss=0.0867 tok_s=60269.3 opt_steps=700 +[epoch 7/50] step=720 train_loss=0.0868 tok_s=60263.9 opt_steps=720 +[epoch 7/50] step=740 train_loss=0.0869 tok_s=60262.6 opt_steps=740 +[epoch 7/50] step=760 train_loss=0.0870 tok_s=60252.5 opt_steps=760 +[epoch 7/50] step=780 train_loss=0.0870 tok_s=60247.7 opt_steps=780 +[epoch 7/50] step=800 train_loss=0.0870 tok_s=60240.7 opt_steps=800 +[epoch 7/50] step=820 train_loss=0.0871 tok_s=60248.7 opt_steps=820 +[epoch 7/50] step=840 train_loss=0.0872 tok_s=60246.6 opt_steps=840 +[epoch 7/50] step=860 train_loss=0.0872 tok_s=60238.7 opt_steps=860 +[epoch 7/50] step=880 train_loss=0.0874 tok_s=60245.1 opt_steps=880 +[epoch 7/50] step=900 train_loss=0.0874 tok_s=60257.5 opt_steps=900 +[epoch 7/50] step=920 train_loss=0.0875 tok_s=60259.8 opt_steps=920 +[epoch 7/50] step=940 train_loss=0.0876 tok_s=60258.2 opt_steps=940 +[epoch 7/50] step=960 train_loss=0.0876 tok_s=60248.7 opt_steps=960 +[epoch 7/50] step=980 train_loss=0.0877 tok_s=60240.7 opt_steps=980 +[epoch 7/50] step=1000 train_loss=0.0878 tok_s=60234.8 opt_steps=1000 +[epoch 7/50] step=1020 train_loss=0.0878 tok_s=60232.4 opt_steps=1020 +[epoch 7/50] step=1040 train_loss=0.0879 tok_s=60232.2 opt_steps=1040 +[epoch 7/50] step=1060 train_loss=0.0880 tok_s=60236.8 opt_steps=1060 +[epoch 7/50] step=1080 train_loss=0.0880 tok_s=60232.2 opt_steps=1080 +[epoch 7/50] step=1100 train_loss=0.0881 tok_s=60232.5 opt_steps=1100 +[epoch 7/50] step=1120 train_loss=0.0881 tok_s=60239.0 opt_steps=1120 +[epoch 7/50] step=1140 train_loss=0.0882 tok_s=60244.0 opt_steps=1140 +[epoch 7/50] step=1160 train_loss=0.0882 tok_s=60246.9 opt_steps=1160 +[epoch 7/50] step=1180 train_loss=0.0883 tok_s=60245.6 opt_steps=1180 +[epoch 7/50] step=1200 train_loss=0.0884 tok_s=60246.2 opt_steps=1200 +[epoch 7/50] step=1220 train_loss=0.0885 tok_s=60250.9 opt_steps=1220 +[epoch 7/50] step=1240 train_loss=0.0885 tok_s=60252.0 opt_steps=1240 +[epoch 7/50] step=1260 train_loss=0.0886 tok_s=60245.8 opt_steps=1260 +[epoch 7/50] step=1280 train_loss=0.0886 tok_s=60242.3 opt_steps=1280 +[epoch 7/50] step=1300 train_loss=0.0886 tok_s=60235.6 opt_steps=1300 +[epoch 7/50] step=1320 train_loss=0.0886 tok_s=60231.8 opt_steps=1320 +[epoch 7/50] step=1340 train_loss=0.0887 tok_s=60232.5 opt_steps=1340 +[epoch 7/50] step=1360 train_loss=0.0888 tok_s=60236.1 opt_steps=1360 +[epoch 7/50] step=1380 train_loss=0.0888 tok_s=60239.2 opt_steps=1380 +[epoch 7/50] step=1400 train_loss=0.0889 tok_s=60231.4 opt_steps=1400 +[epoch 7/50] step=1420 train_loss=0.0889 tok_s=60222.9 opt_steps=1420 +[epoch 7/50] step=1440 train_loss=0.0890 tok_s=60214.0 opt_steps=1440 +[epoch 7/50] step=1460 train_loss=0.0890 tok_s=60211.9 opt_steps=1460 +[epoch 7/50] step=1480 train_loss=0.0891 tok_s=60215.7 opt_steps=1480 +[epoch 7/50] step=1500 train_loss=0.0891 tok_s=60213.2 opt_steps=1500 +[epoch 7/50] step=1520 train_loss=0.0892 tok_s=60210.9 opt_steps=1520 +[epoch 7/50] step=1540 train_loss=0.0892 tok_s=60207.7 opt_steps=1540 +[epoch 7/50] step=1560 train_loss=0.0892 tok_s=60209.0 opt_steps=1560 +[epoch 7/50] step=1580 train_loss=0.0893 tok_s=60205.9 opt_steps=1580 +[epoch 7/50] step=1600 train_loss=0.0893 tok_s=60204.6 opt_steps=1600 +[epoch 7/50] step=1620 train_loss=0.0894 tok_s=60203.2 opt_steps=1620 +[epoch 7/50] step=1640 train_loss=0.0894 tok_s=60202.0 opt_steps=1640 +[epoch 7/50] step=1660 train_loss=0.0894 tok_s=60200.6 opt_steps=1660 +[epoch 7/50] step=1680 train_loss=0.0894 tok_s=60205.3 opt_steps=1680 +[epoch 7/50] step=1700 train_loss=0.0894 tok_s=60203.3 opt_steps=1700 +[epoch 7/50] step=1720 train_loss=0.0894 tok_s=60201.2 opt_steps=1720 +[epoch 7/50] step=1740 train_loss=0.0895 tok_s=60202.0 opt_steps=1740 +[epoch 7/50] step=1760 train_loss=0.0895 tok_s=60207.8 opt_steps=1760 +[epoch 7/50] step=1780 train_loss=0.0895 tok_s=60210.2 opt_steps=1780 +[epoch 7/50] step=1800 train_loss=0.0895 tok_s=60215.2 opt_steps=1800 +[epoch 7/50] step=1820 train_loss=0.0896 tok_s=60212.7 opt_steps=1820 +[epoch 7/50] step=1840 train_loss=0.0896 tok_s=60213.7 opt_steps=1840 +[epoch 7/50] step=1860 train_loss=0.0896 tok_s=60209.1 opt_steps=1860 +[epoch 7/50] step=1880 train_loss=0.0897 tok_s=60206.1 opt_steps=1880 +[epoch 7/50] step=1900 train_loss=0.0897 tok_s=60208.9 opt_steps=1900 +[epoch 7/50] step=1920 train_loss=0.0897 tok_s=60209.3 opt_steps=1920 +[epoch 7/50] step=1940 train_loss=0.0898 tok_s=60212.3 opt_steps=1940 +[epoch 7/50] step=1960 train_loss=0.0898 tok_s=60214.9 opt_steps=1960 +[epoch 7/50] step=1980 train_loss=0.0898 tok_s=60212.7 opt_steps=1980 +[epoch 7/50] step=2000 train_loss=0.0898 tok_s=60211.3 opt_steps=2000 +[epoch 7/50] step=2020 train_loss=0.0899 tok_s=60210.7 opt_steps=2020 +[epoch 7/50] step=2040 train_loss=0.0899 tok_s=60209.9 opt_steps=2040 +[epoch 7/50] step=2060 train_loss=0.0899 tok_s=60209.6 opt_steps=2060 +[epoch 7/50] step=2080 train_loss=0.0899 tok_s=60209.9 opt_steps=2080 +[epoch 7/50] step=2100 train_loss=0.0899 tok_s=60206.8 opt_steps=2100 +[epoch 7/50] step=2120 train_loss=0.0900 tok_s=60204.4 opt_steps=2120 +[epoch 7/50] step=2140 train_loss=0.0900 tok_s=60201.2 opt_steps=2140 +[epoch 7/50] step=2160 train_loss=0.0900 tok_s=60202.5 opt_steps=2160 +[epoch 7/50] step=2180 train_loss=0.0901 tok_s=60200.7 opt_steps=2180 +[epoch 7/50] step=2200 train_loss=0.0901 tok_s=60202.3 opt_steps=2200 +[epoch 7/50] step=2220 train_loss=0.0901 tok_s=60199.1 opt_steps=2220 +[epoch 7/50] step=2240 train_loss=0.0902 tok_s=60200.4 opt_steps=2240 +[epoch 7/50] step=2260 train_loss=0.0902 tok_s=60204.3 opt_steps=2260 +[epoch 7/50] step=2280 train_loss=0.0902 tok_s=60205.4 opt_steps=2280 +[epoch 7/50] step=2300 train_loss=0.0903 tok_s=60206.7 opt_steps=2300 +[epoch 7/50] step=2320 train_loss=0.0903 tok_s=60209.8 opt_steps=2320 +[epoch 7/50] step=2340 train_loss=0.0903 tok_s=60212.5 opt_steps=2340 +[epoch 7/50] step=2360 train_loss=0.0903 tok_s=60215.8 opt_steps=2360 +[epoch 7/50] step=2380 train_loss=0.0904 tok_s=60217.9 opt_steps=2380 +[epoch 7/50] step=2400 train_loss=0.0904 tok_s=60215.3 opt_steps=2400 +[epoch 7/50] step=2420 train_loss=0.0904 tok_s=60210.8 opt_steps=2420 +[epoch 7/50] step=2440 train_loss=0.0904 tok_s=60211.4 opt_steps=2440 +[epoch 7/50] step=2460 train_loss=0.0904 tok_s=60213.4 opt_steps=2460 +[epoch 7/50] step=2480 train_loss=0.0904 tok_s=60215.1 opt_steps=2480 +[epoch 7/50] step=2500 train_loss=0.0905 tok_s=60213.7 opt_steps=2500 +[epoch 7/50] step=2520 train_loss=0.0905 tok_s=60213.8 opt_steps=2520 +[epoch 7/50] step=2540 train_loss=0.0905 tok_s=60212.5 opt_steps=2540 +[epoch 7/50] step=2560 train_loss=0.0905 tok_s=60211.9 opt_steps=2560 +[epoch 7/50] step=2580 train_loss=0.0905 tok_s=60215.3 opt_steps=2580 +[epoch 7/50] step=2600 train_loss=0.0905 tok_s=60214.5 opt_steps=2600 +[epoch 7/50] step=2620 train_loss=0.0905 tok_s=60214.7 opt_steps=2620 +[epoch 7/50] step=2640 train_loss=0.0905 tok_s=60217.1 opt_steps=2640 +[epoch 7/50] step=2660 train_loss=0.0906 tok_s=60217.8 opt_steps=2660 +[epoch 7/50] step=2680 train_loss=0.0906 tok_s=60217.2 opt_steps=2680 +[epoch 7/50] step=2700 train_loss=0.0906 tok_s=60219.8 opt_steps=2700 +[epoch 7/50] step=2720 train_loss=0.0906 tok_s=60217.9 opt_steps=2720 +[epoch 7/50] step=2740 train_loss=0.0906 tok_s=60218.9 opt_steps=2740 +[epoch 7/50] step=2760 train_loss=0.0906 tok_s=60219.5 opt_steps=2760 +[epoch 7/50] step=2780 train_loss=0.0906 tok_s=60222.3 opt_steps=2780 +[epoch 7/50] step=2800 train_loss=0.0906 tok_s=60226.8 opt_steps=2800 +[epoch 7/50] step=2820 train_loss=0.0906 tok_s=60225.3 opt_steps=2820 +[epoch 7/50] step=2840 train_loss=0.0907 tok_s=60227.1 opt_steps=2840 +[epoch 7/50] step=2860 train_loss=0.0907 tok_s=60228.0 opt_steps=2860 +[epoch 7/50] step=2880 train_loss=0.0907 tok_s=60229.4 opt_steps=2880 +[epoch 7/50] step=2900 train_loss=0.0907 tok_s=60232.2 opt_steps=2900 +[epoch 7/50] step=2920 train_loss=0.0907 tok_s=60234.0 opt_steps=2920 +[epoch 7/50] step=2940 train_loss=0.0907 tok_s=60233.9 opt_steps=2940 +[epoch 7/50] step=2960 train_loss=0.0908 tok_s=60238.3 opt_steps=2960 +[epoch 7/50] step=2980 train_loss=0.0908 tok_s=60240.2 opt_steps=2980 +[epoch 7/50] step=3000 train_loss=0.0908 tok_s=60237.9 opt_steps=3000 +[epoch 7/50] step=3020 train_loss=0.0908 tok_s=60239.5 opt_steps=3020 +[epoch 7/50] step=3040 train_loss=0.0908 tok_s=60240.2 opt_steps=3040 +[epoch 7/50] step=3060 train_loss=0.0908 tok_s=60239.4 opt_steps=3060 +[epoch 7/50] step=3080 train_loss=0.0908 tok_s=60236.2 opt_steps=3080 +[epoch 7/50] step=3100 train_loss=0.0908 tok_s=60237.3 opt_steps=3100 +[epoch 7/50] step=3120 train_loss=0.0908 tok_s=60238.5 opt_steps=3120 +[epoch 7/50] step=3140 train_loss=0.0909 tok_s=60242.1 opt_steps=3140 +[epoch 7/50] step=3160 train_loss=0.0908 tok_s=60240.1 opt_steps=3160 +[epoch 7/50] step=3180 train_loss=0.0908 tok_s=60243.3 opt_steps=3180 +[epoch 7/50] step=3200 train_loss=0.0909 tok_s=60245.5 opt_steps=3200 +[epoch 7/50] step=3220 train_loss=0.0909 tok_s=60245.4 opt_steps=3220 +[epoch 7/50] step=3240 train_loss=0.0909 tok_s=60245.8 opt_steps=3240 +[epoch 7/50] step=3260 train_loss=0.0909 tok_s=60237.9 opt_steps=3260 +[epoch 7/50] train_loss=0.0909 val_skipped tok_s=60238.8 opt_steps=3273 +[epoch 8/50] step=20 train_loss=0.0701 tok_s=58510.3 opt_steps=20 +[epoch 8/50] step=40 train_loss=0.0709 tok_s=59675.2 opt_steps=40 +[epoch 8/50] step=60 train_loss=0.0710 tok_s=59830.8 opt_steps=60 +[epoch 8/50] step=80 train_loss=0.0708 tok_s=59980.3 opt_steps=80 +[epoch 8/50] step=100 train_loss=0.0709 tok_s=59982.5 opt_steps=100 +[epoch 8/50] step=120 train_loss=0.0711 tok_s=60004.6 opt_steps=120 +[epoch 8/50] step=140 train_loss=0.0711 tok_s=60045.5 opt_steps=140 +[epoch 8/50] step=160 train_loss=0.0714 tok_s=60095.7 opt_steps=160 +[epoch 8/50] step=180 train_loss=0.0716 tok_s=60102.7 opt_steps=180 +[epoch 8/50] step=200 train_loss=0.0717 tok_s=60121.0 opt_steps=200 +[epoch 8/50] step=220 train_loss=0.0717 tok_s=60100.1 opt_steps=220 +[epoch 8/50] step=240 train_loss=0.0720 tok_s=60062.8 opt_steps=240 +[epoch 8/50] step=260 train_loss=0.0721 tok_s=60031.8 opt_steps=260 +[epoch 8/50] step=280 train_loss=0.0722 tok_s=60047.4 opt_steps=280 +[epoch 8/50] step=300 train_loss=0.0724 tok_s=60040.4 opt_steps=300 +[epoch 8/50] step=320 train_loss=0.0727 tok_s=60054.4 opt_steps=320 +[epoch 8/50] step=340 train_loss=0.0729 tok_s=60067.8 opt_steps=340 +[epoch 8/50] step=360 train_loss=0.0730 tok_s=60053.7 opt_steps=360 +[epoch 8/50] step=380 train_loss=0.0731 tok_s=60071.0 opt_steps=380 +[epoch 8/50] step=400 train_loss=0.0731 tok_s=60070.6 opt_steps=400 +[epoch 8/50] step=420 train_loss=0.0732 tok_s=60069.0 opt_steps=420 +[epoch 8/50] step=440 train_loss=0.0733 tok_s=60091.1 opt_steps=440 +[epoch 8/50] step=460 train_loss=0.0734 tok_s=60078.8 opt_steps=460 +[epoch 8/50] step=480 train_loss=0.0735 tok_s=60068.0 opt_steps=480 +[epoch 8/50] step=500 train_loss=0.0736 tok_s=60081.9 opt_steps=500 +[epoch 8/50] step=520 train_loss=0.0737 tok_s=60116.7 opt_steps=520 +[epoch 8/50] step=540 train_loss=0.0738 tok_s=60125.5 opt_steps=540 +[epoch 8/50] step=560 train_loss=0.0738 tok_s=60118.8 opt_steps=560 +[epoch 8/50] step=580 train_loss=0.0739 tok_s=60116.0 opt_steps=580 +[epoch 8/50] step=600 train_loss=0.0740 tok_s=60121.4 opt_steps=600 +[epoch 8/50] step=620 train_loss=0.0741 tok_s=60117.6 opt_steps=620 +[epoch 8/50] step=640 train_loss=0.0742 tok_s=60137.5 opt_steps=640 +[epoch 8/50] step=660 train_loss=0.0742 tok_s=60139.8 opt_steps=660 +[epoch 8/50] step=680 train_loss=0.0743 tok_s=60131.1 opt_steps=680 +[epoch 8/50] step=700 train_loss=0.0745 tok_s=60133.3 opt_steps=700 +[epoch 8/50] step=720 train_loss=0.0746 tok_s=60137.1 opt_steps=720 +[epoch 8/50] step=740 train_loss=0.0746 tok_s=60150.7 opt_steps=740 +[epoch 8/50] step=760 train_loss=0.0747 tok_s=60144.4 opt_steps=760 +[epoch 8/50] step=780 train_loss=0.0748 tok_s=60146.5 opt_steps=780 +[epoch 8/50] step=800 train_loss=0.0748 tok_s=60105.2 opt_steps=800 +[epoch 8/50] step=820 train_loss=0.0749 tok_s=60044.2 opt_steps=820 +[epoch 8/50] step=840 train_loss=0.0750 tok_s=60012.8 opt_steps=840 +[epoch 8/50] step=860 train_loss=0.0751 tok_s=59991.5 opt_steps=860 +[epoch 8/50] step=880 train_loss=0.0752 tok_s=59971.8 opt_steps=880 +[epoch 8/50] step=900 train_loss=0.0752 tok_s=59964.4 opt_steps=900 +[epoch 8/50] step=920 train_loss=0.0753 tok_s=59967.0 opt_steps=920 +[epoch 8/50] step=940 train_loss=0.0754 tok_s=59976.9 opt_steps=940 +[epoch 8/50] step=960 train_loss=0.0756 tok_s=59978.0 opt_steps=960 +[epoch 8/50] step=980 train_loss=0.0756 tok_s=59984.0 opt_steps=980 +[epoch 8/50] step=1000 train_loss=0.0756 tok_s=59977.5 opt_steps=1000 +[epoch 8/50] step=1020 train_loss=0.0757 tok_s=59984.4 opt_steps=1020 +[epoch 8/50] step=1040 train_loss=0.0757 tok_s=59989.7 opt_steps=1040 +[epoch 8/50] step=1060 train_loss=0.0758 tok_s=59981.5 opt_steps=1060 +[epoch 8/50] step=1080 train_loss=0.0759 tok_s=59981.8 opt_steps=1080 +[epoch 8/50] step=1100 train_loss=0.0759 tok_s=59977.0 opt_steps=1100 +[epoch 8/50] step=1120 train_loss=0.0760 tok_s=59977.6 opt_steps=1120 +[epoch 8/50] step=1140 train_loss=0.0760 tok_s=59983.1 opt_steps=1140 +[epoch 8/50] step=1160 train_loss=0.0761 tok_s=59994.3 opt_steps=1160 +[epoch 8/50] step=1180 train_loss=0.0762 tok_s=59994.7 opt_steps=1180 +[epoch 8/50] step=1200 train_loss=0.0763 tok_s=59994.1 opt_steps=1200 +[epoch 8/50] step=1220 train_loss=0.0763 tok_s=60000.7 opt_steps=1220 +[epoch 8/50] step=1240 train_loss=0.0764 tok_s=60008.3 opt_steps=1240 +[epoch 8/50] step=1260 train_loss=0.0764 tok_s=60012.2 opt_steps=1260 +[epoch 8/50] step=1280 train_loss=0.0764 tok_s=60011.8 opt_steps=1280 +[epoch 8/50] step=1300 train_loss=0.0765 tok_s=60016.2 opt_steps=1300 +[epoch 8/50] step=1320 train_loss=0.0765 tok_s=60008.3 opt_steps=1320 +[epoch 8/50] step=1340 train_loss=0.0766 tok_s=60014.4 opt_steps=1340 +[epoch 8/50] step=1360 train_loss=0.0767 tok_s=60011.0 opt_steps=1360 +[epoch 8/50] step=1380 train_loss=0.0767 tok_s=60010.5 opt_steps=1380 +[epoch 8/50] step=1400 train_loss=0.0768 tok_s=60010.6 opt_steps=1400 +[epoch 8/50] step=1420 train_loss=0.0768 tok_s=60010.2 opt_steps=1420 +[epoch 8/50] step=1440 train_loss=0.0769 tok_s=60015.4 opt_steps=1440 +[epoch 8/50] step=1460 train_loss=0.0769 tok_s=60019.2 opt_steps=1460 +[epoch 8/50] step=1480 train_loss=0.0770 tok_s=60019.6 opt_steps=1480 +[epoch 8/50] step=1500 train_loss=0.0770 tok_s=60021.1 opt_steps=1500 +[epoch 8/50] step=1520 train_loss=0.0770 tok_s=60018.4 opt_steps=1520 +[epoch 8/50] step=1540 train_loss=0.0771 tok_s=60019.2 opt_steps=1540 +[epoch 8/50] step=1560 train_loss=0.0771 tok_s=60020.7 opt_steps=1560 +[epoch 8/50] step=1580 train_loss=0.0772 tok_s=60021.2 opt_steps=1580 +[epoch 8/50] step=1600 train_loss=0.0772 tok_s=60015.0 opt_steps=1600 +[epoch 8/50] step=1620 train_loss=0.0772 tok_s=60012.6 opt_steps=1620 +[epoch 8/50] step=1640 train_loss=0.0773 tok_s=60008.5 opt_steps=1640 +[epoch 8/50] step=1660 train_loss=0.0773 tok_s=60009.6 opt_steps=1660 +[epoch 8/50] step=1680 train_loss=0.0774 tok_s=60012.7 opt_steps=1680 +[epoch 8/50] step=1700 train_loss=0.0774 tok_s=60010.2 opt_steps=1700 +[epoch 8/50] step=1720 train_loss=0.0774 tok_s=60007.1 opt_steps=1720 +[epoch 8/50] step=1740 train_loss=0.0775 tok_s=60004.6 opt_steps=1740 +[epoch 8/50] step=1760 train_loss=0.0775 tok_s=59993.0 opt_steps=1760 +[epoch 8/50] step=1780 train_loss=0.0776 tok_s=59994.2 opt_steps=1780 +[epoch 8/50] step=1800 train_loss=0.0776 tok_s=59991.3 opt_steps=1800 +[epoch 8/50] step=1820 train_loss=0.0776 tok_s=59993.3 opt_steps=1820 +[epoch 8/50] step=1840 train_loss=0.0777 tok_s=59995.3 opt_steps=1840 +[epoch 8/50] step=1860 train_loss=0.0777 tok_s=59997.6 opt_steps=1860 +[epoch 8/50] step=1880 train_loss=0.0777 tok_s=59998.0 opt_steps=1880 +[epoch 8/50] step=1900 train_loss=0.0778 tok_s=60001.5 opt_steps=1900 +[epoch 8/50] step=1920 train_loss=0.0778 tok_s=59996.4 opt_steps=1920 +[epoch 8/50] step=1940 train_loss=0.0779 tok_s=59993.0 opt_steps=1940 +[epoch 8/50] step=1960 train_loss=0.0779 tok_s=59993.4 opt_steps=1960 +[epoch 8/50] step=1980 train_loss=0.0779 tok_s=59995.1 opt_steps=1980 +[epoch 8/50] step=2000 train_loss=0.0780 tok_s=59993.9 opt_steps=2000 +[epoch 8/50] step=2020 train_loss=0.0780 tok_s=59995.0 opt_steps=2020 +[epoch 8/50] step=2040 train_loss=0.0780 tok_s=59996.2 opt_steps=2040 +[epoch 8/50] step=2060 train_loss=0.0780 tok_s=59992.8 opt_steps=2060 +[epoch 8/50] step=2080 train_loss=0.0781 tok_s=59994.8 opt_steps=2080 +[epoch 8/50] step=2100 train_loss=0.0781 tok_s=59992.4 opt_steps=2100 +[epoch 8/50] step=2120 train_loss=0.0781 tok_s=59995.1 opt_steps=2120 +[epoch 8/50] step=2140 train_loss=0.0781 tok_s=60001.4 opt_steps=2140 +[epoch 8/50] step=2160 train_loss=0.0781 tok_s=60003.6 opt_steps=2160 +[epoch 8/50] step=2180 train_loss=0.0782 tok_s=60006.9 opt_steps=2180 +[epoch 8/50] step=2200 train_loss=0.0782 tok_s=60006.1 opt_steps=2200 +[epoch 8/50] step=2220 train_loss=0.0782 tok_s=60007.4 opt_steps=2220 +[epoch 8/50] step=2240 train_loss=0.0782 tok_s=60009.0 opt_steps=2240 +[epoch 8/50] step=2260 train_loss=0.0782 tok_s=60010.7 opt_steps=2260 +[epoch 8/50] step=2280 train_loss=0.0782 tok_s=60008.2 opt_steps=2280 +[epoch 8/50] step=2300 train_loss=0.0783 tok_s=60008.8 opt_steps=2300 +[epoch 8/50] step=2320 train_loss=0.0783 tok_s=60006.6 opt_steps=2320 +[epoch 8/50] step=2340 train_loss=0.0783 tok_s=60005.5 opt_steps=2340 +[epoch 8/50] step=2360 train_loss=0.0783 tok_s=60002.5 opt_steps=2360 +[epoch 8/50] step=2380 train_loss=0.0783 tok_s=60003.7 opt_steps=2380 +[epoch 8/50] step=2400 train_loss=0.0784 tok_s=59999.2 opt_steps=2400 +[epoch 8/50] step=2420 train_loss=0.0784 tok_s=59998.4 opt_steps=2420 +[epoch 8/50] step=2440 train_loss=0.0784 tok_s=60001.9 opt_steps=2440 +[epoch 8/50] step=2460 train_loss=0.0785 tok_s=59998.4 opt_steps=2460 +[epoch 8/50] step=2480 train_loss=0.0785 tok_s=59996.5 opt_steps=2480 +[epoch 8/50] step=2500 train_loss=0.0785 tok_s=59999.8 opt_steps=2500 +[epoch 8/50] step=2520 train_loss=0.0786 tok_s=59994.3 opt_steps=2520 +[epoch 8/50] step=2540 train_loss=0.0786 tok_s=59991.5 opt_steps=2540 +[epoch 8/50] step=2560 train_loss=0.0786 tok_s=59988.9 opt_steps=2560 +[epoch 8/50] step=2580 train_loss=0.0786 tok_s=59991.3 opt_steps=2580 +[epoch 8/50] step=2600 train_loss=0.0786 tok_s=59993.1 opt_steps=2600 +[epoch 8/50] step=2620 train_loss=0.0787 tok_s=59992.2 opt_steps=2620 +[epoch 8/50] step=2640 train_loss=0.0787 tok_s=59993.8 opt_steps=2640 +[epoch 8/50] step=2660 train_loss=0.0787 tok_s=59996.6 opt_steps=2660 +[epoch 8/50] step=2680 train_loss=0.0788 tok_s=59998.7 opt_steps=2680 +[epoch 8/50] step=2700 train_loss=0.0788 tok_s=59999.1 opt_steps=2700 +[epoch 8/50] step=2720 train_loss=0.0788 tok_s=59997.4 opt_steps=2720 +[epoch 8/50] step=2740 train_loss=0.0788 tok_s=60000.1 opt_steps=2740 +[epoch 8/50] step=2760 train_loss=0.0788 tok_s=60002.9 opt_steps=2760 +[epoch 8/50] step=2780 train_loss=0.0788 tok_s=60001.2 opt_steps=2780 +[epoch 8/50] step=2800 train_loss=0.0789 tok_s=60005.1 opt_steps=2800 +[epoch 8/50] step=2820 train_loss=0.0789 tok_s=60006.8 opt_steps=2820 +[epoch 8/50] step=2840 train_loss=0.0789 tok_s=60009.2 opt_steps=2840 +[epoch 8/50] step=2860 train_loss=0.0789 tok_s=60009.4 opt_steps=2860 +[epoch 8/50] step=2880 train_loss=0.0789 tok_s=60010.2 opt_steps=2880 +[epoch 8/50] step=2900 train_loss=0.0789 tok_s=60012.8 opt_steps=2900 +[epoch 8/50] step=2920 train_loss=0.0790 tok_s=60015.7 opt_steps=2920 +[epoch 8/50] step=2940 train_loss=0.0790 tok_s=60015.4 opt_steps=2940 +[epoch 8/50] step=2960 train_loss=0.0790 tok_s=60015.7 opt_steps=2960 +[epoch 8/50] step=2980 train_loss=0.0790 tok_s=60016.4 opt_steps=2980 +[epoch 8/50] step=3000 train_loss=0.0790 tok_s=60018.6 opt_steps=3000 +[epoch 8/50] step=3020 train_loss=0.0790 tok_s=60016.7 opt_steps=3020 +[epoch 8/50] step=3040 train_loss=0.0791 tok_s=60016.2 opt_steps=3040 +[epoch 8/50] step=3060 train_loss=0.0791 tok_s=60020.0 opt_steps=3060 +[epoch 8/50] step=3080 train_loss=0.0791 tok_s=60019.8 opt_steps=3080 +[epoch 8/50] step=3100 train_loss=0.0791 tok_s=60025.1 opt_steps=3100 +[epoch 8/50] step=3120 train_loss=0.0791 tok_s=60024.4 opt_steps=3120 +[epoch 8/50] step=3140 train_loss=0.0791 tok_s=60026.8 opt_steps=3140 +[epoch 8/50] step=3160 train_loss=0.0792 tok_s=60029.6 opt_steps=3160 +[epoch 8/50] step=3180 train_loss=0.0792 tok_s=60028.8 opt_steps=3180 +[epoch 8/50] step=3200 train_loss=0.0792 tok_s=60027.0 opt_steps=3200 +[epoch 8/50] step=3220 train_loss=0.0792 tok_s=60028.2 opt_steps=3220 +[epoch 8/50] step=3240 train_loss=0.0792 tok_s=60029.6 opt_steps=3240 +[epoch 8/50] step=3260 train_loss=0.0792 tok_s=60023.1 opt_steps=3260 +[epoch 8/50] train_loss=0.0793 val_skipped tok_s=60021.9 opt_steps=3273 +[epoch 9/50] step=20 train_loss=0.0636 tok_s=57211.4 opt_steps=20 +[epoch 9/50] step=40 train_loss=0.0629 tok_s=58955.5 opt_steps=40 +[epoch 9/50] step=60 train_loss=0.0625 tok_s=59341.9 opt_steps=60 +[epoch 9/50] step=80 train_loss=0.0622 tok_s=59508.6 opt_steps=80 +[epoch 9/50] step=100 train_loss=0.0621 tok_s=59669.6 opt_steps=100 +[epoch 9/50] step=120 train_loss=0.0622 tok_s=59763.3 opt_steps=120 +[epoch 9/50] step=140 train_loss=0.0624 tok_s=59770.5 opt_steps=140 +[epoch 9/50] step=160 train_loss=0.0621 tok_s=59797.1 opt_steps=160 +[epoch 9/50] step=180 train_loss=0.0621 tok_s=59879.2 opt_steps=180 +[epoch 9/50] step=200 train_loss=0.0623 tok_s=59913.6 opt_steps=200 +[epoch 9/50] step=220 train_loss=0.0625 tok_s=59988.1 opt_steps=220 +[epoch 9/50] step=240 train_loss=0.0628 tok_s=60010.6 opt_steps=240 +[epoch 9/50] step=260 train_loss=0.0629 tok_s=60051.5 opt_steps=260 +[epoch 9/50] step=280 train_loss=0.0632 tok_s=60058.9 opt_steps=280 +[epoch 9/50] step=300 train_loss=0.0632 tok_s=60079.0 opt_steps=300 +[epoch 9/50] step=320 train_loss=0.0633 tok_s=60098.4 opt_steps=320 +[epoch 9/50] step=340 train_loss=0.0635 tok_s=60096.2 opt_steps=340 +[epoch 9/50] step=360 train_loss=0.0636 tok_s=60112.5 opt_steps=360 +[epoch 9/50] step=380 train_loss=0.0637 tok_s=60104.0 opt_steps=380 +[epoch 9/50] step=400 train_loss=0.0637 tok_s=60114.6 opt_steps=400 +[epoch 9/50] step=420 train_loss=0.0638 tok_s=60132.2 opt_steps=420 +[epoch 9/50] step=440 train_loss=0.0638 tok_s=60126.5 opt_steps=440 +[epoch 9/50] step=460 train_loss=0.0639 tok_s=60137.1 opt_steps=460 +[epoch 9/50] step=480 train_loss=0.0639 tok_s=60155.1 opt_steps=480 +[epoch 9/50] step=500 train_loss=0.0640 tok_s=60167.2 opt_steps=500 +[epoch 9/50] step=520 train_loss=0.0641 tok_s=60177.8 opt_steps=520 +[epoch 9/50] step=540 train_loss=0.0642 tok_s=60179.9 opt_steps=540 +[epoch 9/50] step=560 train_loss=0.0643 tok_s=60168.2 opt_steps=560 +[epoch 9/50] step=580 train_loss=0.0645 tok_s=60179.0 opt_steps=580 +[epoch 9/50] step=600 train_loss=0.0646 tok_s=60166.8 opt_steps=600 +[epoch 9/50] step=620 train_loss=0.0647 tok_s=60168.1 opt_steps=620 +[epoch 9/50] step=640 train_loss=0.0648 tok_s=60169.1 opt_steps=640 +[epoch 9/50] step=660 train_loss=0.0649 tok_s=60184.8 opt_steps=660 +[epoch 9/50] step=680 train_loss=0.0649 tok_s=60191.2 opt_steps=680 +[epoch 9/50] step=700 train_loss=0.0649 tok_s=60189.0 opt_steps=700 +[epoch 9/50] step=720 train_loss=0.0650 tok_s=60199.8 opt_steps=720 +[epoch 9/50] step=740 train_loss=0.0651 tok_s=60192.6 opt_steps=740 +[epoch 9/50] step=760 train_loss=0.0651 tok_s=60201.1 opt_steps=760 +[epoch 9/50] step=780 train_loss=0.0652 tok_s=60205.4 opt_steps=780 +[epoch 9/50] step=800 train_loss=0.0652 tok_s=60198.9 opt_steps=800 +[epoch 9/50] step=820 train_loss=0.0653 tok_s=60182.7 opt_steps=820 +[epoch 9/50] step=840 train_loss=0.0654 tok_s=60180.8 opt_steps=840 +[epoch 9/50] step=860 train_loss=0.0655 tok_s=60179.6 opt_steps=860 +[epoch 9/50] step=880 train_loss=0.0656 tok_s=60188.1 opt_steps=880 +[epoch 9/50] step=900 train_loss=0.0656 tok_s=60191.9 opt_steps=900 +[epoch 9/50] step=920 train_loss=0.0657 tok_s=60177.8 opt_steps=920 +[epoch 9/50] step=940 train_loss=0.0658 tok_s=60173.2 opt_steps=940 +[epoch 9/50] step=960 train_loss=0.0658 tok_s=60166.7 opt_steps=960 +[epoch 9/50] step=980 train_loss=0.0659 tok_s=60154.5 opt_steps=980 +[epoch 9/50] step=1000 train_loss=0.0659 tok_s=60148.1 opt_steps=1000 +[epoch 9/50] step=1020 train_loss=0.0660 tok_s=60136.7 opt_steps=1020 +[epoch 9/50] step=1040 train_loss=0.0660 tok_s=60126.8 opt_steps=1040 +[epoch 9/50] step=1060 train_loss=0.0661 tok_s=60123.9 opt_steps=1060 +[epoch 9/50] step=1080 train_loss=0.0661 tok_s=60083.7 opt_steps=1080 +[epoch 9/50] step=1100 train_loss=0.0662 tok_s=60079.0 opt_steps=1100 +[epoch 9/50] step=1120 train_loss=0.0662 tok_s=60080.2 opt_steps=1120 +[epoch 9/50] step=1140 train_loss=0.0663 tok_s=60075.1 opt_steps=1140 +[epoch 9/50] step=1160 train_loss=0.0664 tok_s=60074.5 opt_steps=1160 +[epoch 9/50] step=1180 train_loss=0.0664 tok_s=60073.4 opt_steps=1180 +[epoch 9/50] step=1200 train_loss=0.0665 tok_s=60082.4 opt_steps=1200 +[epoch 9/50] step=1220 train_loss=0.0665 tok_s=60084.0 opt_steps=1220 +[epoch 9/50] step=1240 train_loss=0.0666 tok_s=60074.6 opt_steps=1240 +[epoch 9/50] step=1260 train_loss=0.0667 tok_s=60073.1 opt_steps=1260 +[epoch 9/50] step=1280 train_loss=0.0667 tok_s=60067.8 opt_steps=1280 +[epoch 9/50] step=1300 train_loss=0.0668 tok_s=60068.5 opt_steps=1300 +[epoch 9/50] step=1320 train_loss=0.0668 tok_s=60072.7 opt_steps=1320 +[epoch 9/50] step=1340 train_loss=0.0669 tok_s=60071.4 opt_steps=1340 +[epoch 9/50] step=1360 train_loss=0.0670 tok_s=60075.2 opt_steps=1360 +[epoch 9/50] step=1380 train_loss=0.0670 tok_s=60075.9 opt_steps=1380 +[epoch 9/50] step=1400 train_loss=0.0671 tok_s=60080.4 opt_steps=1400 +[epoch 9/50] step=1420 train_loss=0.0671 tok_s=60082.7 opt_steps=1420 +[epoch 9/50] step=1440 train_loss=0.0672 tok_s=60084.6 opt_steps=1440 +[epoch 9/50] step=1460 train_loss=0.0672 tok_s=60085.9 opt_steps=1460 +[epoch 9/50] step=1480 train_loss=0.0673 tok_s=60082.4 opt_steps=1480 +[epoch 9/50] step=1500 train_loss=0.0673 tok_s=60086.7 opt_steps=1500 +[epoch 9/50] step=1520 train_loss=0.0673 tok_s=60092.4 opt_steps=1520 +[epoch 9/50] step=1540 train_loss=0.0674 tok_s=60087.0 opt_steps=1540 +[epoch 9/50] step=1560 train_loss=0.0674 tok_s=60089.1 opt_steps=1560 +[epoch 9/50] step=1580 train_loss=0.0674 tok_s=60087.3 opt_steps=1580 +[epoch 9/50] step=1600 train_loss=0.0675 tok_s=60092.0 opt_steps=1600 +[epoch 9/50] step=1620 train_loss=0.0675 tok_s=60095.6 opt_steps=1620 +[epoch 9/50] step=1640 train_loss=0.0676 tok_s=60095.9 opt_steps=1640 +[epoch 9/50] step=1660 train_loss=0.0676 tok_s=60104.4 opt_steps=1660 +[epoch 9/50] step=1680 train_loss=0.0677 tok_s=60104.8 opt_steps=1680 +[epoch 9/50] step=1700 train_loss=0.0677 tok_s=60105.0 opt_steps=1700 +[epoch 9/50] step=1720 train_loss=0.0677 tok_s=60104.1 opt_steps=1720 +[epoch 9/50] step=1740 train_loss=0.0678 tok_s=60096.7 opt_steps=1740 +[epoch 9/50] step=1760 train_loss=0.0678 tok_s=60093.5 opt_steps=1760 +[epoch 9/50] step=1780 train_loss=0.0678 tok_s=60093.7 opt_steps=1780 +[epoch 9/50] step=1800 train_loss=0.0679 tok_s=60101.1 opt_steps=1800 +[epoch 9/50] step=1820 train_loss=0.0679 tok_s=60098.7 opt_steps=1820 +[epoch 9/50] step=1840 train_loss=0.0679 tok_s=60094.3 opt_steps=1840 +[epoch 9/50] step=1860 train_loss=0.0680 tok_s=60094.5 opt_steps=1860 +[epoch 9/50] step=1880 train_loss=0.0680 tok_s=60093.9 opt_steps=1880 +[epoch 9/50] step=1900 train_loss=0.0680 tok_s=60089.8 opt_steps=1900 +[epoch 9/50] step=1920 train_loss=0.0681 tok_s=60088.1 opt_steps=1920 +[epoch 9/50] step=1940 train_loss=0.0681 tok_s=60090.0 opt_steps=1940 +[epoch 9/50] step=1960 train_loss=0.0681 tok_s=60095.0 opt_steps=1960 +[epoch 9/50] step=1980 train_loss=0.0682 tok_s=60094.2 opt_steps=1980 +[epoch 9/50] step=2000 train_loss=0.0682 tok_s=60090.5 opt_steps=2000 +[epoch 9/50] step=2020 train_loss=0.0682 tok_s=60090.4 opt_steps=2020 +[epoch 9/50] step=2040 train_loss=0.0682 tok_s=60091.2 opt_steps=2040 +[epoch 9/50] step=2060 train_loss=0.0683 tok_s=60094.0 opt_steps=2060 +[epoch 9/50] step=2080 train_loss=0.0683 tok_s=60095.5 opt_steps=2080 +[epoch 9/50] step=2100 train_loss=0.0683 tok_s=60099.2 opt_steps=2100 +[epoch 9/50] step=2120 train_loss=0.0684 tok_s=60099.6 opt_steps=2120 +[epoch 9/50] step=2140 train_loss=0.0684 tok_s=60102.3 opt_steps=2140 +[epoch 9/50] step=2160 train_loss=0.0685 tok_s=60104.0 opt_steps=2160 +[epoch 9/50] step=2180 train_loss=0.0685 tok_s=60109.8 opt_steps=2180 +[epoch 9/50] step=2200 train_loss=0.0685 tok_s=60108.7 opt_steps=2200 +[epoch 9/50] step=2220 train_loss=0.0686 tok_s=60109.2 opt_steps=2220 +[epoch 9/50] step=2240 train_loss=0.0686 tok_s=60113.7 opt_steps=2240 +[epoch 9/50] step=2260 train_loss=0.0686 tok_s=60113.6 opt_steps=2260 +[epoch 9/50] step=2280 train_loss=0.0687 tok_s=60113.7 opt_steps=2280 +[epoch 9/50] step=2300 train_loss=0.0687 tok_s=60114.6 opt_steps=2300 +[epoch 9/50] step=2320 train_loss=0.0687 tok_s=60112.2 opt_steps=2320 +[epoch 9/50] step=2340 train_loss=0.0688 tok_s=60110.9 opt_steps=2340 +[epoch 9/50] step=2360 train_loss=0.0688 tok_s=60114.8 opt_steps=2360 +[epoch 9/50] step=2380 train_loss=0.0688 tok_s=60112.6 opt_steps=2380 +[epoch 9/50] step=2400 train_loss=0.0688 tok_s=60111.3 opt_steps=2400 +[epoch 9/50] step=2420 train_loss=0.0688 tok_s=60110.2 opt_steps=2420 +[epoch 9/50] step=2440 train_loss=0.0689 tok_s=60115.2 opt_steps=2440 +[epoch 9/50] step=2460 train_loss=0.0689 tok_s=60115.1 opt_steps=2460 +[epoch 9/50] step=2480 train_loss=0.0689 tok_s=60119.5 opt_steps=2480 +[epoch 9/50] step=2500 train_loss=0.0690 tok_s=60119.4 opt_steps=2500 +[epoch 9/50] step=2520 train_loss=0.0690 tok_s=60122.5 opt_steps=2520 +[epoch 9/50] step=2540 train_loss=0.0690 tok_s=60123.5 opt_steps=2540 +[epoch 9/50] step=2560 train_loss=0.0690 tok_s=60121.2 opt_steps=2560 +[epoch 9/50] step=2580 train_loss=0.0691 tok_s=60117.9 opt_steps=2580 +[epoch 9/50] step=2600 train_loss=0.0691 tok_s=60115.6 opt_steps=2600 +[epoch 9/50] step=2620 train_loss=0.0691 tok_s=60114.5 opt_steps=2620 +[epoch 9/50] step=2640 train_loss=0.0692 tok_s=60113.1 opt_steps=2640 +[epoch 9/50] step=2660 train_loss=0.0692 tok_s=60113.5 opt_steps=2660 +[epoch 9/50] step=2680 train_loss=0.0692 tok_s=60112.1 opt_steps=2680 +[epoch 9/50] step=2700 train_loss=0.0692 tok_s=60111.1 opt_steps=2700 +[epoch 9/50] step=2720 train_loss=0.0693 tok_s=60107.0 opt_steps=2720 +[epoch 9/50] step=2740 train_loss=0.0693 tok_s=60105.2 opt_steps=2740 +[epoch 9/50] step=2760 train_loss=0.0693 tok_s=60105.0 opt_steps=2760 +[epoch 9/50] step=2780 train_loss=0.0693 tok_s=60106.2 opt_steps=2780 +[epoch 9/50] step=2800 train_loss=0.0694 tok_s=60105.9 opt_steps=2800 +[epoch 9/50] step=2820 train_loss=0.0694 tok_s=60106.7 opt_steps=2820 +[epoch 9/50] step=2840 train_loss=0.0694 tok_s=60106.9 opt_steps=2840 +[epoch 9/50] step=2860 train_loss=0.0694 tok_s=60108.3 opt_steps=2860 +[epoch 9/50] step=2880 train_loss=0.0694 tok_s=60110.6 opt_steps=2880 +[epoch 9/50] step=2900 train_loss=0.0695 tok_s=60110.5 opt_steps=2900 +[epoch 9/50] step=2920 train_loss=0.0695 tok_s=60110.3 opt_steps=2920 +[epoch 9/50] step=2940 train_loss=0.0695 tok_s=60108.6 opt_steps=2940 +[epoch 9/50] step=2960 train_loss=0.0695 tok_s=60113.2 opt_steps=2960 +[epoch 9/50] step=2980 train_loss=0.0696 tok_s=60113.7 opt_steps=2980 +[epoch 9/50] step=3000 train_loss=0.0696 tok_s=60115.7 opt_steps=3000 +[epoch 9/50] step=3020 train_loss=0.0696 tok_s=60117.2 opt_steps=3020 +[epoch 9/50] step=3040 train_loss=0.0697 tok_s=60115.0 opt_steps=3040 +[epoch 9/50] step=3060 train_loss=0.0697 tok_s=60115.3 opt_steps=3060 +[epoch 9/50] step=3080 train_loss=0.0697 tok_s=60115.1 opt_steps=3080 +[epoch 9/50] step=3100 train_loss=0.0697 tok_s=60113.6 opt_steps=3100 +[epoch 9/50] step=3120 train_loss=0.0697 tok_s=60117.2 opt_steps=3120 +[epoch 9/50] step=3140 train_loss=0.0698 tok_s=60113.1 opt_steps=3140 +[epoch 9/50] step=3160 train_loss=0.0698 tok_s=60112.0 opt_steps=3160 +[epoch 9/50] step=3180 train_loss=0.0698 tok_s=60111.6 opt_steps=3180 +[epoch 9/50] step=3200 train_loss=0.0698 tok_s=60111.5 opt_steps=3200 +[epoch 9/50] step=3220 train_loss=0.0698 tok_s=60113.0 opt_steps=3220 +[epoch 9/50] step=3240 train_loss=0.0698 tok_s=60112.0 opt_steps=3240 +[epoch 9/50] step=3260 train_loss=0.0698 tok_s=60104.9 opt_steps=3260 +[epoch 9/50] train_loss=0.0698 val_skipped tok_s=60106.1 opt_steps=3273 +[epoch 10/50] step=20 train_loss=0.0548 tok_s=57467.7 opt_steps=20 +[epoch 10/50] step=40 train_loss=0.0540 tok_s=58681.0 opt_steps=40 +[epoch 10/50] step=60 train_loss=0.0536 tok_s=59263.6 opt_steps=60 +[epoch 10/50] step=80 train_loss=0.0540 tok_s=59506.7 opt_steps=80 +[epoch 10/50] step=100 train_loss=0.0540 tok_s=59647.0 opt_steps=100 +[epoch 10/50] step=120 train_loss=0.0539 tok_s=59760.8 opt_steps=120 +[epoch 10/50] step=140 train_loss=0.0541 tok_s=59797.9 opt_steps=140 +[epoch 10/50] step=160 train_loss=0.0542 tok_s=59785.9 opt_steps=160 +[epoch 10/50] step=180 train_loss=0.0542 tok_s=59786.7 opt_steps=180 +[epoch 10/50] step=200 train_loss=0.0543 tok_s=59806.2 opt_steps=200 +[epoch 10/50] step=220 train_loss=0.0543 tok_s=59813.8 opt_steps=220 +[epoch 10/50] step=240 train_loss=0.0544 tok_s=59865.3 opt_steps=240 +[epoch 10/50] step=260 train_loss=0.0544 tok_s=59861.4 opt_steps=260 +[epoch 10/50] step=280 train_loss=0.0544 tok_s=59906.2 opt_steps=280 +[epoch 10/50] step=300 train_loss=0.0545 tok_s=59957.2 opt_steps=300 +[epoch 10/50] step=320 train_loss=0.0546 tok_s=59987.0 opt_steps=320 +[epoch 10/50] step=340 train_loss=0.0547 tok_s=60009.8 opt_steps=340 +[epoch 10/50] step=360 train_loss=0.0548 tok_s=59995.4 opt_steps=360 +[epoch 10/50] step=380 train_loss=0.0549 tok_s=60011.8 opt_steps=380 +[epoch 10/50] step=400 train_loss=0.0550 tok_s=60003.1 opt_steps=400 +[epoch 10/50] step=420 train_loss=0.0551 tok_s=60009.9 opt_steps=420 +[epoch 10/50] step=440 train_loss=0.0552 tok_s=60028.0 opt_steps=440 +[epoch 10/50] step=460 train_loss=0.0554 tok_s=60039.7 opt_steps=460 +[epoch 10/50] step=480 train_loss=0.0555 tok_s=60039.2 opt_steps=480 +[epoch 10/50] step=500 train_loss=0.0556 tok_s=60042.1 opt_steps=500 +[epoch 10/50] step=520 train_loss=0.0556 tok_s=60037.9 opt_steps=520 +[epoch 10/50] step=540 train_loss=0.0558 tok_s=60032.2 opt_steps=540 +[epoch 10/50] step=560 train_loss=0.0558 tok_s=60053.8 opt_steps=560 +[epoch 10/50] step=580 train_loss=0.0559 tok_s=60054.3 opt_steps=580 +[epoch 10/50] step=600 train_loss=0.0560 tok_s=60051.3 opt_steps=600 +[epoch 10/50] step=620 train_loss=0.0561 tok_s=60063.2 opt_steps=620 +[epoch 10/50] step=640 train_loss=0.0562 tok_s=60073.1 opt_steps=640 +[epoch 10/50] step=660 train_loss=0.0562 tok_s=60079.6 opt_steps=660 +[epoch 10/50] step=680 train_loss=0.0564 tok_s=60075.4 opt_steps=680 +[epoch 10/50] step=700 train_loss=0.0565 tok_s=60060.9 opt_steps=700 +[epoch 10/50] step=720 train_loss=0.0565 tok_s=60062.2 opt_steps=720 +[epoch 10/50] step=740 train_loss=0.0566 tok_s=60063.0 opt_steps=740 +[epoch 10/50] step=760 train_loss=0.0567 tok_s=60060.8 opt_steps=760 +[epoch 10/50] step=780 train_loss=0.0569 tok_s=60055.2 opt_steps=780 +[epoch 10/50] step=800 train_loss=0.0570 tok_s=60053.3 opt_steps=800 +[epoch 10/50] step=820 train_loss=0.0571 tok_s=60059.8 opt_steps=820 +[epoch 10/50] step=840 train_loss=0.0572 tok_s=60051.8 opt_steps=840 +[epoch 10/50] step=860 train_loss=0.0572 tok_s=60055.4 opt_steps=860 +[epoch 10/50] step=880 train_loss=0.0573 tok_s=60075.7 opt_steps=880 +[epoch 10/50] step=900 train_loss=0.0573 tok_s=60080.8 opt_steps=900 +[epoch 10/50] step=920 train_loss=0.0574 tok_s=60080.0 opt_steps=920 +[epoch 10/50] step=940 train_loss=0.0574 tok_s=60068.3 opt_steps=940 +[epoch 10/50] step=960 train_loss=0.0575 tok_s=60067.2 opt_steps=960 +[epoch 10/50] step=980 train_loss=0.0575 tok_s=60069.6 opt_steps=980 +[epoch 10/50] step=1000 train_loss=0.0576 tok_s=60066.8 opt_steps=1000 +[epoch 10/50] step=1020 train_loss=0.0577 tok_s=60071.1 opt_steps=1020 +[epoch 10/50] step=1040 train_loss=0.0578 tok_s=60073.4 opt_steps=1040 +[epoch 10/50] step=1060 train_loss=0.0579 tok_s=60076.8 opt_steps=1060 +[epoch 10/50] step=1080 train_loss=0.0579 tok_s=60065.2 opt_steps=1080 +[epoch 10/50] step=1100 train_loss=0.0580 tok_s=60068.3 opt_steps=1100 +[epoch 10/50] step=1120 train_loss=0.0580 tok_s=60068.8 opt_steps=1120 +[epoch 10/50] step=1140 train_loss=0.0581 tok_s=60067.8 opt_steps=1140 +[epoch 10/50] step=1160 train_loss=0.0581 tok_s=60062.6 opt_steps=1160 +[epoch 10/50] step=1180 train_loss=0.0582 tok_s=60067.2 opt_steps=1180 +[epoch 10/50] step=1200 train_loss=0.0582 tok_s=60069.7 opt_steps=1200 +[epoch 10/50] step=1220 train_loss=0.0582 tok_s=60070.6 opt_steps=1220 +[epoch 10/50] step=1240 train_loss=0.0583 tok_s=60073.5 opt_steps=1240 +[epoch 10/50] step=1260 train_loss=0.0584 tok_s=60075.5 opt_steps=1260 +[epoch 10/50] step=1280 train_loss=0.0584 tok_s=60077.1 opt_steps=1280 +[epoch 10/50] step=1300 train_loss=0.0584 tok_s=60068.7 opt_steps=1300 +[epoch 10/50] step=1320 train_loss=0.0585 tok_s=60062.1 opt_steps=1320 +[epoch 10/50] step=1340 train_loss=0.0586 tok_s=60065.3 opt_steps=1340 +[epoch 10/50] step=1360 train_loss=0.0586 tok_s=60067.9 opt_steps=1360 +[epoch 10/50] step=1380 train_loss=0.0587 tok_s=60062.8 opt_steps=1380 +[epoch 10/50] step=1400 train_loss=0.0587 tok_s=60068.5 opt_steps=1400 +[epoch 10/50] step=1420 train_loss=0.0588 tok_s=60071.6 opt_steps=1420 +[epoch 10/50] step=1440 train_loss=0.0588 tok_s=60077.0 opt_steps=1440 +[epoch 10/50] step=1460 train_loss=0.0589 tok_s=60074.7 opt_steps=1460 +[epoch 10/50] step=1480 train_loss=0.0589 tok_s=60077.0 opt_steps=1480 +[epoch 10/50] step=1500 train_loss=0.0590 tok_s=60080.5 opt_steps=1500 +[epoch 10/50] step=1520 train_loss=0.0590 tok_s=60083.2 opt_steps=1520 +[epoch 10/50] step=1540 train_loss=0.0591 tok_s=60085.5 opt_steps=1540 +[epoch 10/50] step=1560 train_loss=0.0591 tok_s=60081.5 opt_steps=1560 +[epoch 10/50] step=1580 train_loss=0.0592 tok_s=60077.5 opt_steps=1580 +[epoch 10/50] step=1600 train_loss=0.0592 tok_s=60076.1 opt_steps=1600 +[epoch 10/50] step=1620 train_loss=0.0592 tok_s=60074.2 opt_steps=1620 +[epoch 10/50] step=1640 train_loss=0.0593 tok_s=60074.8 opt_steps=1640 +[epoch 10/50] step=1660 train_loss=0.0593 tok_s=60073.4 opt_steps=1660 +[epoch 10/50] step=1680 train_loss=0.0594 tok_s=60074.4 opt_steps=1680 +[epoch 10/50] step=1700 train_loss=0.0594 tok_s=60071.1 opt_steps=1700 +[epoch 10/50] step=1720 train_loss=0.0594 tok_s=60074.2 opt_steps=1720 +[epoch 10/50] step=1740 train_loss=0.0595 tok_s=60082.1 opt_steps=1740 +[epoch 10/50] step=1760 train_loss=0.0596 tok_s=60081.7 opt_steps=1760 +[epoch 10/50] step=1780 train_loss=0.0596 tok_s=60086.6 opt_steps=1780 +[epoch 10/50] step=1800 train_loss=0.0597 tok_s=60087.7 opt_steps=1800 +[epoch 10/50] step=1820 train_loss=0.0597 tok_s=60087.8 opt_steps=1820 +[epoch 10/50] step=1840 train_loss=0.0598 tok_s=60084.6 opt_steps=1840 +[epoch 10/50] step=1860 train_loss=0.0598 tok_s=60084.8 opt_steps=1860 +[epoch 10/50] step=1880 train_loss=0.0599 tok_s=60083.4 opt_steps=1880 +[epoch 10/50] step=1900 train_loss=0.0599 tok_s=60085.6 opt_steps=1900 +[epoch 10/50] step=1920 train_loss=0.0599 tok_s=60091.2 opt_steps=1920 +[epoch 10/50] step=1940 train_loss=0.0600 tok_s=60091.3 opt_steps=1940 +[epoch 10/50] step=1960 train_loss=0.0600 tok_s=60090.3 opt_steps=1960 +[epoch 10/50] step=1980 train_loss=0.0601 tok_s=60095.6 opt_steps=1980 +[epoch 10/50] step=2000 train_loss=0.0601 tok_s=60099.5 opt_steps=2000 +[epoch 10/50] step=2020 train_loss=0.0602 tok_s=60105.7 opt_steps=2020 +[epoch 10/50] step=2040 train_loss=0.0602 tok_s=60105.7 opt_steps=2040 +[epoch 10/50] step=2060 train_loss=0.0602 tok_s=60106.1 opt_steps=2060 +[epoch 10/50] step=2080 train_loss=0.0602 tok_s=60112.4 opt_steps=2080 +[epoch 10/50] step=2100 train_loss=0.0603 tok_s=60114.7 opt_steps=2100 +[epoch 10/50] step=2120 train_loss=0.0603 tok_s=60114.0 opt_steps=2120 +[epoch 10/50] step=2140 train_loss=0.0603 tok_s=60113.1 opt_steps=2140 +[epoch 10/50] step=2160 train_loss=0.0604 tok_s=60116.3 opt_steps=2160 +[epoch 10/50] step=2180 train_loss=0.0604 tok_s=60113.7 opt_steps=2180 +[epoch 10/50] step=2200 train_loss=0.0604 tok_s=60112.7 opt_steps=2200 +[epoch 10/50] step=2220 train_loss=0.0605 tok_s=60110.3 opt_steps=2220 +[epoch 10/50] step=2240 train_loss=0.0605 tok_s=60113.1 opt_steps=2240 +[epoch 10/50] step=2260 train_loss=0.0605 tok_s=60114.8 opt_steps=2260 +[epoch 10/50] step=2280 train_loss=0.0606 tok_s=60113.5 opt_steps=2280 +[epoch 10/50] step=2300 train_loss=0.0606 tok_s=60112.3 opt_steps=2300 +[epoch 10/50] step=2320 train_loss=0.0606 tok_s=60107.4 opt_steps=2320 +[epoch 10/50] step=2340 train_loss=0.0607 tok_s=60106.4 opt_steps=2340 +[epoch 10/50] step=2360 train_loss=0.0607 tok_s=60110.0 opt_steps=2360 +[epoch 10/50] step=2380 train_loss=0.0607 tok_s=60107.5 opt_steps=2380 +[epoch 10/50] step=2400 train_loss=0.0608 tok_s=60104.4 opt_steps=2400 +[epoch 10/50] step=2420 train_loss=0.0608 tok_s=60105.1 opt_steps=2420 +[epoch 10/50] step=2440 train_loss=0.0608 tok_s=60104.0 opt_steps=2440 +[epoch 10/50] step=2460 train_loss=0.0609 tok_s=60106.2 opt_steps=2460 +[epoch 10/50] step=2480 train_loss=0.0609 tok_s=60106.2 opt_steps=2480 +[epoch 10/50] step=2500 train_loss=0.0609 tok_s=60111.8 opt_steps=2500 +[epoch 10/50] step=2520 train_loss=0.0609 tok_s=60113.3 opt_steps=2520 +[epoch 10/50] step=2540 train_loss=0.0610 tok_s=60111.2 opt_steps=2540 +[epoch 10/50] step=2560 train_loss=0.0610 tok_s=60109.5 opt_steps=2560 +[epoch 10/50] step=2580 train_loss=0.0610 tok_s=60108.2 opt_steps=2580 +[epoch 10/50] step=2600 train_loss=0.0611 tok_s=60106.5 opt_steps=2600 +[epoch 10/50] step=2620 train_loss=0.0611 tok_s=60104.1 opt_steps=2620 +[epoch 10/50] step=2640 train_loss=0.0611 tok_s=60107.3 opt_steps=2640 +[epoch 10/50] step=2660 train_loss=0.0611 tok_s=60109.2 opt_steps=2660 +[epoch 10/50] step=2680 train_loss=0.0612 tok_s=60107.5 opt_steps=2680 +[epoch 10/50] step=2700 train_loss=0.0612 tok_s=60108.2 opt_steps=2700 +[epoch 10/50] step=2720 train_loss=0.0612 tok_s=60107.1 opt_steps=2720 +[epoch 10/50] step=2740 train_loss=0.0613 tok_s=60109.2 opt_steps=2740 +[epoch 10/50] step=2760 train_loss=0.0613 tok_s=60112.2 opt_steps=2760 +[epoch 10/50] step=2780 train_loss=0.0613 tok_s=60112.0 opt_steps=2780 +[epoch 10/50] step=2800 train_loss=0.0613 tok_s=60114.9 opt_steps=2800 +[epoch 10/50] step=2820 train_loss=0.0614 tok_s=60114.3 opt_steps=2820 +[epoch 10/50] step=2840 train_loss=0.0614 tok_s=60113.3 opt_steps=2840 +[epoch 10/50] step=2860 train_loss=0.0614 tok_s=60108.9 opt_steps=2860 +[epoch 10/50] step=2880 train_loss=0.0615 tok_s=60107.5 opt_steps=2880 +[epoch 10/50] step=2900 train_loss=0.0615 tok_s=60107.9 opt_steps=2900 +[epoch 10/50] step=2920 train_loss=0.0615 tok_s=60108.1 opt_steps=2920 +[epoch 10/50] step=2940 train_loss=0.0615 tok_s=60106.7 opt_steps=2940 +[epoch 10/50] step=2960 train_loss=0.0616 tok_s=60106.3 opt_steps=2960 +[epoch 10/50] step=2980 train_loss=0.0616 tok_s=60102.3 opt_steps=2980 +[epoch 10/50] step=3000 train_loss=0.0616 tok_s=60102.6 opt_steps=3000 +[epoch 10/50] step=3020 train_loss=0.0616 tok_s=60100.7 opt_steps=3020 +[epoch 10/50] step=3040 train_loss=0.0617 tok_s=60100.0 opt_steps=3040 +[epoch 10/50] step=3060 train_loss=0.0617 tok_s=60102.2 opt_steps=3060 +[epoch 10/50] step=3080 train_loss=0.0617 tok_s=60101.9 opt_steps=3080 +[epoch 10/50] step=3100 train_loss=0.0618 tok_s=60099.4 opt_steps=3100 +[epoch 10/50] step=3120 train_loss=0.0618 tok_s=60100.7 opt_steps=3120 +[epoch 10/50] step=3140 train_loss=0.0618 tok_s=60101.9 opt_steps=3140 +[epoch 10/50] step=3160 train_loss=0.0618 tok_s=60101.9 opt_steps=3160 +[epoch 10/50] step=3180 train_loss=0.0618 tok_s=60098.9 opt_steps=3180 +[epoch 10/50] step=3200 train_loss=0.0618 tok_s=60099.1 opt_steps=3200 +[epoch 10/50] step=3220 train_loss=0.0619 tok_s=60099.5 opt_steps=3220 +[epoch 10/50] step=3240 train_loss=0.0619 tok_s=60099.1 opt_steps=3240 +[epoch 10/50] step=3260 train_loss=0.0619 tok_s=60099.2 opt_steps=3260 +[epoch 10/50] train_loss=0.0619 val_skipped tok_s=60100.2 opt_steps=3273 +[epoch 11/50] step=20 train_loss=0.0479 tok_s=58186.3 opt_steps=20 +[epoch 11/50] step=40 train_loss=0.0474 tok_s=59024.3 opt_steps=40 +[epoch 11/50] step=60 train_loss=0.0476 tok_s=59223.0 opt_steps=60 +[epoch 11/50] step=80 train_loss=0.0477 tok_s=59567.0 opt_steps=80 +[epoch 11/50] step=100 train_loss=0.0478 tok_s=59746.4 opt_steps=100 +[epoch 11/50] step=120 train_loss=0.0478 tok_s=59781.8 opt_steps=120 +[epoch 11/50] step=140 train_loss=0.0479 tok_s=59871.7 opt_steps=140 +[epoch 11/50] step=160 train_loss=0.0480 tok_s=59948.5 opt_steps=160 +[epoch 11/50] step=180 train_loss=0.0481 tok_s=59920.1 opt_steps=180 +[epoch 11/50] step=200 train_loss=0.0482 tok_s=59958.1 opt_steps=200 +[epoch 11/50] step=220 train_loss=0.0484 tok_s=60012.6 opt_steps=220 +[epoch 11/50] step=240 train_loss=0.0484 tok_s=60056.8 opt_steps=240 +[epoch 11/50] step=260 train_loss=0.0485 tok_s=60101.1 opt_steps=260 +[epoch 11/50] step=280 train_loss=0.0486 tok_s=60111.0 opt_steps=280 +[epoch 11/50] step=300 train_loss=0.0487 tok_s=60140.4 opt_steps=300 +[epoch 11/50] step=320 train_loss=0.0488 tok_s=60146.6 opt_steps=320 +[epoch 11/50] step=340 train_loss=0.0489 tok_s=60149.8 opt_steps=340 +[epoch 11/50] step=360 train_loss=0.0490 tok_s=60130.3 opt_steps=360 +[epoch 11/50] step=380 train_loss=0.0491 tok_s=60135.9 opt_steps=380 +[epoch 11/50] step=400 train_loss=0.0492 tok_s=60133.3 opt_steps=400 +[epoch 11/50] step=420 train_loss=0.0493 tok_s=60116.1 opt_steps=420 +[epoch 11/50] step=440 train_loss=0.0494 tok_s=60112.2 opt_steps=440 +[epoch 11/50] step=460 train_loss=0.0494 tok_s=60087.8 opt_steps=460 +[epoch 11/50] step=480 train_loss=0.0495 tok_s=60073.2 opt_steps=480 +[epoch 11/50] step=500 train_loss=0.0497 tok_s=60059.1 opt_steps=500 +[epoch 11/50] step=520 train_loss=0.0497 tok_s=60064.9 opt_steps=520 +[epoch 11/50] step=540 train_loss=0.0498 tok_s=60054.6 opt_steps=540 +[epoch 11/50] step=560 train_loss=0.0498 tok_s=60036.9 opt_steps=560 +[epoch 11/50] step=580 train_loss=0.0498 tok_s=60064.9 opt_steps=580 +[epoch 11/50] step=600 train_loss=0.0499 tok_s=60071.4 opt_steps=600 +[epoch 11/50] step=620 train_loss=0.0500 tok_s=60070.2 opt_steps=620 +[epoch 11/50] step=640 train_loss=0.0501 tok_s=60071.1 opt_steps=640 +[epoch 11/50] step=660 train_loss=0.0502 tok_s=60076.5 opt_steps=660 +[epoch 11/50] step=680 train_loss=0.0503 tok_s=60068.9 opt_steps=680 +[epoch 11/50] step=700 train_loss=0.0503 tok_s=60070.8 opt_steps=700 +[epoch 11/50] step=720 train_loss=0.0504 tok_s=60064.9 opt_steps=720 +[epoch 11/50] step=740 train_loss=0.0504 tok_s=60066.3 opt_steps=740 +[epoch 11/50] step=760 train_loss=0.0505 tok_s=60064.1 opt_steps=760 +[epoch 11/50] step=780 train_loss=0.0506 tok_s=60054.4 opt_steps=780 +[epoch 11/50] step=800 train_loss=0.0507 tok_s=60058.2 opt_steps=800 +[epoch 11/50] step=820 train_loss=0.0508 tok_s=60062.3 opt_steps=820 +[epoch 11/50] step=840 train_loss=0.0509 tok_s=60060.2 opt_steps=840 +[epoch 11/50] step=860 train_loss=0.0510 tok_s=60062.3 opt_steps=860 +[epoch 11/50] step=880 train_loss=0.0510 tok_s=60062.6 opt_steps=880 +[epoch 11/50] step=900 train_loss=0.0511 tok_s=60054.2 opt_steps=900 +[epoch 11/50] step=920 train_loss=0.0512 tok_s=60053.5 opt_steps=920 +[epoch 11/50] step=940 train_loss=0.0512 tok_s=60055.4 opt_steps=940 +[epoch 11/50] step=960 train_loss=0.0513 tok_s=60050.9 opt_steps=960 +[epoch 11/50] step=980 train_loss=0.0513 tok_s=60047.6 opt_steps=980 +[epoch 11/50] step=1000 train_loss=0.0513 tok_s=60049.8 opt_steps=1000 +[epoch 11/50] step=1020 train_loss=0.0514 tok_s=60047.3 opt_steps=1020 +[epoch 11/50] step=1040 train_loss=0.0515 tok_s=60049.4 opt_steps=1040 +[epoch 11/50] step=1060 train_loss=0.0515 tok_s=60056.6 opt_steps=1060 +[epoch 11/50] step=1080 train_loss=0.0516 tok_s=60050.9 opt_steps=1080 +[epoch 11/50] step=1100 train_loss=0.0516 tok_s=60052.3 opt_steps=1100 +[epoch 11/50] step=1120 train_loss=0.0516 tok_s=60050.3 opt_steps=1120 +[epoch 11/50] step=1140 train_loss=0.0517 tok_s=60049.2 opt_steps=1140 +[epoch 11/50] step=1160 train_loss=0.0518 tok_s=60049.5 opt_steps=1160 +[epoch 11/50] step=1180 train_loss=0.0518 tok_s=60048.5 opt_steps=1180 +[epoch 11/50] step=1200 train_loss=0.0519 tok_s=60040.5 opt_steps=1200 +[epoch 11/50] step=1220 train_loss=0.0520 tok_s=60042.0 opt_steps=1220 +[epoch 11/50] step=1240 train_loss=0.0520 tok_s=60040.8 opt_steps=1240 +[epoch 11/50] step=1260 train_loss=0.0521 tok_s=60050.8 opt_steps=1260 +[epoch 11/50] step=1280 train_loss=0.0521 tok_s=60048.6 opt_steps=1280 +[epoch 11/50] step=1300 train_loss=0.0521 tok_s=60052.7 opt_steps=1300 +[epoch 11/50] step=1320 train_loss=0.0522 tok_s=60063.7 opt_steps=1320 +[epoch 11/50] step=1340 train_loss=0.0523 tok_s=60062.7 opt_steps=1340 +[epoch 11/50] step=1360 train_loss=0.0523 tok_s=60053.5 opt_steps=1360 +[epoch 11/50] step=1380 train_loss=0.0524 tok_s=60054.3 opt_steps=1380 +[epoch 11/50] step=1400 train_loss=0.0524 tok_s=60056.3 opt_steps=1400 +[epoch 11/50] step=1420 train_loss=0.0525 tok_s=60059.3 opt_steps=1420 +[epoch 11/50] step=1440 train_loss=0.0525 tok_s=60064.5 opt_steps=1440 +[epoch 11/50] step=1460 train_loss=0.0525 tok_s=60063.3 opt_steps=1460 +[epoch 11/50] step=1480 train_loss=0.0526 tok_s=60062.8 opt_steps=1480 +[epoch 11/50] step=1500 train_loss=0.0527 tok_s=60069.9 opt_steps=1500 +[epoch 11/50] step=1520 train_loss=0.0527 tok_s=60071.2 opt_steps=1520 +[epoch 11/50] step=1540 train_loss=0.0528 tok_s=60071.3 opt_steps=1540 +[epoch 11/50] step=1560 train_loss=0.0528 tok_s=60072.3 opt_steps=1560 +[epoch 11/50] step=1580 train_loss=0.0528 tok_s=60072.3 opt_steps=1580 +[epoch 11/50] step=1600 train_loss=0.0529 tok_s=60076.4 opt_steps=1600 +[epoch 11/50] step=1620 train_loss=0.0529 tok_s=60086.9 opt_steps=1620 +[epoch 11/50] step=1640 train_loss=0.0530 tok_s=60085.2 opt_steps=1640 +[epoch 11/50] step=1660 train_loss=0.0530 tok_s=60087.8 opt_steps=1660 +[epoch 11/50] step=1680 train_loss=0.0531 tok_s=60082.7 opt_steps=1680 +[epoch 11/50] step=1700 train_loss=0.0531 tok_s=60082.3 opt_steps=1700 +[epoch 11/50] step=1720 train_loss=0.0532 tok_s=60090.8 opt_steps=1720 +[epoch 11/50] step=1740 train_loss=0.0532 tok_s=60090.3 opt_steps=1740 +[epoch 11/50] step=1760 train_loss=0.0533 tok_s=60092.1 opt_steps=1760 +[epoch 11/50] step=1780 train_loss=0.0533 tok_s=60094.7 opt_steps=1780 +[epoch 11/50] step=1800 train_loss=0.0534 tok_s=60093.3 opt_steps=1800 +[epoch 11/50] step=1820 train_loss=0.0534 tok_s=60097.3 opt_steps=1820 +[epoch 11/50] step=1840 train_loss=0.0534 tok_s=60100.4 opt_steps=1840 +[epoch 11/50] step=1860 train_loss=0.0535 tok_s=60102.1 opt_steps=1860 +[epoch 11/50] step=1880 train_loss=0.0535 tok_s=60104.2 opt_steps=1880 +[epoch 11/50] step=1900 train_loss=0.0536 tok_s=60108.5 opt_steps=1900 +[epoch 11/50] step=1920 train_loss=0.0536 tok_s=60106.5 opt_steps=1920 +[epoch 11/50] step=1940 train_loss=0.0536 tok_s=60105.6 opt_steps=1940 +[epoch 11/50] step=1960 train_loss=0.0537 tok_s=60102.7 opt_steps=1960 +[epoch 11/50] step=1980 train_loss=0.0537 tok_s=60103.6 opt_steps=1980 +[epoch 11/50] step=2000 train_loss=0.0538 tok_s=60104.5 opt_steps=2000 +[epoch 11/50] step=2020 train_loss=0.0538 tok_s=60108.0 opt_steps=2020 +[epoch 11/50] step=2040 train_loss=0.0539 tok_s=60105.4 opt_steps=2040 +[epoch 11/50] step=2060 train_loss=0.0539 tok_s=60105.3 opt_steps=2060 +[epoch 11/50] step=2080 train_loss=0.0539 tok_s=60105.8 opt_steps=2080 +[epoch 11/50] step=2100 train_loss=0.0540 tok_s=60104.9 opt_steps=2100 +[epoch 11/50] step=2120 train_loss=0.0540 tok_s=60105.9 opt_steps=2120 +[epoch 11/50] step=2140 train_loss=0.0540 tok_s=60105.8 opt_steps=2140 +[epoch 11/50] step=2160 train_loss=0.0541 tok_s=60107.8 opt_steps=2160 +[epoch 11/50] step=2180 train_loss=0.0541 tok_s=60110.5 opt_steps=2180 +[epoch 11/50] step=2200 train_loss=0.0541 tok_s=60115.3 opt_steps=2200 +[epoch 11/50] step=2220 train_loss=0.0541 tok_s=60115.0 opt_steps=2220 +[epoch 11/50] step=2240 train_loss=0.0542 tok_s=60116.4 opt_steps=2240 +[epoch 11/50] step=2260 train_loss=0.0542 tok_s=60119.9 opt_steps=2260 +[epoch 11/50] step=2280 train_loss=0.0542 tok_s=60120.6 opt_steps=2280 +[epoch 11/50] step=2300 train_loss=0.0542 tok_s=60121.2 opt_steps=2300 +[epoch 11/50] step=2320 train_loss=0.0543 tok_s=60120.4 opt_steps=2320 +[epoch 11/50] step=2340 train_loss=0.0543 tok_s=60123.8 opt_steps=2340 +[epoch 11/50] step=2360 train_loss=0.0543 tok_s=60127.6 opt_steps=2360 +[epoch 11/50] step=2380 train_loss=0.0543 tok_s=60128.8 opt_steps=2380 +[epoch 11/50] step=2400 train_loss=0.0544 tok_s=60132.8 opt_steps=2400 +[epoch 11/50] step=2420 train_loss=0.0544 tok_s=60134.8 opt_steps=2420 +[epoch 11/50] step=2440 train_loss=0.0545 tok_s=60133.2 opt_steps=2440 +[epoch 11/50] step=2460 train_loss=0.0545 tok_s=60132.1 opt_steps=2460 +[epoch 11/50] step=2480 train_loss=0.0545 tok_s=60133.8 opt_steps=2480 +[epoch 11/50] step=2500 train_loss=0.0545 tok_s=60134.8 opt_steps=2500 +[epoch 11/50] step=2520 train_loss=0.0546 tok_s=60135.1 opt_steps=2520 +[epoch 11/50] step=2540 train_loss=0.0546 tok_s=60134.4 opt_steps=2540 +[epoch 11/50] step=2560 train_loss=0.0546 tok_s=60134.8 opt_steps=2560 +[epoch 11/50] step=2580 train_loss=0.0546 tok_s=60137.9 opt_steps=2580 +[epoch 11/50] step=2600 train_loss=0.0547 tok_s=60138.3 opt_steps=2600 +[epoch 11/50] step=2620 train_loss=0.0547 tok_s=60142.1 opt_steps=2620 +[epoch 11/50] step=2640 train_loss=0.0547 tok_s=60139.6 opt_steps=2640 +[epoch 11/50] step=2660 train_loss=0.0547 tok_s=60135.2 opt_steps=2660 +[epoch 11/50] step=2680 train_loss=0.0548 tok_s=60137.7 opt_steps=2680 +[epoch 11/50] step=2700 train_loss=0.0548 tok_s=60140.8 opt_steps=2700 +[epoch 11/50] step=2720 train_loss=0.0548 tok_s=60141.9 opt_steps=2720 +[epoch 11/50] step=2740 train_loss=0.0549 tok_s=60143.6 opt_steps=2740 +[epoch 11/50] step=2760 train_loss=0.0549 tok_s=60143.4 opt_steps=2760 +[epoch 11/50] step=2780 train_loss=0.0549 tok_s=60141.3 opt_steps=2780 +[epoch 11/50] step=2800 train_loss=0.0549 tok_s=60142.1 opt_steps=2800 +[epoch 11/50] step=2820 train_loss=0.0550 tok_s=60144.1 opt_steps=2820 +[epoch 11/50] step=2840 train_loss=0.0550 tok_s=60145.8 opt_steps=2840 +[epoch 11/50] step=2860 train_loss=0.0550 tok_s=60148.7 opt_steps=2860 +[epoch 11/50] step=2880 train_loss=0.0551 tok_s=60152.1 opt_steps=2880 +[epoch 11/50] step=2900 train_loss=0.0551 tok_s=60150.7 opt_steps=2900 +[epoch 11/50] step=2920 train_loss=0.0551 tok_s=60145.7 opt_steps=2920 +[epoch 11/50] step=2940 train_loss=0.0551 tok_s=60144.8 opt_steps=2940 +[epoch 11/50] step=2960 train_loss=0.0551 tok_s=60142.4 opt_steps=2960 +[epoch 11/50] step=2980 train_loss=0.0552 tok_s=60139.0 opt_steps=2980 +[epoch 11/50] step=3000 train_loss=0.0552 tok_s=60139.8 opt_steps=3000 +[epoch 11/50] step=3020 train_loss=0.0552 tok_s=60138.0 opt_steps=3020 +[epoch 11/50] step=3040 train_loss=0.0553 tok_s=60139.1 opt_steps=3040 +[epoch 11/50] step=3060 train_loss=0.0553 tok_s=60140.6 opt_steps=3060 +[epoch 11/50] step=3080 train_loss=0.0553 tok_s=60142.9 opt_steps=3080 +[epoch 11/50] step=3100 train_loss=0.0553 tok_s=60143.8 opt_steps=3100 +[epoch 11/50] step=3120 train_loss=0.0553 tok_s=60145.2 opt_steps=3120 +[epoch 11/50] step=3140 train_loss=0.0553 tok_s=60144.7 opt_steps=3140 +[epoch 11/50] step=3160 train_loss=0.0553 tok_s=60143.2 opt_steps=3160 +[epoch 11/50] step=3180 train_loss=0.0554 tok_s=60141.7 opt_steps=3180 +[epoch 11/50] step=3200 train_loss=0.0554 tok_s=60141.1 opt_steps=3200 +[epoch 11/50] step=3220 train_loss=0.0554 tok_s=60139.9 opt_steps=3220 +[epoch 11/50] step=3240 train_loss=0.0554 tok_s=60140.0 opt_steps=3240 +[epoch 11/50] step=3260 train_loss=0.0555 tok_s=60135.6 opt_steps=3260 +[epoch 11/50] train_loss=0.0555 val_skipped tok_s=60136.1 opt_steps=3273 +[epoch 12/50] step=20 train_loss=0.0419 tok_s=57891.9 opt_steps=20 +[epoch 12/50] step=40 train_loss=0.0423 tok_s=58952.8 opt_steps=40 +[epoch 12/50] step=60 train_loss=0.0424 tok_s=59325.0 opt_steps=60 +[epoch 12/50] step=80 train_loss=0.0424 tok_s=59500.2 opt_steps=80 +[epoch 12/50] step=100 train_loss=0.0422 tok_s=59645.1 opt_steps=100 +[epoch 12/50] step=120 train_loss=0.0421 tok_s=59719.6 opt_steps=120 +[epoch 12/50] step=140 train_loss=0.0421 tok_s=59796.5 opt_steps=140 +[epoch 12/50] step=160 train_loss=0.0421 tok_s=59840.4 opt_steps=160 +[epoch 12/50] step=180 train_loss=0.0422 tok_s=59877.3 opt_steps=180 +[epoch 12/50] step=200 train_loss=0.0423 tok_s=59931.5 opt_steps=200 +[epoch 12/50] step=220 train_loss=0.0424 tok_s=59927.6 opt_steps=220 +[epoch 12/50] step=240 train_loss=0.0426 tok_s=59931.2 opt_steps=240 +[epoch 12/50] step=260 train_loss=0.0427 tok_s=59934.6 opt_steps=260 +[epoch 12/50] step=280 train_loss=0.0428 tok_s=59950.1 opt_steps=280 +[epoch 12/50] step=300 train_loss=0.0429 tok_s=59978.0 opt_steps=300 +[epoch 12/50] step=320 train_loss=0.0431 tok_s=59953.7 opt_steps=320 +[epoch 12/50] step=340 train_loss=0.0432 tok_s=59941.7 opt_steps=340 +[epoch 12/50] step=360 train_loss=0.0434 tok_s=59935.6 opt_steps=360 +[epoch 12/50] step=380 train_loss=0.0434 tok_s=59937.4 opt_steps=380 +[epoch 12/50] step=400 train_loss=0.0435 tok_s=59922.2 opt_steps=400 +[epoch 12/50] step=420 train_loss=0.0436 tok_s=59913.4 opt_steps=420 +[epoch 12/50] step=440 train_loss=0.0437 tok_s=59908.4 opt_steps=440 +[epoch 12/50] step=460 train_loss=0.0437 tok_s=59898.5 opt_steps=460 +[epoch 12/50] step=480 train_loss=0.0437 tok_s=59870.0 opt_steps=480 +[epoch 12/50] step=500 train_loss=0.0438 tok_s=59877.7 opt_steps=500 +[epoch 12/50] step=520 train_loss=0.0439 tok_s=59876.3 opt_steps=520 +[epoch 12/50] step=540 train_loss=0.0440 tok_s=59872.6 opt_steps=540 +[epoch 12/50] step=560 train_loss=0.0441 tok_s=59877.1 opt_steps=560 +[epoch 12/50] step=580 train_loss=0.0442 tok_s=59878.9 opt_steps=580 +[epoch 12/50] step=600 train_loss=0.0443 tok_s=59890.0 opt_steps=600 +[epoch 12/50] step=620 train_loss=0.0444 tok_s=59894.4 opt_steps=620 +[epoch 12/50] step=640 train_loss=0.0444 tok_s=59871.6 opt_steps=640 +[epoch 12/50] step=660 train_loss=0.0445 tok_s=59870.7 opt_steps=660 +[epoch 12/50] step=680 train_loss=0.0446 tok_s=59862.4 opt_steps=680 +[epoch 12/50] step=700 train_loss=0.0447 tok_s=59865.1 opt_steps=700 +[epoch 12/50] step=720 train_loss=0.0448 tok_s=59861.9 opt_steps=720 +[epoch 12/50] step=740 train_loss=0.0448 tok_s=59855.6 opt_steps=740 +[epoch 12/50] step=760 train_loss=0.0449 tok_s=59880.5 opt_steps=760 +[epoch 12/50] step=780 train_loss=0.0450 tok_s=59887.9 opt_steps=780 +[epoch 12/50] step=800 train_loss=0.0451 tok_s=59893.9 opt_steps=800 +[epoch 12/50] step=820 train_loss=0.0451 tok_s=59887.6 opt_steps=820 +[epoch 12/50] step=840 train_loss=0.0452 tok_s=59884.5 opt_steps=840 +[epoch 12/50] step=860 train_loss=0.0452 tok_s=59872.2 opt_steps=860 +[epoch 12/50] step=880 train_loss=0.0453 tok_s=59883.9 opt_steps=880 +[epoch 12/50] step=900 train_loss=0.0454 tok_s=59879.0 opt_steps=900 +[epoch 12/50] step=920 train_loss=0.0454 tok_s=59883.3 opt_steps=920 +[epoch 12/50] step=940 train_loss=0.0455 tok_s=59887.0 opt_steps=940 +[epoch 12/50] step=960 train_loss=0.0456 tok_s=59888.8 opt_steps=960 +[epoch 12/50] step=980 train_loss=0.0456 tok_s=59883.8 opt_steps=980 +[epoch 12/50] step=1000 train_loss=0.0457 tok_s=59883.7 opt_steps=1000 +[epoch 12/50] step=1020 train_loss=0.0458 tok_s=59887.0 opt_steps=1020 +[epoch 12/50] step=1040 train_loss=0.0458 tok_s=59891.8 opt_steps=1040 +[epoch 12/50] step=1060 train_loss=0.0459 tok_s=59886.1 opt_steps=1060 +[epoch 12/50] step=1080 train_loss=0.0460 tok_s=59895.6 opt_steps=1080 +[epoch 12/50] step=1100 train_loss=0.0460 tok_s=59900.5 opt_steps=1100 +[epoch 12/50] step=1120 train_loss=0.0461 tok_s=59896.5 opt_steps=1120 +[epoch 12/50] step=1140 train_loss=0.0461 tok_s=59910.8 opt_steps=1140 +[epoch 12/50] step=1160 train_loss=0.0462 tok_s=59923.3 opt_steps=1160 +[epoch 12/50] step=1180 train_loss=0.0462 tok_s=59922.2 opt_steps=1180 +[epoch 12/50] step=1200 train_loss=0.0463 tok_s=59912.9 opt_steps=1200 +[epoch 12/50] step=1220 train_loss=0.0464 tok_s=59911.1 opt_steps=1220 +[epoch 12/50] step=1240 train_loss=0.0464 tok_s=59905.8 opt_steps=1240 +[epoch 12/50] step=1260 train_loss=0.0465 tok_s=59915.2 opt_steps=1260 +[epoch 12/50] step=1280 train_loss=0.0466 tok_s=59927.3 opt_steps=1280 +[epoch 12/50] step=1300 train_loss=0.0466 tok_s=59936.1 opt_steps=1300 +[epoch 12/50] step=1320 train_loss=0.0467 tok_s=59940.1 opt_steps=1320 +[epoch 12/50] step=1340 train_loss=0.0467 tok_s=59943.8 opt_steps=1340 +[epoch 12/50] step=1360 train_loss=0.0468 tok_s=59950.2 opt_steps=1360 +[epoch 12/50] step=1380 train_loss=0.0468 tok_s=59955.4 opt_steps=1380 +[epoch 12/50] step=1400 train_loss=0.0469 tok_s=59960.6 opt_steps=1400 +[epoch 12/50] step=1420 train_loss=0.0469 tok_s=59958.0 opt_steps=1420 +[epoch 12/50] step=1440 train_loss=0.0469 tok_s=59960.9 opt_steps=1440 +[epoch 12/50] step=1460 train_loss=0.0470 tok_s=59968.8 opt_steps=1460 +[epoch 12/50] step=1480 train_loss=0.0470 tok_s=59970.7 opt_steps=1480 +[epoch 12/50] step=1500 train_loss=0.0471 tok_s=59971.6 opt_steps=1500 +[epoch 12/50] step=1520 train_loss=0.0471 tok_s=59984.4 opt_steps=1520 +[epoch 12/50] step=1540 train_loss=0.0472 tok_s=59988.1 opt_steps=1540 +[epoch 12/50] step=1560 train_loss=0.0472 tok_s=59989.7 opt_steps=1560 +[epoch 12/50] step=1580 train_loss=0.0473 tok_s=59985.3 opt_steps=1580 +[epoch 12/50] step=1600 train_loss=0.0473 tok_s=59981.4 opt_steps=1600 +[epoch 12/50] step=1620 train_loss=0.0473 tok_s=59977.2 opt_steps=1620 +[epoch 12/50] step=1640 train_loss=0.0474 tok_s=59979.9 opt_steps=1640 +[epoch 12/50] step=1660 train_loss=0.0474 tok_s=59982.6 opt_steps=1660 +[epoch 12/50] step=1680 train_loss=0.0475 tok_s=59984.9 opt_steps=1680 +[epoch 12/50] step=1700 train_loss=0.0475 tok_s=59986.6 opt_steps=1700 +[epoch 12/50] step=1720 train_loss=0.0476 tok_s=59984.9 opt_steps=1720 +[epoch 12/50] step=1740 train_loss=0.0476 tok_s=59990.1 opt_steps=1740 +[epoch 12/50] step=1760 train_loss=0.0476 tok_s=59997.7 opt_steps=1760 +[epoch 12/50] step=1780 train_loss=0.0477 tok_s=59999.2 opt_steps=1780 +[epoch 12/50] step=1800 train_loss=0.0477 tok_s=59999.2 opt_steps=1800 +[epoch 12/50] step=1820 train_loss=0.0477 tok_s=60005.6 opt_steps=1820 +[epoch 12/50] step=1840 train_loss=0.0478 tok_s=60006.5 opt_steps=1840 +[epoch 12/50] step=1860 train_loss=0.0478 tok_s=60012.6 opt_steps=1860 +[epoch 12/50] step=1880 train_loss=0.0478 tok_s=60010.0 opt_steps=1880 +[epoch 12/50] step=1900 train_loss=0.0479 tok_s=60013.7 opt_steps=1900 +[epoch 12/50] step=1920 train_loss=0.0479 tok_s=60015.8 opt_steps=1920 +[epoch 12/50] step=1940 train_loss=0.0479 tok_s=60019.2 opt_steps=1940 +[epoch 12/50] step=1960 train_loss=0.0480 tok_s=60020.5 opt_steps=1960 +[epoch 12/50] step=1980 train_loss=0.0480 tok_s=60022.4 opt_steps=1980 +[epoch 12/50] step=2000 train_loss=0.0481 tok_s=60022.1 opt_steps=2000 +[epoch 12/50] step=2020 train_loss=0.0481 tok_s=60025.8 opt_steps=2020 +[epoch 12/50] step=2040 train_loss=0.0482 tok_s=60025.2 opt_steps=2040 +[epoch 12/50] step=2060 train_loss=0.0482 tok_s=60030.6 opt_steps=2060 +[epoch 12/50] step=2080 train_loss=0.0482 tok_s=60034.8 opt_steps=2080 +[epoch 12/50] step=2100 train_loss=0.0483 tok_s=60038.1 opt_steps=2100 +[epoch 12/50] step=2120 train_loss=0.0483 tok_s=60037.5 opt_steps=2120 +[epoch 12/50] step=2140 train_loss=0.0483 tok_s=60040.2 opt_steps=2140 +[epoch 12/50] step=2160 train_loss=0.0484 tok_s=60044.8 opt_steps=2160 +[epoch 12/50] step=2180 train_loss=0.0484 tok_s=60043.0 opt_steps=2180 +[epoch 12/50] step=2200 train_loss=0.0484 tok_s=60043.5 opt_steps=2200 +[epoch 12/50] step=2220 train_loss=0.0484 tok_s=60043.5 opt_steps=2220 +[epoch 12/50] step=2240 train_loss=0.0485 tok_s=60043.5 opt_steps=2240 +[epoch 12/50] step=2260 train_loss=0.0485 tok_s=60045.6 opt_steps=2260 +[epoch 12/50] step=2280 train_loss=0.0485 tok_s=60049.1 opt_steps=2280 +[epoch 12/50] step=2300 train_loss=0.0486 tok_s=60048.4 opt_steps=2300 +[epoch 12/50] step=2320 train_loss=0.0486 tok_s=60050.2 opt_steps=2320 +[epoch 12/50] step=2340 train_loss=0.0486 tok_s=60053.3 opt_steps=2340 +[epoch 12/50] step=2360 train_loss=0.0487 tok_s=60053.7 opt_steps=2360 +[epoch 12/50] step=2380 train_loss=0.0487 tok_s=60054.4 opt_steps=2380 +[epoch 12/50] step=2400 train_loss=0.0487 tok_s=60055.4 opt_steps=2400 +[epoch 12/50] step=2420 train_loss=0.0488 tok_s=60056.3 opt_steps=2420 +[epoch 12/50] step=2440 train_loss=0.0488 tok_s=60056.6 opt_steps=2440 +[epoch 12/50] step=2460 train_loss=0.0488 tok_s=60054.4 opt_steps=2460 +[epoch 12/50] step=2480 train_loss=0.0489 tok_s=60055.5 opt_steps=2480 +[epoch 12/50] step=2500 train_loss=0.0489 tok_s=60057.6 opt_steps=2500 +[epoch 12/50] step=2520 train_loss=0.0489 tok_s=60063.5 opt_steps=2520 +[epoch 12/50] step=2540 train_loss=0.0490 tok_s=60061.6 opt_steps=2540 +[epoch 12/50] step=2560 train_loss=0.0490 tok_s=60060.6 opt_steps=2560 +[epoch 12/50] step=2580 train_loss=0.0491 tok_s=60060.5 opt_steps=2580 +[epoch 12/50] step=2600 train_loss=0.0491 tok_s=60062.0 opt_steps=2600 +[epoch 12/50] step=2620 train_loss=0.0491 tok_s=60062.3 opt_steps=2620 +[epoch 12/50] step=2640 train_loss=0.0491 tok_s=60062.8 opt_steps=2640 +[epoch 12/50] step=2660 train_loss=0.0492 tok_s=60064.4 opt_steps=2660 +[epoch 12/50] step=2680 train_loss=0.0492 tok_s=60065.2 opt_steps=2680 +[epoch 12/50] step=2700 train_loss=0.0492 tok_s=60063.6 opt_steps=2700 +[epoch 12/50] step=2720 train_loss=0.0492 tok_s=60064.7 opt_steps=2720 +[epoch 12/50] step=2740 train_loss=0.0493 tok_s=60064.3 opt_steps=2740 +[epoch 12/50] step=2760 train_loss=0.0493 tok_s=60063.6 opt_steps=2760 +[epoch 12/50] step=2780 train_loss=0.0493 tok_s=60063.5 opt_steps=2780 +[epoch 12/50] step=2800 train_loss=0.0493 tok_s=60063.6 opt_steps=2800 +[epoch 12/50] step=2820 train_loss=0.0494 tok_s=60060.5 opt_steps=2820 +[epoch 12/50] step=2840 train_loss=0.0494 tok_s=60061.6 opt_steps=2840 +[epoch 12/50] step=2860 train_loss=0.0494 tok_s=60060.4 opt_steps=2860 +[epoch 12/50] step=2880 train_loss=0.0494 tok_s=60061.3 opt_steps=2880 +[epoch 12/50] step=2900 train_loss=0.0495 tok_s=60062.3 opt_steps=2900 +[epoch 12/50] step=2920 train_loss=0.0495 tok_s=60062.9 opt_steps=2920 +[epoch 12/50] step=2940 train_loss=0.0495 tok_s=60064.0 opt_steps=2940 +[epoch 12/50] step=2960 train_loss=0.0496 tok_s=60066.6 opt_steps=2960 +[epoch 12/50] step=2980 train_loss=0.0496 tok_s=60068.0 opt_steps=2980 +[epoch 12/50] step=3000 train_loss=0.0496 tok_s=60069.7 opt_steps=3000 +[epoch 12/50] step=3020 train_loss=0.0496 tok_s=60072.2 opt_steps=3020 +[epoch 12/50] step=3040 train_loss=0.0497 tok_s=60071.1 opt_steps=3040 +[epoch 12/50] step=3060 train_loss=0.0497 tok_s=60071.3 opt_steps=3060 +[epoch 12/50] step=3080 train_loss=0.0497 tok_s=60070.4 opt_steps=3080 +[epoch 12/50] step=3100 train_loss=0.0497 tok_s=60069.0 opt_steps=3100 +[epoch 12/50] step=3120 train_loss=0.0498 tok_s=60069.6 opt_steps=3120 +[epoch 12/50] step=3140 train_loss=0.0498 tok_s=60069.3 opt_steps=3140 +[epoch 12/50] step=3160 train_loss=0.0498 tok_s=60065.1 opt_steps=3160 +[epoch 12/50] step=3180 train_loss=0.0498 tok_s=60062.2 opt_steps=3180 +[epoch 12/50] step=3200 train_loss=0.0499 tok_s=60063.2 opt_steps=3200 +[epoch 12/50] step=3220 train_loss=0.0499 tok_s=60064.4 opt_steps=3220 +[epoch 12/50] step=3240 train_loss=0.0499 tok_s=60062.5 opt_steps=3240 +[epoch 12/50] step=3260 train_loss=0.0499 tok_s=60056.8 opt_steps=3260 +[epoch 12/50] train_loss=0.0499 val_skipped tok_s=60059.9 opt_steps=3273 +[epoch 13/50] step=20 train_loss=0.0385 tok_s=56806.9 opt_steps=20 +[epoch 13/50] step=40 train_loss=0.0381 tok_s=58308.9 opt_steps=40 +[epoch 13/50] step=60 train_loss=0.0377 tok_s=58971.1 opt_steps=60 +[epoch 13/50] step=80 train_loss=0.0378 tok_s=59352.8 opt_steps=80 +[epoch 13/50] step=100 train_loss=0.0377 tok_s=59476.0 opt_steps=100 +[epoch 13/50] step=120 train_loss=0.0378 tok_s=59611.6 opt_steps=120 +[epoch 13/50] step=140 train_loss=0.0377 tok_s=59688.4 opt_steps=140 +[epoch 13/50] step=160 train_loss=0.0376 tok_s=59742.9 opt_steps=160 +[epoch 13/50] step=180 train_loss=0.0378 tok_s=59840.8 opt_steps=180 +[epoch 13/50] step=200 train_loss=0.0379 tok_s=59899.3 opt_steps=200 +[epoch 13/50] step=220 train_loss=0.0379 tok_s=59898.7 opt_steps=220 +[epoch 13/50] step=240 train_loss=0.0380 tok_s=59925.1 opt_steps=240 +[epoch 13/50] step=260 train_loss=0.0382 tok_s=59978.7 opt_steps=260 +[epoch 13/50] step=280 train_loss=0.0384 tok_s=60018.1 opt_steps=280 +[epoch 13/50] step=300 train_loss=0.0385 tok_s=60014.3 opt_steps=300 +[epoch 13/50] step=320 train_loss=0.0386 tok_s=60013.2 opt_steps=320 +[epoch 13/50] step=340 train_loss=0.0387 tok_s=60031.6 opt_steps=340 +[epoch 13/50] step=360 train_loss=0.0389 tok_s=60009.2 opt_steps=360 +[epoch 13/50] step=380 train_loss=0.0390 tok_s=60025.4 opt_steps=380 +[epoch 13/50] step=400 train_loss=0.0391 tok_s=60090.4 opt_steps=400 +[epoch 13/50] step=420 train_loss=0.0392 tok_s=60087.9 opt_steps=420 +[epoch 13/50] step=440 train_loss=0.0393 tok_s=60087.8 opt_steps=440 +[epoch 13/50] step=460 train_loss=0.0394 tok_s=60087.7 opt_steps=460 +[epoch 13/50] step=480 train_loss=0.0394 tok_s=60076.2 opt_steps=480 +[epoch 13/50] step=500 train_loss=0.0396 tok_s=60081.7 opt_steps=500 +[epoch 13/50] step=520 train_loss=0.0397 tok_s=60085.2 opt_steps=520 +[epoch 13/50] step=540 train_loss=0.0398 tok_s=60096.0 opt_steps=540 +[epoch 13/50] step=560 train_loss=0.0399 tok_s=60132.2 opt_steps=560 +[epoch 13/50] step=580 train_loss=0.0400 tok_s=60116.1 opt_steps=580 +[epoch 13/50] step=600 train_loss=0.0400 tok_s=60116.9 opt_steps=600 +[epoch 13/50] step=620 train_loss=0.0401 tok_s=60121.4 opt_steps=620 +[epoch 13/50] step=640 train_loss=0.0401 tok_s=60118.1 opt_steps=640 +[epoch 13/50] step=660 train_loss=0.0402 tok_s=60128.4 opt_steps=660 +[epoch 13/50] step=680 train_loss=0.0403 tok_s=60124.5 opt_steps=680 +[epoch 13/50] step=700 train_loss=0.0404 tok_s=60124.2 opt_steps=700 +[epoch 13/50] step=720 train_loss=0.0405 tok_s=60145.0 opt_steps=720 +[epoch 13/50] step=740 train_loss=0.0405 tok_s=60130.8 opt_steps=740 +[epoch 13/50] step=760 train_loss=0.0406 tok_s=60140.3 opt_steps=760 +[epoch 13/50] step=780 train_loss=0.0406 tok_s=60143.3 opt_steps=780 +[epoch 13/50] step=800 train_loss=0.0407 tok_s=60142.0 opt_steps=800 +[epoch 13/50] step=820 train_loss=0.0408 tok_s=60147.8 opt_steps=820 +[epoch 13/50] step=840 train_loss=0.0408 tok_s=60127.7 opt_steps=840 +[epoch 13/50] step=860 train_loss=0.0409 tok_s=60111.1 opt_steps=860 +[epoch 13/50] step=880 train_loss=0.0410 tok_s=60071.5 opt_steps=880 +[epoch 13/50] step=900 train_loss=0.0410 tok_s=60055.6 opt_steps=900 +[epoch 13/50] step=920 train_loss=0.0410 tok_s=60058.0 opt_steps=920 +[epoch 13/50] step=940 train_loss=0.0411 tok_s=60055.6 opt_steps=940 +[epoch 13/50] step=960 train_loss=0.0411 tok_s=60048.4 opt_steps=960 +[epoch 13/50] step=980 train_loss=0.0412 tok_s=60053.5 opt_steps=980 +[epoch 13/50] step=1000 train_loss=0.0412 tok_s=60047.5 opt_steps=1000 +[epoch 13/50] step=1020 train_loss=0.0413 tok_s=60044.5 opt_steps=1020 +[epoch 13/50] step=1040 train_loss=0.0414 tok_s=60055.1 opt_steps=1040 +[epoch 13/50] step=1060 train_loss=0.0414 tok_s=60046.5 opt_steps=1060 +[epoch 13/50] step=1080 train_loss=0.0415 tok_s=60044.9 opt_steps=1080 +[epoch 13/50] step=1100 train_loss=0.0416 tok_s=60036.7 opt_steps=1100 +[epoch 13/50] step=1120 train_loss=0.0416 tok_s=60038.5 opt_steps=1120 +[epoch 13/50] step=1140 train_loss=0.0417 tok_s=60035.7 opt_steps=1140 +[epoch 13/50] step=1160 train_loss=0.0417 tok_s=60043.3 opt_steps=1160 +[epoch 13/50] step=1180 train_loss=0.0418 tok_s=60040.1 opt_steps=1180 +[epoch 13/50] step=1200 train_loss=0.0418 tok_s=60039.3 opt_steps=1200 +[epoch 13/50] step=1220 train_loss=0.0418 tok_s=60046.0 opt_steps=1220 +[epoch 13/50] step=1240 train_loss=0.0419 tok_s=60050.7 opt_steps=1240 +[epoch 13/50] step=1260 train_loss=0.0419 tok_s=60058.0 opt_steps=1260 +[epoch 13/50] step=1280 train_loss=0.0420 tok_s=60056.1 opt_steps=1280 +[epoch 13/50] step=1300 train_loss=0.0420 tok_s=60062.6 opt_steps=1300 +[epoch 13/50] step=1320 train_loss=0.0421 tok_s=60062.5 opt_steps=1320 +[epoch 13/50] step=1340 train_loss=0.0421 tok_s=60058.0 opt_steps=1340 +[epoch 13/50] step=1360 train_loss=0.0421 tok_s=60063.9 opt_steps=1360 +[epoch 13/50] step=1380 train_loss=0.0422 tok_s=60069.0 opt_steps=1380 +[epoch 13/50] step=1400 train_loss=0.0422 tok_s=60067.3 opt_steps=1400 +[epoch 13/50] step=1420 train_loss=0.0422 tok_s=60069.9 opt_steps=1420 +[epoch 13/50] step=1440 train_loss=0.0423 tok_s=60070.7 opt_steps=1440 +[epoch 13/50] step=1460 train_loss=0.0423 tok_s=60069.1 opt_steps=1460 +[epoch 13/50] step=1480 train_loss=0.0424 tok_s=60070.0 opt_steps=1480 +[epoch 13/50] step=1500 train_loss=0.0424 tok_s=60065.2 opt_steps=1500 +[epoch 13/50] step=1520 train_loss=0.0425 tok_s=60064.7 opt_steps=1520 +[epoch 13/50] step=1540 train_loss=0.0425 tok_s=60060.1 opt_steps=1540 +[epoch 13/50] step=1560 train_loss=0.0425 tok_s=60059.4 opt_steps=1560 +[epoch 13/50] step=1580 train_loss=0.0426 tok_s=60058.1 opt_steps=1580 +[epoch 13/50] step=1600 train_loss=0.0426 tok_s=60064.8 opt_steps=1600 +[epoch 13/50] step=1620 train_loss=0.0427 tok_s=60065.6 opt_steps=1620 +[epoch 13/50] step=1640 train_loss=0.0427 tok_s=60064.5 opt_steps=1640 +[epoch 13/50] step=1660 train_loss=0.0428 tok_s=60070.0 opt_steps=1660 +[epoch 13/50] step=1680 train_loss=0.0428 tok_s=60073.6 opt_steps=1680 +[epoch 13/50] step=1700 train_loss=0.0429 tok_s=60068.8 opt_steps=1700 +[epoch 13/50] step=1720 train_loss=0.0429 tok_s=60070.7 opt_steps=1720 +[epoch 13/50] step=1740 train_loss=0.0430 tok_s=60074.6 opt_steps=1740 +[epoch 13/50] step=1760 train_loss=0.0430 tok_s=60073.3 opt_steps=1760 +[epoch 13/50] step=1780 train_loss=0.0431 tok_s=60074.8 opt_steps=1780 +[epoch 13/50] step=1800 train_loss=0.0431 tok_s=60071.9 opt_steps=1800 +[epoch 13/50] step=1820 train_loss=0.0432 tok_s=60075.1 opt_steps=1820 +[epoch 13/50] step=1840 train_loss=0.0432 tok_s=60078.5 opt_steps=1840 +[epoch 13/50] step=1860 train_loss=0.0432 tok_s=60081.4 opt_steps=1860 +[epoch 13/50] step=1880 train_loss=0.0433 tok_s=60079.4 opt_steps=1880 +[epoch 13/50] step=1900 train_loss=0.0433 tok_s=60081.9 opt_steps=1900 +[epoch 13/50] step=1920 train_loss=0.0434 tok_s=60083.8 opt_steps=1920 +[epoch 13/50] step=1940 train_loss=0.0434 tok_s=60083.9 opt_steps=1940 +[epoch 13/50] step=1960 train_loss=0.0434 tok_s=60085.3 opt_steps=1960 +[epoch 13/50] step=1980 train_loss=0.0435 tok_s=60087.3 opt_steps=1980 +[epoch 13/50] step=2000 train_loss=0.0435 tok_s=60091.5 opt_steps=2000 +[epoch 13/50] step=2020 train_loss=0.0436 tok_s=60094.2 opt_steps=2020 +[epoch 13/50] step=2040 train_loss=0.0436 tok_s=60093.0 opt_steps=2040 +[epoch 13/50] step=2060 train_loss=0.0436 tok_s=60098.1 opt_steps=2060 +[epoch 13/50] step=2080 train_loss=0.0436 tok_s=60098.7 opt_steps=2080 +[epoch 13/50] step=2100 train_loss=0.0437 tok_s=60099.1 opt_steps=2100 +[epoch 13/50] step=2120 train_loss=0.0437 tok_s=60101.7 opt_steps=2120 +[epoch 13/50] step=2140 train_loss=0.0438 tok_s=60101.3 opt_steps=2140 +[epoch 13/50] step=2160 train_loss=0.0438 tok_s=60105.2 opt_steps=2160 +[epoch 13/50] step=2180 train_loss=0.0438 tok_s=60109.1 opt_steps=2180 +[epoch 13/50] step=2200 train_loss=0.0439 tok_s=60107.2 opt_steps=2200 +[epoch 13/50] step=2220 train_loss=0.0439 tok_s=60106.6 opt_steps=2220 +[epoch 13/50] step=2240 train_loss=0.0439 tok_s=60109.0 opt_steps=2240 +[epoch 13/50] step=2260 train_loss=0.0439 tok_s=60108.4 opt_steps=2260 +[epoch 13/50] step=2280 train_loss=0.0440 tok_s=60112.4 opt_steps=2280 +[epoch 13/50] step=2300 train_loss=0.0440 tok_s=60113.3 opt_steps=2300 +[epoch 13/50] step=2320 train_loss=0.0440 tok_s=60113.0 opt_steps=2320 +[epoch 13/50] step=2340 train_loss=0.0441 tok_s=60117.3 opt_steps=2340 +[epoch 13/50] step=2360 train_loss=0.0441 tok_s=60115.6 opt_steps=2360 +[epoch 13/50] step=2380 train_loss=0.0441 tok_s=60118.9 opt_steps=2380 +[epoch 13/50] step=2400 train_loss=0.0441 tok_s=60122.2 opt_steps=2400 +[epoch 13/50] step=2420 train_loss=0.0441 tok_s=60123.9 opt_steps=2420 +[epoch 13/50] step=2440 train_loss=0.0442 tok_s=60123.2 opt_steps=2440 +[epoch 13/50] step=2460 train_loss=0.0442 tok_s=60124.6 opt_steps=2460 +[epoch 13/50] step=2480 train_loss=0.0442 tok_s=60121.0 opt_steps=2480 +[epoch 13/50] step=2500 train_loss=0.0442 tok_s=60120.4 opt_steps=2500 +[epoch 13/50] step=2520 train_loss=0.0443 tok_s=60118.5 opt_steps=2520 +[epoch 13/50] step=2540 train_loss=0.0443 tok_s=60116.2 opt_steps=2540 +[epoch 13/50] step=2560 train_loss=0.0444 tok_s=60120.1 opt_steps=2560 +[epoch 13/50] step=2580 train_loss=0.0444 tok_s=60121.0 opt_steps=2580 +[epoch 13/50] step=2600 train_loss=0.0444 tok_s=60122.5 opt_steps=2600 +[epoch 13/50] step=2620 train_loss=0.0444 tok_s=60122.8 opt_steps=2620 +[epoch 13/50] step=2640 train_loss=0.0445 tok_s=60122.7 opt_steps=2640 +[epoch 13/50] step=2660 train_loss=0.0445 tok_s=60123.6 opt_steps=2660 +[epoch 13/50] step=2680 train_loss=0.0445 tok_s=60123.0 opt_steps=2680 +[epoch 13/50] step=2700 train_loss=0.0446 tok_s=60125.0 opt_steps=2700 +[epoch 13/50] step=2720 train_loss=0.0446 tok_s=60125.5 opt_steps=2720 +[epoch 13/50] step=2740 train_loss=0.0446 tok_s=60126.6 opt_steps=2740 +[epoch 13/50] step=2760 train_loss=0.0446 tok_s=60127.7 opt_steps=2760 +[epoch 13/50] step=2780 train_loss=0.0447 tok_s=60129.3 opt_steps=2780 +[epoch 13/50] step=2800 train_loss=0.0447 tok_s=60126.8 opt_steps=2800 +[epoch 13/50] step=2820 train_loss=0.0447 tok_s=60125.1 opt_steps=2820 +[epoch 13/50] step=2840 train_loss=0.0447 tok_s=60126.4 opt_steps=2840 +[epoch 13/50] step=2860 train_loss=0.0448 tok_s=60128.3 opt_steps=2860 +[epoch 13/50] step=2880 train_loss=0.0448 tok_s=60127.0 opt_steps=2880 +[epoch 13/50] step=2900 train_loss=0.0448 tok_s=60127.3 opt_steps=2900 +[epoch 13/50] step=2920 train_loss=0.0448 tok_s=60127.6 opt_steps=2920 +[epoch 13/50] step=2940 train_loss=0.0449 tok_s=60129.3 opt_steps=2940 +[epoch 13/50] step=2960 train_loss=0.0449 tok_s=60131.0 opt_steps=2960 +[epoch 13/50] step=2980 train_loss=0.0449 tok_s=60129.9 opt_steps=2980 +[epoch 13/50] step=3000 train_loss=0.0449 tok_s=60129.7 opt_steps=3000 +[epoch 13/50] step=3020 train_loss=0.0449 tok_s=60127.0 opt_steps=3020 +[epoch 13/50] step=3040 train_loss=0.0450 tok_s=60123.9 opt_steps=3040 +[epoch 13/50] step=3060 train_loss=0.0450 tok_s=60123.5 opt_steps=3060 +[epoch 13/50] step=3080 train_loss=0.0450 tok_s=60125.8 opt_steps=3080 +[epoch 13/50] step=3100 train_loss=0.0450 tok_s=60127.7 opt_steps=3100 +[epoch 13/50] step=3120 train_loss=0.0451 tok_s=60123.7 opt_steps=3120 +[epoch 13/50] step=3140 train_loss=0.0451 tok_s=60126.3 opt_steps=3140 +[epoch 13/50] step=3160 train_loss=0.0451 tok_s=60122.5 opt_steps=3160 +[epoch 13/50] step=3180 train_loss=0.0452 tok_s=60120.8 opt_steps=3180 +[epoch 13/50] step=3200 train_loss=0.0452 tok_s=60120.3 opt_steps=3200 +[epoch 13/50] step=3220 train_loss=0.0452 tok_s=60116.3 opt_steps=3220 +[epoch 13/50] step=3240 train_loss=0.0452 tok_s=60114.8 opt_steps=3240 +[epoch 13/50] step=3260 train_loss=0.0452 tok_s=60106.7 opt_steps=3260 +[epoch 13/50] train_loss=0.0453 val_skipped tok_s=60105.5 opt_steps=3273 +[epoch 14/50] step=20 train_loss=0.0352 tok_s=57672.7 opt_steps=20 +[epoch 14/50] step=40 train_loss=0.0345 tok_s=58429.2 opt_steps=40 +[epoch 14/50] step=60 train_loss=0.0347 tok_s=58925.5 opt_steps=60 +[epoch 14/50] step=80 train_loss=0.0345 tok_s=59386.3 opt_steps=80 +[epoch 14/50] step=100 train_loss=0.0347 tok_s=59530.4 opt_steps=100 +[epoch 14/50] step=120 train_loss=0.0346 tok_s=59628.0 opt_steps=120 +[epoch 14/50] step=140 train_loss=0.0346 tok_s=59645.9 opt_steps=140 +[epoch 14/50] step=160 train_loss=0.0345 tok_s=59766.2 opt_steps=160 +[epoch 14/50] step=180 train_loss=0.0346 tok_s=59768.4 opt_steps=180 +[epoch 14/50] step=200 train_loss=0.0347 tok_s=59790.1 opt_steps=200 +[epoch 14/50] step=220 train_loss=0.0349 tok_s=59813.9 opt_steps=220 +[epoch 14/50] step=240 train_loss=0.0350 tok_s=59845.3 opt_steps=240 +[epoch 14/50] step=260 train_loss=0.0350 tok_s=59836.6 opt_steps=260 +[epoch 14/50] step=280 train_loss=0.0350 tok_s=59842.4 opt_steps=280 +[epoch 14/50] step=300 train_loss=0.0351 tok_s=59848.2 opt_steps=300 +[epoch 14/50] step=320 train_loss=0.0352 tok_s=59859.4 opt_steps=320 +[epoch 14/50] step=340 train_loss=0.0353 tok_s=59884.6 opt_steps=340 +[epoch 14/50] step=360 train_loss=0.0354 tok_s=59872.4 opt_steps=360 +[epoch 14/50] step=380 train_loss=0.0354 tok_s=59875.7 opt_steps=380 +[epoch 14/50] step=400 train_loss=0.0355 tok_s=59890.3 opt_steps=400 +[epoch 14/50] step=420 train_loss=0.0355 tok_s=59871.4 opt_steps=420 +[epoch 14/50] step=440 train_loss=0.0357 tok_s=59885.8 opt_steps=440 +[epoch 14/50] step=460 train_loss=0.0357 tok_s=59858.9 opt_steps=460 +[epoch 14/50] step=480 train_loss=0.0358 tok_s=59855.0 opt_steps=480 +[epoch 14/50] step=500 train_loss=0.0359 tok_s=59863.7 opt_steps=500 +[epoch 14/50] step=520 train_loss=0.0360 tok_s=59868.5 opt_steps=520 +[epoch 14/50] step=540 train_loss=0.0360 tok_s=59880.6 opt_steps=540 +[epoch 14/50] step=560 train_loss=0.0361 tok_s=59889.4 opt_steps=560 +[epoch 14/50] step=580 train_loss=0.0362 tok_s=59906.4 opt_steps=580 +[epoch 14/50] step=600 train_loss=0.0363 tok_s=59902.1 opt_steps=600 +[epoch 14/50] step=620 train_loss=0.0363 tok_s=59928.7 opt_steps=620 +[epoch 14/50] step=640 train_loss=0.0364 tok_s=59946.8 opt_steps=640 +[epoch 14/50] step=660 train_loss=0.0364 tok_s=59965.5 opt_steps=660 +[epoch 14/50] step=680 train_loss=0.0365 tok_s=59971.5 opt_steps=680 +[epoch 14/50] step=700 train_loss=0.0365 tok_s=59986.3 opt_steps=700 +[epoch 14/50] step=720 train_loss=0.0366 tok_s=59994.8 opt_steps=720 +[epoch 14/50] step=740 train_loss=0.0367 tok_s=59991.7 opt_steps=740 +[epoch 14/50] step=760 train_loss=0.0367 tok_s=60008.1 opt_steps=760 +[epoch 14/50] step=780 train_loss=0.0368 tok_s=60016.9 opt_steps=780 +[epoch 14/50] step=800 train_loss=0.0368 tok_s=60031.0 opt_steps=800 +[epoch 14/50] step=820 train_loss=0.0370 tok_s=60041.4 opt_steps=820 +[epoch 14/50] step=840 train_loss=0.0370 tok_s=60053.4 opt_steps=840 +[epoch 14/50] step=860 train_loss=0.0370 tok_s=60057.6 opt_steps=860 +[epoch 14/50] step=880 train_loss=0.0371 tok_s=60055.0 opt_steps=880 +[epoch 14/50] step=900 train_loss=0.0372 tok_s=60066.0 opt_steps=900 +[epoch 14/50] step=920 train_loss=0.0372 tok_s=60070.1 opt_steps=920 +[epoch 14/50] step=940 train_loss=0.0373 tok_s=60069.6 opt_steps=940 +[epoch 14/50] step=960 train_loss=0.0373 tok_s=60073.0 opt_steps=960 +[epoch 14/50] step=980 train_loss=0.0374 tok_s=60069.6 opt_steps=980 +[epoch 14/50] step=1000 train_loss=0.0374 tok_s=60064.6 opt_steps=1000 +[epoch 14/50] step=1020 train_loss=0.0375 tok_s=60061.6 opt_steps=1020 +[epoch 14/50] step=1040 train_loss=0.0375 tok_s=60067.2 opt_steps=1040 +[epoch 14/50] step=1060 train_loss=0.0376 tok_s=60064.8 opt_steps=1060 +[epoch 14/50] step=1080 train_loss=0.0377 tok_s=60061.6 opt_steps=1080 +[epoch 14/50] step=1100 train_loss=0.0377 tok_s=60070.1 opt_steps=1100 +[epoch 14/50] step=1120 train_loss=0.0378 tok_s=60077.2 opt_steps=1120 +[epoch 14/50] step=1140 train_loss=0.0379 tok_s=60078.1 opt_steps=1140 +[epoch 14/50] step=1160 train_loss=0.0379 tok_s=60076.0 opt_steps=1160 +[epoch 14/50] step=1180 train_loss=0.0380 tok_s=60079.5 opt_steps=1180 +[epoch 14/50] step=1200 train_loss=0.0380 tok_s=60084.0 opt_steps=1200 +[epoch 14/50] step=1220 train_loss=0.0381 tok_s=60084.8 opt_steps=1220 +[epoch 14/50] step=1240 train_loss=0.0381 tok_s=60095.0 opt_steps=1240 +[epoch 14/50] step=1260 train_loss=0.0381 tok_s=60095.6 opt_steps=1260 +[epoch 14/50] step=1280 train_loss=0.0382 tok_s=60098.2 opt_steps=1280 +[epoch 14/50] step=1300 train_loss=0.0382 tok_s=60110.7 opt_steps=1300 +[epoch 14/50] step=1320 train_loss=0.0382 tok_s=60110.6 opt_steps=1320 +[epoch 14/50] step=1340 train_loss=0.0383 tok_s=60112.8 opt_steps=1340 +[epoch 14/50] step=1360 train_loss=0.0384 tok_s=60115.8 opt_steps=1360 +[epoch 14/50] step=1380 train_loss=0.0384 tok_s=60111.2 opt_steps=1380 +[epoch 14/50] step=1400 train_loss=0.0384 tok_s=60115.4 opt_steps=1400 +[epoch 14/50] step=1420 train_loss=0.0385 tok_s=60114.0 opt_steps=1420 +[epoch 14/50] step=1440 train_loss=0.0385 tok_s=60111.0 opt_steps=1440 +[epoch 14/50] step=1460 train_loss=0.0385 tok_s=60101.0 opt_steps=1460 +[epoch 14/50] step=1480 train_loss=0.0386 tok_s=60101.0 opt_steps=1480 +[epoch 14/50] step=1500 train_loss=0.0386 tok_s=60105.2 opt_steps=1500 +[epoch 14/50] step=1520 train_loss=0.0387 tok_s=60107.3 opt_steps=1520 +[epoch 14/50] step=1540 train_loss=0.0387 tok_s=60103.5 opt_steps=1540 +[epoch 14/50] step=1560 train_loss=0.0388 tok_s=60108.6 opt_steps=1560 +[epoch 14/50] step=1580 train_loss=0.0388 tok_s=60109.8 opt_steps=1580 +[epoch 14/50] step=1600 train_loss=0.0388 tok_s=60113.0 opt_steps=1600 +[epoch 14/50] step=1620 train_loss=0.0388 tok_s=60118.4 opt_steps=1620 +[epoch 14/50] step=1640 train_loss=0.0389 tok_s=60119.5 opt_steps=1640 +[epoch 14/50] step=1660 train_loss=0.0389 tok_s=60121.6 opt_steps=1660 +[epoch 14/50] step=1680 train_loss=0.0390 tok_s=60123.8 opt_steps=1680 +[epoch 14/50] step=1700 train_loss=0.0390 tok_s=60122.9 opt_steps=1700 +[epoch 14/50] step=1720 train_loss=0.0390 tok_s=60129.4 opt_steps=1720 +[epoch 14/50] step=1740 train_loss=0.0391 tok_s=60135.7 opt_steps=1740 +[epoch 14/50] step=1760 train_loss=0.0391 tok_s=60138.0 opt_steps=1760 +[epoch 14/50] step=1780 train_loss=0.0391 tok_s=60132.3 opt_steps=1780 +[epoch 14/50] step=1800 train_loss=0.0391 tok_s=60130.4 opt_steps=1800 +[epoch 14/50] step=1820 train_loss=0.0392 tok_s=60131.8 opt_steps=1820 +[epoch 14/50] step=1840 train_loss=0.0392 tok_s=60127.3 opt_steps=1840 +[epoch 14/50] step=1860 train_loss=0.0393 tok_s=60128.6 opt_steps=1860 +[epoch 14/50] step=1880 train_loss=0.0393 tok_s=60127.2 opt_steps=1880 +[epoch 14/50] step=1900 train_loss=0.0393 tok_s=60133.5 opt_steps=1900 +[epoch 14/50] step=1920 train_loss=0.0394 tok_s=60131.2 opt_steps=1920 +[epoch 14/50] step=1940 train_loss=0.0394 tok_s=60132.7 opt_steps=1940 +[epoch 14/50] step=1960 train_loss=0.0395 tok_s=60128.9 opt_steps=1960 +[epoch 14/50] step=1980 train_loss=0.0395 tok_s=60126.5 opt_steps=1980 +[epoch 14/50] step=2000 train_loss=0.0395 tok_s=60123.7 opt_steps=2000 +[epoch 14/50] step=2020 train_loss=0.0395 tok_s=60123.4 opt_steps=2020 +[epoch 14/50] step=2040 train_loss=0.0396 tok_s=60128.2 opt_steps=2040 +[epoch 14/50] step=2060 train_loss=0.0396 tok_s=60125.4 opt_steps=2060 +[epoch 14/50] step=2080 train_loss=0.0396 tok_s=60125.1 opt_steps=2080 +[epoch 14/50] step=2100 train_loss=0.0397 tok_s=60129.8 opt_steps=2100 +[epoch 14/50] step=2120 train_loss=0.0397 tok_s=60130.6 opt_steps=2120 +[epoch 14/50] step=2140 train_loss=0.0397 tok_s=60131.9 opt_steps=2140 +[epoch 14/50] step=2160 train_loss=0.0398 tok_s=60126.4 opt_steps=2160 +[epoch 14/50] step=2180 train_loss=0.0398 tok_s=60126.4 opt_steps=2180 +[epoch 14/50] step=2200 train_loss=0.0398 tok_s=60129.0 opt_steps=2200 +[epoch 14/50] step=2220 train_loss=0.0399 tok_s=60128.3 opt_steps=2220 +[epoch 14/50] step=2240 train_loss=0.0399 tok_s=60133.0 opt_steps=2240 +[epoch 14/50] step=2260 train_loss=0.0399 tok_s=60137.4 opt_steps=2260 +[epoch 14/50] step=2280 train_loss=0.0399 tok_s=60144.8 opt_steps=2280 +[epoch 14/50] step=2300 train_loss=0.0400 tok_s=60142.9 opt_steps=2300 +[epoch 14/50] step=2320 train_loss=0.0400 tok_s=60145.6 opt_steps=2320 +[epoch 14/50] step=2340 train_loss=0.0400 tok_s=60145.6 opt_steps=2340 +[epoch 14/50] step=2360 train_loss=0.0401 tok_s=60147.8 opt_steps=2360 +[epoch 14/50] step=2380 train_loss=0.0401 tok_s=60144.3 opt_steps=2380 +[epoch 14/50] step=2400 train_loss=0.0401 tok_s=60146.9 opt_steps=2400 +[epoch 14/50] step=2420 train_loss=0.0401 tok_s=60147.0 opt_steps=2420 +[epoch 14/50] step=2440 train_loss=0.0402 tok_s=60149.7 opt_steps=2440 +[epoch 14/50] step=2460 train_loss=0.0402 tok_s=60149.2 opt_steps=2460 +[epoch 14/50] step=2480 train_loss=0.0402 tok_s=60151.3 opt_steps=2480 +[epoch 14/50] step=2500 train_loss=0.0403 tok_s=60150.5 opt_steps=2500 +[epoch 14/50] step=2520 train_loss=0.0403 tok_s=60149.9 opt_steps=2520 +[epoch 14/50] step=2540 train_loss=0.0403 tok_s=60152.3 opt_steps=2540 +[epoch 14/50] step=2560 train_loss=0.0403 tok_s=60151.7 opt_steps=2560 +[epoch 14/50] step=2580 train_loss=0.0404 tok_s=60157.1 opt_steps=2580 +[epoch 14/50] step=2600 train_loss=0.0404 tok_s=60160.1 opt_steps=2600 +[epoch 14/50] step=2620 train_loss=0.0404 tok_s=60162.2 opt_steps=2620 +[epoch 14/50] step=2640 train_loss=0.0404 tok_s=60163.4 opt_steps=2640 +[epoch 14/50] step=2660 train_loss=0.0405 tok_s=60160.3 opt_steps=2660 +[epoch 14/50] step=2680 train_loss=0.0405 tok_s=60164.7 opt_steps=2680 +[epoch 14/50] step=2700 train_loss=0.0405 tok_s=60164.6 opt_steps=2700 +[epoch 14/50] step=2720 train_loss=0.0406 tok_s=60164.4 opt_steps=2720 +[epoch 14/50] step=2740 train_loss=0.0406 tok_s=60164.8 opt_steps=2740 +[epoch 14/50] step=2760 train_loss=0.0406 tok_s=60166.6 opt_steps=2760 +[epoch 14/50] step=2780 train_loss=0.0406 tok_s=60167.5 opt_steps=2780 +[epoch 14/50] step=2800 train_loss=0.0407 tok_s=60167.7 opt_steps=2800 +[epoch 14/50] step=2820 train_loss=0.0407 tok_s=60172.3 opt_steps=2820 +[epoch 14/50] step=2840 train_loss=0.0407 tok_s=60176.3 opt_steps=2840 +[epoch 14/50] step=2860 train_loss=0.0408 tok_s=60173.4 opt_steps=2860 +[epoch 14/50] step=2880 train_loss=0.0408 tok_s=60175.5 opt_steps=2880 +[epoch 14/50] step=2900 train_loss=0.0408 tok_s=60178.3 opt_steps=2900 +[epoch 14/50] step=2920 train_loss=0.0409 tok_s=60178.9 opt_steps=2920 +[epoch 14/50] step=2940 train_loss=0.0409 tok_s=60182.9 opt_steps=2940 +[epoch 14/50] step=2960 train_loss=0.0409 tok_s=60187.2 opt_steps=2960 +[epoch 14/50] step=2980 train_loss=0.0409 tok_s=60185.2 opt_steps=2980 +[epoch 14/50] step=3000 train_loss=0.0410 tok_s=60185.3 opt_steps=3000 +[epoch 14/50] step=3020 train_loss=0.0410 tok_s=60187.4 opt_steps=3020 +[epoch 14/50] step=3040 train_loss=0.0410 tok_s=60187.4 opt_steps=3040 +[epoch 14/50] step=3060 train_loss=0.0410 tok_s=60187.5 opt_steps=3060 +[epoch 14/50] step=3080 train_loss=0.0411 tok_s=60188.6 opt_steps=3080 +[epoch 14/50] step=3100 train_loss=0.0411 tok_s=60190.0 opt_steps=3100 +[epoch 14/50] step=3120 train_loss=0.0411 tok_s=60191.9 opt_steps=3120 +[epoch 14/50] step=3140 train_loss=0.0411 tok_s=60195.2 opt_steps=3140 +[epoch 14/50] step=3160 train_loss=0.0411 tok_s=60197.1 opt_steps=3160 +[epoch 14/50] step=3180 train_loss=0.0412 tok_s=60198.6 opt_steps=3180 +[epoch 14/50] step=3200 train_loss=0.0412 tok_s=60200.1 opt_steps=3200 +[epoch 14/50] step=3220 train_loss=0.0412 tok_s=60200.5 opt_steps=3220 +[epoch 14/50] step=3240 train_loss=0.0412 tok_s=60201.4 opt_steps=3240 +[epoch 14/50] step=3260 train_loss=0.0412 tok_s=60195.7 opt_steps=3260 +[epoch 14/50] train_loss=0.0413 val_skipped tok_s=60198.2 opt_steps=3273 +[epoch 15/50] step=20 train_loss=0.0317 tok_s=57978.8 opt_steps=20 +[epoch 15/50] step=40 train_loss=0.0313 tok_s=59080.5 opt_steps=40 +[epoch 15/50] step=60 train_loss=0.0318 tok_s=59647.4 opt_steps=60 +[epoch 15/50] step=80 train_loss=0.0318 tok_s=59760.2 opt_steps=80 +[epoch 15/50] step=100 train_loss=0.0316 tok_s=59687.3 opt_steps=100 +[epoch 15/50] step=120 train_loss=0.0317 tok_s=59714.7 opt_steps=120 +[epoch 15/50] step=140 train_loss=0.0317 tok_s=59826.0 opt_steps=140 +[epoch 15/50] step=160 train_loss=0.0320 tok_s=59896.9 opt_steps=160 +[epoch 15/50] step=180 train_loss=0.0319 tok_s=59937.5 opt_steps=180 +[epoch 15/50] step=200 train_loss=0.0320 tok_s=60020.3 opt_steps=200 +[epoch 15/50] step=220 train_loss=0.0320 tok_s=60044.2 opt_steps=220 +[epoch 15/50] step=240 train_loss=0.0320 tok_s=60077.6 opt_steps=240 +[epoch 15/50] step=260 train_loss=0.0320 tok_s=60088.2 opt_steps=260 +[epoch 15/50] step=280 train_loss=0.0320 tok_s=60081.2 opt_steps=280 +[epoch 15/50] step=300 train_loss=0.0320 tok_s=60048.1 opt_steps=300 +[epoch 15/50] step=320 train_loss=0.0320 tok_s=60038.3 opt_steps=320 +[epoch 15/50] step=340 train_loss=0.0321 tok_s=60031.0 opt_steps=340 +[epoch 15/50] step=360 train_loss=0.0321 tok_s=60026.4 opt_steps=360 +[epoch 15/50] step=380 train_loss=0.0321 tok_s=60030.0 opt_steps=380 +[epoch 15/50] step=400 train_loss=0.0322 tok_s=60042.8 opt_steps=400 +[epoch 15/50] step=420 train_loss=0.0323 tok_s=60030.9 opt_steps=420 +[epoch 15/50] step=440 train_loss=0.0324 tok_s=60026.1 opt_steps=440 +[epoch 15/50] step=460 train_loss=0.0324 tok_s=60024.5 opt_steps=460 +[epoch 15/50] step=480 train_loss=0.0325 tok_s=60031.9 opt_steps=480 +[epoch 15/50] step=500 train_loss=0.0326 tok_s=60055.7 opt_steps=500 +[epoch 15/50] step=520 train_loss=0.0326 tok_s=60073.5 opt_steps=520 +[epoch 15/50] step=540 train_loss=0.0327 tok_s=60065.1 opt_steps=540 +[epoch 15/50] step=560 train_loss=0.0328 tok_s=60069.3 opt_steps=560 +[epoch 15/50] step=580 train_loss=0.0328 tok_s=60056.8 opt_steps=580 +[epoch 15/50] step=600 train_loss=0.0329 tok_s=60040.0 opt_steps=600 +[epoch 15/50] step=620 train_loss=0.0330 tok_s=60019.0 opt_steps=620 +[epoch 15/50] step=640 train_loss=0.0331 tok_s=60019.4 opt_steps=640 +[epoch 15/50] step=660 train_loss=0.0332 tok_s=60027.9 opt_steps=660 +[epoch 15/50] step=680 train_loss=0.0332 tok_s=60028.7 opt_steps=680 +[epoch 15/50] step=700 train_loss=0.0333 tok_s=60035.6 opt_steps=700 +[epoch 15/50] step=720 train_loss=0.0334 tok_s=60045.0 opt_steps=720 +[epoch 15/50] step=740 train_loss=0.0334 tok_s=60051.5 opt_steps=740 +[epoch 15/50] step=760 train_loss=0.0335 tok_s=60049.0 opt_steps=760 +[epoch 15/50] step=780 train_loss=0.0335 tok_s=60059.7 opt_steps=780 +[epoch 15/50] step=800 train_loss=0.0336 tok_s=60075.1 opt_steps=800 +[epoch 15/50] step=820 train_loss=0.0336 tok_s=60074.5 opt_steps=820 +[epoch 15/50] step=840 train_loss=0.0336 tok_s=60067.1 opt_steps=840 +[epoch 15/50] step=860 train_loss=0.0337 tok_s=60061.3 opt_steps=860 +[epoch 15/50] step=880 train_loss=0.0338 tok_s=60052.7 opt_steps=880 +[epoch 15/50] step=900 train_loss=0.0338 tok_s=60045.8 opt_steps=900 +[epoch 15/50] step=920 train_loss=0.0339 tok_s=60058.5 opt_steps=920 +[epoch 15/50] step=940 train_loss=0.0339 tok_s=60049.7 opt_steps=940 +[epoch 15/50] step=960 train_loss=0.0340 tok_s=60049.5 opt_steps=960 +[epoch 15/50] step=980 train_loss=0.0341 tok_s=60057.9 opt_steps=980 +[epoch 15/50] step=1000 train_loss=0.0341 tok_s=60061.6 opt_steps=1000 +[epoch 15/50] step=1020 train_loss=0.0342 tok_s=60070.1 opt_steps=1020 +[epoch 15/50] step=1040 train_loss=0.0342 tok_s=60074.3 opt_steps=1040 +[epoch 15/50] step=1060 train_loss=0.0343 tok_s=60072.1 opt_steps=1060 +[epoch 15/50] step=1080 train_loss=0.0343 tok_s=60080.2 opt_steps=1080 +[epoch 15/50] step=1100 train_loss=0.0344 tok_s=60078.5 opt_steps=1100 +[epoch 15/50] step=1120 train_loss=0.0344 tok_s=60082.4 opt_steps=1120 +[epoch 15/50] step=1140 train_loss=0.0345 tok_s=60081.7 opt_steps=1140 +[epoch 15/50] step=1160 train_loss=0.0345 tok_s=60081.7 opt_steps=1160 +[epoch 15/50] step=1180 train_loss=0.0346 tok_s=60078.4 opt_steps=1180 +[epoch 15/50] step=1200 train_loss=0.0346 tok_s=60086.0 opt_steps=1200 +[epoch 15/50] step=1220 train_loss=0.0347 tok_s=60084.7 opt_steps=1220 +[epoch 15/50] step=1240 train_loss=0.0347 tok_s=60095.9 opt_steps=1240 +[epoch 15/50] step=1260 train_loss=0.0348 tok_s=60104.7 opt_steps=1260 +[epoch 15/50] step=1280 train_loss=0.0348 tok_s=60111.7 opt_steps=1280 +[epoch 15/50] step=1300 train_loss=0.0349 tok_s=60112.3 opt_steps=1300 +[epoch 15/50] step=1320 train_loss=0.0349 tok_s=60113.0 opt_steps=1320 +[epoch 15/50] step=1340 train_loss=0.0349 tok_s=60103.5 opt_steps=1340 +[epoch 15/50] step=1360 train_loss=0.0350 tok_s=60103.5 opt_steps=1360 +[epoch 15/50] step=1380 train_loss=0.0350 tok_s=60103.6 opt_steps=1380 +[epoch 15/50] step=1400 train_loss=0.0351 tok_s=60099.5 opt_steps=1400 +[epoch 15/50] step=1420 train_loss=0.0351 tok_s=60111.3 opt_steps=1420 +[epoch 15/50] step=1440 train_loss=0.0352 tok_s=60113.4 opt_steps=1440 +[epoch 15/50] step=1460 train_loss=0.0352 tok_s=60116.5 opt_steps=1460 +[epoch 15/50] step=1480 train_loss=0.0353 tok_s=60118.1 opt_steps=1480 +[epoch 15/50] step=1500 train_loss=0.0353 tok_s=60120.7 opt_steps=1500 +[epoch 15/50] step=1520 train_loss=0.0354 tok_s=60129.2 opt_steps=1520 +[epoch 15/50] step=1540 train_loss=0.0354 tok_s=60131.6 opt_steps=1540 +[epoch 15/50] step=1560 train_loss=0.0355 tok_s=60130.7 opt_steps=1560 +[epoch 15/50] step=1580 train_loss=0.0355 tok_s=60135.5 opt_steps=1580 +[epoch 15/50] step=1600 train_loss=0.0355 tok_s=60136.4 opt_steps=1600 +[epoch 15/50] step=1620 train_loss=0.0355 tok_s=60142.8 opt_steps=1620 +[epoch 15/50] step=1640 train_loss=0.0356 tok_s=60141.9 opt_steps=1640 +[epoch 15/50] step=1660 train_loss=0.0356 tok_s=60137.2 opt_steps=1660 +[epoch 15/50] step=1680 train_loss=0.0357 tok_s=60139.8 opt_steps=1680 +[epoch 15/50] step=1700 train_loss=0.0357 tok_s=60137.9 opt_steps=1700 +[epoch 15/50] step=1720 train_loss=0.0357 tok_s=60133.9 opt_steps=1720 +[epoch 15/50] step=1740 train_loss=0.0358 tok_s=60131.4 opt_steps=1740 +[epoch 15/50] step=1760 train_loss=0.0358 tok_s=60130.4 opt_steps=1760 +[epoch 15/50] step=1780 train_loss=0.0358 tok_s=60126.3 opt_steps=1780 +[epoch 15/50] step=1800 train_loss=0.0359 tok_s=60128.6 opt_steps=1800 +[epoch 15/50] step=1820 train_loss=0.0359 tok_s=60122.2 opt_steps=1820 +[epoch 15/50] step=1840 train_loss=0.0360 tok_s=60122.2 opt_steps=1840 +[epoch 15/50] step=1860 train_loss=0.0360 tok_s=60118.4 opt_steps=1860 +[epoch 15/50] step=1880 train_loss=0.0361 tok_s=60125.4 opt_steps=1880 +[epoch 15/50] step=1900 train_loss=0.0361 tok_s=60125.6 opt_steps=1900 +[epoch 15/50] step=1920 train_loss=0.0361 tok_s=60126.7 opt_steps=1920 +[epoch 15/50] step=1940 train_loss=0.0362 tok_s=60124.1 opt_steps=1940 +[epoch 15/50] step=1960 train_loss=0.0362 tok_s=60123.7 opt_steps=1960 +[epoch 15/50] step=1980 train_loss=0.0362 tok_s=60125.4 opt_steps=1980 +[epoch 15/50] step=2000 train_loss=0.0363 tok_s=60125.3 opt_steps=2000 +[epoch 15/50] step=2020 train_loss=0.0363 tok_s=60130.0 opt_steps=2020 +[epoch 15/50] step=2040 train_loss=0.0363 tok_s=60132.3 opt_steps=2040 +[epoch 15/50] step=2060 train_loss=0.0364 tok_s=60133.4 opt_steps=2060 +[epoch 15/50] step=2080 train_loss=0.0364 tok_s=60133.1 opt_steps=2080 +[epoch 15/50] step=2100 train_loss=0.0364 tok_s=60136.7 opt_steps=2100 +[epoch 15/50] step=2120 train_loss=0.0365 tok_s=60133.5 opt_steps=2120 +[epoch 15/50] step=2140 train_loss=0.0365 tok_s=60127.2 opt_steps=2140 +[epoch 15/50] step=2160 train_loss=0.0365 tok_s=60124.0 opt_steps=2160 +[epoch 15/50] step=2180 train_loss=0.0366 tok_s=60125.8 opt_steps=2180 +[epoch 15/50] step=2200 train_loss=0.0366 tok_s=60125.2 opt_steps=2200 +[epoch 15/50] step=2220 train_loss=0.0366 tok_s=60130.9 opt_steps=2220 +[epoch 15/50] step=2240 train_loss=0.0366 tok_s=60131.4 opt_steps=2240 +[epoch 15/50] step=2260 train_loss=0.0367 tok_s=60133.4 opt_steps=2260 +[epoch 15/50] step=2280 train_loss=0.0367 tok_s=60138.3 opt_steps=2280 +[epoch 15/50] step=2300 train_loss=0.0367 tok_s=60140.8 opt_steps=2300 +[epoch 15/50] step=2320 train_loss=0.0368 tok_s=60140.4 opt_steps=2320 +[epoch 15/50] step=2340 train_loss=0.0368 tok_s=60143.6 opt_steps=2340 +[epoch 15/50] step=2360 train_loss=0.0368 tok_s=60145.3 opt_steps=2360 +[epoch 15/50] step=2380 train_loss=0.0369 tok_s=60149.5 opt_steps=2380 +[epoch 15/50] step=2400 train_loss=0.0369 tok_s=60147.5 opt_steps=2400 +[epoch 15/50] step=2420 train_loss=0.0369 tok_s=60147.5 opt_steps=2420 +[epoch 15/50] step=2440 train_loss=0.0369 tok_s=60147.6 opt_steps=2440 +[epoch 15/50] step=2460 train_loss=0.0370 tok_s=60142.6 opt_steps=2460 +[epoch 15/50] step=2480 train_loss=0.0370 tok_s=60141.2 opt_steps=2480 +[epoch 15/50] step=2500 train_loss=0.0370 tok_s=60139.4 opt_steps=2500 +[epoch 15/50] step=2520 train_loss=0.0371 tok_s=60141.9 opt_steps=2520 +[epoch 15/50] step=2540 train_loss=0.0371 tok_s=60140.7 opt_steps=2540 +[epoch 15/50] step=2560 train_loss=0.0371 tok_s=60141.4 opt_steps=2560 +[epoch 15/50] step=2580 train_loss=0.0371 tok_s=60142.0 opt_steps=2580 +[epoch 15/50] step=2600 train_loss=0.0371 tok_s=60142.2 opt_steps=2600 +[epoch 15/50] step=2620 train_loss=0.0372 tok_s=60145.2 opt_steps=2620 +[epoch 15/50] step=2640 train_loss=0.0372 tok_s=60148.1 opt_steps=2640 +[epoch 15/50] step=2660 train_loss=0.0372 tok_s=60152.0 opt_steps=2660 +[epoch 15/50] step=2680 train_loss=0.0373 tok_s=60155.4 opt_steps=2680 +[epoch 15/50] step=2700 train_loss=0.0373 tok_s=60156.8 opt_steps=2700 +[epoch 15/50] step=2720 train_loss=0.0373 tok_s=60155.0 opt_steps=2720 +[epoch 15/50] step=2740 train_loss=0.0373 tok_s=60157.4 opt_steps=2740 +[epoch 15/50] step=2760 train_loss=0.0374 tok_s=60157.9 opt_steps=2760 +[epoch 15/50] step=2780 train_loss=0.0374 tok_s=60156.6 opt_steps=2780 +[epoch 15/50] step=2800 train_loss=0.0374 tok_s=60156.8 opt_steps=2800 +[epoch 15/50] step=2820 train_loss=0.0374 tok_s=60157.6 opt_steps=2820 +[epoch 15/50] step=2840 train_loss=0.0375 tok_s=60159.5 opt_steps=2840 +[epoch 15/50] step=2860 train_loss=0.0375 tok_s=60160.9 opt_steps=2860 +[epoch 15/50] step=2880 train_loss=0.0375 tok_s=60161.8 opt_steps=2880 +[epoch 15/50] step=2900 train_loss=0.0375 tok_s=60163.1 opt_steps=2900 +[epoch 15/50] step=2920 train_loss=0.0376 tok_s=60162.0 opt_steps=2920 +[epoch 15/50] step=2940 train_loss=0.0376 tok_s=60164.1 opt_steps=2940 +[epoch 15/50] step=2960 train_loss=0.0376 tok_s=60166.6 opt_steps=2960 +[epoch 15/50] step=2980 train_loss=0.0376 tok_s=60168.4 opt_steps=2980 +[epoch 15/50] step=3000 train_loss=0.0376 tok_s=60170.7 opt_steps=3000 +[epoch 15/50] step=3020 train_loss=0.0377 tok_s=60169.7 opt_steps=3020 +[epoch 15/50] step=3040 train_loss=0.0377 tok_s=60165.3 opt_steps=3040 +[epoch 15/50] step=3060 train_loss=0.0377 tok_s=60160.6 opt_steps=3060 +[epoch 15/50] step=3080 train_loss=0.0378 tok_s=60154.0 opt_steps=3080 +[epoch 15/50] step=3100 train_loss=0.0378 tok_s=60151.3 opt_steps=3100 +[epoch 15/50] step=3120 train_loss=0.0378 tok_s=60147.2 opt_steps=3120 +[epoch 15/50] step=3140 train_loss=0.0378 tok_s=60142.6 opt_steps=3140 +[epoch 15/50] step=3160 train_loss=0.0378 tok_s=60140.2 opt_steps=3160 +[epoch 15/50] step=3180 train_loss=0.0379 tok_s=60135.8 opt_steps=3180 +[epoch 15/50] step=3200 train_loss=0.0379 tok_s=60130.6 opt_steps=3200 +[epoch 15/50] step=3220 train_loss=0.0379 tok_s=60120.4 opt_steps=3220 +[epoch 15/50] step=3240 train_loss=0.0379 tok_s=60109.7 opt_steps=3240 +[epoch 15/50] step=3260 train_loss=0.0379 tok_s=60099.6 opt_steps=3260 +[epoch 15/50] train_loss=0.0379 val_skipped tok_s=60100.8 opt_steps=3273 +[epoch 16/50] step=20 train_loss=0.0294 tok_s=56900.6 opt_steps=20 +[epoch 16/50] step=40 train_loss=0.0282 tok_s=57896.4 opt_steps=40 +[epoch 16/50] step=60 train_loss=0.0283 tok_s=58078.1 opt_steps=60 +[epoch 16/50] step=80 train_loss=0.0284 tok_s=58424.3 opt_steps=80 +[epoch 16/50] step=100 train_loss=0.0284 tok_s=58820.6 opt_steps=100 +[epoch 16/50] step=120 train_loss=0.0285 tok_s=59147.3 opt_steps=120 +[epoch 16/50] step=140 train_loss=0.0284 tok_s=59343.9 opt_steps=140 +[epoch 16/50] step=160 train_loss=0.0286 tok_s=59469.2 opt_steps=160 +[epoch 16/50] step=180 train_loss=0.0286 tok_s=59571.5 opt_steps=180 +[epoch 16/50] step=200 train_loss=0.0288 tok_s=59640.2 opt_steps=200 +[epoch 16/50] step=220 train_loss=0.0287 tok_s=59716.4 opt_steps=220 +[epoch 16/50] step=240 train_loss=0.0288 tok_s=59730.3 opt_steps=240 +[epoch 16/50] step=260 train_loss=0.0289 tok_s=59767.2 opt_steps=260 +[epoch 16/50] step=280 train_loss=0.0289 tok_s=59792.8 opt_steps=280 +[epoch 16/50] step=300 train_loss=0.0289 tok_s=59825.4 opt_steps=300 +[epoch 16/50] step=320 train_loss=0.0291 tok_s=59858.9 opt_steps=320 +[epoch 16/50] step=340 train_loss=0.0292 tok_s=59896.2 opt_steps=340 +[epoch 16/50] step=360 train_loss=0.0292 tok_s=59930.5 opt_steps=360 +[epoch 16/50] step=380 train_loss=0.0294 tok_s=59949.7 opt_steps=380 +[epoch 16/50] step=400 train_loss=0.0294 tok_s=59984.2 opt_steps=400 +[epoch 16/50] step=420 train_loss=0.0295 tok_s=59992.3 opt_steps=420 +[epoch 16/50] step=440 train_loss=0.0296 tok_s=60013.2 opt_steps=440 +[epoch 16/50] step=460 train_loss=0.0297 tok_s=60026.9 opt_steps=460 +[epoch 16/50] step=480 train_loss=0.0298 tok_s=60072.0 opt_steps=480 +[epoch 16/50] step=500 train_loss=0.0299 tok_s=60093.5 opt_steps=500 +[epoch 16/50] step=520 train_loss=0.0299 tok_s=60122.8 opt_steps=520 +[epoch 16/50] step=540 train_loss=0.0300 tok_s=60117.5 opt_steps=540 +[epoch 16/50] step=560 train_loss=0.0301 tok_s=60119.6 opt_steps=560 +[epoch 16/50] step=580 train_loss=0.0302 tok_s=60110.8 opt_steps=580 +[epoch 16/50] step=600 train_loss=0.0302 tok_s=60114.0 opt_steps=600 +[epoch 16/50] step=620 train_loss=0.0303 tok_s=60111.2 opt_steps=620 +[epoch 16/50] step=640 train_loss=0.0303 tok_s=60112.1 opt_steps=640 +[epoch 16/50] step=660 train_loss=0.0304 tok_s=60119.6 opt_steps=660 +[epoch 16/50] step=680 train_loss=0.0304 tok_s=60114.4 opt_steps=680 +[epoch 16/50] step=700 train_loss=0.0305 tok_s=60128.3 opt_steps=700 +[epoch 16/50] step=720 train_loss=0.0306 tok_s=60142.4 opt_steps=720 +[epoch 16/50] step=740 train_loss=0.0307 tok_s=60148.4 opt_steps=740 +[epoch 16/50] step=760 train_loss=0.0307 tok_s=60145.1 opt_steps=760 +[epoch 16/50] step=780 train_loss=0.0308 tok_s=60149.2 opt_steps=780 +[epoch 16/50] step=800 train_loss=0.0308 tok_s=60159.7 opt_steps=800 +[epoch 16/50] step=820 train_loss=0.0309 tok_s=60170.6 opt_steps=820 +[epoch 16/50] step=840 train_loss=0.0309 tok_s=60177.5 opt_steps=840 +[epoch 16/50] step=860 train_loss=0.0309 tok_s=60180.4 opt_steps=860 +[epoch 16/50] step=880 train_loss=0.0310 tok_s=60182.6 opt_steps=880 +[epoch 16/50] step=900 train_loss=0.0311 tok_s=60182.5 opt_steps=900 +[epoch 16/50] step=920 train_loss=0.0311 tok_s=60174.1 opt_steps=920 +[epoch 16/50] step=940 train_loss=0.0312 tok_s=60161.3 opt_steps=940 +[epoch 16/50] step=960 train_loss=0.0313 tok_s=60151.8 opt_steps=960 +[epoch 16/50] step=980 train_loss=0.0313 tok_s=60147.9 opt_steps=980 +[epoch 16/50] step=1000 train_loss=0.0314 tok_s=60153.9 opt_steps=1000 +[epoch 16/50] step=1020 train_loss=0.0314 tok_s=60148.7 opt_steps=1020 +[epoch 16/50] step=1040 train_loss=0.0314 tok_s=60141.4 opt_steps=1040 +[epoch 16/50] step=1060 train_loss=0.0315 tok_s=60134.8 opt_steps=1060 +[epoch 16/50] step=1080 train_loss=0.0315 tok_s=60142.4 opt_steps=1080 +[epoch 16/50] step=1100 train_loss=0.0315 tok_s=60142.6 opt_steps=1100 +[epoch 16/50] step=1120 train_loss=0.0316 tok_s=60139.1 opt_steps=1120 +[epoch 16/50] step=1140 train_loss=0.0316 tok_s=60135.0 opt_steps=1140 +[epoch 16/50] step=1160 train_loss=0.0317 tok_s=60145.8 opt_steps=1160 +[epoch 16/50] step=1180 train_loss=0.0318 tok_s=60143.3 opt_steps=1180 +[epoch 16/50] step=1200 train_loss=0.0318 tok_s=60150.3 opt_steps=1200 +[epoch 16/50] step=1220 train_loss=0.0319 tok_s=60151.3 opt_steps=1220 +[epoch 16/50] step=1240 train_loss=0.0319 tok_s=60150.7 opt_steps=1240 +[epoch 16/50] step=1260 train_loss=0.0319 tok_s=60155.5 opt_steps=1260 +[epoch 16/50] step=1280 train_loss=0.0320 tok_s=60148.2 opt_steps=1280 +[epoch 16/50] step=1300 train_loss=0.0321 tok_s=60144.2 opt_steps=1300 +[epoch 16/50] step=1320 train_loss=0.0321 tok_s=60147.1 opt_steps=1320 +[epoch 16/50] step=1340 train_loss=0.0322 tok_s=60147.4 opt_steps=1340 +[epoch 16/50] step=1360 train_loss=0.0322 tok_s=60143.4 opt_steps=1360 +[epoch 16/50] step=1380 train_loss=0.0323 tok_s=60143.5 opt_steps=1380 +[epoch 16/50] step=1400 train_loss=0.0323 tok_s=60143.0 opt_steps=1400 +[epoch 16/50] step=1420 train_loss=0.0324 tok_s=60148.4 opt_steps=1420 +[epoch 16/50] step=1440 train_loss=0.0324 tok_s=60146.1 opt_steps=1440 +[epoch 16/50] step=1460 train_loss=0.0324 tok_s=60146.6 opt_steps=1460 +[epoch 16/50] step=1480 train_loss=0.0325 tok_s=60152.7 opt_steps=1480 +[epoch 16/50] step=1500 train_loss=0.0325 tok_s=60147.8 opt_steps=1500 +[epoch 16/50] step=1520 train_loss=0.0326 tok_s=60144.9 opt_steps=1520 +[epoch 16/50] step=1540 train_loss=0.0326 tok_s=60153.6 opt_steps=1540 +[epoch 16/50] step=1560 train_loss=0.0327 tok_s=60156.1 opt_steps=1560 +[epoch 16/50] step=1580 train_loss=0.0327 tok_s=60155.6 opt_steps=1580 +[epoch 16/50] step=1600 train_loss=0.0327 tok_s=60154.9 opt_steps=1600 +[epoch 16/50] step=1620 train_loss=0.0328 tok_s=60152.8 opt_steps=1620 +[epoch 16/50] step=1640 train_loss=0.0328 tok_s=60148.0 opt_steps=1640 +[epoch 16/50] step=1660 train_loss=0.0328 tok_s=60148.8 opt_steps=1660 +[epoch 16/50] step=1680 train_loss=0.0329 tok_s=60152.3 opt_steps=1680 +[epoch 16/50] step=1700 train_loss=0.0329 tok_s=60154.5 opt_steps=1700 +[epoch 16/50] step=1720 train_loss=0.0330 tok_s=60154.7 opt_steps=1720 +[epoch 16/50] step=1740 train_loss=0.0330 tok_s=60149.7 opt_steps=1740 +[epoch 16/50] step=1760 train_loss=0.0330 tok_s=60150.0 opt_steps=1760 +[epoch 16/50] step=1780 train_loss=0.0331 tok_s=60145.7 opt_steps=1780 +[epoch 16/50] step=1800 train_loss=0.0331 tok_s=60147.3 opt_steps=1800 +[epoch 16/50] step=1820 train_loss=0.0332 tok_s=60145.8 opt_steps=1820 +[epoch 16/50] step=1840 train_loss=0.0332 tok_s=60147.1 opt_steps=1840 +[epoch 16/50] step=1860 train_loss=0.0332 tok_s=60143.8 opt_steps=1860 +[epoch 16/50] step=1880 train_loss=0.0333 tok_s=60136.0 opt_steps=1880 +[epoch 16/50] step=1900 train_loss=0.0333 tok_s=60140.4 opt_steps=1900 +[epoch 16/50] step=1920 train_loss=0.0333 tok_s=60142.9 opt_steps=1920 +[epoch 16/50] step=1940 train_loss=0.0334 tok_s=60142.5 opt_steps=1940 +[epoch 16/50] step=1960 train_loss=0.0334 tok_s=60143.1 opt_steps=1960 +[epoch 16/50] step=1980 train_loss=0.0334 tok_s=60143.9 opt_steps=1980 +[epoch 16/50] step=2000 train_loss=0.0335 tok_s=60144.9 opt_steps=2000 +[epoch 16/50] step=2020 train_loss=0.0335 tok_s=60147.5 opt_steps=2020 +[epoch 16/50] step=2040 train_loss=0.0336 tok_s=60151.2 opt_steps=2040 +[epoch 16/50] step=2060 train_loss=0.0336 tok_s=60149.5 opt_steps=2060 +[epoch 16/50] step=2080 train_loss=0.0336 tok_s=60140.6 opt_steps=2080 +[epoch 16/50] step=2100 train_loss=0.0337 tok_s=60136.7 opt_steps=2100 +[epoch 16/50] step=2120 train_loss=0.0337 tok_s=60131.8 opt_steps=2120 +[epoch 16/50] step=2140 train_loss=0.0337 tok_s=60114.5 opt_steps=2140 +[epoch 16/50] step=2160 train_loss=0.0338 tok_s=60117.3 opt_steps=2160 +[epoch 16/50] step=2180 train_loss=0.0338 tok_s=60120.1 opt_steps=2180 +[epoch 16/50] step=2200 train_loss=0.0338 tok_s=60121.0 opt_steps=2200 +[epoch 16/50] step=2220 train_loss=0.0338 tok_s=60122.7 opt_steps=2220 +[epoch 16/50] step=2240 train_loss=0.0338 tok_s=60126.4 opt_steps=2240 +[epoch 16/50] step=2260 train_loss=0.0338 tok_s=60129.0 opt_steps=2260 +[epoch 16/50] step=2280 train_loss=0.0339 tok_s=60128.7 opt_steps=2280 +[epoch 16/50] step=2300 train_loss=0.0339 tok_s=60130.4 opt_steps=2300 +[epoch 16/50] step=2320 train_loss=0.0339 tok_s=60129.1 opt_steps=2320 +[epoch 16/50] step=2340 train_loss=0.0340 tok_s=60133.7 opt_steps=2340 +[epoch 16/50] step=2360 train_loss=0.0340 tok_s=60142.9 opt_steps=2360 +[epoch 16/50] step=2380 train_loss=0.0340 tok_s=60144.6 opt_steps=2380 +[epoch 16/50] step=2400 train_loss=0.0340 tok_s=60146.0 opt_steps=2400 +[epoch 16/50] step=2420 train_loss=0.0341 tok_s=60148.9 opt_steps=2420 +[epoch 16/50] step=2440 train_loss=0.0341 tok_s=60150.6 opt_steps=2440 +[epoch 16/50] step=2460 train_loss=0.0341 tok_s=60151.4 opt_steps=2460 +[epoch 16/50] step=2480 train_loss=0.0341 tok_s=60158.3 opt_steps=2480 +[epoch 16/50] step=2500 train_loss=0.0341 tok_s=60161.0 opt_steps=2500 +[epoch 16/50] step=2520 train_loss=0.0342 tok_s=60164.3 opt_steps=2520 +[epoch 16/50] step=2540 train_loss=0.0342 tok_s=60160.1 opt_steps=2540 +[epoch 16/50] step=2560 train_loss=0.0342 tok_s=60159.0 opt_steps=2560 +[epoch 16/50] step=2580 train_loss=0.0343 tok_s=60160.8 opt_steps=2580 +[epoch 16/50] step=2600 train_loss=0.0343 tok_s=60159.0 opt_steps=2600 +[epoch 16/50] step=2620 train_loss=0.0343 tok_s=60156.3 opt_steps=2620 +[epoch 16/50] step=2640 train_loss=0.0343 tok_s=60154.4 opt_steps=2640 +[epoch 16/50] step=2660 train_loss=0.0344 tok_s=60155.6 opt_steps=2660 +[epoch 16/50] step=2680 train_loss=0.0344 tok_s=60151.6 opt_steps=2680 +[epoch 16/50] step=2700 train_loss=0.0344 tok_s=60149.3 opt_steps=2700 +[epoch 16/50] step=2720 train_loss=0.0344 tok_s=60149.3 opt_steps=2720 +[epoch 16/50] step=2740 train_loss=0.0345 tok_s=60148.8 opt_steps=2740 +[epoch 16/50] step=2760 train_loss=0.0345 tok_s=60150.3 opt_steps=2760 +[epoch 16/50] step=2780 train_loss=0.0345 tok_s=60152.2 opt_steps=2780 +[epoch 16/50] step=2800 train_loss=0.0345 tok_s=60151.8 opt_steps=2800 +[epoch 16/50] step=2820 train_loss=0.0346 tok_s=60150.1 opt_steps=2820 +[epoch 16/50] step=2840 train_loss=0.0346 tok_s=60154.7 opt_steps=2840 +[epoch 16/50] step=2860 train_loss=0.0346 tok_s=60156.6 opt_steps=2860 +[epoch 16/50] step=2880 train_loss=0.0346 tok_s=60155.4 opt_steps=2880 +[epoch 16/50] step=2900 train_loss=0.0346 tok_s=60155.7 opt_steps=2900 +[epoch 16/50] step=2920 train_loss=0.0347 tok_s=60157.1 opt_steps=2920 +[epoch 16/50] step=2940 train_loss=0.0347 tok_s=60158.4 opt_steps=2940 +[epoch 16/50] step=2960 train_loss=0.0347 tok_s=60160.5 opt_steps=2960 +[epoch 16/50] step=2980 train_loss=0.0347 tok_s=60162.6 opt_steps=2980 +[epoch 16/50] step=3000 train_loss=0.0348 tok_s=60162.1 opt_steps=3000 +[epoch 16/50] step=3020 train_loss=0.0348 tok_s=60164.4 opt_steps=3020 +[epoch 16/50] step=3040 train_loss=0.0348 tok_s=60167.8 opt_steps=3040 +[epoch 16/50] step=3060 train_loss=0.0348 tok_s=60168.9 opt_steps=3060 +[epoch 16/50] step=3080 train_loss=0.0348 tok_s=60170.0 opt_steps=3080 +[epoch 16/50] step=3100 train_loss=0.0349 tok_s=60170.5 opt_steps=3100 +[epoch 16/50] step=3120 train_loss=0.0349 tok_s=60171.0 opt_steps=3120 +[epoch 16/50] step=3140 train_loss=0.0349 tok_s=60172.6 opt_steps=3140 +[epoch 16/50] step=3160 train_loss=0.0349 tok_s=60175.0 opt_steps=3160 +[epoch 16/50] step=3180 train_loss=0.0349 tok_s=60173.9 opt_steps=3180 +[epoch 16/50] step=3200 train_loss=0.0350 tok_s=60176.1 opt_steps=3200 +[epoch 16/50] step=3220 train_loss=0.0350 tok_s=60178.7 opt_steps=3220 +[epoch 16/50] step=3240 train_loss=0.0350 tok_s=60182.2 opt_steps=3240 +[epoch 16/50] step=3260 train_loss=0.0350 tok_s=60176.7 opt_steps=3260 +[epoch 16/50] train_loss=0.0351 val_skipped tok_s=60176.1 opt_steps=3273 +[epoch 17/50] step=20 train_loss=0.0269 tok_s=59342.6 opt_steps=20 +[epoch 17/50] step=40 train_loss=0.0270 tok_s=59665.4 opt_steps=40 +[epoch 17/50] step=60 train_loss=0.0266 tok_s=59997.3 opt_steps=60 +[epoch 17/50] step=80 train_loss=0.0266 tok_s=59980.3 opt_steps=80 +[epoch 17/50] step=100 train_loss=0.0267 tok_s=60069.3 opt_steps=100 +[epoch 17/50] step=120 train_loss=0.0267 tok_s=60047.6 opt_steps=120 +[epoch 17/50] step=140 train_loss=0.0267 tok_s=60047.2 opt_steps=140 +[epoch 17/50] step=160 train_loss=0.0268 tok_s=60073.4 opt_steps=160 +[epoch 17/50] step=180 train_loss=0.0268 tok_s=60131.7 opt_steps=180 +[epoch 17/50] step=200 train_loss=0.0269 tok_s=60146.7 opt_steps=200 +[epoch 17/50] step=220 train_loss=0.0269 tok_s=60163.9 opt_steps=220 +[epoch 17/50] step=240 train_loss=0.0269 tok_s=60238.7 opt_steps=240 +[epoch 17/50] step=260 train_loss=0.0270 tok_s=60226.9 opt_steps=260 +[epoch 17/50] step=280 train_loss=0.0271 tok_s=60216.1 opt_steps=280 +[epoch 17/50] step=300 train_loss=0.0272 tok_s=60217.0 opt_steps=300 +[epoch 17/50] step=320 train_loss=0.0273 tok_s=60225.8 opt_steps=320 +[epoch 17/50] step=340 train_loss=0.0274 tok_s=60245.2 opt_steps=340 +[epoch 17/50] step=360 train_loss=0.0274 tok_s=60255.6 opt_steps=360 +[epoch 17/50] step=380 train_loss=0.0275 tok_s=60242.3 opt_steps=380 +[epoch 17/50] step=400 train_loss=0.0275 tok_s=60239.2 opt_steps=400 +[epoch 17/50] step=420 train_loss=0.0275 tok_s=60238.5 opt_steps=420 +[epoch 17/50] step=440 train_loss=0.0275 tok_s=60234.2 opt_steps=440 +[epoch 17/50] step=460 train_loss=0.0276 tok_s=60249.5 opt_steps=460 +[epoch 17/50] step=480 train_loss=0.0277 tok_s=60263.8 opt_steps=480 +[epoch 17/50] step=500 train_loss=0.0278 tok_s=60256.6 opt_steps=500 +[epoch 17/50] step=520 train_loss=0.0278 tok_s=60247.9 opt_steps=520 +[epoch 17/50] step=540 train_loss=0.0279 tok_s=60241.0 opt_steps=540 +[epoch 17/50] step=560 train_loss=0.0280 tok_s=60230.5 opt_steps=560 +[epoch 17/50] step=580 train_loss=0.0280 tok_s=60236.2 opt_steps=580 +[epoch 17/50] step=600 train_loss=0.0281 tok_s=60246.4 opt_steps=600 +[epoch 17/50] step=620 train_loss=0.0282 tok_s=60243.3 opt_steps=620 +[epoch 17/50] step=640 train_loss=0.0282 tok_s=60248.3 opt_steps=640 +[epoch 17/50] step=660 train_loss=0.0283 tok_s=60260.4 opt_steps=660 +[epoch 17/50] step=680 train_loss=0.0283 tok_s=60259.1 opt_steps=680 +[epoch 17/50] step=700 train_loss=0.0284 tok_s=60257.5 opt_steps=700 +[epoch 17/50] step=720 train_loss=0.0285 tok_s=60283.4 opt_steps=720 +[epoch 17/50] step=740 train_loss=0.0286 tok_s=60286.3 opt_steps=740 +[epoch 17/50] step=760 train_loss=0.0286 tok_s=60290.1 opt_steps=760 +[epoch 17/50] step=780 train_loss=0.0286 tok_s=60297.0 opt_steps=780 +[epoch 17/50] step=800 train_loss=0.0287 tok_s=60306.1 opt_steps=800 +[epoch 17/50] step=820 train_loss=0.0287 tok_s=60314.0 opt_steps=820 +[epoch 17/50] step=840 train_loss=0.0287 tok_s=60309.0 opt_steps=840 +[epoch 17/50] step=860 train_loss=0.0288 tok_s=60310.4 opt_steps=860 +[epoch 17/50] step=880 train_loss=0.0288 tok_s=60316.5 opt_steps=880 +[epoch 17/50] step=900 train_loss=0.0289 tok_s=60323.5 opt_steps=900 +[epoch 17/50] step=920 train_loss=0.0289 tok_s=60328.1 opt_steps=920 +[epoch 17/50] step=940 train_loss=0.0290 tok_s=60320.6 opt_steps=940 +[epoch 17/50] step=960 train_loss=0.0290 tok_s=60311.9 opt_steps=960 +[epoch 17/50] step=980 train_loss=0.0291 tok_s=60328.9 opt_steps=980 +[epoch 17/50] step=1000 train_loss=0.0291 tok_s=60326.8 opt_steps=1000 +[epoch 17/50] step=1020 train_loss=0.0291 tok_s=60323.6 opt_steps=1020 +[epoch 17/50] step=1040 train_loss=0.0292 tok_s=60332.4 opt_steps=1040 +[epoch 17/50] step=1060 train_loss=0.0292 tok_s=60331.2 opt_steps=1060 +[epoch 17/50] step=1080 train_loss=0.0293 tok_s=60328.5 opt_steps=1080 +[epoch 17/50] step=1100 train_loss=0.0293 tok_s=60333.7 opt_steps=1100 +[epoch 17/50] step=1120 train_loss=0.0293 tok_s=60332.5 opt_steps=1120 +[epoch 17/50] step=1140 train_loss=0.0294 tok_s=60334.3 opt_steps=1140 +[epoch 17/50] step=1160 train_loss=0.0294 tok_s=60332.9 opt_steps=1160 +[epoch 17/50] step=1180 train_loss=0.0295 tok_s=60325.2 opt_steps=1180 +[epoch 17/50] step=1200 train_loss=0.0295 tok_s=60332.6 opt_steps=1200 +[epoch 17/50] step=1220 train_loss=0.0296 tok_s=60337.7 opt_steps=1220 +[epoch 17/50] step=1240 train_loss=0.0297 tok_s=60336.9 opt_steps=1240 +[epoch 17/50] step=1260 train_loss=0.0297 tok_s=60332.5 opt_steps=1260 +[epoch 17/50] step=1280 train_loss=0.0297 tok_s=60328.5 opt_steps=1280 +[epoch 17/50] step=1300 train_loss=0.0298 tok_s=60328.4 opt_steps=1300 +[epoch 17/50] step=1320 train_loss=0.0298 tok_s=60320.9 opt_steps=1320 +[epoch 17/50] step=1340 train_loss=0.0299 tok_s=60320.3 opt_steps=1340 +[epoch 17/50] step=1360 train_loss=0.0299 tok_s=60323.3 opt_steps=1360 +[epoch 17/50] step=1380 train_loss=0.0299 tok_s=60320.0 opt_steps=1380 +[epoch 17/50] step=1400 train_loss=0.0300 tok_s=60322.1 opt_steps=1400 +[epoch 17/50] step=1420 train_loss=0.0300 tok_s=60323.9 opt_steps=1420 +[epoch 17/50] step=1440 train_loss=0.0300 tok_s=60318.7 opt_steps=1440 +[epoch 17/50] step=1460 train_loss=0.0301 tok_s=60316.5 opt_steps=1460 +[epoch 17/50] step=1480 train_loss=0.0301 tok_s=60320.2 opt_steps=1480 +[epoch 17/50] step=1500 train_loss=0.0302 tok_s=60317.5 opt_steps=1500 +[epoch 17/50] step=1520 train_loss=0.0302 tok_s=60313.2 opt_steps=1520 +[epoch 17/50] step=1540 train_loss=0.0302 tok_s=60311.7 opt_steps=1540 +[epoch 17/50] step=1560 train_loss=0.0302 tok_s=60315.3 opt_steps=1560 +[epoch 17/50] step=1580 train_loss=0.0303 tok_s=60316.3 opt_steps=1580 +[epoch 17/50] step=1600 train_loss=0.0303 tok_s=60315.9 opt_steps=1600 +[epoch 17/50] step=1620 train_loss=0.0304 tok_s=60312.6 opt_steps=1620 +[epoch 17/50] step=1640 train_loss=0.0304 tok_s=60314.5 opt_steps=1640 +[epoch 17/50] step=1660 train_loss=0.0304 tok_s=60311.2 opt_steps=1660 +[epoch 17/50] step=1680 train_loss=0.0305 tok_s=60314.3 opt_steps=1680 +[epoch 17/50] step=1700 train_loss=0.0305 tok_s=60311.6 opt_steps=1700 +[epoch 17/50] step=1720 train_loss=0.0305 tok_s=60311.8 opt_steps=1720 +[epoch 17/50] step=1740 train_loss=0.0306 tok_s=60310.4 opt_steps=1740 +[epoch 17/50] step=1760 train_loss=0.0306 tok_s=60308.6 opt_steps=1760 +[epoch 17/50] step=1780 train_loss=0.0306 tok_s=60307.8 opt_steps=1780 +[epoch 17/50] step=1800 train_loss=0.0307 tok_s=60305.7 opt_steps=1800 +[epoch 17/50] step=1820 train_loss=0.0307 tok_s=60303.4 opt_steps=1820 +[epoch 17/50] step=1840 train_loss=0.0307 tok_s=60305.1 opt_steps=1840 +[epoch 17/50] step=1860 train_loss=0.0307 tok_s=60311.4 opt_steps=1860 +[epoch 17/50] step=1880 train_loss=0.0308 tok_s=60313.4 opt_steps=1880 +[epoch 17/50] step=1900 train_loss=0.0308 tok_s=60314.3 opt_steps=1900 +[epoch 17/50] step=1920 train_loss=0.0308 tok_s=60313.1 opt_steps=1920 +[epoch 17/50] step=1940 train_loss=0.0309 tok_s=60321.5 opt_steps=1940 +[epoch 17/50] step=1960 train_loss=0.0309 tok_s=60323.8 opt_steps=1960 +[epoch 17/50] step=1980 train_loss=0.0309 tok_s=60320.2 opt_steps=1980 +[epoch 17/50] step=2000 train_loss=0.0309 tok_s=60320.5 opt_steps=2000 +[epoch 17/50] step=2020 train_loss=0.0310 tok_s=60324.3 opt_steps=2020 +[epoch 17/50] step=2040 train_loss=0.0310 tok_s=60323.3 opt_steps=2040 +[epoch 17/50] step=2060 train_loss=0.0311 tok_s=60321.4 opt_steps=2060 +[epoch 17/50] step=2080 train_loss=0.0311 tok_s=60321.5 opt_steps=2080 +[epoch 17/50] step=2100 train_loss=0.0311 tok_s=60318.2 opt_steps=2100 +[epoch 17/50] step=2120 train_loss=0.0312 tok_s=60319.4 opt_steps=2120 +[epoch 17/50] step=2140 train_loss=0.0312 tok_s=60321.6 opt_steps=2140 +[epoch 17/50] step=2160 train_loss=0.0312 tok_s=60320.0 opt_steps=2160 +[epoch 17/50] step=2180 train_loss=0.0313 tok_s=60319.5 opt_steps=2180 +[epoch 17/50] step=2200 train_loss=0.0313 tok_s=60317.9 opt_steps=2200 +[epoch 17/50] step=2220 train_loss=0.0313 tok_s=60314.2 opt_steps=2220 +[epoch 17/50] step=2240 train_loss=0.0314 tok_s=60314.2 opt_steps=2240 +[epoch 17/50] step=2260 train_loss=0.0314 tok_s=60316.6 opt_steps=2260 +[epoch 17/50] step=2280 train_loss=0.0314 tok_s=60319.4 opt_steps=2280 +[epoch 17/50] step=2300 train_loss=0.0315 tok_s=60325.3 opt_steps=2300 +[epoch 17/50] step=2320 train_loss=0.0315 tok_s=60323.2 opt_steps=2320 +[epoch 17/50] step=2340 train_loss=0.0315 tok_s=60326.7 opt_steps=2340 +[epoch 17/50] step=2360 train_loss=0.0315 tok_s=60324.5 opt_steps=2360 +[epoch 17/50] step=2380 train_loss=0.0315 tok_s=60325.5 opt_steps=2380 +[epoch 17/50] step=2400 train_loss=0.0316 tok_s=60323.7 opt_steps=2400 +[epoch 17/50] step=2420 train_loss=0.0316 tok_s=60323.9 opt_steps=2420 +[epoch 17/50] step=2440 train_loss=0.0316 tok_s=60327.0 opt_steps=2440 +[epoch 17/50] step=2460 train_loss=0.0316 tok_s=60328.4 opt_steps=2460 +[epoch 17/50] step=2480 train_loss=0.0317 tok_s=60329.1 opt_steps=2480 +[epoch 17/50] step=2500 train_loss=0.0317 tok_s=60327.0 opt_steps=2500 +[epoch 17/50] step=2520 train_loss=0.0317 tok_s=60324.8 opt_steps=2520 +[epoch 17/50] step=2540 train_loss=0.0317 tok_s=60326.6 opt_steps=2540 +[epoch 17/50] step=2560 train_loss=0.0317 tok_s=60327.5 opt_steps=2560 +[epoch 17/50] step=2580 train_loss=0.0318 tok_s=60332.3 opt_steps=2580 +[epoch 17/50] step=2600 train_loss=0.0318 tok_s=60333.5 opt_steps=2600 +[epoch 17/50] step=2620 train_loss=0.0318 tok_s=60335.2 opt_steps=2620 +[epoch 17/50] step=2640 train_loss=0.0318 tok_s=60335.8 opt_steps=2640 +[epoch 17/50] step=2660 train_loss=0.0319 tok_s=60335.0 opt_steps=2660 +[epoch 17/50] step=2680 train_loss=0.0319 tok_s=60334.8 opt_steps=2680 +[epoch 17/50] step=2700 train_loss=0.0319 tok_s=60335.4 opt_steps=2700 +[epoch 17/50] step=2720 train_loss=0.0319 tok_s=60335.0 opt_steps=2720 +[epoch 17/50] step=2740 train_loss=0.0319 tok_s=60334.6 opt_steps=2740 +[epoch 17/50] step=2760 train_loss=0.0320 tok_s=60332.4 opt_steps=2760 +[epoch 17/50] step=2780 train_loss=0.0320 tok_s=60333.8 opt_steps=2780 +[epoch 17/50] step=2800 train_loss=0.0320 tok_s=60332.5 opt_steps=2800 +[epoch 17/50] step=2820 train_loss=0.0320 tok_s=60333.8 opt_steps=2820 +[epoch 17/50] step=2840 train_loss=0.0320 tok_s=60333.8 opt_steps=2840 +[epoch 17/50] step=2860 train_loss=0.0321 tok_s=60333.5 opt_steps=2860 +[epoch 17/50] step=2880 train_loss=0.0321 tok_s=60337.6 opt_steps=2880 +[epoch 17/50] step=2900 train_loss=0.0321 tok_s=60338.6 opt_steps=2900 +[epoch 17/50] step=2920 train_loss=0.0321 tok_s=60335.6 opt_steps=2920 +[epoch 17/50] step=2940 train_loss=0.0321 tok_s=60336.3 opt_steps=2940 +[epoch 17/50] step=2960 train_loss=0.0322 tok_s=60335.7 opt_steps=2960 +[epoch 17/50] step=2980 train_loss=0.0322 tok_s=60336.2 opt_steps=2980 +[epoch 17/50] step=3000 train_loss=0.0322 tok_s=60332.0 opt_steps=3000 +[epoch 17/50] step=3020 train_loss=0.0322 tok_s=60327.9 opt_steps=3020 +[epoch 17/50] step=3040 train_loss=0.0322 tok_s=60328.9 opt_steps=3040 +[epoch 17/50] step=3060 train_loss=0.0323 tok_s=60326.3 opt_steps=3060 +[epoch 17/50] step=3080 train_loss=0.0323 tok_s=60323.9 opt_steps=3080 +[epoch 17/50] step=3100 train_loss=0.0323 tok_s=60320.8 opt_steps=3100 +[epoch 17/50] step=3120 train_loss=0.0323 tok_s=60318.2 opt_steps=3120 +[epoch 17/50] step=3140 train_loss=0.0324 tok_s=60317.3 opt_steps=3140 +[epoch 17/50] step=3160 train_loss=0.0324 tok_s=60315.3 opt_steps=3160 +[epoch 17/50] step=3180 train_loss=0.0324 tok_s=60313.5 opt_steps=3180 +[epoch 17/50] step=3200 train_loss=0.0324 tok_s=60312.1 opt_steps=3200 +[epoch 17/50] step=3220 train_loss=0.0324 tok_s=60311.6 opt_steps=3220 +[epoch 17/50] step=3240 train_loss=0.0325 tok_s=60313.8 opt_steps=3240 +[epoch 17/50] step=3260 train_loss=0.0325 tok_s=60308.8 opt_steps=3260 +[epoch 17/50] train_loss=0.0325 val_skipped tok_s=60308.9 opt_steps=3273 +[epoch 18/50] step=20 train_loss=0.0262 tok_s=58170.6 opt_steps=20 +[epoch 18/50] step=40 train_loss=0.0253 tok_s=59197.4 opt_steps=40 +[epoch 18/50] step=60 train_loss=0.0248 tok_s=59593.5 opt_steps=60 +[epoch 18/50] step=80 train_loss=0.0252 tok_s=59845.2 opt_steps=80 +[epoch 18/50] step=100 train_loss=0.0250 tok_s=59954.6 opt_steps=100 +[epoch 18/50] step=120 train_loss=0.0250 tok_s=60035.7 opt_steps=120 +[epoch 18/50] step=140 train_loss=0.0250 tok_s=60095.9 opt_steps=140 +[epoch 18/50] step=160 train_loss=0.0250 tok_s=60172.6 opt_steps=160 +[epoch 18/50] step=180 train_loss=0.0251 tok_s=60145.5 opt_steps=180 +[epoch 18/50] step=200 train_loss=0.0252 tok_s=60210.6 opt_steps=200 +[epoch 18/50] step=220 train_loss=0.0253 tok_s=60201.4 opt_steps=220 +[epoch 18/50] step=240 train_loss=0.0253 tok_s=60199.2 opt_steps=240 +[epoch 18/50] step=260 train_loss=0.0253 tok_s=60227.0 opt_steps=260 +[epoch 18/50] step=280 train_loss=0.0253 tok_s=60238.9 opt_steps=280 +[epoch 18/50] step=300 train_loss=0.0254 tok_s=60236.5 opt_steps=300 +[epoch 18/50] step=320 train_loss=0.0254 tok_s=60258.4 opt_steps=320 +[epoch 18/50] step=340 train_loss=0.0254 tok_s=60282.6 opt_steps=340 +[epoch 18/50] step=360 train_loss=0.0255 tok_s=60291.1 opt_steps=360 +[epoch 18/50] step=380 train_loss=0.0255 tok_s=60293.6 opt_steps=380 +[epoch 18/50] step=400 train_loss=0.0256 tok_s=60286.1 opt_steps=400 +[epoch 18/50] step=420 train_loss=0.0257 tok_s=60293.6 opt_steps=420 +[epoch 18/50] step=440 train_loss=0.0257 tok_s=60315.2 opt_steps=440 +[epoch 18/50] step=460 train_loss=0.0258 tok_s=60323.4 opt_steps=460 +[epoch 18/50] step=480 train_loss=0.0258 tok_s=60305.0 opt_steps=480 +[epoch 18/50] step=500 train_loss=0.0259 tok_s=60313.7 opt_steps=500 +[epoch 18/50] step=520 train_loss=0.0259 tok_s=60310.2 opt_steps=520 +[epoch 18/50] step=540 train_loss=0.0260 tok_s=60307.3 opt_steps=540 +[epoch 18/50] step=560 train_loss=0.0260 tok_s=60298.0 opt_steps=560 +[epoch 18/50] step=580 train_loss=0.0261 tok_s=60306.6 opt_steps=580 +[epoch 18/50] step=600 train_loss=0.0262 tok_s=60302.2 opt_steps=600 +[epoch 18/50] step=620 train_loss=0.0262 tok_s=60288.1 opt_steps=620 +[epoch 18/50] step=640 train_loss=0.0262 tok_s=60300.1 opt_steps=640 +[epoch 18/50] step=660 train_loss=0.0263 tok_s=60307.8 opt_steps=660 +[epoch 18/50] step=680 train_loss=0.0263 tok_s=60300.7 opt_steps=680 +[epoch 18/50] step=700 train_loss=0.0263 tok_s=60286.2 opt_steps=700 +[epoch 18/50] step=720 train_loss=0.0264 tok_s=60286.5 opt_steps=720 +[epoch 18/50] step=740 train_loss=0.0264 tok_s=60291.5 opt_steps=740 +[epoch 18/50] step=760 train_loss=0.0264 tok_s=60303.8 opt_steps=760 +[epoch 18/50] step=780 train_loss=0.0265 tok_s=60309.1 opt_steps=780 +[epoch 18/50] step=800 train_loss=0.0265 tok_s=60310.6 opt_steps=800 +[epoch 18/50] step=820 train_loss=0.0266 tok_s=60298.5 opt_steps=820 +[epoch 18/50] step=840 train_loss=0.0267 tok_s=60292.3 opt_steps=840 +[epoch 18/50] step=860 train_loss=0.0267 tok_s=60290.5 opt_steps=860 +[epoch 18/50] step=880 train_loss=0.0268 tok_s=60290.8 opt_steps=880 +[epoch 18/50] step=900 train_loss=0.0268 tok_s=60288.0 opt_steps=900 +[epoch 18/50] step=920 train_loss=0.0269 tok_s=60288.8 opt_steps=920 +[epoch 18/50] step=940 train_loss=0.0269 tok_s=60295.9 opt_steps=940 +[epoch 18/50] step=960 train_loss=0.0270 tok_s=60294.5 opt_steps=960 +[epoch 18/50] step=980 train_loss=0.0270 tok_s=60294.9 opt_steps=980 +[epoch 18/50] step=1000 train_loss=0.0271 tok_s=60297.1 opt_steps=1000 +[epoch 18/50] step=1020 train_loss=0.0272 tok_s=60300.1 opt_steps=1020 +[epoch 18/50] step=1040 train_loss=0.0272 tok_s=60301.8 opt_steps=1040 +[epoch 18/50] step=1060 train_loss=0.0273 tok_s=60297.3 opt_steps=1060 +[epoch 18/50] step=1080 train_loss=0.0273 tok_s=60284.7 opt_steps=1080 +[epoch 18/50] step=1100 train_loss=0.0273 tok_s=60281.5 opt_steps=1100 +[epoch 18/50] step=1120 train_loss=0.0274 tok_s=60269.3 opt_steps=1120 +[epoch 18/50] step=1140 train_loss=0.0274 tok_s=60262.8 opt_steps=1140 +[epoch 18/50] step=1160 train_loss=0.0275 tok_s=60254.8 opt_steps=1160 +[epoch 18/50] step=1180 train_loss=0.0275 tok_s=60260.8 opt_steps=1180 +[epoch 18/50] step=1200 train_loss=0.0276 tok_s=60264.4 opt_steps=1200 +[epoch 18/50] step=1220 train_loss=0.0276 tok_s=60264.8 opt_steps=1220 +[epoch 18/50] step=1240 train_loss=0.0276 tok_s=60271.0 opt_steps=1240 +[epoch 18/50] step=1260 train_loss=0.0277 tok_s=60271.8 opt_steps=1260 +[epoch 18/50] step=1280 train_loss=0.0277 tok_s=60275.9 opt_steps=1280 +[epoch 18/50] step=1300 train_loss=0.0278 tok_s=60281.1 opt_steps=1300 +[epoch 18/50] step=1320 train_loss=0.0278 tok_s=60283.5 opt_steps=1320 +[epoch 18/50] step=1340 train_loss=0.0278 tok_s=60287.7 opt_steps=1340 +[epoch 18/50] step=1360 train_loss=0.0279 tok_s=60292.5 opt_steps=1360 +[epoch 18/50] step=1380 train_loss=0.0279 tok_s=60288.0 opt_steps=1380 +[epoch 18/50] step=1400 train_loss=0.0279 tok_s=60280.8 opt_steps=1400 +[epoch 18/50] step=1420 train_loss=0.0280 tok_s=60289.3 opt_steps=1420 +[epoch 18/50] step=1440 train_loss=0.0280 tok_s=60285.7 opt_steps=1440 +[epoch 18/50] step=1460 train_loss=0.0281 tok_s=60286.4 opt_steps=1460 +[epoch 18/50] step=1480 train_loss=0.0281 tok_s=60284.3 opt_steps=1480 +[epoch 18/50] step=1500 train_loss=0.0282 tok_s=60280.8 opt_steps=1500 +[epoch 18/50] step=1520 train_loss=0.0282 tok_s=60283.8 opt_steps=1520 +[epoch 18/50] step=1540 train_loss=0.0282 tok_s=60281.6 opt_steps=1540 +[epoch 18/50] step=1560 train_loss=0.0283 tok_s=60284.3 opt_steps=1560 +[epoch 18/50] step=1580 train_loss=0.0283 tok_s=60283.6 opt_steps=1580 +[epoch 18/50] step=1600 train_loss=0.0283 tok_s=60287.4 opt_steps=1600 +[epoch 18/50] step=1620 train_loss=0.0284 tok_s=60286.2 opt_steps=1620 +[epoch 18/50] step=1640 train_loss=0.0284 tok_s=60293.2 opt_steps=1640 +[epoch 18/50] step=1660 train_loss=0.0285 tok_s=60294.0 opt_steps=1660 +[epoch 18/50] step=1680 train_loss=0.0285 tok_s=60296.1 opt_steps=1680 +[epoch 18/50] step=1700 train_loss=0.0285 tok_s=60293.8 opt_steps=1700 +[epoch 18/50] step=1720 train_loss=0.0285 tok_s=60289.8 opt_steps=1720 +[epoch 18/50] step=1740 train_loss=0.0286 tok_s=60290.3 opt_steps=1740 +[epoch 18/50] step=1760 train_loss=0.0286 tok_s=60293.0 opt_steps=1760 +[epoch 18/50] step=1780 train_loss=0.0286 tok_s=60289.7 opt_steps=1780 +[epoch 18/50] step=1800 train_loss=0.0286 tok_s=60287.1 opt_steps=1800 +[epoch 18/50] step=1820 train_loss=0.0287 tok_s=60288.9 opt_steps=1820 +[epoch 18/50] step=1840 train_loss=0.0287 tok_s=60291.9 opt_steps=1840 +[epoch 18/50] step=1860 train_loss=0.0287 tok_s=60288.4 opt_steps=1860 +[epoch 18/50] step=1880 train_loss=0.0287 tok_s=60293.0 opt_steps=1880 +[epoch 18/50] step=1900 train_loss=0.0288 tok_s=60297.5 opt_steps=1900 +[epoch 18/50] step=1920 train_loss=0.0288 tok_s=60296.3 opt_steps=1920 +[epoch 18/50] step=1940 train_loss=0.0289 tok_s=60295.7 opt_steps=1940 +[epoch 18/50] step=1960 train_loss=0.0289 tok_s=60295.0 opt_steps=1960 +[epoch 18/50] step=1980 train_loss=0.0289 tok_s=60293.1 opt_steps=1980 +[epoch 18/50] step=2000 train_loss=0.0290 tok_s=60293.5 opt_steps=2000 +[epoch 18/50] step=2020 train_loss=0.0290 tok_s=60290.9 opt_steps=2020 +[epoch 18/50] step=2040 train_loss=0.0290 tok_s=60290.8 opt_steps=2040 +[epoch 18/50] step=2060 train_loss=0.0290 tok_s=60287.8 opt_steps=2060 +[epoch 18/50] step=2080 train_loss=0.0291 tok_s=60289.8 opt_steps=2080 +[epoch 18/50] step=2100 train_loss=0.0291 tok_s=60291.6 opt_steps=2100 +[epoch 18/50] step=2120 train_loss=0.0291 tok_s=60291.6 opt_steps=2120 +[epoch 18/50] step=2140 train_loss=0.0292 tok_s=60289.7 opt_steps=2140 +[epoch 18/50] step=2160 train_loss=0.0292 tok_s=60289.0 opt_steps=2160 +[epoch 18/50] step=2180 train_loss=0.0292 tok_s=60282.8 opt_steps=2180 +[epoch 18/50] step=2200 train_loss=0.0292 tok_s=60283.2 opt_steps=2200 +[epoch 18/50] step=2220 train_loss=0.0293 tok_s=60278.0 opt_steps=2220 +[epoch 18/50] step=2240 train_loss=0.0293 tok_s=60270.8 opt_steps=2240 +[epoch 18/50] step=2260 train_loss=0.0293 tok_s=60264.2 opt_steps=2260 +[epoch 18/50] step=2280 train_loss=0.0294 tok_s=60264.6 opt_steps=2280 +[epoch 18/50] step=2300 train_loss=0.0294 tok_s=60263.3 opt_steps=2300 +[epoch 18/50] step=2320 train_loss=0.0294 tok_s=60260.9 opt_steps=2320 +[epoch 18/50] step=2340 train_loss=0.0294 tok_s=60268.0 opt_steps=2340 +[epoch 18/50] step=2360 train_loss=0.0295 tok_s=60266.4 opt_steps=2360 +[epoch 18/50] step=2380 train_loss=0.0295 tok_s=60266.9 opt_steps=2380 +[epoch 18/50] step=2400 train_loss=0.0295 tok_s=60265.6 opt_steps=2400 +[epoch 18/50] step=2420 train_loss=0.0295 tok_s=60262.4 opt_steps=2420 +[epoch 18/50] step=2440 train_loss=0.0295 tok_s=60263.8 opt_steps=2440 +[epoch 18/50] step=2460 train_loss=0.0296 tok_s=60258.4 opt_steps=2460 +[epoch 18/50] step=2480 train_loss=0.0296 tok_s=60248.3 opt_steps=2480 +[epoch 18/50] step=2500 train_loss=0.0296 tok_s=60245.2 opt_steps=2500 +[epoch 18/50] step=2520 train_loss=0.0296 tok_s=60246.7 opt_steps=2520 +[epoch 18/50] step=2540 train_loss=0.0296 tok_s=60246.0 opt_steps=2540 +[epoch 18/50] step=2560 train_loss=0.0297 tok_s=60243.7 opt_steps=2560 +[epoch 18/50] step=2580 train_loss=0.0297 tok_s=60241.2 opt_steps=2580 +[epoch 18/50] step=2600 train_loss=0.0297 tok_s=60233.7 opt_steps=2600 +[epoch 18/50] step=2620 train_loss=0.0297 tok_s=60227.4 opt_steps=2620 +[epoch 18/50] step=2640 train_loss=0.0297 tok_s=60230.9 opt_steps=2640 +[epoch 18/50] step=2660 train_loss=0.0298 tok_s=60233.9 opt_steps=2660 +[epoch 18/50] step=2680 train_loss=0.0298 tok_s=60234.6 opt_steps=2680 +[epoch 18/50] step=2700 train_loss=0.0298 tok_s=60238.7 opt_steps=2700 +[epoch 18/50] step=2720 train_loss=0.0298 tok_s=60237.4 opt_steps=2720 +[epoch 18/50] step=2740 train_loss=0.0299 tok_s=60236.3 opt_steps=2740 +[epoch 18/50] step=2760 train_loss=0.0299 tok_s=60238.6 opt_steps=2760 +[epoch 18/50] step=2780 train_loss=0.0299 tok_s=60237.4 opt_steps=2780 +[epoch 18/50] step=2800 train_loss=0.0299 tok_s=60238.0 opt_steps=2800 +[epoch 18/50] step=2820 train_loss=0.0299 tok_s=60240.6 opt_steps=2820 +[epoch 18/50] step=2840 train_loss=0.0300 tok_s=60244.5 opt_steps=2840 +[epoch 18/50] step=2860 train_loss=0.0300 tok_s=60245.3 opt_steps=2860 +[epoch 18/50] step=2880 train_loss=0.0300 tok_s=60244.8 opt_steps=2880 +[epoch 18/50] step=2900 train_loss=0.0300 tok_s=60243.4 opt_steps=2900 +[epoch 18/50] step=2920 train_loss=0.0301 tok_s=60244.4 opt_steps=2920 +[epoch 18/50] step=2940 train_loss=0.0301 tok_s=60245.4 opt_steps=2940 +[epoch 18/50] step=2960 train_loss=0.0301 tok_s=60245.6 opt_steps=2960 +[epoch 18/50] step=2980 train_loss=0.0301 tok_s=60248.9 opt_steps=2980 +[epoch 18/50] step=3000 train_loss=0.0301 tok_s=60249.2 opt_steps=3000 +[epoch 18/50] step=3020 train_loss=0.0301 tok_s=60251.0 opt_steps=3020 +[epoch 18/50] step=3040 train_loss=0.0302 tok_s=60254.1 opt_steps=3040 +[epoch 18/50] step=3060 train_loss=0.0302 tok_s=60255.3 opt_steps=3060 +[epoch 18/50] step=3080 train_loss=0.0302 tok_s=60256.6 opt_steps=3080 +[epoch 18/50] step=3100 train_loss=0.0302 tok_s=60255.5 opt_steps=3100 +[epoch 18/50] step=3120 train_loss=0.0302 tok_s=60254.5 opt_steps=3120 +[epoch 18/50] step=3140 train_loss=0.0302 tok_s=60254.8 opt_steps=3140 +[epoch 18/50] step=3160 train_loss=0.0303 tok_s=60256.1 opt_steps=3160 +[epoch 18/50] step=3180 train_loss=0.0303 tok_s=60256.2 opt_steps=3180 +[epoch 18/50] step=3200 train_loss=0.0303 tok_s=60254.7 opt_steps=3200 +[epoch 18/50] step=3220 train_loss=0.0303 tok_s=60253.8 opt_steps=3220 +[epoch 18/50] step=3240 train_loss=0.0303 tok_s=60254.0 opt_steps=3240 +[epoch 18/50] step=3260 train_loss=0.0304 tok_s=60251.6 opt_steps=3260 +[epoch 18/50] train_loss=0.0304 val_skipped tok_s=60251.2 opt_steps=3273 +[epoch 19/50] step=20 train_loss=0.0231 tok_s=58017.6 opt_steps=20 +[epoch 19/50] step=40 train_loss=0.0236 tok_s=58738.2 opt_steps=40 +[epoch 19/50] step=60 train_loss=0.0235 tok_s=59144.7 opt_steps=60 +[epoch 19/50] step=80 train_loss=0.0237 tok_s=59396.7 opt_steps=80 +[epoch 19/50] step=100 train_loss=0.0239 tok_s=59596.3 opt_steps=100 +[epoch 19/50] step=120 train_loss=0.0238 tok_s=59689.5 opt_steps=120 +[epoch 19/50] step=140 train_loss=0.0237 tok_s=59812.0 opt_steps=140 +[epoch 19/50] step=160 train_loss=0.0237 tok_s=59848.4 opt_steps=160 +[epoch 19/50] step=180 train_loss=0.0237 tok_s=59909.7 opt_steps=180 +[epoch 19/50] step=200 train_loss=0.0236 tok_s=59916.1 opt_steps=200 +[epoch 19/50] step=220 train_loss=0.0237 tok_s=59927.1 opt_steps=220 +[epoch 19/50] step=240 train_loss=0.0238 tok_s=59968.6 opt_steps=240 +[epoch 19/50] step=260 train_loss=0.0239 tok_s=59986.2 opt_steps=260 +[epoch 19/50] step=280 train_loss=0.0239 tok_s=60012.8 opt_steps=280 +[epoch 19/50] step=300 train_loss=0.0240 tok_s=60049.4 opt_steps=300 +[epoch 19/50] step=320 train_loss=0.0240 tok_s=60069.5 opt_steps=320 +[epoch 19/50] step=340 train_loss=0.0241 tok_s=60079.9 opt_steps=340 +[epoch 19/50] step=360 train_loss=0.0241 tok_s=60065.8 opt_steps=360 +[epoch 19/50] step=380 train_loss=0.0242 tok_s=60066.6 opt_steps=380 +[epoch 19/50] step=400 train_loss=0.0242 tok_s=60083.9 opt_steps=400 +[epoch 19/50] step=420 train_loss=0.0242 tok_s=60109.5 opt_steps=420 +[epoch 19/50] step=440 train_loss=0.0242 tok_s=60101.9 opt_steps=440 +[epoch 19/50] step=460 train_loss=0.0242 tok_s=60087.0 opt_steps=460 +[epoch 19/50] step=480 train_loss=0.0243 tok_s=60101.3 opt_steps=480 +[epoch 19/50] step=500 train_loss=0.0243 tok_s=60109.1 opt_steps=500 +[epoch 19/50] step=520 train_loss=0.0243 tok_s=60119.5 opt_steps=520 +[epoch 19/50] step=540 train_loss=0.0243 tok_s=60127.1 opt_steps=540 +[epoch 19/50] step=560 train_loss=0.0243 tok_s=60158.0 opt_steps=560 +[epoch 19/50] step=580 train_loss=0.0244 tok_s=60169.9 opt_steps=580 +[epoch 19/50] step=600 train_loss=0.0244 tok_s=60163.9 opt_steps=600 +[epoch 19/50] step=620 train_loss=0.0244 tok_s=60157.5 opt_steps=620 +[epoch 19/50] step=640 train_loss=0.0245 tok_s=60162.1 opt_steps=640 +[epoch 19/50] step=660 train_loss=0.0246 tok_s=60161.8 opt_steps=660 +[epoch 19/50] step=680 train_loss=0.0246 tok_s=60176.1 opt_steps=680 +[epoch 19/50] step=700 train_loss=0.0247 tok_s=60187.4 opt_steps=700 +[epoch 19/50] step=720 train_loss=0.0247 tok_s=60194.7 opt_steps=720 +[epoch 19/50] step=740 train_loss=0.0248 tok_s=60190.7 opt_steps=740 +[epoch 19/50] step=760 train_loss=0.0248 tok_s=60191.0 opt_steps=760 +[epoch 19/50] step=780 train_loss=0.0248 tok_s=60189.5 opt_steps=780 +[epoch 19/50] step=800 train_loss=0.0249 tok_s=60196.3 opt_steps=800 +[epoch 19/50] step=820 train_loss=0.0249 tok_s=60196.0 opt_steps=820 +[epoch 19/50] step=840 train_loss=0.0249 tok_s=60181.9 opt_steps=840 +[epoch 19/50] step=860 train_loss=0.0250 tok_s=60188.9 opt_steps=860 +[epoch 19/50] step=880 train_loss=0.0250 tok_s=60188.5 opt_steps=880 +[epoch 19/50] step=900 train_loss=0.0251 tok_s=60197.5 opt_steps=900 +[epoch 19/50] step=920 train_loss=0.0251 tok_s=60201.8 opt_steps=920 +[epoch 19/50] step=940 train_loss=0.0252 tok_s=60212.5 opt_steps=940 +[epoch 19/50] step=960 train_loss=0.0252 tok_s=60212.8 opt_steps=960 +[epoch 19/50] step=980 train_loss=0.0252 tok_s=60221.0 opt_steps=980 +[epoch 19/50] step=1000 train_loss=0.0253 tok_s=60238.3 opt_steps=1000 +[epoch 19/50] step=1020 train_loss=0.0253 tok_s=60237.6 opt_steps=1020 +[epoch 19/50] step=1040 train_loss=0.0253 tok_s=60242.3 opt_steps=1040 +[epoch 19/50] step=1060 train_loss=0.0253 tok_s=60244.9 opt_steps=1060 +[epoch 19/50] step=1080 train_loss=0.0254 tok_s=60247.2 opt_steps=1080 +[epoch 19/50] step=1100 train_loss=0.0255 tok_s=60249.3 opt_steps=1100 +[epoch 19/50] step=1120 train_loss=0.0255 tok_s=60241.9 opt_steps=1120 +[epoch 19/50] step=1140 train_loss=0.0256 tok_s=60251.4 opt_steps=1140 +[epoch 19/50] step=1160 train_loss=0.0256 tok_s=60249.3 opt_steps=1160 +[epoch 19/50] step=1180 train_loss=0.0257 tok_s=60251.0 opt_steps=1180 +[epoch 19/50] step=1200 train_loss=0.0257 tok_s=60252.7 opt_steps=1200 +[epoch 19/50] step=1220 train_loss=0.0257 tok_s=60253.5 opt_steps=1220 +[epoch 19/50] step=1240 train_loss=0.0258 tok_s=60233.8 opt_steps=1240 +[epoch 19/50] step=1260 train_loss=0.0258 tok_s=60194.0 opt_steps=1260 +[epoch 19/50] step=1280 train_loss=0.0258 tok_s=60155.4 opt_steps=1280 +[epoch 19/50] step=1300 train_loss=0.0259 tok_s=60138.6 opt_steps=1300 +[epoch 19/50] step=1320 train_loss=0.0259 tok_s=60108.3 opt_steps=1320 +[epoch 19/50] step=1340 train_loss=0.0259 tok_s=60096.6 opt_steps=1340 +[epoch 19/50] step=1360 train_loss=0.0260 tok_s=60078.6 opt_steps=1360 +[epoch 19/50] step=1380 train_loss=0.0260 tok_s=60065.6 opt_steps=1380 +[epoch 19/50] step=1400 train_loss=0.0261 tok_s=60059.8 opt_steps=1400 +[epoch 19/50] step=1420 train_loss=0.0261 tok_s=60038.4 opt_steps=1420 +[epoch 19/50] step=1440 train_loss=0.0262 tok_s=60014.5 opt_steps=1440 +[epoch 19/50] step=1460 train_loss=0.0262 tok_s=59999.6 opt_steps=1460 +[epoch 19/50] step=1480 train_loss=0.0262 tok_s=59962.9 opt_steps=1480 +[epoch 19/50] step=1500 train_loss=0.0262 tok_s=59956.7 opt_steps=1500 +[epoch 19/50] step=1520 train_loss=0.0263 tok_s=59959.1 opt_steps=1520 +[epoch 19/50] step=1540 train_loss=0.0263 tok_s=59962.1 opt_steps=1540 +[epoch 19/50] step=1560 train_loss=0.0263 tok_s=59966.1 opt_steps=1560 +[epoch 19/50] step=1580 train_loss=0.0264 tok_s=59965.7 opt_steps=1580 +[epoch 19/50] step=1600 train_loss=0.0264 tok_s=59969.4 opt_steps=1600 +[epoch 19/50] step=1620 train_loss=0.0264 tok_s=59973.9 opt_steps=1620 +[epoch 19/50] step=1640 train_loss=0.0265 tok_s=59976.6 opt_steps=1640 +[epoch 19/50] step=1660 train_loss=0.0265 tok_s=59980.1 opt_steps=1660 +[epoch 19/50] step=1680 train_loss=0.0265 tok_s=59976.0 opt_steps=1680 +[epoch 19/50] step=1700 train_loss=0.0265 tok_s=59976.3 opt_steps=1700 +[epoch 19/50] step=1720 train_loss=0.0266 tok_s=59981.6 opt_steps=1720 +[epoch 19/50] step=1740 train_loss=0.0266 tok_s=59971.9 opt_steps=1740 +[epoch 19/50] step=1760 train_loss=0.0266 tok_s=59972.8 opt_steps=1760 +[epoch 19/50] step=1780 train_loss=0.0267 tok_s=59965.4 opt_steps=1780 +[epoch 19/50] step=1800 train_loss=0.0267 tok_s=59959.5 opt_steps=1800 +[epoch 19/50] step=1820 train_loss=0.0267 tok_s=59950.8 opt_steps=1820 +[epoch 19/50] step=1840 train_loss=0.0267 tok_s=59950.8 opt_steps=1840 +[epoch 19/50] step=1860 train_loss=0.0268 tok_s=59949.5 opt_steps=1860 +[epoch 19/50] step=1880 train_loss=0.0268 tok_s=59941.4 opt_steps=1880 +[epoch 19/50] step=1900 train_loss=0.0268 tok_s=59942.9 opt_steps=1900 +[epoch 19/50] step=1920 train_loss=0.0269 tok_s=59947.0 opt_steps=1920 +[epoch 19/50] step=1940 train_loss=0.0269 tok_s=59951.0 opt_steps=1940 +[epoch 19/50] step=1960 train_loss=0.0269 tok_s=59953.3 opt_steps=1960 +[epoch 19/50] step=1980 train_loss=0.0269 tok_s=59951.0 opt_steps=1980 +[epoch 19/50] step=2000 train_loss=0.0270 tok_s=59953.9 opt_steps=2000 +[epoch 19/50] step=2020 train_loss=0.0270 tok_s=59957.0 opt_steps=2020 +[epoch 19/50] step=2040 train_loss=0.0270 tok_s=59959.2 opt_steps=2040 +[epoch 19/50] step=2060 train_loss=0.0271 tok_s=59965.0 opt_steps=2060 +[epoch 19/50] step=2080 train_loss=0.0271 tok_s=59966.7 opt_steps=2080 +[epoch 19/50] step=2100 train_loss=0.0271 tok_s=59964.1 opt_steps=2100 +[epoch 19/50] step=2120 train_loss=0.0271 tok_s=59963.5 opt_steps=2120 +[epoch 19/50] step=2140 train_loss=0.0271 tok_s=59967.2 opt_steps=2140 +[epoch 19/50] step=2160 train_loss=0.0272 tok_s=59965.6 opt_steps=2160 +[epoch 19/50] step=2180 train_loss=0.0272 tok_s=59968.6 opt_steps=2180 +[epoch 19/50] step=2200 train_loss=0.0272 tok_s=59973.2 opt_steps=2200 +[epoch 19/50] step=2220 train_loss=0.0272 tok_s=59979.1 opt_steps=2220 +[epoch 19/50] step=2240 train_loss=0.0273 tok_s=59980.7 opt_steps=2240 +[epoch 19/50] step=2260 train_loss=0.0273 tok_s=59983.3 opt_steps=2260 +[epoch 19/50] step=2280 train_loss=0.0273 tok_s=59984.1 opt_steps=2280 +[epoch 19/50] step=2300 train_loss=0.0274 tok_s=59986.1 opt_steps=2300 +[epoch 19/50] step=2320 train_loss=0.0274 tok_s=59990.6 opt_steps=2320 +[epoch 19/50] step=2340 train_loss=0.0274 tok_s=59991.0 opt_steps=2340 +[epoch 19/50] step=2360 train_loss=0.0274 tok_s=59995.5 opt_steps=2360 +[epoch 19/50] step=2380 train_loss=0.0274 tok_s=59997.6 opt_steps=2380 +[epoch 19/50] step=2400 train_loss=0.0275 tok_s=59997.4 opt_steps=2400 +[epoch 19/50] step=2420 train_loss=0.0275 tok_s=60003.8 opt_steps=2420 +[epoch 19/50] step=2440 train_loss=0.0275 tok_s=60008.2 opt_steps=2440 +[epoch 19/50] step=2460 train_loss=0.0276 tok_s=60010.6 opt_steps=2460 +[epoch 19/50] step=2480 train_loss=0.0276 tok_s=60013.0 opt_steps=2480 +[epoch 19/50] step=2500 train_loss=0.0276 tok_s=60016.4 opt_steps=2500 +[epoch 19/50] step=2520 train_loss=0.0276 tok_s=60017.0 opt_steps=2520 +[epoch 19/50] step=2540 train_loss=0.0276 tok_s=60019.6 opt_steps=2540 +[epoch 19/50] step=2560 train_loss=0.0277 tok_s=60022.4 opt_steps=2560 +[epoch 19/50] step=2580 train_loss=0.0277 tok_s=60023.1 opt_steps=2580 +[epoch 19/50] step=2600 train_loss=0.0277 tok_s=60027.8 opt_steps=2600 +[epoch 19/50] step=2620 train_loss=0.0277 tok_s=60029.4 opt_steps=2620 +[epoch 19/50] step=2640 train_loss=0.0278 tok_s=60034.1 opt_steps=2640 +[epoch 19/50] step=2660 train_loss=0.0278 tok_s=60034.1 opt_steps=2660 +[epoch 19/50] step=2680 train_loss=0.0278 tok_s=60035.5 opt_steps=2680 +[epoch 19/50] step=2700 train_loss=0.0278 tok_s=60035.7 opt_steps=2700 +[epoch 19/50] step=2720 train_loss=0.0278 tok_s=60034.9 opt_steps=2720 +[epoch 19/50] step=2740 train_loss=0.0279 tok_s=60034.6 opt_steps=2740 +[epoch 19/50] step=2760 train_loss=0.0279 tok_s=60036.7 opt_steps=2760 +[epoch 19/50] step=2780 train_loss=0.0279 tok_s=60035.2 opt_steps=2780 +[epoch 19/50] step=2800 train_loss=0.0279 tok_s=60037.5 opt_steps=2800 +[epoch 19/50] step=2820 train_loss=0.0279 tok_s=60040.6 opt_steps=2820 +[epoch 19/50] step=2840 train_loss=0.0280 tok_s=60041.7 opt_steps=2840 +[epoch 19/50] step=2860 train_loss=0.0280 tok_s=60043.6 opt_steps=2860 +[epoch 19/50] step=2880 train_loss=0.0280 tok_s=60048.6 opt_steps=2880 +[epoch 19/50] step=2900 train_loss=0.0280 tok_s=60050.7 opt_steps=2900 +[epoch 19/50] step=2920 train_loss=0.0281 tok_s=60049.7 opt_steps=2920 +[epoch 19/50] step=2940 train_loss=0.0281 tok_s=60052.0 opt_steps=2940 +[epoch 19/50] step=2960 train_loss=0.0281 tok_s=60054.0 opt_steps=2960 +[epoch 19/50] step=2980 train_loss=0.0281 tok_s=60054.7 opt_steps=2980 +[epoch 19/50] step=3000 train_loss=0.0281 tok_s=60052.0 opt_steps=3000 +[epoch 19/50] step=3020 train_loss=0.0282 tok_s=60052.5 opt_steps=3020 +[epoch 19/50] step=3040 train_loss=0.0282 tok_s=60054.4 opt_steps=3040 +[epoch 19/50] step=3060 train_loss=0.0282 tok_s=60055.5 opt_steps=3060 +[epoch 19/50] step=3080 train_loss=0.0282 tok_s=60057.8 opt_steps=3080 +[epoch 19/50] step=3100 train_loss=0.0282 tok_s=60056.7 opt_steps=3100 +[epoch 19/50] step=3120 train_loss=0.0283 tok_s=60059.4 opt_steps=3120 +[epoch 19/50] step=3140 train_loss=0.0283 tok_s=60061.6 opt_steps=3140 +[epoch 19/50] step=3160 train_loss=0.0283 tok_s=60063.5 opt_steps=3160 +[epoch 19/50] step=3180 train_loss=0.0283 tok_s=60063.1 opt_steps=3180 +[epoch 19/50] step=3200 train_loss=0.0283 tok_s=60059.9 opt_steps=3200 +[epoch 19/50] step=3220 train_loss=0.0283 tok_s=60059.3 opt_steps=3220 +[epoch 19/50] step=3240 train_loss=0.0284 tok_s=60063.0 opt_steps=3240 +[epoch 19/50] step=3260 train_loss=0.0284 tok_s=60061.5 opt_steps=3260 +[epoch 19/50] train_loss=0.0284 val_skipped tok_s=60062.1 opt_steps=3273 +[epoch 20/50] step=20 train_loss=0.0216 tok_s=58417.9 opt_steps=20 +[epoch 20/50] step=40 train_loss=0.0215 tok_s=59241.3 opt_steps=40 +[epoch 20/50] step=60 train_loss=0.0217 tok_s=59699.5 opt_steps=60 +[epoch 20/50] step=80 train_loss=0.0219 tok_s=59900.0 opt_steps=80 +[epoch 20/50] step=100 train_loss=0.0220 tok_s=60001.1 opt_steps=100 +[epoch 20/50] step=120 train_loss=0.0220 tok_s=60047.1 opt_steps=120 +[epoch 20/50] step=140 train_loss=0.0221 tok_s=60069.3 opt_steps=140 +[epoch 20/50] step=160 train_loss=0.0221 tok_s=60059.6 opt_steps=160 +[epoch 20/50] step=180 train_loss=0.0221 tok_s=60058.9 opt_steps=180 +[epoch 20/50] step=200 train_loss=0.0220 tok_s=60050.6 opt_steps=200 +[epoch 20/50] step=220 train_loss=0.0221 tok_s=60076.4 opt_steps=220 +[epoch 20/50] step=240 train_loss=0.0222 tok_s=60086.7 opt_steps=240 +[epoch 20/50] step=260 train_loss=0.0223 tok_s=60093.3 opt_steps=260 +[epoch 20/50] step=280 train_loss=0.0223 tok_s=60138.7 opt_steps=280 +[epoch 20/50] step=300 train_loss=0.0223 tok_s=60182.3 opt_steps=300 +[epoch 20/50] step=320 train_loss=0.0224 tok_s=60182.4 opt_steps=320 +[epoch 20/50] step=340 train_loss=0.0224 tok_s=60168.6 opt_steps=340 +[epoch 20/50] step=360 train_loss=0.0224 tok_s=60162.4 opt_steps=360 +[epoch 20/50] step=380 train_loss=0.0225 tok_s=60179.6 opt_steps=380 +[epoch 20/50] step=400 train_loss=0.0225 tok_s=60164.3 opt_steps=400 +[epoch 20/50] step=420 train_loss=0.0226 tok_s=60177.6 opt_steps=420 +[epoch 20/50] step=440 train_loss=0.0226 tok_s=60183.2 opt_steps=440 +[epoch 20/50] step=460 train_loss=0.0226 tok_s=60170.9 opt_steps=460 +[epoch 20/50] step=480 train_loss=0.0227 tok_s=60173.8 opt_steps=480 +[epoch 20/50] step=500 train_loss=0.0228 tok_s=60160.1 opt_steps=500 +[epoch 20/50] step=520 train_loss=0.0229 tok_s=60170.0 opt_steps=520 +[epoch 20/50] step=540 train_loss=0.0229 tok_s=60159.1 opt_steps=540 +[epoch 20/50] step=560 train_loss=0.0230 tok_s=60152.3 opt_steps=560 +[epoch 20/50] step=580 train_loss=0.0231 tok_s=60160.5 opt_steps=580 +[epoch 20/50] step=600 train_loss=0.0231 tok_s=60143.0 opt_steps=600 +[epoch 20/50] step=620 train_loss=0.0232 tok_s=60170.7 opt_steps=620 +[epoch 20/50] step=640 train_loss=0.0232 tok_s=60171.7 opt_steps=640 +[epoch 20/50] step=660 train_loss=0.0233 tok_s=60180.1 opt_steps=660 +[epoch 20/50] step=680 train_loss=0.0233 tok_s=60187.1 opt_steps=680 +[epoch 20/50] step=700 train_loss=0.0233 tok_s=60183.6 opt_steps=700 +[epoch 20/50] step=720 train_loss=0.0233 tok_s=60173.4 opt_steps=720 +[epoch 20/50] step=740 train_loss=0.0234 tok_s=60182.0 opt_steps=740 +[epoch 20/50] step=760 train_loss=0.0234 tok_s=60195.2 opt_steps=760 +[epoch 20/50] step=780 train_loss=0.0235 tok_s=60208.1 opt_steps=780 +[epoch 20/50] step=800 train_loss=0.0235 tok_s=60208.2 opt_steps=800 +[epoch 20/50] step=820 train_loss=0.0236 tok_s=60202.3 opt_steps=820 +[epoch 20/50] step=840 train_loss=0.0237 tok_s=60200.2 opt_steps=840 +[epoch 20/50] step=860 train_loss=0.0237 tok_s=60208.8 opt_steps=860 +[epoch 20/50] step=880 train_loss=0.0237 tok_s=60221.6 opt_steps=880 +[epoch 20/50] step=900 train_loss=0.0237 tok_s=60233.6 opt_steps=900 +[epoch 20/50] step=920 train_loss=0.0238 tok_s=60242.6 opt_steps=920 +[epoch 20/50] step=940 train_loss=0.0238 tok_s=60239.3 opt_steps=940 +[epoch 20/50] step=960 train_loss=0.0238 tok_s=60245.0 opt_steps=960 +[epoch 20/50] step=980 train_loss=0.0238 tok_s=60245.1 opt_steps=980 +[epoch 20/50] step=1000 train_loss=0.0239 tok_s=60242.9 opt_steps=1000 +[epoch 20/50] step=1020 train_loss=0.0239 tok_s=60246.7 opt_steps=1020 +[epoch 20/50] step=1040 train_loss=0.0240 tok_s=60250.7 opt_steps=1040 +[epoch 20/50] step=1060 train_loss=0.0240 tok_s=60249.6 opt_steps=1060 +[epoch 20/50] step=1080 train_loss=0.0240 tok_s=60223.5 opt_steps=1080 +[epoch 20/50] step=1100 train_loss=0.0241 tok_s=60224.8 opt_steps=1100 +[epoch 20/50] step=1120 train_loss=0.0241 tok_s=60212.6 opt_steps=1120 +[epoch 20/50] step=1140 train_loss=0.0241 tok_s=60220.1 opt_steps=1140 +[epoch 20/50] step=1160 train_loss=0.0242 tok_s=60221.2 opt_steps=1160 +[epoch 20/50] step=1180 train_loss=0.0242 tok_s=60214.6 opt_steps=1180 +[epoch 20/50] step=1200 train_loss=0.0242 tok_s=60212.1 opt_steps=1200 +[epoch 20/50] step=1220 train_loss=0.0242 tok_s=60205.1 opt_steps=1220 +[epoch 20/50] step=1240 train_loss=0.0243 tok_s=60214.7 opt_steps=1240 +[epoch 20/50] step=1260 train_loss=0.0243 tok_s=60214.9 opt_steps=1260 +[epoch 20/50] step=1280 train_loss=0.0244 tok_s=60225.8 opt_steps=1280 +[epoch 20/50] step=1300 train_loss=0.0244 tok_s=60229.8 opt_steps=1300 +[epoch 20/50] step=1320 train_loss=0.0244 tok_s=60228.1 opt_steps=1320 +[epoch 20/50] step=1340 train_loss=0.0245 tok_s=60232.1 opt_steps=1340 +[epoch 20/50] step=1360 train_loss=0.0245 tok_s=60233.3 opt_steps=1360 +[epoch 20/50] step=1380 train_loss=0.0245 tok_s=60231.2 opt_steps=1380 +[epoch 20/50] step=1400 train_loss=0.0245 tok_s=60234.9 opt_steps=1400 +[epoch 20/50] step=1420 train_loss=0.0246 tok_s=60235.6 opt_steps=1420 +[epoch 20/50] step=1440 train_loss=0.0246 tok_s=60243.1 opt_steps=1440 +[epoch 20/50] step=1460 train_loss=0.0246 tok_s=60247.6 opt_steps=1460 +[epoch 20/50] step=1480 train_loss=0.0246 tok_s=60243.0 opt_steps=1480 +[epoch 20/50] step=1500 train_loss=0.0247 tok_s=60245.5 opt_steps=1500 +[epoch 20/50] step=1520 train_loss=0.0247 tok_s=60249.0 opt_steps=1520 +[epoch 20/50] step=1540 train_loss=0.0247 tok_s=60249.8 opt_steps=1540 +[epoch 20/50] step=1560 train_loss=0.0248 tok_s=60252.0 opt_steps=1560 +[epoch 20/50] step=1580 train_loss=0.0248 tok_s=60255.0 opt_steps=1580 +[epoch 20/50] step=1600 train_loss=0.0248 tok_s=60254.6 opt_steps=1600 +[epoch 20/50] step=1620 train_loss=0.0249 tok_s=60258.4 opt_steps=1620 +[epoch 20/50] step=1640 train_loss=0.0249 tok_s=60255.3 opt_steps=1640 +[epoch 20/50] step=1660 train_loss=0.0249 tok_s=60250.5 opt_steps=1660 +[epoch 20/50] step=1680 train_loss=0.0249 tok_s=60247.9 opt_steps=1680 +[epoch 20/50] step=1700 train_loss=0.0250 tok_s=60254.1 opt_steps=1700 +[epoch 20/50] step=1720 train_loss=0.0250 tok_s=60254.9 opt_steps=1720 +[epoch 20/50] step=1740 train_loss=0.0250 tok_s=60259.6 opt_steps=1740 +[epoch 20/50] step=1760 train_loss=0.0251 tok_s=60260.7 opt_steps=1760 +[epoch 20/50] step=1780 train_loss=0.0251 tok_s=60263.7 opt_steps=1780 +[epoch 20/50] step=1800 train_loss=0.0251 tok_s=60266.9 opt_steps=1800 +[epoch 20/50] step=1820 train_loss=0.0251 tok_s=60271.2 opt_steps=1820 +[epoch 20/50] step=1840 train_loss=0.0252 tok_s=60275.2 opt_steps=1840 +[epoch 20/50] step=1860 train_loss=0.0252 tok_s=60274.2 opt_steps=1860 +[epoch 20/50] step=1880 train_loss=0.0252 tok_s=60273.5 opt_steps=1880 +[epoch 20/50] step=1900 train_loss=0.0253 tok_s=60270.4 opt_steps=1900 +[epoch 20/50] step=1920 train_loss=0.0253 tok_s=60269.6 opt_steps=1920 +[epoch 20/50] step=1940 train_loss=0.0253 tok_s=60271.2 opt_steps=1940 +[epoch 20/50] step=1960 train_loss=0.0254 tok_s=60268.2 opt_steps=1960 +[epoch 20/50] step=1980 train_loss=0.0254 tok_s=60264.3 opt_steps=1980 +[epoch 20/50] step=2000 train_loss=0.0254 tok_s=60268.6 opt_steps=2000 +[epoch 20/50] step=2020 train_loss=0.0254 tok_s=60270.1 opt_steps=2020 +[epoch 20/50] step=2040 train_loss=0.0255 tok_s=60269.0 opt_steps=2040 +[epoch 20/50] step=2060 train_loss=0.0255 tok_s=60268.6 opt_steps=2060 +[epoch 20/50] step=2080 train_loss=0.0255 tok_s=60270.8 opt_steps=2080 +[epoch 20/50] step=2100 train_loss=0.0256 tok_s=60270.6 opt_steps=2100 +[epoch 20/50] step=2120 train_loss=0.0256 tok_s=60266.3 opt_steps=2120 +[epoch 20/50] step=2140 train_loss=0.0256 tok_s=60267.8 opt_steps=2140 +[epoch 20/50] step=2160 train_loss=0.0256 tok_s=60267.4 opt_steps=2160 +[epoch 20/50] step=2180 train_loss=0.0257 tok_s=60264.7 opt_steps=2180 +[epoch 20/50] step=2200 train_loss=0.0257 tok_s=60270.1 opt_steps=2200 +[epoch 20/50] step=2220 train_loss=0.0257 tok_s=60262.4 opt_steps=2220 +[epoch 20/50] step=2240 train_loss=0.0257 tok_s=60254.8 opt_steps=2240 +[epoch 20/50] step=2260 train_loss=0.0257 tok_s=60255.3 opt_steps=2260 +[epoch 20/50] step=2280 train_loss=0.0258 tok_s=60253.4 opt_steps=2280 +[epoch 20/50] step=2300 train_loss=0.0258 tok_s=60251.1 opt_steps=2300 +[epoch 20/50] step=2320 train_loss=0.0258 tok_s=60244.5 opt_steps=2320 +[epoch 20/50] step=2340 train_loss=0.0258 tok_s=60250.1 opt_steps=2340 +[epoch 20/50] step=2360 train_loss=0.0258 tok_s=60245.3 opt_steps=2360 +[epoch 20/50] step=2380 train_loss=0.0259 tok_s=60243.5 opt_steps=2380 +[epoch 20/50] step=2400 train_loss=0.0259 tok_s=60243.4 opt_steps=2400 +[epoch 20/50] step=2420 train_loss=0.0259 tok_s=60242.5 opt_steps=2420 +[epoch 20/50] step=2440 train_loss=0.0259 tok_s=60244.4 opt_steps=2440 +[epoch 20/50] step=2460 train_loss=0.0259 tok_s=60242.8 opt_steps=2460 +[epoch 20/50] step=2480 train_loss=0.0260 tok_s=60242.3 opt_steps=2480 +[epoch 20/50] step=2500 train_loss=0.0260 tok_s=60235.4 opt_steps=2500 +[epoch 20/50] step=2520 train_loss=0.0260 tok_s=60233.1 opt_steps=2520 +[epoch 20/50] step=2540 train_loss=0.0260 tok_s=60228.9 opt_steps=2540 +[epoch 20/50] step=2560 train_loss=0.0261 tok_s=60228.8 opt_steps=2560 +[epoch 20/50] step=2580 train_loss=0.0261 tok_s=60224.3 opt_steps=2580 +[epoch 20/50] step=2600 train_loss=0.0261 tok_s=60222.3 opt_steps=2600 +[epoch 20/50] step=2620 train_loss=0.0261 tok_s=60219.1 opt_steps=2620 +[epoch 20/50] step=2640 train_loss=0.0262 tok_s=60217.1 opt_steps=2640 +[epoch 20/50] step=2660 train_loss=0.0262 tok_s=60220.0 opt_steps=2660 +[epoch 20/50] step=2680 train_loss=0.0262 tok_s=60218.7 opt_steps=2680 +[epoch 20/50] step=2700 train_loss=0.0262 tok_s=60213.8 opt_steps=2700 +[epoch 20/50] step=2720 train_loss=0.0262 tok_s=60213.3 opt_steps=2720 +[epoch 20/50] step=2740 train_loss=0.0263 tok_s=60212.1 opt_steps=2740 +[epoch 20/50] step=2760 train_loss=0.0263 tok_s=60211.0 opt_steps=2760 +[epoch 20/50] step=2780 train_loss=0.0263 tok_s=60212.7 opt_steps=2780 +[epoch 20/50] step=2800 train_loss=0.0263 tok_s=60211.6 opt_steps=2800 +[epoch 20/50] step=2820 train_loss=0.0263 tok_s=60216.0 opt_steps=2820 +[epoch 20/50] step=2840 train_loss=0.0264 tok_s=60215.0 opt_steps=2840 +[epoch 20/50] step=2860 train_loss=0.0264 tok_s=60214.6 opt_steps=2860 +[epoch 20/50] step=2880 train_loss=0.0264 tok_s=60214.6 opt_steps=2880 +[epoch 20/50] step=2900 train_loss=0.0264 tok_s=60209.7 opt_steps=2900 +[epoch 20/50] step=2920 train_loss=0.0264 tok_s=60210.1 opt_steps=2920 +[epoch 20/50] step=2940 train_loss=0.0265 tok_s=60209.9 opt_steps=2940 +[epoch 20/50] step=2960 train_loss=0.0265 tok_s=60211.8 opt_steps=2960 +[epoch 20/50] step=2980 train_loss=0.0265 tok_s=60207.8 opt_steps=2980 +[epoch 20/50] step=3000 train_loss=0.0265 tok_s=60206.4 opt_steps=3000 +[epoch 20/50] step=3020 train_loss=0.0266 tok_s=60208.1 opt_steps=3020 +[epoch 20/50] step=3040 train_loss=0.0266 tok_s=60207.6 opt_steps=3040 +[epoch 20/50] step=3060 train_loss=0.0266 tok_s=60207.3 opt_steps=3060 +[epoch 20/50] step=3080 train_loss=0.0266 tok_s=60206.8 opt_steps=3080 +[epoch 20/50] step=3100 train_loss=0.0266 tok_s=60209.3 opt_steps=3100 +[epoch 20/50] step=3120 train_loss=0.0266 tok_s=60208.8 opt_steps=3120 +[epoch 20/50] step=3140 train_loss=0.0267 tok_s=60209.6 opt_steps=3140 +[epoch 20/50] step=3160 train_loss=0.0267 tok_s=60207.3 opt_steps=3160 +[epoch 20/50] step=3180 train_loss=0.0267 tok_s=60207.1 opt_steps=3180 +[epoch 20/50] step=3200 train_loss=0.0267 tok_s=60207.8 opt_steps=3200 +[epoch 20/50] step=3220 train_loss=0.0267 tok_s=60202.1 opt_steps=3220 +[epoch 20/50] step=3240 train_loss=0.0267 tok_s=60202.4 opt_steps=3240 +[epoch 20/50] step=3260 train_loss=0.0268 tok_s=60197.9 opt_steps=3260 +[epoch 20/50] train_loss=0.0268 val_skipped tok_s=60198.9 opt_steps=3273 +[epoch 21/50] step=20 train_loss=0.0208 tok_s=58495.4 opt_steps=20 +[epoch 21/50] step=40 train_loss=0.0205 tok_s=59399.4 opt_steps=40 +[epoch 21/50] step=60 train_loss=0.0209 tok_s=59625.6 opt_steps=60 +[epoch 21/50] step=80 train_loss=0.0207 tok_s=59828.7 opt_steps=80 +[epoch 21/50] step=100 train_loss=0.0206 tok_s=59999.6 opt_steps=100 +[epoch 21/50] step=120 train_loss=0.0207 tok_s=60006.8 opt_steps=120 +[epoch 21/50] step=140 train_loss=0.0207 tok_s=60006.6 opt_steps=140 +[epoch 21/50] step=160 train_loss=0.0208 tok_s=59975.2 opt_steps=160 +[epoch 21/50] step=180 train_loss=0.0208 tok_s=60046.9 opt_steps=180 +[epoch 21/50] step=200 train_loss=0.0210 tok_s=60057.1 opt_steps=200 +[epoch 21/50] step=220 train_loss=0.0210 tok_s=60053.8 opt_steps=220 +[epoch 21/50] step=240 train_loss=0.0210 tok_s=60051.5 opt_steps=240 +[epoch 21/50] step=260 train_loss=0.0211 tok_s=60026.7 opt_steps=260 +[epoch 21/50] step=280 train_loss=0.0212 tok_s=60013.0 opt_steps=280 +[epoch 21/50] step=300 train_loss=0.0212 tok_s=60030.7 opt_steps=300 +[epoch 21/50] step=320 train_loss=0.0213 tok_s=60027.2 opt_steps=320 +[epoch 21/50] step=340 train_loss=0.0214 tok_s=60018.5 opt_steps=340 +[epoch 21/50] step=360 train_loss=0.0214 tok_s=60047.9 opt_steps=360 +[epoch 21/50] step=380 train_loss=0.0214 tok_s=60081.5 opt_steps=380 +[epoch 21/50] step=400 train_loss=0.0215 tok_s=60090.7 opt_steps=400 +[epoch 21/50] step=420 train_loss=0.0215 tok_s=60090.6 opt_steps=420 +[epoch 21/50] step=440 train_loss=0.0216 tok_s=60092.3 opt_steps=440 +[epoch 21/50] step=460 train_loss=0.0217 tok_s=60103.7 opt_steps=460 +[epoch 21/50] step=480 train_loss=0.0217 tok_s=60101.8 opt_steps=480 +[epoch 21/50] step=500 train_loss=0.0217 tok_s=60114.8 opt_steps=500 +[epoch 21/50] step=520 train_loss=0.0218 tok_s=60106.5 opt_steps=520 +[epoch 21/50] step=540 train_loss=0.0219 tok_s=60110.3 opt_steps=540 +[epoch 21/50] step=560 train_loss=0.0219 tok_s=60124.5 opt_steps=560 +[epoch 21/50] step=580 train_loss=0.0219 tok_s=60113.7 opt_steps=580 +[epoch 21/50] step=600 train_loss=0.0220 tok_s=60125.4 opt_steps=600 +[epoch 21/50] step=620 train_loss=0.0220 tok_s=60119.3 opt_steps=620 +[epoch 21/50] step=640 train_loss=0.0220 tok_s=60121.6 opt_steps=640 +[epoch 21/50] step=660 train_loss=0.0221 tok_s=60126.1 opt_steps=660 +[epoch 21/50] step=680 train_loss=0.0221 tok_s=60135.5 opt_steps=680 +[epoch 21/50] step=700 train_loss=0.0222 tok_s=60131.7 opt_steps=700 +[epoch 21/50] step=720 train_loss=0.0222 tok_s=60146.3 opt_steps=720 +[epoch 21/50] step=740 train_loss=0.0222 tok_s=60137.5 opt_steps=740 +[epoch 21/50] step=760 train_loss=0.0223 tok_s=60153.6 opt_steps=760 +[epoch 21/50] step=780 train_loss=0.0223 tok_s=60152.3 opt_steps=780 +[epoch 21/50] step=800 train_loss=0.0223 tok_s=60158.4 opt_steps=800 +[epoch 21/50] step=820 train_loss=0.0223 tok_s=60159.9 opt_steps=820 +[epoch 21/50] step=840 train_loss=0.0224 tok_s=60156.8 opt_steps=840 +[epoch 21/50] step=860 train_loss=0.0224 tok_s=60157.1 opt_steps=860 +[epoch 21/50] step=880 train_loss=0.0225 tok_s=60159.9 opt_steps=880 +[epoch 21/50] step=900 train_loss=0.0225 tok_s=60162.1 opt_steps=900 +[epoch 21/50] step=920 train_loss=0.0225 tok_s=60177.7 opt_steps=920 +[epoch 21/50] step=940 train_loss=0.0226 tok_s=60179.4 opt_steps=940 +[epoch 21/50] step=960 train_loss=0.0226 tok_s=60183.2 opt_steps=960 +[epoch 21/50] step=980 train_loss=0.0226 tok_s=60176.6 opt_steps=980 +[epoch 21/50] step=1000 train_loss=0.0226 tok_s=60178.5 opt_steps=1000 +[epoch 21/50] step=1020 train_loss=0.0227 tok_s=60189.3 opt_steps=1020 +[epoch 21/50] step=1040 train_loss=0.0227 tok_s=60193.7 opt_steps=1040 +[epoch 21/50] step=1060 train_loss=0.0227 tok_s=60203.3 opt_steps=1060 +[epoch 21/50] step=1080 train_loss=0.0228 tok_s=60204.9 opt_steps=1080 +[epoch 21/50] step=1100 train_loss=0.0228 tok_s=60202.6 opt_steps=1100 +[epoch 21/50] step=1120 train_loss=0.0228 tok_s=60205.4 opt_steps=1120 +[epoch 21/50] step=1140 train_loss=0.0228 tok_s=60201.4 opt_steps=1140 +[epoch 21/50] step=1160 train_loss=0.0229 tok_s=60199.1 opt_steps=1160 +[epoch 21/50] step=1180 train_loss=0.0229 tok_s=60200.5 opt_steps=1180 +[epoch 21/50] step=1200 train_loss=0.0229 tok_s=60204.7 opt_steps=1200 +[epoch 21/50] step=1220 train_loss=0.0229 tok_s=60199.4 opt_steps=1220 +[epoch 21/50] step=1240 train_loss=0.0230 tok_s=60202.5 opt_steps=1240 +[epoch 21/50] step=1260 train_loss=0.0230 tok_s=60204.9 opt_steps=1260 +[epoch 21/50] step=1280 train_loss=0.0231 tok_s=60201.0 opt_steps=1280 +[epoch 21/50] step=1300 train_loss=0.0231 tok_s=60200.0 opt_steps=1300 +[epoch 21/50] step=1320 train_loss=0.0231 tok_s=60196.2 opt_steps=1320 +[epoch 21/50] step=1340 train_loss=0.0231 tok_s=60192.6 opt_steps=1340 +[epoch 21/50] step=1360 train_loss=0.0232 tok_s=60190.4 opt_steps=1360 +[epoch 21/50] step=1380 train_loss=0.0232 tok_s=60202.1 opt_steps=1380 +[epoch 21/50] step=1400 train_loss=0.0233 tok_s=60199.5 opt_steps=1400 +[epoch 21/50] step=1420 train_loss=0.0233 tok_s=60198.5 opt_steps=1420 +[epoch 21/50] step=1440 train_loss=0.0233 tok_s=60193.2 opt_steps=1440 +[epoch 21/50] step=1460 train_loss=0.0233 tok_s=60196.7 opt_steps=1460 +[epoch 21/50] step=1480 train_loss=0.0233 tok_s=60202.2 opt_steps=1480 +[epoch 21/50] step=1500 train_loss=0.0234 tok_s=60207.1 opt_steps=1500 +[epoch 21/50] step=1520 train_loss=0.0234 tok_s=60204.3 opt_steps=1520 +[epoch 21/50] step=1540 train_loss=0.0235 tok_s=60203.9 opt_steps=1540 +[epoch 21/50] step=1560 train_loss=0.0235 tok_s=60210.2 opt_steps=1560 +[epoch 21/50] step=1580 train_loss=0.0235 tok_s=60214.8 opt_steps=1580 +[epoch 21/50] step=1600 train_loss=0.0235 tok_s=60211.2 opt_steps=1600 +[epoch 21/50] step=1620 train_loss=0.0236 tok_s=60206.3 opt_steps=1620 +[epoch 21/50] step=1640 train_loss=0.0236 tok_s=60205.9 opt_steps=1640 +[epoch 21/50] step=1660 train_loss=0.0237 tok_s=60208.0 opt_steps=1660 +[epoch 21/50] step=1680 train_loss=0.0237 tok_s=60205.3 opt_steps=1680 +[epoch 21/50] step=1700 train_loss=0.0237 tok_s=60204.7 opt_steps=1700 +[epoch 21/50] step=1720 train_loss=0.0237 tok_s=60204.0 opt_steps=1720 +[epoch 21/50] step=1740 train_loss=0.0237 tok_s=60200.3 opt_steps=1740 +[epoch 21/50] step=1760 train_loss=0.0237 tok_s=60201.4 opt_steps=1760 +[epoch 21/50] step=1780 train_loss=0.0238 tok_s=60202.8 opt_steps=1780 +[epoch 21/50] step=1800 train_loss=0.0238 tok_s=60202.6 opt_steps=1800 +[epoch 21/50] step=1820 train_loss=0.0238 tok_s=60204.6 opt_steps=1820 +[epoch 21/50] step=1840 train_loss=0.0238 tok_s=60202.8 opt_steps=1840 +[epoch 21/50] step=1860 train_loss=0.0239 tok_s=60208.0 opt_steps=1860 +[epoch 21/50] step=1880 train_loss=0.0239 tok_s=60205.3 opt_steps=1880 +[epoch 21/50] step=1900 train_loss=0.0239 tok_s=60213.8 opt_steps=1900 +[epoch 21/50] step=1920 train_loss=0.0239 tok_s=60216.3 opt_steps=1920 +[epoch 21/50] step=1940 train_loss=0.0240 tok_s=60216.3 opt_steps=1940 +[epoch 21/50] step=1960 train_loss=0.0240 tok_s=60215.3 opt_steps=1960 +[epoch 21/50] step=1980 train_loss=0.0240 tok_s=60206.2 opt_steps=1980 +[epoch 21/50] step=2000 train_loss=0.0240 tok_s=60207.5 opt_steps=2000 +[epoch 21/50] step=2020 train_loss=0.0241 tok_s=60204.8 opt_steps=2020 +[epoch 21/50] step=2040 train_loss=0.0241 tok_s=60201.1 opt_steps=2040 +[epoch 21/50] step=2060 train_loss=0.0241 tok_s=60199.6 opt_steps=2060 +[epoch 21/50] step=2080 train_loss=0.0241 tok_s=60207.1 opt_steps=2080 +[epoch 21/50] step=2100 train_loss=0.0241 tok_s=60205.1 opt_steps=2100 +[epoch 21/50] step=2120 train_loss=0.0242 tok_s=60204.3 opt_steps=2120 +[epoch 21/50] step=2140 train_loss=0.0242 tok_s=60204.5 opt_steps=2140 +[epoch 21/50] step=2160 train_loss=0.0242 tok_s=60207.6 opt_steps=2160 +[epoch 21/50] step=2180 train_loss=0.0242 tok_s=60212.6 opt_steps=2180 +[epoch 21/50] step=2200 train_loss=0.0243 tok_s=60211.6 opt_steps=2200 +[epoch 21/50] step=2220 train_loss=0.0243 tok_s=60212.1 opt_steps=2220 +[epoch 21/50] step=2240 train_loss=0.0243 tok_s=60216.0 opt_steps=2240 +[epoch 21/50] step=2260 train_loss=0.0243 tok_s=60220.4 opt_steps=2260 +[epoch 21/50] step=2280 train_loss=0.0243 tok_s=60217.6 opt_steps=2280 +[epoch 21/50] step=2300 train_loss=0.0244 tok_s=60218.8 opt_steps=2300 +[epoch 21/50] step=2320 train_loss=0.0244 tok_s=60217.6 opt_steps=2320 +[epoch 21/50] step=2340 train_loss=0.0244 tok_s=60217.6 opt_steps=2340 +[epoch 21/50] step=2360 train_loss=0.0244 tok_s=60217.0 opt_steps=2360 +[epoch 21/50] step=2380 train_loss=0.0245 tok_s=60219.2 opt_steps=2380 +[epoch 21/50] step=2400 train_loss=0.0245 tok_s=60222.6 opt_steps=2400 +[epoch 21/50] step=2420 train_loss=0.0245 tok_s=60221.0 opt_steps=2420 +[epoch 21/50] step=2440 train_loss=0.0245 tok_s=60219.6 opt_steps=2440 +[epoch 21/50] step=2460 train_loss=0.0245 tok_s=60220.2 opt_steps=2460 +[epoch 21/50] step=2480 train_loss=0.0246 tok_s=60222.7 opt_steps=2480 +[epoch 21/50] step=2500 train_loss=0.0246 tok_s=60227.3 opt_steps=2500 +[epoch 21/50] step=2520 train_loss=0.0246 tok_s=60227.9 opt_steps=2520 +[epoch 21/50] step=2540 train_loss=0.0246 tok_s=60223.0 opt_steps=2540 +[epoch 21/50] step=2560 train_loss=0.0246 tok_s=60222.7 opt_steps=2560 +[epoch 21/50] step=2580 train_loss=0.0247 tok_s=60223.5 opt_steps=2580 +[epoch 21/50] step=2600 train_loss=0.0247 tok_s=60225.8 opt_steps=2600 +[epoch 21/50] step=2620 train_loss=0.0247 tok_s=60224.7 opt_steps=2620 +[epoch 21/50] step=2640 train_loss=0.0247 tok_s=60227.6 opt_steps=2640 +[epoch 21/50] step=2660 train_loss=0.0247 tok_s=60227.6 opt_steps=2660 +[epoch 21/50] step=2680 train_loss=0.0248 tok_s=60229.8 opt_steps=2680 +[epoch 21/50] step=2700 train_loss=0.0248 tok_s=60230.5 opt_steps=2700 +[epoch 21/50] step=2720 train_loss=0.0248 tok_s=60226.7 opt_steps=2720 +[epoch 21/50] step=2740 train_loss=0.0248 tok_s=60225.9 opt_steps=2740 +[epoch 21/50] step=2760 train_loss=0.0248 tok_s=60228.4 opt_steps=2760 +[epoch 21/50] step=2780 train_loss=0.0249 tok_s=60226.1 opt_steps=2780 +[epoch 21/50] step=2800 train_loss=0.0249 tok_s=60225.3 opt_steps=2800 +[epoch 21/50] step=2820 train_loss=0.0249 tok_s=60229.8 opt_steps=2820 +[epoch 21/50] step=2840 train_loss=0.0249 tok_s=60228.6 opt_steps=2840 +[epoch 21/50] step=2860 train_loss=0.0249 tok_s=60228.8 opt_steps=2860 +[epoch 21/50] step=2880 train_loss=0.0249 tok_s=60224.3 opt_steps=2880 +[epoch 21/50] step=2900 train_loss=0.0250 tok_s=60225.5 opt_steps=2900 +[epoch 21/50] step=2920 train_loss=0.0250 tok_s=60226.2 opt_steps=2920 +[epoch 21/50] step=2940 train_loss=0.0250 tok_s=60221.5 opt_steps=2940 +[epoch 21/50] step=2960 train_loss=0.0250 tok_s=60218.4 opt_steps=2960 +[epoch 21/50] step=2980 train_loss=0.0250 tok_s=60217.4 opt_steps=2980 +[epoch 21/50] step=3000 train_loss=0.0250 tok_s=60217.7 opt_steps=3000 +[epoch 21/50] step=3020 train_loss=0.0251 tok_s=60219.5 opt_steps=3020 +[epoch 21/50] step=3040 train_loss=0.0251 tok_s=60221.5 opt_steps=3040 +[epoch 21/50] step=3060 train_loss=0.0251 tok_s=60218.3 opt_steps=3060 +[epoch 21/50] step=3080 train_loss=0.0251 tok_s=60216.5 opt_steps=3080 +[epoch 21/50] step=3100 train_loss=0.0251 tok_s=60216.0 opt_steps=3100 +[epoch 21/50] step=3120 train_loss=0.0251 tok_s=60212.7 opt_steps=3120 +[epoch 21/50] step=3140 train_loss=0.0252 tok_s=60210.0 opt_steps=3140 +[epoch 21/50] step=3160 train_loss=0.0252 tok_s=60207.2 opt_steps=3160 +[epoch 21/50] step=3180 train_loss=0.0252 tok_s=60207.0 opt_steps=3180 +[epoch 21/50] step=3200 train_loss=0.0252 tok_s=60207.3 opt_steps=3200 +[epoch 21/50] step=3220 train_loss=0.0252 tok_s=60207.3 opt_steps=3220 +[epoch 21/50] step=3240 train_loss=0.0253 tok_s=60209.6 opt_steps=3240 +[epoch 21/50] step=3260 train_loss=0.0253 tok_s=60201.8 opt_steps=3260 +[epoch 21/50] train_loss=0.0253 val_skipped tok_s=60203.1 opt_steps=3273 +[epoch 22/50] step=20 train_loss=0.0201 tok_s=58265.2 opt_steps=20 +[epoch 22/50] step=40 train_loss=0.0196 tok_s=59266.0 opt_steps=40 +[epoch 22/50] step=60 train_loss=0.0195 tok_s=59559.0 opt_steps=60 +[epoch 22/50] step=80 train_loss=0.0197 tok_s=59658.9 opt_steps=80 +[epoch 22/50] step=100 train_loss=0.0198 tok_s=59749.9 opt_steps=100 +[epoch 22/50] step=120 train_loss=0.0196 tok_s=59885.8 opt_steps=120 +[epoch 22/50] step=140 train_loss=0.0196 tok_s=59918.6 opt_steps=140 +[epoch 22/50] step=160 train_loss=0.0196 tok_s=59950.5 opt_steps=160 +[epoch 22/50] step=180 train_loss=0.0196 tok_s=59969.2 opt_steps=180 +[epoch 22/50] step=200 train_loss=0.0197 tok_s=59995.2 opt_steps=200 +[epoch 22/50] step=220 train_loss=0.0196 tok_s=60010.9 opt_steps=220 +[epoch 22/50] step=240 train_loss=0.0196 tok_s=60032.0 opt_steps=240 +[epoch 22/50] step=260 train_loss=0.0197 tok_s=60048.6 opt_steps=260 +[epoch 22/50] step=280 train_loss=0.0197 tok_s=60111.4 opt_steps=280 +[epoch 22/50] step=300 train_loss=0.0198 tok_s=60144.3 opt_steps=300 +[epoch 22/50] step=320 train_loss=0.0198 tok_s=60138.0 opt_steps=320 +[epoch 22/50] step=340 train_loss=0.0200 tok_s=60148.0 opt_steps=340 +[epoch 22/50] step=360 train_loss=0.0201 tok_s=60143.7 opt_steps=360 +[epoch 22/50] step=380 train_loss=0.0201 tok_s=60169.5 opt_steps=380 +[epoch 22/50] step=400 train_loss=0.0202 tok_s=60166.8 opt_steps=400 +[epoch 22/50] step=420 train_loss=0.0202 tok_s=60150.7 opt_steps=420 +[epoch 22/50] step=440 train_loss=0.0203 tok_s=60170.0 opt_steps=440 +[epoch 22/50] step=460 train_loss=0.0203 tok_s=60174.8 opt_steps=460 +[epoch 22/50] step=480 train_loss=0.0203 tok_s=60171.8 opt_steps=480 +[epoch 22/50] step=500 train_loss=0.0204 tok_s=60184.0 opt_steps=500 +[epoch 22/50] step=520 train_loss=0.0204 tok_s=60191.9 opt_steps=520 +[epoch 22/50] step=540 train_loss=0.0205 tok_s=60185.7 opt_steps=540 +[epoch 22/50] step=560 train_loss=0.0205 tok_s=60170.7 opt_steps=560 +[epoch 22/50] step=580 train_loss=0.0206 tok_s=60164.3 opt_steps=580 +[epoch 22/50] step=600 train_loss=0.0206 tok_s=60164.7 opt_steps=600 +[epoch 22/50] step=620 train_loss=0.0206 tok_s=60150.8 opt_steps=620 +[epoch 22/50] step=640 train_loss=0.0207 tok_s=60155.2 opt_steps=640 +[epoch 22/50] step=660 train_loss=0.0207 tok_s=60155.5 opt_steps=660 +[epoch 22/50] step=680 train_loss=0.0208 tok_s=60150.9 opt_steps=680 +[epoch 22/50] step=700 train_loss=0.0208 tok_s=60153.5 opt_steps=700 +[epoch 22/50] step=720 train_loss=0.0209 tok_s=60158.1 opt_steps=720 +[epoch 22/50] step=740 train_loss=0.0209 tok_s=60158.8 opt_steps=740 +[epoch 22/50] step=760 train_loss=0.0210 tok_s=60166.6 opt_steps=760 +[epoch 22/50] step=780 train_loss=0.0210 tok_s=60172.9 opt_steps=780 +[epoch 22/50] step=800 train_loss=0.0210 tok_s=60161.0 opt_steps=800 +[epoch 22/50] step=820 train_loss=0.0211 tok_s=60166.9 opt_steps=820 +[epoch 22/50] step=840 train_loss=0.0211 tok_s=60171.3 opt_steps=840 +[epoch 22/50] step=860 train_loss=0.0211 tok_s=60173.9 opt_steps=860 +[epoch 22/50] step=880 train_loss=0.0212 tok_s=60171.2 opt_steps=880 +[epoch 22/50] step=900 train_loss=0.0212 tok_s=60171.0 opt_steps=900 +[epoch 22/50] step=920 train_loss=0.0212 tok_s=60167.3 opt_steps=920 +[epoch 22/50] step=940 train_loss=0.0213 tok_s=60171.2 opt_steps=940 +[epoch 22/50] step=960 train_loss=0.0213 tok_s=60167.5 opt_steps=960 +[epoch 22/50] step=980 train_loss=0.0213 tok_s=60168.8 opt_steps=980 +[epoch 22/50] step=1000 train_loss=0.0214 tok_s=60164.1 opt_steps=1000 +[epoch 22/50] step=1020 train_loss=0.0214 tok_s=60167.4 opt_steps=1020 +[epoch 22/50] step=1040 train_loss=0.0215 tok_s=60168.6 opt_steps=1040 +[epoch 22/50] step=1060 train_loss=0.0215 tok_s=60172.2 opt_steps=1060 +[epoch 22/50] step=1080 train_loss=0.0216 tok_s=60173.7 opt_steps=1080 +[epoch 22/50] step=1100 train_loss=0.0216 tok_s=60163.9 opt_steps=1100 +[epoch 22/50] step=1120 train_loss=0.0216 tok_s=60169.6 opt_steps=1120 +[epoch 22/50] step=1140 train_loss=0.0217 tok_s=60167.1 opt_steps=1140 +[epoch 22/50] step=1160 train_loss=0.0217 tok_s=60165.7 opt_steps=1160 +[epoch 22/50] step=1180 train_loss=0.0217 tok_s=60179.6 opt_steps=1180 +[epoch 22/50] step=1200 train_loss=0.0218 tok_s=60172.4 opt_steps=1200 +[epoch 22/50] step=1220 train_loss=0.0218 tok_s=60164.2 opt_steps=1220 +[epoch 22/50] step=1240 train_loss=0.0218 tok_s=60158.2 opt_steps=1240 +[epoch 22/50] step=1260 train_loss=0.0218 tok_s=60154.8 opt_steps=1260 +[epoch 22/50] step=1280 train_loss=0.0219 tok_s=60159.2 opt_steps=1280 +[epoch 22/50] step=1300 train_loss=0.0219 tok_s=60159.3 opt_steps=1300 +[epoch 22/50] step=1320 train_loss=0.0219 tok_s=60154.3 opt_steps=1320 +[epoch 22/50] step=1340 train_loss=0.0220 tok_s=60162.4 opt_steps=1340 +[epoch 22/50] step=1360 train_loss=0.0220 tok_s=60162.8 opt_steps=1360 +[epoch 22/50] step=1380 train_loss=0.0220 tok_s=60163.5 opt_steps=1380 +[epoch 22/50] step=1400 train_loss=0.0220 tok_s=60160.8 opt_steps=1400 +[epoch 22/50] step=1420 train_loss=0.0220 tok_s=60158.6 opt_steps=1420 +[epoch 22/50] step=1440 train_loss=0.0221 tok_s=60151.7 opt_steps=1440 +[epoch 22/50] step=1460 train_loss=0.0221 tok_s=60146.8 opt_steps=1460 +[epoch 22/50] step=1480 train_loss=0.0221 tok_s=60147.4 opt_steps=1480 +[epoch 22/50] step=1500 train_loss=0.0222 tok_s=60146.1 opt_steps=1500 +[epoch 22/50] step=1520 train_loss=0.0222 tok_s=60147.9 opt_steps=1520 +[epoch 22/50] step=1540 train_loss=0.0222 tok_s=60145.6 opt_steps=1540 +[epoch 22/50] step=1560 train_loss=0.0222 tok_s=60146.6 opt_steps=1560 +[epoch 22/50] step=1580 train_loss=0.0222 tok_s=60145.8 opt_steps=1580 +[epoch 22/50] step=1600 train_loss=0.0223 tok_s=60149.9 opt_steps=1600 +[epoch 22/50] step=1620 train_loss=0.0223 tok_s=60147.5 opt_steps=1620 +[epoch 22/50] step=1640 train_loss=0.0223 tok_s=60146.8 opt_steps=1640 +[epoch 22/50] step=1660 train_loss=0.0223 tok_s=60150.8 opt_steps=1660 +[epoch 22/50] step=1680 train_loss=0.0224 tok_s=60149.7 opt_steps=1680 +[epoch 22/50] step=1700 train_loss=0.0224 tok_s=60149.2 opt_steps=1700 +[epoch 22/50] step=1720 train_loss=0.0224 tok_s=60155.1 opt_steps=1720 +[epoch 22/50] step=1740 train_loss=0.0224 tok_s=60154.6 opt_steps=1740 +[epoch 22/50] step=1760 train_loss=0.0225 tok_s=60152.7 opt_steps=1760 +[epoch 22/50] step=1780 train_loss=0.0225 tok_s=60147.3 opt_steps=1780 +[epoch 22/50] step=1800 train_loss=0.0225 tok_s=60150.6 opt_steps=1800 +[epoch 22/50] step=1820 train_loss=0.0225 tok_s=60153.5 opt_steps=1820 +[epoch 22/50] step=1840 train_loss=0.0226 tok_s=60158.5 opt_steps=1840 +[epoch 22/50] step=1860 train_loss=0.0226 tok_s=60158.7 opt_steps=1860 +[epoch 22/50] step=1880 train_loss=0.0226 tok_s=60152.9 opt_steps=1880 +[epoch 22/50] step=1900 train_loss=0.0227 tok_s=60155.1 opt_steps=1900 +[epoch 22/50] step=1920 train_loss=0.0227 tok_s=60154.9 opt_steps=1920 +[epoch 22/50] step=1940 train_loss=0.0227 tok_s=60154.4 opt_steps=1940 +[epoch 22/50] step=1960 train_loss=0.0227 tok_s=60156.4 opt_steps=1960 +[epoch 22/50] step=1980 train_loss=0.0228 tok_s=60157.9 opt_steps=1980 +[epoch 22/50] step=2000 train_loss=0.0228 tok_s=60162.0 opt_steps=2000 +[epoch 22/50] step=2020 train_loss=0.0228 tok_s=60162.9 opt_steps=2020 +[epoch 22/50] step=2040 train_loss=0.0228 tok_s=60161.1 opt_steps=2040 +[epoch 22/50] step=2060 train_loss=0.0228 tok_s=60157.2 opt_steps=2060 +[epoch 22/50] step=2080 train_loss=0.0229 tok_s=60157.9 opt_steps=2080 +[epoch 22/50] step=2100 train_loss=0.0229 tok_s=60162.3 opt_steps=2100 +[epoch 22/50] step=2120 train_loss=0.0229 tok_s=60163.2 opt_steps=2120 +[epoch 22/50] step=2140 train_loss=0.0229 tok_s=60165.5 opt_steps=2140 +[epoch 22/50] step=2160 train_loss=0.0229 tok_s=60166.6 opt_steps=2160 +[epoch 22/50] step=2180 train_loss=0.0230 tok_s=60164.4 opt_steps=2180 +[epoch 22/50] step=2200 train_loss=0.0230 tok_s=60160.8 opt_steps=2200 +[epoch 22/50] step=2220 train_loss=0.0230 tok_s=60162.3 opt_steps=2220 +[epoch 22/50] step=2240 train_loss=0.0230 tok_s=60159.1 opt_steps=2240 +[epoch 22/50] step=2260 train_loss=0.0230 tok_s=60160.3 opt_steps=2260 +[epoch 22/50] step=2280 train_loss=0.0231 tok_s=60160.3 opt_steps=2280 +[epoch 22/50] step=2300 train_loss=0.0231 tok_s=60159.8 opt_steps=2300 +[epoch 22/50] step=2320 train_loss=0.0231 tok_s=60163.4 opt_steps=2320 +[epoch 22/50] step=2340 train_loss=0.0231 tok_s=60165.0 opt_steps=2340 +[epoch 22/50] step=2360 train_loss=0.0232 tok_s=60163.7 opt_steps=2360 +[epoch 22/50] step=2380 train_loss=0.0232 tok_s=60160.6 opt_steps=2380 +[epoch 22/50] step=2400 train_loss=0.0232 tok_s=60156.9 opt_steps=2400 +[epoch 22/50] step=2420 train_loss=0.0232 tok_s=60156.6 opt_steps=2420 +[epoch 22/50] step=2440 train_loss=0.0233 tok_s=60153.9 opt_steps=2440 +[epoch 22/50] step=2460 train_loss=0.0233 tok_s=60146.3 opt_steps=2460 +[epoch 22/50] step=2480 train_loss=0.0233 tok_s=60140.5 opt_steps=2480 +[epoch 22/50] step=2500 train_loss=0.0233 tok_s=60141.7 opt_steps=2500 +[epoch 22/50] step=2520 train_loss=0.0233 tok_s=60148.0 opt_steps=2520 +[epoch 22/50] step=2540 train_loss=0.0233 tok_s=60148.5 opt_steps=2540 +[epoch 22/50] step=2560 train_loss=0.0234 tok_s=60145.5 opt_steps=2560 +[epoch 22/50] step=2580 train_loss=0.0234 tok_s=60149.4 opt_steps=2580 +[epoch 22/50] step=2600 train_loss=0.0234 tok_s=60149.7 opt_steps=2600 +[epoch 22/50] step=2620 train_loss=0.0234 tok_s=60147.6 opt_steps=2620 +[epoch 22/50] step=2640 train_loss=0.0235 tok_s=60148.5 opt_steps=2640 +[epoch 22/50] step=2660 train_loss=0.0235 tok_s=60151.2 opt_steps=2660 +[epoch 22/50] step=2680 train_loss=0.0235 tok_s=60149.7 opt_steps=2680 +[epoch 22/50] step=2700 train_loss=0.0235 tok_s=60152.0 opt_steps=2700 +[epoch 22/50] step=2720 train_loss=0.0235 tok_s=60154.1 opt_steps=2720 +[epoch 22/50] step=2740 train_loss=0.0236 tok_s=60151.5 opt_steps=2740 +[epoch 22/50] step=2760 train_loss=0.0236 tok_s=60150.3 opt_steps=2760 +[epoch 22/50] step=2780 train_loss=0.0236 tok_s=60151.7 opt_steps=2780 +[epoch 22/50] step=2800 train_loss=0.0236 tok_s=60153.0 opt_steps=2800 +[epoch 22/50] step=2820 train_loss=0.0236 tok_s=60149.3 opt_steps=2820 +[epoch 22/50] step=2840 train_loss=0.0237 tok_s=60152.1 opt_steps=2840 +[epoch 22/50] step=2860 train_loss=0.0237 tok_s=60151.4 opt_steps=2860 +[epoch 22/50] step=2880 train_loss=0.0237 tok_s=60145.6 opt_steps=2880 +[epoch 22/50] step=2900 train_loss=0.0237 tok_s=60144.4 opt_steps=2900 +[epoch 22/50] step=2920 train_loss=0.0237 tok_s=60141.5 opt_steps=2920 +[epoch 22/50] step=2940 train_loss=0.0237 tok_s=60143.1 opt_steps=2940 +[epoch 22/50] step=2960 train_loss=0.0238 tok_s=60145.4 opt_steps=2960 +[epoch 22/50] step=2980 train_loss=0.0238 tok_s=60143.5 opt_steps=2980 +[epoch 22/50] step=3000 train_loss=0.0238 tok_s=60142.2 opt_steps=3000 +[epoch 22/50] step=3020 train_loss=0.0238 tok_s=60141.5 opt_steps=3020 +[epoch 22/50] step=3040 train_loss=0.0238 tok_s=60142.5 opt_steps=3040 +[epoch 22/50] step=3060 train_loss=0.0238 tok_s=60146.5 opt_steps=3060 +[epoch 22/50] step=3080 train_loss=0.0238 tok_s=60146.8 opt_steps=3080 +[epoch 22/50] step=3100 train_loss=0.0239 tok_s=60146.4 opt_steps=3100 +[epoch 22/50] step=3120 train_loss=0.0239 tok_s=60148.6 opt_steps=3120 +[epoch 22/50] step=3140 train_loss=0.0239 tok_s=60144.8 opt_steps=3140 +[epoch 22/50] step=3160 train_loss=0.0239 tok_s=60143.4 opt_steps=3160 +[epoch 22/50] step=3180 train_loss=0.0239 tok_s=60144.2 opt_steps=3180 +[epoch 22/50] step=3200 train_loss=0.0239 tok_s=60143.9 opt_steps=3200 +[epoch 22/50] step=3220 train_loss=0.0240 tok_s=60144.1 opt_steps=3220 +[epoch 22/50] step=3240 train_loss=0.0240 tok_s=60148.5 opt_steps=3240 +[epoch 22/50] step=3260 train_loss=0.0240 tok_s=60146.1 opt_steps=3260 +[epoch 22/50] train_loss=0.0240 val_skipped tok_s=60144.8 opt_steps=3273 +[epoch 23/50] step=20 train_loss=0.0189 tok_s=58308.3 opt_steps=20 +[epoch 23/50] step=40 train_loss=0.0190 tok_s=59212.3 opt_steps=40 +[epoch 23/50] step=60 train_loss=0.0189 tok_s=59422.2 opt_steps=60 +[epoch 23/50] step=80 train_loss=0.0187 tok_s=59736.1 opt_steps=80 +[epoch 23/50] step=100 train_loss=0.0186 tok_s=59857.8 opt_steps=100 +[epoch 23/50] step=120 train_loss=0.0186 tok_s=59930.6 opt_steps=120 +[epoch 23/50] step=140 train_loss=0.0185 tok_s=59957.6 opt_steps=140 +[epoch 23/50] step=160 train_loss=0.0185 tok_s=59946.2 opt_steps=160 +[epoch 23/50] step=180 train_loss=0.0186 tok_s=59982.8 opt_steps=180 +[epoch 23/50] step=200 train_loss=0.0186 tok_s=59986.1 opt_steps=200 +[epoch 23/50] step=220 train_loss=0.0186 tok_s=59995.0 opt_steps=220 +[epoch 23/50] step=240 train_loss=0.0187 tok_s=59975.2 opt_steps=240 +[epoch 23/50] step=260 train_loss=0.0188 tok_s=60010.9 opt_steps=260 +[epoch 23/50] step=280 train_loss=0.0189 tok_s=60080.9 opt_steps=280 +[epoch 23/50] step=300 train_loss=0.0189 tok_s=60089.5 opt_steps=300 +[epoch 23/50] step=320 train_loss=0.0190 tok_s=60122.8 opt_steps=320 +[epoch 23/50] step=340 train_loss=0.0190 tok_s=60145.7 opt_steps=340 +[epoch 23/50] step=360 train_loss=0.0191 tok_s=60165.2 opt_steps=360 +[epoch 23/50] step=380 train_loss=0.0191 tok_s=60168.5 opt_steps=380 +[epoch 23/50] step=400 train_loss=0.0192 tok_s=60188.6 opt_steps=400 +[epoch 23/50] step=420 train_loss=0.0192 tok_s=60171.1 opt_steps=420 +[epoch 23/50] step=440 train_loss=0.0192 tok_s=60175.9 opt_steps=440 +[epoch 23/50] step=460 train_loss=0.0193 tok_s=60168.5 opt_steps=460 +[epoch 23/50] step=480 train_loss=0.0194 tok_s=60158.8 opt_steps=480 +[epoch 23/50] step=500 train_loss=0.0194 tok_s=60172.8 opt_steps=500 +[epoch 23/50] step=520 train_loss=0.0194 tok_s=60185.0 opt_steps=520 +[epoch 23/50] step=540 train_loss=0.0195 tok_s=60193.7 opt_steps=540 +[epoch 23/50] step=560 train_loss=0.0195 tok_s=60191.0 opt_steps=560 +[epoch 23/50] step=580 train_loss=0.0195 tok_s=60181.5 opt_steps=580 +[epoch 23/50] step=600 train_loss=0.0196 tok_s=60182.9 opt_steps=600 +[epoch 23/50] step=620 train_loss=0.0196 tok_s=60170.2 opt_steps=620 +[epoch 23/50] step=640 train_loss=0.0196 tok_s=60161.1 opt_steps=640 +[epoch 23/50] step=660 train_loss=0.0197 tok_s=60164.5 opt_steps=660 +[epoch 23/50] step=680 train_loss=0.0198 tok_s=60193.6 opt_steps=680 +[epoch 23/50] step=700 train_loss=0.0198 tok_s=60180.5 opt_steps=700 +[epoch 23/50] step=720 train_loss=0.0198 tok_s=60167.2 opt_steps=720 +[epoch 23/50] step=740 train_loss=0.0198 tok_s=60159.0 opt_steps=740 +[epoch 23/50] step=760 train_loss=0.0199 tok_s=60171.6 opt_steps=760 +[epoch 23/50] step=780 train_loss=0.0199 tok_s=60173.4 opt_steps=780 +[epoch 23/50] step=800 train_loss=0.0199 tok_s=60173.9 opt_steps=800 +[epoch 23/50] step=820 train_loss=0.0200 tok_s=60179.5 opt_steps=820 +[epoch 23/50] step=840 train_loss=0.0200 tok_s=60182.5 opt_steps=840 +[epoch 23/50] step=860 train_loss=0.0200 tok_s=60187.2 opt_steps=860 +[epoch 23/50] step=880 train_loss=0.0201 tok_s=60188.6 opt_steps=880 +[epoch 23/50] step=900 train_loss=0.0201 tok_s=60190.9 opt_steps=900 +[epoch 23/50] step=920 train_loss=0.0201 tok_s=60190.6 opt_steps=920 +[epoch 23/50] step=940 train_loss=0.0202 tok_s=60198.9 opt_steps=940 +[epoch 23/50] step=960 train_loss=0.0202 tok_s=60198.4 opt_steps=960 +[epoch 23/50] step=980 train_loss=0.0202 tok_s=60191.8 opt_steps=980 +[epoch 23/50] step=1000 train_loss=0.0203 tok_s=60195.1 opt_steps=1000 +[epoch 23/50] step=1020 train_loss=0.0203 tok_s=60198.3 opt_steps=1020 +[epoch 23/50] step=1040 train_loss=0.0204 tok_s=60193.7 opt_steps=1040 +[epoch 23/50] step=1060 train_loss=0.0204 tok_s=60201.6 opt_steps=1060 +[epoch 23/50] step=1080 train_loss=0.0204 tok_s=60197.0 opt_steps=1080 +[epoch 23/50] step=1100 train_loss=0.0205 tok_s=60191.6 opt_steps=1100 +[epoch 23/50] step=1120 train_loss=0.0206 tok_s=60188.2 opt_steps=1120 +[epoch 23/50] step=1140 train_loss=0.0206 tok_s=60197.2 opt_steps=1140 +[epoch 23/50] step=1160 train_loss=0.0206 tok_s=60198.2 opt_steps=1160 +[epoch 23/50] step=1180 train_loss=0.0206 tok_s=60203.0 opt_steps=1180 +[epoch 23/50] step=1200 train_loss=0.0206 tok_s=60210.6 opt_steps=1200 +[epoch 23/50] step=1220 train_loss=0.0207 tok_s=60207.5 opt_steps=1220 +[epoch 23/50] step=1240 train_loss=0.0207 tok_s=60214.9 opt_steps=1240 +[epoch 23/50] step=1260 train_loss=0.0207 tok_s=60217.3 opt_steps=1260 +[epoch 23/50] step=1280 train_loss=0.0208 tok_s=60224.4 opt_steps=1280 +[epoch 23/50] step=1300 train_loss=0.0208 tok_s=60232.6 opt_steps=1300 +[epoch 23/50] step=1320 train_loss=0.0208 tok_s=60230.5 opt_steps=1320 +[epoch 23/50] step=1340 train_loss=0.0208 tok_s=60225.1 opt_steps=1340 +[epoch 23/50] step=1360 train_loss=0.0209 tok_s=60217.5 opt_steps=1360 +[epoch 23/50] step=1380 train_loss=0.0209 tok_s=60216.1 opt_steps=1380 +[epoch 23/50] step=1400 train_loss=0.0209 tok_s=60217.4 opt_steps=1400 +[epoch 23/50] step=1420 train_loss=0.0209 tok_s=60220.9 opt_steps=1420 +[epoch 23/50] step=1440 train_loss=0.0210 tok_s=60218.6 opt_steps=1440 +[epoch 23/50] step=1460 train_loss=0.0210 tok_s=60213.4 opt_steps=1460 +[epoch 23/50] step=1480 train_loss=0.0210 tok_s=60222.0 opt_steps=1480 +[epoch 23/50] step=1500 train_loss=0.0211 tok_s=60225.7 opt_steps=1500 +[epoch 23/50] step=1520 train_loss=0.0211 tok_s=60232.3 opt_steps=1520 +[epoch 23/50] step=1540 train_loss=0.0211 tok_s=60236.9 opt_steps=1540 +[epoch 23/50] step=1560 train_loss=0.0211 tok_s=60235.4 opt_steps=1560 +[epoch 23/50] step=1580 train_loss=0.0212 tok_s=60221.7 opt_steps=1580 +[epoch 23/50] step=1600 train_loss=0.0212 tok_s=60211.4 opt_steps=1600 +[epoch 23/50] step=1620 train_loss=0.0212 tok_s=60211.5 opt_steps=1620 +[epoch 23/50] step=1640 train_loss=0.0213 tok_s=60217.0 opt_steps=1640 +[epoch 23/50] step=1660 train_loss=0.0213 tok_s=60216.6 opt_steps=1660 +[epoch 23/50] step=1680 train_loss=0.0213 tok_s=60215.7 opt_steps=1680 +[epoch 23/50] step=1700 train_loss=0.0213 tok_s=60216.9 opt_steps=1700 +[epoch 23/50] step=1720 train_loss=0.0214 tok_s=60208.0 opt_steps=1720 +[epoch 23/50] step=1740 train_loss=0.0214 tok_s=60207.4 opt_steps=1740 +[epoch 23/50] step=1760 train_loss=0.0214 tok_s=60203.1 opt_steps=1760 +[epoch 23/50] step=1780 train_loss=0.0214 tok_s=60201.2 opt_steps=1780 +[epoch 23/50] step=1800 train_loss=0.0215 tok_s=60201.8 opt_steps=1800 +[epoch 23/50] step=1820 train_loss=0.0215 tok_s=60204.8 opt_steps=1820 +[epoch 23/50] step=1840 train_loss=0.0215 tok_s=60208.6 opt_steps=1840 +[epoch 23/50] step=1860 train_loss=0.0215 tok_s=60208.6 opt_steps=1860 +[epoch 23/50] step=1880 train_loss=0.0215 tok_s=60211.0 opt_steps=1880 +[epoch 23/50] step=1900 train_loss=0.0216 tok_s=60215.7 opt_steps=1900 +[epoch 23/50] step=1920 train_loss=0.0216 tok_s=60217.8 opt_steps=1920 +[epoch 23/50] step=1940 train_loss=0.0216 tok_s=60214.0 opt_steps=1940 +[epoch 23/50] step=1960 train_loss=0.0216 tok_s=60210.9 opt_steps=1960 +[epoch 23/50] step=1980 train_loss=0.0217 tok_s=60209.9 opt_steps=1980 +[epoch 23/50] step=2000 train_loss=0.0217 tok_s=60208.9 opt_steps=2000 +[epoch 23/50] step=2020 train_loss=0.0217 tok_s=60210.5 opt_steps=2020 +[epoch 23/50] step=2040 train_loss=0.0217 tok_s=60210.3 opt_steps=2040 +[epoch 23/50] step=2060 train_loss=0.0217 tok_s=60208.4 opt_steps=2060 +[epoch 23/50] step=2080 train_loss=0.0217 tok_s=60205.6 opt_steps=2080 +[epoch 23/50] step=2100 train_loss=0.0218 tok_s=60202.0 opt_steps=2100 +[epoch 23/50] step=2120 train_loss=0.0218 tok_s=60205.2 opt_steps=2120 +[epoch 23/50] step=2140 train_loss=0.0218 tok_s=60206.2 opt_steps=2140 +[epoch 23/50] step=2160 train_loss=0.0218 tok_s=60209.4 opt_steps=2160 +[epoch 23/50] step=2180 train_loss=0.0219 tok_s=60208.5 opt_steps=2180 +[epoch 23/50] step=2200 train_loss=0.0219 tok_s=60213.4 opt_steps=2200 +[epoch 23/50] step=2220 train_loss=0.0219 tok_s=60211.6 opt_steps=2220 +[epoch 23/50] step=2240 train_loss=0.0219 tok_s=60213.5 opt_steps=2240 +[epoch 23/50] step=2260 train_loss=0.0219 tok_s=60210.6 opt_steps=2260 +[epoch 23/50] step=2280 train_loss=0.0220 tok_s=60214.3 opt_steps=2280 +[epoch 23/50] step=2300 train_loss=0.0220 tok_s=60214.1 opt_steps=2300 +[epoch 23/50] step=2320 train_loss=0.0220 tok_s=60216.8 opt_steps=2320 +[epoch 23/50] step=2340 train_loss=0.0220 tok_s=60217.3 opt_steps=2340 +[epoch 23/50] step=2360 train_loss=0.0220 tok_s=60220.2 opt_steps=2360 +[epoch 23/50] step=2380 train_loss=0.0220 tok_s=60221.6 opt_steps=2380 +[epoch 23/50] step=2400 train_loss=0.0221 tok_s=60225.0 opt_steps=2400 +[epoch 23/50] step=2420 train_loss=0.0221 tok_s=60226.0 opt_steps=2420 +[epoch 23/50] step=2440 train_loss=0.0221 tok_s=60225.2 opt_steps=2440 +[epoch 23/50] step=2460 train_loss=0.0221 tok_s=60223.1 opt_steps=2460 +[epoch 23/50] step=2480 train_loss=0.0221 tok_s=60219.9 opt_steps=2480 +[epoch 23/50] step=2500 train_loss=0.0221 tok_s=60219.4 opt_steps=2500 +[epoch 23/50] step=2520 train_loss=0.0222 tok_s=60220.3 opt_steps=2520 +[epoch 23/50] step=2540 train_loss=0.0222 tok_s=60219.0 opt_steps=2540 +[epoch 23/50] step=2560 train_loss=0.0222 tok_s=60220.1 opt_steps=2560 +[epoch 23/50] step=2580 train_loss=0.0222 tok_s=60225.0 opt_steps=2580 +[epoch 23/50] step=2600 train_loss=0.0222 tok_s=60224.4 opt_steps=2600 +[epoch 23/50] step=2620 train_loss=0.0222 tok_s=60226.1 opt_steps=2620 +[epoch 23/50] step=2640 train_loss=0.0223 tok_s=60226.7 opt_steps=2640 +[epoch 23/50] step=2660 train_loss=0.0223 tok_s=60228.9 opt_steps=2660 +[epoch 23/50] step=2680 train_loss=0.0223 tok_s=60228.4 opt_steps=2680 +[epoch 23/50] step=2700 train_loss=0.0223 tok_s=60227.4 opt_steps=2700 +[epoch 23/50] step=2720 train_loss=0.0223 tok_s=60227.6 opt_steps=2720 +[epoch 23/50] step=2740 train_loss=0.0223 tok_s=60213.5 opt_steps=2740 +[epoch 23/50] step=2760 train_loss=0.0224 tok_s=60216.8 opt_steps=2760 +[epoch 23/50] step=2780 train_loss=0.0224 tok_s=60218.8 opt_steps=2780 +[epoch 23/50] step=2800 train_loss=0.0224 tok_s=60220.4 opt_steps=2800 +[epoch 23/50] step=2820 train_loss=0.0224 tok_s=60220.7 opt_steps=2820 +[epoch 23/50] step=2840 train_loss=0.0224 tok_s=60223.0 opt_steps=2840 +[epoch 23/50] step=2860 train_loss=0.0224 tok_s=60227.6 opt_steps=2860 +[epoch 23/50] step=2880 train_loss=0.0225 tok_s=60229.2 opt_steps=2880 +[epoch 23/50] step=2900 train_loss=0.0225 tok_s=60226.6 opt_steps=2900 +[epoch 23/50] step=2920 train_loss=0.0225 tok_s=60232.7 opt_steps=2920 +[epoch 23/50] step=2940 train_loss=0.0225 tok_s=60234.9 opt_steps=2940 +[epoch 23/50] step=2960 train_loss=0.0225 tok_s=60237.3 opt_steps=2960 +[epoch 23/50] step=2980 train_loss=0.0225 tok_s=60237.4 opt_steps=2980 +[epoch 23/50] step=3000 train_loss=0.0225 tok_s=60237.9 opt_steps=3000 +[epoch 23/50] step=3020 train_loss=0.0226 tok_s=60242.6 opt_steps=3020 +[epoch 23/50] step=3040 train_loss=0.0226 tok_s=60243.7 opt_steps=3040 +[epoch 23/50] step=3060 train_loss=0.0226 tok_s=60249.6 opt_steps=3060 +[epoch 23/50] step=3080 train_loss=0.0226 tok_s=60247.2 opt_steps=3080 +[epoch 23/50] step=3100 train_loss=0.0226 tok_s=60245.5 opt_steps=3100 +[epoch 23/50] step=3120 train_loss=0.0226 tok_s=60248.8 opt_steps=3120 +[epoch 23/50] step=3140 train_loss=0.0227 tok_s=60248.4 opt_steps=3140 +[epoch 23/50] step=3160 train_loss=0.0227 tok_s=60251.8 opt_steps=3160 +[epoch 23/50] step=3180 train_loss=0.0227 tok_s=60254.8 opt_steps=3180 +[epoch 23/50] step=3200 train_loss=0.0227 tok_s=60254.7 opt_steps=3200 +[epoch 23/50] step=3220 train_loss=0.0227 tok_s=60255.2 opt_steps=3220 +[epoch 23/50] step=3240 train_loss=0.0227 tok_s=60254.2 opt_steps=3240 +[epoch 23/50] step=3260 train_loss=0.0228 tok_s=60251.6 opt_steps=3260 +[epoch 23/50] train_loss=0.0228 val_skipped tok_s=60253.7 opt_steps=3273 +[epoch 24/50] step=20 train_loss=0.0177 tok_s=58352.9 opt_steps=20 +[epoch 24/50] step=40 train_loss=0.0178 tok_s=59340.9 opt_steps=40 +[epoch 24/50] step=60 train_loss=0.0177 tok_s=59592.9 opt_steps=60 +[epoch 24/50] step=80 train_loss=0.0177 tok_s=59833.8 opt_steps=80 +[epoch 24/50] step=100 train_loss=0.0178 tok_s=59861.4 opt_steps=100 +[epoch 24/50] step=120 train_loss=0.0177 tok_s=59931.6 opt_steps=120 +[epoch 24/50] step=140 train_loss=0.0178 tok_s=60019.2 opt_steps=140 +[epoch 24/50] step=160 train_loss=0.0177 tok_s=60085.7 opt_steps=160 +[epoch 24/50] step=180 train_loss=0.0176 tok_s=60124.8 opt_steps=180 +[epoch 24/50] step=200 train_loss=0.0177 tok_s=60171.3 opt_steps=200 +[epoch 24/50] step=220 train_loss=0.0177 tok_s=60226.3 opt_steps=220 +[epoch 24/50] step=240 train_loss=0.0176 tok_s=60234.3 opt_steps=240 +[epoch 24/50] step=260 train_loss=0.0177 tok_s=60254.0 opt_steps=260 +[epoch 24/50] step=280 train_loss=0.0177 tok_s=60302.5 opt_steps=280 +[epoch 24/50] step=300 train_loss=0.0178 tok_s=60345.1 opt_steps=300 +[epoch 24/50] step=320 train_loss=0.0178 tok_s=60349.4 opt_steps=320 +[epoch 24/50] step=340 train_loss=0.0178 tok_s=60329.3 opt_steps=340 +[epoch 24/50] step=360 train_loss=0.0179 tok_s=60327.4 opt_steps=360 +[epoch 24/50] step=380 train_loss=0.0179 tok_s=60329.8 opt_steps=380 +[epoch 24/50] step=400 train_loss=0.0179 tok_s=60321.1 opt_steps=400 +[epoch 24/50] step=420 train_loss=0.0180 tok_s=60345.0 opt_steps=420 +[epoch 24/50] step=440 train_loss=0.0180 tok_s=60333.6 opt_steps=440 +[epoch 24/50] step=460 train_loss=0.0180 tok_s=60337.7 opt_steps=460 +[epoch 24/50] step=480 train_loss=0.0180 tok_s=60357.8 opt_steps=480 +[epoch 24/50] step=500 train_loss=0.0181 tok_s=60377.9 opt_steps=500 +[epoch 24/50] step=520 train_loss=0.0181 tok_s=60383.0 opt_steps=520 +[epoch 24/50] step=540 train_loss=0.0182 tok_s=60383.8 opt_steps=540 +[epoch 24/50] step=560 train_loss=0.0182 tok_s=60389.1 opt_steps=560 +[epoch 24/50] step=580 train_loss=0.0183 tok_s=60409.0 opt_steps=580 +[epoch 24/50] step=600 train_loss=0.0183 tok_s=60412.9 opt_steps=600 +[epoch 24/50] step=620 train_loss=0.0184 tok_s=60420.7 opt_steps=620 +[epoch 24/50] step=640 train_loss=0.0185 tok_s=60430.2 opt_steps=640 +[epoch 24/50] step=660 train_loss=0.0185 tok_s=60422.5 opt_steps=660 +[epoch 24/50] step=680 train_loss=0.0185 tok_s=60428.9 opt_steps=680 +[epoch 24/50] step=700 train_loss=0.0185 tok_s=60425.3 opt_steps=700 +[epoch 24/50] step=720 train_loss=0.0186 tok_s=60430.9 opt_steps=720 +[epoch 24/50] step=740 train_loss=0.0186 tok_s=60435.1 opt_steps=740 +[epoch 24/50] step=760 train_loss=0.0186 tok_s=60444.2 opt_steps=760 +[epoch 24/50] step=780 train_loss=0.0187 tok_s=60449.0 opt_steps=780 +[epoch 24/50] step=800 train_loss=0.0188 tok_s=60450.1 opt_steps=800 +[epoch 24/50] step=820 train_loss=0.0188 tok_s=60457.3 opt_steps=820 +[epoch 24/50] step=840 train_loss=0.0188 tok_s=60462.2 opt_steps=840 +[epoch 24/50] step=860 train_loss=0.0189 tok_s=60465.2 opt_steps=860 +[epoch 24/50] step=880 train_loss=0.0189 tok_s=60475.0 opt_steps=880 +[epoch 24/50] step=900 train_loss=0.0190 tok_s=60478.6 opt_steps=900 +[epoch 24/50] step=920 train_loss=0.0190 tok_s=60466.9 opt_steps=920 +[epoch 24/50] step=940 train_loss=0.0190 tok_s=60459.4 opt_steps=940 +[epoch 24/50] step=960 train_loss=0.0191 tok_s=60458.7 opt_steps=960 +[epoch 24/50] step=980 train_loss=0.0191 tok_s=60467.6 opt_steps=980 +[epoch 24/50] step=1000 train_loss=0.0192 tok_s=60457.4 opt_steps=1000 +[epoch 24/50] step=1020 train_loss=0.0192 tok_s=60463.1 opt_steps=1020 +[epoch 24/50] step=1040 train_loss=0.0192 tok_s=60460.9 opt_steps=1040 +[epoch 24/50] step=1060 train_loss=0.0193 tok_s=60467.9 opt_steps=1060 +[epoch 24/50] step=1080 train_loss=0.0193 tok_s=60475.0 opt_steps=1080 +[epoch 24/50] step=1100 train_loss=0.0193 tok_s=60479.1 opt_steps=1100 +[epoch 24/50] step=1120 train_loss=0.0194 tok_s=60481.6 opt_steps=1120 +[epoch 24/50] step=1140 train_loss=0.0194 tok_s=60473.9 opt_steps=1140 +[epoch 24/50] step=1160 train_loss=0.0195 tok_s=60476.1 opt_steps=1160 +[epoch 24/50] step=1180 train_loss=0.0195 tok_s=60480.8 opt_steps=1180 +[epoch 24/50] step=1200 train_loss=0.0195 tok_s=60474.0 opt_steps=1200 +[epoch 24/50] step=1220 train_loss=0.0196 tok_s=60464.8 opt_steps=1220 +[epoch 24/50] step=1240 train_loss=0.0196 tok_s=60466.3 opt_steps=1240 +[epoch 24/50] step=1260 train_loss=0.0197 tok_s=60473.6 opt_steps=1260 +[epoch 24/50] step=1280 train_loss=0.0197 tok_s=60463.8 opt_steps=1280 +[epoch 24/50] step=1300 train_loss=0.0197 tok_s=60464.3 opt_steps=1300 +[epoch 24/50] step=1320 train_loss=0.0197 tok_s=60460.9 opt_steps=1320 +[epoch 24/50] step=1340 train_loss=0.0198 tok_s=60460.8 opt_steps=1340 +[epoch 24/50] step=1360 train_loss=0.0198 tok_s=60464.6 opt_steps=1360 +[epoch 24/50] step=1380 train_loss=0.0198 tok_s=60467.7 opt_steps=1380 +[epoch 24/50] step=1400 train_loss=0.0198 tok_s=60471.4 opt_steps=1400 +[epoch 24/50] step=1420 train_loss=0.0199 tok_s=60465.7 opt_steps=1420 +[epoch 24/50] step=1440 train_loss=0.0199 tok_s=60461.4 opt_steps=1440 +[epoch 24/50] step=1460 train_loss=0.0199 tok_s=60459.5 opt_steps=1460 +[epoch 24/50] step=1480 train_loss=0.0199 tok_s=60460.5 opt_steps=1480 +[epoch 24/50] step=1500 train_loss=0.0199 tok_s=60460.0 opt_steps=1500 +[epoch 24/50] step=1520 train_loss=0.0200 tok_s=60463.6 opt_steps=1520 +[epoch 24/50] step=1540 train_loss=0.0200 tok_s=60466.1 opt_steps=1540 +[epoch 24/50] step=1560 train_loss=0.0200 tok_s=60468.7 opt_steps=1560 +[epoch 24/50] step=1580 train_loss=0.0200 tok_s=60464.4 opt_steps=1580 +[epoch 24/50] step=1600 train_loss=0.0201 tok_s=60466.9 opt_steps=1600 +[epoch 24/50] step=1620 train_loss=0.0201 tok_s=60468.3 opt_steps=1620 +[epoch 24/50] step=1640 train_loss=0.0201 tok_s=60465.8 opt_steps=1640 +[epoch 24/50] step=1660 train_loss=0.0202 tok_s=60468.4 opt_steps=1660 +[epoch 24/50] step=1680 train_loss=0.0202 tok_s=60469.4 opt_steps=1680 +[epoch 24/50] step=1700 train_loss=0.0202 tok_s=60465.5 opt_steps=1700 +[epoch 24/50] step=1720 train_loss=0.0202 tok_s=60467.2 opt_steps=1720 +[epoch 24/50] step=1740 train_loss=0.0202 tok_s=60470.4 opt_steps=1740 +[epoch 24/50] step=1760 train_loss=0.0202 tok_s=60474.5 opt_steps=1760 +[epoch 24/50] step=1780 train_loss=0.0203 tok_s=60470.2 opt_steps=1780 +[epoch 24/50] step=1800 train_loss=0.0203 tok_s=60468.1 opt_steps=1800 +[epoch 24/50] step=1820 train_loss=0.0203 tok_s=60468.8 opt_steps=1820 +[epoch 24/50] step=1840 train_loss=0.0203 tok_s=60468.1 opt_steps=1840 +[epoch 24/50] step=1860 train_loss=0.0203 tok_s=60462.4 opt_steps=1860 +[epoch 24/50] step=1880 train_loss=0.0204 tok_s=60445.7 opt_steps=1880 +[epoch 24/50] step=1900 train_loss=0.0204 tok_s=60431.1 opt_steps=1900 +[epoch 24/50] step=1920 train_loss=0.0204 tok_s=60428.4 opt_steps=1920 +[epoch 24/50] step=1940 train_loss=0.0205 tok_s=60426.9 opt_steps=1940 +[epoch 24/50] step=1960 train_loss=0.0205 tok_s=60424.9 opt_steps=1960 +[epoch 24/50] step=1980 train_loss=0.0205 tok_s=60417.2 opt_steps=1980 +[epoch 24/50] step=2000 train_loss=0.0205 tok_s=60413.7 opt_steps=2000 +[epoch 24/50] step=2020 train_loss=0.0206 tok_s=60414.0 opt_steps=2020 +[epoch 24/50] step=2040 train_loss=0.0206 tok_s=60408.3 opt_steps=2040 +[epoch 24/50] step=2060 train_loss=0.0206 tok_s=60409.2 opt_steps=2060 +[epoch 24/50] step=2080 train_loss=0.0206 tok_s=60406.1 opt_steps=2080 +[epoch 24/50] step=2100 train_loss=0.0207 tok_s=60399.8 opt_steps=2100 +[epoch 24/50] step=2120 train_loss=0.0207 tok_s=60398.8 opt_steps=2120 +[epoch 24/50] step=2140 train_loss=0.0207 tok_s=60400.0 opt_steps=2140 +[epoch 24/50] step=2160 train_loss=0.0207 tok_s=60397.9 opt_steps=2160 +[epoch 24/50] step=2180 train_loss=0.0207 tok_s=60395.3 opt_steps=2180 +[epoch 24/50] step=2200 train_loss=0.0207 tok_s=60391.1 opt_steps=2200 +[epoch 24/50] step=2220 train_loss=0.0208 tok_s=60383.1 opt_steps=2220 +[epoch 24/50] step=2240 train_loss=0.0208 tok_s=60374.8 opt_steps=2240 +[epoch 24/50] step=2260 train_loss=0.0208 tok_s=60373.9 opt_steps=2260 +[epoch 24/50] step=2280 train_loss=0.0208 tok_s=60373.5 opt_steps=2280 +[epoch 24/50] step=2300 train_loss=0.0209 tok_s=60372.2 opt_steps=2300 +[epoch 24/50] step=2320 train_loss=0.0209 tok_s=60368.6 opt_steps=2320 +[epoch 24/50] step=2340 train_loss=0.0209 tok_s=60366.3 opt_steps=2340 +[epoch 24/50] step=2360 train_loss=0.0209 tok_s=60368.6 opt_steps=2360 +[epoch 24/50] step=2380 train_loss=0.0209 tok_s=60367.5 opt_steps=2380 +[epoch 24/50] step=2400 train_loss=0.0210 tok_s=60365.3 opt_steps=2400 +[epoch 24/50] step=2420 train_loss=0.0210 tok_s=60363.7 opt_steps=2420 +[epoch 24/50] step=2440 train_loss=0.0210 tok_s=60359.6 opt_steps=2440 +[epoch 24/50] step=2460 train_loss=0.0210 tok_s=60359.2 opt_steps=2460 +[epoch 24/50] step=2480 train_loss=0.0210 tok_s=60354.7 opt_steps=2480 +[epoch 24/50] step=2500 train_loss=0.0211 tok_s=60352.1 opt_steps=2500 +[epoch 24/50] step=2520 train_loss=0.0211 tok_s=60352.4 opt_steps=2520 +[epoch 24/50] step=2540 train_loss=0.0211 tok_s=60349.6 opt_steps=2540 +[epoch 24/50] step=2560 train_loss=0.0211 tok_s=60344.6 opt_steps=2560 +[epoch 24/50] step=2580 train_loss=0.0211 tok_s=60337.9 opt_steps=2580 +[epoch 24/50] step=2600 train_loss=0.0212 tok_s=60335.6 opt_steps=2600 +[epoch 24/50] step=2620 train_loss=0.0212 tok_s=60332.0 opt_steps=2620 +[epoch 24/50] step=2640 train_loss=0.0212 tok_s=60330.6 opt_steps=2640 +[epoch 24/50] step=2660 train_loss=0.0212 tok_s=60326.1 opt_steps=2660 +[epoch 24/50] step=2680 train_loss=0.0212 tok_s=60322.0 opt_steps=2680 +[epoch 24/50] step=2700 train_loss=0.0212 tok_s=60318.3 opt_steps=2700 +[epoch 24/50] step=2720 train_loss=0.0213 tok_s=60316.6 opt_steps=2720 +[epoch 24/50] step=2740 train_loss=0.0213 tok_s=60311.1 opt_steps=2740 +[epoch 24/50] step=2760 train_loss=0.0213 tok_s=60307.1 opt_steps=2760 +[epoch 24/50] step=2780 train_loss=0.0213 tok_s=60301.2 opt_steps=2780 +[epoch 24/50] step=2800 train_loss=0.0213 tok_s=60299.1 opt_steps=2800 +[epoch 24/50] step=2820 train_loss=0.0213 tok_s=60294.8 opt_steps=2820 +[epoch 24/50] step=2840 train_loss=0.0214 tok_s=60294.9 opt_steps=2840 +[epoch 24/50] step=2860 train_loss=0.0214 tok_s=60292.2 opt_steps=2860 +[epoch 24/50] step=2880 train_loss=0.0214 tok_s=60287.5 opt_steps=2880 +[epoch 24/50] step=2900 train_loss=0.0214 tok_s=60287.3 opt_steps=2900 +[epoch 24/50] step=2920 train_loss=0.0214 tok_s=60285.0 opt_steps=2920 +[epoch 24/50] step=2940 train_loss=0.0214 tok_s=60284.1 opt_steps=2940 +[epoch 24/50] step=2960 train_loss=0.0214 tok_s=60280.1 opt_steps=2960 +[epoch 24/50] step=2980 train_loss=0.0215 tok_s=60281.5 opt_steps=2980 +[epoch 24/50] step=3000 train_loss=0.0215 tok_s=60279.6 opt_steps=3000 +[epoch 24/50] step=3020 train_loss=0.0215 tok_s=60279.7 opt_steps=3020 +[epoch 24/50] step=3040 train_loss=0.0215 tok_s=60278.9 opt_steps=3040 +[epoch 24/50] step=3060 train_loss=0.0215 tok_s=60277.1 opt_steps=3060 +[epoch 24/50] step=3080 train_loss=0.0215 tok_s=60276.2 opt_steps=3080 +[epoch 24/50] step=3100 train_loss=0.0215 tok_s=60277.3 opt_steps=3100 +[epoch 24/50] step=3120 train_loss=0.0216 tok_s=60274.9 opt_steps=3120 +[epoch 24/50] step=3140 train_loss=0.0216 tok_s=60274.6 opt_steps=3140 +[epoch 24/50] step=3160 train_loss=0.0216 tok_s=60273.7 opt_steps=3160 +[epoch 24/50] step=3180 train_loss=0.0216 tok_s=60273.5 opt_steps=3180 +[epoch 24/50] step=3200 train_loss=0.0216 tok_s=60272.0 opt_steps=3200 +[epoch 24/50] step=3220 train_loss=0.0216 tok_s=60272.5 opt_steps=3220 +[epoch 24/50] step=3240 train_loss=0.0217 tok_s=60272.7 opt_steps=3240 +[epoch 24/50] step=3260 train_loss=0.0217 tok_s=60268.7 opt_steps=3260 +[epoch 24/50] train_loss=0.0217 val_skipped tok_s=60270.2 opt_steps=3273 +[epoch 25/50] step=20 train_loss=0.0179 tok_s=57965.7 opt_steps=20 +[epoch 25/50] step=40 train_loss=0.0175 tok_s=59065.2 opt_steps=40 +[epoch 25/50] step=60 train_loss=0.0172 tok_s=59285.1 opt_steps=60 +[epoch 25/50] step=80 train_loss=0.0171 tok_s=59526.9 opt_steps=80 +[epoch 25/50] step=100 train_loss=0.0170 tok_s=59605.7 opt_steps=100 +[epoch 25/50] step=120 train_loss=0.0169 tok_s=59693.0 opt_steps=120 +[epoch 25/50] step=140 train_loss=0.0169 tok_s=59886.3 opt_steps=140 +[epoch 25/50] step=160 train_loss=0.0169 tok_s=59970.4 opt_steps=160 +[epoch 25/50] step=180 train_loss=0.0169 tok_s=59928.3 opt_steps=180 +[epoch 25/50] step=200 train_loss=0.0169 tok_s=59855.5 opt_steps=200 +[epoch 25/50] step=220 train_loss=0.0169 tok_s=59869.7 opt_steps=220 +[epoch 25/50] step=240 train_loss=0.0171 tok_s=59890.6 opt_steps=240 +[epoch 25/50] step=260 train_loss=0.0170 tok_s=59912.9 opt_steps=260 +[epoch 25/50] step=280 train_loss=0.0170 tok_s=59919.9 opt_steps=280 +[epoch 25/50] step=300 train_loss=0.0171 tok_s=59935.6 opt_steps=300 +[epoch 25/50] step=320 train_loss=0.0171 tok_s=59929.8 opt_steps=320 +[epoch 25/50] step=340 train_loss=0.0171 tok_s=59919.5 opt_steps=340 +[epoch 25/50] step=360 train_loss=0.0171 tok_s=59904.3 opt_steps=360 +[epoch 25/50] step=380 train_loss=0.0172 tok_s=59934.0 opt_steps=380 +[epoch 25/50] step=400 train_loss=0.0172 tok_s=59946.4 opt_steps=400 +[epoch 25/50] step=420 train_loss=0.0173 tok_s=59964.0 opt_steps=420 +[epoch 25/50] step=440 train_loss=0.0174 tok_s=59961.6 opt_steps=440 +[epoch 25/50] step=460 train_loss=0.0174 tok_s=59942.1 opt_steps=460 +[epoch 25/50] step=480 train_loss=0.0174 tok_s=59937.5 opt_steps=480 +[epoch 25/50] step=500 train_loss=0.0175 tok_s=59938.0 opt_steps=500 +[epoch 25/50] step=520 train_loss=0.0175 tok_s=59935.0 opt_steps=520 +[epoch 25/50] step=540 train_loss=0.0176 tok_s=59964.0 opt_steps=540 +[epoch 25/50] step=560 train_loss=0.0176 tok_s=59974.0 opt_steps=560 +[epoch 25/50] step=580 train_loss=0.0177 tok_s=59993.8 opt_steps=580 +[epoch 25/50] step=600 train_loss=0.0177 tok_s=59996.7 opt_steps=600 +[epoch 25/50] step=620 train_loss=0.0178 tok_s=59981.9 opt_steps=620 +[epoch 25/50] step=640 train_loss=0.0178 tok_s=59993.5 opt_steps=640 +[epoch 25/50] step=660 train_loss=0.0178 tok_s=59992.8 opt_steps=660 +[epoch 25/50] step=680 train_loss=0.0178 tok_s=59992.2 opt_steps=680 +[epoch 25/50] step=700 train_loss=0.0179 tok_s=59997.2 opt_steps=700 +[epoch 25/50] step=720 train_loss=0.0179 tok_s=60013.0 opt_steps=720 +[epoch 25/50] step=740 train_loss=0.0180 tok_s=60003.1 opt_steps=740 +[epoch 25/50] step=760 train_loss=0.0180 tok_s=60006.2 opt_steps=760 +[epoch 25/50] step=780 train_loss=0.0180 tok_s=60013.7 opt_steps=780 +[epoch 25/50] step=800 train_loss=0.0181 tok_s=60021.1 opt_steps=800 +[epoch 25/50] step=820 train_loss=0.0181 tok_s=60029.4 opt_steps=820 +[epoch 25/50] step=840 train_loss=0.0181 tok_s=60048.8 opt_steps=840 +[epoch 25/50] step=860 train_loss=0.0182 tok_s=60044.6 opt_steps=860 +[epoch 25/50] step=880 train_loss=0.0182 tok_s=60040.3 opt_steps=880 +[epoch 25/50] step=900 train_loss=0.0183 tok_s=60039.4 opt_steps=900 +[epoch 25/50] step=920 train_loss=0.0183 tok_s=60044.9 opt_steps=920 +[epoch 25/50] step=940 train_loss=0.0183 tok_s=60054.7 opt_steps=940 +[epoch 25/50] step=960 train_loss=0.0184 tok_s=60043.9 opt_steps=960 +[epoch 25/50] step=980 train_loss=0.0184 tok_s=60050.1 opt_steps=980 +[epoch 25/50] step=1000 train_loss=0.0185 tok_s=60051.1 opt_steps=1000 +[epoch 25/50] step=1020 train_loss=0.0185 tok_s=60048.2 opt_steps=1020 +[epoch 25/50] step=1040 train_loss=0.0185 tok_s=60043.3 opt_steps=1040 +[epoch 25/50] step=1060 train_loss=0.0185 tok_s=60058.1 opt_steps=1060 +[epoch 25/50] step=1080 train_loss=0.0186 tok_s=60052.8 opt_steps=1080 +[epoch 25/50] step=1100 train_loss=0.0186 tok_s=60033.0 opt_steps=1100 +[epoch 25/50] step=1120 train_loss=0.0186 tok_s=59995.3 opt_steps=1120 +[epoch 25/50] step=1140 train_loss=0.0186 tok_s=59956.0 opt_steps=1140 +[epoch 25/50] step=1160 train_loss=0.0186 tok_s=59950.7 opt_steps=1160 +[epoch 25/50] step=1180 train_loss=0.0187 tok_s=59926.9 opt_steps=1180 +[epoch 25/50] step=1200 train_loss=0.0187 tok_s=59906.1 opt_steps=1200 +[epoch 25/50] step=1220 train_loss=0.0187 tok_s=59887.6 opt_steps=1220 +[epoch 25/50] step=1240 train_loss=0.0187 tok_s=59869.0 opt_steps=1240 +[epoch 25/50] step=1260 train_loss=0.0188 tok_s=59851.8 opt_steps=1260 +[epoch 25/50] step=1280 train_loss=0.0188 tok_s=59835.3 opt_steps=1280 +[epoch 25/50] step=1300 train_loss=0.0188 tok_s=59825.5 opt_steps=1300 +[epoch 25/50] step=1320 train_loss=0.0188 tok_s=59820.2 opt_steps=1320 +[epoch 25/50] step=1340 train_loss=0.0188 tok_s=59807.5 opt_steps=1340 +[epoch 25/50] step=1360 train_loss=0.0189 tok_s=59802.4 opt_steps=1360 +[epoch 25/50] step=1380 train_loss=0.0189 tok_s=59800.1 opt_steps=1380 +[epoch 25/50] step=1400 train_loss=0.0189 tok_s=59801.5 opt_steps=1400 +[epoch 25/50] step=1420 train_loss=0.0189 tok_s=59797.5 opt_steps=1420 +[epoch 25/50] step=1440 train_loss=0.0190 tok_s=59801.9 opt_steps=1440 +[epoch 25/50] step=1460 train_loss=0.0190 tok_s=59810.5 opt_steps=1460 +[epoch 25/50] step=1480 train_loss=0.0190 tok_s=59805.0 opt_steps=1480 +[epoch 25/50] step=1500 train_loss=0.0191 tok_s=59803.8 opt_steps=1500 +[epoch 25/50] step=1520 train_loss=0.0191 tok_s=59807.3 opt_steps=1520 +[epoch 25/50] step=1540 train_loss=0.0191 tok_s=59813.4 opt_steps=1540 +[epoch 25/50] step=1560 train_loss=0.0191 tok_s=59811.5 opt_steps=1560 +[epoch 25/50] step=1580 train_loss=0.0192 tok_s=59815.4 opt_steps=1580 +[epoch 25/50] step=1600 train_loss=0.0192 tok_s=59819.5 opt_steps=1600 +[epoch 25/50] step=1620 train_loss=0.0192 tok_s=59828.0 opt_steps=1620 +[epoch 25/50] step=1640 train_loss=0.0192 tok_s=59833.3 opt_steps=1640 +[epoch 25/50] step=1660 train_loss=0.0193 tok_s=59838.5 opt_steps=1660 +[epoch 25/50] step=1680 train_loss=0.0193 tok_s=59837.9 opt_steps=1680 +[epoch 25/50] step=1700 train_loss=0.0193 tok_s=59836.7 opt_steps=1700 +[epoch 25/50] step=1720 train_loss=0.0193 tok_s=59838.3 opt_steps=1720 +[epoch 25/50] step=1740 train_loss=0.0194 tok_s=59838.3 opt_steps=1740 +[epoch 25/50] step=1760 train_loss=0.0194 tok_s=59844.0 opt_steps=1760 +[epoch 25/50] step=1780 train_loss=0.0194 tok_s=59847.4 opt_steps=1780 +[epoch 25/50] step=1800 train_loss=0.0194 tok_s=59852.8 opt_steps=1800 +[epoch 25/50] step=1820 train_loss=0.0194 tok_s=59858.3 opt_steps=1820 +[epoch 25/50] step=1840 train_loss=0.0195 tok_s=59857.9 opt_steps=1840 +[epoch 25/50] step=1860 train_loss=0.0195 tok_s=59856.2 opt_steps=1860 +[epoch 25/50] step=1880 train_loss=0.0195 tok_s=59859.8 opt_steps=1880 +[epoch 25/50] step=1900 train_loss=0.0195 tok_s=59859.1 opt_steps=1900 +[epoch 25/50] step=1920 train_loss=0.0195 tok_s=59863.3 opt_steps=1920 +[epoch 25/50] step=1940 train_loss=0.0196 tok_s=59864.5 opt_steps=1940 +[epoch 25/50] step=1960 train_loss=0.0196 tok_s=59864.1 opt_steps=1960 +[epoch 25/50] step=1980 train_loss=0.0196 tok_s=59860.3 opt_steps=1980 +[epoch 25/50] step=2000 train_loss=0.0196 tok_s=59858.8 opt_steps=2000 +[epoch 25/50] step=2020 train_loss=0.0196 tok_s=59852.9 opt_steps=2020 +[epoch 25/50] step=2040 train_loss=0.0197 tok_s=59848.5 opt_steps=2040 +[epoch 25/50] step=2060 train_loss=0.0197 tok_s=59855.1 opt_steps=2060 +[epoch 25/50] step=2080 train_loss=0.0197 tok_s=59849.2 opt_steps=2080 +[epoch 25/50] step=2100 train_loss=0.0197 tok_s=59851.1 opt_steps=2100 +[epoch 25/50] step=2120 train_loss=0.0197 tok_s=59854.3 opt_steps=2120 +[epoch 25/50] step=2140 train_loss=0.0198 tok_s=59864.3 opt_steps=2140 +[epoch 25/50] step=2160 train_loss=0.0198 tok_s=59867.1 opt_steps=2160 +[epoch 25/50] step=2180 train_loss=0.0198 tok_s=59864.8 opt_steps=2180 +[epoch 25/50] step=2200 train_loss=0.0198 tok_s=59867.7 opt_steps=2200 +[epoch 25/50] step=2220 train_loss=0.0199 tok_s=59865.2 opt_steps=2220 +[epoch 25/50] step=2240 train_loss=0.0199 tok_s=59866.7 opt_steps=2240 +[epoch 25/50] step=2260 train_loss=0.0199 tok_s=59866.3 opt_steps=2260 +[epoch 25/50] step=2280 train_loss=0.0199 tok_s=59868.2 opt_steps=2280 +[epoch 25/50] step=2300 train_loss=0.0199 tok_s=59868.4 opt_steps=2300 +[epoch 25/50] step=2320 train_loss=0.0200 tok_s=59870.6 opt_steps=2320 +[epoch 25/50] step=2340 train_loss=0.0200 tok_s=59872.9 opt_steps=2340 +[epoch 25/50] step=2360 train_loss=0.0200 tok_s=59879.5 opt_steps=2360 +[epoch 25/50] step=2380 train_loss=0.0200 tok_s=59883.1 opt_steps=2380 +[epoch 25/50] step=2400 train_loss=0.0200 tok_s=59882.1 opt_steps=2400 +[epoch 25/50] step=2420 train_loss=0.0200 tok_s=59881.8 opt_steps=2420 +[epoch 25/50] step=2440 train_loss=0.0201 tok_s=59882.1 opt_steps=2440 +[epoch 25/50] step=2460 train_loss=0.0201 tok_s=59884.0 opt_steps=2460 +[epoch 25/50] step=2480 train_loss=0.0201 tok_s=59885.1 opt_steps=2480 +[epoch 25/50] step=2500 train_loss=0.0201 tok_s=59885.4 opt_steps=2500 +[epoch 25/50] step=2520 train_loss=0.0201 tok_s=59884.2 opt_steps=2520 +[epoch 25/50] step=2540 train_loss=0.0202 tok_s=59884.9 opt_steps=2540 +[epoch 25/50] step=2560 train_loss=0.0202 tok_s=59885.2 opt_steps=2560 +[epoch 25/50] step=2580 train_loss=0.0202 tok_s=59889.2 opt_steps=2580 +[epoch 25/50] step=2600 train_loss=0.0202 tok_s=59893.0 opt_steps=2600 +[epoch 25/50] step=2620 train_loss=0.0202 tok_s=59890.9 opt_steps=2620 +[epoch 25/50] step=2640 train_loss=0.0202 tok_s=59892.4 opt_steps=2640 +[epoch 25/50] step=2660 train_loss=0.0202 tok_s=59890.6 opt_steps=2660 +[epoch 25/50] step=2680 train_loss=0.0203 tok_s=59892.2 opt_steps=2680 +[epoch 25/50] step=2700 train_loss=0.0203 tok_s=59891.3 opt_steps=2700 +[epoch 25/50] step=2720 train_loss=0.0203 tok_s=59890.5 opt_steps=2720 +[epoch 25/50] step=2740 train_loss=0.0203 tok_s=59889.5 opt_steps=2740 +[epoch 25/50] step=2760 train_loss=0.0203 tok_s=59887.3 opt_steps=2760 +[epoch 25/50] step=2780 train_loss=0.0203 tok_s=59886.0 opt_steps=2780 +[epoch 25/50] step=2800 train_loss=0.0203 tok_s=59883.7 opt_steps=2800 +[epoch 25/50] step=2820 train_loss=0.0203 tok_s=59880.5 opt_steps=2820 +[epoch 25/50] step=2840 train_loss=0.0204 tok_s=59880.4 opt_steps=2840 +[epoch 25/50] step=2860 train_loss=0.0204 tok_s=59881.9 opt_steps=2860 +[epoch 25/50] step=2880 train_loss=0.0204 tok_s=59880.4 opt_steps=2880 +[epoch 25/50] step=2900 train_loss=0.0204 tok_s=59882.6 opt_steps=2900 +[epoch 25/50] step=2920 train_loss=0.0204 tok_s=59883.1 opt_steps=2920 +[epoch 25/50] step=2940 train_loss=0.0204 tok_s=59884.6 opt_steps=2940 +[epoch 25/50] step=2960 train_loss=0.0205 tok_s=59882.1 opt_steps=2960 +[epoch 25/50] step=2980 train_loss=0.0205 tok_s=59882.6 opt_steps=2980 +[epoch 25/50] step=3000 train_loss=0.0205 tok_s=59884.4 opt_steps=3000 +[epoch 25/50] step=3020 train_loss=0.0205 tok_s=59881.6 opt_steps=3020 +[epoch 25/50] step=3040 train_loss=0.0205 tok_s=59884.0 opt_steps=3040 +[epoch 25/50] step=3060 train_loss=0.0205 tok_s=59884.9 opt_steps=3060 +[epoch 25/50] step=3080 train_loss=0.0205 tok_s=59887.8 opt_steps=3080 +[epoch 25/50] step=3100 train_loss=0.0205 tok_s=59887.9 opt_steps=3100 +[epoch 25/50] step=3120 train_loss=0.0206 tok_s=59890.8 opt_steps=3120 +[epoch 25/50] step=3140 train_loss=0.0206 tok_s=59892.9 opt_steps=3140 +[epoch 25/50] step=3160 train_loss=0.0206 tok_s=59893.6 opt_steps=3160 +[epoch 25/50] step=3180 train_loss=0.0206 tok_s=59893.7 opt_steps=3180 +[epoch 25/50] step=3200 train_loss=0.0206 tok_s=59894.6 opt_steps=3200 +[epoch 25/50] step=3220 train_loss=0.0206 tok_s=59893.5 opt_steps=3220 +[epoch 25/50] step=3240 train_loss=0.0206 tok_s=59892.5 opt_steps=3240 +[epoch 25/50] step=3260 train_loss=0.0206 tok_s=59885.2 opt_steps=3260 +[epoch 25/50] train_loss=0.0207 val_loss=0.0924 tok_s=59567.2 opt_steps=3273 +Saved checkpoint: checkpoints_h100_100m_sparse_full/epoch_25.pt +[epoch 26/50] step=20 train_loss=0.0160 tok_s=51800.8 opt_steps=20 +[epoch 26/50] step=40 train_loss=0.0165 tok_s=55690.1 opt_steps=40 +[epoch 26/50] step=60 train_loss=0.0166 tok_s=56791.9 opt_steps=60 +[epoch 26/50] step=80 train_loss=0.0167 tok_s=57541.9 opt_steps=80 +[epoch 26/50] step=100 train_loss=0.0165 tok_s=58042.6 opt_steps=100 +[epoch 26/50] step=120 train_loss=0.0166 tok_s=58362.2 opt_steps=120 +[epoch 26/50] step=140 train_loss=0.0166 tok_s=58645.1 opt_steps=140 +[epoch 26/50] step=160 train_loss=0.0166 tok_s=58814.0 opt_steps=160 +[epoch 26/50] step=180 train_loss=0.0165 tok_s=59005.0 opt_steps=180 +[epoch 26/50] step=200 train_loss=0.0166 tok_s=59096.8 opt_steps=200 +[epoch 26/50] step=220 train_loss=0.0166 tok_s=59166.8 opt_steps=220 +[epoch 26/50] step=240 train_loss=0.0167 tok_s=59239.7 opt_steps=240 +[epoch 26/50] step=260 train_loss=0.0167 tok_s=59299.7 opt_steps=260 +[epoch 26/50] step=280 train_loss=0.0167 tok_s=59346.1 opt_steps=280 +[epoch 26/50] step=300 train_loss=0.0167 tok_s=59427.6 opt_steps=300 +[epoch 26/50] step=320 train_loss=0.0168 tok_s=59491.4 opt_steps=320 +[epoch 26/50] step=340 train_loss=0.0167 tok_s=59536.8 opt_steps=340 +[epoch 26/50] step=360 train_loss=0.0168 tok_s=59570.5 opt_steps=360 +[epoch 26/50] step=380 train_loss=0.0169 tok_s=59592.9 opt_steps=380 +[epoch 26/50] step=400 train_loss=0.0169 tok_s=59620.5 opt_steps=400 +[epoch 26/50] step=420 train_loss=0.0169 tok_s=59653.2 opt_steps=420 +[epoch 26/50] step=440 train_loss=0.0170 tok_s=59671.6 opt_steps=440 +[epoch 26/50] step=460 train_loss=0.0170 tok_s=59706.5 opt_steps=460 +[epoch 26/50] step=480 train_loss=0.0170 tok_s=59712.8 opt_steps=480 +[epoch 26/50] step=500 train_loss=0.0170 tok_s=59722.3 opt_steps=500 +[epoch 26/50] step=520 train_loss=0.0170 tok_s=59747.1 opt_steps=520 +[epoch 26/50] step=540 train_loss=0.0171 tok_s=59763.5 opt_steps=540 +[epoch 26/50] step=560 train_loss=0.0171 tok_s=59770.1 opt_steps=560 +[epoch 26/50] step=580 train_loss=0.0171 tok_s=59775.7 opt_steps=580 +[epoch 26/50] step=600 train_loss=0.0172 tok_s=59794.0 opt_steps=600 +[epoch 26/50] step=620 train_loss=0.0172 tok_s=59793.7 opt_steps=620 +[epoch 26/50] step=640 train_loss=0.0173 tok_s=59809.9 opt_steps=640 +[epoch 26/50] step=660 train_loss=0.0173 tok_s=59819.5 opt_steps=660 +[epoch 26/50] step=680 train_loss=0.0173 tok_s=59829.8 opt_steps=680 +[epoch 26/50] step=700 train_loss=0.0173 tok_s=59838.1 opt_steps=700 +[epoch 26/50] step=720 train_loss=0.0174 tok_s=59838.3 opt_steps=720 +[epoch 26/50] step=740 train_loss=0.0174 tok_s=59841.0 opt_steps=740 +[epoch 26/50] step=760 train_loss=0.0175 tok_s=59842.3 opt_steps=760 +[epoch 26/50] step=780 train_loss=0.0175 tok_s=59843.0 opt_steps=780 +[epoch 26/50] step=800 train_loss=0.0175 tok_s=59852.9 opt_steps=800 +[epoch 26/50] step=820 train_loss=0.0175 tok_s=59855.3 opt_steps=820 +[epoch 26/50] step=840 train_loss=0.0176 tok_s=59858.5 opt_steps=840 +[epoch 26/50] step=860 train_loss=0.0176 tok_s=59853.5 opt_steps=860 +[epoch 26/50] step=880 train_loss=0.0177 tok_s=59859.0 opt_steps=880 +[epoch 26/50] step=900 train_loss=0.0177 tok_s=59862.8 opt_steps=900 +[epoch 26/50] step=920 train_loss=0.0177 tok_s=59871.0 opt_steps=920 +[epoch 26/50] step=940 train_loss=0.0177 tok_s=59878.9 opt_steps=940 +[epoch 26/50] step=960 train_loss=0.0178 tok_s=59882.1 opt_steps=960 +[epoch 26/50] step=980 train_loss=0.0178 tok_s=59888.8 opt_steps=980 +[epoch 26/50] step=1000 train_loss=0.0178 tok_s=59891.6 opt_steps=1000 +[epoch 26/50] step=1020 train_loss=0.0178 tok_s=59894.9 opt_steps=1020 +[epoch 26/50] step=1040 train_loss=0.0179 tok_s=59893.8 opt_steps=1040 +[epoch 26/50] step=1060 train_loss=0.0179 tok_s=59901.2 opt_steps=1060 +[epoch 26/50] step=1080 train_loss=0.0179 tok_s=59902.8 opt_steps=1080 +[epoch 26/50] step=1100 train_loss=0.0179 tok_s=59915.0 opt_steps=1100 +[epoch 26/50] step=1120 train_loss=0.0180 tok_s=59914.2 opt_steps=1120 +[epoch 26/50] step=1140 train_loss=0.0180 tok_s=59913.9 opt_steps=1140 +[epoch 26/50] step=1160 train_loss=0.0180 tok_s=59917.2 opt_steps=1160 +[epoch 26/50] step=1180 train_loss=0.0181 tok_s=59926.1 opt_steps=1180 +[epoch 26/50] step=1200 train_loss=0.0181 tok_s=59933.9 opt_steps=1200 +[epoch 26/50] step=1220 train_loss=0.0181 tok_s=59935.4 opt_steps=1220 +[epoch 26/50] step=1240 train_loss=0.0181 tok_s=59928.4 opt_steps=1240 +[epoch 26/50] step=1260 train_loss=0.0181 tok_s=59931.4 opt_steps=1260 +[epoch 26/50] step=1280 train_loss=0.0181 tok_s=59937.8 opt_steps=1280 +[epoch 26/50] step=1300 train_loss=0.0182 tok_s=59931.8 opt_steps=1300 +[epoch 26/50] step=1320 train_loss=0.0182 tok_s=59930.2 opt_steps=1320 +[epoch 26/50] step=1340 train_loss=0.0182 tok_s=59933.2 opt_steps=1340 +[epoch 26/50] step=1360 train_loss=0.0182 tok_s=59935.2 opt_steps=1360 +[epoch 26/50] step=1380 train_loss=0.0183 tok_s=59928.8 opt_steps=1380 +[epoch 26/50] step=1400 train_loss=0.0183 tok_s=59921.6 opt_steps=1400 +[epoch 26/50] step=1420 train_loss=0.0183 tok_s=59920.1 opt_steps=1420 +[epoch 26/50] step=1440 train_loss=0.0183 tok_s=59919.9 opt_steps=1440 +[epoch 26/50] step=1460 train_loss=0.0184 tok_s=59920.0 opt_steps=1460 +[epoch 26/50] step=1480 train_loss=0.0184 tok_s=59915.6 opt_steps=1480 +[epoch 26/50] step=1500 train_loss=0.0184 tok_s=59910.3 opt_steps=1500 +[epoch 26/50] step=1520 train_loss=0.0184 tok_s=59912.3 opt_steps=1520 +[epoch 26/50] step=1540 train_loss=0.0185 tok_s=59923.6 opt_steps=1540 +[epoch 26/50] step=1560 train_loss=0.0185 tok_s=59921.6 opt_steps=1560 +[epoch 26/50] step=1580 train_loss=0.0185 tok_s=59923.4 opt_steps=1580 +[epoch 26/50] step=1600 train_loss=0.0185 tok_s=59932.0 opt_steps=1600 +[epoch 26/50] step=1620 train_loss=0.0186 tok_s=59931.7 opt_steps=1620 +[epoch 26/50] step=1640 train_loss=0.0186 tok_s=59928.4 opt_steps=1640 +[epoch 26/50] step=1660 train_loss=0.0186 tok_s=59932.7 opt_steps=1660 +[epoch 26/50] step=1680 train_loss=0.0186 tok_s=59931.3 opt_steps=1680 +[epoch 26/50] step=1700 train_loss=0.0186 tok_s=59926.7 opt_steps=1700 +[epoch 26/50] step=1720 train_loss=0.0187 tok_s=59926.0 opt_steps=1720 +[epoch 26/50] step=1740 train_loss=0.0187 tok_s=59923.8 opt_steps=1740 +[epoch 26/50] step=1760 train_loss=0.0187 tok_s=59927.7 opt_steps=1760 +[epoch 26/50] step=1780 train_loss=0.0187 tok_s=59926.9 opt_steps=1780 +[epoch 26/50] step=1800 train_loss=0.0188 tok_s=59925.8 opt_steps=1800 +[epoch 26/50] step=1820 train_loss=0.0188 tok_s=59928.7 opt_steps=1820 +[epoch 26/50] step=1840 train_loss=0.0188 tok_s=59921.3 opt_steps=1840 +[epoch 26/50] step=1860 train_loss=0.0188 tok_s=59921.2 opt_steps=1860 +[epoch 26/50] step=1880 train_loss=0.0188 tok_s=59918.6 opt_steps=1880 +[epoch 26/50] step=1900 train_loss=0.0188 tok_s=59919.0 opt_steps=1900 +[epoch 26/50] step=1920 train_loss=0.0189 tok_s=59917.8 opt_steps=1920 +[epoch 26/50] step=1940 train_loss=0.0189 tok_s=59920.2 opt_steps=1940 +[epoch 26/50] step=1960 train_loss=0.0189 tok_s=59918.6 opt_steps=1960 +[epoch 26/50] step=1980 train_loss=0.0189 tok_s=59922.0 opt_steps=1980 +[epoch 26/50] step=2000 train_loss=0.0189 tok_s=59919.0 opt_steps=2000 +[epoch 26/50] step=2020 train_loss=0.0190 tok_s=59914.0 opt_steps=2020 +[epoch 26/50] step=2040 train_loss=0.0190 tok_s=59915.2 opt_steps=2040 +[epoch 26/50] step=2060 train_loss=0.0190 tok_s=59914.5 opt_steps=2060 +[epoch 26/50] step=2080 train_loss=0.0190 tok_s=59906.8 opt_steps=2080 +[epoch 26/50] step=2100 train_loss=0.0190 tok_s=59905.4 opt_steps=2100 +[epoch 26/50] step=2120 train_loss=0.0191 tok_s=59900.9 opt_steps=2120 +[epoch 26/50] step=2140 train_loss=0.0191 tok_s=59903.5 opt_steps=2140 +[epoch 26/50] step=2160 train_loss=0.0191 tok_s=59895.6 opt_steps=2160 +[epoch 26/50] step=2180 train_loss=0.0191 tok_s=59891.7 opt_steps=2180 +[epoch 26/50] step=2200 train_loss=0.0191 tok_s=59892.4 opt_steps=2200 +[epoch 26/50] step=2220 train_loss=0.0191 tok_s=59892.9 opt_steps=2220 +[epoch 26/50] step=2240 train_loss=0.0192 tok_s=59895.4 opt_steps=2240 +[epoch 26/50] step=2260 train_loss=0.0192 tok_s=59897.9 opt_steps=2260 +[epoch 26/50] step=2280 train_loss=0.0192 tok_s=59902.7 opt_steps=2280 +[epoch 26/50] step=2300 train_loss=0.0192 tok_s=59902.1 opt_steps=2300 +[epoch 26/50] step=2320 train_loss=0.0192 tok_s=59903.6 opt_steps=2320 +[epoch 26/50] step=2340 train_loss=0.0193 tok_s=59903.8 opt_steps=2340 +[epoch 26/50] step=2360 train_loss=0.0193 tok_s=59904.2 opt_steps=2360 +[epoch 26/50] step=2380 train_loss=0.0193 tok_s=59903.5 opt_steps=2380 +[epoch 26/50] step=2400 train_loss=0.0193 tok_s=59903.1 opt_steps=2400 +[epoch 26/50] step=2420 train_loss=0.0193 tok_s=59898.8 opt_steps=2420 +[epoch 26/50] step=2440 train_loss=0.0193 tok_s=59900.0 opt_steps=2440 +[epoch 26/50] step=2460 train_loss=0.0193 tok_s=59898.3 opt_steps=2460 +[epoch 26/50] step=2480 train_loss=0.0194 tok_s=59900.4 opt_steps=2480 +[epoch 26/50] step=2500 train_loss=0.0194 tok_s=59908.2 opt_steps=2500 +[epoch 26/50] step=2520 train_loss=0.0194 tok_s=59902.5 opt_steps=2520 +[epoch 26/50] step=2540 train_loss=0.0194 tok_s=59900.5 opt_steps=2540 +[epoch 26/50] step=2560 train_loss=0.0194 tok_s=59900.5 opt_steps=2560 +[epoch 26/50] step=2580 train_loss=0.0194 tok_s=59898.1 opt_steps=2580 +[epoch 26/50] step=2600 train_loss=0.0194 tok_s=59898.4 opt_steps=2600 +[epoch 26/50] step=2620 train_loss=0.0195 tok_s=59892.9 opt_steps=2620 +[epoch 26/50] step=2640 train_loss=0.0195 tok_s=59889.5 opt_steps=2640 +[epoch 26/50] step=2660 train_loss=0.0195 tok_s=59884.8 opt_steps=2660 +[epoch 26/50] step=2680 train_loss=0.0195 tok_s=59883.3 opt_steps=2680 +[epoch 26/50] step=2700 train_loss=0.0195 tok_s=59881.3 opt_steps=2700 +[epoch 26/50] step=2720 train_loss=0.0195 tok_s=59884.3 opt_steps=2720 +[epoch 26/50] step=2740 train_loss=0.0195 tok_s=59884.5 opt_steps=2740 +[epoch 26/50] step=2760 train_loss=0.0195 tok_s=59882.4 opt_steps=2760 +[epoch 26/50] step=2780 train_loss=0.0196 tok_s=59880.1 opt_steps=2780 +[epoch 26/50] step=2800 train_loss=0.0196 tok_s=59880.9 opt_steps=2800 +[epoch 26/50] step=2820 train_loss=0.0196 tok_s=59877.6 opt_steps=2820 +[epoch 26/50] step=2840 train_loss=0.0196 tok_s=59876.0 opt_steps=2840 +[epoch 26/50] step=2860 train_loss=0.0196 tok_s=59873.5 opt_steps=2860 +[epoch 26/50] step=2880 train_loss=0.0196 tok_s=59876.7 opt_steps=2880 +[epoch 26/50] step=2900 train_loss=0.0196 tok_s=59879.7 opt_steps=2900 +[epoch 26/50] step=2920 train_loss=0.0197 tok_s=59881.8 opt_steps=2920 +[epoch 26/50] step=2940 train_loss=0.0197 tok_s=59884.1 opt_steps=2940 +[epoch 26/50] step=2960 train_loss=0.0197 tok_s=59888.5 opt_steps=2960 +[epoch 26/50] step=2980 train_loss=0.0197 tok_s=59890.6 opt_steps=2980 +[epoch 26/50] step=3000 train_loss=0.0197 tok_s=59893.4 opt_steps=3000 +[epoch 26/50] step=3020 train_loss=0.0197 tok_s=59891.8 opt_steps=3020 +[epoch 26/50] step=3040 train_loss=0.0197 tok_s=59891.7 opt_steps=3040 +[epoch 26/50] step=3060 train_loss=0.0197 tok_s=59890.0 opt_steps=3060 +[epoch 26/50] step=3080 train_loss=0.0197 tok_s=59890.9 opt_steps=3080 +[epoch 26/50] step=3100 train_loss=0.0198 tok_s=59883.9 opt_steps=3100 +[epoch 26/50] step=3120 train_loss=0.0198 tok_s=59883.4 opt_steps=3120 +[epoch 26/50] step=3140 train_loss=0.0198 tok_s=59884.4 opt_steps=3140 +[epoch 26/50] step=3160 train_loss=0.0198 tok_s=59886.3 opt_steps=3160 +[epoch 26/50] step=3180 train_loss=0.0198 tok_s=59886.6 opt_steps=3180 +[epoch 26/50] step=3200 train_loss=0.0198 tok_s=59887.3 opt_steps=3200 +[epoch 26/50] step=3220 train_loss=0.0198 tok_s=59885.8 opt_steps=3220 +[epoch 26/50] step=3240 train_loss=0.0198 tok_s=59885.7 opt_steps=3240 +[epoch 26/50] step=3260 train_loss=0.0199 tok_s=59881.8 opt_steps=3260 +[epoch 26/50] train_loss=0.0199 val_skipped tok_s=59883.0 opt_steps=3273 +[epoch 27/50] step=20 train_loss=0.0161 tok_s=57336.8 opt_steps=20 +[epoch 27/50] step=40 train_loss=0.0163 tok_s=58298.7 opt_steps=40 +[epoch 27/50] step=60 train_loss=0.0166 tok_s=58655.1 opt_steps=60 +[epoch 27/50] step=80 train_loss=0.0164 tok_s=58929.9 opt_steps=80 +[epoch 27/50] step=100 train_loss=0.0162 tok_s=59200.9 opt_steps=100 +[epoch 27/50] step=120 train_loss=0.0161 tok_s=59352.8 opt_steps=120 +[epoch 27/50] step=140 train_loss=0.0159 tok_s=59410.8 opt_steps=140 +[epoch 27/50] step=160 train_loss=0.0159 tok_s=59461.1 opt_steps=160 +[epoch 27/50] step=180 train_loss=0.0160 tok_s=59541.3 opt_steps=180 +[epoch 27/50] step=200 train_loss=0.0159 tok_s=59579.8 opt_steps=200 +[epoch 27/50] step=220 train_loss=0.0159 tok_s=59587.1 opt_steps=220 +[epoch 27/50] step=240 train_loss=0.0158 tok_s=59646.5 opt_steps=240 +[epoch 27/50] step=260 train_loss=0.0158 tok_s=59682.3 opt_steps=260 +[epoch 27/50] step=280 train_loss=0.0159 tok_s=59721.5 opt_steps=280 +[epoch 27/50] step=300 train_loss=0.0159 tok_s=59756.8 opt_steps=300 +[epoch 27/50] step=320 train_loss=0.0159 tok_s=59791.3 opt_steps=320 +[epoch 27/50] step=340 train_loss=0.0159 tok_s=59798.1 opt_steps=340 +[epoch 27/50] step=360 train_loss=0.0160 tok_s=59804.1 opt_steps=360 +[epoch 27/50] step=380 train_loss=0.0160 tok_s=59811.3 opt_steps=380 +[epoch 27/50] step=400 train_loss=0.0161 tok_s=59807.4 opt_steps=400 +[epoch 27/50] step=420 train_loss=0.0161 tok_s=59823.8 opt_steps=420 +[epoch 27/50] step=440 train_loss=0.0161 tok_s=59857.5 opt_steps=440 +[epoch 27/50] step=460 train_loss=0.0161 tok_s=59872.5 opt_steps=460 +[epoch 27/50] step=480 train_loss=0.0162 tok_s=59910.7 opt_steps=480 +[epoch 27/50] step=500 train_loss=0.0163 tok_s=59910.7 opt_steps=500 +[epoch 27/50] step=520 train_loss=0.0163 tok_s=59925.4 opt_steps=520 +[epoch 27/50] step=540 train_loss=0.0164 tok_s=59925.9 opt_steps=540 +[epoch 27/50] step=560 train_loss=0.0164 tok_s=59928.4 opt_steps=560 +[epoch 27/50] step=580 train_loss=0.0164 tok_s=59950.6 opt_steps=580 +[epoch 27/50] step=600 train_loss=0.0165 tok_s=59957.8 opt_steps=600 +[epoch 27/50] step=620 train_loss=0.0165 tok_s=59963.0 opt_steps=620 +[epoch 27/50] step=640 train_loss=0.0166 tok_s=59961.9 opt_steps=640 +[epoch 27/50] step=660 train_loss=0.0166 tok_s=59973.0 opt_steps=660 +[epoch 27/50] step=680 train_loss=0.0166 tok_s=59972.7 opt_steps=680 +[epoch 27/50] step=700 train_loss=0.0167 tok_s=59906.2 opt_steps=700 +[epoch 27/50] step=720 train_loss=0.0167 tok_s=59897.5 opt_steps=720 +[epoch 27/50] step=740 train_loss=0.0167 tok_s=59908.8 opt_steps=740 +[epoch 27/50] step=760 train_loss=0.0168 tok_s=59917.0 opt_steps=760 +[epoch 27/50] step=780 train_loss=0.0168 tok_s=59916.7 opt_steps=780 +[epoch 27/50] step=800 train_loss=0.0169 tok_s=59913.1 opt_steps=800 +[epoch 27/50] step=820 train_loss=0.0169 tok_s=59923.7 opt_steps=820 +[epoch 27/50] step=840 train_loss=0.0170 tok_s=59945.1 opt_steps=840 +[epoch 27/50] step=860 train_loss=0.0170 tok_s=59950.7 opt_steps=860 +[epoch 27/50] step=880 train_loss=0.0170 tok_s=59955.0 opt_steps=880 +[epoch 27/50] step=900 train_loss=0.0170 tok_s=59952.0 opt_steps=900 +[epoch 27/50] step=920 train_loss=0.0170 tok_s=59943.3 opt_steps=920 +[epoch 27/50] step=940 train_loss=0.0171 tok_s=59954.2 opt_steps=940 +[epoch 27/50] step=960 train_loss=0.0171 tok_s=59953.0 opt_steps=960 +[epoch 27/50] step=980 train_loss=0.0171 tok_s=59956.3 opt_steps=980 +[epoch 27/50] step=1000 train_loss=0.0171 tok_s=59956.8 opt_steps=1000 +[epoch 27/50] step=1020 train_loss=0.0171 tok_s=59954.1 opt_steps=1020 +[epoch 27/50] step=1040 train_loss=0.0172 tok_s=59960.9 opt_steps=1040 +[epoch 27/50] step=1060 train_loss=0.0172 tok_s=59962.2 opt_steps=1060 +[epoch 27/50] step=1080 train_loss=0.0172 tok_s=59973.6 opt_steps=1080 +[epoch 27/50] step=1100 train_loss=0.0172 tok_s=59979.1 opt_steps=1100 +[epoch 27/50] step=1120 train_loss=0.0172 tok_s=59984.5 opt_steps=1120 +[epoch 27/50] step=1140 train_loss=0.0172 tok_s=59991.0 opt_steps=1140 +[epoch 27/50] step=1160 train_loss=0.0173 tok_s=59998.8 opt_steps=1160 +[epoch 27/50] step=1180 train_loss=0.0173 tok_s=60001.3 opt_steps=1180 +[epoch 27/50] step=1200 train_loss=0.0173 tok_s=59989.9 opt_steps=1200 +[epoch 27/50] step=1220 train_loss=0.0173 tok_s=59987.7 opt_steps=1220 +[epoch 27/50] step=1240 train_loss=0.0174 tok_s=59991.1 opt_steps=1240 +[epoch 27/50] step=1260 train_loss=0.0174 tok_s=59991.3 opt_steps=1260 +[epoch 27/50] step=1280 train_loss=0.0174 tok_s=59995.9 opt_steps=1280 +[epoch 27/50] step=1300 train_loss=0.0174 tok_s=60002.4 opt_steps=1300 +[epoch 27/50] step=1320 train_loss=0.0175 tok_s=60002.4 opt_steps=1320 +[epoch 27/50] step=1340 train_loss=0.0175 tok_s=59998.6 opt_steps=1340 +[epoch 27/50] step=1360 train_loss=0.0175 tok_s=60005.8 opt_steps=1360 +[epoch 27/50] step=1380 train_loss=0.0175 tok_s=60007.0 opt_steps=1380 +[epoch 27/50] step=1400 train_loss=0.0176 tok_s=60005.7 opt_steps=1400 +[epoch 27/50] step=1420 train_loss=0.0176 tok_s=60006.5 opt_steps=1420 +[epoch 27/50] step=1440 train_loss=0.0176 tok_s=60011.4 opt_steps=1440 +[epoch 27/50] step=1460 train_loss=0.0176 tok_s=60007.2 opt_steps=1460 +[epoch 27/50] step=1480 train_loss=0.0176 tok_s=60012.5 opt_steps=1480 +[epoch 27/50] step=1500 train_loss=0.0177 tok_s=60011.1 opt_steps=1500 +[epoch 27/50] step=1520 train_loss=0.0177 tok_s=60019.1 opt_steps=1520 +[epoch 27/50] step=1540 train_loss=0.0177 tok_s=60021.8 opt_steps=1540 +[epoch 27/50] step=1560 train_loss=0.0177 tok_s=60025.3 opt_steps=1560 +[epoch 27/50] step=1580 train_loss=0.0177 tok_s=60026.1 opt_steps=1580 +[epoch 27/50] step=1600 train_loss=0.0178 tok_s=60026.6 opt_steps=1600 +[epoch 27/50] step=1620 train_loss=0.0178 tok_s=60026.8 opt_steps=1620 +[epoch 27/50] step=1640 train_loss=0.0178 tok_s=60025.4 opt_steps=1640 +[epoch 27/50] step=1660 train_loss=0.0178 tok_s=60022.7 opt_steps=1660 +[epoch 27/50] step=1680 train_loss=0.0178 tok_s=60002.9 opt_steps=1680 +[epoch 27/50] step=1700 train_loss=0.0179 tok_s=59986.4 opt_steps=1700 +[epoch 27/50] step=1720 train_loss=0.0179 tok_s=59980.2 opt_steps=1720 +[epoch 27/50] step=1740 train_loss=0.0179 tok_s=59982.8 opt_steps=1740 +[epoch 27/50] step=1760 train_loss=0.0179 tok_s=59980.5 opt_steps=1760 +[epoch 27/50] step=1780 train_loss=0.0179 tok_s=59976.4 opt_steps=1780 +[epoch 27/50] step=1800 train_loss=0.0180 tok_s=59976.1 opt_steps=1800 +[epoch 27/50] step=1820 train_loss=0.0180 tok_s=59973.8 opt_steps=1820 +[epoch 27/50] step=1840 train_loss=0.0180 tok_s=59976.0 opt_steps=1840 +[epoch 27/50] step=1860 train_loss=0.0180 tok_s=59979.9 opt_steps=1860 +[epoch 27/50] step=1880 train_loss=0.0181 tok_s=59986.5 opt_steps=1880 +[epoch 27/50] step=1900 train_loss=0.0181 tok_s=59986.8 opt_steps=1900 +[epoch 27/50] step=1920 train_loss=0.0181 tok_s=59990.3 opt_steps=1920 +[epoch 27/50] step=1940 train_loss=0.0181 tok_s=59991.7 opt_steps=1940 +[epoch 27/50] step=1960 train_loss=0.0181 tok_s=59992.2 opt_steps=1960 +[epoch 27/50] step=1980 train_loss=0.0181 tok_s=59992.3 opt_steps=1980 +[epoch 27/50] step=2000 train_loss=0.0182 tok_s=59992.9 opt_steps=2000 +[epoch 27/50] step=2020 train_loss=0.0182 tok_s=59994.6 opt_steps=2020 +[epoch 27/50] step=2040 train_loss=0.0182 tok_s=59997.7 opt_steps=2040 +[epoch 27/50] step=2060 train_loss=0.0182 tok_s=59996.0 opt_steps=2060 +[epoch 27/50] step=2080 train_loss=0.0182 tok_s=59996.9 opt_steps=2080 +[epoch 27/50] step=2100 train_loss=0.0183 tok_s=59997.9 opt_steps=2100 +[epoch 27/50] step=2120 train_loss=0.0183 tok_s=59997.6 opt_steps=2120 +[epoch 27/50] step=2140 train_loss=0.0183 tok_s=59997.2 opt_steps=2140 +[epoch 27/50] step=2160 train_loss=0.0183 tok_s=60001.9 opt_steps=2160 +[epoch 27/50] step=2180 train_loss=0.0183 tok_s=60003.0 opt_steps=2180 +[epoch 27/50] step=2200 train_loss=0.0184 tok_s=60002.7 opt_steps=2200 +[epoch 27/50] step=2220 train_loss=0.0184 tok_s=60001.5 opt_steps=2220 +[epoch 27/50] step=2240 train_loss=0.0184 tok_s=60005.3 opt_steps=2240 +[epoch 27/50] step=2260 train_loss=0.0184 tok_s=60006.8 opt_steps=2260 +[epoch 27/50] step=2280 train_loss=0.0184 tok_s=60008.4 opt_steps=2280 +[epoch 27/50] step=2300 train_loss=0.0184 tok_s=60014.2 opt_steps=2300 +[epoch 27/50] step=2320 train_loss=0.0185 tok_s=60013.1 opt_steps=2320 +[epoch 27/50] step=2340 train_loss=0.0185 tok_s=60014.1 opt_steps=2340 +[epoch 27/50] step=2360 train_loss=0.0185 tok_s=60006.7 opt_steps=2360 +[epoch 27/50] step=2380 train_loss=0.0185 tok_s=60000.7 opt_steps=2380 +[epoch 27/50] step=2400 train_loss=0.0185 tok_s=59998.3 opt_steps=2400 +[epoch 27/50] step=2420 train_loss=0.0185 tok_s=60002.1 opt_steps=2420 +[epoch 27/50] step=2440 train_loss=0.0185 tok_s=60004.7 opt_steps=2440 +[epoch 27/50] step=2460 train_loss=0.0186 tok_s=60002.6 opt_steps=2460 +[epoch 27/50] step=2480 train_loss=0.0186 tok_s=59999.4 opt_steps=2480 +[epoch 27/50] step=2500 train_loss=0.0186 tok_s=60001.5 opt_steps=2500 +[epoch 27/50] step=2520 train_loss=0.0186 tok_s=60004.3 opt_steps=2520 +[epoch 27/50] step=2540 train_loss=0.0186 tok_s=60007.2 opt_steps=2540 +[epoch 27/50] step=2560 train_loss=0.0186 tok_s=60007.8 opt_steps=2560 +[epoch 27/50] step=2580 train_loss=0.0187 tok_s=60006.3 opt_steps=2580 +[epoch 27/50] step=2600 train_loss=0.0187 tok_s=60006.9 opt_steps=2600 +[epoch 27/50] step=2620 train_loss=0.0187 tok_s=60012.5 opt_steps=2620 +[epoch 27/50] step=2640 train_loss=0.0187 tok_s=60016.6 opt_steps=2640 +[epoch 27/50] step=2660 train_loss=0.0187 tok_s=60019.8 opt_steps=2660 +[epoch 27/50] step=2680 train_loss=0.0187 tok_s=60016.6 opt_steps=2680 +[epoch 27/50] step=2700 train_loss=0.0187 tok_s=60012.0 opt_steps=2700 +[epoch 27/50] step=2720 train_loss=0.0188 tok_s=60010.6 opt_steps=2720 +[epoch 27/50] step=2740 train_loss=0.0188 tok_s=60015.5 opt_steps=2740 +[epoch 27/50] step=2760 train_loss=0.0188 tok_s=60016.4 opt_steps=2760 +[epoch 27/50] step=2780 train_loss=0.0188 tok_s=60016.5 opt_steps=2780 +[epoch 27/50] step=2800 train_loss=0.0188 tok_s=60018.4 opt_steps=2800 +[epoch 27/50] step=2820 train_loss=0.0188 tok_s=60018.1 opt_steps=2820 +[epoch 27/50] step=2840 train_loss=0.0188 tok_s=60020.2 opt_steps=2840 +[epoch 27/50] step=2860 train_loss=0.0188 tok_s=60021.7 opt_steps=2860 +[epoch 27/50] step=2880 train_loss=0.0188 tok_s=60023.2 opt_steps=2880 +[epoch 27/50] step=2900 train_loss=0.0188 tok_s=60022.8 opt_steps=2900 +[epoch 27/50] step=2920 train_loss=0.0189 tok_s=60024.9 opt_steps=2920 +[epoch 27/50] step=2940 train_loss=0.0189 tok_s=60023.4 opt_steps=2940 +[epoch 27/50] step=2960 train_loss=0.0189 tok_s=60026.6 opt_steps=2960 +[epoch 27/50] step=2980 train_loss=0.0189 tok_s=60025.4 opt_steps=2980 +[epoch 27/50] step=3000 train_loss=0.0189 tok_s=60025.2 opt_steps=3000 +[epoch 27/50] step=3020 train_loss=0.0189 tok_s=60026.9 opt_steps=3020 +[epoch 27/50] step=3040 train_loss=0.0189 tok_s=60024.2 opt_steps=3040 +[epoch 27/50] step=3060 train_loss=0.0190 tok_s=60025.0 opt_steps=3060 +[epoch 27/50] step=3080 train_loss=0.0190 tok_s=60024.4 opt_steps=3080 +[epoch 27/50] step=3100 train_loss=0.0190 tok_s=60025.1 opt_steps=3100 +[epoch 27/50] step=3120 train_loss=0.0190 tok_s=60024.7 opt_steps=3120 +[epoch 27/50] step=3140 train_loss=0.0190 tok_s=60024.1 opt_steps=3140 +[epoch 27/50] step=3160 train_loss=0.0190 tok_s=60024.0 opt_steps=3160 +[epoch 27/50] step=3180 train_loss=0.0190 tok_s=60023.3 opt_steps=3180 +[epoch 27/50] step=3200 train_loss=0.0191 tok_s=60024.4 opt_steps=3200 +[epoch 27/50] step=3220 train_loss=0.0191 tok_s=60026.5 opt_steps=3220 +[epoch 27/50] step=3240 train_loss=0.0191 tok_s=60026.8 opt_steps=3240 +[epoch 27/50] step=3260 train_loss=0.0191 tok_s=60021.7 opt_steps=3260 +[epoch 27/50] train_loss=0.0191 val_skipped tok_s=60018.1 opt_steps=3273 +[epoch 28/50] step=20 train_loss=0.0153 tok_s=58562.9 opt_steps=20 +[epoch 28/50] step=40 train_loss=0.0150 tok_s=59299.5 opt_steps=40 +[epoch 28/50] step=60 train_loss=0.0152 tok_s=59583.2 opt_steps=60 +[epoch 28/50] step=80 train_loss=0.0152 tok_s=59759.8 opt_steps=80 +[epoch 28/50] step=100 train_loss=0.0153 tok_s=59836.0 opt_steps=100 +[epoch 28/50] step=120 train_loss=0.0153 tok_s=59936.8 opt_steps=120 +[epoch 28/50] step=140 train_loss=0.0154 tok_s=59998.4 opt_steps=140 +[epoch 28/50] step=160 train_loss=0.0156 tok_s=60021.9 opt_steps=160 +[epoch 28/50] step=180 train_loss=0.0155 tok_s=60071.6 opt_steps=180 +[epoch 28/50] step=200 train_loss=0.0155 tok_s=60106.0 opt_steps=200 +[epoch 28/50] step=220 train_loss=0.0155 tok_s=60101.7 opt_steps=220 +[epoch 28/50] step=240 train_loss=0.0155 tok_s=60108.0 opt_steps=240 +[epoch 28/50] step=260 train_loss=0.0155 tok_s=60090.9 opt_steps=260 +[epoch 28/50] step=280 train_loss=0.0156 tok_s=60074.5 opt_steps=280 +[epoch 28/50] step=300 train_loss=0.0155 tok_s=60073.3 opt_steps=300 +[epoch 28/50] step=320 train_loss=0.0156 tok_s=60069.7 opt_steps=320 +[epoch 28/50] step=340 train_loss=0.0156 tok_s=60073.4 opt_steps=340 +[epoch 28/50] step=360 train_loss=0.0155 tok_s=60077.9 opt_steps=360 +[epoch 28/50] step=380 train_loss=0.0155 tok_s=60068.8 opt_steps=380 +[epoch 28/50] step=400 train_loss=0.0156 tok_s=60076.1 opt_steps=400 +[epoch 28/50] step=420 train_loss=0.0156 tok_s=60054.8 opt_steps=420 +[epoch 28/50] step=440 train_loss=0.0156 tok_s=60061.6 opt_steps=440 +[epoch 28/50] step=460 train_loss=0.0156 tok_s=60056.9 opt_steps=460 +[epoch 28/50] step=480 train_loss=0.0156 tok_s=60054.8 opt_steps=480 +[epoch 28/50] step=500 train_loss=0.0156 tok_s=60067.2 opt_steps=500 +[epoch 28/50] step=520 train_loss=0.0156 tok_s=60074.5 opt_steps=520 +[epoch 28/50] step=540 train_loss=0.0157 tok_s=60082.7 opt_steps=540 +[epoch 28/50] step=560 train_loss=0.0157 tok_s=60058.4 opt_steps=560 +[epoch 28/50] step=580 train_loss=0.0157 tok_s=60071.4 opt_steps=580 +[epoch 28/50] step=600 train_loss=0.0158 tok_s=60075.6 opt_steps=600 +[epoch 28/50] step=620 train_loss=0.0158 tok_s=60080.7 opt_steps=620 +[epoch 28/50] step=640 train_loss=0.0158 tok_s=60083.1 opt_steps=640 +[epoch 28/50] step=660 train_loss=0.0159 tok_s=60080.8 opt_steps=660 +[epoch 28/50] step=680 train_loss=0.0159 tok_s=60091.9 opt_steps=680 +[epoch 28/50] step=700 train_loss=0.0159 tok_s=60099.0 opt_steps=700 +[epoch 28/50] step=720 train_loss=0.0159 tok_s=60103.1 opt_steps=720 +[epoch 28/50] step=740 train_loss=0.0160 tok_s=60108.9 opt_steps=740 +[epoch 28/50] step=760 train_loss=0.0160 tok_s=60109.3 opt_steps=760 +[epoch 28/50] step=780 train_loss=0.0161 tok_s=60126.9 opt_steps=780 +[epoch 28/50] step=800 train_loss=0.0161 tok_s=60143.7 opt_steps=800 +[epoch 28/50] step=820 train_loss=0.0161 tok_s=60146.8 opt_steps=820 +[epoch 28/50] step=840 train_loss=0.0161 tok_s=60137.7 opt_steps=840 +[epoch 28/50] step=860 train_loss=0.0161 tok_s=60135.1 opt_steps=860 +[epoch 28/50] step=880 train_loss=0.0161 tok_s=60139.3 opt_steps=880 +[epoch 28/50] step=900 train_loss=0.0162 tok_s=60137.0 opt_steps=900 +[epoch 28/50] step=920 train_loss=0.0162 tok_s=60136.2 opt_steps=920 +[epoch 28/50] step=940 train_loss=0.0162 tok_s=60141.8 opt_steps=940 +[epoch 28/50] step=960 train_loss=0.0163 tok_s=60142.0 opt_steps=960 +[epoch 28/50] step=980 train_loss=0.0163 tok_s=60152.7 opt_steps=980 +[epoch 28/50] step=1000 train_loss=0.0163 tok_s=60157.5 opt_steps=1000 +[epoch 28/50] step=1020 train_loss=0.0163 tok_s=60157.8 opt_steps=1020 +[epoch 28/50] step=1040 train_loss=0.0164 tok_s=60166.8 opt_steps=1040 +[epoch 28/50] step=1060 train_loss=0.0164 tok_s=60172.2 opt_steps=1060 +[epoch 28/50] step=1080 train_loss=0.0164 tok_s=60174.7 opt_steps=1080 +[epoch 28/50] step=1100 train_loss=0.0164 tok_s=60172.4 opt_steps=1100 +[epoch 28/50] step=1120 train_loss=0.0165 tok_s=60172.1 opt_steps=1120 +[epoch 28/50] step=1140 train_loss=0.0165 tok_s=60174.1 opt_steps=1140 +[epoch 28/50] step=1160 train_loss=0.0165 tok_s=60165.7 opt_steps=1160 +[epoch 28/50] step=1180 train_loss=0.0165 tok_s=60158.6 opt_steps=1180 +[epoch 28/50] step=1200 train_loss=0.0166 tok_s=60155.7 opt_steps=1200 +[epoch 28/50] step=1220 train_loss=0.0166 tok_s=60159.0 opt_steps=1220 +[epoch 28/50] step=1240 train_loss=0.0166 tok_s=60156.9 opt_steps=1240 +[epoch 28/50] step=1260 train_loss=0.0166 tok_s=60159.2 opt_steps=1260 +[epoch 28/50] step=1280 train_loss=0.0167 tok_s=60154.4 opt_steps=1280 +[epoch 28/50] step=1300 train_loss=0.0167 tok_s=60154.8 opt_steps=1300 +[epoch 28/50] step=1320 train_loss=0.0167 tok_s=60165.1 opt_steps=1320 +[epoch 28/50] step=1340 train_loss=0.0167 tok_s=60162.7 opt_steps=1340 +[epoch 28/50] step=1360 train_loss=0.0168 tok_s=60165.5 opt_steps=1360 +[epoch 28/50] step=1380 train_loss=0.0168 tok_s=60166.2 opt_steps=1380 +[epoch 28/50] step=1400 train_loss=0.0168 tok_s=60168.6 opt_steps=1400 +[epoch 28/50] step=1420 train_loss=0.0169 tok_s=60171.8 opt_steps=1420 +[epoch 28/50] step=1440 train_loss=0.0169 tok_s=60170.5 opt_steps=1440 +[epoch 28/50] step=1460 train_loss=0.0169 tok_s=60165.3 opt_steps=1460 +[epoch 28/50] step=1480 train_loss=0.0169 tok_s=60163.3 opt_steps=1480 +[epoch 28/50] step=1500 train_loss=0.0170 tok_s=60164.6 opt_steps=1500 +[epoch 28/50] step=1520 train_loss=0.0170 tok_s=60169.0 opt_steps=1520 +[epoch 28/50] step=1540 train_loss=0.0170 tok_s=60165.7 opt_steps=1540 +[epoch 28/50] step=1560 train_loss=0.0170 tok_s=60163.7 opt_steps=1560 +[epoch 28/50] step=1580 train_loss=0.0170 tok_s=60166.6 opt_steps=1580 +[epoch 28/50] step=1600 train_loss=0.0171 tok_s=60164.2 opt_steps=1600 +[epoch 28/50] step=1620 train_loss=0.0171 tok_s=60157.6 opt_steps=1620 +[epoch 28/50] step=1640 train_loss=0.0171 tok_s=60153.2 opt_steps=1640 +[epoch 28/50] step=1660 train_loss=0.0171 tok_s=60153.1 opt_steps=1660 +[epoch 28/50] step=1680 train_loss=0.0171 tok_s=60154.4 opt_steps=1680 +[epoch 28/50] step=1700 train_loss=0.0171 tok_s=60162.3 opt_steps=1700 +[epoch 28/50] step=1720 train_loss=0.0172 tok_s=60159.9 opt_steps=1720 +[epoch 28/50] step=1740 train_loss=0.0172 tok_s=60156.8 opt_steps=1740 +[epoch 28/50] step=1760 train_loss=0.0172 tok_s=60158.0 opt_steps=1760 +[epoch 28/50] step=1780 train_loss=0.0172 tok_s=60159.9 opt_steps=1780 +[epoch 28/50] step=1800 train_loss=0.0172 tok_s=60159.5 opt_steps=1800 +[epoch 28/50] step=1820 train_loss=0.0173 tok_s=60157.7 opt_steps=1820 +[epoch 28/50] step=1840 train_loss=0.0173 tok_s=60158.0 opt_steps=1840 +[epoch 28/50] step=1860 train_loss=0.0173 tok_s=60158.5 opt_steps=1860 +[epoch 28/50] step=1880 train_loss=0.0173 tok_s=60156.6 opt_steps=1880 +[epoch 28/50] step=1900 train_loss=0.0173 tok_s=60159.1 opt_steps=1900 +[epoch 28/50] step=1920 train_loss=0.0173 tok_s=60159.0 opt_steps=1920 +[epoch 28/50] step=1940 train_loss=0.0173 tok_s=60157.2 opt_steps=1940 +[epoch 28/50] step=1960 train_loss=0.0174 tok_s=60157.4 opt_steps=1960 +[epoch 28/50] step=1980 train_loss=0.0174 tok_s=60161.1 opt_steps=1980 +[epoch 28/50] step=2000 train_loss=0.0174 tok_s=60160.2 opt_steps=2000 +[epoch 28/50] step=2020 train_loss=0.0174 tok_s=60159.0 opt_steps=2020 +[epoch 28/50] step=2040 train_loss=0.0174 tok_s=60157.4 opt_steps=2040 +[epoch 28/50] step=2060 train_loss=0.0174 tok_s=60156.1 opt_steps=2060 +[epoch 28/50] step=2080 train_loss=0.0175 tok_s=60157.7 opt_steps=2080 +[epoch 28/50] step=2100 train_loss=0.0175 tok_s=60160.6 opt_steps=2100 +[epoch 28/50] step=2120 train_loss=0.0175 tok_s=60158.9 opt_steps=2120 +[epoch 28/50] step=2140 train_loss=0.0175 tok_s=60153.1 opt_steps=2140 +[epoch 28/50] step=2160 train_loss=0.0175 tok_s=60152.3 opt_steps=2160 +[epoch 28/50] step=2180 train_loss=0.0176 tok_s=60150.2 opt_steps=2180 +[epoch 28/50] step=2200 train_loss=0.0176 tok_s=60148.0 opt_steps=2200 +[epoch 28/50] step=2220 train_loss=0.0176 tok_s=60147.0 opt_steps=2220 +[epoch 28/50] step=2240 train_loss=0.0176 tok_s=60147.0 opt_steps=2240 +[epoch 28/50] step=2260 train_loss=0.0176 tok_s=60147.8 opt_steps=2260 +[epoch 28/50] step=2280 train_loss=0.0176 tok_s=60150.3 opt_steps=2280 +[epoch 28/50] step=2300 train_loss=0.0176 tok_s=60146.9 opt_steps=2300 +[epoch 28/50] step=2320 train_loss=0.0176 tok_s=60143.9 opt_steps=2320 +[epoch 28/50] step=2340 train_loss=0.0177 tok_s=60145.9 opt_steps=2340 +[epoch 28/50] step=2360 train_loss=0.0177 tok_s=60145.6 opt_steps=2360 +[epoch 28/50] step=2380 train_loss=0.0177 tok_s=60143.1 opt_steps=2380 +[epoch 28/50] step=2400 train_loss=0.0177 tok_s=60142.7 opt_steps=2400 +[epoch 28/50] step=2420 train_loss=0.0177 tok_s=60137.5 opt_steps=2420 +[epoch 28/50] step=2440 train_loss=0.0177 tok_s=60137.9 opt_steps=2440 +[epoch 28/50] step=2460 train_loss=0.0178 tok_s=60138.2 opt_steps=2460 +[epoch 28/50] step=2480 train_loss=0.0178 tok_s=60137.7 opt_steps=2480 +[epoch 28/50] step=2500 train_loss=0.0178 tok_s=60134.8 opt_steps=2500 +[epoch 28/50] step=2520 train_loss=0.0178 tok_s=60134.1 opt_steps=2520 +[epoch 28/50] step=2540 train_loss=0.0178 tok_s=60132.2 opt_steps=2540 +[epoch 28/50] step=2560 train_loss=0.0178 tok_s=60130.1 opt_steps=2560 +[epoch 28/50] step=2580 train_loss=0.0178 tok_s=60130.1 opt_steps=2580 +[epoch 28/50] step=2600 train_loss=0.0179 tok_s=60130.9 opt_steps=2600 +[epoch 28/50] step=2620 train_loss=0.0179 tok_s=60128.5 opt_steps=2620 +[epoch 28/50] step=2640 train_loss=0.0179 tok_s=60124.0 opt_steps=2640 +[epoch 28/50] step=2660 train_loss=0.0179 tok_s=60123.1 opt_steps=2660 +[epoch 28/50] step=2680 train_loss=0.0179 tok_s=60122.4 opt_steps=2680 +[epoch 28/50] step=2700 train_loss=0.0179 tok_s=60119.9 opt_steps=2700 +[epoch 28/50] step=2720 train_loss=0.0179 tok_s=60121.1 opt_steps=2720 +[epoch 28/50] step=2740 train_loss=0.0180 tok_s=60122.6 opt_steps=2740 +[epoch 28/50] step=2760 train_loss=0.0180 tok_s=60124.4 opt_steps=2760 +[epoch 28/50] step=2780 train_loss=0.0180 tok_s=60125.1 opt_steps=2780 +[epoch 28/50] step=2800 train_loss=0.0180 tok_s=60123.6 opt_steps=2800 +[epoch 28/50] step=2820 train_loss=0.0180 tok_s=60121.9 opt_steps=2820 +[epoch 28/50] step=2840 train_loss=0.0180 tok_s=60123.8 opt_steps=2840 +[epoch 28/50] step=2860 train_loss=0.0180 tok_s=60127.7 opt_steps=2860 +[epoch 28/50] step=2880 train_loss=0.0181 tok_s=60131.0 opt_steps=2880 +[epoch 28/50] step=2900 train_loss=0.0181 tok_s=60131.1 opt_steps=2900 +[epoch 28/50] step=2920 train_loss=0.0181 tok_s=60129.2 opt_steps=2920 +[epoch 28/50] step=2940 train_loss=0.0181 tok_s=60127.3 opt_steps=2940 +[epoch 28/50] step=2960 train_loss=0.0181 tok_s=60126.0 opt_steps=2960 +[epoch 28/50] step=2980 train_loss=0.0181 tok_s=60124.5 opt_steps=2980 +[epoch 28/50] step=3000 train_loss=0.0181 tok_s=60121.6 opt_steps=3000 +[epoch 28/50] step=3020 train_loss=0.0182 tok_s=60120.1 opt_steps=3020 +[epoch 28/50] step=3040 train_loss=0.0182 tok_s=60119.8 opt_steps=3040 +[epoch 28/50] step=3060 train_loss=0.0182 tok_s=60118.4 opt_steps=3060 +[epoch 28/50] step=3080 train_loss=0.0182 tok_s=60116.0 opt_steps=3080 +[epoch 28/50] step=3100 train_loss=0.0182 tok_s=60112.8 opt_steps=3100 +[epoch 28/50] step=3120 train_loss=0.0182 tok_s=60109.9 opt_steps=3120 +[epoch 28/50] step=3140 train_loss=0.0182 tok_s=60106.1 opt_steps=3140 +[epoch 28/50] step=3160 train_loss=0.0182 tok_s=60104.2 opt_steps=3160 +[epoch 28/50] step=3180 train_loss=0.0182 tok_s=60105.5 opt_steps=3180 +[epoch 28/50] step=3200 train_loss=0.0182 tok_s=60104.5 opt_steps=3200 +[epoch 28/50] step=3220 train_loss=0.0183 tok_s=60106.8 opt_steps=3220 +[epoch 28/50] step=3240 train_loss=0.0183 tok_s=60104.9 opt_steps=3240 +[epoch 28/50] step=3260 train_loss=0.0183 tok_s=60101.5 opt_steps=3260 +[epoch 28/50] train_loss=0.0183 val_skipped tok_s=60102.4 opt_steps=3273 +[epoch 29/50] step=20 train_loss=0.0140 tok_s=57890.7 opt_steps=20 +[epoch 29/50] step=40 train_loss=0.0141 tok_s=59144.9 opt_steps=40 +[epoch 29/50] step=60 train_loss=0.0144 tok_s=59225.0 opt_steps=60 +[epoch 29/50] step=80 train_loss=0.0143 tok_s=59586.4 opt_steps=80 +[epoch 29/50] step=100 train_loss=0.0144 tok_s=59660.4 opt_steps=100 +[epoch 29/50] step=120 train_loss=0.0144 tok_s=59763.3 opt_steps=120 +[epoch 29/50] step=140 train_loss=0.0144 tok_s=59768.5 opt_steps=140 +[epoch 29/50] step=160 train_loss=0.0143 tok_s=59820.4 opt_steps=160 +[epoch 29/50] step=180 train_loss=0.0144 tok_s=59859.3 opt_steps=180 +[epoch 29/50] step=200 train_loss=0.0144 tok_s=59863.8 opt_steps=200 +[epoch 29/50] step=220 train_loss=0.0143 tok_s=59885.3 opt_steps=220 +[epoch 29/50] step=240 train_loss=0.0143 tok_s=59924.7 opt_steps=240 +[epoch 29/50] step=260 train_loss=0.0144 tok_s=59942.7 opt_steps=260 +[epoch 29/50] step=280 train_loss=0.0144 tok_s=59974.3 opt_steps=280 +[epoch 29/50] step=300 train_loss=0.0144 tok_s=59962.7 opt_steps=300 +[epoch 29/50] step=320 train_loss=0.0144 tok_s=59961.6 opt_steps=320 +[epoch 29/50] step=340 train_loss=0.0145 tok_s=59971.4 opt_steps=340 +[epoch 29/50] step=360 train_loss=0.0145 tok_s=59981.4 opt_steps=360 +[epoch 29/50] step=380 train_loss=0.0146 tok_s=60001.1 opt_steps=380 +[epoch 29/50] step=400 train_loss=0.0146 tok_s=59989.3 opt_steps=400 +[epoch 29/50] step=420 train_loss=0.0147 tok_s=60000.4 opt_steps=420 +[epoch 29/50] step=440 train_loss=0.0147 tok_s=60012.1 opt_steps=440 +[epoch 29/50] step=460 train_loss=0.0148 tok_s=60013.0 opt_steps=460 +[epoch 29/50] step=480 train_loss=0.0148 tok_s=60018.8 opt_steps=480 +[epoch 29/50] step=500 train_loss=0.0149 tok_s=60021.3 opt_steps=500 +[epoch 29/50] step=520 train_loss=0.0149 tok_s=60024.8 opt_steps=520 +[epoch 29/50] step=540 train_loss=0.0150 tok_s=60025.5 opt_steps=540 +[epoch 29/50] step=560 train_loss=0.0150 tok_s=60012.3 opt_steps=560 +[epoch 29/50] step=580 train_loss=0.0150 tok_s=60039.3 opt_steps=580 +[epoch 29/50] step=600 train_loss=0.0151 tok_s=60046.4 opt_steps=600 +[epoch 29/50] step=620 train_loss=0.0151 tok_s=60049.4 opt_steps=620 +[epoch 29/50] step=640 train_loss=0.0151 tok_s=60052.1 opt_steps=640 +[epoch 29/50] step=660 train_loss=0.0152 tok_s=60051.3 opt_steps=660 +[epoch 29/50] step=680 train_loss=0.0152 tok_s=60040.3 opt_steps=680 +[epoch 29/50] step=700 train_loss=0.0152 tok_s=60044.8 opt_steps=700 +[epoch 29/50] step=720 train_loss=0.0153 tok_s=60038.3 opt_steps=720 +[epoch 29/50] step=740 train_loss=0.0153 tok_s=60048.9 opt_steps=740 +[epoch 29/50] step=760 train_loss=0.0154 tok_s=60054.1 opt_steps=760 +[epoch 29/50] step=780 train_loss=0.0154 tok_s=60055.3 opt_steps=780 +[epoch 29/50] step=800 train_loss=0.0155 tok_s=60052.5 opt_steps=800 +[epoch 29/50] step=820 train_loss=0.0155 tok_s=60061.9 opt_steps=820 +[epoch 29/50] step=840 train_loss=0.0155 tok_s=60065.0 opt_steps=840 +[epoch 29/50] step=860 train_loss=0.0155 tok_s=60060.9 opt_steps=860 +[epoch 29/50] step=880 train_loss=0.0155 tok_s=60061.6 opt_steps=880 +[epoch 29/50] step=900 train_loss=0.0156 tok_s=60067.9 opt_steps=900 +[epoch 29/50] step=920 train_loss=0.0156 tok_s=60070.3 opt_steps=920 +[epoch 29/50] step=940 train_loss=0.0156 tok_s=60069.3 opt_steps=940 +[epoch 29/50] step=960 train_loss=0.0157 tok_s=60078.2 opt_steps=960 +[epoch 29/50] step=980 train_loss=0.0157 tok_s=60066.9 opt_steps=980 +[epoch 29/50] step=1000 train_loss=0.0158 tok_s=60072.3 opt_steps=1000 +[epoch 29/50] step=1020 train_loss=0.0158 tok_s=60073.2 opt_steps=1020 +[epoch 29/50] step=1040 train_loss=0.0158 tok_s=60075.9 opt_steps=1040 +[epoch 29/50] step=1060 train_loss=0.0158 tok_s=60081.6 opt_steps=1060 +[epoch 29/50] step=1080 train_loss=0.0158 tok_s=60072.6 opt_steps=1080 +[epoch 29/50] step=1100 train_loss=0.0159 tok_s=60079.5 opt_steps=1100 +[epoch 29/50] step=1120 train_loss=0.0159 tok_s=60080.3 opt_steps=1120 +[epoch 29/50] step=1140 train_loss=0.0159 tok_s=60080.9 opt_steps=1140 +[epoch 29/50] step=1160 train_loss=0.0160 tok_s=60081.7 opt_steps=1160 +[epoch 29/50] step=1180 train_loss=0.0160 tok_s=60084.9 opt_steps=1180 +[epoch 29/50] step=1200 train_loss=0.0160 tok_s=60087.7 opt_steps=1200 +[epoch 29/50] step=1220 train_loss=0.0160 tok_s=60089.2 opt_steps=1220 +[epoch 29/50] step=1240 train_loss=0.0161 tok_s=60087.7 opt_steps=1240 +[epoch 29/50] step=1260 train_loss=0.0161 tok_s=60082.5 opt_steps=1260 +[epoch 29/50] step=1280 train_loss=0.0161 tok_s=60091.5 opt_steps=1280 +[epoch 29/50] step=1300 train_loss=0.0161 tok_s=60090.9 opt_steps=1300 +[epoch 29/50] step=1320 train_loss=0.0162 tok_s=60095.5 opt_steps=1320 +[epoch 29/50] step=1340 train_loss=0.0162 tok_s=60094.2 opt_steps=1340 +[epoch 29/50] step=1360 train_loss=0.0162 tok_s=60097.2 opt_steps=1360 +[epoch 29/50] step=1380 train_loss=0.0162 tok_s=60094.8 opt_steps=1380 +[epoch 29/50] step=1400 train_loss=0.0163 tok_s=60087.8 opt_steps=1400 +[epoch 29/50] step=1420 train_loss=0.0163 tok_s=60084.1 opt_steps=1420 +[epoch 29/50] step=1440 train_loss=0.0163 tok_s=60087.3 opt_steps=1440 +[epoch 29/50] step=1460 train_loss=0.0163 tok_s=60089.5 opt_steps=1460 +[epoch 29/50] step=1480 train_loss=0.0163 tok_s=60096.0 opt_steps=1480 +[epoch 29/50] step=1500 train_loss=0.0163 tok_s=60098.8 opt_steps=1500 +[epoch 29/50] step=1520 train_loss=0.0164 tok_s=60101.2 opt_steps=1520 +[epoch 29/50] step=1540 train_loss=0.0164 tok_s=60106.0 opt_steps=1540 +[epoch 29/50] step=1560 train_loss=0.0164 tok_s=60108.8 opt_steps=1560 +[epoch 29/50] step=1580 train_loss=0.0164 tok_s=60110.4 opt_steps=1580 +[epoch 29/50] step=1600 train_loss=0.0164 tok_s=60109.7 opt_steps=1600 +[epoch 29/50] step=1620 train_loss=0.0164 tok_s=60111.8 opt_steps=1620 +[epoch 29/50] step=1640 train_loss=0.0165 tok_s=60113.3 opt_steps=1640 +[epoch 29/50] step=1660 train_loss=0.0165 tok_s=60110.6 opt_steps=1660 +[epoch 29/50] step=1680 train_loss=0.0165 tok_s=60108.9 opt_steps=1680 +[epoch 29/50] step=1700 train_loss=0.0165 tok_s=60107.0 opt_steps=1700 +[epoch 29/50] step=1720 train_loss=0.0165 tok_s=60104.4 opt_steps=1720 +[epoch 29/50] step=1740 train_loss=0.0166 tok_s=60102.7 opt_steps=1740 +[epoch 29/50] step=1760 train_loss=0.0166 tok_s=60104.8 opt_steps=1760 +[epoch 29/50] step=1780 train_loss=0.0166 tok_s=60103.4 opt_steps=1780 +[epoch 29/50] step=1800 train_loss=0.0166 tok_s=60107.6 opt_steps=1800 +[epoch 29/50] step=1820 train_loss=0.0166 tok_s=60108.7 opt_steps=1820 +[epoch 29/50] step=1840 train_loss=0.0166 tok_s=60111.8 opt_steps=1840 +[epoch 29/50] step=1860 train_loss=0.0166 tok_s=60113.4 opt_steps=1860 +[epoch 29/50] step=1880 train_loss=0.0166 tok_s=60118.0 opt_steps=1880 +[epoch 29/50] step=1900 train_loss=0.0167 tok_s=60119.1 opt_steps=1900 +[epoch 29/50] step=1920 train_loss=0.0167 tok_s=60115.0 opt_steps=1920 +[epoch 29/50] step=1940 train_loss=0.0167 tok_s=60114.5 opt_steps=1940 +[epoch 29/50] step=1960 train_loss=0.0167 tok_s=60119.1 opt_steps=1960 +[epoch 29/50] step=1980 train_loss=0.0168 tok_s=60114.7 opt_steps=1980 +[epoch 29/50] step=2000 train_loss=0.0168 tok_s=60115.0 opt_steps=2000 +[epoch 29/50] step=2020 train_loss=0.0168 tok_s=60119.5 opt_steps=2020 +[epoch 29/50] step=2040 train_loss=0.0168 tok_s=60122.2 opt_steps=2040 +[epoch 29/50] step=2060 train_loss=0.0168 tok_s=60121.1 opt_steps=2060 +[epoch 29/50] step=2080 train_loss=0.0168 tok_s=60120.0 opt_steps=2080 +[epoch 29/50] step=2100 train_loss=0.0169 tok_s=60117.4 opt_steps=2100 +[epoch 29/50] step=2120 train_loss=0.0169 tok_s=60114.1 opt_steps=2120 +[epoch 29/50] step=2140 train_loss=0.0169 tok_s=60114.5 opt_steps=2140 +[epoch 29/50] step=2160 train_loss=0.0169 tok_s=60116.3 opt_steps=2160 +[epoch 29/50] step=2180 train_loss=0.0169 tok_s=60117.2 opt_steps=2180 +[epoch 29/50] step=2200 train_loss=0.0170 tok_s=60116.4 opt_steps=2200 +[epoch 29/50] step=2220 train_loss=0.0170 tok_s=60116.9 opt_steps=2220 +[epoch 29/50] step=2240 train_loss=0.0170 tok_s=60119.0 opt_steps=2240 +[epoch 29/50] step=2260 train_loss=0.0170 tok_s=60121.4 opt_steps=2260 +[epoch 29/50] step=2280 train_loss=0.0170 tok_s=60121.6 opt_steps=2280 +[epoch 29/50] step=2300 train_loss=0.0170 tok_s=60119.6 opt_steps=2300 +[epoch 29/50] step=2320 train_loss=0.0170 tok_s=60122.1 opt_steps=2320 +[epoch 29/50] step=2340 train_loss=0.0171 tok_s=60122.4 opt_steps=2340 +[epoch 29/50] step=2360 train_loss=0.0171 tok_s=60118.8 opt_steps=2360 +[epoch 29/50] step=2380 train_loss=0.0171 tok_s=60113.2 opt_steps=2380 +[epoch 29/50] step=2400 train_loss=0.0171 tok_s=60115.7 opt_steps=2400 +[epoch 29/50] step=2420 train_loss=0.0171 tok_s=60112.0 opt_steps=2420 +[epoch 29/50] step=2440 train_loss=0.0171 tok_s=60112.4 opt_steps=2440 +[epoch 29/50] step=2460 train_loss=0.0171 tok_s=60113.1 opt_steps=2460 +[epoch 29/50] step=2480 train_loss=0.0172 tok_s=60109.5 opt_steps=2480 +[epoch 29/50] step=2500 train_loss=0.0172 tok_s=60109.2 opt_steps=2500 +[epoch 29/50] step=2520 train_loss=0.0172 tok_s=60105.0 opt_steps=2520 +[epoch 29/50] step=2540 train_loss=0.0172 tok_s=60106.9 opt_steps=2540 +[epoch 29/50] step=2560 train_loss=0.0172 tok_s=60104.9 opt_steps=2560 +[epoch 29/50] step=2580 train_loss=0.0172 tok_s=60105.1 opt_steps=2580 +[epoch 29/50] step=2600 train_loss=0.0172 tok_s=60105.1 opt_steps=2600 +[epoch 29/50] step=2620 train_loss=0.0173 tok_s=60106.8 opt_steps=2620 +[epoch 29/50] step=2640 train_loss=0.0173 tok_s=60109.8 opt_steps=2640 +[epoch 29/50] step=2660 train_loss=0.0173 tok_s=60110.6 opt_steps=2660 +[epoch 29/50] step=2680 train_loss=0.0173 tok_s=60111.8 opt_steps=2680 +[epoch 29/50] step=2700 train_loss=0.0173 tok_s=60111.6 opt_steps=2700 +[epoch 29/50] step=2720 train_loss=0.0173 tok_s=60114.6 opt_steps=2720 +[epoch 29/50] step=2740 train_loss=0.0173 tok_s=60115.8 opt_steps=2740 +[epoch 29/50] step=2760 train_loss=0.0173 tok_s=60116.6 opt_steps=2760 +[epoch 29/50] step=2780 train_loss=0.0174 tok_s=60116.2 opt_steps=2780 +[epoch 29/50] step=2800 train_loss=0.0174 tok_s=60119.6 opt_steps=2800 +[epoch 29/50] step=2820 train_loss=0.0174 tok_s=60122.9 opt_steps=2820 +[epoch 29/50] step=2840 train_loss=0.0174 tok_s=60126.3 opt_steps=2840 +[epoch 29/50] step=2860 train_loss=0.0174 tok_s=60125.3 opt_steps=2860 +[epoch 29/50] step=2880 train_loss=0.0174 tok_s=60127.0 opt_steps=2880 +[epoch 29/50] step=2900 train_loss=0.0174 tok_s=60126.4 opt_steps=2900 +[epoch 29/50] step=2920 train_loss=0.0174 tok_s=60125.4 opt_steps=2920 +[epoch 29/50] step=2940 train_loss=0.0175 tok_s=60130.0 opt_steps=2940 +[epoch 29/50] step=2960 train_loss=0.0175 tok_s=60130.3 opt_steps=2960 +[epoch 29/50] step=2980 train_loss=0.0175 tok_s=60130.6 opt_steps=2980 +[epoch 29/50] step=3000 train_loss=0.0175 tok_s=60132.5 opt_steps=3000 +[epoch 29/50] step=3020 train_loss=0.0175 tok_s=60132.6 opt_steps=3020 +[epoch 29/50] step=3040 train_loss=0.0175 tok_s=60134.4 opt_steps=3040 +[epoch 29/50] step=3060 train_loss=0.0175 tok_s=60131.9 opt_steps=3060 +[epoch 29/50] step=3080 train_loss=0.0175 tok_s=60132.6 opt_steps=3080 +[epoch 29/50] step=3100 train_loss=0.0176 tok_s=60129.3 opt_steps=3100 +[epoch 29/50] step=3120 train_loss=0.0176 tok_s=60130.8 opt_steps=3120 +[epoch 29/50] step=3140 train_loss=0.0176 tok_s=60133.3 opt_steps=3140 +[epoch 29/50] step=3160 train_loss=0.0176 tok_s=60132.2 opt_steps=3160 +[epoch 29/50] step=3180 train_loss=0.0176 tok_s=60130.5 opt_steps=3180 +[epoch 29/50] step=3200 train_loss=0.0176 tok_s=60132.9 opt_steps=3200 +[epoch 29/50] step=3220 train_loss=0.0176 tok_s=60134.5 opt_steps=3220 +[epoch 29/50] step=3240 train_loss=0.0176 tok_s=60134.4 opt_steps=3240 +[epoch 29/50] step=3260 train_loss=0.0176 tok_s=60130.9 opt_steps=3260 +[epoch 29/50] train_loss=0.0176 val_skipped tok_s=60130.0 opt_steps=3273 +[epoch 30/50] step=20 train_loss=0.0147 tok_s=57788.5 opt_steps=20 +[epoch 30/50] step=40 train_loss=0.0143 tok_s=59033.9 opt_steps=40 +[epoch 30/50] step=60 train_loss=0.0144 tok_s=59235.7 opt_steps=60 +[epoch 30/50] step=80 train_loss=0.0144 tok_s=59486.8 opt_steps=80 +[epoch 30/50] step=100 train_loss=0.0145 tok_s=59671.1 opt_steps=100 +[epoch 30/50] step=120 train_loss=0.0144 tok_s=59708.4 opt_steps=120 +[epoch 30/50] step=140 train_loss=0.0142 tok_s=59854.6 opt_steps=140 +[epoch 30/50] step=160 train_loss=0.0143 tok_s=59897.1 opt_steps=160 +[epoch 30/50] step=180 train_loss=0.0143 tok_s=59977.9 opt_steps=180 +[epoch 30/50] step=200 train_loss=0.0143 tok_s=59983.7 opt_steps=200 +[epoch 30/50] step=220 train_loss=0.0143 tok_s=59946.2 opt_steps=220 +[epoch 30/50] step=240 train_loss=0.0143 tok_s=59959.6 opt_steps=240 +[epoch 30/50] step=260 train_loss=0.0143 tok_s=59958.2 opt_steps=260 +[epoch 30/50] step=280 train_loss=0.0143 tok_s=59967.0 opt_steps=280 +[epoch 30/50] step=300 train_loss=0.0144 tok_s=59977.6 opt_steps=300 +[epoch 30/50] step=320 train_loss=0.0145 tok_s=60001.7 opt_steps=320 +[epoch 30/50] step=340 train_loss=0.0145 tok_s=59997.9 opt_steps=340 +[epoch 30/50] step=360 train_loss=0.0145 tok_s=60020.4 opt_steps=360 +[epoch 30/50] step=380 train_loss=0.0145 tok_s=60017.1 opt_steps=380 +[epoch 30/50] step=400 train_loss=0.0145 tok_s=60032.9 opt_steps=400 +[epoch 30/50] step=420 train_loss=0.0146 tok_s=60039.0 opt_steps=420 +[epoch 30/50] step=440 train_loss=0.0146 tok_s=60067.7 opt_steps=440 +[epoch 30/50] step=460 train_loss=0.0147 tok_s=60072.7 opt_steps=460 +[epoch 30/50] step=480 train_loss=0.0147 tok_s=60064.4 opt_steps=480 +[epoch 30/50] step=500 train_loss=0.0147 tok_s=60087.2 opt_steps=500 +[epoch 30/50] step=520 train_loss=0.0147 tok_s=60077.9 opt_steps=520 +[epoch 30/50] step=540 train_loss=0.0148 tok_s=60072.0 opt_steps=540 +[epoch 30/50] step=560 train_loss=0.0148 tok_s=60087.1 opt_steps=560 +[epoch 30/50] step=580 train_loss=0.0148 tok_s=60067.8 opt_steps=580 +[epoch 30/50] step=600 train_loss=0.0148 tok_s=60063.4 opt_steps=600 +[epoch 30/50] step=620 train_loss=0.0148 tok_s=60075.8 opt_steps=620 +[epoch 30/50] step=640 train_loss=0.0148 tok_s=60074.4 opt_steps=640 +[epoch 30/50] step=660 train_loss=0.0149 tok_s=60084.0 opt_steps=660 +[epoch 30/50] step=680 train_loss=0.0149 tok_s=60091.5 opt_steps=680 +[epoch 30/50] step=700 train_loss=0.0149 tok_s=60114.9 opt_steps=700 +[epoch 30/50] step=720 train_loss=0.0149 tok_s=60124.9 opt_steps=720 +[epoch 30/50] step=740 train_loss=0.0149 tok_s=60144.3 opt_steps=740 +[epoch 30/50] step=760 train_loss=0.0149 tok_s=60149.3 opt_steps=760 +[epoch 30/50] step=780 train_loss=0.0150 tok_s=60145.5 opt_steps=780 +[epoch 30/50] step=800 train_loss=0.0150 tok_s=60143.3 opt_steps=800 +[epoch 30/50] step=820 train_loss=0.0150 tok_s=60152.5 opt_steps=820 +[epoch 30/50] step=840 train_loss=0.0150 tok_s=60153.2 opt_steps=840 +[epoch 30/50] step=860 train_loss=0.0151 tok_s=60167.6 opt_steps=860 +[epoch 30/50] step=880 train_loss=0.0151 tok_s=60173.0 opt_steps=880 +[epoch 30/50] step=900 train_loss=0.0152 tok_s=60167.7 opt_steps=900 +[epoch 30/50] step=920 train_loss=0.0152 tok_s=60165.1 opt_steps=920 +[epoch 30/50] step=940 train_loss=0.0152 tok_s=60174.5 opt_steps=940 +[epoch 30/50] step=960 train_loss=0.0152 tok_s=60175.0 opt_steps=960 +[epoch 30/50] step=980 train_loss=0.0153 tok_s=60163.4 opt_steps=980 +[epoch 30/50] step=1000 train_loss=0.0153 tok_s=60160.0 opt_steps=1000 +[epoch 30/50] step=1020 train_loss=0.0153 tok_s=60154.9 opt_steps=1020 +[epoch 30/50] step=1040 train_loss=0.0153 tok_s=60148.2 opt_steps=1040 +[epoch 30/50] step=1060 train_loss=0.0153 tok_s=60144.2 opt_steps=1060 +[epoch 30/50] step=1080 train_loss=0.0153 tok_s=60140.6 opt_steps=1080 +[epoch 30/50] step=1100 train_loss=0.0154 tok_s=60139.9 opt_steps=1100 +[epoch 30/50] step=1120 train_loss=0.0154 tok_s=60143.8 opt_steps=1120 +[epoch 30/50] step=1140 train_loss=0.0154 tok_s=60155.4 opt_steps=1140 +[epoch 30/50] step=1160 train_loss=0.0154 tok_s=60157.6 opt_steps=1160 +[epoch 30/50] step=1180 train_loss=0.0155 tok_s=60166.6 opt_steps=1180 +[epoch 30/50] step=1200 train_loss=0.0155 tok_s=60162.1 opt_steps=1200 +[epoch 30/50] step=1220 train_loss=0.0155 tok_s=60156.8 opt_steps=1220 +[epoch 30/50] step=1240 train_loss=0.0155 tok_s=60153.5 opt_steps=1240 +[epoch 30/50] step=1260 train_loss=0.0155 tok_s=60147.3 opt_steps=1260 +[epoch 30/50] step=1280 train_loss=0.0156 tok_s=60151.4 opt_steps=1280 +[epoch 30/50] step=1300 train_loss=0.0156 tok_s=60152.4 opt_steps=1300 +[epoch 30/50] step=1320 train_loss=0.0156 tok_s=60149.4 opt_steps=1320 +[epoch 30/50] step=1340 train_loss=0.0156 tok_s=60150.8 opt_steps=1340 +[epoch 30/50] step=1360 train_loss=0.0156 tok_s=60153.9 opt_steps=1360 +[epoch 30/50] step=1380 train_loss=0.0157 tok_s=60153.3 opt_steps=1380 +[epoch 30/50] step=1400 train_loss=0.0157 tok_s=60154.2 opt_steps=1400 +[epoch 30/50] step=1420 train_loss=0.0157 tok_s=60156.5 opt_steps=1420 +[epoch 30/50] step=1440 train_loss=0.0157 tok_s=60157.5 opt_steps=1440 +[epoch 30/50] step=1460 train_loss=0.0157 tok_s=60158.5 opt_steps=1460 +[epoch 30/50] step=1480 train_loss=0.0157 tok_s=60161.8 opt_steps=1480 +[epoch 30/50] step=1500 train_loss=0.0158 tok_s=60161.7 opt_steps=1500 +[epoch 30/50] step=1520 train_loss=0.0158 tok_s=60159.9 opt_steps=1520 +[epoch 30/50] step=1540 train_loss=0.0158 tok_s=60162.5 opt_steps=1540 +[epoch 30/50] step=1560 train_loss=0.0158 tok_s=60166.6 opt_steps=1560 +[epoch 30/50] step=1580 train_loss=0.0159 tok_s=60164.7 opt_steps=1580 +[epoch 30/50] step=1600 train_loss=0.0159 tok_s=60166.9 opt_steps=1600 +[epoch 30/50] step=1620 train_loss=0.0159 tok_s=60172.4 opt_steps=1620 +[epoch 30/50] step=1640 train_loss=0.0159 tok_s=60175.0 opt_steps=1640 +[epoch 30/50] step=1660 train_loss=0.0159 tok_s=60172.7 opt_steps=1660 +[epoch 30/50] step=1680 train_loss=0.0160 tok_s=60175.2 opt_steps=1680 +[epoch 30/50] step=1700 train_loss=0.0160 tok_s=60173.0 opt_steps=1700 +[epoch 30/50] step=1720 train_loss=0.0160 tok_s=60176.8 opt_steps=1720 +[epoch 30/50] step=1740 train_loss=0.0160 tok_s=60175.9 opt_steps=1740 +[epoch 30/50] step=1760 train_loss=0.0160 tok_s=60173.5 opt_steps=1760 +[epoch 30/50] step=1780 train_loss=0.0161 tok_s=60174.2 opt_steps=1780 +[epoch 30/50] step=1800 train_loss=0.0161 tok_s=60175.4 opt_steps=1800 +[epoch 30/50] step=1820 train_loss=0.0161 tok_s=60169.5 opt_steps=1820 +[epoch 30/50] step=1840 train_loss=0.0161 tok_s=60171.8 opt_steps=1840 +[epoch 30/50] step=1860 train_loss=0.0161 tok_s=60170.7 opt_steps=1860 +[epoch 30/50] step=1880 train_loss=0.0162 tok_s=60170.7 opt_steps=1880 +[epoch 30/50] step=1900 train_loss=0.0162 tok_s=60170.2 opt_steps=1900 +[epoch 30/50] step=1920 train_loss=0.0162 tok_s=60175.5 opt_steps=1920 +[epoch 30/50] step=1940 train_loss=0.0162 tok_s=60175.2 opt_steps=1940 +[epoch 30/50] step=1960 train_loss=0.0162 tok_s=60174.0 opt_steps=1960 +[epoch 30/50] step=1980 train_loss=0.0162 tok_s=60177.0 opt_steps=1980 +[epoch 30/50] step=2000 train_loss=0.0163 tok_s=60175.3 opt_steps=2000 +[epoch 30/50] step=2020 train_loss=0.0163 tok_s=60176.7 opt_steps=2020 +[epoch 30/50] step=2040 train_loss=0.0163 tok_s=60177.3 opt_steps=2040 +[epoch 30/50] step=2060 train_loss=0.0163 tok_s=60176.8 opt_steps=2060 +[epoch 30/50] step=2080 train_loss=0.0163 tok_s=60175.1 opt_steps=2080 +[epoch 30/50] step=2100 train_loss=0.0163 tok_s=60174.6 opt_steps=2100 +[epoch 30/50] step=2120 train_loss=0.0163 tok_s=60171.1 opt_steps=2120 +[epoch 30/50] step=2140 train_loss=0.0163 tok_s=60170.0 opt_steps=2140 +[epoch 30/50] step=2160 train_loss=0.0164 tok_s=60168.9 opt_steps=2160 +[epoch 30/50] step=2180 train_loss=0.0164 tok_s=60170.1 opt_steps=2180 +[epoch 30/50] step=2200 train_loss=0.0164 tok_s=60169.4 opt_steps=2200 +[epoch 30/50] step=2220 train_loss=0.0164 tok_s=60167.3 opt_steps=2220 +[epoch 30/50] step=2240 train_loss=0.0164 tok_s=60164.7 opt_steps=2240 +[epoch 30/50] step=2260 train_loss=0.0164 tok_s=60164.3 opt_steps=2260 +[epoch 30/50] step=2280 train_loss=0.0164 tok_s=60162.7 opt_steps=2280 +[epoch 30/50] step=2300 train_loss=0.0165 tok_s=60161.0 opt_steps=2300 +[epoch 30/50] step=2320 train_loss=0.0165 tok_s=60162.8 opt_steps=2320 +[epoch 30/50] step=2340 train_loss=0.0165 tok_s=60162.3 opt_steps=2340 +[epoch 30/50] step=2360 train_loss=0.0165 tok_s=60165.0 opt_steps=2360 +[epoch 30/50] step=2380 train_loss=0.0165 tok_s=60168.2 opt_steps=2380 +[epoch 30/50] step=2400 train_loss=0.0165 tok_s=60166.4 opt_steps=2400 +[epoch 30/50] step=2420 train_loss=0.0166 tok_s=60165.2 opt_steps=2420 +[epoch 30/50] step=2440 train_loss=0.0166 tok_s=60164.6 opt_steps=2440 +[epoch 30/50] step=2460 train_loss=0.0166 tok_s=60159.8 opt_steps=2460 +[epoch 30/50] step=2480 train_loss=0.0166 tok_s=60158.4 opt_steps=2480 +[epoch 30/50] step=2500 train_loss=0.0166 tok_s=60159.6 opt_steps=2500 +[epoch 30/50] step=2520 train_loss=0.0166 tok_s=60158.7 opt_steps=2520 +[epoch 30/50] step=2540 train_loss=0.0166 tok_s=60142.9 opt_steps=2540 +[epoch 30/50] step=2560 train_loss=0.0167 tok_s=60147.8 opt_steps=2560 +[epoch 30/50] step=2580 train_loss=0.0167 tok_s=60148.5 opt_steps=2580 +[epoch 30/50] step=2600 train_loss=0.0167 tok_s=60150.1 opt_steps=2600 +[epoch 30/50] step=2620 train_loss=0.0167 tok_s=60146.3 opt_steps=2620 +[epoch 30/50] step=2640 train_loss=0.0167 tok_s=60145.4 opt_steps=2640 +[epoch 30/50] step=2660 train_loss=0.0167 tok_s=60144.4 opt_steps=2660 +[epoch 30/50] step=2680 train_loss=0.0167 tok_s=60147.1 opt_steps=2680 +[epoch 30/50] step=2700 train_loss=0.0167 tok_s=60145.3 opt_steps=2700 +[epoch 30/50] step=2720 train_loss=0.0167 tok_s=60145.7 opt_steps=2720 +[epoch 30/50] step=2740 train_loss=0.0168 tok_s=60148.4 opt_steps=2740 +[epoch 30/50] step=2760 train_loss=0.0168 tok_s=60147.8 opt_steps=2760 +[epoch 30/50] step=2780 train_loss=0.0168 tok_s=60144.9 opt_steps=2780 +[epoch 30/50] step=2800 train_loss=0.0168 tok_s=60140.9 opt_steps=2800 +[epoch 30/50] step=2820 train_loss=0.0168 tok_s=60142.0 opt_steps=2820 +[epoch 30/50] step=2840 train_loss=0.0168 tok_s=60140.4 opt_steps=2840 +[epoch 30/50] step=2860 train_loss=0.0168 tok_s=60135.9 opt_steps=2860 +[epoch 30/50] step=2880 train_loss=0.0168 tok_s=60135.3 opt_steps=2880 +[epoch 30/50] step=2900 train_loss=0.0168 tok_s=60134.0 opt_steps=2900 +[epoch 30/50] step=2920 train_loss=0.0169 tok_s=60131.0 opt_steps=2920 +[epoch 30/50] step=2940 train_loss=0.0169 tok_s=60129.4 opt_steps=2940 +[epoch 30/50] step=2960 train_loss=0.0169 tok_s=60130.2 opt_steps=2960 +[epoch 30/50] step=2980 train_loss=0.0169 tok_s=60130.3 opt_steps=2980 +[epoch 30/50] step=3000 train_loss=0.0169 tok_s=60127.4 opt_steps=3000 +[epoch 30/50] step=3020 train_loss=0.0169 tok_s=60125.1 opt_steps=3020 +[epoch 30/50] step=3040 train_loss=0.0169 tok_s=60125.7 opt_steps=3040 +[epoch 30/50] step=3060 train_loss=0.0169 tok_s=60124.1 opt_steps=3060 +[epoch 30/50] step=3080 train_loss=0.0169 tok_s=60119.3 opt_steps=3080 +[epoch 30/50] step=3100 train_loss=0.0170 tok_s=60119.2 opt_steps=3100 +[epoch 30/50] step=3120 train_loss=0.0170 tok_s=60118.3 opt_steps=3120 +[epoch 30/50] step=3140 train_loss=0.0170 tok_s=60118.2 opt_steps=3140 +[epoch 30/50] step=3160 train_loss=0.0170 tok_s=60119.6 opt_steps=3160 +[epoch 30/50] step=3180 train_loss=0.0170 tok_s=60118.3 opt_steps=3180 +[epoch 30/50] step=3200 train_loss=0.0170 tok_s=60119.2 opt_steps=3200 +[epoch 30/50] step=3220 train_loss=0.0170 tok_s=60117.1 opt_steps=3220 +[epoch 30/50] step=3240 train_loss=0.0170 tok_s=60118.7 opt_steps=3240 +[epoch 30/50] step=3260 train_loss=0.0170 tok_s=60115.7 opt_steps=3260 +[epoch 30/50] train_loss=0.0171 val_skipped tok_s=60116.7 opt_steps=3273 +[epoch 31/50] step=20 train_loss=0.0137 tok_s=58049.6 opt_steps=20 +[epoch 31/50] step=40 train_loss=0.0136 tok_s=59194.9 opt_steps=40 +[epoch 31/50] step=60 train_loss=0.0134 tok_s=59620.7 opt_steps=60 +[epoch 31/50] step=80 train_loss=0.0134 tok_s=59801.5 opt_steps=80 +[epoch 31/50] step=100 train_loss=0.0135 tok_s=59922.3 opt_steps=100 +[epoch 31/50] step=120 train_loss=0.0135 tok_s=59880.2 opt_steps=120 +[epoch 31/50] step=140 train_loss=0.0135 tok_s=59907.2 opt_steps=140 +[epoch 31/50] step=160 train_loss=0.0135 tok_s=59937.9 opt_steps=160 +[epoch 31/50] step=180 train_loss=0.0135 tok_s=59985.3 opt_steps=180 +[epoch 31/50] step=200 train_loss=0.0136 tok_s=60003.6 opt_steps=200 +[epoch 31/50] step=220 train_loss=0.0136 tok_s=60014.1 opt_steps=220 +[epoch 31/50] step=240 train_loss=0.0136 tok_s=60027.3 opt_steps=240 +[epoch 31/50] step=260 train_loss=0.0136 tok_s=60009.7 opt_steps=260 +[epoch 31/50] step=280 train_loss=0.0137 tok_s=60038.0 opt_steps=280 +[epoch 31/50] step=300 train_loss=0.0138 tok_s=60051.8 opt_steps=300 +[epoch 31/50] step=320 train_loss=0.0138 tok_s=60063.4 opt_steps=320 +[epoch 31/50] step=340 train_loss=0.0139 tok_s=60042.0 opt_steps=340 +[epoch 31/50] step=360 train_loss=0.0139 tok_s=60008.4 opt_steps=360 +[epoch 31/50] step=380 train_loss=0.0139 tok_s=59918.2 opt_steps=380 +[epoch 31/50] step=400 train_loss=0.0140 tok_s=59869.5 opt_steps=400 +[epoch 31/50] step=420 train_loss=0.0140 tok_s=59824.5 opt_steps=420 +[epoch 31/50] step=440 train_loss=0.0140 tok_s=59778.3 opt_steps=440 +[epoch 31/50] step=460 train_loss=0.0141 tok_s=59698.0 opt_steps=460 +[epoch 31/50] step=480 train_loss=0.0141 tok_s=59679.7 opt_steps=480 +[epoch 31/50] step=500 train_loss=0.0142 tok_s=59693.6 opt_steps=500 +[epoch 31/50] step=520 train_loss=0.0143 tok_s=59698.5 opt_steps=520 +[epoch 31/50] step=540 train_loss=0.0143 tok_s=59701.6 opt_steps=540 +[epoch 31/50] step=560 train_loss=0.0143 tok_s=59716.0 opt_steps=560 +[epoch 31/50] step=580 train_loss=0.0143 tok_s=59726.6 opt_steps=580 +[epoch 31/50] step=600 train_loss=0.0143 tok_s=59743.6 opt_steps=600 +[epoch 31/50] step=620 train_loss=0.0143 tok_s=59775.0 opt_steps=620 +[epoch 31/50] step=640 train_loss=0.0144 tok_s=59771.7 opt_steps=640 +[epoch 31/50] step=660 train_loss=0.0144 tok_s=59787.3 opt_steps=660 +[epoch 31/50] step=680 train_loss=0.0144 tok_s=59792.2 opt_steps=680 +[epoch 31/50] step=700 train_loss=0.0144 tok_s=59805.2 opt_steps=700 +[epoch 31/50] step=720 train_loss=0.0144 tok_s=59803.7 opt_steps=720 +[epoch 31/50] step=740 train_loss=0.0145 tok_s=59808.4 opt_steps=740 +[epoch 31/50] step=760 train_loss=0.0145 tok_s=59809.9 opt_steps=760 +[epoch 31/50] step=780 train_loss=0.0145 tok_s=59814.9 opt_steps=780 +[epoch 31/50] step=800 train_loss=0.0146 tok_s=59810.4 opt_steps=800 +[epoch 31/50] step=820 train_loss=0.0146 tok_s=59814.2 opt_steps=820 +[epoch 31/50] step=840 train_loss=0.0146 tok_s=59804.4 opt_steps=840 +[epoch 31/50] step=860 train_loss=0.0146 tok_s=59810.0 opt_steps=860 +[epoch 31/50] step=880 train_loss=0.0147 tok_s=59822.0 opt_steps=880 +[epoch 31/50] step=900 train_loss=0.0147 tok_s=59830.3 opt_steps=900 +[epoch 31/50] step=920 train_loss=0.0147 tok_s=59847.3 opt_steps=920 +[epoch 31/50] step=940 train_loss=0.0148 tok_s=59850.7 opt_steps=940 +[epoch 31/50] step=960 train_loss=0.0148 tok_s=59861.4 opt_steps=960 +[epoch 31/50] step=980 train_loss=0.0148 tok_s=59876.0 opt_steps=980 +[epoch 31/50] step=1000 train_loss=0.0148 tok_s=59879.7 opt_steps=1000 +[epoch 31/50] step=1020 train_loss=0.0149 tok_s=59881.9 opt_steps=1020 +[epoch 31/50] step=1040 train_loss=0.0149 tok_s=59887.9 opt_steps=1040 +[epoch 31/50] step=1060 train_loss=0.0149 tok_s=59889.4 opt_steps=1060 +[epoch 31/50] step=1080 train_loss=0.0149 tok_s=59897.0 opt_steps=1080 +[epoch 31/50] step=1100 train_loss=0.0150 tok_s=59905.4 opt_steps=1100 +[epoch 31/50] step=1120 train_loss=0.0150 tok_s=59905.4 opt_steps=1120 +[epoch 31/50] step=1140 train_loss=0.0150 tok_s=59914.9 opt_steps=1140 +[epoch 31/50] step=1160 train_loss=0.0150 tok_s=59918.2 opt_steps=1160 +[epoch 31/50] step=1180 train_loss=0.0150 tok_s=59922.2 opt_steps=1180 +[epoch 31/50] step=1200 train_loss=0.0150 tok_s=59935.3 opt_steps=1200 +[epoch 31/50] step=1220 train_loss=0.0151 tok_s=59940.7 opt_steps=1220 +[epoch 31/50] step=1240 train_loss=0.0151 tok_s=59948.7 opt_steps=1240 +[epoch 31/50] step=1260 train_loss=0.0151 tok_s=59954.2 opt_steps=1260 +[epoch 31/50] step=1280 train_loss=0.0151 tok_s=59958.3 opt_steps=1280 +[epoch 31/50] step=1300 train_loss=0.0151 tok_s=59960.9 opt_steps=1300 +[epoch 31/50] step=1320 train_loss=0.0151 tok_s=59960.2 opt_steps=1320 +[epoch 31/50] step=1340 train_loss=0.0151 tok_s=59966.0 opt_steps=1340 +[epoch 31/50] step=1360 train_loss=0.0152 tok_s=59978.0 opt_steps=1360 +[epoch 31/50] step=1380 train_loss=0.0152 tok_s=59978.9 opt_steps=1380 +[epoch 31/50] step=1400 train_loss=0.0152 tok_s=59981.4 opt_steps=1400 +[epoch 31/50] step=1420 train_loss=0.0152 tok_s=59987.4 opt_steps=1420 +[epoch 31/50] step=1440 train_loss=0.0152 tok_s=59993.5 opt_steps=1440 +[epoch 31/50] step=1460 train_loss=0.0152 tok_s=60002.5 opt_steps=1460 +[epoch 31/50] step=1480 train_loss=0.0153 tok_s=60011.3 opt_steps=1480 +[epoch 31/50] step=1500 train_loss=0.0153 tok_s=60014.8 opt_steps=1500 +[epoch 31/50] step=1520 train_loss=0.0153 tok_s=60018.2 opt_steps=1520 +[epoch 31/50] step=1540 train_loss=0.0153 tok_s=60020.9 opt_steps=1540 +[epoch 31/50] step=1560 train_loss=0.0153 tok_s=60029.3 opt_steps=1560 +[epoch 31/50] step=1580 train_loss=0.0153 tok_s=60033.5 opt_steps=1580 +[epoch 31/50] step=1600 train_loss=0.0154 tok_s=60034.3 opt_steps=1600 +[epoch 31/50] step=1620 train_loss=0.0154 tok_s=60033.6 opt_steps=1620 +[epoch 31/50] step=1640 train_loss=0.0154 tok_s=60032.4 opt_steps=1640 +[epoch 31/50] step=1660 train_loss=0.0154 tok_s=60035.7 opt_steps=1660 +[epoch 31/50] step=1680 train_loss=0.0154 tok_s=60037.3 opt_steps=1680 +[epoch 31/50] step=1700 train_loss=0.0154 tok_s=60040.2 opt_steps=1700 +[epoch 31/50] step=1720 train_loss=0.0154 tok_s=60048.1 opt_steps=1720 +[epoch 31/50] step=1740 train_loss=0.0154 tok_s=60046.7 opt_steps=1740 +[epoch 31/50] step=1760 train_loss=0.0155 tok_s=60046.5 opt_steps=1760 +[epoch 31/50] step=1780 train_loss=0.0155 tok_s=60046.0 opt_steps=1780 +[epoch 31/50] step=1800 train_loss=0.0155 tok_s=60047.6 opt_steps=1800 +[epoch 31/50] step=1820 train_loss=0.0155 tok_s=60051.5 opt_steps=1820 +[epoch 31/50] step=1840 train_loss=0.0155 tok_s=60048.9 opt_steps=1840 +[epoch 31/50] step=1860 train_loss=0.0155 tok_s=60051.8 opt_steps=1860 +[epoch 31/50] step=1880 train_loss=0.0156 tok_s=60057.5 opt_steps=1880 +[epoch 31/50] step=1900 train_loss=0.0156 tok_s=60060.2 opt_steps=1900 +[epoch 31/50] step=1920 train_loss=0.0156 tok_s=60062.4 opt_steps=1920 +[epoch 31/50] step=1940 train_loss=0.0156 tok_s=60062.5 opt_steps=1940 +[epoch 31/50] step=1960 train_loss=0.0156 tok_s=60061.9 opt_steps=1960 +[epoch 31/50] step=1980 train_loss=0.0156 tok_s=60058.5 opt_steps=1980 +[epoch 31/50] step=2000 train_loss=0.0157 tok_s=60054.7 opt_steps=2000 +[epoch 31/50] step=2020 train_loss=0.0157 tok_s=60063.2 opt_steps=2020 +[epoch 31/50] step=2040 train_loss=0.0157 tok_s=60061.5 opt_steps=2040 +[epoch 31/50] step=2060 train_loss=0.0157 tok_s=60063.7 opt_steps=2060 +[epoch 31/50] step=2080 train_loss=0.0157 tok_s=60065.7 opt_steps=2080 +[epoch 31/50] step=2100 train_loss=0.0157 tok_s=60068.5 opt_steps=2100 +[epoch 31/50] step=2120 train_loss=0.0157 tok_s=60068.2 opt_steps=2120 +[epoch 31/50] step=2140 train_loss=0.0157 tok_s=60069.7 opt_steps=2140 +[epoch 31/50] step=2160 train_loss=0.0158 tok_s=60068.8 opt_steps=2160 +[epoch 31/50] step=2180 train_loss=0.0158 tok_s=60071.7 opt_steps=2180 +[epoch 31/50] step=2200 train_loss=0.0158 tok_s=60071.1 opt_steps=2200 +[epoch 31/50] step=2220 train_loss=0.0158 tok_s=60071.7 opt_steps=2220 +[epoch 31/50] step=2240 train_loss=0.0158 tok_s=60073.8 opt_steps=2240 +[epoch 31/50] step=2260 train_loss=0.0158 tok_s=60071.0 opt_steps=2260 +[epoch 31/50] step=2280 train_loss=0.0158 tok_s=60069.1 opt_steps=2280 +[epoch 31/50] step=2300 train_loss=0.0159 tok_s=60063.8 opt_steps=2300 +[epoch 31/50] step=2320 train_loss=0.0159 tok_s=60064.3 opt_steps=2320 +[epoch 31/50] step=2340 train_loss=0.0159 tok_s=60066.1 opt_steps=2340 +[epoch 31/50] step=2360 train_loss=0.0159 tok_s=60066.5 opt_steps=2360 +[epoch 31/50] step=2380 train_loss=0.0159 tok_s=60068.7 opt_steps=2380 +[epoch 31/50] step=2400 train_loss=0.0159 tok_s=60069.2 opt_steps=2400 +[epoch 31/50] step=2420 train_loss=0.0159 tok_s=60071.2 opt_steps=2420 +[epoch 31/50] step=2440 train_loss=0.0159 tok_s=60069.4 opt_steps=2440 +[epoch 31/50] step=2460 train_loss=0.0160 tok_s=60069.6 opt_steps=2460 +[epoch 31/50] step=2480 train_loss=0.0160 tok_s=60072.1 opt_steps=2480 +[epoch 31/50] step=2500 train_loss=0.0160 tok_s=60072.8 opt_steps=2500 +[epoch 31/50] step=2520 train_loss=0.0160 tok_s=60075.2 opt_steps=2520 +[epoch 31/50] step=2540 train_loss=0.0160 tok_s=60077.8 opt_steps=2540 +[epoch 31/50] step=2560 train_loss=0.0160 tok_s=60077.9 opt_steps=2560 +[epoch 31/50] step=2580 train_loss=0.0160 tok_s=60079.8 opt_steps=2580 +[epoch 31/50] step=2600 train_loss=0.0161 tok_s=60080.0 opt_steps=2600 +[epoch 31/50] step=2620 train_loss=0.0161 tok_s=60085.5 opt_steps=2620 +[epoch 31/50] step=2640 train_loss=0.0161 tok_s=60085.1 opt_steps=2640 +[epoch 31/50] step=2660 train_loss=0.0161 tok_s=60091.2 opt_steps=2660 +[epoch 31/50] step=2680 train_loss=0.0161 tok_s=60092.3 opt_steps=2680 +[epoch 31/50] step=2700 train_loss=0.0161 tok_s=60089.3 opt_steps=2700 +[epoch 31/50] step=2720 train_loss=0.0161 tok_s=60093.8 opt_steps=2720 +[epoch 31/50] step=2740 train_loss=0.0161 tok_s=60094.1 opt_steps=2740 +[epoch 31/50] step=2760 train_loss=0.0161 tok_s=60093.1 opt_steps=2760 +[epoch 31/50] step=2780 train_loss=0.0162 tok_s=60096.5 opt_steps=2780 +[epoch 31/50] step=2800 train_loss=0.0162 tok_s=60097.1 opt_steps=2800 +[epoch 31/50] step=2820 train_loss=0.0162 tok_s=60096.4 opt_steps=2820 +[epoch 31/50] step=2840 train_loss=0.0162 tok_s=60099.5 opt_steps=2840 +[epoch 31/50] step=2860 train_loss=0.0162 tok_s=60100.3 opt_steps=2860 +[epoch 31/50] step=2880 train_loss=0.0162 tok_s=60100.7 opt_steps=2880 +[epoch 31/50] step=2900 train_loss=0.0162 tok_s=60103.1 opt_steps=2900 +[epoch 31/50] step=2920 train_loss=0.0162 tok_s=60102.4 opt_steps=2920 +[epoch 31/50] step=2940 train_loss=0.0163 tok_s=60105.4 opt_steps=2940 +[epoch 31/50] step=2960 train_loss=0.0163 tok_s=60104.9 opt_steps=2960 +[epoch 31/50] step=2980 train_loss=0.0163 tok_s=60102.3 opt_steps=2980 +[epoch 31/50] step=3000 train_loss=0.0163 tok_s=60100.8 opt_steps=3000 +[epoch 31/50] step=3020 train_loss=0.0163 tok_s=60100.9 opt_steps=3020 +[epoch 31/50] step=3040 train_loss=0.0163 tok_s=60102.2 opt_steps=3040 +[epoch 31/50] step=3060 train_loss=0.0163 tok_s=60097.9 opt_steps=3060 +[epoch 31/50] step=3080 train_loss=0.0164 tok_s=60093.6 opt_steps=3080 +[epoch 31/50] step=3100 train_loss=0.0164 tok_s=60092.1 opt_steps=3100 +[epoch 31/50] step=3120 train_loss=0.0164 tok_s=60093.1 opt_steps=3120 +[epoch 31/50] step=3140 train_loss=0.0164 tok_s=60096.0 opt_steps=3140 +[epoch 31/50] step=3160 train_loss=0.0164 tok_s=60098.3 opt_steps=3160 +[epoch 31/50] step=3180 train_loss=0.0164 tok_s=60096.8 opt_steps=3180 +[epoch 31/50] step=3200 train_loss=0.0164 tok_s=60100.3 opt_steps=3200 +[epoch 31/50] step=3220 train_loss=0.0164 tok_s=60101.2 opt_steps=3220 +[epoch 31/50] step=3240 train_loss=0.0164 tok_s=60100.2 opt_steps=3240 +[epoch 31/50] step=3260 train_loss=0.0164 tok_s=60095.8 opt_steps=3260 +[epoch 31/50] train_loss=0.0165 val_skipped tok_s=60096.5 opt_steps=3273 +[epoch 32/50] step=20 train_loss=0.0135 tok_s=58012.2 opt_steps=20 +[epoch 32/50] step=40 train_loss=0.0135 tok_s=58607.2 opt_steps=40 +[epoch 32/50] step=60 train_loss=0.0135 tok_s=58936.9 opt_steps=60 +[epoch 32/50] step=80 train_loss=0.0134 tok_s=59115.2 opt_steps=80 +[epoch 32/50] step=100 train_loss=0.0134 tok_s=59145.2 opt_steps=100 +[epoch 32/50] step=120 train_loss=0.0134 tok_s=59199.8 opt_steps=120 +[epoch 32/50] step=140 train_loss=0.0133 tok_s=59219.0 opt_steps=140 +[epoch 32/50] step=160 train_loss=0.0134 tok_s=59254.7 opt_steps=160 +[epoch 32/50] step=180 train_loss=0.0133 tok_s=59267.0 opt_steps=180 +[epoch 32/50] step=200 train_loss=0.0133 tok_s=59300.8 opt_steps=200 +[epoch 32/50] step=220 train_loss=0.0133 tok_s=59335.2 opt_steps=220 +[epoch 32/50] step=240 train_loss=0.0134 tok_s=59351.2 opt_steps=240 +[epoch 32/50] step=260 train_loss=0.0134 tok_s=59429.4 opt_steps=260 +[epoch 32/50] step=280 train_loss=0.0134 tok_s=59435.7 opt_steps=280 +[epoch 32/50] step=300 train_loss=0.0135 tok_s=59449.8 opt_steps=300 +[epoch 32/50] step=320 train_loss=0.0135 tok_s=59461.1 opt_steps=320 +[epoch 32/50] step=340 train_loss=0.0135 tok_s=59487.3 opt_steps=340 +[epoch 32/50] step=360 train_loss=0.0136 tok_s=59490.0 opt_steps=360 +[epoch 32/50] step=380 train_loss=0.0136 tok_s=59514.7 opt_steps=380 +[epoch 32/50] step=400 train_loss=0.0136 tok_s=59526.7 opt_steps=400 +[epoch 32/50] step=420 train_loss=0.0136 tok_s=59537.3 opt_steps=420 +[epoch 32/50] step=440 train_loss=0.0136 tok_s=59543.0 opt_steps=440 +[epoch 32/50] step=460 train_loss=0.0136 tok_s=59584.6 opt_steps=460 +[epoch 32/50] step=480 train_loss=0.0137 tok_s=59584.3 opt_steps=480 +[epoch 32/50] step=500 train_loss=0.0137 tok_s=59586.5 opt_steps=500 +[epoch 32/50] step=520 train_loss=0.0137 tok_s=59590.6 opt_steps=520 +[epoch 32/50] step=540 train_loss=0.0137 tok_s=59608.2 opt_steps=540 +[epoch 32/50] step=560 train_loss=0.0138 tok_s=59620.3 opt_steps=560 +[epoch 32/50] step=580 train_loss=0.0138 tok_s=59641.0 opt_steps=580 +[epoch 32/50] step=600 train_loss=0.0138 tok_s=59656.5 opt_steps=600 +[epoch 32/50] step=620 train_loss=0.0139 tok_s=59684.5 opt_steps=620 +[epoch 32/50] step=640 train_loss=0.0139 tok_s=59703.9 opt_steps=640 +[epoch 32/50] step=660 train_loss=0.0139 tok_s=59721.2 opt_steps=660 +[epoch 32/50] step=680 train_loss=0.0139 tok_s=59723.5 opt_steps=680 +[epoch 32/50] step=700 train_loss=0.0140 tok_s=59733.5 opt_steps=700 +[epoch 32/50] step=720 train_loss=0.0140 tok_s=59754.7 opt_steps=720 +[epoch 32/50] step=740 train_loss=0.0140 tok_s=59766.0 opt_steps=740 +[epoch 32/50] step=760 train_loss=0.0140 tok_s=59774.3 opt_steps=760 +[epoch 32/50] step=780 train_loss=0.0141 tok_s=59781.7 opt_steps=780 +[epoch 32/50] step=800 train_loss=0.0142 tok_s=59790.9 opt_steps=800 +[epoch 32/50] step=820 train_loss=0.0142 tok_s=59805.8 opt_steps=820 +[epoch 32/50] step=840 train_loss=0.0143 tok_s=59809.9 opt_steps=840 +[epoch 32/50] step=860 train_loss=0.0143 tok_s=59827.8 opt_steps=860 +[epoch 32/50] step=880 train_loss=0.0143 tok_s=59841.0 opt_steps=880 +[epoch 32/50] step=900 train_loss=0.0143 tok_s=59854.0 opt_steps=900 +[epoch 32/50] step=920 train_loss=0.0144 tok_s=59859.3 opt_steps=920 +[epoch 32/50] step=940 train_loss=0.0144 tok_s=59861.5 opt_steps=940 +[epoch 32/50] step=960 train_loss=0.0144 tok_s=59858.1 opt_steps=960 +[epoch 32/50] step=980 train_loss=0.0144 tok_s=59873.1 opt_steps=980 +[epoch 32/50] step=1000 train_loss=0.0144 tok_s=59877.6 opt_steps=1000 +[epoch 32/50] step=1020 train_loss=0.0145 tok_s=59889.9 opt_steps=1020 +[epoch 32/50] step=1040 train_loss=0.0145 tok_s=59900.2 opt_steps=1040 +[epoch 32/50] step=1060 train_loss=0.0145 tok_s=59907.1 opt_steps=1060 +[epoch 32/50] step=1080 train_loss=0.0145 tok_s=59914.2 opt_steps=1080 +[epoch 32/50] step=1100 train_loss=0.0145 tok_s=59923.5 opt_steps=1100 +[epoch 32/50] step=1120 train_loss=0.0145 tok_s=59924.6 opt_steps=1120 +[epoch 32/50] step=1140 train_loss=0.0146 tok_s=59922.0 opt_steps=1140 +[epoch 32/50] step=1160 train_loss=0.0146 tok_s=59911.2 opt_steps=1160 +[epoch 32/50] step=1180 train_loss=0.0146 tok_s=59912.1 opt_steps=1180 +[epoch 32/50] step=1200 train_loss=0.0146 tok_s=59917.5 opt_steps=1200 +[epoch 32/50] step=1220 train_loss=0.0147 tok_s=59921.9 opt_steps=1220 +[epoch 32/50] step=1240 train_loss=0.0147 tok_s=59917.2 opt_steps=1240 +[epoch 32/50] step=1260 train_loss=0.0147 tok_s=59924.9 opt_steps=1260 +[epoch 32/50] step=1280 train_loss=0.0147 tok_s=59932.2 opt_steps=1280 +[epoch 32/50] step=1300 train_loss=0.0147 tok_s=59939.8 opt_steps=1300 +[epoch 32/50] step=1320 train_loss=0.0147 tok_s=59942.1 opt_steps=1320 +[epoch 32/50] step=1340 train_loss=0.0148 tok_s=59942.2 opt_steps=1340 +[epoch 32/50] step=1360 train_loss=0.0148 tok_s=59945.3 opt_steps=1360 +[epoch 32/50] step=1380 train_loss=0.0148 tok_s=59952.5 opt_steps=1380 +[epoch 32/50] step=1400 train_loss=0.0148 tok_s=59955.4 opt_steps=1400 +[epoch 32/50] step=1420 train_loss=0.0148 tok_s=59959.3 opt_steps=1420 +[epoch 32/50] step=1440 train_loss=0.0148 tok_s=59964.2 opt_steps=1440 +[epoch 32/50] step=1460 train_loss=0.0148 tok_s=59968.0 opt_steps=1460 +[epoch 32/50] step=1480 train_loss=0.0149 tok_s=59963.2 opt_steps=1480 +[epoch 32/50] step=1500 train_loss=0.0149 tok_s=59967.7 opt_steps=1500 +[epoch 32/50] step=1520 train_loss=0.0149 tok_s=59964.5 opt_steps=1520 +[epoch 32/50] step=1540 train_loss=0.0149 tok_s=59963.4 opt_steps=1540 +[epoch 32/50] step=1560 train_loss=0.0149 tok_s=59971.7 opt_steps=1560 +[epoch 32/50] step=1580 train_loss=0.0150 tok_s=59968.0 opt_steps=1580 +[epoch 32/50] step=1600 train_loss=0.0150 tok_s=59973.1 opt_steps=1600 +[epoch 32/50] step=1620 train_loss=0.0150 tok_s=59979.7 opt_steps=1620 +[epoch 32/50] step=1640 train_loss=0.0150 tok_s=59977.5 opt_steps=1640 +[epoch 32/50] step=1660 train_loss=0.0150 tok_s=59983.0 opt_steps=1660 +[epoch 32/50] step=1680 train_loss=0.0150 tok_s=59986.5 opt_steps=1680 +[epoch 32/50] step=1700 train_loss=0.0150 tok_s=59983.1 opt_steps=1700 +[epoch 32/50] step=1720 train_loss=0.0150 tok_s=59983.3 opt_steps=1720 +[epoch 32/50] step=1740 train_loss=0.0150 tok_s=59986.1 opt_steps=1740 +[epoch 32/50] step=1760 train_loss=0.0150 tok_s=59989.2 opt_steps=1760 +[epoch 32/50] step=1780 train_loss=0.0151 tok_s=59990.3 opt_steps=1780 +[epoch 32/50] step=1800 train_loss=0.0151 tok_s=59984.5 opt_steps=1800 +[epoch 32/50] step=1820 train_loss=0.0151 tok_s=59983.9 opt_steps=1820 +[epoch 32/50] step=1840 train_loss=0.0151 tok_s=59989.7 opt_steps=1840 +[epoch 32/50] step=1860 train_loss=0.0151 tok_s=59984.3 opt_steps=1860 +[epoch 32/50] step=1880 train_loss=0.0151 tok_s=59980.7 opt_steps=1880 +[epoch 32/50] step=1900 train_loss=0.0152 tok_s=59981.1 opt_steps=1900 +[epoch 32/50] step=1920 train_loss=0.0152 tok_s=59987.5 opt_steps=1920 +[epoch 32/50] step=1940 train_loss=0.0152 tok_s=59992.8 opt_steps=1940 +[epoch 32/50] step=1960 train_loss=0.0152 tok_s=59994.5 opt_steps=1960 +[epoch 32/50] step=1980 train_loss=0.0152 tok_s=59998.8 opt_steps=1980 +[epoch 32/50] step=2000 train_loss=0.0152 tok_s=60001.7 opt_steps=2000 +[epoch 32/50] step=2020 train_loss=0.0152 tok_s=60006.0 opt_steps=2020 +[epoch 32/50] step=2040 train_loss=0.0152 tok_s=60011.5 opt_steps=2040 +[epoch 32/50] step=2060 train_loss=0.0153 tok_s=60011.3 opt_steps=2060 +[epoch 32/50] step=2080 train_loss=0.0153 tok_s=60014.2 opt_steps=2080 +[epoch 32/50] step=2100 train_loss=0.0153 tok_s=60014.4 opt_steps=2100 +[epoch 32/50] step=2120 train_loss=0.0153 tok_s=60017.4 opt_steps=2120 +[epoch 32/50] step=2140 train_loss=0.0153 tok_s=60012.4 opt_steps=2140 +[epoch 32/50] step=2160 train_loss=0.0153 tok_s=60013.4 opt_steps=2160 +[epoch 32/50] step=2180 train_loss=0.0153 tok_s=60011.3 opt_steps=2180 +[epoch 32/50] step=2200 train_loss=0.0154 tok_s=60011.2 opt_steps=2200 +[epoch 32/50] step=2220 train_loss=0.0154 tok_s=60009.8 opt_steps=2220 +[epoch 32/50] step=2240 train_loss=0.0154 tok_s=60014.0 opt_steps=2240 +[epoch 32/50] step=2260 train_loss=0.0154 tok_s=60013.3 opt_steps=2260 +[epoch 32/50] step=2280 train_loss=0.0154 tok_s=60013.4 opt_steps=2280 +[epoch 32/50] step=2300 train_loss=0.0154 tok_s=60009.6 opt_steps=2300 +[epoch 32/50] step=2320 train_loss=0.0154 tok_s=60013.4 opt_steps=2320 +[epoch 32/50] step=2340 train_loss=0.0155 tok_s=60014.8 opt_steps=2340 +[epoch 32/50] step=2360 train_loss=0.0155 tok_s=60014.2 opt_steps=2360 +[epoch 32/50] step=2380 train_loss=0.0155 tok_s=60018.2 opt_steps=2380 +[epoch 32/50] step=2400 train_loss=0.0155 tok_s=60019.0 opt_steps=2400 +[epoch 32/50] step=2420 train_loss=0.0155 tok_s=60016.2 opt_steps=2420 +[epoch 32/50] step=2440 train_loss=0.0155 tok_s=60015.7 opt_steps=2440 +[epoch 32/50] step=2460 train_loss=0.0155 tok_s=60018.3 opt_steps=2460 +[epoch 32/50] step=2480 train_loss=0.0155 tok_s=60018.0 opt_steps=2480 +[epoch 32/50] step=2500 train_loss=0.0156 tok_s=60021.5 opt_steps=2500 +[epoch 32/50] step=2520 train_loss=0.0156 tok_s=60027.0 opt_steps=2520 +[epoch 32/50] step=2540 train_loss=0.0156 tok_s=60027.6 opt_steps=2540 +[epoch 32/50] step=2560 train_loss=0.0156 tok_s=60032.6 opt_steps=2560 +[epoch 32/50] step=2580 train_loss=0.0157 tok_s=60032.5 opt_steps=2580 +[epoch 32/50] step=2600 train_loss=0.0157 tok_s=60035.2 opt_steps=2600 +[epoch 32/50] step=2620 train_loss=0.0157 tok_s=60035.1 opt_steps=2620 +[epoch 32/50] step=2640 train_loss=0.0157 tok_s=60036.5 opt_steps=2640 +[epoch 32/50] step=2660 train_loss=0.0157 tok_s=60037.4 opt_steps=2660 +[epoch 32/50] step=2680 train_loss=0.0157 tok_s=60038.4 opt_steps=2680 +[epoch 32/50] step=2700 train_loss=0.0157 tok_s=60040.8 opt_steps=2700 +[epoch 32/50] step=2720 train_loss=0.0157 tok_s=60040.0 opt_steps=2720 +[epoch 32/50] step=2740 train_loss=0.0157 tok_s=60041.6 opt_steps=2740 +[epoch 32/50] step=2760 train_loss=0.0157 tok_s=60039.0 opt_steps=2760 +[epoch 32/50] step=2780 train_loss=0.0158 tok_s=60035.5 opt_steps=2780 +[epoch 32/50] step=2800 train_loss=0.0158 tok_s=60031.0 opt_steps=2800 +[epoch 32/50] step=2820 train_loss=0.0158 tok_s=60023.8 opt_steps=2820 +[epoch 32/50] step=2840 train_loss=0.0158 tok_s=60012.4 opt_steps=2840 +[epoch 32/50] step=2860 train_loss=0.0158 tok_s=60012.7 opt_steps=2860 +[epoch 32/50] step=2880 train_loss=0.0158 tok_s=60014.8 opt_steps=2880 +[epoch 32/50] step=2900 train_loss=0.0158 tok_s=60015.5 opt_steps=2900 +[epoch 32/50] step=2920 train_loss=0.0158 tok_s=60016.2 opt_steps=2920 +[epoch 32/50] step=2940 train_loss=0.0158 tok_s=60014.7 opt_steps=2940 +[epoch 32/50] step=2960 train_loss=0.0159 tok_s=60016.8 opt_steps=2960 +[epoch 32/50] step=2980 train_loss=0.0159 tok_s=60017.2 opt_steps=2980 +[epoch 32/50] step=3000 train_loss=0.0159 tok_s=60013.7 opt_steps=3000 +[epoch 32/50] step=3020 train_loss=0.0159 tok_s=60016.1 opt_steps=3020 +[epoch 32/50] step=3040 train_loss=0.0159 tok_s=60014.9 opt_steps=3040 +[epoch 32/50] step=3060 train_loss=0.0159 tok_s=60016.7 opt_steps=3060 +[epoch 32/50] step=3080 train_loss=0.0159 tok_s=60014.6 opt_steps=3080 +[epoch 32/50] step=3100 train_loss=0.0159 tok_s=60016.4 opt_steps=3100 +[epoch 32/50] step=3120 train_loss=0.0159 tok_s=60013.5 opt_steps=3120 +[epoch 32/50] step=3140 train_loss=0.0159 tok_s=60018.4 opt_steps=3140 +[epoch 32/50] step=3160 train_loss=0.0160 tok_s=60020.5 opt_steps=3160 +[epoch 32/50] step=3180 train_loss=0.0160 tok_s=60021.1 opt_steps=3180 +[epoch 32/50] step=3200 train_loss=0.0160 tok_s=60020.8 opt_steps=3200 +[epoch 32/50] step=3220 train_loss=0.0160 tok_s=60022.0 opt_steps=3220 +[epoch 32/50] step=3240 train_loss=0.0160 tok_s=60024.1 opt_steps=3240 +[epoch 32/50] step=3260 train_loss=0.0160 tok_s=60021.0 opt_steps=3260 +[epoch 32/50] train_loss=0.0160 val_skipped tok_s=60023.6 opt_steps=3273 +[epoch 33/50] step=20 train_loss=0.0128 tok_s=58603.1 opt_steps=20 +[epoch 33/50] step=40 train_loss=0.0129 tok_s=59280.4 opt_steps=40 +[epoch 33/50] step=60 train_loss=0.0130 tok_s=59714.7 opt_steps=60 +[epoch 33/50] step=80 train_loss=0.0129 tok_s=59842.1 opt_steps=80 +[epoch 33/50] step=100 train_loss=0.0128 tok_s=59991.6 opt_steps=100 +[epoch 33/50] step=120 train_loss=0.0127 tok_s=60069.3 opt_steps=120 +[epoch 33/50] step=140 train_loss=0.0126 tok_s=60073.2 opt_steps=140 +[epoch 33/50] step=160 train_loss=0.0126 tok_s=60124.2 opt_steps=160 +[epoch 33/50] step=180 train_loss=0.0127 tok_s=60140.8 opt_steps=180 +[epoch 33/50] step=200 train_loss=0.0127 tok_s=60178.7 opt_steps=200 +[epoch 33/50] step=220 train_loss=0.0127 tok_s=60200.8 opt_steps=220 +[epoch 33/50] step=240 train_loss=0.0128 tok_s=60212.6 opt_steps=240 +[epoch 33/50] step=260 train_loss=0.0128 tok_s=60235.2 opt_steps=260 +[epoch 33/50] step=280 train_loss=0.0129 tok_s=60227.1 opt_steps=280 +[epoch 33/50] step=300 train_loss=0.0129 tok_s=60212.3 opt_steps=300 +[epoch 33/50] step=320 train_loss=0.0129 tok_s=60225.5 opt_steps=320 +[epoch 33/50] step=340 train_loss=0.0130 tok_s=60229.5 opt_steps=340 +[epoch 33/50] step=360 train_loss=0.0131 tok_s=60238.4 opt_steps=360 +[epoch 33/50] step=380 train_loss=0.0131 tok_s=60213.6 opt_steps=380 +[epoch 33/50] step=400 train_loss=0.0131 tok_s=60214.1 opt_steps=400 +[epoch 33/50] step=420 train_loss=0.0131 tok_s=60216.2 opt_steps=420 +[epoch 33/50] step=440 train_loss=0.0131 tok_s=60219.1 opt_steps=440 +[epoch 33/50] step=460 train_loss=0.0131 tok_s=60223.1 opt_steps=460 +[epoch 33/50] step=480 train_loss=0.0131 tok_s=60229.6 opt_steps=480 +[epoch 33/50] step=500 train_loss=0.0132 tok_s=60234.6 opt_steps=500 +[epoch 33/50] step=520 train_loss=0.0132 tok_s=60229.4 opt_steps=520 +[epoch 33/50] step=540 train_loss=0.0132 tok_s=60223.1 opt_steps=540 +[epoch 33/50] step=560 train_loss=0.0132 tok_s=60230.9 opt_steps=560 +[epoch 33/50] step=580 train_loss=0.0132 tok_s=60232.8 opt_steps=580 +[epoch 33/50] step=600 train_loss=0.0133 tok_s=60230.1 opt_steps=600 +[epoch 33/50] step=620 train_loss=0.0133 tok_s=60234.5 opt_steps=620 +[epoch 33/50] step=640 train_loss=0.0133 tok_s=60254.3 opt_steps=640 +[epoch 33/50] step=660 train_loss=0.0133 tok_s=60253.8 opt_steps=660 +[epoch 33/50] step=680 train_loss=0.0133 tok_s=60271.9 opt_steps=680 +[epoch 33/50] step=700 train_loss=0.0134 tok_s=60253.1 opt_steps=700 +[epoch 33/50] step=720 train_loss=0.0134 tok_s=60246.2 opt_steps=720 +[epoch 33/50] step=740 train_loss=0.0134 tok_s=60225.8 opt_steps=740 +[epoch 33/50] step=760 train_loss=0.0134 tok_s=60214.4 opt_steps=760 +[epoch 33/50] step=780 train_loss=0.0135 tok_s=60203.6 opt_steps=780 +[epoch 33/50] step=800 train_loss=0.0135 tok_s=60188.7 opt_steps=800 +[epoch 33/50] step=820 train_loss=0.0135 tok_s=60175.3 opt_steps=820 +[epoch 33/50] step=840 train_loss=0.0136 tok_s=60155.2 opt_steps=840 +[epoch 33/50] step=860 train_loss=0.0136 tok_s=60161.7 opt_steps=860 +[epoch 33/50] step=880 train_loss=0.0136 tok_s=60165.2 opt_steps=880 +[epoch 33/50] step=900 train_loss=0.0136 tok_s=60166.0 opt_steps=900 +[epoch 33/50] step=920 train_loss=0.0137 tok_s=60173.2 opt_steps=920 +[epoch 33/50] step=940 train_loss=0.0137 tok_s=60172.3 opt_steps=940 +[epoch 33/50] step=960 train_loss=0.0137 tok_s=60165.4 opt_steps=960 +[epoch 33/50] step=980 train_loss=0.0137 tok_s=60156.8 opt_steps=980 +[epoch 33/50] step=1000 train_loss=0.0137 tok_s=60138.6 opt_steps=1000 +[epoch 33/50] step=1020 train_loss=0.0138 tok_s=60136.6 opt_steps=1020 +[epoch 33/50] step=1040 train_loss=0.0138 tok_s=60139.9 opt_steps=1040 +[epoch 33/50] step=1060 train_loss=0.0138 tok_s=60147.3 opt_steps=1060 +[epoch 33/50] step=1080 train_loss=0.0138 tok_s=60138.3 opt_steps=1080 +[epoch 33/50] step=1100 train_loss=0.0138 tok_s=60133.5 opt_steps=1100 +[epoch 33/50] step=1120 train_loss=0.0138 tok_s=60134.5 opt_steps=1120 +[epoch 33/50] step=1140 train_loss=0.0138 tok_s=60130.9 opt_steps=1140 +[epoch 33/50] step=1160 train_loss=0.0139 tok_s=60129.5 opt_steps=1160 +[epoch 33/50] step=1180 train_loss=0.0139 tok_s=60126.8 opt_steps=1180 +[epoch 33/50] step=1200 train_loss=0.0139 tok_s=60131.7 opt_steps=1200 +[epoch 33/50] step=1220 train_loss=0.0140 tok_s=60133.4 opt_steps=1220 +[epoch 33/50] step=1240 train_loss=0.0140 tok_s=60138.0 opt_steps=1240 +[epoch 33/50] step=1260 train_loss=0.0140 tok_s=60136.0 opt_steps=1260 +[epoch 33/50] step=1280 train_loss=0.0140 tok_s=60142.0 opt_steps=1280 +[epoch 33/50] step=1300 train_loss=0.0141 tok_s=60137.4 opt_steps=1300 +[epoch 33/50] step=1320 train_loss=0.0141 tok_s=60136.8 opt_steps=1320 +[epoch 33/50] step=1340 train_loss=0.0141 tok_s=60131.4 opt_steps=1340 +[epoch 33/50] step=1360 train_loss=0.0141 tok_s=60131.2 opt_steps=1360 +[epoch 33/50] step=1380 train_loss=0.0141 tok_s=60130.4 opt_steps=1380 +[epoch 33/50] step=1400 train_loss=0.0142 tok_s=60127.3 opt_steps=1400 +[epoch 33/50] step=1420 train_loss=0.0142 tok_s=60131.8 opt_steps=1420 +[epoch 33/50] step=1440 train_loss=0.0142 tok_s=60129.8 opt_steps=1440 +[epoch 33/50] step=1460 train_loss=0.0142 tok_s=60122.0 opt_steps=1460 +[epoch 33/50] step=1480 train_loss=0.0143 tok_s=60119.4 opt_steps=1480 +[epoch 33/50] step=1500 train_loss=0.0143 tok_s=60119.2 opt_steps=1500 +[epoch 33/50] step=1520 train_loss=0.0143 tok_s=60125.1 opt_steps=1520 +[epoch 33/50] step=1540 train_loss=0.0143 tok_s=60112.1 opt_steps=1540 +[epoch 33/50] step=1560 train_loss=0.0143 tok_s=60110.3 opt_steps=1560 +[epoch 33/50] step=1580 train_loss=0.0143 tok_s=60114.0 opt_steps=1580 +[epoch 33/50] step=1600 train_loss=0.0144 tok_s=60113.7 opt_steps=1600 +[epoch 33/50] step=1620 train_loss=0.0144 tok_s=60110.5 opt_steps=1620 +[epoch 33/50] step=1640 train_loss=0.0144 tok_s=60105.8 opt_steps=1640 +[epoch 33/50] step=1660 train_loss=0.0144 tok_s=60113.8 opt_steps=1660 +[epoch 33/50] step=1680 train_loss=0.0144 tok_s=60115.1 opt_steps=1680 +[epoch 33/50] step=1700 train_loss=0.0145 tok_s=60103.1 opt_steps=1700 +[epoch 33/50] step=1720 train_loss=0.0145 tok_s=60101.7 opt_steps=1720 +[epoch 33/50] step=1740 train_loss=0.0145 tok_s=60102.0 opt_steps=1740 +[epoch 33/50] step=1760 train_loss=0.0145 tok_s=60104.8 opt_steps=1760 +[epoch 33/50] step=1780 train_loss=0.0145 tok_s=60109.8 opt_steps=1780 +[epoch 33/50] step=1800 train_loss=0.0145 tok_s=60108.3 opt_steps=1800 +[epoch 33/50] step=1820 train_loss=0.0146 tok_s=60107.9 opt_steps=1820 +[epoch 33/50] step=1840 train_loss=0.0146 tok_s=60112.9 opt_steps=1840 +[epoch 33/50] step=1860 train_loss=0.0146 tok_s=60108.9 opt_steps=1860 +[epoch 33/50] step=1880 train_loss=0.0146 tok_s=60106.8 opt_steps=1880 +[epoch 33/50] step=1900 train_loss=0.0146 tok_s=60105.8 opt_steps=1900 +[epoch 33/50] step=1920 train_loss=0.0146 tok_s=60103.7 opt_steps=1920 +[epoch 33/50] step=1940 train_loss=0.0146 tok_s=60102.5 opt_steps=1940 +[epoch 33/50] step=1960 train_loss=0.0147 tok_s=60098.2 opt_steps=1960 +[epoch 33/50] step=1980 train_loss=0.0147 tok_s=60099.7 opt_steps=1980 +[epoch 33/50] step=2000 train_loss=0.0147 tok_s=60097.6 opt_steps=2000 +[epoch 33/50] step=2020 train_loss=0.0147 tok_s=60092.6 opt_steps=2020 +[epoch 33/50] step=2040 train_loss=0.0147 tok_s=60092.5 opt_steps=2040 +[epoch 33/50] step=2060 train_loss=0.0147 tok_s=60097.6 opt_steps=2060 +[epoch 33/50] step=2080 train_loss=0.0148 tok_s=60101.5 opt_steps=2080 +[epoch 33/50] step=2100 train_loss=0.0148 tok_s=60103.2 opt_steps=2100 +[epoch 33/50] step=2120 train_loss=0.0148 tok_s=60104.9 opt_steps=2120 +[epoch 33/50] step=2140 train_loss=0.0148 tok_s=60102.9 opt_steps=2140 +[epoch 33/50] step=2160 train_loss=0.0148 tok_s=60100.8 opt_steps=2160 +[epoch 33/50] step=2180 train_loss=0.0148 tok_s=60106.5 opt_steps=2180 +[epoch 33/50] step=2200 train_loss=0.0148 tok_s=60108.4 opt_steps=2200 +[epoch 33/50] step=2220 train_loss=0.0148 tok_s=60105.5 opt_steps=2220 +[epoch 33/50] step=2240 train_loss=0.0149 tok_s=60099.4 opt_steps=2240 +[epoch 33/50] step=2260 train_loss=0.0149 tok_s=60101.2 opt_steps=2260 +[epoch 33/50] step=2280 train_loss=0.0149 tok_s=60098.9 opt_steps=2280 +[epoch 33/50] step=2300 train_loss=0.0149 tok_s=60091.0 opt_steps=2300 +[epoch 33/50] step=2320 train_loss=0.0149 tok_s=60087.9 opt_steps=2320 +[epoch 33/50] step=2340 train_loss=0.0149 tok_s=60086.0 opt_steps=2340 +[epoch 33/50] step=2360 train_loss=0.0149 tok_s=60088.3 opt_steps=2360 +[epoch 33/50] step=2380 train_loss=0.0149 tok_s=60091.9 opt_steps=2380 +[epoch 33/50] step=2400 train_loss=0.0150 tok_s=60091.6 opt_steps=2400 +[epoch 33/50] step=2420 train_loss=0.0150 tok_s=60091.9 opt_steps=2420 +[epoch 33/50] step=2440 train_loss=0.0150 tok_s=60094.0 opt_steps=2440 +[epoch 33/50] step=2460 train_loss=0.0150 tok_s=60094.0 opt_steps=2460 +[epoch 33/50] step=2480 train_loss=0.0150 tok_s=60090.7 opt_steps=2480 +[epoch 33/50] step=2500 train_loss=0.0150 tok_s=60090.9 opt_steps=2500 +[epoch 33/50] step=2520 train_loss=0.0150 tok_s=60093.6 opt_steps=2520 +[epoch 33/50] step=2540 train_loss=0.0150 tok_s=60098.0 opt_steps=2540 +[epoch 33/50] step=2560 train_loss=0.0151 tok_s=60098.5 opt_steps=2560 +[epoch 33/50] step=2580 train_loss=0.0151 tok_s=60094.6 opt_steps=2580 +[epoch 33/50] step=2600 train_loss=0.0151 tok_s=60096.2 opt_steps=2600 +[epoch 33/50] step=2620 train_loss=0.0151 tok_s=60097.0 opt_steps=2620 +[epoch 33/50] step=2640 train_loss=0.0151 tok_s=60093.6 opt_steps=2640 +[epoch 33/50] step=2660 train_loss=0.0151 tok_s=60095.0 opt_steps=2660 +[epoch 33/50] step=2680 train_loss=0.0151 tok_s=60094.3 opt_steps=2680 +[epoch 33/50] step=2700 train_loss=0.0151 tok_s=60092.5 opt_steps=2700 +[epoch 33/50] step=2720 train_loss=0.0151 tok_s=60093.7 opt_steps=2720 +[epoch 33/50] step=2740 train_loss=0.0152 tok_s=60095.4 opt_steps=2740 +[epoch 33/50] step=2760 train_loss=0.0152 tok_s=60091.4 opt_steps=2760 +[epoch 33/50] step=2780 train_loss=0.0152 tok_s=60090.6 opt_steps=2780 +[epoch 33/50] step=2800 train_loss=0.0152 tok_s=60090.9 opt_steps=2800 +[epoch 33/50] step=2820 train_loss=0.0152 tok_s=60091.2 opt_steps=2820 +[epoch 33/50] step=2840 train_loss=0.0152 tok_s=60088.1 opt_steps=2840 +[epoch 33/50] step=2860 train_loss=0.0152 tok_s=60091.8 opt_steps=2860 +[epoch 33/50] step=2880 train_loss=0.0152 tok_s=60092.3 opt_steps=2880 +[epoch 33/50] step=2900 train_loss=0.0152 tok_s=60089.6 opt_steps=2900 +[epoch 33/50] step=2920 train_loss=0.0153 tok_s=60090.8 opt_steps=2920 +[epoch 33/50] step=2940 train_loss=0.0153 tok_s=60092.4 opt_steps=2940 +[epoch 33/50] step=2960 train_loss=0.0153 tok_s=60096.4 opt_steps=2960 +[epoch 33/50] step=2980 train_loss=0.0153 tok_s=60099.8 opt_steps=2980 +[epoch 33/50] step=3000 train_loss=0.0153 tok_s=60099.6 opt_steps=3000 +[epoch 33/50] step=3020 train_loss=0.0153 tok_s=60101.5 opt_steps=3020 +[epoch 33/50] step=3040 train_loss=0.0153 tok_s=60104.1 opt_steps=3040 +[epoch 33/50] step=3060 train_loss=0.0153 tok_s=60103.2 opt_steps=3060 +[epoch 33/50] step=3080 train_loss=0.0153 tok_s=60102.5 opt_steps=3080 +[epoch 33/50] step=3100 train_loss=0.0154 tok_s=60102.5 opt_steps=3100 +[epoch 33/50] step=3120 train_loss=0.0154 tok_s=60103.1 opt_steps=3120 +[epoch 33/50] step=3140 train_loss=0.0154 tok_s=60109.0 opt_steps=3140 +[epoch 33/50] step=3160 train_loss=0.0154 tok_s=60110.5 opt_steps=3160 +[epoch 33/50] step=3180 train_loss=0.0154 tok_s=60113.0 opt_steps=3180 +[epoch 33/50] step=3200 train_loss=0.0154 tok_s=60113.9 opt_steps=3200 +[epoch 33/50] step=3220 train_loss=0.0154 tok_s=60115.4 opt_steps=3220 +[epoch 33/50] step=3240 train_loss=0.0154 tok_s=60114.3 opt_steps=3240 +[epoch 33/50] step=3260 train_loss=0.0154 tok_s=60111.3 opt_steps=3260 +[epoch 33/50] train_loss=0.0155 val_skipped tok_s=60114.0 opt_steps=3273 +[epoch 34/50] step=20 train_loss=0.0131 tok_s=58356.3 opt_steps=20 +[epoch 34/50] step=40 train_loss=0.0129 tok_s=59252.4 opt_steps=40 +[epoch 34/50] step=60 train_loss=0.0127 tok_s=59476.6 opt_steps=60 +[epoch 34/50] step=80 train_loss=0.0124 tok_s=59663.6 opt_steps=80 +[epoch 34/50] step=100 train_loss=0.0125 tok_s=59858.6 opt_steps=100 +[epoch 34/50] step=120 train_loss=0.0124 tok_s=59955.2 opt_steps=120 +[epoch 34/50] step=140 train_loss=0.0124 tok_s=60015.7 opt_steps=140 +[epoch 34/50] step=160 train_loss=0.0124 tok_s=60018.2 opt_steps=160 +[epoch 34/50] step=180 train_loss=0.0124 tok_s=60064.9 opt_steps=180 +[epoch 34/50] step=200 train_loss=0.0125 tok_s=60062.8 opt_steps=200 +[epoch 34/50] step=220 train_loss=0.0125 tok_s=60101.1 opt_steps=220 +[epoch 34/50] step=240 train_loss=0.0125 tok_s=60139.4 opt_steps=240 +[epoch 34/50] step=260 train_loss=0.0125 tok_s=60157.3 opt_steps=260 +[epoch 34/50] step=280 train_loss=0.0125 tok_s=60158.4 opt_steps=280 +[epoch 34/50] step=300 train_loss=0.0125 tok_s=60177.8 opt_steps=300 +[epoch 34/50] step=320 train_loss=0.0125 tok_s=60168.3 opt_steps=320 +[epoch 34/50] step=340 train_loss=0.0126 tok_s=60167.1 opt_steps=340 +[epoch 34/50] step=360 train_loss=0.0126 tok_s=60136.4 opt_steps=360 +[epoch 34/50] step=380 train_loss=0.0127 tok_s=60139.3 opt_steps=380 +[epoch 34/50] step=400 train_loss=0.0127 tok_s=60135.9 opt_steps=400 +[epoch 34/50] step=420 train_loss=0.0127 tok_s=60118.7 opt_steps=420 +[epoch 34/50] step=440 train_loss=0.0127 tok_s=60097.1 opt_steps=440 +[epoch 34/50] step=460 train_loss=0.0127 tok_s=60075.5 opt_steps=460 +[epoch 34/50] step=480 train_loss=0.0127 tok_s=60075.7 opt_steps=480 +[epoch 34/50] step=500 train_loss=0.0128 tok_s=60078.0 opt_steps=500 +[epoch 34/50] step=520 train_loss=0.0128 tok_s=60083.0 opt_steps=520 +[epoch 34/50] step=540 train_loss=0.0128 tok_s=60085.7 opt_steps=540 +[epoch 34/50] step=560 train_loss=0.0129 tok_s=60097.5 opt_steps=560 +[epoch 34/50] step=580 train_loss=0.0129 tok_s=60111.1 opt_steps=580 +[epoch 34/50] step=600 train_loss=0.0129 tok_s=60108.7 opt_steps=600 +[epoch 34/50] step=620 train_loss=0.0129 tok_s=60117.4 opt_steps=620 +[epoch 34/50] step=640 train_loss=0.0130 tok_s=60112.0 opt_steps=640 +[epoch 34/50] step=660 train_loss=0.0130 tok_s=60115.3 opt_steps=660 +[epoch 34/50] step=680 train_loss=0.0130 tok_s=60122.8 opt_steps=680 +[epoch 34/50] step=700 train_loss=0.0130 tok_s=60130.4 opt_steps=700 +[epoch 34/50] step=720 train_loss=0.0130 tok_s=60113.2 opt_steps=720 +[epoch 34/50] step=740 train_loss=0.0130 tok_s=60131.9 opt_steps=740 +[epoch 34/50] step=760 train_loss=0.0131 tok_s=60139.6 opt_steps=760 +[epoch 34/50] step=780 train_loss=0.0131 tok_s=60140.8 opt_steps=780 +[epoch 34/50] step=800 train_loss=0.0131 tok_s=60142.7 opt_steps=800 +[epoch 34/50] step=820 train_loss=0.0132 tok_s=60133.9 opt_steps=820 +[epoch 34/50] step=840 train_loss=0.0132 tok_s=60130.8 opt_steps=840 +[epoch 34/50] step=860 train_loss=0.0132 tok_s=60128.0 opt_steps=860 +[epoch 34/50] step=880 train_loss=0.0133 tok_s=60118.6 opt_steps=880 +[epoch 34/50] step=900 train_loss=0.0133 tok_s=60124.1 opt_steps=900 +[epoch 34/50] step=920 train_loss=0.0133 tok_s=60124.9 opt_steps=920 +[epoch 34/50] step=940 train_loss=0.0133 tok_s=60119.4 opt_steps=940 +[epoch 34/50] step=960 train_loss=0.0133 tok_s=60117.8 opt_steps=960 +[epoch 34/50] step=980 train_loss=0.0133 tok_s=60123.2 opt_steps=980 +[epoch 34/50] step=1000 train_loss=0.0134 tok_s=60110.0 opt_steps=1000 +[epoch 34/50] step=1020 train_loss=0.0134 tok_s=60100.8 opt_steps=1020 +[epoch 34/50] step=1040 train_loss=0.0134 tok_s=60098.8 opt_steps=1040 +[epoch 34/50] step=1060 train_loss=0.0134 tok_s=60093.5 opt_steps=1060 +[epoch 34/50] step=1080 train_loss=0.0134 tok_s=60098.3 opt_steps=1080 +[epoch 34/50] step=1100 train_loss=0.0134 tok_s=60099.8 opt_steps=1100 +[epoch 34/50] step=1120 train_loss=0.0135 tok_s=60103.4 opt_steps=1120 +[epoch 34/50] step=1140 train_loss=0.0135 tok_s=60103.6 opt_steps=1140 +[epoch 34/50] step=1160 train_loss=0.0135 tok_s=60094.8 opt_steps=1160 +[epoch 34/50] step=1180 train_loss=0.0135 tok_s=60083.6 opt_steps=1180 +[epoch 34/50] step=1200 train_loss=0.0136 tok_s=60077.9 opt_steps=1200 +[epoch 34/50] step=1220 train_loss=0.0136 tok_s=60068.3 opt_steps=1220 +[epoch 34/50] step=1240 train_loss=0.0136 tok_s=60072.2 opt_steps=1240 +[epoch 34/50] step=1260 train_loss=0.0136 tok_s=60069.8 opt_steps=1260 +[epoch 34/50] step=1280 train_loss=0.0136 tok_s=60066.8 opt_steps=1280 +[epoch 34/50] step=1300 train_loss=0.0137 tok_s=60060.6 opt_steps=1300 +[epoch 34/50] step=1320 train_loss=0.0137 tok_s=60058.6 opt_steps=1320 +[epoch 34/50] step=1340 train_loss=0.0137 tok_s=60048.7 opt_steps=1340 +[epoch 34/50] step=1360 train_loss=0.0137 tok_s=60052.3 opt_steps=1360 +[epoch 34/50] step=1380 train_loss=0.0137 tok_s=60053.1 opt_steps=1380 +[epoch 34/50] step=1400 train_loss=0.0137 tok_s=60051.0 opt_steps=1400 +[epoch 34/50] step=1420 train_loss=0.0138 tok_s=60051.7 opt_steps=1420 +[epoch 34/50] step=1440 train_loss=0.0138 tok_s=60055.8 opt_steps=1440 +[epoch 34/50] step=1460 train_loss=0.0138 tok_s=60047.5 opt_steps=1460 +[epoch 34/50] step=1480 train_loss=0.0138 tok_s=60046.0 opt_steps=1480 +[epoch 34/50] step=1500 train_loss=0.0138 tok_s=60049.2 opt_steps=1500 +[epoch 34/50] step=1520 train_loss=0.0138 tok_s=60046.1 opt_steps=1520 +[epoch 34/50] step=1540 train_loss=0.0139 tok_s=60044.1 opt_steps=1540 +[epoch 34/50] step=1560 train_loss=0.0139 tok_s=60042.8 opt_steps=1560 +[epoch 34/50] step=1580 train_loss=0.0139 tok_s=60037.3 opt_steps=1580 +[epoch 34/50] step=1600 train_loss=0.0139 tok_s=60034.2 opt_steps=1600 +[epoch 34/50] step=1620 train_loss=0.0139 tok_s=60031.6 opt_steps=1620 +[epoch 34/50] step=1640 train_loss=0.0139 tok_s=60029.5 opt_steps=1640 +[epoch 34/50] step=1660 train_loss=0.0140 tok_s=60020.4 opt_steps=1660 +[epoch 34/50] step=1680 train_loss=0.0140 tok_s=60023.7 opt_steps=1680 +[epoch 34/50] step=1700 train_loss=0.0140 tok_s=60021.6 opt_steps=1700 +[epoch 34/50] step=1720 train_loss=0.0140 tok_s=60015.2 opt_steps=1720 +[epoch 34/50] step=1740 train_loss=0.0140 tok_s=60015.0 opt_steps=1740 +[epoch 34/50] step=1760 train_loss=0.0140 tok_s=60015.8 opt_steps=1760 +[epoch 34/50] step=1780 train_loss=0.0140 tok_s=59992.5 opt_steps=1780 +[epoch 34/50] step=1800 train_loss=0.0140 tok_s=59995.0 opt_steps=1800 +[epoch 34/50] step=1820 train_loss=0.0141 tok_s=59995.9 opt_steps=1820 +[epoch 34/50] step=1840 train_loss=0.0141 tok_s=59994.9 opt_steps=1840 +[epoch 34/50] step=1860 train_loss=0.0141 tok_s=59998.3 opt_steps=1860 +[epoch 34/50] step=1880 train_loss=0.0141 tok_s=60006.3 opt_steps=1880 +[epoch 34/50] step=1900 train_loss=0.0141 tok_s=60005.9 opt_steps=1900 +[epoch 34/50] step=1920 train_loss=0.0141 tok_s=59996.5 opt_steps=1920 +[epoch 34/50] step=1940 train_loss=0.0141 tok_s=59990.1 opt_steps=1940 +[epoch 34/50] step=1960 train_loss=0.0142 tok_s=59979.2 opt_steps=1960 +[epoch 34/50] step=1980 train_loss=0.0142 tok_s=59974.5 opt_steps=1980 +[epoch 34/50] step=2000 train_loss=0.0142 tok_s=59975.0 opt_steps=2000 +[epoch 34/50] step=2020 train_loss=0.0142 tok_s=59974.3 opt_steps=2020 +[epoch 34/50] step=2040 train_loss=0.0142 tok_s=59974.5 opt_steps=2040 +[epoch 34/50] step=2060 train_loss=0.0142 tok_s=59976.5 opt_steps=2060 +[epoch 34/50] step=2080 train_loss=0.0143 tok_s=59981.1 opt_steps=2080 +[epoch 34/50] step=2100 train_loss=0.0143 tok_s=59983.4 opt_steps=2100 +[epoch 34/50] step=2120 train_loss=0.0143 tok_s=59974.9 opt_steps=2120 +[epoch 34/50] step=2140 train_loss=0.0143 tok_s=59972.6 opt_steps=2140 +[epoch 34/50] step=2160 train_loss=0.0143 tok_s=59968.1 opt_steps=2160 +[epoch 34/50] step=2180 train_loss=0.0143 tok_s=59969.9 opt_steps=2180 +[epoch 34/50] step=2200 train_loss=0.0143 tok_s=59967.5 opt_steps=2200 +[epoch 34/50] step=2220 train_loss=0.0143 tok_s=59969.6 opt_steps=2220 +[epoch 34/50] step=2240 train_loss=0.0144 tok_s=59969.0 opt_steps=2240 +[epoch 34/50] step=2260 train_loss=0.0144 tok_s=59967.5 opt_steps=2260 +[epoch 34/50] step=2280 train_loss=0.0144 tok_s=59966.5 opt_steps=2280 +[epoch 34/50] step=2300 train_loss=0.0144 tok_s=59964.8 opt_steps=2300 +[epoch 34/50] step=2320 train_loss=0.0144 tok_s=59969.2 opt_steps=2320 +[epoch 34/50] step=2340 train_loss=0.0144 tok_s=59974.8 opt_steps=2340 +[epoch 34/50] step=2360 train_loss=0.0144 tok_s=59976.6 opt_steps=2360 +[epoch 34/50] step=2380 train_loss=0.0144 tok_s=59976.4 opt_steps=2380 +[epoch 34/50] step=2400 train_loss=0.0145 tok_s=59972.0 opt_steps=2400 +[epoch 34/50] step=2420 train_loss=0.0145 tok_s=59958.8 opt_steps=2420 +[epoch 34/50] step=2440 train_loss=0.0145 tok_s=59944.6 opt_steps=2440 +[epoch 34/50] step=2460 train_loss=0.0145 tok_s=59930.7 opt_steps=2460 +[epoch 34/50] step=2480 train_loss=0.0145 tok_s=59921.8 opt_steps=2480 +[epoch 34/50] step=2500 train_loss=0.0145 tok_s=59923.8 opt_steps=2500 +[epoch 34/50] step=2520 train_loss=0.0145 tok_s=59927.3 opt_steps=2520 +[epoch 34/50] step=2540 train_loss=0.0145 tok_s=59932.5 opt_steps=2540 +[epoch 34/50] step=2560 train_loss=0.0145 tok_s=59934.5 opt_steps=2560 +[epoch 34/50] step=2580 train_loss=0.0146 tok_s=59937.2 opt_steps=2580 +[epoch 34/50] step=2600 train_loss=0.0146 tok_s=59938.7 opt_steps=2600 +[epoch 34/50] step=2620 train_loss=0.0146 tok_s=59939.8 opt_steps=2620 +[epoch 34/50] step=2640 train_loss=0.0146 tok_s=59938.4 opt_steps=2640 +[epoch 34/50] step=2660 train_loss=0.0146 tok_s=59937.1 opt_steps=2660 +[epoch 34/50] step=2680 train_loss=0.0146 tok_s=59939.2 opt_steps=2680 +[epoch 34/50] step=2700 train_loss=0.0146 tok_s=59942.7 opt_steps=2700 +[epoch 34/50] step=2720 train_loss=0.0146 tok_s=59942.1 opt_steps=2720 +[epoch 34/50] step=2740 train_loss=0.0147 tok_s=59945.0 opt_steps=2740 +[epoch 34/50] step=2760 train_loss=0.0147 tok_s=59945.0 opt_steps=2760 +[epoch 34/50] step=2780 train_loss=0.0147 tok_s=59941.5 opt_steps=2780 +[epoch 34/50] step=2800 train_loss=0.0147 tok_s=59942.7 opt_steps=2800 +[epoch 34/50] step=2820 train_loss=0.0147 tok_s=59944.8 opt_steps=2820 +[epoch 34/50] step=2840 train_loss=0.0147 tok_s=59947.1 opt_steps=2840 +[epoch 34/50] step=2860 train_loss=0.0147 tok_s=59948.2 opt_steps=2860 +[epoch 34/50] step=2880 train_loss=0.0148 tok_s=59951.8 opt_steps=2880 +[epoch 34/50] step=2900 train_loss=0.0148 tok_s=59954.5 opt_steps=2900 +[epoch 34/50] step=2920 train_loss=0.0148 tok_s=59955.0 opt_steps=2920 +[epoch 34/50] step=2940 train_loss=0.0148 tok_s=59956.9 opt_steps=2940 +[epoch 34/50] step=2960 train_loss=0.0148 tok_s=59960.0 opt_steps=2960 +[epoch 34/50] step=2980 train_loss=0.0148 tok_s=59957.2 opt_steps=2980 +[epoch 34/50] step=3000 train_loss=0.0148 tok_s=59959.8 opt_steps=3000 +[epoch 34/50] step=3020 train_loss=0.0148 tok_s=59961.5 opt_steps=3020 +[epoch 34/50] step=3040 train_loss=0.0148 tok_s=59958.1 opt_steps=3040 +[epoch 34/50] step=3060 train_loss=0.0149 tok_s=59959.4 opt_steps=3060 +[epoch 34/50] step=3080 train_loss=0.0149 tok_s=59961.7 opt_steps=3080 +[epoch 34/50] step=3100 train_loss=0.0149 tok_s=59960.3 opt_steps=3100 +[epoch 34/50] step=3120 train_loss=0.0149 tok_s=59960.1 opt_steps=3120 +[epoch 34/50] step=3140 train_loss=0.0149 tok_s=59963.2 opt_steps=3140 +[epoch 34/50] step=3160 train_loss=0.0149 tok_s=59964.0 opt_steps=3160 +[epoch 34/50] step=3180 train_loss=0.0149 tok_s=59963.9 opt_steps=3180 +[epoch 34/50] step=3200 train_loss=0.0149 tok_s=59966.6 opt_steps=3200 +[epoch 34/50] step=3220 train_loss=0.0149 tok_s=59968.6 opt_steps=3220 +[epoch 34/50] step=3240 train_loss=0.0149 tok_s=59967.2 opt_steps=3240 +[epoch 34/50] step=3260 train_loss=0.0150 tok_s=59963.6 opt_steps=3260 +[epoch 34/50] train_loss=0.0150 val_skipped tok_s=59964.3 opt_steps=3273 +[epoch 35/50] step=20 train_loss=0.0126 tok_s=57476.1 opt_steps=20 +[epoch 35/50] step=40 train_loss=0.0125 tok_s=58827.9 opt_steps=40 +[epoch 35/50] step=60 train_loss=0.0122 tok_s=59297.0 opt_steps=60 +[epoch 35/50] step=80 train_loss=0.0123 tok_s=59544.3 opt_steps=80 +[epoch 35/50] step=100 train_loss=0.0122 tok_s=59685.0 opt_steps=100 +[epoch 35/50] step=120 train_loss=0.0123 tok_s=59747.2 opt_steps=120 +[epoch 35/50] step=140 train_loss=0.0122 tok_s=59830.2 opt_steps=140 +[epoch 35/50] step=160 train_loss=0.0122 tok_s=59859.5 opt_steps=160 +[epoch 35/50] step=180 train_loss=0.0122 tok_s=59870.2 opt_steps=180 +[epoch 35/50] step=200 train_loss=0.0121 tok_s=59893.5 opt_steps=200 +[epoch 35/50] step=220 train_loss=0.0122 tok_s=59897.9 opt_steps=220 +[epoch 35/50] step=240 train_loss=0.0122 tok_s=59928.7 opt_steps=240 +[epoch 35/50] step=260 train_loss=0.0122 tok_s=59930.6 opt_steps=260 +[epoch 35/50] step=280 train_loss=0.0122 tok_s=59899.0 opt_steps=280 +[epoch 35/50] step=300 train_loss=0.0122 tok_s=59879.6 opt_steps=300 +[epoch 35/50] step=320 train_loss=0.0123 tok_s=59880.5 opt_steps=320 +[epoch 35/50] step=340 train_loss=0.0123 tok_s=59854.2 opt_steps=340 +[epoch 35/50] step=360 train_loss=0.0123 tok_s=59858.8 opt_steps=360 +[epoch 35/50] step=380 train_loss=0.0123 tok_s=59875.3 opt_steps=380 +[epoch 35/50] step=400 train_loss=0.0124 tok_s=59890.3 opt_steps=400 +[epoch 35/50] step=420 train_loss=0.0124 tok_s=59901.1 opt_steps=420 +[epoch 35/50] step=440 train_loss=0.0124 tok_s=59910.5 opt_steps=440 +[epoch 35/50] step=460 train_loss=0.0124 tok_s=59906.8 opt_steps=460 +[epoch 35/50] step=480 train_loss=0.0125 tok_s=59925.6 opt_steps=480 +[epoch 35/50] step=500 train_loss=0.0125 tok_s=59924.6 opt_steps=500 +[epoch 35/50] step=520 train_loss=0.0125 tok_s=59935.9 opt_steps=520 +[epoch 35/50] step=540 train_loss=0.0126 tok_s=59947.8 opt_steps=540 +[epoch 35/50] step=560 train_loss=0.0126 tok_s=59951.4 opt_steps=560 +[epoch 35/50] step=580 train_loss=0.0126 tok_s=59947.5 opt_steps=580 +[epoch 35/50] step=600 train_loss=0.0126 tok_s=59954.0 opt_steps=600 +[epoch 35/50] step=620 train_loss=0.0126 tok_s=59959.6 opt_steps=620 +[epoch 35/50] step=640 train_loss=0.0127 tok_s=59965.4 opt_steps=640 +[epoch 35/50] step=660 train_loss=0.0127 tok_s=59952.5 opt_steps=660 +[epoch 35/50] step=680 train_loss=0.0127 tok_s=59955.1 opt_steps=680 +[epoch 35/50] step=700 train_loss=0.0127 tok_s=59948.1 opt_steps=700 +[epoch 35/50] step=720 train_loss=0.0128 tok_s=59954.6 opt_steps=720 +[epoch 35/50] step=740 train_loss=0.0128 tok_s=59961.3 opt_steps=740 +[epoch 35/50] step=760 train_loss=0.0128 tok_s=59965.7 opt_steps=760 +[epoch 35/50] step=780 train_loss=0.0128 tok_s=59982.3 opt_steps=780 +[epoch 35/50] step=800 train_loss=0.0129 tok_s=59983.8 opt_steps=800 +[epoch 35/50] step=820 train_loss=0.0129 tok_s=60001.1 opt_steps=820 +[epoch 35/50] step=840 train_loss=0.0129 tok_s=60000.6 opt_steps=840 +[epoch 35/50] step=860 train_loss=0.0129 tok_s=60014.6 opt_steps=860 +[epoch 35/50] step=880 train_loss=0.0129 tok_s=60018.1 opt_steps=880 +[epoch 35/50] step=900 train_loss=0.0129 tok_s=60017.9 opt_steps=900 +[epoch 35/50] step=920 train_loss=0.0130 tok_s=60018.4 opt_steps=920 +[epoch 35/50] step=940 train_loss=0.0130 tok_s=60021.7 opt_steps=940 +[epoch 35/50] step=960 train_loss=0.0130 tok_s=60025.1 opt_steps=960 +[epoch 35/50] step=980 train_loss=0.0130 tok_s=60040.0 opt_steps=980 +[epoch 35/50] step=1000 train_loss=0.0130 tok_s=60047.7 opt_steps=1000 +[epoch 35/50] step=1020 train_loss=0.0131 tok_s=60052.5 opt_steps=1020 +[epoch 35/50] step=1040 train_loss=0.0131 tok_s=60056.9 opt_steps=1040 +[epoch 35/50] step=1060 train_loss=0.0131 tok_s=60060.8 opt_steps=1060 +[epoch 35/50] step=1080 train_loss=0.0131 tok_s=60063.8 opt_steps=1080 +[epoch 35/50] step=1100 train_loss=0.0131 tok_s=60066.8 opt_steps=1100 +[epoch 35/50] step=1120 train_loss=0.0132 tok_s=60069.8 opt_steps=1120 +[epoch 35/50] step=1140 train_loss=0.0132 tok_s=60073.3 opt_steps=1140 +[epoch 35/50] step=1160 train_loss=0.0132 tok_s=60071.7 opt_steps=1160 +[epoch 35/50] step=1180 train_loss=0.0132 tok_s=60071.4 opt_steps=1180 +[epoch 35/50] step=1200 train_loss=0.0132 tok_s=60070.4 opt_steps=1200 +[epoch 35/50] step=1220 train_loss=0.0133 tok_s=60075.4 opt_steps=1220 +[epoch 35/50] step=1240 train_loss=0.0133 tok_s=60073.4 opt_steps=1240 +[epoch 35/50] step=1260 train_loss=0.0133 tok_s=60076.5 opt_steps=1260 +[epoch 35/50] step=1280 train_loss=0.0133 tok_s=60077.9 opt_steps=1280 +[epoch 35/50] step=1300 train_loss=0.0133 tok_s=60072.3 opt_steps=1300 +[epoch 35/50] step=1320 train_loss=0.0133 tok_s=60072.6 opt_steps=1320 +[epoch 35/50] step=1340 train_loss=0.0133 tok_s=60076.0 opt_steps=1340 +[epoch 35/50] step=1360 train_loss=0.0133 tok_s=60074.4 opt_steps=1360 +[epoch 35/50] step=1380 train_loss=0.0134 tok_s=60082.3 opt_steps=1380 +[epoch 35/50] step=1400 train_loss=0.0134 tok_s=60080.9 opt_steps=1400 +[epoch 35/50] step=1420 train_loss=0.0134 tok_s=60072.1 opt_steps=1420 +[epoch 35/50] step=1440 train_loss=0.0134 tok_s=60081.4 opt_steps=1440 +[epoch 35/50] step=1460 train_loss=0.0134 tok_s=60084.0 opt_steps=1460 +[epoch 35/50] step=1480 train_loss=0.0134 tok_s=60086.7 opt_steps=1480 +[epoch 35/50] step=1500 train_loss=0.0134 tok_s=60083.9 opt_steps=1500 +[epoch 35/50] step=1520 train_loss=0.0135 tok_s=60091.3 opt_steps=1520 +[epoch 35/50] step=1540 train_loss=0.0135 tok_s=60087.3 opt_steps=1540 +[epoch 35/50] step=1560 train_loss=0.0135 tok_s=60088.7 opt_steps=1560 +[epoch 35/50] step=1580 train_loss=0.0135 tok_s=60085.7 opt_steps=1580 +[epoch 35/50] step=1600 train_loss=0.0135 tok_s=60079.9 opt_steps=1600 +[epoch 35/50] step=1620 train_loss=0.0135 tok_s=60084.1 opt_steps=1620 +[epoch 35/50] step=1640 train_loss=0.0136 tok_s=60089.7 opt_steps=1640 +[epoch 35/50] step=1660 train_loss=0.0136 tok_s=60087.0 opt_steps=1660 +[epoch 35/50] step=1680 train_loss=0.0136 tok_s=60088.4 opt_steps=1680 +[epoch 35/50] step=1700 train_loss=0.0136 tok_s=60087.7 opt_steps=1700 +[epoch 35/50] step=1720 train_loss=0.0136 tok_s=60086.1 opt_steps=1720 +[epoch 35/50] step=1740 train_loss=0.0136 tok_s=60088.3 opt_steps=1740 +[epoch 35/50] step=1760 train_loss=0.0136 tok_s=60093.2 opt_steps=1760 +[epoch 35/50] step=1780 train_loss=0.0137 tok_s=60091.2 opt_steps=1780 +[epoch 35/50] step=1800 train_loss=0.0137 tok_s=60090.5 opt_steps=1800 +[epoch 35/50] step=1820 train_loss=0.0137 tok_s=60089.1 opt_steps=1820 +[epoch 35/50] step=1840 train_loss=0.0137 tok_s=60084.8 opt_steps=1840 +[epoch 35/50] step=1860 train_loss=0.0137 tok_s=60081.6 opt_steps=1860 +[epoch 35/50] step=1880 train_loss=0.0137 tok_s=60082.6 opt_steps=1880 +[epoch 35/50] step=1900 train_loss=0.0137 tok_s=60077.6 opt_steps=1900 +[epoch 35/50] step=1920 train_loss=0.0138 tok_s=60077.7 opt_steps=1920 +[epoch 35/50] step=1940 train_loss=0.0138 tok_s=60072.4 opt_steps=1940 +[epoch 35/50] step=1960 train_loss=0.0138 tok_s=60073.4 opt_steps=1960 +[epoch 35/50] step=1980 train_loss=0.0138 tok_s=60075.4 opt_steps=1980 +[epoch 35/50] step=2000 train_loss=0.0138 tok_s=60074.8 opt_steps=2000 +[epoch 35/50] step=2020 train_loss=0.0138 tok_s=60074.0 opt_steps=2020 +[epoch 35/50] step=2040 train_loss=0.0139 tok_s=60074.2 opt_steps=2040 +[epoch 35/50] step=2060 train_loss=0.0139 tok_s=60074.0 opt_steps=2060 +[epoch 35/50] step=2080 train_loss=0.0139 tok_s=60073.4 opt_steps=2080 +[epoch 35/50] step=2100 train_loss=0.0139 tok_s=60076.5 opt_steps=2100 +[epoch 35/50] step=2120 train_loss=0.0139 tok_s=60089.3 opt_steps=2120 +[epoch 35/50] step=2140 train_loss=0.0139 tok_s=60088.6 opt_steps=2140 +[epoch 35/50] step=2160 train_loss=0.0139 tok_s=60086.2 opt_steps=2160 +[epoch 35/50] step=2180 train_loss=0.0139 tok_s=60082.3 opt_steps=2180 +[epoch 35/50] step=2200 train_loss=0.0140 tok_s=60075.3 opt_steps=2200 +[epoch 35/50] step=2220 train_loss=0.0140 tok_s=60072.4 opt_steps=2220 +[epoch 35/50] step=2240 train_loss=0.0140 tok_s=60073.8 opt_steps=2240 +[epoch 35/50] step=2260 train_loss=0.0140 tok_s=60074.0 opt_steps=2260 +[epoch 35/50] step=2280 train_loss=0.0140 tok_s=60069.3 opt_steps=2280 +[epoch 35/50] step=2300 train_loss=0.0140 tok_s=60067.8 opt_steps=2300 +[epoch 35/50] step=2320 train_loss=0.0140 tok_s=60065.0 opt_steps=2320 +[epoch 35/50] step=2340 train_loss=0.0140 tok_s=60062.6 opt_steps=2340 +[epoch 35/50] step=2360 train_loss=0.0140 tok_s=60060.3 opt_steps=2360 +[epoch 35/50] step=2380 train_loss=0.0140 tok_s=60058.1 opt_steps=2380 +[epoch 35/50] step=2400 train_loss=0.0141 tok_s=60060.0 opt_steps=2400 +[epoch 35/50] step=2420 train_loss=0.0141 tok_s=60055.4 opt_steps=2420 +[epoch 35/50] step=2440 train_loss=0.0141 tok_s=60051.4 opt_steps=2440 +[epoch 35/50] step=2460 train_loss=0.0141 tok_s=60049.4 opt_steps=2460 +[epoch 35/50] step=2480 train_loss=0.0141 tok_s=60045.4 opt_steps=2480 +[epoch 35/50] step=2500 train_loss=0.0141 tok_s=60044.0 opt_steps=2500 +[epoch 35/50] step=2520 train_loss=0.0141 tok_s=60039.5 opt_steps=2520 +[epoch 35/50] step=2540 train_loss=0.0141 tok_s=60034.8 opt_steps=2540 +[epoch 35/50] step=2560 train_loss=0.0142 tok_s=60037.0 opt_steps=2560 +[epoch 35/50] step=2580 train_loss=0.0142 tok_s=60032.6 opt_steps=2580 +[epoch 35/50] step=2600 train_loss=0.0142 tok_s=60030.4 opt_steps=2600 +[epoch 35/50] step=2620 train_loss=0.0142 tok_s=60024.7 opt_steps=2620 +[epoch 35/50] step=2640 train_loss=0.0142 tok_s=60024.4 opt_steps=2640 +[epoch 35/50] step=2660 train_loss=0.0142 tok_s=60023.5 opt_steps=2660 +[epoch 35/50] step=2680 train_loss=0.0142 tok_s=60023.0 opt_steps=2680 +[epoch 35/50] step=2700 train_loss=0.0142 tok_s=60026.0 opt_steps=2700 +[epoch 35/50] step=2720 train_loss=0.0143 tok_s=60030.9 opt_steps=2720 +[epoch 35/50] step=2740 train_loss=0.0143 tok_s=60029.6 opt_steps=2740 +[epoch 35/50] step=2760 train_loss=0.0143 tok_s=60033.4 opt_steps=2760 +[epoch 35/50] step=2780 train_loss=0.0143 tok_s=60029.6 opt_steps=2780 +[epoch 35/50] step=2800 train_loss=0.0143 tok_s=60027.7 opt_steps=2800 +[epoch 35/50] step=2820 train_loss=0.0143 tok_s=60026.4 opt_steps=2820 +[epoch 35/50] step=2840 train_loss=0.0143 tok_s=60026.3 opt_steps=2840 +[epoch 35/50] step=2860 train_loss=0.0143 tok_s=60025.7 opt_steps=2860 +[epoch 35/50] step=2880 train_loss=0.0143 tok_s=60026.1 opt_steps=2880 +[epoch 35/50] step=2900 train_loss=0.0143 tok_s=60028.3 opt_steps=2900 +[epoch 35/50] step=2920 train_loss=0.0144 tok_s=60030.6 opt_steps=2920 +[epoch 35/50] step=2940 train_loss=0.0144 tok_s=60031.8 opt_steps=2940 +[epoch 35/50] step=2960 train_loss=0.0144 tok_s=60030.7 opt_steps=2960 +[epoch 35/50] step=2980 train_loss=0.0144 tok_s=60031.6 opt_steps=2980 +[epoch 35/50] step=3000 train_loss=0.0144 tok_s=60035.3 opt_steps=3000 +[epoch 35/50] step=3020 train_loss=0.0144 tok_s=60033.9 opt_steps=3020 +[epoch 35/50] step=3040 train_loss=0.0144 tok_s=60034.6 opt_steps=3040 +[epoch 35/50] step=3060 train_loss=0.0144 tok_s=60035.1 opt_steps=3060 +[epoch 35/50] step=3080 train_loss=0.0144 tok_s=60039.8 opt_steps=3080 +[epoch 35/50] step=3100 train_loss=0.0144 tok_s=60040.4 opt_steps=3100 +[epoch 35/50] step=3120 train_loss=0.0145 tok_s=60038.3 opt_steps=3120 +[epoch 35/50] step=3140 train_loss=0.0145 tok_s=60039.4 opt_steps=3140 +[epoch 35/50] step=3160 train_loss=0.0145 tok_s=60037.9 opt_steps=3160 +[epoch 35/50] step=3180 train_loss=0.0145 tok_s=60040.3 opt_steps=3180 +[epoch 35/50] step=3200 train_loss=0.0145 tok_s=60039.9 opt_steps=3200 +[epoch 35/50] step=3220 train_loss=0.0145 tok_s=60037.0 opt_steps=3220 +[epoch 35/50] step=3240 train_loss=0.0145 tok_s=60039.8 opt_steps=3240 +[epoch 35/50] step=3260 train_loss=0.0145 tok_s=60035.8 opt_steps=3260 +[epoch 35/50] train_loss=0.0145 val_skipped tok_s=60036.8 opt_steps=3273 +[epoch 36/50] step=20 train_loss=0.0121 tok_s=58524.1 opt_steps=20 +[epoch 36/50] step=40 train_loss=0.0120 tok_s=59451.6 opt_steps=40 +[epoch 36/50] step=60 train_loss=0.0121 tok_s=59797.6 opt_steps=60 +[epoch 36/50] step=80 train_loss=0.0118 tok_s=59877.2 opt_steps=80 +[epoch 36/50] step=100 train_loss=0.0117 tok_s=60049.8 opt_steps=100 +[epoch 36/50] step=120 train_loss=0.0118 tok_s=60129.2 opt_steps=120 +[epoch 36/50] step=140 train_loss=0.0119 tok_s=60068.1 opt_steps=140 +[epoch 36/50] step=160 train_loss=0.0119 tok_s=60169.5 opt_steps=160 +[epoch 36/50] step=180 train_loss=0.0119 tok_s=60205.2 opt_steps=180 +[epoch 36/50] step=200 train_loss=0.0119 tok_s=60192.2 opt_steps=200 +[epoch 36/50] step=220 train_loss=0.0118 tok_s=60179.1 opt_steps=220 +[epoch 36/50] step=240 train_loss=0.0119 tok_s=60185.9 opt_steps=240 +[epoch 36/50] step=260 train_loss=0.0119 tok_s=60173.1 opt_steps=260 +[epoch 36/50] step=280 train_loss=0.0119 tok_s=60138.3 opt_steps=280 +[epoch 36/50] step=300 train_loss=0.0119 tok_s=60134.6 opt_steps=300 +[epoch 36/50] step=320 train_loss=0.0119 tok_s=60131.4 opt_steps=320 +[epoch 36/50] step=340 train_loss=0.0119 tok_s=60157.3 opt_steps=340 +[epoch 36/50] step=360 train_loss=0.0120 tok_s=60163.2 opt_steps=360 +[epoch 36/50] step=380 train_loss=0.0120 tok_s=60158.0 opt_steps=380 +[epoch 36/50] step=400 train_loss=0.0120 tok_s=60173.3 opt_steps=400 +[epoch 36/50] step=420 train_loss=0.0120 tok_s=60156.6 opt_steps=420 +[epoch 36/50] step=440 train_loss=0.0121 tok_s=60157.3 opt_steps=440 +[epoch 36/50] step=460 train_loss=0.0121 tok_s=60161.6 opt_steps=460 +[epoch 36/50] step=480 train_loss=0.0121 tok_s=60147.6 opt_steps=480 +[epoch 36/50] step=500 train_loss=0.0121 tok_s=60158.4 opt_steps=500 +[epoch 36/50] step=520 train_loss=0.0121 tok_s=60162.4 opt_steps=520 +[epoch 36/50] step=540 train_loss=0.0121 tok_s=60162.6 opt_steps=540 +[epoch 36/50] step=560 train_loss=0.0121 tok_s=60177.0 opt_steps=560 +[epoch 36/50] step=580 train_loss=0.0122 tok_s=60154.7 opt_steps=580 +[epoch 36/50] step=600 train_loss=0.0122 tok_s=60144.7 opt_steps=600 +[epoch 36/50] step=620 train_loss=0.0122 tok_s=60136.5 opt_steps=620 +[epoch 36/50] step=640 train_loss=0.0123 tok_s=60137.7 opt_steps=640 +[epoch 36/50] step=660 train_loss=0.0123 tok_s=60149.1 opt_steps=660 +[epoch 36/50] step=680 train_loss=0.0124 tok_s=60154.3 opt_steps=680 +[epoch 36/50] step=700 train_loss=0.0124 tok_s=60145.3 opt_steps=700 +[epoch 36/50] step=720 train_loss=0.0124 tok_s=60147.6 opt_steps=720 +[epoch 36/50] step=740 train_loss=0.0124 tok_s=60128.9 opt_steps=740 +[epoch 36/50] step=760 train_loss=0.0124 tok_s=60117.4 opt_steps=760 +[epoch 36/50] step=780 train_loss=0.0124 tok_s=60120.6 opt_steps=780 +[epoch 36/50] step=800 train_loss=0.0125 tok_s=60138.7 opt_steps=800 +[epoch 36/50] step=820 train_loss=0.0125 tok_s=60153.8 opt_steps=820 +[epoch 36/50] step=840 train_loss=0.0125 tok_s=60161.5 opt_steps=840 +[epoch 36/50] step=860 train_loss=0.0125 tok_s=60156.3 opt_steps=860 +[epoch 36/50] step=880 train_loss=0.0126 tok_s=60145.8 opt_steps=880 +[epoch 36/50] step=900 train_loss=0.0126 tok_s=60137.4 opt_steps=900 +[epoch 36/50] step=920 train_loss=0.0126 tok_s=60138.3 opt_steps=920 +[epoch 36/50] step=940 train_loss=0.0126 tok_s=60157.6 opt_steps=940 +[epoch 36/50] step=960 train_loss=0.0127 tok_s=60154.6 opt_steps=960 +[epoch 36/50] step=980 train_loss=0.0127 tok_s=60158.1 opt_steps=980 +[epoch 36/50] step=1000 train_loss=0.0127 tok_s=60167.8 opt_steps=1000 +[epoch 36/50] step=1020 train_loss=0.0127 tok_s=60165.1 opt_steps=1020 +[epoch 36/50] step=1040 train_loss=0.0127 tok_s=60167.7 opt_steps=1040 +[epoch 36/50] step=1060 train_loss=0.0127 tok_s=60176.7 opt_steps=1060 +[epoch 36/50] step=1080 train_loss=0.0127 tok_s=60174.3 opt_steps=1080 +[epoch 36/50] step=1100 train_loss=0.0127 tok_s=60170.6 opt_steps=1100 +[epoch 36/50] step=1120 train_loss=0.0128 tok_s=60173.3 opt_steps=1120 +[epoch 36/50] step=1140 train_loss=0.0128 tok_s=60178.3 opt_steps=1140 +[epoch 36/50] step=1160 train_loss=0.0128 tok_s=60174.5 opt_steps=1160 +[epoch 36/50] step=1180 train_loss=0.0128 tok_s=60171.1 opt_steps=1180 +[epoch 36/50] step=1200 train_loss=0.0128 tok_s=60175.7 opt_steps=1200 +[epoch 36/50] step=1220 train_loss=0.0129 tok_s=60178.0 opt_steps=1220 +[epoch 36/50] step=1240 train_loss=0.0129 tok_s=60169.4 opt_steps=1240 +[epoch 36/50] step=1260 train_loss=0.0129 tok_s=60168.2 opt_steps=1260 +[epoch 36/50] step=1280 train_loss=0.0129 tok_s=60173.9 opt_steps=1280 +[epoch 36/50] step=1300 train_loss=0.0130 tok_s=60173.8 opt_steps=1300 +[epoch 36/50] step=1320 train_loss=0.0130 tok_s=60167.5 opt_steps=1320 +[epoch 36/50] step=1340 train_loss=0.0130 tok_s=60170.3 opt_steps=1340 +[epoch 36/50] step=1360 train_loss=0.0130 tok_s=60168.6 opt_steps=1360 +[epoch 36/50] step=1380 train_loss=0.0130 tok_s=60175.1 opt_steps=1380 +[epoch 36/50] step=1400 train_loss=0.0130 tok_s=60174.9 opt_steps=1400 +[epoch 36/50] step=1420 train_loss=0.0130 tok_s=60175.0 opt_steps=1420 +[epoch 36/50] step=1440 train_loss=0.0131 tok_s=60167.7 opt_steps=1440 +[epoch 36/50] step=1460 train_loss=0.0131 tok_s=60174.4 opt_steps=1460 +[epoch 36/50] step=1480 train_loss=0.0131 tok_s=60178.6 opt_steps=1480 +[epoch 36/50] step=1500 train_loss=0.0131 tok_s=60178.4 opt_steps=1500 +[epoch 36/50] step=1520 train_loss=0.0131 tok_s=60177.9 opt_steps=1520 +[epoch 36/50] step=1540 train_loss=0.0131 tok_s=60171.7 opt_steps=1540 +[epoch 36/50] step=1560 train_loss=0.0131 tok_s=60175.9 opt_steps=1560 +[epoch 36/50] step=1580 train_loss=0.0132 tok_s=60180.6 opt_steps=1580 +[epoch 36/50] step=1600 train_loss=0.0132 tok_s=60184.8 opt_steps=1600 +[epoch 36/50] step=1620 train_loss=0.0132 tok_s=60185.7 opt_steps=1620 +[epoch 36/50] step=1640 train_loss=0.0132 tok_s=60182.8 opt_steps=1640 +[epoch 36/50] step=1660 train_loss=0.0132 tok_s=60177.1 opt_steps=1660 +[epoch 36/50] step=1680 train_loss=0.0133 tok_s=60173.7 opt_steps=1680 +[epoch 36/50] step=1700 train_loss=0.0133 tok_s=60174.9 opt_steps=1700 +[epoch 36/50] step=1720 train_loss=0.0133 tok_s=60180.0 opt_steps=1720 +[epoch 36/50] step=1740 train_loss=0.0133 tok_s=60186.5 opt_steps=1740 +[epoch 36/50] step=1760 train_loss=0.0133 tok_s=60189.4 opt_steps=1760 +[epoch 36/50] step=1780 train_loss=0.0133 tok_s=60192.5 opt_steps=1780 +[epoch 36/50] step=1800 train_loss=0.0133 tok_s=60190.4 opt_steps=1800 +[epoch 36/50] step=1820 train_loss=0.0133 tok_s=60190.1 opt_steps=1820 +[epoch 36/50] step=1840 train_loss=0.0134 tok_s=60189.8 opt_steps=1840 +[epoch 36/50] step=1860 train_loss=0.0134 tok_s=60190.5 opt_steps=1860 +[epoch 36/50] step=1880 train_loss=0.0134 tok_s=60192.3 opt_steps=1880 +[epoch 36/50] step=1900 train_loss=0.0134 tok_s=60194.4 opt_steps=1900 +[epoch 36/50] step=1920 train_loss=0.0134 tok_s=60191.7 opt_steps=1920 +[epoch 36/50] step=1940 train_loss=0.0134 tok_s=60191.7 opt_steps=1940 +[epoch 36/50] step=1960 train_loss=0.0134 tok_s=60187.6 opt_steps=1960 +[epoch 36/50] step=1980 train_loss=0.0135 tok_s=60184.7 opt_steps=1980 +[epoch 36/50] step=2000 train_loss=0.0135 tok_s=60187.0 opt_steps=2000 +[epoch 36/50] step=2020 train_loss=0.0135 tok_s=60189.5 opt_steps=2020 +[epoch 36/50] step=2040 train_loss=0.0135 tok_s=60188.7 opt_steps=2040 +[epoch 36/50] step=2060 train_loss=0.0135 tok_s=60185.8 opt_steps=2060 +[epoch 36/50] step=2080 train_loss=0.0135 tok_s=60184.7 opt_steps=2080 +[epoch 36/50] step=2100 train_loss=0.0135 tok_s=60187.6 opt_steps=2100 +[epoch 36/50] step=2120 train_loss=0.0135 tok_s=60186.9 opt_steps=2120 +[epoch 36/50] step=2140 train_loss=0.0135 tok_s=60180.6 opt_steps=2140 +[epoch 36/50] step=2160 train_loss=0.0136 tok_s=60176.6 opt_steps=2160 +[epoch 36/50] step=2180 train_loss=0.0136 tok_s=60171.5 opt_steps=2180 +[epoch 36/50] step=2200 train_loss=0.0136 tok_s=60174.7 opt_steps=2200 +[epoch 36/50] step=2220 train_loss=0.0136 tok_s=60176.6 opt_steps=2220 +[epoch 36/50] step=2240 train_loss=0.0136 tok_s=60175.5 opt_steps=2240 +[epoch 36/50] step=2260 train_loss=0.0136 tok_s=60178.3 opt_steps=2260 +[epoch 36/50] step=2280 train_loss=0.0136 tok_s=60180.8 opt_steps=2280 +[epoch 36/50] step=2300 train_loss=0.0137 tok_s=60177.7 opt_steps=2300 +[epoch 36/50] step=2320 train_loss=0.0137 tok_s=60176.6 opt_steps=2320 +[epoch 36/50] step=2340 train_loss=0.0137 tok_s=60177.3 opt_steps=2340 +[epoch 36/50] step=2360 train_loss=0.0137 tok_s=60174.9 opt_steps=2360 +[epoch 36/50] step=2380 train_loss=0.0137 tok_s=60177.9 opt_steps=2380 +[epoch 36/50] step=2400 train_loss=0.0137 tok_s=60177.0 opt_steps=2400 +[epoch 36/50] step=2420 train_loss=0.0137 tok_s=60179.5 opt_steps=2420 +[epoch 36/50] step=2440 train_loss=0.0137 tok_s=60183.1 opt_steps=2440 +[epoch 36/50] step=2460 train_loss=0.0137 tok_s=60184.1 opt_steps=2460 +[epoch 36/50] step=2480 train_loss=0.0138 tok_s=60188.3 opt_steps=2480 +[epoch 36/50] step=2500 train_loss=0.0138 tok_s=60187.6 opt_steps=2500 +[epoch 36/50] step=2520 train_loss=0.0138 tok_s=60184.1 opt_steps=2520 +[epoch 36/50] step=2540 train_loss=0.0138 tok_s=60183.5 opt_steps=2540 +[epoch 36/50] step=2560 train_loss=0.0138 tok_s=60183.5 opt_steps=2560 +[epoch 36/50] step=2580 train_loss=0.0138 tok_s=60184.4 opt_steps=2580 +[epoch 36/50] step=2600 train_loss=0.0138 tok_s=60187.2 opt_steps=2600 +[epoch 36/50] step=2620 train_loss=0.0138 tok_s=60187.5 opt_steps=2620 +[epoch 36/50] step=2640 train_loss=0.0139 tok_s=60187.4 opt_steps=2640 +[epoch 36/50] step=2660 train_loss=0.0139 tok_s=60188.5 opt_steps=2660 +[epoch 36/50] step=2680 train_loss=0.0139 tok_s=60188.2 opt_steps=2680 +[epoch 36/50] step=2700 train_loss=0.0139 tok_s=60188.9 opt_steps=2700 +[epoch 36/50] step=2720 train_loss=0.0139 tok_s=60188.2 opt_steps=2720 +[epoch 36/50] step=2740 train_loss=0.0139 tok_s=60189.5 opt_steps=2740 +[epoch 36/50] step=2760 train_loss=0.0139 tok_s=60188.8 opt_steps=2760 +[epoch 36/50] step=2780 train_loss=0.0139 tok_s=60185.9 opt_steps=2780 +[epoch 36/50] step=2800 train_loss=0.0139 tok_s=60181.4 opt_steps=2800 +[epoch 36/50] step=2820 train_loss=0.0140 tok_s=60178.9 opt_steps=2820 +[epoch 36/50] step=2840 train_loss=0.0140 tok_s=60177.4 opt_steps=2840 +[epoch 36/50] step=2860 train_loss=0.0140 tok_s=60180.2 opt_steps=2860 +[epoch 36/50] step=2880 train_loss=0.0140 tok_s=60179.3 opt_steps=2880 +[epoch 36/50] step=2900 train_loss=0.0140 tok_s=60178.2 opt_steps=2900 +[epoch 36/50] step=2920 train_loss=0.0140 tok_s=60179.6 opt_steps=2920 +[epoch 36/50] step=2940 train_loss=0.0140 tok_s=60178.3 opt_steps=2940 +[epoch 36/50] step=2960 train_loss=0.0140 tok_s=60180.0 opt_steps=2960 +[epoch 36/50] step=2980 train_loss=0.0140 tok_s=60179.6 opt_steps=2980 +[epoch 36/50] step=3000 train_loss=0.0140 tok_s=60179.2 opt_steps=3000 +[epoch 36/50] step=3020 train_loss=0.0140 tok_s=60174.3 opt_steps=3020 +[epoch 36/50] step=3040 train_loss=0.0141 tok_s=60173.5 opt_steps=3040 +[epoch 36/50] step=3060 train_loss=0.0141 tok_s=60172.6 opt_steps=3060 +[epoch 36/50] step=3080 train_loss=0.0141 tok_s=60169.4 opt_steps=3080 +[epoch 36/50] step=3100 train_loss=0.0141 tok_s=60165.9 opt_steps=3100 +[epoch 36/50] step=3120 train_loss=0.0141 tok_s=60165.3 opt_steps=3120 +[epoch 36/50] step=3140 train_loss=0.0141 tok_s=60163.3 opt_steps=3140 +[epoch 36/50] step=3160 train_loss=0.0141 tok_s=60164.1 opt_steps=3160 +[epoch 36/50] step=3180 train_loss=0.0141 tok_s=60162.4 opt_steps=3180 +[epoch 36/50] step=3200 train_loss=0.0141 tok_s=60160.2 opt_steps=3200 +[epoch 36/50] step=3220 train_loss=0.0141 tok_s=60159.6 opt_steps=3220 +[epoch 36/50] step=3240 train_loss=0.0141 tok_s=60166.3 opt_steps=3240 +[epoch 36/50] step=3260 train_loss=0.0141 tok_s=60160.1 opt_steps=3260 +[epoch 36/50] train_loss=0.0141 val_skipped tok_s=60162.5 opt_steps=3273 +[epoch 37/50] step=20 train_loss=0.0108 tok_s=58150.0 opt_steps=20 +[epoch 37/50] step=40 train_loss=0.0109 tok_s=59159.7 opt_steps=40 +[epoch 37/50] step=60 train_loss=0.0110 tok_s=59624.4 opt_steps=60 +[epoch 37/50] step=80 train_loss=0.0111 tok_s=59623.2 opt_steps=80 +[epoch 37/50] step=100 train_loss=0.0111 tok_s=59771.2 opt_steps=100 +[epoch 37/50] step=120 train_loss=0.0110 tok_s=59772.0 opt_steps=120 +[epoch 37/50] step=140 train_loss=0.0111 tok_s=59771.1 opt_steps=140 +[epoch 37/50] step=160 train_loss=0.0112 tok_s=59765.4 opt_steps=160 +[epoch 37/50] step=180 train_loss=0.0112 tok_s=59740.9 opt_steps=180 +[epoch 37/50] step=200 train_loss=0.0113 tok_s=59744.2 opt_steps=200 +[epoch 37/50] step=220 train_loss=0.0113 tok_s=59760.4 opt_steps=220 +[epoch 37/50] step=240 train_loss=0.0113 tok_s=59697.4 opt_steps=240 +[epoch 37/50] step=260 train_loss=0.0115 tok_s=59625.1 opt_steps=260 +[epoch 37/50] step=280 train_loss=0.0115 tok_s=59587.4 opt_steps=280 +[epoch 37/50] step=300 train_loss=0.0116 tok_s=59507.6 opt_steps=300 +[epoch 37/50] step=320 train_loss=0.0116 tok_s=59508.2 opt_steps=320 +[epoch 37/50] step=340 train_loss=0.0116 tok_s=59474.6 opt_steps=340 +[epoch 37/50] step=360 train_loss=0.0116 tok_s=59436.8 opt_steps=360 +[epoch 37/50] step=380 train_loss=0.0117 tok_s=59390.1 opt_steps=380 +[epoch 37/50] step=400 train_loss=0.0117 tok_s=59376.1 opt_steps=400 +[epoch 37/50] step=420 train_loss=0.0117 tok_s=58993.2 opt_steps=420 +[epoch 37/50] step=440 train_loss=0.0118 tok_s=58922.1 opt_steps=440 +[epoch 37/50] step=460 train_loss=0.0118 tok_s=58853.8 opt_steps=460 +[epoch 37/50] step=480 train_loss=0.0118 tok_s=58889.8 opt_steps=480 +[epoch 37/50] step=500 train_loss=0.0119 tok_s=58908.4 opt_steps=500 +[epoch 37/50] step=520 train_loss=0.0119 tok_s=58950.4 opt_steps=520 +[epoch 37/50] step=540 train_loss=0.0119 tok_s=58984.9 opt_steps=540 +[epoch 37/50] step=560 train_loss=0.0119 tok_s=58953.7 opt_steps=560 +[epoch 37/50] step=580 train_loss=0.0119 tok_s=58920.0 opt_steps=580 +[epoch 37/50] step=600 train_loss=0.0119 tok_s=58900.4 opt_steps=600 +[epoch 37/50] step=620 train_loss=0.0119 tok_s=58917.8 opt_steps=620 +[epoch 37/50] step=640 train_loss=0.0119 tok_s=58922.4 opt_steps=640 +[epoch 37/50] step=660 train_loss=0.0120 tok_s=58930.7 opt_steps=660 +[epoch 37/50] step=680 train_loss=0.0120 tok_s=58873.3 opt_steps=680 +[epoch 37/50] step=700 train_loss=0.0120 tok_s=58763.3 opt_steps=700 +[epoch 37/50] step=720 train_loss=0.0120 tok_s=58709.2 opt_steps=720 +[epoch 37/50] step=740 train_loss=0.0121 tok_s=58702.7 opt_steps=740 +[epoch 37/50] step=760 train_loss=0.0121 tok_s=58719.5 opt_steps=760 +[epoch 37/50] step=780 train_loss=0.0121 tok_s=58755.9 opt_steps=780 +[epoch 37/50] step=800 train_loss=0.0121 tok_s=58777.2 opt_steps=800 +[epoch 37/50] step=820 train_loss=0.0121 tok_s=58814.6 opt_steps=820 +[epoch 37/50] step=840 train_loss=0.0122 tok_s=58846.8 opt_steps=840 +[epoch 37/50] step=860 train_loss=0.0122 tok_s=58878.9 opt_steps=860 +[epoch 37/50] step=880 train_loss=0.0122 tok_s=58907.6 opt_steps=880 +[epoch 37/50] step=900 train_loss=0.0122 tok_s=58938.8 opt_steps=900 +[epoch 37/50] step=920 train_loss=0.0122 tok_s=58964.1 opt_steps=920 +[epoch 37/50] step=940 train_loss=0.0123 tok_s=58984.8 opt_steps=940 +[epoch 37/50] step=960 train_loss=0.0123 tok_s=59007.9 opt_steps=960 +[epoch 37/50] step=980 train_loss=0.0123 tok_s=59031.0 opt_steps=980 +[epoch 37/50] step=1000 train_loss=0.0123 tok_s=59046.5 opt_steps=1000 +[epoch 37/50] step=1020 train_loss=0.0124 tok_s=59067.8 opt_steps=1020 +[epoch 37/50] step=1040 train_loss=0.0124 tok_s=59084.1 opt_steps=1040 +[epoch 37/50] step=1060 train_loss=0.0124 tok_s=59111.0 opt_steps=1060 +[epoch 37/50] step=1080 train_loss=0.0124 tok_s=59126.8 opt_steps=1080 +[epoch 37/50] step=1100 train_loss=0.0124 tok_s=59138.8 opt_steps=1100 +[epoch 37/50] step=1120 train_loss=0.0124 tok_s=59144.5 opt_steps=1120 +[epoch 37/50] step=1140 train_loss=0.0124 tok_s=59158.4 opt_steps=1140 +[epoch 37/50] step=1160 train_loss=0.0125 tok_s=59175.3 opt_steps=1160 +[epoch 37/50] step=1180 train_loss=0.0125 tok_s=59187.2 opt_steps=1180 +[epoch 37/50] step=1200 train_loss=0.0125 tok_s=59201.7 opt_steps=1200 +[epoch 37/50] step=1220 train_loss=0.0125 tok_s=59213.6 opt_steps=1220 +[epoch 37/50] step=1240 train_loss=0.0126 tok_s=59219.0 opt_steps=1240 +[epoch 37/50] step=1260 train_loss=0.0126 tok_s=59234.6 opt_steps=1260 +[epoch 37/50] step=1280 train_loss=0.0126 tok_s=59242.0 opt_steps=1280 +[epoch 37/50] step=1300 train_loss=0.0126 tok_s=59254.2 opt_steps=1300 +[epoch 37/50] step=1320 train_loss=0.0126 tok_s=59260.5 opt_steps=1320 +[epoch 37/50] step=1340 train_loss=0.0127 tok_s=59262.5 opt_steps=1340 +[epoch 37/50] step=1360 train_loss=0.0127 tok_s=59274.5 opt_steps=1360 +[epoch 37/50] step=1380 train_loss=0.0127 tok_s=59274.4 opt_steps=1380 +[epoch 37/50] step=1400 train_loss=0.0127 tok_s=59281.6 opt_steps=1400 +[epoch 37/50] step=1420 train_loss=0.0127 tok_s=59287.1 opt_steps=1420 +[epoch 37/50] step=1440 train_loss=0.0127 tok_s=59290.6 opt_steps=1440 +[epoch 37/50] step=1460 train_loss=0.0128 tok_s=59297.9 opt_steps=1460 +[epoch 37/50] step=1480 train_loss=0.0128 tok_s=59307.0 opt_steps=1480 +[epoch 37/50] step=1500 train_loss=0.0128 tok_s=59316.2 opt_steps=1500 +[epoch 37/50] step=1520 train_loss=0.0128 tok_s=59329.4 opt_steps=1520 +[epoch 37/50] step=1540 train_loss=0.0128 tok_s=59338.3 opt_steps=1540 +[epoch 37/50] step=1560 train_loss=0.0128 tok_s=59348.0 opt_steps=1560 +[epoch 37/50] step=1580 train_loss=0.0128 tok_s=59354.7 opt_steps=1580 +[epoch 37/50] step=1600 train_loss=0.0129 tok_s=59363.9 opt_steps=1600 +[epoch 37/50] step=1620 train_loss=0.0129 tok_s=59373.3 opt_steps=1620 +[epoch 37/50] step=1640 train_loss=0.0129 tok_s=59377.2 opt_steps=1640 +[epoch 37/50] step=1660 train_loss=0.0129 tok_s=59386.7 opt_steps=1660 +[epoch 37/50] step=1680 train_loss=0.0129 tok_s=59395.7 opt_steps=1680 +[epoch 37/50] step=1700 train_loss=0.0129 tok_s=59404.8 opt_steps=1700 +[epoch 37/50] step=1720 train_loss=0.0129 tok_s=59412.2 opt_steps=1720 +[epoch 37/50] step=1740 train_loss=0.0129 tok_s=59420.8 opt_steps=1740 +[epoch 37/50] step=1760 train_loss=0.0129 tok_s=59429.2 opt_steps=1760 +[epoch 37/50] step=1780 train_loss=0.0130 tok_s=59431.9 opt_steps=1780 +[epoch 37/50] step=1800 train_loss=0.0130 tok_s=59440.6 opt_steps=1800 +[epoch 37/50] step=1820 train_loss=0.0130 tok_s=59446.3 opt_steps=1820 +[epoch 37/50] step=1840 train_loss=0.0130 tok_s=59453.7 opt_steps=1840 +[epoch 37/50] step=1860 train_loss=0.0130 tok_s=59467.5 opt_steps=1860 +[epoch 37/50] step=1880 train_loss=0.0130 tok_s=59472.7 opt_steps=1880 +[epoch 37/50] step=1900 train_loss=0.0130 tok_s=59484.2 opt_steps=1900 +[epoch 37/50] step=1920 train_loss=0.0130 tok_s=59492.6 opt_steps=1920 +[epoch 37/50] step=1940 train_loss=0.0131 tok_s=59503.1 opt_steps=1940 +[epoch 37/50] step=1960 train_loss=0.0131 tok_s=59509.3 opt_steps=1960 +[epoch 37/50] step=1980 train_loss=0.0131 tok_s=59513.8 opt_steps=1980 +[epoch 37/50] step=2000 train_loss=0.0131 tok_s=59524.0 opt_steps=2000 +[epoch 37/50] step=2020 train_loss=0.0131 tok_s=59527.8 opt_steps=2020 +[epoch 37/50] step=2040 train_loss=0.0131 tok_s=59532.2 opt_steps=2040 +[epoch 37/50] step=2060 train_loss=0.0132 tok_s=59538.5 opt_steps=2060 +[epoch 37/50] step=2080 train_loss=0.0132 tok_s=59546.1 opt_steps=2080 +[epoch 37/50] step=2100 train_loss=0.0132 tok_s=59546.5 opt_steps=2100 +[epoch 37/50] step=2120 train_loss=0.0132 tok_s=59549.2 opt_steps=2120 +[epoch 37/50] step=2140 train_loss=0.0132 tok_s=59552.6 opt_steps=2140 +[epoch 37/50] step=2160 train_loss=0.0132 tok_s=59553.1 opt_steps=2160 +[epoch 37/50] step=2180 train_loss=0.0132 tok_s=59555.6 opt_steps=2180 +[epoch 37/50] step=2200 train_loss=0.0132 tok_s=59561.9 opt_steps=2200 +[epoch 37/50] step=2220 train_loss=0.0132 tok_s=59566.4 opt_steps=2220 +[epoch 37/50] step=2240 train_loss=0.0133 tok_s=59572.1 opt_steps=2240 +[epoch 37/50] step=2260 train_loss=0.0133 tok_s=59575.2 opt_steps=2260 +[epoch 37/50] step=2280 train_loss=0.0133 tok_s=59576.9 opt_steps=2280 +[epoch 37/50] step=2300 train_loss=0.0133 tok_s=59579.6 opt_steps=2300 +[epoch 37/50] step=2320 train_loss=0.0133 tok_s=59585.1 opt_steps=2320 +[epoch 37/50] step=2340 train_loss=0.0133 tok_s=59589.7 opt_steps=2340 +[epoch 37/50] step=2360 train_loss=0.0133 tok_s=59593.6 opt_steps=2360 +[epoch 37/50] step=2380 train_loss=0.0133 tok_s=59594.8 opt_steps=2380 +[epoch 37/50] step=2400 train_loss=0.0134 tok_s=59598.0 opt_steps=2400 +[epoch 37/50] step=2420 train_loss=0.0134 tok_s=59606.8 opt_steps=2420 +[epoch 37/50] step=2440 train_loss=0.0134 tok_s=59612.9 opt_steps=2440 +[epoch 37/50] step=2460 train_loss=0.0134 tok_s=59616.5 opt_steps=2460 +[epoch 37/50] step=2480 train_loss=0.0134 tok_s=59620.7 opt_steps=2480 +[epoch 37/50] step=2500 train_loss=0.0134 tok_s=59622.8 opt_steps=2500 +[epoch 37/50] step=2520 train_loss=0.0134 tok_s=59623.8 opt_steps=2520 +[epoch 37/50] step=2540 train_loss=0.0134 tok_s=59624.4 opt_steps=2540 +[epoch 37/50] step=2560 train_loss=0.0134 tok_s=59626.2 opt_steps=2560 +[epoch 37/50] step=2580 train_loss=0.0134 tok_s=59627.1 opt_steps=2580 +[epoch 37/50] step=2600 train_loss=0.0135 tok_s=59628.1 opt_steps=2600 +[epoch 37/50] step=2620 train_loss=0.0135 tok_s=59630.5 opt_steps=2620 +[epoch 37/50] step=2640 train_loss=0.0135 tok_s=59632.7 opt_steps=2640 +[epoch 37/50] step=2660 train_loss=0.0135 tok_s=59636.6 opt_steps=2660 +[epoch 37/50] step=2680 train_loss=0.0135 tok_s=59636.3 opt_steps=2680 +[epoch 37/50] step=2700 train_loss=0.0135 tok_s=59634.9 opt_steps=2700 +[epoch 37/50] step=2720 train_loss=0.0135 tok_s=59634.7 opt_steps=2720 +[epoch 37/50] step=2740 train_loss=0.0135 tok_s=59634.8 opt_steps=2740 +[epoch 37/50] step=2760 train_loss=0.0135 tok_s=59636.1 opt_steps=2760 +[epoch 37/50] step=2780 train_loss=0.0136 tok_s=59637.7 opt_steps=2780 +[epoch 37/50] step=2800 train_loss=0.0136 tok_s=59639.3 opt_steps=2800 +[epoch 37/50] step=2820 train_loss=0.0136 tok_s=59639.3 opt_steps=2820 +[epoch 37/50] step=2840 train_loss=0.0136 tok_s=59643.5 opt_steps=2840 +[epoch 37/50] step=2860 train_loss=0.0136 tok_s=59648.2 opt_steps=2860 +[epoch 37/50] step=2880 train_loss=0.0136 tok_s=59653.8 opt_steps=2880 +[epoch 37/50] step=2900 train_loss=0.0136 tok_s=59658.1 opt_steps=2900 +[epoch 37/50] step=2920 train_loss=0.0136 tok_s=59662.7 opt_steps=2920 +[epoch 37/50] step=2940 train_loss=0.0136 tok_s=59663.4 opt_steps=2940 +[epoch 37/50] step=2960 train_loss=0.0136 tok_s=59664.9 opt_steps=2960 +[epoch 37/50] step=2980 train_loss=0.0137 tok_s=59668.1 opt_steps=2980 +[epoch 37/50] step=3000 train_loss=0.0137 tok_s=59669.1 opt_steps=3000 +[epoch 37/50] step=3020 train_loss=0.0137 tok_s=59667.2 opt_steps=3020 +[epoch 37/50] step=3040 train_loss=0.0137 tok_s=59668.4 opt_steps=3040 +[epoch 37/50] step=3060 train_loss=0.0137 tok_s=59671.5 opt_steps=3060 +[epoch 37/50] step=3080 train_loss=0.0137 tok_s=59675.1 opt_steps=3080 +[epoch 37/50] step=3100 train_loss=0.0137 tok_s=59680.8 opt_steps=3100 +[epoch 37/50] step=3120 train_loss=0.0137 tok_s=59681.7 opt_steps=3120 +[epoch 37/50] step=3140 train_loss=0.0137 tok_s=59685.5 opt_steps=3140 +[epoch 37/50] step=3160 train_loss=0.0137 tok_s=59688.9 opt_steps=3160 +[epoch 37/50] step=3180 train_loss=0.0138 tok_s=59695.0 opt_steps=3180 +[epoch 37/50] step=3200 train_loss=0.0138 tok_s=59696.9 opt_steps=3200 +[epoch 37/50] step=3220 train_loss=0.0138 tok_s=59699.7 opt_steps=3220 +[epoch 37/50] step=3240 train_loss=0.0138 tok_s=59702.6 opt_steps=3240 +[epoch 37/50] step=3260 train_loss=0.0138 tok_s=59696.6 opt_steps=3260 +[epoch 37/50] train_loss=0.0138 val_skipped tok_s=59700.6 opt_steps=3273 +[epoch 38/50] step=20 train_loss=0.0118 tok_s=58231.3 opt_steps=20 +[epoch 38/50] step=40 train_loss=0.0115 tok_s=59014.1 opt_steps=40 +[epoch 38/50] step=60 train_loss=0.0113 tok_s=59578.8 opt_steps=60 +[epoch 38/50] step=80 train_loss=0.0112 tok_s=59713.9 opt_steps=80 +[epoch 38/50] step=100 train_loss=0.0111 tok_s=59858.1 opt_steps=100 +[epoch 38/50] step=120 train_loss=0.0111 tok_s=59949.6 opt_steps=120 +[epoch 38/50] step=140 train_loss=0.0111 tok_s=59921.1 opt_steps=140 +[epoch 38/50] step=160 train_loss=0.0110 tok_s=59888.5 opt_steps=160 +[epoch 38/50] step=180 train_loss=0.0110 tok_s=59911.2 opt_steps=180 +[epoch 38/50] step=200 train_loss=0.0110 tok_s=59934.5 opt_steps=200 +[epoch 38/50] step=220 train_loss=0.0110 tok_s=59977.9 opt_steps=220 +[epoch 38/50] step=240 train_loss=0.0110 tok_s=60014.3 opt_steps=240 +[epoch 38/50] step=260 train_loss=0.0110 tok_s=59998.0 opt_steps=260 +[epoch 38/50] step=280 train_loss=0.0111 tok_s=60001.1 opt_steps=280 +[epoch 38/50] step=300 train_loss=0.0111 tok_s=60049.4 opt_steps=300 +[epoch 38/50] step=320 train_loss=0.0112 tok_s=60035.6 opt_steps=320 +[epoch 38/50] step=340 train_loss=0.0112 tok_s=60044.7 opt_steps=340 +[epoch 38/50] step=360 train_loss=0.0112 tok_s=60043.3 opt_steps=360 +[epoch 38/50] step=380 train_loss=0.0113 tok_s=60042.5 opt_steps=380 +[epoch 38/50] step=400 train_loss=0.0113 tok_s=60029.7 opt_steps=400 +[epoch 38/50] step=420 train_loss=0.0113 tok_s=60036.0 opt_steps=420 +[epoch 38/50] step=440 train_loss=0.0113 tok_s=60015.5 opt_steps=440 +[epoch 38/50] step=460 train_loss=0.0114 tok_s=60013.0 opt_steps=460 +[epoch 38/50] step=480 train_loss=0.0114 tok_s=60014.1 opt_steps=480 +[epoch 38/50] step=500 train_loss=0.0114 tok_s=60000.0 opt_steps=500 +[epoch 38/50] step=520 train_loss=0.0114 tok_s=60011.0 opt_steps=520 +[epoch 38/50] step=540 train_loss=0.0115 tok_s=60009.1 opt_steps=540 +[epoch 38/50] step=560 train_loss=0.0115 tok_s=60022.0 opt_steps=560 +[epoch 38/50] step=580 train_loss=0.0115 tok_s=60020.6 opt_steps=580 +[epoch 38/50] step=600 train_loss=0.0115 tok_s=60015.6 opt_steps=600 +[epoch 38/50] step=620 train_loss=0.0115 tok_s=60029.0 opt_steps=620 +[epoch 38/50] step=640 train_loss=0.0116 tok_s=60033.0 opt_steps=640 +[epoch 38/50] step=660 train_loss=0.0116 tok_s=60039.3 opt_steps=660 +[epoch 38/50] step=680 train_loss=0.0116 tok_s=60052.2 opt_steps=680 +[epoch 38/50] step=700 train_loss=0.0116 tok_s=60058.3 opt_steps=700 +[epoch 38/50] step=720 train_loss=0.0117 tok_s=60061.2 opt_steps=720 +[epoch 38/50] step=740 train_loss=0.0117 tok_s=60068.2 opt_steps=740 +[epoch 38/50] step=760 train_loss=0.0117 tok_s=60066.1 opt_steps=760 +[epoch 38/50] step=780 train_loss=0.0117 tok_s=60088.3 opt_steps=780 +[epoch 38/50] step=800 train_loss=0.0117 tok_s=60086.0 opt_steps=800 +[epoch 38/50] step=820 train_loss=0.0118 tok_s=60085.3 opt_steps=820 +[epoch 38/50] step=840 train_loss=0.0118 tok_s=60082.0 opt_steps=840 +[epoch 38/50] step=860 train_loss=0.0118 tok_s=60091.4 opt_steps=860 +[epoch 38/50] step=880 train_loss=0.0118 tok_s=60099.4 opt_steps=880 +[epoch 38/50] step=900 train_loss=0.0118 tok_s=60052.2 opt_steps=900 +[epoch 38/50] step=920 train_loss=0.0118 tok_s=60057.8 opt_steps=920 +[epoch 38/50] step=940 train_loss=0.0119 tok_s=60062.6 opt_steps=940 +[epoch 38/50] step=960 train_loss=0.0119 tok_s=60048.4 opt_steps=960 +[epoch 38/50] step=980 train_loss=0.0119 tok_s=60027.6 opt_steps=980 +[epoch 38/50] step=1000 train_loss=0.0119 tok_s=60035.7 opt_steps=1000 +[epoch 38/50] step=1020 train_loss=0.0119 tok_s=60029.1 opt_steps=1020 +[epoch 38/50] step=1040 train_loss=0.0119 tok_s=60035.8 opt_steps=1040 +[epoch 38/50] step=1060 train_loss=0.0120 tok_s=60040.3 opt_steps=1060 +[epoch 38/50] step=1080 train_loss=0.0120 tok_s=60034.1 opt_steps=1080 +[epoch 38/50] step=1100 train_loss=0.0120 tok_s=60035.1 opt_steps=1100 +[epoch 38/50] step=1120 train_loss=0.0120 tok_s=60046.5 opt_steps=1120 +[epoch 38/50] step=1140 train_loss=0.0120 tok_s=60049.8 opt_steps=1140 +[epoch 38/50] step=1160 train_loss=0.0120 tok_s=60041.4 opt_steps=1160 +[epoch 38/50] step=1180 train_loss=0.0121 tok_s=60033.6 opt_steps=1180 +[epoch 38/50] step=1200 train_loss=0.0121 tok_s=60036.0 opt_steps=1200 +[epoch 38/50] step=1220 train_loss=0.0121 tok_s=60038.8 opt_steps=1220 +[epoch 38/50] step=1240 train_loss=0.0121 tok_s=60042.9 opt_steps=1240 +[epoch 38/50] step=1260 train_loss=0.0121 tok_s=60051.2 opt_steps=1260 +[epoch 38/50] step=1280 train_loss=0.0121 tok_s=60050.6 opt_steps=1280 +[epoch 38/50] step=1300 train_loss=0.0121 tok_s=60051.0 opt_steps=1300 +[epoch 38/50] step=1320 train_loss=0.0122 tok_s=60044.9 opt_steps=1320 +[epoch 38/50] step=1340 train_loss=0.0122 tok_s=60046.7 opt_steps=1340 +[epoch 38/50] step=1360 train_loss=0.0122 tok_s=60049.3 opt_steps=1360 +[epoch 38/50] step=1380 train_loss=0.0122 tok_s=60047.2 opt_steps=1380 +[epoch 38/50] step=1400 train_loss=0.0122 tok_s=60041.2 opt_steps=1400 +[epoch 38/50] step=1420 train_loss=0.0122 tok_s=60042.7 opt_steps=1420 +[epoch 38/50] step=1440 train_loss=0.0123 tok_s=60046.4 opt_steps=1440 +[epoch 38/50] step=1460 train_loss=0.0123 tok_s=60047.3 opt_steps=1460 +[epoch 38/50] step=1480 train_loss=0.0123 tok_s=60052.2 opt_steps=1480 +[epoch 38/50] step=1500 train_loss=0.0123 tok_s=60061.2 opt_steps=1500 +[epoch 38/50] step=1520 train_loss=0.0123 tok_s=60065.1 opt_steps=1520 +[epoch 38/50] step=1540 train_loss=0.0123 tok_s=60063.6 opt_steps=1540 +[epoch 38/50] step=1560 train_loss=0.0123 tok_s=60064.9 opt_steps=1560 +[epoch 38/50] step=1580 train_loss=0.0124 tok_s=60068.5 opt_steps=1580 +[epoch 38/50] step=1600 train_loss=0.0124 tok_s=60070.3 opt_steps=1600 +[epoch 38/50] step=1620 train_loss=0.0124 tok_s=60073.3 opt_steps=1620 +[epoch 38/50] step=1640 train_loss=0.0124 tok_s=60073.5 opt_steps=1640 +[epoch 38/50] step=1660 train_loss=0.0124 tok_s=60076.1 opt_steps=1660 +[epoch 38/50] step=1680 train_loss=0.0124 tok_s=60071.9 opt_steps=1680 +[epoch 38/50] step=1700 train_loss=0.0124 tok_s=60071.7 opt_steps=1700 +[epoch 38/50] step=1720 train_loss=0.0125 tok_s=60072.2 opt_steps=1720 +[epoch 38/50] step=1740 train_loss=0.0125 tok_s=60077.4 opt_steps=1740 +[epoch 38/50] step=1760 train_loss=0.0125 tok_s=60074.9 opt_steps=1760 +[epoch 38/50] step=1780 train_loss=0.0125 tok_s=60067.9 opt_steps=1780 +[epoch 38/50] step=1800 train_loss=0.0125 tok_s=60070.7 opt_steps=1800 +[epoch 38/50] step=1820 train_loss=0.0126 tok_s=60072.8 opt_steps=1820 +[epoch 38/50] step=1840 train_loss=0.0126 tok_s=60066.1 opt_steps=1840 +[epoch 38/50] step=1860 train_loss=0.0126 tok_s=60066.3 opt_steps=1860 +[epoch 38/50] step=1880 train_loss=0.0126 tok_s=60067.6 opt_steps=1880 +[epoch 38/50] step=1900 train_loss=0.0126 tok_s=60068.5 opt_steps=1900 +[epoch 38/50] step=1920 train_loss=0.0126 tok_s=60069.1 opt_steps=1920 +[epoch 38/50] step=1940 train_loss=0.0126 tok_s=60072.8 opt_steps=1940 +[epoch 38/50] step=1960 train_loss=0.0126 tok_s=60068.6 opt_steps=1960 +[epoch 38/50] step=1980 train_loss=0.0127 tok_s=60072.2 opt_steps=1980 +[epoch 38/50] step=2000 train_loss=0.0127 tok_s=60069.9 opt_steps=2000 +[epoch 38/50] step=2020 train_loss=0.0127 tok_s=60073.5 opt_steps=2020 +[epoch 38/50] step=2040 train_loss=0.0127 tok_s=60075.4 opt_steps=2040 +[epoch 38/50] step=2060 train_loss=0.0127 tok_s=60075.3 opt_steps=2060 +[epoch 38/50] step=2080 train_loss=0.0127 tok_s=60072.0 opt_steps=2080 +[epoch 38/50] step=2100 train_loss=0.0127 tok_s=60072.9 opt_steps=2100 +[epoch 38/50] step=2120 train_loss=0.0128 tok_s=60069.2 opt_steps=2120 +[epoch 38/50] step=2140 train_loss=0.0128 tok_s=60067.6 opt_steps=2140 +[epoch 38/50] step=2160 train_loss=0.0128 tok_s=60071.4 opt_steps=2160 +[epoch 38/50] step=2180 train_loss=0.0128 tok_s=60072.3 opt_steps=2180 +[epoch 38/50] step=2200 train_loss=0.0128 tok_s=60073.9 opt_steps=2200 +[epoch 38/50] step=2220 train_loss=0.0128 tok_s=60074.9 opt_steps=2220 +[epoch 38/50] step=2240 train_loss=0.0129 tok_s=60076.6 opt_steps=2240 +[epoch 38/50] step=2260 train_loss=0.0129 tok_s=60073.7 opt_steps=2260 +[epoch 38/50] step=2280 train_loss=0.0129 tok_s=60073.4 opt_steps=2280 +[epoch 38/50] step=2300 train_loss=0.0129 tok_s=60077.2 opt_steps=2300 +[epoch 38/50] step=2320 train_loss=0.0129 tok_s=60077.3 opt_steps=2320 +[epoch 38/50] step=2340 train_loss=0.0129 tok_s=60078.9 opt_steps=2340 +[epoch 38/50] step=2360 train_loss=0.0129 tok_s=60077.8 opt_steps=2360 +[epoch 38/50] step=2380 train_loss=0.0130 tok_s=60077.7 opt_steps=2380 +[epoch 38/50] step=2400 train_loss=0.0130 tok_s=60080.1 opt_steps=2400 +[epoch 38/50] step=2420 train_loss=0.0130 tok_s=60082.9 opt_steps=2420 +[epoch 38/50] step=2440 train_loss=0.0130 tok_s=60080.7 opt_steps=2440 +[epoch 38/50] step=2460 train_loss=0.0130 tok_s=60084.8 opt_steps=2460 +[epoch 38/50] step=2480 train_loss=0.0130 tok_s=60087.4 opt_steps=2480 +[epoch 38/50] step=2500 train_loss=0.0130 tok_s=60091.3 opt_steps=2500 +[epoch 38/50] step=2520 train_loss=0.0130 tok_s=60095.2 opt_steps=2520 +[epoch 38/50] step=2540 train_loss=0.0130 tok_s=60095.0 opt_steps=2540 +[epoch 38/50] step=2560 train_loss=0.0130 tok_s=60092.3 opt_steps=2560 +[epoch 38/50] step=2580 train_loss=0.0131 tok_s=60099.1 opt_steps=2580 +[epoch 38/50] step=2600 train_loss=0.0131 tok_s=60099.7 opt_steps=2600 +[epoch 38/50] step=2620 train_loss=0.0131 tok_s=60102.4 opt_steps=2620 +[epoch 38/50] step=2640 train_loss=0.0131 tok_s=60103.4 opt_steps=2640 +[epoch 38/50] step=2660 train_loss=0.0131 tok_s=60099.8 opt_steps=2660 +[epoch 38/50] step=2680 train_loss=0.0131 tok_s=60101.6 opt_steps=2680 +[epoch 38/50] step=2700 train_loss=0.0131 tok_s=60103.9 opt_steps=2700 +[epoch 38/50] step=2720 train_loss=0.0131 tok_s=60105.4 opt_steps=2720 +[epoch 38/50] step=2740 train_loss=0.0131 tok_s=60104.5 opt_steps=2740 +[epoch 38/50] step=2760 train_loss=0.0132 tok_s=60103.5 opt_steps=2760 +[epoch 38/50] step=2780 train_loss=0.0132 tok_s=60107.8 opt_steps=2780 +[epoch 38/50] step=2800 train_loss=0.0132 tok_s=60107.8 opt_steps=2800 +[epoch 38/50] step=2820 train_loss=0.0132 tok_s=60107.2 opt_steps=2820 +[epoch 38/50] step=2840 train_loss=0.0132 tok_s=60109.5 opt_steps=2840 +[epoch 38/50] step=2860 train_loss=0.0132 tok_s=60107.9 opt_steps=2860 +[epoch 38/50] step=2880 train_loss=0.0132 tok_s=60106.4 opt_steps=2880 +[epoch 38/50] step=2900 train_loss=0.0132 tok_s=60102.1 opt_steps=2900 +[epoch 38/50] step=2920 train_loss=0.0132 tok_s=60103.0 opt_steps=2920 +[epoch 38/50] step=2940 train_loss=0.0132 tok_s=60100.1 opt_steps=2940 +[epoch 38/50] step=2960 train_loss=0.0133 tok_s=60100.2 opt_steps=2960 +[epoch 38/50] step=2980 train_loss=0.0133 tok_s=60101.3 opt_steps=2980 +[epoch 38/50] step=3000 train_loss=0.0133 tok_s=60102.5 opt_steps=3000 +[epoch 38/50] step=3020 train_loss=0.0133 tok_s=60100.5 opt_steps=3020 +[epoch 38/50] step=3040 train_loss=0.0133 tok_s=60104.8 opt_steps=3040 +[epoch 38/50] step=3060 train_loss=0.0133 tok_s=60105.5 opt_steps=3060 +[epoch 38/50] step=3080 train_loss=0.0133 tok_s=60106.9 opt_steps=3080 +[epoch 38/50] step=3100 train_loss=0.0133 tok_s=60107.1 opt_steps=3100 +[epoch 38/50] step=3120 train_loss=0.0133 tok_s=60106.1 opt_steps=3120 +[epoch 38/50] step=3140 train_loss=0.0133 tok_s=60106.5 opt_steps=3140 +[epoch 38/50] step=3160 train_loss=0.0133 tok_s=60105.6 opt_steps=3160 +[epoch 38/50] step=3180 train_loss=0.0133 tok_s=60106.4 opt_steps=3180 +[epoch 38/50] step=3200 train_loss=0.0134 tok_s=60107.1 opt_steps=3200 +[epoch 38/50] step=3220 train_loss=0.0134 tok_s=60106.2 opt_steps=3220 +[epoch 38/50] step=3240 train_loss=0.0134 tok_s=60109.0 opt_steps=3240 +[epoch 38/50] step=3260 train_loss=0.0134 tok_s=60103.7 opt_steps=3260 +[epoch 38/50] train_loss=0.0134 val_skipped tok_s=60103.3 opt_steps=3273 +[epoch 39/50] step=20 train_loss=0.0109 tok_s=58011.5 opt_steps=20 +[epoch 39/50] step=40 train_loss=0.0110 tok_s=59033.1 opt_steps=40 +[epoch 39/50] step=60 train_loss=0.0109 tok_s=59272.5 opt_steps=60 +[epoch 39/50] step=80 train_loss=0.0106 tok_s=59500.3 opt_steps=80 +[epoch 39/50] step=100 train_loss=0.0108 tok_s=59673.9 opt_steps=100 +[epoch 39/50] step=120 train_loss=0.0107 tok_s=59759.0 opt_steps=120 +[epoch 39/50] step=140 train_loss=0.0107 tok_s=59815.2 opt_steps=140 +[epoch 39/50] step=160 train_loss=0.0108 tok_s=59866.0 opt_steps=160 +[epoch 39/50] step=180 train_loss=0.0107 tok_s=59882.5 opt_steps=180 +[epoch 39/50] step=200 train_loss=0.0108 tok_s=59917.9 opt_steps=200 +[epoch 39/50] step=220 train_loss=0.0108 tok_s=60004.1 opt_steps=220 +[epoch 39/50] step=240 train_loss=0.0108 tok_s=60012.8 opt_steps=240 +[epoch 39/50] step=260 train_loss=0.0109 tok_s=60019.2 opt_steps=260 +[epoch 39/50] step=280 train_loss=0.0110 tok_s=60032.9 opt_steps=280 +[epoch 39/50] step=300 train_loss=0.0110 tok_s=60032.8 opt_steps=300 +[epoch 39/50] step=320 train_loss=0.0110 tok_s=60066.9 opt_steps=320 +[epoch 39/50] step=340 train_loss=0.0111 tok_s=60045.1 opt_steps=340 +[epoch 39/50] step=360 train_loss=0.0111 tok_s=60045.2 opt_steps=360 +[epoch 39/50] step=380 train_loss=0.0111 tok_s=60042.1 opt_steps=380 +[epoch 39/50] step=400 train_loss=0.0111 tok_s=60057.3 opt_steps=400 +[epoch 39/50] step=420 train_loss=0.0111 tok_s=60065.3 opt_steps=420 +[epoch 39/50] step=440 train_loss=0.0111 tok_s=60073.6 opt_steps=440 +[epoch 39/50] step=460 train_loss=0.0111 tok_s=60087.4 opt_steps=460 +[epoch 39/50] step=480 train_loss=0.0111 tok_s=60104.5 opt_steps=480 +[epoch 39/50] step=500 train_loss=0.0112 tok_s=60104.9 opt_steps=500 +[epoch 39/50] step=520 train_loss=0.0112 tok_s=60107.2 opt_steps=520 +[epoch 39/50] step=540 train_loss=0.0112 tok_s=60111.7 opt_steps=540 +[epoch 39/50] step=560 train_loss=0.0112 tok_s=60117.4 opt_steps=560 +[epoch 39/50] step=580 train_loss=0.0112 tok_s=60114.8 opt_steps=580 +[epoch 39/50] step=600 train_loss=0.0113 tok_s=60116.0 opt_steps=600 +[epoch 39/50] step=620 train_loss=0.0113 tok_s=60126.3 opt_steps=620 +[epoch 39/50] step=640 train_loss=0.0113 tok_s=60120.0 opt_steps=640 +[epoch 39/50] step=660 train_loss=0.0113 tok_s=60126.0 opt_steps=660 +[epoch 39/50] step=680 train_loss=0.0113 tok_s=60133.7 opt_steps=680 +[epoch 39/50] step=700 train_loss=0.0114 tok_s=60139.3 opt_steps=700 +[epoch 39/50] step=720 train_loss=0.0114 tok_s=60137.8 opt_steps=720 +[epoch 39/50] step=740 train_loss=0.0114 tok_s=60135.0 opt_steps=740 +[epoch 39/50] step=760 train_loss=0.0114 tok_s=60129.6 opt_steps=760 +[epoch 39/50] step=780 train_loss=0.0114 tok_s=60129.8 opt_steps=780 +[epoch 39/50] step=800 train_loss=0.0114 tok_s=60132.9 opt_steps=800 +[epoch 39/50] step=820 train_loss=0.0115 tok_s=60126.1 opt_steps=820 +[epoch 39/50] step=840 train_loss=0.0115 tok_s=60127.7 opt_steps=840 +[epoch 39/50] step=860 train_loss=0.0115 tok_s=60124.7 opt_steps=860 +[epoch 39/50] step=880 train_loss=0.0115 tok_s=60130.8 opt_steps=880 +[epoch 39/50] step=900 train_loss=0.0115 tok_s=60133.9 opt_steps=900 +[epoch 39/50] step=920 train_loss=0.0116 tok_s=60133.6 opt_steps=920 +[epoch 39/50] step=940 train_loss=0.0116 tok_s=60137.3 opt_steps=940 +[epoch 39/50] step=960 train_loss=0.0116 tok_s=60148.3 opt_steps=960 +[epoch 39/50] step=980 train_loss=0.0116 tok_s=60151.2 opt_steps=980 +[epoch 39/50] step=1000 train_loss=0.0116 tok_s=60148.0 opt_steps=1000 +[epoch 39/50] step=1020 train_loss=0.0116 tok_s=60145.4 opt_steps=1020 +[epoch 39/50] step=1040 train_loss=0.0117 tok_s=60148.8 opt_steps=1040 +[epoch 39/50] step=1060 train_loss=0.0117 tok_s=60155.4 opt_steps=1060 +[epoch 39/50] step=1080 train_loss=0.0117 tok_s=60154.8 opt_steps=1080 +[epoch 39/50] step=1100 train_loss=0.0117 tok_s=60152.5 opt_steps=1100 +[epoch 39/50] step=1120 train_loss=0.0118 tok_s=60149.7 opt_steps=1120 +[epoch 39/50] step=1140 train_loss=0.0118 tok_s=60142.5 opt_steps=1140 +[epoch 39/50] step=1160 train_loss=0.0118 tok_s=60135.4 opt_steps=1160 +[epoch 39/50] step=1180 train_loss=0.0118 tok_s=60138.9 opt_steps=1180 +[epoch 39/50] step=1200 train_loss=0.0118 tok_s=60130.1 opt_steps=1200 +[epoch 39/50] step=1220 train_loss=0.0118 tok_s=60130.1 opt_steps=1220 +[epoch 39/50] step=1240 train_loss=0.0118 tok_s=60126.3 opt_steps=1240 +[epoch 39/50] step=1260 train_loss=0.0118 tok_s=60128.4 opt_steps=1260 +[epoch 39/50] step=1280 train_loss=0.0119 tok_s=60135.8 opt_steps=1280 +[epoch 39/50] step=1300 train_loss=0.0119 tok_s=60138.0 opt_steps=1300 +[epoch 39/50] step=1320 train_loss=0.0119 tok_s=60144.0 opt_steps=1320 +[epoch 39/50] step=1340 train_loss=0.0119 tok_s=60144.8 opt_steps=1340 +[epoch 39/50] step=1360 train_loss=0.0119 tok_s=60151.4 opt_steps=1360 +[epoch 39/50] step=1380 train_loss=0.0120 tok_s=60154.6 opt_steps=1380 +[epoch 39/50] step=1400 train_loss=0.0120 tok_s=60145.6 opt_steps=1400 +[epoch 39/50] step=1420 train_loss=0.0120 tok_s=60135.3 opt_steps=1420 +[epoch 39/50] step=1440 train_loss=0.0120 tok_s=60135.4 opt_steps=1440 +[epoch 39/50] step=1460 train_loss=0.0120 tok_s=60138.7 opt_steps=1460 +[epoch 39/50] step=1480 train_loss=0.0120 tok_s=60144.3 opt_steps=1480 +[epoch 39/50] step=1500 train_loss=0.0120 tok_s=60139.7 opt_steps=1500 +[epoch 39/50] step=1520 train_loss=0.0120 tok_s=60144.1 opt_steps=1520 +[epoch 39/50] step=1540 train_loss=0.0121 tok_s=60144.9 opt_steps=1540 +[epoch 39/50] step=1560 train_loss=0.0121 tok_s=60144.0 opt_steps=1560 +[epoch 39/50] step=1580 train_loss=0.0121 tok_s=60146.7 opt_steps=1580 +[epoch 39/50] step=1600 train_loss=0.0121 tok_s=60153.0 opt_steps=1600 +[epoch 39/50] step=1620 train_loss=0.0121 tok_s=60149.8 opt_steps=1620 +[epoch 39/50] step=1640 train_loss=0.0121 tok_s=60139.6 opt_steps=1640 +[epoch 39/50] step=1660 train_loss=0.0121 tok_s=60138.3 opt_steps=1660 +[epoch 39/50] step=1680 train_loss=0.0122 tok_s=60144.0 opt_steps=1680 +[epoch 39/50] step=1700 train_loss=0.0122 tok_s=60138.9 opt_steps=1700 +[epoch 39/50] step=1720 train_loss=0.0122 tok_s=60138.3 opt_steps=1720 +[epoch 39/50] step=1740 train_loss=0.0122 tok_s=60139.0 opt_steps=1740 +[epoch 39/50] step=1760 train_loss=0.0122 tok_s=60140.4 opt_steps=1760 +[epoch 39/50] step=1780 train_loss=0.0123 tok_s=60140.6 opt_steps=1780 +[epoch 39/50] step=1800 train_loss=0.0123 tok_s=60141.5 opt_steps=1800 +[epoch 39/50] step=1820 train_loss=0.0123 tok_s=60138.9 opt_steps=1820 +[epoch 39/50] step=1840 train_loss=0.0123 tok_s=60136.1 opt_steps=1840 +[epoch 39/50] step=1860 train_loss=0.0123 tok_s=60144.4 opt_steps=1860 +[epoch 39/50] step=1880 train_loss=0.0123 tok_s=60142.5 opt_steps=1880 +[epoch 39/50] step=1900 train_loss=0.0123 tok_s=60139.2 opt_steps=1900 +[epoch 39/50] step=1920 train_loss=0.0124 tok_s=60144.0 opt_steps=1920 +[epoch 39/50] step=1940 train_loss=0.0124 tok_s=60141.4 opt_steps=1940 +[epoch 39/50] step=1960 train_loss=0.0124 tok_s=60133.5 opt_steps=1960 +[epoch 39/50] step=1980 train_loss=0.0124 tok_s=60132.4 opt_steps=1980 +[epoch 39/50] step=2000 train_loss=0.0124 tok_s=60132.1 opt_steps=2000 +[epoch 39/50] step=2020 train_loss=0.0124 tok_s=60132.6 opt_steps=2020 +[epoch 39/50] step=2040 train_loss=0.0124 tok_s=60130.8 opt_steps=2040 +[epoch 39/50] step=2060 train_loss=0.0125 tok_s=60117.8 opt_steps=2060 +[epoch 39/50] step=2080 train_loss=0.0125 tok_s=60113.9 opt_steps=2080 +[epoch 39/50] step=2100 train_loss=0.0125 tok_s=60115.2 opt_steps=2100 +[epoch 39/50] step=2120 train_loss=0.0125 tok_s=60117.1 opt_steps=2120 +[epoch 39/50] step=2140 train_loss=0.0125 tok_s=60116.5 opt_steps=2140 +[epoch 39/50] step=2160 train_loss=0.0125 tok_s=60113.9 opt_steps=2160 +[epoch 39/50] step=2180 train_loss=0.0125 tok_s=60115.7 opt_steps=2180 +[epoch 39/50] step=2200 train_loss=0.0125 tok_s=60114.4 opt_steps=2200 +[epoch 39/50] step=2220 train_loss=0.0125 tok_s=60115.1 opt_steps=2220 +[epoch 39/50] step=2240 train_loss=0.0125 tok_s=60123.4 opt_steps=2240 +[epoch 39/50] step=2260 train_loss=0.0125 tok_s=60123.1 opt_steps=2260 +[epoch 39/50] step=2280 train_loss=0.0126 tok_s=60121.8 opt_steps=2280 +[epoch 39/50] step=2300 train_loss=0.0126 tok_s=60122.2 opt_steps=2300 +[epoch 39/50] step=2320 train_loss=0.0126 tok_s=60119.9 opt_steps=2320 +[epoch 39/50] step=2340 train_loss=0.0126 tok_s=60117.6 opt_steps=2340 +[epoch 39/50] step=2360 train_loss=0.0126 tok_s=60114.6 opt_steps=2360 +[epoch 39/50] step=2380 train_loss=0.0126 tok_s=60113.0 opt_steps=2380 +[epoch 39/50] step=2400 train_loss=0.0126 tok_s=60113.5 opt_steps=2400 +[epoch 39/50] step=2420 train_loss=0.0126 tok_s=60115.2 opt_steps=2420 +[epoch 39/50] step=2440 train_loss=0.0126 tok_s=60113.1 opt_steps=2440 +[epoch 39/50] step=2460 train_loss=0.0127 tok_s=60113.1 opt_steps=2460 +[epoch 39/50] step=2480 train_loss=0.0127 tok_s=60113.3 opt_steps=2480 +[epoch 39/50] step=2500 train_loss=0.0127 tok_s=60113.7 opt_steps=2500 +[epoch 39/50] step=2520 train_loss=0.0127 tok_s=60114.3 opt_steps=2520 +[epoch 39/50] step=2540 train_loss=0.0127 tok_s=60118.8 opt_steps=2540 +[epoch 39/50] step=2560 train_loss=0.0127 tok_s=60121.1 opt_steps=2560 +[epoch 39/50] step=2580 train_loss=0.0127 tok_s=60123.9 opt_steps=2580 +[epoch 39/50] step=2600 train_loss=0.0127 tok_s=60125.9 opt_steps=2600 +[epoch 39/50] step=2620 train_loss=0.0128 tok_s=60124.7 opt_steps=2620 +[epoch 39/50] step=2640 train_loss=0.0128 tok_s=60121.1 opt_steps=2640 +[epoch 39/50] step=2660 train_loss=0.0128 tok_s=60121.3 opt_steps=2660 +[epoch 39/50] step=2680 train_loss=0.0128 tok_s=60123.2 opt_steps=2680 +[epoch 39/50] step=2700 train_loss=0.0128 tok_s=60123.9 opt_steps=2700 +[epoch 39/50] step=2720 train_loss=0.0128 tok_s=60118.8 opt_steps=2720 +[epoch 39/50] step=2740 train_loss=0.0128 tok_s=60116.3 opt_steps=2740 +[epoch 39/50] step=2760 train_loss=0.0128 tok_s=60116.8 opt_steps=2760 +[epoch 39/50] step=2780 train_loss=0.0128 tok_s=60115.8 opt_steps=2780 +[epoch 39/50] step=2800 train_loss=0.0129 tok_s=60114.1 opt_steps=2800 +[epoch 39/50] step=2820 train_loss=0.0129 tok_s=60113.8 opt_steps=2820 +[epoch 39/50] step=2840 train_loss=0.0129 tok_s=60111.0 opt_steps=2840 +[epoch 39/50] step=2860 train_loss=0.0129 tok_s=60109.2 opt_steps=2860 +[epoch 39/50] step=2880 train_loss=0.0129 tok_s=60108.6 opt_steps=2880 +[epoch 39/50] step=2900 train_loss=0.0129 tok_s=60105.3 opt_steps=2900 +[epoch 39/50] step=2920 train_loss=0.0129 tok_s=60109.3 opt_steps=2920 +[epoch 39/50] step=2940 train_loss=0.0129 tok_s=60109.1 opt_steps=2940 +[epoch 39/50] step=2960 train_loss=0.0129 tok_s=60110.7 opt_steps=2960 +[epoch 39/50] step=2980 train_loss=0.0129 tok_s=60111.3 opt_steps=2980 +[epoch 39/50] step=3000 train_loss=0.0129 tok_s=60111.9 opt_steps=3000 +[epoch 39/50] step=3020 train_loss=0.0129 tok_s=60110.3 opt_steps=3020 +[epoch 39/50] step=3040 train_loss=0.0130 tok_s=60112.7 opt_steps=3040 +[epoch 39/50] step=3060 train_loss=0.0130 tok_s=60113.5 opt_steps=3060 +[epoch 39/50] step=3080 train_loss=0.0130 tok_s=60113.4 opt_steps=3080 +[epoch 39/50] step=3100 train_loss=0.0130 tok_s=60112.3 opt_steps=3100 +[epoch 39/50] step=3120 train_loss=0.0130 tok_s=60110.8 opt_steps=3120 +[epoch 39/50] step=3140 train_loss=0.0130 tok_s=60109.9 opt_steps=3140 +[epoch 39/50] step=3160 train_loss=0.0130 tok_s=60110.5 opt_steps=3160 +[epoch 39/50] step=3180 train_loss=0.0130 tok_s=60110.8 opt_steps=3180 +[epoch 39/50] step=3200 train_loss=0.0130 tok_s=60108.9 opt_steps=3200 +[epoch 39/50] step=3220 train_loss=0.0130 tok_s=60107.1 opt_steps=3220 +[epoch 39/50] step=3240 train_loss=0.0130 tok_s=60108.3 opt_steps=3240 +[epoch 39/50] step=3260 train_loss=0.0130 tok_s=60103.0 opt_steps=3260 +[epoch 39/50] train_loss=0.0130 val_skipped tok_s=60101.3 opt_steps=3273 +[epoch 40/50] step=20 train_loss=0.0112 tok_s=58753.5 opt_steps=20 +[epoch 40/50] step=40 train_loss=0.0110 tok_s=59604.2 opt_steps=40 +[epoch 40/50] step=60 train_loss=0.0108 tok_s=59792.0 opt_steps=60 +[epoch 40/50] step=80 train_loss=0.0107 tok_s=59874.0 opt_steps=80 +[epoch 40/50] step=100 train_loss=0.0106 tok_s=59989.5 opt_steps=100 +[epoch 40/50] step=120 train_loss=0.0106 tok_s=60007.8 opt_steps=120 +[epoch 40/50] step=140 train_loss=0.0106 tok_s=60067.9 opt_steps=140 +[epoch 40/50] step=160 train_loss=0.0107 tok_s=60082.0 opt_steps=160 +[epoch 40/50] step=180 train_loss=0.0107 tok_s=60116.1 opt_steps=180 +[epoch 40/50] step=200 train_loss=0.0107 tok_s=60067.0 opt_steps=200 +[epoch 40/50] step=220 train_loss=0.0108 tok_s=60060.4 opt_steps=220 +[epoch 40/50] step=240 train_loss=0.0108 tok_s=60095.7 opt_steps=240 +[epoch 40/50] step=260 train_loss=0.0108 tok_s=60080.9 opt_steps=260 +[epoch 40/50] step=280 train_loss=0.0108 tok_s=60071.6 opt_steps=280 +[epoch 40/50] step=300 train_loss=0.0109 tok_s=60065.5 opt_steps=300 +[epoch 40/50] step=320 train_loss=0.0109 tok_s=60044.5 opt_steps=320 +[epoch 40/50] step=340 train_loss=0.0109 tok_s=60064.1 opt_steps=340 +[epoch 40/50] step=360 train_loss=0.0109 tok_s=60064.5 opt_steps=360 +[epoch 40/50] step=380 train_loss=0.0109 tok_s=60047.7 opt_steps=380 +[epoch 40/50] step=400 train_loss=0.0109 tok_s=60069.4 opt_steps=400 +[epoch 40/50] step=420 train_loss=0.0109 tok_s=60090.5 opt_steps=420 +[epoch 40/50] step=440 train_loss=0.0109 tok_s=60063.7 opt_steps=440 +[epoch 40/50] step=460 train_loss=0.0109 tok_s=60076.6 opt_steps=460 +[epoch 40/50] step=480 train_loss=0.0110 tok_s=60106.7 opt_steps=480 +[epoch 40/50] step=500 train_loss=0.0109 tok_s=60119.1 opt_steps=500 +[epoch 40/50] step=520 train_loss=0.0110 tok_s=60122.6 opt_steps=520 +[epoch 40/50] step=540 train_loss=0.0110 tok_s=60145.2 opt_steps=540 +[epoch 40/50] step=560 train_loss=0.0110 tok_s=60154.4 opt_steps=560 +[epoch 40/50] step=580 train_loss=0.0111 tok_s=60151.2 opt_steps=580 +[epoch 40/50] step=600 train_loss=0.0111 tok_s=60146.7 opt_steps=600 +[epoch 40/50] step=620 train_loss=0.0111 tok_s=60152.1 opt_steps=620 +[epoch 40/50] step=640 train_loss=0.0111 tok_s=60153.6 opt_steps=640 +[epoch 40/50] step=660 train_loss=0.0111 tok_s=60154.7 opt_steps=660 +[epoch 40/50] step=680 train_loss=0.0111 tok_s=60141.7 opt_steps=680 +[epoch 40/50] step=700 train_loss=0.0112 tok_s=60096.1 opt_steps=700 +[epoch 40/50] step=720 train_loss=0.0112 tok_s=60093.3 opt_steps=720 +[epoch 40/50] step=740 train_loss=0.0112 tok_s=60077.2 opt_steps=740 +[epoch 40/50] step=760 train_loss=0.0112 tok_s=60054.6 opt_steps=760 +[epoch 40/50] step=780 train_loss=0.0112 tok_s=60045.0 opt_steps=780 +[epoch 40/50] step=800 train_loss=0.0112 tok_s=60037.0 opt_steps=800 +[epoch 40/50] step=820 train_loss=0.0113 tok_s=60014.9 opt_steps=820 +[epoch 40/50] step=840 train_loss=0.0113 tok_s=59995.7 opt_steps=840 +[epoch 40/50] step=860 train_loss=0.0113 tok_s=59988.4 opt_steps=860 +[epoch 40/50] step=880 train_loss=0.0113 tok_s=59973.5 opt_steps=880 +[epoch 40/50] step=900 train_loss=0.0114 tok_s=59968.2 opt_steps=900 +[epoch 40/50] step=920 train_loss=0.0114 tok_s=59960.5 opt_steps=920 +[epoch 40/50] step=940 train_loss=0.0114 tok_s=59955.0 opt_steps=940 +[epoch 40/50] step=960 train_loss=0.0114 tok_s=59948.9 opt_steps=960 +[epoch 40/50] step=980 train_loss=0.0114 tok_s=59945.4 opt_steps=980 +[epoch 40/50] step=1000 train_loss=0.0114 tok_s=59923.7 opt_steps=1000 +[epoch 40/50] step=1020 train_loss=0.0115 tok_s=59919.4 opt_steps=1020 +[epoch 40/50] step=1040 train_loss=0.0115 tok_s=59909.5 opt_steps=1040 +[epoch 40/50] step=1060 train_loss=0.0115 tok_s=59914.0 opt_steps=1060 +[epoch 40/50] step=1080 train_loss=0.0115 tok_s=59911.9 opt_steps=1080 +[epoch 40/50] step=1100 train_loss=0.0115 tok_s=59909.5 opt_steps=1100 +[epoch 40/50] step=1120 train_loss=0.0115 tok_s=59900.2 opt_steps=1120 +[epoch 40/50] step=1140 train_loss=0.0115 tok_s=59903.1 opt_steps=1140 +[epoch 40/50] step=1160 train_loss=0.0115 tok_s=59902.2 opt_steps=1160 +[epoch 40/50] step=1180 train_loss=0.0116 tok_s=59904.2 opt_steps=1180 +[epoch 40/50] step=1200 train_loss=0.0116 tok_s=59902.8 opt_steps=1200 +[epoch 40/50] step=1220 train_loss=0.0116 tok_s=59914.1 opt_steps=1220 +[epoch 40/50] step=1240 train_loss=0.0116 tok_s=59907.6 opt_steps=1240 +[epoch 40/50] step=1260 train_loss=0.0116 tok_s=59904.6 opt_steps=1260 +[epoch 40/50] step=1280 train_loss=0.0116 tok_s=59900.5 opt_steps=1280 +[epoch 40/50] step=1300 train_loss=0.0117 tok_s=59903.3 opt_steps=1300 +[epoch 40/50] step=1320 train_loss=0.0117 tok_s=59912.8 opt_steps=1320 +[epoch 40/50] step=1340 train_loss=0.0117 tok_s=59909.9 opt_steps=1340 +[epoch 40/50] step=1360 train_loss=0.0117 tok_s=59907.1 opt_steps=1360 +[epoch 40/50] step=1380 train_loss=0.0117 tok_s=59908.1 opt_steps=1380 +[epoch 40/50] step=1400 train_loss=0.0117 tok_s=59912.7 opt_steps=1400 +[epoch 40/50] step=1420 train_loss=0.0118 tok_s=59922.7 opt_steps=1420 +[epoch 40/50] step=1440 train_loss=0.0118 tok_s=59926.2 opt_steps=1440 +[epoch 40/50] step=1460 train_loss=0.0118 tok_s=59928.7 opt_steps=1460 +[epoch 40/50] step=1480 train_loss=0.0118 tok_s=59935.6 opt_steps=1480 +[epoch 40/50] step=1500 train_loss=0.0118 tok_s=59942.7 opt_steps=1500 +[epoch 40/50] step=1520 train_loss=0.0118 tok_s=59945.3 opt_steps=1520 +[epoch 40/50] step=1540 train_loss=0.0119 tok_s=59945.4 opt_steps=1540 +[epoch 40/50] step=1560 train_loss=0.0119 tok_s=59941.0 opt_steps=1560 +[epoch 40/50] step=1580 train_loss=0.0119 tok_s=59948.8 opt_steps=1580 +[epoch 40/50] step=1600 train_loss=0.0119 tok_s=59956.3 opt_steps=1600 +[epoch 40/50] step=1620 train_loss=0.0119 tok_s=59955.4 opt_steps=1620 +[epoch 40/50] step=1640 train_loss=0.0119 tok_s=59961.9 opt_steps=1640 +[epoch 40/50] step=1660 train_loss=0.0120 tok_s=59963.7 opt_steps=1660 +[epoch 40/50] step=1680 train_loss=0.0120 tok_s=59965.8 opt_steps=1680 +[epoch 40/50] step=1700 train_loss=0.0120 tok_s=59963.0 opt_steps=1700 +[epoch 40/50] step=1720 train_loss=0.0120 tok_s=59962.9 opt_steps=1720 +[epoch 40/50] step=1740 train_loss=0.0120 tok_s=59965.7 opt_steps=1740 +[epoch 40/50] step=1760 train_loss=0.0120 tok_s=59967.3 opt_steps=1760 +[epoch 40/50] step=1780 train_loss=0.0120 tok_s=59962.0 opt_steps=1780 +[epoch 40/50] step=1800 train_loss=0.0120 tok_s=59962.7 opt_steps=1800 +[epoch 40/50] step=1820 train_loss=0.0121 tok_s=59964.1 opt_steps=1820 +[epoch 40/50] step=1840 train_loss=0.0121 tok_s=59966.0 opt_steps=1840 +[epoch 40/50] step=1860 train_loss=0.0121 tok_s=59968.9 opt_steps=1860 +[epoch 40/50] step=1880 train_loss=0.0121 tok_s=59974.7 opt_steps=1880 +[epoch 40/50] step=1900 train_loss=0.0121 tok_s=59976.7 opt_steps=1900 +[epoch 40/50] step=1920 train_loss=0.0121 tok_s=59977.2 opt_steps=1920 +[epoch 40/50] step=1940 train_loss=0.0121 tok_s=59979.5 opt_steps=1940 +[epoch 40/50] step=1960 train_loss=0.0121 tok_s=59983.7 opt_steps=1960 +[epoch 40/50] step=1980 train_loss=0.0122 tok_s=59987.0 opt_steps=1980 +[epoch 40/50] step=2000 train_loss=0.0122 tok_s=59988.0 opt_steps=2000 +[epoch 40/50] step=2020 train_loss=0.0122 tok_s=59986.7 opt_steps=2020 +[epoch 40/50] step=2040 train_loss=0.0122 tok_s=59985.7 opt_steps=2040 +[epoch 40/50] step=2060 train_loss=0.0122 tok_s=59982.6 opt_steps=2060 +[epoch 40/50] step=2080 train_loss=0.0122 tok_s=59980.6 opt_steps=2080 +[epoch 40/50] step=2100 train_loss=0.0122 tok_s=59982.6 opt_steps=2100 +[epoch 40/50] step=2120 train_loss=0.0122 tok_s=59984.8 opt_steps=2120 +[epoch 40/50] step=2140 train_loss=0.0122 tok_s=59986.5 opt_steps=2140 +[epoch 40/50] step=2160 train_loss=0.0122 tok_s=59989.3 opt_steps=2160 +[epoch 40/50] step=2180 train_loss=0.0123 tok_s=59988.3 opt_steps=2180 +[epoch 40/50] step=2200 train_loss=0.0123 tok_s=59988.8 opt_steps=2200 +[epoch 40/50] step=2220 train_loss=0.0123 tok_s=59990.9 opt_steps=2220 +[epoch 40/50] step=2240 train_loss=0.0123 tok_s=59995.6 opt_steps=2240 +[epoch 40/50] step=2260 train_loss=0.0123 tok_s=60003.5 opt_steps=2260 +[epoch 40/50] step=2280 train_loss=0.0123 tok_s=60002.2 opt_steps=2280 +[epoch 40/50] step=2300 train_loss=0.0123 tok_s=60001.5 opt_steps=2300 +[epoch 40/50] step=2320 train_loss=0.0123 tok_s=60006.9 opt_steps=2320 +[epoch 40/50] step=2340 train_loss=0.0123 tok_s=60006.5 opt_steps=2340 +[epoch 40/50] step=2360 train_loss=0.0123 tok_s=60010.5 opt_steps=2360 +[epoch 40/50] step=2380 train_loss=0.0124 tok_s=60008.3 opt_steps=2380 +[epoch 40/50] step=2400 train_loss=0.0124 tok_s=60011.8 opt_steps=2400 +[epoch 40/50] step=2420 train_loss=0.0124 tok_s=60015.0 opt_steps=2420 +[epoch 40/50] step=2440 train_loss=0.0124 tok_s=60013.6 opt_steps=2440 +[epoch 40/50] step=2460 train_loss=0.0124 tok_s=60012.6 opt_steps=2460 +[epoch 40/50] step=2480 train_loss=0.0124 tok_s=60012.7 opt_steps=2480 +[epoch 40/50] step=2500 train_loss=0.0124 tok_s=60014.2 opt_steps=2500 +[epoch 40/50] step=2520 train_loss=0.0124 tok_s=60017.3 opt_steps=2520 +[epoch 40/50] step=2540 train_loss=0.0124 tok_s=60017.1 opt_steps=2540 +[epoch 40/50] step=2560 train_loss=0.0124 tok_s=60020.4 opt_steps=2560 +[epoch 40/50] step=2580 train_loss=0.0125 tok_s=60020.6 opt_steps=2580 +[epoch 40/50] step=2600 train_loss=0.0125 tok_s=60020.7 opt_steps=2600 +[epoch 40/50] step=2620 train_loss=0.0125 tok_s=60010.2 opt_steps=2620 +[epoch 40/50] step=2640 train_loss=0.0125 tok_s=60004.3 opt_steps=2640 +[epoch 40/50] step=2660 train_loss=0.0125 tok_s=59994.6 opt_steps=2660 +[epoch 40/50] step=2680 train_loss=0.0125 tok_s=59981.9 opt_steps=2680 +[epoch 40/50] step=2700 train_loss=0.0125 tok_s=59973.5 opt_steps=2700 +[epoch 40/50] step=2720 train_loss=0.0125 tok_s=59966.0 opt_steps=2720 +[epoch 40/50] step=2740 train_loss=0.0125 tok_s=59966.7 opt_steps=2740 +[epoch 40/50] step=2760 train_loss=0.0125 tok_s=59967.1 opt_steps=2760 +[epoch 40/50] step=2780 train_loss=0.0125 tok_s=59965.2 opt_steps=2780 +[epoch 40/50] step=2800 train_loss=0.0126 tok_s=59965.6 opt_steps=2800 +[epoch 40/50] step=2820 train_loss=0.0126 tok_s=59967.2 opt_steps=2820 +[epoch 40/50] step=2840 train_loss=0.0126 tok_s=59969.6 opt_steps=2840 +[epoch 40/50] step=2860 train_loss=0.0126 tok_s=59970.8 opt_steps=2860 +[epoch 40/50] step=2880 train_loss=0.0126 tok_s=59970.7 opt_steps=2880 +[epoch 40/50] step=2900 train_loss=0.0126 tok_s=59973.4 opt_steps=2900 +[epoch 40/50] step=2920 train_loss=0.0126 tok_s=59971.8 opt_steps=2920 +[epoch 40/50] step=2940 train_loss=0.0126 tok_s=59972.2 opt_steps=2940 +[epoch 40/50] step=2960 train_loss=0.0126 tok_s=59974.5 opt_steps=2960 +[epoch 40/50] step=2980 train_loss=0.0126 tok_s=59975.6 opt_steps=2980 +[epoch 40/50] step=3000 train_loss=0.0126 tok_s=59979.8 opt_steps=3000 +[epoch 40/50] step=3020 train_loss=0.0126 tok_s=59980.9 opt_steps=3020 +[epoch 40/50] step=3040 train_loss=0.0127 tok_s=59983.7 opt_steps=3040 +[epoch 40/50] step=3060 train_loss=0.0127 tok_s=59983.1 opt_steps=3060 +[epoch 40/50] step=3080 train_loss=0.0127 tok_s=59985.8 opt_steps=3080 +[epoch 40/50] step=3100 train_loss=0.0127 tok_s=59991.0 opt_steps=3100 +[epoch 40/50] step=3120 train_loss=0.0127 tok_s=59990.8 opt_steps=3120 +[epoch 40/50] step=3140 train_loss=0.0127 tok_s=59988.1 opt_steps=3140 +[epoch 40/50] step=3160 train_loss=0.0127 tok_s=59994.4 opt_steps=3160 +[epoch 40/50] step=3180 train_loss=0.0127 tok_s=59996.7 opt_steps=3180 +[epoch 40/50] step=3200 train_loss=0.0127 tok_s=59996.8 opt_steps=3200 +[epoch 40/50] step=3220 train_loss=0.0127 tok_s=59994.5 opt_steps=3220 +[epoch 40/50] step=3240 train_loss=0.0127 tok_s=59994.3 opt_steps=3240 +[epoch 40/50] step=3260 train_loss=0.0128 tok_s=59989.5 opt_steps=3260 +[epoch 40/50] train_loss=0.0128 val_skipped tok_s=59989.6 opt_steps=3273 +[epoch 41/50] step=20 train_loss=0.0113 tok_s=57633.3 opt_steps=20 +[epoch 41/50] step=40 train_loss=0.0108 tok_s=58984.0 opt_steps=40 +[epoch 41/50] step=60 train_loss=0.0106 tok_s=59397.0 opt_steps=60 +[epoch 41/50] step=80 train_loss=0.0105 tok_s=59500.1 opt_steps=80 +[epoch 41/50] step=100 train_loss=0.0105 tok_s=59526.4 opt_steps=100 +[epoch 41/50] step=120 train_loss=0.0104 tok_s=59658.5 opt_steps=120 +[epoch 41/50] step=140 train_loss=0.0104 tok_s=59685.9 opt_steps=140 +[epoch 41/50] step=160 train_loss=0.0104 tok_s=59752.0 opt_steps=160 +[epoch 41/50] step=180 train_loss=0.0104 tok_s=59749.5 opt_steps=180 +[epoch 41/50] step=200 train_loss=0.0104 tok_s=59791.9 opt_steps=200 +[epoch 41/50] step=220 train_loss=0.0104 tok_s=59804.9 opt_steps=220 +[epoch 41/50] step=240 train_loss=0.0104 tok_s=59834.8 opt_steps=240 +[epoch 41/50] step=260 train_loss=0.0104 tok_s=59834.6 opt_steps=260 +[epoch 41/50] step=280 train_loss=0.0104 tok_s=59804.4 opt_steps=280 +[epoch 41/50] step=300 train_loss=0.0104 tok_s=59875.9 opt_steps=300 +[epoch 41/50] step=320 train_loss=0.0104 tok_s=59877.0 opt_steps=320 +[epoch 41/50] step=340 train_loss=0.0104 tok_s=59889.9 opt_steps=340 +[epoch 41/50] step=360 train_loss=0.0104 tok_s=59924.4 opt_steps=360 +[epoch 41/50] step=380 train_loss=0.0104 tok_s=59927.3 opt_steps=380 +[epoch 41/50] step=400 train_loss=0.0104 tok_s=59923.1 opt_steps=400 +[epoch 41/50] step=420 train_loss=0.0104 tok_s=59919.8 opt_steps=420 +[epoch 41/50] step=440 train_loss=0.0105 tok_s=59929.1 opt_steps=440 +[epoch 41/50] step=460 train_loss=0.0105 tok_s=59940.0 opt_steps=460 +[epoch 41/50] step=480 train_loss=0.0105 tok_s=59953.1 opt_steps=480 +[epoch 41/50] step=500 train_loss=0.0106 tok_s=59951.4 opt_steps=500 +[epoch 41/50] step=520 train_loss=0.0106 tok_s=59943.5 opt_steps=520 +[epoch 41/50] step=540 train_loss=0.0106 tok_s=59960.5 opt_steps=540 +[epoch 41/50] step=560 train_loss=0.0107 tok_s=59951.8 opt_steps=560 +[epoch 41/50] step=580 train_loss=0.0107 tok_s=59947.8 opt_steps=580 +[epoch 41/50] step=600 train_loss=0.0107 tok_s=59942.3 opt_steps=600 +[epoch 41/50] step=620 train_loss=0.0107 tok_s=59933.5 opt_steps=620 +[epoch 41/50] step=640 train_loss=0.0108 tok_s=59939.9 opt_steps=640 +[epoch 41/50] step=660 train_loss=0.0108 tok_s=59946.1 opt_steps=660 +[epoch 41/50] step=680 train_loss=0.0108 tok_s=59952.0 opt_steps=680 +[epoch 41/50] step=700 train_loss=0.0109 tok_s=59954.7 opt_steps=700 +[epoch 41/50] step=720 train_loss=0.0109 tok_s=59962.8 opt_steps=720 +[epoch 41/50] step=740 train_loss=0.0109 tok_s=59954.3 opt_steps=740 +[epoch 41/50] step=760 train_loss=0.0109 tok_s=59955.4 opt_steps=760 +[epoch 41/50] step=780 train_loss=0.0109 tok_s=59969.8 opt_steps=780 +[epoch 41/50] step=800 train_loss=0.0110 tok_s=59972.0 opt_steps=800 +[epoch 41/50] step=820 train_loss=0.0110 tok_s=59969.4 opt_steps=820 +[epoch 41/50] step=840 train_loss=0.0110 tok_s=59976.4 opt_steps=840 +[epoch 41/50] step=860 train_loss=0.0110 tok_s=59981.9 opt_steps=860 +[epoch 41/50] step=880 train_loss=0.0111 tok_s=59991.6 opt_steps=880 +[epoch 41/50] step=900 train_loss=0.0111 tok_s=59997.6 opt_steps=900 +[epoch 41/50] step=920 train_loss=0.0111 tok_s=59995.3 opt_steps=920 +[epoch 41/50] step=940 train_loss=0.0111 tok_s=60000.9 opt_steps=940 +[epoch 41/50] step=960 train_loss=0.0111 tok_s=60015.4 opt_steps=960 +[epoch 41/50] step=980 train_loss=0.0111 tok_s=60010.8 opt_steps=980 +[epoch 41/50] step=1000 train_loss=0.0112 tok_s=60012.6 opt_steps=1000 +[epoch 41/50] step=1020 train_loss=0.0112 tok_s=60009.3 opt_steps=1020 +[epoch 41/50] step=1040 train_loss=0.0112 tok_s=60005.5 opt_steps=1040 +[epoch 41/50] step=1060 train_loss=0.0112 tok_s=60008.3 opt_steps=1060 +[epoch 41/50] step=1080 train_loss=0.0112 tok_s=60016.3 opt_steps=1080 +[epoch 41/50] step=1100 train_loss=0.0112 tok_s=60015.9 opt_steps=1100 +[epoch 41/50] step=1120 train_loss=0.0113 tok_s=60020.1 opt_steps=1120 +[epoch 41/50] step=1140 train_loss=0.0113 tok_s=60023.4 opt_steps=1140 +[epoch 41/50] step=1160 train_loss=0.0113 tok_s=60021.8 opt_steps=1160 +[epoch 41/50] step=1180 train_loss=0.0113 tok_s=60026.4 opt_steps=1180 +[epoch 41/50] step=1200 train_loss=0.0113 tok_s=60023.2 opt_steps=1200 +[epoch 41/50] step=1220 train_loss=0.0113 tok_s=60021.0 opt_steps=1220 +[epoch 41/50] step=1240 train_loss=0.0113 tok_s=60018.3 opt_steps=1240 +[epoch 41/50] step=1260 train_loss=0.0114 tok_s=60020.7 opt_steps=1260 +[epoch 41/50] step=1280 train_loss=0.0114 tok_s=60012.8 opt_steps=1280 +[epoch 41/50] step=1300 train_loss=0.0114 tok_s=60013.4 opt_steps=1300 +[epoch 41/50] step=1320 train_loss=0.0114 tok_s=60017.3 opt_steps=1320 +[epoch 41/50] step=1340 train_loss=0.0114 tok_s=60023.1 opt_steps=1340 +[epoch 41/50] step=1360 train_loss=0.0114 tok_s=60028.2 opt_steps=1360 +[epoch 41/50] step=1380 train_loss=0.0114 tok_s=60033.3 opt_steps=1380 +[epoch 41/50] step=1400 train_loss=0.0115 tok_s=60033.4 opt_steps=1400 +[epoch 41/50] step=1420 train_loss=0.0115 tok_s=60032.1 opt_steps=1420 +[epoch 41/50] step=1440 train_loss=0.0115 tok_s=60033.3 opt_steps=1440 +[epoch 41/50] step=1460 train_loss=0.0115 tok_s=60037.4 opt_steps=1460 +[epoch 41/50] step=1480 train_loss=0.0115 tok_s=60044.1 opt_steps=1480 +[epoch 41/50] step=1500 train_loss=0.0115 tok_s=60049.5 opt_steps=1500 +[epoch 41/50] step=1520 train_loss=0.0115 tok_s=60049.9 opt_steps=1520 +[epoch 41/50] step=1540 train_loss=0.0116 tok_s=60054.3 opt_steps=1540 +[epoch 41/50] step=1560 train_loss=0.0116 tok_s=60054.2 opt_steps=1560 +[epoch 41/50] step=1580 train_loss=0.0116 tok_s=60055.4 opt_steps=1580 +[epoch 41/50] step=1600 train_loss=0.0116 tok_s=60049.2 opt_steps=1600 +[epoch 41/50] step=1620 train_loss=0.0116 tok_s=60050.4 opt_steps=1620 +[epoch 41/50] step=1640 train_loss=0.0116 tok_s=60051.0 opt_steps=1640 +[epoch 41/50] step=1660 train_loss=0.0116 tok_s=60043.7 opt_steps=1660 +[epoch 41/50] step=1680 train_loss=0.0117 tok_s=60042.0 opt_steps=1680 +[epoch 41/50] step=1700 train_loss=0.0117 tok_s=60043.3 opt_steps=1700 +[epoch 41/50] step=1720 train_loss=0.0117 tok_s=60047.4 opt_steps=1720 +[epoch 41/50] step=1740 train_loss=0.0117 tok_s=60052.0 opt_steps=1740 +[epoch 41/50] step=1760 train_loss=0.0117 tok_s=60054.2 opt_steps=1760 +[epoch 41/50] step=1780 train_loss=0.0117 tok_s=60063.5 opt_steps=1780 +[epoch 41/50] step=1800 train_loss=0.0117 tok_s=60062.0 opt_steps=1800 +[epoch 41/50] step=1820 train_loss=0.0117 tok_s=60069.4 opt_steps=1820 +[epoch 41/50] step=1840 train_loss=0.0118 tok_s=60070.1 opt_steps=1840 +[epoch 41/50] step=1860 train_loss=0.0118 tok_s=60068.8 opt_steps=1860 +[epoch 41/50] step=1880 train_loss=0.0118 tok_s=60070.0 opt_steps=1880 +[epoch 41/50] step=1900 train_loss=0.0118 tok_s=60068.8 opt_steps=1900 +[epoch 41/50] step=1920 train_loss=0.0118 tok_s=60071.1 opt_steps=1920 +[epoch 41/50] step=1940 train_loss=0.0118 tok_s=60071.0 opt_steps=1940 +[epoch 41/50] step=1960 train_loss=0.0118 tok_s=60071.0 opt_steps=1960 +[epoch 41/50] step=1980 train_loss=0.0118 tok_s=60065.8 opt_steps=1980 +[epoch 41/50] step=2000 train_loss=0.0119 tok_s=60067.0 opt_steps=2000 +[epoch 41/50] step=2020 train_loss=0.0119 tok_s=60066.8 opt_steps=2020 +[epoch 41/50] step=2040 train_loss=0.0119 tok_s=60070.2 opt_steps=2040 +[epoch 41/50] step=2060 train_loss=0.0119 tok_s=60069.0 opt_steps=2060 +[epoch 41/50] step=2080 train_loss=0.0119 tok_s=60071.1 opt_steps=2080 +[epoch 41/50] step=2100 train_loss=0.0119 tok_s=60074.3 opt_steps=2100 +[epoch 41/50] step=2120 train_loss=0.0119 tok_s=60077.1 opt_steps=2120 +[epoch 41/50] step=2140 train_loss=0.0119 tok_s=60079.0 opt_steps=2140 +[epoch 41/50] step=2160 train_loss=0.0119 tok_s=60085.0 opt_steps=2160 +[epoch 41/50] step=2180 train_loss=0.0119 tok_s=60085.3 opt_steps=2180 +[epoch 41/50] step=2200 train_loss=0.0120 tok_s=60089.1 opt_steps=2200 +[epoch 41/50] step=2220 train_loss=0.0120 tok_s=60089.5 opt_steps=2220 +[epoch 41/50] step=2240 train_loss=0.0120 tok_s=60093.1 opt_steps=2240 +[epoch 41/50] step=2260 train_loss=0.0120 tok_s=60089.4 opt_steps=2260 +[epoch 41/50] step=2280 train_loss=0.0120 tok_s=60096.2 opt_steps=2280 +[epoch 41/50] step=2300 train_loss=0.0120 tok_s=60102.6 opt_steps=2300 +[epoch 41/50] step=2320 train_loss=0.0120 tok_s=60101.7 opt_steps=2320 +[epoch 41/50] step=2340 train_loss=0.0120 tok_s=60097.7 opt_steps=2340 +[epoch 41/50] step=2360 train_loss=0.0121 tok_s=60099.9 opt_steps=2360 +[epoch 41/50] step=2380 train_loss=0.0121 tok_s=60101.1 opt_steps=2380 +[epoch 41/50] step=2400 train_loss=0.0121 tok_s=60094.8 opt_steps=2400 +[epoch 41/50] step=2420 train_loss=0.0121 tok_s=60097.1 opt_steps=2420 +[epoch 41/50] step=2440 train_loss=0.0121 tok_s=60095.3 opt_steps=2440 +[epoch 41/50] step=2460 train_loss=0.0121 tok_s=60091.3 opt_steps=2460 +[epoch 41/50] step=2480 train_loss=0.0121 tok_s=60092.9 opt_steps=2480 +[epoch 41/50] step=2500 train_loss=0.0121 tok_s=60095.8 opt_steps=2500 +[epoch 41/50] step=2520 train_loss=0.0121 tok_s=60091.8 opt_steps=2520 +[epoch 41/50] step=2540 train_loss=0.0121 tok_s=60092.8 opt_steps=2540 +[epoch 41/50] step=2560 train_loss=0.0121 tok_s=60094.4 opt_steps=2560 +[epoch 41/50] step=2580 train_loss=0.0122 tok_s=60095.1 opt_steps=2580 +[epoch 41/50] step=2600 train_loss=0.0122 tok_s=60094.9 opt_steps=2600 +[epoch 41/50] step=2620 train_loss=0.0122 tok_s=60093.0 opt_steps=2620 +[epoch 41/50] step=2640 train_loss=0.0122 tok_s=60095.6 opt_steps=2640 +[epoch 41/50] step=2660 train_loss=0.0122 tok_s=60098.5 opt_steps=2660 +[epoch 41/50] step=2680 train_loss=0.0122 tok_s=60099.6 opt_steps=2680 +[epoch 41/50] step=2700 train_loss=0.0122 tok_s=60097.9 opt_steps=2700 +[epoch 41/50] step=2720 train_loss=0.0122 tok_s=60095.2 opt_steps=2720 +[epoch 41/50] step=2740 train_loss=0.0122 tok_s=60094.8 opt_steps=2740 +[epoch 41/50] step=2760 train_loss=0.0122 tok_s=60093.5 opt_steps=2760 +[epoch 41/50] step=2780 train_loss=0.0122 tok_s=60093.8 opt_steps=2780 +[epoch 41/50] step=2800 train_loss=0.0123 tok_s=60092.9 opt_steps=2800 +[epoch 41/50] step=2820 train_loss=0.0123 tok_s=60092.1 opt_steps=2820 +[epoch 41/50] step=2840 train_loss=0.0123 tok_s=60093.3 opt_steps=2840 +[epoch 41/50] step=2860 train_loss=0.0123 tok_s=60092.1 opt_steps=2860 +[epoch 41/50] step=2880 train_loss=0.0123 tok_s=60077.5 opt_steps=2880 +[epoch 41/50] step=2900 train_loss=0.0123 tok_s=60080.9 opt_steps=2900 +[epoch 41/50] step=2920 train_loss=0.0123 tok_s=60084.9 opt_steps=2920 +[epoch 41/50] step=2940 train_loss=0.0123 tok_s=60085.5 opt_steps=2940 +[epoch 41/50] step=2960 train_loss=0.0123 tok_s=60086.7 opt_steps=2960 +[epoch 41/50] step=2980 train_loss=0.0123 tok_s=60087.7 opt_steps=2980 +[epoch 41/50] step=3000 train_loss=0.0123 tok_s=60090.2 opt_steps=3000 +[epoch 41/50] step=3020 train_loss=0.0124 tok_s=60090.6 opt_steps=3020 +[epoch 41/50] step=3040 train_loss=0.0124 tok_s=60092.5 opt_steps=3040 +[epoch 41/50] step=3060 train_loss=0.0124 tok_s=60091.2 opt_steps=3060 +[epoch 41/50] step=3080 train_loss=0.0124 tok_s=60088.8 opt_steps=3080 +[epoch 41/50] step=3100 train_loss=0.0124 tok_s=60089.0 opt_steps=3100 +[epoch 41/50] step=3120 train_loss=0.0124 tok_s=60088.0 opt_steps=3120 +[epoch 41/50] step=3140 train_loss=0.0124 tok_s=60085.9 opt_steps=3140 +[epoch 41/50] step=3160 train_loss=0.0124 tok_s=60087.3 opt_steps=3160 +[epoch 41/50] step=3180 train_loss=0.0124 tok_s=60084.6 opt_steps=3180 +[epoch 41/50] step=3200 train_loss=0.0124 tok_s=60085.1 opt_steps=3200 +[epoch 41/50] step=3220 train_loss=0.0124 tok_s=60090.9 opt_steps=3220 +[epoch 41/50] step=3240 train_loss=0.0124 tok_s=60091.4 opt_steps=3240 +[epoch 41/50] step=3260 train_loss=0.0124 tok_s=60086.0 opt_steps=3260 +[epoch 41/50] train_loss=0.0124 val_skipped tok_s=60087.7 opt_steps=3273 +[epoch 42/50] step=20 train_loss=0.0102 tok_s=58240.4 opt_steps=20 +[epoch 42/50] step=40 train_loss=0.0101 tok_s=59177.0 opt_steps=40 +[epoch 42/50] step=60 train_loss=0.0100 tok_s=59493.1 opt_steps=60 +[epoch 42/50] step=80 train_loss=0.0100 tok_s=59680.0 opt_steps=80 +[epoch 42/50] step=100 train_loss=0.0101 tok_s=59721.3 opt_steps=100 +[epoch 42/50] step=120 train_loss=0.0100 tok_s=59800.4 opt_steps=120 +[epoch 42/50] step=140 train_loss=0.0100 tok_s=59736.8 opt_steps=140 +[epoch 42/50] step=160 train_loss=0.0101 tok_s=59803.1 opt_steps=160 +[epoch 42/50] step=180 train_loss=0.0101 tok_s=59819.3 opt_steps=180 +[epoch 42/50] step=200 train_loss=0.0102 tok_s=59872.9 opt_steps=200 +[epoch 42/50] step=220 train_loss=0.0102 tok_s=59897.4 opt_steps=220 +[epoch 42/50] step=240 train_loss=0.0102 tok_s=59940.5 opt_steps=240 +[epoch 42/50] step=260 train_loss=0.0102 tok_s=59964.4 opt_steps=260 +[epoch 42/50] step=280 train_loss=0.0102 tok_s=60002.0 opt_steps=280 +[epoch 42/50] step=300 train_loss=0.0102 tok_s=59994.5 opt_steps=300 +[epoch 42/50] step=320 train_loss=0.0103 tok_s=60002.9 opt_steps=320 +[epoch 42/50] step=340 train_loss=0.0103 tok_s=60001.4 opt_steps=340 +[epoch 42/50] step=360 train_loss=0.0103 tok_s=60009.8 opt_steps=360 +[epoch 42/50] step=380 train_loss=0.0103 tok_s=60045.7 opt_steps=380 +[epoch 42/50] step=400 train_loss=0.0103 tok_s=60052.5 opt_steps=400 +[epoch 42/50] step=420 train_loss=0.0103 tok_s=60024.7 opt_steps=420 +[epoch 42/50] step=440 train_loss=0.0104 tok_s=60022.6 opt_steps=440 +[epoch 42/50] step=460 train_loss=0.0104 tok_s=60024.1 opt_steps=460 +[epoch 42/50] step=480 train_loss=0.0104 tok_s=60042.4 opt_steps=480 +[epoch 42/50] step=500 train_loss=0.0104 tok_s=60047.8 opt_steps=500 +[epoch 42/50] step=520 train_loss=0.0104 tok_s=60028.2 opt_steps=520 +[epoch 42/50] step=540 train_loss=0.0105 tok_s=60038.7 opt_steps=540 +[epoch 42/50] step=560 train_loss=0.0105 tok_s=60051.0 opt_steps=560 +[epoch 42/50] step=580 train_loss=0.0105 tok_s=60045.8 opt_steps=580 +[epoch 42/50] step=600 train_loss=0.0105 tok_s=60043.2 opt_steps=600 +[epoch 42/50] step=620 train_loss=0.0106 tok_s=60044.1 opt_steps=620 +[epoch 42/50] step=640 train_loss=0.0106 tok_s=60052.0 opt_steps=640 +[epoch 42/50] step=660 train_loss=0.0106 tok_s=60040.6 opt_steps=660 +[epoch 42/50] step=680 train_loss=0.0106 tok_s=60026.5 opt_steps=680 +[epoch 42/50] step=700 train_loss=0.0107 tok_s=60033.2 opt_steps=700 +[epoch 42/50] step=720 train_loss=0.0107 tok_s=60028.8 opt_steps=720 +[epoch 42/50] step=740 train_loss=0.0107 tok_s=60031.7 opt_steps=740 +[epoch 42/50] step=760 train_loss=0.0107 tok_s=60040.8 opt_steps=760 +[epoch 42/50] step=780 train_loss=0.0107 tok_s=60042.6 opt_steps=780 +[epoch 42/50] step=800 train_loss=0.0107 tok_s=60046.5 opt_steps=800 +[epoch 42/50] step=820 train_loss=0.0107 tok_s=60046.0 opt_steps=820 +[epoch 42/50] step=840 train_loss=0.0108 tok_s=60044.7 opt_steps=840 +[epoch 42/50] step=860 train_loss=0.0108 tok_s=60047.9 opt_steps=860 +[epoch 42/50] step=880 train_loss=0.0108 tok_s=60052.6 opt_steps=880 +[epoch 42/50] step=900 train_loss=0.0108 tok_s=60054.6 opt_steps=900 +[epoch 42/50] step=920 train_loss=0.0109 tok_s=60057.9 opt_steps=920 +[epoch 42/50] step=940 train_loss=0.0109 tok_s=60062.0 opt_steps=940 +[epoch 42/50] step=960 train_loss=0.0109 tok_s=60056.2 opt_steps=960 +[epoch 42/50] step=980 train_loss=0.0109 tok_s=60058.0 opt_steps=980 +[epoch 42/50] step=1000 train_loss=0.0109 tok_s=60058.4 opt_steps=1000 +[epoch 42/50] step=1020 train_loss=0.0109 tok_s=60060.5 opt_steps=1020 +[epoch 42/50] step=1040 train_loss=0.0110 tok_s=60058.4 opt_steps=1040 +[epoch 42/50] step=1060 train_loss=0.0110 tok_s=60061.3 opt_steps=1060 +[epoch 42/50] step=1080 train_loss=0.0110 tok_s=60065.4 opt_steps=1080 +[epoch 42/50] step=1100 train_loss=0.0110 tok_s=60059.2 opt_steps=1100 +[epoch 42/50] step=1120 train_loss=0.0110 tok_s=60055.3 opt_steps=1120 +[epoch 42/50] step=1140 train_loss=0.0111 tok_s=60057.6 opt_steps=1140 +[epoch 42/50] step=1160 train_loss=0.0111 tok_s=60056.8 opt_steps=1160 +[epoch 42/50] step=1180 train_loss=0.0111 tok_s=60058.0 opt_steps=1180 +[epoch 42/50] step=1200 train_loss=0.0111 tok_s=60063.9 opt_steps=1200 +[epoch 42/50] step=1220 train_loss=0.0111 tok_s=60067.3 opt_steps=1220 +[epoch 42/50] step=1240 train_loss=0.0111 tok_s=60065.3 opt_steps=1240 +[epoch 42/50] step=1260 train_loss=0.0112 tok_s=60069.6 opt_steps=1260 +[epoch 42/50] step=1280 train_loss=0.0112 tok_s=60073.4 opt_steps=1280 +[epoch 42/50] step=1300 train_loss=0.0112 tok_s=60075.9 opt_steps=1300 +[epoch 42/50] step=1320 train_loss=0.0112 tok_s=60076.3 opt_steps=1320 +[epoch 42/50] step=1340 train_loss=0.0112 tok_s=60078.4 opt_steps=1340 +[epoch 42/50] step=1360 train_loss=0.0112 tok_s=60081.3 opt_steps=1360 +[epoch 42/50] step=1380 train_loss=0.0113 tok_s=60086.4 opt_steps=1380 +[epoch 42/50] step=1400 train_loss=0.0113 tok_s=60089.2 opt_steps=1400 +[epoch 42/50] step=1420 train_loss=0.0113 tok_s=60089.5 opt_steps=1420 +[epoch 42/50] step=1440 train_loss=0.0113 tok_s=60084.5 opt_steps=1440 +[epoch 42/50] step=1460 train_loss=0.0113 tok_s=60080.8 opt_steps=1460 +[epoch 42/50] step=1480 train_loss=0.0113 tok_s=60081.8 opt_steps=1480 +[epoch 42/50] step=1500 train_loss=0.0113 tok_s=60083.8 opt_steps=1500 +[epoch 42/50] step=1520 train_loss=0.0113 tok_s=60092.6 opt_steps=1520 +[epoch 42/50] step=1540 train_loss=0.0113 tok_s=60093.0 opt_steps=1540 +[epoch 42/50] step=1560 train_loss=0.0113 tok_s=60099.7 opt_steps=1560 +[epoch 42/50] step=1580 train_loss=0.0113 tok_s=60100.9 opt_steps=1580 +[epoch 42/50] step=1600 train_loss=0.0114 tok_s=60094.2 opt_steps=1600 +[epoch 42/50] step=1620 train_loss=0.0114 tok_s=60094.0 opt_steps=1620 +[epoch 42/50] step=1640 train_loss=0.0114 tok_s=60093.7 opt_steps=1640 +[epoch 42/50] step=1660 train_loss=0.0114 tok_s=60097.3 opt_steps=1660 +[epoch 42/50] step=1680 train_loss=0.0114 tok_s=60097.7 opt_steps=1680 +[epoch 42/50] step=1700 train_loss=0.0114 tok_s=60096.0 opt_steps=1700 +[epoch 42/50] step=1720 train_loss=0.0114 tok_s=60095.7 opt_steps=1720 +[epoch 42/50] step=1740 train_loss=0.0114 tok_s=60091.4 opt_steps=1740 +[epoch 42/50] step=1760 train_loss=0.0115 tok_s=60091.6 opt_steps=1760 +[epoch 42/50] step=1780 train_loss=0.0115 tok_s=60091.1 opt_steps=1780 +[epoch 42/50] step=1800 train_loss=0.0115 tok_s=60090.8 opt_steps=1800 +[epoch 42/50] step=1820 train_loss=0.0115 tok_s=60084.7 opt_steps=1820 +[epoch 42/50] step=1840 train_loss=0.0115 tok_s=60089.4 opt_steps=1840 +[epoch 42/50] step=1860 train_loss=0.0115 tok_s=60089.9 opt_steps=1860 +[epoch 42/50] step=1880 train_loss=0.0115 tok_s=60088.2 opt_steps=1880 +[epoch 42/50] step=1900 train_loss=0.0115 tok_s=60090.8 opt_steps=1900 +[epoch 42/50] step=1920 train_loss=0.0115 tok_s=60090.5 opt_steps=1920 +[epoch 42/50] step=1940 train_loss=0.0115 tok_s=60089.9 opt_steps=1940 +[epoch 42/50] step=1960 train_loss=0.0116 tok_s=60093.4 opt_steps=1960 +[epoch 42/50] step=1980 train_loss=0.0116 tok_s=60098.0 opt_steps=1980 +[epoch 42/50] step=2000 train_loss=0.0116 tok_s=60100.4 opt_steps=2000 +[epoch 42/50] step=2020 train_loss=0.0116 tok_s=60097.0 opt_steps=2020 +[epoch 42/50] step=2040 train_loss=0.0116 tok_s=60100.4 opt_steps=2040 +[epoch 42/50] step=2060 train_loss=0.0116 tok_s=60098.2 opt_steps=2060 +[epoch 42/50] step=2080 train_loss=0.0116 tok_s=60098.2 opt_steps=2080 +[epoch 42/50] step=2100 train_loss=0.0116 tok_s=60100.8 opt_steps=2100 +[epoch 42/50] step=2120 train_loss=0.0116 tok_s=60097.9 opt_steps=2120 +[epoch 42/50] step=2140 train_loss=0.0117 tok_s=60094.7 opt_steps=2140 +[epoch 42/50] step=2160 train_loss=0.0117 tok_s=60095.7 opt_steps=2160 +[epoch 42/50] step=2180 train_loss=0.0117 tok_s=60096.4 opt_steps=2180 +[epoch 42/50] step=2200 train_loss=0.0117 tok_s=60096.7 opt_steps=2200 +[epoch 42/50] step=2220 train_loss=0.0117 tok_s=60099.1 opt_steps=2220 +[epoch 42/50] step=2240 train_loss=0.0117 tok_s=60095.3 opt_steps=2240 +[epoch 42/50] step=2260 train_loss=0.0117 tok_s=60097.7 opt_steps=2260 +[epoch 42/50] step=2280 train_loss=0.0118 tok_s=60098.0 opt_steps=2280 +[epoch 42/50] step=2300 train_loss=0.0118 tok_s=60094.6 opt_steps=2300 +[epoch 42/50] step=2320 train_loss=0.0118 tok_s=60097.6 opt_steps=2320 +[epoch 42/50] step=2340 train_loss=0.0118 tok_s=60099.0 opt_steps=2340 +[epoch 42/50] step=2360 train_loss=0.0118 tok_s=60096.0 opt_steps=2360 +[epoch 42/50] step=2380 train_loss=0.0118 tok_s=60093.1 opt_steps=2380 +[epoch 42/50] step=2400 train_loss=0.0118 tok_s=60094.6 opt_steps=2400 +[epoch 42/50] step=2420 train_loss=0.0118 tok_s=60095.0 opt_steps=2420 +[epoch 42/50] step=2440 train_loss=0.0118 tok_s=60097.4 opt_steps=2440 +[epoch 42/50] step=2460 train_loss=0.0118 tok_s=60096.0 opt_steps=2460 +[epoch 42/50] step=2480 train_loss=0.0118 tok_s=60095.0 opt_steps=2480 +[epoch 42/50] step=2500 train_loss=0.0118 tok_s=60096.9 opt_steps=2500 +[epoch 42/50] step=2520 train_loss=0.0119 tok_s=60099.7 opt_steps=2520 +[epoch 42/50] step=2540 train_loss=0.0119 tok_s=60093.6 opt_steps=2540 +[epoch 42/50] step=2560 train_loss=0.0119 tok_s=60094.1 opt_steps=2560 +[epoch 42/50] step=2580 train_loss=0.0119 tok_s=60093.8 opt_steps=2580 +[epoch 42/50] step=2600 train_loss=0.0119 tok_s=60094.0 opt_steps=2600 +[epoch 42/50] step=2620 train_loss=0.0119 tok_s=60095.3 opt_steps=2620 +[epoch 42/50] step=2640 train_loss=0.0119 tok_s=60094.6 opt_steps=2640 +[epoch 42/50] step=2660 train_loss=0.0119 tok_s=60094.8 opt_steps=2660 +[epoch 42/50] step=2680 train_loss=0.0119 tok_s=60096.3 opt_steps=2680 +[epoch 42/50] step=2700 train_loss=0.0119 tok_s=60096.4 opt_steps=2700 +[epoch 42/50] step=2720 train_loss=0.0119 tok_s=60099.4 opt_steps=2720 +[epoch 42/50] step=2740 train_loss=0.0119 tok_s=60098.7 opt_steps=2740 +[epoch 42/50] step=2760 train_loss=0.0119 tok_s=60098.9 opt_steps=2760 +[epoch 42/50] step=2780 train_loss=0.0120 tok_s=60100.3 opt_steps=2780 +[epoch 42/50] step=2800 train_loss=0.0120 tok_s=60103.3 opt_steps=2800 +[epoch 42/50] step=2820 train_loss=0.0120 tok_s=60103.3 opt_steps=2820 +[epoch 42/50] step=2840 train_loss=0.0120 tok_s=60103.5 opt_steps=2840 +[epoch 42/50] step=2860 train_loss=0.0120 tok_s=60104.2 opt_steps=2860 +[epoch 42/50] step=2880 train_loss=0.0120 tok_s=60101.1 opt_steps=2880 +[epoch 42/50] step=2900 train_loss=0.0120 tok_s=60105.4 opt_steps=2900 +[epoch 42/50] step=2920 train_loss=0.0120 tok_s=60105.1 opt_steps=2920 +[epoch 42/50] step=2940 train_loss=0.0120 tok_s=60102.3 opt_steps=2940 +[epoch 42/50] step=2960 train_loss=0.0120 tok_s=60103.6 opt_steps=2960 +[epoch 42/50] step=2980 train_loss=0.0120 tok_s=60102.1 opt_steps=2980 +[epoch 42/50] step=3000 train_loss=0.0120 tok_s=60102.7 opt_steps=3000 +[epoch 42/50] step=3020 train_loss=0.0120 tok_s=60103.1 opt_steps=3020 +[epoch 42/50] step=3040 train_loss=0.0121 tok_s=60108.2 opt_steps=3040 +[epoch 42/50] step=3060 train_loss=0.0121 tok_s=60106.6 opt_steps=3060 +[epoch 42/50] step=3080 train_loss=0.0121 tok_s=60101.4 opt_steps=3080 +[epoch 42/50] step=3100 train_loss=0.0121 tok_s=60100.3 opt_steps=3100 +[epoch 42/50] step=3120 train_loss=0.0121 tok_s=60102.3 opt_steps=3120 +[epoch 42/50] step=3140 train_loss=0.0121 tok_s=60103.5 opt_steps=3140 +[epoch 42/50] step=3160 train_loss=0.0121 tok_s=60105.4 opt_steps=3160 +[epoch 42/50] step=3180 train_loss=0.0121 tok_s=60105.5 opt_steps=3180 +[epoch 42/50] step=3200 train_loss=0.0121 tok_s=60104.8 opt_steps=3200 +[epoch 42/50] step=3220 train_loss=0.0121 tok_s=60104.6 opt_steps=3220 +[epoch 42/50] step=3240 train_loss=0.0121 tok_s=60107.6 opt_steps=3240 +[epoch 42/50] step=3260 train_loss=0.0121 tok_s=60103.7 opt_steps=3260 +[epoch 42/50] train_loss=0.0122 val_skipped tok_s=60105.7 opt_steps=3273 +[epoch 43/50] step=20 train_loss=0.0098 tok_s=58363.4 opt_steps=20 +[epoch 43/50] step=40 train_loss=0.0097 tok_s=58986.3 opt_steps=40 +[epoch 43/50] step=60 train_loss=0.0099 tok_s=59488.4 opt_steps=60 +[epoch 43/50] step=80 train_loss=0.0098 tok_s=59787.2 opt_steps=80 +[epoch 43/50] step=100 train_loss=0.0098 tok_s=59777.2 opt_steps=100 +[epoch 43/50] step=120 train_loss=0.0097 tok_s=59839.2 opt_steps=120 +[epoch 43/50] step=140 train_loss=0.0098 tok_s=59871.8 opt_steps=140 +[epoch 43/50] step=160 train_loss=0.0098 tok_s=59885.2 opt_steps=160 +[epoch 43/50] step=180 train_loss=0.0098 tok_s=59917.7 opt_steps=180 +[epoch 43/50] step=200 train_loss=0.0097 tok_s=59891.7 opt_steps=200 +[epoch 43/50] step=220 train_loss=0.0098 tok_s=59941.7 opt_steps=220 +[epoch 43/50] step=240 train_loss=0.0098 tok_s=59959.8 opt_steps=240 +[epoch 43/50] step=260 train_loss=0.0098 tok_s=59961.4 opt_steps=260 +[epoch 43/50] step=280 train_loss=0.0098 tok_s=59960.8 opt_steps=280 +[epoch 43/50] step=300 train_loss=0.0099 tok_s=59991.9 opt_steps=300 +[epoch 43/50] step=320 train_loss=0.0099 tok_s=60038.0 opt_steps=320 +[epoch 43/50] step=340 train_loss=0.0099 tok_s=60059.5 opt_steps=340 +[epoch 43/50] step=360 train_loss=0.0099 tok_s=60053.2 opt_steps=360 +[epoch 43/50] step=380 train_loss=0.0100 tok_s=60062.9 opt_steps=380 +[epoch 43/50] step=400 train_loss=0.0100 tok_s=60049.1 opt_steps=400 +[epoch 43/50] step=420 train_loss=0.0100 tok_s=60046.0 opt_steps=420 +[epoch 43/50] step=440 train_loss=0.0100 tok_s=60040.0 opt_steps=440 +[epoch 43/50] step=460 train_loss=0.0101 tok_s=60058.8 opt_steps=460 +[epoch 43/50] step=480 train_loss=0.0101 tok_s=60059.8 opt_steps=480 +[epoch 43/50] step=500 train_loss=0.0101 tok_s=60061.8 opt_steps=500 +[epoch 43/50] step=520 train_loss=0.0101 tok_s=60057.6 opt_steps=520 +[epoch 43/50] step=540 train_loss=0.0101 tok_s=60058.5 opt_steps=540 +[epoch 43/50] step=560 train_loss=0.0102 tok_s=60062.8 opt_steps=560 +[epoch 43/50] step=580 train_loss=0.0102 tok_s=60062.5 opt_steps=580 +[epoch 43/50] step=600 train_loss=0.0102 tok_s=60069.4 opt_steps=600 +[epoch 43/50] step=620 train_loss=0.0102 tok_s=60080.9 opt_steps=620 +[epoch 43/50] step=640 train_loss=0.0103 tok_s=60079.2 opt_steps=640 +[epoch 43/50] step=660 train_loss=0.0103 tok_s=60069.0 opt_steps=660 +[epoch 43/50] step=680 train_loss=0.0103 tok_s=60073.2 opt_steps=680 +[epoch 43/50] step=700 train_loss=0.0103 tok_s=60069.3 opt_steps=700 +[epoch 43/50] step=720 train_loss=0.0104 tok_s=60069.2 opt_steps=720 +[epoch 43/50] step=740 train_loss=0.0104 tok_s=60079.5 opt_steps=740 +[epoch 43/50] step=760 train_loss=0.0104 tok_s=60073.5 opt_steps=760 +[epoch 43/50] step=780 train_loss=0.0104 tok_s=60065.7 opt_steps=780 +[epoch 43/50] step=800 train_loss=0.0105 tok_s=60070.3 opt_steps=800 +[epoch 43/50] step=820 train_loss=0.0105 tok_s=60066.7 opt_steps=820 +[epoch 43/50] step=840 train_loss=0.0105 tok_s=60064.5 opt_steps=840 +[epoch 43/50] step=860 train_loss=0.0105 tok_s=60063.8 opt_steps=860 +[epoch 43/50] step=880 train_loss=0.0106 tok_s=60064.3 opt_steps=880 +[epoch 43/50] step=900 train_loss=0.0106 tok_s=60065.2 opt_steps=900 +[epoch 43/50] step=920 train_loss=0.0106 tok_s=60055.5 opt_steps=920 +[epoch 43/50] step=940 train_loss=0.0106 tok_s=60057.6 opt_steps=940 +[epoch 43/50] step=960 train_loss=0.0106 tok_s=60068.7 opt_steps=960 +[epoch 43/50] step=980 train_loss=0.0107 tok_s=60066.2 opt_steps=980 +[epoch 43/50] step=1000 train_loss=0.0107 tok_s=60065.1 opt_steps=1000 +[epoch 43/50] step=1020 train_loss=0.0107 tok_s=60073.7 opt_steps=1020 +[epoch 43/50] step=1040 train_loss=0.0107 tok_s=60078.0 opt_steps=1040 +[epoch 43/50] step=1060 train_loss=0.0108 tok_s=60094.4 opt_steps=1060 +[epoch 43/50] step=1080 train_loss=0.0108 tok_s=60094.6 opt_steps=1080 +[epoch 43/50] step=1100 train_loss=0.0108 tok_s=60106.3 opt_steps=1100 +[epoch 43/50] step=1120 train_loss=0.0108 tok_s=60102.1 opt_steps=1120 +[epoch 43/50] step=1140 train_loss=0.0108 tok_s=60095.3 opt_steps=1140 +[epoch 43/50] step=1160 train_loss=0.0109 tok_s=60085.7 opt_steps=1160 +[epoch 43/50] step=1180 train_loss=0.0109 tok_s=60087.0 opt_steps=1180 +[epoch 43/50] step=1200 train_loss=0.0109 tok_s=60083.6 opt_steps=1200 +[epoch 43/50] step=1220 train_loss=0.0109 tok_s=60083.4 opt_steps=1220 +[epoch 43/50] step=1240 train_loss=0.0109 tok_s=60080.7 opt_steps=1240 +[epoch 43/50] step=1260 train_loss=0.0109 tok_s=60086.9 opt_steps=1260 +[epoch 43/50] step=1280 train_loss=0.0109 tok_s=60087.4 opt_steps=1280 +[epoch 43/50] step=1300 train_loss=0.0109 tok_s=60095.4 opt_steps=1300 +[epoch 43/50] step=1320 train_loss=0.0110 tok_s=60100.9 opt_steps=1320 +[epoch 43/50] step=1340 train_loss=0.0110 tok_s=60108.3 opt_steps=1340 +[epoch 43/50] step=1360 train_loss=0.0110 tok_s=60108.4 opt_steps=1360 +[epoch 43/50] step=1380 train_loss=0.0110 tok_s=60106.5 opt_steps=1380 +[epoch 43/50] step=1400 train_loss=0.0110 tok_s=60097.6 opt_steps=1400 +[epoch 43/50] step=1420 train_loss=0.0110 tok_s=60093.9 opt_steps=1420 +[epoch 43/50] step=1440 train_loss=0.0110 tok_s=60098.8 opt_steps=1440 +[epoch 43/50] step=1460 train_loss=0.0111 tok_s=60099.4 opt_steps=1460 +[epoch 43/50] step=1480 train_loss=0.0111 tok_s=60096.6 opt_steps=1480 +[epoch 43/50] step=1500 train_loss=0.0111 tok_s=60099.1 opt_steps=1500 +[epoch 43/50] step=1520 train_loss=0.0111 tok_s=60099.5 opt_steps=1520 +[epoch 43/50] step=1540 train_loss=0.0111 tok_s=60100.3 opt_steps=1540 +[epoch 43/50] step=1560 train_loss=0.0111 tok_s=60099.5 opt_steps=1560 +[epoch 43/50] step=1580 train_loss=0.0112 tok_s=60098.6 opt_steps=1580 +[epoch 43/50] step=1600 train_loss=0.0112 tok_s=60103.1 opt_steps=1600 +[epoch 43/50] step=1620 train_loss=0.0112 tok_s=60107.1 opt_steps=1620 +[epoch 43/50] step=1640 train_loss=0.0112 tok_s=60105.8 opt_steps=1640 +[epoch 43/50] step=1660 train_loss=0.0112 tok_s=60116.2 opt_steps=1660 +[epoch 43/50] step=1680 train_loss=0.0112 tok_s=60123.0 opt_steps=1680 +[epoch 43/50] step=1700 train_loss=0.0112 tok_s=60120.1 opt_steps=1700 +[epoch 43/50] step=1720 train_loss=0.0113 tok_s=60116.3 opt_steps=1720 +[epoch 43/50] step=1740 train_loss=0.0113 tok_s=60110.3 opt_steps=1740 +[epoch 43/50] step=1760 train_loss=0.0113 tok_s=60106.9 opt_steps=1760 +[epoch 43/50] step=1780 train_loss=0.0113 tok_s=60103.3 opt_steps=1780 +[epoch 43/50] step=1800 train_loss=0.0113 tok_s=60106.6 opt_steps=1800 +[epoch 43/50] step=1820 train_loss=0.0113 tok_s=60105.9 opt_steps=1820 +[epoch 43/50] step=1840 train_loss=0.0113 tok_s=60099.2 opt_steps=1840 +[epoch 43/50] step=1860 train_loss=0.0113 tok_s=60103.5 opt_steps=1860 +[epoch 43/50] step=1880 train_loss=0.0113 tok_s=60104.1 opt_steps=1880 +[epoch 43/50] step=1900 train_loss=0.0114 tok_s=60103.2 opt_steps=1900 +[epoch 43/50] step=1920 train_loss=0.0114 tok_s=60106.3 opt_steps=1920 +[epoch 43/50] step=1940 train_loss=0.0114 tok_s=60106.5 opt_steps=1940 +[epoch 43/50] step=1960 train_loss=0.0114 tok_s=60107.4 opt_steps=1960 +[epoch 43/50] step=1980 train_loss=0.0114 tok_s=60110.8 opt_steps=1980 +[epoch 43/50] step=2000 train_loss=0.0114 tok_s=60117.6 opt_steps=2000 +[epoch 43/50] step=2020 train_loss=0.0114 tok_s=60119.2 opt_steps=2020 +[epoch 43/50] step=2040 train_loss=0.0115 tok_s=60120.3 opt_steps=2040 +[epoch 43/50] step=2060 train_loss=0.0115 tok_s=60123.0 opt_steps=2060 +[epoch 43/50] step=2080 train_loss=0.0115 tok_s=60119.9 opt_steps=2080 +[epoch 43/50] step=2100 train_loss=0.0115 tok_s=60118.6 opt_steps=2100 +[epoch 43/50] step=2120 train_loss=0.0115 tok_s=60118.2 opt_steps=2120 +[epoch 43/50] step=2140 train_loss=0.0115 tok_s=60115.8 opt_steps=2140 +[epoch 43/50] step=2160 train_loss=0.0115 tok_s=60118.5 opt_steps=2160 +[epoch 43/50] step=2180 train_loss=0.0115 tok_s=60122.7 opt_steps=2180 +[epoch 43/50] step=2200 train_loss=0.0115 tok_s=60119.6 opt_steps=2200 +[epoch 43/50] step=2220 train_loss=0.0115 tok_s=60118.4 opt_steps=2220 +[epoch 43/50] step=2240 train_loss=0.0115 tok_s=60119.6 opt_steps=2240 +[epoch 43/50] step=2260 train_loss=0.0115 tok_s=60118.4 opt_steps=2260 +[epoch 43/50] step=2280 train_loss=0.0115 tok_s=60122.7 opt_steps=2280 +[epoch 43/50] step=2300 train_loss=0.0116 tok_s=60124.7 opt_steps=2300 +[epoch 43/50] step=2320 train_loss=0.0116 tok_s=60121.0 opt_steps=2320 +[epoch 43/50] step=2340 train_loss=0.0116 tok_s=60121.5 opt_steps=2340 +[epoch 43/50] step=2360 train_loss=0.0116 tok_s=60123.9 opt_steps=2360 +[epoch 43/50] step=2380 train_loss=0.0116 tok_s=60129.8 opt_steps=2380 +[epoch 43/50] step=2400 train_loss=0.0116 tok_s=60125.4 opt_steps=2400 +[epoch 43/50] step=2420 train_loss=0.0116 tok_s=60119.5 opt_steps=2420 +[epoch 43/50] step=2440 train_loss=0.0116 tok_s=60115.0 opt_steps=2440 +[epoch 43/50] step=2460 train_loss=0.0116 tok_s=60116.5 opt_steps=2460 +[epoch 43/50] step=2480 train_loss=0.0116 tok_s=60115.9 opt_steps=2480 +[epoch 43/50] step=2500 train_loss=0.0116 tok_s=60118.7 opt_steps=2500 +[epoch 43/50] step=2520 train_loss=0.0116 tok_s=60119.9 opt_steps=2520 +[epoch 43/50] step=2540 train_loss=0.0117 tok_s=60119.8 opt_steps=2540 +[epoch 43/50] step=2560 train_loss=0.0117 tok_s=60118.9 opt_steps=2560 +[epoch 43/50] step=2580 train_loss=0.0117 tok_s=60122.3 opt_steps=2580 +[epoch 43/50] step=2600 train_loss=0.0117 tok_s=60123.2 opt_steps=2600 +[epoch 43/50] step=2620 train_loss=0.0117 tok_s=60128.6 opt_steps=2620 +[epoch 43/50] step=2640 train_loss=0.0117 tok_s=60130.3 opt_steps=2640 +[epoch 43/50] step=2660 train_loss=0.0117 tok_s=60129.9 opt_steps=2660 +[epoch 43/50] step=2680 train_loss=0.0117 tok_s=60127.6 opt_steps=2680 +[epoch 43/50] step=2700 train_loss=0.0117 tok_s=60131.1 opt_steps=2700 +[epoch 43/50] step=2720 train_loss=0.0117 tok_s=60132.5 opt_steps=2720 +[epoch 43/50] step=2740 train_loss=0.0117 tok_s=60132.5 opt_steps=2740 +[epoch 43/50] step=2760 train_loss=0.0117 tok_s=60132.0 opt_steps=2760 +[epoch 43/50] step=2780 train_loss=0.0117 tok_s=60134.4 opt_steps=2780 +[epoch 43/50] step=2800 train_loss=0.0118 tok_s=60137.1 opt_steps=2800 +[epoch 43/50] step=2820 train_loss=0.0118 tok_s=60139.1 opt_steps=2820 +[epoch 43/50] step=2840 train_loss=0.0118 tok_s=60137.7 opt_steps=2840 +[epoch 43/50] step=2860 train_loss=0.0118 tok_s=60142.1 opt_steps=2860 +[epoch 43/50] step=2880 train_loss=0.0118 tok_s=60140.7 opt_steps=2880 +[epoch 43/50] step=2900 train_loss=0.0118 tok_s=60141.2 opt_steps=2900 +[epoch 43/50] step=2920 train_loss=0.0118 tok_s=60144.3 opt_steps=2920 +[epoch 43/50] step=2940 train_loss=0.0118 tok_s=60143.7 opt_steps=2940 +[epoch 43/50] step=2960 train_loss=0.0118 tok_s=60142.9 opt_steps=2960 +[epoch 43/50] step=2980 train_loss=0.0118 tok_s=60144.6 opt_steps=2980 +[epoch 43/50] step=3000 train_loss=0.0118 tok_s=60146.4 opt_steps=3000 +[epoch 43/50] step=3020 train_loss=0.0118 tok_s=60144.1 opt_steps=3020 +[epoch 43/50] step=3040 train_loss=0.0118 tok_s=60143.4 opt_steps=3040 +[epoch 43/50] step=3060 train_loss=0.0118 tok_s=60142.3 opt_steps=3060 +[epoch 43/50] step=3080 train_loss=0.0118 tok_s=60141.7 opt_steps=3080 +[epoch 43/50] step=3100 train_loss=0.0119 tok_s=60142.3 opt_steps=3100 +[epoch 43/50] step=3120 train_loss=0.0119 tok_s=60141.4 opt_steps=3120 +[epoch 43/50] step=3140 train_loss=0.0119 tok_s=60140.0 opt_steps=3140 +[epoch 43/50] step=3160 train_loss=0.0119 tok_s=60142.6 opt_steps=3160 +[epoch 43/50] step=3180 train_loss=0.0119 tok_s=60140.2 opt_steps=3180 +[epoch 43/50] step=3200 train_loss=0.0119 tok_s=60139.3 opt_steps=3200 +[epoch 43/50] step=3220 train_loss=0.0119 tok_s=60139.5 opt_steps=3220 +[epoch 43/50] step=3240 train_loss=0.0119 tok_s=60138.3 opt_steps=3240 +[epoch 43/50] step=3260 train_loss=0.0119 tok_s=60134.1 opt_steps=3260 +[epoch 43/50] train_loss=0.0119 val_skipped tok_s=60135.2 opt_steps=3273 +[epoch 44/50] step=20 train_loss=0.0099 tok_s=58172.5 opt_steps=20 +[epoch 44/50] step=40 train_loss=0.0098 tok_s=59165.9 opt_steps=40 +[epoch 44/50] step=60 train_loss=0.0101 tok_s=59460.1 opt_steps=60 +[epoch 44/50] step=80 train_loss=0.0100 tok_s=59553.3 opt_steps=80 +[epoch 44/50] step=100 train_loss=0.0099 tok_s=59760.9 opt_steps=100 +[epoch 44/50] step=120 train_loss=0.0100 tok_s=59786.6 opt_steps=120 +[epoch 44/50] step=140 train_loss=0.0100 tok_s=59877.5 opt_steps=140 +[epoch 44/50] step=160 train_loss=0.0099 tok_s=59981.2 opt_steps=160 +[epoch 44/50] step=180 train_loss=0.0099 tok_s=59994.3 opt_steps=180 +[epoch 44/50] step=200 train_loss=0.0099 tok_s=60022.6 opt_steps=200 +[epoch 44/50] step=220 train_loss=0.0098 tok_s=60057.7 opt_steps=220 +[epoch 44/50] step=240 train_loss=0.0097 tok_s=60075.2 opt_steps=240 +[epoch 44/50] step=260 train_loss=0.0097 tok_s=60074.4 opt_steps=260 +[epoch 44/50] step=280 train_loss=0.0097 tok_s=60075.7 opt_steps=280 +[epoch 44/50] step=300 train_loss=0.0097 tok_s=60095.3 opt_steps=300 +[epoch 44/50] step=320 train_loss=0.0097 tok_s=60093.2 opt_steps=320 +[epoch 44/50] step=340 train_loss=0.0097 tok_s=60092.9 opt_steps=340 +[epoch 44/50] step=360 train_loss=0.0097 tok_s=60075.8 opt_steps=360 +[epoch 44/50] step=380 train_loss=0.0097 tok_s=60110.4 opt_steps=380 +[epoch 44/50] step=400 train_loss=0.0098 tok_s=60091.8 opt_steps=400 +[epoch 44/50] step=420 train_loss=0.0098 tok_s=60091.1 opt_steps=420 +[epoch 44/50] step=440 train_loss=0.0098 tok_s=60099.5 opt_steps=440 +[epoch 44/50] step=460 train_loss=0.0098 tok_s=60109.2 opt_steps=460 +[epoch 44/50] step=480 train_loss=0.0099 tok_s=60112.2 opt_steps=480 +[epoch 44/50] step=500 train_loss=0.0099 tok_s=60133.7 opt_steps=500 +[epoch 44/50] step=520 train_loss=0.0099 tok_s=60142.7 opt_steps=520 +[epoch 44/50] step=540 train_loss=0.0099 tok_s=60138.9 opt_steps=540 +[epoch 44/50] step=560 train_loss=0.0099 tok_s=60139.7 opt_steps=560 +[epoch 44/50] step=580 train_loss=0.0099 tok_s=60144.3 opt_steps=580 +[epoch 44/50] step=600 train_loss=0.0100 tok_s=60151.0 opt_steps=600 +[epoch 44/50] step=620 train_loss=0.0100 tok_s=60137.9 opt_steps=620 +[epoch 44/50] step=640 train_loss=0.0100 tok_s=60155.0 opt_steps=640 +[epoch 44/50] step=660 train_loss=0.0100 tok_s=60160.1 opt_steps=660 +[epoch 44/50] step=680 train_loss=0.0101 tok_s=60145.4 opt_steps=680 +[epoch 44/50] step=700 train_loss=0.0101 tok_s=60143.6 opt_steps=700 +[epoch 44/50] step=720 train_loss=0.0101 tok_s=60130.4 opt_steps=720 +[epoch 44/50] step=740 train_loss=0.0101 tok_s=60115.1 opt_steps=740 +[epoch 44/50] step=760 train_loss=0.0102 tok_s=60117.4 opt_steps=760 +[epoch 44/50] step=780 train_loss=0.0102 tok_s=60107.0 opt_steps=780 +[epoch 44/50] step=800 train_loss=0.0102 tok_s=60125.0 opt_steps=800 +[epoch 44/50] step=820 train_loss=0.0102 tok_s=60136.2 opt_steps=820 +[epoch 44/50] step=840 train_loss=0.0103 tok_s=60134.9 opt_steps=840 +[epoch 44/50] step=860 train_loss=0.0103 tok_s=60119.3 opt_steps=860 +[epoch 44/50] step=880 train_loss=0.0103 tok_s=60109.1 opt_steps=880 +[epoch 44/50] step=900 train_loss=0.0103 tok_s=60111.9 opt_steps=900 +[epoch 44/50] step=920 train_loss=0.0103 tok_s=60106.1 opt_steps=920 +[epoch 44/50] step=940 train_loss=0.0103 tok_s=60113.4 opt_steps=940 +[epoch 44/50] step=960 train_loss=0.0103 tok_s=60108.1 opt_steps=960 +[epoch 44/50] step=980 train_loss=0.0104 tok_s=60105.6 opt_steps=980 +[epoch 44/50] step=1000 train_loss=0.0104 tok_s=60113.2 opt_steps=1000 +[epoch 44/50] step=1020 train_loss=0.0104 tok_s=60102.9 opt_steps=1020 +[epoch 44/50] step=1040 train_loss=0.0104 tok_s=60102.6 opt_steps=1040 +[epoch 44/50] step=1060 train_loss=0.0104 tok_s=60112.3 opt_steps=1060 +[epoch 44/50] step=1080 train_loss=0.0105 tok_s=60102.7 opt_steps=1080 +[epoch 44/50] step=1100 train_loss=0.0105 tok_s=60096.7 opt_steps=1100 +[epoch 44/50] step=1120 train_loss=0.0105 tok_s=60091.5 opt_steps=1120 +[epoch 44/50] step=1140 train_loss=0.0105 tok_s=60091.6 opt_steps=1140 +[epoch 44/50] step=1160 train_loss=0.0106 tok_s=60098.8 opt_steps=1160 +[epoch 44/50] step=1180 train_loss=0.0106 tok_s=60088.7 opt_steps=1180 +[epoch 44/50] step=1200 train_loss=0.0106 tok_s=60079.6 opt_steps=1200 +[epoch 44/50] step=1220 train_loss=0.0106 tok_s=60080.8 opt_steps=1220 +[epoch 44/50] step=1240 train_loss=0.0106 tok_s=60084.3 opt_steps=1240 +[epoch 44/50] step=1260 train_loss=0.0106 tok_s=60083.4 opt_steps=1260 +[epoch 44/50] step=1280 train_loss=0.0106 tok_s=60080.1 opt_steps=1280 +[epoch 44/50] step=1300 train_loss=0.0106 tok_s=60069.1 opt_steps=1300 +[epoch 44/50] step=1320 train_loss=0.0107 tok_s=60072.2 opt_steps=1320 +[epoch 44/50] step=1340 train_loss=0.0107 tok_s=60072.7 opt_steps=1340 +[epoch 44/50] step=1360 train_loss=0.0107 tok_s=60074.0 opt_steps=1360 +[epoch 44/50] step=1380 train_loss=0.0107 tok_s=60072.3 opt_steps=1380 +[epoch 44/50] step=1400 train_loss=0.0107 tok_s=60072.0 opt_steps=1400 +[epoch 44/50] step=1420 train_loss=0.0107 tok_s=60070.7 opt_steps=1420 +[epoch 44/50] step=1440 train_loss=0.0107 tok_s=60059.6 opt_steps=1440 +[epoch 44/50] step=1460 train_loss=0.0107 tok_s=60060.1 opt_steps=1460 +[epoch 44/50] step=1480 train_loss=0.0107 tok_s=60057.5 opt_steps=1480 +[epoch 44/50] step=1500 train_loss=0.0108 tok_s=60057.5 opt_steps=1500 +[epoch 44/50] step=1520 train_loss=0.0108 tok_s=60046.0 opt_steps=1520 +[epoch 44/50] step=1540 train_loss=0.0108 tok_s=60050.6 opt_steps=1540 +[epoch 44/50] step=1560 train_loss=0.0108 tok_s=60054.4 opt_steps=1560 +[epoch 44/50] step=1580 train_loss=0.0108 tok_s=60050.2 opt_steps=1580 +[epoch 44/50] step=1600 train_loss=0.0108 tok_s=60051.2 opt_steps=1600 +[epoch 44/50] step=1620 train_loss=0.0108 tok_s=60049.4 opt_steps=1620 +[epoch 44/50] step=1640 train_loss=0.0108 tok_s=60053.2 opt_steps=1640 +[epoch 44/50] step=1660 train_loss=0.0109 tok_s=60051.0 opt_steps=1660 +[epoch 44/50] step=1680 train_loss=0.0109 tok_s=60056.1 opt_steps=1680 +[epoch 44/50] step=1700 train_loss=0.0109 tok_s=60054.8 opt_steps=1700 +[epoch 44/50] step=1720 train_loss=0.0109 tok_s=60056.3 opt_steps=1720 +[epoch 44/50] step=1740 train_loss=0.0109 tok_s=60052.6 opt_steps=1740 +[epoch 44/50] step=1760 train_loss=0.0109 tok_s=60056.5 opt_steps=1760 +[epoch 44/50] step=1780 train_loss=0.0109 tok_s=60060.5 opt_steps=1780 +[epoch 44/50] step=1800 train_loss=0.0109 tok_s=60065.0 opt_steps=1800 +[epoch 44/50] step=1820 train_loss=0.0110 tok_s=60068.7 opt_steps=1820 +[epoch 44/50] step=1840 train_loss=0.0110 tok_s=60071.2 opt_steps=1840 +[epoch 44/50] step=1860 train_loss=0.0110 tok_s=60072.6 opt_steps=1860 +[epoch 44/50] step=1880 train_loss=0.0110 tok_s=60077.3 opt_steps=1880 +[epoch 44/50] step=1900 train_loss=0.0110 tok_s=60083.6 opt_steps=1900 +[epoch 44/50] step=1920 train_loss=0.0110 tok_s=60087.7 opt_steps=1920 +[epoch 44/50] step=1940 train_loss=0.0111 tok_s=60085.0 opt_steps=1940 +[epoch 44/50] step=1960 train_loss=0.0111 tok_s=60084.3 opt_steps=1960 +[epoch 44/50] step=1980 train_loss=0.0111 tok_s=60088.3 opt_steps=1980 +[epoch 44/50] step=2000 train_loss=0.0111 tok_s=60090.2 opt_steps=2000 +[epoch 44/50] step=2020 train_loss=0.0111 tok_s=60092.4 opt_steps=2020 +[epoch 44/50] step=2040 train_loss=0.0111 tok_s=60093.1 opt_steps=2040 +[epoch 44/50] step=2060 train_loss=0.0111 tok_s=60091.1 opt_steps=2060 +[epoch 44/50] step=2080 train_loss=0.0111 tok_s=60088.7 opt_steps=2080 +[epoch 44/50] step=2100 train_loss=0.0112 tok_s=60089.7 opt_steps=2100 +[epoch 44/50] step=2120 train_loss=0.0112 tok_s=60092.1 opt_steps=2120 +[epoch 44/50] step=2140 train_loss=0.0112 tok_s=60094.0 opt_steps=2140 +[epoch 44/50] step=2160 train_loss=0.0112 tok_s=60093.0 opt_steps=2160 +[epoch 44/50] step=2180 train_loss=0.0112 tok_s=60094.6 opt_steps=2180 +[epoch 44/50] step=2200 train_loss=0.0112 tok_s=60100.7 opt_steps=2200 +[epoch 44/50] step=2220 train_loss=0.0112 tok_s=60097.7 opt_steps=2220 +[epoch 44/50] step=2240 train_loss=0.0112 tok_s=60098.4 opt_steps=2240 +[epoch 44/50] step=2260 train_loss=0.0112 tok_s=60101.1 opt_steps=2260 +[epoch 44/50] step=2280 train_loss=0.0112 tok_s=60099.7 opt_steps=2280 +[epoch 44/50] step=2300 train_loss=0.0112 tok_s=60100.9 opt_steps=2300 +[epoch 44/50] step=2320 train_loss=0.0113 tok_s=60097.9 opt_steps=2320 +[epoch 44/50] step=2340 train_loss=0.0113 tok_s=60096.9 opt_steps=2340 +[epoch 44/50] step=2360 train_loss=0.0113 tok_s=60095.7 opt_steps=2360 +[epoch 44/50] step=2380 train_loss=0.0113 tok_s=60096.4 opt_steps=2380 +[epoch 44/50] step=2400 train_loss=0.0113 tok_s=60095.4 opt_steps=2400 +[epoch 44/50] step=2420 train_loss=0.0113 tok_s=60097.4 opt_steps=2420 +[epoch 44/50] step=2440 train_loss=0.0113 tok_s=60097.3 opt_steps=2440 +[epoch 44/50] step=2460 train_loss=0.0113 tok_s=60100.4 opt_steps=2460 +[epoch 44/50] step=2480 train_loss=0.0113 tok_s=60100.4 opt_steps=2480 +[epoch 44/50] step=2500 train_loss=0.0113 tok_s=60098.7 opt_steps=2500 +[epoch 44/50] step=2520 train_loss=0.0113 tok_s=60099.7 opt_steps=2520 +[epoch 44/50] step=2540 train_loss=0.0113 tok_s=60096.3 opt_steps=2540 +[epoch 44/50] step=2560 train_loss=0.0114 tok_s=60097.1 opt_steps=2560 +[epoch 44/50] step=2580 train_loss=0.0114 tok_s=60099.3 opt_steps=2580 +[epoch 44/50] step=2600 train_loss=0.0114 tok_s=60098.6 opt_steps=2600 +[epoch 44/50] step=2620 train_loss=0.0114 tok_s=60096.3 opt_steps=2620 +[epoch 44/50] step=2640 train_loss=0.0114 tok_s=60095.9 opt_steps=2640 +[epoch 44/50] step=2660 train_loss=0.0114 tok_s=60099.9 opt_steps=2660 +[epoch 44/50] step=2680 train_loss=0.0114 tok_s=60101.5 opt_steps=2680 +[epoch 44/50] step=2700 train_loss=0.0114 tok_s=60104.1 opt_steps=2700 +[epoch 44/50] step=2720 train_loss=0.0114 tok_s=60103.3 opt_steps=2720 +[epoch 44/50] step=2740 train_loss=0.0114 tok_s=60102.8 opt_steps=2740 +[epoch 44/50] step=2760 train_loss=0.0114 tok_s=60103.5 opt_steps=2760 +[epoch 44/50] step=2780 train_loss=0.0114 tok_s=60106.1 opt_steps=2780 +[epoch 44/50] step=2800 train_loss=0.0114 tok_s=60103.9 opt_steps=2800 +[epoch 44/50] step=2820 train_loss=0.0115 tok_s=60106.5 opt_steps=2820 +[epoch 44/50] step=2840 train_loss=0.0115 tok_s=60106.6 opt_steps=2840 +[epoch 44/50] step=2860 train_loss=0.0115 tok_s=60109.0 opt_steps=2860 +[epoch 44/50] step=2880 train_loss=0.0115 tok_s=60109.5 opt_steps=2880 +[epoch 44/50] step=2900 train_loss=0.0115 tok_s=60110.2 opt_steps=2900 +[epoch 44/50] step=2920 train_loss=0.0115 tok_s=60110.0 opt_steps=2920 +[epoch 44/50] step=2940 train_loss=0.0115 tok_s=60108.3 opt_steps=2940 +[epoch 44/50] step=2960 train_loss=0.0115 tok_s=60113.2 opt_steps=2960 +[epoch 44/50] step=2980 train_loss=0.0115 tok_s=60115.1 opt_steps=2980 +[epoch 44/50] step=3000 train_loss=0.0115 tok_s=60115.7 opt_steps=3000 +[epoch 44/50] step=3020 train_loss=0.0115 tok_s=60115.0 opt_steps=3020 +[epoch 44/50] step=3040 train_loss=0.0115 tok_s=60113.3 opt_steps=3040 +[epoch 44/50] step=3060 train_loss=0.0116 tok_s=60111.3 opt_steps=3060 +[epoch 44/50] step=3080 train_loss=0.0116 tok_s=60110.7 opt_steps=3080 +[epoch 44/50] step=3100 train_loss=0.0116 tok_s=60112.2 opt_steps=3100 +[epoch 44/50] step=3120 train_loss=0.0116 tok_s=60113.2 opt_steps=3120 +[epoch 44/50] step=3140 train_loss=0.0116 tok_s=60110.0 opt_steps=3140 +[epoch 44/50] step=3160 train_loss=0.0116 tok_s=60109.2 opt_steps=3160 +[epoch 44/50] step=3180 train_loss=0.0116 tok_s=60112.5 opt_steps=3180 +[epoch 44/50] step=3200 train_loss=0.0116 tok_s=60113.2 opt_steps=3200 +[epoch 44/50] step=3220 train_loss=0.0116 tok_s=60108.6 opt_steps=3220 +[epoch 44/50] step=3240 train_loss=0.0116 tok_s=60111.7 opt_steps=3240 +[epoch 44/50] step=3260 train_loss=0.0116 tok_s=60109.8 opt_steps=3260 +[epoch 44/50] train_loss=0.0116 val_skipped tok_s=60107.4 opt_steps=3273 +[epoch 45/50] step=20 train_loss=0.0091 tok_s=58094.8 opt_steps=20 +[epoch 45/50] step=40 train_loss=0.0092 tok_s=59082.2 opt_steps=40 +[epoch 45/50] step=60 train_loss=0.0092 tok_s=59317.3 opt_steps=60 +[epoch 45/50] step=80 train_loss=0.0092 tok_s=59625.0 opt_steps=80 +[epoch 45/50] step=100 train_loss=0.0092 tok_s=59732.2 opt_steps=100 +[epoch 45/50] step=120 train_loss=0.0093 tok_s=59770.5 opt_steps=120 +[epoch 45/50] step=140 train_loss=0.0093 tok_s=59876.5 opt_steps=140 +[epoch 45/50] step=160 train_loss=0.0092 tok_s=59881.2 opt_steps=160 +[epoch 45/50] step=180 train_loss=0.0093 tok_s=59918.6 opt_steps=180 +[epoch 45/50] step=200 train_loss=0.0093 tok_s=59966.0 opt_steps=200 +[epoch 45/50] step=220 train_loss=0.0093 tok_s=60012.0 opt_steps=220 +[epoch 45/50] step=240 train_loss=0.0093 tok_s=60032.3 opt_steps=240 +[epoch 45/50] step=260 train_loss=0.0094 tok_s=60019.8 opt_steps=260 +[epoch 45/50] step=280 train_loss=0.0094 tok_s=60010.3 opt_steps=280 +[epoch 45/50] step=300 train_loss=0.0094 tok_s=60023.7 opt_steps=300 +[epoch 45/50] step=320 train_loss=0.0094 tok_s=60045.9 opt_steps=320 +[epoch 45/50] step=340 train_loss=0.0094 tok_s=60040.9 opt_steps=340 +[epoch 45/50] step=360 train_loss=0.0095 tok_s=60060.7 opt_steps=360 +[epoch 45/50] step=380 train_loss=0.0095 tok_s=60097.0 opt_steps=380 +[epoch 45/50] step=400 train_loss=0.0095 tok_s=60107.9 opt_steps=400 +[epoch 45/50] step=420 train_loss=0.0096 tok_s=60094.1 opt_steps=420 +[epoch 45/50] step=440 train_loss=0.0096 tok_s=60084.2 opt_steps=440 +[epoch 45/50] step=460 train_loss=0.0096 tok_s=60079.6 opt_steps=460 +[epoch 45/50] step=480 train_loss=0.0096 tok_s=60069.0 opt_steps=480 +[epoch 45/50] step=500 train_loss=0.0097 tok_s=60077.4 opt_steps=500 +[epoch 45/50] step=520 train_loss=0.0097 tok_s=60090.9 opt_steps=520 +[epoch 45/50] step=540 train_loss=0.0097 tok_s=60093.3 opt_steps=540 +[epoch 45/50] step=560 train_loss=0.0098 tok_s=60096.6 opt_steps=560 +[epoch 45/50] step=580 train_loss=0.0098 tok_s=60106.8 opt_steps=580 +[epoch 45/50] step=600 train_loss=0.0098 tok_s=60093.8 opt_steps=600 +[epoch 45/50] step=620 train_loss=0.0098 tok_s=60098.2 opt_steps=620 +[epoch 45/50] step=640 train_loss=0.0099 tok_s=60097.2 opt_steps=640 +[epoch 45/50] step=660 train_loss=0.0099 tok_s=60086.9 opt_steps=660 +[epoch 45/50] step=680 train_loss=0.0099 tok_s=60089.2 opt_steps=680 +[epoch 45/50] step=700 train_loss=0.0099 tok_s=60094.8 opt_steps=700 +[epoch 45/50] step=720 train_loss=0.0100 tok_s=60086.1 opt_steps=720 +[epoch 45/50] step=740 train_loss=0.0100 tok_s=60083.5 opt_steps=740 +[epoch 45/50] step=760 train_loss=0.0100 tok_s=60071.9 opt_steps=760 +[epoch 45/50] step=780 train_loss=0.0100 tok_s=60057.2 opt_steps=780 +[epoch 45/50] step=800 train_loss=0.0100 tok_s=60049.4 opt_steps=800 +[epoch 45/50] step=820 train_loss=0.0100 tok_s=60034.6 opt_steps=820 +[epoch 45/50] step=840 train_loss=0.0101 tok_s=60028.9 opt_steps=840 +[epoch 45/50] step=860 train_loss=0.0101 tok_s=60025.4 opt_steps=860 +[epoch 45/50] step=880 train_loss=0.0101 tok_s=60022.9 opt_steps=880 +[epoch 45/50] step=900 train_loss=0.0101 tok_s=60028.5 opt_steps=900 +[epoch 45/50] step=920 train_loss=0.0101 tok_s=60019.9 opt_steps=920 +[epoch 45/50] step=940 train_loss=0.0102 tok_s=60008.7 opt_steps=940 +[epoch 45/50] step=960 train_loss=0.0102 tok_s=60003.5 opt_steps=960 +[epoch 45/50] step=980 train_loss=0.0102 tok_s=59995.3 opt_steps=980 +[epoch 45/50] step=1000 train_loss=0.0102 tok_s=60002.7 opt_steps=1000 +[epoch 45/50] step=1020 train_loss=0.0102 tok_s=59997.3 opt_steps=1020 +[epoch 45/50] step=1040 train_loss=0.0102 tok_s=59988.8 opt_steps=1040 +[epoch 45/50] step=1060 train_loss=0.0103 tok_s=59993.1 opt_steps=1060 +[epoch 45/50] step=1080 train_loss=0.0103 tok_s=59988.8 opt_steps=1080 +[epoch 45/50] step=1100 train_loss=0.0103 tok_s=59989.9 opt_steps=1100 +[epoch 45/50] step=1120 train_loss=0.0103 tok_s=59994.0 opt_steps=1120 +[epoch 45/50] step=1140 train_loss=0.0103 tok_s=59986.5 opt_steps=1140 +[epoch 45/50] step=1160 train_loss=0.0103 tok_s=59984.3 opt_steps=1160 +[epoch 45/50] step=1180 train_loss=0.0104 tok_s=59984.9 opt_steps=1180 +[epoch 45/50] step=1200 train_loss=0.0104 tok_s=59990.1 opt_steps=1200 +[epoch 45/50] step=1220 train_loss=0.0104 tok_s=59992.4 opt_steps=1220 +[epoch 45/50] step=1240 train_loss=0.0104 tok_s=59988.1 opt_steps=1240 +[epoch 45/50] step=1260 train_loss=0.0104 tok_s=59981.9 opt_steps=1260 +[epoch 45/50] step=1280 train_loss=0.0105 tok_s=59988.0 opt_steps=1280 +[epoch 45/50] step=1300 train_loss=0.0105 tok_s=59978.4 opt_steps=1300 +[epoch 45/50] step=1320 train_loss=0.0105 tok_s=59973.2 opt_steps=1320 +[epoch 45/50] step=1340 train_loss=0.0105 tok_s=59975.7 opt_steps=1340 +[epoch 45/50] step=1360 train_loss=0.0105 tok_s=59976.8 opt_steps=1360 +[epoch 45/50] step=1380 train_loss=0.0105 tok_s=59971.5 opt_steps=1380 +[epoch 45/50] step=1400 train_loss=0.0105 tok_s=59978.9 opt_steps=1400 +[epoch 45/50] step=1420 train_loss=0.0105 tok_s=59989.0 opt_steps=1420 +[epoch 45/50] step=1440 train_loss=0.0105 tok_s=59988.5 opt_steps=1440 +[epoch 45/50] step=1460 train_loss=0.0106 tok_s=59995.4 opt_steps=1460 +[epoch 45/50] step=1480 train_loss=0.0106 tok_s=59999.7 opt_steps=1480 +[epoch 45/50] step=1500 train_loss=0.0106 tok_s=59999.3 opt_steps=1500 +[epoch 45/50] step=1520 train_loss=0.0106 tok_s=60000.5 opt_steps=1520 +[epoch 45/50] step=1540 train_loss=0.0106 tok_s=60001.2 opt_steps=1540 +[epoch 45/50] step=1560 train_loss=0.0106 tok_s=60005.2 opt_steps=1560 +[epoch 45/50] step=1580 train_loss=0.0106 tok_s=60008.0 opt_steps=1580 +[epoch 45/50] step=1600 train_loss=0.0107 tok_s=60011.1 opt_steps=1600 +[epoch 45/50] step=1620 train_loss=0.0107 tok_s=60015.2 opt_steps=1620 +[epoch 45/50] step=1640 train_loss=0.0107 tok_s=60017.8 opt_steps=1640 +[epoch 45/50] step=1660 train_loss=0.0107 tok_s=59962.1 opt_steps=1660 +[epoch 45/50] step=1680 train_loss=0.0107 tok_s=59953.2 opt_steps=1680 +[epoch 45/50] step=1700 train_loss=0.0107 tok_s=59945.7 opt_steps=1700 +[epoch 45/50] step=1720 train_loss=0.0107 tok_s=59928.7 opt_steps=1720 +[epoch 45/50] step=1740 train_loss=0.0107 tok_s=59916.5 opt_steps=1740 +[epoch 45/50] step=1760 train_loss=0.0108 tok_s=59907.6 opt_steps=1760 +[epoch 45/50] step=1780 train_loss=0.0108 tok_s=59911.1 opt_steps=1780 +[epoch 45/50] step=1800 train_loss=0.0108 tok_s=59918.0 opt_steps=1800 +[epoch 45/50] step=1820 train_loss=0.0108 tok_s=59916.2 opt_steps=1820 +[epoch 45/50] step=1840 train_loss=0.0108 tok_s=59919.5 opt_steps=1840 +[epoch 45/50] step=1860 train_loss=0.0108 tok_s=59923.8 opt_steps=1860 +[epoch 45/50] step=1880 train_loss=0.0108 tok_s=59924.1 opt_steps=1880 +[epoch 45/50] step=1900 train_loss=0.0108 tok_s=59924.1 opt_steps=1900 +[epoch 45/50] step=1920 train_loss=0.0108 tok_s=59932.1 opt_steps=1920 +[epoch 45/50] step=1940 train_loss=0.0109 tok_s=59937.4 opt_steps=1940 +[epoch 45/50] step=1960 train_loss=0.0109 tok_s=59947.4 opt_steps=1960 +[epoch 45/50] step=1980 train_loss=0.0109 tok_s=59953.9 opt_steps=1980 +[epoch 45/50] step=2000 train_loss=0.0109 tok_s=59952.9 opt_steps=2000 +[epoch 45/50] step=2020 train_loss=0.0109 tok_s=59958.1 opt_steps=2020 +[epoch 45/50] step=2040 train_loss=0.0109 tok_s=59956.8 opt_steps=2040 +[epoch 45/50] step=2060 train_loss=0.0109 tok_s=59956.3 opt_steps=2060 +[epoch 45/50] step=2080 train_loss=0.0109 tok_s=59953.0 opt_steps=2080 +[epoch 45/50] step=2100 train_loss=0.0109 tok_s=59956.4 opt_steps=2100 +[epoch 45/50] step=2120 train_loss=0.0109 tok_s=59956.8 opt_steps=2120 +[epoch 45/50] step=2140 train_loss=0.0109 tok_s=59959.0 opt_steps=2140 +[epoch 45/50] step=2160 train_loss=0.0109 tok_s=59965.9 opt_steps=2160 +[epoch 45/50] step=2180 train_loss=0.0110 tok_s=59966.0 opt_steps=2180 +[epoch 45/50] step=2200 train_loss=0.0110 tok_s=59971.2 opt_steps=2200 +[epoch 45/50] step=2220 train_loss=0.0110 tok_s=59971.4 opt_steps=2220 +[epoch 45/50] step=2240 train_loss=0.0110 tok_s=59974.1 opt_steps=2240 +[epoch 45/50] step=2260 train_loss=0.0110 tok_s=59975.3 opt_steps=2260 +[epoch 45/50] step=2280 train_loss=0.0110 tok_s=59976.2 opt_steps=2280 +[epoch 45/50] step=2300 train_loss=0.0110 tok_s=59977.8 opt_steps=2300 +[epoch 45/50] step=2320 train_loss=0.0110 tok_s=59976.5 opt_steps=2320 +[epoch 45/50] step=2340 train_loss=0.0110 tok_s=59983.3 opt_steps=2340 +[epoch 45/50] step=2360 train_loss=0.0110 tok_s=59986.1 opt_steps=2360 +[epoch 45/50] step=2380 train_loss=0.0110 tok_s=59989.1 opt_steps=2380 +[epoch 45/50] step=2400 train_loss=0.0111 tok_s=59990.8 opt_steps=2400 +[epoch 45/50] step=2420 train_loss=0.0111 tok_s=59992.9 opt_steps=2420 +[epoch 45/50] step=2440 train_loss=0.0111 tok_s=59997.3 opt_steps=2440 +[epoch 45/50] step=2460 train_loss=0.0111 tok_s=59999.6 opt_steps=2460 +[epoch 45/50] step=2480 train_loss=0.0111 tok_s=60001.2 opt_steps=2480 +[epoch 45/50] step=2500 train_loss=0.0111 tok_s=60003.9 opt_steps=2500 +[epoch 45/50] step=2520 train_loss=0.0111 tok_s=60005.2 opt_steps=2520 +[epoch 45/50] step=2540 train_loss=0.0111 tok_s=60007.8 opt_steps=2540 +[epoch 45/50] step=2560 train_loss=0.0111 tok_s=60010.7 opt_steps=2560 +[epoch 45/50] step=2580 train_loss=0.0111 tok_s=60015.3 opt_steps=2580 +[epoch 45/50] step=2600 train_loss=0.0112 tok_s=60019.9 opt_steps=2600 +[epoch 45/50] step=2620 train_loss=0.0112 tok_s=60026.3 opt_steps=2620 +[epoch 45/50] step=2640 train_loss=0.0112 tok_s=60028.9 opt_steps=2640 +[epoch 45/50] step=2660 train_loss=0.0112 tok_s=60030.8 opt_steps=2660 +[epoch 45/50] step=2680 train_loss=0.0112 tok_s=60032.3 opt_steps=2680 +[epoch 45/50] step=2700 train_loss=0.0112 tok_s=60035.8 opt_steps=2700 +[epoch 45/50] step=2720 train_loss=0.0112 tok_s=60039.5 opt_steps=2720 +[epoch 45/50] step=2740 train_loss=0.0112 tok_s=60042.5 opt_steps=2740 +[epoch 45/50] step=2760 train_loss=0.0112 tok_s=60047.6 opt_steps=2760 +[epoch 45/50] step=2780 train_loss=0.0112 tok_s=60051.0 opt_steps=2780 +[epoch 45/50] step=2800 train_loss=0.0112 tok_s=60053.3 opt_steps=2800 +[epoch 45/50] step=2820 train_loss=0.0112 tok_s=60054.8 opt_steps=2820 +[epoch 45/50] step=2840 train_loss=0.0112 tok_s=60058.6 opt_steps=2840 +[epoch 45/50] step=2860 train_loss=0.0113 tok_s=60064.5 opt_steps=2860 +[epoch 45/50] step=2880 train_loss=0.0113 tok_s=60069.7 opt_steps=2880 +[epoch 45/50] step=2900 train_loss=0.0113 tok_s=60072.2 opt_steps=2900 +[epoch 45/50] step=2920 train_loss=0.0113 tok_s=60077.7 opt_steps=2920 +[epoch 45/50] step=2940 train_loss=0.0113 tok_s=60083.7 opt_steps=2940 +[epoch 45/50] step=2960 train_loss=0.0113 tok_s=60086.1 opt_steps=2960 +[epoch 45/50] step=2980 train_loss=0.0113 tok_s=60091.9 opt_steps=2980 +[epoch 45/50] step=3000 train_loss=0.0113 tok_s=60093.6 opt_steps=3000 +[epoch 45/50] step=3020 train_loss=0.0113 tok_s=60095.4 opt_steps=3020 +[epoch 45/50] step=3040 train_loss=0.0113 tok_s=60096.7 opt_steps=3040 +[epoch 45/50] step=3060 train_loss=0.0113 tok_s=60098.1 opt_steps=3060 +[epoch 45/50] step=3080 train_loss=0.0113 tok_s=60102.0 opt_steps=3080 +[epoch 45/50] step=3100 train_loss=0.0114 tok_s=60103.7 opt_steps=3100 +[epoch 45/50] step=3120 train_loss=0.0114 tok_s=60103.3 opt_steps=3120 +[epoch 45/50] step=3140 train_loss=0.0114 tok_s=60106.7 opt_steps=3140 +[epoch 45/50] step=3160 train_loss=0.0114 tok_s=60108.0 opt_steps=3160 +[epoch 45/50] step=3180 train_loss=0.0114 tok_s=60111.2 opt_steps=3180 +[epoch 45/50] step=3200 train_loss=0.0114 tok_s=60113.3 opt_steps=3200 +[epoch 45/50] step=3220 train_loss=0.0114 tok_s=60117.1 opt_steps=3220 +[epoch 45/50] step=3240 train_loss=0.0114 tok_s=60122.4 opt_steps=3240 +[epoch 45/50] step=3260 train_loss=0.0114 tok_s=60119.1 opt_steps=3260 +[epoch 45/50] train_loss=0.0114 val_skipped tok_s=60120.7 opt_steps=3273 +[epoch 46/50] step=20 train_loss=0.0095 tok_s=59475.6 opt_steps=20 +[epoch 46/50] step=40 train_loss=0.0095 tok_s=59971.3 opt_steps=40 +[epoch 46/50] step=60 train_loss=0.0095 tok_s=60332.3 opt_steps=60 +[epoch 46/50] step=80 train_loss=0.0096 tok_s=60429.3 opt_steps=80 +[epoch 46/50] step=100 train_loss=0.0095 tok_s=60562.3 opt_steps=100 +[epoch 46/50] step=120 train_loss=0.0095 tok_s=60624.3 opt_steps=120 +[epoch 46/50] step=140 train_loss=0.0094 tok_s=60628.0 opt_steps=140 +[epoch 46/50] step=160 train_loss=0.0094 tok_s=60599.1 opt_steps=160 +[epoch 46/50] step=180 train_loss=0.0094 tok_s=60601.1 opt_steps=180 +[epoch 46/50] step=200 train_loss=0.0094 tok_s=60585.5 opt_steps=200 +[epoch 46/50] step=220 train_loss=0.0094 tok_s=60597.2 opt_steps=220 +[epoch 46/50] step=240 train_loss=0.0094 tok_s=60598.6 opt_steps=240 +[epoch 46/50] step=260 train_loss=0.0094 tok_s=60607.3 opt_steps=260 +[epoch 46/50] step=280 train_loss=0.0094 tok_s=60635.9 opt_steps=280 +[epoch 46/50] step=300 train_loss=0.0094 tok_s=60645.5 opt_steps=300 +[epoch 46/50] step=320 train_loss=0.0094 tok_s=60657.1 opt_steps=320 +[epoch 46/50] step=340 train_loss=0.0094 tok_s=60652.3 opt_steps=340 +[epoch 46/50] step=360 train_loss=0.0095 tok_s=60622.1 opt_steps=360 +[epoch 46/50] step=380 train_loss=0.0095 tok_s=60634.3 opt_steps=380 +[epoch 46/50] step=400 train_loss=0.0095 tok_s=60628.4 opt_steps=400 +[epoch 46/50] step=420 train_loss=0.0095 tok_s=60636.7 opt_steps=420 +[epoch 46/50] step=440 train_loss=0.0096 tok_s=60626.2 opt_steps=440 +[epoch 46/50] step=460 train_loss=0.0096 tok_s=60626.9 opt_steps=460 +[epoch 46/50] step=480 train_loss=0.0096 tok_s=60629.0 opt_steps=480 +[epoch 46/50] step=500 train_loss=0.0096 tok_s=60634.0 opt_steps=500 +[epoch 46/50] step=520 train_loss=0.0096 tok_s=60633.3 opt_steps=520 +[epoch 46/50] step=540 train_loss=0.0096 tok_s=60634.3 opt_steps=540 +[epoch 46/50] step=560 train_loss=0.0097 tok_s=60636.1 opt_steps=560 +[epoch 46/50] step=580 train_loss=0.0097 tok_s=60642.8 opt_steps=580 +[epoch 46/50] step=600 train_loss=0.0097 tok_s=60650.7 opt_steps=600 +[epoch 46/50] step=620 train_loss=0.0097 tok_s=60635.0 opt_steps=620 +[epoch 46/50] step=640 train_loss=0.0098 tok_s=60629.3 opt_steps=640 +[epoch 46/50] step=660 train_loss=0.0098 tok_s=60607.3 opt_steps=660 +[epoch 46/50] step=680 train_loss=0.0098 tok_s=60616.7 opt_steps=680 +[epoch 46/50] step=700 train_loss=0.0098 tok_s=60629.8 opt_steps=700 +[epoch 46/50] step=720 train_loss=0.0098 tok_s=60634.7 opt_steps=720 +[epoch 46/50] step=740 train_loss=0.0098 tok_s=60627.5 opt_steps=740 +[epoch 46/50] step=760 train_loss=0.0098 tok_s=60628.5 opt_steps=760 +[epoch 46/50] step=780 train_loss=0.0098 tok_s=60615.1 opt_steps=780 +[epoch 46/50] step=800 train_loss=0.0099 tok_s=60616.9 opt_steps=800 +[epoch 46/50] step=820 train_loss=0.0099 tok_s=60628.6 opt_steps=820 +[epoch 46/50] step=840 train_loss=0.0099 tok_s=60622.0 opt_steps=840 +[epoch 46/50] step=860 train_loss=0.0099 tok_s=60625.3 opt_steps=860 +[epoch 46/50] step=880 train_loss=0.0099 tok_s=60631.8 opt_steps=880 +[epoch 46/50] step=900 train_loss=0.0100 tok_s=60629.2 opt_steps=900 +[epoch 46/50] step=920 train_loss=0.0100 tok_s=60630.5 opt_steps=920 +[epoch 46/50] step=940 train_loss=0.0100 tok_s=60624.4 opt_steps=940 +[epoch 46/50] step=960 train_loss=0.0100 tok_s=60624.2 opt_steps=960 +[epoch 46/50] step=980 train_loss=0.0100 tok_s=60621.3 opt_steps=980 +[epoch 46/50] step=1000 train_loss=0.0101 tok_s=60622.5 opt_steps=1000 +[epoch 46/50] step=1020 train_loss=0.0101 tok_s=60631.5 opt_steps=1020 +[epoch 46/50] step=1040 train_loss=0.0101 tok_s=60628.8 opt_steps=1040 +[epoch 46/50] step=1060 train_loss=0.0101 tok_s=60628.4 opt_steps=1060 +[epoch 46/50] step=1080 train_loss=0.0101 tok_s=60633.9 opt_steps=1080 +[epoch 46/50] step=1100 train_loss=0.0101 tok_s=60632.8 opt_steps=1100 +[epoch 46/50] step=1120 train_loss=0.0102 tok_s=60632.0 opt_steps=1120 +[epoch 46/50] step=1140 train_loss=0.0102 tok_s=60631.2 opt_steps=1140 +[epoch 46/50] step=1160 train_loss=0.0102 tok_s=60626.2 opt_steps=1160 +[epoch 46/50] step=1180 train_loss=0.0102 tok_s=60623.8 opt_steps=1180 +[epoch 46/50] step=1200 train_loss=0.0102 tok_s=60629.5 opt_steps=1200 +[epoch 46/50] step=1220 train_loss=0.0102 tok_s=60639.2 opt_steps=1220 +[epoch 46/50] step=1240 train_loss=0.0102 tok_s=60636.6 opt_steps=1240 +[epoch 46/50] step=1260 train_loss=0.0102 tok_s=60638.7 opt_steps=1260 +[epoch 46/50] step=1280 train_loss=0.0102 tok_s=60647.3 opt_steps=1280 +[epoch 46/50] step=1300 train_loss=0.0102 tok_s=60639.8 opt_steps=1300 +[epoch 46/50] step=1320 train_loss=0.0103 tok_s=60635.1 opt_steps=1320 +[epoch 46/50] step=1340 train_loss=0.0103 tok_s=60632.9 opt_steps=1340 +[epoch 46/50] step=1360 train_loss=0.0103 tok_s=60634.0 opt_steps=1360 +[epoch 46/50] step=1380 train_loss=0.0103 tok_s=60639.9 opt_steps=1380 +[epoch 46/50] step=1400 train_loss=0.0103 tok_s=60635.0 opt_steps=1400 +[epoch 46/50] step=1420 train_loss=0.0103 tok_s=60638.2 opt_steps=1420 +[epoch 46/50] step=1440 train_loss=0.0103 tok_s=60643.6 opt_steps=1440 +[epoch 46/50] step=1460 train_loss=0.0104 tok_s=60638.4 opt_steps=1460 +[epoch 46/50] step=1480 train_loss=0.0104 tok_s=60640.3 opt_steps=1480 +[epoch 46/50] step=1500 train_loss=0.0104 tok_s=60638.7 opt_steps=1500 +[epoch 46/50] step=1520 train_loss=0.0104 tok_s=60643.3 opt_steps=1520 +[epoch 46/50] step=1540 train_loss=0.0104 tok_s=60643.5 opt_steps=1540 +[epoch 46/50] step=1560 train_loss=0.0104 tok_s=60650.2 opt_steps=1560 +[epoch 46/50] step=1580 train_loss=0.0104 tok_s=60644.3 opt_steps=1580 +[epoch 46/50] step=1600 train_loss=0.0104 tok_s=60642.1 opt_steps=1600 +[epoch 46/50] step=1620 train_loss=0.0104 tok_s=60641.0 opt_steps=1620 +[epoch 46/50] step=1640 train_loss=0.0104 tok_s=60648.1 opt_steps=1640 +[epoch 46/50] step=1660 train_loss=0.0105 tok_s=60654.6 opt_steps=1660 +[epoch 46/50] step=1680 train_loss=0.0105 tok_s=60656.5 opt_steps=1680 +[epoch 46/50] step=1700 train_loss=0.0105 tok_s=60657.1 opt_steps=1700 +[epoch 46/50] step=1720 train_loss=0.0105 tok_s=60655.8 opt_steps=1720 +[epoch 46/50] step=1740 train_loss=0.0105 tok_s=60655.5 opt_steps=1740 +[epoch 46/50] step=1760 train_loss=0.0105 tok_s=60661.4 opt_steps=1760 +[epoch 46/50] step=1780 train_loss=0.0105 tok_s=60663.3 opt_steps=1780 +[epoch 46/50] step=1800 train_loss=0.0105 tok_s=60663.3 opt_steps=1800 +[epoch 46/50] step=1820 train_loss=0.0106 tok_s=60661.0 opt_steps=1820 +[epoch 46/50] step=1840 train_loss=0.0106 tok_s=60658.8 opt_steps=1840 +[epoch 46/50] step=1860 train_loss=0.0106 tok_s=60661.6 opt_steps=1860 +[epoch 46/50] step=1880 train_loss=0.0106 tok_s=60667.5 opt_steps=1880 +[epoch 46/50] step=1900 train_loss=0.0106 tok_s=60666.1 opt_steps=1900 +[epoch 46/50] step=1920 train_loss=0.0106 tok_s=60665.6 opt_steps=1920 +[epoch 46/50] step=1940 train_loss=0.0106 tok_s=60664.0 opt_steps=1940 +[epoch 46/50] step=1960 train_loss=0.0107 tok_s=60661.3 opt_steps=1960 +[epoch 46/50] step=1980 train_loss=0.0107 tok_s=60662.3 opt_steps=1980 +[epoch 46/50] step=2000 train_loss=0.0107 tok_s=60657.5 opt_steps=2000 +[epoch 46/50] step=2020 train_loss=0.0107 tok_s=60652.4 opt_steps=2020 +[epoch 46/50] step=2040 train_loss=0.0107 tok_s=60654.3 opt_steps=2040 +[epoch 46/50] step=2060 train_loss=0.0107 tok_s=60653.2 opt_steps=2060 +[epoch 46/50] step=2080 train_loss=0.0107 tok_s=60647.9 opt_steps=2080 +[epoch 46/50] step=2100 train_loss=0.0107 tok_s=60650.8 opt_steps=2100 +[epoch 46/50] step=2120 train_loss=0.0107 tok_s=60648.7 opt_steps=2120 +[epoch 46/50] step=2140 train_loss=0.0107 tok_s=60647.7 opt_steps=2140 +[epoch 46/50] step=2160 train_loss=0.0107 tok_s=60648.3 opt_steps=2160 +[epoch 46/50] step=2180 train_loss=0.0107 tok_s=60650.6 opt_steps=2180 +[epoch 46/50] step=2200 train_loss=0.0108 tok_s=60653.0 opt_steps=2200 +[epoch 46/50] step=2220 train_loss=0.0108 tok_s=60654.8 opt_steps=2220 +[epoch 46/50] step=2240 train_loss=0.0108 tok_s=60654.4 opt_steps=2240 +[epoch 46/50] step=2260 train_loss=0.0108 tok_s=60654.3 opt_steps=2260 +[epoch 46/50] step=2280 train_loss=0.0108 tok_s=60652.3 opt_steps=2280 +[epoch 46/50] step=2300 train_loss=0.0108 tok_s=60651.3 opt_steps=2300 +[epoch 46/50] step=2320 train_loss=0.0108 tok_s=60654.6 opt_steps=2320 +[epoch 46/50] step=2340 train_loss=0.0108 tok_s=60658.0 opt_steps=2340 +[epoch 46/50] step=2360 train_loss=0.0108 tok_s=60657.5 opt_steps=2360 +[epoch 46/50] step=2380 train_loss=0.0108 tok_s=60654.1 opt_steps=2380 +[epoch 46/50] step=2400 train_loss=0.0108 tok_s=60654.1 opt_steps=2400 +[epoch 46/50] step=2420 train_loss=0.0109 tok_s=60651.4 opt_steps=2420 +[epoch 46/50] step=2440 train_loss=0.0109 tok_s=60650.8 opt_steps=2440 +[epoch 46/50] step=2460 train_loss=0.0109 tok_s=60647.6 opt_steps=2460 +[epoch 46/50] step=2480 train_loss=0.0109 tok_s=60645.7 opt_steps=2480 +[epoch 46/50] step=2500 train_loss=0.0109 tok_s=60650.1 opt_steps=2500 +[epoch 46/50] step=2520 train_loss=0.0109 tok_s=60650.4 opt_steps=2520 +[epoch 46/50] step=2540 train_loss=0.0109 tok_s=60650.4 opt_steps=2540 +[epoch 46/50] step=2560 train_loss=0.0109 tok_s=60650.8 opt_steps=2560 +[epoch 46/50] step=2580 train_loss=0.0109 tok_s=60652.4 opt_steps=2580 +[epoch 46/50] step=2600 train_loss=0.0109 tok_s=60648.0 opt_steps=2600 +[epoch 46/50] step=2620 train_loss=0.0109 tok_s=60649.5 opt_steps=2620 +[epoch 46/50] step=2640 train_loss=0.0109 tok_s=60646.2 opt_steps=2640 +[epoch 46/50] step=2660 train_loss=0.0109 tok_s=60649.2 opt_steps=2660 +[epoch 46/50] step=2680 train_loss=0.0109 tok_s=60653.4 opt_steps=2680 +[epoch 46/50] step=2700 train_loss=0.0109 tok_s=60653.4 opt_steps=2700 +[epoch 46/50] step=2720 train_loss=0.0109 tok_s=60654.6 opt_steps=2720 +[epoch 46/50] step=2740 train_loss=0.0110 tok_s=60656.1 opt_steps=2740 +[epoch 46/50] step=2760 train_loss=0.0110 tok_s=60655.3 opt_steps=2760 +[epoch 46/50] step=2780 train_loss=0.0110 tok_s=60658.0 opt_steps=2780 +[epoch 46/50] step=2800 train_loss=0.0110 tok_s=60657.0 opt_steps=2800 +[epoch 46/50] step=2820 train_loss=0.0110 tok_s=60659.0 opt_steps=2820 +[epoch 46/50] step=2840 train_loss=0.0110 tok_s=60658.1 opt_steps=2840 +[epoch 46/50] step=2860 train_loss=0.0110 tok_s=60654.4 opt_steps=2860 +[epoch 46/50] step=2880 train_loss=0.0110 tok_s=60655.2 opt_steps=2880 +[epoch 46/50] step=2900 train_loss=0.0110 tok_s=60655.3 opt_steps=2900 +[epoch 46/50] step=2920 train_loss=0.0110 tok_s=60655.1 opt_steps=2920 +[epoch 46/50] step=2940 train_loss=0.0111 tok_s=60653.8 opt_steps=2940 +[epoch 46/50] step=2960 train_loss=0.0111 tok_s=60653.4 opt_steps=2960 +[epoch 46/50] step=2980 train_loss=0.0111 tok_s=60653.5 opt_steps=2980 +[epoch 46/50] step=3000 train_loss=0.0111 tok_s=60651.3 opt_steps=3000 +[epoch 46/50] step=3020 train_loss=0.0111 tok_s=60651.5 opt_steps=3020 +[epoch 46/50] step=3040 train_loss=0.0111 tok_s=60650.8 opt_steps=3040 +[epoch 46/50] step=3060 train_loss=0.0111 tok_s=60652.7 opt_steps=3060 +[epoch 46/50] step=3080 train_loss=0.0111 tok_s=60651.5 opt_steps=3080 +[epoch 46/50] step=3100 train_loss=0.0111 tok_s=60652.4 opt_steps=3100 +[epoch 46/50] step=3120 train_loss=0.0111 tok_s=60651.6 opt_steps=3120 +[epoch 46/50] step=3140 train_loss=0.0111 tok_s=60649.8 opt_steps=3140 +[epoch 46/50] step=3160 train_loss=0.0111 tok_s=60647.1 opt_steps=3160 +[epoch 46/50] step=3180 train_loss=0.0111 tok_s=60649.1 opt_steps=3180 +[epoch 46/50] step=3200 train_loss=0.0112 tok_s=60652.6 opt_steps=3200 +[epoch 46/50] step=3220 train_loss=0.0112 tok_s=60647.8 opt_steps=3220 +[epoch 46/50] step=3240 train_loss=0.0112 tok_s=60648.4 opt_steps=3240 +[epoch 46/50] step=3260 train_loss=0.0112 tok_s=60642.2 opt_steps=3260 +[epoch 46/50] train_loss=0.0112 val_skipped tok_s=60639.6 opt_steps=3273 +[epoch 47/50] step=20 train_loss=0.0095 tok_s=58684.6 opt_steps=20 +[epoch 47/50] step=40 train_loss=0.0096 tok_s=59484.0 opt_steps=40 +[epoch 47/50] step=60 train_loss=0.0096 tok_s=59956.1 opt_steps=60 +[epoch 47/50] step=80 train_loss=0.0095 tok_s=60175.0 opt_steps=80 +[epoch 47/50] step=100 train_loss=0.0095 tok_s=60252.9 opt_steps=100 +[epoch 47/50] step=120 train_loss=0.0094 tok_s=60339.7 opt_steps=120 +[epoch 47/50] step=140 train_loss=0.0094 tok_s=60421.8 opt_steps=140 +[epoch 47/50] step=160 train_loss=0.0093 tok_s=60366.2 opt_steps=160 +[epoch 47/50] step=180 train_loss=0.0092 tok_s=60362.3 opt_steps=180 +[epoch 47/50] step=200 train_loss=0.0092 tok_s=60354.2 opt_steps=200 +[epoch 47/50] step=220 train_loss=0.0092 tok_s=60448.7 opt_steps=220 +[epoch 47/50] step=240 train_loss=0.0093 tok_s=60477.6 opt_steps=240 +[epoch 47/50] step=260 train_loss=0.0093 tok_s=60509.9 opt_steps=260 +[epoch 47/50] step=280 train_loss=0.0093 tok_s=60513.0 opt_steps=280 +[epoch 47/50] step=300 train_loss=0.0092 tok_s=60509.8 opt_steps=300 +[epoch 47/50] step=320 train_loss=0.0092 tok_s=60499.4 opt_steps=320 +[epoch 47/50] step=340 train_loss=0.0092 tok_s=60545.1 opt_steps=340 +[epoch 47/50] step=360 train_loss=0.0092 tok_s=60580.5 opt_steps=360 +[epoch 47/50] step=380 train_loss=0.0092 tok_s=60577.3 opt_steps=380 +[epoch 47/50] step=400 train_loss=0.0092 tok_s=60579.2 opt_steps=400 +[epoch 47/50] step=420 train_loss=0.0093 tok_s=60579.4 opt_steps=420 +[epoch 47/50] step=440 train_loss=0.0093 tok_s=60572.9 opt_steps=440 +[epoch 47/50] step=460 train_loss=0.0093 tok_s=60581.9 opt_steps=460 +[epoch 47/50] step=480 train_loss=0.0094 tok_s=60611.1 opt_steps=480 +[epoch 47/50] step=500 train_loss=0.0094 tok_s=60603.8 opt_steps=500 +[epoch 47/50] step=520 train_loss=0.0094 tok_s=60582.3 opt_steps=520 +[epoch 47/50] step=540 train_loss=0.0094 tok_s=60580.2 opt_steps=540 +[epoch 47/50] step=560 train_loss=0.0094 tok_s=60594.6 opt_steps=560 +[epoch 47/50] step=580 train_loss=0.0094 tok_s=60597.9 opt_steps=580 +[epoch 47/50] step=600 train_loss=0.0095 tok_s=60605.5 opt_steps=600 +[epoch 47/50] step=620 train_loss=0.0095 tok_s=60613.8 opt_steps=620 +[epoch 47/50] step=640 train_loss=0.0095 tok_s=60611.4 opt_steps=640 +[epoch 47/50] step=660 train_loss=0.0095 tok_s=60615.2 opt_steps=660 +[epoch 47/50] step=680 train_loss=0.0095 tok_s=60604.6 opt_steps=680 +[epoch 47/50] step=700 train_loss=0.0096 tok_s=60590.0 opt_steps=700 +[epoch 47/50] step=720 train_loss=0.0096 tok_s=60598.5 opt_steps=720 +[epoch 47/50] step=740 train_loss=0.0096 tok_s=60602.8 opt_steps=740 +[epoch 47/50] step=760 train_loss=0.0096 tok_s=60600.9 opt_steps=760 +[epoch 47/50] step=780 train_loss=0.0096 tok_s=60603.4 opt_steps=780 +[epoch 47/50] step=800 train_loss=0.0096 tok_s=60603.5 opt_steps=800 +[epoch 47/50] step=820 train_loss=0.0096 tok_s=60603.3 opt_steps=820 +[epoch 47/50] step=840 train_loss=0.0096 tok_s=60610.8 opt_steps=840 +[epoch 47/50] step=860 train_loss=0.0097 tok_s=60609.3 opt_steps=860 +[epoch 47/50] step=880 train_loss=0.0097 tok_s=60609.2 opt_steps=880 +[epoch 47/50] step=900 train_loss=0.0097 tok_s=60599.3 opt_steps=900 +[epoch 47/50] step=920 train_loss=0.0097 tok_s=60596.2 opt_steps=920 +[epoch 47/50] step=940 train_loss=0.0098 tok_s=60594.7 opt_steps=940 +[epoch 47/50] step=960 train_loss=0.0098 tok_s=60596.7 opt_steps=960 +[epoch 47/50] step=980 train_loss=0.0098 tok_s=60596.3 opt_steps=980 +[epoch 47/50] step=1000 train_loss=0.0098 tok_s=60603.7 opt_steps=1000 +[epoch 47/50] step=1020 train_loss=0.0098 tok_s=60601.2 opt_steps=1020 +[epoch 47/50] step=1040 train_loss=0.0098 tok_s=60612.4 opt_steps=1040 +[epoch 47/50] step=1060 train_loss=0.0099 tok_s=60617.1 opt_steps=1060 +[epoch 47/50] step=1080 train_loss=0.0099 tok_s=60606.7 opt_steps=1080 +[epoch 47/50] step=1100 train_loss=0.0099 tok_s=60609.2 opt_steps=1100 +[epoch 47/50] step=1120 train_loss=0.0099 tok_s=60604.4 opt_steps=1120 +[epoch 47/50] step=1140 train_loss=0.0099 tok_s=60605.8 opt_steps=1140 +[epoch 47/50] step=1160 train_loss=0.0100 tok_s=60607.4 opt_steps=1160 +[epoch 47/50] step=1180 train_loss=0.0100 tok_s=60612.7 opt_steps=1180 +[epoch 47/50] step=1200 train_loss=0.0100 tok_s=60608.3 opt_steps=1200 +[epoch 47/50] step=1220 train_loss=0.0100 tok_s=60603.8 opt_steps=1220 +[epoch 47/50] step=1240 train_loss=0.0100 tok_s=60604.6 opt_steps=1240 +[epoch 47/50] step=1260 train_loss=0.0100 tok_s=60606.8 opt_steps=1260 +[epoch 47/50] step=1280 train_loss=0.0101 tok_s=60611.5 opt_steps=1280 +[epoch 47/50] step=1300 train_loss=0.0101 tok_s=60614.5 opt_steps=1300 +[epoch 47/50] step=1320 train_loss=0.0101 tok_s=60618.4 opt_steps=1320 +[epoch 47/50] step=1340 train_loss=0.0101 tok_s=60621.2 opt_steps=1340 +[epoch 47/50] step=1360 train_loss=0.0101 tok_s=60615.7 opt_steps=1360 +[epoch 47/50] step=1380 train_loss=0.0101 tok_s=60610.7 opt_steps=1380 +[epoch 47/50] step=1400 train_loss=0.0101 tok_s=60614.8 opt_steps=1400 +[epoch 47/50] step=1420 train_loss=0.0101 tok_s=60607.9 opt_steps=1420 +[epoch 47/50] step=1440 train_loss=0.0101 tok_s=60605.7 opt_steps=1440 +[epoch 47/50] step=1460 train_loss=0.0101 tok_s=60595.6 opt_steps=1460 +[epoch 47/50] step=1480 train_loss=0.0101 tok_s=60595.2 opt_steps=1480 +[epoch 47/50] step=1500 train_loss=0.0101 tok_s=60588.7 opt_steps=1500 +[epoch 47/50] step=1520 train_loss=0.0102 tok_s=60577.7 opt_steps=1520 +[epoch 47/50] step=1540 train_loss=0.0102 tok_s=60576.3 opt_steps=1540 +[epoch 47/50] step=1560 train_loss=0.0102 tok_s=60581.2 opt_steps=1560 +[epoch 47/50] step=1580 train_loss=0.0102 tok_s=60583.1 opt_steps=1580 +[epoch 47/50] step=1600 train_loss=0.0102 tok_s=60571.0 opt_steps=1600 +[epoch 47/50] step=1620 train_loss=0.0102 tok_s=60554.9 opt_steps=1620 +[epoch 47/50] step=1640 train_loss=0.0102 tok_s=60533.0 opt_steps=1640 +[epoch 47/50] step=1660 train_loss=0.0102 tok_s=60523.6 opt_steps=1660 +[epoch 47/50] step=1680 train_loss=0.0102 tok_s=60509.4 opt_steps=1680 +[epoch 47/50] step=1700 train_loss=0.0102 tok_s=60498.1 opt_steps=1700 +[epoch 47/50] step=1720 train_loss=0.0103 tok_s=60483.7 opt_steps=1720 +[epoch 47/50] step=1740 train_loss=0.0103 tok_s=60476.3 opt_steps=1740 +[epoch 47/50] step=1760 train_loss=0.0103 tok_s=60462.3 opt_steps=1760 +[epoch 47/50] step=1780 train_loss=0.0103 tok_s=60460.2 opt_steps=1780 +[epoch 47/50] step=1800 train_loss=0.0103 tok_s=60454.0 opt_steps=1800 +[epoch 47/50] step=1820 train_loss=0.0103 tok_s=60451.4 opt_steps=1820 +[epoch 47/50] step=1840 train_loss=0.0103 tok_s=60446.4 opt_steps=1840 +[epoch 47/50] step=1860 train_loss=0.0103 tok_s=60445.0 opt_steps=1860 +[epoch 47/50] step=1880 train_loss=0.0104 tok_s=60436.6 opt_steps=1880 +[epoch 47/50] step=1900 train_loss=0.0104 tok_s=60428.4 opt_steps=1900 +[epoch 47/50] step=1920 train_loss=0.0104 tok_s=60423.7 opt_steps=1920 +[epoch 47/50] step=1940 train_loss=0.0104 tok_s=60418.3 opt_steps=1940 +[epoch 47/50] step=1960 train_loss=0.0104 tok_s=60410.5 opt_steps=1960 +[epoch 47/50] step=1980 train_loss=0.0104 tok_s=60405.9 opt_steps=1980 +[epoch 47/50] step=2000 train_loss=0.0104 tok_s=60398.7 opt_steps=2000 +[epoch 47/50] step=2020 train_loss=0.0105 tok_s=60384.6 opt_steps=2020 +[epoch 47/50] step=2040 train_loss=0.0105 tok_s=60377.5 opt_steps=2040 +[epoch 47/50] step=2060 train_loss=0.0105 tok_s=60379.0 opt_steps=2060 +[epoch 47/50] step=2080 train_loss=0.0105 tok_s=60376.0 opt_steps=2080 +[epoch 47/50] step=2100 train_loss=0.0105 tok_s=60372.3 opt_steps=2100 +[epoch 47/50] step=2120 train_loss=0.0105 tok_s=60366.4 opt_steps=2120 +[epoch 47/50] step=2140 train_loss=0.0105 tok_s=60365.3 opt_steps=2140 +[epoch 47/50] step=2160 train_loss=0.0105 tok_s=60355.3 opt_steps=2160 +[epoch 47/50] step=2180 train_loss=0.0105 tok_s=60352.6 opt_steps=2180 +[epoch 47/50] step=2200 train_loss=0.0106 tok_s=60346.9 opt_steps=2200 +[epoch 47/50] step=2220 train_loss=0.0106 tok_s=60344.0 opt_steps=2220 +[epoch 47/50] step=2240 train_loss=0.0106 tok_s=60334.1 opt_steps=2240 +[epoch 47/50] step=2260 train_loss=0.0106 tok_s=60322.1 opt_steps=2260 +[epoch 47/50] step=2280 train_loss=0.0106 tok_s=60307.9 opt_steps=2280 +[epoch 47/50] step=2300 train_loss=0.0106 tok_s=60295.5 opt_steps=2300 +[epoch 47/50] step=2320 train_loss=0.0106 tok_s=60281.5 opt_steps=2320 +[epoch 47/50] step=2340 train_loss=0.0106 tok_s=60276.0 opt_steps=2340 +[epoch 47/50] step=2360 train_loss=0.0107 tok_s=60272.8 opt_steps=2360 +[epoch 47/50] step=2380 train_loss=0.0107 tok_s=60265.1 opt_steps=2380 +[epoch 47/50] step=2400 train_loss=0.0107 tok_s=60259.6 opt_steps=2400 +[epoch 47/50] step=2420 train_loss=0.0107 tok_s=60250.6 opt_steps=2420 +[epoch 47/50] step=2440 train_loss=0.0107 tok_s=60245.1 opt_steps=2440 +[epoch 47/50] step=2460 train_loss=0.0107 tok_s=60241.2 opt_steps=2460 +[epoch 47/50] step=2480 train_loss=0.0107 tok_s=60236.1 opt_steps=2480 +[epoch 47/50] step=2500 train_loss=0.0107 tok_s=60228.5 opt_steps=2500 +[epoch 47/50] step=2520 train_loss=0.0107 tok_s=60223.9 opt_steps=2520 +[epoch 47/50] step=2540 train_loss=0.0107 tok_s=60210.6 opt_steps=2540 +[epoch 47/50] step=2560 train_loss=0.0107 tok_s=60209.0 opt_steps=2560 +[epoch 47/50] step=2580 train_loss=0.0108 tok_s=60198.4 opt_steps=2580 +[epoch 47/50] step=2600 train_loss=0.0108 tok_s=60193.2 opt_steps=2600 +[epoch 47/50] step=2620 train_loss=0.0108 tok_s=60180.6 opt_steps=2620 +[epoch 47/50] step=2640 train_loss=0.0108 tok_s=60170.8 opt_steps=2640 +[epoch 47/50] step=2660 train_loss=0.0108 tok_s=60163.5 opt_steps=2660 +[epoch 47/50] step=2680 train_loss=0.0108 tok_s=60160.7 opt_steps=2680 +[epoch 47/50] step=2700 train_loss=0.0108 tok_s=60150.0 opt_steps=2700 +[epoch 47/50] step=2720 train_loss=0.0108 tok_s=60144.2 opt_steps=2720 +[epoch 47/50] step=2740 train_loss=0.0108 tok_s=60139.5 opt_steps=2740 +[epoch 47/50] step=2760 train_loss=0.0108 tok_s=60135.0 opt_steps=2760 +[epoch 47/50] step=2780 train_loss=0.0108 tok_s=60127.1 opt_steps=2780 +[epoch 47/50] step=2800 train_loss=0.0108 tok_s=60122.2 opt_steps=2800 +[epoch 47/50] step=2820 train_loss=0.0108 tok_s=60116.7 opt_steps=2820 +[epoch 47/50] step=2840 train_loss=0.0108 tok_s=60116.2 opt_steps=2840 +[epoch 47/50] step=2860 train_loss=0.0108 tok_s=60109.0 opt_steps=2860 +[epoch 47/50] step=2880 train_loss=0.0109 tok_s=60105.3 opt_steps=2880 +[epoch 47/50] step=2900 train_loss=0.0109 tok_s=60105.2 opt_steps=2900 +[epoch 47/50] step=2920 train_loss=0.0109 tok_s=60100.4 opt_steps=2920 +[epoch 47/50] step=2940 train_loss=0.0109 tok_s=60093.4 opt_steps=2940 +[epoch 47/50] step=2960 train_loss=0.0109 tok_s=60088.7 opt_steps=2960 +[epoch 47/50] step=2980 train_loss=0.0109 tok_s=60085.5 opt_steps=2980 +[epoch 47/50] step=3000 train_loss=0.0109 tok_s=60085.4 opt_steps=3000 +[epoch 47/50] step=3020 train_loss=0.0109 tok_s=60082.9 opt_steps=3020 +[epoch 47/50] step=3040 train_loss=0.0109 tok_s=60076.9 opt_steps=3040 +[epoch 47/50] step=3060 train_loss=0.0109 tok_s=60073.8 opt_steps=3060 +[epoch 47/50] step=3080 train_loss=0.0109 tok_s=60068.3 opt_steps=3080 +[epoch 47/50] step=3100 train_loss=0.0109 tok_s=60067.2 opt_steps=3100 +[epoch 47/50] step=3120 train_loss=0.0110 tok_s=60055.6 opt_steps=3120 +[epoch 47/50] step=3140 train_loss=0.0110 tok_s=60053.7 opt_steps=3140 +[epoch 47/50] step=3160 train_loss=0.0110 tok_s=60051.9 opt_steps=3160 +[epoch 47/50] step=3180 train_loss=0.0110 tok_s=60049.7 opt_steps=3180 +[epoch 47/50] step=3200 train_loss=0.0110 tok_s=60050.1 opt_steps=3200 +[epoch 47/50] step=3220 train_loss=0.0110 tok_s=60049.2 opt_steps=3220 +[epoch 47/50] step=3240 train_loss=0.0110 tok_s=60050.4 opt_steps=3240 +[epoch 47/50] step=3260 train_loss=0.0110 tok_s=60044.8 opt_steps=3260 +[epoch 47/50] train_loss=0.0110 val_skipped tok_s=60046.9 opt_steps=3273 +[epoch 48/50] step=20 train_loss=0.0088 tok_s=57713.0 opt_steps=20 +[epoch 48/50] step=40 train_loss=0.0088 tok_s=58772.7 opt_steps=40 +[epoch 48/50] step=60 train_loss=0.0089 tok_s=59274.7 opt_steps=60 +[epoch 48/50] step=80 train_loss=0.0088 tok_s=59365.1 opt_steps=80 +[epoch 48/50] step=100 train_loss=0.0088 tok_s=59387.5 opt_steps=100 +[epoch 48/50] step=120 train_loss=0.0088 tok_s=59504.2 opt_steps=120 +[epoch 48/50] step=140 train_loss=0.0088 tok_s=59563.8 opt_steps=140 +[epoch 48/50] step=160 train_loss=0.0087 tok_s=59606.9 opt_steps=160 +[epoch 48/50] step=180 train_loss=0.0087 tok_s=59651.2 opt_steps=180 +[epoch 48/50] step=200 train_loss=0.0087 tok_s=59696.4 opt_steps=200 +[epoch 48/50] step=220 train_loss=0.0088 tok_s=59716.4 opt_steps=220 +[epoch 48/50] step=240 train_loss=0.0088 tok_s=59737.4 opt_steps=240 +[epoch 48/50] step=260 train_loss=0.0088 tok_s=59789.0 opt_steps=260 +[epoch 48/50] step=280 train_loss=0.0089 tok_s=59776.0 opt_steps=280 +[epoch 48/50] step=300 train_loss=0.0089 tok_s=59762.6 opt_steps=300 +[epoch 48/50] step=320 train_loss=0.0090 tok_s=59753.6 opt_steps=320 +[epoch 48/50] step=340 train_loss=0.0090 tok_s=59758.2 opt_steps=340 +[epoch 48/50] step=360 train_loss=0.0090 tok_s=59752.4 opt_steps=360 +[epoch 48/50] step=380 train_loss=0.0091 tok_s=59753.8 opt_steps=380 +[epoch 48/50] step=400 train_loss=0.0091 tok_s=59761.0 opt_steps=400 +[epoch 48/50] step=420 train_loss=0.0091 tok_s=59754.3 opt_steps=420 +[epoch 48/50] step=440 train_loss=0.0091 tok_s=59753.3 opt_steps=440 +[epoch 48/50] step=460 train_loss=0.0091 tok_s=59747.3 opt_steps=460 +[epoch 48/50] step=480 train_loss=0.0092 tok_s=59761.3 opt_steps=480 +[epoch 48/50] step=500 train_loss=0.0092 tok_s=59768.8 opt_steps=500 +[epoch 48/50] step=520 train_loss=0.0092 tok_s=59770.8 opt_steps=520 +[epoch 48/50] step=540 train_loss=0.0092 tok_s=59803.1 opt_steps=540 +[epoch 48/50] step=560 train_loss=0.0093 tok_s=59820.3 opt_steps=560 +[epoch 48/50] step=580 train_loss=0.0093 tok_s=59834.0 opt_steps=580 +[epoch 48/50] step=600 train_loss=0.0093 tok_s=59862.1 opt_steps=600 +[epoch 48/50] step=620 train_loss=0.0093 tok_s=59874.5 opt_steps=620 +[epoch 48/50] step=640 train_loss=0.0093 tok_s=59888.4 opt_steps=640 +[epoch 48/50] step=660 train_loss=0.0093 tok_s=59885.4 opt_steps=660 +[epoch 48/50] step=680 train_loss=0.0093 tok_s=59880.4 opt_steps=680 +[epoch 48/50] step=700 train_loss=0.0094 tok_s=59894.6 opt_steps=700 +[epoch 48/50] step=720 train_loss=0.0094 tok_s=59882.8 opt_steps=720 +[epoch 48/50] step=740 train_loss=0.0094 tok_s=59887.5 opt_steps=740 +[epoch 48/50] step=760 train_loss=0.0094 tok_s=59897.1 opt_steps=760 +[epoch 48/50] step=780 train_loss=0.0094 tok_s=59889.0 opt_steps=780 +[epoch 48/50] step=800 train_loss=0.0095 tok_s=59886.8 opt_steps=800 +[epoch 48/50] step=820 train_loss=0.0095 tok_s=59885.9 opt_steps=820 +[epoch 48/50] step=840 train_loss=0.0095 tok_s=59895.8 opt_steps=840 +[epoch 48/50] step=860 train_loss=0.0095 tok_s=59903.6 opt_steps=860 +[epoch 48/50] step=880 train_loss=0.0095 tok_s=59907.0 opt_steps=880 +[epoch 48/50] step=900 train_loss=0.0095 tok_s=59911.8 opt_steps=900 +[epoch 48/50] step=920 train_loss=0.0095 tok_s=59915.2 opt_steps=920 +[epoch 48/50] step=940 train_loss=0.0096 tok_s=59918.1 opt_steps=940 +[epoch 48/50] step=960 train_loss=0.0096 tok_s=59914.2 opt_steps=960 +[epoch 48/50] step=980 train_loss=0.0096 tok_s=59909.8 opt_steps=980 +[epoch 48/50] step=1000 train_loss=0.0096 tok_s=59918.1 opt_steps=1000 +[epoch 48/50] step=1020 train_loss=0.0096 tok_s=59924.3 opt_steps=1020 +[epoch 48/50] step=1040 train_loss=0.0096 tok_s=59923.4 opt_steps=1040 +[epoch 48/50] step=1060 train_loss=0.0096 tok_s=59938.0 opt_steps=1060 +[epoch 48/50] step=1080 train_loss=0.0096 tok_s=59943.0 opt_steps=1080 +[epoch 48/50] step=1100 train_loss=0.0097 tok_s=59945.9 opt_steps=1100 +[epoch 48/50] step=1120 train_loss=0.0097 tok_s=59952.5 opt_steps=1120 +[epoch 48/50] step=1140 train_loss=0.0097 tok_s=59952.8 opt_steps=1140 +[epoch 48/50] step=1160 train_loss=0.0097 tok_s=59957.7 opt_steps=1160 +[epoch 48/50] step=1180 train_loss=0.0097 tok_s=59958.7 opt_steps=1180 +[epoch 48/50] step=1200 train_loss=0.0097 tok_s=59958.6 opt_steps=1200 +[epoch 48/50] step=1220 train_loss=0.0097 tok_s=59955.7 opt_steps=1220 +[epoch 48/50] step=1240 train_loss=0.0097 tok_s=59951.2 opt_steps=1240 +[epoch 48/50] step=1260 train_loss=0.0097 tok_s=59944.1 opt_steps=1260 +[epoch 48/50] step=1280 train_loss=0.0098 tok_s=59940.8 opt_steps=1280 +[epoch 48/50] step=1300 train_loss=0.0098 tok_s=59938.7 opt_steps=1300 +[epoch 48/50] step=1320 train_loss=0.0098 tok_s=59944.2 opt_steps=1320 +[epoch 48/50] step=1340 train_loss=0.0098 tok_s=59956.1 opt_steps=1340 +[epoch 48/50] step=1360 train_loss=0.0098 tok_s=59959.9 opt_steps=1360 +[epoch 48/50] step=1380 train_loss=0.0098 tok_s=59964.4 opt_steps=1380 +[epoch 48/50] step=1400 train_loss=0.0099 tok_s=59961.9 opt_steps=1400 +[epoch 48/50] step=1420 train_loss=0.0099 tok_s=59968.2 opt_steps=1420 +[epoch 48/50] step=1440 train_loss=0.0099 tok_s=59966.8 opt_steps=1440 +[epoch 48/50] step=1460 train_loss=0.0099 tok_s=59973.0 opt_steps=1460 +[epoch 48/50] step=1480 train_loss=0.0099 tok_s=59974.6 opt_steps=1480 +[epoch 48/50] step=1500 train_loss=0.0099 tok_s=59976.7 opt_steps=1500 +[epoch 48/50] step=1520 train_loss=0.0099 tok_s=59980.0 opt_steps=1520 +[epoch 48/50] step=1540 train_loss=0.0099 tok_s=59984.5 opt_steps=1540 +[epoch 48/50] step=1560 train_loss=0.0099 tok_s=59981.7 opt_steps=1560 +[epoch 48/50] step=1580 train_loss=0.0099 tok_s=59976.8 opt_steps=1580 +[epoch 48/50] step=1600 train_loss=0.0100 tok_s=59979.2 opt_steps=1600 +[epoch 48/50] step=1620 train_loss=0.0100 tok_s=59980.0 opt_steps=1620 +[epoch 48/50] step=1640 train_loss=0.0100 tok_s=59980.5 opt_steps=1640 +[epoch 48/50] step=1660 train_loss=0.0100 tok_s=59982.2 opt_steps=1660 +[epoch 48/50] step=1680 train_loss=0.0100 tok_s=59981.6 opt_steps=1680 +[epoch 48/50] step=1700 train_loss=0.0100 tok_s=59987.7 opt_steps=1700 +[epoch 48/50] step=1720 train_loss=0.0101 tok_s=59988.6 opt_steps=1720 +[epoch 48/50] step=1740 train_loss=0.0101 tok_s=59986.9 opt_steps=1740 +[epoch 48/50] step=1760 train_loss=0.0101 tok_s=59992.0 opt_steps=1760 +[epoch 48/50] step=1780 train_loss=0.0101 tok_s=59992.4 opt_steps=1780 +[epoch 48/50] step=1800 train_loss=0.0101 tok_s=59995.1 opt_steps=1800 +[epoch 48/50] step=1820 train_loss=0.0101 tok_s=59996.8 opt_steps=1820 +[epoch 48/50] step=1840 train_loss=0.0101 tok_s=59996.7 opt_steps=1840 +[epoch 48/50] step=1860 train_loss=0.0101 tok_s=59997.6 opt_steps=1860 +[epoch 48/50] step=1880 train_loss=0.0101 tok_s=60000.4 opt_steps=1880 +[epoch 48/50] step=1900 train_loss=0.0102 tok_s=60000.8 opt_steps=1900 +[epoch 48/50] step=1920 train_loss=0.0102 tok_s=60005.6 opt_steps=1920 +[epoch 48/50] step=1940 train_loss=0.0102 tok_s=60008.9 opt_steps=1940 +[epoch 48/50] step=1960 train_loss=0.0102 tok_s=60015.2 opt_steps=1960 +[epoch 48/50] step=1980 train_loss=0.0102 tok_s=60013.2 opt_steps=1980 +[epoch 48/50] step=2000 train_loss=0.0102 tok_s=60021.4 opt_steps=2000 +[epoch 48/50] step=2020 train_loss=0.0102 tok_s=60025.8 opt_steps=2020 +[epoch 48/50] step=2040 train_loss=0.0102 tok_s=60031.9 opt_steps=2040 +[epoch 48/50] step=2060 train_loss=0.0102 tok_s=60032.2 opt_steps=2060 +[epoch 48/50] step=2080 train_loss=0.0102 tok_s=60031.7 opt_steps=2080 +[epoch 48/50] step=2100 train_loss=0.0102 tok_s=60035.5 opt_steps=2100 +[epoch 48/50] step=2120 train_loss=0.0103 tok_s=60041.9 opt_steps=2120 +[epoch 48/50] step=2140 train_loss=0.0103 tok_s=60043.0 opt_steps=2140 +[epoch 48/50] step=2160 train_loss=0.0103 tok_s=60041.9 opt_steps=2160 +[epoch 48/50] step=2180 train_loss=0.0103 tok_s=60041.9 opt_steps=2180 +[epoch 48/50] step=2200 train_loss=0.0103 tok_s=60041.7 opt_steps=2200 +[epoch 48/50] step=2220 train_loss=0.0103 tok_s=60040.8 opt_steps=2220 +[epoch 48/50] step=2240 train_loss=0.0103 tok_s=60040.7 opt_steps=2240 +[epoch 48/50] step=2260 train_loss=0.0103 tok_s=60039.3 opt_steps=2260 +[epoch 48/50] step=2280 train_loss=0.0103 tok_s=60037.2 opt_steps=2280 +[epoch 48/50] step=2300 train_loss=0.0103 tok_s=60037.5 opt_steps=2300 +[epoch 48/50] step=2320 train_loss=0.0104 tok_s=60040.5 opt_steps=2320 +[epoch 48/50] step=2340 train_loss=0.0104 tok_s=60040.1 opt_steps=2340 +[epoch 48/50] step=2360 train_loss=0.0104 tok_s=60041.0 opt_steps=2360 +[epoch 48/50] step=2380 train_loss=0.0104 tok_s=60044.9 opt_steps=2380 +[epoch 48/50] step=2400 train_loss=0.0104 tok_s=60044.5 opt_steps=2400 +[epoch 48/50] step=2420 train_loss=0.0104 tok_s=60043.0 opt_steps=2420 +[epoch 48/50] step=2440 train_loss=0.0104 tok_s=60042.9 opt_steps=2440 +[epoch 48/50] step=2460 train_loss=0.0104 tok_s=60046.3 opt_steps=2460 +[epoch 48/50] step=2480 train_loss=0.0104 tok_s=60046.0 opt_steps=2480 +[epoch 48/50] step=2500 train_loss=0.0104 tok_s=60047.6 opt_steps=2500 +[epoch 48/50] step=2520 train_loss=0.0104 tok_s=60047.3 opt_steps=2520 +[epoch 48/50] step=2540 train_loss=0.0104 tok_s=60047.0 opt_steps=2540 +[epoch 48/50] step=2560 train_loss=0.0104 tok_s=60046.8 opt_steps=2560 +[epoch 48/50] step=2580 train_loss=0.0105 tok_s=60050.3 opt_steps=2580 +[epoch 48/50] step=2600 train_loss=0.0105 tok_s=60049.1 opt_steps=2600 +[epoch 48/50] step=2620 train_loss=0.0105 tok_s=60047.8 opt_steps=2620 +[epoch 48/50] step=2640 train_loss=0.0105 tok_s=60049.7 opt_steps=2640 +[epoch 48/50] step=2660 train_loss=0.0105 tok_s=60051.1 opt_steps=2660 +[epoch 48/50] step=2680 train_loss=0.0105 tok_s=60053.0 opt_steps=2680 +[epoch 48/50] step=2700 train_loss=0.0105 tok_s=60053.4 opt_steps=2700 +[epoch 48/50] step=2720 train_loss=0.0105 tok_s=60057.7 opt_steps=2720 +[epoch 48/50] step=2740 train_loss=0.0105 tok_s=60056.2 opt_steps=2740 +[epoch 48/50] step=2760 train_loss=0.0105 tok_s=60056.2 opt_steps=2760 +[epoch 48/50] step=2780 train_loss=0.0105 tok_s=60056.5 opt_steps=2780 +[epoch 48/50] step=2800 train_loss=0.0105 tok_s=60059.3 opt_steps=2800 +[epoch 48/50] step=2820 train_loss=0.0105 tok_s=60061.9 opt_steps=2820 +[epoch 48/50] step=2840 train_loss=0.0106 tok_s=60061.3 opt_steps=2840 +[epoch 48/50] step=2860 train_loss=0.0106 tok_s=60065.4 opt_steps=2860 +[epoch 48/50] step=2880 train_loss=0.0106 tok_s=60069.1 opt_steps=2880 +[epoch 48/50] step=2900 train_loss=0.0106 tok_s=60070.8 opt_steps=2900 +[epoch 48/50] step=2920 train_loss=0.0106 tok_s=60069.9 opt_steps=2920 +[epoch 48/50] step=2940 train_loss=0.0106 tok_s=60071.8 opt_steps=2940 +[epoch 48/50] step=2960 train_loss=0.0106 tok_s=60070.1 opt_steps=2960 +[epoch 48/50] step=2980 train_loss=0.0106 tok_s=60068.3 opt_steps=2980 +[epoch 48/50] step=3000 train_loss=0.0106 tok_s=60067.9 opt_steps=3000 +[epoch 48/50] step=3020 train_loss=0.0106 tok_s=60071.1 opt_steps=3020 +[epoch 48/50] step=3040 train_loss=0.0106 tok_s=60073.5 opt_steps=3040 +[epoch 48/50] step=3060 train_loss=0.0106 tok_s=60075.2 opt_steps=3060 +[epoch 48/50] step=3080 train_loss=0.0106 tok_s=60074.2 opt_steps=3080 +[epoch 48/50] step=3100 train_loss=0.0107 tok_s=60072.8 opt_steps=3100 +[epoch 48/50] step=3120 train_loss=0.0107 tok_s=60074.5 opt_steps=3120 +[epoch 48/50] step=3140 train_loss=0.0107 tok_s=60074.2 opt_steps=3140 +[epoch 48/50] step=3160 train_loss=0.0107 tok_s=60078.0 opt_steps=3160 +[epoch 48/50] step=3180 train_loss=0.0107 tok_s=60077.7 opt_steps=3180 +[epoch 48/50] step=3200 train_loss=0.0107 tok_s=60075.4 opt_steps=3200 +[epoch 48/50] step=3220 train_loss=0.0107 tok_s=60078.2 opt_steps=3220 +[epoch 48/50] step=3240 train_loss=0.0107 tok_s=60080.1 opt_steps=3240 +[epoch 48/50] step=3260 train_loss=0.0107 tok_s=60076.2 opt_steps=3260 +[epoch 48/50] train_loss=0.0107 val_skipped tok_s=60077.2 opt_steps=3273 +[epoch 49/50] step=20 train_loss=0.0091 tok_s=57506.8 opt_steps=20 +[epoch 49/50] step=40 train_loss=0.0091 tok_s=58642.4 opt_steps=40 +[epoch 49/50] step=60 train_loss=0.0090 tok_s=59015.5 opt_steps=60 +[epoch 49/50] step=80 train_loss=0.0091 tok_s=59259.7 opt_steps=80 +[epoch 49/50] step=100 train_loss=0.0092 tok_s=59534.6 opt_steps=100 +[epoch 49/50] step=120 train_loss=0.0092 tok_s=59700.5 opt_steps=120 +[epoch 49/50] step=140 train_loss=0.0092 tok_s=59763.8 opt_steps=140 +[epoch 49/50] step=160 train_loss=0.0092 tok_s=59849.9 opt_steps=160 +[epoch 49/50] step=180 train_loss=0.0091 tok_s=59933.0 opt_steps=180 +[epoch 49/50] step=200 train_loss=0.0091 tok_s=59975.6 opt_steps=200 +[epoch 49/50] step=220 train_loss=0.0091 tok_s=59993.0 opt_steps=220 +[epoch 49/50] step=240 train_loss=0.0091 tok_s=59989.9 opt_steps=240 +[epoch 49/50] step=260 train_loss=0.0091 tok_s=60022.4 opt_steps=260 +[epoch 49/50] step=280 train_loss=0.0091 tok_s=59989.3 opt_steps=280 +[epoch 49/50] step=300 train_loss=0.0091 tok_s=60005.9 opt_steps=300 +[epoch 49/50] step=320 train_loss=0.0091 tok_s=60004.6 opt_steps=320 +[epoch 49/50] step=340 train_loss=0.0091 tok_s=60009.3 opt_steps=340 +[epoch 49/50] step=360 train_loss=0.0092 tok_s=60007.1 opt_steps=360 +[epoch 49/50] step=380 train_loss=0.0092 tok_s=60016.9 opt_steps=380 +[epoch 49/50] step=400 train_loss=0.0092 tok_s=59898.7 opt_steps=400 +[epoch 49/50] step=420 train_loss=0.0092 tok_s=59942.2 opt_steps=420 +[epoch 49/50] step=440 train_loss=0.0092 tok_s=59947.8 opt_steps=440 +[epoch 49/50] step=460 train_loss=0.0092 tok_s=59936.5 opt_steps=460 +[epoch 49/50] step=480 train_loss=0.0092 tok_s=59936.5 opt_steps=480 +[epoch 49/50] step=500 train_loss=0.0092 tok_s=59966.7 opt_steps=500 +[epoch 49/50] step=520 train_loss=0.0092 tok_s=59980.3 opt_steps=520 +[epoch 49/50] step=540 train_loss=0.0092 tok_s=59994.4 opt_steps=540 +[epoch 49/50] step=560 train_loss=0.0092 tok_s=60019.6 opt_steps=560 +[epoch 49/50] step=580 train_loss=0.0092 tok_s=60037.3 opt_steps=580 +[epoch 49/50] step=600 train_loss=0.0092 tok_s=60038.4 opt_steps=600 +[epoch 49/50] step=620 train_loss=0.0092 tok_s=60039.0 opt_steps=620 +[epoch 49/50] step=640 train_loss=0.0092 tok_s=60048.4 opt_steps=640 +[epoch 49/50] step=660 train_loss=0.0092 tok_s=60056.8 opt_steps=660 +[epoch 49/50] step=680 train_loss=0.0092 tok_s=60061.4 opt_steps=680 +[epoch 49/50] step=700 train_loss=0.0092 tok_s=60069.0 opt_steps=700 +[epoch 49/50] step=720 train_loss=0.0092 tok_s=60084.3 opt_steps=720 +[epoch 49/50] step=740 train_loss=0.0092 tok_s=60076.8 opt_steps=740 +[epoch 49/50] step=760 train_loss=0.0092 tok_s=60068.6 opt_steps=760 +[epoch 49/50] step=780 train_loss=0.0092 tok_s=60065.9 opt_steps=780 +[epoch 49/50] step=800 train_loss=0.0093 tok_s=60069.4 opt_steps=800 +[epoch 49/50] step=820 train_loss=0.0093 tok_s=60062.6 opt_steps=820 +[epoch 49/50] step=840 train_loss=0.0093 tok_s=60075.6 opt_steps=840 +[epoch 49/50] step=860 train_loss=0.0093 tok_s=60078.2 opt_steps=860 +[epoch 49/50] step=880 train_loss=0.0093 tok_s=60073.9 opt_steps=880 +[epoch 49/50] step=900 train_loss=0.0094 tok_s=60075.3 opt_steps=900 +[epoch 49/50] step=920 train_loss=0.0094 tok_s=60080.6 opt_steps=920 +[epoch 49/50] step=940 train_loss=0.0094 tok_s=60086.7 opt_steps=940 +[epoch 49/50] step=960 train_loss=0.0094 tok_s=60086.0 opt_steps=960 +[epoch 49/50] step=980 train_loss=0.0094 tok_s=60090.0 opt_steps=980 +[epoch 49/50] step=1000 train_loss=0.0094 tok_s=60099.0 opt_steps=1000 +[epoch 49/50] step=1020 train_loss=0.0095 tok_s=60100.2 opt_steps=1020 +[epoch 49/50] step=1040 train_loss=0.0095 tok_s=60096.8 opt_steps=1040 +[epoch 49/50] step=1060 train_loss=0.0095 tok_s=60111.1 opt_steps=1060 +[epoch 49/50] step=1080 train_loss=0.0095 tok_s=60108.5 opt_steps=1080 +[epoch 49/50] step=1100 train_loss=0.0096 tok_s=60119.1 opt_steps=1100 +[epoch 49/50] step=1120 train_loss=0.0096 tok_s=60123.1 opt_steps=1120 +[epoch 49/50] step=1140 train_loss=0.0096 tok_s=60127.9 opt_steps=1140 +[epoch 49/50] step=1160 train_loss=0.0096 tok_s=60124.3 opt_steps=1160 +[epoch 49/50] step=1180 train_loss=0.0096 tok_s=60133.3 opt_steps=1180 +[epoch 49/50] step=1200 train_loss=0.0096 tok_s=60133.0 opt_steps=1200 +[epoch 49/50] step=1220 train_loss=0.0096 tok_s=60126.9 opt_steps=1220 +[epoch 49/50] step=1240 train_loss=0.0096 tok_s=60131.5 opt_steps=1240 +[epoch 49/50] step=1260 train_loss=0.0097 tok_s=60121.7 opt_steps=1260 +[epoch 49/50] step=1280 train_loss=0.0097 tok_s=60109.6 opt_steps=1280 +[epoch 49/50] step=1300 train_loss=0.0097 tok_s=60107.8 opt_steps=1300 +[epoch 49/50] step=1320 train_loss=0.0097 tok_s=60105.6 opt_steps=1320 +[epoch 49/50] step=1340 train_loss=0.0098 tok_s=60104.3 opt_steps=1340 +[epoch 49/50] step=1360 train_loss=0.0098 tok_s=60104.2 opt_steps=1360 +[epoch 49/50] step=1380 train_loss=0.0098 tok_s=60105.3 opt_steps=1380 +[epoch 49/50] step=1400 train_loss=0.0098 tok_s=60096.7 opt_steps=1400 +[epoch 49/50] step=1420 train_loss=0.0098 tok_s=60090.3 opt_steps=1420 +[epoch 49/50] step=1440 train_loss=0.0098 tok_s=60095.8 opt_steps=1440 +[epoch 49/50] step=1460 train_loss=0.0098 tok_s=60096.8 opt_steps=1460 +[epoch 49/50] step=1480 train_loss=0.0098 tok_s=60102.2 opt_steps=1480 +[epoch 49/50] step=1500 train_loss=0.0099 tok_s=60105.2 opt_steps=1500 +[epoch 49/50] step=1520 train_loss=0.0099 tok_s=60107.7 opt_steps=1520 +[epoch 49/50] step=1540 train_loss=0.0099 tok_s=60106.9 opt_steps=1540 +[epoch 49/50] step=1560 train_loss=0.0099 tok_s=60112.5 opt_steps=1560 +[epoch 49/50] step=1580 train_loss=0.0099 tok_s=60113.8 opt_steps=1580 +[epoch 49/50] step=1600 train_loss=0.0099 tok_s=60109.3 opt_steps=1600 +[epoch 49/50] step=1620 train_loss=0.0099 tok_s=60111.6 opt_steps=1620 +[epoch 49/50] step=1640 train_loss=0.0099 tok_s=60107.2 opt_steps=1640 +[epoch 49/50] step=1660 train_loss=0.0099 tok_s=60111.8 opt_steps=1660 +[epoch 49/50] step=1680 train_loss=0.0099 tok_s=60113.3 opt_steps=1680 +[epoch 49/50] step=1700 train_loss=0.0099 tok_s=60114.0 opt_steps=1700 +[epoch 49/50] step=1720 train_loss=0.0099 tok_s=60115.5 opt_steps=1720 +[epoch 49/50] step=1740 train_loss=0.0100 tok_s=60115.9 opt_steps=1740 +[epoch 49/50] step=1760 train_loss=0.0100 tok_s=60119.5 opt_steps=1760 +[epoch 49/50] step=1780 train_loss=0.0100 tok_s=60120.7 opt_steps=1780 +[epoch 49/50] step=1800 train_loss=0.0100 tok_s=60126.3 opt_steps=1800 +[epoch 49/50] step=1820 train_loss=0.0100 tok_s=60133.4 opt_steps=1820 +[epoch 49/50] step=1840 train_loss=0.0100 tok_s=60138.9 opt_steps=1840 +[epoch 49/50] step=1860 train_loss=0.0100 tok_s=60144.2 opt_steps=1860 +[epoch 49/50] step=1880 train_loss=0.0101 tok_s=60145.2 opt_steps=1880 +[epoch 49/50] step=1900 train_loss=0.0101 tok_s=60150.7 opt_steps=1900 +[epoch 49/50] step=1920 train_loss=0.0101 tok_s=60154.1 opt_steps=1920 +[epoch 49/50] step=1940 train_loss=0.0101 tok_s=60153.3 opt_steps=1940 +[epoch 49/50] step=1960 train_loss=0.0101 tok_s=60158.0 opt_steps=1960 +[epoch 49/50] step=1980 train_loss=0.0101 tok_s=60161.8 opt_steps=1980 +[epoch 49/50] step=2000 train_loss=0.0101 tok_s=60161.9 opt_steps=2000 +[epoch 49/50] step=2020 train_loss=0.0101 tok_s=60168.2 opt_steps=2020 +[epoch 49/50] step=2040 train_loss=0.0101 tok_s=60166.2 opt_steps=2040 +[epoch 49/50] step=2060 train_loss=0.0101 tok_s=60162.2 opt_steps=2060 +[epoch 49/50] step=2080 train_loss=0.0102 tok_s=60156.9 opt_steps=2080 +[epoch 49/50] step=2100 train_loss=0.0102 tok_s=60160.8 opt_steps=2100 +[epoch 49/50] step=2120 train_loss=0.0102 tok_s=60163.9 opt_steps=2120 +[epoch 49/50] step=2140 train_loss=0.0102 tok_s=60162.7 opt_steps=2140 +[epoch 49/50] step=2160 train_loss=0.0102 tok_s=60159.3 opt_steps=2160 +[epoch 49/50] step=2180 train_loss=0.0102 tok_s=60156.1 opt_steps=2180 +[epoch 49/50] step=2200 train_loss=0.0102 tok_s=60164.0 opt_steps=2200 +[epoch 49/50] step=2220 train_loss=0.0102 tok_s=60161.3 opt_steps=2220 +[epoch 49/50] step=2240 train_loss=0.0102 tok_s=60156.3 opt_steps=2240 +[epoch 49/50] step=2260 train_loss=0.0102 tok_s=60143.5 opt_steps=2260 +[epoch 49/50] step=2280 train_loss=0.0102 tok_s=60143.0 opt_steps=2280 +[epoch 49/50] step=2300 train_loss=0.0103 tok_s=60137.7 opt_steps=2300 +[epoch 49/50] step=2320 train_loss=0.0103 tok_s=60141.0 opt_steps=2320 +[epoch 49/50] step=2340 train_loss=0.0103 tok_s=60142.6 opt_steps=2340 +[epoch 49/50] step=2360 train_loss=0.0103 tok_s=60144.5 opt_steps=2360 +[epoch 49/50] step=2380 train_loss=0.0103 tok_s=60144.1 opt_steps=2380 +[epoch 49/50] step=2400 train_loss=0.0103 tok_s=60147.4 opt_steps=2400 +[epoch 49/50] step=2420 train_loss=0.0103 tok_s=60149.6 opt_steps=2420 +[epoch 49/50] step=2440 train_loss=0.0103 tok_s=60153.2 opt_steps=2440 +[epoch 49/50] step=2460 train_loss=0.0103 tok_s=60152.0 opt_steps=2460 +[epoch 49/50] step=2480 train_loss=0.0103 tok_s=60147.0 opt_steps=2480 +[epoch 49/50] step=2500 train_loss=0.0103 tok_s=60147.4 opt_steps=2500 +[epoch 49/50] step=2520 train_loss=0.0103 tok_s=60147.7 opt_steps=2520 +[epoch 49/50] step=2540 train_loss=0.0103 tok_s=60148.9 opt_steps=2540 +[epoch 49/50] step=2560 train_loss=0.0103 tok_s=60145.3 opt_steps=2560 +[epoch 49/50] step=2580 train_loss=0.0103 tok_s=60145.8 opt_steps=2580 +[epoch 49/50] step=2600 train_loss=0.0104 tok_s=60150.0 opt_steps=2600 +[epoch 49/50] step=2620 train_loss=0.0104 tok_s=60148.4 opt_steps=2620 +[epoch 49/50] step=2640 train_loss=0.0104 tok_s=60150.4 opt_steps=2640 +[epoch 49/50] step=2660 train_loss=0.0104 tok_s=60155.2 opt_steps=2660 +[epoch 49/50] step=2680 train_loss=0.0104 tok_s=60158.2 opt_steps=2680 +[epoch 49/50] step=2700 train_loss=0.0104 tok_s=60160.0 opt_steps=2700 +[epoch 49/50] step=2720 train_loss=0.0104 tok_s=60159.6 opt_steps=2720 +[epoch 49/50] step=2740 train_loss=0.0104 tok_s=60166.7 opt_steps=2740 +[epoch 49/50] step=2760 train_loss=0.0104 tok_s=60166.5 opt_steps=2760 +[epoch 49/50] step=2780 train_loss=0.0104 tok_s=60168.4 opt_steps=2780 +[epoch 49/50] step=2800 train_loss=0.0104 tok_s=60171.2 opt_steps=2800 +[epoch 49/50] step=2820 train_loss=0.0104 tok_s=60170.3 opt_steps=2820 +[epoch 49/50] step=2840 train_loss=0.0104 tok_s=60171.0 opt_steps=2840 +[epoch 49/50] step=2860 train_loss=0.0104 tok_s=60170.6 opt_steps=2860 +[epoch 49/50] step=2880 train_loss=0.0104 tok_s=60171.0 opt_steps=2880 +[epoch 49/50] step=2900 train_loss=0.0104 tok_s=60172.9 opt_steps=2900 +[epoch 49/50] step=2920 train_loss=0.0105 tok_s=60175.2 opt_steps=2920 +[epoch 49/50] step=2940 train_loss=0.0105 tok_s=60175.6 opt_steps=2940 +[epoch 49/50] step=2960 train_loss=0.0105 tok_s=60180.5 opt_steps=2960 +[epoch 49/50] step=2980 train_loss=0.0105 tok_s=60182.6 opt_steps=2980 +[epoch 49/50] step=3000 train_loss=0.0105 tok_s=60181.1 opt_steps=3000 +[epoch 49/50] step=3020 train_loss=0.0105 tok_s=60181.9 opt_steps=3020 +[epoch 49/50] step=3040 train_loss=0.0105 tok_s=60183.0 opt_steps=3040 +[epoch 49/50] step=3060 train_loss=0.0105 tok_s=60185.5 opt_steps=3060 +[epoch 49/50] step=3080 train_loss=0.0105 tok_s=60186.2 opt_steps=3080 +[epoch 49/50] step=3100 train_loss=0.0105 tok_s=60189.6 opt_steps=3100 +[epoch 49/50] step=3120 train_loss=0.0105 tok_s=60190.7 opt_steps=3120 +[epoch 49/50] step=3140 train_loss=0.0105 tok_s=60190.5 opt_steps=3140 +[epoch 49/50] step=3160 train_loss=0.0105 tok_s=60190.0 opt_steps=3160 +[epoch 49/50] step=3180 train_loss=0.0105 tok_s=60190.9 opt_steps=3180 +[epoch 49/50] step=3200 train_loss=0.0106 tok_s=60196.4 opt_steps=3200 +[epoch 49/50] step=3220 train_loss=0.0106 tok_s=60200.8 opt_steps=3220 +[epoch 49/50] step=3240 train_loss=0.0106 tok_s=60202.7 opt_steps=3240 +[epoch 49/50] step=3260 train_loss=0.0106 tok_s=60199.3 opt_steps=3260 +[epoch 49/50] train_loss=0.0106 val_skipped tok_s=60202.5 opt_steps=3273 +[epoch 50/50] step=20 train_loss=0.0089 tok_s=58985.5 opt_steps=20 +[epoch 50/50] step=40 train_loss=0.0086 tok_s=59824.8 opt_steps=40 +[epoch 50/50] step=60 train_loss=0.0086 tok_s=59962.5 opt_steps=60 +[epoch 50/50] step=80 train_loss=0.0087 tok_s=60064.6 opt_steps=80 +[epoch 50/50] step=100 train_loss=0.0088 tok_s=60192.0 opt_steps=100 +[epoch 50/50] step=120 train_loss=0.0087 tok_s=60335.3 opt_steps=120 +[epoch 50/50] step=140 train_loss=0.0087 tok_s=60370.8 opt_steps=140 +[epoch 50/50] step=160 train_loss=0.0087 tok_s=60363.8 opt_steps=160 +[epoch 50/50] step=180 train_loss=0.0087 tok_s=60455.7 opt_steps=180 +[epoch 50/50] step=200 train_loss=0.0087 tok_s=60487.6 opt_steps=200 +[epoch 50/50] step=220 train_loss=0.0086 tok_s=60498.4 opt_steps=220 +[epoch 50/50] step=240 train_loss=0.0087 tok_s=60511.3 opt_steps=240 +[epoch 50/50] step=260 train_loss=0.0087 tok_s=60556.7 opt_steps=260 +[epoch 50/50] step=280 train_loss=0.0087 tok_s=60553.0 opt_steps=280 +[epoch 50/50] step=300 train_loss=0.0087 tok_s=60560.1 opt_steps=300 +[epoch 50/50] step=320 train_loss=0.0087 tok_s=60551.8 opt_steps=320 +[epoch 50/50] step=340 train_loss=0.0086 tok_s=60545.2 opt_steps=340 +[epoch 50/50] step=360 train_loss=0.0087 tok_s=60562.9 opt_steps=360 +[epoch 50/50] step=380 train_loss=0.0087 tok_s=60551.0 opt_steps=380 +[epoch 50/50] step=400 train_loss=0.0088 tok_s=60578.4 opt_steps=400 +[epoch 50/50] step=420 train_loss=0.0088 tok_s=60576.1 opt_steps=420 +[epoch 50/50] step=440 train_loss=0.0088 tok_s=60580.3 opt_steps=440 +[epoch 50/50] step=460 train_loss=0.0089 tok_s=60602.5 opt_steps=460 +[epoch 50/50] step=480 train_loss=0.0089 tok_s=60586.5 opt_steps=480 +[epoch 50/50] step=500 train_loss=0.0089 tok_s=60601.4 opt_steps=500 +[epoch 50/50] step=520 train_loss=0.0089 tok_s=60608.2 opt_steps=520 +[epoch 50/50] step=540 train_loss=0.0089 tok_s=60595.0 opt_steps=540 +[epoch 50/50] step=560 train_loss=0.0089 tok_s=60599.9 opt_steps=560 +[epoch 50/50] step=580 train_loss=0.0089 tok_s=60586.2 opt_steps=580 +[epoch 50/50] step=600 train_loss=0.0090 tok_s=60593.5 opt_steps=600 +[epoch 50/50] step=620 train_loss=0.0090 tok_s=60591.7 opt_steps=620 +[epoch 50/50] step=640 train_loss=0.0090 tok_s=60583.5 opt_steps=640 +[epoch 50/50] step=660 train_loss=0.0090 tok_s=60590.6 opt_steps=660 +[epoch 50/50] step=680 train_loss=0.0090 tok_s=60607.9 opt_steps=680 +[epoch 50/50] step=700 train_loss=0.0091 tok_s=60596.7 opt_steps=700 +[epoch 50/50] step=720 train_loss=0.0091 tok_s=60603.4 opt_steps=720 +[epoch 50/50] step=740 train_loss=0.0091 tok_s=60616.1 opt_steps=740 +[epoch 50/50] step=760 train_loss=0.0091 tok_s=60597.1 opt_steps=760 +[epoch 50/50] step=780 train_loss=0.0091 tok_s=60600.6 opt_steps=780 +[epoch 50/50] step=800 train_loss=0.0091 tok_s=60610.4 opt_steps=800 +[epoch 50/50] step=820 train_loss=0.0091 tok_s=60602.7 opt_steps=820 +[epoch 50/50] step=840 train_loss=0.0091 tok_s=60608.3 opt_steps=840 +[epoch 50/50] step=860 train_loss=0.0091 tok_s=60591.2 opt_steps=860 +[epoch 50/50] step=880 train_loss=0.0092 tok_s=60588.7 opt_steps=880 +[epoch 50/50] step=900 train_loss=0.0092 tok_s=60590.6 opt_steps=900 +[epoch 50/50] step=920 train_loss=0.0092 tok_s=60581.4 opt_steps=920 +[epoch 50/50] step=940 train_loss=0.0092 tok_s=60590.0 opt_steps=940 +[epoch 50/50] step=960 train_loss=0.0092 tok_s=60592.1 opt_steps=960 +[epoch 50/50] step=980 train_loss=0.0092 tok_s=60604.3 opt_steps=980 +[epoch 50/50] step=1000 train_loss=0.0092 tok_s=60607.2 opt_steps=1000 +[epoch 50/50] step=1020 train_loss=0.0092 tok_s=60595.0 opt_steps=1020 +[epoch 50/50] step=1040 train_loss=0.0092 tok_s=60595.0 opt_steps=1040 +[epoch 50/50] step=1060 train_loss=0.0093 tok_s=60599.6 opt_steps=1060 +[epoch 50/50] step=1080 train_loss=0.0093 tok_s=60588.0 opt_steps=1080 +[epoch 50/50] step=1100 train_loss=0.0093 tok_s=60594.9 opt_steps=1100 +[epoch 50/50] step=1120 train_loss=0.0093 tok_s=60596.1 opt_steps=1120 +[epoch 50/50] step=1140 train_loss=0.0093 tok_s=60590.2 opt_steps=1140 +[epoch 50/50] step=1160 train_loss=0.0093 tok_s=60579.6 opt_steps=1160 +[epoch 50/50] step=1180 train_loss=0.0094 tok_s=60566.6 opt_steps=1180 +[epoch 50/50] step=1200 train_loss=0.0094 tok_s=60566.4 opt_steps=1200 +[epoch 50/50] step=1220 train_loss=0.0094 tok_s=60563.8 opt_steps=1220 +[epoch 50/50] step=1240 train_loss=0.0094 tok_s=60567.7 opt_steps=1240 +[epoch 50/50] step=1260 train_loss=0.0094 tok_s=60562.2 opt_steps=1260 +[epoch 50/50] step=1280 train_loss=0.0094 tok_s=60565.9 opt_steps=1280 +[epoch 50/50] step=1300 train_loss=0.0094 tok_s=60567.9 opt_steps=1300 +[epoch 50/50] step=1320 train_loss=0.0095 tok_s=60562.3 opt_steps=1320 +[epoch 50/50] step=1340 train_loss=0.0095 tok_s=60566.7 opt_steps=1340 +[epoch 50/50] step=1360 train_loss=0.0095 tok_s=60566.7 opt_steps=1360 +[epoch 50/50] step=1380 train_loss=0.0095 tok_s=60571.3 opt_steps=1380 +[epoch 50/50] step=1400 train_loss=0.0095 tok_s=60568.5 opt_steps=1400 +[epoch 50/50] step=1420 train_loss=0.0095 tok_s=60570.0 opt_steps=1420 +[epoch 50/50] step=1440 train_loss=0.0096 tok_s=60570.7 opt_steps=1440 +[epoch 50/50] step=1460 train_loss=0.0096 tok_s=60573.3 opt_steps=1460 +[epoch 50/50] step=1480 train_loss=0.0096 tok_s=60577.4 opt_steps=1480 +[epoch 50/50] step=1500 train_loss=0.0096 tok_s=60577.2 opt_steps=1500 +[epoch 50/50] step=1520 train_loss=0.0096 tok_s=60575.9 opt_steps=1520 +[epoch 50/50] step=1540 train_loss=0.0096 tok_s=60576.7 opt_steps=1540 +[epoch 50/50] step=1560 train_loss=0.0097 tok_s=60581.5 opt_steps=1560 +[epoch 50/50] step=1580 train_loss=0.0097 tok_s=60579.3 opt_steps=1580 +[epoch 50/50] step=1600 train_loss=0.0097 tok_s=60585.0 opt_steps=1600 +[epoch 50/50] step=1620 train_loss=0.0097 tok_s=60585.8 opt_steps=1620 +[epoch 50/50] step=1640 train_loss=0.0097 tok_s=60581.5 opt_steps=1640 +[epoch 50/50] step=1660 train_loss=0.0097 tok_s=60581.8 opt_steps=1660 +[epoch 50/50] step=1680 train_loss=0.0097 tok_s=60582.2 opt_steps=1680 +[epoch 50/50] step=1700 train_loss=0.0097 tok_s=60582.9 opt_steps=1700 +[epoch 50/50] step=1720 train_loss=0.0097 tok_s=60585.9 opt_steps=1720 +[epoch 50/50] step=1740 train_loss=0.0098 tok_s=60589.6 opt_steps=1740 +[epoch 50/50] step=1760 train_loss=0.0098 tok_s=60591.2 opt_steps=1760 +[epoch 50/50] step=1780 train_loss=0.0098 tok_s=60592.0 opt_steps=1780 +[epoch 50/50] step=1800 train_loss=0.0098 tok_s=60587.5 opt_steps=1800 +[epoch 50/50] step=1820 train_loss=0.0098 tok_s=60587.8 opt_steps=1820 +[epoch 50/50] step=1840 train_loss=0.0098 tok_s=60584.8 opt_steps=1840 +[epoch 50/50] step=1860 train_loss=0.0098 tok_s=60581.2 opt_steps=1860 +[epoch 50/50] step=1880 train_loss=0.0098 tok_s=60583.8 opt_steps=1880 +[epoch 50/50] step=1900 train_loss=0.0098 tok_s=60583.2 opt_steps=1900 +[epoch 50/50] step=1920 train_loss=0.0098 tok_s=60586.8 opt_steps=1920 +[epoch 50/50] step=1940 train_loss=0.0098 tok_s=60589.3 opt_steps=1940 +[epoch 50/50] step=1960 train_loss=0.0099 tok_s=60592.9 opt_steps=1960 +[epoch 50/50] step=1980 train_loss=0.0099 tok_s=60592.1 opt_steps=1980 +[epoch 50/50] step=2000 train_loss=0.0099 tok_s=60601.9 opt_steps=2000 +[epoch 50/50] step=2020 train_loss=0.0099 tok_s=60603.9 opt_steps=2020 +[epoch 50/50] step=2040 train_loss=0.0099 tok_s=60594.5 opt_steps=2040 +[epoch 50/50] step=2060 train_loss=0.0099 tok_s=60597.2 opt_steps=2060 +[epoch 50/50] step=2080 train_loss=0.0099 tok_s=60598.7 opt_steps=2080 +[epoch 50/50] step=2100 train_loss=0.0099 tok_s=60599.0 opt_steps=2100 +[epoch 50/50] step=2120 train_loss=0.0099 tok_s=60601.2 opt_steps=2120 +[epoch 50/50] step=2140 train_loss=0.0099 tok_s=60598.3 opt_steps=2140 +[epoch 50/50] step=2160 train_loss=0.0099 tok_s=60599.9 opt_steps=2160 +[epoch 50/50] step=2180 train_loss=0.0099 tok_s=60599.2 opt_steps=2180 +[epoch 50/50] step=2200 train_loss=0.0100 tok_s=60600.9 opt_steps=2200 +[epoch 50/50] step=2220 train_loss=0.0100 tok_s=60602.7 opt_steps=2220 +[epoch 50/50] step=2240 train_loss=0.0100 tok_s=60599.4 opt_steps=2240 +[epoch 50/50] step=2260 train_loss=0.0100 tok_s=60596.9 opt_steps=2260 +[epoch 50/50] step=2280 train_loss=0.0100 tok_s=60596.1 opt_steps=2280 +[epoch 50/50] step=2300 train_loss=0.0100 tok_s=60599.1 opt_steps=2300 +[epoch 50/50] step=2320 train_loss=0.0100 tok_s=60595.9 opt_steps=2320 +[epoch 50/50] step=2340 train_loss=0.0100 tok_s=60590.3 opt_steps=2340 +[epoch 50/50] step=2360 train_loss=0.0100 tok_s=60594.0 opt_steps=2360 +[epoch 50/50] step=2380 train_loss=0.0100 tok_s=60592.9 opt_steps=2380 +[epoch 50/50] step=2400 train_loss=0.0100 tok_s=60588.1 opt_steps=2400 +[epoch 50/50] step=2420 train_loss=0.0100 tok_s=60589.8 opt_steps=2420 +[epoch 50/50] step=2440 train_loss=0.0100 tok_s=60589.2 opt_steps=2440 +[epoch 50/50] step=2460 train_loss=0.0101 tok_s=60585.9 opt_steps=2460 +[epoch 50/50] step=2480 train_loss=0.0101 tok_s=60589.9 opt_steps=2480 +[epoch 50/50] step=2500 train_loss=0.0101 tok_s=60591.2 opt_steps=2500 +[epoch 50/50] step=2520 train_loss=0.0101 tok_s=60589.8 opt_steps=2520 +[epoch 50/50] step=2540 train_loss=0.0101 tok_s=60592.7 opt_steps=2540 +[epoch 50/50] step=2560 train_loss=0.0101 tok_s=60591.4 opt_steps=2560 +[epoch 50/50] step=2580 train_loss=0.0101 tok_s=60593.3 opt_steps=2580 +[epoch 50/50] step=2600 train_loss=0.0101 tok_s=60592.7 opt_steps=2600 +[epoch 50/50] step=2620 train_loss=0.0101 tok_s=60594.9 opt_steps=2620 +[epoch 50/50] step=2640 train_loss=0.0101 tok_s=60596.8 opt_steps=2640 +[epoch 50/50] step=2660 train_loss=0.0101 tok_s=60597.4 opt_steps=2660 +[epoch 50/50] step=2680 train_loss=0.0101 tok_s=60597.0 opt_steps=2680 +[epoch 50/50] step=2700 train_loss=0.0102 tok_s=60595.5 opt_steps=2700 +[epoch 50/50] step=2720 train_loss=0.0102 tok_s=60594.5 opt_steps=2720 +[epoch 50/50] step=2740 train_loss=0.0102 tok_s=60594.2 opt_steps=2740 +[epoch 50/50] step=2760 train_loss=0.0102 tok_s=60592.9 opt_steps=2760 +[epoch 50/50] step=2780 train_loss=0.0102 tok_s=60590.2 opt_steps=2780 +[epoch 50/50] step=2800 train_loss=0.0102 tok_s=60587.4 opt_steps=2800 +[epoch 50/50] step=2820 train_loss=0.0102 tok_s=60590.5 opt_steps=2820 +[epoch 50/50] step=2840 train_loss=0.0102 tok_s=60590.0 opt_steps=2840 +[epoch 50/50] step=2860 train_loss=0.0102 tok_s=60587.2 opt_steps=2860 +[epoch 50/50] step=2880 train_loss=0.0102 tok_s=60587.0 opt_steps=2880 +[epoch 50/50] step=2900 train_loss=0.0102 tok_s=60587.2 opt_steps=2900 +[epoch 50/50] step=2920 train_loss=0.0103 tok_s=60586.5 opt_steps=2920 +[epoch 50/50] step=2940 train_loss=0.0103 tok_s=60584.2 opt_steps=2940 +[epoch 50/50] step=2960 train_loss=0.0103 tok_s=60586.5 opt_steps=2960 +[epoch 50/50] step=2980 train_loss=0.0103 tok_s=60584.6 opt_steps=2980 +[epoch 50/50] step=3000 train_loss=0.0103 tok_s=60581.6 opt_steps=3000 +[epoch 50/50] step=3020 train_loss=0.0103 tok_s=60581.5 opt_steps=3020 +[epoch 50/50] step=3040 train_loss=0.0103 tok_s=60586.0 opt_steps=3040 +[epoch 50/50] step=3060 train_loss=0.0103 tok_s=60587.2 opt_steps=3060 +[epoch 50/50] step=3080 train_loss=0.0103 tok_s=60585.9 opt_steps=3080 +[epoch 50/50] step=3100 train_loss=0.0103 tok_s=60586.7 opt_steps=3100 +[epoch 50/50] step=3120 train_loss=0.0103 tok_s=60585.6 opt_steps=3120 +[epoch 50/50] step=3140 train_loss=0.0103 tok_s=60584.3 opt_steps=3140 +[epoch 50/50] step=3160 train_loss=0.0103 tok_s=60584.9 opt_steps=3160 +[epoch 50/50] step=3180 train_loss=0.0103 tok_s=60581.3 opt_steps=3180 +[epoch 50/50] step=3200 train_loss=0.0103 tok_s=60580.9 opt_steps=3200 +[epoch 50/50] step=3220 train_loss=0.0104 tok_s=60584.1 opt_steps=3220 +[epoch 50/50] step=3240 train_loss=0.0104 tok_s=60583.3 opt_steps=3240 +[epoch 50/50] step=3260 train_loss=0.0104 tok_s=60579.8 opt_steps=3260 +[epoch 50/50] train_loss=0.0104 val_loss=0.0954 tok_s=60538.8 opt_steps=3273 +Saved checkpoint: checkpoints_h100_100m_sparse_full/epoch_50.pt +[final] test_loss=0.0850